commit 6218daeff47e8f02e19b722fe188873a35efd963 Author: yiekheng Date: Sun Apr 12 18:39:18 2026 +0800 Squashed 'manga-dl/' content from commit 9cb9b8c git-subtree-dir: manga-dl git-subtree-split: 9cb9b8c7fdbc3622146c162c9e9ec5e7e3c518a6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..580249b --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.env +__pycache__/ +manga-content/ +.browser-data/ +cookies.txt +.DS_Store diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..8b1b75c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,160 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Single-file interactive toolkit (`manga.py`) that downloads manga from m.happymh.com, stores images in Cloudflare R2 as WebP, and writes metadata to PostgreSQL. Runs as an arrow-key TUI backed by a persistent Chrome session. + +## Commands + +```bash +pip install -r requirements.txt # playwright, boto3, psycopg2-binary, Pillow, python-dotenv, simple-term-menu +python manga.py # launch the TUI (no CLI args) +``` + +No tests, no lint config, no build step. Requires Google Chrome or Chromium installed. The script auto-detects from `CHROME_CANDIDATES` (macOS/Linux/Windows paths). R2 and DB credentials load lazily — see `.env` section below. + +## Architecture + +### Anti-bot: real Chrome + CDP + persistent profile + +Cloudflare fingerprints both the TLS handshake and the browser process. The anti-detection chain matters — changing any link breaks downloads: + +1. **`subprocess.Popen(CHROME_PATH, ...)`** launches the user's real Chrome binary, not Playwright's Chromium. This gives a genuine TLS fingerprint. +2. **`connect_over_cdp`** attaches Playwright to Chrome via DevTools Protocol. Playwright never *launches* Chrome — only sends CDP commands to a separately-running process. +3. 
**Persistent `--user-data-dir=.browser-data`** preserves `cf_clearance` cookies between runs. After the user solves Cloudflare once (Setup menu), subsequent runs skip the challenge.
4. **Single session (`_session_singleton`)** — Chrome is lazy-started on first operation and reused across all commands in one `python manga.py` run. Closed only on Quit. `with_browser(func)` catches "target closed" / "disconnected" errors, resets the singleton, and retries once.
5. **`hide_chrome()`** runs `osascript -e 'tell application "System Events" to set visible of process "Google Chrome" to false'` after launch so the window doesn't steal focus. No-op on non-macOS.

**Do not switch to headless mode.** Tried — Cloudflare blocks it because the fingerprint differs from real Chrome. **Do not parallelize chapter work across threads** with Playwright's sync API — each thread would need its own event loop and crashes with "no running event loop".

### Cloudflare handling

`wait_for_cloudflare(session)` polls `page.title()` and `page.url` for the "Just a moment" / `/challenge` markers. Recovery is manual: the user is shown the browser window and solves CAPTCHA. The Setup menu (`cmd_setup`) is the dedicated flow for this. During sync/check-missing, if the reading API returns 403, the script prints "CF blocked — run Setup" and stops.

### Navigation: `page.goto` vs JS assignment

- **Manga listing page** (`/manga/<slug>`) uses `page.goto(..., wait_until="commit")`. Works because Cloudflare on this route is lenient.
- **Reader page** (`/mangaread/<slug>/<chapter_id>`) uses `page.evaluate("window.location.href = '...'")` — bypasses CF's detection of CDP `Page.navigate` for the stricter reader route.

### Image pipeline (happymh)

Per chapter (in `_try_get_chapter_images`):
1. Register a response listener that matches `/apis/manga/reading` **AND** `cid=<chapter_id>` in the URL **AND** validates `data.id` in the response body matches. Drops pre-fetched neighbouring chapters.
2.
Navigate the reader URL via `window.location.href` assignment.
3. DOM-count sanity check: `[class*="imgContainer"]` total minus `[class*="imgNext"]` gives the current chapter's actual page count. Trim captured list if it includes next-chapter previews.
4. `fetch_image_bytes(page, img)` runs `fetch(url)` via `page.evaluate` inside a `page.expect_response(...)` block. The body is read via CDP (`response.body()`) — zero base64 overhead. Fallback strips the `?q=50` query if the original URL fails.
5. `fetch_all_pages(page, images, max_attempts=3)` retries each failed page up to 3 times with 2s backoff between rounds. Returns `{page_num: bytes}` for successful fetches.

### R2 + DB write ordering

**Page rows are inserted into the DB only after the R2 upload succeeds.** This prevents orphan DB records pointing to missing R2 objects. Every `INSERT INTO "Page"` includes `width` and `height` read from the JPEG/WebP bytes via PIL (`Image.open(...).width`).

### Storage layouts

```
# Local (download command)
manga-content/<slug>/detail.json              # title, author, genres, description, mg-cover URL
manga-content/<slug>/cover.jpg                # captured from page load traffic
manga-content/<slug>/<index chapterName>/<page>.jpg

# R2 (upload / sync)
manga/<slug>/cover.webp
manga/<slug>/chapters/<chapter_number>/<page>.webp
```

Chapter order is the API's ascending index (1-based). Chapter names can repeat (announcements, extras) so the DB `Chapter.number` column uses this index, not parsed chapter titles.

### Menu actions

- **Setup** (`cmd_setup`) → brings Chrome to front, user solves CF, validates `cf_clearance` cookie.
- **Download** (`cmd_download`) → picks URL from `manga.json`, optional chapter multi-select; saves JPGs locally.
- **Upload** (`cmd_upload` → `upload_manga_to_r2`) → converts local JPGs → WebP, uploads to R2, writes DB rows.
- **Sync** (`cmd_sync`) → combined download+upload via RAM (no local files), refreshes `Manga` row metadata, only inserts chapters missing from DB.
+- **R2 / DB management** submenu (`tui_r2_manage`): + - **Status** — single-pass R2 object count grouped by slug, plus DB row counts + - **Edit manga info** (`tui_edit_manga`) — title/description/genre/status/coverUrl + - **Delete specific manga** — R2 prefix + cascade DB delete + - **Delete specific chapter** (`tui_delete_chapter`) — multi-select or "All chapters" + - **Check missing pages** (`tui_check_missing_pages`) — for each chapter: if site page count ≠ R2 count, re-upload **inline** (browser still on that reader page); if counts match but DB `width`/`height` are NULL or 0, fix by reading WebP bytes from R2 (no re-upload) + - **Clear ALL (R2 + DB)** + - **Recompress manga** (`r2_recompress`) — re-encodes every WebP under `manga//` at quality=65, overwrites in place + +### WebP encoding + +`_to_webp_bytes(img, quality=WEBP_QUALITY=75, method=6)` — method=6 is the slowest/smallest preset. Covers use quality 80 via `make_cover` (crops to 400×560 aspect, then resizes). Resize-during-encode was explicitly removed — page originals' dimensions are preserved. + +### ESC to stop + +`EscListener` puts stdin in cbreak mode (POSIX `termios`+`tty`) and runs a daemon thread listening for `\x1b`. Download/Upload/Sync check `esc.stop.is_set()` between chapters and cleanly exit. Restores terminal mode on `__exit__`. No-op on Windows (no termios) and when stdin isn't a TTY. + +### Lazy config loading + +`_ensure_config()` is called at the start of each R2/DB helper. It reads required env vars and constructs the boto3 client on first use. If env vars are missing, it prints the missing list and `sys.exit(1)` — no KeyError traceback on import. `s3`, `BUCKET`, `PUBLIC_URL`, `DATABASE_URL` are module globals set by that call. + +## Environment variables (.env) + +``` +R2_ACCOUNT_ID= # cloudflare account id +R2_ACCESS_KEY= +R2_SECRET_KEY= +R2_BUCKET= +R2_PUBLIC_URL= # e.g. 
https://pub-xxx.r2.dev (trailing slash stripped) +DATABASE_URL= # postgresql://user:pass@host:port/dbname +``` + +Missing any of these produces a friendly error on first R2/DB operation, not on import. + +## DB schema expectations + +The script reads/writes but does **not** create tables. Create them externally: + +```sql +CREATE TABLE "Manga" ( + id SERIAL PRIMARY KEY, + slug TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + description TEXT, + "coverUrl" TEXT, + genre TEXT, -- comma-joined list of all genres + status TEXT NOT NULL, -- PUBLISHED | DRAFT | HIDDEN + "createdAt" TIMESTAMPTZ NOT NULL, + "updatedAt" TIMESTAMPTZ NOT NULL +); + +CREATE TABLE "Chapter" ( + id SERIAL PRIMARY KEY, + "mangaId" INTEGER NOT NULL REFERENCES "Manga"(id), + number INTEGER NOT NULL, -- 1-based index from the API order + title TEXT NOT NULL, + UNIQUE ("mangaId", number) +); + +CREATE TABLE "Page" ( + id SERIAL PRIMARY KEY, + "chapterId" INTEGER NOT NULL REFERENCES "Chapter"(id), + number INTEGER NOT NULL, -- 1-based page number + "imageUrl" TEXT NOT NULL, + width INTEGER, + height INTEGER, + UNIQUE ("chapterId", number) +); +``` + +Column identifiers are camelCase with double quotes — matches Prisma default naming. + +## Where to change what + +| Task | Location | +|---|---| +| Add a new site | Extract happymh-specific bits: `fetch_chapters_via_api`, `fetch_chapters_from_dom`, `fetch_metadata`, `_try_get_chapter_images`, the `/mcover/` cover capture in `load_manga_page`, the reader URL shape. Keep Chrome/R2/DB/TUI as common. | +| New menu item | Add to `show_menu` list in `main` and dispatch in the `if idx == N:` ladder. For R2/DB ops, add to `tui_r2_manage`. | +| Tweak CF detection | `wait_for_cloudflare` / `_wait_for_cf_on_page` — edit the title/URL heuristics carefully, both ops check the same signals. | +| Change image quality | `WEBP_QUALITY` at top of file; cover quality is hard-coded 80 in `make_cover`. 
| +| Add a new Page-table column | Update all three `INSERT INTO "Page"` sites (`upload_manga_to_r2`, `cmd_sync`, `tui_check_missing_pages` re-upload branch) and the `SELECT ... FROM "Page"` in the dim-check query. | +| Change parallelism | `UPLOAD_WORKERS` for R2 uploads; do **not** introduce chapter-level threading (sync Playwright breaks). | + +## Future: multi-site support + +Current code is happymh-specific (selectors, API paths, URL patterns). To generalise, a site module would implement `fetch_chapters(page, slug)`, `get_chapter_images(page, slug, chapter_id)`, and `fetch_metadata(page)`, keeping the Chrome/R2/DB/TUI layer common. diff --git a/manga.json b/manga.json new file mode 100644 index 0000000..707854a --- /dev/null +++ b/manga.json @@ -0,0 +1,6 @@ +[ + "https://m.happymh.com/manga/fangkainagenvwu", + "https://m.happymh.com/manga/jueduijiangan", + "https://m.happymh.com/manga/xingjiandashi", + "https://m.happymh.com/manga/moutianchengweimoshen" +] \ No newline at end of file diff --git a/manga.py b/manga.py new file mode 100644 index 0000000..8e5c032 --- /dev/null +++ b/manga.py @@ -0,0 +1,2062 @@ +""" +Manga toolkit — download from m.happymh.com, upload to Cloudflare R2. + +Usage: + python manga.py +""" + +import io +import json +import os +import platform +import re +import select +import sys +import time +import socket +import subprocess +import threading + +IS_MACOS = platform.system() == "Darwin" + +# POSIX-only TTY modules; EscListener is a no-op on Windows. 
# POSIX terminal-control modules are unavailable on Windows; the flag lets
# EscListener degrade to a no-op instead of crashing at import time.
try:
    import termios
    import tty
    _HAS_TERMIOS = True
except ImportError:
    termios = None
    tty = None
    _HAS_TERMIOS = False
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import urlparse

import boto3
import psycopg2
from PIL import Image
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright
from simple_term_menu import TerminalMenu

# Read .env into os.environ now; the values themselves are validated lazily
# by _ensure_config() on first R2/DB use.
load_dotenv()

# ── Config ─────────────────────────────────────────────────

BASE_URL = "https://m.happymh.com"
ROOT_DIR = Path(__file__).parent
CONTENT_DIR = ROOT_DIR / "manga-content"      # local download target
MANGA_JSON = ROOT_DIR / "manga.json"          # list of manga URLs to offer in the TUI
BROWSER_DATA = ROOT_DIR / ".browser-data"     # persistent Chrome profile (keeps cf_clearance)
CDP_PORT = 9333                               # Chrome remote-debugging (CDP) port
REQUEST_DELAY = 1.5                           # pause between chapters, seconds
UPLOAD_WORKERS = 8                            # thread pool size for R2 uploads

# Candidate Chrome/Chromium binary locations, checked in order.
CHROME_CANDIDATES = [
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",  # macOS
    "/usr/bin/google-chrome",  # Linux
    "/usr/bin/google-chrome-stable",
    "/usr/bin/chromium",
    "/usr/bin/chromium-browser",
    r"C:\Program Files\Google\Chrome\Application\chrome.exe",  # Windows
    r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
]


def _find_chrome():
    """Return the first existing path from CHROME_CANDIDATES, or None if absent."""
    for p in CHROME_CANDIDATES:
        if Path(p).exists():
            return p
    return None


# Resolved once at import; launch_chrome() reports a friendly error when None.
CHROME_PATH = _find_chrome()


# R2/DB config loaded lazily so missing .env gives a friendly error, not KeyError on import.
# Env vars that must be present before any R2/DB operation may run.
_REQUIRED_ENV = ("R2_ACCOUNT_ID", "R2_ACCESS_KEY", "R2_SECRET_KEY", "R2_BUCKET", "R2_PUBLIC_URL", "DATABASE_URL")
# Module globals populated by _ensure_config() on first use.
s3 = None
BUCKET = None
PUBLIC_URL = None
DATABASE_URL = None
_config_loaded = False


def _ensure_config():
    """Validate required env vars and build the boto3 R2 client (idempotent).

    Called at the start of every R2/DB helper. On missing env vars it prints
    the missing names and exits — a friendly error instead of a KeyError.
    """
    global s3, BUCKET, PUBLIC_URL, DATABASE_URL, _config_loaded
    if _config_loaded:
        return
    missing = [k for k in _REQUIRED_ENV if not os.environ.get(k)]
    if missing:
        print("Missing env vars (check .env):")
        for k in missing:
            print(f" {k}")
        sys.exit(1)
    # R2 speaks the S3 protocol; endpoint is account-scoped, region is "auto".
    s3 = boto3.client(
        "s3",
        endpoint_url=f"https://{os.environ['R2_ACCOUNT_ID']}.r2.cloudflarestorage.com",
        aws_access_key_id=os.environ["R2_ACCESS_KEY"],
        aws_secret_access_key=os.environ["R2_SECRET_KEY"],
        region_name="auto",
    )
    BUCKET = os.environ["R2_BUCKET"]
    # Trailing slash stripped so f"{PUBLIC_URL}/{key}" never doubles the "/".
    PUBLIC_URL = os.environ["R2_PUBLIC_URL"].rstrip("/")
    DATABASE_URL = os.environ["DATABASE_URL"]
    _config_loaded = True


# ── ESC listener ─────────────────────────────────────────────


class EscListener:
    """Context manager: listens for ESC key in background, sets self.stop event."""

    def __init__(self):
        # stop doubles as "user pressed ESC" signal and listener-thread shutdown flag.
        self.stop = threading.Event()
        self._thread = None
        self._old = None   # saved termios settings, restored on __exit__
        self._fd = None    # stdin file descriptor while in cbreak mode

    def __enter__(self):
        """Put stdin in cbreak mode and start the listener thread.

        No-op (returns self without a thread) on Windows or when stdin is not
        a TTY, and falls back to no-op if termios setup fails.
        """
        if not _HAS_TERMIOS or not sys.stdin.isatty():
            return self
        self._fd = sys.stdin.fileno()
        try:
            self._old = termios.tcgetattr(self._fd)
            tty.setcbreak(self._fd)
        except Exception:
            # Could not enter cbreak mode — run without the listener.
            self._old = None
            return self
        self._thread = threading.Thread(target=self._listen, daemon=True)
        self._thread.start()
        return self

    def _listen(self):
        """Daemon-thread loop: poll stdin and set stop when ESC (\\x1b) arrives."""
        while not self.stop.is_set():
            try:
                # 0.2s select timeout keeps the loop responsive to stop being
                # set elsewhere without busy-waiting.
                r, _, _ = select.select([sys.stdin], [], [], 0.2)
                if r and sys.stdin.read(1) == "\x1b":
                    self.stop.set()
                    print("\n ESC pressed — stopping after current item...")
                    return
            except Exception:
                return

    def __exit__(self, *args):
        """Signal the listener to stop and restore the saved terminal mode."""
        self.stop.set()
        if self._old is not None:
            try:
                termios.tcsetattr(self._fd, termios.TCSADRAIN, self._old)
            except Exception:
                pass


# ── Chrome
management ────────────────────────────────────── + + +def hide_chrome(): + """Hide Chrome window (macOS only; no-op elsewhere).""" + if not IS_MACOS: + return + try: + subprocess.Popen( + ["osascript", "-e", + 'tell application "System Events" to set visible of process "Google Chrome" to false'], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + except Exception: + pass + + +def is_port_open(port): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(("localhost", port)) == 0 + + +def launch_chrome(start_url=None): + if is_port_open(CDP_PORT): + return None + if not CHROME_PATH or not Path(CHROME_PATH).exists(): + print(" Chrome not found. Install Google Chrome or Chromium.") + print(" Searched:") + for p in CHROME_CANDIDATES: + print(f" {p}") + return None + cmd = [ + CHROME_PATH, + f"--remote-debugging-port={CDP_PORT}", + f"--user-data-dir={BROWSER_DATA}", + "--no-first-run", + "--no-default-browser-check", + "--window-position=0,0", + "--window-size=800,600", + "--no-focus-on-navigate", + ] + if start_url: + cmd.append(start_url) + proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + for _ in range(30): + if is_port_open(CDP_PORT): + time.sleep(1) + hide_chrome() + return proc + time.sleep(0.5) + print(" Chrome failed to start") + return None + + +class BrowserSession: + """Manages Chrome + CDP lifecycle.""" + + def __init__(self): + self.chrome_proc = None + self.playwright = None + self.browser = None + self.page = None + + def start(self): + self.chrome_proc = launch_chrome() + self.playwright = sync_playwright().start() + self.browser = self.playwright.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}") + context = self.browser.contexts[0] + self.page = context.pages[0] if context.pages else context.new_page() + + def close(self): + try: + self.browser.close() + except Exception: + pass + if self.chrome_proc: + self.chrome_proc.terminate() + if self.playwright: + 
self.playwright.stop() + + +_session_singleton = None + + +def get_session(): + """Get or lazy-start the global Chrome session.""" + global _session_singleton + if _session_singleton is None: + _session_singleton = BrowserSession() + _session_singleton.start() + return _session_singleton + + +def close_session(): + """Close the global Chrome session (called on exit).""" + global _session_singleton + if _session_singleton is not None: + _session_singleton.close() + _session_singleton = None + + +def with_browser(func): + """Run func(session) using the persistent Chrome session. + If the session crashed (target closed etc.), reset and retry once.""" + session = get_session() + try: + return func(session) + except Exception as e: + msg = str(e).lower() + if "target" in msg or "browser" in msg or "closed" in msg or "disconnected" in msg: + print(" Browser session lost, restarting...") + close_session() + return func(get_session()) + raise + + +# ── Cloudflare ───────────────────────────────────────────── + + +def _wait_for_cf_on_page(page, timeout=120): + """Wait for CF to resolve on a specific page.""" + for i in range(timeout): + try: + title = page.title() + except Exception: + time.sleep(1) + continue + if "Just a moment" in title or "challenge" in page.url: + time.sleep(1) + continue + if title and ("嗨皮漫画" in title or "happymh" in page.url): + return True + time.sleep(1) + return False + + +def wait_for_cloudflare(session, timeout=120): + """Wait for CF to resolve. User solves in the visible browser window.""" + page = session.page + for i in range(timeout): + try: + title = page.title() + except Exception: + time.sleep(1) + continue + if "Just a moment" in title or "challenge" in page.url: + if i == 0: + print(" CF challenge — solve in browser...") + elif i % 15 == 0: + print(f" Still waiting for CF... 
({i}s)") + time.sleep(1) + continue + if title and ("嗨皮漫画" in title or "happymh" in page.url): + return True + time.sleep(1) + print(" CF timed out.") + return False + + +# ── Happymh: chapter fetching ───────────────────────────── + + +def fetch_chapters_via_api(page, slug): + result = page.evaluate(""" + async (slug) => { + const all = []; + let total = 0; + for (let p = 1; p <= 30; p++) { + const url = `/v2.0/apis/manga/chapterByPage?code=${slug}&lang=cn&order=asc&page=${p}&_t=${Date.now()}`; + try { + const ctrl = new AbortController(); + setTimeout(() => ctrl.abort(), 10000); + const r = await fetch(url, { signal: ctrl.signal }); + if (!r.ok) { if (p === 1) return { error: r.status }; break; } + const json = await r.json(); + if (!json.data) break; + total = json.data.total || total; + let items = null; + for (const val of Object.values(json.data)) { + if (Array.isArray(val) && val.length > 0) { items = val; break; } + } + if (!items || items.length === 0) break; + for (const ch of items) { + all.push({ id: String(ch.id || ''), chapterName: ch.chapterName || ch.name || '' }); + } + if (total && all.length >= total) break; + } catch (e) { + if (p === 1) return { error: e.message }; + break; + } + } + return { chapters: all, total }; + } + """, slug) + if result and result.get("chapters") and len(result["chapters"]) > 0: + chapters = result["chapters"] + total = result.get("total", len(chapters)) + print(f" API: {len(chapters)}/{total} chapters") + return chapters + if result and result.get("error"): + print(f" API error: {result['error']}") + return None + + +def fetch_chapters_from_dom(page): + try: + page.wait_for_selector("a[href*='/mangaread/']", timeout=15000) + page.wait_for_timeout(1000) + except Exception: + return None + + for selector in ["text=展开全部", "text=查看全部", "text=全部章节", "text=展开更多", "text=更多"]: + try: + btn = page.query_selector(selector) + if btn and btn.is_visible(): + btn.click() + page.wait_for_timeout(2000) + break + except Exception: + 
continue + + try: + page.wait_for_selector(".MuiDrawer-paper", timeout=5000) + except Exception: + pass + + try: + sort_btn = page.query_selector("text=点我改变排序") + if sort_btn and sort_btn.is_visible(): + sort_btn.click() + page.wait_for_timeout(2000) + except Exception: + pass + + total = page.evaluate(""" + () => { + const spans = document.querySelectorAll('.MuiDrawer-paper span'); + for (const s of spans) { + const m = s.textContent.match(/共(\\d+)个章节/); + if (m) return parseInt(m[1]); + } + return 0; + } + """) + + for _ in range(50): + count = page.evaluate("document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length") + if total and count >= total: + break + clicked = page.evaluate(""" + () => { + const walker = document.createTreeWalker( + document.querySelector('.MuiDrawer-paper') || document.body, NodeFilter.SHOW_TEXT + ); + while (walker.nextNode()) { + if (walker.currentNode.textContent.includes('加载更多')) { + let el = walker.currentNode.parentElement; + while (el && el.tagName !== 'LI') el = el.parentElement; + if (el) { el.click(); return true; } + walker.currentNode.parentElement.click(); + return true; + } + } + return false; + } + """) + if not clicked: + break + page.wait_for_timeout(1000) + + chapters = page.evaluate(""" + () => { + const container = document.querySelector('.MuiDrawer-paper') || document; + const links = container.querySelectorAll('a[href*="/mangaread/"]'); + const chapters = [], seen = new Set(); + links.forEach(a => { + const match = a.getAttribute('href').match(/\\/mangaread\\/[^/]+\\/(\\d+)/); + if (match && !seen.has(match[1])) { + seen.add(match[1]); + const name = a.textContent.trim(); + if (name && name !== '开始阅读') chapters.push({ id: match[1], chapterName: name }); + } + }); + return chapters; + } + """) + + try: + page.keyboard.press("Escape") + except Exception: + pass + return chapters if chapters else None + + +# ── Happymh: metadata & cover ───────────────────────────── + + +def fetch_metadata(page): + 
html_text = page.content() + metadata = {"mg-url": page.url} + m = re.search(r'

(.*?)

', html_text) + if m: + metadata["mg-title"] = m.group(1).strip() + m = re.search(r'

.*?]*>(.*?)', html_text, re.DOTALL) + if m: + metadata["mg-author"] = m.group(1).strip() + genre_matches = re.findall(r'

.*?

', html_text, re.DOTALL) + if genre_matches: + metadata["mg-genres"] = re.findall(r']*>(.*?)', genre_matches[0]) + m = re.search(r'
.*?]*>(.*?)

', html_text, re.DOTALL) + if m: + metadata["mg-description"] = m.group(1).strip() + if not metadata.get("mg-description"): + m = re.search(r']*>(.*?)', html_text, re.DOTALL) + if m: + desc = re.sub(r'<[^>]+>', '', m.group(1)).strip() + if desc: + metadata["mg-description"] = desc + cover_url = page.evaluate(""" + () => { + const og = document.querySelector('meta[property="og:image"]'); + if (og) return og.content; + for (const sel of ['img.mg-cover', 'img[src*="mcover"]']) { + const img = document.querySelector(sel); + if (img && img.src) return img.src; + } + return null; + } + """) + if cover_url: + metadata["mg-cover"] = cover_url + return metadata + + +# ── Happymh: image download ─────────────────────────────── + + +def _try_get_chapter_images(page, slug, chapter_id): + """Single attempt to get chapter images. Returns (images, api_status).""" + captured_images = [] + api_info = {"found": False, "status": None, "error": None} + + def on_response(response): + if "/apis/manga/reading" not in response.url: + return + # Only capture our chapter, skip prefetched ones + if f"cid={chapter_id}" not in response.url and f"cid%3D{chapter_id}" not in response.url: + return + # Ignore if we already captured images (prevent duplicate/prefetch) + if captured_images: + return + api_info["found"] = True + api_info["status"] = response.status + if response.status != 200: + api_info["error"] = f"status {response.status}" + return + try: + data = response.json() + # Verify chapter ID in response body + resp_cid = str(data.get("data", {}).get("id", "")) + if resp_cid and resp_cid != str(chapter_id): + return + scans = data.get("data", {}).get("scans", []) + if isinstance(scans, str): + scans = json.loads(scans) + for scan in scans: + if isinstance(scan, dict) and "url" in scan: + captured_images.append({ + "url": scan["url"], + "no_referrer": scan.get("r", 0) != 0, + }) + except Exception as e: + api_info["error"] = str(e) + + page.on("response", on_response) + reader_url = 
f"{BASE_URL}/mangaread/{slug}/{chapter_id}" + try: + page.evaluate(f"window.location.href = '{reader_url}'") + except Exception: + pass + hide_chrome() + + time.sleep(2) + try: + page.evaluate("window.close = () => {}") + except Exception: + pass + + if not _wait_for_cf_on_page(page, timeout=90): + try: + page.remove_listener("response", on_response) + except Exception: + pass + return [], api_info + + deadline = time.time() + 20 + while time.time() < deadline: + if captured_images: + break + try: + page.wait_for_timeout(500) + except Exception: + break + + try: + page.remove_listener("response", on_response) + except Exception: + pass + + if not api_info["found"]: + print(" API not intercepted") + elif api_info["error"]: + print(f" API: {api_info['error']}") + + # Filter out next-chapter preview images by counting DOM containers + if captured_images: + try: + counts = page.evaluate(""" + () => { + const all = document.querySelectorAll('[class*="imgContainer"]').length; + const next = document.querySelectorAll('[class*="imgNext"]').length; + return { all, next, current: all - next }; + } + """) + if counts and counts.get("next", 0) > 0: + actual = counts["current"] + if 0 < actual < len(captured_images): + captured_images = captured_images[:actual] + except Exception: + pass + + # DOM fallback + if not captured_images: + try: + page.wait_for_timeout(3000) + dom_images = page.evaluate(""" + () => { + const imgs = document.querySelectorAll('img[src*="http"]'); + const nextImgs = new Set( + Array.from(document.querySelectorAll('[class*="imgNext"] img')) + .map(img => img.src) + ); + const urls = [], seen = new Set(); + imgs.forEach(img => { + const src = img.src || ''; + if (src && !seen.has(src) && !nextImgs.has(src) + && !src.includes('/mcover/') + && !src.includes('cloudflare') && !src.includes('.svg')) { + seen.add(src); urls.push(src); + } + }); + return urls; + } + """) + if dom_images: + print(f" DOM: {len(dom_images)} images") + for u in dom_images: + 
captured_images.append({"url": u, "no_referrer": False}) + except Exception as e: + print(f" DOM failed: {e}") + + return captured_images, api_info + + +def get_chapter_images(page, slug, chapter_id): + """Get chapter images using given page. On API 403, returns empty (caller should handle CF).""" + images, api_info = _try_get_chapter_images(page, slug, chapter_id) + return images, api_info + + +def fetch_all_pages(page, images, max_attempts=3): + """Fetch all pages with retry using given page. Returns {page_num: bytes}.""" + total = len(images) + page_bytes = {} + pending = list(enumerate(images, 1)) + + for attempt in range(1, max_attempts + 1): + if not pending: + break + if attempt > 1: + time.sleep(2) + + next_pending = [] + for pn, img in pending: + body = fetch_image_bytes(page, img) + if body: + page_bytes[pn] = body + else: + next_pending.append((pn, img)) + time.sleep(0.1) + pending = next_pending + + return page_bytes + + +def _fetch_via_page(page, url, ref_policy): + try: + with page.expect_response(lambda r: url.split("?")[0] in r.url, timeout=15000) as resp_info: + page.evaluate("([u, r]) => fetch(u, { referrerPolicy: r })", [url, ref_policy]) + response = resp_info.value + if response.status == 200: + body = response.body() + if body and len(body) > 100: + return body + except Exception: + pass + return None + + +def fetch_image_bytes(page, img): + """Fetch image via browser network stack using given page. Tries URL variants on failure.""" + url = img["url"] + ref_policy = "no-referrer" if img.get("no_referrer") else "origin" + + # Try original URL + body = _fetch_via_page(page, url, ref_policy) + if body: + return body + + # Fallback: strip query string (e.g., ?q=50) + if "?" 
in url: + body = _fetch_via_page(page, url.split("?")[0], ref_policy) + if body: + return body + + return None + + +def download_image(page, img, save_path): + """Fetch image and save to disk.""" + if save_path.exists(): + return True + body = fetch_image_bytes(page, img) + if body: + save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.write_bytes(body) + return True + return False + + +# ── R2 / Upload ──────────────────────────────────────────── + + +WEBP_QUALITY = 75 + + +def _to_webp_bytes(img, quality=WEBP_QUALITY, method=6): + buf = io.BytesIO() + img.save(buf, format="WEBP", quality=quality, method=method) + return buf.getvalue() + + +def convert_to_webp(source, quality=WEBP_QUALITY): + return _to_webp_bytes(Image.open(source), quality) + + +def probe_and_webp(source, quality=WEBP_QUALITY): + """Open once; return (width, height, webp_bytes).""" + with Image.open(source) as img: + return img.width, img.height, _to_webp_bytes(img, quality) + + +def insert_pages(cur, chapter_id, page_urls): + """page_urls: {page_num: (url, width, height)}. 
Inserts in page_num order.""" + for pn in sorted(page_urls): + url, w, h = page_urls[pn] + cur.execute( + 'INSERT INTO "Page" ("chapterId", number, "imageUrl", width, height) VALUES (%s, %s, %s, %s, %s)', + (chapter_id, pn, url, w, h), + ) + + +def make_cover(source, width=400, height=560): + img = Image.open(source) + target_ratio = width / height + img_ratio = img.width / img.height + if img_ratio > target_ratio: + new_width = int(img.height * target_ratio) + left = (img.width - new_width) // 2 + img = img.crop((left, 0, left + new_width, img.height)) + else: + new_height = int(img.width / target_ratio) + img = img.crop((0, 0, img.width, new_height)) + img = img.resize((width, height), Image.LANCZOS) + return _to_webp_bytes(img, quality=80) + + +def upload_to_r2(key, data, content_type="image/webp"): + _ensure_config() + s3.put_object(Bucket=BUCKET, Key=key, Body=data, ContentType=content_type) + return f"{PUBLIC_URL}/{key}" + + +def r2_key_exists(key): + _ensure_config() + try: + s3.head_object(Bucket=BUCKET, Key=key) + return True + except s3.exceptions.ClientError: + return False + + +def get_db(): + _ensure_config() + conn = psycopg2.connect(DATABASE_URL) + conn.set_client_encoding("UTF8") + return conn + + +def parse_chapter_dir(dir_name): + m = re.match(r"^(\d+)\s+(.+)$", dir_name) + if m: + return int(m.group(1)), m.group(2) + return 0, dir_name + + +# ── Helpers ──────────────────────────────────────────────── + + +def load_manga_urls(): + if not MANGA_JSON.exists(): + return [] + data = json.loads(MANGA_JSON.read_text(encoding="utf-8")) + return data if isinstance(data, list) else [] + + +def slug_from_url(url): + return urlparse(url).path.strip("/").split("/")[-1] + + +def get_existing_chapters(manga_dir): + existing = set() + if manga_dir.exists(): + for entry in manga_dir.iterdir(): + if entry.is_dir() and any(entry.glob("*.jpg")): + existing.add(entry.name) + return existing + + +def list_local_manga(): + if not CONTENT_DIR.exists(): + return [] + 
return sorted(d.name for d in CONTENT_DIR.iterdir() if d.is_dir() and not d.name.startswith(".")) + + +# ── Core: download manga ────────────────────────────────── + + +def load_manga_page(session, slug): + """Navigate to manga page, pass CF, return (chapters, metadata, cover_bytes) or None.""" + cover_responses = {} + + def on_cover(response): + if "/mcover/" in response.url and response.status == 200: + try: + cover_responses[response.url] = response.body() + except Exception: + pass + + page = session.page + page.on("response", on_cover) + + print(" Loading manga page...") + try: + page.goto(f"{BASE_URL}/manga/{slug}", wait_until="commit", timeout=60000) + except Exception: + pass + hide_chrome() + if not wait_for_cloudflare(session): + page = session.page + try: + page.remove_listener("response", on_cover) + except Exception: + pass + return None + + page = session.page # may have changed after CF restart + print(" Fetching chapters...") + chapters = fetch_chapters_via_api(page, slug) + if not chapters: + print(" API failed, trying DOM...") + chapters = fetch_chapters_from_dom(page) + + metadata = fetch_metadata(page) + + # Wait for cover image to be present in DOM (up to 8s) + cover_url = None + for _ in range(16): + cover_url = page.evaluate(""" + () => { + const sels = ['img.mip-fill-content[src*="mcover"]', 'img[src*="/mcover/"]']; + for (const s of sels) { + const img = document.querySelector(s); + if (img && img.src) return img.src; + } + return null; + } + """) + if cover_url: + break + page.wait_for_timeout(500) + + # Give the response another moment to be captured + if cover_url and cover_url not in cover_responses: + page.wait_for_timeout(1500) + + try: + page.remove_listener("response", on_cover) + except Exception: + pass + + cover_body = None + if cover_url: + cover_body = cover_responses.get(cover_url) + if not cover_body: + for url, data in cover_responses.items(): + if cover_url.split("?")[0] in url or url.split("?")[0] in cover_url: + 
cover_body = data + break + + if not cover_body: + if cover_url: + print(f" Cover URL found but body not captured ({len(cover_responses)} responses)") + else: + print(f" No cover URL found in DOM") + + return chapters, metadata, cover_body + + +def save_manga_local(slug, metadata, cover_body): + """Save metadata and cover to local manga-content/.""" + manga_dir = CONTENT_DIR / slug + manga_dir.mkdir(parents=True, exist_ok=True) + + detail_path = manga_dir / "detail.json" + if metadata: + existing = {} + if detail_path.exists(): + try: + existing = json.loads(detail_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + pass + existing.update(metadata) + detail_path.write_text(json.dumps(existing, ensure_ascii=False, indent=4), encoding="utf-8") + + cover_path = manga_dir / "cover.jpg" + if not cover_path.exists() and cover_body and len(cover_body) > 100: + cover_path.write_bytes(cover_body) + print(f" Cover saved ({len(cover_body)} bytes)") + + +def download_chapter(session, slug, chapter_index, chapter, manga_dir): + """Download a single chapter's images. 
Returns True if successful.""" + ch_id = chapter["id"] + ch_name = chapter["chapterName"] + folder_name = f"{chapter_index} {ch_name}" + chapter_dir = manga_dir / folder_name + + images, _ = get_chapter_images(session.page, slug, ch_id) + if not images: + print(f" No images") + return False + + print(f" {len(images)} pages") + chapter_dir.mkdir(parents=True, exist_ok=True) + + page_bytes = fetch_all_pages(session.page, images) + ok = 0 + for pn, body in page_bytes.items(): + save_path = chapter_dir / f"{pn}.jpg" + save_path.write_bytes(body) + ok += 1 + + print(f" {ok}/{len(images)} downloaded" + " " * 20) + + if ok < len(images): + try: + chapter_dir.rmdir() + except Exception: + pass + return False + + time.sleep(REQUEST_DELAY) + return True + + +# ── Core: upload manga ──────────────────────────────────── + + +def upload_manga_to_r2(manga_name, conn): + """Upload a local manga to R2 and create DB records.""" + manga_path = CONTENT_DIR / manga_name + detail_path = manga_path / "detail.json" + + if not detail_path.exists(): + print(f" Skipping {manga_name}: no detail.json") + return + + detail = json.loads(detail_path.read_text(encoding="utf-8")) + title = detail.get("mg-title", manga_name) + slug = manga_name + genres = detail.get("mg-genres", []) + description = detail.get("mg-description", "") + genre = ", ".join(genres) if genres else "Drama" + + cur = conn.cursor() + + # Cover + cover_file = manga_path / "cover.jpg" + cover_url = "" + cover_key = f"manga/{slug}/cover.webp" + if cover_file.exists(): + if not r2_key_exists(cover_key): + cover_url = upload_to_r2(cover_key, make_cover(cover_file)) + print(f" Cover uploaded") + else: + cover_url = f"{PUBLIC_URL}/{cover_key}" + + # Manga record + cur.execute('SELECT id, "coverUrl" FROM "Manga" WHERE slug = %s', (slug,)) + row = cur.fetchone() + if row: + manga_id, existing_cover = row + if cover_url and cover_url != existing_cover: + cur.execute('UPDATE "Manga" SET "coverUrl" = %s, "updatedAt" = NOW() WHERE id = 
%s', (cover_url, manga_id)) + conn.commit() + else: + cur.execute( + 'INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt") ' + "VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW()) RETURNING id", + (title, description, cover_url, slug, genre), + ) + manga_id = cur.fetchone()[0] + conn.commit() + print(f" Created manga (id: {manga_id})") + + # Chapters + chapter_dirs = sorted( + [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")], + key=lambda d: parse_chapter_dir(d.name)[0], + ) + + for chapter_dir in chapter_dirs: + order_num, chapter_title = parse_chapter_dir(chapter_dir.name) + if order_num == 0: + continue + + cur.execute('SELECT id FROM "Chapter" WHERE "mangaId" = %s AND number = %s', (manga_id, order_num)) + if cur.fetchone(): + print(f" [{order_num}] {chapter_title} — skip") + continue + + page_files = sorted( + [f for f in chapter_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png", ".webp")], + key=lambda f: int(re.search(r"(\d+)", f.stem).group(1)) if re.search(r"(\d+)", f.stem) else 0, + ) + if not page_files: + continue + + print(f" [{order_num}] {chapter_title} ({len(page_files)} pages)") + + # Upload to R2 first + def process_page(args, _slug=slug, _order=order_num): + j, pf = args + r2_key = f"manga/{_slug}/chapters/{_order}/{j}.webp" + if r2_key_exists(r2_key): + with Image.open(pf) as img: + return j, f"{PUBLIC_URL}/{r2_key}", img.width, img.height + w, h, webp = probe_and_webp(pf) + return j, upload_to_r2(r2_key, webp), w, h + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)} + for future in as_completed(futures): + j, url, w, h = future.result() + page_urls[j] = (url, w, h) + done += 1 + print(f" {done}/{len(page_files)}", end="\r") + + if not page_urls: + print(f" Upload failed, skip") + continue + + # DB records only after R2 upload 
succeeds + cur.execute( + 'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id', + (manga_id, order_num, chapter_title), + ) + chapter_id = cur.fetchone()[0] + insert_pages(cur, chapter_id, page_urls) + conn.commit() + print(f" {len(page_files)} pages uploaded" + " " * 10) + + +# ── Commands ─────────────────────────────────────────────── + + +def cmd_setup(): + print("\n Chrome will open. Solve Cloudflare on:") + print(" 1. m.happymh.com") + print(" 2. Any manga page") + print(" 3. Any reader page\n") + + session = get_session() + try: + session.page.goto(BASE_URL, wait_until="commit", timeout=60000) + except Exception: + pass + + # Bring Chrome to front for setup + try: + subprocess.Popen( + ["osascript", "-e", 'tell application "Google Chrome" to activate'], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + except Exception: + pass + + input(" Press ENTER when done... ") + + cookies = session.browser.contexts[0].cookies() + cf = [c for c in cookies if c["name"] == "cf_clearance"] + print(f" cf_clearance: {'found' if cf else 'NOT found'}") + hide_chrome() + print() + + +def cmd_download(manga_url=None, chapter_set=None): + """Download manga. chapter_set is a set of 1-based indices, or None for all.""" + urls = [manga_url] if manga_url else load_manga_urls() + if not urls: + print(" No URLs in manga.json") + return + + print(f"\n Downloading {len(urls)} manga(s)... 
(ESC to stop)\n") + + def run(session): + with EscListener() as esc: + for url in urls: + if esc.stop.is_set(): + break + slug = slug_from_url(url) + try: + result = load_manga_page(session, slug) + if not result: + continue + chapters, metadata, cover_body = result + if not chapters: + print(" No chapters found.") + continue + print(f" Found {len(chapters)} chapters") + save_manga_local(slug, metadata, cover_body) + + existing = get_existing_chapters(CONTENT_DIR / slug) + + for i, ch in enumerate(chapters, 1): + if esc.stop.is_set(): + break + if chapter_set and i not in chapter_set: + continue + if any(ch["chapterName"] in name for name in existing): + print(f" [{i}/{len(chapters)}] {ch['chapterName']} — skip") + continue + print(f" [{i}/{len(chapters)}] {ch['chapterName']} (id={ch['id']})") + download_chapter(session, slug, i, ch, CONTENT_DIR / slug) + print(f"\n Done: {slug}") + except Exception as e: + print(f"\n Error: {url}: {e}") + import traceback + traceback.print_exc() + + with_browser(run) + print("\nDownload complete!") + + +def cmd_upload(manga_name=None): + if manga_name: + names = [manga_name] + else: + names = list_local_manga() + if not names: + print(" No manga in manga-content/") + return + + print(f"\n Uploading {len(names)} manga(s)... 
(ESC to stop)") + conn = get_db() + try: + with EscListener() as esc: + for name in names: + if esc.stop.is_set(): + break + print(f"\n {'='*50}") + print(f" {name}") + print(f" {'='*50}") + upload_manga_to_r2(name, conn) + finally: + conn.close() + print("\nUpload complete!") + + +def cmd_sync(manga_url=None): + """Sync: fetch latest chapters, stream directly to R2 (no local save).""" + urls = [manga_url] if manga_url else load_manga_urls() + if not urls: + print(" No URLs in manga.json") + return + + conn = get_db() + + def run(session): + with EscListener() as esc: + for url in urls: + if esc.stop.is_set(): + break + slug = slug_from_url(url) + + print(f"\n{'='*60}") + print(f"Syncing: {slug}") + print(f"{'='*60}") + + # 1. Load manga page + get chapters + result = load_manga_page(session, slug) + if not result: + continue + chapters, metadata, cover_body = result + if not chapters: + print(" No chapters found.") + continue + print(f" {len(chapters)} chapters on site") + + # 2. Ensure manga in DB + cur = conn.cursor() + title = metadata.get("mg-title", slug) + genres = metadata.get("mg-genres", []) + description = metadata.get("mg-description", "") + genre = ", ".join(genres) if genres else "Drama" + + # Cover → R2 (from RAM) + cover_url = "" + cover_key = f"manga/{slug}/cover.webp" + if cover_body and len(cover_body) > 100: + if not r2_key_exists(cover_key): + cover_webp = make_cover(io.BytesIO(cover_body)) + cover_url = upload_to_r2(cover_key, cover_webp) + print(f" Cover uploaded to R2") + else: + cover_url = f"{PUBLIC_URL}/{cover_key}" + + cur.execute('SELECT id FROM "Manga" WHERE slug = %s', (slug,)) + row = cur.fetchone() + if row: + manga_id = row[0] + # Refresh metadata fields (cover only updated if we have a new one) + if cover_url: + cur.execute( + 'UPDATE "Manga" SET title = %s, description = %s, genre = %s, ' + '"coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s', + (title, description, genre, cover_url, manga_id), + ) + else: + cur.execute( + 
'UPDATE "Manga" SET title = %s, description = %s, genre = %s, ' + '"updatedAt" = NOW() WHERE id = %s', + (title, description, genre, manga_id), + ) + conn.commit() + print(f" Updated metadata (genre: {genre})") + else: + cur.execute( + 'INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt") ' + "VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW()) RETURNING id", + (title, description, cover_url, slug, genre), + ) + manga_id = cur.fetchone()[0] + conn.commit() + print(f" Created manga in DB (id: {manga_id})") + + # 3. Find chapters missing from DB + cur.execute('SELECT number FROM "Chapter" WHERE "mangaId" = %s', (manga_id,)) + existing_numbers = {row[0] for row in cur.fetchall()} + + # 3. Collect chapters to sync + todo = [(i, ch) for i, ch in enumerate(chapters, 1) if i not in existing_numbers] + + if not todo: + print(" Already up to date!") + continue + + print(f" {len(todo)} new chapters to sync") + + completed = 0 + skipped = 0 + for i, ch in todo: + if esc.stop.is_set(): + break + ch_name = ch["chapterName"] + print(f" [{i}/{len(chapters)}] {ch_name} (id={ch['id']})") + + images, api_info = get_chapter_images(session.page, slug, ch["id"]) + if not images and api_info.get("status") == 403: + print(f" CF blocked — run Setup and try again") + esc.stop.set() + break + if not images: + print(f" No images") + skipped += 1 + continue + + print(f" {len(images)} pages") + page_bytes = fetch_all_pages(session.page, images) + if len(page_bytes) < len(images): + missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes] + print(f" Could not fetch pages: {missing}, skipping chapter") + skipped += 1 + continue + + def upload_one(args, _slug=slug, _i=i): + pn, raw = args + r2_key = f"manga/{_slug}/chapters/{_i}/{pn}.webp" + w, h, webp = probe_and_webp(io.BytesIO(raw)) + return pn, upload_to_r2(r2_key, webp), w, h + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for 
pn, r2_url, w, h in pool.map(upload_one, page_bytes.items()): + page_urls[pn] = (r2_url, w, h) + done += 1 + print(f" R2: {done}/{len(page_bytes)}", end="\r") + + if not page_urls: + skipped += 1 + continue + + cur.execute( + 'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id', + (manga_id, i, ch_name), + ) + chapter_id = cur.fetchone()[0] + insert_pages(cur, chapter_id, page_urls) + conn.commit() + completed += 1 + print(f" {len(page_urls)} pages synced" + " " * 20) + time.sleep(REQUEST_DELAY) + + print(f" Synced {completed}/{len(todo)} chapters ({skipped} skipped)") + + try: + with_browser(run) + finally: + conn.close() + + print("\nSync complete!") + + +def r2_list_prefixes(): + """List manga slugs in R2 by scanning top-level prefixes under manga/.""" + _ensure_config() + slugs = set() + paginator = s3.get_paginator("list_objects_v2") + for pg in paginator.paginate(Bucket=BUCKET, Prefix="manga/", Delimiter="/"): + for prefix in pg.get("CommonPrefixes", []): + # "manga/slug/" -> "slug" + slug = prefix["Prefix"].split("/")[1] + if slug: + slugs.add(slug) + return sorted(slugs) + + +def r2_count_by_prefix(prefix): + """Count objects under a prefix.""" + _ensure_config() + total = 0 + for pg in s3.get_paginator("list_objects_v2").paginate(Bucket=BUCKET, Prefix=prefix): + total += len(pg.get("Contents", [])) + return total + + +def r2_delete_prefix(prefix): + """Delete all objects under a prefix.""" + _ensure_config() + total = 0 + batches = [] + for pg in s3.get_paginator("list_objects_v2").paginate(Bucket=BUCKET, Prefix=prefix): + objects = pg.get("Contents", []) + if objects: + batches.append([{"Key": obj["Key"]} for obj in objects]) + + def delete_batch(keys): + s3.delete_objects(Bucket=BUCKET, Delete={"Objects": keys}) + return len(keys) + + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for count in pool.map(delete_batch, batches): + total += count + print(f" {total} deleted", end="\r") + print(f" {total} objects 
deleted" + " " * 10) + return total + + +def r2_recompress(slug, quality=65): + """Download all webp images for a manga, re-encode at lower quality, re-upload.""" + _ensure_config() + prefix = f"manga/{slug}/" + keys = [] + for pg in s3.get_paginator("list_objects_v2").paginate(Bucket=BUCKET, Prefix=prefix): + for obj in pg.get("Contents", []): + if obj["Key"].endswith(".webp"): + keys.append(obj["Key"]) + + if not keys: + print(f" No webp files for {slug}") + return + + print(f" {len(keys)} files to recompress (quality={quality})") + saved_total = 0 + failed = 0 + + def recompress_one(key): + try: + original = s3.get_object(Bucket=BUCKET, Key=key)["Body"].read() + new_data = _to_webp_bytes(Image.open(io.BytesIO(original)), quality=quality) + saved = len(original) - len(new_data) + if saved > 0: + s3.put_object(Bucket=BUCKET, Key=key, Body=new_data, ContentType="image/webp") + return saved + return 0 + except Exception: + return -1 + + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for saved in pool.map(recompress_one, keys): + done += 1 + if saved < 0: + failed += 1 + else: + saved_total += saved + print(f" {done}/{len(keys)} — saved {saved_total // 1024} KB", end="\r") + + msg = f" Done: {done}/{len(keys)} processed, {saved_total // (1024 * 1024)} MB saved" + if failed: + msg += f" ({failed} failed)" + print(msg + " " * 10) + + +# ── TUI ──────────────────────────────────────────────────── + + +def tui_select(title, options, back=True, search=False): + """Arrow-key menu. 
Returns selected index or -1.""" + items = list(options) + if back: + items.append("[Back]") + menu = TerminalMenu( + items, + title=title, + search_key="/" if search else None, + show_search_hint=search, + ) + idx = menu.show() + if idx is None or (back and idx == len(items) - 1): + return -1 + return idx + + +_title_cache = {} + +def get_manga_title(slug): + """Read manga title from detail.json or DB, fallback to slug.""" + if slug in _title_cache: + return _title_cache[slug] + # Try local detail.json first + detail_path = CONTENT_DIR / slug / "detail.json" + if detail_path.exists(): + try: + detail = json.loads(detail_path.read_text(encoding="utf-8")) + title = detail.get("mg-title") + if title: + _title_cache[slug] = title + return title + except Exception: + pass + # Try database (batch load all titles) + try: + conn = get_db() + cur = conn.cursor() + cur.execute('SELECT slug, title FROM "Manga"') + for row in cur.fetchall(): + _title_cache[row[0]] = row[1] + conn.close() + if slug in _title_cache: + return _title_cache[slug] + except Exception: + pass + return slug + + +def manga_display_name(slug): + """Format: 'title (slug)' or just 'slug'.""" + title = get_manga_title(slug) + if title != slug: + return f"{title} ({slug})" + return slug + + +def tui_pick_manga_url(include_all=True): + """Pick manga from manga.json. Shows title + slug.""" + urls = load_manga_urls() + if not urls: + print(" No URLs in manga.json") + return None + slugs = [slug_from_url(u) for u in urls] + items = [] + if include_all: + items.append("All manga") + items += [f"{i+1}. {manga_display_name(s)}" for i, s in enumerate(slugs)] + idx = tui_select("Select manga (/ to search):", items, search=True) + if idx < 0: + return None + if include_all: + if idx == 0: + return "__all__" + return urls[idx - 1] + return urls[idx] + + +def tui_pick_local(include_all=True): + """Pick from local manga-content/. 
Shows title + slug.""" + local = list_local_manga() + if not local: + print(" No manga in manga-content/") + return None + items = [] + if include_all: + items.append("All manga") + items += [f"{i+1}. {manga_display_name(name)}" for i, name in enumerate(local)] + idx = tui_select("Select manga (/ to search):", items, search=True) + if idx < 0: + return None + if include_all: + if idx == 0: + return "__all__" + return local[idx - 1] + return local[idx] + + +def tui_pick_r2(): + """Pick manga from R2. Shows title + slug.""" + slugs = r2_list_prefixes() + if not slugs: + print(" R2 is empty") + return None + items = [f"{i+1}. {manga_display_name(s)}" for i, s in enumerate(slugs)] + idx = tui_select("Select manga (/ to search):", items, search=True) + return slugs[idx] if idx >= 0 else None + + +def tui_pick_chapters(chapters, slug=None): + """Multi-select chapter picker. Space to toggle, Enter to confirm. + Existing chapters shown grayed out. Returns set of selected 1-based indices, or None for all.""" + # Check which chapters already exist locally + existing = set() + if slug: + existing = get_existing_chapters(CONTENT_DIR / slug) + + # Count existing + existing_count = 0 + for i, ch in enumerate(chapters, 1): + if any(ch["chapterName"] in name for name in existing): + existing_count += 1 + + idx = tui_select(f"{len(chapters)} chapters ({existing_count} downloaded)", [ + "All chapters (skip existing)", + "Select chapters (space to toggle)", + ]) + if idx == -1: + return "back" + if idx == 0: + return None # all + + items = [] + for i, ch in enumerate(chapters, 1): + done = any(ch["chapterName"] in name for name in existing) + label = f"{i}. 
{ch['chapterName']}" + if done: + label = f"\033[90m{label} [done]\033[0m" + items.append(label) + + menu = TerminalMenu( + items, + title="Space=toggle, Enter=confirm, /=search:", + multi_select=True, + show_multi_select_hint=True, + search_key="/", + show_search_hint=True, + ) + selected = menu.show() + if selected is None: + return "back" + if isinstance(selected, int): + selected = (selected,) + return {i + 1 for i in selected} # 1-based + + +def tui_download(): + picked = tui_pick_manga_url() + if not picked: + return + if picked == "__all__": + cmd_download() + return + + slug = slug_from_url(picked) + print(f"\n Fetching chapters for {slug}...") + + def get_chapters(session): + page = session.page + try: + page.goto(f"{BASE_URL}/manga/{slug}", wait_until="commit", timeout=60000) + except Exception: + pass + if not wait_for_cloudflare(session): + return None + return fetch_chapters_via_api(session.page, slug) + + chapters = with_browser(get_chapters) + + if not chapters: + print(" Could not get chapters") + return + + result = tui_pick_chapters(chapters, slug=slug) + if result == "back": + return + cmd_download(picked, chapter_set=result) + + +def tui_upload(): + picked = tui_pick_local() + if not picked: + return + if picked == "__all__": + cmd_upload() + else: + cmd_upload(picked) + + +def tui_sync(): + picked = tui_pick_manga_url() + if not picked: + return + if picked == "__all__": + cmd_sync() + else: + cmd_sync(picked) + + +def tui_edit_manga(): + """Edit manga metadata (title, description, genre, status) in DB.""" + try: + conn = get_db() + cur = conn.cursor() + cur.execute('SELECT slug, title FROM "Manga" ORDER BY title') + rows = cur.fetchall() + except Exception as e: + print(f" DB error: {e}") + return + + if not rows: + print(" No manga in DB") + conn.close() + return + + items = [f"{i+1}. 
{title} ({slug})" for i, (slug, title) in enumerate(rows)] + sel = tui_select("Select manga to edit (/ to search):", items, search=True) + if sel < 0: + conn.close() + return + + slug, _ = rows[sel] + cur.execute('SELECT id, title, description, genre, status, "coverUrl" FROM "Manga" WHERE slug = %s', (slug,)) + row = cur.fetchone() + if not row: + print(" Not found") + conn.close() + return + mid, title, description, genre, status, cover_url = row + + while True: + print(f"\n Editing: {slug}") + print(f" title: {title}") + print(f" description: {(description or '')[:80]}{'...' if description and len(description) > 80 else ''}") + print(f" genre: {genre}") + print(f" status: {status}") + print(f" coverUrl: {cover_url}") + + idx = tui_select("Edit field", [ + "title", "description", "genre", "status", "coverUrl", + "Save & exit", "Discard & exit", + ]) + if idx == -1 or idx == 6: + print(" Discarded.") + break + if idx == 5: + cur.execute( + 'UPDATE "Manga" SET title = %s, description = %s, genre = %s, ' + 'status = %s, "coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s', + (title, description, genre, status, cover_url, mid), + ) + conn.commit() + print(" Saved.") + break + + if idx == 3: # status + opts = ["PUBLISHED", "DRAFT", "HIDDEN"] + s_idx = tui_select("Status:", opts) + if s_idx >= 0: + status = opts[s_idx] + else: + field_name = ["title", "description", "genre", "status", "coverUrl"][idx] + current = {"title": title, "description": description or "", "genre": genre, "coverUrl": cover_url or ""}[field_name] + print(f" Current: {current}") + new_val = input(f" New {field_name} (empty=keep): ").strip() + if new_val: + if idx == 0: title = new_val + elif idx == 1: description = new_val + elif idx == 2: genre = new_val + elif idx == 4: cover_url = new_val + + conn.close() + + +def _pick_manga_and_chapters(conn, prompt="Select chapters", multi=True): + """Helper: pick manga from DB, then pick chapter(s). 
Returns (slug, [(ch_id, ch_num, ch_title), ...]) or None.""" + cur = conn.cursor() + cur.execute('SELECT id, slug, title FROM "Manga" ORDER BY title') + mangas = cur.fetchall() + if not mangas: + print(" No manga in DB") + return None + + items = [f"{i+1}. {title} ({slug})" for i, (_, slug, title) in enumerate(mangas)] + sel = tui_select("Select manga (/ to search):", items, search=True) + if sel < 0: + return None + manga_id, slug, _ = mangas[sel] + + cur.execute('SELECT id, number, title FROM "Chapter" WHERE "mangaId" = %s ORDER BY number', (manga_id,)) + chapters = cur.fetchall() + if not chapters: + print(" No chapters in DB for this manga") + return None + + if multi: + scope = tui_select(f"{prompt}: {len(chapters)} chapters", [ + "All chapters", + "Select specific chapters", + ]) + if scope == -1: + return None + if scope == 0: + return slug, list(chapters) + + items = [f"{num}. {title}" for _, num, title in chapters] + menu = TerminalMenu( + items, + title="Space=toggle, Enter=confirm, /=search:", + multi_select=True, + show_multi_select_hint=True, + search_key="/", + show_search_hint=True, + ) + selected = menu.show() + if not selected: + return None + if isinstance(selected, int): + selected = (selected,) + picked = [chapters[i] for i in selected] + else: + items = [f"{num}. {title}" for _, num, title in chapters] + sel = tui_select(f"{prompt} (/ to search):", items, search=True) + if sel < 0: + return None + picked = [chapters[sel]] + + return slug, picked + + +def tui_delete_chapter(): + """Delete specific chapter(s) from R2 + DB.""" + try: + conn = get_db() + except Exception as e: + print(f" DB error: {e}") + return + try: + result = _pick_manga_and_chapters(conn, "Select chapters to delete") + if not result: + return + slug, to_delete = result + confirm = input(f" Delete {len(to_delete)} chapter(s) from R2 + DB? 
[y/N] ").strip().lower() + if confirm != "y": + print(" Cancelled.") + return + + cur = conn.cursor() + for ch_id, ch_num, ch_title in to_delete: + print(f" Deleting [{ch_num}] {ch_title}...") + r2_delete_prefix(f"manga/{slug}/chapters/{ch_num}/") + cur.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,)) + cur.execute('DELETE FROM "Chapter" WHERE id = %s', (ch_id,)) + conn.commit() + print(f" Done.") + finally: + conn.close() + + +def tui_check_missing_pages(): + """Check selected chapters against the site's actual page count and re-upload if mismatched.""" + try: + conn = get_db() + except Exception as e: + print(f" DB error: {e}") + return + + try: + result = _pick_manga_and_chapters(conn, "Select chapters to check") + if not result: + return + slug, selected_chapters = result + + if slug not in [slug_from_url(u) for u in load_manga_urls()]: + print(f" {slug} not in manga.json — cannot re-fetch pages") + return + except Exception: + conn.close() + raise + + # Load reader pages and compare site's actual page count vs R2 + def run(session): + with EscListener() as esc: + result = load_manga_page(session, slug) + if not result: + return + chapters, _, _ = result + if not chapters: + return + + cur2 = conn.cursor() + fixed_dims = 0 + reuploaded = 0 + + print(f"\n Checking {len(selected_chapters)} chapters...") + for ch_id, ch_num, ch_title in selected_chapters: + if esc.stop.is_set(): + break + if ch_num > len(chapters): + print(f" [{ch_num}] {ch_title}: out of range on site") + continue + + ch = chapters[ch_num - 1] + images, api_info = get_chapter_images(session.page, slug, ch["id"]) + if not images: + if api_info.get("status") == 403: + print(f" [{ch_num}] CF blocked — run Setup") + esc.stop.set() + break + print(f" [{ch_num}] {ch_title}: no images from site") + continue + + site_count = len(images) + r2_count = r2_count_by_prefix(f"manga/{slug}/chapters/{ch_num}/") + + if site_count != r2_count: + print(f" [{ch_num}] {ch_title}: site={site_count}, 
R2={r2_count} — re-uploading...") + # Re-upload IMMEDIATELY while browser is on this chapter's reader page + page_bytes = fetch_all_pages(session.page, images) + if len(page_bytes) < len(images): + missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes] + print(f" Could not fetch pages: {missing}") + for mn in missing: + print(f" page {mn}: {images[mn-1]['url']}") + print(f" Skipping chapter") + continue + + def upload_page(args, _slug=slug, _n=ch_num): + pn, raw = args + r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" + with Image.open(io.BytesIO(raw)) as img: + w, h = img.width, img.height + return pn, upload_to_r2(r2_key, convert_to_webp(io.BytesIO(raw))), w, h + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for pn, r2_url, w, h in pool.map(upload_page, page_bytes.items()): + page_urls[pn] = (r2_url, w, h) + done += 1 + print(f" R2: {done}/{len(page_bytes)}", end="\r") + + cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,)) + for pn in sorted(page_urls): + url, w, h = page_urls[pn] + cur2.execute( + 'INSERT INTO "Page" ("chapterId", number, "imageUrl", width, height) VALUES (%s, %s, %s, %s, %s)', + (ch_id, pn, url, w, h), + ) + conn.commit() + reuploaded += 1 + print(f" {len(page_urls)} pages restored" + " " * 20) + continue + + # Count matches — check if DB has valid width/height for all pages + cur2.execute( + 'SELECT COUNT(*), ' + 'COUNT(*) FILTER (WHERE width IS NULL OR width <= 0), ' + 'COUNT(*) FILTER (WHERE height IS NULL OR height <= 0), ' + 'MIN(width), MAX(width), MIN(height), MAX(height) ' + 'FROM "Page" WHERE "chapterId" = %s', + (ch_id,), + ) + db_count, bad_w, bad_h, min_w, max_w, min_h, max_h = cur2.fetchone() + bad_count = max(bad_w, bad_h) + if bad_count > 0: + print(f" [{ch_num}] {ch_title}: {bad_count} pages need dims — fixing from R2...") + cur2.execute( + 'SELECT id, number FROM "Page" WHERE "chapterId" = %s ' + 'AND (width IS NULL OR width = 0 OR height IS 
NULL OR height = 0) ' + 'ORDER BY number', + (ch_id,), + ) + pages = cur2.fetchall() + + def read_dims(args, _slug=slug, _n=ch_num): + page_id, pn = args + r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" + try: + data = s3.get_object(Bucket=BUCKET, Key=r2_key)["Body"].read() + with Image.open(io.BytesIO(data)) as img: + return page_id, img.width, img.height + except Exception: + return page_id, None, None + + updated = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for page_id, w, h in pool.map(read_dims, pages): + if w and h: + cur2.execute( + 'UPDATE "Page" SET width = %s, height = %s WHERE id = %s', + (w, h, page_id), + ) + updated += 1 + conn.commit() + fixed_dims += 1 + print(f" {updated}/{len(pages)} dims updated") + else: + print(f" [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})") + + print(f"\n Done: {reuploaded} re-uploaded, {fixed_dims} dim-fixed") + + try: + with_browser(run) + finally: + conn.close() + + print("\nCheck complete!") + + +def tui_r2_manage(): + while True: + idx = tui_select("R2 / DB Management", [ + "Status", + "Edit manga info", + "Delete specific manga", + "Delete specific chapter", + "Check missing pages", + "Clear ALL (R2 + DB)", + "Recompress manga (quality 65)", + ]) + if idx == -1: + break + + elif idx == 0: + _ensure_config() + slug_counts = {} + total = 0 + for pg in s3.get_paginator("list_objects_v2").paginate(Bucket=BUCKET): + for obj in pg.get("Contents", []): + total += 1 + parts = obj["Key"].split("/") + if len(parts) >= 2 and parts[0] == "manga": + slug_counts[parts[1]] = slug_counts.get(parts[1], 0) + 1 + print(f"\n R2: {total} objects, {len(slug_counts)} manga") + for slug in sorted(slug_counts): + print(f" {manga_display_name(slug)}: {slug_counts[slug]} objects") + try: + conn = get_db() + cur = conn.cursor() + cur.execute('SELECT COUNT(*) FROM "Manga"') + mc = cur.fetchone()[0] + cur.execute('SELECT COUNT(*) FROM "Chapter"') + cc = cur.fetchone()[0] + 
cur.execute('SELECT COUNT(*) FROM "Page"') + pc = cur.fetchone()[0] + print(f" DB: {mc} manga, {cc} chapters, {pc} pages") + conn.close() + except Exception as e: + print(f" DB: {e}") + input("\n Press ENTER...") + + elif idx == 1: + tui_edit_manga() + + elif idx == 2: + picked = tui_pick_r2() + if not picked: + continue + confirm = input(f" Delete {picked} from R2 + DB? [y/N] ").strip().lower() + if confirm == "y": + r2_delete_prefix(f"manga/{picked}/") + try: + conn = get_db() + cur = conn.cursor() + cur.execute('SELECT id FROM "Manga" WHERE slug = %s', (picked,)) + row = cur.fetchone() + if row: + mid = row[0] + cur.execute('DELETE FROM "Page" WHERE "chapterId" IN (SELECT id FROM "Chapter" WHERE "mangaId" = %s)', (mid,)) + cur.execute('DELETE FROM "Chapter" WHERE "mangaId" = %s', (mid,)) + cur.execute('DELETE FROM "Manga" WHERE id = %s', (mid,)) + conn.commit() + print(f" Removed from R2 + DB") + conn.close() + except Exception as e: + print(f" DB error: {e}") + + elif idx == 3: + tui_delete_chapter() + + elif idx == 4: + tui_check_missing_pages() + + elif idx == 5: + confirm = input(" Delete ALL R2 + DB? [y/N] ").strip().lower() + if confirm == "y": + r2_delete_prefix("") + try: + conn = get_db() + cur = conn.cursor() + for t in ['"Page"', '"Chapter"', '"Manga"']: + cur.execute(f"DELETE FROM {t}") + conn.commit() + conn.close() + print(" All cleared") + except Exception as e: + print(f" DB error: {e}") + + elif idx == 6: + slugs = r2_list_prefixes() + if not slugs: + print(" R2 is empty") + continue + items = ["All manga"] + [f"{i+1}. {manga_display_name(s)}" for i, s in enumerate(slugs)] + sel = tui_select("Recompress which? (quality=65, overwrites originals)", items, search=True) + if sel < 0: + continue + targets = slugs if sel == 0 else [slugs[sel - 1]] + confirm = input(f" Recompress {len(targets)} manga to quality 65? 
[y/N] ").strip().lower() + if confirm != "y": + continue + for slug in targets: + print(f"\n {manga_display_name(slug)}") + r2_recompress(slug, quality=65) + + +def main(): + try: + while True: + idx = tui_select("Manga Toolkit", [ + "Setup (solve Cloudflare)", + "Download", + "Upload (local -> R2)", + "Sync (site -> R2)", + "R2 / DB management", + "Quit", + ], back=False) + + if idx is None or idx == -1 or idx == 5: + break + elif idx == 0: + cmd_setup() + elif idx == 1: + tui_download() + elif idx == 2: + tui_upload() + elif idx == 3: + tui_sync() + elif idx == 4: + tui_r2_manage() + finally: + close_session() + print("Bye!") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..659228e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +playwright +boto3 +psycopg2-binary +Pillow +python-dotenv +simple-term-menu