From fab3b413b8e6650235f72e525c588c5f8c20c799 Mon Sep 17 00:00:00 2001 From: yiekheng Date: Sun, 12 Apr 2026 08:56:05 +0800 Subject: [PATCH] Merge download.py and upload.py into unified manga.py with TUI - Single interactive script (arrow-key TUI via simple-term-menu) replaces download.py, upload.py, and export_cookies.py - Add sync command: streams new chapters site -> R2 directly without saving locally (uses RAM as cache) - Add R2/DB management submenu (status, delete specific, clear all) - Multi-select chapter picker with already-downloaded marked grayed out - Chapter list fetched via /v2.0/apis/manga/chapterByPage with pagination - Cover image captured from page network traffic (no extra fetch) - Filter prefetched next-chapter images via DOM container count - Chrome runs hidden via AppleScript on macOS (except setup mode) - DB records only created after R2 upload succeeds (no orphan rows) - Parallel R2 uploads (8 workers) with WebP method=6 quality=75 - Update CLAUDE.md to reflect new architecture - Add requirements.txt Co-Authored-By: Claude Opus 4.6 (1M context) --- CLAUDE.md | 70 ++- download.py | 686 --------------------- export_cookies.py | 92 --- manga.json | 6 +- manga.py | 1481 +++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 6 + upload.py | 393 ------------ 7 files changed, 1536 insertions(+), 1198 deletions(-) delete mode 100644 download.py delete mode 100644 export_cookies.py create mode 100644 manga.py create mode 100644 requirements.txt delete mode 100644 upload.py diff --git a/CLAUDE.md b/CLAUDE.md index e494aaf..667b1fe 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,44 +4,64 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -Manga downloader for m.happymh.com. Reads manga URLs from `manga.json` and downloads chapter images into `manga-content/`. +Manga downloader and uploader toolkit. Currently supports m.happymh.com, designed for future multi-site support. 
-## Data Flow +- `manga.py` — Single interactive CLI. Download, upload, and sync manga. Launches real Chrome via subprocess, connects via CDP, bypasses Cloudflare. Uploads to R2 + PostgreSQL. -1. **Input**: `manga.json` — JSON array of manga URLs (e.g., `["https://m.happymh.com/manga/butiange"]`) -2. **Output**: `manga-content// /*.jpg` — downloaded page images -3. **Metadata**: `manga-content//detail.json` — stores manga/chapter metadata +## Architecture + +### Anti-bot Strategy +- Chrome launched via `subprocess.Popen` (not Playwright) to avoid automation detection +- Playwright connects via CDP (`connect_over_cdp`) for scripting only +- Persistent browser profile in `.browser-data/` preserves Cloudflare sessions +- All navigation uses JS (`window.location.href`) or `page.goto` with `wait_until="commit"` +- Images downloaded via `response.body()` from network interception (no base64) + +### Data Flow +1. **Input**: `manga.json` — JSON array of manga URLs +2. **Download**: Chrome navigates to manga page → API fetches chapter list → navigates to reader pages → intercepts image URLs from API → downloads via browser fetch +3. **Local storage**: `manga-content//` with cover.jpg, detail.json, and chapter folders +4. **Upload**: Converts JPG→WebP → uploads to R2 → creates DB records + +### Key APIs (happymh) +- Chapter list: `GET /v2.0/apis/manga/chapterByPage?code=&lang=cn&order=asc&page=` +- Chapter images: `GET /v2.0/apis/manga/reading?code=&cid=` (intercepted from reader page) +- Cover: Captured from page load traffic (`/mcover/` responses) ## Directory Convention ``` manga-content/ - butiange/ - detail.json - 1 第一回/ + / + detail.json # metadata (title, author, genres, description, cover URL) + cover.jpg # cover image captured from page traffic + 1 / # chapter folder (ordered by API sequence) 1.jpg 2.jpg - 3.jpg - 2 第二回/ ... 
``` -- Manga name is the URL slug (last path segment of the manga URL) -- Chapter folders are named ` ` (e.g., `1 第一回`) -- Image filenames are sequential page numbers (`1.jpg`, `2.jpg`, ...) +## R2 Storage Layout -## Metadata Format (`detail.json`) +``` +manga/<slug>/cover.webp +manga/<slug>/chapters/<number>/<page>.webp +``` -Each manga folder contains a `detail.json` with fields: -- `mg-url` — source URL on m.happymh.com -- `mg-title` — manga title (Chinese) -- `mg-author` — author name -- `mg-genres` — array of genre tags -- `mg-description` — synopsis text +## Environment Variables (.env) -## Target Site +``` +R2_ACCOUNT_ID= +R2_ACCESS_KEY= +R2_SECRET_KEY= +R2_BUCKET= +R2_PUBLIC_URL= +DATABASE_URL=postgresql://... +``` -- Base URL: `https://m.happymh.com` -- Manga page: `/manga/<slug>` — contains chapter listing -- Chapter page: `/reads/<slug>/<chapter-id>` — contains page images -- The site is mobile-oriented; requests should use appropriate mobile User-Agent headers +## Future: Multi-site Support + +Current code is specific to happymh.com. To add new sites: +- Extract site-specific logic (chapter fetching, image URL extraction, CF handling) into per-site modules +- Keep shared infrastructure (Chrome management, image download, upload) in common modules +- Each site module implements: `fetch_chapters(page, slug)`, `get_chapter_images(page, slug, chapter_id)`, `fetch_metadata(page)` diff --git a/download.py b/download.py deleted file mode 100644 index 2b5d464..0000000 --- a/download.py +++ /dev/null @@ -1,686 +0,0 @@ -""" -Manga downloader for m.happymh.com (educational purposes only). - -Launches real Chrome via subprocess (not Playwright), then connects via -Chrome DevTools Protocol. Images are downloaded directly via HTTP. 
- -Usage: - python download.py --setup # open Chrome, solve CF manually, exit - python download.py # download manga from manga.json -""" - -import json -import re -import sys -import time -import socket -import subprocess -from pathlib import Path -from urllib.parse import urlparse - -from playwright.sync_api import sync_playwright - -BASE_URL = "https://m.happymh.com" -USER_AGENT = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/143.0.0.0 Safari/537.36" -) -ROOT_DIR = Path(__file__).parent -CONTENT_DIR = ROOT_DIR / "manga-content" -MANGA_JSON = ROOT_DIR / "manga.json" -BROWSER_DATA = ROOT_DIR / ".browser-data" -CDP_PORT = 9333 -REQUEST_DELAY = 1.5 - -CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" - - -def is_port_open(port): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return s.connect_ex(("localhost", port)) == 0 - - -def launch_chrome(start_url=None): - """Launch real Chrome with CDP port.""" - if is_port_open(CDP_PORT): - print(f"Chrome already on port {CDP_PORT}") - return None - - if not Path(CHROME_PATH).exists(): - print(f"Chrome not found at: {CHROME_PATH}") - sys.exit(1) - - cmd = [ - CHROME_PATH, - f"--remote-debugging-port={CDP_PORT}", - f"--user-data-dir={BROWSER_DATA}", - "--no-first-run", - "--no-default-browser-check", - ] - if start_url: - cmd.append(start_url) - - proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - - for _ in range(30): - if is_port_open(CDP_PORT): - time.sleep(1) - return proc - time.sleep(0.5) - - print("Chrome failed to start") - sys.exit(1) - - -def wait_for_cloudflare(page, timeout=120): - """Wait for CF to resolve. 
User solves CAPTCHA manually if needed.""" - for i in range(timeout): - try: - title = page.title() - except Exception: - time.sleep(1) - continue - - if "Just a moment" in title or "challenge" in page.url: - if i == 0: - print(" CF challenge — solve in browser...") - elif i % 15 == 0: - print(f" Still waiting for CF... ({i}s)") - time.sleep(1) - continue - - if title and "嗨皮漫画" in title: - return True - if title and "happymh" in page.url: - return True - - time.sleep(1) - - print(" CF timed out.") - return False - - -def fetch_chapters_via_api(page, slug): - """Get full chapter list via chapterByPage API with pagination.""" - result = page.evaluate(""" - async (slug) => { - const all = []; - let total = 0; - for (let p = 1; p <= 30; p++) { - const url = `/v2.0/apis/manga/chapterByPage?code=${slug}&lang=cn&order=asc&page=${p}&_t=${Date.now()}`; - try { - const ctrl = new AbortController(); - setTimeout(() => ctrl.abort(), 10000); - const r = await fetch(url, { signal: ctrl.signal }); - if (!r.ok) { if (p === 1) return { error: r.status }; break; } - const json = await r.json(); - if (!json.data) break; - - total = json.data.total || total; - - // Find chapter array in response - let items = null; - for (const val of Object.values(json.data)) { - if (Array.isArray(val) && val.length > 0) { - items = val; - break; - } - } - if (!items || items.length === 0) break; - - for (const ch of items) { - all.push({ - id: String(ch.id || ''), - chapterName: ch.chapterName || ch.name || '', - }); - } - - if (total && all.length >= total) break; - } catch (e) { - if (p === 1) return { error: e.message }; - break; - } - } - return { chapters: all, total }; - } - """, slug) - - if result and result.get("chapters") and len(result["chapters"]) > 0: - chapters = result["chapters"] - total = result.get("total", len(chapters)) - print(f" API: {len(chapters)}/{total} chapters") - return chapters - - if result and result.get("error"): - print(f" API error: {result['error']}") - - return 
None - - -def fetch_chapters_from_dom(page): - """Scrape all chapters from the MUI Drawer chapter list. - Opens drawer, clicks 'load more' repeatedly, then scrapes.""" - try: - page.wait_for_selector("a[href*='/mangaread/']", timeout=15000) - page.wait_for_timeout(1000) - except Exception: - print(" No chapter links found") - return None - - # Step 1: Open the chapter list drawer - for selector in [ - "text=展开全部", "text=查看全部", "text=全部章节", - "text=展开更多", "text=更多", - "[class*='expand']", "[class*='more']", - ]: - try: - btn = page.query_selector(selector) - if btn and btn.is_visible(): - btn.click() - print(" Opening chapter drawer...") - page.wait_for_timeout(2000) - break - except Exception: - continue - - # Step 2: Wait for drawer - try: - page.wait_for_selector(".MuiDrawer-paper", timeout=5000) - except Exception: - print(" Drawer not found, using page chapters") - - # Step 3: Click sort button to get ascending order (oldest first) - try: - sort_btn = page.query_selector("text=点我改变排序") - if sort_btn and sort_btn.is_visible(): - sort_btn.click() - print(" Sorting ascending...") - page.wait_for_timeout(2000) - except Exception: - pass - - # Step 4: Click "点我加载更多" until all chapters loaded - # Get expected total from header "共177个章节" - total = page.evaluate(""" - () => { - const spans = document.querySelectorAll('.MuiDrawer-paper span'); - for (const s of spans) { - const m = s.textContent.match(/共(\\d+)个章节/); - if (m) return parseInt(m[1]); - } - return 0; - } - """) - if total: - print(f" Total chapters: {total}") - - for round_num in range(50): - count = page.evaluate( - "document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length" - ) - if total and count >= total: - break - print(f" Loading... 
{count}/{total or '?'}", end="\r") - - # Find and click the "load more" element — search fresh each time - clicked = page.evaluate(""" - () => { - const walker = document.createTreeWalker( - document.querySelector('.MuiDrawer-paper') || document.body, - NodeFilter.SHOW_TEXT - ); - while (walker.nextNode()) { - if (walker.currentNode.textContent.includes('加载更多')) { - let el = walker.currentNode.parentElement; - while (el && el.tagName !== 'LI') el = el.parentElement; - if (el) { el.click(); return true; } - walker.currentNode.parentElement.click(); - return true; - } - } - return false; - } - """) - if not clicked: - break - page.wait_for_timeout(1000) - - count = page.evaluate( - "document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length" - ) - print(f" Loaded {count} chapters" + " " * 20) - - # Step 5: Scrape chapters from the drawer - chapters = page.evaluate(""" - () => { - const drawer = document.querySelector('.MuiDrawer-paper'); - const container = drawer || document; - const links = container.querySelectorAll('a[href*="/mangaread/"]'); - const chapters = []; - const seen = new Set(); - links.forEach(a => { - const href = a.getAttribute('href'); - const match = href.match(/\\/mangaread\\/[^/]+\\/(\\d+)/); - if (match && !seen.has(match[1])) { - seen.add(match[1]); - const name = a.textContent.trim(); - if (name && name !== '开始阅读') { - chapters.push({ id: match[1], chapterName: name }); - } - } - }); - return chapters; - } - """) - - # Step 6: Close drawer - try: - page.keyboard.press("Escape") - page.wait_for_timeout(500) - except Exception: - pass - - return chapters if chapters else None - - -def fetch_metadata(page): - """Extract manga metadata and cover URL from the loaded page.""" - html_text = page.content() - metadata = {"mg-url": page.url} - - m = re.search(r'<h2 class="mg-title">(.*?)</h2>', html_text) - if m: - metadata["mg-title"] = m.group(1).strip() - - m = re.search(r'<p class="mg-sub-title"><a[^>]*>(.*?)</a>', html_text) - 
if m: - metadata["mg-author"] = m.group(1).strip() - - genre_matches = re.findall(r'<p class="mg-cate">.*?</p>', html_text, re.DOTALL) - if genre_matches: - genres = re.findall(r'<a[^>]*>(.*?)</a>', genre_matches[0]) - metadata["mg-genres"] = genres - - m = re.search(r'<div class="mg-desc">.*?<p[^>]*>(.*?)</p>', html_text, re.DOTALL) - if m: - metadata["mg-description"] = m.group(1).strip() - - # Extract cover image URL - cover_url = page.evaluate(""" - () => { - // Try og:image meta tag - const og = document.querySelector('meta[property="og:image"]'); - if (og) return og.content; - // Try common cover selectors - const selectors = ['img.mg-cover', '.mg-cover img', '.cover img', 'img[src*="mcover"]']; - for (const sel of selectors) { - const img = document.querySelector(sel); - if (img && img.src) return img.src; - } - return null; - } - """) - if cover_url: - metadata["mg-cover"] = cover_url - - return metadata - - -def get_chapter_images(page, slug, chapter_id): - """Navigate to reader page, intercept the API response for image URLs.""" - captured_images = [] - api_info = {"found": False, "error": None} - - def on_response(response): - if "/apis/manga/reading" not in response.url: - return - api_info["found"] = True - if response.status != 200: - api_info["error"] = f"status {response.status}" - return - try: - data = response.json() - scans = data.get("data", {}).get("scans", []) - if isinstance(scans, str): - scans = json.loads(scans) - for scan in scans: - if isinstance(scan, dict) and "url" in scan: - captured_images.append({ - "url": scan["url"], - "no_referrer": scan.get("r", 0) != 0, - }) - except Exception as e: - api_info["error"] = str(e) - - page.on("response", on_response) - - reader_url = f"{BASE_URL}/mangaread/{slug}/{chapter_id}" - print(" Loading reader...") - try: - page.evaluate(f"window.location.href = '{reader_url}'") - except Exception: - pass - - time.sleep(2) - - try: - page.evaluate("window.close = () => {}") - except Exception: - pass - - 
print(" Waiting for page...") - if not wait_for_cloudflare(page, timeout=90): - page.remove_listener("response", on_response) - return [] - - print(" Waiting for API...") - deadline = time.time() + 20 - while time.time() < deadline: - if captured_images: - break - try: - page.wait_for_timeout(500) - except Exception: - break - - page.remove_listener("response", on_response) - - if not api_info["found"]: - print(" API not intercepted") - elif api_info["error"]: - print(f" API: {api_info['error']}") - - # DOM fallback - if not captured_images: - try: - page.wait_for_timeout(3000) - dom_images = page.evaluate(""" - () => { - const imgs = document.querySelectorAll('img[src*="http"]'); - const urls = []; - const seen = new Set(); - imgs.forEach(img => { - const src = img.src || ''; - if (src && !seen.has(src) && !src.includes('/mcover/') - && !src.includes('cloudflare') && !src.includes('.svg')) { - seen.add(src); - urls.push(src); - } - }); - return urls; - } - """) - if dom_images: - print(f" DOM: {len(dom_images)} images") - for u in dom_images: - captured_images.append({"url": u, "no_referrer": False}) - except Exception as e: - print(f" DOM failed: {e}") - - return captured_images - - -def download_image(page, img, save_path): - """Download image via browser network stack. 
Captures raw bytes via CDP — no base64.""" - if save_path.exists(): - return True - - url = img["url"] - ref_policy = "no-referrer" if img.get("no_referrer") else "origin" - - try: - with page.expect_response(lambda r: url in r.url, timeout=15000) as resp_info: - page.evaluate( - "([u, r]) => fetch(u, { referrerPolicy: r })", - [url, ref_policy], - ) - response = resp_info.value - if response.status == 200: - body = response.body() # raw bytes from network layer - if body and len(body) > 100: - save_path.parent.mkdir(parents=True, exist_ok=True) - save_path.write_bytes(body) - return True - except Exception as e: - if not hasattr(download_image, "_err_logged"): - download_image._err_logged = True - print(f"\n First error: {e}") - return False - - return False - - -def get_existing_chapters(manga_dir): - existing = set() - if manga_dir.exists(): - for entry in manga_dir.iterdir(): - if entry.is_dir() and any(entry.glob("*.jpg")): - existing.add(entry.name) - return existing - - -def download_manga(page, manga_url): - """Download all chapters using a single page.""" - slug = urlparse(manga_url).path.strip("/").split("/")[-1] - manga_dir = CONTENT_DIR / slug - - print(f"\n{'='*60}") - print(f"Manga: {slug}") - print(f"{'='*60}") - - # Intercept all cover images from page load traffic - cover_responses = {} - - def on_manga_response(response): - if "/mcover/" in response.url and response.status == 200: - try: - cover_responses[response.url] = response.body() - except Exception: - pass - - page.on("response", on_manga_response) - - print("Loading manga page...") - try: - page.goto(f"{BASE_URL}/manga/{slug}", wait_until="commit", timeout=60000) - except Exception: - pass - if not wait_for_cloudflare(page): - page.remove_listener("response", on_manga_response) - return - - print("Fetching chapters via API...") - chapters = fetch_chapters_via_api(page, slug) - if not chapters: - print(" API failed, trying DOM...") - chapters = fetch_chapters_from_dom(page) - if not 
chapters: - print("No chapters found.") - return - - print(f"Found {len(chapters)} chapters") - - metadata = fetch_metadata(page) - manga_dir.mkdir(parents=True, exist_ok=True) - detail_path = manga_dir / "detail.json" - if metadata: - existing_meta = {} - if detail_path.exists(): - try: - existing_meta = json.loads(detail_path.read_text(encoding="utf-8")) - except json.JSONDecodeError: - pass - existing_meta.update(metadata) - detail_path.write_text( - json.dumps(existing_meta, ensure_ascii=False, indent=4), - encoding="utf-8", - ) - - # Save cover image — match the correct one from DOM - page.remove_listener("response", on_manga_response) - cover_path = manga_dir / "cover.jpg" - if not cover_path.exists(): - # Find the actual cover URL from the first mip-fill-content img - cover_url = page.evaluate(""" - () => { - const img = document.querySelector('img.mip-fill-content[src*="mcover"]'); - return img ? img.src : null; - } - """) - cover_body = None - if cover_url: - # Exact match first - cover_body = cover_responses.get(cover_url) - # Substring match fallback - if not cover_body: - for url, data in cover_responses.items(): - if cover_url.split("?")[0] in url or url.split("?")[0] in cover_url: - cover_body = data - break - - if cover_body and len(cover_body) > 100: - cover_path.write_bytes(cover_body) - print(f"Cover saved ({len(cover_body)} bytes)") - else: - print(f"Cover not found (captured {len(cover_responses)} mcover images, target: {cover_url})") - - existing_chapters = get_existing_chapters(manga_dir) - - # Chapters are already in DOM order (ascending from drawer) - chapters_sorted = chapters - - for i, chapter in enumerate(chapters_sorted, 1): - ch_id = chapter["id"] - ch_name = chapter["chapterName"] - folder_name = f"{i} {ch_name}" - - # Skip if this chapter already downloaded (check by chapter name) - already = any(ch_name in name for name in existing_chapters) - if already: - print(f" [{i}/{len(chapters_sorted)}] {ch_name} — skip") - continue - - 
print(f" [{i}/{len(chapters_sorted)}] {ch_name} (id={ch_id})") - - images = get_chapter_images(page, slug, ch_id) - if not images: - print(f" No images") - continue - - print(f" {len(images)} pages") - chapter_dir = manga_dir / folder_name - chapter_dir.mkdir(parents=True, exist_ok=True) - - # Download images via browser network stack (raw bytes, no base64) - ok = 0 - failed = [] - for pn, img in enumerate(images, 1): - save_path = chapter_dir / f"{pn}.jpg" - if download_image(page, img, save_path): - ok += 1 - print(f" {pn}/{len(images)}", end="\r") - else: - failed.append((pn, img)) - time.sleep(0.1) - - # Retry failed images once - if failed: - time.sleep(1) - for pn, img in failed: - save_path = chapter_dir / f"{pn}.jpg" - if download_image(page, img, save_path): - ok += 1 - else: - print(f" {pn}/{len(images)} FAIL") - time.sleep(0.3) - - print(f" {ok}/{len(images)} downloaded" + " " * 20) - - if ok == 0: - try: - chapter_dir.rmdir() - except Exception: - pass - - time.sleep(REQUEST_DELAY) - - print(f"\nDone: {slug}") - - -def setup_mode(): - """Launch Chrome for manual CF solving.""" - print("=== SETUP ===") - print("Chrome will open. Do this:") - print(" 1. Go to m.happymh.com — solve Cloudflare") - print(" 2. Open a manga page — solve CF if prompted") - print(" 3. Open a chapter reader — solve CF if prompted") - print(" 4. Press ENTER here when done\n") - - chrome_proc = launch_chrome(BASE_URL) - - input(">>> Press ENTER when Cloudflare is solved... ") - - try: - with sync_playwright() as p: - browser = p.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}") - ctx = browser.contexts[0] - cookies = ctx.cookies() - cf = [c for c in cookies if c["name"] == "cf_clearance"] - if cf: - print("cf_clearance found!") - else: - print("Warning: cf_clearance not found") - browser.close() - except Exception as e: - print(f"Could not verify: {e}") - - if chrome_proc: - chrome_proc.terminate() - - print("Done. 
Now run: python download.py") - - -def main(): - if "--setup" in sys.argv: - setup_mode() - return - - if not MANGA_JSON.exists(): - print(f"Error: {MANGA_JSON} not found") - sys.exit(1) - - manga_urls = json.loads(MANGA_JSON.read_text(encoding="utf-8")) - if not isinstance(manga_urls, list) or not manga_urls: - print("Error: manga.json should be a JSON array of URLs") - sys.exit(1) - - print(f"Found {len(manga_urls)} manga(s)") - print("Launching Chrome...\n") - - chrome_proc = launch_chrome() - - try: - with sync_playwright() as p: - browser = p.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}") - context = browser.contexts[0] - page = context.pages[0] if context.pages else context.new_page() - - for url in manga_urls: - try: - download_manga(page, url) - except Exception as e: - print(f"\nError: {url}: {e}") - import traceback - traceback.print_exc() - - browser.close() - finally: - if chrome_proc: - chrome_proc.terminate() - - print("\nAll done!") - - -if __name__ == "__main__": - main() diff --git a/export_cookies.py b/export_cookies.py deleted file mode 100644 index f1a70b5..0000000 --- a/export_cookies.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Opens a browser to m.happymh.com, waits for you to pass Cloudflare, -then saves cookies to cookies.txt in Netscape format. - -Install: - pip install playwright - playwright install chromium - -Usage: - python export_cookies.py -""" - -import time -from pathlib import Path - -try: - from playwright.sync_api import sync_playwright -except ImportError: - print("Playwright not installed. 
Run:") - print(" pip install playwright") - print(" playwright install chromium") - raise SystemExit(1) - -COOKIES_FILE = Path(__file__).parent / "cookies.txt" -TARGET_URL = "https://m.happymh.com" - - -def cookies_to_netscape(cookies): - """Convert Playwright cookies to Netscape cookies.txt format.""" - lines = ["# Netscape HTTP Cookie File", ""] - for c in cookies: - domain = c["domain"] - # Netscape format: leading dot means accessible to subdomains - if not domain.startswith("."): - domain = "." + domain - flag = "TRUE" # accessible to subdomains - path = c.get("path", "/") - secure = "TRUE" if c.get("secure", False) else "FALSE" - expires = str(int(c.get("expires", 0))) - name = c["name"] - value = c["value"] - lines.append(f"{domain}\t{flag}\t{path}\t{secure}\t{expires}\t{name}\t{value}") - return "\n".join(lines) + "\n" - - -def main(): - print("Opening browser to m.happymh.com...") - print("Once the page loads (past Cloudflare), press ENTER here to save cookies.\n") - - with sync_playwright() as p: - browser = p.chromium.launch(headless=False) - context = browser.new_context( - user_agent=( - "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) " - "Version/16.0 Mobile/15E148 Safari/604.1" - ), - viewport={"width": 390, "height": 844}, - is_mobile=True, - ) - - page = context.new_page() - page.goto(TARGET_URL) - - input(">>> Page opened. Pass Cloudflare if needed, then press ENTER to save cookies... ") - - cookies = context.cookies() - if not cookies: - print("No cookies found!") - browser.close() - return - - # Check for cf_clearance - cookie_names = [c["name"] for c in cookies] - if "cf_clearance" in cookie_names: - print("cf_clearance cookie found (Cloudflare passed)") - else: - print("Warning: cf_clearance not found. You may still be on the challenge page.") - answer = input("Save anyway? 
[y/N] ").strip().lower() - if answer != "y": - browser.close() - return - - text = cookies_to_netscape(cookies) - COOKIES_FILE.write_text(text) - print(f"\nSaved {len(cookies)} cookies to {COOKIES_FILE}") - - browser.close() - - -if __name__ == "__main__": - main() diff --git a/manga.json b/manga.json index a84f9e2..707854a 100644 --- a/manga.json +++ b/manga.json @@ -1,4 +1,6 @@ [ - "https://m.happymh.com/manga/moutianchengweimoshen", - "https://m.happymh.com/manga/butiange" + "https://m.happymh.com/manga/fangkainagenvwu", + "https://m.happymh.com/manga/jueduijiangan", + "https://m.happymh.com/manga/xingjiandashi", + "https://m.happymh.com/manga/moutianchengweimoshen" ] \ No newline at end of file diff --git a/manga.py b/manga.py new file mode 100644 index 0000000..68d635d --- /dev/null +++ b/manga.py @@ -0,0 +1,1481 @@ +""" +Manga toolkit — download from m.happymh.com, upload to Cloudflare R2. + +Usage: + python manga.py +""" + +import io +import json +import os +import re +import time +import socket +import subprocess +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from urllib.parse import urlparse + +import boto3 +import psycopg2 +from PIL import Image +from dotenv import load_dotenv +from playwright.sync_api import sync_playwright +from simple_term_menu import TerminalMenu + +load_dotenv() + +# ── Config ───────────────────────────────────────────────── + +BASE_URL = "https://m.happymh.com" +ROOT_DIR = Path(__file__).parent +CONTENT_DIR = ROOT_DIR / "manga-content" +MANGA_JSON = ROOT_DIR / "manga.json" +BROWSER_DATA = ROOT_DIR / ".browser-data" +CDP_PORT = 9333 +REQUEST_DELAY = 1.5 +UPLOAD_WORKERS = 8 +CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + +# R2 +s3 = boto3.client( + "s3", + endpoint_url=f"https://{os.environ['R2_ACCOUNT_ID']}.r2.cloudflarestorage.com", + aws_access_key_id=os.environ["R2_ACCESS_KEY"], + aws_secret_access_key=os.environ["R2_SECRET_KEY"], + region_name="auto", 
+) +BUCKET = os.environ["R2_BUCKET"] +PUBLIC_URL = os.environ["R2_PUBLIC_URL"].rstrip("/") +DATABASE_URL = os.environ["DATABASE_URL"] + + +# ── Chrome management ────────────────────────────────────── + + +def hide_chrome(): + """Hide Chrome window on macOS.""" + try: + subprocess.Popen( + ["osascript", "-e", + 'tell application "System Events" to set visible of process "Google Chrome" to false'], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + except Exception: + pass + + +def is_port_open(port): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(("localhost", port)) == 0 + + +def launch_chrome(start_url=None): + if is_port_open(CDP_PORT): + return None + if not Path(CHROME_PATH).exists(): + print(f" Chrome not found at: {CHROME_PATH}") + return None + cmd = [ + CHROME_PATH, + f"--remote-debugging-port={CDP_PORT}", + f"--user-data-dir={BROWSER_DATA}", + "--no-first-run", + "--no-default-browser-check", + "--window-position=0,0", + "--window-size=800,600", + "--no-focus-on-navigate", + ] + if start_url: + cmd.append(start_url) + proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + for _ in range(30): + if is_port_open(CDP_PORT): + time.sleep(1) + hide_chrome() + return proc + time.sleep(0.5) + print(" Chrome failed to start") + return None + + +class BrowserSession: + """Manages Chrome + CDP lifecycle.""" + + def __init__(self): + self.chrome_proc = None + self.playwright = None + self.browser = None + self.page = None + + def start(self): + self.chrome_proc = launch_chrome() + self.playwright = sync_playwright().start() + self.browser = self.playwright.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}") + context = self.browser.contexts[0] + self.page = context.pages[0] if context.pages else context.new_page() + + def close(self): + try: + self.browser.close() + except Exception: + pass + if self.chrome_proc: + self.chrome_proc.terminate() + if self.playwright: + 
self.playwright.stop() + + +def with_browser(func): + """Run func(session) inside a Chrome+CDP session. Returns func's result.""" + session = BrowserSession() + session.start() + try: + return func(session) + finally: + session.close() + + +# ── Cloudflare ───────────────────────────────────────────── + + +def wait_for_cloudflare(session, timeout=120): + """Wait for CF to resolve. User solves in the visible browser window.""" + page = session.page + for i in range(timeout): + try: + title = page.title() + except Exception: + time.sleep(1) + continue + if "Just a moment" in title or "challenge" in page.url: + if i == 0: + print(" CF challenge — solve in browser...") + elif i % 15 == 0: + print(f" Still waiting for CF... ({i}s)") + time.sleep(1) + continue + if title and ("嗨皮漫画" in title or "happymh" in page.url): + return True + time.sleep(1) + print(" CF timed out.") + return False + + +# ── Happymh: chapter fetching ───────────────────────────── + + +def fetch_chapters_via_api(page, slug): + result = page.evaluate(""" + async (slug) => { + const all = []; + let total = 0; + for (let p = 1; p <= 30; p++) { + const url = `/v2.0/apis/manga/chapterByPage?code=${slug}&lang=cn&order=asc&page=${p}&_t=${Date.now()}`; + try { + const ctrl = new AbortController(); + setTimeout(() => ctrl.abort(), 10000); + const r = await fetch(url, { signal: ctrl.signal }); + if (!r.ok) { if (p === 1) return { error: r.status }; break; } + const json = await r.json(); + if (!json.data) break; + total = json.data.total || total; + let items = null; + for (const val of Object.values(json.data)) { + if (Array.isArray(val) && val.length > 0) { items = val; break; } + } + if (!items || items.length === 0) break; + for (const ch of items) { + all.push({ id: String(ch.id || ''), chapterName: ch.chapterName || ch.name || '' }); + } + if (total && all.length >= total) break; + } catch (e) { + if (p === 1) return { error: e.message }; + break; + } + } + return { chapters: all, total }; + } + """, 
slug) + if result and result.get("chapters") and len(result["chapters"]) > 0: + chapters = result["chapters"] + total = result.get("total", len(chapters)) + print(f" API: {len(chapters)}/{total} chapters") + return chapters + if result and result.get("error"): + print(f" API error: {result['error']}") + return None + + +def fetch_chapters_from_dom(page): + try: + page.wait_for_selector("a[href*='/mangaread/']", timeout=15000) + page.wait_for_timeout(1000) + except Exception: + return None + + for selector in ["text=展开全部", "text=查看全部", "text=全部章节", "text=展开更多", "text=更多"]: + try: + btn = page.query_selector(selector) + if btn and btn.is_visible(): + btn.click() + page.wait_for_timeout(2000) + break + except Exception: + continue + + try: + page.wait_for_selector(".MuiDrawer-paper", timeout=5000) + except Exception: + pass + + try: + sort_btn = page.query_selector("text=点我改变排序") + if sort_btn and sort_btn.is_visible(): + sort_btn.click() + page.wait_for_timeout(2000) + except Exception: + pass + + total = page.evaluate(""" + () => { + const spans = document.querySelectorAll('.MuiDrawer-paper span'); + for (const s of spans) { + const m = s.textContent.match(/共(\\d+)个章节/); + if (m) return parseInt(m[1]); + } + return 0; + } + """) + + for _ in range(50): + count = page.evaluate("document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length") + if total and count >= total: + break + clicked = page.evaluate(""" + () => { + const walker = document.createTreeWalker( + document.querySelector('.MuiDrawer-paper') || document.body, NodeFilter.SHOW_TEXT + ); + while (walker.nextNode()) { + if (walker.currentNode.textContent.includes('加载更多')) { + let el = walker.currentNode.parentElement; + while (el && el.tagName !== 'LI') el = el.parentElement; + if (el) { el.click(); return true; } + walker.currentNode.parentElement.click(); + return true; + } + } + return false; + } + """) + if not clicked: + break + page.wait_for_timeout(1000) + + chapters = 
def fetch_metadata(page):
    """Scrape manga metadata from the currently loaded manga detail page.

    Title, author, genres and description are pulled out of the serialized
    HTML with regexes; the cover URL is read from the live DOM.

    Args:
        page: Playwright page showing the manga detail page.

    Returns:
        A dict that always has ``"mg-url"`` and, when found, ``"mg-title"``,
        ``"mg-author"``, ``"mg-genres"`` (list of str), ``"mg-description"``
        and ``"mg-cover"``.
    """
    from html import unescape

    def clean(fragment):
        # page.content() is serialized markup, so text captured by a regex is
        # still entity-encoded ("&amp;", "&#39;"); decode it before storing.
        return unescape(fragment).strip()

    html_text = page.content()
    metadata = {"mg-url": page.url}

    m = re.search(r'<h2 class="mg-title">(.*?)</h2>', html_text)
    if m:
        metadata["mg-title"] = clean(m.group(1))

    m = re.search(r'<p class="mg-sub-title">.*?<a[^>]*>(.*?)</a>', html_text, re.DOTALL)
    if m:
        metadata["mg-author"] = clean(m.group(1))

    genre_matches = re.findall(r'<p class="mg-cate">.*?</p>', html_text, re.DOTALL)
    if genre_matches:
        metadata["mg-genres"] = [
            clean(g) for g in re.findall(r'<a[^>]*>(.*?)</a>', genre_matches[0])
        ]

    m = re.search(r'<div class="mg-desc">.*?<p[^>]*>(.*?)</p>', html_text, re.DOTALL)
    if m:
        metadata["mg-description"] = clean(m.group(1))

    if not metadata.get("mg-description"):
        # Fallback layout: description lives inside a <mip-showmore> element.
        m = re.search(r'<mip-showmore[^>]*>(.*?)</mip-showmore>', html_text, re.DOTALL)
        if m:
            # Strip tags first, then decode entities, so an encoded "&lt;b&gt;"
            # in the text is not mistaken for markup.
            desc = clean(re.sub(r'<[^>]+>', '', m.group(1)))
            if desc:
                metadata["mg-description"] = desc

    cover_url = page.evaluate("""
        () => {
            const og = document.querySelector('meta[property="og:image"]');
            if (og) return og.content;
            for (const sel of ['img.mg-cover', 'img[src*="mcover"]']) {
                const img = document.querySelector(sel);
                if (img && img.src) return img.src;
            }
            return null;
        }
    """)
    if cover_url:
        metadata["mg-cover"] = cover_url
    return metadata
def _try_get_chapter_images(session, slug, chapter_id):
    """Make a single attempt to collect the image list of one chapter.

    Navigates the session's page to the reader URL and listens for the
    ``/apis/manga/reading`` response that belongs to ``chapter_id``. If the
    API yields nothing, falls back to scraping ``<img>`` URLs from the DOM.

    Args:
        session: Browser session exposing the current page as ``session.page``.
        slug: Manga slug used to build the reader URL.
        chapter_id: Chapter id used in the reader URL and matched against the
            ``cid`` parameter of the reading API request.

    Returns:
        ``(images, api_info)`` where ``images`` is a (possibly empty) list of
        ``{"url": str, "no_referrer": bool}`` dicts and ``api_info`` is a dict
        with keys ``found``, ``status`` and ``error``.
    """
    import re

    captured_images = []
    api_info = {"found": False, "status": None, "error": None}

    # Match the cid as a whole value: a plain substring test would accept
    # "cid=123" while looking for chapter 12 (a prefetched neighbour chapter).
    cid_pattern = re.compile(rf"cid(?:=|%3D){re.escape(str(chapter_id))}(?!\w)")

    def on_response(response):
        if "/apis/manga/reading" not in response.url:
            return
        # Only capture our chapter, skip prefetched ones
        if not cid_pattern.search(response.url):
            return
        # Ignore if we already captured images (prevent duplicate/prefetch)
        if captured_images:
            return
        api_info["found"] = True
        api_info["status"] = response.status
        if response.status != 200:
            api_info["error"] = f"status {response.status}"
            return
        try:
            data = response.json()
            # Verify chapter ID in response body
            resp_cid = str(data.get("data", {}).get("id", ""))
            if resp_cid and resp_cid != str(chapter_id):
                return
            scans = data.get("data", {}).get("scans", [])
            if isinstance(scans, str):
                scans = json.loads(scans)
            for scan in scans:
                if isinstance(scan, dict) and "url" in scan:
                    captured_images.append({
                        "url": scan["url"],
                        "no_referrer": scan.get("r", 0) != 0,
                    })
        except Exception as e:
            api_info["error"] = str(e)

    page = session.page
    page.on("response", on_response)
    reader_url = f"{BASE_URL}/mangaread/{slug}/{chapter_id}"
    print(" Loading reader...")
    try:
        # Navigate from inside the page rather than with page.goto.
        page.evaluate(f"window.location.href = '{reader_url}'")
    except Exception:
        pass
    hide_chrome()

    time.sleep(2)
    try:
        # Neutralize window.close so page scripts cannot close the tab.
        page.evaluate("window.close = () => {}")
    except Exception:
        pass

    print(" Waiting for page...")
    if not wait_for_cloudflare(session, timeout=90):
        page = session.page
        try:
            page.remove_listener("response", on_response)
        except Exception:
            pass
        return [], api_info

    page = session.page
    print(" Waiting for API...")
    deadline = time.time() + 20
    while time.time() < deadline:
        if captured_images:
            break
        try:
            page.wait_for_timeout(500)
        except Exception:
            break

    try:
        page.remove_listener("response", on_response)
    except Exception:
        pass

    if not api_info["found"]:
        print(" API not intercepted")
    elif api_info["error"]:
        print(f" API: {api_info['error']}")

    # Filter out next-chapter preview images by counting DOM containers
    if captured_images:
        try:
            counts = page.evaluate("""
                () => {
                    const all = document.querySelectorAll('[class*="imgContainer"]').length;
                    const next = document.querySelectorAll('[class*="imgNext"]').length;
                    return { all, next, current: all - next };
                }
            """)
            if counts and counts.get("next", 0) > 0:
                actual = counts["current"]
                if 0 < actual < len(captured_images):
                    captured_images = captured_images[:actual]
        except Exception:
            pass

    # DOM fallback
    if not captured_images:
        try:
            page.wait_for_timeout(3000)
            dom_images = page.evaluate("""
                () => {
                    const imgs = document.querySelectorAll('img[src*="http"]');
                    const nextImgs = new Set(
                        Array.from(document.querySelectorAll('[class*="imgNext"] img'))
                            .map(img => img.src)
                    );
                    const urls = [], seen = new Set();
                    imgs.forEach(img => {
                        const src = img.src || '';
                        if (src && !seen.has(src) && !nextImgs.has(src)
                            && !src.includes('/mcover/')
                            && !src.includes('cloudflare') && !src.includes('.svg')) {
                            seen.add(src); urls.push(src);
                        }
                    });
                    return urls;
                }
            """)
            if dom_images:
                print(f" DOM: {len(dom_images)} images")
                for u in dom_images:
                    captured_images.append({"url": u, "no_referrer": False})
        except Exception as e:
            print(f" DOM failed: {e}")

    return captured_images, api_info
def fetch_image_bytes(session, img):
    """Download one image through the browser page and return its bytes.

    The fetch is issued inside the page and the matching network response is
    read back from Playwright.

    Args:
        session: Browser session exposing the current page as ``session.page``.
        img: Dict with ``"url"`` and an optional truthy ``"no_referrer"`` flag.

    Returns:
        The response body when the status is 200 and the body is longer than
        100 bytes; otherwise ``None``. Only the first exception is printed.
    """
    target = img["url"]
    policy = "origin"
    if img.get("no_referrer"):
        policy = "no-referrer"
    browser_page = session.page

    def is_target(resp):
        return target in resp.url

    try:
        with browser_page.expect_response(is_target, timeout=15000) as pending:
            browser_page.evaluate(
                "([u, r]) => fetch(u, { referrerPolicy: r })",
                [target, policy],
            )
        reply = pending.value
        if reply.status != 200:
            return None
        payload = reply.body()
        if payload and len(payload) > 100:
            return payload
        return None
    except Exception as e:
        # Log only the first failure to keep the progress output readable.
        if not hasattr(fetch_image_bytes, "_err_logged"):
            fetch_image_bytes._err_logged = True
            print(f"\n First error: {e}")
        return None
def r2_key_exists(key):
    """Return whether an object with the given key exists in the R2 bucket.

    Args:
        key: Object key to look up with a HEAD request.

    Returns:
        True if the HEAD request succeeds, False if the service reports the
        object as not found.

    Raises:
        botocore ``ClientError``: for any failure other than "not found"
            (e.g. access denied, throttling). Those say nothing about whether
            the key exists, so they are not reported as "missing".
    """
    try:
        s3.head_object(Bucket=BUCKET, Key=key)
    except s3.exceptions.ClientError as exc:
        code = str(exc.response.get("Error", {}).get("Code", ""))
        # HEAD responses carry no error body, so the code is usually the bare
        # HTTP status; accept the named variants as well.
        if code in ("404", "NoSuchKey", "NotFound"):
            return False
        raise
    return True
def save_manga_local(slug, metadata, cover_body):
    """Write scraped metadata and the cover image under manga-content/<slug>/.

    Args:
        slug: Manga slug; names the folder inside ``CONTENT_DIR``.
        metadata: Dict merged into ``detail.json`` (new values win). Nothing
            is written when it is empty or ``None``.
        cover_body: Raw cover bytes, or ``None``. Saved as ``cover.jpg`` only
            when no cover file exists yet and the body exceeds 100 bytes.
    """
    target_dir = CONTENT_DIR / slug
    target_dir.mkdir(parents=True, exist_ok=True)

    if metadata:
        detail_file = target_dir / "detail.json"
        merged = {}
        if detail_file.exists():
            try:
                merged = json.loads(detail_file.read_text(encoding="utf-8"))
            except json.JSONDecodeError:
                # Unreadable file: start over from the fresh metadata.
                merged = {}
        merged.update(metadata)
        serialized = json.dumps(merged, ensure_ascii=False, indent=4)
        detail_file.write_text(serialized, encoding="utf-8")

    cover_file = target_dir / "cover.jpg"
    if cover_file.exists():
        return
    if not cover_body or len(cover_body) <= 100:
        return
    cover_file.write_bytes(cover_body)
    print(f" Cover saved ({len(cover_body)} bytes)")
{manga_name}: no detail.json") + return + + detail = json.loads(detail_path.read_text(encoding="utf-8")) + title = detail.get("mg-title", manga_name) + slug = manga_name + genres = detail.get("mg-genres", []) + description = detail.get("mg-description", "") + if not description and genres: + description = f"Genres: {', '.join(genres)}" + genre = genres[0] if genres else "Drama" + + cur = conn.cursor() + + # Cover + cover_file = manga_path / "cover.jpg" + cover_url = "" + cover_key = f"manga/{slug}/cover.webp" + if cover_file.exists(): + if not r2_key_exists(cover_key): + cover_url = upload_to_r2(cover_key, make_cover(cover_file)) + print(f" Cover uploaded") + else: + cover_url = f"{PUBLIC_URL}/{cover_key}" + + # Manga record + cur.execute('SELECT id, "coverUrl" FROM "Manga" WHERE slug = %s', (slug,)) + row = cur.fetchone() + if row: + manga_id, existing_cover = row + if cover_url and cover_url != existing_cover: + cur.execute('UPDATE "Manga" SET "coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s', (cover_url, manga_id)) + conn.commit() + else: + cur.execute( + 'INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt") ' + "VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW()) RETURNING id", + (title, description, cover_url, slug, genre), + ) + manga_id = cur.fetchone()[0] + conn.commit() + print(f" Created manga (id: {manga_id})") + + # Chapters + chapter_dirs = sorted( + [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")], + key=lambda d: parse_chapter_dir(d.name)[0], + ) + + for chapter_dir in chapter_dirs: + order_num, chapter_title = parse_chapter_dir(chapter_dir.name) + if order_num == 0: + continue + + cur.execute('SELECT id FROM "Chapter" WHERE "mangaId" = %s AND number = %s', (manga_id, order_num)) + if cur.fetchone(): + print(f" [{order_num}] {chapter_title} — skip") + continue + + page_files = sorted( + [f for f in chapter_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png", 
".webp")], + key=lambda f: int(re.search(r"(\d+)", f.stem).group(1)) if re.search(r"(\d+)", f.stem) else 0, + ) + if not page_files: + continue + + print(f" [{order_num}] {chapter_title} ({len(page_files)} pages)") + + # Upload to R2 first + def process_page(args, _slug=slug, _order=order_num): + j, pf = args + r2_key = f"manga/{_slug}/chapters/{_order}/{j}.webp" + if not r2_key_exists(r2_key): + return j, upload_to_r2(r2_key, convert_to_webp(pf)) + return j, f"{PUBLIC_URL}/{r2_key}" + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)} + for future in as_completed(futures): + j, url = future.result() + page_urls[j] = url + done += 1 + print(f" {done}/{len(page_files)}", end="\r") + + if not page_urls: + print(f" Upload failed, skip") + continue + + # DB records only after R2 upload succeeds + cur.execute( + 'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id', + (manga_id, order_num, chapter_title), + ) + chapter_id = cur.fetchone()[0] + for j in sorted(page_urls): + cur.execute('INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', (chapter_id, j, page_urls[j])) + conn.commit() + print(f" {len(page_files)} pages uploaded" + " " * 10) + + +# ── Commands ─────────────────────────────────────────────── + + +def cmd_setup(): + print("\n Chrome will open. Solve Cloudflare on:") + print(" 1. m.happymh.com") + print(" 2. Any manga page") + print(" 3. Any reader page\n") + + chrome_proc = launch_chrome(BASE_URL) + if not chrome_proc and not is_port_open(CDP_PORT): + print(" Failed to launch Chrome") + return + + input(" Press ENTER when done... 
def cmd_download(manga_url=None, chapter_set=None):
    """Download manga chapters into the local content directory.

    Args:
        manga_url: Single manga URL to download; when falsy, every URL from
            manga.json is processed.
        chapter_set: Set of 1-based chapter indices to download, or ``None``
            (or an empty set) for all chapters.

    Chapters whose title already has a local folder containing images are
    skipped. Errors for one manga are printed and do not stop the others.
    """
    urls = [manga_url] if manga_url else load_manga_urls()
    if not urls:
        print(" No URLs in manga.json")
        return

    print(f"\n Downloading {len(urls)} manga(s)...\n")

    def run(session):
        for url in urls:
            slug = slug_from_url(url)
            try:
                result = load_manga_page(session, slug)
                if not result:
                    continue
                chapters, metadata, cover_body = result
                if not chapters:
                    print(" No chapters found.")
                    continue
                print(f" Found {len(chapters)} chapters")
                save_manga_local(slug, metadata, cover_body)

                manga_dir = CONTENT_DIR / slug
                # Compare whole titles ("<index> <title>" folders), not
                # substrings: a substring test would treat a chapter named
                # "1" as downloaded as soon as any folder name contains "1".
                existing_titles = {
                    parse_chapter_dir(name)[1].strip()
                    for name in get_existing_chapters(manga_dir)
                }

                for i, ch in enumerate(chapters, 1):
                    if chapter_set and i not in chapter_set:
                        continue
                    if ch["chapterName"].strip() in existing_titles:
                        print(f" [{i}/{len(chapters)}] {ch['chapterName']} — skip")
                        continue
                    print(f" [{i}/{len(chapters)}] {ch['chapterName']} (id={ch['id']})")
                    download_chapter(session, slug, i, ch, manga_dir)
                print(f"\n Done: {slug}")
            except Exception as e:
                # Per-manga boundary: report and move on to the next URL.
                print(f"\n Error: {url}: {e}")
                import traceback
                traceback.print_exc()

    with_browser(run)
    print("\nDownload complete!")
def cmd_sync(manga_url=None, allow_partial=False):
    """Sync new chapters from the site straight to R2 and the database.

    Nothing is written to local disk: images are fetched into memory,
    converted to WebP, uploaded to R2, and only then recorded in the DB.

    Args:
        manga_url: Single manga URL to sync; when falsy, every URL from
            manga.json is processed.
        allow_partial: When False (default), a chapter with any page that
            could not be fetched is skipped, so it is retried on the next
            sync rather than stored with missing pages. Pass True to store
            whatever pages were fetched.
    """
    urls = [manga_url] if manga_url else load_manga_urls()
    if not urls:
        print(" No URLs in manga.json")
        return

    conn = get_db()

    def upload_pages(slug, number, page_bytes):
        """Convert pages to WebP and upload in parallel; None on any failure."""
        def upload_page(item):
            pn, raw = item
            r2_key = f"manga/{slug}/chapters/{number}/{pn}.webp"
            return pn, upload_to_r2(r2_key, convert_to_webp(io.BytesIO(raw)))

        page_urls = {}
        with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
            futures = [pool.submit(upload_page, item) for item in page_bytes.items()]
            for done, future in enumerate(as_completed(futures), 1):
                try:
                    pn, r2_url = future.result()
                except Exception as e:
                    # One failed page makes the chapter incomplete; report it
                    # and let the caller skip the chapter, not abort the sync.
                    print(f" R2 error: {e}")
                    return None
                page_urls[pn] = r2_url
                print(f" R2: {done}/{len(page_bytes)}", end="\r")
        return page_urls

    def fetch_pages(session, images):
        """Fetch all images into memory, retrying failed pages once."""
        page_bytes = {}
        failed = []
        for pn, img in enumerate(images, 1):
            body = fetch_image_bytes(session, img)
            if body:
                page_bytes[pn] = body
                print(f" Fetched {pn}/{len(images)}", end="\r")
            else:
                failed.append((pn, img))
            time.sleep(0.1)

        if failed:
            # Same single retry pass that download_chapter performs.
            time.sleep(1)
            for pn, img in failed:
                body = fetch_image_bytes(session, img)
                if body:
                    page_bytes[pn] = body
                else:
                    print(f" {pn}/{len(images)} FAIL")
                time.sleep(0.3)
        return page_bytes

    def run(session):
        for url in urls:
            slug = slug_from_url(url)

            print(f"\n{'='*60}")
            print(f"Syncing: {slug}")
            print(f"{'='*60}")

            # 1. Load manga page + get chapters
            result = load_manga_page(session, slug)
            if not result:
                continue
            chapters, metadata, cover_body = result
            if not chapters:
                print(" No chapters found.")
                continue
            print(f" {len(chapters)} chapters on site")

            # 2. Ensure manga in DB
            cur = conn.cursor()
            title = metadata.get("mg-title", slug)
            genres = metadata.get("mg-genres", [])
            description = metadata.get("mg-description", "")
            genre = genres[0] if genres else "Drama"

            # Cover -> R2 (from RAM)
            cover_url = ""
            cover_key = f"manga/{slug}/cover.webp"
            if cover_body and len(cover_body) > 100:
                if not r2_key_exists(cover_key):
                    cover_webp = make_cover(io.BytesIO(cover_body))
                    cover_url = upload_to_r2(cover_key, cover_webp)
                    print(" Cover uploaded to R2")
                else:
                    cover_url = f"{PUBLIC_URL}/{cover_key}"

            cur.execute('SELECT id FROM "Manga" WHERE slug = %s', (slug,))
            row = cur.fetchone()
            if row:
                manga_id = row[0]
                if cover_url:
                    cur.execute(
                        'UPDATE "Manga" SET "coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s',
                        (cover_url, manga_id),
                    )
                    conn.commit()
            else:
                cur.execute(
                    'INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt") '
                    "VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW()) RETURNING id",
                    (title, description, cover_url, slug, genre),
                )
                manga_id = cur.fetchone()[0]
                conn.commit()
                print(f" Created manga in DB (id: {manga_id})")

            # 3. Find chapters missing from DB
            cur.execute('SELECT number FROM "Chapter" WHERE "mangaId" = %s', (manga_id,))
            existing_numbers = {row[0] for row in cur.fetchall()}

            pending = 0
            synced = 0
            for i, ch in enumerate(chapters, 1):
                if i in existing_numbers:
                    continue
                ch_name = ch["chapterName"]
                pending += 1
                print(f" [{i}/{len(chapters)}] {ch_name} (id={ch['id']})")

                # Get image URLs from reader page
                images = get_chapter_images(session, slug, ch["id"])
                if not images:
                    print(" No images")
                    continue
                print(f" {len(images)} pages")

                page_bytes = fetch_pages(session, images)
                if not page_bytes:
                    print(" No images fetched, skip")
                    continue
                missing = len(images) - len(page_bytes)
                if missing and not allow_partial:
                    # A chapter row is never revisited once it exists, so
                    # storing it with gaps would make them permanent.
                    print(f" {missing} page(s) failed, skip (will retry on next sync)")
                    continue

                # Upload to R2 first
                page_urls = upload_pages(slug, i, page_bytes)
                if not page_urls:
                    print(" R2 upload failed, skip")
                    continue

                # Only create DB records after R2 upload succeeds
                try:
                    cur.execute(
                        'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id',
                        (manga_id, i, ch_name),
                    )
                    chapter_id = cur.fetchone()[0]
                    for pn in sorted(page_urls):
                        cur.execute(
                            'INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)',
                            (chapter_id, pn, page_urls[pn]),
                        )
                    conn.commit()
                except psycopg2.Error as e:
                    # Roll back so the connection stays usable for the
                    # remaining chapters.
                    conn.rollback()
                    print(f" DB error: {e}")
                    continue

                synced += 1
                print(f" {len(page_urls)} pages synced" + " " * 20)

                time.sleep(REQUEST_DELAY)

            if pending == 0:
                print(" Already up to date!")
            else:
                print(f" Synced {synced} new chapters")
                if synced < pending:
                    print(f" {pending - synced} chapter(s) not synced")

    try:
        with_browser(run)
    finally:
        conn.close()

    print("\nSync complete!")
_title_cache = {}


def get_manga_title(slug):
    """Return a display title for a manga slug.

    Lookup order: in-memory cache, local ``detail.json`` (``mg-title``), then
    the ``Manga`` table, whose titles are all loaded into the cache in one
    query. Falls back to the slug itself when no title is found.

    Args:
        slug: Manga slug (folder name / DB slug).

    Returns:
        The title string, or ``slug`` when none could be resolved.
    """
    if slug in _title_cache:
        return _title_cache[slug]

    # Try local detail.json first
    detail_path = CONTENT_DIR / slug / "detail.json"
    if detail_path.exists():
        try:
            detail = json.loads(detail_path.read_text(encoding="utf-8"))
            title = detail.get("mg-title")
        except (OSError, ValueError, AttributeError):
            # Unreadable, malformed, or non-object JSON: fall through to DB.
            title = None
        if title:
            _title_cache[slug] = title
            return title

    # Try database (batch load all titles)
    try:
        conn = get_db()
        try:
            cur = conn.cursor()
            cur.execute('SELECT slug, title FROM "Manga"')
            for db_slug, db_title in cur.fetchall():
                _title_cache[db_slug] = db_title
        finally:
            # Close even when the query fails, so the connection is not leaked.
            conn.close()
    except Exception:
        # Deliberate best-effort: the title is cosmetic, so any DB problem
        # degrades to showing the slug.
        pass

    return _title_cache.get(slug, slug)
def tui_pick_chapters(chapters, slug=None):
    """Let the user choose which chapters to download.

    First asks whether to take all chapters or pick individually; the
    individual picker is a multi-select menu (Space to toggle, Enter to
    confirm) in which chapters that already exist locally are grayed out.

    Args:
        chapters: List of ``{"id", "chapterName"}`` dicts in site order.
        slug: Manga slug used to find already-downloaded chapters; when
            falsy, nothing is marked as downloaded.

    Returns:
        ``None`` for "all chapters", a set of selected 1-based indices, or
        the string ``"back"`` when the user cancelled.
    """
    # Titles of chapters that already exist locally. Folders are named
    # "<index> <title>", so compare whole titles instead of substrings
    # (a chapter named "1" must not match a folder "11 ...").
    existing_titles = set()
    if slug:
        existing_titles = {
            parse_chapter_dir(name)[1].strip()
            for name in get_existing_chapters(CONTENT_DIR / slug)
        }

    done_flags = [ch["chapterName"].strip() in existing_titles for ch in chapters]
    existing_count = sum(done_flags)

    idx = tui_select(f"{len(chapters)} chapters ({existing_count} downloaded)", [
        "All chapters (skip existing)",
        "Select chapters (space to toggle)",
    ])
    if idx == -1:
        return "back"
    if idx == 0:
        return None  # all

    items = []
    for i, (ch, done) in enumerate(zip(chapters, done_flags), 1):
        label = f"{i}. {ch['chapterName']}"
        if done:
            # ANSI bright-black (gray) for chapters already on disk.
            label = f"\033[90m{label} [done]\033[0m"
        items.append(label)

    menu = TerminalMenu(
        items,
        title="Space=toggle, Enter=confirm, /=search:",
        multi_select=True,
        show_multi_select_hint=True,
        search_key="/",
        show_search_hint=True,
    )
    selected = menu.show()
    if selected is None:
        return "back"
    if isinstance(selected, int):
        selected = (selected,)
    return {i + 1 for i in selected}  # 1-based
{total} objects, {len(slug_counts)} manga") + for slug in sorted(slug_counts): + print(f" {manga_display_name(slug)}: {slug_counts[slug]} objects") + try: + conn = get_db() + cur = conn.cursor() + cur.execute('SELECT COUNT(*) FROM "Manga"') + mc = cur.fetchone()[0] + cur.execute('SELECT COUNT(*) FROM "Chapter"') + cc = cur.fetchone()[0] + cur.execute('SELECT COUNT(*) FROM "Page"') + pc = cur.fetchone()[0] + print(f" DB: {mc} manga, {cc} chapters, {pc} pages") + conn.close() + except Exception as e: + print(f" DB: {e}") + input("\n Press ENTER...") + + elif idx == 1: + picked = tui_pick_r2() + if not picked: + continue + confirm = input(f" Delete {picked} from R2 + DB? [y/N] ").strip().lower() + if confirm == "y": + r2_delete_prefix(f"manga/{picked}/") + try: + conn = get_db() + cur = conn.cursor() + cur.execute('SELECT id FROM "Manga" WHERE slug = %s', (picked,)) + row = cur.fetchone() + if row: + mid = row[0] + cur.execute('DELETE FROM "Page" WHERE "chapterId" IN (SELECT id FROM "Chapter" WHERE "mangaId" = %s)', (mid,)) + cur.execute('DELETE FROM "Chapter" WHERE "mangaId" = %s', (mid,)) + cur.execute('DELETE FROM "Manga" WHERE id = %s', (mid,)) + conn.commit() + print(f" Removed from R2 + DB") + conn.close() + except Exception as e: + print(f" DB error: {e}") + + elif idx == 2: + confirm = input(" Delete ALL R2 + DB? 
def main():
    """Run the top-level menu loop until the user quits or cancels."""
    labels = [
        "Setup (solve Cloudflare)",
        "Download",
        "Upload (local -> R2)",
        "Sync (site -> R2)",
        "R2 / DB management",
        "Quit",
    ]
    quit_index = 5

    while True:
        choice = tui_select("Manga Toolkit", labels, back=False)

        # None / -1 mean the menu was cancelled; treat both like "Quit".
        if choice is None or choice in (-1, quit_index):
            break

        if choice == 0:
            cmd_setup()
        elif choice == 1:
            tui_download()
        elif choice == 2:
            tui_upload()
        elif choice == 3:
            tui_sync()
        elif choice == 4:
            tui_r2_manage()

    print("Bye!")
- -R2 storage layout: - manga/<slug>/cover.webp - manga/<slug>/chapters/<number>/<page>.webp - -Usage: - python upload.py -""" - -import io -import json -import os -import re -import sys -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -import boto3 -import psycopg2 -from PIL import Image -from dotenv import load_dotenv - -load_dotenv() - -ROOT_DIR = Path(__file__).parent -CONTENT_DIR = ROOT_DIR / "manga-content" - -# R2 config -s3 = boto3.client( - "s3", - endpoint_url=f"https://{os.environ['R2_ACCOUNT_ID']}.r2.cloudflarestorage.com", - aws_access_key_id=os.environ["R2_ACCESS_KEY"], - aws_secret_access_key=os.environ["R2_SECRET_KEY"], - region_name="auto", -) -BUCKET = os.environ["R2_BUCKET"] -PUBLIC_URL = os.environ["R2_PUBLIC_URL"].rstrip("/") - -# Database -DATABASE_URL = os.environ["DATABASE_URL"] -UPLOAD_WORKERS = 8 - - -def convert_to_webp(image_path, quality=80): - img = Image.open(image_path) - buf = io.BytesIO() - img.save(buf, format="WEBP", quality=quality) - buf.seek(0) - return buf.read() - - -def make_cover(image_path, width=400, height=560): - img = Image.open(image_path) - target_ratio = width / height - img_ratio = img.width / img.height - if img_ratio > target_ratio: - new_width = int(img.height * target_ratio) - left = (img.width - new_width) // 2 - img = img.crop((left, 0, left + new_width, img.height)) - else: - new_height = int(img.width / target_ratio) - img = img.crop((0, 0, img.width, new_height)) - img = img.resize((width, height), Image.LANCZOS) - buf = io.BytesIO() - img.save(buf, format="WEBP", quality=85) - buf.seek(0) - return buf.read() - - -def upload_to_r2(key, data, content_type="image/webp"): - s3.put_object(Bucket=BUCKET, Key=key, Body=data, ContentType=content_type) - return f"{PUBLIC_URL}/{key}" - - -def r2_key_exists(key): - try: - s3.head_object(Bucket=BUCKET, Key=key) - return True - except s3.exceptions.ClientError: - return False - - -def get_db(): - conn = 
psycopg2.connect(DATABASE_URL) - conn.set_client_encoding("UTF8") - return conn - - -def parse_chapter_dir(dir_name): - """Parse '1 001. 序章' -> (1, '001. 序章').""" - m = re.match(r"^(\d+)\s+(.+)$", dir_name) - if m: - return int(m.group(1)), m.group(2) - return 0, dir_name - - -def list_local_manga(): - """List manga directories in manga-content/.""" - dirs = sorted( - d.name for d in CONTENT_DIR.iterdir() - if d.is_dir() and not d.name.startswith(".") - ) - return dirs - - -# ── Commands ────────────────────────────────────────────── - - -def cmd_reset(): - """Clear all R2 storage.""" - print("\nClearing R2 bucket...") - total = 0 - batches = [] - paginator = s3.get_paginator("list_objects_v2") - for page in paginator.paginate(Bucket=BUCKET): - objects = page.get("Contents", []) - if not objects: - break - batches.append([{"Key": obj["Key"]} for obj in objects]) - - # Delete batches in parallel - def delete_batch(keys): - s3.delete_objects(Bucket=BUCKET, Delete={"Objects": keys}) - return len(keys) - - with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: - for count in pool.map(delete_batch, batches): - total += count - print(f" {total} deleted", end="\r") - - print(f" {total} objects deleted from R2" + " " * 10) - print("R2 cleared. 
Run 'upload' to re-upload.\n") - - -def cmd_status(conn): - """Show current state of R2 and database.""" - cur = conn.cursor() - - # DB counts - cur.execute('SELECT COUNT(*) FROM "Manga"') - manga_count = cur.fetchone()[0] - cur.execute('SELECT COUNT(*) FROM "Chapter"') - chapter_count = cur.fetchone()[0] - cur.execute('SELECT COUNT(*) FROM "Page"') - page_count = cur.fetchone()[0] - - print(f"\n Database: {manga_count} manga, {chapter_count} chapters, {page_count} pages") - - # List manga in DB - cur.execute('SELECT slug, title, (SELECT COUNT(*) FROM "Chapter" WHERE "mangaId" = "Manga".id) FROM "Manga" ORDER BY slug') - for slug, title, ch_count in cur.fetchall(): - print(f" {slug}: {title} ({ch_count} chapters)") - - # R2 count - total = 0 - paginator = s3.get_paginator("list_objects_v2") - for page in paginator.paginate(Bucket=BUCKET): - total += len(page.get("Contents", [])) - print(f" R2: {total} objects") - - # Local - local = list_local_manga() - print(f" Local: {len(local)} manga in manga-content/") - for name in local: - manga_path = CONTENT_DIR / name - chapters = [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")] - has_cover = (manga_path / "cover.jpg").exists() - print(f" {name}: {len(chapters)} chapters, cover: {'yes' if has_cover else 'no'}") - print() - - -def cmd_upload(conn, manga_name=None): - """Upload manga to R2 and create DB records.""" - if manga_name: - manga_dirs = [manga_name] - if not (CONTENT_DIR / manga_name).is_dir(): - print(f" Not found: {CONTENT_DIR / manga_name}") - return - else: - manga_dirs = list_local_manga() - - if not manga_dirs: - print(" No manga found in manga-content/") - return - - print(f"\n Uploading {len(manga_dirs)} manga(s)...") - - for name in manga_dirs: - upload_manga(name, conn) - - print("\nUpload complete!") - - -def upload_manga(manga_name, conn): - manga_path = CONTENT_DIR / manga_name - detail_path = manga_path / "detail.json" - - if not detail_path.exists(): - print(f" Skipping 
{manga_name}: no detail.json") - return - - detail = json.loads(detail_path.read_text(encoding="utf-8")) - title = detail.get("mg-title", manga_name) - slug = manga_name - genres = detail.get("mg-genres", []) - description = detail.get("mg-description", "") - if not description and genres: - description = f"Genres: {', '.join(genres)}" - genre = genres[0] if genres else "Drama" - - print(f"\n {'='*50}") - print(f" {title} ({slug})") - print(f" {'='*50}") - - cur = conn.cursor() - - # Cover - cover_file = manga_path / "cover.jpg" - cover_url = "" - cover_key = f"manga/{slug}/cover.webp" - - if cover_file.exists(): - if not r2_key_exists(cover_key): - cover_data = make_cover(cover_file) - cover_url = upload_to_r2(cover_key, cover_data) - print(f" Cover uploaded") - else: - cover_url = f"{PUBLIC_URL}/{cover_key}" - print(f" Cover exists") - else: - print(" No cover.jpg") - - # Manga record - cur.execute('SELECT id, "coverUrl" FROM "Manga" WHERE slug = %s', (slug,)) - row = cur.fetchone() - - if row: - manga_id, existing_cover = row - print(f" Manga exists (id: {manga_id})") - if cover_url and cover_url != existing_cover: - cur.execute( - 'UPDATE "Manga" SET "coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s', - (cover_url, manga_id), - ) - conn.commit() - else: - cur.execute( - """ - INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt") - VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW()) - RETURNING id - """, - (title, description, cover_url, slug, genre), - ) - manga_id = cur.fetchone()[0] - conn.commit() - print(f" Created manga (id: {manga_id})") - - # Chapters - chapter_dirs = sorted( - [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")], - key=lambda d: parse_chapter_dir(d.name)[0], - ) - - for chapter_dir in chapter_dirs: - order_num, chapter_title = parse_chapter_dir(chapter_dir.name) - if order_num == 0: - continue - - cur.execute( - 'SELECT id FROM "Chapter" WHERE "mangaId" = %s AND 
number = %s', - (manga_id, order_num), - ) - if cur.fetchone(): - print(f" [{order_num}] {chapter_title} — skip") - continue - - page_files = sorted( - [f for f in chapter_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png", ".webp")], - key=lambda f: int(re.search(r"(\d+)", f.stem).group(1)) if re.search(r"(\d+)", f.stem) else 0, - ) - - if not page_files: - continue - - print(f" [{order_num}] {chapter_title} ({len(page_files)} pages)") - - cur.execute( - 'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id', - (manga_id, order_num, chapter_title), - ) - chapter_id = cur.fetchone()[0] - conn.commit() - - # Parallel convert + upload - def process_page(args): - j, page_file = args - r2_key = f"manga/{slug}/chapters/{order_num}/{j}.webp" - if not r2_key_exists(r2_key): - webp_data = convert_to_webp(page_file) - return j, upload_to_r2(r2_key, webp_data) - return j, f"{PUBLIC_URL}/{r2_key}" - - page_urls = {} - done = 0 - with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: - futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)} - for future in as_completed(futures): - j, url = future.result() - page_urls[j] = url - done += 1 - print(f" {done}/{len(page_files)}", end="\r") - - # Batch insert page records in order - for j in sorted(page_urls): - cur.execute( - 'INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', - (chapter_id, j, page_urls[j]), - ) - - conn.commit() - print(f" {len(page_files)} pages uploaded" + " " * 10) - - -# ── Interactive loop ────────────────────────────────────── - - -def show_menu(): - print() - print("=" * 40) - print(" Manga Uploader") - print("=" * 40) - print(" 1. Status") - print(" 2. Upload all manga") - print(" 3. Upload specific manga") - print(" 4. Reset R2 storage") - print(" 0. 
Quit") - print() - - -def main(): - conn = get_db() - try: - while True: - show_menu() - try: - choice = input("Select [0-4]: ").strip() - except (EOFError, KeyboardInterrupt): - print() - break - - if choice == "0": - break - elif choice == "1": - cmd_status(conn) - elif choice == "2": - cmd_upload(conn) - elif choice == "3": - local = list_local_manga() - if not local: - print(" No manga in manga-content/") - continue - print() - for i, name in enumerate(local, 1): - print(f" {i}. {name}") - print() - pick = input("Select manga number: ").strip() - try: - idx = int(pick) - 1 - if 0 <= idx < len(local): - cmd_upload(conn, local[idx]) - else: - print(" Invalid selection") - except ValueError: - print(" Invalid input") - elif choice == "4": - confirm = input(" Delete ALL R2 objects? [y/N] ").strip().lower() - if confirm == "y": - cmd_reset() - else: - print(" Cancelled.") - else: - print(" Invalid choice") - finally: - conn.close() - - print("Bye!") - - -if __name__ == "__main__": - main()