commit 721ad213eefcda1d60e54a211da40f6c1e39c3dc Author: yiekheng Date: Sat Apr 11 16:55:13 2026 +0800 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..580249b --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.env +__pycache__/ +manga-content/ +.browser-data/ +cookies.txt +.DS_Store diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..e494aaf --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,47 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Manga downloader for m.happymh.com. Reads manga URLs from `manga.json` and downloads chapter images into `manga-content/`. + +## Data Flow + +1. **Input**: `manga.json` — JSON array of manga URLs (e.g., `["https://m.happymh.com/manga/butiange"]`) +2. **Output**: `manga-content/<slug>/<chapter-folder>/*.jpg` — downloaded page images +3. **Metadata**: `manga-content/<slug>/detail.json` — stores manga/chapter metadata + +## Directory Convention + +``` +manga-content/ + butiange/ + detail.json + 1 第一回/ + 1.jpg + 2.jpg + 3.jpg + 2 第二回/ + ... +``` + +- Manga name is the URL slug (last path segment of the manga URL) +- Chapter folders are named `<order> <chapter-name>` (e.g., `1 第一回`) +- Image filenames are sequential page numbers (`1.jpg`, `2.jpg`, ...) 
+ +## Metadata Format (`detail.json`) + +Each manga folder contains a `detail.json` with fields: +- `mg-url` — source URL on m.happymh.com +- `mg-title` — manga title (Chinese) +- `mg-author` — author name +- `mg-genres` — array of genre tags +- `mg-description` — synopsis text + +## Target Site + +- Base URL: `https://m.happymh.com` +- Manga page: `/manga/<slug>` — contains chapter listing +- Chapter page: `/reads/<slug>/<chapter-id>` — contains page images +- The site is mobile-oriented; requests should use appropriate mobile User-Agent headers diff --git a/download.py b/download.py new file mode 100644 index 0000000..2b5d464 --- /dev/null +++ b/download.py @@ -0,0 +1,686 @@ +""" +Manga downloader for m.happymh.com (educational purposes only). + +Launches real Chrome via subprocess (not Playwright), then connects via +Chrome DevTools Protocol. Images are downloaded directly via HTTP. + +Usage: + python download.py --setup # open Chrome, solve CF manually, exit + python download.py # download manga from manga.json +""" + +import json +import re +import sys +import time +import socket +import subprocess +from pathlib import Path +from urllib.parse import urlparse + +from playwright.sync_api import sync_playwright + +BASE_URL = "https://m.happymh.com" +USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/143.0.0.0 Safari/537.36" +) +ROOT_DIR = Path(__file__).parent +CONTENT_DIR = ROOT_DIR / "manga-content" +MANGA_JSON = ROOT_DIR / "manga.json" +BROWSER_DATA = ROOT_DIR / ".browser-data" +CDP_PORT = 9333 +REQUEST_DELAY = 1.5 + +CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + + +def is_port_open(port): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(("localhost", port)) == 0 + + +def launch_chrome(start_url=None): + """Launch real Chrome with CDP port.""" + if is_port_open(CDP_PORT): + print(f"Chrome already on port {CDP_PORT}") + 
return None + + if not Path(CHROME_PATH).exists(): + print(f"Chrome not found at: {CHROME_PATH}") + sys.exit(1) + + cmd = [ + CHROME_PATH, + f"--remote-debugging-port={CDP_PORT}", + f"--user-data-dir={BROWSER_DATA}", + "--no-first-run", + "--no-default-browser-check", + ] + if start_url: + cmd.append(start_url) + + proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + for _ in range(30): + if is_port_open(CDP_PORT): + time.sleep(1) + return proc + time.sleep(0.5) + + print("Chrome failed to start") + sys.exit(1) + + +def wait_for_cloudflare(page, timeout=120): + """Wait for CF to resolve. User solves CAPTCHA manually if needed.""" + for i in range(timeout): + try: + title = page.title() + except Exception: + time.sleep(1) + continue + + if "Just a moment" in title or "challenge" in page.url: + if i == 0: + print(" CF challenge — solve in browser...") + elif i % 15 == 0: + print(f" Still waiting for CF... ({i}s)") + time.sleep(1) + continue + + if title and "嗨皮漫画" in title: + return True + if title and "happymh" in page.url: + return True + + time.sleep(1) + + print(" CF timed out.") + return False + + +def fetch_chapters_via_api(page, slug): + """Get full chapter list via chapterByPage API with pagination.""" + result = page.evaluate(""" + async (slug) => { + const all = []; + let total = 0; + for (let p = 1; p <= 30; p++) { + const url = `/v2.0/apis/manga/chapterByPage?code=${slug}&lang=cn&order=asc&page=${p}&_t=${Date.now()}`; + try { + const ctrl = new AbortController(); + setTimeout(() => ctrl.abort(), 10000); + const r = await fetch(url, { signal: ctrl.signal }); + if (!r.ok) { if (p === 1) return { error: r.status }; break; } + const json = await r.json(); + if (!json.data) break; + + total = json.data.total || total; + + // Find chapter array in response + let items = null; + for (const val of Object.values(json.data)) { + if (Array.isArray(val) && val.length > 0) { + items = val; + break; + } + } + if (!items || items.length 
=== 0) break; + + for (const ch of items) { + all.push({ + id: String(ch.id || ''), + chapterName: ch.chapterName || ch.name || '', + }); + } + + if (total && all.length >= total) break; + } catch (e) { + if (p === 1) return { error: e.message }; + break; + } + } + return { chapters: all, total }; + } + """, slug) + + if result and result.get("chapters") and len(result["chapters"]) > 0: + chapters = result["chapters"] + total = result.get("total", len(chapters)) + print(f" API: {len(chapters)}/{total} chapters") + return chapters + + if result and result.get("error"): + print(f" API error: {result['error']}") + + return None + + +def fetch_chapters_from_dom(page): + """Scrape all chapters from the MUI Drawer chapter list. + Opens drawer, clicks 'load more' repeatedly, then scrapes.""" + try: + page.wait_for_selector("a[href*='/mangaread/']", timeout=15000) + page.wait_for_timeout(1000) + except Exception: + print(" No chapter links found") + return None + + # Step 1: Open the chapter list drawer + for selector in [ + "text=展开全部", "text=查看全部", "text=全部章节", + "text=展开更多", "text=更多", + "[class*='expand']", "[class*='more']", + ]: + try: + btn = page.query_selector(selector) + if btn and btn.is_visible(): + btn.click() + print(" Opening chapter drawer...") + page.wait_for_timeout(2000) + break + except Exception: + continue + + # Step 2: Wait for drawer + try: + page.wait_for_selector(".MuiDrawer-paper", timeout=5000) + except Exception: + print(" Drawer not found, using page chapters") + + # Step 3: Click sort button to get ascending order (oldest first) + try: + sort_btn = page.query_selector("text=点我改变排序") + if sort_btn and sort_btn.is_visible(): + sort_btn.click() + print(" Sorting ascending...") + page.wait_for_timeout(2000) + except Exception: + pass + + # Step 4: Click "点我加载更多" until all chapters loaded + # Get expected total from header "共177个章节" + total = page.evaluate(""" + () => { + const spans = document.querySelectorAll('.MuiDrawer-paper span'); + for 
(const s of spans) { + const m = s.textContent.match(/共(\\d+)个章节/); + if (m) return parseInt(m[1]); + } + return 0; + } + """) + if total: + print(f" Total chapters: {total}") + + for round_num in range(50): + count = page.evaluate( + "document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length" + ) + if total and count >= total: + break + print(f" Loading... {count}/{total or '?'}", end="\r") + + # Find and click the "load more" element — search fresh each time + clicked = page.evaluate(""" + () => { + const walker = document.createTreeWalker( + document.querySelector('.MuiDrawer-paper') || document.body, + NodeFilter.SHOW_TEXT + ); + while (walker.nextNode()) { + if (walker.currentNode.textContent.includes('加载更多')) { + let el = walker.currentNode.parentElement; + while (el && el.tagName !== 'LI') el = el.parentElement; + if (el) { el.click(); return true; } + walker.currentNode.parentElement.click(); + return true; + } + } + return false; + } + """) + if not clicked: + break + page.wait_for_timeout(1000) + + count = page.evaluate( + "document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length" + ) + print(f" Loaded {count} chapters" + " " * 20) + + # Step 5: Scrape chapters from the drawer + chapters = page.evaluate(""" + () => { + const drawer = document.querySelector('.MuiDrawer-paper'); + const container = drawer || document; + const links = container.querySelectorAll('a[href*="/mangaread/"]'); + const chapters = []; + const seen = new Set(); + links.forEach(a => { + const href = a.getAttribute('href'); + const match = href.match(/\\/mangaread\\/[^/]+\\/(\\d+)/); + if (match && !seen.has(match[1])) { + seen.add(match[1]); + const name = a.textContent.trim(); + if (name && name !== '开始阅读') { + chapters.push({ id: match[1], chapterName: name }); + } + } + }); + return chapters; + } + """) + + # Step 6: Close drawer + try: + page.keyboard.press("Escape") + page.wait_for_timeout(500) + except Exception: + pass + + return 
chapters if chapters else None + + +def fetch_metadata(page): + """Extract manga metadata and cover URL from the loaded page.""" + html_text = page.content() + metadata = {"mg-url": page.url} + + m = re.search(r'<h2 class="mg-title">(.*?)</h2>', html_text) + if m: + metadata["mg-title"] = m.group(1).strip() + + m = re.search(r'<p class="mg-sub-title"><a[^>]*>(.*?)</a>', html_text) + if m: + metadata["mg-author"] = m.group(1).strip() + + genre_matches = re.findall(r'<p class="mg-cate">.*?</p>', html_text, re.DOTALL) + if genre_matches: + genres = re.findall(r'<a[^>]*>(.*?)</a>', genre_matches[0]) + metadata["mg-genres"] = genres + + m = re.search(r'<div class="mg-desc">.*?<p[^>]*>(.*?)</p>', html_text, re.DOTALL) + if m: + metadata["mg-description"] = m.group(1).strip() + + # Extract cover image URL + cover_url = page.evaluate(""" + () => { + // Try og:image meta tag + const og = document.querySelector('meta[property="og:image"]'); + if (og) return og.content; + // Try common cover selectors + const selectors = ['img.mg-cover', '.mg-cover img', '.cover img', 'img[src*="mcover"]']; + for (const sel of selectors) { + const img = document.querySelector(sel); + if (img && img.src) return img.src; + } + return null; + } + """) + if cover_url: + metadata["mg-cover"] = cover_url + + return metadata + + +def get_chapter_images(page, slug, chapter_id): + """Navigate to reader page, intercept the API response for image URLs.""" + captured_images = [] + api_info = {"found": False, "error": None} + + def on_response(response): + if "/apis/manga/reading" not in response.url: + return + api_info["found"] = True + if response.status != 200: + api_info["error"] = f"status {response.status}" + return + try: + data = response.json() + scans = data.get("data", {}).get("scans", []) + if isinstance(scans, str): + scans = json.loads(scans) + for scan in scans: + if isinstance(scan, dict) and "url" in scan: + captured_images.append({ + "url": scan["url"], + "no_referrer": scan.get("r", 0) 
!= 0, + }) + except Exception as e: + api_info["error"] = str(e) + + page.on("response", on_response) + + reader_url = f"{BASE_URL}/mangaread/{slug}/{chapter_id}" + print(" Loading reader...") + try: + page.evaluate(f"window.location.href = '{reader_url}'") + except Exception: + pass + + time.sleep(2) + + try: + page.evaluate("window.close = () => {}") + except Exception: + pass + + print(" Waiting for page...") + if not wait_for_cloudflare(page, timeout=90): + page.remove_listener("response", on_response) + return [] + + print(" Waiting for API...") + deadline = time.time() + 20 + while time.time() < deadline: + if captured_images: + break + try: + page.wait_for_timeout(500) + except Exception: + break + + page.remove_listener("response", on_response) + + if not api_info["found"]: + print(" API not intercepted") + elif api_info["error"]: + print(f" API: {api_info['error']}") + + # DOM fallback + if not captured_images: + try: + page.wait_for_timeout(3000) + dom_images = page.evaluate(""" + () => { + const imgs = document.querySelectorAll('img[src*="http"]'); + const urls = []; + const seen = new Set(); + imgs.forEach(img => { + const src = img.src || ''; + if (src && !seen.has(src) && !src.includes('/mcover/') + && !src.includes('cloudflare') && !src.includes('.svg')) { + seen.add(src); + urls.push(src); + } + }); + return urls; + } + """) + if dom_images: + print(f" DOM: {len(dom_images)} images") + for u in dom_images: + captured_images.append({"url": u, "no_referrer": False}) + except Exception as e: + print(f" DOM failed: {e}") + + return captured_images + + +def download_image(page, img, save_path): + """Download image via browser network stack. 
Captures raw bytes via CDP — no base64.""" + if save_path.exists(): + return True + + url = img["url"] + ref_policy = "no-referrer" if img.get("no_referrer") else "origin" + + try: + with page.expect_response(lambda r: url in r.url, timeout=15000) as resp_info: + page.evaluate( + "([u, r]) => fetch(u, { referrerPolicy: r })", + [url, ref_policy], + ) + response = resp_info.value + if response.status == 200: + body = response.body() # raw bytes from network layer + if body and len(body) > 100: + save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.write_bytes(body) + return True + except Exception as e: + if not hasattr(download_image, "_err_logged"): + download_image._err_logged = True + print(f"\n First error: {e}") + return False + + return False + + +def get_existing_chapters(manga_dir): + existing = set() + if manga_dir.exists(): + for entry in manga_dir.iterdir(): + if entry.is_dir() and any(entry.glob("*.jpg")): + existing.add(entry.name) + return existing + + +def download_manga(page, manga_url): + """Download all chapters using a single page.""" + slug = urlparse(manga_url).path.strip("/").split("/")[-1] + manga_dir = CONTENT_DIR / slug + + print(f"\n{'='*60}") + print(f"Manga: {slug}") + print(f"{'='*60}") + + # Intercept all cover images from page load traffic + cover_responses = {} + + def on_manga_response(response): + if "/mcover/" in response.url and response.status == 200: + try: + cover_responses[response.url] = response.body() + except Exception: + pass + + page.on("response", on_manga_response) + + print("Loading manga page...") + try: + page.goto(f"{BASE_URL}/manga/{slug}", wait_until="commit", timeout=60000) + except Exception: + pass + if not wait_for_cloudflare(page): + page.remove_listener("response", on_manga_response) + return + + print("Fetching chapters via API...") + chapters = fetch_chapters_via_api(page, slug) + if not chapters: + print(" API failed, trying DOM...") + chapters = fetch_chapters_from_dom(page) + if not 
chapters: + print("No chapters found.") + return + + print(f"Found {len(chapters)} chapters") + + metadata = fetch_metadata(page) + manga_dir.mkdir(parents=True, exist_ok=True) + detail_path = manga_dir / "detail.json" + if metadata: + existing_meta = {} + if detail_path.exists(): + try: + existing_meta = json.loads(detail_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + pass + existing_meta.update(metadata) + detail_path.write_text( + json.dumps(existing_meta, ensure_ascii=False, indent=4), + encoding="utf-8", + ) + + # Save cover image — match the correct one from DOM + page.remove_listener("response", on_manga_response) + cover_path = manga_dir / "cover.jpg" + if not cover_path.exists(): + # Find the actual cover URL from the first mip-fill-content img + cover_url = page.evaluate(""" + () => { + const img = document.querySelector('img.mip-fill-content[src*="mcover"]'); + return img ? img.src : null; + } + """) + cover_body = None + if cover_url: + # Exact match first + cover_body = cover_responses.get(cover_url) + # Substring match fallback + if not cover_body: + for url, data in cover_responses.items(): + if cover_url.split("?")[0] in url or url.split("?")[0] in cover_url: + cover_body = data + break + + if cover_body and len(cover_body) > 100: + cover_path.write_bytes(cover_body) + print(f"Cover saved ({len(cover_body)} bytes)") + else: + print(f"Cover not found (captured {len(cover_responses)} mcover images, target: {cover_url})") + + existing_chapters = get_existing_chapters(manga_dir) + + # Chapters are already in DOM order (ascending from drawer) + chapters_sorted = chapters + + for i, chapter in enumerate(chapters_sorted, 1): + ch_id = chapter["id"] + ch_name = chapter["chapterName"] + folder_name = f"{i} {ch_name}" + + # Skip if this chapter already downloaded (check by chapter name) + already = any(ch_name in name for name in existing_chapters) + if already: + print(f" [{i}/{len(chapters_sorted)}] {ch_name} — skip") + continue + + 
print(f" [{i}/{len(chapters_sorted)}] {ch_name} (id={ch_id})") + + images = get_chapter_images(page, slug, ch_id) + if not images: + print(f" No images") + continue + + print(f" {len(images)} pages") + chapter_dir = manga_dir / folder_name + chapter_dir.mkdir(parents=True, exist_ok=True) + + # Download images via browser network stack (raw bytes, no base64) + ok = 0 + failed = [] + for pn, img in enumerate(images, 1): + save_path = chapter_dir / f"{pn}.jpg" + if download_image(page, img, save_path): + ok += 1 + print(f" {pn}/{len(images)}", end="\r") + else: + failed.append((pn, img)) + time.sleep(0.1) + + # Retry failed images once + if failed: + time.sleep(1) + for pn, img in failed: + save_path = chapter_dir / f"{pn}.jpg" + if download_image(page, img, save_path): + ok += 1 + else: + print(f" {pn}/{len(images)} FAIL") + time.sleep(0.3) + + print(f" {ok}/{len(images)} downloaded" + " " * 20) + + if ok == 0: + try: + chapter_dir.rmdir() + except Exception: + pass + + time.sleep(REQUEST_DELAY) + + print(f"\nDone: {slug}") + + +def setup_mode(): + """Launch Chrome for manual CF solving.""" + print("=== SETUP ===") + print("Chrome will open. Do this:") + print(" 1. Go to m.happymh.com — solve Cloudflare") + print(" 2. Open a manga page — solve CF if prompted") + print(" 3. Open a chapter reader — solve CF if prompted") + print(" 4. Press ENTER here when done\n") + + chrome_proc = launch_chrome(BASE_URL) + + input(">>> Press ENTER when Cloudflare is solved... ") + + try: + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}") + ctx = browser.contexts[0] + cookies = ctx.cookies() + cf = [c for c in cookies if c["name"] == "cf_clearance"] + if cf: + print("cf_clearance found!") + else: + print("Warning: cf_clearance not found") + browser.close() + except Exception as e: + print(f"Could not verify: {e}") + + if chrome_proc: + chrome_proc.terminate() + + print("Done. 
Now run: python download.py") + + +def main(): + if "--setup" in sys.argv: + setup_mode() + return + + if not MANGA_JSON.exists(): + print(f"Error: {MANGA_JSON} not found") + sys.exit(1) + + manga_urls = json.loads(MANGA_JSON.read_text(encoding="utf-8")) + if not isinstance(manga_urls, list) or not manga_urls: + print("Error: manga.json should be a JSON array of URLs") + sys.exit(1) + + print(f"Found {len(manga_urls)} manga(s)") + print("Launching Chrome...\n") + + chrome_proc = launch_chrome() + + try: + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}") + context = browser.contexts[0] + page = context.pages[0] if context.pages else context.new_page() + + for url in manga_urls: + try: + download_manga(page, url) + except Exception as e: + print(f"\nError: {url}: {e}") + import traceback + traceback.print_exc() + + browser.close() + finally: + if chrome_proc: + chrome_proc.terminate() + + print("\nAll done!") + + +if __name__ == "__main__": + main() diff --git a/export_cookies.py b/export_cookies.py new file mode 100644 index 0000000..f1a70b5 --- /dev/null +++ b/export_cookies.py @@ -0,0 +1,92 @@ +""" +Opens a browser to m.happymh.com, waits for you to pass Cloudflare, +then saves cookies to cookies.txt in Netscape format. + +Install: + pip install playwright + playwright install chromium + +Usage: + python export_cookies.py +""" + +import time +from pathlib import Path + +try: + from playwright.sync_api import sync_playwright +except ImportError: + print("Playwright not installed. 
Run:") + print(" pip install playwright") + print(" playwright install chromium") + raise SystemExit(1) + +COOKIES_FILE = Path(__file__).parent / "cookies.txt" +TARGET_URL = "https://m.happymh.com" + + +def cookies_to_netscape(cookies): + """Convert Playwright cookies to Netscape cookies.txt format.""" + lines = ["# Netscape HTTP Cookie File", ""] + for c in cookies: + domain = c["domain"] + # Netscape format: leading dot means accessible to subdomains + if not domain.startswith("."): + domain = "." + domain + flag = "TRUE" # accessible to subdomains + path = c.get("path", "/") + secure = "TRUE" if c.get("secure", False) else "FALSE" + expires = str(int(c.get("expires", 0))) + name = c["name"] + value = c["value"] + lines.append(f"{domain}\t{flag}\t{path}\t{secure}\t{expires}\t{name}\t{value}") + return "\n".join(lines) + "\n" + + +def main(): + print("Opening browser to m.happymh.com...") + print("Once the page loads (past Cloudflare), press ENTER here to save cookies.\n") + + with sync_playwright() as p: + browser = p.chromium.launch(headless=False) + context = browser.new_context( + user_agent=( + "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) " + "Version/16.0 Mobile/15E148 Safari/604.1" + ), + viewport={"width": 390, "height": 844}, + is_mobile=True, + ) + + page = context.new_page() + page.goto(TARGET_URL) + + input(">>> Page opened. Pass Cloudflare if needed, then press ENTER to save cookies... ") + + cookies = context.cookies() + if not cookies: + print("No cookies found!") + browser.close() + return + + # Check for cf_clearance + cookie_names = [c["name"] for c in cookies] + if "cf_clearance" in cookie_names: + print("cf_clearance cookie found (Cloudflare passed)") + else: + print("Warning: cf_clearance not found. You may still be on the challenge page.") + answer = input("Save anyway? 
[y/N] ").strip().lower() + if answer != "y": + browser.close() + return + + text = cookies_to_netscape(cookies) + COOKIES_FILE.write_text(text) + print(f"\nSaved {len(cookies)} cookies to {COOKIES_FILE}") + + browser.close() + + +if __name__ == "__main__": + main() diff --git a/manga.json b/manga.json new file mode 100644 index 0000000..a84f9e2 --- /dev/null +++ b/manga.json @@ -0,0 +1,4 @@ +[ + "https://m.happymh.com/manga/moutianchengweimoshen", + "https://m.happymh.com/manga/butiange" +] \ No newline at end of file diff --git a/upload.py b/upload.py new file mode 100644 index 0000000..e74ed31 --- /dev/null +++ b/upload.py @@ -0,0 +1,393 @@ +""" +Interactive manga uploader — Cloudflare R2 + PostgreSQL. + +R2 storage layout: + manga/<slug>/cover.webp + manga/<slug>/chapters/<number>/<page>.webp + +Usage: + python upload.py +""" + +import io +import json +import os +import re +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import boto3 +import psycopg2 +from PIL import Image +from dotenv import load_dotenv + +load_dotenv() + +ROOT_DIR = Path(__file__).parent +CONTENT_DIR = ROOT_DIR / "manga-content" + +# R2 config +s3 = boto3.client( + "s3", + endpoint_url=f"https://{os.environ['R2_ACCOUNT_ID']}.r2.cloudflarestorage.com", + aws_access_key_id=os.environ["R2_ACCESS_KEY"], + aws_secret_access_key=os.environ["R2_SECRET_KEY"], + region_name="auto", +) +BUCKET = os.environ["R2_BUCKET"] +PUBLIC_URL = os.environ["R2_PUBLIC_URL"].rstrip("/") + +# Database +DATABASE_URL = os.environ["DATABASE_URL"] +UPLOAD_WORKERS = 8 + + +def convert_to_webp(image_path, quality=80): + img = Image.open(image_path) + buf = io.BytesIO() + img.save(buf, format="WEBP", quality=quality) + buf.seek(0) + return buf.read() + + +def make_cover(image_path, width=400, height=560): + img = Image.open(image_path) + target_ratio = width / height + img_ratio = img.width / img.height + if img_ratio > target_ratio: + new_width = int(img.height * 
target_ratio) + left = (img.width - new_width) // 2 + img = img.crop((left, 0, left + new_width, img.height)) + else: + new_height = int(img.width / target_ratio) + img = img.crop((0, 0, img.width, new_height)) + img = img.resize((width, height), Image.LANCZOS) + buf = io.BytesIO() + img.save(buf, format="WEBP", quality=85) + buf.seek(0) + return buf.read() + + +def upload_to_r2(key, data, content_type="image/webp"): + s3.put_object(Bucket=BUCKET, Key=key, Body=data, ContentType=content_type) + return f"{PUBLIC_URL}/{key}" + + +def r2_key_exists(key): + try: + s3.head_object(Bucket=BUCKET, Key=key) + return True + except s3.exceptions.ClientError: + return False + + +def get_db(): + conn = psycopg2.connect(DATABASE_URL) + conn.set_client_encoding("UTF8") + return conn + + +def parse_chapter_dir(dir_name): + """Parse '1 001. 序章' -> (1, '001. 序章').""" + m = re.match(r"^(\d+)\s+(.+)$", dir_name) + if m: + return int(m.group(1)), m.group(2) + return 0, dir_name + + +def list_local_manga(): + """List manga directories in manga-content/.""" + dirs = sorted( + d.name for d in CONTENT_DIR.iterdir() + if d.is_dir() and not d.name.startswith(".") + ) + return dirs + + +# ── Commands ────────────────────────────────────────────── + + +def cmd_reset(): + """Clear all R2 storage.""" + print("\nClearing R2 bucket...") + total = 0 + batches = [] + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=BUCKET): + objects = page.get("Contents", []) + if not objects: + break + batches.append([{"Key": obj["Key"]} for obj in objects]) + + # Delete batches in parallel + def delete_batch(keys): + s3.delete_objects(Bucket=BUCKET, Delete={"Objects": keys}) + return len(keys) + + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for count in pool.map(delete_batch, batches): + total += count + print(f" {total} deleted", end="\r") + + print(f" {total} objects deleted from R2" + " " * 10) + print("R2 cleared. 
Run 'upload' to re-upload.\n") + + +def cmd_status(conn): + """Show current state of R2 and database.""" + cur = conn.cursor() + + # DB counts + cur.execute('SELECT COUNT(*) FROM "Manga"') + manga_count = cur.fetchone()[0] + cur.execute('SELECT COUNT(*) FROM "Chapter"') + chapter_count = cur.fetchone()[0] + cur.execute('SELECT COUNT(*) FROM "Page"') + page_count = cur.fetchone()[0] + + print(f"\n Database: {manga_count} manga, {chapter_count} chapters, {page_count} pages") + + # List manga in DB + cur.execute('SELECT slug, title, (SELECT COUNT(*) FROM "Chapter" WHERE "mangaId" = "Manga".id) FROM "Manga" ORDER BY slug') + for slug, title, ch_count in cur.fetchall(): + print(f" {slug}: {title} ({ch_count} chapters)") + + # R2 count + total = 0 + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=BUCKET): + total += len(page.get("Contents", [])) + print(f" R2: {total} objects") + + # Local + local = list_local_manga() + print(f" Local: {len(local)} manga in manga-content/") + for name in local: + manga_path = CONTENT_DIR / name + chapters = [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")] + has_cover = (manga_path / "cover.jpg").exists() + print(f" {name}: {len(chapters)} chapters, cover: {'yes' if has_cover else 'no'}") + print() + + +def cmd_upload(conn, manga_name=None): + """Upload manga to R2 and create DB records.""" + if manga_name: + manga_dirs = [manga_name] + if not (CONTENT_DIR / manga_name).is_dir(): + print(f" Not found: {CONTENT_DIR / manga_name}") + return + else: + manga_dirs = list_local_manga() + + if not manga_dirs: + print(" No manga found in manga-content/") + return + + print(f"\n Uploading {len(manga_dirs)} manga(s)...") + + for name in manga_dirs: + upload_manga(name, conn) + + print("\nUpload complete!") + + +def upload_manga(manga_name, conn): + manga_path = CONTENT_DIR / manga_name + detail_path = manga_path / "detail.json" + + if not detail_path.exists(): + print(f" Skipping 
{manga_name}: no detail.json") + return + + detail = json.loads(detail_path.read_text(encoding="utf-8")) + title = detail.get("mg-title", manga_name) + slug = manga_name + genres = detail.get("mg-genres", []) + description = detail.get("mg-description", "") + if not description and genres: + description = f"Genres: {', '.join(genres)}" + genre = genres[0] if genres else "Drama" + + print(f"\n {'='*50}") + print(f" {title} ({slug})") + print(f" {'='*50}") + + cur = conn.cursor() + + # Cover + cover_file = manga_path / "cover.jpg" + cover_url = "" + cover_key = f"manga/{slug}/cover.webp" + + if cover_file.exists(): + if not r2_key_exists(cover_key): + cover_data = make_cover(cover_file) + cover_url = upload_to_r2(cover_key, cover_data) + print(f" Cover uploaded") + else: + cover_url = f"{PUBLIC_URL}/{cover_key}" + print(f" Cover exists") + else: + print(" No cover.jpg") + + # Manga record + cur.execute('SELECT id, "coverUrl" FROM "Manga" WHERE slug = %s', (slug,)) + row = cur.fetchone() + + if row: + manga_id, existing_cover = row + print(f" Manga exists (id: {manga_id})") + if cover_url and cover_url != existing_cover: + cur.execute( + 'UPDATE "Manga" SET "coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s', + (cover_url, manga_id), + ) + conn.commit() + else: + cur.execute( + """ + INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt") + VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW()) + RETURNING id + """, + (title, description, cover_url, slug, genre), + ) + manga_id = cur.fetchone()[0] + conn.commit() + print(f" Created manga (id: {manga_id})") + + # Chapters + chapter_dirs = sorted( + [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")], + key=lambda d: parse_chapter_dir(d.name)[0], + ) + + for chapter_dir in chapter_dirs: + order_num, chapter_title = parse_chapter_dir(chapter_dir.name) + if order_num == 0: + continue + + cur.execute( + 'SELECT id FROM "Chapter" WHERE "mangaId" = %s AND 
number = %s', + (manga_id, order_num), + ) + if cur.fetchone(): + print(f" [{order_num}] {chapter_title} — skip") + continue + + page_files = sorted( + [f for f in chapter_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png", ".webp")], + key=lambda f: int(re.search(r"(\d+)", f.stem).group(1)) if re.search(r"(\d+)", f.stem) else 0, + ) + + if not page_files: + continue + + print(f" [{order_num}] {chapter_title} ({len(page_files)} pages)") + + cur.execute( + 'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id', + (manga_id, order_num, chapter_title), + ) + chapter_id = cur.fetchone()[0] + conn.commit() + + # Parallel convert + upload + def process_page(args): + j, page_file = args + r2_key = f"manga/{slug}/chapters/{order_num}/{j}.webp" + if not r2_key_exists(r2_key): + webp_data = convert_to_webp(page_file) + return j, upload_to_r2(r2_key, webp_data) + return j, f"{PUBLIC_URL}/{r2_key}" + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)} + for future in as_completed(futures): + j, url = future.result() + page_urls[j] = url + done += 1 + print(f" {done}/{len(page_files)}", end="\r") + + # Batch insert page records in order + for j in sorted(page_urls): + cur.execute( + 'INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', + (chapter_id, j, page_urls[j]), + ) + + conn.commit() + print(f" {len(page_files)} pages uploaded" + " " * 10) + + +# ── Interactive loop ────────────────────────────────────── + + +def show_menu(): + print() + print("=" * 40) + print(" Manga Uploader") + print("=" * 40) + print(" 1. Status") + print(" 2. Upload all manga") + print(" 3. Upload specific manga") + print(" 4. Reset R2 storage") + print(" 0. 
Quit") + print() + + +def main(): + conn = get_db() + try: + while True: + show_menu() + try: + choice = input("Select [0-4]: ").strip() + except (EOFError, KeyboardInterrupt): + print() + break + + if choice == "0": + break + elif choice == "1": + cmd_status(conn) + elif choice == "2": + cmd_upload(conn) + elif choice == "3": + local = list_local_manga() + if not local: + print(" No manga in manga-content/") + continue + print() + for i, name in enumerate(local, 1): + print(f" {i}. {name}") + print() + pick = input("Select manga number: ").strip() + try: + idx = int(pick) - 1 + if 0 <= idx < len(local): + cmd_upload(conn, local[idx]) + else: + print(" Invalid selection") + except ValueError: + print(" Invalid input") + elif choice == "4": + confirm = input(" Delete ALL R2 objects? [y/N] ").strip().lower() + if confirm == "y": + cmd_reset() + else: + print(" Cancelled.") + else: + print(" Invalid choice") + finally: + conn.close() + + print("Bye!") + + +if __name__ == "__main__": + main()