Add chapter-level DB tools, dim sync, retries, and all-genres
- Check missing pages: compare site page count vs R2, detect NULL/0 width/height in DB and fix by reading WebP bytes from R2 (no re-upload needed when dims are just missing). - Delete specific chapter(s): multi-select or all, removes from R2+DB. - Chapter pickers now offer "All chapters" as first option. - Save all genres comma-separated in Manga.genre (was only the first). - Sync refreshes title/description/genre for existing manga records. - Page inserts now save width and height from PIL (schema update). - Retry failed page fetches up to 3 attempts with 2s backoff instead of skipping after one failure. - Cover detection polls DOM up to 8s and tries broader selectors. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e037996c5c
commit
723b82c9fc
473
manga.py
473
manga.py
@ -205,6 +205,23 @@ def with_browser(func):
|
||||
# ── Cloudflare ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def _wait_for_cf_on_page(page, timeout=120):
|
||||
"""Wait for CF to resolve on a specific page."""
|
||||
for i in range(timeout):
|
||||
try:
|
||||
title = page.title()
|
||||
except Exception:
|
||||
time.sleep(1)
|
||||
continue
|
||||
if "Just a moment" in title or "challenge" in page.url:
|
||||
time.sleep(1)
|
||||
continue
|
||||
if title and ("嗨皮漫画" in title or "happymh" in page.url):
|
||||
return True
|
||||
time.sleep(1)
|
||||
return False
|
||||
|
||||
|
||||
def wait_for_cloudflare(session, timeout=120):
|
||||
"""Wait for CF to resolve. User solves in the visible browser window."""
|
||||
page = session.page
|
||||
@ -406,7 +423,7 @@ def fetch_metadata(page):
|
||||
# ── Happymh: image download ───────────────────────────────
|
||||
|
||||
|
||||
def _try_get_chapter_images(session, slug, chapter_id):
|
||||
def _try_get_chapter_images(page, slug, chapter_id):
|
||||
"""Single attempt to get chapter images. Returns (images, api_status)."""
|
||||
captured_images = []
|
||||
api_info = {"found": False, "status": None, "error": None}
|
||||
@ -443,10 +460,8 @@ def _try_get_chapter_images(session, slug, chapter_id):
|
||||
except Exception as e:
|
||||
api_info["error"] = str(e)
|
||||
|
||||
page = session.page
|
||||
page.on("response", on_response)
|
||||
reader_url = f"{BASE_URL}/mangaread/{slug}/{chapter_id}"
|
||||
print(" Loading reader...")
|
||||
try:
|
||||
page.evaluate(f"window.location.href = '{reader_url}'")
|
||||
except Exception:
|
||||
@ -459,17 +474,13 @@ def _try_get_chapter_images(session, slug, chapter_id):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
print(" Waiting for page...")
|
||||
if not wait_for_cloudflare(session, timeout=90):
|
||||
page = session.page
|
||||
if not _wait_for_cf_on_page(page, timeout=90):
|
||||
try:
|
||||
page.remove_listener("response", on_response)
|
||||
except Exception:
|
||||
pass
|
||||
return [], api_info
|
||||
|
||||
page = session.page
|
||||
print(" Waiting for API...")
|
||||
deadline = time.time() + 20
|
||||
while time.time() < deadline:
|
||||
if captured_images:
|
||||
@ -539,28 +550,39 @@ def _try_get_chapter_images(session, slug, chapter_id):
|
||||
return captured_images, api_info
|
||||
|
||||
|
||||
def get_chapter_images(session, slug, chapter_id):
|
||||
"""Get chapter images. On API 403 (CF expired), navigate to solve and retry."""
|
||||
images, api_info = _try_get_chapter_images(session, slug, chapter_id)
|
||||
if images:
|
||||
return images
|
||||
|
||||
if api_info.get("status") == 403:
|
||||
print(" CF expired — solve in browser...")
|
||||
page = session.page
|
||||
try:
|
||||
page.goto(f"{BASE_URL}/mangaread/{slug}/{chapter_id}", wait_until="commit", timeout=60000)
|
||||
except Exception:
|
||||
pass
|
||||
if wait_for_cloudflare(session, timeout=120):
|
||||
images, _ = _try_get_chapter_images(session, slug, chapter_id)
|
||||
|
||||
return images
|
||||
def get_chapter_images(page, slug, chapter_id):
    """Return ``(images, api_info)`` for a chapter fetched via *page*.

    A 403 from the API leaves *images* empty; the caller is responsible for
    re-solving the Cloudflare challenge and retrying.
    """
    return _try_get_chapter_images(page, slug, chapter_id)
|
||||
|
||||
|
||||
def fetch_image_bytes(session, img):
|
||||
"""Fetch image via browser network stack, return raw bytes or None."""
|
||||
page = session.page
|
||||
def fetch_all_pages(page, images, max_attempts=3):
    """Fetch every page image with retries, via the given browser page.

    Args:
        page: browser page used as the network stack for fetching.
        images: list of image descriptors (as returned by get_chapter_images).
        max_attempts: number of passes to make over still-missing pages.

    Returns:
        {page_num: raw_bytes} for every page that was fetched. Pages that
        failed all attempts are simply absent from the dict, so callers can
        compare ``len(result)`` against ``len(images)``.
    """
    page_bytes = {}
    # Pages still to fetch: 1-based page number paired with its descriptor.
    pending = list(enumerate(images, 1))

    for attempt in range(1, max_attempts + 1):
        if not pending:
            break
        if attempt > 1:
            time.sleep(2)  # back off before retrying previously-failed pages

        next_pending = []
        for pn, img in pending:
            body = fetch_image_bytes(page, img)
            if body:
                page_bytes[pn] = body
            else:
                next_pending.append((pn, img))
            time.sleep(0.1)  # gentle pacing between requests
        pending = next_pending

    return page_bytes
|
||||
|
||||
|
||||
def fetch_image_bytes(page, img):
|
||||
"""Fetch image via browser network stack using given page."""
|
||||
url = img["url"]
|
||||
ref_policy = "no-referrer" if img.get("no_referrer") else "origin"
|
||||
try:
|
||||
@ -571,18 +593,16 @@ def fetch_image_bytes(session, img):
|
||||
body = response.body()
|
||||
if body and len(body) > 100:
|
||||
return body
|
||||
except Exception as e:
|
||||
if not hasattr(fetch_image_bytes, "_err_logged"):
|
||||
fetch_image_bytes._err_logged = True
|
||||
print(f"\n First error: {e}")
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def download_image(session, img, save_path):
|
||||
def download_image(page, img, save_path):
|
||||
"""Fetch image and save to disk."""
|
||||
if save_path.exists():
|
||||
return True
|
||||
body = fetch_image_bytes(session, img)
|
||||
body = fetch_image_bytes(page, img)
|
||||
if body:
|
||||
save_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
save_path.write_bytes(body)
|
||||
@ -606,6 +626,22 @@ def convert_to_webp(source, quality=WEBP_QUALITY):
|
||||
return _to_webp_bytes(Image.open(source), quality)
|
||||
|
||||
|
||||
def probe_and_webp(source, quality=WEBP_QUALITY):
    """Decode *source* a single time and return ``(width, height, webp_bytes)``."""
    img = Image.open(source)
    try:
        return img.width, img.height, _to_webp_bytes(img, quality)
    finally:
        img.close()
|
||||
|
||||
|
||||
def insert_pages(cur, chapter_id, page_urls):
    """Insert Page rows for a chapter, ascending by page number.

    page_urls maps page_num -> (url, width, height).
    """
    sql = 'INSERT INTO "Page" ("chapterId", number, "imageUrl", width, height) VALUES (%s, %s, %s, %s, %s)'
    for pn, (url, width, height) in sorted(page_urls.items()):
        cur.execute(sql, (chapter_id, pn, url, width, height))
|
||||
|
||||
|
||||
def make_cover(source, width=400, height=560):
|
||||
img = Image.open(source)
|
||||
target_ratio = width / height
|
||||
@ -789,7 +825,7 @@ def download_chapter(session, slug, chapter_index, chapter, manga_dir):
|
||||
folder_name = f"{chapter_index} {ch_name}"
|
||||
chapter_dir = manga_dir / folder_name
|
||||
|
||||
images = get_chapter_images(session, slug, ch_id)
|
||||
images, _ = get_chapter_images(session.page, slug, ch_id)
|
||||
if not images:
|
||||
print(f" No images")
|
||||
return False
|
||||
@ -797,30 +833,16 @@ def download_chapter(session, slug, chapter_index, chapter, manga_dir):
|
||||
print(f" {len(images)} pages")
|
||||
chapter_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
page_bytes = fetch_all_pages(session.page, images)
|
||||
ok = 0
|
||||
failed = []
|
||||
for pn, img in enumerate(images, 1):
|
||||
for pn, body in page_bytes.items():
|
||||
save_path = chapter_dir / f"{pn}.jpg"
|
||||
if download_image(session, img, save_path):
|
||||
save_path.write_bytes(body)
|
||||
ok += 1
|
||||
print(f" {pn}/{len(images)}", end="\r")
|
||||
else:
|
||||
failed.append((pn, img))
|
||||
time.sleep(0.1)
|
||||
|
||||
if failed:
|
||||
time.sleep(1)
|
||||
for pn, img in failed:
|
||||
save_path = chapter_dir / f"{pn}.jpg"
|
||||
if download_image(session, img, save_path):
|
||||
ok += 1
|
||||
else:
|
||||
print(f" {pn}/{len(images)} FAIL")
|
||||
time.sleep(0.3)
|
||||
|
||||
print(f" {ok}/{len(images)} downloaded" + " " * 20)
|
||||
|
||||
if ok == 0:
|
||||
if ok < len(images):
|
||||
try:
|
||||
chapter_dir.rmdir()
|
||||
except Exception:
|
||||
@ -910,17 +932,19 @@ def upload_manga_to_r2(manga_name, conn):
|
||||
def process_page(args, _slug=slug, _order=order_num):
|
||||
j, pf = args
|
||||
r2_key = f"manga/{_slug}/chapters/{_order}/{j}.webp"
|
||||
if not r2_key_exists(r2_key):
|
||||
return j, upload_to_r2(r2_key, convert_to_webp(pf))
|
||||
return j, f"{PUBLIC_URL}/{r2_key}"
|
||||
if r2_key_exists(r2_key):
|
||||
with Image.open(pf) as img:
|
||||
return j, f"{PUBLIC_URL}/{r2_key}", img.width, img.height
|
||||
w, h, webp = probe_and_webp(pf)
|
||||
return j, upload_to_r2(r2_key, webp), w, h
|
||||
|
||||
page_urls = {}
|
||||
done = 0
|
||||
with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
|
||||
futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)}
|
||||
for future in as_completed(futures):
|
||||
j, url = future.result()
|
||||
page_urls[j] = url
|
||||
j, url, w, h = future.result()
|
||||
page_urls[j] = (url, w, h)
|
||||
done += 1
|
||||
print(f" {done}/{len(page_files)}", end="\r")
|
||||
|
||||
@ -934,8 +958,7 @@ def upload_manga_to_r2(manga_name, conn):
|
||||
(manga_id, order_num, chapter_title),
|
||||
)
|
||||
chapter_id = cur.fetchone()[0]
|
||||
for j in sorted(page_urls):
|
||||
cur.execute('INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', (chapter_id, j, page_urls[j]))
|
||||
insert_pages(cur, chapter_id, page_urls)
|
||||
conn.commit()
|
||||
print(f" {len(page_files)} pages uploaded" + " " * 10)
|
||||
|
||||
@ -1127,80 +1150,71 @@ def cmd_sync(manga_url=None):
|
||||
cur.execute('SELECT number FROM "Chapter" WHERE "mangaId" = %s', (manga_id,))
|
||||
existing_numbers = {row[0] for row in cur.fetchall()}
|
||||
|
||||
new_count = 0
|
||||
for i, ch in enumerate(chapters, 1):
|
||||
# 3. Collect chapters to sync
|
||||
todo = [(i, ch) for i, ch in enumerate(chapters, 1) if i not in existing_numbers]
|
||||
|
||||
if not todo:
|
||||
print(" Already up to date!")
|
||||
continue
|
||||
|
||||
print(f" {len(todo)} new chapters to sync")
|
||||
|
||||
completed = 0
|
||||
skipped = 0
|
||||
for i, ch in todo:
|
||||
if esc.stop.is_set():
|
||||
break
|
||||
ch_name = ch["chapterName"]
|
||||
if i in existing_numbers:
|
||||
continue
|
||||
|
||||
new_count += 1
|
||||
print(f" [{i}/{len(chapters)}] {ch_name} (id={ch['id']})")
|
||||
|
||||
# Get image URLs from reader page
|
||||
images = get_chapter_images(session, slug, ch["id"])
|
||||
images, api_info = get_chapter_images(session.page, slug, ch["id"])
|
||||
if not images and api_info.get("status") == 403:
|
||||
print(f" CF blocked — run Setup and try again")
|
||||
esc.stop.set()
|
||||
break
|
||||
if not images:
|
||||
print(f" No images")
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
print(f" {len(images)} pages")
|
||||
|
||||
# Fetch each image into RAM, convert to WebP, upload to R2
|
||||
page_bytes = {} # page_num -> raw bytes
|
||||
ok = 0
|
||||
for pn, img in enumerate(images, 1):
|
||||
body = fetch_image_bytes(session, img)
|
||||
if body:
|
||||
page_bytes[pn] = body
|
||||
ok += 1
|
||||
print(f" Fetched {pn}/{len(images)}", end="\r")
|
||||
else:
|
||||
print(f" {pn}/{len(images)} FAIL")
|
||||
time.sleep(0.1)
|
||||
|
||||
if not page_bytes:
|
||||
print(f" No images fetched, skip")
|
||||
page_bytes = fetch_all_pages(session.page, images)
|
||||
if len(page_bytes) < len(images):
|
||||
missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes]
|
||||
print(f" Could not fetch pages: {missing}, skipping chapter")
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
# Upload to R2 first
|
||||
def upload_page(args, _slug=slug, _i=i):
|
||||
def upload_one(args, _slug=slug, _i=i):
|
||||
pn, raw = args
|
||||
r2_key = f"manga/{_slug}/chapters/{_i}/{pn}.webp"
|
||||
webp = convert_to_webp(io.BytesIO(raw))
|
||||
return pn, upload_to_r2(r2_key, webp)
|
||||
w, h, webp = probe_and_webp(io.BytesIO(raw))
|
||||
return pn, upload_to_r2(r2_key, webp), w, h
|
||||
|
||||
page_urls = {}
|
||||
done = 0
|
||||
with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
|
||||
futures = {pool.submit(upload_page, (pn, raw)): pn for pn, raw in page_bytes.items()}
|
||||
for future in as_completed(futures):
|
||||
pn, r2_url = future.result()
|
||||
page_urls[pn] = r2_url
|
||||
for pn, r2_url, w, h in pool.map(upload_one, page_bytes.items()):
|
||||
page_urls[pn] = (r2_url, w, h)
|
||||
done += 1
|
||||
print(f" R2: {done}/{len(page_bytes)}", end="\r")
|
||||
|
||||
if not page_urls:
|
||||
print(f" R2 upload failed, skip")
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
# Only create DB records after R2 upload succeeds
|
||||
cur.execute(
|
||||
'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id',
|
||||
(manga_id, i, ch_name),
|
||||
)
|
||||
chapter_id = cur.fetchone()[0]
|
||||
for pn in sorted(page_urls):
|
||||
cur.execute('INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', (chapter_id, pn, page_urls[pn]))
|
||||
insert_pages(cur, chapter_id, page_urls)
|
||||
conn.commit()
|
||||
completed += 1
|
||||
print(f" {len(page_urls)} pages synced" + " " * 20)
|
||||
|
||||
time.sleep(REQUEST_DELAY)
|
||||
|
||||
if new_count == 0:
|
||||
print(" Already up to date!")
|
||||
else:
|
||||
print(f" Synced {new_count} new chapters")
|
||||
print(f" Synced {completed}/{len(todo)} chapters ({skipped} skipped)")
|
||||
|
||||
try:
|
||||
with_browser(run)
|
||||
@ -1584,12 +1598,261 @@ def tui_edit_manga():
|
||||
conn.close()
|
||||
|
||||
|
||||
def _pick_manga_and_chapters(conn, prompt="Select chapters", multi=True):
    """Helper: pick manga from DB, then pick chapter(s).

    Args:
        conn: open DB connection.
        prompt: title shown on the chapter picker.
        multi: when True, offers "All chapters" plus multi-select;
            when False, a single-chapter picker.

    Returns:
        (slug, [(ch_id, ch_num, ch_title), ...]) or None if cancelled /
        nothing to pick.
    """
    cur = conn.cursor()
    cur.execute('SELECT id, slug, title FROM "Manga" ORDER BY title')
    mangas = cur.fetchall()
    if not mangas:
        print(" No manga in DB")
        return None

    items = [f"{i+1}. {title} ({slug})" for i, (_, slug, title) in enumerate(mangas)]
    sel = tui_select("Select manga (/ to search):", items, search=True)
    if sel < 0:
        return None
    manga_id, slug, _ = mangas[sel]

    cur.execute('SELECT id, number, title FROM "Chapter" WHERE "mangaId" = %s ORDER BY number', (manga_id,))
    chapters = cur.fetchall()
    if not chapters:
        print(" No chapters in DB for this manga")
        return None

    if multi:
        scope = tui_select(f"{prompt}: {len(chapters)} chapters", [
            "All chapters",
            "Select specific chapters",
        ])
        if scope == -1:
            return None
        if scope == 0:
            return slug, list(chapters)

        items = [f"{num}. {title}" for _, num, title in chapters]
        menu = TerminalMenu(
            items,
            title="Space=toggle, Enter=confirm, /=search:",
            multi_select=True,
            show_multi_select_hint=True,
            search_key="/",
            show_search_hint=True,
        )
        selected = menu.show()
        # show() returns None on cancel; check identity first so that an
        # integer result of 0 (first item) is not mistaken for a cancel.
        if selected is None:
            return None
        if isinstance(selected, int):
            selected = (selected,)
        if not selected:
            return None
        picked = [chapters[i] for i in selected]
    else:
        items = [f"{num}. {title}" for _, num, title in chapters]
        sel = tui_select(f"{prompt} (/ to search):", items, search=True)
        if sel < 0:
            return None
        picked = [chapters[sel]]

    return slug, picked
|
||||
|
||||
|
||||
def tui_delete_chapter():
    """Interactively remove chapter(s) from both R2 storage and the DB."""
    try:
        conn = get_db()
    except Exception as e:
        print(f" DB error: {e}")
        return
    try:
        picked = _pick_manga_and_chapters(conn, "Select chapters to delete")
        if not picked:
            return
        slug, chapters = picked

        answer = input(f" Delete {len(chapters)} chapter(s) from R2 + DB? [y/N] ")
        if answer.strip().lower() != "y":
            print(" Cancelled.")
            return

        cur = conn.cursor()
        for chapter_id, number, title in chapters:
            print(f" Deleting [{number}] {title}...")
            # Remove R2 objects first, then the DB rows (pages before chapter
            # to satisfy the foreign key).
            r2_delete_prefix(f"manga/{slug}/chapters/{number}/")
            cur.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (chapter_id,))
            cur.execute('DELETE FROM "Chapter" WHERE id = %s', (chapter_id,))
        conn.commit()
        print(" Done.")
    finally:
        conn.close()
|
||||
|
||||
|
||||
def tui_check_missing_pages():
    """Check selected chapters against the site's actual page count and re-upload if mismatched.

    Three outcomes per chapter:
      * site page count != R2 object count  -> queued for full re-upload;
      * counts match but DB width/height missing/zero -> dims fixed by
        reading the existing WebP bytes back from R2 (no re-upload);
      * counts match and dims valid -> reported OK.
    """
    try:
        conn = get_db()
    except Exception as e:
        print(f" DB error: {e}")
        return

    try:
        result = _pick_manga_and_chapters(conn, "Select chapters to check")
        if not result:
            return
        slug, selected_chapters = result

        # Re-fetching pages requires the manga to still be tracked in manga.json.
        if slug not in [slug_from_url(u) for u in load_manga_urls()]:
            print(f" {slug} not in manga.json — cannot re-fetch pages")
            return
    except Exception:
        conn.close()
        raise

    # Load reader pages and compare site's actual page count vs R2
    def run(session):
        # Esc listener lets the user abort between chapters.
        with EscListener() as esc:
            result = load_manga_page(session, slug)
            if not result:
                return
            chapters, _, _ = result
            if not chapters:
                return

            cur2 = conn.cursor()
            to_reupload = []   # (ch_id, ch_num, ch_title, ch, images) needing full re-upload
            to_fix_dims = []   # (ch_id, ch_num, ch_title) needing width/height backfill only

            print(f"\n Checking {len(selected_chapters)} chapters...")
            for ch_id, ch_num, ch_title in selected_chapters:
                if esc.stop.is_set():
                    break
                # DB chapter numbers index 1-based into the site's chapter list.
                if ch_num > len(chapters):
                    print(f" [{ch_num}] {ch_title}: out of range on site")
                    continue

                ch = chapters[ch_num - 1]
                images, api_info = get_chapter_images(session.page, slug, ch["id"])
                if not images:
                    if api_info.get("status") == 403:
                        # Cloudflare expired: abort the whole run, nothing else will succeed.
                        print(f" [{ch_num}] CF blocked — run Setup")
                        esc.stop.set()
                        break
                    print(f" [{ch_num}] {ch_title}: no images from site")
                    continue

                site_count = len(images)
                r2_count = r2_count_by_prefix(f"manga/{slug}/chapters/{ch_num}/")

                if site_count != r2_count:
                    print(f" [{ch_num}] {ch_title}: site={site_count}, R2={r2_count} — re-upload")
                    to_reupload.append((ch_id, ch_num, ch_title, ch, images))
                    continue

                # Count matches — check if DB has valid width/height for all pages
                cur2.execute(
                    'SELECT COUNT(*), '
                    'COUNT(*) FILTER (WHERE width IS NULL OR width <= 0), '
                    'COUNT(*) FILTER (WHERE height IS NULL OR height <= 0), '
                    'MIN(width), MAX(width), MIN(height), MAX(height) '
                    'FROM "Page" WHERE "chapterId" = %s',
                    (ch_id,),
                )
                db_count, bad_w, bad_h, min_w, max_w, min_h, max_h = cur2.fetchone()
                bad_count = max(bad_w, bad_h)
                if bad_count > 0:
                    print(f" [{ch_num}] {ch_title}: {bad_count} pages need dims (w {min_w}-{max_w}, h {min_h}-{max_h}) — fix from R2")
                    to_fix_dims.append((ch_id, ch_num, ch_title))
                else:
                    print(f" [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})")

            # Fix dimensions by reading existing R2 objects (no re-upload)
            if to_fix_dims:
                print(f"\n Fixing dimensions for {len(to_fix_dims)} chapter(s)...")
                for ch_id, ch_num, ch_title in to_fix_dims:
                    if esc.stop.is_set():
                        break
                    # Only pages whose dims are missing/zero need to be read back.
                    cur2.execute(
                        'SELECT id, number, "imageUrl" FROM "Page" WHERE "chapterId" = %s '
                        'AND (width IS NULL OR width = 0 OR height IS NULL OR height = 0) '
                        'ORDER BY number',
                        (ch_id,),
                    )
                    pages = cur2.fetchall()

                    # Defaults bind slug/ch_num at definition time (loop-variable capture).
                    def read_dims(args, _slug=slug, _n=ch_num):
                        page_id, pn, _url = args
                        r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp"
                        try:
                            data = s3.get_object(Bucket=BUCKET, Key=r2_key)["Body"].read()
                            with Image.open(io.BytesIO(data)) as img:
                                return page_id, img.width, img.height
                        except Exception:
                            # Missing/corrupt object: leave this page's dims untouched.
                            return page_id, None, None

                    updated = 0
                    with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
                        for page_id, w, h in pool.map(read_dims, pages):
                            if w and h:
                                cur2.execute(
                                    'UPDATE "Page" SET width = %s, height = %s WHERE id = %s',
                                    (w, h, page_id),
                                )
                                updated += 1
                    conn.commit()
                    print(f" [{ch_num}] {ch_title}: {updated}/{len(pages)} dims updated")

            if not to_reupload:
                if not to_fix_dims:
                    print("\n All selected chapters are complete.")
                return

            print(f"\n {len(to_reupload)} chapter(s) need re-upload")

            for ch_id, ch_num, ch_title, ch, images in to_reupload:
                if esc.stop.is_set():
                    break
                print(f"\n Re-uploading [{ch_num}] {ch_title}")
                page_bytes = fetch_all_pages(session.page, images)
                # Require a complete fetch before touching R2/DB for this chapter.
                if len(page_bytes) < len(images):
                    missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes]
                    print(f" Could not fetch pages: {missing}, skipping")
                    continue

                # Upload to R2 (overwrites existing)
                def upload_page(args, _slug=slug, _n=ch_num):
                    pn, raw = args
                    r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp"
                    w, h, webp = probe_and_webp(io.BytesIO(raw))
                    return pn, upload_to_r2(r2_key, webp), w, h

                page_urls = {}
                done = 0
                with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
                    futures = {pool.submit(upload_page, (pn, raw)): pn for pn, raw in page_bytes.items()}
                    for future in as_completed(futures):
                        pn, r2_url, w, h = future.result()
                        page_urls[pn] = (r2_url, w, h)
                        done += 1
                        print(f" R2: {done}/{len(page_bytes)}", end="\r")

                # Replace Page records
                cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,))
                insert_pages(cur2, ch_id, page_urls)
                conn.commit()
                print(f" {len(page_urls)} pages restored" + " " * 20)

    try:
        with_browser(run)
    finally:
        conn.close()

    print("\nCheck complete!")
|
||||
|
||||
|
||||
def tui_r2_manage():
|
||||
while True:
|
||||
idx = tui_select("R2 / DB Management", [
|
||||
"Status",
|
||||
"Edit manga info",
|
||||
"Delete specific manga",
|
||||
"Delete specific chapter",
|
||||
"Check missing pages",
|
||||
"Clear ALL (R2 + DB)",
|
||||
"Recompress manga (quality 65)",
|
||||
])
|
||||
@ -1651,6 +1914,12 @@ def tui_r2_manage():
|
||||
print(f" DB error: {e}")
|
||||
|
||||
elif idx == 3:
|
||||
tui_delete_chapter()
|
||||
|
||||
elif idx == 4:
|
||||
tui_check_missing_pages()
|
||||
|
||||
elif idx == 5:
|
||||
confirm = input(" Delete ALL R2 + DB? [y/N] ").strip().lower()
|
||||
if confirm == "y":
|
||||
r2_delete_prefix("")
|
||||
@ -1665,7 +1934,7 @@ def tui_r2_manage():
|
||||
except Exception as e:
|
||||
print(f" DB error: {e}")
|
||||
|
||||
elif idx == 4:
|
||||
elif idx == 6:
|
||||
slugs = r2_list_prefixes()
|
||||
if not slugs:
|
||||
print(" R2 is empty")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user