From 051b2e191f8ca85e9c4f72ed5c74f97a3f73552d Mon Sep 17 00:00:00 2001
From: yiekheng <yiekheng@04080616.xyz>
Date: Sun, 12 Apr 2026 13:12:55 +0800
Subject: [PATCH] Re-upload inline during check, add URL fallback for flaky CDN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Check missing pages: re-upload happens immediately when a mismatch is
  detected, while the browser is still on that chapter's reader page.
  This eliminates the two-phase approach, which required re-navigating
  and could hit the wrong referrer.
- fetch_image_bytes falls back to the URL with its query string
  stripped (e.g. dropping ?q=50) when the original URL fails. This
  works around CDN variants that may not have the quality-scaled
  version.
- Log the specific failing URL on fetch failure for diagnostics.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 manga.py | 129 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 67 insertions(+), 62 deletions(-)

diff --git a/manga.py b/manga.py
index c483dc7..b07711c 100644
--- a/manga.py
+++ b/manga.py
@@ -581,12 +581,9 @@ def fetch_all_pages(page, images, max_attempts=3):
     return page_bytes
 
 
-def fetch_image_bytes(page, img):
-    """Fetch image via browser network stack using given page."""
-    url = img["url"]
-    ref_policy = "no-referrer" if img.get("no_referrer") else "origin"
+def _fetch_via_page(page, url, ref_policy):
     try:
-        with page.expect_response(lambda r: url in r.url, timeout=15000) as resp_info:
+        with page.expect_response(lambda r: url.split("?")[0] in r.url, timeout=15000) as resp_info:
             page.evaluate("([u, r]) => fetch(u, { referrerPolicy: r })", [url, ref_policy])
         response = resp_info.value
         if response.status == 200:
@@ -598,6 +595,25 @@ def fetch_image_bytes(page, img):
     return None
 
 
+def fetch_image_bytes(page, img):
+    """Fetch image via browser network stack using given page. Tries URL variants on failure."""
+    url = img["url"]
+    ref_policy = "no-referrer" if img.get("no_referrer") else "origin"
+
+    # Try original URL
+    body = _fetch_via_page(page, url, ref_policy)
+    if body:
+        return body
+
+    # Fallback: strip query string (e.g., ?q=50)
+    if "?" in url:
+        body = _fetch_via_page(page, url.split("?")[0], ref_policy)
+        if body:
+            return body
+
+    return None
+
+
 def download_image(page, img, save_path):
     """Fetch image and save to disk."""
     if save_path.exists():
@@ -1715,8 +1731,8 @@ def tui_check_missing_pages():
                 return
 
             cur2 = conn.cursor()
-            to_reupload = []
-            to_fix_dims = []
+            fixed_dims = 0
+            reuploaded = 0
 
             print(f"\n  Checking {len(selected_chapters)} chapters...")
             for ch_id, ch_num, ch_title in selected_chapters:
@@ -1740,8 +1756,42 @@ def tui_check_missing_pages():
                 r2_count = r2_count_by_prefix(f"manga/{slug}/chapters/{ch_num}/")
 
                 if site_count != r2_count:
-                    print(f"  [{ch_num}] {ch_title}: site={site_count}, R2={r2_count} — re-upload")
-                    to_reupload.append((ch_id, ch_num, ch_title, ch, images))
+                    print(f"  [{ch_num}] {ch_title}: site={site_count}, R2={r2_count} — re-uploading...")
+                    # Re-upload IMMEDIATELY while browser is on this chapter's reader page
+                    page_bytes = fetch_all_pages(session.page, images)
+                    if len(page_bytes) < len(images):
+                        missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes]
+                        print(f"    Could not fetch pages: {missing}")
+                        for mn in missing:
+                            print(f"      page {mn}: {images[mn-1]['url']}")
+                        print(f"    Skipping chapter")
+                        continue
+
+                    def upload_page(args, _slug=slug, _n=ch_num):
+                        pn, raw = args
+                        r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp"
+                        with Image.open(io.BytesIO(raw)) as img:
+                            w, h = img.width, img.height
+                        return pn, upload_to_r2(r2_key, convert_to_webp(io.BytesIO(raw))), w, h
+
+                    page_urls = {}
+                    done = 0
+                    with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
+                        for pn, r2_url, w, h in pool.map(upload_page, page_bytes.items()):
+                            page_urls[pn] = (r2_url, w, h)
+                            done += 1
+                            print(f"    R2: {done}/{len(page_bytes)}", end="\r")
+
+                    cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,))
+                    for pn in sorted(page_urls):
+                        url, w, h = page_urls[pn]
+                        cur2.execute(
+                            'INSERT INTO "Page" ("chapterId", number, "imageUrl", width, height) VALUES (%s, %s, %s, %s, %s)',
+                            (ch_id, pn, url, w, h),
+                        )
+                    conn.commit()
+                    reuploaded += 1
+                    print(f"    {len(page_urls)} pages restored" + " " * 20)
                     continue
 
                 # Count matches — check if DB has valid width/height for all pages
@@ -1756,19 +1806,9 @@ def tui_check_missing_pages():
                 db_count, bad_w, bad_h, min_w, max_w, min_h, max_h = cur2.fetchone()
                 bad_count = max(bad_w, bad_h)
                 if bad_count > 0:
-                    print(f"  [{ch_num}] {ch_title}: {bad_count} pages need dims (w {min_w}-{max_w}, h {min_h}-{max_h}) — fix from R2")
-                    to_fix_dims.append((ch_id, ch_num, ch_title))
-                else:
-                    print(f"  [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})")
-
-            # Fix dimensions by reading existing R2 objects (no re-upload)
-            if to_fix_dims:
-                print(f"\n  Fixing dimensions for {len(to_fix_dims)} chapter(s)...")
-                for ch_id, ch_num, ch_title in to_fix_dims:
-                    if esc.stop.is_set():
-                        break
+                    print(f"  [{ch_num}] {ch_title}: {bad_count} pages need dims — fixing from R2...")
                     cur2.execute(
-                        'SELECT id, number, "imageUrl" FROM "Page" WHERE "chapterId" = %s '
+                        'SELECT id, number FROM "Page" WHERE "chapterId" = %s '
                         'AND (width IS NULL OR width = 0 OR height IS NULL OR height = 0) '
                         'ORDER BY number',
                         (ch_id,),
@@ -1776,7 +1816,7 @@ def tui_check_missing_pages():
                     pages = cur2.fetchall()
 
                     def read_dims(args, _slug=slug, _n=ch_num):
-                        page_id, pn, _url = args
+                        page_id, pn = args
                         r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp"
                         try:
                             data = s3.get_object(Bucket=BUCKET, Key=r2_key)["Body"].read()
@@ -1795,47 +1835,12 @@ def tui_check_missing_pages():
                                 )
                                 updated += 1
                     conn.commit()
-                    print(f"  [{ch_num}] {ch_title}: {updated}/{len(pages)} dims updated")
+                    fixed_dims += 1
+                    print(f"    {updated}/{len(pages)} dims updated")
+                else:
+                    print(f"  [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})")
 
-            if not to_reupload:
-                if not to_fix_dims:
-                    print("\n  All selected chapters are complete.")
-                return
-
-            print(f"\n  {len(to_reupload)} chapter(s) need re-upload")
-
-            for ch_id, ch_num, ch_title, ch, images in to_reupload:
-                if esc.stop.is_set():
-                    break
-                print(f"\n  Re-uploading [{ch_num}] {ch_title}")
-                page_bytes = fetch_all_pages(session.page, images)
-                if len(page_bytes) < len(images):
-                    missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes]
-                    print(f"    Could not fetch pages: {missing}, skipping")
-                    continue
-
-                # Upload to R2 (overwrites existing)
-                def upload_page(args, _slug=slug, _n=ch_num):
-                    pn, raw = args
-                    r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp"
-                    w, h, webp = probe_and_webp(io.BytesIO(raw))
-                    return pn, upload_to_r2(r2_key, webp), w, h
-
-                page_urls = {}
-                done = 0
-                with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
-                    futures = {pool.submit(upload_page, (pn, raw)): pn for pn, raw in page_bytes.items()}
-                    for future in as_completed(futures):
-                        pn, r2_url, w, h = future.result()
-                        page_urls[pn] = (r2_url, w, h)
-                        done += 1
-                        print(f"    R2: {done}/{len(page_bytes)}", end="\r")
-
-                # Replace Page records
-                cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,))
-                insert_pages(cur2, ch_id, page_urls)
-                conn.commit()
-                print(f"    {len(page_urls)} pages restored" + " " * 20)
+            print(f"\n  Done: {reuploaded} re-uploaded, {fixed_dims} dim-fixed")
 
     try:
         with_browser(run)