From 051b2e191f8ca85e9c4f72ed5c74f97a3f73552d Mon Sep 17 00:00:00 2001 From: yiekheng Date: Sun, 12 Apr 2026 13:12:55 +0800 Subject: [PATCH] Re-upload inline during check, add URL fallback for flaky CDN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Check missing pages: re-upload happens immediately when mismatch is detected, while the browser is still on that chapter's reader page. Eliminates the two-phase approach that required re-navigating and could hit wrong referrer. - fetch_image_bytes falls back to stripped-query URL (e.g. drops ?q=50) when the original fails — works around CDN variants that may not have the quality-scaled version. - Log the specific failing URL on fetch failure for diagnostics. Co-Authored-By: Claude Opus 4.6 (1M context) --- manga.py | 129 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 67 insertions(+), 62 deletions(-) diff --git a/manga.py b/manga.py index c483dc7..b07711c 100644 --- a/manga.py +++ b/manga.py @@ -581,12 +581,9 @@ def fetch_all_pages(page, images, max_attempts=3): return page_bytes -def fetch_image_bytes(page, img): - """Fetch image via browser network stack using given page.""" - url = img["url"] - ref_policy = "no-referrer" if img.get("no_referrer") else "origin" +def _fetch_via_page(page, url, ref_policy): try: - with page.expect_response(lambda r: url in r.url, timeout=15000) as resp_info: + with page.expect_response(lambda r: url.split("?")[0] in r.url, timeout=15000) as resp_info: page.evaluate("([u, r]) => fetch(u, { referrerPolicy: r })", [url, ref_policy]) response = resp_info.value if response.status == 200: @@ -598,6 +595,25 @@ def fetch_image_bytes(page, img): return None +def fetch_image_bytes(page, img): + """Fetch image via browser network stack using given page. Tries URL variants on failure.""" + url = img["url"] + ref_policy = "no-referrer" if img.get("no_referrer") else "origin" + + # Try original URL + body = _fetch_via_page(page, url, ref_policy) + if body: + return body + + # Fallback: strip query string (e.g., ?q=50) + if "?" in url: + body = _fetch_via_page(page, url.split("?")[0], ref_policy) + if body: + return body + + return None + + def download_image(page, img, save_path): """Fetch image and save to disk.""" if save_path.exists(): @@ -1715,8 +1731,8 @@ def tui_check_missing_pages(): return cur2 = conn.cursor() - to_reupload = [] - to_fix_dims = [] + fixed_dims = 0 + reuploaded = 0 print(f"\n Checking {len(selected_chapters)} chapters...") for ch_id, ch_num, ch_title in selected_chapters: @@ -1740,8 +1756,42 @@ def tui_check_missing_pages(): r2_count = r2_count_by_prefix(f"manga/{slug}/chapters/{ch_num}/") if site_count != r2_count: - print(f" [{ch_num}] {ch_title}: site={site_count}, R2={r2_count} — re-upload") - to_reupload.append((ch_id, ch_num, ch_title, ch, images)) + print(f" [{ch_num}] {ch_title}: site={site_count}, R2={r2_count} — re-uploading...") + # Re-upload IMMEDIATELY while browser is on this chapter's reader page + page_bytes = fetch_all_pages(session.page, images) + if len(page_bytes) < len(images): + missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes] + print(f" Could not fetch pages: {missing}") + for mn in missing: + print(f" page {mn}: {images[mn-1]['url']}") + print(f" Skipping chapter") + continue + + def upload_page(args, _slug=slug, _n=ch_num): + pn, raw = args + r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" + with Image.open(io.BytesIO(raw)) as img: + w, h = img.width, img.height + return pn, upload_to_r2(r2_key, convert_to_webp(io.BytesIO(raw))), w, h + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for pn, r2_url, w, h in pool.map(upload_page, page_bytes.items()): + page_urls[pn] = (r2_url, w, h) + done += 1 + print(f" R2: {done}/{len(page_bytes)}", end="\r") + + cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,)) + for pn in sorted(page_urls): + url, w, h = page_urls[pn] + cur2.execute( + 'INSERT INTO "Page" ("chapterId", number, "imageUrl", width, height) VALUES (%s, %s, %s, %s, %s)', + (ch_id, pn, url, w, h), + ) + conn.commit() + reuploaded += 1 + print(f" {len(page_urls)} pages restored" + " " * 20) continue # Count matches — check if DB has valid width/height for all pages @@ -1756,19 +1806,9 @@ def tui_check_missing_pages(): db_count, bad_w, bad_h, min_w, max_w, min_h, max_h = cur2.fetchone() bad_count = max(bad_w, bad_h) if bad_count > 0: - print(f" [{ch_num}] {ch_title}: {bad_count} pages need dims (w {min_w}-{max_w}, h {min_h}-{max_h}) — fix from R2") - to_fix_dims.append((ch_id, ch_num, ch_title)) - else: - print(f" [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})") - - # Fix dimensions by reading existing R2 objects (no re-upload) - if to_fix_dims: - print(f"\n Fixing dimensions for {len(to_fix_dims)} chapter(s)...") - for ch_id, ch_num, ch_title in to_fix_dims: - if esc.stop.is_set(): - break + print(f" [{ch_num}] {ch_title}: {bad_count} pages need dims — fixing from R2...") cur2.execute( - 'SELECT id, number, "imageUrl" FROM "Page" WHERE "chapterId" = %s ' + 'SELECT id, number FROM "Page" WHERE "chapterId" = %s ' 'AND (width IS NULL OR width = 0 OR height IS NULL OR height = 0) ' 'ORDER BY number', (ch_id,), @@ -1776,7 +1816,7 @@ def tui_check_missing_pages(): pages = cur2.fetchall() def read_dims(args, _slug=slug, _n=ch_num): - page_id, pn, _url = args + page_id, pn = args r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" try: data = s3.get_object(Bucket=BUCKET, Key=r2_key)["Body"].read() @@ -1795,47 +1835,12 @@ def tui_check_missing_pages(): ) updated += 1 conn.commit() - print(f" [{ch_num}] {ch_title}: {updated}/{len(pages)} dims updated") + fixed_dims += 1 + print(f" {updated}/{len(pages)} dims updated") + else: + print(f" [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})") - if not to_reupload: - if not to_fix_dims: - print("\n All selected chapters are complete.") - return - - print(f"\n {len(to_reupload)} chapter(s) need re-upload") - - for ch_id, ch_num, ch_title, ch, images in to_reupload: - if esc.stop.is_set(): - break - print(f"\n Re-uploading [{ch_num}] {ch_title}") - page_bytes = fetch_all_pages(session.page, images) - if len(page_bytes) < len(images): - missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes] - print(f" Could not fetch pages: {missing}, skipping") - continue - - # Upload to R2 (overwrites existing) - def upload_page(args, _slug=slug, _n=ch_num): - pn, raw = args - r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" - w, h, webp = probe_and_webp(io.BytesIO(raw)) - return pn, upload_to_r2(r2_key, webp), w, h - - page_urls = {} - done = 0 - with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: - futures = {pool.submit(upload_page, (pn, raw)): pn for pn, raw in page_bytes.items()} - for future in as_completed(futures): - pn, r2_url, w, h = future.result() - page_urls[pn] = (r2_url, w, h) - done += 1 - print(f" R2: {done}/{len(page_bytes)}", end="\r") - - # Replace Page records - cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,)) - insert_pages(cur2, ch_id, page_urls) - conn.commit() - print(f" {len(page_urls)} pages restored" + " " * 20) + print(f"\n Done: {reuploaded} re-uploaded, {fixed_dims} dim-fixed") try: with_browser(run)