From 723b82c9fc807cffb9f3e0e8bea1115d51a97b88 Mon Sep 17 00:00:00 2001 From: yiekheng Date: Sun, 12 Apr 2026 12:59:22 +0800 Subject: [PATCH] Add chapter-level DB tools, dim sync, retries, and all-genres - Check missing pages: compare site page count vs R2, detect NULL/0 width/height in DB and fix by reading WebP bytes from R2 (no re-upload needed when dims are just missing). - Delete specific chapter(s): multi-select or all, removes from R2+DB. - Chapter pickers now offer "All chapters" as first option. - Save all genres comma-separated in Manga.genre (was only the first). - Sync refreshes title/description/genre for existing manga records. - Page inserts now save width and height from PIL (schema update). - Retry failed page fetches up to 3 attempts with 2s backoff instead of skipping after one failure. - Cover detection polls DOM up to 8s and tries broader selectors. Co-Authored-By: Claude Opus 4.6 (1M context) --- manga.py | 475 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 372 insertions(+), 103 deletions(-) diff --git a/manga.py b/manga.py index 2a21248..c483dc7 100644 --- a/manga.py +++ b/manga.py @@ -205,6 +205,23 @@ def with_browser(func): # ── Cloudflare ───────────────────────────────────────────── +def _wait_for_cf_on_page(page, timeout=120): + """Wait for CF to resolve on a specific page.""" + for i in range(timeout): + try: + title = page.title() + except Exception: + time.sleep(1) + continue + if "Just a moment" in title or "challenge" in page.url: + time.sleep(1) + continue + if title and ("嗨皮漫画" in title or "happymh" in page.url): + return True + time.sleep(1) + return False + + def wait_for_cloudflare(session, timeout=120): """Wait for CF to resolve. User solves in the visible browser window.""" page = session.page @@ -406,7 +423,7 @@ def fetch_metadata(page): # ── Happymh: image download ─────────────────────────────── -def _try_get_chapter_images(session, slug, chapter_id): +def _try_get_chapter_images(page, slug, chapter_id): """Single attempt to get chapter images. Returns (images, api_status).""" captured_images = [] api_info = {"found": False, "status": None, "error": None} @@ -443,10 +460,8 @@ def _try_get_chapter_images(session, slug, chapter_id): except Exception as e: api_info["error"] = str(e) - page = session.page page.on("response", on_response) reader_url = f"{BASE_URL}/mangaread/{slug}/{chapter_id}" - print(" Loading reader...") try: page.evaluate(f"window.location.href = '{reader_url}'") except Exception: @@ -459,17 +474,13 @@ def _try_get_chapter_images(session, slug, chapter_id): except Exception: pass - print(" Waiting for page...") - if not wait_for_cloudflare(session, timeout=90): - page = session.page + if not _wait_for_cf_on_page(page, timeout=90): try: page.remove_listener("response", on_response) except Exception: pass return [], api_info - page = session.page - print(" Waiting for API...") deadline = time.time() + 20 while time.time() < deadline: if captured_images: @@ -539,28 +550,39 @@ def _try_get_chapter_images(session, slug, chapter_id): return captured_images, api_info -def get_chapter_images(session, slug, chapter_id): - """Get chapter images. On API 403 (CF expired), navigate to solve and retry.""" - images, api_info = _try_get_chapter_images(session, slug, chapter_id) - if images: - return images - - if api_info.get("status") == 403: - print(" CF expired — solve in browser...") - page = session.page - try: - page.goto(f"{BASE_URL}/mangaread/{slug}/{chapter_id}", wait_until="commit", timeout=60000) - except Exception: - pass - if wait_for_cloudflare(session, timeout=120): - images, _ = _try_get_chapter_images(session, slug, chapter_id) - - return images +def get_chapter_images(page, slug, chapter_id): + """Get chapter images using given page. On API 403, returns empty (caller should handle CF).""" + images, api_info = _try_get_chapter_images(page, slug, chapter_id) + return images, api_info -def fetch_image_bytes(session, img): - """Fetch image via browser network stack, return raw bytes or None.""" - page = session.page +def fetch_all_pages(page, images, max_attempts=3): + """Fetch all pages with retry using given page. Returns {page_num: bytes}.""" + total = len(images) + page_bytes = {} + pending = list(enumerate(images, 1)) + + for attempt in range(1, max_attempts + 1): + if not pending: + break + if attempt > 1: + time.sleep(2) + + next_pending = [] + for pn, img in pending: + body = fetch_image_bytes(page, img) + if body: + page_bytes[pn] = body + else: + next_pending.append((pn, img)) + time.sleep(0.1) + pending = next_pending + + return page_bytes + + +def fetch_image_bytes(page, img): + """Fetch image via browser network stack using given page.""" url = img["url"] ref_policy = "no-referrer" if img.get("no_referrer") else "origin" try: @@ -571,18 +593,16 @@ def fetch_image_bytes(session, img): body = response.body() if body and len(body) > 100: return body - except Exception as e: - if not hasattr(fetch_image_bytes, "_err_logged"): - fetch_image_bytes._err_logged = True - print(f"\n First error: {e}") + except Exception: + pass return None -def download_image(session, img, save_path): +def download_image(page, img, save_path): """Fetch image and save to disk.""" if save_path.exists(): return True - body = fetch_image_bytes(session, img) + body = fetch_image_bytes(page, img) if body: save_path.parent.mkdir(parents=True, exist_ok=True) save_path.write_bytes(body) @@ -606,6 +626,22 @@ def convert_to_webp(source, quality=WEBP_QUALITY): return _to_webp_bytes(Image.open(source), quality) +def probe_and_webp(source, quality=WEBP_QUALITY): + """Open once; return (width, height, webp_bytes).""" + with Image.open(source) as img: + return img.width, img.height, _to_webp_bytes(img, quality) + + +def insert_pages(cur, chapter_id, page_urls): + """page_urls: {page_num: (url, width, height)}. Inserts in page_num order.""" + for pn in sorted(page_urls): + url, w, h = page_urls[pn] + cur.execute( + 'INSERT INTO "Page" ("chapterId", number, "imageUrl", width, height) VALUES (%s, %s, %s, %s, %s)', + (chapter_id, pn, url, w, h), + ) + + def make_cover(source, width=400, height=560): img = Image.open(source) target_ratio = width / height @@ -789,7 +825,7 @@ def download_chapter(session, slug, chapter_index, chapter, manga_dir): folder_name = f"{chapter_index} {ch_name}" chapter_dir = manga_dir / folder_name - images = get_chapter_images(session, slug, ch_id) + images, _ = get_chapter_images(session.page, slug, ch_id) if not images: print(f" No images") return False @@ -797,30 +833,16 @@ def download_chapter(session, slug, chapter_index, chapter, manga_dir): print(f" {len(images)} pages") chapter_dir.mkdir(parents=True, exist_ok=True) + page_bytes = fetch_all_pages(session.page, images) ok = 0 - failed = [] - for pn, img in enumerate(images, 1): + for pn, body in page_bytes.items(): save_path = chapter_dir / f"{pn}.jpg" - if download_image(session, img, save_path): - ok += 1 - print(f" {pn}/{len(images)}", end="\r") - else: - failed.append((pn, img)) - time.sleep(0.1) - - if failed: - time.sleep(1) - for pn, img in failed: - save_path = chapter_dir / f"{pn}.jpg" - if download_image(session, img, save_path): - ok += 1 - else: - print(f" {pn}/{len(images)} FAIL") - time.sleep(0.3) + save_path.write_bytes(body) + ok += 1 print(f" {ok}/{len(images)} downloaded" + " " * 20) - if ok == 0: + if ok < len(images): try: chapter_dir.rmdir() except Exception: @@ -910,17 +932,19 @@ def upload_manga_to_r2(manga_name, conn): def process_page(args, _slug=slug, _order=order_num): j, pf = args r2_key = f"manga/{_slug}/chapters/{_order}/{j}.webp" - if not r2_key_exists(r2_key): - return j, upload_to_r2(r2_key, convert_to_webp(pf)) - return j, f"{PUBLIC_URL}/{r2_key}" + if r2_key_exists(r2_key): + with Image.open(pf) as img: + return j, f"{PUBLIC_URL}/{r2_key}", img.width, img.height + w, h, webp = probe_and_webp(pf) + return j, upload_to_r2(r2_key, webp), w, h page_urls = {} done = 0 with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)} for future in as_completed(futures): - j, url = future.result() - page_urls[j] = url + j, url, w, h = future.result() + page_urls[j] = (url, w, h) done += 1 print(f" {done}/{len(page_files)}", end="\r") @@ -934,8 +958,7 @@ def upload_manga_to_r2(manga_name, conn): (manga_id, order_num, chapter_title), ) chapter_id = cur.fetchone()[0] - for j in sorted(page_urls): - cur.execute('INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', (chapter_id, j, page_urls[j])) + insert_pages(cur, chapter_id, page_urls) conn.commit() print(f" {len(page_files)} pages uploaded" + " " * 10) @@ -1127,80 +1150,71 @@ def cmd_sync(manga_url=None): cur.execute('SELECT number FROM "Chapter" WHERE "mangaId" = %s', (manga_id,)) existing_numbers = {row[0] for row in cur.fetchall()} - new_count = 0 - for i, ch in enumerate(chapters, 1): + # 3. Collect chapters to sync + todo = [(i, ch) for i, ch in enumerate(chapters, 1) if i not in existing_numbers] + + if not todo: + print(" Already up to date!") + continue + + print(f" {len(todo)} new chapters to sync") + + completed = 0 + skipped = 0 + for i, ch in todo: if esc.stop.is_set(): break ch_name = ch["chapterName"] - if i in existing_numbers: - continue - - new_count += 1 print(f" [{i}/{len(chapters)}] {ch_name} (id={ch['id']})") - # Get image URLs from reader page - images = get_chapter_images(session, slug, ch["id"]) + images, api_info = get_chapter_images(session.page, slug, ch["id"]) + if not images and api_info.get("status") == 403: + print(f" CF blocked — run Setup and try again") + esc.stop.set() + break if not images: print(f" No images") + skipped += 1 continue print(f" {len(images)} pages") - - # Fetch each image into RAM, convert to WebP, upload to R2 - page_bytes = {} # page_num -> raw bytes - ok = 0 - for pn, img in enumerate(images, 1): - body = fetch_image_bytes(session, img) - if body: - page_bytes[pn] = body - ok += 1 - print(f" Fetched {pn}/{len(images)}", end="\r") - else: - print(f" {pn}/{len(images)} FAIL") - time.sleep(0.1) - - if not page_bytes: - print(f" No images fetched, skip") + page_bytes = fetch_all_pages(session.page, images) + if len(page_bytes) < len(images): + missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes] + print(f" Could not fetch pages: {missing}, skipping chapter") + skipped += 1 continue - # Upload to R2 first - def upload_page(args, _slug=slug, _i=i): + def upload_one(args, _slug=slug, _i=i): pn, raw = args r2_key = f"manga/{_slug}/chapters/{_i}/{pn}.webp" - webp = convert_to_webp(io.BytesIO(raw)) - return pn, upload_to_r2(r2_key, webp) + w, h, webp = probe_and_webp(io.BytesIO(raw)) + return pn, upload_to_r2(r2_key, webp), w, h page_urls = {} done = 0 with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: - futures = {pool.submit(upload_page, (pn, raw)): pn for pn, raw in page_bytes.items()} - for future in as_completed(futures): - pn, r2_url = future.result() - page_urls[pn] = r2_url + for pn, r2_url, w, h in pool.map(upload_one, page_bytes.items()): + page_urls[pn] = (r2_url, w, h) done += 1 print(f" R2: {done}/{len(page_bytes)}", end="\r") if not page_urls: - print(f" R2 upload failed, skip") + skipped += 1 continue - # Only create DB records after R2 upload succeeds cur.execute( 'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id', (manga_id, i, ch_name), ) chapter_id = cur.fetchone()[0] - for pn in sorted(page_urls): - cur.execute('INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)', (chapter_id, pn, page_urls[pn])) + insert_pages(cur, chapter_id, page_urls) conn.commit() + completed += 1 print(f" {len(page_urls)} pages synced" + " " * 20) - time.sleep(REQUEST_DELAY) - if new_count == 0: - print(" Already up to date!") - else: - print(f" Synced {new_count} new chapters") + print(f" Synced {completed}/{len(todo)} chapters ({skipped} skipped)") try: with_browser(run) @@ -1584,12 +1598,261 @@ def tui_edit_manga(): conn.close() +def _pick_manga_and_chapters(conn, prompt="Select chapters", multi=True): + """Helper: pick manga from DB, then pick chapter(s). Returns (slug, [(ch_id, ch_num, ch_title), ...]) or None.""" + cur = conn.cursor() + cur.execute('SELECT id, slug, title FROM "Manga" ORDER BY title') + mangas = cur.fetchall() + if not mangas: + print(" No manga in DB") + return None + + items = [f"{i+1}. {title} ({slug})" for i, (_, slug, title) in enumerate(mangas)] + sel = tui_select("Select manga (/ to search):", items, search=True) + if sel < 0: + return None + manga_id, slug, _ = mangas[sel] + + cur.execute('SELECT id, number, title FROM "Chapter" WHERE "mangaId" = %s ORDER BY number', (manga_id,)) + chapters = cur.fetchall() + if not chapters: + print(" No chapters in DB for this manga") + return None + + if multi: + scope = tui_select(f"{prompt}: {len(chapters)} chapters", [ + "All chapters", + "Select specific chapters", + ]) + if scope == -1: + return None + if scope == 0: + return slug, list(chapters) + + items = [f"{num}. {title}" for _, num, title in chapters] + menu = TerminalMenu( + items, + title="Space=toggle, Enter=confirm, /=search:", + multi_select=True, + show_multi_select_hint=True, + search_key="/", + show_search_hint=True, + ) + selected = menu.show() + if not selected: + return None + if isinstance(selected, int): + selected = (selected,) + picked = [chapters[i] for i in selected] + else: + items = [f"{num}. {title}" for _, num, title in chapters] + sel = tui_select(f"{prompt} (/ to search):", items, search=True) + if sel < 0: + return None + picked = [chapters[sel]] + + return slug, picked + + +def tui_delete_chapter(): + """Delete specific chapter(s) from R2 + DB.""" + try: + conn = get_db() + except Exception as e: + print(f" DB error: {e}") + return + try: + result = _pick_manga_and_chapters(conn, "Select chapters to delete") + if not result: + return + slug, to_delete = result + confirm = input(f" Delete {len(to_delete)} chapter(s) from R2 + DB? [y/N] ").strip().lower() + if confirm != "y": + print(" Cancelled.") + return + + cur = conn.cursor() + for ch_id, ch_num, ch_title in to_delete: + print(f" Deleting [{ch_num}] {ch_title}...") + r2_delete_prefix(f"manga/{slug}/chapters/{ch_num}/") + cur.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,)) + cur.execute('DELETE FROM "Chapter" WHERE id = %s', (ch_id,)) + conn.commit() + print(f" Done.") + finally: + conn.close() + + +def tui_check_missing_pages(): + """Check selected chapters against the site's actual page count and re-upload if mismatched.""" + try: + conn = get_db() + except Exception as e: + print(f" DB error: {e}") + return + + try: + result = _pick_manga_and_chapters(conn, "Select chapters to check") + if not result: + return + slug, selected_chapters = result + + if slug not in [slug_from_url(u) for u in load_manga_urls()]: + print(f" {slug} not in manga.json — cannot re-fetch pages") + return + except Exception: + conn.close() + raise + + # Load reader pages and compare site's actual page count vs R2 + def run(session): + with EscListener() as esc: + result = load_manga_page(session, slug) + if not result: + return + chapters, _, _ = result + if not chapters: + return + + cur2 = conn.cursor() + to_reupload = [] + to_fix_dims = [] + + print(f"\n Checking {len(selected_chapters)} chapters...") + for ch_id, ch_num, ch_title in selected_chapters: + if esc.stop.is_set(): + break + if ch_num > len(chapters): + print(f" [{ch_num}] {ch_title}: out of range on site") + continue + + ch = chapters[ch_num - 1] + images, api_info = get_chapter_images(session.page, slug, ch["id"]) + if not images: + if api_info.get("status") == 403: + print(f" [{ch_num}] CF blocked — run Setup") + esc.stop.set() + break + print(f" [{ch_num}] {ch_title}: no images from site") + continue + + site_count = len(images) + r2_count = r2_count_by_prefix(f"manga/{slug}/chapters/{ch_num}/") + + if site_count != r2_count: + print(f" [{ch_num}] {ch_title}: site={site_count}, R2={r2_count} — re-upload") + to_reupload.append((ch_id, ch_num, ch_title, ch, images)) + continue + + # Count matches — check if DB has valid width/height for all pages + cur2.execute( + 'SELECT COUNT(*), ' + 'COUNT(*) FILTER (WHERE width IS NULL OR width <= 0), ' + 'COUNT(*) FILTER (WHERE height IS NULL OR height <= 0), ' + 'MIN(width), MAX(width), MIN(height), MAX(height) ' + 'FROM "Page" WHERE "chapterId" = %s', + (ch_id,), + ) + db_count, bad_w, bad_h, min_w, max_w, min_h, max_h = cur2.fetchone() + bad_count = max(bad_w, bad_h) + if bad_count > 0: + print(f" [{ch_num}] {ch_title}: {bad_count} pages need dims (w {min_w}-{max_w}, h {min_h}-{max_h}) — fix from R2") + to_fix_dims.append((ch_id, ch_num, ch_title)) + else: + print(f" [{ch_num}] {ch_title}: {site_count} pages OK (w {min_w}-{max_w}, h {min_h}-{max_h})") + + # Fix dimensions by reading existing R2 objects (no re-upload) + if to_fix_dims: + print(f"\n Fixing dimensions for {len(to_fix_dims)} chapter(s)...") + for ch_id, ch_num, ch_title in to_fix_dims: + if esc.stop.is_set(): + break + cur2.execute( + 'SELECT id, number, "imageUrl" FROM "Page" WHERE "chapterId" = %s ' + 'AND (width IS NULL OR width = 0 OR height IS NULL OR height = 0) ' + 'ORDER BY number', + (ch_id,), + ) + pages = cur2.fetchall() + + def read_dims(args, _slug=slug, _n=ch_num): + page_id, pn, _url = args + r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" + try: + data = s3.get_object(Bucket=BUCKET, Key=r2_key)["Body"].read() + with Image.open(io.BytesIO(data)) as img: + return page_id, img.width, img.height + except Exception: + return page_id, None, None + + updated = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + for page_id, w, h in pool.map(read_dims, pages): + if w and h: + cur2.execute( + 'UPDATE "Page" SET width = %s, height = %s WHERE id = %s', + (w, h, page_id), + ) + updated += 1 + conn.commit() + print(f" [{ch_num}] {ch_title}: {updated}/{len(pages)} dims updated") + + if not to_reupload: + if not to_fix_dims: + print("\n All selected chapters are complete.") + return + + print(f"\n {len(to_reupload)} chapter(s) need re-upload") + + for ch_id, ch_num, ch_title, ch, images in to_reupload: + if esc.stop.is_set(): + break + print(f"\n Re-uploading [{ch_num}] {ch_title}") + page_bytes = fetch_all_pages(session.page, images) + if len(page_bytes) < len(images): + missing = [pn for pn in range(1, len(images) + 1) if pn not in page_bytes] + print(f" Could not fetch pages: {missing}, skipping") + continue + + # Upload to R2 (overwrites existing) + def upload_page(args, _slug=slug, _n=ch_num): + pn, raw = args + r2_key = f"manga/{_slug}/chapters/{_n}/{pn}.webp" + w, h, webp = probe_and_webp(io.BytesIO(raw)) + return pn, upload_to_r2(r2_key, webp), w, h + + page_urls = {} + done = 0 + with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool: + futures = {pool.submit(upload_page, (pn, raw)): pn for pn, raw in page_bytes.items()} + for future in as_completed(futures): + pn, r2_url, w, h = future.result() + page_urls[pn] = (r2_url, w, h) + done += 1 + print(f" R2: {done}/{len(page_bytes)}", end="\r") + + # Replace Page records + cur2.execute('DELETE FROM "Page" WHERE "chapterId" = %s', (ch_id,)) + insert_pages(cur2, ch_id, page_urls) + conn.commit() + print(f" {len(page_urls)} pages restored" + " " * 20) + + try: + with_browser(run) + finally: + conn.close() + + print("\nCheck complete!") + + def tui_r2_manage(): while True: idx = tui_select("R2 / DB Management", [ "Status", "Edit manga info", "Delete specific manga", + "Delete specific chapter", + "Check missing pages", "Clear ALL (R2 + DB)", "Recompress manga (quality 65)", ]) @@ -1651,6 +1914,12 @@ def tui_r2_manage(): print(f" DB error: {e}") elif idx == 3: + tui_delete_chapter() + + elif idx == 4: + tui_check_missing_pages() + + elif idx == 5: confirm = input(" Delete ALL R2 + DB? [y/N] ").strip().lower() if confirm == "y": r2_delete_prefix("") @@ -1665,7 +1934,7 @@ def tui_r2_manage(): except Exception as e: print(f" DB error: {e}") - elif idx == 4: + elif idx == 6: slugs = r2_list_prefixes() if not slugs: print(" R2 is empty")