first commit
This commit is contained in:
commit
721ad213ee
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
.env
|
||||
__pycache__/
|
||||
manga-content/
|
||||
.browser-data/
|
||||
cookies.txt
|
||||
.DS_Store
|
||||
47
CLAUDE.md
Normal file
47
CLAUDE.md
Normal file
@ -0,0 +1,47 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
Manga downloader for m.happymh.com. Reads manga URLs from `manga.json` and downloads chapter images into `manga-content/`.
|
||||
|
||||
## Data Flow
|
||||
|
||||
1. **Input**: `manga.json` — JSON array of manga URLs (e.g., `["https://m.happymh.com/manga/butiange"]`)
|
||||
2. **Output**: `manga-content/<manga-name>/<chapter-number> <chapter-title>/*.jpg` — downloaded page images
|
||||
3. **Metadata**: `manga-content/<manga-name>/detail.json` — stores manga/chapter metadata
|
||||
|
||||
## Directory Convention
|
||||
|
||||
```
|
||||
manga-content/
|
||||
butiange/
|
||||
detail.json
|
||||
1 第一回/
|
||||
1.jpg
|
||||
2.jpg
|
||||
3.jpg
|
||||
2 第二回/
|
||||
...
|
||||
```
|
||||
|
||||
- Manga name is the URL slug (last path segment of the manga URL)
|
||||
- Chapter folders are named `<number> <title>` (e.g., `1 第一回`)
|
||||
- Image filenames are sequential page numbers (`1.jpg`, `2.jpg`, ...)
|
||||
|
||||
## Metadata Format (`detail.json`)
|
||||
|
||||
Each manga folder contains a `detail.json` with fields:
|
||||
- `mg-url` — source URL on m.happymh.com
|
||||
- `mg-title` — manga title (Chinese)
|
||||
- `mg-author` — author name
|
||||
- `mg-genres` — array of genre tags
|
||||
- `mg-description` — synopsis text
|
||||
|
||||
## Target Site
|
||||
|
||||
- Base URL: `https://m.happymh.com`
|
||||
- Manga page: `/manga/<slug>` — contains chapter listing
|
||||
- Chapter page: `/mangaread/<slug>/<chapter-id>` — contains page images
|
||||
- The site is mobile-oriented; requests should use appropriate mobile User-Agent headers
|
||||
686
download.py
Normal file
686
download.py
Normal file
@ -0,0 +1,686 @@
|
||||
"""
|
||||
Manga downloader for m.happymh.com (educational purposes only).
|
||||
|
||||
Launches real Chrome via subprocess (not Playwright), then connects via
|
||||
Chrome DevTools Protocol. Images are downloaded directly via HTTP.
|
||||
|
||||
Usage:
|
||||
python download.py --setup # open Chrome, solve CF manually, exit
|
||||
python download.py # download manga from manga.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import socket
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
BASE_URL = "https://m.happymh.com"
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/143.0.0.0 Safari/537.36"
|
||||
)
|
||||
ROOT_DIR = Path(__file__).parent
|
||||
CONTENT_DIR = ROOT_DIR / "manga-content"
|
||||
MANGA_JSON = ROOT_DIR / "manga.json"
|
||||
BROWSER_DATA = ROOT_DIR / ".browser-data"
|
||||
CDP_PORT = 9333
|
||||
REQUEST_DELAY = 1.5
|
||||
|
||||
CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
|
||||
|
||||
|
||||
def is_port_open(port):
    """Return True when something is accepting TCP connections on localhost:port."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex reports failure via a nonzero errno instead of raising.
        result = sock.connect_ex(("localhost", port))
    finally:
        sock.close()
    return result == 0
|
||||
|
||||
|
||||
def launch_chrome(start_url=None):
    """Launch real Chrome with CDP port.

    Returns the subprocess.Popen handle for the spawned Chrome, or None
    when some Chrome instance is already listening on CDP_PORT (in that
    case the caller must not terminate it). Exits the whole process if
    the Chrome binary is missing or the DevTools port never opens.
    """
    if is_port_open(CDP_PORT):
        print(f"Chrome already on port {CDP_PORT}")
        return None

    # Hard-coded macOS install path; fail fast if absent.
    if not Path(CHROME_PATH).exists():
        print(f"Chrome not found at: {CHROME_PATH}")
        sys.exit(1)

    cmd = [
        CHROME_PATH,
        f"--remote-debugging-port={CDP_PORT}",
        # Dedicated profile dir so CF cookies persist between runs.
        f"--user-data-dir={BROWSER_DATA}",
        "--no-first-run",
        "--no-default-browser-check",
    ]
    if start_url:
        cmd.append(start_url)

    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Poll for up to ~15s until the DevTools port accepts connections.
    for _ in range(30):
        if is_port_open(CDP_PORT):
            time.sleep(1)  # brief grace period for Chrome to finish initializing
            return proc
        time.sleep(0.5)

    print("Chrome failed to start")
    sys.exit(1)
|
||||
|
||||
|
||||
def wait_for_cloudflare(page, timeout=120):
    """Wait for CF to resolve. User solves CAPTCHA manually if needed.

    Polls the page roughly once per second for up to *timeout* seconds.
    Returns True once the real site is reached (site title or happymh
    URL), False if the challenge never clears.
    """
    for i in range(timeout):
        try:
            title = page.title()
        except Exception:
            # Page may be mid-navigation; retry shortly.
            time.sleep(1)
            continue

        # Cloudflare interstitial detected — keep waiting while the
        # user (possibly) solves the challenge in the visible browser.
        if "Just a moment" in title or "challenge" in page.url:
            if i == 0:
                print(" CF challenge — solve in browser...")
            elif i % 15 == 0:
                print(f" Still waiting for CF... ({i}s)")
            time.sleep(1)
            continue

        # Success: the site's Chinese title, or any happymh URL.
        if title and "嗨皮漫画" in title:
            return True
        if title and "happymh" in page.url:
            return True

        time.sleep(1)

    print(" CF timed out.")
    return False
|
||||
|
||||
|
||||
def fetch_chapters_via_api(page, slug):
    """Get full chapter list via chapterByPage API with pagination.

    The fetch() runs inside the page so the Cloudflare clearance cookies
    apply. Returns a list of {id, chapterName} dicts on success, or None
    when the API produced nothing (caller falls back to DOM scraping).
    """
    # Pagination is capped at 30 pages; the response's `total` field
    # stops the loop early once every chapter has been collected.
    result = page.evaluate("""
        async (slug) => {
            const all = [];
            let total = 0;
            for (let p = 1; p <= 30; p++) {
                const url = `/v2.0/apis/manga/chapterByPage?code=${slug}&lang=cn&order=asc&page=${p}&_t=${Date.now()}`;
                try {
                    const ctrl = new AbortController();
                    setTimeout(() => ctrl.abort(), 10000);
                    const r = await fetch(url, { signal: ctrl.signal });
                    if (!r.ok) { if (p === 1) return { error: r.status }; break; }
                    const json = await r.json();
                    if (!json.data) break;

                    total = json.data.total || total;

                    // Find chapter array in response
                    let items = null;
                    for (const val of Object.values(json.data)) {
                        if (Array.isArray(val) && val.length > 0) {
                            items = val;
                            break;
                        }
                    }
                    if (!items || items.length === 0) break;

                    for (const ch of items) {
                        all.push({
                            id: String(ch.id || ''),
                            chapterName: ch.chapterName || ch.name || '',
                        });
                    }

                    if (total && all.length >= total) break;
                } catch (e) {
                    if (p === 1) return { error: e.message };
                    break;
                }
            }
            return { chapters: all, total };
        }
    """, slug)

    if result and result.get("chapters") and len(result["chapters"]) > 0:
        chapters = result["chapters"]
        total = result.get("total", len(chapters))
        print(f" API: {len(chapters)}/{total} chapters")
        return chapters

    # An error is only reported when even page 1 failed.
    if result and result.get("error"):
        print(f" API error: {result['error']}")

    return None
|
||||
|
||||
|
||||
def fetch_chapters_from_dom(page):
    """Scrape all chapters from the MUI Drawer chapter list.
    Opens drawer, clicks 'load more' repeatedly, then scrapes.

    DOM fallback used when the chapterByPage API fails. Returns a list
    of {id, chapterName} dicts in DOM order, or None when no chapter
    links could be found.
    """
    try:
        page.wait_for_selector("a[href*='/mangaread/']", timeout=15000)
        page.wait_for_timeout(1000)
    except Exception:
        print(" No chapter links found")
        return None

    # Step 1: Open the chapter list drawer
    # (try several Chinese "expand/show all" labels, then generic classes)
    for selector in [
        "text=展开全部", "text=查看全部", "text=全部章节",
        "text=展开更多", "text=更多",
        "[class*='expand']", "[class*='more']",
    ]:
        try:
            btn = page.query_selector(selector)
            if btn and btn.is_visible():
                btn.click()
                print(" Opening chapter drawer...")
                page.wait_for_timeout(2000)
                break
        except Exception:
            continue

    # Step 2: Wait for drawer
    try:
        page.wait_for_selector(".MuiDrawer-paper", timeout=5000)
    except Exception:
        # Not fatal: later queries fall back to the whole document.
        print(" Drawer not found, using page chapters")

    # Step 3: Click sort button to get ascending order (oldest first)
    try:
        sort_btn = page.query_selector("text=点我改变排序")
        if sort_btn and sort_btn.is_visible():
            sort_btn.click()
            print(" Sorting ascending...")
            page.wait_for_timeout(2000)
    except Exception:
        pass

    # Step 4: Click "点我加载更多" until all chapters loaded
    # Get expected total from header "共177个章节"
    total = page.evaluate("""
        () => {
            const spans = document.querySelectorAll('.MuiDrawer-paper span');
            for (const s of spans) {
                const m = s.textContent.match(/共(\\d+)个章节/);
                if (m) return parseInt(m[1]);
            }
            return 0;
        }
    """)
    if total:
        print(f" Total chapters: {total}")

    # Up to 50 "load more" rounds; stop early once the link count
    # reaches the advertised total or no button remains.
    for round_num in range(50):
        count = page.evaluate(
            "document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length"
        )
        if total and count >= total:
            break
        print(f" Loading... {count}/{total or '?'}", end="\r")

        # Find and click the "load more" element — search fresh each time
        clicked = page.evaluate("""
            () => {
                const walker = document.createTreeWalker(
                    document.querySelector('.MuiDrawer-paper') || document.body,
                    NodeFilter.SHOW_TEXT
                );
                while (walker.nextNode()) {
                    if (walker.currentNode.textContent.includes('加载更多')) {
                        let el = walker.currentNode.parentElement;
                        while (el && el.tagName !== 'LI') el = el.parentElement;
                        if (el) { el.click(); return true; }
                        walker.currentNode.parentElement.click();
                        return true;
                    }
                }
                return false;
            }
        """)
        if not clicked:
            break
        page.wait_for_timeout(1000)

    count = page.evaluate(
        "document.querySelectorAll('.MuiDrawer-paper a[href*=\"/mangaread/\"]').length"
    )
    print(f" Loaded {count} chapters" + " " * 20)

    # Step 5: Scrape chapters from the drawer
    # (chapter id is the numeric path segment after /mangaread/<slug>/;
    # '开始阅读' is the "start reading" button, not a chapter)
    chapters = page.evaluate("""
        () => {
            const drawer = document.querySelector('.MuiDrawer-paper');
            const container = drawer || document;
            const links = container.querySelectorAll('a[href*="/mangaread/"]');
            const chapters = [];
            const seen = new Set();
            links.forEach(a => {
                const href = a.getAttribute('href');
                const match = href.match(/\\/mangaread\\/[^/]+\\/(\\d+)/);
                if (match && !seen.has(match[1])) {
                    seen.add(match[1]);
                    const name = a.textContent.trim();
                    if (name && name !== '开始阅读') {
                        chapters.push({ id: match[1], chapterName: name });
                    }
                }
            });
            return chapters;
        }
    """)

    # Step 6: Close drawer
    try:
        page.keyboard.press("Escape")
        page.wait_for_timeout(500)
    except Exception:
        pass

    return chapters if chapters else None
|
||||
|
||||
|
||||
def fetch_metadata(page):
    """Extract manga metadata and cover URL from the loaded page.

    Scrapes the rendered HTML with regexes (the mobile site uses stable
    mg-* class names) and returns a dict shaped like detail.json:
    mg-url, mg-title, mg-author, mg-genres, mg-description, mg-cover —
    each key present only when found.
    """
    html_text = page.content()
    metadata = {"mg-url": page.url}

    m = re.search(r'<h2 class="mg-title">(.*?)</h2>', html_text)
    if m:
        metadata["mg-title"] = m.group(1).strip()

    # Author is the first link inside the sub-title paragraph.
    m = re.search(r'<p class="mg-sub-title"><a[^>]*>(.*?)</a>', html_text)
    if m:
        metadata["mg-author"] = m.group(1).strip()

    # Genres: every link inside the first mg-cate paragraph.
    genre_matches = re.findall(r'<p class="mg-cate">.*?</p>', html_text, re.DOTALL)
    if genre_matches:
        genres = re.findall(r'<a[^>]*>(.*?)</a>', genre_matches[0])
        metadata["mg-genres"] = genres

    m = re.search(r'<div class="mg-desc">.*?<p[^>]*>(.*?)</p>', html_text, re.DOTALL)
    if m:
        metadata["mg-description"] = m.group(1).strip()

    # Extract cover image URL
    cover_url = page.evaluate("""
        () => {
            // Try og:image meta tag
            const og = document.querySelector('meta[property="og:image"]');
            if (og) return og.content;
            // Try common cover selectors
            const selectors = ['img.mg-cover', '.mg-cover img', '.cover img', 'img[src*="mcover"]'];
            for (const sel of selectors) {
                const img = document.querySelector(sel);
                if (img && img.src) return img.src;
            }
            return null;
        }
    """)
    if cover_url:
        metadata["mg-cover"] = cover_url

    return metadata
|
||||
|
||||
|
||||
def get_chapter_images(page, slug, chapter_id):
    """Navigate to reader page, intercept the API response for image URLs.

    Attaches a response listener before navigating, waits for the
    /apis/manga/reading response to yield the scan list, and falls back
    to scraping <img> tags from the DOM. Returns a list of
    {url, no_referrer} dicts (possibly empty).
    """
    captured_images = []
    api_info = {"found": False, "error": None}

    def on_response(response):
        # Only the chapter-reading API carries the page-image list.
        if "/apis/manga/reading" not in response.url:
            return
        api_info["found"] = True
        if response.status != 200:
            api_info["error"] = f"status {response.status}"
            return
        try:
            data = response.json()
            scans = data.get("data", {}).get("scans", [])
            # "scans" is sometimes double-encoded as a JSON string.
            if isinstance(scans, str):
                scans = json.loads(scans)
            for scan in scans:
                if isinstance(scan, dict) and "url" in scan:
                    captured_images.append({
                        "url": scan["url"],
                        # "r" flag marks images that must be fetched
                        # without a Referer header.
                        "no_referrer": scan.get("r", 0) != 0,
                    })
        except Exception as e:
            api_info["error"] = str(e)

    page.on("response", on_response)

    reader_url = f"{BASE_URL}/mangaread/{slug}/{chapter_id}"
    print(" Loading reader...")
    # Navigate via location.href rather than page.goto so an aborted
    # navigation (e.g. CF interstitial) doesn't raise.
    try:
        page.evaluate(f"window.location.href = '{reader_url}'")
    except Exception:
        pass

    time.sleep(2)

    # Neutralize window.close so the page can't shut our tab.
    try:
        page.evaluate("window.close = () => {}")
    except Exception:
        pass

    print(" Waiting for page...")
    if not wait_for_cloudflare(page, timeout=90):
        page.remove_listener("response", on_response)
        return []

    print(" Waiting for API...")
    # Poll up to 20s for the listener to capture the scan list.
    deadline = time.time() + 20
    while time.time() < deadline:
        if captured_images:
            break
        try:
            page.wait_for_timeout(500)
        except Exception:
            break

    page.remove_listener("response", on_response)

    if not api_info["found"]:
        print(" API not intercepted")
    elif api_info["error"]:
        print(f" API: {api_info['error']}")

    # DOM fallback
    # (filter out covers, CF assets and SVG placeholders)
    if not captured_images:
        try:
            page.wait_for_timeout(3000)
            dom_images = page.evaluate("""
                () => {
                    const imgs = document.querySelectorAll('img[src*="http"]');
                    const urls = [];
                    const seen = new Set();
                    imgs.forEach(img => {
                        const src = img.src || '';
                        if (src && !seen.has(src) && !src.includes('/mcover/')
                            && !src.includes('cloudflare') && !src.includes('.svg')) {
                            seen.add(src);
                            urls.push(src);
                        }
                    });
                    return urls;
                }
            """)
            if dom_images:
                print(f" DOM: {len(dom_images)} images")
                for u in dom_images:
                    captured_images.append({"url": u, "no_referrer": False})
        except Exception as e:
            print(f" DOM failed: {e}")

    return captured_images
|
||||
|
||||
|
||||
def download_image(page, img, save_path):
    """Download image via browser network stack. Captures raw bytes via CDP — no base64.

    Triggers an in-page fetch() (so Cloudflare cookies and the required
    referrer policy apply) and reads the raw response body. Returns True
    on success or when the target file already exists; False otherwise.
    """
    if save_path.exists():
        return True

    url = img["url"]
    # Some image hosts reject hotlinks unless the referrer is omitted.
    ref_policy = "no-referrer" if img.get("no_referrer") else "origin"

    try:
        with page.expect_response(lambda r: url in r.url, timeout=15000) as resp_info:
            page.evaluate(
                "([u, r]) => fetch(u, { referrerPolicy: r })",
                [url, ref_policy],
            )
        response = resp_info.value
        if response.status == 200:
            body = response.body()  # raw bytes from network layer
            # Bodies under ~100 bytes are error pages, not images.
            if body and len(body) > 100:
                save_path.parent.mkdir(parents=True, exist_ok=True)
                save_path.write_bytes(body)
                return True
    except Exception as e:
        # Log only the first failure (function attribute as a one-shot
        # flag) to avoid flooding the console on flaky chapters.
        if not hasattr(download_image, "_err_logged"):
            download_image._err_logged = True
            print(f"\n First error: {e}")
        return False

    return False
|
||||
|
||||
|
||||
def get_existing_chapters(manga_dir):
    """Return the names of chapter subdirectories that already hold
    at least one downloaded .jpg page."""
    if not manga_dir.exists():
        return set()
    # A chapter only counts as downloaded when it contains an image.
    return {
        entry.name
        for entry in manga_dir.iterdir()
        if entry.is_dir() and any(entry.glob("*.jpg"))
    }
|
||||
|
||||
|
||||
def download_manga(page, manga_url):
    """Download all chapters using a single page.

    Pipeline: load the manga page (capturing /mcover/ traffic for the
    cover), list chapters via API or DOM, persist detail.json and
    cover.jpg, then download each not-yet-present chapter into
    manga-content/<slug>/<n> <title>/.
    """
    # Slug is the last path segment of the manga URL (directory name).
    slug = urlparse(manga_url).path.strip("/").split("/")[-1]
    manga_dir = CONTENT_DIR / slug

    print(f"\n{'='*60}")
    print(f"Manga: {slug}")
    print(f"{'='*60}")

    # Intercept all cover images from page load traffic
    cover_responses = {}

    def on_manga_response(response):
        if "/mcover/" in response.url and response.status == 200:
            try:
                cover_responses[response.url] = response.body()
            except Exception:
                pass

    page.on("response", on_manga_response)

    print("Loading manga page...")
    # wait_until="commit": don't block on full load; CF handling follows.
    try:
        page.goto(f"{BASE_URL}/manga/{slug}", wait_until="commit", timeout=60000)
    except Exception:
        pass
    if not wait_for_cloudflare(page):
        page.remove_listener("response", on_manga_response)
        return

    print("Fetching chapters via API...")
    chapters = fetch_chapters_via_api(page, slug)
    if not chapters:
        print(" API failed, trying DOM...")
        chapters = fetch_chapters_from_dom(page)
    if not chapters:
        print("No chapters found.")
        return

    print(f"Found {len(chapters)} chapters")

    # Merge scraped metadata into any existing detail.json.
    metadata = fetch_metadata(page)
    manga_dir.mkdir(parents=True, exist_ok=True)
    detail_path = manga_dir / "detail.json"
    if metadata:
        existing_meta = {}
        if detail_path.exists():
            try:
                existing_meta = json.loads(detail_path.read_text(encoding="utf-8"))
            except json.JSONDecodeError:
                pass
        existing_meta.update(metadata)
        detail_path.write_text(
            json.dumps(existing_meta, ensure_ascii=False, indent=4),
            encoding="utf-8",
        )

    # Save cover image — match the correct one from DOM
    page.remove_listener("response", on_manga_response)
    cover_path = manga_dir / "cover.jpg"
    if not cover_path.exists():
        # Find the actual cover URL from the first mip-fill-content img
        cover_url = page.evaluate("""
            () => {
                const img = document.querySelector('img.mip-fill-content[src*="mcover"]');
                return img ? img.src : null;
            }
        """)
        cover_body = None
        if cover_url:
            # Exact match first
            cover_body = cover_responses.get(cover_url)
            # Substring match fallback
            # (query strings may differ between DOM src and network URL)
            if not cover_body:
                for url, data in cover_responses.items():
                    if cover_url.split("?")[0] in url or url.split("?")[0] in cover_url:
                        cover_body = data
                        break

        if cover_body and len(cover_body) > 100:
            cover_path.write_bytes(cover_body)
            print(f"Cover saved ({len(cover_body)} bytes)")
        else:
            print(f"Cover not found (captured {len(cover_responses)} mcover images, target: {cover_url})")

    existing_chapters = get_existing_chapters(manga_dir)

    # Chapters are already in DOM order (ascending from drawer)
    chapters_sorted = chapters

    for i, chapter in enumerate(chapters_sorted, 1):
        ch_id = chapter["id"]
        ch_name = chapter["chapterName"]
        folder_name = f"{i} {ch_name}"

        # Skip if this chapter already downloaded (check by chapter name)
        already = any(ch_name in name for name in existing_chapters)
        if already:
            print(f" [{i}/{len(chapters_sorted)}] {ch_name} — skip")
            continue

        print(f" [{i}/{len(chapters_sorted)}] {ch_name} (id={ch_id})")

        images = get_chapter_images(page, slug, ch_id)
        if not images:
            print(f" No images")
            continue

        print(f" {len(images)} pages")
        chapter_dir = manga_dir / folder_name
        chapter_dir.mkdir(parents=True, exist_ok=True)

        # Download images via browser network stack (raw bytes, no base64)
        ok = 0
        failed = []
        for pn, img in enumerate(images, 1):
            save_path = chapter_dir / f"{pn}.jpg"
            if download_image(page, img, save_path):
                ok += 1
                print(f" {pn}/{len(images)}", end="\r")
            else:
                failed.append((pn, img))
            time.sleep(0.1)

        # Retry failed images once
        if failed:
            time.sleep(1)
            for pn, img in failed:
                save_path = chapter_dir / f"{pn}.jpg"
                if download_image(page, img, save_path):
                    ok += 1
                else:
                    print(f" {pn}/{len(images)} FAIL")
                time.sleep(0.3)

        print(f" {ok}/{len(images)} downloaded" + " " * 20)

        # Remove the empty directory so the chapter is retried next run.
        if ok == 0:
            try:
                chapter_dir.rmdir()
            except Exception:
                pass

        time.sleep(REQUEST_DELAY)

    print(f"\nDone: {slug}")
|
||||
|
||||
|
||||
def setup_mode():
    """Launch Chrome for manual CF solving.

    Opens Chrome on the site, waits for the user to pass the Cloudflare
    challenge by hand, then connects over CDP once just to confirm the
    cf_clearance cookie landed in the persistent profile.
    """
    print("=== SETUP ===")
    print("Chrome will open. Do this:")
    print(" 1. Go to m.happymh.com — solve Cloudflare")
    print(" 2. Open a manga page — solve CF if prompted")
    print(" 3. Open a chapter reader — solve CF if prompted")
    print(" 4. Press ENTER here when done\n")

    chrome_proc = launch_chrome(BASE_URL)

    input(">>> Press ENTER when Cloudflare is solved... ")

    # Verification is best-effort; failures only print a warning.
    try:
        with sync_playwright() as p:
            browser = p.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}")
            ctx = browser.contexts[0]
            cookies = ctx.cookies()
            cf = [c for c in cookies if c["name"] == "cf_clearance"]
            if cf:
                print("cf_clearance found!")
            else:
                print("Warning: cf_clearance not found")
            browser.close()
    except Exception as e:
        print(f"Could not verify: {e}")

    # chrome_proc is None when Chrome was already running — leave it be.
    if chrome_proc:
        chrome_proc.terminate()

    print("Done. Now run: python download.py")
|
||||
|
||||
|
||||
def main():
    """Entry point.

    With --setup: run the interactive Cloudflare-solving flow and exit.
    Otherwise: read the URL list from manga.json and download each manga
    through one shared browser page connected over CDP.
    """
    if "--setup" in sys.argv:
        setup_mode()
        return

    if not MANGA_JSON.exists():
        print(f"Error: {MANGA_JSON} not found")
        sys.exit(1)

    manga_urls = json.loads(MANGA_JSON.read_text(encoding="utf-8"))
    if not isinstance(manga_urls, list) or not manga_urls:
        print("Error: manga.json should be a JSON array of URLs")
        sys.exit(1)

    print(f"Found {len(manga_urls)} manga(s)")
    print("Launching Chrome...\n")

    chrome_proc = launch_chrome()

    try:
        with sync_playwright() as p:
            browser = p.chromium.connect_over_cdp(f"http://localhost:{CDP_PORT}")
            context = browser.contexts[0]
            # Reuse the existing tab when there is one.
            page = context.pages[0] if context.pages else context.new_page()

            for url in manga_urls:
                try:
                    download_manga(page, url)
                except Exception as e:
                    # One failing manga must not abort the whole run.
                    print(f"\nError: {url}: {e}")
                    import traceback
                    traceback.print_exc()

            browser.close()
    finally:
        # Only terminate the Chrome we spawned (None if it pre-existed).
        if chrome_proc:
            chrome_proc.terminate()

    print("\nAll done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
92
export_cookies.py
Normal file
92
export_cookies.py
Normal file
@ -0,0 +1,92 @@
|
||||
"""
|
||||
Opens a browser to m.happymh.com, waits for you to pass Cloudflare,
|
||||
then saves cookies to cookies.txt in Netscape format.
|
||||
|
||||
Install:
|
||||
pip install playwright
|
||||
playwright install chromium
|
||||
|
||||
Usage:
|
||||
python export_cookies.py
|
||||
"""
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
except ImportError:
|
||||
print("Playwright not installed. Run:")
|
||||
print(" pip install playwright")
|
||||
print(" playwright install chromium")
|
||||
raise SystemExit(1)
|
||||
|
||||
COOKIES_FILE = Path(__file__).parent / "cookies.txt"
|
||||
TARGET_URL = "https://m.happymh.com"
|
||||
|
||||
|
||||
def cookies_to_netscape(cookies):
    """Convert Playwright cookies to Netscape cookies.txt format.

    Each line is tab-separated: domain, subdomain flag, path, secure,
    expiry (unix seconds), name, value. Playwright reports session
    cookies with expires == -1, but the Netscape format uses 0 for
    session cookies — negative expiries are clamped to 0 so parsers
    like curl/wget accept the file.
    """
    lines = ["# Netscape HTTP Cookie File", ""]
    for c in cookies:
        domain = c["domain"]
        # Netscape format: leading dot means accessible to subdomains
        if not domain.startswith("."):
            domain = "." + domain
        flag = "TRUE"  # accessible to subdomains
        path = c.get("path", "/")
        secure = "TRUE" if c.get("secure", False) else "FALSE"
        # Clamp Playwright's -1 (session cookie) to the Netscape 0.
        expires = str(max(0, int(c.get("expires", 0))))
        name = c["name"]
        value = c["value"]
        lines.append(f"{domain}\t{flag}\t{path}\t{secure}\t{expires}\t{name}\t{value}")
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def main():
    """Open a browser on the site, let the user pass Cloudflare, then
    dump the context cookies to cookies.txt in Netscape format."""
    print("Opening browser to m.happymh.com...")
    print("Once the page loads (past Cloudflare), press ENTER here to save cookies.\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        # Emulate an iPhone — the target site is mobile-oriented.
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) "
                "Version/16.0 Mobile/15E148 Safari/604.1"
            ),
            viewport={"width": 390, "height": 844},
            is_mobile=True,
        )

        page = context.new_page()
        page.goto(TARGET_URL)

        input(">>> Page opened. Pass Cloudflare if needed, then press ENTER to save cookies... ")

        cookies = context.cookies()
        if not cookies:
            print("No cookies found!")
            browser.close()
            return

        # Check for cf_clearance
        # (without it, downloads will hit the challenge page again)
        cookie_names = [c["name"] for c in cookies]
        if "cf_clearance" in cookie_names:
            print("cf_clearance cookie found (Cloudflare passed)")
        else:
            print("Warning: cf_clearance not found. You may still be on the challenge page.")
            answer = input("Save anyway? [y/N] ").strip().lower()
            if answer != "y":
                browser.close()
                return

        text = cookies_to_netscape(cookies)
        COOKIES_FILE.write_text(text)
        print(f"\nSaved {len(cookies)} cookies to {COOKIES_FILE}")

        browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
4
manga.json
Normal file
4
manga.json
Normal file
@ -0,0 +1,4 @@
|
||||
[
|
||||
"https://m.happymh.com/manga/moutianchengweimoshen",
|
||||
"https://m.happymh.com/manga/butiange"
|
||||
]
|
||||
393
upload.py
Normal file
393
upload.py
Normal file
@ -0,0 +1,393 @@
|
||||
"""
|
||||
Interactive manga uploader — Cloudflare R2 + PostgreSQL.
|
||||
|
||||
R2 storage layout:
|
||||
manga/<slug>/cover.webp
|
||||
manga/<slug>/chapters/<number>/<page>.webp
|
||||
|
||||
Usage:
|
||||
python upload.py
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
import boto3
|
||||
import psycopg2
|
||||
from PIL import Image
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
ROOT_DIR = Path(__file__).parent
|
||||
CONTENT_DIR = ROOT_DIR / "manga-content"
|
||||
|
||||
# R2 config
|
||||
s3 = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=f"https://{os.environ['R2_ACCOUNT_ID']}.r2.cloudflarestorage.com",
|
||||
aws_access_key_id=os.environ["R2_ACCESS_KEY"],
|
||||
aws_secret_access_key=os.environ["R2_SECRET_KEY"],
|
||||
region_name="auto",
|
||||
)
|
||||
BUCKET = os.environ["R2_BUCKET"]
|
||||
PUBLIC_URL = os.environ["R2_PUBLIC_URL"].rstrip("/")
|
||||
|
||||
# Database
|
||||
DATABASE_URL = os.environ["DATABASE_URL"]
|
||||
UPLOAD_WORKERS = 8
|
||||
|
||||
|
||||
def convert_to_webp(image_path, quality=80):
    """Re-encode the image at *image_path* and return WEBP bytes."""
    buffer = io.BytesIO()
    Image.open(image_path).save(buffer, format="WEBP", quality=quality)
    return buffer.getvalue()
|
||||
|
||||
|
||||
def make_cover(image_path, width=400, height=560):
    """Crop and resize an image into a width x height WEBP cover.

    Images wider than the target ratio are center-cropped horizontally;
    taller images keep the top band. The result is resized with Lanczos
    and encoded as WEBP (quality 85).
    """
    img = Image.open(image_path)
    target_ratio = width / height
    if img.width / img.height > target_ratio:
        # Too wide: trim equal slices off both sides.
        crop_width = int(img.height * target_ratio)
        x0 = (img.width - crop_width) // 2
        img = img.crop((x0, 0, x0 + crop_width, img.height))
    else:
        # Too tall: keep the top portion.
        crop_height = int(img.width / target_ratio)
        img = img.crop((0, 0, img.width, crop_height))
    img = img.resize((width, height), Image.LANCZOS)
    encoded = io.BytesIO()
    img.save(encoded, format="WEBP", quality=85)
    return encoded.getvalue()
|
||||
|
||||
|
||||
def upload_to_r2(key, data, content_type="image/webp"):
    """Upload *data* to the R2 bucket under *key*; return its public URL."""
    s3.put_object(
        Bucket=BUCKET,
        Key=key,
        Body=data,
        ContentType=content_type,
    )
    public_url = f"{PUBLIC_URL}/{key}"
    return public_url
|
||||
|
||||
|
||||
def r2_key_exists(key):
    """Return True when *key* is already present in the R2 bucket."""
    try:
        # head_object raises ClientError (e.g. 404) for missing keys.
        s3.head_object(Bucket=BUCKET, Key=key)
    except s3.exceptions.ClientError:
        return False
    return True
|
||||
|
||||
|
||||
def get_db():
    """Open a PostgreSQL connection with UTF-8 client encoding."""
    connection = psycopg2.connect(DATABASE_URL)
    connection.set_client_encoding("UTF8")
    return connection
|
||||
|
||||
|
||||
def parse_chapter_dir(dir_name):
    """Split a chapter folder name into (number, title).

    '1 001. 序章' -> (1, '001. 序章'). Names without a leading number
    fall back to (0, <original name>).
    """
    match = re.match(r"^(\d+)\s+(.+)$", dir_name)
    if match is None:
        return 0, dir_name
    number, title = match.groups()
    return int(number), title
|
||||
|
||||
|
||||
def list_local_manga():
    """List manga directory names under manga-content/, sorted,
    excluding hidden directories."""
    names = [
        entry.name
        for entry in CONTENT_DIR.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    ]
    names.sort()
    return names
|
||||
|
||||
|
||||
# ── Commands ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def cmd_reset():
    """Clear all R2 storage.

    Collects every object key via the list_objects_v2 paginator, then
    issues one delete_objects call per listed page, in parallel.
    """
    print("\nClearing R2 bucket...")
    total = 0
    batches = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=BUCKET):
        objects = page.get("Contents", [])
        if not objects:
            break
        # delete_objects accepts the same page-sized batches we listed.
        batches.append([{"Key": obj["Key"]} for obj in objects])

    # Delete batches in parallel
    def delete_batch(keys):
        s3.delete_objects(Bucket=BUCKET, Delete={"Objects": keys})
        return len(keys)

    with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
        for count in pool.map(delete_batch, batches):
            total += count
            print(f" {total} deleted", end="\r")

    print(f" {total} objects deleted from R2" + " " * 10)
    print("R2 cleared. Run 'upload' to re-upload.\n")
|
||||
|
||||
|
||||
def cmd_status(conn):
    """Show current state of R2 and database.

    Prints row counts for the Manga/Chapter/Page tables, a per-manga
    chapter count from the DB, the total R2 object count, and what is
    present locally under manga-content/.
    """
    cur = conn.cursor()

    # DB counts
    cur.execute('SELECT COUNT(*) FROM "Manga"')
    manga_count = cur.fetchone()[0]
    cur.execute('SELECT COUNT(*) FROM "Chapter"')
    chapter_count = cur.fetchone()[0]
    cur.execute('SELECT COUNT(*) FROM "Page"')
    page_count = cur.fetchone()[0]

    print(f"\n Database: {manga_count} manga, {chapter_count} chapters, {page_count} pages")

    # List manga in DB
    cur.execute('SELECT slug, title, (SELECT COUNT(*) FROM "Chapter" WHERE "mangaId" = "Manga".id) FROM "Manga" ORDER BY slug')
    for slug, title, ch_count in cur.fetchall():
        print(f" {slug}: {title} ({ch_count} chapters)")

    # R2 count
    total = 0
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=BUCKET):
        total += len(page.get("Contents", []))
    print(f" R2: {total} objects")

    # Local
    local = list_local_manga()
    print(f" Local: {len(local)} manga in manga-content/")
    for name in local:
        manga_path = CONTENT_DIR / name
        chapters = [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")]
        has_cover = (manga_path / "cover.jpg").exists()
        print(f" {name}: {len(chapters)} chapters, cover: {'yes' if has_cover else 'no'}")
    print()
|
||||
|
||||
|
||||
def cmd_upload(conn, manga_name=None):
    """Upload manga to R2 and create DB records."""
    # A single explicit name must exist on disk; otherwise take every local manga.
    if manga_name:
        if not (CONTENT_DIR / manga_name).is_dir():
            print(f" Not found: {CONTENT_DIR / manga_name}")
            return
        targets = [manga_name]
    else:
        targets = list_local_manga()

    if not targets:
        print(" No manga found in manga-content/")
        return

    print(f"\n Uploading {len(targets)} manga(s)...")
    for target in targets:
        upload_manga(target, conn)
    print("\nUpload complete!")
|
||||
|
||||
|
||||
def upload_manga(manga_name, conn):
    """Upload one local manga (cover + chapter pages) to R2 and mirror it in the DB.

    Reads ``manga-content/<manga_name>/detail.json`` for metadata, converts
    images to WebP, uploads them in parallel, and inserts Manga/Chapter/Page
    rows. Chapters already recorded in the DB (matched by chapter number) and
    objects already present in R2 are skipped, so re-running is safe.

    Args:
        manga_name: folder name under CONTENT_DIR; doubles as the URL slug.
        conn: open PostgreSQL connection (committed per chapter).
    """
    manga_path = CONTENT_DIR / manga_name
    detail_path = manga_path / "detail.json"

    if not detail_path.exists():
        print(f" Skipping {manga_name}: no detail.json")
        return

    detail = json.loads(detail_path.read_text(encoding="utf-8"))
    title = detail.get("mg-title", manga_name)
    slug = manga_name  # folder name doubles as the DB slug
    genres = detail.get("mg-genres", [])
    description = detail.get("mg-description", "")
    if not description and genres:
        # Fall back to a genre list when the source had no synopsis.
        description = f"Genres: {', '.join(genres)}"
    genre = genres[0] if genres else "Drama"  # DB column holds a single genre

    print(f"\n {'='*50}")
    print(f" {title} ({slug})")
    print(f" {'='*50}")

    cur = conn.cursor()

    # Cover: convert/upload once; reuse the existing R2 object on re-runs.
    cover_file = manga_path / "cover.jpg"
    cover_url = ""
    cover_key = f"manga/{slug}/cover.webp"

    if cover_file.exists():
        if not r2_key_exists(cover_key):
            cover_data = make_cover(cover_file)
            cover_url = upload_to_r2(cover_key, cover_data)
            print(" Cover uploaded")  # was f-string with no placeholders
        else:
            cover_url = f"{PUBLIC_URL}/{cover_key}"
            print(" Cover exists")  # was f-string with no placeholders
    else:
        print(" No cover.jpg")

    # Manga record: refresh the cover on an existing row, else insert a new one.
    cur.execute('SELECT id, "coverUrl" FROM "Manga" WHERE slug = %s', (slug,))
    row = cur.fetchone()

    if row:
        manga_id, existing_cover = row
        print(f" Manga exists (id: {manga_id})")
        if cover_url and cover_url != existing_cover:
            cur.execute(
                'UPDATE "Manga" SET "coverUrl" = %s, "updatedAt" = NOW() WHERE id = %s',
                (cover_url, manga_id),
            )
            conn.commit()
    else:
        cur.execute(
            """
            INSERT INTO "Manga" (title, description, "coverUrl", slug, genre, status, "createdAt", "updatedAt")
            VALUES (%s, %s, %s, %s, %s, 'PUBLISHED', NOW(), NOW())
            RETURNING id
            """,
            (title, description, cover_url, slug, genre),
        )
        manga_id = cur.fetchone()[0]
        conn.commit()
        print(f" Created manga (id: {manga_id})")

    def _numeric_stem(f):
        # Sort key: first run of digits in the filename stem, 0 if none.
        # Single regex pass per file (the original evaluated re.search twice).
        m = re.search(r"(\d+)", f.stem)
        return int(m.group(1)) if m else 0

    # Chapters, ordered by the numeric prefix of the folder name.
    chapter_dirs = sorted(
        [d for d in manga_path.iterdir() if d.is_dir() and not d.name.startswith(".")],
        key=lambda d: parse_chapter_dir(d.name)[0],
    )

    for chapter_dir in chapter_dirs:
        order_num, chapter_title = parse_chapter_dir(chapter_dir.name)
        if order_num == 0:
            # presumably 0 means the folder name didn't parse — TODO confirm
            # against parse_chapter_dir.
            continue

        # Skip chapters already recorded for this manga (idempotent re-runs).
        cur.execute(
            'SELECT id FROM "Chapter" WHERE "mangaId" = %s AND number = %s',
            (manga_id, order_num),
        )
        if cur.fetchone():
            print(f" [{order_num}] {chapter_title} — skip")
            continue

        page_files = sorted(
            [f for f in chapter_dir.iterdir() if f.suffix.lower() in (".jpg", ".jpeg", ".png", ".webp")],
            key=_numeric_stem,
        )

        if not page_files:
            continue

        print(f" [{order_num}] {chapter_title} ({len(page_files)} pages)")

        cur.execute(
            'INSERT INTO "Chapter" ("mangaId", number, title) VALUES (%s, %s, %s) RETURNING id',
            (manga_id, order_num, chapter_title),
        )
        chapter_id = cur.fetchone()[0]
        conn.commit()

        # Convert pages to WebP and upload in parallel; keys already in R2
        # are not re-uploaded, only their public URL is reconstructed.
        def process_page(args):
            j, page_file = args
            r2_key = f"manga/{slug}/chapters/{order_num}/{j}.webp"
            if not r2_key_exists(r2_key):
                webp_data = convert_to_webp(page_file)
                return j, upload_to_r2(r2_key, webp_data)
            return j, f"{PUBLIC_URL}/{r2_key}"

        page_urls = {}
        done = 0
        with ThreadPoolExecutor(max_workers=UPLOAD_WORKERS) as pool:
            futures = {pool.submit(process_page, (j, f)): j for j, f in enumerate(page_files, 1)}
            for future in as_completed(futures):
                j, url = future.result()
                page_urls[j] = url
                done += 1
                print(f" {done}/{len(page_files)}", end="\r")

        # Insert Page rows in page order, then commit the chapter as a unit.
        for j in sorted(page_urls):
            cur.execute(
                'INSERT INTO "Page" ("chapterId", number, "imageUrl") VALUES (%s, %s, %s)',
                (chapter_id, j, page_urls[j]),
            )

        conn.commit()
        print(f" {len(page_files)} pages uploaded" + " " * 10)
|
||||
|
||||
|
||||
# ── Interactive loop ──────────────────────────────────────
|
||||
|
||||
|
||||
def show_menu():
    """Print the interactive menu banner and option list."""
    banner = "=" * 40
    lines = (
        "",
        banner,
        " Manga Uploader",
        banner,
        " 1. Status",
        " 2. Upload all manga",
        " 3. Upload specific manga",
        " 4. Reset R2 storage",
        " 0. Quit",
        "",
    )
    for line in lines:
        print(line)
|
||||
|
||||
|
||||
def main():
    """Run the interactive menu loop until the user quits or sends EOF."""

    def choose_and_upload(db_conn):
        # Choice 3: let the user pick one local manga by list index.
        names = list_local_manga()
        if not names:
            print(" No manga in manga-content/")
            return
        print()
        for pos, entry in enumerate(names, 1):
            print(f" {pos}. {entry}")
        print()
        raw = input("Select manga number: ").strip()
        try:
            chosen = int(raw) - 1
            if 0 <= chosen < len(names):
                cmd_upload(db_conn, names[chosen])
            else:
                print(" Invalid selection")
        except ValueError:
            print(" Invalid input")

    conn = get_db()
    try:
        while True:
            show_menu()
            try:
                selection = input("Select [0-4]: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C at the prompt exits cleanly.
                print()
                break

            if selection == "0":
                break
            if selection == "1":
                cmd_status(conn)
            elif selection == "2":
                cmd_upload(conn)
            elif selection == "3":
                choose_and_upload(conn)
            elif selection == "4":
                answer = input(" Delete ALL R2 objects? [y/N] ").strip().lower()
                if answer == "y":
                    cmd_reset()
                else:
                    print(" Cancelled.")
            else:
                print(" Invalid choice")
    finally:
        conn.close()

    print("Bye!")
|
||||
|
||||
|
||||
# Entry point: launch the interactive uploader menu when run as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user