First commit for scraper
commit a3bf75bd36
.gitignore  vendored  Normal file  (+2)
@@ -0,0 +1,2 @@
/node_modules
/test
colamanga_scraper.js  Normal file  (+131)
@@ -0,0 +1,131 @@
const { BaseScraper } = require('./scraper.js');
const sharp = require('sharp');
const fs = require('fs');
const path = require('path');

class ColaMangaScraper extends BaseScraper {
    constructor() {
        super();
    }

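    // Encode the raw image buffer as WebP (quality 80) and write it into
    // `dir`, creating the directory first if it does not exist.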
    async saveBufferAsWebp(buffer, filename, dir = '.') {
        const dirPath = path.resolve(dir);
        const filePath = path.join(dirPath, filename);

        try {
            await fs.promises.mkdir(dirPath, { recursive: true });
            await sharp(buffer).webp({ quality: 80 }).toFile(filePath);
        } catch (error) {
            console.error(`Failed to save ${filename}:`, error);
        }
    }

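    // Scrape the manga detail page: title, author, aliases, genres, status,
    // the chapter list, and the cover image captured by the response hook.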
    async getMangaInfo(mangaUrl) {
        const page = await this.loadPage(mangaUrl);
        await page.waitForSelector('.fed-deta-info', { visible: true });
        const mangaName = await page.$eval('.fed-deta-content h1', el => el.textContent);
        const elements = await page.$$('.fed-deta-content li');
        const mangaInfo = {
            name: mangaName,
            author: '',
            nickNames: [],
            genres: [],
            status: '',
            chapters: []
        };
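        // The detail-list labels are Chinese: 状态 = status, 作者 = author,
        // 别名 = aliases, 类别 = genres.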
        for (const el of elements) {
            const span = await el.$eval('span', el => el.textContent.trim());
            if (span === '状态') {
                mangaInfo.status = await el.$eval('a', el => el.textContent);
            } else if (span === '作者') {
                mangaInfo.author = await el.$eval('a', el => el.textContent);
            } else if (span === '别名') {
                mangaInfo.nickNames = await el.$$eval('a', els => els.map(el => el.textContent));
            } else if (span === '类别') {
                mangaInfo.genres = await el.$$eval('a', els => els.map(el => el.textContent));
            }
        }

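        // Collect every chapter's name and relative URL from the chapter list.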
        const chapterElements = await page.$$('.all_data_list li');
        mangaInfo.chapters = await Promise.all(chapterElements.map(async el => {
            const chapterName = await el.$eval('a', el => el.textContent);
            const chapterUrl = await el.$eval('a', el => el.getAttribute('href'));
            return {
                name: chapterName,
                url: chapterUrl
            };
        }));

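        // The cover image arrives asynchronously via the response listener in
        // BaseScraper; poll until it has been captured.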
        while (!this.pages_response[this.pages.indexOf(page)]) {
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        mangaInfo.coverPic = this.pages_response[this.pages.indexOf(page)];
        await this.closePage(page);
        return mangaInfo;
    }

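    // Download every image of a chapter into its own directory, skipping
    // chapters that already exist on disk.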
    async downloadChapterPics(chapter, chapterDir = '.') {
        const directoryPath = path.resolve(chapterDir);
        if (fs.existsSync(directoryPath)) {
            console.log(`Skipping ${chapter.name} as it already exists`);
            return;
        }
        fs.mkdirSync(directoryPath, { recursive: true });

        const page = await this.loadPage(chapter.url);
        const pageIndex = this.pages.indexOf(page);
        await page.waitForSelector('.mh_mangalist', { visible: true });

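        // Images lazy-load as the page scrolls; retry up to 10 times until no
        // loading spinners remain and at least one image response is captured.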
        for (let attempt = 0; attempt < 10; attempt++) {
            console.log(`Downloading ${chapter.name}, attempt ${attempt + 1}`);
            await this.scrollPage(page);
            const loadingElements = await page.$$eval('.mh_loading:not([style*="display: none"])', elements => elements.length);
            await new Promise(resolve => setTimeout(resolve, 1000));
            // Guard against the response map not having been created yet.
            if (loadingElements === 0 && this.pages_response[pageIndex] && Object.keys(this.pages_response[pageIndex]).length !== 0) {
                break;
            }
        }

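        // Each captured blob response is one page image; name it by its order
        // on the page and save it as WebP.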
        const responses = this.pages_response[pageIndex];
        for (const [url, response] of Object.entries(responses)) {
            const fileName = (await this.getImgOrder(page, url)) + '.webp';
            // `response` is already a Buffer, which sharp accepts directly,
            // so no Blob round-trip is needed.
            await this.saveBufferAsWebp(response, fileName, chapterDir);
        }

        await this.closePage(page);
    }

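    // Download a list of chapters sequentially, one directory per chapter.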
    async downloadChapter(chapters, dir = '.') {
        const dirPath = path.resolve(dir);
        if (!fs.existsSync(dirPath)) {
            fs.mkdirSync(dirPath, { recursive: true });
        }
        // const chapter = chapters[Math.floor(Math.random() * chapters.length)];
        for (const chapter of chapters) {
            await this.downloadChapterPics(chapter, path.join(dir, chapter.name));
        }
    }

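    // Find the page-order attribute `p` of the container whose <img> src
    // matches `src`, zero-padded so filenames sort naturally.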
    async getImgOrder(page, src) {
        const loadingAttributes = await page.$$eval('.mh_comicpic', (elements, src) => {
            return elements
                .filter(el => el.querySelector(`img[src="${src}"]`))
                .map(el => el.getAttribute('p'))
                .map(str => str.padStart(elements.length.toString().length, '0'));
        }, src);
        // At most one container should match the src; return its order string
        // rather than the single-element array.
        return loadingAttributes[0];
    }
}

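// Example run: scrape one manga's info, save its cover, and download all
// chapters into ./test.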
(async () => {
    const scraper = new ColaMangaScraper();
    await scraper.init();
    const mangaUrl = 'https://www.colamanga.com/manga-od825111/';
    const mangaInfo = await scraper.getMangaInfo(mangaUrl);
    await scraper.saveBufferAsWebp(mangaInfo.coverPic, 'cover.webp', 'test');
    await scraper.downloadChapter(mangaInfo.chapters, 'test');
    await scraper.closeAllPages();
    console.log(mangaInfo);
})();

index.js  Normal file  (+8)
@@ -0,0 +1,8 @@
const axios = require('axios');

const postData = {
    mangaId: 'yourMangaId',
    mangaName: 'yourMangaName'
};

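// Post the manga record to the local API; assumes a server is listening on
// port 4000 with an /insert endpoint.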
axios.post('http://localhost:4000/insert', postData)
    .catch(err => console.error(err));
manga_urls.json  Normal file  (+3)
@@ -0,0 +1,3 @@
{
    "大奉打更人": "https://www.colamanga.com/manga-lo816008/"
}
package-lock.json  generated  Normal file  (+1840)
File diff suppressed because it is too large.
package.json  Normal file  (+18)
@@ -0,0 +1,18 @@
{
    "name": "sunnymh-scrap",
    "version": "1.0.0",
    "main": "index.js",
    "scripts": {
        "start": "node index.js",
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "author": "",
    "license": "ISC",
    "description": "",
    "dependencies": {
        "axios": "^1.8.4",
        "puppeteer": "^24.4.0",
        "sharp": "^0.33.5"
    }
}
scraper.js  Normal file  (+81)
@@ -0,0 +1,81 @@
const puppeteer = require('puppeteer');

exports.BaseScraper = class BaseScraper {
    constructor() {
        this.browser = null;
        this.pages = [];
        this.pages_response = [];
    }

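    // Launch headless Chromium. The no-sandbox flags are commonly needed when
    // running in containers or as root (assumed deployment detail).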
    async init() {
        this.browser = await puppeteer.launch({
            // headless: false,
            args: ['--no-sandbox', '--disable-setuid-sandbox']
        });
    }

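    // Open a new tab, hook its network responses (cover image and blob page
    // images), then navigate to `url`.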
    async openPage(url) {
        const page = await this.browser.newPage();

        page.on('response', async (response) => {
            const pageIndex = this.pages.indexOf(page);

            if (response.url().includes('https://res.colamanga.com')) {
                // Cover image: store the raw buffer for this page.
                this.pages_response[pageIndex] = await response.buffer();
            } else if (/blob:https:\/\/www\.colamanga\.com\//.test(response.url())) {
                // Chapter images arrive as blob: URLs; key each buffer by URL.
                if (!this.pages_response[pageIndex]) {
                    this.pages_response[pageIndex] = {};
                }
                this.pages_response[pageIndex][response.url()] = await response.buffer();
            }
        });

        // Register the page before navigating so the response handler can
        // resolve its index while the page is still loading.
        this.pages.push(page);
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        return page;
    }

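    // Close a single page and drop its entries from the tracking arrays.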
    async closePage(page) {
        const pageIndex = this.pages.indexOf(page);
        if (pageIndex !== -1) {
            this.pages.splice(pageIndex, 1);
            this.pages_response.splice(pageIndex, 1);
        }
        await page.close();
    }

    async closeAllPages() {
        await Promise.all(this.pages.map(page => page.close()));
        this.pages = [];
        this.pages_response = [];
        await this.browser.close();
    }

    async loadPage(url) {
        const page = await this.openPage(url);
        return page;
    }

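    // Scroll from the top of the page to the bottom in small steps to trigger
    // lazy-loaded images.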
    async scrollPage(page) {
        await page.evaluate(async () => {
            window.scrollTo(0, 0);
            await new Promise((resolve) => {
                const distance = 100; // distance to scroll per step (px)
                const delay = 100;    // delay between scrolls (ms)
                const scrollInterval = setInterval(() => {
                    window.scrollBy(0, distance);
                    if ((window.innerHeight + window.scrollY) >= document.body.offsetHeight) {
                        clearInterval(scrollInterval);
                        resolve();
                    }
                }, delay);
            });
        });
    }
}