First commit for scraper
This commit is contained in:
commit
a3bf75bd36
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
/node_modules
|
||||||
|
/test
|
||||||
131
colamanga_scraper.js
Normal file
131
colamanga_scraper.js
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
const { BaseScraper } = require('./scraper.js');
|
||||||
|
const sharp = require('sharp');
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
/**
 * Scraper for colamanga.com manga pages.
 *
 * Relies on the bookkeeping inherited from BaseScraper: `this.pages` holds the
 * open pages and `this.pages_response[i]` holds whatever response bodies were
 * captured for `this.pages[i]`.
 */
class ColaMangaScraper extends BaseScraper {
  constructor() {
    super();
  }

  /**
   * Encode an image buffer as WebP (quality 80) and write it to `dir/filename`.
   * The directory is created if missing. Failures are logged, not thrown, so a
   * single bad image does not abort a whole chapter.
   *
   * @param {Buffer|Uint8Array|ArrayBuffer} buffer - Encoded source image.
   * @param {string} filename - Output file name (including extension).
   * @param {string} [dir='.'] - Output directory.
   * @returns {Promise<void>}
   */
  async saveBufferAsWebp(buffer, filename, dir = '.') {
    const dirPath = path.resolve(dir);
    const filePath = path.join(dirPath, filename);

    try {
      await fs.promises.mkdir(dirPath, { recursive: true });
      // `.webp()` already selects the output format; the extra
      // `.toFormat('webp')` the original chained on was redundant.
      await sharp(buffer).webp({ quality: 80 }).toFile(filePath);
    } catch (error) {
      console.error(`Failed to save ${filename}:`, error);
    }
  }

  /**
   * Alias of {@link ColaMangaScraper#saveBufferAsWebp}.
   *
   * Existing call sites use this name although only `saveBufferAsWebp` was
   * defined, which made every save throw a TypeError. Both names now work.
   *
   * @param {Buffer|Uint8Array|ArrayBuffer} buffer
   * @param {string} filename
   * @param {string} [dir='.']
   * @returns {Promise<void>}
   */
  async saveBufferToWebp(buffer, filename, dir = '.') {
    return this.saveBufferAsWebp(buffer, filename, dir);
  }

  /**
   * Load a manga detail page and collect its metadata and chapter list.
   *
   * @param {string} mangaUrl - URL of the manga detail page.
   * @param {number} [coverTimeoutMs=30000] - Maximum time to wait for a
   *   response body to be captured for this page before giving up.
   * @returns {Promise<{name: string, author: string, nickNames: string[],
   *   genres: string[], status: string, chapters: {name: string, url: string}[],
   *   coverPic: *}>} `coverPic` is whatever was captured in
   *   `this.pages_response` for the page, or `null` if nothing arrived in time.
   */
  async getMangaInfo(mangaUrl, coverTimeoutMs = 30000) {
    const page = await this.loadPage(mangaUrl);

    try {
      await page.waitForSelector('.fed-deta-info', { visible: true });

      const mangaInfo = {
        name: await page.$eval('.fed-deta-content h1', el => el.textContent),
        author: '',
        nickNames: [],
        genres: [],
        status: '',
        chapters: []
      };

      // Read every info row in one round trip. Rows without a <span> label or
      // without links are tolerated instead of throwing.
      const infoRows = await page.$$eval('.fed-deta-content li', items =>
        items.map(li => {
          const label = li.querySelector('span');
          return {
            label: label ? label.textContent.trim() : '',
            links: Array.from(li.querySelectorAll('a'), a => a.textContent)
          };
        })
      );

      for (const { label, links } of infoRows) {
        if (label === '状态') {
          mangaInfo.status = links.length > 0 ? links[0] : '';
        } else if (label === '作者') {
          mangaInfo.author = links.length > 0 ? links[0] : '';
        } else if (label === '别名') {
          mangaInfo.nickNames = links;
        } else if (label === '类别') {
          mangaInfo.genres = links;
        }
      }

      // Use the anchor's resolved `href` property rather than the raw
      // attribute: a relative attribute value cannot be passed to page.goto().
      mangaInfo.chapters = await page.$$eval('.all_data_list li', items =>
        items
          .map(li => li.querySelector('a'))
          .filter(a => a !== null && a.hasAttribute('href'))
          .map(a => ({ name: a.textContent, url: a.href }))
      );

      // Poll for the captured response, but never spin forever if none shows up.
      const deadline = Date.now() + coverTimeoutMs;
      let cover = this.pages_response[this.pages.indexOf(page)];
      while (!cover && Date.now() < deadline) {
        await sleep(100);
        cover = this.pages_response[this.pages.indexOf(page)];
      }
      if (!cover) {
        console.warn(`No cover response captured for ${mangaUrl} within ${coverTimeoutMs} ms`);
      }
      mangaInfo.coverPic = cover || null;

      return mangaInfo;
    } finally {
      // Release the page even when a selector wait or evaluation throws.
      await this.closePage(page);
    }
  }

  /**
   * Download every captured image of one chapter into `chapterDir` as
   * zero-padded `<position>.webp` files. A chapter whose directory already
   * exists is skipped.
   *
   * @param {{name: string, url: string}} chapter - Chapter to download.
   * @param {string} [chapterDir='.'] - Destination directory for this chapter.
   * @returns {Promise<void>}
   */
  async downloadChapterPics(chapter, chapterDir = '.') {
    const directoryPath = path.resolve(chapterDir);
    if (fs.existsSync(directoryPath)) {
      console.log(`Skipping ${chapter.name} as it already exists`);
      return;
    }
    // The directory is deliberately NOT created here: saveBufferAsWebp creates
    // it on the first successful write. Creating it up front made a chapter
    // whose download failed look "already downloaded" on every later run.

    const page = await this.loadPage(chapter.url);
    // Resolve the slot on every read (indices shift when other pages close)
    // and fall back to an empty object while nothing has been captured.
    const capturedResponses = () => this.pages_response[this.pages.indexOf(page)] || {};

    try {
      await page.waitForSelector('.mh_mangalist', { visible: true });

      for (let attempt = 0; attempt < 10; attempt++) {
        console.log(`Downloading ${chapter.name}, attempt ${attempt + 1}`);
        await this.scrollPage(page);
        const loadingElements = await page.$$eval('.mh_loading:not([style*="display: none"])', elements => elements.length);
        await sleep(1000);
        if (loadingElements === 0 && Object.keys(capturedResponses()).length !== 0) {
          break;
        }
      }

      // Keep only entries that really are binary bodies.
      const images = Object.entries(capturedResponses())
        .filter(([, body]) => body instanceof Uint8Array);
      if (images.length === 0) {
        console.warn(`No images captured for ${chapter.name}`);
      }

      for (const [url, body] of images) {
        // getImgOrder returns an array; an empty one means the image is not on
        // the page, and such images would otherwise all overwrite ".webp".
        const [order] = await this.getImgOrder(page, url);
        if (order === undefined) {
          console.warn(`No page position found for ${url}; skipping`);
          continue;
        }
        // sharp accepts the buffer directly; no Blob round trip is needed.
        await this.saveBufferAsWebp(body, `${order}.webp`, chapterDir);
      }
    } finally {
      await this.closePage(page);
    }
  }

  /**
   * Download the given chapters one after another, each into its own
   * sub-directory of `dir` named after the chapter.
   *
   * @param {{name: string, url: string}[]} chapters - Chapters to download.
   * @param {string} [dir='.'] - Parent directory for the chapter folders.
   * @returns {Promise<void>}
   */
  async downloadChapter(chapters, dir = '.') {
    // Recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(path.resolve(dir), { recursive: true });

    for (const chapter of chapters) {
      // Chapter names come from scraped page text; strip characters that would
      // change the path (separators, control characters, "..").
      await this.downloadChapterPics(chapter, path.join(dir, toSafeDirName(chapter.name)));
    }
  }

  /**
   * Find the position label(s) of the image with the given `src`.
   *
   * Looks at every `.mh_comicpic` element containing an `<img>` whose `src`
   * attribute equals `src` and returns its `p` attribute, left-padded with
   * zeros to the digit count of the total number of `.mh_comicpic` elements.
   *
   * @param {object} page - Puppeteer page of the chapter.
   * @param {string} src - Exact `src` attribute value to look for.
   * @returns {Promise<string[]>} Matching positions (empty when none match).
   */
  async getImgOrder(page, src) {
    return page.$$eval('.mh_comicpic', (elements, src) => {
      const width = String(elements.length).length;
      return elements
        // Compare attribute values directly instead of interpolating `src`
        // into a selector string, which breaks on quotes.
        .filter(el => Array.from(el.querySelectorAll('img')).some(img => img.getAttribute('src') === src))
        .map(el => el.getAttribute('p'))
        // Elements without a `p` attribute would crash padStart.
        .filter(position => position !== null)
        .map(position => position.padStart(width, '0'));
    }, src);
  }
}

/** Resolve after `ms` milliseconds. */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/** Turn scraped text into a single, safe directory name. */
function toSafeDirName(name) {
  const cleaned = String(name).replace(/[\\/:*?"<>|\x00-\x1f]/g, '_').trim();
  return cleaned === '' || cleaned === '.' || cleaned === '..' ? '_' : cleaned;
}
|
||||||
|
|
||||||
|
// Entry point: scrape one manga's info, save its cover, download all chapters
// into ./test and print the collected metadata.
(async () => {
  const scraper = new ColaMangaScraper();
  try {
    await scraper.init();
    const mangaUrl = 'https://www.colamanga.com/manga-od825111/';
    const mangaInfo = await scraper.getMangaInfo(mangaUrl);
    // The method defined on the class is saveBufferAsWebp; the original call
    // used a name that does not exist and threw a TypeError.
    await scraper.saveBufferAsWebp(mangaInfo.coverPic, 'cover.webp', 'test');
    await scraper.downloadChapter(mangaInfo.chapters, 'test');
    console.log(mangaInfo);
  } finally {
    // Always shut the browser down, even when scraping fails. Skip this when
    // init() itself failed and no browser was ever launched.
    if (scraper.browser) {
      await scraper.closeAllPages();
    }
  }
})().catch(error => {
  // Report the failure and signal it through the exit code instead of leaving
  // the promise rejection unhandled.
  console.error('Scrape failed:', error);
  process.exitCode = 1;
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
8
index.js
Normal file
8
index.js
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
const axios = require('axios');
|
||||||
|
|
||||||
|
// Placeholder payload sent to the local insert endpoint.
const postData = {
  mangaId: 'yourMangaId',
  mangaName: 'yourMangaName'
};

// Send the payload and report the outcome; the original left the promise
// floating, so a refused connection surfaced as an unhandled rejection.
axios.post('http://localhost:4000/insert', postData)
  .then(response => {
    console.log(`Insert succeeded with status ${response.status}`);
  })
  .catch(error => {
    console.error('Insert failed:', error.message);
    process.exitCode = 1;
  });
|
||||||
3
manga_urls.json
Normal file
3
manga_urls.json
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"大奉打更人" : "https://www.colamanga.com/manga-lo816008/"
|
||||||
|
}
|
||||||
1840
package-lock.json
generated
Normal file
1840
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
18
package.json
Normal file
18
package.json
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"name": "sunnymh-scrap",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"main": "index.cjs",
|
||||||
|
"scripts": {
|
||||||
|
"start": "node index.cjs",
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"description": "",
|
||||||
|
"dependencies": {
|
||||||
|
"axios": "^1.8.4",
|
||||||
|
"fs": "^0.0.1-security",
|
||||||
|
"puppeteer": "^24.4.0",
|
||||||
|
"sharp": "^0.33.5"
|
||||||
|
}
|
||||||
|
}
|
||||||
81
scraper.js
Normal file
81
scraper.js
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
const puppeteer = require('puppeteer');
|
||||||
|
|
||||||
|
exports.BaseScraper = class BaseScraper {
|
||||||
|
constructor() {
|
||||||
|
this.browser = null;
|
||||||
|
this.pages = [];
|
||||||
|
this.pages_response = [];
|
||||||
|
}
|
||||||
|
|
||||||
|
async init() {
|
||||||
|
this.browser = await puppeteer.launch({
|
||||||
|
// headless: false,
|
||||||
|
args: ['--no-sandbox', '--disable-setuid-sandbox']
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
async openPage(url) {
|
||||||
|
const page = await this.browser.newPage();
|
||||||
|
|
||||||
|
page.on('response', async (response) => {
|
||||||
|
const pageIndex = this.pages.indexOf(page);
|
||||||
|
|
||||||
|
if (response.url().includes('https://res.colamanga.com')) {
|
||||||
|
this.pages_response[pageIndex] = await response.buffer();
|
||||||
|
}
|
||||||
|
else if (/blob:https:\/\/www\.colamanga\.com\//.test(response.url())) {
|
||||||
|
if (!this.pages_response[pageIndex]) {
|
||||||
|
this.pages_response[pageIndex] = {};
|
||||||
|
}
|
||||||
|
this.pages_response[pageIndex][response.url()] = await response.buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
||||||
|
this.pages.push(page);
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async closePage(page) {
|
||||||
|
const pageIndex = this.pages.indexOf(page);
|
||||||
|
if (pageIndex !== -1) {
|
||||||
|
this.pages.splice(pageIndex, 1);
|
||||||
|
this.pages_response.splice(pageIndex, 1);
|
||||||
|
}
|
||||||
|
await page.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
async closeAllPages() {
|
||||||
|
await Promise.all(this.pages.map(page => page.close()));
|
||||||
|
this.pages = [];
|
||||||
|
this.pages_response = [];
|
||||||
|
await this.browser.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
async loadPage(url) {
|
||||||
|
const page = await this.openPage(url);
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
async scrollPage(page) {
|
||||||
|
await page.evaluate(async () => {
|
||||||
|
window.scrollTo(0, 0);
|
||||||
|
await new Promise((resolve) => {
|
||||||
|
const distance = 100; // distance to scroll
|
||||||
|
const delay = 100; // delay between scrolls
|
||||||
|
const scrollInterval = setInterval(() => {
|
||||||
|
window.scrollBy(0, distance);
|
||||||
|
if ((window.innerHeight + window.scrollY) >= document.body.offsetHeight) {
|
||||||
|
clearInterval(scrollInterval);
|
||||||
|
resolve();
|
||||||
|
}
|
||||||
|
}, delay);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user