First commit for scraper
commit a3bf75bd36
.gitignore  vendored  Normal file  (+2)
@@ -0,0 +1,2 @@
/node_modules
/test
colamanga_scraper.js  Normal file  (+131)
@@ -0,0 +1,131 @@
const { BaseScraper } = require('./scraper.js');
const sharp = require('sharp');
const fs = require('fs');
const path = require('path');

class ColaMangaScraper extends BaseScraper {
    constructor() {
        super();
    }

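    // Encode the raw image buffer as WebP (quality 80) and write it into
    // `dir`, creating the directory first if it does not exist.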
    async saveBufferAsWebp(buffer, filename, dir = '.') {
        const dirPath = path.resolve(dir);
        const filePath = path.join(dirPath, filename);

        try {
            await fs.promises.mkdir(dirPath, { recursive: true });
            await sharp(buffer).webp({ quality: 80 }).toFile(filePath);
        } catch (error) {
            console.error(`Failed to save ${filename}:`, error);
        }
    }

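    // Scrape the manga detail page: title, author, aliases, genres, status,
    // the chapter list, and the cover image captured by the response hook.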
    async getMangaInfo(mangaUrl) {
        const page = await this.loadPage(mangaUrl);
        await page.waitForSelector('.fed-deta-info', { visible: true });
        const mangaName = await page.$eval('.fed-deta-content h1', el => el.textContent);
        const elements = await page.$$('.fed-deta-content li');
        const mangaInfo = {
            name: mangaName,
            author: '',
            nickNames: [],
            genres: [],
            status: '',
            chapters: []
        };
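        // The detail-list labels are Chinese: 状态 = status, 作者 = author,
        // 别名 = aliases, 类别 = genres.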
        for (const el of elements) {
            const span = await el.$eval('span', el => el.textContent.trim());
            if (span === '状态') {
                mangaInfo.status = await el.$eval('a', el => el.textContent);
            } else if (span === '作者') {
                mangaInfo.author = await el.$eval('a', el => el.textContent);
            } else if (span === '别名') {
                mangaInfo.nickNames = await el.$$eval('a', els => els.map(el => el.textContent));
            } else if (span === '类别') {
                mangaInfo.genres = await el.$$eval('a', els => els.map(el => el.textContent));
            }
        }

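        // Collect every chapter's name and relative URL from the chapter list.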
        const chapterElements = await page.$$('.all_data_list li');
        mangaInfo.chapters = await Promise.all(chapterElements.map(async el => {
            const chapterName = await el.$eval('a', el => el.textContent);
            const chapterUrl = await el.$eval('a', el => el.getAttribute('href'));
            return {
                name: chapterName,
                url: chapterUrl
            };
        }));

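        // The cover image arrives asynchronously via the response listener in
        // BaseScraper; poll until it has been captured.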
        while (!this.pages_response[this.pages.indexOf(page)]) {
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        mangaInfo.coverPic = this.pages_response[this.pages.indexOf(page)];
        await this.closePage(page);
        return mangaInfo;
    }

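    // Download every image of a chapter into its own directory, skipping
    // chapters that already exist on disk.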
    async downloadChapterPics(chapter, chapterDir = '.') {
        const directoryPath = path.resolve(chapterDir);
        if (fs.existsSync(directoryPath)) {
            console.log(`Skipping ${chapter.name} as it already exists`);
            return;
        }
        fs.mkdirSync(directoryPath, { recursive: true });

        const page = await this.loadPage(chapter.url);
        const pageIndex = this.pages.indexOf(page);
        await page.waitForSelector('.mh_mangalist', { visible: true });

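        // Images lazy-load as the page scrolls; retry up to 10 times until no
        // loading spinners remain and at least one image response is captured.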
        for (let attempt = 0; attempt < 10; attempt++) {
            console.log(`Downloading ${chapter.name}, attempt ${attempt + 1}`);
            await this.scrollPage(page);
            const loadingElements = await page.$$eval('.mh_loading:not([style*="display: none"])', elements => elements.length);
            await new Promise(resolve => setTimeout(resolve, 1000));
            // Guard against the response map not having been created yet.
            if (loadingElements === 0 && this.pages_response[pageIndex] && Object.keys(this.pages_response[pageIndex]).length !== 0) {
                break;
            }
        }

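        // Each captured blob response is one page image; name it by its order
        // on the page and save it as WebP.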
        const responses = this.pages_response[pageIndex];
        for (const [url, response] of Object.entries(responses)) {
            const fileName = (await this.getImgOrder(page, url)) + '.webp';
            // `response` is already a Buffer, which sharp accepts directly,
            // so no Blob round-trip is needed.
            await this.saveBufferAsWebp(response, fileName, chapterDir);
        }

        await this.closePage(page);
    }

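    // Download a list of chapters sequentially, one directory per chapter.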
    async downloadChapter(chapters, dir = '.') {
        const dirPath = path.resolve(dir);
        if (!fs.existsSync(dirPath)) {
            fs.mkdirSync(dirPath, { recursive: true });
        }
        // const chapter = chapters[Math.floor(Math.random() * chapters.length)];
        for (const chapter of chapters) {
            await this.downloadChapterPics(chapter, path.join(dir, chapter.name));
        }
    }

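    // Find the page-order attribute `p` of the container whose <img> src
    // matches `src`, zero-padded so filenames sort naturally.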
    async getImgOrder(page, src) {
        const loadingAttributes = await page.$$eval('.mh_comicpic', (elements, src) => {
            return elements
                .filter(el => el.querySelector(`img[src="${src}"]`))
                .map(el => el.getAttribute('p'))
                .map(str => str.padStart(elements.length.toString().length, '0'));
        }, src);
        // At most one container should match the src; return its order string
        // rather than the single-element array.
        return loadingAttributes[0];
    }
}

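// Example run: scrape one manga's info, save its cover, and download all
// chapters into ./test.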
(async () => {
    const scraper = new ColaMangaScraper();
    await scraper.init();
    const mangaUrl = 'https://www.colamanga.com/manga-od825111/';
    const mangaInfo = await scraper.getMangaInfo(mangaUrl);
    await scraper.saveBufferAsWebp(mangaInfo.coverPic, 'cover.webp', 'test');
    await scraper.downloadChapter(mangaInfo.chapters, 'test');
    await scraper.closeAllPages();
    console.log(mangaInfo);
})();

index.js  Normal file  (+8)
@@ -0,0 +1,8 @@
const axios = require('axios');

const postData = {
    mangaId: 'yourMangaId',
    mangaName: 'yourMangaName'
};

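// Post the manga record to the local API; assumes a server is listening on
// port 4000 with an /insert endpoint.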
axios.post('http://localhost:4000/insert', postData)
    .catch(err => console.error(err));
manga_urls.json  Normal file  (+3)
@@ -0,0 +1,3 @@
{
    "大奉打更人": "https://www.colamanga.com/manga-lo816008/"
}
package-lock.json  generated  Normal file  (+1840)
File diff suppressed because it is too large.
package.json  Normal file  (+18)
@@ -0,0 +1,18 @@
{
    "name": "sunnymh-scrap",
    "version": "1.0.0",
    "main": "index.js",
    "scripts": {
        "start": "node index.js",
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "author": "",
    "license": "ISC",
    "description": "",
    "dependencies": {
        "axios": "^1.8.4",
        "puppeteer": "^24.4.0",
        "sharp": "^0.33.5"
    }
}
scraper.js  Normal file  (+81)
@@ -0,0 +1,81 @@
const puppeteer = require('puppeteer');

exports.BaseScraper = class BaseScraper {
    constructor() {
        this.browser = null;
        this.pages = [];
        this.pages_response = [];
    }

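    // Launch headless Chromium. The no-sandbox flags are commonly needed when
    // running in containers or as root (assumed deployment detail).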
    async init() {
        this.browser = await puppeteer.launch({
            // headless: false,
            args: ['--no-sandbox', '--disable-setuid-sandbox']
        });
    }

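    // Open a new tab, hook its network responses (cover image and blob page
    // images), then navigate to `url`.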
    async openPage(url) {
        const page = await this.browser.newPage();

        page.on('response', async (response) => {
            const pageIndex = this.pages.indexOf(page);

            if (response.url().includes('https://res.colamanga.com')) {
                // Cover image: store the raw buffer for this page.
                this.pages_response[pageIndex] = await response.buffer();
            } else if (/blob:https:\/\/www\.colamanga\.com\//.test(response.url())) {
                // Chapter images arrive as blob: URLs; key each buffer by URL.
                if (!this.pages_response[pageIndex]) {
                    this.pages_response[pageIndex] = {};
                }
                this.pages_response[pageIndex][response.url()] = await response.buffer();
            }
        });

        // Register the page before navigating so the response handler can
        // resolve its index while the page is still loading.
        this.pages.push(page);
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        return page;
    }

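    // Close a single page and drop its entries from the tracking arrays.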
    async closePage(page) {
        const pageIndex = this.pages.indexOf(page);
        if (pageIndex !== -1) {
            this.pages.splice(pageIndex, 1);
            this.pages_response.splice(pageIndex, 1);
        }
        await page.close();
    }

    async closeAllPages() {
        await Promise.all(this.pages.map(page => page.close()));
        this.pages = [];
        this.pages_response = [];
        await this.browser.close();
    }

    async loadPage(url) {
        const page = await this.openPage(url);
        return page;
    }

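    // Scroll from the top of the page to the bottom in small steps to trigger
    // lazy-loaded images.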
    async scrollPage(page) {
        await page.evaluate(async () => {
            window.scrollTo(0, 0);
            await new Promise((resolve) => {
                const distance = 100; // distance to scroll per step (px)
                const delay = 100;    // delay between scrolls (ms)
                const scrollInterval = setInterval(() => {
                    window.scrollBy(0, distance);
                    if ((window.innerHeight + window.scrollY) >= document.body.offsetHeight) {
                        clearInterval(scrollInterval);
                        resolve();
                    }
                }, delay);
            });
        });
    }
}