~lyg/book-crawler.git

New file
			@@ -0,0 +1,607 @@
			import { execFileSync } from "child_process";
			import * as fs from "fs";
			import path from "path";
			import { pipeline } from "stream/promises";
			import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';

			import axios from "axios";
			import * as cheerio from 'cheerio';
			import { HttpsProxyAgent } from "https-proxy-agent";
			import xlsx from "node-xlsx";
			import * as pdfLib from 'pdf-lib';
			import wordsjs from 'wordlist-js';

			import usPlaceList from "./us-place-list.mjs";
			import usPeronNameList from "./us-pseron-name-list.mjs";

			/* ------------- Load configuration --------------- */
			// Settings parsed from ./config.json in the working directory.
			// This file reads startRow, endRow, threadSize and endOfTime from it.
			const config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));

			/* ------------ Logging -------------- */
			// Write stream of the current thread's log file; assigned by initLogger().
			let logFile;

			/**
			 * Replaces console.log with a wrapper that prefixes every message with the
			 * local date/time, prints it through the original console.log and appends
			 * it to ./logs/logs-thread<threadId>.log (the directory is created on demand).
			 */
			function initLogger() {
				const originalLog = console.log;
				const logDirMissing = !fs.existsSync('./logs');
				if (logDirMissing) {
					fs.mkdirSync('./logs', { recursive: true });
				}
				logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
				console.log = (...parts) => {
					const stamp = new Date().toLocaleString();
					const line = `${stamp} ${parts.join(' ')}`;
					originalLog(line);
					logFile.write(`${line}\n`);
				};
			}

			/* ---------- axios proxy ------------ */
			// HTTPS requests are tunnelled through the proxy listening on 127.0.0.1:10809.
			const httpsAgent = new HttpsProxyAgent('http://127.0.0.1:10809');
			// axios' own proxy option is switched off so that only the agent above is used.
			const myAxios = axios.create({ httpsAgent, proxy: false });

			/**
			 * Counts the pages of a PDF file.
			 * The file is read synchronously and parsed with pdf-lib; encryption is ignored
			 * while loading.
			 * @param {string} filepath path of the PDF file
			 * @returns {Promise<number>} number of pages
			 */
			async function getPdfPages(filepath) {
				const pdfBytes = fs.readFileSync(filepath);
				const loadOptions = { ignoreEncryption: true };
				const document = await pdfLib.PDFDocument.load(pdfBytes, loadOptions);
				return document.getPages().length;
			}

			/**
			 * Builds the dictionary used for spell checking: an object whose keys are
			 * every word found in the lists under `wordsjs.default`.
			 *
			 * NOTE(review): the US place and person-name lists are attached to `wordsjs`
			 * itself, while only `wordsjs.default` is iterated below, so as written they
			 * do not appear to end up in the result. Confirm the shape of the
			 * 'wordlist-js' export before relying on them.
			 * @returns {Object<string, boolean>} map of word -> true
			 */
			function allWords() {
				wordsjs.usPlaces = usPlaceList;
				wordsjs.usPeronNameList = usPeronNameList;
				// A missing `default` yields an empty dictionary rather than an error.
				const wordLists = wordsjs.default ?? {};
				const dictionary = {};
				Object.keys(wordLists).forEach((listName) => {
					for (const entry of wordLists[listName]) {
						dictionary[entry] = true;
					}
				});
				return dictionary;
			}

			// Dictionary of known words (word -> true), built once when the module loads.
			const wordsMap = allWords();

			/**
			 * Estimates the number of words in a string by counting the spaces that
			 * remain after runs of two or more spaces are collapsed into one.
			 * Note that the result is the number of separators, i.e. "a b" yields 1.
			 * @param {string} str input string
			 * @returns {number} number of single spaces in the collapsed string
			 */
			function countWordSize(str) {
				const collapsed = str.replace(/[ ]{2,}/g, ' ');
				// Splitting on a space produces one more piece than there are spaces.
				return collapsed.split(' ').length - 1;
			}

			/**
			 * Computes the share of tokens in a text that are neither dictionary words
			 * nor contain a digit.
			 * Runs of spaces are collapsed and one punctuation character directly after a
			 * letter is removed before the text is split on spaces; the dictionary lookup
			 * is done on the lower-cased token.
			 * @param {string} text text to analyse
			 * @returns {number} unknown tokens divided by total tokens (an empty string yields 1)
			 */
			function incorrectWordRatio(text) {
				const normalized = text
					.replace(/[ ]+/g, ' ')
					.replace(/([a-zA-Z])[\.!?,;"')]/g, "$1");
				const tokens = normalized.split(' ');
				let unknownCount = 0;
				for (const token of tokens) {
					const isKnown = Boolean(wordsMap[token.toLocaleLowerCase()]);
					const hasDigit = /\d+/g.test(token);
					if (!isKnown && !hasDigit) {
						unknownCount += 1;
					}
				}
				return unknownCount / tokens.length;
			}

			/**
			 * Share of characters in a text that are neither ASCII letters, digits nor
			 * spaces.
			 * @param {string} text text to analyse
			 * @returns {number} value between 0 and 1; 0 for an empty string
			 */
			function symbolRatio(text) {
				// Guard against 0 / 0 (NaN) on empty input.
				if (text.length === 0) {
					return 0;
				}
				const symbols = text.match(/[^a-zA-Z0-9 ]/g) ?? [];
				return symbols.length / text.length;
			}

			/**
			 * Cleans extracted book text: normalises whitespace and line breaks, then
			 * drops lines that look like scanning noise.
			 *
			 * A line is dropped when any of the following holds (a line is "mostly unknown"
			 * when incorrectWordRatio(line) > 0.65):
			 *  - more than 20% of its characters are symbols and it is mostly unknown;
			 *  - ignoring spaces, it repeats one non-digit character three or more times
			 *    in a row, and it has fewer than 5 word separators or is mostly unknown;
			 *  - it is mostly unknown and contains three or more punctuation marks in a row;
			 *  - it has more than 5 word separators and is mostly unknown;
			 *  - it contains "google" (case-insensitive) with a character on both sides;
			 *  - after trimming and removing the characters ■ • * ¬ » « ^ - it is at most
			 *    one character long, or is exactly "Digitized by".
			 * Kept lines are returned trimmed, with those characters removed.
			 * @param {string} text text to clean
			 * @returns {string} cleaned text, lines joined with "\n"
			 */
			function cleanText(text) {
				text = text.replace(/(\r)/g, '');
				const googlePage = text.substring(0, 10000);
				if (googlePage.includes('google')) {
					// NOTE(review): this pattern looks damaged (escaped pipe, no quantifiers);
					// as written it only matches a "books . google . com" header starting at
					// the second character. Confirm the intended pattern against the upstream file.
					text = googlePage.replace(/^(.\|\n)books[ ]\.[ ]google[ ]\.[ ]*com/ig, '') + text.substring(10000);
				}
				text = text.replace(/[ ]{2,}/g, ' ');
				// NOTE(review): when no line reaches 170 characters the replacement below
				// cannot match, so this branch currently changes nothing. Kept as-is until
				// the intended line-joining rule is confirmed.
				if (!/.{170,}/g.test(text)) {
					text = text.replace(/(.{170,})\n/g, '$1');
				}
				text = text.replace(/\n+/g, '\n');
				// Re-join words that were hyphenated across a line break.
				text = text.replace(/-\n/g, '-');

				const result = [];
				for (const line of text.split('\n')) {
					const incorrectRatio = incorrectWordRatio(line);
					const mostlyUnknown = incorrectRatio > 0.65;
					// Too many symbols.
					if (symbolRatio(line) > 0.2 && mostlyUnknown) {
						continue;
					}
					// One character repeated three or more times once spaces are removed.
					const wordSize = countWordSize(line);
					if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, '')) && (wordSize < 5 || mostlyUnknown)) {
						continue;
					}
					// Three or more punctuation marks in a row.
					if (mostlyUnknown && /([\.,'";:\|!@#$%^&<>?`~•¬»«]){3,}/.test(line)) {
						continue;
					}
					// Longer lines made up mostly of unknown words.
					if (wordSize > 5 && mostlyUnknown) {
						continue;
					}
					// Lines mentioning google.
					if (/.(google)./ig.test(line)) {
						continue;
					}
					// Lines with at most one character left after stripping decoration.
					const ret = line.trim().replace(/[■•*¬»«^-]/g, '');
					if (ret.length <= 1) {
						continue;
					}
					if (ret === 'Digitized by') {
						continue;
					}
					result.push(ret);
				}
				return result.join('\n');
			}

			/**
			 * Extracts an archive with ./7za.exe into a per-thread temporary directory
			 * and copies the largest extracted entry to `txtFile`.
			 * The temporary directory is removed afterwards, also when extraction or
			 * copying fails.
			 * @param {string} zipFile path of the archive
			 * @param {string} txtFile destination path of the extracted text file
			 * @throws {Error} when the archive yields no files, or when 7za / the copy fails
			 */
			function unzip(zipFile, txtFile) {
				const tmpdir = `./tmpdir/${threadId}`;
				try {
					execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);
					// Pick the largest entry; on equal sizes the later directory entry wins.
					const largest = fs.readdirSync(tmpdir)
						.map((name) => ({ name, size: fs.statSync(path.join(tmpdir, name)).size }))
						.reduce((best, entry) => (best === null || entry.size >= best.size ? entry : best), null);
					if (largest === null) {
						throw new Error(`No files were extracted from ${zipFile}`);
					}
					fs.cpSync(path.join(tmpdir, largest.name), txtFile, { force: true });
				} finally {
					// `force` keeps the cleanup from throwing when 7za never created the directory.
					fs.rmSync(tmpdir, { recursive: true, force: true });
				}
			}

			/**
			 * Reads the book list from the first sheet of the workbook and maps the
			 * selected rows to book objects.
			 * @param {number} startRow first row index, inclusive
			 * @param {number} endRow last row index, exclusive
			 * @returns {Array<object>} one object per row, fields taken by column index
			 */
			function getBooksFromExcel(startRow, endRow) {
				const workSheets = xlsx.parse("【反馈客户】7月批次书单 - 已撞库.xlsx");
				const rows = workSheets[0].data.slice(startRow, endRow);
				return rows.map((row) => ({
					id: row[0],
					isbn: row[1],
					title: row[2],
					subTitle: row[3],
					author: row[4],
					publisher: row[5],
					pubDate: row[6],
					ztf: row[7],
					// Column 8 used to be assigned to `format` as well, but the entry for
					// column 13 below always overwrote it; the dead duplicate key is removed.
					language: row[9],
					brief: row[10],
					pages: row[11],
					state: row[12],
					format: row[13],
					file: row[14],
					url: row[15],
				}));
			}

			/**
			 * Turns a title into a search keyword: optionally strips all digits, then
			 * keeps the first six space-separated pieces and joins them with "+".
			 * @param {string} text keyword text to format
			 * @param {boolean} titleWithNumbers when truthy, digits in the text are kept
			 * @returns {string} formatted keyword
			 */
			function formatKw(text, titleWithNumbers) {
				const keyword = titleWithNumbers ? text : text.replace(/[\d]/g, "");
				const pieces = keyword.split(' ');
				return pieces.slice(0, 6).join("+");
			}


			/**
			 * Waits for the given time.
			 * @param {number} ms delay in milliseconds
			 * @returns {Promise<void>} resolves once the timer has fired
			 */
			async function sleep(ms) {
				await new Promise((done) => setTimeout(done, ms));
			}

			/**
			 * Calls `func` and, when it throws or rejects, waits `delay` ms and tries
			 * again, up to `maxTry` additional times (so at most maxTry + 1 calls).
			 * @param {Function} func operation to run; may be sync or async
			 * @param {number} [maxTry=3] number of retries after the first failure
			 * @param {number} [delay=3000] pause between attempts in milliseconds
			 * @returns {Promise<*>} the value produced by the first successful call
			 * @throws the error of the last attempt once the retries are used up
			 */
			async function retry(func, maxTry = 3, delay = 3000) {
				let remaining = maxTry;
				for (;;) {
					try {
						return await func();
					} catch (err) {
						// Written as a negation so a non-numeric budget gives up right away.
						if (!(remaining > 0)) {
							throw err;
						}
						await sleep(delay);
						remaining -= 1;
					}
				}
			}

			/**
			 * Searches libgen.vg by title and returns the URL of the first edition page
			 * found in the result HTML.
			 * The title is percent-encoded so that characters such as "&", "#" or "+"
			 * cannot break or truncate the query string.
			 * @param {{id: *, title: string}} book book whose title is searched
			 * @returns {Promise<string>} detail page URL, or '' when nothing matched or
			 *   the request kept failing after the retries
			 */
			async function getBookDetailPageUrl(book) {
				const query = encodeURIComponent(book.title ?? '');
				const url = `https://libgen.vg/index.php?req=${query}&columns%5B%5D=t&topics%5B%5D=f&res=25&filesuns=all`;
				try {
					return await retry(async () => {
						const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
						const group = /.href="(edition.php\?id=\d+)"./g.exec(resp.data);
						return group ? `https://libgen.vg/${group[1]}` : '';
					});
				} catch (e) {
					// Callers treat '' as "no search result"; log the cause so a network
					// failure can be told apart from a genuinely empty result.
					console.log(`搜索失败: ${book.id} ${book.title} ${e?.message ?? e}`);
					return '';
				}
			}

			/**
			 * Opens a book's edition page and returns the URL of its file page,
			 * preferring an EPUB entry over a PDF entry.
			 * Table rows that carry no link in the first cell, or no "Extension: xxx"
			 * text in the second cell, are skipped instead of failing the whole page.
			 * @param {object} book book being processed; `state` is set when the page cannot be opened
			 * @param {string} detailPageUrl URL of the edition page
			 * @returns {Promise<string>} file page URL, or '' when there is no EPUB/PDF
			 *   entry or the page could not be opened after the retries
			 */
			async function openBookDetailPage(book, detailPageUrl) {
				console.log(`打开详情: ${detailPageUrl}`);
				try {
					return await retry(async () => {
						const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
						const html = cheerio.load(resp.data);
						let epubUrl = null;
						let pdfUrl = null;
						for (const tr of html('tr')) {
							const trEle = cheerio.load(tr);
							const href = trEle('td:nth-child(1) > a').attr('href');
							const tdEle = trEle('td:nth-child(2)');
							// Line breaks become spaces so the extension is delimited by whitespace.
							tdEle.find('br').replaceWith(' ');
							const gp = /.* Extension: (\S+) \S*/.exec(tdEle.text());
							if (!href || !gp) {
								continue;
							}
							const ext = gp[1].toLowerCase();
							if (ext === 'pdf') {
								pdfUrl = `https://libgen.vg/${href}`;
							} else if (ext === 'epub') {
								epubUrl = `https://libgen.vg/${href}`;
							}
						}
						return epubUrl ?? pdfUrl ?? '';
					});
				} catch (e) {
					book.state = "打开详情页失败";
					console.log(`打开详情页失败: ${book.id} ${book.title}`);
					return '';
				}
			}

			/**
			 * Fetches a file page and extracts the first `get.php?md5=...` download link.
			 * NOTE(review): the pattern only allows a single character between the hex
			 * digest and the closing quote; confirm this matches the links the site emits.
			 * @param {object} book book being processed; `state` is set when the request fails
			 * @param {string} url URL of the file page
			 * @returns {Promise<string>} absolute download URL, or '' when no link was
			 *   found or the request kept failing after the retries
			 */
			async function getDownloadUrl(book, url) {
				const requestOptions = { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } };
				try {
					return await retry(async () => {
						const resp = await myAxios.get(url, requestOptions);
						const match = /.href="(get.php\?md5=[0-9a-f]+.)".*/g.exec(resp.data);
						if (!match) {
							return '';
						}
						return `https://libgen.vg/${match[1]}`;
					});
				} catch (err) {
					book.state = "获取下载链接失败";
					console.log(`获取下载链接失败: ${book.id} ${book.title}`);
					return '';
				}
			}

			/**
			 * Extracts the file extension from a Content-Disposition header value.
			 * Handles both quoted and unquoted `filename=` values.
			 * @param {string|undefined} disposition header value
			 * @returns {string} extension without the dot ('' when the name has none)
			 * @throws {Error} when the header is missing or carries no filename
			 */
			function extensionFromContentDisposition(disposition) {
				const match = /filename="?([^";]+)"?/i.exec(disposition ?? '');
				if (!match) {
					throw new Error(`Content-Disposition has no filename: ${disposition}`);
				}
				return path.extname(match[1].trim()).slice(1);
			}

			/**
			 * Downloads a file into ./downloads/<id> <isbn>.<ext> and records the
			 * outcome on the book object (`state`, `format`, `file`, `url`, `pages`).
			 *
			 * Each attempt is limited to 10 minutes and failed attempts go through
			 * retry(). A 401/403 response is not retried: the book is marked
			 * "没有下载权限". When every attempt fails the book is marked "下载失败".
			 * If the target file already exists the response body is discarded and the
			 * book is marked as downloaded.
			 * @param {object} book book being downloaded; mutated with the outcome
			 * @param {string} url download URL
			 * @returns {Promise<void>} settles when the download has finished or failed;
			 *   failures are reported through `book.state`, not by rejecting
			 */
			async function downloadFile(book, url) {
				console.log(`下载文件: ${url}`);
				await retry(async () => {
					const timeoutTime = 10 * 60 * 1000;
					const source = axios.CancelToken.source();
					// Hard deadline for the whole transfer, not only for the response headers.
					const timeout = setTimeout(() => {
						source.cancel("timeout");
					}, timeoutTime);
					try {
						const response = await myAxios.get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token });
						const ext = extensionFromContentDisposition(response.headers['content-disposition']);
						const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
						book.url = url;
						if (fs.existsSync(filepath)) {
							// Nothing to write: release the connection and settle instead of
							// leaving the caller waiting forever.
							response.data.destroy();
							book.state = `下载完成`;
							book.format = ext;
							book.file = filepath;
							console.log(`下载完成：${filepath}`);
							return true;
						}
						try {
							// Resolves only after the data has been flushed to disk.
							await pipeline(response.data, fs.createWriteStream(filepath));
						} catch (err) {
							console.error(err);
							// Remove the partial file so a retry does not mistake it for a
							// finished download.
							await fs.promises.rm(filepath, { force: true });
							throw err;
						}
						book.state = `下载完成`;
						book.format = ext;
						book.file = filepath;
						book.url = url;
						// Page counting is best effort; non-PDF or unreadable files count as 0.
						book.pages = await getPdfPages(filepath).catch(() => 0);
						return true;
					} catch (e) {
						console.log(`下载失败，错误码: ${e?.response?.status ?? e?.code}`);
						book.url = url;
						const status = e?.response?.status;
						if (status === 403 || status === 401) {
							book.state = "没有下载权限";
							console.log(`下载失败: ${book.id} ${book.title} ${url}`);
							return true;
						}
						throw e;
					} finally {
						clearTimeout(timeout);
					}
				}).catch(() => {
					book.state = "下载失败";
					console.log(`下载失败: ${book.id} ${book.title} ${url}`);
					return false;
				});
			}

			/**
			 * Tells whether a book is listed among the already downloaded ones.
			 * Books are identified by the key "<id> <isbn>".
			 * @param {{id: *, isbn: *}} book book to look up
			 * @returns {boolean} true when the key is present in alreadyDownloadedBooks
			 */
			function isAlreadyDownloaded(book) {
				const { id, isbn } = book;
				const key = `${id} ${isbn}`;
				return alreadyDownloadedBooks.includes(key);
			}

			/**
			 * Asks the parent thread for the next book.
			 * Posts a 'get-book' request and resolves with the `data` of the first
			 * 'book' message that arrives; the message listener is removed afterwards.
			 * @returns {Promise<*>} the payload of the 'book' message
			 */
			function nextBook() {
				return new Promise((resolve) => {
					function onMessage(message) {
						// Ignore messages of any other type and keep listening.
						if (message.type !== 'book') {
							return;
						}
						resolve(message.data);
						parentPort.removeListener('message', onMessage);
					}
					parentPort.on('message', onMessage);
					parentPort.postMessage({ type: 'get-book', threadId });
				});
			}


			/**
			 * Worker loop: keeps requesting books from the parent thread and downloads
			 * them one by one until no book is left or the configured time limit
			 * (`config.endOfTime`, in minutes) is exceeded.
			 *
			 * Every received book is appended to `books`. The module counters
			 * bookCount, skipCount and successCount are updated along the way; a book
			 * counts as a success only when its state ends up as "下载完成".
			 * @param {Array<object>} books output list that receives every book handled by this thread
			 * @returns {Promise<void>}
			 */
			async function downloadBooks(books) {
				// States recorded by earlier runs that make another attempt pointless.
				const finalStates = ["没有搜索结果", "没有pdf或text文件", "下载完成"];

				for (;;) {
					const book = await nextBook();
					if (!book) {
						break;
					}
					books.push(book);
					if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
						// Time limit reached: stop.
						break;
					}
					bookCount++;
					if (isAlreadyDownloaded(book)) {
						skipCount++;
						book.skip = true;
						continue;
					}
					if (finalStates.includes(book.state)) {
						skipCount++;
						continue;
					}
					console.log(`开始下载: ${book.id} ${book.title}`);
					// Search for the book.
					const detailPageUrl = await getBookDetailPageUrl(book);
					if (!detailPageUrl) {
						book.state = "没有搜索结果";
						continue;
					}
					// Pause before opening the detail page.
					await sleep(getRandomNumber(500, 1000));
					// Open the detail page and pick the file page.
					const filePageUrl = await openBookDetailPage(book, detailPageUrl);
					if (!filePageUrl) {
						console.log(`没有文件: ${book.id} ${book.title}`);
						continue;
					}
					const url = await getDownloadUrl(book, filePageUrl);
					if (!url) {
						console.log(`没有文件: ${book.id} ${book.title}`);
						continue;
					}
					// Pause before downloading.
					await sleep(getRandomNumber(500, 1000));
					try {
						await downloadFile(book, url);
					} catch (e) {
						// downloadFile reports failures through book.state; anything that
						// still escapes is logged rather than silently dropped.
						console.error(e);
						book.state = "下载失败";
					}
					if (book.state === "下载完成") {
						successCount++;
						console.log(`下载完成: ${book.id} ${book.title}`);
						console.log('finish: ' + JSON.stringify(book));
					}
					// Pause before the next book.
					await sleep(getRandomNumber(500, 1000));
				}
			}

			/**
			 * Writes the download results back into the workbook.
			 * The first sheet is re-read, the row whose first cell equals the book id is
			 * updated (publisher, publication date, pages, state, format, file, url) and
			 * the data is written back as a single sheet named "Sheet1". If writing the
			 * workbook fails, the sheet data is dumped to "<timestamp>.json" instead.
			 * @param {Array<object>} books books whose results should be saved
			 */
			function saveBooks(books) {
				console.log("保存下载状态数据");
				const workSheets = xlsx.parse("【反馈客户】7月批次书单 - 已撞库.xlsx");
				const data = workSheets[0].data;

				// Index rows by id once instead of scanning the sheet for every book.
				// Only the first row of each id is kept, matching a first-match search.
				const rowById = new Map();
				for (const row of data) {
					if (!rowById.has(row[0])) {
						rowById.set(row[0], row);
					}
				}
				for (const book of books) {
					const row = rowById.get(book.id);
					if (!row) {
						continue;
					}
					row[5] = book.publisher;
					row[6] = book.pubDate;
					row[11] = book.pages;
					row[12] = book.state;
					row[13] = book.format;
					row[14] = book.file;
					row[15] = book.url;
				}

				const buffer = xlsx.build([{ name: "Sheet1", data }]);
				try {
					fs.writeFileSync("./【反馈客户】7月批次书单 - 已撞库.xlsx", buffer);
					console.log("保存完成: ./【反馈客户】7月批次书单 - 已撞库.xlsx");
				} catch (e) {
					console.error(e);
					const outfile = `${Date.now()}.json`;
					fs.writeFileSync(outfile, JSON.stringify(data));
					console.log("保存完成: " + outfile);
				}
			}


			/**
			 * Formats a duration in milliseconds as days/hours/minutes/seconds,
			 * e.g. "1天1时1分1秒". The day part is left out when it is zero.
			 * @param {number} ms duration in milliseconds
			 * @returns {string} formatted duration
			 */
			function msFormat(ms) {
				const totalSeconds = Math.floor(ms / 1000);
				const totalMinutes = Math.floor(totalSeconds / 60);
				const totalHours = Math.floor(totalMinutes / 60);
				const days = Math.floor(totalHours / 24);

				const parts = [];
				if (days > 0) {
					parts.push(`${days}天`);
				}
				parts.push(`${totalHours % 24}时`, `${totalMinutes % 60}分`, `${totalSeconds % 60}秒`);
				return parts.join('');
			}

			/**
			 * Returns a random floating point number in the range [min, max).
			 * @param {number} min lower bound, inclusive
			 * @param {number} max upper bound, exclusive
			 * @returns {number} random value
			 */
			function getRandomNumber(min, max) {
				const span = max - min;
				const offset = Math.random() * span;
				return offset + min;
			}

			// Time at which this module was loaded (ms since the epoch); used for the
			// elapsed-time reports and the optional time limit.
			const startTime = Date.now();
			// Number of successfully downloaded books
			let successCount = 0;
			// Number of books handled
			let bookCount = 0;
			// Number of skipped books: already downloaded, or marked by an earlier run
			// as having no search result
			let skipCount = 0;
			// "<id> <isbn>" keys of books that are already downloaded; worker threads
			// receive the list through workerData (see main()).
			let alreadyDownloadedBooks = [];

			/**
			 * Collects the keys of the books that are already downloaded.
			 * Sources are the non-empty lines of ./alreadyDownloadedBooks.txt followed by
			 * the file names in ./downloads; every entry is reduced to its base name
			 * without extension. A missing list file or downloads directory counts as empty.
			 * @returns {string[]} keys of the form produced by the download file names
			 */
			function getAlreadyDownloadedBooks() {
				const listFile = './alreadyDownloadedBooks.txt';
				const downloadDir = './downloads';

				const listed = fs.existsSync(listFile)
					? fs.readFileSync(listFile, 'utf-8')
						.replace(/\r/g, '')
						.split('\n')
						.map((line) => line.trim())
						.filter((line) => line)
					: [];
				const files = fs.existsSync(downloadDir) ? fs.readdirSync(downloadDir) : [];

				// Concatenating (rather than push(...files)) stays safe for very large directories.
				return [...listed, ...files].map((entry) => path.basename(entry, path.extname(entry)).trim());
			}

			/**
			 * Worker entry point: sets up logging, runs the download loop and, once it
			 * has finished or failed, posts the handled books to the parent thread as a
			 * 'books' message and closes the log file.
			 * The work runs in the background; the function itself returns immediately.
			 */
			function startDownload() {
				initLogger();
				const books = [];
				const run = async () => {
					try {
						await downloadBooks(books);
						console.log(`线程：${threadId}全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
					} catch (e) {
						console.error(e);
					} finally {
						// The parent thread persists the results.
						parentPort.postMessage({ type: "books", data: books });
						logFile.close();
					}
				};
				run();
			}

			/**
			 * Program entry point for both the main thread and the worker threads.
			 *
			 * Main thread: reads the book list, starts `config.threadSize` workers,
			 * hands out one book per 'get-book' request and, once every worker has
			 * reported its 'books' (or has crashed), prints a summary and saves the
			 * results. SIGINT saves what has been reported so far and exits.
			 *
			 * Worker thread: takes the already-downloaded list from workerData and
			 * starts the download loop.
			 */
			function main() {
				for (const dir of ['tmpdir', 'downloads']) {
					if (!fs.existsSync(dir)) {
						fs.mkdirSync(dir, { recursive: true });
					}
				}

				if (!isMainThread) {
					alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
					startDownload();
					return;
				}

				initLogger();
				let downloadCnt = 0;
				const downloadedKeys = getAlreadyDownloadedBooks();
				const { startRow, endRow, threadSize } = config;
				console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
				let finishThreadCnt = 0;
				const finishBooks = [];
				const books = getBooksFromExcel(startRow, endRow);

				// Recomputes the counters from the reported books, prints the summary
				// and writes the results back to the workbook.
				const summarizeAndSave = (headline) => {
					successCount = finishBooks.filter(it => it.state === '下载完成').length;
					skipCount = finishBooks.filter(it => it.skip).length;
					console.log(`${headline}，共下载${downloadCnt}本，成功下载${successCount}本，跳过${skipCount}，失败${downloadCnt - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
					saveBooks(finishBooks);
				};

				for (let i = 0; i < threadSize; i++) {
					const worker = new Worker("./src/book-download.mjs", { workerData: { alreadyDownloadedBooks: downloadedKeys } });
					let finished = false;
					// Counts each worker at most once, whether it reported or crashed.
					const markFinished = () => {
						if (finished) {
							return;
						}
						finished = true;
						finishThreadCnt++;
						if (finishThreadCnt >= threadSize) {
							summarizeAndSave('全部线程完成');
						}
					};
					worker.on("message", (message) => {
						if (message.type === 'books') {
							for (const book of message.data) {
								finishBooks.push(book);
							}
							markFinished();
						} else if (message.type === 'get-book') {
							const book = books.shift();
							// The request that finds the queue empty hands out nothing and
							// must not be counted as a processed book.
							if (book) {
								downloadCnt++;
							}
							worker.postMessage({ type: "book", data: book });
						}
					});
					// Without a listener a worker crash would take the whole process down
					// before the results of the other workers are saved.
					worker.on("error", (err) => {
						console.error(err);
						markFinished();
					});
				}

				// On Ctrl+C, save what the workers have reported so far.
				process.on('SIGINT', () => {
					summarizeAndSave('进程被手动结束');
					process.exit(0);
				});
			}

			// Runs on module load, in the main thread as well as in every worker thread.
			main();