import xlsx from "node-xlsx";
|
import axios from "axios";
|
import * as fs from "fs";
|
import path from "path";
|
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
|
import { HttpsProxyAgent } from "https-proxy-agent";
|
import { execFileSync } from "child_process";
|
import wordsjs from 'wordlist-js';
|
import usPlaceList from "./us-place-list.mjs";
|
import usPeronNameList from "./us-pseron-name-list.mjs";
|
import * as pdfLib from 'pdf-lib';
|
import * as cheerio from 'cheerio';
|
|
/* ------------- Load configuration ------------- */

// Parsed once, synchronously, when the module is loaded. The fields read
// elsewhere in this file are startRow, endRow, threadSize and endOfTime.
let config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));
|
|
/* ------------ Logging ------------ */

// Append-mode write stream for the current thread's log file; set by initLogger().
let logFile;

/**
 * Replaces console.log with a wrapper that prefixes every message with a
 * local timestamp, prints it through the previous console.log and appends the
 * same line to ./logs/logs-thread<threadId>.log.
 * The ./logs directory is created when it does not exist yet.
 */
function initLogger() {
  const printToConsole = console.log;

  if (!fs.existsSync('./logs')) {
    fs.mkdirSync('./logs', { recursive: true });
  }
  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });

  console.log = (...parts) => {
    // Arguments are joined with single spaces, so objects are logged via their string form.
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    printToConsole(line);
    logFile.write(`${line}\n`);
  };
}
|
|
/* ---------- axios proxy ---------- */

// HTTPS requests go through the local HTTP proxy via this agent.
const httpsAgent = new HttpsProxyAgent(`http://127.0.0.1:10809`);

// axios' own proxy option is switched off so that only the agent above is used.
const myAxios = axios.create({ httpsAgent, proxy: false });
|
|
/**
 * Counts the pages of a PDF file.
 *
 * The file is read synchronously and parsed with pdf-lib; the document is
 * loaded with `ignoreEncryption: true`.
 *
 * @param {string} filepath Path of the PDF file.
 * @returns {Promise<number>} Number of pages in the document.
 */
async function getPdfPages(filepath) {
  const bytes = fs.readFileSync(filepath);
  const doc = await pdfLib.PDFDocument.load(bytes, { ignoreEncryption: true });
  return doc.getPages().length;
}
|
|
/**
 * Builds the dictionary used to decide whether a token is a known word.
 *
 * Sources are every list exported under `wordsjs.default` plus the two local
 * lists (US place names and US person names). The previous version attached
 * the local lists to `wordsjs` but only iterated `wordsjs.default`, so they
 * were never indexed; it also mutated the imported module.
 *
 * Keys are stored lower-cased because lookups lower-case the token first, so
 * a capitalised entry could never match before.
 *
 * @returns {Object<string, true>} Prototype-less map of known word -> true.
 */
function allWords() {
  // No prototype, so inherited names such as "constructor" are not treated as known words.
  const words = Object.create(null);
  const sources = {
    ...wordsjs.default,
    usPlaces: usPlaceList,
    usPeronNameList,
  };

  for (const list of Object.values(sources)) {
    // Skip anything that cannot be iterated instead of crashing at module load.
    if (list == null || typeof list[Symbol.iterator] !== 'function') {
      continue;
    }
    for (const word of list) {
      words[String(word).toLocaleLowerCase()] = true;
    }
  }
  return words;
}
|
|
// Lookup table of known words, built once at module load: keys are the words, values are `true`.
const wordsMap = allWords();
|
|
/**
 * Estimates the word count of a string by counting its space separators.
 *
 * Runs of two or more spaces are collapsed to one first, then every remaining
 * space is counted. "a b c" therefore yields 2, a single word yields 0, and
 * leading or trailing spaces are counted as well.
 *
 * @param {string} str Text to inspect.
 * @returns {number} Number of single-space separators.
 */
function countWordSize(str) {
  const collapsed = str.replace(/[ ]{2,}/g, ' ');
  // Splitting on n separators produces n + 1 pieces.
  return collapsed.split(' ').length - 1;
}
|
|
/**
 * Computes the share of tokens in a text that are not known words.
 *
 * The text is split on spaces after removing one punctuation character that
 * directly follows a letter (e.g. "word." -> "word"). A token is "incorrect"
 * when its lower-cased form is missing from `wordsMap` and it contains no digit.
 *
 * Empty tokens from leading/trailing spaces are ignored; they used to be
 * counted as misspelled words and inflated the ratio. Text without any token
 * yields 0.
 *
 * @param {string} text Text to inspect.
 * @returns {number} Ratio in the range 0..1.
 */
function incorrectWordRatio(text) {
  const normalized = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1");
  const words = normalized.split(' ').filter((word) => word !== '');
  if (words.length === 0) {
    return 0;
  }
  const incorrectWordCnt = words.filter(
    (word) => !wordsMap[word.toLocaleLowerCase()] && !/\d/.test(word),
  ).length;
  return incorrectWordCnt / words.length;
}
|
|
/**
 * Computes the share of characters that are neither ASCII letters, ASCII
 * digits nor spaces.
 *
 * @param {string} text Text to inspect.
 * @returns {number} Ratio in the range 0..1; 0 for an empty string (this used
 *   to be NaN because of a division by zero).
 */
function symbolRatio(text) {
  if (text.length === 0) {
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g) || [];
  return symbols.length / text.length;
}
|
|
/**
 * Cleans extracted book text line by line.
 *
 * Steps, in order:
 *  1. Remove carriage returns.
 *  2. If the first 10000 characters contain "google", drop everything up to
 *     and including the last "books.google.com" found in that prefix.
 *  3. Collapse runs of spaces, collapse runs of newlines and glue lines that
 *     end with a hyphen to the following line.
 *  4. Drop lines that look like noise (see the rules in the loop) and strip
 *     the characters ■ • * ¬ » « ^ - from the lines that are kept.
 *
 * @param {string} text Text to clean.
 * @returns {string} The kept lines joined with "\n".
 */
function cleanText(text) {
  let cleaned = text.replace(/(\r)/g, '');

  const head = cleaned.substring(0, 10000);
  if (head.includes('google')) {
    const strippedHead = head.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '');
    cleaned = strippedHead + cleaned.substring(10000);
  }

  cleaned = cleaned.replace(/[ ]{2,}/g, ' ');
  // NOTE(review): this replace only runs when no line reaches 170 characters,
  // so it can never match anything. Kept as-is; confirm the intended condition.
  if (!/.{170,}/g.test(cleaned)) {
    cleaned = cleaned.replace(/(.{170,})\n/g, '$1');
  }
  cleaned = cleaned.replace(/\n+/g, '\n').replace(/-\n/g, '-');

  const kept = [];
  for (const line of cleaned.split('\n')) {
    const incorrectRatio = incorrectWordRatio(line);
    const mostlyUnknown = incorrectRatio > 0.65;

    // Symbol-heavy line that is also mostly unknown words.
    if (symbolRatio(line) > 0.2 && mostlyUnknown) {
      continue;
    }

    // A non-digit character repeated 3+ times (spaces ignored), on a short or
    // mostly-unknown line.
    const wordSize = countWordSize(line);
    const hasTripleChar = /([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''));
    if (hasTripleChar && (wordSize < 5 || mostlyUnknown)) {
      continue;
    }

    // Three or more consecutive punctuation characters on a mostly-unknown line.
    if (mostlyUnknown && /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/.test(line)) {
      continue;
    }

    // Longer line (more than 5 separators) made of mostly unknown words.
    if (wordSize > 5 && mostlyUnknown) {
      continue;
    }

    // Any mention of "google", case-insensitive.
    if (/.*(google).*/ig.test(line)) {
      continue;
    }

    // After trimming and stripping decoration characters, at most one character is left.
    const stripped = line.trim().replace(/[■•*¬»«^-]/g, '');
    if (stripped.length <= 1) {
      continue;
    }

    if (stripped == 'Digitized by') {
      continue;
    }

    kept.push(stripped);
  }

  return kept.join('\n');
}
|
|
/**
 * Extracts an archive with ./7za.exe into a per-thread temp directory and
 * copies the largest top-level entry to `txtFile`.
 *
 * The temp directory is removed even when extraction or copying fails; it
 * used to be left behind on any error.
 *
 * @param {string} zipFile Path of the archive.
 * @param {string} txtFile Destination path for the extracted file.
 * @throws {Error} When the archive produces no files, or when 7za or the copy fails.
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);

    const entries = fs.readdirSync(tmpdir).map((name) => ({
      name,
      size: fs.statSync(`${tmpdir}/${name}`).size,
    }));
    if (entries.length === 0) {
      throw new Error(`Archive ${zipFile} produced no files`);
    }

    // Largest entry wins; on equal sizes the later directory entry is kept.
    const largest = entries.reduce((best, entry) => (entry.size >= best.size ? entry : best));
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    // `force` keeps cleanup from masking the real error when 7za never created the directory.
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}
|
|
/**
 * Reads the books to download from the first sheet of the Excel workbook.
 *
 * Each row is mapped by column position (0 = id ... 15 = url). `format` comes
 * from column 13, the same column saveBooks() writes it back to. The object
 * literal used to list `format` twice; the first entry (column 8) was always
 * overwritten and has been removed.
 *
 * @param {number} startRow First row to read (inclusive).
 * @param {number} endRow Row to stop at (exclusive).
 * @param {string} [file] Workbook path; defaults to the batch list used by this script.
 * @returns {Array<object>} One book object per row; empty when the workbook has no sheet.
 */
function getBooksFromExcel(startRow, endRow, file = "【反馈客户】7月批次书单 - 已撞库.xlsx") {
  const [sheet] = xlsx.parse(file);
  const rows = (sheet?.data ?? []).slice(startRow, endRow);

  return rows.map((row) => ({
    id: row[0],
    isbn: row[1],
    title: row[2],
    subTitle: row[3],
    author: row[4],
    publisher: row[5],
    pubDate: row[6],
    ztf: row[7],
    // Kept in this position so the serialized key order of a book is unchanged.
    format: row[13],
    language: row[9],
    brief: row[10],
    pages: row[11],
    state: row[12],
    file: row[14],
    url: row[15],
  }));
}
|
|
/**
 * Turns a title into a search keyword string.
 *
 * Digits are removed unless `titleWithNumbers` is truthy; the text is then
 * split on single spaces and the first six pieces are joined with "+".
 *
 * @param {string} text Keyword text to format.
 * @param {boolean} titleWithNumbers Whether digits in the title are kept.
 * @returns {string} The formatted keyword string.
 */
function formatKw(text, titleWithNumbers) {
  const keyword = titleWithNumbers ? text : text.replace(/[\d]/g, "");
  const leadingParts = keyword.split(' ').slice(0, 6);
  return leadingParts.join("+");
}
|
|
|
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms Delay passed to setTimeout.
 * @returns {Promise<void>} Promise that settles when the timer fires.
 */
async function sleep(ms) {
  return new Promise((wake) => setTimeout(wake, ms));
}
|
|
/**
 * Calls `func` until it succeeds or the retry budget is used up.
 *
 * `maxTry` is the number of retries after the first attempt, so `func` runs at
 * most `maxTry + 1` times. `delay` milliseconds pass between attempts.
 *
 * @param {Function} func Function to call; it may return a promise.
 * @param {number} [maxTry=3] Retries allowed after the first failure.
 * @param {number} [delay=3000] Pause between attempts in milliseconds.
 * @returns {Promise<*>} The value of the first successful call.
 * @throws Whatever the last attempt threw once the budget is exhausted.
 */
async function retry(func, maxTry = 3, delay = 3000) {
  for (let retriesLeft = maxTry; ; retriesLeft -= 1) {
    try {
      return await func();
    } catch (err) {
      // `!(x > 0)` rather than `x <= 0` so a non-numeric budget stops immediately.
      if (!(retriesLeft > 0)) {
        throw err;
      }
      await sleep(delay);
    }
  }
}
|
|
/**
 * Searches libgen.rs fiction by title and returns the first library.lol
 * detail-page link found in the result HTML.
 *
 * The title is form-encoded, so characters such as "&", "#" or "?" no longer
 * break the query string. Non-string titles (e.g. a numeric spreadsheet cell)
 * are converted with String() instead of throwing, and a blank title
 * short-circuits without a request.
 *
 * @param {{title: *}} book Book whose title is searched.
 * @returns {Promise<string>} The detail-page URL, or '' when nothing was found
 *   or the request kept failing.
 */
async function getBookDetailPageUrl(book) {
  const title = String(book.title ?? '').trim();
  if (title === '') {
    return '';
  }

  // URLSearchParams encodes spaces as "+", matching the previous hand-built URL.
  const query = new URLSearchParams({ q: title, criteria: 'title', language: '', format: '' });
  const url = `https://libgen.rs/fiction/?${query}`;

  try {
    return await retry(async () => {
      const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
      // The greedy ".*" prefix is kept on purpose: on the first matching line
      // it selects the last library.lol link, exactly as before.
      const group = /.*href="(http:\/\/library\.lol\/fiction\/[0-9a-zA-Z]+)".*/.exec(resp.data);
      return group ? group[1] : '';
    });
  } catch {
    // Best effort: the caller treats '' as "no search result".
    return '';
  }
}
|
|
/**
 * Opens a detail page and picks the file link to download, preferring EPUB
 * over PDF.
 *
 * Every table row is inspected: the link comes from the first cell and the
 * extension from the "Extension: xxx" text of the second cell. Rows without a
 * link or without an extension are skipped. Previously such a row threw a
 * TypeError, which made the whole page fail after all retries.
 *
 * @param {object} book Book being processed; `state` is set when the page cannot be opened.
 * @param {string} detailPageUrl URL of the detail page.
 * @returns {Promise<string>} URL of the EPUB (or else PDF) file page, or ''.
 */
async function openBookDetailPage(book, detailPageUrl) {
  console.log(`打开详情: ${detailPageUrl}`);
  try {
    return await retry(async () => {
      const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
      const html = cheerio.load(resp.data);
      let epubUrl = null;
      let pdfUrl = null;

      for (const tr of html('tr')) {
        const trEle = cheerio.load(tr);
        const href = trEle('td:nth-child(1) > a').attr('href');
        const tdEle = trEle('td:nth-child(2)');
        // Line breaks become spaces so the extension regex sees separated fields.
        tdEle.find('br').replaceWith(' ');
        const gp = /.* Extension: (\S+) \S*/.exec(tdEle.text());
        if (!href || !gp) {
          continue;
        }

        const ext = gp[1].toLowerCase();
        if (ext === 'pdf') {
          pdfUrl = `https://libgen.vg/${href}`;
        } else if (ext === 'epub') {
          epubUrl = `https://libgen.vg/${href}`;
        }
      }

      return epubUrl ?? pdfUrl ?? '';
    });
  } catch {
    book.state = "打开详情页失败";
    console.log(`打开详情页失败: ${book.id} ${book.title}`);
    return '';
  }
}
|
|
/**
 * Fetches a page and extracts the target of its "GET" link.
 *
 * The request goes through retry(). When every attempt fails, the book is
 * marked with a failure state and '' is returned.
 *
 * @param {object} book Book being processed; `state` is set on failure.
 * @param {string} url Page that contains the "GET" link.
 * @returns {Promise<string>} The href of the GET link, or '' when absent or on failure.
 */
async function getDownloadUrl(book, url) {
  const fetchLink = async () => {
    const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
    const match = /.*href="(\S+)".*>GET<.*/g.exec(resp.data);
    return match ? `${match[1]}` : '';
  };

  try {
    return await retry(fetchLink);
  } catch {
    book.state = "获取下载链接失败";
    console.log(`获取下载链接失败: ${book.id} ${book.title}`);
    return '';
  }
}
|
|
/**
 * Works out the file extension of a download.
 * Tries the Content-Disposition filename first, then the URL path. Only
 * alphanumeric extensions are accepted because the value becomes part of a
 * local file path.
 */
function resolveDownloadExtension(contentDisposition, url) {
  const candidates = [];
  const named = /filename\*?=(?:UTF-8'')?"?([^";]+)/i.exec(contentDisposition ?? '');
  if (named) {
    candidates.push(named[1]);
  }
  try {
    candidates.push(decodeURIComponent(new URL(url).pathname));
  } catch {
    // Not an absolute URL or malformed escapes: no fallback candidate.
  }

  for (const candidate of candidates) {
    const ext = path.extname(candidate.trim()).slice(1);
    if (/^[0-9A-Za-z]+$/.test(ext)) {
      return ext;
    }
  }
  throw new Error(`Cannot determine file extension for ${url}`);
}

/**
 * Pipes a readable stream into a file and settles once the data is flushed.
 * On any failure the partial file is removed, so a later attempt does not
 * mistake it for a finished download.
 */
function saveStreamToFile(stream, filepath) {
  return new Promise((resolve, reject) => {
    const out = fs.createWriteStream(filepath);
    let settled = false;

    const fail = (err) => {
      if (settled) {
        return;
      }
      settled = true;
      // Delete only after the descriptor is closed (required on Windows).
      out.once('close', () => fs.rm(filepath, { force: true }, () => reject(err)));
      stream.destroy?.();
      out.destroy();
    };

    stream.once('error', fail);
    out.once('error', fail);
    // A source closed before 'end' (e.g. cancelled request) would otherwise hang forever.
    stream.once('close', () => {
      if (!stream.readableEnded) {
        fail(new Error('download stream closed before completion'));
      }
    });
    // 'finish' on the writable, not 'end' on the readable, means the bytes are written.
    out.once('finish', () => {
      if (!settled) {
        settled = true;
        resolve();
      }
    });

    stream.pipe(out);
  });
}

/**
 * Downloads `url` to ./downloads/<book.id>.<ext> and records the outcome on
 * `book` (state, format, file, url).
 *
 * Each attempt is limited to ten minutes and failed attempts go through
 * retry(). HTTP 401/403 is recorded as "no permission" and not retried.
 *
 * Fixes over the previous version:
 *  - the promise never settled when the target file already existed;
 *  - success was reported on the readable's 'end', before the file was flushed;
 *  - write-stream errors were unhandled;
 *  - the extension parser assumed a quoted filename and allowed path characters.
 *
 * @param {object} book Book being downloaded; mutated with the result.
 * @param {string} url Direct download URL.
 * @returns {Promise<boolean>} true when the file is present after the call,
 *   false otherwise. Never rejects.
 */
async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);
  const timeoutTime = 10 * 60 * 1000;

  const attempt = async () => {
    const source = axios.CancelToken.source();
    const timeout = setTimeout(() => source.cancel("timeout"), timeoutTime);
    try {
      const response = await myAxios.get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token });
      const ext = resolveDownloadExtension(response.headers?.['content-disposition'], url);
      const filepath = `./downloads/${book.id}.${ext}`;
      book.url = url;

      if (fs.existsSync(filepath)) {
        // Already on disk: release the unused response body.
        response.data?.destroy?.();
        console.log(`下载完成:${filepath}`);
      } else {
        await saveStreamToFile(response.data, filepath);
      }

      book.state = `下载完成`;
      book.format = ext;
      book.file = filepath;
      return true;
    } catch (e) {
      const status = e?.response?.status;
      console.log(`下载失败,错误码: ${status ?? e?.code}`);
      book.url = url;
      if (status === 403 || status === 401) {
        // Retrying cannot help when access is denied.
        book.state = "没有下载权限";
        console.log(`下载失败: ${book.id} ${book.title} ${url}`);
        return false;
      }
      throw e;
    } finally {
      clearTimeout(timeout);
    }
  };

  try {
    return await retry(attempt);
  } catch {
    book.state = "下载失败";
    console.log(`下载失败: ${book.id} ${book.title} ${url}`);
    return false;
  }
}
|
|
/**
 * Tells whether a book is already in the list of downloaded books.
 *
 * Two key shapes are accepted: "<id> <isbn>" and the bare "<id>". Files this
 * script writes to ./downloads are named "<id>.<ext>", so their base names
 * never matched the "<id> <isbn>" key that was the only one checked before.
 *
 * @param {{id: *, isbn: *}} book Book to look up.
 * @returns {boolean} true when either key is present in `alreadyDownloadedBooks`.
 */
function isAlreadyDownloaded(book) {
  const idWithIsbn = `${book.id} ${book.isbn}`;
  const idOnly = String(book.id);
  return alreadyDownloadedBooks.includes(idWithIsbn) || alreadyDownloadedBooks.includes(idOnly);
}
|
|
/**
 * Asks the parent thread for the next book.
 *
 * Posts a `get-book` message carrying this thread's id and resolves with the
 * `data` of the first `book` message received; other message types are
 * ignored. The listener is removed once the answer has arrived.
 *
 * @returns {Promise<*>} The `data` field of the parent's `book` message.
 */
function nextBook() {
  return new Promise((resolve) => {
    function onMessage(message) {
      if (message.type !== 'book') {
        return;
      }
      resolve(message.data);
      parentPort.removeListener('message', onMessage);
    }

    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'get-book', threadId });
  });
}
|
|
|
/**
 * Worker loop: pulls books from the parent thread one at a time and runs
 * search -> download link -> download for each.
 *
 * Every book received is appended to `books` so the caller can report it
 * back. The loop ends when the parent sends no book or when the optional
 * `config.endOfTime` limit (minutes) has passed.
 *
 * Fixes over the previous version:
 *  - two of the three pauses were not awaited, so there was no throttling;
 *  - `successCount` was incremented even when the download failed;
 *  - the time limit was checked after fetching a book, so one book per worker
 *    was taken from the queue and never processed;
 *  - an exception while processing one book ended the whole loop.
 *
 * @param {Array<object>} books Output array that receives every processed book.
 * @returns {Promise<void>}
 */
async function downloadBooks(books) {
  const skippableStates = new Set(["没有搜索结果", "没有pdf或text文件", "下载完成"]);

  for (; ;) {
    // Timed exit, checked before a book is taken from the parent's queue.
    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
      break;
    }

    const book = await nextBook();
    if (!book) {
      break;
    }
    books.push(book);
    bookCount++;

    if (isAlreadyDownloaded(book)) {
      skipCount++;
      book.skip = true;
      continue;
    }
    // Books already resolved in an earlier run are not retried.
    if (skippableStates.has(book.state)) {
      skipCount++;
      continue;
    }

    console.log(`开始下载: ${book.id} ${book.title}`);
    try {
      const detailPageUrl = await getBookDetailPageUrl(book);
      if (!detailPageUrl) {
        book.state = "没有搜索结果";
        continue;
      }

      // Pause before opening the detail page.
      await sleep(getRandomNumber(500, 1000));

      const url = await getDownloadUrl(book, detailPageUrl);
      if (!url) {
        console.log(`没有文件: ${book.id} ${book.title}`);
        continue;
      }

      // Pause before downloading.
      await sleep(getRandomNumber(500, 1000));

      await downloadFile(book, url);
      // downloadFile reports its outcome through book.state.
      if (book.state === "下载完成") {
        successCount++;
        console.log(`下载完成: ${book.id} ${book.title}`);
        console.log('finish: ' + JSON.stringify(book));
      }
    } catch (e) {
      // One broken book must not stop the worker.
      book.state = "下载失败";
      console.error(e);
    }

    // Pause before the next book.
    await sleep(getRandomNumber(500, 1000));
  }
}
|
|
/**
 * Writes the download results back into the Excel workbook.
 *
 * For every book whose id matches column 0 of a row in the first sheet,
 * columns 5, 6 and 11-15 (publisher, pubDate, pages, state, format, file,
 * url) are overwritten. If the workbook cannot be written, the sheet data is
 * dumped to "<timestamp>.json" instead.
 *
 * Fixes over the previous version:
 *  - only the first sheet was written back, renamed to "Sheet1"; all sheets
 *    and names are now preserved;
 *  - writeFileSync was given a meaningless callback argument;
 *  - rows are indexed once instead of scanning the sheet for every book.
 *
 * @param {Array<object>} books Books whose state should be persisted.
 */
function saveBooks(books) {
  console.log("保存下载状态数据");
  const file = "./【反馈客户】7月批次书单 - 已撞库.xlsx";
  const workSheets = xlsx.parse(file);
  const data = workSheets[0].data;

  // First row wins for duplicate ids, like the previous findIndex lookup.
  const rowById = new Map();
  for (const row of data) {
    const id = row?.[0];
    if (!rowById.has(id)) {
      rowById.set(id, row);
    }
  }

  for (const book of books) {
    const row = rowById.get(book.id);
    if (!row) {
      continue;
    }
    row[5] = book.publisher;
    row[6] = book.pubDate;
    row[11] = book.pages;
    row[12] = book.state;
    row[13] = book.format;
    row[14] = book.file;
    row[15] = book.url;
  }

  const buffer = xlsx.build(workSheets.map((ws, i) => (i === 0 ? { ...ws, data } : ws)));
  try {
    fs.writeFileSync(file, buffer);
    console.log(`保存完成: ${file}`);
  } catch (e) {
    // Typically the workbook is open in Excel; keep the results in a JSON dump.
    console.error(e);
    const outfile = `${Date.now()}.json`;
    fs.writeFileSync(outfile, JSON.stringify(data));
    console.log("保存完成: " + outfile);
  }
}
|
|
|
/**
 * Formats a duration in milliseconds as "[D天]H时M分S秒".
 * The day part is only present when the duration is at least one day.
 *
 * @param {number} ms Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function msFormat(ms) {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const days = Math.floor(totalHours / 24);

  const dayPart = days > 0 ? `${days}天` : "";
  return `${dayPart}${totalHours % 24}时${totalMinutes % 60}分${totalSeconds % 60}秒`;
}
|
|
/**
 * Returns a random floating-point number in the range [min, max).
 *
 * @param {number} min Lower bound (inclusive).
 * @param {number} max Upper bound (exclusive).
 * @returns {number} Random value scaled from Math.random().
 */
function getRandomNumber(min, max) {
  const span = max - min;
  return Math.random() * span + min;
}
|
|
// Timestamp (ms) taken when this module was loaded; used for elapsed-time reports and the timed exit.
const startTime = Date.now();

// Number of books downloaded successfully.
let successCount = 0;

// Number of books handled.
let bookCount = 0;

// Number of books skipped: already downloaded, or already resolved in an earlier run.
let skipCount = 0;

// Keys of books that are already downloaded; in worker threads this is replaced with the list from workerData.
let alreadyDownloadedBooks = [];
|
|
/**
 * Collects the keys of books that are already downloaded.
 *
 * Sources are the non-empty lines of ./alreadyDownloadedBooks.txt and the
 * names of the files in ./downloads. Every entry is reduced to its base name
 * without extension and trimmed.
 *
 * A missing list file or downloads directory now counts as "nothing
 * downloaded yet" instead of crashing at start-up. The arrays are merged with
 * an array spread rather than `push(...files)`, which passes every file name
 * as a call argument and can overflow on very large directories.
 *
 * @returns {string[]} Keys of already-downloaded books.
 */
function getAlreadyDownloadedBooks() {
  const listFile = './alreadyDownloadedBooks.txt';
  const listed = fs.existsSync(listFile)
    ? fs.readFileSync(listFile, 'utf-8')
      .replace(/\r/g, '')
      .split('\n')
      .map((it) => it.trim())
      .filter((it) => it)
    : [];
  const files = fs.existsSync('./downloads') ? fs.readdirSync('./downloads') : [];

  return [...listed, ...files].map((it) => path.basename(it, path.extname(it)).trim());
}
|
|
/**
 * Worker-thread entry point.
 *
 * Sets up logging, runs the download loop and, whether it succeeded or
 * failed, posts the processed books to the parent thread as a `books`
 * message and closes the log file. Returns immediately; the work continues
 * asynchronously.
 */
function startDownload() {
  initLogger();
  const books = [];

  const run = async () => {
    try {
      await downloadBooks(books);
      console.log(`线程:${threadId}全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    } catch (e) {
      console.error(e);
    } finally {
      // The parent thread does the saving; this thread only reports its books.
      parentPort.postMessage({ type: "books", data: books });
      logFile.close();
    }
  };

  // Fire and forget, as before: callers do not receive a promise.
  run();
}
|
|
/**
 * Program entry point for both the main thread and the worker threads.
 *
 * Main thread: loads the book list, starts `config.threadSize` workers and
 * hands out one book per `get-book` request. Once every worker has reported
 * its `books`, a summary is printed and the results are saved. SIGINT prints
 * the same summary for the books reported so far, saves and exits.
 *
 * Worker thread: takes the already-downloaded list from workerData and starts
 * the download loop.
 *
 * Fixes over the previous version:
 *  - `downloadCnt` also counted the empty replies sent once the queue was
 *    exhausted, inflating the totals by one per worker;
 *  - a worker `error` event had no listener, which crashed the main thread
 *    and lost all results; a failed worker is now logged and counted as finished;
 *  - the duplicated summary/save code is shared.
 */
function main() {
  for (const dir of ['tmpdir', 'downloads']) {
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
  }

  if (!isMainThread) {
    alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
    startDownload();
    return;
  }

  initLogger();
  let downloadCnt = 0;
  let finishThreadCnt = 0;
  const finishBooks = [];
  const downloadedKeys = getAlreadyDownloadedBooks();
  const { startRow, endRow, threadSize } = config;
  console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`);
  const books = getBooksFromExcel(startRow, endRow);

  // Prints the totals for the books reported so far and persists them.
  const reportAndSave = (headline) => {
    successCount = finishBooks.filter((it) => it.state === '下载完成').length;
    skipCount = finishBooks.filter((it) => it.skip).length;
    console.log(`${headline},共下载${downloadCnt}本,成功下载${successCount}本,跳过${skipCount},失败${downloadCnt - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    saveBooks(finishBooks);
  };

  for (let i = 0; i < threadSize; i++) {
    // NOTE(review): path is relative to the working directory and presumably
    // points at this very file — confirm.
    const worker = new Worker("./src/book-download.mjs", { workerData: { alreadyDownloadedBooks: downloadedKeys } });
    let finished = false;

    // Counts each worker once, whether it finished normally or crashed.
    const markFinished = () => {
      if (finished) {
        return;
      }
      finished = true;
      finishThreadCnt++;
      if (finishThreadCnt >= threadSize) {
        reportAndSave('全部线程完成');
      }
    };

    worker.on("message", (message) => {
      if (message.type === 'books') {
        finishBooks.push(...message.data);
        markFinished();
      } else if (message.type === 'get-book') {
        const book = books.shift();
        // Only real hand-outs count; an empty reply just tells the worker to stop.
        if (book) {
          downloadCnt++;
        }
        worker.postMessage({ type: "book", data: book });
      }
    });

    worker.on("error", (err) => {
      console.error(err);
      markFinished();
    });
  }

  // On Ctrl+C, save what has been reported so far before exiting.
  process.on('SIGINT', () => {
    reportAndSave('进程被手动结束');
    process.exit(0);
  });
}
|
|
// Runs on module load in the main thread and in every worker thread; main() branches on isMainThread.
main();
|