/**
 * src/book-download.mjs — libgen book download script.
 *
 * Provenance: added by commit 34cb9466af82821baf37cb57de7409a318a9d544
 * (2024-07-23, "Add libgen book download script").
 *
 * The main thread reads a book list from an Excel sheet and hands books out,
 * one at a time, to a pool of worker threads (this same file is loaded as the
 * worker). Each worker searches libgen for the title, opens the edition page,
 * resolves the download link and streams the file into ./downloads. When all
 * workers have reported back, the main thread writes the per-book state back
 * into the Excel sheet.
 *
 * Inputs read from the current working directory: ./config.json
 * (startRow, endRow, threadSize, optional endOfTime in minutes, optional
 * proxy, optional bookListFile), the Excel sheet, and the optional
 * ./alreadyDownloadedBooks.txt.
 */
import xlsx from "node-xlsx";
import axios from "axios";
import * as fs from "fs";
import path from "path";
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
import { HttpsProxyAgent } from "https-proxy-agent";
import { execFileSync } from "child_process";
import { pipeline } from "stream/promises";
import wordsjs from 'wordlist-js';
import usPlaceList from "./us-place-list.mjs";
import usPeronNameList from "./us-pseron-name-list.mjs";
import * as pdfLib from 'pdf-lib';
import * as cheerio from 'cheerio';

/* ------------- configuration ------------- */
const config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));

/** Excel sheet holding the book list; also the file the results are written back to. */
const BOOK_LIST_FILE = config.bookListFile ?? "【反馈客户】7月批次书单 - 已撞库.xlsx";
const LIBGEN_BASE_URL = 'https://libgen.vg';
const REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
};
/** Hard limit for one download attempt (request + transfer). */
const DOWNLOAD_TIMEOUT_MS = 10 * 60 * 1000;

/* Values stored in the sheet's state column (and compared against on later runs). */
const STATE_DOWNLOADED = "下载完成";
const STATE_NO_SEARCH_RESULT = "没有搜索结果";
const STATE_NO_PDF_OR_TEXT = "没有pdf或text文件";
const STATE_SEARCH_FAILED = "搜索失败";
const STATE_DETAIL_PAGE_FAILED = "打开详情页失败";
const STATE_DOWNLOAD_URL_FAILED = "获取下载链接失败";
const STATE_NO_PERMISSION = "没有下载权限";
const STATE_DOWNLOAD_FAILED = "下载失败";
/** Books already in one of these states are not attempted again. */
const SKIPPABLE_STATES = new Set([STATE_NO_SEARCH_RESULT, STATE_NO_PDF_OR_TEXT, STATE_DOWNLOADED]);

/* ------------- logging ------------- */
let logFile;

/**
 * Replaces console.log with a version that prefixes a timestamp and also
 * appends every line to ./logs/logs-thread<threadId>.log.
 */
function initLogger() {
  const originalLog = console.log;
  if (!fs.existsSync('./logs')) {
    fs.mkdirSync('./logs', { recursive: true });
  }
  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
  console.log = function (...parts) {
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    originalLog(line);
    logFile.write(`${line}\n`);
  };
}

/* ------------- axios through a local HTTP proxy ------------- */
const httpsAgent = new HttpsProxyAgent(config.proxy ?? `http://127.0.0.1:10809`);
const myAxios = axios.create({
  proxy: false, // the agent does the proxying; disable axios' own proxy handling
  httpsAgent,
});

/**
 * Returns the number of pages of a PDF file.
 * @param {string} filepath path of the PDF file
 * @returns {Promise<number>} page count; rejects when the file cannot be parsed as a PDF
 */
async function getPdfPages(filepath) {
  const buf = fs.readFileSync(filepath);
  const pdfDoc = await pdfLib.PDFDocument.load(buf, { ignoreEncryption: true });
  return pdfDoc.getPages().length;
}

/**
 * Builds the set of known words used by incorrectWordRatio.
 *
 * The set contains every entry of every list in `wordsjs.default`, plus the
 * US place and person-name lists. Entries are lower-cased because lookups are
 * done on lower-cased words.
 * @returns {Set<string>} known words, lower-cased
 */
function allWords() {
  const words = new Set();
  const addAll = (list) => {
    for (const word of list) {
      words.add(String(word).toLocaleLowerCase());
    }
  };
  // for...in tolerates a missing `default` export (it simply iterates nothing).
  for (const key in wordsjs.default) {
    if (Object.hasOwnProperty.call(wordsjs.default, key)) {
      addAll(wordsjs.default[key]);
    }
  }
  // NOTE(review): assumes both project lists are iterables of strings — confirm.
  addAll(usPlaceList);
  addAll(usPeronNameList);
  return words;
}

const wordsMap = allWords();

/**
 * Counts the single spaces left in a string after collapsing runs of spaces,
 * i.e. the number of word separators (one less than the word count for a
 * trimmed, non-empty line).
 * @param {string} str text to inspect
 * @returns {number} number of separators
 */
function countWordSize(str) {
  return (str.replace(/[ ]{2,}/g, ' ').match(/ /g) || []).length;
}

/**
 * Returns the share of space-separated tokens that are neither a known word
 * nor contain a digit. Punctuation directly following a letter is stripped
 * before the lookup.
 * @param {string} text text to inspect
 * @returns {number} ratio in 0..1 (an empty string yields 1)
 */
function incorrectWordRatio(text) {
  const normalized = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1");
  const words = normalized.split(' ');
  const incorrectWordCnt = words.filter((word) => !wordsMap.has(word.toLocaleLowerCase()) && !/\d/.test(word)).length;
  return incorrectWordCnt / words.length;
}

/**
 * Returns the share of characters that are not ASCII letters, digits or spaces.
 * @param {string} text text to inspect
 * @returns {number} ratio in 0..1 (0 for an empty string)
 */
function symbolRatio(text) {
  if (text.length === 0) {
    return 0;
  }
  return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length;
}

// NOTE(review): in both patterns below the "•" was unrecoverable from the
// mis-encoded source (a three-byte U+20xx character whose last byte was lost);
// a bullet is assumed — confirm against the original file.
const PUNCTUATION_RUN = /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/;
const NOISE_CHARS = /[■•*¬»«^-]/g;

/**
 * Cleans text by dropping lines that look like scanning noise.
 *
 * Steps: remove carriage returns; when the first 10000 characters mention
 * "google", cut everything up to the "books.google.com" marker; collapse
 * repeated spaces and blank lines; re-join words hyphenated across lines;
 * then filter line by line.
 * @param {string} text text to clean
 * @returns {string} cleaned text, lines joined with "\n"
 */
function cleanText(text) {
  let cleaned = text.replace(/(\r)/g, '');
  const googlePage = cleaned.substring(0, 10000);
  if (googlePage.includes('google')) {
    cleaned = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + cleaned.substring(10000);
  }
  cleaned = cleaned.replace(/[ ]{2,}/g, ' ');
  // NOTE(review): the original joined lines of 170+ characters only when no
  // such line existed, which can never match; that no-op was removed. If the
  // intent was the inverse condition, it needs to be re-added deliberately.
  cleaned = cleaned.replace(/\n+/g, '\n');
  cleaned = cleaned.replace(/-\n/g, '-');

  const result = [];
  for (const line of cleaned.split('\n')) {
    const incorrectRatio = incorrectWordRatio(line);
    const wordSize = countWordSize(line);
    // Symbol-heavy line that is also mostly unknown words.
    if (symbolRatio(line) > 0.2 && incorrectRatio > 0.65) {
      continue;
    }
    // Same non-digit character three or more times in a row (ignoring spaces),
    // on a short or mostly-unknown line.
    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, '')) && (wordSize < 5 || incorrectRatio > 0.65)) {
      continue;
    }
    // Run of three or more punctuation characters on a mostly-unknown line.
    if (incorrectRatio > 0.65 && PUNCTUATION_RUN.test(line)) {
      continue;
    }
    // Longer line made mostly of unknown words.
    if (wordSize > 5 && incorrectRatio > 0.65) {
      continue;
    }
    // Any mention of google (digitisation boilerplate).
    if (/google/i.test(line)) {
      continue;
    }
    // Nothing, or a single character, left after removing noise characters.
    const ret = line.trim().replace(NOISE_CHARS, '');
    if (ret.length <= 1) {
      continue;
    }
    if (ret === 'Digitized by') {
      continue;
    }
    result.push(ret);
  }
  return result.join('\n');
}

/**
 * Extracts an archive with ./7za.exe and copies its largest entry to txtFile.
 * The per-thread temp directory is removed afterwards, also on failure.
 * @param {string} zipFile path of the archive
 * @param {string} txtFile destination path for the extracted file
 * @throws {Error} when extraction fails or the archive contains no entries
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);
    const largest = fs.readdirSync(tmpdir)
      .map((name) => ({ name, size: fs.statSync(`${tmpdir}/${name}`).size }))
      .sort((a, b) => a.size - b.size)
      .pop();
    if (!largest) {
      throw new Error(`archive contains no files: ${zipFile}`);
    }
    // `force` is the fs.cpSync option for overwriting an existing destination.
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}

/**
 * Reads the books to download from the first sheet of the Excel file.
 * @param {number} startRow first row index, inclusive
 * @param {number} endRow last row index, exclusive
 * @returns {object[]} one record per row
 */
function getBooksFromExcel(startRow, endRow) {
  const workSheets = xlsx.parse(BOOK_LIST_FILE);
  const sheet = workSheets[0];
  return sheet.data.slice(startRow, endRow).map((row) => ({
    id: row[0],
    isbn: row[1],
    title: row[2],
    subTitle: row[3],
    author: row[4],
    publisher: row[5],
    pubDate: row[6],
    ztf: row[7],
    // Column 8 was also mapped to `format` originally, but was always
    // overwritten by column 13 (the column saveBooks writes `format` to).
    language: row[9],
    brief: row[10],
    pages: row[11],
    state: row[12],
    format: row[13],
    file: row[14],
    url: row[15],
  }));
}

/**
 * Formats a search keyword: optionally strips digits, then joins the first
 * six space-separated words with "+".
 * @param {string} text keyword to format
 * @param {boolean} titleWithNumbers whether digits are kept
 * @returns {string} formatted keyword
 */
function formatKw(text, titleWithNumbers) {
  const kw = titleWithNumbers ? text : text.replace(/[\d]/g, "");
  return kw.split(' ').slice(0, 6).join("+");
}

/**
 * Resolves after the given delay.
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Calls func, retrying after a failure.
 * @param {() => Promise<*>} func operation to run
 * @param {number} maxTry number of retries after the first attempt (so up to maxTry + 1 calls)
 * @param {number} delay pause between attempts in milliseconds
 * @returns {Promise<*>} result of the first successful call
 * @throws the error of the last attempt when every attempt failed
 */
async function retry(func, maxTry = 3, delay = 3000) {
  for (let remaining = maxTry; ; remaining--) {
    try {
      return await func();
    } catch (e) {
      if (remaining <= 0) {
        throw e;
      }
      await sleep(delay);
    }
  }
}

/**
 * Searches libgen by title and returns the URL of an edition page.
 * @param {object} book book record; only `title` is read
 * @returns {Promise<string|null>} edition page URL, '' when the result page
 *   contains no edition link, null when the search request itself failed
 */
async function getBookDetailPageUrl(book) {
  if (book.title == null || book.title === '') {
    return '';
  }
  // The title is user data: encode it so "&", "#", "+" etc. cannot break the query.
  const url = `${LIBGEN_BASE_URL}/index.php?req=${encodeURIComponent(book.title)}&columns%5B%5D=t&topics%5B%5D=f&res=25&filesuns=all`;
  try {
    return await retry(async () => {
      const resp = await myAxios.get(url, { headers: REQUEST_HEADERS });
      const group = /.*href="(edition.php\?id=\d+)".*/.exec(resp.data);
      return group ? `${LIBGEN_BASE_URL}/${group[1]}` : '';
    });
  } catch (e) {
    console.log(`${STATE_SEARCH_FAILED}: ${book.id} ${book.title} ${e?.message ?? e}`);
    return null;
  }
}

/**
 * Opens an edition page and returns the file page URL, preferring EPUB over PDF.
 * On request failure `book.state` is set to the "detail page failed" state.
 * @param {object} book book record (state is updated on failure)
 * @param {string} detailPageUrl edition page URL
 * @returns {Promise<string>} file page URL, or '' when there is none or the request failed
 */
async function openBookDetailPage(book, detailPageUrl) {
  console.log(`打开详情: ${detailPageUrl}`);
  try {
    return await retry(async () => {
      const resp = await myAxios.get(detailPageUrl, { headers: REQUEST_HEADERS });
      const html = cheerio.load(resp.data);
      let epubUrl = null;
      let pdfUrl = null;
      for (const tr of html('tr')) {
        const row = html(tr);
        const href = row.find('td:nth-child(1) > a').attr('href');
        const tdEle = row.find('td:nth-child(2)');
        tdEle.find('br').replaceWith(' ');
        const gp = /.* Extension: (\S+) \S*/.exec(tdEle.text());
        // Header and layout rows have no link or no "Extension:" field: skip them.
        if (!href || !gp) {
          continue;
        }
        const ext = gp[1].toLowerCase();
        if (ext === 'pdf') {
          pdfUrl = `${LIBGEN_BASE_URL}/${href}`;
        }
        if (ext === 'epub') {
          epubUrl = `${LIBGEN_BASE_URL}/${href}`;
        }
      }
      return epubUrl ?? pdfUrl ?? '';
    });
  } catch (e) {
    book.state = STATE_DETAIL_PAGE_FAILED;
    console.log(`打开详情页失败: ${book.id} ${book.title}`);
    return '';
  }
}

/**
 * Opens a file page and extracts the direct download (get.php) URL.
 * On request failure `book.state` is set to the "download link failed" state.
 * @param {object} book book record (state is updated on failure)
 * @param {string} url file page URL
 * @returns {Promise<string>} download URL, or '' when there is none or the request failed
 */
async function getDownloadUrl(book, url) {
  try {
    return await retry(async () => {
      const resp = await myAxios.get(url, { headers: REQUEST_HEADERS });
      // [^"]* keeps the capture inside the href attribute value.
      const group = /.*href="(get.php\?md5=[0-9a-f]+[^"]*)".*/.exec(resp.data);
      return group ? `${LIBGEN_BASE_URL}/${group[1]}` : '';
    });
  } catch (e) {
    book.state = STATE_DOWNLOAD_URL_FAILED;
    console.log(`获取下载链接失败: ${book.id} ${book.title}`);
    return '';
  }
}

/**
 * Extracts the file extension from a content-disposition header value.
 * @param {string|undefined} header header value
 * @returns {string} extension without the dot ('' when the file name has none)
 * @throws {Error} when the header carries no `filename=` part
 */
function extensionFromContentDisposition(header) {
  const rawName = String(header ?? '').split('filename=')[1];
  if (!rawName) {
    throw new Error('response has no filename in content-disposition');
  }
  const fileName = rawName.split(';')[0].trim().replace(/^"+|"+$/g, '');
  return fileName.includes('.') ? fileName.split('.').pop() : '';
}

/**
 * Records a finished download on the book record.
 */
function markDownloaded(book, filepath, ext, url) {
  book.state = STATE_DOWNLOADED;
  book.format = ext;
  book.file = filepath;
  book.url = url;
}

/**
 * One download attempt: streams url into ./downloads/<id> <isbn>.<ext>.
 * @returns {Promise<boolean>} true when the file is on disk, false when the
 *   server answered 401/403 (not worth retrying)
 * @throws any other failure, so that the caller can retry
 */
async function downloadOnce(book, url) {
  const source = axios.CancelToken.source();
  // axios' own `timeout` only covers waiting for the response; this timer also
  // bounds the body transfer.
  const timer = setTimeout(() => source.cancel("timeout"), DOWNLOAD_TIMEOUT_MS);
  let filepath = null;
  let writing = false;
  try {
    const response = await myAxios.get(url, {
      responseType: "stream",
      timeout: DOWNLOAD_TIMEOUT_MS,
      cancelToken: source.token,
    });
    const ext = extensionFromContentDisposition(response.headers['content-disposition']);
    filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
    book.url = url;
    if (fs.existsSync(filepath)) {
      response.data.destroy(); // body not needed: release the connection
      markDownloaded(book, filepath, ext, url);
      console.log(`下载完成：${filepath}`);
      return true;
    }
    writing = true;
    // pipeline settles only after the file has been fully written and closed.
    await pipeline(response.data, fs.createWriteStream(filepath));
    markDownloaded(book, filepath, ext, url);
    // Non-PDF files (e.g. EPUB) cannot be parsed: record 0 pages.
    book.pages = await getPdfPages(filepath).catch(() => 0);
    return true;
  } catch (e) {
    console.log(`下载失败，错误码: ${e?.response?.status ?? e?.code ?? e?.message}`);
    book.url = url;
    if (writing) {
      fs.rmSync(filepath, { force: true }); // do not leave a truncated file behind
    }
    if (e?.response?.status === 403 || e?.response?.status === 401) {
      book.state = STATE_NO_PERMISSION;
      console.log(`下载失败: ${book.id} ${book.title} ${url}`);
      return false;
    }
    throw e;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Downloads a book file, retrying failed attempts, and records the outcome on
 * the book record (state, format, file, url, pages).
 * @param {object} book book record
 * @param {string} url direct download URL
 * @returns {Promise<boolean>} true when the file is on disk
 */
async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);
  try {
    return await retry(() => downloadOnce(book, url));
  } catch (e) {
    book.state = STATE_DOWNLOAD_FAILED;
    console.log(`下载失败: ${book.id} ${book.title} ${url}`);
    return false;
  }
}

/**
 * Tells whether "<id> <isbn>" is in the already-downloaded set.
 * @param {object} book book record
 * @returns {boolean}
 */
function isAlreadyDownloaded(book) {
  return alreadyDownloadedBooks.has(`${book.id} ${book.isbn}`);
}

/**
 * Worker side: asks the main thread for the next book.
 * @returns {Promise<object|undefined>} next book, or undefined when none are left
 */
function nextBook() {
  return new Promise((resolve) => {
    const onMessage = (message) => {
      if (message.type === 'book') {
        parentPort.removeListener('message', onMessage);
        resolve(message.data);
      }
    };
    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'get-book', threadId });
  });
}

/**
 * Worker loop: fetches books from the main thread until none are left or the
 * configured time limit is reached, and tries to download each one.
 * @param {object[]} books output array; every book received is appended
 */
async function downloadBooks(books) {
  for (; ;) {
    const book = await nextBook();
    if (!book) {
      break;
    }
    books.push(book);
    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
      // Time limit (minutes) reached.
      break;
    }
    bookCount++;
    if (isAlreadyDownloaded(book)) {
      skipCount++;
      book.skip = true;
      continue;
    }
    if (SKIPPABLE_STATES.has(book.state)) {
      skipCount++;
      continue;
    }
    console.log(`开始下载: ${book.id} ${book.title}`);
    const detailPageUrl = await getBookDetailPageUrl(book);
    if (!detailPageUrl) {
      // null means the request failed: keep that apart from "no results" so
      // the book is retried on a later run instead of being skipped forever.
      book.state = detailPageUrl === null ? STATE_SEARCH_FAILED : STATE_NO_SEARCH_RESULT;
      continue;
    }
    // Pause between requests.
    await sleep(getRandomNumber(500, 1000));
    const filePageUrl = await openBookDetailPage(book, detailPageUrl);
    if (!filePageUrl) {
      console.log(`没有文件: ${book.id} ${book.title}`);
      continue;
    }
    const url = await getDownloadUrl(book, filePageUrl);
    if (!url) {
      console.log(`没有文件: ${book.id} ${book.title}`);
      continue;
    }
    await sleep(getRandomNumber(500, 1000));
    if (await downloadFile(book, url)) {
      successCount++;
      console.log(`下载完成: ${book.id} ${book.title}`);
    }
    console.log(`finish: ${JSON.stringify(book)}`);
    await sleep(getRandomNumber(500, 1000));
  }
}

/**
 * Writes the download results back into the Excel sheet, matching rows by id
 * (first row with that id). When the sheet cannot be written, the data is
 * dumped to <timestamp>.json instead.
 * @param {object[]} books processed book records
 */
function saveBooks(books) {
  console.log("保存下载状态数据");
  const workSheets = xlsx.parse(BOOK_LIST_FILE);
  const data = workSheets[0].data;
  const rowIndexById = new Map();
  data.forEach((row, index) => {
    if (!rowIndexById.has(row[0])) {
      rowIndexById.set(row[0], index);
    }
  });
  for (const book of books) {
    const index = rowIndexById.get(book.id);
    if (index === undefined) {
      continue;
    }
    const row = data[index];
    row[5] = book.publisher;
    row[6] = book.pubDate;
    row[11] = book.pages;
    row[12] = book.state;
    row[13] = book.format;
    row[14] = book.file;
    row[15] = book.url;
  }

  const buffer = xlsx.build([{ name: "Sheet1", data }]);
  try {
    fs.writeFileSync(`./${BOOK_LIST_FILE}`, buffer);
    console.log(`保存完成: ./${BOOK_LIST_FILE}`);
  } catch (e) {
    console.error(e);
    const outfile = `${Date.now()}.json`;
    fs.writeFileSync(outfile, JSON.stringify(data));
    console.log(`保存完成: ${outfile}`);
  }
}

/**
 * Formats a duration as "[D天]H时M分S秒".
 * @param {number} ms duration in milliseconds
 * @returns {string} formatted duration
 */
function msFormat(ms) {
  const sec = Math.floor(ms / 1000);
  const min = Math.floor(sec / 60);
  const hour = Math.floor(min / 60);
  const day = Math.floor(hour / 24);
  return `${day > 0 ? `${day}天` : ""}${hour % 24}时${min % 60}分${sec % 60}秒`;
}

/**
 * Returns a random number in [min, max).
 * @param {number} min lower bound, inclusive
 * @param {number} max upper bound, exclusive
 * @returns {number} random (non-integer) value
 */
function getRandomNumber(min, max) {
  return Math.random() * (max - min) + min;
}

// Start time of this thread.
const startTime = Date.now();
// Per-worker counters: successful downloads, books received, books skipped.
let successCount = 0;
let bookCount = 0;
let skipCount = 0;
// "<id> <isbn>" keys of books that must not be downloaded again (set in workers).
let alreadyDownloadedBooks = new Set();

/**
 * Collects the "<id> <isbn>" keys of books already downloaded: the entries of
 * the optional ./alreadyDownloadedBooks.txt (one per line) plus the files in
 * ./downloads, each reduced to its base name without extension.
 * @returns {string[]} keys
 */
function getAlreadyDownloadedBooks() {
  const listFile = './alreadyDownloadedBooks.txt';
  const text = fs.existsSync(listFile) ? fs.readFileSync(listFile, 'utf-8') : '';
  const listed = text.replace(/\r/g, '').split('\n').map((it) => it.trim()).filter((it) => it);
  const files = fs.readdirSync('./downloads');
  return [...listed, ...files].map((it) => path.basename(it, path.extname(it)).trim());
}

/**
 * Worker entry point: runs the download loop, then reports every processed
 * book to the main thread.
 */
function startDownload() {
  initLogger();
  const books = [];
  downloadBooks(books)
    .then(() => {
      console.log(`线程：${threadId}全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
    })
    .catch((e) => {
      console.error(e);
    })
    .finally(() => {
      parentPort.postMessage({ type: "books", data: books });
      logFile.close();
    });
}

/**
 * Entry point for both roles. In the main thread it spawns the workers, hands
 * out books on request and saves the results once every worker has finished
 * (or on SIGINT); in a worker it starts the download loop.
 */
function main() {
  if (!fs.existsSync('tmpdir')) {
    fs.mkdirSync('tmpdir', { recursive: true });
  }
  if (!fs.existsSync('downloads')) {
    fs.mkdirSync('downloads', { recursive: true });
  }

  if (!isMainThread) {
    alreadyDownloadedBooks = new Set(workerData.alreadyDownloadedBooks);
    startDownload();
    return;
  }

  initLogger();
  const downloaded = getAlreadyDownloadedBooks();
  const { startRow, endRow, threadSize } = config;
  console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
  const books = getBooksFromExcel(startRow, endRow);
  const finishBooks = [];
  let downloadCnt = 0; // books actually handed out to workers
  let finishThreadCnt = 0;

  const logSummary = (headline) => {
    const succeeded = finishBooks.filter((it) => it.state === STATE_DOWNLOADED).length;
    const skipped = finishBooks.filter((it) => it.skip).length;
    console.log(`${headline}，共下载${downloadCnt}本，成功下载${succeeded}本，跳过${skipped}，失败${downloadCnt - skipped - succeeded}本，耗时： ${msFormat(Date.now() - startTime)}。`);
  };

  const onWorkerFinished = () => {
    finishThreadCnt++;
    if (finishThreadCnt >= threadSize) {
      logSummary('全部线程完成');
      saveBooks(finishBooks);
    }
  };

  for (let i = 0; i < threadSize; i++) {
    // Load this very file as the worker, independent of the working directory.
    const worker = new Worker(new URL(import.meta.url), { workerData: { alreadyDownloadedBooks: downloaded } });
    let reported = false;
    const finishOnce = () => {
      if (!reported) {
        reported = true;
        onWorkerFinished();
      }
    };
    worker.on("message", (message) => {
      if (message.type === 'books') {
        finishBooks.push(...message.data);
        finishOnce();
      } else if (message.type === 'get-book') {
        const book = books.shift();
        if (book) {
          downloadCnt++;
        }
        // undefined tells the worker there is nothing left to do.
        worker.postMessage({ type: "book", data: book });
      }
    });
    // A crashed worker never reports its books; count it as finished so the
    // results of the other workers are still saved.
    worker.on("error", (err) => {
      console.error(err);
      finishOnce();
    });
  }

  // On Ctrl+C save what the finished workers have reported so far.
  process.on('SIGINT', () => {
    logSummary('进程被手动结束');
    saveBooks(finishBooks);
    process.exit(0);
  });
}

main();