From 35a49bbd1b9c131a3a2db734f1351837022930a5 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期二, 11 六月 2024 22:21:11 +0800 Subject: [PATCH] 图书下载修改多线程并发下载分配策略,统一由主线程分配给子线程下载图书信息 --- src/main.mjs | 234 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 files changed, 192 insertions(+), 42 deletions(-) diff --git a/src/main.mjs b/src/main.mjs index 7b3a874..0e0a42f 100644 --- a/src/main.mjs +++ b/src/main.mjs @@ -4,16 +4,28 @@ import proxy from "selenium-webdriver/proxy.js"; import axios from "axios"; import * as fs from "fs"; +import path from "path"; +import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; import { HttpsProxyAgent } from "https-proxy-agent"; +import { resolve } from "path"; + +/*-------------璇诲彇閰嶇疆---------------*/ +let config = JSON.parse(fs.readFileSync('./config.json')); /* ------------鏃ュ織-------------- */ -const _log = console.log; -const logFile = fs.createWriteStream('./logs.log'); -console.log = function (text) { - text = `${new Date().toLocaleString()} ${text ?? ''}`; - _log(text); - logFile.write(text + '\n'); -}; +let logFile; +function initLogger() { + const _log = console.log; + if (!fs.existsSync('./logs')) { + fs.mkdirSync('./logs', { recursive: true }); + } + logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); + console.log = function (...text) { + text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`; + _log(text); + logFile.write(text + '\n'); + }; +} /* ----------axios浠g悊------------ */ const httpsAgent = new HttpsProxyAgent(`http://127.0.0.1:10809`); @@ -61,6 +73,12 @@ */ async function createDriver() { const opts = new ChromeOptions(); + if (config.headless) { + opts.addArguments("--headless");//寮�鍚棤澶存ā寮� + } + if (config.disableGpu) { + opts.addArguments("--disable-gpu");//绂佹gpu娓叉煋 + } opts.addArguments("--ignore-ssl-error"); // 蹇界暐ssl閿欒 opts.addArguments("--no-sandbox"); // 绂佺敤娌欑洅妯″紡 opts.addArguments("blink-settings=imagesEnabled=false"); //绂佺敤鍥剧墖鍔犺浇 @@ -77,14 +95,21 @@ /** * 鏍煎紡鍖栧叧閿瓧 * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛� + * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛� * @returns 澶勭悊鍚庣殑鍏抽敭瀛� */ -function formatKw(text) { - // 鍙繚鐣欎腑鏂囥�佽嫳鏂囥�佹暟瀛楀拰涓嬪垝绾� - return text.replace(/[^\u4e00-\u9fa5\w \d]/g, ""); +function formatKw(text, titleWithNumbers) { + // 鍙繚鐣欑┖鏍笺�佷腑鏂囥�佽嫳鏂囥�佹硶鏂囥�佸痉鏂囥�佸笇鑵婃枃 + const regex = /[^\u4e00-\u9fa5\w\s\d]/g; + if (titleWithNumbers) { + text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g, ""); + } else { + text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g, ""); + } + text = text.split(' ').slice(0, 10).join("+"); + return text; } -const driver = await createDriver(); async function sleep(ms) { return new Promise((resolve) => { @@ -109,11 +134,11 @@ * 鎵撳紑鎼滅储椤甸潰骞舵悳绱� * @param {*} book */ -async function openSearchPage(book) { - console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title)}`); +async function openSearchPage(book, titleWithNumbers) { + console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); return await retry(async () => { // 鑾峰彇椤甸潰 - const searchUrl = `https://archive.org/search?query=${formatKw(book.title)}`; + const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; await driver.get(searchUrl); }).then(() => true) .catch(() => false); @@ -167,7 +192,7 @@ await driver.wait( until.elementLocated( By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`) - ) + ), 15000 ); }) .then(() => true) @@ -223,14 +248,32 @@ async function downloadFile(book, url) { console.log(`涓嬭浇鏂囦欢: ${url}`); + const ext = url.split(".").pop(); + const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + if (fs.existsSync(filepath)) { + book.state = `涓嬭浇瀹屾垚`; + book.format = ext; + book.file = filepath; + book.url = url; + console.log(`涓嬭浇瀹屾垚锛�${filepath}`); + return; + } await retry(() => { return new Promise((resolve, reject) => myAxios .get(url, { responseType: "stream" }) .then((response) => { + const len = response.headers['content-length']; + if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { + // 涓嶆槸pdf鎴杢xt鏂囦欢锛屼笖鏂囦欢澶т簬200M锛屼笉涓嬭浇 + book.state = "涓嬭浇澶辫触"; + book.url = url; + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); + reject(false); + return; + } const stream = response.data; - const ext = url.split(".").pop(); - const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; - stream.pipe(fs.createWriteStream(filepath)); + const out = fs.createWriteStream(filepath); + stream.pipe(out); stream.on("end", () => { book.state = `涓嬭浇瀹屾垚`; book.format = ext; @@ -239,30 +282,83 @@ console.log(`涓嬭浇瀹屾垚锛�${filepath}`); resolve(true); }); + stream.on("error", (err) => { + console.error(err); + book.state = "涓嬭浇澶辫触"; + book.url = url; + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); + reject(false); + try { + out.close(); + fs.unlink(filepath, (e) => console.error(e)); + } catch (e) { + console.error(e); + } + }); }) .catch((e) => { console.error(e); book.state = "涓嬭浇澶辫触"; book.url = url; - console.log(`涓嬭浇澶辫触: ${book.id} ${book.title}`); + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); reject(false); })); + }).catch(e => { + return false + }); +} + +function isAlreadyDownloaded(book) { + const id = `${book.id} ${book.isbn}`; + return alreadyDownloadedBooks.includes(id); +} + +function nextBook() { + return new Promise(resolve => { + const cb = (message) => { + if (message.type === 'book') { + resolve(message.data); + parentPort.removeListener('message', cb); + } + }; + parentPort.on('message', cb); + parentPort.postMessage({ type: 'get-book', threadId }); + }); } async function downloadBooks(books) { - for (const book of books) { - if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢")) { - // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫� - continue; + driver = await createDriver(); + + for (; ;) { + const book = await nextBook(); + if (!book) { + break; + } + books.push(book); + if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) { + // 瀹氭椂閫�鍑� + break; } bookCount++; + if (isAlreadyDownloaded(book)) { + skipCount++; + continue; + } + if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { + // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫� + skipCount++; + continue; + } console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`); // 鎵撳紑鎼滅储椤甸潰骞舵悳绱� - if (!await openSearchPage(book)) { - console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`); - book.state = "鎵撳紑鎼滅储椤甸潰澶辫触"; - continue; + if (!await openSearchPage(book, true)) { + // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧 + if (!await openSearchPage(book, false)) { + console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`); + book.state = "鎵撳紑鎼滅储椤甸潰澶辫触"; + continue; + } } // 妫�娴嬫悳绱㈢粨鏋� const hasBook = await checkSearchResult(book); @@ -277,30 +373,30 @@ continue; } // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤� - sleep(getRandomNumber(3000, 10000)); + sleep(getRandomNumber(1000, 30000)); // 鎵撳紑璇︽儏椤� await openBookDetailPage(book, detailPageUrl); // 鑾峰彇涓嬭浇閾炬帴 const url = await getDownloadUrl(book); if (!url) { continue; } // 绛夊緟涓�娈垫椂闂村啀涓嬭浇 - await sleep(getRandomNumber(3000, 10000)); + await sleep(getRandomNumber(1000, 30000)); // 涓嬭浇鏂囦欢 - await downloadFile(book, url); - console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`); + try { + await downloadFile(book, url); + console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`); + } catch (e) { } successCount++; // 绛変竴娈垫椂闂村啀涓嬩竴涓� - sleep(getRandomNumber(3000, 10000)); + sleep(getRandomNumber(1000, 30000)); } - await driver.close(); - await driver.quit(); } function saveBooks(books) { console.log("淇濆瓨涓嬭浇鐘舵�佹暟鎹�"); const workSheets = xlsx.parse("銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx"); const sheet = workSheets[0]; - const data = sheet.data.slice(2); + const data = sheet.data; for (const book of books) { const index = data.findIndex((row) => row[0] === book.id); if (index > -1) { @@ -312,7 +408,7 @@ } const buffer = xlsx.build([{ name: "Sheet1", data }]); - fs.writeFile("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { }); + fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { }); console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx"); } @@ -346,18 +442,72 @@ let successCount = 0; // 鍥句功鏁伴噺 let bookCount = 0; +// 璺宠繃鐨勬暟閲忥紝宸茬粡涓嬭浇杩囨垨娌℃湁鎼滅储鍒扮殑鏁伴噺 +let skipCount = 0; +// chrome椹卞姩 +let driver; +let alreadyDownloadedBooks = []; + +function getAlreadyDownloadedBooks() { + const text = fs.readFileSync('./alreadyDownloadedBooks.txt', 'utf-8'); + const books = text.replace(/\r/g, '').split('\n').map(it => it.trim()).filter(it => it); + const files = fs.readdirSync('./downloads'); + books.push(...files); + return books.map(it => path.basename(it, path.extname(it)).trim()); +} function main() { - const range = JSON.parse(fs.readFileSync('./config.json')); - const books = getBooksFromExcel(range.startRow, range.endRow); + initLogger(); + const books = []; downloadBooks(books) .then(() => { - console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝澶辫触${bookCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); + console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); }) - .finally(() => { - saveBooks(books); + .catch(e => { + console.error(e); + }) + .finally(async () => { + // saveBooks(books); + parentPort.postMessage({ type: "books", data: books }); logFile.close(); + try { + await driver.close(); + await driver.quit(); + } catch (e) { } }); } -main(); +// 澶氳繘绋嬫墽琛� +if (isMainThread) { + initLogger(); + const alreadyDownloadedBooks = getAlreadyDownloadedBooks(); + const { startRow, endRow, threadSize } = config; + console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`); + let finishCnt = 0; + const finishBooks = []; + const thBookSize = (endRow - startRow) / threadSize; + const books = getBooksFromExcel(startRow, endRow); + + for (let sr = startRow; sr < endRow; sr += thBookSize) { + let er = sr + thBookSize; + if (er > endRow) { + er = endRow; + } + const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } }); + worker.on("message", (message) => { + if (message.type === 'books') { + finishBooks.push(...message.data); + finishCnt++; + if (finishCnt >= threadSize) { + saveBooks(finishBooks); + } + } else if (message.type === 'get-book') { + worker.postMessage({ type: "book", data: books.shift() }); + } + }); + } +} else { + alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; + main(); +} + -- Gitblit v1.9.1