From fccce144a33cfae425b078cdb3af5fbf8916bfe3 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期日, 09 六月 2024 15:21:57 +0800 Subject: [PATCH] 增加多线程、定时退出功能 --- config.json | 11 +++ .gitignore | 3 src/main.mjs | 118 ++++++++++++++++++++++++++++++-------- README.md | 4 + 4 files changed, 108 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index f520366..e221ade 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ node_modules **/*.xlsx downloads -**/*.log \ No newline at end of file +**/*.log +logs \ No newline at end of file diff --git a/README.md b/README.md index 85ddb89..9a26d05 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,10 @@ "startRow": 1, // 缁撴潫琛岋紝涓嶅寘鍚� "endRow": 101, + // 绾跨▼鏁� + "threadSize": 4, + // 瀹氭椂缁撴潫鏃堕棿锛屽崟浣嶅垎閽燂紝0琛ㄧず涓嶈缃畾鏃剁粨鏉熸椂闂� + "endOfTime": 0, // 鏄惁浠ユ棤澶存ā寮忓惎鍔╟hrome "headless": true, // 鏄惁绂佺敤GPU diff --git a/config.json b/config.json index 62b0a52..cacd154 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,16 @@ { + "//璧峰琛屽彿锛屼粠0寮�濮嬶紝绗�0琛屾槸琛ㄥご锛屽寘鍚�": "//", "startRow": 1, - "endRow": 101, + + "//缁撴潫琛屽彿锛屼粠0寮�濮嬶紝涓嶅寘鍚琛�": "//", + "endRow": 2001, + "//绾跨▼鏁�": "//", + "threadSize": 4, + + "//瀹氭椂缁撴潫鏃堕棿锛屽崟浣嶅垎閽燂紝0琛ㄧず涓嶈缃畾鏃剁粨鏉熸椂闂�": "//", + "endOfTime": 60, + "//鏃犲ご妯″紡": "//", "headless": true, diff --git a/src/main.mjs b/src/main.mjs index a4f743e..16f82a7 100644 --- a/src/main.mjs +++ b/src/main.mjs @@ -4,19 +4,27 @@ import proxy from "selenium-webdriver/proxy.js"; import axios from "axios"; import * as fs from "fs"; +import { Worker, isMainThread, parentPort, workerData } from 'worker_threads'; import { HttpsProxyAgent } from "https-proxy-agent"; +import { resolve } from "path"; /*-------------璇诲彇閰嶇疆---------------*/ let config = JSON.parse(fs.readFileSync('./config.json')); /* ------------鏃ュ織-------------- */ -const _log = console.log; -const logFile = fs.createWriteStream('./logs.log', { flags: 'a', encoding: 'utf8' }); -console.log = function (text) { - text = `${new Date().toLocaleString()} ${text ?? ''}`; - _log(text); - logFile.write(text + '\n'); -}; +let logFile; +function initLogger() { + const _log = console.log; + if (!fs.existsSync('./logs')) { + fs.mkdirSync('./logs', { recursive: true }); + } + logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' }); + console.log = function (...text) { + text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`; + _log(text); + logFile.write(text + '\n'); + }; +} /* ----------axios浠g悊------------ */ const httpsAgent = new HttpsProxyAgent(`http://127.0.0.1:10809`); @@ -86,11 +94,19 @@ /** * 鏍煎紡鍖栧叧閿瓧 * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛� + * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛� * @returns 澶勭悊鍚庣殑鍏抽敭瀛� */ -function formatKw(text) { - // 鍙繚鐣欎腑鏂囥�佽嫳鏂囥�佹暟瀛楀拰涓嬪垝绾� - return text.replace(/[^\u4e00-\u9fa5\w \d]/g, ""); +function formatKw(text, titleWithNumbers) { + // 鍙繚鐣欑┖鏍笺�佷腑鏂囥�佽嫳鏂囥�佹硶鏂囥�佸痉鏂囥�佸笇鑵婃枃 + const regex = /[^\u4e00-\u9fa5\w\s\d]/g; + if (titleWithNumbers) { + text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g, ""); + } else { + text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g, ""); + } + text = text.split(' ').slice(0, 10).join("+"); + return text; } @@ -117,11 +133,11 @@ * 鎵撳紑鎼滅储椤甸潰骞舵悳绱� * @param {*} book */ -async function openSearchPage(book) { - console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title)}`); +async function openSearchPage(book, titleWithNumbers) { + console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); return await retry(async () => { // 鑾峰彇椤甸潰 - const searchUrl = `https://archive.org/search?query=${formatKw(book.title)}`; + const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; await driver.get(searchUrl); }).then(() => true) .catch(() => false); @@ -175,7 +191,7 @@ await driver.wait( until.elementLocated( By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`) - ) + ), 15000 ); }) .then(() => true) @@ -245,6 +261,15 @@ return new Promise((resolve, reject) => myAxios .get(url, { responseType: "stream" }) .then((response) => { + const len = response.headers['content-length']; + if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { + // 涓嶆槸pdf鎴杢xt鏂囦欢锛屼笖鏂囦欢澶т簬200M锛屼笉涓嬭浇 + book.state = "涓嬭浇澶辫触"; + book.url = url; + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); + reject(false); + return; + } const stream = response.data; const out = fs.createWriteStream(filepath); stream.pipe(out); @@ -260,11 +285,11 @@ console.error(err); book.state = "涓嬭浇澶辫触"; book.url = url; - console.log(`涓嬭浇澶辫触: ${book.id} ${book.title}`); + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); reject(false); try { out.close(); - fs.unlink(filepath,(e)=>console.error(e)); + fs.unlink(filepath, (e) => console.error(e)); } catch (e) { console.error(e); } @@ -274,7 +299,7 @@ console.error(e); book.state = "涓嬭浇澶辫触"; book.url = url; - console.log(`涓嬭浇澶辫触: ${book.id} ${book.title}`); + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); reject(false); })); }).catch(e => { @@ -283,7 +308,12 @@ } async function downloadBooks(books) { + driver = await createDriver(); for (const book of books) { + if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) { + // 瀹氭椂閫�鍑� + break; + } bookCount++; if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫� @@ -292,10 +322,13 @@ } console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`); // 鎵撳紑鎼滅储椤甸潰骞舵悳绱� - if (!await openSearchPage(book)) { - console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`); - book.state = "鎵撳紑鎼滅储椤甸潰澶辫触"; - continue; + if (!await openSearchPage(book, true)) { + // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧 + if (!await openSearchPage(book, false)) { + console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`); + book.state = "鎵撳紑鎼滅储椤甸潰澶辫触"; + continue; + } } // 妫�娴嬫悳绱㈢粨鏋� const hasBook = await checkSearchResult(book); @@ -322,7 +355,7 @@ try { await downloadFile(book, url); console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`); - }catch(e){} + } catch (e) { } successCount++; // 绛変竴娈垫椂闂村啀涓嬩竴涓� sleep(getRandomNumber(3000, 10000)); @@ -381,8 +414,10 @@ let bookCount = 0; // 璺宠繃鐨勬暟閲忥紝宸茬粡涓嬭浇杩囨垨娌℃湁鎼滅储鍒扮殑鏁伴噺 let skipCount = 0; -const driver = await createDriver(); +// chrome椹卞姩 +let driver; function main() { + initLogger(); const books = getBooksFromExcel(config.startRow, config.endRow); downloadBooks(books) .then(() => { @@ -392,13 +427,44 @@ console.error(e); }) .finally(async () => { - saveBooks(books); + // saveBooks(books); + parentPort.postMessage({ type: "books", data: books }); logFile.close(); try { await driver.close(); await driver.quit(); - }catch(e){} + } catch (e) { } }); } -main(); +// 澶氳繘绋嬫墽琛� +if (isMainThread) { + console.log(`绾跨▼鏁帮細${config.threadSize}, 寮�濮嬭锛�${config.startRow}, 缁撴潫琛岋細${config.endRow}`); + let startRow = config.startRow; + let endRow = config.endRow; + let finishCnt = 0; + const finishBooks = []; + const threadSize = config.threadSize; + const thBookSize = endRow - startRow / threadSize + for (let sr = startRow; sr < endRow; sr += thBookSize) { + let er = sr + thBookSize; + if (er > endRow) { + er = endRow; + } + const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er } }); + worker.on("message", (message) => { + if (message.type === 'books') { + finishBooks.push(...message.data); + finishCnt++; + if (finishCnt >= config.threadSize) { + saveBooks(finishBooks); + } + } + }); + } +} else { + config.startRow = workerData.startRow; + config.endRow = workerData.endRow; + main(); +} + -- Gitblit v1.9.1