From fccce144a33cfae425b078cdb3af5fbf8916bfe3 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: Sun, 09 Jun 2024 15:21:57 +0800
Subject: [PATCH] 增加多线程、定时退出功能

---
 src/main.mjs |  118 ++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 92 insertions(+), 26 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index a4f743e..16f82a7 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -4,19 +4,27 @@
 import proxy from "selenium-webdriver/proxy.js";
 import axios from "axios";
 import * as fs from "fs";
+import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
+import { resolve } from "path";
 
 /*-------------璇诲彇閰嶇疆---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
 
 /* ------------鏃ュ織-------------- */
-const _log = console.log;
-const logFile = fs.createWriteStream('./logs.log', { flags: 'a', encoding: 'utf8' });
-console.log = function (text) {
-  text = `${new Date().toLocaleString()} ${text ?? ''}`;
-  _log(text);
-  logFile.write(text + '\n');
-};
+let logFile;
+/** Wrap console.log so every timestamped line is also appended to ./logs/logs-<startRow>-<endRow>.log. */
+function initLogger() {
+  const _log = console.log;
+  // A recursive mkdir succeeds when ./logs already exists, so no existsSync pre-check is needed.
+  fs.mkdirSync('./logs', { recursive: true });
+  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' });
+  console.log = function (...args) {
+    const line = `${new Date().toLocaleString()} ${args.join(' ')}`;
+    _log(line);
+    logFile.write(line + '\n');
+  };
+}
 
 /* ----------axios浠g悊------------ */
 const httpsAgent = new HttpsProxyAgent(`http://127.0.0.1:10809`);
@@ -86,11 +94,19 @@
 /**
  * 鏍煎紡鍖栧叧閿瓧
  * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛�
+ * @param {boolean} titleWithNumbers whether digits in the title are kept in the keyword
  * @returns 澶勭悊鍚庣殑鍏抽敭瀛�
  */
-function formatKw(text) {
-  // 鍙繚鐣欎腑鏂囥�佽嫳鏂囥�佹暟瀛楀拰涓嬪垝绾�
-  return text.replace(/[^\u4e00-\u9fa5\w \d]/g, "");
+function formatKw(text, titleWithNumbers) {
+  // Keep only spaces, CJK ideographs, ASCII letters and the accented Latin range
+  // (U+00C0-U+024F); digits are kept only when titleWithNumbers is truthy.
+  const disallowed = titleWithNumbers
+    ? /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g
+    : /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g;
+  // Stripped punctuation leaves runs of spaces; drop the empty tokens so they
+  // neither become "++" in the query nor count towards the 10-word limit.
+  const words = String(text ?? "").replace(disallowed, "").split(" ").filter(Boolean);
+  return words.slice(0, 10).join("+");
 }
 
 
@@ -117,11 +133,11 @@
  * 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
  * @param {*} book 
  */
-async function openSearchPage(book) {
-  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title)}`);
+async function openSearchPage(book, titleWithNumbers) {
+  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`);
   return await retry(async () => {
     // 鑾峰彇椤甸潰
-    const searchUrl = `https://archive.org/search?query=${formatKw(book.title)}`;
+    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`;
     await driver.get(searchUrl);
   }).then(() => true)
     .catch(() => false);
@@ -175,7 +191,7 @@
     await driver.wait(
       until.elementLocated(
         By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`)
-      )
+      ), 15000
     );
   })
     .then(() => true)
@@ -245,6 +261,15 @@
     return new Promise((resolve, reject) => myAxios
       .get(url, { responseType: "stream" })
       .then((response) => {
+        const len = response.headers['content-length'];
+        if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) {
+          // Not a pdf or txt file and larger than 200 MB: do not download
+          book.state = "涓嬭浇澶辫触";
+          book.url = url;
+          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
+          reject(false);
+          return;
+        }
         const stream = response.data;
         const out = fs.createWriteStream(filepath);
         stream.pipe(out);
@@ -260,11 +285,11 @@
           console.error(err);
           book.state = "涓嬭浇澶辫触";
           book.url = url;
-          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title}`);
+          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
           reject(false);
           try {
             out.close();
-            fs.unlink(filepath,(e)=>console.error(e));
+            fs.unlink(filepath, (e) => console.error(e));
           } catch (e) {
             console.error(e);
           }
@@ -274,7 +299,7 @@
         console.error(e);
         book.state = "涓嬭浇澶辫触";
         book.url = url;
-        console.log(`涓嬭浇澶辫触: ${book.id} ${book.title}`);
+        console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
         reject(false);
       }));
   }).catch(e => {
@@ -283,7 +308,12 @@
 }
 
 async function downloadBooks(books) {
+  driver = await createDriver();
   for (const book of books) {
+    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
+      // Timed exit: stop once config.endOfTime minutes have passed since startTime
+      break;
+    }
     bookCount++;
     if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
       // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫�
@@ -292,10 +322,13 @@
     }
     console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`);
     // 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
-    if (!await openSearchPage(book)) {
-      console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`);
-      book.state = "鎵撳紑鎼滅储椤甸潰澶辫触";
-      continue;
+    if (!await openSearchPage(book, true)) {
+      // Try the keyword with digits first; if that search page cannot be opened, retry without digits
+      if (!await openSearchPage(book, false)) {
+        console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`);
+        book.state = "鎵撳紑鎼滅储椤甸潰澶辫触";
+        continue;
+      }
     }
     // 妫�娴嬫悳绱㈢粨鏋�
     const hasBook = await checkSearchResult(book);
@@ -322,7 +355,7 @@
     try {
       await downloadFile(book, url);
       console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`);
-    }catch(e){}
+    } catch (e) { }
     successCount++;
     // 绛変竴娈垫椂闂村啀涓嬩竴涓�
     sleep(getRandomNumber(3000, 10000));
@@ -381,8 +414,10 @@
 let bookCount = 0;
 // 璺宠繃鐨勬暟閲忥紝宸茬粡涓嬭浇杩囨垨娌℃湁鎼滅储鍒扮殑鏁伴噺
 let skipCount = 0;
-const driver = await createDriver();
+// Chrome driver
+let driver;
 function main() {
+  initLogger();
   const books = getBooksFromExcel(config.startRow, config.endRow);
   downloadBooks(books)
     .then(() => {
@@ -392,13 +427,44 @@
       console.error(e);
     })
     .finally(async () => {
-      saveBooks(books);
+      // saveBooks(books);
+      parentPort.postMessage({ type: "books", data: books });
       logFile.close();
       try {
         await driver.close();
         await driver.quit();
-      }catch(e){}
+      } catch (e) { }
     });
 }
 
-main();
+// Multi-threaded execution: the main thread splits the configured row range into
+// per-worker chunks. NOTE(review): adjacent chunks share a boundary row — confirm endRow is exclusive.
+if (isMainThread) {
+  console.log(`绾跨▼鏁帮細${config.threadSize}, 寮�濮嬭锛�${config.startRow}, 缁撴潫琛岋細${config.endRow}`);
+  const { startRow, endRow } = config;
+  const finishBooks = [];
+  // Clamp to at least one thread so a missing or zero threadSize cannot yield NaN/Infinity.
+  const threadSize = Math.max(1, Math.floor(Number(config.threadSize)) || 1);
+  // Parenthesised and rounded up: `endRow - startRow / threadSize` divided only
+  // startRow, giving a fractional chunk nearly as large as the whole range.
+  const thBookSize = Math.max(1, Math.ceil((endRow - startRow) / threadSize));
+  let workerCnt = 0;
+  let finishCnt = 0;
+  for (let sr = startRow; sr < endRow; sr += thBookSize) {
+    const er = Math.min(sr + thBookSize, endRow);
+    workerCnt++;
+    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er } });
+    worker.on("message", (message) => {
+      if (message.type !== 'books') return;
+      finishBooks.push(...message.data);
+      // Compare with the workers actually started, which can be fewer than threadSize.
+      if (++finishCnt >= workerCnt) saveBooks(finishBooks);
+    });
+  }
+} else {
+  // Worker thread: narrow the config to the row range passed in via workerData.
+  config.startRow = workerData.startRow;
+  config.endRow = workerData.endRow;
+  main();
+}
+

--
Gitblit v1.9.1