From 35a49bbd1b9c131a3a2db734f1351837022930a5 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期二, 11 六月 2024 22:21:11 +0800
Subject: [PATCH] 图书下载修改多线程并发下载分配策略,统一由主线程分配给子线程下载图书信息

---
 src/main.mjs |  234 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 192 insertions(+), 42 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index 7b3a874..0e0a42f 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -4,16 +4,28 @@
 import proxy from "selenium-webdriver/proxy.js";
 import axios from "axios";
 import * as fs from "fs";
+import path from "path";
+import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
+import { resolve } from "path";
+
+/*-------------Load configuration---------------*/
+let config = JSON.parse(fs.readFileSync('./config.json'));
 
 /* ------------鏃ュ織-------------- */
-const _log = console.log;
-const logFile = fs.createWriteStream('./logs.log');
-console.log = function (text) {
-  text = `${new Date().toLocaleString()} ${text ?? ''}`;
-  _log(text);
-  logFile.write(text + '\n');
-};
+let logFile;
+/** Opens this thread's log file under ./logs and makes console.log also append timestamped lines to it. */
+function initLogger() {
+  const printToConsole = console.log;
+  if (!fs.existsSync('./logs')) { fs.mkdirSync('./logs', { recursive: true }); }
+  const logName = `logs-${config.startRow}-${config.endRow}-thread${threadId}.log`;
+  logFile = fs.createWriteStream(`./logs/${logName}`, { flags: 'a', encoding: 'utf8' });
+  console.log = function (...parts) {
+    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
+    printToConsole(line);
+    logFile.write(`${line}\n`);
+  };
+}
 
 /* ----------axios浠g悊------------ */
 const httpsAgent = new HttpsProxyAgent(`http://127.0.0.1:10809`);
@@ -61,6 +73,12 @@
  */
 async function createDriver() {
   const opts = new ChromeOptions();
+  if (config.headless) {
+    opts.addArguments("--headless");//寮�鍚棤澶存ā寮�
+  }
+  if (config.disableGpu) {
+    opts.addArguments("--disable-gpu");//绂佹gpu娓叉煋
+  }
   opts.addArguments("--ignore-ssl-error"); // 蹇界暐ssl閿欒
   opts.addArguments("--no-sandbox"); // 绂佺敤娌欑洅妯″紡
   opts.addArguments("blink-settings=imagesEnabled=false"); //绂佺敤鍥剧墖鍔犺浇
@@ -77,14 +95,21 @@
 /**
  * 鏍煎紡鍖栧叧閿瓧
  * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛�
+ * @param {boolean} titleWithNumbers whether digits from the title are kept in the keyword
  * @returns 澶勭悊鍚庣殑鍏抽敭瀛�
  */
-function formatKw(text) {
-  // 鍙繚鐣欎腑鏂囥�佽嫳鏂囥�佹暟瀛楀拰涓嬪垝绾�
-  return text.replace(/[^\u4e00-\u9fa5\w \d]/g, "");
+function formatKw(text, titleWithNumbers) {
+  // Keep spaces, CJK ideographs, ASCII letters and accented Latin letters
+  // (U+00C0-U+024F); digits survive only when titleWithNumbers is truthy.
+  const unwanted = titleWithNumbers
+    ? /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g
+    : /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g;
+  // Drop empty tokens so runs of spaces left by stripped punctuation cannot
+  // emit "++" or use up the 10-word budget; join the first 10 words with "+".
+  const words = text.replace(unwanted, "").split(" ").filter(Boolean);
+  return words.slice(0, 10).join("+");
 }
 
-const driver = await createDriver();
 
 async function sleep(ms) {
   return new Promise((resolve) => {
@@ -109,11 +134,11 @@
  * 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
  * @param {*} book 
  */
-async function openSearchPage(book) {
-  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title)}`);
+async function openSearchPage(book, titleWithNumbers) {
+  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`);
   return await retry(async () => {
     // 鑾峰彇椤甸潰
-    const searchUrl = `https://archive.org/search?query=${formatKw(book.title)}`;
+    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`;
     await driver.get(searchUrl);
   }).then(() => true)
     .catch(() => false);
@@ -167,7 +192,7 @@
     await driver.wait(
       until.elementLocated(
         By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`)
-      )
+      ), 15000
     );
   })
     .then(() => true)
@@ -223,14 +248,32 @@
 
 async function downloadFile(book, url) {
   console.log(`涓嬭浇鏂囦欢: ${url}`);
+  const ext = url.split(".").pop();
+  const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
+  if (fs.existsSync(filepath)) {
+    book.state = `涓嬭浇瀹屾垚`;
+    book.format = ext;
+    book.file = filepath;
+    book.url = url;
+    console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
+    return;
+  }
   await retry(() => {
     return new Promise((resolve, reject) => myAxios
       .get(url, { responseType: "stream" })
       .then((response) => {
+        const len = response.headers['content-length'];
+        if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) {
+          // Not a pdf or txt file and larger than 200 MB: do not download
+          book.state = "涓嬭浇澶辫触";
+          book.url = url;
+          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
+          reject(false);
+          return;
+        }
         const stream = response.data;
-        const ext = url.split(".").pop();
-        const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
-        stream.pipe(fs.createWriteStream(filepath));
+        const out = fs.createWriteStream(filepath);
+        stream.pipe(out);
         stream.on("end", () => {
           book.state = `涓嬭浇瀹屾垚`;
           book.format = ext;
@@ -239,30 +282,83 @@
           console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
           resolve(true);
         });
+        stream.on("error", (err) => {
+          console.error(err);
+          book.state = "涓嬭浇澶辫触";
+          book.url = url;
+          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
+          reject(false);
+          try {
+            out.close();
+            fs.unlink(filepath, (e) => console.error(e));
+          } catch (e) {
+            console.error(e);
+          }
+        });
       })
       .catch((e) => {
         console.error(e);
         book.state = "涓嬭浇澶辫触";
         book.url = url;
-        console.log(`涓嬭浇澶辫触: ${book.id} ${book.title}`);
+        console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
         reject(false);
       }));
+  }).catch(e => {
+    return false
+  });
+}
+
+/** True when the book's "<id> <isbn>" key is present in alreadyDownloadedBooks. */
+function isAlreadyDownloaded(book) {
+  return alreadyDownloadedBooks.includes(`${book.id} ${book.isbn}`);
+}
+
+/** Asks the main thread for the next book; resolves with the payload of its 'book' reply. */
+function nextBook() {
+  return new Promise((done) => {
+    const onReply = ({ type, data }) => {
+      if (type === 'book') {
+        parentPort.removeListener('message', onReply);
+        done(data);
+      }
+    };
+    parentPort.on('message', onReply);
+    parentPort.postMessage({ type: 'get-book', threadId });
   });
 }
 
 async function downloadBooks(books) {
-  for (const book of books) {
-    if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢")) {
-      // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫�
-      continue;
+  driver = await createDriver();
+
+  for (; ;) {
+    const book = await nextBook();
+    if (!book) {
+      break;
+    }
+    books.push(book);
+    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
+      // Timed exit: stop once config.endOfTime minutes have elapsed
+      break;
     }
     bookCount++;
+    if (isAlreadyDownloaded(book)) {
+      skipCount++;
+      continue;
+    }
+    if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
+      // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫�
+      skipCount++;
+      continue;
+    }
     console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`);
     // 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
-    if (!await openSearchPage(book)) {
-      console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`);
-      book.state = "鎵撳紑鎼滅储椤甸潰澶辫触";
-      continue;
+    if (!await openSearchPage(book, true)) {
+      // The keyword with digits is tried first; if that page cannot be opened, retry without digits
+      if (!await openSearchPage(book, false)) {
+        console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`);
+        book.state = "鎵撳紑鎼滅储椤甸潰澶辫触";
+        continue;
+      }
     }
     // 妫�娴嬫悳绱㈢粨鏋�
     const hasBook = await checkSearchResult(book);
@@ -277,30 +373,30 @@
       continue;
     }
     // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤�
-    sleep(getRandomNumber(3000, 10000));
+    sleep(getRandomNumber(1000, 30000));
     // 鎵撳紑璇︽儏椤�
     await openBookDetailPage(book, detailPageUrl);
     // 鑾峰彇涓嬭浇閾炬帴
     const url = await getDownloadUrl(book);
     if (!url) { continue; }
     // 绛夊緟涓�娈垫椂闂村啀涓嬭浇
-    await sleep(getRandomNumber(3000, 10000));
+    await sleep(getRandomNumber(1000, 30000));
     // 涓嬭浇鏂囦欢
-    await downloadFile(book, url);
-    console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`);
+    try {
+      await downloadFile(book, url);
+      console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`);
+    } catch (e) { }
     successCount++;
     // 绛変竴娈垫椂闂村啀涓嬩竴涓�
-    sleep(getRandomNumber(3000, 10000));
+    sleep(getRandomNumber(1000, 30000));
   }
-  await driver.close();
-  await driver.quit();
 }
 
 function saveBooks(books) {
   console.log("淇濆瓨涓嬭浇鐘舵�佹暟鎹�");
   const workSheets = xlsx.parse("銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
   const sheet = workSheets[0];
-  const data = sheet.data.slice(2);
+  const data = sheet.data;
   for (const book of books) {
     const index = data.findIndex((row) => row[0] === book.id);
     if (index > -1) {
@@ -312,7 +408,7 @@
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
-  fs.writeFile("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
+  fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
   console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
 }
 
@@ -346,18 +442,72 @@
 let successCount = 0;
 // 鍥句功鏁伴噺
 let bookCount = 0;
+// Number of skipped books: already downloaded, or previously marked as having no usable result
+let skipCount = 0;
+// Chrome WebDriver instance
+let driver;
+let alreadyDownloadedBooks = [];
+
+/** Returns extension-stripped base names from alreadyDownloadedBooks.txt lines plus ./downloads entries; a missing source counts as empty. */
+function getAlreadyDownloadedBooks() {
+  const text = fs.existsSync('./alreadyDownloadedBooks.txt') ? fs.readFileSync('./alreadyDownloadedBooks.txt', 'utf-8') : '';
+  const books = text.replace(/\r/g, '').split('\n').map(it => it.trim()).filter(it => it);
+  const files = fs.existsSync('./downloads') ? fs.readdirSync('./downloads') : [];
+  return books.concat(files).map(it => path.basename(it, path.extname(it)).trim());
+}
 
 function main() {
-  const range = JSON.parse(fs.readFileSync('./config.json'));
-  const books = getBooksFromExcel(range.startRow, range.endRow);
+  initLogger();
+  const books = []; // downloadBooks pushes every book this worker receives into it
   downloadBooks(books)
     .then(() => {
-      console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝澶辫触${bookCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+      console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
     })
-    .finally(() => {
-      saveBooks(books);
+    .catch(e => {
+      console.error(e);
+    })
+    .finally(async () => {
+      // Results go to the main thread, whose 'books' message handler calls saveBooks.
+      parentPort.postMessage({ type: "books", data: books });
       logFile.close();
+      try {
+        await driver.close();
+        await driver.quit();
+      } catch (e) { } // best-effort browser shutdown; errors here are ignored
     });
 }
 
-main();
+// Worker-thread execution: the main thread dispatches books, workers download them
+if (isMainThread) {
+  initLogger();
+  const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
+  const { startRow, endRow, threadSize } = config;
+  console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
+  // Books are handed out on demand, so start exactly threadSize workers (at least 1).
+  const workerCount = Math.max(1, Math.floor(Number(threadSize)) || 1);
+  const pending = new Set();
+  const finishBooks = [];
+  const books = getBooksFromExcel(startRow, endRow);
+  const finish = (worker) => { // saves exactly once, after the last pending worker is done
+    if (pending.delete(worker) && pending.size === 0) {
+      saveBooks(finishBooks);
+    }
+  };
+  for (let i = 0; i < workerCount; i++) {
+    const worker = new Worker("./src/main.mjs", { workerData: { startRow, endRow, alreadyDownloadedBooks } });
+    pending.add(worker);
+    worker.on("message", (message) => {
+      if (message.type === 'books') {
+        finishBooks.push(...message.data);
+        finish(worker);
+      } else if (message.type === 'get-book') {
+        // books.shift() yields undefined once the queue is empty, which tells the worker to stop.
+        worker.postMessage({ type: "book", data: books.shift() });
+      }
+    });
+    worker.on("error", (err) => { console.error(err); finish(worker); }); // a crashed worker still counts as done
+  }
+} else {
+  alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
+  main();
+}
+

--
Gitblit v1.9.1