From b1e4e03c682ecff03aa6a6045eea234082acbd59 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: Tue, 11 Jun 2024 23:10:02 +0800
Subject: [PATCH] 修改文件下载超时时长为10分钟

---
 src/main.mjs |   86 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 18 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index 0e897a6..cc3d0e5 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -4,7 +4,8 @@
 import proxy from "selenium-webdriver/proxy.js";
 import axios from "axios";
 import * as fs from "fs";
-import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
+import path from "path";
+import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
 import { resolve } from "path";
 
@@ -18,7 +19,7 @@
   if (!fs.existsSync('./logs')) {
     fs.mkdirSync('./logs', { recursive: true });
   }
-  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' });
+  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
   console.log = function (...text) {
     text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`;
     _log(text);
@@ -258,8 +259,13 @@
     return;
   }
   await retry(() => {
+    const timeoutTime = 10 * 60 * 1000;
+    const source = axios.CancelToken.source();
+    const timeout = setTimeout(() => {
+      source.cancel("timeout");
+    }, timeoutTime);
     return new Promise((resolve, reject) => myAxios
-      .get(url, { responseType: "stream" })
+      .get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token })
       .then((response) => {
         const len = response.headers['content-length'];
         if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) {
@@ -274,6 +280,7 @@
         const out = fs.createWriteStream(filepath);
         stream.pipe(out);
         stream.on("end", () => {
+          clearTimeout(timeout);
           book.state = `涓嬭浇瀹屾垚`;
           book.format = ext;
           book.file = filepath;
@@ -282,6 +289,7 @@
           resolve(true);
         });
         stream.on("error", (err) => {
+          clearTimeout(timeout);
           console.error(err);
           book.state = "涓嬭浇澶辫触";
           book.url = url;
@@ -296,6 +304,7 @@
         });
       })
       .catch((e) => {
+        clearTimeout(timeout);
         console.error(e);
         book.state = "涓嬭浇澶辫触";
         book.url = url;
@@ -307,14 +316,43 @@
   });
 }
 
+// Reports whether the key "<book.id> <book.isbn>" appears in alreadyDownloadedBooks.
+function isAlreadyDownloaded(book) {
+  return alreadyDownloadedBooks.includes(`${book.id} ${book.isbn}`);
+}
+
+// Requests the next book from the main thread with a 'get-book' message and
+// resolves with the `data` of the first 'book' message received in reply.
+function nextBook() {
+  return new Promise((done) => {
+    const onMessage = (msg) => {
+      if (msg.type !== 'book') { return; }
+      done(msg.data);
+      parentPort.removeListener('message', onMessage);
+    };
+    parentPort.on('message', onMessage);
+    parentPort.postMessage({ type: 'get-book', threadId });
+  });
+}
+
 async function downloadBooks(books) {
   driver = await createDriver();
-  for (const book of books) {
+
+  for (; ;) {
+    const book = await nextBook();
+    if (!book) {
+      break;
+    }
+    books.push(book);
     if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
       // 定时退出
       break;
     }
     bookCount++;
+    if (isAlreadyDownloaded(book)) {
+      skipCount++;
+      continue;
+    }
     if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
       // 跳过没有搜索结果或没有pdf或text文件的书籍
       skipCount++;
@@ -343,14 +381,14 @@
       continue;
     }
     // 等一段时间再打开详情页
+    await sleep(getRandomNumber(1000, 30000));
+    sleep(getRandomNumber(1000, 30000));
     // 打开详情页
     await openBookDetailPage(book, detailPageUrl);
     // 获取下载链接
     const url = await getDownloadUrl(book);
     if (!url) { continue; }
     // 等待一段时间再下载
-    await sleep(getRandomNumber(3000, 10000));
+    await sleep(getRandomNumber(1000, 30000));
     // 下载文件
     try {
       await downloadFile(book, url);
@@ -358,7 +396,7 @@
     } catch (e) { }
     successCount++;
     // 等一段时间再下一个
-    sleep(getRandomNumber(3000, 10000));
+    await sleep(getRandomNumber(1000, 30000));
   }
 }
 
@@ -378,7 +416,7 @@
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
-  fs.writeFile("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
+  fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
   console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
 }
 
@@ -416,9 +454,19 @@
 let skipCount = 0;
 // chrome驱动
 let driver;
+let alreadyDownloadedBooks = [];
+
+// Base names (extension stripped) from ./alreadyDownloadedBooks.txt plus files in ./downloads; a missing source counts as empty.
+function getAlreadyDownloadedBooks() {
+  const text = fs.existsSync('./alreadyDownloadedBooks.txt') ? fs.readFileSync('./alreadyDownloadedBooks.txt', 'utf-8') : '';
+  const books = text.replace(/\r/g, '').split('\n').map(it => it.trim()).filter(it => it);
+  const files = fs.existsSync('./downloads') ? fs.readdirSync('./downloads') : [];
+  return [...books, ...files].map(it => path.basename(it, path.extname(it)).trim());
+}
+
 function main() {
   initLogger();
-  const books = getBooksFromExcel(config.startRow, config.endRow);
+  const books = [];
   downloadBooks(books)
     .then(() => {
       console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
@@ -440,32 +488,34 @@
 // 多进程执行
 if (isMainThread) {
   initLogger();
-  console.log(`绾跨▼鏁帮細${config.threadSize}, 寮�濮嬭锛�${config.startRow}, 缁撴潫琛岋細${config.endRow}`);
-  let startRow = config.startRow;
-  let endRow = config.endRow;
+  const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
+  const { startRow, endRow, threadSize } = config;
+  console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
   let finishCnt = 0;
   const finishBooks = [];
-  const threadSize = config.threadSize;
-  const thBookSize = (endRow - startRow) / threadSize
+  const thBookSize = (endRow - startRow) / threadSize;
+  const books = getBooksFromExcel(startRow, endRow);
+
   for (let sr = startRow; sr < endRow; sr += thBookSize) {
     let er = sr + thBookSize;
     if (er > endRow) {
       er = endRow;
     }
-    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er } });
+    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } });
     worker.on("message", (message) => {
       if (message.type === 'books') {
         finishBooks.push(...message.data);
         finishCnt++;
-        if (finishCnt >= config.threadSize) {
+        if (finishCnt >= threadSize) {
           saveBooks(finishBooks);
         }
+      } else if (message.type === 'get-book') {
+        worker.postMessage({ type: "book", data: books.shift() });
       }
     });
   }
 } else {
-  config.startRow = workerData.startRow;
-  config.endRow = workerData.endRow;
+  alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
   main();
 }
 

--
Gitblit v1.9.1