From ce8cb9c851fa66c7c2902ceb57e369d3cecf1a28 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期四, 01 八月 2024 01:48:56 +0800
Subject: [PATCH] 复制bt下载的文件,bt任务控制

---
 src/main.mjs |  539 +++++++++++++++++++++++++++++++++++------------------------
 1 files changed, 322 insertions(+), 217 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index a53edc5..2fbe23a 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -1,13 +1,14 @@
 import xlsx from "node-xlsx";
-import { Builder, Browser, until, By } from "selenium-webdriver";
-import { Options as ChromeOptions } from "selenium-webdriver/chrome.js";
-import proxy from "selenium-webdriver/proxy.js";
 import axios from "axios";
 import * as fs from "fs";
 import path from "path";
-import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
+import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
-import { resolve } from "path";
+import { execFileSync } from "child_process";
+import wordsjs from 'wordlist-js';
+import usPlaceList from "./us-place-list.mjs";
+import usPeronNameList from "./us-pseron-name-list.mjs";
+import * as pdfLib from 'pdf-lib';
 
 /*-------------璇诲彇閰嶇疆---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
@@ -19,7 +20,7 @@
   if (!fs.existsSync('./logs')) {
     fs.mkdirSync('./logs', { recursive: true });
   }
-  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' });
+  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
   console.log = function (...text) {
     text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`;
     _log(text);
@@ -33,6 +34,147 @@
   proxy: false,
   httpsAgent,
 });
+
+/**
+ * Get the number of pages in a PDF file.
+ * @param {string} filepath path of the PDF file
+ * @returns {Promise<number>} page count; rejects when the file cannot be read or parsed
+ */
+async function getPdfPages(filepath) {
+  const buf = fs.readFileSync(filepath); // synchronous read of the whole file into memory
+  const pdfDoc = await pdfLib.PDFDocument.load(buf, { ignoreEncryption: true }); // presumably lets encrypted PDFs load instead of throwing — confirm against pdf-lib
+  const pages = pdfDoc.getPages().length;
+  return pages;
+}
+
+function allWords() { // Build a lookup object whose keys are all words from the word lists (value is always true).
+  const words = {};
+  wordsjs.usPlaces = usPlaceList; // attach the US place-name list as an extra word list
+  wordsjs.usPeronNameList = usPeronNameList; // attach the US person-name list as an extra word list
+  for (const key in wordsjs.default) { // NOTE(review): the loop reads wordsjs.default while the two lists above were set on wordsjs — confirm they are the same object, otherwise the extra lists are never included
+    if (Object.hasOwnProperty.call(wordsjs.default, key)) {
+      for (const word of wordsjs.default[key]) { // assumes every own property is an iterable of strings — TODO confirm
+        words[word] = true;
+      }
+    }
+  }
+  return words;
+}
+// Built once at module load; incorrectWordRatio looks words up in this object.
+const wordsMap = allWords();
+
+/**
+ * Estimate the word count of a string: runs of spaces are collapsed to one space, then the remaining spaces are counted.
+ * @param {string} str input string
+ * @returns {number} number of spaces after collapsing (e.g. "a b c" gives 2; a leading or trailing space is counted too)
+ */
+function countWordSize(str) {
+  let count = 0;
+  str = str.replace(/[ ]{2,}/g, ' '); // collapse runs of two or more spaces
+  for (let i = 0; i < str.length; i++) {
+    if (str[i] === ' ') {
+      count++;
+    }
+  }
+  return count;
+}
+
+/**
+ * Ratio of space-separated tokens that are not found in wordsMap; tokens containing a digit are never counted as incorrect.
+ * @param {string} text text to check
+ * @returns {number} incorrect tokens divided by total tokens (an empty token, e.g. from an empty string, counts as incorrect unless wordsMap has an empty key)
+ */
+function incorrectWordRatio(text) {
+  text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1"); // collapse spaces, then drop one punctuation character that directly follows a letter
+  const words = text.split(' '); // always has at least one element, so the division below never divides by zero
+  const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length; // lookup is case-insensitive via lower-casing
+  return incorrectWordCnt / words.length;
+}
+
+/**
+ * Share of symbol characters in the text, 0 ~ 1 (NaN for an empty string because it divides by text.length).
+ * @param {string} text text to inspect
+ */
+function symbolRatio(text) {
+  // Count of characters other than ASCII letters, digits and spaces, divided by the total length
+  return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length;
+}
+
+/**
+ * Clean extracted text: strip a leading books.google.com preamble, normalise whitespace and drop lines that look like noise. Returns the cleaned text.
+ * @param {string} text text to clean
+ */
+function cleanText(text) {
+  text = text.replace(/(\r)/g, ''); // remove carriage returns
+  const googlePage = text.substring(0, 10000); // only the first 10000 characters are searched for the preamble
+  if (googlePage.includes('google')) {
+    text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000); // greedy: removes everything up to the last "books.google.com" in that prefix
+  }
+  // if (!/.{170,}/g.test(text) || text.includes('google')) {
+  text = text.replace(/[ ]{2,}/g, ' ')
+  if (!/.{170,}/g.test(text)) {
+    // No line reaches 170 characters. NOTE(review): in this branch the pattern below (170+ characters before a newline) cannot match, so the replace is a no-op — confirm the intended condition
+    text = text.replace(/(.{170,})\n/g, '$1');
+  }
+  text = text.replace(/\n+/g, '\n'); // collapse blank lines
+  text = text.replace(/-\n/g, '-'); // join a line ending in "-" with the next line, keeping the hyphen
+  const lines = text.split('\n');
+  const result = [];
+  for (const line of lines) {
+    // Drop lines whose symbol ratio is too high (above 0.2) when most of their words are unknown
+    const incorrectRatio = incorrectWordRatio(line);
+    if (symbolRatio(line) > 0.2) {
+      if (incorrectRatio > 0.65) {
+        continue;
+      }
+    }
+    // With spaces removed: drop lines that repeat one non-digit character 3+ times in a row, when the line is short or mostly unknown words
+    const wordSize = countWordSize(line);
+    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) {
+      if (wordSize < 5 || incorrectRatio > 0.65) {
+        continue;
+      }
+    }
+    // Drop lines with three or more consecutive punctuation marks and an error ratio above 0.65
+    if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) {
+      continue;
+    }
+    // Original comment: "drop lines with too few words". NOTE(review): the check is wordSize > 5, i.e. it drops longer lines with a high error ratio — confirm which is intended
+    if (wordSize > 5 && incorrectRatio > 0.65) {
+      continue;
+    }
+    // Drop lines that mention google (case-insensitive)
+    if (/.*(google).*/ig.test(line)) {
+      continue;
+    }
+    // Drop lines with at most one character left after trimming and removing the listed symbols
+    const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, '');
+    if (ret.length <= 1) {
+      continue;
+    }
+    if (ret == 'Digitized by') {
+      continue;
+    }
+    result.push(ret); // the trimmed, symbol-stripped form is kept, not the raw line
+  }
+  text = result.join('\n');
+  // }
+  return text;
+}
+
+/**
+ * Extract an archive with ./7za.exe into a per-thread temp directory and copy the largest extracted entry to txtFile.
+ * @param {string} zipFile path of the archive
+ * @param {string} txtFile destination path of the text file
+ */
+function unzip(zipFile, txtFile) {
+  const tmpdir = `./tmpdir/${threadId}`; // one directory per worker thread so threads do not clash
+  execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]) // presumably extract, overwrite without prompting, output to tmpdir — confirm against 7-Zip docs; relies on a 7za.exe in the working directory
+  const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file })) // "size" holds the whole Stats object
+    .sort((a, b) => a.size.size - b.size.size).pop(); // ascending by byte size, so pop() takes the largest; NOTE(review): undefined when the archive is empty
+  fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true }); // NOTE(review): confirm "overwrite" is a recognised cpSync option
+  fs.rmSync(`${tmpdir}`, { recursive: true }); // not reached if any call above throws
+}
 
 /**
  * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅
@@ -68,45 +210,18 @@
 }
 
 /**
- * 鍒涘缓娴忚鍣ㄩ┍鍔�
- * @returns chrome娴忚鍣ㄩ┍鍔�
- */
-async function createDriver() {
-  const opts = new ChromeOptions();
-  if (config.headless) {
-    opts.addArguments("--headless");//寮�鍚棤澶存ā寮�
-  }
-  if (config.disableGpu) {
-    opts.addArguments("--disable-gpu");//绂佹gpu娓叉煋
-  }
-  opts.addArguments("--ignore-ssl-error"); // 蹇界暐ssl閿欒
-  opts.addArguments("--no-sandbox"); // 绂佺敤娌欑洅妯″紡
-  opts.addArguments("blink-settings=imagesEnabled=false"); //绂佺敤鍥剧墖鍔犺浇
-  // proxy
-  opts.setProxy(proxy.manual({ http: 'http://127.0.0.1:10809', https: 'http://127.0.0.1:10809' }))
-  const driver = await new Builder()
-    .setChromeOptions(opts)
-    .forBrowser(Browser.CHROME)
-    .build();
-  driver.manage().setTimeouts({ implicit: 10000 });
-  return driver;
-}
-
-/**
  * 鏍煎紡鍖栧叧閿瓧
  * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛�
  * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛�
  * @returns 澶勭悊鍚庣殑鍏抽敭瀛�
  */
 function formatKw(text, titleWithNumbers) {
-  // 鍙繚鐣欑┖鏍笺�佷腑鏂囥�佽嫳鏂囥�佹硶鏂囥�佸痉鏂囥�佸笇鑵婃枃
-  const regex = /[^\u4e00-\u9fa5\w\s\d]/g;
   if (titleWithNumbers) {
-    text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g, "");
+    text = text; // no-op: the title is used as-is, digits included
   } else {
-    text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g, "");
+    text = text.replace(/[\d]/g, ""); // strip every digit from the title
   }
-  text = text.split(' ').slice(0, 10).join("+");
+  text = text.split(' ').slice(0, 6).join("+"); // keep at most the first six space-separated pieces, joined with "+"
   return text;
 }
 
@@ -131,162 +246,109 @@
 }
 
 /**
- * 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
+ * Get the URL of the book's detail page
  * @param {*} book 
  */
-async function openSearchPage(book, titleWithNumbers) {
-  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`);
+async function getBookDetailPageUrl(book, titleWithNumbers) { // Resolves to the first search hit's detail-page URL, or '' when there is no hit or the request keeps failing
+  const kw = formatKw(book.title, titleWithNumbers);
+  const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`; // only sent as the client_url query parameter below
+  const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}` // NOTE(review): kw joins words with "+", which encodeURIComponent turns into %2B — confirm the service treats that as a separator
+  console.log(`鎵撳紑鎼滅储: ${searchUrl}`);
   return await retry(async () => {
-    // 鑾峰彇椤甸潰
-    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`;
-    await driver.get(searchUrl);
-  }).then(() => true)
-    .catch(() => false);
-}
-
-/**
- * 妫�娴嬫悳绱㈢粨鏋�
- * @param {*} book 
- * @returns true: 鏈夋悳绱㈢粨鏋滐紝false: 娌℃湁鎼滅储缁撴灉
- */
-async function checkSearchResult(book) {
-  console.log(`妫�娴嬫悳绱㈢粨鏋渀);
-  return await retry(async () => {
-    const text = await driver.executeScript(`return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#content-container > empty-placeholder").shadowRoot.querySelector("div > h2").textContent`);
-    if (text && text.includes("Your search did not match any items in the Archive. Try different keywords or a more general search.")) {
-      // 娌℃湁鎼滅储缁撴灉
-      book.state = "娌℃湁鎼滅储缁撴灉";
-      console.log(`娌℃湁鎼滅储缁撴灉: ${book.id} ${book.title}`);
-      return false;
-    } else {
-      return true;
+    const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }) // browser-like User-Agent header
+    const { total, hits } = resp.data.response.body.hits // assumes the JSON has response.body.hits.{total,hits} — TODO confirm against the service
+    if (total === 0) {
+      return '';
     }
-  }, 2)
-    .catch(() => {
-      return true;
-    });
-}
-
-async function findBookDetailPageUrl(book) {
-  console.log(`鏌ユ壘璇︽儏椤祏rl`);
-  return retry(async () => {
-    let detailPageUrl;
-    try {
-      detailPageUrl = await driver.executeScript(
-        `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article:nth-child(2) > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`
-      );
-    } catch (e) {
-      detailPageUrl = await driver.executeScript(
-        `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`
-      );
-    }
-    return detailPageUrl;
+    const hit = hits[0]; // only the first hit is used (hits_per_page=1)
+    const { identifier, title, creator } = hit.fields // title and creator are destructured but not used
+    return `https://archive.org/details/${identifier}`;
   })
     .catch(() => '');
 }
 
 async function openBookDetailPage(book, detailPageUrl) {
-  console.log(`鎵撳紑璇︽儏: https://archive.org${detailPageUrl}`);
+  console.log(`鎵撳紑璇︽儏: ${detailPageUrl}`);
   return await retry(async () => {
-    await driver.get(`https://archive.org${detailPageUrl}`);
-    await driver.wait(
-      until.elementLocated(
-        By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`)
-      ), 15000
-    );
+    const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); // fetch the detail page HTML
+    const html = resp.data;
+    const data = JSON.parse(/<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/g.exec(html)[1]); // metadata JSON is embedded in a hidden input; a missing input throws here and is handled by retry/catch
+    book.publisher = data.metadata.publisher;
+    book.pubDate = data.metadata.date;
+    const identifier = data.metadata.identifier;
+    const fileData = data.files.find(f => f.format === 'Text PDF'); // only a "Text PDF" file is accepted
+    if (!fileData) {
+      book.state = "娌℃湁pdf鎴杢ext鏂囦欢"; // record the state so downloadBooks skips this book later and the result sheet shows why
+      return '';
+    }
+    return `https://archive.org/download/${identifier}/${fileData.name}`; // download URL of the file; '' above means no usable file
   })
-    .then(() => true)
     .catch(() => {
       book.state = "鎵撳紑璇︽儏椤靛け璐�";
       console.log(`鎵撳紑璇︽儏椤靛け璐�: ${book.id} ${book.title}`);
-      return false;
-    });
-}
-
-async function getDownloadUrl(book) {
-  console.log(`鑾峰彇涓嬭浇閾炬帴`);
-  function getFullUrl(url) {
-    if (!url) { return ''; }
-    return url.startsWith("http") ? url : `https://archive.org${url}`;
-  }
-  return await retry(async () => {
-    const elements = await driver.findElements(
-      By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div/a`)
-    );
-
-    let pdfUrl = "";
-    let textUrl = "";
-    for (const el of elements) {
-      let text = await el.getText();
-      if (text) {
-        text = text.trim().split("\n")[0];
-        const href = getFullUrl(await el.getAttribute("href"));
-        if (text.toLowerCase() === "pdf") {
-          pdfUrl = href;
-        } else if (text.toLowerCase() === "full text") {
-          textUrl = href;
-        } else if (text.toLowerCase() === "ocr search text") {
-          textUrl = href;
-        }
-      }
-    }
-
-    if (pdfUrl) {
-      return pdfUrl;
-    } else if (textUrl) {
-      return textUrl;
-    } else {
-      book.state = "娌℃湁pdf鎴杢ext鏂囦欢";
-      return ''
-    }
-  })
-    .catch(() => {
-      book.state = "娌℃湁pdf鎴杢ext鏂囦欢";
       return '';
     });
 }
 
+/**
+ * Extract the text between the first "<pre>" and the first "</pre>" of an HTML document.
+ * @param {string} text HTML text; input that does not contain "<!DOCTYPE html>" is returned unchanged
+ * @returns extracted text
+ */
+function getTextFromHtml(text) {
+  if (text.includes("<!DOCTYPE html>")) {
+    const s = text.indexOf('<pre>') + 6; // skips the 5-character tag plus one more character (presumably a newline after the tag — confirm); NOTE(review): s is 5 when no <pre> exists
+    const e = text.indexOf('</pre>'); // -1 when there is no closing tag
+    text = text.substring(s, e);
+    // text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2];
+  }
+  return text;
+}
+
 async function downloadFile(book, url) {
   console.log(`涓嬭浇鏂囦欢: ${url}`);
-  const ext = url.split(".").pop();
+  const ext = url.split(".").pop().toLowerCase(); // extension is the lower-cased text after the last "." in the URL
   const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
+  book.url = url; // recorded up front so it is present on every outcome
   if (fs.existsSync(filepath)) {
     book.state = `涓嬭浇瀹屾垚`;
     book.format = ext;
     book.file = filepath;
-    book.url = url;
+    book.pages = await getPdfPages(filepath).catch(() => 0); // file already on disk: only refresh the page count (0 when it cannot be parsed)
     console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
     return;
   }
   await retry(() => {
+    const timeoutTime = 10 * 60 * 1000; // hard cap of 10 minutes per attempt
+    const source = axios.CancelToken.source();
+    const timeout = setTimeout(() => {
+      source.cancel("timeout"); // abort the request, including a stalled body stream
+    }, timeoutTime);
     return new Promise((resolve, reject) => myAxios
-      .get(url, { responseType: "stream" })
+      .get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token })
       .then((response) => {
         const len = response.headers['content-length'];
         if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) {
           // 涓嶆槸pdf鎴杢xt鏂囦欢锛屼笖鏂囦欢澶т簬200M锛屼笉涓嬭浇
-          book.state = "涓嬭浇澶辫触";
-          book.url = url;
-          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
           reject(false);
           return;
         }
         const stream = response.data;
-        const out = fs.createWriteStream(filepath);
+        const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; // same path as filepath above
+        const out = fs.createWriteStream(_filepath);
         stream.pipe(out);
-        stream.on("end", () => {
+        stream.on("end", async () => { // the response body has been fully read; the file may not be fully written yet
+          clearTimeout(timeout);
           book.state = `涓嬭浇瀹屾垚`;
           book.format = ext;
           book.file = filepath;
           book.url = url;
-          console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
+          book.pages = await new Promise((done) => out.once("close", done)).then(() => getPdfPages(filepath)).catch(e => 0); // wait until the write stream has flushed and closed before reading the file back, otherwise a truncated file may be parsed
           resolve(true);
         });
         stream.on("error", (err) => {
+          clearTimeout(timeout);
           console.error(err);
-          book.state = "涓嬭浇澶辫触";
-          book.url = url;
-          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
           reject(false);
           try {
             out.close();
@@ -297,13 +359,20 @@
         });
       })
       .catch((e) => {
-        console.error(e);
-        book.state = "涓嬭浇澶辫触";
+        clearTimeout(timeout);
+        console.log(`涓嬭浇澶辫触锛岄敊璇爜: ${e?.response?.status ?? e.code}`);
         book.url = url;
-        console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
-        reject(false);
+        if (e.response?.status === 403 || e.response?.status === 401) { // no permission: resolve instead of reject, presumably so retry() does not try again
+          book.state = "娌℃湁涓嬭浇鏉冮檺";
+          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
+          resolve(true);
+        } else {
+          reject(false);
+        }
       }));
   }).catch(e => {
+    book.state = "涓嬭浇澶辫触"; // reached only after retry() gives up
+    console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
     return false
   });
 }
@@ -313,9 +382,29 @@
   return alreadyDownloadedBooks.includes(id);
 }
 
+function nextBook() { // Ask the main thread for the next book; resolves with the book, or with undefined when none is sent back
+  return new Promise(resolve => {
+    const cb = (message) => {
+      if (message.type === 'book') { // ignore any other message type
+        resolve(message.data);
+        parentPort.removeListener('message', cb); // one-shot: detach after the reply arrives
+      }
+    };
+    parentPort.on('message', cb); // listen before sending the request so the reply cannot be missed
+    parentPort.postMessage({ type: 'get-book', threadId });
+
+  });
+}
+
+
 async function downloadBooks(books) {
-  driver = await createDriver();
-  for (const book of books) {
+  // Pull books one at a time from the main thread until it sends none; each received book is appended to `books`.
+  for (; ;) {
+    const book = await nextBook();
+    if (!book) {
+      break;
+    }
+    books.push(book); // kept so the caller can report every book this worker received
     if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
       // 瀹氭椂閫�鍑�
       break;
@@ -323,6 +412,7 @@
     bookCount++;
     if (isAlreadyDownloaded(book)) {
       skipCount++;
+      book.skip = true; // lets the main thread count skipped books
       continue;
     }
     if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
@@ -332,43 +422,35 @@
     }
     console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`);
     // 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
-    if (!await openSearchPage(book, true)) {
+    let detailPageUrl = await getBookDetailPageUrl(book, true); // first try with digits kept in the title
+    if (!detailPageUrl) {
       // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧
-      if (!await openSearchPage(book, false)) {
-        console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`);
-        book.state = "鎵撳紑鎼滅储椤甸潰澶辫触";
+      detailPageUrl = await getBookDetailPageUrl(book, false); // second try with digits stripped
+      if (!detailPageUrl) {
+        console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`);
+        book.state = "娌℃湁鎼滅储缁撴灉";
         continue;
       }
     }
-    // 妫�娴嬫悳绱㈢粨鏋�
-    const hasBook = await checkSearchResult(book);
-    if (!hasBook) {
-      continue;
-    }
-    // 鑾峰彇璇︽儏椤甸摼鎺�
-    const detailPageUrl = await findBookDetailPageUrl(book);
-    if (!detailPageUrl) {
-      console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`);
-      book.state = "鑾峰彇璇︽儏椤甸摼鎺ュけ璐�";
-      continue;
-    }
     // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤�
-    sleep(getRandomNumber(1000, 30000));
-    // 鎵撳紑璇︽儏椤�
-    await openBookDetailPage(book, detailPageUrl);
-    // 鑾峰彇涓嬭浇閾炬帴
-    const url = await getDownloadUrl(book);
-    if (!url) { continue; }
+    await sleep(getRandomNumber(500, 1000)); // awaited so the pause actually happens before the next request
+    // Open the detail page and get the download link
+    const url = await openBookDetailPage(book, detailPageUrl);
+    if (!url) {
+      console.log(`娌℃湁pdf鎴杢ext鏂囦欢: ${book.id} ${book.title}`);
+      continue;
+    }
     // 绛夊緟涓�娈垫椂闂村啀涓嬭浇
-    await sleep(getRandomNumber(1000, 30000));
+    await sleep(getRandomNumber(500, 1000));
     // 涓嬭浇鏂囦欢
     try {
       await downloadFile(book, url);
       console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`);
+      console.log('finish: ' + JSON.stringify(book)); // dump the final book record to the log
     } catch (e) { }
     successCount++;
     // 绛変竴娈垫椂闂村啀涓嬩竴涓�
-    sleep(getRandomNumber(1000, 30000));
+    await sleep(getRandomNumber(500, 1000)); // awaited so consecutive books are throttled
   }
 }
 
@@ -380,6 +462,9 @@
   for (const book of books) {
     const index = data.findIndex((row) => row[0] === book.id);
     if (index > -1) {
+      data[index][5] = book.publisher;
+      data[index][6] = book.pubDate;
+      data[index][11] = book.pages;
       data[index][12] = book.state;
       data[index][13] = book.format;
       data[index][14] = book.file;
@@ -388,8 +473,15 @@
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
-  fs.writeFile("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
-  console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
+  try {
+    fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
+    console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
+  } catch (e) {
+    console.error(e);
+    const outfile = `${Date.now()}.json`;
+    fs.writeFileSync(outfile, JSON.stringify(data));
+    console.log("淇濆瓨瀹屾垚: " + outfile);
+  }
 }
 
 
@@ -424,8 +516,6 @@
 let bookCount = 0;
 // 璺宠繃鐨勬暟閲忥紝宸茬粡涓嬭浇杩囨垨娌℃湁鎼滅储鍒扮殑鏁伴噺
 let skipCount = 0;
-// chrome椹卞姩
-let driver;
 let alreadyDownloadedBooks = [];
 
 function getAlreadyDownloadedBooks() {
@@ -436,12 +526,12 @@
   return books.map(it => path.basename(it, path.extname(it)).trim());
 }
 
-function main() {
+function startDownload() {
   initLogger();
-  const books = getBooksFromExcel(config.startRow, config.endRow);
+  const books = [];
   downloadBooks(books)
     .then(() => {
-      console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+      console.log(`绾跨▼锛�${threadId}鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
     })
     .catch(e => {
       console.error(e);
@@ -450,44 +540,59 @@
       // saveBooks(books);
       parentPort.postMessage({ type: "books", data: books });
       logFile.close();
-      try {
-        await driver.close();
-        await driver.quit();
-      } catch (e) { }
     });
 }
 
-// 澶氳繘绋嬫墽琛�
-if (isMainThread) {
-  initLogger();
-  const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
-  console.log(`绾跨▼鏁帮細${config.threadSize}, 寮�濮嬭锛�${config.startRow}, 缁撴潫琛岋細${config.endRow}`);
-  let startRow = config.startRow;
-  let endRow = config.endRow;
-  let finishCnt = 0;
-  const finishBooks = [];
-  const threadSize = config.threadSize;
-  const thBookSize = (endRow - startRow) / threadSize
-  for (let sr = startRow; sr < endRow; sr += thBookSize) {
-    let er = sr + thBookSize;
-    if (er > endRow) {
-      er = endRow;
-    }
-    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } });
-    worker.on("message", (message) => {
-      if (message.type === 'books') {
-        finishBooks.push(...message.data);
-        finishCnt++;
-        if (finishCnt >= config.threadSize) {
-          saveBooks(finishBooks);
-        }
-      }
-    });
+function main() {
+  // Entry point for both roles: the main thread hands out books and collects results; worker threads download.
+  if (!fs.existsSync('tmpdir')) {
+    fs.mkdirSync('tmpdir', { recursive: true });
   }
-} else {
-  config.startRow = workerData.startRow;
-  config.endRow = workerData.endRow;
-  alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
-  main();
+  if (!fs.existsSync('downloads')) {
+    fs.mkdirSync('downloads', { recursive: true });
+  }
+  // Run with multiple worker threads
+  if (isMainThread) {
+    initLogger();
+    let downloadCnt = 0; // number of books actually handed to workers
+    const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
+    const { startRow, endRow, threadSize } = config;
+    console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
+    let finishThreadCnt = 0;
+    const finishBooks = [];
+    const books = getBooksFromExcel(startRow, endRow); // shared queue; workers pull from it one book at a time
+
+    for (let i = 0; i < threadSize; i++) {
+      const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } });
+      worker.on("message", (message) => {
+        if (message.type === 'books') { // a worker has finished and reports every book it received
+          finishBooks.push(...message.data);
+          finishThreadCnt++;
+          if (finishThreadCnt >= threadSize) { // last worker done: summarise and save
+            successCount = finishBooks.filter(it => it.state === '涓嬭浇瀹屾垚').length;
+            skipCount = finishBooks.filter(it => it.skip).length;
+            console.log(`鍏ㄩ儴绾跨▼瀹屾垚锛屽叡涓嬭浇${downloadCnt}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}锛屽け璐�${downloadCnt - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+            saveBooks(finishBooks);
+          }
+        } else if (message.type === 'get-book') {
+          if (books.length > 0) { downloadCnt++; } // count only real hand-outs, not the empty reply that tells a worker to stop
+          worker.postMessage({ type: "book", data: books.shift() }); // data is undefined once the queue is empty
+        }
+      });
+    }
+    // Listen for the interrupt signal and save what has been collected; only books from workers that already reported are in finishBooks
+    process.on('SIGINT', () => {
+      successCount = finishBooks.filter(it => it.state === '涓嬭浇瀹屾垚').length;
+      skipCount = finishBooks.filter(it => it.skip).length;
+      console.log(`杩涚▼琚墜鍔ㄧ粨鏉燂紝鍏变笅杞�${downloadCnt}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}锛屽け璐�${downloadCnt - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+      saveBooks(finishBooks);
+      process.exit(0);
+    });
+  } else {
+    alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; // worker side: take the shared list, then start pulling books
+    startDownload();
+
+  }
+}
 
+main();
\ No newline at end of file

--
Gitblit v1.9.1