From 92c1b3abe15b82486427ef2e9e2455524e0c6c84 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期五, 14 六月 2024 23:01:48 +0800
Subject: [PATCH] 回写图书信息到Excel

---
 src/main.mjs |  272 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 232 insertions(+), 40 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index a53edc5..bbf02b6 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -5,10 +5,13 @@
 import axios from "axios";
 import * as fs from "fs";
 import path from "path";
-import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
+import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
 import { resolve } from "path";
-
+import { execFileSync } from "child_process";
+import wordsjs from 'wordlist-js';
+import usPlaceList from "./us-place-list.mjs";
+import usPeronNameList from "./us-pseron-name-list.mjs";
 /*-------------璇诲彇閰嶇疆---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
 
@@ -19,7 +22,7 @@
   if (!fs.existsSync('./logs')) {
     fs.mkdirSync('./logs', { recursive: true });
   }
-  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' });
+  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
   console.log = function (...text) {
     text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`;
     _log(text);
@@ -33,6 +36,135 @@
   proxy: false,
   httpsAgent,
 });
+
+function allWords() { // Builds a lookup object keyed by every word found in the lists stored under wordsjs.default.
+  const words = {};
+  wordsjs.usPlaces = usPlaceList; // NOTE(review): stored on wordsjs itself, while the loop below only reads wordsjs.default - confirm these two lists really end up in the result.
+  wordsjs.usPeronNameList = usPeronNameList;
+  for (const key in wordsjs.default) {
+    if (Object.hasOwnProperty.call(wordsjs.default, key)) { // skip inherited keys
+      for (const word of wordsjs.default[key]) {
+        words[word] = true; // the value is only a marker; lookups test words[word] for truthiness
+      }
+    }
+  }
+  return words;
+}
+
+const wordsMap = allWords(); // computed once when the module is evaluated
+
+/**
+ * Counts the space characters left in `str` after every run of two or more spaces is collapsed to one. NOTE(review): this only approximates a word count - a single word gives 0, and a leading or trailing space adds 1; confirm the callers' thresholds expect that.
+ * @param {string} str Text to measure.
+ * @returns {number} Number of single-space separators found.
+ */
+function countWordSize(str) {
+  let count = 0;
+  str = str.replace(/[ ]{2,}/g, ' ');
+  for (let i = 0; i < str.length; i++) {
+    if (str[i] === ' ') {
+      count++;
+    }
+  }
+  return count;
+}
+
+/**
+ * Returns the share of space-separated tokens in `text` that are neither present in `wordsMap` (after lower-casing) nor contain a digit.
+ * @param {string} text Text to score. Runs of spaces are collapsed first, and a single punctuation character directly following a letter is removed.
+ * @returns {number} Unrecognized tokens divided by total tokens. NOTE(review): an empty string splits into one empty token, so the result is 1 - confirm that is acceptable for blank lines.
+ */
+function incorrectWordRatio(text) {
+  text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1");
+  const words = text.split(' ');
+  const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length;
+  return incorrectWordCnt / words.length;
+}
+
+/**
+ * Share (0 to 1) of characters in `text` that are not ASCII letters, digits or spaces.
+ * @param {string} text Text to measure. An empty string evaluates 0 / 0, which is NaN.
+ */
+function symbolRatio(text) {
+  // Count every character outside [a-zA-Z0-9 ] and divide by the total length.
+  return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length;
+}
+
+/**
+ * Cleans downloaded book text: strips carriage returns, cuts a leading books.google.com header found within the first 10000 characters, collapses spaces and blank lines, then filters the text line by line (presumably to remove scan/OCR noise - inferred from the filters, TODO confirm).
+ * @param {string} text Text to clean. Returns the surviving lines, each trimmed and with the listed symbols removed, joined by newlines.
+ */
+function cleanText(text) {
+  text = text.replace(/(\r)/g, '');
+  const googlePage = text.substring(0, 10000);
+  if (googlePage.includes('google')) {
+    text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000);
+  }
+  // (an outer condition on line length / google content used to wrap everything below; it is disabled)
+  text = text.replace(/[ ]{2,}/g, ' ')
+  if (!/.{170,}/g.test(text)) {
+    // Original note: each line is no longer than 170 characters. NOTE(review): the guard above already rules out any run of 170+ characters, so this replace can never match.
+    text = text.replace(/(.{170,})\n/g, '$1');
+  }
+  text = text.replace(/\n+/g, '\n');
+  text = text.replace(/-\n/g, '-');
+  const lines = text.split('\n');
+  const result = [];
+  for (const line of lines) {
+    // Drop lines with a high symbol share (above 0.2) that are also mostly unrecognized words.
+    const incorrectRatio = incorrectWordRatio(line);
+    if (symbolRatio(line) > 0.2) {
+      if (incorrectRatio > 0.65) {
+        continue;
+      }
+    }
+    // With spaces removed: a non-digit character repeated three or more times in a row drops the line when it is short or mostly unrecognized.
+    const wordSize = countWordSize(line);
+    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) {
+      if (wordSize < 5 || incorrectRatio > 0.65) {
+        continue;
+      }
+    }
+    // Drop mostly-unrecognized lines that contain three or more consecutive punctuation marks.
+    if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) {
+      continue;
+    }
+    // Original note: drop lines with too few words. NOTE(review): the condition below fires for wordSize above 5 - confirm which is intended.
+    if (wordSize > 5 && incorrectRatio > 0.65) {
+      continue;
+    }
+    // Drop any line that mentions google (case-insensitive).
+    if (/.*(google).*/ig.test(line)) {
+      continue;
+    }
+    // Drop lines with at most one character left after trimming and removing the listed symbols.
+    const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, '');
+    if (ret.length <= 1) {
+      continue;
+    }
+    if (ret == 'Digitized by') {
+      continue;
+    }
+    result.push(ret);
+  }
+  text = result.join('\n');
+  // (end of the disabled outer condition)
+  return text;
+}
+
+/**
+ * Extracts `zipFile` with ./7za.exe into the per-thread directory ./tmpdir/<threadId>, copies the largest extracted entry (by stat size) to `txtFile`, then deletes that directory. NOTE(review): the cleanup is skipped if any earlier call throws - confirm stale temp files are acceptable.
+ * @param {string} zipFile Path of the archive to extract.
+ * @param {string} txtFile Destination path for the extracted text file; an existing file is overwritten.
+ */
+function unzip(zipFile, txtFile) {
+  const tmpdir = `./tmpdir/${threadId}`;
+  execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`])
+  const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file }))
+    .sort((a, b) => a.size.size - b.size.size).pop();
+  fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true });
+  fs.rmSync(`${tmpdir}`, { recursive: true });
+}
 
 /**
  * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅
@@ -99,14 +231,12 @@
  * @returns 澶勭悊鍚庣殑鍏抽敭瀛�
  */
 function formatKw(text, titleWithNumbers) {
-  // 鍙繚鐣欑┖鏍笺�佷腑鏂囥�佽嫳鏂囥�佹硶鏂囥�佸痉鏂囥�佸笇鑵婃枃
-  const regex = /[^\u4e00-\u9fa5\w\s\d]/g;
   if (titleWithNumbers) {
-    text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g, "");
+    text = text;
   } else {
-    text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g, "");
+    text = text.replace(/[\d]/g, "");
   }
-  text = text.split(' ').slice(0, 10).join("+");
+  text = text.split(' ').slice(0, 6).join("+");
   return text;
 }
 
@@ -135,10 +265,10 @@
  * @param {*} book 
  */
 async function openSearchPage(book, titleWithNumbers) {
-  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`);
+  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`);
   return await retry(async () => {
     // 鑾峰彇椤甸潰
-    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`;
+    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`;
     await driver.get(searchUrl);
   }).then(() => true)
     .catch(() => false);
@@ -236,19 +366,34 @@
     } else if (textUrl) {
       return textUrl;
     } else {
-      book.state = "娌℃湁pdf鎴杢ext鏂囦欢";
+      book.state = "娌℃湁text鏂囦欢";
       return ''
     }
   })
     .catch(() => {
-      book.state = "娌℃湁pdf鎴杢ext鏂囦欢";
+      book.state = "娌℃湁text鏂囦欢";
       return '';
     });
 }
 
+/**
+ * Extracts the text wrapped in a pre element when `text` is an HTML page (detected by the literal doctype marker); any other input is returned unchanged.
+ * @param {string} text Downloaded content. NOTE(review): if either tag is missing, indexOf returns -1 and the substring bounds become meaningless - confirm callers never hit that.
+ * @returns {string} The characters between the first opening pre tag (plus one skipped character) and the first closing pre tag, or the original text.
+ */
+function getTextFromHtml(text) {
+  if (text.includes("<!DOCTYPE html>")) {
+    const s = text.indexOf('<pre>') + 6;
+    const e = text.indexOf('</pre>');
+    text = text.substring(s, e);
+    // NOTE(review): the + 6 above skips the 5-character tag plus one more character - presumably the newline after it; TODO confirm. (A regex-based extraction was tried here earlier and is disabled.)
+  }
+  return text;
+}
+
 async function downloadFile(book, url) {
   console.log(`涓嬭浇鏂囦欢: ${url}`);
-  const ext = url.split(".").pop();
+  const ext = url.split(".").pop().toLowerCase();
   const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
   if (fs.existsSync(filepath)) {
     book.state = `涓嬭浇瀹屾垚`;
@@ -259,8 +404,13 @@
     return;
   }
   await retry(() => {
+    const timeoutTime = 10 * 60 * 1000;
+    const source = axios.CancelToken.source();
+    const timeout = setTimeout(() => {
+      source.cancel("timeout");
+    }, timeoutTime);
     return new Promise((resolve, reject) => myAxios
-      .get(url, { responseType: "stream" })
+      .get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token })
       .then((response) => {
         const len = response.headers['content-length'];
         if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) {
@@ -272,17 +422,19 @@
           return;
         }
         const stream = response.data;
-        const out = fs.createWriteStream(filepath);
+        const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
+        const out = fs.createWriteStream(_filepath);
         stream.pipe(out);
         stream.on("end", () => {
+          clearTimeout(timeout);
           book.state = `涓嬭浇瀹屾垚`;
           book.format = ext;
           book.file = filepath;
           book.url = url;
-          console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
           resolve(true);
         });
         stream.on("error", (err) => {
+          clearTimeout(timeout);
           console.error(err);
           book.state = "涓嬭浇澶辫触";
           book.url = url;
@@ -297,6 +449,7 @@
         });
       })
       .catch((e) => {
+        clearTimeout(timeout);
         console.error(e);
         book.state = "涓嬭浇澶辫触";
         book.url = url;
@@ -313,23 +466,52 @@
   return alreadyDownloadedBooks.includes(id);
 }
 
+function nextBook() { // Requests the next book from the parent thread; the promise resolves with the data field of the first book-type reply and never rejects.
+  return new Promise(resolve => {
+    const cb = (message) => {
+      if (message.type === 'book') { // other message types are ignored and the listener stays attached
+        resolve(message.data);
+        parentPort.removeListener('message', cb); // one-shot: stop listening once answered
+      }
+    };
+    parentPort.on('message', cb);
+    parentPort.postMessage({ type: 'get-book', threadId }); // sent only after the listener is attached
+
+  });
+}
+
+function getBookInfo(book) { // Reads publisher, publication date and page count from the page currently loaded in driver and stores them on book; returns whatever retry returns.
+  return retry(async () => {
+    book.publisher = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`).catch(e => 0); // falls back to 0 when the script call rejects
+    book.pubDate = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`).catch(e => 0); // same 0 fallback
+    let pages = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`).catch(e => 0);
+    if (pages) { book.pages = pages.split(' / ')[1]; } // keeps the part after the separator; presumably the total page count - TODO confirm the text format
+  });
+}
+
 async function downloadBooks(books) {
   driver = await createDriver();
-  for (const book of books) {
+
+  for (; ;) {
+    const book = await nextBook();
+    if (!book) {
+      break;
+    }
+    books.push(book);
     if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
       // 瀹氭椂閫�鍑�
       break;
     }
     bookCount++;
-    if (isAlreadyDownloaded(book)) {
+    /*if (isAlreadyDownloaded(book)) {
       skipCount++;
       continue;
     }
-    if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
+     if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
       // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫�
       skipCount++;
       continue;
-    }
+    } */
     console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`);
     // 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
     if (!await openSearchPage(book, true)) {
@@ -353,22 +535,24 @@
       continue;
     }
     // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤�
-    sleep(getRandomNumber(1000, 30000));
+    sleep(getRandomNumber(500, 10000));
     // 鎵撳紑璇︽儏椤�
     await openBookDetailPage(book, detailPageUrl);
+    await getBookInfo(book);
     // 鑾峰彇涓嬭浇閾炬帴
     const url = await getDownloadUrl(book);
     if (!url) { continue; }
     // 绛夊緟涓�娈垫椂闂村啀涓嬭浇
-    await sleep(getRandomNumber(1000, 30000));
+    await sleep(getRandomNumber(500, 10000));
     // 涓嬭浇鏂囦欢
     try {
       await downloadFile(book, url);
       console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`);
+      console.log('finish: ' + JSON.stringify(book));
     } catch (e) { }
     successCount++;
     // 绛変竴娈垫椂闂村啀涓嬩竴涓�
-    sleep(getRandomNumber(1000, 30000));
+    sleep(getRandomNumber(500, 10000));
   }
 }
 
@@ -380,6 +564,9 @@
   for (const book of books) {
     const index = data.findIndex((row) => row[0] === book.id);
     if (index > -1) {
+      data[index][5] = book.publisher;
+      data[index][6] = book.pubDate;
+      data[index][11] = book.pages;
       data[index][12] = book.state;
       data[index][13] = book.format;
       data[index][14] = book.file;
@@ -388,7 +575,7 @@
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
-  fs.writeFile("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
+  fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
   console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
 }
 
@@ -438,10 +625,10 @@
 
 function main() {
   initLogger();
-  const books = getBooksFromExcel(config.startRow, config.endRow);
+  const books = [];
   downloadBooks(books)
     .then(() => {
-      console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+      console.log(`绾跨▼锛�${threadId}鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
     })
     .catch(e => {
       console.error(e);
@@ -457,37 +644,42 @@
     });
 }
 
+if (!fs.existsSync('tmpdir')) { // parent of the per-thread extraction directories used by unzip
+  fs.mkdirSync('tmpdir', { recursive: true });
+}
+if (!fs.existsSync('downloads')) { // target directory for downloaded files
+  fs.mkdirSync('downloads', { recursive: true });
+}
+
 // 澶氳繘绋嬫墽琛�
 if (isMainThread) {
   initLogger();
   const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
-  console.log(`绾跨▼鏁帮細${config.threadSize}, 寮�濮嬭锛�${config.startRow}, 缁撴潫琛岋細${config.endRow}`);
-  let startRow = config.startRow;
-  let endRow = config.endRow;
+  const { startRow, endRow, threadSize } = config;
+  console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
   let finishCnt = 0;
   const finishBooks = [];
-  const threadSize = config.threadSize;
-  const thBookSize = (endRow - startRow) / threadSize
-  for (let sr = startRow; sr < endRow; sr += thBookSize) {
-    let er = sr + thBookSize;
-    if (er > endRow) {
-      er = endRow;
-    }
-    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } });
+  const books = getBooksFromExcel(startRow, endRow);
+
+  for (let i = 0; i < threadSize; i++) {
+    const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } });
     worker.on("message", (message) => {
       if (message.type === 'books') {
         finishBooks.push(...message.data);
         finishCnt++;
-        if (finishCnt >= config.threadSize) {
+        if (finishCnt >= threadSize) {
           saveBooks(finishBooks);
         }
+      } else if (message.type === 'get-book') {
+        worker.postMessage({ type: "book", data: books.shift() });
       }
     });
   }
 } else {
-  config.startRow = workerData.startRow;
-  config.endRow = workerData.endRow;
   alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
   main();
 }
 
+// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt";
+// let text = fs.readFileSync(filepath, 'utf8');
+// fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
\ No newline at end of file

--
Gitblit v1.9.1