From 655f90e9e4544fdb8fa37ca0223fb686d4020b88 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期五, 14 六月 2024 22:35:37 +0800
Subject: [PATCH] txt版

---
 src/main.mjs |  167 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 135 insertions(+), 32 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index 20703fd..4e7605b 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -9,7 +9,9 @@
 import { HttpsProxyAgent } from "https-proxy-agent";
 import { resolve } from "path";
 import { execFileSync } from "child_process";
-
+import wordsjs from 'wordlist-js';
+import usPlaceList from "./us-place-list.mjs";
+import usPeronNameList from "./us-pseron-name-list.mjs";
 /*-------------读取配置---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
 
@@ -35,10 +37,32 @@
   httpsAgent,
 });
 
-function countChar(str, char) {
+function allWords() {
+  const words = {};
+  wordsjs.usPlaces = usPlaceList;
+  wordsjs.usPeronNameList = usPeronNameList;
+  for (const key in wordsjs.default) {
+    if (Object.hasOwnProperty.call(wordsjs.default, key)) {
+      for (const word of wordsjs.default[key]) {
+        words[word] = true;
+      }
+    }
+  }
+  return words;
+}
+
+const wordsMap = allWords();
+
+/**
+ * 统计单词数量
+ * @param {string} str 字符串
+ * @returns 单词数量
+ */
+function countWordSize(str) {
   let count = 0;
+  str = str.replace(/[ ]{2,}/g, ' ');
   for (let i = 0; i < str.length; i++) {
-    if (str[i] === char) {
+    if (str[i] === ' ') {
       count++;
     }
   }
@@ -46,34 +70,86 @@
 }
 
 /**
+ * 获取错误单词比例
+ * @param {string} text 文本
+ * @returns 错误单词比例
+ */
+function incorrectWordRatio(text) {
+  text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1");
+  const words = text.split(' ');
+  const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length;
+  return incorrectWordCnt / words.length;
+}
+
+/**
+ * 符号占比 0 ~ 1
+ * @param {string} text 文本
+ */
+function symbolRatio(text) {
+  // 非字母数字字符占比
+  return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length;
+}
+
+/**
  * 清理文本
  * @param {string} text 要清理的文本
  */
 function cleanText(text) {
-  if (text.includes('google')) {
-    text = text.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '')
+  text = text.replace(/(\r)/g, '');
+  const googlePage = text.substring(0, 10000);
+  if (googlePage.includes('google')) {
+    text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000);
   }
-  // 如果是ocr识别的文本，每行字符数一般不会超过170
-  if (!/.{170,}/g.test(text) || text.includes('google')) {
-    text = text.replace(/(\r|鈻�)/g, '');
-    text = text.replace(/[ ]{2,}/g, ' ')
-    text = text.replace(/(.+)\n/g, '$1');
-    text = text.replace(/\n+/g, '\n');
-    text = text.replace(/-\n/g, '-');
-    const lines = text.split('\n');
-    const result = [];
-    for (const line of lines) {
-      const wordSize = countChar(line, ' ');
-      if (wordSize >= 10) {
-        if (!/.*[^a-z0-9\-]{6,}.*/gi.test(line)) {
-          result.push(line.trim());
-        }
+  // if (!/.{170,}/g.test(text) || text.includes('google')) {
+  text = text.replace(/[ ]{2,}/g, ' ')
+  if (!/.{170,}/g.test(text)) {
+    // 每行不超过170个字符
+    text = text.replace(/(.{170,})\n/g, '$1');
+  }
+  text = text.replace(/\n+/g, '\n');
+  text = text.replace(/-\n/g, '-');
+  const lines = text.split('\n');
+  const result = [];
+  for (const line of lines) {
+    // 符号比太高的不要
+    const incorrectRatio = incorrectWordRatio(line);
+    if (symbolRatio(line) > 0.2) {
+      if (incorrectRatio > 0.65) {
+        continue;
       }
     }
-    return result.join('\n');
-  } else {
-    return text;
+    // 去除空格后 连续重复单个字符3次及以上不要
+    const wordSize = countWordSize(line);
+    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) {
+      if (wordSize < 5 || incorrectRatio > 0.65) {
+        continue;
+      }
+    }
+    // 连续三个标点符号及以上,错误率大于0.65不要
+    if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) {
+      continue;
+    }
+    // 单词数量太少的不要
+    if (wordSize > 5 && incorrectRatio > 0.65) {
+      continue;
+    }
+    // 有google的不要
+    if (/.*(google).*/ig.test(line)) {
+      continue;
+    }
+    // 鍙湁涓�涓瓧绗︿笉瑕�
+    const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, '');
+    if (ret.length <= 1) {
+      continue;
+    }
+    if (ret == 'Digitized by') {
+      continue;
+    }
+    result.push(ret);
   }
+  text = result.join('\n');
+  // }
+  return text;
 }
 
 /**
@@ -301,6 +377,21 @@
     });
 }
 
+/**
+ * 从HTML提取文本
+ * @param {string} text html文本
+ * @returns 文本
+ */
+function getTextFromHtml(text) {
+  if (text.includes("<!DOCTYPE html>")) {
+    const s = text.indexOf('<pre>') + 6;
+    const e = text.indexOf('</pre>');
+    text = text.substring(s, e);
+    // text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2];
+  }
+  return text;
+}
+
 async function downloadFile(book, url) {
   console.log(`下载文件: ${url}`);
   const ext = url.split(".").pop().toLowerCase();
@@ -345,12 +436,11 @@
           setTimeout(() => {
             if (ext === "gz" || ext === "zip") {
               unzip(_filepath, filepath);
+              fs.unlinkSync(_filepath);
             }
             let text = fs.readFileSync(filepath, 'utf-8');
-            if (text.includes("<!DOCTYPE html>")) {
-              text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2];
-              fs.writeFileSync(filepath, text, 'utf-8');
-            }
+            text = getTextFromHtml(text);
+            fs.writeFileSync(filepath, text, 'utf-8');
             try {
               fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
             } catch (e) {
@@ -412,6 +502,18 @@
   });
 }
 
+function getBookInfo(book) {
+  return retry(async () => {
+    const publisher = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`);
+    const datePublished = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`);
+    let pages = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`);
+    pages = pages.split(' / ')[1];
+    book.publisher = publisher;
+    book.pubDate = datePublished;
+    book.pages = pages;
+  });
+}
+
 async function downloadBooks(books) {
   driver = await createDriver();
 
@@ -426,15 +528,15 @@
       break;
     }
     bookCount++;
-    if (isAlreadyDownloaded(book)) {
+    /*if (isAlreadyDownloaded(book)) {
       skipCount++;
       continue;
     }
-    if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) {
+     if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) {
       // 跳过没有搜索结果或没有pdf或text文件的书籍
       skipCount++;
       continue;
-    }
+    } */
     console.log(`开始下载: ${book.id} ${book.title}`);
     // 打开搜索页面并搜索
     if (!await openSearchPage(book, true)) {
@@ -461,6 +563,7 @@
     sleep(getRandomNumber(500, 10000));
     // 打开详情页
     await openBookDetailPage(book, detailPageUrl);
+    await getBookInfo(book);
     // 获取下载链接
     const url = await getDownloadUrl(book);
     if (!url) { continue; }
@@ -598,6 +701,6 @@
   main();
 }
 
-// const filepath = "D:\\projects\\book-crawler\\downloads\\10231261 978-1-331-76167-9.txt";
-// const text = fs.readFileSync(filepath, 'utf8');
+// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt";
+// let text = fs.readFileSync(filepath, 'utf8');
 // fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
\ No newline at end of file

--
Gitblit v1.9.1