From 8d546341fbf8fc45543cf33e40097bae994cdfd5 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期五, 14 六月 2024 01:33:31 +0800
Subject: [PATCH] 清理文本栈溢出异常处理

---
 src/main.mjs |  120 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 102 insertions(+), 18 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index 7aca6af..20703fd 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -8,6 +8,7 @@
 import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
 import { resolve } from "path";
+import { execFileSync } from "child_process";
 
 /*-------------璇诲彇閰嶇疆---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
@@ -19,7 +20,7 @@
   if (!fs.existsSync('./logs')) {
     fs.mkdirSync('./logs', { recursive: true });
   }
-  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
+  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
   console.log = function (...text) {
     text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`;
     _log(text);
@@ -33,6 +34,61 @@
   proxy: false,
   httpsAgent,
 });
+
+function countChar(str, char) {
+  let count = 0;
+  for (let i = 0; i < str.length; i++) {
+    if (str[i] === char) {
+      count++;
+    }
+  }
+  return count;
+}
+
+/**
+ * 娓呯悊鏂囨湰
+ * @param {string} text 瑕佹竻鐞嗙殑鏂囨湰
+ */
+function cleanText(text) {
+  if (text.includes('google')) {
+    text = text.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '')
+  }
+  // 濡傛灉鏄痮cr璇嗗埆鐨勬枃鏈紝姣忚瀛楃鏁颁竴鑸笉浼氳秴杩�170
+  if (!/.{170,}/g.test(text) || text.includes('google')) {
+    text = text.replace(/(\r|鈻�)/g, '');
+    text = text.replace(/[ ]{2,}/g, ' ')
+    text = text.replace(/(.+)\n/g, '$1');
+    text = text.replace(/\n+/g, '\n');
+    text = text.replace(/-\n/g, '-');
+    const lines = text.split('\n');
+    const result = [];
+    for (const line of lines) {
+      const wordSize = countChar(line, ' ');
+      if (wordSize >= 10) {
+        if (!/.*[^a-z0-9\-]{6,}.*/gi.test(line)) {
+          result.push(line.trim());
+        }
+      }
+    }
+    return result.join('\n');
+  } else {
+    return text;
+  }
+}
+
+/**
+ * 瑙e帇鏂囨湰鏂囦欢
+ * @param {string} zipFile 鍘嬬缉鏂囦欢璺緞
+ * @param {string} txtFile 鏂囨湰鏂囦欢璺緞
+ */
+function unzip(zipFile, txtFile) {
+  const tmpdir = `./tmpdir/${threadId}`;
+  execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`])
+  const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file }))
+    .sort((a, b) => a.size.size - b.size.size).pop();
+  fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true });
+  fs.rmSync(`${tmpdir}`, { recursive: true });
+}
 
 /**
  * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅
@@ -133,10 +189,10 @@
  * @param {*} book 
  */
 async function openSearchPage(book, titleWithNumbers) {
-  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`);
+  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`);
   return await retry(async () => {
     // 鑾峰彇椤甸潰
-    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`;
+    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`;
     await driver.get(searchUrl);
   }).then(() => true)
     .catch(() => false);
@@ -229,25 +285,26 @@
       }
     }
 
-    if (pdfUrl) {
+    /* if (pdfUrl) {
       return pdfUrl;
-    } else if (textUrl) {
+    } else  */
+    if (textUrl) {
       return textUrl;
     } else {
-      book.state = "娌℃湁pdf鎴杢ext鏂囦欢";
+      book.state = "娌℃湁text鏂囦欢";
       return ''
     }
   })
     .catch(() => {
-      book.state = "娌℃湁pdf鎴杢ext鏂囦欢";
+      book.state = "娌℃湁text鏂囦欢";
       return '';
     });
 }
 
 async function downloadFile(book, url) {
   console.log(`涓嬭浇鏂囦欢: ${url}`);
-  const ext = url.split(".").pop();
-  const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
+  const ext = url.split(".").pop().toLowerCase();
+  const filepath = `./downloads/${book.id} ${book.isbn}.txt`;
   if (fs.existsSync(filepath)) {
     book.state = `涓嬭浇瀹屾垚`;
     book.format = ext;
@@ -275,7 +332,8 @@
           return;
         }
         const stream = response.data;
-        const out = fs.createWriteStream(filepath);
+        const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
+        const out = fs.createWriteStream(_filepath);
         stream.pipe(out);
         stream.on("end", () => {
           clearTimeout(timeout);
@@ -284,6 +342,27 @@
           book.file = filepath;
           book.url = url;
           console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
+          setTimeout(() => {
+            if (ext === "gz" || ext === "zip") {
+              unzip(_filepath, filepath);
+            }
+            let text = fs.readFileSync(filepath, 'utf-8');
+            if (text.includes("<!DOCTYPE html>")) {
+              text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2];
+              fs.writeFileSync(filepath, text, 'utf-8');
+            }
+            try {
+              fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
+            } catch (e) {
+              reject(e);
+              try {
+                out.close();
+                fs.unlink(filepath, (e) => console.error(e));
+              } catch (e) {
+                console.error(e);
+              }
+            }
+          }, 1000);
           resolve(true);
         });
         stream.on("error", (err) => {
@@ -467,7 +546,7 @@
   const books = [];
   downloadBooks(books)
     .then(() => {
-      console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+      console.log(`绾跨▼锛�${threadId}鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
     })
     .catch(e => {
       console.error(e);
@@ -483,6 +562,13 @@
     });
 }
 
+if (!fs.existsSync('tmpdir')) {
+  fs.mkdirSync('tmpdir', { recursive: true });
+}
+if (!fs.existsSync('downloads')) {
+  fs.mkdirSync('downloads', { recursive: true });
+}
+
 // 澶氳繘绋嬫墽琛�
 if (isMainThread) {
   initLogger();
@@ -491,15 +577,10 @@
   console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
   let finishCnt = 0;
   const finishBooks = [];
-  const thBookSize = (endRow - startRow) / threadSize;
   const books = getBooksFromExcel(startRow, endRow);
 
-  for (let sr = startRow; sr < endRow; sr += thBookSize) {
-    let er = sr + thBookSize;
-    if (er > endRow) {
-      er = endRow;
-    }
-    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } });
+  for (let i = 0; i < threadSize; i++) {
+    const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } });
     worker.on("message", (message) => {
       if (message.type === 'books') {
         finishBooks.push(...message.data);
@@ -517,3 +598,6 @@
   main();
 }
 
+// const filepath = "D:\\projects\\book-crawler\\downloads\\10231261 978-1-331-76167-9.txt";
+// const text = fs.readFileSync(filepath, 'utf8');
+// fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
\ No newline at end of file

--
Gitblit v1.9.1