From 1200a8ccf12266f04635c32b497461f7b29d85d9 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期三, 17 七月 2024 09:02:15 +0800
Subject: [PATCH] 修改isbn log解析

---
 src/main.mjs |  351 ++++++++++++++++++++++------------------------------------
 1 files changed, 132 insertions(+), 219 deletions(-)

diff --git a/src/main.mjs b/src/main.mjs
index bbf02b6..2fbe23a 100644
--- a/src/main.mjs
+++ b/src/main.mjs
@@ -1,17 +1,15 @@
 import xlsx from "node-xlsx";
-import { Builder, Browser, until, By } from "selenium-webdriver";
-import { Options as ChromeOptions } from "selenium-webdriver/chrome.js";
-import proxy from "selenium-webdriver/proxy.js";
 import axios from "axios";
 import * as fs from "fs";
 import path from "path";
 import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
-import { resolve } from "path";
 import { execFileSync } from "child_process";
 import wordsjs from 'wordlist-js';
 import usPlaceList from "./us-place-list.mjs";
 import usPeronNameList from "./us-pseron-name-list.mjs";
+import * as pdfLib from 'pdf-lib';
+
 /*-------------璇诲彇閰嶇疆---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
 
@@ -36,6 +34,18 @@
   proxy: false,
   httpsAgent,
 });
+
+/**
+ * Count the pages of a PDF file on disk.
+ * @param {string} filepath path of the PDF file
+ * @returns {Promise<number>} page count; rejects when the file cannot be read or parsed
+ */
+async function getPdfPages(filepath) {
+  // Read asynchronously so the thread is not blocked while a large download is loaded.
+  const buf = await fs.promises.readFile(filepath);
+  const pdfDoc = await pdfLib.PDFDocument.load(buf, { ignoreEncryption: true });
+  return pdfDoc.getPageCount();
+}
 
 function allWords() {
   const words = {};
@@ -200,31 +210,6 @@
 }
 
 /**
- * 鍒涘缓娴忚鍣ㄩ┍鍔�
- * @returns chrome娴忚鍣ㄩ┍鍔�
- */
-async function createDriver() {
-  const opts = new ChromeOptions();
-  if (config.headless) {
-    opts.addArguments("--headless");//寮�鍚棤澶存ā寮�
-  }
-  if (config.disableGpu) {
-    opts.addArguments("--disable-gpu");//绂佹gpu娓叉煋
-  }
-  opts.addArguments("--ignore-ssl-error"); // 蹇界暐ssl閿欒
-  opts.addArguments("--no-sandbox"); // 绂佺敤娌欑洅妯″紡
-  opts.addArguments("blink-settings=imagesEnabled=false"); //绂佺敤鍥剧墖鍔犺浇
-  // proxy
-  opts.setProxy(proxy.manual({ http: 'http://127.0.0.1:10809', https: 'http://127.0.0.1:10809' }))
-  const driver = await new Builder()
-    .setChromeOptions(opts)
-    .forBrowser(Browser.CHROME)
-    .build();
-  driver.manage().setTimeouts({ implicit: 10000 });
-  return driver;
-}
-
-/**
  * 鏍煎紡鍖栧叧閿瓧
  * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛�
  * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛�
@@ -261,117 +246,46 @@
 }
 
 /**
- * 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
+ * Look up the archive.org detail-page URL for a book (resolves to '' when nothing is found).
  * @param {*} book 
  */
-async function openSearchPage(book, titleWithNumbers) {
-  console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`);
+async function getBookDetailPageUrl(book, titleWithNumbers) {
+  const kw = formatKw(book.title, titleWithNumbers);
+  const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`;
+  const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}`
+  console.log(`鎵撳紑鎼滅储: ${searchUrl}`);
   return await retry(async () => {
-    // 鑾峰彇椤甸潰
-    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`;
-    await driver.get(searchUrl);
-  }).then(() => true)
-    .catch(() => false);
-}
-
-/**
- * 妫�娴嬫悳绱㈢粨鏋�
- * @param {*} book 
- * @returns true: 鏈夋悳绱㈢粨鏋滐紝false: 娌℃湁鎼滅储缁撴灉
- */
-async function checkSearchResult(book) {
-  console.log(`妫�娴嬫悳绱㈢粨鏋渀);
-  return await retry(async () => {
-    const text = await driver.executeScript(`return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#content-container > empty-placeholder").shadowRoot.querySelector("div > h2").textContent`);
-    if (text && text.includes("Your search did not match any items in the Archive. Try different keywords or a more general search.")) {
-      // 娌℃湁鎼滅储缁撴灉
-      book.state = "娌℃湁鎼滅储缁撴灉";
-      console.log(`娌℃湁鎼滅储缁撴灉: ${book.id} ${book.title}`);
-      return false;
-    } else {
-      return true;
+    const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } })
+    const { total, hits } = resp.data.response.body.hits
+    if (total === 0) {
+      return '';
     }
-  }, 2)
-    .catch(() => {
-      return true;
-    });
-}
-
-async function findBookDetailPageUrl(book) {
-  console.log(`鏌ユ壘璇︽儏椤祏rl`);
-  return retry(async () => {
-    let detailPageUrl;
-    try {
-      detailPageUrl = await driver.executeScript(
-        `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article:nth-child(2) > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`
-      );
-    } catch (e) {
-      detailPageUrl = await driver.executeScript(
-        `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`
-      );
-    }
-    return detailPageUrl;
+    const identifier = hits?.[0]?.fields?.identifier;
+    if (!identifier) { return ''; } // no usable hit: answer "not found" instead of throwing inside the retry() callback
+    return `https://archive.org/details/${identifier}`;
   })
     .catch(() => '');
 }
 
 async function openBookDetailPage(book, detailPageUrl) {
-  console.log(`鎵撳紑璇︽儏: https://archive.org${detailPageUrl}`);
+  console.log(`鎵撳紑璇︽儏: ${detailPageUrl}`);
   return await retry(async () => {
-    await driver.get(`https://archive.org${detailPageUrl}`);
-    await driver.wait(
-      until.elementLocated(
-        By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`)
-      ), 15000
-    );
+    const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
+    const html = resp.data;
+    const data = JSON.parse(/<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/g.exec(html)[1]);
+    book.publisher = data.metadata.publisher;
+    book.pubDate = data.metadata.date;
+    const identifier = data.metadata.identifier;
+    const fileData = data.files.find(f => f.format === 'Text PDF');
+    if (!fileData) {
+      return '';
+    }
+    const fileUrl = `https://archive.org/download/${identifier}/${fileData.name}`;
+    return fileUrl;
   })
-    .then(() => true)
     .catch(() => {
       book.state = "鎵撳紑璇︽儏椤靛け璐�";
       console.log(`鎵撳紑璇︽儏椤靛け璐�: ${book.id} ${book.title}`);
-      return false;
-    });
-}
-
-async function getDownloadUrl(book) {
-  console.log(`鑾峰彇涓嬭浇閾炬帴`);
-  function getFullUrl(url) {
-    if (!url) { return ''; }
-    return url.startsWith("http") ? url : `https://archive.org${url}`;
-  }
-  return await retry(async () => {
-    const elements = await driver.findElements(
-      By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div/a`)
-    );
-
-    let pdfUrl = "";
-    let textUrl = "";
-    for (const el of elements) {
-      let text = await el.getText();
-      if (text) {
-        text = text.trim().split("\n")[0];
-        const href = getFullUrl(await el.getAttribute("href"));
-        if (text.toLowerCase() === "pdf") {
-          pdfUrl = href;
-        } else if (text.toLowerCase() === "full text") {
-          textUrl = href;
-        } else if (text.toLowerCase() === "ocr search text") {
-          textUrl = href;
-        }
-      }
-    }
-
-    if (pdfUrl) {
-      return pdfUrl;
-    } else if (textUrl) {
-      return textUrl;
-    } else {
-      book.state = "娌℃湁text鏂囦欢";
-      return ''
-    }
-  })
-    .catch(() => {
-      book.state = "娌℃湁text鏂囦欢";
       return '';
     });
 }
@@ -395,11 +309,12 @@
   console.log(`涓嬭浇鏂囦欢: ${url}`);
   const ext = url.split(".").pop().toLowerCase();
   const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
+  book.url = url;
   if (fs.existsSync(filepath)) {
     book.state = `涓嬭浇瀹屾垚`;
     book.format = ext;
     book.file = filepath;
-    book.url = url;
+    book.pages = await getPdfPages(filepath).catch(() => 0);
     console.log(`涓嬭浇瀹屾垚锛�${filepath}`);
     return;
   }
@@ -415,9 +330,6 @@
         const len = response.headers['content-length'];
         if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) {
           // 涓嶆槸pdf鎴杢xt鏂囦欢锛屼笖鏂囦欢澶т簬200M锛屼笉涓嬭浇
-          book.state = "涓嬭浇澶辫触";
-          book.url = url;
-          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
           reject(false);
           return;
         }
@@ -425,20 +337,18 @@
         const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
         const out = fs.createWriteStream(_filepath);
         stream.pipe(out);
-        stream.on("end", () => {
+        out.on("finish", async () => { // wait until the file is fully flushed to disk before getPdfPages reads it
           clearTimeout(timeout);
           book.state = `涓嬭浇瀹屾垚`;
           book.format = ext;
           book.file = filepath;
           book.url = url;
+          book.pages = await getPdfPages(filepath).catch(e => 0);
           resolve(true);
         });
         stream.on("error", (err) => {
           clearTimeout(timeout);
           console.error(err);
-          book.state = "涓嬭浇澶辫触";
-          book.url = url;
-          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
           reject(false);
           try {
             out.close();
@@ -450,13 +360,19 @@
       })
       .catch((e) => {
         clearTimeout(timeout);
-        console.error(e);
-        book.state = "涓嬭浇澶辫触";
+        console.log(`涓嬭浇澶辫触锛岄敊璇爜: ${e?.response?.status ?? e.code}`);
         book.url = url;
-        console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
-        reject(false);
+        if (e.response?.status === 403 || e.response?.status === 401) {
+          book.state = "娌℃湁涓嬭浇鏉冮檺";
+          console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
+          resolve(true);
+        } else {
+          reject(false);
+        }
       }));
   }).catch(e => {
+    book.state = "涓嬭浇澶辫触";
+    console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`);
     return false
   });
 }
@@ -480,17 +396,8 @@
   });
 }
 
-function getBookInfo(book) {
-  return retry(async () => {
-    book.publisher = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`).catch(e => 0);
-    book.pubDate = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`).catch(e => 0);
-    let pages = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`).catch(e => 0);
-    if (pages) { book.pages = pages.split(' / ')[1]; }
-  });
-}
 
 async function downloadBooks(books) {
-  driver = await createDriver();
 
   for (; ;) {
     const book = await nextBook();
@@ -503,47 +410,38 @@
       break;
     }
     bookCount++;
-    /*if (isAlreadyDownloaded(book)) {
+    if (isAlreadyDownloaded(book)) {
       skipCount++;
+      book.skip = true;
       continue;
     }
-     if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
+    if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
       // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫�
       skipCount++;
       continue;
-    } */
+    }
     console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`);
     // 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
-    if (!await openSearchPage(book, true)) {
+    let detailPageUrl = await getBookDetailPageUrl(book, true);
+    if (!detailPageUrl) {
       // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧
-      if (!await openSearchPage(book, false)) {
-        console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`);
-        book.state = "鎵撳紑鎼滅储椤甸潰澶辫触";
+      detailPageUrl = await getBookDetailPageUrl(book, false);
+      if (!detailPageUrl) {
+        console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`);
+        book.state = "娌℃湁鎼滅储缁撴灉";
         continue;
       }
     }
-    // 妫�娴嬫悳绱㈢粨鏋�
-    const hasBook = await checkSearchResult(book);
-    if (!hasBook) {
-      continue;
-    }
-    // 鑾峰彇璇︽儏椤甸摼鎺�
-    const detailPageUrl = await findBookDetailPageUrl(book);
-    if (!detailPageUrl) {
-      console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`);
-      book.state = "鑾峰彇璇︽儏椤甸摼鎺ュけ璐�";
-      continue;
-    }
     // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤�
-    sleep(getRandomNumber(500, 10000));
-    // 鎵撳紑璇︽儏椤�
-    await openBookDetailPage(book, detailPageUrl);
-    await getBookInfo(book);
-    // 鑾峰彇涓嬭浇閾炬帴
-    const url = await getDownloadUrl(book);
-    if (!url) { continue; }
+    await sleep(getRandomNumber(500, 1000));
+    // Open the detail page and extract the download link from its embedded metadata
+    const url = await openBookDetailPage(book, detailPageUrl);
+    if (!url) {
+      console.log(`娌℃湁pdf鎴杢ext鏂囦欢: ${book.id} ${book.title}`);
+      continue;
+    }
     // 绛夊緟涓�娈垫椂闂村啀涓嬭浇
-    await sleep(getRandomNumber(500, 10000));
+    await sleep(getRandomNumber(500, 1000));
     // 涓嬭浇鏂囦欢
     try {
       await downloadFile(book, url);
@@ -552,7 +450,7 @@
     } catch (e) { }
     successCount++;
     // 绛変竴娈垫椂闂村啀涓嬩竴涓�
-    sleep(getRandomNumber(500, 10000));
+    await sleep(getRandomNumber(500, 1000));
   }
 }
 
@@ -575,8 +473,15 @@
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
-  fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
-  console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
+  try {
+    fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { });
+    console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx");
+  } catch (e) {
+    console.error(e);
+    const outfile = `${Date.now()}.json`;
+    fs.writeFileSync(outfile, JSON.stringify(data));
+    console.log("淇濆瓨瀹屾垚: " + outfile);
+  }
 }
 
 
@@ -611,8 +516,6 @@
 let bookCount = 0;
 // 璺宠繃鐨勬暟閲忥紝宸茬粡涓嬭浇杩囨垨娌℃湁鎼滅储鍒扮殑鏁伴噺
 let skipCount = 0;
-// chrome椹卞姩
-let driver;
 let alreadyDownloadedBooks = [];
 
 function getAlreadyDownloadedBooks() {
@@ -623,7 +526,7 @@
   return books.map(it => path.basename(it, path.extname(it)).trim());
 }
 
-function main() {
+function startDownload() {
   initLogger();
   const books = [];
   downloadBooks(books)
@@ -637,49 +540,59 @@
       // saveBooks(books);
       parentPort.postMessage({ type: "books", data: books });
       logFile.close();
-      try {
-        await driver.close();
-        await driver.quit();
-      } catch (e) { }
     });
 }
 
-if (!fs.existsSync('tmpdir')) {
-  fs.mkdirSync('tmpdir', { recursive: true });
-}
-if (!fs.existsSync('downloads')) {
-  fs.mkdirSync('downloads', { recursive: true });
-}
+function main() {
 
-// 澶氳繘绋嬫墽琛�
-if (isMainThread) {
-  initLogger();
-  const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
-  const { startRow, endRow, threadSize } = config;
-  console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
-  let finishCnt = 0;
-  const finishBooks = [];
-  const books = getBooksFromExcel(startRow, endRow);
-
-  for (let i = 0; i < threadSize; i++) {
-    const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } });
-    worker.on("message", (message) => {
-      if (message.type === 'books') {
-        finishBooks.push(...message.data);
-        finishCnt++;
-        if (finishCnt >= threadSize) {
-          saveBooks(finishBooks);
-        }
-      } else if (message.type === 'get-book') {
-        worker.postMessage({ type: "book", data: books.shift() });
-      }
-    });
+  if (!fs.existsSync('tmpdir')) {
+    fs.mkdirSync('tmpdir', { recursive: true });
   }
-} else {
-  alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
-  main();
+  if (!fs.existsSync('downloads')) {
+    fs.mkdirSync('downloads', { recursive: true });
+  }
+  // Run with multiple worker threads: the main thread hands out books, each worker downloads them
+  if (isMainThread) {
+    initLogger();
+    let downloadCnt = 0;
+    const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
+    const { startRow, endRow, threadSize } = config;
+    console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
+    let finishThreadCnt = 0;
+    const finishBooks = [];
+    const books = getBooksFromExcel(startRow, endRow);
+
+    for (let i = 0; i < threadSize; i++) {
+      const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } });
+      worker.on("message", (message) => {
+        if (message.type === 'books') {
+          finishBooks.push(...message.data);
+          finishThreadCnt++;
+          if (finishThreadCnt >= threadSize) {
+            successCount = finishBooks.filter(it => it.state === '涓嬭浇瀹屾垚').length;
+            skipCount = finishBooks.filter(it => it.skip).length;
+            console.log(`鍏ㄩ儴绾跨▼瀹屾垚锛屽叡涓嬭浇${downloadCnt}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}锛屽け璐�${downloadCnt - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+            saveBooks(finishBooks);
+          }
+        } else if (message.type === 'get-book') {
+          if (books.length > 0) { downloadCnt++; } // count only requests that actually hand out a book
+          worker.postMessage({ type: "book", data: books.shift() });
+        }
+      });
+    }
+    // Listen for SIGINT (Ctrl+C) and save the books already reported by finished workers before exiting
+    process.on('SIGINT', () => {
+      successCount = finishBooks.filter(it => it.state === '涓嬭浇瀹屾垚').length;
+      skipCount = finishBooks.filter(it => it.skip).length;
+      console.log(`杩涚▼琚墜鍔ㄧ粨鏉燂紝鍏变笅杞�${downloadCnt}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}锛屽け璐�${downloadCnt - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙);
+      saveBooks(finishBooks);
+      process.exit(0);
+    });
+  } else {
+    alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
+    startDownload();
+
+  }
 }
 
-// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt";
-// let text = fs.readFileSync(filepath, 'utf8');
-// fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
\ No newline at end of file
+main();
\ No newline at end of file

--
Gitblit v1.9.1