From 831697d95be0123fade180aedded20db01f1884b Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期三, 17 七月 2024 12:23:19 +0800
Subject: [PATCH] 修改isbn查询

---
 yarn.lock                |  113 ++++++++++++++++++++++
 package.json             |    1 
 src/book-isbn-search.mjs |  147 ++++++++++++++++++-----------
 3 files changed, 204 insertions(+), 57 deletions(-)

diff --git a/package.json b/package.json
index cf39113..c54a6e4 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
   "devDependencies": {},
   "dependencies": {
     "axios": "^1.7.2",
+    "cheerio": "^1.0.0-rc.12",
     "https-proxy-agent": "^7.0.4",
     "node-xlsx": "^0.24.0",
     "pdf-lib": "^1.17.1",
diff --git a/src/book-isbn-search.mjs b/src/book-isbn-search.mjs
index 4b4b362..a86b02a 100644
--- a/src/book-isbn-search.mjs
+++ b/src/book-isbn-search.mjs
@@ -3,8 +3,9 @@
 import * as fs from "fs";
 import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
+import * as cheerio from 'cheerio';
 
-const EXCEL_FILE = "fiction-noisbn.xlsx";
+const EXCEL_FILE = "book-list.xlsx";
 
 /*-------------读取配置---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
@@ -30,31 +31,6 @@
   proxy: false,
   httpsAgent,
 });
-
-/**
- * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅
- * @param {number} startRow 璧峰琛岋紝鍖呭惈
- * @param {number} endRow 缁撴潫琛岋紝涓嶅寘鍚�
- * @returns 
- */
-function getBooksFromExcel(startRow, endRow) {
-  const workSheets = xlsx.parse(EXCEL_FILE);
-  const sheet = workSheets[0];
-  const data = sheet.data.slice(startRow, endRow);
-  const books = data.map((row) => {
-    return {
-      id: row[0],
-      title: row[1],
-      author: row[2],
-      year: row[3],
-      publisher: row[4],
-      isbn: row[5],
-      extension: row[6],
-      state: row[7]
-    };
-  });
-  return books;
-}
 
 /**
  * 鏍煎紡鍖栧叧閿瓧
@@ -166,32 +142,39 @@
       break;
     }
     bookCount++;
-    if (isAlreadyDownloaded(book)) {
-      skipCount++;
-      book.skip = true;
-      continue;
-    }
+    // if (isAlreadyDownloaded(book)) {
+    //   skipCount++;
+    //   book.skip = true;
+    //   continue;
+    // }
     if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) {
       // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫�
       skipCount++;
       continue;
     }
     console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`);
+
+
+
     // 鎵撳紑鎼滅储椤甸潰骞舵悳绱�
-    let detailPageUrl = await getBookDetailPageUrl(book, true);
-    if (!detailPageUrl) {
-      // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧
-      detailPageUrl = await getBookDetailPageUrl(book, false);
-      if (!detailPageUrl) {
-        console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`);
-        book.state = "娌℃湁鎼滅储缁撴灉";
-        continue;
-      }
-    }
+    // let detailPageUrl = await getBookDetailPageUrl(book, true);
+    // if (!detailPageUrl) {
+    //   // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧
+    //   detailPageUrl = await getBookDetailPageUrl(book, false);
+    //   if (!detailPageUrl) {
+    //     console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`);
+    //     book.state = "娌℃湁鎼滅储缁撴灉";
+    //     continue;
+    //   }
+    // }
     // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤�
-    sleep(getRandomNumber(500, 1000));
+    // sleep(getRandomNumber(500, 1000));
     // 鎵撳紑璇︽儏椤碉紝骞惰幏鍙杋sbn
+    const detailPageUrl = `https://archive.org/details/${book.id}`;
     await openBookDetailPage(book, detailPageUrl);
+    if (book.isbn) {
+      parentPort.postMessage({ type: "book", data: book });
+    }
     // 绛変竴娈垫椂闂村啀涓嬩竴涓�
     sleep(getRandomNumber(500, 1000));
   }
@@ -199,19 +182,11 @@
 
 function saveBooks(books) {
   console.log("淇濆瓨涓嬭浇鐘舵�佹暟鎹�");
-  const workSheets = xlsx.parse(EXCEL_FILE);
-  const sheet = workSheets[0];
+  const sheet = { name: "Sheet1", data: [["ID", "Title", "Author", "Year", "Publisher", "ISBN"]] };
   const data = sheet.data;
   for (const book of books) {
-    const index = data.findIndex((row) => row[0] === book.id);
-    if (index > -1) {
-      data[index][5] = book.isbn;
-      if (!data[index][3])
-        data[index][3] = book.pubDate;
-      if (!data[index][4])
-        data[index][4] = book.publisher;
-      data[index][7] = book.state;
-    }
+    const row = [book.id, book.title, book.author, book.pubDate, book.publisher, book.isbn];
+    data.push(row);
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
@@ -271,11 +246,54 @@
     })
     .finally(async () => {
       // saveBooks(books);
-      parentPort.postMessage({ type: "books", data: books });
+      // parentPort.postMessage({ type: "books", data: books });
       logFile.close();
     });
 }
 
+let year = 2024; // year printed in progress logs; getBooks() decrements it after the last entry of codeList
+let codeIndex = 0; // index into codeList of the title initial that the next getBooks() call will fetch
+const codeList = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]; // values substituted into the firstTitle search filter
+
+/** Requests one page of archive.org book search hits (year filter fixed to 2023–2024, firstTitle filter = code) via myAxios. */
+async function getBookList(pageSize, page, code) {
+  return myAxios.get(`https://archive.org/services/search/beta/page_production/?user_query=&page_type=collection_details&page_target=books&hits_per_page=${pageSize}&page=${page}&filter_map=%7B%22year%22%3A%7B%222023%22%3A%22gte%22%2C%222024%22%3A%22lte%22%7D%2C%22firstTitle%22%3A%7B%22${code}%22%3A%22inc%22%7D%7D&sort=titleSorter%3Aasc&aggregations=false&uid=R%3A1e845903aec74dee14bd-S%3A8cde5bf234b86bf96a75-P%3A1-K%3Ah-T%3A1718106108852`);
+}
+
+/**
+ * Fetches every result page for the current title initial, returns [{ id, title, author }], and then
+ * advances the module-level codeIndex/year cursor. NOTE(review): `year` is only logged; getBookList() never gets it.
+ */
+async function getBooks() {
+  const pageSize = 100;
+  const code = codeList[codeIndex];
+  const bookList = [];
+  let total = 0;
+  console.log(`${year}年 ${codeIndex}`);
+  // Page 1 is always requested; afterwards go on only while the pages already covered fall short of total.
+  for (let page = 1; page === 1 || pageSize * (page - 1) < total; page++) {
+    console.log(`正在获取 ${year} 年 ${code} 分类 ${page} 页`);
+    const resp = await retry(() => getBookList(pageSize, page, code)).catch(() => {
+      console.log(`获取失败：${year} 年 ${code} 分类 ${page} 页`);
+    });
+    // If retry() gave up, skip this page (keeping the last known total) rather than re-requesting it forever.
+    const { total: _total = total, hits = [] } = resp?.data.response.body.hits ?? {};
+    total = _total;
+    for (const { fields: { identifier, title, creator } } of hits) {
+      // Join only real arrays so a missing or plain-string creator cannot throw.
+      bookList.push({ id: identifier, title, author: Array.isArray(creator) ? creator.join(", ") : creator });
+    }
+    await sleep(getRandomNumber(300, 800));
+  }
+  codeIndex++;
+  if (codeIndex === codeList.length) {
+    year--;
+    codeIndex = 0;
+  }
+  return bookList;
+}
+
+let getBookPromise = null; // in-flight getBooks() promise; the 'get-book' handler awaits it before checking/refilling the queue
 function main() {
 
   if (!fs.existsSync('tmpdir')) {
@@ -289,12 +307,15 @@
     console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`);
     let finishThreadCnt = 0;
     const finishBooks = [];
-    const books = getBooksFromExcel(startRow, endRow);
+    const books = [];
 
     for (let i = 0; i < threadSize; i++) {
       const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} });
-      worker.on("message", (message) => {
-        if (message.type === 'books') {
+      worker.on("message", async (message) => {
+        if (message.type === 'book') {
+          finishBooks.push(message.data);
+        }
+        else if (message.type === 'books') {
           finishBooks.push(...message.data);
           finishThreadCnt++;
           if (finishThreadCnt >= threadSize) {
@@ -305,6 +326,18 @@
           }
         } else if (message.type === 'get-book') {
           downloadCnt++;
+          if (getBookPromise) {
+            await getBookPromise.finally();
+          }
+          if (books.length == 0) {
+            do {
+              if (year > 1950) {
+                getBookPromise = getBooks();
+                books.push(...await getBookPromise.finally());
+                getBookPromise = null;
+              }
+            } while (!books.length);
+          }
           worker.postMessage({ type: "book", data: books.shift() });
         }
       });
diff --git a/yarn.lock b/yarn.lock
index d69f37b..f8a3dc3 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -37,6 +37,36 @@
     form-data "^4.0.0"
     proxy-from-env "^1.1.0"
 
+boolbase@^1.0.0:
+  version "1.0.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
+  integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==
+
+cheerio-select@^2.1.0:
+  version "2.1.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4"
+  integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==
+  dependencies:
+    boolbase "^1.0.0"
+    css-select "^5.1.0"
+    css-what "^6.1.0"
+    domelementtype "^2.3.0"
+    domhandler "^5.0.3"
+    domutils "^3.0.1"
+
+cheerio@^1.0.0-rc.12:
+  version "1.0.0-rc.12"
+  resolved "https://mirrors.cloud.tencent.com/npm/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683"
+  integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==
+  dependencies:
+    cheerio-select "^2.1.0"
+    dom-serializer "^2.0.0"
+    domhandler "^5.0.3"
+    domutils "^3.0.1"
+    htmlparser2 "^8.0.1"
+    parse5 "^7.0.0"
+    parse5-htmlparser2-tree-adapter "^7.0.0"
+
 combined-stream@^1.0.8:
   version "1.0.8"
   resolved "https://mirrors.cloud.tencent.com/npm/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f"
@@ -49,6 +79,22 @@
   resolved "https://mirrors.cloud.tencent.com/npm/core-util-is/-/core-util-is-1.0.3.tgz#a6042d3634c2b27e9328f837b965fac83808db85"
   integrity sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==
 
+css-select@^5.1.0:
+  version "5.1.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
+  integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
+  dependencies:
+    boolbase "^1.0.0"
+    css-what "^6.1.0"
+    domhandler "^5.0.2"
+    domutils "^3.0.1"
+    nth-check "^2.0.1"
+
+css-what@^6.1.0:
+  version "6.1.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
+  integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
+
 debug@4, debug@^4.3.4:
   version "4.3.5"
   resolved "https://mirrors.cloud.tencent.com/npm/debug/-/debug-4.3.5.tgz#e83444eceb9fedd4a1da56d671ae2446a01a6e1e"
@@ -60,6 +106,41 @@
   version "1.0.0"
   resolved "https://mirrors.cloud.tencent.com/npm/delayed-stream/-/delayed-stream-1.0.0.tgz#df3ae199acadfb7d440aaae0b29e2272b24ec619"
   integrity sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==
+
+dom-serializer@^2.0.0:
+  version "2.0.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53"
+  integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==
+  dependencies:
+    domelementtype "^2.3.0"
+    domhandler "^5.0.2"
+    entities "^4.2.0"
+
+domelementtype@^2.3.0:
+  version "2.3.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d"
+  integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==
+
+domhandler@^5.0.2, domhandler@^5.0.3:
+  version "5.0.3"
+  resolved "https://mirrors.cloud.tencent.com/npm/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31"
+  integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==
+  dependencies:
+    domelementtype "^2.3.0"
+
+domutils@^3.0.1:
+  version "3.1.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/domutils/-/domutils-3.1.0.tgz#c47f551278d3dc4b0b1ab8cbb42d751a6f0d824e"
+  integrity sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==
+  dependencies:
+    dom-serializer "^2.0.0"
+    domelementtype "^2.3.0"
+    domhandler "^5.0.3"
+
+entities@^4.2.0, entities@^4.4.0:
+  version "4.5.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48"
+  integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==
 
 follow-redirects@^1.15.6:
   version "1.15.6"
@@ -74,6 +155,16 @@
     asynckit "^0.4.0"
     combined-stream "^1.0.8"
     mime-types "^2.1.12"
+
+htmlparser2@^8.0.1:
+  version "8.0.2"
+  resolved "https://mirrors.cloud.tencent.com/npm/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21"
+  integrity sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==
+  dependencies:
+    domelementtype "^2.3.0"
+    domhandler "^5.0.3"
+    domutils "^3.0.1"
+    entities "^4.4.0"
 
 https-proxy-agent@^7.0.4:
   version "7.0.4"
@@ -139,11 +230,33 @@
   dependencies:
     xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"
 
+nth-check@^2.0.1:
+  version "2.1.1"
+  resolved "https://mirrors.cloud.tencent.com/npm/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d"
+  integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==
+  dependencies:
+    boolbase "^1.0.0"
+
 pako@^1.0.10, pako@^1.0.11, pako@^1.0.6, pako@~1.0.2:
   version "1.0.11"
   resolved "https://mirrors.cloud.tencent.com/npm/pako/-/pako-1.0.11.tgz#6c9599d340d54dfd3946380252a35705a6b992bf"
   integrity sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==
 
+parse5-htmlparser2-tree-adapter@^7.0.0:
+  version "7.0.0"
+  resolved "https://mirrors.cloud.tencent.com/npm/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz#23c2cc233bcf09bb7beba8b8a69d46b08c62c2f1"
+  integrity sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==
+  dependencies:
+    domhandler "^5.0.2"
+    parse5 "^7.0.0"
+
+parse5@^7.0.0:
+  version "7.1.2"
+  resolved "https://mirrors.cloud.tencent.com/npm/parse5/-/parse5-7.1.2.tgz#0736bebbfd77793823240a23b7fc5e010b7f8e32"
+  integrity sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==
+  dependencies:
+    entities "^4.4.0"
+
 pdf-lib@^1.17.1:
   version "1.17.1"
   resolved "https://mirrors.cloud.tencent.com/npm/pdf-lib/-/pdf-lib-1.17.1.tgz#9e7dd21261a0c1fb17992580885b39e7d08f451f"

--
Gitblit v1.9.1