From 42295152e10773a2bd394ac14f6feb2c4bc501a7 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期六, 13 七月 2024 13:34:13 +0800
Subject: [PATCH] 增加ISBN获取脚本

---
 config.json              |    6 
 package.json             |    4 
 src/book-isbn-search.mjs |  326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 332 insertions(+), 4 deletions(-)

diff --git a/config.json b/config.json
index 13adc51..8e59de2 100644
--- a/config.json
+++ b/config.json
@@ -1,12 +1,12 @@
 {
   "//璧峰琛屽彿锛屼粠0寮�濮嬶紝绗�0琛屾槸琛ㄥご锛屽寘鍚�": "//",
-  "startRow": 5,
+  "startRow": 1,
   
   "//缁撴潫琛屽彿锛屼粠0寮�濮嬶紝涓嶅寘鍚琛�": "//",
-  "endRow": 10,
+  "endRow": 1095110,
 
   "//绾跨▼鏁�": "//",
-  "threadSize": 1,
+  "threadSize": 10,
 
   "//瀹氭椂缁撴潫鏃堕棿锛屽崟浣嶅垎閽燂紝0琛ㄧず涓嶈缃畾鏃剁粨鏉熸椂闂�": "//",
   "endOfTime": 0,
diff --git a/package.json b/package.json
index ce8c83c..cf39113 100644
--- a/package.json
+++ b/package.json
@@ -7,7 +7,9 @@
   "scripts": {
     "download": "node src/main.mjs",
     "book-list": "node src/book-list-download.mjs",
-    "parse-log": "node src/parse-log.mjs"
+    "parse-log": "node src/parse-log.mjs",
+    "trans": "node src/trans.mjs",
+    "book-isbn": "node src/book-isbn-search.mjs"
   },
   "devDependencies": {},
   "dependencies": {
diff --git a/src/book-isbn-search.mjs b/src/book-isbn-search.mjs
new file mode 100644
index 0000000..4b4b362
--- /dev/null
+++ b/src/book-isbn-search.mjs
@@ -0,0 +1,326 @@
+import xlsx from "node-xlsx";
+import axios from "axios";
+import * as fs from "fs";
+import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
+import { HttpsProxyAgent } from "https-proxy-agent";
+
+/* Workbook that is read for the book list and rewritten with the results. */
+const EXCEL_FILE = "fiction-noisbn.xlsx";
+/* Run settings (startRow, endRow, threadSize, endOfTime), parsed when the module loads. */
+const config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));
+
+/* ------------ Logging: mirror console.log into a per-thread log file ------------ */
+let logFile;
+function initLogger() {
+  // Keep the native console.log so the wrapper below can still print to stdout.
+  const printToConsole = console.log;
+  const logDir = './book-isbn-logs';
+  if (!fs.existsSync(logDir)) fs.mkdirSync(logDir, { recursive: true });
+  logFile = fs.createWriteStream(`${logDir}/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
+  console.log = (...parts) => {
+    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
+    printToConsole(line);
+    logFile.write(`${line}\n`);
+  };
+}
+
+/* ---------- axios instance that sends HTTPS requests through a local proxy ---------- */
+// axios' own proxy option is switched off; requests use the HttpsProxyAgent instead.
+const PROXY_ENDPOINT = "http://127.0.0.1:10809";
+const httpsAgent = new HttpsProxyAgent(PROXY_ENDPOINT);
+const axiosOptions = { proxy: false, httpsAgent };
+const myAxios = axios.create(axiosOptions);
+
+/**
+ * Reads a range of rows from the first sheet of EXCEL_FILE and converts each
+ * row into a book record.
+ * Columns are mapped by position:
+ * 0 id, 1 title, 2 author, 3 year, 4 publisher, 5 isbn, 6 extension, 7 state.
+ *
+ * @param {number} startRow first row to read (inclusive)
+ * @param {number} endRow row to stop at (exclusive)
+ * @returns {object[]} one record per row; cells that are absent stay undefined
+ */
+function getBooksFromExcel(startRow, endRow) {
+  const [firstSheet] = xlsx.parse(EXCEL_FILE);
+  const rows = firstSheet.data.slice(startRow, endRow);
+  return rows.map(([id, title, author, year, publisher, isbn, extension, state]) => ({
+    id,
+    title,
+    author,
+    year,
+    publisher,
+    isbn,
+    extension,
+    state,
+  }));
+}
+
+/**
+ * Builds the search keyword for a title: optionally strips digits, then joins
+ * the first six whitespace-separated words with "+".
+ * @param {string} text title to search for; non-strings are coerced, null/undefined become ""
+ * @param {boolean} titleWithNumbers keep digits when true, remove them when false
+ * @returns {string} the formatted keyword
+ */
+function formatKw(text, titleWithNumbers) {
+  // Cells read from Excel can be numbers or empty, so never assume a string here.
+  const raw = String(text ?? "");
+  const cleaned = titleWithNumbers ? raw : raw.replace(/\d/g, "");
+  // Split on runs of whitespace and drop empties so removed digits leave no "++" gaps.
+  const words = cleaned.split(/\s+/).filter(Boolean);
+  return words.slice(0, 6).join("+");
+}
+
+// Pauses an async caller: the returned promise is fulfilled (with undefined)
+// once a timer of `ms` milliseconds has fired.
+async function sleep(ms) {
+  const elapsed = new Promise((done) => { setTimeout(done, ms); });
+  return elapsed;
+}
+/** Runs `func`; on failure waits `delay` ms and tries again, up to `maxTry` extra attempts, then rethrows the last error. */
+async function retry(func, maxTry = 3, delay = 3000) {
+  let retriesLeft = maxTry;
+  for (; ;) {
+    try {
+      return await func();
+    } catch (lastError) {
+      if (!(retriesLeft > 0)) throw lastError;
+    }
+    retriesLeft = retriesLeft - 1;
+    await sleep(delay);
+  }
+}
+
+/**
+ * Searches archive.org for the book title and builds the details-page URL of the first hit.
+ * @param {*} book record whose `title` is the query (`id` is only used for logging)
+ * @param {boolean} titleWithNumbers passed to formatKw: keep digits in the query when true
+ * @returns {Promise<string>} the URL, or '' when nothing was found or every attempt failed
+ */
+async function getBookDetailPageUrl(book, titleWithNumbers) {
+  const kw = formatKw(book.title, titleWithNumbers);
+  const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`;
+  const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}`;
+  console.log(`打开搜索: ${searchUrl}`);
+  return await retry(async () => {
+    const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
+    const { total, hits } = resp.data.response.body.hits;
+    if (total === 0 || !hits?.length) return '';
+    return `https://archive.org/details/${hits[0].fields.identifier}`;
+  }).catch((e) => {
+    console.log(`搜索请求失败: ${book.id} ${e?.message ?? e}`);
+    return '';
+  });
+}
+/** Loads the details page and copies isbn / publisher / date from its embedded metadata onto `book`; sets book.state when every attempt fails. */
+async function openBookDetailPage(book, detailPageUrl) {
+  console.log(`打开详情: ${detailPageUrl}`);
+  return await retry(async () => {
+    const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
+    // Non-greedy capture so the match stops at the end of this attribute value.
+    const match = /<input class="js-ia-metadata" type="hidden" value='(.*?)'\/>/.exec(resp.data);
+    if (!match) throw new Error(`js-ia-metadata input not found: ${detailPageUrl}`);
+    const { metadata } = JSON.parse(match[1]);
+    const isbns = [].concat(metadata.isbn ?? []);
+    // Longest value first, e.g. a 13-digit ISBN wins over a 10-digit one.
+    if (isbns.length) book.isbn = isbns.sort((a, b) => b.length - a.length)[0];
+    book.publisher = metadata.publisher;
+    book.pubDate = metadata.date;
+  }).catch((e) => {
+    book.state = "打开详情页失败";
+    console.log(`打开详情页失败: ${book.id} ${book.title} ${e?.message ?? e}`);
+    return '';
+  });
+}
+/** Returns book.isbn; callers use its truthiness to skip records that already have an ISBN. */
+function isAlreadyDownloaded(book) {
+  return book.isbn;
+}
+
+/** Asks the parent thread for the next book; resolves with the `data` of its 'book' reply. */
+function nextBook() {
+  const request = { type: 'get-book', threadId };
+  return new Promise((deliver) => {
+    function onParentMessage({ type, data }) {
+      if (type !== 'book') return;
+      parentPort.removeListener('message', onParentMessage);
+      deliver(data);
+    }
+    parentPort.on('message', onParentMessage);
+    parentPort.postMessage(request);
+  });
+}
+
+/** Worker loop: requests books one at a time, looks up each ISBN on archive.org, and appends every received book to `books`. */
+async function downloadBooks(books) {
+  for (; ;) {
+    const book = await nextBook();
+    if (!book) {
+      break;
+    }
+    books.push(book);
+    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
+      // Configured time limit (minutes) reached: stop taking new work.
+      break;
+    }
+    bookCount++;
+    if (isAlreadyDownloaded(book)) {
+      skipCount++;
+      book.skip = true;
+      continue;
+    }
+    if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) {
+      // Skip books whose recorded state marks them as finished or not obtainable.
+      skipCount++;
+      continue;
+    }
+    console.log(`开始下载: ${book.id} ${book.title}`);
+    // Search with the digits kept first; if that finds nothing, retry with digits removed.
+    let detailPageUrl = await getBookDetailPageUrl(book, true);
+    if (!detailPageUrl) {
+      detailPageUrl = await getBookDetailPageUrl(book, false);
+      if (!detailPageUrl) {
+        console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
+        book.state = "没有搜索结果";
+        continue;
+      }
+    }
+    // Pause before opening the details page; the delay only happens when awaited.
+    await sleep(getRandomNumber(500, 1000));
+    // Open the details page and read the ISBN.
+    await openBookDetailPage(book, detailPageUrl);
+    // Books that already had an ISBN were skipped above, so a value here is a new result.
+    if (book.isbn) successCount++;
+    // Pause before moving on to the next book.
+    await sleep(getRandomNumber(500, 1000));
+  }
+}
+/** Writes isbn, state and (only where the cell is empty) year / publisher of `books` back into EXCEL_FILE; dumps the rows to a JSON file if that fails. */
+function saveBooks(books) {
+  console.log("保存下载状态数据");
+  const data = xlsx.parse(EXCEL_FILE)[0].data;
+  // Index rows by id once (first occurrence wins, like findIndex) instead of scanning per book.
+  const rowById = new Map();
+  for (const row of data) {
+    if (!rowById.has(row[0])) rowById.set(row[0], row);
+  }
+  for (const book of books) {
+    const row = rowById.get(book.id);
+    if (!row) continue;
+    row[5] = book.isbn;
+    if (!row[3]) row[3] = book.pubDate;
+    if (!row[4]) row[4] = book.publisher;
+    row[7] = book.state;
+  }
+  try {
+    // Build inside the try so a failure here also reaches the JSON fallback.
+    const buffer = xlsx.build([{ name: "Sheet1", data }]);
+    fs.writeFileSync(EXCEL_FILE, buffer);
+    console.log("保存完成: ", EXCEL_FILE);
+  } catch (e) {
+    console.error(e);
+    const outfile = `${Date.now()}.json`;
+    fs.writeFileSync(outfile, JSON.stringify(data));
+    console.log("保存完成: " + outfile);
+  }
+}
+
+
+/**
+ * Formats a millisecond duration as "[D天]H时M分S秒"; the day part appears only when D > 0.
+ * @param {number} ms duration in milliseconds
+ * @returns {string} the formatted duration
+ */
+function msFormat(ms) {
+  const sec = Math.floor(ms / 1000);
+  const min = Math.floor(sec / 60);
+  const hour = Math.floor(min / 60);
+  const day = Math.floor(hour / 24);
+  return `${day > 0 ? `${day}天` : ""}${hour % 24}时${min % 60}分${sec % 60}秒`;
+}
+
+/**
+ * Returns a pseudo-random floating-point number between min and max.
+ * @param {number} min lower bound
+ * @param {number} max upper bound
+ * @returns {number} min plus a random fraction (from Math.random) of (max - min)
+ */
+function getRandomNumber(min, max) {
+  return min + (max - min) * Math.random();
+}
+
+// Start time (ms timestamp taken when this module is loaded)
+const startTime = Date.now();
+// Number of books processed successfully
+let successCount = 0;
+// Number of books handled
+let bookCount = 0;
+// Number of skipped books: already done earlier, or nothing was found by the search
+let skipCount = 0;
+/** Worker entry point: runs downloadBooks, logs a per-thread summary, then sends the collected books to the parent. */
+function startDownload() {
+  initLogger();
+  const books = [];
+  downloadBooks(books)
+    .then(() => {
+      console.log(`线程：${threadId}全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
+    })
+    .catch(e => {
+      console.error(e);
+    })
+    .finally(() => {
+      // Runs after success and after an error alike, so the parent always gets what was collected.
+      parentPort.postMessage({ type: "books", data: books });
+      logFile.close();
+    });
+}
+/** Entry point: the main thread hands out books to workers and saves their results; worker threads run startDownload. */
+function main() {
+  if (!fs.existsSync('tmpdir')) fs.mkdirSync('tmpdir', { recursive: true });
+  // The same file runs in the main thread (coordinator) and in every worker.
+  if (isMainThread) {
+    initLogger();
+    let downloadCnt = 0;
+    const { startRow, endRow, threadSize } = config;
+    console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
+    let finishThreadCnt = 0;
+    const finishBooks = [];
+    const books = getBooksFromExcel(startRow, endRow);
+    let nextIndex = 0; // cursor into books; avoids an O(n) shift() per request
+    for (let i = 0; i < threadSize; i++) {
+      const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} });
+      worker.on("message", (message) => {
+        if (message.type === 'books') {
+          // Push one by one: spreading a very large array into push() can overflow the stack.
+          for (const book of message.data) finishBooks.push(book);
+          finishThreadCnt++;
+          if (finishThreadCnt >= threadSize) {
+            // Success = has an ISBN and was not skipped, so skipped books are not counted twice.
+            successCount = finishBooks.filter(it => it.isbn && !it.skip).length;
+            skipCount = finishBooks.filter(it => it.skip).length;
+            console.log(`全部线程完成，共下载${downloadCnt}本，成功下载${successCount}本，跳过${skipCount}，失败${downloadCnt - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
+            saveBooks(finishBooks);
+          }
+        } else if (message.type === 'get-book') {
+          // Past the end of the list the reply carries undefined data, which ends the worker loop.
+          const book = books[nextIndex++];
+          if (book) downloadCnt++;
+          worker.postMessage({ type: "book", data: book });
+        }
+      });
+    }
+    // On SIGINT (Ctrl+C), save what the finished workers have reported so far.
+    process.on('SIGINT', () => {
+      successCount = finishBooks.filter(it => it.isbn && !it.skip).length;
+      skipCount = finishBooks.filter(it => it.skip).length;
+      console.log(`进程被手动结束，共下载${downloadCnt}本，成功下载${successCount}本，跳过${skipCount}，失败${downloadCnt - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
+      saveBooks(finishBooks);
+      process.exit(0);
+    });
+  } else {
+    startDownload();
+  }
+}
+
+main();
\ No newline at end of file

--
Gitblit v1.9.1