From 831697d95be0123fade180aedded20db01f1884b Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期三, 17 七月 2024 12:23:19 +0800
Subject: [PATCH] 修改isbn查询

---
Review notes for this revision (free text ahead of the first diff header; it
is not part of any hunk):

* Line structure was restored and text that had been decoded with the wrong
  charset was repaired (this also brings back the closing backtick of two
  template literals). Hunk ranges and the diffstat were recomputed from the
  hunk bodies below; the range of the last hunk is best-effort.
* Comments on context and removed lines stay in the file's own language so
  that they keep mirroring the file; comments on added lines are in English.
* Replaced code is deleted instead of being left behind as commented-out
  lines.
* Worker loop: the pause between two books is awaited, the same way
  getBooks() awaits sleep().
  NOTE(review): assumes sleep() returns a promise - confirm.
* Worker .finally(): an empty "books" message is still posted, because the
  only visible increment of finishThreadCnt is in the main thread's "books"
  branch.
  NOTE(review): confirm against the part of main() that is not in this patch.
* getBookList(): the year filter is built from `year` (the request for 2024
  is unchanged) instead of being fixed to 2023-2024, and getBooks() steps
  `year` by the width of that window.
  NOTE(review): confirm the window width against the search API.
* getBooks(): paging goes on until every reported hit has been requested
  (the former `pageSize * page < total` test ran after page++ and skipped
  the last page); a page that still fails after retry() ends the listing for
  that initial instead of being requested again without end.
* takeBook(): replaces the refill loop of the "get-book" handler, which spun
  without yielding once `year` had reached 1950.

 src/book-isbn-search.mjs |  183 +++++++++++++++++++++++++++++++++++++------------------
 1 files changed, 122 insertions(+), 61 deletions(-)

diff --git a/src/book-isbn-search.mjs b/src/book-isbn-search.mjs
index 4b4b362..a86b02a 100644
--- a/src/book-isbn-search.mjs
+++ b/src/book-isbn-search.mjs
@@ -3,8 +3,9 @@
 import * as fs from "fs";
 import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
 import { HttpsProxyAgent } from "https-proxy-agent";
+import * as cheerio from 'cheerio';
 
-const EXCEL_FILE = "fiction-noisbn.xlsx";
+const EXCEL_FILE = "book-list.xlsx";
 
 /*-------------读取配置---------------*/
 let config = JSON.parse(fs.readFileSync('./config.json'));
@@ -30,31 +31,6 @@
   proxy: false,
   httpsAgent,
 });
-
-/**
- * 获取要下载熟图书信息
- * @param {number} startRow 起始行，包含
- * @param {number} endRow 结束行，不包含
- * @returns
- */
-function getBooksFromExcel(startRow, endRow) {
-  const workSheets = xlsx.parse(EXCEL_FILE);
-  const sheet = workSheets[0];
-  const data = sheet.data.slice(startRow, endRow);
-  const books = data.map((row) => {
-    return {
-      id: row[0],
-      title: row[1],
-      author: row[2],
-      year: row[3],
-      publisher: row[4],
-      isbn: row[5],
-      extension: row[6],
-      state: row[7]
-    };
-  });
-  return books;
-}
 
 /**
  * 格式化关键字
@@ -166,32 +142,19 @@
       break;
     }
     bookCount++;
-    if (isAlreadyDownloaded(book)) {
-      skipCount++;
-      book.skip = true;
-      continue;
-    }
     if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) {
       // 跳过没有搜索结果或没有pdf或text文件的书籍
       skipCount++;
       continue;
     }
     console.log(`开始下载: ${book.id} ${book.title}`);
-    // 打开搜索页面并搜索
-    let detailPageUrl = await getBookDetailPageUrl(book, true);
-    if (!detailPageUrl) {
-      // 先用包含数字的关键字，如果没有结果再用不包含数字的关键字
-      detailPageUrl = await getBookDetailPageUrl(book, false);
-      if (!detailPageUrl) {
-        console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
-        book.state = "没有搜索结果";
-        continue;
-      }
-    }
-    // 等一段时间再打开详情页
-    sleep(getRandomNumber(500, 1000));
     // 打开详情页，并获取isbn
+    const detailPageUrl = `https://archive.org/details/${book.id}`;
     await openBookDetailPage(book, detailPageUrl);
+    // Report a resolved ISBN right away so the main thread can collect it.
+    if (book.isbn) {
+      parentPort.postMessage({ type: "book", data: book });
+    }
     // 等一段时间再下一个
-    sleep(getRandomNumber(500, 1000));
+    await sleep(getRandomNumber(500, 1000));
   }
@@ -199,19 +162,11 @@
 
 function saveBooks(books) {
   console.log("保存下载状态数据");
-  const workSheets = xlsx.parse(EXCEL_FILE);
-  const sheet = workSheets[0];
+  const sheet = { name: "Sheet1", data: [["ID", "Title", "Author", "Year", "Publisher", "ISBN"]] };
   const data = sheet.data;
   for (const book of books) {
-    const index = data.findIndex((row) => row[0] === book.id);
-    if (index > -1) {
-      data[index][5] = book.isbn;
-      if (!data[index][3])
-        data[index][3] = book.pubDate;
-      if (!data[index][4])
-        data[index][4] = book.publisher;
-      data[index][7] = book.state;
-    }
+    const row = [book.id, book.title, book.author, book.pubDate, book.publisher, book.isbn];
+    data.push(row);
   }
 
   const buffer = xlsx.build([{ name: "Sheet1", data }]);
@@ -271,11 +226,115 @@
     })
     .finally(async () => {
       // saveBooks(books);
-      parentPort.postMessage({ type: "books", data: books });
+      // Books with an ISBN were already reported one by one through "book"
+      // messages; this empty "books" message only tells the main thread that
+      // the worker has finished.
+      parentPort.postMessage({ type: "books", data: [] });
       logFile.close();
     });
 }
 
+// Listing cursor: the last year of the window being listed and the index
+// into codeList of the title initial being listed.
+let year = 2024;
+let codeIndex = 0;
+const codeList = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"];
+// Consecutive publication years covered by one listing query (2 or more).
+const YEARS_PER_QUERY = 2;
+
+/**
+ * Request one page of the archive.org book listing.
+ * @param {number} pageSize hits per page
+ * @param {number} page page number (getBooks starts at 1)
+ * @param {string} code title initial to filter on
+ * @param {number} [toYear] last year of the queried window; defaults to `year`
+ * @returns {Promise<object>} whatever myAxios.get resolves to
+ */
+async function getBookList(pageSize, page, code, toYear = year) {
+  const filterMap = encodeURIComponent(JSON.stringify({
+    year: { [toYear - YEARS_PER_QUERY + 1]: "gte", [toYear]: "lte" },
+    firstTitle: { [code]: "inc" },
+  }));
+  const url = `https://archive.org/services/search/beta/page_production/?user_query=&page_type=collection_details&page_target=books&hits_per_page=${pageSize}&page=${page}&filter_map=${filterMap}&sort=titleSorter%3Aasc&aggregations=false&uid=R%3A1e845903aec74dee14bd-S%3A8cde5bf234b86bf96a75-P%3A1-K%3Ah-T%3A1718106108852`;
+  return await myAxios.get(url);
+}
+
+/**
+ * Collect every listed book for the current cursor position, then advance
+ * the cursor (next initial; after the last initial, the previous window).
+ * @returns {Promise<Array<{id: string, title: string, author?: string}>>}
+ */
+async function getBooks() {
+  const pageSize = 100;
+  const code = codeList[codeIndex];
+  const bookList = [];
+  let page = 1;
+  let total = 0;
+  console.log(`${year}年 ${codeIndex}`);
+  try {
+    do {
+      console.log(`正在获取 ${year} 年 ${code} 分类 ${page} 页`);
+      let resp = null;
+      try {
+        resp = await retry(() => getBookList(pageSize, page, code, year));
+      } catch {
+        console.log(`获取失败：${year} 年 ${code} 分类 ${page} 页`);
+      }
+      if (!resp) {
+        // Give up on this initial rather than requesting the same page forever.
+        break;
+      }
+      const { total: reportedTotal = 0, hits = [] } = resp.data?.response?.body?.hits ?? {};
+      total = reportedTotal;
+      for (const hit of hits) {
+        const { identifier, title, creator } = hit.fields;
+        const author = Array.isArray(creator) ? creator.join(", ") : creator;
+        bookList.push({ id: identifier, title, author });
+      }
+      page++;
+      await sleep(getRandomNumber(300, 800));
+      // page now names the next page, so (page - 1) pages have been read.
+    } while ((page - 1) * pageSize < total);
+  } finally {
+    // Advance even on failure so callers never get stuck on one initial.
+    codeIndex++;
+    if (codeIndex === codeList.length) {
+      year -= YEARS_PER_QUERY;
+      codeIndex = 0;
+    }
+  }
+  return bookList;
+}
+
+// getBooks() call currently in flight; shared by concurrent takeBook() calls.
+let getBookPromise = null;
+
+/**
+ * Take the next book from `queue`; while the queue is empty, fetch further
+ * listings until one yields a book or `year` has reached 1950.
+ * @param {object[]} queue pending books; refilled in place
+ * @returns {Promise<object|undefined>} the next book, or undefined when none is left
+ */
+async function takeBook(queue) {
+  while (queue.length === 0 && (getBookPromise || year > 1950)) {
+    if (!getBookPromise) {
+      getBookPromise = fetchIntoQueue(queue);
+    }
+    await getBookPromise;
+  }
+  return queue.shift();
+}
+
+/** Append one getBooks() batch to `queue`; errors are logged, never rethrown. */
+async function fetchIntoQueue(queue) {
+  try {
+    queue.push(...await getBooks());
+  } catch (e) {
+    console.error(e);
+  } finally {
+    getBookPromise = null;
+  }
+}
 
 function main() {
   if (!fs.existsSync('tmpdir')) {
@@ -289,12 +348,14 @@
   console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
   let finishThreadCnt = 0;
   const finishBooks = [];
-  const books = getBooksFromExcel(startRow, endRow);
+  const books = [];
 
   for (let i = 0; i < threadSize; i++) {
     const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} });
-    worker.on("message", (message) => {
-      if (message.type === 'books') {
+    worker.on("message", async (message) => {
+      if (message.type === 'book') {
+        finishBooks.push(message.data);
+      } else if (message.type === 'books') {
         finishBooks.push(...message.data);
         finishThreadCnt++;
         if (finishThreadCnt >= threadSize) {
@@ -305,7 +366,7 @@
           }
         }
       } else if (message.type === 'get-book') {
         downloadCnt++;
-        worker.postMessage({ type: "book", data: books.shift() });
+        worker.postMessage({ type: "book", data: await takeBook(books) });
       }
     });
-- 
Gitblit v1.9.1