package.json | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
src/book-isbn-search.mjs | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
yarn.lock | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
package.json
@@ -14,6 +14,7 @@ "devDependencies": {}, "dependencies": { "axios": "^1.7.2", "cheerio": "^1.0.0-rc.12", "https-proxy-agent": "^7.0.4", "node-xlsx": "^0.24.0", "pdf-lib": "^1.17.1", src/book-isbn-search.mjs
@@ -3,8 +3,9 @@ import * as fs from "fs"; import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; import { HttpsProxyAgent } from "https-proxy-agent"; import * as cheerio from 'cheerio'; const EXCEL_FILE = "fiction-noisbn.xlsx"; const EXCEL_FILE = "book-list.xlsx"; /*-------------读取配置---------------*/ let config = JSON.parse(fs.readFileSync('./config.json')); @@ -30,31 +31,6 @@ proxy: false, httpsAgent, }); /** * 获取要下载熟图书信息 * @param {number} startRow 起始行,包含 * @param {number} endRow 结束行,不包含 * @returns */ function getBooksFromExcel(startRow, endRow) { const workSheets = xlsx.parse(EXCEL_FILE); const sheet = workSheets[0]; const data = sheet.data.slice(startRow, endRow); const books = data.map((row) => { return { id: row[0], title: row[1], author: row[2], year: row[3], publisher: row[4], isbn: row[5], extension: row[6], state: row[7] }; }); return books; } /** * 格式化关键字 @@ -166,32 +142,39 @@ break; } bookCount++; if (isAlreadyDownloaded(book)) { skipCount++; book.skip = true; continue; } // if (isAlreadyDownloaded(book)) { // skipCount++; // book.skip = true; // continue; // } if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { // 跳过没有搜索结果或没有pdf或text文件的书籍 skipCount++; continue; } console.log(`开始下载: ${book.id} ${book.title}`); // 打开搜索页面并搜索 let detailPageUrl = await getBookDetailPageUrl(book, true); if (!detailPageUrl) { // 先用包含数字的关键字,如果没有结果再用不包含数字的关键字 detailPageUrl = await getBookDetailPageUrl(book, false); if (!detailPageUrl) { console.log(`获取详情页链接失败: ${book.id} ${book.title}`); book.state = "没有搜索结果"; continue; } } // let detailPageUrl = await getBookDetailPageUrl(book, true); // if (!detailPageUrl) { // // 先用包含数字的关键字,如果没有结果再用不包含数字的关键字 // detailPageUrl = await getBookDetailPageUrl(book, false); // if (!detailPageUrl) { // console.log(`获取详情页链接失败: ${book.id} ${book.title}`); // book.state = "没有搜索结果"; // continue; // } // } // 等一段时间再打开详情页 sleep(getRandomNumber(500, 1000)); // sleep(getRandomNumber(500, 1000)); // 打开详情页,并获取isbn const detailPageUrl = `https://archive.org/details/${book.id}`; await openBookDetailPage(book, detailPageUrl); if (book.isbn) { parentPort.postMessage({ type: "book", data: book }); } // 等一段时间再下一个 sleep(getRandomNumber(500, 1000)); } @@ -199,19 +182,11 @@ function saveBooks(books) { console.log("保存下载状态数据"); const workSheets = xlsx.parse(EXCEL_FILE); const sheet = workSheets[0]; const sheet = { name: "Sheet1", data: [["ID", "Title", "Author", "Year", "Publisher", "ISBN"]] }; const data = sheet.data; for (const book of books) { const index = data.findIndex((row) => row[0] === book.id); if (index > -1) { data[index][5] = book.isbn; if (!data[index][3]) data[index][3] = book.pubDate; if (!data[index][4]) data[index][4] = book.publisher; data[index][7] = book.state; } const row = [book.id, book.title, book.author, book.pubDate, book.publisher, book.isbn]; data.push(row); } const buffer = xlsx.build([{ name: "Sheet1", data }]); @@ -271,11 +246,54 @@ }) .finally(async () => { // saveBooks(books); parentPort.postMessage({ type: "books", data: books }); // parentPort.postMessage({ type: "books", data: books }); logFile.close(); }); } let year = 2024; let codeIndex = 0; const codeList = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]; async function getBookList(pageSize, page, code) { const url = `https://archive.org/services/search/beta/page_production/?user_query=&page_type=collection_details&page_target=books&hits_per_page=${pageSize}&page=${page}&filter_map=%7B%22year%22%3A%7B%222023%22%3A%22gte%22%2C%222024%22%3A%22lte%22%7D%2C%22firstTitle%22%3A%7B%22${code}%22%3A%22inc%22%7D%7D&sort=titleSorter%3Aasc&aggregations=false&uid=R%3A1e845903aec74dee14bd-S%3A8cde5bf234b86bf96a75-P%3A1-K%3Ah-T%3A1718106108852`; return await myAxios.get(url); } async function getBooks() { let page = 1; const pageSize = 100; let total = 0; const code = codeList[codeIndex]; console.log(`${year}年 ${codeIndex}`); const bookList = []; do { console.log(`正在获取 ${year} 年 ${code} 分类 ${page} 页`); const resp = await retry(() => getBookList(pageSize, page, code)).catch((e) => { console.log(`获取失败:${year} 年 ${code} 分类 ${page} 页`); });; if (!resp) { continue; } const { total: _total, hits } = resp.data.response.body.hits total = _total; for (const hit of hits) { const { identifier, title, creator } = hit.fields const author = creator?.join(", "); bookList.push({ id: identifier, title, author }); } page++; await sleep(getRandomNumber(300, 800)); } while (pageSize * page < total); codeIndex++; if (codeIndex == codeList.length) { year--; codeIndex = 0; } return bookList; } let getBookPromise = null; function main() { if (!fs.existsSync('tmpdir')) { @@ -289,12 +307,15 @@ console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`); let finishThreadCnt = 0; const finishBooks = []; const books = getBooksFromExcel(startRow, endRow); const books = []; for (let i = 0; i < threadSize; i++) { const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} }); worker.on("message", (message) => { if (message.type === 'books') { worker.on("message", async (message) => { if (message.type === 'book') { finishBooks.push(message.data); } else if (message.type === 'books') { finishBooks.push(...message.data); finishThreadCnt++; if (finishThreadCnt >= threadSize) { @@ -305,6 +326,18 @@ } } else if (message.type === 'get-book') { downloadCnt++; if (getBookPromise) { await getBookPromise.finally(); } if (books.length == 0) { do { if (year > 1950) { getBookPromise = getBooks(); books.push(...await getBookPromise.finally()); getBookPromise = null; } } while (!books.length); } worker.postMessage({ type: "book", data: books.shift() }); } }); yarn.lock
@@ -37,6 +37,36 @@ form-data "^4.0.0" proxy-from-env "^1.1.0" boolbase@^1.0.0: version "1.0.0" resolved "https://mirrors.cloud.tencent.com/npm/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww== cheerio-select@^2.1.0: version "2.1.0" resolved "https://mirrors.cloud.tencent.com/npm/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4" integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g== dependencies: boolbase "^1.0.0" css-select "^5.1.0" css-what "^6.1.0" domelementtype "^2.3.0" domhandler "^5.0.3" domutils "^3.0.1" cheerio@^1.0.0-rc.12: version "1.0.0-rc.12" resolved "https://mirrors.cloud.tencent.com/npm/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683" integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q== dependencies: cheerio-select "^2.1.0" dom-serializer "^2.0.0" domhandler "^5.0.3" domutils "^3.0.1" htmlparser2 "^8.0.1" parse5 "^7.0.0" parse5-htmlparser2-tree-adapter "^7.0.0" combined-stream@^1.0.8: version "1.0.8" resolved "https://mirrors.cloud.tencent.com/npm/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f" @@ -49,6 +79,22 @@ resolved "https://mirrors.cloud.tencent.com/npm/core-util-is/-/core-util-is-1.0.3.tgz#a6042d3634c2b27e9328f837b965fac83808db85" integrity sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ== css-select@^5.1.0: version "5.1.0" resolved "https://mirrors.cloud.tencent.com/npm/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6" integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg== dependencies: boolbase "^1.0.0" css-what "^6.1.0" domhandler "^5.0.2" domutils "^3.0.1" nth-check "^2.0.1" css-what@^6.1.0: version "6.1.0" resolved "https://mirrors.cloud.tencent.com/npm/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4" integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw== debug@4, debug@^4.3.4: version "4.3.5" resolved "https://mirrors.cloud.tencent.com/npm/debug/-/debug-4.3.5.tgz#e83444eceb9fedd4a1da56d671ae2446a01a6e1e" @@ -60,6 +106,41 @@ version "1.0.0" resolved "https://mirrors.cloud.tencent.com/npm/delayed-stream/-/delayed-stream-1.0.0.tgz#df3ae199acadfb7d440aaae0b29e2272b24ec619" integrity sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ== dom-serializer@^2.0.0: version "2.0.0" resolved "https://mirrors.cloud.tencent.com/npm/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53" integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg== dependencies: domelementtype "^2.3.0" domhandler "^5.0.2" entities "^4.2.0" domelementtype@^2.3.0: version "2.3.0" resolved "https://mirrors.cloud.tencent.com/npm/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d" integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw== domhandler@^5.0.2, domhandler@^5.0.3: version "5.0.3" resolved "https://mirrors.cloud.tencent.com/npm/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31" integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w== dependencies: domelementtype "^2.3.0" domutils@^3.0.1: version "3.1.0" resolved "https://mirrors.cloud.tencent.com/npm/domutils/-/domutils-3.1.0.tgz#c47f551278d3dc4b0b1ab8cbb42d751a6f0d824e" integrity sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA== dependencies: dom-serializer "^2.0.0" domelementtype "^2.3.0" domhandler "^5.0.3" entities@^4.2.0, entities@^4.4.0: version "4.5.0" resolved "https://mirrors.cloud.tencent.com/npm/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48" integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw== follow-redirects@^1.15.6: version "1.15.6" @@ -74,6 +155,16 @@ asynckit "^0.4.0" combined-stream "^1.0.8" mime-types "^2.1.12" htmlparser2@^8.0.1: version "8.0.2" resolved "https://mirrors.cloud.tencent.com/npm/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21" integrity sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA== dependencies: domelementtype "^2.3.0" domhandler "^5.0.3" domutils "^3.0.1" entities "^4.4.0" https-proxy-agent@^7.0.4: version "7.0.4" @@ -139,11 +230,33 @@ dependencies: xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz" nth-check@^2.0.1: version "2.1.1" resolved "https://mirrors.cloud.tencent.com/npm/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d" integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w== dependencies: boolbase "^1.0.0" pako@^1.0.10, pako@^1.0.11, pako@^1.0.6, pako@~1.0.2: version "1.0.11" resolved "https://mirrors.cloud.tencent.com/npm/pako/-/pako-1.0.11.tgz#6c9599d340d54dfd3946380252a35705a6b992bf" integrity sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw== parse5-htmlparser2-tree-adapter@^7.0.0: version "7.0.0" resolved "https://mirrors.cloud.tencent.com/npm/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz#23c2cc233bcf09bb7beba8b8a69d46b08c62c2f1" integrity sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g== dependencies: domhandler "^5.0.2" parse5 "^7.0.0" parse5@^7.0.0: version "7.1.2" resolved "https://mirrors.cloud.tencent.com/npm/parse5/-/parse5-7.1.2.tgz#0736bebbfd77793823240a23b7fc5e010b7f8e32" integrity sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw== dependencies: entities "^4.4.0" pdf-lib@^1.17.1: version "1.17.1" resolved "https://mirrors.cloud.tencent.com/npm/pdf-lib/-/pdf-lib-1.17.1.tgz#9e7dd21261a0c1fb17992580885b39e7d08f451f"