| | |
| | | import * as fs from "fs"; |
| | | import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | import * as cheerio from 'cheerio'; |
| | | |
| | | const EXCEL_FILE = "fiction-noisbn.xlsx"; |
| | | const EXCEL_FILE = "book-list.xlsx"; |
| | | |
| | | /*-------------读取配置---------------*/ |
| | | let config = JSON.parse(fs.readFileSync('./config.json')); |
| | |
| | | proxy: false, |
| | | httpsAgent, |
| | | }); |
| | | |
/**
 * Loads a range of rows from the first worksheet of EXCEL_FILE and turns
 * each row into a book record.
 *
 * Columns are mapped by position: 0 id, 1 title, 2 author, 3 year,
 * 4 publisher, 5 isbn, 6 extension, 7 state.
 *
 * @param {number} startRow index of the first row to read (inclusive)
 * @param {number} endRow index of the row to stop at (exclusive)
 * @returns {Array<Object>} one record per row in the requested range
 */
function getBooksFromExcel(startRow, endRow) {
  // Property name for each spreadsheet column, in column order.
  const columnKeys = ["id", "title", "author", "year", "publisher", "isbn", "extension", "state"];

  const firstSheet = xlsx.parse(EXCEL_FILE)[0];
  const selectedRows = firstSheet.data.slice(startRow, endRow);

  return selectedRows.map((row) =>
    Object.fromEntries(columnKeys.map((key, column) => [key, row[column]]))
  );
}
| | | |
| | | /** |
| | | * 格式化关键字 |
| | |
| | | break; |
| | | } |
| | | bookCount++; |
| | | if (isAlreadyDownloaded(book)) { |
| | | skipCount++; |
| | | book.skip = true; |
| | | continue; |
| | | } |
| | | // if (isAlreadyDownloaded(book)) { |
| | | // skipCount++; |
| | | // book.skip = true; |
| | | // continue; |
| | | // } |
| | | if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { |
| | | // 跳过没有搜索结果或没有pdf或text文件的书籍 |
| | | skipCount++; |
| | | continue; |
| | | } |
| | | console.log(`开始下载: ${book.id} ${book.title}`); |
| | | |
| | | |
| | | |
| | | // 打开搜索页面并搜索 |
| | | let detailPageUrl = await getBookDetailPageUrl(book, true); |
| | | if (!detailPageUrl) { |
| | | // 先用包含数字的关键字,如果没有结果再用不包含数字的关键字 |
| | | detailPageUrl = await getBookDetailPageUrl(book, false); |
| | | if (!detailPageUrl) { |
| | | console.log(`获取详情页链接失败: ${book.id} ${book.title}`); |
| | | book.state = "没有搜索结果"; |
| | | continue; |
| | | } |
| | | } |
| | | // let detailPageUrl = await getBookDetailPageUrl(book, true); |
| | | // if (!detailPageUrl) { |
| | | // // 先用包含数字的关键字,如果没有结果再用不包含数字的关键字 |
| | | // detailPageUrl = await getBookDetailPageUrl(book, false); |
| | | // if (!detailPageUrl) { |
| | | // console.log(`获取详情页链接失败: ${book.id} ${book.title}`); |
| | | // book.state = "没有搜索结果"; |
| | | // continue; |
| | | // } |
| | | // } |
| | | // 等一段时间再打开详情页 |
| | | sleep(getRandomNumber(500, 1000)); |
| | | // sleep(getRandomNumber(500, 1000)); |
| | | // 打开详情页,并获取isbn |
| | | const detailPageUrl = `https://archive.org/details/${book.id}`; |
| | | await openBookDetailPage(book, detailPageUrl); |
| | | if (book.isbn) { |
| | | parentPort.postMessage({ type: "book", data: book }); |
| | | } |
| | | // 等一段时间再下一个 |
| | | sleep(getRandomNumber(500, 1000)); |
| | | } |
| | |
| | | |
| | | function saveBooks(books) { |
| | | console.log("保存下载状态数据"); |
| | | const workSheets = xlsx.parse(EXCEL_FILE); |
| | | const sheet = workSheets[0]; |
| | | const sheet = { name: "Sheet1", data: [["ID", "Title", "Author", "Year", "Publisher", "ISBN"]] }; |
| | | const data = sheet.data; |
| | | for (const book of books) { |
| | | const index = data.findIndex((row) => row[0] === book.id); |
| | | if (index > -1) { |
| | | data[index][5] = book.isbn; |
| | | if (!data[index][3]) |
| | | data[index][3] = book.pubDate; |
| | | if (!data[index][4]) |
| | | data[index][4] = book.publisher; |
| | | data[index][7] = book.state; |
| | | } |
| | | const row = [book.id, book.title, book.author, book.pubDate, book.publisher, book.isbn]; |
| | | data.push(row); |
| | | } |
| | | |
| | | const buffer = xlsx.build([{ name: "Sheet1", data }]); |
| | |
| | | }) |
| | | .finally(async () => { |
| | | // saveBooks(books); |
| | | parentPort.postMessage({ type: "books", data: books }); |
| | | // parentPort.postMessage({ type: "books", data: books }); |
| | | logFile.close(); |
| | | }); |
| | | } |
| | | |
// Cursor state read and advanced by getBooks(): the year label currently
// being walked and the index into codeList of the next letter to request.
let year = 2024;
let codeIndex = 0;
// The uppercase letters A through Z, one entry per letter.
const codeList = [..."ABCDEFGHIJKLMNOPQRSTUVWXYZ"];
| | | |
/**
 * Requests one page of the archive.org "books" collection search, filtered
 * to titles whose first letter is `code`, sorted by title ascending.
 *
 * The query string is assembled with URLSearchParams so that every value,
 * including `code`, is percent-encoded. For plain alphanumeric codes the
 * resulting URL is identical to the previously hand-encoded one.
 *
 * @param {number} pageSize value sent as `hits_per_page`
 * @param {number} page value sent as `page`
 * @param {string} code first-title letter placed in the `firstTitle` filter
 * @returns {Promise<*>} whatever `myAxios.get` resolves to for the request
 */
async function getBookList(pageSize, page, code) {
  // NOTE(review): the year filter is fixed at 2023..2024 here; the module-level
  // `year` that getBooks() steps through is not applied to the request — confirm intent.
  const filterMap = {
    year: { 2023: "gte", 2024: "lte" },
    firstTitle: { [code]: "inc" },
  };

  const query = new URLSearchParams({
    user_query: "",
    page_type: "collection_details",
    page_target: "books",
    hits_per_page: String(pageSize),
    page: String(page),
    filter_map: JSON.stringify(filterMap),
    sort: "titleSorter:asc",
    aggregations: "false",
    uid: "R:1e845903aec74dee14bd-S:8cde5bf234b86bf96a75-P:1-K:h-T:1718106108852",
  });

  return myAxios.get(`https://archive.org/services/search/beta/page_production/?${query.toString()}`);
}
| | | |
/**
 * Collects every search hit for the current letter (`codeList[codeIndex]`)
 * by paging through getBookList(), then advances the module-level cursor:
 * `codeIndex` moves to the next letter and, once the list is exhausted,
 * wraps to 0 while `year` is decremented.
 *
 * Paging continues until the number of rows requested so far reaches the
 * `total` reported by the service, so the final (possibly partial) page is
 * included. A page whose fetch fails is retried after a short pause; after
 * three consecutive failures the letter is abandoned and whatever was
 * gathered so far is returned.
 *
 * @returns {Promise<Array<{id: *, title: *, author: *}>>} books gathered for the letter
 */
async function getBooks() {
  const pageSize = 100;
  // Consecutive failures of a single page tolerated before giving up on the letter.
  const maxPageFailures = 3;
  const code = codeList[codeIndex];
  console.log(`${year}年 ${codeIndex}`);

  const bookList = [];
  let page = 1;
  let total = 0;
  let failures = 0;
  let hasMore = true;

  while (hasMore) {
    console.log(`正在获取 ${year} 年 ${code} 分类 ${page} 页`);

    let resp;
    try {
      resp = await retry(() => getBookList(pageSize, page, code));
    } catch (err) {
      console.log(`获取失败:${year} 年 ${code} 分类 ${page} 页`, err?.message ?? err);
    }

    if (!resp) {
      failures++;
      if (failures >= maxPageFailures) {
        break;
      }
      // Pause before asking for the same page again instead of looping immediately.
      await sleep(getRandomNumber(300, 800));
      continue;
    }
    failures = 0;

    const { total: reportedTotal, hits } = resp.data.response.body.hits;
    total = reportedTotal;
    for (const hit of hits) {
      const { identifier, title, creator } = hit.fields;
      // A non-array creator is passed through unchanged; a missing one stays undefined.
      const author = Array.isArray(creator) ? creator.join(", ") : (creator ?? undefined);
      bookList.push({ id: identifier, title, author });
    }

    // `page` pages have been consumed at this point; compare before incrementing
    // so the last page is not dropped.
    hasMore = pageSize * page < total;
    page++;
    await sleep(getRandomNumber(300, 800));
  }

  codeIndex++;
  if (codeIndex === codeList.length) {
    year--;
    codeIndex = 0;
  }
  return bookList;
}
| | | |
| | | let getBookPromise = null; |
| | | function main() { |
| | | |
| | | if (!fs.existsSync('tmpdir')) { |
| | |
| | | console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`); |
| | | let finishThreadCnt = 0; |
| | | const finishBooks = []; |
| | | const books = getBooksFromExcel(startRow, endRow); |
| | | const books = []; |
| | | |
| | | for (let i = 0; i < threadSize; i++) { |
| | | const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} }); |
| | | worker.on("message", (message) => { |
| | | if (message.type === 'books') { |
| | | worker.on("message", async (message) => { |
| | | if (message.type === 'book') { |
| | | finishBooks.push(message.data); |
| | | } |
| | | else if (message.type === 'books') { |
| | | finishBooks.push(...message.data); |
| | | finishThreadCnt++; |
| | | if (finishThreadCnt >= threadSize) { |
| | |
| | | } |
| | | } else if (message.type === 'get-book') { |
| | | downloadCnt++; |
| | | if (getBookPromise) { |
| | | await getBookPromise.finally(); |
| | | } |
| | | if (books.length == 0) { |
| | | do { |
| | | if (year > 1950) { |
| | | getBookPromise = getBooks(); |
| | | books.push(...await getBookPromise.finally()); |
| | | getBookPromise = null; |
| | | } |
| | | } while (!books.length); |
| | | } |
| | | worker.postMessage({ type: "book", data: books.shift() }); |
| | | } |
| | | }); |