New file |
| | |
| | | import xlsx from "node-xlsx"; |
| | | import axios from "axios"; |
| | | import * as fs from "fs"; |
| | | import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | |
/** Workbook that is read for the input rows and later overwritten with the results. */
const EXCEL_FILE = "fiction-noisbn.xlsx";

/* ------------- load configuration ------------- */
// Parsed once at module load; the path is resolved against the current working directory.
const config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));
| | | |
/* ------------ logging ------------ */
// Append-mode write stream for this thread's log file; assigned by initLogger().
let logFile;

/**
 * Replaces console.log with a wrapper that prefixes every message with a
 * locale timestamp, prints it through the original console.log and appends
 * the same line to ./book-isbn-logs/logs-thread<threadId>.log.
 * The log directory is created when it does not exist yet.
 * Only console.log is wrapped; console.error is not touched here.
 */
function initLogger() {
  const printToConsole = console.log;

  const logDirMissing = !fs.existsSync('./book-isbn-logs');
  if (logDirMissing) {
    fs.mkdirSync('./book-isbn-logs', { recursive: true });
  }

  logFile = fs.createWriteStream(`./book-isbn-logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });

  console.log = (...parts) => {
    // Arguments are joined with single spaces (objects are stringified by join).
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    printToConsole(line);
    logFile.write(line + '\n');
  };
}
| | | |
/* ---------- axios via local proxy ---------- */
// HTTPS requests are tunnelled through the proxy listening on 127.0.0.1:10809.
const httpsAgent = new HttpsProxyAgent('http://127.0.0.1:10809');

// `proxy: false` is passed so that only the agent above decides the route
// (presumably to keep axios from applying its own proxy settings — confirm against axios docs).
const myAxios = axios.create({ httpsAgent, proxy: false });
| | | |
/**
 * Reads a slice of rows from the first worksheet of EXCEL_FILE and turns each
 * row into a book record.
 * @param {number} startRow index of the first row to include
 * @param {number} endRow index of the row to stop at (not included)
 * @returns {object[]} one record per row; columns 0-7 map to
 *   id, title, author, year, publisher, isbn, extension and state
 */
function getBooksFromExcel(startRow, endRow) {
  const [firstSheet] = xlsx.parse(EXCEL_FILE);
  const rows = firstSheet.data.slice(startRow, endRow);
  return rows.map(([id, title, author, year, publisher, isbn, extension, state]) => ({
    id,
    title,
    author,
    year,
    publisher,
    isbn,
    extension,
    state,
  }));
}
| | | |
/**
 * Builds the search keyword from a book title.
 *
 * The title is optionally stripped of digits, split on whitespace, limited to
 * the first six words and joined with "+".
 * @param {*} text title to search for; non-string values (e.g. a numeric
 *   spreadsheet cell) are converted to a string, null/undefined become ""
 * @param {boolean} titleWithNumbers true keeps digits in the title, false removes them
 * @returns {string} the formatted keyword, "" when nothing searchable is left
 */
function formatKw(text, titleWithNumbers) {
  // Spreadsheet cells can hold numbers or be empty, so never assume a string here.
  let keyword = String(text ?? '');
  if (!titleWithNumbers) {
    keyword = keyword.replace(/\d/g, '');
  }
  return keyword
    .split(/\s+/)
    // Drop the empty tokens left behind by removed digits or repeated spaces so
    // they neither produce "++" nor count towards the six-word limit.
    .filter(Boolean)
    .slice(0, 6)
    .join('+');
}
| | | |
| | | |
/**
 * Pauses the calling async flow.
 * @param {number} ms delay passed to setTimeout, in milliseconds
 * @returns {Promise<void>} settles (with no value) once the timer fires
 */
async function sleep(ms) {
  await new Promise((done) => setTimeout(done, ms));
}
| | | |
/**
 * Calls `func` until it succeeds or the retry budget is used up.
 *
 * `maxTry` is the number of RE-tries, so `func` runs at most maxTry + 1 times.
 * Between two attempts the function waits `delay` milliseconds.
 * @param {Function} func operation to run; may be sync or async
 * @param {number} [maxTry=3] how many additional attempts are allowed after the first failure
 * @param {number} [delay=3000] pause between attempts in milliseconds
 * @returns {Promise<*>} whatever `func` resolves to
 * @throws the error of the last attempt once no retries are left
 */
async function retry(func, maxTry = 3, delay = 3000) {
  let retriesLeft = maxTry;
  for (;;) {
    try {
      return await func();
    } catch (err) {
      if (!(retriesLeft > 0)) {
        throw err;
      }
    }
    await sleep(delay);
    retriesLeft = retriesLeft - 1;
  }
}
| | | |
/**
 * Searches archive.org for the book title and returns the detail page URL of
 * the first hit.
 *
 * The request is wrapped in retry(); once every attempt has failed the error is
 * logged and "" is returned, so callers only have to check for an empty string.
 * @param {object} book book record; `title` is searched, `id` and `title` are used for logging
 * @param {boolean} titleWithNumbers forwarded to formatKw (true keeps digits in the title)
 * @returns {Promise<string>} `https://archive.org/details/<identifier>` or "" when
 *   there is no keyword, no hit, or the request failed
 */
async function getBookDetailPageUrl(book, titleWithNumbers) {
  const kw = formatKw(book.title, titleWithNumbers);
  if (!kw) {
    // An empty keyword (e.g. an all-digit title with digits stripped) would run an
    // empty query whose first hit has nothing to do with this book.
    return '';
  }
  const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`;
  const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}`;
  console.log(`打开搜索: ${searchUrl}`);
  try {
    return await retry(async () => {
      const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
      const { total, hits } = resp.data.response.body.hits;
      // Guard against an empty hit list or a hit without identifier instead of
      // throwing (which would burn all retries) or building ".../details/undefined".
      const identifier = hits?.[0]?.fields?.identifier;
      if (total === 0 || !identifier) {
        return '';
      }
      return `https://archive.org/details/${identifier}`;
    });
  } catch (e) {
    // Still best-effort, but leave a trace of why the search produced nothing.
    console.log(`搜索请求失败: ${book.id} ${book.title} ${e?.message ?? e}`);
    return '';
  }
}
| | | |
/**
 * Downloads an archive.org detail page and copies ISBN, publisher and
 * publication date from the embedded metadata onto `book`.
 *
 * The metadata is read from the hidden `js-ia-metadata` input of the page. When
 * several ISBNs are present the longest one is used (the first one on ties).
 * The whole step is wrapped in retry(); if it keeps failing, `book.state` is set
 * to "打开详情页失败" and the failure is logged.
 * @param {object} book book record, mutated in place (isbn, publisher, pubDate, state)
 * @param {string} detailPageUrl URL of the detail page to open
 * @returns {Promise<undefined|string>} undefined on success, "" on failure
 */
async function openBookDetailPage(book, detailPageUrl) {
  console.log(`打开详情: ${detailPageUrl}`);
  try {
    return await retry(async () => {
      const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
      const html = resp.data;
      const match = /<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/.exec(html);
      if (!match) {
        // Fail with a readable reason instead of a TypeError on `null[1]`.
        throw new Error('js-ia-metadata input not found in detail page');
      }
      const { metadata } = JSON.parse(match[1]);
      // NOTE(review): the isbn field is assumed to be either a single value or an
      // array of values — both shapes are normalised here; confirm against the API.
      const isbns = [metadata.isbn]
        .flat()
        .filter((it) => it != null && it !== '')
        .map(String);
      if (isbns.length) {
        // Longest entry wins; strict ">" keeps the first one among equally long entries.
        book.isbn = isbns.reduce((longest, it) => (it.length > longest.length ? it : longest));
      }
      book.publisher = metadata.publisher;
      book.pubDate = metadata.date;
    });
  } catch (e) {
    book.state = "打开详情页失败";
    console.log(`打开详情页失败: ${book.id} ${book.title} ${e?.message ?? e}`);
    return '';
  }
}
| | | |
/**
 * Tells whether a book already has an ISBN and therefore needs no lookup.
 * @param {object} book book record
 * @returns {*} the record's `isbn` value itself (truthy when present), not a strict boolean
 */
function isAlreadyDownloaded(book) {
  const { isbn } = book;
  return isbn;
}
| | | |
/**
 * Asks the parent thread for the next book to process.
 *
 * Posts a `get-book` message (carrying this thread's id) on parentPort and waits
 * for the first incoming message whose type is `book`; other message types are
 * ignored and the listener stays registered until a `book` message arrives.
 * @returns {Promise<*>} the `data` payload of the `book` message
 */
function nextBook() {
  return new Promise((resolve) => {
    function onMessage(message) {
      if (message.type !== 'book') {
        return;
      }
      parentPort.removeListener('message', onMessage);
      resolve(message.data);
    }

    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'get-book', threadId });
  });
}
| | | |
| | | |
/**
 * Worker loop: pulls books from the parent thread one by one and looks up
 * their ISBN on archive.org.
 *
 * Every received book is appended to `books` (also when it is skipped), so the
 * caller can hand the complete list back to the parent. The loop ends when the
 * parent has no more books or when the `config.endOfTime` limit (in minutes
 * since module start) is exceeded. The module-level counters bookCount,
 * skipCount and successCount are updated along the way.
 * @param {object[]} books output array, filled in place with every book received
 * @returns {Promise<void>}
 */
async function downloadBooks(books) {
  // States written by an earlier run for which another lookup is pointless.
  const finalStates = new Set(["没有搜索结果", "没有pdf或text文件", "下载完成"]);

  for (;;) {
    const book = await nextBook();
    if (!book) {
      break;
    }
    books.push(book);
    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
      // Time limit reached: stop without processing this book.
      break;
    }
    bookCount++;
    if (isAlreadyDownloaded(book)) {
      skipCount++;
      book.skip = true;
      continue;
    }
    if (finalStates.has(book.state)) {
      skipCount++;
      // Mark it like the branch above: the parent thread counts skips via this flag.
      book.skip = true;
      continue;
    }
    console.log(`开始下载: ${book.id} ${book.title}`);
    // Search with digits kept in the keyword first, then fall back to the
    // keyword without digits when that gives no result.
    let detailPageUrl = await getBookDetailPageUrl(book, true);
    if (!detailPageUrl) {
      detailPageUrl = await getBookDetailPageUrl(book, false);
      if (!detailPageUrl) {
        console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
        book.state = "没有搜索结果";
        continue;
      }
    }
    // Pause before opening the detail page; without `await` the delay never takes effect.
    await sleep(getRandomNumber(500, 1000));
    // Open the detail page and read the ISBN.
    await openBookDetailPage(book, detailPageUrl);
    if (book.isbn) {
      // The book had no ISBN before (otherwise it was skipped above), so this is a new find.
      successCount++;
    }
    // Pause before moving on to the next book.
    await sleep(getRandomNumber(500, 1000));
  }
}
| | | |
/**
 * Writes the lookup results back into EXCEL_FILE.
 *
 * The workbook is re-read, each book is matched to the first row whose column 0
 * equals `book.id`, and that row is updated: column 5 (isbn) and column 7
 * (state) are always overwritten, column 3 (publication date) and column 4
 * (publisher) only when they are empty. The file is then rebuilt with a single
 * sheet named "Sheet1". If building or writing the workbook fails, the row data
 * is dumped to `<timestamp>.json` instead so the results are not lost.
 * @param {object[]} books processed book records
 */
function saveBooks(books) {
  console.log("保存下载状态数据");
  const workSheets = xlsx.parse(EXCEL_FILE);
  const sheet = workSheets[0];
  const data = sheet.data;

  // Index rows by id once instead of scanning the whole sheet for every book.
  // Only the first row per id is kept, which matches a findIndex lookup.
  const rowById = new Map();
  for (const row of data) {
    if (row && !rowById.has(row[0])) {
      rowById.set(row[0], row);
    }
  }

  for (const book of books) {
    const row = rowById.get(book.id);
    if (!row) {
      continue;
    }
    row[5] = book.isbn;
    if (!row[3]) {
      row[3] = book.pubDate;
    }
    if (!row[4]) {
      row[4] = book.publisher;
    }
    row[7] = book.state;
  }

  try {
    // Building is inside the try as well, so a failure here also reaches the JSON fallback.
    const buffer = xlsx.build([{ name: "Sheet1", data }]);
    // writeFileSync is synchronous and takes no callback; errors are thrown.
    fs.writeFileSync(EXCEL_FILE, buffer);
    console.log("保存完成: ", EXCEL_FILE);
  } catch (e) {
    console.error(e);
    const outfile = `${Date.now()}.json`;
    fs.writeFileSync(outfile, JSON.stringify(data));
    console.log("保存完成: " + outfile);
  }
}
| | | |
| | | |
/**
 * Formats a duration given in milliseconds as "[D天]H时M分S秒".
 * The day part is only included when at least one full day has elapsed.
 * @param {number} ms duration in milliseconds
 * @returns {string} the formatted duration
 */
function msFormat(ms) {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const days = Math.floor(totalHours / 24);

  const pieces = [];
  if (days > 0) {
    pieces.push(`${days}天`);
  }
  pieces.push(`${totalHours % 24}时`, `${totalMinutes % 60}分`, `${totalSeconds % 60}秒`);
  return pieces.join("");
}
| | | |
/**
 * Returns a random floating-point number based on Math.random().
 * @param {number} min lower bound (inclusive)
 * @param {number} max upper bound (exclusive when max > min)
 * @returns {number} a value in [min, max); not rounded to an integer
 */
function getRandomNumber(min, max) {
  const span = max - min;
  const offset = Math.random() * span;
  return offset + min;
}
| | | |
// Timestamp (ms since epoch) taken when this module was loaded; used for the
// elapsed-time output and the optional time limit.
const startTime = Date.now();

// Number of books for which the lookup succeeded.
let successCount = 0;

// Number of books taken into processing.
let bookCount = 0;

// Number of skipped books (already done, or known to have no result).
let skipCount = 0;
| | | |
/**
 * Worker-thread entry point.
 *
 * Sets up logging, runs downloadBooks() and logs a per-thread summary when it
 * completes. Errors are printed with console.error. In every case the collected
 * books are posted to the parent as a `books` message and the log stream is
 * closed afterwards. The work is started but not awaited by the caller.
 */
function startDownload() {
  initLogger();
  const books = [];

  const run = async () => {
    try {
      await downloadBooks(books);
      console.log(`线程:${threadId}全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    } catch (e) {
      console.error(e);
    } finally {
      // Saving is left to the parent thread; this thread only reports its books.
      parentPort.postMessage({ type: "books", data: books });
      logFile.close();
    }
  };

  run();
}
| | | |
/**
 * Program entry point for both the main thread and the worker threads.
 *
 * Main thread: reads the configured row range from the workbook, starts
 * `config.threadSize` workers and serves their `get-book` requests from a
 * shared queue. When every worker has reported its `books` (or has crashed),
 * a summary is logged and the results are saved. On SIGINT the results of the
 * workers that have already reported are saved before the process exits.
 *
 * Worker thread: runs startDownload().
 */
function main() {
  // NOTE(review): nothing in this file uses 'tmpdir'; the creation is kept as-is.
  if (!fs.existsSync('tmpdir')) {
    fs.mkdirSync('tmpdir', { recursive: true });
  }

  if (!isMainThread) {
    startDownload();
    return;
  }

  initLogger();
  const { startRow, endRow, threadSize } = config;
  console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`);
  const books = getBooksFromExcel(startRow, endRow);
  const finishBooks = [];
  let downloadCnt = 0;
  let finishThreadCnt = 0;

  // Logs the summary and saves; shared by normal completion and SIGINT so both
  // use the same definition of "success" (an ISBN was found for a non-skipped book).
  const reportAndSave = (headline) => {
    skipCount = finishBooks.filter((it) => it.skip).length;
    // Skipped books that already carried an ISBN must not be counted twice.
    successCount = finishBooks.filter((it) => it.isbn && !it.skip).length;
    console.log(`${headline},共下载${downloadCnt}本,成功下载${successCount}本,跳过${skipCount},失败${downloadCnt - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    saveBooks(finishBooks);
  };

  const onWorkerFinished = (workerBooks) => {
    for (const book of workerBooks) {
      finishBooks.push(book);
    }
    finishThreadCnt++;
    if (finishThreadCnt >= threadSize) {
      reportAndSave('全部线程完成');
    }
  };

  for (let i = 0; i < threadSize; i++) {
    const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} });
    // Set once this worker has been counted as finished (report or crash).
    let finished = false;

    worker.on("message", (message) => {
      if (message.type === 'books') {
        if (!finished) {
          finished = true;
          onWorkerFinished(message.data);
        }
      } else if (message.type === 'get-book') {
        const book = books.shift();
        if (book) {
          // Only count books that were actually handed out, not the final
          // "queue is empty" answer every worker receives.
          downloadCnt++;
        }
        worker.postMessage({ type: "book", data: book });
      }
    });

    // Without an 'error' listener an uncaught exception inside a worker would be
    // rethrown on the main thread and end the process before anything is saved.
    worker.on("error", (err) => {
      console.error(err);
      if (!finished) {
        finished = true;
        onWorkerFinished([]);
      }
    });
  }

  // On Ctrl+C, save whatever the finished workers have reported so far.
  process.on('SIGINT', () => {
    reportAndSave('进程被手动结束');
    process.exit(0);
  });
}

main();