/*
 * src/book-isbn-search.mjs
 *
 * Recovered from commit 42295152e10773a2bd394ac14f6feb2c4bc501a7
 * ("[PATCH] 增加ISBN获取脚本", lyg <1543117173@qq.com>, 2024-07-13 13:34:13 +0800).
 * The exported patch had lost its line breaks and had its UTF-8 text decoded
 * as GBK, so it could no longer be applied. Besides adding this file, the
 * same commit changed:
 *   - config.json:  "startRow" 5 -> 1, "endRow" 10 -> 1095110, "threadSize" 1 -> 10
 *   - package.json: added the npm scripts
 *         "trans": "node src/trans.mjs"
 *         "book-isbn": "node src/book-isbn-search.mjs"
 *
 * What the script does: it reads book rows from EXCEL_FILE, searches
 * archive.org for each title, opens the first hit's details page, takes the
 * ISBN / publisher / date from the metadata JSON embedded in that page and
 * writes them back into the workbook. The main thread owns the queue and the
 * workbook; worker threads (config.threadSize of them) do the HTTP requests.
 */
import xlsx from "node-xlsx";
import axios from "axios";
import * as fs from "fs";
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
import { HttpsProxyAgent } from "https-proxy-agent";

/** Workbook that supplies the book list and receives the results. */
const EXCEL_FILE = "fiction-noisbn.xlsx";
/** Directory that receives one log file per thread. */
const LOG_DIR = "./book-isbn-logs";
/** Proxy used when config.json has no "proxy" entry. */
const DEFAULT_PROXY = "http://127.0.0.1:10809";
/** Headers sent with every archive.org request. */
const REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
};
/** Hidden input on the details page whose value holds the item metadata as JSON. */
const METADATA_INPUT_PATTERN = /<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/;
/** States written by an earlier run that mean "do not process this row again". */
const FINAL_STATES = new Set(["没有搜索结果", "没有pdf或text文件", "下载完成"]);
/** State for a search request that failed; not final, so the row is retried next run. */
const STATE_SEARCH_FAILED = "搜索失败";

/* ------------- configuration ------------- */
const config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));

/* ------------- per-thread bookkeeping ------------- */
// Time this thread started; used for the time limit and the elapsed-time report.
const startTime = Date.now();
// Books for which an ISBN was obtained in this thread.
let successCount = 0;
// Books handed to this thread.
let bookCount = 0;
// Books skipped because they already have an ISBN or a final state.
let skipCount = 0;

/* ------------- logging ------------- */
let logFile;

/**
 * Replaces console.log with a version that prefixes a timestamp and also
 * appends every line to `${LOG_DIR}/logs-thread<threadId>.log`.
 */
function initLogger() {
    const _log = console.log;
    fs.mkdirSync(LOG_DIR, { recursive: true });
    logFile = fs.createWriteStream(`${LOG_DIR}/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
    console.log = function (...text) {
        const line = `${new Date().toLocaleString()} ${text.join(' ')}`;
        _log(line);
        logFile.write(line + '\n');
    };
}

/* ------------- axios through the proxy ------------- */
const httpsAgent = new HttpsProxyAgent(config.proxy ?? DEFAULT_PROXY);
const myAxios = axios.create({
    proxy: false,
    httpsAgent,
});

/**
 * Reads the books to look up from the first sheet of EXCEL_FILE.
 * @param {number} startRow first row to read (inclusive, row 0 is the header)
 * @param {number} endRow row to stop at (exclusive)
 * @returns {object[]} one object per row, built from columns 0..7
 */
function getBooksFromExcel(startRow, endRow) {
    const [sheet] = xlsx.parse(EXCEL_FILE);
    return sheet.data
        .slice(startRow, endRow)
        .map(([id, title, author, year, publisher, isbn, extension, state]) => (
            { id, title, author, year, publisher, isbn, extension, state }
        ));
}

/**
 * Turns a title into a search keyword: at most the first six words, joined by "+".
 * @param {string} text title to search for; non-string cell values are converted
 * @param {boolean} titleWithNumbers keep digits when true, strip them when false
 * @returns {string} the keyword; empty when nothing searchable is left
 */
function formatKw(text, titleWithNumbers) {
    // Excel cells may hold numbers or be empty; coerce so the string methods cannot throw.
    let kw = String(text ?? '');
    if (!titleWithNumbers) {
        kw = kw.replace(/\d/g, "");
    }
    // Split on runs of whitespace so repeated spaces do not produce empty words.
    return kw.split(/\s+/).filter(Boolean).slice(0, 6).join("+");
}

/**
 * Resolves after the given delay.
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

/**
 * Calls `func` until it succeeds: one initial attempt plus up to `maxTry` retries.
 * @param {() => Promise<*>} func operation to run
 * @param {number} maxTry number of retries after the first failure
 * @param {number} delay pause between attempts in milliseconds
 * @returns {Promise<*>} the first successful result
 * @throws the error of the last attempt when every attempt failed
 */
async function retry(func, maxTry = 3, delay = 3000) {
    for (let remaining = maxTry; ; remaining--) {
        try {
            return await func();
        } catch (e) {
            if (remaining <= 0) {
                throw e;
            }
            await sleep(delay);
        }
    }
}

/**
 * Searches archive.org for the book and returns the details page of the first hit.
 * @param {object} book book whose title is searched
 * @param {boolean} titleWithNumbers passed on to formatKw
 * @returns {Promise<string|null>} the details URL, '' when the search found
 *   nothing, or null when the request itself kept failing
 */
async function getBookDetailPageUrl(book, titleWithNumbers) {
    const kw = formatKw(book.title, titleWithNumbers);
    if (!kw) {
        return '';
    }
    const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`;
    const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}`;
    console.log(`打开搜索: ${searchUrl}`);
    try {
        return await retry(async () => {
            const resp = await myAxios.get(searchUrl, { headers: REQUEST_HEADERS });
            const { total, hits } = resp.data.response.body.hits;
            const identifier = hits?.[0]?.fields?.identifier;
            if (total === 0 || !identifier) {
                return '';
            }
            return `https://archive.org/details/${identifier}`;
        });
    } catch (e) {
        // Report the failure separately so the caller does not record it as "no results".
        console.error(e);
        return null;
    }
}

/**
 * Opens the details page and copies ISBN, publisher and date from its embedded
 * metadata onto `book`. On failure sets book.state to "打开详情页失败".
 * @param {object} book book to update in place
 * @param {string} detailPageUrl URL returned by getBookDetailPageUrl
 * @returns {Promise<boolean>} true when the book has an ISBN afterwards
 */
async function openBookDetailPage(book, detailPageUrl) {
    console.log(`打开详情: ${detailPageUrl}`);
    try {
        await retry(async () => {
            const resp = await myAxios.get(detailPageUrl, { headers: REQUEST_HEADERS });
            const match = METADATA_INPUT_PATTERN.exec(String(resp.data));
            if (!match) {
                throw new Error(`metadata input not found: ${detailPageUrl}`);
            }
            const { metadata } = JSON.parse(match[1]);
            // Accept both a single value and a list, so a lone string cannot break the lookup.
            const isbns = [metadata.isbn ?? []].flat().map(String);
            if (isbns.length) {
                // Longest entry wins; on a tie the first one is kept.
                book.isbn = isbns.reduce((longest, it) => (it.length > longest.length ? it : longest));
            }
            book.publisher = metadata.publisher;
            book.pubDate = metadata.date;
        });
        return Boolean(book.isbn);
    } catch (e) {
        console.error(e);
        book.state = "打开详情页失败";
        console.log(`打开详情页失败: ${book.id} ${book.title}`);
        return false;
    }
}

/**
 * @param {object} book
 * @returns {boolean} true when the row already carries an ISBN
 */
function isAlreadyDownloaded(book) {
    return Boolean(book.isbn);
}

/**
 * Asks the main thread for the next book.
 * @returns {Promise<object|undefined>} the book, or undefined when the queue is empty
 */
function nextBook() {
    return new Promise((resolve) => {
        const onMessage = (message) => {
            if (message.type !== 'book') {
                return;
            }
            parentPort.removeListener('message', onMessage);
            resolve(message.data);
        };
        parentPort.on('message', onMessage);
        parentPort.postMessage({ type: 'get-book', threadId });
    });
}

/**
 * Looks up one book and records the outcome on the book object and in the
 * thread counters.
 * @param {object} book book to process, updated in place
 */
async function processBook(book) {
    if (isAlreadyDownloaded(book) || FINAL_STATES.has(book.state)) {
        // Already has an ISBN, or an earlier run reached a final state for it.
        skipCount++;
        book.skip = true;
        return;
    }
    console.log(`开始下载: ${book.id} ${book.title}`);
    // Search with digits in the keyword first, then without them.
    const firstTry = await getBookDetailPageUrl(book, true);
    const detailPageUrl = firstTry || await getBookDetailPageUrl(book, false);
    if (!detailPageUrl) {
        console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
        const requestFailed = firstTry === null || detailPageUrl === null;
        book.state = requestFailed ? STATE_SEARCH_FAILED : "没有搜索结果";
        return;
    }
    // Pause before opening the details page.
    await sleep(getRandomNumber(500, 1000));
    if (await openBookDetailPage(book, detailPageUrl)) {
        successCount++;
    }
    // Pause before the next book.
    await sleep(getRandomNumber(500, 1000));
}

/**
 * Worker loop: takes books from the main thread until the queue is empty or
 * the configured time limit (config.endOfTime, minutes, 0 = none) is reached,
 * and reports every processed book back to the main thread.
 * @param {object[]} books receives every book this thread handled
 */
async function downloadBooks(books) {
    for (; ;) {
        // Check the limit first so no book is taken from the queue and then dropped.
        if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
            break;
        }
        const book = await nextBook();
        if (!book) {
            break;
        }
        books.push(book);
        bookCount++;
        await processBook(book);
        // Hand the result over right away so an interrupted run can still save it.
        parentPort.postMessage({ type: 'book-result', data: book });
    }
}

/**
 * Writes ISBN, state and (only where the cell is empty) year and publisher of
 * the given books into the first sheet of EXCEL_FILE. If the workbook cannot
 * be written, the sheet rows are dumped to `<timestamp>.json` instead.
 * @param {object[]} books processed books, matched to rows by column 0 (id)
 */
function saveBooks(books) {
    console.log("保存下载状态数据");
    const workSheets = xlsx.parse(EXCEL_FILE);
    const data = workSheets[0].data;
    // Index the rows once; a scan per book is quadratic on a sheet of this size.
    const rowById = new Map();
    for (const row of data) {
        if (!rowById.has(row[0])) {
            rowById.set(row[0], row);
        }
    }
    for (const book of books) {
        const row = rowById.get(book.id);
        if (!row) {
            continue;
        }
        row[5] = book.isbn;
        if (!row[3]) {
            row[3] = book.pubDate;
        }
        if (!row[4]) {
            row[4] = book.publisher;
        }
        row[7] = book.state;
    }

    // Rebuild with every sheet and its name so nothing else in the workbook is lost.
    const buffer = xlsx.build(workSheets.map((sheet) => ({ name: sheet.name, data: sheet.data })));
    try {
        fs.writeFileSync(EXCEL_FILE, buffer);
        console.log("保存完成: ", EXCEL_FILE);
    } catch (e) {
        console.error(e);
        const outfile = `${Date.now()}.json`;
        fs.writeFileSync(outfile, JSON.stringify(data));
        console.log("保存完成: " + outfile);
    }
}

/**
 * Formats a duration as days (only when non-zero), hours, minutes and seconds.
 * @param {number} ms duration in milliseconds
 * @returns {string} e.g. "1天2时3分4秒"
 */
function msFormat(ms) {
    const sec = Math.floor(ms / 1000);
    const min = Math.floor(sec / 60);
    const hour = Math.floor(min / 60);
    const day = Math.floor(hour / 24);
    return `${day > 0 ? `${day}天` : ""}${hour % 24}时${min % 60}分${sec % 60}秒`;
}

/**
 * @param {number} min lower bound (inclusive)
 * @param {number} max upper bound (exclusive)
 * @returns {number} random floating-point value in [min, max)
 */
function getRandomNumber(min, max) {
    return Math.random() * (max - min) + min;
}

/**
 * Worker entry point: runs the download loop, logs the thread summary and
 * tells the main thread that this worker is finished.
 */
function startDownload() {
    initLogger();
    const books = [];
    downloadBooks(books)
        .then(() => {
            console.log(`线程：${threadId}全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
        })
        .catch((e) => {
            console.error(e);
        })
        .finally(() => {
            parentPort.postMessage({ type: "done" });
            logFile.close();
        });
}

/**
 * Entry point for both roles. In the main thread it loads the books, starts
 * the workers, hands out books on request, collects the results and saves them
 * when all workers are done or the process receives SIGINT. In a worker
 * thread it starts the download loop.
 */
function main() {
    // NOTE(review): nothing in this file uses 'tmpdir'; kept because the original created it — confirm whether it is needed.
    fs.mkdirSync('tmpdir', { recursive: true });

    if (!isMainThread) {
        startDownload();
        return;
    }

    initLogger();
    const { startRow, endRow, threadSize } = config;
    console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
    const books = getBooksFromExcel(startRow, endRow);
    const finishBooks = [];
    let nextIndex = 0;
    let downloadCnt = 0;
    let finishThreadCnt = 0;

    // Builds the summary line; success excludes rows that were only skipped.
    const summarize = (headline) => {
        const skipped = finishBooks.filter((it) => it.skip).length;
        const succeeded = finishBooks.filter((it) => !it.skip && it.isbn).length;
        return `${headline}，共下载${downloadCnt}本，成功下载${succeeded}本，跳过${skipped}，失败${downloadCnt - skipped - succeeded}本，耗时： ${msFormat(Date.now() - startTime)}。`;
    };

    const onWorkerFinished = () => {
        finishThreadCnt++;
        if (finishThreadCnt >= threadSize) {
            console.log(summarize("全部线程完成"));
            saveBooks(finishBooks);
        }
    };

    for (let i = 0; i < threadSize; i++) {
        // Resolve the worker script from this module, not from the current directory.
        const worker = new Worker(new URL(import.meta.url), { workerData: {} });
        let finished = false;
        const markFinished = () => {
            if (!finished) {
                finished = true;
                onWorkerFinished();
            }
        };
        worker.on("message", (message) => {
            if (message.type === 'get-book') {
                const book = books[nextIndex];
                if (book) {
                    // Count only real hand-outs, not the final "queue empty" reply.
                    nextIndex++;
                    downloadCnt++;
                }
                worker.postMessage({ type: "book", data: book });
            } else if (message.type === 'book-result') {
                finishBooks.push(message.data);
            } else if (message.type === 'done') {
                markFinished();
            }
        });
        // A crashed worker must not keep the collected results from being saved.
        worker.on("error", (e) => {
            console.error(e);
            markFinished();
        });
    }

    // On Ctrl-C save everything the workers have reported so far.
    process.on('SIGINT', () => {
        console.log(summarize("进程被手动结束"));
        saveBooks(finishBooks);
        process.exit(0);
    });
}

main();