From 32516fdf61b42d7aa096fd393c916249d7fdb223 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期五, 19 七月 2024 21:22:04 +0800 Subject: [PATCH] 增加包含isbn的书单获取 --- src/main.mjs | 351 ++++++++++++++++++++++------------------------------------ 1 files changed, 132 insertions(+), 219 deletions(-) diff --git a/src/main.mjs b/src/main.mjs index bbf02b6..2fbe23a 100644 --- a/src/main.mjs +++ b/src/main.mjs @@ -1,17 +1,15 @@ import xlsx from "node-xlsx"; -import { Builder, Browser, until, By } from "selenium-webdriver"; -import { Options as ChromeOptions } from "selenium-webdriver/chrome.js"; -import proxy from "selenium-webdriver/proxy.js"; import axios from "axios"; import * as fs from "fs"; import path from "path"; import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; import { HttpsProxyAgent } from "https-proxy-agent"; -import { resolve } from "path"; import { execFileSync } from "child_process"; import wordsjs from 'wordlist-js'; import usPlaceList from "./us-place-list.mjs"; import usPeronNameList from "./us-pseron-name-list.mjs"; +import * as pdfLib from 'pdf-lib'; + /*-------------璇诲彇閰嶇疆---------------*/ let config = JSON.parse(fs.readFileSync('./config.json')); @@ -36,6 +34,18 @@ proxy: false, httpsAgent, }); + +/** + * 鑾峰彇pdf鏂囦欢椤垫暟 + * @param {string} filepath pdf 鏂囦欢璺緞 + * @returns 椤垫暟 + */ +async function getPdfPages(filepath) { + const buf = fs.readFileSync(filepath); + const pdfDoc = await pdfLib.PDFDocument.load(buf, { ignoreEncryption: true }); + const pages = pdfDoc.getPages().length; + return pages; +} function allWords() { const words = {}; @@ -200,31 +210,6 @@ } /** - * 鍒涘缓娴忚鍣ㄩ┍鍔� - * @returns chrome娴忚鍣ㄩ┍鍔� - */ -async function createDriver() { - const opts = new ChromeOptions(); - if (config.headless) { - opts.addArguments("--headless");//寮�鍚棤澶存ā寮� - } - if (config.disableGpu) { - opts.addArguments("--disable-gpu");//绂佹gpu娓叉煋 - } - opts.addArguments("--ignore-ssl-error"); // 蹇界暐ssl閿欒 - opts.addArguments("--no-sandbox"); // 绂佺敤娌欑洅妯″紡 - opts.addArguments("blink-settings=imagesEnabled=false"); //绂佺敤鍥剧墖鍔犺浇 - // proxy - opts.setProxy(proxy.manual({ http: 'http://127.0.0.1:10809', https: 'http://127.0.0.1:10809' })) - const driver = await new Builder() - .setChromeOptions(opts) - .forBrowser(Browser.CHROME) - .build(); - driver.manage().setTimeouts({ implicit: 10000 }); - return driver; -} - -/** * 鏍煎紡鍖栧叧閿瓧 * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛� * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛� @@ -261,117 +246,46 @@ } /** - * 鎵撳紑鎼滅储椤甸潰骞舵悳绱� + * 鑾峰彇涔︾睄璇︽儏椤祏rl * @param {*} book */ -async function openSearchPage(book, titleWithNumbers) { - console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`); +async function getBookDetailPageUrl(book, titleWithNumbers) { + const kw = formatKw(book.title, titleWithNumbers); + const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`; + const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}` + console.log(`鎵撳紑鎼滅储: ${searchUrl}`); return await retry(async () => { - // 鑾峰彇椤甸潰 - const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`; - await driver.get(searchUrl); - }).then(() => true) - .catch(() => false); -} - -/** - * 妫�娴嬫悳绱㈢粨鏋� - * @param {*} book - * @returns true: 鏈夋悳绱㈢粨鏋滐紝false: 娌℃湁鎼滅储缁撴灉 - */ -async function checkSearchResult(book) { - console.log(`妫�娴嬫悳绱㈢粨鏋渀); - return await retry(async () => { - const text = await driver.executeScript(`return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#content-container > empty-placeholder").shadowRoot.querySelector("div > h2").textContent`); - if (text && text.includes("Your search did not match any items in the Archive. Try different keywords or a more general search.")) { - // 娌℃湁鎼滅储缁撴灉 - book.state = "娌℃湁鎼滅储缁撴灉"; - console.log(`娌℃湁鎼滅储缁撴灉: ${book.id} ${book.title}`); - return false; - } else { - return true; + const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }) + const { total, hits } = resp.data.response.body.hits + if (total === 0) { + return ''; } - }, 2) - .catch(() => { - return true; - }); -} - -async function findBookDetailPageUrl(book) { - console.log(`鏌ユ壘璇︽儏椤祏rl`); - return retry(async () => { - let detailPageUrl; - try { - detailPageUrl = await driver.executeScript( - `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article:nth-child(2) > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value` - ); - } catch (e) { - detailPageUrl = await driver.executeScript( - `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value` - ); - } - return detailPageUrl; + const hit = hits[0]; + const { identifier, title, creator } = hit.fields + return `https://archive.org/details/${identifier}`; }) .catch(() => ''); } async function openBookDetailPage(book, detailPageUrl) { - console.log(`鎵撳紑璇︽儏: https://archive.org${detailPageUrl}`); + console.log(`鎵撳紑璇︽儏: ${detailPageUrl}`); return await retry(async () => { - await driver.get(`https://archive.org${detailPageUrl}`); - await driver.wait( - until.elementLocated( - By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`) - ), 15000 - ); + const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); + const html = resp.data; + const data = JSON.parse(/<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/g.exec(html)[1]); + book.publisher = data.metadata.publisher; + book.pubDate = data.metadata.date; + const identifier = data.metadata.identifier; + const fileData = data.files.find(f => f.format === 'Text PDF'); + if (!fileData) { + return ''; + } + const fileUrl = `https://archive.org/download/${identifier}/${fileData.name}`; + return fileUrl; }) - .then(() => true) .catch(() => { book.state = "鎵撳紑璇︽儏椤靛け璐�"; console.log(`鎵撳紑璇︽儏椤靛け璐�: ${book.id} ${book.title}`); - return false; - }); -} - -async function getDownloadUrl(book) { - console.log(`鑾峰彇涓嬭浇閾炬帴`); - function getFullUrl(url) { - if (!url) { return ''; } - return url.startsWith("http") ? url : `https://archive.org${url}`; - } - return await retry(async () => { - const elements = await driver.findElements( - By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div/a`) - ); - - let pdfUrl = ""; - let textUrl = ""; - for (const el of elements) { - let text = await el.getText(); - if (text) { - text = text.trim().split("\n")[0]; - const href = getFullUrl(await el.getAttribute("href")); - if (text.toLowerCase() === "pdf") { - pdfUrl = href; - } else if (text.toLowerCase() === "full text") { - textUrl = href; - } else if (text.toLowerCase() === "ocr search text") { - textUrl = href; - } - } - } - - if (pdfUrl) { - return pdfUrl; - } else if (textUrl) { - return textUrl; - } else { - book.state = "娌℃湁text鏂囦欢"; - return '' - } - }) - .catch(() => { - book.state = "娌℃湁text鏂囦欢"; return ''; }); } @@ -395,11 +309,12 @@ console.log(`涓嬭浇鏂囦欢: ${url}`); const ext = url.split(".").pop().toLowerCase(); const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + book.url = url; if (fs.existsSync(filepath)) { book.state = `涓嬭浇瀹屾垚`; book.format = ext; book.file = filepath; - book.url = url; + book.pages = await getPdfPages(filepath).catch(() => 0); console.log(`涓嬭浇瀹屾垚锛�${filepath}`); return; } @@ -415,9 +330,6 @@ const len = response.headers['content-length']; if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { // 涓嶆槸pdf鎴杢xt鏂囦欢锛屼笖鏂囦欢澶т簬200M锛屼笉涓嬭浇 - book.state = "涓嬭浇澶辫触"; - book.url = url; - console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); reject(false); return; } @@ -425,20 +337,18 @@ const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; const out = fs.createWriteStream(_filepath); stream.pipe(out); - stream.on("end", () => { + stream.on("end", async () => { clearTimeout(timeout); book.state = `涓嬭浇瀹屾垚`; book.format = ext; book.file = filepath; book.url = url; + book.pages = await getPdfPages(filepath).catch(e => 0); resolve(true); }); stream.on("error", (err) => { clearTimeout(timeout); console.error(err); - book.state = "涓嬭浇澶辫触"; - book.url = url; - console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); reject(false); try { out.close(); @@ -450,13 +360,19 @@ }) .catch((e) => { clearTimeout(timeout); - console.error(e); - book.state = "涓嬭浇澶辫触"; + console.log(`涓嬭浇澶辫触锛岄敊璇爜: ${e?.response?.status ?? e.code}`); book.url = url; - console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); - reject(false); + if (e.response?.status === 403 || e.response?.status === 401) { + book.state = "娌℃湁涓嬭浇鏉冮檺"; + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); + resolve(true); + } else { + reject(false); + } })); }).catch(e => { + book.state = "涓嬭浇澶辫触"; + console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); return false }); } @@ -480,17 +396,8 @@ }); } -function getBookInfo(book) { - return retry(async () => { - book.publisher = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`).catch(e => 0); - book.pubDate = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`).catch(e => 0); - let pages = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`).catch(e => 0); - if (pages) { book.pages = pages.split(' / ')[1]; } - }); -} async function downloadBooks(books) { - driver = await createDriver(); for (; ;) { const book = await nextBook(); @@ -503,47 +410,38 @@ break; } bookCount++; - /*if (isAlreadyDownloaded(book)) { + if (isAlreadyDownloaded(book)) { skipCount++; + book.skip = true; continue; } - if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { + if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫� skipCount++; continue; - } */ + } console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`); // 鎵撳紑鎼滅储椤甸潰骞舵悳绱� - if (!await openSearchPage(book, true)) { + let detailPageUrl = await getBookDetailPageUrl(book, true); + if (!detailPageUrl) { // 鍏堢敤鍖呭惈鏁板瓧鐨勫叧閿瓧锛屽鏋滄病鏈夌粨鏋滃啀鐢ㄤ笉鍖呭惈鏁板瓧鐨勫叧閿瓧 - if (!await openSearchPage(book, false)) { - console.log(`鎵撳紑鎼滅储椤甸潰澶辫触: ${book.id} ${book.title}`); - book.state = "鎵撳紑鎼滅储椤甸潰澶辫触"; + detailPageUrl = await getBookDetailPageUrl(book, false); + if (!detailPageUrl) { + console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`); + book.state = "娌℃湁鎼滅储缁撴灉"; continue; } } - // 妫�娴嬫悳绱㈢粨鏋� - const hasBook = await checkSearchResult(book); - if (!hasBook) { - continue; - } - // 鑾峰彇璇︽儏椤甸摼鎺� - const detailPageUrl = await findBookDetailPageUrl(book); - if (!detailPageUrl) { - console.log(`鑾峰彇璇︽儏椤甸摼鎺ュけ璐�: ${book.id} ${book.title}`); - book.state = "鑾峰彇璇︽儏椤甸摼鎺ュけ璐�"; - continue; - } // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤� - sleep(getRandomNumber(500, 10000)); - // 鎵撳紑璇︽儏椤� - await openBookDetailPage(book, detailPageUrl); - await getBookInfo(book); - // 鑾峰彇涓嬭浇閾炬帴 - const url = await getDownloadUrl(book); - if (!url) { continue; } + sleep(getRandomNumber(500, 1000)); + // 鎵撳紑璇︽儏椤碉紝骞惰幏鍙栦笅杞介摼鎺� + const url = await openBookDetailPage(book, detailPageUrl); + if (!url) { + console.log(`娌℃湁pdf鎴杢ext鏂囦欢: ${book.id} ${book.title}`); + continue; + } // 绛夊緟涓�娈垫椂闂村啀涓嬭浇 - await sleep(getRandomNumber(500, 10000)); + await sleep(getRandomNumber(500, 1000)); // 涓嬭浇鏂囦欢 try { await downloadFile(book, url); @@ -552,7 +450,7 @@ } catch (e) { } successCount++; // 绛変竴娈垫椂闂村啀涓嬩竴涓� - sleep(getRandomNumber(500, 10000)); + sleep(getRandomNumber(500, 1000)); } } @@ -575,8 +473,15 @@ } const buffer = xlsx.build([{ name: "Sheet1", data }]); - fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { }); - console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx"); + try { + fs.writeFileSync("./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx", buffer, (err) => { }); + console.log("淇濆瓨瀹屾垚: ./銆愮浜屾壒浜屾澶勭悊鍚庛�戜氦浠樻竻鍗�.xlsx"); + } catch (e) { + console.error(e); + const outfile = `${Date.now()}.json`; + fs.writeFileSync(outfile, JSON.stringify(data)); + console.log("淇濆瓨瀹屾垚: " + outfile); + } } @@ -611,8 +516,6 @@ let bookCount = 0; // 璺宠繃鐨勬暟閲忥紝宸茬粡涓嬭浇杩囨垨娌℃湁鎼滅储鍒扮殑鏁伴噺 let skipCount = 0; -// chrome椹卞姩 -let driver; let alreadyDownloadedBooks = []; function getAlreadyDownloadedBooks() { @@ -623,7 +526,7 @@ return books.map(it => path.basename(it, path.extname(it)).trim()); } -function main() { +function startDownload() { initLogger(); const books = []; downloadBooks(books) @@ -637,49 +540,59 @@ // saveBooks(books); parentPort.postMessage({ type: "books", data: books }); logFile.close(); - try { - await driver.close(); - await driver.quit(); - } catch (e) { } }); } -if (!fs.existsSync('tmpdir')) { - fs.mkdirSync('tmpdir', { recursive: true }); -} -if (!fs.existsSync('downloads')) { - fs.mkdirSync('downloads', { recursive: true }); -} +function main() { -// 澶氳繘绋嬫墽琛� -if (isMainThread) { - initLogger(); - const alreadyDownloadedBooks = getAlreadyDownloadedBooks(); - const { startRow, endRow, threadSize } = config; - console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`); - let finishCnt = 0; - const finishBooks = []; - const books = getBooksFromExcel(startRow, endRow); - - for (let i = 0; i < threadSize; i++) { - const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); - worker.on("message", (message) => { - if (message.type === 'books') { - finishBooks.push(...message.data); - finishCnt++; - if (finishCnt >= threadSize) { - saveBooks(finishBooks); - } - } else if (message.type === 'get-book') { - worker.postMessage({ type: "book", data: books.shift() }); - } - }); + if (!fs.existsSync('tmpdir')) { + fs.mkdirSync('tmpdir', { recursive: true }); } -} else { - alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; - main(); + if (!fs.existsSync('downloads')) { + fs.mkdirSync('downloads', { recursive: true }); + } + // 澶氳繘绋嬫墽琛� + if (isMainThread) { + initLogger(); + let downloadCnt = 0; + const alreadyDownloadedBooks = getAlreadyDownloadedBooks(); + const { startRow, endRow, threadSize } = config; + console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`); + let finishThreadCnt = 0; + const finishBooks = []; + const books = getBooksFromExcel(startRow, endRow); + + for (let i = 0; i < threadSize; i++) { + const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); + worker.on("message", (message) => { + if (message.type === 'books') { + finishBooks.push(...message.data); + finishThreadCnt++; + if (finishThreadCnt >= threadSize) { + successCount = finishBooks.filter(it => it.state === '涓嬭浇瀹屾垚').length; + skipCount = finishBooks.filter(it => it.skip).length; + console.log(`鍏ㄩ儴绾跨▼瀹屾垚锛屽叡涓嬭浇${downloadCnt}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}锛屽け璐�${downloadCnt - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); + saveBooks(finishBooks); + } + } else if (message.type === 'get-book') { + downloadCnt++; + worker.postMessage({ type: "book", data: books.shift() }); + } + }); + } + // 鐩戝惉閫�鍑轰俊鍙凤紝淇濆瓨宸茬粡涓嬭浇鐨勫浘涔︿俊鎭� + process.on('SIGINT', () => { + successCount = finishBooks.filter(it => it.state === '涓嬭浇瀹屾垚').length; + skipCount = finishBooks.filter(it => it.skip).length; + console.log(`杩涚▼琚墜鍔ㄧ粨鏉燂紝鍏变笅杞�${downloadCnt}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}锛屽け璐�${downloadCnt - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); + saveBooks(finishBooks); + process.exit(0); + }); + } else { + alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; + startDownload(); + + } } -// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt"; -// let text = fs.readFileSync(filepath, 'utf8'); -// fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); \ No newline at end of file +main(); \ No newline at end of file -- Gitblit v1.9.1