| | |
| | | import xlsx from "node-xlsx"; |
| | | import { Builder, Browser, until, By } from "selenium-webdriver"; |
| | | import { Options as ChromeOptions } from "selenium-webdriver/chrome.js"; |
| | | import proxy from "selenium-webdriver/proxy.js"; |
| | | import axios from "axios"; |
| | | import * as fs from "fs"; |
| | | import path from "path"; |
| | | import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | import { resolve } from "path"; |
| | | import { execFileSync } from "child_process"; |
| | | import wordsjs from 'wordlist-js'; |
| | | import usPlaceList from "./us-place-list.mjs"; |
| | | import usPeronNameList from "./us-pseron-name-list.mjs"; |
| | | import * as pdfLib from 'pdf-lib'; |
| | | |
| | | /*------------- Load configuration ---------------*/ |
| | | let config = JSON.parse(fs.readFileSync('./config.json')); |
| | | |
| | |
| | | proxy: false, |
| | | httpsAgent, |
| | | }); |
| | | |
/**
 * Count the pages of a PDF file.
 *
 * The file is read synchronously and parsed with pdf-lib. `ignoreEncryption`
 * is passed so that encrypted documents are still loaded instead of rejected.
 *
 * @param {string} filepath Path of the PDF file.
 * @returns {Promise<number>} Number of pages in the document.
 */
async function getPdfPages(filepath) {
  const bytes = fs.readFileSync(filepath);
  const { PDFDocument } = pdfLib;
  const doc = await PDFDocument.load(bytes, { ignoreEncryption: true });
  return doc.getPages().length;
}
| | | |
| | | function allWords() { |
| | | const words = {}; |
| | |
| | | } |
| | | |
/**
 * Create the Chrome WebDriver used by the crawler.
 *
 * Headless mode and GPU rendering follow `config.headless` and
 * `config.disableGpu`. Image loading is always disabled and all traffic is
 * routed through the local proxy at 127.0.0.1:10809.
 *
 * @returns {Promise<import("selenium-webdriver").WebDriver>} Driver with a
 *   10 s implicit wait already applied.
 * @throws Rethrows any error raised while building the driver or applying
 *   the timeouts.
 */
async function createDriver() {
  const opts = new ChromeOptions();
  if (config.headless) {
    opts.addArguments("--headless"); // run without a visible window
  }
  if (config.disableGpu) {
    opts.addArguments("--disable-gpu"); // disable GPU rendering
  }
  // NOTE(review): meant to ignore SSL errors; Chrome's documented switch is
  // `--ignore-certificate-errors` — confirm this spelling has any effect.
  opts.addArguments("--ignore-ssl-error");
  opts.addArguments("--no-sandbox"); // disable the sandbox
  opts.addArguments("blink-settings=imagesEnabled=false"); // do not load images
  // proxy
  opts.setProxy(proxy.manual({ http: 'http://127.0.0.1:10809', https: 'http://127.0.0.1:10809' }));
  const driver = await new Builder()
    .setChromeOptions(opts)
    .forBrowser(Browser.CHROME)
    .build();
  try {
    // setTimeouts() returns a promise: await it so the implicit wait is in
    // place before the driver is used and a failure is reported to the caller
    // instead of becoming an unhandled rejection.
    await driver.manage().setTimeouts({ implicit: 10000 });
  } catch (err) {
    // Best-effort cleanup so a half-configured browser is not left running.
    await driver.quit().catch(() => {});
    throw err;
  }
  return driver;
}
| | | |
| | | /** |
| | | * 格式化关键字 |
| | | * @param {string} text 要搜索的关键字 |
| | | * @param {boolean} titleWithNumbers 是否标题中包含数字 |
| | |
| | | } |
| | | |
| | | /** |
| | | * 打开搜索页面并搜索 |
| | | * 获取书籍详情页url |
| | | * @param {*} book |
| | | */ |
| | | async function openSearchPage(book, titleWithNumbers) { |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`); |
| | | async function getBookDetailPageUrl(book, titleWithNumbers) { |
| | | const kw = formatKw(book.title, titleWithNumbers); |
| | | const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`; |
| | | const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}` |
| | | console.log(`打开搜索: ${searchUrl}`); |
| | | return await retry(async () => { |
| | | // 获取页面 |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`; |
| | | await driver.get(searchUrl); |
| | | }).then(() => true) |
| | | .catch(() => false); |
| | | } |
| | | |
| | | /** |
| | | * 检测搜索结果 |
| | | * @param {*} book |
| | | * @returns true: 有搜索结果,false: 没有搜索结果 |
| | | */ |
| | | async function checkSearchResult(book) { |
| | | console.log(`检测搜索结果`); |
| | | return await retry(async () => { |
| | | const text = await driver.executeScript(`return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#content-container > empty-placeholder").shadowRoot.querySelector("div > h2").textContent`); |
| | | if (text && text.includes("Your search did not match any items in the Archive. Try different keywords or a more general search.")) { |
| | | // 没有搜索结果 |
| | | book.state = "没有搜索结果"; |
| | | console.log(`没有搜索结果: ${book.id} ${book.title}`); |
| | | return false; |
| | | } else { |
| | | return true; |
| | | const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }) |
| | | const { total, hits } = resp.data.response.body.hits |
| | | if (total === 0) { |
| | | return ''; |
| | | } |
| | | }, 2) |
| | | .catch(() => { |
| | | return true; |
| | | }); |
| | | } |
| | | |
/**
 * Read the detail-page link of a search result from the search page that is
 * currently loaded in `driver`.
 *
 * The result tiles sit inside nested shadow roots, so the link is looked up
 * by a script executed in the page. The tile at `article:nth-child(2)` is
 * tried first; if that script throws, the first `article` is used instead.
 *
 * @param {*} book Unused; kept so existing call sites stay valid.
 * @returns {Promise<string>} The `href` attribute value of the tile's link,
 *   or '' when the `retry(...)` promise rejects.
 */
async function findBookDetailPageUrl(book) {
  console.log(`查找详情页url`);
  // Shared path down to the shadow root that holds the result tiles.
  const scrollerRoot = `document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot`;
  const hrefScript = (articleSelector) =>
    `return ${scrollerRoot}.querySelector("#container > ${articleSelector} > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`;
  return retry(async () => {
    try {
      return await driver.executeScript(hrefScript("article:nth-child(2)"));
    } catch (e) {
      return await driver.executeScript(hrefScript("article"));
    }
  })
    .catch(() => '');
}
| | | |
/**
 * Open a book's detail page and read its metadata.
 *
 * As written, the callback passed to `retry` does two things in sequence:
 *  1. navigates the shared `driver` to `https://archive.org${detailPageUrl}`
 *     and waits up to 15000 ms for the element matched by the XPath below;
 *  2. fetches `detailPageUrl` with `myAxios`, parses the JSON held in the
 *     hidden `js-ia-metadata` input, copies `publisher` and `date` onto
 *     `book`, and builds the download URL of the file whose format is
 *     'Text PDF' ('' when there is no such file).
 *
 * NOTE(review): the `.then(() => true)` in the chain replaces that URL with
 * `true`, and the URL is logged in two different forms (with and without the
 * `https://archive.org` prefix). This looks like two revisions of the
 * function interleaved — confirm which one is intended.
 * NOTE(review): `exec(html)[1]` throws when the input is not found; that
 * error goes to `retry` and, if it rejects, to the `.catch` below.
 *
 * @param {*} book Record updated in place: `publisher`, `pubDate`, and
 *   `state` ("打开详情页失败") on failure.
 * @param {string} detailPageUrl Detail-page URL or path (both forms are used).
 * @returns {Promise<boolean>} true when the `retry(...)` promise fulfils,
 *   false when it rejects.
 */
| | | async function openBookDetailPage(book, detailPageUrl) { |
| | | console.log(`打开详情: https://archive.org${detailPageUrl}`); |
| | | console.log(`打开详情: ${detailPageUrl}`); |
| | | return await retry(async () => { |
| | | await driver.get(`https://archive.org${detailPageUrl}`); |
| | | await driver.wait( |
| | | until.elementLocated( |
| | | By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`) |
| | | ), 15000 |
| | | ); |
| | | const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); |
| | | const html = resp.data; |
| | | const data = JSON.parse(/<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/g.exec(html)[1]); |
| | | book.publisher = data.metadata.publisher; |
| | | book.pubDate = data.metadata.date; |
| | | const identifier = data.metadata.identifier; |
| | | const fileData = data.files.find(f => f.format === 'Text PDF'); |
| | | if (!fileData) { |
| | | return ''; |
| | | } |
| | | const fileUrl = `https://archive.org/download/${identifier}/${fileData.name}`; |
| | | return fileUrl; |
| | | }) |
| | | .then(() => true) |
| | | .catch(() => { |
| | | book.state = "打开详情页失败"; |
| | | console.log(`打开详情页失败: ${book.id} ${book.title}`); |
| | | return false; |
| | | }); |
| | | } |
| | | |
/**
 * Find the plain-text download link on the detail page currently loaded in
 * `driver`.
 *
 * Every anchor matched by the XPath below is inspected. The first line of its
 * visible text is compared case-insensitively with "full text" and
 * "ocr search text"; when several anchors match, the last one wins. The
 * `href` is only read for matching anchors, which saves one WebDriver round
 * trip per unrelated link.
 *
 * @param {*} book Record whose `state` is set to "没有text文件" when no link
 *   is found or the lookup fails.
 * @returns {Promise<string>} Absolute download URL, or '' when there is none.
 */
async function getDownloadUrl(book) {
  console.log(`获取下载链接`);
  const TEXT_LABELS = new Set(["full text", "ocr search text"]);
  // Relative links on the page are resolved against archive.org.
  const toAbsoluteUrl = (url) => {
    if (!url) { return ''; }
    return url.startsWith("http") ? url : `https://archive.org${url}`;
  };
  return await retry(async () => {
    const anchors = await driver.findElements(
      By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div/a`)
    );

    let textUrl = "";
    for (const anchor of anchors) {
      const rawText = await anchor.getText();
      if (!rawText) { continue; }
      // Only the first line is the format label; later lines are ignored.
      const label = rawText.trim().split("\n")[0].toLowerCase();
      if (TEXT_LABELS.has(label)) {
        textUrl = toAbsoluteUrl(await anchor.getAttribute("href"));
      }
    }

    if (!textUrl) {
      book.state = "没有text文件";
    }
    return textUrl;
  })
    .catch(() => {
      book.state = "没有text文件";
      return '';
    });
}
| | |
| | | async function downloadFile(book, url) { |
| | | console.log(`下载文件: ${url}`); |
| | | const ext = url.split(".").pop().toLowerCase(); |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.txt`; |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | book.url = url; |
| | | if (fs.existsSync(filepath)) { |
| | | book.state = `下载完成`; |
| | | book.format = ext; |
| | | book.file = filepath; |
| | | book.url = url; |
| | | book.pages = await getPdfPages(filepath).catch(() => 0); |
| | | console.log(`下载完成:${filepath}`); |
| | | return; |
| | | } |
| | |
| | | const len = response.headers['content-length']; |
| | | if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { |
| | | // 不是pdf或txt文件,且文件大于200M,不下载 |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | reject(false); |
| | | return; |
| | | } |
| | |
| | | const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | const out = fs.createWriteStream(_filepath); |
| | | stream.pipe(out); |
| | | stream.on("end", () => { |
| | | stream.on("end", async () => { |
| | | clearTimeout(timeout); |
| | | book.state = `下载完成`; |
| | | book.format = ext; |
| | | book.file = filepath; |
| | | book.url = url; |
| | | console.log(`下载完成:${filepath}`); |
| | | setTimeout(() => { |
| | | if (ext === "gz" || ext === "zip") { |
| | | unzip(_filepath, filepath); |
| | | fs.unlinkSync(_filepath); |
| | | } |
| | | let text = fs.readFileSync(filepath, 'utf-8'); |
| | | text = getTextFromHtml(text); |
| | | fs.writeFileSync(filepath, text, 'utf-8'); |
| | | try { |
| | | fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |
| | | } catch (e) { |
| | | reject(e); |
| | | try { |
| | | out.close(); |
| | | fs.unlink(filepath, (e) => console.error(e)); |
| | | } catch (e) { |
| | | console.error(e); |
| | | } |
| | | } |
| | | }, 1000); |
| | | book.pages = await getPdfPages(filepath).catch(e => 0); |
| | | resolve(true); |
| | | }); |
| | | stream.on("error", (err) => { |
| | | clearTimeout(timeout); |
| | | console.error(err); |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | reject(false); |
| | | try { |
| | | out.close(); |
| | |
| | | }) |
| | | .catch((e) => { |
| | | clearTimeout(timeout); |
| | | console.error(e); |
| | | book.state = "下载失败"; |
| | | console.log(`下载失败,错误码: ${e?.response?.status ?? e.code}`); |
| | | book.url = url; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | reject(false); |
| | | if (e.response?.status === 403 || e.response?.status === 401) { |
| | | book.state = "没有下载权限"; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | resolve(true); |
| | | } else { |
| | | reject(false); |
| | | } |
| | | })); |
| | | }).catch(e => { |
| | | book.state = "下载失败"; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | return false |
| | | }); |
| | | } |
| | |
| | | }); |
| | | } |
| | | |
/**
 * Read publisher, publication date and page count from the page currently
 * loaded in `driver` and store them on `book`.
 *
 * The three values are read one after another inside `retry`; `book` is only
 * modified once all three reads (and the page-count split) have succeeded.
 * The page count is the part after ' / ' in the results-count text.
 *
 * @param {*} book Record that receives `publisher`, `pubDate` and `pages`.
 * @returns {Promise<*>} Whatever `retry` returns for the callback.
 */
function getBookInfo(book) {
  const readText = (selector) =>
    driver.executeScript(`return document.querySelector("${selector}").textContent`);

  return retry(async () => {
    const publisherText = await readText("span[itemprop=publisher]");
    const publishedText = await readText("span[itemprop=datePublished]");
    const counterText = await readText("span[data-id=resultsCount]");
    const [, pageTotal] = counterText.split(' / ');
    Object.assign(book, {
      publisher: publisherText,
      pubDate: publishedText,
      pages: pageTotal,
    });
  });
}
| | | |
| | | async function downloadBooks(books) { |
| | | driver = await createDriver(); |
| | | |
| | | for (; ;) { |
| | | const book = await nextBook(); |
| | |
| | | break; |
| | | } |
| | | bookCount++; |
| | | /*if (isAlreadyDownloaded(book)) { |
| | | if (isAlreadyDownloaded(book)) { |
| | | skipCount++; |
| | | book.skip = true; |
| | | continue; |
| | | } |
| | | if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { |
| | | if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { |
| | | // 跳过没有搜索结果或没有pdf或text文件的书籍 |
| | | skipCount++; |
| | | continue; |
| | | } */ |
| | | } |
| | | console.log(`开始下载: ${book.id} ${book.title}`); |
| | | // 打开搜索页面并搜索 |
| | | if (!await openSearchPage(book, true)) { |
| | | let detailPageUrl = await getBookDetailPageUrl(book, true); |
| | | if (!detailPageUrl) { |
| | | // 先用包含数字的关键字,如果没有结果再用不包含数字的关键字 |
| | | if (!await openSearchPage(book, false)) { |
| | | console.log(`打开搜索页面失败: ${book.id} ${book.title}`); |
| | | book.state = "打开搜索页面失败"; |
| | | detailPageUrl = await getBookDetailPageUrl(book, false); |
| | | if (!detailPageUrl) { |
| | | console.log(`获取详情页链接失败: ${book.id} ${book.title}`); |
| | | book.state = "没有搜索结果"; |
| | | continue; |
| | | } |
| | | } |
| | | // 检测搜索结果 |
| | | const hasBook = await checkSearchResult(book); |
| | | if (!hasBook) { |
| | | continue; |
| | | } |
| | | // 获取详情页链接 |
| | | const detailPageUrl = await findBookDetailPageUrl(book); |
| | | if (!detailPageUrl) { |
| | | console.log(`获取详情页链接失败: ${book.id} ${book.title}`); |
| | | book.state = "获取详情页链接失败"; |
| | | continue; |
| | | } |
| | | // 等一段时间再打开详情页 |
| | | sleep(getRandomNumber(500, 10000)); |
| | | // 打开详情页 |
| | | await openBookDetailPage(book, detailPageUrl); |
| | | await getBookInfo(book); |
| | | // 获取下载链接 |
| | | const url = await getDownloadUrl(book); |
| | | if (!url) { continue; } |
| | | sleep(getRandomNumber(500, 1000)); |
| | | // 打开详情页,并获取下载链接 |
| | | const url = await openBookDetailPage(book, detailPageUrl); |
| | | if (!url) { |
| | | console.log(`没有pdf或text文件: ${book.id} ${book.title}`); |
| | | continue; |
| | | } |
| | | // 等待一段时间再下载 |
| | | await sleep(getRandomNumber(500, 10000)); |
| | | await sleep(getRandomNumber(500, 1000)); |
| | | // 下载文件 |
| | | try { |
| | | await downloadFile(book, url); |
| | | console.log(`下载完成: ${book.id} ${book.title}`); |
| | | console.log('finish: ' + JSON.stringify(book)); |
| | | } catch (e) { } |
| | | successCount++; |
| | | // 等一段时间再下一个 |
| | | sleep(getRandomNumber(500, 10000)); |
| | | sleep(getRandomNumber(500, 1000)); |
| | | } |
| | | } |
| | | |
| | |
| | | for (const book of books) { |
| | | const index = data.findIndex((row) => row[0] === book.id); |
| | | if (index > -1) { |
| | | data[index][5] = book.publisher; |
| | | data[index][6] = book.pubDate; |
| | | data[index][11] = book.pages; |
| | | data[index][12] = book.state; |
| | | data[index][13] = book.format; |
| | | data[index][14] = book.file; |
| | |
| | | } |
| | | |
| | | const buffer = xlsx.build([{ name: "Sheet1", data }]); |
| | | fs.writeFileSync("./【第二批二次处理后】交付清单.xlsx", buffer, (err) => { }); |
| | | console.log("保存完成: ./【第二批二次处理后】交付清单.xlsx"); |
| | | try { |
| | | fs.writeFileSync("./【第二批二次处理后】交付清单.xlsx", buffer, (err) => { }); |
| | | console.log("保存完成: ./【第二批二次处理后】交付清单.xlsx"); |
| | | } catch (e) { |
| | | console.error(e); |
| | | const outfile = `${Date.now()}.json`; |
| | | fs.writeFileSync(outfile, JSON.stringify(data)); |
| | | console.log("保存完成: " + outfile); |
| | | } |
| | | } |
| | | |
| | | |
| | |
| | | let bookCount = 0; |
| | | // Number of skipped books: already downloaded, or nothing found by the search |
| | | let skipCount = 0; |
| | | // Chrome WebDriver |
| | | let driver; |
| | | let alreadyDownloadedBooks = []; |
| | | |
| | | function getAlreadyDownloadedBooks() { |
| | |
| | | return books.map(it => path.basename(it, path.extname(it)).trim()); |
| | | } |
| | | |
| | | function main() { |
| | | function startDownload() { |
| | | initLogger(); |
| | | const books = []; |
| | | downloadBooks(books) |
| | |
| | | // saveBooks(books); |
| | | parentPort.postMessage({ type: "books", data: books }); |
| | | logFile.close(); |
| | | try { |
| | | await driver.close(); |
| | | await driver.quit(); |
| | | } catch (e) { } |
| | | }); |
| | | } |
| | | |
// Make sure the working directories exist before anything writes to them.
// With `recursive: true`, mkdirSync does nothing when the directory is
// already there, so no existsSync() pre-check is needed; dropping it also
// removes the check-then-create race between concurrent starters.
for (const dir of ['tmpdir', 'downloads']) {
  fs.mkdirSync(dir, { recursive: true });
}
| | | function main() { |
| | | |
| | | // 多进程执行 |
| | | if (isMainThread) { |
| | | initLogger(); |
| | | const alreadyDownloadedBooks = getAlreadyDownloadedBooks(); |
| | | const { startRow, endRow, threadSize } = config; |
| | | console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`); |
| | | let finishCnt = 0; |
| | | const finishBooks = []; |
| | | const books = getBooksFromExcel(startRow, endRow); |
| | | |
| | | for (let i = 0; i < threadSize; i++) { |
| | | const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); |
| | | worker.on("message", (message) => { |
| | | if (message.type === 'books') { |
| | | finishBooks.push(...message.data); |
| | | finishCnt++; |
| | | if (finishCnt >= threadSize) { |
| | | saveBooks(finishBooks); |
| | | } |
| | | } else if (message.type === 'get-book') { |
| | | worker.postMessage({ type: "book", data: books.shift() }); |
| | | } |
| | | }); |
| | | if (!fs.existsSync('tmpdir')) { |
| | | fs.mkdirSync('tmpdir', { recursive: true }); |
| | | } |
| | | } else { |
| | | alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; |
| | | main(); |
| | | if (!fs.existsSync('downloads')) { |
| | | fs.mkdirSync('downloads', { recursive: true }); |
| | | } |
| | | // 多进程执行 |
| | | if (isMainThread) { |
| | | initLogger(); |
| | | let downloadCnt = 0; |
| | | const alreadyDownloadedBooks = getAlreadyDownloadedBooks(); |
| | | const { startRow, endRow, threadSize } = config; |
| | | console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`); |
| | | let finishThreadCnt = 0; |
| | | const finishBooks = []; |
| | | const books = getBooksFromExcel(startRow, endRow); |
| | | |
| | | for (let i = 0; i < threadSize; i++) { |
| | | const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); |
| | | worker.on("message", (message) => { |
| | | if (message.type === 'books') { |
| | | finishBooks.push(...message.data); |
| | | finishThreadCnt++; |
| | | if (finishThreadCnt >= threadSize) { |
| | | successCount = finishBooks.filter(it => it.state === '下载完成').length; |
| | | skipCount = finishBooks.filter(it => it.skip).length; |
| | | console.log(`全部线程完成,共下载${downloadCnt}本,成功下载${successCount}本,跳过${skipCount},失败${downloadCnt - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`); |
| | | saveBooks(finishBooks); |
| | | } |
| | | } else if (message.type === 'get-book') { |
| | | downloadCnt++; |
| | | worker.postMessage({ type: "book", data: books.shift() }); |
| | | } |
| | | }); |
| | | } |
| | | // 监听退出信号,保存已经下载的图书信息 |
| | | process.on('SIGINT', () => { |
| | | successCount = finishBooks.filter(it => it.state === '下载完成').length; |
| | | skipCount = finishBooks.filter(it => it.skip).length; |
| | | console.log(`进程被手动结束,共下载${downloadCnt}本,成功下载${successCount}本,跳过${skipCount},失败${downloadCnt - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`); |
| | | saveBooks(finishBooks); |
| | | process.exit(0); |
| | | }); |
| | | } else { |
| | | alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; |
| | | startDownload(); |
| | | |
| | | } |
| | | } |
| | | |
| | | // const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt"; |
| | | // let text = fs.readFileSync(filepath, 'utf8'); |
| | | // fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |
| | | main(); |