| | |
| | | import proxy from "selenium-webdriver/proxy.js"; |
| | | import axios from "axios"; |
| | | import * as fs from "fs"; |
| | | import { Worker, isMainThread, parentPort, workerData } from 'worker_threads'; |
| | | import path from "path"; |
| | | import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | import { resolve } from "path"; |
| | | |
| | | import { execFileSync } from "child_process"; |
| | | import wordsjs from 'wordlist-js'; |
| | | import usPlaceList from "./us-place-list.mjs"; |
| | | import usPeronNameList from "./us-pseron-name-list.mjs"; |
/* ------------- Load configuration ------------- */
// Settings read elsewhere in this file (startRow, endRow, threadSize, endOfTime) come from
// ./config.json. Kept as `let` because other parts of the file assign to its properties and
// the rest of the module is not fully visible here.
let config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));

// `recursive: true` makes mkdirSync succeed when the directory already exists,
// so a separate existsSync pre-check is unnecessary.
fs.mkdirSync('./logs', { recursive: true });
| | | logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' }); |
| | | logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); |
| | | console.log = function (...text) { |
| | | text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`; |
| | | _log(text); |
| | |
| | | proxy: false, |
| | | httpsAgent, |
| | | }); |
| | | |
/**
 * Build the lookup table of known words used by the spell-check heuristics.
 *
 * Merges every list found under `wordsjs.default` with the project's US place-name and
 * person-name lists. Keys are lower-cased because every lookup in this file lower-cases the
 * candidate word first, so a mixed-case key could never match.
 *
 * @returns {Object<string, boolean>} prototype-less map whose keys are the known words
 */
function allWords() {
  // The previous version attached the two extra lists to `wordsjs` itself but only iterated
  // `wordsjs.default`, so they never reached the map (and the imported module was mutated).
  // Build a local collection instead.
  const wordLists = {
    ...wordsjs.default,
    usPlaces: usPlaceList,
    usPeronNameList,
  };
  // No prototype: inherited names such as "constructor" must not count as known words.
  const words = Object.create(null);
  for (const list of Object.values(wordLists)) {
    // NOTE(review): assumes each entry is an iterable of strings; anything else is skipped.
    if (list == null || typeof list[Symbol.iterator] !== 'function') {
      continue;
    }
    for (const word of list) {
      if (typeof word === 'string') {
        words[word.toLocaleLowerCase()] = true;
      }
    }
  }
  return words;
}
| | | |
| | | const wordsMap = allWords(); |
| | | |
/**
 * Count the words in a string.
 *
 * A word is a maximal run of non-space characters; words are separated by one or more
 * space characters. The previous implementation counted spaces instead, which reported one
 * word too few for ordinary text and miscounted leading/trailing spaces.
 *
 * @param {string} str input string
 * @returns {number} number of words (0 for an empty or all-space string)
 */
function countWordSize(str) {
  return str.split(/ +/).filter((token) => token !== '').length;
}
| | | |
/**
 * Compute the share of words in `text` that are not recognised.
 *
 * A word counts as recognised when its lower-cased form is a key of `wordsMap` or when it
 * contains at least one digit. One punctuation character directly following a letter
 * (. ! ? , ; " ' or a closing parenthesis) is stripped before the lookup.
 *
 * @param {string} text text to check (a single line in the visible callers)
 * @returns {number} ratio in the range 0..1; 0 when the text contains no words
 */
function incorrectWordRatio(text) {
  const words = text
    .replace(/([a-zA-Z])[\.!?,;"')]/g, '$1')
    .split(/ +/)
    // Leading/trailing spaces yield empty tokens; they are not words and must not be
    // counted as misspellings.
    .filter((word) => word !== '');
  if (words.length === 0) {
    return 0;
  }
  const incorrectWordCnt = words.filter(
    (word) => !wordsMap[word.toLocaleLowerCase()] && !/\d/.test(word),
  ).length;
  return incorrectWordCnt / words.length;
}
| | | |
/**
 * Share of characters in `text` that are neither ASCII letters, ASCII digits nor spaces.
 *
 * @param {string} text text to measure
 * @returns {number} ratio in the range 0..1; 0 for an empty string (previously NaN)
 */
function symbolRatio(text) {
  if (text.length === 0) {
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g) ?? [];
  return symbols.length / text.length;
}
| | | |
/**
 * Clean up book text by normalising whitespace and dropping lines that look like noise.
 *
 * Steps, in order: strip carriage returns, cut a leading "books.google.com" preamble,
 * collapse repeated spaces and newlines, then filter the text line by line using the
 * symbol ratio, the unrecognised-word ratio and a few pattern checks.
 *
 * @param {string} text text to clean
 * @returns {string} the kept lines joined with "\n"
 */
function cleanText(text) {
  // Drop carriage returns so only "\n" line breaks remain.
  text = text.replace(/(\r)/g, '');
  // Only the first 10000 characters are searched for the Google preamble.
  const googlePage = text.substring(0, 10000);
  if (googlePage.includes('google')) {
    // Remove everything in that prefix up to and including the last "books.google.com"
    // (spaces around the dots tolerated, case-insensitive), then re-attach the remainder.
    text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000);
  }
  // if (!/.{170,}/g.test(text) || text.includes('google')) {
  // Collapse runs of two or more spaces into a single space.
  text = text.replace(/[ ]{2,}/g, ' ')
  if (!/.{170,}/g.test(text)) {
    // Original note: "no more than 170 characters per line".
    // NOTE(review): this branch runs only when no line has 170+ characters, in which case
    // the replace below cannot match anything — it looks like a no-op; confirm the intent.
    text = text.replace(/(.{170,})\n/g, '$1');
  }
  // Collapse consecutive newlines (blank lines) into one.
  text = text.replace(/\n+/g, '\n');
  // Join a line ending in "-" with the following line, keeping the hyphen.
  text = text.replace(/-\n/g, '-');
  const lines = text.split('\n');
  const result = [];
  for (const line of lines) {
    // Skip lines whose symbol ratio is above 0.2 and whose unrecognised-word ratio is above 0.65.
    const incorrectRatio = incorrectWordRatio(line);
    if (symbolRatio(line) > 0.2) {
      if (incorrectRatio > 0.65) {
        continue;
      }
    }
    // With spaces removed: if some non-digit character repeats 3+ times in a row, skip the
    // line when countWordSize reports fewer than 5 or the unrecognised-word ratio is above 0.65.
    const wordSize = countWordSize(line);
    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) {
      if (wordSize < 5 || incorrectRatio > 0.65) {
        continue;
      }
    }
    // Skip lines with three or more consecutive punctuation characters when the
    // unrecognised-word ratio is above 0.65.
    if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/.test(line)) {
      continue;
    }
    // Original note: "drop lines with too few words".
    // NOTE(review): the condition actually drops lines with wordSize ABOVE 5 and a high
    // unrecognised-word ratio — comment and code disagree; confirm which is intended.
    if (wordSize > 5 && incorrectRatio > 0.65) {
      continue;
    }
    // Skip any line that mentions "google" (case-insensitive).
    if (/.*(google).*/ig.test(line)) {
      continue;
    }
    // Trim the line and strip the characters ■ • * ¬ » « ^ - wherever they occur; skip the
    // line if at most one character is left. Note that the stripped form is what gets kept,
    // so hyphens and asterisks are removed from the output as well.
    const ret = line.trim().replace(/[■•*¬»«^-]/g, '');
    if (ret.length <= 1) {
      continue;
    }
    // Skip the bare "Digitized by" line.
    if (ret == 'Digitized by') {
      continue;
    }
    result.push(ret);
  }
  text = result.join('\n');
  // }
  return text;
}
| | | |
/**
 * Extract an archive and copy its largest file to `txtFile`.
 *
 * The archive is unpacked with `./7za.exe x -aoa` into a scratch directory named after the
 * current worker thread id. Only regular files at the top level of that directory are
 * considered. The scratch directory is always removed, even when extraction or the copy fails.
 *
 * @param {string} zipFile path of the archive to extract
 * @param {string} txtFile destination path for the extracted file
 * @throws {Error} when 7za fails, or when the archive yields no regular file
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);

    let largest = null;
    for (const name of fs.readdirSync(tmpdir)) {
      const stat = fs.statSync(`${tmpdir}/${name}`);
      // Directories cannot be copied by a non-recursive cpSync; ignore them.
      if (!stat.isFile()) {
        continue;
      }
      // `>=` keeps the last of equally sized files, matching the old sort-then-pop order.
      if (largest === null || stat.size >= largest.size) {
        largest = { name, size: stat.size };
      }
    }
    if (largest === null) {
      throw new Error(`No file was extracted from ${zipFile}`);
    }

    // `force` is the documented overwrite switch of fs.cpSync (`overwrite` is not an option).
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    // `force: true` keeps cleanup from masking the real error when the directory was never created.
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}
| | | |
/**
 * Turn a book title into the `+`-joined keyword string used in the archive.org search URL.
 *
 * Keeps only CJK ideographs (U+4E00–U+9FA5), ASCII letters, accented Latin letters
 * (U+00C0–U+024F) and spaces; digits are kept only when `titleWithNumbers` is truthy.
 * Greek letters are NOT covered by these ranges, although an earlier comment listed them.
 *
 * The previous body joined the first 10 words with "+" and then tried to re-split on spaces
 * to keep 6 — a no-op, since no spaces were left. The limit written last (6) is now the
 * default and can be overridden. Empty tokens left behind by removed punctuation
 * (e.g. "A - B") are dropped instead of producing "++".
 *
 * @param {string} text raw title
 * @param {boolean} titleWithNumbers keep digits when truthy
 * @param {number} [maxWords=6] maximum number of words to keep
 * @returns {string} keywords joined with "+"
 */
function formatKw(text, titleWithNumbers, maxWords = 6) {
  const disallowed = titleWithNumbers
    ? /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g
    : /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g;
  return text
    .replace(disallowed, '')
    .split(' ')
    .filter((word) => word !== '')
    .slice(0, maxWords)
    .join('+');
}
| | | |
| | |
| | | * @param {*} book |
| | | */ |
| | | async function openSearchPage(book, titleWithNumbers) { |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`); |
| | | return await retry(async () => { |
| | | // 获取页面 |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`; |
| | | await driver.get(searchUrl); |
| | | }).then(() => true) |
| | | .catch(() => false); |
| | |
| | | } else if (textUrl) { |
| | | return textUrl; |
| | | } else { |
| | | book.state = "没有pdf或text文件"; |
| | | book.state = "没有text文件"; |
| | | return '' |
| | | } |
| | | }) |
| | | .catch(() => { |
| | | book.state = "没有pdf或text文件"; |
| | | book.state = "没有text文件"; |
| | | return ''; |
| | | }); |
| | | } |
| | | |
/**
 * Extract the contents of the first `<pre>` element from an HTML page.
 *
 * Input that does not contain `<!DOCTYPE html>` is treated as plain text and returned
 * unchanged, as is HTML without a `<pre>` tag. A single line break directly after `<pre>`
 * is dropped. The previous code skipped one character unconditionally (`indexOf + 6` for a
 * 5-character tag), which lost the first character when no line break followed, and produced
 * a meaningless substring when either tag was missing.
 *
 * @param {string} text HTML page or plain text
 * @returns {string} the extracted text
 */
function getTextFromHtml(text) {
  if (!text.includes('<!DOCTYPE html>')) {
    return text;
  }
  const openTag = '<pre>';
  const openAt = text.indexOf(openTag);
  if (openAt === -1) {
    return text;
  }
  const bodyStart = openAt + openTag.length;
  const closeAt = text.indexOf('</pre>', bodyStart);
  // An unterminated <pre> runs to the end of the document.
  const body = closeAt === -1 ? text.substring(bodyStart) : text.substring(bodyStart, closeAt);
  return body.replace(/^\r?\n/, '');
}
| | | |
| | | async function downloadFile(book, url) { |
| | | console.log(`下载文件: ${url}`); |
| | | const ext = url.split(".").pop(); |
| | | const ext = url.split(".").pop().toLowerCase(); |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | if (fs.existsSync(filepath)) { |
| | | book.state = `下载完成`; |
| | |
| | | return; |
| | | } |
| | | await retry(() => { |
| | | const timeoutTime = 10 * 60 * 1000; |
| | | const source = axios.CancelToken.source(); |
| | | const timeout = setTimeout(() => { |
| | | source.cancel("timeout"); |
| | | }, timeoutTime); |
| | | return new Promise((resolve, reject) => myAxios |
| | | .get(url, { responseType: "stream" }) |
| | | .get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token }) |
| | | .then((response) => { |
| | | const len = response.headers['content-length']; |
| | | if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { |
| | |
| | | return; |
| | | } |
| | | const stream = response.data; |
| | | const out = fs.createWriteStream(filepath); |
| | | const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | const out = fs.createWriteStream(_filepath); |
| | | stream.pipe(out); |
| | | stream.on("end", () => { |
| | | clearTimeout(timeout); |
| | | book.state = `下载完成`; |
| | | book.format = ext; |
| | | book.file = filepath; |
| | | book.url = url; |
| | | console.log(`下载完成:${filepath}`); |
| | | resolve(true); |
| | | }); |
| | | stream.on("error", (err) => { |
| | | clearTimeout(timeout); |
| | | console.error(err); |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | |
| | | }); |
| | | }) |
| | | .catch((e) => { |
| | | clearTimeout(timeout); |
| | | console.error(e); |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | |
| | | }); |
| | | } |
| | | |
/**
 * Tell whether a book's "<id> <isbn>" key is present in `alreadyDownloadedBooks`.
 *
 * @param {{id: *, isbn: *}} book book record
 * @returns {boolean} true when the key is in the list
 */
function isAlreadyDownloaded(book) {
  const { id, isbn } = book;
  return alreadyDownloadedBooks.includes(`${id} ${isbn}`);
}
| | | |
/**
 * Ask the parent thread for the next book.
 *
 * Posts `{ type: 'get-book', threadId }` to the parent and resolves with the `data` of the
 * first incoming message whose `type` is 'book'; other messages are ignored. The promise
 * never settles if no such message arrives.
 *
 * @returns {Promise<*>} the `data` field of the parent's 'book' message
 */
function nextBook() {
  return new Promise((deliver) => {
    function onMessage(msg) {
      if (msg.type !== 'book') {
        return;
      }
      deliver(msg.data);
      parentPort.removeListener('message', onMessage);
    }

    // Subscribe before posting the request.
    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'get-book', threadId });
  });
}
| | | |
/**
 * Read publisher, publication date and page count from the page currently loaded in
 * `driver` and store them on `book`.
 *
 * Each value comes from a script executed in the page; when a script fails the value is 0.
 * `book.pages` is only assigned when the results-count text could be read, and is taken
 * from the part after " / ".
 *
 * @param {object} book book record to update in place
 * @returns {*} whatever `retry` returns for the wrapped async task
 */
function getBookInfo(book) {
  // Run a script in the page; any failure yields 0 instead of a rejection.
  const evaluate = (script) => driver.executeScript(script).catch(() => 0);

  return retry(async () => {
    book.publisher = await evaluate(`return document.querySelector("span[itemprop=publisher]").textContent`);
    book.pubDate = await evaluate(`return document.querySelector("span[itemprop=datePublished]").textContent`);
    const resultsCount = await evaluate(`return document.querySelector("span[data-id=resultsCount]").textContent`);
    if (resultsCount) {
      book.pages = resultsCount.split(' / ')[1];
    }
  });
}
| | | |
| | | async function downloadBooks(books) { |
| | | driver = await createDriver(); |
| | | for (const book of books) { |
| | | |
| | | for (; ;) { |
| | | const book = await nextBook(); |
| | | if (!book) { |
| | | break; |
| | | } |
| | | books.push(book); |
| | | if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) { |
| | | // 定时退出 |
| | | break; |
| | | } |
| | | bookCount++; |
| | | if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { |
| | | // 跳过没有搜索结果或没有pdf或text文件的书籍 |
| | | /*if (isAlreadyDownloaded(book)) { |
| | | skipCount++; |
| | | continue; |
| | | } |
| | | if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { |
| | | // 跳过没有搜索结果或没有pdf或text文件的书籍 |
| | | skipCount++; |
| | | continue; |
| | | } */ |
| | | console.log(`开始下载: ${book.id} ${book.title}`); |
| | | // 打开搜索页面并搜索 |
| | | if (!await openSearchPage(book, true)) { |
| | |
| | | continue; |
| | | } |
| | | // 等一段时间再打开详情页 |
| | | sleep(getRandomNumber(3000, 10000)); |
| | | sleep(getRandomNumber(500, 10000)); |
| | | // 打开详情页 |
| | | await openBookDetailPage(book, detailPageUrl); |
| | | await getBookInfo(book); |
| | | // 获取下载链接 |
| | | const url = await getDownloadUrl(book); |
| | | if (!url) { continue; } |
| | | // 等待一段时间再下载 |
| | | await sleep(getRandomNumber(3000, 10000)); |
| | | await sleep(getRandomNumber(500, 10000)); |
| | | // 下载文件 |
| | | try { |
| | | await downloadFile(book, url); |
| | | console.log(`下载完成: ${book.id} ${book.title}`); |
| | | console.log('finish: ' + JSON.stringify(book)); |
| | | } catch (e) { } |
| | | successCount++; |
| | | // 等一段时间再下一个 |
| | | sleep(getRandomNumber(3000, 10000)); |
| | | sleep(getRandomNumber(500, 10000)); |
| | | } |
| | | } |
| | | |
| | |
| | | for (const book of books) { |
| | | const index = data.findIndex((row) => row[0] === book.id); |
| | | if (index > -1) { |
| | | data[index][5] = book.publisher; |
| | | data[index][6] = book.pubDate; |
| | | data[index][11] = book.pages; |
| | | data[index][12] = book.state; |
| | | data[index][13] = book.format; |
| | | data[index][14] = book.file; |
| | |
| | | } |
| | | |
| | | const buffer = xlsx.build([{ name: "Sheet1", data }]); |
| | | fs.writeFile("./【第二批二次处理后】交付清单.xlsx", buffer, (err) => { }); |
| | | fs.writeFileSync("./【第二批二次处理后】交付清单.xlsx", buffer, (err) => { }); |
| | | console.log("保存完成: ./【第二批二次处理后】交付清单.xlsx"); |
| | | } |
| | | |
| | |
| | | let skipCount = 0; |
// Chrome WebDriver instance
| | | let driver; |
| | | let alreadyDownloadedBooks = []; |
| | | |
/**
 * Collect the keys of books that are already downloaded.
 *
 * Sources: the non-empty lines of ./alreadyDownloadedBooks.txt and the file names found in
 * ./downloads. Each entry is reduced to its base name without extension and trimmed.
 * A missing list file or downloads directory is treated as empty rather than crashing,
 * and duplicate keys are removed.
 *
 * @returns {string[]} unique keys, list-file entries first
 */
function getAlreadyDownloadedBooks() {
  const listFile = './alreadyDownloadedBooks.txt';
  const downloadDir = './downloads';

  const listed = fs.existsSync(listFile)
    ? fs.readFileSync(listFile, 'utf-8').replace(/\r/g, '').split('\n')
    : [];
  const downloaded = fs.existsSync(downloadDir) ? fs.readdirSync(downloadDir) : [];

  // Concatenate by spreading into a new array: `push(...files)` passes every file name as a
  // call argument and can exceed the engine's argument limit on very large directories.
  const keys = [...listed, ...downloaded]
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '')
    .map((entry) => path.basename(entry, path.extname(entry)).trim());
  return [...new Set(keys)];
}
| | | |
| | | function main() { |
| | | initLogger(); |
| | | const books = getBooksFromExcel(config.startRow, config.endRow); |
| | | const books = []; |
| | | downloadBooks(books) |
| | | .then(() => { |
| | | console.log(`全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`); |
| | | console.log(`线程:${threadId}全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`); |
| | | }) |
| | | .catch(e => { |
| | | console.error(e); |
| | |
| | | }); |
| | | } |
| | | |
// Make sure the scratch and output directories exist. With `recursive: true`, mkdirSync
// succeeds when the directory is already there, so no existsSync pre-check is needed.
for (const dir of ['tmpdir', 'downloads']) {
  fs.mkdirSync(dir, { recursive: true });
}
| | | |
// Multi-threaded execution: the main thread hands books to worker threads
| | | if (isMainThread) { |
| | | initLogger(); |
| | | console.log(`线程数:${config.threadSize}, 开始行:${config.startRow}, 结束行:${config.endRow}`); |
| | | let startRow = config.startRow; |
| | | let endRow = config.endRow; |
| | | const alreadyDownloadedBooks = getAlreadyDownloadedBooks(); |
| | | const { startRow, endRow, threadSize } = config; |
| | | console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`); |
| | | let finishCnt = 0; |
| | | const finishBooks = []; |
| | | const threadSize = config.threadSize; |
| | | const thBookSize = endRow - startRow / threadSize |
| | | for (let sr = startRow; sr < endRow; sr += thBookSize) { |
| | | let er = sr + thBookSize; |
| | | if (er > endRow) { |
| | | er = endRow; |
| | | } |
| | | const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er } }); |
| | | const books = getBooksFromExcel(startRow, endRow); |
| | | |
| | | for (let i = 0; i < threadSize; i++) { |
| | | const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); |
| | | worker.on("message", (message) => { |
| | | if (message.type === 'books') { |
| | | finishBooks.push(...message.data); |
| | | finishCnt++; |
| | | if (finishCnt >= config.threadSize) { |
| | | if (finishCnt >= threadSize) { |
| | | saveBooks(finishBooks); |
| | | } |
| | | } else if (message.type === 'get-book') { |
| | | worker.postMessage({ type: "book", data: books.shift() }); |
| | | } |
| | | }); |
| | | } |
| | | } else { |
| | | config.startRow = workerData.startRow; |
| | | config.endRow = workerData.endRow; |
| | | alreadyDownloadedBooks = workerData.alreadyDownloadedBooks; |
| | | main(); |
| | | } |
| | | |
| | | // const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt"; |
| | | // let text = fs.readFileSync(filepath, 'utf8'); |
| | | // fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |