| | |
| | | import proxy from "selenium-webdriver/proxy.js"; |
| | | import axios from "axios"; |
| | | import * as fs from "fs"; |
| | | import path from "path"; |
| | | import { Worker, isMainThread, parentPort, workerData } from 'worker_threads'; |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | import { resolve } from "path"; |
| | | |
/* ------------- Load configuration ------------- */
// Parsed once at module load. The worker branch at the bottom of the file later
// overwrites config.startRow / config.endRow with the slice assigned to that worker.
const configText = fs.readFileSync('./config.json').toString();
let config = JSON.parse(configText);
| | | |
/* ------------ Logging -------------- */
// The untouched console.log, captured once so that calling initLogger() more than once
// never stacks wrappers (which would duplicate timestamps and log-file lines).
const nativeConsoleLog = console.log;
// Write stream of the current log file; assigned by initLogger() and closed by main().
let logFile;

/**
 * Redirects console.log so that every message is prefixed with a local timestamp,
 * printed through the original console.log and appended to
 * ./logs/logs-<startRow>-<endRow>.log.
 *
 * Call it after config.startRow / config.endRow hold their final values, because
 * both are baked into the log file name. Calling it again switches to a new stream
 * and ends the previous one.
 *
 * @returns {void}
 */
function initLogger() {
  // recursive: true makes mkdirSync a no-op when the directory already exists
  fs.mkdirSync('./logs', { recursive: true });
  // Release the stream of an earlier initLogger() call, if any
  logFile?.end();
  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}.log`, { flags: 'a', encoding: 'utf8' });
  console.log = function (...parts) {
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    nativeConsoleLog(line);
    logFile.write(`${line}\n`);
  };
}
| | | |
/* ---------- axios proxy ---------- */
// Proxy agent pointed at the local proxy listening on 127.0.0.1:10809.
const proxyUrl = "http://127.0.0.1:10809";
const httpsAgent = new HttpsProxyAgent(proxyUrl);
| | |
/**
 * Builds the search keyword for an archive.org query from a book title.
 *
 * Every character other than a space, a CJK ideograph (U+4E00-U+9FA5), an ASCII
 * letter or a character in the accented-Latin range U+00C0-U+024F is removed.
 * Digits are kept only when `titleWithNumbers` is truthy. At most the first
 * 10 words are kept and they are joined with "+".
 *
 * @param {string} text Title to search for; null/undefined is treated as "".
 * @param {boolean} [titleWithNumbers=false] Whether digits stay in the keyword.
 * @returns {string} The cleaned keyword with words separated by "+".
 */
function formatKw(text, titleWithNumbers = false) {
  const disallowed = titleWithNumbers
    ? /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g
    : /[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g;
  return String(text ?? '')
    .replace(disallowed, '')
    .split(' ')
    // Removing characters can leave consecutive or trailing spaces; drop the resulting
    // empty words so they neither use up one of the 10 slots nor produce "++".
    .filter((word) => word !== '')
    .slice(0, 10)
    .join('+');
}
| | | |
| | | |
| | |
| | | * 打开搜索页面并搜索 |
| | | * @param {*} book |
| | | */ |
| | | async function openSearchPage(book) { |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title)}`); |
| | | async function openSearchPage(book, titleWithNumbers) { |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); |
| | | return await retry(async () => { |
| | | // 获取页面 |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title)}`; |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; |
| | | await driver.get(searchUrl); |
| | | }).then(() => true) |
| | | .catch(() => false); |
| | |
| | | await driver.wait( |
| | | until.elementLocated( |
| | | By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`) |
| | | ) |
| | | ), 15000 |
| | | ); |
| | | }) |
| | | .then(() => true) |
| | |
| | | return new Promise((resolve, reject) => myAxios |
| | | .get(url, { responseType: "stream" }) |
| | | .then((response) => { |
| | | const len = response.headers['content-length']; |
| | | if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { |
| | | // 不是pdf或txt文件,且文件大于200M,不下载 |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | reject(false); |
| | | return; |
| | | } |
| | | const stream = response.data; |
| | | const out = fs.createWriteStream(filepath); |
| | | stream.pipe(out); |
| | |
| | | console.error(err); |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | | console.log(`下载失败: ${book.id} ${book.title}`); |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | reject(false); |
| | | try { |
| | | out.close(); |
| | | fs.unlink(filepath,(e)=>console.error(e)); |
| | | fs.unlink(filepath, (e) => console.error(e)); |
| | | } catch (e) { |
| | | console.error(e); |
| | | } |
| | |
| | | console.error(e); |
| | | book.state = "下载失败"; |
| | | book.url = url; |
| | | console.log(`下载失败: ${book.id} ${book.title}`); |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | reject(false); |
| | | })); |
| | | }).catch(e => { |
| | |
| | | }); |
| | | } |
| | | |
/**
 * Tells whether a book is present in the module-level `alreadyDownloadedBooks` list.
 *
 * The lookup key is the book id and the ISBN separated by a single space.
 *
 * @param {{id: *, isbn: *}} book Book record to look up.
 * @returns {boolean} true when the key is found in the list.
 */
function isAlreadyDownloaded(book) {
  const { id, isbn } = book;
  const lookupKey = `${id} ${isbn}`;
  return alreadyDownloadedBooks.indexOf(lookupKey) !== -1;
}
| | | |
| | | async function downloadBooks(books) { |
| | | driver = await createDriver(); |
| | | for (const book of books) { |
| | | if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) { |
| | | // 定时退出 |
| | | break; |
| | | } |
| | | bookCount++; |
| | | if (isAlreadyDownloaded(book)) { |
| | | skipCount++; |
| | | continue; |
| | | } |
| | | if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) { |
| | | // 跳过没有搜索结果或没有pdf或text文件的书籍 |
| | | skipCount++; |
| | |
| | | } |
| | | console.log(`开始下载: ${book.id} ${book.title}`); |
| | | // 打开搜索页面并搜索 |
| | | if (!await openSearchPage(book)) { |
| | | console.log(`打开搜索页面失败: ${book.id} ${book.title}`); |
| | | book.state = "打开搜索页面失败"; |
| | | continue; |
| | | if (!await openSearchPage(book, true)) { |
| | | // 先用包含数字的关键字,如果没有结果再用不包含数字的关键字 |
| | | if (!await openSearchPage(book, false)) { |
| | | console.log(`打开搜索页面失败: ${book.id} ${book.title}`); |
| | | book.state = "打开搜索页面失败"; |
| | | continue; |
| | | } |
| | | } |
| | | // 检测搜索结果 |
| | | const hasBook = await checkSearchResult(book); |
| | |
| | | continue; |
| | | } |
| | | // 等一段时间再打开详情页 |
| | | sleep(getRandomNumber(3000, 10000)); |
| | | sleep(getRandomNumber(1000, 30000)); |
| | | // 打开详情页 |
| | | await openBookDetailPage(book, detailPageUrl); |
| | | // 获取下载链接 |
| | | const url = await getDownloadUrl(book); |
| | | if (!url) { continue; } |
| | | // 等待一段时间再下载 |
| | | await sleep(getRandomNumber(3000, 10000)); |
| | | await sleep(getRandomNumber(1000, 30000)); |
| | | // 下载文件 |
| | | try { |
| | | await downloadFile(book, url); |
| | | console.log(`下载完成: ${book.id} ${book.title}`); |
| | | }catch(e){} |
| | | } catch (e) { } |
| | | successCount++; |
| | | // 等一段时间再下一个 |
| | | sleep(getRandomNumber(3000, 10000)); |
| | | sleep(getRandomNumber(1000, 30000)); |
| | | } |
| | | } |
| | | |
| | |
// Number of books visited so far (downloadBooks increments it once per book)
let bookCount = 0;
// Number of skipped books: already downloaded, or previously marked as having no result
let skipCount = 0;
// Chrome WebDriver handle. Deliberately left unassigned here: downloadBooks() creates the
// driver and assigns it, so it must be a single reassignable `let` binding.
let driver;
// Keys of books that must not be downloaded again (compared against "<id> <isbn>")
let alreadyDownloadedBooks = [];
| | | |
/**
 * Collects the names of books that must not be downloaded again.
 *
 * Sources are the non-empty lines of ./alreadyDownloadedBooks.txt followed by the
 * file names found in ./downloads. Each entry is reduced to its base name without
 * extension and trimmed. A missing list file or downloads directory is treated as
 * empty so that a first run does not crash; any other I/O error is rethrown.
 *
 * @returns {string[]} Normalised entries, list-file lines first, then downloaded files.
 * @throws {Error} Any file-system error other than ENOENT.
 */
function getAlreadyDownloadedBooks() {
  // Runs `read` and substitutes `fallback` only when the path does not exist
  const readOrFallback = (read, fallback) => {
    try {
      return read();
    } catch (err) {
      if (err.code === 'ENOENT') {
        return fallback;
      }
      throw err;
    }
  };

  const listText = readOrFallback(() => fs.readFileSync('./alreadyDownloadedBooks.txt', 'utf-8'), '');
  const listedNames = listText
    .replace(/\r/g, '')
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line !== '');
  const downloadedFiles = readOrFallback(() => fs.readdirSync('./downloads'), []);

  return [...listedNames, ...downloadedFiles]
    .map((name) => path.basename(name, path.extname(name)).trim());
}
| | | |
| | | function main() { |
| | | initLogger(); |
| | | const books = getBooksFromExcel(config.startRow, config.endRow); |
| | | downloadBooks(books) |
| | | .then(() => { |
| | |
| | | console.error(e); |
| | | }) |
| | | .finally(async () => { |
| | | saveBooks(books); |
| | | // saveBooks(books); |
| | | parentPort.postMessage({ type: "books", data: books }); |
| | | logFile.close(); |
| | | try { |
| | | await driver.close(); |
| | | await driver.quit(); |
| | | }catch(e){} |
| | | } catch (e) { } |
| | | }); |
| | | } |
| | | |
// Multi-threaded execution.
// Main thread: splits [config.startRow, config.endRow) into chunks, starts one worker per
// chunk and saves the merged results once every worker has reported or terminated.
// Worker thread: narrows the config to its own chunk and runs main().
// main() is intentionally NOT called unconditionally: it posts to parentPort, which only
// exists inside a worker.
if (isMainThread) {
  initLogger();
  const alreadyDownloadedBooks = getAlreadyDownloadedBooks();
  console.log(`线程数:${config.threadSize}, 开始行:${config.startRow}, 结束行:${config.endRow}`);
  const { startRow, endRow } = config;
  // A missing, zero or fractional threadSize falls back to a sane whole number
  const threadSize = Math.max(1, Math.floor(Number(config.threadSize) || 1));
  // Whole rows per worker, so row bounds stay integers and the loop cannot drift
  const rowsPerWorker = Math.max(1, Math.ceil((endRow - startRow) / threadSize));
  const finishBooks = [];
  // Counts the workers actually started; the save triggers when it returns to zero
  let pendingWorkers = 0;

  const workerSettled = () => {
    pendingWorkers--;
    if (pendingWorkers === 0) {
      saveBooks(finishBooks);
    }
  };

  for (let sr = startRow; sr < endRow; sr += rowsPerWorker) {
    const er = Math.min(sr + rowsPerWorker, endRow);
    pendingWorkers++;
    // A worker may report, fail and exit; it must be counted exactly once
    let settled = false;
    const settleOnce = () => {
      if (!settled) {
        settled = true;
        workerSettled();
      }
    };
    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } });
    worker.on("message", (message) => {
      if (message.type === 'books') {
        // Plain loop instead of push(...data): spreading a very large array can overflow the stack
        for (const book of message.data) {
          finishBooks.push(book);
        }
        settleOnce();
      }
    });
    // Without an 'error' listener a crashing worker would throw in the main thread
    worker.on("error", (err) => {
      console.error(err);
      settleOnce();
    });
    // Covers a worker that terminates without ever posting its books
    worker.on("exit", settleOnce);
  }
} else {
  config.startRow = workerData.startRow;
  config.endRow = workerData.endRow;
  alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
  main();
}
| | | |