| | |
| | | import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | import { resolve } from "path"; |
| | | import { execFileSync } from "child_process"; |
| | | |
| | | /*-------------读取配置---------------*/ |
| | | let config = JSON.parse(fs.readFileSync('./config.json')); |
| | |
| | | if (!fs.existsSync('./logs')) { |
| | | fs.mkdirSync('./logs', { recursive: true }); |
| | | } |
| | | logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); |
| | | logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); |
| | | console.log = function (...text) { |
| | | text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`; |
| | | _log(text); |
| | |
| | | proxy: false, |
| | | httpsAgent, |
| | | }); |
| | | |
/**
 * Count how many positions of `str` hold a value strictly equal to `char`.
 *
 * The comparison is done per index (`str[i] === char`), so a `char` that is
 * longer than one UTF-16 code unit, or empty, never matches.
 *
 * @param {string} str Text to scan.
 * @param {string} char Single character to look for.
 * @returns {number} Number of matching positions.
 */
function countChar(str, char) {
  let matches = 0;
  let index = str.length;
  // Walk backwards; the visiting order does not affect the total.
  while (index-- > 0) {
    if (str[index] === char) {
      matches += 1;
    }
  }
  return matches;
}
| | | |
/**
 * Clean up text that looks like OCR output.
 *
 * 1. If the text contains "google", everything up to and including the last
 *    "books.google.com" (spaces around the dots tolerated, case-insensitive)
 *    is removed.
 * 2. If no line reaches 170 characters, or the text still contains "google",
 *    the text is treated as OCR output: `\r` and `■` are removed, runs of
 *    spaces are collapsed, single line breaks are removed, and only lines
 *    with at least 10 spaces and no run of 6+ characters outside
 *    `[a-z0-9-]` (case-insensitive) are kept.
 * 3. Otherwise the text is returned as it is after step 1.
 *
 * @param {string} text Text to clean.
 * @returns {string} The kept lines joined with "\n", or the text from step 1.
 */
function cleanText(text) {
  if (text.includes('google')) {
    // `[\s\S]` instead of `(.|\n)`: `.` never matches `\r`, so on CRLF input
    // the old pattern could not reach "books.google.com" from the start of
    // the text and the preamble was silently left in place.
    text = text.replace(/^[\s\S]*books *\. *google *\. *com/i, '');
  }

  // OCR text normally has no line of 170+ characters.
  const looksLikeOcr = !/.{170,}/.test(text) || text.includes('google');
  if (!looksLikeOcr) {
    return text;
  }

  const normalized = text
    .replace(/[\r■]/g, '')
    .replace(/ {2,}/g, ' ')
    // Drop the newline after every non-empty line; only the newlines of
    // blank lines survive and act as separators.
    .replace(/(.+)\n/g, '$1')
    .replace(/\n+/g, '\n')
    // Drop a remaining newline that directly follows a hyphen.
    .replace(/-\n/g, '-');

  const result = [];
  for (const line of normalized.split('\n')) {
    // Number of spaces in the line, used as a rough word count.
    const spaceCount = line.split(' ').length - 1;
    if (spaceCount >= 10 && !/[^a-z0-9-]{6,}/i.test(line)) {
      result.push(line.trim());
    }
  }
  return result.join('\n');
}
| | | |
/**
 * Extract an archive with the bundled 7-Zip binary and copy the largest
 * extracted file to `txtFile`.
 *
 * Extraction goes to a per-thread scratch directory (`./tmpdir/<threadId>`).
 * The directory is cleared before use and always removed afterwards, even
 * when extraction or copying fails.
 *
 * @param {string} zipFile Path of the archive to extract.
 * @param {string} txtFile Destination path for the extracted text file.
 * @throws {Error} If 7-Zip fails or the archive yields no regular file at
 *   the top level of the scratch directory.
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  // Leftovers from an interrupted run must not compete for "largest file".
  fs.rmSync(tmpdir, { recursive: true, force: true });
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);

    let largest = null;
    for (const name of fs.readdirSync(tmpdir)) {
      const stat = fs.statSync(`${tmpdir}/${name}`);
      // Only regular files can be copied to a single destination file.
      // `>=` keeps the last entry among equal sizes.
      if (stat.isFile() && (largest === null || stat.size >= largest.size)) {
        largest = { name, size: stat.size };
      }
    }
    if (largest === null) {
      throw new Error(`No file was extracted from ${zipFile}`);
    }

    // `force` is the documented overwrite switch for fs.cpSync.
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}
| | | |
| | | /** |
| | | * 获取要下载熟图书信息 |
| | |
| | | * @param {*} book |
| | | */ |
| | | async function openSearchPage(book, titleWithNumbers) { |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); |
| | | console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`); |
| | | return await retry(async () => { |
| | | // 获取页面 |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; |
| | | const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`; |
| | | await driver.get(searchUrl); |
| | | }).then(() => true) |
| | | .catch(() => false); |
| | |
| | | } |
| | | } |
| | | |
| | | if (pdfUrl) { |
| | | /* if (pdfUrl) { |
| | | return pdfUrl; |
| | | } else if (textUrl) { |
| | | } else */ |
| | | if (textUrl) { |
| | | return textUrl; |
| | | } else { |
| | | book.state = "没有pdf或text文件"; |
| | | book.state = "没有text文件"; |
| | | return '' |
| | | } |
| | | }) |
| | | .catch(() => { |
| | | book.state = "没有pdf或text文件"; |
| | | book.state = "没有text文件"; |
| | | return ''; |
| | | }); |
| | | } |
| | | |
| | | async function downloadFile(book, url) { |
| | | console.log(`下载文件: ${url}`); |
| | | const ext = url.split(".").pop(); |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | const ext = url.split(".").pop().toLowerCase(); |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.txt`; |
| | | if (fs.existsSync(filepath)) { |
| | | book.state = `下载完成`; |
| | | book.format = ext; |
| | |
| | | return; |
| | | } |
| | | const stream = response.data; |
| | | const out = fs.createWriteStream(filepath); |
| | | const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | const out = fs.createWriteStream(_filepath); |
| | | stream.pipe(out); |
| | | stream.on("end", () => { |
| | | clearTimeout(timeout); |
| | |
| | | book.file = filepath; |
| | | book.url = url; |
| | | console.log(`下载完成:${filepath}`); |
| | | setTimeout(() => { |
| | | if (ext === "gz" || ext === "zip") { |
| | | unzip(_filepath, filepath); |
| | | } |
| | | let text = fs.readFileSync(filepath, 'utf-8'); |
| | | if (text.includes("<!DOCTYPE html>")) { |
| | | text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2]; |
| | | fs.writeFileSync(filepath, text, 'utf-8'); |
| | | } |
| | | fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |
| | | }, 1000); |
| | | resolve(true); |
| | | }); |
| | | stream.on("error", (err) => { |
| | |
| | | const books = []; |
| | | downloadBooks(books) |
| | | .then(() => { |
| | | console.log(`全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`); |
| | | console.log(`线程:${threadId}全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`); |
| | | }) |
| | | .catch(e => { |
| | | console.error(e); |
| | |
| | | }); |
| | | } |
| | | |
// Make sure the scratch and download directories exist before any work starts.
for (const dir of ['tmpdir', 'downloads']) {
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir, { recursive: true });
  }
}
| | | |
| | | // 多进程执行 |
| | | if (isMainThread) { |
| | | initLogger(); |
| | |
| | | console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`); |
| | | let finishCnt = 0; |
| | | const finishBooks = []; |
| | | const thBookSize = (endRow - startRow) / threadSize; |
| | | const books = getBooksFromExcel(startRow, endRow); |
| | | |
| | | for (let sr = startRow; sr < endRow; sr += thBookSize) { |
| | | let er = sr + thBookSize; |
| | | if (er > endRow) { |
| | | er = endRow; |
| | | } |
| | | const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } }); |
| | | for (let i = 0; i < threadSize; i++) { |
| | | const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); |
| | | worker.on("message", (message) => { |
| | | if (message.type === 'books') { |
| | | finishBooks.push(...message.data); |