From fb4b60f782a4c263890d5d706aa61a3697fffca2 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期五, 14 六月 2024 00:53:21 +0800 Subject: [PATCH] 下载txt --- src/main.mjs | 107 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 files changed, 89 insertions(+), 18 deletions(-) diff --git a/src/main.mjs b/src/main.mjs index 7aca6af..ce09c61 100644 --- a/src/main.mjs +++ b/src/main.mjs @@ -8,6 +8,7 @@ import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; import { HttpsProxyAgent } from "https-proxy-agent"; import { resolve } from "path"; +import { execFileSync } from "child_process"; /*-------------璇诲彇閰嶇疆---------------*/ let config = JSON.parse(fs.readFileSync('./config.json')); @@ -19,7 +20,7 @@ if (!fs.existsSync('./logs')) { fs.mkdirSync('./logs', { recursive: true }); } - logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); + logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); console.log = function (...text) { text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`; _log(text); @@ -33,6 +34,61 @@ proxy: false, httpsAgent, }); + +function countChar(str, char) { + let count = 0; + for (let i = 0; i < str.length; i++) { + if (str[i] === char) { + count++; + } + } + return count; +} + +/** + * 娓呯悊鏂囨湰 + * @param {string} text 瑕佹竻鐞嗙殑鏂囨湰 + */ +function cleanText(text) { + if (text.includes('google')) { + text = text.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + } + // 濡傛灉鏄痮cr璇嗗埆鐨勬枃鏈紝姣忚瀛楃鏁颁竴鑸笉浼氳秴杩�170 + if (!/.{170,}/g.test(text) || text.includes('google')) { + text = text.replace(/(\r|鈻�)/g, ''); + text = text.replace(/[ ]{2,}/g, ' ') + text = text.replace(/(.+)\n/g, '$1'); + text = text.replace(/\n+/g, '\n'); + text = text.replace(/-\n/g, '-'); + const lines = text.split('\n'); + const result = []; + for (const line of lines) { + const wordSize = countChar(line, ' '); + if (wordSize >= 10) { + if (!/.*[^a-z0-9\-]{6,}.*/gi.test(line)) { + result.push(line.trim()); + } + } + } + return result.join('\n'); + } else { + return text; + } +} + +/** + * 瑙e帇鏂囨湰鏂囦欢 + * @param {string} zipFile 鍘嬬缉鏂囦欢璺緞 + * @param {string} txtFile 鏂囨湰鏂囦欢璺緞 + */ +function unzip(zipFile, txtFile) { + const tmpdir = `./tmpdir/${threadId}`; + execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]) + const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file })) + .sort((a, b) => a.size.size - b.size.size).pop(); + fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true }); + fs.rmSync(`${tmpdir}`, { recursive: true }); +} /** * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅 @@ -133,10 +189,10 @@ * @param {*} book */ async function openSearchPage(book, titleWithNumbers) { - console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); + console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`); return await retry(async () => { // 鑾峰彇椤甸潰 - const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; + const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`; await driver.get(searchUrl); }).then(() => true) .catch(() => false); @@ -229,25 +285,26 @@ } } - if (pdfUrl) { + /* if (pdfUrl) { return pdfUrl; - } else if (textUrl) { + } else */ + if (textUrl) { return textUrl; } else { - book.state = "娌℃湁pdf鎴杢ext鏂囦欢"; + book.state = "娌℃湁text鏂囦欢"; return '' } }) .catch(() => { - book.state = "娌℃湁pdf鎴杢ext鏂囦欢"; + book.state = "娌℃湁text鏂囦欢"; return ''; }); } async function downloadFile(book, url) { console.log(`涓嬭浇鏂囦欢: ${url}`); - const ext = url.split(".").pop(); - const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + const ext = url.split(".").pop().toLowerCase(); + const filepath = `./downloads/${book.id} ${book.isbn}.txt`; if (fs.existsSync(filepath)) { book.state = `涓嬭浇瀹屾垚`; book.format = ext; @@ -275,7 +332,8 @@ return; } const stream = response.data; - const out = fs.createWriteStream(filepath); + const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + const out = fs.createWriteStream(_filepath); stream.pipe(out); stream.on("end", () => { clearTimeout(timeout); @@ -284,6 +342,17 @@ book.file = filepath; book.url = url; console.log(`涓嬭浇瀹屾垚锛�${filepath}`); + setTimeout(() => { + if (ext === "gz" || ext === "zip") { + unzip(_filepath, filepath); + } + let text = fs.readFileSync(filepath, 'utf-8'); + if (text.includes("<!DOCTYPE html>")) { + text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2]; + fs.writeFileSync(filepath, text, 'utf-8'); + } + fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); + }, 1000); resolve(true); }); stream.on("error", (err) => { @@ -467,7 +536,7 @@ const books = []; downloadBooks(books) .then(() => { - console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); + console.log(`绾跨▼锛�${threadId}鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); }) .catch(e => { console.error(e); @@ -483,6 +552,13 @@ }); } +if (!fs.existsSync('tmpdir')) { + fs.mkdirSync('tmpdir', { recursive: true }); +} +if (!fs.existsSync('downloads')) { + fs.mkdirSync('downloads', { recursive: true }); +} + // 澶氳繘绋嬫墽琛� if (isMainThread) { initLogger(); @@ -491,15 +567,10 @@ console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`); let finishCnt = 0; const finishBooks = []; - const thBookSize = (endRow - startRow) / threadSize; const books = getBooksFromExcel(startRow, endRow); - for (let sr = startRow; sr < endRow; sr += thBookSize) { - let er = sr + thBookSize; - if (er > endRow) { - er = endRow; - } - const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } }); + for (let i = 0; i < threadSize; i++) { + const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); worker.on("message", (message) => { if (message.type === 'books') { finishBooks.push(...message.data); -- Gitblit v1.9.1