From 92c1b3abe15b82486427ef2e9e2455524e0c6c84 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期五, 14 六月 2024 23:01:48 +0800 Subject: [PATCH] 回写图书信息到Excel --- src/main.mjs | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 files changed, 201 insertions(+), 29 deletions(-) diff --git a/src/main.mjs b/src/main.mjs index 0e0a42f..bbf02b6 100644 --- a/src/main.mjs +++ b/src/main.mjs @@ -8,7 +8,10 @@ import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads'; import { HttpsProxyAgent } from "https-proxy-agent"; import { resolve } from "path"; - +import { execFileSync } from "child_process"; +import wordsjs from 'wordlist-js'; +import usPlaceList from "./us-place-list.mjs"; +import usPeronNameList from "./us-pseron-name-list.mjs"; /*-------------璇诲彇閰嶇疆---------------*/ let config = JSON.parse(fs.readFileSync('./config.json')); @@ -19,7 +22,7 @@ if (!fs.existsSync('./logs')) { fs.mkdirSync('./logs', { recursive: true }); } - logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); + logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' }); console.log = function (...text) { text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`; _log(text); @@ -33,6 +36,135 @@ proxy: false, httpsAgent, }); + +function allWords() { + const words = {}; + wordsjs.usPlaces = usPlaceList; + wordsjs.usPeronNameList = usPeronNameList; + for (const key in wordsjs.default) { + if (Object.hasOwnProperty.call(wordsjs.default, key)) { + for (const word of wordsjs.default[key]) { + words[word] = true; + } + } + } + return words; +} + +const wordsMap = allWords(); + +/** + * 缁熻鍗曡瘝鏁伴噺 + * @param {string} str 瀛楃涓� + * @returns 鍗曡瘝鏁伴噺 + */ +function countWordSize(str) { + let count = 0; + str = str.replace(/[ ]{2,}/g, ' '); + for (let i = 0; i < str.length; i++) { + if (str[i] === ' ') { + count++; + } + } + return count; +} + +/** + * 鑾峰彇閿欒鍗曡瘝姣斾緥 + * @param {string} text 鏂囨湰 + * @returns 閿欒鍗曡瘝姣斾緥 + */ +function incorrectWordRatio(text) { + text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1"); + const words = text.split(' '); + const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length; + return incorrectWordCnt / words.length; +} + +/** + * 绗﹀彿鍗犳瘮 0 ~ 1 + * @param {string} text 鏂囨湰 + */ +function symbolRatio(text) { + // 闈炲瓧姣嶆暟瀛楀瓧绗﹀崰姣� + return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length; +} + +/** + * 娓呯悊鏂囨湰 + * @param {string} text 瑕佹竻鐞嗙殑鏂囨湰 + */ +function cleanText(text) { + text = text.replace(/(\r)/g, ''); + const googlePage = text.substring(0, 10000); + if (googlePage.includes('google')) { + text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000); + } + // if (!/.{170,}/g.test(text) || text.includes('google')) { + text = text.replace(/[ ]{2,}/g, ' ') + if (!/.{170,}/g.test(text)) { + // 姣忚涓嶈秴杩�170涓瓧绗� + text = text.replace(/(.{170,})\n/g, '$1'); + } + text = text.replace(/\n+/g, '\n'); + text = text.replace(/-\n/g, '-'); + const lines = text.split('\n'); + const result = []; + for (const line of lines) { + // 绗﹀彿姣斿お楂樼殑涓嶈 + const incorrectRatio = incorrectWordRatio(line); + if (symbolRatio(line) > 0.2) { + if (incorrectRatio > 0.65) { + continue; + } + } + // 鍘婚櫎绌烘牸鍚� 杩炵画閲嶅鍗曚釜瀛楃3娆″強浠ヤ笂涓嶈 + const wordSize = countWordSize(line); + if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) { + if (wordSize < 5 || incorrectRatio > 0.65) { + continue; + } + } + // 杩炵画涓変釜鏍囩偣绗﹀彿鍙婁互涓�,閿欒鐜囧ぇ浜�0.65涓嶈 + if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) { + continue; + } + // 鍗曡瘝鏁伴噺澶皯鐨勪笉瑕� + if (wordSize > 5 && incorrectRatio > 0.65) { + continue; + } + // 鏈塯oogle鐨勪笉瑕� + if (/.*(google).*/ig.test(line)) { + continue; + } + // 鍙湁涓�涓瓧绗︿笉瑕� + const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, ''); + if (ret.length <= 1) { + continue; + } + if (ret == 'Digitized by') { + continue; + } + result.push(ret); + } + text = result.join('\n'); + // } + return text; +} + +/** + * 瑙e帇鏂囨湰鏂囦欢 + * @param {string} zipFile 鍘嬬缉鏂囦欢璺緞 + * @param {string} txtFile 鏂囨湰鏂囦欢璺緞 + */ +function unzip(zipFile, txtFile) { + const tmpdir = `./tmpdir/${threadId}`; + execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]) + const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file })) + .sort((a, b) => a.size.size - b.size.size).pop(); + fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true }); + fs.rmSync(`${tmpdir}`, { recursive: true }); +} /** * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅 @@ -99,14 +231,12 @@ * @returns 澶勭悊鍚庣殑鍏抽敭瀛� */ function formatKw(text, titleWithNumbers) { - // 鍙繚鐣欑┖鏍笺�佷腑鏂囥�佽嫳鏂囥�佹硶鏂囥�佸痉鏂囥�佸笇鑵婃枃 - const regex = /[^\u4e00-\u9fa5\w\s\d]/g; if (titleWithNumbers) { - text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f \d]/g, ""); + text = text; } else { - text = text.replace(/[^\u4e00-\u9fa5a-zA-Z\u00c0-\u024f ]/g, ""); + text = text.replace(/[\d]/g, ""); } - text = text.split(' ').slice(0, 10).join("+"); + text = text.split(' ').slice(0, 6).join("+"); return text; } @@ -135,10 +265,10 @@ * @param {*} book */ async function openSearchPage(book, titleWithNumbers) { - console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`); + console.log(`鎵撳紑鎼滅储: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`); return await retry(async () => { // 鑾峰彇椤甸潰 - const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`; + const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`; await driver.get(searchUrl); }).then(() => true) .catch(() => false); @@ -236,19 +366,34 @@ } else if (textUrl) { return textUrl; } else { - book.state = "娌℃湁pdf鎴杢ext鏂囦欢"; + book.state = "娌℃湁text鏂囦欢"; return '' } }) .catch(() => { - book.state = "娌℃湁pdf鎴杢ext鏂囦欢"; + book.state = "娌℃湁text鏂囦欢"; return ''; }); } +/** + * 浠嶩TML鎻愬彇鏂囨湰 + * @param {string} text html鏂囨湰 + * @returns 鏂囨湰 + */ +function getTextFromHtml(text) { + if (text.includes("<!DOCTYPE html>")) { + const s = text.indexOf('<pre>') + 6; + const e = text.indexOf('</pre>'); + text = text.substring(s, e); + // text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2]; + } + return text; +} + async function downloadFile(book, url) { console.log(`涓嬭浇鏂囦欢: ${url}`); - const ext = url.split(".").pop(); + const ext = url.split(".").pop().toLowerCase(); const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; if (fs.existsSync(filepath)) { book.state = `涓嬭浇瀹屾垚`; @@ -259,8 +404,13 @@ return; } await retry(() => { + const timeoutTime = 10 * 60 * 1000; + const source = axios.CancelToken.source(); + const timeout = setTimeout(() => { + source.cancel("timeout"); + }, timeoutTime); return new Promise((resolve, reject) => myAxios - .get(url, { responseType: "stream" }) + .get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token }) .then((response) => { const len = response.headers['content-length']; if (ext !== "pdf" && ext !== "txt" && len > 200 * 1024 * 1024) { @@ -272,17 +422,19 @@ return; } const stream = response.data; - const out = fs.createWriteStream(filepath); + const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + const out = fs.createWriteStream(_filepath); stream.pipe(out); stream.on("end", () => { + clearTimeout(timeout); book.state = `涓嬭浇瀹屾垚`; book.format = ext; book.file = filepath; book.url = url; - console.log(`涓嬭浇瀹屾垚锛�${filepath}`); resolve(true); }); stream.on("error", (err) => { + clearTimeout(timeout); console.error(err); book.state = "涓嬭浇澶辫触"; book.url = url; @@ -297,6 +449,7 @@ }); }) .catch((e) => { + clearTimeout(timeout); console.error(e); book.state = "涓嬭浇澶辫触"; book.url = url; @@ -327,6 +480,15 @@ }); } +function getBookInfo(book) { + return retry(async () => { + book.publisher = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`).catch(e => 0); + book.pubDate = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`).catch(e => 0); + let pages = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`).catch(e => 0); + if (pages) { book.pages = pages.split(' / ')[1]; } + }); +} + async function downloadBooks(books) { driver = await createDriver(); @@ -341,15 +503,15 @@ break; } bookCount++; - if (isAlreadyDownloaded(book)) { + /*if (isAlreadyDownloaded(book)) { skipCount++; continue; } - if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { + if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫� skipCount++; continue; - } + } */ console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`); // 鎵撳紑鎼滅储椤甸潰骞舵悳绱� if (!await openSearchPage(book, true)) { @@ -373,22 +535,24 @@ continue; } // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤� - sleep(getRandomNumber(1000, 30000)); + sleep(getRandomNumber(500, 10000)); // 鎵撳紑璇︽儏椤� await openBookDetailPage(book, detailPageUrl); + await getBookInfo(book); // 鑾峰彇涓嬭浇閾炬帴 const url = await getDownloadUrl(book); if (!url) { continue; } // 绛夊緟涓�娈垫椂闂村啀涓嬭浇 - await sleep(getRandomNumber(1000, 30000)); + await sleep(getRandomNumber(500, 10000)); // 涓嬭浇鏂囦欢 try { await downloadFile(book, url); console.log(`涓嬭浇瀹屾垚: ${book.id} ${book.title}`); + console.log('finish: ' + JSON.stringify(book)); } catch (e) { } successCount++; // 绛変竴娈垫椂闂村啀涓嬩竴涓� - sleep(getRandomNumber(1000, 30000)); + sleep(getRandomNumber(500, 10000)); } } @@ -400,6 +564,9 @@ for (const book of books) { const index = data.findIndex((row) => row[0] === book.id); if (index > -1) { + data[index][5] = book.publisher; + data[index][6] = book.pubDate; + data[index][11] = book.pages; data[index][12] = book.state; data[index][13] = book.format; data[index][14] = book.file; @@ -461,7 +628,7 @@ const books = []; downloadBooks(books) .then(() => { - console.log(`鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); + console.log(`绾跨▼锛�${threadId}鍏ㄩ儴瀹屾垚锛屽叡涓嬭浇${bookCount}鏈紝鎴愬姛涓嬭浇${successCount}鏈紝璺宠繃${skipCount}鏈紝澶辫触${bookCount - skipCount - successCount}鏈紝鑰楁椂锛� ${msFormat(Date.now() - startTime)}銆俙); }) .catch(e => { console.error(e); @@ -477,6 +644,13 @@ }); } +if (!fs.existsSync('tmpdir')) { + fs.mkdirSync('tmpdir', { recursive: true }); +} +if (!fs.existsSync('downloads')) { + fs.mkdirSync('downloads', { recursive: true }); +} + // 澶氳繘绋嬫墽琛� if (isMainThread) { initLogger(); @@ -485,15 +659,10 @@ console.log(`绾跨▼鏁帮細${threadSize}, 寮�濮嬭锛�${startRow}, 缁撴潫琛岋細${endRow}`); let finishCnt = 0; const finishBooks = []; - const thBookSize = (endRow - startRow) / threadSize; const books = getBooksFromExcel(startRow, endRow); - for (let sr = startRow; sr < endRow; sr += thBookSize) { - let er = sr + thBookSize; - if (er > endRow) { - er = endRow; - } - const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } }); + for (let i = 0; i < threadSize; i++) { + const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } }); worker.on("message", (message) => { if (message.type === 'books') { finishBooks.push(...message.data); @@ -511,3 +680,6 @@ main(); } +// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt"; +// let text = fs.readFileSync(filepath, 'utf8'); +// fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); \ No newline at end of file -- Gitblit v1.9.1