From 655f90e9e4544fdb8fa37ca0223fb686d4020b88 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期五, 14 六月 2024 22:35:37 +0800 Subject: [PATCH] txt版 --- src/main.mjs | 167 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 files changed, 135 insertions(+), 32 deletions(-) diff --git a/src/main.mjs b/src/main.mjs index 20703fd..4e7605b 100644 --- a/src/main.mjs +++ b/src/main.mjs @@ -9,7 +9,9 @@ import { HttpsProxyAgent } from "https-proxy-agent"; import { resolve } from "path"; import { execFileSync } from "child_process"; - +import wordsjs from 'wordlist-js'; +import usPlaceList from "./us-place-list.mjs"; +import usPeronNameList from "./us-pseron-name-list.mjs"; /*-------------璇诲彇閰嶇疆---------------*/ let config = JSON.parse(fs.readFileSync('./config.json')); @@ -35,10 +37,32 @@ httpsAgent, }); -function countChar(str, char) { +function allWords() { + const words = {}; + wordsjs.usPlaces = usPlaceList; + wordsjs.usPeronNameList = usPeronNameList; + for (const key in wordsjs.default) { + if (Object.hasOwnProperty.call(wordsjs.default, key)) { + for (const word of wordsjs.default[key]) { + words[word] = true; + } + } + } + return words; +} + +const wordsMap = allWords(); + +/** + * 缁熻鍗曡瘝鏁伴噺 + * @param {string} str 瀛楃涓� + * @returns 鍗曡瘝鏁伴噺 + */ +function countWordSize(str) { let count = 0; + str = str.replace(/[ ]{2,}/g, ' '); for (let i = 0; i < str.length; i++) { - if (str[i] === char) { + if (str[i] === ' ') { count++; } } @@ -46,34 +70,86 @@ } /** + * 鑾峰彇閿欒鍗曡瘝姣斾緥 + * @param {string} text 鏂囨湰 + * @returns 閿欒鍗曡瘝姣斾緥 + */ +function incorrectWordRatio(text) { + text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1"); + const words = text.split(' '); + const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length; + return incorrectWordCnt / words.length; +} + +/** + * 绗﹀彿鍗犳瘮 0 ~ 1 + * @param {string} text 鏂囨湰 + */ +function symbolRatio(text) { + // 闈炲瓧姣嶆暟瀛楀瓧绗﹀崰姣� + return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length; +} + +/** * 娓呯悊鏂囨湰 * @param {string} text 瑕佹竻鐞嗙殑鏂囨湰 */ function cleanText(text) { - if (text.includes('google')) { - text = text.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text = text.replace(/(\r)/g, ''); + const googlePage = text.substring(0, 10000); + if (googlePage.includes('google')) { + text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000); } - // 濡傛灉鏄痮cr璇嗗埆鐨勬枃鏈紝姣忚瀛楃鏁颁竴鑸笉浼氳秴杩�170 - if (!/.{170,}/g.test(text) || text.includes('google')) { - text = text.replace(/(\r|鈻�)/g, ''); - text = text.replace(/[ ]{2,}/g, ' ') - text = text.replace(/(.+)\n/g, '$1'); - text = text.replace(/\n+/g, '\n'); - text = text.replace(/-\n/g, '-'); - const lines = text.split('\n'); - const result = []; - for (const line of lines) { - const wordSize = countChar(line, ' '); - if (wordSize >= 10) { - if (!/.*[^a-z0-9\-]{6,}.*/gi.test(line)) { - result.push(line.trim()); - } + // if (!/.{170,}/g.test(text) || text.includes('google')) { + text = text.replace(/[ ]{2,}/g, ' ') + if (!/.{170,}/g.test(text)) { + // 姣忚涓嶈秴杩�170涓瓧绗� + text = text.replace(/(.{170,})\n/g, '$1'); + } + text = text.replace(/\n+/g, '\n'); + text = text.replace(/-\n/g, '-'); + const lines = text.split('\n'); + const result = []; + for (const line of lines) { + // 绗﹀彿姣斿お楂樼殑涓嶈 + const incorrectRatio = incorrectWordRatio(line); + if (symbolRatio(line) > 0.2) { + if (incorrectRatio > 0.65) { + continue; } } - return result.join('\n'); - } else { - return text; + // 鍘婚櫎绌烘牸鍚� 杩炵画閲嶅鍗曚釜瀛楃3娆″強浠ヤ笂涓嶈 + const wordSize = countWordSize(line); + if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) { + if (wordSize < 5 || incorrectRatio > 0.65) { + continue; + } + } + // 杩炵画涓変釜鏍囩偣绗﹀彿鍙婁互涓�,閿欒鐜囧ぇ浜�0.65涓嶈 + if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) { + continue; + } + // 鍗曡瘝鏁伴噺澶皯鐨勪笉瑕� + if (wordSize > 5 && incorrectRatio > 0.65) { + continue; + } + // 鏈塯oogle鐨勪笉瑕� + if (/.*(google).*/ig.test(line)) { + continue; + } + // 鍙湁涓�涓瓧绗︿笉瑕� + const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, ''); + if (ret.length <= 1) { + continue; + } + if (ret == 'Digitized by') { + continue; + } + result.push(ret); } + text = result.join('\n'); + // } + return text; } /** @@ -301,6 +377,21 @@ }); } +/** + * 浠嶩TML鎻愬彇鏂囨湰 + * @param {string} text html鏂囨湰 + * @returns 鏂囨湰 + */ +function getTextFromHtml(text) { + if (text.includes("<!DOCTYPE html>")) { + const s = text.indexOf('<pre>') + 6; + const e = text.indexOf('</pre>'); + text = text.substring(s, e); + // text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2]; + } + return text; +} + async function downloadFile(book, url) { console.log(`涓嬭浇鏂囦欢: ${url}`); const ext = url.split(".").pop().toLowerCase(); @@ -345,12 +436,11 @@ setTimeout(() => { if (ext === "gz" || ext === "zip") { unzip(_filepath, filepath); + fs.unlinkSync(_filepath); } let text = fs.readFileSync(filepath, 'utf-8'); - if (text.includes("<!DOCTYPE html>")) { - text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2]; - fs.writeFileSync(filepath, text, 'utf-8'); - } + text = getTextFromHtml(text); + fs.writeFileSync(filepath, text, 'utf-8'); try { fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); } catch (e) { @@ -412,6 +502,18 @@ }); } +function getBookInfo(book) { + return retry(async () => { + const publisher = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`); + const datePublished = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`); + let pages = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`); + pages = pages.split(' / ')[1]; + book.publisher = publisher; + book.pubDate = datePublished; + book.pages = pages; + }); +} + async function downloadBooks(books) { driver = await createDriver(); @@ -426,15 +528,15 @@ break; } bookCount++; - if (isAlreadyDownloaded(book)) { + /*if (isAlreadyDownloaded(book)) { skipCount++; continue; } - if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { + if (book.state && (book.state === "娌℃湁鎼滅储缁撴灉" || book.state === "娌℃湁pdf鎴杢ext鏂囦欢" || book.state === "涓嬭浇瀹屾垚")) { // 璺宠繃娌℃湁鎼滅储缁撴灉鎴栨病鏈塸df鎴杢ext鏂囦欢鐨勪功绫� skipCount++; continue; - } + } */ console.log(`寮�濮嬩笅杞�: ${book.id} ${book.title}`); // 鎵撳紑鎼滅储椤甸潰骞舵悳绱� if (!await openSearchPage(book, true)) { @@ -461,6 +563,7 @@ sleep(getRandomNumber(500, 10000)); // 鎵撳紑璇︽儏椤� await openBookDetailPage(book, detailPageUrl); + await getBookInfo(book); // 鑾峰彇涓嬭浇閾炬帴 const url = await getDownloadUrl(book); if (!url) { continue; } @@ -598,6 +701,6 @@ main(); } -// const filepath = "D:\\projects\\book-crawler\\downloads\\10231261 978-1-331-76167-9.txt"; -// const text = fs.readFileSync(filepath, 'utf8'); +// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt"; +// let text = fs.readFileSync(filepath, 'utf8'); // fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); \ No newline at end of file -- Gitblit v1.9.1