| | |
| | | import { HttpsProxyAgent } from "https-proxy-agent"; |
| | | import { resolve } from "path"; |
| | | import { execFileSync } from "child_process"; |
| | | |
| | | import wordsjs from 'wordlist-js'; |
| | | import usPlaceList from "./us-place-list.mjs"; |
| | | import usPeronNameList from "./us-pseron-name-list.mjs"; |
| | | /*-------------读取配置---------------*/ |
| | | let config = JSON.parse(fs.readFileSync('./config.json')); |
| | | |
| | |
| | | httpsAgent, |
| | | }); |
| | | |
| | | function countChar(str, char) { |
/**
 * Builds the lookup table of known words.
 *
 * Every entry of every own enumerable list found on `wordsjs.default` becomes
 * a key of the returned object with the value `true`.
 *
 * Side effect: attaches `usPlaceList` and `usPeronNameList` to the imported
 * `wordsjs` object as `usPlaces` / `usPeronNameList`.
 *
 * @returns {Object<string, boolean>} Plain object keyed by word.
 */
function allWords() {
  // NOTE(review): the two extra lists are attached to `wordsjs`, while the
  // loop below walks `wordsjs.default`. Unless both names refer to the same
  // object, the attached lists are never visited — confirm against the
  // export shape of `wordlist-js`.
  wordsjs.usPlaces = usPlaceList;
  wordsjs.usPeronNameList = usPeronNameList;

  const lookup = {};
  for (const listName in wordsjs.default) {
    // Ignore anything inherited through the prototype chain.
    if (!Object.hasOwnProperty.call(wordsjs.default, listName)) {
      continue;
    }
    for (const entry of wordsjs.default[listName]) {
      lookup[entry] = true;
    }
  }
  return lookup;
}
| | | |
| | | const wordsMap = allWords(); |
| | | |
| | | /** |
| | | * 统计单词数量 |
| | | * @param {string} str 字符串 |
| | | * @returns 单词数量 |
| | | */ |
| | | function countWordSize(str) { |
| | | let count = 0; |
| | | str = str.replace(/[ ]{2,}/g, ' '); |
| | | for (let i = 0; i < str.length; i++) { |
| | | if (str[i] === char) { |
| | | if (str[i] === ' ') { |
| | | count++; |
| | | } |
| | | } |
| | |
| | | } |
| | | |
/**
 * Computes the share of words in `text` that are not recognised.
 *
 * A word counts as recognised when its lower-cased form is an own key of
 * `wordsMap` with a truthy value, or when it contains at least one digit.
 * Runs of spaces are collapsed, and a single punctuation character directly
 * following a letter is stripped before the words are looked up.
 *
 * @param {string} text Text to inspect.
 * @returns {number} Ratio in the range 0..1; 0 when `text` contains no words.
 */
function incorrectWordRatio(text) {
  const words = text
    .replace(/[ ]+/g, ' ')
    .replace(/([a-zA-Z])[\.!?,;"')]/g, "$1")
    .split(' ')
    // Leading/trailing spaces yield empty tokens; they are not words and must
    // not be counted as misspellings.
    .filter((word) => word.length > 0);

  // No words at all: report 0 instead of dividing by zero.
  if (words.length === 0) {
    return 0;
  }

  const incorrectWordCnt = words.filter((word) => {
    const key = word.toLocaleLowerCase();
    // Own-property check so that names inherited from Object.prototype
    // (e.g. "constructor") are not mistaken for dictionary entries.
    const isKnown =
      Object.prototype.hasOwnProperty.call(wordsMap, key) && Boolean(wordsMap[key]);
    return !isKnown && !/\d/.test(word);
  }).length;

  return incorrectWordCnt / words.length;
}
| | | |
/**
 * Computes the share of characters in `text` that are symbols, i.e. anything
 * other than ASCII letters, ASCII digits and the space character.
 *
 * @param {string} text Text to inspect.
 * @returns {number} Ratio in the range 0..1; 0 for an empty string.
 */
function symbolRatio(text) {
  // An empty string would otherwise evaluate 0 / 0 and return NaN.
  if (text.length === 0) {
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g);
  const symbolCount = symbols === null ? 0 : symbols.length;
  return symbolCount / text.length;
}
| | | |
| | | /** |
| | | * 清理文本 |
| | | * @param {string} text 要清理的文本 |
| | | */ |
| | | function cleanText(text) { |
| | | if (text.includes('google')) { |
| | | text = text.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') |
| | | text = text.replace(/(\r)/g, ''); |
| | | const googlePage = text.substring(0, 10000); |
| | | if (googlePage.includes('google')) { |
| | | text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000); |
| | | } |
| | | // 如果是ocr识别的文本,每行字符数一般不会超过170 |
| | | if (!/.{170,}/g.test(text) || text.includes('google')) { |
| | | text = text.replace(/(\r|■)/g, ''); |
| | | // if (!/.{170,}/g.test(text) || text.includes('google')) { |
| | | text = text.replace(/[ ]{2,}/g, ' ') |
| | | text = text.replace(/(.+)\n/g, '$1'); |
| | | if (!/.{170,}/g.test(text)) { |
| | | // 每行不超过170个字符 |
| | | text = text.replace(/(.{170,})\n/g, '$1'); |
| | | } |
| | | text = text.replace(/\n+/g, '\n'); |
| | | text = text.replace(/-\n/g, '-'); |
| | | const lines = text.split('\n'); |
| | | const result = []; |
| | | for (const line of lines) { |
| | | const wordSize = countChar(line, ' '); |
| | | if (wordSize >= 10) { |
| | | if (!/.*[^a-z0-9\-]{6,}.*/gi.test(line)) { |
| | | result.push(line.trim()); |
| | | // 符号比太高的不要 |
| | | const incorrectRatio = incorrectWordRatio(line); |
| | | if (symbolRatio(line) > 0.2) { |
| | | if (incorrectRatio > 0.65) { |
| | | continue; |
| | | } |
| | | } |
| | | // 去除空格后 连续重复单个字符3次及以上不要 |
| | | const wordSize = countWordSize(line); |
| | | if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) { |
| | | if (wordSize < 5 || incorrectRatio > 0.65) { |
| | | continue; |
| | | } |
| | | return result.join('\n'); |
| | | } else { |
| | | } |
| | | // 连续三个标点符号及以上,错误率大于0.65不要 |
| | | if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/.test(line)) { |
| | | continue; |
| | | } |
| | | // 单词数量太少的不要 |
| | | if (wordSize > 5 && incorrectRatio > 0.65) { |
| | | continue; |
| | | } |
| | | // 有google的不要 |
| | | if (/.*(google).*/ig.test(line)) { |
| | | continue; |
| | | } |
| | | // 只有一个字符不要 |
| | | const ret = line.trim().replace(/[■•*¬»«^-]/g, ''); |
| | | if (ret.length <= 1) { |
| | | continue; |
| | | } |
| | | if (ret == 'Digitized by') { |
| | | continue; |
| | | } |
| | | result.push(ret); |
| | | } |
| | | text = result.join('\n'); |
| | | // } |
| | | return text; |
| | | } |
| | | } |
| | | |
| | | /** |
| | |
| | | }); |
| | | } |
| | | |
/**
 * Extracts the text held in the first `<pre>` element of an HTML document.
 *
 * Input that does not contain `<!DOCTYPE html>` is returned unchanged. An HTML
 * document without a `<pre>` tag is also returned unchanged, and a `<pre>`
 * without a closing tag yields everything after the opening tag.
 *
 * @param {string} text Raw file content, either HTML or plain text.
 * @returns {string} Content of the `<pre>` element, or `text` itself.
 */
function getTextFromHtml(text) {
  if (!text.includes("<!DOCTYPE html>")) {
    return text;
  }

  const openTag = '<pre>';
  const closeTag = '</pre>';

  const openAt = text.indexOf(openTag);
  if (openAt === -1) {
    // Nothing to extract; slicing with a -1 index would return garbage.
    return text;
  }

  let start = openAt + openTag.length;
  // Drop the single line break that may directly follow the opening tag,
  // without swallowing a real first character when there is none.
  if (text.startsWith('\r\n', start)) {
    start += 2;
  } else if (text[start] === '\n') {
    start += 1;
  }

  // Search for the closing tag only after the opening one.
  const closeAt = text.indexOf(closeTag, start);
  return closeAt === -1 ? text.substring(start) : text.substring(start, closeAt);
}
| | | |
| | | async function downloadFile(book, url) { |
| | | console.log(`下载文件: ${url}`); |
| | | const ext = url.split(".").pop().toLowerCase(); |
| | |
| | | setTimeout(() => { |
| | | if (ext === "gz" || ext === "zip") { |
| | | unzip(_filepath, filepath); |
| | | fs.unlinkSync(_filepath); |
| | | } |
| | | let text = fs.readFileSync(filepath, 'utf-8'); |
| | | if (text.includes("<!DOCTYPE html>")) { |
| | | text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2]; |
| | | text = getTextFromHtml(text); |
| | | fs.writeFileSync(filepath, text, 'utf-8'); |
| | | } |
| | | try { |
| | | fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |
| | | } catch (e) { |
| | |
| | | }); |
| | | } |
| | | |
/**
 * Reads publisher, publication date and page count from the page currently
 * loaded in `driver` and stores them on `book` as `publisher`, `pubDate` and
 * `pages`.
 *
 * The three values are read one after another via `driver.executeScript`.
 * The page count is the part after " / " in the results-count element.
 * The whole read is passed to `retry` (defined elsewhere in this file;
 * presumably it re-runs the callback on failure — confirm).
 *
 * @param {object} book Book record to update in place.
 * @returns {*} Whatever `retry` returns for the wrapped callback.
 */
function getBookInfo(book) {
  // Runs a one-line script in the page that returns the element's text.
  const readText = (selector) =>
    driver.executeScript(`return document.querySelector("${selector}").textContent`);

  return retry(async () => {
    const publisher = await readText('span[itemprop=publisher]');
    const pubDate = await readText('span[itemprop=datePublished]');
    const resultsCount = await readText('span[data-id=resultsCount]');
    // Derive the page count before touching `book`, so a failure here leaves
    // the record unmodified.
    const pages = resultsCount.split(' / ')[1];
    Object.assign(book, { publisher, pubDate, pages });
  });
}
| | | |
| | | async function downloadBooks(books) { |
| | | driver = await createDriver(); |
| | | |
| | |
| | | break; |
| | | } |
| | | bookCount++; |
| | | if (isAlreadyDownloaded(book)) { |
| | | /*if (isAlreadyDownloaded(book)) { |
| | | skipCount++; |
| | | continue; |
| | | } |
| | |
| | | // 跳过没有搜索结果或没有pdf或text文件的书籍 |
| | | skipCount++; |
| | | continue; |
| | | } |
| | | } */ |
| | | console.log(`开始下载: ${book.id} ${book.title}`); |
| | | // 打开搜索页面并搜索 |
| | | if (!await openSearchPage(book, true)) { |
| | |
| | | sleep(getRandomNumber(500, 10000)); |
| | | // 打开详情页 |
| | | await openBookDetailPage(book, detailPageUrl); |
| | | await getBookInfo(book); |
| | | // 获取下载链接 |
| | | const url = await getDownloadUrl(book); |
| | | if (!url) { continue; } |
| | |
| | | main(); |
| | | } |
| | | |
| | | // const filepath = "D:\\projects\\book-crawler\\downloads\\10231261 978-1-331-76167-9.txt"; |
| | | // const text = fs.readFileSync(filepath, 'utf8'); |
| | | // const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt"; |
| | | // let text = fs.readFileSync(filepath, 'utf8'); |
| | | // fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8'); |