| | |
| | | return pages; |
| | | } |
| | | |
/**
 * Build the lookup table of every known word.
 *
 * Attaches the US place list and the person-name list to `wordsjs`, then
 * flattens every own word list found under `wordsjs.default` into one map.
 *
 * NOTE(review): the two extra lists are attached to `wordsjs` itself while only
 * `wordsjs.default` is iterated — confirm both refer to the same object,
 * otherwise those two lists never reach the result.
 *
 * @returns {Object<string, boolean>} prototype-less map whose keys are the known words
 */
function allWords() {
  // No prototype: inherited keys such as "constructor" must not read as known words.
  const words = Object.create(null);
  wordsjs.usPlaces = usPlaceList;
  wordsjs.usPeronNameList = usPeronNameList;
  // A missing `default` yields an empty map instead of throwing.
  for (const list of Object.values(wordsjs.default ?? {})) {
    for (const word of list) {
      words[word] = true;
    }
  }
  return words;
}
| | | |
// Known-word lookup table, built once when the module loads; read by incorrectWordRatio.
const wordsMap = allWords();
| | | |
/**
 * Count the words in a string.
 *
 * A word is any non-empty run of characters between space characters, so
 * leading, trailing and repeated spaces never produce extra words.
 *
 * @param {string} str text to measure
 * @returns {number} number of space-separated words (0 for an empty or blank string)
 */
function countWordSize(str) {
  let count = 0;
  for (const token of str.split(' ')) {
    // Empty tokens come from leading/trailing/consecutive spaces; they are not words.
    if (token !== '') {
      count++;
    }
  }
  return count;
}
| | | |
/**
 * Compute the share of words in a text that are not in the known-word table.
 *
 * One punctuation character directly following a letter is stripped before the
 * lookup, lookups are lower-cased, and any token containing a digit is treated
 * as correct.
 *
 * @param {string} text text to inspect
 * @returns {number} unknown words divided by total words, 0..1 (0 when the text has no words)
 */
function incorrectWordRatio(text) {
  const words = text
    .replace(/([a-zA-Z])[\.!?,;"')]/g, '$1')
    .split(' ')
    // Leading, trailing or repeated spaces must not count as (incorrect) words.
    .filter((word) => word !== '');
  if (words.length === 0) {
    return 0;
  }
  const incorrectWordCnt = words.filter(
    // `=== true` keeps inherited object keys (e.g. "constructor") from passing as known words;
    // toLowerCase avoids locale-specific case mappings (e.g. Turkish dotless i).
    (word) => wordsMap[word.toLowerCase()] !== true && !/\d/.test(word),
  ).length;
  return incorrectWordCnt / words.length;
}
| | | |
/**
 * Compute the share of characters that are neither ASCII letters, digits nor spaces.
 *
 * @param {string} text text to inspect
 * @returns {number} ratio in the range 0..1 (0 for an empty string)
 */
function symbolRatio(text) {
  // Guard the empty string: 0 / 0 would otherwise produce NaN.
  if (text.length === 0) {
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g) ?? [];
  return symbols.length / text.length;
}
| | | |
/**
 * Decide whether a single line should be discarded as noise.
 *
 * @param {string} line one line of text, without its newline
 * @returns {boolean} true when the line matches any drop rule
 */
function isNoisyLine(line) {
  const mostlyUnknown = incorrectWordRatio(line) > 0.65;
  // Many symbols AND mostly unknown words.
  if (mostlyUnknown && symbolRatio(line) > 0.2) {
    return true;
  }
  const wordSize = countWordSize(line);
  // Ignoring spaces: a non-digit character repeated 3+ times in a row,
  // on a short line or a line of mostly unknown words.
  if (/(\D)\1{2,}/.test(line.replace(/[ ]+/g, '')) && (wordSize < 5 || mostlyUnknown)) {
    return true;
  }
  // Three or more consecutive punctuation marks on a line of mostly unknown words.
  if (mostlyUnknown && /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/.test(line)) {
    return true;
  }
  // More than five words, most of them unknown.
  if (mostlyUnknown && wordSize > 5) {
    return true;
  }
  // Any mention of "google", in any letter case.
  return /google/i.test(line);
}

/**
 * Clean raw text by normalising whitespace and dropping noisy lines.
 *
 * Steps, in order:
 *  1. remove carriage returns;
 *  2. if the first 10000 characters contain "google", delete everything in that
 *     window up to and including the last "books.google.com" (spaces around the
 *     dots allowed, case-insensitive);
 *  3. collapse runs of spaces and runs of newlines, and join a line ending in
 *     "-" with the next line (the hyphen is kept at this step);
 *  4. drop every line rejected by isNoisyLine, then trim the survivors, strip
 *     the characters ■ • * ¬ » « ^ - from them, and drop results of length <= 1
 *     or exactly "Digitized by".
 *
 * NOTE(review): step 4 strips "-" and "*" anywhere in the line, not only as
 * decoration, so "well-known" becomes "wellknown" — confirm this is intended.
 *
 * @param {string} text text to clean
 * @returns {string} the kept lines joined with "\n"
 */
function cleanText(text) {
  let cleaned = text.replace(/\r/g, '');
  const head = cleaned.substring(0, 10000);
  if (head.includes('google')) {
    cleaned = head.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/i, '') + cleaned.substring(10000);
  }
  // The former "no line reaches 170 characters" branch was removed: it searched for a
  // 170+ character line only after establishing none exists, so it never changed the text.
  cleaned = cleaned
    .replace(/[ ]{2,}/g, ' ')
    .replace(/\n+/g, '\n')
    .replace(/-\n/g, '-');

  const kept = [];
  for (const line of cleaned.split('\n')) {
    if (isNoisyLine(line)) {
      continue;
    }
    const stripped = line.trim().replace(/[■•*¬»«^-]/g, '');
    if (stripped.length <= 1 || stripped === 'Digitized by') {
      continue;
    }
    kept.push(stripped);
  }
  return kept.join('\n');
}
| | | |
/**
 * Extract an archive and copy its largest entry to the target path.
 *
 * The archive is unpacked with the bundled `./7za.exe` into a scratch
 * directory named after the current `threadId`; the scratch directory is
 * removed afterwards, whether or not extraction and copying succeeded.
 *
 * @param {string} zipFile path of the archive to extract
 * @param {string} txtFile destination path for the largest extracted entry
 * @throws {Error} when 7za fails, the archive yields no entries, or the copy fails
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);
    // Pick the largest entry; on equal sizes the later directory entry wins.
    let largest = null;
    for (const name of fs.readdirSync(tmpdir)) {
      const { size } = fs.statSync(`${tmpdir}/${name}`);
      if (largest === null || size >= largest.size) {
        largest = { name, size };
      }
    }
    if (largest === null) {
      throw new Error(`archive produced no files: ${zipFile}`);
    }
    // `force` is the fs.cpSync option that overwrites an existing destination.
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    // Always clean up; `force` tolerates a scratch directory that was never created.
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}
| | | |
| | | /** |
| | | * 获取要下载熟图书信息 |
| | | * @param {number} startRow 起始行,包含 |
| | |
| | | return books; |
| | | } |
| | | |
/**
 * Turn a title into a "+"-joined search keyword of at most six words.
 *
 * @param {string} text title to search for
 * @param {boolean} titleWithNumbers keep digits when true; strip every digit when false
 * @returns {string} up to the first six non-empty words joined with "+"
 */
function formatKw(text, titleWithNumbers) {
  const source = titleWithNumbers ? text : text.replace(/\d/g, '');
  return source
    .split(/\s+/)
    // Stripped digits leave gaps; empty tokens must not produce "++" or use up the word budget.
    .filter((word) => word !== '')
    .slice(0, 6)
    .join('+');
}
| | | |
| | | |
| | | async function sleep(ms) { |
| | | return new Promise((resolve) => { |
| | | setTimeout(resolve, ms); |
| | |
| | | * @param {*} book |
| | | */ |
async function getBookDetailPageUrl(book) {
  // Searches the libgen.rs fiction catalogue by `book.title` and resolves to the first
  // library.lol detail-page link found in the response, or '' when there is no such
  // link or the request ultimately fails (the failure message is logged).
  //
  // NOTE(review): this block previously held two interleaved revisions (duplicate
  // `const url` / `const group`, an unreachable second `return`, two `.catch` chains).
  // The second line of each pair was kept — confirm that is the intended revision.

  // Encode the title so characters such as "&" or "#" cannot break the query string.
  const query = encodeURIComponent(book.title).replace(/%20/g, '+');
  const url = `https://libgen.rs/fiction/?q=${query}&criteria=title&language=&format=`;
  return retry(async () => {
    const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
    const group = /href="(http:\/\/library\.lol\/fiction\/[0-9a-zA-Z]+)"/.exec(resp.data);
    return group ? group[1] : '';
  }).catch((e) => {
    console.error(e.message);
    return '';
  });
}
| | | |
| | | async function openBookDetailPage(book, detailPageUrl) { |
| | |
| | | const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); |
| | | const html = cheerio.load(resp.data); |
| | | const trList = html('tr'); |
| | | const files = []; |
| | | let epubUrl = null; |
| | | let pdfUrl = null; |
| | | for (const tr of trList) { |
| | |
| | | async function getDownloadUrl(book, url) { |
| | | return await retry(async () => { |
| | | const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); |
| | | const group = /.*href="(get.php\?md5=[0-9a-f]+.*)".*/g.exec(resp.data); |
| | | const group = /.*href="(\S+)".*>GET<.*/g.exec(resp.data); |
| | | if (group) { |
| | | return `https://libgen.vg/${group[1]}`; |
| | | return `${group[1]}`; |
| | | } else { |
| | | return ''; |
| | | } |
| | |
| | | async function downloadFile(book, url) { |
| | | console.log(`下载文件: ${url}`); |
| | | await retry(() => { |
| | | const timeoutTime = 10 * 60 * 1000; |
| | | const timeoutTime = 1 * 60 * 1000; |
| | | const source = axios.CancelToken.source(); |
| | | const timeout = setTimeout(() => { |
| | | source.cancel("timeout"); |
| | |
| | | let ext = response.headers['content-disposition'].split('filename=')[1].split('.').pop() ?? ''; |
| | | ext = ext.substring(0, ext.length - 1); |
| | | |
| | | const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | const filepath = `./downloads/${book.id}.${ext}`; |
| | | book.url = url; |
| | | if (fs.existsSync(filepath)) { |
| | | book.state = `下载完成`; |
| | | book.format = ext; |
| | | book.file = filepath; |
| | | console.log(`下载完成:${filepath}`); |
| | | resolve(true); |
| | | return; |
| | | } |
| | | const stream = response.data; |
| | | const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; |
| | | const _filepath = `./downloads/${book.id}.${ext}`; |
| | | const out = fs.createWriteStream(_filepath); |
| | | stream.pipe(out); |
| | | stream.on("end", async () => { |
| | |
| | | book.format = ext; |
| | | book.file = filepath; |
| | | book.url = url; |
| | | book.pages = await getPdfPages(filepath).catch(e => 0); |
| | | // book.pages = await getPdfPages(filepath).catch(e => 0); |
| | | resolve(true); |
| | | }); |
| | | stream.on("error", (err) => { |
| | |
| | | reject(false); |
| | | } |
| | | })); |
| | | }).catch(e => { |
| | | }, 1).catch(e => { |
| | | book.state = "下载失败"; |
| | | console.log(`下载失败: ${book.id} ${book.title} ${url}`); |
| | | return false |
| | |
| | | // 等一段时间再打开详情页 |
| | | sleep(getRandomNumber(500, 1000)); |
| | | // 打开详情页,并获取下载链接 |
| | | const filePageUrl = await openBookDetailPage(book, detailPageUrl); |
| | | if (!filePageUrl) { |
| | | console.log(`没有文件: ${book.id} ${book.title}`); |
| | | continue; |
| | | } |
| | | const url = await getDownloadUrl(book, filePageUrl); |
| | | // const filePageUrl = await openBookDetailPage(book, detailPageUrl); |
| | | // if (!filePageUrl) { |
| | | // console.log(`没有文件: ${book.id} ${book.title}`); |
| | | // continue; |
| | | // } |
| | | const url = await getDownloadUrl(book, detailPageUrl); |
| | | if (!url) { |
| | | console.log(`没有文件: ${book.id} ${book.title}`); |
| | | continue; |