From ce8cb9c851fa66c7c2902ceb57e369d3cecf1a28 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期四, 01 八月 2024 01:48:56 +0800 Subject: [PATCH] 复制bt下载的文件,bt任务控制 --- src/book-download.mjs | 159 ++-------------------------------------------------- 1 files changed, 8 insertions(+), 151 deletions(-) diff --git a/src/book-download.mjs b/src/book-download.mjs index f5760a9..386e84e 100644 --- a/src/book-download.mjs +++ b/src/book-download.mjs @@ -48,135 +48,6 @@ return pages; } -function allWords() { - const words = {}; - wordsjs.usPlaces = usPlaceList; - wordsjs.usPeronNameList = usPeronNameList; - for (const key in wordsjs.default) { - if (Object.hasOwnProperty.call(wordsjs.default, key)) { - for (const word of wordsjs.default[key]) { - words[word] = true; - } - } - } - return words; -} - -const wordsMap = allWords(); - -/** - * 缁熻鍗曡瘝鏁伴噺 - * @param {string} str 瀛楃涓� - * @returns 鍗曡瘝鏁伴噺 - */ -function countWordSize(str) { - let count = 0; - str = str.replace(/[ ]{2,}/g, ' '); - for (let i = 0; i < str.length; i++) { - if (str[i] === ' ') { - count++; - } - } - return count; -} - -/** - * 鑾峰彇閿欒鍗曡瘝姣斾緥 - * @param {string} text 鏂囨湰 - * @returns 閿欒鍗曡瘝姣斾緥 - */ -function incorrectWordRatio(text) { - text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1"); - const words = text.split(' '); - const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length; - return incorrectWordCnt / words.length; -} - -/** - * 绗﹀彿鍗犳瘮 0 ~ 1 - * @param {string} text 鏂囨湰 - */ -function symbolRatio(text) { - // 闈炲瓧姣嶆暟瀛楀瓧绗﹀崰姣� - return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length; -} - -/** - * 娓呯悊鏂囨湰 - * @param {string} text 瑕佹竻鐞嗙殑鏂囨湰 - */ -function cleanText(text) { - text = text.replace(/(\r)/g, ''); - const googlePage = text.substring(0, 10000); - if (googlePage.includes('google')) { - text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000); - } - // if (!/.{170,}/g.test(text) || text.includes('google')) { - text = text.replace(/[ ]{2,}/g, ' ') - if (!/.{170,}/g.test(text)) { - // 姣忚涓嶈秴杩�170涓瓧绗� - text = text.replace(/(.{170,})\n/g, '$1'); - } - text = text.replace(/\n+/g, '\n'); - text = text.replace(/-\n/g, '-'); - const lines = text.split('\n'); - const result = []; - for (const line of lines) { - // 绗﹀彿姣斿お楂樼殑涓嶈 - const incorrectRatio = incorrectWordRatio(line); - if (symbolRatio(line) > 0.2) { - if (incorrectRatio > 0.65) { - continue; - } - } - // 鍘婚櫎绌烘牸鍚� 杩炵画閲嶅鍗曚釜瀛楃3娆″強浠ヤ笂涓嶈 - const wordSize = countWordSize(line); - if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) { - if (wordSize < 5 || incorrectRatio > 0.65) { - continue; - } - } - // 杩炵画涓変釜鏍囩偣绗﹀彿鍙婁互涓�,閿欒鐜囧ぇ浜�0.65涓嶈 - if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) { - continue; - } - // 鍗曡瘝鏁伴噺澶皯鐨勪笉瑕� - if (wordSize > 5 && incorrectRatio > 0.65) { - continue; - } - // 鏈塯oogle鐨勪笉瑕� - if (/.*(google).*/ig.test(line)) { - continue; - } - // 鍙湁涓�涓瓧绗︿笉瑕� - const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, ''); - if (ret.length <= 1) { - continue; - } - if (ret == 'Digitized by') { - continue; - } - result.push(ret); - } - text = result.join('\n'); - // } - return text; -} - -/** - * 瑙e帇鏂囨湰鏂囦欢 - * @param {string} zipFile 鍘嬬缉鏂囦欢璺緞 - * @param {string} txtFile 鏂囨湰鏂囦欢璺緞 - */ -function unzip(zipFile, txtFile) { - const tmpdir = `./tmpdir/${threadId}`; - execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]) - const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file })) - .sort((a, b) => a.size.size - b.size.size).pop(); - fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true }); - fs.rmSync(`${tmpdir}`, { recursive: true }); -} - /** * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅 * @param {number} startRow 璧峰琛岋紝鍖呭惈 @@ -210,23 +81,6 @@ return books; } -/** - * 鏍煎紡鍖栧叧閿瓧 - * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛� - * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛� - * @returns 澶勭悊鍚庣殑鍏抽敭瀛� - */ -function formatKw(text, titleWithNumbers) { - if (titleWithNumbers) { - text = text; - } else { - text = text.replace(/[\d]/g, ""); - } - text = text.split(' ').slice(0, 6).join("+"); - return text; -} - - async function sleep(ms) { return new Promise((resolve) => { setTimeout(resolve, ms); @@ -251,7 +105,7 @@ * @param {*} book */ async function getBookDetailPageUrl(book) { - const url = `https://libgen.rs/fiction/?q=${book.title.replace(/ /g,'+')}&criteria=title&language=&format=`; + const url = `https://libgen.rs/fiction/?q=${book.title.replace(/ /g, '+')}&criteria=title&language=&format=`; return await retry(async () => { const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }) // const html = cheerio.load(resp.data); @@ -264,7 +118,10 @@ return '' } }) - .catch(() => ''); + .catch((e) => { + console.error(e.message); + return ''; + }); } async function openBookDetailPage(book, detailPageUrl) { @@ -273,7 +130,6 @@ const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); const html = cheerio.load(resp.data); const trList = html('tr'); - const files = []; let epubUrl = null; let pdfUrl = null; for (const tr of trList) { @@ -324,7 +180,7 @@ async function downloadFile(book, url) { console.log(`涓嬭浇鏂囦欢: ${url}`); await retry(() => { - const timeoutTime = 10 * 60 * 1000; + const timeoutTime = 1 * 60 * 1000; const source = axios.CancelToken.source(); const timeout = setTimeout(() => { source.cancel("timeout"); @@ -343,6 +199,7 @@ book.format = ext; book.file = filepath; console.log(`涓嬭浇瀹屾垚锛�${filepath}`); + resolve(true); return; } const stream = response.data; @@ -382,7 +239,7 @@ reject(false); } })); - }).catch(e => { + }, 1).catch(e => { book.state = "涓嬭浇澶辫触"; console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); return false -- Gitblit v1.9.1