From ce8cb9c851fa66c7c2902ceb57e369d3cecf1a28 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期四, 01 八月 2024 01:48:56 +0800 Subject: [PATCH] 复制bt下载的文件,bt任务控制 --- src/book-download.mjs | 188 ++++++----------------------------------------- 1 files changed, 24 insertions(+), 164 deletions(-) diff --git a/src/book-download.mjs b/src/book-download.mjs index 861e2bf..386e84e 100644 --- a/src/book-download.mjs +++ b/src/book-download.mjs @@ -48,135 +48,6 @@ return pages; } -function allWords() { - const words = {}; - wordsjs.usPlaces = usPlaceList; - wordsjs.usPeronNameList = usPeronNameList; - for (const key in wordsjs.default) { - if (Object.hasOwnProperty.call(wordsjs.default, key)) { - for (const word of wordsjs.default[key]) { - words[word] = true; - } - } - } - return words; -} - -const wordsMap = allWords(); - -/** - * 缁熻鍗曡瘝鏁伴噺 - * @param {string} str 瀛楃涓� - * @returns 鍗曡瘝鏁伴噺 - */ -function countWordSize(str) { - let count = 0; - str = str.replace(/[ ]{2,}/g, ' '); - for (let i = 0; i < str.length; i++) { - if (str[i] === ' ') { - count++; - } - } - return count; -} - -/** - * 鑾峰彇閿欒鍗曡瘝姣斾緥 - * @param {string} text 鏂囨湰 - * @returns 閿欒鍗曡瘝姣斾緥 - */ -function incorrectWordRatio(text) { - text = text.replace(/[ ]+/g, ' ').replace(/([a-zA-Z])[\.!?,;"')]/g, "$1"); - const words = text.split(' '); - const incorrectWordCnt = words.filter(word => !wordsMap[word.toLocaleLowerCase()] && !/\d+/g.test(word)).length; - return incorrectWordCnt / words.length; -} - -/** - * 绗﹀彿鍗犳瘮 0 ~ 1 - * @param {string} text 鏂囨湰 - */ -function symbolRatio(text) { - // 闈炲瓧姣嶆暟瀛楀瓧绗﹀崰姣� - return (text.match(/[^a-zA-Z0-9 ]/g) || []).length / text.length; -} - -/** - * 娓呯悊鏂囨湰 - * @param {string} text 瑕佹竻鐞嗙殑鏂囨湰 - */ -function cleanText(text) { - text = text.replace(/(\r)/g, ''); - const googlePage = text.substring(0, 10000); - if (googlePage.includes('google')) { - text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000); - } - // if (!/.{170,}/g.test(text) || text.includes('google')) { - text = text.replace(/[ ]{2,}/g, ' ') - if (!/.{170,}/g.test(text)) { - // 姣忚涓嶈秴杩�170涓瓧绗� - text = text.replace(/(.{170,})\n/g, '$1'); - } - text = text.replace(/\n+/g, '\n'); - text = text.replace(/-\n/g, '-'); - const lines = text.split('\n'); - const result = []; - for (const line of lines) { - // 绗﹀彿姣斿お楂樼殑涓嶈 - const incorrectRatio = incorrectWordRatio(line); - if (symbolRatio(line) > 0.2) { - if (incorrectRatio > 0.65) { - continue; - } - } - // 鍘婚櫎绌烘牸鍚� 杩炵画閲嶅鍗曚釜瀛楃3娆″強浠ヤ笂涓嶈 - const wordSize = countWordSize(line); - if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) { - if (wordSize < 5 || incorrectRatio > 0.65) { - continue; - } - } - // 杩炵画涓変釜鏍囩偣绗﹀彿鍙婁互涓�,閿欒鐜囧ぇ浜�0.65涓嶈 - if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~鈥�*卢禄芦]){3,}/.test(line)) { - continue; - } - // 鍗曡瘝鏁伴噺澶皯鐨勪笉瑕� - if (wordSize > 5 && incorrectRatio > 0.65) { - continue; - } - // 鏈塯oogle鐨勪笉瑕� - if (/.*(google).*/ig.test(line)) { - continue; - } - // 鍙湁涓�涓瓧绗︿笉瑕� - const ret = line.trim().replace(/[鈻犫��*卢禄芦^-]/g, ''); - if (ret.length <= 1) { - continue; - } - if (ret == 'Digitized by') { - continue; - } - result.push(ret); - } - text = result.join('\n'); - // } - return text; -} - -/** - * 瑙e帇鏂囨湰鏂囦欢 - * @param {string} zipFile 鍘嬬缉鏂囦欢璺緞 - * @param {string} txtFile 鏂囨湰鏂囦欢璺緞 - */ -function unzip(zipFile, txtFile) { - const tmpdir = `./tmpdir/${threadId}`; - execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]) - const file = fs.readdirSync(tmpdir).map(file => ({ size: fs.statSync(`${tmpdir}/${file}`), name: file })) - .sort((a, b) => a.size.size - b.size.size).pop(); - fs.cpSync(`${tmpdir}/${file.name}`, txtFile, { overwrite: true }); - fs.rmSync(`${tmpdir}`, { recursive: true }); -} - /** * 鑾峰彇瑕佷笅杞界啛鍥句功淇℃伅 * @param {number} startRow 璧峰琛岋紝鍖呭惈 @@ -210,23 +81,6 @@ return books; } -/** - * 鏍煎紡鍖栧叧閿瓧 - * @param {string} text 瑕佹悳绱㈢殑鍏抽敭瀛� - * @param {boolean} titleWithNumbers 鏄惁鏍囬涓寘鍚暟瀛� - * @returns 澶勭悊鍚庣殑鍏抽敭瀛� - */ -function formatKw(text, titleWithNumbers) { - if (titleWithNumbers) { - text = text; - } else { - text = text.replace(/[\d]/g, ""); - } - text = text.split(' ').slice(0, 6).join("+"); - return text; -} - - async function sleep(ms) { return new Promise((resolve) => { setTimeout(resolve, ms); @@ -251,17 +105,23 @@ * @param {*} book */ async function getBookDetailPageUrl(book) { - const url = `https://libgen.vg/index.php?req=${book.title}&columns%5B%5D=t&topics%5B%5D=f&res=25&filesuns=all`; + const url = `https://libgen.rs/fiction/?q=${book.title.replace(/ /g, '+')}&criteria=title&language=&format=`; return await retry(async () => { const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }) - const group = /.*href="(edition.php\?id=\d+)".*/g.exec(resp.data); + // const html = cheerio.load(resp.data); + // const url = html('body > table > tbody > tr:nth-child(1) > td:nth-child(6) > ul > li:nth-child(1) > a')?.attr('href') ?? ''; + // return url; + const group = /.*href="(http:\/\/library.lol\/fiction\/[0-9a-zA-Z]+)".*/g.exec(resp.data); if (group) { - return `https://libgen.vg/${group[1]}`; + return `${group[1]}`; } else { return '' } }) - .catch(() => ''); + .catch((e) => { + console.error(e.message); + return ''; + }); } async function openBookDetailPage(book, detailPageUrl) { @@ -270,7 +130,6 @@ const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); const html = cheerio.load(resp.data); const trList = html('tr'); - const files = []; let epubUrl = null; let pdfUrl = null; for (const tr of trList) { @@ -304,9 +163,9 @@ async function getDownloadUrl(book, url) { return await retry(async () => { const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } }); - const group = /.*href="(get.php\?md5=[0-9a-f]+.*)".*/g.exec(resp.data); + const group = /.*href="(\S+)".*>GET<.*/g.exec(resp.data); if (group) { - return `https://libgen.vg/${group[1]}`; + return `${group[1]}`; } else { return ''; } @@ -321,7 +180,7 @@ async function downloadFile(book, url) { console.log(`涓嬭浇鏂囦欢: ${url}`); await retry(() => { - const timeoutTime = 10 * 60 * 1000; + const timeoutTime = 1 * 60 * 1000; const source = axios.CancelToken.source(); const timeout = setTimeout(() => { source.cancel("timeout"); @@ -333,17 +192,18 @@ let ext = response.headers['content-disposition'].split('filename=')[1].split('.').pop() ?? ''; ext = ext.substring(0, ext.length - 1); - const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + const filepath = `./downloads/${book.id}.${ext}`; book.url = url; if (fs.existsSync(filepath)) { book.state = `涓嬭浇瀹屾垚`; book.format = ext; book.file = filepath; console.log(`涓嬭浇瀹屾垚锛�${filepath}`); + resolve(true); return; } const stream = response.data; - const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`; + const _filepath = `./downloads/${book.id}.${ext}`; const out = fs.createWriteStream(_filepath); stream.pipe(out); stream.on("end", async () => { @@ -352,7 +212,7 @@ book.format = ext; book.file = filepath; book.url = url; - book.pages = await getPdfPages(filepath).catch(e => 0); + // book.pages = await getPdfPages(filepath).catch(e => 0); resolve(true); }); stream.on("error", (err) => { @@ -379,7 +239,7 @@ reject(false); } })); - }).catch(e => { + }, 1).catch(e => { book.state = "涓嬭浇澶辫触"; console.log(`涓嬭浇澶辫触: ${book.id} ${book.title} ${url}`); return false @@ -439,12 +299,12 @@ // 绛変竴娈垫椂闂村啀鎵撳紑璇︽儏椤� sleep(getRandomNumber(500, 1000)); // 鎵撳紑璇︽儏椤碉紝骞惰幏鍙栦笅杞介摼鎺� - const filePageUrl = await openBookDetailPage(book, detailPageUrl); - if (!filePageUrl) { - console.log(`娌℃湁鏂囦欢: ${book.id} ${book.title}`); - continue; - } - const url = await getDownloadUrl(book, filePageUrl); + // const filePageUrl = await openBookDetailPage(book, detailPageUrl); + // if (!filePageUrl) { + // console.log(`娌℃湁鏂囦欢: ${book.id} ${book.title}`); + // continue; + // } + const url = await getDownloadUrl(book, detailPageUrl); if (!url) { console.log(`娌℃湁鏂囦欢: ${book.id} ${book.title}`); continue; -- Gitblit v1.9.1