~lyg/book-crawler.git - Gitblit

李玉刚 / book-crawler

图书批量下载

blame | 历史 | 补丁 | 提交 | 提交对比 | ignore whitespace

增加数据库图书ISBN补全脚本

lyg

2024-07-30 6a6078c5d393bffda15e682994811468ff86963e

 src/book-download.mjs

@@ -48,135 +48,6 @@
  return pages;
}

/**
 * Build a lookup table of every known word.
 *
 * Attaches the US place-name and person-name lists to `wordsjs`, then visits
 * each own enumerable key of `wordsjs.default` and records every word listed
 * under it.
 *
 * NOTE(review): the two lists are attached to `wordsjs` itself while the loop
 * reads `wordsjs.default` — confirm the extra lists are actually reachable
 * from there.
 *
 * @returns {Object<string, boolean>} object keyed by word, every value `true`
 */
function allWords() {
  wordsjs.usPlaces = usPlaceList;
  wordsjs.usPeronNameList = usPeronNameList;

  const lookup = {};
  const dictionary = wordsjs.default;
  // `?? {}` keeps a missing dictionary a silent no-op, as a for...in would be.
  Object.keys(dictionary ?? {}).forEach((listName) => {
    for (const entry of dictionary[listName]) {
      lookup[entry] = true;
    }
  });
  return lookup;
}

// Word lookup table, built once when the module is evaluated.
const wordsMap = allWords();

/**
 * Estimate how many words a string contains.
 *
 * Runs of two or more spaces are collapsed to a single space, then the
 * remaining space characters are counted. The result is therefore the number
 * of separators rather than tokens: "a b c" yields 2, and leading or trailing
 * spaces are counted as well.
 *
 * @param {string} str input string
 * @returns {number} number of spaces left after collapsing repeated spaces
 */
function countWordSize(str) {
  const collapsed = str.replace(/[ ]{2,}/g, ' ');
  // Splitting on a space yields one more piece than there are spaces.
  return collapsed.split(' ').length - 1;
}

/**
 * Compute the share of tokens in a text that are not recognised words.
 *
 * The text is normalised by collapsing runs of spaces and dropping one
 * punctuation character that directly follows a letter, then split on single
 * spaces. A token counts as incorrect when its lower-cased form has no truthy
 * entry in `wordsMap` and it contains no digit. Empty tokens (for example from
 * a leading space, or an empty input) are counted as incorrect too.
 *
 * @param {string} text text to inspect
 * @returns {number} incorrect tokens divided by total tokens
 */
function incorrectWordRatio(text) {
  const normalized = text
    .replace(/[ ]+/g, ' ')
    .replace(/([a-zA-Z])[\.!?,;"')]/g, "$1");
  const tokens = normalized.split(' ');

  let unknown = 0;
  for (const token of tokens) {
    const isKnown = Boolean(wordsMap[token.toLocaleLowerCase()]);
    const hasDigit = /\d+/g.test(token);
    if (!isKnown && !hasDigit) {
      unknown += 1;
    }
  }
  return unknown / tokens.length;
}

/**
 * Compute the share of characters that are symbols, in the range 0 to 1.
 *
 * A symbol is any character other than an ASCII letter, an ASCII digit or a
 * space.
 *
 * @param {string} text text to inspect
 * @returns {number} symbol count divided by text length; 0 for an empty string
 */
function symbolRatio(text) {
  // Guard the empty string: 0 / 0 would otherwise yield NaN.
  if (text.length === 0) {
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g) ?? [];
  return symbols.length / text.length;
}

/**
 * Clean up extracted book text by normalising whitespace and dropping lines
 * that look like scanning noise or Google Books boilerplate.
 *
 * Steps, in order:
 *  1. Remove carriage returns.
 *  2. If the first 10000 characters contain "google", cut everything from the
 *     start of that prefix through the last "books.google.com" found in it
 *     (spaces around the dots allowed, case-insensitive).
 *  3. Collapse repeated spaces and repeated newlines; join a line ending in
 *     "-" with the following line.
 *  4. Filter line by line using the heuristics commented below, and strip a
 *     fixed set of decoration characters from the lines that are kept.
 *
 * @param {string} text text to clean
 * @returns {string} the kept lines joined with "\n"
 */
function cleanText(text) {
  // Normalise line endings by removing every carriage return.
  text = text.replace(/(\r)/g, '');
  // Only the first 10000 characters are searched for the Google Books preamble.
  const googlePage = text.substring(0, 10000);
  if (googlePage.includes('google')) {
    // The leading greedy group makes this cut run through the LAST
    // "books.google.com" match inside the prefix; the rest of the text is
    // appended unchanged.
    text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000);
  }
  // Disabled guard, kept for reference:
  // if (!/.{170,}/g.test(text) || text.includes('google')) {
  // Collapse runs of two or more spaces into one.
  text = text.replace(/[ ]{2,}/g, ' ')
  if (!/.{170,}/g.test(text)) {
    // Original note: "each line has no more than 170 characters".
    // NOTE(review): this branch only runs when no line reaches 170 characters,
    // yet the pattern below needs a 170+ character line to match, so the
    // replace looks like a no-op — confirm the intended condition.
    text = text.replace(/(.{170,})\n/g, '$1');
  }
  // Collapse consecutive newlines, which also removes empty lines.
  text = text.replace(/\n+/g, '\n');
  // Join a line that ends with a hyphen to the next line (the hyphen is kept
  // here; hyphens are stripped from kept lines further down).
  text = text.replace(/-\n/g, '-');
  const lines = text.split('\n');
  const result = [];
  for (const line of lines) {
    // Drop lines with a high symbol share (> 0.2) that are also mostly
    // unrecognised words (> 0.65).
    const incorrectRatio = incorrectWordRatio(line);
    if (symbolRatio(line) > 0.2) {
      if (incorrectRatio > 0.65) {
        continue;
      }
    }
    // With spaces removed, a non-digit character repeated three or more times
    // in a row marks the line as suspect; drop it when it is short
    // (wordSize < 5) or mostly unrecognised words.
    const wordSize = countWordSize(line);
    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) {
      if (wordSize < 5 || incorrectRatio > 0.65) {
        continue;
      }
    }
    // Drop lines that are mostly unrecognised words and contain three or more
    // consecutive punctuation characters.
    if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/.test(line)) {
      continue;
    }
    // Original note: "drop lines with too few words".
    // NOTE(review): the condition actually drops lines with MORE than 5 words
    // (by wordSize) whose unrecognised-word ratio exceeds 0.65 — confirm
    // which is intended.
    if (wordSize > 5 && incorrectRatio > 0.65) {
      continue;
    }
    // Drop any line that mentions "google" (case-insensitive).
    if (/.*(google).*/ig.test(line)) {
      continue;
    }
    // Trim the line and strip decoration characters (including every "-", "*"
    // and "^"); drop it if at most one character remains.
    const ret = line.trim().replace(/[■•*¬»«^-]/g, '');
    if (ret.length <= 1) {
      continue;
    }
    // Drop the scanner watermark line.
    if (ret == 'Digitized by') {
      continue;
    }
    result.push(ret);
  }
  text = result.join('\n');
  // } (end of the disabled guard above)
  return text;
}

/**
 * Extract an archive and copy its largest top-level entry to `txtFile`.
 *
 * The archive is unpacked with `./7za.exe` into a scratch directory named
 * after the current `threadId`. The scratch directory is always removed
 * afterwards, including when extraction or copying fails.
 *
 * @param {string} zipFile path of the archive to extract
 * @param {string} txtFile destination path for the extracted file
 * @throws {Error} when the archive yields no entries, or when 7za or a file
 *   system call fails
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  try {
    // `x` extracts with full paths; `-aoa` overwrites existing files silently.
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);

    const entries = fs.readdirSync(tmpdir).map((name) => ({
      name,
      size: fs.statSync(`${tmpdir}/${name}`).size,
    }));
    if (entries.length === 0) {
      throw new Error(`no files extracted from ${zipFile}`);
    }

    // Pick the largest entry; `>=` keeps the last one on ties, matching an
    // ascending stable sort followed by pop().
    const largest = entries.reduce(
      (best, entry) => (entry.size >= best.size ? entry : best),
      entries[0],
    );

    // `force` is the fs.cpSync option that allows replacing an existing file.
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    // `force` avoids masking the original error if the directory was never created.
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}

/**
 * 获取要下载的图书信息
 * @param {number} startRow 起始行，包含
@@ -210,23 +81,6 @@
  return books;
}

/**
 * Turn a title into a search keyword string.
 *
 * Digits are removed unless `titleWithNumbers` is set. The text is then split
 * on spaces, empty tokens are discarded (so stripped digits or repeated spaces
 * never produce "++" or use up a keyword slot), and at most the first six
 * tokens are joined with "+".
 *
 * @param {string} text keyword text to search for
 * @param {boolean} titleWithNumbers whether digits in the title should be kept
 * @returns {string} up to six tokens joined with "+"
 */
function formatKw(text, titleWithNumbers) {
  const source = titleWithNumbers ? text : text.replace(/\d/g, '');
  return source
    .split(' ')
    .filter((token) => token.length > 0)
    .slice(0, 6)
    .join('+');
}


async function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
@@ -251,17 +105,23 @@
 * @param {*} book 
 */
async function getBookDetailPageUrl(book) {
  const url = `https://libgen.vg/index.php?req=${book.title}&columns%5B%5D=t&topics%5B%5D=f&res=25&filesuns=all`;
  const url = `https://libgen.rs/fiction/?q=${book.title.replace(/ /g, '+')}&criteria=title&language=&format=`;
  return await retry(async () => {
    const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } })
    const group = /.*href="(edition.php\?id=\d+)".*/g.exec(resp.data);
    // const html = cheerio.load(resp.data);
    // const url = html('body > table > tbody > tr:nth-child(1) > td:nth-child(6) > ul > li:nth-child(1) > a')?.attr('href') ?? '';
    // return url;
    const group = /.*href="(http:\/\/library.lol\/fiction\/[0-9a-zA-Z]+)".*/g.exec(resp.data);
    if (group) {
      return `https://libgen.vg/${group[1]}`;
      return `${group[1]}`;
    } else {
      return ''
    }
  })
    .catch(() => '');
    .catch((e) => {
      console.error(e.message);
      return '';
    });
}

async function openBookDetailPage(book, detailPageUrl) {
@@ -270,7 +130,6 @@
    const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
    const html = cheerio.load(resp.data);
    const trList = html('tr');
    const files = [];
    let epubUrl = null;
    let pdfUrl = null;
    for (const tr of trList) {
@@ -304,9 +163,9 @@
async function getDownloadUrl(book, url) {
  return await retry(async () => {
    const resp = await myAxios.get(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
    const group = /.*href="(get.php\?md5=[0-9a-f]+.*)".*/g.exec(resp.data);
    const group = /.*href="(\S+)".*>GET<.*/g.exec(resp.data);
    if (group) {
      return `https://libgen.vg/${group[1]}`;
      return `${group[1]}`;
    } else {
      return '';
    }
@@ -321,7 +180,7 @@
async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);
  await retry(() => {
    const timeoutTime = 10 * 60 * 1000;
    const timeoutTime = 1 * 60 * 1000;
    const source = axios.CancelToken.source();
    const timeout = setTimeout(() => {
      source.cancel("timeout");
@@ -333,17 +192,18 @@
        let ext = response.headers['content-disposition'].split('filename=')[1].split('.').pop() ?? '';
        ext = ext.substring(0, ext.length - 1);

        const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
        const filepath = `./downloads/${book.id}.${ext}`;
        book.url = url;
        if (fs.existsSync(filepath)) {
          book.state = `下载完成`;
          book.format = ext;
          book.file = filepath;
          console.log(`下载完成：${filepath}`);
          resolve(true);
          return;
        }
        const stream = response.data;
        const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
        const _filepath = `./downloads/${book.id}.${ext}`;
        const out = fs.createWriteStream(_filepath);
        stream.pipe(out);
        stream.on("end", async () => {
@@ -352,7 +212,7 @@
          book.format = ext;
          book.file = filepath;
          book.url = url;
          book.pages = await getPdfPages(filepath).catch(e => 0);
          // book.pages = await getPdfPages(filepath).catch(e => 0);
          resolve(true);
        });
        stream.on("error", (err) => {
@@ -379,7 +239,7 @@
          reject(false);
        }
      }));
  }).catch(e => {
  }, 1).catch(e => {
    book.state = "下载失败";
    console.log(`下载失败: ${book.id} ${book.title} ${url}`);
    return false
@@ -439,12 +299,12 @@
    // 等一段时间再打开详情页
    sleep(getRandomNumber(500, 1000));
    // 打开详情页，并获取下载链接
    const filePageUrl = await openBookDetailPage(book, detailPageUrl);
    if (!filePageUrl) {
      console.log(`没有文件: ${book.id} ${book.title}`);
      continue;
    }
    const url = await getDownloadUrl(book, filePageUrl);
    // const filePageUrl = await openBookDetailPage(book, detailPageUrl);
    // if (!filePageUrl) {
    //   console.log(`没有文件: ${book.id} ${book.title}`);
    //   continue;
    // }
    const url = await getDownloadUrl(book, detailPageUrl);
    if (!url) {
      console.log(`没有文件: ${book.id} ${book.title}`);
      continue;