~lyg/book-crawler.git - Gitblit

李玉刚 / book-crawler

图书批量下载

blame | 历史 | 补丁 | 提交 | 提交对比 | show whitespace

lyg

2024-06-14 655f90e9e4544fdb8fa37ca0223fb686d4020b88

 src/main.mjs

@@ -9,7 +9,9 @@
import { HttpsProxyAgent } from "https-proxy-agent";
import { resolve } from "path";
import { execFileSync } from "child_process";

import wordsjs from 'wordlist-js';
import usPlaceList from "./us-place-list.mjs";
import usPeronNameList from "./us-pseron-name-list.mjs";
/*-------------读取配置---------------*/
let config = JSON.parse(fs.readFileSync('./config.json'));

@@ -35,10 +37,32 @@
  httpsAgent,
});

function countChar(str, char) {
/**
 * Build the lookup table of known words.
 *
 * Collects every entry of every word list found under `wordsjs.default`, plus
 * the supplementary US place-name and person-name lists imported by this
 * module. The supplementary lists are added directly: attaching them to
 * `wordsjs` (as was done before) had no effect, because only the lists under
 * `wordsjs.default` are iterated.
 *
 * @returns {Object<string, boolean>} object whose keys are the known words
 */
function allWords() {
  const words = {};

  // Add every entry of one word list; anything that is not iterable is skipped.
  const addList = (list) => {
    if (list == null || typeof list[Symbol.iterator] !== 'function') {
      return;
    }
    for (const word of list) {
      words[word] = true;
      // Lookups lower-case the token first, so capitalized entries
      // (e.g. proper names) must also be reachable in lower case.
      if (typeof word === 'string') {
        words[word.toLowerCase()] = true;
      }
    }
  };

  for (const list of Object.values(wordsjs.default || {})) {
    addList(list);
  }
  addList(usPlaceList);
  addList(usPeronNameList);

  return words;
}

// Known-word lookup table, built once when this module is evaluated.
const wordsMap = allWords();

/**
 * 统计单词数量
 * @param {string} str 字符串
 * @returns 单词数量
 */
function countWordSize(str) {
  let count = 0;
  str = str.replace(/[ ]{2,}/g, ' ');
  for (let i = 0; i < str.length; i++) {
    if (str[i] === char) {
    if (str[i] === ' ') {
      count++;
    }
  }
@@ -46,34 +70,86 @@
}

/**
 * Compute the fraction of tokens in a text that are not recognized words.
 *
 * The text is split on spaces after collapsing runs of spaces and removing a
 * single punctuation character that directly follows a letter. A token counts
 * as incorrect when its lower-cased form is missing from `wordsMap` and it
 * contains no digit. Empty tokens (from leading or trailing spaces) are
 * ignored so they do not inflate the ratio.
 *
 * @param {string} text text to inspect
 * @returns {number} ratio in the range 0 ~ 1; 0 when the text has no tokens
 */
function incorrectWordRatio(text) {
  const words = text
    .replace(/[ ]+/g, ' ')
    .replace(/([a-zA-Z])[\.!?,;"')]/g, "$1")
    .split(' ')
    .filter((word) => word.length > 0);
  if (words.length === 0) {
    // Avoid reporting a blank line as 100% incorrect.
    return 0;
  }
  const incorrectWordCnt = words.filter(
    (word) => !wordsMap[word.toLowerCase()] && !/\d/.test(word)
  ).length;
  return incorrectWordCnt / words.length;
}

/**
 * Ratio of symbol characters in a text, in the range 0 ~ 1.
 *
 * A symbol is any character other than an ASCII letter, an ASCII digit or a
 * space.
 *
 * @param {string} text text to inspect
 * @returns {number} symbol count divided by text length; 0 for an empty text
 */
function symbolRatio(text) {
  if (text.length === 0) {
    // Guard against 0 / 0, which would yield NaN.
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g);
  return (symbols ? symbols.length : 0) / text.length;
}

/**
 * Clean up text.
 * @param {string} text text to clean
 * @returns {string} cleaned text
 */
function cleanText(text) {
  if (text.includes('google')) {
    text = text.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '')
  text = text.replace(/(\r)/g, '');
  const googlePage = text.substring(0, 10000);
  if (googlePage.includes('google')) {
    text = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + text.substring(10000);
  }
  // OCR-recognized text usually has no more than 170 characters per line
  if (!/.{170,}/g.test(text) || text.includes('google')) {
    text = text.replace(/(\r|■)/g, '');
  // if (!/.{170,}/g.test(text) || text.includes('google')) {
    text = text.replace(/[ ]{2,}/g, ' ')
    text = text.replace(/(.+)\n/g, '$1');
  if (!/.{170,}/g.test(text)) {
    // no line exceeds 170 characters
    text = text.replace(/(.{170,})\n/g, '$1');
  }
    text = text.replace(/\n+/g, '\n');
    text = text.replace(/-\n/g, '-');
    const lines = text.split('\n');
    const result = [];
    for (const line of lines) {
      const wordSize = countChar(line, ' ');
      if (wordSize >= 10) {
        if (!/.*[^a-z0-9\-]{6,}.*/gi.test(line)) {
          result.push(line.trim());
    // drop lines whose symbol ratio is too high
    const incorrectRatio = incorrectWordRatio(line);
    if (symbolRatio(line) > 0.2) {
      if (incorrectRatio > 0.65) {
        continue;
        }
      }
    // after removing spaces, drop lines where a single character repeats 3 or more times in a row
    const wordSize = countWordSize(line);
    if (/([\D])\1{2,}/.test(line.replace(/[ ]+/g, ''))) {
      if (wordSize < 5 || incorrectRatio > 0.65) {
        continue;
    }
    return result.join('\n');
  } else {
    }
    // drop lines with three or more consecutive punctuation marks and an error ratio above 0.65
    if (incorrectRatio > 0.65 && /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/.test(line)) {
      continue;
    }
    // (original comment: drop lines with too few words)
    // NOTE(review): the condition below drops lines with MORE than 5 words and an error ratio above 0.65 — confirm intent
    if (wordSize > 5 && incorrectRatio > 0.65) {
      continue;
    }
    // drop lines that mention google
    if (/.*(google).*/ig.test(line)) {
      continue;
    }
    // drop lines with at most one character left after trimming and stripping marker symbols
    const ret = line.trim().replace(/[■•*¬»«^-]/g, '');
    if (ret.length <= 1) {
      continue;
    }
    if (ret == 'Digitized by') {
      continue;
    }
    result.push(ret);
  }
  text = result.join('\n');
  // }
    return text;
  }
}

/**
@@ -301,6 +377,21 @@
    });
}

/**
 * Extract the text inside the first `<pre>` element of an HTML document.
 *
 * Input that does not contain `<!DOCTYPE html>` is returned unchanged, as is
 * an HTML document without a `<pre>` tag. One line break directly after the
 * opening tag is skipped; no other character is dropped. If the closing
 * `</pre>` is missing, everything after the opening tag is returned.
 *
 * @param {string} text HTML text (or already-plain text)
 * @returns {string} extracted text
 */
function getTextFromHtml(text) {
  if (!text.includes("<!DOCTYPE html>")) {
    return text;
  }
  const openTag = '<pre>';
  const closeTag = '</pre>';

  const openIndex = text.indexOf(openTag);
  if (openIndex === -1) {
    // Nothing to extract; do not return an arbitrary slice of the document.
    return text;
  }

  let start = openIndex + openTag.length;
  // Skip only a real line break after the tag, never a content character.
  if (text.startsWith('\r\n', start)) {
    start += 2;
  } else if (text[start] === '\n') {
    start += 1;
  }

  // Search for the closing tag after the opening one so end >= start.
  const end = text.indexOf(closeTag, start);
  return end === -1 ? text.substring(start) : text.substring(start, end);
}

async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);
  const ext = url.split(".").pop().toLowerCase();
@@ -345,12 +436,11 @@
          setTimeout(() => {
            if (ext === "gz" || ext === "zip") {
              unzip(_filepath, filepath);
              fs.unlinkSync(_filepath);
            }
            let text = fs.readFileSync(filepath, 'utf-8');
            if (text.includes("<!DOCTYPE html>")) {
              text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2];
            text = getTextFromHtml(text);
              fs.writeFileSync(filepath, text, 'utf-8');
            }
            try {
              fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
            } catch (e) {
@@ -412,6 +502,18 @@
  });
}

/**
 * Read publisher, publication date and page count through the driver and
 * store them on the given book.
 *
 * Three scripts are run one after another via `driver.executeScript`; the
 * page count is the part after ' / ' in the results-count text. The whole
 * operation is handed to `retry` (defined elsewhere; presumably it re-runs
 * the callback on failure — verify against its definition).
 *
 * @param {object} book book record; `publisher`, `pubDate` and `pages` are set on it
 * @returns whatever `retry` returns for the scraping callback
 */
function getBookInfo(book) {
  const scrapeDetails = async () => {
    const publisherText = await driver.executeScript(`return document.querySelector("span[itemprop=publisher]").textContent`);
    const publishedText = await driver.executeScript(`return document.querySelector("span[itemprop=datePublished]").textContent`);
    const resultsCountText = await driver.executeScript(`return document.querySelector("span[data-id=resultsCount]").textContent`);
    const pageTotal = resultsCountText.split(' / ')[1];
    // Write to the book only after every read has succeeded.
    Object.assign(book, {
      publisher: publisherText,
      pubDate: publishedText,
      pages: pageTotal,
    });
  };
  return retry(scrapeDetails);
}

async function downloadBooks(books) {
  driver = await createDriver();

@@ -426,7 +528,7 @@
      break;
    }
    bookCount++;
    if (isAlreadyDownloaded(book)) {
    /*if (isAlreadyDownloaded(book)) {
      skipCount++;
      continue;
    }
@@ -434,7 +536,7 @@
      // 跳过没有搜索结果或没有pdf或text文件的书籍
      skipCount++;
      continue;
    }
    } */
    console.log(`开始下载: ${book.id} ${book.title}`);
    // 打开搜索页面并搜索
    if (!await openSearchPage(book, true)) {
@@ -461,6 +563,7 @@
    sleep(getRandomNumber(500, 10000));
    // 打开详情页
    await openBookDetailPage(book, detailPageUrl);
    await getBookInfo(book);
    // 获取下载链接
    const url = await getDownloadUrl(book);
    if (!url) { continue; }
@@ -598,6 +701,6 @@
  main();
}

// const filepath = "D:\\projects\\book-crawler\\downloads\\10231261 978-1-331-76167-9.txt";
// const text = fs.readFileSync(filepath, 'utf8');
// const filepath = "D:\\projects\\book-crawler\\downloads\\10482686 978-1-333-27648-5.txt";
// let text = fs.readFileSync(filepath, 'utf8');
// fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');