~lyg/book-crawler.git

import xlsx from "node-xlsx";
import axios from "axios";
import * as fs from "fs";
import path from "path";
import { pipeline } from "stream/promises";
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
import { HttpsProxyAgent } from "https-proxy-agent";
import { execFileSync } from "child_process";
import wordsjs from 'wordlist-js';
import usPlaceList from "./us-place-list.mjs";
import usPeronNameList from "./us-pseron-name-list.mjs";
import * as pdfLib from 'pdf-lib';
 
/* ------------- Load configuration --------------- */
// Parsed once at module load from ./config.json (resolved against the process
// working directory). Code further down reads startRow, endRow, threadSize and
// endOfTime from this object.
let config = JSON.parse(fs.readFileSync('./config.json'));
 
/* ------------ Logging -------------- */
// Append-mode write stream for the current thread's log file; assigned by initLogger().
let logFile;

/**
 * Replaces console.log with a wrapper that prefixes every message with the
 * local date/time, prints it through the original console.log and appends the
 * same line to ./logs/logs-thread<threadId>.log (the directory is created
 * when missing). Arguments are joined with a single space.
 */
function initLogger() {
  const originalLog = console.log;
  const logDir = './logs';
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
  }
  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
  console.log = (...parts) => {
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    originalLog(line);
    logFile.write(line + '\n');
  };
}
 
/* ---------- axios proxy ------------ */
// Every request made through myAxios uses this HTTPS proxy agent.
// NOTE(review): the proxy address is hard-coded; presumably a local proxy is
// expected to listen on 127.0.0.1:10809 — confirm before running elsewhere.
const httpsAgent = new HttpsProxyAgent(`http://127.0.0.1:10809`);
const myAxios = axios.create({
  // axios' built-in proxy option is switched off; the agent above is supplied instead.
  proxy: false,
  httpsAgent,
});
 
/**
 * Counts the pages of a PDF file.
 *
 * The file is read synchronously and parsed with pdf-lib; encrypted documents
 * are loaded with `ignoreEncryption: true`.
 *
 * @param {string} filepath path of the PDF file
 * @returns {Promise<number>} number of pages; the promise rejects when the
 *   file cannot be read or parsed
 */
async function getPdfPages(filepath) {
  const bytes = fs.readFileSync(filepath);
  const { PDFDocument } = pdfLib;
  const doc = await PDFDocument.load(bytes, { ignoreEncryption: true });
  return doc.getPages().length;
}
 
/**
 * Builds the lookup table of known words.
 *
 * Sources are every list found on the word-list package's `default` export
 * (falling back to the imported object itself when it has no `default`), plus
 * the US place list and the US person-name list. The original code attached
 * the two extra lists to `wordsjs` but then iterated `wordsjs.default`, so
 * the extra lists were never read whenever those are different objects.
 *
 * String entries are stored both as-is and lower-cased, because callers look
 * words up in lower case.
 *
 * @returns {Object<string, true>} null-prototype object whose own keys are the
 *   known words (value `true`); indexing with an unknown word gives `undefined`
 */
function allWords() {
  // Kept for backward compatibility with anything that reads these properties.
  wordsjs.usPlaces = usPlaceList;
  wordsjs.usPeronNameList = usPeronNameList;

  // No prototype: inherited names such as "constructor" must not count as words.
  const words = Object.create(null);

  // Adds every entry of an iterable list; silently ignores non-iterable values.
  const addAll = (list) => {
    if (list == null || typeof list[Symbol.iterator] !== 'function') {
      return;
    }
    for (const word of list) {
      words[word] = true;
      if (typeof word === 'string') {
        words[word.toLocaleLowerCase()] = true;
      }
    }
  };

  const builtIn = wordsjs.default ?? wordsjs;
  for (const key of Object.keys(builtIn)) {
    addAll(builtIn[key]);
  }
  addAll(usPlaceList);
  addAll(usPeronNameList);
  return words;
}
 
// Known-word lookup table, built once at module load; incorrectWordRatio()
// indexes it with lower-cased tokens.
const wordsMap = allWords();
 
/**
 * Estimates the size of a line in words.
 *
 * Runs of two or more spaces are collapsed to one, then the remaining space
 * characters are counted. For a trimmed line this is "number of words - 1";
 * an empty string or a single word gives 0.
 *
 * @param {string} str text to measure
 * @returns {number} number of single spaces left after collapsing
 */
function countWordSize(str) {
  const collapsed = str.replace(/[ ]{2,}/g, ' ');
  // Splitting on the separator yields one more piece than there are separators.
  return collapsed.split(' ').length - 1;
}
 
/**
 * Computes the share of tokens in a text that are not known words.
 *
 * The text is split on spaces after removing one punctuation character that
 * directly follows a letter (e.g. "word," becomes "word"). A token counts as
 * correct when its lower-cased form is an entry of wordsMap or when it
 * contains a digit. Empty tokens produced by leading, trailing or repeated
 * spaces are ignored, so indentation no longer inflates the ratio.
 *
 * @param {string} text text to inspect
 * @returns {number} ratio in the range 0..1; 0 when the text has no tokens
 */
function incorrectWordRatio(text) {
  const words = text
    .replace(/[ ]+/g, ' ')
    .replace(/([a-zA-Z])[\.!?,;"')]/g, "$1")
    .split(' ')
    .filter((word) => word !== '');
  if (words.length === 0) {
    return 0;
  }
  // "=== true" keeps inherited object members (e.g. "constructor") from
  // passing as dictionary entries; allWords() stores `true` for every word.
  const incorrectWordCnt = words.filter(
    (word) => wordsMap[word.toLocaleLowerCase()] !== true && !/\d/.test(word)
  ).length;
  return incorrectWordCnt / words.length;
}
 
/**
 * Computes the share of "symbol" characters in a text.
 *
 * A symbol is any character other than an ASCII letter, an ASCII digit or a
 * space. An empty text returns 0 instead of NaN (0 / 0).
 *
 * @param {string} text text to inspect
 * @returns {number} ratio in the range 0..1
 */
function symbolRatio(text) {
  if (text.length === 0) {
    return 0;
  }
  const symbols = text.match(/[^a-zA-Z0-9 ]/g) || [];
  return symbols.length / text.length;
}
 
/**
 * Cleans extracted book text line by line.
 *
 * Steps, in order:
 *  1. remove carriage returns;
 *  2. if the first 10000 characters contain "google", delete everything in
 *     that prefix up to and including the last "books.google.com";
 *  3. collapse runs of spaces and runs of newlines, and join lines that were
 *     broken after a hyphen;
 *  4. drop lines that look like scanning noise (see the rules in the loop);
 *  5. trim each kept line and strip the decoration characters ■ • * ¬ » « ^ -.
 *
 * The original also contained a branch that rewrote lines of 170+ characters
 * only when no such line existed; it could never change the text and has been
 * removed.
 *
 * @param {string} text text to clean
 * @returns {string} the kept lines joined with "\n"
 */
function cleanText(text) {
  const googlePageLength = 10000;
  // Created once per call and without the "g" flag, so .test() keeps no state
  // between lines.
  const repeatedCharRe = /([\D])\1{2,}/;
  const punctuationRunRe = /([\.,'";:|!@#$%^&*\(\)<>?`~•*¬»«]){3,}/;
  const googleRe = /google/i;
  const decorationRe = /[■•*¬»«^-]/g;

  let cleaned = text.replace(/(\r)/g, '');
  const googlePage = cleaned.substring(0, googlePageLength);
  if (googlePage.includes('google')) {
    cleaned = googlePage.replace(/^(.|\n)*books[ ]*\.[ ]*google[ ]*\.[ ]*com/ig, '') + cleaned.substring(googlePageLength);
  }
  cleaned = cleaned.replace(/[ ]{2,}/g, ' ');
  cleaned = cleaned.replace(/\n+/g, '\n');
  cleaned = cleaned.replace(/-\n/g, '-');

  const result = [];
  for (const line of cleaned.split('\n')) {
    const mostlyUnknown = incorrectWordRatio(line) > 0.65;
    // countWordSize() returns the number of spaces, i.e. words - 1.
    const wordSize = countWordSize(line);

    // Symbol-heavy line made mostly of unknown words.
    if (mostlyUnknown && symbolRatio(line) > 0.2) {
      continue;
    }
    // Same non-digit character three or more times in a row (spaces ignored),
    // on a short line or one made mostly of unknown words.
    if (repeatedCharRe.test(line.replace(/[ ]+/g, '')) && (wordSize < 5 || mostlyUnknown)) {
      continue;
    }
    // Three or more consecutive punctuation characters among unknown words.
    if (mostlyUnknown && punctuationRunRe.test(line)) {
      continue;
    }
    // Longer line (more than five spaces) made mostly of unknown words.
    if (wordSize > 5 && mostlyUnknown) {
      continue;
    }
    // Any mention of google, in any letter case.
    if (googleRe.test(line)) {
      continue;
    }
    const ret = line.trim().replace(decorationRe, '');
    // Nothing, or a single character, is left, or it is the scan watermark.
    if (ret.length <= 1 || ret === 'Digitized by') {
      continue;
    }
    result.push(ret);
  }
  return result.join('\n');
}
 
/**
 * Extracts an archive with ./7za.exe and copies its largest top-level entry
 * to `txtFile`.
 *
 * Extraction happens in ./tmpdir/<threadId>. The directory is emptied before
 * extraction, so files left by an earlier failed run cannot be picked, and it
 * is removed afterwards even when extraction or copying fails.
 *
 * @param {string} zipFile path of the archive
 * @param {string} txtFile destination path for the extracted file
 * @throws {Error} when 7za fails, the archive yields no entries, or copying fails
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  fs.rmSync(tmpdir, { recursive: true, force: true });
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);
    let largest = null;
    for (const name of fs.readdirSync(tmpdir)) {
      const { size } = fs.statSync(`${tmpdir}/${name}`);
      // ">=" keeps the last of equally sized entries, like the original sort + pop.
      if (largest === null || size >= largest.size) {
        largest = { name, size };
      }
    }
    if (largest === null) {
      throw new Error(`No files were extracted from ${zipFile}`);
    }
    // cpSync has no "overwrite" option; "force" is the one that replaces an existing file.
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}
 
/**
 * Reads the books to download from the first sheet of the delivery workbook.
 *
 * Columns are mapped by position. Column 8 is deliberately not mapped: the
 * original object literal declared `format` twice (columns 8 and 13), so the
 * column-8 value was always overwritten by column 13, which is also the
 * column saveBooks() writes `format` back to.
 *
 * @param {number} startRow first row to read (inclusive, 0-based)
 * @param {number} endRow row to stop at (exclusive)
 * @param {string} [xlsxFile] workbook path; defaults to the delivery list
 * @returns {Array<object>} one plain object per row
 */
function getBooksFromExcel(startRow, endRow, xlsxFile = "【第二批二次处理后】交付清单.xlsx") {
  const workSheets = xlsx.parse(xlsxFile);
  const rows = workSheets[0].data.slice(startRow, endRow);
  return rows.map((row) => {
    const [
      id, isbn, title, subTitle, author, publisher, pubDate, ztf,
      , // column 8: not carried over (see above)
      language, brief, pages, state, format, file, url,
    ] = row;
    return {
      id, isbn, title, subTitle, author, publisher, pubDate, ztf,
      format, language, brief, pages, state, file, url,
    };
  });
}
 
/**
 * Turns a book title into a search keyword.
 *
 * Digits are removed unless `titleWithNumbers` is truthy, then the first six
 * words are joined with "+". Empty tokens (left behind by removed digits or by
 * repeated, leading or trailing whitespace) are discarded, so they neither
 * produce "++" nor use up the six-word budget. A missing or non-string title
 * is converted to a string instead of throwing.
 *
 * @param {string} text title to search for
 * @param {boolean} titleWithNumbers whether digits are kept in the keyword
 * @returns {string} keyword, possibly empty
 */
function formatKw(text, titleWithNumbers) {
  let keyword = String(text ?? '');
  if (!titleWithNumbers) {
    keyword = keyword.replace(/\d/g, "");
  }
  return keyword.split(/\s+/).filter(Boolean).slice(0, 6).join("+");
}
 
 
/**
 * Waits for the given time.
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>} resolves once the timer fires
 */
async function sleep(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
 
/**
 * Calls `func` until it succeeds or the retries are used up.
 *
 * `maxTry` is the number of retries after the first attempt, so `func` runs
 * at most `maxTry + 1` times. The last error is rethrown unchanged.
 *
 * @param {Function} func function to call; may return a promise
 * @param {number} [maxTry=3] retries allowed after the first failure
 * @param {number} [delay=3000] pause between attempts in milliseconds
 * @returns {Promise<*>} whatever `func` resolves to
 */
async function retry(func, maxTry = 3, delay = 3000) {
  let retriesLeft = maxTry;
  for (; ;) {
    try {
      return await func();
    } catch (err) {
      if (!(retriesLeft > 0)) {
        throw err;
      }
      await sleep(delay);
      retriesLeft = retriesLeft - 1;
    }
  }
}
 
/**
 * Searches archive.org for a book and returns the detail-page URL of the
 * first hit.
 *
 * An empty keyword (for example a title made only of digits when digits are
 * stripped) is not sent: it would match arbitrary items. Request errors are
 * retried through retry() and, if they persist, logged before '' is returned.
 *
 * @param {*} book book record; `title`, and `id` for logging, are read
 * @param {boolean} titleWithNumbers whether digits of the title stay in the keyword
 * @returns {Promise<string>} detail-page URL, or '' when nothing was found or
 *   the search failed
 */
async function getBookDetailPageUrl(book, titleWithNumbers) {
  const kw = formatKw(book.title, titleWithNumbers);
  if (!kw) {
    return '';
  }
  const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`;
  const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}`
  console.log(`打开搜索: ${searchUrl}`);
  try {
    return await retry(async () => {
      const resp = await myAxios.get(searchUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
      const { total, hits } = resp.data.response.body.hits;
      // An empty result is a valid answer, not an error worth retrying.
      if (total === 0 || !Array.isArray(hits) || hits.length === 0) {
        return '';
      }
      return `https://archive.org/details/${hits[0].fields.identifier}`;
    });
  } catch (e) {
    console.log(`搜索失败: ${book.id} ${book.title} ${e?.message ?? e}`);
    return '';
  }
}
 
/**
 * Opens an archive.org detail page, copies publisher and publication date
 * onto the book and returns the download URL of its "Text PDF" file.
 *
 * The page metadata is read from the hidden `js-ia-metadata` input.
 * `book.state` is set on both unsuccessful outcomes:
 *  - "没有pdf或text文件" when the item has no "Text PDF" file;
 *  - "打开详情页失败" when the page cannot be fetched or parsed after retries.
 *
 * @param {*} book book record; `publisher`, `pubDate` and possibly `state` are updated
 * @param {string} detailPageUrl URL of the detail page
 * @returns {Promise<string>} download URL, or '' when there is nothing to download
 */
async function openBookDetailPage(book, detailPageUrl) {
  console.log(`打开详情: ${detailPageUrl}`);
  try {
    return await retry(async () => {
      const resp = await myAxios.get(detailPageUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' } });
      const match = /<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/.exec(resp.data);
      if (!match) {
        throw new Error('js-ia-metadata input not found in detail page');
      }
      const data = JSON.parse(match[1]);
      book.publisher = data.metadata.publisher;
      book.pubDate = data.metadata.date;
      const fileData = data.files.find(f => f.format === 'Text PDF');
      if (!fileData) {
        book.state = "没有pdf或text文件";
        return '';
      }
      return `https://archive.org/download/${data.metadata.identifier}/${fileData.name}`;
    });
  } catch (e) {
    book.state = "打开详情页失败";
    console.log(`打开详情页失败: ${book.id} ${book.title}`);
    console.log(`打开详情页失败原因: ${e?.message ?? e}`);
    return '';
  }
}
 
/**
 * Extracts the content of the first <pre> element from an HTML document.
 *
 * Input without "<!DOCTYPE html>" is treated as plain text and returned
 * unchanged, as is an HTML document without a <pre> tag. One line break
 * directly after <pre> is skipped (HTML parsers ignore it too); otherwise the
 * content starts right after the tag, so the first character is no longer
 * lost. If </pre> is missing, everything after <pre> is returned.
 *
 * @param {string} text HTML or plain text
 * @returns {string} extracted text
 */
function getTextFromHtml(text) {
  if (!text.includes("<!DOCTYPE html>")) {
    return text;
  }
  const openTag = '<pre>';
  const openAt = text.indexOf(openTag);
  if (openAt === -1) {
    return text;
  }
  let start = openAt + openTag.length;
  if (text.startsWith('\r\n', start)) {
    start += 2;
  } else if (text[start] === '\n') {
    start += 1;
  }
  const closeAt = text.indexOf('</pre>', start);
  return closeAt === -1 ? text.substring(start) : text.substring(start, closeAt);
}
 
/**
 * Downloads `url` into ./downloads/<id> <isbn>.<ext> and records the outcome
 * on the book. The function never rejects; callers inspect `book.state`:
 *  - "下载完成"    the file exists (already present or freshly downloaded);
 *                 `format`, `file`, `url` and `pages` are filled in
 *                 (`pages` is 0 when the file cannot be parsed as a PDF);
 *  - "没有下载权限" the server answered 401 or 403;
 *  - "下载失败"    every attempt failed, or a file that is neither pdf nor
 *                 txt announced more than 200 MB (this case is not retried).
 *
 * The body is streamed into "<file>.part" and renamed only after the write
 * stream has finished, so an interrupted transfer is never mistaken for a
 * complete file by the "already exists" shortcut.
 *
 * @param {*} book book record; `id`, `isbn` and `title` are read
 * @param {string} url download URL; its last dot-separated part is the extension
 * @returns {Promise<void>}
 */
async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);
  const ext = url.split(".").pop().toLowerCase();
  const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
  const partialPath = `${filepath}.part`;
  const timeoutTime = 10 * 60 * 1000;
  const maxOtherFileSize = 200 * 1024 * 1024;
  book.url = url;

  // Records a complete file on the book.
  const markDownloaded = async () => {
    book.state = `下载完成`;
    book.format = ext;
    book.file = filepath;
    book.url = url;
    book.pages = await getPdfPages(filepath).catch(() => 0);
  };
  // Best-effort removal of the temporary file.
  const removePartial = () => {
    try {
      fs.rmSync(partialPath, { force: true });
    } catch (e) {
      console.error(e);
    }
  };
  // Records a failed download on the book.
  const markFailed = () => {
    book.state = "下载失败";
    console.log(`下载失败: ${book.id} ${book.title} ${url}`);
  };

  if (fs.existsSync(filepath)) {
    await markDownloaded();
    console.log(`下载完成：${filepath}`);
    return;
  }

  let tooLarge = false;
  // One download attempt: resolves when finished (or when retrying is
  // pointless) and rejects when another attempt may help.
  const attempt = async () => {
    const source = axios.CancelToken.source();
    const timeout = setTimeout(() => {
      source.cancel("timeout");
    }, timeoutTime);
    try {
      const response = await myAxios.get(url, { responseType: "stream", timeout: timeoutTime, cancelToken: source.token });
      const len = Number(response.headers['content-length']);
      if (ext !== "pdf" && ext !== "txt" && len > maxOtherFileSize) {
        // Not a pdf/txt file and larger than 200 MB: do not download, do not retry.
        response.data.destroy();
        tooLarge = true;
        return;
      }
      // pipeline() settles only after the file is flushed, and rejects on
      // read errors, write errors and prematurely closed connections.
      await pipeline(response.data, fs.createWriteStream(partialPath));
      fs.renameSync(partialPath, filepath);
      await markDownloaded();
    } catch (e) {
      removePartial();
      const status = e?.response?.status;
      console.log(`下载失败，错误码: ${status ?? e?.code}`);
      if (status === 403 || status === 401) {
        book.state = "没有下载权限";
        console.log(`下载失败: ${book.id} ${book.title} ${url}`);
        return;
      }
      throw e;
    } finally {
      clearTimeout(timeout);
    }
  };

  try {
    await retry(attempt);
  } catch (e) {
    markFailed();
    return;
  }
  if (tooLarge) {
    markFailed();
  }
}
 
/**
 * Tells whether a book is listed in alreadyDownloadedBooks.
 * Entries are matched against the string "<id> <isbn>".
 * @param {*} book book record; `id` and `isbn` are read
 * @returns {boolean} true when the book is listed
 */
function isAlreadyDownloaded(book) {
  const { id, isbn } = book;
  return alreadyDownloadedBooks.includes(`${id} ${isbn}`);
}
 
/**
 * Requests the next book from the parent thread.
 *
 * Posts `{ type: 'get-book', threadId }` to the parent and resolves with the
 * `data` of the first incoming message whose type is 'book'; messages of any
 * other type are ignored. The listener is removed once that message arrives.
 *
 * @returns {Promise<*>} the `data` of the parent's 'book' message
 */
function nextBook() {
  return new Promise((resolve) => {
    function onMessage(message) {
      if (message.type !== 'book') {
        return;
      }
      resolve(message.data);
      parentPort.removeListener('message', onMessage);
    }
    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'get-book', threadId });
  });
}
 
 
/**
 * Worker loop: keeps asking the parent thread for books and tries to download
 * each one until no book is left or the configured time limit is reached.
 *
 * Every received book is appended to `books`, including skipped ones and the
 * one received when the time limit is hit. The module counters bookCount,
 * skipCount and successCount are updated; successCount now counts only books
 * whose state is "下载完成" after the download step. The pauses between
 * requests are awaited, so they take effect.
 *
 * @param {Array<object>} books output array receiving every handled book
 * @returns {Promise<void>}
 */
async function downloadBooks(books) {
  // States recorded by an earlier run that make another attempt pointless.
  const finalStates = ["没有搜索结果", "没有pdf或text文件", "下载完成"];

  for (; ;) {
    const book = await nextBook();
    if (!book) {
      break;
    }
    books.push(book);
    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
      // Time limit (minutes) reached: stop this worker.
      break;
    }
    bookCount++;
    if (isAlreadyDownloaded(book)) {
      skipCount++;
      book.skip = true;
      continue;
    }
    if (finalStates.includes(book.state)) {
      skipCount++;
      continue;
    }
    console.log(`开始下载: ${book.id} ${book.title}`);
    // Search with the digits of the title first, then without them.
    let detailPageUrl = await getBookDetailPageUrl(book, true);
    if (!detailPageUrl) {
      detailPageUrl = await getBookDetailPageUrl(book, false);
    }
    if (!detailPageUrl) {
      console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
      book.state = "没有搜索结果";
      continue;
    }
    // Pause before opening the detail page.
    await sleep(getRandomNumber(500, 1000));
    const url = await openBookDetailPage(book, detailPageUrl);
    if (!url) {
      // A failed page load has already been recorded and logged by
      // openBookDetailPage(); everything else means no usable file exists.
      if (book.state !== "打开详情页失败") {
        book.state = "没有pdf或text文件";
        console.log(`没有pdf或text文件: ${book.id} ${book.title}`);
      }
      continue;
    }
    // Pause before downloading.
    await sleep(getRandomNumber(500, 1000));
    try {
      await downloadFile(book, url);
    } catch (e) {
      book.state = "下载失败";
      console.log(`下载失败: ${book.id} ${book.title} ${e?.message ?? e}`);
    }
    if (book.state === "下载完成") {
      successCount++;
      console.log(`下载完成: ${book.id} ${book.title}`);
    }
    console.log('finish: ' + JSON.stringify(book));
    // Pause before the next book.
    await sleep(getRandomNumber(500, 1000));
  }
}
 
/**
 * Writes the download results back into the delivery workbook.
 *
 * For every book whose id matches column 0 of a row in the first sheet,
 * columns 5 (publisher), 6 (pubDate), 11 (pages), 12 (state), 13 (format),
 * 14 (file) and 15 (url) are overwritten. The workbook is rebuilt from all
 * parsed sheets, so further sheets and the original sheet names are kept
 * (the original wrote only the first sheet, renamed to "Sheet1").
 *
 * If the workbook cannot be written, the rows of the first sheet are dumped
 * to "<timestamp>.json" in the working directory instead.
 *
 * @param {Array<object>} books books to persist
 * @param {string} [xlsxFile] workbook to read and overwrite
 */
function saveBooks(books, xlsxFile = "./【第二批二次处理后】交付清单.xlsx") {
  console.log("保存下载状态数据");
  const workSheets = xlsx.parse(xlsxFile);
  const data = workSheets[0].data;

  // id -> index of the first row carrying that id, built once instead of
  // scanning the whole sheet for every book.
  const rowIndexById = new Map();
  data.forEach((row, index) => {
    if (!rowIndexById.has(row[0])) {
      rowIndexById.set(row[0], index);
    }
  });

  for (const book of books) {
    const index = rowIndexById.get(book.id);
    if (index === undefined) {
      continue;
    }
    const row = data[index];
    row[5] = book.publisher;
    row[6] = book.pubDate;
    row[11] = book.pages;
    row[12] = book.state;
    row[13] = book.format;
    row[14] = book.file;
    row[15] = book.url;
  }

  // workSheets[0].data is the array mutated above.
  const buffer = xlsx.build(workSheets);
  try {
    fs.writeFileSync(xlsxFile, buffer);
    console.log(`保存完成: ${xlsxFile}`);
  } catch (e) {
    console.error(e);
    const outfile = `${Date.now()}.json`;
    fs.writeFileSync(outfile, JSON.stringify(data));
    console.log("保存完成: " + outfile);
  }
}
 
 
/**
 * Formats a duration as "[D天]H时M分S秒".
 * The day part is present only when at least one full day has elapsed.
 * @param {number} ms duration in milliseconds
 * @returns {string} formatted duration
 */
function msFormat(ms) {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const days = Math.floor(totalHours / 24);
  const dayPart = days > 0 ? `${days}天` : "";
  return `${dayPart}${totalHours % 24}时${totalMinutes % 60}分${totalSeconds % 60}秒`;
}
 
/**
 * Returns a random, not necessarily integer, number between two bounds.
 * @param {number} min lower bound (inclusive)
 * @param {number} max upper bound (exclusive)
 * @returns {number} min + r * (max - min), where r comes from Math.random()
 */
function getRandomNumber(min, max) {
  const span = max - min;
  return min + Math.random() * span;
}
 
// Time at which this module was loaded; elapsed-time reports and the time
// limit in downloadBooks() are measured from here.
const startTime = Date.now();
// Number of books counted as downloaded successfully.
let successCount = 0;
// Number of books handled.
let bookCount = 0;
// Number of skipped books: already downloaded, or carrying a state that makes
// another attempt pointless.
let skipCount = 0;
// "<id> <isbn>" strings of books that are already downloaded; in a worker
// thread main() replaces this with the list passed through workerData.
let alreadyDownloadedBooks = [];
 
/**
 * Collects the names of books that are already downloaded.
 *
 * Sources are the non-empty lines of ./alreadyDownloadedBooks.txt and the
 * entries of ./downloads. Each name is reduced to its base name without the
 * last extension and trimmed. A missing list file or downloads directory
 * counts as empty, and a UTF-8 byte-order mark at the start of the list file
 * is ignored.
 *
 * @returns {string[]} names, list-file entries first
 */
function getAlreadyDownloadedBooks() {
  const listFile = './alreadyDownloadedBooks.txt';
  const downloadDir = './downloads';
  const listed = fs.existsSync(listFile)
    ? fs.readFileSync(listFile, 'utf-8')
      .replace(/^\uFEFF/, '')
      .replace(/\r/g, '')
      .split('\n')
      .map(it => it.trim())
      .filter(it => it)
    : [];
  const files = fs.existsSync(downloadDir) ? fs.readdirSync(downloadDir) : [];
  // Array spread instead of push(...files): no argument-count limit for large directories.
  return [...listed, ...files].map(it => path.basename(it, path.extname(it)).trim());
}
 
/**
 * Worker entry point: sets up logging, runs the download loop and, whether it
 * succeeded or failed, posts the handled books to the parent thread as
 * `{ type: "books", data }` before closing the log file.
 */
function startDownload() {
  initLogger();
  const books = [];

  // Logs this worker's summary after a successful run.
  const report = () => {
    console.log(`线程：${threadId}全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
  };
  // Hands the results to the parent and releases the log file.
  const finish = () => {
    parentPort.postMessage({ type: "books", data: books });
    logFile.close();
  };

  downloadBooks(books)
    .then(report)
    .catch((e) => {
      console.error(e);
    })
    .finally(finish);
}
 
/**
 * Program entry point for both the main thread and the worker threads.
 *
 * Main thread: reads the book list, starts `config.threadSize` workers running
 * ./src/main.mjs, hands one book to a worker for every 'get-book' request and
 * collects the 'books' results. When every worker has reported (or exited
 * without reporting) the summary is logged and the results are saved. On
 * SIGINT the results received so far are saved and the process exits.
 *
 * Worker thread: takes the already-downloaded list from workerData and starts
 * the download loop.
 *
 * Changes from the original: only books actually handed out are counted;
 * worker 'error' events are logged instead of crashing the process; a worker
 * that exits without reporting still counts as finished, so the final save
 * is not lost.
 *
 * NOTE(review): on SIGINT, books still held by running workers have not been
 * reported yet and are therefore not saved.
 */
function main() {
  for (const dir of ['tmpdir', 'downloads']) {
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
  }

  if (!isMainThread) {
    alreadyDownloadedBooks = workerData.alreadyDownloadedBooks;
    startDownload();
    return;
  }

  initLogger();
  let downloadCnt = 0;
  let finishThreadCnt = 0;
  const finishBooks = [];
  const downloadedIds = getAlreadyDownloadedBooks();
  const { startRow, endRow, threadSize } = config;
  console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
  const books = getBooksFromExcel(startRow, endRow);

  // Logs the totals behind the given headline and saves what was collected.
  const summarizeAndSave = (headline) => {
    successCount = finishBooks.filter(it => it.state === '下载完成').length;
    skipCount = finishBooks.filter(it => it.skip).length;
    console.log(`${headline}，共下载${downloadCnt}本，成功下载${successCount}本，跳过${skipCount}，失败${downloadCnt - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
    saveBooks(finishBooks);
  };
  // Called once per worker; the last one triggers the final save.
  const onWorkerDone = () => {
    finishThreadCnt++;
    if (finishThreadCnt >= threadSize) {
      summarizeAndSave('全部线程完成');
    }
  };

  for (let i = 0; i < threadSize; i++) {
    const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks: downloadedIds } });
    let done = false;
    const markDone = () => {
      if (!done) {
        done = true;
        onWorkerDone();
      }
    };
    worker.on("message", (message) => {
      if (message.type === 'books') {
        for (const book of message.data) {
          finishBooks.push(book);
        }
        markDone();
      } else if (message.type === 'get-book') {
        const book = books.shift();
        if (book) {
          downloadCnt++;
        }
        // `undefined` tells the worker that no books are left.
        worker.postMessage({ type: "book", data: book });
      }
    });
    worker.on("error", (err) => {
      console.error(err);
    });
    worker.on("exit", markDone);
  }

  // Save what has been collected when the process is interrupted.
  process.on('SIGINT', () => {
    summarizeAndSave('进程被手动结束');
    process.exit(0);
  });
}
 
// Runs on module load in the main thread and again in every worker thread;
// main() branches on isMainThread.
main();