~lyg/book-crawler.git - Gitblit

李玉刚 / book-crawler

图书批量下载

blame | 历史 | 补丁 | 提交 | 提交对比 | ignore whitespace

lyg

2024-06-14 fb4b60f782a4c263890d5d706aa61a3697fffca2

 src/main.mjs

@@ -8,6 +8,7 @@
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
import { HttpsProxyAgent } from "https-proxy-agent";
import { resolve } from "path";
import { execFileSync } from "child_process";

/*-------------读取配置---------------*/
let config = JSON.parse(fs.readFileSync('./config.json'));
@@ -19,7 +20,7 @@
  if (!fs.existsSync('./logs')) {
    fs.mkdirSync('./logs', { recursive: true });
  }
  logFile = fs.createWriteStream(`./logs/logs-${config.startRow}-${config.endRow}-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
  logFile = fs.createWriteStream(`./logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });
  console.log = function (...text) {
    text = `${new Date().toLocaleString()} ${text.join(' ') ?? ''}`;
    _log(text);
@@ -33,6 +34,61 @@
  proxy: false,
  httpsAgent,
});

/**
 * Count how many positions of `str` hold exactly `char`.
 *
 * The comparison is done per index with strict equality, so a `char` that
 * is empty or longer than one UTF-16 code unit never matches.
 *
 * @param {string} str String to scan.
 * @param {string} char Single character to look for.
 * @returns {number} Number of matching positions.
 */
function countChar(str, char) {
  let hits = 0;
  let pos = str.length;
  // Scan from the end; the order of the scan does not affect the total.
  while (pos-- > 0) {
    if (str[pos] === char) {
      hits += 1;
    }
  }
  return hits;
}

/**
 * Clean up downloaded book text.
 *
 * 1. If the text mentions "google", everything up to and including the last
 *    "books.google.com" (spaces allowed around the dots) is removed.
 * 2. If no line has 170 or more characters (treated here as a sign of OCR
 *    output), or the text still contains "google", the text is normalised:
 *    "\r" and "■" are dropped, runs of spaces are collapsed, single line
 *    breaks are joined, blank-line runs become one line break, and only lines
 *    with at least 10 spaces and no run of 6+ characters outside
 *    [a-z0-9-] are kept (trimmed).
 * 3. Otherwise the text is returned as is.
 *
 * @param {string} text Text to clean.
 * @returns {string} The cleaned text.
 */
function cleanText(text) {
  if (text.includes('google')) {
    // [\s\S] instead of (.|\n): the latter cannot cross "\r", so the preamble
    // of CRLF files was never stripped.
    text = text.replace(/^[\s\S]*books[ ]*\.[ ]*google[ ]*\.[ ]*com/i, '');
  }

  // OCR output normally has no line of 170+ characters.
  const looksLikeOcr = !/.{170,}/.test(text);
  if (!looksLikeOcr && !text.includes('google')) {
    return text;
  }

  text = text.replace(/(\r|■)/g, '');
  text = text.replace(/[ ]{2,}/g, ' ');
  // Join every non-empty line with the line that follows it.
  text = text.replace(/(.+)\n/g, '$1');
  // What is left are blank-line runs; collapse each into one line break.
  text = text.replace(/\n+/g, '\n');
  text = text.replace(/-\n/g, '-');

  // A run of 6+ characters that are neither letters, digits nor "-".
  const noisePattern = /[^a-z0-9\-]{6,}/i;
  const kept = [];
  for (const line of text.split('\n')) {
    const spaceCount = line.split(' ').length - 1;
    if (spaceCount >= 10 && !noisePattern.test(line)) {
      kept.push(line.trim());
    }
  }
  return kept.join('\n');
}

/**
 * Extract an archive and copy its largest top-level file to `txtFile`.
 *
 * The archive is extracted with the bundled `./7za.exe` into a per-thread
 * temporary directory (`./tmpdir/<threadId>`). That directory is always
 * removed afterwards, even when extraction or copying fails, so leftovers
 * from a failed run cannot be mistaken for the content of the next archive.
 *
 * NOTE(review): only entries directly inside the temporary directory are
 * considered; files extracted into sub-directories are ignored.
 *
 * @param {string} zipFile Path of the archive to extract.
 * @param {string} txtFile Destination path for the extracted text file.
 * @throws {Error} If the extraction fails or yields no regular file.
 */
function unzip(zipFile, txtFile) {
  const tmpdir = `./tmpdir/${threadId}`;
  try {
    execFileSync('./7za.exe', ['x', '-aoa', zipFile, `-o${tmpdir}`]);

    let largest = null;
    for (const name of fs.readdirSync(tmpdir)) {
      const stat = fs.statSync(`${tmpdir}/${name}`);
      // Directories cannot be copied as a text file; skip them.
      if (!stat.isFile()) {
        continue;
      }
      // ">=" keeps the later entry on ties, like the original sort-and-pop.
      if (largest === null || stat.size >= largest.stat.size) {
        largest = { name, stat };
      }
    }
    if (largest === null) {
      throw new Error(`No file extracted from archive: ${zipFile}`);
    }

    // `force` is the fs.cpSync option that allows overwriting the target.
    fs.cpSync(`${tmpdir}/${largest.name}`, txtFile, { force: true });
  } finally {
    // `force` avoids masking the real error when tmpdir was never created.
    fs.rmSync(tmpdir, { recursive: true, force: true });
  }
}

/**
 * 获取要下载的图书信息
@@ -133,10 +189,10 @@
 * @param {*} book 
 */
async function openSearchPage(book, titleWithNumbers) {
  console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`);
  console.log(`打开搜索: https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`);
  return await retry(async () => {
    // 获取页面
    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}`;
    const searchUrl = `https://archive.org/search?query=${formatKw(book.title, titleWithNumbers)}&sin=TXT`;
    await driver.get(searchUrl);
  }).then(() => true)
    .catch(() => false);
@@ -229,25 +285,26 @@
      }
    }

    if (pdfUrl) {
    /* if (pdfUrl) {
      return pdfUrl;
    } else if (textUrl) {
    } else  */
    if (textUrl) {
      return textUrl;
    } else {
      book.state = "没有pdf或text文件";
      book.state = "没有text文件";
      return ''
    }
  })
    .catch(() => {
      book.state = "没有pdf或text文件";
      book.state = "没有text文件";
      return '';
    });
}

async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);
  const ext = url.split(".").pop();
  const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
  const ext = url.split(".").pop().toLowerCase();
  const filepath = `./downloads/${book.id} ${book.isbn}.txt`;
  if (fs.existsSync(filepath)) {
    book.state = `下载完成`;
    book.format = ext;
@@ -275,7 +332,8 @@
          return;
        }
        const stream = response.data;
        const out = fs.createWriteStream(filepath);
        const _filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
        const out = fs.createWriteStream(_filepath);
        stream.pipe(out);
        stream.on("end", () => {
          clearTimeout(timeout);
@@ -284,6 +342,17 @@
          book.file = filepath;
          book.url = url;
          console.log(`下载完成：${filepath}`);
          setTimeout(() => {
            if (ext === "gz" || ext === "zip") {
              unzip(_filepath, filepath);
            }
            let text = fs.readFileSync(filepath, 'utf-8');
            if (text.includes("<!DOCTYPE html>")) {
              text = /(.|\n)*<pre>((.|\n)*)<\/pre>(.|\n)*/g.exec(text)[2];
              fs.writeFileSync(filepath, text, 'utf-8');
            }
            fs.writeFileSync(filepath + '.result.txt', cleanText(text), 'utf-8');
          }, 1000);
          resolve(true);
        });
        stream.on("error", (err) => {
@@ -467,7 +536,7 @@
  const books = [];
  downloadBooks(books)
    .then(() => {
      console.log(`全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
      console.log(`线程：${threadId}全部完成，共下载${bookCount}本，成功下载${successCount}本，跳过${skipCount}本，失败${bookCount - skipCount - successCount}本，耗时： ${msFormat(Date.now() - startTime)}。`);
    })
    .catch(e => {
      console.error(e);
@@ -483,6 +552,13 @@
    });
}

// Make sure the working directories exist before any thread starts:
// 'tmpdir' holds extracted archives, 'downloads' holds the fetched files.
for (const dir of ['tmpdir', 'downloads']) {
  const alreadyThere = fs.existsSync(dir);
  if (alreadyThere) {
    continue;
  }
  fs.mkdirSync(dir, { recursive: true });
}

// 多进程执行
if (isMainThread) {
  initLogger();
@@ -491,15 +567,10 @@
  console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
  let finishCnt = 0;
  const finishBooks = [];
  const thBookSize = (endRow - startRow) / threadSize;
  const books = getBooksFromExcel(startRow, endRow);

  for (let sr = startRow; sr < endRow; sr += thBookSize) {
    let er = sr + thBookSize;
    if (er > endRow) {
      er = endRow;
    }
    const worker = new Worker("./src/main.mjs", { workerData: { startRow: sr, endRow: er, alreadyDownloadedBooks } });
  for (let i = 0; i < threadSize; i++) {
    const worker = new Worker("./src/main.mjs", { workerData: { alreadyDownloadedBooks } });
    worker.on("message", (message) => {
      if (message.type === 'books') {
        finishBooks.push(...message.data);