~lyg/book-crawler.git - Gitblit

李玉刚 / book-crawler

图书批量下载

blame | 历史 | 补丁 | 提交 | 提交对比 | ignore whitespace

修改isbn查询

lyg

2024-07-17 831697d95be0123fade180aedded20db01f1884b

 src/book-isbn-search.mjs

@@ -3,8 +3,9 @@
import * as fs from "fs";
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
import { HttpsProxyAgent } from "https-proxy-agent";
import * as cheerio from 'cheerio';

const EXCEL_FILE = "fiction-noisbn.xlsx";
const EXCEL_FILE = "book-list.xlsx";

/*-------------读取配置---------------*/
let config = JSON.parse(fs.readFileSync('./config.json'));
@@ -30,31 +31,6 @@
  proxy: false,
  httpsAgent,
});

/**
 * Read a range of rows from the first sheet of EXCEL_FILE and turn each row
 * into a book record.
 * @param {number} startRow first row index to read (inclusive)
 * @param {number} endRow row index to stop at (exclusive)
 * @returns {Array<object>} one record per row with the keys
 *   id, title, author, year, publisher, isbn, extension, state
 *   (taken from columns 0 through 7, in that order)
 */
function getBooksFromExcel(startRow, endRow) {
  // Only the first worksheet of the workbook is consulted.
  const [firstSheet] = xlsx.parse(EXCEL_FILE);
  return firstSheet.data
    .slice(startRow, endRow)
    .map(([id, title, author, year, publisher, isbn, extension, state]) => ({
      id,
      title,
      author,
      year,
      publisher,
      isbn,
      extension,
      state,
    }));
}

/**
 * 格式化关键字
@@ -166,32 +142,39 @@
      break;
    }
    bookCount++;
    if (isAlreadyDownloaded(book)) {
      skipCount++;
      book.skip = true;
      continue;
    }
    // if (isAlreadyDownloaded(book)) {
    //   skipCount++;
    //   book.skip = true;
    //   continue;
    // }
    if (book.state && (book.state === "没有搜索结果" || book.state === "没有pdf或text文件" || book.state === "下载完成")) {
      // 跳过没有搜索结果或没有pdf或text文件的书籍
      skipCount++;
      continue;
    }
    console.log(`开始下载: ${book.id} ${book.title}`);



    // 打开搜索页面并搜索
    let detailPageUrl = await getBookDetailPageUrl(book, true);
    if (!detailPageUrl) {
      // 先用包含数字的关键字，如果没有结果再用不包含数字的关键字
      detailPageUrl = await getBookDetailPageUrl(book, false);
      if (!detailPageUrl) {
        console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
        book.state = "没有搜索结果";
        continue;
      }
    }
    // let detailPageUrl = await getBookDetailPageUrl(book, true);
    // if (!detailPageUrl) {
    //   // 先用包含数字的关键字，如果没有结果再用不包含数字的关键字
    //   detailPageUrl = await getBookDetailPageUrl(book, false);
    //   if (!detailPageUrl) {
    //     console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
    //     book.state = "没有搜索结果";
    //     continue;
    //   }
    // }
    // 等一段时间再打开详情页
    sleep(getRandomNumber(500, 1000));
    // sleep(getRandomNumber(500, 1000));
    // 打开详情页，并获取isbn
    const detailPageUrl = `https://archive.org/details/${book.id}`;
    await openBookDetailPage(book, detailPageUrl);
    if (book.isbn) {
      parentPort.postMessage({ type: "book", data: book });
    }
    // 等一段时间再下一个
    sleep(getRandomNumber(500, 1000));
  }
@@ -199,19 +182,11 @@

function saveBooks(books) {
  console.log("保存下载状态数据");
  const workSheets = xlsx.parse(EXCEL_FILE);
  const sheet = workSheets[0];
  const sheet = { name: "Sheet1", data: [["ID", "Title", "Author", "Year", "Publisher", "ISBN"]] };
  const data = sheet.data;
  for (const book of books) {
    const index = data.findIndex((row) => row[0] === book.id);
    if (index > -1) {
      data[index][5] = book.isbn;
      if (!data[index][3])
        data[index][3] = book.pubDate;
      if (!data[index][4])
        data[index][4] = book.publisher;
      data[index][7] = book.state;
    }
    const row = [book.id, book.title, book.author, book.pubDate, book.publisher, book.isbn];
    data.push(row);
  }

  const buffer = xlsx.build([{ name: "Sheet1", data }]);
@@ -271,11 +246,54 @@
    })
    .finally(async () => {
      // saveBooks(books);
      parentPort.postMessage({ type: "books", data: books });
      // parentPort.postMessage({ type: "books", data: books });
      logFile.close();
    });
}

// Crawl cursor: getBooks() reads codeList[codeIndex], then advances codeIndex;
// once every entry has been visited it resets codeIndex to 0 and decrements year.
let year = 2024;
let codeIndex = 0;
// The 26 upper-case letters; each one is passed to getBookList() as its `code` argument.
const codeList = [..."ABCDEFGHIJKLMNOPQRSTUVWXYZ"];

/**
 * Request one page of the archive.org "books" collection listing.
 *
 * The query filters on year 2023..2024 and on the first title letter `code`,
 * sorted by title ascending. The year bounds are fixed in the URL.
 *
 * @param {number} pageSize value sent as `hits_per_page`
 * @param {number} page value sent as `page`
 * @param {string} code value placed in the `firstTitle` filter
 * @returns {Promise<*>} whatever `myAxios.get` resolves with
 */
async function getBookList(pageSize, page, code) {
  const endpoint = "https://archive.org/services/search/beta/page_production/";
  const query = [
    "user_query=",
    "page_type=collection_details",
    "page_target=books",
    `hits_per_page=${pageSize}`,
    `page=${page}`,
    // URL-encoded JSON: {"year":{"2023":"gte","2024":"lte"},"firstTitle":{"<code>":"inc"}}
    `filter_map=%7B%22year%22%3A%7B%222023%22%3A%22gte%22%2C%222024%22%3A%22lte%22%7D%2C%22firstTitle%22%3A%7B%22${code}%22%3A%22inc%22%7D%7D`,
    "sort=titleSorter%3Aasc",
    "aggregations=false",
    "uid=R%3A1e845903aec74dee14bd-S%3A8cde5bf234b86bf96a75-P%3A1-K%3Ah-T%3A1718106108852",
  ].join("&");
  const response = await myAxios.get(`${endpoint}?${query}`);
  return response;
}

/**
 * Fetch the complete listing for the current letter (`codeList[codeIndex]`)
 * and then advance the module-level crawl cursor.
 *
 * Pages are requested one after another (100 hits per page) by passing
 * `getBookList` through the `retry` helper, pausing with
 * `sleep(getRandomNumber(300, 800))` after every request. When the listing
 * has been walked, `codeIndex` moves to the next letter; after the last
 * letter it wraps to 0 and `year` is decremented.
 *
 * A page whose request still fails after `retry` is logged and skipped, so a
 * persistently failing page can neither stall the crawl nor be re-requested
 * in a tight loop. If the very first page fails, the total is unknown and an
 * empty list is returned.
 *
 * @returns {Promise<Array<object>>} records of the form `{ id, title, author }`
 *   built from each hit's `identifier`, `title` and `creator` fields
 */
async function getBooks() {
  const pageSize = 100;
  const code = codeList[codeIndex];
  console.log(`${year}年 ${codeIndex}`);
  const bookList = [];
  let total = 0;
  let page = 1;
  do {
    console.log(`正在获取 ${year} 年 ${code} 分类 ${page} 页`);
    const resp = await retry(() => getBookList(pageSize, page, code)).catch((e) => {
      console.log(`获取失败：${year} 年 ${code} 分类 ${page} 页`, e?.message ?? e);
    });
    if (resp) {
      const { total: reportedTotal, hits } = resp.data.response.body.hits;
      total = reportedTotal;
      for (const { fields } of hits) {
        const { identifier, title, creator } = fields;
        // `creator` may be a list of names, a single value, or absent.
        const author = Array.isArray(creator) ? creator.join(", ") : creator;
        bookList.push({ id: identifier, title, author });
      }
    }
    // Advance and pause even after a failure: skip the bad page, keep throttling.
    page++;
    await sleep(getRandomNumber(300, 800));
    // `page` now points at the NEXT page, so pageSize * (page - 1) hits have
    // been covered so far; keep going while that is short of the total.
  } while (pageSize * (page - 1) < total);
  codeIndex++;
  if (codeIndex === codeList.length) {
    year--;
    codeIndex = 0;
  }
  return bookList;
}

// Promise of the getBooks() call currently in flight, or null when none is
// pending. The 'get-book' message handler awaits it before checking whether
// the shared book queue needs refilling.
let getBookPromise = null;
function main() {

  if (!fs.existsSync('tmpdir')) {
@@ -289,12 +307,15 @@
    console.log(`线程数：${threadSize}, 开始行：${startRow}, 结束行：${endRow}`);
    let finishThreadCnt = 0;
    const finishBooks = [];
    const books = getBooksFromExcel(startRow, endRow);
    const books = [];

    for (let i = 0; i < threadSize; i++) {
      const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} });
      worker.on("message", (message) => {
        if (message.type === 'books') {
      worker.on("message", async (message) => {
        if (message.type === 'book') {
          finishBooks.push(message.data);
        }
        else if (message.type === 'books') {
          finishBooks.push(...message.data);
          finishThreadCnt++;
          if (finishThreadCnt >= threadSize) {
@@ -305,6 +326,18 @@
          }
        } else if (message.type === 'get-book') {
          downloadCnt++;
          if (getBookPromise) {
            await getBookPromise.finally();
          }
          if (books.length == 0) {
            do {
              if (year > 1950) {
                getBookPromise = getBooks();
                books.push(...await getBookPromise.finally());
                getBookPromise = null;
              }
            } while (!books.length);
          }
          worker.postMessage({ type: "book", data: books.shift() });
        }
      });