import xlsx from "node-xlsx";
|
import axios from "axios";
|
import * as fs from "fs";
|
import { Worker, isMainThread, parentPort, workerData, threadId } from 'worker_threads';
|
import { HttpsProxyAgent } from "https-proxy-agent";
|
import * as cheerio from 'cheerio';
|
|
/** Path of the Excel workbook that saveBooks writes the collected rows to. */
const EXCEL_FILE = "book-list.xlsx";

/* ------------- Load configuration --------------- */

// ./config.json is read synchronously once, when the module is evaluated.
// The visible code reads these fields from it: endOfTime, startRow, endRow, threadSize.
const rawConfig = fs.readFileSync('./config.json');
let config = JSON.parse(rawConfig);
|
|
/* ------------ Logging -------------- */

// Append-mode write stream for the current thread's log file; assigned by initLogger().
let logFile;

/**
 * Install a logging wrapper around console.log.
 *
 * Ensures the ./book-isbn-logs directory exists, opens (in append mode) a log
 * file whose name contains this thread's id, and replaces console.log with a
 * function that prefixes a locale timestamp, prints through the previous
 * console.log, and appends the same line to the log file.
 */
function initLogger() {
  const printToConsole = console.log;

  const logDirMissing = !fs.existsSync('./book-isbn-logs');
  if (logDirMissing) {
    fs.mkdirSync('./book-isbn-logs', { recursive: true });
  }

  logFile = fs.createWriteStream(`./book-isbn-logs/logs-thread${threadId}.log`, { flags: 'a', encoding: 'utf8' });

  console.log = (...parts) => {
    // Arguments are joined with single spaces, so non-string values are stringified by join().
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    printToConsole(line);
    logFile.write(line + '\n');
  };
}
|
|
/* ---------- axios proxy ------------ */

// Address of the local HTTP proxy that HTTPS requests are sent through.
const PROXY_URL = `http://127.0.0.1:10809`;

const httpsAgent = new HttpsProxyAgent(PROXY_URL);

// Shared axios instance used for every archive.org request in this file.
// `proxy: false` turns off axios' own proxy handling so that the agent above is
// the only proxy mechanism in play.
const myAxios = axios.create({ proxy: false, httpsAgent });
|
|
/**
 * Build the search keyword string for a book title.
 *
 * Digits are stripped unless `titleWithNumbers` is true. The text is then split
 * on whitespace and at most the first six words are joined with "+".
 * Empty tokens (from repeated spaces, or from the gap a removed number leaves
 * behind) are dropped so they neither produce "++" in the result nor count
 * toward the six-word limit.
 *
 * @param {string} text Keyword text to search for.
 * @param {boolean} titleWithNumbers Whether digits in the text are kept.
 * @returns {string} The processed keyword string.
 */
function formatKw(text, titleWithNumbers) {
  const MAX_WORDS = 6;
  const source = titleWithNumbers ? text : text.replace(/\d/g, "");
  return source
    .split(/\s+/)
    .filter(Boolean)
    .slice(0, MAX_WORDS)
    .join("+");
}
|
|
|
/**
 * Pause the calling async flow.
 *
 * @param {number} ms Delay in milliseconds, handed to setTimeout.
 * @returns {Promise<undefined>} Resolves (with no value) once the timer fires.
 */
async function sleep(ms) {
  await new Promise((wake) => setTimeout(wake, ms));
}
|
|
/**
 * Call `func` until it succeeds or the retry budget is used up.
 *
 * `func` is invoked once and then up to `maxTry` more times, waiting `delay`
 * milliseconds (via sleep) before each additional attempt. The error of the
 * final failed attempt is rethrown.
 *
 * @param {Function} func Operation to run; may return a value or a promise.
 * @param {number} [maxTry=3] Number of additional attempts after the first one.
 * @param {number} [delay=3000] Pause between attempts, in milliseconds.
 * @returns {Promise<*>} Whatever the first successful call of `func` yields.
 */
async function retry(func, maxTry = 3, delay = 3000) {
  for (let retriesLeft = maxTry; ; retriesLeft -= 1) {
    try {
      return await func();
    } catch (failure) {
      if (!(retriesLeft > 0)) {
        throw failure;
      }
      await sleep(delay);
    }
  }
}
|
|
/**
 * Search archive.org for a book by title and return its details-page URL.
 *
 * The title is turned into a keyword string with formatKw, one hit is requested
 * from the search service (through retry), and the URL is built from the first
 * hit's identifier.
 *
 * @param {{title: string}} book Book whose `title` is searched for.
 * @param {boolean} titleWithNumbers Passed to formatKw: keep digits in the title or not.
 * @returns {Promise<string>} The details-page URL, or '' when there is no usable
 *   hit or every attempt failed.
 */
async function getBookDetailPageUrl(book, titleWithNumbers) {
  const kw = formatKw(book.title, titleWithNumbers);
  const clientUrl = `https://archive.org/search?query=${kw}&sin=TXT`;
  const searchUrl = `https://archive.org/services/search/beta/page_production/?service_backend=fts&user_query=${encodeURIComponent(kw)}&hits_per_page=1&page=1&aggregations=false&client_url=${encodeURIComponent(clientUrl)}`;
  const requestOptions = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' },
  };

  console.log(`打开搜索: ${searchUrl}`);

  try {
    return await retry(async () => {
      const resp = await myAxios.get(searchUrl, requestOptions);
      const { total, hits } = resp.data.response.body.hits;
      // A reported total without an actual first hit (or a hit without an
      // identifier) is treated as "no result" instead of throwing, which would
      // only trigger pointless retries or yield ".../details/undefined".
      const identifier = hits?.[0]?.fields?.identifier;
      if (total === 0 || !identifier) {
        return '';
      }
      return `https://archive.org/details/${identifier}`;
    });
  } catch (err) {
    // Best-effort lookup: report the reason, then fall back to "not found".
    console.log(`搜索失败: ${book.title} ${err?.message ?? err}`);
    return '';
  }
}
|
|
/**
 * Fetch a book's archive.org details page and copy ISBN, publisher and date
 * from the metadata embedded in the page onto `book`.
 *
 * The page carries a JSON document in the `value` attribute of a hidden
 * `<input class="js-ia-metadata">` element. Fetching and parsing run inside
 * retry, so any failure is attempted again before giving up.
 *
 * On success `book.isbn` (the longest ISBN listed, if any), `book.publisher`
 * and `book.pubDate` are set. On failure `book.state` is set to "打开详情页失败".
 *
 * @param {object} book Book record; mutated in place. `id` and `title` are read for logging.
 * @param {string} detailPageUrl URL of the details page to fetch.
 * @returns {Promise<undefined|string>} undefined on success, '' on failure.
 */
async function openBookDetailPage(book, detailPageUrl) {
  console.log(`打开详情: ${detailPageUrl}`);

  const requestOptions = {
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36' },
  };

  try {
    await retry(async () => {
      const resp = await myAxios.get(detailPageUrl, requestOptions);
      const match = /<input class="js-ia-metadata" type="hidden" value='(.*)'\/>/.exec(resp.data);
      if (!match) {
        // Fail with a descriptive error instead of "cannot read properties of null".
        throw new Error('js-ia-metadata input not found in details page');
      }
      const { metadata } = JSON.parse(match[1]);

      // NOTE(review): a single-valued metadata field appears to be delivered as
      // a plain string rather than a one-element array - confirm against the
      // Internet Archive metadata docs. Normalising to an array keeps such
      // books from failing on a string `.sort`.
      const isbnList = [].concat(metadata.isbn ?? []).filter(Boolean);
      if (isbnList.length) {
        // Keep the longest ISBN; on ties the first one listed wins.
        book.isbn = isbnList.reduce(
          (longest, candidate) => (candidate.length > longest.length ? candidate : longest)
        );
      }
      book.publisher = metadata.publisher;
      book.pubDate = metadata.date;
    });
  } catch (err) {
    book.state = "打开详情页失败";
    console.log(`打开详情页失败: ${book.id} ${book.title} (${err?.message ?? err})`);
    return '';
  }
}
|
|
/**
 * Report whether a book record already carries an ISBN.
 *
 * @param {{isbn?: *}} book Book record to inspect.
 * @returns {*} The record's `isbn` value itself (truthy when present), not a strict boolean.
 */
function isAlreadyDownloaded(book) {
  const { isbn } = book;
  return isbn;
}
|
|
/**
 * Ask the parent thread for the next book to process.
 *
 * Posts a `{ type: 'get-book', threadId }` request on parentPort and waits for
 * the first incoming message whose `type` is 'book'. Messages of other types
 * are ignored and leave the listener attached.
 *
 * @returns {Promise<*>} Resolves with the `data` field of the 'book' message.
 */
function nextBook() {
  return new Promise((deliver) => {
    function onMessage(message) {
      if (message.type !== 'book') {
        return;
      }
      deliver(message.data);
      parentPort.removeListener('message', onMessage);
    }

    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'get-book', threadId });
  });
}
|
|
|
/**
 * Worker loop: keep requesting books from the parent thread and look up each
 * one's ISBN on its archive.org details page.
 *
 * The loop ends when the parent hands out a falsy book or when the optional
 * time limit `config.endOfTime` (in minutes since `startTime`) has elapsed.
 * Books whose `state` is already terminal are counted as skipped. Every book
 * that ends up with an ISBN is counted as a success and posted back to the
 * parent as `{ type: "book", data: book }`.
 *
 * The details URL is built directly from `book.id`; the title search in
 * getBookDetailPageUrl is not used by this loop.
 *
 * @param {Array<object>} books Receives every book handed out by the parent,
 *   appended before the time-limit check.
 * @returns {Promise<void>}
 */
async function downloadBooks(books) {
  // States that mean "nothing more to do for this book".
  const TERMINAL_STATES = ["没有搜索结果", "没有pdf或text文件", "下载完成"];

  for (;;) {
    const book = await nextBook();
    if (!book) {
      break;
    }
    books.push(book);

    if (config.endOfTime && Date.now() - startTime > 1000 * 60 * config.endOfTime) {
      // Timed exit.
      break;
    }

    bookCount++;

    if (TERMINAL_STATES.includes(book.state)) {
      skipCount++;
      continue;
    }

    console.log(`开始下载: ${book.id} ${book.title}`);

    // Open the details page and extract the ISBN.
    const detailPageUrl = `https://archive.org/details/${book.id}`;
    await openBookDetailPage(book, detailPageUrl);

    if (book.isbn) {
      // Count the success so the per-thread summary is not always zero.
      successCount++;
      parentPort.postMessage({ type: "book", data: book });
    }

    // Wait a little before the next book. The pause has to be awaited,
    // otherwise the loop continues immediately and nothing is throttled.
    await sleep(getRandomNumber(500, 1000));
  }
}
|
|
/**
 * Persist the collected books to EXCEL_FILE as a single sheet named "Sheet1".
 *
 * The sheet has a header row followed by one row per book (id, title, author,
 * pubDate, publisher, isbn). If building or writing the workbook throws, the
 * same rows are written as JSON to `<timestamp>.json` instead. Everything here
 * is synchronous.
 *
 * @param {Array<object>} books Book records to save.
 */
function saveBooks(books) {
  console.log("保存下载状态数据");

  const header = ["ID", "Title", "Author", "Year", "Publisher", "ISBN"];
  const rows = books.map((book) => [
    book.id,
    book.title,
    book.author,
    book.pubDate,
    book.publisher,
    book.isbn,
  ]);
  const data = [header, ...rows];

  try {
    // Building the workbook is inside the try so that a build failure also
    // reaches the JSON fallback below.
    const buffer = xlsx.build([{ name: "Sheet1", data }]);
    // writeFileSync is synchronous and takes no completion callback.
    fs.writeFileSync(EXCEL_FILE, buffer);
    console.log("保存完成: ", EXCEL_FILE);
  } catch (e) {
    console.error(e);
    const outfile = `${Date.now()}.json`;
    fs.writeFileSync(outfile, JSON.stringify(data));
    console.log("保存完成: " + outfile);
  }
}
|
|
|
/**
 * Render a millisecond duration as days / hours / minutes / seconds.
 *
 * The day part ("N天") is only emitted when at least one full day has elapsed;
 * hours, minutes and seconds are always present and are not zero-padded.
 *
 * @param {number} ms Duration in milliseconds.
 * @returns {string} Text such as "1天1时1分1秒" or "0时0分59秒".
 */
function msFormat(ms) {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const days = Math.floor(totalHours / 24);

  const pieces = [];
  if (days > 0) {
    pieces.push(`${days}天`);
  }
  pieces.push(`${totalHours % 24}时`, `${totalMinutes % 60}分`, `${totalSeconds % 60}秒`);
  return pieces.join("");
}
|
|
/**
 * Draw a random (non-integer) number from a range using Math.random().
 *
 * @param {number} min Lower bound.
 * @param {number} max Upper bound.
 * @returns {number} `Math.random()` scaled by `max - min` and shifted by `min`.
 */
function getRandomNumber(min, max) {
  const span = max - min;
  return Math.random() * span + min;
}
|
|
// Start time: timestamp (ms) taken when this module is evaluated.
const startTime = Date.now();

// Number of successful downloads.
let successCount = 0;

// Number of books handled.
let bookCount = 0;

// Number of skipped books: already downloaded, or not found by the search.
let skipCount = 0;
|
|
/**
 * Worker-thread entry point.
 *
 * Sets up logging, runs downloadBooks to completion, prints this thread's
 * summary line on success or reports the error via console.error, and in
 * either case closes the log file afterwards. The work is started but not
 * awaited, so this function returns immediately.
 *
 * Saving the collected books and posting them to the parent as a 'books'
 * message are not done here.
 */
function startDownload() {
  initLogger();

  const books = [];

  const run = async () => {
    try {
      await downloadBooks(books);
      console.log(`线程:${threadId}全部完成,共下载${bookCount}本,成功下载${successCount}本,跳过${skipCount}本,失败${bookCount - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    } catch (e) {
      console.error(e);
    } finally {
      logFile.close();
    }
  };

  run();
}
|
|
// Year currently being enumerated; getBooks decrements it after the last entry of codeList.
let year = 2024;

// Index into codeList of the first-letter bucket that getBooks fetches next.
let codeIndex = 0;

// First-letter buckets "A" through "Z", used for the listing's firstTitle filter.
const codeList = Array.from("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
|
|
/**
 * Request one page of the archive.org "books" collection listing, filtered by
 * first title letter and by publication year, sorted by title ascending.
 *
 * The year filter used to be hard-coded to 2023-2024, so the module-level
 * `year` never influenced the query. It is now derived from `targetYear` as the
 * window `targetYear - 1 .. targetYear`, which for 2024 produces exactly the
 * URL that was hard-coded before.
 *
 * NOTE(review): with `year` stepping down by one, consecutive windows overlap
 * by one year - confirm whether the step should be two, or whether a
 * single-year filter is wanted.
 *
 * @param {number} pageSize Hits per page.
 * @param {number} page 1-based page number.
 * @param {string} code First-letter bucket for the `firstTitle` filter.
 * @param {number} [targetYear=year] Upper bound (inclusive) of the year window.
 * @returns {Promise<object>} The axios response of the listing request.
 */
async function getBookList(pageSize, page, code, targetYear = year) {
  const filterMap = {
    year: { [targetYear - 1]: 'gte', [targetYear]: 'lte' },
    firstTitle: { [code]: 'inc' },
  };

  // Query parameters in the order the service URL originally listed them.
  const query = {
    user_query: '',
    page_type: 'collection_details',
    page_target: 'books',
    hits_per_page: pageSize,
    page,
    filter_map: JSON.stringify(filterMap),
    sort: 'titleSorter:asc',
    aggregations: 'false',
    uid: 'R:1e845903aec74dee14bd-S:8cde5bf234b86bf96a75-P:1-K:h-T:1718106108852',
  };

  // URLSearchParams percent-encodes every value, including `code`.
  const url = new URL('https://archive.org/services/search/beta/page_production/');
  for (const [key, value] of Object.entries(query)) {
    url.searchParams.set(key, String(value));
  }

  return myAxios.get(url.href);
}
|
|
/**
 * Fetch the complete listing for the current first-letter bucket
 * (`codeList[codeIndex]`), page by page, then advance to the next bucket.
 *
 * Paging continues until the number of requested hits covers the total
 * reported by the service, so the final (partial) page is included. A page that
 * still fails after retry is logged and skipped; if the very first page fails,
 * the total is unknown and the bucket is abandoned. Either way the function
 * terminates instead of re-requesting the same page forever.
 *
 * Side effects: increments `codeIndex`; when it runs past the end of
 * `codeList` it wraps to 0 and `year` is decremented.
 *
 * @returns {Promise<Array<{id: string, title: string, author: (string|undefined)}>>}
 *   The books collected for this bucket (possibly empty).
 */
async function getBooks() {
  const pageSize = 100;
  const code = codeList[codeIndex];
  const bookList = [];
  let total = 0;

  console.log(`${year}年 ${codeIndex}`);

  // Page 1 is always requested; later pages only while hits remain.
  for (let page = 1; page === 1 || pageSize * (page - 1) < total; page++) {
    console.log(`正在获取 ${year} 年 ${code} 分类 ${page} 页`);

    let resp;
    try {
      resp = await retry(() => getBookList(pageSize, page, code));
    } catch (e) {
      console.log(`获取失败:${year} 年 ${code} 分类 ${page} 页`);
    }
    if (!resp) {
      if (page === 1) {
        break; // total unknown - nothing more can be paged
      }
      continue; // skip this page, move on to the next one
    }

    // Tolerate an unexpected response shape instead of throwing.
    const { total: reportedTotal = 0, hits = [] } = resp.data?.response?.body?.hits ?? {};
    total = reportedTotal;

    for (const hit of hits) {
      const { identifier, title, creator } = hit.fields;
      const author = Array.isArray(creator) ? creator.join(", ") : creator;
      bookList.push({ id: identifier, title, author });
    }

    await sleep(getRandomNumber(300, 800));
  }

  codeIndex++;
  if (codeIndex >= codeList.length) {
    year--;
    codeIndex = 0;
  }

  return bookList;
}
|
|
// Listing refill currently in flight (null when idle). All 'get-book' handlers
// share it so that only one getBooks() call runs at a time.
let getBookPromise = null;

/**
 * Program entry point for both the main thread and the worker threads.
 *
 * Worker threads just run startDownload(). The main thread spawns
 * `config.threadSize` workers and serves their messages:
 *   - 'get-book': reply with the next book from the local queue, refilling the
 *     queue through getBooks() when it is empty. Once `year` has reached the
 *     lower limit and the queue is empty, the reply carries `undefined`.
 *   - 'book': record one finished book.
 *   - 'books': record a batch of finished books.
 * Results are saved when all workers have exited, or on SIGINT.
 */
function main() {
  if (!fs.existsSync('tmpdir')) {
    fs.mkdirSync('tmpdir', { recursive: true });
  }

  if (!isMainThread) {
    startDownload();
    return;
  }

  // Main thread: coordinate the worker threads.
  initLogger();

  const { startRow, endRow, threadSize } = config;
  console.log(`线程数:${threadSize}, 开始行:${startRow}, 结束行:${endRow}`);

  // getBooks() is no longer called once `year` has dropped to this value.
  const OLDEST_YEAR = 1950;

  let downloadCnt = 0;
  let finishThreadCnt = 0;
  let saved = false;
  const finishBooks = [];
  const books = [];

  // Print the summary and persist the results; runs at most once.
  const finish = (headline) => {
    if (saved) {
      return;
    }
    saved = true;
    // A book counts as successful when an ISBN was found, in every summary.
    successCount = finishBooks.filter((it) => it.isbn).length;
    skipCount = finishBooks.filter((it) => it.skip).length;
    console.log(`${headline},共下载${downloadCnt}本,成功下载${successCount}本,跳过${skipCount},失败${downloadCnt - skipCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    saveBooks(finishBooks);
  };

  // Take the next queued book, refilling the queue when it is empty.
  // Returns undefined when no more listings are left to fetch.
  const takeNextBook = async () => {
    while (books.length === 0 && year > OLDEST_YEAR) {
      if (!getBookPromise) {
        // The refill pushes into the queue itself, so concurrent waiters
        // neither start a second getBooks() nor push the same list twice.
        getBookPromise = getBooks()
          .then((list) => {
            books.push(...list);
          })
          .finally(() => {
            getBookPromise = null;
          });
      }
      await getBookPromise;
    }
    return books.shift();
  };

  for (let i = 0; i < threadSize; i++) {
    const worker = new Worker("./src/book-isbn-search.mjs", { workerData: {} });

    worker.on("message", async (message) => {
      if (message.type === 'book') {
        finishBooks.push(message.data);
      } else if (message.type === 'books') {
        finishBooks.push(...message.data);
      } else if (message.type === 'get-book') {
        downloadCnt++;
        let next;
        try {
          next = await takeNextBook();
        } catch (e) {
          // A failed refill must not become an unhandled rejection; the worker
          // gets `undefined` and stops.
          console.error(e);
        }
        worker.postMessage({ type: "book", data: next });
      }
    });

    // Log worker failures instead of letting them surface as an unhandled 'error' event.
    worker.on("error", (e) => {
      console.error(e);
    });

    // Completion is tracked by worker exit, so the results are saved even
    // though no worker sends a final 'books' message.
    worker.on("exit", () => {
      finishThreadCnt++;
      if (finishThreadCnt >= threadSize) {
        finish("全部线程完成");
      }
    });
  }

  // On an interrupt signal, save what has been collected so far.
  process.on('SIGINT', () => {
    finish("进程被手动结束");
    process.exit(0);
  });
}

main();
|