import xlsx from "node-xlsx";
|
import { Builder, Browser, until, By, WebDriver } from "selenium-webdriver";
|
import { Options as ChromeOptions } from "selenium-webdriver/chrome.js";
|
import proxy from "selenium-webdriver/proxy.js";
|
import axios from "axios";
|
import * as fs from "fs";
|
import { Worker, isMainThread, parentPort, threadId } from 'worker_threads';
|
import { HttpsProxyAgent } from "https-proxy-agent";
|
import { resolve } from "path";
|
import sqlite3 from "sqlite3";
|
import * as cheerio from 'cheerio';
|
|
/* ------------- Load configuration ------------- */

// Settings are read synchronously from ./config.json (resolved against the
// current working directory) once, when this module is evaluated.
let config = JSON.parse(fs.readFileSync('./config.json', 'utf8'));
|
|
/* ------------ Logging ------------ */

// Append-mode write stream for the log file; assigned by initLogger().
let logFile;

/**
 * Replaces `console.log` with a wrapper that prefixes each message with the
 * local date/time, prints it through the original `console.log`, and appends
 * the same line to `./book-list-logs/logs-<startRow>-<endRow>.log`.
 *
 * The log directory is created when it does not exist yet.
 */
function initLogger() {
  const printToConsole = console.log;

  const logDirMissing = !fs.existsSync('./book-list-logs');
  if (logDirMissing) {
    fs.mkdirSync('./book-list-logs', { recursive: true });
  }

  logFile = fs.createWriteStream(
    `./book-list-logs/logs-${config.startRow}-${config.endRow}.log`,
    { flags: 'a', encoding: 'utf8' },
  );

  console.log = function (...parts) {
    // All arguments are joined with a single space into one log line.
    const line = `${new Date().toLocaleString()} ${parts.join(' ')}`;
    printToConsole(line);
    logFile.write(line + '\n');
  };
}
|
|
/* ---------- axios through a local proxy ---------- */

// Agent that sends HTTPS traffic through the proxy listening on 127.0.0.1:10809.
const httpsAgent = new HttpsProxyAgent("http://127.0.0.1:10809");

// axios instance that uses the agent above; axios' own `proxy` option is
// switched off so only the agent decides how requests are routed.
const myAxios = axios.create({ httpsAgent, proxy: false });
|
|
/* ----------- sqlite3 ----------- */

// Opens ./book-list.db when the module is evaluated. The completion callback
// receives `null` on success, so only a real error is reported (previously
// every successful open printed `null` to stderr).
let db = new sqlite3.Database('./book-list.db', (err) => {
  if (err) {
    console.error(err);
  }
});
|
|
/**
 * Creates the `t_books` table when it does not exist yet.
 * ISBN is the primary key, so each ISBN can be stored only once.
 */
function initDb() {
  const createTableSql =
    "CREATE TABLE IF NOT EXISTS t_books (Title TEXT, Author TEXT, Year INTEGER, Publisher TEXT, ISBN TEXT NOT NULL UNIQUE, PRIMARY KEY (ISBN))";

  db.serialize(() => {
    db.run(createTableSql);
  });
}
|
|
/**
 * Closes the database handle if one is open and clears the module-level
 * reference, so calling this again is a no-op.
 */
function closeDb() {
  if (!db) {
    return;
  }
  db.close();
  db = null;
}
|
|
/**
 * Inserts a batch of books inside a single transaction.
 *
 * The statements are queued inside `db.serialize()` so that they execute
 * strictly in the order written: BEGIN, every INSERT, then COMMIT. Without
 * it the driver gives no ordering guarantee between queued statements.
 *
 * A failed INSERT (for example a duplicate ISBN) is skipped on purpose; only
 * successful inserts increment the module-level `downloadCnt`.
 *
 * @param {Array<{title: *, author: *, year: *, publisher: *, isbn: *}>} books records to insert
 */
function addBooks(books) {
  const insertSql = "INSERT INTO t_books (Title, Author, Year, Publisher, ISBN) VALUES (?,?,?,?,?)";

  db.serialize(() => {
    db.run("begin transaction");
    for (const book of books) {
      db.run(
        insertSql,
        [book.title, book.author, book.year, book.publisher, book.isbn],
        (err) => {
          if (!err) {
            downloadCnt++;
          }
        },
      );
    }
    db.run("commit");
  });
}
|
|
/**
 * Inserts one book row. When the insert succeeds the module-level
 * `downloadCnt` is incremented; an insert error is ignored.
 *
 * @param {{title: *, author: *, year: *, publisher: *, isbn: *}} book record to store
 */
function addBook(book) {
  const insertSql = "INSERT INTO t_books (Title, Author, Year, Publisher, ISBN) VALUES (?,?,?,?,?)";

  const onInserted = (err) => {
    if (err) {
      return;
    }
    downloadCnt++;
  };

  db.run(insertSql, [book.title, book.author, book.year, book.publisher, book.isbn], onInserted);
}
|
|
/**
|
* 创建浏览器驱动
|
* @returns {WebDriver} chrome浏览器驱动
|
*/
|
/* async function createDriver() {
|
const opts = new ChromeOptions();
|
if (config.headless) {
|
opts.addArguments("--headless");//开启无头模式
|
}
|
if (config.disableGpu) {
|
opts.addArguments("--disable-gpu");//禁止gpu渲染
|
}
|
opts.excludeSwitches(["enable-automation", 'enable-logging']); // 禁用自动化,
|
opts.addArguments("--user-data-dir=D:\\book-list-crawler-cache");
|
opts.addArguments("--disk-cache-dir=D:\\book-list-crawler-cache");
|
opts.addArguments("--ignore-ssl-error"); // 忽略ssl错误
|
opts.addArguments("--no-sandbox"); // 禁用沙盒模式
|
opts.addArguments("blink-settings=imagesEnabled=false"); //禁用图片加载
|
// proxy
|
opts.setProxy(proxy.manual({ http: 'http://127.0.0.1:10809', https: 'http://127.0.0.1:10809' }))
|
const driver = await new Builder()
|
.setChromeOptions(opts)
|
.forBrowser(Browser.CHROME)
|
.build();
|
driver.manage().setTimeouts({ implicit: 10000 });
|
return driver;
|
} */
|
|
/**
 * Resolves (with no value) after roughly `ms` milliseconds.
 *
 * @param {number} ms delay handed to setTimeout
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  await new Promise((wake) => setTimeout(wake, ms));
}
|
|
/**
 * Runs `func` and returns its result. When it throws or rejects, waits
 * `delay` milliseconds and runs it again, for at most `maxTry` additional
 * attempts (so `func` is called up to `maxTry + 1` times). The error of the
 * final attempt is rethrown.
 *
 * @param {Function} func operation to run; may be async
 * @param {number} [maxTry=3] number of retries after the first attempt
 * @param {number} [delay=3000] pause between attempts, in milliseconds
 * @returns {Promise<*>} whatever `func` resolves to
 */
async function retry(func, maxTry = 3, delay = 3000) {
  for (let retriesLeft = maxTry; ; retriesLeft--) {
    try {
      return await func();
    } catch (err) {
      // No retries left: surface the most recent failure.
      if (!(retriesLeft > 0)) {
        throw err;
      }
      await sleep(delay);
    }
  }
}
|
|
/**
 * Loads the search page for `bookName` in the WebDriver and waits (up to 10
 * seconds) for the results list element to be present. The whole sequence is
 * run through `retry` with its default settings.
 *
 * `bookName` is interpolated into the URL exactly as given.
 *
 * @param {*} bookName search keywords
 * @returns {Promise<boolean>} true when the results list appeared, false when all attempts failed
 */
async function openSearchPage(bookName) {
  const loadResults = async () => {
    const searchUrl = `https://www.campusbooks.com/books/search?keywords=${bookName}&op=Search`;
    await driver.get(searchUrl);
    const resultsList = By.xpath('//*[@id="search-results-list"]');
    await driver.wait(until.elementLocated(resultsList), 10000);
  };

  try {
    await retry(loadResults);
    return true;
  } catch {
    return false;
  }
}
|
|
/**
 * Scrapes the search-results page currently loaded in the WebDriver and posts
 * every entry that has an ISBN to the parent thread as a `book` message.
 *
 * The scrape runs through `retry` with 2 retries. If it still fails, the
 * error is printed and the function resolves to true anyway.
 *
 * @returns {Promise<boolean>} always true
 */
async function getSearchResult() {
  console.log(`检测搜索结果`);

  // [label prefix to match, text removed from the value, property on the record]
  const fieldRules = [
    ["Author:", "Author: ", "author"],
    ["ISBN 10:", "ISBN 10: ", "isbn"],
    ["ISBN 13:", "ISBN 13: ", "isbn"],
    ["Edition:", "Edition: ", "edition"],
  ];

  const scrapePage = async () => {
    const entries = await driver.findElements(By.xpath('//div[@class="col-md-8 col-xs-9 div-o"]'), 1000);
    for (const entry of entries) {
      const record = {};

      const heading = await entry.findElement(By.xpath('./h3'));
      if (heading) {
        record.title = await heading.getText();
      }

      const labels = await entry.findElements(By.xpath('./dl/dt'), 1000);
      if (labels) {
        for (const label of labels) {
          const labelText = await label.getText();
          const rule = fieldRules.find(([prefix]) => labelText.startsWith(prefix));
          if (rule) {
            const [, stripped, property] = rule;
            // Both ISBN variants write to `isbn`; the later one on the page wins.
            record[property] = labelText.replace(stripped, "");
          }
        }
      }

      // Entries without an ISBN are not reported.
      if (record.isbn) {
        parentPort.postMessage({ type: 'book', data: record });
      }
    }
    return true;
  };

  try {
    return await retry(scrapePage, 2);
  } catch (e) {
    console.error(e);
    return true;
  }
}
|
|
/**
 * Asks the parent thread for the next book name.
 *
 * Posts a `getBookName` message carrying this worker's `threadId`, then
 * resolves with the `data` of the first incoming message whose type is
 * `bookName` and whose `threadId` is truthy. The listener is removed once
 * that reply has arrived.
 *
 * @returns {Promise<*>} the `data` field of the reply
 */
function getBookName() {
  return new Promise((fulfil) => {
    const onMessage = (message) => {
      const isReply = message.type === 'bookName' && message.threadId;
      if (!isReply) {
        return;
      }
      fulfil(message.data);
      parentPort.removeListener('message', onMessage);
    };

    parentPort.on('message', onMessage);
    parentPort.postMessage({ type: 'getBookName', threadId });
  });
}
|
|
/**
 * Worker loop: repeatedly requests a book name from the parent thread,
 * fetches the campusbooks.com search page for it, parses the result entries
 * and posts every entry that has an ISBN back to the parent as a `book`
 * message. Returns when the parent replies with an empty/absent name.
 *
 * An error while handling one name is logged and the loop moves on to the
 * next name.
 *
 * @returns {Promise<void>}
 */
async function getBook() {
  // [label prefix to match, text removed from the value, property on the record]
  const fieldRules = [
    ["Author:", "Author: ", "author"],
    ["ISBN 10:", "ISBN 10: ", "isbn"],
    ["ISBN 13:", "ISBN 13: ", "isbn"],
    ["Edition:", "Edition: ", "edition"],
  ];

  for (; ;) {
    try {
      const bookName = await getBookName();
      if (!bookName) { return; }
      console.log(`获取: ${bookName}`);

      // URLSearchParams form-encodes the keywords: spaces still become '+',
      // and characters such as '&', '#', '%' or non-ASCII text can no longer
      // break the query string.
      const query = new URLSearchParams({ keywords: bookName, op: "Search" });
      // NOTE(review): this uses the default axios instance, not the proxied
      // `myAxios` created earlier in this file — confirm that is intended.
      const resp = await axios.get(`https://www.campusbooks.com/books/search?${query}`);

      const html = cheerio.load(resp.data);
      const bookDivs = html('#search-results-list > div > div.col-md-8.col-xs-9.div-o');
      for (const _bookDiv of bookDivs) {
        const bookDiv = cheerio.load(_bookDiv);
        const book = {};
        book.title = bookDiv('h3').text();

        const filedDivs = bookDiv('dl > dt');
        for (const _filedDiv of filedDivs) {
          const filedText = cheerio.load(_filedDiv).text()?.trim();
          const rule = fieldRules.find(([prefix]) => filedText?.startsWith(prefix));
          if (rule) {
            const [, stripped, property] = rule;
            // Both ISBN variants write to `isbn`; the later one on the page wins.
            book[property] = filedText.replace(stripped, "");
          }
        }

        // Entries without an ISBN cannot be stored (ISBN is the table key).
        if (book.isbn) {
          parentPort.postMessage({ type: 'book', data: book });
        }
      }

      // Pause 300-1000 ms before the next request. The promise must be
      // awaited, otherwise the loop continues immediately and the delay
      // never throttles anything.
      await sleep(getRandomNumber(300, 1000));
    } catch (e) {
      console.error(e);
    }
  }
}
|
|
|
/**
 * Formats a millisecond duration as `[<d>天]<h>时<m>分<s>秒`.
 * The day part is included only when the duration is at least one full day.
 *
 * @param {number} ms duration in milliseconds
 * @returns {string} formatted duration
 */
function msFormat(ms) {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const days = Math.floor(totalHours / 24);

  const pieces = [];
  if (days > 0) {
    pieces.push(`${days}天`);
  }
  pieces.push(`${totalHours % 24}时`, `${totalMinutes % 60}分`, `${totalSeconds % 60}秒`);
  return pieces.join("");
}
|
|
/**
 * Returns a random (non-integer) number scaled from Math.random() into the
 * range starting at `min` and extending to (but not reaching) `max`.
 *
 * @param {number} min lower bound
 * @param {number} max upper bound
 * @returns {number} random value
 */
function getRandomNumber(min, max) {
  const fraction = Math.random();
  const span = max - min;
  return fraction * span + min;
}
|
|
/**
 * One-off import: reads the first worksheet of ./76w.xlsx, skips its header
 * row, and stores every data row in the database through `addBooks`.
 *
 * Columns are read as: 0 = title, 1 = author, 2 = year, 3 = publisher,
 * 4 = ISBN. The ISBN cell may hold several comma-separated values; the
 * longest one is kept. The cell is converted to a string first because a
 * spreadsheet may store an ISBN as a number, and rows without any ISBN
 * (including completely empty rows) are skipped because the table requires
 * a non-null ISBN.
 */
function importFromExcel() {
  initDb();

  const file = './76w.xlsx';
  const workSheets = xlsx.parse(file);
  // Drop the header row; tolerate a workbook without sheets or data.
  const rows = (workSheets[0]?.data ?? []).slice(1);

  const books = [];
  for (const row of rows) {
    const [title, author, year, publisher, rawIsbn] = row ?? [];

    const isbn = String(rawIsbn ?? '')
      .split(',')
      .map((candidate) => candidate.trim())
      .filter((candidate) => candidate.length > 0)
      .sort((a, b) => b.length - a.length)[0];

    if (!isbn) {
      continue;
    }
    books.push({ title, author, year, publisher, isbn });
  }

  addBooks(books);
  closeDb();
}
|
|
// Timestamp (ms since the epoch) taken when the module is evaluated; used to
// report the elapsed time.
const startTime = Date.now();

// Number of book rows inserted successfully so far.
let downloadCnt = 0;

// Chrome WebDriver handle (stays undefined unless something assigns it).
/** @type {WebDriver} */
let driver;
|
/**
 * Worker entry point: installs the file logger, runs the `getBook` loop and,
 * whether it finished normally or failed, tells the parent thread it is done
 * (`finish` message) and closes the log file. A failure of `getBook` is
 * printed with console.error.
 */
function startTask() {
  initLogger();

  const runToCompletion = async () => {
    try {
      await getBook();
    } catch (e) {
      console.error(e);
    } finally {
      parentPort.postMessage({ type: "finish" });
      logFile.close();
    }
  };

  // Started without awaiting: startTask itself returns immediately.
  runToCompletion();
}
|
|
// Make sure the browser cache directory exists. This runs whenever the module
// is evaluated.
{
  const cacheDir = 'D:\\book-list-crawler-cache';
  const cacheDirExists = fs.existsSync(cacheDir);
  if (!cacheDirExists) {
    fs.mkdirSync(cacheDir, { recursive: true });
  }
}
|
|
/**
 * Program entry point.
 *
 * In a worker thread it just runs `startTask()`.
 *
 * In the main thread it:
 *  - initialises the database,
 *  - loads the book names from ./bookNames.txt (one per line),
 *  - starts `config.threadSize` workers and serves their messages:
 *      `getBookName` -> replies with the next name (or `undefined` when the
 *                       list is exhausted, which tells the worker to stop),
 *      `book`        -> stores the record,
 *      `finish`      -> counts finished workers; after the last one the
 *                       database is closed and a summary is printed,
 *  - on SIGINT closes the database, prints a summary and exits.
 */
function main() {
  if (!isMainThread) {
    startTask();
    return;
  }

  // Work is spread over worker threads.
  console.log(`线程数:${config.threadSize}`);
  initDb();

  let finishCnt = 0;
  // Normalised to a number so the strict comparison below also works when the
  // config file stores the value as a numeric string.
  const threadSize = Number(config.threadSize);

  // Split on LF or CRLF, trim each name, and drop blank lines. Previously
  // only the first '\r' of the file was removed, and a blank line was handed
  // out as an empty name, which a worker interprets as "no more work".
  const bookNames = fs.readFileSync('./bookNames.txt', 'utf8')
    .split(/\r?\n/)
    .map((name) => name.trim())
    .filter((name) => name.length > 0);

  for (let i = 0; i < threadSize; i++) {
    const worker = new Worker("./src/book-list-download2.mjs", { workerData: {} });

    worker.on("message", (message) => {
      if (message.type === 'book') {
        addBook(message.data);
      } else if (message.type === 'getBookName') {
        const bookName = bookNames.shift();
        if (bookName) {
          console.log(bookName, `剩于:${bookNames.length},已获取${downloadCnt}本`);
        }
        // Always reply: an undefined name is the worker's signal to stop.
        worker.postMessage({ type: "bookName", data: bookName, threadId: message.threadId });
      } else if (message.type === 'finish') {
        finishCnt++;
        if (finishCnt === threadSize) {
          closeDb();
          console.log(`共下载${downloadCnt}本,耗时: ${msFormat(Date.now() - startTime)}。`);
        }
      }
    });
  }

  process.on('SIGINT', () => {
    closeDb();
    console.log(`进程被手动结束,共下载${downloadCnt}本,耗时: ${msFormat(Date.now() - startTime)}。`);
    process.exit(0);
  });
}

// One-off Excel import; enable manually when it is needed.
// importFromExcel();

main();
|