import xlsx from "node-xlsx";
|
import { Builder, Browser, until, By } from "selenium-webdriver";
|
import { Options as ChromeOptions } from "selenium-webdriver/chrome.js";
|
import proxy from "selenium-webdriver/proxy.js";
|
import axios from "axios";
|
import * as fs from "fs";
|
import { HttpsProxyAgent } from "https-proxy-agent";
|
|
/* ------------ Logging ------------ */

// Reference to the native console.log, kept so the wrapper below can still
// print to stdout.
const _log = console.log;

// Log file in the current working directory.
const logFile = fs.createWriteStream('./logs.log');

/**
 * Timestamped replacement for console.log.
 *
 * Every argument is stringified (null/undefined become an empty string),
 * the pieces are joined with a single space, prefixed with the local
 * date/time, printed through the native console.log and appended to the
 * log file.
 *
 * @param {...*} args Values to log.
 */
console.log = function (...args) {
  const message = args.map((arg) => arg ?? '').join(' ');
  const line = `${new Date().toLocaleString()} ${message}`;
  _log(line);
  logFile.write(line + '\n');
};
|
|
/* ---------- axios proxy ---------- */

// Agent that sends HTTPS requests through the local proxy on port 10809.
const httpsAgent = new HttpsProxyAgent("http://127.0.0.1:10809");

// Dedicated axios instance: axios' built-in proxy option is turned off and
// the agent above is supplied instead.
const myAxios = axios.create({ httpsAgent: httpsAgent, proxy: false });
|
|
/**
 * Read the books to download from the first sheet of the delivery spreadsheet.
 *
 * @param {number} startRow Index of the first row to read (inclusive).
 * @param {number} endRow Index of the row to stop at (exclusive).
 * @returns {object[]} One record per spreadsheet row in the requested range.
 */
function getBooksFromExcel(startRow, endRow) {
  const workSheets = xlsx.parse("【第二批二次处理后】交付清单.xlsx");
  const sheet = workSheets[0];
  const rows = sheet.data.slice(startRow, endRow);

  return rows.map((row) => ({
    id: row[0],
    isbn: row[1],
    title: row[2],
    subTitle: row[3],
    author: row[4],
    publisher: row[5],
    pubDate: row[6],
    ztf: row[7],
    // `format` is the download format stored in column 13 (the column
    // saveBooks writes it back to). Column 8 is deliberately not mapped:
    // a second `format` key for it would be overwritten by this one anyway.
    format: row[13],
    language: row[9],
    brief: row[10],
    pages: row[11],
    state: row[12],
    file: row[14],
    url: row[15],
  }));
}
|
|
/**
 * Create the Chrome WebDriver session used for scraping.
 *
 * @returns {Promise<object>} The Chrome driver, with its implicit-wait
 *   timeout already applied.
 */
async function createDriver() {
  const opts = new ChromeOptions();
  // Intended to ignore SSL errors.
  // NOTE(review): Chrome's documented switch is --ignore-certificate-errors;
  // confirm this flag actually has an effect.
  opts.addArguments("--ignore-ssl-error");
  opts.addArguments("--no-sandbox"); // disable sandbox mode
  opts.addArguments("blink-settings=imagesEnabled=false"); // do not load images
  // Route browser traffic through the local proxy.
  opts.setProxy(proxy.manual({ http: 'http://127.0.0.1:10809', https: 'http://127.0.0.1:10809' }));

  const driver = await new Builder()
    .setChromeOptions(opts)
    .forBrowser(Browser.CHROME)
    .build();

  // setTimeouts returns a promise; await it so the implicit wait is really
  // in place before the driver is handed out and so a failure surfaces here.
  await driver.manage().setTimeouts({ implicit: 10000 });
  return driver;
}
|
|
/**
 * Normalise a search keyword.
 *
 * Keeps only CJK characters in the U+4E00–U+9FA5 range, ASCII letters,
 * digits, underscores and spaces; everything else is removed.
 *
 * @param {*} text Keyword to search for. Spreadsheet cells may be empty or
 *   numeric, so null/undefined become "" and other values are stringified.
 * @returns {string} The cleaned keyword.
 */
function formatKw(text) {
  // Coerce first: calling .replace on a number or undefined would throw.
  return String(text ?? "").replace(/[^\u4e00-\u9fa5\w \d]/g, "");
}
|
|
// Module-level browser session shared by the page helpers in this file;
// created once at load time via top-level await.
const driver = await createDriver();
|
|
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms Delay in milliseconds.
 * @returns {Promise<void>} Settles once the timer has fired.
 */
async function sleep(ms) {
  return new Promise((wake) => setTimeout(wake, ms));
}
|
|
/**
 * Run an async operation, retrying after a pause whenever it throws.
 *
 * The operation is attempted once and then up to `maxTry` more times; the
 * error of the last attempt is rethrown when every attempt has failed.
 *
 * @param {Function} func Operation to run; its result is returned.
 * @param {number} [maxTry=3] Number of additional attempts after the first.
 * @param {number} [delay=3000] Pause between attempts, in milliseconds.
 * @returns {Promise<*>} Whatever `func` resolves to.
 */
async function retry(func, maxTry = 3, delay = 3000) {
  let retriesLeft = maxTry;
  for (;;) {
    try {
      return await func();
    } catch (err) {
      // Out of retries: surface the most recent failure unchanged.
      if (!(retriesLeft > 0)) {
        throw err;
      }
    }
    await sleep(delay);
    retriesLeft = retriesLeft - 1;
  }
}
|
|
/**
 * Open the archive.org search page for a book's title.
 *
 * @param {*} book Book record; its `title` is cleaned with formatKw and used
 *   as the search query.
 * @returns {Promise<boolean>} true when the page was loaded (possibly after
 *   retries), false when every attempt failed.
 */
async function openSearchPage(book) {
  const searchUrl = `https://archive.org/search?query=${formatKw(book.title)}`;
  console.log(`打开搜索: ${searchUrl}`);

  try {
    await retry(async () => {
      // Navigate the shared browser session to the search results.
      await driver.get(searchUrl);
    });
    return true;
  } catch {
    return false;
  }
}
|
|
/**
 * Check whether the current search page reports any results.
 *
 * Reads the heading of the "empty placeholder" element through the nested
 * shadow roots. If that lookup keeps failing, the page is treated as having
 * results.
 *
 * @param {*} book Book record; `state` is set when there are no results.
 * @returns {Promise<boolean>} true: results found (or undetermined),
 *   false: no results.
 */
async function checkSearchResult(book) {
  console.log(`检测搜索结果`);

  const placeholderScript = `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#content-container > empty-placeholder").shadowRoot.querySelector("div > h2").textContent`;
  const noMatchMessage = "Your search did not match any items in the Archive. Try different keywords or a more general search.";

  const inspect = async () => {
    const heading = await driver.executeScript(placeholderScript);
    const nothingFound = Boolean(heading) && heading.includes(noMatchMessage);
    if (!nothingFound) {
      return true;
    }
    // No search results for this book.
    book.state = "没有搜索结果";
    console.log(`没有搜索结果: ${book.id} ${book.title}`);
    return false;
  };

  try {
    return await retry(inspect, 2);
  } catch {
    // The placeholder could not be read: assume there are results.
    return true;
  }
}
|
|
/**
 * Find the detail-page link of a search result on the current page.
 *
 * Tries the tile inside the second `article` child of the scroller first
 * and falls back to the first matching `article` tile when that lookup
 * throws.
 *
 * @param {*} book Book record (not read by this function).
 * @returns {Promise<string>} The link's href value, or '' when it could not
 *   be read after all retries.
 */
async function findBookDetailPageUrl(book) {
  console.log(`查找详情页url`);

  const secondTileScript = `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article:nth-child(2) > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`;
  const anyTileScript = `return document.querySelector("body > app-root").shadowRoot.querySelector("#maincontent > div > router-slot > search-page").shadowRoot.querySelector("#collection-browser-container > collection-browser").shadowRoot.querySelector("#right-column > infinite-scroller").shadowRoot.querySelector("#container > article > tile-dispatcher").shadowRoot.querySelector("#container > a").attributes.href.value`;

  const readHref = async () => {
    try {
      return await driver.executeScript(secondTileScript);
    } catch (e) {
      return await driver.executeScript(anyTileScript);
    }
  };

  try {
    return await retry(readHref);
  } catch {
    return '';
  }
}
|
|
/**
 * Open a book's detail page and wait for its download-options section.
 *
 * @param {*} book Book record; `state` is set when the page cannot be opened.
 * @param {string} detailPageUrl Path of the detail page, appended to
 *   https://archive.org.
 * @param {number} [timeout=30000] Maximum time in milliseconds to wait for
 *   the download section on each attempt.
 * @returns {Promise<boolean>} true when the page opened and the section was
 *   found, false otherwise.
 */
async function openBookDetailPage(book, detailPageUrl, timeout = 30000) {
  const pageUrl = `https://archive.org${detailPageUrl}`;
  console.log(`打开详情: ${pageUrl}`);

  try {
    await retry(async () => {
      await driver.get(pageUrl);
      // An explicit timeout is required: driver.wait without one waits
      // indefinitely, so a page lacking this section would hang the run and
      // the retry / failure handling below could never trigger.
      await driver.wait(
        until.elementLocated(
          By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div`)
        ),
        timeout
      );
    });
    return true;
  } catch (e) {
    book.state = "打开详情页失败";
    console.log(`打开详情页失败: ${book.id} ${book.title}`);
    return false;
  }
}
|
|
/**
 * Pick a download link from the detail page currently open in the browser.
 *
 * Links are matched by the first line of their visible text: "pdf" is
 * preferred, otherwise the last "full text" / "ocr search text" link is used.
 *
 * @param {*} book Book record; `state` is set when no usable link exists.
 * @returns {Promise<string>} Absolute download URL, or '' when none was found.
 */
async function getDownloadUrl(book) {
  console.log(`获取下载链接`);

  // Turn a site-relative href into an absolute archive.org URL.
  const toAbsolute = (url) => {
    if (!url) {
      return '';
    }
    return url.startsWith("http") ? url : `https://archive.org${url}`;
  };

  const scanLinks = async () => {
    const links = await driver.findElements(
      By.xpath(`//*[@id="maincontent"]/div[5]/div/div/div[2]/section[2]/div/a`)
    );

    let pdfUrl = "";
    let textUrl = "";
    for (const link of links) {
      const rawText = await link.getText();
      if (!rawText) {
        continue;
      }
      const label = rawText.trim().split("\n")[0].toLowerCase();
      const href = toAbsolute(await link.getAttribute("href"));
      switch (label) {
        case "pdf":
          pdfUrl = href;
          break;
        case "full text":
        case "ocr search text":
          textUrl = href;
          break;
        default:
          break;
      }
    }

    const chosen = pdfUrl || textUrl;
    if (!chosen) {
      book.state = "没有pdf或text文件";
    }
    return chosen;
  };

  try {
    return await retry(scanLinks);
  } catch {
    book.state = "没有pdf或text文件";
    return '';
  }
}
|
|
/**
 * Download a file into ./downloads and record the outcome on the book.
 *
 * The file is named "<id> <isbn>.<ext>", where <ext> is the text after the
 * last "." of the URL. Each attempt streams the response to disk and only
 * counts as successful once the file has been completely written.
 *
 * @param {*} book Book record; `state`, `format`, `file` and `url` are
 *   updated according to the outcome.
 * @param {string} url Absolute URL of the file to download.
 * @returns {Promise<void>} Resolves when the file is fully written.
 * @throws {Error} When every attempt failed; the underlying error is
 *   attached as `cause`.
 */
async function downloadFile(book, url) {
  console.log(`下载文件: ${url}`);

  const ext = url.split(".").pop();
  const filepath = `./downloads/${book.id} ${book.isbn}.${ext}`;
  // The target directory may not exist on a fresh checkout.
  fs.mkdirSync("./downloads", { recursive: true });

  await retry(async () => {
    try {
      const response = await myAxios.get(url, { responseType: "stream" });
      await new Promise((resolve, reject) => {
        const source = response.data;
        const writer = fs.createWriteStream(filepath);
        const fail = (err) => {
          writer.destroy();
          reject(err);
        };
        // A failure on either side must reject; otherwise the attempt would
        // hang or the process would die on an unhandled 'error' event.
        source.on("error", fail);
        writer.on("error", fail);
        // 'finish' fires once all data has been flushed to the file; the
        // source's 'end' event would be too early.
        writer.on("finish", resolve);
        source.pipe(writer);
      });
    } catch (e) {
      console.error(e);
      book.state = "下载失败";
      book.url = url;
      console.log(`下载失败: ${book.id} ${book.title}`);
      // Reject with a real Error so callers get a message and a stack.
      throw new Error(`下载失败: ${url}`, { cause: e });
    }
  });

  book.state = `下载完成`;
  book.format = ext;
  book.file = filepath;
  book.url = url;
  console.log(`下载完成:${filepath}`);
}
|
|
/**
 * Process every book in order: search, open the detail page, pick a
 * download link and download the file, with random pauses between steps.
 *
 * Books whose state already says "no search results" or "no pdf/text file"
 * are skipped. A failure on one book is recorded in its `state` and the
 * loop moves on to the next book. The browser session is always terminated
 * at the end, even when the loop throws.
 *
 * @param {object[]} books Book records; each record's `state` (and, after a
 *   download attempt, its file/url fields) is updated in place.
 * @returns {Promise<void>}
 */
async function downloadBooks(books) {
  try {
    for (const book of books) {
      // Skip books already known to have no results or no pdf/text file.
      if (book.state === "没有搜索结果" || book.state === "没有pdf或text文件") {
        continue;
      }
      bookCount++;
      console.log(`开始下载: ${book.id} ${book.title}`);

      // Open the search page and run the search.
      if (!await openSearchPage(book)) {
        console.log(`打开搜索页面失败: ${book.id} ${book.title}`);
        book.state = "打开搜索页面失败";
        continue;
      }

      // Check whether the search returned anything.
      const hasBook = await checkSearchResult(book);
      if (!hasBook) {
        continue;
      }

      // Get the detail-page link.
      const detailPageUrl = await findBookDetailPageUrl(book);
      if (!detailPageUrl) {
        console.log(`获取详情页链接失败: ${book.id} ${book.title}`);
        book.state = "获取详情页链接失败";
        continue;
      }

      // Pause before opening the detail page (must be awaited to take effect).
      await sleep(getRandomNumber(3000, 10000));

      // Open the detail page. Stop here on failure: carrying on would scrape
      // the wrong page and replace the "open failed" state with the
      // permanent "no pdf/text file" state.
      if (!await openBookDetailPage(book, detailPageUrl)) {
        continue;
      }

      // Get the download link.
      const url = await getDownloadUrl(book);
      if (!url) {
        continue;
      }

      // Pause before downloading.
      await sleep(getRandomNumber(3000, 10000));

      // Download the file; one failed download must not abort the batch.
      try {
        await downloadFile(book, url);
      } catch (e) {
        console.log(`下载失败: ${book.id} ${book.title}`);
        book.state = "下载失败";
        book.url = url;
        continue;
      }
      console.log(`下载完成: ${book.id} ${book.title}`);
      successCount++;

      // Pause before the next book.
      await sleep(getRandomNumber(3000, 10000));
    }
  } finally {
    // quit() ends the whole session, so no separate close() is needed.
    await driver.quit();
  }
}
|
|
/**
 * Write the download state of the given books back into the spreadsheet.
 *
 * Rows are matched on their first column (the book id), starting after the
 * first two rows of the first sheet. Columns 12–15 (state, format, file,
 * url) of each matching row are updated. All other rows, including those
 * first two, and all other sheets are written back unchanged.
 *
 * @param {object[]} books Book records whose state should be persisted.
 */
function saveBooks(books) {
  console.log("保存下载状态数据");

  const filePath = "./【第二批二次处理后】交付清单.xlsx";
  // Rows before this index are never matched against a book id.
  const firstDataRow = 2;

  const workSheets = xlsx.parse(filePath);
  const sheet = workSheets[0];

  // Index rows by id once; the first row with a given id wins.
  const rowsById = new Map();
  sheet.data.forEach((row, index) => {
    if (index >= firstDataRow && !rowsById.has(row[0])) {
      rowsById.set(row[0], row);
    }
  });

  for (const book of books) {
    const row = rowsById.get(book.id);
    if (row) {
      row[12] = book.state;
      row[13] = book.format;
      row[14] = book.file;
      row[15] = book.url;
    }
  }

  // Rebuild from the complete parsed workbook: building from a slice of the
  // first sheet would drop the leading rows and the other sheets on save.
  const buffer = xlsx.build(workSheets);
  try {
    // Synchronous write so completion is only reported once the data is on
    // disk and a failure is not silently ignored.
    fs.writeFileSync(filePath, buffer);
    console.log("保存完成: ./【第二批二次处理后】交付清单.xlsx");
  } catch (err) {
    console.log(`保存失败: ${filePath} ${err?.stack ?? err}`);
  }
}
|
|
|
/**
 * Format a duration in milliseconds as days/hours/minutes/seconds text.
 *
 * The day part is only included when the duration is at least one day.
 *
 * @param {number} ms Duration in milliseconds.
 * @returns {string} Text such as "1天2时3分4秒" or "2时3分4秒".
 */
function msFormat(ms) {
  const totalSeconds = Math.floor(ms / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const days = Math.floor(totalHours / 24);

  const parts = [];
  if (days > 0) {
    parts.push(`${days}天`);
  }
  parts.push(`${totalHours % 24}时`);
  parts.push(`${totalMinutes % 60}分`);
  parts.push(`${totalSeconds % 60}秒`);
  return parts.join("");
}
|
|
/**
 * Return a random (non-integer) number in the range [min, max).
 *
 * @param {number} min Lower bound.
 * @param {number} max Upper bound.
 * @returns {number} The random value.
 */
function getRandomNumber(min, max) {
  const span = max - min;
  return min + Math.random() * span;
}
|
|
// Number of books processed so far (incremented by downloadBooks).
let bookCount = 0;

// Number of books downloaded successfully (incremented by downloadBooks).
let successCount = 0;

// Timestamp in milliseconds taken at start-up, used for the elapsed-time report.
const startTime = Date.now();
|
|
/**
 * Entry point: read the row range from ./config.json, load those books,
 * download them, then persist their state and close the log file.
 *
 * The state is saved and the log closed whether the run finished or failed;
 * a failure of the download loop is logged here so it does not end up as an
 * unhandled promise rejection.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const range = JSON.parse(fs.readFileSync('./config.json', 'utf8'));
  const books = getBooksFromExcel(range.startRow, range.endRow);

  try {
    await downloadBooks(books);
    console.log(`全部完成,共下载${bookCount}本,成功下载${successCount}本,失败${bookCount - successCount}本,耗时: ${msFormat(Date.now() - startTime)}。`);
  } catch (err) {
    console.log(`下载过程异常中止: ${err?.stack ?? err}`);
  } finally {
    saveBooks(books);
    logFile.close();
  }
}

await main();
|