import { Client } from '@elastic/elasticsearch';
|
import xlsx from "node-xlsx";
|
import * as fs from 'fs';
|
import * as mysql from 'mysql2';
|
const INDEX_NAME = 'zlib_books';
|
|
const client = new Client({ node: 'http://localhost:9200' });
|
let connection;
|
|
async function createIndex() {
|
try {
|
if (await client.indices.exists({ index: INDEX_NAME })) { return; }
|
await client.indices.create({
|
index: INDEX_NAME,
|
body: {
|
mappings: {
|
properties: {
|
id: { type: 'keyword' },
|
zid: { type: 'keyword' },
|
title: { type: 'text' },
|
author: { type: 'text' },
|
isbn: { type: 'text' },
|
ext: { type: 'text' },
|
file: { type: 'text' }
|
},
|
},
|
}
|
});
|
} catch (e) {
|
console.error(e);
|
}
|
}
|
|
function getBooksFromExcel(excelFile) {
|
const workSheets = xlsx.parse(excelFile);
|
const result = [];
|
for (const sheet of workSheets) {
|
const books = sheet.data;
|
// books.shift();
|
for (const row of books) {
|
const [id, isbn, title, author, ext] = row;
|
result.push({ zid: `${id}`, isbn: `${isbn ?? ''}`, title: `${title ?? ''}`, author: `${author ?? ''}`, ext: `${ext}`, file: `${id}.${ext}`.toLowerCase() });
|
}
|
}
|
return result;
|
}
|
|
let dbIdx = 0;
|
let pageSize = 300000;
|
|
function getBook100000() {
|
return new Promise((resolve, reject) => {
|
connection.query(
|
`SELECT books.zlibrary_id id, isbn.isbn,title,Author,extension, md5_reported FROM isbn LEFT JOIN books ON books.zlibrary_id=isbn.zlibrary_id LIMIT ${dbIdx * pageSize}, ${pageSize}`,
|
function (error, results, fields) {
|
if (error) throw error;
|
const bookList = results.map(row => ({ zid: `${row.id}`, isbn: `${row.isbn ?? ''}`, title: `${row.title ?? ''}`, author: `${row.author ?? ''}`, md5: `${row.md5_reported}`, ext: `${row.extension}`, file: `${row.id}.${row.extension}`.toLowerCase() }));
|
resolve(bookList);
|
dbIdx++;
|
}
|
);
|
});
|
}
|
|
async function indexBooks() {
|
// const files = [
|
// // "C:\\Users\\lyg\\Downloads\\书单z-0-100w.xlsx",
|
// // "C:\\Users\\lyg\\Downloads\\书单z-100w-200w.xlsx",
|
// // "C:\\Users\\lyg\\Downloads\\书单z-200w-300w.xlsx",
|
// // "C:\\Users\\lyg\\Downloads\\书单z-300w-400w.xlsx",
|
// "C:\\Users\\lyg\\Downloads\\书单z-400w-500w.xlsx",
|
// "C:\\Users\\lyg\\Downloads\\书单z-500w-5027926.xlsx",
|
// ];
|
// const bookList = [];
|
// for (const file of files) {
|
// const books = getBooksFromExcel(file);
|
// books.forEach(bk => bookList.push(bk));
|
// }
|
|
// const bookGroups = [];
|
// for (let i = 0; i < bookList.length; i += 10000) {
|
// bookGroups.push(bookList.slice(i, i + 10000));
|
// }
|
|
for (; ;) {
|
const bookGroup = await getBook100000();
|
if (!bookGroup.length) return;
|
const body = bookGroup.map(book => ([{ index: { _index: INDEX_NAME, _id: book.id } }, book])).flat();
|
try {
|
const response = await client.bulk({ body });
|
console.log(response.errors);
|
} catch (e) {
|
console.error(e);
|
}
|
}
|
}
|
|
async function importBooToEs() {
|
connection = mysql.createConnection({
|
host: 'localhost',
|
user: 'root',
|
password: 'xA123456',
|
database: 'fiction'
|
});
|
connection.connect();
|
await createIndex();
|
await indexBooks();
|
}
|
|
|
// const EXCEL_FILE = "D:\\书单\\【反馈客户】7月批次书单 - 已撞库.xlsx"; // 190795 libgen匹配92788本书 88920
|
// const EXCEL_FILE = "D:\\书单\\清单第二批0723-已撞库.xlsx"; // 79752 libgen匹配35177本书 9112
|
// const EXCEL_FILE = "D:\\书单\\8月\\【反馈客户】书籍_中文在线0813_已完成_2.xlsx";
|
// const EXCEL_FILE = "D:\\书单\\清单第二批0723-已撞库-zlib.xlsx"; // 79752 libgen匹配35177本书 9112
|
// const EXCEL_FILE = "D:\\书单\\【反馈客户】7月批次书单 - 已撞库-zlib.xlsx"; // 79752 libgen匹配35177本书 9112
|
/**
|
*
|
* @param {{isbn:boolean,title:boolean}} filter 过滤条件
|
* @param {{zid:number,isbn:number,title:number,author:number,md5:number,file:number}} colIdx 列索引
|
* @param {string} excelFile excel
|
*/
|
async function searchBook(filter, colIdx, excelFile) {
|
let books = [];
|
const workSheets = xlsx.parse(excelFile);
|
const header = workSheets[0].data[0];
|
for (const sheet of workSheets) {
|
books = books.concat(sheet.data.slice(1));
|
}
|
let cnt = 0;
|
let bookCnt = 0;
|
const promiseList = [];
|
for (const book of books) {
|
cnt++;
|
if (cnt % 1000 == 0) {
|
console.log('当前%d', cnt);
|
}
|
if (filter.title != null && !book[colIdx.title]) { continue; }
|
|
const query = {};
|
if (filter.isbn != null) {
|
const isbn = `${book[colIdx.isbn]}`;
|
if (isbn.includes(',')) {
|
const isbns=isbn.split(',');
|
query.terms = {
|
isbn: isbns.map(i => i.trim().toLowerCase())
|
}
|
} else {
|
query.term = {
|
isbn: isbn.toLowerCase()
|
}
|
}
|
} else if (filter.title != null) {
|
if (book[colIdx.author] != null) {
|
query.bool = {
|
must: [
|
{
|
match: {
|
title: { query: book[colIdx.title] }
|
}
|
},
|
{
|
match: {
|
author: { query: book[colIdx.author] }
|
}
|
}
|
]
|
}
|
} else {
|
query.match = { title: book[colIdx.title] }
|
}
|
} else if (filter.zid != null) {
|
query.term = {
|
zid: book[colIdx.zid]
|
};
|
}
|
|
const promise = client.search({
|
index: INDEX_NAME,
|
size: 1,
|
query
|
}).then(resp => {
|
if (resp.hits.total.value == 0) { return; }
|
// if ((resp.hits.max_score ?? 0) < 30) {
|
// return;
|
// }
|
if (colIdx.zid != null) {
|
book[colIdx.zid] = resp.hits.hits[0]?._source?.zid;
|
}
|
if (colIdx.file) {
|
book[colIdx.file] = resp.hits.hits[0]?._source?.file;
|
}
|
if (colIdx.md5) {
|
book[colIdx.md5] = resp.hits.hits[0]?._source?.md5;
|
}
|
bookCnt++;
|
if (bookCnt % 1000 == 0) {
|
console.log('已匹配:%s', bookCnt);
|
}
|
});
|
promiseList.push(promise);
|
if (promiseList.length >= 20) {
|
await Promise.all(promiseList);
|
promiseList.length = 0;
|
}
|
}
|
if (promiseList.length) {
|
await Promise.all(promiseList);
|
}
|
|
console.log(`已匹配:${bookCnt}`);
|
saveToExcel(books, header, colIdx, excelFile);
|
}
|
|
async function saveToExcel(books, header, colIdx, excelFile) {
|
// const libgenIds = fs.readFileSync("D:\\书单\\已下载libgen_id.txt", 'utf8').split('\n');
|
// const libgenIdMap = new Map();
|
// libgenIds.forEach(id => libgenIdMap.set(id, true));
|
|
const data =
|
books.filter(book => book[colIdx.file]);
|
console.log(data.length - 1);
|
data.splice(0, 0, header);
|
const sheets = [];
|
for (let i = 0; i < data.length; i += 1000000) {
|
sheets.push({ name: "Sheet" + (i / 1000000), data: data.slice(i, i + 1000000) });
|
}
|
const buffer = xlsx.build(sheets);
|
|
const notMatchData = books
|
.filter(book => !book[colIdx.file]);
|
notMatchData.splice(0, 0, header);
|
const buffer2 = xlsx.build([{ name: "Sheet1", data: notMatchData }]);
|
try {
|
fs.writeFileSync(`${excelFile.substring(0, excelFile.lastIndexOf('.'))}.zlib.xlsx`, buffer, (err) => { });
|
fs.writeFileSync(`${excelFile.substring(0, excelFile.lastIndexOf('.'))}.zlib.nomatch.xlsx`, buffer2, (err) => { });
|
} catch (e) {
|
console.error(e);
|
}
|
}
|
|
async function searchByIsbn(colIdx, excelFile) {
|
searchBook({ isbn: true }, colIdx, excelFile);
|
}
|
|
async function searchByTitle(colIdx, excelFile) {
|
searchBook({ title: true }, colIdx, excelFile);
|
}
|
async function searchByZid(colIdx, excelFile) {
|
searchBook({ zid: true }, colIdx, excelFile);
|
}
|
|
function addFile2Excel() {
|
const fileMap = new Map();
|
fs.readFileSync("zlib-files.txt", "utf-8").split("\n").forEach(line => {
|
const [id, ext] = line.split(".");
|
fileMap.set(id, line);
|
});
|
const excelFile = "D:\\书单\\8月\\书单1-2.xlsx";
|
const workSheets = xlsx.parse(excelFile);
|
for (const sheet of workSheets) {
|
sheet.data.slice(1).forEach(row => {
|
const id = row[0];
|
const file = fileMap.get(id + '');
|
row[4] = file;
|
});
|
}
|
const buffer = xlsx.build(workSheets);
|
fs.writeFileSync(`${excelFile}.file.xlsx`, buffer, (err) => { });
|
}
|
|
// importBooToEs();
|
// searchByIsbn(
|
// { isbn: 2, zid: 6, md5: 7, file: 8 },
|
// "D:\\书单\\8月\\【反馈客户】书籍_中文在线0813_已完成_3.xlsx"
|
// );
|
searchByZid(
|
{ isbn: 3, zid: 0, md5: 4, file: 5 },
|
"C:\\Users\\lyg\\Documents\\WeChat Files\\wxid_e8swvjxjvxz321\\FileStorage\\File\\2024-08\\书单-929648本.xlsx"
|
);
|
// searchByTitle(
|
// { title: 0, isbn: 2, author: 3, zid: 6, md5: 7, file: 8 },
|
// "D:\\书单\\8月\\【反馈客户】书籍_中文在线0813_已完成_3.xlsx"
|
// )
|
|
// addFile2Excel();
|