/**
 * book-list-to-es.mjs - fills in missing ISBNs for a list of books.
 *
 * Recovered from patch 6a6078c5d393bffda15e682994811468ff86963e (subject,
 * translated: "add ISBN completion script for the books database"), which
 * created src/book-list-to-es.mjs and added the dependencies
 * "@elastic/elasticsearch": "8" and "fuse.js": "^7.0.0" to package.json.
 *
 * Two workflows live in this file:
 *   - importBooToEs(): copies t_books from ./book-list.db into the
 *     Elasticsearch index "books".
 *   - searchBook(): reads ./fictionnoisbn.xlsx, looks every row up in that
 *     index and stores the rows that received an ISBN in
 *     ./book-list-result.db.
 */
import { Client } from '@elastic/elasticsearch';
import xlsx from 'node-xlsx';
import sqlite3 from 'sqlite3';

const client = new Client({ node: 'http://localhost:9200' });

/** Name of the Elasticsearch index that holds the reference book list. */
const INDEX_NAME = 'books';
/** Number of books sent to Elasticsearch per bulk request. */
const BULK_BATCH_SIZE = 1000;
/** A progress line is printed every time a counter reaches a multiple of this. */
const PROGRESS_INTERVAL = 1000;
/** Search results whose max_score is below this value are ignored. */
const MIN_MATCH_SCORE = 25;

/**
 * Creates the "books" index with its field mappings unless it already exists.
 *
 * Failures are logged and not rethrown, so a caller continues even when the
 * index could not be created.
 *
 * @returns {Promise<void>}
 */
async function createIndex() {
  try {
    if (await client.indices.exists({ index: INDEX_NAME })) {
      return;
    }
    await client.indices.create({
      index: INDEX_NAME,
      mappings: {
        properties: {
          id: { type: 'keyword' },
          title: { type: 'text' },
          author: { type: 'text' },
          isbn: { type: 'text' },
        },
      },
    });
  } catch (e) {
    console.error(e);
  }
}

/**
 * Reads title, author and ISBN of every row of t_books in ./book-list.db.
 *
 * @returns {Promise<Array<{title: *, author: *, isbn: *}>>} The selected rows.
 *   The promise rejects when the query fails.
 */
function getBooksFromDb() {
  return new Promise((resolve, reject) => {
    const db = new sqlite3.Database('./book-list.db');
    db.all('SELECT Title as title,Author as author,ISBN as isbn FROM t_books', (err, rows) => {
      db.close();
      if (err) {
        // Settle the promise on failure as well; otherwise the caller would
        // wait forever.
        reject(err);
        return;
      }
      resolve(rows);
    });
  });
}

/**
 * Sends every book returned by getBooksFromDb() to the "books" index in bulk
 * requests of BULK_BATCH_SIZE books.
 *
 * A failing bulk request is logged and the next batch is still attempted.
 *
 * @returns {Promise<void>}
 */
async function indexBooks() {
  const books = await getBooksFromDb();
  for (let start = 0; start < books.length; start += BULK_BATCH_SIZE) {
    // NOTE(review): the SELECT in getBooksFromDb() returns no id column, so
    // book.id is undefined here; presumably Elasticsearch then generates the
    // document ids itself - confirm whether t_books has an id to select.
    const operations = books
      .slice(start, start + BULK_BATCH_SIZE)
      .flatMap((book) => [{ index: { _index: INDEX_NAME, _id: book.id } }, book]);
    try {
      const response = await client.bulk({ operations });
      // `errors` is the flag of the bulk response; it is logged once per batch.
      console.log(response.errors);
    } catch (e) {
      console.error(e);
    }
  }
}

/**
 * Creates the index (if needed) and then imports the database books into it.
 *
 * @returns {Promise<void>}
 */
async function importBooToEs() {
  await createIndex();
  await indexBooks();
}

/**
 * Reads the books to look up from every sheet of a workbook.
 *
 * Columns 0, 1 and 2 of each row are taken as id, title and author.
 * NOTE(review): every row is used, including a header row if the sheets have
 * one - confirm against the workbook.
 *
 * @param {string} path - Path of the xlsx file.
 * @returns {Array<{id: *, title: *, author: *}>}
 */
function loadBooksFromWorkbook(path) {
  return xlsx
    .parse(path)
    .flatMap((sheet) => sheet.data.map((row) => ({ id: row[0], title: row[1], author: row[2] })));
}

/**
 * Builds the search query for one book: title and author must both match when
 * the book has an author, otherwise only the title is matched.
 *
 * @param {{title: *, author: *}} book
 * @returns {object} Elasticsearch query clause.
 */
function buildBookQuery(book) {
  if (!book.author) {
    return { match: { title: book.title } };
  }
  return {
    bool: {
      must: [
        { match: { title: { query: book.title } } },
        { match: { author: { query: book.author } } },
      ],
    },
  };
}

/**
 * Looks up every book of ./fictionnoisbn.xlsx in the "books" index, copies
 * ISBN, title and author of a sufficiently good top hit onto the book
 * (as isbn, title2 and author2), and finally stores the result via saveToDb().
 *
 * @returns {Promise<void>} Settles once the results have been written.
 */
async function searchBook() {
  const books = loadBooksFromWorkbook('./fictionnoisbn.xlsx');
  let cnt = 0;
  let bookCnt = 0;
  for (const book of books) {
    cnt++;
    if (cnt % PROGRESS_INTERVAL === 0) {
      // The message was stored as mojibake (UTF-8 decoded as GBK); restored.
      console.log('当前%d', cnt);
    }
    if (!book.title) {
      continue;
    }
    const resp = await client.search({
      index: INDEX_NAME,
      size: 1,
      query: buildBookQuery(book),
    });
    if ((resp.hits.max_score ?? 0) < MIN_MATCH_SCORE) {
      continue;
    }
    const source = resp.hits.hits[0]?._source;
    if (source?.isbn) {
      book.isbn = source.isbn;
      book.title2 = source.title;
      book.author2 = source.author;
    }
    // Counts every hit above the score threshold, whether or not it carried
    // an ISBN.
    bookCnt++;
    if (bookCnt % PROGRESS_INTERVAL === 0) {
      // The message was stored as mojibake (UTF-8 decoded as GBK); restored.
      console.log('已匹配：%s', bookCnt);
    }
  }
  console.log(bookCnt);
  await saveToDb(books);
}

/**
 * Writes every book that has an ISBN into t_books of ./book-list-result.db,
 * creating the table when it does not exist yet. All rows are inserted in one
 * transaction, which is rolled back if any step fails.
 *
 * The statement is a plain INSERT, so a row whose id is already present in the
 * table makes the whole call fail.
 *
 * @param {Array<object>} books - Books as produced by searchBook().
 * @returns {Promise<void>} Resolves after the commit and the close of the
 *   database; rejects with the first error that occurred.
 */
function saveToDb(books) {
  const matched = books.filter((book) => book.isbn);
  return new Promise((resolve, reject) => {
    const db = new sqlite3.Database('./book-list-result.db', (openErr) => {
      if (openErr) {
        reject(openErr);
        return;
      }

      let firstError = null;
      let finishing = false;
      const note = (err) => {
        if (err && !firstError) {
          firstError = err;
        }
      };
      // Ends the transaction, closes the database and settles the promise.
      const finish = () => {
        if (finishing) {
          return;
        }
        finishing = true;
        db.run(firstError ? 'ROLLBACK' : 'COMMIT', (endErr) => {
          note(endErr);
          db.close((closeErr) => {
            note(closeErr);
            if (firstError) {
              reject(firstError);
            } else {
              resolve();
            }
          });
        });
      };

      db.serialize(() => {
        db.run(
          'CREATE TABLE IF NOT EXISTS t_books (id TEXT PRIMARY KEY, Title TEXT, Author TEXT, ISBN TEXT, Title2 TEXT, Author2 TEXT)',
          note,
        );
        db.run('BEGIN TRANSACTION', note);
        const stmt = db.prepare(
          'INSERT INTO t_books (id, Title, Author, ISBN, Title2, Author2) VALUES (?,?,?,?,?,?)',
          (prepareErr) => {
            if (prepareErr) {
              // When preparing fails, the calls queued on the statement may
              // never invoke their callbacks, so finish from here.
              note(prepareErr);
              finish();
            }
          },
        );
        for (const book of matched) {
          stmt.run([book.id, book.title, book.author, book.isbn, book.title2, book.author2], note);
        }
        stmt.finalize((finalizeErr) => {
          note(finalizeErr);
          finish();
        });
      });
    });
  });
}

// Run importBooToEs() once to fill the index before running searchBook().
// importBooToEs();
try {
  await searchBook();
} catch (err) {
  console.error(err);
  process.exitCode = 1;
}