From 6a6078c5d393bffda15e682994811468ff86963e Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期二, 30 七月 2024 01:40:34 +0800
Subject: [PATCH] 增加数据库图书ISBN补全脚本

---
 src/book-list-to-es.mjs |  133 ++++++++++++++++++++++++++++++++++++++++++++
 package.json            |    2 
 2 files changed, 135 insertions(+), 0 deletions(-)

diff --git a/package.json b/package.json
index 649275d..2dc2c80 100644
--- a/package.json
+++ b/package.json
@@ -14,8 +14,10 @@
   },
   "devDependencies": {},
   "dependencies": {
+    "@elastic/elasticsearch": "8",
     "axios": "^1.7.2",
     "cheerio": "^1.0.0-rc.12",
+    "fuse.js": "^7.0.0",
     "https-proxy-agent": "^7.0.4",
     "jssoup": "^0.0.15",
     "node-xlsx": "^0.24.0",
diff --git a/src/book-list-to-es.mjs b/src/book-list-to-es.mjs
new file mode 100644
index 0000000..eea886a
--- /dev/null
+++ b/src/book-list-to-es.mjs
@@ -0,0 +1,133 @@
+import { Client } from '@elastic/elasticsearch';
+import xlsx from "node-xlsx";
+const client = new Client({ node: 'http://localhost:9200' }); // shared Elasticsearch client; the node URL is hard-coded
+import sqlite3 from "sqlite3";
+
+// Make sure the 'books' index exists, creating it with an explicit mapping
+// (id: keyword; title, author, isbn: text) when client.indices.exists reports
+// it missing. Client errors are logged with console.error and not rethrown.
+async function createIndex() {
+  const properties = {
+    id: { type: 'keyword' },
+    title: { type: 'text' },
+    author: { type: 'text' },
+    isbn: { type: 'text' },
+  };
+  try {
+    const alreadyThere = await client.indices.exists({ index: 'books' });
+    if (!alreadyThere) {
+      const body = { mappings: { properties } };
+      await client.indices.create({ index: 'books', body });
+    }
+  } catch (err) {
+    console.error(err);
+  }
+}
+
+// Read every row of t_books from ./book-list.db as { title, author, isbn }.
+// Resolves with the row array and rejects with the sqlite error, so a failed
+// query surfaces in the awaiting caller instead of leaving it pending forever.
+function getBooksFromDb() {
+  return new Promise((resolve, reject) => {
+    const db = new sqlite3.Database("./book-list.db");
+    const sql = "SELECT Title as title,Author as author,ISBN as isbn FROM t_books";
+    db.all(sql, (err, rows) => {
+      if (err) { reject(err); } else { resolve(rows); }
+      db.close(); // the handle is released on both paths
+    });
+  });
+}
+
+// Push every row from getBooksFromDb() into the 'books' index, 1000 books per
+// bulk request, logging each response's errors flag. NOTE(review): the SELECT
+// in getBooksFromDb has no id column, so book.id looks undefined — confirm.
+async function indexBooks() {
+  const BATCH = 1000;
+  const rows = await getBooksFromDb();
+  for (let start = 0; start < rows.length; start += BATCH) {
+    const batch = rows.slice(start, start + BATCH);
+    const body = batch.flatMap((book) => [{ index: { _index: 'books', _id: book.id } }, book]);
+    try {
+      const { errors } = await client.bulk({ body });
+      console.log(errors);
+    } catch (err) {
+      console.error(err);
+    }
+  }
+}
+
+// Import pipeline, strictly in order: createIndex() first, then indexBooks().
+async function importBooToEs() {
+  for (const step of [createIndex, indexBooks]) { await step(); }
+}
+
+// Build the search query for one spreadsheet row: a bool/must on title and
+// author when the row has an author, otherwise a plain match on title.
+function buildBookQuery(book) {
+  if (!book.author) { return { match: { title: book.title } }; }
+  return {
+    bool: {
+      must: [
+        { match: { title: { query: book.title } } },
+        { match: { author: { query: book.author } } },
+      ],
+    },
+  };
+}
+
+// Look up every row of ./fictionnoisbn.xlsx (columns read as id, title, author)
+// in the 'books' index. Rows without a title, or whose best hit scores below
+// 25, are skipped; otherwise the hit's isbn/title/author are copied onto the
+// row as isbn/title2/author2 when the hit has an ISBN. All rows are then handed
+// to saveToDb, which is awaited so its failures reject this function's promise.
+async function searchBook() {
+  const books = [];
+  for (const sheet of xlsx.parse("./fictionnoisbn.xlsx")) {
+    // Plain loop instead of push(...spread): a spread call can overflow the
+    // argument limit on very large sheets.
+    for (const row of sheet.data) {
+      books.push({ id: row[0], title: row[1], author: row[2] });
+    }
+  }
+  let cnt = 0;
+  let bookCnt = 0;
+  for (const book of books) {
+    cnt++;
+    if (cnt % 1000 === 0) {
+      console.log('当前%d', cnt);
+    }
+    if (!book.title) { continue; }
+    const resp = await client.search({ index: 'books', size: 1, query: buildBookQuery(book) });
+    if ((resp.hits.max_score ?? 0) < 25) { continue; }
+    const source = resp.hits.hits[0]?._source;
+    if (source?.isbn) {
+      book.isbn = source.isbn;
+      book.title2 = source.title;
+      book.author2 = source.author;
+    }
+    bookCnt++; // counts every hit that passed the score threshold, ISBN or not
+    if (bookCnt % 1000 === 0) {
+      console.log('已匹配：%s', bookCnt);
+    }
+  }
+  console.log(bookCnt);
+  await saveToDb(books);
+}
+
+// Write every book that has an isbn to t_books in ./book-list-result.db inside one
+// transaction; OR REPLACE lets a re-run overwrite rows instead of failing on the id key.
+async function saveToDb(books) {
+  const db = new sqlite3.Database("./book-list-result.db");
+  await new Promise((resolve, reject) => db.serialize(() => {
+    db.run("CREATE TABLE IF NOT EXISTS t_books (id TEXT PRIMARY KEY, Title TEXT, Author TEXT, ISBN TEXT, Title2 TEXT, Author2 TEXT)");
+    db.run("BEGIN TRANSACTION");
+    const stmt = db.prepare("INSERT OR REPLACE INTO t_books (id, Title, Author, ISBN, Title2, Author2) VALUES (?,?,?,?,?,?)");
+    for (const b of books.filter(book => book.isbn)) { stmt.run([b.id, b.title, b.author, b.isbn, b.title2, b.author2]); }
+    stmt.finalize();
+    db.run("COMMIT");
+    db.close(err => (err ? reject(err) : resolve())); // settle only after the close completes
+  }));
+}
+
+// importBooToEs(); // index-loading step (createIndex + indexBooks), currently disabled; only the lookup below runs
+searchBook();
\ No newline at end of file

--
Gitblit v1.9.1