# -*- coding: utf-8 -*-
# @file: doc_processor.py
# @author: lyg
# @date: 2025-5-13
# @version:
# @description: Process a document: split the docx file into paragraphs,
#   recognize entity words per paragraph, and store everything in the
#   (MySQL) database.
#   Reconstructed from commit acde3bd32f07bf02839a21e8fe5b4e69bfca2251
#   ("docx文档拆分,文档段落实体词提取,存入mysql数据库。"); mis-encoded
#   (UTF-8-as-GBK) text restored.
from langchain_core.messages import HumanMessage

from knowledgebase.doc.docx_split import DocSplit
import asyncio
from knowledgebase.db.doc_db_helper import doc_dbh
from knowledgebase.doc.entity_helper import entity_helper
from knowledgebase.doc.entity_recognition import EntityRecognition
import os.path

from knowledgebase.doc.models import DocInfo, ParagraphInfo
from knowledgebase.llm import llm
from knowledgebase.log import Log
from knowledgebase import utils


class DocProcessor:
    def __init__(self, docx_file: str):
        """
        Document processor: splits a docx file, recognizes entity words for
        each paragraph, and saves the result to the database.

        :param docx_file: path of the document to process
        """
        Log.info(f'开始处理文档:{docx_file}')
        self.docx_file = docx_file
        self.doc_split = DocSplit(docx_file)
        # The document type is inferred from the file name via the LLM and
        # selects the entity-recognition prompt to use.
        self.doc_type = self.get_doc_type()
        self.entity_recognition = EntityRecognition(self.doc_type)
        # Database id of this document; assigned by save_to_db().
        self.doc_id = 0

    def get_doc_type(self) -> str:
        """
        Ask the LLM to classify the document type from the file name.

        :return: recognized document type; empty when recognition fails
            (the prompt instructs the model to output nothing in that case)
        """
        Log.info(f'识别文档类型:{self.docx_file}')
        # One classification rule per known document type.
        rules = ';\n'.join(f'- {name}:{prompt}'
                           for name, prompt in entity_helper.doc_prompt_map.items())
        msg = HumanMessage(f'''
# 指令
请从下面的文件名中识别文档类型,如果识别失败不要输出任何字符。
文件名:{os.path.basename(self.docx_file)}
# 识别规则
{rules}
# 示例
遥测大纲
''')
        resp = llm.invoke([msg])
        Log.info(f'识别结果:{resp.content}')
        return resp.content

    async def gen_sect_entities(self, paragraph: ParagraphInfo):
        """
        Recognize entity words for one paragraph and attach the matching
        entity objects to ``paragraph.entities`` (mutated in place).

        :param paragraph: paragraph to process
        """
        # Run the blocking recognition call in a worker thread so a batch of
        # paragraphs can be processed concurrently.
        entities = await asyncio.to_thread(
            lambda: self.entity_recognition.run(paragraph.full_text))
        Log.info(f'章节实体词:{entities}')
        if entities:
            # Map recognized names back to known entity objects (first match
            # by name wins); names with no match are dropped.
            matches = (next((x for x in entity_helper.entities if x.name == name), None)
                       for name in entities)
            paragraph.entities = [e for e in matches if e]

    def process(self):
        """
        Split the document, recognize entity words for all paragraphs in
        concurrent batches, then persist everything to the database.
        """
        self.doc_split.split()
        # Batches of 10 bound the number of in-flight LLM requests.
        batch_size = 10
        paragraphs = self.doc_split.paragraphs
        for start in range(0, len(paragraphs), batch_size):
            batch = paragraphs[start:start + batch_size]

            # Pass the batch explicitly instead of closing over a loop
            # variable, so the coroutine cannot be bitten by late binding.
            async def run_batch(items):
                await asyncio.gather(*(self.gen_sect_entities(p) for p in items))

            asyncio.run(run_batch(batch))
        # Persist paragraphs and paragraph/entity relations.
        self.save_to_db()

    def save_to_db(self):
        """
        Save the document, its paragraph tree, and the paragraph/entity-word
        relations to the database. Sets ``self.doc_id``.
        """
        Log.info('保存段落和段落实体词关系到数据库...')
        with open(self.docx_file, 'rb') as f:
            file_bytes = f.read()
        # MD5 of the raw file identifies this document version in the DB.
        md5 = utils.generate_bytes_md5(file_bytes)
        doc = DocInfo(os.path.basename(self.docx_file), md5, self.doc_type,
                      self.doc_split.paragraph_tree)
        self.doc_id = doc_dbh.add_doc(doc)
        for paragraph in doc.paragraphs:
            doc_dbh.add_paragraph(self.doc_id, None, paragraph)
        Log.info('保存段落和段落实体词关系到数据库完成')


if __name__ == '__main__':
    files = [
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机分系统遥测源包设计报告(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机软件用户需求(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机遥测大纲(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机遥测信号分配表(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机指令格式与编码定义(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\指令格式(公开).docx",
    ]
    for file in files:
        doc_processor = DocProcessor(file)
        doc_processor.process()

    # doc_dbh.get_docs()