# -*- coding: utf-8 -*-
# @file: doc_processor.py
# @author: lyg
# @date: 2025-5-13
# @version:
# @description: Process a document: split the docx file into paragraphs,
#   recognize entity words per paragraph, and store everything in the
#   (MySQL) database.
#   Reconstructed from commit acde3bd32f07bf02839a21e8fe5b4e69bfca2251
#   ("docx文档拆分,文档段落实体词提取,存入mysql数据库。"); mis-encoded
#   (UTF-8-as-GBK) text restored.
from langchain_core.messages import HumanMessage

from knowledgebase.doc.docx_split import DocSplit
import asyncio
from knowledgebase.db.doc_db_helper import doc_dbh
from knowledgebase.doc.entity_helper import entity_helper
from knowledgebase.doc.entity_recognition import EntityRecognition
import os.path

from knowledgebase.doc.models import DocInfo, ParagraphInfo
from knowledgebase.llm import llm
from knowledgebase.log import Log
from knowledgebase import utils


class DocProcessor:
    def __init__(self, docx_file: str):
        """
        Document processor: splits a docx file, recognizes entity words for
        each paragraph, and saves the result to the database.

        :param docx_file: path of the document to process
        """
        Log.info(f'开始处理文档:{docx_file}')
        self.docx_file = docx_file
        self.doc_split = DocSplit(docx_file)
        # The document type is inferred from the file name via the LLM and
        # selects the entity-recognition prompt to use.
        self.doc_type = self.get_doc_type()
        self.entity_recognition = EntityRecognition(self.doc_type)
        # Database id of this document; assigned by save_to_db().
        self.doc_id = 0

    def get_doc_type(self) -> str:
        """
        Ask the LLM to classify the document type from the file name.

        :return: recognized document type; empty when recognition fails
            (the prompt instructs the model to output nothing in that case)
        """
        Log.info(f'识别文档类型:{self.docx_file}')
        # One classification rule per known document type.
        rules = ';\n'.join(f'- {name}:{prompt}'
                           for name, prompt in entity_helper.doc_prompt_map.items())
        msg = HumanMessage(f'''
# 指令
请从下面的文件名中识别文档类型,如果识别失败不要输出任何字符。
文件名:{os.path.basename(self.docx_file)}
# 识别规则
{rules}
# 示例
遥测大纲
''')
        resp = llm.invoke([msg])
        Log.info(f'识别结果:{resp.content}')
        return resp.content

    async def gen_sect_entities(self, paragraph: ParagraphInfo):
        """
        Recognize entity words for one paragraph and attach the matching
        entity objects to ``paragraph.entities`` (mutated in place).

        :param paragraph: paragraph to process
        """
        # Run the blocking recognition call in a worker thread so a batch of
        # paragraphs can be processed concurrently.
        entities = await asyncio.to_thread(
            lambda: self.entity_recognition.run(paragraph.full_text))
        Log.info(f'章节实体词:{entities}')
        if entities:
            # Map recognized names back to known entity objects (first match
            # by name wins); names with no match are dropped.
            matches = (next((x for x in entity_helper.entities if x.name == name), None)
                       for name in entities)
            paragraph.entities = [e for e in matches if e]

    def process(self):
        """
        Split the document, recognize entity words for all paragraphs in
        concurrent batches, then persist everything to the database.
        """
        self.doc_split.split()
        # Batches of 10 bound the number of in-flight LLM requests.
        batch_size = 10
        paragraphs = self.doc_split.paragraphs
        for start in range(0, len(paragraphs), batch_size):
            batch = paragraphs[start:start + batch_size]

            # Pass the batch explicitly instead of closing over a loop
            # variable, so the coroutine cannot be bitten by late binding.
            async def run_batch(items):
                await asyncio.gather(*(self.gen_sect_entities(p) for p in items))

            asyncio.run(run_batch(batch))
        # Persist paragraphs and paragraph/entity relations.
        self.save_to_db()

    def save_to_db(self):
        """
        Save the document, its paragraph tree, and the paragraph/entity-word
        relations to the database. Sets ``self.doc_id``.
        """
        Log.info('保存段落和段落实体词关系到数据库...')
        with open(self.docx_file, 'rb') as f:
            file_bytes = f.read()
        # MD5 of the raw file identifies this document version in the DB.
        md5 = utils.generate_bytes_md5(file_bytes)
        doc = DocInfo(os.path.basename(self.docx_file), md5, self.doc_type,
                      self.doc_split.paragraph_tree)
        self.doc_id = doc_dbh.add_doc(doc)
        for paragraph in doc.paragraphs:
            doc_dbh.add_paragraph(self.doc_id, None, paragraph)
        Log.info('保存段落和段落实体词关系到数据库完成')


if __name__ == '__main__':
    files = [
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机分系统遥测源包设计报告(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机软件用户需求(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机遥测大纲(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机遥测信号分配表(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机指令格式与编码定义(公开).docx",
        r"D:\workspace\PythonProjects\KnowledgeBase\doc\指令格式(公开).docx",
    ]
    for file in files:
        doc_processor = DocProcessor(file)
        doc_processor.process()

    # doc_dbh.get_docs()