From 22f370322412074174cde20ecfd14ec03657ab63 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期一, 07 七月 2025 16:20:25 +0800 Subject: [PATCH] 生成数据库 --- knowledgebase/doc/doc_processor.py | 270 +++++++++++++++++++++++++++++++++++++++++++---------- 1 files changed, 219 insertions(+), 51 deletions(-) diff --git a/knowledgebase/doc/doc_processor.py b/knowledgebase/doc/doc_processor.py index 7dccb8b..b0f3b9c 100644 --- a/knowledgebase/doc/doc_processor.py +++ b/knowledgebase/doc/doc_processor.py @@ -1,65 +1,233 @@ # -*- coding: utf-8 -*- # @file: doc_processor.py # @author: lyg -# @date: 20250427 +# @date: 2025-5-13 # @version: -# @description: 澶勭悊鏂囨。锛屾彁鍙栫珷鑺備俊鎭紝鎻愬彇椤电爜淇℃伅锛屾彁鍙栧疄浣撹瘝锛屽啓鍏ュ浘鏁版嵁搴擄紙neo4j锛夈�� -from knowledgebase.db.neo4j import Neo4jHelper -from knowledgebase.doc.doc_split import DocSplit -from knowledgebase.doc.entity_recognition import EntityRecognition +# @description: 澶勭悊鏂囨。锛屾媶鍒嗘枃妗o紝灏嗘媶鍒嗗悗鐨勭珷鑺備繚瀛樺埌鏁版嵁搴撲腑銆� +from langchain_core.messages import HumanMessage +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate + +from knowledgebase.db.doc_db_models import TEntity +from knowledgebase.doc.docx_split import DocSplit import asyncio +from knowledgebase.db.doc_db_helper import doc_dbh +from knowledgebase.doc.entity_helper import get_entity_helper +from knowledgebase.doc.entity_recognition import EntityRecognition +import os.path + +from knowledgebase.doc.models import DocInfo, ParagraphInfo, DocType +from knowledgebase.llm import llm +from knowledgebase.log import Log +from knowledgebase import utils class DocProcessor: - def __init__(self, pdf_file): - self.doc_split = DocSplit(pdf_file) - self.entity_recognition = EntityRecognition() - self.neo4j = Neo4jHelper() + def __init__(self, docx_file: str): + """ + 鏂囨。澶勭悊 + :param docx_file: 瑕佸鐞嗙殑鏂囨。 + """ + Log.info(f'寮�濮嬪鐞嗘枃妗o細{docx_file}') + self.docx_file = docx_file + self.doc_type = self.get_doc_type() + self.doc_split = DocSplit(docx_file, self.doc_type) + self.entity_recognition = EntityRecognition(self.doc_type) + self.doc_id = 0 - async def gen_page_entities(self, page_info): - # 鑾峰彇椤甸潰瀹炰綋璇� - page_entities = await asyncio.to_thread(lambda: self.entity_recognition.run(page_info.text)) - page_info.entities = page_entities + def get_doc_type(self): + entity_helper = get_entity_helper() + Log.info(f'璇嗗埆鏂囨。绫诲瀷锛歿self.docx_file}') + rules = '锛沑n'.join([f'- {it}锛歿entity_helper.doc_prompt_map[it]}' for it in entity_helper.doc_prompt_map.keys()]) + msg = HumanMessage(f''' +# 鎸囦护 +璇蜂粠涓嬮潰鐨勬枃浠跺悕涓瘑鍒枃妗g被鍨嬶紝濡傛灉璇嗗埆澶辫触涓嶈杈撳嚭浠讳綍瀛楃銆� +鏂囦欢鍚嶏細{os.path.basename(self.docx_file)} +# 璇嗗埆瑙勫垯 +{rules} +# 绀轰緥 +閬ユ祴澶х翰 +''') + resp = llm.invoke([msg]) + Log.info(f'璇嗗埆缁撴灉锛歿resp.content}') + return resp.content + + async def get_tc_info(self, paragraph: ParagraphInfo): + if self.doc_type not in [DocType.tc_format]: + return '' + prompt = HumanMessagePromptTemplate.from_template(''' +# 鎸囦护 +璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆鎸囦护淇℃伅锛屽鏋滆瘑鍒け璐ヤ笉瑕佽緭鍑轰换浣曞瓧绗︺�� +鎸囦护淇℃伅鍖呮嫭锛氭寚浠ゅ悕绉般�� +# 璇嗗埆瑙勫垯 +- 鏂囨湰鍐呭涓洪仴鎺ф寚浠ゆ暟鎹煙鎴栭仴鎺ф寚浠ゅ簲鐢ㄦ暟鎹殑瀹氫箟鎻忚堪銆� +# 绾︽潫 +- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛� +- 鎸囦护鍚嶇О鍦ㄧ珷鑺傛爣棰樹腑锛屾彁鍙栨寚浠ゅ悕绉拌鍜屾枃鏈腑鐨勪弗鏍间竴鑷达紱 +- 濡傛灉娌℃湁璇嗗埆鍒版寚浠や俊鎭笉瑕佽緭鍑轰换浣曞瓧绗︼紱 +- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙i噴鎬ф枃鏈紱 +- 杈撳嚭json鏍煎紡銆� +# 绀轰緥 - 璇嗗埆鍒版寚浠� +{{ + "name": "xxx" +}} +# 绀轰緥 - 鏈瘑鍒埌鎸囦护 +"" +# 鏂囨湰鍐呭锛� +{text} +''') + chain = prompt.prompt | llm | JsonOutputParser() + resp = await chain.ainvoke({"text": paragraph.full_text}) + import json + # Log.info(f'>>>>>>鎸囦护璇嗗埆锛歕n{paragraph.full_text}') + # Log.info(f'<<<<<<鎸囦护锛歿json.dumps(resp, ensure_ascii=False)}') + return resp + + async def get_tm_pkt_info(self, paragraph: ParagraphInfo): + if self.doc_type not in [DocType.tm_outline, DocType.tm_pkt_design]: + return '' + prompt = HumanMessagePromptTemplate.from_template(''' +# 鎸囦护 +璇嗗埆閬ユ祴鍖呬俊鎭紝璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆閬ユ祴鍖呬俊鎭紝濡傛灉璇嗗埆澶辫触涓嶈杈撳嚭浠讳綍瀛楃銆� +璇嗗埆瑙勫垯锛氱珷鑺傛爣棰樹腑鍖呭惈鍖呭悕绉板拰浠e彿锛岀珷鑺傚唴瀹逛负琛ㄦ牸锛岃〃鏍间腑鍖呮嫭鍖呭ご瀹氫箟鍜屽寘鍙傛暟瀹氫箟銆� +鎻愬彇鐨勯仴娴嬪寘淇℃伅鍖呮嫭锛氬寘鍚嶇О锛屽寘浠e彿銆� +# 绾︽潫 +- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛� +- 鏂囨湰鎻忚堪鐨勫唴瀹规槸鍗曚釜閬ユ祴鍖咃紝濡傛灉鏈夊涓仴娴嬪寘鍒欎笉瑕佽緭鍑轰换浣曞瓧绗︼紱 +- 鏂囨湰缁撴瀯閫氬父鏄細鍖呭悕绉般�佷唬鍙峰拰APID(搴旂敤杩囩▼鏍囪瘑)鍦ㄥ紑澶达紙搴旂敤杩囩▼鏍囪瘑涔熸湁鍙兘鍦ㄨ〃鏍间腑锛夛紝鍚庨潰绱ф帴鐫�鏄寘澶村拰鍙傛暟瀹氫箟琛紱 +- 濡傛灉娌℃湁璇嗗埆鍒伴仴娴嬪寘淇℃伅涓嶈杈撳嚭浠讳綍瀛楃锛� +- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙i噴鎬ф枃鏈紱 +- 杈撳嚭json鏍煎紡銆� +# 绗﹀悎瑕佹眰鐨勬枃鏈粨鏋�1 +1.1.1 code xxx鍖�(APID=0x123) +```json +琛ㄦ牸鍐呭 +``` +# 绗﹀悎瑕佹眰鐨勬枃鏈粨鏋�2 +1.1.1 code xxx鍖� +```json +琛ㄦ牸鍐呭 +搴旂敤杩囩▼鏍囪瘑 +... +``` +# 绀轰緥 - 璇嗗埆鍒版暟鎹寘 +{{ + "name": "xxx鍖�", + "code": "TMS001" +}} +# 绀轰緥 - 鏈瘑鍒埌鏁版嵁鍖� +"" +# 鏂囨湰鍐呭锛� +{text} +''') + chain = prompt.prompt | llm | JsonOutputParser() + resp = await chain.ainvoke({"text": paragraph.full_text}) + return resp + + async def get_chapter_refs(self, paragraph: ParagraphInfo, toc: [str]) -> [str]: + if self.doc_type not in [DocType.tc_format]: + return '' + toc_text = '\n'.join(toc) + prompt = HumanMessagePromptTemplate.from_template(f''' +# 瑙掕壊 +浣犳槸涓�鍚嶈祫娣辩殑杞欢宸ョ▼甯堛�� +# 鎸囦护 +甯姪鎴戝畬鎴愬鏂囨湰涓紩鐢ㄥ叧绯荤殑鎶藉彇锛屽垽鏂綋鍓嶆枃鏈腑鏄惁鍖呭惈浜嗗紩鐢ㄤ俊鎭紝渚嬪鍖呭惈浠ヤ笅鍏抽敭瀛楋細鈥滆瑙�1.1鈥濄�佲�滆1.1鈥濄�佲�滃叿浣撹1.1鈥濄�佲�滆闄勫綍鈥濈瓑銆� +濡傛灉鍖呭惈寮曠敤锛屽皢寮曠敤涓庘�滅洰褰曞唴瀹光�濅腑鐨勭洰褰曟潯鐩繘琛屽尮閰嶃�� +灏嗗尮閰嶅埌鐨勭洰褰曟潯鐩緭鍑猴紝杈撳嚭鏍煎紡涓簀son鏍煎紡銆� +# 绾︽潫 +- 鏄惁鍖呭惈寮曠敤鐨勫垽鏂潯浠朵腑蹇呴』鍖呭惈寮曠敤鐩稿叧鐨勬弿杩帮紝渚嬪锛氣�滆瑙�1.1鈥濄�佲�滆1.1鈥濄�佲�滃叿浣撹1.1鈥濄�佲�滆闄勫綍鈥濈瓑锛� +- 娉ㄦ剰涓嶈鑷繁寮曠敤鑷繁锛� +- 浠呮彁鍙栫洰褰曞唴瀹逛腑鍖呭惈鐨勬潯鐩紝濡傛灉鐩綍鍐呭涓嶅寘鍚垯涓嶆彁鍙栵紱 +- 濡傛灉浠呴潬鏍囬鍙风爜鏃犳硶纭畾鐩綍鏉$洰鐨勶紝鏍规嵁鏂囨湰鍐呭鍖归厤瀵瑰簲鐨勭洰褰曟潯鐩紱 +- 杈撳嚭鐨勫唴瀹瑰繀椤绘槸鐩綍涓殑鏉$洰锛� +- 杈撳嚭json鏍煎紡锛屼笉瑕佽緭鍑轰换浣昷son浠ュ鐨勫瓧绗︺�� +# 杈撳嚭妗堜緥 +["1.1 xxx"] +# 鐩綍鍐呭锛� +{toc_text} +# 鏂囨湰鍐呭锛� +{{text}} +''') + chain = prompt.prompt | llm | JsonOutputParser() + resp = await chain.ainvoke({"text": paragraph.full_text}) + return resp + + async def gen_chapter_entities(self, paragraph: ParagraphInfo, paragraphs: [ParagraphInfo], toc: [str]): + # 鑾峰彇绔犺妭瀹炰綋璇� + entity_names_task = self.entity_recognition.run(paragraph.full_text) + # 鑾峰彇鎸囦护淇℃伅 + cmd_task = self.get_tc_info(paragraph) + # 鑾峰彇閬ユ祴鍖呬俊鎭� + pkt_task = self.get_tm_pkt_info(paragraph) + # 鑾峰彇鏂囨。寮曠敤 + refs_task = self.get_chapter_refs(paragraph, toc) + entity_names, cmd, pkt, chapter_refs = await asyncio.gather(entity_names_task, cmd_task, pkt_task, refs_task) + + Log.info(f'绔犺妭{paragraph.title_num}瀹炰綋璇嶏細{entity_names}') + Log.info(f'绔犺妭{paragraph.title_num}寮曠敤锛歿chapter_refs}') + if entity_names: + paragraph.entities = doc_dbh.get_entities_by_names(entity_names) + + if pkt: + entity = TEntity(name=pkt['code'], type='閬ユ祴鍖呴厤缃�', prompts='', doc_type='') + e = doc_dbh.get_entity(entity) + if e: + entity.id = e.id + else: + doc_dbh.add_entity(entity) + Log.info(f"鏂板Entity锛歿entity.name}锛宨d锛歿entity.id}") + paragraph.entities.append(entity) + + if cmd: + entity = TEntity(name=cmd['name'], type='鎸囦护鏍煎紡閰嶇疆', prompts='', doc_type='') + e = doc_dbh.get_entity(entity) + if e: + entity.id = e.id + else: + doc_dbh.add_entity(entity) + Log.info(f"鏂板Entity锛歿entity.name}锛宨d锛歿entity.id}") + paragraph.entities.append(entity) + # 鑾峰彇寮曠敤淇℃伅 + if chapter_refs: + for ref in chapter_refs: + _p = next(filter(lambda p: ref == p.title, self.doc_split.paragraphs), None) + if _p: + if paragraph != _p: + paragraph.refs.append(_p) def process(self): - # 鍒嗘壒骞跺彂澶勭悊锛屾瘡鎵�10椤� - batch_size = 10 - for i in range(0, len(self.doc_split.page_infos), batch_size): - batch_page_infos = self.doc_split.page_infos[i:i + batch_size] - tasks = [] - for page_info in batch_page_infos: - tasks.append(self.gen_page_entities(page_info)) - asyncio.run(asyncio.gather(*tasks)) - self.save_to_neo4j() + self.doc_split.split() + # 鍒嗘壒骞跺彂澶勭悊锛屾瘡鎵�10涓� + tasks = [] + toc = [] + for p in self.doc_split.paragraphs: + if p.title_level: + toc.append(p.title) + for paragraph in self.doc_split.paragraphs: + tasks.append(self.gen_chapter_entities(paragraph, self.doc_split.paragraphs, toc)) - def save_to_neo4j(self): + async def run(): + await asyncio.gather(*tasks) + + asyncio.run(run()) + # 淇濆瓨鍒版暟鎹簱 + self.save_to_db() + + def save_to_db(self): """ - 淇濆瓨椤靛拰椤靛疄浣撹瘝鍒皀eo4j鏁版嵁搴撱�� - - 1.姣忎竴椤典负涓�涓狽ode锛� - 2.姣忎竴涓疄浣撹瘝涓轰竴涓狽ode锛� - 3.椤靛拰瀹炰綋璇嶇洿鎺ュ缓绔嬪叧绯� 椤�->瀹炰綋璇� - :return: + 淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱銆� """ - for page_info in self.doc_split.page_infos: - # 鍒涘缓椤佃妭鐐� - page_node = self.neo4j.create_page_node(page_info) - entity_nodes = [] - for entity in page_info.entities: - # 鍒涘缓瀹炰綋璇嶈妭鐐� - entity_node = self.neo4j.create_entity_node(entity) - # 寤虹珛鍏崇郴 椤�->瀹炰綋璇� - self.neo4j.create_page_entity_relationship(page_node, entity_node) - entity_nodes.append(entity_node) - if len(entity_nodes) > 0: - for i in range(len(entity_nodes)): - prev_entity_node = entity_nodes[i] - for entity_node in entity_nodes[i + 1:]: - # 寤虹珛鍏崇郴 涓�椤典腑鐨� 瀹炰綋璇�1->瀹炰綋璇�2 - self.neo4j.create_entity_relationship(prev_entity_node, entity_node) - - -if __name__ == '__main__': - pdf_file = "D:/workspace/PythonProjects/KnowledgeBase/doc/XA-5D鏃犱汉鏈烘帰娴嬪ぇ绾诧紙鍏紑锛�111.pdf" - doc_processor = DocProcessor(pdf_file) - doc_processor.process() + Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱...') + with open(self.docx_file, 'rb') as f: + file_bytes = f.read() + md5 = utils.generate_bytes_md5(file_bytes) + doc = DocInfo(os.path.basename(self.docx_file), md5, self.doc_type, self.doc_split.paragraph_tree) + self.doc_id = doc_dbh.add_doc(doc) + for paragraph in doc.paragraphs: + doc_dbh.add_paragraph(self.doc_id, None, paragraph) + for paragraph in self.doc_split.paragraphs: + for ref_paragraph in paragraph.refs: + doc_dbh.add_paragraph_ref_link(paragraph.id, ref_paragraph.id) + Log.info(f"{paragraph.title} 寮曠敤浜�-> {ref_paragraph.title}") + Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱瀹屾垚') -- Gitblit v1.9.1