From c099e6662b8a6e320ac314d31eda9b40455e5aa7 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期四, 22 五月 2025 09:27:37 +0800 Subject: [PATCH] 修改指令json生成相关提示词和代码逻辑 --- knowledgebase/doc/doc_processor.py | 121 +++++++++++++++++++++++++++++++--------- 1 files changed, 94 insertions(+), 27 deletions(-) diff --git a/knowledgebase/doc/doc_processor.py b/knowledgebase/doc/doc_processor.py index d681d94..28092ed 100644 --- a/knowledgebase/doc/doc_processor.py +++ b/knowledgebase/doc/doc_processor.py @@ -5,7 +5,10 @@ # @version: # @description: 澶勭悊鏂囨。锛屾媶鍒嗘枃妗o紝灏嗘媶鍒嗗悗鐨勭珷鑺備繚瀛樺埌鏁版嵁搴撲腑銆� from langchain_core.messages import HumanMessage +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from knowledgebase.db.doc_db_models import TEntity from knowledgebase.doc.docx_split import DocSplit import asyncio from knowledgebase.db.doc_db_helper import doc_dbh @@ -13,7 +16,7 @@ from knowledgebase.doc.entity_recognition import EntityRecognition import os.path -from knowledgebase.doc.models import DocInfo, ParagraphInfo +from knowledgebase.doc.models import DocInfo, ParagraphInfo, DocType from knowledgebase.llm import llm from knowledgebase.log import Log from knowledgebase import utils @@ -27,8 +30,8 @@ """ Log.info(f'寮�濮嬪鐞嗘枃妗o細{docx_file}') self.docx_file = docx_file - self.doc_split = DocSplit(docx_file) self.doc_type = self.get_doc_type() + self.doc_split = DocSplit(docx_file, self.doc_type) self.entity_recognition = EntityRecognition(self.doc_type) self.doc_id = 0 @@ -47,15 +50,96 @@ resp = llm.invoke([msg]) Log.info(f'璇嗗埆缁撴灉锛歿resp.content}') return resp.content + def get_tc_info(self, paragraph: ParagraphInfo): + if self.doc_type not in [DocType.tc_format]: + return '' + prompt = HumanMessagePromptTemplate.from_template(''' +# 鎸囦护 +璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆鎸囦护淇℃伅锛屽鏋滆瘑鍒け璐ヤ笉瑕佽緭鍑轰换浣曞瓧绗︺�� +鎸囦护淇℃伅鍖呮嫭锛氭寚浠ゅ悕绉般�� +# 璇嗗埆瑙勫垯 +- 鏂囨湰鍐呭涓洪仴鎺ф寚浠ゆ暟鎹煙鎴栭仴鎺ф寚浠ゅ簲鐢ㄦ暟鎹殑瀹氫箟鎻忚堪銆� +# 绾︽潫 +- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛� +- 鎸囦护鍚嶇О鍦ㄧ珷鑺傛爣棰樹腑锛屾彁鍙栨寚浠ゅ悕绉拌鍜屾枃鏈腑鐨勪弗鏍间竴鑷达紱 +- 濡傛灉娌℃湁璇嗗埆鍒版寚浠や俊鎭笉瑕佽緭鍑轰换浣曞瓧绗︼紱 +- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙i噴鎬ф枃鏈紱 +- 杈撳嚭json鏍煎紡銆� +# 绀轰緥 - 璇嗗埆鍒版寚浠� +{{ + "name": "xxx" +}} +# 绀轰緥 - 鏈瘑鍒埌鏁版嵁鍖� +"" +# 鏂囨湰鍐呭锛� +{text} +''') + chain = prompt.prompt | llm | JsonOutputParser() + resp = chain.invoke({"text": paragraph.full_text}) + return resp + def get_tm_pkt_info(self, paragraph: ParagraphInfo): + if self.doc_type not in [DocType.tm_outline, DocType.tm_pkt_design]: + return '' + prompt = HumanMessagePromptTemplate.from_template(''' +# 鎸囦护 +璇嗗埆閬ユ祴鍖呬俊鎭紝璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆閬ユ祴鍖呬俊鎭紝濡傛灉璇嗗埆澶辫触涓嶈杈撳嚭浠讳綍瀛楃銆� +璇嗗埆瑙勫垯锛氱珷鑺傛爣棰樹腑鍖呭惈鍖呭悕绉板拰浠e彿锛岀珷鑺傚唴瀹逛负琛ㄦ牸锛岃〃鏍间腑鍖呮嫭鍖呭ご瀹氫箟鍜屽寘鍙傛暟瀹氫箟銆� +鎻愬彇鐨勯仴娴嬪寘淇℃伅鍖呮嫭锛氬寘鍚嶇О锛屽寘浠e彿锛孉PID銆� +# 绾︽潫 +- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛� +- 鏂囨湰鎻忚堪鐨勫唴瀹规槸鍗曚釜閬ユ祴鍖咃紝濡傛灉鏈夊涓仴娴嬪寘鍒欎笉瑕佽緭鍑轰换浣曞瓧绗︼紱 +- 鏂囨湰缁撴瀯閫氬父鏄細鍖呭悕绉般�佷唬鍙峰拰APID鍦ㄥ紑澶达紝鍚庨潰绱ф帴鐫�鏄寘澶村拰鍙傛暟瀹氫箟琛紱 +- 濡傛灉娌℃湁璇嗗埆鍒伴仴娴嬪寘淇℃伅涓嶈杈撳嚭浠讳綍瀛楃锛� +- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙i噴鎬ф枃鏈紱 +- 杈撳嚭json鏍煎紡銆� +# 澶嶅悎瑕佹眰鐨勬枃鏈粨鏋� +1.1.1 code xxx鍖�(APID=0x123) +```json +琛ㄦ牸鍐呭 +``` +# 绀轰緥 - 璇嗗埆鍒版暟鎹寘 +{{ + "name": "xxx鍖�", + "code": "xxx", + "apid": 123 +}} +# 绀轰緥 - 鏈瘑鍒埌鏁版嵁鍖� +"" +# 鏂囨湰鍐呭锛� +{text} +''') + chain = prompt.prompt | llm | JsonOutputParser() + resp = chain.invoke({"text": paragraph.full_text}) + return resp - async def gen_sect_entities(self, paragraph: ParagraphInfo): - # Log.info(f'鐢熸垚绔犺妭瀹炰綋璇嶏細{paragraph.full_text}') + async def gen_chapter_entities(self, paragraph: ParagraphInfo): # 鑾峰彇绔犺妭瀹炰綋璇� - entities = await asyncio.to_thread(lambda: self.entity_recognition.run(paragraph.full_text)) - Log.info(f'绔犺妭瀹炰綋璇嶏細{entities}') - if entities: - paragraph.entities = [next(filter(lambda x: x.name == e, entity_helper.entities), None) for e in entities] - paragraph.entities = [e for e in paragraph.entities if e] + entity_names = await asyncio.to_thread(lambda: self.entity_recognition.run(paragraph.full_text)) + Log.info(f'绔犺妭{paragraph.title_num}瀹炰綋璇嶏細{entity_names}') + if entity_names: + paragraph.entities = doc_dbh.get_entities_by_names(entity_names) + # 鑾峰彇閬ユ祴鍖呬俊鎭� + pkt = self.get_tm_pkt_info(paragraph) + if pkt: + entity = TEntity(name=pkt['code'], type='閬ユ祴鍖呴厤缃�', prompts='', doc_type='') + e = doc_dbh.get_entity(entity) + if e: + entity.id = e.id + else: + doc_dbh.add_entity(entity) + Log.info(f"鏂板Entity锛歿entity.name}锛宨d锛歿entity.id}") + paragraph.entities.append(entity) + # 鑾峰彇鎸囦护淇℃伅 + cmd = self.get_tc_info(paragraph) + if cmd: + entity = TEntity(name=cmd['name'], type='鎸囦护鏍煎紡閰嶇疆', prompts='', doc_type='') + e = doc_dbh.get_entity(entity) + if e: + entity.id = e.id + else: + doc_dbh.add_entity(entity) + Log.info(f"鏂板Entity锛歿entity.name}锛宨d锛歿entity.id}") + paragraph.entities.append(entity) def process(self): self.doc_split.split() @@ -65,7 +149,7 @@ batch_paragraphs = self.doc_split.paragraphs[i:i + batch_size] tasks = [] for paragraph in batch_paragraphs: - tasks.append(self.gen_sect_entities(paragraph)) + tasks.append(self.gen_chapter_entities(paragraph)) async def run(): await asyncio.gather(*tasks) @@ -87,20 +171,3 @@ for paragraph in doc.paragraphs: doc_dbh.add_paragraph(self.doc_id, None, paragraph) Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱瀹屾垚') - - -if __name__ == '__main__': - files = [ - r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx", - r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈哄垎绯荤粺閬ユ祴婧愬寘璁捐鎶ュ憡锛堝叕寮�锛�.docx", - r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈鸿蒋浠剁敤鎴烽渶姹傦紙鍏紑锛�.docx", - r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬪ぇ绾诧紙鍏紑锛�.docx", - r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬩俊鍙峰垎閰嶈〃锛堝叕寮�锛�.docx", - # r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx", - r"D:\workspace\PythonProjects\KnowledgeBase\doc\鎸囦护鏍煎紡(鍏紑).docx" - ] - for file in files: - doc_processor = DocProcessor(file) - doc_processor.process() - - # doc_dbh.get_docs() -- Gitblit v1.9.1