From 22f370322412074174cde20ecfd14ec03657ab63 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: Mon, 07 Jul 2025 16:20:25 +0800
Subject: [PATCH] 生成数据库

---
 knowledgebase/doc/doc_processor.py |  199 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 163 insertions(+), 36 deletions(-)

diff --git a/knowledgebase/doc/doc_processor.py b/knowledgebase/doc/doc_processor.py
index d681d94..b0f3b9c 100644
--- a/knowledgebase/doc/doc_processor.py
+++ b/knowledgebase/doc/doc_processor.py
@@ -5,15 +5,18 @@
 # @version: 
 # @description: 澶勭悊鏂囨。锛屾媶鍒嗘枃妗ｏ紝灏嗘媶鍒嗗悗鐨勭珷鑺備繚瀛樺埌鏁版嵁搴撲腑銆�
 from langchain_core.messages import HumanMessage
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 
+from knowledgebase.db.doc_db_models import TEntity
 from knowledgebase.doc.docx_split import DocSplit
 import asyncio
 from knowledgebase.db.doc_db_helper import doc_dbh
-from knowledgebase.doc.entity_helper import entity_helper
+from knowledgebase.doc.entity_helper import get_entity_helper
 from knowledgebase.doc.entity_recognition import EntityRecognition
 import os.path
 
-from knowledgebase.doc.models import DocInfo, ParagraphInfo
+from knowledgebase.doc.models import DocInfo, ParagraphInfo, DocType
 from knowledgebase.llm import llm
 from knowledgebase.log import Log
 from knowledgebase import utils
@@ -27,12 +30,13 @@
         """
         Log.info(f'寮�濮嬪鐞嗘枃妗ｏ細{docx_file}')
         self.docx_file = docx_file
-        self.doc_split = DocSplit(docx_file)
         self.doc_type = self.get_doc_type()
+        self.doc_split = DocSplit(docx_file, self.doc_type)
         self.entity_recognition = EntityRecognition(self.doc_type)
         self.doc_id = 0
 
     def get_doc_type(self):
+        entity_helper = get_entity_helper()
         Log.info(f'璇嗗埆鏂囨。绫诲瀷锛歿self.docx_file}')
         rules = '锛沑n'.join([f'- {it}锛歿entity_helper.doc_prompt_map[it]}' for it in entity_helper.doc_prompt_map.keys()])
         msg = HumanMessage(f'''
@@ -48,29 +52,165 @@
         Log.info(f'璇嗗埆缁撴灉锛歿resp.content}')
         return resp.content
 
-    async def gen_sect_entities(self, paragraph: ParagraphInfo):
-        # Log.info(f'鐢熸垚绔犺妭瀹炰綋璇嶏細{paragraph.full_text}')
+    async def get_tc_info(self, paragraph: ParagraphInfo):
+        """Extract the instruction name that `paragraph` defines, via the LLM.
+
+        Returns '' unless doc_type is DocType.tc_format; otherwise the parsed JSON reply.
+        """
+        if self.doc_type not in [DocType.tc_format]:
+            return ''
+        prompt = HumanMessagePromptTemplate.from_template('''
+# 鎸囦护
+璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆鎸囦护淇℃伅锛屽鏋滆瘑鍒け璐ヤ笉瑕佽緭鍑轰换浣曞瓧绗︺��
+鎸囦护淇℃伅鍖呮嫭锛氭寚浠ゅ悕绉般��
+# 璇嗗埆瑙勫垯
+- 鏂囨湰鍐呭涓洪仴鎺ф寚浠ゆ暟鎹煙鎴栭仴鎺ф寚浠ゅ簲鐢ㄦ暟鎹殑瀹氫箟鎻忚堪銆�
+# 绾︽潫
+- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛�
+- 鎸囦护鍚嶇О鍦ㄧ珷鑺傛爣棰樹腑锛屾彁鍙栨寚浠ゅ悕绉拌鍜屾枃鏈腑鐨勪弗鏍间竴鑷达紱
+- 濡傛灉娌℃湁璇嗗埆鍒版寚浠や俊鎭笉瑕佽緭鍑轰换浣曞瓧绗︼紱
+- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙ｉ噴鎬ф枃鏈紱
+- 杈撳嚭json鏍煎紡銆�
+# 绀轰緥 - 璇嗗埆鍒版寚浠�
+{{
+    "name": "xxx"
+}}
+# 绀轰緥 - 鏈瘑鍒埌鎸囦护
+""
+# 鏂囨湰鍐呭锛�
+{text}
+''')
+        chain = prompt.prompt | llm | JsonOutputParser()
+        return await chain.ainvoke({"text": paragraph.full_text})
+
+    async def get_tm_pkt_info(self, paragraph: ParagraphInfo):
+        """Return the LLM-parsed telemetry packet JSON (name, code) for `paragraph`, or '' for other doc types."""
+        if self.doc_type not in (DocType.tm_outline, DocType.tm_pkt_design):
+            return ''
+        prompt = HumanMessagePromptTemplate.from_template('''
+# 鎸囦护
+璇嗗埆閬ユ祴鍖呬俊鎭紝璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆閬ユ祴鍖呬俊鎭紝濡傛灉璇嗗埆澶辫触涓嶈杈撳嚭浠讳綍瀛楃銆�
+璇嗗埆瑙勫垯锛氱珷鑺傛爣棰樹腑鍖呭惈鍖呭悕绉板拰浠ｅ彿锛岀珷鑺傚唴瀹逛负琛ㄦ牸锛岃〃鏍间腑鍖呮嫭鍖呭ご瀹氫箟鍜屽寘鍙傛暟瀹氫箟銆�
+鎻愬彇鐨勯仴娴嬪寘淇℃伅鍖呮嫭锛氬寘鍚嶇О锛屽寘浠ｅ彿銆�
+# 绾︽潫
+- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛�
+- 鏂囨湰鎻忚堪鐨勫唴瀹规槸鍗曚釜閬ユ祴鍖咃紝濡傛灉鏈夊涓仴娴嬪寘鍒欎笉瑕佽緭鍑轰换浣曞瓧绗︼紱
+- 鏂囨湰缁撴瀯閫氬父鏄細鍖呭悕绉般�佷唬鍙峰拰APID(搴旂敤杩囩▼鏍囪瘑)鍦ㄥ紑澶达紙搴旂敤杩囩▼鏍囪瘑涔熸湁鍙兘鍦ㄨ〃鏍间腑锛夛紝鍚庨潰绱ф帴鐫�鏄寘澶村拰鍙傛暟瀹氫箟琛紱
+- 濡傛灉娌℃湁璇嗗埆鍒伴仴娴嬪寘淇℃伅涓嶈杈撳嚭浠讳綍瀛楃锛�
+- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙ｉ噴鎬ф枃鏈紱
+- 杈撳嚭json鏍煎紡銆�
+# 绗﹀悎瑕佹眰鐨勬枃鏈粨鏋�1
+1.1.1 code xxx鍖�(APID=0x123)
+```json
+琛ㄦ牸鍐呭
+``` 
+# 绗﹀悎瑕佹眰鐨勬枃鏈粨鏋�2
+1.1.1 code xxx鍖�
+```json
+琛ㄦ牸鍐呭
+搴旂敤杩囩▼鏍囪瘑
+...
+``` 
+# 绀轰緥 - 璇嗗埆鍒版暟鎹寘
+{{
+    "name": "xxx鍖�",
+    "code": "TMS001"
+}}
+# 绀轰緥 - 鏈瘑鍒埌鏁版嵁鍖�
+""
+# 鏂囨湰鍐呭锛�
+{text}
+''')
+        chain = prompt.prompt | llm | JsonOutputParser()
+        return await chain.ainvoke({"text": paragraph.full_text})
+
+    async def get_chapter_refs(self, paragraph: ParagraphInfo, toc: list[str]) -> list[str]:
+        """Ask the LLM which entries of `toc` are cited by `paragraph`; [] unless doc_type is tc_format."""
+        if self.doc_type not in [DocType.tc_format]:
+            return []
+        prompt = HumanMessagePromptTemplate.from_template('''
+# 瑙掕壊
+浣犳槸涓�鍚嶈祫娣辩殑杞欢宸ョ▼甯堛��
+# 鎸囦护
+甯姪鎴戝畬鎴愬鏂囨湰涓紩鐢ㄥ叧绯荤殑鎶藉彇锛屽垽鏂綋鍓嶆枃鏈腑鏄惁鍖呭惈浜嗗紩鐢ㄤ俊鎭紝渚嬪鍖呭惈浠ヤ笅鍏抽敭瀛楋細鈥滆瑙�1.1鈥濄�佲�滆1.1鈥濄�佲�滃叿浣撹1.1鈥濄�佲�滆闄勫綍鈥濈瓑銆�
+濡傛灉鍖呭惈寮曠敤锛屽皢寮曠敤涓庘�滅洰褰曞唴瀹光�濅腑鐨勭洰褰曟潯鐩繘琛屽尮閰嶃��
+灏嗗尮閰嶅埌鐨勭洰褰曟潯鐩緭鍑猴紝杈撳嚭鏍煎紡涓簀son鏍煎紡銆�
+# 绾︽潫
+- 鏄惁鍖呭惈寮曠敤鐨勫垽鏂潯浠朵腑蹇呴』鍖呭惈寮曠敤鐩稿叧鐨勬弿杩帮紝渚嬪锛氣�滆瑙�1.1鈥濄�佲�滆1.1鈥濄�佲�滃叿浣撹1.1鈥濄�佲�滆闄勫綍鈥濈瓑锛�
+- 娉ㄦ剰涓嶈鑷繁寮曠敤鑷繁锛�
+- 浠呮彁鍙栫洰褰曞唴瀹逛腑鍖呭惈鐨勬潯鐩紝濡傛灉鐩綍鍐呭涓嶅寘鍚垯涓嶆彁鍙栵紱
+- 濡傛灉浠呴潬鏍囬鍙风爜鏃犳硶纭畾鐩綍鏉＄洰鐨勶紝鏍规嵁鏂囨湰鍐呭鍖归厤瀵瑰簲鐨勭洰褰曟潯鐩紱
+- 杈撳嚭鐨勫唴瀹瑰繀椤绘槸鐩綍涓殑鏉＄洰锛�
+- 杈撳嚭json鏍煎紡锛屼笉瑕佽緭鍑轰换浣昷son浠ュ鐨勫瓧绗︺��
+# 杈撳嚭妗堜緥
+["1.1 xxx"]
+# 鐩綍鍐呭锛�
+{toc_text}
+# 鏂囨湰鍐呭锛�
+{text}
+''')
+        chain = prompt.prompt | llm | JsonOutputParser()
+        # The TOC is a template variable, not f-string text, so braces in titles are never parsed as placeholders.
+        return await chain.ainvoke({"text": paragraph.full_text, "toc_text": '\n'.join(toc)})
+
+    async def gen_chapter_entities(self, paragraph: ParagraphInfo, paragraphs: list[ParagraphInfo], toc: list[str]):
+        """Attach recognized entities and chapter references to `paragraph` in place.
+
+        Entity recognition and the instruction, telemetry-packet and reference
+        extractors run concurrently. `paragraphs` is accepted but unused; `toc`
+        is forwarded to get_chapter_refs.
+        """
         # 鑾峰彇绔犺妭瀹炰綋璇�
-        entities = await asyncio.to_thread(lambda: self.entity_recognition.run(paragraph.full_text))
-        Log.info(f'绔犺妭瀹炰綋璇嶏細{entities}')
-        if entities:
-            paragraph.entities = [next(filter(lambda x: x.name == e, entity_helper.entities), None) for e in entities]
-            paragraph.entities = [e for e in paragraph.entities if e]
+        entity_names_task = self.entity_recognition.run(paragraph.full_text)
+        # Instruction info
+        cmd_task = self.get_tc_info(paragraph)
+        # Telemetry packet info
+        pkt_task = self.get_tm_pkt_info(paragraph)
+        # Chapter cross-references
+        refs_task = self.get_chapter_refs(paragraph, toc)
+        entity_names, cmd, pkt, chapter_refs = await asyncio.gather(entity_names_task, cmd_task, pkt_task, refs_task)
+
+        Log.info(f'绔犺妭{paragraph.title_num}瀹炰綋璇嶏細{entity_names}')
+        Log.info(f'绔犺妭{paragraph.title_num}寮曠敤锛歿chapter_refs}')
+        if entity_names:
+            paragraph.entities = doc_dbh.get_entities_by_names(entity_names)
+
+        # pkt and cmd are free-form JSON parsed from LLM output: skip anything that is
+        # not a dict carrying the expected key instead of raising and aborting the gather.
+        for info, key, entity_type in ((pkt, 'code', '閬ユ祴鍖呴厤缃�'), (cmd, 'name', '鎸囦护鏍煎紡閰嶇疆')):
+            if not isinstance(info, dict) or not info.get(key):
+                continue
+            entity = TEntity(name=info[key], type=entity_type, prompts='', doc_type='')
+            e = doc_dbh.get_entity(entity)
+            if e:
+                entity.id = e.id
+            else:
+                doc_dbh.add_entity(entity)
+                Log.info(f"鏂板Entity锛歿entity.name}锛宨d锛歿entity.id}")
+            paragraph.entities.append(entity)
+        # Link the cited titles to their paragraphs, ignoring self-references
+        if chapter_refs:
+            for ref in chapter_refs:
+                _p = next(filter(lambda p: ref == p.title, self.doc_split.paragraphs), None)
+                if _p:
+                    if paragraph != _p:
+                        paragraph.refs.append(_p)
 
     def process(self):
+        """Split the document, enrich every paragraph through the LLM extractors, then save to the DB."""
         self.doc_split.split()
         # 鍒嗘壒骞跺彂澶勭悊锛屾瘡鎵�10涓�
-        batch_size = 10
-        for i in range(0, len(self.doc_split.paragraphs), batch_size):
-            batch_paragraphs = self.doc_split.paragraphs[i:i + batch_size]
-            tasks = []
-            for paragraph in batch_paragraphs:
-                tasks.append(self.gen_sect_entities(paragraph))
+        toc = [p.title for p in self.doc_split.paragraphs if p.title_level]
 
-            async def run():
-                await asyncio.gather(*tasks)
+        async def run():
+            # Keep at most 10 paragraphs in flight so a large document cannot flood the LLM.
+            limit = asyncio.Semaphore(10)
 
-            asyncio.run(run())
+            async def bounded(paragraph):
+                async with limit:
+                    await self.gen_chapter_entities(paragraph, self.doc_split.paragraphs, toc)
+            await asyncio.gather(*(bounded(p) for p in self.doc_split.paragraphs))
+        asyncio.run(run())
         # 淇濆瓨鍒版暟鎹簱
         self.save_to_db()
 
@@ -86,21 +226,8 @@
         self.doc_id = doc_dbh.add_doc(doc)
         for paragraph in doc.paragraphs:
             doc_dbh.add_paragraph(self.doc_id, None, paragraph)
+        for paragraph in self.doc_split.paragraphs:
+            for ref_paragraph in paragraph.refs:
+                doc_dbh.add_paragraph_ref_link(paragraph.id, ref_paragraph.id)
+                Log.info(f"{paragraph.title} 寮曠敤浜�-> {ref_paragraph.title}")
         Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱瀹屾垚')
-
-
-if __name__ == '__main__':
-    files = [
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈哄垎绯荤粺閬ユ祴婧愬寘璁捐鎶ュ憡锛堝叕寮�锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈鸿蒋浠剁敤鎴烽渶姹傦紙鍏紑锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬪ぇ绾诧紙鍏紑锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬩俊鍙峰垎閰嶈〃锛堝叕寮�锛�.docx",
-        # r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\鎸囦护鏍煎紡(鍏紑).docx"
-    ]
-    for file in files:
-        doc_processor = DocProcessor(file)
-        doc_processor.process()
-
-    # doc_dbh.get_docs()

--
Gitblit v1.9.1