From a108e36f37a07a4a861fb6aafb9231de60dc1abf Mon Sep 17 00:00:00 2001
From: YM <479443481@qq.com>
Date: Thu, 15 May 2025 18:49:12 +0800
Subject: [PATCH] Merge branch 'master' of http://182.92.203.7:2001/r/KnowledgeBase

---
 testcases/test_doc_db_helper.py            |   12 +
 knowledgebase/doc/models.py                |   13 +
 knowledgebase/db/doc_db_helper.py          |   40 ++-
 knowledgebase/gen_base_db/json_generate.py |  206 +++++++++++-----------
 testcases/test_docx_split.py               |   16 +
 knowledgebase/doc/entity_recognition.py    |    4 
 testcases/test_doc_processor.py            |   29 +++
 knowledgebase/doc/doc_processor.py         |   84 ++++++---
 knowledgebase/doc/docx_split.py            |   22 --
 knowledgebase/db/doc_db_models.py          |   20 ++
 tpl/entities.json                          |    6 
 knowledgebase/doc/entity_helper.py         |   10 
 main.py                                    |   67 -------
 knowledgebase/log/__init__.py              |    2 
 14 files changed, 289 insertions(+), 242 deletions(-)

diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py
index fe36fb2..6b8d6cb 100644
--- a/knowledgebase/db/doc_db_helper.py
+++ b/knowledgebase/db/doc_db_helper.py
@@ -82,6 +82,12 @@
         self.session.commit()
         return paragraph_entity_link.id
 
+    def get_entity(self, entity):
+        """Return the stored TEntity whose name, type and doc_type all equal those of `entity`, else None."""
+        # Pass the criteria separately: Python `and` between column expressions keeps only one of them.
+        return self.session.query(TEntity).where(
+            TEntity.name == entity.name, TEntity.type == entity.type, TEntity.doc_type == entity.doc_type).first()
+
     def add_entity(self, entity):
         """
         娣诲姞瀹炰綋
@@ -106,11 +112,11 @@
     def get_docs(self) -> list[TDoc]:
         return self.session.query(TDoc).all()
 
-    def get_text_with_entities(self, entity_names: list[str]) -> str:
+    def get_texts_with_entities(self, entity_names: list[str]):
         """
-        鏍规嵁瀹炰綋璇嶈幏鍙栨枃鏈唴瀹�
+        鏍规嵁瀹炰綋璇嶈幏鍙栨枃鏈唴瀹瑰垪琛�
         :param entity_names: list[str] - 瀹炰綋璇�
-        :return: str - 鏂囨湰
+        :return: list[str] - 鏂囨湰鍒楄〃
         """
         if not entity_names:
             return ""
@@ -118,26 +124,30 @@
         _entitie_ids = [entity.id for entity in _entities]
         links = self.session.query(TParagraphEntityLink).where(TParagraphEntityLink.entity_id.in_(_entitie_ids)).all()
         _paragraphs = [link.paragraph for link in links]
+        return [self.get_paragraph_full_text(p) for p in _paragraphs]
+    def get_text_with_entities(self, entity_names: list[str]) -> str:
+        """
+        鏍规嵁瀹炰綋璇嶈幏鍙栨枃鏈唴瀹�
+        :param entity_names: list[str] - 瀹炰綋璇�
+        :return: str - 鏂囨湰
+        """
+        texts = self.get_texts_with_entities(entity_names)
+        return '\n'.join(texts)
 
-        return '\n'.join([self.get_paragraph_full_text(p) for p in _paragraphs])
+    def get_entities_by_names(self, names: list[str]):
+        _entities = self.session.query(TEntity).where(TEntity.name.in_(names)).all()
+        return _entities
 
     def get_paragraph_full_text(self, p: TParagraph):
         result = p.text if p.title_level == 0 else p.title_num + ' ' + p.text
         return result + '\n' + '\n'.join([self.get_paragraph_full_text(p) for p in p.children])
+
+    def get_entities_by_doc_type(self, doc_type):
+        _entities = self.session.query(TEntity).where(TEntity.doc_type == doc_type).all()
+        return _entities
 
     def commit(self):
         self.session.commit()
 
 
 doc_dbh = DocDbHelper()
-
-# if __name__ == '__main__':
-#     text = doc_dbh.get_text_with_entities(['閬ユ帶鍖呮牸寮�'])
-#     print(text)
-#     doc_db = DocDbHelper()
-#     # doc_db.insert_entities()
-#     doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test'))
-#     p1 = doc_db.add_paragraph(doc.id, None, ParagraphInfo(text='test1', title_level=1, num=1, num_level=1))
-#     p2 = doc_db.add_paragraph(doc.id, p1.id, ParagraphInfo(text='test2', title_level=2, num=1, num_level=2))
-#     p3 = doc_db.add_paragraph(doc.id, p2.id, ParagraphInfo(text='test3', title_level=3, num=1, num_level=3))
-#     doc_db.add_paragraph_ref_link(TParagraphRefLink(parent_id=p1.id, child_id=p3.id))
diff --git a/knowledgebase/db/doc_db_models.py b/knowledgebase/db/doc_db_models.py
index e5a11ea..01ab1d8 100644
--- a/knowledgebase/db/doc_db_models.py
+++ b/knowledgebase/db/doc_db_models.py
@@ -120,6 +120,25 @@
     is_del = Column(Integer)
 
 
+# class TTmPacket(Base):
+#     __tablename__ = 't_tm_packets'
+#     id = Column(Integer, primary_key=True)
+#     name = Column(Text)
+#     code = Column(Text)
+#     apid = Column(Integer)
+#     is_del = Column(Integer)
+#
+#
+# class TTmPacketParagraphLink(Base):
+#     __tablename__ = 't_tm_packet_paragraph_links'
+#     id = Column(Integer, primary_key=True)
+#     tm_packet_id = Column(Integer, ForeignKey('t_tm_packets.id'))
+#     paragraph_id = Column(Integer, ForeignKey('t_paragraphs.id'))
+#     tm_packet = relationship("TTmPacket", foreign_keys=[tm_packet_id], uselist=False)
+#     paragraph = relationship("TParagraph", foreign_keys=[paragraph_id], uselist=False)
+#     is_del = Column(Integer)
+
+
 def init_doc_db():
     """
     鍒濆鍖栨枃妗f暟鎹簱
@@ -128,6 +147,7 @@
     # mysql
     Log.info("杩炴帴骞跺垵濮嬪寲鏂囨。鏁版嵁搴�...")
     engine = create_engine('mysql+pymysql://root:123456@192.168.3.145:3306/knowledgebase', echo=False)
+    # engine = create_engine('sqlite:///doc_db.db', echo=False)
     Base.metadata.create_all(engine)
     SessionFactory = sessionmaker(bind=engine)
     Session = scoped_session(SessionFactory)
diff --git a/knowledgebase/doc/doc_processor.py b/knowledgebase/doc/doc_processor.py
index d681d94..866246b 100644
--- a/knowledgebase/doc/doc_processor.py
+++ b/knowledgebase/doc/doc_processor.py
@@ -5,7 +5,10 @@
 # @version: 
 # @description: 澶勭悊鏂囨。锛屾媶鍒嗘枃妗o紝灏嗘媶鍒嗗悗鐨勭珷鑺備繚瀛樺埌鏁版嵁搴撲腑銆�
 from langchain_core.messages import HumanMessage
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
 
+from knowledgebase.db.doc_db_models import TEntity
 from knowledgebase.doc.docx_split import DocSplit
 import asyncio
 from knowledgebase.db.doc_db_helper import doc_dbh
@@ -13,7 +16,7 @@
 from knowledgebase.doc.entity_recognition import EntityRecognition
 import os.path
 
-from knowledgebase.doc.models import DocInfo, ParagraphInfo
+from knowledgebase.doc.models import DocInfo, ParagraphInfo, DocType
 from knowledgebase.llm import llm
 from knowledgebase.log import Log
 from knowledgebase import utils
@@ -27,8 +30,8 @@
         """
         Log.info(f'寮�濮嬪鐞嗘枃妗o細{docx_file}')
         self.docx_file = docx_file
-        self.doc_split = DocSplit(docx_file)
         self.doc_type = self.get_doc_type()
+        self.doc_split = DocSplit(docx_file, self.doc_type)
         self.entity_recognition = EntityRecognition(self.doc_type)
         self.doc_id = 0
 
@@ -48,14 +51,58 @@
         Log.info(f'璇嗗埆缁撴灉锛歿resp.content}')
         return resp.content
 
-    async def gen_sect_entities(self, paragraph: ParagraphInfo):
-        # Log.info(f'鐢熸垚绔犺妭瀹炰綋璇嶏細{paragraph.full_text}')
+    def get_tm_pkt_info(self, paragraph: ParagraphInfo):
+        if self.doc_type not in [DocType.tm_outline, DocType.tm_pkt_design]:
+            return ''
+        prompt = HumanMessagePromptTemplate.from_template('''
+# 鎸囦护
+璇嗗埆閬ユ祴鍖呬俊鎭紝璇蜂粠涓嬮潰鐨勬枃鏈腑璇嗗埆閬ユ祴鍖呬俊鎭紝濡傛灉璇嗗埆澶辫触涓嶈杈撳嚭浠讳綍瀛楃銆�
+璇嗗埆瑙勫垯锛氱珷鑺傛爣棰樹腑鍖呭惈鍖呭悕绉板拰浠e彿锛岀珷鑺傚唴瀹逛负琛ㄦ牸锛岃〃鏍间腑鍖呮嫭鍖呭ご瀹氫箟鍜屽寘鍙傛暟瀹氫箟銆�
+鎻愬彇鐨勯仴娴嬪寘淇℃伅鍖呮嫭锛氬寘鍚嶇О锛屽寘浠e彿锛孉PID銆�
+# 绾︽潫
+- 濡傛灉鏂囨湰鍐呭鏄洰褰曞垯涓嶈杈撳嚭浠讳綍瀛楃锛�
+- 鏂囨湰鎻忚堪鐨勫唴瀹规槸鍗曚釜閬ユ祴鍖咃紝濡傛灉鏈夊涓仴娴嬪寘鍒欎笉瑕佽緭鍑轰换浣曞瓧绗︼紱
+- 鏂囨湰缁撴瀯閫氬父鏄細鍖呭悕绉般�佷唬鍙峰拰APID鍦ㄥ紑澶达紝鍚庨潰绱ф帴鐫�鏄寘澶村拰鍙傛暟瀹氫箟琛紱
+- 濡傛灉娌℃湁璇嗗埆鍒伴仴娴嬪寘淇℃伅涓嶈杈撳嚭浠讳綍瀛楃锛�
+- 璇嗗埆澶辫触锛屼笉瑕佽緭鍑轰换浣曞唴瀹癸紝鍖呮嫭瑙i噴鎬ф枃鏈紱
+- 杈撳嚭json鏍煎紡銆�
+# 澶嶅悎瑕佹眰鐨勬枃鏈粨鏋�
+1.1.1 code xxx鍖�(APID=0x123)
+```json
+琛ㄦ牸鍐呭
+``` 
+# 绀轰緥 - 璇嗗埆鍒版暟鎹寘
+{{
+    "name": "xxx鍖�",
+    "code": "xxx",
+    "apid": 123
+}}
+# 绀轰緥 - 鏈瘑鍒埌鏁版嵁鍖�
+""
+# 鏂囨湰鍐呭锛�
+{text}
+''')
+        chain = prompt.prompt | llm | JsonOutputParser()
+        resp = chain.invoke({"text": paragraph.full_text})
+        return resp
+
+    async def gen_chapter_entities(self, paragraph: ParagraphInfo):
         # 鑾峰彇绔犺妭瀹炰綋璇�
-        entities = await asyncio.to_thread(lambda: self.entity_recognition.run(paragraph.full_text))
-        Log.info(f'绔犺妭瀹炰綋璇嶏細{entities}')
-        if entities:
-            paragraph.entities = [next(filter(lambda x: x.name == e, entity_helper.entities), None) for e in entities]
-            paragraph.entities = [e for e in paragraph.entities if e]
+        entity_names = await asyncio.to_thread(lambda: self.entity_recognition.run(paragraph.full_text))
+        Log.info(f'绔犺妭{paragraph.title_num}瀹炰綋璇嶏細{entity_names}')
+        if entity_names:
+            paragraph.entities = doc_dbh.get_entities_by_names(entity_names)
+        # Telemetry packet info: link the packet-code entity, reusing the stored row when one already exists.
+        pkt = self.get_tm_pkt_info(paragraph)
+        if pkt:
+            entity = TEntity(name=pkt['code'], type='閬ユ祴鍖呴厤缃�', prompts='', doc_type='')
+            e = doc_dbh.get_entity(entity)
+            if e:
+                entity = e
+            else:
+                doc_dbh.add_entity(entity)
+                Log.info(f"鏂板Entity锛歿entity.name}锛宨d锛歿entity.id}")
+            paragraph.entities.append(entity)
 
     def process(self):
         self.doc_split.split()
@@ -65,7 +112,7 @@
             batch_paragraphs = self.doc_split.paragraphs[i:i + batch_size]
             tasks = []
             for paragraph in batch_paragraphs:
-                tasks.append(self.gen_sect_entities(paragraph))
+                tasks.append(self.gen_chapter_entities(paragraph))
 
             async def run():
                 await asyncio.gather(*tasks)
@@ -87,20 +134,3 @@
         for paragraph in doc.paragraphs:
             doc_dbh.add_paragraph(self.doc_id, None, paragraph)
         Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱瀹屾垚')
-
-
-if __name__ == '__main__':
-    files = [
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈哄垎绯荤粺閬ユ祴婧愬寘璁捐鎶ュ憡锛堝叕寮�锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈鸿蒋浠剁敤鎴烽渶姹傦紙鍏紑锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬪ぇ绾诧紙鍏紑锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬩俊鍙峰垎閰嶈〃锛堝叕寮�锛�.docx",
-        # r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx",
-        r"D:\workspace\PythonProjects\KnowledgeBase\doc\鎸囦护鏍煎紡(鍏紑).docx"
-    ]
-    for file in files:
-        doc_processor = DocProcessor(file)
-        doc_processor.process()
-
-    # doc_dbh.get_docs()
diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py
index 22d189a..4270b05 100644
--- a/knowledgebase/doc/docx_split.py
+++ b/knowledgebase/doc/docx_split.py
@@ -27,12 +27,14 @@
 
     """
 
-    def __init__(self, docx_file: str):
+    def __init__(self, docx_file: str, docx_type: str):
         """
         docx鏂囨。鎷嗗垎
         :param docx_file: 瑕佹媶鍒嗙殑docx鏂囦欢璺緞
+        :param docx_type: 鏂囨。绫诲瀷
         """
         self.docx_file = docx_file
+        self.docx_type = docx_type
         self.image_to_text = ImageToText()
         self.paragraphs: list[ParagraphInfo] = []
         self.paragraph_tree: list[ParagraphInfo] = []
@@ -291,21 +293,3 @@
         # 鏇挎崲鍘熷鍒楄〃鍐呭锛岄伩鍏嶅娆� remove 鎿嶄綔
         self.paragraphs[:] = _paragraphs
         self.paragraph_tree = result
-
-
-if __name__ == '__main__':
-    docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx'
-    # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx'
-    doc_split = DocSplit(docx_file)
-    doc_split.split()
-    # er = EntityRecognition()
-    # db = Neo4jHelper()
-    # for trunk in doc_split.trunks:
-    #     print('娈佃惤鏂囨湰锛�')
-    #     print(trunk)
-    #     print('瀹炰綋璇嶏細')
-    #     print(er.run(trunk))
-    # entities = er.run(trunk)
-    # db.create_page_node()
-    print("\n".join([x.full_text_with_children for x in doc_split.paragraphs]))
-    print()
diff --git a/knowledgebase/doc/entity_helper.py b/knowledgebase/doc/entity_helper.py
index 659c5ae..219eed2 100644
--- a/knowledgebase/doc/entity_helper.py
+++ b/knowledgebase/doc/entity_helper.py
@@ -15,15 +15,11 @@
 class EntityHelper:
     # 鏂囨。绫诲瀷鍜岃瘑鍒彁绀鸿瘝map
     doc_prompt_map: dict
-    # 鎵�鏈夊疄浣�
-    entities: list[TEntity]
 
     def __init__(self):
         Log.info("鍒濆鍖朎ntityHelper")
         current_dir = os.path.dirname(__file__)
-        self.entities = doc_dbh.get_all_entities()
         self.doc_prompt_map = {}
-        entity_names = [entity.name for entity in self.entities]
         with open(f'{current_dir}/../../tpl/entities.json', 'r', encoding='utf-8') as f:
             text = f.read()
             obj = json.loads(text)
@@ -33,13 +29,11 @@
                     prompts = obj2[doc_ty]['prompts']
                     self.doc_prompt_map[doc_ty] = prompts
                     for entity in obj2[doc_ty]['entities']:
-                        if entity in entity_names:
-                            continue
                         _entity = TEntity(name=entity, type=ty, doc_type=doc_ty,
                                           prompts=obj2[doc_ty]['entities'][entity])
+                        if doc_dbh.get_entity(_entity):
+                            continue
                         doc_dbh.add_entity(_entity)
-                        self.entities.append(_entity)
                         Log.info(f"鏂板Entity锛歿entity}锛宨d锛歿_entity.id}")
-
 
 entity_helper = EntityHelper()
diff --git a/knowledgebase/doc/entity_recognition.py b/knowledgebase/doc/entity_recognition.py
index 1144aab..f95fb8f 100644
--- a/knowledgebase/doc/entity_recognition.py
+++ b/knowledgebase/doc/entity_recognition.py
@@ -11,7 +11,7 @@
 import json
 
 from knowledgebase import utils
-from knowledgebase.doc.entity_helper import entity_helper
+from knowledgebase.db.doc_db_helper import doc_dbh
 from knowledgebase.log import Log
 
 llm = ChatOpenAI(temperature=0,
@@ -31,7 +31,7 @@
 
     def __init__(self, doc_type: str):
         # 瀹炰綋璇嶅垪琛�
-        entities = list(filter(lambda x: x.doc_type == doc_type, entity_helper.entities))
+        entities = doc_dbh.get_entities_by_doc_type(doc_type)
         entity_list = '锛�'.join([entity.name for entity in entities]) + "銆�"
         entity_rules = "锛沑n".join([f"- {entity.name}锛歿entity.prompts}" for entity in entities]) + "銆�"
         tpl = """
diff --git a/knowledgebase/doc/models.py b/knowledgebase/doc/models.py
index 4749fee..7eacdc8 100644
--- a/knowledgebase/doc/models.py
+++ b/knowledgebase/doc/models.py
@@ -6,6 +6,7 @@
 # @description: 鏂囨。鐩稿叧鏁版嵁绫�
 from dataclasses import dataclass
 import typing
+from enum import Enum
 
 from knowledgebase.db.doc_db_models import TEntity
 
@@ -115,3 +116,15 @@
         self.file = file
         self.file_type = file_type
         self.paragraphs: typing.List[ParagraphInfo] = paragraphs
+
+
+class _DocType:
+    tm_outline = '閬ユ祴澶х翰'
+    user_requirements = '鐢ㄦ埛闇�姹�'
+    tm_pkt_design = '婧愬寘璁捐'
+    bus_comm_proto = '鎬荤嚎閫氫俊鍗忚'
+    tc_format = '鎸囦护鏍煎紡'
+    tc_cmd_table = '閬ユ帶鎸囦护琛�'
+
+
+DocType = _DocType()
diff --git a/knowledgebase/gen_base_db/json_generate.py b/knowledgebase/gen_base_db/json_generate.py
index ab7076a..062ed24 100644
--- a/knowledgebase/gen_base_db/json_generate.py
+++ b/knowledgebase/gen_base_db/json_generate.py
@@ -134,6 +134,15 @@
         """
         return doc_dbh.get_text_with_entities(entity_names)
 
+    @staticmethod
+    def get_texts_with_entity(entity_names: list[str]) -> list[str]:
+        """
+        Return the document texts linked to the given entity names, delegating to doc_dbh.
+        :param entity_names: list[str] - entity names to look up
+        :return: list[str] - full text of each linked paragraph (the helper returns "" for empty input)
+        """
+        return doc_dbh.get_texts_with_entities(entity_names)
+
     def run(self):
         # 鏍规嵁鏂囨。锛岀敓鎴愮粨鏋勫寲鏁版嵁
         self.handle_tm_structured_data()
@@ -349,6 +358,7 @@
         def validation(gen_text):
             vcs = json.loads(gen_text)
             assert next(filter(lambda it: re.match('^[0-1]+$', it['VCID']), vcs)), '鐢熸垚鐨刅CID蹇呴』鏄簩杩涘埗'
+
         doc_text = self.get_text_with_entity(['铏氭嫙淇¢亾瀹氫箟'])
         result = self.call_model(_msg, 'out/' + dev.code + '_铏氭嫙淇¢亾.json', doc_text, validation)
         Log.info('铏氭嫙淇¢亾锛�' + result)
@@ -380,7 +390,8 @@
             pkts = json.loads(gen_text)
             assert len(pkts), 'VC婧愬寘鍒楄〃涓嶈兘涓虹┖'
 
-        text = self.call_model(_msg, 'out/' + dev.code + '_閬ユ祴婧愬寘涓嬩紶鏃舵満.json', ['閬ユ祴婧愬寘涓嬩紶鏃舵満'], validation)
+        doc_text = self.get_text_with_entity(['閬ユ祴婧愬寘涓嬩紶鏃舵満'])
+        text = self.call_model(_msg, 'out/' + dev.code + '_閬ユ祴婧愬寘涓嬩紶鏃舵満.json', doc_text, validation)
         Log.info('閬ユ祴婧愬寘鎵�灞炶櫄鎷熶俊閬擄細' + text)
         return json.loads(text)
 
@@ -410,7 +421,8 @@
                 }
             ]
         """
-        result = self.call_model(_msg, 'out/' + dev.code + '_婧愬寘鍒楄〃.json', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'])
+        doc_text = self.get_text_with_entity(['婧愬寘鍒楄〃'])
+        result = self.call_model(_msg, 'out/' + dev.code + '_婧愬寘鍒楄〃.json', doc_text)
         Log.info('閬ユ祴婧愬寘鍒楄〃锛�' + result)
         return json.loads(result)
 
@@ -434,7 +446,8 @@
                 # 渚嬪瓙锛�
                 {"last_par_pos":128, "par_num": 20}
             """
-            text = self.call_model(_msg, '', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'])
+            doc_text = self.get_text_with_entity([pkt_id])
+            text = self.call_model(_msg, '', doc_text)
             result = json.loads(text)
             last_par_pos = result['last_par_pos']
             par_num = result['par_num']
@@ -494,7 +507,7 @@
                 ]
             """
 
-            def validation(gen_text):
+            def _validation(gen_text):
                 _pkt = json.loads(gen_text)
                 with open(f'out/tmp/{time.time()}.json', 'w') as f:
                     f.write(gen_text)
@@ -504,7 +517,7 @@
                 # assert par_num == len(_pkt['datas']), f'鏁版嵁鍩熷弬鏁颁釜鏁颁笉瀵癸紒棰勮{par_num}涓紝瀹為檯{len(_pkt["datas"])}'
                 assert last_par_pos == _pkt['datas'][-1]['pos'], '鏈�鍚庝竴涓弬鏁扮殑瀛楄妭浣嶇疆涓嶅锛�'
 
-            result = self.call_model(_msg, f'out/鏁版嵁鍖�-{pkt_name}.json', [], ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'], validation)
+            result = self.call_model(_msg, f'out/鏁版嵁鍖�-{pkt_name}.json', doc_text, _validation)
             Log.info(f'鏁版嵁鍖呪�渰pkt_name}鈥濅俊鎭細' + result)
             pkt = json.loads(result)
         else:
@@ -517,87 +530,72 @@
         return pkt
 
     def gen_bus(self):
-        _msg = """
-            # 鎸囦护
-            鎴戦渶瑕佷粠鏂囨。涓彁鍙栫粡鎬荤嚎鐨勬暟鎹寘鍒楄〃锛屼綘瑕佸府鍔╂垜瀹屾垚缁忔�荤嚎鐨勬暟鎹寘鍒楄〃鐨勬彁鍙栥��
-            # 闇�姹�
-            璇锋瀽鏂囨。锛屽垪鍑烘�荤嚎閫氫俊鍖呬紶杈撶害瀹氫腑鎻忚堪鐨勬墍鏈夋暟鎹寘鍒楄〃锛�
-            鏁版嵁鍖呭瓧娈靛寘鎷細id(鏁版嵁鍖呬唬鍙�)銆乶ame(鏁版嵁鍖呭悕绉�)銆乤pid(16杩涘埗瀛楃涓�)銆乻ervice(鏈嶅姟瀛愭湇鍔�)銆乴ength(bit闀垮害)銆乮nterval(浼犺緭鍛ㄦ湡)銆乻ubAddr(瀛愬湴鍧�/妯″紡)銆乫rameNum(閫氫俊甯у彿)銆�
-            transSer(浼犺緭鏈嶅姟)銆乶ote(澶囨敞)銆乺tAddr(鎵�灞濺T鐨勫湴鍧�鍗佽繘鍒�)銆乺t(鎵�灞瀝t鍚嶇О)銆乼hroughBus(鏄惁缁忚繃鎬荤嚎)銆乥urst(鏄惁绐佸彂)銆乼ransDirect(浼犺緭鏂瑰悜)銆�
-            # 绾︽潫
-            - frameNum锛氫娇鐢ㄦ枃妗d腑鐨勬枃鏈笉瑕佸仛浠讳綍杞崲锛�
-            - subAddr锛氬�间负鈥滄繁搴︹�濄�佲�滃钩閾衡�濄�佲�滄暟瀛椻�濇垨null锛�
-            - 鏄惁缁忚繃鎬荤嚎鐨勫垽鏂緷鎹細鈥滃娉ㄢ�濆垪濉啓浜嗗唴瀹圭被浼尖�滀笉缁忚繃鎬荤嚎鈥濈殑鏂囧瓧琛ㄧず涓嶇粡杩囨�荤嚎鍚﹀垯缁忚繃鎬荤嚎锛�
-            - 浼犺緭鏈嶅姟鍒嗕笁绉嶏細SetData(缃暟)銆丟etData(鍙栨暟)銆丏ataBlock(鏁版嵁鍧椾紶杈�)锛�
-            - 浼犺緭鏂瑰悜鍒嗏�濇敹鈥滃拰鈥濆彂鈥滐紝浼犺緭鏈嶅姟濡傛灉鏄�濆彇鏁扳�滄槸鈥濇敹鈥滐紝濡傛灉鏄�濇暟鎹潡浼犺緭鈥滃垯鏍规嵁鍖呮墍鍦ㄧ殑鍒嗙郴缁熶互鍙婅〃鏍肩殑鈥濅紶杈撴柟鍚戔�滃垪杩涜鍒ゆ柇锛屽垽鏂浜嶴MU鏉ヨ鏄敹杩樻槸鍙戯紱
-            - 鏄惁绐佸彂锛氭牴鎹〃鏍间腑鐨勨�濅紶杈撳懆鏈熲�滃垪杩涜鍒ゆ柇锛屽鏋滃~鍐欎簡绫讳技鈥濈獊鍙戔�滅殑鏂囧瓧琛ㄧず鏄獊鍙戝惁鍒欒〃绀轰笉鏄獊鍙戯紱
-            - 涓嶈婕忔帀浠讳綍涓�涓暟鎹寘锛�
-            - 鏁版嵁缁撴瀯鏈�澶栧眰鏄暟缁勶紝鏁扮粍鍏冪礌涓烘暟鎹寘锛屼互JSON鏍煎紡杈撳嚭锛屼笉瑕佽緭鍑篔SON浠ュ鐨勪换浣曟枃鏈��
-            # 渚嬪瓙
-            [
-                {
-                    "id": "PCS005",
-                    "name": "鎬荤嚎绠$悊锛堝唴閮ㄦ寚浠わ級",
-                    "apid": "418",
-                    "service": "(1, 2)",
-                    "length": 1,
-                    "interval": 1000,
-                    "subAddr": null,
-                    "frameNum": "1|2",
-                    "transSer": "DataBlock",
-                    "note": "",
-                    "rtAddr": 28,
-                    "rt": "鏁版嵁鎺ュ彛鍗曞厓XIU",
-                    "throughBus": true,
-                    "burst": true,
-                    "transDirect": "鍙�"
-                }
-            ]
-        """
+        self.bus_pkts = []
+        doc_text_list = self.get_texts_with_entity(['鍒嗙郴缁熸簮鍖�'])
+        for doc_text in doc_text_list:
+            _msg = """
+                # 鎸囦护
+                鎴戦渶瑕佷粠鏂囨。涓彁鍙栫粡鎬荤嚎鐨勬暟鎹寘鍒楄〃锛屼綘瑕佸府鍔╂垜瀹屾垚缁忔�荤嚎鐨勬暟鎹寘鍒楄〃鐨勬彁鍙栥��
+                # 闇�姹�
+                璇锋瀽鏂囨。锛屽垪鍑烘�荤嚎閫氫俊鍖呬紶杈撶害瀹氫腑鎻忚堪鐨勬墍鏈夋暟鎹寘鍒楄〃锛�
+                鏁版嵁鍖呭瓧娈靛寘鎷細id(鏁版嵁鍖呬唬鍙�)銆乶ame(鏁版嵁鍖呭悕绉�)銆乤pid(16杩涘埗瀛楃涓�)銆乻ervice(鏈嶅姟瀛愭湇鍔�)銆乴ength(bit闀垮害)銆乮nterval(浼犺緭鍛ㄦ湡)銆乻ubAddr(瀛愬湴鍧�/妯″紡)銆乫rameNum(閫氫俊甯у彿)銆�
+                transSer(浼犺緭鏈嶅姟)銆乶ote(澶囨敞)銆乺tAddr(鎵�灞濺T鐨勫湴鍧�鍗佽繘鍒�)銆乺t(鎵�灞瀝t鍚嶇О)銆乼hroughBus(鏄惁缁忚繃鎬荤嚎)銆乥urst(鏄惁绐佸彂)銆乼ransDirect(浼犺緭鏂瑰悜)銆�
+                # 绾︽潫
+                - frameNum锛氫娇鐢ㄦ枃妗d腑鐨勬枃鏈笉瑕佸仛浠讳綍杞崲锛�
+                - subAddr锛氬�间负鈥滄繁搴︹�濄�佲�滃钩閾衡�濄�佲�滄暟瀛椻�濇垨null锛�
+                - 鏄惁缁忚繃鎬荤嚎鐨勫垽鏂緷鎹細鈥滃娉ㄢ�濆垪濉啓浜嗗唴瀹圭被浼尖�滀笉缁忚繃鎬荤嚎鈥濈殑鏂囧瓧琛ㄧず涓嶇粡杩囨�荤嚎鍚﹀垯缁忚繃鎬荤嚎锛�
+                - 浼犺緭鏈嶅姟鍒嗕笁绉嶏細SetData(缃暟)銆丟etData(鍙栨暟)銆丏ataBlock(鏁版嵁鍧椾紶杈�)锛�
+                - 浼犺緭鏂瑰悜鍒嗏�濇敹鈥滃拰鈥濆彂鈥滐紝浼犺緭鏈嶅姟濡傛灉鏄�濆彇鏁扳�滄槸鈥濇敹鈥滐紝濡傛灉鏄�濇暟鎹潡浼犺緭鈥滃垯鏍规嵁鍖呮墍鍦ㄧ殑鍒嗙郴缁熶互鍙婅〃鏍肩殑鈥濅紶杈撴柟鍚戔�滃垪杩涜鍒ゆ柇锛屽垽鏂浜嶴MU鏉ヨ鏄敹杩樻槸鍙戯紱
+                - 鏄惁绐佸彂锛氭牴鎹〃鏍间腑鐨勨�濅紶杈撳懆鏈熲�滃垪杩涜鍒ゆ柇锛屽鏋滃~鍐欎簡绫讳技鈥濈獊鍙戔�滅殑鏂囧瓧琛ㄧず鏄獊鍙戝惁鍒欒〃绀轰笉鏄獊鍙戯紱
+                - 涓嶈婕忔帀浠讳綍涓�涓暟鎹寘锛�
+                - 鏁版嵁缁撴瀯鏈�澶栧眰鏄暟缁勶紝鏁扮粍鍏冪礌涓烘暟鎹寘锛屼互JSON鏍煎紡杈撳嚭锛屼笉瑕佽緭鍑篔SON浠ュ鐨勪换浣曟枃鏈��
+                # 渚嬪瓙
+                [
+                    {
+                        "id": "PCS005",
+                        "name": "鎬荤嚎绠$悊锛堝唴閮ㄦ寚浠わ級",
+                        "apid": "418",
+                        "service": "(1, 2)",
+                        "length": 1,
+                        "interval": 1000,
+                        "subAddr": null,
+                        "frameNum": "1|2",
+                        "transSer": "DataBlock",
+                        "note": "",
+                        "rtAddr": 28,
+                        "rt": "鏁版嵁鎺ュ彛鍗曞厓XIU",
+                        "throughBus": true,
+                        "burst": true,
+                        "transDirect": "鍙�"
+                    }
+                ]
+            """
 
-        def validation(gen_text):
-            json.loads(gen_text)
+            def validation(gen_text):
+                json.loads(gen_text)
 
-        result = self.call_model(_msg, 'out/鎬荤嚎.json', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'], validation)
-        Log.info('鎬荤嚎鏁版嵁鍖咃細' + result)
+            result = self.call_model(_msg, 'out/鎬荤嚎.json', doc_text, validation)
+            Log.info('鎬荤嚎鏁版嵁鍖咃細' + result)
 
-        pkts = json.loads(result)
-        # 绛涢�夌粡鎬荤嚎鐨勬暟鎹寘
-        pkts = list(filter(lambda it: it['throughBus'], pkts))
-        # 绛涢�夋湁apid鐨勬暟鎹寘
-        pkts = list(filter(lambda it: it['apid'], pkts))
+            pkts = json.loads(result)
+            # 绛涢�夌粡鎬荤嚎鐨勬暟鎹寘
+            pkts = list(filter(lambda it: it['throughBus'], pkts))
+            # 绛涢�夋湁apid鐨勬暟鎹寘
+            pkts = list(filter(lambda it: it['apid'], pkts))
 
-        pkts2 = []
-        # todo 杩欎竴姝ュ簲璇ラ�氳繃鏁版嵁搴撶瓫閫夛紝鏁版嵁搴撲腑宸茬粡鏈夋墍鏈夐仴娴嬪寘浠ュ強閬ユ祴鍖呭搴旂殑瀹氫箟娈佃惤鏂囨湰
-        for pkt in pkts:
-            if self.pkt_in_tm_pkts(pkt["name"]):
-                pkts2.append(pkt)
-        for pkt in pkts2:
-            self.gen_pkt_details(pkt['name'], pkt['id'])
-            _pkt = self.gen_pkt_details(pkt['name'], pkt['id'])
-            if _pkt:
-                pkt['children'] = []
-                pkt['children'].extend(_pkt['datas'])
-                pkt['length'] = _pkt['length']
-        self.bus_pkts = pkts
-
-    def pkt_in_tm_pkts(self, pkt_name):
-        _msg = f"""
-            # 鎸囦护
-            鎴戦渶瑕佷粠鏂囨。涓垎鏋愬垽璇绘槸鍚︽湁鏌愪釜閬ユ祴鍖呯殑瀛楁琛ㄦ弿杩帮紝浣犺甯姪鎴戝垽鏂��
-            # 闂
-            鏂囨。涓湁閬ユ祴鍖呪�渰pkt_name}鈥濈殑瀛楁琛ㄦ弿杩板悧锛�
-            娉ㄦ剰锛氶仴娴嬪寘鐨勫瓧娈佃〃绱ф帴鐫�閬ユ祴鍖呯珷鑺傛爣棰橈紝濡傛灉绔犺妭鏍囬鍚庨潰鐪佺暐浜嗘垨鑰呰瑙亁xx鍒欐槸娌℃湁瀛楁琛ㄦ弿杩般��
-            # 绾︽潫
-            - 鏍规嵁鏂囨。鍐呭杈撳嚭锛�
-            - 閬ユ祴鍖呭悕绉板繀椤诲畬鍏ㄥ尮閰嶏紱
-            - 杈撳嚭鈥滄棤鈥濇垨鈥滄湁鈥濓紝涓嶈杈撳嚭鍏朵粬浠讳綍鍐呭銆�
-            # 渚嬪瓙
-            鏈�
-        """
-        text = self.call_model(_msg, f'out/pkts/鏈夋棤鏁版嵁鍖�-{pkt_name}.txt', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'])
-        Log.info(f'鏂囨。涓湁鏃犫�渰pkt_name}鈥濈殑瀛楁鎻忚堪锛�' + text)
-        return text == '鏈�'
+            # pkts2 = []
+            # todo 杩欎竴姝ュ簲璇ラ�氳繃鏁版嵁搴撶瓫閫夛紝鏁版嵁搴撲腑瀛樺偍浜嗘瘡涓暟鎹寘鐨勪唬鍙峰疄浣�
+            # for pkt in pkts:
+            #     if self.pkt_in_tm_pkts(pkt["name"]):
+            #         pkts2.append(pkt)
+            for pkt in pkts:
+                # Fetch the packet details once; only this result is used below.
+                _pkt = self.gen_pkt_details(pkt['name'], pkt['id'])
+                if _pkt:
+                    pkt['children'] = []
+                    pkt['children'].extend(_pkt['datas'])
+                    pkt['length'] = _pkt['length']
+            self.bus_pkts.extend(pkts)
 
     # endregion 閬ユ祴-end
 
@@ -642,7 +640,8 @@
         def validation(gen_text):
             json.loads(gen_text)
 
-        text = self.call_model(_msg, 'out/tc_transfer_frame.json', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'], validation)
+        doc_text = self.get_text_with_entity(['閬ユ帶甯ф牸寮�'])
+        text = self.call_model(_msg, 'out/tc_transfer_frame.json', doc_text, validation)
         result: dict = json.loads(text)
         format_text = utils.read_from_file('tpl/tc_transfer_frame.json')
         format_text = utils.replace_tpl_paras(format_text, result)
@@ -681,7 +680,8 @@
         def validation(gen_text):
             json.loads(gen_text)
 
-        text = self.call_model(_msg, 'out/tc_transfer_pkt.json', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'], validation)
+        doc_text = self.get_text_with_entity(['閬ユ帶鍖呮牸寮�'])
+        text = self.call_model(_msg, 'out/tc_transfer_pkt.json', doc_text, validation)
         result = json.loads(text)
 
         format_text = utils.read_from_file('tpl/tc_pkt_format.json')
@@ -691,25 +691,29 @@
         return pkt_format
 
     def gen_tc_transfer_pkts(self):
-        _msg = '''
-            # 鎸囦护
-            鍒嗘瀽鏂囨。鍒楀嚭鎵�鏈夌殑閬ユ帶婧愬寘銆�
-            # 杈撳嚭渚嬪瓙锛�
-            [{
-            "name": "xxx",
-            "code":"pkt",
-            "搴旂敤杩囩▼鏍囪瘑绗�":"0xAA",
-            "鏈嶅姟绫诲瀷":"0x1",
-            "鏈嶅姟瀛愮被鍨�":"0x2"
-            }]
-        '''
+        doc_text_list = self.get_texts_with_entity(['APID鍒嗛厤'])
+        pkts = []
+        for doc_text in doc_text_list:
+            _msg = '''
+                # 鎸囦护
+                鍒嗘瀽鏂囨。鍒楀嚭鎵�鏈夌殑閬ユ帶婧愬寘銆�
+                # 杈撳嚭渚嬪瓙锛�
+                [{
+                "name": "xxx",
+                "code":"pkt",
+                "搴旂敤杩囩▼鏍囪瘑绗�":"0xAA",
+                "鏈嶅姟绫诲瀷":"0x1",
+                "鏈嶅姟瀛愮被鍨�":"0x2"
+                }]
+            '''
 
-        def validation(gen_text):
-            json.loads(gen_text)
+            def validation(gen_text):
+                json.loads(gen_text)
 
-        text = self.call_model(_msg, 'out/tc_transfer_pkts.json', ['杩欓噷鏄枃妗d腑鎶藉彇鐨勫唴瀹�'], validation)
-        Log.info('閬ユ帶鍖呭垪琛細' + text)
-        return json.loads(text)
+            text = self.call_model(_msg, 'out/tc_transfer_pkts.json', doc_text, validation)
+            Log.info('閬ユ帶鍖呭垪琛細' + text)
+            pkts.extend(json.loads(text))
+        return pkts
 
     def gen_tc_pkt_details(self, pkt):
         tc_name = pkt['name']
diff --git a/knowledgebase/log/__init__.py b/knowledgebase/log/__init__.py
index b6411d8..ee89a7d 100644
--- a/knowledgebase/log/__init__.py
+++ b/knowledgebase/log/__init__.py
@@ -10,7 +10,7 @@
 logger.setLevel(logging.DEBUG)
 
 # 鍒涘缓涓�涓枃浠跺鐞嗗櫒
-file_handler = logging.FileHandler('logs.log')
+file_handler = logging.FileHandler('logs.log', encoding='utf-8')
 file_handler.setLevel(logging.DEBUG)
 
 # 鍒涘缓涓�涓帶鍒跺彴澶勭悊鍣�
diff --git a/main.py b/main.py
index 50eec97..e69de29 100644
--- a/main.py
+++ b/main.py
@@ -1,67 +0,0 @@
-import math
-import os
-import random
-import time
-
-from knowledgebase.markitdown import MarkItDown
-
-from doc_to_docx import doc_to_docx
-
-
-def process_docs(directory):
-    # 閬嶅巻鐩綍涓嬬殑鎵�鏈夋枃浠�
-    for filename in os.listdir(directory):
-        # 鍒ゆ柇鏄惁涓� doc 鏂囦欢
-        if filename.endswith(".doc"):
-            # 杞崲涓� docx
-            doc_to_docx(directory + filename, directory + filename.replace(".doc", ".docx"))
-
-
-md = MarkItDown()
-
-
-def to_markdown(dst_dir: str):
-    text = ''
-    # 閬嶅巻鏂囦欢澶逛笅鐨勬墍鏈夋枃浠�
-    for file in os.listdir(dst_dir):
-        # 鍒ゆ柇鏄惁涓� docx 鏂囦欢
-        if file.endswith(".docx"):
-            # 杞崲涓� md
-            result = md.convert(dst_dir + file)
-            text = result.text_content
-            out_file = dst_dir + file + '.md'
-            with open(out_file, 'w', encoding='utf-8') as f:
-                f.write(text)
-    return out_file
-
-
-# 1.瑙f瀽鏂囨。
-# 2.杈撳叆鏂囨。
-# 3.鍚姩LangFlow
-def main():
-    doc_dir = ".\\doc\\"
-    # 澶勭悊鏂囨。
-    # process_docs(doc_dir)
-    # 鏂囨。杞崲涓簃arkdown
-    md_file = to_markdown(doc_dir)
-
-    md_file = 'D:\\workspace\\PythonProjects\\KnowledgeBase\\doc\\test.md'
-    # 鍚姩澶фā鍨嬪鐞嗘祦绋�
-    # ret_text = LangFlow([md_file]).run()
-    # 淇濆瓨缁撴灉
-    # with open('D:\\workspace\\PythonProjects\\KnowledgeBase\\doc\\test.text', 'w', encoding='utf-8') as f:
-    #     f.write(ret_text)
-
-
-def get_bit_mask(start, end):
-    bits = math.ceil((end + 1) / 8) * 8
-    if bits == 0:
-        bits = 8
-    mask = 0
-    for i in range(start, end + 1):
-        mask |= 1 << (bits - i - 1)
-    return mask
-
-
-# if __name__ == '__main__':
-#     main()
diff --git a/testcases/test_doc_db_helper.py b/testcases/test_doc_db_helper.py
new file mode 100644
index 0000000..cce6a36
--- /dev/null
+++ b/testcases/test_doc_db_helper.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+# 
+# @author: 
+# @date: 
+# @version: 
+# @description:
+from knowledgebase.db.doc_db_helper import doc_dbh
+
+
+def test():
+    text = doc_dbh.get_text_with_entities(['閬ユ帶鍖呮牸寮�'])
+    print(text)
diff --git a/testcases/test_doc_processor.py b/testcases/test_doc_processor.py
new file mode 100644
index 0000000..ecad70a
--- /dev/null
+++ b/testcases/test_doc_processor.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+# 
+# @author: 
+# @date: 
+# @version: 
+# @description:
+from knowledgebase.db.doc_db_helper import doc_dbh
+from knowledgebase.doc.doc_processor import DocProcessor
+
+
+def test_process():
+    files = [
+        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx",
+        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈哄垎绯荤粺閬ユ祴婧愬寘璁捐鎶ュ憡锛堝叕寮�锛�.docx",
+        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈鸿蒋浠剁敤鎴烽渶姹傦紙鍏紑锛�.docx",
+        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬪ぇ绾诧紙鍏紑锛�.docx",
+        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬩俊鍙峰垎閰嶈〃锛堝叕寮�锛�.docx",
+        # r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx",
+        r"D:\workspace\PythonProjects\KnowledgeBase\doc\鎸囦护鏍煎紡(鍏紑).docx"
+    ]
+    for file in files:
+        doc_processor = DocProcessor(file)
+        doc_processor.process()
+def test_get_text_by_entity():
+    text = doc_dbh.get_text_with_entities(['鍒嗙郴缁熸簮鍖�'])
+    print(text)
+if __name__ == '__main__':
+    # test_process()
+    test_get_text_by_entity()
\ No newline at end of file
diff --git a/testcases/test_docx_split.py b/testcases/test_docx_split.py
new file mode 100644
index 0000000..6745526
--- /dev/null
+++ b/testcases/test_docx_split.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+# 
+# @author: 
+# @date: 
+# @version: 
+# @description:
+from knowledgebase.doc.docx_split import DocSplit
+
+
+class TestDocxSplit:
+    def test_split(self):
+        docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx'
+        # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx'
+        doc_split = DocSplit(docx_file, "鎬荤嚎閫氫俊鍗忚")
+        doc_split.split()
+        print("\n".join([x.full_text_with_children for x in doc_split.paragraphs]))
diff --git a/tpl/entities.json b/tpl/entities.json
index 6925a33..276f12f 100644
--- a/tpl/entities.json
+++ b/tpl/entities.json
@@ -15,13 +15,15 @@
         "閬ユ祴鏍煎紡瀹氫箟": "涓�鑸湪鈥滈仴娴嬫牸寮忊�濈珷鑺傦紝鍐呭鍖呭惈鈥濋仴娴嬪抚鈥� 鈥濋仴娴嬪寘鈥滃叿浣撴牸寮忕殑瀹氫箟",
         "铏氭嫙淇¢亾瀹氫箟": "绔犺妭鍚嶅寘鍚�滆櫄鎷熶俊閬撯�濓紝鍐呭鍖呭惈铏氭嫙淇¢亾鐨勫垝鍒嗭紝鍚勬簮鍖呭湪鍚勮櫄鎷熶俊閬撲笅浼犲垎閰�",
         "鎻掑叆鍩�": "绔犺妭鍚嶅寘鍚�滄彃鍏ュ煙鈥濓紝鍐呭涓轰竴寮犺〃鏍硷紝瀹氫箟浜嗘彃鍏ュ煙涓殑閬ユ祴鍙傛暟",
-        "婧愬寘鍙傛暟琛�": "绔犺妭鍚嶅寘鍚�滄簮鍖呰璁♀�濓紝鍐呭涓哄涓簮鍖呭叿浣撳弬鏁扮殑琛ㄦ牸锛屾瘡涓簮鍖呭崟鐙竴寮犺〃鏍�"
+        "婧愬寘鍙傛暟琛�": "绔犺妭鍚嶅寘鍚�滄簮鍖呰璁♀�濓紝鍐呭涓哄涓簮鍖呭叿浣撳弬鏁扮殑琛ㄦ牸锛屾瘡涓簮鍖呭崟鐙竴寮犺〃鏍�",
+        "閬ユ祴婧愬寘涓嬩紶鏃舵満": "绔犺妭鍚嶅寘鍚被浼尖�滈仴娴嬫簮鍖呬笅浼犳椂鏈衡�濈殑鏂囨湰锛屽唴瀹逛负涓�涓〃鏍兼弿杩伴仴娴嬫簮鍖呬笅浼犳椂鏈�"
       }
     },
     "婧愬寘璁捐": {
       "prompts": "鏂囦欢鍚嶉�氬父鍖呭惈鈥滄簮鍖呪�濆叧閿瓧",
       "entities": {
-        "婧愬寘鍙傛暟琛�": "閫氬父涓哄彾瀛愯妭鐐癸紝绔犺妭鍚嶉�氬父涓� 鈥渪xx鍖呪�濓紝鍐呭涓烘簮鍖呭弬鏁拌〃鏍硷紝瀹氫箟浜嗗寘澶淬�佹暟鎹煙鍏蜂綋鍐呭"
+        "婧愬寘鍙傛暟琛�": "閫氬父涓哄彾瀛愯妭鐐癸紝绔犺妭鍚嶉�氬父涓� 鈥渪xx鍖呪�濓紝鍐呭涓烘簮鍖呭弬鏁拌〃鏍硷紝瀹氫箟浜嗗寘澶淬�佹暟鎹煙鍏蜂綋鍐呭",
+        "婧愬寘鍒楄〃": "绔犺妭鍚嶅寘鍚�滈仴娴嬫簮鍖呯被鍨嬪畾涔夆�濈殑鏂囨湰鍐呭"
       }
     }
   },

--
Gitblit v1.9.1