From b75a49c22e7d2b9aa8d3dc4975df8801c52b4d5b Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: Wed, 14 May 2025 16:04:28 +0800
Subject: [PATCH] 修改文档拆分和实体词提取逻辑,增加实体词文本抽取

---
 knowledgebase/db/doc_db_helper.py |   22 ++++++++++++++++++++++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py
index 5089e30..fe36fb2 100644
--- a/knowledgebase/db/doc_db_helper.py
+++ b/knowledgebase/db/doc_db_helper.py
@@ -17,6 +17,7 @@
     """
     文档数据库助手
     """
+
     def __init__(self):
         self.session = init_doc_db()
 
@@ -105,6 +106,25 @@
     def get_docs(self) -> list[TDoc]:
         return self.session.query(TDoc).all()
 
+    def get_text_with_entities(self, entity_names: list[str]) -> str:
+        """
+        根据实体词获取文本内容
+        :param entity_names: list[str] - 实体词
+        :return: str - 文本
+        """
+        if not entity_names:
+            return ""
+        _entities = self.session.query(TEntity).where(TEntity.name.in_(entity_names)).all()
+        _entitie_ids = [entity.id for entity in _entities]
+        links = self.session.query(TParagraphEntityLink).where(TParagraphEntityLink.entity_id.in_(_entitie_ids)).all()
+        _paragraphs = [link.paragraph for link in links]
+
+        return '\n'.join([self.get_paragraph_full_text(p) for p in _paragraphs])
+
+    def get_paragraph_full_text(self, p: TParagraph):
+        result = p.text if p.title_level == 0 else p.title_num + ' ' + p.text
+        return result + '\n' + '\n'.join([self.get_paragraph_full_text(p) for p in p.children])
+
     def commit(self):
         self.session.commit()
 
@@ -112,6 +132,8 @@
 doc_dbh = DocDbHelper()
 
 # if __name__ == '__main__':
+#     text = doc_dbh.get_text_with_entities(['遥控包格式'])
+#     print(text)
 #     doc_db = DocDbHelper()
 #     # doc_db.insert_entities()
 #     doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test'))

--
Gitblit v1.9.1