From b75a49c22e7d2b9aa8d3dc4975df8801c52b4d5b Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期三, 14 五月 2025 16:04:28 +0800
Subject: [PATCH] 修改文档拆分和实体词提取逻辑,增加实体词文本抽取

---
 knowledgebase/db/doc_db_helper.py |   12 +++++++++++-
 1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py
index d1231cc..fe36fb2 100644
--- a/knowledgebase/db/doc_db_helper.py
+++ b/knowledgebase/db/doc_db_helper.py
@@ -114,8 +114,16 @@
         """
         if not entity_names:
             return ""
+        _entities = self.session.query(TEntity).where(TEntity.name.in_(entity_names)).all()
+        _entitie_ids = [entity.id for entity in _entities]
+        links = self.session.query(TParagraphEntityLink).where(TParagraphEntityLink.entity_id.in_(_entitie_ids)).all()
+        _paragraphs = [link.paragraph for link in links]
 
-        return '\n'.join([entity.name for entity in self.get_all_entities() if entity.name in entity_names])
+        return '\n'.join([self.get_paragraph_full_text(p) for p in _paragraphs])
+
+    def get_paragraph_full_text(self, p: TParagraph):
+        result = p.text if p.title_level == 0 else p.title_num + ' ' + p.text
+        return result + '\n' + '\n'.join([self.get_paragraph_full_text(p) for p in p.children])
 
     def commit(self):
         self.session.commit()
@@ -124,6 +132,8 @@
 doc_dbh = DocDbHelper()
 
 # if __name__ == '__main__':
+#     text = doc_dbh.get_text_with_entities(['遥控包格式'])
+#     print(text)
 #     doc_db = DocDbHelper()
 #     # doc_db.insert_entities()
 #     doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test'))

--
Gitblit v1.9.1