From b75a49c22e7d2b9aa8d3dc4975df8801c52b4d5b Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期三, 14 五月 2025 16:04:28 +0800 Subject: [PATCH] 修改文档拆分和实体词提取逻辑,增加实体词文本抽取 --- knowledgebase/db/doc_db_helper.py | 12 +++++++++++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py index d1231cc..fe36fb2 100644 --- a/knowledgebase/db/doc_db_helper.py +++ b/knowledgebase/db/doc_db_helper.py @@ -114,8 +114,16 @@ """ if not entity_names: return "" + _entities = self.session.query(TEntity).where(TEntity.name.in_(entity_names)).all() + _entitie_ids = [entity.id for entity in _entities] + links = self.session.query(TParagraphEntityLink).where(TParagraphEntityLink.entity_id.in_(_entitie_ids)).all() + _paragraphs = [link.paragraph for link in links] - return '\n'.join([entity.name for entity in self.get_all_entities() if entity.name in entity_names]) + return '\n'.join([self.get_paragraph_full_text(p) for p in _paragraphs]) + + def get_paragraph_full_text(self, p: TParagraph): + result = p.text if p.title_level == 0 else p.title_num + ' ' + p.text + return result + '\n' + '\n'.join([self.get_paragraph_full_text(p) for p in p.children]) def commit(self): self.session.commit() @@ -124,6 +132,8 @@ doc_dbh = DocDbHelper() # if __name__ == '__main__': +# text = doc_dbh.get_text_with_entities(['遥控包格式']) +# print(text) # doc_db = DocDbHelper() # # doc_db.insert_entities() # doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test')) -- Gitblit v1.9.1