From b75a49c22e7d2b9aa8d3dc4975df8801c52b4d5b Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期三, 14 五月 2025 16:04:28 +0800
Subject: [PATCH] 修改文档拆分和实体词提取逻辑,增加实体词文本抽取

---
 knowledgebase/db/doc_db_helper.py |   22 ++++++++++++++++++++++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py
index 5089e30..fe36fb2 100644
--- a/knowledgebase/db/doc_db_helper.py
+++ b/knowledgebase/db/doc_db_helper.py
@@ -17,6 +17,7 @@
     """
     文档数据库助手
     """
+
     def __init__(self):
         self.session = init_doc_db()
 
@@ -105,6 +106,25 @@
     def get_docs(self) -> list[TDoc]:
         return self.session.query(TDoc).all()
 
+    def get_text_with_entities(self, entity_names: list[str]) -> str:
+        """
+        根据实体词获取文本内容
+        :param entity_names: list[str] - 实体词
+        :return: str - 文本
+        """
+        if not entity_names:
+            return ""
+        _entities = self.session.query(TEntity).where(TEntity.name.in_(entity_names)).all()
+        _entitie_ids = [entity.id for entity in _entities]
+        links = self.session.query(TParagraphEntityLink).where(TParagraphEntityLink.entity_id.in_(_entitie_ids)).all()
+        _paragraphs = [link.paragraph for link in links]
+
+        return '\n'.join([self.get_paragraph_full_text(p) for p in _paragraphs])
+
+    def get_paragraph_full_text(self, p: TParagraph):
+        result = p.text if p.title_level == 0 else p.title_num + ' ' + p.text
+        return result + '\n' + '\n'.join([self.get_paragraph_full_text(p) for p in p.children])
+
     def commit(self):
         self.session.commit()
 
@@ -112,6 +132,8 @@
 doc_dbh = DocDbHelper()
 
 # if __name__ == '__main__':
+# text = doc_dbh.get_text_with_entities(['遥控包格式'])
+# print(text)
 # doc_db = DocDbHelper()
 # # doc_db.insert_entities()
 # doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test'))
-- 
Gitblit v1.9.1