From b75a49c22e7d2b9aa8d3dc4975df8801c52b4d5b Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期三, 14 五月 2025 16:04:28 +0800 Subject: [PATCH] 修改文档拆分和实体词提取逻辑,增加实体词文本抽取 --- knowledgebase/doc/doc_processor.py | 2 +- knowledgebase/doc/entity_helper.py | 1 + knowledgebase/db/doc_db_helper.py | 12 +++++++++++- knowledgebase/doc/entity_recognition.py | 24 +++++++++++++++--------- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py index d1231cc..fe36fb2 100644 --- a/knowledgebase/db/doc_db_helper.py +++ b/knowledgebase/db/doc_db_helper.py @@ -114,8 +114,16 @@ """ if not entity_names: return "" + _entities = self.session.query(TEntity).where(TEntity.name.in_(entity_names)).all() + _entitie_ids = [entity.id for entity in _entities] + links = self.session.query(TParagraphEntityLink).where(TParagraphEntityLink.entity_id.in_(_entitie_ids)).all() + _paragraphs = [link.paragraph for link in links] - return '\n'.join([entity.name for entity in self.get_all_entities() if entity.name in entity_names]) + return '\n'.join([self.get_paragraph_full_text(p) for p in _paragraphs]) + + def get_paragraph_full_text(self, p: TParagraph): + result = p.text if p.title_level == 0 else p.title_num + ' ' + p.text + return result + '\n' + '\n'.join([self.get_paragraph_full_text(p) for p in p.children]) def commit(self): self.session.commit() @@ -124,6 +132,8 @@ doc_dbh = DocDbHelper() # if __name__ == '__main__': +# text = doc_dbh.get_text_with_entities(['閬ユ帶鍖呮牸寮�']) +# print(text) # doc_db = DocDbHelper() # # doc_db.insert_entities() # doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test')) diff --git a/knowledgebase/doc/doc_processor.py b/knowledgebase/doc/doc_processor.py index bc0e0dd..d681d94 100644 --- a/knowledgebase/doc/doc_processor.py +++ b/knowledgebase/doc/doc_processor.py @@ -96,7 +96,7 @@ r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈鸿蒋浠剁敤鎴烽渶姹傦紙鍏紑锛�.docx", r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬪ぇ绾诧紙鍏紑锛�.docx", r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬩俊鍙峰垎閰嶈〃锛堝叕寮�锛�.docx", - r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx", + # r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx", r"D:\workspace\PythonProjects\KnowledgeBase\doc\鎸囦护鏍煎紡(鍏紑).docx" ] for file in files: diff --git a/knowledgebase/doc/entity_helper.py b/knowledgebase/doc/entity_helper.py index e354449..659c5ae 100644 --- a/knowledgebase/doc/entity_helper.py +++ b/knowledgebase/doc/entity_helper.py @@ -38,6 +38,7 @@ _entity = TEntity(name=entity, type=ty, doc_type=doc_ty, prompts=obj2[doc_ty]['entities'][entity]) doc_dbh.add_entity(_entity) + self.entities.append(_entity) Log.info(f"鏂板Entity锛歿entity}锛宨d锛歿_entity.id}") diff --git a/knowledgebase/doc/entity_recognition.py b/knowledgebase/doc/entity_recognition.py index 8b3d58e..1144aab 100644 --- a/knowledgebase/doc/entity_recognition.py +++ b/knowledgebase/doc/entity_recognition.py @@ -12,6 +12,7 @@ from knowledgebase import utils from knowledgebase.doc.entity_helper import entity_helper +from knowledgebase.log import Log llm = ChatOpenAI(temperature=0, model="qwen2.5-72b-instruct", @@ -25,30 +26,35 @@ 浣跨敤langchain鏋勫缓瀹炰綋鎶藉彇娴佺▼銆� """ + use_cache = False cache_file = "entity_recognition.cache" def __init__(self, doc_type: str): # 瀹炰綋璇嶅垪琛� - entities = filter(lambda x: x.doc_type == doc_type, entity_helper.entities) - entity_list = '锛沑n'.join([f'- {entity.name}锛歿entity.prompts}' for entity in entities]) + "銆�" - msg = HumanMessagePromptTemplate.from_template(template=""" + entities = list(filter(lambda x: x.doc_type == doc_type, entity_helper.entities)) + entity_list = '锛�'.join([entity.name for entity in entities]) + "銆�" + entity_rules = "锛沑n".join([f"- {entity.name}锛歿entity.prompts}" for entity in entities]) + "銆�" + tpl = """ # 鎸囦护 -璇蜂粠缁欏畾鐨勬枃鏈腑鎻愬彇瀹炰綋璇嶅垪琛紝瀹炰綋璇嶅垪琛ㄥ畾涔夊涓嬶細 -## 瀹炰綋璇嶅垪琛ㄥ強璇嗗埆瑙勫垯 +璇锋牴鎹疄浣撹瘝鍒ゆ柇瑙勫垯浠庣粰瀹氱殑鏂囨湰涓垽鏂槸鍚︽湁涓嬪垪瀹炰綋璇嶇浉鍏冲唴瀹癸紝濡傛灉鏈夊垯杈撳嚭鐩稿叧鐨勫疄浣撹瘝锛屾病鏈夊垯涓嶈緭鍑猴紝瀹炰綋璇嶅垪琛ㄥ畾涔夊涓嬶細 """ + entity_list + """ +## 瀹炰綋璇嶅垽鏂鍒欙細 +""" + entity_rules + """ # 绾︽潫 - 杈撳嚭鏍煎紡涓篔SON鏍煎紡锛� -- 鎻愬彇鐨勫疄浣撹瘝蹇呴』鏄笂闈㈠垪涓剧殑瀹炰綋璇嶏紱 +- 鎻愬彇鐨勫疄浣撹瘝蹇呴』鏄細""" + entity_list + """锛� +- 濡傛灉娌℃湁澶嶅悎涓婅堪瑙勫垯鐨勫疄浣撹瘝鍒欎笉瑕佽緭鍑轰换浣曞疄浣撹瘝锛� - 杈撳嚭鏁版嵁缁撴瀯涓哄瓧绗︿覆鏁扮粍銆� # 绀轰緥 ```json -["閬ユ帶甯ф牸寮�","閬ユ帶鍖呮牸寮�"] +[\"""" + entities[0].name + """\"] ``` # 鏂囨湰濡備笅锛� {text} """ - ) + Log.info(tpl) + msg = HumanMessagePromptTemplate.from_template(template=tpl) prompt = ChatPromptTemplate.from_messages([msg]) parser = JsonOutputParser(pydantic_object=list[str]) self.chain = prompt | llm | parser @@ -77,7 +83,7 @@ """ # 缂撳瓨鍛戒腑 text_md5 = utils.generate_text_md5(in_text) - if text_md5 in self.cache: + if self.use_cache and text_md5 in self.cache: return self.cache[text_md5] result = self.chain.invoke({"text": in_text}) self.cache[text_md5] = result -- Gitblit v1.9.1