From acde3bd32f07bf02839a21e8fe5b4e69bfca2251 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期三, 14 五月 2025 10:37:00 +0800 Subject: [PATCH] docx文档拆分,文档段落实体词提取,存入mysql数据库。 --- knowledgebase/doc/entity_recognition.py | 25 +++++++++++++++++-------- 1 files changed, 17 insertions(+), 8 deletions(-) diff --git a/knowledgebase/doc/entity_recognition.py b/knowledgebase/doc/entity_recognition.py index 6512bfe..8b3d58e 100644 --- a/knowledgebase/doc/entity_recognition.py +++ b/knowledgebase/doc/entity_recognition.py @@ -11,6 +11,12 @@ import json from knowledgebase import utils +from knowledgebase.doc.entity_helper import entity_helper + +llm = ChatOpenAI(temperature=0, + model="qwen2.5-72b-instruct", + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") class EntityRecognition: @@ -21,20 +27,22 @@ """ cache_file = "entity_recognition.cache" - def __init__(self): - llm = ChatOpenAI(temperature=0, - model="qwen2.5-72b-instruct", - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") + def __init__(self, doc_type: str): + # 瀹炰綋璇嶅垪琛� + entities = filter(lambda x: x.doc_type == doc_type, entity_helper.entities) + entity_list = '锛沑n'.join([f'- {entity.name}锛歿entity.prompts}' for entity in entities]) + "銆�" msg = HumanMessagePromptTemplate.from_template(template=""" # 鎸囦护 -璇蜂粠缁欏畾鐨勬枃鏈腑鎻愬彇瀹炰綋璇嶅垪琛ㄣ�� +璇蜂粠缁欏畾鐨勬枃鏈腑鎻愬彇瀹炰綋璇嶅垪琛紝瀹炰綋璇嶅垪琛ㄥ畾涔夊涓嬶細 +## 瀹炰綋璇嶅垪琛ㄥ強璇嗗埆瑙勫垯 +""" + entity_list + """ # 绾︽潫 - 杈撳嚭鏍煎紡涓篔SON鏍煎紡锛� +- 鎻愬彇鐨勫疄浣撹瘝蹇呴』鏄笂闈㈠垪涓剧殑瀹炰綋璇嶏紱 - 杈撳嚭鏁版嵁缁撴瀯涓哄瓧绗︿覆鏁扮粍銆� # 绀轰緥 ```json -["瀹炰綋1","瀹炰綋2"] +["閬ユ帶甯ф牸寮�","閬ユ帶鍖呮牸寮�"] ``` # 鏂囨湰濡備笅锛� @@ -65,9 +73,10 @@ def run(self, in_text: str) -> list[str]: """ 杩愯瀹炰綋璇嗗埆鎶藉彇銆� + :param in_text: str - 杈撳叆鏂囨湰 """ # 缂撳瓨鍛戒腑 - text_md5 = utils.generate_md5(in_text) + text_md5 = utils.generate_text_md5(in_text) if text_md5 in self.cache: return self.cache[text_md5] result = self.chain.invoke({"text": in_text}) -- Gitblit v1.9.1