From acde3bd32f07bf02839a21e8fe5b4e69bfca2251 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期三, 14 五月 2025 10:37:00 +0800
Subject: [PATCH] docx文档拆分,文档段落实体词提取,存入mysql数据库。

---
 knowledgebase/doc/entity_recognition.py |   25 +++++++++++++++++--------
 1 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/knowledgebase/doc/entity_recognition.py b/knowledgebase/doc/entity_recognition.py
index 6512bfe..8b3d58e 100644
--- a/knowledgebase/doc/entity_recognition.py
+++ b/knowledgebase/doc/entity_recognition.py
@@ -11,6 +11,12 @@
 import json
 
 from knowledgebase import utils
+from knowledgebase.doc.entity_helper import entity_helper
+
+llm = ChatOpenAI(temperature=0,
+                 model="qwen2.5-72b-instruct",
+                 base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+                 api_key="sk-15ecf7e273ad4b729c7f7f42b542749e")
 
 
 class EntityRecognition:
@@ -21,20 +27,22 @@
     """
     cache_file = "entity_recognition.cache"
 
-    def __init__(self):
-        llm = ChatOpenAI(temperature=0,
-                         model="qwen2.5-72b-instruct",
-                         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
-                         api_key="sk-15ecf7e273ad4b729c7f7f42b542749e")
+    def __init__(self, doc_type: str):
+        # 实体词列表
+        entities = filter(lambda x: x.doc_type == doc_type, entity_helper.entities)
+        entity_list = '；\n'.join([f'- {entity.name}：{entity.prompts}' for entity in entities]) + "。"
         msg = HumanMessagePromptTemplate.from_template(template="""
 # 指令
-请从给定的文本中提取实体词列表。
+请从给定的文本中提取实体词列表，实体词列表定义如下：
+## 实体词列表及识别规则
+""" + entity_list + """
 # 约束
 - 输出格式为JSON格式；
+- 提取的实体词必须是上面列举的实体词；
 - 输出数据结构为字符串数组。
 # 示例
 ```json
-["实体1","实体2"]
+["遥控帧格式","遥控包格式"]
 ```
 
 # 文本如下：
@@ -65,9 +73,10 @@
     def run(self, in_text: str) -> list[str]:
         """
         杩愯瀹炰綋璇嗗埆鎶藉彇銆�
+        :param in_text: str - 杈撳叆鏂囨湰
         """
         # 缓存命中
-        text_md5 = utils.generate_md5(in_text)
+        text_md5 = utils.generate_text_md5(in_text)
         if text_md5 in self.cache:
             return self.cache[text_md5]
         result = self.chain.invoke({"text": in_text})

--
Gitblit v1.9.1