# Reconstructed from a Git patch for knowledgebase/utils.py
# (commit b75a49c22e7d2b9aa8d3dc4975df8801c52b4d5b, dated 2025-05-14).
# Commit subject, translated: "Modify document splitting and entity-word
# extraction logic; add entity-word text extraction".
#
# NOTE: the patch context also shows fragments of an existing, unchanged
# function `get_bit_mask(start, end)` (a loop over `range(start, end + 1)` that
# does `mask |= 1 << (bits - i - 1)` and returns `mask`). Its full body is
# elided by the diff hunks, so it is deliberately not reproduced here.
import hashlib
import json
import math
import os
import re

# Characters replaced by to_file_name(): the reserved punctuation
# \ / : * ? " < > | plus ASCII control characters (0x00-0x1F), which are
# likewise not permitted in Windows file names.
_INVALID_FILE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def generate_text_md5(input_string: str) -> str:
    """Return the hexadecimal MD5 digest of a text string.

    The string is encoded as UTF-8 before hashing.

    :param input_string: text to hash.
    :return: 32-character lowercase hexadecimal digest.
    """
    return generate_bytes_md5(input_string.encode('utf-8'))


def generate_bytes_md5(input_bytes: bytes) -> str:
    """Return the hexadecimal MD5 digest of a bytes-like object.

    :param input_bytes: raw bytes to hash.
    :return: 32-character lowercase hexadecimal digest.
    """
    return hashlib.md5(input_bytes).hexdigest()


def file_exists(cache_file: str) -> bool:
    """Return True if ``cache_file`` refers to an existing path."""
    return os.path.exists(cache_file)


def read_from_file(cache_file: str) -> str:
    """Read and return the whole content of ``cache_file`` as UTF-8 text.

    :param cache_file: path of the file to read.
    :return: the decoded file content.
    :raises OSError: if the file cannot be opened (e.g. it does not exist).
    """
    with open(cache_file, 'r', encoding='utf-8') as f:
        return f.read()


def save_to_file(text: str, cache_file: str) -> None:
    """Write ``text`` to ``cache_file`` as UTF-8, replacing existing content.

    :param text: text to write.
    :param cache_file: destination path; its parent directory must exist.
    :raises OSError: if the file cannot be opened for writing.
    """
    with open(cache_file, 'w', encoding='utf-8') as f:
        f.write(text)


def replace_tpl_paras(tpl_text: str, data: dict) -> str:
    """Substitute ``{{key}}`` placeholders in a template with values from ``data``.

    String values are inserted verbatim. Any other value is serialized to JSON
    and then escaped as the *content* of a JSON string literal (the surrounding
    quotes are stripped), so it can be dropped between double quotes inside a
    JSON template.

    All placeholders are replaced in a single pass over ``tpl_text``: text that
    was inserted for one key is never re-scanned, so a value that happens to
    contain ``{{other_key}}`` is left untouched instead of being expanded again.

    :param tpl_text: template text containing ``{{key}}`` placeholders.
    :param data: mapping of placeholder name (str) to replacement value.
    :return: the template with every known placeholder substituted; unknown
        placeholders are left as-is.
    :raises TypeError: if a key is not a string or a non-string value is not
        JSON-serializable.
    """
    if not data:
        return tpl_text

    replacements = {}
    for key, val in data.items():
        if not isinstance(val, str):
            # Double dump: first to JSON text, then to a JSON string literal
            # whose enclosing quotes are dropped, leaving escaped content.
            val = json.dumps(json.dumps(val, ensure_ascii=False), ensure_ascii=False)[1:-1]
        replacements['{{' + key + '}}'] = val

    pattern = re.compile('|'.join(re.escape(placeholder) for placeholder in replacements))
    # A callable replacement keeps backslashes in the values literal
    # (a plain replacement string would be processed for escapes by re.sub).
    return pattern.sub(lambda match: replacements[match.group(0)], tpl_text)


def to_file_name(text: str) -> str:
    """Convert text into a legal file name.

    Each special character (``\\ / : * ? " < > |``) and each ASCII control
    character is replaced with an underscore.

    :param text: arbitrary text to derive a file name from.
    :return: the text with every disallowed character replaced by ``_``.
    """
    return _INVALID_FILE_NAME_CHARS.sub('_', text)