"""Command-line entry point for the project database generation flow.

Reconstructed from the post-patch state of ``main.py`` in commit 22f3703
(patch subject, translated: "generate database").

Usage:
    python main.py <path/to/project>
"""
import asyncio
import os
import sys

from db_struct_flow import DbStructFlow, tc_data_generate
from knowledgebase.db.doc_db_helper import doc_dbh
from knowledgebase.doc.doc_processor import DocProcessor
from knowledgebase.doc.entity_helper import init_entity_helper


def doc_split(project_path):
    """Run ``DocProcessor`` on every ``.docx`` file in ``<project_path>/docs``.

    Only the top level of the ``docs`` directory is scanned (no recursion).

    Args:
        project_path: Project root directory containing a ``docs`` folder.

    Raises:
        OSError: Propagated from ``os.listdir`` when the ``docs`` directory
            cannot be listed (for example, it does not exist).
    """
    docs_path = f'{project_path}/docs'
    # Paths are built with '/' exactly as before so DocProcessor receives the
    # same strings it always did; os.listdir snapshots the names up front.
    for name in os.listdir(docs_path):
        if name.endswith('.docx'):
            DocProcessor(f'{docs_path}/{name}').process()


def main():
    """Read the project path from the command line and run the flow.

    Steps, in order: point ``doc_dbh`` at the project, initialise the entity
    helper, run ``DbStructFlow`` to completion on a fresh event loop, then
    call ``tc_data_generate`` (original comment: "generate the instruction
    data table").

    Prints a usage hint and returns without doing any work when the project
    path argument is missing or empty.
    """
    # sys.argv[0] is the script itself, so indexing [1] unconditionally raised
    # IndexError when no argument was given and the usage hint never showed.
    if len(sys.argv) < 2 or not sys.argv[1]:
        print("missing project path. \neg: python main.py <path/to/project>")
        return
    project_path = sys.argv[1]

    # Prepare shared state before the flow starts.
    doc_dbh.set_project_path(project_path)
    init_entity_helper()
    # Document splitting is left disabled, as in the original patch.
    # doc_split(project_path)

    # Start the large-model processing flow.
    asyncio.run(DbStructFlow(project_path).run())
    # Generate the instruction data table.
    tc_data_generate()


if __name__ == "__main__":
    main()