From 22f370322412074174cde20ecfd14ec03657ab63 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: Mon, 07 Jul 2025 16:20:25 +0800
Subject: [PATCH] 生成数据库

---
 main.py |   83 ++++++++++++-----------------------------
 1 files changed, 25 insertions(+), 58 deletions(-)

diff --git a/main.py b/main.py
index 50eec97..6cc6b78 100644
--- a/main.py
+++ b/main.py
@@ -1,67 +1,34 @@
-import math
+import asyncio
 import os
-import random
-import time
+import sys
 
-from knowledgebase.markitdown import MarkItDown
-
-from doc_to_docx import doc_to_docx
+from db_struct_flow import DbStructFlow, tc_data_generate
+from knowledgebase.db.doc_db_helper import doc_dbh
+from knowledgebase.doc.doc_processor import DocProcessor
+from knowledgebase.doc.entity_helper import init_entity_helper
 
 
-def process_docs(directory):
-    # 閬嶅巻鐩綍涓嬬殑鎵�鏈夋枃浠�
-    for filename in os.listdir(directory):
-        # 鍒ゆ柇鏄惁涓� doc 鏂囦欢
-        if filename.endswith(".doc"):
-            # 杞崲涓� docx
-            doc_to_docx(directory + filename, directory + filename.replace(".doc", ".docx"))
+def doc_split(project_path):
+    """Run DocProcessor over every .docx file found in <project_path>/docs."""
+    docs_path = f'{project_path}/docs'
+    for name in os.listdir(docs_path):
+        if name.endswith('.docx'):
+            DocProcessor(f'{docs_path}/{name}').process()
 
 
-md = MarkItDown()
 
-
-def to_markdown(dst_dir: str):
-    text = ''
-    # 閬嶅巻鏂囦欢澶逛笅鐨勬墍鏈夋枃浠�
-    for file in os.listdir(dst_dir):
-        # 鍒ゆ柇鏄惁涓� docx 鏂囦欢
-        if file.endswith(".docx"):
-            # 杞崲涓� md
-            result = md.convert(dst_dir + file)
-            text = result.text_content
-            out_file = dst_dir + file + '.md'
-            with open(out_file, 'w', encoding='utf-8') as f:
-                f.write(text)
-    return out_file
-
-
-# 1.瑙f瀽鏂囨。
-# 2.杈撳叆鏂囨。
-# 3.鍚姩LangFlow
 def main():
-    doc_dir = ".\\doc\\"
-    # 澶勭悊鏂囨。
-    # process_docs(doc_dir)
-    # 鏂囨。杞崲涓簃arkdown
-    md_file = to_markdown(doc_dir)
-
-    md_file = 'D:\\workspace\\PythonProjects\\KnowledgeBase\\doc\\test.md'
+    project_path = sys.argv[1] if len(sys.argv) > 1 else ''  # no arg -> usage, not IndexError
+    if not project_path:
+        print("missing project path. eg: python main.py <path/to/project>")
+        return
+    # Register the project path and init the entity helper; doc_split is disabled.
+    doc_dbh.set_project_path(project_path)
+    init_entity_helper()
+    # doc_split(project_path)
     # 鍚姩澶фā鍨嬪鐞嗘祦绋�
-    # ret_text = LangFlow([md_file]).run()
-    # 淇濆瓨缁撴灉
-    # with open('D:\\workspace\\PythonProjects\\KnowledgeBase\\doc\\test.text', 'w', encoding='utf-8') as f:
-    #     f.write(ret_text)
-
-
-def get_bit_mask(start, end):
-    bits = math.ceil((end + 1) / 8) * 8
-    if bits == 0:
-        bits = 8
-    mask = 0
-    for i in range(start, end + 1):
-        mask |= 1 << (bits - i - 1)
-    return mask
-
-
-# if __name__ == '__main__':
-#     main()
+    asyncio.run(DbStructFlow(project_path).run())
+    # Generate the command (instruction) data table.
+    tc_data_generate()
+if __name__ == "__main__":
+    main()

--
Gitblit v1.9.1