From e60d75228fb161e464ca59fa2526bf0765f4d902 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期四, 22 五月 2025 12:35:55 +0800 Subject: [PATCH] 修改指令json生成,加入fastapi --- knowledgebase/doc/docx_split.py | 22 +++------------------- 1 files changed, 3 insertions(+), 19 deletions(-) diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py index 22d189a..4270b05 100644 --- a/knowledgebase/doc/docx_split.py +++ b/knowledgebase/doc/docx_split.py @@ -27,12 +27,14 @@ """ - def __init__(self, docx_file: str): + def __init__(self, docx_file: str, docx_type: str): """ docx鏂囨。鎷嗗垎 :param docx_file: 瑕佹媶鍒嗙殑docx鏂囦欢璺緞 + :param docx_type: 鏂囨。绫诲瀷 """ self.docx_file = docx_file + self.docx_type = docx_type self.image_to_text = ImageToText() self.paragraphs: list[ParagraphInfo] = [] self.paragraph_tree: list[ParagraphInfo] = [] @@ -291,21 +293,3 @@ # 鏇挎崲鍘熷鍒楄〃鍐呭锛岄伩鍏嶅娆� remove 鎿嶄綔 self.paragraphs[:] = _paragraphs self.paragraph_tree = result - - -if __name__ == '__main__': - docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx' - # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx' - doc_split = DocSplit(docx_file) - doc_split.split() - # er = EntityRecognition() - # db = Neo4jHelper() - # for trunk in doc_split.trunks: - # print('娈佃惤鏂囨湰锛�') - # print(trunk) - # print('瀹炰綋璇嶏細') - # print(er.run(trunk)) - # entities = er.run(trunk) - # db.create_page_node() - print("\n".join([x.full_text_with_children for x in doc_split.paragraphs])) - print() -- Gitblit v1.9.1