lyg
2025-05-22 e60d75228fb161e464ca59fa2526bf0765f4d902
knowledgebase/doc/docx_split.py
@@ -27,12 +27,14 @@
    """
    def __init__(self, docx_file: str):
    def __init__(self, docx_file: str, docx_type: str):
        """
        docx文档拆分
        :param docx_file: 要拆分的docx文件路径
        :param docx_type: 文档类型
        """
        self.docx_file = docx_file
        self.docx_type = docx_type
        self.image_to_text = ImageToText()
        self.paragraphs: list[ParagraphInfo] = []
        self.paragraph_tree: list[ParagraphInfo] = []
@@ -291,21 +293,3 @@
        # 替换原始列表内容,避免多次 remove 操作
        self.paragraphs[:] = _paragraphs
        self.paragraph_tree = result
# Manual run: split one hard-coded local document and print the paragraphs.
# NOTE(review): DocSplit is called below with a single argument, while the
# __init__ visible in this file takes (docx_file, docx_type). If DocSplit is
# the class that __init__ belongs to, this call is missing docx_type and
# would raise TypeError — confirm and pass a document type.
if __name__ == '__main__':
    # Absolute path on a developer machine; this only works where that file exists.
    docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配(公开).docx'
    # Alternative input, currently disabled:
    # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx'
    doc_split = DocSplit(docx_file)
    doc_split.split()
    # Disabled code kept for reference: it would run EntityRecognition over
    # each item of doc_split.trunks and use a Neo4jHelper instance. The two
    # Chinese print labels mean "paragraph text:" and "entity words:".
    # er = EntityRecognition()
    # db = Neo4jHelper()
    # for trunk in doc_split.trunks:
    #     print('段落文本:')
    #     print(trunk)
    #     print('实体词:')
    #     print(er.run(trunk))
    # entities = er.run(trunk)
    # db.create_page_node()
    # Print the full_text_with_children of every entry in
    # doc_split.paragraphs, one entry per line.
    print("\n".join([x.full_text_with_children for x in doc_split.paragraphs]))
    # Emit a trailing blank line after the output.
    print()