| | |
| | | |
| | | """ |
| | | |
def __init__(self, docx_file: str, docx_type: str = ''):
    """
    Set up a splitter for a single docx document.

    :param docx_file: path of the docx file to split
    :param docx_type: document type. Optional, so callers that pass only
        ``docx_file`` keep working.
        NOTE(review): the empty-string default stands in for
        "unspecified" -- confirm which value the code that reads
        ``self.docx_type`` expects in that case.
    """
    self.docx_file = docx_file
    self.docx_type = docx_type
    self.image_to_text = ImageToText()
    # Both collections start empty; they are assigned later, outside
    # the constructor.
    self.paragraphs: list[ParagraphInfo] = []
    self.paragraph_tree: list[ParagraphInfo] = []
| | |
| | | # 替换原始列表内容,避免多次 remove 操作 |
| | | self.paragraphs[:] = _paragraphs |
| | | self.paragraph_tree = result |
| | | |
| | | |
if __name__ == '__main__':
    # Manual smoke run: split one sample document and dump the result.
    docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配(公开).docx'
    # Alternative sample input:
    # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx'

    # NOTE(review): DocSplit.__init__ also declares a docx_type parameter;
    # verify that this single-argument call matches the constructor.
    doc_split = DocSplit(docx_file)
    doc_split.split()

    # Disabled experiment (kept as a note): run EntityRecognition over each
    # entry of doc_split.trunks, print the text and its entities, and create
    # page nodes through Neo4jHelper.

    # One paragraph per line, each rendered together with its children.
    print("\n".join(
        paragraph.full_text_with_children
        for paragraph in doc_split.paragraphs
    ))
    print()