| | |
| | | from knowledgebase.doc.image_to_text import ImageToText |
| | | from knowledgebase.doc.models import ParagraphInfo |
| | | from knowledgebase.log import Log |
| | | |
| | | from bs4 import BeautifulSoup |
| | | |
| | | class DocSplit: |
| | | """ |
| | |
| | | |
| | | """ |
| | | |
def __init__(self, docx_file: str, docx_type: str):
    """
    Set up a splitter for a single docx document.

    :param docx_file: path of the docx file to split
    :param docx_type: document type
    """
    self.docx_file = docx_file
    self.docx_type = docx_type
    # Converter used to turn image data found in the document into text.
    self.image_to_text = ImageToText()
    # Paragraphs collected while splitting the document.
    self.paragraphs: list[ParagraphInfo] = []
    # Tree form of the paragraphs; starts empty and is filled in later.
    self.paragraph_tree: list[ParagraphInfo] = []
| | |
| | | else: |
| | | # 单元格文本获取 |
| | | text = cell.text |
| | | if cell._element.xml.find("w:ins")!=-1: |
| | | soup = BeautifulSoup(cell._element.xml, "xml") |
| | | text = ''.join([x.get_text() for x in soup.find_all("w:t")]) |
| | | # row_data[headers[row_idx]] = text |
| | | row_data.append(text) |
| | | row_idx += 1 |
| | |
| | | # 获取标题多级编号 |
| | | paragraph = document.paragraphs[paragraph_cnt] |
| | | p_text = paragraph.text |
| | | is_toc = paragraph.style.name.startswith('TOC') and '目' in p_text and '录' in p_text |
| | | try: |
| | | num = element.pPr.numPr.numId.val |
| | | level = element.pPr.numPr.ilvl.val |
| | |
| | | level = 0 |
| | | if p_text: |
| | | title_level = self.get_title_level(paragraph) |
| | | self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level)) |
| | | self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level, is_toc)) |
| | | # 检查是否是图片,如果是图片则转换为文本 |
| | | img_data = self.get_image_text(paragraph) |
| | | if img_data: |
| | |
| | | ParagraphInfo("```json\n" + json.dumps(table_data, indent=4, ensure_ascii=False) + "\n```", 0)) |
| | | else: |
| | | continue |
| | | # 去除目录 |
| | | self.remove_toc(self.paragraphs) |
| | | # 生成标题编号 |
| | | Log.info(f"开始生成标题编号和列表编号") |
| | | self.gen_title_num(self.paragraphs) |
| | | # 生成树形结构 |
| | | Log.info(f"开始生成树形结构") |
| | | self.gen_paragraph_tree(self.paragraphs) |
| | | |
| | | @staticmethod |
| | | def remove_toc(paragraphs: [ParagraphInfo]): |
| | | rm_list = [] |
| | | for p in paragraphs: |
| | | if p.is_toc: |
| | | rm_list.append(p) |
| | | elif rm_list and p.title_level == 1: |
| | | break |
| | | elif rm_list: |
| | | rm_list.append(p) |
| | | for p in rm_list: |
| | | paragraphs.remove(p) |
| | | |
| | | @staticmethod |
| | | def get_image_text(paragraph): |
| | |
| | | :param img_data: bytes - 图片数据 |
| | | :return: str - 文本 |
| | | """ |
| | | return '' |
| | | return self.image_to_text.gen_text_from_img(img_data) |
| | | |
| | | def gen_paragraph_tree(self, paragraphs: typing.List[ParagraphInfo]): |
| | |
| | | # 替换原始列表内容,避免多次 remove 操作 |
| | | self.paragraphs[:] = _paragraphs |
| | | self.paragraph_tree = result |
| | | |
| | | |
if __name__ == '__main__':
    # Manual run against a local sample document.
    docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配(公开).docx'
    # Alternative sample: r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx'
    # NOTE(review): only docx_file is passed here, while an __init__ signature
    # in this file also lists docx_type — confirm which call form is current.
    doc_split = DocSplit(docx_file)
    doc_split.split()
    # An entity-recognition / Neo4j experiment (EntityRecognition, Neo4jHelper)
    # used to be sketched here as commented-out code.
    rendered = (p.full_text_with_children for p in doc_split.paragraphs)
    print("\n".join(rendered))
    print()