lyg
2 天以前 22f370322412074174cde20ecfd14ec03657ab63
knowledgebase/doc/docx_split.py
@@ -15,7 +15,7 @@
from knowledgebase.doc.image_to_text import ImageToText
from knowledgebase.doc.models import ParagraphInfo
from knowledgebase.log import Log
from bs4 import BeautifulSoup
class DocSplit:
    """
@@ -27,12 +27,14 @@
    """
    def __init__(self, docx_file: str):
    def __init__(self, docx_file: str, docx_type: str):
        """
        docx文档拆分
        :param docx_file: 要拆分的docx文件路径
        :param docx_type: 文档类型
        """
        self.docx_file = docx_file
        self.docx_type = docx_type
        self.image_to_text = ImageToText()
        self.paragraphs: list[ParagraphInfo] = []
        self.paragraph_tree: list[ParagraphInfo] = []
@@ -69,6 +71,9 @@
                else:
                    # 单元格文本获取
                    text = cell.text
                    if cell._element.xml.find("w:ins")!=-1:
                        soup = BeautifulSoup(cell._element.xml, "xml")
                        text = ''.join([x.get_text() for x in soup.find_all("w:t")])
                # row_data[headers[row_idx]] = text
                row_data.append(text)
                row_idx += 1
@@ -92,6 +97,7 @@
                # 获取标题多级编号
                paragraph = document.paragraphs[paragraph_cnt]
                p_text = paragraph.text
                is_toc = paragraph.style.name.startswith('TOC') and '目' in p_text and '录' in p_text
                try:
                    num = element.pPr.numPr.numId.val
                    level = element.pPr.numPr.ilvl.val
@@ -100,7 +106,7 @@
                    level = 0
                if p_text:
                    title_level = self.get_title_level(paragraph)
                    self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level))
                    self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level, is_toc))
                # 检查是否是图片,如果是图片则转换为文本
                img_data = self.get_image_text(paragraph)
                if img_data:
@@ -116,12 +122,27 @@
                    ParagraphInfo("```json\n" + json.dumps(table_data, indent=4, ensure_ascii=False) + "\n```", 0))
            else:
                continue
        # 去除目录
        self.remove_toc(self.paragraphs)
        # 生成标题编号
        Log.info(f"开始生成标题编号和列表编号")
        self.gen_title_num(self.paragraphs)
        # 生成树形结构
        Log.info(f"开始生成树形结构")
        self.gen_paragraph_tree(self.paragraphs)
    @staticmethod
    def remove_toc(paragraphs: [ParagraphInfo]):
        rm_list = []
        for p in paragraphs:
            if p.is_toc:
                rm_list.append(p)
            elif rm_list and p.title_level == 1:
                break
            elif rm_list:
                rm_list.append(p)
        for p in rm_list:
            paragraphs.remove(p)
    @staticmethod
    def get_image_text(paragraph):
@@ -246,6 +267,7 @@
        :param img_data: bytes - 图片数据
        :return: str - 文本
        """
        return ''
        return self.image_to_text.gen_text_from_img(img_data)
    def gen_paragraph_tree(self, paragraphs: typing.List[ParagraphInfo]):
@@ -291,21 +313,3 @@
        # 替换原始列表内容,避免多次 remove 操作
        self.paragraphs[:] = _paragraphs
        self.paragraph_tree = result
if __name__ == '__main__':
    # Manual smoke run: split one sample document and dump the result.
    # NOTE(review): a DocSplit.__init__ signature taking an extra ``docx_type``
    # argument is visible in this file; confirm this one-argument call still
    # matches the constructor.
    sample_path = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配(公开).docx'
    # Alternative sample input:
    # sample_path = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx'
    splitter = DocSplit(sample_path)
    splitter.split()
    # Disabled entity-recognition / Neo4j experiment, kept for reference:
    # er = EntityRecognition()
    # db = Neo4jHelper()
    # for trunk in splitter.trunks:
    #     print('段落文本:')
    #     print(trunk)
    #     print('实体词:')
    #     print(er.run(trunk))
    # entities = er.run(trunk)
    # db.create_page_node()
    # Print every paragraph's full text (including children), one per line.
    rendered = "\n".join(x.full_text_with_children for x in splitter.paragraphs)
    print(rendered)
    print()