New file |
| | |
# -*- coding: utf-8 -*-
#
# @author: lyg, ym
# @date: 2025-5-8
# @version: 1
# @description: docx document splitter: splits the document by paragraph and converts images and tables to JSON data.
| | | import docx |
| | | import docx.table |
| | | import json |
| | | from dataclasses import dataclass |
| | | from PIL import Image |
| | | import io |
| | | import re |
| | | |
| | | from knowledgebase.doc.image_to_text import ImageToText |
| | | |
| | | |
@dataclass
class ParagraphInfo:
    """
    One extracted paragraph of a document.

    :param text: str - paragraph text
    :param level: int - paragraph level: 1-9 for heading levels, 0 for body text
    :param title_no: str - heading number such as "1.1" or "1.1.1"; empty until
        a number has been assigned
    """
    text: str
    level: int
    # Defaulted so that ParagraphInfo(text, level) keeps working while the
    # dataclass-generated __init__/__repr__/__eq__ all agree on the fields.
    title_no: str = ''

    @property
    def full_text(self) -> str:
        """
        Return the paragraph text prefixed with its heading number.

        :return: str - "<title_no> <text>" when a heading number is set,
            otherwise just the text (no leading space)
        """
        if self.title_no:
            return f"{self.title_no} {self.text}"
        return self.text
| | | |
| | | |
class DocSplit:
    """
    Splits a .docx document into a flat list of ParagraphInfo items.

    1. Text paragraphs are stored together with their heading level
       (0 for body text).
    2. Pictures are converted to PNG and passed to ImageToText; the returned
       text is stored as a body paragraph. Tables are serialised to JSON text
       and stored as body paragraphs.
    3. Headings receive a hierarchical number such as "1.1" or "1.1.1".
    """

    # Maximum number of base-style links followed when looking for a heading
    # style; bounds the walk in case a document contains a cyclic style chain.
    _MAX_STYLE_DEPTH = 16

    # (XML markers that identify a picture run, regex capturing the
    # relationship id of the picture). VML pictures are checked first,
    # DrawingML pictures second.
    _IMAGE_REF_RULES = (
        (('v:imagedata',), r'r:id="([^"]+)"'),
        (('wp:inline', 'wp:anchor'), r'r:embed="([^"]+)"'),
    )

    def __init__(self, doc_file):
        """
        :param doc_file: document to split; passed unchanged to docx.Document
        """
        self.doc_file = doc_file
        self.image_to_text = ImageToText()
        self.paragraphs: list[ParagraphInfo] = []

    def table_to_json(self, table: "docx.table.Table"):
        """
        Convert a table into JSON-serialisable data.

        The first row supplies the column names; every following row becomes
        a dict mapping column name to cell value. A cell that contains nested
        tables is converted recursively instead of using its text.

        NOTE: duplicate header texts map to the same key, so a later column
        overwrites an earlier one with the same name.

        :param table: docx.table.Table - table to convert
        :return: list - one dict per data row; empty when the table has no rows
        """
        rows = iter(table.rows)
        header_row = next(rows, None)
        if header_row is None:
            return []
        headers = [cell.text for cell in header_row.cells]

        table_data = []
        for row in rows:
            row_data = {}
            for col_idx, cell in enumerate(row.cells):
                if col_idx < len(headers):
                    key = headers[col_idx]
                else:
                    # A data row can be wider than the header row; use a
                    # positional name instead of raising IndexError.
                    key = f"column_{col_idx + 1}"
                row_data[key] = self._cell_to_value(cell)
            table_data.append(row_data)
        return table_data

    def _cell_to_value(self, cell):
        """Return the cell text, or the converted nested table(s) if present."""
        nested_tables = cell.tables
        if not nested_tables:
            return cell.text
        if len(nested_tables) == 1:
            return self.table_to_json(nested_tables[0])
        return [self.table_to_json(tbl) for tbl in nested_tables]

    def split(self):
        """
        Split the document into paragraphs in document order.

        The result replaces any content from a previous call.

        :return: list[ParagraphInfo] - the paragraph list (also available as
            self.paragraphs)
        """
        document = docx.Document(self.doc_file)
        # Start from a clean list so a second call does not duplicate content.
        self.paragraphs = []

        # Fetch the paragraph and table wrappers once and index them by their
        # XML element, instead of re-reading document.paragraphs /
        # document.tables for every body element.
        paragraph_by_element = {p._element: p for p in document.paragraphs}
        table_by_element = {t._element: t for t in document.tables}

        for element in document.element.body:
            paragraph = paragraph_by_element.get(element)
            if paragraph is not None:
                self._append_paragraph(paragraph)
                continue
            table = table_by_element.get(element)
            if table is not None:
                table_data = self.table_to_json(table)
                self.paragraphs.append(
                    ParagraphInfo(json.dumps(table_data, indent=4, ensure_ascii=False), 0))
            # Any other body element (e.g. section properties) is ignored.

        # Assign heading numbers.
        self.gen_title_no(self.paragraphs)
        return self.paragraphs

    def _append_paragraph(self, paragraph):
        """Append the text of a paragraph and, if it holds a picture, the picture's text."""
        paragraph_text = paragraph.text
        if paragraph_text:
            self.paragraphs.append(
                ParagraphInfo(paragraph_text, self.get_header_level(paragraph)))
        # A paragraph may carry a picture; convert it to text as a body paragraph.
        img_data = self.get_image_blob(paragraph)
        if img_data:
            self.paragraphs.append(ParagraphInfo(self.gen_text_from_img(img_data), 0))

    @staticmethod
    def get_image_blob(paragraph):
        """
        Return the first picture found in the paragraph as PNG bytes.

        Each run's XML is searched for a VML picture (v:imagedata, r:id) and
        then for a DrawingML picture (wp:inline / wp:anchor, r:embed).

        :param paragraph: paragraph whose runs are inspected
        :return: bytes - PNG data of the first picture, or None if there is none
        """
        for run in paragraph.runs:
            xml = run._element.xml
            for markers, id_pattern in DocSplit._IMAGE_REF_RULES:
                if not any(marker in xml for marker in markers):
                    continue
                match = re.search(id_pattern, xml)
                if match:
                    # Resolve the relationship id to the embedded image part.
                    image_part = paragraph.part.rels[match.group(1)].target_part
                    return DocSplit.image_convert(image_part.blob, "png")
        return None

    @staticmethod
    def gen_title_no(paragraphs: "list[ParagraphInfo]"):
        """
        Assign hierarchical heading numbers ("1", "1.1", "1.1.1", ...) in place.

        Each heading increments the counter of its own level and restarts the
        counters of all deeper levels; counters of shallower levels are kept,
        and body paragraphs (level 0) do not affect the numbering. A level
        that was skipped (e.g. level 3 directly under level 1) is rendered
        as 0.

        :param paragraphs: paragraphs in document order; title_no of every
            paragraph with level > 0 is overwritten
        """
        counters: list[int] = []
        for paragraph in paragraphs:
            level = paragraph.level
            if level <= 0:
                continue
            if len(counters) < level:
                counters.extend([0] * (level - len(counters)))
            counters[level - 1] += 1
            # Dropping deeper counters restarts sub-numbering under this heading.
            del counters[level:]
            paragraph.title_no = '.'.join(map(str, counters))

    @staticmethod
    def get_header_level(paragraph) -> int:
        """
        Return the heading level of a paragraph.

        The paragraph's own style is checked first, then its chain of base
        styles; the first style named "Heading <n>" decides the level.

        :param paragraph: paragraph to inspect
        :return: int - heading level n, or 0 when no heading style is found
        """
        style = paragraph.style
        for _ in range(DocSplit._MAX_STYLE_DEPTH):
            if style is None:
                break
            match = re.match(r'Heading\s+(\d+)', style.name or '')
            if match:
                return int(match.group(1))
            style = style.base_style
        return 0

    @staticmethod
    def image_convert(_in: bytes, _out_format: str) -> bytes:
        """
        Re-encode image data into another format.

        :param _in: bytes - source image data in any format Pillow can open
        :param _out_format: str - target format name passed to Image.save, e.g. "png"
        :return: bytes - the re-encoded image
        """
        with Image.open(io.BytesIO(_in)) as img:
            out_io = io.BytesIO()
            img.save(out_io, _out_format)
        return out_io.getvalue()

    def gen_text_from_img(self, img_data: bytes):
        """
        Convert image data to text via the configured ImageToText instance.

        :param img_data: bytes - image data
        :return: whatever ImageToText.gen_text_from_img returns for the image
        """
        return self.image_to_text.gen_text_from_img(img_data)
| | | |
if __name__ == '__main__':
    import sys

    # Sample document used when no path is given; pass another .docx path as
    # the first command-line argument to split a different file.
    default_doc_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\ZL格式(公开).docx'
    doc_file = sys.argv[1] if len(sys.argv) > 1 else default_doc_file
    doc_split = DocSplit(doc_file)
    doc_split.split()
    # Print every extracted paragraph, headings prefixed with their number.
    print("\n".join(x.full_text for x in doc_split.paragraphs))