From 22f370322412074174cde20ecfd14ec03657ab63 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期一, 07 七月 2025 16:20:25 +0800 Subject: [PATCH] 生成数据库 --- knowledgebase/doc/docx_split.py | 260 +++++++++++++++++++++++++++++++++++++-------------- 1 files changed, 187 insertions(+), 73 deletions(-) diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py index 52df48f..4a97292 100644 --- a/knowledgebase/doc/docx_split.py +++ b/knowledgebase/doc/docx_split.py @@ -7,58 +7,37 @@ import docx import docx.table import json -from dataclasses import dataclass from PIL import Image import io import re +import typing from knowledgebase.doc.image_to_text import ImageToText - - -@dataclass -class ParagraphInfo: - """ - 娈佃惤淇℃伅 - :param text: str - 娈佃惤鏂囨湰 - :param level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 - :param title_no: str - 鏍囬缂栧彿锛屽1.1銆�1.1.1绛� - """ - text: str - level: int - title_no: str - - @property - def full_text(self): - """ - 鑾峰彇娈佃惤瀹屾暣鏂囨湰锛屽寘鍚爣棰樼紪鍙� - :return: str - 娈佃惤瀹屾暣鏂囨湰 - """ - return f"{self.title_no} {self.text}" - - def __init__(self, text: str, level: int): - """ - 娈佃惤淇℃伅 - :param text: str - 娈佃惤鏂囨湰 - :param level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 - """ - self.text = text - self.level = level - self.title_no = '' - +from knowledgebase.doc.models import ParagraphInfo +from knowledgebase.log import Log +from bs4 import BeautifulSoup class DocSplit: """ docx鏂囨。鎷嗗垎鍣紝鏍规嵁娈佃惤鎷嗗垎锛屽皢鍥剧墖鍜岃〃鏍艰浆鎹负json鏁版嵁銆� 1.灏佽娈佃惤淇℃伅 - 2.灏嗗浘鐗囧拰琛ㄦ牸杞崲涓簀son - 3.灏嗘钀芥寜鐓ф枃妗f爣棰樼骇鍒粍鍚堟垚鏍戝舰缁撴瀯 + 2.灏嗗浘鐗囪浆鎹负鑷劧璇█鎻忚堪 + 3.灏嗚〃鏍艰浆鎹负json鏍煎紡 + 4.灏嗘钀芥寜鐓ф枃妗f爣棰樼骇鍒粍鍚堟垚鏍戝舰缁撴瀯 """ - def __init__(self, doc_file): - self.doc_file = doc_file + def __init__(self, docx_file: str, docx_type: str): + """ + docx鏂囨。鎷嗗垎 + :param docx_file: 瑕佹媶鍒嗙殑docx鏂囦欢璺緞 + :param docx_type: 鏂囨。绫诲瀷 + """ + self.docx_file = docx_file + self.docx_type = docx_type self.image_to_text = ImageToText() - self.paragraphs:list[ParagraphInfo] = [] + self.paragraphs: list[ParagraphInfo] = [] + self.paragraph_tree: list[ParagraphInfo] = [] def table_to_json(self, table: docx.table.Table): """ @@ -67,8 +46,8 @@ :param table: docx.table.Table - 瑕佽浆鎹㈢殑琛ㄦ牸瀵硅薄 :return list - 琛ㄦ牸鏁版嵁锛屼互 JSON 鏍煎紡琛ㄧず """ - table_data = [] headers = [] + table_data = [headers] first_row = True row: docx.table._Row for row in table.rows: @@ -77,7 +56,7 @@ headers.append(cell.text) first_row = False continue - row_data = {} + row_data = [] row_idx = 0 for cell in row.cells: if cell.tables: @@ -92,7 +71,11 @@ else: # 鍗曞厓鏍兼枃鏈幏鍙� text = cell.text - row_data[headers[row_idx]] = text + if cell._element.xml.find("w:ins")!=-1: + soup = BeautifulSoup(cell._element.xml, "xml") + text = ''.join([x.get_text() for x in soup.find_all("w:t")]) + # row_data[headers[row_idx]] = text + row_data.append(text) row_idx += 1 table_data.append(row_data) @@ -104,7 +87,8 @@ :return: list[ParagraphInfo] - 娈佃惤鍒楄〃 """ - document = docx.Document(self.doc_file) + Log.info(f"寮�濮嬫媶鍒嗘枃妗o細{self.docx_file}") + document = docx.Document(self.docx_file) table_cnt = 0 paragraph_cnt = 0 @@ -112,27 +96,61 @@ if element.tag.endswith('p'): # 娈佃惤 # 鑾峰彇鏍囬澶氱骇缂栧彿 paragraph = document.paragraphs[paragraph_cnt] - paragraph_text = paragraph.text - if paragraph_text: - self.paragraphs.append(ParagraphInfo(paragraph_text, self.get_header_level(paragraph))) + p_text = paragraph.text + is_toc = paragraph.style.name.startswith('TOC') and '鐩�' in p_text and '褰�' in p_text + try: + num = element.pPr.numPr.numId.val + level = element.pPr.numPr.ilvl.val + except: + num = 0 + level = 0 + if p_text: + title_level = self.get_title_level(paragraph) + self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level, is_toc)) # 妫�鏌ユ槸鍚︽槸鍥剧墖锛屽鏋滄槸鍥剧墖鍒欒浆鎹负鏂囨湰 - img_data = self.get_image_blob(paragraph) + img_data = self.get_image_text(paragraph) if img_data: text = self.gen_text_from_img(img_data) - self.paragraphs.append(ParagraphInfo(text, 0)) + text = f"```鍥剧墖锛堜互涓嬪唴瀹逛负鍥剧墖鎻忚堪锛塡n{text}\n```" + self.paragraphs.append(ParagraphInfo(text, 0, num, level)) paragraph_cnt += 1 elif element.tag.endswith('tbl'): # 琛ㄦ牸 table = document.tables[table_cnt] # 鑾峰彇褰撳墠琛ㄦ牸瀵硅薄 table_cnt += 1 table_data = self.table_to_json(table) - self.paragraphs.append(ParagraphInfo(json.dumps(table_data, indent=4, ensure_ascii=False), 0)) + self.paragraphs.append( + ParagraphInfo("```json\n" + json.dumps(table_data, indent=4, ensure_ascii=False) + "\n```", 0)) else: continue + # 鍘婚櫎鐩綍 + self.remove_toc(self.paragraphs) # 鐢熸垚鏍囬缂栧彿 - self.gen_title_no(self.paragraphs) + Log.info(f"寮�濮嬬敓鎴愭爣棰樼紪鍙峰拰鍒楄〃缂栧彿") + self.gen_title_num(self.paragraphs) + # 鐢熸垚鏍戝舰缁撴瀯 + Log.info(f"寮�濮嬬敓鎴愭爲褰㈢粨鏋�") + self.gen_paragraph_tree(self.paragraphs) @staticmethod - def get_image_blob(paragraph): + def remove_toc(paragraphs: [ParagraphInfo]): + rm_list = [] + for p in paragraphs: + if p.is_toc: + rm_list.append(p) + elif rm_list and p.title_level == 1: + break + elif rm_list: + rm_list.append(p) + for p in rm_list: + paragraphs.remove(p) + + @staticmethod + def get_image_text(paragraph): + """ + 鑾峰彇娈佃惤涓殑鍥剧墖鎻忚堪 + :param paragraph: 娈佃惤 + :return: 鍥剧墖鍐呭鎻忚堪淇℃伅 + """ # 閬嶅巻娈佃惤涓殑鎵�鏈塕un瀵硅薄锛堝浘鐗囬�氬父鍦ㄥ崟鐙殑Run涓級 for run in paragraph.runs: xml = run._element.xml @@ -144,7 +162,7 @@ if r_id: # 鑾峰彇鍥剧墖淇℃伅 image_part = paragraph.part.rels[r_id].target_part - return DocSplit.image_convert(image_part.blob, "png") + return DocSplit.image_convert(image_part.blob) if xml.find('wp:inline') != -1 or xml.find('wp:anchor') != -1: # 浣跨敤姝e垯琛ㄨ揪寮忔煡鎵緍:embed灞炴�� match = re.search(r'r:embed="([^"]+)"', xml) @@ -153,36 +171,88 @@ if r_id: # 鑾峰彇鍥剧墖淇℃伅 image_part = paragraph.part.rels[r_id].target_part - return DocSplit.image_convert(image_part.blob, "png") + return DocSplit.image_convert(image_part.blob) return None @staticmethod - def gen_title_no(paragraphs: list[ParagraphInfo]): - title_levels = [1, 1, 1, 1, 1, 1, 1, 1, 1] - for i in range(len(paragraphs)): - if paragraphs[i].level > 0: - for j in range(paragraphs[i].level - 1): - title_levels[j] = 1 - paragraphs[i].title_no = '.'.join([str(x) for x in title_levels[0:paragraphs[i].level]]) - title_levels[paragraphs[i].level - 1] += 1 + def gen_title_num(paragraphs: list[ParagraphInfo]): + """ + 鐢熸垚鏍囬缂栧彿鍜屽垪琛ㄧ紪鍙� + 鏍囬绾у埆浠�1-9锛�0琛ㄧず姝f枃 + + :param paragraphs: list[ParagraphInfo] - 娈佃惤鍒楄〃 + :return: None + """ + MAX_TITLE_LEVEL = 9 # 瀹氫箟涓哄父閲忥紝渚夸簬缁熶竴绠$悊鍜屾墿灞� + title_levels = [0] * MAX_TITLE_LEVEL # 鍒濆鍖栦负鍏�0 + + list_counters = [0] * MAX_TITLE_LEVEL + + def format_number(level: int, value: int) -> str: + # 浣跨敤鏄犲皠鏂瑰紡绠�鍖栭�昏緫 + if level < 0 or level > 4: + return str(value) + formats = { + 0: lambda v: f"({v})", + 1: lambda v: f"{v})", + 2: lambda v: f"({chr(96 + v)})", + 3: lambda v: f"{chr(96 + v)})", + 4: lambda v: chr(96 + v), + } + return formats[level](value) + + for p in paragraphs: + if p.title_level > 0: + title_levels[p.title_level - 1] += 1 + for i in range(p.title_level, MAX_TITLE_LEVEL): + title_levels[i] = 0 + p.title_num = '.'.join([str(x) for x in title_levels[:p.title_level]]) + list_counters = [0] * MAX_TITLE_LEVEL else: - title_levels = [1, 1, 1, 1, 1, 1, 1, 1, 1] + # 澶勭悊鍒楄〃缂栧彿 + if p.num > 0: + level = p.num_level + + # 鏍¢獙 level 鍚堟硶鎬� + if level < 0 or level >= MAX_TITLE_LEVEL: + continue + list_counters[level] += 1 + + # 閲嶇疆褰撳墠灞傜骇涔嬪悗鐨勮鏁板櫒 + for l in range(level + 1, MAX_TITLE_LEVEL): + list_counters[l] = 0 + + # 褰撳墠灞傜骇閫掑骞惰祴鍊� + p.title_num = format_number(level, list_counters[level]) + else: + list_counters = [0] * MAX_TITLE_LEVEL @staticmethod - def get_header_level(paragraph) -> int: - if paragraph.style.base_style: - style = paragraph.style.base_style - else: - style = paragraph.style + def get_title_level(paragraph) -> int: + """ + 鑾峰彇娈佃惤鏍囬绾у埆 + + :param paragraph: docx.paragraph.Paragraph - 瑕佽幏鍙栨爣棰樼骇鍒殑娈佃惤瀵硅薄 + :return: int - 鏍囬绾у埆锛�0 琛ㄧず闈炴爣棰� + """ + style = paragraph.style if style and style.name.startswith('Heading'): # 鑾峰彇鏍囬绾у埆 level = int(style.name.split(' ')[1]) + return level + elif style.base_style and style.base_style.name.startswith('Heading'): + level = int(style.base_style.name.split(' ')[1]) return level else: return 0 @staticmethod - def image_convert(_in: bytes, _out_format: str) -> bytes: + def image_convert(_in: bytes) -> bytes: + """ + 灏嗗浘鐗囪浆鎹负png鏍煎紡鐨刡ytes + :param _in: bytes - 鍥剧墖鏁版嵁 + :return: bytes - png鏍煎紡鐨勫浘鐗囨暟鎹� + """ in_io = io.BytesIO() in_io.write(_in) img = Image.open(in_io, "r") @@ -191,11 +261,55 @@ out_io.seek(0) return out_io.read() - def gen_text_from_img(self, img_data:bytes): + def gen_text_from_img(self, img_data: bytes): + """ + 鍒╃敤LLM灏嗗浘鐗囪浆涓烘枃鏈� + :param img_data: bytes - 鍥剧墖鏁版嵁 + :return: str - 鏂囨湰 + """ + return '' return self.image_to_text.gen_text_from_img(img_data) -if __name__ == '__main__': - doc_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\ZL鏍煎紡(鍏紑).docx' - doc_split = DocSplit(doc_file) - doc_split.split() - print("\n".join([x.full_text for x in doc_split.paragraphs])) + def gen_paragraph_tree(self, paragraphs: typing.List[ParagraphInfo]): + """ + 鐢熸垚娈佃惤鏍戠粨鏋勶紝鏍规嵁title_level鍒掑垎娈佃惤鏍� + + :param paragraphs: list[ParagraphInfo] - 娈佃惤鍒楄〃锛堜細琚師鍦颁慨鏀癸級 + """ + if not paragraphs: + return + + stack = [] + result = [] + _paragraphs = [] + + def merge_paragraph_text(info: ParagraphInfo): + text_nodes = [child for child in info.children if child.title_level == 0] + info.text += '\n' + '\n'.join([child.full_text for child in text_nodes]) + info.children = [child for child in info.children if child.title_level > 0] + + for p in paragraphs: + if p.title_level == 1: + result.append(p) + # 娓呯悊鏍堥《姣斿綋鍓嶇骇鍒綆鎴栫浉绛夌殑鑺傜偣 + while stack and p.title_level != 0 and stack[-1].title_level >= p.title_level: + _p = stack.pop() + merge_paragraph_text(_p) + + if p.title_level > 0: + if len(stack): + stack[-1].children.append(p) + stack.append(p) + _paragraphs.append(p) + elif len(stack): + stack[-1].children.append(p) + else: + # 闈炴爣棰樻钀界洿鎺ュ姞鍏ョ粨鏋� + result.append(p) + + while stack: + merge_paragraph_text(stack.pop()) + + # 鏇挎崲鍘熷鍒楄〃鍐呭锛岄伩鍏嶅娆� remove 鎿嶄綔 + self.paragraphs[:] = _paragraphs + self.paragraph_tree = result -- Gitblit v1.9.1