From 22f370322412074174cde20ecfd14ec03657ab63 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期一, 07 七月 2025 16:20:25 +0800
Subject: [PATCH] 生成数据库

---
 knowledgebase/doc/docx_split.py | 24 ++++++++++++++++++++++--
 1 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py
index 4270b05..4a97292 100644
--- a/knowledgebase/doc/docx_split.py
+++ b/knowledgebase/doc/docx_split.py
@@ -15,7 +15,7 @@
 from knowledgebase.doc.image_to_text import ImageToText
 from knowledgebase.doc.models import ParagraphInfo
 from knowledgebase.log import Log
-
+from bs4 import BeautifulSoup
 
 class DocSplit:
     """
@@ -71,6 +71,9 @@
                 else:
                     # 单元格文本获取
                     text = cell.text
+                    if cell._element.xml.find("w:ins")!=-1:
+                        soup = BeautifulSoup(cell._element.xml, "xml")
+                        text = ''.join([x.get_text() for x in soup.find_all("w:t")])
                 # row_data[headers[row_idx]] = text
                 row_data.append(text)
                 row_idx += 1
@@ -94,6 +97,7 @@
                 # 获取标题多级编号
                 paragraph = document.paragraphs[paragraph_cnt]
                 p_text = paragraph.text
+                is_toc = paragraph.style.name.startswith('TOC') and '目' in p_text and '录' in p_text
                 try:
                     num = element.pPr.numPr.numId.val
                     level = element.pPr.numPr.ilvl.val
@@ -102,7 +106,7 @@
                     level = 0
                 if p_text:
                     title_level = self.get_title_level(paragraph)
-                    self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level))
+                    self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level, is_toc))
                 # 检查是否是图片，如果是图片则转换为文本
                 img_data = self.get_image_text(paragraph)
                 if img_data:
@@ -118,12 +122,27 @@
                     ParagraphInfo("```json\n" + json.dumps(table_data, indent=4, ensure_ascii=False) + "\n```", 0))
             else:
                 continue
+        # 去除目录
+        self.remove_toc(self.paragraphs)
         # 生成标题编号
         Log.info(f"开始生成标题编号和列表编号")
         self.gen_title_num(self.paragraphs)
         # 生成树形结构
         Log.info(f"开始生成树形结构")
         self.gen_paragraph_tree(self.paragraphs)
+
+    @staticmethod
+    def remove_toc(paragraphs: [ParagraphInfo]):
+        rm_list = []
+        for p in paragraphs:
+            if p.is_toc:
+                rm_list.append(p)
+            elif rm_list and p.title_level == 1:
+                break
+            elif rm_list:
+                rm_list.append(p)
+        for p in rm_list:
+            paragraphs.remove(p)
 
     @staticmethod
     def get_image_text(paragraph):
@@ -248,6 +267,7 @@
         :param img_data: bytes - 图片数据
         :return: str - 文本
         """
+        return ''
         return self.image_to_text.gen_text_from_img(img_data)
 
     def gen_paragraph_tree(self, paragraphs: typing.List[ParagraphInfo]):
-- 
Gitblit v1.9.1