From 22f370322412074174cde20ecfd14ec03657ab63 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期一, 07 七月 2025 16:20:25 +0800 Subject: [PATCH] 生成数据库 --- knowledgebase/doc/docx_split.py | 46 +++++++++++++++++++++++++--------------------- 1 files changed, 25 insertions(+), 21 deletions(-) diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py index 22d189a..4a97292 100644 --- a/knowledgebase/doc/docx_split.py +++ b/knowledgebase/doc/docx_split.py @@ -15,7 +15,7 @@ from knowledgebase.doc.image_to_text import ImageToText from knowledgebase.doc.models import ParagraphInfo from knowledgebase.log import Log - +from bs4 import BeautifulSoup class DocSplit: """ @@ -27,12 +27,14 @@ """ - def __init__(self, docx_file: str): + def __init__(self, docx_file: str, docx_type: str): """ docx鏂囨。鎷嗗垎 :param docx_file: 瑕佹媶鍒嗙殑docx鏂囦欢璺緞 + :param docx_type: 鏂囨。绫诲瀷 """ self.docx_file = docx_file + self.docx_type = docx_type self.image_to_text = ImageToText() self.paragraphs: list[ParagraphInfo] = [] self.paragraph_tree: list[ParagraphInfo] = [] @@ -69,6 +71,9 @@ else: # 鍗曞厓鏍兼枃鏈幏鍙� text = cell.text + if cell._element.xml.find("w:ins")!=-1: + soup = BeautifulSoup(cell._element.xml, "xml") + text = ''.join([x.get_text() for x in soup.find_all("w:t")]) # row_data[headers[row_idx]] = text row_data.append(text) row_idx += 1 @@ -92,6 +97,7 @@ # 鑾峰彇鏍囬澶氱骇缂栧彿 paragraph = document.paragraphs[paragraph_cnt] p_text = paragraph.text + is_toc = paragraph.style.name.startswith('TOC') and '鐩�' in p_text and '褰�' in p_text try: num = element.pPr.numPr.numId.val level = element.pPr.numPr.ilvl.val @@ -100,7 +106,7 @@ level = 0 if p_text: title_level = self.get_title_level(paragraph) - self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level)) + self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level, is_toc)) # 妫�鏌ユ槸鍚︽槸鍥剧墖锛屽鏋滄槸鍥剧墖鍒欒浆鎹负鏂囨湰 img_data = self.get_image_text(paragraph) if img_data: @@ -116,12 +122,27 @@ ParagraphInfo("```json\n" + json.dumps(table_data, indent=4, ensure_ascii=False) + "\n```", 0)) else: continue + # 鍘婚櫎鐩綍 + self.remove_toc(self.paragraphs) # 鐢熸垚鏍囬缂栧彿 Log.info(f"寮�濮嬬敓鎴愭爣棰樼紪鍙峰拰鍒楄〃缂栧彿") self.gen_title_num(self.paragraphs) # 鐢熸垚鏍戝舰缁撴瀯 Log.info(f"寮�濮嬬敓鎴愭爲褰㈢粨鏋�") self.gen_paragraph_tree(self.paragraphs) + + @staticmethod + def remove_toc(paragraphs: [ParagraphInfo]): + rm_list = [] + for p in paragraphs: + if p.is_toc: + rm_list.append(p) + elif rm_list and p.title_level == 1: + break + elif rm_list: + rm_list.append(p) + for p in rm_list: + paragraphs.remove(p) @staticmethod def get_image_text(paragraph): @@ -246,6 +267,7 @@ :param img_data: bytes - 鍥剧墖鏁版嵁 :return: str - 鏂囨湰 """ + return '' return self.image_to_text.gen_text_from_img(img_data) def gen_paragraph_tree(self, paragraphs: typing.List[ParagraphInfo]): @@ -291,21 +313,3 @@ # 鏇挎崲鍘熷鍒楄〃鍐呭锛岄伩鍏嶅娆� remove 鎿嶄綔 self.paragraphs[:] = _paragraphs self.paragraph_tree = result - - -if __name__ == '__main__': - docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx' - # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx' - doc_split = DocSplit(docx_file) - doc_split.split() - # er = EntityRecognition() - # db = Neo4jHelper() - # for trunk in doc_split.trunks: - # print('娈佃惤鏂囨湰锛�') - # print(trunk) - # print('瀹炰綋璇嶏細') - # print(er.run(trunk)) - # entities = er.run(trunk) - # db.create_page_node() - print("\n".join([x.full_text_with_children for x in doc_split.paragraphs])) - print() -- Gitblit v1.9.1