From b2bef1e9348fef4010ed713497fdea85fc751c66 Mon Sep 17 00:00:00 2001
From: YM <479443481@qq.com>
Date: 星期四, 08 五月 2025 16:55:31 +0800
Subject: [PATCH] 文档解析,获取章节结构,并获取图片转换为base64

---
 .gitignore                   |    3 
 static/images/test.png       |    0 
 vision_test.py               |  174 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 static/doc/ZL格式(公开).docx |    0 
 4 files changed, 176 insertions(+), 1 deletions(-)

diff --git a/.gitignore b/.gitignore
index ddcc4f8..7bc84b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@
 /datas
 /.conda
 /docs
-/out*
\ No newline at end of file
+/out*
+__pycache__
\ No newline at end of file
diff --git "a/static/doc/ZL\346\240\274\345\274\217\050\345\205\254\345\274\200\051.docx" "b/static/doc/ZL\346\240\274\345\274\217\050\345\205\254\345\274\200\051.docx"
new file mode 100644
index 0000000..754aaee
--- /dev/null
+++ "b/static/doc/ZL\346\240\274\345\274\217\050\345\205\254\345\274\200\051.docx"
Binary files differ
diff --git a/static/images/test.png b/static/images/test.png
new file mode 100644
index 0000000..aca2fdd
--- /dev/null
+++ b/static/images/test.png
Binary files differ
diff --git a/vision_test.py b/vision_test.py
new file mode 100644
index 0000000..3e41c4c
--- /dev/null
+++ b/vision_test.py
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+#
+# @author: lyg
+# @date: 2025-5-7
+# @version: 1
+# @description: Visual recognition of document content
+
+import base64
+import json
+import mimetypes
+import os
+import re
+from io import BytesIO
+
+from docx import Document
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
+from langchain_openai.chat_models import ChatOpenAI
+from PIL import Image
+
+# Environment variable that supplies the DashScope API key.
+API_KEY_ENV_VAR = "DASHSCOPE_API_KEY"
+# Document parsed by loadDoc() when no path is given.
+DEFAULT_DOC_PATH = './static/doc/ZL格式(公开).docx'
+# Matches heading style names such as "Heading 2" and captures the level.
+HEADING_STYLE_RE = re.compile(r'^Heading\s+(\d+)$')
+
+
+class VisionTest:
+    """Ask a chat model about an image and extract the outline of a .docx file."""
+
+    def __init__(self, file, api_key=None):
+        """Load the image at *file* and prepare the model and prompt.
+
+        Args:
+            file: Path of the image attached to every question.
+            api_key: DashScope API key. Falls back to the DASHSCOPE_API_KEY
+                environment variable so that no secret lives in the source.
+
+        Raises:
+            ValueError: If no API key is passed or configured.
+        """
+        api_key = api_key or os.environ.get(API_KEY_ENV_VAR)
+        if not api_key:
+            raise ValueError(f"pass api_key or set {API_KEY_ENV_VAR}")
+        # NOTE(review): "qwen2.5-72b-instruct" looks like a text-only model;
+        # confirm that it accepts image input.
+        self.llm = ChatOpenAI(
+            temperature=0, model="qwen2.5-72b-instruct",
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+            api_key=api_key)
+        with open(file, 'rb') as image_file:
+            self.image = base64.b64encode(image_file.read()).decode()
+        # Label the data URL with the real image type instead of always JPEG.
+        self.mime_type = mimetypes.guess_type(file)[0] or "image/jpeg"
+        # A HumanMessage instance is sent verbatim, so "{msg}" inside one is
+        # never filled in; a message template is formatted on every call.
+        self.prompt = ChatPromptTemplate.from_messages([
+            SystemMessage("你是一个资深软件工程师,请分析图片回答问题。"),
+            HumanMessagePromptTemplate.from_template([
+                {"type": "text", "text": "{msg}"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "data:{mime_type};base64,{image}"},
+                },
+            ]),
+        ])
+
+    def run(self, msg):
+        """Send *msg* together with the image, print and return the reply."""
+        chain = self.prompt | self.llm
+        resp = chain.invoke(
+            {"msg": msg, "mime_type": self.mime_type, "image": self.image})
+        print(resp.content)
+        return resp.content
+
+    def has_image(self, paragraph):
+        """Return True if the paragraph XML holds an embedded object or drawing."""
+        xml = paragraph._element.xml
+        return 'w:object' in xml or 'w:drawing' in xml
+
+    def convert_blob_to_png_base64(self, image_blob):
+        """Re-encode raw image bytes as PNG and return them base64-encoded.
+
+        Returns None (after printing the error) if the image cannot be
+        decoded or saved as PNG.
+        """
+        try:
+            buffer = BytesIO()
+            with Image.open(BytesIO(image_blob)) as image:
+                image.save(buffer, format="PNG")
+        except Exception as e:
+            # Deliberately best effort: one unreadable image must not abort
+            # the parsing of the whole document.
+            print(f"Error: {e}")
+            return None
+        return base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+    def get_image_blob(self, paragraph):
+        """Return the bytes of the first image referenced in the paragraph.
+
+        Runs are searched for a VML image (v:imagedata, linked by r:id) and
+        then for a DrawingML image (wp:inline / wp:anchor, linked by r:embed).
+        Returns None if no run references an image.
+        """
+        for run in paragraph.runs:
+            xml = run._element.xml
+            match = None
+            if 'v:imagedata' in xml:
+                match = re.search(r'r:id="([^"]+)"', xml)
+            if match is None and ('wp:inline' in xml or 'wp:anchor' in xml):
+                match = re.search(r'r:embed="([^"]+)"', xml)
+            if match is not None:
+                # The relationship id points at the part that stores the image.
+                return paragraph.part.rels[match.group(1)].target_part.blob
+        return None
+
+    def _heading_level(self, style):
+        """Return the heading level of *style* or of a style it inherits from.
+
+        Returns None if neither the style nor any base style is a heading.
+        """
+        while style is not None:
+            match = HEADING_STYLE_RE.match(style.name or '')
+            if match:
+                return int(match.group(1))
+            style = style.base_style
+        return None
+
+    def _build_outline(self, paragraphs):
+        """Group paragraphs under the heading that precedes them.
+
+        Headings become {"level", "text", "child"} entries; text and images
+        are appended to the child list of the latest heading, or to the top
+        level if no heading has been seen yet.
+        """
+        titles = []
+        current = None  # latest heading; tracked explicitly, never by index
+        for paragraph in paragraphs:
+            if paragraph.text != "":
+                level = self._heading_level(paragraph.style)
+                if level is not None:
+                    current = {"level": level, "text": paragraph.text, "child": []}
+                    titles.append(current)
+                    continue
+                obj = {"text": paragraph.text}
+            elif self.has_image(paragraph):
+                # No text: the paragraph may carry an image.
+                blob = self.get_image_blob(paragraph)
+                if blob is None:
+                    continue
+                img_base64 = self.convert_blob_to_png_base64(blob)
+                if img_base64 is None:
+                    continue
+                obj = {"imgBase64": img_base64}
+            else:
+                # Empty paragraph; table detection can be added here.
+                continue
+            (titles if current is None else current["child"]).append(obj)
+        return titles
+
+    def loadDoc(self, doc_path=DEFAULT_DOC_PATH):
+        """Parse the .docx at *doc_path*, print and return its outline."""
+        titles = self._build_outline(Document(doc_path).paragraphs)
+        print(titles)
+        return titles
+
+
+if __name__ == '__main__':
+    # The test image is stored under static/images.
+    vision = VisionTest("./static/images/test.png")
+    # To query the model about the image: vision.run("<question>")
+    vision.loadDoc()
-- 
Gitblit v1.9.1