From b2bef1e9348fef4010ed713497fdea85fc751c66 Mon Sep 17 00:00:00 2001
From: YM <479443481@qq.com>
Date: 星期四, 08 五月 2025 16:55:31 +0800
Subject: [PATCH] 文档解析,获取章节结构,并获取图片转换为base64

---
 .gitignore               |    3 
 static/images/test.png   |    0 
 vision_test.py           |  157 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 static/doc/ZL格式(公开).docx |    0 
 4 files changed, 159 insertions(+), 1 deletions(-)

diff --git a/.gitignore b/.gitignore
index ddcc4f8..7bc84b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@
 /datas
 /.conda
 /docs
-/out*
\ No newline at end of file
+/out*
+__pycache__
\ No newline at end of file
diff --git "a/static/doc/ZL\346\240\274\345\274\217\050\345\205\254\345\274\200\051.docx" "b/static/doc/ZL\346\240\274\345\274\217\050\345\205\254\345\274\200\051.docx"
new file mode 100644
index 0000000..754aaee
--- /dev/null
+++ "b/static/doc/ZL\346\240\274\345\274\217\050\345\205\254\345\274\200\051.docx"
Binary files differ
diff --git a/static/images/test.png b/static/images/test.png
new file mode 100644
index 0000000..aca2fdd
--- /dev/null
+++ b/static/images/test.png
Binary files differ
diff --git a/vision_test.py b/vision_test.py
new file mode 100644
index 0000000..3e41c4c
--- /dev/null
+++ b/vision_test.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+#
+# @author: lyg
+# @date: 2025-5-7
+# @version: 1
+# @description:视觉识别文档内容
+
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.output_parsers import JsonOutputParser
+from docx import Document
+from PIL import Image
+from io import BytesIO
+import re
+import json
+import base64
+
+
+class VisionTest:
+    """Vision-LLM smoke test plus a .docx outline/image extractor.
+
+    Construction builds a ChatOpenAI client for the DashScope
+    OpenAI-compatible endpoint and a prompt that embeds ``file`` as a base64
+    data URL. ``loadDoc`` does not use the LLM: it walks a Word document and
+    groups its text and pictures under the headings they follow.
+    """
+
+    # Default document parsed by loadDoc when no path is given.
+    DEFAULT_DOC = './static/doc/ZL格式(公开).docx'
+    # (run-XML markers, relationship-id pattern): VML pictures reference the
+    # image part through r:id, DrawingML pictures through r:embed.
+    _IMAGE_REFS = (
+        (('v:imagedata',), re.compile(r'r:id="([^"]+)"')),
+        (('wp:inline', 'wp:anchor'), re.compile(r'r:embed="([^"]+)"')),
+    )
+
+    def __init__(self, file):
+        """Create the LLM client and the image prompt.
+
+        :param file: path of the image sent along with every question.
+        :raises OSError: if ``file`` cannot be read.
+        """
+        import mimetypes
+        import os
+
+        # The key comes from the environment (DASHSCOPE_API_KEY); credentials
+        # must never be committed to source control.
+        self.llm = ChatOpenAI(temperature=0, model="qwen2.5-72b-instruct",
+                              base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+                              api_key=os.environ.get("DASHSCOPE_API_KEY"))
+        with open(file, 'rb') as fh:
+            image = base64.b64encode(fh.read()).decode()
+        # Declare the real media type; JPEG is the fallback when it is unknown.
+        mime = mimetypes.guess_type(str(file))[0] or "image/jpeg"
+        self.prompt = ChatPromptTemplate.from_messages([
+            SystemMessage("你是一个资深软件工程师，请分析图片回答问题。"),
+            # A (role, template) pair is formatted on invoke; a ready-made
+            # HumanMessage is passed through verbatim, so "{msg}" inside one
+            # is never substituted. Base64 has no braces: the URL stays literal.
+            ("human", [
+                {"type": "text", "text": "{msg}"},
+                {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{image}"}},
+            ]),
+        ])
+
+    def run(self, msg):
+        """Ask ``msg`` about the image; print the answer and return it."""
+        chain = self.prompt | self.llm
+        resp = chain.invoke({"msg": msg})
+        print(resp.content)
+        return resp.content
+
+    def has_image(self, paragraph):
+        """Return True if the paragraph XML holds an embedded object or drawing."""
+        xml = paragraph._element.xml
+        return 'w:object' in xml or 'w:drawing' in xml
+
+    def convert_blob_to_png_base64(self, image_blob):
+        """Re-encode raw image bytes as PNG and return them base64-encoded.
+
+        :return: base64 text, or None when conversion fails (the error is
+            printed so that the caller can simply skip that picture).
+        """
+        try:
+            buffer = BytesIO()
+            Image.open(BytesIO(image_blob)).save(buffer, format="PNG")
+            return base64.b64encode(buffer.getvalue()).decode('utf-8')
+        except Exception as e:  # best effort: one bad picture must not abort
+            print(f"Error: {e}")
+            return None
+
+    def get_image_blob(self, paragraph):
+        """Return the bytes of the first picture referenced by the paragraph.
+
+        Runs are scanned in order; within a run a VML reference is tried
+        before a DrawingML one. Returns None when no reference is found.
+        """
+        for run in paragraph.runs:
+            xml = run._element.xml
+            for markers, pattern in self._IMAGE_REFS:
+                if not any(marker in xml for marker in markers):
+                    continue
+                match = pattern.search(xml)
+                if match:
+                    # The relationship id resolves to the image part.
+                    return paragraph.part.rels[match.group(1)].target_part.blob
+        return None
+
+    def loadDoc(self, doc_path=None):
+        """Parse a .docx into a list of headings with their nested content.
+
+        Each heading becomes ``{"level", "text", "child"}``; following text
+        (``{"text"}``) and pictures (``{"imgBase64"}``) go into the latest
+        heading's ``child`` list, or to the top level before any heading.
+
+        :param doc_path: document to read; defaults to ``DEFAULT_DOC``.
+        :return: the structure, which is also printed.
+        """
+        doc = Document(doc_path or self.DEFAULT_DOC)
+        titles = []
+
+        def attach(obj):
+            # Content seen before the first heading has no parent, so it stays
+            # at the top level; routing every item through this check avoids
+            # IndexError/KeyError on titles[-1] for leading text or pictures.
+            if titles and "child" in titles[-1]:
+                titles[-1]["child"].append(obj)
+            else:
+                titles.append(obj)
+
+        for paragraph in doc.paragraphs:
+            if paragraph.text != "":
+                base_style = paragraph.style.base_style
+                # NOTE(review): only styles *derived from* a Heading style
+                # count as headings here; confirm that paragraphs styled
+                # "Heading N" directly are meant to be treated as plain text.
+                if base_style is not None and base_style.name.startswith('Heading'):
+                    # The trailing number of the style name is the level.
+                    level = int(base_style.name.split(' ')[-1])
+                    titles.append({"level": level, "text": paragraph.text, "child": []})
+                else:
+                    attach({"text": paragraph.text})
+            elif self.has_image(paragraph):
+                # Empty text may mean a picture (tables are not handled yet).
+                img = self.get_image_blob(paragraph)
+                if img is not None:
+                    imgBase64 = self.convert_blob_to_png_base64(img)
+                    if imgBase64 is not None:
+                        attach({"imgBase64": imgBase64})
+        print(titles)
+        return titles
+
+if __name__ == '__main__':
+    # Image is committed under ./static/images; vision.run("<question>") queries the LLM.
+    vision = VisionTest("./static/images/test.png")
+    vision.loadDoc()

--
Gitblit v1.9.1