From 494637879bc8f5dd9c3d43481927b4a0c07e2f34 Mon Sep 17 00:00:00 2001
From: lyg <1543117173@qq.com>
Date: 星期四, 08 五月 2025 19:02:18 +0800
Subject: [PATCH] docx文档切割,表格转json,图片内容识别为文本。

---
 vision_test.py |  139 ++++------------------------------------------
 1 files changed, 12 insertions(+), 127 deletions(-)

diff --git a/vision_test.py b/vision_test.py
index 3e41c4c..4b23588 100644
--- a/vision_test.py
+++ b/vision_test.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-#
+# 
 # @author: lyg
 # @date: 2025-5-7
 # @version: 1
@@ -7,25 +7,24 @@
 
 from langchain_openai.chat_models import ChatOpenAI
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
-from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.messages import HumanMessage,SystemMessage
 from langchain_core.output_parsers import JsonOutputParser
-from docx import Document
-from PIL import Image
-from io import BytesIO
-import re
 import json
 import base64
 
 
 class VisionTest:
-    def __init__(self, file):
-        self.llm = ChatOpenAI(temperature=0, model="qwen2.5-72b-instruct",
-                              base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", api_key="sk-15ecf7e273ad4b729c7f7f42b542749e")
+    def __init__(self,file):
+        self.llm = ChatOpenAI(temperature=0,
+                              model="qwen2.5-72b-instruct",
+                              base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+                              api_key="sk-15ecf7e273ad4b729c7f7f42b542749e")
+
         image = base64.b64encode(open(file, 'rb').read()).decode()
         self.prompt = ChatPromptTemplate.from_messages([
             SystemMessage("浣犳槸涓�涓祫娣辫蒋浠跺伐绋嬪笀锛岃鍒嗘瀽鍥剧墖鍥炵瓟闂銆�"),
             HumanMessage(content=[
-                {"type": "text", "text": "{msg}"},
+                {"type": "text", "text": "describe the weather in this image"},
                 {
                     "type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{image}"},
@@ -33,125 +32,11 @@
             ])
         ])
 
-    def run(self, msg):
+    def run(self,msg):
         chain = self.prompt | self.llm
         resp = chain.invoke({"msg": msg})
         print(resp.content)
 
-    # def get_document_chapters(doc_path):
-    #     doc = Document(doc_path)
-    #     chapters = []
-    #     current_chapter = None
-        
-    #     for para in doc.paragraphs:
-    #         if para.style.name.startswith('Heading'):  # 检查是否为标题样式
-    #             level = int(para.style.name.replace('Heading', ''))  # 获取标题级别
-    #             current_chapter = {'level': level, 'title': para.text, 'content': []}
-    #             chapters.append(current_chapter)
-    #         elif current_chapter is not None:
-    #             current_chapter['content'].append(para.text)  # 添加内容到当前章节
-        
-    #     return chapters
-
-    def has_image(self, paragraph):
-        # 通过检查XML中的嵌入式对象来判断是否有图片
-        xml = paragraph._element.xml
-        return 'w:object' in xml or 'w:drawing' in xml
-    
-    def convert_blob_to_png_base64(self, image_blob):
-        try:
-            # 打开图片
-            image = Image.open(BytesIO(image_blob))
-            # 创建内存缓冲区
-            buffer = BytesIO()
-            # 保存为PNG格式
-            image.save(buffer, format="PNG")
-            # 获取PNG格式的二进制数据
-            png_data = buffer.getvalue()
-            # 杞崲涓築ase64缂栫爜
-            base64_data = base64.b64encode(png_data).decode('utf-8')
-            return base64_data
-        except Exception as e:
-            print(f"Error: {e}")
-            return None
-        
-    def get_image_blob(self, paragraph):
-        # 遍历段落中的所有Run对象（图片通常在单独的Run中）
-        for run in paragraph.runs:
-            xml = run._element.xml
-            if xml.find('v:imagedata') != -1:
-                # 使用正则表达式查找r:id属性
-                match = re.search(r'r:id="([^"]+)"', xml)
-                if match:
-                    r_id = match.group(1)
-                    if r_id:
-                        # 获取图片信息
-                        image_part = paragraph.part.rels[r_id].target_part
-                        return image_part.blob
-            if xml.find('wp:inline') != -1 or xml.find('wp:anchor') != -1:
-                # 使用正则表达式查找r:embed属性
-                match = re.search(r'r:embed="([^"]+)"', xml)
-                if match:
-                    r_id = match.group(1)
-                    if r_id:
-                        # 获取图片信息
-                        image_part = paragraph.part.rels[r_id].target_part
-                        return image_part.blob
-        return None
-
-    def loadDoc(self):
-        doc = Document('./static/doc/ZL鏍煎紡(鍏紑).docx')
-        # 按照标题获取段落的层级结构
-        titles = []
-        for paragraph in doc.paragraphs:
-            if paragraph.text != "":
-                # 文字不为空
-                if paragraph.style.base_style is not None:
-                    # 有base_style
-                    if paragraph.style.base_style.name.startswith('Heading'):
-                        # 鏄爣棰�
-                        level = int(paragraph.style.base_style.name.split(' ')[-1])
-                        obj = {}
-                        obj["level"] = level
-                        obj["text"] = paragraph.text
-                        obj["child"] = []
-                        titles.append(obj)
-                    else:
-                        length = len(titles)
-                        if "child" in titles[length -1]:
-                            obj = {}
-                            obj["text"] = paragraph.text
-                            titles[length -1]['child'].append(obj)
-                else:
-                    # 没有base_style
-                    length = len(titles)
-                    obj = {}
-                    obj["text"] = paragraph.text
-                    if length > 0 and "child" in titles[length -1]:
-                        # 如果是标题内的append进标题的child
-                        titles[length -1]['child'].append(obj)
-                    else:
-                        # 非标题内的直接放在第一层
-                        titles.append(obj)
-            else:
-                # 文字为空时，可能是图片或者表格
-                if self.has_image(paragraph):
-                    # 当前段落为图片
-                    obj = {}
-                    # 获取图片的blob
-                    img = self.get_image_blob(paragraph)
-                    if img is not None:
-                        imgBase64 = self.convert_blob_to_png_base64(img)
-                        if imgBase64 is not None:
-                            obj["imgBase64"] = imgBase64
-                            titles[length -1]['child'].append(obj)
-                # 在这里扩展判断表格
-        print(titles)
-        # for para in doc.paragraphs:
-        #     print(para.text)
-        #     print('------------------------')
-
 if __name__ == '__main__':
-    vision = VisionTest("./images/test.png")
-    # vision.run("闂")
-    vision.loadDoc()
+    vision = VisionTest("image_path")
+    vision.run("闂")
\ No newline at end of file

--
Gitblit v1.9.1