YM
2025-05-08 b2bef1e9348fef4010ed713497fdea85fc751c66
文档解析,获取章节结构,并获取图片转换为base64
1个文件已修改
3个文件已添加
158 ■■■■■ 已修改文件
.gitignore 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
static/doc/ZL格式(公开).docx 补丁 | 查看 | 原始文档 | blame | 历史
static/images/test.png 补丁 | 查看 | 原始文档 | blame | 历史
vision_test.py 157 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
.gitignore
@@ -5,3 +5,4 @@
/.conda
/docs
/out*
__pycache__
static/doc/ZL格式(公开).docx
Binary files differ
static/images/test.png
vision_test.py
New file
@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
#
# @author: lyg
# @date: 2025-5-7
# @version: 1
# @description:视觉识别文档内容
import base64
import json
import mimetypes
import os
import re
from functools import cached_property
from io import BytesIO

from docx import Document
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from PIL import Image
class VisionTest:
    """Ask a chat model about an image and extract the outline of a .docx file.

    The instance wraps one image (embedded in the prompt as a base64 data URL)
    and offers helpers that walk a Word document, group its paragraphs under
    their headings and convert embedded pictures to base64-encoded PNG.
    """

    # Document parsed by ``loadDoc`` when the caller does not pass a path.
    DEFAULT_DOC_PATH = './static/doc/ZL格式(公开).docx'
    # Environment variable consulted when no API key is passed explicitly.
    API_KEY_ENV = "DASHSCOPE_API_KEY"

    # "Heading 1", "Heading 2", ... -> captures the numeric level.
    _HEADING_STYLE = re.compile(r'^Heading\s*(\d+)')
    # Relationship id carried by a VML <v:imagedata> element itself, so that
    # the r:id of a neighbouring element (e.g. an OLE object) is never used.
    _VML_IMAGE_ID = re.compile(r'<v:imagedata\b[^>]*?\br:id="([^"]+)"')
    # Relationship id of a DrawingML picture.
    _DRAWING_IMAGE_ID = re.compile(r'r:embed="([^"]+)"')
    # XML markers of elements that may carry a picture.
    _IMAGE_CONTAINERS = ('w:object', 'w:drawing', 'w:pict')

    def __init__(self, file, api_key=None):
        """Read the image at *file* and prepare the multimodal prompt.

        Args:
            file: Path of the image to send along with every question.
            api_key: Key for the OpenAI-compatible endpoint. When omitted, the
                environment variable named by ``API_KEY_ENV`` is used. The key
                is only required once ``llm`` is first accessed.

        Raises:
            OSError: If the image file cannot be read.
        """
        # Credentials must never live in source control.
        self._api_key = api_key or os.environ.get(self.API_KEY_ENV)

        with open(file, 'rb') as image_file:
            image = base64.b64encode(image_file.read()).decode()

        # Label the data URL with the real media type; fall back to the
        # previous hard-coded value when it cannot be guessed.
        mime = mimetypes.guess_type(str(file))[0] or ""
        if not mime.startswith("image/"):
            mime = "image/jpeg"

        # A literal HumanMessage is passed through unformatted, so "{msg}"
        # would reach the model verbatim. A message *template* is required for
        # the placeholder to be filled. Base64 and MIME text contain no braces,
        # so embedding them in the template string is safe.
        self.prompt = ChatPromptTemplate.from_messages([
            SystemMessage("你是一个资深软件工程师,请分析图片回答问题。"),
            HumanMessagePromptTemplate.from_template([
                {"type": "text", "text": "{msg}"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime};base64,{image}"},
                },
            ]),
        ])

    @cached_property
    def llm(self):
        """Chat model client, created on first use.

        Building it lazily keeps the document helpers usable without
        credentials.

        Raises:
            ValueError: If no API key was supplied or found in the environment.
        """
        if not self._api_key:
            raise ValueError(
                f"No API key: pass api_key=... or set {self.API_KEY_ENV}"
            )
        return ChatOpenAI(
            temperature=0,
            model="qwen2.5-72b-instruct",
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
            api_key=self._api_key,
        )

    def run(self, msg):
        """Send *msg* together with the image to the model.

        The answer is printed and also returned.
        """
        chain = self.prompt | self.llm
        resp = chain.invoke({"msg": msg})
        print(resp.content)
        return resp.content

    def has_image(self, paragraph):
        """Return True when the paragraph XML contains a picture container."""
        xml = paragraph._element.xml
        return any(marker in xml for marker in self._IMAGE_CONTAINERS)

    def convert_blob_to_png_base64(self, image_blob):
        """Re-encode raw image bytes as PNG and return them base64-encoded.

        Returns:
            The base64 text, or None when the bytes cannot be decoded or saved
            as PNG (the error is printed).
        """
        try:
            with Image.open(BytesIO(image_blob)) as image:
                buffer = BytesIO()
                image.save(buffer, format="PNG")
        except Exception as e:
            # Best-effort boundary: an undecodable picture must not abort the
            # whole document walk.
            print(f"Error: {e}")
            return None
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def get_image_blob(self, paragraph):
        """Return the bytes of the first resolvable picture in *paragraph*.

        Each run is searched for a VML image reference first, then for a
        DrawingML one. References that cannot be resolved are skipped.

        Returns:
            The image bytes, or None when no picture could be resolved.
        """
        for run in paragraph.runs:
            xml = run._element.xml
            candidates = []

            vml = self._VML_IMAGE_ID.search(xml)
            if vml:
                candidates.append(vml.group(1))

            if 'wp:inline' in xml or 'wp:anchor' in xml:
                drawing = self._DRAWING_IMAGE_ID.search(xml)
                if drawing:
                    candidates.append(drawing.group(1))

            for r_id in candidates:
                try:
                    return paragraph.part.rels[r_id].target_part.blob
                except (KeyError, ValueError):
                    # KeyError: dangling relationship id. ValueError is what
                    # python-docx raises for externally linked targets.
                    continue
        return None

    def _heading_level(self, paragraph):
        """Return the heading level of *paragraph*, or None for body text.

        Both the paragraph's own style and the style it is based on are
        checked, so custom styles derived from "Heading N" count as headings.
        """
        style = paragraph.style
        if style is None:
            return None
        for candidate in (style, style.base_style):
            if candidate is None or not candidate.name:
                continue
            match = self._HEADING_STYLE.match(candidate.name)
            if match:
                return int(match.group(1))
        return None

    @staticmethod
    def _attach(outline, node):
        """Append *node* under the latest heading, or at top level if none."""
        if outline and "child" in outline[-1]:
            outline[-1]["child"].append(node)
        else:
            outline.append(node)

    def _build_outline(self, paragraphs):
        """Group *paragraphs* into a flat list of headings with their content.

        Headings become ``{"level", "text", "child"}`` entries; text and
        pictures become ``{"text"}`` / ``{"imgBase64"}`` nodes placed in the
        ``child`` list of the most recent heading, or at top level when no
        heading has been seen yet.
        """
        outline = []
        for paragraph in paragraphs:
            if paragraph.text != "":
                level = self._heading_level(paragraph)
                if level is not None:
                    outline.append(
                        {"level": level, "text": paragraph.text, "child": []}
                    )
                else:
                    self._attach(outline, {"text": paragraph.text})
                continue

            # No text: the paragraph may hold a picture (tables are not
            # handled here).
            if not self.has_image(paragraph):
                continue
            blob = self.get_image_blob(paragraph)
            if blob is None:
                continue
            encoded = self.convert_blob_to_png_base64(blob)
            if encoded is not None:
                self._attach(outline, {"imgBase64": encoded})
        return outline

    def loadDoc(self, doc_path=DEFAULT_DOC_PATH):
        """Parse a .docx file, print its outline and return it.

        Args:
            doc_path: Document to parse; defaults to the bundled sample.

        Returns:
            The list built by ``_build_outline``.
        """
        doc = Document(doc_path)
        titles = self._build_outline(doc.paragraphs)
        print(titles)
        return titles
if __name__ == '__main__':
    # Manual smoke test: wrap the sample picture, then print the outline of
    # the bundled sample document.
    # NOTE(review): the sample image added with this file is listed under
    # static/images/ — confirm that "./images/test.png" is the intended path.
    vision = VisionTest("./images/test.png")
    # To query the vision model instead, call vision.run(<question>).
    vision.loadDoc()