From e22a78a2f2857ff98ec624b7c4f5c15b2c8362dd Mon Sep 17 00:00:00 2001 From: YM <479443481@qq.com> Date: 星期一, 12 五月 2025 16:19:22 +0800 Subject: [PATCH] Merge branch 'master' of http://182.92.203.7:2001/r/KnowledgeBase --- knowledgebase/doc/docx_split.py | 201 +++++++++++++++++++++++++ knowledgebase/gen_base_db/__init__.py | 6 .gitignore | 1 vision_test.py | 139 +--------------- knowledgebase/doc/image_to_text.py | 44 +++++ knowledgebase/gen_base_db/db_generate.py | 10 + knowledgebase/gen_base_db/json_generate.py | 11 + knowledgebase/llm.py | 17 ++ 8 files changed, 302 insertions(+), 127 deletions(-) diff --git a/.gitignore b/.gitignore index 7bc84b4..9256fae 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ /.conda /docs /out* +/packages __pycache__ \ No newline at end of file diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py new file mode 100644 index 0000000..52df48f --- /dev/null +++ b/knowledgebase/doc/docx_split.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- +# +# @author: lyg, ym +# @date: 2025-5-8 +# @version: 1 +# @description: docx鏂囨。鎷嗗垎鍣紝鏍规嵁娈佃惤鎷嗗垎锛屽皢鍥剧墖鍜岃〃鏍艰浆鎹负json鏁版嵁銆� +import docx +import docx.table +import json +from dataclasses import dataclass +from PIL import Image +import io +import re + +from knowledgebase.doc.image_to_text import ImageToText + + +@dataclass +class ParagraphInfo: + """ + 娈佃惤淇℃伅 + :param text: str - 娈佃惤鏂囨湰 + :param level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 + :param title_no: str - 鏍囬缂栧彿锛屽1.1銆�1.1.1绛� + """ + text: str + level: int + title_no: str + + @property + def full_text(self): + """ + 鑾峰彇娈佃惤瀹屾暣鏂囨湰锛屽寘鍚爣棰樼紪鍙� + :return: str - 娈佃惤瀹屾暣鏂囨湰 + """ + return f"{self.title_no} {self.text}" + + def __init__(self, text: str, level: int): + """ + 娈佃惤淇℃伅 + :param text: str - 娈佃惤鏂囨湰 + :param level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 + """ + self.text = text + self.level = level + self.title_no = '' + + +class DocSplit: + """ + docx鏂囨。鎷嗗垎鍣紝鏍规嵁娈佃惤鎷嗗垎锛屽皢鍥剧墖鍜岃〃鏍艰浆鎹负json鏁版嵁銆� + 1.灏佽娈佃惤淇℃伅 + 2.灏嗗浘鐗囧拰琛ㄦ牸杞崲涓簀son + 3.灏嗘钀芥寜鐓ф枃妗f爣棰樼骇鍒粍鍚堟垚鏍戝舰缁撴瀯 + + """ + + def __init__(self, doc_file): + self.doc_file = doc_file + self.image_to_text = ImageToText() + self.paragraphs:list[ParagraphInfo] = [] + + def table_to_json(self, table: docx.table.Table): + """ + 灏嗚〃鏍艰浆鎹负 JSON 鏍煎紡 + + :param table: docx.table.Table - 瑕佽浆鎹㈢殑琛ㄦ牸瀵硅薄 + :return list - 琛ㄦ牸鏁版嵁锛屼互 JSON 鏍煎紡琛ㄧず + """ + table_data = [] + headers = [] + first_row = True + row: docx.table._Row + for row in table.rows: + if first_row: + for cell in row.cells: + headers.append(cell.text) + first_row = False + continue + row_data = {} + row_idx = 0 + for cell in row.cells: + if cell.tables: + # 宓屽琛ㄦ牸澶勭悊 + if len(cell.tables) == 1: + text = self.table_to_json(cell.tables[0]) + else: + text = [] + for tbl in cell.tables: + tbl_json = self.table_to_json(tbl) + text.append(tbl_json) + else: + # 鍗曞厓鏍兼枃鏈幏鍙� + text = cell.text + row_data[headers[row_idx]] = text + row_idx += 1 + + table_data.append(row_data) + return table_data + + def split(self): + """ + 灏嗘枃妗f媶鍒嗘垚娈佃惤锛屽苟杩斿洖娈佃惤鍒楄〃 + + :return: list[ParagraphInfo] - 娈佃惤鍒楄〃 + """ + document = docx.Document(self.doc_file) + table_cnt = 0 + paragraph_cnt = 0 + + for element in document.element.body: + if element.tag.endswith('p'): # 娈佃惤 + # 鑾峰彇鏍囬澶氱骇缂栧彿 + paragraph = document.paragraphs[paragraph_cnt] + paragraph_text = paragraph.text + if paragraph_text: + self.paragraphs.append(ParagraphInfo(paragraph_text, self.get_header_level(paragraph))) + # 妫�鏌ユ槸鍚︽槸鍥剧墖锛屽鏋滄槸鍥剧墖鍒欒浆鎹负鏂囨湰 + img_data = self.get_image_blob(paragraph) + if img_data: + text = self.gen_text_from_img(img_data) + self.paragraphs.append(ParagraphInfo(text, 0)) + paragraph_cnt += 1 + elif element.tag.endswith('tbl'): # 琛ㄦ牸 + table = document.tables[table_cnt] # 鑾峰彇褰撳墠琛ㄦ牸瀵硅薄 + table_cnt += 1 + table_data = self.table_to_json(table) + self.paragraphs.append(ParagraphInfo(json.dumps(table_data, indent=4, ensure_ascii=False), 0)) + else: + continue + # 鐢熸垚鏍囬缂栧彿 + self.gen_title_no(self.paragraphs) + + @staticmethod + def get_image_blob(paragraph): + # 閬嶅巻娈佃惤涓殑鎵�鏈塕un瀵硅薄锛堝浘鐗囬�氬父鍦ㄥ崟鐙殑Run涓級 + for run in paragraph.runs: + xml = run._element.xml + if xml.find('v:imagedata') != -1: + # 浣跨敤姝e垯琛ㄨ揪寮忔煡鎵緍:id灞炴�� + match = re.search(r'r:id="([^"]+)"', xml) + if match: + r_id = match.group(1) + if r_id: + # 鑾峰彇鍥剧墖淇℃伅 + image_part = paragraph.part.rels[r_id].target_part + return DocSplit.image_convert(image_part.blob, "png") + if xml.find('wp:inline') != -1 or xml.find('wp:anchor') != -1: + # 浣跨敤姝e垯琛ㄨ揪寮忔煡鎵緍:embed灞炴�� + match = re.search(r'r:embed="([^"]+)"', xml) + if match: + r_id = match.group(1) + if r_id: + # 鑾峰彇鍥剧墖淇℃伅 + image_part = paragraph.part.rels[r_id].target_part + return DocSplit.image_convert(image_part.blob, "png") + return None + + @staticmethod + def gen_title_no(paragraphs: list[ParagraphInfo]): + title_levels = [1, 1, 1, 1, 1, 1, 1, 1, 1] + for i in range(len(paragraphs)): + if paragraphs[i].level > 0: + for j in range(paragraphs[i].level - 1): + title_levels[j] = 1 + paragraphs[i].title_no = '.'.join([str(x) for x in title_levels[0:paragraphs[i].level]]) + title_levels[paragraphs[i].level - 1] += 1 + else: + title_levels = [1, 1, 1, 1, 1, 1, 1, 1, 1] + + @staticmethod + def get_header_level(paragraph) -> int: + if paragraph.style.base_style: + style = paragraph.style.base_style + else: + style = paragraph.style + if style and style.name.startswith('Heading'): + # 鑾峰彇鏍囬绾у埆 + level = int(style.name.split(' ')[1]) + return level + else: + return 0 + + @staticmethod + def image_convert(_in: bytes, _out_format: str) -> bytes: + in_io = io.BytesIO() + in_io.write(_in) + img = Image.open(in_io, "r") + out_io = io.BytesIO() + img.save(out_io, "png") + out_io.seek(0) + return out_io.read() + + def gen_text_from_img(self, img_data:bytes): + return self.image_to_text.gen_text_from_img(img_data) + +if __name__ == '__main__': + doc_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\ZL鏍煎紡(鍏紑).docx' + doc_split = DocSplit(doc_file) + doc_split.split() + print("\n".join([x.full_text for x in doc_split.paragraphs])) diff --git a/knowledgebase/doc/image_to_text.py b/knowledgebase/doc/image_to_text.py new file mode 100644 index 0000000..b98c0b9 --- /dev/null +++ b/knowledgebase/doc/image_to_text.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# +# @author: lyg +# @date: 2025-5-8 +# @version: 1 +# @description: 鍒╃敤LLM灏嗗浘鐗囪浆涓烘枃鏈�� + +from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.output_parsers import JsonOutputParser +import json +import base64 + +from knowledgebase.llm import vision_llm + + +class ImageToText: + def __init__(self): + self.llm = vision_llm + self.prompt = ChatPromptTemplate.from_messages([ + ("system", "浣犳槸涓�涓祫娣辫蒋浠跺伐绋嬪笀锛岃鍒嗘瀽鍥剧墖涓殑鍐呭銆�"), + ( + "user", + [ + {"type": "text", "text": "{msg}"}, + { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,{image}"}, + } + ], + ) + ]) + + def gen_text_from_img(self, image: bytes) -> str: + """ + 浠庡浘鐗囩敓鎴愭枃鏈�� + + :param image: 鍥剧墖鏁版嵁 + :return: 鏂囨湰 + """ + image = base64.b64encode(image).decode() + chain = self.prompt | self.llm + resp = chain.invoke({"msg": "浣跨敤鑷劧璇█杈撳嚭鍥剧墖涓殑鍐呭锛屼笉瑕佸仛杩囧鐨勮В閲娿�傝緭鍑烘牸寮忎负绾枃鏈��", "image": image}) + return resp.content diff --git a/knowledgebase/gen_base_db/__init__.py b/knowledgebase/gen_base_db/__init__.py new file mode 100644 index 0000000..2bb45e4 --- /dev/null +++ b/knowledgebase/gen_base_db/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# @author: +# @date: +# @version: +# @description: diff --git a/knowledgebase/gen_base_db/db_generate.py b/knowledgebase/gen_base_db/db_generate.py new file mode 100644 index 0000000..cbfa28a --- /dev/null +++ b/knowledgebase/gen_base_db/db_generate.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# +# @author: +# @date: +# @version: +# @description: + +class DbGenerate: + def __init__(self): + pass \ No newline at end of file diff --git a/knowledgebase/gen_base_db/json_generate.py b/knowledgebase/gen_base_db/json_generate.py new file mode 100644 index 0000000..c19de82 --- /dev/null +++ b/knowledgebase/gen_base_db/json_generate.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +# +# @author: +# @date: +# @version: +# @description: + +from knowledgebase.llm import llm +class JsonGenerate: + def __init__(self): + self.llm = llm diff --git a/knowledgebase/llm.py b/knowledgebase/llm.py new file mode 100644 index 0000000..ac6e035 --- /dev/null +++ b/knowledgebase/llm.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# +# @author: lyg +# @date: 2025-5-8 +# @version: 1 +# @description: 鍏叡langchain LLM 瀹炰緥 +from langchain_openai.chat_models import ChatOpenAI + +llm = ChatOpenAI(temperature=0, + model="qwen2.5-72b-instruct", + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") + +vision_llm = ChatOpenAI(temperature=0, + model="qwen2.5-vl-32b-instruct", + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") \ No newline at end of file diff --git a/vision_test.py b/vision_test.py index f9872d4..fcc7138 100644 --- a/vision_test.py +++ b/vision_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# +# # @author: lyg # @date: 2025-5-7 # @version: 1 @@ -7,25 +7,24 @@ from langchain_openai.chat_models import ChatOpenAI from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate -from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.messages import HumanMessage,SystemMessage from langchain_core.output_parsers import JsonOutputParser -from docx import Document -from PIL import Image -from io import BytesIO -import re import json import base64 class VisionTest: - def __init__(self, file): - self.llm = ChatOpenAI(temperature=0, model="qwen2.5-72b-instruct", - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") + def __init__(self,file): + self.llm = ChatOpenAI(temperature=0, + model="qwen2.5-72b-instruct", + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") + image = base64.b64encode(open(file, 'rb').read()).decode() self.prompt = ChatPromptTemplate.from_messages([ SystemMessage("浣犳槸涓�涓祫娣辫蒋浠跺伐绋嬪笀锛岃鍒嗘瀽鍥剧墖鍥炵瓟闂銆�"), HumanMessage(content=[ - {"type": "text", "text": "{msg}"}, + {"type": "text", "text": "describe the weather in this image"}, { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}, @@ -33,125 +32,11 @@ ]) ]) - def run(self, msg): + def run(self,msg): chain = self.prompt | self.llm resp = chain.invoke({"msg": msg}) print(resp.content) - # def get_document_chapters(doc_path): - # doc = Document(doc_path) - # chapters = [] - # current_chapter = None - - # for para in doc.paragraphs: - # if para.style.name.startswith('Heading'): # 妫�鏌ユ槸鍚︿负鏍囬鏍峰紡 - # level = int(para.style.name.replace('Heading', '')) # 鑾峰彇鏍囬绾у埆 - # current_chapter = {'level': level, 'title': para.text, 'content': []} - # chapters.append(current_chapter) - # elif current_chapter is not None: - # current_chapter['content'].append(para.text) # 娣诲姞鍐呭鍒板綋鍓嶇珷鑺� - - # return chapters - - def has_image(self, paragraph): - # 閫氳繃妫�鏌ML涓殑宓屽叆寮忓璞℃潵鍒ゆ柇鏄惁鏈夊浘鐗� - xml = paragraph._element.xml - return 'w:object' in xml or 'w:drawing' in xml - - def convert_blob_to_png_base64(self, image_blob): - try: - # 鎵撳紑鍥剧墖 - image = Image.open(BytesIO(image_blob)) - # 鍒涘缓鍐呭瓨缂撳啿鍖� - buffer = BytesIO() - # 淇濆瓨涓篜NG鏍煎紡 - image.save(buffer, format="PNG") - # 鑾峰彇PNG鏍煎紡鐨勪簩杩涘埗鏁版嵁 - png_data = buffer.getvalue() - # 杞崲涓築ase64缂栫爜 - base64_data = base64.b64encode(png_data).decode('utf-8') - return base64_data - except Exception as e: - print(f"Error: {e}") - return None - - def get_image_blob(self, paragraph): - # 閬嶅巻娈佃惤涓殑鎵�鏈塕un瀵硅薄锛堝浘鐗囬�氬父鍦ㄥ崟鐙殑Run涓級 - for run in paragraph.runs: - xml = run._element.xml - if xml.find('v:imagedata') != -1: - # 浣跨敤姝e垯琛ㄨ揪寮忔煡鎵緍:id灞炴�� - match = re.search(r'r:id="([^"]+)"', xml) - if match: - r_id = match.group(1) - if r_id: - # 鑾峰彇鍥剧墖淇℃伅 - image_part = paragraph.part.rels[r_id].target_part - return image_part.blob - if xml.find('wp:inline') != -1 or xml.find('wp:anchor') != -1: - # 浣跨敤姝e垯琛ㄨ揪寮忔煡鎵緍:embed灞炴�� - match = re.search(r'r:embed="([^"]+)"', xml) - if match: - r_id = match.group(1) - if r_id: - # 鑾峰彇鍥剧墖淇℃伅 - image_part = paragraph.part.rels[r_id].target_part - return image_part.blob - return None - - def loadDoc(self): - doc = Document('./static/doc/ZL鏍煎紡(鍏紑).docx') - # 鎸夌収鏍囬鑾峰彇娈佃惤鐨勫眰绾х粨鏋� - titles = [] - for paragraph in doc.paragraphs: - if paragraph.text != "": - # 鏂囧瓧涓嶄负绌� - if paragraph.style.base_style is not None: - # 鏈塨ase_style - if paragraph.style.base_style.name.startswith('Heading'): - # 鏄爣棰� - level = int(paragraph.style.base_style.name.split(' ')[-1]) - obj = {} - obj["level"] = level - obj["text"] = paragraph.text - obj["child"] = [] - titles.append(obj) - else: - length = len(titles) - if "child" in titles[length -1]: - obj = {} - obj["text"] = paragraph.text - titles[length -1]['child'].append(obj) - else: - # 娌℃湁base_style - length = len(titles) - obj = {} - obj["text"] = paragraph.text - if length > 0 and "child" in titles[length -1]: - # 濡傛灉鏄爣棰樺唴鐨刟ppend杩涙爣棰樼殑child - titles[length -1]['child'].append(obj) - else: - # 闈炴爣棰樺唴鐨勭洿鎺ユ斁鍦ㄧ涓�灞� - titles.append(obj) - else: - # 鏂囧瓧涓虹┖鏃讹紝鍙兘鏄浘鐗囨垨鑰呰〃鏍� - if self.has_image(paragraph): - # 褰撳墠娈佃惤涓哄浘鐗� - obj = {} - # 鑾峰彇鍥剧墖鐨刡lob - img = self.get_image_blob(paragraph) - if img is not None: - imgBase64 = self.convert_blob_to_png_base64(img) - if imgBase64 is not None: - obj["imgBase64"] = imgBase64 - titles[length -1]['child'].append(obj) - # 鍦ㄨ繖閲屾墿灞曞垽鏂〃鏍� - print(titles) - # for para in doc.paragraphs: - # print(para.text) - # print('------------------------') - if __name__ == '__main__': - vision = VisionTest("./static/images/test.png") - # vision.run("闂") - vision.loadDoc() + vision = VisionTest("image_path") + vision.run("闂") -- Gitblit v1.9.1