From acde3bd32f07bf02839a21e8fe5b4e69bfca2251 Mon Sep 17 00:00:00 2001 From: lyg <1543117173@qq.com> Date: 星期三, 14 五月 2025 10:37:00 +0800 Subject: [PATCH] docx文档拆分,文档段落实体词提取,存入mysql数据库。 --- .gitignore | 3 knowledgebase/doc/models.py | 117 +++++ knowledgebase/db/doc_db_helper.py | 121 +++++ knowledgebase/doc/entity_recognition.py | 25 tpl/tc_transfer_frame.json | 83 +++ knowledgebase/doc/doc_convert.py | 14 knowledgebase/doc/doc_processor.py | 133 +++-- knowledgebase/doc/docx_split.py | 250 +++++++--- knowledgebase/db/doc_db_models.py | 135 +++++ /dev/null | 147 ------ tpl/entities.json | 54 ++ knowledgebase/db/neo4j.py | 19 knowledgebase/doc/entity_helper.py | 44 + knowledgebase/utils.py | 13 knowledgebase/log/__init__.py | 50 ++ tpl/tc_pkt_format.json | 147 ++++++ 16 files changed, 1,067 insertions(+), 288 deletions(-) diff --git a/.gitignore b/.gitignore index 9256fae..702c38e 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ /docs /out* /packages -__pycache__ \ No newline at end of file +__pycache__ +/static/ \ No newline at end of file diff --git a/knowledgebase/db/doc_db_helper.py b/knowledgebase/db/doc_db_helper.py new file mode 100644 index 0000000..5089e30 --- /dev/null +++ b/knowledgebase/db/doc_db_helper.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +# +# @author: lyg +# @date: 2025-5-12 +# @version: 1 +# @description: 鏂囨。鏁版嵁搴撳姪鎵嬶紝mysql鏁版嵁搴� + +import json + +from knowledgebase.db.doc_db_models import init_doc_db, TDoc, TEntity, TParagraph, TParagraphLink, TParagraphRefLink, \ + TParagraphEntityLink + +from knowledgebase.doc.models import ParagraphInfo, DocInfo + + +class DocDbHelper: + """ + 鏂囨。鏁版嵁搴撳姪鎵� + """ + def __init__(self): + self.session = init_doc_db() + + def add_doc(self, doc_info: DocInfo) -> int: + """ + 娣诲姞鏂囨。 + """ + _doc = TDoc( + file=doc_info.file, + file_name=doc_info.file_name, + is_del=0, + ) + self.session.add(_doc) + self.session.commit() + return _doc.id + + def add_paragraph(self, doc_id: int, parent_id: int, paragraph_info: ParagraphInfo) -> TParagraph: + """ + 娣诲姞娈佃惤 + :param doc_id: 鏂囨。id + :param parent_id: 鐖舵钀絠d + :param paragraph_info: 娈佃惤淇℃伅 + """ + _paragraph = TParagraph( + doc_id=doc_id, + text=paragraph_info.text, + title_level=paragraph_info.title_level, + title_num=paragraph_info.title_num, + num=paragraph_info.num, + num_level=paragraph_info.num_level, + parent_id=parent_id, + is_del=0, + ) + self.session.add(_paragraph) + self.session.commit() + if parent_id is not None: + paragraph_link = TParagraphLink(parent_id=parent_id, child_id=_paragraph.id) + self.add_paragraph_link(paragraph_link) + if paragraph_info.entities: + for entity in paragraph_info.entities: + self.add_paragraph_entity_link(TParagraphEntityLink(paragraph_id=_paragraph.id, entity_id=entity.id)) + if paragraph_info.children: + for child in paragraph_info.children: + self.add_paragraph(doc_id, _paragraph.id, child) + return _paragraph + + def add_paragraph_link(self, paragraph_link): + """ + 娣诲姞娈佃惤鍏崇郴 + :param paragraph_link: 娈佃惤鍏崇郴 + """ + self.session.add(paragraph_link) + self.session.commit() + return paragraph_link.id + + def add_paragraph_entity_link(self, paragraph_entity_link): + """ + 娣诲姞娈佃惤瀹炰綋鍏崇郴 + :param paragraph_entity_link: 娈佃惤瀹炰綋鍏崇郴 + """ + self.session.add(paragraph_entity_link) + self.session.commit() + return paragraph_entity_link.id + + def add_entity(self, entity): + """ + 娣诲姞瀹炰綋 + :param entity: 瀹炰綋 + """ + self.session.add(entity) + self.session.commit() + return entity.id + + def add_paragraph_ref_link(self, paragraph_ref_link): + """ + 娣诲姞娈佃惤寮曠敤鍏崇郴 + :param paragraph_ref_link: 娈佃惤寮曠敤鍏崇郴 + """ + self.session.add(paragraph_ref_link) + self.session.commit() + return paragraph_ref_link + + def get_all_entities(self) -> list[TEntity]: + return self.session.query(TEntity).all() + + def get_docs(self) -> list[TDoc]: + return self.session.query(TDoc).all() + + def commit(self): + self.session.commit() + + +doc_dbh = DocDbHelper() + +# if __name__ == '__main__': +# doc_db = DocDbHelper() +# # doc_db.insert_entities() +# doc = doc_db.add_doc(DocInfo(file='aaa', file_name='test')) +# p1 = doc_db.add_paragraph(doc.id, None, ParagraphInfo(text='test1', title_level=1, num=1, num_level=1)) +# p2 = doc_db.add_paragraph(doc.id, p1.id, ParagraphInfo(text='test2', title_level=2, num=1, num_level=2)) +# p3 = doc_db.add_paragraph(doc.id, p2.id, ParagraphInfo(text='test3', title_level=3, num=1, num_level=3)) +# doc_db.add_paragraph_ref_link(TParagraphRefLink(parent_id=p1.id, child_id=p3.id)) diff --git a/knowledgebase/db/doc_db_models.py b/knowledgebase/db/doc_db_models.py new file mode 100644 index 0000000..e5a11ea --- /dev/null +++ b/knowledgebase/db/doc_db_models.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +# +# @author: lyg +# @date: 2025-5-12 +# @version: 1 +# @description: 鏂囨。鏁版嵁搴撴ā鍨� + +from sqlalchemy import create_engine, Column, DateTime, Integer, Text, ForeignKey +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import relationship +from sqlalchemy.orm import sessionmaker, scoped_session + +from knowledgebase.log import Log + +Base = declarative_base() +metadata = Base.metadata + + +class TParagraph(Base): + """ + 娈佃惤琛� + """ + __tablename__ = 't_paragraphs' + id = Column(Integer, primary_key=True) + text = Column(Text) + title_level = Column(Integer) + title_num = Column(Text) + num_level = Column(Integer) + num = Column(Integer) + doc_id = Column(Integer, ForeignKey('t_docs.id')) + parent_id = Column(Integer, ForeignKey('t_paragraphs.id')) + parent_link = relationship("TParagraphLink", foreign_keys='TParagraphLink.child_id', back_populates='parent', + uselist=False) + children_links = relationship("TParagraphLink", foreign_keys='TParagraphLink.parent_id', back_populates='child') + ref_links = relationship("TParagraphRefLink", foreign_keys='TParagraphRefLink.child_id', back_populates='parent') + entity_links = relationship("TParagraphEntityLink") + is_del = Column(Integer) + + @property + def children(self): + return [link.child for link in self.children_links] + + @property + def parent(self): + if self.parent_link: + return self.parent_link.parent + return None + + def refs(self): + return [link.child for link in self.ref_links] + + +class TParagraphLink(Base): + """ + 娈佃惤link琛� + """ + __tablename__ = 't_paragraph_links' + id = Column(Integer, primary_key=True) + parent_id = Column(Integer, ForeignKey('t_paragraphs.id')) + parent = relationship("TParagraph", foreign_keys=[parent_id], back_populates="children_links") + child_id = Column(Integer, ForeignKey('t_paragraphs.id')) + child = relationship("TParagraph", foreign_keys=[child_id], back_populates="parent_link") + is_del = Column(Integer) + + +class TParagraphRefLink(Base): + """ + 娈佃惤寮曠敤link琛� + """ + __tablename__ = 't_paragraph_ref_links' + id = Column(Integer, primary_key=True) + parent_id = Column(Integer, ForeignKey('t_paragraphs.id')) + parent = relationship("TParagraph", foreign_keys=[parent_id], back_populates="ref_links") + child_id = Column(Integer, ForeignKey('t_paragraphs.id')) + child = relationship("TParagraph", foreign_keys=[child_id], viewonly=True) + is_del = Column(Integer) + + +class TParagraphEntityLink(Base): + """ + 娈佃惤瀹炰綋link琛� + """ + __tablename__ = 't_paragraph_entity_links' + id = Column(Integer, primary_key=True) + paragraph_id = Column(Integer, ForeignKey('t_paragraphs.id')) + paragraph = relationship("TParagraph", foreign_keys=[paragraph_id], back_populates="entity_links") + entity_id = Column(Integer, ForeignKey('t_entities.id')) + entity = relationship("TEntity", foreign_keys=[entity_id]) + is_del = Column(Integer) + + +class TDoc(Base): + """ + 鏂囨。琛� + """ + __tablename__ = 't_docs' + id = Column(Integer, primary_key=True) + file = Column(Text) + file_name = Column(Text) + paragraphs = relationship("TParagraph") + is_del = Column(Integer) + + +class TEntity(Base): + """ + 瀹炰綋琛� + """ + __tablename__ = 't_entities' + id = Column(Integer, primary_key=True) + name = Column(Text) + prompts = Column(Text) + type = Column(Text) + doc_type = Column(Text) + + +class TEntityLink(Base): + __tablename__ = 't_entity_links' + id = Column(Integer, primary_key=True) + entity_id = Column(Integer, ForeignKey('t_entities.id')) + is_del = Column(Integer) + + +def init_doc_db(): + """ + 鍒濆鍖栨枃妗f暟鎹簱 + :return: 鏁版嵁搴搒ession瀹炰緥 + """ + # mysql + Log.info("杩炴帴骞跺垵濮嬪寲鏂囨。鏁版嵁搴�...") + engine = create_engine('mysql+pymysql://root:123456@192.168.3.145:3306/knowledgebase', echo=False) + Base.metadata.create_all(engine) + SessionFactory = sessionmaker(bind=engine) + Session = scoped_session(SessionFactory) + session = Session() + return session diff --git a/knowledgebase/db/neo4j.py b/knowledgebase/db/neo4j.py index 5b9f887..b0916f4 100644 --- a/knowledgebase/db/neo4j.py +++ b/knowledgebase/db/neo4j.py @@ -24,7 +24,15 @@ entities=json.dumps(page_info.entities, ensure_ascii=False, indent=2)) self.graph.create(node) return node - + def create_trunk_node(self, trunk, entities): + """ + 鍒涘缓椤甸潰鑺傜偣 + """ + # 鍒涘缓鑺傜偣 + node = Node("Trunk", trunk=trunk, + entities=json.dumps(entities, ensure_ascii=False, indent=2)) + self.graph.create(node) + return node def create_entity_node(self, entity: str): """ 鍒涘缓瀹炰綋鑺傜偣 @@ -36,6 +44,15 @@ self.graph.create(node) return node + + def create_trunk_entity_relationship(self, page_node, entity_node): + """ + 鍒涘缓椤甸潰鍜屽疄浣撹妭鐐圭殑鍏崇郴 + """ + relationship = Relationship(page_node, "trunk_entity", entity_node) + self.graph.create(relationship) + return relationship + def create_page_entity_relationship(self, page_node, entity_node): """ 鍒涘缓椤甸潰鍜屽疄浣撹妭鐐圭殑鍏崇郴 diff --git a/knowledgebase/doc/doc_convert.py b/knowledgebase/doc/doc_convert.py index dd9d7c5..db2bc32 100644 --- a/knowledgebase/doc/doc_convert.py +++ b/knowledgebase/doc/doc_convert.py @@ -69,17 +69,3 @@ print(f"鏂囦欢 {docx_file} 宸叉垚鍔熻浆鎹负 {pdf_file}锛�") except Exception as e: print(f"鍑虹幇閿欒: {e}") - - -def test(): - # doc_to_docx("D:\\projects\\KnowledgeBase\\doc\\XA-5D鏃犱汉鏈烘帰娴嬪ぇ绾诧紙鍏紑锛�.doc", - # "D:\\projects\\KnowledgeBase\\doc\\XA-5D鏃犱汉鏈烘帰娴嬪ぇ绾诧紙鍏紑锛�111.docx") - # docx_to_pdf("D:/workspace/PythonProjects/KnowledgeBase/doc/ZL鏍煎紡(鍏紑).docx", - # "D:/workspace/PythonProjects/KnowledgeBase/doc/ZL鏍煎紡(鍏紑).pdf") - import pymupdf4llm - md_text = pymupdf4llm.to_markdown("D:/workspace/PythonProjects/KnowledgeBase/doc/ZL鏍煎紡(鍏紑).pdf") - print(md_text) - - -if __name__ == '__main__': - test() diff --git a/knowledgebase/doc/doc_processor.py b/knowledgebase/doc/doc_processor.py index 7dccb8b..bc0e0dd 100644 --- a/knowledgebase/doc/doc_processor.py +++ b/knowledgebase/doc/doc_processor.py @@ -1,65 +1,106 @@ # -*- coding: utf-8 -*- # @file: doc_processor.py # @author: lyg -# @date: 20250427 +# @date: 2025-5-13 # @version: -# @description: 澶勭悊鏂囨。锛屾彁鍙栫珷鑺備俊鎭紝鎻愬彇椤电爜淇℃伅锛屾彁鍙栧疄浣撹瘝锛屽啓鍏ュ浘鏁版嵁搴擄紙neo4j锛夈�� -from knowledgebase.db.neo4j import Neo4jHelper -from knowledgebase.doc.doc_split import DocSplit -from knowledgebase.doc.entity_recognition import EntityRecognition +# @description: 澶勭悊鏂囨。锛屾媶鍒嗘枃妗o紝灏嗘媶鍒嗗悗鐨勭珷鑺備繚瀛樺埌鏁版嵁搴撲腑銆� +from langchain_core.messages import HumanMessage + +from knowledgebase.doc.docx_split import DocSplit import asyncio +from knowledgebase.db.doc_db_helper import doc_dbh +from knowledgebase.doc.entity_helper import entity_helper +from knowledgebase.doc.entity_recognition import EntityRecognition +import os.path + +from knowledgebase.doc.models import DocInfo, ParagraphInfo +from knowledgebase.llm import llm +from knowledgebase.log import Log +from knowledgebase import utils class DocProcessor: - def __init__(self, pdf_file): - self.doc_split = DocSplit(pdf_file) - self.entity_recognition = EntityRecognition() - self.neo4j = Neo4jHelper() + def __init__(self, docx_file: str): + """ + 鏂囨。澶勭悊 + :param docx_file: 瑕佸鐞嗙殑鏂囨。 + """ + Log.info(f'寮�濮嬪鐞嗘枃妗o細{docx_file}') + self.docx_file = docx_file + self.doc_split = DocSplit(docx_file) + self.doc_type = self.get_doc_type() + self.entity_recognition = EntityRecognition(self.doc_type) + self.doc_id = 0 - async def gen_page_entities(self, page_info): - # 鑾峰彇椤甸潰瀹炰綋璇� - page_entities = await asyncio.to_thread(lambda: self.entity_recognition.run(page_info.text)) - page_info.entities = page_entities + def get_doc_type(self): + Log.info(f'璇嗗埆鏂囨。绫诲瀷锛歿self.docx_file}') + rules = '锛沑n'.join([f'- {it}锛歿entity_helper.doc_prompt_map[it]}' for it in entity_helper.doc_prompt_map.keys()]) + msg = HumanMessage(f''' +# 鎸囦护 +璇蜂粠涓嬮潰鐨勬枃浠跺悕涓瘑鍒枃妗g被鍨嬶紝濡傛灉璇嗗埆澶辫触涓嶈杈撳嚭浠讳綍瀛楃銆� +鏂囦欢鍚嶏細{os.path.basename(self.docx_file)} +# 璇嗗埆瑙勫垯 +{rules} +# 绀轰緥 +閬ユ祴澶х翰 +''') + resp = llm.invoke([msg]) + Log.info(f'璇嗗埆缁撴灉锛歿resp.content}') + return resp.content + + async def gen_sect_entities(self, paragraph: ParagraphInfo): + # Log.info(f'鐢熸垚绔犺妭瀹炰綋璇嶏細{paragraph.full_text}') + # 鑾峰彇绔犺妭瀹炰綋璇� + entities = await asyncio.to_thread(lambda: self.entity_recognition.run(paragraph.full_text)) + Log.info(f'绔犺妭瀹炰綋璇嶏細{entities}') + if entities: + paragraph.entities = [next(filter(lambda x: x.name == e, entity_helper.entities), None) for e in entities] + paragraph.entities = [e for e in paragraph.entities if e] def process(self): - # 鍒嗘壒骞跺彂澶勭悊锛屾瘡鎵�10椤� + self.doc_split.split() + # 鍒嗘壒骞跺彂澶勭悊锛屾瘡鎵�10涓� batch_size = 10 - for i in range(0, len(self.doc_split.page_infos), batch_size): - batch_page_infos = self.doc_split.page_infos[i:i + batch_size] + for i in range(0, len(self.doc_split.paragraphs), batch_size): + batch_paragraphs = self.doc_split.paragraphs[i:i + batch_size] tasks = [] - for page_info in batch_page_infos: - tasks.append(self.gen_page_entities(page_info)) - asyncio.run(asyncio.gather(*tasks)) - self.save_to_neo4j() + for paragraph in batch_paragraphs: + tasks.append(self.gen_sect_entities(paragraph)) - def save_to_neo4j(self): - """ - 淇濆瓨椤靛拰椤靛疄浣撹瘝鍒皀eo4j鏁版嵁搴撱�� + async def run(): + await asyncio.gather(*tasks) - 1.姣忎竴椤典负涓�涓狽ode锛� - 2.姣忎竴涓疄浣撹瘝涓轰竴涓狽ode锛� - 3.椤靛拰瀹炰綋璇嶇洿鎺ュ缓绔嬪叧绯� 椤�->瀹炰綋璇� - :return: + asyncio.run(run()) + # 淇濆瓨鍒版暟鎹簱 + self.save_to_db() + + def save_to_db(self): """ - for page_info in self.doc_split.page_infos: - # 鍒涘缓椤佃妭鐐� - page_node = self.neo4j.create_page_node(page_info) - entity_nodes = [] - for entity in page_info.entities: - # 鍒涘缓瀹炰綋璇嶈妭鐐� - entity_node = self.neo4j.create_entity_node(entity) - # 寤虹珛鍏崇郴 椤�->瀹炰綋璇� - self.neo4j.create_page_entity_relationship(page_node, entity_node) - entity_nodes.append(entity_node) - if len(entity_nodes) > 0: - for i in range(len(entity_nodes)): - prev_entity_node = entity_nodes[i] - for entity_node in entity_nodes[i + 1:]: - # 寤虹珛鍏崇郴 涓�椤典腑鐨� 瀹炰綋璇�1->瀹炰綋璇�2 - self.neo4j.create_entity_relationship(prev_entity_node, entity_node) + 淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱銆� + """ + Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱...') + with open(self.docx_file, 'rb') as f: + file_bytes = f.read() + md5 = utils.generate_bytes_md5(file_bytes) + doc = DocInfo(os.path.basename(self.docx_file), md5, self.doc_type, self.doc_split.paragraph_tree) + self.doc_id = doc_dbh.add_doc(doc) + for paragraph in doc.paragraphs: + doc_dbh.add_paragraph(self.doc_id, None, paragraph) + Log.info('淇濆瓨娈佃惤鍜屾钀藉疄浣撹瘝鍏崇郴鍒版暟鎹簱瀹屾垚') if __name__ == '__main__': - pdf_file = "D:/workspace/PythonProjects/KnowledgeBase/doc/XA-5D鏃犱汉鏈烘帰娴嬪ぇ绾诧紙鍏紑锛�111.pdf" - doc_processor = DocProcessor(pdf_file) - doc_processor.process() + files = [ + r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx", + r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈哄垎绯荤粺閬ユ祴婧愬寘璁捐鎶ュ憡锛堝叕寮�锛�.docx", + r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈鸿蒋浠剁敤鎴烽渶姹傦紙鍏紑锛�.docx", + r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬪ぇ绾诧紙鍏紑锛�.docx", + r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈洪仴娴嬩俊鍙峰垎閰嶈〃锛堝叕寮�锛�.docx", + r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈烘寚浠ゆ牸寮忎笌缂栫爜瀹氫箟锛堝叕寮�锛�.docx", + r"D:\workspace\PythonProjects\KnowledgeBase\doc\鎸囦护鏍煎紡(鍏紑).docx" + ] + for file in files: + doc_processor = DocProcessor(file) + doc_processor.process() + + # doc_dbh.get_docs() diff --git a/knowledgebase/doc/docx_split.py b/knowledgebase/doc/docx_split.py index 52df48f..22d189a 100644 --- a/knowledgebase/doc/docx_split.py +++ b/knowledgebase/doc/docx_split.py @@ -7,58 +7,35 @@ import docx import docx.table import json -from dataclasses import dataclass from PIL import Image import io import re +import typing from knowledgebase.doc.image_to_text import ImageToText - - -@dataclass -class ParagraphInfo: - """ - 娈佃惤淇℃伅 - :param text: str - 娈佃惤鏂囨湰 - :param level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 - :param title_no: str - 鏍囬缂栧彿锛屽1.1銆�1.1.1绛� - """ - text: str - level: int - title_no: str - - @property - def full_text(self): - """ - 鑾峰彇娈佃惤瀹屾暣鏂囨湰锛屽寘鍚爣棰樼紪鍙� - :return: str - 娈佃惤瀹屾暣鏂囨湰 - """ - return f"{self.title_no} {self.text}" - - def __init__(self, text: str, level: int): - """ - 娈佃惤淇℃伅 - :param text: str - 娈佃惤鏂囨湰 - :param level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 - """ - self.text = text - self.level = level - self.title_no = '' +from knowledgebase.doc.models import ParagraphInfo +from knowledgebase.log import Log class DocSplit: """ docx鏂囨。鎷嗗垎鍣紝鏍规嵁娈佃惤鎷嗗垎锛屽皢鍥剧墖鍜岃〃鏍艰浆鎹负json鏁版嵁銆� 1.灏佽娈佃惤淇℃伅 - 2.灏嗗浘鐗囧拰琛ㄦ牸杞崲涓簀son - 3.灏嗘钀芥寜鐓ф枃妗f爣棰樼骇鍒粍鍚堟垚鏍戝舰缁撴瀯 + 2.灏嗗浘鐗囪浆鎹负鑷劧璇█鎻忚堪 + 3.灏嗚〃鏍艰浆鎹负json鏍煎紡 + 4.灏嗘钀芥寜鐓ф枃妗f爣棰樼骇鍒粍鍚堟垚鏍戝舰缁撴瀯 """ - def __init__(self, doc_file): - self.doc_file = doc_file + def __init__(self, docx_file: str): + """ + docx鏂囨。鎷嗗垎 + :param docx_file: 瑕佹媶鍒嗙殑docx鏂囦欢璺緞 + """ + self.docx_file = docx_file self.image_to_text = ImageToText() - self.paragraphs:list[ParagraphInfo] = [] + self.paragraphs: list[ParagraphInfo] = [] + self.paragraph_tree: list[ParagraphInfo] = [] def table_to_json(self, table: docx.table.Table): """ @@ -67,8 +44,8 @@ :param table: docx.table.Table - 瑕佽浆鎹㈢殑琛ㄦ牸瀵硅薄 :return list - 琛ㄦ牸鏁版嵁锛屼互 JSON 鏍煎紡琛ㄧず """ - table_data = [] headers = [] + table_data = [headers] first_row = True row: docx.table._Row for row in table.rows: @@ -77,7 +54,7 @@ headers.append(cell.text) first_row = False continue - row_data = {} + row_data = [] row_idx = 0 for cell in row.cells: if cell.tables: @@ -92,7 +69,8 @@ else: # 鍗曞厓鏍兼枃鏈幏鍙� text = cell.text - row_data[headers[row_idx]] = text + # row_data[headers[row_idx]] = text + row_data.append(text) row_idx += 1 table_data.append(row_data) @@ -104,7 +82,8 @@ :return: list[ParagraphInfo] - 娈佃惤鍒楄〃 """ - document = docx.Document(self.doc_file) + Log.info(f"寮�濮嬫媶鍒嗘枃妗o細{self.docx_file}") + document = docx.Document(self.docx_file) table_cnt = 0 paragraph_cnt = 0 @@ -112,27 +91,45 @@ if element.tag.endswith('p'): # 娈佃惤 # 鑾峰彇鏍囬澶氱骇缂栧彿 paragraph = document.paragraphs[paragraph_cnt] - paragraph_text = paragraph.text - if paragraph_text: - self.paragraphs.append(ParagraphInfo(paragraph_text, self.get_header_level(paragraph))) + p_text = paragraph.text + try: + num = element.pPr.numPr.numId.val + level = element.pPr.numPr.ilvl.val + except: + num = 0 + level = 0 + if p_text: + title_level = self.get_title_level(paragraph) + self.paragraphs.append(ParagraphInfo(p_text, title_level, num, level)) # 妫�鏌ユ槸鍚︽槸鍥剧墖锛屽鏋滄槸鍥剧墖鍒欒浆鎹负鏂囨湰 - img_data = self.get_image_blob(paragraph) + img_data = self.get_image_text(paragraph) if img_data: text = self.gen_text_from_img(img_data) - self.paragraphs.append(ParagraphInfo(text, 0)) + text = f"```鍥剧墖锛堜互涓嬪唴瀹逛负鍥剧墖鎻忚堪锛塡n{text}\n```" + self.paragraphs.append(ParagraphInfo(text, 0, num, level)) paragraph_cnt += 1 elif element.tag.endswith('tbl'): # 琛ㄦ牸 table = document.tables[table_cnt] # 鑾峰彇褰撳墠琛ㄦ牸瀵硅薄 table_cnt += 1 table_data = self.table_to_json(table) - self.paragraphs.append(ParagraphInfo(json.dumps(table_data, indent=4, ensure_ascii=False), 0)) + self.paragraphs.append( + ParagraphInfo("```json\n" + json.dumps(table_data, indent=4, ensure_ascii=False) + "\n```", 0)) else: continue # 鐢熸垚鏍囬缂栧彿 - self.gen_title_no(self.paragraphs) + Log.info(f"寮�濮嬬敓鎴愭爣棰樼紪鍙峰拰鍒楄〃缂栧彿") + self.gen_title_num(self.paragraphs) + # 鐢熸垚鏍戝舰缁撴瀯 + Log.info(f"寮�濮嬬敓鎴愭爲褰㈢粨鏋�") + self.gen_paragraph_tree(self.paragraphs) @staticmethod - def get_image_blob(paragraph): + def get_image_text(paragraph): + """ + 鑾峰彇娈佃惤涓殑鍥剧墖鎻忚堪 + :param paragraph: 娈佃惤 + :return: 鍥剧墖鍐呭鎻忚堪淇℃伅 + """ # 閬嶅巻娈佃惤涓殑鎵�鏈塕un瀵硅薄锛堝浘鐗囬�氬父鍦ㄥ崟鐙殑Run涓級 for run in paragraph.runs: xml = run._element.xml @@ -144,7 +141,7 @@ if r_id: # 鑾峰彇鍥剧墖淇℃伅 image_part = paragraph.part.rels[r_id].target_part - return DocSplit.image_convert(image_part.blob, "png") + return DocSplit.image_convert(image_part.blob) if xml.find('wp:inline') != -1 or xml.find('wp:anchor') != -1: # 浣跨敤姝e垯琛ㄨ揪寮忔煡鎵緍:embed灞炴�� match = re.search(r'r:embed="([^"]+)"', xml) @@ -153,36 +150,88 @@ if r_id: # 鑾峰彇鍥剧墖淇℃伅 image_part = paragraph.part.rels[r_id].target_part - return DocSplit.image_convert(image_part.blob, "png") + return DocSplit.image_convert(image_part.blob) return None @staticmethod - def gen_title_no(paragraphs: list[ParagraphInfo]): - title_levels = [1, 1, 1, 1, 1, 1, 1, 1, 1] - for i in range(len(paragraphs)): - if paragraphs[i].level > 0: - for j in range(paragraphs[i].level - 1): - title_levels[j] = 1 - paragraphs[i].title_no = '.'.join([str(x) for x in title_levels[0:paragraphs[i].level]]) - title_levels[paragraphs[i].level - 1] += 1 + def gen_title_num(paragraphs: list[ParagraphInfo]): + """ + 鐢熸垚鏍囬缂栧彿鍜屽垪琛ㄧ紪鍙� + 鏍囬绾у埆浠�1-9锛�0琛ㄧず姝f枃 + + :param paragraphs: list[ParagraphInfo] - 娈佃惤鍒楄〃 + :return: None + """ + MAX_TITLE_LEVEL = 9 # 瀹氫箟涓哄父閲忥紝渚夸簬缁熶竴绠$悊鍜屾墿灞� + title_levels = [0] * MAX_TITLE_LEVEL # 鍒濆鍖栦负鍏�0 + + list_counters = [0] * MAX_TITLE_LEVEL + + def format_number(level: int, value: int) -> str: + # 浣跨敤鏄犲皠鏂瑰紡绠�鍖栭�昏緫 + if level < 0 or level > 4: + return str(value) + formats = { + 0: lambda v: f"({v})", + 1: lambda v: f"{v})", + 2: lambda v: f"({chr(96 + v)})", + 3: lambda v: f"{chr(96 + v)})", + 4: lambda v: chr(96 + v), + } + return formats[level](value) + + for p in paragraphs: + if p.title_level > 0: + title_levels[p.title_level - 1] += 1 + for i in range(p.title_level, MAX_TITLE_LEVEL): + title_levels[i] = 0 + p.title_num = '.'.join([str(x) for x in title_levels[:p.title_level]]) + list_counters = [0] * MAX_TITLE_LEVEL else: - title_levels = [1, 1, 1, 1, 1, 1, 1, 1, 1] + # 澶勭悊鍒楄〃缂栧彿 + if p.num > 0: + level = p.num_level + + # 鏍¢獙 level 鍚堟硶鎬� + if level < 0 or level >= MAX_TITLE_LEVEL: + continue + list_counters[level] += 1 + + # 閲嶇疆褰撳墠灞傜骇涔嬪悗鐨勮鏁板櫒 + for l in range(level + 1, MAX_TITLE_LEVEL): + list_counters[l] = 0 + + # 褰撳墠灞傜骇閫掑骞惰祴鍊� + p.title_num = format_number(level, list_counters[level]) + else: + list_counters = [0] * MAX_TITLE_LEVEL @staticmethod - def get_header_level(paragraph) -> int: - if paragraph.style.base_style: - style = paragraph.style.base_style - else: - style = paragraph.style + def get_title_level(paragraph) -> int: + """ + 鑾峰彇娈佃惤鏍囬绾у埆 + + :param paragraph: docx.paragraph.Paragraph - 瑕佽幏鍙栨爣棰樼骇鍒殑娈佃惤瀵硅薄 + :return: int - 鏍囬绾у埆锛�0 琛ㄧず闈炴爣棰� + """ + style = paragraph.style if style and style.name.startswith('Heading'): # 鑾峰彇鏍囬绾у埆 level = int(style.name.split(' ')[1]) + return level + elif style.base_style and style.base_style.name.startswith('Heading'): + level = int(style.base_style.name.split(' ')[1]) return level else: return 0 @staticmethod - def image_convert(_in: bytes, _out_format: str) -> bytes: + def image_convert(_in: bytes) -> bytes: + """ + 灏嗗浘鐗囪浆鎹负png鏍煎紡鐨刡ytes + :param _in: bytes - 鍥剧墖鏁版嵁 + :return: bytes - png鏍煎紡鐨勫浘鐗囨暟鎹� + """ in_io = io.BytesIO() in_io.write(_in) img = Image.open(in_io, "r") @@ -191,11 +240,72 @@ out_io.seek(0) return out_io.read() - def gen_text_from_img(self, img_data:bytes): + def gen_text_from_img(self, img_data: bytes): + """ + 鍒╃敤LLM灏嗗浘鐗囪浆涓烘枃鏈� + :param img_data: bytes - 鍥剧墖鏁版嵁 + :return: str - 鏂囨湰 + """ return self.image_to_text.gen_text_from_img(img_data) + def gen_paragraph_tree(self, paragraphs: typing.List[ParagraphInfo]): + """ + 鐢熸垚娈佃惤鏍戠粨鏋勶紝鏍规嵁title_level鍒掑垎娈佃惤鏍� + + :param paragraphs: list[ParagraphInfo] - 娈佃惤鍒楄〃锛堜細琚師鍦颁慨鏀癸級 + """ + if not paragraphs: + return + + stack = [] + result = [] + _paragraphs = [] + + def merge_paragraph_text(info: ParagraphInfo): + text_nodes = [child for child in info.children if child.title_level == 0] + info.text += '\n' + '\n'.join([child.full_text for child in text_nodes]) + info.children = [child for child in info.children if child.title_level > 0] + + for p in paragraphs: + if p.title_level == 1: + result.append(p) + # 娓呯悊鏍堥《姣斿綋鍓嶇骇鍒綆鎴栫浉绛夌殑鑺傜偣 + while stack and p.title_level != 0 and stack[-1].title_level >= p.title_level: + _p = stack.pop() + merge_paragraph_text(_p) + + if p.title_level > 0: + if len(stack): + stack[-1].children.append(p) + stack.append(p) + _paragraphs.append(p) + elif len(stack): + stack[-1].children.append(p) + else: + # 闈炴爣棰樻钀界洿鎺ュ姞鍏ョ粨鏋� + result.append(p) + + while stack: + merge_paragraph_text(stack.pop()) + + # 鏇挎崲鍘熷鍒楄〃鍐呭锛岄伩鍏嶅娆� remove 鎿嶄綔 + self.paragraphs[:] = _paragraphs + self.paragraph_tree = result + + if __name__ == '__main__': - doc_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\ZL鏍煎紡(鍏紑).docx' - doc_split = DocSplit(doc_file) + docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D鏃犱汉鏈�1553B鎬荤嚎浼犺緭閫氫俊甯у垎閰嶏紙鍏紑锛�.docx' + # docx_file = r'D:\workspace\PythonProjects\KnowledgeBase\doc\table_test.docx' + doc_split = DocSplit(docx_file) doc_split.split() - print("\n".join([x.full_text for x in doc_split.paragraphs])) + # er = EntityRecognition() + # db = Neo4jHelper() + # for trunk in doc_split.trunks: + # print('娈佃惤鏂囨湰锛�') + # print(trunk) + # print('瀹炰綋璇嶏細') + # print(er.run(trunk)) + # entities = er.run(trunk) + # db.create_page_node() + print("\n".join([x.full_text_with_children for x in doc_split.paragraphs])) + print() diff --git a/knowledgebase/doc/entity_helper.py b/knowledgebase/doc/entity_helper.py new file mode 100644 index 0000000..e354449 --- /dev/null +++ b/knowledgebase/doc/entity_helper.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# +# @author: +# @date: +# @version: +# @description: +import json +import os.path + +from knowledgebase.db.doc_db_helper import doc_dbh +from knowledgebase.db.doc_db_models import TEntity +from knowledgebase.log import Log + + +class EntityHelper: + # 鏂囨。绫诲瀷鍜岃瘑鍒彁绀鸿瘝map + doc_prompt_map: dict + # 鎵�鏈夊疄浣� + entities: list[TEntity] + + def __init__(self): + Log.info("鍒濆鍖朎ntityHelper") + current_dir = os.path.dirname(__file__) + self.entities = doc_dbh.get_all_entities() + self.doc_prompt_map = {} + entity_names = [entity.name for entity in self.entities] + with open(f'{current_dir}/../../tpl/entities.json', 'r', encoding='utf-8') as f: + text = f.read() + obj = json.loads(text) + for ty in obj: + obj2 = obj[ty] + for doc_ty in obj2: + prompts = obj2[doc_ty]['prompts'] + self.doc_prompt_map[doc_ty] = prompts + for entity in obj2[doc_ty]['entities']: + if entity in entity_names: + continue + _entity = TEntity(name=entity, type=ty, doc_type=doc_ty, + prompts=obj2[doc_ty]['entities'][entity]) + doc_dbh.add_entity(_entity) + Log.info(f"鏂板Entity锛歿entity}锛宨d锛歿_entity.id}") + + +entity_helper = EntityHelper() diff --git a/knowledgebase/doc/entity_recognition.py b/knowledgebase/doc/entity_recognition.py index 6512bfe..8b3d58e 100644 --- a/knowledgebase/doc/entity_recognition.py +++ b/knowledgebase/doc/entity_recognition.py @@ -11,6 +11,12 @@ import json from knowledgebase import utils +from knowledgebase.doc.entity_helper import entity_helper + +llm = ChatOpenAI(temperature=0, + model="qwen2.5-72b-instruct", + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") class EntityRecognition: @@ -21,20 +27,22 @@ """ cache_file = "entity_recognition.cache" - def __init__(self): - llm = ChatOpenAI(temperature=0, - model="qwen2.5-72b-instruct", - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - api_key="sk-15ecf7e273ad4b729c7f7f42b542749e") + def __init__(self, doc_type: str): + # 瀹炰綋璇嶅垪琛� + entities = filter(lambda x: x.doc_type == doc_type, entity_helper.entities) + entity_list = '锛沑n'.join([f'- {entity.name}锛歿entity.prompts}' for entity in entities]) + "銆�" msg = HumanMessagePromptTemplate.from_template(template=""" # 鎸囦护 -璇蜂粠缁欏畾鐨勬枃鏈腑鎻愬彇瀹炰綋璇嶅垪琛ㄣ�� +璇蜂粠缁欏畾鐨勬枃鏈腑鎻愬彇瀹炰綋璇嶅垪琛紝瀹炰綋璇嶅垪琛ㄥ畾涔夊涓嬶細 +## 瀹炰綋璇嶅垪琛ㄥ強璇嗗埆瑙勫垯 +""" + entity_list + """ # 绾︽潫 - 杈撳嚭鏍煎紡涓篔SON鏍煎紡锛� +- 鎻愬彇鐨勫疄浣撹瘝蹇呴』鏄笂闈㈠垪涓剧殑瀹炰綋璇嶏紱 - 杈撳嚭鏁版嵁缁撴瀯涓哄瓧绗︿覆鏁扮粍銆� # 绀轰緥 ```json -["瀹炰綋1","瀹炰綋2"] +["閬ユ帶甯ф牸寮�","閬ユ帶鍖呮牸寮�"] ``` # 鏂囨湰濡備笅锛� @@ -65,9 +73,10 @@ def run(self, in_text: str) -> list[str]: """ 杩愯瀹炰綋璇嗗埆鎶藉彇銆� + :param in_text: str - 杈撳叆鏂囨湰 """ # 缂撳瓨鍛戒腑 - text_md5 = utils.generate_md5(in_text) + text_md5 = utils.generate_text_md5(in_text) if text_md5 in self.cache: return self.cache[text_md5] result = self.chain.invoke({"text": in_text}) diff --git a/knowledgebase/doc/models.py b/knowledgebase/doc/models.py new file mode 100644 index 0000000..4749fee --- /dev/null +++ b/knowledgebase/doc/models.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# +# @author: lyg +# @date: 2025-5-12 +# @version: 1 +# @description: 鏂囨。鐩稿叧鏁版嵁绫� +from dataclasses import dataclass +import typing + +from knowledgebase.db.doc_db_models import TEntity + + +@dataclass +class ParagraphInfo: + """ + 娈佃惤淇℃伅 + + 灞炴�э細 + text: str - 娈佃惤鏂囨湰 + title_level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 + title_num: str - 鏍囬缂栧彿锛屽1.1銆�1.1.1绛夛紝鍒楄〃缂栧彿锛屽(1)銆�(2) + num_level: int - 鍒楄〃搴忓彿绾у埆锛�0琛ㄧず姝f枃 + num: int - 鍒楄〃搴忓彿锛屽鏋滄槸鍒楄〃 + children: typing.List[ParagraphInfo] - 瀛愭钀藉垪琛� + refs: 寮曠敤鏂囨。 + """ + text: str + title_level: int + title_num: str + num_level: int + num: int + children: typing.List + refs: typing.List + entities: typing.List[TEntity] + + @property + def full_text(self): + """ + 鑾峰彇娈佃惤瀹屾暣鏂囨湰锛屽寘鍚爣棰樼紪鍙� + :return: str - 娈佃惤瀹屾暣鏂囨湰 + """ + if self.title_num: + return f"{self.title_num}. {self.text}" + else: + return f"{self.text}" + + @property + def full_text_with_children(self): + """ + 鑾峰彇娈佃惤瀹屾暣鏂囨湰锛屽寘鍚爣棰樼紪鍙峰拰瀛愭钀� + :return: str - 娈佃惤瀹屾暣鏂囨湰 + """ + full_text = '' + if self.title_num: + full_text = f"{self.title_num}. {self.text}" + else: + full_text = f"{self.text}" + if len(self.children): + for child in self.children: + full_text = full_text + "\n" + child.full_text_with_children + return full_text + + def __init__(self, text: str, title_level: int, num=0, num_level=0): + """ + 娈佃惤淇℃伅 + + 灞炴�э細 + text: str - 娈佃惤鏂囨湰 + title_level: int - 娈佃惤绾у埆锛�1-9绾ф爣棰橈紝0琛ㄧず姝f枃 + num: int - 鍒楄〃搴忓彿 + num_level: int - 鍒楄〃搴忓彿绾у埆 + """ + self.text = text + self.title_level = title_level + self.title_num = '' + self.num = num + self.num_level = num_level + self.children: typing.List[ParagraphInfo] = [] + self.entities: typing.List[TEntity] = [] + + def __str__(self): + return f"{self.full_text}" + + def __repr__(self): + return f"{self.full_text}" + + +@dataclass +class DocInfo: + """ + 鏂囨。淇℃伅 + + 灞炴�э細 + id: int - id + file_name: str - 鏂囨。鍚嶇О銆� + file: typing.BinaryIO - 鏂囨。鏂囦欢銆� + file_type: str - 鏂囨。绫诲瀷 + paragraphs: typing.List[ParagraphInfo] - 鏂囨。娈佃惤鍒楄〃銆� + """ + id: int + file_name: str + file: str + file_type: str + paragraphs: typing.List[ParagraphInfo] + + def __init__(self, file_name: str, file: bytes, file_type: str, paragraphs: typing.List[ParagraphInfo]): + """ + 鏂囨。淇℃伅 + + 灞炴�э細 + file_name: str - 鏂囨。鍚嶇О銆� + file: bytes - 鏂囨。鏂囦欢銆� + """ + self.file_name = file_name + self.file = file + self.file_type = file_type + self.paragraphs: typing.List[ParagraphInfo] = paragraphs diff --git a/knowledgebase/log/__init__.py b/knowledgebase/log/__init__.py new file mode 100644 index 0000000..b6411d8 --- /dev/null +++ b/knowledgebase/log/__init__.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# +# @author: +# @date: +# @version: +# @description: +import logging + +logger = logging.getLogger('logs_logger') +logger.setLevel(logging.DEBUG) + +# 鍒涘缓涓�涓枃浠跺鐞嗗櫒 +file_handler = logging.FileHandler('logs.log') +file_handler.setLevel(logging.DEBUG) + +# 鍒涘缓涓�涓帶鍒跺彴澶勭悊鍣� +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.DEBUG) + +# 鍒涘缓涓�涓棩蹇楁牸寮� +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +file_handler.setFormatter(formatter) +console_handler.setFormatter(formatter) + +# 灏嗗鐞嗗櫒娣诲姞鍒拌褰曞櫒 +logger.addHandler(file_handler) +logger.addHandler(console_handler) + + +class Log: + + @staticmethod + def debug(msg): + logger.debug(msg) + + @staticmethod + def info(msg): + logger.info(msg) + + @staticmethod + def error(msg): + logger.error(msg) + + @staticmethod + def warning(msg): + logger.warning(msg) + + @staticmethod + def critical(msg): + logger.critical(msg) diff --git a/knowledgebase/utils.py b/knowledgebase/utils.py index ab7d2d1..0314db2 100644 --- a/knowledgebase/utils.py +++ b/knowledgebase/utils.py @@ -15,7 +15,7 @@ return mask -def generate_md5(input_string): +def generate_text_md5(input_string): # 鍒涘缓涓�涓� md5 鍝堝笇瀵硅薄 md5_hash = hashlib.md5() @@ -27,6 +27,17 @@ return md5_digest +def generate_bytes_md5(input_bytes): + # 鍒涘缓涓�涓� md5 鍝堝笇瀵硅薄 + md5_hash = hashlib.md5() + + # 鏇存柊鍝堝笇瀵硅薄鍐呭 + md5_hash.update(input_bytes) + + # 鑾峰彇鍝堝笇鍊肩殑鍗佸叚杩涘埗琛ㄧず + md5_digest = md5_hash.hexdigest() + + return md5_digest def file_exists(cache_file: str): return os.path.exists(cache_file) diff --git a/lang_flow.py b/lang_flow.py deleted file mode 100644 index a1b5f27..0000000 --- a/lang_flow.py +++ /dev/null @@ -1,147 +0,0 @@ -import os -from openai import OpenAI -from pathlib import Path -import re -import json - -API_KEY = 'sk-15ecf7e273ad4b729c7f7f42b542749e' - -msg = """ -浣犳槸涓�鍚嶇粡楠屼赴瀵岀殑閫氫俊鍗忚鍒嗘瀽甯堛�備綘鎷ユ湁鎴愮啛鐨勫崗璁枃妗i槄璇昏兘鍔涘拰鍗忚鏂囨。鍒嗘瀽鑳藉姏銆� 1.浣犻渶瑕佽繍鐢ㄨ嚜宸辩殑闃呰鑳藉姏鐞嗚В涓婇潰杩欎簺鏂囨。锛岃繖浜涙枃妗i兘灞炰簬鍨嬪彿涓篨A-5D鐨勬棤浜烘満绯荤粺锛� 2.鏂囨。涓弿杩扮殑鍗忚鏁翠綋缁撴瀯鍒掑垎涓猴細鍨嬪彿涓嬪寘鍚澶囷紝璁惧涓嬪寘鍚暟鎹祦锛屾暟鎹祦涓嬪寘鍚帰娴嬪抚锛屾帰娴嬪抚鍖呭惈鏁版嵁鍖咃紝鏁版嵁鍖呬笅闈㈠寘鍚叿浣撶殑鍙傛暟缁撴瀯锛� 3.鎵�鏈変俊鎭潵婧愯渚濇嵁鏂囨。杩涜鍒嗘瀽锛屼笉瑕佷娇鐢ㄧ綉缁滃唴瀹瑰拰鑷富鐢熸垚锛� 4.浣跨敤鏍戝舰缁撴瀯鐨凧SON灏嗕綘鐞嗚В鐨勭粨鏋勮繘琛岃緭鍑猴紝澶氭潯淇℃伅浣跨敤鏁扮粍鍖呭惈瀵硅薄鐨勬牸寮忥紝瀛愪俊鎭寘鍚湪key涓篶hild鐨勬暟缁勪腑锛屽瓙灞傜粨鏋勪笌鐖跺眰鐩稿悓锛� 5.涓嶈杈撳嚭鍏朵粬鏂囧瓧锛岀洿鎺ヨ緭鍑哄搴旂殑json锛涜杈撳嚭鏂囨。涓弿杩扮殑鎺㈡祴甯х粨鏋勶紱 -""" - - -class LangFlow: - files = [] - file_objects = [] - - def __init__(self, doc_files): - self.client = OpenAI( - api_key=API_KEY, - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - ) - if doc_files: - self.files = doc_files - self.load_file_objs() - self.delete_all_files() - self.upload_files() - - def load_file_objs(self): - file_stk = self.client.files.list() - self.file_objects = file_stk.data - - def delete_all_files(self): - for file_object in self.file_objects: - self.client.files.delete(file_object.id) - - def upload_file(self, file_path): - file_object = self.client.files.create(file=Path(file_path), purpose="file-extract") - return file_object - - def upload_files(self): - self.file_objects = [] - for file_path in self.files: - file_object = self.upload_file(file_path) - self.file_objects.append(file_object) - - def run(self): - result = self.tm_frame_gen() - result += self.tm_pkts_gen() - return result - - def _gen(self, msgs, msg): - messages = [] if msgs is None else msgs - if len(messages) == 0: - # 濡傛灉鏄涓�娆℃彁闂姞鍏ユ枃妗� - for file_object in self.file_objects: - messages.append({'role': 'system', 'content': 'fileid://' + file_object.id}) - messages.append({'role': 'user', 'content': msg}) - completion = self.client.chat.completions.create( - model="qwen-long", - messages=messages, - stream=False, - temperature=0.0, - top_p=0, - # stream_options={"include_usage": True} - ) - return completion - - def tm_frame_gen(self): - result = "" - messages = [] - _msg = """ -浣犳槸涓�鍚嶇粡楠屼赴瀵岀殑閫氫俊鍗忚鍒嗘瀽甯堛�備綘鎷ユ湁鎴愮啛鐨勫崗璁枃妗i槄璇昏兘鍔涘拰鍗忚鏂囨。鍒嗘瀽鑳藉姏銆� -1.浣犻渶瑕佽繍鐢ㄨ嚜宸辩殑闃呰鑳藉姏鐞嗚В涓婇潰杩欎簺鏂囨。锛岃繖浜涙枃妗i兘灞炰簬鍨嬪彿涓篨A-5D鐨勬棤浜烘満绯荤粺锛� -2.鏂囨。涓弿杩扮殑鍗忚鏁翠綋缁撴瀯鍒掑垎涓猴細鍨嬪彿涓嬪寘鍚澶囷紝璁惧涓嬪寘鍚暟鎹祦锛屾暟鎹祦涓嬪寘鍚帰娴嬪抚锛屾帰娴嬪抚鍖呭惈鏁版嵁鍖咃紝鏁版嵁鍖呬笅闈㈠寘鍚叿浣撶殑鍙傛暟缁撴瀯锛� -3.鎵�鏈変俊鎭潵婧愯渚濇嵁鏂囨。杩涜鍒嗘瀽锛屼笉瑕佷娇鐢ㄧ綉缁滃唴瀹瑰拰鑷富鐢熸垚锛� -4.浣跨敤鏍戝舰缁撴瀯鐨凧SON灏嗕綘鐞嗚В鐨勭粨鏋勮繘琛岃緭鍑猴紝澶氭潯淇℃伅浣跨敤鏁扮粍鍖呭惈瀵硅薄鐨勬牸寮忥紝瀛愪俊鎭寘鍚湪key涓篶hild鐨勬暟缁勪腑锛屽瓙灞傜粨鏋勪笌鐖跺眰鐩稿悓锛� -5.涓嶈杈撳嚭鍏朵粬鏂囧瓧锛岀洿鎺ヨ緭鍑哄搴旂殑json锛� -璇疯緭鍑烘枃妗d腑鎻忚堪鐨勬帰娴嬪抚缁撴瀯 - """ - completion = self._gen(messages, _msg) - full_content = self.remove_markdown(completion.choices[0].message.content) - print(full_content) - messages.append({'role': 'assistant', 'content': full_content}) - _msg2 = """ -璇烽槄璇绘枃妗o紝缁欎笂闈son鍔犲叆浠ヤ笅瀛楁锛� -1.type锛屼緥濡傝澶囧眰type涓篸evice锛� -2.length锛屼緥濡�8bit锛� -浠呬粎杈撳嚭json - """ - completion = self._gen(messages, _msg2) - messages.append({'role': 'user', 'content': _msg2}) - full_content = self.remove_markdown(completion.choices[0].message.content) - print('鎺㈡祴甯х粨鏋勶細'+full_content) - result += '鎺㈡祴甯х粨鏋勶細' - result += full_content - return result - - def tm_pkts_gen(self): - result = "" - messages = [] - _msg = """ -浣犳槸涓�鍚嶇粡楠屼赴瀵岀殑閫氫俊鍗忚鍒嗘瀽甯堛�備綘鎷ユ湁鎴愮啛鐨勫崗璁枃妗i槄璇昏兘鍔涘拰鍗忚鏂囨。鍒嗘瀽鑳藉姏銆� -1.浣犻渶瑕佽繍鐢ㄨ嚜宸辩殑闃呰鑳藉姏鐞嗚В涓婇潰杩欎簺鏂囨。锛岃繖浜涙枃妗i兘灞炰簬鍨嬪彿涓篨A-5D鐨勬棤浜烘満绯荤粺锛� -2.鎵�鏈変俊鎭潵婧愯渚濇嵁鏂囨。杩涜鍒嗘瀽锛屼笉瑕佷娇鐢ㄧ綉缁滃唴瀹瑰拰鑷富鐢熸垚锛� -3.涓嶈杈撳嚭鍏朵粬鏂囧瓧锛岀洿鎺ヨ緭鍑哄搴旂殑json锛� -4.璇疯緭鍑烘帰娴嬪抚涓嬫帰娴嬪弬鏁版簮鍖呭垪琛紝浠呰緭鍑哄寘鍚嶇О鏁扮粍銆� - """ - completion = self._gen(messages, _msg) - full_content = completion.choices[0].message.content - full_content = self.remove_markdown(full_content) - print(f'鎵�鏈夋帰娴嬪抚鍙傛暟婧愬寘锛�') - print(full_content) - result += '鎵�鏈夋帰娴嬪抚鍙傛暟婧愬寘锛�' - result += full_content - # 瑙f瀽json - pkts = json.loads(full_content) - for pkt in pkts: - pkt_json = self.tm_pkt_gen(pkt) - print(f'鏁版嵁鍖咃細{pkt}') - print(pkt_json) - result += f'鏁版嵁鍖咃細{pkt}\n' - result += pkt_json - return result - - def tm_pkt_gen(self, pkt_name): - messages = [] - _msg = f""" -浣犳槸涓�鍚嶇粡楠屼赴瀵岀殑閫氫俊鍗忚鍒嗘瀽甯堛�備綘鎷ユ湁鎴愮啛鐨勫崗璁枃妗i槄璇昏兘鍔涘拰鍗忚鏂囨。鍒嗘瀽鑳藉姏銆� -1.浣犻渶瑕佽繍鐢ㄨ嚜宸辩殑闃呰鑳藉姏鐞嗚В涓婇潰杩欎簺鏂囨。锛岃繖浜涙枃妗i兘灞炰簬鍨嬪彿涓篨A-5D鐨勬棤浜烘満绯荤粺锛� -2.鎵�鏈変俊鎭潵婧愯渚濇嵁鏂囨。杩涜鍒嗘瀽锛屼笉瑕佷娇鐢ㄧ綉缁滃唴瀹瑰拰鑷富鐢熸垚锛� -3.浣跨敤鏍戝舰缁撴瀯鐨凧SON灏嗕綘鐞嗚В鐨勭粨鏋勮繘琛岃緭鍑猴紝澶氭潯淇℃伅浣跨敤鏁扮粍鍖呭惈瀵硅薄鐨勬牸寮忥紝瀛愪俊鎭寘鍚湪key涓篶hild鐨勬暟缁勪腑锛屽瓙灞傜粨鏋勪笌鐖跺眰鐩稿悓锛� -4.涓嶈杈撳嚭鍏朵粬鏂囧瓧锛岀洿鎺ヨ緭鍑哄搴旂殑json锛� -璇疯緭鍑烘枃妗d腑鎻忚堪鐨剓pkt_name}鐨勭粨鏋勭殑鍚勪釜灞傜骇 - """ - completion = self._gen(messages, _msg) - return self.remove_markdown(completion.choices[0].message.content) - - def remove_markdown(self, text): - # 鍘绘帀寮�澶寸殑```json - text = re.sub(r'^```json', '', text) - # 鍘绘帀缁撳熬鐨刞``json - text = re.sub(r'```$', '', text) - return text -# if __name__ == '__main__': -# text = ChatFlow().run() -# print(text) diff --git a/tpl/entities.json b/tpl/entities.json new file mode 100644 index 0000000..6925a33 --- /dev/null +++ b/tpl/entities.json @@ -0,0 +1,54 @@ +{ + "鍨嬪彿鍩虹淇℃伅": { + "鐢ㄦ埛闇�姹�": { + "prompts": "鏂囦欢鍚嶅寘鍚�滈渶姹傗�濓紝濡傛灉鏈夊浠介渶姹傛枃浠讹紝閭d箞閫夋嫨鍖呭惈鈥滄槦鍔$鐞嗏�濈被浼煎叧閿瓧鐨勶紝鍐呭鍖呭惈閬ユ帶閬ユ祴鐩稿叧鍔熻兘鎻忚堪", + "entities": { + "绯荤粺姒傝堪": "涓�鑸湪绗竴绔犺妭锛岀被浼间簬鈥滃墠瑷�鈥濇垨鈥滄杩扳�濈殑绔犺妭锛岃繖閲屾弿杩颁簡鍨嬪彿鍩烘湰淇℃伅鍙婂垎绯荤粺缁勬垚", + "鎬荤嚎绠$悊": "涓�鑸槸鈥滄槦鍔$鐞嗏�濈殑瀛愮珷鑺傦紝绔犺妭鍚嶅寘鍚�滄�荤嚎鈥濓紝鍐呭涓昏鎻忚堪鎬荤嚎鐩稿叧鍔熻兘鎴栧紩鐢ㄦ枃浠�" + } + } + }, + "閬ユ祴鍖呴厤缃�": { + "閬ユ祴澶х翰": { + "prompts": "鏂囦欢鍚嶉�氬父鍖呭惈鈥滈仴娴嬧�� 鈥滃ぇ绾测�濈瓑鍏抽敭瀛楋紝鍐呭鍖呭惈瀵归仴娴嬪抚鍙婇仴娴嬪寘鏍煎紡鐨勫畾涔�", + "entities": { + "閬ユ祴鏍煎紡瀹氫箟": "涓�鑸湪鈥滈仴娴嬫牸寮忊�濈珷鑺傦紝鍐呭鍖呭惈鈥濋仴娴嬪抚鈥� 鈥濋仴娴嬪寘鈥滃叿浣撴牸寮忕殑瀹氫箟", + "铏氭嫙淇¢亾瀹氫箟": "绔犺妭鍚嶅寘鍚�滆櫄鎷熶俊閬撯�濓紝鍐呭鍖呭惈铏氭嫙淇¢亾鐨勫垝鍒嗭紝鍚勬簮鍖呭湪鍚勮櫄鎷熶俊閬撲笅浼犲垎閰�", + "鎻掑叆鍩�": "绔犺妭鍚嶅寘鍚�滄彃鍏ュ煙鈥濓紝鍐呭涓轰竴寮犺〃鏍硷紝瀹氫箟浜嗘彃鍏ュ煙涓殑閬ユ祴鍙傛暟", + "婧愬寘鍙傛暟琛�": "绔犺妭鍚嶅寘鍚�滄簮鍖呰璁♀�濓紝鍐呭涓哄涓簮鍖呭叿浣撳弬鏁扮殑琛ㄦ牸锛屾瘡涓簮鍖呭崟鐙竴寮犺〃鏍�" + } + }, + "婧愬寘璁捐": { + "prompts": "鏂囦欢鍚嶉�氬父鍖呭惈鈥滄簮鍖呪�濆叧閿瓧", + "entities": { + "婧愬寘鍙傛暟琛�": "閫氬父涓哄彾瀛愯妭鐐癸紝绔犺妭鍚嶉�氬父涓� 鈥渪xx鍖呪�濓紝鍐呭涓烘簮鍖呭弬鏁拌〃鏍硷紝瀹氫箟浜嗗寘澶淬�佹暟鎹煙鍏蜂綋鍐呭" + } + } + }, + "鎬荤嚎閰嶇疆": { + "鎬荤嚎閫氫俊鍗忚": { + "prompts": "鏂囦欢鍚嶄腑鍖呭惈鈥滄�荤嚎鈥濆叧閿瓧锛屽唴瀹逛负鍚勫垎绯荤粺婧愬寘鍦ㄦ�荤嚎涓婁紶杈撶殑瀹氫箟", + "entities": { + "RT鍦板潃鍒嗛厤": "绔犺妭鍚嶅寘鍚�淩T鍦板潃鈥濓紝鍐呭涓哄悇鍒嗙郴缁熷拰RT鍦板潃鍒嗛厤鍏崇郴鐨勮〃鏍�", + "鍒嗙郴缁熸簮鍖�": "閫氬父鍦ㄥ彾瀛愮珷鑺備腑锛屽唴瀹逛负璇ュ垎绯荤粺鍚勬簮鍖呭湪鎬荤嚎涓婁紶杈撴椂鎵�浣跨敤鐨勨�滀紶杈撴湇鍔♀�濄�佲�滃瓙鍦板潃鈥濄�佲�滈�氫俊甯у彿鈥濈瓑锛屽苟鎻忚堪浜嗘簮鍖呭悕绉般�丄PID銆佸寘闀跨瓑淇℃伅", + "婧愬寘鍙傛暟琛�": "绔犺妭鍚嶅寘鍚�滄簮鍖呰璁♀�濓紝鍐呭涓哄涓簮鍖呭叿浣撳弬鏁扮殑琛ㄦ牸锛屾瘡涓簮鍖呭崟鐙竴寮犺〃鏍�" + } + } + }, + "鎸囦护鏍煎紡閰嶇疆": { + "鎸囦护鏍煎紡": { + "prompts": "鏂囦欢鍚嶄腑鍖呭惈鈥滄寚浠ゆ牸寮忊�濆叧閿瓧锛屽唴瀹逛负鎸囦护鏍煎紡鐨勫畾涔�", + "entities": { + "閬ユ帶甯ф牸寮�": "绔犺妭鍚嶅寘鍚�滈仴鎺р�濆拰鈥滃抚鈥濆叧閿瓧锛屽唴瀹逛负閬ユ帶甯у悇瀛楁鐨勫畾涔�", + "閬ユ帶鍖呮牸寮�": "绔犺妭鍚嶅寘鍚�滈仴鎺р�濆拰鈥滃寘鈥濆叧閿瓧锛屽唴瀹逛负閬ユ帶鍖呭悇瀛楁鐨勫畾涔�", + "APID鍒嗛厤": "绔犺妭鍚嶅寘鍚�淎PID鈥濇垨鈥滃簲鐢ㄨ繃绋嬫爣璇嗏�濆叧閿瓧锛屽唴瀹逛负APID鍊肩殑鏋氫妇琛ㄨ揪锛屽湪閬ユ帶鍖呮暟鎹煙鎴栨寚浠ゅ崟鍏冪殑瀹氫箟绔犺妭涓紝鍖呭惈鏈夊浜嶢PID鍊肩殑鎻忚堪" + } + }, + "閬ユ帶鎸囦护琛�": { + "prompts": "鏂囦欢鍚嶄腑鍖呭惈鈥滈仴鎺ф寚浠も�濆拰鈥滆〃鈥濆叧閿瓧锛屽唴瀹逛负閬ユ帶鎸囦护浠e彿銆侀�氶亾鍙峰拰鎸囦护鍚嶇О锛堟剰涔夛級鐨勬弿杩�", + "entities": { + "寮�鍏虫寚浠や唬鍙�": "绔犺妭鍚嶅寘鍚�滈仴鎺р�濆拰鈥滄寚浠も�濆叧閿瓧锛屽唴瀹逛负閬ユ帶鎸囦护浠e彿銆侀�氶亾鍙峰拰鎸囦护鍚嶇О锛堟剰涔夛級鐨勬弿杩般�備竴鑸棿鎺NOFF鎸囦护绛夊悓浜庨棿鎺ユ寚浠わ紝鐩存帴鎸囦护绛夊悓浜庨仴鎺ф澘ONOFF鎸囦护" + } + } + } +} \ No newline at end of file diff --git a/tpl/tc_pkt_format.json b/tpl/tc_pkt_format.json new file mode 100644 index 0000000..5b0c64f --- /dev/null +++ b/tpl/tc_pkt_format.json @@ -0,0 +1,147 @@ +{ + "name": "閬ユ帶鍖�", + "type": "pkt", + "children": [ + { + "name": "涓诲澶�", + "code": "primaryHeader", + "length": 48, + "type": "combPkt", + "children": [ + { + "name": "鍖呰瘑鍒�", + "code": "packetIdentifier", + "length": 16, + "type": "combPkt", + "children": [ + { + "name": "鍖呯増鏈彿", + "code": "packetVersionNumber", + "length": 3, + "value": "{{鍖呯増鏈彿}}", + "type": "const" + }, + { + "name": "鍖呯被鍨�", + "code": "packetType", + "length": 1, + "value": "{{鍖呯被鍨媫}", + "type": "const" + }, + { + "name": "鏁版嵁鍖哄ご鏍囧織", + "code": "dataFieldHeaderFlag", + "length": 1, + "value": "{{鏁版嵁鍖哄ご鏍囧織}}", + "type": "const" + }, + { + "name": "搴旂敤杩囩▼鏍囪瘑绗�", + "code": "apid", + "length": 11, + "value": "{{搴旂敤杩囩▼鏍囪瘑绗}", + "type": "const" + } + ] + }, + { + "name": "鍖呭簭鍒楁帶鍒�", + "code": "sequenceControl", + "length": 16, + "type": "combPkt", + "children": [ + { + "name": "搴忓垪鏍囧織", + "code": "sequenceFlags", + "length": 2, + "value": "{{搴忓垪鏍囧織}}", + "type": "const" + }, + { + "name": "鍖呭簭鍒楄鏁�", + "code": "packetSequenceCount", + "length": 14, + "type": "const", + "value": "0" + } + ] + }, + { + "name": "鍖呴暱", + "code": "packetLength", + "length": 16, + "type": "length", + "value": { + "start": "secondaryHeader", + "end": "packetDataEnd", + "formula": "N-1" + } + } + ] + }, + { + "name": "鍓澶�", + "code": "secondaryHeader", + "length": 8, + "type": "combPkt", + "children": [ + { + "name": "鍓澶存爣蹇�", + "code": "ccsdsSecondaryHeaderFlag", + "length": 1, + "value": "{{鍓澶存爣蹇梷}", + "type": "const" + }, + { + "name": "閬ユ帶鍖呯増鏈彿", + "code": "tcPktVersionNumber", + "length": 3, + "value": "{{閬ユ帶鍖呯増鏈彿}}", + "type": "const" + }, + { + "name": "鍛戒护姝g‘搴旂瓟", + "code": "acknowledgmentFlag", + "length": 4, + "type": "const", + "value": "{{鍛戒护姝g‘搴旂瓟}}" + }, + { + "name": "鏈嶅姟绫诲瀷", + "code": "serviceType", + "length": 8, + "type": "const", + "value": "{{鏈嶅姟绫诲瀷}}" + }, + { + "name": "鏈嶅姟瀛愮被鍨�", + "code": "serviceSubtype", + "length": 8, + "type": "const", + "value": "{{鏈嶅姟瀛愮被鍨媫}" + }, + { + "name": "婧愬湴鍧�", + "code": "sourceAddr", + "length": 8, + "value": "{{婧愬湴鍧�}}", + "type": "const" + } + ] + }, + { + "name": "搴旂敤鏁版嵁鍖�", + "code": "data", + "length": null, + "type": "insUnitList", + "children": [] + }, + { + "name": "鍖呭樊閿欐帶鍒跺煙", + "code": "pktCheckSum", + "length": 16, + "type": "checkSum" + } + ], + "subPkts": [] +} \ No newline at end of file diff --git a/tpl/tc_transfer_frame.json b/tpl/tc_transfer_frame.json new file mode 100644 index 0000000..f12853e --- /dev/null +++ b/tpl/tc_transfer_frame.json @@ -0,0 +1,83 @@ +{ + "name": "閬ユ帶甯�", + "type": "pkt", + "children": [ + { + "name": "涓诲澶�", + "code": "primaryHeader", + "length": 40, + "type": "combPkt", + "children": [ + { + "name": "鐗堟湰鍙�", + "code": "versionNumber", + "length": 2, + "value": "{{鐗堟湰鍙穧}", + "type": "const" + }, + { + "name": "閫氳繃鏍囧織", + "code": "passFlag", + "length": 1, + "type": "const", + "value": "{{閫氳繃鏍囧織}}" + }, + { + "name": "鎺у埗鍛戒护鏍囧織", + "code": "controlCommandFlag", + "length": 1, + "value": "{{鎺у埗鍛戒护鏍囧織}}", + "type": "const" + }, + { + "name": "绌洪棽浣�", + "code": "idleBits", + "length": 2, + "value": "{{绌洪棽浣峿}", + "type": "const" + }, + { + "name": "鑸ぉ鍣ㄦ爣璇�", + "code": "staID", + "length": 10, + "value": "{{鑸ぉ鍣ㄦ爣璇唥}", + "type": "const" + }, + { + "name": "铏氭嫙淇¢亾鏍囪瘑", + "code": "vcid", + "length": 6, + "type": "enum", + "enums": "{{铏氭嫙淇¢亾鏍囪瘑}}" + }, + { + "name": "甯ч暱", + "code": "frameLength", + "length": 10, + "type": "length", + "value": {"start": "START", "end": "END", "formula": "N-1"} + }, + { + "name": "甯у簭鍒楀彿", + "code": "frameSequenceNumber", + "length": 8, + "type": "const", + "value": "0" + } + ] + }, + { + "name": "浼犻�佸抚鏁版嵁鍩�", + "code": "dataField", + "length": 8136, + "type": "subPkt" + }, + { + "name": "甯у樊閿欐帶鍒跺煙", + "code": "frameCRC", + "length": 16, + "type": "checkSum" + } + ], + "subPkts": [] +} \ No newline at end of file -- Gitblit v1.9.1