KnowledgeBase.git

			@@ -5,15 +5,18 @@
			# @version:
			# @description: 处理文档，拆分文档，将拆分后的章节保存到数据库中。
			from langchain_core.messages import HumanMessage
			from langchain_core.output_parsers import JsonOutputParser
			from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

			from knowledgebase.db.doc_db_models import TEntity
			from knowledgebase.doc.docx_split import DocSplit
			import asyncio
			from knowledgebase.db.doc_db_helper import doc_dbh
			from knowledgebase.doc.entity_helper import entity_helper
			from knowledgebase.doc.entity_helper import get_entity_helper
			from knowledgebase.doc.entity_recognition import EntityRecognition
			import os.path

			from knowledgebase.doc.models import DocInfo, ParagraphInfo
			from knowledgebase.doc.models import DocInfo, ParagraphInfo, DocType
			from knowledgebase.llm import llm
			from knowledgebase.log import Log
			from knowledgebase import utils
			@@ -27,12 +30,13 @@
			"""
			Log.info(f'开始处理文档：{docx_file}')
			self.docx_file = docx_file
			self.doc_split = DocSplit(docx_file)
			self.doc_type = self.get_doc_type()
			self.doc_split = DocSplit(docx_file, self.doc_type)
			self.entity_recognition = EntityRecognition(self.doc_type)
			self.doc_id = 0

			def get_doc_type(self):
			entity_helper = get_entity_helper()
			Log.info(f'识别文档类型：{self.docx_file}')
			rules = '；\n'.join([f'- {it}：{entity_helper.doc_prompt_map[it]}' for it in entity_helper.doc_prompt_map.keys()])
			msg = HumanMessage(f'''
			@@ -48,29 +52,165 @@
			Log.info(f'识别结果：{resp.content}')
			return resp.content

			async def gen_sect_entities(self, paragraph: ParagraphInfo):
			# Log.info(f'生成章节实体词：{paragraph.full_text}')
			async def get_tc_info(self, paragraph: ParagraphInfo):
			    """Ask the LLM to extract a telecommand name from one paragraph.

			    The request is only made when ``self.doc_type`` equals
			    ``DocType.tc_format``; for every other document type the method
			    returns ``''`` without calling the model.

			    :param paragraph: paragraph whose ``full_text`` is sent to the model.
			    :return: whatever ``JsonOutputParser`` parses from the model reply.
			        The prompt asks for ``{"name": ...}`` on success and ``""``
			        when nothing is recognised; ``''`` is returned for skipped
			        document types.
			    """
			    if self.doc_type not in [DocType.tc_format]:
			        return ''
			    prompt = HumanMessagePromptTemplate.from_template('''
			# 指令
			请从下面的文本中识别指令信息，如果识别失败不要输出任何字符。
			指令信息包括：指令名称。
			# 识别规则
			- 文本内容为遥控指令数据域或遥控指令应用数据的定义描述。
			# 约束
			- 如果文本内容是目录则不要输出任何字符；
			- 指令名称在章节标题中，提取指令名称要和文本中的严格一致；
			- 如果没有识别到指令信息不要输出任何字符；
			- 识别失败，不要输出任何内容，包括解释性文本；
			- 输出json格式。
			# 示例 - 识别到指令
			{{
			"name": "xxx"
			}}
			# 示例 - 未识别到指令
			""
			# 文本内容：
			{text}
			''')
			    # NOTE(review): the prompt tells the model to emit nothing on failure;
			    # confirm that JsonOutputParser tolerates an empty completion, because
			    # an exception here propagates to whoever awaits this coroutine.
			    chain = prompt.prompt | llm | JsonOutputParser()
			    return await chain.ainvoke({"text": paragraph.full_text})

			async def get_tm_pkt_info(self, paragraph: ParagraphInfo):
			    """Ask the LLM to extract telemetry-packet name and code from a paragraph.

			    The request is only made when ``self.doc_type`` is
			    ``DocType.tm_outline`` or ``DocType.tm_pkt_design``; for every other
			    document type the method returns ``''`` without calling the model.

			    :param paragraph: paragraph whose ``full_text`` is sent to the model.
			    :return: whatever ``JsonOutputParser`` parses from the model reply.
			        The prompt asks for ``{"name": ..., "code": ...}`` on success and
			        ``""`` when no single packet is recognised; ``''`` is returned for
			        skipped document types.
			    """
			    if self.doc_type not in [DocType.tm_outline, DocType.tm_pkt_design]:
			        return ''
			    prompt = HumanMessagePromptTemplate.from_template('''
			# 指令
			识别遥测包信息，请从下面的文本中识别遥测包信息，如果识别失败不要输出任何字符。
			识别规则：章节标题中包含包名称和代号，章节内容为表格，表格中包括包头定义和包参数定义。
			提取的遥测包信息包括：包名称，包代号。
			# 约束
			- 如果文本内容是目录则不要输出任何字符；
			- 文本描述的内容是单个遥测包，如果有多个遥测包则不要输出任何字符；
			- 文本结构通常是：包名称、代号和APID(应用过程标识)在开头（应用过程标识也有可能在表格中），后面紧接着是包头和参数定义表；
			- 如果没有识别到遥测包信息不要输出任何字符；
			- 识别失败，不要输出任何内容，包括解释性文本；
			- 输出json格式。
			# 符合要求的文本结构1
			1.1.1 code xxx包(APID=0x123)
			```json
			表格内容
			```
			# 符合要求的文本结构2
			1.1.1 code xxx包
			```json
			表格内容
			应用过程标识
			...
			```
			# 示例 - 识别到数据包
			{{
			"name": "xxx包",
			"code": "TMS001"
			}}
			# 示例 - 未识别到数据包
			""
			# 文本内容：
			{text}
			''')
			    # Template -> model -> JSON parser, composed with the runnable pipe operator.
			    chain = prompt.prompt | llm | JsonOutputParser()
			    return await chain.ainvoke({"text": paragraph.full_text})

			async def get_chapter_refs(self, paragraph: ParagraphInfo, toc: [str]) -> [str]:
			    """Ask the LLM which table-of-contents entries a paragraph refers to.

			    The request is only made when ``self.doc_type`` equals
			    ``DocType.tc_format``; for every other document type an empty list
			    is returned without calling the model.

			    :param paragraph: paragraph whose ``full_text`` is scanned for
			        cross-references such as "see 1.1".
			    :param toc: heading titles, joined with newlines and offered to the
			        model as the only valid answers.
			    :return: whatever ``JsonOutputParser`` parses from the model reply;
			        the prompt asks for a JSON array of TOC entries.
			    """
			    if self.doc_type not in [DocType.tc_format]:
			        # Empty list (not '') so the result matches the declared return type;
			        # both are falsy for callers that only test truthiness.
			        return []
			    toc_text = '\n'.join(toc)
			    # The TOC is supplied as a template variable instead of being spliced
			    # in with an f-string: heading text containing '{' or '}' would
			    # otherwise be parsed as template placeholders.
			    prompt = HumanMessagePromptTemplate.from_template('''
			# 角色
			你是一名资深的软件工程师。
			# 指令
			帮助我完成对文本中引用关系的抽取，判断当前文本中是否包含了引用信息，例如包含以下关键字：“详见1.1”、“见1.1”、“具体见1.1”、“见附录”等。
			如果包含引用，将引用与“目录内容”中的目录条目进行匹配。
			将匹配到的目录条目输出，输出格式为json格式。
			# 约束
			- 是否包含引用的判断条件中必须包含引用相关的描述，例如：“详见1.1”、“见1.1”、“具体见1.1”、“见附录”等；
			- 注意不要自己引用自己；
			- 仅提取目录内容中包含的条目，如果目录内容不包含则不提取；
			- 如果仅靠标题号码无法确定目录条目的，根据文本内容匹配对应的目录条目；
			- 输出的内容必须是目录中的条目；
			- 输出json格式，不要输出任何json以外的字符。
			# 输出案例
			["1.1 xxx"]
			# 目录内容：
			{toc_text}
			# 文本内容：
			{text}
			''')
			    chain = prompt.prompt | llm | JsonOutputParser()
			    return await chain.ainvoke({"toc_text": toc_text, "text": paragraph.full_text})

			async def gen_chapter_entities(self, paragraph: ParagraphInfo, paragraphs: [ParagraphInfo], toc: [str]):
			    """Attach entities and cross-references to one paragraph.

			    Four lookups run concurrently: entity-name recognition, telecommand
			    extraction, telemetry-packet extraction and chapter-reference
			    extraction. Their results are then written onto ``paragraph``:

			    * recognised entity names are resolved through
			      ``doc_dbh.get_entities_by_names`` and stored in ``paragraph.entities``;
			    * a recognised packet code / command name is turned into a ``TEntity``
			      (reusing the stored row when one exists) and appended;
			    * every referenced title that matches another paragraph is appended to
			      ``paragraph.refs``.

			    :param paragraph: paragraph to analyse and mutate in place.
			    :param paragraphs: accepted for interface compatibility; reference
			        targets are looked up in ``self.doc_split.paragraphs``.
			    :param toc: heading titles forwarded to ``get_chapter_refs``.
			    """
			    # assumes EntityRecognition.run returns an awaitable, since its result
			    # is handed straight to asyncio.gather — TODO confirm
			    entity_names, cmd, pkt, chapter_refs = await asyncio.gather(
			        self.entity_recognition.run(paragraph.full_text),
			        self.get_tc_info(paragraph),
			        self.get_tm_pkt_info(paragraph),
			        self.get_chapter_refs(paragraph, toc),
			    )

			    Log.info(f'章节{paragraph.title_num}实体词：{entity_names}')
			    Log.info(f'章节{paragraph.title_num}引用：{chapter_refs}')
			    if entity_names:
			        paragraph.entities = doc_dbh.get_entities_by_names(entity_names)

			    # The model output is free-form JSON: only trust it when it has the
			    # shape the prompt asked for, instead of indexing blindly.
			    pkt_code = pkt.get('code') if isinstance(pkt, dict) else None
			    if pkt_code:
			        paragraph.entities.append(self._get_or_create_entity(pkt_code, '遥测包配置'))

			    cmd_name = cmd.get('name') if isinstance(cmd, dict) else None
			    if cmd_name:
			        paragraph.entities.append(self._get_or_create_entity(cmd_name, '指令格式配置'))

			    # Resolve referenced titles to paragraph objects.
			    if isinstance(chapter_refs, list):
			        for ref in chapter_refs:
			            target = next((p for p in self.doc_split.paragraphs if ref == p.title), None)
			            # Skip unknown titles, self-references and repeated references.
			            if target and paragraph != target and target not in paragraph.refs:
			                paragraph.refs.append(target)

			def _get_or_create_entity(self, name, entity_type):
			    """Return a ``TEntity`` for ``name``/``entity_type`` that carries a database id.

			    An existing row found by ``doc_dbh.get_entity`` donates its id;
			    otherwise the new entity is inserted with ``doc_dbh.add_entity``.
			    """
			    entity = TEntity(name=name, type=entity_type, prompts='', doc_type='')
			    existing = doc_dbh.get_entity(entity)
			    if existing:
			        entity.id = existing.id
			    else:
			        doc_dbh.add_entity(entity)
			        Log.info(f"新增Entity：{entity.name}，id：{entity.id}")
			    return entity

			def process(self):
			    """Split the document, analyse every paragraph, then persist the result.

			    Steps:

			    1. ``self.doc_split.split()`` populates ``self.doc_split.paragraphs``.
			    2. The table of contents is the title of every paragraph whose
			       ``title_level`` is truthy.
			    3. ``gen_chapter_entities`` is awaited for all paragraphs concurrently
			       inside a single event loop.
			    4. ``self.save_to_db()`` stores the outcome.
			    """
			    self.doc_split.split()
			    paragraphs = self.doc_split.paragraphs
			    toc = [p.title for p in paragraphs if p.title_level]

			    async def run():
			        # Coroutines are created here and awaited exactly once; a coroutine
			        # object cannot be awaited again by a second event loop run.
			        await asyncio.gather(
			            *(self.gen_chapter_entities(p, paragraphs, toc) for p in paragraphs)
			        )

			    asyncio.run(run())
			    # Persist paragraphs, entities and references.
			    self.save_to_db()

			@@ -86,21 +226,8 @@
			self.doc_id = doc_dbh.add_doc(doc)
			for paragraph in doc.paragraphs:
			doc_dbh.add_paragraph(self.doc_id, None, paragraph)
			for paragraph in self.doc_split.paragraphs:
			for ref_paragraph in paragraph.refs:
			doc_dbh.add_paragraph_ref_link(paragraph.id, ref_paragraph.id)
			Log.info(f"{paragraph.title} 引用了-> {ref_paragraph.title}")
			Log.info('保存段落和段落实体词关系到数据库完成')


			if __name__ == '__main__':
			    # Manual entry point: run the full pipeline over a fixed set of
			    # sample documents, one after another.
			    files = [
			        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机1553B总线传输通信帧分配（公开）.docx",
			        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机分系统遥测源包设计报告（公开）.docx",
			        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机软件用户需求（公开）.docx",
			        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机遥测大纲（公开）.docx",
			        r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机遥测信号分配表（公开）.docx",
			        # Disabled entry:
			        # r"D:\workspace\PythonProjects\KnowledgeBase\doc\XA-5D无人机指令格式与编码定义（公开）.docx",
			        r"D:\workspace\PythonProjects\KnowledgeBase\doc\指令格式(公开).docx"
			    ]
			    for docx_path in files:
			        # Each document gets its own processor instance.
			        DocProcessor(docx_path).process()

			    # Debug aid (disabled): doc_dbh.get_docs()