# -*- coding: utf-8 -*-
# @file: doc_processor.py
# @author: lyg
# @date: 20250427
# @version:
# @description: Process a document: extract chapter information, page-number
#   information and entity words, and write them to the graph database (neo4j).
import asyncio
from itertools import combinations

from knowledgebase.db.neo4j import Neo4jHelper
from knowledgebase.doc.doc_split import DocSplit
from knowledgebase.doc.entity_recognition import EntityRecognition


class DocProcessor:
    """Run entity recognition over the pages of a document and store the result in neo4j.

    The pages come from ``DocSplit(pdf_file).page_infos``; each page's text is
    passed to ``EntityRecognition.run`` and the returned value is stored on the
    page object as ``entities``. Pages and entities are then written through
    ``Neo4jHelper``.
    """

    def __init__(self, pdf_file):
        """Create the splitter, the entity recognizer and the neo4j helper.

        :param pdf_file: value handed unchanged to ``DocSplit`` (the script
            entry point below passes a PDF file path string).
        """
        self.doc_split = DocSplit(pdf_file)
        self.entity_recognition = EntityRecognition()
        self.neo4j = Neo4jHelper()

    async def gen_page_entities(self, page_info):
        """Recognize the entity words of one page and store them on the page.

        ``self.entity_recognition.run`` is executed in a worker thread via
        ``asyncio.to_thread`` so that several pages can be processed
        concurrently (presumably ``run`` is a blocking call -- confirm).

        :param page_info: page object; its ``text`` attribute is read and its
            ``entities`` attribute is assigned.
        """
        page_info.entities = await asyncio.to_thread(
            self.entity_recognition.run, page_info.text
        )

    async def _gen_all_page_entities(self, batch_size):
        """Run ``gen_page_entities`` for every page, ``batch_size`` pages at a time."""
        page_infos = self.doc_split.page_infos
        for start in range(0, len(page_infos), batch_size):
            batch = page_infos[start:start + batch_size]
            # gather() must be awaited from inside a running event loop; each
            # batch finishes completely before the next one is started.
            await asyncio.gather(
                *(self.gen_page_entities(page_info) for page_info in batch)
            )

    def process(self, batch_size: int = 10) -> None:
        """Extract the entities of all pages, then save everything to neo4j.

        Pages are processed concurrently in batches of ``batch_size``.

        :param batch_size: number of pages processed concurrently per batch.
        :raises ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
        # asyncio.run() only accepts a coroutine object. The Future returned by
        # asyncio.gather() is rejected with "a coroutine was expected", so the
        # batching lives in a coroutine that is driven by a single event loop.
        asyncio.run(self._gen_all_page_entities(batch_size))
        self.save_to_neo4j()

    def save_to_neo4j(self) -> None:
        """Save the pages and their entity words to the neo4j database.

        1. Every page is one node.
        2. Every entity word is one node.
        3. A relationship page -> entity word is created for each entity of a page.

        In addition, a relationship entity1 -> entity2 is created for every
        pair of entities of the same page, where entity1 precedes entity2 in
        ``page_info.entities``.

        :return: None
        """
        for page_info in self.doc_split.page_infos:
            # Create the page node.
            page_node = self.neo4j.create_page_node(page_info)
            entity_nodes = []
            for entity in page_info.entities:
                # Create the entity-word node.
                entity_node = self.neo4j.create_entity_node(entity)
                # Create the relationship page -> entity word.
                self.neo4j.create_page_entity_relationship(page_node, entity_node)
                entity_nodes.append(entity_node)
            # Create the relationship entity1 -> entity2 within one page.
            # combinations() yields every (earlier, later) pair exactly once,
            # in list order, and yields nothing for fewer than two nodes.
            for first_node, second_node in combinations(entity_nodes, 2):
                self.neo4j.create_entity_relationship(first_node, second_node)


if __name__ == '__main__':
    pdf_file = "D:/workspace/PythonProjects/KnowledgeBase/doc/XA-5D无人机探测大纲(公开)111.pdf"
    doc_processor = DocProcessor(pdf_file)
    doc_processor.process()