1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
# @file: doc_processor.py
# @author: lyg
# @date: 20250427
# @version: 
# @description: 处理文档,提取章节信息,提取页码信息,提取实体词,写入图数据库(neo4j)。
from knowledgebase.db.neo4j import Neo4jHelper
from knowledgebase.doc.doc_split import DocSplit
from knowledgebase.doc.entity_recognition import EntityRecognition
import asyncio
 
 
class DocProcessor:
    """Extract entity words from every page of a document and store them in neo4j.

    The document is split by ``DocSplit``; each page's text is handed to
    ``EntityRecognition.run`` and the result is written to the graph through
    ``Neo4jHelper``.
    """

    def __init__(self, pdf_file):
        """Create the splitter, the entity recognizer and the neo4j helper.

        :param pdf_file: document path, passed unchanged to ``DocSplit``.
        """
        self.doc_split = DocSplit(pdf_file)
        self.entity_recognition = EntityRecognition()
        self.neo4j = Neo4jHelper()

    async def gen_page_entities(self, page_info):
        """Recognize the entity words of one page and store them on the page.

        ``entity_recognition.run`` is executed in a worker thread through
        ``asyncio.to_thread`` so that several pages can be handled concurrently.

        :param page_info: page object; ``text`` is read, ``entities`` is set.
        """
        page_info.entities = await asyncio.to_thread(
            self.entity_recognition.run, page_info.text
        )

    async def _gen_all_entities(self, batch_size):
        """Recognize entities for all pages, ``batch_size`` pages at a time."""
        page_infos = self.doc_split.page_infos
        for start in range(0, len(page_infos), batch_size):
            batch = page_infos[start:start + batch_size]
            # gather() must be awaited inside a running event loop; each batch
            # finishes completely before the next one is started.
            await asyncio.gather(
                *(self.gen_page_entities(page_info) for page_info in batch)
            )

    def process(self, batch_size=10):
        """Recognize entities for every page, then save everything to neo4j.

        Pages are processed concurrently in batches. All batches run under a
        single ``asyncio.run`` call, which needs a coroutine (passing the
        future returned by ``asyncio.gather`` directly raises an error).

        :param batch_size: number of pages processed concurrently per batch.
        :raises ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        asyncio.run(self._gen_all_entities(batch_size))
        self.save_to_neo4j()

    def save_to_neo4j(self):
        """Save pages and their entity words to the neo4j database.

        1. Every page becomes one node.
        2. Every entity word of a page becomes one node.
        3. A relationship page -> entity word is created for each entity.
        4. Within one page, a relationship entity1 -> entity2 is created for
           every pair of entity nodes, in the order they were recognized.

        :return: None
        """
        for page_info in self.doc_split.page_infos:
            # Create the page node.
            page_node = self.neo4j.create_page_node(page_info)
            entity_nodes = []
            # A recognizer result of None is treated as "no entities".
            for entity in page_info.entities or ():
                # Create the entity node and link it: page -> entity word.
                entity_node = self.neo4j.create_entity_node(entity)
                self.neo4j.create_page_entity_relationship(page_node, entity_node)
                entity_nodes.append(entity_node)
            # Link every earlier entity of the page to every later one.
            for index, prev_entity_node in enumerate(entity_nodes):
                for entity_node in entity_nodes[index + 1:]:
                    self.neo4j.create_entity_relationship(prev_entity_node, entity_node)
 
 
if __name__ == '__main__':
    import sys

    # Script entry point: process one document and write the result to neo4j.
    # The document path may be given as the first command-line argument; with
    # no argument the original sample document path is used.
    default_pdf_file = "D:/workspace/PythonProjects/KnowledgeBase/doc/XA-5D无人机探测大纲(公开)111.pdf"
    pdf_file = sys.argv[1] if len(sys.argv) > 1 else default_pdf_file
    doc_processor = DocProcessor(pdf_file)
    doc_processor.process()