import math
|
import os
|
|
from knowledgebase.markitdown import MarkItDown
|
|
from doc_to_docx import doc_to_docx
|
|
|
def process_docs(directory):
    """Convert every legacy ``.doc`` file directly inside *directory* to ``.docx``.

    The converted file is written next to its source with the same base name.
    Sub-directories are not scanned.

    Args:
        directory: Folder to scan. A trailing path separator is optional
            because paths are built with ``os.path.join``.
    """
    for filename in os.listdir(directory):
        # Only names ending in ".doc" qualify; ".docx" files are skipped.
        if not filename.endswith(".doc"):
            continue
        source_path = os.path.join(directory, filename)
        # The name is known to end with ".doc", so appending "x" swaps only the
        # extension and leaves any ".doc" inside the base name untouched.
        target_path = os.path.join(directory, filename + "x")
        doc_to_docx(source_path, target_path)
|
|
|
# Module-level MarkItDown converter, created once at import time and used by
# to_markdown() for every document it converts.
md = MarkItDown()
|
|
|
def to_markdown(dst_dir: str):
    """Convert every ``.docx`` file directly inside *dst_dir* to Markdown.

    Each document is converted with the module-level ``md`` converter and its
    text content is written, UTF-8 encoded, next to the source file as
    ``<name>.docx.md``.

    Args:
        dst_dir: Folder containing the ``.docx`` files. A trailing path
            separator is optional because paths are built with
            ``os.path.join``.

    Returns:
        The path of the last Markdown file written (files are processed in
        sorted name order), or ``None`` when the folder holds no ``.docx``
        file.
    """
    # Initialised up front so a folder without .docx files returns None
    # instead of failing on an unbound local at the return statement.
    out_file = None
    # os.listdir() returns names in arbitrary order; sorting makes the
    # returned "last" path deterministic.
    for file in sorted(os.listdir(dst_dir)):
        if not file.endswith(".docx"):
            continue
        source_path = os.path.join(dst_dir, file)
        result = md.convert(source_path)
        out_file = source_path + '.md'
        with open(out_file, 'w', encoding='utf-8') as f:
            f.write(result.text_content)
    return out_file
|
|
|
# 1.解析文档
|
# 2.输入文档
|
# 3.启动LangFlow
|
def main():
    """Run the document pipeline from the local ``doc`` folder.

    Only the Markdown conversion step is active; the ``.doc`` to ``.docx``
    conversion and the LangFlow step are commented out.
    """
    source_dir = ".\\doc\\"

    # Step 1 (disabled): convert legacy .doc files to .docx.
    # process_docs(source_dir)

    # Step 2: convert the .docx files to Markdown.
    markdown_path = to_markdown(source_dir)

    # NOTE(review): this hard-coded, machine-specific path overwrites the value
    # returned by to_markdown() above -- looks like a debugging leftover;
    # confirm before enabling the steps below.
    markdown_path = 'D:\\workspace\\PythonProjects\\KnowledgeBase\\doc\\test.md'

    # Step 3 (disabled): run the large-model processing flow.
    # ret_text = LangFlow([markdown_path]).run()

    # Step 4 (disabled): save the result.
    # with open('D:\\workspace\\PythonProjects\\KnowledgeBase\\doc\\test.text', 'w', encoding='utf-8') as f:
    #     f.write(ret_text)
|
|
|
def get_bit_mask(start, end):
    """Return an integer mask with bits ``start``..``end`` (inclusive) set.

    Bit indices count from the most significant bit of a byte-aligned field:
    the field width is ``end + 1`` rounded up to a multiple of 8 (8 when that
    rounds to zero), and index ``i`` maps to bit ``width - i - 1``.

    Args:
        start: Index of the first bit to set.
        end: Index of the last bit to set.

    Returns:
        The mask as an int; ``0`` when ``start`` is greater than ``end``.
    """
    # Round up to a whole number of bytes; fall back to one byte on zero.
    width = math.ceil((end + 1) / 8) * 8 or 8
    highest = width - 1
    # Every index contributes a distinct bit, so summing equals OR-ing them.
    return sum(1 << (highest - index) for index in range(start, end + 1))
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|