# pdf_parser.py
import argparse
import json
import os
import re
import shutil

import fitz
import gradio as gr
from tqdm import tqdm

from utils.module import save_json
from utils.pdf_extract import extract_text
from utils.text_parser import content_extract
  11. def pdf2json(pdf_path):
  12. save_path = "./pdf_json/" + os.path.splitext(os.path.basename(pdf_path))[0] + "/"
  13. if not os.path.exists(save_path):
  14. os.mkdir(save_path)
  15. pattern_1 = r'货\s*号'
  16. pattern_2 = r'款\s*号'
  17. doc = fitz.open(pdf_path)
  18. page_number = len(doc)
  19. print(f"page number: {page_number}")
  20. for i in tqdm(range(page_number+1)):
  21. text = extract_text(pdf_path, i)
  22. # print(f"page-{i} content: {text}")
  23. if re.search(pattern_1, text) or re.search(pattern_2, text):
  24. json_content = content_extract(text)
  25. json_file = json.loads(json_content)
  26. save_json(f'{save_path}{json_file["货号"]}.json', json_file)
  27. return save_path
  28. def pdf2image(pdfPath, zoom_x=2, zoom_y=2, rotation_angle=0):
  29. # 创建图像保持目录
  30. imgPath = "./pdf_image/" + os.path.splitext(os.path.basename(pdfPath))[0] + "/"
  31. if not os.path.exists(imgPath):
  32. os.mkdir(imgPath)
  33. # 打开PDF文件
  34. pdf = fitz.open(pdfPath)
  35. # 逐页读取PDF
  36. for pg in tqdm(range(0, pdf.page_count)):
  37. pattern_1 = r'货\s*号'
  38. pattern_2 = r'款\s*号'
  39. page = pdf[pg]
  40. page_content = page.get_text()
  41. if re.search(pattern_1, page_content) or re.search(pattern_2, page_content):
  42. print(f"process: {pg}")
  43. json_content = content_extract(page_content)
  44. json_file = json.loads(json_content)
  45. # 设置缩放和旋转系数
  46. trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)
  47. pm = page.get_pixmap(matrix=trans, alpha=False)
  48. # 开始写图像
  49. pm._writeIMG(imgPath + json_file["货号"] + ".png", format_="png", jpg_quality=100)
  50. pdf.close()
  51. return imgPath
  52. def add_image(json_file_path, image_url):
  53. """添加商品图像字段到指定的JSON文件"""
  54. # 读取JSON文件
  55. with open(json_file_path, 'r', encoding='utf-8') as file:
  56. data = json.load(file)
  57. # 添加“商品图像”字段
  58. data['商品图像'] = image_url
  59. # 将更新后的数据写回到JSON文件
  60. with open(json_file_path, 'w', encoding='utf-8') as file:
  61. json.dump(data, file, ensure_ascii=False, indent=4)
  62. print("已成功添加“商品图像”字段。")
  63. def add_info(json_folder, image_folder):
  64. miss = 0
  65. for filename in tqdm(os.listdir(json_folder)):
  66. json_file_path = os.path.join(json_folder, filename)
  67. image_url = os.path.join("./database/image", filename.replace("json","png"))
  68. image_exit = os.path.join(image_folder, filename.replace("json","png"))
  69. if os.path.exists(image_exit):
  70. print(f"正在处理:{image_url}")
  71. add_image(json_file_path, image_url)
  72. else:
  73. miss += 1
  74. print(f"图片文件不存在:{image_exit}")
  75. print(f"图片文件缺失数量:{miss}")
  76. def pdf2db(pdf_path):
  77. # 检查是否上传了文件
  78. if not pdf_path:
  79. return "请先上传PDF文件"
  80. try:
  81. json_folder = pdf2json(pdf_path)
  82. image_folder = pdf2image(pdf_path)
  83. add_info(json_folder, image_folder)
  84. os.system(f'cp {json_folder}* ./database/meta/')
  85. os.system(f'cp {image_folder}* ./database/image/')
  86. return "已完成PDF解析..."
  87. except Exception as e:
  88. return f"解析过程中发生错误:{str(e)}"
  89. # 创建Gradio界面
  90. with gr.Blocks(title="PDF解析器") as app:
  91. gr.Markdown("## PDF文件解析工具")
  92. gr.Markdown("上传PDF文件后点击解析按钮查看文本内容")
  93. with gr.Row():
  94. # 文件上传组件
  95. file_input = gr.File(
  96. label="选择PDF文件",
  97. file_types=[".pdf"],
  98. file_count="single"
  99. )
  100. # 解析按钮
  101. parse_btn = gr.Button("解析PDF", variant="primary")
  102. # 结果显示组件
  103. text_output = gr.Textbox(
  104. label="解析结果",
  105. placeholder="解析后的文本将显示在此处...",
  106. lines=20,
  107. max_lines=50
  108. )
  109. # 绑定按钮点击事件
  110. parse_btn.click(
  111. fn=pdf2db,
  112. inputs=file_input,
  113. outputs=text_output
  114. )
  115. if __name__ == "__main__":
  116. app.launch(server_name='0.0.0.0', server_port=1111)