123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- import os
- import re
- import fitz
- import json
- import argparse
- import gradio as gr
- from tqdm import tqdm
- from utils.module import save_json
- from utils.pdf_extract import extract_text
- from utils.text_parser import content_extract
def pdf2json(pdf_path):
    """Extract one JSON file per product page of a PDF.

    Each page whose text contains "货号" or "款号" (optional whitespace between
    the two characters) is handed to ``content_extract``; the JSON string it
    returns is parsed and saved as ``<货号>.json`` inside
    ``./pdf_json/<pdf file stem>/``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The output directory path (with a trailing slash).

    Raises:
        KeyError: If the parsed content of a matching page has no "货号" key.
        ValueError: If ``content_extract`` returns text that is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    save_path = "./pdf_json/" + os.path.splitext(os.path.basename(pdf_path))[0] + "/"
    # makedirs also creates the "./pdf_json" parent, which os.mkdir would not,
    # and exist_ok avoids a check-then-create race.
    os.makedirs(save_path, exist_ok=True)

    # Single compiled pattern equivalent to matching either label.
    marker = re.compile(r'[货款]\s*号')

    # The document is opened only to count pages; close it immediately so the
    # file handle is not leaked.
    doc = fitz.open(pdf_path)
    try:
        page_number = len(doc)
    finally:
        doc.close()
    print(f"page number: {page_number}")

    # NOTE(review): this visits page_number + 1 indices, one more than the
    # document has pages. Whether index 0 or index page_number is the invalid
    # one depends on extract_text's page numbering, which is not visible
    # here -- confirm and tighten the range accordingly.
    for i in tqdm(range(page_number + 1)):
        text = extract_text(pdf_path, i)
        if not marker.search(text):
            continue
        json_content = content_extract(text)
        json_file = json.loads(json_content)
        save_json(f'{save_path}{json_file["货号"]}.json', json_file)
    return save_path
def pdf2image(pdfPath, zoom_x=2, zoom_y=2, rotation_angle=0):
    """Render every product page of a PDF to a PNG image.

    A page is rendered when its text contains "货号" or "款号" (optional
    whitespace between the two characters). The page text is passed to
    ``content_extract`` and the "货号" value of the parsed JSON is used as the
    image file name inside ``./pdf_image/<pdf file stem>/``.

    Args:
        pdfPath: Path of the PDF file to process.
        zoom_x: Horizontal zoom factor of the render matrix.
        zoom_y: Vertical zoom factor of the render matrix.
        rotation_angle: Rotation passed to ``Matrix.prerotate``.

    Returns:
        The image output directory path (with a trailing slash).

    Raises:
        KeyError: If the parsed content of a matching page has no "货号" key.
        ValueError: If ``content_extract`` returns text that is not valid JSON.
    """
    # Create the image output directory (including the "./pdf_image" parent).
    imgPath = "./pdf_image/" + os.path.splitext(os.path.basename(pdfPath))[0] + "/"
    os.makedirs(imgPath, exist_ok=True)

    # Loop invariants: the label pattern and the zoom/rotation matrix.
    marker = re.compile(r'[货款]\s*号')
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)

    pdf = fitz.open(pdfPath)
    try:
        for pg in tqdm(range(pdf.page_count)):
            page = pdf[pg]
            page_content = page.get_text()
            if not marker.search(page_content):
                continue
            print(f"process: {pg}")
            json_file = json.loads(content_extract(page_content))
            pm = page.get_pixmap(matrix=trans, alpha=False)
            # Public Pixmap.save replaces the private _writeIMG call.
            pm.save(imgPath + json_file["货号"] + ".png", output="png")
    finally:
        # Close the document even when a page fails to parse or render.
        pdf.close()
    return imgPath
def add_image(json_file_path, image_url):
    """Set the "商品图像" field of a JSON file to ``image_url``.

    The file is read as UTF-8, the field is added (or overwritten), and the
    document is written back in place with 4-space indentation and
    non-ASCII characters left unescaped.
    """
    # Load the current document.
    with open(json_file_path, 'r', encoding='utf-8') as src:
        record = json.load(src)

    # Attach the product image reference.
    record['商品图像'] = image_url

    # Persist the updated document over the original file.
    with open(json_file_path, 'w', encoding='utf-8') as dst:
        json.dump(record, dst, ensure_ascii=False, indent=4)

    print("已成功添加“商品图像”字段。")
def add_info(json_folder, image_folder):
    """Add an image reference to every JSON file that has a matching PNG.

    For each ``<name>.json`` in ``json_folder`` the function looks for
    ``<name>.png`` in ``image_folder``. When the image exists, the JSON file's
    "商品图像" field is set (via ``add_image``) to
    ``./database/image/<name>.png``; otherwise the file is counted as missing
    an image. The number of missing images is printed at the end.

    Args:
        json_folder: Directory containing the product JSON files.
        image_folder: Directory containing the rendered PNG images.
    """
    miss = 0
    for filename in tqdm(os.listdir(json_folder)):
        # Swap only the extension; a plain str.replace("json", "png") would
        # also rewrite a "json" substring inside the product number itself.
        stem, ext = os.path.splitext(filename)
        if ext != ".json":
            # Stray non-JSON entries are neither products nor missing images.
            continue
        image_name = stem + ".png"

        json_file_path = os.path.join(json_folder, filename)
        image_url = os.path.join("./database/image", image_name)
        image_exit = os.path.join(image_folder, image_name)
        if os.path.exists(image_exit):
            print(f"正在处理:{image_url}")
            add_image(json_file_path, image_url)
        else:
            miss += 1
            print(f"图片文件不存在:{image_exit}")
    print(f"图片文件缺失数量:{miss}")
def _copy_files(src_dir, dst_dir):
    """Copy every regular file directly inside ``src_dir`` into ``dst_dir``."""
    import shutil

    os.makedirs(dst_dir, exist_ok=True)
    for name in os.listdir(src_dir):
        src = os.path.join(src_dir, name)
        if os.path.isfile(src):
            shutil.copy(src, dst_dir)


def pdf2db(pdf_path):
    """Parse an uploaded PDF and publish its JSON and images to ./database.

    Runs ``pdf2json`` and ``pdf2image``, links each JSON file to its image
    with ``add_info``, then copies the results into ``./database/meta/`` and
    ``./database/image/``.

    Args:
        pdf_path: Path of the uploaded PDF, or an object exposing the path as
            ``.name``. A falsy value means nothing was uploaded.

    Returns:
        A status message for the UI: a prompt to upload a file, a completion
        notice, or the text of the error that interrupted processing.
    """
    # Bail out when no file has been uploaded.
    if not pdf_path:
        return "请先上传PDF文件"
    try:
        # NOTE(review): some Gradio versions appear to pass a temp-file
        # wrapper instead of a path string; accept both -- confirm against
        # the installed Gradio version.
        pdf_path = getattr(pdf_path, "name", pdf_path)

        json_folder = pdf2json(pdf_path)
        image_folder = pdf2image(pdf_path)
        add_info(json_folder, image_folder)

        # Copy with shutil instead of a shell `cp`: the folder names derive
        # from the uploaded file name, so interpolating them into a shell
        # command is unsafe and breaks on spaces. Copy failures now raise
        # and are reported below instead of being silently ignored.
        _copy_files(json_folder, "./database/meta/")
        _copy_files(image_folder, "./database/image/")
        return "已完成PDF解析..."
    except Exception as e:
        # Top-level UI boundary: surface the failure as the status message.
        return f"解析过程中发生错误:{str(e)}"
# Build the Gradio interface.
# NOTE(review): the layout nesting below is reconstructed from the blank-line
# grouping of the statements (upload widget and button share one row) --
# confirm against the intended layout.
with gr.Blocks(title="PDF解析器") as app:
    gr.Markdown("## PDF文件解析工具")
    gr.Markdown("上传PDF文件后点击解析按钮查看文本内容")

    with gr.Row():
        # File upload widget, restricted to a single PDF.
        file_input = gr.File(label="选择PDF文件", file_types=[".pdf"], file_count="single")
        # Button that triggers parsing.
        parse_btn = gr.Button("解析PDF", variant="primary")

    # Text box showing the message returned by pdf2db.
    text_output = gr.Textbox(
        label="解析结果",
        placeholder="解析后的文本将显示在此处...",
        lines=20,
        max_lines=50,
    )

    # Wire the button click to the parsing pipeline.
    parse_btn.click(fn=pdf2db, inputs=file_input, outputs=text_output)

if __name__ == "__main__":
    app.launch(server_name='0.0.0.0', server_port=1111)
|