"""Parse PDF pages into per-item JSON files and PNG page images.

Pages whose text contains a "货号" / "款号" label are run through
``content_extract``; the resulting JSON and a rendered page image are
stored under ``./pdf_json`` / ``./pdf_image`` and then copied into
``./database``. A small Gradio UI drives the pipeline.
"""

import os
import re
import json
import shutil
import argparse

import fitz
import gradio as gr
from tqdm import tqdm

from utils.module import save_json
from utils.pdf_extract import extract_text
from utils.text_parser import content_extract

# Matches the "货号" or "款号" label, allowing whitespace between the two
# characters. Equivalent to testing r'货\s*号' and r'款\s*号' separately.
_ITEM_NO_PATTERN = re.compile(r'[货款]\s*号')


def pdf2json(pdf_path):
    """Extract one JSON file per matching page of ``pdf_path``.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The output directory (with trailing ``/``) that holds the JSON files,
        named ``./pdf_json/<pdf basename without extension>/``.

    Raises:
        KeyError: If the parsed page content has no "货号" key.
        json.JSONDecodeError: If ``content_extract`` returns invalid JSON.
    """
    save_path = "./pdf_json/" + os.path.splitext(os.path.basename(pdf_path))[0] + "/"
    # makedirs also creates the "./pdf_json" parent on first run.
    os.makedirs(save_path, exist_ok=True)

    # The document is only needed for its page count; close it right away.
    with fitz.open(pdf_path) as doc:
        page_number = len(doc)
    print(f"page number: {page_number}")

    # NOTE(review): this range covers page_number + 1 indices, one more than
    # the number of pages. Kept as-is because the indexing base of
    # extract_text is not visible here -- confirm and tighten the range.
    for i in tqdm(range(page_number + 1)):
        text = extract_text(pdf_path, i)
        if not _ITEM_NO_PATTERN.search(text):
            continue
        json_file = json.loads(content_extract(text))
        # NOTE(review): a page matching only "款号" still needs a "货号" key
        # in the parsed result -- verify content_extract always provides it.
        save_json(f'{save_path}{json_file["货号"]}.json', json_file)
    return save_path


def pdf2image(pdfPath, zoom_x=2, zoom_y=2, rotation_angle=0):
    """Render every matching page of ``pdfPath`` to a PNG image.

    Args:
        pdfPath: Path to the PDF file.
        zoom_x: Horizontal scale factor for rendering.
        zoom_y: Vertical scale factor for rendering.
        rotation_angle: Rotation applied to the rendering matrix.

    Returns:
        The output directory (with trailing ``/``) that holds the images,
        named ``./pdf_image/<pdf basename without extension>/``. Each image
        is named after the page's "货号" value.
    """
    # Create the image output directory (including the parent if missing).
    imgPath = "./pdf_image/" + os.path.splitext(os.path.basename(pdfPath))[0] + "/"
    os.makedirs(imgPath, exist_ok=True)

    # The scale/rotation matrix does not depend on the page; build it once.
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)

    # The context manager closes the document even if a page fails.
    with fitz.open(pdfPath) as pdf:
        # Read the PDF page by page.
        for pg in tqdm(range(pdf.page_count)):
            page = pdf[pg]
            page_content = page.get_text()
            if not _ITEM_NO_PATTERN.search(page_content):
                continue
            print(f"process: {pg}")
            json_file = json.loads(content_extract(page_content))
            pm = page.get_pixmap(matrix=trans, alpha=False)
            # Public API; the output format (PNG) follows the file extension.
            pm.save(imgPath + json_file["货号"] + ".png")
    return imgPath


def add_image(json_file_path, image_url):
    """Add the "商品图像" field to the given JSON file.

    Args:
        json_file_path: Path of the JSON file to update in place.
        image_url: Value stored under the "商品图像" key.
    """
    # Read the JSON file.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Add the "商品图像" field.
    data['商品图像'] = image_url

    # Write the updated data back to the JSON file.
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    print("已成功添加“商品图像”字段。")


def add_info(json_folder, image_folder):
    """Attach an image path to every JSON file that has a matching PNG.

    For each ``<name>.json`` in ``json_folder`` the file is updated with
    ``./database/image/<name>.png`` when ``<name>.png`` exists in
    ``image_folder``; otherwise the file is counted as missing its image.

    Args:
        json_folder: Directory containing the JSON files.
        image_folder: Directory containing the rendered PNG images.
    """
    miss = 0
    for filename in tqdm(os.listdir(json_folder)):
        stem, ext = os.path.splitext(filename)
        if ext != ".json":
            # Only JSON files can be updated by add_image.
            continue
        # Swap only the extension; str.replace("json", "png") would also
        # rewrite a "json" substring inside the item name itself.
        image_name = stem + ".png"
        json_file_path = os.path.join(json_folder, filename)
        image_url = os.path.join("./database/image", image_name)
        image_exit = os.path.join(image_folder, image_name)
        if os.path.exists(image_exit):
            print(f"正在处理：{image_url}")
            add_image(json_file_path, image_url)
        else:
            miss += 1
            print(f"图片文件不存在：{image_exit}")
    print(f"图片文件缺失数量：{miss}")


def _copy_files(src_dir, dst_dir):
    """Copy every regular file directly inside ``src_dir`` into ``dst_dir``."""
    os.makedirs(dst_dir, exist_ok=True)
    for name in os.listdir(src_dir):
        src = os.path.join(src_dir, name)
        if os.path.isfile(src):
            shutil.copy(src, os.path.join(dst_dir, name))


def pdf2db(pdf_path):
    """Run the full pipeline for one PDF and report the outcome as text.

    Args:
        pdf_path: Path of the uploaded PDF; a falsy value means no upload.

    Returns:
        A status message for display in the UI. Errors raised by any step
        are caught and returned as part of the message.
    """
    # Check whether a file was uploaded.
    if not pdf_path:
        return "请先上传PDF文件"

    try:
        json_folder = pdf2json(pdf_path)
        image_folder = pdf2image(pdf_path)
        add_info(json_folder, image_folder)
        # Copy with shutil instead of a shell `cp`: the folder names derive
        # from the uploaded file name, which must never reach a shell, and
        # copy failures now surface as exceptions instead of being ignored.
        _copy_files(json_folder, "./database/meta/")
        _copy_files(image_folder, "./database/image/")
        return "已完成PDF解析..."
    except Exception as e:
        return f"解析过程中发生错误：{str(e)}"


# Build the Gradio interface.
# NOTE(review): the original indentation was lost; which components sit
# inside the Row is a reconstruction -- confirm the intended layout.
with gr.Blocks(title="PDF解析器") as app:
    gr.Markdown("## PDF文件解析工具")
    gr.Markdown("上传PDF文件后点击解析按钮查看文本内容")

    with gr.Row():
        # File upload component.
        file_input = gr.File(
            label="选择PDF文件",
            file_types=[".pdf"],
            file_count="single"
        )

    # Parse button.
    parse_btn = gr.Button("解析PDF", variant="primary")

    # Result display component.
    text_output = gr.Textbox(
        label="解析结果",
        placeholder="解析后的文本将显示在此处...",
        lines=20,
        max_lines=50
    )

    # Bind the button click event.
    parse_btn.click(
        fn=pdf2db,
        inputs=file_input,
        outputs=text_output
    )


if __name__ == "__main__":
    app.launch(server_name='0.0.0.0', server_port=1111)