# MinerU / demo / app.py
# Commit 240e0a0 (verified): "Upload folder using huggingface_hub"
# File size: 2.12 kB
import os
import json
import gradio as gr
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config
# Enable magic_pdf's built-in model; process_pdf checks this flag before
# calling pipe_analyze() when no external model list is supplied.
model_config.__use_inside_model__ = True
def process_pdf(file_path):
    """Convert the PDF at *file_path* to Markdown text.

    The PDF bytes are fed through a ``UNIPipe`` (classify -> analyze ->
    parse -> markdown). Images extracted along the way are written by a
    ``DiskReaderWriter`` rooted at ``uploads/images``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The value returned by ``pipe.pipe_mk_markdown``, or ``None`` when
        the model list is empty while the built-in model is disabled, or
        when any exception is raised (the exception is logged).
    """
    try:
        # Use a context manager so the handle is closed even if read() fails.
        with open(file_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        model_json = []  # an empty model list means: parse with the built-in model
        jso_useful_key = {"_pdf_type": "", "model_list": model_json}

        local_image_dir = os.path.join('uploads', 'images')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(local_image_dir, exist_ok=True)
        # Only the last path component ("images") is passed to the markdown step.
        image_dir = os.path.basename(local_image_dir)
        image_writer = DiskReaderWriter(local_image_dir)

        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()

        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                logger.error("need model list input")
                return None

        pipe.pipe_parse()
        return pipe.pipe_mk_markdown(image_dir, drop_mode="none")
    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and
        # signal failure with None instead of propagating.
        logger.exception(e)
        return None
def extract_markdown_from_pdf(pdf):
    """Save an uploaded PDF under ``uploads/`` and return its Markdown.

    Args:
        pdf: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or a file-like object exposing ``name`` and
            ``read()``. Gradio's File component can deliver either form
            depending on its ``type`` setting. ``None`` means nothing was
            uploaded.

    Returns:
        Whatever ``process_pdf`` returns for the saved copy, or ``None``
        when *pdf* is ``None``.
    """
    if pdf is None:
        return None

    is_path = isinstance(pdf, (str, os.PathLike))
    source_path = os.fspath(pdf) if is_path else pdf.name

    # Read the upload BEFORE opening the destination for writing, so the
    # data survives even if both paths happen to refer to the same file.
    if is_path:
        with open(source_path, 'rb') as src:
            pdf_bytes = src.read()
    else:
        pdf_bytes = pdf.read()

    # Keep only the final path component: os.path.join() drops 'uploads'
    # entirely when the second argument is absolute, and a name containing
    # '../' could otherwise escape the uploads directory.
    os.makedirs('uploads', exist_ok=True)
    file_path = os.path.join('uploads', os.path.basename(source_path))

    # Save the uploaded PDF file
    with open(file_path, 'wb') as f:
        f.write(pdf_bytes)

    # Process the PDF file and produce the Markdown content
    return process_pdf(file_path)
def main():
    """Assemble the Gradio Blocks UI and launch it with ``share=True``."""
    # Build the Gradio interface
    with gr.Blocks() as demo:
        gr.Markdown("# PDF to Markdown Converter")

        # Upload widget and rendered result share one column inside a row.
        with gr.Row(), gr.Column():
            upload_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            markdown_view = gr.Markdown(label="Extracted Markdown")

        convert_btn = gr.Button("Extract Markdown")
        # Clicking the button runs the conversion on the uploaded file and
        # sends the result to the Markdown component.
        convert_btn.click(
            extract_markdown_from_pdf,
            inputs=[upload_input],
            outputs=[markdown_view],
        )

    demo.launch(share=True)
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()