# NOTE: scraped file-viewer header, kept as a comment so the module parses:
#   euler314's picture | Update app.py | ec386e0 verified | raw | history blame | 2.67 kB
# app.py
import os, json
import gradio as gr
# MinerU API imports
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
def convert_with_mineru(pdf_path, out_format):
    """Convert a PDF to Markdown or JSON using the MinerU pipeline.

    Args:
        pdf_path: Path of the PDF to convert. An upload wrapper object that
            carries the path in a ``name`` attribute is accepted as well.
        out_format: ``"markdown"`` returns the Markdown of every parsed
            dataset joined by ``---`` separators; any other value returns
            a JSON string.

    Returns:
        str: The concatenated Markdown, or a pretty-printed JSON list with
        one ``{"markdown": ..., "content_list": ...}`` entry per dataset.

    Raises:
        gr.Error: If no file was supplied.
    """
    if pdf_path is None:
        # Fail early with a readable message instead of an opaque error
        # raised from inside the PDF reader.
        raise gr.Error("Please upload a PDF file first.")
    if not isinstance(pdf_path, (str, os.PathLike)):
        # Upload wrappers expose the on-disk location via `.name`.
        pdf_path = getattr(pdf_path, "name", pdf_path)

    # 1) Read file into MinerU dataset(s)
    datasets = read_local_pdfs(pdf_path)

    # Prepare writers. The output directory is relative to the current
    # working directory and is reused by every call.
    tmp_dir = "output"
    img_dir = os.path.join(tmp_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # File names depend only on the input path, so compute them once.
    # Every dataset writes to the same names; each file is read back right
    # after it is dumped, so later iterations overwriting it is harmless.
    basename = os.path.splitext(os.path.basename(pdf_path))[0]
    md_fname = f"{basename}.md"
    json_fname = f"{basename}_content_list.json"
    img_dir_name = os.path.basename(img_dir)

    all_pages = []
    for ds in datasets:
        # 2) Classify & infer
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer = ds.apply(doc_analyze, ocr=True)
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            infer = ds.apply(doc_analyze, ocr=False)
            pipe = infer.pipe_txt_mode(img_writer)

        # 3) Dump per-document Markdown and read it back
        pipe.dump_md(md_writer, md_fname, img_dir_name)
        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
            page_md = f.read()

        # 4) Dump the structured content list and read it back
        pipe.dump_content_list(md_writer, json_fname, img_dir_name)
        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
            content_list = json.load(f)

        all_pages.append({
            "markdown": page_md,
            "content_list": content_list,
        })

    # 5) Return desired format
    if out_format == "markdown":
        # Concatenate all documents
        return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
    return json.dumps(all_pages, ensure_ascii=False, indent=2)
# Gradio UI: build the components first, then wire them to the converter.
_pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
_format_choice = gr.Radio(["markdown","json"], value="markdown", label="Output format")
_result_view = gr.Code(label="Result")

demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[_pdf_input, _format_choice],
    outputs=_result_view,
    title="MinerU-Powered PDF → Markdown/JSON",
    description=(
        "Leverage the advanced MinerU engine to extract text, images, tables, "
        "and formulas from your PDF into clean Markdown or structured JSON."
    ),
)

if __name__ == "__main__":
    # Serve on all network interfaces, port 7860, when run as a script.
    demo.launch(server_name="0.0.0.0", server_port=7860)