Spaces:
Running
Running
# app.py | |
import os, json | |
import gradio as gr | |
# MinerU API imports | |
from magic_pdf.data.read_api import read_local_pdfs | |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter | |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze | |
from magic_pdf.config.enums import SupportedPdfParseMethod | |
def convert_with_mineru(pdf_path, out_format):
    """Convert a PDF to Markdown or JSON text using the MinerU pipeline.

    Args:
        pdf_path: Location of the PDF. Either a path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``,
            which is what older Gradio ``File`` components pass in.
        out_format: ``"markdown"`` returns the Markdown of every parsed
            dataset joined by ``---`` separators. Any other value returns
            a JSON string.

    Returns:
        str: Either the concatenated Markdown, or a pretty-printed JSON
        array with one ``{"markdown": ..., "content_list": ...}`` object
        per parsed dataset.

    Raises:
        ValueError: If no file was supplied (``None`` or an empty path).
    """
    if not pdf_path:
        raise ValueError("No PDF file was provided.")
    # Unwrap temp-file style objects, but leave real paths alone:
    # ``pathlib.Path.name`` is only the final component, not the full path.
    if not isinstance(pdf_path, (str, os.PathLike)):
        pdf_path = getattr(pdf_path, "name", pdf_path)

    # 1) Read the file into MinerU dataset(s).
    datasets = read_local_pdfs(pdf_path)

    # Prepare the writers: Markdown/JSON go to ``output``, images below it.
    out_dir = "output"
    img_dir = os.path.join(out_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(out_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # These names depend only on the input path, so compute them once.
    basename = os.path.splitext(os.path.basename(pdf_path))[0]
    md_fname = f"{basename}.md"
    json_fname = f"{basename}_content_list.json"
    img_ref_dir = os.path.basename(img_dir)

    all_pages = []
    for ds in datasets:
        # 2) Classify the document and run the matching analysis mode.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # 3) Dump the Markdown and read it straight back. Every dataset is
        #    written under the same file name, so the content has to be
        #    collected before the next iteration overwrites it.
        pipe.dump_md(md_writer, md_fname, img_ref_dir)
        with open(os.path.join(out_dir, md_fname), "r", encoding="utf-8") as f:
            page_md = f.read()

        # 4) Dump and collect the structured content list.
        pipe.dump_content_list(md_writer, json_fname, img_ref_dir)
        with open(os.path.join(out_dir, json_fname), "r", encoding="utf-8") as f:
            content_list = json.load(f)

        all_pages.append({
            "markdown": page_md,
            "content_list": content_list,
        })

    # 5) Return the requested representation.
    if out_format == "markdown":
        return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
    return json.dumps(all_pages, ensure_ascii=False, indent=2)
# Web UI: a PDF upload and an output-format selector feed
# convert_with_mineru; its string result is shown in a code box.
demo = gr.Interface(
    title="MinerU-Powered PDF → Markdown/JSON",
    description=(
        "Leverage the advanced MinerU engine to extract text, images, "
        "tables, and formulas from your PDF into clean Markdown or "
        "structured JSON."
    ),
    fn=convert_with_mineru,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(["markdown", "json"], label="Output format", value="markdown"),
    ],
    outputs=gr.Code(label="Result"),
)
if __name__ == "__main__":
    # Start the web server only when run as a script, listening on
    # address 0.0.0.0 and port 7860.
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )