Spaces:

euler314
/

file_extension_change

Running

App Files Files Community

file_extension_change / app.py

euler314

Update app.py

ec386e0 verified 3 days ago

raw

history blame

2.67 kB

	# app.py
	import os, json
	import gradio as gr

	# MinerU API imports
	from magic_pdf.data.read_api import read_local_pdfs
	from magic_pdf.data.data_reader_writer import FileBasedDataWriter
	from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
	from magic_pdf.config.enums import SupportedPdfParseMethod

	def convert_with_mineru(pdf_path, out_format):
	    """Convert a PDF to Markdown or JSON text using the MinerU pipeline.

	    Args:
	        pdf_path: Location of the PDF. Either a filesystem path (``str`` or
	            ``os.PathLike``) or an upload object that exposes its path as
	            ``.name``.
	        out_format: ``"markdown"`` returns the Markdown of every parsed
	            document joined by a ``---`` separator. Any other value returns
	            a JSON array holding one ``{"markdown", "content_list"}``
	            object per document.

	    Returns:
	        str: The converted content.

	    Raises:
	        ValueError: If ``pdf_path`` is ``None`` (no file was supplied).
	    """
	    if pdf_path is None:
	        raise ValueError("No PDF file was provided.")

	    # NOTE(review): depending on the Gradio version, gr.File may pass a plain
	    # path or a temp-file wrapper whose ``.name`` holds the path; both are
	    # accepted here. Path-like objects are handled first because
	    # ``pathlib.Path.name`` is only the final path component.
	    if isinstance(pdf_path, (str, os.PathLike)):
	        source_path = os.fspath(pdf_path)
	    else:
	        source_path = getattr(pdf_path, "name", pdf_path)

	    # 1) Read file into MinerU dataset(s)
	    datasets = read_local_pdfs(source_path)

	    # 2) Prepare writers: Markdown/JSON go to ./output, images to ./output/images
	    tmp_dir = "output"
	    img_dir = os.path.join(tmp_dir, "images")
	    os.makedirs(img_dir, exist_ok=True)
	    md_writer = FileBasedDataWriter(tmp_dir)
	    img_writer = FileBasedDataWriter(img_dir)
	    img_dir_name = os.path.basename(img_dir)

	    # Output names depend only on the input path, so compute them once.
	    # Every dataset reuses the same file names; each file is read back
	    # right after it is written, so the returned content is per-document
	    # even though the files on disk are overwritten.
	    basename = os.path.splitext(os.path.basename(source_path))[0]
	    md_fname = f"{basename}.md"
	    json_fname = f"{basename}_content_list.json"

	    all_pages = []

	    for ds in datasets:
	        # 3) Classify & infer (OCR pipeline vs. text pipeline)
	        if ds.classify() == SupportedPdfParseMethod.OCR:
	            infer = ds.apply(doc_analyze, ocr=True)
	            pipe = infer.pipe_ocr_mode(img_writer)
	        else:
	            infer = ds.apply(doc_analyze, ocr=False)
	            pipe = infer.pipe_txt_mode(img_writer)

	        # 4) Dump per-document Markdown and read it back
	        pipe.dump_md(md_writer, md_fname, img_dir_name)
	        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
	            page_md = f.read()

	        # 5) Dump the structured content list (JSON) and load it back
	        pipe.dump_content_list(md_writer, json_fname, img_dir_name)
	        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
	            content_list = json.load(f)

	        all_pages.append({
	            "markdown": page_md,
	            "content_list": content_list,
	        })

	    # 6) Return desired format
	    if out_format == "markdown":
	        # Concatenate all documents
	        return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
	    return json.dumps(all_pages, ensure_ascii=False, indent=2)

	# Gradio UI: a PDF upload plus an output-format selector feed
	# convert_with_mineru; the returned text is shown in a code viewer.
	demo = gr.Interface(
	    title="MinerU-Powered PDF → Markdown/JSON",
	    description=(
	        "Leverage the advanced MinerU engine to extract text, images, tables, "
	        "and formulas from your PDF into clean Markdown or structured JSON."
	    ),
	    fn=convert_with_mineru,
	    inputs=[
	        gr.File(label="Upload PDF", file_types=[".pdf"]),
	        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
	    ],
	    outputs=gr.Code(label="Result"),
	)

	if __name__ == "__main__":
	    # Listen on all network interfaces (0.0.0.0) at port 7860 when the
	    # file is run as a script.
	    demo.launch(server_name="0.0.0.0", server_port=7860)