Spaces:
Running
Running
import os | |
import shutil | |
import subprocess | |
import tempfile | |
import json | |
import gradio as gr | |
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with the ``marker_single`` CLI and return its text.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path string or as
            an object whose ``name`` attribute holds that path.
        output_format: ``"markdown"`` selects Markdown output; any other
            value selects JSON.

    Returns:
        str: For Markdown, the contents of every generated ``.md`` file
        joined by ``---`` separators. Otherwise a JSON document of the form
        ``{"pages": [...]}`` whose entries are the raw text of each
        generated ``.json`` file.

    Raises:
        ValueError: If no file was uploaded.
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
        FileNotFoundError: If the ``marker_single`` executable is missing.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    fmt = "markdown" if output_format == "markdown" else "json"
    suffix = ".md" if fmt == "markdown" else ".json"

    # The context manager removes the working directory even when the
    # converter or a file read raises, so failed runs do not leak temp dirs.
    with tempfile.TemporaryDirectory() as out_dir:
        cmd = [
            "marker_single",
            pdf_path,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",
        ]
        subprocess.run(cmd, check=True)

        # Recursively collect only the files matching the requested format.
        pages = []
        for root, dirs, files in os.walk(out_dir):
            dirs.sort()  # make the traversal order deterministic
            for fname in sorted(files):
                if not fname.lower().endswith(suffix):
                    continue
                with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                    pages.append(f.read())

    if fmt == "markdown":
        return "\n\n---\n\n".join(pages)
    # NOTE(review): each entry is the raw text of a .json file, so the result
    # nests JSON as escaped strings -- confirm whether parsed objects are wanted.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
# Gradio UI: a PDF upload and an output-format selector are passed to
# process_upload, and the returned text is shown in a code viewer.
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Converted Output"),
    title="PDF → Markdown/JSON with LaTeX Support",
    description=(
        "Upload a PDF and get back Markdown or structured JSON, "
        "with math preserved as LaTeX."
    ),
)
# Launch the web server only when this file is executed as a script,
# listening on every network interface at port 7860.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )