# euler314's picture
# Update app.py
# f25ee15 verified
import os
import shutil
import subprocess
import tempfile
import json
import gradio as gr
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with ``marker_single`` and return the output text.

    Args:
        pdf_file: The uploaded PDF. Either a path (``str`` / ``os.PathLike``)
            or an object that exposes the on-disk path as ``.name``.
        output_format: ``"markdown"`` selects Markdown output; any other
            value selects JSON.

    Returns:
        For Markdown, the contents of every generated ``.md`` file joined
        with a ``---`` separator. Otherwise a JSON document of the form
        ``{"pages": [...]}`` whose entries are the raw text of each
        generated ``.json`` file.

    Raises:
        ValueError: If no file was supplied.
        FileNotFoundError: If the ``marker_single`` executable is not found.
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Plain paths are used directly; anything else is expected to carry the
    # path in ``.name``. (Checked in this order because ``pathlib.Path.name``
    # is only the final path component, not the full path.)
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    fmt = "markdown" if output_format == "markdown" else "json"
    suffix = ".md" if fmt == "markdown" else ".json"

    # The context manager removes the scratch directory even when the
    # converter fails, so failed conversions no longer leak temp dirs.
    with tempfile.TemporaryDirectory() as out_dir:
        cmd = [
            "marker_single",
            pdf_path,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",
        ]
        subprocess.run(cmd, check=True)

        # Recursively read only the files matching the requested format.
        pages = []
        for root, dirs, files in os.walk(out_dir):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            for fname in sorted(files):
                if fname.lower().endswith(suffix):
                    path = os.path.join(root, fname)
                    with open(path, "r", encoding="utf-8") as f:
                        pages.append(f.read())

    if fmt == "markdown":
        return "\n\n---\n\n".join(pages)
    # NOTE(review): each entry is the raw file text, so JSON produced by the
    # converter is embedded as a string rather than as a nested object --
    # confirm whether consumers expect parsed objects here.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
# Gradio UI: a PDF upload plus an output-format selector, with the converted
# text shown in a code viewer.
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Converted Output"),
    # The title uses a right arrow (U+2192).
    title="PDF → Markdown/JSON with LaTeX Support",
    description=(
        "Upload a PDF and get back Markdown or structured JSON, "
        "with math preserved as LaTeX."
    ),
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)