Spaces:
Sleeping
Sleeping
# app.py | |
import os | |
import shutil | |
import subprocess | |
import tempfile | |
import json | |
import pypandoc | |
import gradio as gr | |
def run_marker(pdf_path, out_dir, fmt):
    """Convert a single PDF by shelling out to the ``marker_single`` CLI.

    Args:
        pdf_path: Path of the PDF file handed to Marker.
        out_dir: Directory passed to Marker as ``--output_dir``.
        fmt: Value passed to Marker as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # Build the argv piece by piece: executable + input first, then options.
    marker_args = ["marker_single", pdf_path]
    marker_args += ["--output_format", fmt]
    marker_args += ["--output_dir", out_dir]
    # NOTE(review): both switches are given an explicit "True" value; whether
    # marker_single accepts that form depends on the installed Marker
    # version -- confirm against its CLI help.
    marker_args += ["--extract_images", "True"]
    marker_args += ["--paginate_output", "True"]

    # A list argv (no shell) keeps the path arguments from being re-parsed.
    subprocess.run(marker_args, check=True)
def collect_outputs(out_dir, ext):
    """Recursively gather all files under ``out_dir`` with the given extension.

    The extension match is case-insensitive. Results are returned in a
    deterministic order: directories are visited top-down in sorted order and
    file names are sorted within each directory, so callers that concatenate
    the files get the same sequence on every run and every filesystem.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        A list of full paths (``out_dir`` joined with the relative location)
        of the matching files. Empty if nothing matches or ``out_dir`` does
        not exist.
    """
    suffix = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # Sorting in place steers os.walk's descent; without it the order of
        # sub-directories is whatever the filesystem happens to return.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of a downloadable result.

    The PDF is run through Marker into a scratch directory, the generated
    page files are read back, and the result is written to a fresh temporary
    file whose path is returned. A path (not the content itself) is returned
    for every format because the interface's output is a file component.

    Args:
        pdf_file: The upload as delivered by Gradio: either a filesystem path
            string or an object exposing the path as ``.name``.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file. The file is
        not deleted by this function.

    Raises:
        gr.Error: If no file was uploaded.
        KeyError: If ``output_format`` is not one of the supported choices.
        subprocess.CalledProcessError: If the Marker command fails.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")
    # Depending on the Gradio version/config the upload arrives as a plain
    # path string or as a temp-file wrapper carrying the path in `.name`.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Map the Gradio choice to Marker's format. Looked up before any temp
    # resources exist so that an invalid choice leaks nothing.
    fmt = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",  # produce .md before converting to DOCX
    }[output_format]
    ext = ".json" if output_format == "json" else ".md"

    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # Read the generated pages.
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            # Combine the markdown and convert it via Pandoc.
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Image links in a page are presumably relative to that page's
            # own directory, which may be a sub-directory of out_dir, so
            # every such directory is added to Pandoc's search path.
            resource_dirs = [out_dir]
            for path in page_paths:
                page_dir = os.path.dirname(path)
                if page_dir not in resource_dirs:
                    resource_dirs.append(page_dir)

            # mkstemp creates the file atomically; the deprecated mktemp only
            # returned a name that another process could claim first.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        f"--resource-path={os.pathsep.join(resource_dirs)}"
                    ],
                )
            except Exception:
                # Do not leave an empty .docx behind when conversion fails.
                os.remove(docx_path)
                raise
            return docx_path

        if output_format == "markdown":
            text = "\n\n---\n\n".join(pages)
        else:
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        # Write the text result to its own temp file (outside out_dir, which
        # is about to be removed) and hand back the path for download.
        fd, result_path = tempfile.mkstemp(suffix=ext)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(text)
        return result_path
    finally:
        # Always drop Marker's scratch directory, including on failure.
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio Interface
# One PDF upload plus a format selector in; one downloadable file out.
# The input components are still constructed before the output component.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (with images & math). Choose **Markdown** or **JSON** "
        "to get text + LaTeX math and extracted images. Or choose **DOCX** "
        "to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(
            label="Output format",
            choices=["markdown", "json", "docx"],
            value="markdown",
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
    )