# app.py
"""Gradio app that converts an uploaded PDF to Markdown, JSON, or DOCX.

The PDF is converted with the ``marker_single`` CLI; the DOCX variant is
produced by converting Marker's Markdown output with Pandoc (via pypandoc).
"""

import json
import os
import shutil
import subprocess
import tempfile

import gradio as gr
import pypandoc

# Separator inserted between the contents of the collected output files.
PAGE_SEPARATOR = "\n\n---\n\n"

# Maps the Gradio radio choice to the format passed to Marker.
MARKER_FORMATS = {
    "markdown": "markdown",
    "json": "json",
    "docx": "markdown",  # still produce .md before converting to DOCX
}


def run_marker(pdf_path, out_dir, fmt):
    """Run the ``marker_single`` CLI on *pdf_path*, writing into *out_dir*.

    Args:
        pdf_path: Path of the PDF to convert.
        out_dir: Directory passed to Marker as ``--output_dir``.
        fmt: Value passed to Marker as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If Marker exits with a non-zero status.
        FileNotFoundError: If the ``marker_single`` executable is not found.
    """
    cmd = [
        "marker_single",
        pdf_path,
        "--output_format", fmt,
        "--output_dir", out_dir,
        "--extract_images",   # standalone flag
        "--paginate_output",  # standalone flag
    ]
    subprocess.run(cmd, check=True)


def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* whose name ends with *ext*.

    The extension match is case-insensitive. Directories and file names are
    visited in sorted order so the result is deterministic.

    Returns:
        A list of file paths.
    """
    collected = []
    for root, dirs, files in os.walk(out_dir):
        dirs.sort()  # in-place sort makes os.walk descend in a stable order
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(ext)
        )
    return collected


def _write_temp_text(content, suffix):
    """Write *content* to a new UTF-8 temp file and return its path."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(content)
    return path


def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the resulting file.

    Args:
        pdf_file: The upload from ``gr.File``; either a filesystem path or an
            object exposing the path as ``.name``.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file, suitable for
        the ``gr.File`` output component.

    Raises:
        gr.Error: If no file was uploaded.
        KeyError: If *output_format* is not a supported choice.
        subprocess.CalledProcessError: If Marker fails.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    # Depending on the Gradio version/config the upload arrives as a path
    # string or as a tempfile-like wrapper with a ``.name`` attribute.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    fmt = MARKER_FORMATS[output_format]

    # Scratch directory for Marker outputs; always removed in ``finally``.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # NOTE(review): for "json" this collects every .json file Marker
        # writes, which may include metadata files - confirm that is intended.
        ext = ".json" if output_format == "json" else ".md"
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(PAGE_SEPARATOR.join(pages))

            # Let Pandoc resolve image links relative to the directories the
            # Markdown files came from, not just the top-level output dir.
            search_dirs = dict.fromkeys(
                [out_dir, *(os.path.dirname(p) for p in page_paths)]
            )
            resource_path = os.pathsep.join(search_dirs)

            # mkstemp creates the file atomically (mktemp is race-prone and
            # deprecated); Pandoc then overwrites the empty placeholder.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[f"--resource-path={resource_path}"],
                )
            except Exception:
                # Do not leave an empty placeholder behind on failure.
                if os.path.exists(docx_path):
                    os.remove(docx_path)
                raise
            return docx_path

        # Markdown / JSON: the output component is a gr.File, so the text has
        # to be handed back as a file path rather than as the content itself.
        if output_format == "markdown":
            return _write_temp_text(PAGE_SEPARATOR.join(pages), ".md")
        result = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
        return _write_temp_text(result, ".json")
    finally:
        shutil.rmtree(out_dir, ignore_errors=True)


# Gradio Interface
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(
            choices=["markdown", "json", "docx"],
            value="markdown",
            label="Output format",
        ),
    ],
    outputs=gr.File(label="Download Result"),
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)