Spaces:

euler314
/

file_extension_change

Sleeping

File size: 3,048 Bytes

cdb52cd
 
f25ee15
 
 
 
 
b89a1c3
 
683fa93
 
b89a1c3
 
0532015
 
b89a1c3
0532015
 
dac7551
 
0532015
 
f25ee15
b89a1c3
cdb52cd
f25ee15
 
 
b89a1c3
f25ee15
b89a1c3
 
 
cdb52cd
b89a1c3
cdb52cd
 
 
 
 
dac7551
cdb52cd
 
 
b89a1c3
f25ee15
cdb52cd
 
f25ee15
b89a1c3
0532015
f25ee15
 
cdb52cd
b89a1c3
 
 
 
f25ee15
b89a1c3
 
 
 
 
 
 
 
cdb52cd
b89a1c3
 
 
cdb52cd
dac7551
e219826
dac7551
e219826
dac7551
 
 
 
e219826
b89a1c3
683fa93
e219826
 
0532015
cdb52cd
 
 
 
 
e219826
b89a1c3
 
e219826
cdb52cd
b89a1c3
 
0532015
683fa93
 
b89a1c3
dac7551

# app.py

import os
import shutil
import subprocess
import tempfile
import json

import pypandoc
import gradio as gr

def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` CLI on a single PDF.

    Args:
        pdf_path: Path of the PDF handed to ``marker_single``.
        out_dir: Directory passed as ``--output_dir``.
        fmt: Value passed as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # Options that take a value, in the order they appear on the command line.
    valued_options = ("--output_format", fmt, "--output_dir", out_dir)
    # Standalone switches that take no value.
    switches = ("--extract_images", "--paginate_output")

    argv = ["marker_single", pdf_path, *valued_options, *switches]
    subprocess.run(argv, check=True)

def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* whose name ends with *ext*.

    Matching is case-insensitive on both the file name and *ext*. Directories
    are visited in sorted order and file names are sorted within each
    directory, so the result order is the same on every run.

    Args:
        out_dir: Root directory to walk.
        ext: Suffix to match (e.g. ``".md"``), or a tuple of suffixes.

    Returns:
        A list of paths, each built by joining the directory being visited
        with the file name. Files of a directory come before the files of
        its subdirectories.
    """
    # str.endswith accepts a str or a tuple of str; normalize either form.
    if isinstance(ext, str):
        suffix = ext.lower()
    else:
        suffix = tuple(e.lower() for e in ext)

    collected = []
    for root, dirs, files in os.walk(out_dir):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order instead of whatever order the filesystem lists them.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected

def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with Marker and return the path of the result file.

    Marker runs inside a scratch directory that is always removed before
    returning, even when a step fails. The result is written to a separate
    temporary file outside that directory so it survives the cleanup; a file
    path (not the document text) is returned because the interface's output
    component is ``gr.File``.

    Args:
        pdf_file: The uploaded PDF, either as a path (``str`` / ``os.PathLike``)
            or as an object exposing its path through ``.name``.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path of a temporary ``.md``, ``.json`` or ``.docx`` file holding the
        converted document.

    Raises:
        ValueError: If no file was uploaded.
        KeyError: If ``output_format`` is not one of the supported choices.
        subprocess.CalledProcessError: If the Marker CLI exits with an error.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Map the UI choice to Marker's output format.
    marker_formats = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",  # DOCX is produced from Markdown via Pandoc
    }
    fmt = marker_formats[output_format]

    # NOTE(review): depending on the Gradio version/configuration the File
    # input may deliver a path or a file-like wrapper; accept both.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # Read every generated page back as raw text.
        ext = ".json" if output_format == "json" else ".md"
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Pages are collected recursively, so relative image links in them
            # may point into subfolders; let Pandoc search those folders as
            # well as the scratch root. dict.fromkeys de-duplicates in order.
            search_dirs = dict.fromkeys(
                [out_dir, *(os.path.dirname(p) for p in page_paths)]
            )
            resource_path = os.pathsep.join(search_dirs)

            # mkstemp creates the file atomically (mktemp is race-prone);
            # Pandoc then overwrites it.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[f"--resource-path={resource_path}"],
                )
            except Exception:
                # Do not leave an empty placeholder behind on failure.
                os.remove(docx_path)
                raise
            return docx_path

        if output_format == "markdown":
            suffix = ".md"
            text = "\n\n---\n\n".join(pages)
        else:
            suffix = ".json"
            # Each page is embedded as the raw text read from disk.
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        fd, result_path = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(text)
        return result_path
    finally:
        # Always drop the scratch directory; a cleanup failure must not mask
        # the real result or the original exception.
        shutil.rmtree(out_dir, ignore_errors=True)

# Gradio UI: a PDF upload plus a format selector, producing one downloadable file.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(
            label="Output format",
            value="markdown",
            choices=["markdown", "json", "docx"],
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")