euler314's picture
Update app.py
cdb52cd verified
raw
history blame
3.02 kB
# app.py
import os
import shutil
import subprocess
import tempfile
import json
import pypandoc
import gradio as gr
def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` command-line tool on a single PDF.

    Args:
        pdf_path: Path of the PDF passed to the tool as its positional argument.
        out_dir: Directory passed as ``--output_dir``.
        fmt: Value passed as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (the call uses ``check=True``).
    """
    # Each option is spelled out as a (flag, value) pair; the boolean options
    # are passed with an explicit "True" value.
    option_pairs = (
        ("--output_format", fmt),
        ("--output_dir", out_dir),
        ("--extract_images", "True"),
        ("--paginate_output", "True"),
    )
    argv = ["marker_single", pdf_path]
    for flag, value in option_pairs:
        argv += [flag, value]
    subprocess.run(argv, check=True)
def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* whose name ends with *ext*.

    The extension comparison is case-insensitive on both sides. Results are
    returned in a reproducible order: files of a directory come first (sorted
    by name), followed by its subdirectories visited in sorted order.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        A list of paths (each joined onto its containing directory); empty
        when nothing matches or *out_dir* does not exist.
    """
    suffix = ext.lower()
    matches = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk (top-down) honours in-place changes to ``dirs``; sorting it
        # makes the traversal order independent of the filesystem.
        dirs.sort()
        matches.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return matches
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of a downloadable result.

    The PDF is converted with Marker into a temporary working directory, the
    generated pages are read back, and the result is written to a fresh
    temporary file whose path is returned. A path (not the file content) is
    returned for every format because this function is wired to a ``gr.File``
    output component.

    Args:
        pdf_file: The uploaded file, given either as an object exposing its
            path through ``.name`` or as a plain path string.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path of a newly created ``.md``, ``.json`` or ``.docx`` temporary file.

    Raises:
        ValueError: If no file was uploaded (``pdf_file`` is ``None``).
        KeyError: If ``output_format`` is not one of the supported choices.
        subprocess.CalledProcessError: If the Marker command fails.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    # Accept both an object carrying the path in ``.name`` and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Resolve the Marker format first so an unsupported choice fails before
    # any temporary directory exists. DOCX is produced from Markdown.
    marker_format = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",
    }[output_format]

    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, marker_format)

        # Read every generated page file back into memory.
        ext = ".json" if output_format == "json" else ".md"
        pages = []
        for path in collect_outputs(out_dir, ext):
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            # Combine the Markdown pages and let Pandoc build the document.
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))
            # mkstemp creates the file atomically (mktemp is race-prone);
            # Pandoc then overwrites it by name.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[f"--resource-path={out_dir}"],
                )
            except Exception:
                # Do not leave an empty placeholder behind on failure.
                os.remove(docx_path)
                raise
            return docx_path

        if output_format == "markdown":
            suffix = ".md"
            payload = "\n\n---\n\n".join(pages)
        else:
            suffix = ".json"
            payload = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        fd, result_path = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(payload)
        return result_path
    finally:
        # Always remove the working directory, including on errors; the
        # result file lives outside it and is unaffected.
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio UI: a PDF upload and an output-format selector feed process_upload;
# its result is offered through a file component.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (with images & math). Choose **Markdown** or **JSON** "
        "to get text + LaTeX math and extracted images. Or choose **DOCX** "
        "to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(
            label="Output format",
            value="markdown",
            choices=["markdown", "json", "docx"],
        ),
    ],
    outputs=gr.File(label="Download Result"),
)
# Start the web server only when this file is executed as a script:
# host 0.0.0.0, port 7860, public share link disabled.
if __name__ == "__main__":
    demo.launch(
        share=False,
        server_port=7860,
        server_name="0.0.0.0",
    )