Spaces:
Running
Running
File size: 3,048 Bytes
cdb52cd f25ee15 b89a1c3 683fa93 b89a1c3 0532015 b89a1c3 0532015 dac7551 0532015 f25ee15 b89a1c3 cdb52cd f25ee15 b89a1c3 f25ee15 b89a1c3 cdb52cd b89a1c3 cdb52cd dac7551 cdb52cd b89a1c3 f25ee15 cdb52cd f25ee15 b89a1c3 0532015 f25ee15 cdb52cd b89a1c3 f25ee15 b89a1c3 cdb52cd b89a1c3 cdb52cd dac7551 e219826 dac7551 e219826 dac7551 e219826 b89a1c3 683fa93 e219826 0532015 cdb52cd e219826 b89a1c3 e219826 cdb52cd b89a1c3 0532015 683fa93 b89a1c3 dac7551 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# app.py
import os
import shutil
import subprocess
import tempfile
import json
import pypandoc
import gradio as gr
def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` command-line tool on one PDF.

    Args:
        pdf_path: Path of the PDF handed to the tool as its positional argument.
        out_dir: Directory passed as ``--output_dir``.
        fmt: Value passed as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status (the call uses ``check=True``).
    """
    # Options that take a value, as flag/value pairs.
    valued_options = ["--output_format", fmt, "--output_dir", out_dir]
    # Value-less switches: image extraction and paginated output.
    switches = ["--extract_images", "--paginate_output"]

    # An argument list (no shell) keeps paths with spaces or metacharacters safe.
    subprocess.run(
        ["marker_single", pdf_path, *valued_options, *switches],
        check=True,
    )
def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* with the given extension.

    The match is case-insensitive on both the file name and *ext*. Results
    come back in a deterministic order: directories are visited top-down in
    sorted order, and the files inside each directory are sorted by name.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        A list of paths (each joined onto *out_dir*); empty if nothing matches
        or the directory does not exist.
    """
    suffix = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk lists entries in arbitrary filesystem order; sorting the
        # directory list in place fixes the order of the top-down traversal.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of a downloadable result.

    The PDF is run through ``run_marker`` into a scratch directory, the
    generated files are read back, and the result is written to a fresh
    temporary file whose path is returned. A path (not the document text) is
    returned because the interface's output component is a file download.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path or as an
            object exposing the path through ``.name`` (Gradio may pass
            either, depending on the File component's configuration).
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file. The file is
        not deleted here; the caller owns it.

    Raises:
        ValueError: If no file was uploaded.
        KeyError: If *output_format* is not one of the supported choices.
        subprocess.CalledProcessError: If the Marker CLI fails.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Map the UI choice to Marker's format before touching the filesystem, so
    # an unsupported choice cannot leave a scratch directory behind.
    fmt = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",  # still produce .md before converting to DOCX
    }[output_format]

    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # Read back every generated file of the expected type.
        # NOTE(review): for "json" this also picks up any other *.json files
        # Marker may write next to the main output (e.g. metadata) — confirm
        # whether those should be filtered out.
        ext = ".json" if output_format == "json" else ".md"
        paths = collect_outputs(out_dir, ext)
        pages = []
        for path in paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            # Combine the markdown and convert it with Pandoc.
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Relative image links in the markdown resolve against the folder
            # each file was found in, so those folders are searched as well.
            resource_dirs = [out_dir]
            for path in paths:
                folder = os.path.dirname(path)
                if folder not in resource_dirs:
                    resource_dirs.append(folder)

            # mkstemp creates the file atomically, unlike the deprecated,
            # race-prone mktemp.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        f"--resource-path={os.pathsep.join(resource_dirs)}"
                    ],
                )
            except Exception:
                # Do not leave an empty placeholder behind on failure.
                os.remove(docx_path)
                raise
            return docx_path

        # Markdown / JSON: build the text, then persist it for download.
        if output_format == "markdown":
            result = "\n\n---\n\n".join(pages)
            suffix = ".md"
        else:
            result = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
            suffix = ".json"
        with tempfile.NamedTemporaryFile(
            "w", suffix=suffix, delete=False, encoding="utf-8"
        ) as out_file:
            out_file.write(result)
        return out_file.name
    finally:
        # Always remove Marker's scratch output, on success and on failure.
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio Interface
# One PDF upload and a format selector go in; a single downloadable file
# comes out of process_upload.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description="".join((
        "Upload a PDF (with images & math). ",
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. ",
        "Or choose **DOCX** to get a Word document with everything embedded.",
    )),
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(
            choices=["markdown", "json", "docx"],
            value="markdown",
            label="Output format",
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

if __name__ == "__main__":
    # Only start the server when executed as a script: all interfaces, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|