Spaces:
Sleeping
Sleeping
| # app.py | |
| import os | |
| import shutil | |
| import subprocess | |
| import tempfile | |
| import json | |
| import pypandoc | |
| import gradio as gr | |
def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` CLI on a single PDF.

    Args:
        pdf_path: Path of the PDF passed to ``marker_single``.
        out_dir: Directory passed as ``--output_dir``.
        fmt: Value passed as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (the call uses ``check=True``).
    """
    # Options that take a value, followed by the two standalone switches.
    valued_options = ["--output_format", fmt, "--output_dir", out_dir]
    switches = ["--extract_images", "--paginate_output"]
    command = ["marker_single", pdf_path, *valued_options, *switches]
    subprocess.run(command, check=True)
def collect_outputs(out_dir, ext):
    """Recursively gather the paths of all files under *out_dir* ending in *ext*.

    The file name is lower-cased before the suffix comparison, so a
    lower-case *ext* such as ``".md"`` also matches ``"PAGE.MD"``.

    Args:
        out_dir: Root directory to walk.
        ext: File-name suffix to match, e.g. ``".md"`` or ``".json"``.

    Returns:
        A list of matching paths in a deterministic order: directories are
        visited top-down in sorted order, and the files inside each
        directory are sorted by name.
    """
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk lists subdirectories in arbitrary filesystem order; sorting
        # the list in place makes the traversal (and thus the order in which
        # outputs are later concatenated) reproducible.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(ext)
        )
    return collected
def _write_temp_text(text, suffix):
    """Write *text* to a new UTF-8 temporary file and return the file's path."""
    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, delete=False, encoding="utf-8"
    ) as handle:
        handle.write(text)
        return handle.name


def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the downloadable result.

    The PDF is run through Marker into a scratch directory, the generated
    ``.md``/``.json`` files are read back, and the result is materialised as
    a single file, because the Interface's output component is ``gr.File``,
    which needs a file path rather than the document text.

    Args:
        pdf_file: The upload from the ``gr.File`` input; either a path or an
            object exposing the path as ``.name``.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file.

    Raises:
        gr.Error: If no file was uploaded.
        KeyError: If *output_format* is not one of the supported choices.
        subprocess.CalledProcessError: If the Marker CLI fails.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")
    # NOTE(review): depending on the Gradio version/configuration the upload
    # arrives either as a plain path or as a tempfile-like object with
    # ``.name`` — accept both; confirm against the deployed Gradio version.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Map the UI choice to Marker's format. Done before any temp directory
    # exists so an unknown choice cannot leak one.
    fmt = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",  # still produce .md before converting to DOCX
    }[output_format]

    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # Read back everything Marker generated for the requested format.
        ext = ".json" if output_format == "json" else ".md"
        pages = []
        for path in collect_outputs(out_dir, ext):
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        if output_format == "docx":
            # Combine the markdown and let Pandoc resolve resources relative
            # to the Marker output directory.
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))
            # mkstemp creates the file atomically; the deprecated mktemp only
            # returns a name that another process could claim first.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[f"--resource-path={out_dir}"],
                )
            except Exception:
                # Do not leave an empty .docx behind when conversion fails.
                os.remove(docx_path)
                raise
            return docx_path

        if output_format == "markdown":
            return _write_temp_text("\n\n---\n\n".join(pages), ".md")
        return _write_temp_text(
            json.dumps({"pages": pages}, indent=2, ensure_ascii=False),
            ".json",
        )
    finally:
        # Always remove the scratch directory, including on failure paths.
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio UI: one PDF upload plus a format selector, wired to process_upload,
# with the result offered as a downloadable file.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(
            label="Output format",
            value="markdown",
            choices=["markdown", "json", "docx"],
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

if __name__ == "__main__":
    # Listen on every interface, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")