Spaces:

euler314
/

file_extension_change

Running

App Files Files Community

euler314 committed 1 day ago

Commit

b89a1c3

verified ·

1 Parent(s): 4b175fd

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -16

app.py CHANGED Viewed

@@ -3,54 +3,92 @@ import shutil
 import subprocess
 import tempfile
 import json
 import gradio as gr
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with Marker and return the result as text.

    Args:
        pdf_file: Upload object whose ``.name`` attribute is the path of the
            PDF on disk.
        output_format: ``"markdown"`` selects Markdown; any other value is
            treated as JSON.

    Returns:
        For Markdown, the text of every generated ``.md`` file joined with a
        horizontal-rule separator. Otherwise a JSON document of the form
        ``{"pages": [...]}`` whose entries are the raw text of each generated
        ``.json`` file.

    Raises:
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    fmt = "markdown" if output_format == "markdown" else "json"
    suffix = ".md" if fmt == "markdown" else ".json"

    pages = []
    # The context manager removes the scratch directory even when Marker or a
    # file read fails, so failed conversions no longer leave files behind.
    with tempfile.TemporaryDirectory() as out_dir:
        cmd = [
            "marker_single",
            pdf_file.name,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",
        ]
        subprocess.run(cmd, check=True)

        # Walk the whole tree and keep only files of the requested type.
        for root, _, files in os.walk(out_dir):
            for fname in sorted(files):
                if fname.lower().endswith(suffix):
                    path = os.path.join(root, fname)
                    with open(path, "r", encoding="utf-8") as f:
                        pages.append(f.read())

    if fmt == "markdown":
        return "\n\n---\n\n".join(pages)
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
 demo = gr.Interface(
     fn=process_upload,
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Radio(["markdown","json"], value="markdown", label="Output format")
     ],
-    outputs=gr.Code(label="Converted Output"),
-    title="PDF → Markdown/JSON with LaTeX Support",
     description=(
-        "Upload a PDF and get back Markdown or structured JSON, "
-        "with math preserved as LaTeX."
     )
 )
-if __name__=="__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import subprocess
 import tempfile
 import json
+import pypandoc
 import gradio as gr
def run_marker(pdf_path, out_dir, fmt):
    """Invoke the ``marker_single`` command-line tool on a single PDF.

    Args:
        pdf_path: Path of the PDF to convert.
        out_dir: Directory passed to Marker via ``--output_dir``.
        fmt: Value passed to Marker via ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero
            (the call uses ``check=True``).
    """
    valued_options = (
        ("--output_format", fmt),
        ("--output_dir", out_dir),
    )

    argv = ["marker_single", pdf_path]
    for flag, value in valued_options:
        argv += [flag, value]
    # Switches without a value: request image extraction and paginated output.
    argv += ["--extract_images", "--paginate_output"]

    subprocess.run(argv, check=True)
def collect_outputs(out_dir, ext):
    """Recursively gather the files under *out_dir* whose name ends with *ext*.

    The match is case-insensitive on both the file name and *ext*. Results
    are returned in a deterministic order: a directory's own files (sorted by
    name) come first, then its subdirectories in sorted order.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match, e.g. ``".md"``.

    Returns:
        A list of full paths; empty if nothing matches or *out_dir* does not
        exist.
    """
    suffix = ext.lower()
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk descends in whatever order `dirs` is left in; sorting it in
        # place makes the traversal independent of filesystem listing order.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of a downloadable file.

    Marker is run into a scratch directory, its per-document output files are
    read back, and the result is written to a temporary file whose path is
    returned. A path (rather than the text itself) is returned because the
    interface's output component is a file download.

    Args:
        pdf_file: The upload; either an object exposing the PDF path as
            ``.name`` or the path string itself.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path of a newly created ``.md``, ``.json`` or ``.docx`` file. The
        file is not deleted by this function.

    Raises:
        gr.Error: If no file was uploaded.
        KeyError: If *output_format* is not one of the supported values.
        subprocess.CalledProcessError: If Marker exits non-zero.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # DOCX is produced from Markdown, so Marker is asked for Markdown there.
    marker_fmt = {
        "markdown": "markdown",
        "json": "json",
        "docx": "markdown",
    }[output_format]
    ext = ".json" if marker_fmt == "json" else ".md"

    # 1) Scratch directory for Marker outputs; always removed in `finally`.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, marker_fmt)

        # 2) Read pages (collected before combined.md exists, so that file is
        #    never picked up as an input).
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 3) Word output: join the Markdown, then convert with Pandoc.
        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Relative image links are resolved against the resource path.
            # Include every folder that held a Markdown file, since images
            # are presumably written next to it -- TODO confirm against
            # Marker's output layout.
            resource_dirs = [out_dir]
            for path in page_paths:
                folder = os.path.dirname(path)
                if folder not in resource_dirs:
                    resource_dirs.append(folder)

            # mkstemp creates the file atomically, unlike the deprecated
            # mktemp which only returns a name.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        "--resource-path=" + os.pathsep.join(resource_dirs)
                    ],
                )
            except Exception:
                # Do not leave an empty .docx behind when conversion fails.
                os.remove(docx_path)
                raise
            return docx_path

        # 4) Text outputs: join Markdown, or wrap the raw JSON texts.
        if output_format == "markdown":
            text = "\n\n---\n\n".join(pages)
            suffix = ".md"
        else:
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
            suffix = ".json"

        with tempfile.NamedTemporaryFile(
            "w", suffix=suffix, delete=False, encoding="utf-8"
        ) as result:
            result.write(text)
        return result.name
    finally:
        shutil.rmtree(out_dir, ignore_errors=True)
# Gradio interface: PDF upload and format choice in, downloadable file out.
demo = gr.Interface(
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (even with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(
            label="Output format",
            choices=["markdown", "json", "docx"],
            value="markdown",
        ),
    ],
    outputs=gr.File(label="Download Result"),
)

# Start the server only when executed as a script, listening on all
# interfaces at port 7860.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")