Spaces:

euler314
/

file_extension_change

Running

App Files Community

euler314 committed about 18 hours ago

Commit

cdb52cd

verified ·

1 Parent(s): b89a1c3

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -21

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os
 import shutil
 import subprocess
@@ -14,13 +16,13 @@ def run_marker(pdf_path, out_dir, fmt):
         pdf_path,
         "--output_format", fmt,
         "--output_dir", out_dir,
-        "--extract_images",    # ensure images get saved
-        "--paginate_output"
     ]
     subprocess.run(cmd, check=True)
 def collect_outputs(out_dir, ext):
-    """Recursively gather all files with given extension."""
     collected = []
     for root, _, files in os.walk(out_dir):
         for fname in sorted(files):
@@ -29,30 +31,32 @@ def collect_outputs(out_dir, ext):
     return collected
 def process_upload(pdf_file, output_format):
-    # 1) Temp dir for Marker outputs
     out_dir = tempfile.mkdtemp()
-    fmt = {"markdown": "markdown", "json": "json"}[output_format]
     run_marker(pdf_file.name, out_dir, fmt)
-    # 2) Read pages
-    ext = ".md" if output_format in ["markdown","docx"] else ".json"
     pages = []
     for path in collect_outputs(out_dir, ext):
         with open(path, 'r', encoding='utf-8') as f:
             pages.append(f.read())
-    # 3) Cleanup Marker temp files if not doing docx
-    #    (but keep them for Pandoc image embedding)
-    #    so we delay full cleanup until end.
-    # 4) If Word requested, first join markdown then convert.
     if output_format == "docx":
-        # write a single temp .md
         md_path = os.path.join(out_dir, "combined.md")
         with open(md_path, "w", encoding="utf-8") as f:
             f.write("\n\n---\n\n".join(pages))
-        # produce .docx via Pandoc, telling it where images live
         docx_path = tempfile.mktemp(suffix=".docx")
         pypandoc.convert_file(
             md_path,
@@ -61,11 +65,11 @@ def process_upload(pdf_file, output_format):
             extra_args=[f"--resource-path={out_dir}"]
         )
-        # clean up Marker outputs
         shutil.rmtree(out_dir)
         return docx_path
-    # 5) Non-docx: join or wrap JSON
     shutil.rmtree(out_dir)
     if output_format == "markdown":
         return "\n\n---\n\n".join(pages)
@@ -77,18 +81,20 @@ demo = gr.Interface(
     fn=process_upload,
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Radio(choices=["markdown", "json", "docx"],
-                 value="markdown",
-                 label="Output format")
     ],
     outputs=gr.File(label="Download Result"),
     title="PDF → Markdown/JSON/DOCX Converter",
     description=(
-        "Upload a PDF (even with images & math). "
         "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
         "Or choose **DOCX** to get a Word document with everything embedded."
     )
 )
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

+# app.py
 import os
 import shutil
 import subprocess
         pdf_path,
         "--output_format", fmt,
         "--output_dir", out_dir,
+        "--extract_images", "True",    # pass explicit boolean
+        "--paginate_output", "True"
     ]
     subprocess.run(cmd, check=True)
 def collect_outputs(out_dir, ext):
+    """Recursively gather all files with the given extension."""
     collected = []
     for root, _, files in os.walk(out_dir):
         for fname in sorted(files):
     return collected
 def process_upload(pdf_file, output_format):
+    # 1) Create temp dir for Marker outputs
     out_dir = tempfile.mkdtemp()
+    # 2) Map Gradio choice to Marker’s format
+    fmt = {
+        "markdown": "markdown",
+        "json":     "json",
+        "docx":     "markdown"   # produce .md before converting to DOCX
+    }[output_format]
+    # 3) Run Marker CLI
     run_marker(pdf_file.name, out_dir, fmt)
+    # 4) Read the generated pages
+    ext = ".json" if output_format == "json" else ".md"
     pages = []
     for path in collect_outputs(out_dir, ext):
         with open(path, 'r', encoding='utf-8') as f:
             pages.append(f.read())
+    # 5) DOCX branch: combine markdown and convert via Pandoc
     if output_format == "docx":
         md_path = os.path.join(out_dir, "combined.md")
         with open(md_path, "w", encoding="utf-8") as f:
             f.write("\n\n---\n\n".join(pages))
         docx_path = tempfile.mktemp(suffix=".docx")
         pypandoc.convert_file(
             md_path,
             extra_args=[f"--resource-path={out_dir}"]
         )
+        # Clean up and return the path to the .docx file
         shutil.rmtree(out_dir)
         return docx_path
+    # 6) Non-DOCX: clean up and return Markdown or JSON string
     shutil.rmtree(out_dir)
     if output_format == "markdown":
         return "\n\n---\n\n".join(pages)
     fn=process_upload,
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
+        gr.Radio(
+            choices=["markdown", "json", "docx"],
+            value="markdown",
+            label="Output format"
+        )
     ],
     outputs=gr.File(label="Download Result"),
     title="PDF → Markdown/JSON/DOCX Converter",
     description=(
+        "Upload a PDF (with images & math). "
         "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
         "Or choose **DOCX** to get a Word document with everything embedded."
     )
 )
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)