Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import os
|
2 |
import shutil
|
3 |
import subprocess
|
@@ -14,13 +16,13 @@ def run_marker(pdf_path, out_dir, fmt):
|
|
14 |
pdf_path,
|
15 |
"--output_format", fmt,
|
16 |
"--output_dir", out_dir,
|
17 |
-
"--extract_images", #
|
18 |
-
"--paginate_output"
|
19 |
]
|
20 |
subprocess.run(cmd, check=True)
|
21 |
|
22 |
def collect_outputs(out_dir, ext):
|
23 |
-
"""Recursively gather all files with given extension."""
|
24 |
collected = []
|
25 |
for root, _, files in os.walk(out_dir):
|
26 |
for fname in sorted(files):
|
@@ -29,30 +31,32 @@ def collect_outputs(out_dir, ext):
|
|
29 |
return collected
|
30 |
|
31 |
def process_upload(pdf_file, output_format):
|
32 |
-
# 1)
|
33 |
out_dir = tempfile.mkdtemp()
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
run_marker(pdf_file.name, out_dir, fmt)
|
36 |
|
37 |
-
#
|
38 |
-
ext = ".
|
39 |
pages = []
|
40 |
for path in collect_outputs(out_dir, ext):
|
41 |
with open(path, 'r', encoding='utf-8') as f:
|
42 |
pages.append(f.read())
|
43 |
|
44 |
-
#
|
45 |
-
# (but keep them for Pandoc image embedding)
|
46 |
-
# so we delay full cleanup until end.
|
47 |
-
|
48 |
-
# 4) If Word requested, first join markdown then convert.
|
49 |
if output_format == "docx":
|
50 |
-
# write a single temp .md
|
51 |
md_path = os.path.join(out_dir, "combined.md")
|
52 |
with open(md_path, "w", encoding="utf-8") as f:
|
53 |
f.write("\n\n---\n\n".join(pages))
|
54 |
|
55 |
-
# produce .docx via Pandoc, telling it where images live
|
56 |
docx_path = tempfile.mktemp(suffix=".docx")
|
57 |
pypandoc.convert_file(
|
58 |
md_path,
|
@@ -61,11 +65,11 @@ def process_upload(pdf_file, output_format):
|
|
61 |
extra_args=[f"--resource-path={out_dir}"]
|
62 |
)
|
63 |
|
64 |
-
#
|
65 |
shutil.rmtree(out_dir)
|
66 |
return docx_path
|
67 |
|
68 |
-
#
|
69 |
shutil.rmtree(out_dir)
|
70 |
if output_format == "markdown":
|
71 |
return "\n\n---\n\n".join(pages)
|
@@ -77,18 +81,20 @@ demo = gr.Interface(
|
|
77 |
fn=process_upload,
|
78 |
inputs=[
|
79 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
80 |
-
gr.Radio(
|
81 |
-
|
82 |
-
|
|
|
|
|
83 |
],
|
84 |
outputs=gr.File(label="Download Result"),
|
85 |
title="PDF → Markdown/JSON/DOCX Converter",
|
86 |
description=(
|
87 |
-
"Upload a PDF (
|
88 |
"Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
|
89 |
"Or choose **DOCX** to get a Word document with everything embedded."
|
90 |
)
|
91 |
)
|
92 |
|
93 |
if __name__ == "__main__":
|
94 |
-
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
import os
|
4 |
import shutil
|
5 |
import subprocess
|
|
|
16 |
pdf_path,
|
17 |
"--output_format", fmt,
|
18 |
"--output_dir", out_dir,
|
19 |
+
"--extract_images", "True", # pass explicit boolean
|
20 |
+
"--paginate_output", "True"
|
21 |
]
|
22 |
subprocess.run(cmd, check=True)
|
23 |
|
24 |
def collect_outputs(out_dir, ext):
|
25 |
+
"""Recursively gather all files with the given extension."""
|
26 |
collected = []
|
27 |
for root, _, files in os.walk(out_dir):
|
28 |
for fname in sorted(files):
|
|
|
31 |
return collected
|
32 |
|
33 |
def process_upload(pdf_file, output_format):
|
34 |
+
# 1) Create temp dir for Marker outputs
|
35 |
out_dir = tempfile.mkdtemp()
|
36 |
+
|
37 |
+
# 2) Map Gradio choice to Marker’s format
|
38 |
+
fmt = {
|
39 |
+
"markdown": "markdown",
|
40 |
+
"json": "json",
|
41 |
+
"docx": "markdown" # produce .md before converting to DOCX
|
42 |
+
}[output_format]
|
43 |
+
|
44 |
+
# 3) Run Marker CLI
|
45 |
run_marker(pdf_file.name, out_dir, fmt)
|
46 |
|
47 |
+
# 4) Read the generated pages
|
48 |
+
ext = ".json" if output_format == "json" else ".md"
|
49 |
pages = []
|
50 |
for path in collect_outputs(out_dir, ext):
|
51 |
with open(path, 'r', encoding='utf-8') as f:
|
52 |
pages.append(f.read())
|
53 |
|
54 |
+
# 5) DOCX branch: combine markdown and convert via Pandoc
|
|
|
|
|
|
|
|
|
55 |
if output_format == "docx":
|
|
|
56 |
md_path = os.path.join(out_dir, "combined.md")
|
57 |
with open(md_path, "w", encoding="utf-8") as f:
|
58 |
f.write("\n\n---\n\n".join(pages))
|
59 |
|
|
|
60 |
docx_path = tempfile.mktemp(suffix=".docx")
|
61 |
pypandoc.convert_file(
|
62 |
md_path,
|
|
|
65 |
extra_args=[f"--resource-path={out_dir}"]
|
66 |
)
|
67 |
|
68 |
+
# Clean up and return the path to the .docx file
|
69 |
shutil.rmtree(out_dir)
|
70 |
return docx_path
|
71 |
|
72 |
+
# 6) Non-DOCX: clean up and return Markdown or JSON string
|
73 |
shutil.rmtree(out_dir)
|
74 |
if output_format == "markdown":
|
75 |
return "\n\n---\n\n".join(pages)
|
|
|
81 |
fn=process_upload,
|
82 |
inputs=[
|
83 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
84 |
+
gr.Radio(
|
85 |
+
choices=["markdown", "json", "docx"],
|
86 |
+
value="markdown",
|
87 |
+
label="Output format"
|
88 |
+
)
|
89 |
],
|
90 |
outputs=gr.File(label="Download Result"),
|
91 |
title="PDF → Markdown/JSON/DOCX Converter",
|
92 |
description=(
|
93 |
+
"Upload a PDF (with images & math). "
|
94 |
"Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
|
95 |
"Or choose **DOCX** to get a Word document with everything embedded."
|
96 |
)
|
97 |
)
|
98 |
|
99 |
if __name__ == "__main__":
|
100 |
+
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
|