Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,54 +3,92 @@ import shutil
|
|
3 |
import subprocess
|
4 |
import tempfile
|
5 |
import json
|
|
|
|
|
6 |
import gradio as gr
|
7 |
|
8 |
-
def
|
9 |
-
|
10 |
-
fmt = "markdown" if output_format == "markdown" else "json"
|
11 |
cmd = [
|
12 |
"marker_single",
|
13 |
-
|
14 |
"--output_format", fmt,
|
15 |
"--output_dir", out_dir,
|
|
|
16 |
"--paginate_output"
|
17 |
]
|
18 |
subprocess.run(cmd, check=True)
|
19 |
|
20 |
-
|
|
|
21 |
collected = []
|
22 |
for root, _, files in os.walk(out_dir):
|
23 |
for fname in sorted(files):
|
24 |
-
if
|
25 |
-
collected.append(os.path.join(root, fname))
|
26 |
-
elif fmt == "json" and fname.lower().endswith(".json"):
|
27 |
collected.append(os.path.join(root, fname))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
|
|
|
|
29 |
pages = []
|
30 |
-
for path in
|
31 |
with open(path, 'r', encoding='utf-8') as f:
|
32 |
pages.append(f.read())
|
33 |
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
if output_format == "markdown":
|
37 |
return "\n\n---\n\n".join(pages)
|
38 |
else:
|
39 |
return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
|
40 |
|
|
|
41 |
demo = gr.Interface(
|
42 |
fn=process_upload,
|
43 |
inputs=[
|
44 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
45 |
-
gr.Radio(["markdown","json"
|
|
|
|
|
46 |
],
|
47 |
-
outputs=gr.
|
48 |
-
title="PDF → Markdown/JSON
|
49 |
description=(
|
50 |
-
"Upload a PDF
|
51 |
-
"
|
|
|
52 |
)
|
53 |
)
|
54 |
|
55 |
-
if __name__=="__main__":
|
56 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
3 |
import subprocess
|
4 |
import tempfile
|
5 |
import json
|
6 |
+
|
7 |
+
import pypandoc
|
8 |
import gradio as gr
|
9 |
|
10 |
+
def run_marker(pdf_path, out_dir, fmt):
    """Convert one PDF by invoking the ``marker_single`` command-line tool.

    Args:
        pdf_path: Path of the PDF, passed as the positional argument.
        out_dir: Directory handed to ``--output_dir``.
        fmt: Value handed to ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status (``check=True``).
    """
    executable = "marker_single"
    options = [
        "--output_format", fmt,
        "--output_dir", out_dir,
        "--extract_images",  # ensure images get saved
        "--paginate_output",
    ]
    # Argument list (no shell) so paths with spaces are passed through intact.
    subprocess.run([executable, pdf_path, *options], check=True)
|
21 |
|
22 |
+
def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* whose name ends with *ext*.

    The match is made against the lower-cased file name, so *ext* should be
    given in lower case (e.g. ``".md"``).

    Args:
        out_dir: Root directory to walk.
        ext: File-name suffix to match.

    Returns:
        A list of matching paths in a deterministic order: each directory's
        files sorted by name, directories visited top-down in sorted order.
    """
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk yields sub-directories in arbitrary (filesystem) order;
        # sorting in place fixes the traversal order so the collected
        # sequence is stable across runs and platforms.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(ext)
        )
    return collected
|
30 |
+
|
31 |
+
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the result file.

    The PDF is run through Marker into a temporary directory, the produced
    pages are read back and joined, and the result is written to a fresh
    temporary file whose path is returned. A path (not the text itself) is
    returned because the interface's output component is ``gr.File``.

    Args:
        pdf_file: The upload from Gradio; either an object exposing the
            file path as ``.name`` or a plain path string.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path to a temporary ``.md``, ``.json`` or ``.docx`` file.

    Raises:
        ValueError: If no file was uploaded or the format is unknown.
        subprocess.CalledProcessError: If Marker fails (from ``run_marker``).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")
    if output_format not in ("markdown", "json", "docx"):
        raise ValueError(f"Unsupported output format: {output_format!r}")

    # Accept both a tempfile-like wrapper (with .name) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # DOCX is produced from Markdown via Pandoc, so Marker is asked for
    # Markdown in that case (the old dict lookup raised KeyError on "docx").
    fmt = "json" if output_format == "json" else "markdown"
    ext = ".json" if fmt == "json" else ".md"

    # 1) Temp dir for Marker outputs; always removed, even on failure.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_path, out_dir, fmt)

        # 2) Read pages
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 3) If Word requested, join the markdown then convert with Pandoc
        #    while Marker's extracted images are still on disk.
        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Relative image links are resolved against the resource path.
            # Include the folders the Markdown came from as well as out_dir,
            # in case Marker wrote its output into a sub-directory.
            resource_dirs = [out_dir]
            for path in page_paths:
                parent = os.path.dirname(path)
                if parent not in resource_dirs:
                    resource_dirs.append(parent)

            # mkstemp creates the file atomically (mktemp is race-prone).
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        f"--resource-path={os.pathsep.join(resource_dirs)}"
                    ],
                )
            except Exception:
                # Do not leave an empty result file behind on failure.
                os.unlink(docx_path)
                raise
            return docx_path
    finally:
        shutil.rmtree(out_dir, ignore_errors=True)

    # 4) Non-docx: join or wrap JSON, then hand back a downloadable file.
    if output_format == "markdown":
        text = "\n\n---\n\n".join(pages)
    else:
        text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

    fd, result_path = tempfile.mkstemp(suffix=ext)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(text)
    return result_path
|
74 |
|
75 |
+
# Gradio Interface
|
76 |
# Input widgets: the PDF upload and the output-format selector.
_pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
_format_input = gr.Radio(
    choices=["markdown", "json", "docx"],
    value="markdown",
    label="Output format",
)

# Help text shown under the title (adjacent literals concatenate).
_description = (
    "Upload a PDF (even with images & math). "
    "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
    "Or choose **DOCX** to get a Word document with everything embedded."
)

# Single-function app: process_upload receives (file, format) and its
# return value is offered as a downloadable file.
demo = gr.Interface(
    fn=process_upload,
    inputs=[_pdf_input, _format_input],
    outputs=gr.File(label="Download Result"),
    title="PDF → Markdown/JSON/DOCX Converter",
    description=_description,
)

if __name__ == "__main__":
    # Listen on all interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|