euler314 committed on
Commit
cdb52cd
·
verified ·
1 Parent(s): b89a1c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -21
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  import shutil
3
  import subprocess
@@ -14,13 +16,13 @@ def run_marker(pdf_path, out_dir, fmt):
14
  pdf_path,
15
  "--output_format", fmt,
16
  "--output_dir", out_dir,
17
- "--extract_images", # ensure images get saved
18
- "--paginate_output"
19
  ]
20
  subprocess.run(cmd, check=True)
21
 
22
  def collect_outputs(out_dir, ext):
23
- """Recursively gather all files with given extension."""
24
  collected = []
25
  for root, _, files in os.walk(out_dir):
26
  for fname in sorted(files):
@@ -29,30 +31,32 @@ def collect_outputs(out_dir, ext):
29
  return collected
30
 
31
  def process_upload(pdf_file, output_format):
32
- # 1) Temp dir for Marker outputs
33
  out_dir = tempfile.mkdtemp()
34
- fmt = {"markdown": "markdown", "json": "json"}[output_format]
 
 
 
 
 
 
 
 
35
  run_marker(pdf_file.name, out_dir, fmt)
36
 
37
- # 2) Read pages
38
- ext = ".md" if output_format in ["markdown","docx"] else ".json"
39
  pages = []
40
  for path in collect_outputs(out_dir, ext):
41
  with open(path, 'r', encoding='utf-8') as f:
42
  pages.append(f.read())
43
 
44
- # 3) Cleanup Marker temp files if not doing docx
45
- # (but keep them for Pandoc image embedding)
46
- # so we delay full cleanup until end.
47
-
48
- # 4) If Word requested, first join markdown then convert.
49
  if output_format == "docx":
50
- # write a single temp .md
51
  md_path = os.path.join(out_dir, "combined.md")
52
  with open(md_path, "w", encoding="utf-8") as f:
53
  f.write("\n\n---\n\n".join(pages))
54
 
55
- # produce .docx via Pandoc, telling it where images live
56
  docx_path = tempfile.mktemp(suffix=".docx")
57
  pypandoc.convert_file(
58
  md_path,
@@ -61,11 +65,11 @@ def process_upload(pdf_file, output_format):
61
  extra_args=[f"--resource-path={out_dir}"]
62
  )
63
 
64
- # clean up Marker outputs
65
  shutil.rmtree(out_dir)
66
  return docx_path
67
 
68
- # 5) Non-docx: join or wrap JSON
69
  shutil.rmtree(out_dir)
70
  if output_format == "markdown":
71
  return "\n\n---\n\n".join(pages)
@@ -77,18 +81,20 @@ demo = gr.Interface(
77
  fn=process_upload,
78
  inputs=[
79
  gr.File(label="Upload PDF", file_types=[".pdf"]),
80
- gr.Radio(choices=["markdown", "json", "docx"],
81
- value="markdown",
82
- label="Output format")
 
 
83
  ],
84
  outputs=gr.File(label="Download Result"),
85
  title="PDF → Markdown/JSON/DOCX Converter",
86
  description=(
87
- "Upload a PDF (even with images & math). "
88
  "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
89
  "Or choose **DOCX** to get a Word document with everything embedded."
90
  )
91
  )
92
 
93
  if __name__ == "__main__":
94
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ # app.py
2
+
3
  import os
4
  import shutil
5
  import subprocess
 
16
  pdf_path,
17
  "--output_format", fmt,
18
  "--output_dir", out_dir,
19
+ "--extract_images", "True", # pass explicit boolean
20
+ "--paginate_output", "True"
21
  ]
22
  subprocess.run(cmd, check=True)
23
 
24
  def collect_outputs(out_dir, ext):
25
+ """Recursively gather all files with the given extension."""
26
  collected = []
27
  for root, _, files in os.walk(out_dir):
28
  for fname in sorted(files):
 
31
  return collected
32
 
33
  def process_upload(pdf_file, output_format):
34
+ # 1) Create temp dir for Marker outputs
35
  out_dir = tempfile.mkdtemp()
36
+
37
+ # 2) Map Gradio choice to Marker’s format
38
+ fmt = {
39
+ "markdown": "markdown",
40
+ "json": "json",
41
+ "docx": "markdown" # produce .md before converting to DOCX
42
+ }[output_format]
43
+
44
+ # 3) Run Marker CLI
45
  run_marker(pdf_file.name, out_dir, fmt)
46
 
47
+ # 4) Read the generated pages
48
+ ext = ".json" if output_format == "json" else ".md"
49
  pages = []
50
  for path in collect_outputs(out_dir, ext):
51
  with open(path, 'r', encoding='utf-8') as f:
52
  pages.append(f.read())
53
 
54
+ # 5) DOCX branch: combine markdown and convert via Pandoc
 
 
 
 
55
  if output_format == "docx":
 
56
  md_path = os.path.join(out_dir, "combined.md")
57
  with open(md_path, "w", encoding="utf-8") as f:
58
  f.write("\n\n---\n\n".join(pages))
59
 
 
60
  docx_path = tempfile.mktemp(suffix=".docx")
61
  pypandoc.convert_file(
62
  md_path,
 
65
  extra_args=[f"--resource-path={out_dir}"]
66
  )
67
 
68
+ # Clean up and return the path to the .docx file
69
  shutil.rmtree(out_dir)
70
  return docx_path
71
 
72
+ # 6) Non-DOCX: clean up and return Markdown or JSON string
73
  shutil.rmtree(out_dir)
74
  if output_format == "markdown":
75
  return "\n\n---\n\n".join(pages)
 
81
  fn=process_upload,
82
  inputs=[
83
  gr.File(label="Upload PDF", file_types=[".pdf"]),
84
+ gr.Radio(
85
+ choices=["markdown", "json", "docx"],
86
+ value="markdown",
87
+ label="Output format"
88
+ )
89
  ],
90
  outputs=gr.File(label="Download Result"),
91
  title="PDF → Markdown/JSON/DOCX Converter",
92
  description=(
93
+ "Upload a PDF (with images & math). "
94
  "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
95
  "Or choose **DOCX** to get a Word document with everything embedded."
96
  )
97
  )
98
 
99
  if __name__ == "__main__":
100
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False)