euler314 committed on
Commit
b89a1c3
·
verified ·
1 Parent(s): 4b175fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -16
app.py CHANGED
@@ -3,54 +3,92 @@ import shutil
3
  import subprocess
4
  import tempfile
5
  import json
 
 
6
  import gradio as gr
7
 
8
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF to Markdown or JSON text with the Marker CLI.

    Args:
        pdf_file: Uploaded file object; only its ``name`` attribute (the
            path of the PDF on disk) is used.
        output_format: ``"markdown"`` selects Markdown; any other value
            selects JSON.

    Returns:
        For Markdown, the contents of every ``.md`` output file joined with
        ``---`` separators. Otherwise a JSON document ``{"pages": [...]}``
        whose entries are the raw contents of every ``.json`` output file.

    Raises:
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    fmt = "markdown" if output_format == "markdown" else "json"
    ext = ".md" if fmt == "markdown" else ".json"

    out_dir = tempfile.mkdtemp()
    try:
        cmd = [
            "marker_single",
            pdf_file.name,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",
        ]
        subprocess.run(cmd, check=True)

        # Recursively read only the files matching the requested format.
        pages = []
        for root, dirs, files in os.walk(out_dir):
            # os.walk yields sub-directories in arbitrary order; sorting in
            # place makes the traversal (and so the page order) repeatable.
            dirs.sort()
            for fname in sorted(files):
                if fname.lower().endswith(ext):
                    with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                        pages.append(f.read())
    finally:
        # Always drop the scratch directory, even when Marker or a read
        # fails; ignore_errors keeps cleanup from masking that failure.
        shutil.rmtree(out_dir, ignore_errors=True)

    if output_format == "markdown":
        return "\n\n---\n\n".join(pages)
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
40
 
 
# Web UI: a PDF upload and a format selector feed process_upload, whose
# text result is shown in a code viewer.
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(["markdown", "json"], label="Output format", value="markdown"),
    ],
    outputs=gr.Code(label="Converted Output"),
    title="PDF → Markdown/JSON with LaTeX Support",
    description=(
        "Upload a PDF and get back Markdown or structured JSON, "
        "with math preserved as LaTeX."
    ),
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
 
3
  import subprocess
4
  import tempfile
5
  import json
6
+
7
+ import pypandoc
8
  import gradio as gr
9
 
10
def run_marker(pdf_path, out_dir, fmt):
    """Run the ``marker_single`` CLI on one PDF.

    Args:
        pdf_path: Path of the PDF to convert.
        out_dir: Directory passed to Marker as ``--output_dir``.
        fmt: Value passed to Marker as ``--output_format``.

    Raises:
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    options = [
        "--output_format", fmt,
        "--output_dir", out_dir,
        "--extract_images",  # ensure images get saved
        "--paginate_output",
    ]
    subprocess.run(["marker_single", pdf_path, *options], check=True)
21
 
22
def collect_outputs(out_dir, ext):
    """Recursively gather all files under *out_dir* with the given extension.

    Args:
        out_dir: Root directory to search.
        ext: File-name suffix to match (for example ``".md"``). Matching is
            case-insensitive on both the suffix and the file name.

    Returns:
        A list of full paths in a deterministic order: directories are
        visited top-down with sub-directories in sorted order, and the files
        of each directory are listed in sorted order.
    """
    suffix = ext.lower()  # file names are lower-cased below, so match in kind
    collected = []
    for root, dirs, files in os.walk(out_dir):
        # os.walk yields sub-directories in arbitrary filesystem order;
        # sorting in place fixes the traversal order.
        dirs.sort()
        collected.extend(
            os.path.join(root, fname)
            for fname in sorted(files)
            if fname.lower().endswith(suffix)
        )
    return collected
30
+
31
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return the path of the result file.

    Marker produces Markdown or JSON; a DOCX result is built by running
    Pandoc over the combined Markdown. The result is always written to a
    temporary file because the interface's output component is a file
    download, which needs a path rather than the text itself.

    Args:
        pdf_file: Uploaded file object; only its ``name`` attribute (the
            path of the PDF on disk) is used.
        output_format: One of ``"markdown"``, ``"json"`` or ``"docx"``.

    Returns:
        Path of a temporary ``.md``, ``.json`` or ``.docx`` file holding the
        converted document.

    Raises:
        ValueError: If ``output_format`` is not a supported choice.
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    if output_format not in ("markdown", "json", "docx"):
        raise ValueError(f"Unsupported output format: {output_format!r}")

    # Marker has no DOCX mode: DOCX is derived from its Markdown output.
    marker_fmt = "json" if output_format == "json" else "markdown"
    ext = ".json" if marker_fmt == "json" else ".md"

    # 1) Temp dir for Marker outputs; removed in ``finally`` on every path.
    out_dir = tempfile.mkdtemp()
    try:
        run_marker(pdf_file.name, out_dir, marker_fmt)

        # 2) Read pages
        page_paths = collect_outputs(out_dir, ext)
        pages = []
        for path in page_paths:
            with open(path, "r", encoding="utf-8") as f:
                pages.append(f.read())

        # 3) If Word requested, first join markdown then convert.
        if output_format == "docx":
            md_path = os.path.join(out_dir, "combined.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n\n---\n\n".join(pages))

            # Image links are relative to each page file, so let Pandoc
            # search every directory a page was read from, not only out_dir.
            resource_dirs = dict.fromkeys(
                [out_dir, *(os.path.dirname(p) for p in page_paths)]
            )

            # mkstemp creates the file atomically, unlike the deprecated
            # and race-prone mktemp.
            fd, docx_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            try:
                pypandoc.convert_file(
                    md_path,
                    "docx",
                    outputfile=docx_path,
                    extra_args=[
                        f"--resource-path={os.pathsep.join(resource_dirs)}"
                    ],
                )
            except Exception:
                # Do not leave an empty .docx behind when Pandoc fails.
                os.remove(docx_path)
                raise
            return docx_path

        # 4) Non-docx: join or wrap JSON, then persist for download.
        if output_format == "markdown":
            text = "\n\n---\n\n".join(pages)
        else:
            text = json.dumps({"pages": pages}, indent=2, ensure_ascii=False)

        with tempfile.NamedTemporaryFile(
            "w", suffix=ext, delete=False, encoding="utf-8"
        ) as result:
            result.write(text)
        return result.name
    finally:
        # ignore_errors keeps cleanup from masking a conversion failure.
        shutil.rmtree(out_dir, ignore_errors=True)
74
 
75
# Gradio Interface: a PDF upload and a format selector feed process_upload;
# the result is offered as a file download.
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        gr.Radio(
            label="Output format",
            value="markdown",
            choices=["markdown", "json", "docx"],
        ),
    ],
    outputs=gr.File(label="Download Result"),
    title="PDF → Markdown/JSON/DOCX Converter",
    description=(
        "Upload a PDF (even with images & math). "
        "Choose **Markdown** or **JSON** to get text + LaTeX math and extracted images. "
        "Or choose **DOCX** to get a Word document with everything embedded."
    ),
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")