euler314 committed on
Commit
f25ee15
·
verified ·
1 Parent(s): 0532015

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -18
app.py CHANGED
@@ -1,32 +1,42 @@
1
- import os, shutil, subprocess, tempfile, json
 
 
 
 
2
  import gradio as gr
3
 
4
  def process_upload(pdf_file, output_format):
5
- # Create temp output directory
6
  out_dir = tempfile.mkdtemp()
7
- # Run Marker CLI: outputs files marker-0000.md or marker-0000.json
8
- fmt = "markdown" if output_format=="markdown" else "json"
9
  cmd = [
10
  "marker_single",
11
  pdf_file.name,
12
  "--output_format", fmt,
13
  "--output_dir", out_dir,
14
- "--paginate_output" # page separators
15
  ]
16
  subprocess.run(cmd, check=True)
17
- # Read and combine results
18
- results = []
19
- for fname in sorted(os.listdir(out_dir)):
20
- path = os.path.join(out_dir, fname)
 
 
 
 
 
 
 
 
21
  with open(path, 'r', encoding='utf-8') as f:
22
- results.append(f.read())
23
- # Cleanup
24
  shutil.rmtree(out_dir)
 
25
  if output_format == "markdown":
26
- return "\n\n---\n\n".join(results)
27
  else:
28
- # If JSON, combine into list of pages
29
- return json.dumps({"pages": results}, indent=2, ensure_ascii=False)
30
 
31
  demo = gr.Interface(
32
  fn=process_upload,
@@ -34,12 +44,11 @@ demo = gr.Interface(
34
  gr.File(label="Upload PDF", file_types=[".pdf"]),
35
  gr.Radio(["markdown","json"], value="markdown", label="Output format")
36
  ],
37
- outputs=gr.Code(label="Output"),
38
  title="PDF → Markdown/JSON with LaTeX Support",
39
  description=(
40
- "Uploads a PDF and uses Marker to extract text, structure, and LaTeX math. "
41
- "Choose Markdown to get a single .md with `$...$`/`$$...$$` math, "
42
- "or JSON for a page-by-page array."
43
  )
44
  )
45
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import tempfile
5
+ import json
6
  import gradio as gr
7
 
8
  def process_upload(pdf_file, output_format):
 
9
  out_dir = tempfile.mkdtemp()
10
+ fmt = "markdown" if output_format == "markdown" else "json"
 
11
  cmd = [
12
  "marker_single",
13
  pdf_file.name,
14
  "--output_format", fmt,
15
  "--output_dir", out_dir,
16
+ "--paginate_output"
17
  ]
18
  subprocess.run(cmd, check=True)
19
+
20
+ # Recursively find only .md/.json files
21
+ collected = []
22
+ for root, _, files in os.walk(out_dir):
23
+ for fname in sorted(files):
24
+ if fmt == "markdown" and fname.lower().endswith(".md"):
25
+ collected.append(os.path.join(root, fname))
26
+ elif fmt == "json" and fname.lower().endswith(".json"):
27
+ collected.append(os.path.join(root, fname))
28
+
29
+ pages = []
30
+ for path in collected:
31
  with open(path, 'r', encoding='utf-8') as f:
32
+ pages.append(f.read())
33
+
34
  shutil.rmtree(out_dir)
35
+
36
  if output_format == "markdown":
37
+ return "\n\n---\n\n".join(pages)
38
  else:
39
+ return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
 
40
 
41
  demo = gr.Interface(
42
  fn=process_upload,
 
44
  gr.File(label="Upload PDF", file_types=[".pdf"]),
45
  gr.Radio(["markdown","json"], value="markdown", label="Output format")
46
  ],
47
+ outputs=gr.Code(label="Converted Output"),
48
  title="PDF → Markdown/JSON with LaTeX Support",
49
  description=(
50
+ "Upload a PDF and get back Markdown or structured JSON, "
51
+ "with math preserved as LaTeX."
 
52
  )
53
  )
54