Liam Dyer committed
Commit 312add7
1 Parent(s): 4199c92

rewrite with pypdf and ocrmypdf

Files changed (3)
  1. app.py +18 -67
  2. packages.txt +2 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,88 +1,39 @@
  import spaces
  import gradio as gr
- import surya.detection as detection
- import surya.layout as layout
- import os
- import base64
-
-
- # Monkey patch to prevent spawning processes
- def batch_text_detection(images, model, processor, batch_size=None):
-     preds, orig_sizes = detection.batch_detection(
-         images, model, processor, batch_size=batch_size
-     )
-     results = []
-     for i in range(len(images)):
-         result = detection.parallel_get_lines(preds[i], orig_sizes[i])
-         results.append(result)
-
-     return results
-
-
- detection.batch_text_detection = batch_text_detection
-
-
- def batch_layout_detection(
-     images, model, processor, detection_results=None, batch_size=None
- ):
-     preds, orig_sizes = layout.batch_detection(
-         images, model, processor, batch_size=batch_size
-     )
-     id2label = model.config.id2label
-
-     results = []
-     for i in range(len(images)):
-         result = layout.parallel_get_regions(
-             preds[i],
-             orig_sizes[i],
-             id2label,
-             detection_results[i] if detection_results else None,
-         )
-         results.append(result)
-
-     return results
-
-
- layout.batch_layout_detection = batch_layout_detection
-
- from marker.convert import convert_single_pdf
- from marker.models import load_all_models
-
- model_list = load_all_models()
+ from pypdf import PdfReader
+ import ocrmypdf


  @spaces.GPU
- def convert(pdf_file, extract_images):
-     global model_list
-
-     full_text, images, out_meta = convert_single_pdf(
-         pdf_file, model_list, batch_multiplier=16
-     )
-     image_data = {}
-     if extract_images:
-         for filename, image in images.items():
-             image.save(filename, "PNG")
-
-             with open(filename, "rb") as f:
-                 image_bytes = f.read()
-
-             image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-             image_data[filename] = image_base64
-
-             os.remove(filename)
-
-     return full_text, out_meta, image_data
+ def convert(pdf_file):
+     reader = PdfReader(pdf_file)
+
+     # Check if there are any images
+     image_count = 0
+     for page in reader.pages:
+         image_count += len(page.images)
+
+     # If there are images, perform OCR on the document
+     if image_count > 0:
+         out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
+         ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
+         pdf_file = out_pdf_file
+
+     # Extract text
+     full_text = ""
+     for idx, page in enumerate(reader.pages):
+         full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
+
+     return full_text, reader.metadata


  gr.Interface(
      convert,
      inputs=[
          gr.File(label="Upload PDF", type="filepath"),
-         gr.Checkbox(label="Extract Images"),
      ],
      outputs=[
          gr.Text(label="Markdown"),
          gr.JSON(label="Metadata"),
-         gr.JSON(label="Images"),
      ],
  ).launch()
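
Note that in the new convert, reader is opened from the original pdf_file before OCR runs, so the extract_text() loop still reads the pre-OCR document even when ocrmypdf has written out_pdf_file. Below is a minimal editorial sketch of that flow with the OCR output re-opened before extraction; it reuses the names and the pypdf/ocrmypdf calls from the diff but is not the committed code.

import ocrmypdf
from pypdf import PdfReader


def convert(pdf_file):
    reader = PdfReader(pdf_file)

    # Count embedded images to decide whether the document needs OCR
    image_count = sum(len(page.images) for page in reader.pages)

    if image_count > 0:
        out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
        # force_ocr=True rasterizes each page and runs Tesseract even if a
        # text layer already exists
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
        # Re-open the OCR output so extract_text() sees the new text layer
        reader = PdfReader(out_pdf_file)

    # Concatenate per-page text with simple page separators
    full_text = ""
    for idx, page in enumerate(reader.pages):
        full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()

    return full_text, reader.metadata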
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ocrmypdf
+ tesseract-ocr-eng
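
On Spaces, packages.txt lists Debian packages installed with apt before the app starts: the ocrmypdf package brings in the Tesseract and Ghostscript binaries the Python library shells out to, and tesseract-ocr-eng supplies the English language data. A minimal sketch of selecting that language explicitly through the Python API; the file paths here are hypothetical.

import ocrmypdf

# "eng" matches the tesseract-ocr-eng package from packages.txt; other
# languages would need the corresponding tesseract-ocr-<lang> package.
ocrmypdf.ocr(
    "input.pdf",      # hypothetical input path
    "input_ocr.pdf",  # hypothetical output path
    language="eng",
    force_ocr=True,
)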
requirements.txt CHANGED
@@ -1 +1,2 @@
- marker-pdf==0.2.5
+ ocrmypdf==16.3.1
+ pypdf==4.2.0