02alexander committed on
Commit
d170be2
·
1 Parent(s): cbcc4bf

lfs for files

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pdf filter=lfs diff=lfs merge=lfs -text
2
+ *.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# This Dockerfile is used for creating the Hugging Face docker space
# See: https://huggingface.co/docs/hub/en/spaces-sdks-docker

FROM python:3.11.8

# System packages: ffmpeg and poppler-utils (pdf2image, used by ocr.py, needs poppler).
# `apt-get` is used instead of `apt` because `apt` does not offer a stable CLI for scripts.
# The package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y ffmpeg poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Try and run pip command after setting the user with `USER user` to avoid permission issues with Python
RUN pip install --no-cache-dir --upgrade pip

# Install requirements.txt before copying the rest of the sources, so the (slow)
# dependency layer is reused from cache when only application code changes
COPY --chown=user requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -2,30 +2,66 @@ from __future__ import annotations
2
 
3
  import os
4
  from pathlib import Path
 
 
 
5
 
6
  import gradio as gr # type: ignore
7
  import rerun as rr
8
- import rerun.blueprint as rrb
 
9
  from gradio_rerun import Rerun # type: ignore
10
- from ocr import detect_and_log_layout # type: ignore
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  @rr.thread_local_stream("PaddleOCR")
14
- def log_to_rr(img_path: Path):
15
- print(img_path)
16
  stream = rr.binary_stream()
17
 
18
- blueprint = rrb.Blueprint(
19
- rrb.Vertical(
20
- rrb.Spatial2DView(name="Input", contents=["Image/**"]),
21
- ),
22
- collapse_panels=True,
23
- )
24
- rr.send_blueprint(blueprint)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- detect_and_log_layout(img_path)
27
 
28
- yield stream.read()
 
29
 
30
  DESCRIPTION = """
31
  This space demonstrates the ability to visualize and verify the document layout analysis and text detection using [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR).
@@ -37,19 +73,20 @@ with gr.Blocks() as demo:
37
  with gr.Row():
38
  with gr.Column(scale=1):
39
  with gr.Row():
40
- input_image = gr.Image(label="Input Image", image_mode="RGBA", sources="upload", type="filepath")
 
41
  with gr.Row():
42
  button = gr.Button()
43
  with gr.Row():
44
  gr.Examples(
45
  examples=[os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))],
46
- inputs=[input_image],
47
  label="Examples",
48
  cache_examples=False,
49
  examples_per_page=12,
50
  )
51
  with gr.Column(scale=4):
52
  viewer = Rerun(streaming=True, height=900)
53
- button.click(log_to_rr, inputs=[input_image], outputs=[viewer])
54
 
55
- demo.launch()
 
2
 
3
  import os
4
  from pathlib import Path
5
+ from queue import SimpleQueue
6
+ from threading import Thread
7
+ from typing import Any
8
 
9
  import gradio as gr # type: ignore
10
  import rerun as rr
11
+ from fastapi import FastAPI
12
+ from fastapi.middleware.cors import CORSMiddleware
13
  from gradio_rerun import Rerun # type: ignore
 
14
 
15
+ from ocr import detect_and_log_layouts
16
+
17
+ CUSTOM_PATH = "/"
18
+
19
+ app = FastAPI()
20
+
21
+ origins = [
22
+ "https://app.rerun.io",
23
+ ]
24
+
25
+ app.add_middleware(
26
+ CORSMiddleware,
27
+ allow_origins=origins,
28
+ )
29
+
30
def file_ocr(log_queue: SimpleQueue[Any], file_path: str) -> None:
    """Run the OCR pipeline on ``file_path`` and signal completion on ``log_queue``.

    Intended to be used as a worker-thread target: ``detect_and_log_layouts``
    puts its messages on ``log_queue`` and the string ``"done"`` is put last as
    an end-of-stream sentinel.

    Args:
        log_queue: Queue shared with the consumer that forwards messages to Rerun.
        file_path: Path of the image or PDF to analyse.
    """
    try:
        detect_and_log_layouts(log_queue, file_path)
    finally:
        # Always post the sentinel, even when detection raises: the consumer
        # blocks on `log_queue.get()` and would otherwise wait forever.
        log_queue.put("done")
33
 
34
@rr.thread_local_stream("PaddleOCR")
def log_to_rr(file_path: Path):
    """Stream the OCR results for ``file_path`` to the Rerun viewer.

    The OCR work runs on a worker thread (``file_ocr``) that posts messages on
    a queue. This generator replays them onto the thread-local Rerun stream
    and yields the encoded bytes after each message:

    * ``["blueprint", blueprint]`` is sent with ``rr.send_blueprint``.
    * ``["log", entity_path, args]`` or ``["log", entity_path, args, kwargs]``
      is logged with ``rr.log(entity_path, *args, **kwargs)``.
    * the string ``"done"`` ends the stream.

    Args:
        file_path: Path of the uploaded image or PDF, as handed over by the UI.

    Yields:
        The bytes read from the Rerun binary stream after each handled message.

    Raises:
        gr.Error: If no file was provided.
    """
    # NOTE(review): the source indentation was lost; the `yield` is assumed to
    # sit inside the message loop (one chunk per message) - confirm.
    if file_path is None:
        # Without this guard the worker would be started on the path "None".
        raise gr.Error("Please upload an image or PDF file first.")

    stream = rr.binary_stream()

    log_queue: SimpleQueue[Any] = SimpleQueue()
    handle = Thread(target=file_ocr, args=[log_queue, str(file_path)])
    handle.start()

    while True:
        msg = log_queue.get()
        if msg == "done":
            break

        msg_type = msg[0]

        if msg_type == "blueprint":
            blueprint = msg[1]
            rr.send_blueprint(blueprint)
        elif msg_type == "log":
            entity_path = msg[1]
            args = msg[2]
            # The kwargs element is optional in "log" messages.
            kwargs = msg[3] if len(msg) >= 4 else {}
            rr.log(entity_path, *args, **kwargs)

        yield stream.read()

    handle.join()
    print("done")
65
 
66
  DESCRIPTION = """
67
  This space demonstrates the ability to visualize and verify the document layout analysis and text detection using [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR).
 
73
  with gr.Row():
74
  with gr.Column(scale=1):
75
  with gr.Row():
76
+ #input_image = gr.Image(label="Input Image", image_mode="RGBA", sources="upload", type="filepath")
77
+ input_file = gr.File(label="Input file (image/pdf)")
78
  with gr.Row():
79
  button = gr.Button()
80
  with gr.Row():
81
  gr.Examples(
82
  examples=[os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))],
83
+ inputs=[input_file],
84
  label="Examples",
85
  cache_examples=False,
86
  examples_per_page=12,
87
  )
88
  with gr.Column(scale=4):
89
  viewer = Rerun(streaming=True, height=900)
90
+ button.click(log_to_rr, inputs=[input_file], outputs=[viewer])
91
 
92
+ app = gr.mount_gradio_app(app, demo, path=CUSTOM_PATH)
examples/{paper.png → paper_page.png} RENAMED
File without changes
ocr.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """OCR template."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ import os
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from queue import SimpleQueue
11
+ from typing import Any, Final, Iterable, Optional, TypeAlias
12
+
13
+ import cv2 as cv2
14
+ import numpy as np
15
+ import numpy.typing as npt
16
+ import pandas as pd # type: ignore
17
+ import pdf2image # type: ignore
18
+ import rerun as rr # pip install rerun-sdk
19
+ import rerun.blueprint as rrb
20
+ from paddleocr import PPStructure # type: ignore
21
+ from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes # type: ignore
22
+
23
+ EXAMPLE_DIR: Final = Path(os.path.dirname(__file__))
24
+ DATASET_DIR: Final = EXAMPLE_DIR / "dataset"
25
+
26
+ SAMPLE_IMAGE_URLs = ["https://storage.googleapis.com/rerun-example-datasets/ocr/paper.png"]
27
+
28
+ LayoutStructure: TypeAlias = tuple[
29
+ list[str], list[str], list[rrb.Spatial2DView], list[rrb.Spatial2DView], list[rrb.Spatial2DView]
30
+ ]
31
+
32
+ # Supportive Classes
33
+
34
+
35
class Color:
    """Named colour constants, each an ``(R, G, B)`` tuple with 0-255 channels."""

    # Primary colours
    Red: tuple[int, int, int] = (255, 0, 0)
    Green: tuple[int, int, int] = (0, 255, 0)
    Blue: tuple[int, int, int] = (0, 0, 255)

    # Mixes of two primaries
    Yellow: tuple[int, int, int] = (255, 255, 0)
    Cyan: tuple[int, int, int] = (0, 255, 255)
    Magenta: tuple[int, int, int] = (255, 0, 255)

    # Other colours
    Purple: tuple[int, int, int] = (128, 0, 128)
    Orange: tuple[int, int, int] = (255, 165, 0)
44
+
45
+
46
class LayoutType(Enum):
    """Types of document layout element.

    Every member's value is a ``(number, name, color)`` tuple:

    - UNKNOWN: Default type for undefined or unrecognized elements, represented by purple.
    - TITLE: Represents the title of a document, represented by red.
    - TEXT: Represents plain text content within the document, represented by green.
    - FIGURE: Represents graphical or image content, represented by blue.
    - FIGURE_CAPTION: Represents captions for figures, represented by yellow.
    - TABLE: Represents tabular data, represented by cyan.
    - TABLE_CAPTION: Represents captions for tables, represented by magenta.
    - REFERENCE: Represents citation references within the document, also represented by purple.
    - FOOTER: Represents the footer of the document, represented by orange.
    """

    UNKNOWN = (0, "unknown", Color.Purple)
    TITLE = (1, "title", Color.Red)
    TEXT = (2, "text", Color.Green)
    FIGURE = (3, "figure", Color.Blue)
    FIGURE_CAPTION = (4, "figure_caption", Color.Yellow)
    TABLE = (5, "table", Color.Cyan)
    TABLE_CAPTION = (6, "table_caption", Color.Magenta)
    REFERENCE = (7, "reference", Color.Purple)
    FOOTER = (8, "footer", Color.Orange)

    def __str__(self) -> str:
        """Return the lowercase type name (second element of the value)."""
        return str(self.value[1])

    @property
    def number(self) -> int:
        """Numerical identifier of the layout type."""
        return self.value[0]

    @property
    def type(self) -> str:
        """Lowercase name of the layout type."""
        return self.value[1]

    @property
    def color(self) -> tuple[int, int, int]:
        """RGB colour associated with the layout type."""
        return self.value[2]

    @staticmethod
    def get_class_id(text: str) -> int:
        """Return the number of the layout type named ``text`` (case-insensitive).

        Unrecognized names are logged as a warning and map to 0, the number of UNKNOWN.
        """
        # Single lookup path: UNKNOWN.number is 0, the fallback this method has always returned.
        return LayoutType.get_type(text).number

    @staticmethod
    def get_type(text: str) -> LayoutType:
        """Return the member named ``text`` (case-insensitive).

        Unrecognized names, and non-string inputs such as ``None``, are logged
        as a warning and map to ``LayoutType.UNKNOWN``.
        """
        try:
            return LayoutType[text.upper()]
        except (KeyError, AttributeError):
            # AttributeError: `text` has no `.upper()`, e.g. a detection whose "type" is None.
            logging.warning(f"Invalid layout type {text}")
            return LayoutType.UNKNOWN

    @classmethod
    def get_annotation(cls) -> list[tuple[int, str, tuple[int, int, int]]]:
        """Return ``(number, name, color)`` for every member, in definition order."""
        return [(layout.number, layout.type, layout.color) for layout in cls]
107
+
108
+
109
class Layout:
    """Collects the layout regions detected on one page.

    The main purpose of this class is to:
      1. Keep track of the layout types (including type, numbering)
      2. Save the detections for each layout (text, img or table)
      3. Save the bounding box of each detected layout
      4. Generate the recovery text document (``recovery``, a Markdown string)
    """

    def __init__(self, show_unknown: bool = False):
        """Create an empty layout.

        Args:
            show_unknown: When True, regions of type UNKNOWN are also written
                to the recovery document.
        """
        # Per-type counter, used to number the regions of each type (text1, text2, ...).
        self.counts = {layout_type: 0 for layout_type in LayoutType}
        # Per-type list of region records (see `add` for the record keys).
        self.records: dict[LayoutType, Any] = {layout_type: [] for layout_type in LayoutType}
        self.recovery = ""
        self.show_unknown = show_unknown

    def add(
        self,
        layout_type: LayoutType,
        bounding_box: list[int],
        detections: Optional[Iterable[dict[str, Any]]] = None,
        table: Optional[str] = None,
        figure: Optional[dict[str, Any]] = None,
    ) -> None:
        """Record one detected region and append it to the recovery document.

        Args:
            layout_type: Type of the region. Anything that is not a
                ``LayoutType`` member is logged as a warning and ignored.
            bounding_box: Bounding box of the region, stored as given.
            detections: Text detections inside the region; each needs a
                ``"text"`` key when written to the recovery document.
            table: Markdown for the region, used when it is a TABLE.
            figure: Currently unused.
        """
        # `isinstance` rather than `in LayoutType`: before Python 3.12 the `in`
        # test raises TypeError for non-Enum values instead of returning False.
        if not isinstance(layout_type, LayoutType):
            logging.warning(f"Invalid layout type detected: {layout_type}")
            return

        # Materialize once: the detections are walked here and again by whoever
        # reads the record, which a one-shot iterable (or None) would not survive.
        detection_list = [] if detections is None else list(detections)

        self.counts[layout_type] += 1
        name = f"{layout_type}{self.counts[layout_type]}"
        logging.info(f"Saved layout type {layout_type} with name: {name}")
        self.records[layout_type].append({
            "type": layout_type,
            "name": name,
            "bounding_box": bounding_box,
            "detections": detection_list,
            "table": table,
        })

        # Discards the unknown layout types detections
        if layout_type == LayoutType.UNKNOWN and not self.show_unknown:
            return

        title = name.title()
        path = f"recording://Image/{layout_type.type.title()}/{title}"
        parts = [f"\n\n## [{title}]({path})\n\n"]  # Log Type as Heading
        # Enhancement - Logged image for Figure type TODO(#6517)
        if layout_type == LayoutType.TABLE:
            if table:
                parts.append(table)  # Log details (table)
        else:
            # Log details (text): one link per detection
            parts.extend(
                f' [{detection["text"]}]({path}/Detections/{index})'
                for index, detection in enumerate(detection_list)
            )
        self.recovery += "".join(parts)

    def get_count(self, layout_type: LayoutType) -> int:
        """Return how many regions of ``layout_type`` have been added.

        Raises:
            ValueError: If ``layout_type`` is not a ``LayoutType`` member.
        """
        if not isinstance(layout_type, LayoutType):
            raise ValueError("Invalid layout type")
        return self.counts[layout_type]

    def get_records(self) -> dict[LayoutType, list[dict[str, Any]]]:
        """Return the stored records, keyed by layout type."""
        return self.records

    def save_all_layouts(self, results: list[dict[str, Any]]) -> None:
        """Save every entry of ``results``, then log the per-type counts."""
        for line in results:
            self.save_layout_data(line)
        for layout_type in LayoutType:
            logging.info(f"Number of detections for type {layout_type}: {self.counts[layout_type]}")

    def save_layout_data(self, line: dict[str, Any]) -> None:
        """Convert one raw result dict into a record and add it.

        Reads ``line["type"]`` (default ``"empty"``) and ``line["bbox"]``
        (default ``[0, 0, 0, 0]``). Tables get their Markdown extracted; every
        other type gets its text detections extracted.
        """
        layout_type = LayoutType.get_type(line.get("type", "empty"))
        box = line.get("bbox", [0, 0, 0, 0])
        detections: list[dict[str, Any]] = []
        table, img = None, None
        if layout_type == LayoutType.TABLE:
            table = self.get_table_markdown(line)
        else:
            detections = self.get_detections(line)
            if layout_type == LayoutType.FIGURE:
                img = line.get("img")  # Currently not in use
        self.add(layout_type, box, detections=detections, table=table, figure=img)

    @staticmethod
    def get_detections(line: dict[str, Any]) -> list[dict[str, Any]]:
        """Extract the text detections listed under ``line["res"]``.

        Each detection becomes ``{"id", "text", "confidence", "box"}`` where
        ``box`` is ``[x_min, y_min, x_max, y_max]``, taken from the first and
        third points of the detection's ``"text_region"``.

        Returns:
            The detections, or an empty list when ``"res"`` is missing or None.
        """
        results = line.get("res")
        if results is None:
            return []

        detections = []
        for i, result in enumerate(results):
            region = result.get("text_region")
            x_min, y_min = region[0]
            x_max, y_max = region[2]
            detections.append({
                "id": i,
                "text": result.get("text"),
                "confidence": result.get("confidence"),
                "box": [x_min, y_min, x_max, y_max],
            })
        return detections

    # Safely attempt to extract the HTML table from the results
    @staticmethod
    def get_table_markdown(line: dict[str, Any]) -> str:
        """Convert the HTML table under ``line["res"]["html"]`` to Markdown.

        Returns:
            The Markdown of the first table found, or a short explanatory
            message when there is no table or the conversion fails.
        """
        from io import StringIO

        try:
            html_table = line.get("res", {}).get("html")
            if not html_table:
                return "No table found."

            # Wrap in StringIO: passing literal HTML to `read_html` is deprecated in pandas.
            dataframes = pd.read_html(StringIO(html_table))
            if not dataframes:
                return "No data extracted from the table."

            markdown_table = dataframes[0].to_markdown()
            return markdown_table  # type: ignore[no-any-return]

        except Exception as e:
            # Best effort: a table that cannot be parsed must not abort the page.
            return f"Error processing the table: {str(e)}"
220
+
221
+
222
def process_layout_records(log_queue: SimpleQueue[Any], layout: Layout, page_path: str) -> LayoutStructure:
    """Queue the log messages for every record of ``layout`` and collect blueprint inputs.

    For each record, in ``LayoutType`` order, this:
      * appends the exclusion patterns ``-<base>/**`` and ``-<base>/Detections/**``
        (``<base>`` being ``<page_path>/Image/<Type>/<Name>``) to the returned lists,
      * queues a ``"log"`` message with the record's bounding box,
      * calls ``log_detections`` and ``update_zoom_paths`` for the record.

    Args:
        log_queue: Queue receiving the ``["log", entity_path, components]`` messages.
        layout: Layout whose ``records`` are processed.
        page_path: Entity path prefix of the page.

    Returns:
        ``(paths, detections_paths, figure_views, table_views, text_views)``.
        The combined list of all zoom views is built locally but not returned.
    """
    paths: list[str] = []
    detections_paths: list[str] = []
    zoom_paths: list[rrb.Spatial2DView] = []
    zoom_paths_figures: list[rrb.Spatial2DView] = []
    zoom_paths_tables: list[rrb.Spatial2DView] = []
    zoom_paths_texts: list[rrb.Spatial2DView] = []

    for layout_type in LayoutType:
        for record in layout.records[layout_type]:
            record_name = record["name"].title()
            record_base_path = f"{page_path}/Image/{layout_type.type.title()}/{record_name}"
            paths.append(f"-{record_base_path}/**")
            detections_paths.append(f"-{record_base_path}/Detections/**")

            # Log bounding box
            bounding_box = rr.Boxes2D(
                array=record["bounding_box"],
                array_format=rr.Box2DFormat.XYXY,
                labels=[str(layout_type.type)],
                class_ids=[str(layout_type.number)],
            )
            record_label = rr.AnyValues(name=record_name)
            log_queue.put(["log", record_base_path, [bounding_box, record_label]])

            log_detections(log_queue, layout_type, record, record_base_path)

            # Prepare zoom path views
            update_zoom_paths(
                layout,
                layout_type,
                record,
                paths,
                page_path,
                zoom_paths,
                zoom_paths_figures,
                zoom_paths_tables,
                zoom_paths_texts,
            )

    return paths, detections_paths, zoom_paths_figures, zoom_paths_tables, zoom_paths_texts
267
+
268
+
269
def log_detections(
    log_queue: SimpleQueue[Any], layout_type: LayoutType, record: dict[str, Any], page_path: str
) -> None:
    """Queue the log messages for the content of one layout record.

    A TABLE record queues its Markdown as a text document at the entity path
    ``Extracted<name>``. Any other record queues one box per text detection at
    ``<page_path>/Detections/<id>``, together with the detection's id, text
    and confidence.

    Args:
        log_queue: Queue receiving the ``["log", entity_path, components]`` messages.
        layout_type: Type of the record.
        record: Record dict; reads ``"name"`` and ``"table"`` for tables,
            ``"detections"`` otherwise.
        page_path: Entity path prefix under which the detections are logged.
    """
    if layout_type == LayoutType.TABLE:
        log_queue.put([
            "log",
            f"Extracted{record['name']}",
            [rr.TextDocument(record["table"], media_type=rr.MediaType.MARKDOWN)],
        ])
        return

    # `or []` rather than a `.get` default: the key may be present with a None
    # value, which a default would not replace.
    for detection in record.get("detections") or []:
        log_queue.put([
            "log",
            f"{page_path}/Detections/{detection['id']}",
            [
                rr.Boxes2D(
                    array=detection["box"], array_format=rr.Box2DFormat.XYXY, class_ids=[str(layout_type.number)]
                ),
                rr.AnyValues(
                    DetectionID=detection["id"], Text=detection["text"], Confidence=detection["confidence"]
                ),
            ],
        ])
290
+
291
+
292
def update_zoom_paths(
    layout: Layout,
    layout_type: LayoutType,
    record: dict[str, Any],
    paths: list[str],
    page_path: str,
    zoom_paths: list[rrb.Spatial2DView],
    zoom_paths_figures: list[rrb.Spatial2DView],
    zoom_paths_tables: list[rrb.Spatial2DView],
    zoom_paths_texts: list[rrb.Spatial2DView],
) -> None:
    """Create a zoomed 2D view for a FIGURE, TABLE or TEXT record.

    The view shows ``<page_path>/Image/**`` plus a snapshot of ``paths`` taken
    at call time with the record's own exclusion pattern removed. Its visual
    bounds are the record's bounding box grown by 10 on every side. The view
    is appended to ``zoom_paths`` and to the list matching its type. Records
    of any other type are ignored.

    Args:
        layout: Layout the record belongs to.
        layout_type: Type of the record.
        record: Record dict; reads ``"name"`` and ``"bounding_box"``.
        paths: Exclusion patterns collected so far; must contain the record's own pattern.
        page_path: Entity path prefix of the page.
        zoom_paths: Receives every created view.
        zoom_paths_figures: Receives views of FIGURE records.
        zoom_paths_tables: Receives views of TABLE records.
        zoom_paths_texts: Receives views of TEXT records.
    """
    if layout_type not in (LayoutType.FIGURE, LayoutType.TABLE, LayoutType.TEXT):
        return

    # Exclusions of the other records: everything collected so far except this record's own pattern.
    other_exclusions = paths.copy()
    other_exclusions.remove(f"-{page_path}/Image/{layout_type.type.title()}/{record['name'].title()}/**")

    bbox = record["bounding_box"]
    bounds = rrb.VisualBounds2D(
        x_range=[bbox[0] - 10, bbox[2] + 10],
        y_range=[bbox[1] - 10, bbox[3] + 10],
    )

    # Add to zoom paths
    view = rrb.Spatial2DView(
        name=record["name"].title(),
        contents=[f"{page_path}/Image/**"] + other_exclusions,
        visual_bounds=bounds,
    )
    zoom_paths.append(view)

    # Add to type-specific zoom paths; the guard above leaves exactly these three types.
    views_by_type = {
        LayoutType.FIGURE: zoom_paths_figures,
        LayoutType.TABLE: zoom_paths_tables,
        LayoutType.TEXT: zoom_paths_texts,
    }
    views_by_type[layout_type].append(view)
324
+
325
+
326
def generate_blueprint(
    layouts: list[Layout],
    page_paths: list[str],
    processed_layouts: list[LayoutStructure],
) -> rrb.Blueprint:
    """Build a blueprint with one tab per page.

    Each page tab stacks two rows:
      * top: a "Layout" view, a "Detections" view and the "Recovery" text document;
      * bottom: one tab group each for "Figures", "Tables" and "Texts", skipping empty groups.

    The three input lists are walked in lockstep, so only as many pages as the
    shortest list are included.

    Args:
        layouts: Layout of each page (only its length influences the result).
        page_paths: Entity path prefix of each page, also used as the tab name.
        processed_layouts: Result of ``process_layout_records`` for each page.

    Returns:
        The blueprint, with panels collapsed.
    """
    page_tabs = []
    for _layout, page_path, processed_layout in zip(layouts, page_paths, processed_layouts):
        _paths, detections_paths, zoom_paths_figures, zoom_paths_tables, zoom_paths_texts = processed_layout

        zoom_views_by_section: dict[str, Any] = {
            "Figures": zoom_paths_figures,
            "Tables": zoom_paths_tables,
            "Texts": zoom_paths_texts,
        }
        section_tabs = [
            rrb.Tabs(*views, name=section)  # type: ignore[arg-type]
            for section, views in zoom_views_by_section.items()
            if views
        ]

        page_tabs.append(
            rrb.Vertical(
                rrb.Horizontal(
                    rrb.Spatial2DView(
                        name="Layout",
                        origin=f"{page_path}/Image/",
                        contents=[f"{page_path}/Image/**"] + detections_paths,
                    ),
                    rrb.Spatial2DView(name="Detections", contents=[f"{page_path}/Image/**"]),
                    rrb.TextDocumentView(name="Recovery", contents=f"{page_path}/Recovery"),
                ),
                rrb.Horizontal(*section_tabs),
                name=page_path,
                row_shares=[4, 3],
            )
        )

    return rrb.Blueprint(
        rrb.Tabs(*page_tabs),
        collapse_panels=True,
    )
367
+
368
+
369
def detect_and_log_layouts(log_queue: SimpleQueue[Any], file_path: str) -> None:
    """Run layout detection on every page of ``file_path`` and queue the results.

    A path ending in ``.pdf`` (any letter case) is converted to one image per
    page with ``pdf2image``; any other path is read as a single image. Pages
    are named ``page_1``, ``page_2``, ... After each page is processed, a
    ``["blueprint", blueprint]`` message covering the pages processed so far
    is put on the queue.

    Args:
        log_queue: Queue receiving the ``"log"`` and ``"blueprint"`` messages.
        file_path: Path of the image or PDF to analyse.

    Raises:
        ValueError: If the file is not a PDF and cannot be read as an image.
    """
    # NOTE(review): the source indentation was lost; the blueprint is assumed to
    # be regenerated inside the page loop (once per page) - confirm.
    images: list[npt.NDArray[np.uint8]] = []
    if file_path.lower().endswith(".pdf"):
        # convert pdf to images
        images.extend(np.array(page, dtype=np.uint8) for page in pdf2image.convert_from_path(file_path))
    else:
        # read image
        bgr_image = cv2.imread(file_path)
        if bgr_image is None:
            # `imread` signals an unreadable file by returning None instead of raising.
            raise ValueError(f"Could not read image file: {file_path}")
        coloured_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        images.append(coloured_image.astype(np.uint8))

    # Extract the layout from each image
    layouts: list[Layout] = []
    page_paths = [f"page_{i + 1}" for i in range(len(images))]
    processed_layouts: list[LayoutStructure] = []
    for image, page_path in zip(images, page_paths):
        layout = detect_and_log_layout(log_queue, image, page_path)
        layouts.append(layout)

        # Generate and send a blueprint based on the detected layouts
        processed_layouts.append(process_layout_records(log_queue, layout, page_path))
        logging.info("Sending blueprint...")
        blueprint = generate_blueprint(layouts, page_paths, processed_layouts)
        log_queue.put(["blueprint", blueprint])
        logging.info("Blueprint sent...")
399
+
400
+
401
def detect_and_log_layout(log_queue: SimpleQueue, coloured_image: npt.NDArray[np.uint8], page_path: str = "") -> Layout:
    """Detect the layout of one page image and queue its image and recovery text.

    Queues, in order: the image at ``<page_path>/Image``, the static annotation
    context for all layout types at the same path, and - once detection has
    finished - the recovery Markdown document at ``<page_path>/Recovery``.

    Args:
        log_queue: Queue receiving the ``"log"`` messages.
        coloured_image: Page image; its shape must unpack into three dimensions.
        page_path: Entity path prefix of the page.

    Returns:
        The ``Layout`` filled with the detected regions.
    """
    image_entity = f"{page_path}/Image"

    # Layout Object - This will contain the detected layouts and their detections
    page_layout = Layout()

    # Log Image and add Annotation Context
    log_queue.put(["log", image_entity, [rr.Image(coloured_image)]])
    # The annotation is defined in the Layout class based on its properties
    annotation_context = rr.AnnotationContext(LayoutType.get_annotation())
    log_queue.put(["log", image_entity, [annotation_context], {"static": True}])

    # Paddle Model - Getting Predictions
    logging.info("Start detection... (It usually takes more than 10-20 seconds per page)")
    structure_model = PPStructure(show_log=False, recovery=True)
    logging.info("model loaded")
    raw_results = structure_model(coloured_image)
    _, image_width, _ = coloured_image.shape
    ordered_results = sorted_layout_boxes(raw_results, image_width)
    logging.info("Detection finished...")

    # Add results to the layout
    page_layout.save_all_layouts(ordered_results)
    logging.info("All results are saved...")

    # Recovery Text Document for the detected text
    recovery_document = rr.TextDocument(page_layout.recovery, media_type=rr.MediaType.MARKDOWN)
    log_queue.put(["log", f"{page_path}/Recovery", [recovery_document]])

    return page_layout
pyproject.toml CHANGED
@@ -1,5 +1,3 @@
1
- # Copied from https://github.com/rerun-io/rerun_template
2
-
3
  [tool.ruff]
4
  # https://beta.ruff.rs/docs/configuration/
5
 
 
 
 
1
  [tool.ruff]
2
  # https://beta.ruff.rs/docs/configuration/
3
 
requirements.txt CHANGED
@@ -2,4 +2,12 @@ rerun-sdk>=0.16.0,<0.17.0
2
  spaces
3
  gradio
4
  gradio_rerun
5
- git+https://github.com/rerun-io/rerun.git@c5f817686c6a1b8723ddeee25a9aed0ff4fa1e92#egg=ocr&subdirectory=examples/python/ocr
 
 
 
 
 
 
 
 
 
2
  spaces
3
  gradio
4
  gradio_rerun
5
+ fastapi
6
+ pdf2image
7
+ opencv-python
8
+ paddleclas
9
+ paddleocr
10
+ paddlepaddle
11
+ pandas
12
+ setuptools
13
+ tabulate