AlhitawiMohammed22 committed on
Commit
ff135d3
1 Parent(s): 9002e70

Create Builder Script

Browse files
Files changed (1) hide show
  1. builder.py +305 -0
builder.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Copyright (C) 2021, Mindee.
3
+
4
+ # This program is licensed under the Apache License version 2.
5
+ # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
6
+
7
+
8
+ from typing import Any, Dict, List, Tuple
9
+ import pandas as pd
10
+
11
+ import numpy as np
12
+ from scipy.cluster.hierarchy import fclusterdata
13
+
14
+ from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes
15
+ from doctr.utils.repr import NestedObject
16
+
17
+ __all__ = ['DocumentBuilder']
18
+
19
+
20
class DocumentBuilder(NestedObject):
    """Groups detected words into lines and blocks and exports them as tabular rows

    Args:
        resolve_lines: whether words should be automatically grouped into lines
        resolve_blocks: whether lines should be automatically grouped into blocks
        paragraph_break: relative length of the minimum space separating paragraphs
        export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle
            box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is.
    """

    def __init__(
        self,
        resolve_lines: bool = True,
        resolve_blocks: bool = True,
        paragraph_break: float = 0.035,
        export_as_straight_boxes: bool = False,
    ) -> None:

        self.resolve_lines = resolve_lines
        self.resolve_blocks = resolve_blocks
        self.paragraph_break = paragraph_break
        self.export_as_straight_boxes = export_as_straight_boxes

    @staticmethod
    def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Sort bounding boxes from top to bottom, left to right

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)
        Returns:
            tuple: indices of ordered boxes of shape (N,), boxes
                If straight boxes are passed to the function, boxes are unchanged
                else: boxes returned are straight boxes fitted to the straightened rotated boxes
                so that we fit the lines afterwards to the straightened page
        """
        if boxes.ndim == 3:
            # Undo the estimated page rotation, then fit an axis-aligned box around each polygon
            boxes = rotate_boxes(
                loc_preds=boxes,
                angle=-estimate_page_angle(boxes),
                orig_shape=(1024, 1024),
                min_angle=5.,
            )
            boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
        # Ordering key: xmin + 2 * ymax / (median box height)
        sort_key = boxes[:, 0] + 2 * boxes[:, 3] / np.median(boxes[:, 3] - boxes[:, 1])
        return sort_key.argsort(), boxes

    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
        """Split a line in sub_lines

        Args:
            boxes: bounding boxes of shape (N, 4)
            word_idcs: list of indexes for the words of the line
        Returns:
            A list of (sub-)lines computed from the original line (words)
        """
        # Sort words horizontally (by xmin)
        ordered = [word_idcs[pos] for pos in boxes[word_idcs, 0].argsort().tolist()]

        # Nothing to split with fewer than two words
        if len(ordered) < 2:
            return [ordered]

        lines = []
        sub_line = [ordered[0]]
        for idx in ordered[1:]:
            # Horizontal gap between the previous word's xmax and this word's xmin
            gap = boxes[idx, 0] - boxes[sub_line[-1], 2]
            if gap < self.paragraph_break:
                # Gap smaller than the paragraph break: same sub-line
                sub_line.append(idx)
            else:
                # Otherwise close the current sub-line and start a new one
                lines.append(sub_line)
                sub_line = [idx]
        lines.append(sub_line)

        return lines

    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
        """Order boxes to group them in lines

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox
                (must contain at least one box, the first sorted box seeds the first line)
        Returns:
            nested list of box indices
        """

        # Sort boxes, and straighten the boxes if they are rotated
        idxs, boxes = self._sort_boxes(boxes)

        # Compute median for boxes heights
        y_med = np.median(boxes[:, 3] - boxes[:, 1])

        lines = []
        words = [idxs[0]]  # Assign the top-left word to the first line
        # Running sum of the y-centers of the current line (divided by len(words) to get the mean)
        y_center_sum = boxes[idxs[0]][[1, 3]].mean()

        for idx in idxs[1:]:
            y_center = boxes[idx][[1, 3]].mean()
            # If y-center of the box is close enough to mean y-center of the line, same line
            same_line = abs(y_center - y_center_sum / len(words)) < y_med / 2

            if not same_line:
                # Compute sub-lines (horizontal split) and start a new line
                lines.extend(self._resolve_sub_lines(boxes, words))
                words = []
                y_center_sum = 0

            words.append(idx)
            y_center_sum += y_center

        # Use the remaining words to form the last(s) line(s)
        if words:
            # Compute sub-lines (horizontal split)
            lines.extend(self._resolve_sub_lines(boxes, words))

        return lines

    @staticmethod
    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
        """Order lines to group them in blocks

        Args:
            boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
            lines: list of lines, each line is a list of idx
        Returns:
            nested list of box indices
        """
        # Resolve enclosing boxes of lines
        if boxes.ndim == 3:
            box_lines = np.asarray([
                resolve_enclosing_rbbox([tuple(boxes[idx, :, :]) for idx in line])
                for line in lines  # type: ignore[misc]
            ])
        else:
            _box_lines = [
                resolve_enclosing_bbox([
                    (tuple(boxes[idx, :2]), tuple(boxes[idx, 2:])) for idx in line  # type: ignore[misc]
                ])
                for line in lines
            ]
            box_lines = np.asarray([(x1, y1, x2, y2) for ((x1, y1), (x2, y2)) in _box_lines])

        # Compute geometrical features of lines to clusterize
        # Clusterizing only with box centers yield to poor results for complex documents
        if boxes.ndim == 3:
            box_features = np.stack(
                (
                    (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2,
                    (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2,
                    (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2,
                ), axis=-1
            )
        else:
            box_features = np.stack(
                (
                    (box_lines[:, 0] + box_lines[:, 3]) / 2,
                    (box_lines[:, 1] + box_lines[:, 2]) / 2,
                    (box_lines[:, 0] + box_lines[:, 2]) / 2,
                    (box_lines[:, 1] + box_lines[:, 3]) / 2,
                    box_lines[:, 0],
                    box_lines[:, 1],
                ), axis=-1
            )
        # Compute clusters
        clusters = fclusterdata(
            box_features, t=0.1, depth=4, criterion='distance', metric='euclidean')

        # Form clusters: map each cluster label to the indices of its lines,
        # keeping clusters in order of first appearance
        _blocks: Dict[int, List[int]] = {}
        for line_idx, cluster_idx in enumerate(clusters):
            _blocks.setdefault(cluster_idx, []).append(line_idx)

        # Retrieve word-box level to return a fully nested structure
        blocks = [[lines[idx] for idx in block] for block in _blocks.values()]

        return blocks

    def _build_blocks(
        self,
        boxes: np.ndarray,
        word_preds: List[Tuple[str, float]],
        page_shapes: Tuple[int, int],
    ) -> List[Tuple[Any, ...]]:
        """Gather independent words in structured blocks and flatten them into rows

        Args:
            boxes: bounding boxes of all detected words of the page. The row export reads
                columns 0-3 as relative (xmin, ymin, xmax, ymax) and column 4 as a score, i.e.
                an array of shape (N, 5).
                NOTE(review): (N, 4, 2) rotated boxes are accepted by the line/block resolution
                but the row export below indexes `boxes[idx, 4]`, which such an array does not
                have - confirm whether rotated input is meant to be supported here.
            word_preds: list of all detected words of the page, of shape N
            page_shapes: shape of this single page, unpacked as (height, width)
        Returns:
            list of rows (block index, line index, word index in the line, word prediction,
            xmin, ymin, xmax, ymax in pixels, score scaled by 100), one per word
        Raises:
            ValueError: if the number of boxes differs from the number of word predictions
        """

        if boxes.shape[0] != len(word_preds):
            raise ValueError(
                f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}")

        if boxes.shape[0] == 0:
            return []

        # Geometry only: drop any trailing column of straight boxes, keep polygons as they are
        geoms = boxes if boxes.ndim == 3 else boxes[:, :4]

        # Decide whether we try to form lines
        if self.resolve_lines:
            lines = self._resolve_lines(geoms)
            # Decide whether we try to form blocks
            if self.resolve_blocks and len(lines) > 1:
                _blocks = self._resolve_blocks(geoms, lines)
            else:
                _blocks = [lines]
        else:
            # Sort bounding boxes, one line for all boxes, one block for the line
            lines = [self._sort_boxes(geoms)[0]]
            _blocks = [lines]

        # Page size is constant for the whole page: unpack it once
        h, w = page_shapes

        rows = []
        for block_idx, block_lines in enumerate(_blocks):
            for line_idx, line in enumerate(block_lines):
                for word_idx, idx in enumerate(line):
                    rows.append((
                        block_idx, line_idx, word_idx, word_preds[idx],
                        # Relative coordinates scaled to pixels
                        int(round(boxes[idx, 0] * w)), int(round(boxes[idx, 1] * h)),
                        int(round(boxes[idx, 2] * w)), int(round(boxes[idx, 3] * h)),
                        # Score expressed as an integer percentage
                        int(round(boxes[idx, 4] * 100)),
                    ))

        return rows

    def extra_repr(self) -> str:
        return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
                f"paragraph_break={self.paragraph_break}, "
                f"export_as_straight_boxes={self.export_as_straight_boxes}")

    def __call__(
        self,
        boxes: List[np.ndarray],
        text_preds: List[List[Tuple[str, float]]],
        page_shapes: List[Tuple[int, int]]
    ) -> List[pd.DataFrame]:
        """Re-arrange detected words into structured blocks, one table per page

        Args:
            boxes: list of N elements, where each element represents the localization predictions
                for all words of a given page (see `_build_blocks` for the columns that are read)
            text_preds: list of N elements, where each element is the list of all word prediction
                of the page
            page_shapes: shape of each page, of size N, each unpacked as (height, width)
        Returns:
            list of N DataFrames (one per page) with columns block_num, line_num, word_num, word,
            xmin, ymin, xmax, ymax, confidence_score
        Raises:
            ValueError: if the three arguments do not have the same length
        """
        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
            raise ValueError(
                "All arguments are expected to be lists of the same size")

        if self.export_as_straight_boxes and len(boxes) > 0:
            # If boxes are already straight OK, else fit a bounding rect
            # NOTE(review): the fitted rectangles have 4 columns only, while `_build_blocks`
            # reads a 5th one - confirm this path is exercised with rotated input.
            if boxes[0].ndim == 3:
                boxes = [
                    np.concatenate((p_boxes.min(1), p_boxes.max(1)), 1)
                    for p_boxes in boxes
                ]

        columns = [
            "block_num", "line_num", "word_num", "word", "xmin", "ymin", "xmax", "ymax", "confidence_score"
        ]
        _pages = [
            pd.DataFrame.from_records(self._build_blocks(page_boxes, word_preds, shape), columns=columns)
            for shape, page_boxes, word_preds in zip(page_shapes, boxes, text_preds)
        ]

        return _pages