JaMe76 committed on
Commit
397d15f
•
1 Parent(s): 04c7117

update space

Browse files
Files changed (2) hide show
  1. app.py +120 -83
  2. conf_dd_one.yaml +57 -18
app.py CHANGED
@@ -1,27 +1,33 @@
1
  import os
2
  os.system('pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
3
 
 
 
4
  # work around: https://discuss.huggingface.co/t/how-to-install-a-specific-version-of-gradio-in-spaces/13552
5
  os.system("pip uninstall -y gradio")
6
  os.system("pip install gradio==3.4.1")
 
7
 
8
  from os import getcwd, path, environ
9
  import deepdoctection as dd
10
  from deepdoctection.dataflow.serialize import DataFromList
11
 
 
 
 
12
  import gradio as gr
13
 
14
 
15
  _DD_ONE = "conf_dd_one.yaml"
16
- _DETECTIONS = ["table", "ocr"]
17
 
18
- dd.ModelCatalog.register("layout/model_final_inf_only.pt",dd.ModelProfile(
19
- name="layout/model_final_inf_only.pt",
20
- description="Detectron2 layout detection model trained on private datasets",
21
- config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
22
  size=[274632215],
23
  tp_model=False,
24
- hf_repo_id=environ.get("HF_REPO"),
25
  hf_model_name="model_final_inf_only.pt",
26
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
27
  categories={"1": dd.LayoutType.text,
@@ -29,6 +35,33 @@ dd.ModelCatalog.register("layout/model_final_inf_only.pt",dd.ModelProfile(
29
  "3": dd.LayoutType.list,
30
  "4": dd.LayoutType.table,
31
  "5": dd.LayoutType.figure},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  ))
33
 
34
  # Set up of the configuration and logging. Models are globally defined, so that they are not re-loaded once the input
@@ -60,26 +93,30 @@ categories_item = dd.ModelCatalog.get_profile(cfg.WEIGHTS.D2ITEM).categories
60
  assert categories_item is not None
61
  d_item = dd.D2FrcnnDetector(item_config_path, item_weights_path, categories_item, device=cfg.DEVICE)
62
 
63
- # word detector
64
- det = dd.DoctrTextlineDetector()
65
 
66
- # text recognizer
67
- rec = dd.DoctrTextRecognizer()
68
 
69
 
70
- def build_gradio_analyzer(table, table_ref, ocr):
71
  """Building the Detectron2/DocTr analyzer based on the given config"""
72
 
73
  cfg.freeze(freezed=False)
74
- cfg.TAB = table
75
- cfg.TAB_REF = table_ref
76
- cfg.OCR = ocr
77
  cfg.freeze()
78
 
79
  pipe_component_list = []
80
  layout = dd.ImageLayoutService(d_layout, to_image=True, crop_image=True)
81
  pipe_component_list.append(layout)
82
 
 
 
 
 
83
  if cfg.TAB:
84
 
85
  detect_result_generator = dd.DetectResultGenerator(categories_cell)
@@ -92,15 +129,12 @@ def build_gradio_analyzer(table, table_ref, ocr):
92
 
93
  table_segmentation = dd.TableSegmentationService(
94
  cfg.SEGMENTATION.ASSIGNMENT_RULE,
95
- cfg.SEGMENTATION.IOU_THRESHOLD_ROWS
96
- if cfg.SEGMENTATION.ASSIGNMENT_RULE in ["iou"]
97
- else cfg.SEGMENTATION.IOA_THRESHOLD_ROWS,
98
- cfg.SEGMENTATION.IOU_THRESHOLD_COLS
99
- if cfg.SEGMENTATION.ASSIGNMENT_RULE in ["iou"]
100
- else cfg.SEGMENTATION.IOA_THRESHOLD_COLS,
101
  cfg.SEGMENTATION.FULL_TABLE_TILING,
102
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
103
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
 
104
  )
105
  pipe_component_list.append(table_segmentation)
106
 
@@ -109,69 +143,43 @@ def build_gradio_analyzer(table, table_ref, ocr):
109
  pipe_component_list.append(table_segmentation_refinement)
110
 
111
  if cfg.OCR:
112
- d_layout_text = dd.ImageLayoutService(det, to_image=True, crop_image=True)
113
- pipe_component_list.append(d_layout_text)
114
 
115
- d_text = dd.TextExtractionService(rec, extract_from_roi="WORD")
116
  pipe_component_list.append(d_text)
117
 
118
- match = dd.MatchingService(
 
 
 
119
  parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
120
- child_categories=dd.LayoutType.word,
121
  matching_rule=cfg.WORD_MATCHING.RULE,
122
- threshold=cfg.WORD_MATCHING.IOU_THRESHOLD
123
- if cfg.WORD_MATCHING.RULE in ["iou"]
124
- else cfg.WORD_MATCHING.IOA_THRESHOLD,
125
  )
126
- pipe_component_list.append(match)
 
127
  order = dd.TextOrderService(
128
- text_container=dd.LayoutType.word,
129
- floating_text_block_names=[dd.LayoutType.title, dd.LayoutType.text, dd.LayoutType.list],
130
- text_block_names=[
131
- dd.LayoutType.title,
132
- dd.LayoutType.text,
133
- dd.LayoutType.list,
134
- dd.LayoutType.cell,
135
- dd.CellType.header,
136
- dd.CellType.body,
137
- ],
138
  )
139
  pipe_component_list.append(order)
140
 
141
  pipe = dd.DoctectionPipe(pipeline_component_list=pipe_component_list)
142
 
143
- return pipe
144
-
145
-
146
- def prepare_output(dp, add_table, add_ocr):
147
- out = dp.as_dict()
148
- out.pop("_image")
149
 
150
- layout_items = dp.layouts
151
- if add_ocr:
152
- layout_items.sort(key=lambda x: x.reading_order)
153
- layout_items_str = ""
154
- for item in layout_items:
155
- layout_items_str += f"\n {item.category_name}: {item.text}"
156
- if add_table:
157
- html_list = [table.html for table in dp.tables]
158
- if html_list:
159
- html = ("\n").join(html_list)
160
- else:
161
- html = None
162
- else:
163
- html = None
164
-
165
- return dp.viz(show_table_structure=False), layout_items_str, html, out
166
 
167
 
168
- def analyze_image(img, pdf, attributes):
169
 
170
  # creating an image object and passing to the analyzer by using dataflows
171
- add_table = _DETECTIONS[0] in attributes
172
- add_ocr = _DETECTIONS[1] in attributes
173
-
174
- analyzer = build_gradio_analyzer(add_table, add_table, add_ocr)
175
 
176
  if img is not None:
177
  image = dd.Image(file_name="input.png", location="")
@@ -180,20 +188,39 @@ def analyze_image(img, pdf, attributes):
180
  df = DataFromList(lst=[image])
181
  df = analyzer.analyze(dataset_dataflow=df)
182
  elif pdf:
183
- df = analyzer.analyze(path=pdf.name, max_datapoints=3)
184
  else:
185
  raise ValueError
186
 
187
  df.reset_state()
188
- df_iter = iter(df)
189
 
190
- dp = next(df_iter)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- return prepare_output(dp, add_table, add_ocr)
193
 
194
 
195
  demo = gr.Blocks(css="scrollbar.css")
196
 
 
197
  with demo:
198
  with gr.Box():
199
  gr.Markdown("<h1><center>deepdoctection - A Document AI Package</center></h1>")
@@ -201,8 +228,11 @@ with demo:
201
  " and document layout analysis tasks using deep learning models. It does not implement models"
202
  " but enables you to build pipelines using highly acknowledged libraries for object detection,"
203
  " OCR and selected NLP tasks and provides an integrated frameworks for fine-tuning, evaluating"
204
- " and running models.\n This pipeline consists of a stack of models powered by <strong>Detectron2"
205
- "</strong> for layout analysis and table recognition and <strong>DocTr</strong> for OCR.")
 
 
 
206
  with gr.Box():
207
  gr.Markdown("<h2><center>Upload a document and choose setting</center></h2>")
208
  with gr.Row():
@@ -221,8 +251,9 @@ with demo:
221
  gr.Examples(examples=[path.join(getcwd(), "sample_3.pdf")], inputs = inputs_pdf)
222
 
223
  with gr.Row():
224
- tok_input = gr.CheckboxGroup(
225
- _DETECTIONS, value=_DETECTIONS, label="Additional extractions", interactive=True)
 
226
  with gr.Row():
227
  btn = gr.Button("Run model", variant="primary")
228
 
@@ -233,17 +264,23 @@ with demo:
233
  with gr.Box():
234
  gr.Markdown("<center><strong>Contiguous text</strong></center>")
235
  image_text = gr.Textbox()
236
- with gr.Box():
237
- gr.Markdown("<center><strong>Table</strong></center>")
238
- html = gr.HTML()
239
- with gr.Box():
240
- gr.Markdown("<center><strong>JSON</strong></center>")
241
- json = gr.JSON()
242
  with gr.Column():
243
  with gr.Box():
244
  gr.Markdown("<center><strong>Layout detection</strong></center>")
245
- image_output = gr.Image(type="numpy", label="Output Image")
 
 
 
 
 
 
 
 
 
 
 
246
 
247
- btn.click(fn=analyze_image, inputs=[inputs, inputs_pdf, tok_input], outputs=[image_output, image_text, html, json])
 
248
 
249
- demo.launch()
1
  import os
2
  os.system('pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
3
 
4
+ credentials_kwargs={"aws_access_key_id": os.environ["ACCESS_KEY"],"aws_secret_access_key": os.environ["SECRET_KEY"]}
5
+
6
  # work around: https://discuss.huggingface.co/t/how-to-install-a-specific-version-of-gradio-in-spaces/13552
7
  os.system("pip uninstall -y gradio")
8
  os.system("pip install gradio==3.4.1")
9
+ os.system(os.environ["DD_ADDONS"])
10
 
11
  from os import getcwd, path, environ
12
  import deepdoctection as dd
13
  from deepdoctection.dataflow.serialize import DataFromList
14
 
15
+ from dd_addons.extern import PdfTextDetector, PostProcessor, get_xsl_path
16
+ from dd_addons.pipe.conn import PostProcessorService
17
+
18
  import gradio as gr
19
 
20
 
21
  _DD_ONE = "conf_dd_one.yaml"
22
+ _XSL_PATH = get_xsl_path()
23
 
24
+ dd.ModelCatalog.register("xrf_layout/model_final_inf_only.pt",dd.ModelProfile(
25
+ name="xrf_layout/model_final_inf_only.pt",
26
+ description="layout_detection/morning-dragon-114",
27
+ config="xrf_dd/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
28
  size=[274632215],
29
  tp_model=False,
30
+ hf_repo_id=environ.get("HF_REPO_LAYOUT"),
31
  hf_model_name="model_final_inf_only.pt",
32
  hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
33
  categories={"1": dd.LayoutType.text,
35
  "3": dd.LayoutType.list,
36
  "4": dd.LayoutType.table,
37
  "5": dd.LayoutType.figure},
38
+ model_wrapper="D2FrcnnDetector",
39
+ ))
40
+
41
+ dd.ModelCatalog.register("xrf_cell/model_final_inf_only.pt", dd.ModelProfile(
42
+ name="xrf_cell/model_final_inf_only.pt",
43
+ description="cell_detection/restful-eon-6",
44
+ config="xrf_dd/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
45
+ size=[274583063],
46
+ tp_model=False,
47
+ hf_repo_id=environ.get("HF_REPO_CELL"),
48
+ hf_model_name="model_final_inf_only.pt",
49
+ hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
50
+ categories={"1": dd.LayoutType.cell},
51
+ model_wrapper="D2FrcnnDetector",
52
+ ))
53
+
54
+ dd.ModelCatalog.register("xrf_item/model_final_inf_only.pt", dd.ModelProfile(
55
+ name="xrf_item/model_final_inf_only.pt",
56
+ description="item_detection/firm_plasma_14",
57
+ config="xrf_dd/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
58
+ size=[274595351],
59
+ tp_model=False,
60
+ hf_repo_id=environ.get("HF_REPO_ITEM"),
61
+ hf_model_name="model_final_inf_only.pt",
62
+ hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
63
+ categories={"1": dd.LayoutType.row, "2": dd.LayoutType.column},
64
+ model_wrapper="D2FrcnnDetector",
65
  ))
66
 
67
  # Set up of the configuration and logging. Models are globally defined, so that they are not re-loaded once the input
93
  assert categories_item is not None
94
  d_item = dd.D2FrcnnDetector(item_config_path, item_weights_path, categories_item, device=cfg.DEVICE)
95
 
96
+ # pdf miner
97
+ pdf_text = PdfTextDetector(_XSL_PATH)
98
 
99
+ # text detector
100
+ tex_text = dd.TextractOcrDetector(**credentials_kwargs)
101
 
102
 
103
+ def build_gradio_analyzer():
104
  """Building the Detectron2/DocTr analyzer based on the given config"""
105
 
106
  cfg.freeze(freezed=False)
107
+ cfg.TAB = True
108
+ cfg.TAB_REF = True
109
+ cfg.OCR = True
110
  cfg.freeze()
111
 
112
  pipe_component_list = []
113
  layout = dd.ImageLayoutService(d_layout, to_image=True, crop_image=True)
114
  pipe_component_list.append(layout)
115
 
116
+ nms_service = dd.AnnotationNmsService(nms_pairs=cfg.LAYOUT_NMS_PAIRS.COMBINATIONS,
117
+ thresholds=cfg.LAYOUT_NMS_PAIRS.THRESHOLDS)
118
+ pipe_component_list.append(nms_service)
119
+
120
  if cfg.TAB:
121
 
122
  detect_result_generator = dd.DetectResultGenerator(categories_cell)
129
 
130
  table_segmentation = dd.TableSegmentationService(
131
  cfg.SEGMENTATION.ASSIGNMENT_RULE,
132
+ cfg.SEGMENTATION.THRESHOLD_ROWS,
133
+ cfg.SEGMENTATION.THRESHOLD_COLS,
 
 
 
 
134
  cfg.SEGMENTATION.FULL_TABLE_TILING,
135
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
136
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
137
+ cfg.SEGMENTATION.STRETCH_RULE
138
  )
139
  pipe_component_list.append(table_segmentation)
140
 
143
  pipe_component_list.append(table_segmentation_refinement)
144
 
145
  if cfg.OCR:
 
 
146
 
147
+ d_text = dd.TextExtractionService(pdf_text)
148
  pipe_component_list.append(d_text)
149
 
150
+ t_text = dd.TextExtractionService(tex_text,skip_if_text_extracted=True)
151
+ pipe_component_list.append(t_text)
152
+
153
+ match_words = dd.MatchingService(
154
  parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
155
+ child_categories=cfg.WORD_MATCHING.CHILD_CATEGORIES,
156
  matching_rule=cfg.WORD_MATCHING.RULE,
157
+ threshold=cfg.WORD_MATCHING.THRESHOLD,
158
+ max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY
 
159
  )
160
+ pipe_component_list.append(match_words)
161
+
162
  order = dd.TextOrderService(
163
+ text_container=cfg.TEXT_ORDERING.TEXT_CONTAINER,
164
+ floating_text_block_names=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK,
165
+ text_block_names=cfg.TEXT_ORDERING.TEXT_BLOCK,
166
+ text_containers_to_text_block=cfg.TEXT_ORDERING.TEXT_CONTAINER_TO_TEXT_BLOCK
 
 
 
 
 
 
167
  )
168
  pipe_component_list.append(order)
169
 
170
  pipe = dd.DoctectionPipe(pipeline_component_list=pipe_component_list)
171
 
172
+ post_processor = PostProcessor("deepdoctection", **credentials_kwargs)
173
+ post_service = PostProcessorService(post_processor)
174
+ pipe_component_list.append(post_service)
 
 
 
175
 
176
+ return pipe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
 
179
+ def analyze_image(img, pdf, max_datapoints):
180
 
181
  # creating an image object and passing to the analyzer by using dataflows
182
+ analyzer = build_gradio_analyzer()
 
 
 
183
 
184
  if img is not None:
185
  image = dd.Image(file_name="input.png", location="")
188
  df = DataFromList(lst=[image])
189
  df = analyzer.analyze(dataset_dataflow=df)
190
  elif pdf:
191
+ df = analyzer.analyze(path=pdf.name, max_datapoints=max_datapoints)
192
  else:
193
  raise ValueError
194
 
195
  df.reset_state()
 
196
 
197
+ layout_items_str = ""
198
+ jsonl_out = []
199
+ dpts = []
200
+ html_list = []
201
+
202
+ for dp in df:
203
+ dpts.append(dp)
204
+ out = dp.as_dict()
205
+ jsonl_out.append(out)
206
+ out.pop("_image")
207
+ layout_items = dp.layouts
208
+ layout_items.sort(key=lambda x: x.reading_order)
209
+ layout_items_str += f"\n\n -------- PAGE NUMBER: {dp.page_number+1} ------------- \n"
210
+ for item in layout_items:
211
+ layout_items_str += f"\n {item.category_name}: {item.text}"
212
+ html_list.extend([table.html for table in dp.tables])
213
+ if html_list:
214
+ html = ("<br /><br /><br />").join(html_list)
215
+ else:
216
+ html = None
217
 
218
+ return [dp.viz(show_cells=False) for dp in dpts], layout_items_str, html, jsonl_out
219
 
220
 
221
  demo = gr.Blocks(css="scrollbar.css")
222
 
223
+
224
  with demo:
225
  with gr.Box():
226
  gr.Markdown("<h1><center>deepdoctection - A Document AI Package</center></h1>")
228
  " and document layout analysis tasks using deep learning models. It does not implement models"
229
  " but enables you to build pipelines using highly acknowledged libraries for object detection,"
230
  " OCR and selected NLP tasks and provides an integrated frameworks for fine-tuning, evaluating"
231
+ " and running models.<br />"
232
+ "This pipeline consists of a stack of models powered by <strong>Detectron2"
233
+ "</strong> for layout analysis and table recognition. OCR will be provided as well. You can process"
234
+ "an image or even a PDF-document. Up to nine pages can be processed. <br />")
235
+ gr.Markdown("[https://github.com/deepdoctection/deepdoctection](https://github.com/deepdoctection/deepdoctection)")
236
  with gr.Box():
237
  gr.Markdown("<h2><center>Upload a document and choose setting</center></h2>")
238
  with gr.Row():
251
  gr.Examples(examples=[path.join(getcwd(), "sample_3.pdf")], inputs = inputs_pdf)
252
 
253
  with gr.Row():
254
+ max_imgs = gr.Slider(1, 8, value=2, step=1, label="Number of pages in multi page PDF",
255
+ info="Will stop after 9 pages")
256
+
257
  with gr.Row():
258
  btn = gr.Button("Run model", variant="primary")
259
 
264
  with gr.Box():
265
  gr.Markdown("<center><strong>Contiguous text</strong></center>")
266
  image_text = gr.Textbox()
 
 
 
 
 
 
267
  with gr.Column():
268
  with gr.Box():
269
  gr.Markdown("<center><strong>Layout detection</strong></center>")
270
+ gallery = gr.Gallery(
271
+ label="Output images", show_label=False, elem_id="gallery"
272
+ ).style(grid=2)
273
+ with gr.Row():
274
+ with gr.Box():
275
+ gr.Markdown("<center><strong>Table</strong></center>")
276
+ html = gr.HTML()
277
+
278
+ with gr.Row():
279
+ with gr.Box():
280
+ gr.Markdown("<center><strong>JSON</strong></center>")
281
+ json = gr.JSON()
282
 
283
+ btn.click(fn=analyze_image, inputs=[inputs, inputs_pdf, max_imgs],
284
+ outputs=[gallery, image_text, html, json])
285
 
286
+ demo.launch()
conf_dd_one.yaml CHANGED
@@ -1,26 +1,65 @@
1
  CONFIG:
2
- D2LAYOUT: dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml
3
- D2CELL: dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml
4
- D2ITEM: dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml
5
  WEIGHTS:
6
- D2LAYOUT: layout/model_final_inf_only.pt
7
- D2CELL: cell/d2_model_1849999_cell_inf_only.pt
8
- D2ITEM: item/d2_model_1639999_item_inf_only.pt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  SEGMENTATION:
10
  ASSIGNMENT_RULE: ioa
11
- IOU_THRESHOLD_ROWS: 0.01
12
- IOU_THRESHOLD_COLS: 0.001
13
- IOA_THRESHOLD_ROWS: 0.4
14
- IOA_THRESHOLD_COLS: 0.4
15
  FULL_TABLE_TILING: True
16
- REMOVE_IOU_THRESHOLD_ROWS: 0.001
17
- REMOVE_IOU_THRESHOLD_COLS: 0.001
 
 
18
  WORD_MATCHING:
19
  PARENTAL_CATEGORIES:
20
- - TEXT
21
- - TITLE
22
- - CELL
23
- - LIST
 
 
 
24
  RULE: ioa
25
- IOU_THRESHOLD: 0.001
26
- IOA_THRESHOLD: 0.6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  CONFIG:
2
+ D2LAYOUT: xrf_dd/layout/CASCADE_RCNN_R_50_FPN_GN.yaml
3
+ D2CELL: xrf_dd/cell/CASCADE_RCNN_R_50_FPN_GN.yaml
4
+ D2ITEM: xrf_dd/item/CASCADE_RCNN_R_50_FPN_GN.yaml
5
  WEIGHTS:
6
+ D2LAYOUT: xrf_layout/model_final_inf_only.pt
7
+ D2CELL: xrf_cell/model_final_inf_only.pt
8
+ D2ITEM: xrf_item/model_final_inf_only.pt
9
+ LAYOUT_NMS_PAIRS:
10
+ COMBINATIONS:
11
+ - - text
12
+ - table
13
+ - - title
14
+ - table
15
+ - - text
16
+ - list
17
+ - - title
18
+ - list
19
+ - - text
20
+ - title
21
+ - - list
22
+ - table
23
+ THRESHOLDS:
24
+ - 0.005
25
+ - 0.005
26
+ - 0.542
27
+ - 0.1
28
+ - 0.699
29
+ - 0.01
30
  SEGMENTATION:
31
  ASSIGNMENT_RULE: ioa
32
+ THRESHOLD_ROWS: 0.9
33
+ THRESHOLD_COLS: 0.9
 
 
34
  FULL_TABLE_TILING: True
35
+ REMOVE_IOU_THRESHOLD_ROWS: 0.5
36
+ REMOVE_IOU_THRESHOLD_COLS: 0.5
37
+ STRETCH_RULE: equal
38
+ USE_REFINEMENT: False
39
  WORD_MATCHING:
40
  PARENTAL_CATEGORIES:
41
+ - text
42
+ - title
43
+ - list
44
+ - figure
45
+ - cell
46
+ CHILD_CATEGORIES:
47
+ - word
48
  RULE: ioa
49
+ THRESHOLD: 0.4
50
+ MAX_PARENT_ONLY: True
51
+ TEXT_ORDERING:
52
+ TEXT_CONTAINER: word
53
+ FLOATING_TEXT_BLOCK:
54
+ - title
55
+ - text
56
+ - list
57
+ - figure
58
+ TEXT_BLOCK:
59
+ - title
60
+ - text
61
+ - list
62
+ - cell
63
+ - figure
64
+ TEXT_CONTAINER_TO_TEXT_BLOCK: True
65
+ DEVICE: cpu