pierreguillou committed
Commit 716ca18 · 1 Parent(s): 5328a20

Update app.py

Files changed (1)
  1. app.py +67 -31
app.py CHANGED
@@ -89,23 +89,25 @@ label2id_layoutxlm = model_layoutxlm.config.label2id
 num_labels_layoutxlm = len(id2label_layoutxlm)
 
 # APP outputs
-def app_outputs(uploaded_pdf):
+# APP outputs by model
+def app_outputs_by_model(uploaded_pdf, model_id, model, tokenizer, max_length, id2label, cls_box, sep_box):
     filename, msg, images = pdf_to_images(uploaded_pdf)
     num_images = len(images)
 
     if not msg.startswith("Error with the PDF"):
 
         # Extraction of image data (text and bounding boxes)
         dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes = extraction_data_from_image(images)
         # prepare our data in the format of the model
-        encoded_dataset = dataset.map(prepare_inference_features_paragraph, batched=True, batch_size=64, remove_columns=dataset.column_names)
+        prepare_inference_features_partial = partial(prepare_inference_features_paragraph, tokenizer=tokenizer, max_length=max_length, cls_box=cls_box, sep_box=sep_box)
+        encoded_dataset = dataset.map(prepare_inference_features_partial, batched=True, batch_size=64, remove_columns=dataset.column_names)
         custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
         # Get predictions (token level)
-        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
+        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset, model_id, model)
         # Get predictions (paragraph level)
-        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
+        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(max_length, tokenizer, id2label, dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes, cls_box, sep_box)
         # Get labeled images with lines bounding boxes
-        images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
+        images = get_labeled_images(id2label, dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
 
         img_files = list()
         # get image of PDF without bounding boxes
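The refactoring in this hunk works because datasets.Dataset.map calls its function with only the batch; fixed arguments such as the tokenizer, max_length and the special-token boxes must be bound beforehand with functools.partial. A minimal runnable sketch of that pattern, with illustrative names (prepare_features and the "text" column are stand-ins, not the APP's actual code):

from functools import partial

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def prepare_features(batch, tokenizer=None, max_length=512):
    # map(fn, batched=True) passes only the batch dict, so tokenizer and
    # max_length have to be bound with partial before calling map.
    encodings = tokenizer(batch["text"], truncation=True, max_length=max_length)
    return {"input_ids": encodings["input_ids"]}

dataset = Dataset.from_dict({"text": ["a first paragraph", "a second paragraph"]})
prepare_partial = partial(prepare_features, tokenizer=tokenizer, max_length=512)
encoded = dataset.map(prepare_partial, batched=True, batch_size=64, remove_columns=dataset.column_names)

Note that dataset.map also accepts a fn_kwargs argument for the same purpose; partial simply keeps the call site closer to the original function.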
@@ -143,63 +145,97 @@ def app_outputs(uploaded_pdf):
         df, df_empty = dict(), pd.DataFrame()
         df[0], df[1] = df_empty.to_csv(csv_file, encoding="utf-8", index=False), df_empty.to_csv(csv_file, encoding="utf-8", index=False)
 
-    return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
+    return msg, img_files[0], images[0], csv_files[0], df[0]
+
+def app_outputs(uploaded_pdf):
+    msg_lilt, img_files_lilt, images_lilt, csv_files_lilt, df_lilt = app_outputs_by_model(uploaded_pdf,
+        model_id=model_id_lilt, model=model_lilt, tokenizer=tokenizer_lilt,
+        max_length=max_length_lilt, id2label=id2label_lilt, cls_box=cls_box, sep_box=sep_box_lilt)
+
+    msg_layoutxlm, img_files_layoutxlm, images_layoutxlm, csv_files_layoutxlm, df_layoutxlm = app_outputs_by_model(uploaded_pdf,
+        model_id=model_id_layoutxlm, model=model_layoutxlm, tokenizer=tokenizer_layoutxlm,
+        max_length=max_length_layoutxlm, id2label=id2label_layoutxlm, cls_box=cls_box, sep_box=sep_box_layoutxlm)
+
+    return msg_lilt, msg_layoutxlm, img_files_lilt, img_files_layoutxlm, images_lilt, images_layoutxlm, csv_files_lilt, csv_files_layoutxlm, df_lilt, df_layoutxlm
 
 # Gradio APP
-with gr.Blocks(title="Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)", css=".gradio-container") as demo:
+with gr.Blocks(title="Inference APP for Document Understanding at paragraph level (v1 - LiLT base vs LayoutXLM base)", css=".gradio-container") as demo:
     gr.HTML("""
-    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)</h1></div>
-    <div style="margin-top: 40px"><p>(03/31/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">model LayoutXLM base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at paragraph level</a> (chunk size of 512 tokens).</p></div>
-    <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2104.08836" target="_blank">LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) into 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
-    <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, this APP first runs an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then runs LayoutXLM base (already fine-tuned on the dataset DocLayNet base at paragraph level) on the individual tokens, and finally visualizes the result at paragraph level!</p></div>
-    <div><p><b>It returns all pages of any PDF (in any language) with bounding boxes labeled at paragraph level, together with the associated dataframes of labeled data (bounding boxes, texts, labels) :-)</b></p></div>
-    <div><p>However, the inference time per page can be high when running the model on CPU due to the number of paragraph predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP on Hugging Face Spaces (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb" target="_blank">Document AI | Inference at paragraph level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)</a>" on your own platform, as it does not have this limit.</p></div>
-    <div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP can be found in the following blog posts:</p>
-    <ul><li>(03/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-3507af80573d" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level with LayoutXLM base</a></li><li>(03/25/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-app-to-compare-the-document-understanding-lilt-and-layoutxlm-base-models-at-line-1c53eb481a15" target="_blank">Document AI | APP to compare the Document Understanding LiLT and LayoutXLM (base) models at line level</a></li><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
+    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at paragraph level (v1 - LiLT base vs LayoutXLM base)</h1></div>
+    <div style="margin-top: 40px"><p>(04/01/2023) This Inference APP compares (only on the first PDF page) 2 Document Understanding models finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> at paragraph level (chunk size of 512 tokens): <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">LiLT base combined with XLM-RoBERTa base</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">LayoutXLM base combined with XLM-RoBERTa base</a>.</p></div>
+    <div><p>To test these 2 models separately, use their corresponding APP on Hugging Face Spaces: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">LiLT base APP (v1 - paragraph level)</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2" target="_blank">LayoutXLM base APP (v2 - paragraph level)</a>.</p></div><div style="margin-top: 20px"><p>Links to Document Understanding APPs:</p><ul><li>Line level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2" target="_blank">v2 (LayoutXLM base)</a></li><li>Paragraph level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2" target="_blank">v2 (LayoutXLM base)</a></li></ul></div><div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP can be found in the following blog posts:</p><ul><li>(03/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-3507af80573d" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level with LayoutXLM base</a></li><li>(03/25/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-app-to-compare-the-document-understanding-lilt-and-layoutxlm-base-models-at-line-1c53eb481a15" target="_blank">Document AI | APP to compare the Document Understanding LiLT and LayoutXLM (base) models at line level</a></li><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
     """)
     with gr.Row():
         pdf_file = gr.File(label="PDF")
     with gr.Row():
-        submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
+        submit_btn = gr.Button(f"Get layout detection by LiLT and LayoutXLM on the first PDF page")
         reset_btn = gr.Button(value="Clear")
     with gr.Row():
-        output_msg = gr.Textbox(label="Output message")
+        output_messages = []
+        with gr.Column():
+            output_msg = gr.Textbox(label="LiLT output message")
+            output_messages.append(output_msg)
+        with gr.Column():
+            output_msg = gr.Textbox(label="LayoutXLM output message")
+            output_messages.append(output_msg)
     with gr.Row():
         fileboxes = []
-        for num_page in range(max_imgboxes):
-            file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
+        with gr.Column():
+            file_path = gr.File(visible=True, label=f"LiLT image file")
+            fileboxes.append(file_path)
+        with gr.Column():
+            file_path = gr.File(visible=True, label=f"LayoutXLM image file")
             fileboxes.append(file_path)
     with gr.Row():
         imgboxes = []
-        for num_page in range(max_imgboxes):
-            img = gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
+        with gr.Column():
+            img = gr.Image(type="pil", label=f"LiLT Image")
+            imgboxes.append(img)
+        with gr.Column():
+            img = gr.Image(type="pil", label=f"LayoutXLM Image")
             imgboxes.append(img)
     with gr.Row():
         csvboxes = []
-        for num_page in range(max_imgboxes):
-            csv = gr.File(visible=True, label=f"CSV file at paragraph level (page {num_page})")
+        with gr.Column():
+            csv = gr.File(visible=True, label=f"LiLT csv file at paragraph level")
+            csvboxes.append(csv)
+        with gr.Column():
+            csv = gr.File(visible=True, label=f"LayoutXLM csv file at paragraph level")
             csvboxes.append(csv)
     with gr.Row():
         dfboxes = []
-        for num_page in range(max_imgboxes):
+        with gr.Column():
             df = gr.Dataframe(
                 headers=["bounding boxes", "texts", "labels"],
                 datatype=["str", "str", "str"],
                 col_count=(3, "fixed"),
                 visible=True,
-                label=f"Data of page {num_page}",
+                label=f"LiLT data",
+                type="pandas",
+                wrap=True
+            )
+            dfboxes.append(df)
+        with gr.Column():
+            df = gr.Dataframe(
+                headers=["bounding boxes", "texts", "labels"],
+                datatype=["str", "str", "str"],
+                col_count=(3, "fixed"),
+                visible=True,
+                label=f"LayoutXLM data",
                 type="pandas",
                 wrap=True
             )
             dfboxes.append(df)
 
-    outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+    outputboxes = output_messages + fileboxes + imgboxes + csvboxes + dfboxes
+
     submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
+
     # https://github.com/gradio-app/gradio/pull/2044/files#diff-a91dd2749f68bb7d0099a0f4079a4fd2d10281e299e7b451cb1bb876a7c21975R91
     reset_btn.click(
-        lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
+        lambda: [pdf_file.update(value=None)] + [output_msg.update(value=None) for output_msg in output_messages] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
         inputs=[],
-        outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+        outputs=[pdf_file] + output_messages + fileboxes + imgboxes + csvboxes + dfboxes
     )
 
     gr.Examples(
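A note on the wiring in this hunk: Gradio assigns the values returned by a click handler to the outputs components positionally, which is why app_outputs returns its 10 values in exactly the order of outputboxes (messages, then files, images, CSVs and dataframes). A minimal runnable sketch of that positional contract (the two "models" here are stand-ins, not the APP's real ones):

import gradio as gr

def compare(text):
    # one returned value per output component, in the same order as `outputs`
    return f"model A: {text.upper()}", f"model B: {text.lower()}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    out_a = gr.Textbox(label="Model A")
    out_b = gr.Textbox(label="Model B")
    btn.click(compare, inputs=[inp], outputs=[out_a, out_b])

# demo.launch()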
 
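The reset button follows the pattern from the Gradio pull request linked in the diff: the lambda returns one update(value=None) per component, and those updates are applied to the outputs list positionally. A minimal sketch of the same clear/reset pattern, assuming the Gradio 3.x update() API used by this commit:

import gradio as gr

with gr.Blocks() as demo:
    textbox = gr.Textbox(label="Some output")
    image = gr.Image(label="Some image")
    clear_btn = gr.Button("Clear")
    clear_btn.click(
        # one update per output component, in the same order as `outputs`
        lambda: [textbox.update(value=None), image.update(value=None)],
        inputs=[],
        outputs=[textbox, image],
    )

# demo.launch()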