pierreguillou
committed on
Commit
•
b565cf9
1
Parent(s):
fde868d
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import re
|
4 |
+
import string
|
5 |
+
|
6 |
+
from operator import itemgetter
|
7 |
+
import collections
|
8 |
+
|
9 |
+
import pypdf
|
10 |
+
from pypdf import PdfReader
|
11 |
+
from pypdf.errors import PdfReadError
|
12 |
+
|
13 |
+
import pdf2image
|
14 |
+
from pdf2image import convert_from_path
|
15 |
+
import langdetect
|
16 |
+
from langdetect import detect_langs
|
17 |
+
|
18 |
+
import pandas as pd
|
19 |
+
import numpy as np
|
20 |
+
import random
|
21 |
+
import tempfile
|
22 |
+
import itertools
|
23 |
+
|
24 |
+
from matplotlib import font_manager
|
25 |
+
from PIL import Image, ImageDraw, ImageFont
|
26 |
+
import cv2
|
27 |
+
|
28 |
+
## files
|
29 |
+
|
30 |
+
import sys
|
31 |
+
sys.path.insert(0, 'files/')
|
32 |
+
|
33 |
+
import functions
|
34 |
+
from functions import *
|
35 |
+
|
36 |
+
# update pip
|
37 |
+
# NOTE(review): this statement appears to sit at module top level, so the pip
# upgrade is attempted every time the app starts. The command names 'python'
# literally, so the shell decides which interpreter is used (it may not be the
# one running this app), and the exit status returned by os.system is ignored.
os.system('python -m pip install --upgrade pip')
|
38 |
+
|
39 |
+
# APP outputs
|
40 |
+
def app_outputs(uploaded_pdf):
    """Run line-level inference on an uploaded PDF and build the Gradio outputs.

    The PDF is converted to page images by ``pdf_to_images``. Unless the
    returned message starts with ``"Error with the PDF"``, the pages go through
    the extraction / token-level / line-level prediction helpers, the resulting
    page images are saved to disk, and one CSV per displayed page is written.
    Results are padded with the blank image and empty dataframes, or truncated,
    so that exactly ``max_imgboxes`` entries of each kind are returned.

    Args:
        uploaded_pdf: value received from the ``gr.File`` input; it is passed
            to ``pdf_to_images`` unchanged.

    Returns:
        A flat tuple matching the order of the UI output components:
        ``(msg, *image file paths, *PIL images, *CSV file updates,
        *dataframes)``, with ``max_imgboxes`` items in each starred group.
    """
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)

    if msg.startswith("Error with the PDF"):
        # Failure path: every slot gets the blank image and an empty CSV/table.
        csv_file = "csv_wo_content.csv"
        # One write is enough: all slots point at the same placeholder file.
        pd.DataFrame().to_csv(csv_file, encoding="utf-8", index=False)

        img_files = [image_blank] * max_imgboxes
        images = [Image.open(image_blank) for _ in range(max_imgboxes)]
        csv_files = [
            gr.File.update(value=csv_file, visible=True)
            for _ in range(max_imgboxes)
        ]
        # Return real (empty) dataframes, as the padding in the success path
        # does; DataFrame.to_csv(path) returns None and is not a table.
        df = {page: pd.DataFrame() for page in range(max_imgboxes)}
    else:
        # Extraction of image data (text and bounding boxes)
        dataset, _lines, _row_indexes, _par_boxes, _line_boxes = extraction_data_from_image(images)

        # Prepare our data in the format of the model
        encoded_dataset = dataset.map(
            prepare_inference_features,
            batched=True,
            batch_size=64,
            remove_columns=dataset.column_names,
        )
        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)

        # Get predictions (token level)
        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(
            images, custom_encoded_dataset
        )

        # Get predictions (line level); `df` is indexed by page number below
        _probs_bbox, bboxes_list_dict, _input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(
            dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes
        )

        # Get labeled images with lines bounding boxes
        images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

        # Save every page image to disk.
        # NOTE(review): `images` was just rebound to the output of
        # get_labeled_images, so these are presumably the labeled pages.
        img_files = []
        for i in range(num_images):
            if filename != "files/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            images[i].save(img_file)
            img_files.append(img_file)

        # Pad or truncate so that exactly max_imgboxes pages are displayed.
        missing = max_imgboxes - num_images
        if missing > 0:
            img_files += [image_blank] * missing
            images += [Image.open(image_blank)] * missing
            for page in range(num_images, max_imgboxes):
                df[page] = pd.DataFrame()
        else:
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        # Write each CSV before registering it with the File component.
        csv_files = []
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            df[i].to_csv(csv_file, encoding="utf-8", index=False)
            csv_files.append(gr.File.update(value=csv_file, visible=True))

    # Flatten in the same order as the UI output components; this stays
    # correct when max_imgboxes is changed from its default.
    return (
        msg,
        *img_files,
        *images,
        *csv_files,
        *(df[page] for page in range(max_imgboxes)),
    )
|
93 |
+
|
94 |
+
# gradio APP
|
95 |
+
# gradio APP
# Builds the UI: an introduction, the PDF input, the submit/clear buttons, and
# max_imgboxes output slots of each kind (image file, image, CSV file, table).
with gr.Blocks(title="Inference APP for Document Understanding at line level (v1)", css=".gradio-container") as demo:
    gr.HTML("""
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v1)</h1></div>
<div style="margin-top: 40px"><p>(02/12/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base</a> at line level (chunk size of 384 tokens).</p></div>
<div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2202.13669" target="_blank">LiLT (Language-Independent Layout Transformer)</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to understand any language. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can classify any bounding box (and its OCR text) to 11 labels (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
<div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine ourselves (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) as we'll need to do it in real life to get the bounding boxes, then run LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
<div><p>From any PDF (of any language), it allows to get all pages with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels).</p></div>
<div><p>To avoid running this APP for too long, <b>only the first 3 pages are processed by this APP</b>. If you want to update this limit, you can either clone this APP and change the value of the parameter <code>max_imgboxes</code>, or run the corresponding notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)</a>" which does not have this limit.</p></div>
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
<ul><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">(02/10/2023) Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank"> (01/31/2023) Document AI | DocLayNet image viewer APP</a></li><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
""")

    # Input and controls
    with gr.Row():
        pdf_file = gr.File(label="PDF")
    with gr.Row():
        submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
        reset_btn = gr.Button(value="Clear")
    with gr.Row():
        output_msg = gr.Textbox(label="Output message")

    # One output slot per displayed page, for each of the four output kinds.
    with gr.Row():
        fileboxes = [
            gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
            for num_page in range(max_imgboxes)
        ]
    with gr.Row():
        imgboxes = [
            gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
            for num_page in range(max_imgboxes)
        ]
    with gr.Row():
        csvboxes = [
            gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
            for num_page in range(max_imgboxes)
        ]
    with gr.Row():
        dfboxes = [
            gr.Dataframe(
                headers=["bounding boxes", "texts", "labels"],
                datatype=["str", "str", "str"],
                col_count=(3, "fixed"),
                visible=True,
                label=f"Data of page {num_page}",
                type="pandas",
                wrap=True,
            )
            for num_page in range(max_imgboxes)
        ]

    # Order must match the tuple returned by app_outputs.
    outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
    submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)

    # "Clear" resets the input as well as every output component.
    resettable = [pdf_file] + outputboxes
    reset_btn.click(
        lambda: [component.update(value=None) for component in resettable],
        inputs=[],
        outputs=resettable,
    )

    gr.Examples(
        [["files/example.pdf"]],
        [pdf_file],
        outputboxes,
        fn=app_outputs,
        cache_examples=True,
    )

demo.launch(debug=True)
|