File size: 9,155 Bytes
74250da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7eb037e
74250da
 
 
 
 
 
 
 
 
 
 
 
 
 
7eb037e
 
74250da
 
 
7eb037e
74250da
2244eb6
 
 
 
74250da
 
 
 
 
 
 
 
 
 
 
 
 
 
545165f
 
 
 
74250da
 
 
 
 
 
 
 
 
 
 
 
 
cc6733e
74250da
b343f17
 
74250da
 
 
b343f17
74250da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
import gradio as gr
import re
import string

from operator import itemgetter
import collections

import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

import pdf2image
from pdf2image import convert_from_path
import langdetect
from langdetect import detect_langs

import pandas as pd
import numpy as np
import random
import tempfile
import itertools

from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2

## files

import sys  
sys.path.insert(0, 'files/')

import functions
from functions import *

# update pip
# Run pip through the interpreter that is executing this app (sys.executable)
# instead of whatever `python` happens to be first on PATH, and pass the
# command as an argument list so no shell is involved. check=False keeps the
# original best-effort behaviour: a failed upgrade does not stop the app.
import subprocess

subprocess.run(
    [sys.executable, "-m", "pip", "install", "--upgrade", "pip"],
    check=False,
)

# model
from transformers import AutoTokenizer, AutoModelForTokenClassification

import torch

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hub identifier of the token-classification checkpoint loaded below.
model_id = "NiamaLynn/lilt-roberta-DocLayNet-base_lines_ml256-v1"

# Tokenizer and model are loaded once at import time and kept as module globals;
# `tokenizer` is read by app_outputs().
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)
model.to(device)

# APP outputs
def _ensure_parent_dir(file_path):
    """Create the parent directory of `file_path` when it has one and it is missing."""
    parent = os.path.dirname(file_path)
    # os.makedirs("") raises FileNotFoundError, so only call it for a real
    # directory component; exist_ok avoids failing on repeated runs.
    if parent:
        os.makedirs(parent, exist_ok=True)


def app_outputs(uploaded_pdf):
    """Run the layout-analysis pipeline on a PDF and build the values for the UI.

    Args:
        uploaded_pdf: value passed straight to `pdf_to_images` (the content of
            the PDF file input of the app).

    Returns:
        A 9-tuple for exactly two pages:
        `(msg, img_file_0, img_file_1, image_0, image_1, csv_0, csv_1, df_0, df_1)`
        where `msg` is the status message from `pdf_to_images`, `img_file_*`
        are paths of saved page images, `image_*` are PIL images, `csv_*` are
        `gr.File.update(...)` payloads pointing at the written CSV files and
        `df_*` are the per-page dataframes (`None` on the error path).
    """
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)
    path = os.path.dirname(__file__)

    if not msg.startswith("Error with the PDF"):

        # Extraction of image data (text and bounding boxes)
        dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
        # prepare our data in the format of the model
        encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names)
        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
        # Get predictions (token level)
        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
        # Get predictions (line level)
        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
        # Get labeled images with lines bounding boxes
        images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

        # Save one image file per page. NOTE(review): `images` was just
        # replaced by the output of get_labeled_images, so these are the
        # labeled images, not the raw pages - confirm this is intended.
        img_files = []
        for i in range(num_images):
            if filename != path + "/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            _ensure_parent_dir(img_file)
            # Always save and always record the path: skipping either when the
            # file already exists left img_files shorter than expected and
            # made the final indexing fail on a second run with the same PDF.
            images[i].save(img_file)
            img_files.append(img_file)

        if num_images < max_imgboxes:
            # Fewer pages than UI slots: pad every per-page collection with
            # blank placeholders so each slot has a value.
            missing = max_imgboxes - num_images
            img_files += [image_blank] * missing
            images += [Image.open(image_blank)] * missing
            for count in range(missing):
                df[num_images + count] = pd.DataFrame()
        else:
            # More pages than UI slots: keep only the first max_imgboxes.
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        csv_files = []
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            _ensure_parent_dir(csv_file)
            # Write the CSV before handing its path to the File component, and
            # rewrite it on every run so it always matches the current result.
            df[i].to_csv(csv_file, encoding="utf-8", index=False)
            # Same update API as the error path below.
            csv_files.append(gr.File.update(value=csv_file, visible=True))

    else:
        # The PDF could not be processed: show blank images and an empty CSV.
        csv_file = "csv_wo_content.csv"
        pd.DataFrame().to_csv(csv_file, encoding="utf-8", index=False)

        img_files = [image_blank, image_blank]
        images = [Image.open(image_blank), Image.open(image_blank)]
        csv_files = [
            gr.File.update(value=csv_file, visible=True),
            gr.File.update(value=csv_file, visible=True),
        ]
        # The original stored the return value of DataFrame.to_csv(path) here,
        # which is None; the dataframe outputs therefore still receive None.
        df = {0: None, 1: None}

    return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]

# gradio APP
# Builds the whole UI: a PDF input, a status message and, for each of the
# max_imgboxes page slots, an image file, an image preview, a CSV file and a
# dataframe. The order of `outputboxes` must match the tuple returned by
# app_outputs().
with gr.Blocks(title="Application for document layout analysis at line level (v1 - LiLT base)", css=".gradio-container") as demo:
    gr.HTML("""
    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference application for Documents layout analysis at line level (v1 - LiLT base)</h1></div>
    <div style="margin-top: 40px"><p>(02/12/2023) This application uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/NiamaLynn/lilt-roberta-DocLayNet-base_lines_ml256-v1" target="_blank">model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 256 tokens).</p></div>
    <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2202.13669" target="_blank">LiLT (Language-Independent Layout Transformer)</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) to 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
    <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then run LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
    <div><p><b>It allows to get all pages of any PDF (of any language) with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
    <div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>.</p></div>
    """)
    with gr.Row():
        pdf_file = gr.File(label="PDF")
    with gr.Row():
        submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
        reset_btn = gr.Button(value="Clear")
    with gr.Row():
        output_msg = gr.Textbox(label="Output message")
    # One row per kind of output, each holding one component per page slot.
    # Comprehensions keep loop variables out of the module namespace.
    with gr.Row():
        fileboxes = [
            gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
            for num_page in range(max_imgboxes)
        ]
    with gr.Row():
        imgboxes = [
            gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
            for num_page in range(max_imgboxes)
        ]
    with gr.Row():
        csvboxes = [
            gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
            for num_page in range(max_imgboxes)
        ]
    with gr.Row():
        dfboxes = [
            gr.Dataframe(
                headers=["bounding boxes", "texts", "labels"],
                datatype=["str", "str", "str"],
                col_count=(3, "fixed"),
                visible=True,
                label=f"Data of page {num_page}",
                type="pandas",
                wrap=True,
            )
            for num_page in range(max_imgboxes)
        ]

    # Same order as the tuple returned by app_outputs().
    outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
    submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)

    # The Clear button resets the PDF input and every output component.
    resetboxes = [pdf_file] + outputboxes

    def reset_outputs():
        """Return one `update(value=None)` per component in `resetboxes`, in order."""
        return [box.update(value=None) for box in resetboxes]

    reset_btn.click(
        reset_outputs,
        inputs=[],
        outputs=resetboxes,
    )

    # Bundled example; cache_examples=True is passed so Gradio caches its outputs.
    gr.Examples(
        [["files/example.pdf"]],
        [pdf_file],
        outputboxes,
        fn=app_outputs,
        cache_examples=True,
    )

demo.launch()