import os import gradio as gr import re import string from operator import itemgetter import collections import pypdf from pypdf import PdfReader from pypdf.errors import PdfReadError import pdf2image from pdf2image import convert_from_path import langdetect from langdetect import detect_langs import pandas as pd import numpy as np import random import tempfile import itertools from matplotlib import font_manager from PIL import Image, ImageDraw, ImageFont import cv2 ## files import sys sys.path.insert(0, 'files/') import functions from functions import * # update pip os.system('python -m pip install --upgrade pip') # APP outputs def app_outputs(uploaded_pdf): filename, msg, images = pdf_to_images(uploaded_pdf) num_images = len(images) if not msg.startswith("Error with the PDF"): # Extraction of image data (text and bounding boxes) dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images) # prepare our data in the format of the model encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names) custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer) # Get predictions (token level) outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset) # Get predictions (line level) probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes) # Get labeled images with lines bounding boxes images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict) img_files = list() # get image of PDF without bounding boxes for i in range(num_images): if filename != "files/blank.png": img_file = f"img_{i}_" + filename.replace(".pdf", ".png") else: img_file = filename.replace(".pdf", ".png") images[i].save(img_file) img_files.append(img_file) if num_images < max_imgboxes: img_files += [image_blank]*(max_imgboxes - 
num_images) images += [Image.open(image_blank)]*(max_imgboxes - num_images) for count in range(max_imgboxes - num_images): df[num_images + count] = pd.DataFrame() else: img_files = img_files[:max_imgboxes] images = images[:max_imgboxes] df = dict(itertools.islice(df.items(), max_imgboxes)) # save csv_files = list() for i in range(max_imgboxes): csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv") csv_files.append(gr.File.update(value=csv_file, visible=True)) df[i].to_csv(csv_file, encoding="utf-8", index=False) else: img_files, images, csv_files = [""]*3,[""]*3,[""]*3 img_files[0], img_files[1], img_files[2] = image_blank, image_blank, image_blank images[0], images[1], images[2] = Image.open(image_blank), Image.open(image_blank), Image.open(image_blank) csv_file = "csv_wo_content.csv" csv_files[0], csv_files[1], csv_files[2] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True) df, df_empty = dict(), pd.DataFrame() df[0], df[1], df[2] = df_empty.to_csv(csv_file, encoding="utf-8", index=False), df_empty.to_csv(csv_file, encoding="utf-8", index=False), df_empty.to_csv(csv_file, encoding="utf-8", index=False) return msg, img_files[0], img_files[1], img_files[2], images[0], images[1], images[2], csv_files[0], csv_files[1], csv_files[2], df[0], df[1], df[2] # gradio APP with gr.Blocks(title="Inference APP for Document Understanding at line level (v1)", css=".gradio-container") as demo: gr.HTML("""
(02/12/2023) This Inference APP uses the model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level (chunk size of 384 tokens).
LiLT (Language-Independent Layout Transformer) is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model XLM-RoBERTa base, this finetuned model has the capacity to understand any language. Finetuned on the dataset DocLayNet base, it can classify any bounding box (and its OCR text) into 11 labels (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).
It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine ourselves (PyTesseract) as we'll need to do it in real life to get the bounding boxes, then run LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!
From any PDF (in any language), this APP returns all pages with bounding boxes labeled at line level, together with the associated dataframes of labeled data (bounding boxes, texts, labels).
To avoid running this APP for too long, only the first 3 pages are processed by this APP. If you want to update this limit, you can either clone this APP and change the value of the parameter max_imgboxes
, or run the corresponding notebook "Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)" which does not have this limit.
More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts: