import os

# Workaround: install a pinned PyTorch version, since detectron2 has not released
# packages for recent PyTorch releases (issue: https://github.com/facebookresearch/detectron2/issues/3158)
# os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')

# install detectron2 from source (prebuilt wheels do not cover PyTorch 1.10)
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
# os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import gradio as gr

import re
import string
import collections
import random
import tempfile
import itertools
from operator import itemgetter
from functools import partial

import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

import pdf2image
from pdf2image import convert_from_path

import langdetect
from langdetect import detect_langs

import pandas as pd
import numpy as np

from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2

## files
import sys
sys.path.insert(0, 'files/')

import functions
from functions import *  # provides pdf_to_images, extraction_data_from_image, prepare_inference_features,
                         # CustomDataset, predictions_token_level, predictions_line_level, get_labeled_images,
                         # and constants such as cls_box, sep_box_lilt, sep_box_layoutxlm, max_length_lilt,
                         # max_length_layoutxlm, max_imgboxes, image_blank and tokenizer_id_layoutxlm

# update pip
os.system('python -m pip install --upgrade pip')

## model / feature extractor / tokenizer

# models
model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"

# get device
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## model LiLT
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)

## model LayoutXLM
from transformers import LayoutLMv2ForTokenClassification  # LayoutXLMTokenizerFast,
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)

# feature extractor (apply_ocr=False: words and bounding boxes come from our own OCR step)
from transformers import LayoutLMv2FeatureExtractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

# tokenizer (tokenizer_id_layoutxlm is defined in files/functions.py)
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)

# get labels
id2label_lilt = model_lilt.config.id2label
label2id_lilt = model_lilt.config.label2id
num_labels_lilt = len(id2label_lilt)

id2label_layoutxlm = model_layoutxlm.config.id2label
label2id_layoutxlm = model_layoutxlm.config.label2id
num_labels_layoutxlm = len(id2label_layoutxlm)
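# Optional sanity check: a minimal sketch (commented out so it does not run in the app)
# that feeds the LiLT token classifier a dummy sequence with all-zero bounding boxes,
# to confirm the weights loaded and the label head has the expected size.
# encoding = tokenizer_lilt("hello world", return_tensors="pt").to(device)
# encoding["bbox"] = torch.zeros(1, encoding["input_ids"].shape[1], 4, dtype=torch.long, device=device)
# with torch.no_grad():
#     logits = model_lilt(**encoding).logits  # shape: (1, sequence_length, num_labels_lilt)
# assert logits.shape[-1] == num_labels_lilt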
# APP outputs by model
def app_outputs_by_model(uploaded_pdf, model_id, model, tokenizer, max_length, id2label, cls_box, sep_box):
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)

    if not msg.startswith("Error with the PDF"):

        # Extraction of image data (text and bounding boxes)
        dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)

        # prepare our data in the format of the model
        prepare_inference_features_partial = partial(prepare_inference_features, tokenizer=tokenizer, max_length=max_length, cls_box=cls_box, sep_box=sep_box)
        encoded_dataset = dataset.map(prepare_inference_features_partial, batched=True, batch_size=64, remove_columns=dataset.column_names)
        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)

        # Get predictions (token level)
        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset, model_id, model)

        # Get predictions (line level)
        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(max_length, tokenizer, id2label, dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes, cls_box, sep_box)

        # Get labeled images with lines bounding boxes
        images = get_labeled_images(id2label, dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

        # save the labeled PDF page images to disk
        img_files = list()
        for i in range(num_images):
            if filename != "files/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            img_file = img_file.replace("/", "_")
            images[i].save(img_file)
            img_files.append(img_file)

        # pad (or truncate) outputs to exactly max_imgboxes items
        if num_images < max_imgboxes:
            img_files += [image_blank] * (max_imgboxes - num_images)
            images += [Image.open(image_blank)] * (max_imgboxes - num_images)
            for count in range(max_imgboxes - num_images):
                df[num_images + count] = pd.DataFrame()
        else:
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        # save the line-level dataframes as CSV files
        csv_files = list()
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            csv_file = csv_file.replace("/", "_")
            csv_files.append(gr.File.update(value=csv_file, visible=True))
            df[i].to_csv(csv_file, encoding="utf-8", index=False)

    else:
        img_files, images, csv_files = [""] * max_imgboxes, [""] * max_imgboxes, [""] * max_imgboxes
        img_files[0], img_files[1] = image_blank, image_blank
        images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
        csv_file = "csv_wo_content.csv"
        csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
        df, df_empty = dict(), pd.DataFrame()
        df_empty.to_csv(csv_file, encoding="utf-8", index=False)
        # keep the empty dataframes themselves (the original stored to_csv()'s None return value)
        df[0], df[1] = df_empty, df_empty

    return msg, img_files[0], images[0], csv_files[0], df[0]


def app_outputs(uploaded_pdf):
    msg_lilt, img_files_lilt, images_lilt, csv_files_lilt, df_lilt = app_outputs_by_model(
        uploaded_pdf,
        model_id=model_id_lilt, model=model_lilt, tokenizer=tokenizer_lilt,
        max_length=max_length_lilt, id2label=id2label_lilt,
        cls_box=cls_box, sep_box=sep_box_lilt)

    msg_layoutxlm, img_files_layoutxlm, images_layoutxlm, csv_files_layoutxlm, df_layoutxlm = app_outputs_by_model(
        uploaded_pdf,
        model_id=model_id_layoutxlm, model=model_layoutxlm, tokenizer=tokenizer_layoutxlm,
        max_length=max_length_layoutxlm, id2label=id2label_layoutxlm,
        cls_box=cls_box, sep_box=sep_box_layoutxlm)

    return msg_lilt, msg_layoutxlm, img_files_lilt, img_files_layoutxlm, images_lilt, images_layoutxlm, csv_files_lilt, csv_files_layoutxlm, df_lilt, df_layoutxlm
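# A minimal standalone usage sketch (commented out so it does not run at import time),
# assuming "files/example.pdf" exists, as in the gr.Examples entry below:
# msg, img_file, image, csv_file, df = app_outputs_by_model(
#     "files/example.pdf",
#     model_id=model_id_lilt, model=model_lilt, tokenizer=tokenizer_lilt,
#     max_length=max_length_lilt, id2label=id2label_lilt,
#     cls_box=cls_box, sep_box=sep_box_lilt)
# print(msg)  # processing status for the first PDF page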
# gradio APP
with gr.Blocks(title="Inference APP for Document Understanding at line level (v1 - LiLT base vs LayoutXLM base)", css=".gradio-container") as demo:
    gr.HTML("""
    <h1>Inference APP for Document Understanding at line level (v1 - LiLT base vs LayoutXLM base)</h1>
    <p>(03/08/2023) This Inference APP compares, on the first PDF page only, two Document Understanding models finetuned at line level (chunk size of 384 tokens) on the DocLayNet base dataset: LiLT base combined with XLM-RoBERTa base, and LayoutXLM base combined with XLM-RoBERTa base.</p>
    <p>To test these two models separately, use their corresponding APPs on Hugging Face Spaces: LiLT base APP (v1 - line level) and LayoutXLM base APP (v2 - line level).</p>
    <p>Links to Document Understanding APPs:</p>
    <p>More information about the DocLayNet datasets, the finetuning of the models and this APP can be found in the following blog posts:</p>
    """)
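    # The UI below shows, per model: an output message, the labeled page image (as a
    # downloadable file and inline), a line-level CSV, and a dataframe with columns
    # "bounding boxes", "texts" and "labels".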
""") with gr.Row(): pdf_file = gr.File(label="PDF") with gr.Row(): submit_btn = gr.Button(f"Get layout detection by LiLT and LayoutXLM on the first PDF page") reset_btn = gr.Button(value="Clear") with gr.Row(): output_messages = [] with gr.Column(): output_msg = gr.Textbox(label="LiLT output message") output_messages.append(output_msg) with gr.Column(): output_msg = gr.Textbox(label="LayoutXLM output message") output_messages.append(output_msg) with gr.Row(): fileboxes = [] with gr.Column(): file_path = gr.File(visible=True, label=f"LiLT image file") fileboxes.append(file_path) with gr.Column(): file_path = gr.File(visible=True, label=f"LayoutXLM image file") fileboxes.append(file_path) with gr.Row(): imgboxes = [] with gr.Column(): img = gr.Image(type="pil", label=f"Lilt Image") imgboxes.append(img) with gr.Column(): img = gr.Image(type="pil", label=f"LayoutXLM Image") imgboxes.append(img) with gr.Row(): csvboxes = [] with gr.Column(): csv = gr.File(visible=True, label=f"LiLT csv file at line level") csvboxes.append(csv) with gr.Column(): csv = gr.File(visible=True, label=f"LayoutXLM csv file at line level") csvboxes.append(csv) with gr.Row(): dfboxes = [] with gr.Column(): df = gr.Dataframe( headers=["bounding boxes", "texts", "labels"], datatype=["str", "str", "str"], col_count=(3, "fixed"), visible=True, label=f"LiLT data", type="pandas", wrap=True ) dfboxes.append(df) with gr.Column(): df = gr.Dataframe( headers=["bounding boxes", "texts", "labels"], datatype=["str", "str", "str"], col_count=(3, "fixed"), visible=True, label=f"LayoutXLM data", type="pandas", wrap=True ) dfboxes.append(df) outputboxes = output_messages + fileboxes + imgboxes + csvboxes + dfboxes submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes) # https://github.com/gradio-app/gradio/pull/2044/files#diff-a91dd2749f68bb7d0099a0f4079a4fd2d10281e299e7b451cb1bb876a7c21975R91 reset_btn.click( lambda: [pdf_file.update(value=None)] + [output_msg.update(value=None) for output_msg in output_messages] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes], inputs=[], outputs=[pdf_file] + output_messages + fileboxes + imgboxes + csvboxes + dfboxes ) gr.Examples( [["files/example.pdf"]], [pdf_file], outputboxes, fn=app_outputs, cache_examples=True, ) demo.launch()