Inference-APP-Document-Understanding-at-linelevel-LiLT-base-LayoutXLM-base-v1

Runtime error

App Files Files Community

Inference-APP-Document-Understanding-at-linelevel-LiLT-base-LayoutXLM-base-v1 / app.py

pierreguillou

Update app.py

32fb013 over 1 year ago

raw

history blame contribute delete

No virus

15.5 kB

	import os

	# Runtime dependency bootstrap: the os.system calls below run at import time,
	# before the `import detectron2` that depends on them. Their exit status is
	# not checked, so a failed install only surfaces at the later import.
	# workaround: install old version of pytorch since detectron2 hasn't released packages for pytorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
	# Previous pin (torch 1.8.0 / cu101), kept for reference:
	# os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
	# Active pin: torch 1.10.0 / torchvision 0.11 built for CUDA 11.1.
	os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')

	# install detectron2 from its GitHub repository (unpinned)
	# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
	# Previous approach (prebuilt wheel for cu101 / torch 1.8), kept for reference:
	#os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
	os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

	# detectron2 is imported only after the install above has been attempted
	import detectron2
	from detectron2.utils.logger import setup_logger
	# configure detectron2's logger with its default settings
	setup_logger()

	import gradio as gr
	import re
	import string

	from operator import itemgetter
	import collections

	import pypdf
	from pypdf import PdfReader
	from pypdf.errors import PdfReadError

	import pdf2image
	from pdf2image import convert_from_path
	import langdetect
	from langdetect import detect_langs

	import pandas as pd
	import numpy as np
	import random
	import tempfile
	import itertools

	from matplotlib import font_manager
	from PIL import Image, ImageDraw, ImageFont
	import cv2

	## files

	import sys
	sys.path.insert(0, 'files/')

	import functions
	from functions import *

	# update pip
	# NOTE: this statement comes after the pip installs and the imports earlier in
	# the file, so those installs have already run by the time pip is upgraded.
	# The exit status of the command is not checked.
	os.system('python -m pip install --upgrade pip')

	## model / feature extractor / tokenizer

	import torch
	import transformers
	from transformers import (
	    AutoModelForTokenClassification,
	    AutoTokenizer,
	    LayoutLMv2FeatureExtractor,
	    LayoutLMv2ForTokenClassification,  # LayoutXLMTokenizerFast is not used
	)

	# identifiers of the two fine-tuned checkpoints compared by this app
	model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
	model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"

	# run on the GPU when one is available, otherwise on the CPU
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	## model LiLT (tokenizer first, then the token-classification model moved to `device`)
	tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
	model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt).to(device)

	## model LayoutXLM (loaded through the LayoutLMv2 token-classification class)
	model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm).to(device)

	# feature extractor, created with OCR disabled
	feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

	# tokenizer for LayoutXLM
	# NOTE(review): `tokenizer_id_layoutxlm` is not defined in this file; presumably it
	# is provided by `from functions import *` - confirm.
	tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)

	# label mappings read from each model's configuration
	id2label_lilt, label2id_lilt = model_lilt.config.id2label, model_lilt.config.label2id
	num_labels_lilt = len(id2label_lilt)

	id2label_layoutxlm, label2id_layoutxlm = model_layoutxlm.config.id2label, model_layoutxlm.config.label2id
	num_labels_layoutxlm = len(id2label_layoutxlm)

	# APP outputs by model
	def app_outputs_by_model(uploaded_pdf, model_id, model, tokenizer, max_length, id2label, cls_box, sep_box):
	    """Run one model on the uploaded PDF and return the outputs for its first page.

	    The PDF is converted to images with `pdf_to_images`. Unless the returned
	    message starts with "Error with the PDF", the images go through the
	    extraction / encoding / prediction helpers, the labeled images are saved
	    as PNG files and one CSV file per image slot is written to the current
	    working directory. On error, blank-image placeholders and an empty CSV
	    file ("csv_wo_content.csv") are produced instead.

	    Args:
	        uploaded_pdf: value forwarded unchanged to `pdf_to_images`.
	        model_id: model identifier forwarded to `predictions_token_level`.
	        model: model object forwarded to `predictions_token_level`.
	        tokenizer: tokenizer used to encode the dataset and to decode predictions.
	        max_length: maximum sequence length forwarded to the encoding and
	            line-level prediction helpers.
	        id2label: mapping from label id to label name.
	        cls_box: bounding box forwarded to the helpers together with `sep_box`.
	        sep_box: bounding box forwarded to the helpers together with `cls_box`.

	    Returns:
	        A 5-tuple `(msg, img_file, image, csv_file_update, df)` holding the
	        first entry of each per-image list. In the error case `df` is None.
	    """
	    # `partial` is never imported by name in this file (it could only arrive through
	    # `from functions import *`), so import it here to keep the function self-sufficient.
	    from functools import partial

	    filename, msg, images = pdf_to_images(uploaded_pdf)
	    num_images = len(images)

	    if not msg.startswith("Error with the PDF"):

	        # Extraction of image data (text and bounding boxes)
	        dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
	        # prepare our data in the format of the model
	        prepare_inference_features_partial = partial(
	            prepare_inference_features,
	            tokenizer=tokenizer,
	            max_length=max_length,
	            cls_box=cls_box,
	            sep_box=sep_box,
	        )
	        encoded_dataset = dataset.map(
	            prepare_inference_features_partial,
	            batched=True,
	            batch_size=64,
	            remove_columns=dataset.column_names,
	        )
	        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
	        # Get predictions (token level)
	        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(
	            images, custom_encoded_dataset, model_id, model
	        )
	        # Get predictions (line level)
	        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(
	            max_length, tokenizer, id2label, dataset, outputs, images_ids_list,
	            chunk_ids, input_ids, bboxes, cls_box, sep_box
	        )
	        # Get labeled images with lines bounding boxes
	        images = get_labeled_images(id2label, dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

	        # save each labeled image as a PNG file named after the PDF
	        img_files = []
	        for i in range(num_images):
	            if filename != "files/blank.png":
	                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
	            else:
	                img_file = filename.replace(".pdf", ".png")
	            # flatten any directory part so the file lands in the working directory
	            img_file = img_file.replace("/", "_")
	            images[i].save(img_file)
	            img_files.append(img_file)

	        if num_images < max_imgboxes:
	            # pad every per-image collection up to max_imgboxes entries
	            missing = max_imgboxes - num_images
	            img_files += [image_blank] * missing
	            images += [Image.open(image_blank)] * missing
	            for count in range(missing):
	                df[num_images + count] = pd.DataFrame()
	        else:
	            # keep only the first max_imgboxes entries
	            img_files = img_files[:max_imgboxes]
	            images = images[:max_imgboxes]
	            df = dict(itertools.islice(df.items(), max_imgboxes))

	        # save one CSV file per image slot
	        csv_files = []
	        for i in range(max_imgboxes):
	            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
	            csv_file = csv_file.replace("/", "_")
	            csv_files.append(gr.File.update(value=csv_file, visible=True))
	            df[i].to_csv(csv_file, encoding="utf-8", index=False)

	    else:
	        # Error path: placeholders only. The original line here had lost its `*`
	        # operators (`[""]max_imgboxes`), which is a syntax error.
	        img_files = [""] * max_imgboxes
	        images = [""] * max_imgboxes
	        csv_files = [""] * max_imgboxes
	        img_files[0], img_files[1] = image_blank, image_blank
	        images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
	        csv_file = "csv_wo_content.csv"
	        csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
	        # Write the empty CSV once. DataFrame.to_csv returns None when given a path,
	        # which is what the original stored in both slots; keep that value explicit.
	        pd.DataFrame().to_csv(csv_file, encoding="utf-8", index=False)
	        df = {0: None, 1: None}

	    return msg, img_files[0], images[0], csv_files[0], df[0]

	def app_outputs(uploaded_pdf):
	    """Run both models on the uploaded PDF and interleave their outputs.

	    Calls `app_outputs_by_model` once with the LiLT settings and once with the
	    LayoutXLM settings. Each call returns five values (message, image file,
	    image, csv file, dataframe); the result pairs them kind by kind, LiLT first.

	    Returns:
	        A 10-tuple: (msg_lilt, msg_layoutxlm, img_file_lilt, img_file_layoutxlm,
	        image_lilt, image_layoutxlm, csv_lilt, csv_layoutxlm, df_lilt, df_layoutxlm).
	    """
	    lilt_results = app_outputs_by_model(
	        uploaded_pdf,
	        model_id=model_id_lilt,
	        model=model_lilt,
	        tokenizer=tokenizer_lilt,
	        max_length=max_length_lilt,
	        id2label=id2label_lilt,
	        cls_box=cls_box,
	        sep_box=sep_box_lilt,
	    )

	    layoutxlm_results = app_outputs_by_model(
	        uploaded_pdf,
	        model_id=model_id_layoutxlm,
	        model=model_layoutxlm,
	        tokenizer=tokenizer_layoutxlm,
	        max_length=max_length_layoutxlm,
	        id2label=id2label_layoutxlm,
	        cls_box=cls_box,
	        sep_box=sep_box_layoutxlm,
	    )

	    # zip pairs the i-th LiLT value with the i-th LayoutXLM value; flattening the
	    # pairs yields the alternating order the output components are listed in
	    return tuple(value for pair in zip(lilt_results, layoutxlm_results) for value in pair)

	# gradio APP
	# Layout of the page: an HTML header, the PDF upload box, the submit / clear
	# buttons, then five rows (messages, image files, images, csv files, dataframes)
	# that each hold one LiLT component and one LayoutXLM component side by side.
	with gr.Blocks(title="Inference APP for Document Understanding at line level (v1 - LiLT base vs LayoutXLM base)", css=".gradio-container") as demo:
	    # The pipes in the HTML below were written as `\|`, an invalid escape sequence
	    # that also rendered a stray backslash; they are plain `|` here.
	    gr.HTML("""
	<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v1 - LiLT base vs LayoutXLM base)</h1></div>
	<div style="margin-top: 40px"><p>(03/08/2023) This Inference APP compares - only on the first PDF page - 2 Document Understanding models finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> at line level (chunk size of 384 tokens): <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">LiLT base combined with XLM-RoBERTa base</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">LayoutXLM base combined with XLM-RoBERTa base</a>.</p></div>
	<div><p>To test these 2 models separately, use their corresponding APP on Hugging Face Spaces: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1" target="_blank">LiLT base APP (v1 - line level)</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2" target="_blank">LayoutXLM base APP (v2 - line level)</a>.</p></div><div style="margin-top: 20px"><p>Links to Document Understanding APPs:</p><ul><li>Line level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2" target="_blank">v2 (LayoutXLM base)</a></li><li>Paragraph level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2" target="_blank">v2 (LayoutXLM base)</a></li></ul></div><div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p><ul><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="" target="_blank">Document AI | Inference APP and fine-tuning 
	notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
	""")
	    # input: the PDF to analyse
	    with gr.Row():
	        pdf_file = gr.File(label="PDF")
	    with gr.Row():
	        submit_btn = gr.Button("Get layout detection by LiLT and LayoutXLM on the first PDF page")
	        reset_btn = gr.Button(value="Clear")
	    # row 1: status message of each model
	    with gr.Row():
	        output_messages = []
	        with gr.Column():
	            output_msg = gr.Textbox(label="LiLT output message")
	            output_messages.append(output_msg)
	        with gr.Column():
	            output_msg = gr.Textbox(label="LayoutXLM output message")
	            output_messages.append(output_msg)
	    # row 2: downloadable image file of each model
	    with gr.Row():
	        fileboxes = []
	        with gr.Column():
	            file_path = gr.File(visible=True, label="LiLT image file")
	            fileboxes.append(file_path)
	        with gr.Column():
	            file_path = gr.File(visible=True, label="LayoutXLM image file")
	            fileboxes.append(file_path)
	    # row 3: image preview of each model
	    with gr.Row():
	        imgboxes = []
	        with gr.Column():
	            img = gr.Image(type="pil", label="Lilt Image")
	            imgboxes.append(img)
	        with gr.Column():
	            img = gr.Image(type="pil", label="LayoutXLM Image")
	            imgboxes.append(img)
	    # row 4: downloadable csv file of each model
	    with gr.Row():
	        csvboxes = []
	        with gr.Column():
	            csv = gr.File(visible=True, label="LiLT csv file at line level")
	            csvboxes.append(csv)
	        with gr.Column():
	            csv = gr.File(visible=True, label="LayoutXLM csv file at line level")
	            csvboxes.append(csv)
	    # row 5: table (bounding boxes, texts, labels) of each model
	    with gr.Row():
	        dfboxes = []
	        with gr.Column():
	            df = gr.Dataframe(
	                headers=["bounding boxes", "texts", "labels"],
	                datatype=["str", "str", "str"],
	                col_count=(3, "fixed"),
	                visible=True,
	                label="LiLT data",
	                type="pandas",
	                wrap=True,
	            )
	            dfboxes.append(df)
	        with gr.Column():
	            df = gr.Dataframe(
	                headers=["bounding boxes", "texts", "labels"],
	                datatype=["str", "str", "str"],
	                col_count=(3, "fixed"),
	                visible=True,
	                label="LayoutXLM data",
	                type="pandas",
	                wrap=True,
	            )
	            dfboxes.append(df)

	    # The order of this list matches the order of the tuple returned by app_outputs:
	    # messages, image files, images, csv files, dataframes (LiLT first in each pair).
	    outputboxes = output_messages + fileboxes + imgboxes + csvboxes + dfboxes

	    submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)

	    # Clear button: reset the upload box and every output component.
	    # https://github.com/gradio-app/gradio/pull/2044/files#diff-a91dd2749f68bb7d0099a0f4079a4fd2d10281e299e7b451cb1bb876a7c21975R91
	    reset_targets = [pdf_file] + outputboxes

	    def clear_all():
	        """Return one `update(value=None)` per component listed in `reset_targets`."""
	        return [component.update(value=None) for component in reset_targets]

	    reset_btn.click(clear_all, inputs=[], outputs=reset_targets)

	    # cache_examples=True asks Gradio to cache the outputs of app_outputs for the example
	    gr.Examples(
	        examples=[["files/example.pdf"]],
	        inputs=[pdf_file],
	        outputs=outputboxes,
	        fn=app_outputs,
	        cache_examples=True,
	    )

	demo.launch()