Inference-APP-Document-Understanding-at-linelevel-v1

Sleeping

App Files Files Community

Inference-APP-Document-Understanding-at-linelevel-v1 / app.py

pierreguillou

Create app.py

b565cf9 about 2 years ago

raw

history blame

10.4 kB

	import os
	import gradio as gr
	import re
	import string

	from operator import itemgetter
	import collections

	import pypdf
	from pypdf import PdfReader
	from pypdf.errors import PdfReadError

	import pdf2image
	from pdf2image import convert_from_path
	import langdetect
	from langdetect import detect_langs

	import pandas as pd
	import numpy as np
	import random
	import tempfile
	import itertools

	from matplotlib import font_manager
	from PIL import Image, ImageDraw, ImageFont
	import cv2

	## files

	import sys
	sys.path.insert(0, 'files/')

	import functions
	from functions import *

	# Upgrade pip at start-up (best effort: the exit status is ignored).
	# pip is run through the interpreter that executes this app, with an argument
	# list instead of a shell string, so the upgraded pip belongs to the running
	# Python rather than to whatever `python` happens to be first on PATH.
	import subprocess
	subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'], check=False)

	# APP outputs
	def app_outputs(uploaded_pdf):
	    """Run the line-level inference pipeline on a PDF and build the Gradio outputs.

	    The PDF is handed to ``pdf_to_images``; unless the returned message starts
	    with "Error with the PDF", the page images go through data extraction,
	    token-level and line-level prediction, and labeling. Labeled page images
	    and one CSV file per output slot are written to the working directory.

	    Exactly ``max_imgboxes`` slots are filled for each kind of output: missing
	    pages are padded with the blank image and an empty dataframe, extra pages
	    are dropped.

	    Args:
	        uploaded_pdf: value of the "PDF" file input, passed unchanged to
	            ``pdf_to_images``.

	    Returns:
	        A flat tuple ``(msg, *image file paths, *images, *CSV file updates,
	        *dataframes)`` with ``max_imgboxes`` entries per group, in the same
	        order as the output components wired to this function.
	    """
	    filename, msg, images = pdf_to_images(uploaded_pdf)
	    slots = range(max_imgboxes)

	    if not msg.startswith("Error with the PDF"):
	        num_images = len(images)

	        # Extraction of image data (text and bounding boxes)
	        dataset, _lines, _row_indexes, _par_boxes, _line_boxes = extraction_data_from_image(images)
	        # Prepare the data in the format expected by the model
	        encoded_dataset = dataset.map(
	            prepare_inference_features,
	            batched=True,
	            batch_size=64,
	            remove_columns=dataset.column_names,
	        )
	        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
	        # Get predictions (token level)
	        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
	        # Get predictions (line level)
	        _probs_bbox, bboxes_list_dict, _input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
	        # Get labeled images with lines bounding boxes
	        images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

	        # Save the labeled page images (``images`` was just replaced by the
	        # labeled versions) and remember their file names.
	        img_files = []
	        for i in range(num_images):
	            if filename != "files/blank.png":
	                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
	            else:
	                img_file = filename.replace(".pdf", ".png")
	            images[i].save(img_file)
	            img_files.append(img_file)

	        missing = max_imgboxes - num_images
	        if missing > 0:
	            # Fewer pages than output slots: pad with blank placeholders.
	            img_files += [image_blank] * missing
	            images += [Image.open(image_blank)] * missing
	            for count in range(missing):
	                df[num_images + count] = pd.DataFrame()
	        else:
	            # Enough (or too many) pages: keep only the first max_imgboxes.
	            img_files = img_files[:max_imgboxes]
	            images = images[:max_imgboxes]
	            df = dict(itertools.islice(df.items(), max_imgboxes))

	        # Save one CSV file per output slot
	        csv_files = []
	        for i in slots:
	            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
	            csv_files.append(gr.File.update(value=csv_file, visible=True))
	            df[i].to_csv(csv_file, encoding="utf-8", index=False)

	    else:
	        # The PDF could not be processed: every slot gets the blank image,
	        # a shared empty CSV file and an empty dataframe.
	        img_files = [image_blank for _ in slots]
	        images = [Image.open(image_blank) for _ in slots]
	        csv_file = "csv_wo_content.csv"
	        pd.DataFrame().to_csv(csv_file, encoding="utf-8", index=False)
	        csv_files = [gr.File.update(value=csv_file, visible=True) for _ in slots]
	        # to_csv() returns None when given a path, so its result must not be
	        # used as the dataframe output; use real empty dataframes instead.
	        df = {i: pd.DataFrame() for i in slots}

	    return (
	        msg,
	        *(img_files[i] for i in slots),
	        *(images[i] for i in slots),
	        *(csv_files[i] for i in slots),
	        *(df[i] for i in slots),
	    )

	# gradio APP
	# Layout: PDF input, action buttons, status message, then one row per output
	# kind (image files, images, CSV files, dataframes) with max_imgboxes
	# components each. The component order in `outputboxes` must match the order
	# of the values returned by app_outputs.
	with gr.Blocks(title="Inference APP for Document Understanding at line level (v1)", css=".gradio-container") as demo:
	    # NOTE: titles below use a plain `|` separator; a backslash before `|` is
	    # not a valid escape sequence in a non-raw string and would be rendered
	    # literally on the page.
	    gr.HTML("""
	<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v1)</h1></div>
	<div style="margin-top: 40px"><p>(02/12/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base</a> at line level (chunk size of 384 tokens).</p></div>
	<div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2202.13669" target="_blank">LiLT (Language-Independent Layout Transformer)</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XML-RoBERTa base</a>, this finetuned model has the capacity to understand any language. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can classifly any bounding box (and its OCR text) to 11 labels (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
	<div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, let's run in this APP an OCR engine ourselves (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) as we'll need to do it in real life to get the bounding boxes, then run LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens and then, visualize the result at line level!</p></div>
	<div><p>From any PDF (of any language), it allows to get all pages with bounding boxes labeled at line level and the associated dataframes with labeled data (bounding boxes, texts, labels).</p></div>
	<div><p>To avoid running this APP for too long, <b>only the first 3 pages are processed by this APP</b>. If you want to update this limit, you can either clone this APP and change the value of the parameter <code>max_imgboxes</code>, or run the corresponding notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)</a>" which does not have this limit.</p></div>
	<div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p>
	<ul><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">(02/10/2023) Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank"> (01/31/2023) Document AI | DocLayNet image viewer APP</a></li><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
	""")

	    # Input and actions
	    with gr.Row():
	        pdf_file = gr.File(label="PDF")
	    with gr.Row():
	        submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
	        reset_btn = gr.Button(value="Clear")
	    with gr.Row():
	        output_msg = gr.Textbox(label="Output message")

	    # One row per output kind, max_imgboxes components in each row
	    with gr.Row():
	        fileboxes = [
	            gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
	            for num_page in range(max_imgboxes)
	        ]
	    with gr.Row():
	        imgboxes = [
	            gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
	            for num_page in range(max_imgboxes)
	        ]
	    with gr.Row():
	        csvboxes = [
	            gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
	            for num_page in range(max_imgboxes)
	        ]
	    with gr.Row():
	        dfboxes = [
	            gr.Dataframe(
	                headers=["bounding boxes", "texts", "labels"],
	                datatype=["str", "str", "str"],
	                col_count=(3, "fixed"),
	                visible=True,
	                label=f"Data of page {num_page}",
	                type="pandas",
	                wrap=True,
	            )
	            for num_page in range(max_imgboxes)
	        ]

	    # Same grouping and order as the tuple returned by app_outputs
	    outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
	    submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)

	    # "Clear" empties the input as well as every output component
	    resettable = [pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes

	    def reset_components():
	        """Return one value-clearing update per component in `resettable`."""
	        return [component.update(value=None) for component in resettable]

	    reset_btn.click(
	        reset_components,
	        inputs=[],
	        outputs=resettable,
	    )

	    gr.Examples(
	        [["files/example.pdf"]],
	        [pdf_file],
	        outputboxes,
	        fn=app_outputs,
	        cache_examples=True,
	    )

	demo.launch(debug=True)