invoice_document_headers_extraction_with_donut

Running

App Files Files Community

invoice_document_headers_extraction_with_donut / app.py

to-be

Update app.py

650ae3b over 1 year ago

raw

history blame

No virus

7.12 kB

	import re
	import gradio as gr

	import torch
	from transformers import DonutProcessor, VisionEncoderDecoderModel
	from PIL import Image
	import requests
	from io import BytesIO
	import json
	import os


	# Both the processor and the model weights are read from the same local
	# checkpoint directory.
	_CHECKPOINT_DIR = "./donut-base-finetuned-inv"

	processor = DonutProcessor.from_pretrained(_CHECKPOINT_DIR)
	model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT_DIR)

	# Use the GPU when torch reports one, otherwise stay on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	model.to(device)


	def update_status(state):
	    """Return the pair of image updates that corresponds to *state*.

	    Args:
	        state: One of ``"start_or_clear"``, ``"processing"`` or
	            ``"finished_processing"``.

	    Returns:
	        A 2-tuple of ``gr.update(...)`` values (presumably for the two
	        status images -- confirm against the event wiring), or ``None``
	        when *state* is not one of the three known values.

	    NOTE(review): this function only maps a state to updates; it does not
	    advance the state. Rebinding the ``state`` argument locally has no
	    effect on the caller, so a caller that wants a state machine must
	    track the next state itself.
	    """
	    # Both the initial state and the "finished" state show the GIF.
	    if state in ("start_or_clear", "finished_processing"):
	        return (
	            gr.update(value="snowangel.gif", visible=True),
	            gr.update(value="snowangel.gif", visible=True),
	        )

	    # While processing, both images are emptied and hidden.
	    if state == "processing":
	        return (
	            gr.update(value="", visible=False),
	            gr.update(value="", visible=False),
	        )

	    # Unknown state: nothing to update.
	    return None

	def process_document(image):
	    """Run the Donut model on *image* and return the parsed fields.

	    Uses the module-level ``processor``, ``model`` and ``device``.

	    Args:
	        image: The document image in a form ``processor`` accepts, or
	            ``None`` when no image was supplied.

	    Returns:
	        A ``(fields, image)`` tuple: the result of
	        ``processor.token2json`` on the generated sequence, and the input
	        image unchanged. ``(None, None)`` when *image* is ``None``.
	    """
	    # Nothing to process (e.g. the button was clicked without an upload):
	    # return empty values instead of failing inside the processor.
	    if image is None:
	        return None, None

	    tokenizer = processor.tokenizer

	    # prepare encoder inputs
	    pixel_values = processor(image, return_tensors="pt").pixel_values

	    # prepare decoder inputs
	    task_prompt = "<s_cord-v2>"
	    decoder_input_ids = tokenizer(
	        task_prompt, add_special_tokens=False, return_tensors="pt"
	    ).input_ids

	    # generate answer
	    outputs = model.generate(
	        pixel_values.to(device),
	        decoder_input_ids=decoder_input_ids.to(device),
	        max_length=model.decoder.config.max_position_embeddings,
	        early_stopping=True,
	        pad_token_id=tokenizer.pad_token_id,
	        eos_token_id=tokenizer.eos_token_id,
	        use_cache=True,
	        num_beams=1,
	        bad_words_ids=[[tokenizer.unk_token_id]],
	        return_dict_in_generate=True,
	    )

	    # postprocess: strip EOS/PAD tokens, then the first task start token
	    sequence = processor.batch_decode(outputs.sequences)[0]
	    sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
	    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()

	    return processor.token2json(sequence), image

	# Static HTML shown at the top of the page. The long literals are split into
	# adjacent pieces; pieces that contain an apostrophe are double-quoted so the
	# quote does not terminate the literal.
	title = (
	    '<table align="center" border="0" cellpadding="1" cellspacing="1" ><tbody><tr>'
	    '<td style="text-align:center"><img alt="" src="https://huggingface.co/spaces/to-be/invoice_document_headers_extraction_with_donut/resolve/main/circling_small.gif" style="float:right; height:50px; width:50px" /></td>'
	    '<td style="text-align:center"><h1>   Welcome</h1></td>'
	    '<td style="text-align:center"><img alt="" src="https://huggingface.co/spaces/to-be/invoice_document_headers_extraction_with_donut/resolve/main/circling2_small.gif" style="float:left; height:50px; width:50px" /></td>'
	    '</tr></tbody></table>'
	)

	paragraph1 = (
	    '<p>Basic idea of this 🍩 model is to give it an image as input and extract indexes as text. No bounding boxes or confidences are generated.<br /> '
	    'For more info, see the <a href="https://arxiv.org/abs/2111.15664">original paper</a> and the 🤗 <a href="https://huggingface.co/naver-clova-ix/donut-base">model</a>.</p>'
	)

	# Index names listed in the "Training" paragraph, in display order.
	_TRAINED_INDEXES = (
	    "DocType",
	    "Currency",
	    "DocumentDate",
	    "GrossAmount",
	    "InvoiceNumber",
	    "NetAmount",
	    "TaxAmount",
	    "OrderNumber",
	    "CreditorCountry",
	)

	paragraph2 = (
	    '<p><strong>Training</strong>:<br />'
	    "The model was trained with a few thousand of annotated invoices and non-invoices (for those the doctype will be 'Other'). "
	    'They span across different countries and languages. They are always one page only. The dataset is proprietary unfortunately. Model is set to input resolution of 1280x1920 pixels. So any sample you want to try with higher dpi than 150 has no added value.<br />'
	    'It was trained for about 4 hours on a NVIDIA RTX A4000 for 20k steps with a val_metric of 0.03413819904382196 at the end.<br />'
	    'The <u>following indexes</u> were included in the train set:</p>'
	    '<ul>'
	    + "".join(
	        '<li><span style="font-family:Calibri"><span style="color:black">' + index_name + '</span></span></li>'
	        for index_name in _TRAINED_INDEXES
	    )
	    + '</ul>'
	)

	paragraph3 = (
	    '<p><strong>Try it out:</strong><br />'
	    "To use it, simply upload your image and click 'submit', or click one of the examples to load them.<br />"
	    '<em>(because this is running on the free cpu tier, it will take about 40 secs before you see a result. On a GPU it takes less than 2 seconds)</em></p>'
	    '<p> </p><p>Have fun 😎</p><p>Toon Beerten</p>'
	)

	# Page CSS: applies to the element with id "inp".
	css = "#inp {height: auto !important; width: 100% !important;}"

	# Page layout: intro HTML, an upload/examples row, a row with the extract
	# button between two initially hidden GIFs, and a results row.
	with gr.Blocks(css=css) as demo:
	    state = gr.State(value="start_or_clear")

	    for html_block in (title, paragraph1, paragraph2, paragraph3):
	        gr.HTML(html_block)

	    # Row 1: upload widget next to the clickable examples.
	    with gr.Row().style():
	        with gr.Column(scale=1):
	            inp = gr.Image(label="Upload invoice here:")
	        with gr.Column(scale=2):
	            example_files = [["example.jpg"], ["example_2.jpg"], ["example_3.jpg"]]
	            gr.Examples(
	                example_files,
	                inputs=[inp],
	                label="Or use one of these examples:",
	            )

	    # Row 2: the extract button with a hidden GIF on either side.
	    with gr.Row().style(equal_height=True, height=200, rounded=False):
	        with gr.Column(scale=1):
	            img2 = gr.Image("drinking.gif", label=" ", visible=False).style(rounded=True)
	        with gr.Column(scale=1):
	            btn = gr.Button("↓ Extract ↓")
	        with gr.Column(scale=1):
	            img3 = gr.Image("snowangel.gif", label=" ", visible=False).style(rounded=True)

	    # Row 3: the processed document beside the extracted JSON.
	    with gr.Row().style():
	        with gr.Column(scale=2):
	            imgout = gr.Image(label="Uploaded document:", elem_id="inp")
	        with gr.Column(scale=1):
	            jsonout = gr.JSON(label="Extracted information:")

	    # NOTE: `state`, `img2` and `img3` are not attached to any event here;
	    # the button click below is the only listener that is registered.
	    btn.click(fn=process_document, inputs=inp, outputs=[jsonout, imgout])

	demo.launch()