# DocVQA-Sanctum / app.py
# Author: krishnapal2308
# Change: adds description and Pix2Struct output fix
import gradio as gr
import warnings
import os
import pix2struct, layoutlm, donut
warnings.filterwarnings('ignore')
# Markdown blurb rendered above the demo UI (passed as `description=` to
# gr.Interface below). This is runtime text shown to users as-is — do not
# reword casually.
desc = """Step into the DocVQA Sanctum, where three formidable models stand ready to tackle your document queries head-on! Discover the prowess of LayoutLM, Pix2Struct, and Donut as they decode your document images and provide insightful answers to your questions.
From LayoutLM's adept layout analysis to Pix2Struct's prowess in structural understanding and Donut's skill in content comprehension, this demo offers a captivating showcase of cutting-edge document visual question answering (DocVQA) technologies.
**Please Note:** Kindly allow a few moments for result generation, as the models are currently being inferred on CPU.
For a brief overview of what document visual question answering is, check out my latest blog post [here](https://medium.com/@krishnapal2308/understanding-docvqa-document-visual-question-answering-9e3db222bfed)."""
def process_image_and_generate_output(image, model_selection, question):
    """Answer *question* about *image* using the selected DocVQA backend.

    Parameters
    ----------
    image : str | None
        Path to the uploaded document image (the Gradio Image component
        uses ``type='filepath'``), or None when nothing was uploaded.
    model_selection : str
        One of "LayoutLM", "Pix2Struct" or "Donut".
    question : str
        Natural-language question about the document.

    Returns
    -------
    str
        The model's answer, a prompt asking the user to select an image,
        or '' for an unrecognized model selection.
    """
    # Guard clause. The interface declares a single text output, so this
    # must be a plain string — the previous code returned a 2-tuple
    # ("Please select an image", None), which Gradio cannot map onto
    # one output component.
    if image is None:
        return "Please select an image"
    # Dispatch table instead of an if-chain: all three backends expose
    # the same (image, question) -> answer entry point.
    handlers = {
        "LayoutLM": layoutlm.get_result,
        "Pix2Struct": pix2struct.get_result,
        "Donut": donut.get_result,
    }
    handler = handlers.get(model_selection)
    if handler is None:
        # Preserve the original fall-through value for unknown choices.
        return ''
    return handler(image, question)
# Example rows for the Gradio demo: [image path, model choice, question].
# All three examples share the same bundled sample document.
_sample_image_path = os.path.join(os.path.dirname(__file__), "images/1.png")
sample_images = [
    [_sample_image_path, model, question]
    for model, question in (
        ("LayoutLM", "What is the NIC Code?"),
        ("Pix2Struct", "What is the Age Group?"),
        ("Donut", "What is the Industry Group?"),
    )
]
# --- Gradio UI wiring -------------------------------------------------
# Input widgets: an image upload (delivered as a file path), a radio
# picker for the model backend, and a free-text question box.
uploaded_image = gr.Image(label="Upload Image", type='filepath')
model_picker = gr.Radio(
    ["LayoutLM", "Pix2Struct", "Donut"],
    label="Choose Model",
)
question_box = gr.Text(label="Question")

# Assemble the demo: three inputs feed the dispatcher, one text output
# shows the answer; flagging is disabled for this showcase.
iface = gr.Interface(
    fn=process_image_and_generate_output,
    inputs=[uploaded_image, model_picker, question_box],
    outputs=gr.Text(label="Result"),
    allow_flagging='never',
    examples=sample_images,
    title="DocVQA Sanctum",
    description=desc,
)
iface.launch()