"""Gradio demo that answers questions about a document image.

The user uploads an image, picks one of three DocVQA backends (LayoutLM,
Pix2Struct or Donut) and types a question; the selected backend's
``get_result(image, question)`` is called and its return value is shown in a
text box.
"""

import os
import warnings

import gradio as gr

import pix2struct
import layoutlm
import donut

warnings.filterwarnings('ignore')

# Markdown shown under the title of the demo page.
desc = (
    "Step into the DocVQA Sanctum, where three formidable models stand ready "
    "to tackle your document queries head-on! Discover the prowess of "
    "LayoutLM, Pix2Struct, and Donut as they decode your document images and "
    "provide insightful answers to your questions. From LayoutLM's adept "
    "layout analysis to Pix2Struct's prowess in structural understanding and "
    "Donut's skill in content comprehension, this demo offers a captivating "
    "showcase of cutting-edge document visual question answering (DocVQA) "
    "technologies. **Please Note:** Kindly allow a few moments for result "
    "generation, as the models are currently being inferred on CPU. For a "
    "brief overview of what document visual question answering is, check out "
    "my latest blog post "
    "[here](https://medium.com/@krishnapal2308/"
    "understanding-docvqa-document-visual-question-answering-9e3db222bfed)."
)

# Maps the label shown in the model selector to the module implementing it.
# Each module is expected to expose ``get_result(image, question)``.
_MODEL_BACKENDS = {
    "LayoutLM": layoutlm,
    "Pix2Struct": pix2struct,
    "Donut": donut,
}


def process_image_and_generate_output(image, model_selection, question):
    """Run the selected DocVQA model on ``image`` and return its answer.

    Args:
        image: Value produced by the image input component (configured below
            with ``type='filepath'``), or ``None`` when nothing was uploaded.
        model_selection: One of the keys of ``_MODEL_BACKENDS``.
        question: The question to ask about the document.

    Returns:
        Whatever the chosen backend's ``get_result`` returns, a prompt asking
        for an image when ``image`` is ``None``, or an empty string when
        ``model_selection`` does not name a known backend.
    """
    if image is None:
        # The interface has a single text output, so return one plain string
        # (a tuple here would not map onto the single output component).
        return "Please select an image"

    backend = _MODEL_BACKENDS.get(model_selection)
    if backend is None:
        # No model chosen (or an unknown label): nothing to display.
        return ''
    return backend.get_result(image, question)


# Example rows offered by the interface: [image path, model, question].
_sample_image_path = os.path.join(os.path.dirname(__file__), "images/1.png")
sample_images = [
    [_sample_image_path, "LayoutLM", "What is the NIC Code?"],
    [_sample_image_path, "Pix2Struct", "What is the Age Group?"],
    [_sample_image_path, "Donut", "What is the Industry Group?"],
]

# Image upload component; the handler receives the uploaded file's path.
image_input = gr.Image(label="Upload Image", type='filepath')

# Radio buttons to choose which model answers the question.
model_selection_input = gr.Radio(["LayoutLM", "Pix2Struct", "Donut"], label="Choose Model")

# Free-text box for the question.
question_input = gr.Text(label="Question")

iface = gr.Interface(
    fn=process_image_and_generate_output,
    inputs=[image_input, model_selection_input, question_input],
    outputs=gr.Text(label="Result"),
    allow_flagging='never',
    examples=sample_images,
    title="DocVQA Sanctum",
    description=desc,
)

iface.launch()