Spaces:

krishnapal2308
/

DocVQA-Sanctum

Runtime error

App Files Files Community

krishnapal2308 committed on Feb 15, 2024

Commit

0f51c16

1 Parent(s): 0bd5bed

adding description and pix2struct output fix

Browse files

Files changed (6) hide show

.idea/misc.xml +3 -0
__pycache__/donut.cpython-312.pyc +0 -0
__pycache__/layoutlm.cpython-312.pyc +0 -0
__pycache__/pix2struct.cpython-312.pyc +0 -0
app.py +13 -3
pix2struct.py +1 -1

.idea/misc.xml CHANGED Viewed

@@ -1,4 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (docvqa_venv)" project-jdk-type="Python SDK" />
 </project>

 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.12 (docvqa_venv)" />
+  </component>
   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (docvqa_venv)" project-jdk-type="Python SDK" />
 </project>

__pycache__/donut.cpython-312.pyc ADDED Viewed

Binary file (2.37 kB). View file

__pycache__/layoutlm.cpython-312.pyc ADDED Viewed

Binary file (526 Bytes). View file

__pycache__/pix2struct.cpython-312.pyc ADDED Viewed

Binary file (1.09 kB). View file

app.py CHANGED Viewed

@@ -2,8 +2,18 @@ import gradio as gr
 import warnings
 import os
 import pix2struct, layoutlm, donut
 warnings.filterwarnings('ignore')
 def process_image_and_generate_output(image, model_selection, question):
     result = ''
@@ -25,8 +35,8 @@ def process_image_and_generate_output(image, model_selection, question):
 sample_images = [
     [os.path.join(os.path.dirname(__file__), "images/1.png"), "LayoutLM", "What is the NIC Code?"],
-    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Pix2Struct", "What is the NIC Code?"],
-    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Donut", "What is the NIC Code?"]
 ]
 # Create a dropdown to select sample image
@@ -42,6 +52,6 @@ iface = gr.Interface(fn=process_image_and_generate_output,
                      outputs=gr.Text(label="Result"),
                      allow_flagging='never',
                      examples=sample_images,
-                     title="DocVQA Sanctum")
 iface.launch()

 import warnings
 import os
 import pix2struct, layoutlm, donut
 warnings.filterwarnings('ignore')
+desc = """Step into the DocVQA Sanctum, where three formidable models stand ready to tackle your document queries head-on! Discover the prowess of LayoutLM, Pix2Struct, and Donut as they decode your document images and provide insightful answers to your questions.
+From LayoutLM's adept layout analysis to Pix2Struct's prowess in structural understanding and Donut's skill in content comprehension, this demo offers a captivating showcase of cutting-edge document visual question answering (DocVQA) technologies.
+**Please Note:** Kindly allow a few moments for result generation, as the models are currently being inferred on CPU.
+For a brief overview of what document visual question answering is, check out my latest blog post [here](https://medium.com/@krishnapal2308/understanding-docvqa-document-visual-question-answering-9e3db222bfed)."""
 def process_image_and_generate_output(image, model_selection, question):
     result = ''
 sample_images = [
     [os.path.join(os.path.dirname(__file__), "images/1.png"), "LayoutLM", "What is the NIC Code?"],
+    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Pix2Struct", "What is the Age Group?"],
+    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Donut", "What is the Industry Group?"]
 ]
 # Create a dropdown to select sample image
                      outputs=gr.Text(label="Result"),
                      allow_flagging='never',
                      examples=sample_images,
+                     title="DocVQA Sanctum", description=desc)
 iface.launch()

pix2struct.py CHANGED Viewed

@@ -12,4 +12,4 @@ def get_result(image_path, question):
     predictions = model.generate(**inputs, max_new_tokens=256)
     predicted_answer = processor.batch_decode(predictions, skip_special_tokens=True)
-    return predicted_answer

     predictions = model.generate(**inputs, max_new_tokens=256)
     predicted_answer = processor.batch_decode(predictions, skip_special_tokens=True)
+    return predicted_answer[0]