krishnapal2308 committed on
Commit
0f51c16
·
1 Parent(s): 0bd5bed

adding description and pix2struct output fix

Browse files
.idea/misc.xml CHANGED
@@ -1,4 +1,7 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
 
 
 
3
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (docvqa_venv)" project-jdk-type="Python SDK" />
4
  </project>
 
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.12 (docvqa_venv)" />
5
+ </component>
6
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (docvqa_venv)" project-jdk-type="Python SDK" />
7
  </project>
__pycache__/donut.cpython-312.pyc ADDED
Binary file (2.37 kB). View file
 
__pycache__/layoutlm.cpython-312.pyc ADDED
Binary file (526 Bytes). View file
 
__pycache__/pix2struct.cpython-312.pyc ADDED
Binary file (1.09 kB). View file
 
app.py CHANGED
@@ -2,8 +2,18 @@ import gradio as gr
2
  import warnings
3
  import os
4
  import pix2struct, layoutlm, donut
 
5
  warnings.filterwarnings('ignore')
6
 
 
 
 
 
 
 
 
 
 
7
 
8
  def process_image_and_generate_output(image, model_selection, question):
9
  result = ''
@@ -25,8 +35,8 @@ def process_image_and_generate_output(image, model_selection, question):
25
 
26
  sample_images = [
27
  [os.path.join(os.path.dirname(__file__), "images/1.png"), "LayoutLM", "What is the NIC Code?"],
28
- [os.path.join(os.path.dirname(__file__), "images/1.png"), "Pix2Struct", "What is the NIC Code?"],
29
- [os.path.join(os.path.dirname(__file__), "images/1.png"), "Donut", "What is the NIC Code?"]
30
  ]
31
 
32
  # Create a dropdown to select sample image
@@ -42,6 +52,6 @@ iface = gr.Interface(fn=process_image_and_generate_output,
42
  outputs=gr.Text(label="Result"),
43
  allow_flagging='never',
44
  examples=sample_images,
45
- title="DocVQA Sanctum")
46
 
47
  iface.launch()
 
2
  import warnings
3
  import os
4
  import pix2struct, layoutlm, donut
5
+
6
  warnings.filterwarnings('ignore')
7
 
8
+ desc = """Step into the DocVQA Sanctum, where three formidable models stand ready to tackle your document queries head-on! Discover the prowess of LayoutLM, Pix2Struct, and Donut as they decode your document images and provide insightful answers to your questions.
9
+
10
+ From LayoutLM's adept layout analysis to Pix2Struct's prowess in structural understanding and Donut's skill in content comprehension, this demo offers a captivating showcase of cutting-edge document visual question answering (DocVQA) technologies.
11
+
12
+ **Please Note:** Kindly allow a few moments for result generation, as the models are currently being inferred on CPU.
13
+
14
+ For a brief overview of what document visual question answering is, check out my latest blog post [here](https://medium.com/@krishnapal2308/understanding-docvqa-document-visual-question-answering-9e3db222bfed)."""
15
+
16
+
17
 
18
  def process_image_and_generate_output(image, model_selection, question):
19
  result = ''
 
35
 
36
  sample_images = [
37
  [os.path.join(os.path.dirname(__file__), "images/1.png"), "LayoutLM", "What is the NIC Code?"],
38
+ [os.path.join(os.path.dirname(__file__), "images/1.png"), "Pix2Struct", "What is the Age Group?"],
39
+ [os.path.join(os.path.dirname(__file__), "images/1.png"), "Donut", "What is the Industry Group?"]
40
  ]
41
 
42
  # Create a dropdown to select sample image
 
52
  outputs=gr.Text(label="Result"),
53
  allow_flagging='never',
54
  examples=sample_images,
55
+ title="DocVQA Sanctum", description=desc)
56
 
57
  iface.launch()
pix2struct.py CHANGED
@@ -12,4 +12,4 @@ def get_result(image_path, question):
12
  predictions = model.generate(**inputs, max_new_tokens=256)
13
  predicted_answer = processor.batch_decode(predictions, skip_special_tokens=True)
14
 
15
- return predicted_answer
 
12
  predictions = model.generate(**inputs, max_new_tokens=256)
13
  predicted_answer = processor.batch_decode(predictions, skip_special_tokens=True)
14
 
15
+ return predicted_answer[0]