krishnapal2308 committed
Commit 0f51c16
1 Parent(s): 0bd5bed
adding description and pix2struct output fix
- .idea/misc.xml +3 -0
- __pycache__/donut.cpython-312.pyc +0 -0
- __pycache__/layoutlm.cpython-312.pyc +0 -0
- __pycache__/pix2struct.cpython-312.pyc +0 -0
- app.py +13 -3
- pix2struct.py +1 -1
.idea/misc.xml CHANGED
@@ -1,4 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.12 (docvqa_venv)" />
+  </component>
   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (docvqa_venv)" project-jdk-type="Python SDK" />
 </project>
__pycache__/donut.cpython-312.pyc ADDED
Binary file (2.37 kB)

__pycache__/layoutlm.cpython-312.pyc ADDED
Binary file (526 Bytes)

__pycache__/pix2struct.cpython-312.pyc ADDED
Binary file (1.09 kB)
app.py CHANGED
@@ -2,8 +2,18 @@ import gradio as gr
 import warnings
 import os
 import pix2struct, layoutlm, donut
+
 warnings.filterwarnings('ignore')
 
+desc = """Step into the DocVQA Sanctum, where three formidable models stand ready to tackle your document queries head-on! Discover the prowess of LayoutLM, Pix2Struct, and Donut as they decode your document images and provide insightful answers to your questions.
+
+From LayoutLM's adept layout analysis to Pix2Struct's prowess in structural understanding and Donut's skill in content comprehension, this demo offers a captivating showcase of cutting-edge document visual question answering (DocVQA) technologies.
+
+**Please Note:** Kindly allow a few moments for result generation, as the models are currently being inferred on CPU.
+
+For a brief overview of what document visual question answering is, check out my latest blog post [here](https://medium.com/@krishnapal2308/understanding-docvqa-document-visual-question-answering-9e3db222bfed)."""
+
+
 
 def process_image_and_generate_output(image, model_selection, question):
     result = ''
@@ -25,8 +35,8 @@ def process_image_and_generate_output(image, model_selection, question):
 
 sample_images = [
     [os.path.join(os.path.dirname(__file__), "images/1.png"), "LayoutLM", "What is the NIC Code?"],
-    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Pix2Struct", "What is the
-    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Donut", "What is the
+    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Pix2Struct", "What is the Age Group?"],
+    [os.path.join(os.path.dirname(__file__), "images/1.png"), "Donut", "What is the Industry Group?"]
 ]
 
 # Create a dropdown to select sample image
@@ -42,6 +52,6 @@ iface = gr.Interface(fn=process_image_and_generate_output,
                      outputs=gr.Text(label="Result"),
                      allow_flagging='never',
                      examples=sample_images,
-                     title="DocVQA Sanctum")
+                     title="DocVQA Sanctum", description=desc)
 
 iface.launch()
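In app.py, the new description=desc argument is what surfaces the intro text on the demo page: gr.Interface renders the description string (Markdown supported) beneath the title. A minimal sketch of how the pieces fit together, using a stub handler, a shortened stand-in for the description, and assumed input components (only the output-side arguments are visible in this diff):

import gradio as gr
import os

# Stub handler standing in for the real dispatcher to LayoutLM / Pix2Struct / Donut.
def process_image_and_generate_output(image, model_selection, question):
    return f"({model_selection}) answer for: {question}"

# Shortened stand-in for the multi-paragraph Markdown description added in this commit.
desc = """Ask LayoutLM, Pix2Struct, or Donut a question about a document image."""

sample_images = [
    # Each example row fills the inputs in order: image path, model choice, question.
    [os.path.join(os.path.dirname(__file__), "images/1.png"), "LayoutLM", "What is the NIC Code?"],
]

# The input components below are assumptions; the diff only shows outputs, examples,
# flagging, and title.
iface = gr.Interface(fn=process_image_and_generate_output,
                     inputs=[gr.Image(type="filepath", label="Document image"),
                             gr.Radio(["LayoutLM", "Pix2Struct", "Donut"], label="Model"),
                             gr.Textbox(label="Question")],
                     outputs=gr.Text(label="Result"),
                     allow_flagging='never',
                     examples=sample_images,
                     title="DocVQA Sanctum",
                     description=desc)  # rendered as Markdown under the title

iface.launch()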
pix2struct.py CHANGED
@@ -12,4 +12,4 @@ def get_result(image_path, question):
     predictions = model.generate(**inputs, max_new_tokens=256)
     predicted_answer = processor.batch_decode(predictions, skip_special_tokens=True)
 
-    return predicted_answer
+    return predicted_answer[0]
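The pix2struct.py change is the "output fix" from the commit message: processor.batch_decode returns a list of decoded strings (one per generated sequence), so the old code handed Gradio a one-element list and the Result box would show something like ['1991'] instead of 1991. Indexing with [0] returns the plain answer string. A minimal sketch of the surrounding get_result, assuming the google/pix2struct-docvqa-base checkpoint (the loaded checkpoint is not visible in this diff):

from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

# Checkpoint name is an assumption; the diff does not show which model is loaded.
MODEL_NAME = "google/pix2struct-docvqa-base"
processor = Pix2StructProcessor.from_pretrained(MODEL_NAME)
model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_NAME)

def get_result(image_path, question):
    image = Image.open(image_path)
    # DocVQA-tuned Pix2Struct takes the question as the text prompt alongside the image.
    inputs = processor(images=image, text=question, return_tensors="pt")
    predictions = model.generate(**inputs, max_new_tokens=256)
    # batch_decode -> list of strings; [0] unwraps the single answer for Gradio's Text output.
    predicted_answer = processor.batch_decode(predictions, skip_special_tokens=True)
    return predicted_answer[0]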