Add chatbot and audiobot pages to application.
Use streamlit to create chatbot and audiobot pages for visual question answering.
- .gitignore +10 -0
- README.md +24 -1
- app.py +39 -0
- audiobot.py +60 -0
- chatbot.py +57 -0
- model/predictor.py +48 -0
- requirements.txt +116 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Distribution / packaging
+.Python
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
README.md
CHANGED
@@ -4,10 +4,33 @@ emoji: π
 colorFrom: indigo
 colorTo: purple
 sdk: streamlit
+python_version: 3.9.0
 sdk_version: 1.10.0
 app_file: app.py
+models: ['Madhuri/t5_small_vqa_fs', 'dandelin/vilt-b32-finetuned-vqa']
 pinned: false
 license: mit
 ---

-
+## Visual Question Answering - Bot
+
+VQA Bot addresses the challenge of visual question answering with chat and voice assistance.
+It merges a vision transformer and a language generator with an audio transformer.
+We pretrained and finetuned our model on the language and audio transformers to get the desired result.
+Please use the radio buttons below to navigate.
+
+
+## References
+
+> ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision
+>
+> Author: Wonjae Kim and Bokyung Son and Ildoo Kim
+>
+> Year: 2021
+>
+> eprint: 2102.03334
+>
+> archivePrefix: arXiv
+>
+> primaryClass: stat.ML
+
app.py
ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+
+from model import predictor
+import audiobot
+import chatbot
+import os
+
+
+def run():
+    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+    st.set_page_config(
+        page_title='Visual Question Answering - Bot',
+        page_icon=':robot:',
+        layout='wide'
+    )
+
+    st.session_state['predictor'] = predictor.Predictor()
+
+    st.sidebar.title('VQA Bot')
+    st.sidebar.write('''
+    VQA Bot addresses the challenge of visual question answering with chat and voice assistance.
+    It merges a vision transformer and a language generator with an audio transformer.
+    We pretrained and finetuned our model on the language and audio transformers to get the desired result.
+    Please use the radio buttons below to navigate.
+    ''')
+
+    selected_page = st.sidebar.radio('', ('VQA Chatbot', 'VQA Audiobot'))
+    if selected_page == 'VQA Chatbot':
+        chatbot.show()
+    elif selected_page == 'VQA Audiobot':
+        audiobot.show()
+
+    st.sidebar.write(
+        'Madhuri Sakhare: [Hugging Face](https://huggingface.co/Madhuri)',
+        '[Github](https://github.com/msak1612/vqa_chatbot)'
+    )
+
+
+run()
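
app.py treats each page as a module that exposes a parameterless `show()` function and dispatches to it from the sidebar radio. As a sketch of that convention, a hypothetical third page (`aboutbot` is invented here purely for illustration) would only need:

```python
# aboutbot.py -- hypothetical page module, invented for illustration only.
# To wire it up, app.py would add 'VQA Aboutbot' to the radio tuple and
# call aboutbot.show() when that option is selected.
import streamlit as st


def show():
    st.title('About VQA Bot')
    st.write('Each page is a plain module with a show() entry point.')
```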
audiobot.py
ADDED
@@ -0,0 +1,60 @@
+import streamlit as st
+from PIL import Image
+
+from bokeh.models.widgets import Button
+from bokeh.models import CustomJS
+from streamlit_bokeh_events import streamlit_bokeh_events
+import subprocess
+
+
+def show():
+    st.title('Visual Question Answering - Audiobot')
+    st.markdown('''
+    <h4 style='text-align: center; color: #B2BEB5;'>
+    <i>Hi, I am a Visual Audiobot, capable of answering a sequence of questions about images.
+    Please upload an image and fire away!
+    </i></h4>
+    ''', unsafe_allow_html=True)
+    upload_pic = st.file_uploader('Choose an image...', type=[
+        'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+    if upload_pic is not None:
+        st.session_state.image = Image.open(upload_pic)
+        st.image(upload_pic, use_column_width='auto')
+    else:
+        st.session_state.image = None
+
+    # Speech recognition in Streamlit, based on
+    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
+    stt_button = Button(label='Ask', width=100)
+    stt_button.js_on_event('button_click', CustomJS(code='''
+        var recognition = new webkitSpeechRecognition();
+        recognition.continuous = false;
+        recognition.interimResults = false;
+
+        recognition.onresult = function (e) {
+            var value = '';
+            for (var i = e.resultIndex; i < e.results.length; ++i) {
+                if (e.results[i].isFinal) {
+                    value += e.results[i][0].transcript;
+                }
+            }
+            if (value != '') {
+                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
+            }
+        }
+        recognition.start();
+    '''))
+
+    result = streamlit_bokeh_events(
+        stt_button,
+        events='GET_TEXT',
+        key='listen',
+        refresh_on_update=False,
+        override_height=75,
+        debounce_time=0)
+
+    if result:
+        if 'GET_TEXT' in result:
+            answer = st.session_state.predictor.predict_answer_from_text(
+                st.session_state.image, result.get('GET_TEXT'))
+            subprocess.check_output(['say', answer])
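
Note that `subprocess.check_output(['say', answer])` depends on the macOS-only `say` command, so the spoken reply is platform-specific. A minimal cross-platform fallback sketch, assuming pyttsx3 (which is not in requirements.txt) were added as a dependency:

```python
# Hedged sketch of a cross-platform replacement for the 'say' call;
# pyttsx3 is an assumed extra dependency, not part of this commit.
import subprocess
import sys


def speak(answer):
    if sys.platform == 'darwin':
        subprocess.check_output(['say', answer])  # macOS built-in TTS
    else:
        import pyttsx3  # offline TTS engine for Windows/Linux
        engine = pyttsx3.init()
        engine.say(answer)
        engine.runAndWait()
```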
chatbot.py
ADDED
@@ -0,0 +1,57 @@
+import streamlit as st
+from streamlit_chat import message
+from PIL import Image
+
+
+def init_chat_history():
+    if 'question' not in st.session_state:
+        st.session_state['question'] = []
+
+    if 'answer' not in st.session_state:
+        st.session_state['answer'] = []
+
+
+def update_chat_messages():
+    if st.session_state['answer']:
+        for i in range(len(st.session_state['answer'])-1, -1, -1):
+            message(st.session_state['answer'][i], key=str(i))
+            message(st.session_state['question'][i],
+                    is_user=True, key=str(i) + '_user')
+
+
+def predict(image, input):
+    if image is None or not input:
+        return
+
+    answer = st.session_state.predictor.predict_answer_from_text(image, input)
+    st.session_state.question.append(input)
+    st.session_state.answer.append(answer)
+
+
+def show():
+    init_chat_history()
+
+    st.title('Visual Question Answering - Chatbot')
+    st.markdown('''
+    <h4 style='text-align: center; color: #B2BEB5;'>
+    <i>Hi, I am a Visual Chatbot, capable of answering a sequence of questions about images.
+    Please upload an image and fire away!
+    </i></h4>
+    ''', unsafe_allow_html=True)
+
+    image_col, text_col = st.columns(2)
+    with image_col:
+        upload_pic = st.file_uploader('Choose an image...', type=[
+            'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+        if upload_pic is not None:
+            image = Image.open(upload_pic)
+            st.image(upload_pic, use_column_width='auto')
+        else:
+            st.session_state.question.clear()
+            st.session_state.answer.clear()
+            st.session_state.input = ''
+    with text_col:
+        input = st.text_input('', '', key='input')
+        if input:
+            predict(image, input)
+    update_chat_messages()
model/predictor.py
ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+from transformers import ViltProcessor
+from transformers import ViltForQuestionAnswering
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+
+import os
+import torch
+
+
+'''
+Visual Question Answering model that generates an answer statement for a
+question about an image.
+'''
+
+
+@st.experimental_singleton
+class Predictor:
+    def __init__(self):
+        auth_token = os.environ.get('TOKEN') or True
+        self.vqa_processor = ViltProcessor.from_pretrained(
+            'dandelin/vilt-b32-finetuned-vqa')
+        self.vqa_model = ViltForQuestionAnswering.from_pretrained(
+            'dandelin/vilt-b32-finetuned-vqa')
+        self.qa_model = AutoModelForSeq2SeqLM.from_pretrained(
+            'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
+        self.qa_tokenizer = AutoTokenizer.from_pretrained(
+            'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
+
+    def predict_answer_from_text(self, image, question):
+        if not question or image is None:
+            return ''
+
+        # process the question and image with the vision model
+        encoding = self.vqa_processor(image, question, return_tensors='pt')
+        with torch.no_grad():
+            outputs = self.vqa_model(**encoding)
+        short_answer = self.vqa_model.config.id2label[outputs.logits.argmax(
+            -1).item()]
+
+        # generate a full statement with the sentence generator model
+        prompt = question + '. ' + short_answer
+        input_ids = self.qa_tokenizer(prompt, return_tensors='pt').input_ids
+        with torch.no_grad():
+            output_ids = self.qa_model.generate(input_ids)
+        answers = self.qa_tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True)
+        return answers[0]
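
Predictor chains two models: ViLT maps the image and question to a short answer label, and the finetuned T5 model rewrites `'<question>. <short answer>'` into a full sentence. A minimal usage sketch (the image path is a placeholder; since Predictor is cached with `st.experimental_singleton`, it is meant to be constructed inside a running Streamlit app):

```python
# Usage sketch only; 'cat.jpg' is a placeholder path.
from PIL import Image
from model.predictor import Predictor

predictor = Predictor()  # loads ViLT + T5 once; cached by the singleton decorator
image = Image.open('cat.jpg')
print(predictor.predict_answer_from_text(image, 'What animal is this?'))
```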
requirements.txt
ADDED
@@ -0,0 +1,116 @@
+altair==4.2.0
+ansicolors==1.1.8
+ansiwrap==0.8.4
+appnope==0.1.3
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+attrs==21.4.0
+backcall==0.2.0
+beautifulsoup4==4.11.1
+bleach==5.0.0
+blinker==1.4
+bokeh==2.4.3
+cachetools==5.2.0
+certifi==2022.6.15
+cffi==1.15.0
+chainmap==1.0.3
+charset-normalizer==2.0.12
+click==8.1.3
+combomethod==1.0.12
+commonmark==0.9.1
+debugpy==1.6.0
+decorator==5.1.1
+defusedxml==0.7.1
+entrypoints==0.4
+executing==0.8.3
+fastjsonschema==2.15.3
+filelock==3.7.1
+gitdb==4.0.9
+GitPython==3.1.27
+huggingface-hub==0.7.0
+idna==3.3
+importlib-metadata==4.11.4
+ipykernel==6.15.0
+ipython==8.4.0
+ipython-genutils==0.2.0
+ipywidgets==7.7.0
+jedi==0.18.1
+Jinja2==3.1.2
+jsonschema==4.6.0
+jupyter-client==7.3.4
+jupyter-core==4.10.0
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==1.1.0
+MarkupSafe==2.1.1
+matplotlib-inline==0.1.3
+mementos==1.3.1
+mistune==0.8.4
+nbclient==0.6.4
+nbconvert==6.5.0
+nbformat==5.4.0
+nest-asyncio==1.5.5
+notebook==6.4.12
+nulltype==2.3.1
+numpy==1.22.4
+options==1.4.10
+packaging==21.3
+pandas==1.4.2
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.1.1
+prometheus-client==0.14.1
+prompt-toolkit==3.0.29
+protobuf==3.20.1
+psutil==5.9.1
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==8.0.0
+pycparser==2.21
+pydeck==0.7.1
+Pygments==2.12.0
+Pympler==1.0.1
+pyparsing==3.0.9
+pyrsistent==0.18.1
+python-dateutil==2.8.2
+pytz==2022.1
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+pyzmq==23.1.0
+regex==2022.6.2
+requests==2.28.0
+rich==12.4.4
+say==1.6.6
+semver==2.13.0
+Send2Trash==1.8.0
+simplere==1.2.13
+six==1.12.0
+smmap==5.0.0
+soupsieve==2.3.2.post1
+stack-data==0.3.0
+streamlit==1.10.0
+streamlit-bokeh-events==0.1.2
+streamlit-chat==0.0.2.1
+subprocess.run==0.0.8
+terminado==0.15.0
+textwrap3==0.9.2
+tinycss2==1.1.1
+tokenizers==0.12.1
+toml==0.10.2
+toolz==0.11.2
+torch==1.11.0
+tornado==6.1
+tqdm==4.64.0
+traitlets==5.3.0
+transformers==4.20.0
+typing_extensions==4.2.0
+tzdata==2022.1
+tzlocal==4.2
+urllib3==1.26.9
+validators==0.20.0
+wcwidth==0.2.5
+webencodings==0.5.1
+widgetsnbextension==3.6.0
+zipp==3.8.0