Spaces:
Runtime error
Runtime error
Use spinner during long-running operations
Browse files
Add basic check for English words in the question.
- app.py +5 -4
- audiobot.py +68 -45
- chatbot.py +5 -1
- model/predictor.py +12 -3
app.py
CHANGED
@@ -39,9 +39,10 @@ def run():
|
|
39 |
'[Github](https://github.com/msak1612/vqa_chatbot)'
|
40 |
)
|
41 |
|
42 |
-
if '
|
43 |
-
thread = threading.Thread(target=runInThread)
|
44 |
-
add_script_run_ctx(thread)
|
45 |
-
thread.start()
|
|
|
46 |
|
47 |
run()
|
|
|
39 |
'[Github](https://github.com/msak1612/vqa_chatbot)'
|
40 |
)
|
41 |
|
42 |
+
if 'thread' not in st.session_state:
|
43 |
+
st.session_state.thread = threading.Thread(target=runInThread)
|
44 |
+
add_script_run_ctx(st.session_state.thread)
|
45 |
+
st.session_state.thread.start()
|
46 |
+
|
47 |
|
48 |
run()
|
audiobot.py
CHANGED
@@ -1,11 +1,14 @@
|
|
1 |
import streamlit as st
|
2 |
from PIL import Image
|
3 |
-
|
4 |
from bokeh.models.widgets import Button
|
5 |
from bokeh.models import CustomJS
|
6 |
from streamlit_bokeh_events import streamlit_bokeh_events
|
|
|
|
|
7 |
|
8 |
def show():
|
|
|
|
|
9 |
st.title('Visual Question Answering - Audiobot')
|
10 |
st.markdown('''
|
11 |
<h4 style='text-align: center; color: #B2BEB5;'>
|
@@ -13,56 +16,76 @@ def show():
|
|
13 |
Please upload image and fire away!
|
14 |
</i></h4>
|
15 |
''', unsafe_allow_html=True)
|
16 |
-
upload_pic = st.file_uploader('Choose an image...', type=[
|
17 |
-
'jpg', 'png', 'jpeg'], accept_multiple_files=False)
|
18 |
-
if upload_pic is not None:
|
19 |
-
st.session_state.image = Image.open(upload_pic)
|
20 |
-
st.image(upload_pic, use_column_width='auto')
|
21 |
-
else:
|
22 |
-
st.session_state.image = None
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
recognition.interimResults = false;
|
32 |
|
33 |
-
|
34 |
-
var
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
}
|
39 |
}
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
|
55 |
-
if result:
|
56 |
-
if 'GET_TEXT' in result:
|
57 |
-
answer = st.session_state.predictor.predict_answer_from_text(
|
58 |
-
st.session_state.image, result.get('GET_TEXT'))
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
1 |
import streamlit as st
|
2 |
from PIL import Image
|
|
|
3 |
from bokeh.models.widgets import Button
|
4 |
from bokeh.models import CustomJS
|
5 |
from streamlit_bokeh_events import streamlit_bokeh_events
|
6 |
+
from bokeh.models.widgets.buttons import Button
|
7 |
+
import time
|
8 |
|
9 |
def show():
|
10 |
+
st.session_state.audio_answer = ''
|
11 |
+
|
12 |
st.title('Visual Question Answering - Audiobot')
|
13 |
st.markdown('''
|
14 |
<h4 style='text-align: center; color: #B2BEB5;'>
|
|
|
16 |
Please upload image and fire away!
|
17 |
</i></h4>
|
18 |
''', unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
+
weights = [5,2]
|
21 |
+
image_col, audio_col = st.columns(weights)
|
22 |
+
with image_col:
|
23 |
+
upload_pic = st.file_uploader('Choose an image...', type=[
|
24 |
+
'jpg', 'png', 'jpeg'], accept_multiple_files=False)
|
25 |
+
if upload_pic is not None:
|
26 |
+
st.session_state.image = Image.open(upload_pic)
|
27 |
+
st.image(upload_pic, use_column_width='auto')
|
28 |
+
else:
|
29 |
+
st.session_state.image = None
|
30 |
+
|
31 |
+
with audio_col:
|
32 |
+
welcome_text='Hello and welcome, I have been trained as visual question answering model. You are welcome to look at any image and ask me any questions about it. I will do my best to provide the most accurate information possible based on my expertise. Select an image of interest by pressing the browse files button. Now use the Ask question button to ask a question. Please feel free to ask me any questions about this image. Now, to get my answer, press the Get answer button.'
|
33 |
+
welcome_button = Button(label='About Me')
|
34 |
+
welcome_button.js_on_event('button_click', CustomJS(code=f'''
|
35 |
+
var u = new SpeechSynthesisUtterance();
|
36 |
+
u.text = '{welcome_text}';
|
37 |
+
u.lang = 'en-US';
|
38 |
+
|
39 |
+
speechSynthesis.speak(u);
|
40 |
+
'''))
|
41 |
+
st.bokeh_chart(welcome_button)
|
42 |
|
43 |
+
# Speech recognition in Streamlit, based on
|
44 |
+
# https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
|
45 |
+
stt_button = Button(label='Ask Question')
|
|
|
46 |
|
47 |
+
stt_button.js_on_event('button_click', CustomJS(code="""
|
48 |
+
var recognition = new webkitSpeechRecognition();
|
49 |
+
recognition.continuous = false;
|
50 |
+
recognition.interimResults = false;
|
51 |
+
|
52 |
+
recognition.onresult = function (e) {
|
53 |
+
var value = "";
|
54 |
+
for (var i = e.resultIndex; i < e.results.length; ++i) {
|
55 |
+
if (e.results[i].isFinal) {
|
56 |
+
value += e.results[i][0].transcript;
|
57 |
+
}
|
58 |
+
}
|
59 |
+
if ( value != '' ) {
|
60 |
+
document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
|
61 |
}
|
62 |
}
|
63 |
+
recognition.start();
|
64 |
+
"""))
|
65 |
+
|
66 |
+
result = streamlit_bokeh_events(
|
67 |
+
stt_button,
|
68 |
+
events='GET_TEXT',
|
69 |
+
key='stt_listen',
|
70 |
+
refresh_on_update=False,
|
71 |
+
override_height=40,
|
72 |
+
debounce_time=0)
|
73 |
|
74 |
+
if result:
|
75 |
+
if 'GET_TEXT' in result:
|
76 |
+
with st.spinner('Preparing answer...'):
|
77 |
+
while 'predictor' not in st.session_state:
|
78 |
+
time.sleep(2)
|
79 |
+
st.session_state.audio_answer = st.session_state.predictor.predict_answer_from_text(
|
80 |
+
st.session_state.image, result.get('GET_TEXT'))
|
81 |
|
|
|
|
|
|
|
|
|
82 |
|
83 |
+
tts_button = Button(label='Get Answer')
|
84 |
+
tts_button.js_on_event('button_click', CustomJS(code=f"""
|
85 |
+
var u = new SpeechSynthesisUtterance();
|
86 |
+
u.text = '{st.session_state.audio_answer}';
|
87 |
+
u.lang = 'en-US';
|
88 |
|
89 |
+
speechSynthesis.speak(u);
|
90 |
+
"""))
|
91 |
+
st.bokeh_chart(tts_button)
|
chatbot.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import streamlit as st
|
2 |
from streamlit_chat import message
|
3 |
from PIL import Image
|
4 |
-
|
5 |
|
6 |
def init_chat_history():
|
7 |
if 'question' not in st.session_state:
|
@@ -23,6 +23,10 @@ def predict(image, input):
|
|
23 |
if image is None or not input:
|
24 |
return
|
25 |
|
|
|
|
|
|
|
|
|
26 |
answer = st.session_state.predictor.predict_answer_from_text(image, input)
|
27 |
st.session_state.question.append(input)
|
28 |
st.session_state.answer.append(answer)
|
|
|
1 |
import streamlit as st
|
2 |
from streamlit_chat import message
|
3 |
from PIL import Image
|
4 |
+
import time
|
5 |
|
6 |
def init_chat_history():
|
7 |
if 'question' not in st.session_state:
|
|
|
23 |
if image is None or not input:
|
24 |
return
|
25 |
|
26 |
+
with st.spinner('Preparing answer...'):
|
27 |
+
while 'predictor' not in st.session_state:
|
28 |
+
time.sleep(2)
|
29 |
+
|
30 |
answer = st.session_state.predictor.predict_answer_from_text(image, input)
|
31 |
st.session_state.question.append(input)
|
32 |
st.session_state.answer.append(answer)
|
model/predictor.py
CHANGED
@@ -5,6 +5,8 @@ from transformers import AutoTokenizer
|
|
5 |
from transformers import AutoModelForSeq2SeqLM
|
6 |
|
7 |
import os
|
|
|
|
|
8 |
import torch
|
9 |
|
10 |
|
@@ -27,9 +29,15 @@ class Predictor:
|
|
27 |
self.qa_tokenizer = AutoTokenizer.from_pretrained(
|
28 |
'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
# process question using image model
|
35 |
encoding = self.vqa_processor(image, question, return_tensors='pt')
|
@@ -45,4 +53,5 @@ class Predictor:
|
|
45 |
output_ids = self.qa_model.generate(input_ids)
|
46 |
answers = self.qa_tokenizer.batch_decode(
|
47 |
output_ids, skip_special_tokens=True)
|
|
|
48 |
return answers[0]
|
|
|
5 |
from transformers import AutoModelForSeq2SeqLM
|
6 |
|
7 |
import os
|
8 |
+
import re
|
9 |
+
import string
|
10 |
import torch
|
11 |
|
12 |
|
|
|
29 |
self.qa_tokenizer = AutoTokenizer.from_pretrained(
|
30 |
'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
|
31 |
|
32 |
+
|
33 |
+
def predict_answer_from_text(self, image, input):
|
34 |
+
if image is None:
|
35 |
+
return 'Please select an image...'
|
36 |
+
|
37 |
+
chars = re.escape(string.punctuation)
|
38 |
+
question = re.sub(r'['+chars+']', '', input)
|
39 |
+
if not question or len(question.split()) < 3:
|
40 |
+
return 'I cannot understand, please ask a valid question...'
|
41 |
|
42 |
# process question using image model
|
43 |
encoding = self.vqa_processor(image, question, return_tensors='pt')
|
|
|
53 |
output_ids = self.qa_model.generate(input_ids)
|
54 |
answers = self.qa_tokenizer.batch_decode(
|
55 |
output_ids, skip_special_tokens=True)
|
56 |
+
|
57 |
return answers[0]
|