Madhuri committed on
Commit
3089ae4
1 Parent(s): a7d489b

Use spinner during long time operations


Add a basic check for English words in the question.

Files changed (4)
  1. app.py +5 -4
  2. audiobot.py +68 -45
  3. chatbot.py +5 -1
  4. model/predictor.py +12 -3
app.py CHANGED
@@ -39,9 +39,10 @@ def run():
         '[Github](https://github.com/msak1612/vqa_chatbot)'
     )

-    if 'predictor' not in st.session_state:
-        thread = threading.Thread(target=runInThread)
-        add_script_run_ctx(thread)
-        thread.start()
+    if 'thread' not in st.session_state:
+        st.session_state.thread = threading.Thread(target=runInThread)
+        add_script_run_ctx(st.session_state.thread)
+        st.session_state.thread.start()
+

 run()
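For context, app.py now keys the background loader on a 'thread' session key, so each session spawns the loader exactly once, even while the model is still loading. Below is a minimal sketch of the pattern, assuming runInThread builds the Predictor from model/predictor.py and publishes it to session state (the add_script_run_ctx import path varies across Streamlit versions):

    import threading
    import streamlit as st
    from streamlit.runtime.scriptrunner import add_script_run_ctx  # location differs in older Streamlit releases

    def runInThread():
        # Assumed body: load the model once and publish it for the
        # chatbot/audiobot pages, which poll st.session_state.predictor.
        st.session_state.predictor = Predictor()  # Predictor comes from model/predictor.py

    if 'thread' not in st.session_state:
        st.session_state.thread = threading.Thread(target=runInThread)
        add_script_run_ctx(st.session_state.thread)  # attach this session's script context to the worker
        st.session_state.thread.start()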
audiobot.py CHANGED
@@ -1,11 +1,14 @@
 import streamlit as st
 from PIL import Image
-
 from bokeh.models.widgets import Button
 from bokeh.models import CustomJS
 from streamlit_bokeh_events import streamlit_bokeh_events
+from bokeh.models.widgets.buttons import Button
+import time

 def show():
+    st.session_state.audio_answer = ''
+
     st.title('Visual Question Answering - Audiobot')
     st.markdown('''
     <h4 style='text-align: center; color: #B2BEB5;'>
@@ -13,56 +16,76 @@ def show():
     Please upload image and fire away!
     </i></h4>
     ''', unsafe_allow_html=True)
-    upload_pic = st.file_uploader('Choose an image...', type=[
-        'jpg', 'png', 'jpeg'], accept_multiple_files=False)
-    if upload_pic is not None:
-        st.session_state.image = Image.open(upload_pic)
-        st.image(upload_pic, use_column_width='auto')
-    else:
-        st.session_state.image = None
-
-    # Speech recognition based in streamlit based on
-    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
-    stt_button = Button(label='Ask Question', width=100)
-
-    stt_button.js_on_event('button_click', CustomJS(code='''
-        var recognition = new webkitSpeechRecognition();
-        recognition.continuous = false;
-        recognition.interimResults = false;
-
-        recognition.onresult = function (e) {
-            var value = '';
-            for (var i = e.resultIndex; i < e.results.length; ++i) {
-                if (e.results[i].isFinal) {
-                    value += e.results[i][0].transcript;
-                }
-            }
-            if ( value != '' ) {
-                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
-            }
-        }
-        recognition.start();
-    '''))
-
-    result = streamlit_bokeh_events(
-        stt_button,
-        events='GET_TEXT',
-        key='stt_listen',
-        refresh_on_update=False,
-        override_height=75,
-        debounce_time=0)
-
-    if result:
-        if 'GET_TEXT' in result:
-            answer = st.session_state.predictor.predict_answer_from_text(
-                st.session_state.image, result.get('GET_TEXT'))
-
-            tts_button = Button(label="Get Answer", width=100)
-            tts_button.js_on_event("button_click", CustomJS(code=f"""
-                var u = new SpeechSynthesisUtterance();
-                u.text = "{answer}";
-                u.lang = 'en-US';
-
-                speechSynthesis.speak(u);
-            """))
-            st.bokeh_chart(tts_button)
+
+    weights = [5,2]
+    image_col, audio_col = st.columns(weights)
+    with image_col:
+        upload_pic = st.file_uploader('Choose an image...', type=[
+            'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+        if upload_pic is not None:
+            st.session_state.image = Image.open(upload_pic)
+            st.image(upload_pic, use_column_width='auto')
+        else:
+            st.session_state.image = None
+
+    with audio_col:
+        welcome_text='Hello and welcome, I have been trained as visual question answering model. You are welcome to look at any image and ask me any questions about it. I will do my best to provide the most accurate information possible based on my expertise. Select an image of interest by pressing the browse files button. Now use the Ask question button to ask a question. Please feel free to ask me any questions about this image. Now, to get my answer, press the Get answer button.'
+        welcome_button = Button(label='About Me')
+        welcome_button.js_on_event('button_click', CustomJS(code=f'''
+            var u = new SpeechSynthesisUtterance();
+            u.text = '{welcome_text}';
+            u.lang = 'en-US';
+
+            speechSynthesis.speak(u);
+        '''))
+        st.bokeh_chart(welcome_button)
+
+    # Speech recognition based in streamlit based on
+    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
+    stt_button = Button(label='Ask Question')
+
+    stt_button.js_on_event('button_click', CustomJS(code="""
+        var recognition = new webkitSpeechRecognition();
+        recognition.continuous = false;
+        recognition.interimResults = false;
+
+        recognition.onresult = function (e) {
+            var value = "";
+            for (var i = e.resultIndex; i < e.results.length; ++i) {
+                if (e.results[i].isFinal) {
+                    value += e.results[i][0].transcript;
+                }
+            }
+            if ( value != '' ) {
+                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
+            }
+        }
+        recognition.start();
+        """))
+
+    result = streamlit_bokeh_events(
+        stt_button,
+        events='GET_TEXT',
+        key='stt_listen',
+        refresh_on_update=False,
+        override_height=40,
+        debounce_time=0)
+
+    if result:
+        if 'GET_TEXT' in result:
+            with st.spinner('Preparing answer...'):
+                while 'predictor' not in st.session_state:
+                    time.sleep(2)
+                st.session_state.audio_answer = st.session_state.predictor.predict_answer_from_text(
+                    st.session_state.image, result.get('GET_TEXT'))
+
+
+    tts_button = Button(label='Get Answer')
+    tts_button.js_on_event('button_click', CustomJS(code=f"""
+        var u = new SpeechSynthesisUtterance();
+        u.text = '{st.session_state.audio_answer}';
+        u.lang = 'en-US';
+
+        speechSynthesis.speak(u);
+        """))
+    st.bokeh_chart(tts_button)
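One caveat in the new audiobot code: the answer is spliced into the CustomJS source through an f-string, so an answer containing a quote character would produce broken JavaScript. A defensive variant (a sketch, not part of this commit) serializes the text first:

    import json
    from bokeh.models import CustomJS

    answer_js = json.dumps(st.session_state.audio_answer)  # emits a safely quoted JS string literal
    tts_button.js_on_event('button_click', CustomJS(code=f"""
        var u = new SpeechSynthesisUtterance();
        u.text = {answer_js};
        u.lang = 'en-US';
        speechSynthesis.speak(u);
    """))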
chatbot.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 from streamlit_chat import message
 from PIL import Image
-
+import time

 def init_chat_history():
     if 'question' not in st.session_state:
@@ -23,6 +23,10 @@ def predict(image, input):
     if image is None or not input:
         return

+    with st.spinner('Preparing answer...'):
+        while 'predictor' not in st.session_state:
+            time.sleep(2)
+
     answer = st.session_state.predictor.predict_answer_from_text(image, input)
     st.session_state.question.append(input)
     st.session_state.answer.append(answer)
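The spinner-plus-poll idiom now appears in both chatbot.py and audiobot.py; isolated as a helper it reads as follows (a sketch, not part of this commit):

    import time
    import streamlit as st

    def wait_for_predictor(poll_seconds=2):
        # Show a spinner until the background loader has published the
        # model under st.session_state.predictor, checking periodically.
        with st.spinner('Preparing answer...'):
            while 'predictor' not in st.session_state:
                time.sleep(poll_seconds)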
model/predictor.py CHANGED
@@ -5,6 +5,8 @@ from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM

 import os
+import re
+import string
 import torch

@@ -27,9 +29,15 @@ class Predictor:
         self.qa_tokenizer = AutoTokenizer.from_pretrained(
             'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)

-    def predict_answer_from_text(self, image, question):
-        if not question or image is None:
-            return ''
+
+    def predict_answer_from_text(self, image, input):
+        if image is None:
+            return 'Please select an image...'
+
+        chars = re.escape(string.punctuation)
+        question = re.sub(r'['+chars+']', '', input)
+        if not question or len(question.split()) < 3:
+            return 'I cannot understand, please ask a valid question...'

         # process question using image model
         encoding = self.vqa_processor(image, question, return_tensors='pt')
@@ -45,4 +53,5 @@
         output_ids = self.qa_model.generate(input_ids)
         answers = self.qa_tokenizer.batch_decode(
             output_ids, skip_special_tokens=True)
+
         return answers[0]
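Note that the "English words" check is a simple heuristic rather than real language detection: punctuation is stripped and at least three words must remain before the model is called. The same logic as a standalone function, with sample inputs:

    import re
    import string

    def is_valid_question(text):
        # Strip punctuation, then require at least three remaining words.
        chars = re.escape(string.punctuation)
        cleaned = re.sub(r'[' + chars + ']', '', text)
        return bool(cleaned) and len(cleaned.split()) >= 3

    print(is_valid_question('?!'))                     # False: nothing left after stripping
    print(is_valid_question('hello'))                  # False: fewer than three words
    print(is_valid_question('what is the dog doing'))  # True: the model gets called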