Madhuri committed on
Commit
3089ae4
1 Parent(s): a7d489b

Use spinner during long time operations


Add a basic check for English words in the question.

Files changed (4)
  1. app.py +5 -4
  2. audiobot.py +68 -45
  3. chatbot.py +5 -1
  4. model/predictor.py +12 -3
app.py CHANGED
@@ -39,9 +39,10 @@ def run():
         '[Github](https://github.com/msak1612/vqa_chatbot)'
     )

-    if 'predictor' not in st.session_state:
-        thread = threading.Thread(target=runInThread)
-        add_script_run_ctx(thread)
-        thread.start()
+    if 'thread' not in st.session_state:
+        st.session_state.thread = threading.Thread(target=runInThread)
+        add_script_run_ctx(st.session_state.thread)
+        st.session_state.thread.start()
+

 run()
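For context, app.py now keys the background loader on a 'thread' session key, so each session spawns the loader exactly once, even while the model is still loading. Below is a minimal sketch of the pattern, assuming runInThread builds the Predictor from model/predictor.py and publishes it to session state (the add_script_run_ctx import path varies across Streamlit versions):

    import threading
    import streamlit as st
    from streamlit.runtime.scriptrunner import add_script_run_ctx  # location differs in older Streamlit releases

    def runInThread():
        # Assumed body: load the model once and publish it for the
        # chatbot/audiobot pages, which poll st.session_state.predictor.
        st.session_state.predictor = Predictor()  # Predictor comes from model/predictor.py

    if 'thread' not in st.session_state:
        st.session_state.thread = threading.Thread(target=runInThread)
        add_script_run_ctx(st.session_state.thread)  # attach this session's script context to the worker
        st.session_state.thread.start()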
audiobot.py CHANGED
@@ -1,11 +1,14 @@
 import streamlit as st
 from PIL import Image
-
 from bokeh.models.widgets import Button
 from bokeh.models import CustomJS
 from streamlit_bokeh_events import streamlit_bokeh_events
+from bokeh.models.widgets.buttons import Button
+import time

 def show():
+    st.session_state.audio_answer = ''
+
     st.title('Visual Question Answering - Audiobot')
     st.markdown('''
     <h4 style='text-align: center; color: #B2BEB5;'>
@@ -13,56 +16,76 @@ def show():
     Please upload image and fire away!
     </i></h4>
     ''', unsafe_allow_html=True)
-    upload_pic = st.file_uploader('Choose an image...', type=[
-        'jpg', 'png', 'jpeg'], accept_multiple_files=False)
-    if upload_pic is not None:
-        st.session_state.image = Image.open(upload_pic)
-        st.image(upload_pic, use_column_width='auto')
-    else:
-        st.session_state.image = None
-
-    # Speech recognition based in streamlit based on
-    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
-    stt_button = Button(label='Ask Question', width=100)
-
-    stt_button.js_on_event('button_click', CustomJS(code='''
-        var recognition = new webkitSpeechRecognition();
-        recognition.continuous = false;
-        recognition.interimResults = false;
-
-        recognition.onresult = function (e) {
-            var value = '';
-            for (var i = e.resultIndex; i < e.results.length; ++i) {
-                if (e.results[i].isFinal) {
-                    value += e.results[i][0].transcript;
-                }
-            }
-            if ( value != '' ) {
-                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
-            }
-        }
-        recognition.start();
-    '''))
-
-    result = streamlit_bokeh_events(
-        stt_button,
-        events='GET_TEXT',
-        key='stt_listen',
-        refresh_on_update=False,
-        override_height=75,
-        debounce_time=0)
-
-    if result:
-        if 'GET_TEXT' in result:
-            answer = st.session_state.predictor.predict_answer_from_text(
-                st.session_state.image, result.get('GET_TEXT'))
-
-            tts_button = Button(label="Get Answer", width=100)
-            tts_button.js_on_event("button_click", CustomJS(code=f"""
-                var u = new SpeechSynthesisUtterance();
-                u.text = "{answer}";
-                u.lang = 'en-US';
-
-                speechSynthesis.speak(u);
-            """))
-            st.bokeh_chart(tts_button)
+
+    weights = [5,2]
+    image_col, audio_col = st.columns(weights)
+    with image_col:
+        upload_pic = st.file_uploader('Choose an image...', type=[
+            'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+        if upload_pic is not None:
+            st.session_state.image = Image.open(upload_pic)
+            st.image(upload_pic, use_column_width='auto')
+        else:
+            st.session_state.image = None
+
+    with audio_col:
+        welcome_text='Hello and welcome, I have been trained as visual question answering model. You are welcome to look at any image and ask me any questions about it. I will do my best to provide the most accurate information possible based on my expertise. Select an image of interest by pressing the browse files button. Now use the Ask question button to ask a question. Please feel free to ask me any questions about this image. Now, to get my answer, press the Get answer button.'
+        welcome_button = Button(label='About Me')
+        welcome_button.js_on_event('button_click', CustomJS(code=f'''
+            var u = new SpeechSynthesisUtterance();
+            u.text = '{welcome_text}';
+            u.lang = 'en-US';
+
+            speechSynthesis.speak(u);
+        '''))
+        st.bokeh_chart(welcome_button)
+
+    # Speech recognition based in streamlit based on
+    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
+    stt_button = Button(label='Ask Question')
+
+    stt_button.js_on_event('button_click', CustomJS(code="""
+        var recognition = new webkitSpeechRecognition();
+        recognition.continuous = false;
+        recognition.interimResults = false;
+
+        recognition.onresult = function (e) {
+            var value = "";
+            for (var i = e.resultIndex; i < e.results.length; ++i) {
+                if (e.results[i].isFinal) {
+                    value += e.results[i][0].transcript;
+                }
+            }
+            if ( value != '' ) {
+                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
+            }
+        }
+        recognition.start();
+        """))
+
+    result = streamlit_bokeh_events(
+        stt_button,
+        events='GET_TEXT',
+        key='stt_listen',
+        refresh_on_update=False,
+        override_height=40,
+        debounce_time=0)
+
+    if result:
+        if 'GET_TEXT' in result:
+            with st.spinner('Preparing answer...'):
+                while 'predictor' not in st.session_state:
+                    time.sleep(2)
+                st.session_state.audio_answer = st.session_state.predictor.predict_answer_from_text(
+                    st.session_state.image, result.get('GET_TEXT'))
+
+
+    tts_button = Button(label='Get Answer')
+    tts_button.js_on_event('button_click', CustomJS(code=f"""
+        var u = new SpeechSynthesisUtterance();
+        u.text = '{st.session_state.audio_answer}';
+        u.lang = 'en-US';
+
+        speechSynthesis.speak(u);
+        """))
+    st.bokeh_chart(tts_button)
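One caveat in the new audiobot code: the answer is spliced into the CustomJS source through an f-string, so an answer containing a quote character would produce broken JavaScript. A defensive variant (a sketch, not part of this commit) serializes the text first:

    import json
    from bokeh.models import CustomJS

    answer_js = json.dumps(st.session_state.audio_answer)  # emits a safely quoted JS string literal
    tts_button.js_on_event('button_click', CustomJS(code=f"""
        var u = new SpeechSynthesisUtterance();
        u.text = {answer_js};
        u.lang = 'en-US';
        speechSynthesis.speak(u);
    """))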
chatbot.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 from streamlit_chat import message
 from PIL import Image
-
+import time

 def init_chat_history():
     if 'question' not in st.session_state:
@@ -23,6 +23,10 @@ def predict(image, input):
     if image is None or not input:
         return

+    with st.spinner('Preparing answer...'):
+        while 'predictor' not in st.session_state:
+            time.sleep(2)
+
     answer = st.session_state.predictor.predict_answer_from_text(image, input)
     st.session_state.question.append(input)
     st.session_state.answer.append(answer)
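The spinner-plus-poll idiom now appears in both chatbot.py and audiobot.py; isolated as a helper it reads as follows (a sketch, not part of this commit):

    import time
    import streamlit as st

    def wait_for_predictor(poll_seconds=2):
        # Show a spinner until the background loader has published the
        # model under st.session_state.predictor, checking periodically.
        with st.spinner('Preparing answer...'):
            while 'predictor' not in st.session_state:
                time.sleep(poll_seconds)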
model/predictor.py CHANGED
@@ -5,6 +5,8 @@ from transformers import AutoTokenizer
 from transformers import AutoModelForSeq2SeqLM

 import os
+import re
+import string
 import torch

@@ -27,9 +29,15 @@ class Predictor:
         self.qa_tokenizer = AutoTokenizer.from_pretrained(
             'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)

-    def predict_answer_from_text(self, image, question):
-        if not question or image is None:
-            return ''
+
+    def predict_answer_from_text(self, image, input):
+        if image is None:
+            return 'Please select an image...'
+
+        chars = re.escape(string.punctuation)
+        question = re.sub(r'['+chars+']', '', input)
+        if not question or len(question.split()) < 3:
+            return 'I cannot understand, please ask a valid question...'

         # process question using image model
         encoding = self.vqa_processor(image, question, return_tensors='pt')
@@ -45,4 +53,5 @@
         output_ids = self.qa_model.generate(input_ids)
         answers = self.qa_tokenizer.batch_decode(
             output_ids, skip_special_tokens=True)
+
         return answers[0]
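Note that the "English words" check is a simple heuristic rather than real language detection: punctuation is stripped and at least three words must remain before the model is called. The same logic as a standalone function, with sample inputs:

    import re
    import string

    def is_valid_question(text):
        # Strip punctuation, then require at least three remaining words.
        chars = re.escape(string.punctuation)
        cleaned = re.sub(r'[' + chars + ']', '', text)
        return bool(cleaned) and len(cleaned.split()) >= 3

    print(is_valid_question('?!'))                     # False: nothing left after stripping
    print(is_valid_question('hello'))                  # False: fewer than three words
    print(is_valid_question('what is the dog doing'))  # True: the model gets called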