import streamlit as st
from PIL import Image

from bokeh.models.widgets import Button
from bokeh.models import CustomJS
from streamlit_bokeh_events import streamlit_bokeh_events
import subprocess


def show():
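    """Render the VQA Audiobot page: upload an image, ask a question by voice,
    and have the predicted answer read back aloud.
    """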
    st.title('Visual Question Answering - Audiobot')
    st.markdown('''
            <h4 style='text-align: center; color: #B2BEB5;'>
            <i>Hi, I am a Visual Audiobot, capable of answering a sequence of questions about images.
                Please upload an image and fire away!
            </i></h4>
            ''', unsafe_allow_html=True)
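    # Keep the uploaded image in session_state so it survives the rerun
    # triggered when the speech event comes back from the browser.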
    upload_pic = st.file_uploader('Choose an image...', type=[
                                  'jpg', 'png', 'jpeg'], accept_multiple_files=False)
    if upload_pic is not None:
        st.session_state.image = Image.open(upload_pic)
        st.image(upload_pic, use_column_width='auto')
    else:
        st.session_state.image = None

    # Client-side speech recognition in Streamlit, adapted from
    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
    stt_button = Button(label='Ask', width=100)
    stt_button.js_on_event('button_click', CustomJS(code='''
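        // Use the browser's Web Speech API to capture a single utterance
        // (webkitSpeechRecognition is available in Chromium-based browsers only).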
        var recognition = new webkitSpeechRecognition();
        recognition.continuous = false;
        recognition.interimResults = false;
    
        recognition.onresult = function (e) {
            var value = '';
            for (var i = e.resultIndex; i < e.results.length; ++i) {
                if (e.results[i].isFinal) {
                    value += e.results[i][0].transcript;
                }
            }
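            // Hand the final transcript back to Streamlit through a DOM
            // CustomEvent that streamlit_bokeh_events listens for.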
            if (value !== '') {
                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
            }
        }
        recognition.start();
        '''))

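    # Render the Bokeh button; once the browser dispatches GET_TEXT the script
    # reruns and the transcript is returned under the 'GET_TEXT' key.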
    result = streamlit_bokeh_events(
        stt_button,
        events='GET_TEXT',
        key='listen',
        refresh_on_update=False,
        override_height=75,
        debounce_time=0)

    if result and 'GET_TEXT' in result:
        if st.session_state.image is None:
            st.warning('Please upload an image before asking a question.')
        else:
            # The predictor is expected to be initialised elsewhere in the app and stored in session_state.
            answer = st.session_state.predictor.predict_answer_from_text(
                st.session_state.image, result.get('GET_TEXT'))
            # Speak the answer aloud using the macOS `say` command.
            subprocess.check_output(['say', answer])