File size: 5,028 Bytes
7a69915
 
 
 
6fda2a0
7a69915
3089ae4
 
4c71f4e
6fda2a0
 
 
4c71f4e
6fda2a0
5560825
4c71f4e
7a69915
3089ae4
 
7aa61b0
7a69915
 
 
 
 
 
 
6fda2a0
 
 
 
 
 
8009ea0
6fda2a0
 
 
 
 
 
 
 
5560825
3089ae4
 
6fda2a0
 
 
 
 
4c71f4e
 
 
 
3089ae4
 
6fda2a0
 
3089ae4
 
5560825
 
3089ae4
 
 
 
 
 
 
 
16f792f
3089ae4
 
5560825
387c1bd
3089ae4
 
 
 
 
 
5560825
3089ae4
 
 
 
 
 
 
7a69915
 
3089ae4
 
 
 
 
 
 
 
 
 
7a69915
3089ae4
 
6fda2a0
 
 
4c71f4e
 
7a69915
5560825
3089ae4
 
 
 
16f792f
3089ae4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
from PIL import Image
from bokeh.models.widgets import Button
from bokeh.models import CustomJS
from st_clickable_images import clickable_images
from streamlit_bokeh_events import streamlit_bokeh_events
from bokeh.models.widgets.buttons import Button
import time
from helper import *


def upload_image_callback():
    """React to a change of the file-uploader widget.

    Stores whatever ``upload_image_to_server()`` returns as the session's
    ``uploaded_image`` and then blanks the session's ``input`` value.
    """
    state = st.session_state
    state['uploaded_image'] = upload_image_to_server()
    state['input'] = ''


def _speech_js(text):
    """Return browser JavaScript that speaks *text* with the Web Speech API.

    The text is embedded through ``json.dumps`` so that apostrophes, double
    quotes, backslashes and newlines become a valid JavaScript string literal
    instead of terminating the literal early (which would break the script or
    let the text inject code).

    Args:
        text: The value to speak; it is converted with ``str()`` first.

    Returns:
        The JavaScript source to pass to ``CustomJS(code=...)``.
    """
    import json  # local import so this block does not depend on file-level imports

    return f'''
            var u = new SpeechSynthesisUtterance();
            u.text = {json.dumps(str(text))};
            u.lang = 'en-US';

            speechSynthesis.speak(u);
            '''


def show():
    """Render the Visual Question Answering audiobot page.

    The page consists of:

    * a title and an introduction banner;
    * a clickable gallery (when ``gallery`` is present in the session state)
      whose selection becomes the current ``uploaded_image``;
    * an image column with a JPEG file uploader and a preview of the
      current image;
    * an audio column with three Bokeh buttons: "About Me" (speaks a welcome
      text), "Ask Question" (browser speech recognition, delivered back to
      Python as a ``GET_TEXT`` event) and "Get Answer" (speaks the answer
      returned by ``request_answer``).

    Returns:
        None. All results are rendered to the page or stored in
        ``st.session_state``.
    """
    # The spoken answer is cleared on every script run; it is only refilled
    # further down when a new question arrives in this run.
    st.session_state.audio_answer = ''

    st.title('Welcome to Visual Question Answering - Audiobot')
    st.markdown('''
            <h4 style='text-align: center; color: #B2BEB5;'>
            <i>Hi, I am a Visual Audiobot, capable of answering a sequence of questions about images.
                Please upload image and fire away!
            </i></h4>
            ''', unsafe_allow_html=True)

    update_gallery_images()
    if 'gallery' in st.session_state:
        gallery = st.session_state.gallery
        clicked = clickable_images(
            gallery,
            # One title per gallery entry instead of a hard-coded count.
            titles=[f"Image #{i}" for i in range(len(gallery))],
            div_style={"display": "flex",
                       "justify-content": "center", "flex-wrap": "wrap"},
            img_style={"margin": "5px", "height": "100px"},
        )

        # NOTE(review): the component reports -1 until an image has been
        # clicked, so on the first run this selects gallery_images[-1]
        # (the last image). That looks like it doubles as the default
        # image -- confirm before changing.
        if 'clicked' not in st.session_state or st.session_state.clicked != clicked:
            st.session_state.uploaded_image = st.session_state.gallery_images[clicked]
            st.session_state.clicked = clicked
            st.session_state.input = ''

    weights = [5, 2]
    image_col, audio_col = st.columns(weights)
    with image_col:
        st.file_uploader('Select an image...', type=[
            'jpg', 'jpeg'], accept_multiple_files=False,
            on_change=upload_image_callback, key='uploader')

        # .get() avoids an AttributeError when no image has been stored yet
        # (no gallery in the session and nothing uploaded).
        current_image = st.session_state.get('uploaded_image')
        if current_image is not None:
            st.session_state.image = Image.open(current_image)
            st.image(current_image, use_column_width='always')
        else:
            st.session_state.image = None
            st.session_state.input = ''
            st.session_state.audio_answer = ''

    with audio_col:
        welcome_text = 'Hello and Welcome. I have been trained  as  visual question answering model. You are welcome to look at any image and ask me any questions about it.  I will do my best to provide the most accurate information possible based on my expertise. Select an image of interest by pressing the browse files button.  Now use the Ask question button to ask a question. Please feel free to ask me any questions about this image. Now. to get my answer. press the Get answer button.'
        welcome_button = Button(label='About Me', width=100)
        welcome_button.js_on_event(
            'button_click', CustomJS(code=_speech_js(welcome_text)))
        st.bokeh_chart(welcome_button)

        # Client-side speech recognition in Streamlit, based on
        # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
        stt_button = Button(label='Ask Question', width=100)

        stt_button.js_on_event('button_click', CustomJS(code="""
            var recognition = new webkitSpeechRecognition();
            recognition.continuous = false;
            recognition.interimResults = false;

            recognition.onresult = function (e) {
                var value = '';
                for (var i = e.resultIndex; i < e.results.length; ++i) {
                    if (e.results[i].isFinal) {
                        value += e.results[i][0].transcript;
                    }
                }
                if ( value != '' ) {
                    document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
                }
            }
            recognition.start();
            """))

        result = streamlit_bokeh_events(
            stt_button,
            events='GET_TEXT',
            key='stt_listen',
            refresh_on_update=False,
            override_height=40,
            debounce_time=0)

        if result and 'GET_TEXT' in result:
            question = result.get('GET_TEXT')
            # Only query the model when the recognised text differs from the
            # question stored by a previous run.
            if 'question' not in st.session_state or st.session_state.question != question:
                st.session_state['question'] = question
                with st.spinner('Preparing answer...'):
                    # NOTE(review): this reads `server_image_file`, which is
                    # never assigned in this function -- presumably a helper
                    # sets it; confirm it tracks gallery selections too.
                    st.session_state.audio_answer = request_answer(
                        st.session_state.server_image_file, question)

        tts_button = Button(label='Get Answer', width=100)
        # The answer comes from the model, so it must be escaped before being
        # embedded in JavaScript (see _speech_js).
        tts_button.js_on_event(
            'button_click',
            CustomJS(code=_speech_js(st.session_state.audio_answer)))
        st.bokeh_chart(tts_button)