Spaces:

Madhuri
/

vqa_audiobot

Runtime error

App Files Files Community

vqa_audiobot / audiobot.py

Madhuri

Add warp for deployment in huggingface

8009ea0 almost 2 years ago

raw history blame

No virus

5.81 kB

	from turtle import width
	import streamlit as st
	from PIL import Image
	from bokeh.models.widgets import Button
	from bokeh.models import CustomJS
	from st_clickable_images import clickable_images
	from streamlit_bokeh_events import streamlit_bokeh_events
	from bokeh.models.widgets.buttons import Button
	import time
	from os.path import *
	from os import listdir
	import base64

	def update_gallery_images():
	    """Fill the session's sample-image gallery the first time it is needed.

	    Does nothing when ``gallery`` is already present in the Streamlit session
	    state. Otherwise it scans the ``images`` directory next to this file and,
	    for every entry whose name starts with ``image``, stores:

	    * ``st.session_state.gallery`` -- a base64 ``data:`` URI of the file, and
	    * ``st.session_state.gallery_images`` -- the file's path on disk,

	    at matching positions in the two lists.
	    """
	    state = st.session_state
	    if 'gallery' in state:
	        return

	    # Publish the (still empty) lists first, then grow them entry by entry.
	    state.gallery = []
	    state.gallery_images = []

	    images_dir = join(dirname(abspath(__file__)), 'images')
	    for name in listdir(images_dir):
	        if not name.startswith('image'):
	            continue
	        path = join(images_dir, name)
	        with open(path, "rb") as handle:
	            payload = base64.b64encode(handle.read()).decode()
	        state.gallery.append(f"data:image/jpeg;base64,{payload}")
	        state.gallery_images.append(path)


	def upload_image_callback():
	    """Handle a change of the file-uploader widget.

	    Copies the current value stored under the ``uploader`` key into
	    ``uploaded_image`` and clears ``input``.
	    """
	    session = st.session_state
	    session['uploaded_image'] = session['uploader']
	    session['input'] = ''

	def _speech_js(text):
	    """Return browser JavaScript that reads *text* aloud via speechSynthesis."""
	    # Local import keeps this block self-contained.
	    import json

	    # json.dumps yields a correctly quoted and escaped JavaScript string
	    # literal, so apostrophes, backslashes or newlines in *text* can neither
	    # break the script nor inject code into it.
	    return "\n".join([
	        "var u = new SpeechSynthesisUtterance();",
	        f"u.text = {json.dumps(str(text))};",
	        "u.lang = 'en-US';",
	        "",
	        "speechSynthesis.speak(u);",
	        "",
	    ])


	def show():
	    """Render the Visual Question Answering audiobot page.

	    Layout and behaviour, top to bottom:

	    * title and introduction text;
	    * a clickable gallery of sample images (built by
	      ``update_gallery_images``); a changed selection becomes the current
	      ``uploaded_image`` and clears ``input``;
	    * left column: a JPEG file uploader and a preview of the current image,
	      which is also opened with PIL and stored as ``st.session_state.image``;
	    * right column: an "About Me" button that speaks a welcome text, an
	      "Ask Question" button that runs browser speech recognition and sends
	      the transcript back as a ``GET_TEXT`` event, and a "Get Answer" button
	      that speaks ``st.session_state.audio_answer``.

	    When a new transcript arrives it is stored as ``question`` and passed,
	    with the current image, to
	    ``st.session_state.predictor.predict_answer_from_text``.
	    """
	    st.session_state.audio_answer = ''

	    st.title('Welcome to Visual Question Answering - Audiobot')
	    st.markdown('''
	<h4 style='text-align: center; color: #B2BEB5;'>
	<i>Hi, I am a Visual Audiobot, capable of answering a sequence of questions about images.
	Please upload image and fire away!
	</i></h4>
	''', unsafe_allow_html=True)

	    # Guarantee the key exists even if no gallery image can be selected below.
	    if 'uploaded_image' not in st.session_state:
	        st.session_state.uploaded_image = None

	    update_gallery_images()
	    if 'gallery' in st.session_state:
	        gallery = st.session_state.gallery
	        clicked = clickable_images(
	            gallery,
	            # One title per thumbnail, however many images the gallery holds.
	            titles=[f"Image #{i}" for i in range(len(gallery))],
	            div_style={"display": "flex",
	                       "justify-content": "center", "flex-wrap": "wrap"},
	            img_style={"margin": "5px", "height": "100px"},
	        )

	        selection_changed = ('clicked' not in st.session_state
	                             or st.session_state.clicked != clicked)
	        if selection_changed:
	            gallery_images = st.session_state.gallery_images
	            # An empty gallery has nothing to index; skip instead of raising.
	            if gallery_images:
	                st.session_state.uploaded_image = gallery_images[clicked]
	            st.session_state.clicked = clicked
	            st.session_state.input = ''

	    weights = [5, 2]
	    image_col, audio_col = st.columns(weights)
	    with image_col:
	        st.file_uploader('Select an image...', type=['jpg', 'jpeg'],
	                         accept_multiple_files=False,
	                         on_change=upload_image_callback, key='uploader')

	        if st.session_state.uploaded_image is not None:
	            st.session_state.image = Image.open(
	                st.session_state.uploaded_image)
	            st.image(st.session_state.uploaded_image,
	                     use_column_width='always')
	        else:
	            st.session_state.image = None
	            st.session_state.input = ''
	            st.session_state.audio_answer = ''

	    with audio_col:
	        welcome_text = 'Hello and Welcome. I have been trained as visual question answering model. You are welcome to look at any image and ask me any questions about it. I will do my best to provide the most accurate information possible based on my expertise. Select an image of interest by pressing the browse files button. Now use the Ask question button to ask a question. Please feel free to ask me any questions about this image. Now. to get my answer. press the Get answer button.'
	        welcome_button = Button(label='About Me', width=100)
	        welcome_button.js_on_event(
	            'button_click', CustomJS(code=_speech_js(welcome_text)))
	        st.bokeh_chart(welcome_button)

	        # Client-side speech recognition in Streamlit, based on
	        # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
	        stt_button = Button(label='Ask Question', width=100)

	        stt_button.js_on_event('button_click', CustomJS(code="""
	var recognition = new webkitSpeechRecognition();
	recognition.continuous = false;
	recognition.interimResults = false;

	recognition.onresult = function (e) {
	var value = '';
	for (var i = e.resultIndex; i < e.results.length; ++i) {
	if (e.results[i].isFinal) {
	value += e.results[i][0].transcript;
	}
	}
	if ( value != '' ) {
	document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
	}
	}
	recognition.start();
	"""))

	        result = streamlit_bokeh_events(
	            stt_button,
	            events='GET_TEXT',
	            key='stt_listen',
	            refresh_on_update=False,
	            override_height=40,
	            debounce_time=0)

	        if result and 'GET_TEXT' in result:
	            question = result.get('GET_TEXT')
	            is_new_question = ('question' not in st.session_state
	                               or st.session_state.question != question)
	            if is_new_question:
	                st.session_state['question'] = question
	                with st.spinner('Preparing answer...'):
	                    # NOTE(review): nothing in this function stores
	                    # `predictor`; this waits until something else puts it
	                    # into the session state -- confirm who does.
	                    while 'predictor' not in st.session_state:
	                        time.sleep(2)
	                    predictor = st.session_state.predictor
	                    st.session_state.audio_answer = (
	                        predictor.predict_answer_from_text(
	                            st.session_state.image, question))

	        tts_button = Button(label='Get Answer', width=100)
	        tts_button.js_on_event(
	            'button_click',
	            CustomJS(code=_speech_js(st.session_state.audio_answer)))
	        st.bokeh_chart(tts_button)