import streamlit as st
from transformers import pipeline
from PIL import Image
# Load the Visual Question Answering (VQA) model
# (assumes the checkpoint works with the standard VQA pipeline task;
# the original "text-generation" task does not accept an image input)
vqa_model = pipeline("visual-question-answering", model="Steven-GU-Yu-Di/Visual-Question-Answering")

# Load the Text-to-Speech (TTS) model
tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")
# Create the Streamlit app
st.title("Visual Question Answering and Text-to-Speech")

# User inputs: an image and a free-text question about it
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
question_input = st.text_input("Enter Question")
# Function to perform Visual Question Answering and Text-to-Speech
def perform_vqa_and_tts(image, question):
    if image is not None and question:
        image = Image.open(image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        st.write("Question:", question)

        # Visual Question Answering: the VQA pipeline takes the image and the
        # question directly and returns a ranked list of candidate answers
        vqa_output = vqa_model(image=image, question=question)
        answer = vqa_output[0]["answer"]
        st.write("Answer:", answer)

        # Text-to-Speech: the text-to-audio pipeline returns a waveform array
        # and its sampling rate, which st.audio can play back directly
        audio_output = tts(answer)
        st.audio(audio_output["audio"], sample_rate=audio_output["sampling_rate"])
# Button to trigger Visual Question Answering and Text-to-Speech
if st.button("Perform VQA and TTS"):
    perform_vqa_and_tts(uploaded_image, question_input)
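
To narrow down the Space's runtime error outside of Streamlit, the two pipelines can be exercised on their own. The snippet below is a minimal sketch, assuming both checkpoints load under the standard visual-question-answering and text-to-audio pipeline tasks; "sample.jpg" is a placeholder image path, not a file shipped with the Space.

from transformers import pipeline
from PIL import Image

# Hypothetical standalone check of the two models used by the app
vqa = pipeline("visual-question-answering", model="Steven-GU-Yu-Di/Visual-Question-Answering")
tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")

image = Image.open("sample.jpg")  # placeholder test image
vqa_out = vqa(image=image, question="What is shown in the picture?")
print(vqa_out)  # expected: a list of {"answer": ..., "score": ...} dicts

speech = tts(vqa_out[0]["answer"])
print(speech["sampling_rate"], speech["audio"].shape)  # waveform array plus its sampling rate

If both calls succeed, the app itself can be started with streamlit run app.py.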