import streamlit as st
from transformers import pipeline
from PIL import Image


# Cache the pipelines so they are loaded once, not on every Streamlit rerun.
@st.cache_resource
def load_models():
    # Visual Question Answering (VQA): takes an image plus a question and
    # returns candidate answers. The "text-generation" task used originally
    # does not accept images; "visual-question-answering" is the transformers
    # task that matches this call pattern.
    vqa_model = pipeline(
        "visual-question-answering",
        model="Steven-GU-Yu-Di/Visual-Question-Answering",
    )
    # Text-to-Speech (TTS): takes text and returns a waveform plus its
    # sampling rate.
    tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")
    return vqa_model, tts


vqa_model, tts = load_models()

# Create a Streamlit app
st.title("Visual Question Answering and Text-to-Speech")

# User inputs
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
question_input = st.text_input("Enter Question")


# Function to perform Visual Question Answering and Text-to-Speech
def perform_vqa_and_tts(image_file, question):
    """Answer a question about the uploaded image, then speak the answer."""
    image = Image.open(image_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)
    st.write("Question:", question)

    # Visual Question Answering: the pipeline takes the image and question
    # directly; there is no text "context" argument. It returns a list of
    # {"answer": ..., "score": ...} dicts, best answer first.
    vqa_output = vqa_model(image=image, question=question)
    answer = vqa_output[0]["answer"]
    st.write("Answer:", answer)

    # Text-to-Speech: a text-to-audio pipeline returns a dict with the raw
    # waveform under "audio" and its "sampling_rate". squeeze() flattens a
    # possible (1, n_samples) channel dimension, and st.audio needs the
    # sample rate when given a raw NumPy array.
    audio_output = tts(answer)
    st.audio(
        audio_output["audio"].squeeze(),
        sample_rate=audio_output["sampling_rate"],
    )


# Button to trigger Visual Question Answering and Text-to-Speech
if st.button("Perform VQA and TTS"):
    if uploaded_image is not None and question_input:
        perform_vqa_and_tts(uploaded_image, question_input)
    else:
        st.warning("Please upload an image and enter a question first.")
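
# ---------------------------------------------------------------------------
# Optional sanity check outside Streamlit. This is a minimal sketch, not part
# of the app: "test.jpg" is a hypothetical local image path, and it assumes
# the two model repos above actually load under these pipeline tasks. It is
# kept commented out because Streamlit re-executes this whole file on every
# rerun.
#
#   from transformers import pipeline
#   from PIL import Image
#
#   vqa = pipeline("visual-question-answering",
#                  model="Steven-GU-Yu-Di/Visual-Question-Answering")
#   print(vqa(image=Image.open("test.jpg"), question="What is in the image?"))
#
# Assuming this file is saved as app.py, launch the app with:
#   streamlit run app.py
# ---------------------------------------------------------------------------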