# Steven-GU-Yu-Di's picture
# Update app.py
# 16ac1d6 verified
import importlib
import importlib.util
import os
import subprocess
import sys

# Best-effort runtime install of the model dependencies.
#
# Streamlit re-executes this script on every user interaction, so only
# invoke pip for packages that cannot be found; otherwise every rerun would
# pay for two pip invocations. pip is run through the current interpreter
# (``sys.executable -m pip``) so the packages land in the environment that
# will actually import them, rather than whatever ``pip`` is first on PATH.
for _package in ("torch", "transformers"):
    if importlib.util.find_spec(_package) is None:
        # check=False keeps the original behaviour: an install failure is
        # not fatal here; the subsequent import will surface the problem.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", _package],
            check=False,
        )
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
from PIL import Image
import io
import streamlit as st
from transformers import pipeline
# Streamlit re-runs the whole script on every widget interaction. Building
# the pipelines inside cached loaders means the models are constructed once
# per server process instead of on every rerun.
@st.cache_resource
def _load_vqa_pipeline():
    """Build the visual-question-answering pipeline (cached across reruns)."""
    return pipeline("visual-question-answering", model="microsoft/git-base-vqav2")


@st.cache_resource
def _load_tts_pipeline():
    """Build the text-to-speech pipeline (cached across reruns)."""
    return pipeline("text-to-speech", "suno/bark")


# Module-level handles used by main(); names are unchanged.
vqa_pipeline = _load_vqa_pipeline()
tts_pipeline = _load_tts_pipeline()
def main():
    """Render the visual-question-answering and text-to-audio page.

    The user uploads a JPG/PNG image and types a question. Once both are
    provided, the module-level ``vqa_pipeline`` is called with the image and
    question, and the ``answer`` field of its first result is displayed.
    Pressing the "Convert Answer to Audio" button passes that answer to the
    module-level ``tts_pipeline`` and plays the result in the page.
    """
    st.title("Visual Question Answering & Text-to-Audio App")
    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    # Nothing to do until both an image and a question are available.
    if not (uploaded_file and question):
        return

    image = Image.open(io.BytesIO(uploaded_file.getvalue()))
    vqa_result = vqa_pipeline({"image": image, "question": question})
    answer = vqa_result[0]["answer"]
    st.write(f"Answer: {answer}")

    if st.button("Convert Answer to Audio"):
        tts_result = tts_pipeline(answer)
        # The pipeline output holds a raw waveform array, not an encoded
        # file. Streamlit needs the sample rate to encode an array, and it
        # encodes arrays as WAV, so pass the rate reported by the pipeline
        # and label the format accordingly.
        st.audio(
            tts_result["audio"],
            format="audio/wav",
            sample_rate=tts_result["sampling_rate"],
        )
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()