# Streamlit app (Hugging Face Space): Visual Question Answering with Text-to-Speech
import streamlit as st
import torch
from datasets import load_dataset
from transformers import pipeline
from transformers import T5Config  # NOTE(review): unused here — confirm before removing

# Speaker x-vector used to condition SpeechT5's voice; index 7306 is a
# commonly used US-English speaker from the CMU ARCTIC x-vector set.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Visual Question Answering model. microsoft/git-base-vqav2 is a generative
# GIT model: it must be loaded through the "visual-question-answering" task
# ("text2text-generation" expects a seq2seq model and fails to load GIT).
vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
# Text-to-speech model.
text_to_speech_pipeline = pipeline("text-to-speech", model="microsoft/speecht5_tts")


def main():
    """Streamlit UI: answer a question about an image, then speak the answer."""
    st.title("Visual Question Answering with Text-to-Speech")
    image_path = st.text_input("Enter image path:")
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        # VQA pipeline takes image/question keywords and returns [{"answer": ...}].
        answer = vqa_pipeline(image=image_path, question=question)[0]["answer"]
        # The speaker embedding is a model input, not text: pass it through
        # forward_params instead of interpolating the tensor into the string.
        speech = text_to_speech_pipeline(
            answer, forward_params={"speaker_embeddings": speaker_embeddings}
        )
        st.write("Answer:", answer)
        # TTS pipeline returns {"audio": np.ndarray, "sampling_rate": int};
        # a raw array requires sample_rate for st.audio to play it.
        st.audio(speech["audio"], sample_rate=speech["sampling_rate"])


if __name__ == '__main__':
    main()