"""Streamlit app that turns English text into speech with a VITS model.

The page shows a text box and a button. When the button is pressed with
non-blank text, the text is tokenized, run through the
``facebook/mms-tts-eng`` checkpoint, and the resulting waveform is played
back in the browser.
"""

import io

import soundfile as sf
import streamlit as st
import torch
from transformers import AutoTokenizer, VitsModel

# Hugging Face checkpoint used for both the model and its tokenizer.
MODEL_ID = "facebook/mms-tts-eng"

# Title and Description
st.title("Text-to-Speech with VitsModel")
st.write("Enter some English text, and I'll generate audio for you!")


# Load Model and Tokenizer
@st.cache_resource  # Cache the model so Streamlit reruns reuse the loaded objects
def load_tts_model():
    """Load the TTS model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple, both loaded from ``MODEL_ID``.
    """
    model = VitsModel.from_pretrained(MODEL_ID)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    return model, tokenizer


def _synthesize_wav(text, model, tokenizer):
    """Run *text* through the model and return the audio as WAV bytes.

    The WAV is encoded into an in-memory buffer rather than a fixed file on
    disk, so concurrent sessions cannot overwrite each other's audio and no
    stray file is left in the working directory.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only; no gradients needed
        waveform = model(**inputs).waveform

    # Take the sample rate from the loaded checkpoint's config instead of
    # hard-coding it, so the header stays correct if MODEL_ID changes.
    sample_rate = model.config.sampling_rate

    buffer = io.BytesIO()
    # `format` must be given explicitly: a file-like object has no extension
    # for soundfile to infer the container from.
    sf.write(buffer, waveform[0].cpu().numpy(), sample_rate, format="WAV")
    return buffer.getvalue()


model, tokenizer = load_tts_model()

# User Input
user_text = st.text_input("Enter your text here:")

# Generate Audio on Button Click
if st.button("Generate Speech"):
    # Strip first so whitespace-only input is rejected like empty input.
    text = user_text.strip()
    if not text:
        st.warning("Please enter some text.")
    else:
        # Render exactly one player for the generated audio.
        st.audio(_synthesize_wav(text, model, tokenizer), format="audio/wav")