Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from transformers import VitsModel, AutoTokenizer | |
| import torch | |
| import soundfile as sf | |
# Page header: the app title followed by a one-line usage hint.
APP_TITLE = "Text-to-Speech with VitsModel"
APP_DESCRIPTION = "Enter some English text, and I'll generate audio for you!"

st.title(APP_TITLE)
st.write(APP_DESCRIPTION)
# Load Model and Tokenizer
# Streamlit re-executes this script on every widget interaction, so the
# loader is wrapped in st.cache_resource to avoid reloading on each run.
@st.cache_resource
def load_tts_model():
    """Load the ``facebook/mms-tts-eng`` model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple as produced by
        ``VitsModel.from_pretrained`` and ``AutoTokenizer.from_pretrained``.
    """
    checkpoint = "facebook/mms-tts-eng"
    model = VitsModel.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer
# Obtain the model and tokenizer used for synthesis below.
model, tokenizer = load_tts_model()

# User Input
user_text = st.text_input("Enter your text here:")

# Generate Audio on Button Click
if st.button("Generate Speech"):
    # Treat whitespace-only input the same as empty input.
    if not user_text.strip():
        st.warning("Please enter some text.")
    else:
        inputs = tokenizer(user_text, return_tensors="pt")
        # Inference only: no gradients are needed.
        with torch.no_grad():
            waveform = model(**inputs).waveform

        # Read the sample rate from the loaded checkpoint's configuration
        # instead of hard-coding a value that might not match the model.
        sample_rate = model.config.sampling_rate

        # Play the audio straight from memory, exactly once. Writing to a
        # fixed "temp_audio.wav" path would be shared between concurrent
        # sessions, and rendering both playback methods showed two players.
        # waveform[0] takes the first entry of the returned waveform
        # (assumes a single input text per call).
        st.audio(waveform[0].numpy(), sample_rate=sample_rate)