speech / app.py
Update app.py
b959239 verified
import streamlit as st
from transformers import VitsModel, AutoTokenizer
import torch
import soundfile as sf
# Title and Description
st.title("Text-to-Speech with VitsModel")
st.write("Enter some English text, and I'll generate audio for you!")
# Load Model and Tokenizer
@st.cache_resource # Cache the model for efficiency
def load_tts_model():
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
return model, tokenizer
model, tokenizer = load_tts_model()
# User Input
user_text = st.text_input("Enter your text here:")
# Generate Audio on Button Click
if st.button("Generate Speech"):
if not user_text:
st.warning("Please enter some text.")
inputs = tokenizer(user_text, return_tensors="pt")
with torch.no_grad():
output = model(**inputs).waveform
# Specify sample rate (assuming it's the correct rate for the model)
sample_rate = 16000 # Or replace with the correct sample rate for 'facebook/mms-tts-eng'
# Optionally save to a temporary file (if needed)
sf.write("temp_audio.wav", output[0].numpy(), sample_rate)
# Choose one of the following playback methods:
# Method 1: Play from temporary file
# Method 2: Play directly with sample rate
st.audio(output[0].numpy(), sample_rate=sample_rate)