Spaces:

Emerging-Tech
/

speech

Sleeping

speech / app.py

Update app.py

b959239 verified over 1 year ago

1.49 kB

	import streamlit as st
	from transformers import VitsModel, AutoTokenizer
	import torch
	import soundfile as sf

	# Page header: the app title followed by a one-line usage hint.
	page_title = "Text-to-Speech with VitsModel"
	page_intro = "Enter some English text, and I'll generate audio for you!"
	st.title(page_title)
	st.write(page_intro)

	# Load Model and Tokenizer
	@st.cache_resource  # Cache the model for efficiency
	def load_tts_model(model_id="facebook/mms-tts-eng"):
	    """Load the VITS text-to-speech model and its matching tokenizer.

	    Both objects are loaded from the same checkpoint id so they cannot
	    drift apart if the checkpoint is changed in one place only.

	    Args:
	        model_id: Hugging Face Hub id (or local path) of the checkpoint.
	            Defaults to the English MMS-TTS checkpoint.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    model = VitsModel.from_pretrained(model_id)
	    tokenizer = AutoTokenizer.from_pretrained(model_id)
	    return model, tokenizer

	# Obtain the model and tokenizer from the cached loader.
	model, tokenizer = load_tts_model()

	# User Input
	user_text = st.text_input("Enter your text here:")

	# Generate Audio on Button Click
	if st.button("Generate Speech"):
	    # Treat whitespace-only input like empty input: there is nothing to
	    # synthesize, so warn instead of sending it to the model.
	    if not user_text or not user_text.strip():
	        st.warning("Please enter some text.")
	    else:
	        inputs = tokenizer(user_text, return_tensors="pt")
	        # Inference only: no gradients are needed.
	        with torch.no_grad():
	            output = model(**inputs).waveform

	        # Read the sampling rate from the checkpoint's config rather than
	        # hard-coding a guess, so playback speed/pitch stay correct if the
	        # checkpoint is ever swapped.
	        sample_rate = model.config.sampling_rate

	        # First (and only) waveform in the batch, as a NumPy array.
	        waveform = output[0].numpy()

	        # Render a single player straight from memory. The previous code
	        # also wrote a fixed "temp_audio.wav" into the working directory
	        # and played it in a second widget: that produced two players per
	        # click and left a shared file that concurrent sessions would
	        # overwrite.
	        st.audio(waveform, sample_rate=sample_rate)