import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf
# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")

processor = SpeechT5Processor.from_pretrained("checkpoint-60000")  # Replace with the path to your fine-tuned model folder
model = SpeechT5ForTextToSpeech.from_pretrained("checkpoint-60000")  # Replace with the path to your fine-tuned model folder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(source=spk_model_name, run_opts={"device": device})
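# Note: SpeechT5's generate_speech expects a 512-dimensional speaker embedding; the x-vectors
# produced by speechbrain/spkrec-xvect-voxceleb have that size, which is why this particular
# speaker-verification model is paired with SpeechT5 here.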
# Upload audio file for voice embeddings
uploaded_file = st.file_uploader("Upload a reference audio clip of the target voice:", type=["wav"])

speaker_embeddings = None
if uploaded_file is not None:
    signal, fs = torchaudio.load(uploaded_file)
    # Resample to the 16 kHz rate expected by the x-vector model
    if fs != 16000:
        signal = torchaudio.functional.resample(signal, fs, 16000)
    # Encode the reference audio into an x-vector and add a batch dimension
    speaker_embeddings = speaker_model.encode_batch(signal).squeeze().cpu().numpy()
    speaker_embeddings = torch.tensor(np.array([speaker_embeddings]))
# Text input for TTS
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.warning("Please upload a reference audio clip before generating speech.")
        st.stop()
    # Split long input into short chunks so each pass through SpeechT5 stays within a manageable length
    def split_text_by_length(text, max_length=80):
        words = text.split()
        result = []
        current_line = []
        for word in words:
            if len(' '.join(current_line + [word])) > max_length:
                result.append(' '.join(current_line))
                current_line = [word]
            else:
                current_line.append(word)
        if current_line:
            result.append(' '.join(current_line))
        return result
    text_chunks = split_text_by_length(input_text)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Apply noise reduction to each 16 kHz chunk before stitching them together
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)
    concatenated_speech = np.concatenate(all_speech)

    # Save the output audio and play it back in the app
    sf.write("output_speech.wav", concatenated_speech, 16000)
    st.audio("output_speech.wav")
st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!") | |