"""Streamlit demo: synthesize speech with a fine-tuned SpeechT5 model.

The user uploads a reference audio clip, from which an x-vector speaker
embedding is computed, then enters text. The text is split into short
chunks, each chunk is synthesized and noise-reduced, and the chunks are
concatenated, written to a WAV file and played back in the page.
"""

import streamlit as st
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio
import noisereduce as nr
import numpy as np
import soundfile as sf

# Sample rate used for noise reduction and for the written WAV file. The
# reference clip is resampled to the same rate before it is embedded.
SAMPLE_RATE = 16000
TTS_CHECKPOINT = "checkpoint-60000"  # Replace with model folder
OUTPUT_PATH = "output_speech.wav"

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def load_models():
    """Load the processor, TTS model, vocoder and speaker encoder once.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the models in memory instead of reloading them each time.

    Returns:
        Tuple ``(processor, model, vocoder, speaker_model)``.
    """
    tts_processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    encoder = EncoderClassifier.from_hparams(
        source=spk_model_name, run_opts={"device": device}
    )
    return tts_processor, tts_model, hifigan, encoder


def split_text_by_length(text, max_length=80):
    """Split *text* on whitespace into chunks of at most *max_length* chars.

    Words are never broken: a single word longer than *max_length* becomes
    a chunk of its own. Empty or whitespace-only text yields an empty list.
    """
    result = []
    current_line = []
    for word in text.split():
        # Only flush a non-empty line; otherwise an over-long first word
        # would emit an empty chunk.
        if current_line and len(' '.join(current_line + [word])) > max_length:
            result.append(' '.join(current_line))
            current_line = [word]
        else:
            current_line.append(word)
    if current_line:
        result.append(' '.join(current_line))
    return result


def compute_speaker_embeddings(audio_file, encoder):
    """Return a ``(1, embedding_dim)`` tensor for the uploaded reference clip.

    Args:
        audio_file: File-like object returned by ``st.file_uploader``.
        encoder: Speaker encoder exposing ``encode_batch``.
    """
    # Rewind in case the same upload object was already read on an earlier
    # script run.
    audio_file.seek(0)
    # NOTE(review): relies on the active torchaudio backend accepting
    # file-like objects - confirm for the installed version.
    signal, fs = torchaudio.load(audio_file)
    # torchaudio returns (channels, frames); mix down so the encoder sees a
    # single utterance rather than one batch item per channel.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    if fs != SAMPLE_RATE:
        signal = torchaudio.functional.resample(
            signal, orig_freq=fs, new_freq=SAMPLE_RATE
        )
    with torch.no_grad():
        embedding = encoder.encode_batch(signal).squeeze().cpu().numpy()
    return torch.tensor(np.array([embedding]))


def synthesize(text, embeddings):
    """Synthesize *text* chunk by chunk and return one concatenated waveform.

    Returns ``None`` when *text* contains no words.
    """
    all_speech = []
    for chunk in split_text_by_length(text):
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(
            inputs["input_ids"], embeddings, vocoder=vocoder
        )
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        all_speech.append(nr.reduce_noise(y=speech_chunk, sr=SAMPLE_RATE))
    if not all_speech:
        return None
    return np.concatenate(all_speech)


# Load models and processor
st.title("Ratan Tata SpeechT5 TTS Demo")
processor, model, vocoder, speaker_model = load_models()

# Upload audio file for voice embeddings
uploaded_file = st.file_uploader("Upload a reference audio file for the speaker voice:")
speaker_embeddings = None
if uploaded_file is not None:
    speaker_embeddings = compute_speaker_embeddings(uploaded_file, speaker_model)

# Text input for TTS
input_text = st.text_area(
    "Enter the text to be synthesized:",
    value="This is a generated audio example.",
)

if st.button("Generate Speech"):
    if speaker_embeddings is None:
        st.warning("Please upload a reference audio file first.")
    elif not input_text.strip():
        st.warning("Please enter some text to synthesize.")
    else:
        concatenated_speech = synthesize(input_text, speaker_embeddings)
        # Save the output audio
        sf.write(OUTPUT_PATH, concatenated_speech, SAMPLE_RATE)
        st.audio(OUTPUT_PATH)

st.write("Upload an audio file, input text, and generate speech that mimics Ratan Tata's voice!")