from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
import noisereduce as nr
import soundfile as sf
import os
import torchaudio
import numpy as np
import torch
import streamlit as st
import uuid

st.title("Ratan Tata SpeechT5 TTS Demo")

# Fine-tuned SpeechT5 checkpoint -- replace with the path to your model folder.
processor = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
processor.tokenizer.split_special_tokens = True
model = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Default x-vector speaker embedding (overridden below by the cloned-voice embedding).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Speaker-embedding model used to extract an x-vector from the reference recording.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

# Load the reference recording, downmix to mono, and resample to the 16 kHz
# rate the x-vector model expects.
signal, fs = torchaudio.load("default_ratan_tata_voice.wav")
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)

# Encode the reference audio into a (1, 512) speaker embedding for generate_speech.
speaker_embeddings = speaker_model.encode_batch(signal)            # (1, 1, 512)
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
speaker_embeddings = speaker_embeddings.squeeze(1).cpu()           # (1, 512)


def split_text_by_length(text, max_length=60):
    """Split text into chunks of at most max_length characters, preserving words.

    SpeechT5 degrades on long inputs, so the text is synthesized chunk by chunk.
    """
    words = text.split()
    result = []
    current_line = []
    for word in words:
        # Start a new chunk if adding the next word would exceed the limit.
        if len(" ".join(current_line + [word])) > max_length:
            result.append(" ".join(current_line))
            current_line = [word]
        else:
            current_line.append(word)
    # Add the last remaining chunk.
    if current_line:
        result.append(" ".join(current_line))
    return result


# Declare the text area before the button so the entered text is available
# on the rerun triggered by clicking "Generate Speech".
input_text = st.text_area(
    "Enter the text to be synthesized:",
    value="This is a generated audio example.",
)

if st.button("Generate Speech"):
    text_chunks = split_text_by_length(input_text, max_length=80)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Apply noise reduction to each chunk (SpeechT5 outputs 16 kHz audio).
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)

    # Concatenate the noise-reduced chunks and write them to a temporary wav file.
    concatenated_speech = np.concatenate(all_speech)
    os.makedirs("./tmp", exist_ok=True)
    output_path = os.path.join("./tmp", f"output_speech_{uuid.uuid4()}.wav")
    sf.write(output_path, concatenated_speech, 16000)
    st.audio(output_path)
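
# To launch the demo, run the script with Streamlit (assuming it is saved as app.py;
# adjust the filename to match your project):
#   streamlit run app.py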