from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
import noisereduce as nr
import soundfile as sf
import numpy as np
import os
import torch
import torchaudio
import streamlit as st
import uuid
st.title("Ratan Tata SpeechT5 TTS Demo")

# Load the fine-tuned model; replace "Ratan_tata_Voice_Cloned_tts" with your model folder
processor = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
processor.tokenizer.split_special_tokens = True
model = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
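# Optional sketch (assumes Streamlit >= 1.18): Streamlit reruns the whole script
# on every interaction, so caching the heavy objects avoids reloading them each
# time. To use it, replace the three from_pretrained calls above with
# `processor, model, vocoder = load_tts_components()`.
@st.cache_resource
def load_tts_components():
    proc = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
    proc.tokenizer.split_special_tokens = True
    tts = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
    voc = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return proc, tts, voc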
# Generic CMU Arctic x-vector as a baseline speaker embedding
# (overwritten below by the embedding computed from the reference recording)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
# Load the reference recording used to derive the cloned speaker embedding
signal, fs = torchaudio.load('default_ratan_tata_voice.wav')
print(signal.shape, fs)
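# Defensive preprocessing (assumption: the reference wav may not already be
# 16 kHz mono, which is what the VoxCeleb x-vector model was trained on).
if signal.shape[0] > 1:
    signal = signal.mean(dim=0, keepdim=True)  # downmix to mono
if fs != 16000:
    signal = torchaudio.functional.resample(signal, orig_freq=fs, new_freq=16000)
    fs = 16000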
# Derive the speaker embedding from the reference audio with the x-vector model;
# encode_batch accepts the waveform tensor directly, no wrapping needed
speaker_embeddings = speaker_model.encode_batch(signal)                        # (1, 1, 512)
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)  # L2-normalize
speaker_embeddings = speaker_embeddings.squeeze(1).cpu()                       # (1, 512), as generate_speech expects
def split_text_by_length(text, max_length=60):
    # SpeechT5 reportedly handles at most ~120 characters well, so default to
    # 60-character chunks, splitting on word boundaries.
    words = text.split()
    result = []
    current_line = []
    for word in words:
        # Start a new chunk once adding the next word would exceed max_length
        if current_line and len(' '.join(current_line + [word])) > max_length:
            result.append(' '.join(current_line))
            current_line = [word]
        else:
            current_line.append(word)
    # Flush the final chunk
    if current_line:
        result.append(' '.join(current_line))
    return result
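# Example: split_text_by_length("the quick brown fox jumps", max_length=10)
# returns ["the quick", "brown fox", "jumps"]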
# The text area must be created outside the button branch: a widget defined
# inside `if st.button(...)` only appears on the rerun after the click, so the
# app would always synthesize the default text instead of the user's input.
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")

if st.button("Generate Speech"):
    text_chunks = split_text_by_length(input_text, max_length=80)
    print(text_chunks)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # The HiFi-GAN vocoder outputs 16 kHz audio; denoise each chunk before joining
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)
    # Concatenate the noise-reduced speech chunks into one waveform
    concatenated_speech = np.concatenate(all_speech)
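    # Optional sketch (assumption, not in the original app): a short silence
    # between chunks keeps sentence boundaries from running together.
    # Uncomment to join with ~0.15 s gaps instead of butt-splicing:
    # gap = np.zeros(int(0.15 * 16000), dtype=concatenated_speech.dtype)
    # concatenated_speech = np.concatenate(
    #     [seg for chunk in all_speech for seg in (chunk, gap)][:-1]
    # )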
    # Write the result under ./tmp with a unique name so repeated runs don't collide
    os.makedirs("./tmp", exist_ok=True)
    output_path = os.path.join("./tmp", f"output_speech_{uuid.uuid4()}.wav")
    sf.write(output_path, concatenated_speech, 16000)
    st.audio(output_path)