import os
import uuid

import noisereduce as nr
import numpy as np
import soundfile as sf
import streamlit as st
import torch
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Note: newer SpeechBrain releases expose this class as speechbrain.inference.EncoderClassifier;
# the speechbrain.pretrained import still works but emits a deprecation warning.
from speechbrain.pretrained import EncoderClassifier
st.title("Ratan Tata SpeechT5 TTS Demo")
processor = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")  # Replace with your fine-tuned model folder or Hub ID
processor.tokenizer.split_special_tokens = True
model = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")  # Replace with your fine-tuned model folder or Hub ID
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
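# On Streamlit, this whole script reruns on every interaction, reloading all three
# models each time. A minimal sketch of caching them with st.cache_resource
# (available in Streamlit >= 1.18; load_tts is a hypothetical helper name):
#
#   @st.cache_resource
#   def load_tts():
#       proc = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
#       mdl = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
#       voc = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
#       return proc, mdl, voc
#
#   processor, model, vocoder = load_tts()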
# The generic CMU ARCTIC x-vector is not needed here: the speaker embedding is
# derived below from the reference recording via the SpeechBrain x-vector model.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
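# savedir under /tmp caches the downloaded checkpoint between reruns of the script
# (an assumption about the deployment; any writable directory works).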
# Load the reference recording and derive the speaker x-vector from it.
signal, fs = torchaudio.load("default_ratan_tata_voice.wav")
if signal.shape[0] > 1:  # mix down to mono if the file is stereo
    signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:  # the x-vector model expects 16 kHz input
    signal = torchaudio.functional.resample(signal, fs, 16000)
with torch.no_grad():
    speaker_embeddings = speaker_model.encode_batch(signal)  # shape (1, 1, 512)
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
speaker_embeddings = speaker_embeddings.squeeze(1).cpu()  # shape (1, 512)
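# Fail fast if the reference audio produced an unexpected embedding shape;
# SpeechT5's generate_speech expects a (batch, 512) speaker embedding.
assert speaker_embeddings.shape == (1, 512), speaker_embeddings.shape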
def split_text_by_length(text, max_length=60):
    """Split text into chunks of at most max_length characters, preserving words.

    SpeechT5 degrades on long inputs, so the text is synthesized chunk by chunk.
    """
    words = text.split()
    result = []
    current_line = []
    for word in words:
        # Start a new chunk if adding the next word would exceed the limit
        if current_line and len(' '.join(current_line + [word])) > max_length:
            result.append(' '.join(current_line))
            current_line = [word]
        else:
            current_line.append(word)
    # Add the last remaining chunk
    if current_line:
        result.append(' '.join(current_line))
    return result
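# Example of the chunking behaviour (hypothetical input):
#   split_text_by_length("the quick brown fox jumps over the lazy dog", max_length=15)
#   -> ['the quick brown', 'fox jumps over', 'the lazy dog']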
# The text area must be created before the button so its value is available
# on the rerun that the button click triggers.
input_text = st.text_area("Enter the text to be synthesized:", value="This is a generated audio example.")
if st.button("Generate Speech"):
    text_chunks = split_text_by_length(input_text, max_length=80)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Apply noise reduction to each chunk; SpeechT5/HiFi-GAN output is 16 kHz
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)
    # Concatenate the noise-reduced chunks into one waveform
    concatenated_speech = np.concatenate(all_speech)
    # Write to a uniquely named file so concurrent sessions don't collide
    os.makedirs("./tmp", exist_ok=True)
    output_path = os.path.join("./tmp", f"output_speech_{uuid.uuid4()}.wav")
    sf.write(output_path, concatenated_speech, 16000)
    st.audio(output_path)
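# Optional (inside the button block above): also offer the result as a download via
# st.download_button, a standard Streamlit API; label and file name are illustrative:
#
#   with open(output_path, "rb") as f:
#       st.download_button("Download WAV", f, file_name="generated_speech.wav")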