from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset
import noisereduce as nr
import soundfile as sf
import os
import torchaudio
import numpy as np
import torch
import streamlit as st
import uuid

st.title("Ratan Tata SpeechT5 TTS Demo")

# Fine-tuned SpeechT5 checkpoint -- replace with the path to your model folder.
processor = SpeechT5Processor.from_pretrained("Ratan_tata_Voice_Cloned_tts")
processor.tokenizer.split_special_tokens = True
model = SpeechT5ForTextToSpeech.from_pretrained("Ratan_tata_Voice_Cloned_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Default x-vector speaker embedding (overridden below by the cloned-voice embedding).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Speaker-embedding model used to extract an x-vector from the reference recording.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

# Load the reference recording, downmix to mono, and resample to the 16 kHz
# rate the x-vector model expects.
signal, fs = torchaudio.load("default_ratan_tata_voice.wav")
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)

# Encode the reference audio into a (1, 512) speaker embedding for generate_speech.
speaker_embeddings = speaker_model.encode_batch(signal)            # (1, 1, 512)
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
speaker_embeddings = speaker_embeddings.squeeze(1).cpu()           # (1, 512)


def split_text_by_length(text, max_length=60):
    """Split text into chunks of at most max_length characters, preserving words.

    SpeechT5 degrades on long inputs, so the text is synthesized chunk by chunk.
    """
    words = text.split()
    result = []
    current_line = []
    for word in words:
        # Start a new chunk if adding the next word would exceed the limit.
        if len(" ".join(current_line + [word])) > max_length:
            result.append(" ".join(current_line))
            current_line = [word]
        else:
            current_line.append(word)
    # Add the last remaining chunk.
    if current_line:
        result.append(" ".join(current_line))
    return result


# Declare the text area before the button so the entered text is available
# on the rerun triggered by clicking "Generate Speech".
input_text = st.text_area(
    "Enter the text to be synthesized:",
    value="This is a generated audio example.",
)

if st.button("Generate Speech"):
    text_chunks = split_text_by_length(input_text, max_length=80)
    all_speech = []
    for chunk in text_chunks:
        inputs = processor(text=chunk, return_tensors="pt")
        speech_chunk = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        if isinstance(speech_chunk, torch.Tensor):
            speech_chunk = speech_chunk.cpu().numpy()
        # Apply noise reduction to each chunk (SpeechT5 outputs 16 kHz audio).
        reduced_noise_chunk = nr.reduce_noise(y=speech_chunk, sr=16000)
        all_speech.append(reduced_noise_chunk)

    # Concatenate the noise-reduced chunks and write them to a temporary wav file.
    concatenated_speech = np.concatenate(all_speech)
    os.makedirs("./tmp", exist_ok=True)
    output_path = os.path.join("./tmp", f"output_speech_{uuid.uuid4()}.wav")
    sf.write(output_path, concatenated_speech, 16000)
    st.audio(output_path)
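
# To launch the demo, run the script with Streamlit (assuming it is saved as app.py;
# adjust the filename to match your project):
#   streamlit run app.py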