from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
import torch
import gradio as gr

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The acoustic model and its processor are both loaded from the same
# fine-tuned checkpoint id.
model_checkpoint = "ejazhabibdar/speecht5_finetuned_voxpopuli_nl"
model = SpeechT5ForTextToSpeech.from_pretrained(model_checkpoint)
processor = SpeechT5Processor.from_pretrained(model_checkpoint)

# Vocoder that is handed to the model when generating speech.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Place both networks on the chosen device and switch them to evaluation
# mode; `.to()` and `.eval()` each return the module, so they can be chained.
model = model.to(device).eval()
vocoder = vocoder.to(device).eval()

# Define the TTS function
def text_to_speech(text):
    """Synthesize speech for ``text`` in a form Gradio's audio output accepts.

    Args:
        text: The text to speak.

    Returns:
        ``None`` when ``text`` is empty or whitespace-only (nothing to play);
        otherwise a ``(sample_rate, samples)`` tuple, where ``samples`` is an
        int16 NumPy array holding the generated waveform.
    """
    # Nothing to synthesize: return no audio instead of running the model.
    if not text or not text.strip():
        return None

    # Tokenize the text and keep only the ids, moved to the model's device.
    input_ids = processor(text=text, return_tensors="pt")["input_ids"].to(device)

    # Keep the sequence within the text-position limit declared in the model
    # config so overly long inputs are cut instead of overflowing it.
    input_ids = input_ids[..., : model.config.max_text_positions]

    # NOTE(review): no speaker_embeddings are passed here. Recent transformers
    # releases appear to require them for generate_speech -- confirm against
    # the installed version and supply an x-vector if needed.
    with torch.no_grad():
        speech = model.generate_speech(input_ids, vocoder=vocoder)

    # Gradio's audio output expects (sample_rate, ndarray), not a Python list.
    # Convert the float waveform to 16-bit PCM; clamp first so out-of-range
    # samples saturate instead of wrapping around.
    pcm = (speech.clamp(-1.0, 1.0) * 32767).to(torch.int16).cpu().numpy()

    # Take the rate from the vocoder config; fall back to 16 kHz if absent.
    sample_rate = getattr(vocoder.config, "sampling_rate", 16000)
    return sample_rate, pcm

# Web UI: a single text box as input and a single audio player as output,
# both wired to text_to_speech.
# NOTE(review): the "huggingface" theme name may not be recognized by newer
# Gradio releases -- confirm against the installed version.
iface = gr.Interface(
    text_to_speech,
    "text",
    "audio",
    title="Text-to-Speech",
    description="Enter the text and listen to the generated speech.",
    theme="huggingface",
)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()