"""Gradio demo that turns input text into speech with a SpeechT5 checkpoint."""

from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
import torch
import gradio as gr

# Load the TTS model and its matching processor from the same checkpoint.
model_checkpoint = "ejazhabibdar/speecht5_finetuned_voxpopuli_nl"
model = SpeechT5ForTextToSpeech.from_pretrained(model_checkpoint)
processor = SpeechT5Processor.from_pretrained(model_checkpoint)

# Load the vocoder that converts the model's output into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
vocoder.to(device)

# Inference only: switch both networks to evaluation mode.
model.eval()
vocoder.eval()


def text_to_speech(text: str):
    """Synthesize speech for ``text``.

    Args:
        text: The text to speak.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is a NumPy
        array on the CPU. This is the form Gradio's audio output consumes;
        a plain Python list of samples is not accepted by that component.
    """
    # Tokenize the text; only ``input_ids`` is needed for generation.
    encoded = processor(text=text, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)

    # NOTE(review): no ``speaker_embeddings`` are passed here. If this
    # checkpoint was fine-tuned with speaker x-vectors, output quality may
    # suffer without them -- confirm against the checkpoint's model card.
    with torch.no_grad():
        speech = model.generate_speech(input_ids, vocoder=vocoder)

    # Move the waveform off the GPU and pair it with the vocoder's sample
    # rate so the audio component can encode and play it.
    return vocoder.config.sampling_rate, speech.cpu().numpy()


# Create a Gradio interface
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Text-to-Speech",
    description="Enter the text and listen to the generated speech.",
    theme="huggingface",
)

if __name__ == "__main__":
    iface.launch()