"""Gradio demo: synthesize speech from text with a selectable speaker embedding."""

from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
import gradio as gr

# Text-to-speech pipeline and the speaker-embedding table, both loaded once at
# import time so every request reuses them.
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


def synthesize_speech(text: str, speaker_id: float) -> str:
    """Synthesize ``text`` with the chosen speaker and return the WAV file path.

    Args:
        text: The text to turn into speech.
        speaker_id: Row index into ``embeddings_dataset`` selecting the speaker
            embedding. UI sliders may deliver this as a float, so it is
            converted to ``int`` before indexing.

    Returns:
        The path of the written audio file (``"speech.wav"``).
    """
    # Dataset rows must be addressed with an integer index; a float coming
    # from the slider widget would otherwise fail the lookup.
    row_index = int(speaker_id)
    xvector = embeddings_dataset[row_index]["xvector"]
    # unsqueeze(0) adds a leading dimension in front of the embedding vector
    # (presumably the batch axis the model expects -- confirm against the
    # pipeline documentation).
    speaker_embedding = torch.tensor(xvector).unsqueeze(0)

    speech = synthesiser(
        text,
        forward_params={"speaker_embeddings": speaker_embedding},
    )

    # NOTE(review): the output path is fixed, so every call overwrites the
    # previous result; simultaneous requests would share this file.
    sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
    return "speech.wav"


# Define the interface: a text box plus a slider that picks the speaker row.
iface = gr.Interface(
    fn=synthesize_speech,
    inputs=[
        gr.Textbox(label="Enter your text"),
        gr.Slider(
            minimum=0,
            maximum=len(embeddings_dataset) - 1,
            step=1,  # only whole numbers are valid dataset indices
            label="Speaker ID",
        ),
    ],
    outputs="audio",
    title="Text-to-Speech Synthesizer",
    description="Type a text and choose a speaker to synthesize speech.",
)

# Launch the app
iface.launch()