import gradio as gr from TTS.api import TTS tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True) def predict(prompt, language, audio_file_pth): tts.tts_to_file( text=prompt, file_path="output.wav", speaker_wav=audio_file_pth, language=language, ) return gr.make_waveform( audio="output.wav", ) title = "XTTS: MVP" gr.Interface( fn=predict, inputs=[ gr.Textbox( label="Text Prompt", info="One or two sentences at a time is better", placeholder="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", ), gr.Dropdown( label="Language", info="Select an output language for the synthesised speech", choices=[ "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cz", "ar", "zh", ], max_choices=1, ), gr.Audio( label="Reference Audio", info="Upload a reference audio for target speaker voice", type="filepath", ), ], outputs=[ gr.Video(label="Synthesised Speech"), ], title=title, ).launch(debug=True)