import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import torch
from TTS.api import TTS
from TTS.utils.manage import ModelManager

title = "Coqui.ai: Text-to-Speech generation and Voice Conversion"
description = """
Coqui.ai is a library for advanced Text-to-Speech generation and Voice Conversion.
It is built on the latest research and designed to achieve the best trade-off among ease of training, speed, and quality.
Coqui.ai comes with pretrained models and tools for measuring dataset quality, and is already used in 20+ languages for products and research projects.

For this demo we selected the 6 best-performing TTS models from the Coqui.ai library.

How to use:
1. Select a model from the dropdown box. Some multispeaker/multilingual models also let you select a speaker and a language.
2. Upload or select the voice to be cloned [or record using the microphone - TBD].
3. Enter text in the text box or upload an audio file [or record using the microphone - TBD].
4. Press the "Text to speech" or "Convert audio" button.

For the TTS task you can choose not to clone a voice and hear the model's original voice.
"""
article = """

References: original GitHub | Documentation | Voice cloning on Coqui Studio | Text-to-Speech paper collection

With few exceptions, Voice Conversion in Coqui.ai is implemented with the FreeVC model:
@misc{li2022freevc,
      title={FreeVC: Towards High-Quality Text-Free One-Shot Voice Conversion}, 
      author={Jingyi Li and Weiping Tu and Li Xiao},
      year={2022},
      eprint={2210.15418},
      archivePrefix={arXiv},
      primaryClass={cs.SD}
}
""" class TTS_local(TTS): def __init__(self, model_name=None, output_prefix: str = './', progress_bar: bool = True, gpu=False): super().__init__( model_name=None, model_path=None, config_path=None, vocoder_path=None, vocoder_config_path=None, progress_bar=progress_bar, gpu=False, ) self.manager = ModelManager(models_file=self.get_models_file_path(), output_prefix=output_prefix, progress_bar=progress_bar, verbose=False) if model_name is not None: if "tts_models" in model_name or "coqui_studio" in model_name: self.load_tts_model_by_name(model_name, gpu) elif "voice_conversion_models" in model_name: self.load_vc_model_by_name(model_name, gpu) device = "cuda" if torch.cuda.is_available() else "cpu" GPU = device == "cuda" INT16MAX = np.iinfo(np.int16).max MODEL_DIR = './' MANAGER = ModelManager(verbose=False) model_ids = MANAGER.list_models() local_model_ids = [p.parts[-1].replace('--', '/') for p in (Path(MODEL_DIR) / 'tts').glob('*') if p.is_dir() and (p.parts[-1].replace('--', '/') in model_ids)] model_tts_ids = [model for model in local_model_ids if 'tts_models' in model and ('/multilingual/' in model or '/en/' in model)] model_vocoder_ids = [model for model in local_model_ids if 'vocoder_models' in model and ('/universal/' in model or '/en/' in model)] model_vconv_ids = [model for model in local_model_ids if 'voice_conversion_models' in model and ('/multilingual/' in model or '/en/' in model)] VC_MODEL = TTS_local(model_name='voice_conversion_models/multilingual/vctk/freevc24', output_prefix=MODEL_DIR, progress_bar=False, gpu=GPU) examples_pt = 'examples' allowed_extentions = ['.mp3', '.wav'] examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extentions} verse = """Mary had a little lamb, Its fleece was white as snow. 

verse = """Mary had a little lamb,
Its fleece was white as snow.
Everywhere the child went,
The little lamb was sure to go."""


def on_model_tts_select(model_name):
    tts_var = TTS(model_name=model_name, progress_bar=False, gpu=GPU)
    languages = tts_var.languages if tts_var.is_multi_lingual else ['']
    # Some models ship speaker names containing newline characters, which break
    # dropdown choices; escape them here and undo it in text_to_speech.
    speakers = [s.replace('\n', '-n') for s in tts_var.speakers] if tts_var.is_multi_speaker else ['']
    language = languages[0]
    speaker = speakers[0]
    return tts_var, \
        gr.update(choices=languages, value=language, interactive=tts_var.is_multi_lingual), \
        gr.update(choices=speakers, value=speaker, interactive=tts_var.is_multi_speaker)


def on_voicedropdown(x):
    return examples[x]


def voice_clone(source_wav, target_wav):
    print(f'model: {VC_MODEL.model_name}\nsource_wav: {source_wav}\ntarget_wav: {target_wav}')
    sample_rate = VC_MODEL.voice_converter.output_sample_rate
    if source_wav is None or target_wav is None:
        return (sample_rate, np.zeros(0).astype(np.int16))

    speech = VC_MODEL.voice_conversion(source_wav=source_wav, target_wav=target_wav)
    # gr.Audio expects (sample_rate, int16 array), so rescale the float waveform.
    speech = (np.array(speech) * INT16MAX).astype(np.int16)
    return (sample_rate, speech)


def text_to_speech(text, tts_model, language, speaker, target_wav, use_original_voice):
    if len(text.strip()) == 0 or tts_model is None or (target_wav is None and not use_original_voice):
        return (16000, np.zeros(0).astype(np.int16))

    sample_rate = tts_model.synthesizer.output_sample_rate
    if tts_model.is_multi_speaker:
        # Undo the '-n' escaping applied in on_model_tts_select.
        speaker = {s.replace('\n', '-n'): s for s in tts_model.speakers}[speaker]
    print(f'model: {tts_model.model_name}\nlanguage: {language}\nspeaker: {speaker}')

    language = None if language == '' else language
    speaker = None if speaker == '' else speaker
    if use_original_voice:
        print('Using original voice')
        speech = tts_model.tts(text, language=language, speaker=speaker)
    elif tts_model.synthesizer.tts_model.speaker_manager and tts_model.synthesizer.tts_model.speaker_manager.encoder_ap:
        print('voice cloning with the tts')
        speech = tts_model.tts(text, language=language, speaker_wav=target_wav)
    else:
        print('voice cloning with the voice conversion model')
        # speech = tts_model.tts_with_vc(text, language=language, speaker_wav=target_wav)
        # Lazy code: save the synthesized speech to a temp file so it is
        # resampled when the voice conversion model reads it back.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            tts_model.tts_to_file(text, language=language, speaker=speaker, file_path=fp.name)
            speech = VC_MODEL.voice_conversion(source_wav=fp.name, target_wav=target_wav)
        sample_rate = VC_MODEL.voice_converter.output_sample_rate

    speech = (np.array(speech) * INT16MAX).astype(np.int16)
    return (sample_rate, speech)
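
# The handlers above can also be exercised without the UI; a rough sketch with
# hypothetical file names (assuming a single-speaker, single-language model):
#
#   model = TTS(model_name=model_tts_ids[0], progress_bar=False, gpu=GPU)
#   sr, wav = text_to_speech(verse, model, '', '', None, use_original_voice=True)
#   sr, wav = voice_clone('examples/source.wav', 'examples/target.wav')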

with gr.Blocks() as demo:
    tts_model = gr.State(None)

    def activate(*args):
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)

    def deactivate(*args):
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)

    gr.Markdown(description)

    with gr.Row(equal_height=True):
        with gr.Column(scale=5, min_width=50):
            model_tts_dropdown = gr.Dropdown(model_tts_ids, value=None, label='Text-to-speech model', interactive=True)
        with gr.Column(scale=1, min_width=10):
            language_dropdown = gr.Dropdown(None, value=None, label='Language', interactive=False, visible=True)
        with gr.Column(scale=1, min_width=10):
            speaker_dropdown = gr.Dropdown(None, value=None, label='Speaker', interactive=False, visible=True)

    with gr.Accordion("Target voice", open=False) as accordion:
        gr.Markdown("Upload target voice...")
        with gr.Row(equal_height=True):
            voice_upload = gr.Audio(label='Upload target voice', source='upload', type='filepath')
            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                with gr.Column():
                    text_to_convert = gr.Textbox(verse)
                    orig_voice = gr.Checkbox(label='Use original voice')
                voice_to_convert = gr.Audio(label="Upload voice to convert", source='upload', type='filepath')
            with gr.Row(equal_height=True):
                button_text = gr.Button('Text to speech', interactive=True)
                button_audio = gr.Button('Convert audio', interactive=True)
            with gr.Row(equal_height=True):
                speech = gr.Audio(label='Converted Speech', type='numpy', visible=True, interactive=False)

    # actions: disable both buttons while a callback runs, then re-enable them
    model_tts_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_model_tts_select, inputs=[model_tts_dropdown],
             outputs=[tts_model, language_dropdown, speaker_dropdown]).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=text_to_speech,
             inputs=[text_to_convert, tts_model, language_dropdown, speaker_dropdown, voice_upload, orig_voice],
             outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=voice_clone, inputs=[voice_to_convert, voice_upload], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    gr.HTML(article)

demo.launch(share=False)
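
# To expose the demo beyond localhost, Gradio's standard launch options apply, e.g.:
#   demo.launch(share=True)             # temporary public link
#   demo.launch(server_name='0.0.0.0')  # listen on all interfaces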