""" Copyright 2022 Balacoon TTS interactive demo """ import logging from typing import cast import gradio as gr from balacoon_tts import TTS from huggingface_hub import hf_hub_download, list_repo_files # global tts module, initialized from a model selected tts = None def main(): logging.basicConfig(level=logging.INFO) with gr.Blocks() as demo: gr.Markdown( """

Text-to-Speech

1. Write an utterance to generate, 2. Select the model to synthesize with 3. Select speaker (only for multi-speaker models) 4. Hit "Generate" and listen to the result! When you select model for the first time, it will take a little time to download it. You can learn more about models available [here](https://huggingface.co/balacoon/tts), visit [Balacoon website](https://balacoon.com/) for more info. """ ) with gr.Row(variant="panel"): text = gr.Textbox(label="Text", placeholder="Type something here...") with gr.Row(): with gr.Column(variant="panel"): repo_files = list_repo_files(repo_id="balacoon/tts") model_files = [x for x in repo_files if x.endswith(".addon")] model_name = gr.Dropdown( label="Model", choices=model_files, ) with gr.Column(variant="panel"): speaker = gr.Dropdown(label="Speaker", choices=[]) def set_model(model_name_str: str): """ gets value from `model_name`, loads model, re-initializes tts object, gets list of speakers that model supports and set them to `speaker` """ model_path = hf_hub_download( repo_id="balacoon/tts", filename=model_name_str ) global tts tts = TTS(model_path) speakers = tts.get_speakers() if speakers: visible = True value = speakers[-1] else: visible = False value = "" return gr.Dropdown.update( choices=speakers, value=value, visible=visible ) model_name.change(set_model, inputs=model_name, outputs=speaker) with gr.Row(variant="panel"): generate = gr.Button("Generate") with gr.Row(variant="panel"): audio = gr.Audio() def synthesize_audio(text_str: str, speaker_str: str = ""): """ gets utterance to synthesize from `text` Textbox and speaker name from `speaker` dropdown list. speaker name might be empty for single-speaker models. Synthesizes the waveform and updates `audio` with it. """ if not text_str: logging.info("text or speaker are not provided") return None global tts if len(text_str) > 1024: text_str = text_str[:1024] samples = cast(TTS, tts).synthesize(text_str, speaker_str) return gr.Audio.update(value=(24000, samples)) generate.click(synthesize_audio, inputs=[text, speaker], outputs=audio) demo.launch() if __name__ == "__main__": main()
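

# ---------------------------------------------------------------------------
# A minimal sketch (not used by the demo above) of driving balacoon_tts
# directly, without Gradio, and writing the result to a WAV file. The model
# filename here is a placeholder for any ".addon" file from
# https://huggingface.co/balacoon/tts, and the mono 16-bit PCM / 24 kHz output
# format is an assumption based on how the demo feeds `gr.Audio`.
def _offline_synthesis_sketch(addon_filename: str = "<model>.addon"):
    import wave

    addon_path = hf_hub_download(repo_id="balacoon/tts", filename=addon_filename)
    local_tts = TTS(addon_path)
    speakers = local_tts.get_speakers()  # empty for single-speaker models
    speaker_name = speakers[-1] if speakers else ""
    samples = local_tts.synthesize("Hello from the offline sketch!", speaker_name)
    with wave.open("example.wav", "wb") as fp:
        fp.setnchannels(1)       # assumption: mono output
        fp.setsampwidth(2)       # assumption: 16-bit samples
        fp.setframerate(24000)   # the demo passes 24000 Hz to gr.Audio
        fp.writeframes(samples)  # assumption: samples expose the buffer protocol (e.g. numpy int16)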