"""Gradio demo for a Formosan-language text-to-speech system.

The script converts input text to IPA, synthesizes audio with the model
selected in the UI, and wires up the widgets that let the user pick a model,
a speaker (built-in or a custom reference recording), a language and a
dialect.
"""

import gradio as gr
import numpy as np
import spaces

from ipa import g2p
from ipa.ipa import text_to_ipa
from models import models_config

# Values of the "speaker type" radio. They are compared against the value the
# UI sends back, so they must match the radio's choices exactly.
DEFAULT_SPEAKER_MODE = "預設語者"
CUSTOM_SPEAKER_MODE = "客製化語者"


@spaces.GPU
def _do_tts(model_id, ipa, language_name, speaker_name=None, speaker_wav=None):
    """Run synthesis for ``ipa`` with the model registered under ``model_id``.

    When ``speaker_wav`` is given it is passed to the model and
    ``speaker_name`` is ignored; otherwise ``speaker_name`` (possibly
    ``None``) is passed. Returns whatever ``model.tts`` returns.
    """
    model = models_config[model_id]["model"]
    if speaker_wav is not None:
        speaker_kwargs = {"speaker_wav": speaker_wav}
    else:
        speaker_kwargs = {"speaker_name": speaker_name}
    return model.tts(
        ipa,
        language_name=language_name,
        split_sentences=False,
        **speaker_kwargs,
    )


def _has_speaker_encoder(model_id):
    """Return True if the model's config names a speaker-encoder checkpoint."""
    model = models_config[model_id]["model"]
    return bool(model.tts_model.config.model_args.speaker_encoder_model_path)


def _dialects_for(language):
    """Return the dialect part of every ``g2p`` tag shaped ``{language}_{dialect}``.

    Matching is done on the ``{language}_`` prefix (the same shape
    ``text_to_speech`` builds) rather than on a substring, so tags without an
    underscore can never raise ``IndexError`` and dialect names containing an
    underscore are kept whole.
    """
    prefix = f"{language}_"
    return [tag[len(prefix):] for tag in g2p if tag.startswith(prefix)]


def text_to_speech(
    model_id: str,
    use_default_emb_or_custom: str,
    speaker_wav,
    speaker: str,
    language: str,
    dialect: str,
    speed: float,
    text: str,
):
    """Synthesize ``text`` and return ``(sample_rate, waveform)``.

    Args:
        model_id: Key into ``models_config``.
        use_default_emb_or_custom: Value of the speaker-type radio.
        speaker_wav: Reference recording used in custom-speaker mode.
        speaker: Built-in speaker id used in default-speaker mode.
        language: Language id; also the g2p tag when it is a key of ``g2p``.
        dialect: Dialect appended to ``language`` to form the g2p tag when
            ``language`` alone is not a key of ``g2p``.
        speed: Speaking-rate multiplier; larger means faster speech.
        text: Text to synthesize.

    Raises:
        gr.Error: If ``text`` is blank, ``speed`` is not positive, or
            custom-speaker mode is selected without a reference recording.
    """
    if not text or not text.strip():
        raise gr.Error("請勿輸入空字串。")
    if speed <= 0:
        raise gr.Error("語速必須大於 0。")

    tag = language if language in g2p else f"{language}_{dialect}"
    ipa = text_to_ipa(text, tag, g2p)

    model_config = models_config[model_id]
    tts_model = model_config["model"].tts_model
    # length_scale scales predicted durations, so a *smaller* value yields
    # *faster* speech; invert the slider so that a higher "speed" is faster.
    # NOTE(review): this mutates the model object stored in models_config, so
    # the setting is shared by every request using this model.
    tts_model.length_scale = 1.0 / speed

    if use_default_emb_or_custom == DEFAULT_SPEAKER_MODE:
        # Models with a single registered speaker are called without a name.
        multi_speaker = len(model_config["speaker_mapping"]) > 1
        wav = _do_tts(
            model_id,
            ipa,
            speaker_name=speaker if multi_speaker else None,
            language_name=language,
        )
    else:
        if speaker_wav is None:
            # Without this guard _do_tts would silently fall back to
            # speaker_name=None instead of using a custom voice.
            raise gr.Error("請先提供客製化語音。")
        wav = _do_tts(
            model_id,
            ipa,
            speaker_wav=speaker_wav,
            language_name=language,
        )

    return tts_model.config.audio.sample_rate, np.array(wav)


def when_model_selected(model_id):
    """Refresh speaker, language and speaker-type widgets for a new model.

    Returns updates for, in order: the speaker dropdown, the language radio
    and the speaker-type radio (which is reset to the default-speaker mode and
    shown only when the model has a speaker encoder).
    """
    model_config = models_config[model_id]
    speaker_choices = list(model_config["speaker_mapping"].items())
    language_choices = list(model_config["language_mapping"].items())

    return (
        gr.update(
            choices=speaker_choices,
            value=speaker_choices[0][1] if speaker_choices else None,
            interactive=len(speaker_choices) > 1,
        ),
        gr.update(
            choices=language_choices,
            value=language_choices[0][1] if language_choices else None,
            interactive=len(language_choices) > 1,
        ),
        gr.update(
            visible=_has_speaker_encoder(model_id),
            value=DEFAULT_SPEAKER_MODE,
        ),
    )


def use_default_emb_or_custom_radio_input(use_default_emb_or_custom):
    """Toggle between the reference-audio widget and the speaker dropdown.

    Returns updates for ``(speaker_wav, speaker_drop_down)``: exactly one of
    the two is visible, depending on the selected speaker type.
    """
    custom = use_default_emb_or_custom == CUSTOM_SPEAKER_MODE
    return gr.update(visible=custom), gr.update(visible=not custom)


def language_radio_changed(language):
    """Update the dialect radio after the language selection changes.

    Languages that are themselves keys of ``g2p`` have no dialect choice, so
    the radio is hidden. Otherwise it is shown again (it may have been hidden
    by a previous selection) and populated with that language's dialects.
    """
    if language in g2p:
        return gr.update(visible=False)

    dialect_choices = _dialects_for(language)
    return gr.update(
        choices=dialect_choices,
        value=dialect_choices[0] if dialect_choices else None,
        interactive=len(dialect_choices) > 1,
        visible=True,
    )


demo = gr.Blocks(
    title="臺灣南島語語音合成系統",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    default_model_id = next(iter(models_config))
    default_model_config = models_config[default_model_id]
    default_speaker_choices = list(default_model_config["speaker_mapping"].items())
    default_language_choices = list(
        default_model_config["language_mapping"].items()
    )

    model_drop_down = gr.Dropdown(
        list(models_config.keys()),
        value=default_model_id,
        label="模型",
    )

    use_default_emb_or_custom_radio = gr.Radio(
        label="語者類型",
        choices=[DEFAULT_SPEAKER_MODE, CUSTOM_SPEAKER_MODE],
        value=DEFAULT_SPEAKER_MODE,
        # Same rule when_model_selected applies when a model is chosen.
        visible=_has_speaker_encoder(default_model_id),
        show_label=False,
    )

    # The radio starts in default-speaker mode, so the initial visibility of
    # the two widgets below must match what the .change handler produces for
    # that mode: reference audio hidden, speaker dropdown shown.
    speaker_wav = gr.Audio(
        label="客製化語音",
        visible=False,
        editable=False,
        type="filepath",
        waveform_options=gr.WaveformOptions(
            show_controls=False,
            sample_rate=16000,
        ),
    )

    speaker_drop_down = gr.Dropdown(
        choices=default_speaker_choices,
        value=default_speaker_choices[0][1] if default_speaker_choices else None,
        label="語者",
        interactive=len(default_speaker_choices) > 1,
        visible=True,
    )

    use_default_emb_or_custom_radio.change(
        use_default_emb_or_custom_radio_input,
        inputs=[use_default_emb_or_custom_radio],
        outputs=[speaker_wav, speaker_drop_down],
    )

    default_language = (
        default_language_choices[0][1] if default_language_choices else None
    )
    language_radio = gr.Radio(
        choices=default_language_choices,
        value=default_language,
        label="語言",
        interactive=len(default_language_choices) > 1,
    )

    # Mirror language_radio_changed: a language that is itself a g2p key has
    # no dialects, so the radio starts hidden and empty instead of crashing.
    default_dialect_choices = (
        [] if default_language in g2p else _dialects_for(default_language)
    )
    dialect_radio = gr.Radio(
        choices=default_dialect_choices,
        value=default_dialect_choices[0] if default_dialect_choices else None,
        label="方言",
        interactive=len(default_dialect_choices) > 1,
        visible=bool(default_dialect_choices),
    )

    language_radio.change(
        language_radio_changed,
        inputs=[language_radio],
        outputs=[dialect_radio],
    )

    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[speaker_drop_down, language_radio, use_default_emb_or_custom_radio],
    )

    input_text = gr.Textbox(
        label="輸入文字",
        value="",
    )

    speed = gr.Slider(maximum=1.5, minimum=0.5, value=1, label="語速")

    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open("DEMO.md", encoding="utf-8") as tong:
        gr.Markdown(tong.read())

    gr.Interface(
        text_to_speech,
        inputs=[
            model_drop_down,
            use_default_emb_or_custom_radio,
            speaker_wav,
            speaker_drop_down,
            language_radio,
            dialect_radio,
            speed,
            input_text,
        ],
        outputs=[
            gr.Audio(interactive=False, label="合成語音", show_download_button=True),
        ],
        allow_flagging="auto",
    )

    gr.Examples(
        [
            [
                "預設語者",
                "formosan_dict_ami#wav/formosan_dict_ami/000002_2.31-6.09.wav",
                "阿美",
                "南勢",
                "mikadavu ku vavainay, i vavahiyan, a luma’",
            ],
            [
                "預設語者",
                "formosan_dict_ami#wav/formosan_dict_ami/000035_0.00-3.69.wav",
                "阿美",
                "南勢",
                "mikadavu ku vavainay, i vavahiyan, a luma’",
            ],
        ],
        label="範例",
        inputs=[
            use_default_emb_or_custom_radio,
            speaker_drop_down,
            language_radio,
            dialect_radio,
            input_text,
        ],
    )

demo.launch()