"""Gradio tab for text-to-speech generation.

Defines :func:`create_tts_interface`, which lays out the sampling, style,
speaker, seed, prompt, text-input, refine and generate controls and wires
them to the ``refine_text`` / ``tts_generate`` handlers imported from
``modules.webui.webui_utils``.
"""

import gradio as gr
import torch

from modules.webui.webui_utils import (
    get_speakers,
    get_styles,
    load_spk_info,
    refine_text,
    tts_generate,
)
from modules.webui import webui_config
from modules.webui.examples import example_texts
from modules import config

default_text_content = """
chat T T S 是一款强大的对话式文本转语音模型。它有中英混读和多说话人的能力。
""".strip()

# Exclusive upper bound handed to ``torch.randint`` when rolling a random
# seed; also used as the maximum accepted by the inference-seed number box.
_SEED_UPPER_BOUND = 2**32 - 1


def _speaker_display_name(spk) -> str:
    """Return the dropdown label for *spk*.

    *spk* is any object exposing ``gender`` and ``name`` attributes. When the
    gender is the ``"*"`` wildcard or empty, the bare name is used; otherwise
    the label is ``"<gender> : <name>"``.
    """
    if spk.gender in ("*", ""):
        return spk.name
    return f"{spk.gender} : {spk.name}"


def _speaker_from_choice(choice: str) -> str:
    """Map a speaker dropdown label to the value placed in the speaker textbox.

    Labels starting with ``"*"`` (e.g. ``"*random"``) map to ``"-1"``. Any
    other label yields the text after its last ``":"``, stripped, which undoes
    the ``"<gender> : <name>"`` format built by :func:`_speaker_display_name`.
    """
    return "-1" if choice.startswith("*") else choice.split(":")[-1].strip()


def _random_seed() -> int:
    """Draw a random integer seed in ``[0, _SEED_UPPER_BOUND)`` using torch."""
    return int(torch.randint(0, _SEED_UPPER_BOUND, (1,)).item())


def create_tts_interface():
    """Build the TTS tab: create every component and register its events.

    The layout is a single row of three columns: generation settings on the
    left, text input / examples / audio output in the middle, and the refine
    and generate controls on the right.

    Components are created through Gradio layout context managers, so this is
    presumably meant to be called inside an active ``gr.Blocks`` context
    (NOTE(review): confirm against the caller).

    Returns:
        None. The function is used for its side effect of creating
        components and event listeners.
    """
    speakers = get_speakers()

    speaker_names = ["*random"] + [
        _speaker_display_name(speaker) for speaker in speakers
    ]

    styles = ["*auto"] + [s.get("name") for s in get_styles()]

    with gr.Row():
        # ---- Left column: sampling, style, speaker, seed, prompts ----
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("🎛️Sampling")
                temperature_input = gr.Slider(
                    0.01, 2.0, value=0.3, step=0.01, label="Temperature"
                )
                top_p_input = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Top P")
                top_k_input = gr.Slider(1, 50, value=20, step=1, label="Top K")
                batch_size_input = gr.Slider(
                    1,
                    webui_config.max_batch_size,
                    value=4,
                    step=1,
                    label="Batch Size",
                )

            with gr.Row():
                with gr.Group():
                    gr.Markdown("🎭Style")
                    gr.Markdown("- 后缀为 `_p` 表示带prompt,效果更强但是影响质量")
                    style_input_dropdown = gr.Dropdown(
                        choices=styles,
                        # label="Choose Style",
                        interactive=True,
                        show_label=False,
                        value="*auto",
                    )

            with gr.Row():
                with gr.Group():
                    gr.Markdown("🗣️Speaker")
                    with gr.Tabs():
                        with gr.Tab(label="Pick"):
                            spk_input_text = gr.Textbox(
                                label="Speaker (Text or Seed)",
                                value="female2",
                                show_label=False,
                            )
                            spk_input_dropdown = gr.Dropdown(
                                choices=speaker_names,
                                # label="Choose Speaker",
                                interactive=True,
                                value="female : female2",
                                show_label=False,
                            )
                            spk_rand_button = gr.Button(
                                value="🎲",
                                # tooltip="Random Seed",
                                variant="secondary",
                            )

                            # Picking a speaker mirrors its name (or "-1"
                            # for the "*..." entries) into the textbox, which
                            # is the value actually sent to tts_generate.
                            spk_input_dropdown.change(
                                fn=_speaker_from_choice,
                                inputs=[spk_input_dropdown],
                                outputs=[spk_input_text],
                            )

                            # The textbox holds text, so the rolled seed is
                            # written back as a string. The current value is
                            # received but ignored.
                            spk_rand_button.click(
                                lambda _current: str(_random_seed()),
                                inputs=[spk_input_text],
                                outputs=[spk_input_text],
                            )

                        with gr.Tab(label="Upload"):
                            spk_file_upload = gr.File(label="Speaker (Upload)")

                            gr.Markdown("📝Speaker info")
                            infos = gr.Markdown("empty")

                            spk_file_upload.change(
                                fn=load_spk_info,
                                inputs=[spk_file_upload],
                                outputs=[infos],
                            )

            with gr.Group():
                gr.Markdown("💃Inference Seed")
                infer_seed_input = gr.Number(
                    value=42,
                    label="Inference Seed",
                    show_label=False,
                    minimum=-1,
                    maximum=_SEED_UPPER_BOUND,
                )
                infer_seed_rand_button = gr.Button(
                    value="🎲",
                    # tooltip="Random Seed",
                    variant="secondary",
                )

            # Hidden checkbox: always forwarded to tts_generate with its
            # default value, but not shown to the user.
            use_decoder_input = gr.Checkbox(
                value=True, label="Use Decoder", visible=False
            )

            with gr.Group():
                gr.Markdown("🔧Prompt engineering")
                prompt1_input = gr.Textbox(label="Prompt 1")
                prompt2_input = gr.Textbox(label="Prompt 2")
                prefix_input = gr.Textbox(label="Prefix")

                # Only shown when the experimental flag is set. It is not
                # part of the tts_generate inputs below.
                prompt_audio = gr.File(
                    label="prompt_audio", visible=webui_config.experimental
                )

            # The number box takes the rolled seed as an int. The current
            # value is received but ignored.
            infer_seed_rand_button.click(
                lambda _current: _random_seed(),
                inputs=[infer_seed_input],
                outputs=[infer_seed_input],
            )

        # ---- Middle column: text input, examples, audio output ----
        with gr.Column(scale=4):
            with gr.Group():
                input_title = gr.Markdown(
                    "📝Text Input",
                    elem_id="input-title",
                )
                gr.Markdown(f"- 字数限制{webui_config.tts_max:,}字,超过部分截断")
                gr.Markdown("- 如果尾字吞字不读,可以试试结尾加上 `[lbreak]`")
                gr.Markdown(
                    "- If the input text is all in English, it is recommended to check disable_normalize"
                )
                text_input = gr.Textbox(
                    show_label=False,
                    label="Text to Speech",
                    lines=10,
                    placeholder="输入文本或选择示例",
                    elem_id="text-input",
                    value=default_text_content,
                )
                # TODO: show a live character count in `input_title`, e.g.
                # "📝Text Input (<n> char)" driven by text_input.change.
                # Easy to write, but it would trigger the loading state and
                # needs a round trip to the backend on every edit.

                with gr.Row():
                    control_tokens = [
                        "[laugh]",
                        "[uv_break]",
                        "[v_break]",
                        "[lbreak]",
                    ]

                    for tk in control_tokens:
                        t_btn = gr.Button(tk)
                        # `tk=tk` binds the token at definition time, so
                        # each button appends its own token rather than the
                        # last one of the loop.
                        t_btn.click(
                            lambda text, tk=tk: text + " " + tk,
                            inputs=[text_input],
                            outputs=[text_input],
                        )

            with gr.Group():
                gr.Markdown("🎄Examples")
                sample_dropdown = gr.Dropdown(
                    choices=[sample["text"] for sample in example_texts],
                    show_label=False,
                    value=None,
                    interactive=True,
                )
                # Selecting an example copies its text into the input box.
                sample_dropdown.change(
                    fn=lambda x: x,
                    inputs=[sample_dropdown],
                    outputs=[text_input],
                )

            with gr.Group():
                gr.Markdown("🎨Output")
                tts_output = gr.Audio(label="Generated Audio", format="mp3")

        # ---- Right column: refine and generate controls ----
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("🎶Refiner")
                refine_prompt_input = gr.Textbox(
                    label="Refine Prompt",
                    value="[oral_2][laugh_0][break_6]",
                )
                refine_button = gr.Button("✍️Refine Text")

            with gr.Group():
                gr.Markdown("🔊Generate")
                disable_normalize_input = gr.Checkbox(
                    value=False, label="Disable Normalize"
                )

                with gr.Group():
                    gr.Markdown("💪🏼Enhance")
                    enable_enhance = gr.Checkbox(value=True, label="Enable Enhance")
                    enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise")
                tts_button = gr.Button(
                    "🔊Generate Audio",
                    variant="primary",
                    elem_classes="big-button",
                )

    # Refining rewrites the input text in place.
    refine_button.click(
        refine_text,
        inputs=[text_input, refine_prompt_input],
        outputs=[text_input],
    )

    # The order of `inputs` is the positional argument order passed to
    # tts_generate; keep the two in sync.
    tts_button.click(
        tts_generate,
        inputs=[
            text_input,
            temperature_input,
            top_p_input,
            top_k_input,
            spk_input_text,
            infer_seed_input,
            use_decoder_input,
            prompt1_input,
            prompt2_input,
            prefix_input,
            style_input_dropdown,
            disable_normalize_input,
            batch_size_input,
            enable_enhance,
            enable_de_noise,
            spk_file_upload,
        ],
        outputs=tts_output,
    )