# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import subprocess

# Build the monotonic_align Cython extension in place before importing
# `inference`, which depends on it.
command_to_run = "cd ./modules/monotonic_align;mkdir -p monotonic_align;python setup.py build_ext --inplace;cd /home/user/app"
subprocess.check_output(command_to_run, shell=True, text=True)

import gradio as gr
import os
import inference

# Display name -> HiFi-TTS speaker ID used by the checkpoint.
SUPPORTED_SPEAKERS = {
    "Cori Samuel": "hifitts_92",
    "Phil Benson": "hifitts_6097",
    "Mike Pelton": "hifitts_6670",
    "Tony Oliva": "hifitts_6671",
    "Maria Kasper": "hifitts_8051",
    "John Van Stan": "hifitts_9017",
    "Helen Taylor": "hifitts_9136",
    "Sylviamb": "hifitts_11614",
    "Celine Major": "hifitts_11697",
    "LikeManyWaters": "hifitts_12787",
}


def tts_inference(input_text, target_speaker, duration):
    ### Target Speaker ###
    target_speaker = SUPPORTED_SPEAKERS[target_speaker]

    args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
    args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
    args_list += ["--speaker_name_1", target_speaker]
    # Plain TTS uses a single speaker, so --speaker_name_2 is omitted
    # (a literal None in the argument list would break argument parsing).
    args_list += ["--text", input_text]
    args_list += ["--mode", "single"]
    # The slider is inverted into a duration scale: a higher "Speaking Rate"
    # yields a smaller duration_control, i.e., faster speech.
    args_list += ["--duration_control", str(float(2.05 - duration))]
    args_list += ["--output_dir", "result"]
    args_list += ["--log_level", "debug"]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    result_file = os.path.join("result", "single", "test_pred.wav")
    return result_file


def tc_inference(input_text, target_speaker_1, target_speaker_2, confusion_degree, duration):
    ### Target Speakers ###
    target_speaker_1 = SUPPORTED_SPEAKERS[target_speaker_1]
    if target_speaker_2 is not None:
        target_speaker_2 = SUPPORTED_SPEAKERS[target_speaker_2]

    args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
    args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
    args_list += ["--speaker_name_1", target_speaker_1]
    if target_speaker_2 is not None:
        args_list += ["--speaker_name_2", target_speaker_2]
    # alpha interpolates between the two speakers' timbres (0 = speaker 1,
    # 1 = speaker 2).
    args_list += ["--alpha", str(float(confusion_degree))]
    args_list += ["--text", input_text]
    args_list += ["--mode", "single"]
    args_list += ["--duration_control", str(float(2.05 - duration))]
    args_list += ["--output_dir", "result"]
    args_list += ["--log_level", "debug"]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    source_speaker_1 = os.path.join("result", "single", "s1.wav")
    source_speaker_2 = os.path.join("result", "single", "s2.wav")
    result_file = os.path.join("result", "single", "test_pred.wav")
    return source_speaker_1, source_speaker_2, result_file


# Section 1: TTS
tts_demo_inputs = [
    gr.Textbox(
        label="Input Text",
        type="text",
        placeholder="Type something here...",
    ),
    gr.Radio(
        choices=list(SUPPORTED_SPEAKERS.keys()),
        label="Target Speaker",
        value="Cori Samuel",
    ),
    gr.Slider(
        0.1,
        2,
        value=1,
        step=0.05,
        label="Speaking Rate",
        info="The higher the value, the faster the speech.",
    ),
]
tts_demo_output = gr.Audio(label="Generated Speech")

# Section 2: Voice fusion (timbre interpolation)
tc_demo_inputs = [
    gr.Textbox(
        label="Input Text",
        type="text",
        placeholder="Type something here...",
    ),
    gr.Radio(
        choices=list(SUPPORTED_SPEAKERS.keys()),
        label="Target Speaker 1",
        value="Cori Samuel",
    ),
    gr.Radio(
        choices=list(SUPPORTED_SPEAKERS.keys()),
        label="Target Speaker 2",
        value="Phil Benson",
    ),
    gr.Slider(
        0,
        1,
        value=0.5,
        step=0.1,
        label="Fusion Degree",
        info="The higher the value, the more the generated voice resembles Target Speaker 2.",
    ),
    gr.Slider(
        0.1,
        2,
        value=1,
        step=0.05,
        label="Speaking Rate",
        info="The higher the value, the faster the speech.",
    ),
]
tc_demo_outputs = [
    gr.Audio(label="Target Speaker 1"),
    gr.Audio(label="Target Speaker 2"),
    gr.Audio(label="Interpolated Speech"),
]

with gr.Blocks() as demo:
    gr.Interface(
        fn=tts_inference,
        inputs=tts_demo_inputs,
        outputs=tts_demo_output,
        title="Amphion Text-to-Speech",
        description="This demo offers an Amphion TTS pretrained model (VITS) for you to explore.",
    )
    gr.Interface(
        fn=tc_inference,
        inputs=tc_demo_inputs,
        outputs=tc_demo_outputs,
        title="Voice Fusion",
        description="In this section, you can choose two speakers to create a voice mix. Adjust the ‘Fusion Degree’ slider to customize your desired mix ratio between the two speakers.",
    )

demo.queue()
demo.launch()

# if __name__ == "__main__":
#     demo.launch(share=True)
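# For reference, a single utterance can also be synthesized without Gradio by
# calling inference.main() with the same argument list the handlers build
# above. This is a minimal sketch, assuming the repo layout and checkpoint
# path used in this script; the speaker ID and sample text are placeholders.
#
#   os.environ["WORK_DIR"] = "./"
#   inference.main([
#       "--config", "./egs/tts/vits_hifitts/exp_config.json",
#       "--checkpoint_path", "./expdir/checkpoint/latest-checkpoint",
#       "--speaker_name_1", "hifitts_92",  # "Cori Samuel"
#       "--text", "Hello from Amphion.",
#       "--mode", "single",
#       "--duration_control", "1.0",       # smaller values speak faster
#       "--output_dir", "result",
#       "--log_level", "debug",
#   ])
#   # The generated audio is written to result/single/test_pred.wav.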