# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree
import subprocess

# Build the monotonic_align Cython extension required by the VITS recipe.
# This must finish before `inference` is imported below. The trailing
# "cd /home/user/app" returns to the Space's working directory.
command_to_run = "cd ./modules/monotonic_align; mkdir -p monotonic_align; python setup.py build_ext --inplace; cd /home/user/app"
subprocess.check_output(command_to_run, shell=True, text=True)
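
# A hedged alternative (an assumption, not the original behavior): running the
# build with subprocess.run and captured output would surface compiler errors
# in the Space logs instead of an opaque CalledProcessError:
#
#   build = subprocess.run(command_to_run, shell=True, text=True, capture_output=True)
#   if build.returncode != 0:
#       raise RuntimeError("monotonic_align build failed:\n" + build.stderr)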

import os

import gradio as gr

import inference

# Display name -> HiFi-TTS speaker ID used by the pretrained VITS checkpoint.
SUPPORTED_SPEAKERS = {
    "Cori Samuel": "hifitts_92",
    "Phil Benson": "hifitts_6097",
    "Mike Pelton": "hifitts_6670",
    "Tony Oliva": "hifitts_6671",
    "Maria Kasper": "hifitts_8051",
    "John Van Stan": "hifitts_9017",
    "Helen Taylor": "hifitts_9136",
    "Sylviamb": "hifitts_11614",
    "Celine Major": "hifitts_11697",
    "LikeManyWaters": "hifitts_12787",
}

def tts_inference(
    input_text,
    target_speaker,
    speaking_rate
):
    ### Target Speaker ###
    # Map the display name to the HiFi-TTS speaker ID expected by the model.
    target_speaker = SUPPORTED_SPEAKERS[target_speaker]

    args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
    args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
    args_list += ["--speaker_name_1", target_speaker]
    args_list += ["--speaker_name_2", None]  # plain TTS uses a single speaker
    args_list += ["--text", input_text]
    args_list += ["--mode", "single"]
    # The slider is a speaking rate (higher = faster), while the model's
    # duration control scales durations (higher = slower), so the value is
    # inverted across the slider's 0.1-2 range.
    args_list += ["--duration_control", str(float(2.05 - speaking_rate))]
    args_list += ["--output_dir", "result"]
    args_list += ["--log_level", "debug"]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    result_file = "result/single/test_pred.wav"
    return result_file
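
# Usage sketch (hypothetical values; in the app this is invoked by Gradio
# rather than called directly): tts_inference("Hello from Amphion.",
# "Cori Samuel", 1.0) maps the name to "hifitts_92", runs inference, and
# returns the predicted wav path for the audio widget to play.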

def tc_inference(
    input_text,
    target_speaker_1,
    target_speaker_2,
    fusion_degree,
    speaking_rate
):
    ### Target Speakers ###
    # Map the display names to HiFi-TTS speaker IDs.
    target_speaker_1 = SUPPORTED_SPEAKERS[target_speaker_1]
    if target_speaker_2 is not None:
        target_speaker_2 = SUPPORTED_SPEAKERS[target_speaker_2]

    args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
    args_list += ["--checkpoint_path", "./expdir/checkpoint/latest-checkpoint"]
    args_list += ["--speaker_name_1", target_speaker_1]
    args_list += ["--speaker_name_2", target_speaker_2]
    # --alpha interpolates between the two speaker embeddings
    # (0 = speaker 1, 1 = speaker 2); the UI exposes it as "Fusion Degree".
    args_list += ["--alpha", str(float(fusion_degree))]
    args_list += ["--text", input_text]
    args_list += ["--mode", "single"]
    # Invert the speaking-rate slider into a duration scale (see above).
    args_list += ["--duration_control", str(float(2.05 - speaking_rate))]
    args_list += ["--output_dir", "result"]
    args_list += ["--log_level", "debug"]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    source_speaker_1 = "result/single/s1.wav"
    source_speaker_2 = "result/single/s2.wav"
    result_file = "result/single/test_pred.wav"
    return source_speaker_1, source_speaker_2, result_file
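
# Usage sketch (hypothetical values): tc_inference("Hello.", "Cori Samuel",
# "Phil Benson", 0.5, 1.0) renders each reference voice plus a 50/50 embedding
# interpolation, returning the three wav paths in display order.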

# Section 1: TTS
tts_demo_inputs = [
    gr.Textbox(
        label="Input Text",
        type="text",
        placeholder="Type something here..."
    ),
    gr.Radio(
        choices=list(SUPPORTED_SPEAKERS.keys()),
        label="Target Speaker",
        value="Cori Samuel"
    ),
    gr.Slider(
        0.1,
        2,
        value=1,
        step=0.05,
        label="Speaking Rate",
        info="Larger values produce faster speech.",
    )
]
tts_demo_output = gr.Audio(label="Generated Speech")

# Section 2: Timbre fusion (voice interpolation)
tc_demo_inputs = [
    gr.Textbox(
        label="Input Text",
        type="text",
        placeholder="Type something here..."
    ),
    gr.Radio(
        choices=list(SUPPORTED_SPEAKERS.keys()),
        label="Target Speaker 1",
        value="Cori Samuel"
    ),
    gr.Radio(
        choices=list(SUPPORTED_SPEAKERS.keys()),
        label="Target Speaker 2",
        value="Phil Benson"
    ),
    gr.Slider(
        0,
        1,
        value=0.5,
        step=0.1,
        label="Fusion Degree",
        info="Larger values make the generated voice more similar to Speaker 2.",
    ),
    gr.Slider(
        0.1,
        2,
        value=1,
        step=0.05,
        label="Speaking Rate",
        info="Larger values produce faster speech.",
    )
]
tc_demo_outputs = [
    gr.Audio(label="Target Speaker 1"),
    gr.Audio(label="Target Speaker 2"),
    gr.Audio(label="Interpolated Speech")
]

with gr.Blocks() as demo:
    gr.Interface(
        fn=tts_inference,
        inputs=tts_demo_inputs,
        outputs=tts_demo_output,
        title="Amphion Text-to-Speech",
        description="This demo offers a pretrained Amphion TTS model (VITS) for you to explore."
    )
    gr.Interface(
        fn=tc_inference,
        inputs=tc_demo_inputs,
        outputs=tc_demo_outputs,
        title="Voice Fusion",
        description="In this section, you can choose two speakers to create a voice mix. Adjust the 'Fusion Degree' slider to set your desired mix ratio between the two speakers."
    )

demo.queue()
demo.launch()