---
# Model configuration: network hyperparameters, conditioning switches and
# vocoder selection.
# NOTE(review): the nesting below is reconstructed from the key grouping;
# confirm it against the code that loads this file.

# Encoder/decoder stack hyperparameters.
transformer:
  encoder_layer: 4
  encoder_head: 2
  encoder_hidden: 256
  decoder_layer: 6
  decoder_head: 2
  decoder_hidden: 256
  conv_filter_size: 1024
  conv_kernel_size: [9, 1]
  encoder_dropout: 0.2
  decoder_dropout: 0.2

# Hyperparameters of the variance predictor.
variance_predictor:
  filter_size: 256
  kernel_size: 3
  dropout: 0.5

variance_embedding:
  # Supported: 'linear' or 'log'. 'log' is allowed only if the pitch values
  # are not normalized during preprocessing.
  pitch_quantization: "linear"
  # Supported: 'linear' or 'log'. 'log' is allowed only if the energy values
  # are not normalized during preprocessing.
  energy_quantization: "linear"
  n_bins: 256

# Disabled `gst` section, kept for reference; uncomment as a whole to use it.
# gst:
#   use_gst: false
#   conv_filters: [32, 32, 64, 64, 128, 128]
#   gru_hidden: 128
#   token_size: 128
#   n_style_token: 10
#   attn_head: 4

# Conditioning switches.
multi_speaker: true
multi_emotion: false
bert_emotion: false

max_seq_len: 1000

vocoder:
  # Supported: 'HiFi-GAN', 'MelGAN'.
  model: "HiFi-GAN"
  # Supported: 'LJSpeech', 'universal'.
  speaker: "universal"