---
# Model architecture configuration.
# Defines the FastSpeech2-style transformer, variance adaptor, and vocoder settings.

transformer:
  encoder_layer: 4
  encoder_head: 2
  encoder_hidden: 256
  decoder_layer: 6
  decoder_head: 2
  decoder_hidden: 256
  conv_filter_size: 1024
  conv_kernel_size: [9, 1]
  encoder_dropout: 0.2
  decoder_dropout: 0.2

variance_predictor:
  filter_size: 256
  kernel_size: 3
  dropout: 0.5

variance_embedding:
  pitch_quantization: "linear"  # support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
  energy_quantization: "linear"  # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
  n_bins: 256

# gst:
#   use_gst: false
#   conv_filters: [32, 32, 64, 64, 128, 128]
#   gru_hidden: 128
#   token_size: 128
#   n_style_token: 10
#   attn_head: 4

multi_speaker: true
multi_emotion: true

max_seq_len: 1000

vocoder:
  model: "HiFi-GAN"  # support 'HiFi-GAN', 'MelGAN'
  speaker: "universal"  # support 'LJSpeech', 'universal'