base_config: configs/tts/base.yaml
task_cls: tasks.tts.fs2.FastSpeech2Task

# model
hidden_size: 256
dropout: 0.1
encoder_type: fft # fft|tacotron|tacotron2|conformer
encoder_K: 8 # for tacotron encoder
decoder_type: fft # fft|rnn|conv|conformer
use_pos_embed: true

# duration
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5

# pitch and energy
use_pitch_embed: true
pitch_type: ph # frame|ph|cwt
use_uv: true
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_add_f0_loss: false
cwt_std_scale: 0.8
pitch_ar: false
#pitch_embed_type: 0q
pitch_loss: 'l1' # l1|l2|ssim
pitch_norm: log
use_energy_embed: false

# reference encoder and speaker embedding
use_spk_id: false
use_split_spk_id: false
use_spk_embed: false
use_var_enc: false
lambda_commit: 0.25
ref_norm_layer: bn
pitch_enc_hidden_stride_kernel:
  - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
  - 0,2,5
  - 0,2,5
dur_enc_hidden_stride_kernel:
  - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
  - 0,2,3
  - 0,1,3

# mel
mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5

# loss lambda
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1

# train and eval
pretrain_fs_ckpt: ''
warmup_updates: 2000
max_tokens: 32000
max_sentences: 100000
max_eval_sentences: 1
max_updates: 120000
num_valid_plots: 5
num_test_samples: 0
test_ids: []
use_gt_dur: false
use_gt_f0: false

# exp
dur_loss: mse # huber|mol
norm_type: gn
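
# Note (illustrative sketch, not part of the original config): each entry of
# pitch_enc_hidden_stride_kernel / dur_enc_hidden_stride_kernel above is a
# "conv_hidden_size,conv_stride,conv_kernel_size" string; a hypothetical
# Python-side reader could unpack one entry as:
#   hidden, stride, kernel = map(int, spec.split(','))
#   if hidden == 0:  # per the inline comment: 0 falls back to hidden_size
#       hidden = hparams['hidden_size']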
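
# Note (illustrative sketch): mel_loss accepts a single loss name or a
# "|"-separated weighted mix such as "l1:0.5|ssim:0.5"; a hypothetical parser:
#   weights = {}
#   for term in spec.split('|'):
#       name, _, w = term.partition(':')
#       weights[name] = float(w) if w else 1.0  # bare names default to 1.0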
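
# Usage note (assumption about the surrounding codebase, not stated in this
# file): values here override keys inherited from base_config
# (configs/tts/base.yaml), so only FastSpeech 2-specific settings are listed.
# A training invocation might look like:
#   python tasks/run.py --config <path to this config> --exp_name my_fs2_exp
# where the entry point, flag names, and my_fs2_exp are assumptions.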