---
# Hyperparameter configuration for the task class named in `task_cls`
# (tasks.tts.pe.PitchExtractionTask).
#
# Top-level keys are kept in alphabetical order. Every value below is a
# literal setting; nothing here is computed.
#
# NOTE(review): `base_config` lists configs/tts/lj/fs2.yaml. Presumably the
# loader reads that file first and lets the keys below override it -- confirm
# against the config loader.
#
# NOTE(review): `binary_data_dir` and `work_dir` are named after "xiaoma",
# while `processed_data_dir`, `raw_data_dir` and `pre_align_cls` point at
# LJSpeech. Confirm this mix is intentional.

accumulate_grad_batches: 1
audio_num_mel_bins: 80
audio_sample_rate: 24000
base_config:
  - configs/tts/lj/fs2.yaml
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_f0cwt: true
  with_spk_embed: true
  with_txt: true
  with_wav: false
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
binary_data_dir: data/binary/xiaoma1022_24k_128hop
check_val_every_n_epoch: 10
clip_grad_norm: 1
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
decoder_type: fft
dict_dir: ''
dropout: 0.1
ds_workers: 4
# Entries are comma-separated triples; quoted so they are unambiguously
# strings for every YAML parser.
dur_enc_hidden_stride_kernel:
  - '0,2,3'
  - '0,2,3'
  - '0,1,3'
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 2
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft
endless_ds: true
ffn_act: gelu
ffn_padding: SAME
fft_size: 512
fmax: 12000
fmin: 30
gen_dir_name: ''
hidden_size: 256
hop_size: 128
infer: false
lambda_commit: 0.25
lambda_energy: 0.1
lambda_f0: 1.0
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_uv: 1.0
lambda_word_dur: 1.0
load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 2.0
max_epochs: 1000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 5000
max_input_tokens: 1550
max_sentences: 100000
max_tokens: 20000
max_updates: 60000
mel_loss: l1
mel_vmax: 1.5
mel_vmin: -6
min_level_db: -120
norm_type: gn
num_ckpt_keep: 3
num_heads: 2
num_sanity_val_steps: 5
num_spk: 1
num_test_samples: 20
num_valid_plots: 10
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
out_wav_norm: false
pitch_ar: false
# Same comma-separated triple format as dur_enc_hidden_stride_kernel.
pitch_enc_hidden_stride_kernel:
  - '0,2,5'
  - '0,2,5'
  - '0,2,5'
pitch_extractor_conv_layers: 2
pitch_loss: l1
pitch_norm: log
pitch_type: frame
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: en
  use_sox: false
  use_tone: true
pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
prenet_dropout: 0.5
prenet_hidden_size: 256
pretrain_fs_ckpt: ''
processed_data_dir: data/processed/ljspeech
profile_infer: false
raw_data_dir: data/raw/LJSpeech-1.1
ref_norm_layer: bn
reset_phone_dict: true
save_best: false
save_ckpt: true
save_codes:
  - configs
  - modules
  - tasks
  - utils
  - usr
save_f0: false
save_gt: false
seed: 1234
sort_by_len: true
stop_token_weight: 5.0
task_cls: tasks.tts.pe.PitchExtractionTask
test_ids:
  - 68
  - 70
  - 74
  - 87
  - 110
  - 172
  - 190
  - 215
  - 231
  - 294
  - 316
  - 324
  - 402
  - 422
  - 485
  - 500
  - 505
  - 508
  - 509
  - 519
test_input_dir: ''
test_num: 523
test_set_name: test
train_set_name: train
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_pitch_embed: true
use_pos_embed: true
use_spk_embed: false
use_spk_id: false
use_split_spk_id: false
use_uv: true
use_var_enc: false
val_check_interval: 2000
valid_num: 348
valid_set_name: valid
vocoder: pwg
vocoder_ckpt: ''
warmup_updates: 2000
weight_decay: 0
win_size: 512
work_dir: checkpoints/0102_xiaoma_pe