# Configuration layered on top of the two TTS base configs listed in
# `base_config`; it selects the singing pre-align and binarizer classes.
# NOTE(review): presumably keys defined here override the inherited ones;
# confirm against the config loader.
#
# NOTE(review): this file was reconstructed from a single collapsed line.
# `use_tone`, `forced_align` and `use_sox` are assumed to be the children of
# `pre_align_args`, and every other key is assumed to be top-level; confirm.
base_config:
  - configs/tts/base.yaml
  - configs/tts/base_zh.yaml

# Dataset selection is left empty here.
# NOTE(review): presumably filled in by dataset-specific configs; confirm.
datasets: []
test_prefixes: []
test_num: 0
valid_num: 0

# Dotted paths of the pre-alignment and binarization classes.
pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
binarizer_cls: data_gen.singing.binarize.SingingBinarizer
pre_align_args:
  use_tone: false  # for ZH
  forced_align: mfa
  use_sox: true

# Audio / spectrogram settings.
hop_size: 128  # Hop size.
fft_size: 512  # FFT size.
win_size: 512  # Window size.
max_frames: 8000
fmin: 50  # Minimum freq in mel basis calculation.
fmax: 11025  # Maximum frequency in mel basis calculation.
pitch_type: frame

# Model and loss settings.
hidden_size: 256
mel_loss: "ssim:0.5|l1:0.5"
# NOTE(review): the lambda_* entries look like loss weights, all 0.0 in this
# config; confirm with the training code.
lambda_f0: 0.0
lambda_uv: 0.0
lambda_energy: 0.0
lambda_ph_dur: 0.0
lambda_sent_dur: 0.0
lambda_word_dur: 0.0
predictor_grad: 0.0
use_spk_embed: true
use_spk_id: false

# Training and data-loading settings.
max_tokens: 20000
max_updates: 400000
num_spk: 100
save_f0: true
# NOTE(review): "gt" presumably means ground truth (duration / f0); confirm.
use_gt_dur: true
use_gt_f0: true