# This EXPERIMENTAL configuration is for ESPnet2 to finetune
# Tacotron2 + HiFiGAN vocoder jointly. To run
# this config, you need to specify "--tts_task gan_tts"
# option for tts.sh at least and use 22050 hz audio as the
# training data (mainly tested on LJspeech).
# This configuration tested on 4 GPUs with 12GB GPU memory.
# It takes less than 1 week to finish the training but
# 100k iters model should generate reasonable results.

# YOU NEED TO MODIFY THE "*_params" AND "init_param" SECTIONS
# IF YOU WANT TO USE YOUR OWN PRETRAINED MODELS.

##########################################################
#                  TTS MODEL SETTING                     #
##########################################################
tts: joint_text2wav
tts_conf:
  # copied from pretrained model's config.yaml
  text2mel_type: tacotron2
  text2mel_params:
    embed_dim: 512                   # char or phn embedding dimension
    elayers: 1                       # number of blstm layers in encoder
    eunits: 512                      # number of blstm units
    econv_layers: 3                  # number of convolutional layers in encoder
    econv_chans: 512                 # number of channels in convolutional layer
    econv_filts: 5                   # filter size of convolutional layer
    atype: location                  # attention function type
    adim: 512                        # attention dimension
    aconv_chans: 32                  # number of channels in convolutional layer of attention
    aconv_filts: 15                  # filter size of convolutional layer of attention
    cumulate_att_w: true             # whether to cumulate attention weight
    dlayers: 2                       # number of lstm layers in decoder
    dunits: 1024                     # number of lstm units in decoder
    prenet_layers: 2                 # number of layers in prenet
    prenet_units: 256                # number of units in prenet
    postnet_layers: 5                # number of layers in postnet
    postnet_chans: 512               # number of channels in postnet
    postnet_filts: 5                 # filter size of postnet layer
    output_activation: null          # activation function for the final output
    use_batch_norm: true             # whether to use batch normalization in encoder
    use_concate: true                # whether to concatenate encoder embedding with decoder outputs
    use_residual: false              # whether to use residual connection in encoder
    spk_embed_dim: 192               # speaker embedding dimension
    spk_embed_integration_type: add  # how to integrate speaker embedding
    dropout_rate: 0.5                # dropout rate
    zoneout_rate: 0.1                # zoneout rate
    reduction_factor: 1              # reduction factor
    use_masking: true                # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 10.0             # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: true       # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4      # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0     # strength of guided attention loss

  # copied from pretrained vocoder's config.yaml
  vocoder_type: hifigan_generator
  vocoder_params:
    bias: true
    channels: 512
    in_channels: 80
    kernel_size: 7
    nonlinear_activation: LeakyReLU
    nonlinear_activation_params:
      negative_slope: 0.1
    out_channels: 1
    resblock_dilations:
      - [1, 3, 5]
      - [1, 3, 5]
      - [1, 3, 5]
    resblock_kernel_sizes: [3, 7, 11]
    upsample_kernel_sizes: [16, 16, 4, 4]
    upsample_scales: [8, 8, 2, 2]
    use_additional_convs: true
    use_weight_norm: true

  # copied from pretrained vocoder's config.yaml
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
  discriminator_params:
    follow_official_norm: true
    period_discriminator_params:
      bias: true
      channels: 32
      downsample_scales: [3, 3, 3, 3, 1]
      in_channels: 1
      kernel_sizes: [5, 3]
      max_downsample_channels: 1024
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
      use_spectral_norm: false
      use_weight_norm: true
    periods: [2, 3, 5, 7, 11]
    scale_discriminator_params:
      bias: true
      channels: 128
      downsample_scales: [4, 4, 4, 4, 1]
      in_channels: 1
      kernel_sizes: [15, 41, 5, 3]
      max_downsample_channels: 1024
      max_groups: 16
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
    scale_downsample_pooling: AvgPool1d
    scale_downsample_pooling_params:
      kernel_size: 4
      padding: 2
      stride: 2
    scales: 3

  # loss function related
  generator_adv_loss_params:
    average_by_discriminators: false  # whether to average loss value by #discriminators
    loss_type: mse                    # loss type, "mse" or "hinge"
  discriminator_adv_loss_params:
    average_by_discriminators: false  # whether to average loss value by #discriminators
    loss_type: mse                    # loss type, "mse" or "hinge"
  use_feat_match_loss: true           # whether to use feat match loss
  feat_match_loss_params:
    average_by_discriminators: false  # whether to average loss value by #discriminators
    average_by_layers: false          # whether to average loss value by #layers of each discriminator
    include_final_outputs: true       # whether to include final outputs for loss calculation
  use_mel_loss: true                  # whether to use mel-spectrogram loss
  mel_loss_params:
    fs: 22050          # must be the same as the training data
    n_fft: 1024        # fft points
    hop_length: 256    # hop size
    win_length: null   # window length
    window: hann       # window type
    n_mels: 80         # number of Mel basis
    fmin: 0            # minimum frequency for Mel basis
    fmax: null         # maximum frequency for Mel basis
    log_base: null     # null represents natural log
  lambda_text2mel: 1.0    # loss scaling coefficient for text2mel loss
  lambda_adv: 1.0         # loss scaling coefficient for adversarial loss
  lambda_mel: 45.0        # loss scaling coefficient for Mel loss
  lambda_feat_match: 2.0  # loss scaling coefficient for feat match loss

  # others
  sampling_rate: 22050           # needed in the inference for saving wav
  segment_size: 32               # segment size for random windowed discriminator
  cache_generator_outputs: true  # whether to cache generator outputs in the training

# extra module for additional inputs
# (left disabled, exactly as in the original configuration)
# pitch_extract: dio           # pitch extractor type
# pitch_extract_conf:
#   reduction_factor: 1
# pitch_normalize: global_mvn  # normalizer for the pitch feature
# energy_extract: energy       # energy extractor type
# energy_extract_conf:
#   reduction_factor: 1
# energy_normalize: global_mvn # normalizer for the energy feature

# initialization (might need to modify for your own pretrained model)
init_param:
  - exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel
  - exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder
  - exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator

##########################################################
#            OPTIMIZER & SCHEDULER SETTING               #
##########################################################
# optimizer setting for generator
optim: adam
optim_conf:
  lr: 1.25e-5
  betas: [0.5, 0.9]
  weight_decay: 0.0
scheduler: exponentiallr
scheduler_conf:
  gamma: 0.999875

# optimizer setting for discriminator
optim2: adam
optim2_conf:
  lr: 1.25e-5
  betas: [0.5, 0.9]
  weight_decay: 0.0
scheduler2: exponentiallr
scheduler2_conf:
  gamma: 0.999875
generator_first: true  # whether to start updating generator first

##########################################################
#                OTHER TRAINING SETTING                  #
##########################################################
# num_iters_per_epoch: 1000  # number of iterations per epoch
max_epoch: 500               # number of epochs
accum_grad: 1                # gradient accumulation
batch_bins: 1600000          # batch bins (feats_type=raw)
batch_type: numel            # how to make batch
grad_clip: -1                # gradient clipping norm
grad_noise: false            # whether to use gradient noise injection
sort_in_batch: descending    # how to sort data in making batch
sort_batch: descending       # how to sort created batches
num_workers: 4               # number of workers of data loader
use_amp: false               # whether to use pytorch amp
log_interval: 50             # log interval in iterations
keep_nbest_models: 5         # number of models to keep
num_att_plot: 3              # number of attention figures to be saved in every check
seed: 777                    # random seed number
patience: null               # patience for early stopping
unused_parameters: true      # needed for multi gpu case
best_model_criterion:        # criterion to save the best models
  - [valid, text2mel_loss, min]
  - [train, text2mel_loss, min]
  - [train, total_count, max]
# setting to false accelerates the training speed but makes it non-deterministic
# in the case of GAN-TTS training, we strongly recommend setting to false
cudnn_deterministic: false
# setting to true might accelerate the training speed but sometimes decrease it
# therefore, we set to false as a default (recommend trying both cases)
cudnn_benchmark: false