# Training configuration dump for a `tts: vits` model with x-vector speaker
# embeddings (`spembs` inputs). The tuning file it was produced from is
# recorded under `config:` below.
# NOTE(review): key layout looks like an ESPnet2 TTS `config.yaml` - confirm
# against the toolkit release named in `version:` before reusing.
---
accum_grad: 1
allow_variable_data_keys: false
batch_bins: 5000000
batch_size: 20
batch_type: numel
best_model_criterion:
  - - train
    - total_count
    - max
bpemodel: null
chunk_length: 500
chunk_shift_ratio: 0.5
cleaner: tacotron
collect_stats: false
config: ./conf/tuning/train_xvector_vits.yaml
cudnn_benchmark: false
cudnn_deterministic: false
cudnn_enabled: true
detect_anomaly: false

# Distributed-training settings.
dist_backend: nccl
dist_init_method: env://
dist_launcher: null
dist_master_addr: localhost
dist_master_port: 60056
dist_rank: 0
dist_world_size: 4
distributed: true
dry_run: false
early_stopping_criterion:
  - valid
  - loss
  - min
energy_extract: null
energy_extract_conf: {}
energy_normalize: null
energy_normalize_conf: {}

# Feature extraction. n_fft / hop_length match the values repeated in
# tts_conf.mel_loss_params below.
feats_extract: linear_spectrogram
feats_extract_conf:
  hop_length: 256
  n_fft: 1024
  win_length: null
fold_length:
  - 150
  - 204800
freeze_param: []
g2p: g2p_en_no_space
generator_first: false
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
ignore_init_mismatch: false
init_param: []
iterator_type: sequence
keep_nbest_models: 10
local_rank: 0
log_interval: 50
log_level: INFO
max_cache_fd: 32
max_cache_size: 0.0
max_epoch: 100
model_conf: {}
multiple_iterator: false
multiprocessing_distributed: true
ngpu: 1
no_forward_run: false
non_linguistic_symbols: null
normalize: null
normalize_conf: {}
num_att_plot: 3
num_cache_chunks: 1024
num_iters_per_epoch: 10000
num_workers: 4
odim: null

# Two optimizer/scheduler pairs are configured with identical settings.
# NOTE(review): presumably one pair per GAN sub-network (generator /
# discriminator) - confirm which is which in the trainer.
optim: adamw
optim2: adamw
optim2_conf:
  betas:
    - 0.8
    - 0.99
  eps: 1.0e-09
  lr: 0.0002
  weight_decay: 0.0
optim_conf:
  betas:
    - 0.8
    - 0.99
  eps: 1.0e-09
  lr: 0.0002
  weight_decay: 0.0
output_dir: exp/tts_train_xvector_vits_raw_phn_tacotron_g2p_en_no_space
patience: null
pitch_extract: null
pitch_extract_conf: {}
pitch_normalize: null
pitch_normalize_conf: {}
pretrain_path: null
print_config: false
required:
  - output_dir
  - token_list
resume: true
scheduler: exponentiallr
scheduler2: exponentiallr
scheduler2_conf:
  gamma: 0.999875
scheduler_conf:
  gamma: 0.999875
seed: 777
sharded_ddp: false
sort_batch: descending
sort_in_batch: descending

# Phoneme vocabulary: 86 entries, matching tts_conf.generator_params.vocabs.
# Entry order is significant (position = token id); do not sort or dedupe.
# NOTE(review): the first two entries and the last entry were empty in the
# collapsed source, which looks like angle-bracket tokens stripped by text
# extraction. They are restored here as '<blank>', '<unk>' and '<sos/eos>'
# on the assumption that this is the toolkit's usual special-token layout -
# verify against the original tokens file before training or inference.
# 'N', 'Y' and the punctuation tokens are quoted so that no YAML 1.1
# resolver can read them as booleans, document markers or indicators.
token_list:
  - '<blank>'
  - '<unk>'
  - AH0
  - T
  - 'N'
  - D
  - S
  - R
  - L
  - IH1
  - DH
  - M
  - K
  - Z
  - EH1
  - AE1
  - IH0
  - AH1
  - W
  - ','
  - HH
  - ER0
  - P
  - IY1
  - V
  - F
  - B
  - UW1
  - AA1
  - AY1
  - AO1
  - '.'
  - EY1
  - IY0
  - OW1
  - NG
  - G
  - SH
  - 'Y'
  - AW1
  - CH
  - ER1
  - UH1
  - TH
  - JH
  - ''''
  - '?'
  - OW0
  - EH2
  - '!'
  - IH2
  - OY1
  - EY2
  - AY2
  - EH0
  - UW0
  - AA2
  - AE2
  - OW2
  - AO2
  - AE0
  - AH2
  - ZH
  - AA0
  - UW2
  - IY2
  - AY0
  - AO0
  - AW2
  - EY0
  - UH2
  - ER2
  - AW0
  - '...'
  - UH0
  - OY2
  - '. . .'
  - OY0
  - '. . . .'
  - '..'
  - '. ...'
  - '. .'
  - '. . . . .'
  - '.. ..'
  - '... .'
  - '<sos/eos>'
token_type: phn

# Each entry is a three-item list; the items read as path, name, type.
train_data_path_and_name_and_type:
  - - dump/22k/raw/train-clean-460/text
    - text
    - text
  - - dump/22k/raw/train-clean-460/wav.scp
    - speech
    - sound
  - - dump/22k/xvector/train-clean-460/xvector.scp
    - spembs
    - kaldi_ark
train_dtype: float32
train_shape_file:
  - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/text_shape.phn
  - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/speech_shape

tts: vits
tts_conf:
  cache_generator_outputs: true
  discriminator_adv_loss_params:
    average_by_discriminators: false
    loss_type: mse
  discriminator_params:
    follow_official_norm: false
    period_discriminator_params:
      bias: true
      channels: 32
      downsample_scales:
        - 3
        - 3
        - 3
        - 3
        - 1
      in_channels: 1
      kernel_sizes:
        - 5
        - 3
      max_downsample_channels: 1024
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
      use_spectral_norm: false
      use_weight_norm: true
    periods:
      - 2
      - 3
      - 5
      - 7
      - 11
    scale_discriminator_params:
      bias: true
      channels: 128
      downsample_scales:
        - 2
        - 2
        - 4
        - 4
        - 1
      in_channels: 1
      kernel_sizes:
        - 15
        - 41
        - 5
        - 3
      max_downsample_channels: 1024
      max_groups: 16
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
      use_spectral_norm: false
      use_weight_norm: true
    scale_downsample_pooling: AvgPool1d
    scale_downsample_pooling_params:
      kernel_size: 4
      padding: 2
      stride: 2
    scales: 1
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
  feat_match_loss_params:
    average_by_discriminators: false
    average_by_layers: false
    include_final_outputs: true
  generator_adv_loss_params:
    average_by_discriminators: false
    loss_type: mse
  generator_params:
    # 513 == n_fft / 2 + 1 for the n_fft: 1024 linear spectrogram above.
    aux_channels: 513
    decoder_channels: 512
    decoder_kernel_size: 7
    decoder_resblock_dilations:
      - - 1
        - 3
        - 5
      - - 1
        - 3
        - 5
      - - 1
        - 3
        - 5
    decoder_resblock_kernel_sizes:
      - 3
      - 7
      - 11
    decoder_upsample_kernel_sizes:
      - 16
      - 16
      - 4
      - 4
    # Product of the scales (8 * 8 * 2 * 2 = 256) equals hop_length: 256.
    decoder_upsample_scales:
      - 8
      - 8
      - 2
      - 2
    flow_base_dilation: 1
    flow_dropout_rate: 0.0
    flow_flows: 4
    flow_kernel_size: 5
    flow_layers: 4
    global_channels: 256
    hidden_channels: 192
    posterior_encoder_base_dilation: 1
    posterior_encoder_dropout_rate: 0.0
    posterior_encoder_kernel_size: 5
    posterior_encoder_layers: 16
    posterior_encoder_stacks: 1
    segment_size: 32
    spk_embed_dim: 512
    spks: -1
    stochastic_duration_predictor_dds_conv_layers: 3
    stochastic_duration_predictor_dropout_rate: 0.5
    stochastic_duration_predictor_flows: 4
    stochastic_duration_predictor_kernel_size: 3
    text_encoder_activation_type: swish
    text_encoder_attention_dropout_rate: 0.1
    text_encoder_attention_heads: 2
    text_encoder_blocks: 6
    text_encoder_conformer_kernel_size: -1
    text_encoder_dropout_rate: 0.1
    text_encoder_ffn_expand: 4
    text_encoder_normalize_before: true
    text_encoder_positional_dropout_rate: 0.0
    text_encoder_positional_encoding_layer_type: rel_pos
    text_encoder_positionwise_conv_kernel_size: 3
    text_encoder_positionwise_layer_type: conv1d
    text_encoder_self_attention_layer_type: rel_selfattn
    use_conformer_conv_in_text_encoder: false
    use_macaron_style_in_text_encoder: true
    use_only_mean_in_flow: true
    use_weight_norm_in_decoder: true
    use_weight_norm_in_flow: true
    use_weight_norm_in_posterior_encoder: true
    # Must stay equal to the number of entries in token_list (86).
    vocabs: 86
  generator_type: vits_generator
  lambda_adv: 1.0
  lambda_dur: 1.0
  lambda_feat_match: 2.0
  lambda_kl: 1.0
  lambda_mel: 45.0
  mel_loss_params:
    fmax: null
    fmin: 0
    fs: 22050
    hop_length: 256
    log_base: null
    n_fft: 1024
    n_mels: 80
    win_length: null
    window: hann
  sampling_rate: 22050

unused_parameters: true
use_amp: false
use_preprocessor: true
use_tensorboard: true
use_wandb: false
val_scheduler_criterion:
  - valid
  - loss
valid_batch_bins: null
valid_batch_size: null
valid_batch_type: null
valid_data_path_and_name_and_type:
  - - dump/22k/raw/dev-clean/text
    - text
    - text
  - - dump/22k/raw/dev-clean/wav.scp
    - speech
    - sound
  - - dump/22k/xvector/dev-clean/xvector.scp
    - spembs
    - kaldi_ark
valid_max_cache_size: null
valid_shape_file:
  - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
  - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/speech_shape
# Quoted so the version identifier is always loaded as a string.
version: '0.10.3a2'
wandb_entity: null
wandb_id: null
wandb_model_log_interval: -1
wandb_name: null
wandb_project: null
write_collected_feats: false