---
# Training configuration dump.
# NOTE(review): this file was reconstructed from a copy whose newlines and
# indentation had been collapsed. Keys, values and their order are unchanged;
# the nesting of the *_conf mappings is inferred -- confirm against the
# consumer's schema before relying on it.

config: conf/encodec2d_dwconv_groupratio8_mag_phase_16k_n32_600k_step_rmseg_use_power.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: sequence
output_dir: exp/encodec2d_dwconv_groupratio8_mag_phase_16k_n32_600k_step_rmseg_use_power_raw_en_libritts
ngpu: 2
seed: 0
num_workers: 8
num_att_plot: 0

dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: true
unused_parameters: true
sharded_ddp: false

cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: false

collect_stats: false
write_collected_feats: false
max_epoch: 60
# 2**63 - 1
max_update: 9223372036854775807
patience: null
val_scheduler_criterion:
  - valid
  - loss
early_stopping_criterion:
  - valid
  - loss
  - min
# A list containing one [phase, metric, mode] triple.
best_model_criterion:
  - - valid
    - generator_multi_spectral_recon_loss
    - min
keep_nbest_models: 60
nbest_averaging_interval: 0
grad_clip: -1
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: 50

use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1

detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: true
freeze_param: []

num_iters_per_epoch: 10000
batch_size: 32
valid_batch_size: null
batch_bins: 2000000
valid_batch_bins: null
drop_last: true
train_shape_file:
  - exp/tokenizer_states_16k/train/speech_shape
valid_shape_file:
  - exp/tokenizer_states_16k/dev/speech_shape
batch_type: unsorted
valid_batch_type: null
speech_length_min: -1
speech_length_max: -1
fold_length:
  - 512
  - 150
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024

dataset_type: small
dataset_conf: {}
train_data_file: null
valid_data_file: null
# Each entry is a [path, name, type] triple.
train_data_path_and_name_and_type:
  - - dump/raw_16k/train/wav.scp.pai
    - speech
    - kaldi_ark
valid_data_path_and_name_and_type:
  - - dump/raw_16k/dev/wav.scp.pai
    - speech
    - kaldi_ark
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null

optim: adam
optim_conf:
  lr: 0.0003
  betas:
    - 0.5
    - 0.9
scheduler: null
scheduler_conf:
  step_size: 8
  gamma: 0.1
optim2: adam
optim2_conf:
  lr: 0.0003
  betas:
    - 0.5
    - 0.9
scheduler2: null
scheduler2_conf:
  step_size: 8
  gamma: 0.1

use_pai: true
simple_ddp: false
num_worker_count: 1
generator_first: false
input_size: 3
cmvn_file: null
disc_grad_clip: -1
disc_grad_clip_type: 2.0
gen_train_interval: 1
disc_train_interval: 1
stat_flops: false

use_preprocessor: true
speech_volume_normalize: null
speech_rms_normalize: false
speech_max_length: 40800
sampling_rate: 16000
valid_max_length: 40800

frontend: null
frontend_conf: {}
normalize: null
normalize_conf: {}

encoder: encodec_seanet_encoder_2d
encoder_conf:
  ratios:
    - [4, 1]
    - [4, 1]
    - [4, 2]
    - [4, 1]
  norm: time_group_norm
  norm_params:
    num_groups: 1
  causal: false
  dilation_base: 2
  kernel_size: 3
  last_kernel_size: 3
  # Plain scalar `none` loads as the string "none", not YAML null.
  seq_model: none

quantizer: costume_quantizer
quantizer_conf:
  codebook_size: 1024
  num_quantizers: 32
  ema_decay: 0.99
  kmeans_init: true
  sampling_rate: 16000
  quantize_dropout: true
  rand_num_quant:
    - 1
    - 2
    - 4
    - 8
    - 16
    - 32
  use_ddp: true
  encoder_hop_length: 320

decoder: encodec_seanet_decoder_2d
decoder_conf:
  ratios:
    - [4, 1]
    - [4, 1]
    - [4, 2]
    - [4, 1]
  norm: time_group_norm
  norm_params:
    num_groups: 1
  causal: false
  channels: 3
  dilation_base: 2
  kernel_size: 3
  last_kernel_size: 3
  tr_conv_group_ratio: 8
  # Plain scalar `none` loads as the string "none", not YAML null.
  seq_model: none

model: freq_codec
model_conf:
  odim: 128
  multi_spectral_window_powers_of_two:
    - 5
    - 6
    - 7
    - 8
    - 9
    - 10
  target_sample_hz: 16000
  audio_normalize: true
  segment_dur: null
  overlap_ratio: null
  use_power_spec_loss: true
  codec_domain:
    - mag_phase
    - mag_phase

discriminator: multiple_disc
discriminator_conf:
  input_size: 1
  disc_conf_list:
    - filters: 32
      name: encodec_multi_scale_stft_discriminator

distributed: true
# Quoted so the value stays a string regardless of the loader.
version: "0.2.0"