# ################################
# Model: Neural SI-SNR estimator with the pool training strategy
# (https://arxiv.org/pdf/2110.10812.pdf)
# Dataset: LibriMix and WHAMR!
# ################################
#
# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]

# Data params
# e.g. '/yourpath/wsj0-mix/2speakers'
# end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix
data_folder: /miniscratch/subakany/LibriMixData_new/Libri2Mix/

# the path for the wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
# e.g. /yourpath/wsj0-processed/si_tr_s/
# you need to convert the original wsj0 data to 8 kHz
# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
base_folder_dm: /miniscratch/subakany/LibriMixData_new/LibriSpeech/train-clean-360_processed/
rir_path: /miniscratch/subakany/whamr_rirs_wav

experiment_name: snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators
output_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234
train_log: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
save_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
train_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_train-360.csv
valid_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_dev.csv
test_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_test.csv

wsj_data_folder: /network/tmp1/subakany/wham_original
train_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tr.csv
test_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tt.csv
base_folder_dm_whamr: /network/tmp1/subakany/wsj0-processed/si_tr_s
use_whamr_train: true
whamr_proportion: 0.6
test_onwsj: false

skip_prep: false
ckpt_interval_minutes: 60 # save checkpoint every N min

# Experiment params
auto_mix_prec: false # Set it to True for mixed precision
test_only: false
num_spks: 2 # set to 3 for wsj0-3mix
progressbar: true
save_audio: false # Save estimated sources on disk
sample_rate: 8000

# Training parameters
N_epochs: 200
batch_size: 1
lr: 0.0001
clip_grad_norm: 5
loss_upper_lim: 999999 # this is the upper limit for an acceptable loss

# if True, the training sequences are cut to a specified length
limit_training_signal_len: false
# this is the length of sequences if we choose to limit
# the signal length of training sequences
training_signal_len: 32000000

# Set it to True to dynamically create mixtures at training time
dynamic_mixing: true
use_wham_noise: true
use_reverb_augment: true

# Parameters for data augmentation
use_wavedrop: false
use_speedperturb: true
use_speedperturb_sameforeachsource: false
use_rand_shift: false
min_shift: -8000 # in samples (1 s at 8 kHz)
max_shift: 8000 # in samples (1 s at 8 kHz)

speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    perturb_prob: 1.0
    drop_freq_prob: 0.0
    drop_chunk_prob: 0.0
    sample_rate: 8000
    speeds: [95, 100, 105] # resampling factors, in percent

wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    perturb_prob: 0.0
    drop_freq_prob: 1.0
    drop_chunk_prob: 1.0
    sample_rate: 8000

# loss thresholding -- this thresholds the training loss
threshold_byloss: true
threshold: -30

# Encoder parameters
N_encoder_out: 256
out_channels: 256
kernel_size: 16
kernel_stride: 8
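
# Note on the loss thresholding above: a minimal sketch of how
# threshold_byloss / threshold are typically applied in the accompanying
# SpeechBrain train script, following the pattern used in the SpeechBrain
# separation recipes (variable names here are illustrative, not taken from
# this file):
#
#     if hparams["threshold_byloss"]:
#         # keep only examples whose loss is still above the threshold,
#         # so already well-handled examples do not dominate the gradient
#         loss_to_keep = loss[loss > hparams["threshold"]]
#         if loss_to_keep.nelement() > 0:
#             loss = loss_to_keep.mean()
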
# Dataloader options
dataloader_opts:
    batch_size: 1
    num_workers: 0

# Specifying the network
snrmin: 0
snrmax: 10
out_n_neurons: 16
use_snr_compression: true
separation_norm_type: stnorm

# compute_features: !new:speechbrain.lobes.features.Fbank
#     n_mels: !ref <n_mels>
#     left_frames: 0
#     right_frames: 0
#     deltas: False

latent_dim: 128
n_inp: 256

# Stacked 1-d convolutions over 2-channel inputs; the four stride-2 layers
# downsample the time axis by a factor of 16 overall
encoder: &id006 !new:speechbrain.nnet.containers.Sequential
    input_shape: [!!null '', 2, !!null '']
    cnn1: !new:speechbrain.nnet.CNN.Conv1d
        in_channels: 2
        kernel_size: 4
        out_channels: 128
        stride: 1
        skip_transpose: true
        padding: valid
    relu1: !new:torch.nn.ReLU
    cnn2: !new:speechbrain.nnet.CNN.Conv1d
        in_channels: 128
        kernel_size: 4
        out_channels: 128
        stride: 2
        skip_transpose: true
        padding: valid
    relu2: !new:torch.nn.ReLU
    cnn3: !new:speechbrain.nnet.CNN.Conv1d
        in_channels: 128
        kernel_size: 4
        out_channels: 128
        stride: 2
        skip_transpose: true
        padding: valid
    relu3: !new:torch.nn.ReLU
    cnn4: !new:speechbrain.nnet.CNN.Conv1d
        in_channels: 128
        kernel_size: 4
        out_channels: 128
        stride: 2
        skip_transpose: true
        padding: valid
    relu4: !new:torch.nn.ReLU
    cnn5: !new:speechbrain.nnet.CNN.Conv1d
        in_channels: 128
        kernel_size: 4
        out_channels: 128
        stride: 2
        skip_transpose: true
        padding: valid

# mean + std statistics over time: 128 channels -> 256-dim embedding
stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling

# classifier_enc: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
#     input_size: !ref <n_mels>
#     channels: [1024, 1024, 1024, 1024, 3072]
#     kernel_sizes: [5, 3, 3, 3, 1]
#     dilations: [1, 2, 3, 4, 1]
#     attention_channels: 128
#     lin_neurons: 192

# classifier_out: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
#     input_size: 192
#     out_neurons: !ref <out_n_neurons>

# classifier_out: !new:speechbrain.nnet.linear.Linear
#     input_size: 256
#     n_neurons: 1

encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
    input_shape: [!!null '', 256]
    layer1: !new:speechbrain.nnet.linear.Linear
        input_size: 256
        n_neurons: 256
    relu: !new:torch.nn.ReLU
    layer2: !new:speechbrain.nnet.linear.Linear
        input_size: 256
        n_neurons: 1
    sigm: !new:torch.nn.Sigmoid

classifier_loss: !new:torch.nn.CrossEntropyLoss

optimizer: !name:torch.optim.Adam
    lr: 0.0001
    weight_decay: 0

loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper

lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
    factor: 0.5
    patience: 2
    dont_halve_until_epoch: 95

epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: 200

modules:
    encoder: *id006
    encoder_out: *id007

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
    recoverables:
        counter: *id008
        encoder: *id006
        encoder_out: *id007

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt

num_separators_per_model: 3
separator_base_folder: /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/results/

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        encoder: !ref <encoder>
        encoder_out: !ref <encoder_out>
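
# Usage sketch (not part of the recipe itself): loading this YAML with
# hyperpyyaml instantiates the objects above, after which the pretrainer can
# fetch and load the pretrained encoder / encoder_out weights. The file name
# below is hypothetical.
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("hparams/pool_sisnrestimator.yaml") as fin:
#         hparams = load_hyperpyyaml(fin)
#
#     hparams["pretrainer"].collect_files()
#     hparams["pretrainer"].load_collected()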