cemsubakan committed on
Commit
ca690f1
1 Parent(s): 5ad3c07

simplified hyperparams.yaml and added hyperparams_training.yaml

Browse files
Files changed (2) hide show
  1. hyperparams.yaml +2 -161
  2. hyperparams_training.yaml +234 -0
hyperparams.yaml CHANGED
@@ -1,128 +1,17 @@
1
- # Generated 2021-09-17 from:
2
- # /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/snrestimator_yamls/timedom_convnet_whamr_v2_stnorm_manyseparators.yaml
3
- # yamllint disable
4
  # ################################
5
- # Model: SepFormer for source separation
6
- # https://arxiv.org/abs/2010.13154
7
- # Dataset : WSJ0-2mix and WSJ0-3mix
8
  # ################################
9
- #
10
- # Basic parameters
11
- # Seed needs to be set at top of yaml, before objects with parameters are made
12
- #
13
- seed: 1234
14
- __set_seed: !apply:torch.manual_seed [1234]
15
 
16
- # Data params
17
-
18
- # e.g. '/yourpath/wsj0-mix/2speakers'
19
- # end with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix
20
- data_folder: /miniscratch/subakany/LibriMixData_new/Libri2Mix/
21
-
22
- # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
23
- # e.g. /yourpath/wsj0-processed/si_tr_s/
24
- # you need to convert the original wsj0 to 8k
25
- # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
26
- base_folder_dm: /miniscratch/subakany/LibriMixData_new/LibriSpeech/train-clean-360_processed/
27
- rir_path: /miniscratch/subakany/whamr_rirs_wav
28
-
29
- experiment_name: snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators
30
- output_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234
31
- train_log: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
32
- save_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
33
- train_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_train-360.csv
34
- valid_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_dev.csv
35
- test_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_test.csv
36
-
37
- wsj_data_folder: /network/tmp1/subakany/wham_original
38
- train_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tr.csv
39
- test_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tt.csv
40
- base_folder_dm_whamr: /network/tmp1/subakany/wsj0-processed/si_tr_s
41
- use_whamr_train: true
42
- whamr_proportion: 0.6
43
-
44
- test_onwsj: false
45
-
46
- skip_prep: false
47
-
48
- ckpt_interval_minutes: 60
49
-
50
- # Experiment params
51
- auto_mix_prec: false # Set it to True for mixed precision
52
- test_only: false
53
- num_spks: 2 # set to 3 for wsj0-3mix
54
- progressbar: true
55
- save_audio: false # Save estimated sources on disk
56
  sample_rate: 8000
57
 
58
- # Training parameters
59
- N_epochs: 200
60
- batch_size: 1
61
- lr: 0.0001
62
- clip_grad_norm: 5
63
- loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
64
- # if True, the training sequences are cut to a specified length
65
- limit_training_signal_len: false
66
- # this is the length of sequences if we choose to limit
67
- # the signal length of training sequences
68
- training_signal_len: 32000000
69
-
70
- # Set it to True to dynamically create mixtures at training time
71
- dynamic_mixing: true
72
- use_wham_noise: true
73
- use_reverb_augment: true
74
-
75
- # Parameters for data augmentation
76
- use_wavedrop: false
77
- use_speedperturb: true
78
- use_speedperturb_sameforeachsource: false
79
- use_rand_shift: false
80
- min_shift: -8000
81
- max_shift: 8000
82
-
83
- speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
84
- perturb_prob: 1.0
85
- drop_freq_prob: 0.0
86
- drop_chunk_prob: 0.0
87
- sample_rate: 8000
88
- speeds: [95, 100, 105]
89
-
90
- wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
91
- perturb_prob: 0.0
92
- drop_freq_prob: 1.0
93
- drop_chunk_prob: 1.0
94
- sample_rate: 8000
95
-
96
- # loss thresholding -- this thresholds the training loss
97
- threshold_byloss: true
98
- threshold: -30
99
-
100
- # Encoder parameters
101
- N_encoder_out: 256
102
- out_channels: 256
103
- kernel_size: 16
104
- kernel_stride: 8
105
-
106
- # Dataloader options
107
- dataloader_opts:
108
- batch_size: 1
109
- num_workers: 0
110
-
111
-
112
  # Specifying the network
113
 
114
  snrmin: 0
115
  snrmax: 10
116
- out_n_neurons: 16
117
  use_snr_compression: true
118
  separation_norm_type: stnorm
119
 
120
- # compute_features: !new:speechbrain.lobes.features.Fbank
121
- # n_mels: !ref <n_mels>
122
- # left_frames: 0
123
- # right_frames: 0
124
- # deltas: False
125
-
126
  latent_dim: 128
127
  n_inp: 256
128
  encoder: &id006 !new:speechbrain.nnet.containers.Sequential
@@ -169,26 +58,7 @@ encoder: &id006 !new:speechbrain.nnet.containers.Sequential
169
 
170
  stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling
171
 
172
-
173
- # classifier_enc: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
174
- # input_size: !ref <n_inp>
175
- # channels: [1024, 1024, 1024, 1024, 3072]
176
- # kernel_sizes: [5, 3, 3, 3, 1]
177
- # dilations: [1, 2, 3, 4, 1]
178
- # attention_channels: 128
179
- # lin_neurons: 192
180
-
181
- #classifier_out: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
182
- # input_size: 192
183
- # out_neurons: !ref <out_n_neurons>
184
- #
185
- # classifier_out: !new:speechbrain.nnet.linear.Linear
186
- # input_size: 256
187
- # n_neurons: 1
188
-
189
  encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
190
- # lr_scheduler: !ref <lr_scheduler>
191
-
192
  input_shape: [!!null '', 256]
193
  layer1: !new:speechbrain.nnet.linear.Linear
194
  input_size: 256
@@ -199,38 +69,9 @@ encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
199
  n_neurons: 1
200
  sigm: !new:torch.nn.Sigmoid
201
 
202
-
203
-
204
- classifier_loss: !new:torch.nn.CrossEntropyLoss
205
-
206
- optimizer: !name:torch.optim.Adam
207
- lr: 0.0001
208
- weight_decay: 0
209
-
210
- loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
211
-
212
- lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
213
- factor: 0.5
214
- patience: 2
215
- dont_halve_until_epoch: 95
216
-
217
- epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
218
- limit: 200
219
-
220
  modules:
221
  encoder: *id006
222
  encoder_out: *id007
223
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
224
- checkpoints_dir: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
225
- recoverables:
226
- counter: *id008
227
- encoder: *id006
228
- encoder_out: *id007
229
- train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
230
- save_file: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
231
-
232
- num_separators_per_model: 3
233
- separator_base_folder: /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/results/
234
 
235
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
236
  loadables:
 
 
 
1
  # ################################
2
+ # Model: Neural SI-SNR Estimator with Pool training strategy (https://arxiv.org/pdf/2110.10812.pdf)
3
+ # Dataset : LibriMix and WHAMR!
 
4
  # ################################
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  sample_rate: 8000
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # Specifying the network
9
 
10
  snrmin: 0
11
  snrmax: 10
 
12
  use_snr_compression: true
13
  separation_norm_type: stnorm
14
 
 
 
 
 
 
 
15
  latent_dim: 128
16
  n_inp: 256
17
  encoder: &id006 !new:speechbrain.nnet.containers.Sequential
58
 
59
  stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
 
 
62
  input_shape: [!!null '', 256]
63
  layer1: !new:speechbrain.nnet.linear.Linear
64
  input_size: 256
69
  n_neurons: 1
70
  sigm: !new:torch.nn.Sigmoid
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  modules:
73
  encoder: *id006
74
  encoder_out: *id007
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
77
  loadables:
hyperparams_training.yaml ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ################################
2
+ # Model: Neural SI-SNR Estimator with Pool training strategy (https://arxiv.org/pdf/2110.10812.pdf)
3
+ # Dataset : LibriMix and WHAMR!
4
+ # ################################
5
+ #
6
+ # Basic parameters
7
+ # Seed needs to be set at top of yaml, before objects with parameters are made
8
+ #
9
+ seed: 1234
10
+ __set_seed: !apply:torch.manual_seed [1234]
11
+
12
+ # Data params
13
+
14
+ # e.g. '/yourpath/LibriMix/Libri2Mix/'
15
+ # end with Libri2Mix for 2-speaker mixtures or Libri3Mix for 3-speaker mixtures
16
+ data_folder: /miniscratch/subakany/LibriMixData_new/Libri2Mix/
17
+
18
+ # the path for the processed LibriSpeech train-clean-360 folder -- only needed if dynamic mixing is used
19
+ # e.g. /yourpath/LibriSpeech/train-clean-360_processed/
20
+ # you need to convert the original LibriSpeech audio to 8k
21
+ # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
22
+ base_folder_dm: /miniscratch/subakany/LibriMixData_new/LibriSpeech/train-clean-360_processed/
23
+ rir_path: /miniscratch/subakany/whamr_rirs_wav
24
+
25
+ experiment_name: snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators
26
+ output_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234
27
+ train_log: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
28
+ save_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
29
+ train_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_train-360.csv
30
+ valid_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_dev.csv
31
+ test_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_test.csv
32
+
33
+ wsj_data_folder: /network/tmp1/subakany/wham_original
34
+ train_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tr.csv
35
+ test_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tt.csv
36
+ base_folder_dm_whamr: /network/tmp1/subakany/wsj0-processed/si_tr_s
37
+ use_whamr_train: true
38
+ whamr_proportion: 0.6
39
+
40
+ test_onwsj: false
41
+
42
+ skip_prep: false
43
+
44
+ ckpt_interval_minutes: 60
45
+
46
+ # Experiment params
47
+ auto_mix_prec: false # Set it to True for mixed precision
48
+ test_only: false
49
+ num_spks: 2 # set to 3 for wsj0-3mix
50
+ progressbar: true
51
+ save_audio: false # Save estimated sources on disk
52
+ sample_rate: 8000
53
+
54
+ # Training parameters
55
+ N_epochs: 200
56
+ batch_size: 1
57
+ lr: 0.0001
58
+ clip_grad_norm: 5
59
+ loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
60
+ # if True, the training sequences are cut to a specified length
61
+ limit_training_signal_len: false
62
+ # this is the length of sequences if we choose to limit
63
+ # the signal length of training sequences
64
+ training_signal_len: 32000000
65
+
66
+ # Set it to True to dynamically create mixtures at training time
67
+ dynamic_mixing: true
68
+ use_wham_noise: true
69
+ use_reverb_augment: true
70
+
71
+ # Parameters for data augmentation
72
+ use_wavedrop: false
73
+ use_speedperturb: true
74
+ use_speedperturb_sameforeachsource: false
75
+ use_rand_shift: false
76
+ min_shift: -8000
77
+ max_shift: 8000
78
+
79
+ speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
80
+ perturb_prob: 1.0
81
+ drop_freq_prob: 0.0
82
+ drop_chunk_prob: 0.0
83
+ sample_rate: 8000
84
+ speeds: [95, 100, 105]
85
+
86
+ wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
87
+ perturb_prob: 0.0
88
+ drop_freq_prob: 1.0
89
+ drop_chunk_prob: 1.0
90
+ sample_rate: 8000
91
+
92
+ # loss thresholding -- this thresholds the training loss
93
+ threshold_byloss: true
94
+ threshold: -30
95
+
96
+ # Encoder parameters
97
+ N_encoder_out: 256
98
+ out_channels: 256
99
+ kernel_size: 16
100
+ kernel_stride: 8
101
+
102
+ # Dataloader options
103
+ dataloader_opts:
104
+ batch_size: 1
105
+ num_workers: 0
106
+
107
+
108
+ # Specifying the network
109
+
110
+ snrmin: 0
111
+ snrmax: 10
112
+ out_n_neurons: 16
113
+ use_snr_compression: true
114
+ separation_norm_type: stnorm
115
+
116
+ # compute_features: !new:speechbrain.lobes.features.Fbank
117
+ # n_mels: !ref <n_mels>
118
+ # left_frames: 0
119
+ # right_frames: 0
120
+ # deltas: False
121
+
122
+ latent_dim: 128
123
+ n_inp: 256
124
+ encoder: &id006 !new:speechbrain.nnet.containers.Sequential
125
+ input_shape: [!!null '', 2, !!null '']
126
+ cnn1: !new:speechbrain.nnet.CNN.Conv1d
127
+ in_channels: 2
128
+ kernel_size: 4
129
+ out_channels: 128
130
+ stride: 1
131
+ skip_transpose: true
132
+ padding: valid
133
+ relu1: !new:torch.nn.ReLU
134
+ cnn2: !new:speechbrain.nnet.CNN.Conv1d
135
+ in_channels: 128
136
+ kernel_size: 4
137
+ out_channels: 128
138
+ stride: 2
139
+ skip_transpose: true
140
+ padding: valid
141
+ relu2: !new:torch.nn.ReLU
142
+ cnn3: !new:speechbrain.nnet.CNN.Conv1d
143
+ in_channels: 128
144
+ kernel_size: 4
145
+ out_channels: 128
146
+ stride: 2
147
+ skip_transpose: true
148
+ padding: valid
149
+ relu3: !new:torch.nn.ReLU
150
+ cnn4: !new:speechbrain.nnet.CNN.Conv1d
151
+ in_channels: 128
152
+ kernel_size: 4
153
+ out_channels: 128
154
+ stride: 2
155
+ skip_transpose: true
156
+ padding: valid
157
+ relu4: !new:torch.nn.ReLU
158
+ cnn5: !new:speechbrain.nnet.CNN.Conv1d
159
+ in_channels: 128
160
+ kernel_size: 4
161
+ out_channels: 128
162
+ stride: 2
163
+ skip_transpose: true
164
+ padding: valid
165
+
166
+ stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling
167
+
168
+
169
+ # classifier_enc: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
170
+ # input_size: !ref <n_inp>
171
+ # channels: [1024, 1024, 1024, 1024, 3072]
172
+ # kernel_sizes: [5, 3, 3, 3, 1]
173
+ # dilations: [1, 2, 3, 4, 1]
174
+ # attention_channels: 128
175
+ # lin_neurons: 192
176
+
177
+ #classifier_out: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
178
+ # input_size: 192
179
+ # out_neurons: !ref <out_n_neurons>
180
+ #
181
+ # classifier_out: !new:speechbrain.nnet.linear.Linear
182
+ # input_size: 256
183
+ # n_neurons: 1
184
+
185
+ encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
186
+ # lr_scheduler: !ref <lr_scheduler>
187
+
188
+ input_shape: [!!null '', 256]
189
+ layer1: !new:speechbrain.nnet.linear.Linear
190
+ input_size: 256
191
+ n_neurons: 256
192
+ relu: !new:torch.nn.ReLU
193
+ layer2: !new:speechbrain.nnet.linear.Linear
194
+ input_size: 256
195
+ n_neurons: 1
196
+ sigm: !new:torch.nn.Sigmoid
197
+
198
+
199
+
200
+ classifier_loss: !new:torch.nn.CrossEntropyLoss
201
+
202
+ optimizer: !name:torch.optim.Adam
203
+ lr: 0.0001
204
+ weight_decay: 0
205
+
206
+ loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
207
+
208
+ lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
209
+ factor: 0.5
210
+ patience: 2
211
+ dont_halve_until_epoch: 95
212
+
213
+ epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
214
+ limit: 200
215
+
216
+ modules:
217
+ encoder: *id006
218
+ encoder_out: *id007
219
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
220
+ checkpoints_dir: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
221
+ recoverables:
222
+ counter: *id008
223
+ encoder: *id006
224
+ encoder_out: *id007
225
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
226
+ save_file: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
227
+
228
+ num_separators_per_model: 3
229
+ separator_base_folder: /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/results/
230
+
231
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
232
+ loadables:
233
+ encoder: !ref <encoder>
234
+ encoder_out: !ref <encoder_out>