# File size: 8,223 Bytes
# 9a645d8
# Generated 2021-09-17 from:
# /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/snrestimator_yamls/timedom_convnet_whamr_v2_stnorm_manyseparators.yaml
# yamllint disable
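# ################################
# Model: SNR estimator (conv encoder + statistics pooling + MLP head),
# configured alongside SepFormer-style separator components
# SepFormer: https://arxiv.org/abs/2010.13154
# Dataset: Libri2Mix (see data_folder), with WHAM/WHAMR options below
# (this header was inherited from the WSJ0-2mix / WSJ0-3mix SepFormer recipe)
# ################################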
#
# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
#
# Single source of truth for the RNG seed: the torch seeding call reads this
# key through !ref, so changing (or overriding) `seed` also changes the value
# actually passed to torch.manual_seed.
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Data params

# Root folder of the pre-generated mixtures. The value below points at a
# Libri2Mix tree. In the wsj0-mix recipe this comment was written for, the
# path looked like '/yourpath/wsj0-mix/2speakers' and ended with 2speakers
# for wsj0-2mix or 3speakers for wsj0-3mix.
data_folder: /miniscratch/subakany/LibriMixData_new/Libri2Mix/

# Folder with the source utterances used for dynamic mixing -- only needed if
# dynamic mixing is used. The value below points at a processed LibriSpeech
# train-clean-360 folder; the wsj0 equivalent was
# /yourpath/wsj0-processed/si_tr_s/.
# For wsj0 you need to convert the original data to 8k;
# you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
# NOTE(review): confirm how the LibriSpeech folder was processed.
base_folder_dm: /miniscratch/subakany/LibriMixData_new/LibriSpeech/train-clean-360_processed/
# Folder of room impulse responses in wav format (per the path name).
rir_path: /miniscratch/subakany/whamr_rirs_wav

# Experiment outputs. All result paths are derived from <experiment_name> and
# <seed> instead of repeating the full path, so renaming the experiment or
# changing the seed moves the whole results tree consistently.
experiment_name: snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators
output_folder: !ref results/<experiment_name>/<seed>
train_log: !ref <output_folder>/train_log.txt
save_folder: !ref <output_folder>/save
# CSV manifest paths for the Libri2Mix splits (kept in the save folder)
train_data: !ref <save_folder>/libri2mix_train-360.csv
valid_data: !ref <save_folder>/libri2mix_dev.csv
test_data: !ref <save_folder>/libri2mix_test.csv

# WHAM/WHAMR-related folders and CSV manifest paths
wsj_data_folder: /network/tmp1/subakany/wham_original
train_wsj_data: !ref <save_folder>/wham_tr.csv
test_wsj_data: !ref <save_folder>/wham_tt.csv
base_folder_dm_whamr: /network/tmp1/subakany/wsj0-processed/si_tr_s
# NOTE(review): presumably the share of WHAMR material used when
# use_whamr_train is true -- confirm against the training script.
use_whamr_train: true
whamr_proportion: 0.6

test_onwsj: false

skip_prep: false

# Interval between checkpoints, in minutes
ckpt_interval_minutes: 60

# Experiment params
# Set auto_mix_prec to true for mixed precision
auto_mix_prec: false
test_only: false
# Number of speakers; set to 3 for wsj0-3mix
num_spks: 2
progressbar: true
# Set save_audio to true to save estimated sources on disk
save_audio: false
sample_rate: 8000

# Training parameters
N_epochs: 200
batch_size: 1
lr: 0.0001
clip_grad_norm: 5
# Upper limit for an acceptable loss value
loss_upper_lim: 999999
# When limit_training_signal_len is true, training sequences are cut to
# training_signal_len (the length used for the cut sequences)
limit_training_signal_len: false
training_signal_len: 32000000

# Mixture creation: dynamic_mixing true means mixtures are created
# dynamically at training time
dynamic_mixing: true
use_wham_noise: true
use_reverb_augment: true

# Data augmentation switches
use_wavedrop: false
use_speedperturb: true
use_speedperturb_sameforeachsource: false
# Random shift switch and its bounds
# NOTE(review): the shift bounds are presumably in samples -- confirm.
use_rand_shift: false
min_shift: -8000
max_shift: 8000

# Speed-perturbation augmenter: perturbation is always applied
# (perturb_prob 1.0) while frequency and chunk dropping are switched off
# (both probabilities 0.0). The sample rate follows the global <sample_rate>
# key so the augmenter cannot drift from the rest of the configuration.
# NOTE(review): `speeds` are presumably percentages of the original speed --
# confirm in the TimeDomainSpecAugment documentation.
speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 1.0
  drop_freq_prob: 0.0
  drop_chunk_prob: 0.0
  sample_rate: !ref <sample_rate>
  speeds: [95, 100, 105]

# Drop-based augmenter: speed perturbation is switched off (perturb_prob 0.0)
# while frequency and chunk dropping are always applied (both 1.0).
# The sample rate follows the global <sample_rate> key.
wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 0.0
  drop_freq_prob: 1.0
  drop_chunk_prob: 1.0
  sample_rate: !ref <sample_rate>

# Loss thresholding: when threshold_byloss is true the training loss is
# thresholded at `threshold`
threshold_byloss: true
threshold: -30

# Encoder parameters (shared size settings for the separator network)
kernel_size: 16
kernel_stride: 8
N_encoder_out: 256
out_channels: 256

# Dataloader options
# batch_size follows the top-level <batch_size> key so the two values cannot
# diverge when the batch size is changed or overridden.
dataloader_opts:
  batch_size: !ref <batch_size>
  num_workers: 0


# Specifying the network
# Separator encoder. Its sizes follow the shared "Encoder parameters" keys
# (<kernel_size>, <N_encoder_out>) rather than repeating the literals, so the
# encoder stays consistent with the mask network and decoder.
Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
  kernel_size: !ref <kernel_size>
  out_channels: !ref <N_encoder_out>


# Transformer block passed to MaskNet as its intra_model (anchor id001).
# Pre-norm (norm_before), positional encoding on, no dropout.
SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  d_model: 256
  d_ffn: 1024
  nhead: 8
  num_layers: 8
  dropout: 0
  norm_before: true
  use_positional_encoding: true

# Transformer block passed to MaskNet as its inter_model (anchor id002).
# Same settings as the intra block: pre-norm, positional encoding, no dropout.
SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  d_model: 256
  d_ffn: 1024
  nhead: 8
  num_layers: 8
  dropout: 0
  norm_before: true
  use_positional_encoding: true

# Dual-path mask network built from the two transformer blocks (*id001 as
# intra_model, *id002 as inter_model).
# num_spks and the channel sizes follow the top-level keys, so changing
# <num_spks> (e.g. to 3) or the encoder sizes also reaches the network instead
# of leaving stale literals here.
MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
  num_spks: !ref <num_spks>
  in_channels: !ref <N_encoder_out>
  out_channels: !ref <out_channels>
  num_layers: 2
  K: 250
  intra_model: *id001
  inter_model: *id002
  norm: ln
  linear_layer_after_inter_intra: false
  skip_around_intra: true

# Separator decoder producing a single output channel. Input width, kernel
# size and stride follow the shared "Encoder parameters" keys so the decoder
# always mirrors the encoder configuration.
Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
  in_channels: !ref <N_encoder_out>
  out_channels: 1
  kernel_size: !ref <kernel_size>
  stride: !ref <kernel_stride>
  bias: false

# SNR-estimation settings.
# NOTE(review): semantics are defined by the training script, which is not
# visible here; the comments below only restate what the names suggest.
# Normalisation type applied in connection with separation
separation_norm_type: stnorm
# SNR compression switch and the SNR range bounds
use_snr_compression: true
snrmin: 0
snrmax: 10
out_n_neurons: 16

# compute_features: !new:speechbrain.lobes.features.Fbank
#     n_mels: !ref <n_mels>
#     left_frames: 0
#     right_frames: 0
#     deltas: False

# Size settings for the SNR-estimator network.
# NOTE(review): latent_dim equals the 128 conv channels used below and n_inp
# equals the 256 inputs of encoder_out; the literals are not linked by !ref,
# so keep them in sync by hand.
latent_dim: 128
n_inp: 256
# Convolutional encoder of the SNR estimator (anchor id006).
# Five 1-D convolutions with kernel size 4 and 'valid' padding, 128 output
# channels each: cnn1 maps 2 -> 128 channels with stride 1, cnn2..cnn5 keep
# 128 channels with stride 2. A ReLU follows every convolution except cnn5.
# The layers are listed in the order they are meant to be applied, so do not
# reorder the keys.
encoder: &id006 !new:speechbrain.nnet.containers.Sequential
  # Two input channels; the other dimensions are left unspecified (null).
  # NOTE(review): what the two channels carry is not visible in this file.
  input_shape: [!!null '', 2, !!null '']
  cnn1: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 2
    kernel_size: 4
    out_channels: 128
    stride: 1
    # NOTE(review): presumably means the input is already channels-first --
    # confirm in the Conv1d documentation.
    skip_transpose: true
    padding: valid
  relu1: !new:torch.nn.ReLU
  cnn2: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
  relu2: !new:torch.nn.ReLU
  cnn3: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
  relu3: !new:torch.nn.ReLU
  cnn4: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid
  relu4: !new:torch.nn.ReLU
  # Last convolution: no activation after it
  cnn5: !new:speechbrain.nnet.CNN.Conv1d
    in_channels: 128
    kernel_size: 4
    out_channels: 128
    stride: 2
    skip_transpose: true
    padding: valid

# Statistics pooling layer, created with default arguments.
# NOTE(review): presumably applied to the conv encoder output before
# encoder_out -- confirm against the training script.
stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling


# Disabled alternative (kept for reference): ECAPA-TDNN classifier encoder
# classifier_enc: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
#     input_size: !ref <n_inp>
#     channels: [1024, 1024, 1024, 1024, 3072]
#     kernel_sizes: [5, 3, 3, 3, 1]
#     dilations: [1, 2, 3, 4, 1]
#     attention_channels: 128
#     lin_neurons: 192

# Disabled alternative (kept for reference): ECAPA classifier output head
# classifier_out: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
#     input_size: 192
#     out_neurons: !ref <out_n_neurons>
#
# Disabled alternative (kept for reference): single linear output layer
# classifier_out: !new:speechbrain.nnet.linear.Linear
#     input_size: 256
#     n_neurons: 1

# Output head of the SNR estimator (anchor id007):
# Linear(256 -> 256), ReLU, Linear(256 -> 1), Sigmoid -- so the final value
# lies in (0, 1). Layers are listed in the order they are meant to be applied.
# NOTE(review): the 256 inputs presumably are the pooled encoder statistics
# (see n_inp) -- confirm against the training script.
encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
  # NOTE(review): the next comment looks like a leftover commented-out entry
  # that ended up here by accident; it is not a Sequential argument.
  # lr_scheduler: !ref <lr_scheduler>

  input_shape: [!!null '', 256]
  layer1: !new:speechbrain.nnet.linear.Linear
    input_size: 256
    n_neurons: 256
  relu: !new:torch.nn.ReLU
  layer2: !new:speechbrain.nnet.linear.Linear
    input_size: 256
    n_neurons: 1
  sigm: !new:torch.nn.Sigmoid



# Cross-entropy loss instance, created with default arguments.
# NOTE(review): encoder_out ends in a single sigmoid output, so check whether
# the training script still uses this loss.
classifier_loss: !new:torch.nn.CrossEntropyLoss

# Adam optimizer factory (!name: the constructor is stored with these
# keyword arguments and called later by the training code).
# The learning rate follows the top-level <lr> key so that changing or
# overriding `lr` actually changes the optimizer.
optimizer: !name:torch.optim.Adam
  lr: !ref <lr>
  weight_decay: 0

# Reference to the SI-SNR loss function with permutation-invariant wrapper
# (!name stores the function itself; it is not called at load time).
loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper

# Plateau-based learning-rate scheduler: reduction factor 0.5, patience 2.
# Per the parameter name, no reduction is applied before epoch 95.
lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
  dont_halve_until_epoch: 95
  patience: 2
  factor: 0.5

# Epoch counter (anchor id008, also registered in the checkpointer).
# The limit follows the top-level <N_epochs> key so that changing or
# overriding the number of epochs takes effect.
epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <N_epochs>

# Modules exposed to the training script.
# The mapping previously listed `encoder` twice (*id003, then *id006).
# Duplicate keys are invalid YAML: depending on the parser the file is
# rejected or one entry is silently dropped. Only the SNR-estimator encoder
# (*id006) is kept, because that is the object the checkpointer and the
# pretrainer store under the name `encoder`.
# NOTE(review): if the separator encoder (*id003) is needed as a module too,
# register it under a distinct key.
modules:
  decoder: *id004
  masknet: *id005
  encoder: *id006
  encoder_out: *id007
# Checkpointer for the SNR-estimator modules and the epoch counter.
# The directory follows the top-level <save_folder> key instead of repeating
# the full path, so checkpoints always land in the configured save folder.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    counter: *id008
    encoder: *id006
    encoder_out: *id007
# File-based training logger; writes to the path given by the top-level
# <train_log> key rather than a repeated literal.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

# Settings for the pretrained separators.
# NOTE(review): presumably the number of separators used per separation model
# and the folder holding their results -- confirm against the training script.
# The folder is an absolute, machine-specific path.
num_separators_per_model: 3
separator_base_folder: /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/results/

# Pretrainer whose loadables are the two SNR-estimator modules, referenced
# through !ref so they are the same objects as `encoder` / `encoder_out`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    encoder: !ref <encoder>
    encoder_out: !ref <encoder_out>