English
Sound Classification
CNN14
cemsubakan committed on
Commit
d24c3c9
1 Parent(s): 95c068c

added the missing files

Browse files
classifier_esc50.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e85ef49491db26ce50ee49753ed83cb7b7eb760d47f4c1a01fb2bdef0dcea704
3
+ size 1647311
embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca6f7dcf4eb97e68fb0989e3fbc9c667c60eaa0c598753e86e7b07bac0729755
3
+ size 301999678
embedding_model_esc50ft.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:375b53b0107759f58b173759d9c439211a648970f3d0ea02a2ace179cf8550f7
3
+ size 301999678
hyperparams.yaml ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2022-11-21 from:
2
+ # /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
3
+ # yamllint disable
4
+ # #################################
5
+ # Basic training parameters for sound classification using the ESC50 dataset.
6
+ # This recipe uses the CNN14 backbone for classification.
7
+ #
8
+ # Author:
9
+ # * Cem Subakan
10
+ # (based on the SpeechBrain UrbanSound8k recipe)
11
+ # #################################
12
+
13
+ # Seed needs to be set at top of yaml, before objects with parameters are made
14
+ seed: 11
15
+ __set_seed: !!python/object/apply:torch.manual_seed [11]
16
+
17
+ # Set up folders for reading from and writing to
18
+ # Dataset must already exist at `audio_data_folder`
19
+ data_folder: /data2/ESC-50-master
20
+ # e.g., /localscratch/UrbanSound8K
21
+ open_rir_folder: <data_folder>/RIRS # Change if needed
22
+ audio_data_folder: /data2/ESC-50-master/audio
23
+
24
+ # TODO the following folder will contain the resampled audio
25
+ # files (mono channel and config SR) to train on
26
+ #resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
27
+ #
28
+ experiment_name: cnn14
29
+ output_folder: ./results/cnn14/11
30
+ save_folder: ./results/cnn14/11/save
31
+ train_log: ./results/cnn14/11/train_log.txt
32
+
33
+ test_only: false
34
+
35
+ # Tensorboard logs
36
+ use_tensorboard: false
37
+ tensorboard_logs_folder: ./results/cnn14/11/tb_logs/
38
+
39
+ # Path where data manifest files will be stored
40
+ train_annotation: /data2/ESC-50-master/manifest/train.json
41
+ valid_annotation: /data2/ESC-50-master/manifest/valid.json
42
+ test_annotation: /data2/ESC-50-master/manifest/test.json
43
+
44
+ # To standardize results, UrbanSound8k has pre-separated samples into
45
+ # 10 folds for multi-fold validation
46
+ train_fold_nums: [1, 2, 3]
47
+ valid_fold_nums: [4]
48
+ test_fold_nums: [5]
49
+ skip_manifest_creation: false
50
+
51
+ ckpt_interval_minutes: 15 # save checkpoint every N min
52
+
53
+ # Training parameters
54
+ number_of_epochs: 200
55
+ batch_size: 32
56
+ lr: 0.0002
57
+ base_lr: 0.00000001
58
+ max_lr: 0.0002
59
+ step_size: 65000
60
+ sample_rate: 44100
61
+
62
+ device: cpu
63
+
64
+ # Feature parameters
65
+ n_mels: 80
66
+ left_frames: 0
67
+ right_frames: 0
68
+ deltas: false
69
+ amp_to_db: true
70
+ normalize: true
71
+
72
+ # Number of classes
73
+ out_n_neurons: 50
74
+
75
+ # Note that it's actually important to shuffle the data here
76
+ # (or at the very least, not sort the data by duration)
77
+ # Also note that this does not violate the UrbanSound8k "no-shuffle" policy
78
+ # because this does not mix samples from folds in train to valid/test, only
79
+ # within train or valid, or test
80
+ shuffle: true
81
+ dataloader_options:
82
+ batch_size: 32
83
+ shuffle: true
84
+ num_workers: 0
85
+
86
+ # Functions
87
+ compute_features: &id003 !new:speechbrain.lobes.features.Fbank
88
+ n_mels: 80
89
+ left_frames: 0
90
+ right_frames: 0
91
+ deltas: false
92
+ sample_rate: 44100
93
+ n_fft: 1024
94
+ win_length: 20
95
+ hop_length: 10
96
+
97
+ use_pretrain: false
98
+ embedding_model: &id009 !new:recipes.ESC50.classification.custom_models.Cnn14
99
+ mel_bins: 80
100
+ emb_dim: 2048
101
+
102
+ classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
103
+ input_size: 2048
104
+ out_neurons: 50
105
+ lin_blocks: 1
106
+
107
+ epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
108
+
109
+
110
+ # If you do not want to use the pretrained separator you can simply delete pretrained_separator field.
111
+ limit: 200
112
+
113
+
114
+ augment_wavedrop: &id004 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
115
+ sample_rate: 44100
116
+ speeds: [100]
117
+
118
+ augment_speed: &id005 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
119
+ sample_rate: 44100
120
+ speeds: [95, 100, 105]
121
+
122
+ add_rev: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
123
+ openrir_folder: /data2/ESC-50-master/RIRS
124
+ openrir_max_noise_len: 3.0 # seconds
125
+ reverb_prob: 1.0
126
+ noise_prob: 0.0
127
+ noise_snr_low: 0
128
+ noise_snr_high: 15
129
+ rir_scale_factor: 1.0
130
+
131
+ add_noise: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
132
+ openrir_folder: /data2/ESC-50-master/RIRS
133
+ openrir_max_noise_len: 3.0 # seconds
134
+ reverb_prob: 0.0
135
+ noise_prob: 1.0
136
+ noise_snr_low: 0
137
+ noise_snr_high: 15
138
+ rir_scale_factor: 1.0
139
+
140
+ add_rev_noise: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
141
+ openrir_folder: /data2/ESC-50-master/RIRS
142
+ openrir_max_noise_len: 3.0 # seconds
143
+ reverb_prob: 1.0
144
+ noise_prob: 1.0
145
+ noise_snr_low: 0
146
+ noise_snr_high: 15
147
+ rir_scale_factor: 1.0
148
+
149
+
150
+ # Definition of the augmentation pipeline.
151
+ # If concat_augment = False, the augmentation techniques are applied
152
+ # in sequence. If concat_augment = True, all the augmented signals
153
+ # are concatenated in a single big batch.
154
+
155
+ augment_pipeline: []
156
+ concat_augment: true
157
+
158
+ mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
159
+
160
+ norm_type: sentence
161
+ std_norm: false
162
+
163
+ # pre-processing
164
+ n_fft: 1024
165
+ spec_mag_power: 0.5
166
+ hop_length: 11.6099
167
+ win_length: 23.2199
168
+ compute_stft: &id001 !new:speechbrain.processing.features.STFT
169
+ n_fft: 1024
170
+ hop_length: 11.6099
171
+ win_length: 23.2199
172
+ sample_rate: 44100
173
+
174
+ compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
175
+ n_mels: 80
176
+ n_fft: 1024
177
+ sample_rate: 44100
178
+
179
+ modules:
180
+ compute_stft: *id001
181
+ compute_fbank: *id002
182
+ compute_features: *id003
183
+ augment_wavedrop: *id004
184
+ augment_speed: *id005
185
+ add_rev: *id006
186
+ add_noise: *id007
187
+ add_rev_noise: *id008
188
+ embedding_model: *id009
189
+ classifier: *id010
190
+ mean_var_norm: *id011
191
+ compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
192
+ loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
193
+ margin: 0.2
194
+ scale: 30
195
+
196
+ # compute_error: !name:speechbrain.nnet.losses.classification_error
197
+
198
+ opt_class: !name:torch.optim.Adam
199
+ lr: 0.0002
200
+ weight_decay: 0.000002
201
+
202
+ lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
203
+ base_lr: 0.00000001
204
+ max_lr: 0.0002
205
+ step_size: 65000
206
+
207
+ # Logging + checkpoints
208
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
209
+ save_file: ./results/cnn14/11/train_log.txt
210
+
211
+ error_stats: !name:speechbrain.utils.metric_stats.MetricStats
212
+ metric: !name:speechbrain.nnet.losses.classification_error
213
+ reduction: batch
214
+
215
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
216
+ checkpoints_dir: ./results/cnn14/11/save
217
+ recoverables:
218
+ embedding_model: *id009
219
+ classifier: *id010
220
+ normalizer: *id011
221
+ counter: *id012
222
+
223
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
224
+ loadables:
225
+ embedding_model: !ref <embedding_model>
226
+ classifier: !ref <classifier>