# Generated 2022-11-21 from:
# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
# yamllint disable
# #################################
# Basic training parameters for sound classification using the ESC50 dataset.
# This recipe uses the CNN14 backbone for classification.
#
# Author:
#  * Cem Subakan
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

# Seed must be set at the top of the YAML, before any objects with parameters are created
seed: 11
__set_seed: !!python/object/apply:torch.manual_seed [11]
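# A hedged sketch (file name and call site assumed) of how a hyperparams file
# like this one is typically consumed by a SpeechBrain training script:
#   from hyperpyyaml import load_hyperpyyaml
#   with open("cnn14.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#   embedding_model = hparams["embedding_model"]
# Note that the !new:recipes.ESC50... tag requires the recipe package to be
# importable (on PYTHONPATH) when the file is loaded.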

# Set up folders for reading from and writing to
# Dataset must already exist at `audio_data_folder`
data_folder: /data2/ESC-50-master  # e.g., /localscratch/ESC-50-master
open_rir_folder: /data2/ESC-50-master/RIRS # Change if needed
audio_data_folder: /data2/ESC-50-master/audio

# TODO: the following folder will contain the resampled audio
# files (mono channel and configured sample rate) to train on
#resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
experiment_name: cnn14
output_folder: ./results/cnn14/11
save_folder: ./results/cnn14/11/save
train_log: ./results/cnn14/11/train_log.txt

test_only: false

# Tensorboard logs
use_tensorboard: false
tensorboard_logs_folder: ./results/cnn14/11/tb_logs/

# Path where data manifest files will be stored
train_annotation: /data2/ESC-50-master/manifest/train.json
valid_annotation: /data2/ESC-50-master/manifest/valid.json
test_annotation: /data2/ESC-50-master/manifest/test.json

# To standardize results, ESC50 pre-separates samples into
# 5 folds for multi-fold validation
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
skip_manifest_creation: false
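# For reference (standard ESC50 facts, not stated in this file): the dataset
# has 2000 five-second clips over 50 classes, split into 5 folds of 400 clips
# each via the `fold` column of meta/esc50.csv; the fold numbers above index
# into that column.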

ckpt_interval_minutes: 15 # save checkpoint every N min

# Training parameters
number_of_epochs: 200
batch_size: 32
lr: 0.0002
base_lr: 0.00000001
max_lr: 0.0002
step_size: 65000
sample_rate: 44100

device: cpu

# Feature parameters
n_mels: 80
left_frames: 0
right_frames: 0
deltas: false
amp_to_db: true
normalize: true

# Number of classes
out_n_neurons: 50

# Note that it is important to shuffle the data here (or at the very least,
# not to sort it by duration). This does not violate the fold protocol:
# samples are only shuffled within the train, valid, and test splits,
# never mixed across them.
shuffle: true
dataloader_options:
  batch_size: 32
  shuffle: true
  num_workers: 0

# Functions
compute_features: &id003 !new:speechbrain.lobes.features.Fbank
  n_mels: 80
  left_frames: 0
  right_frames: 0
  deltas: false
  sample_rate: 44100
  n_fft: 1024
  win_length: 20   # in ms
  hop_length: 10   # in ms
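# Back-of-envelope frame math for the settings above: at 44100 Hz, a 20 ms
# window is 882 samples and a 10 ms hop is 441 samples, so one 5 s ESC50 clip
# yields roughly 500 frames of 80 mel bins.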

use_pretrain: false
embedding_model: &id009 !new:recipes.ESC50.classification.custom_models.Cnn14
  mel_bins: 80
  emb_dim: 2048

classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 2048
  out_neurons: 50
  lin_blocks: 1
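# Hedged sketch of how these modules chain in the forward pass (variable
# names assumed; the actual Brain subclass lives in the recipe's train.py):
#   feats = hparams["compute_features"](wavs)     # [batch, frames, 80]
#   feats = hparams["mean_var_norm"](feats, lens)
#   emb = hparams["embedding_model"](feats)       # [batch, 1, 2048]
#   preds = hparams["classifier"](emb)            # [batch, 1, 50]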

epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 200

augment_wavedrop: &id004 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 44100
  speeds: [100]

augment_speed: &id005 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 44100
  speeds: [95, 100, 105]

add_rev: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /data2/ESC-50-master/RIRS
  openrir_max_noise_len: 3.0    # seconds
  reverb_prob: 1.0
  noise_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

add_noise: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /data2/ESC-50-master/RIRS
  openrir_max_noise_len: 3.0    # seconds
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

add_rev_noise: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /data2/ESC-50-master/RIRS
  openrir_max_noise_len: 3.0    # seconds
  reverb_prob: 1.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0


# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.

augment_pipeline: []
concat_augment: true
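# The pipeline above is empty, so no augmentation is applied in this run. A
# hedged example of enabling all five corruptors defined earlier (with
# concat_augment: true the effective batch becomes 6x the clean batch):
# augment_pipeline: [!ref <augment_wavedrop>, !ref <augment_speed>,
#   !ref <add_rev>, !ref <add_noise>, !ref <add_rev_noise>]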

mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false

# pre-processing
n_fft: 1024
spec_mag_power: 0.5
hop_length: 11.6099   # in ms
win_length: 23.2199   # in ms
compute_stft: &id001 !new:speechbrain.processing.features.STFT
  n_fft: 1024
  hop_length: 11.6099
  win_length: 23.2199
  sample_rate: 44100

compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
  n_mels: 80
  n_fft: 1024
  sample_rate: 44100
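# Sanity check on the ms values above (simple arithmetic, not stated in the
# original file): 23.2199 ms * 44.1 samples/ms ~= 1024 samples, so the window
# exactly fills n_fft; 11.6099 ms ~= 512 samples, i.e. a 50% hop.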

modules:
  compute_stft: *id001
  compute_fbank: *id002
  compute_features: *id003
  augment_wavedrop: *id004
  augment_speed: *id005
  add_rev: *id006
  add_noise: *id007
  add_rev_noise: *id008
  embedding_model: *id009
  classifier: *id010
  mean_var_norm: *id011
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    margin: 0.2
    scale: 30
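# Hedged call sketch (argument names assumed): the wrapper applies the
# additive angular margin to the classifier output and then log-softmax + NLL:
#   loss = hparams["compute_cost"](preds, class_ids)  # preds: [batch, 1, 50]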

# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
  lr: 0.0002
  weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
  base_lr: 0.00000001
  max_lr: 0.0002
  step_size: 65000
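# Rough arithmetic (assuming ~1200 training clips: 3 folds x 400 clips):
# 1200 / 32 ~= 38 optimizer steps per epoch, so ~7600 steps over 200 epochs.
# Since step_size is 65000, the cyclic schedule never completes its rising
# half; the lr climbs roughly linearly from base_lr toward max_lr.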

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: ./results/cnn14/11/train_log.txt

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/cnn14/11/save
  recoverables:
    embedding_model: *id009
    classifier: *id010
    normalizer: *id011
    counter: *id012
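# Minimal recovery sketch using the standard Checkpointer API (call site
# assumed; typically invoked before training or evaluation starts):
#   hparams["checkpointer"].recover_if_possible()  # restores the newest ckpt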

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
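# Hedged usage sketch: this run sets use_pretrain: false, so the pretrainer
# is unused. If enabled, the script would typically point `paths` at saved
# checkpoints (not set here) and then call:
#   hparams["pretrainer"].collect_files()
#   hparams["pretrainer"].load_collected()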