# Source snapshot: 6,191 bytes, commit d24c3c9
# Generated 2022-11-21 from:
# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
# yamllint disable
# #################################
# Basic training parameters for sound classification using the ESC50 dataset.
# This recipe uses the CNN14 backbone (see `embedding_model` below) with an
# ECAPA-TDNN-style classifier head.
#
# Author:
# * Cem Subakan
# (based on the SpeechBrain UrbanSound8k recipe)
# #################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 11
# NOTE(review): the !!python tag executes torch.manual_seed at load time; this
# file therefore requires HyperPyYAML / an unsafe loader and must never be
# parsed with yaml.safe_load on untrusted input.
__set_seed: !!python/object/apply:torch.manual_seed [11]
# Set up folders for reading from and writing to
# Dataset must already exist at `audio_data_folder`
data_folder: /data2/ESC-50-master
# e.g., /localscratch/UrbanSound8K
# FIX: the generator left the `!ref <data_folder>` reference unexpanded here,
# producing the literal string "<data_folder>/RIRS"; resolved it to match the
# openrir_folder values used by the EnvCorrupt augmenters below.
open_rir_folder: /data2/ESC-50-master/RIRS # Change if needed
audio_data_folder: /data2/ESC-50-master/audio
# TODO the following folder will contain the resampled audio
# files (mono channel and config SR) to train on
#resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
#
experiment_name: cnn14
output_folder: ./results/cnn14/11
save_folder: ./results/cnn14/11/save
train_log: ./results/cnn14/11/train_log.txt
# When true, skip training and only run evaluation.
test_only: false
# Tensorboard logs
use_tensorboard: false
tensorboard_logs_folder: ./results/cnn14/11/tb_logs/
# Path where data manifest files will be stored
train_annotation: /data2/ESC-50-master/manifest/train.json
valid_annotation: /data2/ESC-50-master/manifest/valid.json
test_annotation: /data2/ESC-50-master/manifest/test.json
# To standardize results, ESC-50 ships with pre-separated samples in
# 5 folds for multi-fold validation (3 train / 1 valid / 1 test here).
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
skip_manifest_creation: false
ckpt_interval_minutes: 15 # save checkpoint every N min
# Training parameters
number_of_epochs: 200
batch_size: 32
# Adam learning rate (see `opt_class` below).
lr: 0.0002
# Cyclic LR schedule bounds and half-cycle length in steps
# (consumed by `lr_annealing` below).
base_lr: 0.00000001
max_lr: 0.0002
step_size: 65000
# ESC-50 audio native sample rate.
sample_rate: 44100
device: cpu
# Feature parameters
n_mels: 80
left_frames: 0
right_frames: 0
deltas: false
amp_to_db: true
normalize: true
# Number of classes
out_n_neurons: 50
# Note that it's actually important to shuffle the data here
# (or at the very least, not sort the data by duration)
# Also note that this does not violate the UrbanSound8k "no-shuffle" policy
# because this does not mix samples from folds in train to valid/test, only
# within train or valid, or test
shuffle: true
dataloader_options:
    batch_size: 32
    shuffle: true
    num_workers: 0
# Functions
# Log-mel filterbank front end (win/hop presumably in ms — SpeechBrain Fbank
# convention; confirm against speechbrain.lobes.features.Fbank).
compute_features: &id003 !new:speechbrain.lobes.features.Fbank
    n_mels: 80
    left_frames: 0
    right_frames: 0
    deltas: false
    sample_rate: 44100
    n_fft: 1024
    win_length: 20
    hop_length: 10
# When true, load pretrained weights via the `pretrainer` defined at the end.
use_pretrain: false
# CNN14 backbone producing 2048-dim embeddings from 80 mel bins.
embedding_model: &id009 !new:recipes.ESC50.classification.custom_models.Cnn14
    mel_bins: 80
    emb_dim: 2048
# Linear classifier head over the embeddings (50 ESC-50 classes).
classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: 2048
    out_neurons: 50
    lin_blocks: 1
# Epoch counter; limit matches number_of_epochs above.
epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: 200
# Waveform augmenters. NOTE(review): `augment_pipeline` below is empty, so
# none of these are applied during training unless the pipeline is populated.
# Time-domain augmentation without speed change (speeds: [100] = 100%).
augment_wavedrop: &id004 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    sample_rate: 44100
    speeds: [100]
# Speed perturbation at 95%, 100%, and 105% of the original rate.
augment_speed: &id005 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    sample_rate: 44100
    speeds: [95, 100, 105]
# Reverberation only (reverb_prob: 1.0, noise_prob: 0.0).
add_rev: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: /data2/ESC-50-master/RIRS
    openrir_max_noise_len: 3.0 # seconds
    reverb_prob: 1.0
    noise_prob: 0.0
    noise_snr_low: 0
    noise_snr_high: 15
    rir_scale_factor: 1.0
# Additive noise only (reverb_prob: 0.0, noise_prob: 1.0).
add_noise: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: /data2/ESC-50-master/RIRS
    openrir_max_noise_len: 3.0 # seconds
    reverb_prob: 0.0
    noise_prob: 1.0
    noise_snr_low: 0
    noise_snr_high: 15
    rir_scale_factor: 1.0
# Both reverberation and additive noise.
add_rev_noise: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: /data2/ESC-50-master/RIRS
    openrir_max_noise_len: 3.0 # seconds
    reverb_prob: 1.0
    noise_prob: 1.0
    noise_snr_low: 0
    noise_snr_high: 15
    rir_scale_factor: 1.0
# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.
augment_pipeline: []
concat_augment: true
# Per-sentence mean normalization of input features (std_norm disabled).
mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: false
# pre-processing
# NOTE(review): hop/win lengths here look like milliseconds (11.61 ms / 23.22 ms
# = 512 / 1024 samples at 44.1 kHz) — SpeechBrain STFT convention; confirm.
n_fft: 1024
spec_mag_power: 0.5
hop_length: 11.6099
win_length: 23.2199
compute_stft: &id001 !new:speechbrain.processing.features.STFT
    n_fft: 1024
    hop_length: 11.6099
    win_length: 23.2199
    sample_rate: 44100
compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
    n_mels: 80
    n_fft: 1024
    sample_rate: 44100
# Modules registered with the Brain class; aliases point to the objects
# constructed above (trainable ones are moved to `device` and checkpointed
# via the entries below).
modules:
    compute_stft: *id001
    compute_fbank: *id002
    compute_features: *id003
    augment_wavedrop: *id004
    augment_speed: *id005
    add_rev: *id006
    add_noise: *id007
    add_rev_noise: *id008
    embedding_model: *id009
    classifier: *id010
    mean_var_norm: *id011
# Additive-angular-margin softmax loss (margin 0.2, scale 30).
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
    loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
        margin: 0.2
        scale: 30
# compute_error: !name:speechbrain.nnet.losses.classification_error
# Partial constructor: the Brain class instantiates Adam with model params.
opt_class: !name:torch.optim.Adam
    lr: 0.0002
    weight_decay: 0.000002
# Cyclic LR between base_lr and max_lr with half-cycle of step_size steps
# (values mirror base_lr/max_lr/step_size defined near the top).
lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
    base_lr: 0.00000001
    max_lr: 0.0002
    step_size: 65000
# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: ./results/cnn14/11/train_log.txt
# Classification-error metric aggregated per batch.
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch
# Checkpointer saves/restores model, classifier, normalizer stats, and epoch.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: ./results/cnn14/11/save
    recoverables:
        embedding_model: *id009
        classifier: *id010
        normalizer: *id011
        counter: *id012
# Loads pretrained backbone/classifier weights (only used if use_pretrain).
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>