# Source snapshot: 6,191 bytes, commit d24c3c9
# Generated 2022-11-21 from:
# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
# yamllint disable
# #################################
# Basic training parameters for sound classification using the ESC50 dataset.
# This recipe uses the CNN14 backbone (see `embedding_model` below) with an
# ECAPA-TDNN-style classifier head.
#
# Author:
# * Cem Subakan
# (based on the SpeechBrain UrbanSound8k recipe)
# #################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 11
# NOTE(review): the !!python tag executes torch.manual_seed at load time; this
# file therefore requires HyperPyYAML / an unsafe loader and must never be
# parsed with yaml.safe_load on untrusted input.
__set_seed: !!python/object/apply:torch.manual_seed [11]
# Set up folders for reading from and writing to
# Dataset must already exist at `audio_data_folder`
data_folder: /data2/ESC-50-master
# e.g., /localscratch/UrbanSound8K
# FIX: the generator left the `!ref <data_folder>` reference unexpanded here,
# producing the literal string "<data_folder>/RIRS"; resolved it to match the
# openrir_folder values used by the EnvCorrupt augmenters below.
open_rir_folder: /data2/ESC-50-master/RIRS # Change if needed
audio_data_folder: /data2/ESC-50-master/audio
# TODO the following folder will contain the resampled audio
# files (mono channel and config SR) to train on
#resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
#
experiment_name: cnn14
output_folder: ./results/cnn14/11
save_folder: ./results/cnn14/11/save
train_log: ./results/cnn14/11/train_log.txt
# When true, skip training and only run evaluation.
test_only: false
# Tensorboard logs
use_tensorboard: false
tensorboard_logs_folder: ./results/cnn14/11/tb_logs/
# Path where data manifest files will be stored
train_annotation: /data2/ESC-50-master/manifest/train.json
valid_annotation: /data2/ESC-50-master/manifest/valid.json
test_annotation: /data2/ESC-50-master/manifest/test.json
# To standardize results, ESC-50 ships with pre-separated samples in
# 5 folds for multi-fold validation (3 train / 1 valid / 1 test here).
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
skip_manifest_creation: false
ckpt_interval_minutes: 15 # save checkpoint every N min
# Training parameters
number_of_epochs: 200
batch_size: 32
# Adam learning rate (see `opt_class` below).
lr: 0.0002
# Cyclic LR schedule bounds and half-cycle length in steps
# (consumed by `lr_annealing` below).
base_lr: 0.00000001
max_lr: 0.0002
step_size: 65000
# ESC-50 audio native sample rate.
sample_rate: 44100
device: cpu
# Feature parameters
n_mels: 80
left_frames: 0
right_frames: 0
deltas: false
amp_to_db: true
normalize: true
# Number of classes
out_n_neurons: 50
# Note that it's actually important to shuffle the data here
# (or at the very least, not sort the data by duration)
# Also note that this does not violate the UrbanSound8k "no-shuffle" policy
# because this does not mix samples from folds in train to valid/test, only
# within train or valid, or test
shuffle: true
dataloader_options:
    batch_size: 32
    shuffle: true
    num_workers: 0
# Functions
# Log-mel filterbank front end (win/hop presumably in ms — SpeechBrain Fbank
# convention; confirm against speechbrain.lobes.features.Fbank).
compute_features: &id003 !new:speechbrain.lobes.features.Fbank
    n_mels: 80
    left_frames: 0
    right_frames: 0
    deltas: false
    sample_rate: 44100
    n_fft: 1024
    win_length: 20
    hop_length: 10
# When true, load pretrained weights via the `pretrainer` defined at the end.
use_pretrain: false
# CNN14 backbone producing 2048-dim embeddings from 80 mel bins.
embedding_model: &id009 !new:recipes.ESC50.classification.custom_models.Cnn14
    mel_bins: 80
    emb_dim: 2048
# Linear classifier head over the embeddings (50 ESC-50 classes).
classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: 2048
    out_neurons: 50
    lin_blocks: 1
# Epoch counter; limit matches number_of_epochs above.
epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: 200
# Waveform augmenters. NOTE(review): `augment_pipeline` below is empty, so
# none of these are applied during training unless the pipeline is populated.
# Time-domain augmentation without speed change (speeds: [100] = 100%).
augment_wavedrop: &id004 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    sample_rate: 44100
    speeds: [100]
# Speed perturbation at 95%, 100%, and 105% of the original rate.
augment_speed: &id005 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    sample_rate: 44100
    speeds: [95, 100, 105]
# Reverberation only (reverb_prob: 1.0, noise_prob: 0.0).
add_rev: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: /data2/ESC-50-master/RIRS
    openrir_max_noise_len: 3.0 # seconds
    reverb_prob: 1.0
    noise_prob: 0.0
    noise_snr_low: 0
    noise_snr_high: 15
    rir_scale_factor: 1.0
# Additive noise only (reverb_prob: 0.0, noise_prob: 1.0).
add_noise: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: /data2/ESC-50-master/RIRS
    openrir_max_noise_len: 3.0 # seconds
    reverb_prob: 0.0
    noise_prob: 1.0
    noise_snr_low: 0
    noise_snr_high: 15
    rir_scale_factor: 1.0
# Both reverberation and additive noise.
add_rev_noise: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
    openrir_folder: /data2/ESC-50-master/RIRS
    openrir_max_noise_len: 3.0 # seconds
    reverb_prob: 1.0
    noise_prob: 1.0
    noise_snr_low: 0
    noise_snr_high: 15
    rir_scale_factor: 1.0
# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.
augment_pipeline: []
concat_augment: true
# Per-sentence mean normalization of input features (std_norm disabled).
mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: false
# pre-processing
# NOTE(review): hop/win lengths here look like milliseconds (11.61 ms / 23.22 ms
# = 512 / 1024 samples at 44.1 kHz) — SpeechBrain STFT convention; confirm.
n_fft: 1024
spec_mag_power: 0.5
hop_length: 11.6099
win_length: 23.2199
compute_stft: &id001 !new:speechbrain.processing.features.STFT
    n_fft: 1024
    hop_length: 11.6099
    win_length: 23.2199
    sample_rate: 44100
compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
    n_mels: 80
    n_fft: 1024
    sample_rate: 44100
# Modules registered with the Brain class; aliases point to the objects
# constructed above (trainable ones are moved to `device` and checkpointed
# via the entries below).
modules:
    compute_stft: *id001
    compute_fbank: *id002
    compute_features: *id003
    augment_wavedrop: *id004
    augment_speed: *id005
    add_rev: *id006
    add_noise: *id007
    add_rev_noise: *id008
    embedding_model: *id009
    classifier: *id010
    mean_var_norm: *id011
# Additive-angular-margin softmax loss (margin 0.2, scale 30).
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
    loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
        margin: 0.2
        scale: 30
# compute_error: !name:speechbrain.nnet.losses.classification_error
# Partial constructor: the Brain class instantiates Adam with model params.
opt_class: !name:torch.optim.Adam
    lr: 0.0002
    weight_decay: 0.000002
# Cyclic LR between base_lr and max_lr with half-cycle of step_size steps
# (values mirror base_lr/max_lr/step_size defined near the top).
lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
    base_lr: 0.00000001
    max_lr: 0.0002
    step_size: 65000
# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: ./results/cnn14/11/train_log.txt
# Classification-error metric aggregated per batch.
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch
# Checkpointer saves/restores model, classifier, normalizer stats, and epoch.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: ./results/cnn14/11/save
    recoverables:
        embedding_model: *id009
        classifier: *id010
        normalizer: *id011
        counter: *id012
# Loads pretrained backbone/classifier weights (only used if use_pretrain).
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>