cnn14-esc50 / hyperparams.yaml

Update hyperparams.yaml

d6ae021 over 1 year ago

5.08 kB

	# Generated 2022-11-21 from:
	# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
	# yamllint disable
	# #################################
	# Basic training parameters for sound classification using the ESC50 dataset.
	# This recipe uses a CNN14 embedding model with the ECAPA-TDNN classifier head.
	#
	# Author:
	# * Cem Subakan
	# (based on the SpeechBrain UrbanSound8k recipe)
	# #################################

	# Seed needs to be set at top of yaml, before objects with parameters are made
	seed: 11
	# This key exists only for its load-time side effect: the tag below calls
	# torch.manual_seed with the argument list [11] while the file is parsed.
	# NOTE(review): `!!python/object/apply` is a Python-specific tag that safe
	# YAML loaders reject — confirm the consuming loader is meant to execute it.
	__set_seed: !!python/object/apply:torch.manual_seed [11]

	# Set up folders for reading from and writing to
	# Dataset must already exist at `audio_data_folder`
	data_folder: /data2/ESC-50-master
	# e.g., /localscratch/UrbanSound8K
	# The `!ref` tag is what makes the <data_folder> placeholder get substituted;
	# without it the value is the literal string "<data_folder>/RIRS".
	open_rir_folder: !ref <data_folder>/RIRS # Change if needed
	audio_data_folder: /data2/ESC-50-master/audio

	# TODO the following folder will contain the resampled audio
	# files (mono channel and config SR) to train on
	#resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
	#
	# Experiment name and the locations its outputs are written to
	# (results root, checkpoint directory, text training log).
	experiment_name: 'cnn14'
	output_folder: './results/cnn14/11'
	save_folder: './results/cnn14/11/save'
	train_log: './results/cnn14/11/train_log.txt'

	# NOTE(review): presumably makes the training script skip training and only
	# evaluate; the consuming script is not visible here — confirm.
	test_only: false

	# Tensorboard logs
	# Disabled here; the folder below is presumably only used when
	# use_tensorboard is true — TODO confirm against the training script.
	use_tensorboard: false
	tensorboard_logs_folder: ./results/cnn14/11/tb_logs/

	# Locations of the JSON data manifest files, one per split
	train_annotation: '/data2/ESC-50-master/manifest/train.json'
	valid_annotation: '/data2/ESC-50-master/manifest/valid.json'
	test_annotation: '/data2/ESC-50-master/manifest/test.json'

	# To standardize results, ESC-50 comes pre-separated into 5 folds for
	# multi-fold validation; here folds 1-3 train, fold 4 validates, fold 5 tests
	# Fold ids assigned to each split, then the manifest-creation switch.
	train_fold_nums:
	  - 1
	  - 2
	  - 3
	valid_fold_nums:
	  - 4
	test_fold_nums:
	  - 5
	skip_manifest_creation: false

	ckpt_interval_minutes: 15 # save checkpoint every N min

	# Training parameters
	# This file is a generated dump with references already resolved, so several
	# of these values are repeated literally further down. Changing one here does
	# not change the copy; edit both places together.
	number_of_epochs: 200  # same value as the `limit` given to epoch_counter
	batch_size: 32  # repeated at dataloader_options
	lr: 0.0002  # repeated at opt_class
	base_lr: 0.00000001  # repeated at lr_annealing
	max_lr: 0.0002  # repeated at lr_annealing
	step_size: 65000  # repeated at lr_annealing
	sample_rate: 44100  # repeated at compute_features, compute_stft, compute_fbank

	# NOTE(review): hard-coded to cpu in this dump — confirm this is intended.
	device: cpu

	# Feature parameters
	# n_mels / left_frames / right_frames / deltas are repeated literally at
	# compute_features further down; n_mels also matches mel_bins and the
	# filterbank's n_mels.
	n_mels: 80
	left_frames: 0
	right_frames: 0
	deltas: false
	# NOTE(review): amp_to_db and normalize are not referenced by any object
	# visible in this file — presumably read by the training script; confirm.
	amp_to_db: true
	normalize: true

	# Number of classes
	out_n_neurons: 50  # same value as the classifier's out_neurons

	# Note that it's actually important to shuffle the data here
	# (or at the very least, not sort the data by duration)
	# Also note that this does not violate the UrbanSound8k "no-shuffle" policy
	# because this does not mix samples from folds in train to valid/test, only
	# within train or valid, or test
	shuffle: true
	# Data-loader settings. They are nested under `dataloader_options` so they
	# form its mapping instead of redefining the top-level `batch_size` and
	# `shuffle` keys (which would also leave `dataloader_options` null).
	dataloader_options:
	  batch_size: 32
	  shuffle: true
	  num_workers: 0

	# Functions
	# Fbank feature extractor. `!new:` instantiates the class and the nested
	# keys are its constructor arguments, so they must be indented under it.
	# NOTE(review): win_length / hop_length here (20 / 10) differ from the
	# top-level pre-processing values (23.2199 / 11.6099) — confirm intended.
	compute_features: &id003 !new:speechbrain.lobes.features.Fbank
	  n_mels: 80
	  left_frames: 0
	  right_frames: 0
	  deltas: false
	  sample_rate: 44100
	  n_fft: 1024
	  win_length: 20
	  hop_length: 10

	use_pretrain: false
	# Cnn14 embedding network. mel_bins and emb_dim are its constructor
	# arguments and are nested accordingly; emb_dim matches the classifier's
	# input_size (2048).
	embedding_model: &id009 !new:speechbrain.lobes.models.Cnn14.Cnn14
	  mel_bins: 80
	  emb_dim: 2048

	# Classification head taken from the ECAPA_TDNN module. The nested keys are
	# its constructor arguments: input_size matches the embedding dimension and
	# out_neurons matches out_n_neurons (50 classes).
	classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
	  input_size: 2048
	  out_neurons: 50
	  lin_blocks: 1

	# Epoch counter; `limit` is its constructor argument and carries the same
	# value as number_of_epochs (200), so it is nested under the object.
	# NOTE(review): a comment that used to sit between the counter and `limit`
	# said "If you do not want to use the pretrained separator you can simply
	# delete pretrained_separator field." No such field is visible in this file,
	# so it looks like a leftover from another recipe — confirm and remove.
	epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
	  limit: 200


	# Definition of the augmentation pipeline.
	# If concat_augment = False, the augmentation techniques are applied
	# in sequence. If concat_augment = True, all the augmented signals
	# are concatenated in a single big batch.
	# The pipeline list is empty in this file.

	augment_pipeline: []
	concat_augment: true

	# Input normalization object. norm_type and std_norm are its constructor
	# arguments, so they are nested under it rather than left at top level.
	mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
	  norm_type: sentence
	  std_norm: false

	# pre-processing
	# n_fft, hop_length and win_length are repeated literally at compute_stft.
	n_fft: 1024
	# NOTE(review): not referenced by any object visible in this file —
	# presumably read by the training script; confirm.
	spec_mag_power: 0.5
	# presumably milliseconds: at 44100 Hz these come to about 512 and 1024
	# samples respectively — TODO confirm against the STFT documentation
	hop_length: 11.6099
	win_length: 23.2199
	# STFT object. The nested keys are its constructor arguments and repeat the
	# top-level pre-processing values and sample rate.
	compute_stft: &id001 !new:speechbrain.processing.features.STFT
	  n_fft: 1024
	  hop_length: 11.6099
	  win_length: 23.2199
	  sample_rate: 44100

	# Filterbank object. The nested keys are its constructor arguments and
	# repeat the top-level n_mels, n_fft and sample_rate values.
	compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
	  n_mels: 80
	  n_fft: 1024
	  sample_rate: 44100

	# Mapping of module names to the objects anchored earlier in the file.
	# Entries are nested so that `modules` is a mapping of aliases rather than
	# null followed by top-level keys that redefine the anchored definitions.
	modules:
	  compute_stft: *id001
	  compute_fbank: *id002
	  compute_features: *id003
	  embedding_model: *id009
	  classifier: *id010
	  mean_var_norm: *id011
	# Loss: a LogSoftmaxWrapper whose `loss_fn` argument is an
	# AdditiveAngularMargin object, itself built with `margin` and `scale`.
	# Two nesting levels are needed to express that ownership.
	compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
	  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
	    margin: 0.2
	    scale: 30

	# compute_error: !name:speechbrain.nnet.losses.classification_error

	# Optimizer factory: `!name:` refers to torch.optim.Adam with the nested
	# keys supplied as keyword arguments. lr repeats the top-level `lr` value.
	opt_class: !name:torch.optim.Adam
	  lr: 0.0002
	  weight_decay: 0.000002

	# Cyclic learning-rate scheduler. The nested keys are its constructor
	# arguments and repeat the top-level base_lr / max_lr / step_size values.
	lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
	  base_lr: 0.00000001
	  max_lr: 0.0002
	  step_size: 65000

	# Logging + checkpoints
	# File-based training logger; save_file is its constructor argument and
	# repeats the top-level `train_log` path.
	train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
	  save_file: ./results/cnn14/11/train_log.txt

	# Factory for a MetricStats tracker. Its `metric` argument is the
	# classification_error function with `reduction: batch` bound to it, so
	# `reduction` sits one level deeper than `metric`.
	error_stats: !name:speechbrain.utils.metric_stats.MetricStats
	  metric: !name:speechbrain.nnet.losses.classification_error
	    reduction: batch

	# Checkpointer writing to the save folder. `recoverables` maps names to the
	# anchored objects that get saved and restored; both it and checkpoints_dir
	# are constructor arguments, and the aliases are entries of `recoverables`.
	checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	  checkpoints_dir: ./results/cnn14/11/save
	  recoverables:
	    embedding_model: *id009
	    classifier: *id010
	    normalizer: *id011
	    counter: *id012

	# Pretrainer used to load stored parameters. `loadables` is its constructor
	# argument and maps names to the embedding model and classifier defined
	# earlier in this file (referenced with `!ref`).
	pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
	  loadables:
	    embedding_model: !ref <embedding_model>
	    classifier: !ref <classifier>