tdnn-vox2 / hyperparams.yaml

Upload 4 files

6f41a81 over 2 years ago

5.99 kB

	# Generated 2022-11-24 from:
	# /home/pcp22wc/exps/speaker-recognition/hparams/train_tdnn.yaml
	# yamllint disable
	# ################################
	# Model: Speaker identification with Vanilla TDNN (Xvector)
	# Authors: Yang Wang
	# ################################

	# Basic parameters
	# The seed value 914 is written out literally in the torch.manual_seed call
	# and in every results/tdnn_augment/914 path in this file; change them together.
	seed: 914
	__set_seed: !apply:torch.manual_seed [914]
	# output_folder is the common prefix of save_folder and train_log.
	output_folder: results/tdnn_augment/914
	save_folder: results/tdnn_augment/914/save
	train_log: results/tdnn_augment/914/train_log.txt

	# Data files
	# data_folder is one plain string holding two comma-separated dataset roots
	# (VoxCeleb2 dev and VoxCeleb1 test).
	# NOTE(review): presumably the consumer splits this value on "," - confirm.
	data_folder: /fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test # e.g. /path/to/Voxceleb
	# Both annotation CSVs are located under save_folder.
	train_annotation: results/tdnn_augment/914/save/train.csv
	valid_annotation: results/tdnn_augment/914/save/dev.csv

	# Folders and CSV manifests used for data augmentation.
	# rir_folder has the same value as openrir_folder in the EnvCorrupt entries
	# further down.
	rir_folder: /fastdata/pcp22wc/audio # Change it if needed
	musan_folder: /fastdata/pcp22wc/audio/musan
	# These three paths are the noise_csv values of the add_music_musan,
	# add_noise_musan and add_speech_musan augmenters respectively.
	music_csv: results/tdnn_augment/914/save/music.csv
	noise_csv: results/tdnn_augment/914/save/noise.csv
	speech_csv: results/tdnn_augment/914/save/speech.csv

	# Use the following links for the official VoxCeleb verification splits:
	# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
	# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
	# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
	# The VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
	# Therefore you cannot use any VoxCeleb1 files for training if you test with these lists.
	# The list selected here is the VoxCeleb1 (cleaned) one.
	verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt

	# NOTE(review): presumably `true` skips the dataset preparation step -
	# confirm against the training script that reads this key.
	skip_prep: true
	ckpt_interval_minutes: 15 # save checkpoint every N min

	# Training parameters
	# This file is a generated, fully expanded copy, so several of these values
	# are repeated literally further down and must be changed together:
	#   30            -> the limit of epoch_counter and epoch_count of lr_annealing
	#   512 / true    -> batch_size / shuffle of dataloader_options
	#   0.001, 0.0001 -> lr of opt_class; initial_value / final_value of lr_annealing
	#   16000         -> sample_rate of augment_wavedrop and augment_speed
	number_of_epochs: 30
	batch_size: 512
	lr: 0.001
	lr_final: 0.0001
	# step_size is not referenced by any other entry visible in this file.
	step_size: 65000
	sample_rate: 16000
	sentence_len: 3.0 # seconds
	shuffle: true
	random_chunk: true

	# Feature parameters
	# The same 80 / false values are passed to compute_features (n_mels, deltas),
	# and 80 is also the in_channels of embedding_model.
	n_mels: 80
	deltas: false

	# Number of speakers; the same value is the out_neurons of the classifier.
	out_n_neurons: 5994 # 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2

	# DataLoader settings. The entries are nested under dataloader_options so
	# that they form its mapping instead of duplicating the top-level
	# batch_size / shuffle keys.
	dataloader_options:
	  batch_size: 512
	  shuffle: true
	  num_workers: 8

	# Functions
	# Fbank feature extractor, anchored as id009 for reuse in modules.
	# n_mels / deltas are constructor arguments and must be nested under the tag.
	compute_features: &id009 !new:speechbrain.lobes.features.Fbank
	  n_mels: 80
	  deltas: false

	# Xvector embedding network, anchored as id010 for reuse in modules and
	# in the checkpointer recoverables.
	embedding_model: &id010 !new:speechbrain.lobes.models.Xvector.Xvector
	  in_channels: 80
	  activation: !name:torch.nn.LeakyReLU
	  # The three per-block lists below each have tdnn_blocks (5) entries.
	  tdnn_blocks: 5
	  tdnn_channels: [512, 512, 512, 512, 1500]
	  tdnn_kernel_sizes: [5, 3, 3, 1, 1]
	  tdnn_dilations: [1, 2, 3, 1, 1]
	  # Same value as the classifier's input_size.
	  lin_neurons: 512

	# Classifier head, anchored as id011. input_size equals the embedding
	# model's lin_neurons; out_neurons equals out_n_neurons.
	classifier: &id011 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
	  input_size: 512
	  out_neurons: 5994

	# Epoch counter, anchored as id013 so the checkpointer can save it.
	# limit equals number_of_epochs.
	epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
	  limit: 30


	# TimeDomainSpecAugment with a single speed of 100; anchored as id001.
	augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
	  sample_rate: 16000
	  speeds: [100]

	# TimeDomainSpecAugment with speeds 95, 100 and 105; anchored as id002.
	augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
	  sample_rate: 16000
	  speeds: [95, 100, 105]

	# EnvCorrupt with reverb_prob 1.0 and noise_prob 0.0; anchored as id003.
	add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
	  openrir_folder: /fastdata/pcp22wc/audio
	  openrir_max_noise_len: 3.0 # seconds
	  reverb_prob: 1.0
	  noise_prob: 0.0
	  noise_snr_low: 0
	  noise_snr_high: 15
	  rir_scale_factor: 1.0

	# EnvCorrupt with reverb_prob 0.0 and noise_prob 1.0; anchored as id004.
	add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
	  openrir_folder: /fastdata/pcp22wc/audio
	  openrir_max_noise_len: 3.0 # seconds
	  reverb_prob: 0.0
	  noise_prob: 1.0
	  noise_snr_low: 0
	  noise_snr_high: 15
	  rir_scale_factor: 1.0

	# EnvCorrupt with both reverb_prob and noise_prob at 1.0; anchored as id005.
	add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
	  openrir_folder: /fastdata/pcp22wc/audio
	  openrir_max_noise_len: 3.0 # seconds
	  reverb_prob: 1.0
	  noise_prob: 1.0
	  noise_snr_low: 0
	  noise_snr_high: 15
	  rir_scale_factor: 1.0

	# EnvCorrupt drawing from the noise CSV (same path as the top-level
	# noise_csv key); anchored as id006.
	add_noise_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
	  noise_csv: results/tdnn_augment/914/save/noise.csv
	  babble_prob: 0.0
	  reverb_prob: 0.0
	  noise_prob: 1.0
	  noise_snr_low: 0
	  noise_snr_high: 15

	# EnvCorrupt drawing from the music CSV (same path as the top-level
	# music_csv key); anchored as id007.
	add_music_musan: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
	  noise_csv: results/tdnn_augment/914/save/music.csv
	  babble_prob: 0.0
	  reverb_prob: 0.0
	  noise_prob: 1.0
	  noise_snr_low: 0
	  noise_snr_high: 15

	# EnvCorrupt drawing from the speech CSV (same path as the top-level
	# speech_csv key); anchored as id008.
	add_speech_musan: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
	  noise_csv: results/tdnn_augment/914/save/speech.csv
	  babble_prob: 0.0
	  reverb_prob: 0.0
	  noise_prob: 1.0
	  noise_snr_low: 0
	  noise_snr_high: 15

	# Definition of the augmentation pipeline.
	# If concat_augment = False, the augmentation techniques are applied
	# in sequence. If concat_augment = True, all the augmented signals
	# are concatenated in a single big batch.
	#
	# Each entry is a YAML alias (*idNNN) to an augmenter anchored earlier in
	# this file. Without the leading "*" the entries would be plain strings
	# rather than references to those objects.
	augment_pipeline:
	  - *id001  # augment_wavedrop
	  - *id002  # augment_speed
	  - *id003  # add_rev
	  - *id004  # add_noise
	  - *id005  # add_rev_noise
	  - *id006  # add_noise_musan
	  - *id007  # add_music_musan
	  - *id008  # add_speech_musan
	concat_augment: true

	# Input normalization, anchored as id012 for reuse in modules and as the
	# checkpointer's "normalizer" recoverable.
	mean_var_norm: &id012 !new:speechbrain.processing.features.InputNormalization
	  norm_type: sentence
	  std_norm: false

	# Mapping of module names to the objects anchored earlier in this file.
	# Each value is an alias, so every entry refers to the same instance as
	# the top-level key of the same name.
	modules:
	  compute_features: *id009
	  augment_wavedrop: *id001
	  augment_speed: *id002
	  add_rev: *id003
	  add_noise: *id004
	  add_rev_noise: *id005
	  add_noise_musan: *id006
	  add_music_musan: *id007
	  add_speech_musan: *id008
	  embedding_model: *id010
	  classifier: *id011
	  mean_var_norm: *id012
	# Training loss: LogSoftmaxWrapper is constructed with loss_fn, which is
	# itself an AdditiveAngularMargin constructed with margin and scale.
	compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
	  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
	    margin: 0.2
	    scale: 30

	# compute_error: !name:speechbrain.nnet.losses.classification_error

	# Adam optimizer reference (!name) with its keyword arguments nested below.
	# lr equals the top-level lr and the scheduler's initial_value.
	opt_class: !name:torch.optim.Adam
	  lr: 0.001
	  weight_decay: 0.000002

	# Linear learning-rate schedule. initial_value / final_value equal the
	# top-level lr / lr_final, and epoch_count equals number_of_epochs.
	lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
	  initial_value: 0.001
	  final_value: 0.0001
	  epoch_count: 30

	# Logging + checkpoints
	# File logger; save_file is the same path as the top-level train_log key.
	train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
	  save_file: results/tdnn_augment/914/train_log.txt

	# MetricStats reference (!name) whose metric is classification_error;
	# reduction is an argument of that metric, so it nests one level deeper.
	error_stats: !name:speechbrain.utils.metric_stats.MetricStats
	  metric: !name:speechbrain.nnet.losses.classification_error
	    reduction: batch

	# Checkpointer writing to checkpoints_dir (same path as save_folder).
	# recoverables maps a name to each anchored object to be saved:
	# the embedding model, classifier, input normalizer and epoch counter.
	checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	  checkpoints_dir: results/tdnn_augment/914/save
	  recoverables:
	    embedding_model: *id010
	    classifier: *id011
	    normalizer: *id012
	    counter: *id013