# Generated 2022-11-24 from:
# /home/pcp22wc/exps/speaker-recognition/hparams/train_tdnn.yaml
# yamllint disable
# ################################
# Model: Speaker identification with Vanilla TDNN (Xvector)
# Authors: Yang Wang
# ################################

# Basic parameters
seed: 914
__set_seed: !apply:torch.manual_seed [914]
output_folder: results/tdnn_augment/914
save_folder: results/tdnn_augment/914/save
train_log: results/tdnn_augment/914/train_log.txt

# Data files
data_folder: /fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test # e.g. /path/to/Voxceleb
train_annotation: results/tdnn_augment/914/save/train.csv
valid_annotation: results/tdnn_augment/914/save/dev.csv

# Folder to extract data augmentation files
rir_folder: /fastdata/pcp22wc/audio # Change it if needed
musan_folder: /fastdata/pcp22wc/audio/musan
music_csv: results/tdnn_augment/914/save/music.csv
noise_csv: results/tdnn_augment/914/save/noise.csv
speech_csv: results/tdnn_augment/914/save/speech.csv

# Use the following links for the official voxceleb splits:
# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
# VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
# Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt

skip_prep: true
ckpt_interval_minutes: 15 # save checkpoint every N min

# Training parameters
number_of_epochs: 30
batch_size: 512
lr: 0.001
lr_final: 0.0001
step_size: 65000
sample_rate: 16000
sentence_len: 3.0 # seconds
shuffle: true
random_chunk: true

# Feature parameters
n_mels: 80
deltas: false

# Number of speakers
out_n_neurons: 5994 # 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2

dataloader_options:
  batch_size: 512
  shuffle: true
  num_workers: 8

# Functions
compute_features: &id009 !new:speechbrain.lobes.features.Fbank
  n_mels: 80
  deltas: false

embedding_model: &id010 !new:speechbrain.lobes.models.Xvector.Xvector
  in_channels: 80
  activation: !name:torch.nn.LeakyReLU
  tdnn_blocks: 5
  tdnn_channels: [512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 3, 3, 1, 1]
  tdnn_dilations: [1, 2, 3, 1, 1]
  lin_neurons: 512

classifier: &id011 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 512
  out_neurons: 5994

epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 30

augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [100]

augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]

add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0 # seconds
  reverb_prob: 1.0
  noise_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0 # seconds
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0 # seconds
  reverb_prob: 1.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0
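# MUSAN-based corrupters: each of the three augmenters below draws additive
# noise from one of the CSV manifests defined near the top of this file
# (noise_csv, music_csv, speech_csv), which the recipe's data-preparation
# step is expected to have written into save_folder before training starts.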
add_noise_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: results/tdnn_augment/914/save/noise.csv
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

add_music_musan: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: results/tdnn_augment/914/save/music.csv
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

add_speech_musan: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: results/tdnn_augment/914/save/speech.csv
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.
augment_pipeline: [*id001, *id002, *id003, *id004, *id005, *id006, *id007, *id008]
concat_augment: true

mean_var_norm: &id012 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false

modules:
  compute_features: *id009
  augment_wavedrop: *id001
  augment_speed: *id002
  add_rev: *id003
  add_noise: *id004
  add_rev_noise: *id005
  add_noise_musan: *id006
  add_music_musan: *id007
  add_speech_musan: *id008
  embedding_model: *id010
  classifier: *id011
  mean_var_norm: *id012

compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    margin: 0.2
    scale: 30

# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
  lr: 0.001
  weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.001
  final_value: 0.0001
  epoch_count: 30

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/tdnn_augment/914/train_log.txt

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
  reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/tdnn_augment/914/save
  recoverables:
    embedding_model: *id010
    classifier: *id011
    normalizer: *id012
    counter: *id013
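# A minimal sketch (not part of the original recipe) of how a SpeechBrain
# training script typically consumes a HyperPyYAML file like this one; the
# script name, override keys, and paths below are assumptions for illustration:
#
#   import sys
#   import speechbrain as sb
#   from hyperpyyaml import load_hyperpyyaml
#
#   # e.g. python train_speaker_embeddings.py hparams/train_tdnn.yaml \
#   #          --data_folder=/path/to/VoxCeleb
#   hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
#   with open(hparams_file) as fin:
#       hparams = load_hyperpyyaml(fin, overrides)
#
#   # After loading, every !new: entry above is an instantiated object, e.g.
#   # hparams["embedding_model"], hparams["classifier"], hparams["checkpointer"],
#   # and the *idNNN aliases (modules, recoverables) point to the same instances.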