yangwang825 committed
Commit 6f41a81
1 Parent(s): 44ddddf

Upload 4 files

Files changed (4)
  1. classifier.ckpt +3 -0
  2. embedding_model.ckpt +3 -0
  3. hyperparams.yaml +199 -0
  4. label_encoder.txt +0 -0
classifier.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08a81ebd44f0894c6ce55b6670516a15687ddf1db249d63f96b85c9bdea306d0
+ size 12276596
embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1eabee3a0046d37e8a436fdb99f3dfd0a6b04b5fddbdaf005382c201fa76ea6c
+ size 17460526
hyperparams.yaml ADDED
@@ -0,0 +1,199 @@
+ # Generated 2022-11-24 from:
+ # /home/pcp22wc/exps/speaker-recognition/hparams/train_tdnn.yaml
+ # yamllint disable
+ # ################################
+ # Model: Speaker identification with Vanilla TDNN (Xvector)
+ # Authors: Yang Wang
+ # ################################
+
+ # Basic parameters
+ seed: 914
+ __set_seed: !apply:torch.manual_seed [914]
+ output_folder: results/tdnn_augment/914
+ save_folder: results/tdnn_augment/914/save
+ train_log: results/tdnn_augment/914/train_log.txt
+
+ # Data files
+ data_folder: /fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test # e.g. /path/to/Voxceleb
+ train_annotation: results/tdnn_augment/914/save/train.csv
+ valid_annotation: results/tdnn_augment/914/save/dev.csv
+
+ # Folder to extract data augmentation files
+ rir_folder: /fastdata/pcp22wc/audio # Change it if needed
+ musan_folder: /fastdata/pcp22wc/audio/musan
+ music_csv: results/tdnn_augment/914/save/music.csv
+ noise_csv: results/tdnn_augment/914/save/noise.csv
+ speech_csv: results/tdnn_augment/914/save/speech.csv
+
+ # Use the following links for the official voxceleb splits:
+ # VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
+ # VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
+ # VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt.
+ # VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
+ # Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
+ verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
+
+ skip_prep: true
+ ckpt_interval_minutes: 15 # save checkpoint every N min
+
+ # Training parameters
+ number_of_epochs: 30
+ batch_size: 512
+ lr: 0.001
+ lr_final: 0.0001
+ step_size: 65000
+ sample_rate: 16000
+ sentence_len: 3.0 # seconds
+ shuffle: true
+ random_chunk: true
+
+ # Feature parameters
+ n_mels: 80
+ deltas: false
+
+ # Number of speakers
+ out_n_neurons: 5994 #1211 for vox1 # 5994 for vox2, 7205 for vox1+vox2
+
+ dataloader_options:
+     batch_size: 512
+     shuffle: true
+     num_workers: 8
+
+ # Functions
+ compute_features: &id009 !new:speechbrain.lobes.features.Fbank
+     n_mels: 80
+     deltas: false
+
+ embedding_model: &id010 !new:speechbrain.lobes.models.Xvector.Xvector
+     in_channels: 80
+     activation: !name:torch.nn.LeakyReLU
+     tdnn_blocks: 5
+     tdnn_channels: [512, 512, 512, 512, 1500]
+     tdnn_kernel_sizes: [5, 3, 3, 1, 1]
+     tdnn_dilations: [1, 2, 3, 1, 1]
+     lin_neurons: 512
+
+ classifier: &id011 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
+     input_size: 512
+     out_neurons: 5994
+
+ epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
+     limit: 30
+
+
+ augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+     sample_rate: 16000
+     speeds: [100]
+
+ augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+     sample_rate: 16000
+     speeds: [95, 100, 105]
+
+ add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
+     openrir_folder: /fastdata/pcp22wc/audio
+     openrir_max_noise_len: 3.0 # seconds
+     reverb_prob: 1.0
+     noise_prob: 0.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+     rir_scale_factor: 1.0
+
+ add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
+     openrir_folder: /fastdata/pcp22wc/audio
+     openrir_max_noise_len: 3.0 # seconds
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+     rir_scale_factor: 1.0
+
+ add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
+     openrir_folder: /fastdata/pcp22wc/audio
+     openrir_max_noise_len: 3.0 # seconds
+     reverb_prob: 1.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+     rir_scale_factor: 1.0
+
+ add_noise_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
+     noise_csv: results/tdnn_augment/914/save/noise.csv
+     babble_prob: 0.0
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+
+ add_music_musan: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
+     noise_csv: results/tdnn_augment/914/save/music.csv
+     babble_prob: 0.0
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+
+ add_speech_musan: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
+     noise_csv: results/tdnn_augment/914/save/speech.csv
+     babble_prob: 0.0
+     reverb_prob: 0.0
+     noise_prob: 1.0
+     noise_snr_low: 0
+     noise_snr_high: 15
+
+ # Definition of the augmentation pipeline.
+ # If concat_augment = False, the augmentation techniques are applied
+ # in sequence. If concat_augment = True, all the augmented signals
+ # are concatenated in a single big batch.
+
+ augment_pipeline: [*id001, *id002, *id003, *id004, *id005, *id006, *id007, *id008]
+ concat_augment: true
+
+ mean_var_norm: &id012 !new:speechbrain.processing.features.InputNormalization
+
+     norm_type: sentence
+     std_norm: false
+
+ modules:
+     compute_features: *id009
+     augment_wavedrop: *id001
+     augment_speed: *id002
+     add_rev: *id003
+     add_noise: *id004
+     add_rev_noise: *id005
+     add_noise_musan: *id006
+     add_music_musan: *id007
+     add_speech_musan: *id008
+     embedding_model: *id010
+     classifier: *id011
+     mean_var_norm: *id012
+ compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
+     loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
+         margin: 0.2
+         scale: 30
+
+ # compute_error: !name:speechbrain.nnet.losses.classification_error
+
+ opt_class: !name:torch.optim.Adam
+     lr: 0.001
+     weight_decay: 0.000002
+
+ lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
+     initial_value: 0.001
+     final_value: 0.0001
+     epoch_count: 30
+
+ # Logging + checkpoints
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+     save_file: results/tdnn_augment/914/train_log.txt
+
+ error_stats: !name:speechbrain.utils.metric_stats.MetricStats
+     metric: !name:speechbrain.nnet.losses.classification_error
+         reduction: batch
+
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+     checkpoints_dir: results/tdnn_augment/914/save
+     recoverables:
+         embedding_model: *id010
+         classifier: *id011
+         normalizer: *id012
+         counter: *id013
+
label_encoder.txt ADDED
The diff for this file is too large to render. See raw diff
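For reference: hyperparams.yaml above is the training configuration, embedding_model.ckpt and classifier.ckpt hold the weights of the Xvector embedding model and the classifier head it defines, and label_encoder.txt maps the classifier outputs back to speaker IDs. Below is a minimal inference sketch using SpeechBrain's pretrained EncoderClassifier interface. It is only a sketch: the repo id is hypothetical, and it assumes an inference-style hyperparams.yaml exposing the same embedding_model, classifier, and label encoder is available at that source; adjust both to this repository's actual layout.

import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Hypothetical repo id -- point this at the repository that actually hosts these four files.
classifier = EncoderClassifier.from_hparams(
    source="yangwang825/tdnn-xvector-voxceleb",
    savedir="pretrained_models/tdnn-xvector-voxceleb",
)

# Load a 16 kHz waveform and compute the 512-dimensional x-vector embedding
# (lin_neurons: 512 in the config above).
signal, fs = torchaudio.load("example.wav")
embedding = classifier.encode_batch(signal)

# Run the full pipeline (Fbank -> mean/var norm -> Xvector -> classifier head)
# to get the predicted speaker label.
out_prob, score, index, text_lab = classifier.classify_batch(signal)
print(text_lab)

classify_batch resolves the predicted index to a label string via the label encoder, which for this configuration should correspond to the 5994 VoxCeleb2 training speakers (out_n_neurons: 5994).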