Peng Wei committed on
Commit
db346ec
1 Parent(s): c555f66

split the data to two repos

Browse files
conf/titanet-finetune.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: &name "TitaNet-Finetune"
2
+ sample_rate: &sample_rate 16000
3
+
4
+ init_from_pretrained_model:
5
+ speaker_tasks:
6
+ name: 'titanet_large'
7
+ include: ["preprocessor","encoder"]
8
+ exclude: ["decoder.final"] # Add specific layer names here to exclude, or just ["decoder"] to exclude all of the decoder's pretrained weights
9
+
10
+ model:
11
+ train_ds:
12
+ manifest_filepath: ???
13
+ sample_rate: 16000
14
+ labels: null
15
+ batch_size: 64
16
+ shuffle: True
17
+ is_tarred: False
18
+ tarred_audio_filepaths: null
19
+ tarred_shard_strategy: "scatter"
20
+ augmentor:
21
+ speed:
22
+ prob: 0.3
23
+ sr: *sample_rate
24
+ resample_type: 'kaiser_fast'
25
+ min_speed_rate: 0.95
26
+ max_speed_rate: 1.05
27
+
28
+ validation_ds:
29
+ manifest_filepath: ???
30
+ sample_rate: 16000
31
+ labels: null
32
+ batch_size: 128
33
+ shuffle: False
34
+
35
+ test_ds:
36
+ manifest_filepath: ???
37
+ sample_rate: 16000
38
+ labels: null
39
+ batch_size: 1
40
+ shuffle: False
41
+ embedding_dir: './embeddings'
42
+
43
+ model_defaults:
44
+ filters: 1024
45
+ repeat: 3
46
+ dropout: 0.1
47
+ separable: true
48
+ se: true
49
+ se_context_size: -1
50
+ kernel_size_factor: 1.0
51
+
52
+ preprocessor:
53
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
54
+ normalize: "per_feature"
55
+ window_size: 0.025
56
+ sample_rate: *sample_rate
57
+ window_stride: 0.01
58
+ window: "hann"
59
+ features: &n_mels 80
60
+ n_fft: 512
61
+ frame_splicing: 1
62
+ dither: 0.00001
63
+
64
+ encoder:
65
+ _target_: nemo.collections.asr.modules.ConvASREncoder
66
+ feat_in: *n_mels
67
+ activation: relu
68
+ conv_mask: true
69
+
70
+ jasper:
71
+ - filters: ${model.model_defaults.filters}
72
+ repeat: 1
73
+ kernel: [3]
74
+ stride: [1]
75
+ dilation: [1]
76
+ dropout: 0.0
77
+ residual: false
78
+ separable: ${model.model_defaults.separable}
79
+ se: ${model.model_defaults.se}
80
+ se_context_size: ${model.model_defaults.se_context_size}
81
+
82
+ - filters: ${model.model_defaults.filters}
83
+ repeat: ${model.model_defaults.repeat}
84
+ kernel: [7]
85
+ stride: [1]
86
+ dilation: [1]
87
+ dropout: ${model.model_defaults.dropout}
88
+ residual: true
89
+ separable: ${model.model_defaults.separable}
90
+ se: ${model.model_defaults.se}
91
+ se_context_size: ${model.model_defaults.se_context_size}
92
+
93
+ - filters: ${model.model_defaults.filters}
94
+ repeat: ${model.model_defaults.repeat}
95
+ kernel: [11]
96
+ stride: [1]
97
+ dilation: [1]
98
+ dropout: ${model.model_defaults.dropout}
99
+ residual: true
100
+ separable: ${model.model_defaults.separable}
101
+ se: ${model.model_defaults.se}
102
+ se_context_size: ${model.model_defaults.se_context_size}
103
+
104
+ - filters: ${model.model_defaults.filters}
105
+ repeat: ${model.model_defaults.repeat}
106
+ kernel: [15]
107
+ stride: [1]
108
+ dilation: [1]
109
+ dropout: ${model.model_defaults.dropout}
110
+ residual: true
111
+ separable: ${model.model_defaults.separable}
112
+ se: ${model.model_defaults.se}
113
+ se_context_size: ${model.model_defaults.se_context_size}
114
+
115
+ - filters: &enc_feat_out 3072
116
+ repeat: 1
117
+ kernel: [1]
118
+ stride: [1]
119
+ dilation: [1]
120
+ dropout: 0.0
121
+ residual: false
122
+ separable: ${model.model_defaults.separable}
123
+ se: ${model.model_defaults.se}
124
+ se_context_size: ${model.model_defaults.se_context_size}
125
+
126
+ decoder:
127
+ _target_: nemo.collections.asr.modules.SpeakerDecoder
128
+ feat_in: *enc_feat_out
129
+ num_classes: ???
130
+ pool_mode: 'attention'
131
+ emb_sizes: 192
132
+
133
+ loss:
134
+ _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss # you could also use cross-entropy loss
135
+ scale: 30
136
+ margin: 0.2
137
+
138
+ optim_param_groups:
139
+ encoder:
140
+ lr: .001
141
+
142
+ optim:
143
+ name: adamw
144
+ lr: .0001 #(original titanet-large was trained with 0.08 lr)
145
+ weight_decay: 0.0002
146
+
147
+ # scheduler setup
148
+ sched:
149
+ name: CosineAnnealing
150
+ warmup_ratio: 0.1
151
+ min_lr: 0.0
152
+
153
+ trainer:
154
+ devices: 1 # number of gpus (original titanet-large was trained on 4 nodes with 8 gpus each)
155
+ max_epochs: 10
156
+ max_steps: -1 # computed at runtime if not set
157
+ num_nodes: 1
158
+ accelerator: gpu
159
+ strategy: ddp
160
+ deterministic: True
161
+ enable_checkpointing: False
162
+ logger: False
163
+ log_every_n_steps: 1 # Interval of logging.
164
+ val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
165
+ gradient_clip_val: 1.0
166
+
167
+ exp_manager:
168
+ exp_dir: null
169
+ name: *name
170
+ create_tensorboard_logger: True
171
+ create_checkpoint_callback: True
data/cv-corpus-15.0-2023-09-08/pt.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:118caf67b881210258e2b249a90cbdebb9ecb4cf34601990008eb8b8444d49d1
3
+ size 16925448567