Automatic Speech Recognition
NeMo
PyTorch
Icelandic
speech
audio
CTC
NeMo
QuartzNet
QuartzNet15x5
icelandic
Eval Results
carlosdanielhernandezmena committed on
Commit
bbb3d6f
1 Parent(s): a516085

Adding the acoustic model (.nemo) and the architecture (.yaml) to the Repo

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ stt_is_quartznet15x5_ft_ep56_875h.nemo filter=lfs diff=lfs merge=lfs -text
QuartzNet_FT15x5_Icelandic.yaml ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: &name "QuartzNet15x5"
2
+
3
+ model:
4
+ sample_rate: &sample_rate 16000
5
+ repeat: &repeat 5
6
+ dropout: &dropout 0.0
7
+ separable: &separable true
8
+ labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "á", "æ", "é", "í", "ð", "ó", "ö", "ú", "ý", "þ"]
9
+
10
+ train_ds:
11
+ manifest_filepath: ???
12
+ sample_rate: 16000
13
+ labels: *labels
14
+ batch_size: 16 ##########################
15
+ trim_silence: True
16
+ max_duration: 16.7
17
+ shuffle: True
18
+ num_workers: 8
19
+ pin_memory: true
20
+ # tarred datasets
21
+ is_tarred: false
22
+ tarred_audio_filepaths: null
23
+ shuffle_n: 2048
24
+ # bucketing params
25
+ bucketing_strategy: "synced_randomized"
26
+ bucketing_batch_size: null
27
+
28
+ validation_ds:
29
+ manifest_filepath: ???
30
+ sample_rate: 16000
31
+ labels: *labels
32
+ batch_size: 16 ##########################
33
+ shuffle: False
34
+ num_workers: 8
35
+ pin_memory: true
36
+
37
+ preprocessor:
38
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
39
+ normalize: "per_feature"
40
+ window_size: 0.02
41
+ sample_rate: *sample_rate
42
+ window_stride: 0.01
43
+ window: "hann"
44
+ features: &n_mels 64
45
+ n_fft: 512
46
+ frame_splicing: 1
47
+ dither: 1.0e-05
48
+
49
+ spec_augment:
50
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
51
+ rect_freq: 50
52
+ rect_masks: 5
53
+ rect_time: 120
54
+
55
+ encoder:
56
+ _target_: nemo.collections.asr.modules.ConvASREncoder
57
+ feat_in: *n_mels
58
+ activation: relu
59
+ conv_mask: true
60
+
61
+ jasper:
62
+ #1
63
+ - dilation: [1]
64
+ dropout: *dropout
65
+ filters: 256
66
+ kernel: [33]
67
+ repeat: 1
68
+ residual: false
69
+ separable: *separable
70
+ stride: [2]
71
+ #2
72
+ - dilation: [1]
73
+ dropout: *dropout
74
+ filters: 256
75
+ kernel: [33]
76
+ repeat: *repeat
77
+ residual: true
78
+ separable: *separable
79
+ stride: [1]
80
+ #3
81
+ - dilation: [1]
82
+ dropout: *dropout
83
+ filters: 256
84
+ kernel: [33]
85
+ repeat: *repeat
86
+ residual: true
87
+ separable: *separable
88
+ stride: [1]
89
+ #4
90
+ - dilation: [1]
91
+ dropout: *dropout
92
+ filters: 256
93
+ kernel: [33]
94
+ repeat: *repeat
95
+ residual: true
96
+ separable: *separable
97
+ stride: [1]
98
+ #5
99
+ - dilation: [1]
100
+ dropout: *dropout
101
+ filters: 256
102
+ kernel: [39]
103
+ repeat: *repeat
104
+ residual: true
105
+ separable: *separable
106
+ stride: [1]
107
+ #6
108
+ - dilation: [1]
109
+ dropout: *dropout
110
+ filters: 256
111
+ kernel: [39]
112
+ repeat: *repeat
113
+ residual: true
114
+ separable: *separable
115
+ stride: [1]
116
+ #7
117
+ - dilation: [1]
118
+ dropout: *dropout
119
+ filters: 256
120
+ kernel: [39]
121
+ repeat: *repeat
122
+ residual: true
123
+ separable: *separable
124
+ stride: [1]
125
+ #8
126
+ - dilation: [1]
127
+ dropout: *dropout
128
+ filters: 512
129
+ kernel: [51]
130
+ repeat: *repeat
131
+ residual: true
132
+ separable: *separable
133
+ stride: [1]
134
+ #9
135
+ - dilation: [1]
136
+ dropout: *dropout
137
+ filters: 512
138
+ kernel: [51]
139
+ repeat: *repeat
140
+ residual: true
141
+ separable: *separable
142
+ stride: [1]
143
+ #10
144
+ - dilation: [1]
145
+ dropout: *dropout
146
+ filters: 512
147
+ kernel: [51]
148
+ repeat: *repeat
149
+ residual: true
150
+ separable: *separable
151
+ stride: [1]
152
+ #11
153
+ - dilation: [1]
154
+ dropout: *dropout
155
+ filters: 512
156
+ kernel: [63]
157
+ repeat: *repeat
158
+ residual: true
159
+ separable: *separable
160
+ stride: [1]
161
+ #12
162
+ - dilation: [1]
163
+ dropout: *dropout
164
+ filters: 512
165
+ kernel: [63]
166
+ repeat: *repeat
167
+ residual: true
168
+ separable: *separable
169
+ stride: [1]
170
+ #13
171
+ - dilation: [1]
172
+ dropout: *dropout
173
+ filters: 512
174
+ kernel: [63]
175
+ repeat: *repeat
176
+ residual: true
177
+ separable: *separable
178
+ stride: [1]
179
+ #14
180
+ - dilation: [1]
181
+ dropout: *dropout
182
+ filters: 512
183
+ kernel: [75]
184
+ repeat: *repeat
185
+ residual: true
186
+ separable: *separable
187
+ stride: [1]
188
+ #15
189
+ - dilation: [1]
190
+ dropout: *dropout
191
+ filters: 512
192
+ kernel: [75]
193
+ repeat: *repeat
194
+ residual: true
195
+ separable: *separable
196
+ stride: [1]
197
+ #16
198
+ - dilation: [1]
199
+ dropout: *dropout
200
+ filters: 512
201
+ kernel: [75]
202
+ repeat: *repeat
203
+ residual: true
204
+ separable: *separable
205
+ stride: [1]
206
+ #17
207
+ - dilation: [2]
208
+ dropout: *dropout
209
+ filters: 512
210
+ kernel: [87]
211
+ repeat: 1
212
+ residual: false
213
+ separable: *separable
214
+ stride: [1]
215
+ #18
216
+ - dilation: [1]
217
+ dropout: *dropout
218
+ filters: &enc_filters 1024
219
+ kernel: [1]
220
+ repeat: 1
221
+ residual: false
222
+ stride: [1]
223
+
224
+ decoder:
225
+ _target_: nemo.collections.asr.modules.ConvASRDecoder
226
+ feat_in: *enc_filters
227
+ num_classes: 37
228
+ vocabulary: *labels
229
+
230
+ optim:
231
+ name: novograd
232
+ # _target_: nemo.core.optim.optimizers.Novograd
233
+ lr: 0.0012
234
+ # optimizer arguments
235
+ betas: [0.95, 0.25]
236
+ weight_decay: 0.001
237
+
238
+ # scheduler setup
239
+ sched:
240
+ name: CosineAnnealing
241
+
242
+ # pytorch lightning args
243
+ # monitor: val_loss
244
+ # reduce_on_plateau: false
245
+
246
+ # Scheduler params
247
+ warmup_steps: null
248
+ warmup_ratio: null
249
+ min_lr: 0.0
250
+ last_epoch: -1
251
+
252
+ trainer:
253
+ devices: 1 # number of gpus
254
+ max_epochs: 5
255
+ max_steps: -1 # computed at runtime if not set
256
+ num_nodes: 1
257
+ accelerator: gpu
258
+ strategy: ddp
259
+ accumulate_grad_batches: 1
260
+ enable_checkpointing: False # Provided by exp_manager
261
+ logger: False # Provided by exp_manager
262
+ log_every_n_steps: 1 # Interval of logging.
263
+ val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
264
+ benchmark: false # must stay false for models with variable-length speech input, because enabling it slows down training
265
+
266
+ exp_manager:
267
+ exp_dir: null
268
+ name: *name
269
+ create_tensorboard_logger: True
270
+ create_checkpoint_callback: True
271
+ checkpoint_callback_params:
272
+ monitor: "val_wer"
273
+ mode: "min"
274
+ create_wandb_logger: False
275
+ wandb_logger_kwargs:
276
+ name: null
277
+ project: null
278
+
stt_is_quartznet15x5_ft_ep56_875h.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470116b575b5e9a84f5bbfad7755c5b7e13b364bb99e2dc15137920669353a29
3
+ size 76359680