slplab committed on
Commit
caa37fe
·
1 Parent(s): 5668ddf

Initial commit for wav2vec2-large-robust pronunciation model

Browse files
config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-large-robust",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 768,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_dropout_prob": 0.1,
60
+ "hidden_size": 1024,
61
+ "initializer_range": 0.02,
62
+ "intermediate_size": 4096,
63
+ "layer_norm_eps": 1e-05,
64
+ "layerdrop": 0.1,
65
+ "mask_feature_length": 10,
66
+ "mask_feature_min_masks": 0,
67
+ "mask_feature_prob": 0.0,
68
+ "mask_time_length": 10,
69
+ "mask_time_min_masks": 2,
70
+ "mask_time_prob": 0.05,
71
+ "model_type": "wav2vec2",
72
+ "num_adapter_layers": 3,
73
+ "num_attention_heads": 16,
74
+ "num_codevector_groups": 2,
75
+ "num_codevectors_per_group": 320,
76
+ "num_conv_pos_embedding_groups": 16,
77
+ "num_conv_pos_embeddings": 128,
78
+ "num_feat_extract_layers": 7,
79
+ "num_hidden_layers": 24,
80
+ "num_negatives": 100,
81
+ "output_hidden_size": 1024,
82
+ "pad_token_id": 436,
83
+ "proj_codevector_dim": 768,
84
+ "tdnn_dilation": [
85
+ 1,
86
+ 2,
87
+ 3,
88
+ 1,
89
+ 1
90
+ ],
91
+ "tdnn_dim": [
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 512,
96
+ 1500
97
+ ],
98
+ "tdnn_kernel": [
99
+ 5,
100
+ 3,
101
+ 3,
102
+ 1,
103
+ 1
104
+ ],
105
+ "torch_dtype": "float32",
106
+ "transformers_version": "4.39.1",
107
+ "use_weighted_layer_sum": false,
108
+ "vocab_size": 440,
109
+ "xvector_output_dim": 512
110
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1511757b5f36ccab26fbea89be5514cfe4565bc77fd2a6c5661e92253c616c1
3
+ size 1263611496
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "436": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "437": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "438": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "439": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_phonemize": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "phone_delimiter_token": " ",
43
+ "phonemizer_backend": "espeak",
44
+ "phonemizer_lang": "en-us",
45
+ "processor_class": "Wav2Vec2Processor",
46
+ "tokenizer_class": "Wav2Vec2PhonemeCTCTokenizer",
47
+ "unk_token": "<unk>",
48
+ "word_delimiter_token": null
49
+ }
vocab.json ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 436,
3
+ "<unk>": 437,
4
+ "aa": 0,
5
+ "aa_ih": 1,
6
+ "aa_l": 2,
7
+ "aa_ng": 3,
8
+ "aa_r": 4,
9
+ "aa_uh": 5,
10
+ "aa_w": 6,
11
+ "aa_w_ih": 7,
12
+ "ae": 8,
13
+ "ae_ax": 9,
14
+ "ae_ax_l": 10,
15
+ "ae_ih": 11,
16
+ "ae_l": 12,
17
+ "ah": 13,
18
+ "ah_ax": 14,
19
+ "ah_f": 15,
20
+ "ah_ih": 16,
21
+ "ah_l": 17,
22
+ "ah_l_ih": 18,
23
+ "ah_l_s": 19,
24
+ "ah_m": 20,
25
+ "ah_n": 21,
26
+ "ah_p": 22,
27
+ "ah_r": 23,
28
+ "ah_uh": 24,
29
+ "ah_uw": 25,
30
+ "ah_w": 26,
31
+ "ao": 27,
32
+ "ao_l": 28,
33
+ "ao_r": 29,
34
+ "aw": 30,
35
+ "aw_ih": 31,
36
+ "aw_n": 32,
37
+ "ax": 33,
38
+ "ax_b": 34,
39
+ "ax_d": 35,
40
+ "ax_ih": 36,
41
+ "ax_k": 37,
42
+ "ax_l": 38,
43
+ "ax_l_eh": 39,
44
+ "ax_l_l": 40,
45
+ "ax_m": 41,
46
+ "ax_r": 42,
47
+ "ax_r_ih": 43,
48
+ "ax_z": 44,
49
+ "axr": 45,
50
+ "axr_ih": 46,
51
+ "axr_s": 47,
52
+ "axr_z": 48,
53
+ "ay": 49,
54
+ "ay_ae": 50,
55
+ "ay_ax": 51,
56
+ "ay_eh": 52,
57
+ "ay_l": 53,
58
+ "ay_m": 54,
59
+ "ay_n": 55,
60
+ "ay_r": 56,
61
+ "ay_t": 57,
62
+ "ay_w_ih": 58,
63
+ "b": 59,
64
+ "b*": 60,
65
+ "b_ah": 61,
66
+ "b_ah_l": 62,
67
+ "b_ax": 63,
68
+ "b_d": 64,
69
+ "b_eh": 65,
70
+ "b_eu": 66,
71
+ "b_ih": 67,
72
+ "b_ih_k": 68,
73
+ "b_uh": 69,
74
+ "bd": 70,
75
+ "ch": 71,
76
+ "ch_ax_w_ax": 72,
77
+ "ch_eu": 73,
78
+ "ch_ih": 74,
79
+ "ch_uh": 75,
80
+ "ch_uh_ah": 76,
81
+ "ch_y": 77,
82
+ "d": 78,
83
+ "d*": 79,
84
+ "d_ax": 80,
85
+ "d_d": 81,
86
+ "d_eu": 82,
87
+ "d_ey": 83,
88
+ "d_ih": 84,
89
+ "d_ih_d": 85,
90
+ "d_ih_d_eu": 86,
91
+ "d_ih_ng": 87,
92
+ "d_ix": 88,
93
+ "d_iy": 89,
94
+ "d_l": 90,
95
+ "d_o": 91,
96
+ "d_r": 92,
97
+ "d_t": 93,
98
+ "d_w": 94,
99
+ "d_y": 95,
100
+ "d_z": 96,
101
+ "dd": 97,
102
+ "dd_eu": 98,
103
+ "dh": 99,
104
+ "dh_ah": 100,
105
+ "dh_eh": 101,
106
+ "dh_eu": 102,
107
+ "dh_ey": 103,
108
+ "dx": 104,
109
+ "dx_eu": 105,
110
+ "dx_ih": 106,
111
+ "dx_ih_d": 107,
112
+ "dx_ix": 108,
113
+ "eh": 109,
114
+ "eh_ah": 110,
115
+ "eh_ax": 111,
116
+ "eh_ax_l_aa": 112,
117
+ "eh_g": 113,
118
+ "eh_ih": 114,
119
+ "eh_ih_aa": 115,
120
+ "eh_k": 116,
121
+ "eh_l": 117,
122
+ "eh_m": 118,
123
+ "eh_r": 119,
124
+ "eh_td": 120,
125
+ "eh_uw": 121,
126
+ "eh_w_ih": 122,
127
+ "er": 123,
128
+ "er_eh": 124,
129
+ "er_ih": 125,
130
+ "er_l": 126,
131
+ "er_n": 127,
132
+ "eu": 128,
133
+ "ey": 129,
134
+ "ey_ah": 130,
135
+ "ey_d": 131,
136
+ "ey_dd": 132,
137
+ "ey_f": 133,
138
+ "ey_ih": 134,
139
+ "ey_jh": 135,
140
+ "ey_p": 136,
141
+ "ey_s": 137,
142
+ "ey_w_ih": 138,
143
+ "ey_w_iy": 139,
144
+ "f": 140,
145
+ "f_ah": 141,
146
+ "f_ao": 142,
147
+ "f_ax": 143,
148
+ "f_eu": 144,
149
+ "f_r": 145,
150
+ "f_w": 146,
151
+ "g": 147,
152
+ "g*": 148,
153
+ "g*_w": 149,
154
+ "g_ax": 150,
155
+ "g_eu": 151,
156
+ "g_ih": 152,
157
+ "g_l": 153,
158
+ "g_uh": 154,
159
+ "g_y": 155,
160
+ "gd": 156,
161
+ "hh": 157,
162
+ "hh_ah": 158,
163
+ "hh_eu": 159,
164
+ "hh_ih": 160,
165
+ "hh_ix": 161,
166
+ "hh_ow": 162,
167
+ "hh_w": 163,
168
+ "hh_y": 164,
169
+ "ih": 165,
170
+ "ih_aa": 166,
171
+ "ih_ah": 167,
172
+ "ih_ax": 168,
173
+ "ih_ax_l": 169,
174
+ "ih_axr": 170,
175
+ "ih_d": 171,
176
+ "ih_eh": 172,
177
+ "ih_ih": 173,
178
+ "ih_k": 174,
179
+ "ih_l": 175,
180
+ "ih_n": 176,
181
+ "ih_n_ih": 177,
182
+ "ih_ng": 178,
183
+ "ih_r": 179,
184
+ "ih_s": 180,
185
+ "ih_td": 181,
186
+ "ih_uh": 182,
187
+ "ih_w_ih": 183,
188
+ "ih_z": 184,
189
+ "ix": 185,
190
+ "ix_d": 186,
191
+ "iy": 187,
192
+ "iy_aa": 188,
193
+ "iy_ah": 189,
194
+ "iy_ax": 190,
195
+ "iy_axr": 191,
196
+ "iy_d": 192,
197
+ "iy_l": 193,
198
+ "iy_m": 194,
199
+ "iy_n": 195,
200
+ "iy_ng": 196,
201
+ "iy_p": 197,
202
+ "iy_s": 198,
203
+ "iy_s_eu": 199,
204
+ "iy_z": 200,
205
+ "jh": 201,
206
+ "jh_ax": 202,
207
+ "jh_eu": 203,
208
+ "jh_ih": 204,
209
+ "jh_ih_s": 205,
210
+ "jh_ix": 206,
211
+ "jh_iy": 207,
212
+ "jh_uh": 208,
213
+ "jh_w": 209,
214
+ "k": 210,
215
+ "k_ah": 211,
216
+ "k_ao": 212,
217
+ "k_ax": 213,
218
+ "k_eu": 214,
219
+ "k_ih": 215,
220
+ "k_l": 216,
221
+ "k_o": 217,
222
+ "k_s": 218,
223
+ "k_s_eu": 219,
224
+ "k_t": 220,
225
+ "k_t_eu": 221,
226
+ "k_th": 222,
227
+ "k_uh": 223,
228
+ "k_w": 224,
229
+ "k_y": 225,
230
+ "kd": 226,
231
+ "kd_s": 227,
232
+ "l": 228,
233
+ "l_ah": 229,
234
+ "l_ah_l": 230,
235
+ "l_ax": 231,
236
+ "l_ay": 232,
237
+ "l_eh": 233,
238
+ "l_eu": 234,
239
+ "l_f": 235,
240
+ "l_ih": 236,
241
+ "l_ix": 237,
242
+ "l_s": 238,
243
+ "m": 239,
244
+ "m_ax": 240,
245
+ "m_b": 241,
246
+ "m_b_eu": 242,
247
+ "m_eh": 243,
248
+ "m_eu": 244,
249
+ "m_f": 245,
250
+ "m_ih": 246,
251
+ "m_ix": 247,
252
+ "m_m": 248,
253
+ "m_n": 249,
254
+ "m_s": 250,
255
+ "m_t_eu": 251,
256
+ "m_z": 252,
257
+ "n": 253,
258
+ "n_ax": 254,
259
+ "n_d": 255,
260
+ "n_d_eu": 256,
261
+ "n_dd": 257,
262
+ "n_eh": 258,
263
+ "n_eu": 259,
264
+ "n_ih": 260,
265
+ "n_ix": 261,
266
+ "n_n": 262,
267
+ "n_o": 263,
268
+ "n_s": 264,
269
+ "n_td": 265,
270
+ "n_y": 266,
271
+ "ng": 267,
272
+ "ng_ah": 268,
273
+ "ng_g": 269,
274
+ "ng_k": 270,
275
+ "ng_s": 271,
276
+ "o": 272,
277
+ "o_aa": 273,
278
+ "o_ah": 274,
279
+ "o_ax": 275,
280
+ "o_ih": 276,
281
+ "o_l": 277,
282
+ "o_n": 278,
283
+ "o_r": 279,
284
+ "o_s": 280,
285
+ "o_uh": 281,
286
+ "ow": 282,
287
+ "ow_ix": 283,
288
+ "ow_s": 284,
289
+ "ow_t": 285,
290
+ "ow_td": 286,
291
+ "oy": 287,
292
+ "oy_n": 288,
293
+ "p": 289,
294
+ "p_ah": 290,
295
+ "p_ax": 291,
296
+ "p_b": 292,
297
+ "p_eh": 293,
298
+ "p_eu": 294,
299
+ "p_ih": 295,
300
+ "p_ix": 296,
301
+ "p_l": 297,
302
+ "p_o": 298,
303
+ "p_r": 299,
304
+ "p_uh": 300,
305
+ "pd": 301,
306
+ "r": 302,
307
+ "r_aa": 303,
308
+ "r_ah": 304,
309
+ "r_ax": 305,
310
+ "r_ay": 306,
311
+ "r_eh": 307,
312
+ "r_eu": 308,
313
+ "r_ih": 309,
314
+ "r_l": 310,
315
+ "r_o": 311,
316
+ "r_s": 312,
317
+ "r_uh": 313,
318
+ "r_w": 314,
319
+ "r_w_ah_n": 315,
320
+ "r_y": 316,
321
+ "s": 317,
322
+ "s_ah": 318,
323
+ "s_ax": 319,
324
+ "s_ch": 320,
325
+ "s_d": 321,
326
+ "s_dd": 322,
327
+ "s_eh": 323,
328
+ "s_eu": 324,
329
+ "s_eu_l_ih": 325,
330
+ "s_ih": 326,
331
+ "s_ih_s": 327,
332
+ "s_ih_s_eu": 328,
333
+ "s_iy": 329,
334
+ "s_t": 330,
335
+ "s_t_eu": 331,
336
+ "s_td": 332,
337
+ "s_ts": 333,
338
+ "s_uh": 334,
339
+ "s_y": 335,
340
+ "sh": 336,
341
+ "sh_ax": 337,
342
+ "sh_ih": 338,
343
+ "sh_ih_dd": 339,
344
+ "sh_iy": 340,
345
+ "sh_uh": 341,
346
+ "sh_uw": 342,
347
+ "sil": 343,
348
+ "t": 344,
349
+ "t_ah": 345,
350
+ "t_ax": 346,
351
+ "t_ay": 347,
352
+ "t_eh": 348,
353
+ "t_eu": 349,
354
+ "t_eu_s": 350,
355
+ "t_eu_s_eu": 351,
356
+ "t_eu_t": 352,
357
+ "t_ih": 353,
358
+ "t_ih_d": 354,
359
+ "t_ix": 355,
360
+ "t_iy": 356,
361
+ "t_o": 357,
362
+ "t_r": 358,
363
+ "t_s": 359,
364
+ "t_uh": 360,
365
+ "t_uw": 361,
366
+ "t_y": 362,
367
+ "td": 363,
368
+ "th": 364,
369
+ "th_ax": 365,
370
+ "th_eu": 366,
371
+ "th_r": 367,
372
+ "ts": 368,
373
+ "ts_ax": 369,
374
+ "ts_eu": 370,
375
+ "ts_y": 371,
376
+ "uh": 372,
377
+ "uh_aa": 373,
378
+ "uh_ah": 374,
379
+ "uh_ah_l": 375,
380
+ "uh_ax": 376,
381
+ "uh_axr": 377,
382
+ "uh_er": 378,
383
+ "uh_ih": 379,
384
+ "uh_l": 380,
385
+ "uh_r": 381,
386
+ "uh_s": 382,
387
+ "uh_w_ao": 383,
388
+ "uw": 384,
389
+ "uw_ah": 385,
390
+ "uw_ax": 386,
391
+ "uw_er": 387,
392
+ "uw_ih": 388,
393
+ "uw_l": 389,
394
+ "uw_m": 390,
395
+ "uw_r": 391,
396
+ "uw_w_ih": 392,
397
+ "uw_w_iy": 393,
398
+ "v": 394,
399
+ "v_ax": 395,
400
+ "v_axr": 396,
401
+ "v_eh": 397,
402
+ "v_eu": 398,
403
+ "v_ih": 399,
404
+ "v_s": 400,
405
+ "w": 401,
406
+ "w_ae": 402,
407
+ "w_ah": 403,
408
+ "w_ao": 404,
409
+ "w_axr": 405,
410
+ "w_ay": 406,
411
+ "w_eh": 407,
412
+ "w_er": 408,
413
+ "w_ih": 409,
414
+ "w_ih_l": 410,
415
+ "w_r": 411,
416
+ "y": 412,
417
+ "y_ah": 413,
418
+ "y_aw": 414,
419
+ "y_eh": 415,
420
+ "y_er": 416,
421
+ "y_ih_r": 417,
422
+ "y_o": 418,
423
+ "y_o_l": 419,
424
+ "y_uh": 420,
425
+ "y_uw": 421,
426
+ "z": 422,
427
+ "z_ax": 423,
428
+ "z_d_eu": 424,
429
+ "z_eu": 425,
430
+ "z_eu_d": 426,
431
+ "z_ih": 427,
432
+ "z_ih_s": 428,
433
+ "z_ih_z": 429,
434
+ "z_s_eu": 430,
435
+ "zh": 431,
436
+ "zh_eu": 432,
437
+ "zh_ih": 433,
438
+ "zh_uh": 434,
439
+ "zh_uw": 435
440
+ }