creynier commited on
Commit
1eda59d
1 Parent(s): 55eb6dc
checkpoint-10000/config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "codevector_dim": 256,
12
+ "contrastive_logits_temperature": 0.1,
13
+ "conv_bias": false,
14
+ "conv_dim": [
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512
22
+ ],
23
+ "conv_kernel": [
24
+ 10,
25
+ 3,
26
+ 3,
27
+ 3,
28
+ 3,
29
+ 2,
30
+ 2
31
+ ],
32
+ "conv_stride": [
33
+ 5,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2
40
+ ],
41
+ "ctc_loss_reduction": "mean",
42
+ "ctc_zero_infinity": false,
43
+ "diversity_loss_weight": 0.1,
44
+ "do_stable_layer_norm": false,
45
+ "eos_token_id": 2,
46
+ "feat_extract_activation": "gelu",
47
+ "feat_extract_norm": "group",
48
+ "feat_proj_dropout": 0.1,
49
+ "feat_quantizer_dropout": 0.0,
50
+ "final_dropout": 0.0,
51
+ "freeze_feat_extract_train": true,
52
+ "hidden_act": "gelu",
53
+ "hidden_dropout": 0.1,
54
+ "hidden_size": 768,
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "layer_norm_eps": 1e-05,
58
+ "layerdrop": 0.0,
59
+ "mask_channel_length": 10,
60
+ "mask_channel_min_space": 1,
61
+ "mask_channel_other": 0.0,
62
+ "mask_channel_prob": 0.0,
63
+ "mask_channel_selection": "static",
64
+ "mask_feature_length": 10,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 5,
67
+ "mask_time_min_space": 1,
68
+ "mask_time_other": 0.0,
69
+ "mask_time_prob": 0.05,
70
+ "mask_time_selection": "static",
71
+ "model_type": "wav2vec2",
72
+ "no_mask_channel_overlap": false,
73
+ "no_mask_time_overlap": false,
74
+ "num_attention_heads": 12,
75
+ "num_codevector_groups": 2,
76
+ "num_codevectors_per_group": 320,
77
+ "num_conv_pos_embedding_groups": 16,
78
+ "num_conv_pos_embeddings": 128,
79
+ "num_feat_extract_layers": 7,
80
+ "num_hidden_layers": 12,
81
+ "num_negatives": 100,
82
+ "pad_token_id": 29,
83
+ "proj_codevector_dim": 256,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.11.3",
86
+ "use_weighted_layer_sum": false,
87
+ "vocab_size": 32
88
+ }
checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0f2e04a7f4017dab4cd20f07f5d5de610962c103ce0008ba25d0d51c2bc7819
3
+ size 721685265
checkpoint-10000/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
checkpoint-10000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a6f6554b9888f24e4870f3516b7e9a519c0b9e81213db58cfec3af5c06c91e4
3
+ size 377670039
checkpoint-10000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e4e30751787b6fb66ae49da730726c7c3fee10163175dc2a373144256f736f4
3
+ size 17563
checkpoint-10000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:771cf8609fdaaed2ab63948ad61b6f20669806667bb75ab8d1a0a1f02c855d8f
3
+ size 559
checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9c95942339e0b222e33ff2982743910a05bb8b9bf2c1b7856cc26cbb05674d4
3
+ size 623
checkpoint-10000/trainer_state.json ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 27.47252747252747,
5
+ "global_step": 10000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.37,
12
+ "learning_rate": 4.99e-05,
13
+ "loss": 3.5823,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 1.37,
18
+ "eval_loss": 3.32504940032959,
19
+ "eval_runtime": 45.0187,
20
+ "eval_samples_per_second": 14.394,
21
+ "eval_steps_per_second": 0.466,
22
+ "eval_wer": 1.0,
23
+ "step": 500
24
+ },
25
+ {
26
+ "epoch": 2.75,
27
+ "learning_rate": 9.970000000000001e-05,
28
+ "loss": 1.6148,
29
+ "step": 1000
30
+ },
31
+ {
32
+ "epoch": 2.75,
33
+ "eval_loss": 0.9707893133163452,
34
+ "eval_runtime": 41.6379,
35
+ "eval_samples_per_second": 15.563,
36
+ "eval_steps_per_second": 0.504,
37
+ "eval_wer": 0.5453111956219274,
38
+ "step": 1000
39
+ },
40
+ {
41
+ "epoch": 4.12,
42
+ "learning_rate": 9.50100806451613e-05,
43
+ "loss": 0.7901,
44
+ "step": 1500
45
+ },
46
+ {
47
+ "epoch": 4.12,
48
+ "eval_loss": 0.7987369298934937,
49
+ "eval_runtime": 42.0128,
50
+ "eval_samples_per_second": 15.424,
51
+ "eval_steps_per_second": 0.5,
52
+ "eval_wer": 0.4292737222892125,
53
+ "step": 1500
54
+ },
55
+ {
56
+ "epoch": 5.49,
57
+ "learning_rate": 8.996975806451613e-05,
58
+ "loss": 0.5529,
59
+ "step": 2000
60
+ },
61
+ {
62
+ "epoch": 5.49,
63
+ "eval_loss": 0.803723156452179,
64
+ "eval_runtime": 41.9975,
65
+ "eval_samples_per_second": 15.43,
66
+ "eval_steps_per_second": 0.5,
67
+ "eval_wer": 0.4209257026249884,
68
+ "step": 2000
69
+ },
70
+ {
71
+ "epoch": 6.87,
72
+ "learning_rate": 8.492943548387097e-05,
73
+ "loss": 0.4404,
74
+ "step": 2500
75
+ },
76
+ {
77
+ "epoch": 6.87,
78
+ "eval_loss": 0.7943486571311951,
79
+ "eval_runtime": 42.0152,
80
+ "eval_samples_per_second": 15.423,
81
+ "eval_steps_per_second": 0.5,
82
+ "eval_wer": 0.45524533902235415,
83
+ "step": 2500
84
+ },
85
+ {
86
+ "epoch": 8.24,
87
+ "learning_rate": 7.988911290322581e-05,
88
+ "loss": 0.3528,
89
+ "step": 3000
90
+ },
91
+ {
92
+ "epoch": 8.24,
93
+ "eval_loss": 0.8733369708061218,
94
+ "eval_runtime": 42.4523,
95
+ "eval_samples_per_second": 15.264,
96
+ "eval_steps_per_second": 0.495,
97
+ "eval_wer": 0.37658844263055374,
98
+ "step": 3000
99
+ },
100
+ {
101
+ "epoch": 9.62,
102
+ "learning_rate": 7.484879032258065e-05,
103
+ "loss": 0.3008,
104
+ "step": 3500
105
+ },
106
+ {
107
+ "epoch": 9.62,
108
+ "eval_loss": 0.914967954158783,
109
+ "eval_runtime": 42.0647,
110
+ "eval_samples_per_second": 15.405,
111
+ "eval_steps_per_second": 0.499,
112
+ "eval_wer": 0.3725999443465356,
113
+ "step": 3500
114
+ },
115
+ {
116
+ "epoch": 10.99,
117
+ "learning_rate": 6.98084677419355e-05,
118
+ "loss": 0.2625,
119
+ "step": 4000
120
+ },
121
+ {
122
+ "epoch": 10.99,
123
+ "eval_loss": 0.9753163456916809,
124
+ "eval_runtime": 41.804,
125
+ "eval_samples_per_second": 15.501,
126
+ "eval_steps_per_second": 0.502,
127
+ "eval_wer": 0.3690752249327521,
128
+ "step": 4000
129
+ },
130
+ {
131
+ "epoch": 12.36,
132
+ "learning_rate": 6.476814516129032e-05,
133
+ "loss": 0.2153,
134
+ "step": 4500
135
+ },
136
+ {
137
+ "epoch": 12.36,
138
+ "eval_loss": 0.9292902946472168,
139
+ "eval_runtime": 53.6443,
140
+ "eval_samples_per_second": 12.08,
141
+ "eval_steps_per_second": 0.391,
142
+ "eval_wer": 0.35284296447453856,
143
+ "step": 4500
144
+ },
145
+ {
146
+ "epoch": 13.74,
147
+ "learning_rate": 5.9727822580645166e-05,
148
+ "loss": 0.1926,
149
+ "step": 5000
150
+ },
151
+ {
152
+ "epoch": 13.74,
153
+ "eval_loss": 0.9629846215248108,
154
+ "eval_runtime": 43.3856,
155
+ "eval_samples_per_second": 14.936,
156
+ "eval_steps_per_second": 0.484,
157
+ "eval_wer": 0.3697245153510806,
158
+ "step": 5000
159
+ },
160
+ {
161
+ "epoch": 15.11,
162
+ "learning_rate": 5.46875e-05,
163
+ "loss": 0.1634,
164
+ "step": 5500
165
+ },
166
+ {
167
+ "epoch": 15.11,
168
+ "eval_loss": 0.9719225764274597,
169
+ "eval_runtime": 43.1527,
170
+ "eval_samples_per_second": 15.016,
171
+ "eval_steps_per_second": 0.487,
172
+ "eval_wer": 0.37473332714961505,
173
+ "step": 5500
174
+ },
175
+ {
176
+ "epoch": 16.48,
177
+ "learning_rate": 4.964717741935484e-05,
178
+ "loss": 0.1422,
179
+ "step": 6000
180
+ },
181
+ {
182
+ "epoch": 16.48,
183
+ "eval_loss": 1.0191898345947266,
184
+ "eval_runtime": 43.4487,
185
+ "eval_samples_per_second": 14.914,
186
+ "eval_steps_per_second": 0.483,
187
+ "eval_wer": 0.38085520823671276,
188
+ "step": 6000
189
+ },
190
+ {
191
+ "epoch": 17.86,
192
+ "learning_rate": 4.460685483870968e-05,
193
+ "loss": 0.1253,
194
+ "step": 6500
195
+ },
196
+ {
197
+ "epoch": 17.86,
198
+ "eval_loss": 1.0450010299682617,
199
+ "eval_runtime": 42.8089,
200
+ "eval_samples_per_second": 15.137,
201
+ "eval_steps_per_second": 0.491,
202
+ "eval_wer": 0.37686670995269456,
203
+ "step": 6500
204
+ },
205
+ {
206
+ "epoch": 19.23,
207
+ "learning_rate": 3.956653225806452e-05,
208
+ "loss": 0.1068,
209
+ "step": 7000
210
+ },
211
+ {
212
+ "epoch": 19.23,
213
+ "eval_loss": 1.0906848907470703,
214
+ "eval_runtime": 43.3307,
215
+ "eval_samples_per_second": 14.955,
216
+ "eval_steps_per_second": 0.485,
217
+ "eval_wer": 0.359243112883777,
218
+ "step": 7000
219
+ },
220
+ {
221
+ "epoch": 20.6,
222
+ "learning_rate": 3.4526209677419356e-05,
223
+ "loss": 0.095,
224
+ "step": 7500
225
+ },
226
+ {
227
+ "epoch": 20.6,
228
+ "eval_loss": 1.0443437099456787,
229
+ "eval_runtime": 53.9592,
230
+ "eval_samples_per_second": 12.009,
231
+ "eval_steps_per_second": 0.389,
232
+ "eval_wer": 0.35794453204711996,
233
+ "step": 7500
234
+ },
235
+ {
236
+ "epoch": 21.98,
237
+ "learning_rate": 2.9495967741935488e-05,
238
+ "loss": 0.0814,
239
+ "step": 8000
240
+ },
241
+ {
242
+ "epoch": 21.98,
243
+ "eval_loss": 1.140635371208191,
244
+ "eval_runtime": 43.1083,
245
+ "eval_samples_per_second": 15.032,
246
+ "eval_steps_per_second": 0.487,
247
+ "eval_wer": 0.36258232074946667,
248
+ "step": 8000
249
+ },
250
+ {
251
+ "epoch": 23.35,
252
+ "learning_rate": 2.4455645161290326e-05,
253
+ "loss": 0.069,
254
+ "step": 8500
255
+ },
256
+ {
257
+ "epoch": 23.35,
258
+ "eval_loss": 1.1894210577011108,
259
+ "eval_runtime": 42.7802,
260
+ "eval_samples_per_second": 15.147,
261
+ "eval_steps_per_second": 0.491,
262
+ "eval_wer": 0.3690752249327521,
263
+ "step": 8500
264
+ },
265
+ {
266
+ "epoch": 24.73,
267
+ "learning_rate": 1.942540322580645e-05,
268
+ "loss": 0.0623,
269
+ "step": 9000
270
+ },
271
+ {
272
+ "epoch": 24.73,
273
+ "eval_loss": 1.2040750980377197,
274
+ "eval_runtime": 43.2859,
275
+ "eval_samples_per_second": 14.97,
276
+ "eval_steps_per_second": 0.485,
277
+ "eval_wer": 0.3687042018365643,
278
+ "step": 9000
279
+ },
280
+ {
281
+ "epoch": 26.1,
282
+ "learning_rate": 1.4385080645161292e-05,
283
+ "loss": 0.0554,
284
+ "step": 9500
285
+ },
286
+ {
287
+ "epoch": 26.1,
288
+ "eval_loss": 1.2136509418487549,
289
+ "eval_runtime": 42.5198,
290
+ "eval_samples_per_second": 15.24,
291
+ "eval_steps_per_second": 0.494,
292
+ "eval_wer": 0.35720248585474446,
293
+ "step": 9500
294
+ },
295
+ {
296
+ "epoch": 27.47,
297
+ "learning_rate": 9.344758064516129e-06,
298
+ "loss": 0.0475,
299
+ "step": 10000
300
+ },
301
+ {
302
+ "epoch": 27.47,
303
+ "eval_loss": 1.2355363368988037,
304
+ "eval_runtime": 43.577,
305
+ "eval_samples_per_second": 14.87,
306
+ "eval_steps_per_second": 0.482,
307
+ "eval_wer": 0.3565531954364159,
308
+ "step": 10000
309
+ }
310
+ ],
311
+ "max_steps": 10920,
312
+ "num_train_epochs": 30,
313
+ "total_flos": 1.4947492785279646e+19,
314
+ "trial_name": null,
315
+ "trial_params": null
316
+ }
checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d998aad278c0bb911fd9ded0353bc04ca5056945240341f90e545e377ba43f2
3
+ size 2863
checkpoint-10500/config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "codevector_dim": 256,
12
+ "contrastive_logits_temperature": 0.1,
13
+ "conv_bias": false,
14
+ "conv_dim": [
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512
22
+ ],
23
+ "conv_kernel": [
24
+ 10,
25
+ 3,
26
+ 3,
27
+ 3,
28
+ 3,
29
+ 2,
30
+ 2
31
+ ],
32
+ "conv_stride": [
33
+ 5,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2
40
+ ],
41
+ "ctc_loss_reduction": "mean",
42
+ "ctc_zero_infinity": false,
43
+ "diversity_loss_weight": 0.1,
44
+ "do_stable_layer_norm": false,
45
+ "eos_token_id": 2,
46
+ "feat_extract_activation": "gelu",
47
+ "feat_extract_norm": "group",
48
+ "feat_proj_dropout": 0.1,
49
+ "feat_quantizer_dropout": 0.0,
50
+ "final_dropout": 0.0,
51
+ "freeze_feat_extract_train": true,
52
+ "hidden_act": "gelu",
53
+ "hidden_dropout": 0.1,
54
+ "hidden_size": 768,
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "layer_norm_eps": 1e-05,
58
+ "layerdrop": 0.0,
59
+ "mask_channel_length": 10,
60
+ "mask_channel_min_space": 1,
61
+ "mask_channel_other": 0.0,
62
+ "mask_channel_prob": 0.0,
63
+ "mask_channel_selection": "static",
64
+ "mask_feature_length": 10,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 5,
67
+ "mask_time_min_space": 1,
68
+ "mask_time_other": 0.0,
69
+ "mask_time_prob": 0.05,
70
+ "mask_time_selection": "static",
71
+ "model_type": "wav2vec2",
72
+ "no_mask_channel_overlap": false,
73
+ "no_mask_time_overlap": false,
74
+ "num_attention_heads": 12,
75
+ "num_codevector_groups": 2,
76
+ "num_codevectors_per_group": 320,
77
+ "num_conv_pos_embedding_groups": 16,
78
+ "num_conv_pos_embeddings": 128,
79
+ "num_feat_extract_layers": 7,
80
+ "num_hidden_layers": 12,
81
+ "num_negatives": 100,
82
+ "pad_token_id": 29,
83
+ "proj_codevector_dim": 256,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.11.3",
86
+ "use_weighted_layer_sum": false,
87
+ "vocab_size": 32
88
+ }
checkpoint-10500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea4f76fcc9f6d9ea40150bb4e01d304d0772c7fba4c1aa7fea14b1725df1414e
3
+ size 721685265
checkpoint-10500/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
checkpoint-10500/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d42de1bc77469fe30684b2e5c05fcafab3828fa74d7190534b6fbea66f10844
3
+ size 377670039
checkpoint-10500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c18378be313c39a465357862e4f74412d52436f287b6d03fa899b9d973d72e2
3
+ size 17563
checkpoint-10500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36accd578a3753adf01ee13fa036622b2454b71534c2490afa37778ef7236892
3
+ size 559
checkpoint-10500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c2140cd816daecbe941131ce3e89734069c9b4974308433698907fb82469575
3
+ size 623
checkpoint-10500/trainer_state.json ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 28.846153846153847,
5
+ "global_step": 10500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.37,
12
+ "learning_rate": 4.99e-05,
13
+ "loss": 3.5823,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 1.37,
18
+ "eval_loss": 3.32504940032959,
19
+ "eval_runtime": 45.0187,
20
+ "eval_samples_per_second": 14.394,
21
+ "eval_steps_per_second": 0.466,
22
+ "eval_wer": 1.0,
23
+ "step": 500
24
+ },
25
+ {
26
+ "epoch": 2.75,
27
+ "learning_rate": 9.970000000000001e-05,
28
+ "loss": 1.6148,
29
+ "step": 1000
30
+ },
31
+ {
32
+ "epoch": 2.75,
33
+ "eval_loss": 0.9707893133163452,
34
+ "eval_runtime": 41.6379,
35
+ "eval_samples_per_second": 15.563,
36
+ "eval_steps_per_second": 0.504,
37
+ "eval_wer": 0.5453111956219274,
38
+ "step": 1000
39
+ },
40
+ {
41
+ "epoch": 4.12,
42
+ "learning_rate": 9.50100806451613e-05,
43
+ "loss": 0.7901,
44
+ "step": 1500
45
+ },
46
+ {
47
+ "epoch": 4.12,
48
+ "eval_loss": 0.7987369298934937,
49
+ "eval_runtime": 42.0128,
50
+ "eval_samples_per_second": 15.424,
51
+ "eval_steps_per_second": 0.5,
52
+ "eval_wer": 0.4292737222892125,
53
+ "step": 1500
54
+ },
55
+ {
56
+ "epoch": 5.49,
57
+ "learning_rate": 8.996975806451613e-05,
58
+ "loss": 0.5529,
59
+ "step": 2000
60
+ },
61
+ {
62
+ "epoch": 5.49,
63
+ "eval_loss": 0.803723156452179,
64
+ "eval_runtime": 41.9975,
65
+ "eval_samples_per_second": 15.43,
66
+ "eval_steps_per_second": 0.5,
67
+ "eval_wer": 0.4209257026249884,
68
+ "step": 2000
69
+ },
70
+ {
71
+ "epoch": 6.87,
72
+ "learning_rate": 8.492943548387097e-05,
73
+ "loss": 0.4404,
74
+ "step": 2500
75
+ },
76
+ {
77
+ "epoch": 6.87,
78
+ "eval_loss": 0.7943486571311951,
79
+ "eval_runtime": 42.0152,
80
+ "eval_samples_per_second": 15.423,
81
+ "eval_steps_per_second": 0.5,
82
+ "eval_wer": 0.45524533902235415,
83
+ "step": 2500
84
+ },
85
+ {
86
+ "epoch": 8.24,
87
+ "learning_rate": 7.988911290322581e-05,
88
+ "loss": 0.3528,
89
+ "step": 3000
90
+ },
91
+ {
92
+ "epoch": 8.24,
93
+ "eval_loss": 0.8733369708061218,
94
+ "eval_runtime": 42.4523,
95
+ "eval_samples_per_second": 15.264,
96
+ "eval_steps_per_second": 0.495,
97
+ "eval_wer": 0.37658844263055374,
98
+ "step": 3000
99
+ },
100
+ {
101
+ "epoch": 9.62,
102
+ "learning_rate": 7.484879032258065e-05,
103
+ "loss": 0.3008,
104
+ "step": 3500
105
+ },
106
+ {
107
+ "epoch": 9.62,
108
+ "eval_loss": 0.914967954158783,
109
+ "eval_runtime": 42.0647,
110
+ "eval_samples_per_second": 15.405,
111
+ "eval_steps_per_second": 0.499,
112
+ "eval_wer": 0.3725999443465356,
113
+ "step": 3500
114
+ },
115
+ {
116
+ "epoch": 10.99,
117
+ "learning_rate": 6.98084677419355e-05,
118
+ "loss": 0.2625,
119
+ "step": 4000
120
+ },
121
+ {
122
+ "epoch": 10.99,
123
+ "eval_loss": 0.9753163456916809,
124
+ "eval_runtime": 41.804,
125
+ "eval_samples_per_second": 15.501,
126
+ "eval_steps_per_second": 0.502,
127
+ "eval_wer": 0.3690752249327521,
128
+ "step": 4000
129
+ },
130
+ {
131
+ "epoch": 12.36,
132
+ "learning_rate": 6.476814516129032e-05,
133
+ "loss": 0.2153,
134
+ "step": 4500
135
+ },
136
+ {
137
+ "epoch": 12.36,
138
+ "eval_loss": 0.9292902946472168,
139
+ "eval_runtime": 53.6443,
140
+ "eval_samples_per_second": 12.08,
141
+ "eval_steps_per_second": 0.391,
142
+ "eval_wer": 0.35284296447453856,
143
+ "step": 4500
144
+ },
145
+ {
146
+ "epoch": 13.74,
147
+ "learning_rate": 5.9727822580645166e-05,
148
+ "loss": 0.1926,
149
+ "step": 5000
150
+ },
151
+ {
152
+ "epoch": 13.74,
153
+ "eval_loss": 0.9629846215248108,
154
+ "eval_runtime": 43.3856,
155
+ "eval_samples_per_second": 14.936,
156
+ "eval_steps_per_second": 0.484,
157
+ "eval_wer": 0.3697245153510806,
158
+ "step": 5000
159
+ },
160
+ {
161
+ "epoch": 15.11,
162
+ "learning_rate": 5.46875e-05,
163
+ "loss": 0.1634,
164
+ "step": 5500
165
+ },
166
+ {
167
+ "epoch": 15.11,
168
+ "eval_loss": 0.9719225764274597,
169
+ "eval_runtime": 43.1527,
170
+ "eval_samples_per_second": 15.016,
171
+ "eval_steps_per_second": 0.487,
172
+ "eval_wer": 0.37473332714961505,
173
+ "step": 5500
174
+ },
175
+ {
176
+ "epoch": 16.48,
177
+ "learning_rate": 4.964717741935484e-05,
178
+ "loss": 0.1422,
179
+ "step": 6000
180
+ },
181
+ {
182
+ "epoch": 16.48,
183
+ "eval_loss": 1.0191898345947266,
184
+ "eval_runtime": 43.4487,
185
+ "eval_samples_per_second": 14.914,
186
+ "eval_steps_per_second": 0.483,
187
+ "eval_wer": 0.38085520823671276,
188
+ "step": 6000
189
+ },
190
+ {
191
+ "epoch": 17.86,
192
+ "learning_rate": 4.460685483870968e-05,
193
+ "loss": 0.1253,
194
+ "step": 6500
195
+ },
196
+ {
197
+ "epoch": 17.86,
198
+ "eval_loss": 1.0450010299682617,
199
+ "eval_runtime": 42.8089,
200
+ "eval_samples_per_second": 15.137,
201
+ "eval_steps_per_second": 0.491,
202
+ "eval_wer": 0.37686670995269456,
203
+ "step": 6500
204
+ },
205
+ {
206
+ "epoch": 19.23,
207
+ "learning_rate": 3.956653225806452e-05,
208
+ "loss": 0.1068,
209
+ "step": 7000
210
+ },
211
+ {
212
+ "epoch": 19.23,
213
+ "eval_loss": 1.0906848907470703,
214
+ "eval_runtime": 43.3307,
215
+ "eval_samples_per_second": 14.955,
216
+ "eval_steps_per_second": 0.485,
217
+ "eval_wer": 0.359243112883777,
218
+ "step": 7000
219
+ },
220
+ {
221
+ "epoch": 20.6,
222
+ "learning_rate": 3.4526209677419356e-05,
223
+ "loss": 0.095,
224
+ "step": 7500
225
+ },
226
+ {
227
+ "epoch": 20.6,
228
+ "eval_loss": 1.0443437099456787,
229
+ "eval_runtime": 53.9592,
230
+ "eval_samples_per_second": 12.009,
231
+ "eval_steps_per_second": 0.389,
232
+ "eval_wer": 0.35794453204711996,
233
+ "step": 7500
234
+ },
235
+ {
236
+ "epoch": 21.98,
237
+ "learning_rate": 2.9495967741935488e-05,
238
+ "loss": 0.0814,
239
+ "step": 8000
240
+ },
241
+ {
242
+ "epoch": 21.98,
243
+ "eval_loss": 1.140635371208191,
244
+ "eval_runtime": 43.1083,
245
+ "eval_samples_per_second": 15.032,
246
+ "eval_steps_per_second": 0.487,
247
+ "eval_wer": 0.36258232074946667,
248
+ "step": 8000
249
+ },
250
+ {
251
+ "epoch": 23.35,
252
+ "learning_rate": 2.4455645161290326e-05,
253
+ "loss": 0.069,
254
+ "step": 8500
255
+ },
256
+ {
257
+ "epoch": 23.35,
258
+ "eval_loss": 1.1894210577011108,
259
+ "eval_runtime": 42.7802,
260
+ "eval_samples_per_second": 15.147,
261
+ "eval_steps_per_second": 0.491,
262
+ "eval_wer": 0.3690752249327521,
263
+ "step": 8500
264
+ },
265
+ {
266
+ "epoch": 24.73,
267
+ "learning_rate": 1.942540322580645e-05,
268
+ "loss": 0.0623,
269
+ "step": 9000
270
+ },
271
+ {
272
+ "epoch": 24.73,
273
+ "eval_loss": 1.2040750980377197,
274
+ "eval_runtime": 43.2859,
275
+ "eval_samples_per_second": 14.97,
276
+ "eval_steps_per_second": 0.485,
277
+ "eval_wer": 0.3687042018365643,
278
+ "step": 9000
279
+ },
280
+ {
281
+ "epoch": 26.1,
282
+ "learning_rate": 1.4385080645161292e-05,
283
+ "loss": 0.0554,
284
+ "step": 9500
285
+ },
286
+ {
287
+ "epoch": 26.1,
288
+ "eval_loss": 1.2136509418487549,
289
+ "eval_runtime": 42.5198,
290
+ "eval_samples_per_second": 15.24,
291
+ "eval_steps_per_second": 0.494,
292
+ "eval_wer": 0.35720248585474446,
293
+ "step": 9500
294
+ },
295
+ {
296
+ "epoch": 27.47,
297
+ "learning_rate": 9.344758064516129e-06,
298
+ "loss": 0.0475,
299
+ "step": 10000
300
+ },
301
+ {
302
+ "epoch": 27.47,
303
+ "eval_loss": 1.2355363368988037,
304
+ "eval_runtime": 43.577,
305
+ "eval_samples_per_second": 14.87,
306
+ "eval_steps_per_second": 0.482,
307
+ "eval_wer": 0.3565531954364159,
308
+ "step": 10000
309
+ },
310
+ {
311
+ "epoch": 28.85,
312
+ "learning_rate": 4.304435483870968e-06,
313
+ "loss": 0.042,
314
+ "step": 10500
315
+ },
316
+ {
317
+ "epoch": 28.85,
318
+ "eval_loss": 1.2368338108062744,
319
+ "eval_runtime": 43.2519,
320
+ "eval_samples_per_second": 14.982,
321
+ "eval_steps_per_second": 0.486,
322
+ "eval_wer": 0.35553288192189964,
323
+ "step": 10500
324
+ }
325
+ ],
326
+ "max_steps": 10920,
327
+ "num_train_epochs": 30,
328
+ "total_flos": 1.5691152295536976e+19,
329
+ "trial_name": null,
330
+ "trial_params": null
331
+ }
checkpoint-10500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d998aad278c0bb911fd9ded0353bc04ca5056945240341f90e545e377ba43f2
3
+ size 2863
config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "codevector_dim": 256,
12
+ "contrastive_logits_temperature": 0.1,
13
+ "conv_bias": false,
14
+ "conv_dim": [
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512
22
+ ],
23
+ "conv_kernel": [
24
+ 10,
25
+ 3,
26
+ 3,
27
+ 3,
28
+ 3,
29
+ 2,
30
+ 2
31
+ ],
32
+ "conv_stride": [
33
+ 5,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2
40
+ ],
41
+ "ctc_loss_reduction": "mean",
42
+ "ctc_zero_infinity": false,
43
+ "diversity_loss_weight": 0.1,
44
+ "do_stable_layer_norm": false,
45
+ "eos_token_id": 2,
46
+ "feat_extract_activation": "gelu",
47
+ "feat_extract_norm": "group",
48
+ "feat_proj_dropout": 0.1,
49
+ "feat_quantizer_dropout": 0.0,
50
+ "final_dropout": 0.0,
51
+ "freeze_feat_extract_train": true,
52
+ "hidden_act": "gelu",
53
+ "hidden_dropout": 0.1,
54
+ "hidden_size": 768,
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "layer_norm_eps": 1e-05,
58
+ "layerdrop": 0.0,
59
+ "mask_channel_length": 10,
60
+ "mask_channel_min_space": 1,
61
+ "mask_channel_other": 0.0,
62
+ "mask_channel_prob": 0.0,
63
+ "mask_channel_selection": "static",
64
+ "mask_feature_length": 10,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 5,
67
+ "mask_time_min_space": 1,
68
+ "mask_time_other": 0.0,
69
+ "mask_time_prob": 0.05,
70
+ "mask_time_selection": "static",
71
+ "model_type": "wav2vec2",
72
+ "no_mask_channel_overlap": false,
73
+ "no_mask_time_overlap": false,
74
+ "num_attention_heads": 12,
75
+ "num_codevector_groups": 2,
76
+ "num_codevectors_per_group": 320,
77
+ "num_conv_pos_embedding_groups": 16,
78
+ "num_conv_pos_embeddings": 128,
79
+ "num_feat_extract_layers": 7,
80
+ "num_hidden_layers": 12,
81
+ "num_negatives": 100,
82
+ "pad_token_id": 29,
83
+ "proj_codevector_dim": 256,
84
+ "torch_dtype": "float32",
85
+ "transformers_version": "4.11.3",
86
+ "use_weighted_layer_sum": false,
87
+ "vocab_size": 32
88
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f4e1617defbcf85b67d55294f8177b259e3daf55269592cf44d09962844275
3
+ size 377670039
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d998aad278c0bb911fd9ded0353bc04ca5056945240341f90e545e377ba43f2
3
+ size 2863