marinone94 commited on
Commit
309997b
1 Parent(s): c97f56c

End of training

Browse files
all_results.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.09,
3
+ "eval_loss": 1.8916987180709839,
4
+ "eval_runtime": 96.9796,
5
+ "eval_samples_per_second": 4.063,
6
+ "eval_steps_per_second": 0.134,
7
+ "eval_wer": 15.494331342191881,
8
+ "test_loss": 0.5623113512992859,
9
+ "test_runtime": 121.6703,
10
+ "test_samples_per_second": 5.318,
11
+ "test_steps_per_second": 0.173,
12
+ "test_wer": 20.965372507869883,
13
+ "train_loss": 0.35074408769753995,
14
+ "train_runtime": 2707.3827,
15
+ "train_samples_per_second": 9.621,
16
+ "train_steps_per_second": 0.15
17
+ }
checkpoint-360/config.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/whisper-tiny",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 384,
15
+ "decoder_attention_heads": 6,
16
+ "decoder_ffn_dim": 1536,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 4,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 6,
22
+ "encoder_ffn_dim": 1536,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 4,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": [
27
+ [
28
+ 1,
29
+ 50259
30
+ ],
31
+ [
32
+ 2,
33
+ 50359
34
+ ],
35
+ [
36
+ 3,
37
+ 50363
38
+ ]
39
+ ],
40
+ "init_std": 0.02,
41
+ "is_encoder_decoder": true,
42
+ "max_length": 448,
43
+ "max_source_positions": 1500,
44
+ "max_target_positions": 448,
45
+ "model_type": "whisper",
46
+ "num_hidden_layers": 4,
47
+ "num_mel_bins": 80,
48
+ "pad_token_id": 50257,
49
+ "scale_embedding": false,
50
+ "suppress_tokens": [
51
+ 1,
52
+ 2,
53
+ 7,
54
+ 8,
55
+ 9,
56
+ 10,
57
+ 14,
58
+ 25,
59
+ 26,
60
+ 27,
61
+ 28,
62
+ 29,
63
+ 31,
64
+ 58,
65
+ 59,
66
+ 60,
67
+ 61,
68
+ 62,
69
+ 63,
70
+ 90,
71
+ 91,
72
+ 92,
73
+ 93,
74
+ 359,
75
+ 503,
76
+ 522,
77
+ 542,
78
+ 873,
79
+ 893,
80
+ 902,
81
+ 918,
82
+ 922,
83
+ 931,
84
+ 1350,
85
+ 1853,
86
+ 1982,
87
+ 2460,
88
+ 2627,
89
+ 3246,
90
+ 3253,
91
+ 3268,
92
+ 3536,
93
+ 3846,
94
+ 3961,
95
+ 4183,
96
+ 4667,
97
+ 6585,
98
+ 6647,
99
+ 7273,
100
+ 9061,
101
+ 9383,
102
+ 10428,
103
+ 10929,
104
+ 11938,
105
+ 12033,
106
+ 12331,
107
+ 12562,
108
+ 13793,
109
+ 14157,
110
+ 14635,
111
+ 15265,
112
+ 15618,
113
+ 16553,
114
+ 16604,
115
+ 18362,
116
+ 18956,
117
+ 20075,
118
+ 21675,
119
+ 22520,
120
+ 26130,
121
+ 26161,
122
+ 26435,
123
+ 28279,
124
+ 29464,
125
+ 31650,
126
+ 32302,
127
+ 32470,
128
+ 36865,
129
+ 42863,
130
+ 47425,
131
+ 49870,
132
+ 50254,
133
+ 50258,
134
+ 50360,
135
+ 50361,
136
+ 50362
137
+ ],
138
+ "torch_dtype": "float32",
139
+ "transformers_version": "4.26.0.dev0",
140
+ "use_cache": true,
141
+ "vocab_size": 51865
142
+ }
checkpoint-360/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:513fbc5d03bd07f32d77cf2f5dcc0d8298575b96fbda2ed1de30f1cb859889ae
3
+ size 302183173
checkpoint-360/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-360/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48879b1ce776151b602f3a1bdf10683d776d3f0765214b322443dddb1d951006
3
+ size 151098921
checkpoint-360/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89f9781ff6e5ab617d91036a7029d39a2832fa624ae853afb0f238fb19535016
3
+ size 14575
checkpoint-360/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12d681e2b2a56f2134611cbb1679a9f32470e4cf3a48f4a2243741f0852b30ae
3
+ size 557
checkpoint-360/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:922e864e56c484925ddcd495d1c992405fc4f95d13329256b422ef0f40cc0891
3
+ size 627
checkpoint-360/trainer_state.json ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 8.07862407862408,
5
+ "global_step": 360,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 0.0,
13
+ "loss": 1.8118,
14
+ "step": 4
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 3.6585365853658536e-07,
19
+ "loss": 1.8122,
20
+ "step": 8
21
+ },
22
+ {
23
+ "epoch": 0.03,
24
+ "learning_rate": 8.53658536585366e-07,
25
+ "loss": 1.8174,
26
+ "step": 12
27
+ },
28
+ {
29
+ "epoch": 0.04,
30
+ "learning_rate": 1.3414634146341465e-06,
31
+ "loss": 1.7616,
32
+ "step": 16
33
+ },
34
+ {
35
+ "epoch": 0.05,
36
+ "learning_rate": 1.8292682926829268e-06,
37
+ "loss": 1.6875,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "learning_rate": 2.317073170731708e-06,
43
+ "loss": 1.5201,
44
+ "step": 24
45
+ },
46
+ {
47
+ "epoch": 0.07,
48
+ "learning_rate": 2.8048780487804884e-06,
49
+ "loss": 1.3982,
50
+ "step": 28
51
+ },
52
+ {
53
+ "epoch": 0.08,
54
+ "learning_rate": 3.292682926829269e-06,
55
+ "loss": 1.3541,
56
+ "step": 32
57
+ },
58
+ {
59
+ "epoch": 0.09,
60
+ "learning_rate": 3.780487804878049e-06,
61
+ "loss": 1.2092,
62
+ "step": 36
63
+ },
64
+ {
65
+ "epoch": 0.1,
66
+ "learning_rate": 4.268292682926829e-06,
67
+ "loss": 1.1599,
68
+ "step": 40
69
+ },
70
+ {
71
+ "epoch": 0.1,
72
+ "eval_loss": 1.142654299736023,
73
+ "eval_runtime": 101.9854,
74
+ "eval_samples_per_second": 3.863,
75
+ "eval_steps_per_second": 0.127,
76
+ "eval_wer": 15.213946117274169,
77
+ "step": 40
78
+ },
79
+ {
80
+ "epoch": 1.01,
81
+ "learning_rate": 4.75609756097561e-06,
82
+ "loss": 1.0124,
83
+ "step": 44
84
+ },
85
+ {
86
+ "epoch": 1.02,
87
+ "learning_rate": 5.243902439024391e-06,
88
+ "loss": 0.9171,
89
+ "step": 48
90
+ },
91
+ {
92
+ "epoch": 1.03,
93
+ "learning_rate": 5.731707317073171e-06,
94
+ "loss": 0.8027,
95
+ "step": 52
96
+ },
97
+ {
98
+ "epoch": 1.04,
99
+ "learning_rate": 6.219512195121951e-06,
100
+ "loss": 0.7284,
101
+ "step": 56
102
+ },
103
+ {
104
+ "epoch": 1.05,
105
+ "learning_rate": 6.707317073170733e-06,
106
+ "loss": 0.6185,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 1.06,
111
+ "learning_rate": 7.1951219512195125e-06,
112
+ "loss": 0.57,
113
+ "step": 64
114
+ },
115
+ {
116
+ "epoch": 1.07,
117
+ "learning_rate": 7.682926829268293e-06,
118
+ "loss": 0.4985,
119
+ "step": 68
120
+ },
121
+ {
122
+ "epoch": 1.08,
123
+ "learning_rate": 8.170731707317073e-06,
124
+ "loss": 0.488,
125
+ "step": 72
126
+ },
127
+ {
128
+ "epoch": 1.09,
129
+ "learning_rate": 8.658536585365854e-06,
130
+ "loss": 0.4569,
131
+ "step": 76
132
+ },
133
+ {
134
+ "epoch": 1.1,
135
+ "learning_rate": 9.146341463414635e-06,
136
+ "loss": 0.4655,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 1.1,
141
+ "eval_loss": 0.5613037943840027,
142
+ "eval_runtime": 91.9697,
143
+ "eval_samples_per_second": 4.284,
144
+ "eval_steps_per_second": 0.141,
145
+ "eval_wer": 17.591125198098258,
146
+ "step": 80
147
+ },
148
+ {
149
+ "epoch": 2.0,
150
+ "learning_rate": 9.634146341463415e-06,
151
+ "loss": 0.425,
152
+ "step": 84
153
+ },
154
+ {
155
+ "epoch": 2.01,
156
+ "learning_rate": 9.96923076923077e-06,
157
+ "loss": 0.4162,
158
+ "step": 88
159
+ },
160
+ {
161
+ "epoch": 2.02,
162
+ "learning_rate": 9.846153846153848e-06,
163
+ "loss": 0.3809,
164
+ "step": 92
165
+ },
166
+ {
167
+ "epoch": 2.03,
168
+ "learning_rate": 9.723076923076924e-06,
169
+ "loss": 0.3533,
170
+ "step": 96
171
+ },
172
+ {
173
+ "epoch": 2.04,
174
+ "learning_rate": 9.600000000000001e-06,
175
+ "loss": 0.3511,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 2.05,
180
+ "learning_rate": 9.476923076923079e-06,
181
+ "loss": 0.3475,
182
+ "step": 104
183
+ },
184
+ {
185
+ "epoch": 2.06,
186
+ "learning_rate": 9.353846153846155e-06,
187
+ "loss": 0.321,
188
+ "step": 108
189
+ },
190
+ {
191
+ "epoch": 2.07,
192
+ "learning_rate": 9.230769230769232e-06,
193
+ "loss": 0.2859,
194
+ "step": 112
195
+ },
196
+ {
197
+ "epoch": 2.08,
198
+ "learning_rate": 9.107692307692308e-06,
199
+ "loss": 0.3191,
200
+ "step": 116
201
+ },
202
+ {
203
+ "epoch": 2.09,
204
+ "learning_rate": 8.984615384615386e-06,
205
+ "loss": 0.2753,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 2.09,
210
+ "eval_loss": 0.5241264700889587,
211
+ "eval_runtime": 88.0526,
212
+ "eval_samples_per_second": 4.475,
213
+ "eval_steps_per_second": 0.148,
214
+ "eval_wer": 17.21321467755699,
215
+ "step": 120
216
+ },
217
+ {
218
+ "epoch": 3.0,
219
+ "learning_rate": 8.861538461538463e-06,
220
+ "loss": 0.3104,
221
+ "step": 124
222
+ },
223
+ {
224
+ "epoch": 3.01,
225
+ "learning_rate": 8.73846153846154e-06,
226
+ "loss": 0.2734,
227
+ "step": 128
228
+ },
229
+ {
230
+ "epoch": 3.02,
231
+ "learning_rate": 8.615384615384617e-06,
232
+ "loss": 0.2608,
233
+ "step": 132
234
+ },
235
+ {
236
+ "epoch": 3.03,
237
+ "learning_rate": 8.492307692307693e-06,
238
+ "loss": 0.2509,
239
+ "step": 136
240
+ },
241
+ {
242
+ "epoch": 3.04,
243
+ "learning_rate": 8.36923076923077e-06,
244
+ "loss": 0.2548,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 3.05,
249
+ "learning_rate": 8.246153846153848e-06,
250
+ "loss": 0.2469,
251
+ "step": 144
252
+ },
253
+ {
254
+ "epoch": 3.06,
255
+ "learning_rate": 8.123076923076924e-06,
256
+ "loss": 0.2231,
257
+ "step": 148
258
+ },
259
+ {
260
+ "epoch": 3.07,
261
+ "learning_rate": 8.000000000000001e-06,
262
+ "loss": 0.2138,
263
+ "step": 152
264
+ },
265
+ {
266
+ "epoch": 3.08,
267
+ "learning_rate": 7.876923076923077e-06,
268
+ "loss": 0.2349,
269
+ "step": 156
270
+ },
271
+ {
272
+ "epoch": 3.09,
273
+ "learning_rate": 7.753846153846155e-06,
274
+ "loss": 0.2077,
275
+ "step": 160
276
+ },
277
+ {
278
+ "epoch": 3.09,
279
+ "eval_loss": 0.5241798758506775,
280
+ "eval_runtime": 88.5317,
281
+ "eval_samples_per_second": 4.45,
282
+ "eval_steps_per_second": 0.147,
283
+ "eval_wer": 17.26197732536877,
284
+ "step": 160
285
+ },
286
+ {
287
+ "epoch": 3.1,
288
+ "learning_rate": 7.630769230769232e-06,
289
+ "loss": 0.2322,
290
+ "step": 164
291
+ },
292
+ {
293
+ "epoch": 4.01,
294
+ "learning_rate": 7.507692307692308e-06,
295
+ "loss": 0.2036,
296
+ "step": 168
297
+ },
298
+ {
299
+ "epoch": 4.02,
300
+ "learning_rate": 7.384615384615386e-06,
301
+ "loss": 0.2058,
302
+ "step": 172
303
+ },
304
+ {
305
+ "epoch": 4.03,
306
+ "learning_rate": 7.261538461538462e-06,
307
+ "loss": 0.1797,
308
+ "step": 176
309
+ },
310
+ {
311
+ "epoch": 4.04,
312
+ "learning_rate": 7.1384615384615385e-06,
313
+ "loss": 0.186,
314
+ "step": 180
315
+ },
316
+ {
317
+ "epoch": 4.05,
318
+ "learning_rate": 7.015384615384616e-06,
319
+ "loss": 0.2035,
320
+ "step": 184
321
+ },
322
+ {
323
+ "epoch": 4.06,
324
+ "learning_rate": 6.892307692307693e-06,
325
+ "loss": 0.1794,
326
+ "step": 188
327
+ },
328
+ {
329
+ "epoch": 4.07,
330
+ "learning_rate": 6.76923076923077e-06,
331
+ "loss": 0.1589,
332
+ "step": 192
333
+ },
334
+ {
335
+ "epoch": 4.08,
336
+ "learning_rate": 6.646153846153846e-06,
337
+ "loss": 0.1879,
338
+ "step": 196
339
+ },
340
+ {
341
+ "epoch": 4.09,
342
+ "learning_rate": 6.523076923076923e-06,
343
+ "loss": 0.1636,
344
+ "step": 200
345
+ },
346
+ {
347
+ "epoch": 4.09,
348
+ "eval_loss": 0.5289868712425232,
349
+ "eval_runtime": 95.5188,
350
+ "eval_samples_per_second": 4.125,
351
+ "eval_steps_per_second": 0.136,
352
+ "eval_wer": 17.66426916981592,
353
+ "step": 200
354
+ },
355
+ {
356
+ "epoch": 4.1,
357
+ "learning_rate": 6.4000000000000006e-06,
358
+ "loss": 0.1767,
359
+ "step": 204
360
+ },
361
+ {
362
+ "epoch": 5.01,
363
+ "learning_rate": 6.276923076923077e-06,
364
+ "loss": 0.1657,
365
+ "step": 208
366
+ },
367
+ {
368
+ "epoch": 5.02,
369
+ "learning_rate": 6.153846153846155e-06,
370
+ "loss": 0.1607,
371
+ "step": 212
372
+ },
373
+ {
374
+ "epoch": 5.03,
375
+ "learning_rate": 6.030769230769231e-06,
376
+ "loss": 0.1458,
377
+ "step": 216
378
+ },
379
+ {
380
+ "epoch": 5.04,
381
+ "learning_rate": 5.907692307692308e-06,
382
+ "loss": 0.1541,
383
+ "step": 220
384
+ },
385
+ {
386
+ "epoch": 5.05,
387
+ "learning_rate": 5.784615384615385e-06,
388
+ "loss": 0.1494,
389
+ "step": 224
390
+ },
391
+ {
392
+ "epoch": 5.06,
393
+ "learning_rate": 5.661538461538462e-06,
394
+ "loss": 0.144,
395
+ "step": 228
396
+ },
397
+ {
398
+ "epoch": 5.07,
399
+ "learning_rate": 5.538461538461539e-06,
400
+ "loss": 0.1311,
401
+ "step": 232
402
+ },
403
+ {
404
+ "epoch": 5.08,
405
+ "learning_rate": 5.415384615384615e-06,
406
+ "loss": 0.1411,
407
+ "step": 236
408
+ },
409
+ {
410
+ "epoch": 5.09,
411
+ "learning_rate": 5.292307692307693e-06,
412
+ "loss": 0.1322,
413
+ "step": 240
414
+ },
415
+ {
416
+ "epoch": 5.09,
417
+ "eval_loss": 0.5350630283355713,
418
+ "eval_runtime": 92.5111,
419
+ "eval_samples_per_second": 4.259,
420
+ "eval_steps_per_second": 0.141,
421
+ "eval_wer": 18.2128489576984,
422
+ "step": 240
423
+ },
424
+ {
425
+ "epoch": 5.1,
426
+ "learning_rate": 5.16923076923077e-06,
427
+ "loss": 0.1436,
428
+ "step": 244
429
+ },
430
+ {
431
+ "epoch": 6.0,
432
+ "learning_rate": 5.046153846153846e-06,
433
+ "loss": 0.1375,
434
+ "step": 248
435
+ },
436
+ {
437
+ "epoch": 6.01,
438
+ "learning_rate": 4.923076923076924e-06,
439
+ "loss": 0.1361,
440
+ "step": 252
441
+ },
442
+ {
443
+ "epoch": 6.02,
444
+ "learning_rate": 4.800000000000001e-06,
445
+ "loss": 0.129,
446
+ "step": 256
447
+ },
448
+ {
449
+ "epoch": 6.03,
450
+ "learning_rate": 4.676923076923077e-06,
451
+ "loss": 0.1127,
452
+ "step": 260
453
+ },
454
+ {
455
+ "epoch": 6.04,
456
+ "learning_rate": 4.553846153846154e-06,
457
+ "loss": 0.1266,
458
+ "step": 264
459
+ },
460
+ {
461
+ "epoch": 6.05,
462
+ "learning_rate": 4.430769230769232e-06,
463
+ "loss": 0.1193,
464
+ "step": 268
465
+ },
466
+ {
467
+ "epoch": 6.06,
468
+ "learning_rate": 4.307692307692308e-06,
469
+ "loss": 0.1127,
470
+ "step": 272
471
+ },
472
+ {
473
+ "epoch": 6.07,
474
+ "learning_rate": 4.184615384615385e-06,
475
+ "loss": 0.1064,
476
+ "step": 276
477
+ },
478
+ {
479
+ "epoch": 6.08,
480
+ "learning_rate": 4.061538461538462e-06,
481
+ "loss": 0.123,
482
+ "step": 280
483
+ },
484
+ {
485
+ "epoch": 6.08,
486
+ "eval_loss": 0.5429388284683228,
487
+ "eval_runtime": 91.5818,
488
+ "eval_samples_per_second": 4.302,
489
+ "eval_steps_per_second": 0.142,
490
+ "eval_wer": 18.907716689016212,
491
+ "step": 280
492
+ },
493
+ {
494
+ "epoch": 6.09,
495
+ "learning_rate": 3.938461538461539e-06,
496
+ "loss": 0.1057,
497
+ "step": 284
498
+ },
499
+ {
500
+ "epoch": 7.0,
501
+ "learning_rate": 3.815384615384616e-06,
502
+ "loss": 0.1258,
503
+ "step": 288
504
+ },
505
+ {
506
+ "epoch": 7.01,
507
+ "learning_rate": 3.692307692307693e-06,
508
+ "loss": 0.1108,
509
+ "step": 292
510
+ },
511
+ {
512
+ "epoch": 7.02,
513
+ "learning_rate": 3.5692307692307692e-06,
514
+ "loss": 0.1115,
515
+ "step": 296
516
+ },
517
+ {
518
+ "epoch": 7.03,
519
+ "learning_rate": 3.4461538461538464e-06,
520
+ "loss": 0.0998,
521
+ "step": 300
522
+ },
523
+ {
524
+ "epoch": 7.04,
525
+ "learning_rate": 3.323076923076923e-06,
526
+ "loss": 0.1106,
527
+ "step": 304
528
+ },
529
+ {
530
+ "epoch": 7.05,
531
+ "learning_rate": 3.2000000000000003e-06,
532
+ "loss": 0.1045,
533
+ "step": 308
534
+ },
535
+ {
536
+ "epoch": 7.06,
537
+ "learning_rate": 3.0769230769230774e-06,
538
+ "loss": 0.0908,
539
+ "step": 312
540
+ },
541
+ {
542
+ "epoch": 7.07,
543
+ "learning_rate": 2.953846153846154e-06,
544
+ "loss": 0.0931,
545
+ "step": 316
546
+ },
547
+ {
548
+ "epoch": 7.08,
549
+ "learning_rate": 2.830769230769231e-06,
550
+ "loss": 0.1074,
551
+ "step": 320
552
+ },
553
+ {
554
+ "epoch": 7.08,
555
+ "eval_loss": 0.5500437021255493,
556
+ "eval_runtime": 104.0907,
557
+ "eval_samples_per_second": 3.785,
558
+ "eval_steps_per_second": 0.125,
559
+ "eval_wer": 19.054004632451544,
560
+ "step": 320
561
+ },
562
+ {
563
+ "epoch": 7.09,
564
+ "learning_rate": 2.7076923076923076e-06,
565
+ "loss": 0.0937,
566
+ "step": 324
567
+ },
568
+ {
569
+ "epoch": 7.1,
570
+ "learning_rate": 2.584615384615385e-06,
571
+ "loss": 0.1091,
572
+ "step": 328
573
+ },
574
+ {
575
+ "epoch": 8.01,
576
+ "learning_rate": 2.461538461538462e-06,
577
+ "loss": 0.0951,
578
+ "step": 332
579
+ },
580
+ {
581
+ "epoch": 8.02,
582
+ "learning_rate": 2.3384615384615387e-06,
583
+ "loss": 0.1003,
584
+ "step": 336
585
+ },
586
+ {
587
+ "epoch": 8.03,
588
+ "learning_rate": 2.215384615384616e-06,
589
+ "loss": 0.0836,
590
+ "step": 340
591
+ },
592
+ {
593
+ "epoch": 8.04,
594
+ "learning_rate": 2.0923076923076926e-06,
595
+ "loss": 0.0907,
596
+ "step": 344
597
+ },
598
+ {
599
+ "epoch": 8.05,
600
+ "learning_rate": 1.9692307692307693e-06,
601
+ "loss": 0.1013,
602
+ "step": 348
603
+ },
604
+ {
605
+ "epoch": 8.06,
606
+ "learning_rate": 1.8461538461538465e-06,
607
+ "loss": 0.0891,
608
+ "step": 352
609
+ },
610
+ {
611
+ "epoch": 8.07,
612
+ "learning_rate": 1.7230769230769232e-06,
613
+ "loss": 0.077,
614
+ "step": 356
615
+ },
616
+ {
617
+ "epoch": 8.08,
618
+ "learning_rate": 1.6000000000000001e-06,
619
+ "loss": 0.1007,
620
+ "step": 360
621
+ },
622
+ {
623
+ "epoch": 8.08,
624
+ "eval_loss": 0.5552565455436707,
625
+ "eval_runtime": 88.458,
626
+ "eval_samples_per_second": 4.454,
627
+ "eval_steps_per_second": 0.147,
628
+ "eval_wer": 19.310008533463368,
629
+ "step": 360
630
+ }
631
+ ],
632
+ "max_steps": 407,
633
+ "num_train_epochs": 9223372036854775807,
634
+ "total_flos": 5.6288618938368e+17,
635
+ "trial_name": null,
636
+ "trial_params": null
637
+ }
checkpoint-360/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcbca0d141969bcb1c3cd0ef5a009221139334753b899d88e4d5003bd23f4b5f
3
+ size 3579
checkpoint-400/config.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai/whisper-tiny",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 384,
15
+ "decoder_attention_heads": 6,
16
+ "decoder_ffn_dim": 1536,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 4,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 6,
22
+ "encoder_ffn_dim": 1536,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 4,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": [
27
+ [
28
+ 1,
29
+ 50259
30
+ ],
31
+ [
32
+ 2,
33
+ 50359
34
+ ],
35
+ [
36
+ 3,
37
+ 50363
38
+ ]
39
+ ],
40
+ "init_std": 0.02,
41
+ "is_encoder_decoder": true,
42
+ "max_length": 448,
43
+ "max_source_positions": 1500,
44
+ "max_target_positions": 448,
45
+ "model_type": "whisper",
46
+ "num_hidden_layers": 4,
47
+ "num_mel_bins": 80,
48
+ "pad_token_id": 50257,
49
+ "scale_embedding": false,
50
+ "suppress_tokens": [
51
+ 1,
52
+ 2,
53
+ 7,
54
+ 8,
55
+ 9,
56
+ 10,
57
+ 14,
58
+ 25,
59
+ 26,
60
+ 27,
61
+ 28,
62
+ 29,
63
+ 31,
64
+ 58,
65
+ 59,
66
+ 60,
67
+ 61,
68
+ 62,
69
+ 63,
70
+ 90,
71
+ 91,
72
+ 92,
73
+ 93,
74
+ 359,
75
+ 503,
76
+ 522,
77
+ 542,
78
+ 873,
79
+ 893,
80
+ 902,
81
+ 918,
82
+ 922,
83
+ 931,
84
+ 1350,
85
+ 1853,
86
+ 1982,
87
+ 2460,
88
+ 2627,
89
+ 3246,
90
+ 3253,
91
+ 3268,
92
+ 3536,
93
+ 3846,
94
+ 3961,
95
+ 4183,
96
+ 4667,
97
+ 6585,
98
+ 6647,
99
+ 7273,
100
+ 9061,
101
+ 9383,
102
+ 10428,
103
+ 10929,
104
+ 11938,
105
+ 12033,
106
+ 12331,
107
+ 12562,
108
+ 13793,
109
+ 14157,
110
+ 14635,
111
+ 15265,
112
+ 15618,
113
+ 16553,
114
+ 16604,
115
+ 18362,
116
+ 18956,
117
+ 20075,
118
+ 21675,
119
+ 22520,
120
+ 26130,
121
+ 26161,
122
+ 26435,
123
+ 28279,
124
+ 29464,
125
+ 31650,
126
+ 32302,
127
+ 32470,
128
+ 36865,
129
+ 42863,
130
+ 47425,
131
+ 49870,
132
+ 50254,
133
+ 50258,
134
+ 50360,
135
+ 50361,
136
+ 50362
137
+ ],
138
+ "torch_dtype": "float32",
139
+ "transformers_version": "4.26.0.dev0",
140
+ "use_cache": true,
141
+ "vocab_size": 51865
142
+ }
checkpoint-400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca2f216746e89a3c2dc592e56702d375bf97996f49afe1d761ee97223c74e35c
3
+ size 302183173
checkpoint-400/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-400/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f00e8aebd51836c62a6367fa1a3bad01938ccd285ac6cc2c2dd7b6e9755793
3
+ size 151098921
checkpoint-400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0fee234ab8151645a0895ac4e9559fbd6bec4f70f802b8c94db562d283ad737
3
+ size 14639
checkpoint-400/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37d85a7f00fa7bae4774c70bac351a030ccefea202dbec056f5a4d44e50b132c
3
+ size 557
checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cf1f3c596de0ecc246fc5b02e9720244273de83ebc2f79d153609594e679a82
3
+ size 627
checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.076167076167076,
5
+ "global_step": 400,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 0.0,
13
+ "loss": 1.8118,
14
+ "step": 4
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 3.6585365853658536e-07,
19
+ "loss": 1.8122,
20
+ "step": 8
21
+ },
22
+ {
23
+ "epoch": 0.03,
24
+ "learning_rate": 8.53658536585366e-07,
25
+ "loss": 1.8174,
26
+ "step": 12
27
+ },
28
+ {
29
+ "epoch": 0.04,
30
+ "learning_rate": 1.3414634146341465e-06,
31
+ "loss": 1.7616,
32
+ "step": 16
33
+ },
34
+ {
35
+ "epoch": 0.05,
36
+ "learning_rate": 1.8292682926829268e-06,
37
+ "loss": 1.6875,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "learning_rate": 2.317073170731708e-06,
43
+ "loss": 1.5201,
44
+ "step": 24
45
+ },
46
+ {
47
+ "epoch": 0.07,
48
+ "learning_rate": 2.8048780487804884e-06,
49
+ "loss": 1.3982,
50
+ "step": 28
51
+ },
52
+ {
53
+ "epoch": 0.08,
54
+ "learning_rate": 3.292682926829269e-06,
55
+ "loss": 1.3541,
56
+ "step": 32
57
+ },
58
+ {
59
+ "epoch": 0.09,
60
+ "learning_rate": 3.780487804878049e-06,
61
+ "loss": 1.2092,
62
+ "step": 36
63
+ },
64
+ {
65
+ "epoch": 0.1,
66
+ "learning_rate": 4.268292682926829e-06,
67
+ "loss": 1.1599,
68
+ "step": 40
69
+ },
70
+ {
71
+ "epoch": 0.1,
72
+ "eval_loss": 1.142654299736023,
73
+ "eval_runtime": 101.9854,
74
+ "eval_samples_per_second": 3.863,
75
+ "eval_steps_per_second": 0.127,
76
+ "eval_wer": 15.213946117274169,
77
+ "step": 40
78
+ },
79
+ {
80
+ "epoch": 1.01,
81
+ "learning_rate": 4.75609756097561e-06,
82
+ "loss": 1.0124,
83
+ "step": 44
84
+ },
85
+ {
86
+ "epoch": 1.02,
87
+ "learning_rate": 5.243902439024391e-06,
88
+ "loss": 0.9171,
89
+ "step": 48
90
+ },
91
+ {
92
+ "epoch": 1.03,
93
+ "learning_rate": 5.731707317073171e-06,
94
+ "loss": 0.8027,
95
+ "step": 52
96
+ },
97
+ {
98
+ "epoch": 1.04,
99
+ "learning_rate": 6.219512195121951e-06,
100
+ "loss": 0.7284,
101
+ "step": 56
102
+ },
103
+ {
104
+ "epoch": 1.05,
105
+ "learning_rate": 6.707317073170733e-06,
106
+ "loss": 0.6185,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 1.06,
111
+ "learning_rate": 7.1951219512195125e-06,
112
+ "loss": 0.57,
113
+ "step": 64
114
+ },
115
+ {
116
+ "epoch": 1.07,
117
+ "learning_rate": 7.682926829268293e-06,
118
+ "loss": 0.4985,
119
+ "step": 68
120
+ },
121
+ {
122
+ "epoch": 1.08,
123
+ "learning_rate": 8.170731707317073e-06,
124
+ "loss": 0.488,
125
+ "step": 72
126
+ },
127
+ {
128
+ "epoch": 1.09,
129
+ "learning_rate": 8.658536585365854e-06,
130
+ "loss": 0.4569,
131
+ "step": 76
132
+ },
133
+ {
134
+ "epoch": 1.1,
135
+ "learning_rate": 9.146341463414635e-06,
136
+ "loss": 0.4655,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 1.1,
141
+ "eval_loss": 0.5613037943840027,
142
+ "eval_runtime": 91.9697,
143
+ "eval_samples_per_second": 4.284,
144
+ "eval_steps_per_second": 0.141,
145
+ "eval_wer": 17.591125198098258,
146
+ "step": 80
147
+ },
148
+ {
149
+ "epoch": 2.0,
150
+ "learning_rate": 9.634146341463415e-06,
151
+ "loss": 0.425,
152
+ "step": 84
153
+ },
154
+ {
155
+ "epoch": 2.01,
156
+ "learning_rate": 9.96923076923077e-06,
157
+ "loss": 0.4162,
158
+ "step": 88
159
+ },
160
+ {
161
+ "epoch": 2.02,
162
+ "learning_rate": 9.846153846153848e-06,
163
+ "loss": 0.3809,
164
+ "step": 92
165
+ },
166
+ {
167
+ "epoch": 2.03,
168
+ "learning_rate": 9.723076923076924e-06,
169
+ "loss": 0.3533,
170
+ "step": 96
171
+ },
172
+ {
173
+ "epoch": 2.04,
174
+ "learning_rate": 9.600000000000001e-06,
175
+ "loss": 0.3511,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 2.05,
180
+ "learning_rate": 9.476923076923079e-06,
181
+ "loss": 0.3475,
182
+ "step": 104
183
+ },
184
+ {
185
+ "epoch": 2.06,
186
+ "learning_rate": 9.353846153846155e-06,
187
+ "loss": 0.321,
188
+ "step": 108
189
+ },
190
+ {
191
+ "epoch": 2.07,
192
+ "learning_rate": 9.230769230769232e-06,
193
+ "loss": 0.2859,
194
+ "step": 112
195
+ },
196
+ {
197
+ "epoch": 2.08,
198
+ "learning_rate": 9.107692307692308e-06,
199
+ "loss": 0.3191,
200
+ "step": 116
201
+ },
202
+ {
203
+ "epoch": 2.09,
204
+ "learning_rate": 8.984615384615386e-06,
205
+ "loss": 0.2753,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 2.09,
210
+ "eval_loss": 0.5241264700889587,
211
+ "eval_runtime": 88.0526,
212
+ "eval_samples_per_second": 4.475,
213
+ "eval_steps_per_second": 0.148,
214
+ "eval_wer": 17.21321467755699,
215
+ "step": 120
216
+ },
217
+ {
218
+ "epoch": 3.0,
219
+ "learning_rate": 8.861538461538463e-06,
220
+ "loss": 0.3104,
221
+ "step": 124
222
+ },
223
+ {
224
+ "epoch": 3.01,
225
+ "learning_rate": 8.73846153846154e-06,
226
+ "loss": 0.2734,
227
+ "step": 128
228
+ },
229
+ {
230
+ "epoch": 3.02,
231
+ "learning_rate": 8.615384615384617e-06,
232
+ "loss": 0.2608,
233
+ "step": 132
234
+ },
235
+ {
236
+ "epoch": 3.03,
237
+ "learning_rate": 8.492307692307693e-06,
238
+ "loss": 0.2509,
239
+ "step": 136
240
+ },
241
+ {
242
+ "epoch": 3.04,
243
+ "learning_rate": 8.36923076923077e-06,
244
+ "loss": 0.2548,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 3.05,
249
+ "learning_rate": 8.246153846153848e-06,
250
+ "loss": 0.2469,
251
+ "step": 144
252
+ },
253
+ {
254
+ "epoch": 3.06,
255
+ "learning_rate": 8.123076923076924e-06,
256
+ "loss": 0.2231,
257
+ "step": 148
258
+ },
259
+ {
260
+ "epoch": 3.07,
261
+ "learning_rate": 8.000000000000001e-06,
262
+ "loss": 0.2138,
263
+ "step": 152
264
+ },
265
+ {
266
+ "epoch": 3.08,
267
+ "learning_rate": 7.876923076923077e-06,
268
+ "loss": 0.2349,
269
+ "step": 156
270
+ },
271
+ {
272
+ "epoch": 3.09,
273
+ "learning_rate": 7.753846153846155e-06,
274
+ "loss": 0.2077,
275
+ "step": 160
276
+ },
277
+ {
278
+ "epoch": 3.09,
279
+ "eval_loss": 0.5241798758506775,
280
+ "eval_runtime": 88.5317,
281
+ "eval_samples_per_second": 4.45,
282
+ "eval_steps_per_second": 0.147,
283
+ "eval_wer": 17.26197732536877,
284
+ "step": 160
285
+ },
286
+ {
287
+ "epoch": 3.1,
288
+ "learning_rate": 7.630769230769232e-06,
289
+ "loss": 0.2322,
290
+ "step": 164
291
+ },
292
+ {
293
+ "epoch": 4.01,
294
+ "learning_rate": 7.507692307692308e-06,
295
+ "loss": 0.2036,
296
+ "step": 168
297
+ },
298
+ {
299
+ "epoch": 4.02,
300
+ "learning_rate": 7.384615384615386e-06,
301
+ "loss": 0.2058,
302
+ "step": 172
303
+ },
304
+ {
305
+ "epoch": 4.03,
306
+ "learning_rate": 7.261538461538462e-06,
307
+ "loss": 0.1797,
308
+ "step": 176
309
+ },
310
+ {
311
+ "epoch": 4.04,
312
+ "learning_rate": 7.1384615384615385e-06,
313
+ "loss": 0.186,
314
+ "step": 180
315
+ },
316
+ {
317
+ "epoch": 4.05,
318
+ "learning_rate": 7.015384615384616e-06,
319
+ "loss": 0.2035,
320
+ "step": 184
321
+ },
322
+ {
323
+ "epoch": 4.06,
324
+ "learning_rate": 6.892307692307693e-06,
325
+ "loss": 0.1794,
326
+ "step": 188
327
+ },
328
+ {
329
+ "epoch": 4.07,
330
+ "learning_rate": 6.76923076923077e-06,
331
+ "loss": 0.1589,
332
+ "step": 192
333
+ },
334
+ {
335
+ "epoch": 4.08,
336
+ "learning_rate": 6.646153846153846e-06,
337
+ "loss": 0.1879,
338
+ "step": 196
339
+ },
340
+ {
341
+ "epoch": 4.09,
342
+ "learning_rate": 6.523076923076923e-06,
343
+ "loss": 0.1636,
344
+ "step": 200
345
+ },
346
+ {
347
+ "epoch": 4.09,
348
+ "eval_loss": 0.5289868712425232,
349
+ "eval_runtime": 95.5188,
350
+ "eval_samples_per_second": 4.125,
351
+ "eval_steps_per_second": 0.136,
352
+ "eval_wer": 17.66426916981592,
353
+ "step": 200
354
+ },
355
+ {
356
+ "epoch": 4.1,
357
+ "learning_rate": 6.4000000000000006e-06,
358
+ "loss": 0.1767,
359
+ "step": 204
360
+ },
361
+ {
362
+ "epoch": 5.01,
363
+ "learning_rate": 6.276923076923077e-06,
364
+ "loss": 0.1657,
365
+ "step": 208
366
+ },
367
+ {
368
+ "epoch": 5.02,
369
+ "learning_rate": 6.153846153846155e-06,
370
+ "loss": 0.1607,
371
+ "step": 212
372
+ },
373
+ {
374
+ "epoch": 5.03,
375
+ "learning_rate": 6.030769230769231e-06,
376
+ "loss": 0.1458,
377
+ "step": 216
378
+ },
379
+ {
380
+ "epoch": 5.04,
381
+ "learning_rate": 5.907692307692308e-06,
382
+ "loss": 0.1541,
383
+ "step": 220
384
+ },
385
+ {
386
+ "epoch": 5.05,
387
+ "learning_rate": 5.784615384615385e-06,
388
+ "loss": 0.1494,
389
+ "step": 224
390
+ },
391
+ {
392
+ "epoch": 5.06,
393
+ "learning_rate": 5.661538461538462e-06,
394
+ "loss": 0.144,
395
+ "step": 228
396
+ },
397
+ {
398
+ "epoch": 5.07,
399
+ "learning_rate": 5.538461538461539e-06,
400
+ "loss": 0.1311,
401
+ "step": 232
402
+ },
403
+ {
404
+ "epoch": 5.08,
405
+ "learning_rate": 5.415384615384615e-06,
406
+ "loss": 0.1411,
407
+ "step": 236
408
+ },
409
+ {
410
+ "epoch": 5.09,
411
+ "learning_rate": 5.292307692307693e-06,
412
+ "loss": 0.1322,
413
+ "step": 240
414
+ },
415
+ {
416
+ "epoch": 5.09,
417
+ "eval_loss": 0.5350630283355713,
418
+ "eval_runtime": 92.5111,
419
+ "eval_samples_per_second": 4.259,
420
+ "eval_steps_per_second": 0.141,
421
+ "eval_wer": 18.2128489576984,
422
+ "step": 240
423
+ },
424
+ {
425
+ "epoch": 5.1,
426
+ "learning_rate": 5.16923076923077e-06,
427
+ "loss": 0.1436,
428
+ "step": 244
429
+ },
430
+ {
431
+ "epoch": 6.0,
432
+ "learning_rate": 5.046153846153846e-06,
433
+ "loss": 0.1375,
434
+ "step": 248
435
+ },
436
+ {
437
+ "epoch": 6.01,
438
+ "learning_rate": 4.923076923076924e-06,
439
+ "loss": 0.1361,
440
+ "step": 252
441
+ },
442
+ {
443
+ "epoch": 6.02,
444
+ "learning_rate": 4.800000000000001e-06,
445
+ "loss": 0.129,
446
+ "step": 256
447
+ },
448
+ {
449
+ "epoch": 6.03,
450
+ "learning_rate": 4.676923076923077e-06,
451
+ "loss": 0.1127,
452
+ "step": 260
453
+ },
454
+ {
455
+ "epoch": 6.04,
456
+ "learning_rate": 4.553846153846154e-06,
457
+ "loss": 0.1266,
458
+ "step": 264
459
+ },
460
+ {
461
+ "epoch": 6.05,
462
+ "learning_rate": 4.430769230769232e-06,
463
+ "loss": 0.1193,
464
+ "step": 268
465
+ },
466
+ {
467
+ "epoch": 6.06,
468
+ "learning_rate": 4.307692307692308e-06,
469
+ "loss": 0.1127,
470
+ "step": 272
471
+ },
472
+ {
473
+ "epoch": 6.07,
474
+ "learning_rate": 4.184615384615385e-06,
475
+ "loss": 0.1064,
476
+ "step": 276
477
+ },
478
+ {
479
+ "epoch": 6.08,
480
+ "learning_rate": 4.061538461538462e-06,
481
+ "loss": 0.123,
482
+ "step": 280
483
+ },
484
+ {
485
+ "epoch": 6.08,
486
+ "eval_loss": 0.5429388284683228,
487
+ "eval_runtime": 91.5818,
488
+ "eval_samples_per_second": 4.302,
489
+ "eval_steps_per_second": 0.142,
490
+ "eval_wer": 18.907716689016212,
491
+ "step": 280
492
+ },
493
+ {
494
+ "epoch": 6.09,
495
+ "learning_rate": 3.938461538461539e-06,
496
+ "loss": 0.1057,
497
+ "step": 284
498
+ },
499
+ {
500
+ "epoch": 7.0,
501
+ "learning_rate": 3.815384615384616e-06,
502
+ "loss": 0.1258,
503
+ "step": 288
504
+ },
505
+ {
506
+ "epoch": 7.01,
507
+ "learning_rate": 3.692307692307693e-06,
508
+ "loss": 0.1108,
509
+ "step": 292
510
+ },
511
+ {
512
+ "epoch": 7.02,
513
+ "learning_rate": 3.5692307692307692e-06,
514
+ "loss": 0.1115,
515
+ "step": 296
516
+ },
517
+ {
518
+ "epoch": 7.03,
519
+ "learning_rate": 3.4461538461538464e-06,
520
+ "loss": 0.0998,
521
+ "step": 300
522
+ },
523
+ {
524
+ "epoch": 7.04,
525
+ "learning_rate": 3.323076923076923e-06,
526
+ "loss": 0.1106,
527
+ "step": 304
528
+ },
529
+ {
530
+ "epoch": 7.05,
531
+ "learning_rate": 3.2000000000000003e-06,
532
+ "loss": 0.1045,
533
+ "step": 308
534
+ },
535
+ {
536
+ "epoch": 7.06,
537
+ "learning_rate": 3.0769230769230774e-06,
538
+ "loss": 0.0908,
539
+ "step": 312
540
+ },
541
+ {
542
+ "epoch": 7.07,
543
+ "learning_rate": 2.953846153846154e-06,
544
+ "loss": 0.0931,
545
+ "step": 316
546
+ },
547
+ {
548
+ "epoch": 7.08,
549
+ "learning_rate": 2.830769230769231e-06,
550
+ "loss": 0.1074,
551
+ "step": 320
552
+ },
553
+ {
554
+ "epoch": 7.08,
555
+ "eval_loss": 0.5500437021255493,
556
+ "eval_runtime": 104.0907,
557
+ "eval_samples_per_second": 3.785,
558
+ "eval_steps_per_second": 0.125,
559
+ "eval_wer": 19.054004632451544,
560
+ "step": 320
561
+ },
562
+ {
563
+ "epoch": 7.09,
564
+ "learning_rate": 2.7076923076923076e-06,
565
+ "loss": 0.0937,
566
+ "step": 324
567
+ },
568
+ {
569
+ "epoch": 7.1,
570
+ "learning_rate": 2.584615384615385e-06,
571
+ "loss": 0.1091,
572
+ "step": 328
573
+ },
574
+ {
575
+ "epoch": 8.01,
576
+ "learning_rate": 2.461538461538462e-06,
577
+ "loss": 0.0951,
578
+ "step": 332
579
+ },
580
+ {
581
+ "epoch": 8.02,
582
+ "learning_rate": 2.3384615384615387e-06,
583
+ "loss": 0.1003,
584
+ "step": 336
585
+ },
586
+ {
587
+ "epoch": 8.03,
588
+ "learning_rate": 2.215384615384616e-06,
589
+ "loss": 0.0836,
590
+ "step": 340
591
+ },
592
+ {
593
+ "epoch": 8.04,
594
+ "learning_rate": 2.0923076923076926e-06,
595
+ "loss": 0.0907,
596
+ "step": 344
597
+ },
598
+ {
599
+ "epoch": 8.05,
600
+ "learning_rate": 1.9692307692307693e-06,
601
+ "loss": 0.1013,
602
+ "step": 348
603
+ },
604
+ {
605
+ "epoch": 8.06,
606
+ "learning_rate": 1.8461538461538465e-06,
607
+ "loss": 0.0891,
608
+ "step": 352
609
+ },
610
+ {
611
+ "epoch": 8.07,
612
+ "learning_rate": 1.7230769230769232e-06,
613
+ "loss": 0.077,
614
+ "step": 356
615
+ },
616
+ {
617
+ "epoch": 8.08,
618
+ "learning_rate": 1.6000000000000001e-06,
619
+ "loss": 0.1007,
620
+ "step": 360
621
+ },
622
+ {
623
+ "epoch": 8.08,
624
+ "eval_loss": 0.5552565455436707,
625
+ "eval_runtime": 88.458,
626
+ "eval_samples_per_second": 4.454,
627
+ "eval_steps_per_second": 0.147,
628
+ "eval_wer": 19.310008533463368,
629
+ "step": 360
630
+ },
631
+ {
632
+ "epoch": 8.09,
633
+ "learning_rate": 1.476923076923077e-06,
634
+ "loss": 0.0849,
635
+ "step": 364
636
+ },
637
+ {
638
+ "epoch": 8.1,
639
+ "learning_rate": 1.3538461538461538e-06,
640
+ "loss": 0.0971,
641
+ "step": 368
642
+ },
643
+ {
644
+ "epoch": 9.01,
645
+ "learning_rate": 1.230769230769231e-06,
646
+ "loss": 0.0876,
647
+ "step": 372
648
+ },
649
+ {
650
+ "epoch": 9.02,
651
+ "learning_rate": 1.107692307692308e-06,
652
+ "loss": 0.0879,
653
+ "step": 376
654
+ },
655
+ {
656
+ "epoch": 9.03,
657
+ "learning_rate": 9.846153846153847e-07,
658
+ "loss": 0.0805,
659
+ "step": 380
660
+ },
661
+ {
662
+ "epoch": 9.04,
663
+ "learning_rate": 8.615384615384616e-07,
664
+ "loss": 0.0888,
665
+ "step": 384
666
+ },
667
+ {
668
+ "epoch": 9.05,
669
+ "learning_rate": 7.384615384615385e-07,
670
+ "loss": 0.0858,
671
+ "step": 388
672
+ },
673
+ {
674
+ "epoch": 9.06,
675
+ "learning_rate": 6.153846153846155e-07,
676
+ "loss": 0.0825,
677
+ "step": 392
678
+ },
679
+ {
680
+ "epoch": 9.07,
681
+ "learning_rate": 4.923076923076923e-07,
682
+ "loss": 0.0748,
683
+ "step": 396
684
+ },
685
+ {
686
+ "epoch": 9.08,
687
+ "learning_rate": 3.6923076923076927e-07,
688
+ "loss": 0.0876,
689
+ "step": 400
690
+ },
691
+ {
692
+ "epoch": 9.08,
693
+ "eval_loss": 0.5568162202835083,
694
+ "eval_runtime": 89.7223,
695
+ "eval_samples_per_second": 4.391,
696
+ "eval_steps_per_second": 0.145,
697
+ "eval_wer": 19.3465805193222,
698
+ "step": 400
699
+ }
700
+ ],
701
+ "max_steps": 407,
702
+ "num_train_epochs": 9223372036854775807,
703
+ "total_flos": 6.2536891981824e+17,
704
+ "trial_name": null,
705
+ "trial_params": null
706
+ }
checkpoint-400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcbca0d141969bcb1c3cd0ef5a009221139334753b899d88e4d5003bd23f4b5f
3
+ size 3579
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_loss": 1.8916987180709839,
3
+ "eval_runtime": 96.9796,
4
+ "eval_samples_per_second": 4.063,
5
+ "eval_steps_per_second": 0.134,
6
+ "eval_wer": 15.494331342191881
7
+ }
huggingface_training.py CHANGED
@@ -32,8 +32,8 @@ dataset = load_dataset(dataset_id, dataset_language_code, streaming=True)
32
 
33
  """The first time you run this code, make sure everything works fine using a small sample and low number of training steps. Just uncomment the next cell and run it. One note: since the dataset is loaded in streaming mode, the instruction will not be executed immediately. Instead, the dataset will be subsampled only when data will be needed during training."""
34
 
35
- test_script = True
36
- # test_script = False
37
 
38
  ## Sample dataset for testing
39
  if test_script is True:
@@ -236,14 +236,14 @@ Last, we can track our training using several experiment tracking tools. I use W
236
  """
237
 
238
  ## If you don't want to track your experiment with WandB, run this!
239
- os.environ["WANDB_DISABLED"] = "true"
240
- report_to = "none"
241
 
242
  # If you have a wandb account, login!
243
  # Otherwise, edit this cell to log in with your favourite experiment tracker(s)
244
- # wandb.login()
245
- # wandb.init(project="whisper-training-post")
246
- # report_to = "wandb"
247
 
248
  # Define (and create, if missing) output directory
249
  output_dir = "."
@@ -264,12 +264,12 @@ eval_bs = 2 if test_script is True else 32
264
  # Then we infer the number of steps
265
  # TODO: how did I find it?
266
  num_training_samples = 2602
267
- num_epochs = 5
268
  max_steps_full_training = ceil(num_training_samples * num_epochs / train_bs)
269
  max_steps = 2 if test_script is True else max_steps_full_training
270
 
271
  # We don't want to evaluate too often since it slows down training a lot
272
- eval_steps = 1 if test_script is True else int(max_steps / 5)
273
  logging_steps = 1 if test_script is True else int(max_steps / 100)
274
 
275
  training_args = Seq2SeqTrainingArguments(
@@ -319,54 +319,54 @@ I hope you haven't left yet. If you have, bad for you, as we are ready for train
319
  As Whisper is a pretrained model ready to be used off-the-shelf, it is advisable to evaluate it before training on both the validation and test sets. Let's make sure we do no harm to it.
320
  """
321
 
322
- # eval_metrics = trainer.evaluate(
323
- # eval_dataset=preprocessed_dataset["validation"],
324
- # metric_key_prefix="eval",
325
- # max_length=448,
326
- # num_beams=1,
327
- # # gen_kwargs={"key": value} to provide additional generation specific arguments by keyword
328
- # )
329
 
330
- # trainer.log_metrics("eval", eval_metrics)
331
- # trainer.save_metrics("eval", eval_metrics)
332
- # print(eval_metrics)
333
 
334
- # test_metrics = trainer.evaluate(
335
- # eval_dataset=preprocessed_dataset["test"],
336
- # metric_key_prefix="test",
337
- # max_length=448,
338
- # num_beams=1,
339
- # # gen_kwargs={"key": value} to provide additional generation specific arguments by keyword
340
- # )
341
 
342
- # trainer.log_metrics("test", test_metrics)
343
- # trainer.save_metrics("test", test_metrics)
344
- # print(test_metrics)
345
 
346
- # train_result = trainer.train()
347
  trainer.save_model()
348
 
349
- # metrics = train_result.metrics
350
- # trainer.log_metrics("train", metrics)
351
- # trainer.save_metrics("train", metrics)
352
- # trainer.save_state()
353
- # print(metrics)
354
 
355
  """ADD SOMETHING ABOUT THE TRAINING.
356
 
357
  Now let's evaluate the fine-tuned model on the test set.
358
  """
359
 
360
- # final_metrics = trainer.evaluate(
361
- # eval_dataset=preprocessed_dataset["test"],
362
- # metric_key_prefix="test",
363
- # max_length=448,
364
- # num_beams=1,
365
- # # gen_kwargs={"key": value} to provide additional generation specific arguments by keyword
366
- # )
367
 
368
- # trainer.log_metrics("test", final_metrics)
369
- # trainer.save_metrics("test", final_metrics)
370
- # print(final_metrics)
371
 
372
  trainer.push_to_hub()
 
32
 
33
  """The first time you run this code, make sure everything works fine using a small sample and low number of training steps. Just uncomment the next cell and run it. One note: since the dataset is loaded in streaming mode, the instruction will not be executed immediately. Instead, the dataset will be subsampled only when data will be needed during training."""
34
 
35
+ # test_script = True
36
+ test_script = False
37
 
38
  ## Sample dataset for testing
39
  if test_script is True:
 
236
  """
237
 
238
  ## If you don't want to track your experiment with WandB, run this!
239
+ # os.environ["WANDB_DISABLED"] = "true"
240
+ # report_to = "none"
241
 
242
  # If you have a wandb account, login!
243
  # Otherwise, edit this cell to log in with your favourite experiment tracker(s)
244
+ wandb.login()
245
+ wandb.init(project="whisper-training-post")
246
+ report_to = "wandb"
247
 
248
  # Define (and create, if missing) output directory
249
  output_dir = "."
 
264
  # Then we infer the number of steps
265
  # TODO: how did I find it?
266
  num_training_samples = 2602
267
+ num_epochs = 10
268
  max_steps_full_training = ceil(num_training_samples * num_epochs / train_bs)
269
  max_steps = 2 if test_script is True else max_steps_full_training
270
 
271
  # We don't want to evaluate too often since it slows down training a lot
272
+ eval_steps = 1 if test_script is True else int(max_steps / 10)
273
  logging_steps = 1 if test_script is True else int(max_steps / 100)
274
 
275
  training_args = Seq2SeqTrainingArguments(
 
319
  As Whisper is a pretrained model ready to be used off-the-shelf, it is advisable to evaluate it before training on both the validation and test sets. Let's make sure we do no harm to it.
320
  """
321
 
322
+ eval_metrics = trainer.evaluate(
323
+ eval_dataset=preprocessed_dataset["validation"],
324
+ metric_key_prefix="eval",
325
+ max_length=448,
326
+ num_beams=1,
327
+ # gen_kwargs={"key": value} to provide additional generation specific arguments by keyword
328
+ )
329
 
330
+ trainer.log_metrics("eval", eval_metrics)
331
+ trainer.save_metrics("eval", eval_metrics)
332
+ print(eval_metrics)
333
 
334
+ test_metrics = trainer.evaluate(
335
+ eval_dataset=preprocessed_dataset["test"],
336
+ metric_key_prefix="test",
337
+ max_length=448,
338
+ num_beams=1,
339
+ # gen_kwargs={"key": value} to provide additional generation specific arguments by keyword
340
+ )
341
 
342
+ trainer.log_metrics("test", test_metrics)
343
+ trainer.save_metrics("test", test_metrics)
344
+ print(test_metrics)
345
 
346
+ train_result = trainer.train()
347
  trainer.save_model()
348
 
349
+ metrics = train_result.metrics
350
+ trainer.log_metrics("train", metrics)
351
+ trainer.save_metrics("train", metrics)
352
+ trainer.save_state()
353
+ print(metrics)
354
 
355
  """ADD SOMETHING ABOUT THE TRAINING.
356
 
357
  Now let's evaluate the fine-tuned model on the test set.
358
  """
359
 
360
+ final_metrics = trainer.evaluate(
361
+ eval_dataset=preprocessed_dataset["test"],
362
+ metric_key_prefix="test",
363
+ max_length=448,
364
+ num_beams=1,
365
+ # gen_kwargs={"key": value} to provide additional generation specific arguments by keyword
366
+ )
367
 
368
+ trainer.log_metrics("test", final_metrics)
369
+ trainer.save_metrics("test", final_metrics)
370
+ print(final_metrics)
371
 
372
  trainer.push_to_hub()
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3843686519777a4550909e8bd4961dcf7425e7183295f03d09a433a271f0887
3
  size 151098921
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21343063174657acd721a023a2780da91e0bede1cc15233f17e5468d93d0ae51
3
  size 151098921
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.09,
3
+ "test_loss": 0.5623113512992859,
4
+ "test_runtime": 121.6703,
5
+ "test_samples_per_second": 5.318,
6
+ "test_steps_per_second": 0.173,
7
+ "test_wer": 20.965372507869883
8
+ }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.09,
3
+ "train_loss": 0.35074408769753995,
4
+ "train_runtime": 2707.3827,
5
+ "train_samples_per_second": 9.621,
6
+ "train_steps_per_second": 0.15
7
+ }
trainer_state.json ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.093366093366093,
5
+ "global_step": 407,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 0.0,
13
+ "loss": 1.8118,
14
+ "step": 4
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 3.6585365853658536e-07,
19
+ "loss": 1.8122,
20
+ "step": 8
21
+ },
22
+ {
23
+ "epoch": 0.03,
24
+ "learning_rate": 8.53658536585366e-07,
25
+ "loss": 1.8174,
26
+ "step": 12
27
+ },
28
+ {
29
+ "epoch": 0.04,
30
+ "learning_rate": 1.3414634146341465e-06,
31
+ "loss": 1.7616,
32
+ "step": 16
33
+ },
34
+ {
35
+ "epoch": 0.05,
36
+ "learning_rate": 1.8292682926829268e-06,
37
+ "loss": 1.6875,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "learning_rate": 2.317073170731708e-06,
43
+ "loss": 1.5201,
44
+ "step": 24
45
+ },
46
+ {
47
+ "epoch": 0.07,
48
+ "learning_rate": 2.8048780487804884e-06,
49
+ "loss": 1.3982,
50
+ "step": 28
51
+ },
52
+ {
53
+ "epoch": 0.08,
54
+ "learning_rate": 3.292682926829269e-06,
55
+ "loss": 1.3541,
56
+ "step": 32
57
+ },
58
+ {
59
+ "epoch": 0.09,
60
+ "learning_rate": 3.780487804878049e-06,
61
+ "loss": 1.2092,
62
+ "step": 36
63
+ },
64
+ {
65
+ "epoch": 0.1,
66
+ "learning_rate": 4.268292682926829e-06,
67
+ "loss": 1.1599,
68
+ "step": 40
69
+ },
70
+ {
71
+ "epoch": 0.1,
72
+ "eval_loss": 1.142654299736023,
73
+ "eval_runtime": 101.9854,
74
+ "eval_samples_per_second": 3.863,
75
+ "eval_steps_per_second": 0.127,
76
+ "eval_wer": 15.213946117274169,
77
+ "step": 40
78
+ },
79
+ {
80
+ "epoch": 1.01,
81
+ "learning_rate": 4.75609756097561e-06,
82
+ "loss": 1.0124,
83
+ "step": 44
84
+ },
85
+ {
86
+ "epoch": 1.02,
87
+ "learning_rate": 5.243902439024391e-06,
88
+ "loss": 0.9171,
89
+ "step": 48
90
+ },
91
+ {
92
+ "epoch": 1.03,
93
+ "learning_rate": 5.731707317073171e-06,
94
+ "loss": 0.8027,
95
+ "step": 52
96
+ },
97
+ {
98
+ "epoch": 1.04,
99
+ "learning_rate": 6.219512195121951e-06,
100
+ "loss": 0.7284,
101
+ "step": 56
102
+ },
103
+ {
104
+ "epoch": 1.05,
105
+ "learning_rate": 6.707317073170733e-06,
106
+ "loss": 0.6185,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 1.06,
111
+ "learning_rate": 7.1951219512195125e-06,
112
+ "loss": 0.57,
113
+ "step": 64
114
+ },
115
+ {
116
+ "epoch": 1.07,
117
+ "learning_rate": 7.682926829268293e-06,
118
+ "loss": 0.4985,
119
+ "step": 68
120
+ },
121
+ {
122
+ "epoch": 1.08,
123
+ "learning_rate": 8.170731707317073e-06,
124
+ "loss": 0.488,
125
+ "step": 72
126
+ },
127
+ {
128
+ "epoch": 1.09,
129
+ "learning_rate": 8.658536585365854e-06,
130
+ "loss": 0.4569,
131
+ "step": 76
132
+ },
133
+ {
134
+ "epoch": 1.1,
135
+ "learning_rate": 9.146341463414635e-06,
136
+ "loss": 0.4655,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 1.1,
141
+ "eval_loss": 0.5613037943840027,
142
+ "eval_runtime": 91.9697,
143
+ "eval_samples_per_second": 4.284,
144
+ "eval_steps_per_second": 0.141,
145
+ "eval_wer": 17.591125198098258,
146
+ "step": 80
147
+ },
148
+ {
149
+ "epoch": 2.0,
150
+ "learning_rate": 9.634146341463415e-06,
151
+ "loss": 0.425,
152
+ "step": 84
153
+ },
154
+ {
155
+ "epoch": 2.01,
156
+ "learning_rate": 9.96923076923077e-06,
157
+ "loss": 0.4162,
158
+ "step": 88
159
+ },
160
+ {
161
+ "epoch": 2.02,
162
+ "learning_rate": 9.846153846153848e-06,
163
+ "loss": 0.3809,
164
+ "step": 92
165
+ },
166
+ {
167
+ "epoch": 2.03,
168
+ "learning_rate": 9.723076923076924e-06,
169
+ "loss": 0.3533,
170
+ "step": 96
171
+ },
172
+ {
173
+ "epoch": 2.04,
174
+ "learning_rate": 9.600000000000001e-06,
175
+ "loss": 0.3511,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 2.05,
180
+ "learning_rate": 9.476923076923079e-06,
181
+ "loss": 0.3475,
182
+ "step": 104
183
+ },
184
+ {
185
+ "epoch": 2.06,
186
+ "learning_rate": 9.353846153846155e-06,
187
+ "loss": 0.321,
188
+ "step": 108
189
+ },
190
+ {
191
+ "epoch": 2.07,
192
+ "learning_rate": 9.230769230769232e-06,
193
+ "loss": 0.2859,
194
+ "step": 112
195
+ },
196
+ {
197
+ "epoch": 2.08,
198
+ "learning_rate": 9.107692307692308e-06,
199
+ "loss": 0.3191,
200
+ "step": 116
201
+ },
202
+ {
203
+ "epoch": 2.09,
204
+ "learning_rate": 8.984615384615386e-06,
205
+ "loss": 0.2753,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 2.09,
210
+ "eval_loss": 0.5241264700889587,
211
+ "eval_runtime": 88.0526,
212
+ "eval_samples_per_second": 4.475,
213
+ "eval_steps_per_second": 0.148,
214
+ "eval_wer": 17.21321467755699,
215
+ "step": 120
216
+ },
217
+ {
218
+ "epoch": 3.0,
219
+ "learning_rate": 8.861538461538463e-06,
220
+ "loss": 0.3104,
221
+ "step": 124
222
+ },
223
+ {
224
+ "epoch": 3.01,
225
+ "learning_rate": 8.73846153846154e-06,
226
+ "loss": 0.2734,
227
+ "step": 128
228
+ },
229
+ {
230
+ "epoch": 3.02,
231
+ "learning_rate": 8.615384615384617e-06,
232
+ "loss": 0.2608,
233
+ "step": 132
234
+ },
235
+ {
236
+ "epoch": 3.03,
237
+ "learning_rate": 8.492307692307693e-06,
238
+ "loss": 0.2509,
239
+ "step": 136
240
+ },
241
+ {
242
+ "epoch": 3.04,
243
+ "learning_rate": 8.36923076923077e-06,
244
+ "loss": 0.2548,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 3.05,
249
+ "learning_rate": 8.246153846153848e-06,
250
+ "loss": 0.2469,
251
+ "step": 144
252
+ },
253
+ {
254
+ "epoch": 3.06,
255
+ "learning_rate": 8.123076923076924e-06,
256
+ "loss": 0.2231,
257
+ "step": 148
258
+ },
259
+ {
260
+ "epoch": 3.07,
261
+ "learning_rate": 8.000000000000001e-06,
262
+ "loss": 0.2138,
263
+ "step": 152
264
+ },
265
+ {
266
+ "epoch": 3.08,
267
+ "learning_rate": 7.876923076923077e-06,
268
+ "loss": 0.2349,
269
+ "step": 156
270
+ },
271
+ {
272
+ "epoch": 3.09,
273
+ "learning_rate": 7.753846153846155e-06,
274
+ "loss": 0.2077,
275
+ "step": 160
276
+ },
277
+ {
278
+ "epoch": 3.09,
279
+ "eval_loss": 0.5241798758506775,
280
+ "eval_runtime": 88.5317,
281
+ "eval_samples_per_second": 4.45,
282
+ "eval_steps_per_second": 0.147,
283
+ "eval_wer": 17.26197732536877,
284
+ "step": 160
285
+ },
286
+ {
287
+ "epoch": 3.1,
288
+ "learning_rate": 7.630769230769232e-06,
289
+ "loss": 0.2322,
290
+ "step": 164
291
+ },
292
+ {
293
+ "epoch": 4.01,
294
+ "learning_rate": 7.507692307692308e-06,
295
+ "loss": 0.2036,
296
+ "step": 168
297
+ },
298
+ {
299
+ "epoch": 4.02,
300
+ "learning_rate": 7.384615384615386e-06,
301
+ "loss": 0.2058,
302
+ "step": 172
303
+ },
304
+ {
305
+ "epoch": 4.03,
306
+ "learning_rate": 7.261538461538462e-06,
307
+ "loss": 0.1797,
308
+ "step": 176
309
+ },
310
+ {
311
+ "epoch": 4.04,
312
+ "learning_rate": 7.1384615384615385e-06,
313
+ "loss": 0.186,
314
+ "step": 180
315
+ },
316
+ {
317
+ "epoch": 4.05,
318
+ "learning_rate": 7.015384615384616e-06,
319
+ "loss": 0.2035,
320
+ "step": 184
321
+ },
322
+ {
323
+ "epoch": 4.06,
324
+ "learning_rate": 6.892307692307693e-06,
325
+ "loss": 0.1794,
326
+ "step": 188
327
+ },
328
+ {
329
+ "epoch": 4.07,
330
+ "learning_rate": 6.76923076923077e-06,
331
+ "loss": 0.1589,
332
+ "step": 192
333
+ },
334
+ {
335
+ "epoch": 4.08,
336
+ "learning_rate": 6.646153846153846e-06,
337
+ "loss": 0.1879,
338
+ "step": 196
339
+ },
340
+ {
341
+ "epoch": 4.09,
342
+ "learning_rate": 6.523076923076923e-06,
343
+ "loss": 0.1636,
344
+ "step": 200
345
+ },
346
+ {
347
+ "epoch": 4.09,
348
+ "eval_loss": 0.5289868712425232,
349
+ "eval_runtime": 95.5188,
350
+ "eval_samples_per_second": 4.125,
351
+ "eval_steps_per_second": 0.136,
352
+ "eval_wer": 17.66426916981592,
353
+ "step": 200
354
+ },
355
+ {
356
+ "epoch": 4.1,
357
+ "learning_rate": 6.4000000000000006e-06,
358
+ "loss": 0.1767,
359
+ "step": 204
360
+ },
361
+ {
362
+ "epoch": 5.01,
363
+ "learning_rate": 6.276923076923077e-06,
364
+ "loss": 0.1657,
365
+ "step": 208
366
+ },
367
+ {
368
+ "epoch": 5.02,
369
+ "learning_rate": 6.153846153846155e-06,
370
+ "loss": 0.1607,
371
+ "step": 212
372
+ },
373
+ {
374
+ "epoch": 5.03,
375
+ "learning_rate": 6.030769230769231e-06,
376
+ "loss": 0.1458,
377
+ "step": 216
378
+ },
379
+ {
380
+ "epoch": 5.04,
381
+ "learning_rate": 5.907692307692308e-06,
382
+ "loss": 0.1541,
383
+ "step": 220
384
+ },
385
+ {
386
+ "epoch": 5.05,
387
+ "learning_rate": 5.784615384615385e-06,
388
+ "loss": 0.1494,
389
+ "step": 224
390
+ },
391
+ {
392
+ "epoch": 5.06,
393
+ "learning_rate": 5.661538461538462e-06,
394
+ "loss": 0.144,
395
+ "step": 228
396
+ },
397
+ {
398
+ "epoch": 5.07,
399
+ "learning_rate": 5.538461538461539e-06,
400
+ "loss": 0.1311,
401
+ "step": 232
402
+ },
403
+ {
404
+ "epoch": 5.08,
405
+ "learning_rate": 5.415384615384615e-06,
406
+ "loss": 0.1411,
407
+ "step": 236
408
+ },
409
+ {
410
+ "epoch": 5.09,
411
+ "learning_rate": 5.292307692307693e-06,
412
+ "loss": 0.1322,
413
+ "step": 240
414
+ },
415
+ {
416
+ "epoch": 5.09,
417
+ "eval_loss": 0.5350630283355713,
418
+ "eval_runtime": 92.5111,
419
+ "eval_samples_per_second": 4.259,
420
+ "eval_steps_per_second": 0.141,
421
+ "eval_wer": 18.2128489576984,
422
+ "step": 240
423
+ },
424
+ {
425
+ "epoch": 5.1,
426
+ "learning_rate": 5.16923076923077e-06,
427
+ "loss": 0.1436,
428
+ "step": 244
429
+ },
430
+ {
431
+ "epoch": 6.0,
432
+ "learning_rate": 5.046153846153846e-06,
433
+ "loss": 0.1375,
434
+ "step": 248
435
+ },
436
+ {
437
+ "epoch": 6.01,
438
+ "learning_rate": 4.923076923076924e-06,
439
+ "loss": 0.1361,
440
+ "step": 252
441
+ },
442
+ {
443
+ "epoch": 6.02,
444
+ "learning_rate": 4.800000000000001e-06,
445
+ "loss": 0.129,
446
+ "step": 256
447
+ },
448
+ {
449
+ "epoch": 6.03,
450
+ "learning_rate": 4.676923076923077e-06,
451
+ "loss": 0.1127,
452
+ "step": 260
453
+ },
454
+ {
455
+ "epoch": 6.04,
456
+ "learning_rate": 4.553846153846154e-06,
457
+ "loss": 0.1266,
458
+ "step": 264
459
+ },
460
+ {
461
+ "epoch": 6.05,
462
+ "learning_rate": 4.430769230769232e-06,
463
+ "loss": 0.1193,
464
+ "step": 268
465
+ },
466
+ {
467
+ "epoch": 6.06,
468
+ "learning_rate": 4.307692307692308e-06,
469
+ "loss": 0.1127,
470
+ "step": 272
471
+ },
472
+ {
473
+ "epoch": 6.07,
474
+ "learning_rate": 4.184615384615385e-06,
475
+ "loss": 0.1064,
476
+ "step": 276
477
+ },
478
+ {
479
+ "epoch": 6.08,
480
+ "learning_rate": 4.061538461538462e-06,
481
+ "loss": 0.123,
482
+ "step": 280
483
+ },
484
+ {
485
+ "epoch": 6.08,
486
+ "eval_loss": 0.5429388284683228,
487
+ "eval_runtime": 91.5818,
488
+ "eval_samples_per_second": 4.302,
489
+ "eval_steps_per_second": 0.142,
490
+ "eval_wer": 18.907716689016212,
491
+ "step": 280
492
+ },
493
+ {
494
+ "epoch": 6.09,
495
+ "learning_rate": 3.938461538461539e-06,
496
+ "loss": 0.1057,
497
+ "step": 284
498
+ },
499
+ {
500
+ "epoch": 7.0,
501
+ "learning_rate": 3.815384615384616e-06,
502
+ "loss": 0.1258,
503
+ "step": 288
504
+ },
505
+ {
506
+ "epoch": 7.01,
507
+ "learning_rate": 3.692307692307693e-06,
508
+ "loss": 0.1108,
509
+ "step": 292
510
+ },
511
+ {
512
+ "epoch": 7.02,
513
+ "learning_rate": 3.5692307692307692e-06,
514
+ "loss": 0.1115,
515
+ "step": 296
516
+ },
517
+ {
518
+ "epoch": 7.03,
519
+ "learning_rate": 3.4461538461538464e-06,
520
+ "loss": 0.0998,
521
+ "step": 300
522
+ },
523
+ {
524
+ "epoch": 7.04,
525
+ "learning_rate": 3.323076923076923e-06,
526
+ "loss": 0.1106,
527
+ "step": 304
528
+ },
529
+ {
530
+ "epoch": 7.05,
531
+ "learning_rate": 3.2000000000000003e-06,
532
+ "loss": 0.1045,
533
+ "step": 308
534
+ },
535
+ {
536
+ "epoch": 7.06,
537
+ "learning_rate": 3.0769230769230774e-06,
538
+ "loss": 0.0908,
539
+ "step": 312
540
+ },
541
+ {
542
+ "epoch": 7.07,
543
+ "learning_rate": 2.953846153846154e-06,
544
+ "loss": 0.0931,
545
+ "step": 316
546
+ },
547
+ {
548
+ "epoch": 7.08,
549
+ "learning_rate": 2.830769230769231e-06,
550
+ "loss": 0.1074,
551
+ "step": 320
552
+ },
553
+ {
554
+ "epoch": 7.08,
555
+ "eval_loss": 0.5500437021255493,
556
+ "eval_runtime": 104.0907,
557
+ "eval_samples_per_second": 3.785,
558
+ "eval_steps_per_second": 0.125,
559
+ "eval_wer": 19.054004632451544,
560
+ "step": 320
561
+ },
562
+ {
563
+ "epoch": 7.09,
564
+ "learning_rate": 2.7076923076923076e-06,
565
+ "loss": 0.0937,
566
+ "step": 324
567
+ },
568
+ {
569
+ "epoch": 7.1,
570
+ "learning_rate": 2.584615384615385e-06,
571
+ "loss": 0.1091,
572
+ "step": 328
573
+ },
574
+ {
575
+ "epoch": 8.01,
576
+ "learning_rate": 2.461538461538462e-06,
577
+ "loss": 0.0951,
578
+ "step": 332
579
+ },
580
+ {
581
+ "epoch": 8.02,
582
+ "learning_rate": 2.3384615384615387e-06,
583
+ "loss": 0.1003,
584
+ "step": 336
585
+ },
586
+ {
587
+ "epoch": 8.03,
588
+ "learning_rate": 2.215384615384616e-06,
589
+ "loss": 0.0836,
590
+ "step": 340
591
+ },
592
+ {
593
+ "epoch": 8.04,
594
+ "learning_rate": 2.0923076923076926e-06,
595
+ "loss": 0.0907,
596
+ "step": 344
597
+ },
598
+ {
599
+ "epoch": 8.05,
600
+ "learning_rate": 1.9692307692307693e-06,
601
+ "loss": 0.1013,
602
+ "step": 348
603
+ },
604
+ {
605
+ "epoch": 8.06,
606
+ "learning_rate": 1.8461538461538465e-06,
607
+ "loss": 0.0891,
608
+ "step": 352
609
+ },
610
+ {
611
+ "epoch": 8.07,
612
+ "learning_rate": 1.7230769230769232e-06,
613
+ "loss": 0.077,
614
+ "step": 356
615
+ },
616
+ {
617
+ "epoch": 8.08,
618
+ "learning_rate": 1.6000000000000001e-06,
619
+ "loss": 0.1007,
620
+ "step": 360
621
+ },
622
+ {
623
+ "epoch": 8.08,
624
+ "eval_loss": 0.5552565455436707,
625
+ "eval_runtime": 88.458,
626
+ "eval_samples_per_second": 4.454,
627
+ "eval_steps_per_second": 0.147,
628
+ "eval_wer": 19.310008533463368,
629
+ "step": 360
630
+ },
631
+ {
632
+ "epoch": 8.09,
633
+ "learning_rate": 1.476923076923077e-06,
634
+ "loss": 0.0849,
635
+ "step": 364
636
+ },
637
+ {
638
+ "epoch": 8.1,
639
+ "learning_rate": 1.3538461538461538e-06,
640
+ "loss": 0.0971,
641
+ "step": 368
642
+ },
643
+ {
644
+ "epoch": 9.01,
645
+ "learning_rate": 1.230769230769231e-06,
646
+ "loss": 0.0876,
647
+ "step": 372
648
+ },
649
+ {
650
+ "epoch": 9.02,
651
+ "learning_rate": 1.107692307692308e-06,
652
+ "loss": 0.0879,
653
+ "step": 376
654
+ },
655
+ {
656
+ "epoch": 9.03,
657
+ "learning_rate": 9.846153846153847e-07,
658
+ "loss": 0.0805,
659
+ "step": 380
660
+ },
661
+ {
662
+ "epoch": 9.04,
663
+ "learning_rate": 8.615384615384616e-07,
664
+ "loss": 0.0888,
665
+ "step": 384
666
+ },
667
+ {
668
+ "epoch": 9.05,
669
+ "learning_rate": 7.384615384615385e-07,
670
+ "loss": 0.0858,
671
+ "step": 388
672
+ },
673
+ {
674
+ "epoch": 9.06,
675
+ "learning_rate": 6.153846153846155e-07,
676
+ "loss": 0.0825,
677
+ "step": 392
678
+ },
679
+ {
680
+ "epoch": 9.07,
681
+ "learning_rate": 4.923076923076923e-07,
682
+ "loss": 0.0748,
683
+ "step": 396
684
+ },
685
+ {
686
+ "epoch": 9.08,
687
+ "learning_rate": 3.6923076923076927e-07,
688
+ "loss": 0.0876,
689
+ "step": 400
690
+ },
691
+ {
692
+ "epoch": 9.08,
693
+ "eval_loss": 0.5568162202835083,
694
+ "eval_runtime": 89.7223,
695
+ "eval_samples_per_second": 4.391,
696
+ "eval_steps_per_second": 0.145,
697
+ "eval_wer": 19.3465805193222,
698
+ "step": 400
699
+ },
700
+ {
701
+ "epoch": 9.09,
702
+ "learning_rate": 2.4615384615384616e-07,
703
+ "loss": 0.0802,
704
+ "step": 404
705
+ },
706
+ {
707
+ "epoch": 9.09,
708
+ "step": 407,
709
+ "total_flos": 6.36398180352e+17,
710
+ "train_loss": 0.35074408769753995,
711
+ "train_runtime": 2707.3827,
712
+ "train_samples_per_second": 9.621,
713
+ "train_steps_per_second": 0.15
714
+ }
715
+ ],
716
+ "max_steps": 407,
717
+ "num_train_epochs": 9223372036854775807,
718
+ "total_flos": 6.36398180352e+17,
719
+ "trial_name": null,
720
+ "trial_params": null
721
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61d71f34a5c8fca893f540e362d0ae479fb1a6d77e2fa71c8da729fdb87d1f54
3
  size 3579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcbca0d141969bcb1c3cd0ef5a009221139334753b899d88e4d5003bd23f4b5f
3
  size 3579