marinone94 committed on
Commit 3644968
1 Parent(s): df76731

clean training step

checkpoint-360/config.json DELETED
@@ -1,142 +0,0 @@
- {
- "_name_or_path": "openai/whisper-tiny",
- "activation_dropout": 0.0,
- "activation_function": "gelu",
- "architectures": [
- "WhisperForConditionalGeneration"
- ],
- "attention_dropout": 0.0,
- "begin_suppress_tokens": [
- 220,
- 50257
- ],
- "bos_token_id": 50257,
- "d_model": 384,
- "decoder_attention_heads": 6,
- "decoder_ffn_dim": 1536,
- "decoder_layerdrop": 0.0,
- "decoder_layers": 4,
- "decoder_start_token_id": 50258,
- "dropout": 0.0,
- "encoder_attention_heads": 6,
- "encoder_ffn_dim": 1536,
- "encoder_layerdrop": 0.0,
- "encoder_layers": 4,
- "eos_token_id": 50257,
- "forced_decoder_ids": [
- [
- 1,
- 50259
- ],
- [
- 2,
- 50359
- ],
- [
- 3,
- 50363
- ]
- ],
- "init_std": 0.02,
- "is_encoder_decoder": true,
- "max_length": 448,
- "max_source_positions": 1500,
- "max_target_positions": 448,
- "model_type": "whisper",
- "num_hidden_layers": 4,
- "num_mel_bins": 80,
- "pad_token_id": 50257,
- "scale_embedding": false,
- "suppress_tokens": [
- 1,
- 2,
- 7,
- 8,
- 9,
- 10,
- 14,
- 25,
- 26,
- 27,
- 28,
- 29,
- 31,
- 58,
- 59,
- 60,
- 61,
- 62,
- 63,
- 90,
- 91,
- 92,
- 93,
- 359,
- 503,
- 522,
- 542,
- 873,
- 893,
- 902,
- 918,
- 922,
- 931,
- 1350,
- 1853,
- 1982,
- 2460,
- 2627,
- 3246,
- 3253,
- 3268,
- 3536,
- 3846,
- 3961,
- 4183,
- 4667,
- 6585,
- 6647,
- 7273,
- 9061,
- 9383,
- 10428,
- 10929,
- 11938,
- 12033,
- 12331,
- 12562,
- 13793,
- 14157,
- 14635,
- 15265,
- 15618,
- 16553,
- 16604,
- 18362,
- 18956,
- 20075,
- 21675,
- 22520,
- 26130,
- 26161,
- 26435,
- 28279,
- 29464,
- 31650,
- 32302,
- 32470,
- 36865,
- 42863,
- 47425,
- 49870,
- 50254,
- 50258,
- 50360,
- 50361,
- 50362
- ],
- "torch_dtype": "float32",
- "transformers_version": "4.26.0.dev0",
- "use_cache": true,
- "vocab_size": 51865
- }
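The deleted config appears unchanged from the stock openai/whisper-tiny configuration (note "_name_or_path" and the 4-layer, d_model=384 geometry), so nothing is lost by removing it. A minimal Python sketch, assuming the transformers package is installed, to regenerate an equivalent file from the Hub:

    from transformers import AutoConfig  # resolves the checkpoint's config.json

    config = AutoConfig.from_pretrained("openai/whisper-tiny")
    print(config.d_model, config.encoder_layers)  # 384 4, matching the deleted file
    config.save_pretrained("checkpoint-360")      # would rewrite config.json locally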
 
checkpoint-360/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:513fbc5d03bd07f32d77cf2f5dcc0d8298575b96fbda2ed1de30f1cb859889ae
- size 302183173
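These three lines are a Git LFS pointer, not the optimizer weights themselves: the ~302 MB binary lives in LFS storage and is referenced by its SHA-256 OID and byte size (a clone needs git lfs pull to materialize it). A small Python sketch, with a hypothetical local path, that parses such a pointer file:

    def read_lfs_pointer(path):
        # Pointer files are short "key value" text lines: version, oid, size
        with open(path) as f:
            return dict(line.strip().split(" ", 1) for line in f if line.strip())

    # e.g. {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '302183173'}
    print(read_lfs_pointer("checkpoint-360/optimizer.pt"))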
 
checkpoint-360/preprocessor_config.json DELETED
The diff for this file is too large to render.
 
checkpoint-360/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:48879b1ce776151b602f3a1bdf10683d776d3f0765214b322443dddb1d951006
- size 151098921
 
checkpoint-360/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:89f9781ff6e5ab617d91036a7029d39a2832fa624ae853afb0f238fb19535016
- size 14575
 
checkpoint-360/scaler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:12d681e2b2a56f2134611cbb1679a9f32470e4cf3a48f4a2243741f0852b30ae
- size 557
 
checkpoint-360/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:922e864e56c484925ddcd495d1c992405fc4f95d13329256b422ef0f40cc0891
- size 627
 
checkpoint-360/trainer_state.json DELETED
@@ -1,637 +0,0 @@
- {
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 8.07862407862408,
- "global_step": 360,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.01,
- "learning_rate": 0.0,
- "loss": 1.8118,
- "step": 4
- },
- {
- "epoch": 0.02,
- "learning_rate": 3.6585365853658536e-07,
- "loss": 1.8122,
- "step": 8
- },
- {
- "epoch": 0.03,
- "learning_rate": 8.53658536585366e-07,
- "loss": 1.8174,
- "step": 12
- },
- {
- "epoch": 0.04,
- "learning_rate": 1.3414634146341465e-06,
- "loss": 1.7616,
- "step": 16
- },
- {
- "epoch": 0.05,
- "learning_rate": 1.8292682926829268e-06,
- "loss": 1.6875,
- "step": 20
- },
- {
- "epoch": 0.06,
- "learning_rate": 2.317073170731708e-06,
- "loss": 1.5201,
- "step": 24
- },
- {
- "epoch": 0.07,
- "learning_rate": 2.8048780487804884e-06,
- "loss": 1.3982,
- "step": 28
- },
- {
- "epoch": 0.08,
- "learning_rate": 3.292682926829269e-06,
- "loss": 1.3541,
- "step": 32
- },
- {
- "epoch": 0.09,
- "learning_rate": 3.780487804878049e-06,
- "loss": 1.2092,
- "step": 36
- },
- {
- "epoch": 0.1,
- "learning_rate": 4.268292682926829e-06,
- "loss": 1.1599,
- "step": 40
- },
- {
- "epoch": 0.1,
- "eval_loss": 1.142654299736023,
- "eval_runtime": 101.9854,
- "eval_samples_per_second": 3.863,
- "eval_steps_per_second": 0.127,
- "eval_wer": 15.213946117274169,
- "step": 40
- },
- {
- "epoch": 1.01,
- "learning_rate": 4.75609756097561e-06,
- "loss": 1.0124,
- "step": 44
- },
- {
- "epoch": 1.02,
- "learning_rate": 5.243902439024391e-06,
- "loss": 0.9171,
- "step": 48
- },
- {
- "epoch": 1.03,
- "learning_rate": 5.731707317073171e-06,
- "loss": 0.8027,
- "step": 52
- },
- {
- "epoch": 1.04,
- "learning_rate": 6.219512195121951e-06,
- "loss": 0.7284,
- "step": 56
- },
- {
- "epoch": 1.05,
- "learning_rate": 6.707317073170733e-06,
- "loss": 0.6185,
- "step": 60
- },
- {
- "epoch": 1.06,
- "learning_rate": 7.1951219512195125e-06,
- "loss": 0.57,
- "step": 64
- },
- {
- "epoch": 1.07,
- "learning_rate": 7.682926829268293e-06,
- "loss": 0.4985,
- "step": 68
- },
- {
- "epoch": 1.08,
- "learning_rate": 8.170731707317073e-06,
- "loss": 0.488,
- "step": 72
- },
- {
- "epoch": 1.09,
- "learning_rate": 8.658536585365854e-06,
- "loss": 0.4569,
- "step": 76
- },
- {
- "epoch": 1.1,
- "learning_rate": 9.146341463414635e-06,
- "loss": 0.4655,
- "step": 80
- },
- {
- "epoch": 1.1,
- "eval_loss": 0.5613037943840027,
- "eval_runtime": 91.9697,
- "eval_samples_per_second": 4.284,
- "eval_steps_per_second": 0.141,
- "eval_wer": 17.591125198098258,
- "step": 80
- },
- {
- "epoch": 2.0,
- "learning_rate": 9.634146341463415e-06,
- "loss": 0.425,
- "step": 84
- },
- {
- "epoch": 2.01,
- "learning_rate": 9.96923076923077e-06,
- "loss": 0.4162,
- "step": 88
- },
- {
- "epoch": 2.02,
- "learning_rate": 9.846153846153848e-06,
- "loss": 0.3809,
- "step": 92
- },
- {
- "epoch": 2.03,
- "learning_rate": 9.723076923076924e-06,
- "loss": 0.3533,
- "step": 96
- },
- {
- "epoch": 2.04,
- "learning_rate": 9.600000000000001e-06,
- "loss": 0.3511,
- "step": 100
- },
- {
- "epoch": 2.05,
- "learning_rate": 9.476923076923079e-06,
- "loss": 0.3475,
- "step": 104
- },
- {
- "epoch": 2.06,
- "learning_rate": 9.353846153846155e-06,
- "loss": 0.321,
- "step": 108
- },
- {
- "epoch": 2.07,
- "learning_rate": 9.230769230769232e-06,
- "loss": 0.2859,
- "step": 112
- },
- {
- "epoch": 2.08,
- "learning_rate": 9.107692307692308e-06,
- "loss": 0.3191,
- "step": 116
- },
- {
- "epoch": 2.09,
- "learning_rate": 8.984615384615386e-06,
- "loss": 0.2753,
- "step": 120
- },
- {
- "epoch": 2.09,
- "eval_loss": 0.5241264700889587,
- "eval_runtime": 88.0526,
- "eval_samples_per_second": 4.475,
- "eval_steps_per_second": 0.148,
- "eval_wer": 17.21321467755699,
- "step": 120
- },
- {
- "epoch": 3.0,
- "learning_rate": 8.861538461538463e-06,
- "loss": 0.3104,
- "step": 124
- },
- {
- "epoch": 3.01,
- "learning_rate": 8.73846153846154e-06,
- "loss": 0.2734,
- "step": 128
- },
- {
- "epoch": 3.02,
- "learning_rate": 8.615384615384617e-06,
- "loss": 0.2608,
- "step": 132
- },
- {
- "epoch": 3.03,
- "learning_rate": 8.492307692307693e-06,
- "loss": 0.2509,
- "step": 136
- },
- {
- "epoch": 3.04,
- "learning_rate": 8.36923076923077e-06,
- "loss": 0.2548,
- "step": 140
- },
- {
- "epoch": 3.05,
- "learning_rate": 8.246153846153848e-06,
- "loss": 0.2469,
- "step": 144
- },
- {
- "epoch": 3.06,
- "learning_rate": 8.123076923076924e-06,
- "loss": 0.2231,
- "step": 148
- },
- {
- "epoch": 3.07,
- "learning_rate": 8.000000000000001e-06,
- "loss": 0.2138,
- "step": 152
- },
- {
- "epoch": 3.08,
- "learning_rate": 7.876923076923077e-06,
- "loss": 0.2349,
- "step": 156
- },
- {
- "epoch": 3.09,
- "learning_rate": 7.753846153846155e-06,
- "loss": 0.2077,
- "step": 160
- },
- {
- "epoch": 3.09,
- "eval_loss": 0.5241798758506775,
- "eval_runtime": 88.5317,
- "eval_samples_per_second": 4.45,
- "eval_steps_per_second": 0.147,
- "eval_wer": 17.26197732536877,
- "step": 160
- },
- {
- "epoch": 3.1,
- "learning_rate": 7.630769230769232e-06,
- "loss": 0.2322,
- "step": 164
- },
- {
- "epoch": 4.01,
- "learning_rate": 7.507692307692308e-06,
- "loss": 0.2036,
- "step": 168
- },
- {
- "epoch": 4.02,
- "learning_rate": 7.384615384615386e-06,
- "loss": 0.2058,
- "step": 172
- },
- {
- "epoch": 4.03,
- "learning_rate": 7.261538461538462e-06,
- "loss": 0.1797,
- "step": 176
- },
- {
- "epoch": 4.04,
- "learning_rate": 7.1384615384615385e-06,
- "loss": 0.186,
- "step": 180
- },
- {
- "epoch": 4.05,
- "learning_rate": 7.015384615384616e-06,
- "loss": 0.2035,
- "step": 184
- },
- {
- "epoch": 4.06,
- "learning_rate": 6.892307692307693e-06,
- "loss": 0.1794,
- "step": 188
- },
- {
- "epoch": 4.07,
- "learning_rate": 6.76923076923077e-06,
- "loss": 0.1589,
- "step": 192
- },
- {
- "epoch": 4.08,
- "learning_rate": 6.646153846153846e-06,
- "loss": 0.1879,
- "step": 196
- },
- {
- "epoch": 4.09,
- "learning_rate": 6.523076923076923e-06,
- "loss": 0.1636,
- "step": 200
- },
- {
- "epoch": 4.09,
- "eval_loss": 0.5289868712425232,
- "eval_runtime": 95.5188,
- "eval_samples_per_second": 4.125,
- "eval_steps_per_second": 0.136,
- "eval_wer": 17.66426916981592,
- "step": 200
- },
- {
- "epoch": 4.1,
- "learning_rate": 6.4000000000000006e-06,
- "loss": 0.1767,
- "step": 204
- },
- {
- "epoch": 5.01,
- "learning_rate": 6.276923076923077e-06,
- "loss": 0.1657,
- "step": 208
- },
- {
- "epoch": 5.02,
- "learning_rate": 6.153846153846155e-06,
- "loss": 0.1607,
- "step": 212
- },
- {
- "epoch": 5.03,
- "learning_rate": 6.030769230769231e-06,
- "loss": 0.1458,
- "step": 216
- },
- {
- "epoch": 5.04,
- "learning_rate": 5.907692307692308e-06,
- "loss": 0.1541,
- "step": 220
- },
- {
- "epoch": 5.05,
- "learning_rate": 5.784615384615385e-06,
- "loss": 0.1494,
- "step": 224
- },
- {
- "epoch": 5.06,
- "learning_rate": 5.661538461538462e-06,
- "loss": 0.144,
- "step": 228
- },
- {
- "epoch": 5.07,
- "learning_rate": 5.538461538461539e-06,
- "loss": 0.1311,
- "step": 232
- },
- {
- "epoch": 5.08,
- "learning_rate": 5.415384615384615e-06,
- "loss": 0.1411,
- "step": 236
- },
- {
- "epoch": 5.09,
- "learning_rate": 5.292307692307693e-06,
- "loss": 0.1322,
- "step": 240
- },
- {
- "epoch": 5.09,
- "eval_loss": 0.5350630283355713,
- "eval_runtime": 92.5111,
- "eval_samples_per_second": 4.259,
- "eval_steps_per_second": 0.141,
- "eval_wer": 18.2128489576984,
- "step": 240
- },
- {
- "epoch": 5.1,
- "learning_rate": 5.16923076923077e-06,
- "loss": 0.1436,
- "step": 244
- },
- {
- "epoch": 6.0,
- "learning_rate": 5.046153846153846e-06,
- "loss": 0.1375,
- "step": 248
- },
- {
- "epoch": 6.01,
- "learning_rate": 4.923076923076924e-06,
- "loss": 0.1361,
- "step": 252
- },
- {
- "epoch": 6.02,
- "learning_rate": 4.800000000000001e-06,
- "loss": 0.129,
- "step": 256
- },
- {
- "epoch": 6.03,
- "learning_rate": 4.676923076923077e-06,
- "loss": 0.1127,
- "step": 260
- },
- {
- "epoch": 6.04,
- "learning_rate": 4.553846153846154e-06,
- "loss": 0.1266,
- "step": 264
- },
- {
- "epoch": 6.05,
- "learning_rate": 4.430769230769232e-06,
- "loss": 0.1193,
- "step": 268
- },
- {
- "epoch": 6.06,
- "learning_rate": 4.307692307692308e-06,
- "loss": 0.1127,
- "step": 272
- },
- {
- "epoch": 6.07,
- "learning_rate": 4.184615384615385e-06,
- "loss": 0.1064,
- "step": 276
- },
- {
- "epoch": 6.08,
- "learning_rate": 4.061538461538462e-06,
- "loss": 0.123,
- "step": 280
- },
- {
- "epoch": 6.08,
- "eval_loss": 0.5429388284683228,
- "eval_runtime": 91.5818,
- "eval_samples_per_second": 4.302,
- "eval_steps_per_second": 0.142,
- "eval_wer": 18.907716689016212,
- "step": 280
- },
- {
- "epoch": 6.09,
- "learning_rate": 3.938461538461539e-06,
- "loss": 0.1057,
- "step": 284
- },
- {
- "epoch": 7.0,
- "learning_rate": 3.815384615384616e-06,
- "loss": 0.1258,
- "step": 288
- },
- {
- "epoch": 7.01,
- "learning_rate": 3.692307692307693e-06,
- "loss": 0.1108,
- "step": 292
- },
- {
- "epoch": 7.02,
- "learning_rate": 3.5692307692307692e-06,
- "loss": 0.1115,
- "step": 296
- },
- {
- "epoch": 7.03,
- "learning_rate": 3.4461538461538464e-06,
- "loss": 0.0998,
- "step": 300
- },
- {
- "epoch": 7.04,
- "learning_rate": 3.323076923076923e-06,
- "loss": 0.1106,
- "step": 304
- },
- {
- "epoch": 7.05,
- "learning_rate": 3.2000000000000003e-06,
- "loss": 0.1045,
- "step": 308
- },
- {
- "epoch": 7.06,
- "learning_rate": 3.0769230769230774e-06,
- "loss": 0.0908,
- "step": 312
- },
- {
- "epoch": 7.07,
- "learning_rate": 2.953846153846154e-06,
- "loss": 0.0931,
- "step": 316
- },
- {
- "epoch": 7.08,
- "learning_rate": 2.830769230769231e-06,
- "loss": 0.1074,
- "step": 320
- },
- {
- "epoch": 7.08,
- "eval_loss": 0.5500437021255493,
- "eval_runtime": 104.0907,
- "eval_samples_per_second": 3.785,
- "eval_steps_per_second": 0.125,
- "eval_wer": 19.054004632451544,
- "step": 320
- },
- {
- "epoch": 7.09,
- "learning_rate": 2.7076923076923076e-06,
- "loss": 0.0937,
- "step": 324
- },
- {
- "epoch": 7.1,
- "learning_rate": 2.584615384615385e-06,
- "loss": 0.1091,
- "step": 328
- },
- {
- "epoch": 8.01,
- "learning_rate": 2.461538461538462e-06,
- "loss": 0.0951,
- "step": 332
- },
- {
- "epoch": 8.02,
- "learning_rate": 2.3384615384615387e-06,
- "loss": 0.1003,
- "step": 336
- },
- {
- "epoch": 8.03,
- "learning_rate": 2.215384615384616e-06,
- "loss": 0.0836,
- "step": 340
- },
- {
- "epoch": 8.04,
- "learning_rate": 2.0923076923076926e-06,
- "loss": 0.0907,
- "step": 344
- },
- {
- "epoch": 8.05,
- "learning_rate": 1.9692307692307693e-06,
- "loss": 0.1013,
- "step": 348
- },
- {
- "epoch": 8.06,
- "learning_rate": 1.8461538461538465e-06,
- "loss": 0.0891,
- "step": 352
- },
- {
- "epoch": 8.07,
- "learning_rate": 1.7230769230769232e-06,
- "loss": 0.077,
- "step": 356
- },
- {
- "epoch": 8.08,
- "learning_rate": 1.6000000000000001e-06,
- "loss": 0.1007,
- "step": 360
- },
- {
- "epoch": 8.08,
- "eval_loss": 0.5552565455436707,
- "eval_runtime": 88.458,
- "eval_samples_per_second": 4.454,
- "eval_steps_per_second": 0.147,
- "eval_wer": 19.310008533463368,
- "step": 360
- }
- ],
- "max_steps": 407,
- "num_train_epochs": 9223372036854775807,
- "total_flos": 5.6288618938368e+17,
- "trial_name": null,
- "trial_params": null
- }
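The log above also motivates the cleanup: eval_wer is best at the very first evaluation (15.21 at step 40) and degrades steadily to 19.31 by step 360 while training loss keeps falling, a classic overfitting curve, so the intermediate checkpoints are not worth keeping. A standard-library Python sketch (using the path deleted in this commit) to pull that step-vs-WER trend out of any saved trainer state:

    import json  # trainer_state.json is plain JSON

    with open("checkpoint-360/trainer_state.json") as f:
        state = json.load(f)

    # Keep only evaluation records; plain training records have no eval_* keys
    for record in state["log_history"]:
        if "eval_wer" in record:
            print(record["step"], round(record["eval_wer"], 2))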
 
checkpoint-360/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fcbca0d141969bcb1c3cd0ef5a009221139334753b899d88e4d5003bd23f4b5f
- size 3579
 
checkpoint-400/config.json DELETED
@@ -1,142 +0,0 @@
- {
- "_name_or_path": "openai/whisper-tiny",
- "activation_dropout": 0.0,
- "activation_function": "gelu",
- "architectures": [
- "WhisperForConditionalGeneration"
- ],
- "attention_dropout": 0.0,
- "begin_suppress_tokens": [
- 220,
- 50257
- ],
- "bos_token_id": 50257,
- "d_model": 384,
- "decoder_attention_heads": 6,
- "decoder_ffn_dim": 1536,
- "decoder_layerdrop": 0.0,
- "decoder_layers": 4,
- "decoder_start_token_id": 50258,
- "dropout": 0.0,
- "encoder_attention_heads": 6,
- "encoder_ffn_dim": 1536,
- "encoder_layerdrop": 0.0,
- "encoder_layers": 4,
- "eos_token_id": 50257,
- "forced_decoder_ids": [
- [
- 1,
- 50259
- ],
- [
- 2,
- 50359
- ],
- [
- 3,
- 50363
- ]
- ],
- "init_std": 0.02,
- "is_encoder_decoder": true,
- "max_length": 448,
- "max_source_positions": 1500,
- "max_target_positions": 448,
- "model_type": "whisper",
- "num_hidden_layers": 4,
- "num_mel_bins": 80,
- "pad_token_id": 50257,
- "scale_embedding": false,
- "suppress_tokens": [
- 1,
- 2,
- 7,
- 8,
- 9,
- 10,
- 14,
- 25,
- 26,
- 27,
- 28,
- 29,
- 31,
- 58,
- 59,
- 60,
- 61,
- 62,
- 63,
- 90,
- 91,
- 92,
- 93,
- 359,
- 503,
- 522,
- 542,
- 873,
- 893,
- 902,
- 918,
- 922,
- 931,
- 1350,
- 1853,
- 1982,
- 2460,
- 2627,
- 3246,
- 3253,
- 3268,
- 3536,
- 3846,
- 3961,
- 4183,
- 4667,
- 6585,
- 6647,
- 7273,
- 9061,
- 9383,
- 10428,
- 10929,
- 11938,
- 12033,
- 12331,
- 12562,
- 13793,
- 14157,
- 14635,
- 15265,
- 15618,
- 16553,
- 16604,
- 18362,
- 18956,
- 20075,
- 21675,
- 22520,
- 26130,
- 26161,
- 26435,
- 28279,
- 29464,
- 31650,
- 32302,
- 32470,
- 36865,
- 42863,
- 47425,
- 49870,
- 50254,
- 50258,
- 50360,
- 50361,
- 50362
- ],
- "torch_dtype": "float32",
- "transformers_version": "4.26.0.dev0",
- "use_cache": true,
- "vocab_size": 51865
- }
 
checkpoint-400/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ca2f216746e89a3c2dc592e56702d375bf97996f49afe1d761ee97223c74e35c
- size 302183173
 
checkpoint-400/preprocessor_config.json DELETED
The diff for this file is too large to render.
 
checkpoint-400/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a2f00e8aebd51836c62a6367fa1a3bad01938ccd285ac6cc2c2dd7b6e9755793
- size 151098921
 
checkpoint-400/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e0fee234ab8151645a0895ac4e9559fbd6bec4f70f802b8c94db562d283ad737
- size 14639
 
checkpoint-400/scaler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:37d85a7f00fa7bae4774c70bac351a030ccefea202dbec056f5a4d44e50b132c
- size 557
 
checkpoint-400/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1cf1f3c596de0ecc246fc5b02e9720244273de83ebc2f79d153609594e679a82
- size 627
 
checkpoint-400/trainer_state.json DELETED
@@ -1,706 +0,0 @@
- {
- "best_metric": null,
- "best_model_checkpoint": null,
- "epoch": 9.076167076167076,
- "global_step": 400,
- "is_hyper_param_search": false,
- "is_local_process_zero": true,
- "is_world_process_zero": true,
- "log_history": [
- {
- "epoch": 0.01,
- "learning_rate": 0.0,
- "loss": 1.8118,
- "step": 4
- },
- {
- "epoch": 0.02,
- "learning_rate": 3.6585365853658536e-07,
- "loss": 1.8122,
- "step": 8
- },
- {
- "epoch": 0.03,
- "learning_rate": 8.53658536585366e-07,
- "loss": 1.8174,
- "step": 12
- },
- {
- "epoch": 0.04,
- "learning_rate": 1.3414634146341465e-06,
- "loss": 1.7616,
- "step": 16
- },
- {
- "epoch": 0.05,
- "learning_rate": 1.8292682926829268e-06,
- "loss": 1.6875,
- "step": 20
- },
- {
- "epoch": 0.06,
- "learning_rate": 2.317073170731708e-06,
- "loss": 1.5201,
- "step": 24
- },
- {
- "epoch": 0.07,
- "learning_rate": 2.8048780487804884e-06,
- "loss": 1.3982,
- "step": 28
- },
- {
- "epoch": 0.08,
- "learning_rate": 3.292682926829269e-06,
- "loss": 1.3541,
- "step": 32
- },
- {
- "epoch": 0.09,
- "learning_rate": 3.780487804878049e-06,
- "loss": 1.2092,
- "step": 36
- },
- {
- "epoch": 0.1,
- "learning_rate": 4.268292682926829e-06,
- "loss": 1.1599,
- "step": 40
- },
- {
- "epoch": 0.1,
- "eval_loss": 1.142654299736023,
- "eval_runtime": 101.9854,
- "eval_samples_per_second": 3.863,
- "eval_steps_per_second": 0.127,
- "eval_wer": 15.213946117274169,
- "step": 40
- },
- {
- "epoch": 1.01,
- "learning_rate": 4.75609756097561e-06,
- "loss": 1.0124,
- "step": 44
- },
- {
- "epoch": 1.02,
- "learning_rate": 5.243902439024391e-06,
- "loss": 0.9171,
- "step": 48
- },
- {
- "epoch": 1.03,
- "learning_rate": 5.731707317073171e-06,
- "loss": 0.8027,
- "step": 52
- },
- {
- "epoch": 1.04,
- "learning_rate": 6.219512195121951e-06,
- "loss": 0.7284,
- "step": 56
- },
- {
- "epoch": 1.05,
- "learning_rate": 6.707317073170733e-06,
- "loss": 0.6185,
- "step": 60
- },
- {
- "epoch": 1.06,
- "learning_rate": 7.1951219512195125e-06,
- "loss": 0.57,
- "step": 64
- },
- {
- "epoch": 1.07,
- "learning_rate": 7.682926829268293e-06,
- "loss": 0.4985,
- "step": 68
- },
- {
- "epoch": 1.08,
- "learning_rate": 8.170731707317073e-06,
- "loss": 0.488,
- "step": 72
- },
- {
- "epoch": 1.09,
- "learning_rate": 8.658536585365854e-06,
- "loss": 0.4569,
- "step": 76
- },
- {
- "epoch": 1.1,
- "learning_rate": 9.146341463414635e-06,
- "loss": 0.4655,
- "step": 80
- },
- {
- "epoch": 1.1,
- "eval_loss": 0.5613037943840027,
- "eval_runtime": 91.9697,
- "eval_samples_per_second": 4.284,
- "eval_steps_per_second": 0.141,
- "eval_wer": 17.591125198098258,
- "step": 80
- },
- {
- "epoch": 2.0,
- "learning_rate": 9.634146341463415e-06,
- "loss": 0.425,
- "step": 84
- },
- {
- "epoch": 2.01,
- "learning_rate": 9.96923076923077e-06,
- "loss": 0.4162,
- "step": 88
- },
- {
- "epoch": 2.02,
- "learning_rate": 9.846153846153848e-06,
- "loss": 0.3809,
- "step": 92
- },
- {
- "epoch": 2.03,
- "learning_rate": 9.723076923076924e-06,
- "loss": 0.3533,
- "step": 96
- },
- {
- "epoch": 2.04,
- "learning_rate": 9.600000000000001e-06,
- "loss": 0.3511,
- "step": 100
- },
- {
- "epoch": 2.05,
- "learning_rate": 9.476923076923079e-06,
- "loss": 0.3475,
- "step": 104
- },
- {
- "epoch": 2.06,
- "learning_rate": 9.353846153846155e-06,
- "loss": 0.321,
- "step": 108
- },
- {
- "epoch": 2.07,
- "learning_rate": 9.230769230769232e-06,
- "loss": 0.2859,
- "step": 112
- },
- {
- "epoch": 2.08,
- "learning_rate": 9.107692307692308e-06,
- "loss": 0.3191,
- "step": 116
- },
- {
- "epoch": 2.09,
- "learning_rate": 8.984615384615386e-06,
- "loss": 0.2753,
- "step": 120
- },
- {
- "epoch": 2.09,
- "eval_loss": 0.5241264700889587,
- "eval_runtime": 88.0526,
- "eval_samples_per_second": 4.475,
- "eval_steps_per_second": 0.148,
- "eval_wer": 17.21321467755699,
- "step": 120
- },
- {
- "epoch": 3.0,
- "learning_rate": 8.861538461538463e-06,
- "loss": 0.3104,
- "step": 124
- },
- {
- "epoch": 3.01,
- "learning_rate": 8.73846153846154e-06,
- "loss": 0.2734,
- "step": 128
- },
- {
- "epoch": 3.02,
- "learning_rate": 8.615384615384617e-06,
- "loss": 0.2608,
- "step": 132
- },
- {
- "epoch": 3.03,
- "learning_rate": 8.492307692307693e-06,
- "loss": 0.2509,
- "step": 136
- },
- {
- "epoch": 3.04,
- "learning_rate": 8.36923076923077e-06,
- "loss": 0.2548,
- "step": 140
- },
- {
- "epoch": 3.05,
- "learning_rate": 8.246153846153848e-06,
- "loss": 0.2469,
- "step": 144
- },
- {
- "epoch": 3.06,
- "learning_rate": 8.123076923076924e-06,
- "loss": 0.2231,
- "step": 148
- },
- {
- "epoch": 3.07,
- "learning_rate": 8.000000000000001e-06,
- "loss": 0.2138,
- "step": 152
- },
- {
- "epoch": 3.08,
- "learning_rate": 7.876923076923077e-06,
- "loss": 0.2349,
- "step": 156
- },
- {
- "epoch": 3.09,
- "learning_rate": 7.753846153846155e-06,
- "loss": 0.2077,
- "step": 160
- },
- {
- "epoch": 3.09,
- "eval_loss": 0.5241798758506775,
- "eval_runtime": 88.5317,
- "eval_samples_per_second": 4.45,
- "eval_steps_per_second": 0.147,
- "eval_wer": 17.26197732536877,
- "step": 160
- },
- {
- "epoch": 3.1,
- "learning_rate": 7.630769230769232e-06,
- "loss": 0.2322,
- "step": 164
- },
- {
- "epoch": 4.01,
- "learning_rate": 7.507692307692308e-06,
- "loss": 0.2036,
- "step": 168
- },
- {
- "epoch": 4.02,
- "learning_rate": 7.384615384615386e-06,
- "loss": 0.2058,
- "step": 172
- },
- {
- "epoch": 4.03,
- "learning_rate": 7.261538461538462e-06,
- "loss": 0.1797,
- "step": 176
- },
- {
- "epoch": 4.04,
- "learning_rate": 7.1384615384615385e-06,
- "loss": 0.186,
- "step": 180
- },
- {
- "epoch": 4.05,
- "learning_rate": 7.015384615384616e-06,
- "loss": 0.2035,
- "step": 184
- },
- {
- "epoch": 4.06,
- "learning_rate": 6.892307692307693e-06,
- "loss": 0.1794,
- "step": 188
- },
- {
- "epoch": 4.07,
- "learning_rate": 6.76923076923077e-06,
- "loss": 0.1589,
- "step": 192
- },
- {
- "epoch": 4.08,
- "learning_rate": 6.646153846153846e-06,
- "loss": 0.1879,
- "step": 196
- },
- {
- "epoch": 4.09,
- "learning_rate": 6.523076923076923e-06,
- "loss": 0.1636,
- "step": 200
- },
- {
- "epoch": 4.09,
- "eval_loss": 0.5289868712425232,
- "eval_runtime": 95.5188,
- "eval_samples_per_second": 4.125,
- "eval_steps_per_second": 0.136,
- "eval_wer": 17.66426916981592,
- "step": 200
- },
- {
- "epoch": 4.1,
- "learning_rate": 6.4000000000000006e-06,
- "loss": 0.1767,
- "step": 204
- },
- {
- "epoch": 5.01,
- "learning_rate": 6.276923076923077e-06,
- "loss": 0.1657,
- "step": 208
- },
- {
- "epoch": 5.02,
- "learning_rate": 6.153846153846155e-06,
- "loss": 0.1607,
- "step": 212
- },
- {
- "epoch": 5.03,
- "learning_rate": 6.030769230769231e-06,
- "loss": 0.1458,
- "step": 216
- },
- {
- "epoch": 5.04,
- "learning_rate": 5.907692307692308e-06,
- "loss": 0.1541,
- "step": 220
- },
- {
- "epoch": 5.05,
- "learning_rate": 5.784615384615385e-06,
- "loss": 0.1494,
- "step": 224
- },
- {
- "epoch": 5.06,
- "learning_rate": 5.661538461538462e-06,
- "loss": 0.144,
- "step": 228
- },
- {
- "epoch": 5.07,
- "learning_rate": 5.538461538461539e-06,
- "loss": 0.1311,
- "step": 232
- },
- {
- "epoch": 5.08,
- "learning_rate": 5.415384615384615e-06,
- "loss": 0.1411,
- "step": 236
- },
- {
- "epoch": 5.09,
- "learning_rate": 5.292307692307693e-06,
- "loss": 0.1322,
- "step": 240
- },
- {
- "epoch": 5.09,
- "eval_loss": 0.5350630283355713,
- "eval_runtime": 92.5111,
- "eval_samples_per_second": 4.259,
- "eval_steps_per_second": 0.141,
- "eval_wer": 18.2128489576984,
- "step": 240
- },
- {
- "epoch": 5.1,
- "learning_rate": 5.16923076923077e-06,
- "loss": 0.1436,
- "step": 244
- },
- {
- "epoch": 6.0,
- "learning_rate": 5.046153846153846e-06,
- "loss": 0.1375,
- "step": 248
- },
- {
- "epoch": 6.01,
- "learning_rate": 4.923076923076924e-06,
- "loss": 0.1361,
- "step": 252
- },
- {
- "epoch": 6.02,
- "learning_rate": 4.800000000000001e-06,
- "loss": 0.129,
- "step": 256
- },
- {
- "epoch": 6.03,
- "learning_rate": 4.676923076923077e-06,
- "loss": 0.1127,
- "step": 260
- },
- {
- "epoch": 6.04,
- "learning_rate": 4.553846153846154e-06,
- "loss": 0.1266,
- "step": 264
- },
- {
- "epoch": 6.05,
- "learning_rate": 4.430769230769232e-06,
- "loss": 0.1193,
- "step": 268
- },
- {
- "epoch": 6.06,
- "learning_rate": 4.307692307692308e-06,
- "loss": 0.1127,
- "step": 272
- },
- {
- "epoch": 6.07,
- "learning_rate": 4.184615384615385e-06,
- "loss": 0.1064,
- "step": 276
- },
- {
- "epoch": 6.08,
- "learning_rate": 4.061538461538462e-06,
- "loss": 0.123,
- "step": 280
- },
- {
- "epoch": 6.08,
- "eval_loss": 0.5429388284683228,
- "eval_runtime": 91.5818,
- "eval_samples_per_second": 4.302,
- "eval_steps_per_second": 0.142,
- "eval_wer": 18.907716689016212,
- "step": 280
- },
- {
- "epoch": 6.09,
- "learning_rate": 3.938461538461539e-06,
- "loss": 0.1057,
- "step": 284
- },
- {
- "epoch": 7.0,
- "learning_rate": 3.815384615384616e-06,
- "loss": 0.1258,
- "step": 288
- },
- {
- "epoch": 7.01,
- "learning_rate": 3.692307692307693e-06,
- "loss": 0.1108,
- "step": 292
- },
- {
- "epoch": 7.02,
- "learning_rate": 3.5692307692307692e-06,
- "loss": 0.1115,
- "step": 296
- },
- {
- "epoch": 7.03,
- "learning_rate": 3.4461538461538464e-06,
- "loss": 0.0998,
- "step": 300
- },
- {
- "epoch": 7.04,
- "learning_rate": 3.323076923076923e-06,
- "loss": 0.1106,
- "step": 304
- },
- {
- "epoch": 7.05,
- "learning_rate": 3.2000000000000003e-06,
- "loss": 0.1045,
- "step": 308
- },
- {
- "epoch": 7.06,
- "learning_rate": 3.0769230769230774e-06,
- "loss": 0.0908,
- "step": 312
- },
- {
- "epoch": 7.07,
- "learning_rate": 2.953846153846154e-06,
- "loss": 0.0931,
- "step": 316
- },
- {
- "epoch": 7.08,
- "learning_rate": 2.830769230769231e-06,
- "loss": 0.1074,
- "step": 320
- },
- {
- "epoch": 7.08,
- "eval_loss": 0.5500437021255493,
- "eval_runtime": 104.0907,
- "eval_samples_per_second": 3.785,
- "eval_steps_per_second": 0.125,
- "eval_wer": 19.054004632451544,
- "step": 320
- },
- {
- "epoch": 7.09,
- "learning_rate": 2.7076923076923076e-06,
- "loss": 0.0937,
- "step": 324
- },
- {
- "epoch": 7.1,
- "learning_rate": 2.584615384615385e-06,
- "loss": 0.1091,
- "step": 328
- },
- {
- "epoch": 8.01,
- "learning_rate": 2.461538461538462e-06,
- "loss": 0.0951,
- "step": 332
- },
- {
- "epoch": 8.02,
- "learning_rate": 2.3384615384615387e-06,
- "loss": 0.1003,
- "step": 336
- },
- {
- "epoch": 8.03,
- "learning_rate": 2.215384615384616e-06,
- "loss": 0.0836,
- "step": 340
- },
- {
- "epoch": 8.04,
- "learning_rate": 2.0923076923076926e-06,
- "loss": 0.0907,
- "step": 344
- },
- {
- "epoch": 8.05,
- "learning_rate": 1.9692307692307693e-06,
- "loss": 0.1013,
- "step": 348
- },
- {
- "epoch": 8.06,
- "learning_rate": 1.8461538461538465e-06,
- "loss": 0.0891,
- "step": 352
- },
- {
- "epoch": 8.07,
- "learning_rate": 1.7230769230769232e-06,
- "loss": 0.077,
- "step": 356
- },
- {
- "epoch": 8.08,
- "learning_rate": 1.6000000000000001e-06,
- "loss": 0.1007,
- "step": 360
- },
- {
- "epoch": 8.08,
- "eval_loss": 0.5552565455436707,
- "eval_runtime": 88.458,
- "eval_samples_per_second": 4.454,
- "eval_steps_per_second": 0.147,
- "eval_wer": 19.310008533463368,
- "step": 360
- },
- {
- "epoch": 8.09,
- "learning_rate": 1.476923076923077e-06,
- "loss": 0.0849,
- "step": 364
- },
- {
- "epoch": 8.1,
- "learning_rate": 1.3538461538461538e-06,
- "loss": 0.0971,
- "step": 368
- },
- {
- "epoch": 9.01,
- "learning_rate": 1.230769230769231e-06,
- "loss": 0.0876,
- "step": 372
- },
- {
- "epoch": 9.02,
- "learning_rate": 1.107692307692308e-06,
- "loss": 0.0879,
- "step": 376
- },
- {
- "epoch": 9.03,
- "learning_rate": 9.846153846153847e-07,
- "loss": 0.0805,
- "step": 380
- },
- {
- "epoch": 9.04,
- "learning_rate": 8.615384615384616e-07,
- "loss": 0.0888,
- "step": 384
- },
- {
- "epoch": 9.05,
- "learning_rate": 7.384615384615385e-07,
- "loss": 0.0858,
- "step": 388
- },
- {
- "epoch": 9.06,
- "learning_rate": 6.153846153846155e-07,
- "loss": 0.0825,
- "step": 392
- },
- {
- "epoch": 9.07,
- "learning_rate": 4.923076923076923e-07,
- "loss": 0.0748,
- "step": 396
- },
- {
- "epoch": 9.08,
- "learning_rate": 3.6923076923076927e-07,
- "loss": 0.0876,
- "step": 400
- },
- {
- "epoch": 9.08,
- "eval_loss": 0.5568162202835083,
- "eval_runtime": 89.7223,
- "eval_samples_per_second": 4.391,
- "eval_steps_per_second": 0.145,
- "eval_wer": 19.3465805193222,
- "step": 400
- }
- ],
- "max_steps": 407,
- "num_train_epochs": 9223372036854775807,
- "total_flos": 6.2536891981824e+17,
- "trial_name": null,
- "trial_params": null
- }
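Two footer values worth decoding: "max_steps": 407 is exactly what the pre-change script computes, ceil(num_training_samples * num_epochs / train_bs) = ceil(2602 * 10 / 64) = ceil(406.56) = 407; and "num_train_epochs": 9223372036854775807 is 2**63 - 1, the placeholder the Trainer reports when a streaming dataset has no known length and training is bounded by max_steps instead.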
 
checkpoint-400/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fcbca0d141969bcb1c3cd0ef5a009221139334753b899d88e4d5003bd23f4b5f
- size 3579
 
huggingface_training.py CHANGED
@@ -1,3 +1,5 @@
+ """ Whisper training script using Hugging Face Transformers. """
+
  import os # used to create output directory
  from dataclasses import dataclass # used to define data collator
  from math import ceil # used to round up decimals
@@ -27,13 +29,13 @@ config = AutoConfig.from_pretrained(model_id)
  model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)

  dataset_id = "google/fleurs"
- dataset_language_code = "en_us"
+ dataset_language_code = "sv_se"
  dataset = load_dataset(dataset_id, dataset_language_code, streaming=True)

  """The first time you run this code, make sure everything works fine using a small sample and low number of training steps. Just uncomment the next cell and run it. One note: since the dataset is loaded in streaming mode, the instruction will not be executed immediately. Instead, the dataset will be subsampled only when data will be needed during training."""

- # test_script = True
- test_script = False
+ test_script = True
+ # test_script = False

  ## Sample dataset for testing
  if test_script is True:
@@ -186,11 +188,11 @@ class ShuffleCallback(TrainerCallback):
  In our specific case, we could skip this step since English transcription is the default behaviour. Still, this is how you would do if you were in a multilingual setting.
  """

- processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
+ # processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")

  ## If you wanted to transcribe in Swedish
  ## (Of course, you'd need a Swedish dataset)
- # processor.tokenizer.set_prefix_tokens(language="sv", task="transcribe")
+ processor.tokenizer.set_prefix_tokens(language="sv", task="transcribe")

  ## If you wanted to get an English transcription from Swedish audio
  # processor.tokenizer.set_prefix_tokens(language="sv", task="translate")
@@ -245,10 +247,6 @@ wandb.login()
  wandb.init(project="whisper-training-post")
  report_to = "wandb"

- # Define (and create, if missing) output directory
- output_dir = "."
- # os.makedirs(output_dir, exist_ok=True)
-
  # Check if we have a GPU.
  # In case, we will use mixed precision
  # to reduce memory footprint with
@@ -257,23 +255,24 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
  use_fp16 = (device == "cuda")

  # Let's first define the batch sizes
- # Increase it if you have more than 16GB GPU
+ # Adapt it to your hardware
  train_bs = 4 if test_script is True else 64
  eval_bs = 2 if test_script is True else 32

  # Then we infer the number of steps
  # TODO: how did I find it?
- num_training_samples = 2602
- num_epochs = 10
+ num_training_samples = 2385
+ num_epochs = 3
  max_steps_full_training = ceil(num_training_samples * num_epochs / train_bs)
  max_steps = 2 if test_script is True else max_steps_full_training

  # We don't want to evaluate too often since it slows down training a lot
+ # but neither too little, since we want to see how the model is training
  eval_steps = 1 if test_script is True else int(max_steps / 10)
  logging_steps = 1 if test_script is True else int(max_steps / 100)

  training_args = Seq2SeqTrainingArguments(
-     output_dir=output_dir,
+     output_dir=".",
      do_train=True,
      do_eval=True,
      max_steps=max_steps,
@@ -293,7 +292,7 @@ training_args = Seq2SeqTrainingArguments(
      predict_with_generate=True,
      generation_num_beams=1,
      # track experiment
-     report_to=report_to, # edit this line to track with your favourite experiment tracker(s)
+     report_to=report_to
  )

  """Now we can provide the trainer with the model, tokenizer (important: use the one you set language and task to! In this example, it is `processor.tokenizer`), training arguments, datasets, data collator, callback, and the method to compute metrics during evaluation.
@@ -369,4 +368,6 @@ trainer.log_metrics("test", final_metrics)
  trainer.save_metrics("test", final_metrics)
  print(final_metrics)

+ # Pushing to hub during training slows down training
+ # so we push it only in the end.
  trainer.push_to_hub()
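With the new values, a full run (test_script = False) works out to max_steps = ceil(2385 * 3 / 64) = ceil(111.8) = 112, hence eval_steps = int(112 / 10) = 11 and logging_steps = int(112 / 100) = 1. As a hedged follow-up to the checkpoint cleanup this commit performs: the Trainer can also be told to keep fewer checkpoints on disk in the first place via save_total_limit (a real Seq2SeqTrainingArguments parameter; the value 1 below is only an illustration):

    training_args = Seq2SeqTrainingArguments(
        output_dir=".",
        max_steps=max_steps,
        save_total_limit=1,  # keep only the most recent checkpoint
        # ... remaining arguments as in the script above ...
    )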