AlekseyKorshuk commited on
Commit
77278cc
1 Parent(s): 776fb1c

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +201 -501
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "train_loss": 1.4392181396484376,
4
- "train_runtime": 183.8573,
5
  "train_samples": 156,
6
- "train_samples_per_second": 4.242,
7
- "train_steps_per_second": 0.272
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 1.999542236328125,
4
+ "train_runtime": 244.9371,
5
  "train_samples": 156,
6
+ "train_samples_per_second": 1.911,
7
+ "train_steps_per_second": 0.122
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "train_loss": 1.4392181396484376,
4
- "train_runtime": 183.8573,
5
  "train_samples": 156,
6
- "train_samples_per_second": 4.242,
7
- "train_steps_per_second": 0.272
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 1.999542236328125,
4
+ "train_runtime": 244.9371,
5
  "train_samples": 156,
6
+ "train_samples_per_second": 1.911,
7
+ "train_steps_per_second": 0.122
8
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
- "global_step": 50,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -17,9 +17,9 @@
17
  "epoch": 0.1,
18
  "eval_accuracy": 0.05291319857312723,
19
  "eval_loss": 2.6484375,
20
- "eval_runtime": 1.5576,
21
- "eval_samples_per_second": 18.618,
22
- "eval_steps_per_second": 1.284,
23
  "step": 1
24
  },
25
  {
@@ -32,14 +32,14 @@
32
  "epoch": 0.2,
33
  "eval_accuracy": 0.05291319857312723,
34
  "eval_loss": 2.6484375,
35
- "eval_runtime": 1.275,
36
- "eval_samples_per_second": 22.746,
37
- "eval_steps_per_second": 1.569,
38
  "step": 2
39
  },
40
  {
41
  "epoch": 0.3,
42
- "learning_rate": 2.9970400926424075e-05,
43
  "loss": 2.6365,
44
  "step": 3
45
  },
@@ -47,729 +47,429 @@
47
  "epoch": 0.3,
48
  "eval_accuracy": 0.05601796802748051,
49
  "eval_loss": 2.55078125,
50
- "eval_runtime": 1.1095,
51
- "eval_samples_per_second": 26.137,
52
- "eval_steps_per_second": 1.803,
53
  "step": 3
54
  },
55
  {
56
  "epoch": 0.4,
57
- "learning_rate": 2.988172051971717e-05,
58
  "loss": 2.5088,
59
  "step": 4
60
  },
61
  {
62
  "epoch": 0.4,
63
- "eval_accuracy": 0.056282203725723345,
64
  "eval_loss": 2.533203125,
65
- "eval_runtime": 1.0069,
66
- "eval_samples_per_second": 28.8,
67
- "eval_steps_per_second": 1.986,
68
  "step": 4
69
  },
70
  {
71
  "epoch": 0.5,
72
- "learning_rate": 2.9734308760930333e-05,
73
- "loss": 2.7297,
74
  "step": 5
75
  },
76
  {
77
  "epoch": 0.5,
78
- "eval_accuracy": 0.05667855727308759,
79
  "eval_loss": 2.517578125,
80
- "eval_runtime": 1.0127,
81
- "eval_samples_per_second": 28.637,
82
- "eval_steps_per_second": 1.975,
83
  "step": 5
84
  },
85
  {
86
  "epoch": 0.6,
87
- "learning_rate": 2.9528747416929467e-05,
88
- "loss": 2.9702,
89
  "step": 6
90
  },
91
  {
92
  "epoch": 0.6,
93
- "eval_accuracy": 0.05720702866957326,
94
  "eval_loss": 2.494140625,
95
- "eval_runtime": 1.2159,
96
- "eval_samples_per_second": 23.851,
97
- "eval_steps_per_second": 1.645,
98
  "step": 6
99
  },
100
  {
101
  "epoch": 0.7,
102
- "learning_rate": 2.9265847744427305e-05,
103
- "loss": 2.729,
104
  "step": 7
105
  },
106
  {
107
  "epoch": 0.7,
108
- "eval_accuracy": 0.05681067512220901,
109
  "eval_loss": 2.48828125,
110
- "eval_runtime": 1.0262,
111
- "eval_samples_per_second": 28.259,
112
- "eval_steps_per_second": 1.949,
113
  "step": 7
114
  },
115
  {
116
  "epoch": 0.8,
117
- "learning_rate": 2.894664728832377e-05,
118
- "loss": 2.6172,
119
  "step": 8
120
  },
121
  {
122
  "epoch": 0.8,
123
  "eval_accuracy": 0.05780155899061963,
124
- "eval_loss": 2.478515625,
125
- "eval_runtime": 1.3127,
126
- "eval_samples_per_second": 22.092,
127
- "eval_steps_per_second": 1.524,
128
  "step": 8
129
  },
130
  {
131
  "epoch": 0.9,
132
- "learning_rate": 2.8572405786990293e-05,
133
- "loss": 2.6428,
134
  "step": 9
135
  },
136
  {
137
  "epoch": 0.9,
138
- "eval_accuracy": 0.058065794688862464,
139
  "eval_loss": 2.458984375,
140
- "eval_runtime": 1.0971,
141
- "eval_samples_per_second": 26.432,
142
- "eval_steps_per_second": 1.823,
143
  "step": 9
144
  },
145
  {
146
  "epoch": 1.0,
147
- "learning_rate": 2.8144600200657953e-05,
148
- "loss": 2.5681,
149
  "step": 10
150
  },
151
  {
152
  "epoch": 1.0,
153
- "eval_accuracy": 0.05899061963271238,
154
- "eval_loss": 2.435546875,
155
- "eval_runtime": 1.3171,
156
- "eval_samples_per_second": 22.018,
157
- "eval_steps_per_second": 1.518,
158
  "step": 10
159
  },
160
  {
161
  "epoch": 1.1,
162
- "learning_rate": 2.7664918882530227e-05,
163
- "loss": 2.1885,
164
  "step": 11
165
  },
166
  {
167
  "epoch": 1.1,
168
  "eval_accuracy": 0.05866032500990884,
169
  "eval_loss": 2.423828125,
170
- "eval_runtime": 1.2181,
171
- "eval_samples_per_second": 23.807,
172
- "eval_steps_per_second": 1.642,
173
  "step": 11
174
  },
175
  {
176
  "epoch": 1.2,
177
- "learning_rate": 2.7135254915624213e-05,
178
- "loss": 1.981,
179
  "step": 12
180
  },
181
  {
182
  "epoch": 1.2,
183
- "eval_accuracy": 0.05872638393446954,
184
  "eval_loss": 2.421875,
185
- "eval_runtime": 1.6039,
186
- "eval_samples_per_second": 18.081,
187
- "eval_steps_per_second": 1.247,
188
  "step": 12
189
  },
190
  {
191
  "epoch": 1.3,
192
- "learning_rate": 2.655769864163684e-05,
193
- "loss": 1.8673,
194
  "step": 13
195
  },
196
  {
197
  "epoch": 1.3,
198
- "eval_accuracy": 0.0591227374818338,
199
- "eval_loss": 2.41796875,
200
- "eval_runtime": 1.0123,
201
- "eval_samples_per_second": 28.649,
202
- "eval_steps_per_second": 1.976,
203
  "step": 13
204
  },
205
  {
206
  "epoch": 1.4,
207
- "learning_rate": 2.5934529411321174e-05,
208
- "loss": 1.7321,
209
  "step": 14
210
  },
211
  {
212
  "epoch": 1.4,
213
- "eval_accuracy": 0.05958514995375875,
214
- "eval_loss": 2.41796875,
215
- "eval_runtime": 1.0152,
216
- "eval_samples_per_second": 28.567,
217
- "eval_steps_per_second": 1.97,
218
  "step": 14
219
  },
220
  {
221
  "epoch": 1.5,
222
- "learning_rate": 2.5268206588930332e-05,
223
- "loss": 1.6355,
224
  "step": 15
225
  },
226
  {
227
  "epoch": 1.5,
228
- "eval_accuracy": 0.060113621350244416,
229
- "eval_loss": 2.41796875,
230
- "eval_runtime": 1.4219,
231
- "eval_samples_per_second": 20.396,
232
- "eval_steps_per_second": 1.407,
233
  "step": 15
234
  },
235
  {
236
  "epoch": 1.6,
237
- "learning_rate": 2.4561359846230346e-05,
238
- "loss": 1.7758,
239
  "step": 16
240
  },
241
  {
242
  "epoch": 1.6,
243
- "eval_accuracy": 0.06017968027480513,
244
- "eval_loss": 2.419921875,
245
- "eval_runtime": 1.618,
246
- "eval_samples_per_second": 17.923,
247
- "eval_steps_per_second": 1.236,
248
  "step": 16
249
  },
250
  {
251
  "epoch": 1.7,
252
- "learning_rate": 2.3816778784387097e-05,
253
- "loss": 2.0162,
254
  "step": 17
255
  },
256
  {
257
  "epoch": 1.7,
258
- "eval_accuracy": 0.06050997489760867,
259
- "eval_loss": 2.408203125,
260
- "eval_runtime": 1.1225,
261
- "eval_samples_per_second": 25.835,
262
- "eval_steps_per_second": 1.782,
263
  "step": 17
264
  },
265
  {
266
  "epoch": 1.8,
267
- "learning_rate": 2.303740192468495e-05,
268
- "loss": 1.8037,
269
  "step": 18
270
  },
271
  {
272
  "epoch": 1.8,
273
- "eval_accuracy": 0.06050997489760867,
274
- "eval_loss": 2.396484375,
275
- "eval_runtime": 0.912,
276
- "eval_samples_per_second": 31.8,
277
- "eval_steps_per_second": 2.193,
278
  "step": 18
279
  },
280
  {
281
  "epoch": 1.9,
282
- "learning_rate": 2.222630511152573e-05,
283
- "loss": 1.7204,
284
  "step": 19
285
  },
286
  {
287
  "epoch": 1.9,
288
- "eval_accuracy": 0.0607742105958515,
289
- "eval_loss": 2.375,
290
- "eval_runtime": 1.2202,
291
- "eval_samples_per_second": 23.767,
292
- "eval_steps_per_second": 1.639,
293
  "step": 19
294
  },
295
  {
296
  "epoch": 2.0,
297
- "learning_rate": 2.138668937347609e-05,
298
- "loss": 1.7831,
299
  "step": 20
300
  },
301
  {
302
  "epoch": 2.0,
303
- "eval_accuracy": 0.060906328444972915,
304
- "eval_loss": 2.357421875,
305
- "eval_runtime": 1.0082,
306
- "eval_samples_per_second": 28.765,
307
- "eval_steps_per_second": 1.984,
308
  "step": 20
309
  },
310
  {
311
  "epoch": 2.1,
312
- "learning_rate": 2.052186829027017e-05,
313
- "loss": 1.299,
314
  "step": 21
315
  },
316
  {
317
  "epoch": 2.1,
318
- "eval_accuracy": 0.06163297661514071,
319
- "eval_loss": 2.349609375,
320
- "eval_runtime": 1.2114,
321
- "eval_samples_per_second": 23.94,
322
- "eval_steps_per_second": 1.651,
323
  "step": 21
324
  },
325
  {
326
  "epoch": 2.2,
327
- "learning_rate": 1.963525491562421e-05,
328
- "loss": 1.4463,
329
  "step": 22
330
  },
331
  {
332
  "epoch": 2.2,
333
- "eval_accuracy": 0.06196327123794425,
334
- "eval_loss": 2.349609375,
335
- "eval_runtime": 1.5179,
336
- "eval_samples_per_second": 19.105,
337
- "eval_steps_per_second": 1.318,
338
  "step": 22
339
  },
340
  {
341
  "epoch": 2.3,
342
- "learning_rate": 1.8730348307472828e-05,
343
- "loss": 1.1733,
344
  "step": 23
345
  },
346
  {
347
  "epoch": 2.3,
348
- "eval_accuracy": 0.061699035539701415,
349
- "eval_loss": 2.365234375,
350
- "eval_runtime": 1.3077,
351
- "eval_samples_per_second": 22.176,
352
- "eval_steps_per_second": 1.529,
353
  "step": 23
354
  },
355
  {
356
  "epoch": 2.4,
357
- "learning_rate": 1.781071971878587e-05,
358
- "loss": 1.1142,
359
  "step": 24
360
  },
361
  {
362
  "epoch": 2.4,
363
- "eval_accuracy": 0.06255780155899061,
364
- "eval_loss": 2.388671875,
365
- "eval_runtime": 1.1042,
366
- "eval_samples_per_second": 26.264,
367
- "eval_steps_per_second": 1.811,
368
  "step": 24
369
  },
370
  {
371
  "epoch": 2.5,
372
- "learning_rate": 1.6879998503464565e-05,
373
- "loss": 1.3107,
374
  "step": 25
375
  },
376
  {
377
  "epoch": 2.5,
378
- "eval_accuracy": 0.06268991940811204,
379
- "eval_loss": 2.421875,
380
- "eval_runtime": 0.9167,
381
- "eval_samples_per_second": 31.634,
382
- "eval_steps_per_second": 2.182,
383
  "step": 25
384
  },
385
  {
386
  "epoch": 2.6,
387
- "learning_rate": 1.5941857792939702e-05,
388
- "loss": 1.011,
389
  "step": 26
390
  },
391
  {
392
  "epoch": 2.6,
393
- "eval_accuracy": 0.06216144801162637,
394
- "eval_loss": 2.455078125,
395
- "eval_runtime": 0.9207,
396
- "eval_samples_per_second": 31.497,
397
- "eval_steps_per_second": 2.172,
398
  "step": 26
399
  },
400
  {
401
  "epoch": 2.7,
402
- "learning_rate": 1.5e-05,
403
- "loss": 1.3403,
404
  "step": 27
405
  },
406
  {
407
  "epoch": 2.7,
408
- "eval_accuracy": 0.061566917690579995,
409
- "eval_loss": 2.4765625,
410
- "eval_runtime": 1.5266,
411
- "eval_samples_per_second": 18.997,
412
- "eval_steps_per_second": 1.31,
413
  "step": 27
414
  },
415
  {
416
  "epoch": 2.8,
417
- "learning_rate": 1.5e-05,
418
- "loss": 1.3108,
419
  "step": 28
420
  },
421
  {
422
  "epoch": 2.8,
423
- "eval_accuracy": 0.061566917690579995,
424
- "eval_loss": 2.4765625,
425
- "eval_runtime": 1.252,
426
- "eval_samples_per_second": 23.163,
427
- "eval_steps_per_second": 1.597,
428
  "step": 28
429
  },
430
  {
431
  "epoch": 2.9,
432
- "learning_rate": 1.40581422070603e-05,
433
- "loss": 1.0076,
434
  "step": 29
435
  },
436
  {
437
  "epoch": 2.9,
438
- "eval_accuracy": 0.06189721231338354,
439
- "eval_loss": 2.4609375,
440
- "eval_runtime": 0.9112,
441
- "eval_samples_per_second": 31.825,
442
- "eval_steps_per_second": 2.195,
443
  "step": 29
444
  },
445
  {
446
  "epoch": 3.0,
447
- "learning_rate": 1.3120001496535434e-05,
448
- "loss": 0.8656,
449
  "step": 30
450
  },
451
  {
452
  "epoch": 3.0,
453
- "eval_accuracy": 0.062359624785308494,
454
- "eval_loss": 2.451171875,
455
- "eval_runtime": 1.5156,
456
- "eval_samples_per_second": 19.134,
457
- "eval_steps_per_second": 1.32,
458
  "step": 30
459
  },
460
  {
461
- "epoch": 3.1,
462
- "learning_rate": 1.2189280281214128e-05,
463
- "loss": 0.6635,
464
- "step": 31
465
- },
466
- {
467
- "epoch": 3.1,
468
- "eval_accuracy": 0.06282203725723345,
469
- "eval_loss": 2.451171875,
470
- "eval_runtime": 1.313,
471
- "eval_samples_per_second": 22.087,
472
- "eval_steps_per_second": 1.523,
473
- "step": 31
474
- },
475
- {
476
- "epoch": 3.2,
477
- "learning_rate": 1.1269651692527181e-05,
478
- "loss": 0.9996,
479
- "step": 32
480
- },
481
- {
482
- "epoch": 3.2,
483
- "eval_accuracy": 0.06348262650284053,
484
- "eval_loss": 2.443359375,
485
- "eval_runtime": 1.01,
486
- "eval_samples_per_second": 28.711,
487
- "eval_steps_per_second": 1.98,
488
- "step": 32
489
- },
490
- {
491
- "epoch": 3.3,
492
- "learning_rate": 1.036474508437579e-05,
493
- "loss": 0.9029,
494
- "step": 33
495
- },
496
- {
497
- "epoch": 3.3,
498
- "eval_accuracy": 0.06368080327652266,
499
- "eval_loss": 2.447265625,
500
- "eval_runtime": 1.4214,
501
- "eval_samples_per_second": 20.402,
502
- "eval_steps_per_second": 1.407,
503
- "step": 33
504
- },
505
- {
506
- "epoch": 3.4,
507
- "learning_rate": 9.478131709729831e-06,
508
- "loss": 0.8329,
509
- "step": 34
510
- },
511
- {
512
- "epoch": 3.4,
513
- "eval_accuracy": 0.06374686220108336,
514
- "eval_loss": 2.455078125,
515
- "eval_runtime": 0.9136,
516
- "eval_samples_per_second": 31.742,
517
- "eval_steps_per_second": 2.189,
518
- "step": 34
519
- },
520
- {
521
- "epoch": 3.5,
522
- "learning_rate": 8.61331062652391e-06,
523
- "loss": 0.8012,
524
- "step": 35
525
- },
526
- {
527
- "epoch": 3.5,
528
- "eval_accuracy": 0.06387898005020479,
529
- "eval_loss": 2.46484375,
530
- "eval_runtime": 1.6062,
531
- "eval_samples_per_second": 18.055,
532
- "eval_steps_per_second": 1.245,
533
- "step": 35
534
- },
535
- {
536
- "epoch": 3.6,
537
- "learning_rate": 7.773694888474268e-06,
538
- "loss": 0.5814,
539
- "step": 36
540
- },
541
- {
542
- "epoch": 3.6,
543
- "eval_accuracy": 0.0640110978993262,
544
- "eval_loss": 2.490234375,
545
- "eval_runtime": 1.6209,
546
- "eval_samples_per_second": 17.891,
547
- "eval_steps_per_second": 1.234,
548
- "step": 36
549
- },
550
- {
551
- "epoch": 3.7,
552
- "learning_rate": 6.962598075315047e-06,
553
- "loss": 1.0688,
554
- "step": 37
555
- },
556
- {
557
- "epoch": 3.7,
558
- "eval_accuracy": 0.06381292112564407,
559
- "eval_loss": 2.509765625,
560
- "eval_runtime": 1.2235,
561
- "eval_samples_per_second": 23.703,
562
- "eval_steps_per_second": 1.635,
563
- "step": 37
564
- },
565
- {
566
- "epoch": 3.8,
567
- "learning_rate": 6.1832212156129045e-06,
568
- "loss": 0.8688,
569
- "step": 38
570
- },
571
- {
572
- "epoch": 3.8,
573
- "eval_accuracy": 0.06348262650284053,
574
- "eval_loss": 2.517578125,
575
- "eval_runtime": 1.5088,
576
- "eval_samples_per_second": 19.221,
577
- "eval_steps_per_second": 1.326,
578
- "step": 38
579
- },
580
- {
581
- "epoch": 3.9,
582
- "learning_rate": 5.438640153769654e-06,
583
- "loss": 0.7341,
584
- "step": 39
585
- },
586
- {
587
- "epoch": 3.9,
588
- "eval_accuracy": 0.06381292112564407,
589
- "eval_loss": 2.51953125,
590
- "eval_runtime": 1.0209,
591
- "eval_samples_per_second": 28.406,
592
- "eval_steps_per_second": 1.959,
593
- "step": 39
594
- },
595
- {
596
- "epoch": 4.0,
597
- "learning_rate": 4.731793411069669e-06,
598
- "loss": 0.7102,
599
- "step": 40
600
- },
601
- {
602
- "epoch": 4.0,
603
- "eval_accuracy": 0.0640110978993262,
604
- "eval_loss": 2.51953125,
605
- "eval_runtime": 1.3112,
606
- "eval_samples_per_second": 22.116,
607
- "eval_steps_per_second": 1.525,
608
- "step": 40
609
- },
610
- {
611
- "epoch": 4.1,
612
- "learning_rate": 4.06547058867883e-06,
613
- "loss": 0.7079,
614
- "step": 41
615
- },
616
- {
617
- "epoch": 4.1,
618
- "eval_accuracy": 0.06414321574844761,
619
- "eval_loss": 2.51953125,
620
- "eval_runtime": 1.0046,
621
- "eval_samples_per_second": 28.868,
622
- "eval_steps_per_second": 1.991,
623
- "step": 41
624
- },
625
- {
626
- "epoch": 4.2,
627
- "learning_rate": 3.442301358363163e-06,
628
- "loss": 0.7656,
629
- "step": 42
630
- },
631
- {
632
- "epoch": 4.2,
633
- "eval_accuracy": 0.06427533359756903,
634
- "eval_loss": 2.51953125,
635
- "eval_runtime": 1.3174,
636
- "eval_samples_per_second": 22.012,
637
- "eval_steps_per_second": 1.518,
638
- "step": 42
639
- },
640
- {
641
- "epoch": 4.3,
642
- "learning_rate": 2.86474508437579e-06,
643
- "loss": 0.6377,
644
- "step": 43
645
- },
646
- {
647
- "epoch": 4.3,
648
- "eval_accuracy": 0.06447351037125115,
649
- "eval_loss": 2.52734375,
650
- "eval_runtime": 1.1043,
651
- "eval_samples_per_second": 26.261,
652
- "eval_steps_per_second": 1.811,
653
- "step": 43
654
- },
655
- {
656
- "epoch": 4.4,
657
- "learning_rate": 2.335081117469777e-06,
658
- "loss": 0.5898,
659
- "step": 44
660
- },
661
- {
662
- "epoch": 4.4,
663
- "eval_accuracy": 0.06414321574844761,
664
- "eval_loss": 2.53515625,
665
- "eval_runtime": 1.0226,
666
- "eval_samples_per_second": 28.359,
667
- "eval_steps_per_second": 1.956,
668
- "step": 44
669
- },
670
- {
671
- "epoch": 4.5,
672
- "learning_rate": 1.8553997993420495e-06,
673
- "loss": 0.5958,
674
- "step": 45
675
- },
676
- {
677
- "epoch": 4.5,
678
- "eval_accuracy": 0.06407715682388691,
679
- "eval_loss": 2.54296875,
680
- "eval_runtime": 0.91,
681
- "eval_samples_per_second": 31.867,
682
- "eval_steps_per_second": 2.198,
683
- "step": 45
684
- },
685
- {
686
- "epoch": 4.6,
687
- "learning_rate": 1.4275942130097097e-06,
688
- "loss": 0.7048,
689
- "step": 46
690
- },
691
- {
692
- "epoch": 4.6,
693
- "eval_accuracy": 0.0640110978993262,
694
- "eval_loss": 2.548828125,
695
- "eval_runtime": 1.2089,
696
- "eval_samples_per_second": 23.989,
697
- "eval_steps_per_second": 1.654,
698
- "step": 46
699
- },
700
- {
701
- "epoch": 4.7,
702
- "learning_rate": 1.0533527116762298e-06,
703
- "loss": 0.5435,
704
- "step": 47
705
- },
706
- {
707
- "epoch": 4.7,
708
- "eval_accuracy": 0.06414321574844761,
709
- "eval_loss": 2.552734375,
710
- "eval_runtime": 1.3311,
711
- "eval_samples_per_second": 21.786,
712
- "eval_steps_per_second": 1.502,
713
- "step": 47
714
- },
715
- {
716
- "epoch": 4.8,
717
- "learning_rate": 7.341522555726971e-07,
718
- "loss": 0.4769,
719
- "step": 48
720
- },
721
- {
722
- "epoch": 4.8,
723
- "eval_accuracy": 0.0640110978993262,
724
- "eval_loss": 2.552734375,
725
- "eval_runtime": 1.0089,
726
- "eval_samples_per_second": 28.743,
727
- "eval_steps_per_second": 1.982,
728
- "step": 48
729
- },
730
- {
731
- "epoch": 4.9,
732
- "learning_rate": 4.7125258307053385e-07,
733
- "loss": 0.6583,
734
- "step": 49
735
- },
736
- {
737
- "epoch": 4.9,
738
- "eval_accuracy": 0.06420927467300833,
739
- "eval_loss": 2.5546875,
740
- "eval_runtime": 1.0099,
741
- "eval_samples_per_second": 28.715,
742
- "eval_steps_per_second": 1.98,
743
- "step": 49
744
- },
745
- {
746
- "epoch": 5.0,
747
- "learning_rate": 2.6569123906967083e-07,
748
- "loss": 0.7168,
749
- "step": 50
750
- },
751
- {
752
- "epoch": 5.0,
753
- "eval_accuracy": 0.06414321574844761,
754
- "eval_loss": 2.5546875,
755
- "eval_runtime": 1.5134,
756
- "eval_samples_per_second": 19.163,
757
- "eval_steps_per_second": 1.322,
758
- "step": 50
759
- },
760
- {
761
- "epoch": 5.0,
762
- "step": 50,
763
- "total_flos": 2477483753472.0,
764
- "train_loss": 1.4392181396484376,
765
- "train_runtime": 183.8573,
766
- "train_samples_per_second": 4.242,
767
- "train_steps_per_second": 0.272
768
  }
769
  ],
770
- "max_steps": 50,
771
- "num_train_epochs": 5,
772
- "total_flos": 2477483753472.0,
773
  "trial_name": null,
774
  "trial_params": null
775
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "global_step": 30,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
17
  "epoch": 0.1,
18
  "eval_accuracy": 0.05291319857312723,
19
  "eval_loss": 2.6484375,
20
+ "eval_runtime": 1.1592,
21
+ "eval_samples_per_second": 25.016,
22
+ "eval_steps_per_second": 1.725,
23
  "step": 1
24
  },
25
  {
 
32
  "epoch": 0.2,
33
  "eval_accuracy": 0.05291319857312723,
34
  "eval_loss": 2.6484375,
35
+ "eval_runtime": 1.6595,
36
+ "eval_samples_per_second": 17.475,
37
+ "eval_steps_per_second": 1.205,
38
  "step": 2
39
  },
40
  {
41
  "epoch": 0.3,
42
+ "learning_rate": 2.99178284305241e-05,
43
  "loss": 2.6365,
44
  "step": 3
45
  },
 
47
  "epoch": 0.3,
48
  "eval_accuracy": 0.05601796802748051,
49
  "eval_loss": 2.55078125,
50
+ "eval_runtime": 1.2012,
51
+ "eval_samples_per_second": 24.142,
52
+ "eval_steps_per_second": 1.665,
53
  "step": 3
54
  },
55
  {
56
  "epoch": 0.4,
57
+ "learning_rate": 2.9672214011007087e-05,
58
  "loss": 2.5088,
59
  "step": 4
60
  },
61
  {
62
  "epoch": 0.4,
63
+ "eval_accuracy": 0.05615008587660193,
64
  "eval_loss": 2.533203125,
65
+ "eval_runtime": 0.9032,
66
+ "eval_samples_per_second": 32.109,
67
+ "eval_steps_per_second": 2.214,
68
  "step": 4
69
  },
70
  {
71
  "epoch": 0.5,
72
+ "learning_rate": 2.9265847744427305e-05,
73
+ "loss": 2.7307,
74
  "step": 5
75
  },
76
  {
77
  "epoch": 0.5,
78
+ "eval_accuracy": 0.05654643942396618,
79
  "eval_loss": 2.517578125,
80
+ "eval_runtime": 1.6122,
81
+ "eval_samples_per_second": 17.988,
82
+ "eval_steps_per_second": 1.241,
83
  "step": 5
84
  },
85
  {
86
  "epoch": 0.6,
87
+ "learning_rate": 2.8703181864639013e-05,
88
+ "loss": 2.969,
89
  "step": 6
90
  },
91
  {
92
  "epoch": 0.6,
93
+ "eval_accuracy": 0.05714096974501255,
94
  "eval_loss": 2.494140625,
95
+ "eval_runtime": 1.5031,
96
+ "eval_samples_per_second": 19.294,
97
+ "eval_steps_per_second": 1.331,
98
  "step": 6
99
  },
100
  {
101
  "epoch": 0.7,
102
+ "learning_rate": 2.7990381056766583e-05,
103
+ "loss": 2.7283,
104
  "step": 7
105
  },
106
  {
107
  "epoch": 0.7,
108
+ "eval_accuracy": 0.056744616197648305,
109
  "eval_loss": 2.48828125,
110
+ "eval_runtime": 0.999,
111
+ "eval_samples_per_second": 29.03,
112
+ "eval_steps_per_second": 2.002,
113
  "step": 7
114
  },
115
  {
116
  "epoch": 0.8,
117
+ "learning_rate": 2.7135254915624213e-05,
118
+ "loss": 2.6157,
119
  "step": 8
120
  },
121
  {
122
  "epoch": 0.8,
123
  "eval_accuracy": 0.05780155899061963,
124
+ "eval_loss": 2.4765625,
125
+ "eval_runtime": 0.9999,
126
+ "eval_samples_per_second": 29.003,
127
+ "eval_steps_per_second": 2.0,
128
  "step": 8
129
  },
130
  {
131
  "epoch": 0.9,
132
+ "learning_rate": 2.6147172382160913e-05,
133
+ "loss": 2.6406,
134
  "step": 9
135
  },
136
  {
137
  "epoch": 0.9,
138
+ "eval_accuracy": 0.0583300303871053,
139
  "eval_loss": 2.458984375,
140
+ "eval_runtime": 1.206,
141
+ "eval_samples_per_second": 24.046,
142
+ "eval_steps_per_second": 1.658,
143
  "step": 9
144
  },
145
  {
146
  "epoch": 1.0,
147
+ "learning_rate": 2.5036959095382875e-05,
148
+ "loss": 2.5701,
149
  "step": 10
150
  },
151
  {
152
  "epoch": 1.0,
153
+ "eval_accuracy": 0.05872638393446954,
154
+ "eval_loss": 2.4375,
155
+ "eval_runtime": 1.0035,
156
+ "eval_samples_per_second": 28.898,
157
+ "eval_steps_per_second": 1.993,
158
  "step": 10
159
  },
160
  {
161
  "epoch": 1.1,
162
+ "learning_rate": 2.3816778784387097e-05,
163
+ "loss": 2.2017,
164
  "step": 11
165
  },
166
  {
167
  "epoch": 1.1,
168
  "eval_accuracy": 0.05866032500990884,
169
  "eval_loss": 2.423828125,
170
+ "eval_runtime": 1.1973,
171
+ "eval_samples_per_second": 24.222,
172
+ "eval_steps_per_second": 1.67,
173
  "step": 11
174
  },
175
  {
176
  "epoch": 1.2,
177
+ "learning_rate": 2.25e-05,
178
+ "loss": 2.0039,
179
  "step": 12
180
  },
181
  {
182
  "epoch": 1.2,
183
+ "eval_accuracy": 0.05859426608534813,
184
  "eval_loss": 2.421875,
185
+ "eval_runtime": 1.009,
186
+ "eval_samples_per_second": 28.742,
187
+ "eval_steps_per_second": 1.982,
188
  "step": 12
189
  },
190
  {
191
  "epoch": 1.3,
192
+ "learning_rate": 2.1101049646137008e-05,
193
+ "loss": 1.8981,
194
  "step": 13
195
  },
196
  {
197
  "epoch": 1.3,
198
+ "eval_accuracy": 0.058858501783590964,
199
+ "eval_loss": 2.416015625,
200
+ "eval_runtime": 1.2002,
201
+ "eval_samples_per_second": 24.163,
202
+ "eval_steps_per_second": 1.666,
203
  "step": 13
204
  },
205
  {
206
  "epoch": 1.4,
207
+ "learning_rate": 1.963525491562421e-05,
208
+ "loss": 1.7683,
209
  "step": 14
210
  },
211
  {
212
  "epoch": 1.4,
213
+ "eval_accuracy": 0.059453032104637336,
214
+ "eval_loss": 2.416015625,
215
+ "eval_runtime": 1.0984,
216
+ "eval_samples_per_second": 26.402,
217
+ "eval_steps_per_second": 1.821,
218
  "step": 14
219
  },
220
  {
221
  "epoch": 1.5,
222
+ "learning_rate": 1.8118675362266388e-05,
223
+ "loss": 1.6746,
224
  "step": 15
225
  },
226
  {
227
  "epoch": 1.5,
228
+ "eval_accuracy": 0.059981503501123,
229
+ "eval_loss": 2.412109375,
230
+ "eval_runtime": 0.8904,
231
+ "eval_samples_per_second": 32.57,
232
+ "eval_steps_per_second": 2.246,
233
  "step": 15
234
  },
235
  {
236
  "epoch": 1.6,
237
+ "learning_rate": 1.6567926949014805e-05,
238
+ "loss": 1.8051,
239
  "step": 16
240
  },
241
  {
242
  "epoch": 1.6,
243
+ "eval_accuracy": 0.06004756242568371,
244
+ "eval_loss": 2.41015625,
245
+ "eval_runtime": 1.1118,
246
+ "eval_samples_per_second": 26.085,
247
+ "eval_steps_per_second": 1.799,
248
  "step": 16
249
  },
250
  {
251
  "epoch": 1.7,
252
+ "learning_rate": 1.5e-05,
253
+ "loss": 2.0457,
254
  "step": 17
255
  },
256
  {
257
  "epoch": 1.7,
258
+ "eval_accuracy": 0.06017968027480513,
259
+ "eval_loss": 2.404296875,
260
+ "eval_runtime": 1.1017,
261
+ "eval_samples_per_second": 26.322,
262
+ "eval_steps_per_second": 1.815,
263
  "step": 17
264
  },
265
  {
266
  "epoch": 1.8,
267
+ "learning_rate": 1.3432073050985201e-05,
268
+ "loss": 1.8257,
269
  "step": 18
270
  },
271
  {
272
  "epoch": 1.8,
273
+ "eval_accuracy": 0.060576033822169376,
274
+ "eval_loss": 2.400390625,
275
+ "eval_runtime": 1.5124,
276
+ "eval_samples_per_second": 19.175,
277
+ "eval_steps_per_second": 1.322,
278
  "step": 18
279
  },
280
  {
281
  "epoch": 1.9,
282
+ "learning_rate": 1.1881324637733613e-05,
283
+ "loss": 1.744,
284
  "step": 19
285
  },
286
  {
287
  "epoch": 1.9,
288
+ "eval_accuracy": 0.06070815167129079,
289
+ "eval_loss": 2.388671875,
290
+ "eval_runtime": 0.9026,
291
+ "eval_samples_per_second": 32.129,
292
+ "eval_steps_per_second": 2.216,
293
  "step": 19
294
  },
295
  {
296
  "epoch": 2.0,
297
+ "learning_rate": 1.1881324637733613e-05,
298
+ "loss": 1.8232,
299
  "step": 20
300
  },
301
  {
302
  "epoch": 2.0,
303
+ "eval_accuracy": 0.06070815167129079,
304
+ "eval_loss": 2.388671875,
305
+ "eval_runtime": 1.3479,
306
+ "eval_samples_per_second": 21.515,
307
+ "eval_steps_per_second": 1.484,
308
  "step": 20
309
  },
310
  {
311
  "epoch": 2.1,
312
+ "learning_rate": 1.036474508437579e-05,
313
+ "loss": 1.4741,
314
  "step": 21
315
  },
316
  {
317
  "epoch": 2.1,
318
+ "eval_accuracy": 0.06097238736953362,
319
+ "eval_loss": 2.3828125,
320
+ "eval_runtime": 1.1016,
321
+ "eval_samples_per_second": 26.326,
322
+ "eval_steps_per_second": 1.816,
323
  "step": 21
324
  },
325
  {
326
  "epoch": 2.2,
327
+ "learning_rate": 8.898950353863e-06,
328
+ "loss": 1.651,
329
  "step": 22
330
  },
331
  {
332
  "epoch": 2.2,
333
+ "eval_accuracy": 0.06084026952041221,
334
+ "eval_loss": 2.376953125,
335
+ "eval_runtime": 1.0964,
336
+ "eval_samples_per_second": 26.45,
337
+ "eval_steps_per_second": 1.824,
338
  "step": 22
339
  },
340
  {
341
  "epoch": 2.3,
342
+ "learning_rate": 7.500000000000004e-06,
343
+ "loss": 1.3732,
344
  "step": 23
345
  },
346
  {
347
  "epoch": 2.3,
348
+ "eval_accuracy": 0.06097238736953362,
349
+ "eval_loss": 2.373046875,
350
+ "eval_runtime": 1.1007,
351
+ "eval_samples_per_second": 26.348,
352
+ "eval_steps_per_second": 1.817,
353
  "step": 23
354
  },
355
  {
356
  "epoch": 2.4,
357
+ "learning_rate": 6.1832212156129045e-06,
358
+ "loss": 1.3151,
359
  "step": 24
360
  },
361
  {
362
  "epoch": 2.4,
363
+ "eval_accuracy": 0.061038446294094335,
364
+ "eval_loss": 2.373046875,
365
+ "eval_runtime": 1.0081,
366
+ "eval_samples_per_second": 28.767,
367
+ "eval_steps_per_second": 1.984,
368
  "step": 24
369
  },
370
  {
371
  "epoch": 2.5,
372
+ "learning_rate": 4.963040904617131e-06,
373
+ "loss": 1.5302,
374
  "step": 25
375
  },
376
  {
377
  "epoch": 2.5,
378
+ "eval_accuracy": 0.061038446294094335,
379
+ "eval_loss": 2.373046875,
380
+ "eval_runtime": 1.2116,
381
+ "eval_samples_per_second": 23.936,
382
+ "eval_steps_per_second": 1.651,
383
  "step": 25
384
  },
385
  {
386
  "epoch": 2.6,
387
+ "learning_rate": 3.852827617839085e-06,
388
+ "loss": 1.2539,
389
  "step": 26
390
  },
391
  {
392
  "epoch": 2.6,
393
+ "eval_accuracy": 0.06117056414321575,
394
+ "eval_loss": 2.375,
395
+ "eval_runtime": 1.4027,
396
+ "eval_samples_per_second": 20.674,
397
+ "eval_steps_per_second": 1.426,
398
  "step": 26
399
  },
400
  {
401
  "epoch": 2.7,
402
+ "learning_rate": 2.86474508437579e-06,
403
+ "loss": 1.6211,
404
  "step": 27
405
  },
406
  {
407
  "epoch": 2.7,
408
+ "eval_accuracy": 0.061236623067776455,
409
+ "eval_loss": 2.376953125,
410
+ "eval_runtime": 0.9053,
411
+ "eval_samples_per_second": 32.035,
412
+ "eval_steps_per_second": 2.209,
413
  "step": 27
414
  },
415
  {
416
  "epoch": 2.8,
417
+ "learning_rate": 2.0096189432334194e-06,
418
+ "loss": 1.6047,
419
  "step": 28
420
  },
421
  {
422
  "epoch": 2.8,
423
+ "eval_accuracy": 0.06130268199233716,
424
+ "eval_loss": 2.376953125,
425
+ "eval_runtime": 1.0962,
426
+ "eval_samples_per_second": 26.456,
427
+ "eval_steps_per_second": 1.825,
428
  "step": 28
429
  },
430
  {
431
  "epoch": 2.9,
432
+ "learning_rate": 1.2968181353609854e-06,
433
+ "loss": 1.1953,
434
  "step": 29
435
  },
436
  {
437
  "epoch": 2.9,
438
+ "eval_accuracy": 0.06143479984145858,
439
+ "eval_loss": 2.37890625,
440
+ "eval_runtime": 1.52,
441
+ "eval_samples_per_second": 19.079,
442
+ "eval_steps_per_second": 1.316,
443
  "step": 29
444
  },
445
  {
446
  "epoch": 3.0,
447
+ "learning_rate": 7.341522555726971e-07,
448
+ "loss": 1.1621,
449
  "step": 30
450
  },
451
  {
452
  "epoch": 3.0,
453
+ "eval_accuracy": 0.06143479984145858,
454
+ "eval_loss": 2.37890625,
455
+ "eval_runtime": 1.3108,
456
+ "eval_samples_per_second": 22.124,
457
+ "eval_steps_per_second": 1.526,
458
  "step": 30
459
  },
460
  {
461
+ "epoch": 3.0,
462
+ "step": 30,
463
+ "total_flos": 1466265894912.0,
464
+ "train_loss": 1.999542236328125,
465
+ "train_runtime": 244.9371,
466
+ "train_samples_per_second": 1.911,
467
+ "train_steps_per_second": 0.122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  }
469
  ],
470
+ "max_steps": 30,
471
+ "num_train_epochs": 3,
472
+ "total_flos": 1466265894912.0,
473
  "trial_name": null,
474
  "trial_params": null
475
  }