AlekseyKorshuk commited on
Commit
1adcee9
1 Parent(s): 9f2543d

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +129 -414
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "train_loss": 2.2499421772203947,
4
- "train_runtime": 531.0436,
5
  "train_samples": 303,
6
- "train_samples_per_second": 1.141,
7
- "train_steps_per_second": 0.072
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 2.5612600226151314,
4
+ "train_runtime": 437.2347,
5
  "train_samples": 303,
6
+ "train_samples_per_second": 0.693,
7
+ "train_steps_per_second": 0.043
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "train_loss": 2.2499421772203947,
4
- "train_runtime": 531.0436,
5
  "train_samples": 303,
6
- "train_samples_per_second": 1.141,
7
- "train_steps_per_second": 0.072
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 2.5612600226151314,
4
+ "train_runtime": 437.2347,
5
  "train_samples": 303,
6
+ "train_samples_per_second": 0.693,
7
+ "train_steps_per_second": 0.043
8
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
- "global_step": 38,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -17,9 +17,9 @@
17
  "epoch": 0.05,
18
  "eval_accuracy": 0.062219502243982046,
19
  "eval_loss": 2.654296875,
20
- "eval_runtime": 3.4086,
21
- "eval_samples_per_second": 16.722,
22
- "eval_steps_per_second": 1.173,
23
  "step": 1
24
  },
25
  {
@@ -32,14 +32,14 @@
32
  "epoch": 0.11,
33
  "eval_accuracy": 0.062219502243982046,
34
  "eval_loss": 2.654296875,
35
- "eval_runtime": 2.4357,
36
- "eval_samples_per_second": 23.402,
37
- "eval_steps_per_second": 1.642,
38
  "step": 2
39
  },
40
  {
41
  "epoch": 0.16,
42
- "learning_rate": 2.994876739510005e-05,
43
  "loss": 2.6003,
44
  "step": 3
45
  },
@@ -47,549 +47,264 @@
47
  "epoch": 0.16,
48
  "eval_accuracy": 0.06265016546534294,
49
  "eval_loss": 2.6015625,
50
- "eval_runtime": 2.3986,
51
- "eval_samples_per_second": 23.763,
52
- "eval_steps_per_second": 1.668,
53
  "step": 3
54
  },
55
  {
56
  "epoch": 0.21,
57
- "learning_rate": 2.9795419551040836e-05,
58
  "loss": 2.5603,
59
  "step": 4
60
  },
61
  {
62
  "epoch": 0.21,
63
- "eval_accuracy": 0.0627181649213473,
64
  "eval_loss": 2.5703125,
65
- "eval_runtime": 2.5872,
66
- "eval_samples_per_second": 22.032,
67
- "eval_steps_per_second": 1.546,
68
  "step": 4
69
  },
70
  {
71
  "epoch": 0.26,
72
- "learning_rate": 2.9541003989089956e-05,
73
- "loss": 2.6067,
74
  "step": 5
75
  },
76
  {
77
  "epoch": 0.26,
78
- "eval_accuracy": 0.06289949680402557,
79
  "eval_loss": 2.55078125,
80
- "eval_runtime": 2.7942,
81
- "eval_samples_per_second": 20.4,
82
- "eval_steps_per_second": 1.432,
83
  "step": 5
84
  },
85
  {
86
  "epoch": 0.32,
87
- "learning_rate": 2.9187258625509518e-05,
88
  "loss": 2.5439,
89
  "step": 6
90
  },
91
  {
92
  "epoch": 0.32,
93
- "eval_accuracy": 0.0626954984360125,
94
- "eval_loss": 2.546875,
95
- "eval_runtime": 2.8026,
96
- "eval_samples_per_second": 20.338,
97
- "eval_steps_per_second": 1.427,
98
  "step": 6
99
  },
100
  {
101
  "epoch": 0.37,
102
- "learning_rate": 2.873659989982586e-05,
103
- "loss": 2.4459,
104
  "step": 7
105
  },
106
  {
107
  "epoch": 0.37,
108
- "eval_accuracy": 0.06289949680402557,
109
- "eval_loss": 2.548828125,
110
- "eval_runtime": 2.3868,
111
- "eval_samples_per_second": 23.881,
112
- "eval_steps_per_second": 1.676,
113
  "step": 7
114
  },
115
  {
116
  "epoch": 0.42,
117
- "learning_rate": 2.8192106268097336e-05,
118
- "loss": 2.5439,
119
  "step": 8
120
  },
121
  {
122
  "epoch": 0.42,
123
- "eval_accuracy": 0.06278616437735164,
124
- "eval_loss": 2.548828125,
125
- "eval_runtime": 3.0044,
126
- "eval_samples_per_second": 18.972,
127
- "eval_steps_per_second": 1.331,
128
  "step": 8
129
  },
130
  {
131
  "epoch": 0.47,
132
- "learning_rate": 2.7557497173937928e-05,
133
- "loss": 2.6125,
134
  "step": 9
135
  },
136
  {
137
  "epoch": 0.47,
138
- "eval_accuracy": 0.06317149462804297,
139
- "eval_loss": 2.54296875,
140
- "eval_runtime": 2.9895,
141
- "eval_samples_per_second": 19.067,
142
- "eval_steps_per_second": 1.338,
143
  "step": 9
144
  },
145
  {
146
  "epoch": 0.53,
147
- "learning_rate": 2.6837107640945904e-05,
148
- "loss": 2.4583,
149
  "step": 10
150
  },
151
  {
152
  "epoch": 0.53,
153
- "eval_accuracy": 0.06262749898000816,
154
- "eval_loss": 2.5390625,
155
- "eval_runtime": 3.2063,
156
- "eval_samples_per_second": 17.778,
157
- "eval_steps_per_second": 1.248,
158
  "step": 10
159
  },
160
  {
161
  "epoch": 0.58,
162
- "learning_rate": 2.6035858660096975e-05,
163
- "loss": 2.5088,
164
  "step": 11
165
  },
166
  {
167
  "epoch": 0.58,
168
- "eval_accuracy": 0.06287683031869079,
169
- "eval_loss": 2.533203125,
170
- "eval_runtime": 2.5988,
171
- "eval_samples_per_second": 21.933,
172
- "eval_steps_per_second": 1.539,
173
  "step": 11
174
  },
175
  {
176
  "epoch": 0.63,
177
- "learning_rate": 2.5159223574386117e-05,
178
- "loss": 2.6033,
179
  "step": 12
180
  },
181
  {
182
  "epoch": 0.63,
183
- "eval_accuracy": 0.06308082868670384,
184
  "eval_loss": 2.521484375,
185
- "eval_runtime": 2.595,
186
- "eval_samples_per_second": 21.965,
187
- "eval_steps_per_second": 1.541,
188
  "step": 12
189
  },
190
  {
191
  "epoch": 0.68,
192
- "learning_rate": 2.4213190690345018e-05,
193
- "loss": 2.5037,
194
  "step": 13
195
  },
196
  {
197
  "epoch": 0.68,
198
- "eval_accuracy": 0.0632848270547169,
199
- "eval_loss": 2.515625,
200
- "eval_runtime": 3.1996,
201
- "eval_samples_per_second": 17.815,
202
- "eval_steps_per_second": 1.25,
203
  "step": 13
204
  },
205
  {
206
  "epoch": 0.74,
207
- "learning_rate": 2.320422237183641e-05,
208
- "loss": 2.6033,
209
  "step": 14
210
  },
211
  {
212
  "epoch": 0.74,
213
- "eval_accuracy": 0.06339815948139081,
214
- "eval_loss": 2.5078125,
215
- "eval_runtime": 2.4053,
216
- "eval_samples_per_second": 23.698,
217
- "eval_steps_per_second": 1.663,
218
  "step": 14
219
  },
220
  {
221
  "epoch": 0.79,
222
- "learning_rate": 2.2139210895556104e-05,
223
- "loss": 2.6023,
224
  "step": 15
225
  },
226
  {
227
  "epoch": 0.79,
228
- "eval_accuracy": 0.06339815948139081,
229
- "eval_loss": 2.501953125,
230
- "eval_runtime": 2.804,
231
- "eval_samples_per_second": 20.328,
232
- "eval_steps_per_second": 1.427,
233
  "step": 15
234
  },
235
  {
236
  "epoch": 0.84,
237
- "learning_rate": 2.1025431369794546e-05,
238
- "loss": 2.5034,
239
  "step": 16
240
  },
241
  {
242
  "epoch": 0.84,
243
- "eval_accuracy": 0.06333016002538647,
244
- "eval_loss": 2.49609375,
245
- "eval_runtime": 2.1988,
246
- "eval_samples_per_second": 25.923,
247
- "eval_steps_per_second": 1.819,
248
  "step": 16
249
  },
250
  {
251
  "epoch": 0.89,
252
- "learning_rate": 1.9870492038070255e-05,
253
- "loss": 2.4353,
254
  "step": 17
255
  },
256
  {
257
  "epoch": 0.89,
258
- "eval_accuracy": 0.0631488281427082,
259
- "eval_loss": 2.490234375,
260
- "eval_runtime": 2.5999,
261
- "eval_samples_per_second": 21.924,
262
- "eval_steps_per_second": 1.538,
263
  "step": 17
264
  },
265
  {
266
  "epoch": 0.95,
267
- "learning_rate": 1.8682282307111988e-05,
268
- "loss": 2.6262,
269
  "step": 18
270
  },
271
  {
272
  "epoch": 0.95,
273
- "eval_accuracy": 0.06333016002538647,
274
- "eval_loss": 2.486328125,
275
- "eval_runtime": 2.9949,
276
- "eval_samples_per_second": 19.032,
277
- "eval_steps_per_second": 1.336,
278
  "step": 18
279
  },
280
  {
281
  "epoch": 1.0,
282
- "learning_rate": 1.746891885421101e-05,
283
- "loss": 2.5613,
284
  "step": 19
285
  },
286
  {
287
  "epoch": 1.0,
288
- "eval_accuracy": 0.0631261616573734,
289
- "eval_loss": 2.484375,
290
- "eval_runtime": 3.2109,
291
- "eval_samples_per_second": 17.752,
292
- "eval_steps_per_second": 1.246,
293
- "step": 19
294
- },
295
- {
296
- "epoch": 1.05,
297
- "learning_rate": 1.623869018208499e-05,
298
- "loss": 2.115,
299
- "step": 20
300
- },
301
- {
302
- "epoch": 1.05,
303
- "eval_accuracy": 0.06317149462804297,
304
- "eval_loss": 2.48046875,
305
- "eval_runtime": 3.1997,
306
- "eval_samples_per_second": 17.814,
307
- "eval_steps_per_second": 1.25,
308
- "step": 20
309
- },
310
- {
311
- "epoch": 1.11,
312
- "learning_rate": 1.5e-05,
313
- "loss": 2.0885,
314
- "step": 21
315
- },
316
- {
317
- "epoch": 1.11,
318
- "eval_accuracy": 0.06351149190806474,
319
- "eval_loss": 2.48046875,
320
- "eval_runtime": 2.7957,
321
- "eval_samples_per_second": 20.388,
322
- "eval_steps_per_second": 1.431,
323
- "step": 21
324
- },
325
- {
326
- "epoch": 1.16,
327
- "learning_rate": 1.5e-05,
328
- "loss": 1.9777,
329
- "step": 22
330
- },
331
- {
332
- "epoch": 1.16,
333
- "eval_accuracy": 0.06351149190806474,
334
- "eval_loss": 2.48046875,
335
- "eval_runtime": 2.6375,
336
- "eval_samples_per_second": 21.612,
337
- "eval_steps_per_second": 1.517,
338
- "step": 22
339
- },
340
- {
341
- "epoch": 1.21,
342
- "learning_rate": 1.3761309817915017e-05,
343
- "loss": 2.1053,
344
- "step": 23
345
- },
346
- {
347
- "epoch": 1.21,
348
- "eval_accuracy": 0.06335282651072124,
349
- "eval_loss": 2.48046875,
350
- "eval_runtime": 2.3919,
351
- "eval_samples_per_second": 23.831,
352
- "eval_steps_per_second": 1.672,
353
- "step": 23
354
- },
355
- {
356
- "epoch": 1.26,
357
- "learning_rate": 1.2531081145788989e-05,
358
- "loss": 1.9324,
359
- "step": 24
360
- },
361
- {
362
- "epoch": 1.26,
363
- "eval_accuracy": 0.06364749082007344,
364
- "eval_loss": 2.482421875,
365
- "eval_runtime": 2.7945,
366
- "eval_samples_per_second": 20.397,
367
- "eval_steps_per_second": 1.431,
368
- "step": 24
369
- },
370
- {
371
- "epoch": 1.32,
372
- "learning_rate": 1.1317717692888014e-05,
373
- "loss": 1.9122,
374
- "step": 25
375
- },
376
- {
377
- "epoch": 1.32,
378
- "eval_accuracy": 0.06369282379074301,
379
- "eval_loss": 2.48828125,
380
- "eval_runtime": 2.7869,
381
- "eval_samples_per_second": 20.453,
382
- "eval_steps_per_second": 1.435,
383
- "step": 25
384
- },
385
- {
386
- "epoch": 1.37,
387
- "learning_rate": 1.0129507961929749e-05,
388
- "loss": 2.1058,
389
- "step": 26
390
- },
391
- {
392
- "epoch": 1.37,
393
- "eval_accuracy": 0.06389682215875607,
394
- "eval_loss": 2.49609375,
395
- "eval_runtime": 2.4103,
396
- "eval_samples_per_second": 23.648,
397
- "eval_steps_per_second": 1.66,
398
- "step": 26
399
- },
400
- {
401
- "epoch": 1.42,
402
- "learning_rate": 8.974568630205462e-06,
403
- "loss": 1.9803,
404
- "step": 27
405
- },
406
- {
407
- "epoch": 1.42,
408
- "eval_accuracy": 0.06389682215875607,
409
- "eval_loss": 2.501953125,
410
- "eval_runtime": 2.7926,
411
- "eval_samples_per_second": 20.411,
412
- "eval_steps_per_second": 1.432,
413
- "step": 27
414
- },
415
- {
416
- "epoch": 1.47,
417
- "learning_rate": 7.860789104443897e-06,
418
- "loss": 1.6486,
419
- "step": 28
420
- },
421
- {
422
- "epoch": 1.47,
423
- "eval_accuracy": 0.06378348973208214,
424
- "eval_loss": 2.509765625,
425
- "eval_runtime": 3.2131,
426
- "eval_samples_per_second": 17.74,
427
- "eval_steps_per_second": 1.245,
428
- "step": 28
429
- },
430
- {
431
- "epoch": 1.53,
432
- "learning_rate": 6.795777628163599e-06,
433
- "loss": 1.8983,
434
- "step": 29
435
- },
436
- {
437
- "epoch": 1.53,
438
- "eval_accuracy": 0.06357949136406908,
439
- "eval_loss": 2.515625,
440
- "eval_runtime": 3.1898,
441
- "eval_samples_per_second": 17.869,
442
- "eval_steps_per_second": 1.254,
443
- "step": 29
444
- },
445
- {
446
- "epoch": 1.58,
447
- "learning_rate": 5.786809309654983e-06,
448
- "loss": 1.8105,
449
- "step": 30
450
- },
451
- {
452
- "epoch": 1.58,
453
- "eval_accuracy": 0.06335282651072124,
454
- "eval_loss": 2.521484375,
455
- "eval_runtime": 3.1961,
456
- "eval_samples_per_second": 17.834,
457
- "eval_steps_per_second": 1.252,
458
- "step": 30
459
- },
460
- {
461
- "epoch": 1.63,
462
- "learning_rate": 4.840776425613887e-06,
463
- "loss": 1.9916,
464
- "step": 31
465
- },
466
- {
467
- "epoch": 1.63,
468
- "eval_accuracy": 0.06344349245206038,
469
- "eval_loss": 2.5234375,
470
- "eval_runtime": 2.7902,
471
- "eval_samples_per_second": 20.428,
472
- "eval_steps_per_second": 1.434,
473
- "step": 31
474
- },
475
- {
476
- "epoch": 1.68,
477
- "learning_rate": 3.964141339903026e-06,
478
- "loss": 1.886,
479
- "step": 32
480
- },
481
- {
482
- "epoch": 1.68,
483
- "eval_accuracy": 0.06346615893739517,
484
- "eval_loss": 2.525390625,
485
- "eval_runtime": 2.6003,
486
- "eval_samples_per_second": 21.921,
487
- "eval_steps_per_second": 1.538,
488
- "step": 32
489
- },
490
- {
491
- "epoch": 1.74,
492
- "learning_rate": 3.162892359054098e-06,
493
- "loss": 1.8013,
494
- "step": 33
495
- },
496
- {
497
- "epoch": 1.74,
498
- "eval_accuracy": 0.06344349245206038,
499
- "eval_loss": 2.52734375,
500
- "eval_runtime": 2.3881,
501
- "eval_samples_per_second": 23.868,
502
- "eval_steps_per_second": 1.675,
503
- "step": 33
504
- },
505
- {
506
- "epoch": 1.79,
507
- "learning_rate": 2.442502826062072e-06,
508
- "loss": 1.8435,
509
- "step": 34
510
- },
511
- {
512
- "epoch": 1.79,
513
- "eval_accuracy": 0.06344349245206038,
514
- "eval_loss": 2.525390625,
515
- "eval_runtime": 3.2047,
516
- "eval_samples_per_second": 17.786,
517
- "eval_steps_per_second": 1.248,
518
- "step": 34
519
- },
520
- {
521
- "epoch": 1.84,
522
- "learning_rate": 1.8078937319026655e-06,
523
- "loss": 2.1229,
524
- "step": 35
525
- },
526
- {
527
- "epoch": 1.84,
528
- "eval_accuracy": 0.06348882542272995,
529
- "eval_loss": 2.5234375,
530
- "eval_runtime": 2.4087,
531
- "eval_samples_per_second": 23.664,
532
- "eval_steps_per_second": 1.661,
533
- "step": 35
534
- },
535
- {
536
- "epoch": 1.89,
537
- "learning_rate": 1.2634001001741375e-06,
538
- "loss": 1.8739,
539
- "step": 36
540
- },
541
- {
542
- "epoch": 1.89,
543
  "eval_accuracy": 0.06357949136406908,
544
- "eval_loss": 2.5234375,
545
- "eval_runtime": 3.196,
546
- "eval_samples_per_second": 17.835,
547
- "eval_steps_per_second": 1.252,
548
- "step": 36
549
- },
550
- {
551
- "epoch": 1.95,
552
- "learning_rate": 8.127413744904805e-07,
553
- "loss": 1.7528,
554
- "step": 37
555
- },
556
- {
557
- "epoch": 1.95,
558
- "eval_accuracy": 0.06369282379074301,
559
- "eval_loss": 2.5234375,
560
- "eval_runtime": 2.1908,
561
- "eval_samples_per_second": 26.018,
562
- "eval_steps_per_second": 1.826,
563
- "step": 37
564
- },
565
- {
566
- "epoch": 2.0,
567
- "learning_rate": 4.589960109100444e-07,
568
- "loss": 1.9462,
569
- "step": 38
570
  },
571
  {
572
- "epoch": 2.0,
573
- "eval_accuracy": 0.06360215784940387,
574
- "eval_loss": 2.521484375,
575
- "eval_runtime": 2.6164,
576
- "eval_samples_per_second": 21.786,
577
- "eval_steps_per_second": 1.529,
578
- "step": 38
579
- },
580
- {
581
- "epoch": 2.0,
582
- "step": 38,
583
- "total_flos": 2058056761344.0,
584
- "train_loss": 2.2499421772203947,
585
- "train_runtime": 531.0436,
586
- "train_samples_per_second": 1.141,
587
- "train_steps_per_second": 0.072
588
  }
589
  ],
590
- "max_steps": 38,
591
- "num_train_epochs": 2,
592
- "total_flos": 2058056761344.0,
593
  "trial_name": null,
594
  "trial_params": null
595
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "global_step": 19,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
17
  "epoch": 0.05,
18
  "eval_accuracy": 0.062219502243982046,
19
  "eval_loss": 2.654296875,
20
+ "eval_runtime": 2.3869,
21
+ "eval_samples_per_second": 23.881,
22
+ "eval_steps_per_second": 1.676,
23
  "step": 1
24
  },
25
  {
 
32
  "epoch": 0.11,
33
  "eval_accuracy": 0.062219502243982046,
34
  "eval_loss": 2.654296875,
35
+ "eval_runtime": 3.2001,
36
+ "eval_samples_per_second": 17.812,
37
+ "eval_steps_per_second": 1.25,
38
  "step": 2
39
  },
40
  {
41
  "epoch": 0.16,
42
+ "learning_rate": 2.9795419551040836e-05,
43
  "loss": 2.6003,
44
  "step": 3
45
  },
 
47
  "epoch": 0.16,
48
  "eval_accuracy": 0.06265016546534294,
49
  "eval_loss": 2.6015625,
50
+ "eval_runtime": 3.1764,
51
+ "eval_samples_per_second": 17.945,
52
+ "eval_steps_per_second": 1.259,
53
  "step": 3
54
  },
55
  {
56
  "epoch": 0.21,
57
+ "learning_rate": 2.9187258625509518e-05,
58
  "loss": 2.5603,
59
  "step": 4
60
  },
61
  {
62
  "epoch": 0.21,
63
+ "eval_accuracy": 0.06260483249467337,
64
  "eval_loss": 2.5703125,
65
+ "eval_runtime": 2.9857,
66
+ "eval_samples_per_second": 19.091,
67
+ "eval_steps_per_second": 1.34,
68
  "step": 4
69
  },
70
  {
71
  "epoch": 0.26,
72
+ "learning_rate": 2.8192106268097336e-05,
73
+ "loss": 2.606,
74
  "step": 5
75
  },
76
  {
77
  "epoch": 0.26,
78
+ "eval_accuracy": 0.06292216328936036,
79
  "eval_loss": 2.55078125,
80
+ "eval_runtime": 2.5694,
81
+ "eval_samples_per_second": 22.184,
82
+ "eval_steps_per_second": 1.557,
83
  "step": 5
84
  },
85
  {
86
  "epoch": 0.32,
87
+ "learning_rate": 2.6837107640945904e-05,
88
  "loss": 2.5439,
89
  "step": 6
90
  },
91
  {
92
  "epoch": 0.32,
93
+ "eval_accuracy": 0.06292216328936036,
94
+ "eval_loss": 2.544921875,
95
+ "eval_runtime": 3.1779,
96
+ "eval_samples_per_second": 17.937,
97
+ "eval_steps_per_second": 1.259,
98
  "step": 6
99
  },
100
  {
101
  "epoch": 0.37,
102
+ "learning_rate": 2.5159223574386117e-05,
103
+ "loss": 2.4449,
104
  "step": 7
105
  },
106
  {
107
  "epoch": 0.37,
108
+ "eval_accuracy": 0.06287683031869079,
109
+ "eval_loss": 2.546875,
110
+ "eval_runtime": 2.3628,
111
+ "eval_samples_per_second": 24.124,
112
+ "eval_steps_per_second": 1.693,
113
  "step": 7
114
  },
115
  {
116
  "epoch": 0.42,
117
+ "learning_rate": 2.320422237183641e-05,
118
+ "loss": 2.5422,
119
  "step": 8
120
  },
121
  {
122
  "epoch": 0.42,
123
+ "eval_accuracy": 0.0629901627453647,
124
+ "eval_loss": 2.546875,
125
+ "eval_runtime": 3.1712,
126
+ "eval_samples_per_second": 17.974,
127
+ "eval_steps_per_second": 1.261,
128
  "step": 8
129
  },
130
  {
131
  "epoch": 0.47,
132
+ "learning_rate": 2.1025431369794546e-05,
133
+ "loss": 2.6101,
134
  "step": 9
135
  },
136
  {
137
  "epoch": 0.47,
138
+ "eval_accuracy": 0.06319416111337776,
139
+ "eval_loss": 2.541015625,
140
+ "eval_runtime": 2.3922,
141
+ "eval_samples_per_second": 23.827,
142
+ "eval_steps_per_second": 1.672,
143
  "step": 9
144
  },
145
  {
146
  "epoch": 0.53,
147
+ "learning_rate": 1.8682282307111988e-05,
148
+ "loss": 2.4482,
149
  "step": 10
150
  },
151
  {
152
  "epoch": 0.53,
153
+ "eval_accuracy": 0.0629901627453647,
154
+ "eval_loss": 2.53515625,
155
+ "eval_runtime": 2.3705,
156
+ "eval_samples_per_second": 24.045,
157
+ "eval_steps_per_second": 1.687,
158
  "step": 10
159
  },
160
  {
161
  "epoch": 0.58,
162
+ "learning_rate": 1.623869018208499e-05,
163
+ "loss": 2.501,
164
  "step": 11
165
  },
166
  {
167
  "epoch": 0.58,
168
+ "eval_accuracy": 0.0631261616573734,
169
+ "eval_loss": 2.529296875,
170
+ "eval_runtime": 2.7727,
171
+ "eval_samples_per_second": 20.558,
172
+ "eval_steps_per_second": 1.443,
173
  "step": 11
174
  },
175
  {
176
  "epoch": 0.63,
177
+ "learning_rate": 1.3761309817915017e-05,
178
+ "loss": 2.5967,
179
  "step": 12
180
  },
181
  {
182
  "epoch": 0.63,
183
+ "eval_accuracy": 0.06337549299605603,
184
  "eval_loss": 2.521484375,
185
+ "eval_runtime": 2.168,
186
+ "eval_samples_per_second": 26.292,
187
+ "eval_steps_per_second": 1.845,
188
  "step": 12
189
  },
190
  {
191
  "epoch": 0.68,
192
+ "learning_rate": 1.1317717692888014e-05,
193
+ "loss": 2.4998,
194
  "step": 13
195
  },
196
  {
197
  "epoch": 0.68,
198
+ "eval_accuracy": 0.06346615893739517,
199
+ "eval_loss": 2.513671875,
200
+ "eval_runtime": 3.1858,
201
+ "eval_samples_per_second": 17.892,
202
+ "eval_steps_per_second": 1.256,
203
  "step": 13
204
  },
205
  {
206
  "epoch": 0.74,
207
+ "learning_rate": 8.974568630205462e-06,
208
+ "loss": 2.5957,
209
  "step": 14
210
  },
211
  {
212
  "epoch": 0.74,
213
+ "eval_accuracy": 0.06364749082007344,
214
+ "eval_loss": 2.509765625,
215
+ "eval_runtime": 2.1678,
216
+ "eval_samples_per_second": 26.294,
217
+ "eval_steps_per_second": 1.845,
218
  "step": 14
219
  },
220
  {
221
  "epoch": 0.79,
222
+ "learning_rate": 6.795777628163599e-06,
223
+ "loss": 2.5967,
224
  "step": 15
225
  },
226
  {
227
  "epoch": 0.79,
228
+ "eval_accuracy": 0.06387415567342128,
229
+ "eval_loss": 2.50390625,
230
+ "eval_runtime": 3.188,
231
+ "eval_samples_per_second": 17.879,
232
+ "eval_steps_per_second": 1.255,
233
  "step": 15
234
  },
235
  {
236
  "epoch": 0.84,
237
+ "learning_rate": 4.840776425613887e-06,
238
+ "loss": 2.5022,
239
  "step": 16
240
  },
241
  {
242
  "epoch": 0.84,
243
+ "eval_accuracy": 0.06373815676141258,
244
+ "eval_loss": 2.5,
245
+ "eval_runtime": 3.178,
246
+ "eval_samples_per_second": 17.936,
247
+ "eval_steps_per_second": 1.259,
248
  "step": 16
249
  },
250
  {
251
  "epoch": 0.89,
252
+ "learning_rate": 3.162892359054098e-06,
253
+ "loss": 2.4314,
254
  "step": 17
255
  },
256
  {
257
  "epoch": 0.89,
258
+ "eval_accuracy": 0.06371549027607779,
259
+ "eval_loss": 2.498046875,
260
+ "eval_runtime": 2.968,
261
+ "eval_samples_per_second": 19.205,
262
+ "eval_steps_per_second": 1.348,
263
  "step": 17
264
  },
265
  {
266
  "epoch": 0.95,
267
+ "learning_rate": 1.8078937319026655e-06,
268
+ "loss": 2.6279,
269
  "step": 18
270
  },
271
  {
272
  "epoch": 0.95,
273
+ "eval_accuracy": 0.06362482433473865,
274
+ "eval_loss": 2.49609375,
275
+ "eval_runtime": 3.1736,
276
+ "eval_samples_per_second": 17.961,
277
+ "eval_steps_per_second": 1.26,
278
  "step": 18
279
  },
280
  {
281
  "epoch": 1.0,
282
+ "learning_rate": 8.127413744904805e-07,
283
+ "loss": 2.571,
284
  "step": 19
285
  },
286
  {
287
  "epoch": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  "eval_accuracy": 0.06357949136406908,
289
+ "eval_loss": 2.49609375,
290
+ "eval_runtime": 2.9674,
291
+ "eval_samples_per_second": 19.208,
292
+ "eval_steps_per_second": 1.348,
293
+ "step": 19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  },
295
  {
296
+ "epoch": 1.0,
297
+ "step": 19,
298
+ "total_flos": 1001216802816.0,
299
+ "train_loss": 2.5612600226151314,
300
+ "train_runtime": 437.2347,
301
+ "train_samples_per_second": 0.693,
302
+ "train_steps_per_second": 0.043
 
 
 
 
 
 
 
 
 
303
  }
304
  ],
305
+ "max_steps": 19,
306
+ "num_train_epochs": 1,
307
+ "total_flos": 1001216802816.0,
308
  "trial_name": null,
309
  "trial_params": null
310
  }