BilelDJ commited on
Commit
4e0edaa
1 Parent(s): 8292467

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +9 -9
  2. eval_results.json +4 -4
  3. train_results.json +5 -5
  4. trainer_state.json +135 -885
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_loss": 1.166153073310852,
4
- "eval_runtime": 24.7105,
5
- "eval_samples_per_second": 2.023,
6
- "eval_steps_per_second": 0.283,
7
- "total_flos": 58050663970068.0,
8
- "train_loss": 1.8778469576500356,
9
- "train_runtime": 2427.903,
10
- "train_samples_per_second": 0.411,
11
- "train_steps_per_second": 0.026
12
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_loss": 0.5955778956413269,
4
+ "eval_runtime": 6.1056,
5
+ "eval_samples_per_second": 1.638,
6
+ "eval_steps_per_second": 0.819,
7
+ "total_flos": 5816699796600.0,
8
+ "train_loss": 1.3652541448495217,
9
+ "train_runtime": 216.8469,
10
+ "train_samples_per_second": 0.461,
11
+ "train_steps_per_second": 0.065
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_loss": 1.166153073310852,
4
- "eval_runtime": 24.7105,
5
- "eval_samples_per_second": 2.023,
6
- "eval_steps_per_second": 0.283
7
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_loss": 0.5955778956413269,
4
+ "eval_runtime": 6.1056,
5
+ "eval_samples_per_second": 1.638,
6
+ "eval_steps_per_second": 0.819
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "total_flos": 58050663970068.0,
4
- "train_loss": 1.8778469576500356,
5
- "train_runtime": 2427.903,
6
- "train_samples_per_second": 0.411,
7
- "train_steps_per_second": 0.026
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "total_flos": 5816699796600.0,
4
+ "train_loss": 1.3652541448495217,
5
+ "train_runtime": 216.8469,
6
+ "train_samples_per_second": 0.461,
7
+ "train_steps_per_second": 0.065
8
  }
trainer_state.json CHANGED
@@ -3,983 +3,233 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 2.0,
5
  "eval_steps": 1.0,
6
- "global_step": 64,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03125,
13
- "grad_norm": 254.00668334960938,
14
- "learning_rate": 4.921875e-05,
15
- "loss": 1.9525,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.03125,
20
- "eval_loss": 1.4612890481948853,
21
- "eval_runtime": 25.9325,
22
- "eval_samples_per_second": 1.928,
23
- "eval_steps_per_second": 0.27,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.0625,
28
- "grad_norm": 210.81161499023438,
29
- "learning_rate": 4.8437500000000005e-05,
30
- "loss": 1.0619,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.0625,
35
- "eval_loss": 3.5714147090911865,
36
- "eval_runtime": 14.3357,
37
- "eval_samples_per_second": 3.488,
38
- "eval_steps_per_second": 0.488,
39
  "step": 2
40
  },
41
  {
42
- "epoch": 0.09375,
43
- "grad_norm": 285.1878967285156,
44
- "learning_rate": 4.765625e-05,
45
- "loss": 3.3708,
46
  "step": 3
47
  },
48
  {
49
- "epoch": 0.09375,
50
- "eval_loss": 2.727013349533081,
51
- "eval_runtime": 19.6621,
52
- "eval_samples_per_second": 2.543,
53
- "eval_steps_per_second": 0.356,
54
  "step": 3
55
  },
56
  {
57
- "epoch": 0.125,
58
- "grad_norm": 218.46783447265625,
59
- "learning_rate": 4.6875e-05,
60
- "loss": 3.8741,
61
  "step": 4
62
  },
63
  {
64
- "epoch": 0.125,
65
- "eval_loss": 3.152341842651367,
66
- "eval_runtime": 14.2606,
67
- "eval_samples_per_second": 3.506,
68
- "eval_steps_per_second": 0.491,
69
  "step": 4
70
  },
71
  {
72
- "epoch": 0.15625,
73
- "grad_norm": 97.45307159423828,
74
- "learning_rate": 4.609375e-05,
75
- "loss": 2.893,
76
  "step": 5
77
  },
78
  {
79
- "epoch": 0.15625,
80
- "eval_loss": 2.370089054107666,
81
- "eval_runtime": 19.323,
82
- "eval_samples_per_second": 2.588,
83
- "eval_steps_per_second": 0.362,
84
  "step": 5
85
  },
86
  {
87
- "epoch": 0.1875,
88
- "grad_norm": 107.88156127929688,
89
- "learning_rate": 4.5312500000000004e-05,
90
- "loss": 3.0239,
91
  "step": 6
92
  },
93
  {
94
- "epoch": 0.1875,
95
- "eval_loss": 2.017429828643799,
96
- "eval_runtime": 14.3416,
97
- "eval_samples_per_second": 3.486,
98
- "eval_steps_per_second": 0.488,
99
  "step": 6
100
  },
101
  {
102
- "epoch": 0.21875,
103
- "grad_norm": 49.51248550415039,
104
- "learning_rate": 4.453125e-05,
105
- "loss": 2.8805,
106
  "step": 7
107
  },
108
  {
109
- "epoch": 0.21875,
110
- "eval_loss": 1.99916410446167,
111
- "eval_runtime": 23.2348,
112
- "eval_samples_per_second": 2.152,
113
- "eval_steps_per_second": 0.301,
114
  "step": 7
115
  },
116
  {
117
- "epoch": 0.25,
118
- "grad_norm": 30.55683708190918,
119
- "learning_rate": 4.375e-05,
120
- "loss": 2.8668,
121
  "step": 8
122
  },
123
  {
124
- "epoch": 0.25,
125
- "eval_loss": 1.956161379814148,
126
- "eval_runtime": 14.3127,
127
- "eval_samples_per_second": 3.493,
128
- "eval_steps_per_second": 0.489,
129
  "step": 8
130
  },
131
  {
132
- "epoch": 0.28125,
133
- "grad_norm": 13.740416526794434,
134
- "learning_rate": 4.2968750000000004e-05,
135
- "loss": 2.7863,
136
  "step": 9
137
  },
138
  {
139
- "epoch": 0.28125,
140
- "eval_loss": 1.9640475511550903,
141
- "eval_runtime": 18.7885,
142
- "eval_samples_per_second": 2.661,
143
- "eval_steps_per_second": 0.373,
144
  "step": 9
145
  },
146
  {
147
- "epoch": 0.3125,
148
- "grad_norm": 13.121553421020508,
149
- "learning_rate": 4.21875e-05,
150
- "loss": 2.7809,
151
  "step": 10
152
  },
153
  {
154
- "epoch": 0.3125,
155
- "eval_loss": 2.0438995361328125,
156
- "eval_runtime": 14.5855,
157
- "eval_samples_per_second": 3.428,
158
- "eval_steps_per_second": 0.48,
159
  "step": 10
160
  },
161
  {
162
- "epoch": 0.34375,
163
- "grad_norm": 28.015024185180664,
164
- "learning_rate": 4.140625e-05,
165
- "loss": 2.8319,
166
  "step": 11
167
  },
168
  {
169
- "epoch": 0.34375,
170
- "eval_loss": 1.9908080101013184,
171
- "eval_runtime": 20.0428,
172
- "eval_samples_per_second": 2.495,
173
- "eval_steps_per_second": 0.349,
174
  "step": 11
175
  },
176
  {
177
- "epoch": 0.375,
178
- "grad_norm": 14.510987281799316,
179
- "learning_rate": 4.0625000000000005e-05,
180
- "loss": 2.6912,
181
  "step": 12
182
  },
183
  {
184
- "epoch": 0.375,
185
- "eval_loss": 1.9438972473144531,
186
- "eval_runtime": 15.1294,
187
- "eval_samples_per_second": 3.305,
188
- "eval_steps_per_second": 0.463,
189
  "step": 12
190
  },
191
  {
192
- "epoch": 0.40625,
193
- "grad_norm": 21.442792892456055,
194
- "learning_rate": 3.984375e-05,
195
- "loss": 2.9355,
196
  "step": 13
197
  },
198
  {
199
- "epoch": 0.40625,
200
- "eval_loss": 1.9830740690231323,
201
- "eval_runtime": 19.4816,
202
- "eval_samples_per_second": 2.567,
203
- "eval_steps_per_second": 0.359,
204
  "step": 13
205
  },
206
- {
207
- "epoch": 0.4375,
208
- "grad_norm": 11.233291625976562,
209
- "learning_rate": 3.90625e-05,
210
- "loss": 2.6233,
211
- "step": 14
212
- },
213
- {
214
- "epoch": 0.4375,
215
- "eval_loss": 1.9741547107696533,
216
- "eval_runtime": 14.0253,
217
- "eval_samples_per_second": 3.565,
218
- "eval_steps_per_second": 0.499,
219
- "step": 14
220
- },
221
- {
222
- "epoch": 0.46875,
223
- "grad_norm": 16.227378845214844,
224
- "learning_rate": 3.828125e-05,
225
- "loss": 2.5995,
226
- "step": 15
227
- },
228
- {
229
- "epoch": 0.46875,
230
- "eval_loss": 1.9386694431304932,
231
- "eval_runtime": 20.5374,
232
- "eval_samples_per_second": 2.435,
233
- "eval_steps_per_second": 0.341,
234
- "step": 15
235
- },
236
- {
237
- "epoch": 0.5,
238
- "grad_norm": 19.815183639526367,
239
- "learning_rate": 3.7500000000000003e-05,
240
- "loss": 2.4588,
241
- "step": 16
242
- },
243
- {
244
- "epoch": 0.5,
245
- "eval_loss": 1.8799047470092773,
246
- "eval_runtime": 14.1007,
247
- "eval_samples_per_second": 3.546,
248
- "eval_steps_per_second": 0.496,
249
- "step": 16
250
- },
251
- {
252
- "epoch": 0.53125,
253
- "grad_norm": 27.6804141998291,
254
- "learning_rate": 3.671875e-05,
255
- "loss": 2.6622,
256
- "step": 17
257
- },
258
- {
259
- "epoch": 0.53125,
260
- "eval_loss": 1.9120107889175415,
261
- "eval_runtime": 19.021,
262
- "eval_samples_per_second": 2.629,
263
- "eval_steps_per_second": 0.368,
264
- "step": 17
265
- },
266
- {
267
- "epoch": 0.5625,
268
- "grad_norm": 53.53272247314453,
269
- "learning_rate": 3.59375e-05,
270
- "loss": 2.6819,
271
- "step": 18
272
- },
273
- {
274
- "epoch": 0.5625,
275
- "eval_loss": 1.826271414756775,
276
- "eval_runtime": 14.0375,
277
- "eval_samples_per_second": 3.562,
278
- "eval_steps_per_second": 0.499,
279
- "step": 18
280
- },
281
- {
282
- "epoch": 0.59375,
283
- "grad_norm": 31.6351318359375,
284
- "learning_rate": 3.5156250000000004e-05,
285
- "loss": 2.6057,
286
- "step": 19
287
- },
288
- {
289
- "epoch": 0.59375,
290
- "eval_loss": 1.8063328266143799,
291
- "eval_runtime": 19.3292,
292
- "eval_samples_per_second": 2.587,
293
- "eval_steps_per_second": 0.362,
294
- "step": 19
295
- },
296
- {
297
- "epoch": 0.625,
298
- "grad_norm": 81.76189422607422,
299
- "learning_rate": 3.4375e-05,
300
- "loss": 2.1173,
301
- "step": 20
302
- },
303
- {
304
- "epoch": 0.625,
305
- "eval_loss": 1.813103437423706,
306
- "eval_runtime": 14.0502,
307
- "eval_samples_per_second": 3.559,
308
- "eval_steps_per_second": 0.498,
309
- "step": 20
310
- },
311
- {
312
- "epoch": 0.65625,
313
- "grad_norm": 28.440521240234375,
314
- "learning_rate": 3.359375e-05,
315
- "loss": 2.4934,
316
- "step": 21
317
- },
318
- {
319
- "epoch": 0.65625,
320
- "eval_loss": 1.8328737020492554,
321
- "eval_runtime": 19.9467,
322
- "eval_samples_per_second": 2.507,
323
- "eval_steps_per_second": 0.351,
324
- "step": 21
325
- },
326
- {
327
- "epoch": 0.6875,
328
- "grad_norm": 31.066139221191406,
329
- "learning_rate": 3.2812500000000005e-05,
330
- "loss": 2.4664,
331
- "step": 22
332
- },
333
- {
334
- "epoch": 0.6875,
335
- "eval_loss": 1.8729884624481201,
336
- "eval_runtime": 14.8515,
337
- "eval_samples_per_second": 3.367,
338
- "eval_steps_per_second": 0.471,
339
- "step": 22
340
- },
341
- {
342
- "epoch": 0.71875,
343
- "grad_norm": 47.6856803894043,
344
- "learning_rate": 3.203125e-05,
345
- "loss": 2.3591,
346
- "step": 23
347
- },
348
- {
349
- "epoch": 0.71875,
350
- "eval_loss": 1.9386744499206543,
351
- "eval_runtime": 19.4361,
352
- "eval_samples_per_second": 2.573,
353
- "eval_steps_per_second": 0.36,
354
- "step": 23
355
- },
356
- {
357
- "epoch": 0.75,
358
- "grad_norm": 44.08198928833008,
359
- "learning_rate": 3.125e-05,
360
- "loss": 2.4319,
361
- "step": 24
362
- },
363
- {
364
- "epoch": 0.75,
365
- "eval_loss": 1.9019237756729126,
366
- "eval_runtime": 13.9736,
367
- "eval_samples_per_second": 3.578,
368
- "eval_steps_per_second": 0.501,
369
- "step": 24
370
- },
371
- {
372
- "epoch": 0.78125,
373
- "grad_norm": 59.23967361450195,
374
- "learning_rate": 3.0468750000000002e-05,
375
- "loss": 2.5579,
376
- "step": 25
377
- },
378
- {
379
- "epoch": 0.78125,
380
- "eval_loss": 1.8655019998550415,
381
- "eval_runtime": 19.5942,
382
- "eval_samples_per_second": 2.552,
383
- "eval_steps_per_second": 0.357,
384
- "step": 25
385
- },
386
- {
387
- "epoch": 0.8125,
388
- "grad_norm": 44.628658294677734,
389
- "learning_rate": 2.96875e-05,
390
- "loss": 2.4048,
391
- "step": 26
392
- },
393
- {
394
- "epoch": 0.8125,
395
- "eval_loss": 1.8706899881362915,
396
- "eval_runtime": 14.1375,
397
- "eval_samples_per_second": 3.537,
398
- "eval_steps_per_second": 0.495,
399
- "step": 26
400
- },
401
- {
402
- "epoch": 0.84375,
403
- "grad_norm": 59.57282638549805,
404
- "learning_rate": 2.890625e-05,
405
- "loss": 2.7706,
406
- "step": 27
407
- },
408
- {
409
- "epoch": 0.84375,
410
- "eval_loss": 1.744444727897644,
411
- "eval_runtime": 19.9669,
412
- "eval_samples_per_second": 2.504,
413
- "eval_steps_per_second": 0.351,
414
- "step": 27
415
- },
416
- {
417
- "epoch": 0.875,
418
- "grad_norm": 38.48733901977539,
419
- "learning_rate": 2.8125000000000003e-05,
420
- "loss": 2.7279,
421
- "step": 28
422
- },
423
- {
424
- "epoch": 0.875,
425
- "eval_loss": 1.6933951377868652,
426
- "eval_runtime": 14.0043,
427
- "eval_samples_per_second": 3.57,
428
- "eval_steps_per_second": 0.5,
429
- "step": 28
430
- },
431
- {
432
- "epoch": 0.90625,
433
- "grad_norm": 38.13273620605469,
434
- "learning_rate": 2.734375e-05,
435
- "loss": 2.5247,
436
- "step": 29
437
- },
438
- {
439
- "epoch": 0.90625,
440
- "eval_loss": 1.6971945762634277,
441
- "eval_runtime": 18.8375,
442
- "eval_samples_per_second": 2.654,
443
- "eval_steps_per_second": 0.372,
444
- "step": 29
445
- },
446
- {
447
- "epoch": 0.9375,
448
- "grad_norm": 34.31296920776367,
449
- "learning_rate": 2.6562500000000002e-05,
450
- "loss": 2.2336,
451
- "step": 30
452
- },
453
- {
454
- "epoch": 0.9375,
455
- "eval_loss": 1.7182470560073853,
456
- "eval_runtime": 14.4648,
457
- "eval_samples_per_second": 3.457,
458
- "eval_steps_per_second": 0.484,
459
- "step": 30
460
- },
461
- {
462
- "epoch": 0.96875,
463
- "grad_norm": 34.71999740600586,
464
- "learning_rate": 2.578125e-05,
465
- "loss": 2.2479,
466
- "step": 31
467
- },
468
- {
469
- "epoch": 0.96875,
470
- "eval_loss": 1.7286863327026367,
471
- "eval_runtime": 19.0105,
472
- "eval_samples_per_second": 2.63,
473
- "eval_steps_per_second": 0.368,
474
- "step": 31
475
- },
476
- {
477
- "epoch": 1.0,
478
- "grad_norm": 54.52072525024414,
479
- "learning_rate": 2.5e-05,
480
- "loss": 0.9398,
481
- "step": 32
482
- },
483
- {
484
- "epoch": 1.0,
485
- "eval_loss": 1.7176891565322876,
486
- "eval_runtime": 15.5832,
487
- "eval_samples_per_second": 3.209,
488
- "eval_steps_per_second": 0.449,
489
- "step": 32
490
- },
491
- {
492
- "epoch": 1.03125,
493
- "grad_norm": 37.95784378051758,
494
- "learning_rate": 2.4218750000000003e-05,
495
- "loss": 2.2821,
496
- "step": 33
497
- },
498
- {
499
- "epoch": 1.03125,
500
- "eval_loss": 1.6573299169540405,
501
- "eval_runtime": 19.2268,
502
- "eval_samples_per_second": 2.601,
503
- "eval_steps_per_second": 0.364,
504
- "step": 33
505
- },
506
- {
507
- "epoch": 1.0625,
508
- "grad_norm": 32.960391998291016,
509
- "learning_rate": 2.34375e-05,
510
- "loss": 2.339,
511
- "step": 34
512
- },
513
- {
514
- "epoch": 1.0625,
515
- "eval_loss": 1.623481273651123,
516
- "eval_runtime": 14.5998,
517
- "eval_samples_per_second": 3.425,
518
- "eval_steps_per_second": 0.479,
519
- "step": 34
520
- },
521
- {
522
- "epoch": 1.09375,
523
- "grad_norm": 31.484209060668945,
524
- "learning_rate": 2.2656250000000002e-05,
525
- "loss": 1.7187,
526
- "step": 35
527
- },
528
- {
529
- "epoch": 1.09375,
530
- "eval_loss": 1.5644508600234985,
531
- "eval_runtime": 20.7871,
532
- "eval_samples_per_second": 2.405,
533
- "eval_steps_per_second": 0.337,
534
- "step": 35
535
- },
536
- {
537
- "epoch": 1.125,
538
- "grad_norm": 28.109580993652344,
539
- "learning_rate": 2.1875e-05,
540
- "loss": 1.585,
541
- "step": 36
542
- },
543
- {
544
- "epoch": 1.125,
545
- "eval_loss": 1.4872223138809204,
546
- "eval_runtime": 14.0172,
547
- "eval_samples_per_second": 3.567,
548
- "eval_steps_per_second": 0.499,
549
- "step": 36
550
- },
551
- {
552
- "epoch": 1.15625,
553
- "grad_norm": 34.407615661621094,
554
- "learning_rate": 2.109375e-05,
555
- "loss": 1.5801,
556
- "step": 37
557
- },
558
- {
559
- "epoch": 1.15625,
560
- "eval_loss": 1.429572582244873,
561
- "eval_runtime": 19.6378,
562
- "eval_samples_per_second": 2.546,
563
- "eval_steps_per_second": 0.356,
564
- "step": 37
565
- },
566
- {
567
- "epoch": 1.1875,
568
- "grad_norm": 34.66839599609375,
569
- "learning_rate": 2.0312500000000002e-05,
570
- "loss": 1.8689,
571
- "step": 38
572
- },
573
- {
574
- "epoch": 1.1875,
575
- "eval_loss": 1.4055802822113037,
576
- "eval_runtime": 15.0122,
577
- "eval_samples_per_second": 3.331,
578
- "eval_steps_per_second": 0.466,
579
- "step": 38
580
- },
581
- {
582
- "epoch": 1.21875,
583
- "grad_norm": 38.05352020263672,
584
- "learning_rate": 1.953125e-05,
585
- "loss": 1.5591,
586
- "step": 39
587
- },
588
- {
589
- "epoch": 1.21875,
590
- "eval_loss": 1.398817777633667,
591
- "eval_runtime": 20.1748,
592
- "eval_samples_per_second": 2.478,
593
- "eval_steps_per_second": 0.347,
594
- "step": 39
595
- },
596
- {
597
- "epoch": 1.25,
598
- "grad_norm": 45.93744659423828,
599
- "learning_rate": 1.8750000000000002e-05,
600
- "loss": 1.4769,
601
- "step": 40
602
- },
603
- {
604
- "epoch": 1.25,
605
- "eval_loss": 1.3957644701004028,
606
- "eval_runtime": 14.2141,
607
- "eval_samples_per_second": 3.518,
608
- "eval_steps_per_second": 0.492,
609
- "step": 40
610
- },
611
- {
612
- "epoch": 1.28125,
613
- "grad_norm": 67.26871490478516,
614
- "learning_rate": 1.796875e-05,
615
- "loss": 1.2898,
616
- "step": 41
617
- },
618
- {
619
- "epoch": 1.28125,
620
- "eval_loss": 1.3631155490875244,
621
- "eval_runtime": 18.8012,
622
- "eval_samples_per_second": 2.659,
623
- "eval_steps_per_second": 0.372,
624
- "step": 41
625
- },
626
- {
627
- "epoch": 1.3125,
628
- "grad_norm": 48.31477737426758,
629
- "learning_rate": 1.71875e-05,
630
- "loss": 1.4675,
631
- "step": 42
632
- },
633
- {
634
- "epoch": 1.3125,
635
- "eval_loss": 1.3446946144104004,
636
- "eval_runtime": 14.0177,
637
- "eval_samples_per_second": 3.567,
638
- "eval_steps_per_second": 0.499,
639
- "step": 42
640
- },
641
- {
642
- "epoch": 1.34375,
643
- "grad_norm": 43.087947845458984,
644
- "learning_rate": 1.6406250000000002e-05,
645
- "loss": 1.0123,
646
- "step": 43
647
- },
648
- {
649
- "epoch": 1.34375,
650
- "eval_loss": 1.3272167444229126,
651
- "eval_runtime": 19.8127,
652
- "eval_samples_per_second": 2.524,
653
- "eval_steps_per_second": 0.353,
654
- "step": 43
655
- },
656
- {
657
- "epoch": 1.375,
658
- "grad_norm": 50.14296340942383,
659
- "learning_rate": 1.5625e-05,
660
- "loss": 1.4516,
661
- "step": 44
662
- },
663
- {
664
- "epoch": 1.375,
665
- "eval_loss": 1.3105698823928833,
666
- "eval_runtime": 14.0327,
667
- "eval_samples_per_second": 3.563,
668
- "eval_steps_per_second": 0.499,
669
- "step": 44
670
- },
671
- {
672
- "epoch": 1.40625,
673
- "grad_norm": 56.435394287109375,
674
- "learning_rate": 1.484375e-05,
675
- "loss": 1.2132,
676
- "step": 45
677
- },
678
- {
679
- "epoch": 1.40625,
680
- "eval_loss": 1.2978880405426025,
681
- "eval_runtime": 19.3431,
682
- "eval_samples_per_second": 2.585,
683
- "eval_steps_per_second": 0.362,
684
- "step": 45
685
- },
686
- {
687
- "epoch": 1.4375,
688
- "grad_norm": 62.44044494628906,
689
- "learning_rate": 1.4062500000000001e-05,
690
- "loss": 1.1181,
691
- "step": 46
692
- },
693
- {
694
- "epoch": 1.4375,
695
- "eval_loss": 1.2973986864089966,
696
- "eval_runtime": 14.4572,
697
- "eval_samples_per_second": 3.458,
698
- "eval_steps_per_second": 0.484,
699
- "step": 46
700
- },
701
- {
702
- "epoch": 1.46875,
703
- "grad_norm": 49.06483459472656,
704
- "learning_rate": 1.3281250000000001e-05,
705
- "loss": 0.7987,
706
- "step": 47
707
- },
708
- {
709
- "epoch": 1.46875,
710
- "eval_loss": 1.3060061931610107,
711
- "eval_runtime": 18.8411,
712
- "eval_samples_per_second": 2.654,
713
- "eval_steps_per_second": 0.372,
714
- "step": 47
715
- },
716
- {
717
- "epoch": 1.5,
718
- "grad_norm": 59.63069534301758,
719
- "learning_rate": 1.25e-05,
720
- "loss": 0.7248,
721
- "step": 48
722
- },
723
- {
724
- "epoch": 1.5,
725
- "eval_loss": 1.3088918924331665,
726
- "eval_runtime": 16.3052,
727
- "eval_samples_per_second": 3.067,
728
- "eval_steps_per_second": 0.429,
729
- "step": 48
730
- },
731
- {
732
- "epoch": 1.53125,
733
- "grad_norm": 85.45850372314453,
734
- "learning_rate": 1.171875e-05,
735
- "loss": 0.8957,
736
- "step": 49
737
- },
738
- {
739
- "epoch": 1.53125,
740
- "eval_loss": 1.318437933921814,
741
- "eval_runtime": 19.1539,
742
- "eval_samples_per_second": 2.61,
743
- "eval_steps_per_second": 0.365,
744
- "step": 49
745
- },
746
- {
747
- "epoch": 1.5625,
748
- "grad_norm": 69.75704956054688,
749
- "learning_rate": 1.09375e-05,
750
- "loss": 0.8146,
751
- "step": 50
752
- },
753
- {
754
- "epoch": 1.5625,
755
- "eval_loss": 1.3147114515304565,
756
- "eval_runtime": 14.4752,
757
- "eval_samples_per_second": 3.454,
758
- "eval_steps_per_second": 0.484,
759
- "step": 50
760
- },
761
- {
762
- "epoch": 1.59375,
763
- "grad_norm": 55.29493713378906,
764
- "learning_rate": 1.0156250000000001e-05,
765
- "loss": 0.7953,
766
- "step": 51
767
- },
768
- {
769
- "epoch": 1.59375,
770
- "eval_loss": 1.2989192008972168,
771
- "eval_runtime": 20.9074,
772
- "eval_samples_per_second": 2.392,
773
- "eval_steps_per_second": 0.335,
774
- "step": 51
775
- },
776
- {
777
- "epoch": 1.625,
778
- "grad_norm": 71.8622817993164,
779
- "learning_rate": 9.375000000000001e-06,
780
- "loss": 1.1628,
781
- "step": 52
782
- },
783
- {
784
- "epoch": 1.625,
785
- "eval_loss": 1.2810921669006348,
786
- "eval_runtime": 14.0147,
787
- "eval_samples_per_second": 3.568,
788
- "eval_steps_per_second": 0.499,
789
- "step": 52
790
- },
791
- {
792
- "epoch": 1.65625,
793
- "grad_norm": 71.86141204833984,
794
- "learning_rate": 8.59375e-06,
795
- "loss": 1.5174,
796
- "step": 53
797
- },
798
- {
799
- "epoch": 1.65625,
800
- "eval_loss": 1.2605785131454468,
801
- "eval_runtime": 18.7275,
802
- "eval_samples_per_second": 2.67,
803
- "eval_steps_per_second": 0.374,
804
- "step": 53
805
- },
806
- {
807
- "epoch": 1.6875,
808
- "grad_norm": 62.22434997558594,
809
- "learning_rate": 7.8125e-06,
810
- "loss": 0.9822,
811
- "step": 54
812
- },
813
- {
814
- "epoch": 1.6875,
815
- "eval_loss": 1.2227574586868286,
816
- "eval_runtime": 13.9543,
817
- "eval_samples_per_second": 3.583,
818
- "eval_steps_per_second": 0.502,
819
- "step": 54
820
- },
821
- {
822
- "epoch": 1.71875,
823
- "grad_norm": 54.53126525878906,
824
- "learning_rate": 7.031250000000001e-06,
825
- "loss": 1.0173,
826
- "step": 55
827
- },
828
- {
829
- "epoch": 1.71875,
830
- "eval_loss": 1.1982362270355225,
831
- "eval_runtime": 19.0289,
832
- "eval_samples_per_second": 2.628,
833
- "eval_steps_per_second": 0.368,
834
- "step": 55
835
- },
836
- {
837
- "epoch": 1.75,
838
- "grad_norm": 76.07511138916016,
839
- "learning_rate": 6.25e-06,
840
- "loss": 1.3464,
841
- "step": 56
842
- },
843
- {
844
- "epoch": 1.75,
845
- "eval_loss": 1.1850783824920654,
846
- "eval_runtime": 14.0372,
847
- "eval_samples_per_second": 3.562,
848
- "eval_steps_per_second": 0.499,
849
- "step": 56
850
- },
851
- {
852
- "epoch": 1.78125,
853
- "grad_norm": 72.6391830444336,
854
- "learning_rate": 5.46875e-06,
855
- "loss": 0.8926,
856
- "step": 57
857
- },
858
- {
859
- "epoch": 1.78125,
860
- "eval_loss": 1.1799843311309814,
861
- "eval_runtime": 22.4922,
862
- "eval_samples_per_second": 2.223,
863
- "eval_steps_per_second": 0.311,
864
- "step": 57
865
- },
866
- {
867
- "epoch": 1.8125,
868
- "grad_norm": 56.88068771362305,
869
- "learning_rate": 4.6875000000000004e-06,
870
- "loss": 0.9163,
871
- "step": 58
872
- },
873
- {
874
- "epoch": 1.8125,
875
- "eval_loss": 1.1798558235168457,
876
- "eval_runtime": 14.0488,
877
- "eval_samples_per_second": 3.559,
878
- "eval_steps_per_second": 0.498,
879
- "step": 58
880
- },
881
- {
882
- "epoch": 1.84375,
883
- "grad_norm": 45.068748474121094,
884
- "learning_rate": 3.90625e-06,
885
- "loss": 0.7737,
886
- "step": 59
887
- },
888
- {
889
- "epoch": 1.84375,
890
- "eval_loss": 1.178802490234375,
891
- "eval_runtime": 20.1914,
892
- "eval_samples_per_second": 2.476,
893
- "eval_steps_per_second": 0.347,
894
- "step": 59
895
- },
896
- {
897
- "epoch": 1.875,
898
- "grad_norm": 65.94618225097656,
899
- "learning_rate": 3.125e-06,
900
- "loss": 0.9931,
901
- "step": 60
902
- },
903
- {
904
- "epoch": 1.875,
905
- "eval_loss": 1.1730504035949707,
906
- "eval_runtime": 14.2215,
907
- "eval_samples_per_second": 3.516,
908
- "eval_steps_per_second": 0.492,
909
- "step": 60
910
- },
911
- {
912
- "epoch": 1.90625,
913
- "grad_norm": 67.28651428222656,
914
- "learning_rate": 2.3437500000000002e-06,
915
- "loss": 0.775,
916
- "step": 61
917
- },
918
- {
919
- "epoch": 1.90625,
920
- "eval_loss": 1.1700712442398071,
921
- "eval_runtime": 24.2206,
922
- "eval_samples_per_second": 2.064,
923
- "eval_steps_per_second": 0.289,
924
- "step": 61
925
- },
926
- {
927
- "epoch": 1.9375,
928
- "grad_norm": 45.8297004699707,
929
- "learning_rate": 1.5625e-06,
930
- "loss": 0.655,
931
- "step": 62
932
- },
933
- {
934
- "epoch": 1.9375,
935
- "eval_loss": 1.1683861017227173,
936
- "eval_runtime": 13.9273,
937
- "eval_samples_per_second": 3.59,
938
- "eval_steps_per_second": 0.503,
939
- "step": 62
940
- },
941
- {
942
- "epoch": 1.96875,
943
- "grad_norm": 64.58641052246094,
944
- "learning_rate": 7.8125e-07,
945
- "loss": 1.0303,
946
- "step": 63
947
- },
948
- {
949
- "epoch": 1.96875,
950
- "eval_loss": 1.1670336723327637,
951
- "eval_runtime": 22.0932,
952
- "eval_samples_per_second": 2.263,
953
- "eval_steps_per_second": 0.317,
954
- "step": 63
955
- },
956
  {
957
  "epoch": 2.0,
958
- "grad_norm": 94.33624267578125,
959
  "learning_rate": 0.0,
960
- "loss": 0.2727,
961
- "step": 64
962
  },
963
  {
964
  "epoch": 2.0,
965
- "eval_loss": 1.166153073310852,
966
- "eval_runtime": 15.0662,
967
- "eval_samples_per_second": 3.319,
968
- "eval_steps_per_second": 0.465,
969
- "step": 64
970
  },
971
  {
972
  "epoch": 2.0,
973
- "step": 64,
974
- "total_flos": 58050663970068.0,
975
- "train_loss": 1.8778469576500356,
976
- "train_runtime": 2427.903,
977
- "train_samples_per_second": 0.411,
978
- "train_steps_per_second": 0.026
979
  }
980
  ],
981
  "logging_steps": 1.0,
982
- "max_steps": 64,
983
  "num_input_tokens_seen": 0,
984
  "num_train_epochs": 2,
985
  "save_steps": 500,
@@ -995,8 +245,8 @@
995
  "attributes": {}
996
  }
997
  },
998
- "total_flos": 58050663970068.0,
999
- "train_batch_size": 16,
1000
  "trial_name": null,
1001
  "trial_params": null
1002
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 2.0,
5
  "eval_steps": 1.0,
6
+ "global_step": 14,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.14285714285714285,
13
+ "grad_norm": 203.78257751464844,
14
+ "learning_rate": 4.642857142857143e-05,
15
+ "loss": 0.8088,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.14285714285714285,
20
+ "eval_loss": 1.0832017660140991,
21
+ "eval_runtime": 4.4764,
22
+ "eval_samples_per_second": 2.234,
23
+ "eval_steps_per_second": 1.117,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.2857142857142857,
28
+ "grad_norm": 551.1321411132812,
29
+ "learning_rate": 4.2857142857142856e-05,
30
+ "loss": 1.5034,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.2857142857142857,
35
+ "eval_loss": 0.8266311883926392,
36
+ "eval_runtime": 2.9969,
37
+ "eval_samples_per_second": 3.337,
38
+ "eval_steps_per_second": 1.668,
39
  "step": 2
40
  },
41
  {
42
+ "epoch": 0.42857142857142855,
43
+ "grad_norm": 342.6697998046875,
44
+ "learning_rate": 3.928571428571429e-05,
45
+ "loss": 1.617,
46
  "step": 3
47
  },
48
  {
49
+ "epoch": 0.42857142857142855,
50
+ "eval_loss": 0.7760497331619263,
51
+ "eval_runtime": 2.9732,
52
+ "eval_samples_per_second": 3.363,
53
+ "eval_steps_per_second": 1.682,
54
  "step": 3
55
  },
56
  {
57
+ "epoch": 0.5714285714285714,
58
+ "grad_norm": 792.9654541015625,
59
+ "learning_rate": 3.571428571428572e-05,
60
+ "loss": 2.2375,
61
  "step": 4
62
  },
63
  {
64
+ "epoch": 0.5714285714285714,
65
+ "eval_loss": 0.7384463548660278,
66
+ "eval_runtime": 3.1473,
67
+ "eval_samples_per_second": 3.177,
68
+ "eval_steps_per_second": 1.589,
69
  "step": 4
70
  },
71
  {
72
+ "epoch": 0.7142857142857143,
73
+ "grad_norm": 210.89300537109375,
74
+ "learning_rate": 3.2142857142857144e-05,
75
+ "loss": 1.5411,
76
  "step": 5
77
  },
78
  {
79
+ "epoch": 0.7142857142857143,
80
+ "eval_loss": 0.82770174741745,
81
+ "eval_runtime": 3.0122,
82
+ "eval_samples_per_second": 3.32,
83
+ "eval_steps_per_second": 1.66,
84
  "step": 5
85
  },
86
  {
87
+ "epoch": 0.8571428571428571,
88
+ "grad_norm": 314.6759338378906,
89
+ "learning_rate": 2.857142857142857e-05,
90
+ "loss": 2.4631,
91
  "step": 6
92
  },
93
  {
94
+ "epoch": 0.8571428571428571,
95
+ "eval_loss": 0.44329363107681274,
96
+ "eval_runtime": 4.1331,
97
+ "eval_samples_per_second": 2.42,
98
+ "eval_steps_per_second": 1.21,
99
  "step": 6
100
  },
101
  {
102
+ "epoch": 1.0,
103
+ "grad_norm": 17.239919662475586,
104
+ "learning_rate": 2.5e-05,
105
+ "loss": 0.0217,
106
  "step": 7
107
  },
108
  {
109
+ "epoch": 1.0,
110
+ "eval_loss": 0.9471458196640015,
111
+ "eval_runtime": 3.013,
112
+ "eval_samples_per_second": 3.319,
113
+ "eval_steps_per_second": 1.659,
114
  "step": 7
115
  },
116
  {
117
+ "epoch": 1.1428571428571428,
118
+ "grad_norm": 213.36294555664062,
119
+ "learning_rate": 2.1428571428571428e-05,
120
+ "loss": 2.4217,
121
  "step": 8
122
  },
123
  {
124
+ "epoch": 1.1428571428571428,
125
+ "eval_loss": 0.9182891845703125,
126
+ "eval_runtime": 2.9638,
127
+ "eval_samples_per_second": 3.374,
128
+ "eval_steps_per_second": 1.687,
129
  "step": 8
130
  },
131
  {
132
+ "epoch": 1.2857142857142856,
133
+ "grad_norm": 200.6674346923828,
134
+ "learning_rate": 1.785714285714286e-05,
135
+ "loss": 2.0588,
136
  "step": 9
137
  },
138
  {
139
+ "epoch": 1.2857142857142856,
140
+ "eval_loss": 0.7376034259796143,
141
+ "eval_runtime": 2.958,
142
+ "eval_samples_per_second": 3.381,
143
+ "eval_steps_per_second": 1.69,
144
  "step": 9
145
  },
146
  {
147
+ "epoch": 1.4285714285714286,
148
+ "grad_norm": 215.76296997070312,
149
+ "learning_rate": 1.4285714285714285e-05,
150
+ "loss": 1.6484,
151
  "step": 10
152
  },
153
  {
154
+ "epoch": 1.4285714285714286,
155
+ "eval_loss": 0.5826703906059265,
156
+ "eval_runtime": 2.9863,
157
+ "eval_samples_per_second": 3.349,
158
+ "eval_steps_per_second": 1.674,
159
  "step": 10
160
  },
161
  {
162
+ "epoch": 1.5714285714285714,
163
+ "grad_norm": 122.98780059814453,
164
+ "learning_rate": 1.0714285714285714e-05,
165
+ "loss": 0.9379,
166
  "step": 11
167
  },
168
  {
169
+ "epoch": 1.5714285714285714,
170
+ "eval_loss": 0.5854327082633972,
171
+ "eval_runtime": 2.9738,
172
+ "eval_samples_per_second": 3.363,
173
+ "eval_steps_per_second": 1.681,
174
  "step": 11
175
  },
176
  {
177
+ "epoch": 1.7142857142857144,
178
+ "grad_norm": 110.82337951660156,
179
+ "learning_rate": 7.142857142857143e-06,
180
+ "loss": 0.8608,
181
  "step": 12
182
  },
183
  {
184
+ "epoch": 1.7142857142857144,
185
+ "eval_loss": 0.5832847952842712,
186
+ "eval_runtime": 6.4862,
187
+ "eval_samples_per_second": 1.542,
188
+ "eval_steps_per_second": 0.771,
189
  "step": 12
190
  },
191
  {
192
+ "epoch": 1.8571428571428572,
193
+ "grad_norm": 91.99958801269531,
194
+ "learning_rate": 3.5714285714285714e-06,
195
+ "loss": 0.958,
196
  "step": 13
197
  },
198
  {
199
+ "epoch": 1.8571428571428572,
200
+ "eval_loss": 0.5970481634140015,
201
+ "eval_runtime": 2.9762,
202
+ "eval_samples_per_second": 3.36,
203
+ "eval_steps_per_second": 1.68,
204
  "step": 13
205
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  {
207
  "epoch": 2.0,
208
+ "grad_norm": 28.6821346282959,
209
  "learning_rate": 0.0,
210
+ "loss": 0.0355,
211
+ "step": 14
212
  },
213
  {
214
  "epoch": 2.0,
215
+ "eval_loss": 0.5955778956413269,
216
+ "eval_runtime": 5.3785,
217
+ "eval_samples_per_second": 1.859,
218
+ "eval_steps_per_second": 0.93,
219
+ "step": 14
220
  },
221
  {
222
  "epoch": 2.0,
223
+ "step": 14,
224
+ "total_flos": 5816699796600.0,
225
+ "train_loss": 1.3652541448495217,
226
+ "train_runtime": 216.8469,
227
+ "train_samples_per_second": 0.461,
228
+ "train_steps_per_second": 0.065
229
  }
230
  ],
231
  "logging_steps": 1.0,
232
+ "max_steps": 14,
233
  "num_input_tokens_seen": 0,
234
  "num_train_epochs": 2,
235
  "save_steps": 500,
 
245
  "attributes": {}
246
  }
247
  },
248
+ "total_flos": 5816699796600.0,
249
+ "train_batch_size": 8,
250
  "trial_name": null,
251
  "trial_params": null
252
  }