mikhail-panzo commited on
Commit
901ab5a
1 Parent(s): ad1a92c

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99e38e3602d41711cdbb533cb7d6b84bc03b5cfdcec09079e6e94d8e9c933c1f
3
  size 577789320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc0336bab9ad53a5d9ba35f689531e4f56cffd1eb07fbe59ee2bf923acde76a8
3
  size 577789320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4324a138b51ccff30cd6582605837ac09263f11a633b6ebf8526d9c133d37e6a
3
  size 1155772233
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d54884dc3d75a7228a6f73783a44ed6321489769a21ba9feb34fcacc24f3c9
3
  size 1155772233
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddee4e1cd9c11fae1531a3888b26c6306dfc6effea2b5b4de3f934096de4907a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99eeec94447248854bf811d769a2a208fc950d0961184a4f99f03ffdc252b32b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc650e789a99f1d69f79d9aa960ac1927e43a2cad64cee2ef28fa7a0ac21a5a3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a1a187666ea0e44f9d015f844e1601f5b4c6844588e1b362a3c9b6a7527a74f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.3102165162563324,
3
- "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-9500",
4
- "epoch": 15.916230366492147,
5
  "eval_steps": 500,
6
- "global_step": 9500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -81,1420 +81,16 @@
81
  {
82
  "epoch": 0.837696335078534,
83
  "eval_loss": 0.4566049873828888,
84
- "eval_runtime": 268.5202,
85
- "eval_samples_per_second": 31.614,
86
- "eval_steps_per_second": 3.955,
87
  "step": 500
88
- },
89
- {
90
- "epoch": 0.9214659685863874,
91
- "grad_norm": 1.9436837434768677,
92
- "learning_rate": 2.7450000000000003e-05,
93
- "loss": 0.5079,
94
- "step": 550
95
- },
96
- {
97
- "epoch": 1.0052356020942408,
98
- "grad_norm": 1.819956660270691,
99
- "learning_rate": 2.995e-05,
100
- "loss": 0.4969,
101
- "step": 600
102
- },
103
- {
104
- "epoch": 1.0890052356020943,
105
- "grad_norm": 5.457251071929932,
106
- "learning_rate": 3.245e-05,
107
- "loss": 0.4977,
108
- "step": 650
109
- },
110
- {
111
- "epoch": 1.1727748691099475,
112
- "grad_norm": 3.183980703353882,
113
- "learning_rate": 3.495e-05,
114
- "loss": 0.4923,
115
- "step": 700
116
- },
117
- {
118
- "epoch": 1.256544502617801,
119
- "grad_norm": 7.1660051345825195,
120
- "learning_rate": 3.745e-05,
121
- "loss": 0.4802,
122
- "step": 750
123
- },
124
- {
125
- "epoch": 1.3403141361256545,
126
- "grad_norm": 5.499026775360107,
127
- "learning_rate": 3.995e-05,
128
- "loss": 0.4754,
129
- "step": 800
130
- },
131
- {
132
- "epoch": 1.4240837696335078,
133
- "grad_norm": 2.8053908348083496,
134
- "learning_rate": 4.245e-05,
135
- "loss": 0.4669,
136
- "step": 850
137
- },
138
- {
139
- "epoch": 1.5078534031413613,
140
- "grad_norm": 3.017005443572998,
141
- "learning_rate": 4.495e-05,
142
- "loss": 0.4604,
143
- "step": 900
144
- },
145
- {
146
- "epoch": 1.5916230366492146,
147
- "grad_norm": 2.7971177101135254,
148
- "learning_rate": 4.745e-05,
149
- "loss": 0.4565,
150
- "step": 950
151
- },
152
- {
153
- "epoch": 1.675392670157068,
154
- "grad_norm": 3.1588356494903564,
155
- "learning_rate": 4.995e-05,
156
- "loss": 0.455,
157
- "step": 1000
158
- },
159
- {
160
- "epoch": 1.675392670157068,
161
- "eval_loss": 0.40312233567237854,
162
- "eval_runtime": 271.3585,
163
- "eval_samples_per_second": 31.283,
164
- "eval_steps_per_second": 3.914,
165
- "step": 1000
166
- },
167
- {
168
- "epoch": 1.7591623036649215,
169
- "grad_norm": 2.2053232192993164,
170
- "learning_rate": 5.245e-05,
171
- "loss": 0.4543,
172
- "step": 1050
173
- },
174
- {
175
- "epoch": 1.8429319371727748,
176
- "grad_norm": 2.0562164783477783,
177
- "learning_rate": 5.495e-05,
178
- "loss": 0.4456,
179
- "step": 1100
180
- },
181
- {
182
- "epoch": 1.9267015706806283,
183
- "grad_norm": 2.730119466781616,
184
- "learning_rate": 5.745e-05,
185
- "loss": 0.4355,
186
- "step": 1150
187
- },
188
- {
189
- "epoch": 2.0104712041884816,
190
- "grad_norm": 1.7484283447265625,
191
- "learning_rate": 5.995000000000001e-05,
192
- "loss": 0.4299,
193
- "step": 1200
194
- },
195
- {
196
- "epoch": 2.094240837696335,
197
- "grad_norm": 1.1786061525344849,
198
- "learning_rate": 6.245000000000001e-05,
199
- "loss": 0.4305,
200
- "step": 1250
201
- },
202
- {
203
- "epoch": 2.1780104712041886,
204
- "grad_norm": 1.98978590965271,
205
- "learning_rate": 6.494999999999999e-05,
206
- "loss": 0.4295,
207
- "step": 1300
208
- },
209
- {
210
- "epoch": 2.261780104712042,
211
- "grad_norm": 2.818659782409668,
212
- "learning_rate": 6.745e-05,
213
- "loss": 0.4235,
214
- "step": 1350
215
- },
216
- {
217
- "epoch": 2.345549738219895,
218
- "grad_norm": 2.3864262104034424,
219
- "learning_rate": 6.995e-05,
220
- "loss": 0.4271,
221
- "step": 1400
222
- },
223
- {
224
- "epoch": 2.4293193717277486,
225
- "grad_norm": 1.3647903203964233,
226
- "learning_rate": 7.245000000000001e-05,
227
- "loss": 0.4208,
228
- "step": 1450
229
- },
230
- {
231
- "epoch": 2.513089005235602,
232
- "grad_norm": 2.2144172191619873,
233
- "learning_rate": 7.495e-05,
234
- "loss": 0.4175,
235
- "step": 1500
236
- },
237
- {
238
- "epoch": 2.513089005235602,
239
- "eval_loss": 0.3777858018875122,
240
- "eval_runtime": 273.3281,
241
- "eval_samples_per_second": 31.058,
242
- "eval_steps_per_second": 3.885,
243
- "step": 1500
244
- },
245
- {
246
- "epoch": 2.5968586387434556,
247
- "grad_norm": 1.6483193635940552,
248
- "learning_rate": 7.745e-05,
249
- "loss": 0.414,
250
- "step": 1550
251
- },
252
- {
253
- "epoch": 2.680628272251309,
254
- "grad_norm": 1.7688554525375366,
255
- "learning_rate": 7.995e-05,
256
- "loss": 0.4153,
257
- "step": 1600
258
- },
259
- {
260
- "epoch": 2.7643979057591626,
261
- "grad_norm": 1.2314317226409912,
262
- "learning_rate": 8.245e-05,
263
- "loss": 0.4089,
264
- "step": 1650
265
- },
266
- {
267
- "epoch": 2.8481675392670156,
268
- "grad_norm": 1.6623793840408325,
269
- "learning_rate": 8.495e-05,
270
- "loss": 0.4124,
271
- "step": 1700
272
- },
273
- {
274
- "epoch": 2.931937172774869,
275
- "grad_norm": 3.812507390975952,
276
- "learning_rate": 8.745000000000001e-05,
277
- "loss": 0.4112,
278
- "step": 1750
279
- },
280
- {
281
- "epoch": 3.0157068062827226,
282
- "grad_norm": 2.141019821166992,
283
- "learning_rate": 8.995e-05,
284
- "loss": 0.4081,
285
- "step": 1800
286
- },
287
- {
288
- "epoch": 3.099476439790576,
289
- "grad_norm": 1.8928133249282837,
290
- "learning_rate": 9.245e-05,
291
- "loss": 0.4067,
292
- "step": 1850
293
- },
294
- {
295
- "epoch": 3.183246073298429,
296
- "grad_norm": 2.322817087173462,
297
- "learning_rate": 9.495e-05,
298
- "loss": 0.4088,
299
- "step": 1900
300
- },
301
- {
302
- "epoch": 3.2670157068062826,
303
- "grad_norm": 2.1984918117523193,
304
- "learning_rate": 9.745000000000001e-05,
305
- "loss": 0.3976,
306
- "step": 1950
307
- },
308
- {
309
- "epoch": 3.350785340314136,
310
- "grad_norm": 2.0455121994018555,
311
- "learning_rate": 9.995e-05,
312
- "loss": 0.4022,
313
- "step": 2000
314
- },
315
- {
316
- "epoch": 3.350785340314136,
317
- "eval_loss": 0.3677983582019806,
318
- "eval_runtime": 274.4574,
319
- "eval_samples_per_second": 30.93,
320
- "eval_steps_per_second": 3.869,
321
- "step": 2000
322
- },
323
- {
324
- "epoch": 3.4345549738219896,
325
- "grad_norm": 1.2897744178771973,
326
- "learning_rate": 9.951e-05,
327
- "loss": 0.4026,
328
- "step": 2050
329
- },
330
- {
331
- "epoch": 3.518324607329843,
332
- "grad_norm": 1.470860242843628,
333
- "learning_rate": 9.901e-05,
334
- "loss": 0.4008,
335
- "step": 2100
336
- },
337
- {
338
- "epoch": 3.6020942408376966,
339
- "grad_norm": 1.2159388065338135,
340
- "learning_rate": 9.851e-05,
341
- "loss": 0.3971,
342
- "step": 2150
343
- },
344
- {
345
- "epoch": 3.6858638743455496,
346
- "grad_norm": 2.0348379611968994,
347
- "learning_rate": 9.801e-05,
348
- "loss": 0.396,
349
- "step": 2200
350
- },
351
- {
352
- "epoch": 3.769633507853403,
353
- "grad_norm": 1.7535659074783325,
354
- "learning_rate": 9.751e-05,
355
- "loss": 0.3929,
356
- "step": 2250
357
- },
358
- {
359
- "epoch": 3.8534031413612566,
360
- "grad_norm": 1.361984372138977,
361
- "learning_rate": 9.701e-05,
362
- "loss": 0.3905,
363
- "step": 2300
364
- },
365
- {
366
- "epoch": 3.93717277486911,
367
- "grad_norm": 1.7380383014678955,
368
- "learning_rate": 9.651e-05,
369
- "loss": 0.3957,
370
- "step": 2350
371
- },
372
- {
373
- "epoch": 4.020942408376963,
374
- "grad_norm": 1.2679184675216675,
375
- "learning_rate": 9.601e-05,
376
- "loss": 0.388,
377
- "step": 2400
378
- },
379
- {
380
- "epoch": 4.104712041884817,
381
- "grad_norm": 1.274625301361084,
382
- "learning_rate": 9.551e-05,
383
- "loss": 0.3887,
384
- "step": 2450
385
- },
386
- {
387
- "epoch": 4.18848167539267,
388
- "grad_norm": 1.813714861869812,
389
- "learning_rate": 9.501e-05,
390
- "loss": 0.3865,
391
- "step": 2500
392
- },
393
- {
394
- "epoch": 4.18848167539267,
395
- "eval_loss": 0.35398951172828674,
396
- "eval_runtime": 271.385,
397
- "eval_samples_per_second": 31.28,
398
- "eval_steps_per_second": 3.913,
399
- "step": 2500
400
- },
401
- {
402
- "epoch": 4.272251308900524,
403
- "grad_norm": 2.468984842300415,
404
- "learning_rate": 9.451000000000002e-05,
405
- "loss": 0.3902,
406
- "step": 2550
407
- },
408
- {
409
- "epoch": 4.356020942408377,
410
- "grad_norm": 1.2810943126678467,
411
- "learning_rate": 9.401e-05,
412
- "loss": 0.386,
413
- "step": 2600
414
- },
415
- {
416
- "epoch": 4.439790575916231,
417
- "grad_norm": 1.6781765222549438,
418
- "learning_rate": 9.351e-05,
419
- "loss": 0.383,
420
- "step": 2650
421
- },
422
- {
423
- "epoch": 4.523560209424084,
424
- "grad_norm": 1.617163896560669,
425
- "learning_rate": 9.301e-05,
426
- "loss": 0.3849,
427
- "step": 2700
428
- },
429
- {
430
- "epoch": 4.607329842931938,
431
- "grad_norm": 1.4169151782989502,
432
- "learning_rate": 9.251000000000001e-05,
433
- "loss": 0.3807,
434
- "step": 2750
435
- },
436
- {
437
- "epoch": 4.69109947643979,
438
- "grad_norm": 1.1944037675857544,
439
- "learning_rate": 9.201000000000001e-05,
440
- "loss": 0.3838,
441
- "step": 2800
442
- },
443
- {
444
- "epoch": 4.774869109947644,
445
- "grad_norm": 1.7312718629837036,
446
- "learning_rate": 9.151000000000001e-05,
447
- "loss": 0.3808,
448
- "step": 2850
449
- },
450
- {
451
- "epoch": 4.858638743455497,
452
- "grad_norm": 1.357228398323059,
453
- "learning_rate": 9.101000000000001e-05,
454
- "loss": 0.3832,
455
- "step": 2900
456
- },
457
- {
458
- "epoch": 4.942408376963351,
459
- "grad_norm": 1.2495553493499756,
460
- "learning_rate": 9.051000000000001e-05,
461
- "loss": 0.3837,
462
- "step": 2950
463
- },
464
- {
465
- "epoch": 5.026178010471204,
466
- "grad_norm": 1.3688994646072388,
467
- "learning_rate": 9.001e-05,
468
- "loss": 0.3802,
469
- "step": 3000
470
- },
471
- {
472
- "epoch": 5.026178010471204,
473
- "eval_loss": 0.3458922803401947,
474
- "eval_runtime": 277.371,
475
- "eval_samples_per_second": 30.605,
476
- "eval_steps_per_second": 3.829,
477
- "step": 3000
478
- },
479
- {
480
- "epoch": 5.109947643979058,
481
- "grad_norm": 1.0916550159454346,
482
- "learning_rate": 8.951e-05,
483
- "loss": 0.3747,
484
- "step": 3050
485
- },
486
- {
487
- "epoch": 5.193717277486911,
488
- "grad_norm": 1.4605640172958374,
489
- "learning_rate": 8.901e-05,
490
- "loss": 0.3765,
491
- "step": 3100
492
- },
493
- {
494
- "epoch": 5.277486910994765,
495
- "grad_norm": 1.302049994468689,
496
- "learning_rate": 8.851e-05,
497
- "loss": 0.3753,
498
- "step": 3150
499
- },
500
- {
501
- "epoch": 5.361256544502618,
502
- "grad_norm": 1.0380531549453735,
503
- "learning_rate": 8.801e-05,
504
- "loss": 0.3735,
505
- "step": 3200
506
- },
507
- {
508
- "epoch": 5.445026178010472,
509
- "grad_norm": 2.157710075378418,
510
- "learning_rate": 8.751000000000001e-05,
511
- "loss": 0.3766,
512
- "step": 3250
513
- },
514
- {
515
- "epoch": 5.528795811518324,
516
- "grad_norm": 2.2072594165802,
517
- "learning_rate": 8.701000000000001e-05,
518
- "loss": 0.3767,
519
- "step": 3300
520
- },
521
- {
522
- "epoch": 5.612565445026178,
523
- "grad_norm": 1.258347749710083,
524
- "learning_rate": 8.651e-05,
525
- "loss": 0.3709,
526
- "step": 3350
527
- },
528
- {
529
- "epoch": 5.696335078534031,
530
- "grad_norm": 1.7026106119155884,
531
- "learning_rate": 8.601e-05,
532
- "loss": 0.3715,
533
- "step": 3400
534
- },
535
- {
536
- "epoch": 5.780104712041885,
537
- "grad_norm": 1.1708229780197144,
538
- "learning_rate": 8.551e-05,
539
- "loss": 0.3716,
540
- "step": 3450
541
- },
542
- {
543
- "epoch": 5.863874345549738,
544
- "grad_norm": 2.3675355911254883,
545
- "learning_rate": 8.501e-05,
546
- "loss": 0.3693,
547
- "step": 3500
548
- },
549
- {
550
- "epoch": 5.863874345549738,
551
- "eval_loss": 0.3417563736438751,
552
- "eval_runtime": 272.8827,
553
- "eval_samples_per_second": 31.109,
554
- "eval_steps_per_second": 3.892,
555
- "step": 3500
556
- },
557
- {
558
- "epoch": 5.947643979057592,
559
- "grad_norm": 1.6144191026687622,
560
- "learning_rate": 8.451e-05,
561
- "loss": 0.3666,
562
- "step": 3550
563
- },
564
- {
565
- "epoch": 6.031413612565445,
566
- "grad_norm": 1.4944205284118652,
567
- "learning_rate": 8.401e-05,
568
- "loss": 0.3657,
569
- "step": 3600
570
- },
571
- {
572
- "epoch": 6.115183246073299,
573
- "grad_norm": 1.0198278427124023,
574
- "learning_rate": 8.351e-05,
575
- "loss": 0.3702,
576
- "step": 3650
577
- },
578
- {
579
- "epoch": 6.198952879581152,
580
- "grad_norm": 2.195380926132202,
581
- "learning_rate": 8.300999999999999e-05,
582
- "loss": 0.3686,
583
- "step": 3700
584
- },
585
- {
586
- "epoch": 6.282722513089006,
587
- "grad_norm": 1.3650749921798706,
588
- "learning_rate": 8.251e-05,
589
- "loss": 0.3701,
590
- "step": 3750
591
- },
592
- {
593
- "epoch": 6.366492146596858,
594
- "grad_norm": 1.6887727975845337,
595
- "learning_rate": 8.201000000000001e-05,
596
- "loss": 0.3677,
597
- "step": 3800
598
- },
599
- {
600
- "epoch": 6.450261780104712,
601
- "grad_norm": 0.8709685206413269,
602
- "learning_rate": 8.151000000000001e-05,
603
- "loss": 0.3678,
604
- "step": 3850
605
- },
606
- {
607
- "epoch": 6.534031413612565,
608
- "grad_norm": 1.0899595022201538,
609
- "learning_rate": 8.101000000000001e-05,
610
- "loss": 0.3641,
611
- "step": 3900
612
- },
613
- {
614
- "epoch": 6.617801047120419,
615
- "grad_norm": 1.1222867965698242,
616
- "learning_rate": 8.051000000000001e-05,
617
- "loss": 0.3691,
618
- "step": 3950
619
- },
620
- {
621
- "epoch": 6.701570680628272,
622
- "grad_norm": 1.0771104097366333,
623
- "learning_rate": 8.001e-05,
624
- "loss": 0.3674,
625
- "step": 4000
626
- },
627
- {
628
- "epoch": 6.701570680628272,
629
- "eval_loss": 0.3313756585121155,
630
- "eval_runtime": 279.286,
631
- "eval_samples_per_second": 30.395,
632
- "eval_steps_per_second": 3.803,
633
- "step": 4000
634
- },
635
- {
636
- "epoch": 6.785340314136126,
637
- "grad_norm": 1.868295669555664,
638
- "learning_rate": 7.951e-05,
639
- "loss": 0.3617,
640
- "step": 4050
641
- },
642
- {
643
- "epoch": 6.869109947643979,
644
- "grad_norm": 1.0599360466003418,
645
- "learning_rate": 7.901e-05,
646
- "loss": 0.3637,
647
- "step": 4100
648
- },
649
- {
650
- "epoch": 6.952879581151833,
651
- "grad_norm": 1.4801158905029297,
652
- "learning_rate": 7.851e-05,
653
- "loss": 0.363,
654
- "step": 4150
655
- },
656
- {
657
- "epoch": 7.036649214659686,
658
- "grad_norm": 1.137289047241211,
659
- "learning_rate": 7.801000000000001e-05,
660
- "loss": 0.3622,
661
- "step": 4200
662
- },
663
- {
664
- "epoch": 7.12041884816754,
665
- "grad_norm": 1.2109190225601196,
666
- "learning_rate": 7.751000000000001e-05,
667
- "loss": 0.3668,
668
- "step": 4250
669
- },
670
- {
671
- "epoch": 7.204188481675392,
672
- "grad_norm": 1.1171132326126099,
673
- "learning_rate": 7.701000000000001e-05,
674
- "loss": 0.3594,
675
- "step": 4300
676
- },
677
- {
678
- "epoch": 7.287958115183246,
679
- "grad_norm": 1.2529895305633545,
680
- "learning_rate": 7.651e-05,
681
- "loss": 0.3635,
682
- "step": 4350
683
- },
684
- {
685
- "epoch": 7.371727748691099,
686
- "grad_norm": 1.352792739868164,
687
- "learning_rate": 7.601e-05,
688
- "loss": 0.3627,
689
- "step": 4400
690
- },
691
- {
692
- "epoch": 7.455497382198953,
693
- "grad_norm": 0.8809813261032104,
694
- "learning_rate": 7.552e-05,
695
- "loss": 0.3647,
696
- "step": 4450
697
- },
698
- {
699
- "epoch": 7.539267015706806,
700
- "grad_norm": 4.0386962890625,
701
- "learning_rate": 7.502e-05,
702
- "loss": 0.3582,
703
- "step": 4500
704
- },
705
- {
706
- "epoch": 7.539267015706806,
707
- "eval_loss": 0.32692766189575195,
708
- "eval_runtime": 272.0854,
709
- "eval_samples_per_second": 31.2,
710
- "eval_steps_per_second": 3.903,
711
- "step": 4500
712
- },
713
- {
714
- "epoch": 7.62303664921466,
715
- "grad_norm": 1.616075873374939,
716
- "learning_rate": 7.452e-05,
717
- "loss": 0.3603,
718
- "step": 4550
719
- },
720
- {
721
- "epoch": 7.706806282722513,
722
- "grad_norm": 2.2668583393096924,
723
- "learning_rate": 7.402e-05,
724
- "loss": 0.3622,
725
- "step": 4600
726
- },
727
- {
728
- "epoch": 7.790575916230367,
729
- "grad_norm": 1.0464789867401123,
730
- "learning_rate": 7.352e-05,
731
- "loss": 0.3667,
732
- "step": 4650
733
- },
734
- {
735
- "epoch": 7.87434554973822,
736
- "grad_norm": 1.2528297901153564,
737
- "learning_rate": 7.302e-05,
738
- "loss": 0.3631,
739
- "step": 4700
740
- },
741
- {
742
- "epoch": 7.958115183246074,
743
- "grad_norm": 1.72895085811615,
744
- "learning_rate": 7.252e-05,
745
- "loss": 0.3567,
746
- "step": 4750
747
- },
748
- {
749
- "epoch": 8.041884816753926,
750
- "grad_norm": 1.5020617246627808,
751
- "learning_rate": 7.202e-05,
752
- "loss": 0.3553,
753
- "step": 4800
754
- },
755
- {
756
- "epoch": 8.12565445026178,
757
- "grad_norm": 1.976888656616211,
758
- "learning_rate": 7.151999999999999e-05,
759
- "loss": 0.3569,
760
- "step": 4850
761
- },
762
- {
763
- "epoch": 8.209424083769633,
764
- "grad_norm": 1.156580924987793,
765
- "learning_rate": 7.102000000000001e-05,
766
- "loss": 0.3659,
767
- "step": 4900
768
- },
769
- {
770
- "epoch": 8.293193717277488,
771
- "grad_norm": 0.9017566442489624,
772
- "learning_rate": 7.052000000000001e-05,
773
- "loss": 0.3549,
774
- "step": 4950
775
- },
776
- {
777
- "epoch": 8.37696335078534,
778
- "grad_norm": 1.5168513059616089,
779
- "learning_rate": 7.002000000000001e-05,
780
- "loss": 0.362,
781
- "step": 5000
782
- },
783
- {
784
- "epoch": 8.37696335078534,
785
- "eval_loss": 0.34056970477104187,
786
- "eval_runtime": 276.7614,
787
- "eval_samples_per_second": 30.673,
788
- "eval_steps_per_second": 3.837,
789
- "step": 5000
790
- },
791
- {
792
- "epoch": 8.460732984293193,
793
- "grad_norm": 1.111985206604004,
794
- "learning_rate": 6.952000000000001e-05,
795
- "loss": 0.3553,
796
- "step": 5050
797
- },
798
- {
799
- "epoch": 8.544502617801047,
800
- "grad_norm": 1.3966108560562134,
801
- "learning_rate": 6.902000000000001e-05,
802
- "loss": 0.3545,
803
- "step": 5100
804
- },
805
- {
806
- "epoch": 8.6282722513089,
807
- "grad_norm": 1.3428140878677368,
808
- "learning_rate": 6.852e-05,
809
- "loss": 0.3609,
810
- "step": 5150
811
- },
812
- {
813
- "epoch": 8.712041884816754,
814
- "grad_norm": 1.9436802864074707,
815
- "learning_rate": 6.802e-05,
816
- "loss": 0.3547,
817
- "step": 5200
818
- },
819
- {
820
- "epoch": 8.795811518324607,
821
- "grad_norm": 1.1481266021728516,
822
- "learning_rate": 6.752e-05,
823
- "loss": 0.3569,
824
- "step": 5250
825
- },
826
- {
827
- "epoch": 8.879581151832461,
828
- "grad_norm": 1.410223364830017,
829
- "learning_rate": 6.702e-05,
830
- "loss": 0.3558,
831
- "step": 5300
832
- },
833
- {
834
- "epoch": 8.963350785340314,
835
- "grad_norm": 1.7548959255218506,
836
- "learning_rate": 6.652000000000001e-05,
837
- "loss": 0.3561,
838
- "step": 5350
839
- },
840
- {
841
- "epoch": 9.047120418848168,
842
- "grad_norm": 1.343935489654541,
843
- "learning_rate": 6.602000000000001e-05,
844
- "loss": 0.3609,
845
- "step": 5400
846
- },
847
- {
848
- "epoch": 9.13089005235602,
849
- "grad_norm": 1.5190401077270508,
850
- "learning_rate": 6.552000000000001e-05,
851
- "loss": 0.3504,
852
- "step": 5450
853
- },
854
- {
855
- "epoch": 9.214659685863875,
856
- "grad_norm": 0.8521016240119934,
857
- "learning_rate": 6.502e-05,
858
- "loss": 0.3521,
859
- "step": 5500
860
- },
861
- {
862
- "epoch": 9.214659685863875,
863
- "eval_loss": 0.3218235671520233,
864
- "eval_runtime": 279.5684,
865
- "eval_samples_per_second": 30.365,
866
- "eval_steps_per_second": 3.799,
867
- "step": 5500
868
- },
869
- {
870
- "epoch": 9.298429319371728,
871
- "grad_norm": 1.0284796953201294,
872
- "learning_rate": 6.452e-05,
873
- "loss": 0.356,
874
- "step": 5550
875
- },
876
- {
877
- "epoch": 9.38219895287958,
878
- "grad_norm": 1.8278234004974365,
879
- "learning_rate": 6.402e-05,
880
- "loss": 0.356,
881
- "step": 5600
882
- },
883
- {
884
- "epoch": 9.465968586387435,
885
- "grad_norm": 0.9208963513374329,
886
- "learning_rate": 6.352e-05,
887
- "loss": 0.3504,
888
- "step": 5650
889
- },
890
- {
891
- "epoch": 9.549738219895287,
892
- "grad_norm": 1.295639991760254,
893
- "learning_rate": 6.302e-05,
894
- "loss": 0.3551,
895
- "step": 5700
896
- },
897
- {
898
- "epoch": 9.633507853403142,
899
- "grad_norm": 0.9757601022720337,
900
- "learning_rate": 6.252e-05,
901
- "loss": 0.3529,
902
- "step": 5750
903
- },
904
- {
905
- "epoch": 9.717277486910994,
906
- "grad_norm": 1.451418399810791,
907
- "learning_rate": 6.202e-05,
908
- "loss": 0.3537,
909
- "step": 5800
910
- },
911
- {
912
- "epoch": 9.801047120418849,
913
- "grad_norm": 2.2001028060913086,
914
- "learning_rate": 6.152e-05,
915
- "loss": 0.3522,
916
- "step": 5850
917
- },
918
- {
919
- "epoch": 9.884816753926701,
920
- "grad_norm": 1.1149827241897583,
921
- "learning_rate": 6.102e-05,
922
- "loss": 0.3472,
923
- "step": 5900
924
- },
925
- {
926
- "epoch": 9.968586387434556,
927
- "grad_norm": 1.4035720825195312,
928
- "learning_rate": 6.0519999999999997e-05,
929
- "loss": 0.3525,
930
- "step": 5950
931
- },
932
- {
933
- "epoch": 10.052356020942408,
934
- "grad_norm": 1.0732487440109253,
935
- "learning_rate": 6.002e-05,
936
- "loss": 0.3485,
937
- "step": 6000
938
- },
939
- {
940
- "epoch": 10.052356020942408,
941
- "eval_loss": 0.31853485107421875,
942
- "eval_runtime": 271.779,
943
- "eval_samples_per_second": 31.235,
944
- "eval_steps_per_second": 3.908,
945
- "step": 6000
946
- },
947
- {
948
- "epoch": 10.136125654450261,
949
- "grad_norm": 1.2576690912246704,
950
- "learning_rate": 5.952e-05,
951
- "loss": 0.3488,
952
- "step": 6050
953
- },
954
- {
955
- "epoch": 10.219895287958115,
956
- "grad_norm": 1.2645186185836792,
957
- "learning_rate": 5.902e-05,
958
- "loss": 0.3537,
959
- "step": 6100
960
- },
961
- {
962
- "epoch": 10.303664921465968,
963
- "grad_norm": 1.743445634841919,
964
- "learning_rate": 5.852000000000001e-05,
965
- "loss": 0.3501,
966
- "step": 6150
967
- },
968
- {
969
- "epoch": 10.387434554973822,
970
- "grad_norm": 1.2827191352844238,
971
- "learning_rate": 5.802000000000001e-05,
972
- "loss": 0.349,
973
- "step": 6200
974
- },
975
- {
976
- "epoch": 10.471204188481675,
977
- "grad_norm": 1.0109118223190308,
978
- "learning_rate": 5.7520000000000005e-05,
979
- "loss": 0.3495,
980
- "step": 6250
981
- },
982
- {
983
- "epoch": 10.55497382198953,
984
- "grad_norm": 1.420745611190796,
985
- "learning_rate": 5.7020000000000006e-05,
986
- "loss": 0.3493,
987
- "step": 6300
988
- },
989
- {
990
- "epoch": 10.638743455497382,
991
- "grad_norm": 1.2105921506881714,
992
- "learning_rate": 5.652000000000001e-05,
993
- "loss": 0.3487,
994
- "step": 6350
995
- },
996
- {
997
- "epoch": 10.722513089005236,
998
- "grad_norm": 1.1536401510238647,
999
- "learning_rate": 5.602000000000001e-05,
1000
- "loss": 0.35,
1001
- "step": 6400
1002
- },
1003
- {
1004
- "epoch": 10.806282722513089,
1005
- "grad_norm": 1.0635104179382324,
1006
- "learning_rate": 5.5520000000000004e-05,
1007
- "loss": 0.3475,
1008
- "step": 6450
1009
- },
1010
- {
1011
- "epoch": 10.890052356020943,
1012
- "grad_norm": 1.4069427251815796,
1013
- "learning_rate": 5.5020000000000005e-05,
1014
- "loss": 0.3472,
1015
- "step": 6500
1016
- },
1017
- {
1018
- "epoch": 10.890052356020943,
1019
- "eval_loss": 0.3199196457862854,
1020
- "eval_runtime": 276.9702,
1021
- "eval_samples_per_second": 30.65,
1022
- "eval_steps_per_second": 3.834,
1023
- "step": 6500
1024
- },
1025
- {
1026
- "epoch": 10.973821989528796,
1027
- "grad_norm": 0.8649620413780212,
1028
- "learning_rate": 5.4520000000000007e-05,
1029
- "loss": 0.3496,
1030
- "step": 6550
1031
- },
1032
- {
1033
- "epoch": 11.057591623036648,
1034
- "grad_norm": 2.6794686317443848,
1035
- "learning_rate": 5.402e-05,
1036
- "loss": 0.3482,
1037
- "step": 6600
1038
- },
1039
- {
1040
- "epoch": 11.141361256544503,
1041
- "grad_norm": 1.6224123239517212,
1042
- "learning_rate": 5.352e-05,
1043
- "loss": 0.3498,
1044
- "step": 6650
1045
- },
1046
- {
1047
- "epoch": 11.225130890052355,
1048
- "grad_norm": 1.2548692226409912,
1049
- "learning_rate": 5.3020000000000004e-05,
1050
- "loss": 0.346,
1051
- "step": 6700
1052
- },
1053
- {
1054
- "epoch": 11.30890052356021,
1055
- "grad_norm": 1.390360713005066,
1056
- "learning_rate": 5.2520000000000005e-05,
1057
- "loss": 0.345,
1058
- "step": 6750
1059
- },
1060
- {
1061
- "epoch": 11.392670157068062,
1062
- "grad_norm": 1.1040029525756836,
1063
- "learning_rate": 5.202e-05,
1064
- "loss": 0.3477,
1065
- "step": 6800
1066
- },
1067
- {
1068
- "epoch": 11.476439790575917,
1069
- "grad_norm": 1.0738588571548462,
1070
- "learning_rate": 5.152e-05,
1071
- "loss": 0.3455,
1072
- "step": 6850
1073
- },
1074
- {
1075
- "epoch": 11.56020942408377,
1076
- "grad_norm": 1.0175799131393433,
1077
- "learning_rate": 5.102e-05,
1078
- "loss": 0.3448,
1079
- "step": 6900
1080
- },
1081
- {
1082
- "epoch": 11.643979057591624,
1083
- "grad_norm": 1.8546490669250488,
1084
- "learning_rate": 5.052e-05,
1085
- "loss": 0.346,
1086
- "step": 6950
1087
- },
1088
- {
1089
- "epoch": 11.727748691099476,
1090
- "grad_norm": 1.7156524658203125,
1091
- "learning_rate": 5.002e-05,
1092
- "loss": 0.3469,
1093
- "step": 7000
1094
- },
1095
- {
1096
- "epoch": 11.727748691099476,
1097
- "eval_loss": 0.31849026679992676,
1098
- "eval_runtime": 283.0428,
1099
- "eval_samples_per_second": 29.992,
1100
- "eval_steps_per_second": 3.752,
1101
- "step": 7000
1102
- },
1103
- {
1104
- "epoch": 11.81151832460733,
1105
- "grad_norm": 1.1094063520431519,
1106
- "learning_rate": 4.952e-05,
1107
- "loss": 0.346,
1108
- "step": 7050
1109
- },
1110
- {
1111
- "epoch": 11.895287958115183,
1112
- "grad_norm": 1.8263230323791504,
1113
- "learning_rate": 4.902e-05,
1114
- "loss": 0.3496,
1115
- "step": 7100
1116
- },
1117
- {
1118
- "epoch": 11.979057591623036,
1119
- "grad_norm": 1.4049593210220337,
1120
- "learning_rate": 4.852e-05,
1121
- "loss": 0.3433,
1122
- "step": 7150
1123
- },
1124
- {
1125
- "epoch": 12.06282722513089,
1126
- "grad_norm": 1.3455963134765625,
1127
- "learning_rate": 4.8030000000000006e-05,
1128
- "loss": 0.3518,
1129
- "step": 7200
1130
- },
1131
- {
1132
- "epoch": 12.146596858638743,
1133
- "grad_norm": 1.174660325050354,
1134
- "learning_rate": 4.753e-05,
1135
- "loss": 0.348,
1136
- "step": 7250
1137
- },
1138
- {
1139
- "epoch": 12.230366492146597,
1140
- "grad_norm": 1.2765902280807495,
1141
- "learning_rate": 4.703e-05,
1142
- "loss": 0.345,
1143
- "step": 7300
1144
- },
1145
- {
1146
- "epoch": 12.31413612565445,
1147
- "grad_norm": 1.419295072555542,
1148
- "learning_rate": 4.6530000000000003e-05,
1149
- "loss": 0.3436,
1150
- "step": 7350
1151
- },
1152
- {
1153
- "epoch": 12.397905759162304,
1154
- "grad_norm": 1.3437247276306152,
1155
- "learning_rate": 4.603e-05,
1156
- "loss": 0.3469,
1157
- "step": 7400
1158
- },
1159
- {
1160
- "epoch": 12.481675392670157,
1161
- "grad_norm": 1.6074751615524292,
1162
- "learning_rate": 4.553e-05,
1163
- "loss": 0.3461,
1164
- "step": 7450
1165
- },
1166
- {
1167
- "epoch": 12.565445026178011,
1168
- "grad_norm": 1.432062029838562,
1169
- "learning_rate": 4.503e-05,
1170
- "loss": 0.3441,
1171
- "step": 7500
1172
- },
1173
- {
1174
- "epoch": 12.565445026178011,
1175
- "eval_loss": 0.3222896158695221,
1176
- "eval_runtime": 282.6486,
1177
- "eval_samples_per_second": 30.034,
1178
- "eval_steps_per_second": 3.757,
1179
- "step": 7500
1180
- },
1181
- {
1182
- "epoch": 12.649214659685864,
1183
- "grad_norm": 1.4210392236709595,
1184
- "learning_rate": 4.453e-05,
1185
- "loss": 0.3436,
1186
- "step": 7550
1187
- },
1188
- {
1189
- "epoch": 12.732984293193716,
1190
- "grad_norm": 1.275467038154602,
1191
- "learning_rate": 4.4030000000000004e-05,
1192
- "loss": 0.3453,
1193
- "step": 7600
1194
- },
1195
- {
1196
- "epoch": 12.81675392670157,
1197
- "grad_norm": 1.1207870244979858,
1198
- "learning_rate": 4.3530000000000005e-05,
1199
- "loss": 0.3438,
1200
- "step": 7650
1201
- },
1202
- {
1203
- "epoch": 12.900523560209423,
1204
- "grad_norm": 1.8535631895065308,
1205
- "learning_rate": 4.3030000000000006e-05,
1206
- "loss": 0.3442,
1207
- "step": 7700
1208
- },
1209
- {
1210
- "epoch": 12.984293193717278,
1211
- "grad_norm": 1.0426372289657593,
1212
- "learning_rate": 4.253e-05,
1213
- "loss": 0.3494,
1214
- "step": 7750
1215
- },
1216
- {
1217
- "epoch": 13.06806282722513,
1218
- "grad_norm": 1.3337020874023438,
1219
- "learning_rate": 4.203e-05,
1220
- "loss": 0.3413,
1221
- "step": 7800
1222
- },
1223
- {
1224
- "epoch": 13.151832460732985,
1225
- "grad_norm": 1.017905592918396,
1226
- "learning_rate": 4.1530000000000004e-05,
1227
- "loss": 0.3417,
1228
- "step": 7850
1229
- },
1230
- {
1231
- "epoch": 13.235602094240837,
1232
- "grad_norm": 1.166343331336975,
1233
- "learning_rate": 4.103e-05,
1234
- "loss": 0.3443,
1235
- "step": 7900
1236
- },
1237
- {
1238
- "epoch": 13.319371727748692,
1239
- "grad_norm": 1.4170418977737427,
1240
- "learning_rate": 4.053e-05,
1241
- "loss": 0.3433,
1242
- "step": 7950
1243
- },
1244
- {
1245
- "epoch": 13.403141361256544,
1246
- "grad_norm": 1.125741720199585,
1247
- "learning_rate": 4.003e-05,
1248
- "loss": 0.3422,
1249
- "step": 8000
1250
- },
1251
- {
1252
- "epoch": 13.403141361256544,
1253
- "eval_loss": 0.31487980484962463,
1254
- "eval_runtime": 278.3852,
1255
- "eval_samples_per_second": 30.494,
1256
- "eval_steps_per_second": 3.815,
1257
- "step": 8000
1258
- },
1259
- {
1260
- "epoch": 13.486910994764397,
1261
- "grad_norm": 1.5452402830123901,
1262
- "learning_rate": 3.953e-05,
1263
- "loss": 0.3403,
1264
- "step": 8050
1265
- },
1266
- {
1267
- "epoch": 13.570680628272251,
1268
- "grad_norm": 0.9096773862838745,
1269
- "learning_rate": 3.903e-05,
1270
- "loss": 0.3409,
1271
- "step": 8100
1272
- },
1273
- {
1274
- "epoch": 13.654450261780104,
1275
- "grad_norm": 1.6249001026153564,
1276
- "learning_rate": 3.853e-05,
1277
- "loss": 0.3414,
1278
- "step": 8150
1279
- },
1280
- {
1281
- "epoch": 13.738219895287958,
1282
- "grad_norm": 0.9276340007781982,
1283
- "learning_rate": 3.803000000000001e-05,
1284
- "loss": 0.3389,
1285
- "step": 8200
1286
- },
1287
- {
1288
- "epoch": 13.821989528795811,
1289
- "grad_norm": 1.7416585683822632,
1290
- "learning_rate": 3.753e-05,
1291
- "loss": 0.343,
1292
- "step": 8250
1293
- },
1294
- {
1295
- "epoch": 13.905759162303665,
1296
- "grad_norm": 2.2160768508911133,
1297
- "learning_rate": 3.703e-05,
1298
- "loss": 0.3402,
1299
- "step": 8300
1300
- },
1301
- {
1302
- "epoch": 13.989528795811518,
1303
- "grad_norm": 1.0885984897613525,
1304
- "learning_rate": 3.6530000000000004e-05,
1305
- "loss": 0.3407,
1306
- "step": 8350
1307
- },
1308
- {
1309
- "epoch": 14.073298429319372,
1310
- "grad_norm": 0.9969326853752136,
1311
- "learning_rate": 3.6030000000000006e-05,
1312
- "loss": 0.3447,
1313
- "step": 8400
1314
- },
1315
- {
1316
- "epoch": 14.157068062827225,
1317
- "grad_norm": 1.2978531122207642,
1318
- "learning_rate": 3.553e-05,
1319
- "loss": 0.3377,
1320
- "step": 8450
1321
- },
1322
- {
1323
- "epoch": 14.24083769633508,
1324
- "grad_norm": 1.0465147495269775,
1325
- "learning_rate": 3.503e-05,
1326
- "loss": 0.3396,
1327
- "step": 8500
1328
- },
1329
- {
1330
- "epoch": 14.24083769633508,
1331
- "eval_loss": 0.310507208108902,
1332
- "eval_runtime": 279.5625,
1333
- "eval_samples_per_second": 30.365,
1334
- "eval_steps_per_second": 3.799,
1335
- "step": 8500
1336
- },
1337
- {
1338
- "epoch": 14.324607329842932,
1339
- "grad_norm": 2.537041425704956,
1340
- "learning_rate": 3.453e-05,
1341
- "loss": 0.3418,
1342
- "step": 8550
1343
- },
1344
- {
1345
- "epoch": 14.408376963350785,
1346
- "grad_norm": 1.3357998132705688,
1347
- "learning_rate": 3.403e-05,
1348
- "loss": 0.3408,
1349
- "step": 8600
1350
- },
1351
- {
1352
- "epoch": 14.492146596858639,
1353
- "grad_norm": 0.8550173044204712,
1354
- "learning_rate": 3.353e-05,
1355
- "loss": 0.3408,
1356
- "step": 8650
1357
- },
1358
- {
1359
- "epoch": 14.575916230366492,
1360
- "grad_norm": 1.4455218315124512,
1361
- "learning_rate": 3.303e-05,
1362
- "loss": 0.3407,
1363
- "step": 8700
1364
- },
1365
- {
1366
- "epoch": 14.659685863874346,
1367
- "grad_norm": 1.0547473430633545,
1368
- "learning_rate": 3.253e-05,
1369
- "loss": 0.3382,
1370
- "step": 8750
1371
- },
1372
- {
1373
- "epoch": 14.743455497382199,
1374
- "grad_norm": 1.5398694276809692,
1375
- "learning_rate": 3.2029999999999997e-05,
1376
- "loss": 0.3402,
1377
- "step": 8800
1378
- },
1379
- {
1380
- "epoch": 14.827225130890053,
1381
- "grad_norm": 1.008465051651001,
1382
- "learning_rate": 3.1530000000000005e-05,
1383
- "loss": 0.3433,
1384
- "step": 8850
1385
- },
1386
- {
1387
- "epoch": 14.910994764397905,
1388
- "grad_norm": 1.8319462537765503,
1389
- "learning_rate": 3.1030000000000006e-05,
1390
- "loss": 0.341,
1391
- "step": 8900
1392
- },
1393
- {
1394
- "epoch": 14.99476439790576,
1395
- "grad_norm": 1.1432167291641235,
1396
- "learning_rate": 3.053e-05,
1397
- "loss": 0.3369,
1398
- "step": 8950
1399
- },
1400
- {
1401
- "epoch": 15.078534031413612,
1402
- "grad_norm": 1.098186731338501,
1403
- "learning_rate": 3.0030000000000002e-05,
1404
- "loss": 0.3396,
1405
- "step": 9000
1406
- },
1407
- {
1408
- "epoch": 15.078534031413612,
1409
- "eval_loss": 0.31039854884147644,
1410
- "eval_runtime": 280.3967,
1411
- "eval_samples_per_second": 30.275,
1412
- "eval_steps_per_second": 3.787,
1413
- "step": 9000
1414
- },
1415
- {
1416
- "epoch": 15.162303664921467,
1417
- "grad_norm": 1.0989015102386475,
1418
- "learning_rate": 2.9530000000000004e-05,
1419
- "loss": 0.3381,
1420
- "step": 9050
1421
- },
1422
- {
1423
- "epoch": 15.24607329842932,
1424
- "grad_norm": 1.1959214210510254,
1425
- "learning_rate": 2.903e-05,
1426
- "loss": 0.3381,
1427
- "step": 9100
1428
- },
1429
- {
1430
- "epoch": 15.329842931937172,
1431
- "grad_norm": 0.9721996188163757,
1432
- "learning_rate": 2.853e-05,
1433
- "loss": 0.3384,
1434
- "step": 9150
1435
- },
1436
- {
1437
- "epoch": 15.413612565445026,
1438
- "grad_norm": 1.2921016216278076,
1439
- "learning_rate": 2.803e-05,
1440
- "loss": 0.3375,
1441
- "step": 9200
1442
- },
1443
- {
1444
- "epoch": 15.497382198952879,
1445
- "grad_norm": 1.1854231357574463,
1446
- "learning_rate": 2.753e-05,
1447
- "loss": 0.3389,
1448
- "step": 9250
1449
- },
1450
- {
1451
- "epoch": 15.581151832460733,
1452
- "grad_norm": 1.571321725845337,
1453
- "learning_rate": 2.703e-05,
1454
- "loss": 0.3406,
1455
- "step": 9300
1456
- },
1457
- {
1458
- "epoch": 15.664921465968586,
1459
- "grad_norm": 1.2595016956329346,
1460
- "learning_rate": 2.6540000000000003e-05,
1461
- "loss": 0.3392,
1462
- "step": 9350
1463
- },
1464
- {
1465
- "epoch": 15.74869109947644,
1466
- "grad_norm": 1.2291969060897827,
1467
- "learning_rate": 2.6040000000000005e-05,
1468
- "loss": 0.3362,
1469
- "step": 9400
1470
- },
1471
- {
1472
- "epoch": 15.832460732984293,
1473
- "grad_norm": 1.0605494976043701,
1474
- "learning_rate": 2.5540000000000003e-05,
1475
- "loss": 0.3388,
1476
- "step": 9450
1477
- },
1478
- {
1479
- "epoch": 15.916230366492147,
1480
- "grad_norm": 0.9927255511283875,
1481
- "learning_rate": 2.504e-05,
1482
- "loss": 0.3391,
1483
- "step": 9500
1484
- },
1485
- {
1486
- "epoch": 15.916230366492147,
1487
- "eval_loss": 0.3102165162563324,
1488
- "eval_runtime": 279.552,
1489
- "eval_samples_per_second": 30.366,
1490
- "eval_steps_per_second": 3.799,
1491
- "step": 9500
1492
  }
1493
  ],
1494
  "logging_steps": 50,
1495
- "max_steps": 12000,
1496
  "num_input_tokens_seen": 0,
1497
- "num_train_epochs": 21,
1498
  "save_steps": 500,
1499
  "stateful_callbacks": {
1500
  "TrainerControl": {
@@ -1508,7 +104,7 @@
1508
  "attributes": {}
1509
  }
1510
  },
1511
- "total_flos": 1.7021322045447034e+17,
1512
  "train_batch_size": 16,
1513
  "trial_name": null,
1514
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.4566049873828888,
3
+ "best_model_checkpoint": "mikhail_panzo/zlm_b128_le4_s8000/checkpoint-500",
4
+ "epoch": 0.837696335078534,
5
  "eval_steps": 500,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
81
  {
82
  "epoch": 0.837696335078534,
83
  "eval_loss": 0.4566049873828888,
84
+ "eval_runtime": 281.3712,
85
+ "eval_samples_per_second": 30.17,
86
+ "eval_steps_per_second": 3.774,
87
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  }
89
  ],
90
  "logging_steps": 50,
91
+ "max_steps": 8000,
92
  "num_input_tokens_seen": 0,
93
+ "num_train_epochs": 14,
94
  "save_steps": 500,
95
  "stateful_callbacks": {
96
  "TrainerControl": {
 
104
  "attributes": {}
105
  }
106
  },
107
+ "total_flos": 8963407491426432.0,
108
  "train_batch_size": 16,
109
  "trial_name": null,
110
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a94a4f6f1d582f4a7bb7ca6a113897e919df3fe53304bd598160a40931f24f6
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11675416f8a34c5963cafc78c11d51d2aedc5632f839698999d98e8c1dadbc99
3
  size 5304