jdannem6 committed on
Commit
36707c4
1 Parent(s): 0588207

Uploaded checkpoint-30000

Files changed (5)
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1793 -3
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:40a3f22374b10f1dacc2052bbe1eeb9d3ec51c4d0215210d58e218ca693293da
+ oid sha256:7407eb7257531b614b61c8d1d84d4fdb719b07a14e72e981f4319bd3b34e5f33
 size 2692969128
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:dae1cc318abadbe97655b2e89cba3a93d2fadb650d801744bdfc7c6a4ccca5c0
+ oid sha256:818910f673e75761c43fb2f9eda548ec1856808990050a5beae905ebe0e58a55
 size 5386075202
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:38fbaa4aaa427747240a6b65afd267f1edcb968fa67c3bf21b881737ad1b8da3
+ oid sha256:7f2917558eb213e594a8e2999ee1e91aeb27af45c09cae9c3a3a202b99cef622
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:30c1646ffd4f2e4e86a7c5c87af0949f3be46b7539d2a0137b1bb01bf3e8bbe5
+ oid sha256:992f92f28f32913b10be7c822cdebf0abc4183b682b29981995e05a2a28bcd4d
 size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": 0.6027244925498962,
 "best_model_checkpoint": "runs/deepseek_CMU-AIR2/math-deepseek_FULL_HardArith_Interm_20240424-065814/checkpoint-5000",
- "epoch": 0.1875,
+ "epoch": 0.25,
 "eval_steps": 500,
- "global_step": 7500,
+ "global_step": 10000,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -5377,6 +5377,1796 @@
 "eval_samples_per_second": 26.2,
 "eval_steps_per_second": 26.2,
 "step": 7500
5380
+ },
5381
+ {
5382
+ "epoch": 0.19,
5383
+ "grad_norm": 4.6875,
5384
+ "learning_rate": 5.242105263157895e-06,
5385
+ "loss": 0.5938,
5386
+ "step": 7510
5387
+ },
5388
+ {
5389
+ "epoch": 0.19,
5390
+ "grad_norm": 5.0625,
5391
+ "learning_rate": 5.2210526315789475e-06,
5392
+ "loss": 0.6286,
5393
+ "step": 7520
5394
+ },
5395
+ {
5396
+ "epoch": 0.19,
5397
+ "grad_norm": 4.53125,
5398
+ "learning_rate": 5.2e-06,
5399
+ "loss": 0.7065,
5400
+ "step": 7530
5401
+ },
5402
+ {
5403
+ "epoch": 0.19,
5404
+ "grad_norm": 3.28125,
5405
+ "learning_rate": 5.178947368421054e-06,
5406
+ "loss": 0.5862,
5407
+ "step": 7540
5408
+ },
5409
+ {
5410
+ "epoch": 0.19,
5411
+ "grad_norm": 4.5,
5412
+ "learning_rate": 5.157894736842106e-06,
5413
+ "loss": 0.5015,
5414
+ "step": 7550
5415
+ },
5416
+ {
5417
+ "epoch": 0.19,
5418
+ "grad_norm": 2.640625,
5419
+ "learning_rate": 5.136842105263158e-06,
5420
+ "loss": 0.6041,
5421
+ "step": 7560
5422
+ },
5423
+ {
5424
+ "epoch": 0.19,
5425
+ "grad_norm": 5.59375,
5426
+ "learning_rate": 5.115789473684211e-06,
5427
+ "loss": 0.6845,
5428
+ "step": 7570
5429
+ },
5430
+ {
5431
+ "epoch": 0.19,
5432
+ "grad_norm": 6.59375,
5433
+ "learning_rate": 5.0947368421052635e-06,
5434
+ "loss": 0.698,
5435
+ "step": 7580
5436
+ },
5437
+ {
5438
+ "epoch": 0.19,
5439
+ "grad_norm": 9.8125,
5440
+ "learning_rate": 5.073684210526316e-06,
5441
+ "loss": 0.4452,
5442
+ "step": 7590
5443
+ },
5444
+ {
5445
+ "epoch": 0.19,
5446
+ "grad_norm": 3.1875,
5447
+ "learning_rate": 5.052631578947369e-06,
5448
+ "loss": 0.6485,
5449
+ "step": 7600
5450
+ },
5451
+ {
5452
+ "epoch": 0.19,
5453
+ "grad_norm": 5.8125,
5454
+ "learning_rate": 5.0315789473684214e-06,
5455
+ "loss": 0.5703,
5456
+ "step": 7610
5457
+ },
5458
+ {
5459
+ "epoch": 0.19,
5460
+ "grad_norm": 3.953125,
5461
+ "learning_rate": 5.010526315789475e-06,
5462
+ "loss": 0.5871,
5463
+ "step": 7620
5464
+ },
5465
+ {
5466
+ "epoch": 0.19,
5467
+ "grad_norm": 5.5625,
5468
+ "learning_rate": 4.989473684210527e-06,
5469
+ "loss": 0.6024,
5470
+ "step": 7630
5471
+ },
5472
+ {
5473
+ "epoch": 0.19,
5474
+ "grad_norm": 2.9375,
5475
+ "learning_rate": 4.968421052631579e-06,
5476
+ "loss": 0.5723,
5477
+ "step": 7640
5478
+ },
5479
+ {
5480
+ "epoch": 0.19,
5481
+ "grad_norm": 3.15625,
5482
+ "learning_rate": 4.947368421052632e-06,
5483
+ "loss": 0.584,
5484
+ "step": 7650
5485
+ },
5486
+ {
5487
+ "epoch": 0.19,
5488
+ "grad_norm": 8.4375,
5489
+ "learning_rate": 4.926315789473685e-06,
5490
+ "loss": 0.6387,
5491
+ "step": 7660
5492
+ },
5493
+ {
5494
+ "epoch": 0.19,
5495
+ "grad_norm": 4.34375,
5496
+ "learning_rate": 4.905263157894737e-06,
5497
+ "loss": 0.6574,
5498
+ "step": 7670
5499
+ },
5500
+ {
5501
+ "epoch": 0.19,
5502
+ "grad_norm": 3.578125,
5503
+ "learning_rate": 4.88421052631579e-06,
5504
+ "loss": 0.5907,
5505
+ "step": 7680
5506
+ },
5507
+ {
5508
+ "epoch": 0.19,
5509
+ "grad_norm": 4.5,
5510
+ "learning_rate": 4.863157894736843e-06,
5511
+ "loss": 0.6319,
5512
+ "step": 7690
5513
+ },
5514
+ {
5515
+ "epoch": 0.19,
5516
+ "grad_norm": 3.234375,
5517
+ "learning_rate": 4.842105263157895e-06,
5518
+ "loss": 0.5053,
5519
+ "step": 7700
5520
+ },
5521
+ {
5522
+ "epoch": 0.19,
5523
+ "grad_norm": 1.7421875,
5524
+ "learning_rate": 4.821052631578948e-06,
5525
+ "loss": 0.5135,
5526
+ "step": 7710
5527
+ },
5528
+ {
5529
+ "epoch": 0.19,
5530
+ "grad_norm": 17.375,
5531
+ "learning_rate": 4.800000000000001e-06,
5532
+ "loss": 0.6893,
5533
+ "step": 7720
5534
+ },
5535
+ {
5536
+ "epoch": 0.19,
5537
+ "grad_norm": 3.609375,
5538
+ "learning_rate": 4.778947368421053e-06,
5539
+ "loss": 0.6257,
5540
+ "step": 7730
5541
+ },
5542
+ {
5543
+ "epoch": 0.19,
5544
+ "grad_norm": 4.46875,
5545
+ "learning_rate": 4.757894736842106e-06,
5546
+ "loss": 0.5808,
5547
+ "step": 7740
5548
+ },
5549
+ {
5550
+ "epoch": 0.19,
5551
+ "grad_norm": 3.984375,
5552
+ "learning_rate": 4.736842105263158e-06,
5553
+ "loss": 0.518,
5554
+ "step": 7750
5555
+ },
5556
+ {
5557
+ "epoch": 0.19,
5558
+ "grad_norm": 5.90625,
5559
+ "learning_rate": 4.71578947368421e-06,
5560
+ "loss": 0.5991,
5561
+ "step": 7760
5562
+ },
5563
+ {
5564
+ "epoch": 0.19,
5565
+ "grad_norm": 4.71875,
5566
+ "learning_rate": 4.694736842105264e-06,
5567
+ "loss": 0.6514,
5568
+ "step": 7770
5569
+ },
5570
+ {
5571
+ "epoch": 0.19,
5572
+ "grad_norm": 3.109375,
5573
+ "learning_rate": 4.6736842105263166e-06,
5574
+ "loss": 0.489,
5575
+ "step": 7780
5576
+ },
5577
+ {
5578
+ "epoch": 0.19,
5579
+ "grad_norm": 3.609375,
5580
+ "learning_rate": 4.652631578947368e-06,
5581
+ "loss": 0.5181,
5582
+ "step": 7790
5583
+ },
5584
+ {
5585
+ "epoch": 0.2,
5586
+ "grad_norm": 2.78125,
5587
+ "learning_rate": 4.631578947368421e-06,
5588
+ "loss": 0.5767,
5589
+ "step": 7800
5590
+ },
5591
+ {
5592
+ "epoch": 0.2,
5593
+ "grad_norm": 3.171875,
5594
+ "learning_rate": 4.6105263157894745e-06,
5595
+ "loss": 0.4968,
5596
+ "step": 7810
5597
+ },
5598
+ {
5599
+ "epoch": 0.2,
5600
+ "grad_norm": 4.8125,
5601
+ "learning_rate": 4.589473684210526e-06,
5602
+ "loss": 0.6562,
5603
+ "step": 7820
5604
+ },
5605
+ {
5606
+ "epoch": 0.2,
5607
+ "grad_norm": 2.703125,
5608
+ "learning_rate": 4.568421052631579e-06,
5609
+ "loss": 0.6273,
5610
+ "step": 7830
5611
+ },
5612
+ {
5613
+ "epoch": 0.2,
5614
+ "grad_norm": 4.75,
5615
+ "learning_rate": 4.547368421052632e-06,
5616
+ "loss": 0.6351,
5617
+ "step": 7840
5618
+ },
5619
+ {
5620
+ "epoch": 0.2,
5621
+ "grad_norm": 2.9375,
5622
+ "learning_rate": 4.526315789473685e-06,
5623
+ "loss": 0.5495,
5624
+ "step": 7850
5625
+ },
5626
+ {
5627
+ "epoch": 0.2,
5628
+ "grad_norm": 10.6875,
5629
+ "learning_rate": 4.505263157894737e-06,
5630
+ "loss": 0.6603,
5631
+ "step": 7860
5632
+ },
5633
+ {
5634
+ "epoch": 0.2,
5635
+ "grad_norm": 4.25,
5636
+ "learning_rate": 4.48421052631579e-06,
5637
+ "loss": 0.5525,
5638
+ "step": 7870
5639
+ },
5640
+ {
5641
+ "epoch": 0.2,
5642
+ "grad_norm": 3.171875,
5643
+ "learning_rate": 4.463157894736842e-06,
5644
+ "loss": 0.6219,
5645
+ "step": 7880
5646
+ },
5647
+ {
5648
+ "epoch": 0.2,
5649
+ "grad_norm": 2.859375,
5650
+ "learning_rate": 4.442105263157896e-06,
5651
+ "loss": 0.5518,
5652
+ "step": 7890
5653
+ },
5654
+ {
5655
+ "epoch": 0.2,
5656
+ "grad_norm": 3.109375,
5657
+ "learning_rate": 4.4210526315789476e-06,
5658
+ "loss": 0.5462,
5659
+ "step": 7900
5660
+ },
5661
+ {
5662
+ "epoch": 0.2,
5663
+ "grad_norm": 3.359375,
5664
+ "learning_rate": 4.4e-06,
5665
+ "loss": 0.5776,
5666
+ "step": 7910
5667
+ },
5668
+ {
5669
+ "epoch": 0.2,
5670
+ "grad_norm": 4.4375,
5671
+ "learning_rate": 4.378947368421053e-06,
5672
+ "loss": 0.4804,
5673
+ "step": 7920
5674
+ },
5675
+ {
5676
+ "epoch": 0.2,
5677
+ "grad_norm": 3.578125,
5678
+ "learning_rate": 4.3578947368421055e-06,
5679
+ "loss": 0.5858,
5680
+ "step": 7930
5681
+ },
5682
+ {
5683
+ "epoch": 0.2,
5684
+ "grad_norm": 3.71875,
5685
+ "learning_rate": 4.336842105263158e-06,
5686
+ "loss": 0.6847,
5687
+ "step": 7940
5688
+ },
5689
+ {
5690
+ "epoch": 0.2,
5691
+ "grad_norm": 3.984375,
5692
+ "learning_rate": 4.315789473684211e-06,
5693
+ "loss": 0.6626,
5694
+ "step": 7950
5695
+ },
5696
+ {
5697
+ "epoch": 0.2,
5698
+ "grad_norm": 9.4375,
5699
+ "learning_rate": 4.2947368421052635e-06,
5700
+ "loss": 0.5526,
5701
+ "step": 7960
5702
+ },
5703
+ {
5704
+ "epoch": 0.2,
5705
+ "grad_norm": 4.625,
5706
+ "learning_rate": 4.273684210526316e-06,
5707
+ "loss": 0.5661,
5708
+ "step": 7970
5709
+ },
5710
+ {
5711
+ "epoch": 0.2,
5712
+ "grad_norm": 4.28125,
5713
+ "learning_rate": 4.252631578947369e-06,
5714
+ "loss": 0.6182,
5715
+ "step": 7980
5716
+ },
5717
+ {
5718
+ "epoch": 0.2,
5719
+ "grad_norm": 6.3125,
5720
+ "learning_rate": 4.2315789473684215e-06,
5721
+ "loss": 0.629,
5722
+ "step": 7990
5723
+ },
5724
+ {
5725
+ "epoch": 0.2,
5726
+ "grad_norm": 3.59375,
5727
+ "learning_rate": 4.210526315789474e-06,
5728
+ "loss": 0.7233,
5729
+ "step": 8000
5730
+ },
5731
+ {
5732
+ "epoch": 0.2,
5733
+ "eval_loss": 0.6029936671257019,
5734
+ "eval_runtime": 38.1479,
5735
+ "eval_samples_per_second": 26.214,
5736
+ "eval_steps_per_second": 26.214,
5737
+ "step": 8000
5738
+ },
5739
+ {
5740
+ "epoch": 0.2,
5741
+ "grad_norm": 2.828125,
5742
+ "learning_rate": 4.189473684210527e-06,
5743
+ "loss": 0.5451,
5744
+ "step": 8010
5745
+ },
5746
+ {
5747
+ "epoch": 0.2,
5748
+ "grad_norm": 5.0,
5749
+ "learning_rate": 4.1684210526315794e-06,
5750
+ "loss": 0.5498,
5751
+ "step": 8020
5752
+ },
5753
+ {
5754
+ "epoch": 0.2,
5755
+ "grad_norm": 4.875,
5756
+ "learning_rate": 4.147368421052632e-06,
5757
+ "loss": 0.5518,
5758
+ "step": 8030
5759
+ },
5760
+ {
5761
+ "epoch": 0.2,
5762
+ "grad_norm": 3.609375,
5763
+ "learning_rate": 4.126315789473685e-06,
5764
+ "loss": 0.6088,
5765
+ "step": 8040
5766
+ },
5767
+ {
5768
+ "epoch": 0.2,
5769
+ "grad_norm": 41.0,
5770
+ "learning_rate": 4.105263157894737e-06,
5771
+ "loss": 0.6399,
5772
+ "step": 8050
5773
+ },
5774
+ {
5775
+ "epoch": 0.2,
5776
+ "grad_norm": 3.578125,
5777
+ "learning_rate": 4.08421052631579e-06,
5778
+ "loss": 0.5412,
5779
+ "step": 8060
5780
+ },
5781
+ {
5782
+ "epoch": 0.2,
5783
+ "grad_norm": 4.21875,
5784
+ "learning_rate": 4.063157894736842e-06,
5785
+ "loss": 0.4933,
5786
+ "step": 8070
5787
+ },
5788
+ {
5789
+ "epoch": 0.2,
5790
+ "grad_norm": 3.234375,
5791
+ "learning_rate": 4.042105263157895e-06,
5792
+ "loss": 0.6182,
5793
+ "step": 8080
5794
+ },
5795
+ {
5796
+ "epoch": 0.2,
5797
+ "grad_norm": 2.953125,
5798
+ "learning_rate": 4.021052631578948e-06,
5799
+ "loss": 0.5342,
5800
+ "step": 8090
5801
+ },
5802
+ {
5803
+ "epoch": 0.2,
5804
+ "grad_norm": 1.6328125,
5805
+ "learning_rate": 4.000000000000001e-06,
5806
+ "loss": 0.5838,
5807
+ "step": 8100
5808
+ },
5809
+ {
5810
+ "epoch": 0.2,
5811
+ "grad_norm": 2.40625,
5812
+ "learning_rate": 3.9789473684210525e-06,
5813
+ "loss": 0.6717,
5814
+ "step": 8110
5815
+ },
5816
+ {
5817
+ "epoch": 0.2,
5818
+ "grad_norm": 7.0,
5819
+ "learning_rate": 3.957894736842106e-06,
5820
+ "loss": 0.6484,
5821
+ "step": 8120
5822
+ },
5823
+ {
5824
+ "epoch": 0.2,
5825
+ "grad_norm": 10.4375,
5826
+ "learning_rate": 3.936842105263159e-06,
5827
+ "loss": 0.6253,
5828
+ "step": 8130
5829
+ },
5830
+ {
5831
+ "epoch": 0.2,
5832
+ "grad_norm": 3.078125,
5833
+ "learning_rate": 3.9157894736842104e-06,
5834
+ "loss": 0.5562,
5835
+ "step": 8140
5836
+ },
5837
+ {
5838
+ "epoch": 0.2,
5839
+ "grad_norm": 5.4375,
5840
+ "learning_rate": 3.894736842105263e-06,
5841
+ "loss": 0.6162,
5842
+ "step": 8150
5843
+ },
5844
+ {
5845
+ "epoch": 0.2,
5846
+ "grad_norm": 2.328125,
5847
+ "learning_rate": 3.873684210526316e-06,
5848
+ "loss": 0.5602,
5849
+ "step": 8160
5850
+ },
5851
+ {
5852
+ "epoch": 0.2,
5853
+ "grad_norm": 6.78125,
5854
+ "learning_rate": 3.852631578947369e-06,
5855
+ "loss": 0.5414,
5856
+ "step": 8170
5857
+ },
5858
+ {
5859
+ "epoch": 0.2,
5860
+ "grad_norm": 4.40625,
5861
+ "learning_rate": 3.831578947368421e-06,
5862
+ "loss": 0.5783,
5863
+ "step": 8180
5864
+ },
5865
+ {
5866
+ "epoch": 0.2,
5867
+ "grad_norm": 3.46875,
5868
+ "learning_rate": 3.810526315789474e-06,
5869
+ "loss": 0.583,
5870
+ "step": 8190
5871
+ },
5872
+ {
5873
+ "epoch": 0.2,
5874
+ "grad_norm": 6.5625,
5875
+ "learning_rate": 3.789473684210527e-06,
5876
+ "loss": 0.632,
5877
+ "step": 8200
5878
+ },
5879
+ {
5880
+ "epoch": 0.21,
5881
+ "grad_norm": 3.625,
5882
+ "learning_rate": 3.768421052631579e-06,
5883
+ "loss": 0.5573,
5884
+ "step": 8210
5885
+ },
5886
+ {
5887
+ "epoch": 0.21,
5888
+ "grad_norm": 3.578125,
5889
+ "learning_rate": 3.7473684210526317e-06,
5890
+ "loss": 0.5176,
5891
+ "step": 8220
5892
+ },
5893
+ {
5894
+ "epoch": 0.21,
5895
+ "grad_norm": 2.796875,
5896
+ "learning_rate": 3.7263157894736848e-06,
5897
+ "loss": 0.4684,
5898
+ "step": 8230
5899
+ },
5900
+ {
5901
+ "epoch": 0.21,
5902
+ "grad_norm": 6.71875,
5903
+ "learning_rate": 3.7052631578947374e-06,
5904
+ "loss": 0.6114,
5905
+ "step": 8240
5906
+ },
5907
+ {
5908
+ "epoch": 0.21,
5909
+ "grad_norm": 5.5625,
5910
+ "learning_rate": 3.6842105263157896e-06,
5911
+ "loss": 0.5437,
5912
+ "step": 8250
5913
+ },
5914
+ {
5915
+ "epoch": 0.21,
5916
+ "grad_norm": 4.53125,
5917
+ "learning_rate": 3.6631578947368423e-06,
5918
+ "loss": 0.5119,
5919
+ "step": 8260
5920
+ },
5921
+ {
5922
+ "epoch": 0.21,
5923
+ "grad_norm": 4.21875,
5924
+ "learning_rate": 3.642105263157895e-06,
5925
+ "loss": 0.5742,
5926
+ "step": 8270
5927
+ },
5928
+ {
5929
+ "epoch": 0.21,
5930
+ "grad_norm": 5.46875,
5931
+ "learning_rate": 3.621052631578948e-06,
5932
+ "loss": 0.6704,
5933
+ "step": 8280
5934
+ },
5935
+ {
5936
+ "epoch": 0.21,
5937
+ "grad_norm": 3.25,
5938
+ "learning_rate": 3.6000000000000003e-06,
5939
+ "loss": 0.5655,
5940
+ "step": 8290
5941
+ },
5942
+ {
5943
+ "epoch": 0.21,
5944
+ "grad_norm": 3.15625,
5945
+ "learning_rate": 3.578947368421053e-06,
5946
+ "loss": 0.5382,
5947
+ "step": 8300
5948
+ },
5949
+ {
5950
+ "epoch": 0.21,
5951
+ "grad_norm": 6.875,
5952
+ "learning_rate": 3.5578947368421056e-06,
5953
+ "loss": 0.5485,
5954
+ "step": 8310
5955
+ },
5956
+ {
5957
+ "epoch": 0.21,
5958
+ "grad_norm": 7.3125,
5959
+ "learning_rate": 3.536842105263158e-06,
5960
+ "loss": 0.5124,
5961
+ "step": 8320
5962
+ },
5963
+ {
5964
+ "epoch": 0.21,
5965
+ "grad_norm": 4.625,
5966
+ "learning_rate": 3.515789473684211e-06,
5967
+ "loss": 0.4819,
5968
+ "step": 8330
5969
+ },
5970
+ {
5971
+ "epoch": 0.21,
5972
+ "grad_norm": 6.53125,
5973
+ "learning_rate": 3.4947368421052635e-06,
5974
+ "loss": 0.5999,
5975
+ "step": 8340
5976
+ },
5977
+ {
5978
+ "epoch": 0.21,
5979
+ "grad_norm": 4.3125,
5980
+ "learning_rate": 3.473684210526316e-06,
5981
+ "loss": 0.5241,
5982
+ "step": 8350
5983
+ },
5984
+ {
5985
+ "epoch": 0.21,
5986
+ "grad_norm": 7.78125,
5987
+ "learning_rate": 3.4526315789473684e-06,
5988
+ "loss": 0.6193,
5989
+ "step": 8360
5990
+ },
5991
+ {
5992
+ "epoch": 0.21,
5993
+ "grad_norm": 5.09375,
5994
+ "learning_rate": 3.4315789473684215e-06,
5995
+ "loss": 0.6701,
5996
+ "step": 8370
5997
+ },
5998
+ {
5999
+ "epoch": 0.21,
6000
+ "grad_norm": 2.453125,
6001
+ "learning_rate": 3.410526315789474e-06,
6002
+ "loss": 0.4458,
6003
+ "step": 8380
6004
+ },
6005
+ {
6006
+ "epoch": 0.21,
6007
+ "grad_norm": 2.53125,
6008
+ "learning_rate": 3.3894736842105264e-06,
6009
+ "loss": 0.5441,
6010
+ "step": 8390
6011
+ },
6012
+ {
6013
+ "epoch": 0.21,
6014
+ "grad_norm": 3.71875,
6015
+ "learning_rate": 3.368421052631579e-06,
6016
+ "loss": 0.5787,
6017
+ "step": 8400
6018
+ },
6019
+ {
6020
+ "epoch": 0.21,
6021
+ "grad_norm": 5.1875,
6022
+ "learning_rate": 3.347368421052632e-06,
6023
+ "loss": 0.5972,
6024
+ "step": 8410
6025
+ },
6026
+ {
6027
+ "epoch": 0.21,
6028
+ "grad_norm": 3.828125,
6029
+ "learning_rate": 3.3263157894736848e-06,
6030
+ "loss": 0.5784,
6031
+ "step": 8420
6032
+ },
6033
+ {
6034
+ "epoch": 0.21,
6035
+ "grad_norm": 2.40625,
6036
+ "learning_rate": 3.305263157894737e-06,
6037
+ "loss": 0.6005,
6038
+ "step": 8430
6039
+ },
6040
+ {
6041
+ "epoch": 0.21,
6042
+ "grad_norm": 4.875,
6043
+ "learning_rate": 3.2842105263157897e-06,
6044
+ "loss": 0.5343,
6045
+ "step": 8440
6046
+ },
6047
+ {
6048
+ "epoch": 0.21,
6049
+ "grad_norm": 3.0,
6050
+ "learning_rate": 3.2631578947368423e-06,
6051
+ "loss": 0.5407,
6052
+ "step": 8450
6053
+ },
6054
+ {
6055
+ "epoch": 0.21,
6056
+ "grad_norm": 3.234375,
6057
+ "learning_rate": 3.2421052631578945e-06,
6058
+ "loss": 0.5047,
6059
+ "step": 8460
6060
+ },
6061
+ {
6062
+ "epoch": 0.21,
6063
+ "grad_norm": 5.6875,
6064
+ "learning_rate": 3.2210526315789476e-06,
6065
+ "loss": 0.4705,
6066
+ "step": 8470
6067
+ },
6068
+ {
6069
+ "epoch": 0.21,
6070
+ "grad_norm": 2.515625,
6071
+ "learning_rate": 3.2000000000000003e-06,
6072
+ "loss": 0.5838,
6073
+ "step": 8480
6074
+ },
6075
+ {
6076
+ "epoch": 0.21,
6077
+ "grad_norm": 3.453125,
6078
+ "learning_rate": 3.178947368421053e-06,
6079
+ "loss": 0.547,
6080
+ "step": 8490
6081
+ },
6082
+ {
6083
+ "epoch": 0.21,
6084
+ "grad_norm": 5.0625,
6085
+ "learning_rate": 3.157894736842105e-06,
6086
+ "loss": 0.6311,
6087
+ "step": 8500
6088
+ },
6089
+ {
6090
+ "epoch": 0.21,
6091
+ "eval_loss": 0.6170388460159302,
6092
+ "eval_runtime": 38.1693,
6093
+ "eval_samples_per_second": 26.199,
6094
+ "eval_steps_per_second": 26.199,
6095
+ "step": 8500
6096
+ },
6097
+ {
6098
+ "epoch": 0.21,
6099
+ "grad_norm": 5.96875,
6100
+ "learning_rate": 3.1368421052631582e-06,
6101
+ "loss": 0.6415,
6102
+ "step": 8510
6103
+ },
6104
+ {
6105
+ "epoch": 0.21,
6106
+ "grad_norm": 3.484375,
6107
+ "learning_rate": 3.115789473684211e-06,
6108
+ "loss": 0.5877,
6109
+ "step": 8520
6110
+ },
6111
+ {
6112
+ "epoch": 0.21,
6113
+ "grad_norm": 3.8125,
6114
+ "learning_rate": 3.094736842105263e-06,
6115
+ "loss": 0.5951,
6116
+ "step": 8530
6117
+ },
6118
+ {
6119
+ "epoch": 0.21,
6120
+ "grad_norm": 4.0625,
6121
+ "learning_rate": 3.0736842105263158e-06,
6122
+ "loss": 0.6047,
6123
+ "step": 8540
6124
+ },
6125
+ {
6126
+ "epoch": 0.21,
6127
+ "grad_norm": 2.609375,
6128
+ "learning_rate": 3.052631578947369e-06,
6129
+ "loss": 0.5694,
6130
+ "step": 8550
6131
+ },
6132
+ {
6133
+ "epoch": 0.21,
6134
+ "grad_norm": 4.0625,
6135
+ "learning_rate": 3.0315789473684215e-06,
6136
+ "loss": 0.6126,
6137
+ "step": 8560
6138
+ },
6139
+ {
6140
+ "epoch": 0.21,
6141
+ "grad_norm": 2.75,
6142
+ "learning_rate": 3.0105263157894737e-06,
6143
+ "loss": 0.521,
6144
+ "step": 8570
6145
+ },
6146
+ {
6147
+ "epoch": 0.21,
6148
+ "grad_norm": 4.46875,
6149
+ "learning_rate": 2.9894736842105264e-06,
6150
+ "loss": 0.6389,
6151
+ "step": 8580
6152
+ },
6153
+ {
6154
+ "epoch": 0.21,
6155
+ "grad_norm": 4.75,
6156
+ "learning_rate": 2.9684210526315795e-06,
6157
+ "loss": 0.5909,
6158
+ "step": 8590
6159
+ },
6160
+ {
6161
+ "epoch": 0.21,
6162
+ "grad_norm": 4.5625,
6163
+ "learning_rate": 2.9473684210526317e-06,
6164
+ "loss": 0.6578,
6165
+ "step": 8600
6166
+ },
6167
+ {
6168
+ "epoch": 0.22,
6169
+ "grad_norm": 6.0,
6170
+ "learning_rate": 2.9263157894736844e-06,
6171
+ "loss": 0.5277,
6172
+ "step": 8610
6173
+ },
6174
+ {
6175
+ "epoch": 0.22,
6176
+ "grad_norm": 2.71875,
6177
+ "learning_rate": 2.905263157894737e-06,
6178
+ "loss": 0.5313,
6179
+ "step": 8620
6180
+ },
6181
+ {
6182
+ "epoch": 0.22,
6183
+ "grad_norm": 3.34375,
6184
+ "learning_rate": 2.88421052631579e-06,
6185
+ "loss": 0.5201,
6186
+ "step": 8630
6187
+ },
6188
+ {
6189
+ "epoch": 0.22,
6190
+ "grad_norm": 4.0,
6191
+ "learning_rate": 2.8631578947368423e-06,
6192
+ "loss": 0.7751,
6193
+ "step": 8640
6194
+ },
6195
+ {
6196
+ "epoch": 0.22,
6197
+ "grad_norm": 2.546875,
6198
+ "learning_rate": 2.842105263157895e-06,
6199
+ "loss": 0.6536,
6200
+ "step": 8650
6201
+ },
6202
+ {
6203
+ "epoch": 0.22,
6204
+ "grad_norm": 6.34375,
6205
+ "learning_rate": 2.8210526315789476e-06,
6206
+ "loss": 0.6661,
6207
+ "step": 8660
6208
+ },
6209
+ {
6210
+ "epoch": 0.22,
6211
+ "grad_norm": 4.65625,
6212
+ "learning_rate": 2.8000000000000003e-06,
6213
+ "loss": 0.6284,
6214
+ "step": 8670
6215
+ },
6216
+ {
6217
+ "epoch": 0.22,
6218
+ "grad_norm": 5.5625,
6219
+ "learning_rate": 2.7789473684210525e-06,
6220
+ "loss": 0.673,
6221
+ "step": 8680
6222
+ },
6223
+ {
6224
+ "epoch": 0.22,
6225
+ "grad_norm": 3.03125,
6226
+ "learning_rate": 2.7578947368421056e-06,
6227
+ "loss": 0.646,
6228
+ "step": 8690
6229
+ },
6230
+ {
6231
+ "epoch": 0.22,
6232
+ "grad_norm": 2.765625,
6233
+ "learning_rate": 2.7368421052631583e-06,
6234
+ "loss": 0.5917,
6235
+ "step": 8700
6236
+ },
6237
+ {
6238
+ "epoch": 0.22,
6239
+ "grad_norm": 6.03125,
6240
+ "learning_rate": 2.7157894736842105e-06,
6241
+ "loss": 0.5706,
6242
+ "step": 8710
6243
+ },
6244
+ {
6245
+ "epoch": 0.22,
6246
+ "grad_norm": 3.171875,
6247
+ "learning_rate": 2.694736842105263e-06,
6248
+ "loss": 0.477,
6249
+ "step": 8720
6250
+ },
6251
+ {
6252
+ "epoch": 0.22,
6253
+ "grad_norm": 5.875,
6254
+ "learning_rate": 2.6736842105263162e-06,
6255
+ "loss": 0.646,
6256
+ "step": 8730
6257
+ },
6258
+ {
6259
+ "epoch": 0.22,
6260
+ "grad_norm": 4.75,
6261
+ "learning_rate": 2.652631578947369e-06,
6262
+ "loss": 0.638,
6263
+ "step": 8740
6264
+ },
6265
+ {
6266
+ "epoch": 0.22,
6267
+ "grad_norm": 3.4375,
6268
+ "learning_rate": 2.631578947368421e-06,
6269
+ "loss": 0.6282,
6270
+ "step": 8750
6271
+ },
6272
+ {
6273
+ "epoch": 0.22,
6274
+ "grad_norm": 3.6875,
6275
+ "learning_rate": 2.6105263157894738e-06,
6276
+ "loss": 0.6418,
6277
+ "step": 8760
6278
+ },
6279
+ {
6280
+ "epoch": 0.22,
6281
+ "grad_norm": 2.828125,
6282
+ "learning_rate": 2.589473684210527e-06,
6283
+ "loss": 0.6228,
6284
+ "step": 8770
6285
+ },
6286
+ {
6287
+ "epoch": 0.22,
6288
+ "grad_norm": 3.765625,
6289
+ "learning_rate": 2.568421052631579e-06,
6290
+ "loss": 0.5794,
6291
+ "step": 8780
6292
+ },
6293
+ {
6294
+ "epoch": 0.22,
6295
+ "grad_norm": 4.9375,
6296
+ "learning_rate": 2.5473684210526317e-06,
6297
+ "loss": 0.5207,
6298
+ "step": 8790
6299
+ },
6300
+ {
6301
+ "epoch": 0.22,
6302
+ "grad_norm": 4.75,
6303
+ "learning_rate": 2.5263157894736844e-06,
6304
+ "loss": 0.623,
6305
+ "step": 8800
6306
+ },
6307
+ {
6308
+ "epoch": 0.22,
6309
+ "grad_norm": 4.40625,
6310
+ "learning_rate": 2.5052631578947375e-06,
6311
+ "loss": 0.6086,
6312
+ "step": 8810
6313
+ },
6314
+ {
6315
+ "epoch": 0.22,
6316
+ "grad_norm": 3.109375,
6317
+ "learning_rate": 2.4842105263157897e-06,
6318
+ "loss": 0.579,
6319
+ "step": 8820
6320
+ },
6321
+ {
6322
+ "epoch": 0.22,
6323
+ "grad_norm": 3.65625,
6324
+ "learning_rate": 2.4631578947368424e-06,
6325
+ "loss": 0.4531,
6326
+ "step": 8830
6327
+ },
6328
+ {
6329
+ "epoch": 0.22,
6330
+ "grad_norm": 4.75,
6331
+ "learning_rate": 2.442105263157895e-06,
6332
+ "loss": 0.5167,
6333
+ "step": 8840
6334
+ },
6335
+ {
6336
+ "epoch": 0.22,
6337
+ "grad_norm": 7.78125,
6338
+ "learning_rate": 2.4210526315789477e-06,
6339
+ "loss": 0.6362,
6340
+ "step": 8850
6341
+ },
6342
+ {
6343
+ "epoch": 0.22,
6344
+ "grad_norm": 5.53125,
6345
+ "learning_rate": 2.4000000000000003e-06,
6346
+ "loss": 0.6628,
6347
+ "step": 8860
6348
+ },
6349
+ {
6350
+ "epoch": 0.22,
6351
+ "grad_norm": 2.453125,
6352
+ "learning_rate": 2.378947368421053e-06,
6353
+ "loss": 0.722,
6354
+ "step": 8870
6355
+ },
6356
+ {
6357
+ "epoch": 0.22,
6358
+ "grad_norm": 2.921875,
6359
+ "learning_rate": 2.357894736842105e-06,
6360
+ "loss": 0.5313,
6361
+ "step": 8880
6362
+ },
6363
+ {
6364
+ "epoch": 0.22,
6365
+ "grad_norm": 5.25,
6366
+ "learning_rate": 2.3368421052631583e-06,
6367
+ "loss": 0.6277,
6368
+ "step": 8890
6369
+ },
6370
+ {
6371
+ "epoch": 0.22,
6372
+ "grad_norm": 4.0,
6373
+ "learning_rate": 2.3157894736842105e-06,
6374
+ "loss": 0.5938,
6375
+ "step": 8900
6376
+ },
6377
+ {
6378
+ "epoch": 0.22,
6379
+ "grad_norm": 3.546875,
6380
+ "learning_rate": 2.294736842105263e-06,
6381
+ "loss": 0.6851,
6382
+ "step": 8910
6383
+ },
6384
+ {
6385
+ "epoch": 0.22,
6386
+ "grad_norm": 6.0,
6387
+ "learning_rate": 2.273684210526316e-06,
6388
+ "loss": 0.4746,
6389
+ "step": 8920
6390
+ },
6391
+ {
6392
+ "epoch": 0.22,
6393
+ "grad_norm": 4.0,
6394
+ "learning_rate": 2.2526315789473685e-06,
6395
+ "loss": 0.5109,
6396
+ "step": 8930
6397
+ },
6398
+ {
6399
+ "epoch": 0.22,
6400
+ "grad_norm": 2.421875,
6401
+ "learning_rate": 2.231578947368421e-06,
6402
+ "loss": 0.4732,
6403
+ "step": 8940
6404
+ },
6405
+ {
6406
+ "epoch": 0.22,
6407
+ "grad_norm": 3.234375,
6408
+ "learning_rate": 2.2105263157894738e-06,
6409
+ "loss": 0.7736,
6410
+ "step": 8950
6411
+ },
6412
+ {
6413
+ "epoch": 0.22,
6414
+ "grad_norm": 4.34375,
6415
+ "learning_rate": 2.1894736842105264e-06,
6416
+ "loss": 0.6251,
6417
+ "step": 8960
6418
+ },
6419
+ {
6420
+ "epoch": 0.22,
6421
+ "grad_norm": 2.75,
6422
+ "learning_rate": 2.168421052631579e-06,
6423
+ "loss": 0.6428,
6424
+ "step": 8970
6425
+ },
6426
+ {
6427
+ "epoch": 0.22,
6428
+ "grad_norm": 11.5625,
6429
+ "learning_rate": 2.1473684210526317e-06,
6430
+ "loss": 0.5886,
6431
+ "step": 8980
6432
+ },
6433
+ {
6434
+ "epoch": 0.22,
6435
+ "grad_norm": 3.8125,
6436
+ "learning_rate": 2.1263157894736844e-06,
6437
+ "loss": 0.597,
6438
+ "step": 8990
6439
+ },
6440
+ {
6441
+ "epoch": 0.23,
6442
+ "grad_norm": 1.8125,
6443
+ "learning_rate": 2.105263157894737e-06,
6444
+ "loss": 0.527,
6445
+ "step": 9000
6446
+ },
6447
+ {
6448
+ "epoch": 0.23,
6449
+ "eval_loss": 0.6106051802635193,
6450
+ "eval_runtime": 38.1285,
6451
+ "eval_samples_per_second": 26.227,
6452
+ "eval_steps_per_second": 26.227,
6453
+ "step": 9000
6454
+ },
6455
+ {
6456
+ "epoch": 0.23,
6457
+ "grad_norm": 4.375,
6458
+ "learning_rate": 2.0842105263157897e-06,
6459
+ "loss": 0.5752,
6460
+ "step": 9010
6461
+ },
6462
+ {
6463
+ "epoch": 0.23,
6464
+ "grad_norm": 4.9375,
6465
+ "learning_rate": 2.0631578947368424e-06,
6466
+ "loss": 0.5578,
6467
+ "step": 9020
6468
+ },
6469
+ {
6470
+ "epoch": 0.23,
6471
+ "grad_norm": 3.9375,
6472
+ "learning_rate": 2.042105263157895e-06,
6473
+ "loss": 0.6715,
6474
+ "step": 9030
6475
+ },
6476
+ {
6477
+ "epoch": 0.23,
6478
+ "grad_norm": 6.28125,
6479
+ "learning_rate": 2.0210526315789477e-06,
6480
+ "loss": 0.5559,
6481
+ "step": 9040
6482
+ },
6483
+ {
6484
+ "epoch": 0.23,
6485
+ "grad_norm": 2.734375,
6486
+ "learning_rate": 2.0000000000000003e-06,
6487
+ "loss": 0.5839,
6488
+ "step": 9050
6489
+ },
6490
+ {
6491
+ "epoch": 0.23,
6492
+ "grad_norm": 4.375,
6493
+ "learning_rate": 1.978947368421053e-06,
6494
+ "loss": 0.6546,
6495
+ "step": 9060
6496
+ },
6497
+ {
6498
+ "epoch": 0.23,
6499
+ "grad_norm": 5.1875,
6500
+ "learning_rate": 1.9578947368421052e-06,
6501
+ "loss": 0.4581,
6502
+ "step": 9070
6503
+ },
6504
+ {
6505
+ "epoch": 0.23,
6506
+ "grad_norm": 1.8046875,
6507
+ "learning_rate": 1.936842105263158e-06,
6508
+ "loss": 0.6111,
6509
+ "step": 9080
6510
+ },
6511
+ {
6512
+ "epoch": 0.23,
6513
+ "grad_norm": 4.0625,
6514
+ "learning_rate": 1.9157894736842105e-06,
6515
+ "loss": 0.569,
6516
+ "step": 9090
6517
+ },
6518
+ {
6519
+ "epoch": 0.23,
6520
+ "grad_norm": 3.25,
6521
+ "learning_rate": 1.8947368421052634e-06,
6522
+ "loss": 0.6089,
6523
+ "step": 9100
6524
+ },
6525
+ {
6526
+ "epoch": 0.23,
6527
+ "grad_norm": 8.0625,
6528
+ "learning_rate": 1.8736842105263158e-06,
6529
+ "loss": 0.5164,
6530
+ "step": 9110
6531
+ },
6532
+ {
6533
+ "epoch": 0.23,
6534
+ "grad_norm": 2.875,
6535
+ "learning_rate": 1.8526315789473687e-06,
6536
+ "loss": 0.5259,
6537
+ "step": 9120
6538
+ },
6539
+ {
6540
+ "epoch": 0.23,
6541
+ "grad_norm": 4.0625,
6542
+ "learning_rate": 1.8315789473684211e-06,
6543
+ "loss": 0.6318,
6544
+ "step": 9130
6545
+ },
6546
+ {
6547
+ "epoch": 0.23,
6548
+ "grad_norm": 3.625,
6549
+ "learning_rate": 1.810526315789474e-06,
6550
+ "loss": 0.5954,
6551
+ "step": 9140
6552
+ },
6553
+ {
6554
+ "epoch": 0.23,
6555
+ "grad_norm": 4.25,
6556
+ "learning_rate": 1.7894736842105265e-06,
6557
+ "loss": 0.5146,
6558
+ "step": 9150
6559
+ },
6560
+ {
6561
+ "epoch": 0.23,
6562
+ "grad_norm": 5.28125,
6563
+ "learning_rate": 1.768421052631579e-06,
6564
+ "loss": 0.6194,
6565
+ "step": 9160
6566
+ },
6567
+ {
6568
+ "epoch": 0.23,
6569
+ "grad_norm": 3.3125,
6570
+ "learning_rate": 1.7473684210526318e-06,
6571
+ "loss": 0.5031,
6572
+ "step": 9170
6573
+ },
6574
+ {
6575
+ "epoch": 0.23,
6576
+ "grad_norm": 3.59375,
6577
+ "learning_rate": 1.7263157894736842e-06,
6578
+ "loss": 0.5199,
6579
+ "step": 9180
6580
+ },
6581
+ {
6582
+ "epoch": 0.23,
6583
+ "grad_norm": 12.75,
6584
+ "learning_rate": 1.705263157894737e-06,
6585
+ "loss": 0.6261,
6586
+ "step": 9190
6587
+ },
6588
+ {
6589
+ "epoch": 0.23,
6590
+ "grad_norm": 3.640625,
6591
+ "learning_rate": 1.6842105263157895e-06,
6592
+ "loss": 0.6349,
6593
+ "step": 9200
6594
+ },
6595
+ {
6596
+ "epoch": 0.23,
6597
+ "grad_norm": 3.59375,
6598
+ "learning_rate": 1.6631578947368424e-06,
6599
+ "loss": 0.5244,
6600
+ "step": 9210
6601
+ },
6602
+ {
6603
+ "epoch": 0.23,
6604
+ "grad_norm": 5.15625,
6605
+ "learning_rate": 1.6421052631578948e-06,
6606
+ "loss": 0.5428,
6607
+ "step": 9220
6608
+ },
6609
+ {
6610
+ "epoch": 0.23,
6611
+ "grad_norm": 2.625,
6612
+ "learning_rate": 1.6210526315789473e-06,
6613
+ "loss": 0.6296,
6614
+ "step": 9230
6615
+ },
6616
+ {
6617
+ "epoch": 0.23,
6618
+ "grad_norm": 4.125,
6619
+ "learning_rate": 1.6000000000000001e-06,
6620
+ "loss": 0.5923,
6621
+ "step": 9240
6622
+ },
6623
+ {
6624
+ "epoch": 0.23,
6625
+ "grad_norm": 3.875,
6626
+ "learning_rate": 1.5789473684210526e-06,
6627
+ "loss": 0.6146,
6628
+ "step": 9250
6629
+ },
6630
+ {
6631
+ "epoch": 0.23,
6632
+ "grad_norm": 6.4375,
6633
+ "learning_rate": 1.5578947368421054e-06,
6634
+ "loss": 0.6558,
6635
+ "step": 9260
6636
+ },
6637
+ {
6638
+ "epoch": 0.23,
6639
+ "grad_norm": 5.28125,
6640
+ "learning_rate": 1.5368421052631579e-06,
6641
+ "loss": 0.5426,
6642
+ "step": 9270
6643
+ },
6644
+ {
6645
+ "epoch": 0.23,
6646
+ "grad_norm": 3.046875,
6647
+ "learning_rate": 1.5157894736842108e-06,
6648
+ "loss": 0.611,
6649
+ "step": 9280
6650
+ },
6651
+ {
6652
+ "epoch": 0.23,
6653
+ "grad_norm": 4.5625,
6654
+ "learning_rate": 1.4947368421052632e-06,
6655
+ "loss": 0.5013,
6656
+ "step": 9290
6657
+ },
6658
+ {
6659
+ "epoch": 0.23,
6660
+ "grad_norm": 3.34375,
6661
+ "learning_rate": 1.4736842105263159e-06,
6662
+ "loss": 0.599,
6663
+ "step": 9300
6664
+ },
6665
+ {
6666
+ "epoch": 0.23,
6667
+ "grad_norm": 4.3125,
6668
+ "learning_rate": 1.4526315789473685e-06,
6669
+ "loss": 0.5549,
6670
+ "step": 9310
6671
+ },
6672
+ {
6673
+ "epoch": 0.23,
6674
+ "grad_norm": 7.375,
6675
+ "learning_rate": 1.4315789473684212e-06,
6676
+ "loss": 0.5597,
6677
+ "step": 9320
6678
+ },
6679
+ {
6680
+ "epoch": 0.23,
6681
+ "grad_norm": 4.9375,
6682
+ "learning_rate": 1.4105263157894738e-06,
6683
+ "loss": 0.5402,
6684
+ "step": 9330
6685
+ },
6686
+ {
6687
+ "epoch": 0.23,
6688
+ "grad_norm": 3.34375,
6689
+ "learning_rate": 1.3894736842105263e-06,
6690
+ "loss": 0.56,
6691
+ "step": 9340
6692
+ },
6693
+ {
6694
+ "epoch": 0.23,
6695
+ "grad_norm": 3.5,
6696
+ "learning_rate": 1.3684210526315791e-06,
6697
+ "loss": 0.5991,
6698
+ "step": 9350
6699
+ },
6700
+ {
6701
+ "epoch": 0.23,
6702
+ "grad_norm": 3.625,
6703
+ "learning_rate": 1.3473684210526316e-06,
6704
+ "loss": 0.499,
6705
+ "step": 9360
6706
+ },
6707
+ {
6708
+ "epoch": 0.23,
6709
+ "grad_norm": 4.125,
6710
+ "learning_rate": 1.3263157894736844e-06,
6711
+ "loss": 0.5946,
6712
+ "step": 9370
6713
+ },
6714
+ {
6715
+ "epoch": 0.23,
6716
+ "grad_norm": 4.59375,
6717
+ "learning_rate": 1.3052631578947369e-06,
6718
+ "loss": 0.678,
6719
+ "step": 9380
6720
+ },
6721
+ {
6722
+ "epoch": 0.23,
6723
+ "grad_norm": 2.9375,
6724
+ "learning_rate": 1.2842105263157895e-06,
6725
+ "loss": 0.7289,
6726
+ "step": 9390
6727
+ },
6728
+ {
6729
+ "epoch": 0.23,
6730
+ "grad_norm": 3.953125,
6731
+ "learning_rate": 1.2631578947368422e-06,
6732
+ "loss": 0.6926,
6733
+ "step": 9400
6734
+ },
6735
+ {
6736
+ "epoch": 0.24,
6737
+ "grad_norm": 9.1875,
6738
+ "learning_rate": 1.2421052631578948e-06,
6739
+ "loss": 0.6079,
6740
+ "step": 9410
6741
+ },
6742
+ {
6743
+ "epoch": 0.24,
6744
+ "grad_norm": 9.4375,
6745
+ "learning_rate": 1.2210526315789475e-06,
6746
+ "loss": 0.6642,
6747
+ "step": 9420
6748
+ },
6749
+ {
6750
+ "epoch": 0.24,
6751
+ "grad_norm": 4.28125,
6752
+ "learning_rate": 1.2000000000000002e-06,
6753
+ "loss": 0.5805,
6754
+ "step": 9430
6755
+ },
6756
+ {
6757
+ "epoch": 0.24,
6758
+ "grad_norm": 4.6875,
6759
+ "learning_rate": 1.1789473684210526e-06,
6760
+ "loss": 0.564,
6761
+ "step": 9440
6762
+ },
6763
+ {
6764
+ "epoch": 0.24,
6765
+ "grad_norm": 3.34375,
6766
+ "learning_rate": 1.1578947368421053e-06,
6767
+ "loss": 0.571,
6768
+ "step": 9450
6769
+ },
6770
+ {
6771
+ "epoch": 0.24,
6772
+ "grad_norm": 4.625,
6773
+ "learning_rate": 1.136842105263158e-06,
6774
+ "loss": 0.538,
6775
+ "step": 9460
6776
+ },
6777
+ {
6778
+ "epoch": 0.24,
6779
+ "grad_norm": 7.84375,
6780
+ "learning_rate": 1.1157894736842106e-06,
6781
+ "loss": 0.5777,
6782
+ "step": 9470
6783
+ },
6784
+ {
6785
+ "epoch": 0.24,
6786
+ "grad_norm": 3.65625,
6787
+ "learning_rate": 1.0947368421052632e-06,
6788
+ "loss": 0.683,
6789
+ "step": 9480
6790
+ },
6791
+ {
6792
+ "epoch": 0.24,
6793
+ "grad_norm": 3.3125,
6794
+ "learning_rate": 1.0736842105263159e-06,
6795
+ "loss": 0.5143,
6796
+ "step": 9490
6797
+ },
6798
+ {
6799
+ "epoch": 0.24,
6800
+ "grad_norm": 3.109375,
6801
+ "learning_rate": 1.0526315789473685e-06,
6802
+ "loss": 0.4899,
6803
+ "step": 9500
6804
+ },
6805
+ {
6806
+ "epoch": 0.24,
6807
+ "eval_loss": 0.6154074668884277,
6808
+ "eval_runtime": 38.1761,
6809
+ "eval_samples_per_second": 26.194,
6810
+ "eval_steps_per_second": 26.194,
6811
+ "step": 9500
6812
+ },
6813
+ {
6814
+ "epoch": 0.24,
6815
+ "grad_norm": 7.125,
6816
+ "learning_rate": 1.0315789473684212e-06,
6817
+ "loss": 0.7389,
6818
+ "step": 9510
6819
+ },
6820
+ {
6821
+ "epoch": 0.24,
6822
+ "grad_norm": 3.734375,
6823
+ "learning_rate": 1.0105263157894738e-06,
6824
+ "loss": 0.5922,
6825
+ "step": 9520
6826
+ },
6827
+ {
6828
+ "epoch": 0.24,
6829
+ "grad_norm": 4.46875,
6830
+ "learning_rate": 9.894736842105265e-07,
6831
+ "loss": 0.5658,
6832
+ "step": 9530
6833
+ },
6834
+ {
6835
+ "epoch": 0.24,
6836
+ "grad_norm": 3.765625,
6837
+ "learning_rate": 9.68421052631579e-07,
6838
+ "loss": 0.608,
6839
+ "step": 9540
6840
+ },
6841
+ {
6842
+ "epoch": 0.24,
6843
+ "grad_norm": 3.78125,
6844
+ "learning_rate": 9.473684210526317e-07,
6845
+ "loss": 0.5337,
6846
+ "step": 9550
6847
+ },
6848
+ {
6849
+ "epoch": 0.24,
6850
+ "grad_norm": 3.515625,
6851
+ "learning_rate": 9.263157894736844e-07,
6852
+ "loss": 0.5736,
6853
+ "step": 9560
6854
+ },
6855
+ {
6856
+ "epoch": 0.24,
6857
+ "grad_norm": 4.03125,
6858
+ "learning_rate": 9.05263157894737e-07,
6859
+ "loss": 0.5319,
6860
+ "step": 9570
6861
+ },
6862
+ {
6863
+ "epoch": 0.24,
6864
+ "grad_norm": 3.859375,
6865
+ "learning_rate": 8.842105263157895e-07,
6866
+ "loss": 0.5011,
6867
+ "step": 9580
6868
+ },
6869
+ {
6870
+ "epoch": 0.24,
6871
+ "grad_norm": 2.828125,
6872
+ "learning_rate": 8.631578947368421e-07,
6873
+ "loss": 0.6734,
6874
+ "step": 9590
6875
+ },
6876
+ {
6877
+ "epoch": 0.24,
6878
+ "grad_norm": 2.625,
6879
+ "learning_rate": 8.421052631578948e-07,
6880
+ "loss": 0.5045,
6881
+ "step": 9600
6882
+ },
6883
+ {
6884
+ "epoch": 0.24,
6885
+ "grad_norm": 4.78125,
6886
+ "learning_rate": 8.210526315789474e-07,
6887
+ "loss": 0.5957,
6888
+ "step": 9610
6889
+ },
6890
+ {
6891
+ "epoch": 0.24,
6892
+ "grad_norm": 2.984375,
6893
+ "learning_rate": 8.000000000000001e-07,
6894
+ "loss": 0.6944,
6895
+ "step": 9620
6896
+ },
6897
+ {
6898
+ "epoch": 0.24,
6899
+ "grad_norm": 2.546875,
6900
+ "learning_rate": 7.789473684210527e-07,
6901
+ "loss": 0.5249,
6902
+ "step": 9630
6903
+ },
6904
+ {
6905
+ "epoch": 0.24,
6906
+ "grad_norm": 12.9375,
6907
+ "learning_rate": 7.578947368421054e-07,
6908
+ "loss": 0.5981,
6909
+ "step": 9640
6910
+ },
6911
+ {
6912
+ "epoch": 0.24,
6913
+ "grad_norm": 7.25,
6914
+ "learning_rate": 7.368421052631579e-07,
6915
+ "loss": 0.633,
6916
+ "step": 9650
6917
+ },
6918
+ {
6919
+ "epoch": 0.24,
6920
+ "grad_norm": 3.59375,
6921
+ "learning_rate": 7.157894736842106e-07,
6922
+ "loss": 0.5535,
6923
+ "step": 9660
6924
+ },
6925
+ {
6926
+ "epoch": 0.24,
6927
+ "grad_norm": 5.8125,
6928
+ "learning_rate": 6.947368421052631e-07,
6929
+ "loss": 0.6233,
6930
+ "step": 9670
6931
+ },
6932
+ {
6933
+ "epoch": 0.24,
6934
+ "grad_norm": 11.4375,
6935
+ "learning_rate": 6.736842105263158e-07,
6936
+ "loss": 0.5603,
6937
+ "step": 9680
6938
+ },
6939
+ {
6940
+ "epoch": 0.24,
6941
+ "grad_norm": 3.890625,
6942
+ "learning_rate": 6.526315789473684e-07,
6943
+ "loss": 0.5977,
6944
+ "step": 9690
6945
+ },
6946
+ {
6947
+ "epoch": 0.24,
6948
+ "grad_norm": 3.25,
6949
+ "learning_rate": 6.315789473684211e-07,
6950
+ "loss": 0.5737,
6951
+ "step": 9700
6952
+ },
6953
+ {
6954
+ "epoch": 0.24,
6955
+ "grad_norm": 8.0625,
6956
+ "learning_rate": 6.105263157894738e-07,
6957
+ "loss": 0.5797,
6958
+ "step": 9710
6959
+ },
6960
+ {
6961
+ "epoch": 0.24,
6962
+ "grad_norm": 6.4375,
6963
+ "learning_rate": 5.894736842105263e-07,
6964
+ "loss": 0.6256,
6965
+ "step": 9720
6966
+ },
6967
+ {
6968
+ "epoch": 0.24,
6969
+ "grad_norm": 2.484375,
6970
+ "learning_rate": 5.68421052631579e-07,
6971
+ "loss": 0.4984,
6972
+ "step": 9730
6973
+ },
6974
+ {
6975
+ "epoch": 0.24,
6976
+ "grad_norm": 4.28125,
6977
+ "learning_rate": 5.473684210526316e-07,
6978
+ "loss": 0.6489,
6979
+ "step": 9740
6980
+ },
6981
+ {
6982
+ "epoch": 0.24,
6983
+ "grad_norm": 2.328125,
6984
+ "learning_rate": 5.263157894736843e-07,
6985
+ "loss": 0.5204,
6986
+ "step": 9750
6987
+ },
6988
+ {
6989
+ "epoch": 0.24,
6990
+ "grad_norm": 3.09375,
6991
+ "learning_rate": 5.052631578947369e-07,
6992
+ "loss": 0.6044,
6993
+ "step": 9760
6994
+ },
6995
+ {
6996
+ "epoch": 0.24,
6997
+ "grad_norm": 3.078125,
6998
+ "learning_rate": 4.842105263157895e-07,
6999
+ "loss": 0.6061,
7000
+ "step": 9770
7001
+ },
7002
+ {
7003
+ "epoch": 0.24,
7004
+ "grad_norm": 5.59375,
7005
+ "learning_rate": 4.631578947368422e-07,
7006
+ "loss": 0.5491,
7007
+ "step": 9780
7008
+ },
7009
+ {
7010
+ "epoch": 0.24,
7011
+ "grad_norm": 3.46875,
7012
+ "learning_rate": 4.421052631578947e-07,
7013
+ "loss": 0.5567,
7014
+ "step": 9790
7015
+ },
7016
+ {
7017
+ "epoch": 0.24,
7018
+ "grad_norm": 3.671875,
7019
+ "learning_rate": 4.210526315789474e-07,
7020
+ "loss": 0.6661,
7021
+ "step": 9800
7022
+ },
7023
+ {
7024
+ "epoch": 0.25,
7025
+ "grad_norm": 2.875,
7026
+ "learning_rate": 4.0000000000000003e-07,
7027
+ "loss": 0.672,
7028
+ "step": 9810
7029
+ },
7030
+ {
7031
+ "epoch": 0.25,
7032
+ "grad_norm": 7.15625,
7033
+ "learning_rate": 3.789473684210527e-07,
7034
+ "loss": 0.5887,
7035
+ "step": 9820
7036
+ },
7037
+ {
7038
+ "epoch": 0.25,
7039
+ "grad_norm": 2.453125,
7040
+ "learning_rate": 3.578947368421053e-07,
7041
+ "loss": 0.4978,
7042
+ "step": 9830
7043
+ },
7044
+ {
7045
+ "epoch": 0.25,
7046
+ "grad_norm": 2.859375,
7047
+ "learning_rate": 3.368421052631579e-07,
7048
+ "loss": 0.5254,
7049
+ "step": 9840
7050
+ },
7051
+ {
7052
+ "epoch": 0.25,
7053
+ "grad_norm": 3.515625,
7054
+ "learning_rate": 3.1578947368421055e-07,
7055
+ "loss": 0.5738,
7056
+ "step": 9850
7057
+ },
7058
+ {
7059
+ "epoch": 0.25,
7060
+ "grad_norm": 3.828125,
7061
+ "learning_rate": 2.9473684210526315e-07,
7062
+ "loss": 0.5846,
7063
+ "step": 9860
7064
+ },
7065
+ {
7066
+ "epoch": 0.25,
7067
+ "grad_norm": 6.0625,
7068
+ "learning_rate": 2.736842105263158e-07,
7069
+ "loss": 0.6591,
7070
+ "step": 9870
7071
+ },
7072
+ {
7073
+ "epoch": 0.25,
7074
+ "grad_norm": 5.0,
7075
+ "learning_rate": 2.5263157894736846e-07,
7076
+ "loss": 0.6697,
7077
+ "step": 9880
7078
+ },
7079
+ {
7080
+ "epoch": 0.25,
7081
+ "grad_norm": 4.25,
7082
+ "learning_rate": 2.315789473684211e-07,
7083
+ "loss": 0.4831,
7084
+ "step": 9890
7085
+ },
7086
+ {
7087
+ "epoch": 0.25,
7088
+ "grad_norm": 2.65625,
7089
+ "learning_rate": 2.105263157894737e-07,
7090
+ "loss": 0.5504,
7091
+ "step": 9900
7092
+ },
7093
+ {
7094
+ "epoch": 0.25,
7095
+ "grad_norm": 2.65625,
7096
+ "learning_rate": 1.8947368421052634e-07,
7097
+ "loss": 0.5576,
7098
+ "step": 9910
7099
+ },
7100
+ {
7101
+ "epoch": 0.25,
7102
+ "grad_norm": 3.875,
7103
+ "learning_rate": 1.6842105263157895e-07,
7104
+ "loss": 0.4733,
7105
+ "step": 9920
7106
+ },
7107
+ {
7108
+ "epoch": 0.25,
7109
+ "grad_norm": 3.03125,
7110
+ "learning_rate": 1.4736842105263158e-07,
7111
+ "loss": 0.5077,
7112
+ "step": 9930
7113
+ },
7114
+ {
7115
+ "epoch": 0.25,
7116
+ "grad_norm": 4.5,
7117
+ "learning_rate": 1.2631578947368423e-07,
7118
+ "loss": 0.5068,
7119
+ "step": 9940
7120
+ },
7121
+ {
7122
+ "epoch": 0.25,
7123
+ "grad_norm": 2.453125,
7124
+ "learning_rate": 1.0526315789473685e-07,
7125
+ "loss": 0.4754,
7126
+ "step": 9950
7127
+ },
7128
+ {
7129
+ "epoch": 0.25,
7130
+ "grad_norm": 6.09375,
7131
+ "learning_rate": 8.421052631578947e-08,
7132
+ "loss": 0.5751,
7133
+ "step": 9960
7134
+ },
7135
+ {
7136
+ "epoch": 0.25,
7137
+ "grad_norm": 3.65625,
7138
+ "learning_rate": 6.315789473684211e-08,
7139
+ "loss": 0.5133,
7140
+ "step": 9970
7141
+ },
7142
+ {
7143
+ "epoch": 0.25,
7144
+ "grad_norm": 3.171875,
7145
+ "learning_rate": 4.2105263157894737e-08,
7146
+ "loss": 0.6084,
7147
+ "step": 9980
7148
+ },
7149
+ {
7150
+ "epoch": 0.25,
7151
+ "grad_norm": 16.5,
7152
+ "learning_rate": 2.1052631578947368e-08,
7153
+ "loss": 0.6159,
7154
+ "step": 9990
7155
+ },
7156
+ {
7157
+ "epoch": 0.25,
7158
+ "grad_norm": 4.46875,
7159
+ "learning_rate": 0.0,
7160
+ "loss": 0.6108,
7161
+ "step": 10000
7162
+ },
7163
+ {
7164
+ "epoch": 0.25,
7165
+ "eval_loss": 0.6137469410896301,
7166
+ "eval_runtime": 38.1278,
7167
+ "eval_samples_per_second": 26.228,
7168
+ "eval_steps_per_second": 26.228,
7169
+ "step": 10000
 }
 ],
 "logging_steps": 10,
@@ -5384,7 +7174,7 @@
 "num_input_tokens_seen": 0,
 "num_train_epochs": 1,
 "save_steps": 2500,
- "total_flos": 1.1800273747968e+17,
+ "total_flos": 1.5733698330624e+17,
 "train_batch_size": 1,
 "trial_name": null,
 "trial_params": null