jdannem6 commited on
Commit
ca7f9e4
1 Parent(s): 75243b0

Uploaded checkpoint-20000

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1795 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:162241111a35b4dd2bd8251eb44c4f9ed485c39f432082deec2b913318be26b3
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:140c22cb100bb7fb3b9f92ae13ec5bb2bfcde7ed82d7e4434fc5a235f98cb24e
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2640cb6f400e7439fff7ee437394f2676dd7a329f43b4ef033bc2e958e48c385
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f49faa425cc20765775b424c81e8f5599e3725a2dc79226d42d68c4573812cfe
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6ca59958578f29e636921c5d01edf609d279634685f5c1700ffd019a9a229f9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d70b07077c15c8bd127eaf0a24ba45e81ca7ce6ae410b7a625f50c345ec6eb1f
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb29930cc2c8e69e7c76b92867840499fb9c566b9d6b348753e567d4e680bb99
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dc694a733ff91b79c5eaf7bcfe8aa41771c4ef8a47d325d2a9e9f6bc78f946
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.3866758346557617,
3
- "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-12500",
4
- "epoch": 0.4375,
5
  "eval_steps": 500,
6
- "global_step": 17500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12537,6 +12537,1796 @@
12537
  "eval_samples_per_second": 15.113,
12538
  "eval_steps_per_second": 15.113,
12539
  "step": 17500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12540
  }
12541
  ],
12542
  "logging_steps": 10,
@@ -12544,7 +14334,7 @@
12544
  "num_input_tokens_seen": 0,
12545
  "num_train_epochs": 1,
12546
  "save_steps": 2500,
12547
- "total_flos": 2.8178720489472e+17,
12548
  "train_batch_size": 1,
12549
  "trial_name": null,
12550
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.348677158355713,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-20000",
4
+ "epoch": 0.5,
5
  "eval_steps": 500,
6
+ "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12537
  "eval_samples_per_second": 15.113,
12538
  "eval_steps_per_second": 15.113,
12539
  "step": 17500
12540
+ },
12541
+ {
12542
+ "epoch": 0.44,
12543
+ "grad_norm": 5.189449310302734,
12544
+ "learning_rate": 8.467796610169492e-06,
12545
+ "loss": 1.352,
12546
+ "step": 17510
12547
+ },
12548
+ {
12549
+ "epoch": 0.44,
12550
+ "grad_norm": 2.765326499938965,
12551
+ "learning_rate": 8.461016949152543e-06,
12552
+ "loss": 1.5275,
12553
+ "step": 17520
12554
+ },
12555
+ {
12556
+ "epoch": 0.44,
12557
+ "grad_norm": 3.60880446434021,
12558
+ "learning_rate": 8.454237288135593e-06,
12559
+ "loss": 1.2793,
12560
+ "step": 17530
12561
+ },
12562
+ {
12563
+ "epoch": 0.44,
12564
+ "grad_norm": 3.072174310684204,
12565
+ "learning_rate": 8.447457627118644e-06,
12566
+ "loss": 1.2746,
12567
+ "step": 17540
12568
+ },
12569
+ {
12570
+ "epoch": 0.44,
12571
+ "grad_norm": 3.3222594261169434,
12572
+ "learning_rate": 8.440677966101696e-06,
12573
+ "loss": 1.3359,
12574
+ "step": 17550
12575
+ },
12576
+ {
12577
+ "epoch": 0.44,
12578
+ "grad_norm": 1.3298100233078003,
12579
+ "learning_rate": 8.433898305084747e-06,
12580
+ "loss": 1.3826,
12581
+ "step": 17560
12582
+ },
12583
+ {
12584
+ "epoch": 0.44,
12585
+ "grad_norm": 4.06503438949585,
12586
+ "learning_rate": 8.427118644067797e-06,
12587
+ "loss": 1.3339,
12588
+ "step": 17570
12589
+ },
12590
+ {
12591
+ "epoch": 0.44,
12592
+ "grad_norm": 6.150786399841309,
12593
+ "learning_rate": 8.420338983050848e-06,
12594
+ "loss": 1.3414,
12595
+ "step": 17580
12596
+ },
12597
+ {
12598
+ "epoch": 0.44,
12599
+ "grad_norm": 9.045748710632324,
12600
+ "learning_rate": 8.413559322033898e-06,
12601
+ "loss": 1.3183,
12602
+ "step": 17590
12603
+ },
12604
+ {
12605
+ "epoch": 0.44,
12606
+ "grad_norm": 4.415981769561768,
12607
+ "learning_rate": 8.40677966101695e-06,
12608
+ "loss": 1.3323,
12609
+ "step": 17600
12610
+ },
12611
+ {
12612
+ "epoch": 0.44,
12613
+ "grad_norm": 8.40713882446289,
12614
+ "learning_rate": 8.400000000000001e-06,
12615
+ "loss": 1.2519,
12616
+ "step": 17610
12617
+ },
12618
+ {
12619
+ "epoch": 0.44,
12620
+ "grad_norm": 6.841925144195557,
12621
+ "learning_rate": 8.39322033898305e-06,
12622
+ "loss": 1.4799,
12623
+ "step": 17620
12624
+ },
12625
+ {
12626
+ "epoch": 0.44,
12627
+ "grad_norm": 10.993582725524902,
12628
+ "learning_rate": 8.386440677966102e-06,
12629
+ "loss": 1.3495,
12630
+ "step": 17630
12631
+ },
12632
+ {
12633
+ "epoch": 0.44,
12634
+ "grad_norm": 2.115006685256958,
12635
+ "learning_rate": 8.379661016949153e-06,
12636
+ "loss": 1.3902,
12637
+ "step": 17640
12638
+ },
12639
+ {
12640
+ "epoch": 0.44,
12641
+ "grad_norm": 3.485889434814453,
12642
+ "learning_rate": 8.372881355932205e-06,
12643
+ "loss": 1.4462,
12644
+ "step": 17650
12645
+ },
12646
+ {
12647
+ "epoch": 0.44,
12648
+ "grad_norm": 5.725930690765381,
12649
+ "learning_rate": 8.366101694915255e-06,
12650
+ "loss": 1.4603,
12651
+ "step": 17660
12652
+ },
12653
+ {
12654
+ "epoch": 0.44,
12655
+ "grad_norm": 6.5530548095703125,
12656
+ "learning_rate": 8.359322033898306e-06,
12657
+ "loss": 1.4954,
12658
+ "step": 17670
12659
+ },
12660
+ {
12661
+ "epoch": 0.44,
12662
+ "grad_norm": 6.006740570068359,
12663
+ "learning_rate": 8.352542372881357e-06,
12664
+ "loss": 1.3556,
12665
+ "step": 17680
12666
+ },
12667
+ {
12668
+ "epoch": 0.44,
12669
+ "grad_norm": 5.5967936515808105,
12670
+ "learning_rate": 8.345762711864409e-06,
12671
+ "loss": 1.0719,
12672
+ "step": 17690
12673
+ },
12674
+ {
12675
+ "epoch": 0.44,
12676
+ "grad_norm": 5.70643949508667,
12677
+ "learning_rate": 8.338983050847458e-06,
12678
+ "loss": 1.2968,
12679
+ "step": 17700
12680
+ },
12681
+ {
12682
+ "epoch": 0.44,
12683
+ "grad_norm": 3.280465841293335,
12684
+ "learning_rate": 8.332203389830508e-06,
12685
+ "loss": 1.2669,
12686
+ "step": 17710
12687
+ },
12688
+ {
12689
+ "epoch": 0.44,
12690
+ "grad_norm": 3.098782777786255,
12691
+ "learning_rate": 8.32542372881356e-06,
12692
+ "loss": 1.4704,
12693
+ "step": 17720
12694
+ },
12695
+ {
12696
+ "epoch": 0.44,
12697
+ "grad_norm": 2.825495958328247,
12698
+ "learning_rate": 8.318644067796611e-06,
12699
+ "loss": 1.2165,
12700
+ "step": 17730
12701
+ },
12702
+ {
12703
+ "epoch": 0.44,
12704
+ "grad_norm": 3.8040249347686768,
12705
+ "learning_rate": 8.311864406779662e-06,
12706
+ "loss": 1.3623,
12707
+ "step": 17740
12708
+ },
12709
+ {
12710
+ "epoch": 0.44,
12711
+ "grad_norm": 3.4438130855560303,
12712
+ "learning_rate": 8.305084745762712e-06,
12713
+ "loss": 1.4255,
12714
+ "step": 17750
12715
+ },
12716
+ {
12717
+ "epoch": 0.44,
12718
+ "grad_norm": 2.187945604324341,
12719
+ "learning_rate": 8.298305084745763e-06,
12720
+ "loss": 1.2181,
12721
+ "step": 17760
12722
+ },
12723
+ {
12724
+ "epoch": 0.44,
12725
+ "grad_norm": 2.0644514560699463,
12726
+ "learning_rate": 8.291525423728815e-06,
12727
+ "loss": 1.4669,
12728
+ "step": 17770
12729
+ },
12730
+ {
12731
+ "epoch": 0.44,
12732
+ "grad_norm": 4.314215660095215,
12733
+ "learning_rate": 8.284745762711866e-06,
12734
+ "loss": 1.3776,
12735
+ "step": 17780
12736
+ },
12737
+ {
12738
+ "epoch": 0.44,
12739
+ "grad_norm": 2.948415517807007,
12740
+ "learning_rate": 8.277966101694916e-06,
12741
+ "loss": 1.3198,
12742
+ "step": 17790
12743
+ },
12744
+ {
12745
+ "epoch": 0.45,
12746
+ "grad_norm": 7.77257776260376,
12747
+ "learning_rate": 8.271186440677966e-06,
12748
+ "loss": 1.3826,
12749
+ "step": 17800
12750
+ },
12751
+ {
12752
+ "epoch": 0.45,
12753
+ "grad_norm": 2.2085230350494385,
12754
+ "learning_rate": 8.264406779661017e-06,
12755
+ "loss": 1.3443,
12756
+ "step": 17810
12757
+ },
12758
+ {
12759
+ "epoch": 0.45,
12760
+ "grad_norm": 2.8318214416503906,
12761
+ "learning_rate": 8.257627118644068e-06,
12762
+ "loss": 1.3126,
12763
+ "step": 17820
12764
+ },
12765
+ {
12766
+ "epoch": 0.45,
12767
+ "grad_norm": 4.544606685638428,
12768
+ "learning_rate": 8.25084745762712e-06,
12769
+ "loss": 1.2955,
12770
+ "step": 17830
12771
+ },
12772
+ {
12773
+ "epoch": 0.45,
12774
+ "grad_norm": 5.595729351043701,
12775
+ "learning_rate": 8.24406779661017e-06,
12776
+ "loss": 1.3814,
12777
+ "step": 17840
12778
+ },
12779
+ {
12780
+ "epoch": 0.45,
12781
+ "grad_norm": 6.201912879943848,
12782
+ "learning_rate": 8.237288135593221e-06,
12783
+ "loss": 1.4243,
12784
+ "step": 17850
12785
+ },
12786
+ {
12787
+ "epoch": 0.45,
12788
+ "grad_norm": 7.697713851928711,
12789
+ "learning_rate": 8.230508474576272e-06,
12790
+ "loss": 1.326,
12791
+ "step": 17860
12792
+ },
12793
+ {
12794
+ "epoch": 0.45,
12795
+ "grad_norm": 3.7588555812835693,
12796
+ "learning_rate": 8.223728813559324e-06,
12797
+ "loss": 1.1265,
12798
+ "step": 17870
12799
+ },
12800
+ {
12801
+ "epoch": 0.45,
12802
+ "grad_norm": 8.472000122070312,
12803
+ "learning_rate": 8.216949152542373e-06,
12804
+ "loss": 1.2678,
12805
+ "step": 17880
12806
+ },
12807
+ {
12808
+ "epoch": 0.45,
12809
+ "grad_norm": 4.802706241607666,
12810
+ "learning_rate": 8.210169491525425e-06,
12811
+ "loss": 1.5116,
12812
+ "step": 17890
12813
+ },
12814
+ {
12815
+ "epoch": 0.45,
12816
+ "grad_norm": 5.357900142669678,
12817
+ "learning_rate": 8.203389830508475e-06,
12818
+ "loss": 1.4435,
12819
+ "step": 17900
12820
+ },
12821
+ {
12822
+ "epoch": 0.45,
12823
+ "grad_norm": 3.5331151485443115,
12824
+ "learning_rate": 8.196610169491526e-06,
12825
+ "loss": 1.3838,
12826
+ "step": 17910
12827
+ },
12828
+ {
12829
+ "epoch": 0.45,
12830
+ "grad_norm": 2.591485023498535,
12831
+ "learning_rate": 8.189830508474577e-06,
12832
+ "loss": 1.3595,
12833
+ "step": 17920
12834
+ },
12835
+ {
12836
+ "epoch": 0.45,
12837
+ "grad_norm": 9.1065034866333,
12838
+ "learning_rate": 8.183050847457627e-06,
12839
+ "loss": 1.4109,
12840
+ "step": 17930
12841
+ },
12842
+ {
12843
+ "epoch": 0.45,
12844
+ "grad_norm": 3.2992680072784424,
12845
+ "learning_rate": 8.176271186440678e-06,
12846
+ "loss": 1.3554,
12847
+ "step": 17940
12848
+ },
12849
+ {
12850
+ "epoch": 0.45,
12851
+ "grad_norm": 10.191650390625,
12852
+ "learning_rate": 8.16949152542373e-06,
12853
+ "loss": 1.3561,
12854
+ "step": 17950
12855
+ },
12856
+ {
12857
+ "epoch": 0.45,
12858
+ "grad_norm": 5.769218444824219,
12859
+ "learning_rate": 8.162711864406781e-06,
12860
+ "loss": 1.4044,
12861
+ "step": 17960
12862
+ },
12863
+ {
12864
+ "epoch": 0.45,
12865
+ "grad_norm": 47.62531661987305,
12866
+ "learning_rate": 8.155932203389831e-06,
12867
+ "loss": 1.6211,
12868
+ "step": 17970
12869
+ },
12870
+ {
12871
+ "epoch": 0.45,
12872
+ "grad_norm": 3.514390230178833,
12873
+ "learning_rate": 8.149152542372882e-06,
12874
+ "loss": 1.288,
12875
+ "step": 17980
12876
+ },
12877
+ {
12878
+ "epoch": 0.45,
12879
+ "grad_norm": 3.9963278770446777,
12880
+ "learning_rate": 8.142372881355934e-06,
12881
+ "loss": 1.3625,
12882
+ "step": 17990
12883
+ },
12884
+ {
12885
+ "epoch": 0.45,
12886
+ "grad_norm": 4.2144856452941895,
12887
+ "learning_rate": 8.135593220338983e-06,
12888
+ "loss": 1.3154,
12889
+ "step": 18000
12890
+ },
12891
+ {
12892
+ "epoch": 0.45,
12893
+ "eval_loss": 1.3663108348846436,
12894
+ "eval_runtime": 66.1644,
12895
+ "eval_samples_per_second": 15.114,
12896
+ "eval_steps_per_second": 15.114,
12897
+ "step": 18000
12898
+ },
12899
+ {
12900
+ "epoch": 0.45,
12901
+ "grad_norm": 3.047025680541992,
12902
+ "learning_rate": 8.128813559322035e-06,
12903
+ "loss": 1.3131,
12904
+ "step": 18010
12905
+ },
12906
+ {
12907
+ "epoch": 0.45,
12908
+ "grad_norm": 5.798911094665527,
12909
+ "learning_rate": 8.122033898305085e-06,
12910
+ "loss": 1.4371,
12911
+ "step": 18020
12912
+ },
12913
+ {
12914
+ "epoch": 0.45,
12915
+ "grad_norm": 3.8710880279541016,
12916
+ "learning_rate": 8.115254237288136e-06,
12917
+ "loss": 1.5147,
12918
+ "step": 18030
12919
+ },
12920
+ {
12921
+ "epoch": 0.45,
12922
+ "grad_norm": 6.445448398590088,
12923
+ "learning_rate": 8.108474576271187e-06,
12924
+ "loss": 1.3332,
12925
+ "step": 18040
12926
+ },
12927
+ {
12928
+ "epoch": 0.45,
12929
+ "grad_norm": 5.0877275466918945,
12930
+ "learning_rate": 8.101694915254237e-06,
12931
+ "loss": 1.5076,
12932
+ "step": 18050
12933
+ },
12934
+ {
12935
+ "epoch": 0.45,
12936
+ "grad_norm": 6.0062737464904785,
12937
+ "learning_rate": 8.094915254237289e-06,
12938
+ "loss": 1.3257,
12939
+ "step": 18060
12940
+ },
12941
+ {
12942
+ "epoch": 0.45,
12943
+ "grad_norm": 3.2079477310180664,
12944
+ "learning_rate": 8.08813559322034e-06,
12945
+ "loss": 1.2162,
12946
+ "step": 18070
12947
+ },
12948
+ {
12949
+ "epoch": 0.45,
12950
+ "grad_norm": 2.974025249481201,
12951
+ "learning_rate": 8.081355932203391e-06,
12952
+ "loss": 1.3433,
12953
+ "step": 18080
12954
+ },
12955
+ {
12956
+ "epoch": 0.45,
12957
+ "grad_norm": 11.321368217468262,
12958
+ "learning_rate": 8.074576271186441e-06,
12959
+ "loss": 1.4845,
12960
+ "step": 18090
12961
+ },
12962
+ {
12963
+ "epoch": 0.45,
12964
+ "grad_norm": 3.768314838409424,
12965
+ "learning_rate": 8.067796610169492e-06,
12966
+ "loss": 1.3288,
12967
+ "step": 18100
12968
+ },
12969
+ {
12970
+ "epoch": 0.45,
12971
+ "grad_norm": 3.9936201572418213,
12972
+ "learning_rate": 8.061016949152542e-06,
12973
+ "loss": 1.2507,
12974
+ "step": 18110
12975
+ },
12976
+ {
12977
+ "epoch": 0.45,
12978
+ "grad_norm": 3.69484543800354,
12979
+ "learning_rate": 8.054237288135594e-06,
12980
+ "loss": 1.2615,
12981
+ "step": 18120
12982
+ },
12983
+ {
12984
+ "epoch": 0.45,
12985
+ "grad_norm": 6.352110862731934,
12986
+ "learning_rate": 8.047457627118645e-06,
12987
+ "loss": 1.2888,
12988
+ "step": 18130
12989
+ },
12990
+ {
12991
+ "epoch": 0.45,
12992
+ "grad_norm": 8.259808540344238,
12993
+ "learning_rate": 8.040677966101695e-06,
12994
+ "loss": 1.3588,
12995
+ "step": 18140
12996
+ },
12997
+ {
12998
+ "epoch": 0.45,
12999
+ "grad_norm": 8.203502655029297,
13000
+ "learning_rate": 8.033898305084746e-06,
13001
+ "loss": 1.2716,
13002
+ "step": 18150
13003
+ },
13004
+ {
13005
+ "epoch": 0.45,
13006
+ "grad_norm": 3.8740904331207275,
13007
+ "learning_rate": 8.027118644067797e-06,
13008
+ "loss": 1.4191,
13009
+ "step": 18160
13010
+ },
13011
+ {
13012
+ "epoch": 0.45,
13013
+ "grad_norm": 3.9104349613189697,
13014
+ "learning_rate": 8.020338983050849e-06,
13015
+ "loss": 1.1126,
13016
+ "step": 18170
13017
+ },
13018
+ {
13019
+ "epoch": 0.45,
13020
+ "grad_norm": 4.973033428192139,
13021
+ "learning_rate": 8.013559322033899e-06,
13022
+ "loss": 1.4034,
13023
+ "step": 18180
13024
+ },
13025
+ {
13026
+ "epoch": 0.45,
13027
+ "grad_norm": 5.2367329597473145,
13028
+ "learning_rate": 8.00677966101695e-06,
13029
+ "loss": 1.4094,
13030
+ "step": 18190
13031
+ },
13032
+ {
13033
+ "epoch": 0.46,
13034
+ "grad_norm": 4.842703819274902,
13035
+ "learning_rate": 8.000000000000001e-06,
13036
+ "loss": 1.4423,
13037
+ "step": 18200
13038
+ },
13039
+ {
13040
+ "epoch": 0.46,
13041
+ "grad_norm": 6.39046573638916,
13042
+ "learning_rate": 7.993220338983053e-06,
13043
+ "loss": 1.3491,
13044
+ "step": 18210
13045
+ },
13046
+ {
13047
+ "epoch": 0.46,
13048
+ "grad_norm": 3.424907922744751,
13049
+ "learning_rate": 7.986440677966102e-06,
13050
+ "loss": 1.268,
13051
+ "step": 18220
13052
+ },
13053
+ {
13054
+ "epoch": 0.46,
13055
+ "grad_norm": 4.363860607147217,
13056
+ "learning_rate": 7.979661016949152e-06,
13057
+ "loss": 1.2921,
13058
+ "step": 18230
13059
+ },
13060
+ {
13061
+ "epoch": 0.46,
13062
+ "grad_norm": 10.35586929321289,
13063
+ "learning_rate": 7.972881355932204e-06,
13064
+ "loss": 1.4456,
13065
+ "step": 18240
13066
+ },
13067
+ {
13068
+ "epoch": 0.46,
13069
+ "grad_norm": 11.094548225402832,
13070
+ "learning_rate": 7.966101694915255e-06,
13071
+ "loss": 1.5682,
13072
+ "step": 18250
13073
+ },
13074
+ {
13075
+ "epoch": 0.46,
13076
+ "grad_norm": 10.801241874694824,
13077
+ "learning_rate": 7.959322033898306e-06,
13078
+ "loss": 1.3475,
13079
+ "step": 18260
13080
+ },
13081
+ {
13082
+ "epoch": 0.46,
13083
+ "grad_norm": 3.893005132675171,
13084
+ "learning_rate": 7.952542372881356e-06,
13085
+ "loss": 1.3858,
13086
+ "step": 18270
13087
+ },
13088
+ {
13089
+ "epoch": 0.46,
13090
+ "grad_norm": 8.01429557800293,
13091
+ "learning_rate": 7.945762711864407e-06,
13092
+ "loss": 1.3695,
13093
+ "step": 18280
13094
+ },
13095
+ {
13096
+ "epoch": 0.46,
13097
+ "grad_norm": 9.507207870483398,
13098
+ "learning_rate": 7.938983050847459e-06,
13099
+ "loss": 1.2442,
13100
+ "step": 18290
13101
+ },
13102
+ {
13103
+ "epoch": 0.46,
13104
+ "grad_norm": 4.206308364868164,
13105
+ "learning_rate": 7.93220338983051e-06,
13106
+ "loss": 1.241,
13107
+ "step": 18300
13108
+ },
13109
+ {
13110
+ "epoch": 0.46,
13111
+ "grad_norm": 7.3320794105529785,
13112
+ "learning_rate": 7.92542372881356e-06,
13113
+ "loss": 1.3338,
13114
+ "step": 18310
13115
+ },
13116
+ {
13117
+ "epoch": 0.46,
13118
+ "grad_norm": 4.259428977966309,
13119
+ "learning_rate": 7.91864406779661e-06,
13120
+ "loss": 1.3902,
13121
+ "step": 18320
13122
+ },
13123
+ {
13124
+ "epoch": 0.46,
13125
+ "grad_norm": 15.748085975646973,
13126
+ "learning_rate": 7.911864406779661e-06,
13127
+ "loss": 1.282,
13128
+ "step": 18330
13129
+ },
13130
+ {
13131
+ "epoch": 0.46,
13132
+ "grad_norm": 7.123409748077393,
13133
+ "learning_rate": 7.905084745762712e-06,
13134
+ "loss": 1.3311,
13135
+ "step": 18340
13136
+ },
13137
+ {
13138
+ "epoch": 0.46,
13139
+ "grad_norm": 7.022697925567627,
13140
+ "learning_rate": 7.898305084745764e-06,
13141
+ "loss": 1.292,
13142
+ "step": 18350
13143
+ },
13144
+ {
13145
+ "epoch": 0.46,
13146
+ "grad_norm": 3.3042855262756348,
13147
+ "learning_rate": 7.891525423728814e-06,
13148
+ "loss": 1.3587,
13149
+ "step": 18360
13150
+ },
13151
+ {
13152
+ "epoch": 0.46,
13153
+ "grad_norm": 3.5053913593292236,
13154
+ "learning_rate": 7.884745762711865e-06,
13155
+ "loss": 1.5188,
13156
+ "step": 18370
13157
+ },
13158
+ {
13159
+ "epoch": 0.46,
13160
+ "grad_norm": 2.6854681968688965,
13161
+ "learning_rate": 7.877966101694916e-06,
13162
+ "loss": 1.3633,
13163
+ "step": 18380
13164
+ },
13165
+ {
13166
+ "epoch": 0.46,
13167
+ "grad_norm": 3.8879082202911377,
13168
+ "learning_rate": 7.871186440677968e-06,
13169
+ "loss": 1.5491,
13170
+ "step": 18390
13171
+ },
13172
+ {
13173
+ "epoch": 0.46,
13174
+ "grad_norm": 9.43989086151123,
13175
+ "learning_rate": 7.864406779661017e-06,
13176
+ "loss": 1.2686,
13177
+ "step": 18400
13178
+ },
13179
+ {
13180
+ "epoch": 0.46,
13181
+ "grad_norm": 4.684770107269287,
13182
+ "learning_rate": 7.857627118644069e-06,
13183
+ "loss": 1.2463,
13184
+ "step": 18410
13185
+ },
13186
+ {
13187
+ "epoch": 0.46,
13188
+ "grad_norm": 2.699023485183716,
13189
+ "learning_rate": 7.850847457627119e-06,
13190
+ "loss": 1.5216,
13191
+ "step": 18420
13192
+ },
13193
+ {
13194
+ "epoch": 0.46,
13195
+ "grad_norm": 3.1254053115844727,
13196
+ "learning_rate": 7.84406779661017e-06,
13197
+ "loss": 1.4231,
13198
+ "step": 18430
13199
+ },
13200
+ {
13201
+ "epoch": 0.46,
13202
+ "grad_norm": 3.4634742736816406,
13203
+ "learning_rate": 7.837288135593221e-06,
13204
+ "loss": 1.4174,
13205
+ "step": 18440
13206
+ },
13207
+ {
13208
+ "epoch": 0.46,
13209
+ "grad_norm": 2.7554078102111816,
13210
+ "learning_rate": 7.830508474576271e-06,
13211
+ "loss": 1.2646,
13212
+ "step": 18450
13213
+ },
13214
+ {
13215
+ "epoch": 0.46,
13216
+ "grad_norm": 7.782464504241943,
13217
+ "learning_rate": 7.823728813559322e-06,
13218
+ "loss": 1.273,
13219
+ "step": 18460
13220
+ },
13221
+ {
13222
+ "epoch": 0.46,
13223
+ "grad_norm": 12.717724800109863,
13224
+ "learning_rate": 7.816949152542374e-06,
13225
+ "loss": 1.3536,
13226
+ "step": 18470
13227
+ },
13228
+ {
13229
+ "epoch": 0.46,
13230
+ "grad_norm": 4.644845008850098,
13231
+ "learning_rate": 7.810169491525425e-06,
13232
+ "loss": 1.567,
13233
+ "step": 18480
13234
+ },
13235
+ {
13236
+ "epoch": 0.46,
13237
+ "grad_norm": 2.032013177871704,
13238
+ "learning_rate": 7.803389830508475e-06,
13239
+ "loss": 1.4428,
13240
+ "step": 18490
13241
+ },
13242
+ {
13243
+ "epoch": 0.46,
13244
+ "grad_norm": 8.91115951538086,
13245
+ "learning_rate": 7.796610169491526e-06,
13246
+ "loss": 1.4071,
13247
+ "step": 18500
13248
+ },
13249
+ {
13250
+ "epoch": 0.46,
13251
+ "eval_loss": 1.3208402395248413,
13252
+ "eval_runtime": 66.1773,
13253
+ "eval_samples_per_second": 15.111,
13254
+ "eval_steps_per_second": 15.111,
13255
+ "step": 18500
13256
+ },
13257
+ {
13258
+ "epoch": 0.46,
13259
+ "grad_norm": 4.1798248291015625,
13260
+ "learning_rate": 7.789830508474578e-06,
13261
+ "loss": 1.3752,
13262
+ "step": 18510
13263
+ },
13264
+ {
13265
+ "epoch": 0.46,
13266
+ "grad_norm": 3.6975016593933105,
13267
+ "learning_rate": 7.783050847457628e-06,
13268
+ "loss": 1.3685,
13269
+ "step": 18520
13270
+ },
13271
+ {
13272
+ "epoch": 0.46,
13273
+ "grad_norm": 4.421247959136963,
13274
+ "learning_rate": 7.776271186440679e-06,
13275
+ "loss": 1.5264,
13276
+ "step": 18530
13277
+ },
13278
+ {
13279
+ "epoch": 0.46,
13280
+ "grad_norm": 8.739845275878906,
13281
+ "learning_rate": 7.769491525423729e-06,
13282
+ "loss": 1.4609,
13283
+ "step": 18540
13284
+ },
13285
+ {
13286
+ "epoch": 0.46,
13287
+ "grad_norm": 3.287424325942993,
13288
+ "learning_rate": 7.76271186440678e-06,
13289
+ "loss": 1.1154,
13290
+ "step": 18550
13291
+ },
13292
+ {
13293
+ "epoch": 0.46,
13294
+ "grad_norm": 3.1192266941070557,
13295
+ "learning_rate": 7.755932203389831e-06,
13296
+ "loss": 1.2787,
13297
+ "step": 18560
13298
+ },
13299
+ {
13300
+ "epoch": 0.46,
13301
+ "grad_norm": 8.368815422058105,
13302
+ "learning_rate": 7.749152542372881e-06,
13303
+ "loss": 1.3339,
13304
+ "step": 18570
13305
+ },
13306
+ {
13307
+ "epoch": 0.46,
13308
+ "grad_norm": 6.241825580596924,
13309
+ "learning_rate": 7.742372881355933e-06,
13310
+ "loss": 1.2293,
13311
+ "step": 18580
13312
+ },
13313
+ {
13314
+ "epoch": 0.46,
13315
+ "grad_norm": 9.267044067382812,
13316
+ "learning_rate": 7.735593220338984e-06,
13317
+ "loss": 1.4354,
13318
+ "step": 18590
13319
+ },
13320
+ {
13321
+ "epoch": 0.47,
13322
+ "grad_norm": 5.569955348968506,
13323
+ "learning_rate": 7.728813559322035e-06,
13324
+ "loss": 1.3941,
13325
+ "step": 18600
13326
+ },
13327
+ {
13328
+ "epoch": 0.47,
13329
+ "grad_norm": 11.273942947387695,
13330
+ "learning_rate": 7.722033898305085e-06,
13331
+ "loss": 1.1127,
13332
+ "step": 18610
13333
+ },
13334
+ {
13335
+ "epoch": 0.47,
13336
+ "grad_norm": 3.7952206134796143,
13337
+ "learning_rate": 7.715254237288136e-06,
13338
+ "loss": 1.3437,
13339
+ "step": 18620
13340
+ },
13341
+ {
13342
+ "epoch": 0.47,
13343
+ "grad_norm": 2.9084484577178955,
13344
+ "learning_rate": 7.708474576271186e-06,
13345
+ "loss": 1.3646,
13346
+ "step": 18630
13347
+ },
13348
+ {
13349
+ "epoch": 0.47,
13350
+ "grad_norm": 4.15964937210083,
13351
+ "learning_rate": 7.701694915254238e-06,
13352
+ "loss": 1.4666,
13353
+ "step": 18640
13354
+ },
13355
+ {
13356
+ "epoch": 0.47,
13357
+ "grad_norm": 4.938930988311768,
13358
+ "learning_rate": 7.694915254237289e-06,
13359
+ "loss": 1.4065,
13360
+ "step": 18650
13361
+ },
13362
+ {
13363
+ "epoch": 0.47,
13364
+ "grad_norm": 8.298666954040527,
13365
+ "learning_rate": 7.688135593220339e-06,
13366
+ "loss": 1.354,
13367
+ "step": 18660
13368
+ },
13369
+ {
13370
+ "epoch": 0.47,
13371
+ "grad_norm": 3.1837921142578125,
13372
+ "learning_rate": 7.68135593220339e-06,
13373
+ "loss": 1.3879,
13374
+ "step": 18670
13375
+ },
13376
+ {
13377
+ "epoch": 0.47,
13378
+ "grad_norm": 2.978053331375122,
13379
+ "learning_rate": 7.674576271186441e-06,
13380
+ "loss": 1.3765,
13381
+ "step": 18680
13382
+ },
13383
+ {
13384
+ "epoch": 0.47,
13385
+ "grad_norm": 5.5602827072143555,
13386
+ "learning_rate": 7.667796610169493e-06,
13387
+ "loss": 1.2135,
13388
+ "step": 18690
13389
+ },
13390
+ {
13391
+ "epoch": 0.47,
13392
+ "grad_norm": 5.481113910675049,
13393
+ "learning_rate": 7.661016949152543e-06,
13394
+ "loss": 1.3841,
13395
+ "step": 18700
13396
+ },
13397
+ {
13398
+ "epoch": 0.47,
13399
+ "grad_norm": 9.419681549072266,
13400
+ "learning_rate": 7.654237288135594e-06,
13401
+ "loss": 1.2349,
13402
+ "step": 18710
13403
+ },
13404
+ {
13405
+ "epoch": 0.47,
13406
+ "grad_norm": 4.955466270446777,
13407
+ "learning_rate": 7.647457627118645e-06,
13408
+ "loss": 1.4208,
13409
+ "step": 18720
13410
+ },
13411
+ {
13412
+ "epoch": 0.47,
13413
+ "grad_norm": 5.841220855712891,
13414
+ "learning_rate": 7.640677966101695e-06,
13415
+ "loss": 1.4526,
13416
+ "step": 18730
13417
+ },
13418
+ {
13419
+ "epoch": 0.47,
13420
+ "grad_norm": 3.1412012577056885,
13421
+ "learning_rate": 7.633898305084746e-06,
13422
+ "loss": 1.4606,
13423
+ "step": 18740
13424
+ },
13425
+ {
13426
+ "epoch": 0.47,
13427
+ "grad_norm": 4.624940395355225,
13428
+ "learning_rate": 7.627118644067797e-06,
13429
+ "loss": 1.3875,
13430
+ "step": 18750
13431
+ },
13432
+ {
13433
+ "epoch": 0.47,
13434
+ "grad_norm": 9.828381538391113,
13435
+ "learning_rate": 7.6203389830508476e-06,
13436
+ "loss": 1.4079,
13437
+ "step": 18760
13438
+ },
13439
+ {
13440
+ "epoch": 0.47,
13441
+ "grad_norm": 5.299017429351807,
13442
+ "learning_rate": 7.613559322033899e-06,
13443
+ "loss": 1.278,
13444
+ "step": 18770
13445
+ },
13446
+ {
13447
+ "epoch": 0.47,
13448
+ "grad_norm": 6.449117183685303,
13449
+ "learning_rate": 7.6067796610169495e-06,
13450
+ "loss": 1.4141,
13451
+ "step": 18780
13452
+ },
13453
+ {
13454
+ "epoch": 0.47,
13455
+ "grad_norm": 6.961145401000977,
13456
+ "learning_rate": 7.600000000000001e-06,
13457
+ "loss": 1.2828,
13458
+ "step": 18790
13459
+ },
13460
+ {
13461
+ "epoch": 0.47,
13462
+ "grad_norm": 8.588834762573242,
13463
+ "learning_rate": 7.5932203389830515e-06,
13464
+ "loss": 1.3544,
13465
+ "step": 18800
13466
+ },
13467
+ {
13468
+ "epoch": 0.47,
13469
+ "grad_norm": 2.5683794021606445,
13470
+ "learning_rate": 7.586440677966103e-06,
13471
+ "loss": 1.4127,
13472
+ "step": 18810
13473
+ },
13474
+ {
13475
+ "epoch": 0.47,
13476
+ "grad_norm": 2.101924180984497,
13477
+ "learning_rate": 7.5796610169491534e-06,
13478
+ "loss": 1.229,
13479
+ "step": 18820
13480
+ },
13481
+ {
13482
+ "epoch": 0.47,
13483
+ "grad_norm": 5.389444351196289,
13484
+ "learning_rate": 7.572881355932205e-06,
13485
+ "loss": 1.3112,
13486
+ "step": 18830
13487
+ },
13488
+ {
13489
+ "epoch": 0.47,
13490
+ "grad_norm": 3.154507875442505,
13491
+ "learning_rate": 7.5661016949152545e-06,
13492
+ "loss": 1.2603,
13493
+ "step": 18840
13494
+ },
13495
+ {
13496
+ "epoch": 0.47,
13497
+ "grad_norm": 4.938506126403809,
13498
+ "learning_rate": 7.559322033898305e-06,
13499
+ "loss": 1.5423,
13500
+ "step": 18850
13501
+ },
13502
+ {
13503
+ "epoch": 0.47,
13504
+ "grad_norm": 5.838810443878174,
13505
+ "learning_rate": 7.5525423728813565e-06,
13506
+ "loss": 1.0651,
13507
+ "step": 18860
13508
+ },
13509
+ {
13510
+ "epoch": 0.47,
13511
+ "grad_norm": 7.946393013000488,
13512
+ "learning_rate": 7.545762711864407e-06,
13513
+ "loss": 1.4706,
13514
+ "step": 18870
13515
+ },
13516
+ {
13517
+ "epoch": 0.47,
13518
+ "grad_norm": 12.6395902633667,
13519
+ "learning_rate": 7.5389830508474584e-06,
13520
+ "loss": 1.4036,
13521
+ "step": 18880
13522
+ },
13523
+ {
13524
+ "epoch": 0.47,
13525
+ "grad_norm": 4.769916534423828,
13526
+ "learning_rate": 7.532203389830509e-06,
13527
+ "loss": 1.33,
13528
+ "step": 18890
13529
+ },
13530
+ {
13531
+ "epoch": 0.47,
13532
+ "grad_norm": 5.928445816040039,
13533
+ "learning_rate": 7.52542372881356e-06,
13534
+ "loss": 1.4844,
13535
+ "step": 18900
13536
+ },
13537
+ {
13538
+ "epoch": 0.47,
13539
+ "grad_norm": 4.121998310089111,
13540
+ "learning_rate": 7.518644067796611e-06,
13541
+ "loss": 1.2672,
13542
+ "step": 18910
13543
+ },
13544
+ {
13545
+ "epoch": 0.47,
13546
+ "grad_norm": 1.256914496421814,
13547
+ "learning_rate": 7.511864406779662e-06,
13548
+ "loss": 1.337,
13549
+ "step": 18920
13550
+ },
13551
+ {
13552
+ "epoch": 0.47,
13553
+ "grad_norm": 9.951703071594238,
13554
+ "learning_rate": 7.505084745762713e-06,
13555
+ "loss": 1.3359,
13556
+ "step": 18930
13557
+ },
13558
+ {
13559
+ "epoch": 0.47,
13560
+ "grad_norm": 5.373715400695801,
13561
+ "learning_rate": 7.498305084745763e-06,
13562
+ "loss": 1.2437,
13563
+ "step": 18940
13564
+ },
13565
+ {
13566
+ "epoch": 0.47,
13567
+ "grad_norm": 7.120504379272461,
13568
+ "learning_rate": 7.491525423728814e-06,
13569
+ "loss": 1.1883,
13570
+ "step": 18950
13571
+ },
13572
+ {
13573
+ "epoch": 0.47,
13574
+ "grad_norm": 7.661159992218018,
13575
+ "learning_rate": 7.4847457627118646e-06,
13576
+ "loss": 1.2648,
13577
+ "step": 18960
13578
+ },
13579
+ {
13580
+ "epoch": 0.47,
13581
+ "grad_norm": 5.859286308288574,
13582
+ "learning_rate": 7.477966101694916e-06,
13583
+ "loss": 1.3106,
13584
+ "step": 18970
13585
+ },
13586
+ {
13587
+ "epoch": 0.47,
13588
+ "grad_norm": 7.276169300079346,
13589
+ "learning_rate": 7.4711864406779665e-06,
13590
+ "loss": 1.393,
13591
+ "step": 18980
13592
+ },
13593
+ {
13594
+ "epoch": 0.47,
13595
+ "grad_norm": 6.31447172164917,
13596
+ "learning_rate": 7.464406779661018e-06,
13597
+ "loss": 1.4051,
13598
+ "step": 18990
13599
+ },
13600
+ {
13601
+ "epoch": 0.47,
13602
+ "grad_norm": 8.36728572845459,
13603
+ "learning_rate": 7.4576271186440685e-06,
13604
+ "loss": 1.4003,
13605
+ "step": 19000
13606
+ },
13607
+ {
13608
+ "epoch": 0.47,
13609
+ "eval_loss": 1.347457766532898,
13610
+ "eval_runtime": 66.1461,
13611
+ "eval_samples_per_second": 15.118,
13612
+ "eval_steps_per_second": 15.118,
13613
+ "step": 19000
13614
+ },
13615
+ {
13616
+ "epoch": 0.48,
13617
+ "grad_norm": 3.911505937576294,
13618
+ "learning_rate": 7.45084745762712e-06,
13619
+ "loss": 1.2646,
13620
+ "step": 19010
13621
+ },
13622
+ {
13623
+ "epoch": 0.48,
13624
+ "grad_norm": 3.6765291690826416,
13625
+ "learning_rate": 7.4440677966101704e-06,
13626
+ "loss": 1.3827,
13627
+ "step": 19020
13628
+ },
13629
+ {
13630
+ "epoch": 0.48,
13631
+ "grad_norm": 3.899599313735962,
13632
+ "learning_rate": 7.437288135593221e-06,
13633
+ "loss": 1.4144,
13634
+ "step": 19030
13635
+ },
13636
+ {
13637
+ "epoch": 0.48,
13638
+ "grad_norm": 6.436791896820068,
13639
+ "learning_rate": 7.430508474576272e-06,
13640
+ "loss": 1.2087,
13641
+ "step": 19040
13642
+ },
13643
+ {
13644
+ "epoch": 0.48,
13645
+ "grad_norm": 4.624211311340332,
13646
+ "learning_rate": 7.423728813559322e-06,
13647
+ "loss": 1.3824,
13648
+ "step": 19050
13649
+ },
13650
+ {
13651
+ "epoch": 0.48,
13652
+ "grad_norm": 6.657593727111816,
13653
+ "learning_rate": 7.4169491525423735e-06,
13654
+ "loss": 1.3856,
13655
+ "step": 19060
13656
+ },
13657
+ {
13658
+ "epoch": 0.48,
13659
+ "grad_norm": 7.132912635803223,
13660
+ "learning_rate": 7.410169491525424e-06,
13661
+ "loss": 1.4319,
13662
+ "step": 19070
13663
+ },
13664
+ {
13665
+ "epoch": 0.48,
13666
+ "grad_norm": 2.8681843280792236,
13667
+ "learning_rate": 7.4033898305084754e-06,
13668
+ "loss": 1.419,
13669
+ "step": 19080
13670
+ },
13671
+ {
13672
+ "epoch": 0.48,
13673
+ "grad_norm": 5.819919109344482,
13674
+ "learning_rate": 7.396610169491526e-06,
13675
+ "loss": 1.3988,
13676
+ "step": 19090
13677
+ },
13678
+ {
13679
+ "epoch": 0.48,
13680
+ "grad_norm": 6.482515811920166,
13681
+ "learning_rate": 7.3898305084745766e-06,
13682
+ "loss": 1.3366,
13683
+ "step": 19100
13684
+ },
13685
+ {
13686
+ "epoch": 0.48,
13687
+ "grad_norm": 10.341208457946777,
13688
+ "learning_rate": 7.383050847457628e-06,
13689
+ "loss": 1.3502,
13690
+ "step": 19110
13691
+ },
13692
+ {
13693
+ "epoch": 0.48,
13694
+ "grad_norm": 2.820133686065674,
13695
+ "learning_rate": 7.3762711864406785e-06,
13696
+ "loss": 1.415,
13697
+ "step": 19120
13698
+ },
13699
+ {
13700
+ "epoch": 0.48,
13701
+ "grad_norm": 7.112204551696777,
13702
+ "learning_rate": 7.36949152542373e-06,
13703
+ "loss": 1.2609,
13704
+ "step": 19130
13705
+ },
13706
+ {
13707
+ "epoch": 0.48,
13708
+ "grad_norm": 6.172835826873779,
13709
+ "learning_rate": 7.3627118644067805e-06,
13710
+ "loss": 1.2456,
13711
+ "step": 19140
13712
+ },
13713
+ {
13714
+ "epoch": 0.48,
13715
+ "grad_norm": 2.9154605865478516,
13716
+ "learning_rate": 7.355932203389831e-06,
13717
+ "loss": 1.3332,
13718
+ "step": 19150
13719
+ },
13720
+ {
13721
+ "epoch": 0.48,
13722
+ "grad_norm": 6.8542256355285645,
13723
+ "learning_rate": 7.3491525423728816e-06,
13724
+ "loss": 1.2728,
13725
+ "step": 19160
13726
+ },
13727
+ {
13728
+ "epoch": 0.48,
13729
+ "grad_norm": 7.636801719665527,
13730
+ "learning_rate": 7.342372881355932e-06,
13731
+ "loss": 1.3744,
13732
+ "step": 19170
13733
+ },
13734
+ {
13735
+ "epoch": 0.48,
13736
+ "grad_norm": 7.556679725646973,
13737
+ "learning_rate": 7.3355932203389835e-06,
13738
+ "loss": 1.446,
13739
+ "step": 19180
13740
+ },
13741
+ {
13742
+ "epoch": 0.48,
13743
+ "grad_norm": 22.831199645996094,
13744
+ "learning_rate": 7.328813559322034e-06,
13745
+ "loss": 1.358,
13746
+ "step": 19190
13747
+ },
13748
+ {
13749
+ "epoch": 0.48,
13750
+ "grad_norm": 10.184800148010254,
13751
+ "learning_rate": 7.3220338983050855e-06,
13752
+ "loss": 1.2379,
13753
+ "step": 19200
13754
+ },
13755
+ {
13756
+ "epoch": 0.48,
13757
+ "grad_norm": 1.9062511920928955,
13758
+ "learning_rate": 7.315254237288136e-06,
13759
+ "loss": 1.3724,
13760
+ "step": 19210
13761
+ },
13762
+ {
13763
+ "epoch": 0.48,
13764
+ "grad_norm": 9.429797172546387,
13765
+ "learning_rate": 7.3084745762711874e-06,
13766
+ "loss": 1.33,
13767
+ "step": 19220
13768
+ },
13769
+ {
13770
+ "epoch": 0.48,
13771
+ "grad_norm": 3.129505157470703,
13772
+ "learning_rate": 7.301694915254238e-06,
13773
+ "loss": 1.1795,
13774
+ "step": 19230
13775
+ },
13776
+ {
13777
+ "epoch": 0.48,
13778
+ "grad_norm": 6.839028835296631,
13779
+ "learning_rate": 7.294915254237289e-06,
13780
+ "loss": 1.2983,
13781
+ "step": 19240
13782
+ },
13783
+ {
13784
+ "epoch": 0.48,
13785
+ "grad_norm": 15.552626609802246,
13786
+ "learning_rate": 7.288135593220339e-06,
13787
+ "loss": 1.4867,
13788
+ "step": 19250
13789
+ },
13790
+ {
13791
+ "epoch": 0.48,
13792
+ "grad_norm": 3.4032084941864014,
13793
+ "learning_rate": 7.28135593220339e-06,
13794
+ "loss": 1.4256,
13795
+ "step": 19260
13796
+ },
13797
+ {
13798
+ "epoch": 0.48,
13799
+ "grad_norm": 6.437047481536865,
13800
+ "learning_rate": 7.274576271186441e-06,
13801
+ "loss": 1.3215,
13802
+ "step": 19270
13803
+ },
13804
+ {
13805
+ "epoch": 0.48,
13806
+ "grad_norm": 5.592834949493408,
13807
+ "learning_rate": 7.267796610169492e-06,
13808
+ "loss": 1.3466,
13809
+ "step": 19280
13810
+ },
13811
+ {
13812
+ "epoch": 0.48,
13813
+ "grad_norm": 10.394824028015137,
13814
+ "learning_rate": 7.261016949152543e-06,
13815
+ "loss": 1.3806,
13816
+ "step": 19290
13817
+ },
13818
+ {
13819
+ "epoch": 0.48,
13820
+ "grad_norm": 6.921483039855957,
13821
+ "learning_rate": 7.2542372881355936e-06,
13822
+ "loss": 1.5624,
13823
+ "step": 19300
13824
+ },
13825
+ {
13826
+ "epoch": 0.48,
13827
+ "grad_norm": 3.078634262084961,
13828
+ "learning_rate": 7.247457627118645e-06,
13829
+ "loss": 1.1216,
13830
+ "step": 19310
13831
+ },
13832
+ {
13833
+ "epoch": 0.48,
13834
+ "grad_norm": 6.003988742828369,
13835
+ "learning_rate": 7.2406779661016955e-06,
13836
+ "loss": 1.2759,
13837
+ "step": 19320
13838
+ },
13839
+ {
13840
+ "epoch": 0.48,
13841
+ "grad_norm": 2.419410467147827,
13842
+ "learning_rate": 7.233898305084747e-06,
13843
+ "loss": 1.2067,
13844
+ "step": 19330
13845
+ },
13846
+ {
13847
+ "epoch": 0.48,
13848
+ "grad_norm": 3.014220952987671,
13849
+ "learning_rate": 7.2271186440677975e-06,
13850
+ "loss": 1.1819,
13851
+ "step": 19340
13852
+ },
13853
+ {
13854
+ "epoch": 0.48,
13855
+ "grad_norm": 5.271297454833984,
13856
+ "learning_rate": 7.220338983050849e-06,
13857
+ "loss": 1.4742,
13858
+ "step": 19350
13859
+ },
13860
+ {
13861
+ "epoch": 0.48,
13862
+ "grad_norm": 2.456838607788086,
13863
+ "learning_rate": 7.2135593220338986e-06,
13864
+ "loss": 1.4656,
13865
+ "step": 19360
13866
+ },
13867
+ {
13868
+ "epoch": 0.48,
13869
+ "grad_norm": 5.350526332855225,
13870
+ "learning_rate": 7.206779661016949e-06,
13871
+ "loss": 1.1238,
13872
+ "step": 19370
13873
+ },
13874
+ {
13875
+ "epoch": 0.48,
13876
+ "grad_norm": 5.42751932144165,
13877
+ "learning_rate": 7.2000000000000005e-06,
13878
+ "loss": 1.4704,
13879
+ "step": 19380
13880
+ },
13881
+ {
13882
+ "epoch": 0.48,
13883
+ "grad_norm": 3.97116756439209,
13884
+ "learning_rate": 7.193220338983051e-06,
13885
+ "loss": 1.4455,
13886
+ "step": 19390
13887
+ },
13888
+ {
13889
+ "epoch": 0.48,
13890
+ "grad_norm": 5.157166481018066,
13891
+ "learning_rate": 7.1864406779661025e-06,
13892
+ "loss": 1.3725,
13893
+ "step": 19400
13894
+ },
13895
+ {
13896
+ "epoch": 0.49,
13897
+ "grad_norm": 4.2230072021484375,
13898
+ "learning_rate": 7.179661016949153e-06,
13899
+ "loss": 1.3768,
13900
+ "step": 19410
13901
+ },
13902
+ {
13903
+ "epoch": 0.49,
13904
+ "grad_norm": 5.513181686401367,
13905
+ "learning_rate": 7.1728813559322044e-06,
13906
+ "loss": 1.3018,
13907
+ "step": 19420
13908
+ },
13909
+ {
13910
+ "epoch": 0.49,
13911
+ "grad_norm": 7.353570461273193,
13912
+ "learning_rate": 7.166101694915255e-06,
13913
+ "loss": 1.513,
13914
+ "step": 19430
13915
+ },
13916
+ {
13917
+ "epoch": 0.49,
13918
+ "grad_norm": 6.263894081115723,
13919
+ "learning_rate": 7.159322033898306e-06,
13920
+ "loss": 1.4521,
13921
+ "step": 19440
13922
+ },
13923
+ {
13924
+ "epoch": 0.49,
13925
+ "grad_norm": 13.368189811706543,
13926
+ "learning_rate": 7.152542372881357e-06,
13927
+ "loss": 1.3158,
13928
+ "step": 19450
13929
+ },
13930
+ {
13931
+ "epoch": 0.49,
13932
+ "grad_norm": 4.644199371337891,
13933
+ "learning_rate": 7.145762711864407e-06,
13934
+ "loss": 1.4184,
13935
+ "step": 19460
13936
+ },
13937
+ {
13938
+ "epoch": 0.49,
13939
+ "grad_norm": 5.206189155578613,
13940
+ "learning_rate": 7.138983050847458e-06,
13941
+ "loss": 1.3848,
13942
+ "step": 19470
13943
+ },
13944
+ {
13945
+ "epoch": 0.49,
13946
+ "grad_norm": 7.3628153800964355,
13947
+ "learning_rate": 7.132203389830509e-06,
13948
+ "loss": 1.2512,
13949
+ "step": 19480
13950
+ },
13951
+ {
13952
+ "epoch": 0.49,
13953
+ "grad_norm": 7.519322395324707,
13954
+ "learning_rate": 7.12542372881356e-06,
13955
+ "loss": 1.261,
13956
+ "step": 19490
13957
+ },
13958
+ {
13959
+ "epoch": 0.49,
13960
+ "grad_norm": 1.5350415706634521,
13961
+ "learning_rate": 7.1186440677966106e-06,
13962
+ "loss": 1.2517,
13963
+ "step": 19500
13964
+ },
13965
+ {
13966
+ "epoch": 0.49,
13967
+ "eval_loss": 1.3194345235824585,
13968
+ "eval_runtime": 66.1305,
13969
+ "eval_samples_per_second": 15.122,
13970
+ "eval_steps_per_second": 15.122,
13971
+ "step": 19500
13972
+ },
13973
+ {
13974
+ "epoch": 0.49,
13975
+ "grad_norm": 3.0648765563964844,
13976
+ "learning_rate": 7.111864406779662e-06,
13977
+ "loss": 1.398,
13978
+ "step": 19510
13979
+ },
13980
+ {
13981
+ "epoch": 0.49,
13982
+ "grad_norm": 5.124095439910889,
13983
+ "learning_rate": 7.1050847457627125e-06,
13984
+ "loss": 1.404,
13985
+ "step": 19520
13986
+ },
13987
+ {
13988
+ "epoch": 0.49,
13989
+ "grad_norm": 2.8961544036865234,
13990
+ "learning_rate": 7.098305084745764e-06,
13991
+ "loss": 1.2651,
13992
+ "step": 19530
13993
+ },
13994
+ {
13995
+ "epoch": 0.49,
13996
+ "grad_norm": 3.0641605854034424,
13997
+ "learning_rate": 7.0915254237288145e-06,
13998
+ "loss": 1.3587,
13999
+ "step": 19540
14000
+ },
14001
+ {
14002
+ "epoch": 0.49,
14003
+ "grad_norm": 3.618454933166504,
14004
+ "learning_rate": 7.084745762711865e-06,
14005
+ "loss": 1.3533,
14006
+ "step": 19550
14007
+ },
14008
+ {
14009
+ "epoch": 0.49,
14010
+ "grad_norm": 3.3083536624908447,
14011
+ "learning_rate": 7.077966101694916e-06,
14012
+ "loss": 1.2859,
14013
+ "step": 19560
14014
+ },
14015
+ {
14016
+ "epoch": 0.49,
14017
+ "grad_norm": 1.260048508644104,
14018
+ "learning_rate": 7.071186440677966e-06,
14019
+ "loss": 1.2224,
14020
+ "step": 19570
14021
+ },
14022
+ {
14023
+ "epoch": 0.49,
14024
+ "grad_norm": 5.410586833953857,
14025
+ "learning_rate": 7.0644067796610175e-06,
14026
+ "loss": 1.2916,
14027
+ "step": 19580
14028
+ },
14029
+ {
14030
+ "epoch": 0.49,
14031
+ "grad_norm": 1.9295905828475952,
14032
+ "learning_rate": 7.057627118644068e-06,
14033
+ "loss": 1.3004,
14034
+ "step": 19590
14035
+ },
14036
+ {
14037
+ "epoch": 0.49,
14038
+ "grad_norm": 10.48406982421875,
14039
+ "learning_rate": 7.0508474576271195e-06,
14040
+ "loss": 1.3105,
14041
+ "step": 19600
14042
+ },
14043
+ {
14044
+ "epoch": 0.49,
14045
+ "grad_norm": 4.99778413772583,
14046
+ "learning_rate": 7.04406779661017e-06,
14047
+ "loss": 1.2481,
14048
+ "step": 19610
14049
+ },
14050
+ {
14051
+ "epoch": 0.49,
14052
+ "grad_norm": 5.476469993591309,
14053
+ "learning_rate": 7.037288135593221e-06,
14054
+ "loss": 1.345,
14055
+ "step": 19620
14056
+ },
14057
+ {
14058
+ "epoch": 0.49,
14059
+ "grad_norm": 7.250665664672852,
14060
+ "learning_rate": 7.030508474576272e-06,
14061
+ "loss": 1.2993,
14062
+ "step": 19630
14063
+ },
14064
+ {
14065
+ "epoch": 0.49,
14066
+ "grad_norm": 7.802820682525635,
14067
+ "learning_rate": 7.0237288135593225e-06,
14068
+ "loss": 1.3106,
14069
+ "step": 19640
14070
+ },
14071
+ {
14072
+ "epoch": 0.49,
14073
+ "grad_norm": 2.5077409744262695,
14074
+ "learning_rate": 7.016949152542374e-06,
14075
+ "loss": 1.1298,
14076
+ "step": 19650
14077
+ },
14078
+ {
14079
+ "epoch": 0.49,
14080
+ "grad_norm": 2.3012547492980957,
14081
+ "learning_rate": 7.0101694915254245e-06,
14082
+ "loss": 1.2944,
14083
+ "step": 19660
14084
+ },
14085
+ {
14086
+ "epoch": 0.49,
14087
+ "grad_norm": 10.731738090515137,
14088
+ "learning_rate": 7.003389830508475e-06,
14089
+ "loss": 1.2545,
14090
+ "step": 19670
14091
+ },
14092
+ {
14093
+ "epoch": 0.49,
14094
+ "grad_norm": 11.851224899291992,
14095
+ "learning_rate": 6.996610169491526e-06,
14096
+ "loss": 1.3364,
14097
+ "step": 19680
14098
+ },
14099
+ {
14100
+ "epoch": 0.49,
14101
+ "grad_norm": 2.060750722885132,
14102
+ "learning_rate": 6.989830508474576e-06,
14103
+ "loss": 1.2909,
14104
+ "step": 19690
14105
+ },
14106
+ {
14107
+ "epoch": 0.49,
14108
+ "grad_norm": 8.549054145812988,
14109
+ "learning_rate": 6.9830508474576275e-06,
14110
+ "loss": 1.405,
14111
+ "step": 19700
14112
+ },
14113
+ {
14114
+ "epoch": 0.49,
14115
+ "grad_norm": 5.37472677230835,
14116
+ "learning_rate": 6.976271186440678e-06,
14117
+ "loss": 1.5615,
14118
+ "step": 19710
14119
+ },
14120
+ {
14121
+ "epoch": 0.49,
14122
+ "grad_norm": 4.753462314605713,
14123
+ "learning_rate": 6.9694915254237295e-06,
14124
+ "loss": 1.362,
14125
+ "step": 19720
14126
+ },
14127
+ {
14128
+ "epoch": 0.49,
14129
+ "grad_norm": 6.330374717712402,
14130
+ "learning_rate": 6.96271186440678e-06,
14131
+ "loss": 1.3563,
14132
+ "step": 19730
14133
+ },
14134
+ {
14135
+ "epoch": 0.49,
14136
+ "grad_norm": 6.962733268737793,
14137
+ "learning_rate": 6.9559322033898315e-06,
14138
+ "loss": 1.3272,
14139
+ "step": 19740
14140
+ },
14141
+ {
14142
+ "epoch": 0.49,
14143
+ "grad_norm": 13.372142791748047,
14144
+ "learning_rate": 6.949152542372882e-06,
14145
+ "loss": 1.4422,
14146
+ "step": 19750
14147
+ },
14148
+ {
14149
+ "epoch": 0.49,
14150
+ "grad_norm": 4.561896800994873,
14151
+ "learning_rate": 6.942372881355933e-06,
14152
+ "loss": 1.3691,
14153
+ "step": 19760
14154
+ },
14155
+ {
14156
+ "epoch": 0.49,
14157
+ "grad_norm": 2.2547922134399414,
14158
+ "learning_rate": 6.935593220338983e-06,
14159
+ "loss": 1.4905,
14160
+ "step": 19770
14161
+ },
14162
+ {
14163
+ "epoch": 0.49,
14164
+ "grad_norm": 4.15011739730835,
14165
+ "learning_rate": 6.928813559322034e-06,
14166
+ "loss": 1.3899,
14167
+ "step": 19780
14168
+ },
14169
+ {
14170
+ "epoch": 0.49,
14171
+ "grad_norm": 11.278037071228027,
14172
+ "learning_rate": 6.922033898305085e-06,
14173
+ "loss": 1.0803,
14174
+ "step": 19790
14175
+ },
14176
+ {
14177
+ "epoch": 0.49,
14178
+ "grad_norm": 9.100043296813965,
14179
+ "learning_rate": 6.915254237288136e-06,
14180
+ "loss": 1.4593,
14181
+ "step": 19800
14182
+ },
14183
+ {
14184
+ "epoch": 0.5,
14185
+ "grad_norm": 10.643383026123047,
14186
+ "learning_rate": 6.908474576271187e-06,
14187
+ "loss": 1.3571,
14188
+ "step": 19810
14189
+ },
14190
+ {
14191
+ "epoch": 0.5,
14192
+ "grad_norm": 11.314017295837402,
14193
+ "learning_rate": 6.901694915254238e-06,
14194
+ "loss": 1.2376,
14195
+ "step": 19820
14196
+ },
14197
+ {
14198
+ "epoch": 0.5,
14199
+ "grad_norm": 3.855220317840576,
14200
+ "learning_rate": 6.894915254237289e-06,
14201
+ "loss": 1.1095,
14202
+ "step": 19830
14203
+ },
14204
+ {
14205
+ "epoch": 0.5,
14206
+ "grad_norm": 5.896849155426025,
14207
+ "learning_rate": 6.8881355932203395e-06,
14208
+ "loss": 1.2078,
14209
+ "step": 19840
14210
+ },
14211
+ {
14212
+ "epoch": 0.5,
14213
+ "grad_norm": 2.1142632961273193,
14214
+ "learning_rate": 6.881355932203391e-06,
14215
+ "loss": 1.3017,
14216
+ "step": 19850
14217
+ },
14218
+ {
14219
+ "epoch": 0.5,
14220
+ "grad_norm": 7.116094589233398,
14221
+ "learning_rate": 6.8745762711864415e-06,
14222
+ "loss": 1.217,
14223
+ "step": 19860
14224
+ },
14225
+ {
14226
+ "epoch": 0.5,
14227
+ "grad_norm": 3.392282724380493,
14228
+ "learning_rate": 6.867796610169493e-06,
14229
+ "loss": 1.4027,
14230
+ "step": 19870
14231
+ },
14232
+ {
14233
+ "epoch": 0.5,
14234
+ "grad_norm": 5.760110855102539,
14235
+ "learning_rate": 6.861016949152543e-06,
14236
+ "loss": 1.2888,
14237
+ "step": 19880
14238
+ },
14239
+ {
14240
+ "epoch": 0.5,
14241
+ "grad_norm": 2.312904119491577,
14242
+ "learning_rate": 6.854237288135593e-06,
14243
+ "loss": 1.39,
14244
+ "step": 19890
14245
+ },
14246
+ {
14247
+ "epoch": 0.5,
14248
+ "grad_norm": 9.748230934143066,
14249
+ "learning_rate": 6.8474576271186445e-06,
14250
+ "loss": 1.408,
14251
+ "step": 19900
14252
+ },
14253
+ {
14254
+ "epoch": 0.5,
14255
+ "grad_norm": 8.029982566833496,
14256
+ "learning_rate": 6.840677966101695e-06,
14257
+ "loss": 1.2631,
14258
+ "step": 19910
14259
+ },
14260
+ {
14261
+ "epoch": 0.5,
14262
+ "grad_norm": 5.408463954925537,
14263
+ "learning_rate": 6.8338983050847465e-06,
14264
+ "loss": 1.3545,
14265
+ "step": 19920
14266
+ },
14267
+ {
14268
+ "epoch": 0.5,
14269
+ "grad_norm": 4.480403900146484,
14270
+ "learning_rate": 6.827118644067797e-06,
14271
+ "loss": 1.4549,
14272
+ "step": 19930
14273
+ },
14274
+ {
14275
+ "epoch": 0.5,
14276
+ "grad_norm": 8.981225967407227,
14277
+ "learning_rate": 6.8203389830508485e-06,
14278
+ "loss": 1.3352,
14279
+ "step": 19940
14280
+ },
14281
+ {
14282
+ "epoch": 0.5,
14283
+ "grad_norm": 1.9335066080093384,
14284
+ "learning_rate": 6.813559322033899e-06,
14285
+ "loss": 1.2825,
14286
+ "step": 19950
14287
+ },
14288
+ {
14289
+ "epoch": 0.5,
14290
+ "grad_norm": 7.6206464767456055,
14291
+ "learning_rate": 6.80677966101695e-06,
14292
+ "loss": 1.4976,
14293
+ "step": 19960
14294
+ },
14295
+ {
14296
+ "epoch": 0.5,
14297
+ "grad_norm": 14.687816619873047,
14298
+ "learning_rate": 6.800000000000001e-06,
14299
+ "loss": 1.4687,
14300
+ "step": 19970
14301
+ },
14302
+ {
14303
+ "epoch": 0.5,
14304
+ "grad_norm": 9.034219741821289,
14305
+ "learning_rate": 6.793220338983051e-06,
14306
+ "loss": 1.4715,
14307
+ "step": 19980
14308
+ },
14309
+ {
14310
+ "epoch": 0.5,
14311
+ "grad_norm": 8.120539665222168,
14312
+ "learning_rate": 6.786440677966102e-06,
14313
+ "loss": 1.4331,
14314
+ "step": 19990
14315
+ },
14316
+ {
14317
+ "epoch": 0.5,
14318
+ "grad_norm": 6.594362258911133,
14319
+ "learning_rate": 6.779661016949153e-06,
14320
+ "loss": 1.4516,
14321
+ "step": 20000
14322
+ },
14323
+ {
14324
+ "epoch": 0.5,
14325
+ "eval_loss": 1.348677158355713,
14326
+ "eval_runtime": 66.1188,
14327
+ "eval_samples_per_second": 15.124,
14328
+ "eval_steps_per_second": 15.124,
14329
+ "step": 20000
14330
  }
14331
  ],
14332
  "logging_steps": 10,
 
14334
  "num_input_tokens_seen": 0,
14335
  "num_train_epochs": 1,
14336
  "save_steps": 2500,
14337
+ "total_flos": 3.2204251987968e+17,
14338
  "train_batch_size": 1,
14339
  "trial_name": null,
14340
  "trial_params": null