kanishka
/

smolm-autoreg-bpe-counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new-3e-4

kanishka committed on Jun 22

Commit

9f43b63

•

1 Parent(s): c06cd83

End of training

Browse files

Files changed (6) hide show

README.md +14 -2
all_results.json +15 -0
eval_results.json +10 -0
runs/Jun21_17-29-53_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1719090849.phyl-ling-p01.la.utexas.edu.1078584.1 +3 -0
train_results.json +8 -0
trainer_state.json +2814 -0

README.md CHANGED Viewed

@@ -1,11 +1,23 @@
 ---
 tags:
 - generated_from_trainer
 metrics:
 - accuracy
 model-index:
 - name: smolm-autoreg-bpe-counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new-3e-4
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
 # smolm-autoreg-bpe-counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new-3e-4
-This model was trained from scratch on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.4250
 - Accuracy: 0.4092

 ---
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new
 metrics:
 - accuracy
 model-index:
 - name: smolm-autoreg-bpe-counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new-3e-4
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new
+      type: kanishka/counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.4091656007481136
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # smolm-autoreg-bpe-counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new-3e-4
+This model was trained from scratch on the kanishka/counterfactual_babylm_aann_indef_articles_with_pl_nouns_removal_new dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.4250
 - Accuracy: 0.4092

all_results.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.4091656007481136,
+    "eval_loss": 3.4249629974365234,
+    "eval_runtime": 155.4739,
+    "eval_samples": 57920,
+    "eval_samples_per_second": 372.538,
+    "eval_steps_per_second": 5.821,
+    "perplexity": 30.72150805591893,
+    "train_loss": 3.055753937752016,
+    "train_runtime": 81288.5778,
+    "train_samples": 595173,
+    "train_samples_per_second": 146.435,
+    "train_steps_per_second": 4.576
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.4091656007481136,
+    "eval_loss": 3.4249629974365234,
+    "eval_runtime": 155.4739,
+    "eval_samples": 57920,
+    "eval_samples_per_second": 372.538,
+    "eval_steps_per_second": 5.821,
+    "perplexity": 30.72150805591893
+}

runs/Jun21_17-29-53_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1719090849.phyl-ling-p01.la.utexas.edu.1078584.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:176c53e59369deeb3b3afd7578a977f14dfc98ce86905ebfe845e3d6d2831ad9
+size 417

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 20.0,
+    "train_loss": 3.055753937752016,
+    "train_runtime": 81288.5778,
+    "train_samples": 595173,
+    "train_samples_per_second": 146.435,
+    "train_steps_per_second": 4.576
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2814 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 20.0,
+  "eval_steps": 500,
+  "global_step": 372000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.8259170651435852,
+      "learning_rate": 9.375e-06,
+      "loss": 6.8606,
+      "step": 1000
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.7919405102729797,
+      "learning_rate": 1.875e-05,
+      "loss": 5.3621,
+      "step": 2000
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9477707147598267,
+      "learning_rate": 2.8125e-05,
+      "loss": 5.0308,
+      "step": 3000
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.9951394200325012,
+      "learning_rate": 3.75e-05,
+      "loss": 4.8123,
+      "step": 4000
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.9227204918861389,
+      "learning_rate": 4.6874999999999994e-05,
+      "loss": 4.6357,
+      "step": 5000
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.9371578097343445,
+      "learning_rate": 5.625e-05,
+      "loss": 4.4935,
+      "step": 6000
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.9119696617126465,
+      "learning_rate": 6.5625e-05,
+      "loss": 4.3815,
+      "step": 7000
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.9198476672172546,
+      "learning_rate": 7.5e-05,
+      "loss": 4.2884,
+      "step": 8000
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8627898693084717,
+      "learning_rate": 8.437499999999999e-05,
+      "loss": 4.208,
+      "step": 9000
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.8556337356567383,
+      "learning_rate": 9.374999999999999e-05,
+      "loss": 4.1353,
+      "step": 10000
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.8405832052230835,
+      "learning_rate": 0.000103115625,
+      "loss": 4.0695,
+      "step": 11000
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8061829805374146,
+      "learning_rate": 0.000112490625,
+      "loss": 4.0086,
+      "step": 12000
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8495123386383057,
+      "learning_rate": 0.000121865625,
+      "loss": 3.946,
+      "step": 13000
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.9331762790679932,
+      "learning_rate": 0.000131221875,
+      "loss": 3.8919,
+      "step": 14000
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.8087450265884399,
+      "learning_rate": 0.000140596875,
+      "loss": 3.8453,
+      "step": 15000
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.78471440076828,
+      "learning_rate": 0.0001499625,
+      "loss": 3.8107,
+      "step": 16000
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.7369292974472046,
+      "learning_rate": 0.000159328125,
+      "loss": 3.7708,
+      "step": 17000
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7355145812034607,
+      "learning_rate": 0.000168703125,
+      "loss": 3.7386,
+      "step": 18000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.3445651589741385,
+      "eval_loss": 3.938964366912842,
+      "eval_runtime": 153.6743,
+      "eval_samples_per_second": 376.901,
+      "eval_steps_per_second": 5.889,
+      "step": 18600
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.745087206363678,
+      "learning_rate": 0.00017806875,
+      "loss": 3.7067,
+      "step": 19000
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.9550297260284424,
+      "learning_rate": 0.00018744374999999999,
+      "loss": 3.6708,
+      "step": 20000
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.6702427864074707,
+      "learning_rate": 0.00019680937499999996,
+      "loss": 3.6515,
+      "step": 21000
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.7245630025863647,
+      "learning_rate": 0.00020617499999999998,
+      "loss": 3.6354,
+      "step": 22000
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.6476539969444275,
+      "learning_rate": 0.00021554999999999998,
+      "loss": 3.6117,
+      "step": 23000
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.6578693389892578,
+      "learning_rate": 0.00022492499999999998,
+      "loss": 3.5979,
+      "step": 24000
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.6044876575469971,
+      "learning_rate": 0.00023429999999999998,
+      "loss": 3.5757,
+      "step": 25000
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.6016374826431274,
+      "learning_rate": 0.00024366562499999997,
+      "loss": 3.572,
+      "step": 26000
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.56680828332901,
+      "learning_rate": 0.000253040625,
+      "loss": 3.552,
+      "step": 27000
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.5599876046180725,
+      "learning_rate": 0.00026240624999999994,
+      "loss": 3.541,
+      "step": 28000
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.5561350584030151,
+      "learning_rate": 0.00027178124999999994,
+      "loss": 3.5302,
+      "step": 29000
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.5627046227455139,
+      "learning_rate": 0.00028115624999999994,
+      "loss": 3.5212,
+      "step": 30000
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.5255282521247864,
+      "learning_rate": 0.000290521875,
+      "loss": 3.5043,
+      "step": 31000
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5291178822517395,
+      "learning_rate": 0.000299896875,
+      "loss": 3.4957,
+      "step": 32000
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 0.48417210578918457,
+      "learning_rate": 0.0002991282352941176,
+      "loss": 3.4827,
+      "step": 33000
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.49497950077056885,
+      "learning_rate": 0.00029824588235294113,
+      "loss": 3.4696,
+      "step": 34000
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.45140179991722107,
+      "learning_rate": 0.0002973635294117647,
+      "loss": 3.4628,
+      "step": 35000
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.4502553939819336,
+      "learning_rate": 0.00029648205882352937,
+      "loss": 3.4448,
+      "step": 36000
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.47246313095092773,
+      "learning_rate": 0.0002955997058823529,
+      "loss": 3.4323,
+      "step": 37000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.37465563664896706,
+      "eval_loss": 3.6439149379730225,
+      "eval_runtime": 155.8711,
+      "eval_samples_per_second": 371.589,
+      "eval_steps_per_second": 5.806,
+      "step": 37200
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.4650474488735199,
+      "learning_rate": 0.00029471735294117645,
+      "loss": 3.3917,
+      "step": 38000
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.4563141465187073,
+      "learning_rate": 0.00029383588235294114,
+      "loss": 3.3726,
+      "step": 39000
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 0.42616134881973267,
+      "learning_rate": 0.0002929535294117647,
+      "loss": 3.3674,
+      "step": 40000
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.4414719343185425,
+      "learning_rate": 0.00029207117647058817,
+      "loss": 3.3679,
+      "step": 41000
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.42499449849128723,
+      "learning_rate": 0.0002911897058823529,
+      "loss": 3.3584,
+      "step": 42000
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 0.4124417304992676,
+      "learning_rate": 0.00029030735294117646,
+      "loss": 3.3522,
+      "step": 43000
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 0.3980655372142792,
+      "learning_rate": 0.00028942499999999995,
+      "loss": 3.3481,
+      "step": 44000
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.40041157603263855,
+      "learning_rate": 0.0002885426470588235,
+      "loss": 3.3422,
+      "step": 45000
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 0.399441123008728,
+      "learning_rate": 0.00028766117647058823,
+      "loss": 3.3346,
+      "step": 46000
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 0.39357373118400574,
+      "learning_rate": 0.0002867788235294118,
+      "loss": 3.3326,
+      "step": 47000
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.4075976610183716,
+      "learning_rate": 0.00028589647058823526,
+      "loss": 3.3236,
+      "step": 48000
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 0.3738775849342346,
+      "learning_rate": 0.00028501499999999995,
+      "loss": 3.3247,
+      "step": 49000
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 0.37619298696517944,
+      "learning_rate": 0.0002841326470588235,
+      "loss": 3.3113,
+      "step": 50000
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.37718719244003296,
+      "learning_rate": 0.00028325029411764704,
+      "loss": 3.308,
+      "step": 51000
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.3818044066429138,
+      "learning_rate": 0.0002823679411764706,
+      "loss": 3.3057,
+      "step": 52000
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 0.3819046914577484,
+      "learning_rate": 0.00028148647058823527,
+      "loss": 3.2991,
+      "step": 53000
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.3860973119735718,
+      "learning_rate": 0.0002806041176470588,
+      "loss": 3.2951,
+      "step": 54000
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.3734324276447296,
+      "learning_rate": 0.00027972176470588236,
+      "loss": 3.2906,
+      "step": 55000
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.3877894103231077,
+      "eval_loss": 3.5132408142089844,
+      "eval_runtime": 154.2632,
+      "eval_samples_per_second": 375.462,
+      "eval_steps_per_second": 5.867,
+      "step": 55800
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 0.3738992214202881,
+      "learning_rate": 0.00027884029411764705,
+      "loss": 3.271,
+      "step": 56000
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 0.3885354697704315,
+      "learning_rate": 0.0002779579411764706,
+      "loss": 3.2208,
+      "step": 57000
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.3763429522514343,
+      "learning_rate": 0.0002770755882352941,
+      "loss": 3.2269,
+      "step": 58000
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 0.37402838468551636,
+      "learning_rate": 0.00027619411764705877,
+      "loss": 3.2296,
+      "step": 59000
+    },
+    {
+      "epoch": 3.23,
+      "grad_norm": 0.40755459666252136,
+      "learning_rate": 0.0002753117647058823,
+      "loss": 3.2297,
+      "step": 60000
+    },
+    {
+      "epoch": 3.28,
+      "grad_norm": 0.36526888608932495,
+      "learning_rate": 0.00027442941176470585,
+      "loss": 3.2264,
+      "step": 61000
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 0.3764183521270752,
+      "learning_rate": 0.0002735470588235294,
+      "loss": 3.2268,
+      "step": 62000
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 0.3648715615272522,
+      "learning_rate": 0.0002726664705882353,
+      "loss": 3.2206,
+      "step": 63000
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.3624918758869171,
+      "learning_rate": 0.00027178411764705883,
+      "loss": 3.2268,
+      "step": 64000
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 0.3546142876148224,
+      "learning_rate": 0.0002709017647058823,
+      "loss": 3.2182,
+      "step": 65000
+    },
+    {
+      "epoch": 3.55,
+      "grad_norm": 0.36598387360572815,
+      "learning_rate": 0.00027002029411764706,
+      "loss": 3.2185,
+      "step": 66000
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.34152278304100037,
+      "learning_rate": 0.00026913794117647055,
+      "loss": 3.2204,
+      "step": 67000
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 0.3589687943458557,
+      "learning_rate": 0.0002682555882352941,
+      "loss": 3.2158,
+      "step": 68000
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 0.36120596528053284,
+      "learning_rate": 0.0002673741176470588,
+      "loss": 3.2098,
+      "step": 69000
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.35094866156578064,
+      "learning_rate": 0.0002664917647058823,
+      "loss": 3.2098,
+      "step": 70000
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 0.3578512966632843,
+      "learning_rate": 0.000265610294117647,
+      "loss": 3.2093,
+      "step": 71000
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 0.34762489795684814,
+      "learning_rate": 0.00026472794117647056,
+      "loss": 3.2072,
+      "step": 72000
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.3419814109802246,
+      "learning_rate": 0.0002638455882352941,
+      "loss": 3.2054,
+      "step": 73000
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 0.3337661027908325,
+      "learning_rate": 0.0002629641176470588,
+      "loss": 3.2008,
+      "step": 74000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.3951671566309091,
+      "eval_loss": 3.4662108421325684,
+      "eval_runtime": 153.9091,
+      "eval_samples_per_second": 376.326,
+      "eval_steps_per_second": 5.88,
+      "step": 74400
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 0.36316078901290894,
+      "learning_rate": 0.00026208176470588233,
+      "loss": 3.1617,
+      "step": 75000
+    },
+    {
+      "epoch": 4.09,
+      "grad_norm": 0.35584449768066406,
+      "learning_rate": 0.0002611994117647058,
+      "loss": 3.1371,
+      "step": 76000
+    },
+    {
+      "epoch": 4.14,
+      "grad_norm": 0.36811745166778564,
+      "learning_rate": 0.00026031705882352936,
+      "loss": 3.1406,
+      "step": 77000
+    },
+    {
+      "epoch": 4.19,
+      "grad_norm": 0.37321680784225464,
+      "learning_rate": 0.0002594355882352941,
+      "loss": 3.1471,
+      "step": 78000
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 0.3459160029888153,
+      "learning_rate": 0.00025855323529411765,
+      "loss": 3.1461,
+      "step": 79000
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 0.3662152588367462,
+      "learning_rate": 0.00025767176470588234,
+      "loss": 3.1455,
+      "step": 80000
+    },
+    {
+      "epoch": 4.35,
+      "grad_norm": 0.35658517479896545,
+      "learning_rate": 0.0002567894117647059,
+      "loss": 3.1517,
+      "step": 81000
+    },
+    {
+      "epoch": 4.41,
+      "grad_norm": 0.3443373739719391,
+      "learning_rate": 0.00025590794117647057,
+      "loss": 3.1475,
+      "step": 82000
+    },
+    {
+      "epoch": 4.46,
+      "grad_norm": 0.34918272495269775,
+      "learning_rate": 0.0002550264705882353,
+      "loss": 3.1517,
+      "step": 83000
+    },
+    {
+      "epoch": 4.52,
+      "grad_norm": 0.3572300672531128,
+      "learning_rate": 0.0002541441176470588,
+      "loss": 3.1507,
+      "step": 84000
+    },
+    {
+      "epoch": 4.57,
+      "grad_norm": 0.35121145844459534,
+      "learning_rate": 0.00025326176470588234,
+      "loss": 3.15,
+      "step": 85000
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 0.3452964723110199,
+      "learning_rate": 0.00025238029411764703,
+      "loss": 3.1446,
+      "step": 86000
+    },
+    {
+      "epoch": 4.68,
+      "grad_norm": 0.3484768271446228,
+      "learning_rate": 0.0002514979411764706,
+      "loss": 3.1492,
+      "step": 87000
+    },
+    {
+      "epoch": 4.73,
+      "grad_norm": 0.34911030530929565,
+      "learning_rate": 0.0002506155882352941,
+      "loss": 3.1486,
+      "step": 88000
+    },
+    {
+      "epoch": 4.78,
+      "grad_norm": 0.349050909280777,
+      "learning_rate": 0.0002497332352941176,
+      "loss": 3.1433,
+      "step": 89000
+    },
+    {
+      "epoch": 4.84,
+      "grad_norm": 0.35526707768440247,
+      "learning_rate": 0.00024885176470588235,
+      "loss": 3.1434,
+      "step": 90000
+    },
+    {
+      "epoch": 4.89,
+      "grad_norm": 0.32889583706855774,
+      "learning_rate": 0.00024797029411764704,
+      "loss": 3.1436,
+      "step": 91000
+    },
+    {
+      "epoch": 4.95,
+      "grad_norm": 0.34612250328063965,
+      "learning_rate": 0.0002470879411764706,
+      "loss": 3.1419,
+      "step": 92000
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.8200662136077881,
+      "learning_rate": 0.00024620647058823527,
+      "loss": 3.1424,
+      "step": 93000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.3987985575166068,
+      "eval_loss": 3.425178289413452,
+      "eval_runtime": 154.096,
+      "eval_samples_per_second": 375.87,
+      "eval_steps_per_second": 5.873,
+      "step": 93000
+    },
+    {
+      "epoch": 5.05,
+      "grad_norm": 0.37201544642448425,
+      "learning_rate": 0.0002453241176470588,
+      "loss": 3.0771,
+      "step": 94000
+    },
+    {
+      "epoch": 5.11,
+      "grad_norm": 0.34167686104774475,
+      "learning_rate": 0.0002444417647058823,
+      "loss": 3.0806,
+      "step": 95000
+    },
+    {
+      "epoch": 5.16,
+      "grad_norm": 0.3650292158126831,
+      "learning_rate": 0.00024355941176470585,
+      "loss": 3.0847,
+      "step": 96000
+    },
+    {
+      "epoch": 5.22,
+      "grad_norm": 0.3709043860435486,
+      "learning_rate": 0.00024267794117647056,
+      "loss": 3.0885,
+      "step": 97000
+    },
+    {
+      "epoch": 5.27,
+      "grad_norm": 0.3401245176792145,
+      "learning_rate": 0.0002417955882352941,
+      "loss": 3.0927,
+      "step": 98000
+    },
+    {
+      "epoch": 5.32,
+      "grad_norm": 0.37579312920570374,
+      "learning_rate": 0.00024091323529411765,
+      "loss": 3.0929,
+      "step": 99000
+    },
+    {
+      "epoch": 5.38,
+      "grad_norm": 0.3333818316459656,
+      "learning_rate": 0.00024003176470588234,
+      "loss": 3.0932,
+      "step": 100000
+    },
+    {
+      "epoch": 5.43,
+      "grad_norm": 0.35665878653526306,
+      "learning_rate": 0.00023914941176470585,
+      "loss": 3.0917,
+      "step": 101000
+    },
+    {
+      "epoch": 5.48,
+      "grad_norm": 0.356718510389328,
+      "learning_rate": 0.00023826705882352937,
+      "loss": 3.0903,
+      "step": 102000
+    },
+    {
+      "epoch": 5.54,
+      "grad_norm": 0.3443525433540344,
+      "learning_rate": 0.0002373847058823529,
+      "loss": 3.0916,
+      "step": 103000
+    },
+    {
+      "epoch": 5.59,
+      "grad_norm": 0.35697805881500244,
+      "learning_rate": 0.00023650323529411763,
+      "loss": 3.0911,
+      "step": 104000
+    },
+    {
+      "epoch": 5.65,
+      "grad_norm": 0.3541969060897827,
+      "learning_rate": 0.00023562176470588234,
+      "loss": 3.0943,
+      "step": 105000
+    },
+    {
+      "epoch": 5.7,
+      "grad_norm": 0.3476627767086029,
+      "learning_rate": 0.00023473941176470586,
+      "loss": 3.0968,
+      "step": 106000
+    },
+    {
+      "epoch": 5.75,
+      "grad_norm": 0.3471356928348541,
+      "learning_rate": 0.0002338570588235294,
+      "loss": 3.0948,
+      "step": 107000
+    },
+    {
+      "epoch": 5.81,
+      "grad_norm": 0.3633851110935211,
+      "learning_rate": 0.00023297470588235292,
+      "loss": 3.0946,
+      "step": 108000
+    },
+    {
+      "epoch": 5.86,
+      "grad_norm": 0.3264976441860199,
+      "learning_rate": 0.00023209323529411763,
+      "loss": 3.0992,
+      "step": 109000
+    },
+    {
+      "epoch": 5.91,
+      "grad_norm": 0.3441069722175598,
+      "learning_rate": 0.00023121176470588232,
+      "loss": 3.0985,
+      "step": 110000
+    },
+    {
+      "epoch": 5.97,
+      "grad_norm": 0.34033203125,
+      "learning_rate": 0.00023032941176470584,
+      "loss": 3.0983,
+      "step": 111000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.40229210557430617,
+      "eval_loss": 3.4145569801330566,
+      "eval_runtime": 154.4814,
+      "eval_samples_per_second": 374.932,
+      "eval_steps_per_second": 5.858,
+      "step": 111600
+    },
+    {
+      "epoch": 6.02,
+      "grad_norm": 0.36109837889671326,
+      "learning_rate": 0.00022944705882352938,
+      "loss": 3.0671,
+      "step": 112000
+    },
+    {
+      "epoch": 6.08,
+      "grad_norm": 0.35247114300727844,
+      "learning_rate": 0.00022856470588235292,
+      "loss": 3.0266,
+      "step": 113000
+    },
+    {
+      "epoch": 6.13,
+      "grad_norm": 0.3495902419090271,
+      "learning_rate": 0.00022768323529411764,
+      "loss": 3.0356,
+      "step": 114000
+    },
+    {
+      "epoch": 6.18,
+      "grad_norm": 0.34144967794418335,
+      "learning_rate": 0.00022680176470588233,
+      "loss": 3.0441,
+      "step": 115000
+    },
+    {
+      "epoch": 6.24,
+      "grad_norm": 0.3760474920272827,
+      "learning_rate": 0.00022591941176470587,
+      "loss": 3.043,
+      "step": 116000
+    },
+    {
+      "epoch": 6.29,
+      "grad_norm": 0.34772148728370667,
+      "learning_rate": 0.00022503705882352942,
+      "loss": 3.0446,
+      "step": 117000
+    },
+    {
+      "epoch": 6.34,
+      "grad_norm": 0.3700126111507416,
+      "learning_rate": 0.00022415470588235293,
+      "loss": 3.0467,
+      "step": 118000
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 0.34513553977012634,
+      "learning_rate": 0.00022327323529411762,
+      "loss": 3.0465,
+      "step": 119000
+    },
+    {
+      "epoch": 6.45,
+      "grad_norm": 0.3503216505050659,
+      "learning_rate": 0.00022239176470588234,
+      "loss": 3.0513,
+      "step": 120000
+    },
+    {
+      "epoch": 6.51,
+      "grad_norm": 0.35930097103118896,
+      "learning_rate": 0.00022150941176470585,
+      "loss": 3.0512,
+      "step": 121000
+    },
+    {
+      "epoch": 6.56,
+      "grad_norm": 0.3546220362186432,
+      "learning_rate": 0.0002206270588235294,
+      "loss": 3.0504,
+      "step": 122000
+    },
+    {
+      "epoch": 6.61,
+      "grad_norm": 0.36518198251724243,
+      "learning_rate": 0.0002197447058823529,
+      "loss": 3.0584,
+      "step": 123000
+    },
+    {
+      "epoch": 6.67,
+      "grad_norm": 0.34408724308013916,
+      "learning_rate": 0.00021886323529411763,
+      "loss": 3.0513,
+      "step": 124000
+    },
+    {
+      "epoch": 6.72,
+      "grad_norm": 0.34966805577278137,
+      "learning_rate": 0.00021798088235294117,
+      "loss": 3.0594,
+      "step": 125000
+    },
+    {
+      "epoch": 6.77,
+      "grad_norm": 0.32780104875564575,
+      "learning_rate": 0.00021709941176470589,
+      "loss": 3.0573,
+      "step": 126000
+    },
+    {
+      "epoch": 6.83,
+      "grad_norm": 0.3445553779602051,
+      "learning_rate": 0.00021621794117647055,
+      "loss": 3.0567,
+      "step": 127000
+    },
+    {
+      "epoch": 6.88,
+      "grad_norm": 0.3367459177970886,
+      "learning_rate": 0.0002153355882352941,
+      "loss": 3.0543,
+      "step": 128000
+    },
+    {
+      "epoch": 6.94,
+      "grad_norm": 0.33958423137664795,
+      "learning_rate": 0.00021445323529411763,
+      "loss": 3.0624,
+      "step": 129000
+    },
+    {
+      "epoch": 6.99,
+      "grad_norm": 0.3499571979045868,
+      "learning_rate": 0.00021357176470588232,
+      "loss": 3.061,
+      "step": 130000
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.40386995614506527,
+      "eval_loss": 3.3960554599761963,
+      "eval_runtime": 154.1063,
+      "eval_samples_per_second": 375.844,
+      "eval_steps_per_second": 5.873,
+      "step": 130200
+    },
+    {
+      "epoch": 7.04,
+      "grad_norm": 0.3685757517814636,
+      "learning_rate": 0.00021268941176470586,
+      "loss": 2.9998,
+      "step": 131000
+    },
+    {
+      "epoch": 7.1,
+      "grad_norm": 0.35535717010498047,
+      "learning_rate": 0.0002118070588235294,
+      "loss": 2.9926,
+      "step": 132000
+    },
+    {
+      "epoch": 7.15,
+      "grad_norm": 0.36104437708854675,
+      "learning_rate": 0.0002109255882352941,
+      "loss": 2.9975,
+      "step": 133000
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 0.36811739206314087,
+      "learning_rate": 0.00021004323529411764,
+      "loss": 3.0098,
+      "step": 134000
+    },
+    {
+      "epoch": 7.26,
+      "grad_norm": 0.36876553297042847,
+      "learning_rate": 0.00020916088235294118,
+      "loss": 3.0089,
+      "step": 135000
+    },
+    {
+      "epoch": 7.31,
+      "grad_norm": 0.35547611117362976,
+      "learning_rate": 0.00020827852941176467,
+      "loss": 3.0134,
+      "step": 136000
+    },
+    {
+      "epoch": 7.37,
+      "grad_norm": 0.3423675000667572,
+      "learning_rate": 0.0002073970588235294,
+      "loss": 3.0142,
+      "step": 137000
+    },
+    {
+      "epoch": 7.42,
+      "grad_norm": 0.34084853529930115,
+      "learning_rate": 0.0002065155882352941,
+      "loss": 3.0152,
+      "step": 138000
+    },
+    {
+      "epoch": 7.47,
+      "grad_norm": 0.3496473431587219,
+      "learning_rate": 0.00020563323529411762,
+      "loss": 3.0168,
+      "step": 139000
+    },
+    {
+      "epoch": 7.53,
+      "grad_norm": 0.3702958822250366,
+      "learning_rate": 0.00020475176470588234,
+      "loss": 3.0159,
+      "step": 140000
+    },
+    {
+      "epoch": 7.58,
+      "grad_norm": 0.36530712246894836,
+      "learning_rate": 0.00020386941176470588,
+      "loss": 3.0178,
+      "step": 141000
+    },
+    {
+      "epoch": 7.63,
+      "grad_norm": 0.38187021017074585,
+      "learning_rate": 0.0002029870588235294,
+      "loss": 3.0227,
+      "step": 142000
+    },
+    {
+      "epoch": 7.69,
+      "grad_norm": 0.3322228491306305,
+      "learning_rate": 0.00020210470588235294,
+      "loss": 3.0172,
+      "step": 143000
+    },
+    {
+      "epoch": 7.74,
+      "grad_norm": 0.3619314730167389,
+      "learning_rate": 0.00020122235294117648,
+      "loss": 3.0196,
+      "step": 144000
+    },
+    {
+      "epoch": 7.8,
+      "grad_norm": 0.3536997437477112,
+      "learning_rate": 0.00020034088235294114,
+      "loss": 3.023,
+      "step": 145000
+    },
+    {
+      "epoch": 7.85,
+      "grad_norm": 0.34041669964790344,
+      "learning_rate": 0.00019945852941176468,
+      "loss": 3.0198,
+      "step": 146000
+    },
+    {
+      "epoch": 7.9,
+      "grad_norm": 0.35122403502464294,
+      "learning_rate": 0.0001985770588235294,
+      "loss": 3.0192,
+      "step": 147000
+    },
+    {
+      "epoch": 7.96,
+      "grad_norm": 0.3664146363735199,
+      "learning_rate": 0.00019769470588235292,
+      "loss": 3.0241,
+      "step": 148000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.4060721752262614,
+      "eval_loss": 3.367530584335327,
+      "eval_runtime": 153.8583,
+      "eval_samples_per_second": 376.45,
+      "eval_steps_per_second": 5.882,
+      "step": 148800
+    },
+    {
+      "epoch": 8.01,
+      "grad_norm": 0.36462950706481934,
+      "learning_rate": 0.00019681323529411763,
+      "loss": 3.0094,
+      "step": 149000
+    },
+    {
+      "epoch": 8.06,
+      "grad_norm": 0.3626813590526581,
+      "learning_rate": 0.00019593088235294118,
+      "loss": 2.9568,
+      "step": 150000
+    },
+    {
+      "epoch": 8.12,
+      "grad_norm": 0.36206358671188354,
+      "learning_rate": 0.0001950485294117647,
+      "loss": 2.9686,
+      "step": 151000
+    },
+    {
+      "epoch": 8.17,
+      "grad_norm": 0.3388898968696594,
+      "learning_rate": 0.0001941670588235294,
+      "loss": 2.9665,
+      "step": 152000
+    },
+    {
+      "epoch": 8.23,
+      "grad_norm": 0.3698641359806061,
+      "learning_rate": 0.0001932855882352941,
+      "loss": 2.9736,
+      "step": 153000
+    },
+    {
+      "epoch": 8.28,
+      "grad_norm": 0.3924822509288788,
+      "learning_rate": 0.0001924032352941176,
+      "loss": 2.9774,
+      "step": 154000
+    },
+    {
+      "epoch": 8.33,
+      "grad_norm": 0.35343238711357117,
+      "learning_rate": 0.00019152088235294116,
+      "loss": 2.9752,
+      "step": 155000
+    },
+    {
+      "epoch": 8.39,
+      "grad_norm": 0.3765462636947632,
+      "learning_rate": 0.00019063852941176467,
+      "loss": 2.9832,
+      "step": 156000
+    },
+    {
+      "epoch": 8.44,
+      "grad_norm": 0.35611894726753235,
+      "learning_rate": 0.00018975617647058821,
+      "loss": 2.9807,
+      "step": 157000
+    },
+    {
+      "epoch": 8.49,
+      "grad_norm": 0.34774050116539,
+      "learning_rate": 0.00018887470588235293,
+      "loss": 2.983,
+      "step": 158000
+    },
+    {
+      "epoch": 8.55,
+      "grad_norm": 0.35137563943862915,
+      "learning_rate": 0.00018799235294117647,
+      "loss": 2.986,
+      "step": 159000
+    },
+    {
+      "epoch": 8.6,
+      "grad_norm": 0.3774917423725128,
+      "learning_rate": 0.00018711088235294116,
+      "loss": 2.9858,
+      "step": 160000
+    },
+    {
+      "epoch": 8.66,
+      "grad_norm": 0.37547916173934937,
+      "learning_rate": 0.0001862285294117647,
+      "loss": 2.9906,
+      "step": 161000
+    },
+    {
+      "epoch": 8.71,
+      "grad_norm": 0.35136911273002625,
+      "learning_rate": 0.0001853461764705882,
+      "loss": 2.9897,
+      "step": 162000
+    },
+    {
+      "epoch": 8.76,
+      "grad_norm": 0.36987271904945374,
+      "learning_rate": 0.0001844647058823529,
+      "loss": 2.9947,
+      "step": 163000
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 0.35084474086761475,
+      "learning_rate": 0.00018358235294117645,
+      "loss": 2.9918,
+      "step": 164000
+    },
+    {
+      "epoch": 8.87,
+      "grad_norm": 0.35935935378074646,
+      "learning_rate": 0.00018269999999999997,
+      "loss": 2.9973,
+      "step": 165000
+    },
+    {
+      "epoch": 8.92,
+      "grad_norm": 0.3475947678089142,
+      "learning_rate": 0.00018181852941176468,
+      "loss": 2.9949,
+      "step": 166000
+    },
+    {
+      "epoch": 8.98,
+      "grad_norm": 0.35769519209861755,
+      "learning_rate": 0.00018093617647058823,
+      "loss": 2.9955,
+      "step": 167000
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.4071023765505084,
+      "eval_loss": 3.368950366973877,
+      "eval_runtime": 154.162,
+      "eval_samples_per_second": 375.709,
+      "eval_steps_per_second": 5.87,
+      "step": 167400
+    },
+    {
+      "epoch": 9.03,
+      "grad_norm": 0.36424490809440613,
+      "learning_rate": 0.00018005382352941174,
+      "loss": 2.9539,
+      "step": 168000
+    },
+    {
+      "epoch": 9.09,
+      "grad_norm": 0.3711196184158325,
+      "learning_rate": 0.00017917147058823529,
+      "loss": 2.9351,
+      "step": 169000
+    },
+    {
+      "epoch": 9.14,
+      "grad_norm": 0.37114304304122925,
+      "learning_rate": 0.00017828911764705883,
+      "loss": 2.9387,
+      "step": 170000
+    },
+    {
+      "epoch": 9.19,
+      "grad_norm": 0.37651318311691284,
+      "learning_rate": 0.00017740852941176466,
+      "loss": 2.9445,
+      "step": 171000
+    },
+    {
+      "epoch": 9.25,
+      "grad_norm": 0.3873383104801178,
+      "learning_rate": 0.0001765261764705882,
+      "loss": 2.9402,
+      "step": 172000
+    },
+    {
+      "epoch": 9.3,
+      "grad_norm": 0.3732360303401947,
+      "learning_rate": 0.00017564382352941175,
+      "loss": 2.9515,
+      "step": 173000
+    },
+    {
+      "epoch": 9.35,
+      "grad_norm": 0.3868846595287323,
+      "learning_rate": 0.00017476235294117647,
+      "loss": 2.9519,
+      "step": 174000
+    },
+    {
+      "epoch": 9.41,
+      "grad_norm": 0.3689320981502533,
+      "learning_rate": 0.00017387999999999998,
+      "loss": 2.9559,
+      "step": 175000
+    },
+    {
+      "epoch": 9.46,
+      "grad_norm": 0.3811360001564026,
+      "learning_rate": 0.00017299764705882352,
+      "loss": 2.9543,
+      "step": 176000
+    },
+    {
+      "epoch": 9.52,
+      "grad_norm": 0.3691989481449127,
+      "learning_rate": 0.00017211529411764704,
+      "loss": 2.9546,
+      "step": 177000
+    },
+    {
+      "epoch": 9.57,
+      "grad_norm": 0.35540491342544556,
+      "learning_rate": 0.00017123470588235293,
+      "loss": 2.9598,
+      "step": 178000
+    },
+    {
+      "epoch": 9.62,
+      "grad_norm": 0.36182084679603577,
+      "learning_rate": 0.00017035235294117645,
+      "loss": 2.9608,
+      "step": 179000
+    },
+    {
+      "epoch": 9.68,
+      "grad_norm": 0.3745622932910919,
+      "learning_rate": 0.00016946999999999996,
+      "loss": 2.9601,
+      "step": 180000
+    },
+    {
+      "epoch": 9.73,
+      "grad_norm": 0.3744942247867584,
+      "learning_rate": 0.0001685876470588235,
+      "loss": 2.962,
+      "step": 181000
+    },
+    {
+      "epoch": 9.78,
+      "grad_norm": 0.3739373981952667,
+      "learning_rate": 0.00016770617647058822,
+      "loss": 2.969,
+      "step": 182000
+    },
+    {
+      "epoch": 9.84,
+      "grad_norm": 0.38088759779930115,
+      "learning_rate": 0.00016682382352941174,
+      "loss": 2.9639,
+      "step": 183000
+    },
+    {
+      "epoch": 9.89,
+      "grad_norm": 0.36885249614715576,
+      "learning_rate": 0.00016594147058823528,
+      "loss": 2.9691,
+      "step": 184000
+    },
+    {
+      "epoch": 9.95,
+      "grad_norm": 0.37970590591430664,
+      "learning_rate": 0.00016506,
+      "loss": 2.9675,
+      "step": 185000
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.8675829172134399,
+      "learning_rate": 0.00016417764705882354,
+      "loss": 2.971,
+      "step": 186000
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.4077077331943161,
+      "eval_loss": 3.366788625717163,
+      "eval_runtime": 154.5461,
+      "eval_samples_per_second": 374.775,
+      "eval_steps_per_second": 5.856,
+      "step": 186000
+    },
+    {
+      "epoch": 10.05,
+      "grad_norm": 0.3905154764652252,
+      "learning_rate": 0.00016329529411764705,
+      "loss": 2.9004,
+      "step": 187000
+    },
+    {
+      "epoch": 10.11,
+      "grad_norm": 0.3730432093143463,
+      "learning_rate": 0.0001624129411764706,
+      "loss": 2.9095,
+      "step": 188000
+    },
+    {
+      "epoch": 10.16,
+      "grad_norm": 0.3711690306663513,
+      "learning_rate": 0.00016153147058823526,
+      "loss": 2.9136,
+      "step": 189000
+    },
+    {
+      "epoch": 10.22,
+      "grad_norm": 0.399265855550766,
+      "learning_rate": 0.00016064999999999997,
+      "loss": 2.9178,
+      "step": 190000
+    },
+    {
+      "epoch": 10.27,
+      "grad_norm": 0.3668770492076874,
+      "learning_rate": 0.00015976764705882352,
+      "loss": 2.9229,
+      "step": 191000
+    },
+    {
+      "epoch": 10.32,
+      "grad_norm": 0.37367990612983704,
+      "learning_rate": 0.00015888529411764703,
+      "loss": 2.9243,
+      "step": 192000
+    },
+    {
+      "epoch": 10.38,
+      "grad_norm": 0.385092169046402,
+      "learning_rate": 0.00015800382352941175,
+      "loss": 2.9262,
+      "step": 193000
+    },
+    {
+      "epoch": 10.43,
+      "grad_norm": 0.3799744248390198,
+      "learning_rate": 0.00015712235294117647,
+      "loss": 2.9308,
+      "step": 194000
+    },
+    {
+      "epoch": 10.48,
+      "grad_norm": 0.3523291051387787,
+      "learning_rate": 0.00015624,
+      "loss": 2.93,
+      "step": 195000
+    },
+    {
+      "epoch": 10.54,
+      "grad_norm": 0.38555893301963806,
+      "learning_rate": 0.00015535764705882352,
+      "loss": 2.9368,
+      "step": 196000
+    },
+    {
+      "epoch": 10.59,
+      "grad_norm": 0.4180794954299927,
+      "learning_rate": 0.00015447529411764704,
+      "loss": 2.9374,
+      "step": 197000
+    },
+    {
+      "epoch": 10.65,
+      "grad_norm": 0.3763350248336792,
+      "learning_rate": 0.00015359294117647056,
+      "loss": 2.9371,
+      "step": 198000
+    },
+    {
+      "epoch": 10.7,
+      "grad_norm": 0.3720884919166565,
+      "learning_rate": 0.00015271147058823527,
+      "loss": 2.9383,
+      "step": 199000
+    },
+    {
+      "epoch": 10.75,
+      "grad_norm": 0.4129893481731415,
+      "learning_rate": 0.00015182911764705882,
+      "loss": 2.9436,
+      "step": 200000
+    },
+    {
+      "epoch": 10.81,
+      "grad_norm": 0.3609713613986969,
+      "learning_rate": 0.00015094676470588233,
+      "loss": 2.9392,
+      "step": 201000
+    },
+    {
+      "epoch": 10.86,
+      "grad_norm": 0.3772646486759186,
+      "learning_rate": 0.00015006441176470587,
+      "loss": 2.9428,
+      "step": 202000
+    },
+    {
+      "epoch": 10.91,
+      "grad_norm": 0.37511298060417175,
+      "learning_rate": 0.0001491829411764706,
+      "loss": 2.941,
+      "step": 203000
+    },
+    {
+      "epoch": 10.97,
+      "grad_norm": 0.37095674872398376,
+      "learning_rate": 0.00014830147058823528,
+      "loss": 2.9425,
+      "step": 204000
+    },
+    {
+      "epoch": 11.0,
+      "eval_accuracy": 0.40833525915256785,
+      "eval_loss": 3.3716514110565186,
+      "eval_runtime": 154.5582,
+      "eval_samples_per_second": 374.746,
+      "eval_steps_per_second": 5.855,
+      "step": 204600
+    },
+    {
+      "epoch": 11.02,
+      "grad_norm": 0.4002268612384796,
+      "learning_rate": 0.00014741911764705882,
+      "loss": 2.9159,
+      "step": 205000
+    },
+    {
+      "epoch": 11.08,
+      "grad_norm": 0.38958606123924255,
+      "learning_rate": 0.00014653676470588234,
+      "loss": 2.8845,
+      "step": 206000
+    },
+    {
+      "epoch": 11.13,
+      "grad_norm": 0.39512166380882263,
+      "learning_rate": 0.00014565529411764705,
+      "loss": 2.888,
+      "step": 207000
+    },
+    {
+      "epoch": 11.18,
+      "grad_norm": 0.38474199175834656,
+      "learning_rate": 0.00014477294117647057,
+      "loss": 2.8945,
+      "step": 208000
+    },
+    {
+      "epoch": 11.24,
+      "grad_norm": 0.3705930709838867,
+      "learning_rate": 0.00014389147058823529,
+      "loss": 2.9003,
+      "step": 209000
+    },
+    {
+      "epoch": 11.29,
+      "grad_norm": 0.3930906057357788,
+      "learning_rate": 0.0001430091176470588,
+      "loss": 2.8984,
+      "step": 210000
+    },
+    {
+      "epoch": 11.34,
+      "grad_norm": 0.38158994913101196,
+      "learning_rate": 0.00014212676470588234,
+      "loss": 2.9031,
+      "step": 211000
+    },
+    {
+      "epoch": 11.4,
+      "grad_norm": 0.4204154312610626,
+      "learning_rate": 0.0001412444117647059,
+      "loss": 2.9029,
+      "step": 212000
+    },
+    {
+      "epoch": 11.45,
+      "grad_norm": 0.3909905254840851,
+      "learning_rate": 0.00014036294117647058,
+      "loss": 2.9031,
+      "step": 213000
+    },
+    {
+      "epoch": 11.51,
+      "grad_norm": 0.38486233353614807,
+      "learning_rate": 0.0001394805882352941,
+      "loss": 2.9061,
+      "step": 214000
+    },
+    {
+      "epoch": 11.56,
+      "grad_norm": 0.3670654594898224,
+      "learning_rate": 0.0001385991176470588,
+      "loss": 2.9114,
+      "step": 215000
+    },
+    {
+      "epoch": 11.61,
+      "grad_norm": 0.38542190194129944,
+      "learning_rate": 0.00013771676470588235,
+      "loss": 2.913,
+      "step": 216000
+    },
+    {
+      "epoch": 11.67,
+      "grad_norm": 0.4009072184562683,
+      "learning_rate": 0.00013683441176470587,
+      "loss": 2.9139,
+      "step": 217000
+    },
+    {
+      "epoch": 11.72,
+      "grad_norm": 0.37454643845558167,
+      "learning_rate": 0.0001359520588235294,
+      "loss": 2.9155,
+      "step": 218000
+    },
+    {
+      "epoch": 11.77,
+      "grad_norm": 0.38453200459480286,
+      "learning_rate": 0.00013506970588235293,
+      "loss": 2.9198,
+      "step": 219000
+    },
+    {
+      "epoch": 11.83,
+      "grad_norm": 0.38210123777389526,
+      "learning_rate": 0.00013418735294117647,
+      "loss": 2.9175,
+      "step": 220000
+    },
+    {
+      "epoch": 11.88,
+      "grad_norm": 0.38491418957710266,
+      "learning_rate": 0.00013330676470588233,
+      "loss": 2.9218,
+      "step": 221000
+    },
+    {
+      "epoch": 11.94,
+      "grad_norm": 0.40083280205726624,
+      "learning_rate": 0.00013242441176470587,
+      "loss": 2.9197,
+      "step": 222000
+    },
+    {
+      "epoch": 11.99,
+      "grad_norm": 0.37707728147506714,
+      "learning_rate": 0.0001315420588235294,
+      "loss": 2.9175,
+      "step": 223000
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.4085028054259733,
+      "eval_loss": 3.383648157119751,
+      "eval_runtime": 153.9053,
+      "eval_samples_per_second": 376.335,
+      "eval_steps_per_second": 5.88,
+      "step": 223200
+    },
+    {
+      "epoch": 12.04,
+      "grad_norm": 0.3987793028354645,
+      "learning_rate": 0.00013065970588235293,
+      "loss": 2.8729,
+      "step": 224000
+    },
+    {
+      "epoch": 12.1,
+      "grad_norm": 0.4100884199142456,
+      "learning_rate": 0.00012977823529411765,
+      "loss": 2.8662,
+      "step": 225000
+    },
+    {
+      "epoch": 12.15,
+      "grad_norm": 0.4005632996559143,
+      "learning_rate": 0.00012889676470588234,
+      "loss": 2.8718,
+      "step": 226000
+    },
+    {
+      "epoch": 12.2,
+      "grad_norm": 0.4106426537036896,
+      "learning_rate": 0.00012801441176470588,
+      "loss": 2.8702,
+      "step": 227000
+    },
+    {
+      "epoch": 12.26,
+      "grad_norm": 0.4060533046722412,
+      "learning_rate": 0.0001271320588235294,
+      "loss": 2.8727,
+      "step": 228000
+    },
+    {
+      "epoch": 12.31,
+      "grad_norm": 0.40115979313850403,
+      "learning_rate": 0.0001262505882352941,
+      "loss": 2.8805,
+      "step": 229000
+    },
+    {
+      "epoch": 12.37,
+      "grad_norm": 0.3902336359024048,
+      "learning_rate": 0.00012536823529411763,
+      "loss": 2.8781,
+      "step": 230000
+    },
+    {
+      "epoch": 12.42,
+      "grad_norm": 0.40399476885795593,
+      "learning_rate": 0.00012448588235294117,
+      "loss": 2.8814,
+      "step": 231000
+    },
+    {
+      "epoch": 12.47,
+      "grad_norm": 0.39339208602905273,
+      "learning_rate": 0.0001236044117647059,
+      "loss": 2.8871,
+      "step": 232000
+    },
+    {
+      "epoch": 12.53,
+      "grad_norm": 0.38773778080940247,
+      "learning_rate": 0.0001227220588235294,
+      "loss": 2.8931,
+      "step": 233000
+    },
+    {
+      "epoch": 12.58,
+      "grad_norm": 0.4028586447238922,
+      "learning_rate": 0.00012183970588235292,
+      "loss": 2.892,
+      "step": 234000
+    },
+    {
+      "epoch": 12.63,
+      "grad_norm": 0.38995110988616943,
+      "learning_rate": 0.00012095823529411763,
+      "loss": 2.892,
+      "step": 235000
+    },
+    {
+      "epoch": 12.69,
+      "grad_norm": 0.41991814970970154,
+      "learning_rate": 0.00012007588235294116,
+      "loss": 2.8932,
+      "step": 236000
+    },
+    {
+      "epoch": 12.74,
+      "grad_norm": 0.39780813455581665,
+      "learning_rate": 0.0001191935294117647,
+      "loss": 2.8923,
+      "step": 237000
+    },
+    {
+      "epoch": 12.8,
+      "grad_norm": 0.40335002541542053,
+      "learning_rate": 0.00011831117647058824,
+      "loss": 2.8947,
+      "step": 238000
+    },
+    {
+      "epoch": 12.85,
+      "grad_norm": 0.4120156168937683,
+      "learning_rate": 0.00011742970588235293,
+      "loss": 2.8976,
+      "step": 239000
+    },
+    {
+      "epoch": 12.9,
+      "grad_norm": 0.4305998682975769,
+      "learning_rate": 0.00011654735294117645,
+      "loss": 2.8974,
+      "step": 240000
+    },
+    {
+      "epoch": 12.96,
+      "grad_norm": 0.3948858976364136,
+      "learning_rate": 0.00011566499999999998,
+      "loss": 2.8993,
+      "step": 241000
+    },
+    {
+      "epoch": 13.0,
+      "eval_accuracy": 0.4095500704043683,
+      "eval_loss": 3.368474006652832,
+      "eval_runtime": 154.1284,
+      "eval_samples_per_second": 375.791,
+      "eval_steps_per_second": 5.872,
+      "step": 241800
+    },
+    {
+      "epoch": 13.01,
+      "grad_norm": 0.40451323986053467,
+      "learning_rate": 0.0001147835294117647,
+      "loss": 2.8895,
+      "step": 242000
+    },
+    {
+      "epoch": 13.06,
+      "grad_norm": 0.40097764134407043,
+      "learning_rate": 0.00011390117647058822,
+      "loss": 2.8413,
+      "step": 243000
+    },
+    {
+      "epoch": 13.12,
+      "grad_norm": 0.41899681091308594,
+      "learning_rate": 0.00011301970588235293,
+      "loss": 2.8457,
+      "step": 244000
+    },
+    {
+      "epoch": 13.17,
+      "grad_norm": 0.41395074129104614,
+      "learning_rate": 0.00011213735294117646,
+      "loss": 2.8478,
+      "step": 245000
+    },
+    {
+      "epoch": 13.23,
+      "grad_norm": 0.4069136083126068,
+      "learning_rate": 0.00011125499999999999,
+      "loss": 2.8546,
+      "step": 246000
+    },
+    {
+      "epoch": 13.28,
+      "grad_norm": 0.4198252260684967,
+      "learning_rate": 0.00011037264705882352,
+      "loss": 2.8586,
+      "step": 247000
+    },
+    {
+      "epoch": 13.33,
+      "grad_norm": 0.4080910384654999,
+      "learning_rate": 0.00010949117647058822,
+      "loss": 2.8595,
+      "step": 248000
+    },
+    {
+      "epoch": 13.39,
+      "grad_norm": 0.4152771532535553,
+      "learning_rate": 0.00010860970588235294,
+      "loss": 2.8632,
+      "step": 249000
+    },
+    {
+      "epoch": 13.44,
+      "grad_norm": 0.38884419202804565,
+      "learning_rate": 0.00010772735294117647,
+      "loss": 2.8635,
+      "step": 250000
+    },
+    {
+      "epoch": 13.49,
+      "grad_norm": 0.4043819010257721,
+      "learning_rate": 0.000106845,
+      "loss": 2.8614,
+      "step": 251000
+    },
+    {
+      "epoch": 13.55,
+      "grad_norm": 0.39720532298088074,
+      "learning_rate": 0.00010596352941176469,
+      "loss": 2.8706,
+      "step": 252000
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 0.408447802066803,
+      "learning_rate": 0.00010508117647058823,
+      "loss": 2.8703,
+      "step": 253000
+    },
+    {
+      "epoch": 13.66,
+      "grad_norm": 0.43144288659095764,
+      "learning_rate": 0.00010419882352941176,
+      "loss": 2.8704,
+      "step": 254000
+    },
+    {
+      "epoch": 13.71,
+      "grad_norm": 0.4294928312301636,
+      "learning_rate": 0.00010331735294117646,
+      "loss": 2.8696,
+      "step": 255000
+    },
+    {
+      "epoch": 13.76,
+      "grad_norm": 0.39523276686668396,
+      "learning_rate": 0.00010243499999999998,
+      "loss": 2.874,
+      "step": 256000
+    },
+    {
+      "epoch": 13.82,
+      "grad_norm": 0.3995026648044586,
+      "learning_rate": 0.00010155352941176469,
+      "loss": 2.8786,
+      "step": 257000
+    },
+    {
+      "epoch": 13.87,
+      "grad_norm": 0.4120596647262573,
+      "learning_rate": 0.00010067117647058822,
+      "loss": 2.8783,
+      "step": 258000
+    },
+    {
+      "epoch": 13.92,
+      "grad_norm": 0.40986043214797974,
+      "learning_rate": 9.978882352941177e-05,
+      "loss": 2.8766,
+      "step": 259000
+    },
+    {
+      "epoch": 13.98,
+      "grad_norm": 0.39782214164733887,
+      "learning_rate": 9.89064705882353e-05,
+      "loss": 2.8802,
+      "step": 260000
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.4093544429993336,
+      "eval_loss": 3.386864423751831,
+      "eval_runtime": 154.0798,
+      "eval_samples_per_second": 375.909,
+      "eval_steps_per_second": 5.874,
+      "step": 260400
+    },
+    {
+      "epoch": 14.03,
+      "grad_norm": 0.42637449502944946,
+      "learning_rate": 9.802499999999998e-05,
+      "loss": 2.8424,
+      "step": 261000
+    },
+    {
+      "epoch": 14.09,
+      "grad_norm": 0.4381449818611145,
+      "learning_rate": 9.714264705882351e-05,
+      "loss": 2.827,
+      "step": 262000
+    },
+    {
+      "epoch": 14.14,
+      "grad_norm": 0.41219663619995117,
+      "learning_rate": 9.626117647058823e-05,
+      "loss": 2.8299,
+      "step": 263000
+    },
+    {
+      "epoch": 14.19,
+      "grad_norm": 0.3997000455856323,
+      "learning_rate": 9.537882352941176e-05,
+      "loss": 2.8347,
+      "step": 264000
+    },
+    {
+      "epoch": 14.25,
+      "grad_norm": 0.4551340639591217,
+      "learning_rate": 9.449647058823527e-05,
+      "loss": 2.8429,
+      "step": 265000
+    },
+    {
+      "epoch": 14.3,
+      "grad_norm": 0.42454150319099426,
+      "learning_rate": 9.361411764705882e-05,
+      "loss": 2.8375,
+      "step": 266000
+    },
+    {
+      "epoch": 14.35,
+      "grad_norm": 0.4199308454990387,
+      "learning_rate": 9.273264705882352e-05,
+      "loss": 2.8383,
+      "step": 267000
+    },
+    {
+      "epoch": 14.41,
+      "grad_norm": 0.4250849485397339,
+      "learning_rate": 9.185117647058824e-05,
+      "loss": 2.8448,
+      "step": 268000
+    },
+    {
+      "epoch": 14.46,
+      "grad_norm": 0.4406766891479492,
+      "learning_rate": 9.096882352941175e-05,
+      "loss": 2.8431,
+      "step": 269000
+    },
+    {
+      "epoch": 14.52,
+      "grad_norm": 0.40331459045410156,
+      "learning_rate": 9.008647058823528e-05,
+      "loss": 2.8485,
+      "step": 270000
+    },
+    {
+      "epoch": 14.57,
+      "grad_norm": 0.4189225137233734,
+      "learning_rate": 8.920411764705881e-05,
+      "loss": 2.8496,
+      "step": 271000
+    },
+    {
+      "epoch": 14.62,
+      "grad_norm": 0.4331813454627991,
+      "learning_rate": 8.832264705882353e-05,
+      "loss": 2.8533,
+      "step": 272000
+    },
+    {
+      "epoch": 14.68,
+      "grad_norm": 0.4302915930747986,
+      "learning_rate": 8.744029411764706e-05,
+      "loss": 2.8538,
+      "step": 273000
+    },
+    {
+      "epoch": 14.73,
+      "grad_norm": 0.4052475094795227,
+      "learning_rate": 8.655794117647057e-05,
+      "loss": 2.8511,
+      "step": 274000
+    },
+    {
+      "epoch": 14.78,
+      "grad_norm": 0.41182762384414673,
+      "learning_rate": 8.56755882352941e-05,
+      "loss": 2.8544,
+      "step": 275000
+    },
+    {
+      "epoch": 14.84,
+      "grad_norm": 0.40615832805633545,
+      "learning_rate": 8.479411764705882e-05,
+      "loss": 2.8531,
+      "step": 276000
+    },
+    {
+      "epoch": 14.89,
+      "grad_norm": 0.4004068374633789,
+      "learning_rate": 8.391176470588235e-05,
+      "loss": 2.8566,
+      "step": 277000
+    },
+    {
+      "epoch": 14.95,
+      "grad_norm": 0.4328298270702362,
+      "learning_rate": 8.302941176470588e-05,
+      "loss": 2.8582,
+      "step": 278000
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 1.0608173608779907,
+      "learning_rate": 8.21470588235294e-05,
+      "loss": 2.8591,
+      "step": 279000
+    },
+    {
+      "epoch": 15.0,
+      "eval_accuracy": 0.40931400079540814,
+      "eval_loss": 3.390340566635132,
+      "eval_runtime": 154.3289,
+      "eval_samples_per_second": 375.302,
+      "eval_steps_per_second": 5.864,
+      "step": 279000
+    },
+    {
+      "epoch": 15.05,
+      "grad_norm": 0.4320835471153259,
+      "learning_rate": 8.126558823529411e-05,
+      "loss": 2.8083,
+      "step": 280000
+    },
+    {
+      "epoch": 15.11,
+      "grad_norm": 0.43483027815818787,
+      "learning_rate": 8.038411764705882e-05,
+      "loss": 2.8112,
+      "step": 281000
+    },
+    {
+      "epoch": 15.16,
+      "grad_norm": 0.4238964021205902,
+      "learning_rate": 7.950176470588235e-05,
+      "loss": 2.8149,
+      "step": 282000
+    },
+    {
+      "epoch": 15.22,
+      "grad_norm": 0.40951046347618103,
+      "learning_rate": 7.862029411764704e-05,
+      "loss": 2.8164,
+      "step": 283000
+    },
+    {
+      "epoch": 15.27,
+      "grad_norm": 0.4388927221298218,
+      "learning_rate": 7.773794117647059e-05,
+      "loss": 2.8185,
+      "step": 284000
+    },
+    {
+      "epoch": 15.32,
+      "grad_norm": 0.42049768567085266,
+      "learning_rate": 7.685647058823529e-05,
+      "loss": 2.8229,
+      "step": 285000
+    },
+    {
+      "epoch": 15.38,
+      "grad_norm": 0.43744581937789917,
+      "learning_rate": 7.597411764705883e-05,
+      "loss": 2.8277,
+      "step": 286000
+    },
+    {
+      "epoch": 15.43,
+      "grad_norm": 0.45774605870246887,
+      "learning_rate": 7.509176470588235e-05,
+      "loss": 2.8205,
+      "step": 287000
+    },
+    {
+      "epoch": 15.48,
+      "grad_norm": 0.4613402783870697,
+      "learning_rate": 7.421029411764705e-05,
+      "loss": 2.8287,
+      "step": 288000
+    },
+    {
+      "epoch": 15.54,
+      "grad_norm": 0.4393397569656372,
+      "learning_rate": 7.332794117647058e-05,
+      "loss": 2.8314,
+      "step": 289000
+    },
+    {
+      "epoch": 15.59,
+      "grad_norm": 0.4375053942203522,
+      "learning_rate": 7.244558823529411e-05,
+      "loss": 2.8311,
+      "step": 290000
+    },
+    {
+      "epoch": 15.65,
+      "grad_norm": 0.42573311924934387,
+      "learning_rate": 7.156411764705882e-05,
+      "loss": 2.8296,
+      "step": 291000
+    },
+    {
+      "epoch": 15.7,
+      "grad_norm": 0.43389272689819336,
+      "learning_rate": 7.068264705882353e-05,
+      "loss": 2.8284,
+      "step": 292000
+    },
+    {
+      "epoch": 15.75,
+      "grad_norm": 0.43407490849494934,
+      "learning_rate": 6.980029411764706e-05,
+      "loss": 2.8322,
+      "step": 293000
+    },
+    {
+      "epoch": 15.81,
+      "grad_norm": 0.42439669370651245,
+      "learning_rate": 6.891794117647059e-05,
+      "loss": 2.8386,
+      "step": 294000
+    },
+    {
+      "epoch": 15.86,
+      "grad_norm": 0.4558635354042053,
+      "learning_rate": 6.803558823529411e-05,
+      "loss": 2.837,
+      "step": 295000
+    },
+    {
+      "epoch": 15.91,
+      "grad_norm": 0.4102713167667389,
+      "learning_rate": 6.715323529411764e-05,
+      "loss": 2.8375,
+      "step": 296000
+    },
+    {
+      "epoch": 15.97,
+      "grad_norm": 0.4180239737033844,
+      "learning_rate": 6.627176470588235e-05,
+      "loss": 2.8397,
+      "step": 297000
+    },
+    {
+      "epoch": 16.0,
+      "eval_accuracy": 0.4098871111636606,
+      "eval_loss": 3.3899056911468506,
+      "eval_runtime": 154.0201,
+      "eval_samples_per_second": 376.055,
+      "eval_steps_per_second": 5.876,
+      "step": 297600
+    },
+    {
+      "epoch": 16.02,
+      "grad_norm": 0.446920782327652,
+      "learning_rate": 6.538941176470588e-05,
+      "loss": 2.8222,
+      "step": 298000
+    },
+    {
+      "epoch": 16.08,
+      "grad_norm": 0.4178001582622528,
+      "learning_rate": 6.45070588235294e-05,
+      "loss": 2.7924,
+      "step": 299000
+    },
+    {
+      "epoch": 16.13,
+      "grad_norm": 0.45794540643692017,
+      "learning_rate": 6.362558823529411e-05,
+      "loss": 2.7949,
+      "step": 300000
+    },
+    {
+      "epoch": 16.18,
+      "grad_norm": 0.4420151114463806,
+      "learning_rate": 6.274323529411764e-05,
+      "loss": 2.7997,
+      "step": 301000
+    },
+    {
+      "epoch": 16.24,
+      "grad_norm": 0.4682018458843231,
+      "learning_rate": 6.186088235294117e-05,
+      "loss": 2.8017,
+      "step": 302000
+    },
+    {
+      "epoch": 16.29,
+      "grad_norm": 0.44723376631736755,
+      "learning_rate": 6.09785294117647e-05,
+      "loss": 2.807,
+      "step": 303000
+    },
+    {
+      "epoch": 16.34,
+      "grad_norm": 0.44858965277671814,
+      "learning_rate": 6.0097941176470585e-05,
+      "loss": 2.8091,
+      "step": 304000
+    },
+    {
+      "epoch": 16.4,
+      "grad_norm": 0.43858635425567627,
+      "learning_rate": 5.921558823529411e-05,
+      "loss": 2.8055,
+      "step": 305000
+    },
+    {
+      "epoch": 16.45,
+      "grad_norm": 0.4233960509300232,
+      "learning_rate": 5.8333235294117644e-05,
+      "loss": 2.8104,
+      "step": 306000
+    },
+    {
+      "epoch": 16.51,
+      "grad_norm": 0.4392797648906708,
+      "learning_rate": 5.745088235294117e-05,
+      "loss": 2.8122,
+      "step": 307000
+    },
+    {
+      "epoch": 16.56,
+      "grad_norm": 0.4333525002002716,
+      "learning_rate": 5.6568529411764696e-05,
+      "loss": 2.8119,
+      "step": 308000
+    },
+    {
+      "epoch": 16.61,
+      "grad_norm": 0.4427085518836975,
+      "learning_rate": 5.568617647058823e-05,
+      "loss": 2.8139,
+      "step": 309000
+    },
+    {
+      "epoch": 16.67,
+      "grad_norm": 0.445634663105011,
+      "learning_rate": 5.4804705882352935e-05,
+      "loss": 2.8149,
+      "step": 310000
+    },
+    {
+      "epoch": 16.72,
+      "grad_norm": 0.4446145296096802,
+      "learning_rate": 5.3922352941176464e-05,
+      "loss": 2.8152,
+      "step": 311000
+    },
+    {
+      "epoch": 16.77,
+      "grad_norm": 0.43026256561279297,
+      "learning_rate": 5.3040882352941166e-05,
+      "loss": 2.8141,
+      "step": 312000
+    },
+    {
+      "epoch": 16.83,
+      "grad_norm": 0.4380964934825897,
+      "learning_rate": 5.21585294117647e-05,
+      "loss": 2.8157,
+      "step": 313000
+    },
+    {
+      "epoch": 16.88,
+      "grad_norm": 0.4361090362071991,
+      "learning_rate": 5.1277058823529405e-05,
+      "loss": 2.8152,
+      "step": 314000
+    },
+    {
+      "epoch": 16.94,
+      "grad_norm": 0.4253467321395874,
+      "learning_rate": 5.0394705882352935e-05,
+      "loss": 2.8229,
+      "step": 315000
+    },
+    {
+      "epoch": 16.99,
+      "grad_norm": 0.4476911425590515,
+      "learning_rate": 4.951323529411764e-05,
+      "loss": 2.8158,
+      "step": 316000
+    },
+    {
+      "epoch": 17.0,
+      "eval_accuracy": 0.40948181578777654,
+      "eval_loss": 3.3992156982421875,
+      "eval_runtime": 154.2196,
+      "eval_samples_per_second": 375.568,
+      "eval_steps_per_second": 5.868,
+      "step": 316200
+    },
+    {
+      "epoch": 17.04,
+      "grad_norm": 0.43660619854927063,
+      "learning_rate": 4.863088235294117e-05,
+      "loss": 2.7885,
+      "step": 317000
+    },
+    {
+      "epoch": 17.1,
+      "grad_norm": 0.46034106612205505,
+      "learning_rate": 4.7749411764705875e-05,
+      "loss": 2.7768,
+      "step": 318000
+    },
+    {
+      "epoch": 17.15,
+      "grad_norm": 0.43989402055740356,
+      "learning_rate": 4.6867058823529405e-05,
+      "loss": 2.7883,
+      "step": 319000
+    },
+    {
+      "epoch": 17.2,
+      "grad_norm": 0.4530872702598572,
+      "learning_rate": 4.598470588235294e-05,
+      "loss": 2.786,
+      "step": 320000
+    },
+    {
+      "epoch": 17.26,
+      "grad_norm": 0.4461182653903961,
+      "learning_rate": 4.5103235294117643e-05,
+      "loss": 2.7879,
+      "step": 321000
+    },
+    {
+      "epoch": 17.31,
+      "grad_norm": 0.4403826892375946,
+      "learning_rate": 4.422088235294117e-05,
+      "loss": 2.7923,
+      "step": 322000
+    },
+    {
+      "epoch": 17.37,
+      "grad_norm": 0.4319212734699249,
+      "learning_rate": 4.33385294117647e-05,
+      "loss": 2.7894,
+      "step": 323000
+    },
+    {
+      "epoch": 17.42,
+      "grad_norm": 0.4447585344314575,
+      "learning_rate": 4.245617647058823e-05,
+      "loss": 2.7921,
+      "step": 324000
+    },
+    {
+      "epoch": 17.47,
+      "grad_norm": 0.4347590208053589,
+      "learning_rate": 4.157382352941177e-05,
+      "loss": 2.7921,
+      "step": 325000
+    },
+    {
+      "epoch": 17.53,
+      "grad_norm": 0.42081958055496216,
+      "learning_rate": 4.069235294117647e-05,
+      "loss": 2.7944,
+      "step": 326000
+    },
+    {
+      "epoch": 17.58,
+      "grad_norm": 0.4446498155593872,
+      "learning_rate": 3.981e-05,
+      "loss": 2.7944,
+      "step": 327000
+    },
+    {
+      "epoch": 17.63,
+      "grad_norm": 0.44777339696884155,
+      "learning_rate": 3.89285294117647e-05,
+      "loss": 2.7971,
+      "step": 328000
+    },
+    {
+      "epoch": 17.69,
+      "grad_norm": 0.4410764276981354,
+      "learning_rate": 3.804617647058824e-05,
+      "loss": 2.7989,
+      "step": 329000
+    },
+    {
+      "epoch": 17.74,
+      "grad_norm": 0.4594448208808899,
+      "learning_rate": 3.716470588235294e-05,
+      "loss": 2.7943,
+      "step": 330000
+    },
+    {
+      "epoch": 17.8,
+      "grad_norm": 0.4540853798389435,
+      "learning_rate": 3.628323529411764e-05,
+      "loss": 2.7992,
+      "step": 331000
+    },
+    {
+      "epoch": 17.85,
+      "grad_norm": 0.4680405259132385,
+      "learning_rate": 3.540088235294117e-05,
+      "loss": 2.7983,
+      "step": 332000
+    },
+    {
+      "epoch": 17.9,
+      "grad_norm": 0.43989479541778564,
+      "learning_rate": 3.451941176470588e-05,
+      "loss": 2.7966,
+      "step": 333000
+    },
+    {
+      "epoch": 17.96,
+      "grad_norm": 0.4248621463775635,
+      "learning_rate": 3.363705882352941e-05,
+      "loss": 2.7994,
+      "step": 334000
+    },
+    {
+      "epoch": 18.0,
+      "eval_accuracy": 0.4089663456370789,
+      "eval_loss": 3.412942409515381,
+      "eval_runtime": 154.3857,
+      "eval_samples_per_second": 375.164,
+      "eval_steps_per_second": 5.862,
+      "step": 334800
+    },
+    {
+      "epoch": 18.01,
+      "grad_norm": 0.4774115979671478,
+      "learning_rate": 3.2754705882352934e-05,
+      "loss": 2.7919,
+      "step": 335000
+    },
+    {
+      "epoch": 18.06,
+      "grad_norm": 0.45676735043525696,
+      "learning_rate": 3.187323529411764e-05,
+      "loss": 2.7683,
+      "step": 336000
+    },
+    {
+      "epoch": 18.12,
+      "grad_norm": 0.43984270095825195,
+      "learning_rate": 3.099088235294117e-05,
+      "loss": 2.7712,
+      "step": 337000
+    },
+    {
+      "epoch": 18.17,
+      "grad_norm": 0.4677587151527405,
+      "learning_rate": 3.0108529411764705e-05,
+      "loss": 2.773,
+      "step": 338000
+    },
+    {
+      "epoch": 18.23,
+      "grad_norm": 0.4494146704673767,
+      "learning_rate": 2.9227058823529408e-05,
+      "loss": 2.7703,
+      "step": 339000
+    },
+    {
+      "epoch": 18.28,
+      "grad_norm": 0.4525000751018524,
+      "learning_rate": 2.834470588235294e-05,
+      "loss": 2.7731,
+      "step": 340000
+    },
+    {
+      "epoch": 18.33,
+      "grad_norm": 0.45488810539245605,
+      "learning_rate": 2.7463235294117643e-05,
+      "loss": 2.7729,
+      "step": 341000
+    },
+    {
+      "epoch": 18.39,
+      "grad_norm": 0.4790239930152893,
+      "learning_rate": 2.6580882352941176e-05,
+      "loss": 2.7788,
+      "step": 342000
+    },
+    {
+      "epoch": 18.44,
+      "grad_norm": 0.44312784075737,
+      "learning_rate": 2.5698529411764705e-05,
+      "loss": 2.7814,
+      "step": 343000
+    },
+    {
+      "epoch": 18.49,
+      "grad_norm": 0.45970168709754944,
+      "learning_rate": 2.481705882352941e-05,
+      "loss": 2.7782,
+      "step": 344000
+    },
+    {
+      "epoch": 18.55,
+      "grad_norm": 0.489257276058197,
+      "learning_rate": 2.393470588235294e-05,
+      "loss": 2.7795,
+      "step": 345000
+    },
+    {
+      "epoch": 18.6,
+      "grad_norm": 0.46976953744888306,
+      "learning_rate": 2.3052352941176467e-05,
+      "loss": 2.7802,
+      "step": 346000
+    },
+    {
+      "epoch": 18.66,
+      "grad_norm": 0.4579465985298157,
+      "learning_rate": 2.2169999999999996e-05,
+      "loss": 2.7777,
+      "step": 347000
+    },
+    {
+      "epoch": 18.71,
+      "grad_norm": 0.4497370719909668,
+      "learning_rate": 2.1288529411764702e-05,
+      "loss": 2.7802,
+      "step": 348000
+    },
+    {
+      "epoch": 18.76,
+      "grad_norm": 0.4875960648059845,
+      "learning_rate": 2.0406176470588235e-05,
+      "loss": 2.777,
+      "step": 349000
+    },
+    {
+      "epoch": 18.82,
+      "grad_norm": 0.4670606255531311,
+      "learning_rate": 1.9524705882352937e-05,
+      "loss": 2.7818,
+      "step": 350000
+    },
+    {
+      "epoch": 18.87,
+      "grad_norm": 0.44986578822135925,
+      "learning_rate": 1.864235294117647e-05,
+      "loss": 2.7842,
+      "step": 351000
+    },
+    {
+      "epoch": 18.92,
+      "grad_norm": 0.4309654235839844,
+      "learning_rate": 1.7760882352941176e-05,
+      "loss": 2.778,
+      "step": 352000
+    },
+    {
+      "epoch": 18.98,
+      "grad_norm": 0.4595903158187866,
+      "learning_rate": 1.6878529411764705e-05,
+      "loss": 2.7773,
+      "step": 353000
+    },
+    {
+      "epoch": 19.0,
+      "eval_accuracy": 0.40923123535481654,
+      "eval_loss": 3.421142339706421,
+      "eval_runtime": 154.3823,
+      "eval_samples_per_second": 375.172,
+      "eval_steps_per_second": 5.862,
+      "step": 353400
+    },
+    {
+      "epoch": 19.03,
+      "grad_norm": 0.4593981206417084,
+      "learning_rate": 1.599705882352941e-05,
+      "loss": 2.7664,
+      "step": 354000
+    },
+    {
+      "epoch": 19.09,
+      "grad_norm": 0.4566170275211334,
+      "learning_rate": 1.5114705882352939e-05,
+      "loss": 2.7573,
+      "step": 355000
+    },
+    {
+      "epoch": 19.14,
+      "grad_norm": 0.475852370262146,
+      "learning_rate": 1.423235294117647e-05,
+      "loss": 2.7558,
+      "step": 356000
+    },
+    {
+      "epoch": 19.19,
+      "grad_norm": 0.4608045518398285,
+      "learning_rate": 1.3349999999999998e-05,
+      "loss": 2.7629,
+      "step": 357000
+    },
+    {
+      "epoch": 19.25,
+      "grad_norm": 0.4460165500640869,
+      "learning_rate": 1.2467647058823529e-05,
+      "loss": 2.7636,
+      "step": 358000
+    },
+    {
+      "epoch": 19.3,
+      "grad_norm": 0.46552643179893494,
+      "learning_rate": 1.1587058823529412e-05,
+      "loss": 2.7604,
+      "step": 359000
+    },
+    {
+      "epoch": 19.35,
+      "grad_norm": 0.43840447068214417,
+      "learning_rate": 1.070470588235294e-05,
+      "loss": 2.7645,
+      "step": 360000
+    },
+    {
+      "epoch": 19.41,
+      "grad_norm": 0.46308407187461853,
+      "learning_rate": 9.822352941176468e-06,
+      "loss": 2.7647,
+      "step": 361000
+    },
+    {
+      "epoch": 19.46,
+      "grad_norm": 0.4742732048034668,
+      "learning_rate": 8.939999999999999e-06,
+      "loss": 2.7654,
+      "step": 362000
+    },
+    {
+      "epoch": 19.52,
+      "grad_norm": 0.47486451268196106,
+      "learning_rate": 8.058529411764705e-06,
+      "loss": 2.7646,
+      "step": 363000
+    },
+    {
+      "epoch": 19.57,
+      "grad_norm": 0.46561646461486816,
+      "learning_rate": 7.176176470588234e-06,
+      "loss": 2.7654,
+      "step": 364000
+    },
+    {
+      "epoch": 19.62,
+      "grad_norm": 0.46853920817375183,
+      "learning_rate": 6.295588235294118e-06,
+      "loss": 2.7647,
+      "step": 365000
+    },
+    {
+      "epoch": 19.68,
+      "grad_norm": 0.48070281744003296,
+      "learning_rate": 5.413235294117647e-06,
+      "loss": 2.7639,
+      "step": 366000
+    },
+    {
+      "epoch": 19.73,
+      "grad_norm": 0.46360719203948975,
+      "learning_rate": 4.530882352941176e-06,
+      "loss": 2.7618,
+      "step": 367000
+    },
+    {
+      "epoch": 19.78,
+      "grad_norm": 0.43898245692253113,
+      "learning_rate": 3.6494117647058817e-06,
+      "loss": 2.7675,
+      "step": 368000
+    },
+    {
+      "epoch": 19.84,
+      "grad_norm": 0.44373300671577454,
+      "learning_rate": 2.7670588235294112e-06,
+      "loss": 2.7632,
+      "step": 369000
+    },
+    {
+      "epoch": 19.89,
+      "grad_norm": 0.45887884497642517,
+      "learning_rate": 1.8847058823529411e-06,
+      "loss": 2.7626,
+      "step": 370000
+    },
+    {
+      "epoch": 19.95,
+      "grad_norm": 0.45596691966056824,
+      "learning_rate": 1.0023529411764706e-06,
+      "loss": 2.7648,
+      "step": 371000
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1.0607672929763794,
+      "learning_rate": 1.2088235294117647e-07,
+      "loss": 2.7599,
+      "step": 372000
+    },
+    {
+      "epoch": 20.0,
+      "eval_accuracy": 0.4091656007481136,
+      "eval_loss": 3.4249629974365234,
+      "eval_runtime": 154.4535,
+      "eval_samples_per_second": 375.0,
+      "eval_steps_per_second": 5.859,
+      "step": 372000
+    },
+    {
+      "epoch": 20.0,
+      "step": 372000,
+      "total_flos": 1.56728915394048e+18,
+      "train_loss": 3.055753937752016,
+      "train_runtime": 81288.5778,
+      "train_samples_per_second": 146.435,
+      "train_steps_per_second": 4.576
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 372000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 5000,
+  "total_flos": 1.56728915394048e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}