diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7031 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.999896960329727,
+  "eval_steps": 200,
+  "global_step": 4852,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00020607934054611026,
+      "grad_norm": 0.22813845522641535,
+      "learning_rate": 4.1152263374485604e-07,
+      "loss": 0.3029,
+      "step": 1
+    },
+    {
+      "epoch": 0.0010303967027305513,
+      "grad_norm": 0.369767682912768,
+      "learning_rate": 2.05761316872428e-06,
+      "loss": 0.531,
+      "step": 5
+    },
+    {
+      "epoch": 0.0020607934054611026,
+      "grad_norm": 0.516859665385139,
+      "learning_rate": 4.11522633744856e-06,
+      "loss": 0.5792,
+      "step": 10
+    },
+    {
+      "epoch": 0.0030911901081916537,
+      "grad_norm": 0.39330735500575664,
+      "learning_rate": 6.172839506172839e-06,
+      "loss": 0.4978,
+      "step": 15
+    },
+    {
+      "epoch": 0.004121586810922205,
+      "grad_norm": 0.5867330845716273,
+      "learning_rate": 8.23045267489712e-06,
+      "loss": 0.5701,
+      "step": 20
+    },
+    {
+      "epoch": 0.005151983513652756,
+      "grad_norm": 0.5652480516594349,
+      "learning_rate": 1.02880658436214e-05,
+      "loss": 0.6254,
+      "step": 25
+    },
+    {
+      "epoch": 0.0061823802163833074,
+      "grad_norm": 0.44581621851474207,
+      "learning_rate": 1.2345679012345678e-05,
+      "loss": 0.4771,
+      "step": 30
+    },
+    {
+      "epoch": 0.0072127769191138585,
+      "grad_norm": 0.460575062400366,
+      "learning_rate": 1.440329218106996e-05,
+      "loss": 0.5632,
+      "step": 35
+    },
+    {
+      "epoch": 0.00824317362184441,
+      "grad_norm": 0.30751978801541957,
+      "learning_rate": 1.646090534979424e-05,
+      "loss": 0.4236,
+      "step": 40
+    },
+    {
+      "epoch": 0.00927357032457496,
+      "grad_norm": 0.38005292756389475,
+      "learning_rate": 1.8518518518518518e-05,
+      "loss": 0.4796,
+      "step": 45
+    },
+    {
+      "epoch": 0.010303967027305513,
+      "grad_norm": 0.7516556669584462,
+      "learning_rate": 2.05761316872428e-05,
+      "loss": 0.499,
+      "step": 50
+    },
+    {
+      "epoch": 0.011334363730036065,
+      "grad_norm": 0.43984231867613194,
+      "learning_rate": 2.2633744855967078e-05,
+      "loss": 0.3982,
+      "step": 55
+    },
+    {
+      "epoch": 0.012364760432766615,
+      "grad_norm": 0.48668013365999796,
+      "learning_rate": 2.4691358024691357e-05,
+      "loss": 0.4797,
+      "step": 60
+    },
+    {
+      "epoch": 0.013395157135497167,
+      "grad_norm": 0.414961574786946,
+      "learning_rate": 2.6748971193415638e-05,
+      "loss": 0.4253,
+      "step": 65
+    },
+    {
+      "epoch": 0.014425553838227717,
+      "grad_norm": 0.400815338908771,
+      "learning_rate": 2.880658436213992e-05,
+      "loss": 0.4563,
+      "step": 70
+    },
+    {
+      "epoch": 0.015455950540958269,
+      "grad_norm": 0.6713253921630139,
+      "learning_rate": 3.08641975308642e-05,
+      "loss": 0.4456,
+      "step": 75
+    },
+    {
+      "epoch": 0.01648634724368882,
+      "grad_norm": 0.46429013698478766,
+      "learning_rate": 3.292181069958848e-05,
+      "loss": 0.3943,
+      "step": 80
+    },
+    {
+      "epoch": 0.01751674394641937,
+      "grad_norm": 0.46368279851910454,
+      "learning_rate": 3.497942386831276e-05,
+      "loss": 0.4472,
+      "step": 85
+    },
+    {
+      "epoch": 0.01854714064914992,
+      "grad_norm": 0.6084053230691859,
+      "learning_rate": 3.7037037037037037e-05,
+      "loss": 0.4168,
+      "step": 90
+    },
+    {
+      "epoch": 0.019577537351880475,
+      "grad_norm": 0.3924187177282214,
+      "learning_rate": 3.909465020576132e-05,
+      "loss": 0.4242,
+      "step": 95
+    },
+    {
+      "epoch": 0.020607934054611025,
+      "grad_norm": 0.5391485143871717,
+      "learning_rate": 4.11522633744856e-05,
+      "loss": 0.405,
+      "step": 100
+    },
+    {
+      "epoch": 0.021638330757341576,
+      "grad_norm": 0.5391287793903055,
+      "learning_rate": 4.3209876543209875e-05,
+      "loss": 0.3786,
+      "step": 105
+    },
+    {
+      "epoch": 0.02266872746007213,
+      "grad_norm": 0.5860930775079664,
+      "learning_rate": 4.5267489711934157e-05,
+      "loss": 0.4353,
+      "step": 110
+    },
+    {
+      "epoch": 0.02369912416280268,
+      "grad_norm": 0.5142605508034142,
+      "learning_rate": 4.732510288065844e-05,
+      "loss": 0.3807,
+      "step": 115
+    },
+    {
+      "epoch": 0.02472952086553323,
+      "grad_norm": 0.5212600058334357,
+      "learning_rate": 4.938271604938271e-05,
+      "loss": 0.404,
+      "step": 120
+    },
+    {
+      "epoch": 0.02575991756826378,
+      "grad_norm": 0.5518790775686553,
+      "learning_rate": 5.1440329218106995e-05,
+      "loss": 0.3854,
+      "step": 125
+    },
+    {
+      "epoch": 0.026790314270994334,
+      "grad_norm": 0.5275493325802079,
+      "learning_rate": 5.3497942386831277e-05,
+      "loss": 0.3954,
+      "step": 130
+    },
+    {
+      "epoch": 0.027820710973724884,
+      "grad_norm": 0.5128585051120935,
+      "learning_rate": 5.555555555555556e-05,
+      "loss": 0.3937,
+      "step": 135
+    },
+    {
+      "epoch": 0.028851107676455434,
+      "grad_norm": 0.5148606074591149,
+      "learning_rate": 5.761316872427984e-05,
+      "loss": 0.3846,
+      "step": 140
+    },
+    {
+      "epoch": 0.029881504379185988,
+      "grad_norm": 0.5488455929859509,
+      "learning_rate": 5.9670781893004115e-05,
+      "loss": 0.4055,
+      "step": 145
+    },
+    {
+      "epoch": 0.030911901081916538,
+      "grad_norm": 0.6098923361895402,
+      "learning_rate": 6.17283950617284e-05,
+      "loss": 0.3934,
+      "step": 150
+    },
+    {
+      "epoch": 0.03194229778464709,
+      "grad_norm": 0.4754468226292264,
+      "learning_rate": 6.378600823045267e-05,
+      "loss": 0.3727,
+      "step": 155
+    },
+    {
+      "epoch": 0.03297269448737764,
+      "grad_norm": 0.4791430171004267,
+      "learning_rate": 6.584362139917696e-05,
+      "loss": 0.3787,
+      "step": 160
+    },
+    {
+      "epoch": 0.03400309119010819,
+      "grad_norm": 0.349013143936483,
+      "learning_rate": 6.790123456790123e-05,
+      "loss": 0.3746,
+      "step": 165
+    },
+    {
+      "epoch": 0.03503348789283874,
+      "grad_norm": 0.4634759570911148,
+      "learning_rate": 6.995884773662552e-05,
+      "loss": 0.3554,
+      "step": 170
+    },
+    {
+      "epoch": 0.03606388459556929,
+      "grad_norm": 0.6112517018171014,
+      "learning_rate": 7.20164609053498e-05,
+      "loss": 0.4185,
+      "step": 175
+    },
+    {
+      "epoch": 0.03709428129829984,
+      "grad_norm": 0.5549371605797818,
+      "learning_rate": 7.407407407407407e-05,
+      "loss": 0.3424,
+      "step": 180
+    },
+    {
+      "epoch": 0.0381246780010304,
+      "grad_norm": 0.4115597342413292,
+      "learning_rate": 7.613168724279836e-05,
+      "loss": 0.3756,
+      "step": 185
+    },
+    {
+      "epoch": 0.03915507470376095,
+      "grad_norm": 0.42107468069152587,
+      "learning_rate": 7.818930041152264e-05,
+      "loss": 0.3499,
+      "step": 190
+    },
+    {
+      "epoch": 0.0401854714064915,
+      "grad_norm": 0.4487911468673543,
+      "learning_rate": 8.024691358024692e-05,
+      "loss": 0.3414,
+      "step": 195
+    },
+    {
+      "epoch": 0.04121586810922205,
+      "grad_norm": 0.5916231604404705,
+      "learning_rate": 8.23045267489712e-05,
+      "loss": 0.3535,
+      "step": 200
+    },
+    {
+      "epoch": 0.04121586810922205,
+      "eval_loss": 0.35861507058143616,
+      "eval_runtime": 2884.7816,
+      "eval_samples_per_second": 2.773,
+      "eval_steps_per_second": 0.347,
+      "step": 200
+    },
+    {
+      "epoch": 0.0422462648119526,
+      "grad_norm": 0.48915095397808533,
+      "learning_rate": 8.436213991769549e-05,
+      "loss": 0.3254,
+      "step": 205
+    },
+    {
+      "epoch": 0.04327666151468315,
+      "grad_norm": 0.5120767478539743,
+      "learning_rate": 8.641975308641975e-05,
+      "loss": 0.4097,
+      "step": 210
+    },
+    {
+      "epoch": 0.0443070582174137,
+      "grad_norm": 0.4212019374481453,
+      "learning_rate": 8.847736625514404e-05,
+      "loss": 0.3389,
+      "step": 215
+    },
+    {
+      "epoch": 0.04533745492014426,
+      "grad_norm": 0.4396834951099101,
+      "learning_rate": 9.053497942386831e-05,
+      "loss": 0.3942,
+      "step": 220
+    },
+    {
+      "epoch": 0.04636785162287481,
+      "grad_norm": 0.6548047535409067,
+      "learning_rate": 9.25925925925926e-05,
+      "loss": 0.3656,
+      "step": 225
+    },
+    {
+      "epoch": 0.04739824832560536,
+      "grad_norm": 0.49422979322946825,
+      "learning_rate": 9.465020576131688e-05,
+      "loss": 0.3472,
+      "step": 230
+    },
+    {
+      "epoch": 0.04842864502833591,
+      "grad_norm": 0.45951373244390564,
+      "learning_rate": 9.670781893004116e-05,
+      "loss": 0.389,
+      "step": 235
+    },
+    {
+      "epoch": 0.04945904173106646,
+      "grad_norm": 0.3835541723420252,
+      "learning_rate": 9.876543209876543e-05,
+      "loss": 0.3268,
+      "step": 240
+    },
+    {
+      "epoch": 0.05048943843379701,
+      "grad_norm": 0.43616306650374015,
+      "learning_rate": 0.00010082304526748971,
+      "loss": 0.3813,
+      "step": 245
+    },
+    {
+      "epoch": 0.05151983513652756,
+      "grad_norm": 0.5081680558376234,
+      "learning_rate": 0.00010288065843621399,
+      "loss": 0.3842,
+      "step": 250
+    },
+    {
+      "epoch": 0.05255023183925812,
+      "grad_norm": 0.4927357690560021,
+      "learning_rate": 0.00010493827160493828,
+      "loss": 0.3078,
+      "step": 255
+    },
+    {
+      "epoch": 0.05358062854198867,
+      "grad_norm": 0.40808058841192324,
+      "learning_rate": 0.00010699588477366255,
+      "loss": 0.3535,
+      "step": 260
+    },
+    {
+      "epoch": 0.05461102524471922,
+      "grad_norm": 0.36331664339606895,
+      "learning_rate": 0.00010905349794238684,
+      "loss": 0.2998,
+      "step": 265
+    },
+    {
+      "epoch": 0.05564142194744977,
+      "grad_norm": 0.33960813975720344,
+      "learning_rate": 0.00011111111111111112,
+      "loss": 0.3557,
+      "step": 270
+    },
+    {
+      "epoch": 0.05667181865018032,
+      "grad_norm": 0.45056548565314214,
+      "learning_rate": 0.0001131687242798354,
+      "loss": 0.3631,
+      "step": 275
+    },
+    {
+      "epoch": 0.05770221535291087,
+      "grad_norm": 0.3659158092281187,
+      "learning_rate": 0.00011522633744855968,
+      "loss": 0.314,
+      "step": 280
+    },
+    {
+      "epoch": 0.05873261205564142,
+      "grad_norm": 0.4438288771100659,
+      "learning_rate": 0.00011728395061728397,
+      "loss": 0.36,
+      "step": 285
+    },
+    {
+      "epoch": 0.059763008758371976,
+      "grad_norm": 0.418631875979075,
+      "learning_rate": 0.00011934156378600823,
+      "loss": 0.3419,
+      "step": 290
+    },
+    {
+      "epoch": 0.060793405461102526,
+      "grad_norm": 0.4396365282210275,
+      "learning_rate": 0.0001213991769547325,
+      "loss": 0.3928,
+      "step": 295
+    },
+    {
+      "epoch": 0.061823802163833076,
+      "grad_norm": 0.4402925268314175,
+      "learning_rate": 0.0001234567901234568,
+      "loss": 0.4072,
+      "step": 300
+    },
+    {
+      "epoch": 0.06285419886656363,
+      "grad_norm": 0.4563210413645905,
+      "learning_rate": 0.00012551440329218108,
+      "loss": 0.3218,
+      "step": 305
+    },
+    {
+      "epoch": 0.06388459556929418,
+      "grad_norm": 0.4951662000563643,
+      "learning_rate": 0.00012757201646090534,
+      "loss": 0.3638,
+      "step": 310
+    },
+    {
+      "epoch": 0.06491499227202473,
+      "grad_norm": 0.3406684829409407,
+      "learning_rate": 0.00012962962962962963,
+      "loss": 0.3247,
+      "step": 315
+    },
+    {
+      "epoch": 0.06594538897475528,
+      "grad_norm": 0.3849603318857793,
+      "learning_rate": 0.00013168724279835392,
+      "loss": 0.3775,
+      "step": 320
+    },
+    {
+      "epoch": 0.06697578567748583,
+      "grad_norm": 0.47213282241970134,
+      "learning_rate": 0.0001337448559670782,
+      "loss": 0.4242,
+      "step": 325
+    },
+    {
+      "epoch": 0.06800618238021638,
+      "grad_norm": 0.4081937117941576,
+      "learning_rate": 0.00013580246913580247,
+      "loss": 0.3297,
+      "step": 330
+    },
+    {
+      "epoch": 0.06903657908294693,
+      "grad_norm": 0.4290389394884079,
+      "learning_rate": 0.00013786008230452676,
+      "loss": 0.3824,
+      "step": 335
+    },
+    {
+      "epoch": 0.07006697578567748,
+      "grad_norm": 0.40035955885155716,
+      "learning_rate": 0.00013991769547325105,
+      "loss": 0.3567,
+      "step": 340
+    },
+    {
+      "epoch": 0.07109737248840804,
+      "grad_norm": 0.35369148138173667,
+      "learning_rate": 0.00014197530864197534,
+      "loss": 0.3851,
+      "step": 345
+    },
+    {
+      "epoch": 0.07212776919113859,
+      "grad_norm": 0.3933488263087153,
+      "learning_rate": 0.0001440329218106996,
+      "loss": 0.3941,
+      "step": 350
+    },
+    {
+      "epoch": 0.07315816589386914,
+      "grad_norm": 0.3403535421027664,
+      "learning_rate": 0.00014609053497942386,
+      "loss": 0.3279,
+      "step": 355
+    },
+    {
+      "epoch": 0.07418856259659969,
+      "grad_norm": 0.3901955358278729,
+      "learning_rate": 0.00014814814814814815,
+      "loss": 0.3892,
+      "step": 360
+    },
+    {
+      "epoch": 0.07521895929933024,
+      "grad_norm": 0.3942505006346225,
+      "learning_rate": 0.00015020576131687243,
+      "loss": 0.3409,
+      "step": 365
+    },
+    {
+      "epoch": 0.0762493560020608,
+      "grad_norm": 0.36161132887938113,
+      "learning_rate": 0.00015226337448559672,
+      "loss": 0.3653,
+      "step": 370
+    },
+    {
+      "epoch": 0.07727975270479134,
+      "grad_norm": 0.35951863605570794,
+      "learning_rate": 0.00015432098765432098,
+      "loss": 0.369,
+      "step": 375
+    },
+    {
+      "epoch": 0.0783101494075219,
+      "grad_norm": 0.3508546244263811,
+      "learning_rate": 0.00015637860082304527,
+      "loss": 0.3609,
+      "step": 380
+    },
+    {
+      "epoch": 0.07934054611025244,
+      "grad_norm": 0.4027043746098036,
+      "learning_rate": 0.00015843621399176956,
+      "loss": 0.3736,
+      "step": 385
+    },
+    {
+      "epoch": 0.080370942812983,
+      "grad_norm": 0.32030210240674106,
+      "learning_rate": 0.00016049382716049385,
+      "loss": 0.3543,
+      "step": 390
+    },
+    {
+      "epoch": 0.08140133951571354,
+      "grad_norm": 0.3037761803574099,
+      "learning_rate": 0.0001625514403292181,
+      "loss": 0.3657,
+      "step": 395
+    },
+    {
+      "epoch": 0.0824317362184441,
+      "grad_norm": 0.47893634163074095,
+      "learning_rate": 0.0001646090534979424,
+      "loss": 0.4117,
+      "step": 400
+    },
+    {
+      "epoch": 0.0824317362184441,
+      "eval_loss": 0.3370859622955322,
+      "eval_runtime": 2880.7606,
+      "eval_samples_per_second": 2.777,
+      "eval_steps_per_second": 0.347,
+      "step": 400
+    },
+    {
+      "epoch": 0.08346213292117466,
+      "grad_norm": 0.3512494491395804,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.3436,
+      "step": 405
+    },
+    {
+      "epoch": 0.0844925296239052,
+      "grad_norm": 0.35871617137745193,
+      "learning_rate": 0.00016872427983539098,
+      "loss": 0.3617,
+      "step": 410
+    },
+    {
+      "epoch": 0.08552292632663576,
+      "grad_norm": 0.3508752237247138,
+      "learning_rate": 0.00017078189300411524,
+      "loss": 0.3472,
+      "step": 415
+    },
+    {
+      "epoch": 0.0865533230293663,
+      "grad_norm": 0.40385959435891516,
+      "learning_rate": 0.0001728395061728395,
+      "loss": 0.3353,
+      "step": 420
+    },
+    {
+      "epoch": 0.08758371973209686,
+      "grad_norm": 0.346542199921827,
+      "learning_rate": 0.0001748971193415638,
+      "loss": 0.3251,
+      "step": 425
+    },
+    {
+      "epoch": 0.0886141164348274,
+      "grad_norm": 0.32327444973794744,
+      "learning_rate": 0.00017695473251028808,
+      "loss": 0.2782,
+      "step": 430
+    },
+    {
+      "epoch": 0.08964451313755796,
+      "grad_norm": 0.4605590792675189,
+      "learning_rate": 0.00017901234567901234,
+      "loss": 0.3569,
+      "step": 435
+    },
+    {
+      "epoch": 0.09067490984028852,
+      "grad_norm": 0.33078249165297896,
+      "learning_rate": 0.00018106995884773663,
+      "loss": 0.336,
+      "step": 440
+    },
+    {
+      "epoch": 0.09170530654301906,
+      "grad_norm": 0.3755269642559541,
+      "learning_rate": 0.00018312757201646091,
+      "loss": 0.3523,
+      "step": 445
+    },
+    {
+      "epoch": 0.09273570324574962,
+      "grad_norm": 0.3872427530097071,
+      "learning_rate": 0.0001851851851851852,
+      "loss": 0.3798,
+      "step": 450
+    },
+    {
+      "epoch": 0.09376609994848016,
+      "grad_norm": 0.3360831675941825,
+      "learning_rate": 0.00018724279835390946,
+      "loss": 0.294,
+      "step": 455
+    },
+    {
+      "epoch": 0.09479649665121072,
+      "grad_norm": 0.34721503385661145,
+      "learning_rate": 0.00018930041152263375,
+      "loss": 0.3474,
+      "step": 460
+    },
+    {
+      "epoch": 0.09582689335394126,
+      "grad_norm": 0.4035919572242035,
+      "learning_rate": 0.00019135802469135804,
+      "loss": 0.3334,
+      "step": 465
+    },
+    {
+      "epoch": 0.09685729005667182,
+      "grad_norm": 0.3634206383026372,
+      "learning_rate": 0.00019341563786008233,
+      "loss": 0.3529,
+      "step": 470
+    },
+    {
+      "epoch": 0.09788768675940238,
+      "grad_norm": 0.47867112342634655,
+      "learning_rate": 0.0001954732510288066,
+      "loss": 0.3766,
+      "step": 475
+    },
+    {
+      "epoch": 0.09891808346213292,
+      "grad_norm": 0.2683190757613072,
+      "learning_rate": 0.00019753086419753085,
+      "loss": 0.2898,
+      "step": 480
+    },
+    {
+      "epoch": 0.09994848016486348,
+      "grad_norm": 0.35359467324165766,
+      "learning_rate": 0.00019958847736625514,
+      "loss": 0.343,
+      "step": 485
+    },
+    {
+      "epoch": 0.10097887686759402,
+      "grad_norm": 0.3024165989651168,
+      "learning_rate": 0.00019999958578867407,
+      "loss": 0.2837,
+      "step": 490
+    },
+    {
+      "epoch": 0.10200927357032458,
+      "grad_norm": 0.3060378938897583,
+      "learning_rate": 0.00019999790306104336,
+      "loss": 0.3267,
+      "step": 495
+    },
+    {
+      "epoch": 0.10303967027305512,
+      "grad_norm": 0.3849754123838811,
+      "learning_rate": 0.0001999949259506647,
+      "loss": 0.367,
+      "step": 500
+    },
+    {
+      "epoch": 0.10407006697578568,
+      "grad_norm": 0.357883050197563,
+      "learning_rate": 0.00019999065449607402,
+      "loss": 0.2918,
+      "step": 505
+    },
+    {
+      "epoch": 0.10510046367851623,
+      "grad_norm": 0.41425939893167707,
+      "learning_rate": 0.00019998508875256158,
+      "loss": 0.3707,
+      "step": 510
+    },
+    {
+      "epoch": 0.10613086038124678,
+      "grad_norm": 0.30851216885867083,
+      "learning_rate": 0.0001999782287921708,
+      "loss": 0.3154,
+      "step": 515
+    },
+    {
+      "epoch": 0.10716125708397733,
+      "grad_norm": 0.3349002143749146,
+      "learning_rate": 0.00019997007470369773,
+      "loss": 0.3502,
+      "step": 520
+    },
+    {
+      "epoch": 0.10819165378670788,
+      "grad_norm": 0.37146332082387845,
+      "learning_rate": 0.0001999606265926897,
+      "loss": 0.3824,
+      "step": 525
+    },
+    {
+      "epoch": 0.10922205048943844,
+      "grad_norm": 0.30215937352060623,
+      "learning_rate": 0.000199949884581444,
+      "loss": 0.3011,
+      "step": 530
+    },
+    {
+      "epoch": 0.11025244719216898,
+      "grad_norm": 0.33400076650466676,
+      "learning_rate": 0.00019993784880900623,
+      "loss": 0.3178,
+      "step": 535
+    },
+    {
+      "epoch": 0.11128284389489954,
+      "grad_norm": 0.5100263753382147,
+      "learning_rate": 0.0001999245194311687,
+      "loss": 0.3327,
+      "step": 540
+    },
+    {
+      "epoch": 0.11231324059763009,
+      "grad_norm": 0.35486673055341267,
+      "learning_rate": 0.00019990989662046818,
+      "loss": 0.3397,
+      "step": 545
+    },
+    {
+      "epoch": 0.11334363730036064,
+      "grad_norm": 0.4302123439623329,
+      "learning_rate": 0.0001998939805661837,
+      "loss": 0.3359,
+      "step": 550
+    },
+    {
+      "epoch": 0.1143740340030912,
+      "grad_norm": 0.3424158782854524,
+      "learning_rate": 0.00019987677147433432,
+      "loss": 0.3034,
+      "step": 555
+    },
+    {
+      "epoch": 0.11540443070582174,
+      "grad_norm": 0.3322620075239061,
+      "learning_rate": 0.0001998582695676762,
+      "loss": 0.3561,
+      "step": 560
+    },
+    {
+      "epoch": 0.1164348274085523,
+      "grad_norm": 0.34133172554856017,
+      "learning_rate": 0.00019983847508569987,
+      "loss": 0.3092,
+      "step": 565
+    },
+    {
+      "epoch": 0.11746522411128284,
+      "grad_norm": 0.3114364442803864,
+      "learning_rate": 0.00019981738828462703,
+      "loss": 0.3485,
+      "step": 570
+    },
+    {
+      "epoch": 0.1184956208140134,
+      "grad_norm": 0.4465425096855801,
+      "learning_rate": 0.00019979500943740735,
+      "loss": 0.3786,
+      "step": 575
+    },
+    {
+      "epoch": 0.11952601751674395,
+      "grad_norm": 0.31716364711744127,
+      "learning_rate": 0.00019977133883371478,
+      "loss": 0.2759,
+      "step": 580
+    },
+    {
+      "epoch": 0.1205564142194745,
+      "grad_norm": 0.39768793428389815,
+      "learning_rate": 0.00019974637677994404,
+      "loss": 0.3397,
+      "step": 585
+    },
+    {
+      "epoch": 0.12158681092220505,
+      "grad_norm": 0.36329417504514366,
+      "learning_rate": 0.00019972012359920638,
+      "loss": 0.3363,
+      "step": 590
+    },
+    {
+      "epoch": 0.1226172076249356,
+      "grad_norm": 0.34276825012450907,
+      "learning_rate": 0.0001996925796313256,
+      "loss": 0.362,
+      "step": 595
+    },
+    {
+      "epoch": 0.12364760432766615,
+      "grad_norm": 0.3378267807010673,
+      "learning_rate": 0.00019966374523283347,
+      "loss": 0.3577,
+      "step": 600
+    },
+    {
+      "epoch": 0.12364760432766615,
+      "eval_loss": 0.3276773989200592,
+      "eval_runtime": 2883.345,
+      "eval_samples_per_second": 2.775,
+      "eval_steps_per_second": 0.347,
+      "step": 600
+    },
+    {
+      "epoch": 0.12467800103039671,
+      "grad_norm": 0.3741701704271306,
+      "learning_rate": 0.00019963362077696537,
+      "loss": 0.2976,
+      "step": 605
+    },
+    {
+      "epoch": 0.12570839773312725,
+      "grad_norm": 0.3477249258246963,
+      "learning_rate": 0.00019960220665365518,
+      "loss": 0.3612,
+      "step": 610
+    },
+    {
+      "epoch": 0.1267387944358578,
+      "grad_norm": 0.3387684957358674,
+      "learning_rate": 0.0001995695032695305,
+      "loss": 0.3462,
+      "step": 615
+    },
+    {
+      "epoch": 0.12776919113858837,
+      "grad_norm": 0.3239206580729764,
+      "learning_rate": 0.0001995355110479071,
+      "loss": 0.3366,
+      "step": 620
+    },
+    {
+      "epoch": 0.1287995878413189,
+      "grad_norm": 0.405004728947829,
+      "learning_rate": 0.00019950023042878366,
+      "loss": 0.3746,
+      "step": 625
+    },
+    {
+      "epoch": 0.12982998454404945,
+      "grad_norm": 0.3198722424159481,
+      "learning_rate": 0.00019946366186883604,
+      "loss": 0.274,
+      "step": 630
+    },
+    {
+      "epoch": 0.13086038124678,
+      "grad_norm": 0.35143449361980517,
+      "learning_rate": 0.00019942580584141127,
+      "loss": 0.3399,
+      "step": 635
+    },
+    {
+      "epoch": 0.13189077794951057,
+      "grad_norm": 0.35776677388978995,
+      "learning_rate": 0.0001993866628365215,
+      "loss": 0.3188,
+      "step": 640
+    },
+    {
+      "epoch": 0.13292117465224113,
+      "grad_norm": 0.28985796662067814,
+      "learning_rate": 0.00019934623336083772,
+      "loss": 0.325,
+      "step": 645
+    },
+    {
+      "epoch": 0.13395157135497165,
+      "grad_norm": 0.4298738885622017,
+      "learning_rate": 0.00019930451793768298,
+      "loss": 0.3716,
+      "step": 650
+    },
+    {
+      "epoch": 0.1349819680577022,
+      "grad_norm": 0.4249611424637099,
+      "learning_rate": 0.00019926151710702588,
+      "loss": 0.316,
+      "step": 655
+    },
+    {
+      "epoch": 0.13601236476043277,
+      "grad_norm": 0.3264504540380892,
+      "learning_rate": 0.00019921723142547347,
+      "loss": 0.3485,
+      "step": 660
+    },
+    {
+      "epoch": 0.13704276146316333,
+      "grad_norm": 0.35251441037598413,
+      "learning_rate": 0.00019917166146626392,
+      "loss": 0.3001,
+      "step": 665
+    },
+    {
+      "epoch": 0.13807315816589386,
+      "grad_norm": 0.35147072953110736,
+      "learning_rate": 0.0001991248078192593,
+      "loss": 0.3482,
+      "step": 670
+    },
+    {
+      "epoch": 0.1391035548686244,
+      "grad_norm": 0.3954004821084264,
+      "learning_rate": 0.00019907667109093794,
+      "loss": 0.383,
+      "step": 675
+    },
+    {
+      "epoch": 0.14013395157135497,
+      "grad_norm": 0.3767557406947751,
+      "learning_rate": 0.00019902725190438627,
+      "loss": 0.3111,
+      "step": 680
+    },
+    {
+      "epoch": 0.14116434827408553,
+      "grad_norm": 0.37536811330325576,
+      "learning_rate": 0.00019897655089929126,
+      "loss": 0.3581,
+      "step": 685
+    },
+    {
+      "epoch": 0.14219474497681608,
+      "grad_norm": 0.3390394380307238,
+      "learning_rate": 0.00019892456873193165,
+      "loss": 0.3034,
+      "step": 690
+    },
+    {
+      "epoch": 0.1432251416795466,
+      "grad_norm": 0.3536586170228539,
+      "learning_rate": 0.00019887130607516978,
+      "loss": 0.3709,
+      "step": 695
+    },
+    {
+      "epoch": 0.14425553838227717,
+      "grad_norm": 0.4333572658960968,
+      "learning_rate": 0.00019881676361844275,
+      "loss": 0.365,
+      "step": 700
+    },
+    {
+      "epoch": 0.14528593508500773,
+      "grad_norm": 0.3450036993380974,
+      "learning_rate": 0.0001987609420677535,
+      "loss": 0.2859,
+      "step": 705
+    },
+    {
+      "epoch": 0.14631633178773829,
+      "grad_norm": 0.37669321445499737,
+      "learning_rate": 0.00019870384214566174,
+      "loss": 0.3351,
+      "step": 710
+    },
+    {
+      "epoch": 0.14734672849046884,
+      "grad_norm": 0.30441804003835093,
+      "learning_rate": 0.00019864546459127448,
+      "loss": 0.3173,
+      "step": 715
+    },
+    {
+      "epoch": 0.14837712519319937,
+      "grad_norm": 0.3562294792918495,
+      "learning_rate": 0.0001985858101602366,
+      "loss": 0.3527,
+      "step": 720
+    },
+    {
+      "epoch": 0.14940752189592993,
+      "grad_norm": 0.4228017846853497,
+      "learning_rate": 0.0001985248796247209,
+      "loss": 0.3503,
+      "step": 725
+    },
+    {
+      "epoch": 0.15043791859866049,
+      "grad_norm": 0.3669422341454514,
+      "learning_rate": 0.00019846267377341827,
+      "loss": 0.3211,
+      "step": 730
+    },
+    {
+      "epoch": 0.15146831530139104,
+      "grad_norm": 0.8193549662084493,
+      "learning_rate": 0.00019839919341152742,
+      "loss": 0.3723,
+      "step": 735
+    },
+    {
+      "epoch": 0.1524987120041216,
+      "grad_norm": 0.3192375696482019,
+      "learning_rate": 0.00019833443936074442,
+      "loss": 0.3307,
+      "step": 740
+    },
+    {
+      "epoch": 0.15352910870685213,
+      "grad_norm": 0.37787227997284556,
+      "learning_rate": 0.00019826841245925212,
+      "loss": 0.3543,
+      "step": 745
+    },
+    {
+      "epoch": 0.1545595054095827,
+      "grad_norm": 0.3977868146433894,
+      "learning_rate": 0.00019820111356170923,
+      "loss": 0.3901,
+      "step": 750
+    },
+    {
+      "epoch": 0.15558990211231324,
+      "grad_norm": 0.31941675552699234,
+      "learning_rate": 0.00019813254353923937,
+      "loss": 0.3238,
+      "step": 755
+    },
+    {
+      "epoch": 0.1566202988150438,
+      "grad_norm": 0.3308867953958594,
+      "learning_rate": 0.00019806270327941971,
+      "loss": 0.335,
+      "step": 760
+    },
+    {
+      "epoch": 0.15765069551777433,
+      "grad_norm": 0.35793702629959095,
+      "learning_rate": 0.00019799159368626945,
+      "loss": 0.2967,
+      "step": 765
+    },
+    {
+      "epoch": 0.1586810922205049,
+      "grad_norm": 0.3990749337961218,
+      "learning_rate": 0.00019791921568023822,
+      "loss": 0.3647,
+      "step": 770
+    },
+    {
+      "epoch": 0.15971148892323545,
+      "grad_norm": 0.37712646550547224,
+      "learning_rate": 0.00019784557019819404,
+      "loss": 0.3639,
+      "step": 775
+    },
+    {
+      "epoch": 0.160741885625966,
+      "grad_norm": 0.3039841465338024,
+      "learning_rate": 0.00019777065819341137,
+      "loss": 0.2798,
+      "step": 780
+    },
+    {
+      "epoch": 0.16177228232869656,
+      "grad_norm": 0.33026286060468363,
+      "learning_rate": 0.00019769448063555856,
+      "loss": 0.3811,
+      "step": 785
+    },
+    {
+      "epoch": 0.1628026790314271,
+      "grad_norm": 0.41703134958746274,
+      "learning_rate": 0.00019761703851068553,
+      "loss": 0.3082,
+      "step": 790
+    },
+    {
+      "epoch": 0.16383307573415765,
+      "grad_norm": 0.3508751011833616,
+      "learning_rate": 0.0001975383328212107,
+      "loss": 0.3379,
+      "step": 795
+    },
+    {
+      "epoch": 0.1648634724368882,
+      "grad_norm": 0.4074398780982895,
+      "learning_rate": 0.00019745836458590836,
+      "loss": 0.3594,
+      "step": 800
+    },
+    {
+      "epoch": 0.1648634724368882,
+      "eval_loss": 0.31942659616470337,
+      "eval_runtime": 2882.3337,
+      "eval_samples_per_second": 2.776,
+      "eval_steps_per_second": 0.347,
+      "step": 800
+    },
+    {
+      "epoch": 0.16589386913961876,
+      "grad_norm": 0.368070769838129,
+      "learning_rate": 0.0001973771348398953,
+      "loss": 0.2961,
+      "step": 805
+    },
+    {
+      "epoch": 0.16692426584234932,
+      "grad_norm": 0.38698709364654266,
+      "learning_rate": 0.0001972946446346173,
+      "loss": 0.3399,
+      "step": 810
+    },
+    {
+      "epoch": 0.16795466254507985,
+      "grad_norm": 0.36943660915830057,
+      "learning_rate": 0.00019721089503783577,
+      "loss": 0.3625,
+      "step": 815
+    },
+    {
+      "epoch": 0.1689850592478104,
+      "grad_norm": 0.2988194350307528,
+      "learning_rate": 0.00019712588713361378,
+      "loss": 0.3206,
+      "step": 820
+    },
+    {
+      "epoch": 0.17001545595054096,
+      "grad_norm": 0.4353851924786793,
+      "learning_rate": 0.00019703962202230203,
+      "loss": 0.3588,
+      "step": 825
+    },
+    {
+      "epoch": 0.17104585265327152,
+      "grad_norm": 0.31052731774203746,
+      "learning_rate": 0.00019695210082052472,
+      "loss": 0.3029,
+      "step": 830
+    },
+    {
+      "epoch": 0.17207624935600205,
+      "grad_norm": 0.34433049046241526,
+      "learning_rate": 0.00019686332466116487,
+      "loss": 0.3371,
+      "step": 835
+    },
+    {
+      "epoch": 0.1731066460587326,
+      "grad_norm": 0.30745452191681344,
+      "learning_rate": 0.0001967732946933499,
+      "loss": 0.2944,
+      "step": 840
+    },
+    {
+      "epoch": 0.17413704276146316,
+      "grad_norm": 0.31184697151248314,
+      "learning_rate": 0.00019668201208243658,
+      "loss": 0.3055,
+      "step": 845
+    },
+    {
+      "epoch": 0.17516743946419372,
+      "grad_norm": 0.30600883974803933,
+      "learning_rate": 0.0001965894780099961,
+      "loss": 0.3842,
+      "step": 850
+    },
+    {
+      "epoch": 0.17619783616692428,
+      "grad_norm": 0.2887356407325159,
+      "learning_rate": 0.00019649569367379867,
+      "loss": 0.3003,
+      "step": 855
+    },
+    {
+      "epoch": 0.1772282328696548,
+      "grad_norm": 0.3432548851322827,
+      "learning_rate": 0.00019640066028779794,
+      "loss": 0.3479,
+      "step": 860
+    },
+    {
+      "epoch": 0.17825862957238536,
+      "grad_norm": 0.3174948728433821,
+      "learning_rate": 0.00019630437908211548,
+      "loss": 0.3126,
+      "step": 865
+    },
+    {
+      "epoch": 0.17928902627511592,
+      "grad_norm": 0.3097638932288133,
+      "learning_rate": 0.00019620685130302478,
+      "loss": 0.3312,
+      "step": 870
+    },
+    {
+      "epoch": 0.18031942297784648,
+      "grad_norm": 0.42841659366456675,
+      "learning_rate": 0.00019610807821293503,
+      "loss": 0.3557,
+      "step": 875
+    },
+    {
+      "epoch": 0.18134981968057703,
+      "grad_norm": 0.32435713533284716,
+      "learning_rate": 0.00019600806109037485,
+      "loss": 0.3339,
+      "step": 880
+    },
+    {
+      "epoch": 0.18238021638330756,
+      "grad_norm": 0.353414630324643,
+      "learning_rate": 0.00019590680122997582,
+      "loss": 0.3283,
+      "step": 885
+    },
+    {
+      "epoch": 0.18341061308603812,
+      "grad_norm": 0.316231400356792,
+      "learning_rate": 0.00019580429994245555,
+      "loss": 0.3368,
+      "step": 890
+    },
+    {
+      "epoch": 0.18444100978876868,
+      "grad_norm": 0.3585149987799701,
+      "learning_rate": 0.0001957005585546009,
+      "loss": 0.3356,
+      "step": 895
+    },
+    {
+      "epoch": 0.18547140649149924,
+      "grad_norm": 0.36281772675718904,
+      "learning_rate": 0.00019559557840925055,
+      "loss": 0.3593,
+      "step": 900
+    },
+    {
+      "epoch": 0.18650180319422976,
+      "grad_norm": 0.33084410964043454,
+      "learning_rate": 0.00019548936086527798,
+      "loss": 0.2723,
+      "step": 905
+    },
+    {
+      "epoch": 0.18753219989696032,
+      "grad_norm": 0.30849057849040473,
+      "learning_rate": 0.00019538190729757356,
+      "loss": 0.3333,
+      "step": 910
+    },
+    {
+      "epoch": 0.18856259659969088,
+      "grad_norm": 0.3332033528248514,
+      "learning_rate": 0.00019527321909702688,
+      "loss": 0.3154,
+      "step": 915
+    },
+    {
+      "epoch": 0.18959299330242144,
+      "grad_norm": 0.39247405346568126,
+      "learning_rate": 0.00019516329767050878,
+      "loss": 0.3034,
+      "step": 920
+    },
+    {
+      "epoch": 0.190623390005152,
+      "grad_norm": 0.38738599137518354,
+      "learning_rate": 0.00019505214444085308,
+      "loss": 0.3755,
+      "step": 925
+    },
+    {
+      "epoch": 0.19165378670788252,
+      "grad_norm": 0.3353639819496788,
+      "learning_rate": 0.00019493976084683813,
+      "loss": 0.3224,
+      "step": 930
+    },
+    {
+      "epoch": 0.19268418341061308,
+      "grad_norm": 0.3475835339204704,
+      "learning_rate": 0.00019482614834316836,
+      "loss": 0.3368,
+      "step": 935
+    },
+    {
+      "epoch": 0.19371458011334364,
+      "grad_norm": 0.3383417577483814,
+      "learning_rate": 0.00019471130840045518,
+      "loss": 0.3114,
+      "step": 940
+    },
+    {
+      "epoch": 0.1947449768160742,
+      "grad_norm": 0.4095387754149621,
+      "learning_rate": 0.00019459524250519826,
+      "loss": 0.3466,
+      "step": 945
+    },
+    {
+      "epoch": 0.19577537351880475,
+      "grad_norm": 0.3425734449753018,
+      "learning_rate": 0.00019447795215976594,
+      "loss": 0.3426,
+      "step": 950
+    },
+    {
+      "epoch": 0.19680577022153528,
+      "grad_norm": 0.36205423513820917,
+      "learning_rate": 0.0001943594388823761,
+      "loss": 0.2825,
+      "step": 955
+    },
+    {
+      "epoch": 0.19783616692426584,
+      "grad_norm": 0.3722404265826129,
+      "learning_rate": 0.00019423970420707627,
+      "loss": 0.3471,
+      "step": 960
+    },
+    {
+      "epoch": 0.1988665636269964,
+      "grad_norm": 0.40847615253258024,
+      "learning_rate": 0.00019411874968372402,
+      "loss": 0.3151,
+      "step": 965
+    },
+    {
+      "epoch": 0.19989696032972695,
+      "grad_norm": 0.3379508791065496,
+      "learning_rate": 0.00019399657687796658,
+      "loss": 0.3414,
+      "step": 970
+    },
+    {
+      "epoch": 0.2009273570324575,
+      "grad_norm": 0.34821921677428874,
+      "learning_rate": 0.00019387318737122092,
+      "loss": 0.3034,
+      "step": 975
+    },
+    {
+      "epoch": 0.20195775373518804,
+      "grad_norm": 0.4453864463102829,
+      "learning_rate": 0.0001937485827606529,
+      "loss": 0.3158,
+      "step": 980
+    },
+    {
+      "epoch": 0.2029881504379186,
+      "grad_norm": 0.32502909512279515,
+      "learning_rate": 0.00019362276465915702,
+      "loss": 0.3426,
+      "step": 985
+    },
+    {
+      "epoch": 0.20401854714064915,
+      "grad_norm": 0.33635157429335094,
+      "learning_rate": 0.0001934957346953352,
+      "loss": 0.3088,
+      "step": 990
+    },
+    {
+      "epoch": 0.2050489438433797,
+      "grad_norm": 0.32532872733870094,
+      "learning_rate": 0.00019336749451347586,
+      "loss": 0.3311,
+      "step": 995
+    },
+    {
+      "epoch": 0.20607934054611024,
+      "grad_norm": 0.4664188938536663,
+      "learning_rate": 0.0001932380457735326,
+      "loss": 0.3603,
+      "step": 1000
+    },
+    {
+      "epoch": 0.20607934054611024,
+      "eval_loss": 0.30960944294929504,
+      "eval_runtime": 2881.9251,
+      "eval_samples_per_second": 2.776,
+      "eval_steps_per_second": 0.347,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2071097372488408,
+      "grad_norm": 0.30204444974859856,
+      "learning_rate": 0.00019310739015110267,
+      "loss": 0.3048,
+      "step": 1005
+    },
+    {
+      "epoch": 0.20814013395157135,
+      "grad_norm": 0.366246259849553,
+      "learning_rate": 0.00019297552933740547,
+      "loss": 0.3329,
+      "step": 1010
+    },
+    {
+      "epoch": 0.2091705306543019,
+      "grad_norm": 0.42600306630277895,
+      "learning_rate": 0.0001928424650392603,
+      "loss": 0.3223,
+      "step": 1015
+    },
+    {
+      "epoch": 0.21020092735703247,
+      "grad_norm": 0.4447553605441752,
+      "learning_rate": 0.00019270819897906468,
+      "loss": 0.3232,
+      "step": 1020
+    },
+    {
+      "epoch": 0.211231324059763,
+      "grad_norm": 0.3949912941897582,
+      "learning_rate": 0.00019257273289477174,
+      "loss": 0.3383,
+      "step": 1025
+    },
+    {
+      "epoch": 0.21226172076249356,
+      "grad_norm": 0.34413838854516726,
+      "learning_rate": 0.00019243606853986786,
+      "loss": 0.2956,
+      "step": 1030
+    },
+    {
+      "epoch": 0.2132921174652241,
+      "grad_norm": 0.32739075098948817,
+      "learning_rate": 0.00019229820768335,
+      "loss": 0.3441,
+      "step": 1035
+    },
+    {
+      "epoch": 0.21432251416795467,
+      "grad_norm": 0.31062731875466804,
+      "learning_rate": 0.00019215915210970267,
+      "loss": 0.2817,
+      "step": 1040
+    },
+    {
+      "epoch": 0.21535291087068523,
+      "grad_norm": 0.3603886132870308,
+      "learning_rate": 0.00019201890361887506,
+      "loss": 0.3199,
+      "step": 1045
+    },
+    {
+      "epoch": 0.21638330757341576,
+      "grad_norm": 0.4203386465797232,
+      "learning_rate": 0.0001918774640262574,
+      "loss": 0.3599,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2174137042761463,
+      "grad_norm": 0.37186453766073907,
+      "learning_rate": 0.00019173483516265788,
+      "loss": 0.3261,
+      "step": 1055
+    },
+    {
+      "epoch": 0.21844410097887687,
+      "grad_norm": 0.3357291647533494,
+      "learning_rate": 0.00019159101887427854,
+      "loss": 0.3101,
+      "step": 1060
+    },
+    {
+      "epoch": 0.21947449768160743,
+      "grad_norm": 0.2839969390914523,
+      "learning_rate": 0.00019144601702269162,
+      "loss": 0.3065,
+      "step": 1065
+    },
+    {
+      "epoch": 0.22050489438433796,
+      "grad_norm": 0.37955068309626705,
+      "learning_rate": 0.00019129983148481552,
+      "loss": 0.3205,
+      "step": 1070
+    },
+    {
+      "epoch": 0.22153529108706851,
+      "grad_norm": 0.3864113540369065,
+      "learning_rate": 0.0001911524641528902,
+      "loss": 0.3304,
+      "step": 1075
+    },
+    {
+      "epoch": 0.22256568778979907,
+      "grad_norm": 0.30427047902449084,
+      "learning_rate": 0.00019100391693445306,
+      "loss": 0.2778,
+      "step": 1080
+    },
+    {
+      "epoch": 0.22359608449252963,
+      "grad_norm": 0.3141521581653154,
+      "learning_rate": 0.00019085419175231394,
+      "loss": 0.3165,
+      "step": 1085
+    },
+    {
+      "epoch": 0.22462648119526019,
+      "grad_norm": 0.3274727504782773,
+      "learning_rate": 0.00019070329054453046,
+      "loss": 0.2788,
+      "step": 1090
+    },
+    {
+      "epoch": 0.22565687789799072,
+      "grad_norm": 0.307921166920738,
+      "learning_rate": 0.00019055121526438272,
+      "loss": 0.3032,
+      "step": 1095
+    },
+    {
+      "epoch": 0.22668727460072127,
+      "grad_norm": 0.5011206814587393,
+      "learning_rate": 0.00019039796788034822,
+      "loss": 0.3707,
+      "step": 1100
+    },
+    {
+      "epoch": 0.22771767130345183,
+      "grad_norm": 0.3031868388904063,
+      "learning_rate": 0.00019024355037607622,
+      "loss": 0.3075,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2287480680061824,
+      "grad_norm": 0.4079301863341415,
+      "learning_rate": 0.00019008796475036222,
+      "loss": 0.3533,
+      "step": 1110
+    },
+    {
+      "epoch": 0.22977846470891294,
+      "grad_norm": 0.36502776550467453,
+      "learning_rate": 0.00018993121301712193,
+      "loss": 0.3018,
+      "step": 1115
+    },
+    {
+      "epoch": 0.23080886141164347,
+      "grad_norm": 0.36939878212398075,
+      "learning_rate": 0.00018977329720536529,
+      "loss": 0.3577,
+      "step": 1120
+    },
+    {
+      "epoch": 0.23183925811437403,
+      "grad_norm": 0.46361278352012,
+      "learning_rate": 0.00018961421935917016,
+      "loss": 0.3344,
+      "step": 1125
+    },
+    {
+      "epoch": 0.2328696548171046,
+      "grad_norm": 0.3264834812438753,
+      "learning_rate": 0.00018945398153765597,
+      "loss": 0.2821,
+      "step": 1130
+    },
+    {
+      "epoch": 0.23390005151983514,
+      "grad_norm": 0.3590984056606702,
+      "learning_rate": 0.00018929258581495685,
+      "loss": 0.3186,
+      "step": 1135
+    },
+    {
+      "epoch": 0.23493044822256567,
+      "grad_norm": 0.314422298134173,
+      "learning_rate": 0.00018913003428019506,
+      "loss": 0.2805,
+      "step": 1140
+    },
+    {
+      "epoch": 0.23596084492529623,
+      "grad_norm": 0.40817489184028294,
+      "learning_rate": 0.00018896632903745374,
+      "loss": 0.3382,
+      "step": 1145
+    },
+    {
+      "epoch": 0.2369912416280268,
+      "grad_norm": 0.4550813349162488,
+      "learning_rate": 0.00018880147220574976,
+      "loss": 0.3704,
+      "step": 1150
+    },
+    {
+      "epoch": 0.23802163833075735,
+      "grad_norm": 0.36999200007127014,
+      "learning_rate": 0.00018863546591900622,
+      "loss": 0.2956,
+      "step": 1155
+    },
+    {
+      "epoch": 0.2390520350334879,
+      "grad_norm": 0.3907765146123421,
+      "learning_rate": 0.00018846831232602492,
+      "loss": 0.3306,
+      "step": 1160
+    },
+    {
+      "epoch": 0.24008243173621843,
+      "grad_norm": 0.2290757131338595,
+      "learning_rate": 0.00018830001359045845,
+      "loss": 0.2949,
+      "step": 1165
+    },
+    {
+      "epoch": 0.241112828438949,
+      "grad_norm": 0.3209905523588552,
+      "learning_rate": 0.00018813057189078243,
+      "loss": 0.321,
+      "step": 1170
+    },
+    {
+      "epoch": 0.24214322514167955,
+      "grad_norm": 0.3773596332992971,
+      "learning_rate": 0.00018795998942026685,
+      "loss": 0.3522,
+      "step": 1175
+    },
+    {
+      "epoch": 0.2431736218444101,
+      "grad_norm": 0.34209228943160486,
+      "learning_rate": 0.00018778826838694812,
+      "loss": 0.2808,
+      "step": 1180
+    },
+    {
+      "epoch": 0.24420401854714066,
+      "grad_norm": 0.41181016625387107,
+      "learning_rate": 0.0001876154110136003,
+      "loss": 0.329,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2452344152498712,
+      "grad_norm": 0.3066529005132519,
+      "learning_rate": 0.0001874414195377063,
+      "loss": 0.2656,
+      "step": 1190
+    },
+    {
+      "epoch": 0.24626481195260175,
+      "grad_norm": 0.3794689895293103,
+      "learning_rate": 0.000187266296211429,
+      "loss": 0.3052,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2472952086553323,
+      "grad_norm": 0.4365846410577374,
+      "learning_rate": 0.0001870900433015821,
+      "loss": 0.3633,
+      "step": 1200
+    },
+    {
+      "epoch": 0.2472952086553323,
+      "eval_loss": 0.3063368499279022,
+      "eval_runtime": 2882.6283,
+      "eval_samples_per_second": 2.775,
+      "eval_steps_per_second": 0.347,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24832560535806286,
+      "grad_norm": 0.38508135542075006,
+      "learning_rate": 0.00018691266308960066,
+      "loss": 0.3035,
+      "step": 1205
+    },
+    {
+      "epoch": 0.24935600206079342,
+      "grad_norm": 0.3726705928374773,
+      "learning_rate": 0.00018673415787151166,
+      "loss": 0.3209,
+      "step": 1210
+    },
+    {
+      "epoch": 0.250386398763524,
+      "grad_norm": 0.2907676540641749,
+      "learning_rate": 0.00018655452995790435,
+      "loss": 0.2398,
+      "step": 1215
+    },
+    {
+      "epoch": 0.2514167954662545,
+      "grad_norm": 0.4457090162372571,
+      "learning_rate": 0.00018637378167390018,
+      "loss": 0.2955,
+      "step": 1220
+    },
+    {
+      "epoch": 0.25244719216898504,
+      "grad_norm": 0.4064739346307168,
+      "learning_rate": 0.0001861919153591228,
+      "loss": 0.3579,
+      "step": 1225
+    },
+    {
+      "epoch": 0.2534775888717156,
+      "grad_norm": 0.3439355157513527,
+      "learning_rate": 0.00018600893336766786,
+      "loss": 0.2909,
+      "step": 1230
+    },
+    {
+      "epoch": 0.25450798557444615,
+      "grad_norm": 0.30462368971059195,
+      "learning_rate": 0.00018582483806807228,
+      "loss": 0.3128,
+      "step": 1235
+    },
+    {
+      "epoch": 0.25553838227717673,
+      "grad_norm": 0.29050064091501104,
+      "learning_rate": 0.0001856396318432838,
+      "loss": 0.2849,
+      "step": 1240
+    },
+    {
+      "epoch": 0.25656877897990726,
+      "grad_norm": 0.30700131979933004,
+      "learning_rate": 0.0001854533170906302,
+      "loss": 0.3037,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2575991756826378,
+      "grad_norm": 0.38518853571022627,
+      "learning_rate": 0.00018526589622178802,
+      "loss": 0.3175,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2586295723853684,
+      "grad_norm": 0.38834107822360947,
+      "learning_rate": 0.00018507737166275154,
+      "loss": 0.2856,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2596599690880989,
+      "grad_norm": 0.3984475799653097,
+      "learning_rate": 0.00018488774585380125,
+      "loss": 0.3323,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2606903657908295,
+      "grad_norm": 0.2895823528396569,
+      "learning_rate": 0.00018469702124947245,
+      "loss": 0.2937,
+      "step": 1265
+    },
+    {
+      "epoch": 0.26172076249356,
+      "grad_norm": 0.2966226853911782,
+      "learning_rate": 0.00018450520031852325,
+      "loss": 0.3053,
+      "step": 1270
+    },
+    {
+      "epoch": 0.26275115919629055,
+      "grad_norm": 0.4177177396553655,
+      "learning_rate": 0.0001843122855439027,
+      "loss": 0.3241,
+      "step": 1275
+    },
+    {
+      "epoch": 0.26378155589902114,
+      "grad_norm": 0.3437444560725597,
+      "learning_rate": 0.00018411827942271884,
+      "loss": 0.2482,
+      "step": 1280
+    },
+    {
+      "epoch": 0.26481195260175167,
+      "grad_norm": 0.28500402984085127,
+      "learning_rate": 0.000183923184466206,
+      "loss": 0.3476,
+      "step": 1285
+    },
+    {
+      "epoch": 0.26584234930448225,
+      "grad_norm": 0.332018609909056,
+      "learning_rate": 0.0001837270031996926,
+      "loss": 0.2863,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2668727460072128,
+      "grad_norm": 0.5825427423902888,
+      "learning_rate": 0.00018352973816256838,
+      "loss": 0.3228,
+      "step": 1295
+    },
+    {
+      "epoch": 0.2679031427099433,
+      "grad_norm": 0.35407010241176123,
+      "learning_rate": 0.0001833313919082515,
+      "loss": 0.3699,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2689335394126739,
+      "grad_norm": 0.34044644075483865,
+      "learning_rate": 0.0001831319670041555,
+      "loss": 0.2764,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2699639361154044,
+      "grad_norm": 0.35384626006434766,
+      "learning_rate": 0.00018293146603165603,
+      "loss": 0.3338,
+      "step": 1310
+    },
+    {
+      "epoch": 0.270994332818135,
+      "grad_norm": 0.3524599113148914,
+      "learning_rate": 0.00018272989158605752,
+      "loss": 0.2699,
+      "step": 1315
+    },
+    {
+      "epoch": 0.27202472952086554,
+      "grad_norm": 0.3581600001494241,
+      "learning_rate": 0.00018252724627655954,
+      "loss": 0.2907,
+      "step": 1320
+    },
+    {
+      "epoch": 0.27305512622359607,
+      "grad_norm": 0.3715546739847767,
+      "learning_rate": 0.00018232353272622302,
+      "loss": 0.3091,
+      "step": 1325
+    },
+    {
+      "epoch": 0.27408552292632665,
+      "grad_norm": 0.3125684903994619,
+      "learning_rate": 0.00018211875357193632,
+      "loss": 0.2815,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2751159196290572,
+      "grad_norm": 0.31294105869026073,
+      "learning_rate": 0.00018191291146438105,
+      "loss": 0.3052,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2761463163317877,
+      "grad_norm": 0.3175018199701703,
+      "learning_rate": 0.0001817060090679978,
+      "loss": 0.3345,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2771767130345183,
+      "grad_norm": 0.2888029163944005,
+      "learning_rate": 0.00018149804906095163,
+      "loss": 0.2956,
+      "step": 1345
+    },
+    {
+      "epoch": 0.2782071097372488,
+      "grad_norm": 0.37092342339586304,
+      "learning_rate": 0.00018128903413509756,
+      "loss": 0.33,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2792375064399794,
+      "grad_norm": 0.37102895224368077,
+      "learning_rate": 0.0001810789669959453,
+      "loss": 0.2904,
+      "step": 1355
+    },
+    {
+      "epoch": 0.28026790314270994,
+      "grad_norm": 0.2968345721462541,
+      "learning_rate": 0.0001808678503626248,
+      "loss": 0.3297,
+      "step": 1360
+    },
+    {
+      "epoch": 0.28129829984544047,
+      "grad_norm": 0.33998169922164373,
+      "learning_rate": 0.00018065568696785058,
+      "loss": 0.2835,
+      "step": 1365
+    },
+    {
+      "epoch": 0.28232869654817105,
+      "grad_norm": 0.3725382909503799,
+      "learning_rate": 0.00018044247955788662,
+      "loss": 0.3328,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2833590932509016,
+      "grad_norm": 0.44691364364670483,
+      "learning_rate": 0.00018022823089251073,
+      "loss": 0.3633,
+      "step": 1375
+    },
+    {
+      "epoch": 0.28438948995363217,
+      "grad_norm": 0.36853190246642853,
+      "learning_rate": 0.00018001294374497882,
+      "loss": 0.2866,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2854198866563627,
+      "grad_norm": 0.3266190569238548,
+      "learning_rate": 0.00017979662090198906,
+      "loss": 0.3011,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2864502833590932,
+      "grad_norm": 0.3732859483546346,
+      "learning_rate": 0.00017957926516364565,
+      "loss": 0.3232,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2874806800618238,
+      "grad_norm": 0.32682252149256485,
+      "learning_rate": 0.00017936087934342283,
+      "loss": 0.3277,
+      "step": 1395
+    },
+    {
+      "epoch": 0.28851107676455434,
+      "grad_norm": 0.37717993812571365,
+      "learning_rate": 0.00017914146626812823,
+      "loss": 0.3078,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28851107676455434,
+      "eval_loss": 0.2999653220176697,
+      "eval_runtime": 2880.5087,
+      "eval_samples_per_second": 2.777,
+      "eval_steps_per_second": 0.347,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2895414734672849,
+      "grad_norm": 0.2987256318589488,
+      "learning_rate": 0.0001789210287778664,
+      "loss": 0.2856,
+      "step": 1405
+    },
+    {
+      "epoch": 0.29057187017001546,
+      "grad_norm": 0.3199265296581454,
+      "learning_rate": 0.00017869956972600202,
+      "loss": 0.3176,
+      "step": 1410
+    },
+    {
+      "epoch": 0.291602266872746,
+      "grad_norm": 0.3362846877462833,
+      "learning_rate": 0.00017847709197912296,
+      "loss": 0.2992,
+      "step": 1415
+    },
+    {
+      "epoch": 0.29263266357547657,
+      "grad_norm": 0.3185294578549888,
+      "learning_rate": 0.0001782535984170032,
+      "loss": 0.2937,
+      "step": 1420
+    },
+    {
+      "epoch": 0.2936630602782071,
+      "grad_norm": 0.3243854055339378,
+      "learning_rate": 0.00017802909193256547,
+      "loss": 0.3524,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2946934569809377,
+      "grad_norm": 0.33250072639136574,
+      "learning_rate": 0.00017780357543184397,
+      "loss": 0.2744,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2957238536836682,
+      "grad_norm": 0.34028811291554906,
+      "learning_rate": 0.00017757705183394653,
+      "loss": 0.3326,
+      "step": 1435
+    },
+    {
+      "epoch": 0.29675425038639874,
+      "grad_norm": 0.2753584287911616,
+      "learning_rate": 0.00017734952407101706,
+      "loss": 0.2782,
+      "step": 1440
+    },
+    {
+      "epoch": 0.29778464708912933,
+      "grad_norm": 0.4202404483488457,
+      "learning_rate": 0.0001771209950881974,
+      "loss": 0.3234,
+      "step": 1445
+    },
+    {
+      "epoch": 0.29881504379185986,
+      "grad_norm": 0.4350364742880956,
+      "learning_rate": 0.00017689146784358927,
+      "loss": 0.3117,
+      "step": 1450
+    },
+    {
+      "epoch": 0.29984544049459044,
+      "grad_norm": 0.31309620861911946,
+      "learning_rate": 0.0001766609453082161,
+      "loss": 0.2912,
+      "step": 1455
+    },
+    {
+      "epoch": 0.30087583719732097,
+      "grad_norm": 0.32354138707483654,
+      "learning_rate": 0.00017642943046598436,
+      "loss": 0.3373,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3019062339000515,
+      "grad_norm": 0.3515037763945781,
+      "learning_rate": 0.00017619692631364506,
+      "loss": 0.3193,
+      "step": 1465
+    },
+    {
+      "epoch": 0.3029366306027821,
+      "grad_norm": 0.3483804999160483,
+      "learning_rate": 0.00017596343586075497,
+      "loss": 0.329,
+      "step": 1470
+    },
+    {
+      "epoch": 0.3039670273055126,
+      "grad_norm": 0.39993094570722865,
+      "learning_rate": 0.00017572896212963754,
+      "loss": 0.3514,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3049974240082432,
+      "grad_norm": 0.36781639256435866,
+      "learning_rate": 0.000175493508155344,
+      "loss": 0.3263,
+      "step": 1480
+    },
+    {
+      "epoch": 0.30602782071097373,
+      "grad_norm": 0.29161988646871145,
+      "learning_rate": 0.00017525707698561385,
+      "loss": 0.3461,
+      "step": 1485
+    },
+    {
+      "epoch": 0.30705821741370426,
+      "grad_norm": 0.3154471225790784,
+      "learning_rate": 0.00017501967168083557,
+      "loss": 0.2986,
+      "step": 1490
+    },
+    {
+      "epoch": 0.30808861411643484,
+      "grad_norm": 0.34492695895689485,
+      "learning_rate": 0.00017478129531400688,
+      "loss": 0.3079,
+      "step": 1495
+    },
+    {
+      "epoch": 0.3091190108191654,
+      "grad_norm": 0.4725349308354448,
+      "learning_rate": 0.00017454195097069505,
+      "loss": 0.3115,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3101494075218959,
+      "grad_norm": 0.3436076430515735,
+      "learning_rate": 0.00017430164174899696,
+      "loss": 0.2844,
+      "step": 1505
+    },
+    {
+      "epoch": 0.3111798042246265,
+      "grad_norm": 0.3199620943087237,
+      "learning_rate": 0.0001740603707594989,
+      "loss": 0.3363,
+      "step": 1510
+    },
+    {
+      "epoch": 0.312210200927357,
+      "grad_norm": 0.3294543675032788,
+      "learning_rate": 0.00017381814112523648,
+      "loss": 0.2831,
+      "step": 1515
+    },
+    {
+      "epoch": 0.3132405976300876,
+      "grad_norm": 0.4442315813048831,
+      "learning_rate": 0.000173574955981654,
+      "loss": 0.3239,
+      "step": 1520
+    },
+    {
+      "epoch": 0.31427099433281813,
+      "grad_norm": 0.3315063178638844,
+      "learning_rate": 0.00017333081847656397,
+      "loss": 0.3282,
+      "step": 1525
+    },
+    {
+      "epoch": 0.31530139103554866,
+      "grad_norm": 0.38910409480102465,
+      "learning_rate": 0.00017308573177010652,
+      "loss": 0.3091,
+      "step": 1530
+    },
+    {
+      "epoch": 0.31633178773827925,
+      "grad_norm": 0.3610118589070733,
+      "learning_rate": 0.00017283969903470815,
+      "loss": 0.3047,
+      "step": 1535
+    },
+    {
+      "epoch": 0.3173621844410098,
+      "grad_norm": 0.3342784924557649,
+      "learning_rate": 0.0001725927234550409,
+      "loss": 0.2829,
+      "step": 1540
+    },
+    {
+      "epoch": 0.31839258114374036,
+      "grad_norm": 0.3065893071381057,
+      "learning_rate": 0.00017234480822798113,
+      "loss": 0.327,
+      "step": 1545
+    },
+    {
+      "epoch": 0.3194229778464709,
+      "grad_norm": 0.3678298005788044,
+      "learning_rate": 0.00017209595656256807,
+      "loss": 0.3401,
+      "step": 1550
+    },
+    {
+      "epoch": 0.3204533745492014,
+      "grad_norm": 0.333367656104773,
+      "learning_rate": 0.0001718461716799623,
+      "loss": 0.2857,
+      "step": 1555
+    },
+    {
+      "epoch": 0.321483771251932,
+      "grad_norm": 0.37925023182652434,
+      "learning_rate": 0.000171595456813404,
+      "loss": 0.3491,
+      "step": 1560
+    },
+    {
+      "epoch": 0.32251416795466253,
+      "grad_norm": 0.3765681871737522,
+      "learning_rate": 0.00017134381520817127,
+      "loss": 0.3001,
+      "step": 1565
+    },
+    {
+      "epoch": 0.3235445646573931,
+      "grad_norm": 0.29672578991036613,
+      "learning_rate": 0.00017109125012153783,
+      "loss": 0.2959,
+      "step": 1570
+    },
+    {
+      "epoch": 0.32457496136012365,
+      "grad_norm": 0.42011792115497876,
+      "learning_rate": 0.00017083776482273126,
+      "loss": 0.3532,
+      "step": 1575
+    },
+    {
+      "epoch": 0.3256053580628542,
+      "grad_norm": 0.4222336181716507,
+      "learning_rate": 0.00017058336259289026,
+      "loss": 0.247,
+      "step": 1580
+    },
+    {
+      "epoch": 0.32663575476558476,
+      "grad_norm": 0.31097703592676695,
+      "learning_rate": 0.0001703280467250225,
+      "loss": 0.3212,
+      "step": 1585
+    },
+    {
+      "epoch": 0.3276661514683153,
+      "grad_norm": 0.2963584098165209,
+      "learning_rate": 0.0001700718205239618,
+      "loss": 0.285,
+      "step": 1590
+    },
+    {
+      "epoch": 0.3286965481710459,
+      "grad_norm": 0.27473277659713885,
+      "learning_rate": 0.0001698146873063255,
+      "loss": 0.2852,
+      "step": 1595
+    },
+    {
+      "epoch": 0.3297269448737764,
+      "grad_norm": 0.41875809452677054,
+      "learning_rate": 0.00016955665040047134,
+      "loss": 0.3274,
+      "step": 1600
+    },
+    {
+      "epoch": 0.3297269448737764,
+      "eval_loss": 0.2948421239852905,
+      "eval_runtime": 2881.5407,
+      "eval_samples_per_second": 2.776,
+      "eval_steps_per_second": 0.347,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33075734157650694,
+      "grad_norm": 0.3202391256850226,
+      "learning_rate": 0.00016929771314645454,
+      "loss": 0.2402,
+      "step": 1605
+    },
+    {
+      "epoch": 0.3317877382792375,
+      "grad_norm": 0.3612689826913437,
+      "learning_rate": 0.00016903787889598458,
+      "loss": 0.3088,
+      "step": 1610
+    },
+    {
+      "epoch": 0.33281813498196805,
+      "grad_norm": 0.31051336019913206,
+      "learning_rate": 0.00016877715101238172,
+      "loss": 0.3202,
+      "step": 1615
+    },
+    {
+      "epoch": 0.33384853168469864,
+      "grad_norm": 0.3462332312673768,
+      "learning_rate": 0.00016851553287053342,
+      "loss": 0.3489,
+      "step": 1620
+    },
+    {
+      "epoch": 0.33487892838742916,
+      "grad_norm": 0.4197430450739659,
+      "learning_rate": 0.00016825302785685077,
+      "loss": 0.3453,
+      "step": 1625
+    },
+    {
+      "epoch": 0.3359093250901597,
+      "grad_norm": 0.355766620140618,
+      "learning_rate": 0.00016798963936922467,
+      "loss": 0.3032,
+      "step": 1630
+    },
+    {
+      "epoch": 0.3369397217928903,
+      "grad_norm": 0.452678294800493,
+      "learning_rate": 0.00016772537081698175,
+      "loss": 0.3333,
+      "step": 1635
+    },
+    {
+      "epoch": 0.3379701184956208,
+      "grad_norm": 0.38423566249718305,
+      "learning_rate": 0.00016746022562084026,
+      "loss": 0.2674,
+      "step": 1640
+    },
+    {
+      "epoch": 0.33900051519835134,
+      "grad_norm": 0.2811598903049604,
+      "learning_rate": 0.0001671942072128659,
+      "loss": 0.3195,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3400309119010819,
+      "grad_norm": 0.3585975448009611,
+      "learning_rate": 0.00016692731903642725,
+      "loss": 0.3445,
+      "step": 1650
+    },
+    {
+      "epoch": 0.34106130860381245,
+      "grad_norm": 0.3724482026933407,
+      "learning_rate": 0.0001666595645461512,
+      "loss": 0.3199,
+      "step": 1655
+    },
+    {
+      "epoch": 0.34209170530654304,
+      "grad_norm": 0.3867823350912187,
+      "learning_rate": 0.0001663909472078784,
+      "loss": 0.3194,
+      "step": 1660
+    },
+    {
+      "epoch": 0.34312210200927357,
+      "grad_norm": 0.33254169042512904,
+      "learning_rate": 0.0001661214704986182,
+      "loss": 0.3144,
+      "step": 1665
+    },
+    {
+      "epoch": 0.3441524987120041,
+      "grad_norm": 0.3478869105292329,
+      "learning_rate": 0.00016585113790650388,
+      "loss": 0.3242,
+      "step": 1670
+    },
+    {
+      "epoch": 0.3451828954147347,
+      "grad_norm": 0.34064698003921173,
+      "learning_rate": 0.00016557995293074715,
+      "loss": 0.3102,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3462132921174652,
+      "grad_norm": 0.4527188823274613,
+      "learning_rate": 0.00016530791908159323,
+      "loss": 0.2957,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3472436888201958,
+      "grad_norm": 0.3309454383786622,
+      "learning_rate": 0.0001650350398802751,
+      "loss": 0.3244,
+      "step": 1685
+    },
+    {
+      "epoch": 0.3482740855229263,
+      "grad_norm": 0.3589818975912585,
+      "learning_rate": 0.0001647613188589682,
+      "loss": 0.2871,
+      "step": 1690
+    },
+    {
+      "epoch": 0.34930448222565685,
+      "grad_norm": 0.369801877686259,
+      "learning_rate": 0.00016448675956074444,
+      "loss": 0.2991,
+      "step": 1695
+    },
+    {
+      "epoch": 0.35033487892838744,
+      "grad_norm": 0.3664277823673914,
+      "learning_rate": 0.0001642113655395266,
+      "loss": 0.3374,
+      "step": 1700
+    },
+    {
+      "epoch": 0.35136527563111797,
+      "grad_norm": 0.30826498189575957,
+      "learning_rate": 0.00016393514036004204,
+      "loss": 0.2722,
+      "step": 1705
+    },
+    {
+      "epoch": 0.35239567233384855,
+      "grad_norm": 0.2991931556332237,
+      "learning_rate": 0.0001636580875977769,
+      "loss": 0.317,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3534260690365791,
+      "grad_norm": 0.3253832462040809,
+      "learning_rate": 0.0001633802108389295,
+      "loss": 0.3,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3544564657393096,
+      "grad_norm": 0.37843130847255274,
+      "learning_rate": 0.00016310151368036408,
+      "loss": 0.3036,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3554868624420402,
+      "grad_norm": 0.3399737695505088,
+      "learning_rate": 0.00016282199972956425,
+      "loss": 0.2905,
+      "step": 1725
+    },
+    {
+      "epoch": 0.3565172591447707,
+      "grad_norm": 0.32815030354752983,
+      "learning_rate": 0.00016254167260458622,
+      "loss": 0.3056,
+      "step": 1730
+    },
+    {
+      "epoch": 0.3575476558475013,
+      "grad_norm": 0.3322854407295662,
+      "learning_rate": 0.000162260535934012,
+      "loss": 0.2974,
+      "step": 1735
+    },
+    {
+      "epoch": 0.35857805255023184,
+      "grad_norm": 0.3141453045464179,
+      "learning_rate": 0.00016197859335690247,
+      "loss": 0.289,
+      "step": 1740
+    },
+    {
+      "epoch": 0.35960844925296237,
+      "grad_norm": 0.32100160310926246,
+      "learning_rate": 0.0001616958485227503,
+      "loss": 0.3173,
+      "step": 1745
+    },
+    {
+      "epoch": 0.36063884595569295,
+      "grad_norm": 0.39886105757863777,
+      "learning_rate": 0.0001614123050914325,
+      "loss": 0.3074,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3616692426584235,
+      "grad_norm": 0.3604472382794737,
+      "learning_rate": 0.00016112796673316336,
+      "loss": 0.2689,
+      "step": 1755
+    },
+    {
+      "epoch": 0.36269963936115407,
+      "grad_norm": 0.2784618832581671,
+      "learning_rate": 0.00016084283712844666,
+      "loss": 0.3148,
+      "step": 1760
+    },
+    {
+      "epoch": 0.3637300360638846,
+      "grad_norm": 0.3021780644976563,
+      "learning_rate": 0.00016055691996802823,
+      "loss": 0.2802,
+      "step": 1765
+    },
+    {
+      "epoch": 0.36476043276661513,
+      "grad_norm": 0.34664455247499926,
+      "learning_rate": 0.00016027021895284808,
+      "loss": 0.3214,
+      "step": 1770
+    },
+    {
+      "epoch": 0.3657908294693457,
+      "grad_norm": 0.3169725333496023,
+      "learning_rate": 0.0001599827377939925,
+      "loss": 0.2969,
+      "step": 1775
+    },
+    {
+      "epoch": 0.36682122617207624,
+      "grad_norm": 0.3171890414737335,
+      "learning_rate": 0.00015969448021264606,
+      "loss": 0.2762,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3678516228748068,
+      "grad_norm": 0.32861761572712384,
+      "learning_rate": 0.00015940544994004334,
+      "loss": 0.3453,
+      "step": 1785
+    },
+    {
+      "epoch": 0.36888201957753736,
+      "grad_norm": 0.31257042261002754,
+      "learning_rate": 0.00015911565071742088,
+      "loss": 0.3054,
+      "step": 1790
+    },
+    {
+      "epoch": 0.3699124162802679,
+      "grad_norm": 0.3241031150716069,
+      "learning_rate": 0.00015882508629596836,
+      "loss": 0.3076,
+      "step": 1795
+    },
+    {
+      "epoch": 0.37094281298299847,
+      "grad_norm": 0.39403630552828167,
+      "learning_rate": 0.00015853376043678053,
+      "loss": 0.3474,
+      "step": 1800
+    },
+    {
+      "epoch": 0.37094281298299847,
+      "eval_loss": 0.2924995422363281,
+      "eval_runtime": 2880.2214,
+      "eval_samples_per_second": 2.778,
+      "eval_steps_per_second": 0.347,
+      "step": 1800
+    },
+    {
+      "epoch": 0.371973209685729,
+      "grad_norm": 0.2832222312603273,
+      "learning_rate": 0.00015824167691080802,
+      "loss": 0.2464,
+      "step": 1805
+    },
+    {
+      "epoch": 0.37300360638845953,
+      "grad_norm": 0.34854474448900724,
+      "learning_rate": 0.00015794883949880894,
+      "loss": 0.3153,
+      "step": 1810
+    },
+    {
+      "epoch": 0.3740340030911901,
+      "grad_norm": 0.411995708678398,
+      "learning_rate": 0.00015765525199129966,
+      "loss": 0.3075,
+      "step": 1815
+    },
+    {
+      "epoch": 0.37506439979392064,
+      "grad_norm": 0.35807431649917565,
+      "learning_rate": 0.000157360918188506,
+      "loss": 0.3233,
+      "step": 1820
+    },
+    {
+      "epoch": 0.37609479649665123,
+      "grad_norm": 0.39599363094279105,
+      "learning_rate": 0.0001570658419003137,
+      "loss": 0.3298,
+      "step": 1825
+    },
+    {
+      "epoch": 0.37712519319938176,
+      "grad_norm": 0.38707229731214027,
+      "learning_rate": 0.00015677002694621948,
+      "loss": 0.269,
+      "step": 1830
+    },
+    {
+      "epoch": 0.3781555899021123,
+      "grad_norm": 0.32749605968678447,
+      "learning_rate": 0.00015647347715528137,
+      "loss": 0.3334,
+      "step": 1835
+    },
+    {
+      "epoch": 0.3791859866048429,
+      "grad_norm": 0.35761830471268774,
+      "learning_rate": 0.00015617619636606924,
+      "loss": 0.2933,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3802163833075734,
+      "grad_norm": 0.35967453920717335,
+      "learning_rate": 0.00015587818842661494,
+      "loss": 0.2949,
+      "step": 1845
+    },
+    {
+      "epoch": 0.381246780010304,
+      "grad_norm": 0.3143527248467094,
+      "learning_rate": 0.00015557945719436278,
+      "loss": 0.329,
+      "step": 1850
+    },
+    {
+      "epoch": 0.3822771767130345,
+      "grad_norm": 0.372991583134114,
+      "learning_rate": 0.00015528000653611935,
+      "loss": 0.3037,
+      "step": 1855
+    },
+    {
+      "epoch": 0.38330757341576505,
+      "grad_norm": 0.35004314943597253,
+      "learning_rate": 0.0001549798403280036,
+      "loss": 0.3377,
+      "step": 1860
+    },
+    {
+      "epoch": 0.38433797011849563,
+      "grad_norm": 0.3231196426185917,
+      "learning_rate": 0.0001546789624553966,
+      "loss": 0.3208,
+      "step": 1865
+    },
+    {
+      "epoch": 0.38536836682122616,
+      "grad_norm": 0.35488369318279184,
+      "learning_rate": 0.00015437737681289128,
+      "loss": 0.296,
+      "step": 1870
+    },
+    {
+      "epoch": 0.38639876352395675,
+      "grad_norm": 0.3718463756429486,
+      "learning_rate": 0.00015407508730424206,
+      "loss": 0.3374,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3874291602266873,
+      "grad_norm": 0.2802376403000415,
+      "learning_rate": 0.00015377209784231424,
+      "loss": 0.2588,
+      "step": 1880
+    },
+    {
+      "epoch": 0.3884595569294178,
+      "grad_norm": 0.3799105689825581,
+      "learning_rate": 0.00015346841234903337,
+      "loss": 0.3026,
+      "step": 1885
+    },
+    {
+      "epoch": 0.3894899536321484,
+      "grad_norm": 0.3296377663164442,
+      "learning_rate": 0.0001531640347553345,
+      "loss": 0.2751,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3905203503348789,
+      "grad_norm": 0.36279328661672916,
+      "learning_rate": 0.00015285896900111133,
+      "loss": 0.3242,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3915507470376095,
+      "grad_norm": 0.3552709404621765,
+      "learning_rate": 0.00015255321903516516,
+      "loss": 0.3264,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39258114374034003,
+      "grad_norm": 0.41641063282756574,
+      "learning_rate": 0.00015224678881515377,
+      "loss": 0.2637,
+      "step": 1905
+    },
+    {
+      "epoch": 0.39361154044307056,
+      "grad_norm": 0.3878387710201773,
+      "learning_rate": 0.00015193968230754024,
+      "loss": 0.3148,
+      "step": 1910
+    },
+    {
+      "epoch": 0.39464193714580115,
+      "grad_norm": 0.3421084341160711,
+      "learning_rate": 0.00015163190348754162,
+      "loss": 0.2826,
+      "step": 1915
+    },
+    {
+      "epoch": 0.3956723338485317,
+      "grad_norm": 0.2898537738332495,
+      "learning_rate": 0.00015132345633907734,
+      "loss": 0.3069,
+      "step": 1920
+    },
+    {
+      "epoch": 0.39670273055126226,
+      "grad_norm": 0.3594731425839707,
+      "learning_rate": 0.0001510143448547178,
+      "loss": 0.347,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3977331272539928,
+      "grad_norm": 0.33029549881450443,
+      "learning_rate": 0.00015070457303563268,
+      "loss": 0.3231,
+      "step": 1930
+    },
+    {
+      "epoch": 0.3987635239567233,
+      "grad_norm": 0.40244939641132027,
+      "learning_rate": 0.000150394144891539,
+      "loss": 0.3372,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3997939206594539,
+      "grad_norm": 0.35291733189988017,
+      "learning_rate": 0.00015008306444064942,
+      "loss": 0.2737,
+      "step": 1940
+    },
+    {
+      "epoch": 0.40082431736218443,
+      "grad_norm": 0.32337967389685746,
+      "learning_rate": 0.00014977133570961997,
+      "loss": 0.2988,
+      "step": 1945
+    },
+    {
+      "epoch": 0.401854714064915,
+      "grad_norm": 0.36198496665746605,
+      "learning_rate": 0.00014945896273349827,
+      "loss": 0.3199,
+      "step": 1950
+    },
+    {
+      "epoch": 0.40288511076764555,
+      "grad_norm": 0.2982085727716395,
+      "learning_rate": 0.00014914594955567099,
+      "loss": 0.2734,
+      "step": 1955
+    },
+    {
+      "epoch": 0.4039155074703761,
+      "grad_norm": 0.35550939579352364,
+      "learning_rate": 0.00014883230022781163,
+      "loss": 0.3455,
+      "step": 1960
+    },
+    {
+      "epoch": 0.40494590417310666,
+      "grad_norm": 0.3483385027728902,
+      "learning_rate": 0.00014851801880982814,
+      "loss": 0.2716,
+      "step": 1965
+    },
+    {
+      "epoch": 0.4059763008758372,
+      "grad_norm": 0.348108964846049,
+      "learning_rate": 0.00014820310936981026,
+      "loss": 0.2873,
+      "step": 1970
+    },
+    {
+      "epoch": 0.4070066975785677,
+      "grad_norm": 0.3953405122632469,
+      "learning_rate": 0.000147887575983977,
+      "loss": 0.3316,
+      "step": 1975
+    },
+    {
+      "epoch": 0.4080370942812983,
+      "grad_norm": 0.27428554427495483,
+      "learning_rate": 0.00014757142273662358,
+      "loss": 0.2857,
+      "step": 1980
+    },
+    {
+      "epoch": 0.40906749098402884,
+      "grad_norm": 0.3584297752000547,
+      "learning_rate": 0.00014725465372006905,
+      "loss": 0.3355,
+      "step": 1985
+    },
+    {
+      "epoch": 0.4100978876867594,
+      "grad_norm": 0.2972999446567122,
+      "learning_rate": 0.0001469372730346028,
+      "loss": 0.2656,
+      "step": 1990
+    },
+    {
+      "epoch": 0.41112828438948995,
+      "grad_norm": 0.3627876088336718,
+      "learning_rate": 0.00014661928478843186,
+      "loss": 0.3345,
+      "step": 1995
+    },
+    {
+      "epoch": 0.4121586810922205,
+      "grad_norm": 0.3495779600410771,
+      "learning_rate": 0.00014630069309762753,
+      "loss": 0.3401,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4121586810922205,
+      "eval_loss": 0.2875462770462036,
+      "eval_runtime": 2879.4184,
+      "eval_samples_per_second": 2.778,
+      "eval_steps_per_second": 0.347,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41318907779495107,
+      "grad_norm": 0.3391439590455538,
+      "learning_rate": 0.00014598150208607212,
+      "loss": 0.2705,
+      "step": 2005
+    },
+    {
+      "epoch": 0.4142194744976816,
+      "grad_norm": 0.29356233596458986,
+      "learning_rate": 0.00014566171588540572,
+      "loss": 0.3306,
+      "step": 2010
+    },
+    {
+      "epoch": 0.4152498712004122,
+      "grad_norm": 0.33751989432866675,
+      "learning_rate": 0.00014534133863497252,
+      "loss": 0.2923,
+      "step": 2015
+    },
+    {
+      "epoch": 0.4162802679031427,
+      "grad_norm": 0.32503045452079943,
+      "learning_rate": 0.00014502037448176734,
+      "loss": 0.2681,
+      "step": 2020
+    },
+    {
+      "epoch": 0.41731066460587324,
+      "grad_norm": 0.4256762276514426,
+      "learning_rate": 0.00014469882758038193,
+      "loss": 0.3142,
+      "step": 2025
+    },
+    {
+      "epoch": 0.4183410613086038,
+      "grad_norm": 0.31736929564499033,
+      "learning_rate": 0.00014437670209295112,
+      "loss": 0.281,
+      "step": 2030
+    },
+    {
+      "epoch": 0.41937145801133435,
+      "grad_norm": 0.289199714492364,
+      "learning_rate": 0.0001440540021890992,
+      "loss": 0.3127,
+      "step": 2035
+    },
+    {
+      "epoch": 0.42040185471406494,
+      "grad_norm": 0.32970220123556293,
+      "learning_rate": 0.00014373073204588556,
+      "loss": 0.2875,
+      "step": 2040
+    },
+    {
+      "epoch": 0.42143225141679547,
+      "grad_norm": 0.32861612982909,
+      "learning_rate": 0.0001434068958477509,
+      "loss": 0.3219,
+      "step": 2045
+    },
+    {
+      "epoch": 0.422462648119526,
+      "grad_norm": 0.4097507208513791,
+      "learning_rate": 0.00014308249778646306,
+      "loss": 0.3185,
+      "step": 2050
+    },
+    {
+      "epoch": 0.4234930448222566,
+      "grad_norm": 0.3696930062252253,
+      "learning_rate": 0.0001427575420610626,
+      "loss": 0.2635,
+      "step": 2055
+    },
+    {
+      "epoch": 0.4245234415249871,
+      "grad_norm": 0.3440933088510053,
+      "learning_rate": 0.00014243203287780856,
+      "loss": 0.322,
+      "step": 2060
+    },
+    {
+      "epoch": 0.4255538382277177,
+      "grad_norm": 0.3639003692887089,
+      "learning_rate": 0.00014210597445012398,
+      "loss": 0.2911,
+      "step": 2065
+    },
+    {
+      "epoch": 0.4265842349304482,
+      "grad_norm": 0.3552632151370132,
+      "learning_rate": 0.0001417793709985415,
+      "loss": 0.3233,
+      "step": 2070
+    },
+    {
+      "epoch": 0.42761463163317875,
+      "grad_norm": 0.385489568046544,
+      "learning_rate": 0.0001414522267506484,
+      "loss": 0.348,
+      "step": 2075
+    },
+    {
+      "epoch": 0.42864502833590934,
+      "grad_norm": 0.34307097509378426,
+      "learning_rate": 0.0001411245459410322,
+      "loss": 0.2649,
+      "step": 2080
+    },
+    {
+      "epoch": 0.42967542503863987,
+      "grad_norm": 0.3018369278756531,
+      "learning_rate": 0.00014079633281122573,
+      "loss": 0.3082,
+      "step": 2085
+    },
+    {
+      "epoch": 0.43070582174137045,
+      "grad_norm": 0.286861468385612,
+      "learning_rate": 0.00014046759160965224,
+      "loss": 0.2935,
+      "step": 2090
+    },
+    {
+      "epoch": 0.431736218444101,
+      "grad_norm": 0.3506128021819822,
+      "learning_rate": 0.0001401383265915703,
+      "loss": 0.303,
+      "step": 2095
+    },
+    {
+      "epoch": 0.4327666151468315,
+      "grad_norm": 0.40503992585942733,
+      "learning_rate": 0.00013980854201901886,
+      "loss": 0.3254,
+      "step": 2100
+    },
+    {
+      "epoch": 0.4337970118495621,
+      "grad_norm": 0.3697571032927392,
+      "learning_rate": 0.00013947824216076207,
+      "loss": 0.2567,
+      "step": 2105
+    },
+    {
+      "epoch": 0.4348274085522926,
+      "grad_norm": 0.399061699010693,
+      "learning_rate": 0.00013914743129223405,
+      "loss": 0.3413,
+      "step": 2110
+    },
+    {
+      "epoch": 0.43585780525502316,
+      "grad_norm": 0.32157629254065767,
+      "learning_rate": 0.00013881611369548325,
+      "loss": 0.2802,
+      "step": 2115
+    },
+    {
+      "epoch": 0.43688820195775374,
+      "grad_norm": 0.3567204984816549,
+      "learning_rate": 0.00013848429365911753,
+      "loss": 0.3155,
+      "step": 2120
+    },
+    {
+      "epoch": 0.43791859866048427,
+      "grad_norm": 0.30654912781148985,
+      "learning_rate": 0.00013815197547824824,
+      "loss": 0.3,
+      "step": 2125
+    },
+    {
+      "epoch": 0.43894899536321486,
+      "grad_norm": 0.3866305793168129,
+      "learning_rate": 0.00013781916345443474,
+      "loss": 0.285,
+      "step": 2130
+    },
+    {
+      "epoch": 0.4399793920659454,
+      "grad_norm": 0.38963039998932314,
+      "learning_rate": 0.00013748586189562878,
+      "loss": 0.3271,
+      "step": 2135
+    },
+    {
+      "epoch": 0.4410097887686759,
+      "grad_norm": 0.2944680903796273,
+      "learning_rate": 0.00013715207511611876,
+      "loss": 0.2796,
+      "step": 2140
+    },
+    {
+      "epoch": 0.4420401854714065,
+      "grad_norm": 0.3287140509685993,
+      "learning_rate": 0.0001368178074364737,
+      "loss": 0.2794,
+      "step": 2145
+    },
+    {
+      "epoch": 0.44307058217413703,
+      "grad_norm": 0.3497708699847718,
+      "learning_rate": 0.00013648306318348762,
+      "loss": 0.348,
+      "step": 2150
+    },
+    {
+      "epoch": 0.4441009788768676,
+      "grad_norm": 0.3577124630797181,
+      "learning_rate": 0.0001361478466901231,
+      "loss": 0.2638,
+      "step": 2155
+    },
+    {
+      "epoch": 0.44513137557959814,
+      "grad_norm": 0.3160258522774726,
+      "learning_rate": 0.0001358121622954557,
+      "loss": 0.305,
+      "step": 2160
+    },
+    {
+      "epoch": 0.4461617722823287,
+      "grad_norm": 0.3245915685090122,
+      "learning_rate": 0.00013547601434461733,
+      "loss": 0.2794,
+      "step": 2165
+    },
+    {
+      "epoch": 0.44719216898505926,
+      "grad_norm": 0.3081359389684238,
+      "learning_rate": 0.0001351394071887404,
+      "loss": 0.3119,
+      "step": 2170
+    },
+    {
+      "epoch": 0.4482225656877898,
+      "grad_norm": 0.3562399653740551,
+      "learning_rate": 0.0001348023451849012,
+      "loss": 0.3133,
+      "step": 2175
+    },
+    {
+      "epoch": 0.44925296239052037,
+      "grad_norm": 0.33659092073928976,
+      "learning_rate": 0.00013446483269606362,
+      "loss": 0.2584,
+      "step": 2180
+    },
+    {
+      "epoch": 0.4502833590932509,
+      "grad_norm": 0.3486043316585054,
+      "learning_rate": 0.00013412687409102277,
+      "loss": 0.3073,
+      "step": 2185
+    },
+    {
+      "epoch": 0.45131375579598143,
+      "grad_norm": 0.3184771967280283,
+      "learning_rate": 0.00013378847374434814,
+      "loss": 0.2868,
+      "step": 2190
+    },
+    {
+      "epoch": 0.452344152498712,
+      "grad_norm": 0.32065577051849475,
+      "learning_rate": 0.0001334496360363274,
+      "loss": 0.2804,
+      "step": 2195
+    },
+    {
+      "epoch": 0.45337454920144254,
+      "grad_norm": 0.3597968288617642,
+      "learning_rate": 0.00013311036535290925,
+      "loss": 0.3124,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45337454920144254,
+      "eval_loss": 0.2839260995388031,
+      "eval_runtime": 2877.4926,
+      "eval_samples_per_second": 2.78,
+      "eval_steps_per_second": 0.348,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45440494590417313,
+      "grad_norm": 0.3642061316908434,
+      "learning_rate": 0.00013277066608564713,
+      "loss": 0.2602,
+      "step": 2205
+    },
+    {
+      "epoch": 0.45543534260690366,
+      "grad_norm": 0.4070129402915311,
+      "learning_rate": 0.0001324305426316418,
+      "loss": 0.3135,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4564657393096342,
+      "grad_norm": 0.28705492007355193,
+      "learning_rate": 0.000132089999393485,
+      "loss": 0.2955,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4574961360123648,
+      "grad_norm": 0.4192774207761234,
+      "learning_rate": 0.000131749040779202,
+      "loss": 0.3107,
+      "step": 2220
+    },
+    {
+      "epoch": 0.4585265327150953,
+      "grad_norm": 0.35888440532939486,
+      "learning_rate": 0.0001314076712021949,
+      "loss": 0.3141,
+      "step": 2225
+    },
+    {
+      "epoch": 0.4595569294178259,
+      "grad_norm": 0.36636355983534064,
+      "learning_rate": 0.0001310658950811852,
+      "loss": 0.2645,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4605873261205564,
+      "grad_norm": 0.2950156999856839,
+      "learning_rate": 0.00013072371684015688,
+      "loss": 0.2863,
+      "step": 2235
+    },
+    {
+      "epoch": 0.46161772282328695,
+      "grad_norm": 0.3006839955052212,
+      "learning_rate": 0.00013038114090829892,
+      "loss": 0.2907,
+      "step": 2240
+    },
+    {
+      "epoch": 0.46264811952601753,
+      "grad_norm": 0.4458786907297178,
+      "learning_rate": 0.00013003817171994807,
+      "loss": 0.3088,
+      "step": 2245
+    },
+    {
+      "epoch": 0.46367851622874806,
+      "grad_norm": 0.45350359238505367,
+      "learning_rate": 0.00012969481371453135,
+      "loss": 0.3292,
+      "step": 2250
+    },
+    {
+      "epoch": 0.46470891293147865,
+      "grad_norm": 0.32874718511199297,
+      "learning_rate": 0.00012935107133650885,
+      "loss": 0.26,
+      "step": 2255
+    },
+    {
+      "epoch": 0.4657393096342092,
+      "grad_norm": 0.3640981247486453,
+      "learning_rate": 0.00012900694903531586,
+      "loss": 0.3098,
+      "step": 2260
+    },
+    {
+      "epoch": 0.4667697063369397,
+      "grad_norm": 0.3753366193823528,
+      "learning_rate": 0.0001286624512653055,
+      "loss": 0.2864,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4678001030396703,
+      "grad_norm": 0.3932136104409213,
+      "learning_rate": 0.00012831758248569097,
+      "loss": 0.3311,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4688304997424008,
+      "grad_norm": 0.40583272633582196,
+      "learning_rate": 0.00012797234716048784,
+      "loss": 0.2953,
+      "step": 2275
+    },
+    {
+      "epoch": 0.46986089644513135,
+      "grad_norm": 0.3932055476008604,
+      "learning_rate": 0.00012762674975845637,
+      "loss": 0.2474,
+      "step": 2280
+    },
+    {
+      "epoch": 0.47089129314786193,
+      "grad_norm": 0.30115467677227103,
+      "learning_rate": 0.00012728079475304345,
+      "loss": 0.3102,
+      "step": 2285
+    },
+    {
+      "epoch": 0.47192168985059246,
+      "grad_norm": 0.3161465840153421,
+      "learning_rate": 0.0001269344866223249,
+      "loss": 0.27,
+      "step": 2290
+    },
+    {
+      "epoch": 0.47295208655332305,
+      "grad_norm": 0.2984326355380522,
+      "learning_rate": 0.00012658782984894743,
+      "loss": 0.2889,
+      "step": 2295
+    },
+    {
+      "epoch": 0.4739824832560536,
+      "grad_norm": 0.42593663924882696,
+      "learning_rate": 0.00012624082892007064,
+      "loss": 0.3334,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4750128799587841,
+      "grad_norm": 0.33769395333885094,
+      "learning_rate": 0.00012589348832730882,
+      "loss": 0.2998,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4760432766615147,
+      "grad_norm": 0.303923325646938,
+      "learning_rate": 0.00012554581256667296,
+      "loss": 0.3035,
+      "step": 2310
+    },
+    {
+      "epoch": 0.4770736733642452,
+      "grad_norm": 0.2858575579612513,
+      "learning_rate": 0.00012519780613851254,
+      "loss": 0.2485,
+      "step": 2315
+    },
+    {
+      "epoch": 0.4781040700669758,
+      "grad_norm": 0.35956368876613565,
+      "learning_rate": 0.00012484947354745714,
+      "loss": 0.3149,
+      "step": 2320
+    },
+    {
+      "epoch": 0.47913446676970634,
+      "grad_norm": 0.37980409546574345,
+      "learning_rate": 0.0001245008193023583,
+      "loss": 0.3311,
+      "step": 2325
+    },
+    {
+      "epoch": 0.48016486347243686,
+      "grad_norm": 0.33696103332976024,
+      "learning_rate": 0.00012415184791623101,
+      "loss": 0.2588,
+      "step": 2330
+    },
+    {
+      "epoch": 0.48119526017516745,
+      "grad_norm": 0.29566476069771774,
+      "learning_rate": 0.00012380256390619548,
+      "loss": 0.3085,
+      "step": 2335
+    },
+    {
+      "epoch": 0.482225656877898,
+      "grad_norm": 0.33290532311027576,
+      "learning_rate": 0.00012345297179341844,
+      "loss": 0.2796,
+      "step": 2340
+    },
+    {
+      "epoch": 0.48325605358062856,
+      "grad_norm": 0.3250440560893066,
+      "learning_rate": 0.00012310307610305477,
+      "loss": 0.3314,
+      "step": 2345
+    },
+    {
+      "epoch": 0.4842864502833591,
+      "grad_norm": 0.4026792239398559,
+      "learning_rate": 0.00012275288136418889,
+      "loss": 0.3271,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4853168469860896,
+      "grad_norm": 0.34006109198287215,
+      "learning_rate": 0.0001224023921097762,
+      "loss": 0.2546,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4863472436888202,
+      "grad_norm": 0.34339764904284936,
+      "learning_rate": 0.0001220516128765842,
+      "loss": 0.2988,
+      "step": 2360
+    },
+    {
+      "epoch": 0.48737764039155074,
+      "grad_norm": 0.33688366804796616,
+      "learning_rate": 0.00012170054820513401,
+      "loss": 0.2532,
+      "step": 2365
+    },
+    {
+      "epoch": 0.4884080370942813,
+      "grad_norm": 0.38985512420501045,
+      "learning_rate": 0.00012134920263964147,
+      "loss": 0.3374,
+      "step": 2370
+    },
+    {
+      "epoch": 0.48943843379701185,
+      "grad_norm": 0.35409668644772796,
+      "learning_rate": 0.00012099758072795832,
+      "loss": 0.2997,
+      "step": 2375
+    },
+    {
+      "epoch": 0.4904688304997424,
+      "grad_norm": 0.30022085659766934,
+      "learning_rate": 0.00012064568702151335,
+      "loss": 0.2606,
+      "step": 2380
+    },
+    {
+      "epoch": 0.49149922720247297,
+      "grad_norm": 0.3461846293405654,
+      "learning_rate": 0.0001202935260752535,
+      "loss": 0.3363,
+      "step": 2385
+    },
+    {
+      "epoch": 0.4925296239052035,
+      "grad_norm": 0.28834357398525434,
+      "learning_rate": 0.00011994110244758496,
+      "loss": 0.2773,
+      "step": 2390
+    },
+    {
+      "epoch": 0.4935600206079341,
+      "grad_norm": 0.3332635903203983,
+      "learning_rate": 0.00011958842070031395,
+      "loss": 0.3057,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4945904173106646,
+      "grad_norm": 0.35364367272743974,
+      "learning_rate": 0.0001192354853985879,
+      "loss": 0.3095,
+      "step": 2400
+    },
+    {
+      "epoch": 0.4945904173106646,
+      "eval_loss": 0.28022027015686035,
+      "eval_runtime": 2878.1786,
+      "eval_samples_per_second": 2.78,
+      "eval_steps_per_second": 0.347,
+      "step": 2400
+    },
+    {
+      "epoch": 0.49562081401339514,
+      "grad_norm": 0.3454343108599935,
+      "learning_rate": 0.00011888230111083627,
+      "loss": 0.2574,
+      "step": 2405
+    },
+    {
+      "epoch": 0.4966512107161257,
+      "grad_norm": 0.4247051943255858,
+      "learning_rate": 0.00011852887240871145,
+      "loss": 0.3067,
+      "step": 2410
+    },
+    {
+      "epoch": 0.49768160741885625,
+      "grad_norm": 0.36774522098501977,
+      "learning_rate": 0.00011817520386702947,
+      "loss": 0.3384,
+      "step": 2415
+    },
+    {
+      "epoch": 0.49871200412158684,
+      "grad_norm": 0.3272725276077021,
+      "learning_rate": 0.00011782130006371092,
+      "loss": 0.3136,
+      "step": 2420
+    },
+    {
+      "epoch": 0.49974240082431737,
+      "grad_norm": 0.46257832400717397,
+      "learning_rate": 0.00011746716557972167,
+      "loss": 0.3025,
+      "step": 2425
+    },
+    {
+      "epoch": 0.500772797527048,
+      "grad_norm": 0.28584536205539957,
+      "learning_rate": 0.00011711280499901347,
+      "loss": 0.2736,
+      "step": 2430
+    },
+    {
+      "epoch": 0.5018031942297785,
+      "grad_norm": 0.36350270232137516,
+      "learning_rate": 0.00011675822290846474,
+      "loss": 0.3114,
+      "step": 2435
+    },
+    {
+      "epoch": 0.502833590932509,
+      "grad_norm": 0.3515382888246873,
+      "learning_rate": 0.00011640342389782114,
+      "loss": 0.2756,
+      "step": 2440
+    },
+    {
+      "epoch": 0.5038639876352395,
+      "grad_norm": 0.333990361886686,
+      "learning_rate": 0.00011604841255963616,
+      "loss": 0.2982,
+      "step": 2445
+    },
+    {
+      "epoch": 0.5048943843379701,
+      "grad_norm": 0.3774121482190172,
+      "learning_rate": 0.00011569319348921168,
+      "loss": 0.2888,
+      "step": 2450
+    },
+    {
+      "epoch": 0.5059247810407007,
+      "grad_norm": 0.3449765743536067,
+      "learning_rate": 0.00011533777128453844,
+      "loss": 0.239,
+      "step": 2455
+    },
+    {
+      "epoch": 0.5069551777434312,
+      "grad_norm": 0.39508888175442797,
+      "learning_rate": 0.00011498215054623664,
+      "loss": 0.3099,
+      "step": 2460
+    },
+    {
+      "epoch": 0.5079855744461618,
+      "grad_norm": 0.29997399648896544,
+      "learning_rate": 0.00011462633587749629,
+      "loss": 0.2553,
+      "step": 2465
+    },
+    {
+      "epoch": 0.5090159711488923,
+      "grad_norm": 0.3946262619553709,
+      "learning_rate": 0.00011427033188401768,
+      "loss": 0.3003,
+      "step": 2470
+    },
+    {
+      "epoch": 0.5100463678516228,
+      "grad_norm": 0.3979213104058746,
+      "learning_rate": 0.00011391414317395167,
+      "loss": 0.3272,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5110767645543535,
+      "grad_norm": 0.36162636563510103,
+      "learning_rate": 0.00011355777435784016,
+      "loss": 0.2696,
+      "step": 2480
+    },
+    {
+      "epoch": 0.512107161257084,
+      "grad_norm": 0.3436202202392731,
+      "learning_rate": 0.0001132012300485564,
+      "loss": 0.3231,
+      "step": 2485
+    },
+    {
+      "epoch": 0.5131375579598145,
+      "grad_norm": 0.358384231565426,
+      "learning_rate": 0.00011284451486124514,
+      "loss": 0.2691,
+      "step": 2490
+    },
+    {
+      "epoch": 0.5141679546625451,
+      "grad_norm": 0.32204877501229845,
+      "learning_rate": 0.00011248763341326307,
+      "loss": 0.3002,
+      "step": 2495
+    },
+    {
+      "epoch": 0.5151983513652756,
+      "grad_norm": 0.30701193700017093,
+      "learning_rate": 0.00011213059032411897,
+      "loss": 0.3097,
+      "step": 2500
+    },
+    {
+      "epoch": 0.5162287480680062,
+      "grad_norm": 0.3394950970385267,
+      "learning_rate": 0.00011177339021541387,
+      "loss": 0.2587,
+      "step": 2505
+    },
+    {
+      "epoch": 0.5172591447707368,
+      "grad_norm": 0.3366588908099883,
+      "learning_rate": 0.00011141603771078133,
+      "loss": 0.3194,
+      "step": 2510
+    },
+    {
+      "epoch": 0.5182895414734673,
+      "grad_norm": 0.3981768141047026,
+      "learning_rate": 0.00011105853743582751,
+      "loss": 0.2968,
+      "step": 2515
+    },
+    {
+      "epoch": 0.5193199381761978,
+      "grad_norm": 0.32297584447574446,
+      "learning_rate": 0.00011070089401807129,
+      "loss": 0.2836,
+      "step": 2520
+    },
+    {
+      "epoch": 0.5203503348789283,
+      "grad_norm": 0.357520694115771,
+      "learning_rate": 0.0001103431120868845,
+      "loss": 0.2945,
+      "step": 2525
+    },
+    {
+      "epoch": 0.521380731581659,
+      "grad_norm": 0.3150716632390592,
+      "learning_rate": 0.00010998519627343182,
+      "loss": 0.2665,
+      "step": 2530
+    },
+    {
+      "epoch": 0.5224111282843895,
+      "grad_norm": 0.3979335237201818,
+      "learning_rate": 0.00010962715121061095,
+      "loss": 0.3219,
+      "step": 2535
+    },
+    {
+      "epoch": 0.52344152498712,
+      "grad_norm": 0.2755501921687075,
+      "learning_rate": 0.00010926898153299259,
+      "loss": 0.2851,
+      "step": 2540
+    },
+    {
+      "epoch": 0.5244719216898506,
+      "grad_norm": 0.3265322073550003,
+      "learning_rate": 0.00010891069187676051,
+      "loss": 0.2908,
+      "step": 2545
+    },
+    {
+      "epoch": 0.5255023183925811,
+      "grad_norm": 0.2997864769906755,
+      "learning_rate": 0.00010855228687965138,
+      "loss": 0.2829,
+      "step": 2550
+    },
+    {
+      "epoch": 0.5265327150953117,
+      "grad_norm": 0.2857206537360907,
+      "learning_rate": 0.000108193771180895,
+      "loss": 0.2539,
+      "step": 2555
+    },
+    {
+      "epoch": 0.5275631117980423,
+      "grad_norm": 0.36091935273374165,
+      "learning_rate": 0.00010783514942115398,
+      "loss": 0.3289,
+      "step": 2560
+    },
+    {
+      "epoch": 0.5285935085007728,
+      "grad_norm": 0.3436525132226202,
+      "learning_rate": 0.0001074764262424639,
+      "loss": 0.2741,
+      "step": 2565
+    },
+    {
+      "epoch": 0.5296239052035033,
+      "grad_norm": 0.27339341095603925,
+      "learning_rate": 0.00010711760628817304,
+      "loss": 0.2826,
+      "step": 2570
+    },
+    {
+      "epoch": 0.5306543019062339,
+      "grad_norm": 0.40588239990731534,
+      "learning_rate": 0.0001067586942028824,
+      "loss": 0.3024,
+      "step": 2575
+    },
+    {
+      "epoch": 0.5316846986089645,
+      "grad_norm": 0.4102653825925909,
+      "learning_rate": 0.00010639969463238553,
+      "loss": 0.2714,
+      "step": 2580
+    },
+    {
+      "epoch": 0.532715095311695,
+      "grad_norm": 0.3099370106047446,
+      "learning_rate": 0.00010604061222360828,
+      "loss": 0.2808,
+      "step": 2585
+    },
+    {
+      "epoch": 0.5337454920144256,
+      "grad_norm": 0.2822749846247971,
+      "learning_rate": 0.00010568145162454896,
+      "loss": 0.2801,
+      "step": 2590
+    },
+    {
+      "epoch": 0.5347758887171561,
+      "grad_norm": 0.3490990673692889,
+      "learning_rate": 0.00010532221748421787,
+      "loss": 0.2945,
+      "step": 2595
+    },
+    {
+      "epoch": 0.5358062854198866,
+      "grad_norm": 0.4053913540554717,
+      "learning_rate": 0.00010496291445257725,
+      "loss": 0.3532,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5358062854198866,
+      "eval_loss": 0.27754950523376465,
+      "eval_runtime": 2877.7009,
+      "eval_samples_per_second": 2.78,
+      "eval_steps_per_second": 0.347,
+      "step": 2600
+    },
+    {
+      "epoch": 0.5368366821226173,
+      "grad_norm": 0.33039659690717843,
+      "learning_rate": 0.00010460354718048109,
+      "loss": 0.2886,
+      "step": 2605
+    },
+    {
+      "epoch": 0.5378670788253478,
+      "grad_norm": 0.3749436861438668,
+      "learning_rate": 0.00010424412031961484,
+      "loss": 0.3254,
+      "step": 2610
+    },
+    {
+      "epoch": 0.5388974755280783,
+      "grad_norm": 0.21141370926954733,
+      "learning_rate": 0.0001038846385224354,
+      "loss": 0.2522,
+      "step": 2615
+    },
+    {
+      "epoch": 0.5399278722308088,
+      "grad_norm": 0.3834437318572923,
+      "learning_rate": 0.00010352510644211074,
+      "loss": 0.2977,
+      "step": 2620
+    },
+    {
+      "epoch": 0.5409582689335394,
+      "grad_norm": 0.3725001960205959,
+      "learning_rate": 0.0001031655287324596,
+      "loss": 0.3162,
+      "step": 2625
+    },
+    {
+      "epoch": 0.54198866563627,
+      "grad_norm": 0.3203518352250769,
+      "learning_rate": 0.00010280591004789144,
+      "loss": 0.2699,
+      "step": 2630
+    },
+    {
+      "epoch": 0.5430190623390005,
+      "grad_norm": 0.29941898288284835,
+      "learning_rate": 0.00010244625504334609,
+      "loss": 0.2931,
+      "step": 2635
+    },
+    {
+      "epoch": 0.5440494590417311,
+      "grad_norm": 0.30304378110760377,
+      "learning_rate": 0.00010208656837423351,
+      "loss": 0.2689,
+      "step": 2640
+    },
+    {
+      "epoch": 0.5450798557444616,
+      "grad_norm": 0.3348107931428455,
+      "learning_rate": 0.00010172685469637351,
+      "loss": 0.2764,
+      "step": 2645
+    },
+    {
+      "epoch": 0.5461102524471921,
+      "grad_norm": 0.42244879169591937,
+      "learning_rate": 0.00010136711866593551,
+      "loss": 0.3216,
+      "step": 2650
+    },
+    {
+      "epoch": 0.5471406491499228,
+      "grad_norm": 0.3226252753934826,
+      "learning_rate": 0.00010100736493937828,
+      "loss": 0.29,
+      "step": 2655
+    },
+    {
+      "epoch": 0.5481710458526533,
+      "grad_norm": 0.4102152257267717,
+      "learning_rate": 0.00010064759817338965,
+      "loss": 0.3126,
+      "step": 2660
+    },
+    {
+      "epoch": 0.5492014425553838,
+      "grad_norm": 0.35520661791811914,
+      "learning_rate": 0.00010028782302482617,
+      "loss": 0.2796,
+      "step": 2665
+    },
+    {
+      "epoch": 0.5502318392581144,
+      "grad_norm": 0.34575679827777267,
+      "learning_rate": 9.992804415065305e-05,
+      "loss": 0.305,
+      "step": 2670
+    },
+    {
+      "epoch": 0.5512622359608449,
+      "grad_norm": 0.39375752670785547,
+      "learning_rate": 9.956826620788352e-05,
+      "loss": 0.3136,
+      "step": 2675
+    },
+    {
+      "epoch": 0.5522926326635754,
+      "grad_norm": 0.3612100553953111,
+      "learning_rate": 9.920849385351889e-05,
+      "loss": 0.2581,
+      "step": 2680
+    },
+    {
+      "epoch": 0.5533230293663061,
+      "grad_norm": 0.2712636740543298,
+      "learning_rate": 9.884873174448811e-05,
+      "loss": 0.32,
+      "step": 2685
+    },
+    {
+      "epoch": 0.5543534260690366,
+      "grad_norm": 0.3709056434567233,
+      "learning_rate": 9.848898453758752e-05,
+      "loss": 0.2832,
+      "step": 2690
+    },
+    {
+      "epoch": 0.5553838227717671,
+      "grad_norm": 0.31865604636217015,
+      "learning_rate": 9.812925688942054e-05,
+      "loss": 0.2877,
+      "step": 2695
+    },
+    {
+      "epoch": 0.5564142194744977,
+      "grad_norm": 0.38997754979522803,
+      "learning_rate": 9.776955345633739e-05,
+      "loss": 0.3212,
+      "step": 2700
+    },
+    {
+      "epoch": 0.5574446161772282,
+      "grad_norm": 0.33716767476645676,
+      "learning_rate": 9.740987889437492e-05,
+      "loss": 0.2427,
+      "step": 2705
+    },
+    {
+      "epoch": 0.5584750128799588,
+      "grad_norm": 0.33728797221096607,
+      "learning_rate": 9.705023785919623e-05,
+      "loss": 0.2934,
+      "step": 2710
+    },
+    {
+      "epoch": 0.5595054095826894,
+      "grad_norm": 0.29970002424929326,
+      "learning_rate": 9.669063500603049e-05,
+      "loss": 0.2824,
+      "step": 2715
+    },
+    {
+      "epoch": 0.5605358062854199,
+      "grad_norm": 0.4423023226513398,
+      "learning_rate": 9.633107498961251e-05,
+      "loss": 0.2951,
+      "step": 2720
+    },
+    {
+      "epoch": 0.5615662029881504,
+      "grad_norm": 0.3832845615289885,
+      "learning_rate": 9.597156246412277e-05,
+      "loss": 0.3217,
+      "step": 2725
+    },
+    {
+      "epoch": 0.5625965996908809,
+      "grad_norm": 0.3243338286963795,
+      "learning_rate": 9.561210208312694e-05,
+      "loss": 0.2701,
+      "step": 2730
+    },
+    {
+      "epoch": 0.5636269963936116,
+      "grad_norm": 0.3962929861495999,
+      "learning_rate": 9.525269849951578e-05,
+      "loss": 0.2894,
+      "step": 2735
+    },
+    {
+      "epoch": 0.5646573930963421,
+      "grad_norm": 0.2617163116841798,
+      "learning_rate": 9.489335636544476e-05,
+      "loss": 0.2122,
+      "step": 2740
+    },
+    {
+      "epoch": 0.5656877897990726,
+      "grad_norm": 0.3646251465358233,
+      "learning_rate": 9.453408033227406e-05,
+      "loss": 0.3145,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5667181865018032,
+      "grad_norm": 0.3319778563330451,
+      "learning_rate": 9.417487505050816e-05,
+      "loss": 0.2919,
+      "step": 2750
+    },
+    {
+      "epoch": 0.5677485832045337,
+      "grad_norm": 0.372171978724275,
+      "learning_rate": 9.38157451697358e-05,
+      "loss": 0.2757,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5687789799072643,
+      "grad_norm": 0.3752921419845626,
+      "learning_rate": 9.345669533856961e-05,
+      "loss": 0.3013,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5698093766099949,
+      "grad_norm": 0.3043674671804571,
+      "learning_rate": 9.309773020458616e-05,
+      "loss": 0.2543,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5708397733127254,
+      "grad_norm": 0.2550633494163026,
+      "learning_rate": 9.273885441426562e-05,
+      "loss": 0.2497,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5718701700154559,
+      "grad_norm": 0.36955800478440715,
+      "learning_rate": 9.238007261293176e-05,
+      "loss": 0.3131,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5729005667181865,
+      "grad_norm": 0.35332539238207206,
+      "learning_rate": 9.202138944469168e-05,
+      "loss": 0.2658,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5739309634209171,
+      "grad_norm": 0.3264554515234995,
+      "learning_rate": 9.16628095523758e-05,
+      "loss": 0.2897,
+      "step": 2785
+    },
+    {
+      "epoch": 0.5749613601236476,
+      "grad_norm": 0.29610170158655286,
+      "learning_rate": 9.130433757747772e-05,
+      "loss": 0.283,
+      "step": 2790
+    },
+    {
+      "epoch": 0.5759917568263782,
+      "grad_norm": 0.33278027754490686,
+      "learning_rate": 9.094597816009409e-05,
+      "loss": 0.2842,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5770221535291087,
+      "grad_norm": 0.4528453579289396,
+      "learning_rate": 9.058773593886469e-05,
+      "loss": 0.301,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5770221535291087,
+      "eval_loss": 0.27565160393714905,
+      "eval_runtime": 2878.5613,
+      "eval_samples_per_second": 2.779,
+      "eval_steps_per_second": 0.347,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5780525502318392,
+      "grad_norm": 0.35089860682025525,
+      "learning_rate": 9.022961555091226e-05,
+      "loss": 0.2567,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5790829469345699,
+      "grad_norm": 0.33263682642433734,
+      "learning_rate": 8.987162163178256e-05,
+      "loss": 0.2926,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5801133436373004,
+      "grad_norm": 0.3451318549909309,
+      "learning_rate": 8.951375881538421e-05,
+      "loss": 0.2747,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5811437403400309,
+      "grad_norm": 0.32240079639742575,
+      "learning_rate": 8.915603173392895e-05,
+      "loss": 0.2879,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5821741370427614,
+      "grad_norm": 0.399328888829942,
+      "learning_rate": 8.87984450178715e-05,
+      "loss": 0.2948,
+      "step": 2825
+    },
+    {
+      "epoch": 0.583204533745492,
+      "grad_norm": 0.36055298325517393,
+      "learning_rate": 8.84410032958497e-05,
+      "loss": 0.2633,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5842349304482226,
+      "grad_norm": 0.3920621364225242,
+      "learning_rate": 8.808371119462452e-05,
+      "loss": 0.3164,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5852653271509531,
+      "grad_norm": 0.34432478887516577,
+      "learning_rate": 8.772657333902027e-05,
+      "loss": 0.2683,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5862957238536837,
+      "grad_norm": 0.3158164048351637,
+      "learning_rate": 8.736959435186466e-05,
+      "loss": 0.3049,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5873261205564142,
+      "grad_norm": 0.3999813171101764,
+      "learning_rate": 8.701277885392909e-05,
+      "loss": 0.325,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5883565172591447,
+      "grad_norm": 0.32600872728866054,
+      "learning_rate": 8.665613146386854e-05,
+      "loss": 0.2686,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5893869139618754,
+      "grad_norm": 0.352195036886543,
+      "learning_rate": 8.629965679816217e-05,
+      "loss": 0.3215,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5904173106646059,
+      "grad_norm": 0.31676293306405257,
+      "learning_rate": 8.594335947105328e-05,
+      "loss": 0.2762,
+      "step": 2865
+    },
+    {
+      "epoch": 0.5914477073673364,
+      "grad_norm": 0.3596596959340484,
+      "learning_rate": 8.558724409448974e-05,
+      "loss": 0.2708,
+      "step": 2870
+    },
+    {
+      "epoch": 0.592478104070067,
+      "grad_norm": 0.36548567729401993,
+      "learning_rate": 8.523131527806423e-05,
+      "loss": 0.3089,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5935085007727975,
+      "grad_norm": 0.3315130832888099,
+      "learning_rate": 8.48755776289545e-05,
+      "loss": 0.2527,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5945388974755281,
+      "grad_norm": 0.4241057519904373,
+      "learning_rate": 8.452003575186394e-05,
+      "loss": 0.3049,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5955692941782587,
+      "grad_norm": 0.30694798340386065,
+      "learning_rate": 8.416469424896167e-05,
+      "loss": 0.2523,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5965996908809892,
+      "grad_norm": 0.28092729723258986,
+      "learning_rate": 8.380955771982332e-05,
+      "loss": 0.3147,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5976300875837197,
+      "grad_norm": 0.3701838757603822,
+      "learning_rate": 8.345463076137125e-05,
+      "loss": 0.2936,
+      "step": 2900
+    },
+    {
+      "epoch": 0.5986604842864502,
+      "grad_norm": 0.33596931371013156,
+      "learning_rate": 8.309991796781511e-05,
+      "loss": 0.2313,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5996908809891809,
+      "grad_norm": 0.358626386553519,
+      "learning_rate": 8.274542393059235e-05,
+      "loss": 0.2965,
+      "step": 2910
+    },
+    {
+      "epoch": 0.6007212776919114,
+      "grad_norm": 0.34139489578529375,
+      "learning_rate": 8.239115323830889e-05,
+      "loss": 0.2526,
+      "step": 2915
+    },
+    {
+      "epoch": 0.6017516743946419,
+      "grad_norm": 0.3258606452817875,
+      "learning_rate": 8.203711047667958e-05,
+      "loss": 0.2881,
+      "step": 2920
+    },
+    {
+      "epoch": 0.6027820710973725,
+      "grad_norm": 0.3847954972920331,
+      "learning_rate": 8.1683300228469e-05,
+      "loss": 0.337,
+      "step": 2925
+    },
+    {
+      "epoch": 0.603812467800103,
+      "grad_norm": 0.29131611380441297,
+      "learning_rate": 8.132972707343192e-05,
+      "loss": 0.2275,
+      "step": 2930
+    },
+    {
+      "epoch": 0.6048428645028336,
+      "grad_norm": 0.27037063359019814,
+      "learning_rate": 8.097639558825427e-05,
+      "loss": 0.2976,
+      "step": 2935
+    },
+    {
+      "epoch": 0.6058732612055642,
+      "grad_norm": 0.32696985705968035,
+      "learning_rate": 8.062331034649376e-05,
+      "loss": 0.2878,
+      "step": 2940
+    },
+    {
+      "epoch": 0.6069036579082947,
+      "grad_norm": 0.44146891732351323,
+      "learning_rate": 8.027047591852069e-05,
+      "loss": 0.2888,
+      "step": 2945
+    },
+    {
+      "epoch": 0.6079340546110252,
+      "grad_norm": 0.39221179639528275,
+      "learning_rate": 7.991789687145873e-05,
+      "loss": 0.3014,
+      "step": 2950
+    },
+    {
+      "epoch": 0.6089644513137558,
+      "grad_norm": 0.3510472981653684,
+      "learning_rate": 7.956557776912596e-05,
+      "loss": 0.2546,
+      "step": 2955
+    },
+    {
+      "epoch": 0.6099948480164864,
+      "grad_norm": 0.38058040305787494,
+      "learning_rate": 7.921352317197574e-05,
+      "loss": 0.2949,
+      "step": 2960
+    },
+    {
+      "epoch": 0.6110252447192169,
+      "grad_norm": 0.35902085682972984,
+      "learning_rate": 7.886173763703757e-05,
+      "loss": 0.2827,
+      "step": 2965
+    },
+    {
+      "epoch": 0.6120556414219475,
+      "grad_norm": 0.3927327124338447,
+      "learning_rate": 7.851022571785819e-05,
+      "loss": 0.3116,
+      "step": 2970
+    },
+    {
+      "epoch": 0.613086038124678,
+      "grad_norm": 0.3935039943632793,
+      "learning_rate": 7.815899196444267e-05,
+      "loss": 0.3131,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6141164348274085,
+      "grad_norm": 0.3561243208088519,
+      "learning_rate": 7.780804092319547e-05,
+      "loss": 0.2535,
+      "step": 2980
+    },
+    {
+      "epoch": 0.615146831530139,
+      "grad_norm": 0.3479054336232948,
+      "learning_rate": 7.745737713686152e-05,
+      "loss": 0.2948,
+      "step": 2985
+    },
+    {
+      "epoch": 0.6161772282328697,
+      "grad_norm": 0.3898086480384796,
+      "learning_rate": 7.710700514446762e-05,
+      "loss": 0.305,
+      "step": 2990
+    },
+    {
+      "epoch": 0.6172076249356002,
+      "grad_norm": 0.2782916809425304,
+      "learning_rate": 7.675692948126345e-05,
+      "loss": 0.3028,
+      "step": 2995
+    },
+    {
+      "epoch": 0.6182380216383307,
+      "grad_norm": 0.3827931144256995,
+      "learning_rate": 7.640715467866307e-05,
+      "loss": 0.3204,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6182380216383307,
+      "eval_loss": 0.2711670994758606,
+      "eval_runtime": 2879.0056,
+      "eval_samples_per_second": 2.779,
+      "eval_steps_per_second": 0.347,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6192684183410613,
+      "grad_norm": 0.3311400596855034,
+      "learning_rate": 7.605768526418605e-05,
+      "loss": 0.2534,
+      "step": 3005
+    },
+    {
+      "epoch": 0.6202988150437918,
+      "grad_norm": 0.4024348674942288,
+      "learning_rate": 7.57085257613991e-05,
+      "loss": 0.2999,
+      "step": 3010
+    },
+    {
+      "epoch": 0.6213292117465224,
+      "grad_norm": 0.30615727991401664,
+      "learning_rate": 7.535968068985737e-05,
+      "loss": 0.277,
+      "step": 3015
+    },
+    {
+      "epoch": 0.622359608449253,
+      "grad_norm": 0.3434621463110342,
+      "learning_rate": 7.501115456504595e-05,
+      "loss": 0.2832,
+      "step": 3020
+    },
+    {
+      "epoch": 0.6233900051519835,
+      "grad_norm": 0.34574651517676536,
+      "learning_rate": 7.466295189832148e-05,
+      "loss": 0.2949,
+      "step": 3025
+    },
+    {
+      "epoch": 0.624420401854714,
+      "grad_norm": 0.3352676719646584,
+      "learning_rate": 7.431507719685371e-05,
+      "loss": 0.2449,
+      "step": 3030
+    },
+    {
+      "epoch": 0.6254507985574446,
+      "grad_norm": 0.4336028485216119,
+      "learning_rate": 7.396753496356718e-05,
+      "loss": 0.3216,
+      "step": 3035
+    },
+    {
+      "epoch": 0.6264811952601752,
+      "grad_norm": 0.33779434998514446,
+      "learning_rate": 7.362032969708297e-05,
+      "loss": 0.2632,
+      "step": 3040
+    },
+    {
+      "epoch": 0.6275115919629057,
+      "grad_norm": 0.3337649698653029,
+      "learning_rate": 7.327346589166035e-05,
+      "loss": 0.2749,
+      "step": 3045
+    },
+    {
+      "epoch": 0.6285419886656363,
+      "grad_norm": 0.4278703603131763,
+      "learning_rate": 7.292694803713871e-05,
+      "loss": 0.3101,
+      "step": 3050
+    },
+    {
+      "epoch": 0.6295723853683668,
+      "grad_norm": 0.33098852192128414,
+      "learning_rate": 7.258078061887947e-05,
+      "loss": 0.2577,
+      "step": 3055
+    },
+    {
+      "epoch": 0.6306027820710973,
+      "grad_norm": 0.4010491539631291,
+      "learning_rate": 7.223496811770796e-05,
+      "loss": 0.3134,
+      "step": 3060
+    },
+    {
+      "epoch": 0.631633178773828,
+      "grad_norm": 0.46500732529362854,
+      "learning_rate": 7.188951500985533e-05,
+      "loss": 0.2926,
+      "step": 3065
+    },
+    {
+      "epoch": 0.6326635754765585,
+      "grad_norm": 0.3282290070792828,
+      "learning_rate": 7.154442576690083e-05,
+      "loss": 0.3185,
+      "step": 3070
+    },
+    {
+      "epoch": 0.633693972179289,
+      "grad_norm": 0.43208806306822944,
+      "learning_rate": 7.119970485571375e-05,
+      "loss": 0.3238,
+      "step": 3075
+    },
+    {
+      "epoch": 0.6347243688820196,
+      "grad_norm": 0.31953297640440387,
+      "learning_rate": 7.08553567383956e-05,
+      "loss": 0.2961,
+      "step": 3080
+    },
+    {
+      "epoch": 0.6357547655847501,
+      "grad_norm": 0.37608984484932495,
+      "learning_rate": 7.051138587222255e-05,
+      "loss": 0.3062,
+      "step": 3085
+    },
+    {
+      "epoch": 0.6367851622874807,
+      "grad_norm": 0.30155315001331834,
+      "learning_rate": 7.016779670958746e-05,
+      "loss": 0.2951,
+      "step": 3090
+    },
+    {
+      "epoch": 0.6378155589902113,
+      "grad_norm": 0.3072620029346922,
+      "learning_rate": 6.982459369794247e-05,
+      "loss": 0.2783,
+      "step": 3095
+    },
+    {
+      "epoch": 0.6388459556929418,
+      "grad_norm": 0.30677479360829335,
+      "learning_rate": 6.948178127974127e-05,
+      "loss": 0.275,
+      "step": 3100
+    },
+    {
+      "epoch": 0.6398763523956723,
+      "grad_norm": 0.31058594621709634,
+      "learning_rate": 6.913936389238174e-05,
+      "loss": 0.2751,
+      "step": 3105
+    },
+    {
+      "epoch": 0.6409067490984028,
+      "grad_norm": 0.38892915494581226,
+      "learning_rate": 6.879734596814839e-05,
+      "loss": 0.3148,
+      "step": 3110
+    },
+    {
+      "epoch": 0.6419371458011335,
+      "grad_norm": 0.3166973015889613,
+      "learning_rate": 6.845573193415508e-05,
+      "loss": 0.308,
+      "step": 3115
+    },
+    {
+      "epoch": 0.642967542503864,
+      "grad_norm": 0.3572971206745672,
+      "learning_rate": 6.811452621228766e-05,
+      "loss": 0.3069,
+      "step": 3120
+    },
+    {
+      "epoch": 0.6439979392065945,
+      "grad_norm": 0.3735142202617577,
+      "learning_rate": 6.777373321914671e-05,
+      "loss": 0.3258,
+      "step": 3125
+    },
+    {
+      "epoch": 0.6450283359093251,
+      "grad_norm": 0.34484584719959194,
+      "learning_rate": 6.743335736599045e-05,
+      "loss": 0.254,
+      "step": 3130
+    },
+    {
+      "epoch": 0.6460587326120556,
+      "grad_norm": 0.40642185135429687,
+      "learning_rate": 6.709340305867762e-05,
+      "loss": 0.3387,
+      "step": 3135
+    },
+    {
+      "epoch": 0.6470891293147862,
+      "grad_norm": 0.34408235252291053,
+      "learning_rate": 6.675387469761033e-05,
+      "loss": 0.2852,
+      "step": 3140
+    },
+    {
+      "epoch": 0.6481195260175168,
+      "grad_norm": 0.2760204462481522,
+      "learning_rate": 6.641477667767738e-05,
+      "loss": 0.2979,
+      "step": 3145
+    },
+    {
+      "epoch": 0.6491499227202473,
+      "grad_norm": 0.3521257726089709,
+      "learning_rate": 6.607611338819697e-05,
+      "loss": 0.2972,
+      "step": 3150
+    },
+    {
+      "epoch": 0.6501803194229778,
+      "grad_norm": 0.3570108107553612,
+      "learning_rate": 6.573788921286028e-05,
+      "loss": 0.2556,
+      "step": 3155
+    },
+    {
+      "epoch": 0.6512107161257084,
+      "grad_norm": 0.29080835096475477,
+      "learning_rate": 6.540010852967447e-05,
+      "loss": 0.2909,
+      "step": 3160
+    },
+    {
+      "epoch": 0.652241112828439,
+      "grad_norm": 0.35414069594841313,
+      "learning_rate": 6.506277571090613e-05,
+      "loss": 0.2618,
+      "step": 3165
+    },
+    {
+      "epoch": 0.6532715095311695,
+      "grad_norm": 0.34570363781855507,
+      "learning_rate": 6.47258951230246e-05,
+      "loss": 0.2927,
+      "step": 3170
+    },
+    {
+      "epoch": 0.6543019062339,
+      "grad_norm": 0.3516727468458287,
+      "learning_rate": 6.438947112664555e-05,
+      "loss": 0.2738,
+      "step": 3175
+    },
+    {
+      "epoch": 0.6553323029366306,
+      "grad_norm": 0.2504400994918305,
+      "learning_rate": 6.405350807647444e-05,
+      "loss": 0.2446,
+      "step": 3180
+    },
+    {
+      "epoch": 0.6563626996393611,
+      "grad_norm": 0.3041195746043791,
+      "learning_rate": 6.371801032125026e-05,
+      "loss": 0.282,
+      "step": 3185
+    },
+    {
+      "epoch": 0.6573930963420918,
+      "grad_norm": 0.3395467056509759,
+      "learning_rate": 6.338298220368912e-05,
+      "loss": 0.2616,
+      "step": 3190
+    },
+    {
+      "epoch": 0.6584234930448223,
+      "grad_norm": 0.36750181241259344,
+      "learning_rate": 6.304842806042812e-05,
+      "loss": 0.2649,
+      "step": 3195
+    },
+    {
+      "epoch": 0.6594538897475528,
+      "grad_norm": 0.41382005017445883,
+      "learning_rate": 6.271435222196916e-05,
+      "loss": 0.3158,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6594538897475528,
+      "eval_loss": 0.26873406767845154,
+      "eval_runtime": 2877.9738,
+      "eval_samples_per_second": 2.78,
+      "eval_steps_per_second": 0.347,
+      "step": 3200
+    },
+    {
+      "epoch": 0.6604842864502833,
+      "grad_norm": 0.3166096758409699,
+      "learning_rate": 6.238075901262293e-05,
+      "loss": 0.2522,
+      "step": 3205
+    },
+    {
+      "epoch": 0.6615146831530139,
+      "grad_norm": 0.3330010541870663,
+      "learning_rate": 6.204765275045298e-05,
+      "loss": 0.2961,
+      "step": 3210
+    },
+    {
+      "epoch": 0.6625450798557445,
+      "grad_norm": 0.26670607401675234,
+      "learning_rate": 6.171503774721966e-05,
+      "loss": 0.2548,
+      "step": 3215
+    },
+    {
+      "epoch": 0.663575476558475,
+      "grad_norm": 0.3302326211327972,
+      "learning_rate": 6.13829183083245e-05,
+      "loss": 0.314,
+      "step": 3220
+    },
+    {
+      "epoch": 0.6646058732612056,
+      "grad_norm": 0.3353496397076012,
+      "learning_rate": 6.105129873275435e-05,
+      "loss": 0.3014,
+      "step": 3225
+    },
+    {
+      "epoch": 0.6656362699639361,
+      "grad_norm": 0.31520904347140727,
+      "learning_rate": 6.072018331302577e-05,
+      "loss": 0.2333,
+      "step": 3230
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.35782950763156535,
+      "learning_rate": 6.038957633512957e-05,
+      "loss": 0.2881,
+      "step": 3235
+    },
+    {
+      "epoch": 0.6676970633693973,
+      "grad_norm": 0.2891661798517709,
+      "learning_rate": 6.005948207847516e-05,
+      "loss": 0.2499,
+      "step": 3240
+    },
+    {
+      "epoch": 0.6687274600721278,
+      "grad_norm": 0.29473346542335793,
+      "learning_rate": 5.97299048158352e-05,
+      "loss": 0.2756,
+      "step": 3245
+    },
+    {
+      "epoch": 0.6697578567748583,
+      "grad_norm": 0.4272344871197883,
+      "learning_rate": 5.940084881329042e-05,
+      "loss": 0.3062,
+      "step": 3250
+    },
+    {
+      "epoch": 0.6707882534775889,
+      "grad_norm": 0.4206884593213127,
+      "learning_rate": 5.907231833017424e-05,
+      "loss": 0.2517,
+      "step": 3255
+    },
+    {
+      "epoch": 0.6718186501803194,
+      "grad_norm": 0.3441386348303789,
+      "learning_rate": 5.8744317619017755e-05,
+      "loss": 0.3093,
+      "step": 3260
+    },
+    {
+      "epoch": 0.67284904688305,
+      "grad_norm": 0.33030990891048667,
+      "learning_rate": 5.841685092549456e-05,
+      "loss": 0.285,
+      "step": 3265
+    },
+    {
+      "epoch": 0.6738794435857806,
+      "grad_norm": 0.3414165248105595,
+      "learning_rate": 5.8089922488365975e-05,
+      "loss": 0.2906,
+      "step": 3270
+    },
+    {
+      "epoch": 0.6749098402885111,
+      "grad_norm": 0.36693139617736636,
+      "learning_rate": 5.776353653942602e-05,
+      "loss": 0.3077,
+      "step": 3275
+    },
+    {
+      "epoch": 0.6759402369912416,
+      "grad_norm": 0.35952213271475214,
+      "learning_rate": 5.743769730344666e-05,
+      "loss": 0.2588,
+      "step": 3280
+    },
+    {
+      "epoch": 0.6769706336939721,
+      "grad_norm": 0.3817072974116425,
+      "learning_rate": 5.7112408998123256e-05,
+      "loss": 0.3116,
+      "step": 3285
+    },
+    {
+      "epoch": 0.6780010303967027,
+      "grad_norm": 0.35226830622590205,
+      "learning_rate": 5.678767583401974e-05,
+      "loss": 0.2631,
+      "step": 3290
+    },
+    {
+      "epoch": 0.6790314270994333,
+      "grad_norm": 0.30682229922613086,
+      "learning_rate": 5.646350201451438e-05,
+      "loss": 0.3023,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6800618238021638,
+      "grad_norm": 0.3752337210236874,
+      "learning_rate": 5.613989173574512e-05,
+      "loss": 0.2816,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6810922205048944,
+      "grad_norm": 0.3785279140180591,
+      "learning_rate": 5.5816849186555386e-05,
+      "loss": 0.2239,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6821226172076249,
+      "grad_norm": 0.32589901787515324,
+      "learning_rate": 5.549437854843995e-05,
+      "loss": 0.2436,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6831530139103554,
+      "grad_norm": 0.3268785279060166,
+      "learning_rate": 5.517248399549063e-05,
+      "loss": 0.2332,
+      "step": 3315
+    },
+    {
+      "epoch": 0.6841834106130861,
+      "grad_norm": 0.3341851818333058,
+      "learning_rate": 5.48511696943423e-05,
+      "loss": 0.293,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6852138073158166,
+      "grad_norm": 0.3438105755717694,
+      "learning_rate": 5.4530439804119096e-05,
+      "loss": 0.3301,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6862442040185471,
+      "grad_norm": 0.3356276432272127,
+      "learning_rate": 5.4210298476380484e-05,
+      "loss": 0.2566,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6872746007212777,
+      "grad_norm": 0.27575134631458276,
+      "learning_rate": 5.38907498550674e-05,
+      "loss": 0.29,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6883049974240082,
+      "grad_norm": 0.6488176265940205,
+      "learning_rate": 5.357179807644887e-05,
+      "loss": 0.264,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6893353941267388,
+      "grad_norm": 0.3468629435100644,
+      "learning_rate": 5.3253447269068245e-05,
+      "loss": 0.3106,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6903657908294694,
+      "grad_norm": 0.3959804063346456,
+      "learning_rate": 5.293570155368981e-05,
+      "loss": 0.2888,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6913961875321999,
+      "grad_norm": 0.29830853292625725,
+      "learning_rate": 5.261856504324563e-05,
+      "loss": 0.2553,
+      "step": 3355
+    },
+    {
+      "epoch": 0.6924265842349304,
+      "grad_norm": 0.32422270047266666,
+      "learning_rate": 5.230204184278195e-05,
+      "loss": 0.2863,
+      "step": 3360
+    },
+    {
+      "epoch": 0.693456980937661,
+      "grad_norm": 0.3472569191866048,
+      "learning_rate": 5.198613604940649e-05,
+      "loss": 0.2957,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6944873776403916,
+      "grad_norm": 0.3756763363166075,
+      "learning_rate": 5.1670851752235025e-05,
+      "loss": 0.305,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6955177743431221,
+      "grad_norm": 0.34873701828232423,
+      "learning_rate": 5.135619303233867e-05,
+      "loss": 0.3007,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6965481710458526,
+      "grad_norm": 0.3521257255678766,
+      "learning_rate": 5.104216396269109e-05,
+      "loss": 0.2589,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6975785677485832,
+      "grad_norm": 0.36867262641478793,
+      "learning_rate": 5.072876860811553e-05,
+      "loss": 0.3187,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6986089644513137,
+      "grad_norm": 0.3582623295688211,
+      "learning_rate": 5.0416011025232546e-05,
+      "loss": 0.2496,
+      "step": 3390
+    },
+    {
+      "epoch": 0.6996393611540443,
+      "grad_norm": 0.2938091378788691,
+      "learning_rate": 5.010389526240719e-05,
+      "loss": 0.267,
+      "step": 3395
+    },
+    {
+      "epoch": 0.7006697578567749,
+      "grad_norm": 0.44117444865964234,
+      "learning_rate": 4.979242535969672e-05,
+      "loss": 0.3032,
+      "step": 3400
+    },
+    {
+      "epoch": 0.7006697578567749,
+      "eval_loss": 0.2666991353034973,
+      "eval_runtime": 2879.7665,
+      "eval_samples_per_second": 2.778,
+      "eval_steps_per_second": 0.347,
+      "step": 3400
+    },
+    {
+      "epoch": 0.7017001545595054,
+      "grad_norm": 0.32564728902576306,
+      "learning_rate": 4.9481605348798435e-05,
+      "loss": 0.279,
+      "step": 3405
+    },
+    {
+      "epoch": 0.7027305512622359,
+      "grad_norm": 0.3341012868943753,
+      "learning_rate": 4.917143925299728e-05,
+      "loss": 0.2746,
+      "step": 3410
+    },
+    {
+      "epoch": 0.7037609479649665,
+      "grad_norm": 0.34149796395178733,
+      "learning_rate": 4.886193108711378e-05,
+      "loss": 0.2561,
+      "step": 3415
+    },
+    {
+      "epoch": 0.7047913446676971,
+      "grad_norm": 0.32243235346772253,
+      "learning_rate": 4.8553084857452426e-05,
+      "loss": 0.2944,
+      "step": 3420
+    },
+    {
+      "epoch": 0.7058217413704276,
+      "grad_norm": 0.44891237315770505,
+      "learning_rate": 4.824490456174926e-05,
+      "loss": 0.3099,
+      "step": 3425
+    },
+    {
+      "epoch": 0.7068521380731582,
+      "grad_norm": 0.3713432210872996,
+      "learning_rate": 4.7937394189120485e-05,
+      "loss": 0.279,
+      "step": 3430
+    },
+    {
+      "epoch": 0.7078825347758887,
+      "grad_norm": 0.32387159462298654,
+      "learning_rate": 4.763055772001086e-05,
+      "loss": 0.2954,
+      "step": 3435
+    },
+    {
+      "epoch": 0.7089129314786192,
+      "grad_norm": 0.3655049214248582,
+      "learning_rate": 4.732439912614195e-05,
+      "loss": 0.2599,
+      "step": 3440
+    },
+    {
+      "epoch": 0.7099433281813499,
+      "grad_norm": 0.3585347195553639,
+      "learning_rate": 4.7018922370460835e-05,
+      "loss": 0.2819,
+      "step": 3445
+    },
+    {
+      "epoch": 0.7109737248840804,
+      "grad_norm": 0.39037453777103204,
+      "learning_rate": 4.671413140708893e-05,
+      "loss": 0.2956,
+      "step": 3450
+    },
+    {
+      "epoch": 0.7120041215868109,
+      "grad_norm": 0.44234657360712804,
+      "learning_rate": 4.6410030181270546e-05,
+      "loss": 0.2585,
+      "step": 3455
+    },
+    {
+      "epoch": 0.7130345182895415,
+      "grad_norm": 0.4181782981247106,
+      "learning_rate": 4.610662262932209e-05,
+      "loss": 0.3028,
+      "step": 3460
+    },
+    {
+      "epoch": 0.714064914992272,
+      "grad_norm": 0.36884003440220187,
+      "learning_rate": 4.5803912678580906e-05,
+      "loss": 0.2893,
+      "step": 3465
+    },
+    {
+      "epoch": 0.7150953116950026,
+      "grad_norm": 0.3437139757005691,
+      "learning_rate": 4.5501904247354474e-05,
+      "loss": 0.2672,
+      "step": 3470
+    },
+    {
+      "epoch": 0.7161257083977332,
+      "grad_norm": 0.3692077816092001,
+      "learning_rate": 4.520060124486989e-05,
+      "loss": 0.313,
+      "step": 3475
+    },
+    {
+      "epoch": 0.7171561051004637,
+      "grad_norm": 0.3688505909089892,
+      "learning_rate": 4.4900007571222946e-05,
+      "loss": 0.2338,
+      "step": 3480
+    },
+    {
+      "epoch": 0.7181865018031942,
+      "grad_norm": 0.33900982896467213,
+      "learning_rate": 4.460012711732795e-05,
+      "loss": 0.2938,
+      "step": 3485
+    },
+    {
+      "epoch": 0.7192168985059247,
+      "grad_norm": 0.37856168681933267,
+      "learning_rate": 4.430096376486713e-05,
+      "loss": 0.2671,
+      "step": 3490
+    },
+    {
+      "epoch": 0.7202472952086554,
+      "grad_norm": 0.2878154556878028,
+      "learning_rate": 4.4002521386240466e-05,
+      "loss": 0.2982,
+      "step": 3495
+    },
+    {
+      "epoch": 0.7212776919113859,
+      "grad_norm": 0.3454781004229804,
+      "learning_rate": 4.3704803844515705e-05,
+      "loss": 0.3049,
+      "step": 3500
+    },
+    {
+      "epoch": 0.7223080886141164,
+      "grad_norm": 0.3950485263991763,
+      "learning_rate": 4.3407814993378095e-05,
+      "loss": 0.2416,
+      "step": 3505
+    },
+    {
+      "epoch": 0.723338485316847,
+      "grad_norm": 0.3441530397526022,
+      "learning_rate": 4.311155867708071e-05,
+      "loss": 0.296,
+      "step": 3510
+    },
+    {
+      "epoch": 0.7243688820195775,
+      "grad_norm": 0.3142921040902395,
+      "learning_rate": 4.2816038730394656e-05,
+      "loss": 0.2544,
+      "step": 3515
+    },
+    {
+      "epoch": 0.7253992787223081,
+      "grad_norm": 0.34653726659247286,
+      "learning_rate": 4.252125897855932e-05,
+      "loss": 0.2707,
+      "step": 3520
+    },
+    {
+      "epoch": 0.7264296754250387,
+      "grad_norm": 0.4116989451226589,
+      "learning_rate": 4.222722323723294e-05,
+      "loss": 0.3414,
+      "step": 3525
+    },
+    {
+      "epoch": 0.7274600721277692,
+      "grad_norm": 0.32774342903134085,
+      "learning_rate": 4.1933935312443286e-05,
+      "loss": 0.2357,
+      "step": 3530
+    },
+    {
+      "epoch": 0.7284904688304997,
+      "grad_norm": 0.40918991230864377,
+      "learning_rate": 4.164139900053824e-05,
+      "loss": 0.3185,
+      "step": 3535
+    },
+    {
+      "epoch": 0.7295208655332303,
+      "grad_norm": 0.3188798828984705,
+      "learning_rate": 4.134961808813672e-05,
+      "loss": 0.255,
+      "step": 3540
+    },
+    {
+      "epoch": 0.7305512622359609,
+      "grad_norm": 0.3630307170455085,
+      "learning_rate": 4.1058596352079805e-05,
+      "loss": 0.2985,
+      "step": 3545
+    },
+    {
+      "epoch": 0.7315816589386914,
+      "grad_norm": 0.3547013162494809,
+      "learning_rate": 4.076833755938153e-05,
+      "loss": 0.3032,
+      "step": 3550
+    },
+    {
+      "epoch": 0.732612055641422,
+      "grad_norm": 0.3387183224894239,
+      "learning_rate": 4.0478845467180506e-05,
+      "loss": 0.2515,
+      "step": 3555
+    },
+    {
+      "epoch": 0.7336424523441525,
+      "grad_norm": 0.36466954036357224,
+      "learning_rate": 4.0190123822690965e-05,
+      "loss": 0.2937,
+      "step": 3560
+    },
+    {
+      "epoch": 0.734672849046883,
+      "grad_norm": 0.3143229990949987,
+      "learning_rate": 3.990217636315441e-05,
+      "loss": 0.2423,
+      "step": 3565
+    },
+    {
+      "epoch": 0.7357032457496137,
+      "grad_norm": 0.335420380404653,
+      "learning_rate": 3.961500681579129e-05,
+      "loss": 0.2678,
+      "step": 3570
+    },
+    {
+      "epoch": 0.7367336424523442,
+      "grad_norm": 0.37119641635914863,
+      "learning_rate": 3.9328618897752566e-05,
+      "loss": 0.2926,
+      "step": 3575
+    },
+    {
+      "epoch": 0.7377640391550747,
+      "grad_norm": 0.36479696667762745,
+      "learning_rate": 3.904301631607186e-05,
+      "loss": 0.2714,
+      "step": 3580
+    },
+    {
+      "epoch": 0.7387944358578052,
+      "grad_norm": 0.3767501561850481,
+      "learning_rate": 3.875820276761717e-05,
+      "loss": 0.3114,
+      "step": 3585
+    },
+    {
+      "epoch": 0.7398248325605358,
+      "grad_norm": 0.31019709873450946,
+      "learning_rate": 3.847418193904324e-05,
+      "loss": 0.2397,
+      "step": 3590
+    },
+    {
+      "epoch": 0.7408552292632664,
+      "grad_norm": 0.34360053586472283,
+      "learning_rate": 3.8190957506743806e-05,
+      "loss": 0.2985,
+      "step": 3595
+    },
+    {
+      "epoch": 0.7418856259659969,
+      "grad_norm": 0.3495720979590242,
+      "learning_rate": 3.790853313680389e-05,
+      "loss": 0.2851,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7418856259659969,
+      "eval_loss": 0.2644506096839905,
+      "eval_runtime": 2882.0514,
+      "eval_samples_per_second": 2.776,
+      "eval_steps_per_second": 0.347,
+      "step": 3600
+    },
+    {
+      "epoch": 0.7429160226687275,
+      "grad_norm": 0.39945130493412156,
+      "learning_rate": 3.7626912484952495e-05,
+      "loss": 0.2578,
+      "step": 3605
+    },
+    {
+      "epoch": 0.743946419371458,
+      "grad_norm": 0.3148594700458464,
+      "learning_rate": 3.734609919651523e-05,
+      "loss": 0.2948,
+      "step": 3610
+    },
+    {
+      "epoch": 0.7449768160741885,
+      "grad_norm": 0.3215798364969972,
+      "learning_rate": 3.706609690636703e-05,
+      "loss": 0.2627,
+      "step": 3615
+    },
+    {
+      "epoch": 0.7460072127769191,
+      "grad_norm": 0.3548798973562654,
+      "learning_rate": 3.6786909238885215e-05,
+      "loss": 0.3064,
+      "step": 3620
+    },
+    {
+      "epoch": 0.7470376094796497,
+      "grad_norm": 0.31115745685492907,
+      "learning_rate": 3.650853980790262e-05,
+      "loss": 0.2837,
+      "step": 3625
+    },
+    {
+      "epoch": 0.7480680061823802,
+      "grad_norm": 0.27553549207406003,
+      "learning_rate": 3.6230992216660664e-05,
+      "loss": 0.255,
+      "step": 3630
+    },
+    {
+      "epoch": 0.7490984028851108,
+      "grad_norm": 0.3096210760605592,
+      "learning_rate": 3.595427005776281e-05,
+      "loss": 0.282,
+      "step": 3635
+    },
+    {
+      "epoch": 0.7501287995878413,
+      "grad_norm": 0.3605700004322446,
+      "learning_rate": 3.5678376913128075e-05,
+      "loss": 0.2682,
+      "step": 3640
+    },
+    {
+      "epoch": 0.7511591962905718,
+      "grad_norm": 0.32678097806644013,
+      "learning_rate": 3.540331635394458e-05,
+      "loss": 0.2891,
+      "step": 3645
+    },
+    {
+      "epoch": 0.7521895929933025,
+      "grad_norm": 0.37435989227694727,
+      "learning_rate": 3.512909194062347e-05,
+      "loss": 0.3123,
+      "step": 3650
+    },
+    {
+      "epoch": 0.753219989696033,
+      "grad_norm": 0.3132183633883141,
+      "learning_rate": 3.485570722275264e-05,
+      "loss": 0.2516,
+      "step": 3655
+    },
+    {
+      "epoch": 0.7542503863987635,
+      "grad_norm": 0.39780383939283487,
+      "learning_rate": 3.458316573905087e-05,
+      "loss": 0.2857,
+      "step": 3660
+    },
+    {
+      "epoch": 0.755280783101494,
+      "grad_norm": 0.34139632684317156,
+      "learning_rate": 3.4311471017322175e-05,
+      "loss": 0.2345,
+      "step": 3665
+    },
+    {
+      "epoch": 0.7563111798042246,
+      "grad_norm": 0.3438982360840127,
+      "learning_rate": 3.4040626574409815e-05,
+      "loss": 0.275,
+      "step": 3670
+    },
+    {
+      "epoch": 0.7573415765069552,
+      "grad_norm": 0.34410794574144105,
+      "learning_rate": 3.377063591615113e-05,
+      "loss": 0.3175,
+      "step": 3675
+    },
+    {
+      "epoch": 0.7583719732096857,
+      "grad_norm": 0.3966945722421853,
+      "learning_rate": 3.350150253733186e-05,
+      "loss": 0.245,
+      "step": 3680
+    },
+    {
+      "epoch": 0.7594023699124163,
+      "grad_norm": 0.34416471278077815,
+      "learning_rate": 3.3233229921641064e-05,
+      "loss": 0.2788,
+      "step": 3685
+    },
+    {
+      "epoch": 0.7604327666151468,
+      "grad_norm": 0.3070507519002996,
+      "learning_rate": 3.296582154162604e-05,
+      "loss": 0.2541,
+      "step": 3690
+    },
+    {
+      "epoch": 0.7614631633178773,
+      "grad_norm": 0.3156034424176268,
+      "learning_rate": 3.2699280858647327e-05,
+      "loss": 0.3,
+      "step": 3695
+    },
+    {
+      "epoch": 0.762493560020608,
+      "grad_norm": 0.34194205180770326,
+      "learning_rate": 3.2433611322833845e-05,
+      "loss": 0.3081,
+      "step": 3700
+    },
+    {
+      "epoch": 0.7635239567233385,
+      "grad_norm": 0.33451394695168063,
+      "learning_rate": 3.216881637303839e-05,
+      "loss": 0.2352,
+      "step": 3705
+    },
+    {
+      "epoch": 0.764554353426069,
+      "grad_norm": 0.30760730599169134,
+      "learning_rate": 3.190489943679297e-05,
+      "loss": 0.3016,
+      "step": 3710
+    },
+    {
+      "epoch": 0.7655847501287996,
+      "grad_norm": 0.33011461282751464,
+      "learning_rate": 3.164186393026445e-05,
+      "loss": 0.2455,
+      "step": 3715
+    },
+    {
+      "epoch": 0.7666151468315301,
+      "grad_norm": 0.3557839605304464,
+      "learning_rate": 3.137971325821054e-05,
+      "loss": 0.3085,
+      "step": 3720
+    },
+    {
+      "epoch": 0.7676455435342607,
+      "grad_norm": 0.4005436680671154,
+      "learning_rate": 3.111845081393542e-05,
+      "loss": 0.2881,
+      "step": 3725
+    },
+    {
+      "epoch": 0.7686759402369913,
+      "grad_norm": 0.33159697370088725,
+      "learning_rate": 3.0858079979245965e-05,
+      "loss": 0.2463,
+      "step": 3730
+    },
+    {
+      "epoch": 0.7697063369397218,
+      "grad_norm": 0.2682129393375418,
+      "learning_rate": 3.059860412440811e-05,
+      "loss": 0.2776,
+      "step": 3735
+    },
+    {
+      "epoch": 0.7707367336424523,
+      "grad_norm": 0.31234701048600605,
+      "learning_rate": 3.0340026608102902e-05,
+      "loss": 0.2284,
+      "step": 3740
+    },
+    {
+      "epoch": 0.7717671303451829,
+      "grad_norm": 0.3499928250572632,
+      "learning_rate": 3.008235077738334e-05,
+      "loss": 0.3077,
+      "step": 3745
+    },
+    {
+      "epoch": 0.7727975270479135,
+      "grad_norm": 0.385108342102361,
+      "learning_rate": 2.9825579967630846e-05,
+      "loss": 0.3223,
+      "step": 3750
+    },
+    {
+      "epoch": 0.773827923750644,
+      "grad_norm": 0.3716570069552568,
+      "learning_rate": 2.956971750251215e-05,
+      "loss": 0.2398,
+      "step": 3755
+    },
+    {
+      "epoch": 0.7748583204533745,
+      "grad_norm": 0.29486612653904987,
+      "learning_rate": 2.9314766693936356e-05,
+      "loss": 0.2659,
+      "step": 3760
+    },
+    {
+      "epoch": 0.7758887171561051,
+      "grad_norm": 0.3691838137762809,
+      "learning_rate": 2.906073084201191e-05,
+      "loss": 0.2553,
+      "step": 3765
+    },
+    {
+      "epoch": 0.7769191138588356,
+      "grad_norm": 0.3573299514624386,
+      "learning_rate": 2.8807613235004037e-05,
+      "loss": 0.2966,
+      "step": 3770
+    },
+    {
+      "epoch": 0.7779495105615662,
+      "grad_norm": 0.2972422322880715,
+      "learning_rate": 2.855541714929206e-05,
+      "loss": 0.2817,
+      "step": 3775
+    },
+    {
+      "epoch": 0.7789799072642968,
+      "grad_norm": 0.31746513053860076,
+      "learning_rate": 2.8304145849327036e-05,
+      "loss": 0.2492,
+      "step": 3780
+    },
+    {
+      "epoch": 0.7800103039670273,
+      "grad_norm": 0.33326098116022557,
+      "learning_rate": 2.8053802587589538e-05,
+      "loss": 0.2821,
+      "step": 3785
+    },
+    {
+      "epoch": 0.7810407006697578,
+      "grad_norm": 0.343516862256668,
+      "learning_rate": 2.7804390604547557e-05,
+      "loss": 0.253,
+      "step": 3790
+    },
+    {
+      "epoch": 0.7820710973724884,
+      "grad_norm": 0.29542386402077736,
+      "learning_rate": 2.7555913128614398e-05,
+      "loss": 0.2846,
+      "step": 3795
+    },
+    {
+      "epoch": 0.783101494075219,
+      "grad_norm": 0.40216308008328466,
+      "learning_rate": 2.7308373376107142e-05,
+      "loss": 0.2903,
+      "step": 3800
+    },
+    {
+      "epoch": 0.783101494075219,
+      "eval_loss": 0.2628938853740692,
+      "eval_runtime": 2894.5935,
+      "eval_samples_per_second": 2.764,
+      "eval_steps_per_second": 0.345,
+      "step": 3800
+    },
+    {
+      "epoch": 0.7841318907779495,
+      "grad_norm": 0.40257332364286125,
+      "learning_rate": 2.706177455120482e-05,
+      "loss": 0.2771,
+      "step": 3805
+    },
+    {
+      "epoch": 0.7851622874806801,
+      "grad_norm": 0.33675471922875544,
+      "learning_rate": 2.681611984590696e-05,
+      "loss": 0.3142,
+      "step": 3810
+    },
+    {
+      "epoch": 0.7861926841834106,
+      "grad_norm": 0.30015764012340485,
+      "learning_rate": 2.6571412439992437e-05,
+      "loss": 0.2565,
+      "step": 3815
+    },
+    {
+      "epoch": 0.7872230808861411,
+      "grad_norm": 0.2809235294807618,
+      "learning_rate": 2.6327655500978076e-05,
+      "loss": 0.272,
+      "step": 3820
+    },
+    {
+      "epoch": 0.7882534775888718,
+      "grad_norm": 0.3792796949891521,
+      "learning_rate": 2.608485218407779e-05,
+      "loss": 0.2691,
+      "step": 3825
+    },
+    {
+      "epoch": 0.7892838742916023,
+      "grad_norm": 0.3665809699953418,
+      "learning_rate": 2.5843005632161787e-05,
+      "loss": 0.2615,
+      "step": 3830
+    },
+    {
+      "epoch": 0.7903142709943328,
+      "grad_norm": 0.30924641336214526,
+      "learning_rate": 2.5602118975715683e-05,
+      "loss": 0.2668,
+      "step": 3835
+    },
+    {
+      "epoch": 0.7913446676970634,
+      "grad_norm": 0.3535640843972266,
+      "learning_rate": 2.5362195332800253e-05,
+      "loss": 0.251,
+      "step": 3840
+    },
+    {
+      "epoch": 0.7923750643997939,
+      "grad_norm": 0.31442694000184856,
+      "learning_rate": 2.5123237809010836e-05,
+      "loss": 0.277,
+      "step": 3845
+    },
+    {
+      "epoch": 0.7934054611025245,
+      "grad_norm": 0.473383912475728,
+      "learning_rate": 2.4885249497437223e-05,
+      "loss": 0.3069,
+      "step": 3850
+    },
+    {
+      "epoch": 0.794435857805255,
+      "grad_norm": 0.39185959855652625,
+      "learning_rate": 2.4648233478623705e-05,
+      "loss": 0.2589,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7954662545079856,
+      "grad_norm": 0.3644503327681114,
+      "learning_rate": 2.4412192820529034e-05,
+      "loss": 0.2912,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7964966512107161,
+      "grad_norm": 0.28558482434608623,
+      "learning_rate": 2.4177130578486885e-05,
+      "loss": 0.2455,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7975270479134466,
+      "grad_norm": 0.3567026701188835,
+      "learning_rate": 2.3943049795166126e-05,
+      "loss": 0.2901,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7985574446161773,
+      "grad_norm": 0.3398153100653687,
+      "learning_rate": 2.370995350053157e-05,
+      "loss": 0.2763,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7995878413189078,
+      "grad_norm": 0.3234949943482735,
+      "learning_rate": 2.3477844711804708e-05,
+      "loss": 0.2644,
+      "step": 3880
+    },
+    {
+      "epoch": 0.8006182380216383,
+      "grad_norm": 0.33467823842360356,
+      "learning_rate": 2.3246726433424716e-05,
+      "loss": 0.2914,
+      "step": 3885
+    },
+    {
+      "epoch": 0.8016486347243689,
+      "grad_norm": 0.32056191813609336,
+      "learning_rate": 2.301660165700936e-05,
+      "loss": 0.2654,
+      "step": 3890
+    },
+    {
+      "epoch": 0.8026790314270994,
+      "grad_norm": 0.3034671419615242,
+      "learning_rate": 2.2787473361316592e-05,
+      "loss": 0.2756,
+      "step": 3895
+    },
+    {
+      "epoch": 0.80370942812983,
+      "grad_norm": 0.37786668109879534,
+      "learning_rate": 2.2559344512205705e-05,
+      "loss": 0.292,
+      "step": 3900
+    },
+    {
+      "epoch": 0.8047398248325606,
+      "grad_norm": 0.34403274315844035,
+      "learning_rate": 2.233221806259903e-05,
+      "loss": 0.2334,
+      "step": 3905
+    },
+    {
+      "epoch": 0.8057702215352911,
+      "grad_norm": 0.32304818372501226,
+      "learning_rate": 2.2106096952443888e-05,
+      "loss": 0.2962,
+      "step": 3910
+    },
+    {
+      "epoch": 0.8068006182380216,
+      "grad_norm": 0.28048590872312423,
+      "learning_rate": 2.188098410867424e-05,
+      "loss": 0.2304,
+      "step": 3915
+    },
+    {
+      "epoch": 0.8078310149407522,
+      "grad_norm": 0.40871189926827856,
+      "learning_rate": 2.165688244517299e-05,
+      "loss": 0.3092,
+      "step": 3920
+    },
+    {
+      "epoch": 0.8088614116434827,
+      "grad_norm": 0.37394901632285205,
+      "learning_rate": 2.143379486273428e-05,
+      "loss": 0.3139,
+      "step": 3925
+    },
+    {
+      "epoch": 0.8098918083462133,
+      "grad_norm": 0.3813857683853878,
+      "learning_rate": 2.1211724249025787e-05,
+      "loss": 0.2522,
+      "step": 3930
+    },
+    {
+      "epoch": 0.8109222050489439,
+      "grad_norm": 0.3634405006796172,
+      "learning_rate": 2.099067347855157e-05,
+      "loss": 0.2892,
+      "step": 3935
+    },
+    {
+      "epoch": 0.8119526017516744,
+      "grad_norm": 0.3055737527512765,
+      "learning_rate": 2.077064541261462e-05,
+      "loss": 0.2376,
+      "step": 3940
+    },
+    {
+      "epoch": 0.8129829984544049,
+      "grad_norm": 0.29484654000322413,
+      "learning_rate": 2.0551642899279975e-05,
+      "loss": 0.2752,
+      "step": 3945
+    },
+    {
+      "epoch": 0.8140133951571354,
+      "grad_norm": 0.32834988086926475,
+      "learning_rate": 2.0333668773337866e-05,
+      "loss": 0.2711,
+      "step": 3950
+    },
+    {
+      "epoch": 0.8150437918598661,
+      "grad_norm": 0.36111993999911474,
+      "learning_rate": 2.0116725856266926e-05,
+      "loss": 0.2563,
+      "step": 3955
+    },
+    {
+      "epoch": 0.8160741885625966,
+      "grad_norm": 0.34560793782552396,
+      "learning_rate": 1.9900816956197698e-05,
+      "loss": 0.2868,
+      "step": 3960
+    },
+    {
+      "epoch": 0.8171045852653271,
+      "grad_norm": 0.3575615068921621,
+      "learning_rate": 1.9685944867876373e-05,
+      "loss": 0.2475,
+      "step": 3965
+    },
+    {
+      "epoch": 0.8181349819680577,
+      "grad_norm": 0.40119120114689133,
+      "learning_rate": 1.9472112372628536e-05,
+      "loss": 0.3028,
+      "step": 3970
+    },
+    {
+      "epoch": 0.8191653786707882,
+      "grad_norm": 0.3996261960750051,
+      "learning_rate": 1.9259322238323095e-05,
+      "loss": 0.2891,
+      "step": 3975
+    },
+    {
+      "epoch": 0.8201957753735188,
+      "grad_norm": 0.3045180790248437,
+      "learning_rate": 1.9047577219336665e-05,
+      "loss": 0.2379,
+      "step": 3980
+    },
+    {
+      "epoch": 0.8212261720762494,
+      "grad_norm": 0.32648574145564824,
+      "learning_rate": 1.8836880056517658e-05,
+      "loss": 0.3136,
+      "step": 3985
+    },
+    {
+      "epoch": 0.8222565687789799,
+      "grad_norm": 0.3368467272156829,
+      "learning_rate": 1.862723347715103e-05,
+      "loss": 0.2565,
+      "step": 3990
+    },
+    {
+      "epoch": 0.8232869654817104,
+      "grad_norm": 0.3785392791415945,
+      "learning_rate": 1.841864019492282e-05,
+      "loss": 0.3129,
+      "step": 3995
+    },
+    {
+      "epoch": 0.824317362184441,
+      "grad_norm": 0.37942467936258845,
+      "learning_rate": 1.821110290988509e-05,
+      "loss": 0.2943,
+      "step": 4000
+    },
+    {
+      "epoch": 0.824317362184441,
+      "eval_loss": 0.2613193392753601,
+      "eval_runtime": 2912.2794,
+      "eval_samples_per_second": 2.747,
+      "eval_steps_per_second": 0.343,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8253477588871716,
+      "grad_norm": 0.37476871355187913,
+      "learning_rate": 1.8004624308421026e-05,
+      "loss": 0.2454,
+      "step": 4005
+    },
+    {
+      "epoch": 0.8263781555899021,
+      "grad_norm": 0.432774306799512,
+      "learning_rate": 1.7799207063210044e-05,
+      "loss": 0.3059,
+      "step": 4010
+    },
+    {
+      "epoch": 0.8274085522926327,
+      "grad_norm": 0.2649022181395363,
+      "learning_rate": 1.759485383319326e-05,
+      "loss": 0.2358,
+      "step": 4015
+    },
+    {
+      "epoch": 0.8284389489953632,
+      "grad_norm": 0.4262757717672499,
+      "learning_rate": 1.7391567263539144e-05,
+      "loss": 0.3068,
+      "step": 4020
+    },
+    {
+      "epoch": 0.8294693456980937,
+      "grad_norm": 0.40389188939685916,
+      "learning_rate": 1.7189349985609115e-05,
+      "loss": 0.3148,
+      "step": 4025
+    },
+    {
+      "epoch": 0.8304997424008244,
+      "grad_norm": 0.34848990344269987,
+      "learning_rate": 1.6988204616923666e-05,
+      "loss": 0.2645,
+      "step": 4030
+    },
+    {
+      "epoch": 0.8315301391035549,
+      "grad_norm": 0.3081339412565511,
+      "learning_rate": 1.6788133761128312e-05,
+      "loss": 0.2901,
+      "step": 4035
+    },
+    {
+      "epoch": 0.8325605358062854,
+      "grad_norm": 0.31249793235144235,
+      "learning_rate": 1.658914000795999e-05,
+      "loss": 0.2607,
+      "step": 4040
+    },
+    {
+      "epoch": 0.833590932509016,
+      "grad_norm": 0.4513911289927334,
+      "learning_rate": 1.639122593321357e-05,
+      "loss": 0.3071,
+      "step": 4045
+    },
+    {
+      "epoch": 0.8346213292117465,
+      "grad_norm": 0.36902215017120704,
+      "learning_rate": 1.6194394098708377e-05,
+      "loss": 0.2841,
+      "step": 4050
+    },
+    {
+      "epoch": 0.8356517259144771,
+      "grad_norm": 0.34291388974227893,
+      "learning_rate": 1.59986470522551e-05,
+      "loss": 0.256,
+      "step": 4055
+    },
+    {
+      "epoch": 0.8366821226172076,
+      "grad_norm": 0.3484068120394701,
+      "learning_rate": 1.580398732762297e-05,
+      "loss": 0.2998,
+      "step": 4060
+    },
+    {
+      "epoch": 0.8377125193199382,
+      "grad_norm": 0.3076306390902032,
+      "learning_rate": 1.5610417444506664e-05,
+      "loss": 0.2772,
+      "step": 4065
+    },
+    {
+      "epoch": 0.8387429160226687,
+      "grad_norm": 0.3147872649091281,
+      "learning_rate": 1.541793990849387e-05,
+      "loss": 0.2549,
+      "step": 4070
+    },
+    {
+      "epoch": 0.8397733127253992,
+      "grad_norm": 0.39284501205730754,
+      "learning_rate": 1.522655721103291e-05,
+      "loss": 0.335,
+      "step": 4075
+    },
+    {
+      "epoch": 0.8408037094281299,
+      "grad_norm": 0.4446246586077643,
+      "learning_rate": 1.5036271829400294e-05,
+      "loss": 0.2516,
+      "step": 4080
+    },
+    {
+      "epoch": 0.8418341061308604,
+      "grad_norm": 0.355547901160868,
+      "learning_rate": 1.4847086226668872e-05,
+      "loss": 0.2764,
+      "step": 4085
+    },
+    {
+      "epoch": 0.8428645028335909,
+      "grad_norm": 0.3688730084692743,
+      "learning_rate": 1.4659002851675774e-05,
+      "loss": 0.2716,
+      "step": 4090
+    },
+    {
+      "epoch": 0.8438948995363215,
+      "grad_norm": 0.40395424185354845,
+      "learning_rate": 1.447202413899078e-05,
+      "loss": 0.3036,
+      "step": 4095
+    },
+    {
+      "epoch": 0.844925296239052,
+      "grad_norm": 0.3798137311999296,
+      "learning_rate": 1.4286152508884898e-05,
+      "loss": 0.3078,
+      "step": 4100
+    },
+    {
+      "epoch": 0.8459556929417826,
+      "grad_norm": 0.3136631297841343,
+      "learning_rate": 1.4101390367298861e-05,
+      "loss": 0.239,
+      "step": 4105
+    },
+    {
+      "epoch": 0.8469860896445132,
+      "grad_norm": 0.3133675747466587,
+      "learning_rate": 1.3917740105812094e-05,
+      "loss": 0.2685,
+      "step": 4110
+    },
+    {
+      "epoch": 0.8480164863472437,
+      "grad_norm": 0.3840975313756144,
+      "learning_rate": 1.3735204101611776e-05,
+      "loss": 0.2685,
+      "step": 4115
+    },
+    {
+      "epoch": 0.8490468830499742,
+      "grad_norm": 0.34167013481311503,
+      "learning_rate": 1.355378471746196e-05,
+      "loss": 0.2835,
+      "step": 4120
+    },
+    {
+      "epoch": 0.8500772797527048,
+      "grad_norm": 0.37678807026533556,
+      "learning_rate": 1.3373484301673145e-05,
+      "loss": 0.3225,
+      "step": 4125
+    },
+    {
+      "epoch": 0.8511076764554354,
+      "grad_norm": 0.36169730897373126,
+      "learning_rate": 1.3194305188071732e-05,
+      "loss": 0.2297,
+      "step": 4130
+    },
+    {
+      "epoch": 0.8521380731581659,
+      "grad_norm": 0.34627350397620255,
+      "learning_rate": 1.301624969596985e-05,
+      "loss": 0.308,
+      "step": 4135
+    },
+    {
+      "epoch": 0.8531684698608965,
+      "grad_norm": 0.3855153639798906,
+      "learning_rate": 1.2839320130135468e-05,
+      "loss": 0.2484,
+      "step": 4140
+    },
+    {
+      "epoch": 0.854198866563627,
+      "grad_norm": 0.2963869582281001,
+      "learning_rate": 1.266351878076234e-05,
+      "loss": 0.2646,
+      "step": 4145
+    },
+    {
+      "epoch": 0.8552292632663575,
+      "grad_norm": 0.3094943477185959,
+      "learning_rate": 1.2488847923440483e-05,
+      "loss": 0.284,
+      "step": 4150
+    },
+    {
+      "epoch": 0.8562596599690881,
+      "grad_norm": 0.3854839502521428,
+      "learning_rate": 1.2315309819126852e-05,
+      "loss": 0.2085,
+      "step": 4155
+    },
+    {
+      "epoch": 0.8572900566718187,
+      "grad_norm": 0.3087600519704163,
+      "learning_rate": 1.2142906714115787e-05,
+      "loss": 0.2962,
+      "step": 4160
+    },
+    {
+      "epoch": 0.8583204533745492,
+      "grad_norm": 0.3979580261424234,
+      "learning_rate": 1.197164084001009e-05,
+      "loss": 0.2516,
+      "step": 4165
+    },
+    {
+      "epoch": 0.8593508500772797,
+      "grad_norm": 0.28766440994452486,
+      "learning_rate": 1.1801514413692239e-05,
+      "loss": 0.2795,
+      "step": 4170
+    },
+    {
+      "epoch": 0.8603812467800103,
+      "grad_norm": 0.3167014720203987,
+      "learning_rate": 1.1632529637295475e-05,
+      "loss": 0.2857,
+      "step": 4175
+    },
+    {
+      "epoch": 0.8614116434827409,
+      "grad_norm": 0.33414263857523546,
+      "learning_rate": 1.1464688698175497e-05,
+      "loss": 0.2431,
+      "step": 4180
+    },
+    {
+      "epoch": 0.8624420401854714,
+      "grad_norm": 0.29115194585667353,
+      "learning_rate": 1.1297993768881998e-05,
+      "loss": 0.301,
+      "step": 4185
+    },
+    {
+      "epoch": 0.863472436888202,
+      "grad_norm": 0.34951793739233555,
+      "learning_rate": 1.113244700713063e-05,
+      "loss": 0.2559,
+      "step": 4190
+    },
+    {
+      "epoch": 0.8645028335909325,
+      "grad_norm": 0.3322485561015013,
+      "learning_rate": 1.0968050555775067e-05,
+      "loss": 0.3051,
+      "step": 4195
+    },
+    {
+      "epoch": 0.865533230293663,
+      "grad_norm": 0.3598063007969742,
+      "learning_rate": 1.0804806542779223e-05,
+      "loss": 0.2787,
+      "step": 4200
+    },
+    {
+      "epoch": 0.865533230293663,
+      "eval_loss": 0.2603052854537964,
+      "eval_runtime": 2913.7463,
+      "eval_samples_per_second": 2.746,
+      "eval_steps_per_second": 0.343,
+      "step": 4200
+    },
+    {
+      "epoch": 0.8665636269963937,
+      "grad_norm": 0.37342349900756117,
+      "learning_rate": 1.0642717081189735e-05,
+      "loss": 0.2767,
+      "step": 4205
+    },
+    {
+      "epoch": 0.8675940236991242,
+      "grad_norm": 0.3264967979693496,
+      "learning_rate": 1.0481784269108664e-05,
+      "loss": 0.2836,
+      "step": 4210
+    },
+    {
+      "epoch": 0.8686244204018547,
+      "grad_norm": 0.2837466654297017,
+      "learning_rate": 1.032201018966621e-05,
+      "loss": 0.2616,
+      "step": 4215
+    },
+    {
+      "epoch": 0.8696548171045853,
+      "grad_norm": 0.3880834543063498,
+      "learning_rate": 1.0163396910993883e-05,
+      "loss": 0.2641,
+      "step": 4220
+    },
+    {
+      "epoch": 0.8706852138073158,
+      "grad_norm": 0.40926771296668485,
+      "learning_rate": 1.0005946486197648e-05,
+      "loss": 0.2946,
+      "step": 4225
+    },
+    {
+      "epoch": 0.8717156105100463,
+      "grad_norm": 0.34072299231820374,
+      "learning_rate": 9.849660953331363e-06,
+      "loss": 0.2257,
+      "step": 4230
+    },
+    {
+      "epoch": 0.872746007212777,
+      "grad_norm": 0.3607472251947616,
+      "learning_rate": 9.694542335370437e-06,
+      "loss": 0.3061,
+      "step": 4235
+    },
+    {
+      "epoch": 0.8737764039155075,
+      "grad_norm": 0.2710285195917331,
+      "learning_rate": 9.540592640185597e-06,
+      "loss": 0.2725,
+      "step": 4240
+    },
+    {
+      "epoch": 0.874806800618238,
+      "grad_norm": 0.36432299015695907,
+      "learning_rate": 9.387813860516915e-06,
+      "loss": 0.2851,
+      "step": 4245
+    },
+    {
+      "epoch": 0.8758371973209685,
+      "grad_norm": 0.35703992526720524,
+      "learning_rate": 9.236207973948063e-06,
+      "loss": 0.2972,
+      "step": 4250
+    },
+    {
+      "epoch": 0.8768675940236991,
+      "grad_norm": 0.3173002625132457,
+      "learning_rate": 9.085776942880608e-06,
+      "loss": 0.2876,
+      "step": 4255
+    },
+    {
+      "epoch": 0.8778979907264297,
+      "grad_norm": 0.28131579729158257,
+      "learning_rate": 8.936522714508678e-06,
+      "loss": 0.2867,
+      "step": 4260
+    },
+    {
+      "epoch": 0.8789283874291602,
+      "grad_norm": 0.3530032076872567,
+      "learning_rate": 8.788447220793806e-06,
+      "loss": 0.2798,
+      "step": 4265
+    },
+    {
+      "epoch": 0.8799587841318908,
+      "grad_norm": 0.347013694472638,
+      "learning_rate": 8.641552378439776e-06,
+      "loss": 0.2937,
+      "step": 4270
+    },
+    {
+      "epoch": 0.8809891808346213,
+      "grad_norm": 0.357377959486403,
+      "learning_rate": 8.495840088868024e-06,
+      "loss": 0.334,
+      "step": 4275
+    },
+    {
+      "epoch": 0.8820195775373518,
+      "grad_norm": 0.27316974502275887,
+      "learning_rate": 8.351312238192787e-06,
+      "loss": 0.2675,
+      "step": 4280
+    },
+    {
+      "epoch": 0.8830499742400825,
+      "grad_norm": 0.35924727199943307,
+      "learning_rate": 8.207970697196866e-06,
+      "loss": 0.2961,
+      "step": 4285
+    },
+    {
+      "epoch": 0.884080370942813,
+      "grad_norm": 0.3577190138371286,
+      "learning_rate": 8.065817321307367e-06,
+      "loss": 0.254,
+      "step": 4290
+    },
+    {
+      "epoch": 0.8851107676455435,
+      "grad_norm": 0.5576399803787185,
+      "learning_rate": 7.924853950571642e-06,
+      "loss": 0.2952,
+      "step": 4295
+    },
+    {
+      "epoch": 0.8861411643482741,
+      "grad_norm": 0.3763241455754686,
+      "learning_rate": 7.78508240963347e-06,
+      "loss": 0.2863,
+      "step": 4300
+    },
+    {
+      "epoch": 0.8871715610510046,
+      "grad_norm": 0.30880739002832336,
+      "learning_rate": 7.646504507709563e-06,
+      "loss": 0.2824,
+      "step": 4305
+    },
+    {
+      "epoch": 0.8882019577537352,
+      "grad_norm": 0.3137337962583597,
+      "learning_rate": 7.50912203856593e-06,
+      "loss": 0.2674,
+      "step": 4310
+    },
+    {
+      "epoch": 0.8892323544564658,
+      "grad_norm": 0.31308889569105514,
+      "learning_rate": 7.372936780494877e-06,
+      "loss": 0.2755,
+      "step": 4315
+    },
+    {
+      "epoch": 0.8902627511591963,
+      "grad_norm": 0.4028987419767302,
+      "learning_rate": 7.237950496291856e-06,
+      "loss": 0.2871,
+      "step": 4320
+    },
+    {
+      "epoch": 0.8912931478619268,
+      "grad_norm": 0.36938026316991573,
+      "learning_rate": 7.104164933232649e-06,
+      "loss": 0.2786,
+      "step": 4325
+    },
+    {
+      "epoch": 0.8923235445646573,
+      "grad_norm": 0.3208025100288799,
+      "learning_rate": 6.971581823050832e-06,
+      "loss": 0.2474,
+      "step": 4330
+    },
+    {
+      "epoch": 0.893353941267388,
+      "grad_norm": 0.3598206195922243,
+      "learning_rate": 6.840202881915325e-06,
+      "loss": 0.3023,
+      "step": 4335
+    },
+    {
+      "epoch": 0.8943843379701185,
+      "grad_norm": 0.29364044659572436,
+      "learning_rate": 6.710029810408092e-06,
+      "loss": 0.2397,
+      "step": 4340
+    },
+    {
+      "epoch": 0.895414734672849,
+      "grad_norm": 0.31914310843659316,
+      "learning_rate": 6.581064293502293e-06,
+      "loss": 0.2883,
+      "step": 4345
+    },
+    {
+      "epoch": 0.8964451313755796,
+      "grad_norm": 0.45022448364382284,
+      "learning_rate": 6.453308000540304e-06,
+      "loss": 0.2756,
+      "step": 4350
+    },
+    {
+      "epoch": 0.8974755280783101,
+      "grad_norm": 0.310267786867979,
+      "learning_rate": 6.326762585212209e-06,
+      "loss": 0.2415,
+      "step": 4355
+    },
+    {
+      "epoch": 0.8985059247810407,
+      "grad_norm": 0.3112505151505233,
+      "learning_rate": 6.20142968553441e-06,
+      "loss": 0.2821,
+      "step": 4360
+    },
+    {
+      "epoch": 0.8995363214837713,
+      "grad_norm": 0.3141837487350365,
+      "learning_rate": 6.077310923828328e-06,
+      "loss": 0.2725,
+      "step": 4365
+    },
+    {
+      "epoch": 0.9005667181865018,
+      "grad_norm": 0.27623461345523886,
+      "learning_rate": 5.954407906699511e-06,
+      "loss": 0.268,
+      "step": 4370
+    },
+    {
+      "epoch": 0.9015971148892323,
+      "grad_norm": 0.3891601937753882,
+      "learning_rate": 5.8327222250167735e-06,
+      "loss": 0.2778,
+      "step": 4375
+    },
+    {
+      "epoch": 0.9026275115919629,
+      "grad_norm": 0.3558581335957565,
+      "learning_rate": 5.71225545389158e-06,
+      "loss": 0.2241,
+      "step": 4380
+    },
+    {
+      "epoch": 0.9036579082946935,
+      "grad_norm": 0.3693584144347684,
+      "learning_rate": 5.59300915265778e-06,
+      "loss": 0.293,
+      "step": 4385
+    },
+    {
+      "epoch": 0.904688304997424,
+      "grad_norm": 0.412387091213602,
+      "learning_rate": 5.4749848648512624e-06,
+      "loss": 0.2638,
+      "step": 4390
+    },
+    {
+      "epoch": 0.9057187017001546,
+      "grad_norm": 0.4134072879227268,
+      "learning_rate": 5.358184118190068e-06,
+      "loss": 0.2996,
+      "step": 4395
+    },
+    {
+      "epoch": 0.9067490984028851,
+      "grad_norm": 0.33905018808481746,
+      "learning_rate": 5.242608424554651e-06,
+      "loss": 0.2558,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9067490984028851,
+      "eval_loss": 0.25964951515197754,
+      "eval_runtime": 2914.0015,
+      "eval_samples_per_second": 2.745,
+      "eval_steps_per_second": 0.343,
+      "step": 4400
+    },
+    {
+      "epoch": 0.9077794951056156,
+      "grad_norm": 0.3147365134234666,
+      "learning_rate": 5.128259279968195e-06,
+      "loss": 0.2516,
+      "step": 4405
+    },
+    {
+      "epoch": 0.9088098918083463,
+      "grad_norm": 0.34810883229758166,
+      "learning_rate": 5.01513816457736e-06,
+      "loss": 0.2778,
+      "step": 4410
+    },
+    {
+      "epoch": 0.9098402885110768,
+      "grad_norm": 0.3558739397328132,
+      "learning_rate": 4.903246542633033e-06,
+      "loss": 0.2516,
+      "step": 4415
+    },
+    {
+      "epoch": 0.9108706852138073,
+      "grad_norm": 0.3434310486600019,
+      "learning_rate": 4.7925858624714215e-06,
+      "loss": 0.2658,
+      "step": 4420
+    },
+    {
+      "epoch": 0.9119010819165378,
+      "grad_norm": 0.4739996869148237,
+      "learning_rate": 4.683157556495343e-06,
+      "loss": 0.3177,
+      "step": 4425
+    },
+    {
+      "epoch": 0.9129314786192684,
+      "grad_norm": 0.3432195383447999,
+      "learning_rate": 4.574963041155622e-06,
+      "loss": 0.2456,
+      "step": 4430
+    },
+    {
+      "epoch": 0.913961875321999,
+      "grad_norm": 0.39865656812853095,
+      "learning_rate": 4.468003716932734e-06,
+      "loss": 0.3004,
+      "step": 4435
+    },
+    {
+      "epoch": 0.9149922720247295,
+      "grad_norm": 0.3074797243983616,
+      "learning_rate": 4.362280968318777e-06,
+      "loss": 0.2468,
+      "step": 4440
+    },
+    {
+      "epoch": 0.9160226687274601,
+      "grad_norm": 0.3821770273420256,
+      "learning_rate": 4.257796163799455e-06,
+      "loss": 0.2569,
+      "step": 4445
+    },
+    {
+      "epoch": 0.9170530654301906,
+      "grad_norm": 0.3846408351560208,
+      "learning_rate": 4.154550655836409e-06,
+      "loss": 0.2915,
+      "step": 4450
+    },
+    {
+      "epoch": 0.9180834621329211,
+      "grad_norm": 0.3328477896869499,
+      "learning_rate": 4.052545780849715e-06,
+      "loss": 0.2382,
+      "step": 4455
+    },
+    {
+      "epoch": 0.9191138588356518,
+      "grad_norm": 0.39094030418826553,
+      "learning_rate": 3.9517828592005475e-06,
+      "loss": 0.3148,
+      "step": 4460
+    },
+    {
+      "epoch": 0.9201442555383823,
+      "grad_norm": 0.32981008377515736,
+      "learning_rate": 3.852263195174155e-06,
+      "loss": 0.2671,
+      "step": 4465
+    },
+    {
+      "epoch": 0.9211746522411128,
+      "grad_norm": 0.31369535441554675,
+      "learning_rate": 3.7539880769628998e-06,
+      "loss": 0.2947,
+      "step": 4470
+    },
+    {
+      "epoch": 0.9222050489438434,
+      "grad_norm": 0.37611489272202936,
+      "learning_rate": 3.6569587766496216e-06,
+      "loss": 0.2808,
+      "step": 4475
+    },
+    {
+      "epoch": 0.9232354456465739,
+      "grad_norm": 0.26359972435462287,
+      "learning_rate": 3.561176550191203e-06,
+      "loss": 0.2394,
+      "step": 4480
+    },
+    {
+      "epoch": 0.9242658423493045,
+      "grad_norm": 0.3951324034345502,
+      "learning_rate": 3.46664263740224e-06,
+      "loss": 0.2848,
+      "step": 4485
+    },
+    {
+      "epoch": 0.9252962390520351,
+      "grad_norm": 0.3608819993192937,
+      "learning_rate": 3.3733582619390523e-06,
+      "loss": 0.2632,
+      "step": 4490
+    },
+    {
+      "epoch": 0.9263266357547656,
+      "grad_norm": 0.3555456945529403,
+      "learning_rate": 3.281324631283833e-06,
+      "loss": 0.279,
+      "step": 4495
+    },
+    {
+      "epoch": 0.9273570324574961,
+      "grad_norm": 0.39293143583676265,
+      "learning_rate": 3.1905429367289795e-06,
+      "loss": 0.31,
+      "step": 4500
+    },
+    {
+      "epoch": 0.9283874291602267,
+      "grad_norm": 0.3368920177589104,
+      "learning_rate": 3.101014353361753e-06,
+      "loss": 0.2295,
+      "step": 4505
+    },
+    {
+      "epoch": 0.9294178258629573,
+      "grad_norm": 0.3034071042496814,
+      "learning_rate": 3.012740040048978e-06,
+      "loss": 0.2922,
+      "step": 4510
+    },
+    {
+      "epoch": 0.9304482225656878,
+      "grad_norm": 0.30952410133286584,
+      "learning_rate": 2.9257211394220773e-06,
+      "loss": 0.2719,
+      "step": 4515
+    },
+    {
+      "epoch": 0.9314786192684184,
+      "grad_norm": 0.30563095374433163,
+      "learning_rate": 2.8399587778623505e-06,
+      "loss": 0.2688,
+      "step": 4520
+    },
+    {
+      "epoch": 0.9325090159711489,
+      "grad_norm": 0.3782690565607755,
+      "learning_rate": 2.755454065486263e-06,
+      "loss": 0.2842,
+      "step": 4525
+    },
+    {
+      "epoch": 0.9335394126738794,
+      "grad_norm": 0.37596385739941673,
+      "learning_rate": 2.672208096131157e-06,
+      "loss": 0.2412,
+      "step": 4530
+    },
+    {
+      "epoch": 0.93456980937661,
+      "grad_norm": 0.33654733598583986,
+      "learning_rate": 2.5902219473411204e-06,
+      "loss": 0.2903,
+      "step": 4535
+    },
+    {
+      "epoch": 0.9356002060793406,
+      "grad_norm": 0.3317164278402486,
+      "learning_rate": 2.509496680352963e-06,
+      "loss": 0.2443,
+      "step": 4540
+    },
+    {
+      "epoch": 0.9366306027820711,
+      "grad_norm": 0.30341596651692276,
+      "learning_rate": 2.430033340082516e-06,
+      "loss": 0.2532,
+      "step": 4545
+    },
+    {
+      "epoch": 0.9376609994848016,
+      "grad_norm": 0.38567400726184814,
+      "learning_rate": 2.3518329551111217e-06,
+      "loss": 0.3219,
+      "step": 4550
+    },
+    {
+      "epoch": 0.9386913961875322,
+      "grad_norm": 0.5114420556484347,
+      "learning_rate": 2.2748965376723e-06,
+      "loss": 0.2911,
+      "step": 4555
+    },
+    {
+      "epoch": 0.9397217928902627,
+      "grad_norm": 0.2935585297022278,
+      "learning_rate": 2.199225083638656e-06,
+      "loss": 0.2843,
+      "step": 4560
+    },
+    {
+      "epoch": 0.9407521895929933,
+      "grad_norm": 0.28858986310591855,
+      "learning_rate": 2.1248195725089624e-06,
+      "loss": 0.2327,
+      "step": 4565
+    },
+    {
+      "epoch": 0.9417825862957239,
+      "grad_norm": 0.2847670925427268,
+      "learning_rate": 2.0516809673955083e-06,
+      "loss": 0.2866,
+      "step": 4570
+    },
+    {
+      "epoch": 0.9428129829984544,
+      "grad_norm": 0.3816164009941783,
+      "learning_rate": 1.9798102150116573e-06,
+      "loss": 0.3146,
+      "step": 4575
+    },
+    {
+      "epoch": 0.9438433797011849,
+      "grad_norm": 0.34153941420407485,
+      "learning_rate": 1.909208245659522e-06,
+      "loss": 0.2655,
+      "step": 4580
+    },
+    {
+      "epoch": 0.9448737764039155,
+      "grad_norm": 0.3775194213326933,
+      "learning_rate": 1.8398759732179637e-06,
+      "loss": 0.2893,
+      "step": 4585
+    },
+    {
+      "epoch": 0.9459041731066461,
+      "grad_norm": 0.30788658820871284,
+      "learning_rate": 1.7718142951307914e-06,
+      "loss": 0.2611,
+      "step": 4590
+    },
+    {
+      "epoch": 0.9469345698093766,
+      "grad_norm": 0.3383435069564331,
+      "learning_rate": 1.705024092395091e-06,
+      "loss": 0.2504,
+      "step": 4595
+    },
+    {
+      "epoch": 0.9479649665121072,
+      "grad_norm": 0.4137029730454973,
+      "learning_rate": 1.6395062295498698e-06,
+      "loss": 0.3107,
+      "step": 4600
+    },
+    {
+      "epoch": 0.9479649665121072,
+      "eval_loss": 0.2593182921409607,
+      "eval_runtime": 2914.0057,
+      "eval_samples_per_second": 2.745,
+      "eval_steps_per_second": 0.343,
+      "step": 4600
+    },
+    {
+      "epoch": 0.9489953632148377,
+      "grad_norm": 0.2756456550382528,
+      "learning_rate": 1.5752615546647975e-06,
+      "loss": 0.2688,
+      "step": 4605
+    },
+    {
+      "epoch": 0.9500257599175682,
+      "grad_norm": 0.3558221128709261,
+      "learning_rate": 1.5122908993293273e-06,
+      "loss": 0.2962,
+      "step": 4610
+    },
+    {
+      "epoch": 0.9510561566202989,
+      "grad_norm": 0.30866148215688255,
+      "learning_rate": 1.4505950786418255e-06,
+      "loss": 0.2396,
+      "step": 4615
+    },
+    {
+      "epoch": 0.9520865533230294,
+      "grad_norm": 0.32718060116494296,
+      "learning_rate": 1.3901748911991253e-06,
+      "loss": 0.259,
+      "step": 4620
+    },
+    {
+      "epoch": 0.9531169500257599,
+      "grad_norm": 0.41916606085025016,
+      "learning_rate": 1.331031119086079e-06,
+      "loss": 0.2914,
+      "step": 4625
+    },
+    {
+      "epoch": 0.9541473467284904,
+      "grad_norm": 0.32801299914013593,
+      "learning_rate": 1.2731645278655445e-06,
+      "loss": 0.2627,
+      "step": 4630
+    },
+    {
+      "epoch": 0.955177743431221,
+      "grad_norm": 0.35340318213193234,
+      "learning_rate": 1.2165758665683924e-06,
+      "loss": 0.2832,
+      "step": 4635
+    },
+    {
+      "epoch": 0.9562081401339516,
+      "grad_norm": 0.3450871009199193,
+      "learning_rate": 1.1612658676838473e-06,
+      "loss": 0.2402,
+      "step": 4640
+    },
+    {
+      "epoch": 0.9572385368366821,
+      "grad_norm": 0.35663476220665696,
+      "learning_rate": 1.107235247150018e-06,
+      "loss": 0.2892,
+      "step": 4645
+    },
+    {
+      "epoch": 0.9582689335394127,
+      "grad_norm": 0.3491909974711667,
+      "learning_rate": 1.0544847043445938e-06,
+      "loss": 0.2838,
+      "step": 4650
+    },
+    {
+      "epoch": 0.9592993302421432,
+      "grad_norm": 0.47789307717475066,
+      "learning_rate": 1.0030149220758288e-06,
+      "loss": 0.2633,
+      "step": 4655
+    },
+    {
+      "epoch": 0.9603297269448737,
+      "grad_norm": 0.3220693161443208,
+      "learning_rate": 9.528265665736502e-07,
+      "loss": 0.2983,
+      "step": 4660
+    },
+    {
+      "epoch": 0.9613601236476044,
+      "grad_norm": 0.3355491215986634,
+      "learning_rate": 9.039202874811192e-07,
+      "loss": 0.2606,
+      "step": 4665
+    },
+    {
+      "epoch": 0.9623905203503349,
+      "grad_norm": 0.3331685126111554,
+      "learning_rate": 8.562967178459391e-07,
+      "loss": 0.2836,
+      "step": 4670
+    },
+    {
+      "epoch": 0.9634209170530654,
+      "grad_norm": 0.33838080949230487,
+      "learning_rate": 8.099564741123166e-07,
+      "loss": 0.2912,
+      "step": 4675
+    },
+    {
+      "epoch": 0.964451313755796,
+      "grad_norm": 0.3638971937799169,
+      "learning_rate": 7.649001561129354e-07,
+      "loss": 0.2357,
+      "step": 4680
+    },
+    {
+      "epoch": 0.9654817104585265,
+      "grad_norm": 0.30728028400390417,
+      "learning_rate": 7.211283470612395e-07,
+      "loss": 0.2763,
+      "step": 4685
+    },
+    {
+      "epoch": 0.9665121071612571,
+      "grad_norm": 0.42508518073646434,
+      "learning_rate": 6.786416135438512e-07,
+      "loss": 0.2632,
+      "step": 4690
+    },
+    {
+      "epoch": 0.9675425038639877,
+      "grad_norm": 0.319418982554995,
+      "learning_rate": 6.374405055132537e-07,
+      "loss": 0.2666,
+      "step": 4695
+    },
+    {
+      "epoch": 0.9685729005667182,
+      "grad_norm": 0.3449145369969141,
+      "learning_rate": 5.975255562806647e-07,
+      "loss": 0.2924,
+      "step": 4700
+    },
+    {
+      "epoch": 0.9696032972694487,
+      "grad_norm": 0.4202821282926822,
+      "learning_rate": 5.58897282509141e-07,
+      "loss": 0.2151,
+      "step": 4705
+    },
+    {
+      "epoch": 0.9706336939721792,
+      "grad_norm": 0.34298503928511215,
+      "learning_rate": 5.215561842068728e-07,
+      "loss": 0.2776,
+      "step": 4710
+    },
+    {
+      "epoch": 0.9716640906749099,
+      "grad_norm": 0.3226411709221547,
+      "learning_rate": 4.855027447207338e-07,
+      "loss": 0.2726,
+      "step": 4715
+    },
+    {
+      "epoch": 0.9726944873776404,
+      "grad_norm": 0.32438079260005576,
+      "learning_rate": 4.507374307299972e-07,
+      "loss": 0.2897,
+      "step": 4720
+    },
+    {
+      "epoch": 0.973724884080371,
+      "grad_norm": 0.29250744283879526,
+      "learning_rate": 4.172606922403399e-07,
+      "loss": 0.3075,
+      "step": 4725
+    },
+    {
+      "epoch": 0.9747552807831015,
+      "grad_norm": 0.3734192410658037,
+      "learning_rate": 3.8507296257798145e-07,
+      "loss": 0.2382,
+      "step": 4730
+    },
+    {
+      "epoch": 0.975785677485832,
+      "grad_norm": 0.32324581396516133,
+      "learning_rate": 3.541746583840655e-07,
+      "loss": 0.2776,
+      "step": 4735
+    },
+    {
+      "epoch": 0.9768160741885626,
+      "grad_norm": 0.33899080785526653,
+      "learning_rate": 3.24566179609298e-07,
+      "loss": 0.278,
+      "step": 4740
+    },
+    {
+      "epoch": 0.9778464708912932,
+      "grad_norm": 0.31722996073708115,
+      "learning_rate": 2.9624790950875113e-07,
+      "loss": 0.2854,
+      "step": 4745
+    },
+    {
+      "epoch": 0.9788768675940237,
+      "grad_norm": 0.3653754490411951,
+      "learning_rate": 2.692202146369338e-07,
+      "loss": 0.2903,
+      "step": 4750
+    },
+    {
+      "epoch": 0.9799072642967542,
+      "grad_norm": 0.3440659912321346,
+      "learning_rate": 2.434834448429957e-07,
+      "loss": 0.2624,
+      "step": 4755
+    },
+    {
+      "epoch": 0.9809376609994848,
+      "grad_norm": 0.3265654789285722,
+      "learning_rate": 2.1903793326621957e-07,
+      "loss": 0.2785,
+      "step": 4760
+    },
+    {
+      "epoch": 0.9819680577022154,
+      "grad_norm": 0.2763203967812842,
+      "learning_rate": 1.9588399633173605e-07,
+      "loss": 0.2387,
+      "step": 4765
+    },
+    {
+      "epoch": 0.9829984544049459,
+      "grad_norm": 0.32114982480782467,
+      "learning_rate": 1.740219337463822e-07,
+      "loss": 0.2601,
+      "step": 4770
+    },
+    {
+      "epoch": 0.9840288511076765,
+      "grad_norm": 0.34288896513991585,
+      "learning_rate": 1.534520284948715e-07,
+      "loss": 0.2752,
+      "step": 4775
+    },
+    {
+      "epoch": 0.985059247810407,
+      "grad_norm": 0.3760081860947557,
+      "learning_rate": 1.3417454683608554e-07,
+      "loss": 0.2583,
+      "step": 4780
+    },
+    {
+      "epoch": 0.9860896445131375,
+      "grad_norm": 0.362145568445466,
+      "learning_rate": 1.1618973829966572e-07,
+      "loss": 0.3256,
+      "step": 4785
+    },
+    {
+      "epoch": 0.9871200412158682,
+      "grad_norm": 0.29692027485612293,
+      "learning_rate": 9.949783568272697e-08,
+      "loss": 0.2302,
+      "step": 4790
+    },
+    {
+      "epoch": 0.9881504379185987,
+      "grad_norm": 0.35544835709990513,
+      "learning_rate": 8.409905504693782e-08,
+      "loss": 0.2742,
+      "step": 4795
+    },
+    {
+      "epoch": 0.9891808346213292,
+      "grad_norm": 0.34329714361053626,
+      "learning_rate": 6.999359571561171e-08,
+      "loss": 0.2894,
+      "step": 4800
+    },
+    {
+      "epoch": 0.9891808346213292,
+      "eval_loss": 0.25925347208976746,
+      "eval_runtime": 2913.0942,
+      "eval_samples_per_second": 2.746,
+      "eval_steps_per_second": 0.343,
+      "step": 4800
+    },
+    {
+      "epoch": 0.9902112313240597,
+      "grad_norm": 0.3542301832349851,
+      "learning_rate": 5.718164027121997e-08,
+      "loss": 0.2343,
+      "step": 4805
+    },
+    {
+      "epoch": 0.9912416280267903,
+      "grad_norm": 0.3086510062926568,
+      "learning_rate": 4.566335455299387e-08,
+      "loss": 0.2788,
+      "step": 4810
+    },
+    {
+      "epoch": 0.9922720247295209,
+      "grad_norm": 0.3507547113812826,
+      "learning_rate": 3.5438887654737355e-08,
+      "loss": 0.2747,
+      "step": 4815
+    },
+    {
+      "epoch": 0.9933024214322514,
+      "grad_norm": 0.3265356296027687,
+      "learning_rate": 2.6508371922984166e-08,
+      "loss": 0.2711,
+      "step": 4820
+    },
+    {
+      "epoch": 0.994332818134982,
+      "grad_norm": 0.3943623315986484,
+      "learning_rate": 1.887192295521034e-08,
+      "loss": 0.2788,
+      "step": 4825
+    },
+    {
+      "epoch": 0.9953632148377125,
+      "grad_norm": 0.3158484623921147,
+      "learning_rate": 1.252963959834652e-08,
+      "loss": 0.2249,
+      "step": 4830
+    },
+    {
+      "epoch": 0.996393611540443,
+      "grad_norm": 0.2997635546788507,
+      "learning_rate": 7.481603947556703e-09,
+      "loss": 0.2663,
+      "step": 4835
+    },
+    {
+      "epoch": 0.9974240082431737,
+      "grad_norm": 0.32227151020376166,
+      "learning_rate": 3.727881345105821e-09,
+      "loss": 0.2558,
+      "step": 4840
+    },
+    {
+      "epoch": 0.9984544049459042,
+      "grad_norm": 0.33469168529537796,
+      "learning_rate": 1.2685203795492762e-09,
+      "loss": 0.259,
+      "step": 4845
+    },
+    {
+      "epoch": 0.9994848016486347,
+      "grad_norm": 0.3630544783943566,
+      "learning_rate": 1.0355288510011107e-10,
+      "loss": 0.3056,
+      "step": 4850
+    },
+    {
+      "epoch": 0.999896960329727,
+      "step": 4852,
+      "total_flos": 2.4144679757152256e+16,
+      "train_loss": 0.3040192322367887,
+      "train_runtime": 137718.1009,
+      "train_samples_per_second": 0.564,
+      "train_steps_per_second": 0.035
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4852,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.4144679757152256e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}