diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4111 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.997411003236246,
+  "eval_steps": 50,
+  "global_step": 4632,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008629989212513484,
+      "grad_norm": 20.681591033935547,
+      "learning_rate": 2.1551724137931036e-07,
+      "loss": 1.0408,
+      "step": 10
+    },
+    {
+      "epoch": 0.017259978425026967,
+      "grad_norm": 96.75000762939453,
+      "learning_rate": 4.3103448275862073e-07,
+      "loss": 1.047,
+      "step": 20
+    },
+    {
+      "epoch": 0.025889967637540454,
+      "grad_norm": 22.108104705810547,
+      "learning_rate": 6.465517241379311e-07,
+      "loss": 1.0718,
+      "step": 30
+    },
+    {
+      "epoch": 0.034519956850053934,
+      "grad_norm": 40.05157470703125,
+      "learning_rate": 8.620689655172415e-07,
+      "loss": 1.0488,
+      "step": 40
+    },
+    {
+      "epoch": 0.043149946062567425,
+      "grad_norm": 15.964655876159668,
+      "learning_rate": 1.0775862068965518e-06,
+      "loss": 1.075,
+      "step": 50
+    },
+    {
+      "epoch": 0.043149946062567425,
+      "eval_accuracy": 0.49320388349514566,
+      "eval_loss": 1.018173336982727,
+      "eval_runtime": 322.676,
+      "eval_samples_per_second": 1.596,
+      "eval_steps_per_second": 1.596,
+      "step": 50
+    },
+    {
+      "epoch": 0.05177993527508091,
+      "grad_norm": 27.802989959716797,
+      "learning_rate": 1.2931034482758623e-06,
+      "loss": 1.1389,
+      "step": 60
+    },
+    {
+      "epoch": 0.06040992448759439,
+      "grad_norm": 28.11711883544922,
+      "learning_rate": 1.5086206896551726e-06,
+      "loss": 1.1116,
+      "step": 70
+    },
+    {
+      "epoch": 0.06903991370010787,
+      "grad_norm": 22.176109313964844,
+      "learning_rate": 1.724137931034483e-06,
+      "loss": 1.0697,
+      "step": 80
+    },
+    {
+      "epoch": 0.07766990291262135,
+      "grad_norm": 41.33392333984375,
+      "learning_rate": 1.9396551724137932e-06,
+      "loss": 1.0242,
+      "step": 90
+    },
+    {
+      "epoch": 0.08629989212513485,
+      "grad_norm": 34.400508880615234,
+      "learning_rate": 2.1551724137931035e-06,
+      "loss": 1.0505,
+      "step": 100
+    },
+    {
+      "epoch": 0.08629989212513485,
+      "eval_accuracy": 0.5009708737864078,
+      "eval_loss": 0.9943639039993286,
+      "eval_runtime": 321.8255,
+      "eval_samples_per_second": 1.6,
+      "eval_steps_per_second": 1.6,
+      "step": 100
+    },
+    {
+      "epoch": 0.09492988133764833,
+      "grad_norm": 28.23130989074707,
+      "learning_rate": 2.370689655172414e-06,
+      "loss": 1.0073,
+      "step": 110
+    },
+    {
+      "epoch": 0.10355987055016182,
+      "grad_norm": 36.090736389160156,
+      "learning_rate": 2.5862068965517246e-06,
+      "loss": 0.9802,
+      "step": 120
+    },
+    {
+      "epoch": 0.1121898597626753,
+      "grad_norm": 58.96036148071289,
+      "learning_rate": 2.8017241379310345e-06,
+      "loss": 0.9827,
+      "step": 130
+    },
+    {
+      "epoch": 0.12081984897518878,
+      "grad_norm": 18.94993782043457,
+      "learning_rate": 3.017241379310345e-06,
+      "loss": 1.0015,
+      "step": 140
+    },
+    {
+      "epoch": 0.12944983818770225,
+      "grad_norm": 32.874114990234375,
+      "learning_rate": 3.2327586206896555e-06,
+      "loss": 0.9387,
+      "step": 150
+    },
+    {
+      "epoch": 0.12944983818770225,
+      "eval_accuracy": 0.5048543689320388,
+      "eval_loss": 0.9101472496986389,
+      "eval_runtime": 321.9422,
+      "eval_samples_per_second": 1.6,
+      "eval_steps_per_second": 1.6,
+      "step": 150
+    },
+    {
+      "epoch": 0.13807982740021574,
+      "grad_norm": 14.486083030700684,
+      "learning_rate": 3.448275862068966e-06,
+      "loss": 0.9255,
+      "step": 160
+    },
+    {
+      "epoch": 0.14670981661272922,
+      "grad_norm": 26.06964111328125,
+      "learning_rate": 3.663793103448276e-06,
+      "loss": 0.8775,
+      "step": 170
+    },
+    {
+      "epoch": 0.1553398058252427,
+      "grad_norm": 23.44382667541504,
+      "learning_rate": 3.8793103448275865e-06,
+      "loss": 0.8675,
+      "step": 180
+    },
+    {
+      "epoch": 0.16396979503775622,
+      "grad_norm": 22.29359245300293,
+      "learning_rate": 4.094827586206897e-06,
+      "loss": 0.9728,
+      "step": 190
+    },
+    {
+      "epoch": 0.1725997842502697,
+      "grad_norm": 38.14244842529297,
+      "learning_rate": 4.310344827586207e-06,
+      "loss": 0.92,
+      "step": 200
+    },
+    {
+      "epoch": 0.1725997842502697,
+      "eval_accuracy": 0.5048543689320388,
+      "eval_loss": 0.9019931554794312,
+      "eval_runtime": 321.9115,
+      "eval_samples_per_second": 1.6,
+      "eval_steps_per_second": 1.6,
+      "step": 200
+    },
+    {
+      "epoch": 0.18122977346278318,
+      "grad_norm": 64.9331283569336,
+      "learning_rate": 4.525862068965518e-06,
+      "loss": 0.9633,
+      "step": 210
+    },
+    {
+      "epoch": 0.18985976267529667,
+      "grad_norm": 39.31247329711914,
+      "learning_rate": 4.741379310344828e-06,
+      "loss": 0.9646,
+      "step": 220
+    },
+    {
+      "epoch": 0.19848975188781015,
+      "grad_norm": 26.192481994628906,
+      "learning_rate": 4.9568965517241384e-06,
+      "loss": 0.9956,
+      "step": 230
+    },
+    {
+      "epoch": 0.20711974110032363,
+      "grad_norm": 33.946685791015625,
+      "learning_rate": 5.172413793103449e-06,
+      "loss": 0.8929,
+      "step": 240
+    },
+    {
+      "epoch": 0.21574973031283712,
+      "grad_norm": 20.04779624938965,
+      "learning_rate": 5.38793103448276e-06,
+      "loss": 0.9531,
+      "step": 250
+    },
+    {
+      "epoch": 0.21574973031283712,
+      "eval_accuracy": 0.5223300970873787,
+      "eval_loss": 0.886761486530304,
+      "eval_runtime": 321.7179,
+      "eval_samples_per_second": 1.601,
+      "eval_steps_per_second": 1.601,
+      "step": 250
+    },
+    {
+      "epoch": 0.2243797195253506,
+      "grad_norm": 53.125587463378906,
+      "learning_rate": 5.603448275862069e-06,
+      "loss": 0.9716,
+      "step": 260
+    },
+    {
+      "epoch": 0.23300970873786409,
+      "grad_norm": 43.821533203125,
+      "learning_rate": 5.81896551724138e-06,
+      "loss": 0.9407,
+      "step": 270
+    },
+    {
+      "epoch": 0.24163969795037757,
+      "grad_norm": 47.41954803466797,
+      "learning_rate": 6.03448275862069e-06,
+      "loss": 0.9464,
+      "step": 280
+    },
+    {
+      "epoch": 0.25026968716289105,
+      "grad_norm": 29.925968170166016,
+      "learning_rate": 6.25e-06,
+      "loss": 0.9151,
+      "step": 290
+    },
+    {
+      "epoch": 0.2588996763754045,
+      "grad_norm": 23.372934341430664,
+      "learning_rate": 6.465517241379311e-06,
+      "loss": 0.849,
+      "step": 300
+    },
+    {
+      "epoch": 0.2588996763754045,
+      "eval_accuracy": 0.5339805825242718,
+      "eval_loss": 0.856666088104248,
+      "eval_runtime": 321.7027,
+      "eval_samples_per_second": 1.601,
+      "eval_steps_per_second": 1.601,
+      "step": 300
+    },
+    {
+      "epoch": 0.267529665587918,
+      "grad_norm": 22.651479721069336,
+      "learning_rate": 6.681034482758622e-06,
+      "loss": 1.0237,
+      "step": 310
+    },
+    {
+      "epoch": 0.2761596548004315,
+      "grad_norm": 17.50941276550293,
+      "learning_rate": 6.896551724137932e-06,
+      "loss": 0.8401,
+      "step": 320
+    },
+    {
+      "epoch": 0.284789644012945,
+      "grad_norm": 51.20744323730469,
+      "learning_rate": 7.1120689655172415e-06,
+      "loss": 0.9366,
+      "step": 330
+    },
+    {
+      "epoch": 0.29341963322545844,
+      "grad_norm": 23.283870697021484,
+      "learning_rate": 7.327586206896552e-06,
+      "loss": 0.8198,
+      "step": 340
+    },
+    {
+      "epoch": 0.30204962243797195,
+      "grad_norm": 24.28423500061035,
+      "learning_rate": 7.543103448275862e-06,
+      "loss": 0.8897,
+      "step": 350
+    },
+    {
+      "epoch": 0.30204962243797195,
+      "eval_accuracy": 0.5262135922330097,
+      "eval_loss": 0.8523032069206238,
+      "eval_runtime": 321.7555,
+      "eval_samples_per_second": 1.601,
+      "eval_steps_per_second": 1.601,
+      "step": 350
+    },
+    {
+      "epoch": 0.3106796116504854,
+      "grad_norm": 27.711999893188477,
+      "learning_rate": 7.758620689655173e-06,
+      "loss": 0.8352,
+      "step": 360
+    },
+    {
+      "epoch": 0.3193096008629989,
+      "grad_norm": 25.017581939697266,
+      "learning_rate": 7.974137931034484e-06,
+      "loss": 0.7918,
+      "step": 370
+    },
+    {
+      "epoch": 0.32793959007551243,
+      "grad_norm": 33.27495193481445,
+      "learning_rate": 8.189655172413794e-06,
+      "loss": 0.9004,
+      "step": 380
+    },
+    {
+      "epoch": 0.3365695792880259,
+      "grad_norm": 17.355253219604492,
+      "learning_rate": 8.405172413793105e-06,
+      "loss": 0.8079,
+      "step": 390
+    },
+    {
+      "epoch": 0.3451995685005394,
+      "grad_norm": 33.237518310546875,
+      "learning_rate": 8.620689655172414e-06,
+      "loss": 0.8512,
+      "step": 400
+    },
+    {
+      "epoch": 0.3451995685005394,
+      "eval_accuracy": 0.5262135922330097,
+      "eval_loss": 0.8104857206344604,
+      "eval_runtime": 321.6492,
+      "eval_samples_per_second": 1.601,
+      "eval_steps_per_second": 1.601,
+      "step": 400
+    },
+    {
+      "epoch": 0.35382955771305286,
+      "grad_norm": 31.926298141479492,
+      "learning_rate": 8.836206896551725e-06,
+      "loss": 0.8049,
+      "step": 410
+    },
+    {
+      "epoch": 0.36245954692556637,
+      "grad_norm": 18.511268615722656,
+      "learning_rate": 9.051724137931036e-06,
+      "loss": 0.7887,
+      "step": 420
+    },
+    {
+      "epoch": 0.3710895361380798,
+      "grad_norm": 12.080615043640137,
+      "learning_rate": 9.267241379310346e-06,
+      "loss": 0.8286,
+      "step": 430
+    },
+    {
+      "epoch": 0.37971952535059333,
+      "grad_norm": 22.48563003540039,
+      "learning_rate": 9.482758620689655e-06,
+      "loss": 0.8201,
+      "step": 440
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "grad_norm": 25.83173179626465,
+      "learning_rate": 9.698275862068966e-06,
+      "loss": 0.7854,
+      "step": 450
+    },
+    {
+      "epoch": 0.3883495145631068,
+      "eval_accuracy": 0.5106796116504855,
+      "eval_loss": 0.7994323372840881,
+      "eval_runtime": 321.4421,
+      "eval_samples_per_second": 1.602,
+      "eval_steps_per_second": 1.602,
+      "step": 450
+    },
+    {
+      "epoch": 0.3969795037756203,
+      "grad_norm": 41.783851623535156,
+      "learning_rate": 9.913793103448277e-06,
+      "loss": 0.8339,
+      "step": 460
+    },
+    {
+      "epoch": 0.40560949298813376,
+      "grad_norm": 12.72182846069336,
+      "learning_rate": 9.9999488687872e-06,
+      "loss": 0.8063,
+      "step": 470
+    },
+    {
+      "epoch": 0.41423948220064727,
+      "grad_norm": 28.933361053466797,
+      "learning_rate": 9.999636404051638e-06,
+      "loss": 0.8554,
+      "step": 480
+    },
+    {
+      "epoch": 0.4228694714131607,
+      "grad_norm": 48.14093017578125,
+      "learning_rate": 9.999039898540166e-06,
+      "loss": 0.9297,
+      "step": 490
+    },
+    {
+      "epoch": 0.43149946062567424,
+      "grad_norm": 27.8731746673584,
+      "learning_rate": 9.998159386141626e-06,
+      "loss": 0.8147,
+      "step": 500
+    },
+    {
+      "epoch": 0.43149946062567424,
+      "eval_accuracy": 0.5398058252427185,
+      "eval_loss": 0.7859384417533875,
+      "eval_runtime": 321.5871,
+      "eval_samples_per_second": 1.601,
+      "eval_steps_per_second": 1.601,
+      "step": 500
+    },
+    {
+      "epoch": 0.4401294498381877,
+      "grad_norm": 17.547481536865234,
+      "learning_rate": 9.996994916879941e-06,
+      "loss": 0.8449,
+      "step": 510
+    },
+    {
+      "epoch": 0.4487594390507012,
+      "grad_norm": 33.447723388671875,
+      "learning_rate": 9.995546556911271e-06,
+      "loss": 0.779,
+      "step": 520
+    },
+    {
+      "epoch": 0.45738942826321466,
+      "grad_norm": 41.81571578979492,
+      "learning_rate": 9.99381438852026e-06,
+      "loss": 0.7262,
+      "step": 530
+    },
+    {
+      "epoch": 0.46601941747572817,
+      "grad_norm": 40.82163619995117,
+      "learning_rate": 9.991798510115351e-06,
+      "loss": 0.8282,
+      "step": 540
+    },
+    {
+      "epoch": 0.4746494066882416,
+      "grad_norm": 55.30727767944336,
+      "learning_rate": 9.989499036223209e-06,
+      "loss": 0.8075,
+      "step": 550
+    },
+    {
+      "epoch": 0.4746494066882416,
+      "eval_accuracy": 0.5553398058252427,
+      "eval_loss": 0.7565743923187256,
+      "eval_runtime": 321.511,
+      "eval_samples_per_second": 1.602,
+      "eval_steps_per_second": 1.602,
+      "step": 550
+    },
+    {
+      "epoch": 0.48327939590075514,
+      "grad_norm": 51.085289001464844,
+      "learning_rate": 9.986916097482204e-06,
+      "loss": 0.7747,
+      "step": 560
+    },
+    {
+      "epoch": 0.4919093851132686,
+      "grad_norm": 65.66133880615234,
+      "learning_rate": 9.98404984063499e-06,
+      "loss": 0.7563,
+      "step": 570
+    },
+    {
+      "epoch": 0.5005393743257821,
+      "grad_norm": 11.704032897949219,
+      "learning_rate": 9.980900428520171e-06,
+      "loss": 0.7819,
+      "step": 580
+    },
+    {
+      "epoch": 0.5091693635382956,
+      "grad_norm": 27.524673461914062,
+      "learning_rate": 9.977468040063054e-06,
+      "loss": 0.7777,
+      "step": 590
+    },
+    {
+      "epoch": 0.517799352750809,
+      "grad_norm": 22.56294822692871,
+      "learning_rate": 9.973752870265473e-06,
+      "loss": 0.8282,
+      "step": 600
+    },
+    {
+      "epoch": 0.517799352750809,
+      "eval_accuracy": 0.5145631067961165,
+      "eval_loss": 0.7454360127449036,
+      "eval_runtime": 321.3773,
+      "eval_samples_per_second": 1.602,
+      "eval_steps_per_second": 1.602,
+      "step": 600
+    },
+    {
+      "epoch": 0.5264293419633226,
+      "grad_norm": 24.327606201171875,
+      "learning_rate": 9.96975513019472e-06,
+      "loss": 0.7907,
+      "step": 610
+    },
+    {
+      "epoch": 0.535059331175836,
+      "grad_norm": 18.27765655517578,
+      "learning_rate": 9.965475046971548e-06,
+      "loss": 0.8475,
+      "step": 620
+    },
+    {
+      "epoch": 0.5436893203883495,
+      "grad_norm": 23.742115020751953,
+      "learning_rate": 9.960912863757273e-06,
+      "loss": 0.7363,
+      "step": 630
+    },
+    {
+      "epoch": 0.552319309600863,
+      "grad_norm": 11.194246292114258,
+      "learning_rate": 9.956068839739955e-06,
+      "loss": 0.8291,
+      "step": 640
+    },
+    {
+      "epoch": 0.5609492988133765,
+      "grad_norm": 23.568937301635742,
+      "learning_rate": 9.950943250119674e-06,
+      "loss": 0.7524,
+      "step": 650
+    },
+    {
+      "epoch": 0.5609492988133765,
+      "eval_accuracy": 0.49902912621359224,
+      "eval_loss": 0.7317044138908386,
+      "eval_runtime": 321.3686,
+      "eval_samples_per_second": 1.603,
+      "eval_steps_per_second": 1.603,
+      "step": 650
+    },
+    {
+      "epoch": 0.56957928802589,
+      "grad_norm": 11.3060302734375,
+      "learning_rate": 9.945536386092893e-06,
+      "loss": 0.7319,
+      "step": 660
+    },
+    {
+      "epoch": 0.5782092772384034,
+      "grad_norm": 29.552515029907227,
+      "learning_rate": 9.939848554835927e-06,
+      "loss": 0.6644,
+      "step": 670
+    },
+    {
+      "epoch": 0.5868392664509169,
+      "grad_norm": 23.357723236083984,
+      "learning_rate": 9.93388007948747e-06,
+      "loss": 0.8749,
+      "step": 680
+    },
+    {
+      "epoch": 0.5954692556634305,
+      "grad_norm": 18.92988395690918,
+      "learning_rate": 9.927631299130254e-06,
+      "loss": 0.8157,
+      "step": 690
+    },
+    {
+      "epoch": 0.6040992448759439,
+      "grad_norm": 18.492721557617188,
+      "learning_rate": 9.921102568771781e-06,
+      "loss": 0.7338,
+      "step": 700
+    },
+    {
+      "epoch": 0.6040992448759439,
+      "eval_accuracy": 0.5339805825242718,
+      "eval_loss": 0.7266865968704224,
+      "eval_runtime": 321.4222,
+      "eval_samples_per_second": 1.602,
+      "eval_steps_per_second": 1.602,
+      "step": 700
+    },
+    {
+      "epoch": 0.6127292340884574,
+      "grad_norm": 24.050262451171875,
+      "learning_rate": 9.914294259324149e-06,
+      "loss": 0.7609,
+      "step": 710
+    },
+    {
+      "epoch": 0.6213592233009708,
+      "grad_norm": 8.642351150512695,
+      "learning_rate": 9.907206757582987e-06,
+      "loss": 0.7681,
+      "step": 720
+    },
+    {
+      "epoch": 0.6299892125134844,
+      "grad_norm": 20.86747932434082,
+      "learning_rate": 9.899840466205473e-06,
+      "loss": 0.8052,
+      "step": 730
+    },
+    {
+      "epoch": 0.6386192017259978,
+      "grad_norm": 44.50579833984375,
+      "learning_rate": 9.892195803687464e-06,
+      "loss": 0.739,
+      "step": 740
+    },
+    {
+      "epoch": 0.6472491909385113,
+      "grad_norm": 20.538475036621094,
+      "learning_rate": 9.884273204339716e-06,
+      "loss": 0.7909,
+      "step": 750
+    },
+    {
+      "epoch": 0.6472491909385113,
+      "eval_accuracy": 0.5611650485436893,
+      "eval_loss": 0.7110950350761414,
+      "eval_runtime": 321.0742,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 750
+    },
+    {
+      "epoch": 0.6558791801510249,
+      "grad_norm": 53.17654037475586,
+      "learning_rate": 9.876073118263216e-06,
+      "loss": 0.8172,
+      "step": 760
+    },
+    {
+      "epoch": 0.6645091693635383,
+      "grad_norm": 26.998899459838867,
+      "learning_rate": 9.867596011323602e-06,
+      "loss": 0.7901,
+      "step": 770
+    },
+    {
+      "epoch": 0.6731391585760518,
+      "grad_norm": 45.38533020019531,
+      "learning_rate": 9.858842365124702e-06,
+      "loss": 0.7284,
+      "step": 780
+    },
+    {
+      "epoch": 0.6817691477885652,
+      "grad_norm": 28.952617645263672,
+      "learning_rate": 9.849812676981172e-06,
+      "loss": 0.7501,
+      "step": 790
+    },
+    {
+      "epoch": 0.6903991370010788,
+      "grad_norm": 19.87049102783203,
+      "learning_rate": 9.840507459890244e-06,
+      "loss": 0.7783,
+      "step": 800
+    },
+    {
+      "epoch": 0.6903991370010788,
+      "eval_accuracy": 0.5300970873786408,
+      "eval_loss": 0.7211207151412964,
+      "eval_runtime": 320.8034,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 800
+    },
+    {
+      "epoch": 0.6990291262135923,
+      "grad_norm": 15.508710861206055,
+      "learning_rate": 9.830927242502575e-06,
+      "loss": 0.6965,
+      "step": 810
+    },
+    {
+      "epoch": 0.7076591154261057,
+      "grad_norm": 36.019798278808594,
+      "learning_rate": 9.821072569092223e-06,
+      "loss": 0.77,
+      "step": 820
+    },
+    {
+      "epoch": 0.7162891046386192,
+      "grad_norm": 13.119162559509277,
+      "learning_rate": 9.810943999525714e-06,
+      "loss": 0.7158,
+      "step": 830
+    },
+    {
+      "epoch": 0.7249190938511327,
+      "grad_norm": 20.22465705871582,
+      "learning_rate": 9.800542109230247e-06,
+      "loss": 0.6938,
+      "step": 840
+    },
+    {
+      "epoch": 0.7335490830636462,
+      "grad_norm": 33.313209533691406,
+      "learning_rate": 9.78986748916099e-06,
+      "loss": 0.7895,
+      "step": 850
+    },
+    {
+      "epoch": 0.7335490830636462,
+      "eval_accuracy": 0.5592233009708738,
+      "eval_loss": 0.7069711685180664,
+      "eval_runtime": 321.285,
+      "eval_samples_per_second": 1.603,
+      "eval_steps_per_second": 1.603,
+      "step": 850
+    },
+    {
+      "epoch": 0.7421790722761596,
+      "grad_norm": 9.106620788574219,
+      "learning_rate": 9.778920745767524e-06,
+      "loss": 0.6717,
+      "step": 860
+    },
+    {
+      "epoch": 0.7508090614886731,
+      "grad_norm": 34.899375915527344,
+      "learning_rate": 9.767702500959365e-06,
+      "loss": 0.7353,
+      "step": 870
+    },
+    {
+      "epoch": 0.7594390507011867,
+      "grad_norm": 29.355737686157227,
+      "learning_rate": 9.756213392070654e-06,
+      "loss": 0.7315,
+      "step": 880
+    },
+    {
+      "epoch": 0.7680690399137001,
+      "grad_norm": 16.923168182373047,
+      "learning_rate": 9.744454071823936e-06,
+      "loss": 0.6777,
+      "step": 890
+    },
+    {
+      "epoch": 0.7766990291262136,
+      "grad_norm": 7.441469192504883,
+      "learning_rate": 9.732425208293083e-06,
+      "loss": 0.6881,
+      "step": 900
+    },
+    {
+      "epoch": 0.7766990291262136,
+      "eval_accuracy": 0.537864077669903,
+      "eval_loss": 0.7709933519363403,
+      "eval_runtime": 321.2302,
+      "eval_samples_per_second": 1.603,
+      "eval_steps_per_second": 1.603,
+      "step": 900
+    },
+    {
+      "epoch": 0.785329018338727,
+      "grad_norm": 17.159208297729492,
+      "learning_rate": 9.720127484865336e-06,
+      "loss": 0.7973,
+      "step": 910
+    },
+    {
+      "epoch": 0.7939590075512406,
+      "grad_norm": 29.373632431030273,
+      "learning_rate": 9.707561600202481e-06,
+      "loss": 0.6946,
+      "step": 920
+    },
+    {
+      "epoch": 0.8025889967637541,
+      "grad_norm": 40.986690521240234,
+      "learning_rate": 9.694728268201162e-06,
+      "loss": 0.7697,
+      "step": 930
+    },
+    {
+      "epoch": 0.8112189859762675,
+      "grad_norm": 10.117018699645996,
+      "learning_rate": 9.681628217952308e-06,
+      "loss": 0.7183,
+      "step": 940
+    },
+    {
+      "epoch": 0.819848975188781,
+      "grad_norm": 45.013118743896484,
+      "learning_rate": 9.668262193699731e-06,
+      "loss": 0.7137,
+      "step": 950
+    },
+    {
+      "epoch": 0.819848975188781,
+      "eval_accuracy": 0.5805825242718446,
+      "eval_loss": 0.6908486485481262,
+      "eval_runtime": 321.1671,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 950
+    },
+    {
+      "epoch": 0.8284789644012945,
+      "grad_norm": 22.911548614501953,
+      "learning_rate": 9.65463095479783e-06,
+      "loss": 0.7166,
+      "step": 960
+    },
+    {
+      "epoch": 0.837108953613808,
+      "grad_norm": 9.517961502075195,
+      "learning_rate": 9.640735275668453e-06,
+      "loss": 0.7713,
+      "step": 970
+    },
+    {
+      "epoch": 0.8457389428263214,
+      "grad_norm": 19.63594627380371,
+      "learning_rate": 9.62657594575691e-06,
+      "loss": 0.7101,
+      "step": 980
+    },
+    {
+      "epoch": 0.8543689320388349,
+      "grad_norm": 27.475940704345703,
+      "learning_rate": 9.6121537694871e-06,
+      "loss": 0.741,
+      "step": 990
+    },
+    {
+      "epoch": 0.8629989212513485,
+      "grad_norm": 13.922393798828125,
+      "learning_rate": 9.597469566215841e-06,
+      "loss": 0.6924,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8629989212513485,
+      "eval_accuracy": 0.6,
+      "eval_loss": 0.6857309341430664,
+      "eval_runtime": 321.1313,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8716289104638619,
+      "grad_norm": 8.671666145324707,
+      "learning_rate": 9.582524170186294e-06,
+      "loss": 0.6936,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8802588996763754,
+      "grad_norm": 11.311553001403809,
+      "learning_rate": 9.567318430480579e-06,
+      "loss": 0.6853,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 6.2082648277282715,
+      "learning_rate": 9.55185321097154e-06,
+      "loss": 0.6846,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8975188781014024,
+      "grad_norm": 35.873565673828125,
+      "learning_rate": 9.536129390273659e-06,
+      "loss": 0.7125,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9061488673139159,
+      "grad_norm": 3.9832065105438232,
+      "learning_rate": 9.520147861693138e-06,
+      "loss": 0.7275,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9061488673139159,
+      "eval_accuracy": 0.5766990291262136,
+      "eval_loss": 0.6835415959358215,
+      "eval_runtime": 321.1452,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9147788565264293,
+      "grad_norm": 9.8655424118042,
+      "learning_rate": 9.503909533177162e-06,
+      "loss": 0.7286,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9234088457389428,
+      "grad_norm": 14.413016319274902,
+      "learning_rate": 9.487415327262303e-06,
+      "loss": 0.7012,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9320388349514563,
+      "grad_norm": 22.791946411132812,
+      "learning_rate": 9.470666181022114e-06,
+      "loss": 0.7057,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9406688241639698,
+      "grad_norm": 7.595472812652588,
+      "learning_rate": 9.453663046013889e-06,
+      "loss": 0.7165,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9492988133764833,
+      "grad_norm": 6.206796169281006,
+      "learning_rate": 9.436406888224603e-06,
+      "loss": 0.67,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9492988133764833,
+      "eval_accuracy": 0.570873786407767,
+      "eval_loss": 0.6888366341590881,
+      "eval_runtime": 321.1897,
+      "eval_samples_per_second": 1.603,
+      "eval_steps_per_second": 1.603,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9579288025889967,
+      "grad_norm": 9.740569114685059,
+      "learning_rate": 9.418898688016042e-06,
+      "loss": 0.7177,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9665587918015103,
+      "grad_norm": 9.868525505065918,
+      "learning_rate": 9.40113944006909e-06,
+      "loss": 0.6841,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9751887810140237,
+      "grad_norm": 10.188973426818848,
+      "learning_rate": 9.383130153327231e-06,
+      "loss": 0.6808,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9838187702265372,
+      "grad_norm": 5.215792655944824,
+      "learning_rate": 9.36487185093922e-06,
+      "loss": 0.7059,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9924487594390508,
+      "grad_norm": 5.438614845275879,
+      "learning_rate": 9.34636557020097e-06,
+      "loss": 0.6787,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9924487594390508,
+      "eval_accuracy": 0.596116504854369,
+      "eval_loss": 0.6860348582267761,
+      "eval_runtime": 320.9468,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 1150
+    },
+    {
+      "epoch": 1.0010787486515642,
+      "grad_norm": 7.045734405517578,
+      "learning_rate": 9.327612362496601e-06,
+      "loss": 0.6904,
+      "step": 1160
+    },
+    {
+      "epoch": 1.0097087378640777,
+      "grad_norm": 21.833343505859375,
+      "learning_rate": 9.308613293238722e-06,
+      "loss": 0.7516,
+      "step": 1170
+    },
+    {
+      "epoch": 1.0183387270765911,
+      "grad_norm": 4.44768762588501,
+      "learning_rate": 9.2893694418079e-06,
+      "loss": 0.7105,
+      "step": 1180
+    },
+    {
+      "epoch": 1.0269687162891046,
+      "grad_norm": 12.016294479370117,
+      "learning_rate": 9.269881901491335e-06,
+      "loss": 0.67,
+      "step": 1190
+    },
+    {
+      "epoch": 1.035598705501618,
+      "grad_norm": 5.096578598022461,
+      "learning_rate": 9.250151779420756e-06,
+      "loss": 0.7012,
+      "step": 1200
+    },
+    {
+      "epoch": 1.035598705501618,
+      "eval_accuracy": 0.570873786407767,
+      "eval_loss": 0.6847370266914368,
+      "eval_runtime": 320.5183,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 1.607,
+      "step": 1200
+    },
+    {
+      "epoch": 1.0442286947141317,
+      "grad_norm": 11.158854484558105,
+      "learning_rate": 9.230180196509506e-06,
+      "loss": 0.6726,
+      "step": 1210
+    },
+    {
+      "epoch": 1.0528586839266452,
+      "grad_norm": 7.818958282470703,
+      "learning_rate": 9.209968287388878e-06,
+      "loss": 0.6737,
+      "step": 1220
+    },
+    {
+      "epoch": 1.0614886731391586,
+      "grad_norm": 4.283718109130859,
+      "learning_rate": 9.189517200343643e-06,
+      "loss": 0.6421,
+      "step": 1230
+    },
+    {
+      "epoch": 1.070118662351672,
+      "grad_norm": 6.186824321746826,
+      "learning_rate": 9.168828097246819e-06,
+      "loss": 0.7709,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0787486515641855,
+      "grad_norm": 5.761249542236328,
+      "learning_rate": 9.147902153493659e-06,
+      "loss": 0.6765,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0787486515641855,
+      "eval_accuracy": 0.5786407766990291,
+      "eval_loss": 0.6961000561714172,
+      "eval_runtime": 320.4513,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 1.607,
+      "step": 1250
+    },
+    {
+      "epoch": 1.087378640776699,
+      "grad_norm": 5.015466213226318,
+      "learning_rate": 9.126740557934874e-06,
+      "loss": 0.6551,
+      "step": 1260
+    },
+    {
+      "epoch": 1.0960086299892124,
+      "grad_norm": 8.18385124206543,
+      "learning_rate": 9.105344512809097e-06,
+      "loss": 0.6606,
+      "step": 1270
+    },
+    {
+      "epoch": 1.104638619201726,
+      "grad_norm": 3.6305551528930664,
+      "learning_rate": 9.083715233674572e-06,
+      "loss": 0.7058,
+      "step": 1280
+    },
+    {
+      "epoch": 1.1132686084142396,
+      "grad_norm": 9.872076034545898,
+      "learning_rate": 9.061853949340104e-06,
+      "loss": 0.6577,
+      "step": 1290
+    },
+    {
+      "epoch": 1.121898597626753,
+      "grad_norm": 4.889667510986328,
+      "learning_rate": 9.039761901795241e-06,
+      "loss": 0.7052,
+      "step": 1300
+    },
+    {
+      "epoch": 1.121898597626753,
+      "eval_accuracy": 0.6058252427184466,
+      "eval_loss": 0.6881099939346313,
+      "eval_runtime": 320.8035,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 1300
+    },
+    {
+      "epoch": 1.1305285868392665,
+      "grad_norm": 3.392106056213379,
+      "learning_rate": 9.017440346139718e-06,
+      "loss": 0.681,
+      "step": 1310
+    },
+    {
+      "epoch": 1.13915857605178,
+      "grad_norm": 5.220512866973877,
+      "learning_rate": 8.994890550512152e-06,
+      "loss": 0.7117,
+      "step": 1320
+    },
+    {
+      "epoch": 1.1477885652642934,
+      "grad_norm": 11.190145492553711,
+      "learning_rate": 8.972113796017992e-06,
+      "loss": 0.7058,
+      "step": 1330
+    },
+    {
+      "epoch": 1.1564185544768069,
+      "grad_norm": 3.2504310607910156,
+      "learning_rate": 8.949111376656741e-06,
+      "loss": 0.6867,
+      "step": 1340
+    },
+    {
+      "epoch": 1.1650485436893203,
+      "grad_norm": 3.312730073928833,
+      "learning_rate": 8.925884599248437e-06,
+      "loss": 0.6804,
+      "step": 1350
+    },
+    {
+      "epoch": 1.1650485436893203,
+      "eval_accuracy": 0.6097087378640776,
+      "eval_loss": 0.6778111457824707,
+      "eval_runtime": 320.8442,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 1350
+    },
+    {
+      "epoch": 1.173678532901834,
+      "grad_norm": 3.8169898986816406,
+      "learning_rate": 8.902434783359417e-06,
+      "loss": 0.6812,
+      "step": 1360
+    },
+    {
+      "epoch": 1.1823085221143474,
+      "grad_norm": 13.139059066772461,
+      "learning_rate": 8.878763261227337e-06,
+      "loss": 0.7111,
+      "step": 1370
+    },
+    {
+      "epoch": 1.190938511326861,
+      "grad_norm": 8.938994407653809,
+      "learning_rate": 8.854871377685496e-06,
+      "loss": 0.6762,
+      "step": 1380
+    },
+    {
+      "epoch": 1.1995685005393744,
+      "grad_norm": 7.517580509185791,
+      "learning_rate": 8.830760490086427e-06,
+      "loss": 0.6817,
+      "step": 1390
+    },
+    {
+      "epoch": 1.2081984897518878,
+      "grad_norm": 5.75648307800293,
+      "learning_rate": 8.806431968224784e-06,
+      "loss": 0.6644,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2081984897518878,
+      "eval_accuracy": 0.6194174757281553,
+      "eval_loss": 0.6810408234596252,
+      "eval_runtime": 320.9626,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2168284789644013,
+      "grad_norm": 6.445542812347412,
+      "learning_rate": 8.781887194259523e-06,
+      "loss": 0.6684,
+      "step": 1410
+    },
+    {
+      "epoch": 1.2254584681769147,
+      "grad_norm": 5.923236846923828,
+      "learning_rate": 8.757127562635374e-06,
+      "loss": 0.6802,
+      "step": 1420
+    },
+    {
+      "epoch": 1.2340884573894282,
+      "grad_norm": 5.63727331161499,
+      "learning_rate": 8.732154480003625e-06,
+      "loss": 0.7045,
+      "step": 1430
+    },
+    {
+      "epoch": 1.2427184466019416,
+      "grad_norm": 5.639196872711182,
+      "learning_rate": 8.706969365142202e-06,
+      "loss": 0.6916,
+      "step": 1440
+    },
+    {
+      "epoch": 1.2513484358144553,
+      "grad_norm": 6.068101406097412,
+      "learning_rate": 8.681573648875064e-06,
+      "loss": 0.6566,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2513484358144553,
+      "eval_accuracy": 0.6135922330097088,
+      "eval_loss": 0.6820415258407593,
+      "eval_runtime": 320.9166,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 1450
+    },
+    {
+      "epoch": 1.2599784250269688,
+      "grad_norm": 5.288263320922852,
+      "learning_rate": 8.655968773990922e-06,
+      "loss": 0.6696,
+      "step": 1460
+    },
+    {
+      "epoch": 1.2686084142394822,
+      "grad_norm": 9.293752670288086,
+      "learning_rate": 8.630156195161264e-06,
+      "loss": 0.6407,
+      "step": 1470
+    },
+    {
+      "epoch": 1.2772384034519957,
+      "grad_norm": 14.672719955444336,
+      "learning_rate": 8.604137378857713e-06,
+      "loss": 0.6507,
+      "step": 1480
+    },
+    {
+      "epoch": 1.2858683926645091,
+      "grad_norm": 9.176056861877441,
+      "learning_rate": 8.577913803268719e-06,
+      "loss": 0.7229,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2944983818770226,
+      "grad_norm": 12.57158374786377,
+      "learning_rate": 8.551486958215569e-06,
+      "loss": 0.7024,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2944983818770226,
+      "eval_accuracy": 0.6116504854368932,
+      "eval_loss": 0.6744683384895325,
+      "eval_runtime": 321.1558,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1500
+    },
+    {
+      "epoch": 1.303128371089536,
+      "grad_norm": 6.8445305824279785,
+      "learning_rate": 8.524858345067757e-06,
+      "loss": 0.6842,
+      "step": 1510
+    },
+    {
+      "epoch": 1.3117583603020497,
+      "grad_norm": 5.6327643394470215,
+      "learning_rate": 8.498029476657686e-06,
+      "loss": 0.6904,
+      "step": 1520
+    },
+    {
+      "epoch": 1.3203883495145632,
+      "grad_norm": 10.025938987731934,
+      "learning_rate": 8.471001877194708e-06,
+      "loss": 0.6733,
+      "step": 1530
+    },
+    {
+      "epoch": 1.3290183387270766,
+      "grad_norm": 6.761681079864502,
+      "learning_rate": 8.443777082178556e-06,
+      "loss": 0.6767,
+      "step": 1540
+    },
+    {
+      "epoch": 1.33764832793959,
+      "grad_norm": 5.284752368927002,
+      "learning_rate": 8.416356638312082e-06,
+      "loss": 0.7241,
+      "step": 1550
+    },
+    {
+      "epoch": 1.33764832793959,
+      "eval_accuracy": 0.6135922330097088,
+      "eval_loss": 0.6697773933410645,
+      "eval_runtime": 321.0762,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1550
+    },
+    {
+      "epoch": 1.3462783171521036,
+      "grad_norm": 5.520620346069336,
+      "learning_rate": 8.388742103413397e-06,
+      "loss": 0.6738,
+      "step": 1560
+    },
+    {
+      "epoch": 1.354908306364617,
+      "grad_norm": 4.6568098068237305,
+      "learning_rate": 8.360935046327373e-06,
+      "loss": 0.671,
+      "step": 1570
+    },
+    {
+      "epoch": 1.3635382955771305,
+      "grad_norm": 4.777432441711426,
+      "learning_rate": 8.332937046836503e-06,
+      "loss": 0.69,
+      "step": 1580
+    },
+    {
+      "epoch": 1.3721682847896441,
+      "grad_norm": 8.115592956542969,
+      "learning_rate": 8.304749695571157e-06,
+      "loss": 0.6583,
+      "step": 1590
+    },
+    {
+      "epoch": 1.3807982740021574,
+      "grad_norm": 11.980337142944336,
+      "learning_rate": 8.276374593919213e-06,
+      "loss": 0.7378,
+      "step": 1600
+    },
+    {
+      "epoch": 1.3807982740021574,
+      "eval_accuracy": 0.6058252427184466,
+      "eval_loss": 0.6734395027160645,
+      "eval_runtime": 320.9778,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1600
+    },
+    {
+      "epoch": 1.389428263214671,
+      "grad_norm": 3.5900051593780518,
+      "learning_rate": 8.247813353935073e-06,
+      "loss": 0.664,
+      "step": 1610
+    },
+    {
+      "epoch": 1.3980582524271845,
+      "grad_norm": 14.644140243530273,
+      "learning_rate": 8.219067598248087e-06,
+      "loss": 0.6718,
+      "step": 1620
+    },
+    {
+      "epoch": 1.406688241639698,
+      "grad_norm": 6.659509658813477,
+      "learning_rate": 8.190138959970366e-06,
+      "loss": 0.6476,
+      "step": 1630
+    },
+    {
+      "epoch": 1.4153182308522114,
+      "grad_norm": 5.535285949707031,
+      "learning_rate": 8.161029082603994e-06,
+      "loss": 0.642,
+      "step": 1640
+    },
+    {
+      "epoch": 1.4239482200647249,
+      "grad_norm": 7.590597152709961,
+      "learning_rate": 8.131739619947667e-06,
+      "loss": 0.6584,
+      "step": 1650
+    },
+    {
+      "epoch": 1.4239482200647249,
+      "eval_accuracy": 0.6,
+      "eval_loss": 0.6994197964668274,
+      "eval_runtime": 321.0664,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1650
+    },
+    {
+      "epoch": 1.4325782092772383,
+      "grad_norm": 13.075584411621094,
+      "learning_rate": 8.102272236002729e-06,
+      "loss": 0.7239,
+      "step": 1660
+    },
+    {
+      "epoch": 1.4412081984897518,
+      "grad_norm": 6.066156387329102,
+      "learning_rate": 8.072628604878638e-06,
+      "loss": 0.7182,
+      "step": 1670
+    },
+    {
+      "epoch": 1.4498381877022655,
+      "grad_norm": 4.588730335235596,
+      "learning_rate": 8.042810410697861e-06,
+      "loss": 0.717,
+      "step": 1680
+    },
+    {
+      "epoch": 1.458468176914779,
+      "grad_norm": 3.397918224334717,
+      "learning_rate": 8.012819347500189e-06,
+      "loss": 0.6567,
+      "step": 1690
+    },
+    {
+      "epoch": 1.4670981661272924,
+      "grad_norm": 8.24763298034668,
+      "learning_rate": 7.982657119146495e-06,
+      "loss": 0.6724,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4670981661272924,
+      "eval_accuracy": 0.6097087378640776,
+      "eval_loss": 0.6715120077133179,
+      "eval_runtime": 321.0917,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1700
+    },
+    {
+      "epoch": 1.4757281553398058,
+      "grad_norm": 8.984458923339844,
+      "learning_rate": 7.952325439221944e-06,
+      "loss": 0.6653,
+      "step": 1710
+    },
+    {
+      "epoch": 1.4843581445523193,
+      "grad_norm": 8.375741958618164,
+      "learning_rate": 7.921826030938623e-06,
+      "loss": 0.722,
+      "step": 1720
+    },
+    {
+      "epoch": 1.4929881337648327,
+      "grad_norm": 8.309843063354492,
+      "learning_rate": 7.891160627037653e-06,
+      "loss": 0.7034,
+      "step": 1730
+    },
+    {
+      "epoch": 1.5016181229773462,
+      "grad_norm": 7.065859794616699,
+      "learning_rate": 7.860330969690749e-06,
+      "loss": 0.6338,
+      "step": 1740
+    },
+    {
+      "epoch": 1.5102481121898599,
+      "grad_norm": 5.86482048034668,
+      "learning_rate": 7.829338810401238e-06,
+      "loss": 0.6774,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5102481121898599,
+      "eval_accuracy": 0.6135922330097088,
+      "eval_loss": 0.669984757900238,
+      "eval_runtime": 321.0227,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1750
+    },
+    {
+      "epoch": 1.5188781014023731,
+      "grad_norm": 6.1000075340271,
+      "learning_rate": 7.798185909904552e-06,
+      "loss": 0.6813,
+      "step": 1760
+    },
+    {
+      "epoch": 1.5275080906148868,
+      "grad_norm": 8.106244087219238,
+      "learning_rate": 7.766874038068202e-06,
+      "loss": 0.7138,
+      "step": 1770
+    },
+    {
+      "epoch": 1.5361380798274002,
+      "grad_norm": 5.946533203125,
+      "learning_rate": 7.735404973791223e-06,
+      "loss": 0.7025,
+      "step": 1780
+    },
+    {
+      "epoch": 1.5447680690399137,
+      "grad_norm": 6.442516326904297,
+      "learning_rate": 7.703780504903107e-06,
+      "loss": 0.6643,
+      "step": 1790
+    },
+    {
+      "epoch": 1.5533980582524272,
+      "grad_norm": 6.0701985359191895,
+      "learning_rate": 7.672002428062245e-06,
+      "loss": 0.6653,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5533980582524272,
+      "eval_accuracy": 0.6097087378640776,
+      "eval_loss": 0.6695827841758728,
+      "eval_runtime": 321.0661,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1800
+    },
+    {
+      "epoch": 1.5620280474649406,
+      "grad_norm": 10.973797798156738,
+      "learning_rate": 7.640072548653843e-06,
+      "loss": 0.6681,
+      "step": 1810
+    },
+    {
+      "epoch": 1.5706580366774543,
+      "grad_norm": 9.289361000061035,
+      "learning_rate": 7.607992680687362e-06,
+      "loss": 0.6297,
+      "step": 1820
+    },
+    {
+      "epoch": 1.5792880258899675,
+      "grad_norm": 6.6282148361206055,
+      "learning_rate": 7.575764646693447e-06,
+      "loss": 0.706,
+      "step": 1830
+    },
+    {
+      "epoch": 1.5879180151024812,
+      "grad_norm": 4.8196702003479,
+      "learning_rate": 7.5433902776204015e-06,
+      "loss": 0.6669,
+      "step": 1840
+    },
+    {
+      "epoch": 1.5965480043149944,
+      "grad_norm": 8.807297706604004,
+      "learning_rate": 7.510871412730157e-06,
+      "loss": 0.6641,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5965480043149944,
+      "eval_accuracy": 0.5980582524271845,
+      "eval_loss": 0.6732643246650696,
+      "eval_runtime": 321.036,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1850
+    },
+    {
+      "epoch": 1.6051779935275081,
+      "grad_norm": 4.137267589569092,
+      "learning_rate": 7.478209899493787e-06,
+      "loss": 0.6345,
+      "step": 1860
+    },
+    {
+      "epoch": 1.6138079827400216,
+      "grad_norm": 7.294461250305176,
+      "learning_rate": 7.445407593486535e-06,
+      "loss": 0.6899,
+      "step": 1870
+    },
+    {
+      "epoch": 1.622437971952535,
+      "grad_norm": 8.29757308959961,
+      "learning_rate": 7.41246635828241e-06,
+      "loss": 0.6848,
+      "step": 1880
+    },
+    {
+      "epoch": 1.6310679611650487,
+      "grad_norm": 10.072659492492676,
+      "learning_rate": 7.379388065348305e-06,
+      "loss": 0.6829,
+      "step": 1890
+    },
+    {
+      "epoch": 1.639697950377562,
+      "grad_norm": 8.695294380187988,
+      "learning_rate": 7.346174593937676e-06,
+      "loss": 0.7241,
+      "step": 1900
+    },
+    {
+      "epoch": 1.639697950377562,
+      "eval_accuracy": 0.596116504854369,
+      "eval_loss": 0.6652901768684387,
+      "eval_runtime": 321.0146,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6483279395900756,
+      "grad_norm": 3.542787790298462,
+      "learning_rate": 7.31282783098378e-06,
+      "loss": 0.6428,
+      "step": 1910
+    },
+    {
+      "epoch": 1.6569579288025889,
+      "grad_norm": 6.900018215179443,
+      "learning_rate": 7.279349670992464e-06,
+      "loss": 0.6494,
+      "step": 1920
+    },
+    {
+      "epoch": 1.6655879180151025,
+      "grad_norm": 7.8714189529418945,
+      "learning_rate": 7.245742015934547e-06,
+      "loss": 0.5778,
+      "step": 1930
+    },
+    {
+      "epoch": 1.674217907227616,
+      "grad_norm": 4.089023590087891,
+      "learning_rate": 7.212006775137761e-06,
+      "loss": 0.6912,
+      "step": 1940
+    },
+    {
+      "epoch": 1.6828478964401294,
+      "grad_norm": 5.432620048522949,
+      "learning_rate": 7.178145865178268e-06,
+      "loss": 0.6496,
+      "step": 1950
+    },
+    {
+      "epoch": 1.6828478964401294,
+      "eval_accuracy": 0.6116504854368932,
+      "eval_loss": 0.6761239767074585,
+      "eval_runtime": 320.9902,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 1950
+    },
+    {
+      "epoch": 1.691477885652643,
+      "grad_norm": 4.092471122741699,
+      "learning_rate": 7.144161209771788e-06,
+      "loss": 0.6757,
+      "step": 1960
+    },
+    {
+      "epoch": 1.7001078748651564,
+      "grad_norm": 6.498571872711182,
+      "learning_rate": 7.110054739664303e-06,
+      "loss": 0.6111,
+      "step": 1970
+    },
+    {
+      "epoch": 1.70873786407767,
+      "grad_norm": 9.238410949707031,
+      "learning_rate": 7.075828392522362e-06,
+      "loss": 0.5998,
+      "step": 1980
+    },
+    {
+      "epoch": 1.7173678532901833,
+      "grad_norm": 5.266243934631348,
+      "learning_rate": 7.04148411282301e-06,
+      "loss": 0.655,
+      "step": 1990
+    },
+    {
+      "epoch": 1.725997842502697,
+      "grad_norm": 8.122797966003418,
+      "learning_rate": 7.0070238517433e-06,
+      "loss": 0.662,
+      "step": 2000
+    },
+    {
+      "epoch": 1.725997842502697,
+      "eval_accuracy": 0.6038834951456311,
+      "eval_loss": 0.6728688478469849,
+      "eval_runtime": 320.9753,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 2000
+    },
+    {
+      "epoch": 1.7346278317152104,
+      "grad_norm": 8.114389419555664,
+      "learning_rate": 6.972449567049463e-06,
+      "loss": 0.6923,
+      "step": 2010
+    },
+    {
+      "epoch": 1.7432578209277239,
+      "grad_norm": 6.447281837463379,
+      "learning_rate": 6.9377632229856665e-06,
+      "loss": 0.6625,
+      "step": 2020
+    },
+    {
+      "epoch": 1.7518878101402373,
+      "grad_norm": 8.996492385864258,
+      "learning_rate": 6.902966790162425e-06,
+      "loss": 0.6919,
+      "step": 2030
+    },
+    {
+      "epoch": 1.7605177993527508,
+      "grad_norm": 5.145361423492432,
+      "learning_rate": 6.868062245444655e-06,
+      "loss": 0.6468,
+      "step": 2040
+    },
+    {
+      "epoch": 1.7691477885652644,
+      "grad_norm": 6.459311008453369,
+      "learning_rate": 6.833051571839347e-06,
+      "loss": 0.7049,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7691477885652644,
+      "eval_accuracy": 0.6135922330097088,
+      "eval_loss": 0.6757835149765015,
+      "eval_runtime": 320.6068,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2050
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 8.930355072021484,
+      "learning_rate": 6.797936758382924e-06,
+      "loss": 0.6384,
+      "step": 2060
+    },
+    {
+      "epoch": 1.7864077669902914,
+      "grad_norm": 8.780126571655273,
+      "learning_rate": 6.762719800028231e-06,
+      "loss": 0.6169,
+      "step": 2070
+    },
+    {
+      "epoch": 1.7950377562028046,
+      "grad_norm": 7.830219745635986,
+      "learning_rate": 6.727402697531193e-06,
+      "loss": 0.6596,
+      "step": 2080
+    },
+    {
+      "epoch": 1.8036677454153183,
+      "grad_norm": 4.703182697296143,
+      "learning_rate": 6.69198745733716e-06,
+      "loss": 0.6964,
+      "step": 2090
+    },
+    {
+      "epoch": 1.8122977346278317,
+      "grad_norm": 4.655829906463623,
+      "learning_rate": 6.656476091466901e-06,
+      "loss": 0.6483,
+      "step": 2100
+    },
+    {
+      "epoch": 1.8122977346278317,
+      "eval_accuracy": 0.6135922330097088,
+      "eval_loss": 0.6741885542869568,
+      "eval_runtime": 320.6691,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2100
+    },
+    {
+      "epoch": 1.8209277238403452,
+      "grad_norm": 4.25952672958374,
+      "learning_rate": 6.620870617402312e-06,
+      "loss": 0.6732,
+      "step": 2110
+    },
+    {
+      "epoch": 1.8295577130528586,
+      "grad_norm": 6.7814226150512695,
+      "learning_rate": 6.585173057971787e-06,
+      "loss": 0.6674,
+      "step": 2120
+    },
+    {
+      "epoch": 1.838187702265372,
+      "grad_norm": 4.3662638664245605,
+      "learning_rate": 6.5493854412352985e-06,
+      "loss": 0.6807,
+      "step": 2130
+    },
+    {
+      "epoch": 1.8468176914778858,
+      "grad_norm": 5.596447467803955,
+      "learning_rate": 6.5135098003691865e-06,
+      "loss": 0.6637,
+      "step": 2140
+    },
+    {
+      "epoch": 1.855447680690399,
+      "grad_norm": 4.839741230010986,
+      "learning_rate": 6.477548173550635e-06,
+      "loss": 0.678,
+      "step": 2150
+    },
+    {
+      "epoch": 1.855447680690399,
+      "eval_accuracy": 0.6310679611650486,
+      "eval_loss": 0.6695934534072876,
+      "eval_runtime": 320.6467,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2150
+    },
+    {
+      "epoch": 1.8640776699029127,
+      "grad_norm": 11.375150680541992,
+      "learning_rate": 6.441502603841892e-06,
+      "loss": 0.6592,
+      "step": 2160
+    },
+    {
+      "epoch": 1.8727076591154261,
+      "grad_norm": 6.302811145782471,
+      "learning_rate": 6.405375139074194e-06,
+      "loss": 0.6413,
+      "step": 2170
+    },
+    {
+      "epoch": 1.8813376483279396,
+      "grad_norm": 9.698513984680176,
+      "learning_rate": 6.369167831731419e-06,
+      "loss": 0.6304,
+      "step": 2180
+    },
+    {
+      "epoch": 1.889967637540453,
+      "grad_norm": 9.770709991455078,
+      "learning_rate": 6.332882738833485e-06,
+      "loss": 0.6144,
+      "step": 2190
+    },
+    {
+      "epoch": 1.8985976267529665,
+      "grad_norm": 10.665081977844238,
+      "learning_rate": 6.296521921819489e-06,
+      "loss": 0.678,
+      "step": 2200
+    },
+    {
+      "epoch": 1.8985976267529665,
+      "eval_accuracy": 0.6233009708737864,
+      "eval_loss": 0.6689735054969788,
+      "eval_runtime": 320.6295,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2200
+    },
+    {
+      "epoch": 1.9072276159654802,
+      "grad_norm": 8.206169128417969,
+      "learning_rate": 6.260087446430582e-06,
+      "loss": 0.6622,
+      "step": 2210
+    },
+    {
+      "epoch": 1.9158576051779934,
+      "grad_norm": 11.89337158203125,
+      "learning_rate": 6.223581382592625e-06,
+      "loss": 0.6567,
+      "step": 2220
+    },
+    {
+      "epoch": 1.924487594390507,
+      "grad_norm": 4.916356086730957,
+      "learning_rate": 6.18700580429857e-06,
+      "loss": 0.6634,
+      "step": 2230
+    },
+    {
+      "epoch": 1.9331175836030206,
+      "grad_norm": 9.565736770629883,
+      "learning_rate": 6.150362789490654e-06,
+      "loss": 0.6532,
+      "step": 2240
+    },
+    {
+      "epoch": 1.941747572815534,
+      "grad_norm": 10.54036808013916,
+      "learning_rate": 6.113654419942334e-06,
+      "loss": 0.6953,
+      "step": 2250
+    },
+    {
+      "epoch": 1.941747572815534,
+      "eval_accuracy": 0.625242718446602,
+      "eval_loss": 0.6624494791030884,
+      "eval_runtime": 320.6343,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2250
+    },
+    {
+      "epoch": 1.9503775620280475,
+      "grad_norm": 12.351181983947754,
+      "learning_rate": 6.0768827811400166e-06,
+      "loss": 0.71,
+      "step": 2260
+    },
+    {
+      "epoch": 1.959007551240561,
+      "grad_norm": 6.94906759262085,
+      "learning_rate": 6.040049962164585e-06,
+      "loss": 0.6464,
+      "step": 2270
+    },
+    {
+      "epoch": 1.9676375404530746,
+      "grad_norm": 6.037535667419434,
+      "learning_rate": 6.0031580555727005e-06,
+      "loss": 0.6598,
+      "step": 2280
+    },
+    {
+      "epoch": 1.9762675296655878,
+      "grad_norm": 11.901267051696777,
+      "learning_rate": 5.9662091572779325e-06,
+      "loss": 0.6292,
+      "step": 2290
+    },
+    {
+      "epoch": 1.9848975188781015,
+      "grad_norm": 7.471567153930664,
+      "learning_rate": 5.929205366431679e-06,
+      "loss": 0.6969,
+      "step": 2300
+    },
+    {
+      "epoch": 1.9848975188781015,
+      "eval_accuracy": 0.6368932038834951,
+      "eval_loss": 0.6725260019302368,
+      "eval_runtime": 320.5652,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 1.607,
+      "step": 2300
+    },
+    {
+      "epoch": 1.9935275080906147,
+      "grad_norm": 4.360079765319824,
+      "learning_rate": 5.892148785303905e-06,
+      "loss": 0.6386,
+      "step": 2310
+    },
+    {
+      "epoch": 2.0021574973031284,
+      "grad_norm": 7.370548725128174,
+      "learning_rate": 5.855041519163718e-06,
+      "loss": 0.5936,
+      "step": 2320
+    },
+    {
+      "epoch": 2.0107874865156417,
+      "grad_norm": 11.645364761352539,
+      "learning_rate": 5.817885676159754e-06,
+      "loss": 0.7021,
+      "step": 2330
+    },
+    {
+      "epoch": 2.0194174757281553,
+      "grad_norm": 9.975643157958984,
+      "learning_rate": 5.78068336720041e-06,
+      "loss": 0.62,
+      "step": 2340
+    },
+    {
+      "epoch": 2.028047464940669,
+      "grad_norm": 8.763169288635254,
+      "learning_rate": 5.743436705833922e-06,
+      "loss": 0.6492,
+      "step": 2350
+    },
+    {
+      "epoch": 2.028047464940669,
+      "eval_accuracy": 0.6485436893203883,
+      "eval_loss": 0.656815767288208,
+      "eval_runtime": 320.6788,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2350
+    },
+    {
+      "epoch": 2.0366774541531822,
+      "grad_norm": 6.766859531402588,
+      "learning_rate": 5.706147808128288e-06,
+      "loss": 0.6385,
+      "step": 2360
+    },
+    {
+      "epoch": 2.045307443365696,
+      "grad_norm": 7.149226665496826,
+      "learning_rate": 5.668818792551052e-06,
+      "loss": 0.5838,
+      "step": 2370
+    },
+    {
+      "epoch": 2.053937432578209,
+      "grad_norm": 6.320857048034668,
+      "learning_rate": 5.6314517798489395e-06,
+      "loss": 0.655,
+      "step": 2380
+    },
+    {
+      "epoch": 2.062567421790723,
+      "grad_norm": 12.915064811706543,
+      "learning_rate": 5.594048892927382e-06,
+      "loss": 0.7095,
+      "step": 2390
+    },
+    {
+      "epoch": 2.071197411003236,
+      "grad_norm": 7.46158504486084,
+      "learning_rate": 5.556612256729909e-06,
+      "loss": 0.6572,
+      "step": 2400
+    },
+    {
+      "epoch": 2.071197411003236,
+      "eval_accuracy": 0.6446601941747573,
+      "eval_loss": 0.669795036315918,
+      "eval_runtime": 320.7237,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2400
+    },
+    {
+      "epoch": 2.0798274002157497,
+      "grad_norm": 9.09875202178955,
+      "learning_rate": 5.519143998117424e-06,
+      "loss": 0.6518,
+      "step": 2410
+    },
+    {
+      "epoch": 2.0884573894282634,
+      "grad_norm": 9.286842346191406,
+      "learning_rate": 5.48164624574737e-06,
+      "loss": 0.6492,
+      "step": 2420
+    },
+    {
+      "epoch": 2.0970873786407767,
+      "grad_norm": 5.891538143157959,
+      "learning_rate": 5.444121129952799e-06,
+      "loss": 0.648,
+      "step": 2430
+    },
+    {
+      "epoch": 2.1057173678532903,
+      "grad_norm": 11.724071502685547,
+      "learning_rate": 5.406570782621341e-06,
+      "loss": 0.6533,
+      "step": 2440
+    },
+    {
+      "epoch": 2.1143473570658036,
+      "grad_norm": 8.159801483154297,
+      "learning_rate": 5.368997337074088e-06,
+      "loss": 0.6204,
+      "step": 2450
+    },
+    {
+      "epoch": 2.1143473570658036,
+      "eval_accuracy": 0.654368932038835,
+      "eval_loss": 0.6549546122550964,
+      "eval_runtime": 320.7153,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2450
+    },
+    {
+      "epoch": 2.1229773462783172,
+      "grad_norm": 8.30516529083252,
+      "learning_rate": 5.331402927944392e-06,
+      "loss": 0.5746,
+      "step": 2460
+    },
+    {
+      "epoch": 2.1316073354908305,
+      "grad_norm": 6.368971824645996,
+      "learning_rate": 5.293789691056601e-06,
+      "loss": 0.6352,
+      "step": 2470
+    },
+    {
+      "epoch": 2.140237324703344,
+      "grad_norm": 18.369422912597656,
+      "learning_rate": 5.256159763304703e-06,
+      "loss": 0.6815,
+      "step": 2480
+    },
+    {
+      "epoch": 2.148867313915858,
+      "grad_norm": 7.470778465270996,
+      "learning_rate": 5.218515282530934e-06,
+      "loss": 0.5849,
+      "step": 2490
+    },
+    {
+      "epoch": 2.157497303128371,
+      "grad_norm": 8.369938850402832,
+      "learning_rate": 5.180858387404325e-06,
+      "loss": 0.6479,
+      "step": 2500
+    },
+    {
+      "epoch": 2.157497303128371,
+      "eval_accuracy": 0.6446601941747573,
+      "eval_loss": 0.6610180735588074,
+      "eval_runtime": 320.6988,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2500
+    },
+    {
+      "epoch": 2.1661272923408847,
+      "grad_norm": 12.514945030212402,
+      "learning_rate": 5.143191217299189e-06,
+      "loss": 0.5588,
+      "step": 2510
+    },
+    {
+      "epoch": 2.174757281553398,
+      "grad_norm": 10.213220596313477,
+      "learning_rate": 5.10551591217359e-06,
+      "loss": 0.6862,
+      "step": 2520
+    },
+    {
+      "epoch": 2.1833872707659117,
+      "grad_norm": 10.838960647583008,
+      "learning_rate": 5.067834612447755e-06,
+      "loss": 0.6218,
+      "step": 2530
+    },
+    {
+      "epoch": 2.192017259978425,
+      "grad_norm": 8.767598152160645,
+      "learning_rate": 5.0301494588824795e-06,
+      "loss": 0.5711,
+      "step": 2540
+    },
+    {
+      "epoch": 2.2006472491909386,
+      "grad_norm": 6.138967514038086,
+      "learning_rate": 4.9924625924575095e-06,
+      "loss": 0.6954,
+      "step": 2550
+    },
+    {
+      "epoch": 2.2006472491909386,
+      "eval_accuracy": 0.6679611650485436,
+      "eval_loss": 0.6637104153633118,
+      "eval_runtime": 320.7599,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2550
+    },
+    {
+      "epoch": 2.209277238403452,
+      "grad_norm": 10.984577178955078,
+      "learning_rate": 4.954776154249896e-06,
+      "loss": 0.6567,
+      "step": 2560
+    },
+    {
+      "epoch": 2.2179072276159655,
+      "grad_norm": 8.720921516418457,
+      "learning_rate": 4.9170922853123635e-06,
+      "loss": 0.6283,
+      "step": 2570
+    },
+    {
+      "epoch": 2.226537216828479,
+      "grad_norm": 10.784737586975098,
+      "learning_rate": 4.879413126551675e-06,
+      "loss": 0.6072,
+      "step": 2580
+    },
+    {
+      "epoch": 2.2351672060409924,
+      "grad_norm": 6.139902114868164,
+      "learning_rate": 4.84174081860699e-06,
+      "loss": 0.5966,
+      "step": 2590
+    },
+    {
+      "epoch": 2.243797195253506,
+      "grad_norm": 7.9166083335876465,
+      "learning_rate": 4.8040775017282644e-06,
+      "loss": 0.5668,
+      "step": 2600
+    },
+    {
+      "epoch": 2.243797195253506,
+      "eval_accuracy": 0.658252427184466,
+      "eval_loss": 0.6660070419311523,
+      "eval_runtime": 320.7212,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2600
+    },
+    {
+      "epoch": 2.2524271844660193,
+      "grad_norm": 6.704747676849365,
+      "learning_rate": 4.766425315654648e-06,
+      "loss": 0.5675,
+      "step": 2610
+    },
+    {
+      "epoch": 2.261057173678533,
+      "grad_norm": 6.141285419464111,
+      "learning_rate": 4.728786399492923e-06,
+      "loss": 0.6543,
+      "step": 2620
+    },
+    {
+      "epoch": 2.269687162891046,
+      "grad_norm": 16.798852920532227,
+      "learning_rate": 4.69116289159598e-06,
+      "loss": 0.5984,
+      "step": 2630
+    },
+    {
+      "epoch": 2.27831715210356,
+      "grad_norm": 7.124361038208008,
+      "learning_rate": 4.653556929441332e-06,
+      "loss": 0.5777,
+      "step": 2640
+    },
+    {
+      "epoch": 2.286947141316073,
+      "grad_norm": 13.590773582458496,
+      "learning_rate": 4.61597064950967e-06,
+      "loss": 0.6185,
+      "step": 2650
+    },
+    {
+      "epoch": 2.286947141316073,
+      "eval_accuracy": 0.6679611650485436,
+      "eval_loss": 0.6793263554573059,
+      "eval_runtime": 320.6049,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2650
+    },
+    {
+      "epoch": 2.295577130528587,
+      "grad_norm": 8.081377983093262,
+      "learning_rate": 4.578406187163503e-06,
+      "loss": 0.5651,
+      "step": 2660
+    },
+    {
+      "epoch": 2.3042071197411005,
+      "grad_norm": 6.233886241912842,
+      "learning_rate": 4.540865676525828e-06,
+      "loss": 0.6087,
+      "step": 2670
+    },
+    {
+      "epoch": 2.3128371089536137,
+      "grad_norm": 5.7994489669799805,
+      "learning_rate": 4.503351250358893e-06,
+      "loss": 0.6153,
+      "step": 2680
+    },
+    {
+      "epoch": 2.3214670981661274,
+      "grad_norm": 21.2513427734375,
+      "learning_rate": 4.465865039943023e-06,
+      "loss": 0.5765,
+      "step": 2690
+    },
+    {
+      "epoch": 2.3300970873786406,
+      "grad_norm": 13.356746673583984,
+      "learning_rate": 4.428409174955548e-06,
+      "loss": 0.5314,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3300970873786406,
+      "eval_accuracy": 0.6718446601941748,
+      "eval_loss": 0.6751753091812134,
+      "eval_runtime": 320.6989,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2700
+    },
+    {
+      "epoch": 2.3387270765911543,
+      "grad_norm": 10.287054061889648,
+      "learning_rate": 4.3909857833498015e-06,
+      "loss": 0.6288,
+      "step": 2710
+    },
+    {
+      "epoch": 2.347357065803668,
+      "grad_norm": 8.844134330749512,
+      "learning_rate": 4.353596991234228e-06,
+      "loss": 0.6502,
+      "step": 2720
+    },
+    {
+      "epoch": 2.355987055016181,
+      "grad_norm": 18.77345848083496,
+      "learning_rate": 4.3162449227516015e-06,
+      "loss": 0.6461,
+      "step": 2730
+    },
+    {
+      "epoch": 2.364617044228695,
+      "grad_norm": 5.465780258178711,
+      "learning_rate": 4.278931699958337e-06,
+      "loss": 0.5786,
+      "step": 2740
+    },
+    {
+      "epoch": 2.373247033441208,
+      "grad_norm": 9.964437484741211,
+      "learning_rate": 4.241659442703937e-06,
+      "loss": 0.6406,
+      "step": 2750
+    },
+    {
+      "epoch": 2.373247033441208,
+      "eval_accuracy": 0.6563106796116505,
+      "eval_loss": 0.6680858731269836,
+      "eval_runtime": 320.7173,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2750
+    },
+    {
+      "epoch": 2.381877022653722,
+      "grad_norm": 16.344274520874023,
+      "learning_rate": 4.2044302685105635e-06,
+      "loss": 0.6201,
+      "step": 2760
+    },
+    {
+      "epoch": 2.390507011866235,
+      "grad_norm": 6.842400074005127,
+      "learning_rate": 4.167246292452724e-06,
+      "loss": 0.5944,
+      "step": 2770
+    },
+    {
+      "epoch": 2.3991370010787487,
+      "grad_norm": 15.446759223937988,
+      "learning_rate": 4.130109627037124e-06,
+      "loss": 0.5883,
+      "step": 2780
+    },
+    {
+      "epoch": 2.407766990291262,
+      "grad_norm": 8.021566390991211,
+      "learning_rate": 4.093022382082639e-06,
+      "loss": 0.6618,
+      "step": 2790
+    },
+    {
+      "epoch": 2.4163969795037756,
+      "grad_norm": 10.198580741882324,
+      "learning_rate": 4.0559866646004546e-06,
+      "loss": 0.7011,
+      "step": 2800
+    },
+    {
+      "epoch": 2.4163969795037756,
+      "eval_accuracy": 0.6679611650485436,
+      "eval_loss": 0.6721732020378113,
+      "eval_runtime": 320.5897,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2800
+    },
+    {
+      "epoch": 2.4250269687162893,
+      "grad_norm": 7.147483825683594,
+      "learning_rate": 4.0190045786743656e-06,
+      "loss": 0.5454,
+      "step": 2810
+    },
+    {
+      "epoch": 2.4336569579288025,
+      "grad_norm": 6.587264060974121,
+      "learning_rate": 3.982078225341232e-06,
+      "loss": 0.5114,
+      "step": 2820
+    },
+    {
+      "epoch": 2.4422869471413162,
+      "grad_norm": 9.162304878234863,
+      "learning_rate": 3.945209702471622e-06,
+      "loss": 0.712,
+      "step": 2830
+    },
+    {
+      "epoch": 2.4509169363538295,
+      "grad_norm": 8.858553886413574,
+      "learning_rate": 3.908401104650621e-06,
+      "loss": 0.6119,
+      "step": 2840
+    },
+    {
+      "epoch": 2.459546925566343,
+      "grad_norm": 7.771361827850342,
+      "learning_rate": 3.871654523058831e-06,
+      "loss": 0.6195,
+      "step": 2850
+    },
+    {
+      "epoch": 2.459546925566343,
+      "eval_accuracy": 0.6757281553398058,
+      "eval_loss": 0.6643590927124023,
+      "eval_runtime": 320.706,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2850
+    },
+    {
+      "epoch": 2.4681769147788564,
+      "grad_norm": 7.508529186248779,
+      "learning_rate": 3.834972045353575e-06,
+      "loss": 0.6087,
+      "step": 2860
+    },
+    {
+      "epoch": 2.47680690399137,
+      "grad_norm": 9.493097305297852,
+      "learning_rate": 3.798355755550292e-06,
+      "loss": 0.6224,
+      "step": 2870
+    },
+    {
+      "epoch": 2.4854368932038833,
+      "grad_norm": 7.044253826141357,
+      "learning_rate": 3.7618077339041244e-06,
+      "loss": 0.6495,
+      "step": 2880
+    },
+    {
+      "epoch": 2.494066882416397,
+      "grad_norm": 6.932374954223633,
+      "learning_rate": 3.725330056791753e-06,
+      "loss": 0.627,
+      "step": 2890
+    },
+    {
+      "epoch": 2.5026968716289106,
+      "grad_norm": 8.32701301574707,
+      "learning_rate": 3.6889247965934195e-06,
+      "loss": 0.6675,
+      "step": 2900
+    },
+    {
+      "epoch": 2.5026968716289106,
+      "eval_accuracy": 0.6601941747572816,
+      "eval_loss": 0.6530495285987854,
+      "eval_runtime": 320.625,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2900
+    },
+    {
+      "epoch": 2.511326860841424,
+      "grad_norm": 7.712283134460449,
+      "learning_rate": 3.6525940215751987e-06,
+      "loss": 0.6522,
+      "step": 2910
+    },
+    {
+      "epoch": 2.5199568500539375,
+      "grad_norm": 8.3215913772583,
+      "learning_rate": 3.6163397957714895e-06,
+      "loss": 0.6759,
+      "step": 2920
+    },
+    {
+      "epoch": 2.528586839266451,
+      "grad_norm": 6.627832412719727,
+      "learning_rate": 3.5801641788677576e-06,
+      "loss": 0.6035,
+      "step": 2930
+    },
+    {
+      "epoch": 2.5372168284789645,
+      "grad_norm": 11.45533561706543,
+      "learning_rate": 3.5440692260835162e-06,
+      "loss": 0.6256,
+      "step": 2940
+    },
+    {
+      "epoch": 2.545846817691478,
+      "grad_norm": 6.252264499664307,
+      "learning_rate": 3.508056988055564e-06,
+      "loss": 0.5796,
+      "step": 2950
+    },
+    {
+      "epoch": 2.545846817691478,
+      "eval_accuracy": 0.6601941747572816,
+      "eval_loss": 0.6489056348800659,
+      "eval_runtime": 320.6022,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 2950
+    },
+    {
+      "epoch": 2.5544768069039914,
+      "grad_norm": 10.386983871459961,
+      "learning_rate": 3.4721295107214835e-06,
+      "loss": 0.4864,
+      "step": 2960
+    },
+    {
+      "epoch": 2.5631067961165046,
+      "grad_norm": 8.145389556884766,
+      "learning_rate": 3.4362888352034153e-06,
+      "loss": 0.6728,
+      "step": 2970
+    },
+    {
+      "epoch": 2.5717367853290183,
+      "grad_norm": 6.486176013946533,
+      "learning_rate": 3.4005369976920837e-06,
+      "loss": 0.6055,
+      "step": 2980
+    },
+    {
+      "epoch": 2.580366774541532,
+      "grad_norm": 10.21779727935791,
+      "learning_rate": 3.3648760293311267e-06,
+      "loss": 0.6123,
+      "step": 2990
+    },
+    {
+      "epoch": 2.588996763754045,
+      "grad_norm": 8.619269371032715,
+      "learning_rate": 3.3293079561016957e-06,
+      "loss": 0.6148,
+      "step": 3000
+    },
+    {
+      "epoch": 2.588996763754045,
+      "eval_accuracy": 0.6679611650485436,
+      "eval_loss": 0.6675190329551697,
+      "eval_runtime": 320.4804,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 1.607,
+      "step": 3000
+    },
+    {
+      "epoch": 2.597626752966559,
+      "grad_norm": 14.024328231811523,
+      "learning_rate": 3.2938347987073576e-06,
+      "loss": 0.6054,
+      "step": 3010
+    },
+    {
+      "epoch": 2.606256742179072,
+      "grad_norm": 13.966845512390137,
+      "learning_rate": 3.2584585724592967e-06,
+      "loss": 0.5767,
+      "step": 3020
+    },
+    {
+      "epoch": 2.614886731391586,
+      "grad_norm": 6.929962635040283,
+      "learning_rate": 3.223181287161812e-06,
+      "loss": 0.5214,
+      "step": 3030
+    },
+    {
+      "epoch": 2.6235167206040995,
+      "grad_norm": 9.28740406036377,
+      "learning_rate": 3.1880049469981468e-06,
+      "loss": 0.5823,
+      "step": 3040
+    },
+    {
+      "epoch": 2.6321467098166127,
+      "grad_norm": 22.37981414794922,
+      "learning_rate": 3.1529315504166147e-06,
+      "loss": 0.6293,
+      "step": 3050
+    },
+    {
+      "epoch": 2.6321467098166127,
+      "eval_accuracy": 0.6368932038834951,
+      "eval_loss": 0.6685478091239929,
+      "eval_runtime": 321.0635,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 3050
+    },
+    {
+      "epoch": 2.6407766990291264,
+      "grad_norm": 17.161617279052734,
+      "learning_rate": 3.117963090017071e-06,
+      "loss": 0.5728,
+      "step": 3060
+    },
+    {
+      "epoch": 2.6494066882416396,
+      "grad_norm": 19.009254455566406,
+      "learning_rate": 3.08310155243771e-06,
+      "loss": 0.7621,
+      "step": 3070
+    },
+    {
+      "epoch": 2.6580366774541533,
+      "grad_norm": 12.797933578491211,
+      "learning_rate": 3.048348918242191e-06,
+      "loss": 0.5567,
+      "step": 3080
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 10.396708488464355,
+      "learning_rate": 3.013707161807128e-06,
+      "loss": 0.6592,
+      "step": 3090
+    },
+    {
+      "epoch": 2.67529665587918,
+      "grad_norm": 8.590036392211914,
+      "learning_rate": 2.9791782512099098e-06,
+      "loss": 0.6095,
+      "step": 3100
+    },
+    {
+      "epoch": 2.67529665587918,
+      "eval_accuracy": 0.6621359223300971,
+      "eval_loss": 0.6717608571052551,
+      "eval_runtime": 321.0303,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 3100
+    },
+    {
+      "epoch": 2.6839266450916934,
+      "grad_norm": 11.886474609375,
+      "learning_rate": 2.944764148116902e-06,
+      "loss": 0.4862,
+      "step": 3110
+    },
+    {
+      "epoch": 2.692556634304207,
+      "grad_norm": 15.282882690429688,
+      "learning_rate": 2.9104668076719876e-06,
+      "loss": 0.5833,
+      "step": 3120
+    },
+    {
+      "epoch": 2.701186623516721,
+      "grad_norm": 15.11883544921875,
+      "learning_rate": 2.8762881783855025e-06,
+      "loss": 0.5887,
+      "step": 3130
+    },
+    {
+      "epoch": 2.709816612729234,
+      "grad_norm": 9.773431777954102,
+      "learning_rate": 2.8422302020235252e-06,
+      "loss": 0.6644,
+      "step": 3140
+    },
+    {
+      "epoch": 2.7184466019417477,
+      "grad_norm": 16.19442367553711,
+      "learning_rate": 2.808294813497563e-06,
+      "loss": 0.5422,
+      "step": 3150
+    },
+    {
+      "epoch": 2.7184466019417477,
+      "eval_accuracy": 0.6485436893203883,
+      "eval_loss": 0.6904874444007874,
+      "eval_runtime": 321.1401,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 3150
+    },
+    {
+      "epoch": 2.727076591154261,
+      "grad_norm": 16.843564987182617,
+      "learning_rate": 2.7744839407546374e-06,
+      "loss": 0.6523,
+      "step": 3160
+    },
+    {
+      "epoch": 2.7357065803667746,
+      "grad_norm": 18.18024253845215,
+      "learning_rate": 2.7407995046677377e-06,
+      "loss": 0.5283,
+      "step": 3170
+    },
+    {
+      "epoch": 2.7443365695792883,
+      "grad_norm": 20.41519546508789,
+      "learning_rate": 2.7072434189266945e-06,
+      "loss": 0.5934,
+      "step": 3180
+    },
+    {
+      "epoch": 2.7529665587918015,
+      "grad_norm": 14.765863418579102,
+      "learning_rate": 2.6738175899294703e-06,
+      "loss": 0.6699,
+      "step": 3190
+    },
+    {
+      "epoch": 2.7615965480043148,
+      "grad_norm": 17.99534034729004,
+      "learning_rate": 2.640523916673838e-06,
+      "loss": 0.6089,
+      "step": 3200
+    },
+    {
+      "epoch": 2.7615965480043148,
+      "eval_accuracy": 0.654368932038835,
+      "eval_loss": 0.6814106106758118,
+      "eval_runtime": 321.1084,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 3200
+    },
+    {
+      "epoch": 2.7702265372168284,
+      "grad_norm": 5.104621887207031,
+      "learning_rate": 2.607364290649501e-06,
+      "loss": 0.6884,
+      "step": 3210
+    },
+    {
+      "epoch": 2.778856526429342,
+      "grad_norm": 17.406665802001953,
+      "learning_rate": 2.574340595730633e-06,
+      "loss": 0.6264,
+      "step": 3220
+    },
+    {
+      "epoch": 2.7874865156418553,
+      "grad_norm": 8.697972297668457,
+      "learning_rate": 2.541454708068855e-06,
+      "loss": 0.5552,
+      "step": 3230
+    },
+    {
+      "epoch": 2.796116504854369,
+      "grad_norm": 7.472986698150635,
+      "learning_rate": 2.5087084959866403e-06,
+      "loss": 0.596,
+      "step": 3240
+    },
+    {
+      "epoch": 2.8047464940668823,
+      "grad_norm": 11.333291053771973,
+      "learning_rate": 2.476103819871166e-06,
+      "loss": 0.6238,
+      "step": 3250
+    },
+    {
+      "epoch": 2.8047464940668823,
+      "eval_accuracy": 0.6466019417475728,
+      "eval_loss": 0.6738768815994263,
+      "eval_runtime": 321.0019,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 3250
+    },
+    {
+      "epoch": 2.813376483279396,
+      "grad_norm": 15.323911666870117,
+      "learning_rate": 2.44364253206864e-06,
+      "loss": 0.6472,
+      "step": 3260
+    },
+    {
+      "epoch": 2.8220064724919096,
+      "grad_norm": 14.362588882446289,
+      "learning_rate": 2.4113264767790433e-06,
+      "loss": 0.6375,
+      "step": 3270
+    },
+    {
+      "epoch": 2.830636461704423,
+      "grad_norm": 11.027913093566895,
+      "learning_rate": 2.379157489951367e-06,
+      "loss": 0.6185,
+      "step": 3280
+    },
+    {
+      "epoch": 2.839266450916936,
+      "grad_norm": 8.004063606262207,
+      "learning_rate": 2.3471373991793116e-06,
+      "loss": 0.6608,
+      "step": 3290
+    },
+    {
+      "epoch": 2.8478964401294498,
+      "grad_norm": 11.401987075805664,
+      "learning_rate": 2.315268023597447e-06,
+      "loss": 0.7386,
+      "step": 3300
+    },
+    {
+      "epoch": 2.8478964401294498,
+      "eval_accuracy": 0.6485436893203883,
+      "eval_loss": 0.6621807813644409,
+      "eval_runtime": 321.0895,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 3300
+    },
+    {
+      "epoch": 2.8565264293419634,
+      "grad_norm": 11.381020545959473,
+      "learning_rate": 2.2835511737778687e-06,
+      "loss": 0.5386,
+      "step": 3310
+    },
+    {
+      "epoch": 2.8651564185544767,
+      "grad_norm": 14.900254249572754,
+      "learning_rate": 2.2519886516273365e-06,
+      "loss": 0.6754,
+      "step": 3320
+    },
+    {
+      "epoch": 2.8737864077669903,
+      "grad_norm": 10.069350242614746,
+      "learning_rate": 2.220582250284905e-06,
+      "loss": 0.6129,
+      "step": 3330
+    },
+    {
+      "epoch": 2.8824163969795036,
+      "grad_norm": 8.782756805419922,
+      "learning_rate": 2.189333754020046e-06,
+      "loss": 0.6185,
+      "step": 3340
+    },
+    {
+      "epoch": 2.8910463861920173,
+      "grad_norm": 8.9526948928833,
+      "learning_rate": 2.158244938131277e-06,
+      "loss": 0.6166,
+      "step": 3350
+    },
+    {
+      "epoch": 2.8910463861920173,
+      "eval_accuracy": 0.654368932038835,
+      "eval_loss": 0.6567447781562805,
+      "eval_runtime": 320.6468,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3350
+    },
+    {
+      "epoch": 2.899676375404531,
+      "grad_norm": 6.0573625564575195,
+      "learning_rate": 2.12731756884532e-06,
+      "loss": 0.6601,
+      "step": 3360
+    },
+    {
+      "epoch": 2.908306364617044,
+      "grad_norm": 15.11607837677002,
+      "learning_rate": 2.096553403216739e-06,
+      "loss": 0.7397,
+      "step": 3370
+    },
+    {
+      "epoch": 2.916936353829558,
+      "grad_norm": 7.567427635192871,
+      "learning_rate": 2.0659541890281236e-06,
+      "loss": 0.5167,
+      "step": 3380
+    },
+    {
+      "epoch": 2.925566343042071,
+      "grad_norm": 11.045202255249023,
+      "learning_rate": 2.0355216646908016e-06,
+      "loss": 0.6497,
+      "step": 3390
+    },
+    {
+      "epoch": 2.9341963322545848,
+      "grad_norm": 14.782462120056152,
+      "learning_rate": 2.0052575591460636e-06,
+      "loss": 0.5866,
+      "step": 3400
+    },
+    {
+      "epoch": 2.9341963322545848,
+      "eval_accuracy": 0.6504854368932039,
+      "eval_loss": 0.6615984439849854,
+      "eval_runtime": 320.6259,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3400
+    },
+    {
+      "epoch": 2.9428263214670984,
+      "grad_norm": 5.701985836029053,
+      "learning_rate": 1.975163591766946e-06,
+      "loss": 0.6723,
+      "step": 3410
+    },
+    {
+      "epoch": 2.9514563106796117,
+      "grad_norm": 10.19908618927002,
+      "learning_rate": 1.9452414722605432e-06,
+      "loss": 0.592,
+      "step": 3420
+    },
+    {
+      "epoch": 2.960086299892125,
+      "grad_norm": 8.34867000579834,
+      "learning_rate": 1.915492900570887e-06,
+      "loss": 0.6623,
+      "step": 3430
+    },
+    {
+      "epoch": 2.9687162891046386,
+      "grad_norm": 14.363434791564941,
+      "learning_rate": 1.885919566782352e-06,
+      "loss": 0.6295,
+      "step": 3440
+    },
+    {
+      "epoch": 2.9773462783171523,
+      "grad_norm": 9.90467357635498,
+      "learning_rate": 1.8565231510236531e-06,
+      "loss": 0.6348,
+      "step": 3450
+    },
+    {
+      "epoch": 2.9773462783171523,
+      "eval_accuracy": 0.6563106796116505,
+      "eval_loss": 0.6633828282356262,
+      "eval_runtime": 320.6481,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3450
+    },
+    {
+      "epoch": 2.9859762675296655,
+      "grad_norm": 13.353963851928711,
+      "learning_rate": 1.8273053233723843e-06,
+      "loss": 0.5338,
+      "step": 3460
+    },
+    {
+      "epoch": 2.994606256742179,
+      "grad_norm": 14.00833797454834,
+      "learning_rate": 1.798267743760142e-06,
+      "loss": 0.633,
+      "step": 3470
+    },
+    {
+      "epoch": 3.0032362459546924,
+      "grad_norm": 14.501118659973145,
+      "learning_rate": 1.7694120618782169e-06,
+      "loss": 0.5085,
+      "step": 3480
+    },
+    {
+      "epoch": 3.011866235167206,
+      "grad_norm": 9.27495002746582,
+      "learning_rate": 1.7407399170838802e-06,
+      "loss": 0.5477,
+      "step": 3490
+    },
+    {
+      "epoch": 3.0204962243797193,
+      "grad_norm": 12.652294158935547,
+      "learning_rate": 1.7122529383072346e-06,
+      "loss": 0.5907,
+      "step": 3500
+    },
+    {
+      "epoch": 3.0204962243797193,
+      "eval_accuracy": 0.658252427184466,
+      "eval_loss": 0.6642096042633057,
+      "eval_runtime": 320.7217,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3500
+    },
+    {
+      "epoch": 3.029126213592233,
+      "grad_norm": 12.352764129638672,
+      "learning_rate": 1.68395274395868e-06,
+      "loss": 0.5256,
+      "step": 3510
+    },
+    {
+      "epoch": 3.0377562028047467,
+      "grad_norm": 6.0259222984313965,
+      "learning_rate": 1.6558409418369686e-06,
+      "loss": 0.4449,
+      "step": 3520
+    },
+    {
+      "epoch": 3.04638619201726,
+      "grad_norm": 4.154427528381348,
+      "learning_rate": 1.6279191290378566e-06,
+      "loss": 0.449,
+      "step": 3530
+    },
+    {
+      "epoch": 3.0550161812297736,
+      "grad_norm": 12.186491012573242,
+      "learning_rate": 1.6001888918633728e-06,
+      "loss": 0.4746,
+      "step": 3540
+    },
+    {
+      "epoch": 3.063646170442287,
+      "grad_norm": 9.144371032714844,
+      "learning_rate": 1.5726518057316969e-06,
+      "loss": 0.4985,
+      "step": 3550
+    },
+    {
+      "epoch": 3.063646170442287,
+      "eval_accuracy": 0.654368932038835,
+      "eval_loss": 0.6903661489486694,
+      "eval_runtime": 320.6325,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3550
+    },
+    {
+      "epoch": 3.0722761596548005,
+      "grad_norm": 14.253432273864746,
+      "learning_rate": 1.5453094350876563e-06,
+      "loss": 0.5309,
+      "step": 3560
+    },
+    {
+      "epoch": 3.0809061488673137,
+      "grad_norm": 14.948261260986328,
+      "learning_rate": 1.5181633333138456e-06,
+      "loss": 0.5263,
+      "step": 3570
+    },
+    {
+      "epoch": 3.0895361380798274,
+      "grad_norm": 9.058218955993652,
+      "learning_rate": 1.4912150426423766e-06,
+      "loss": 0.5077,
+      "step": 3580
+    },
+    {
+      "epoch": 3.098166127292341,
+      "grad_norm": 17.286836624145508,
+      "learning_rate": 1.4644660940672628e-06,
+      "loss": 0.5556,
+      "step": 3590
+    },
+    {
+      "epoch": 3.1067961165048543,
+      "grad_norm": 9.762429237365723,
+      "learning_rate": 1.4379180072574335e-06,
+      "loss": 0.53,
+      "step": 3600
+    },
+    {
+      "epoch": 3.1067961165048543,
+      "eval_accuracy": 0.6466019417475728,
+      "eval_loss": 0.6925872564315796,
+      "eval_runtime": 320.6091,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3600
+    },
+    {
+      "epoch": 3.115426105717368,
+      "grad_norm": 15.105671882629395,
+      "learning_rate": 1.411572290470401e-06,
+      "loss": 0.5956,
+      "step": 3610
+    },
+    {
+      "epoch": 3.1240560949298812,
+      "grad_norm": 13.916862487792969,
+      "learning_rate": 1.3854304404665796e-06,
+      "loss": 0.5019,
+      "step": 3620
+    },
+    {
+      "epoch": 3.132686084142395,
+      "grad_norm": 14.544822692871094,
+      "learning_rate": 1.359493942424241e-06,
+      "loss": 0.5761,
+      "step": 3630
+    },
+    {
+      "epoch": 3.141316073354908,
+      "grad_norm": 15.535740852355957,
+      "learning_rate": 1.3337642698551428e-06,
+      "loss": 0.4957,
+      "step": 3640
+    },
+    {
+      "epoch": 3.149946062567422,
+      "grad_norm": 13.230164527893066,
+      "learning_rate": 1.3082428845208155e-06,
+      "loss": 0.5728,
+      "step": 3650
+    },
+    {
+      "epoch": 3.149946062567422,
+      "eval_accuracy": 0.654368932038835,
+      "eval_loss": 0.6939272880554199,
+      "eval_runtime": 320.6286,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3650
+    },
+    {
+      "epoch": 3.158576051779935,
+      "grad_norm": 11.026480674743652,
+      "learning_rate": 1.2829312363495155e-06,
+      "loss": 0.5602,
+      "step": 3660
+    },
+    {
+      "epoch": 3.1672060409924487,
+      "grad_norm": 10.449764251708984,
+      "learning_rate": 1.2578307633538505e-06,
+      "loss": 0.6031,
+      "step": 3670
+    },
+    {
+      "epoch": 3.1758360302049624,
+      "grad_norm": 13.517521858215332,
+      "learning_rate": 1.232942891549083e-06,
+      "loss": 0.6053,
+      "step": 3680
+    },
+    {
+      "epoch": 3.1844660194174756,
+      "grad_norm": 10.760808944702148,
+      "learning_rate": 1.2082690348721204e-06,
+      "loss": 0.5024,
+      "step": 3690
+    },
+    {
+      "epoch": 3.1930960086299893,
+      "grad_norm": 14.012762069702148,
+      "learning_rate": 1.1838105951011758e-06,
+      "loss": 0.5011,
+      "step": 3700
+    },
+    {
+      "epoch": 3.1930960086299893,
+      "eval_accuracy": 0.6601941747572816,
+      "eval_loss": 0.6916132569313049,
+      "eval_runtime": 320.6627,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3700
+    },
+    {
+      "epoch": 3.2017259978425026,
+      "grad_norm": 11.190227508544922,
+      "learning_rate": 1.1595689617761363e-06,
+      "loss": 0.4906,
+      "step": 3710
+    },
+    {
+      "epoch": 3.2103559870550162,
+      "grad_norm": 17.964550018310547,
+      "learning_rate": 1.1355455121196234e-06,
+      "loss": 0.5705,
+      "step": 3720
+    },
+    {
+      "epoch": 3.2189859762675295,
+      "grad_norm": 21.885299682617188,
+      "learning_rate": 1.1117416109587403e-06,
+      "loss": 0.6581,
+      "step": 3730
+    },
+    {
+      "epoch": 3.227615965480043,
+      "grad_norm": 10.283282279968262,
+      "learning_rate": 1.0881586106475406e-06,
+      "loss": 0.6133,
+      "step": 3740
+    },
+    {
+      "epoch": 3.236245954692557,
+      "grad_norm": 8.597122192382812,
+      "learning_rate": 1.0647978509901946e-06,
+      "loss": 0.4987,
+      "step": 3750
+    },
+    {
+      "epoch": 3.236245954692557,
+      "eval_accuracy": 0.654368932038835,
+      "eval_loss": 0.6906397938728333,
+      "eval_runtime": 320.6953,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3750
+    },
+    {
+      "epoch": 3.24487594390507,
+      "grad_norm": 10.815213203430176,
+      "learning_rate": 1.0416606591648737e-06,
+      "loss": 0.6638,
+      "step": 3760
+    },
+    {
+      "epoch": 3.2535059331175837,
+      "grad_norm": 7.768321990966797,
+      "learning_rate": 1.018748349648348e-06,
+      "loss": 0.5556,
+      "step": 3770
+    },
+    {
+      "epoch": 3.262135922330097,
+      "grad_norm": 11.6558837890625,
+      "learning_rate": 9.960622241413137e-07,
+      "loss": 0.5817,
+      "step": 3780
+    },
+    {
+      "epoch": 3.2707659115426106,
+      "grad_norm": 14.339502334594727,
+      "learning_rate": 9.736035714944314e-07,
+      "loss": 0.5237,
+      "step": 3790
+    },
+    {
+      "epoch": 3.279395900755124,
+      "grad_norm": 15.16897964477539,
+      "learning_rate": 9.513736676351104e-07,
+      "loss": 0.5909,
+      "step": 3800
+    },
+    {
+      "epoch": 3.279395900755124,
+      "eval_accuracy": 0.658252427184466,
+      "eval_loss": 0.6882277727127075,
+      "eval_runtime": 320.663,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3800
+    },
+    {
+      "epoch": 3.2880258899676376,
+      "grad_norm": 13.602522850036621,
+      "learning_rate": 9.293737754950166e-07,
+      "loss": 0.5828,
+      "step": 3810
+    },
+    {
+      "epoch": 3.2966558791801512,
+      "grad_norm": 17.136140823364258,
+      "learning_rate": 9.076051449383294e-07,
+      "loss": 0.6515,
+      "step": 3820
+    },
+    {
+      "epoch": 3.3052858683926645,
+      "grad_norm": 13.352173805236816,
+      "learning_rate": 8.860690126907229e-07,
+      "loss": 0.5751,
+      "step": 3830
+    },
+    {
+      "epoch": 3.313915857605178,
+      "grad_norm": 21.102169036865234,
+      "learning_rate": 8.64766602269112e-07,
+      "loss": 0.6061,
+      "step": 3840
+    },
+    {
+      "epoch": 3.3225458468176914,
+      "grad_norm": 23.22005844116211,
+      "learning_rate": 8.436991239121451e-07,
+      "loss": 0.5194,
+      "step": 3850
+    },
+    {
+      "epoch": 3.3225458468176914,
+      "eval_accuracy": 0.6524271844660194,
+      "eval_loss": 0.6874131560325623,
+      "eval_runtime": 320.7489,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 3850
+    },
+    {
+      "epoch": 3.331175836030205,
+      "grad_norm": 8.979095458984375,
+      "learning_rate": 8.22867774511435e-07,
+      "loss": 0.5395,
+      "step": 3860
+    },
+    {
+      "epoch": 3.3398058252427183,
+      "grad_norm": 9.126049041748047,
+      "learning_rate": 8.022737375435735e-07,
+      "loss": 0.566,
+      "step": 3870
+    },
+    {
+      "epoch": 3.348435814455232,
+      "grad_norm": 8.811643600463867,
+      "learning_rate": 7.81918183002891e-07,
+      "loss": 0.5703,
+      "step": 3880
+    },
+    {
+      "epoch": 3.357065803667745,
+      "grad_norm": 9.9462308883667,
+      "learning_rate": 7.618022673349834e-07,
+      "loss": 0.5318,
+      "step": 3890
+    },
+    {
+      "epoch": 3.365695792880259,
+      "grad_norm": 15.365378379821777,
+      "learning_rate": 7.419271333710154e-07,
+      "loss": 0.5925,
+      "step": 3900
+    },
+    {
+      "epoch": 3.365695792880259,
+      "eval_accuracy": 0.6601941747572816,
+      "eval_loss": 0.685357391834259,
+      "eval_runtime": 320.5481,
+      "eval_samples_per_second": 1.607,
+      "eval_steps_per_second": 1.607,
+      "step": 3900
+    },
+    {
+      "epoch": 3.3743257820927726,
+      "grad_norm": 13.633624076843262,
+      "learning_rate": 7.222939102627919e-07,
+      "loss": 0.6622,
+      "step": 3910
+    },
+    {
+      "epoch": 3.382955771305286,
+      "grad_norm": 14.377915382385254,
+      "learning_rate": 7.029037134186112e-07,
+      "loss": 0.4916,
+      "step": 3920
+    },
+    {
+      "epoch": 3.3915857605177995,
+      "grad_norm": 11.740239143371582,
+      "learning_rate": 6.837576444398913e-07,
+      "loss": 0.5409,
+      "step": 3930
+    },
+    {
+      "epoch": 3.4002157497303127,
+      "grad_norm": 10.254107475280762,
+      "learning_rate": 6.648567910585874e-07,
+      "loss": 0.6555,
+      "step": 3940
+    },
+    {
+      "epoch": 3.4088457389428264,
+      "grad_norm": 16.456100463867188,
+      "learning_rate": 6.46202227075401e-07,
+      "loss": 0.4709,
+      "step": 3950
+    },
+    {
+      "epoch": 3.4088457389428264,
+      "eval_accuracy": 0.6621359223300971,
+      "eval_loss": 0.6879016160964966,
+      "eval_runtime": 320.8657,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 3950
+    },
+    {
+      "epoch": 3.4174757281553396,
+      "grad_norm": 6.954639911651611,
+      "learning_rate": 6.277950122987631e-07,
+      "loss": 0.542,
+      "step": 3960
+    },
+    {
+      "epoch": 3.4261057173678533,
+      "grad_norm": 16.155237197875977,
+      "learning_rate": 6.096361924846333e-07,
+      "loss": 0.6621,
+      "step": 3970
+    },
+    {
+      "epoch": 3.4347357065803665,
+      "grad_norm": 10.976309776306152,
+      "learning_rate": 5.917267992770881e-07,
+      "loss": 0.5217,
+      "step": 3980
+    },
+    {
+      "epoch": 3.44336569579288,
+      "grad_norm": 17.910186767578125,
+      "learning_rate": 5.740678501497049e-07,
+      "loss": 0.669,
+      "step": 3990
+    },
+    {
+      "epoch": 3.451995685005394,
+      "grad_norm": 16.26474952697754,
+      "learning_rate": 5.566603483477607e-07,
+      "loss": 0.5317,
+      "step": 4000
+    },
+    {
+      "epoch": 3.451995685005394,
+      "eval_accuracy": 0.6601941747572816,
+      "eval_loss": 0.6886419057846069,
+      "eval_runtime": 320.5766,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 4000
+    },
+    {
+      "epoch": 3.460625674217907,
+      "grad_norm": 22.223215103149414,
+      "learning_rate": 5.395052828312359e-07,
+      "loss": 0.5363,
+      "step": 4010
+    },
+    {
+      "epoch": 3.469255663430421,
+      "grad_norm": 8.730759620666504,
+      "learning_rate": 5.226036282186286e-07,
+      "loss": 0.6681,
+      "step": 4020
+    },
+    {
+      "epoch": 3.477885652642934,
+      "grad_norm": 8.632150650024414,
+      "learning_rate": 5.059563447315829e-07,
+      "loss": 0.5089,
+      "step": 4030
+    },
+    {
+      "epoch": 3.4865156418554477,
+      "grad_norm": 9.663848876953125,
+      "learning_rate": 4.895643781403375e-07,
+      "loss": 0.4644,
+      "step": 4040
+    },
+    {
+      "epoch": 3.4951456310679614,
+      "grad_norm": 11.52153205871582,
+      "learning_rate": 4.73428659709998e-07,
+      "loss": 0.5821,
+      "step": 4050
+    },
+    {
+      "epoch": 3.4951456310679614,
+      "eval_accuracy": 0.6660194174757281,
+      "eval_loss": 0.6889378428459167,
+      "eval_runtime": 320.9557,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 4050
+    },
+    {
+      "epoch": 3.5037756202804746,
+      "grad_norm": 17.435976028442383,
+      "learning_rate": 4.575501061476195e-07,
+      "loss": 0.5951,
+      "step": 4060
+    },
+    {
+      "epoch": 3.512405609492988,
+      "grad_norm": 13.329899787902832,
+      "learning_rate": 4.4192961955013766e-07,
+      "loss": 0.5985,
+      "step": 4070
+    },
+    {
+      "epoch": 3.5210355987055015,
+      "grad_norm": 10.234993934631348,
+      "learning_rate": 4.265680873531136e-07,
+      "loss": 0.5232,
+      "step": 4080
+    },
+    {
+      "epoch": 3.529665587918015,
+      "grad_norm": 13.122269630432129,
+      "learning_rate": 4.1146638228031557e-07,
+      "loss": 0.5554,
+      "step": 4090
+    },
+    {
+      "epoch": 3.5382955771305284,
+      "grad_norm": 10.752240180969238,
+      "learning_rate": 3.966253622941385e-07,
+      "loss": 0.5887,
+      "step": 4100
+    },
+    {
+      "epoch": 3.5382955771305284,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6890589594841003,
+      "eval_runtime": 321.1286,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 4100
+    },
+    {
+      "epoch": 3.546925566343042,
+      "grad_norm": 13.36107063293457,
+      "learning_rate": 3.820458705468633e-07,
+      "loss": 0.5101,
+      "step": 4110
+    },
+    {
+      "epoch": 3.5555555555555554,
+      "grad_norm": 11.969443321228027,
+      "learning_rate": 3.677287353327519e-07,
+      "loss": 0.6162,
+      "step": 4120
+    },
+    {
+      "epoch": 3.564185544768069,
+      "grad_norm": 15.6027250289917,
+      "learning_rate": 3.536747700409932e-07,
+      "loss": 0.6591,
+      "step": 4130
+    },
+    {
+      "epoch": 3.5728155339805827,
+      "grad_norm": 10.335657119750977,
+      "learning_rate": 3.3988477310948785e-07,
+      "loss": 0.5749,
+      "step": 4140
+    },
+    {
+      "epoch": 3.581445523193096,
+      "grad_norm": 7.062427043914795,
+      "learning_rate": 3.2635952797949566e-07,
+      "loss": 0.5362,
+      "step": 4150
+    },
+    {
+      "epoch": 3.581445523193096,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6879053711891174,
+      "eval_runtime": 321.1587,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 4150
+    },
+    {
+      "epoch": 3.5900755124056096,
+      "grad_norm": 9.053596496582031,
+      "learning_rate": 3.1309980305111674e-07,
+      "loss": 0.5753,
+      "step": 4160
+    },
+    {
+      "epoch": 3.598705501618123,
+      "grad_norm": 9.732317924499512,
+      "learning_rate": 3.0010635163964186e-07,
+      "loss": 0.5671,
+      "step": 4170
+    },
+    {
+      "epoch": 3.6073354908306365,
+      "grad_norm": 14.350728034973145,
+      "learning_rate": 2.8737991193275805e-07,
+      "loss": 0.525,
+      "step": 4180
+    },
+    {
+      "epoch": 3.61596548004315,
+      "grad_norm": 12.92699146270752,
+      "learning_rate": 2.7492120694860237e-07,
+      "loss": 0.5276,
+      "step": 4190
+    },
+    {
+      "epoch": 3.6245954692556634,
+      "grad_norm": 8.268197059631348,
+      "learning_rate": 2.627309444946929e-07,
+      "loss": 0.4971,
+      "step": 4200
+    },
+    {
+      "epoch": 3.6245954692556634,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6887635588645935,
+      "eval_runtime": 320.9246,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 4200
+    },
+    {
+      "epoch": 3.6332254584681767,
+      "grad_norm": 9.3760404586792,
+      "learning_rate": 2.5080981712771344e-07,
+      "loss": 0.4793,
+      "step": 4210
+    },
+    {
+      "epoch": 3.6418554476806904,
+      "grad_norm": 17.867101669311523,
+      "learning_rate": 2.391585021141668e-07,
+      "loss": 0.4916,
+      "step": 4220
+    },
+    {
+      "epoch": 3.650485436893204,
+      "grad_norm": 9.685575485229492,
+      "learning_rate": 2.2777766139190084e-07,
+      "loss": 0.54,
+      "step": 4230
+    },
+    {
+      "epoch": 3.6591154261057173,
+      "grad_norm": 20.8098201751709,
+      "learning_rate": 2.1666794153249792e-07,
+      "loss": 0.6402,
+      "step": 4240
+    },
+    {
+      "epoch": 3.667745415318231,
+      "grad_norm": 9.999732971191406,
+      "learning_rate": 2.0582997370454882e-07,
+      "loss": 0.5009,
+      "step": 4250
+    },
+    {
+      "epoch": 3.667745415318231,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6899433732032776,
+      "eval_runtime": 321.085,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 4250
+    },
+    {
+      "epoch": 3.676375404530744,
+      "grad_norm": 13.440372467041016,
+      "learning_rate": 1.9526437363778404e-07,
+      "loss": 0.7073,
+      "step": 4260
+    },
+    {
+      "epoch": 3.685005393743258,
+      "grad_norm": 12.25793170928955,
+      "learning_rate": 1.8497174158810361e-07,
+      "loss": 0.6589,
+      "step": 4270
+    },
+    {
+      "epoch": 3.6936353829557715,
+      "grad_norm": 20.834096908569336,
+      "learning_rate": 1.749526623034681e-07,
+      "loss": 0.6127,
+      "step": 4280
+    },
+    {
+      "epoch": 3.7022653721682848,
+      "grad_norm": 14.255398750305176,
+      "learning_rate": 1.6520770499068083e-07,
+      "loss": 0.4761,
+      "step": 4290
+    },
+    {
+      "epoch": 3.710895361380798,
+      "grad_norm": 6.590888977050781,
+      "learning_rate": 1.557374232830483e-07,
+      "loss": 0.5813,
+      "step": 4300
+    },
+    {
+      "epoch": 3.710895361380798,
+      "eval_accuracy": 0.6621359223300971,
+      "eval_loss": 0.6886661648750305,
+      "eval_runtime": 321.0071,
+      "eval_samples_per_second": 1.604,
+      "eval_steps_per_second": 1.604,
+      "step": 4300
+    },
+    {
+      "epoch": 3.7195253505933117,
+      "grad_norm": 7.404444694519043,
+      "learning_rate": 1.4654235520892958e-07,
+      "loss": 0.5689,
+      "step": 4310
+    },
+    {
+      "epoch": 3.7281553398058254,
+      "grad_norm": 18.861854553222656,
+      "learning_rate": 1.3762302316116527e-07,
+      "loss": 0.4723,
+      "step": 4320
+    },
+    {
+      "epoch": 3.7367853290183386,
+      "grad_norm": 20.41657257080078,
+      "learning_rate": 1.289799338674036e-07,
+      "loss": 0.6008,
+      "step": 4330
+    },
+    {
+      "epoch": 3.7454153182308523,
+      "grad_norm": 11.25420093536377,
+      "learning_rate": 1.2061357836131104e-07,
+      "loss": 0.5452,
+      "step": 4340
+    },
+    {
+      "epoch": 3.7540453074433655,
+      "grad_norm": 13.756759643554688,
+      "learning_rate": 1.1252443195467311e-07,
+      "loss": 0.6147,
+      "step": 4350
+    },
+    {
+      "epoch": 3.7540453074433655,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6891469955444336,
+      "eval_runtime": 320.9449,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 4350
+    },
+    {
+      "epoch": 3.762675296655879,
+      "grad_norm": 13.715859413146973,
+      "learning_rate": 1.0471295421039251e-07,
+      "loss": 0.5173,
+      "step": 4360
+    },
+    {
+      "epoch": 3.771305285868393,
+      "grad_norm": 7.733090400695801,
+      "learning_rate": 9.71795889163818e-08,
+      "loss": 0.6093,
+      "step": 4370
+    },
+    {
+      "epoch": 3.779935275080906,
+      "grad_norm": 7.727634429931641,
+      "learning_rate": 8.992476406034845e-08,
+      "loss": 0.5655,
+      "step": 4380
+    },
+    {
+      "epoch": 3.7885652642934198,
+      "grad_norm": 8.828600883483887,
+      "learning_rate": 8.294889180548104e-08,
+      "loss": 0.7,
+      "step": 4390
+    },
+    {
+      "epoch": 3.797195253505933,
+      "grad_norm": 8.170161247253418,
+      "learning_rate": 7.625236846703243e-08,
+      "loss": 0.6033,
+      "step": 4400
+    },
+    {
+      "epoch": 3.797195253505933,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6890521049499512,
+      "eval_runtime": 320.8322,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 4400
+    },
+    {
+      "epoch": 3.8058252427184467,
+      "grad_norm": 10.907033920288086,
+      "learning_rate": 6.983557448980549e-08,
+      "loss": 0.5508,
+      "step": 4410
+    },
+    {
+      "epoch": 3.81445523193096,
+      "grad_norm": 16.888439178466797,
+      "learning_rate": 6.369887442653877e-08,
+      "loss": 0.5819,
+      "step": 4420
+    },
+    {
+      "epoch": 3.8230852211434736,
+      "grad_norm": 20.531522750854492,
+      "learning_rate": 5.7842616917193064e-08,
+      "loss": 0.4267,
+      "step": 4430
+    },
+    {
+      "epoch": 3.831715210355987,
+      "grad_norm": 8.410703659057617,
+      "learning_rate": 5.226713466915001e-08,
+      "loss": 0.5266,
+      "step": 4440
+    },
+    {
+      "epoch": 3.8403451995685005,
+      "grad_norm": 6.310892105102539,
+      "learning_rate": 4.697274443830335e-08,
+      "loss": 0.565,
+      "step": 4450
+    },
+    {
+      "epoch": 3.8403451995685005,
+      "eval_accuracy": 0.6660194174757281,
+      "eval_loss": 0.6890508532524109,
+      "eval_runtime": 320.9035,
+      "eval_samples_per_second": 1.605,
+      "eval_steps_per_second": 1.605,
+      "step": 4450
+    },
+    {
+      "epoch": 3.848975188781014,
+      "grad_norm": 28.219768524169922,
+      "learning_rate": 4.195974701106775e-08,
+      "loss": 0.5493,
+      "step": 4460
+    },
+    {
+      "epoch": 3.8576051779935274,
+      "grad_norm": 19.05866241455078,
+      "learning_rate": 3.722842718728969e-08,
+      "loss": 0.5646,
+      "step": 4470
+    },
+    {
+      "epoch": 3.866235167206041,
+      "grad_norm": 8.093132019042969,
+      "learning_rate": 3.277905376406654e-08,
+      "loss": 0.5774,
+      "step": 4480
+    },
+    {
+      "epoch": 3.8748651564185543,
+      "grad_norm": 10.243422508239746,
+      "learning_rate": 2.8611879520476503e-08,
+      "loss": 0.6114,
+      "step": 4490
+    },
+    {
+      "epoch": 3.883495145631068,
+      "grad_norm": 9.737555503845215,
+      "learning_rate": 2.4727141203216286e-08,
+      "loss": 0.5044,
+      "step": 4500
+    },
+    {
+      "epoch": 3.883495145631068,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6893202662467957,
+      "eval_runtime": 321.2665,
+      "eval_samples_per_second": 1.603,
+      "eval_steps_per_second": 1.603,
+      "step": 4500
+    },
+    {
+      "epoch": 3.8921251348435817,
+      "grad_norm": 15.192139625549316,
+      "learning_rate": 2.1125059513152357e-08,
+      "loss": 0.5512,
+      "step": 4510
+    },
+    {
+      "epoch": 3.900755124056095,
+      "grad_norm": 23.43290901184082,
+      "learning_rate": 1.7805839092781553e-08,
+      "loss": 0.633,
+      "step": 4520
+    },
+    {
+      "epoch": 3.909385113268608,
+      "grad_norm": 13.518702507019043,
+      "learning_rate": 1.4769668514605374e-08,
+      "loss": 0.5216,
+      "step": 4530
+    },
+    {
+      "epoch": 3.918015102481122,
+      "grad_norm": 11.329241752624512,
+      "learning_rate": 1.2016720270417448e-08,
+      "loss": 0.5502,
+      "step": 4540
+    },
+    {
+      "epoch": 3.9266450916936355,
+      "grad_norm": 20.290353775024414,
+      "learning_rate": 9.547150761501922e-09,
+      "loss": 0.613,
+      "step": 4550
+    },
+    {
+      "epoch": 3.9266450916936355,
+      "eval_accuracy": 0.6660194174757281,
+      "eval_loss": 0.68938148021698,
+      "eval_runtime": 320.6069,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 4550
+    },
+    {
+      "epoch": 3.9352750809061487,
+      "grad_norm": 10.623443603515625,
+      "learning_rate": 7.3611002897489015e-09,
+      "loss": 0.5943,
+      "step": 4560
+    },
+    {
+      "epoch": 3.9439050701186624,
+      "grad_norm": 13.714851379394531,
+      "learning_rate": 5.458693049684161e-09,
+      "loss": 0.5628,
+      "step": 4570
+    },
+    {
+      "epoch": 3.9525350593311757,
+      "grad_norm": 20.694622039794922,
+      "learning_rate": 3.8400371214131205e-09,
+      "loss": 0.5538,
+      "step": 4580
+    },
+    {
+      "epoch": 3.9611650485436893,
+      "grad_norm": 14.463215827941895,
+      "learning_rate": 2.5052244644802048e-09,
+      "loss": 0.64,
+      "step": 4590
+    },
+    {
+      "epoch": 3.969795037756203,
+      "grad_norm": 7.637043476104736,
+      "learning_rate": 1.4543309126446858e-09,
+      "loss": 0.4614,
+      "step": 4600
+    },
+    {
+      "epoch": 3.969795037756203,
+      "eval_accuracy": 0.6640776699029126,
+      "eval_loss": 0.6896011829376221,
+      "eval_runtime": 320.6166,
+      "eval_samples_per_second": 1.606,
+      "eval_steps_per_second": 1.606,
+      "step": 4600
+    },
+    {
+      "epoch": 3.9784250269687162,
+      "grad_norm": 12.583084106445312,
+      "learning_rate": 6.874161695719084e-10,
+      "loss": 0.5865,
+      "step": 4610
+    },
+    {
+      "epoch": 3.98705501618123,
+      "grad_norm": 16.6655216217041,
+      "learning_rate": 2.045238054415588e-10,
+      "loss": 0.5533,
+      "step": 4620
+    },
+    {
+      "epoch": 3.995685005393743,
+      "grad_norm": 26.88420867919922,
+      "learning_rate": 5.681254474088072e-12,
+      "loss": 0.6292,
+      "step": 4630
+    },
+    {
+      "epoch": 3.997411003236246,
+      "step": 4632,
+      "total_flos": 0.0,
+      "train_loss": 0.6694142627746947,
+      "train_runtime": 66014.9203,
+      "train_samples_per_second": 0.281,
+      "train_steps_per_second": 0.07
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4632,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}