diff --git "a/checkpoint-2500/trainer_state.json" "b/checkpoint-2500/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-2500/trainer_state.json"
@@ -0,0 +1,4408 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.5125628140703515,
+  "eval_steps": 500,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.004020100502512563,
+      "grad_norm": 0.68359375,
+      "learning_rate": 3.9999999999999996e-05,
+      "loss": 0.6758,
+      "step": 4
+    },
+    {
+      "epoch": 0.008040201005025126,
+      "grad_norm": 0.6328125,
+      "learning_rate": 7.999999999999999e-05,
+      "loss": 0.6607,
+      "step": 8
+    },
+    {
+      "epoch": 0.012060301507537688,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 0.6948,
+      "step": 12
+    },
+    {
+      "epoch": 0.016080402010050253,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00015999999999999999,
+      "loss": 0.6473,
+      "step": 16
+    },
+    {
+      "epoch": 0.020100502512562814,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00019999999999999998,
+      "loss": 0.6259,
+      "step": 20
+    },
+    {
+      "epoch": 0.024120603015075376,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.677,
+      "step": 24
+    },
+    {
+      "epoch": 0.02814070351758794,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.00028,
+      "loss": 0.6508,
+      "step": 28
+    },
+    {
+      "epoch": 0.032160804020100506,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029999966091711776,
+      "loss": 0.649,
+      "step": 32
+    },
+    {
+      "epoch": 0.036180904522613064,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.000299996948263258,
+      "loss": 0.645,
+      "step": 36
+    },
+    {
+      "epoch": 0.04020100502512563,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0002999915230045952,
+      "loss": 0.6616,
+      "step": 40
+    },
+    {
+      "epoch": 0.044221105527638194,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029998338523924196,
+      "loss": 0.6724,
+      "step": 44
+    },
+    {
+      "epoch": 0.04824120603015075,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002999725351143648,
+      "loss": 0.6506,
+      "step": 48
+    },
+    {
+      "epoch": 0.05226130653266332,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029995897282618177,
+      "loss": 0.6499,
+      "step": 52
+    },
+    {
+      "epoch": 0.05628140703517588,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0002999426986199587,
+      "loss": 0.6445,
+      "step": 56
+    },
+    {
+      "epoch": 0.06030150753768844,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.00029992371279000487,
+      "loss": 0.6445,
+      "step": 60
+    },
+    {
+      "epoch": 0.06432160804020101,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0002999020156796676,
+      "loss": 0.6495,
+      "step": 64
+    },
+    {
+      "epoch": 0.06834170854271357,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002998776076813261,
+      "loss": 0.6212,
+      "step": 68
+    },
+    {
+      "epoch": 0.07236180904522613,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002998504892363843,
+      "loss": 0.6622,
+      "step": 72
+    },
+    {
+      "epoch": 0.0763819095477387,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002998206608352632,
+      "loss": 0.6706,
+      "step": 76
+    },
+    {
+      "epoch": 0.08040201005025126,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002997881230173914,
+      "loss": 0.6146,
+      "step": 80
+    },
+    {
+      "epoch": 0.08442211055276382,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00029975287637119585,
+      "loss": 0.6458,
+      "step": 84
+    },
+    {
+      "epoch": 0.08844221105527639,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002997149215340909,
+      "loss": 0.6771,
+      "step": 88
+    },
+    {
+      "epoch": 0.09246231155778895,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.0002996742591924671,
+      "loss": 0.6559,
+      "step": 92
+    },
+    {
+      "epoch": 0.0964824120603015,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00029963089008167856,
+      "loss": 0.6041,
+      "step": 96
+    },
+    {
+      "epoch": 0.10050251256281408,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002995848149860295,
+      "loss": 0.6464,
+      "step": 100
+    },
+    {
+      "epoch": 0.10452261306532663,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002995360347387604,
+      "loss": 0.6513,
+      "step": 104
+    },
+    {
+      "epoch": 0.10854271356783919,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00029948455022203285,
+      "loss": 0.6714,
+      "step": 108
+    },
+    {
+      "epoch": 0.11256281407035176,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00029943036236691333,
+      "loss": 0.623,
+      "step": 112
+    },
+    {
+      "epoch": 0.11658291457286432,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00029937347215335674,
+      "loss": 0.6691,
+      "step": 116
+    },
+    {
+      "epoch": 0.12060301507537688,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029931388061018845,
+      "loss": 0.6512,
+      "step": 120
+    },
+    {
+      "epoch": 0.12462311557788945,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00029925158881508577,
+      "loss": 0.679,
+      "step": 124
+    },
+    {
+      "epoch": 0.12864321608040202,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0002991865978945584,
+      "loss": 0.6164,
+      "step": 128
+    },
+    {
+      "epoch": 0.13266331658291458,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002991189090239282,
+      "loss": 0.6462,
+      "step": 132
+    },
+    {
+      "epoch": 0.13668341708542714,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00029904852342730774,
+      "loss": 0.6567,
+      "step": 136
+    },
+    {
+      "epoch": 0.1407035175879397,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0002989754423775783,
+      "loss": 0.6519,
+      "step": 140
+    },
+    {
+      "epoch": 0.14472361809045226,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00029889966719636677,
+      "loss": 0.6049,
+      "step": 144
+    },
+    {
+      "epoch": 0.1487437185929648,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002988211992540219,
+      "loss": 0.6298,
+      "step": 148
+    },
+    {
+      "epoch": 0.1527638190954774,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002987400399695893,
+      "loss": 0.6358,
+      "step": 152
+    },
+    {
+      "epoch": 0.15678391959798996,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.000298656190810786,
+      "loss": 0.6588,
+      "step": 156
+    },
+    {
+      "epoch": 0.16080402010050251,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029856965329397365,
+      "loss": 0.6834,
+      "step": 160
+    },
+    {
+      "epoch": 0.16482412060301507,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002984804289841313,
+      "loss": 0.6619,
+      "step": 164
+    },
+    {
+      "epoch": 0.16884422110552763,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002983885194948271,
+      "loss": 0.6345,
+      "step": 168
+    },
+    {
+      "epoch": 0.1728643216080402,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.000298293926488189,
+      "loss": 0.6452,
+      "step": 172
+    },
+    {
+      "epoch": 0.17688442211055277,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002981966516748748,
+      "loss": 0.6378,
+      "step": 176
+    },
+    {
+      "epoch": 0.18090452261306533,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029809669681404107,
+      "loss": 0.6233,
+      "step": 180
+    },
+    {
+      "epoch": 0.1849246231155779,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.00029799406371331153,
+      "loss": 0.6583,
+      "step": 184
+    },
+    {
+      "epoch": 0.18894472361809045,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002978887542287442,
+      "loss": 0.6488,
+      "step": 188
+    },
+    {
+      "epoch": 0.192964824120603,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002977807702647979,
+      "loss": 0.6394,
+      "step": 192
+    },
+    {
+      "epoch": 0.19698492462311556,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029767011377429786,
+      "loss": 0.6069,
+      "step": 196
+    },
+    {
+      "epoch": 0.20100502512562815,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00029755678675840027,
+      "loss": 0.6155,
+      "step": 200
+    },
+    {
+      "epoch": 0.2050251256281407,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002974407912665563,
+      "loss": 0.6305,
+      "step": 204
+    },
+    {
+      "epoch": 0.20904522613065327,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002973221293964747,
+      "loss": 0.6617,
+      "step": 208
+    },
+    {
+      "epoch": 0.21306532663316582,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00029720080329408426,
+      "loss": 0.6611,
+      "step": 212
+    },
+    {
+      "epoch": 0.21708542713567838,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002970768151534947,
+      "loss": 0.6054,
+      "step": 216
+    },
+    {
+      "epoch": 0.22110552763819097,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002969501672169571,
+      "loss": 0.6981,
+      "step": 220
+    },
+    {
+      "epoch": 0.22512562814070353,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00029682086177482353,
+      "loss": 0.6005,
+      "step": 224
+    },
+    {
+      "epoch": 0.22914572864321608,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00029668890116550526,
+      "loss": 0.6184,
+      "step": 228
+    },
+    {
+      "epoch": 0.23316582914572864,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00029655428777543074,
+      "loss": 0.5997,
+      "step": 232
+    },
+    {
+      "epoch": 0.2371859296482412,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002964170240390023,
+      "loss": 0.6214,
+      "step": 236
+    },
+    {
+      "epoch": 0.24120603015075376,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00029627711243855224,
+      "loss": 0.6562,
+      "step": 240
+    },
+    {
+      "epoch": 0.24522613065326634,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.000296134555504298,
+      "loss": 0.6193,
+      "step": 244
+    },
+    {
+      "epoch": 0.2492462311557789,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.000295989355814296,
+      "loss": 0.6486,
+      "step": 248
+    },
+    {
+      "epoch": 0.25326633165829143,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002958415159943958,
+      "loss": 0.6516,
+      "step": 252
+    },
+    {
+      "epoch": 0.25728643216080405,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002956910387181916,
+      "loss": 0.6304,
+      "step": 256
+    },
+    {
+      "epoch": 0.2613065326633166,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002955379267069747,
+      "loss": 0.6423,
+      "step": 260
+    },
+    {
+      "epoch": 0.26532663316582916,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029538218272968394,
+      "loss": 0.6231,
+      "step": 264
+    },
+    {
+      "epoch": 0.2693467336683417,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00029522380960285573,
+      "loss": 0.5963,
+      "step": 268
+    },
+    {
+      "epoch": 0.2733668341708543,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.000295062810190573,
+      "loss": 0.6504,
+      "step": 272
+    },
+    {
+      "epoch": 0.27738693467336684,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0002948991874044136,
+      "loss": 0.6141,
+      "step": 276
+    },
+    {
+      "epoch": 0.2814070351758794,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002947329442033973,
+      "loss": 0.6604,
+      "step": 280
+    },
+    {
+      "epoch": 0.28542713567839195,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00029456408359393275,
+      "loss": 0.6366,
+      "step": 284
+    },
+    {
+      "epoch": 0.2894472361809045,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0002943926086297627,
+      "loss": 0.6538,
+      "step": 288
+    },
+    {
+      "epoch": 0.29346733668341707,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.000294218522411909,
+      "loss": 0.6312,
+      "step": 292
+    },
+    {
+      "epoch": 0.2974874371859296,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002940418280886163,
+      "loss": 0.6618,
+      "step": 296
+    },
+    {
+      "epoch": 0.3015075376884422,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002938625288552957,
+      "loss": 0.6176,
+      "step": 300
+    },
+    {
+      "epoch": 0.3055276381909548,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.000293680627954466,
+      "loss": 0.626,
+      "step": 304
+    },
+    {
+      "epoch": 0.30954773869346736,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.0002934961286756959,
+      "loss": 0.6636,
+      "step": 308
+    },
+    {
+      "epoch": 0.3135678391959799,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002933090343555442,
+      "loss": 0.6565,
+      "step": 312
+    },
+    {
+      "epoch": 0.31758793969849247,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0002931193483774993,
+      "loss": 0.6049,
+      "step": 316
+    },
+    {
+      "epoch": 0.32160804020100503,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00029292707417191845,
+      "loss": 0.6412,
+      "step": 320
+    },
+    {
+      "epoch": 0.3256281407035176,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0002927322152159652,
+      "loss": 0.6503,
+      "step": 324
+    },
+    {
+      "epoch": 0.32964824120603015,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00029253477503354684,
+      "loss": 0.6133,
+      "step": 328
+    },
+    {
+      "epoch": 0.3336683417085427,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002923347571952506,
+      "loss": 0.6265,
+      "step": 332
+    },
+    {
+      "epoch": 0.33768844221105526,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00029213216531827905,
+      "loss": 0.6366,
+      "step": 336
+    },
+    {
+      "epoch": 0.3417085427135678,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029192700306638475,
+      "loss": 0.6065,
+      "step": 340
+    },
+    {
+      "epoch": 0.3457286432160804,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0002917192741498039,
+      "loss": 0.6875,
+      "step": 344
+    },
+    {
+      "epoch": 0.349748743718593,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002915089823251893,
+      "loss": 0.6538,
+      "step": 348
+    },
+    {
+      "epoch": 0.35376884422110555,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00029129613139554237,
+      "loss": 0.637,
+      "step": 352
+    },
+    {
+      "epoch": 0.3577889447236181,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002910807252101446,
+      "loss": 0.6136,
+      "step": 356
+    },
+    {
+      "epoch": 0.36180904522613067,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002908627676644874,
+      "loss": 0.6581,
+      "step": 360
+    },
+    {
+      "epoch": 0.3658291457286432,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00029064226270020233,
+      "loss": 0.6421,
+      "step": 364
+    },
+    {
+      "epoch": 0.3698492462311558,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002904192143049893,
+      "loss": 0.5957,
+      "step": 368
+    },
+    {
+      "epoch": 0.37386934673366834,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002901936265125448,
+      "loss": 0.6291,
+      "step": 372
+    },
+    {
+      "epoch": 0.3778894472361809,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002899655034024885,
+      "loss": 0.6493,
+      "step": 376
+    },
+    {
+      "epoch": 0.38190954773869346,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002897348491002901,
+      "loss": 0.597,
+      "step": 380
+    },
+    {
+      "epoch": 0.385929648241206,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002895016677771942,
+      "loss": 0.6323,
+      "step": 384
+    },
+    {
+      "epoch": 0.38994974874371857,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0002892659636501452,
+      "loss": 0.6468,
+      "step": 388
+    },
+    {
+      "epoch": 0.39396984924623113,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002890277409817107,
+      "loss": 0.6733,
+      "step": 392
+    },
+    {
+      "epoch": 0.39798994974874374,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00028878700408000466,
+      "loss": 0.6245,
+      "step": 396
+    },
+    {
+      "epoch": 0.4020100502512563,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002885437572986096,
+      "loss": 0.6804,
+      "step": 400
+    },
+    {
+      "epoch": 0.40603015075376886,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0002882980050364976,
+      "loss": 0.6546,
+      "step": 404
+    },
+    {
+      "epoch": 0.4100502512562814,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002880497517379508,
+      "loss": 0.6428,
+      "step": 408
+    },
+    {
+      "epoch": 0.414070351758794,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00028779900189248117,
+      "loss": 0.6349,
+      "step": 412
+    },
+    {
+      "epoch": 0.41809045226130653,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0002875457600347492,
+      "loss": 0.6171,
+      "step": 416
+    },
+    {
+      "epoch": 0.4221105527638191,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00028729003074448193,
+      "loss": 0.6148,
+      "step": 420
+    },
+    {
+      "epoch": 0.42613065326633165,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002870318186463901,
+      "loss": 0.6469,
+      "step": 424
+    },
+    {
+      "epoch": 0.4301507537688442,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002867711284100846,
+      "loss": 0.5852,
+      "step": 428
+    },
+    {
+      "epoch": 0.43417085427135677,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002865079647499919,
+      "loss": 0.6327,
+      "step": 432
+    },
+    {
+      "epoch": 0.4381909547738693,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00028624233242526887,
+      "loss": 0.652,
+      "step": 436
+    },
+    {
+      "epoch": 0.44221105527638194,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00028597423623971674,
+      "loss": 0.6432,
+      "step": 440
+    },
+    {
+      "epoch": 0.4462311557788945,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00028570368104169407,
+      "loss": 0.6091,
+      "step": 444
+    },
+    {
+      "epoch": 0.45025125628140705,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002854306717240294,
+      "loss": 0.6278,
+      "step": 448
+    },
+    {
+      "epoch": 0.4542713567839196,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00028515521322393237,
+      "loss": 0.5679,
+      "step": 452
+    },
+    {
+      "epoch": 0.45829145728643217,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0002848773105229046,
+      "loss": 0.6386,
+      "step": 456
+    },
+    {
+      "epoch": 0.4623115577889447,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002845969686466498,
+      "loss": 0.605,
+      "step": 460
+    },
+    {
+      "epoch": 0.4663316582914573,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002843141926649824,
+      "loss": 0.5858,
+      "step": 464
+    },
+    {
+      "epoch": 0.47035175879396984,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00028402898769173653,
+      "loss": 0.6038,
+      "step": 468
+    },
+    {
+      "epoch": 0.4743718592964824,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00028374135888467296,
+      "loss": 0.6276,
+      "step": 472
+    },
+    {
+      "epoch": 0.47839195979899496,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00028345131144538597,
+      "loss": 0.6003,
+      "step": 476
+    },
+    {
+      "epoch": 0.4824120603015075,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00028315885061920955,
+      "loss": 0.6336,
+      "step": 480
+    },
+    {
+      "epoch": 0.4864321608040201,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0002828639816951222,
+      "loss": 0.6344,
+      "step": 484
+    },
+    {
+      "epoch": 0.4904522613065327,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002825667100056515,
+      "loss": 0.6498,
+      "step": 488
+    },
+    {
+      "epoch": 0.49447236180904525,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002822670409267776,
+      "loss": 0.6236,
+      "step": 492
+    },
+    {
+      "epoch": 0.4984924623115578,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0002819649798778359,
+      "loss": 0.5804,
+      "step": 496
+    },
+    {
+      "epoch": 0.5025125628140703,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0002816605323214193,
+      "loss": 0.5991,
+      "step": 500
+    },
+    {
+      "epoch": 0.5065326633165829,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002813537037632791,
+      "loss": 0.5647,
+      "step": 504
+    },
+    {
+      "epoch": 0.5105527638190954,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002810444997522257,
+      "loss": 0.6434,
+      "step": 508
+    },
+    {
+      "epoch": 0.5145728643216081,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002807329258800281,
+      "loss": 0.6258,
+      "step": 512
+    },
+    {
+      "epoch": 0.5185929648241207,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002804189877813128,
+      "loss": 0.6212,
+      "step": 516
+    },
+    {
+      "epoch": 0.5226130653266332,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.000280102691133462,
+      "loss": 0.6603,
+      "step": 520
+    },
+    {
+      "epoch": 0.5266331658291458,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00027978404165651064,
+      "loss": 0.6936,
+      "step": 524
+    },
+    {
+      "epoch": 0.5306532663316583,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00027946304511304343,
+      "loss": 0.6549,
+      "step": 528
+    },
+    {
+      "epoch": 0.5346733668341709,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002791397073080902,
+      "loss": 0.6177,
+      "step": 532
+    },
+    {
+      "epoch": 0.5386934673366834,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00027881403408902116,
+      "loss": 0.6376,
+      "step": 536
+    },
+    {
+      "epoch": 0.542713567839196,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00027848603134544104,
+      "loss": 0.6017,
+      "step": 540
+    },
+    {
+      "epoch": 0.5467336683417086,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00027815570500908256,
+      "loss": 0.6401,
+      "step": 544
+    },
+    {
+      "epoch": 0.5507537688442211,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00027782306105369944,
+      "loss": 0.6471,
+      "step": 548
+    },
+    {
+      "epoch": 0.5547738693467337,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002774881054949579,
+      "loss": 0.6027,
+      "step": 552
+    },
+    {
+      "epoch": 0.5587939698492462,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00027715084439032826,
+      "loss": 0.6497,
+      "step": 556
+    },
+    {
+      "epoch": 0.5628140703517588,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00027681128383897524,
+      "loss": 0.6139,
+      "step": 560
+    },
+    {
+      "epoch": 0.5668341708542713,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002764694299816477,
+      "loss": 0.6408,
+      "step": 564
+    },
+    {
+      "epoch": 0.5708542713567839,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002761252890005674,
+      "loss": 0.6232,
+      "step": 568
+    },
+    {
+      "epoch": 0.5748743718592965,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002757788671193176,
+      "loss": 0.6059,
+      "step": 572
+    },
+    {
+      "epoch": 0.578894472361809,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00027543017060273003,
+      "loss": 0.5653,
+      "step": 576
+    },
+    {
+      "epoch": 0.5829145728643216,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002750792057567721,
+      "loss": 0.6294,
+      "step": 580
+    },
+    {
+      "epoch": 0.5869346733668341,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00027472597892843226,
+      "loss": 0.6528,
+      "step": 584
+    },
+    {
+      "epoch": 0.5909547738693467,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00027437049650560596,
+      "loss": 0.6278,
+      "step": 588
+    },
+    {
+      "epoch": 0.5949748743718593,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00027401276491697933,
+      "loss": 0.641,
+      "step": 592
+    },
+    {
+      "epoch": 0.5989949748743718,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002736527906319136,
+      "loss": 0.6252,
+      "step": 596
+    },
+    {
+      "epoch": 0.6030150753768844,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002732905801603277,
+      "loss": 0.6359,
+      "step": 600
+    },
+    {
+      "epoch": 0.607035175879397,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0002729261400525806,
+      "loss": 0.6323,
+      "step": 604
+    },
+    {
+      "epoch": 0.6110552763819096,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.000272559476899353,
+      "loss": 0.6002,
+      "step": 608
+    },
+    {
+      "epoch": 0.6150753768844222,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00027219059733152805,
+      "loss": 0.6447,
+      "step": 612
+    },
+    {
+      "epoch": 0.6190954773869347,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00027181950802007134,
+      "loss": 0.6009,
+      "step": 616
+    },
+    {
+      "epoch": 0.6231155778894473,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002714462156759104,
+      "loss": 0.6515,
+      "step": 620
+    },
+    {
+      "epoch": 0.6271356783919598,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00027107072704981325,
+      "loss": 0.5963,
+      "step": 624
+    },
+    {
+      "epoch": 0.6311557788944724,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00027069304893226646,
+      "loss": 0.6148,
+      "step": 628
+    },
+    {
+      "epoch": 0.6351758793969849,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002703131881533521,
+      "loss": 0.6235,
+      "step": 632
+    },
+    {
+      "epoch": 0.6391959798994975,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00026993115158262444,
+      "loss": 0.6813,
+      "step": 636
+    },
+    {
+      "epoch": 0.6432160804020101,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002695469461289856,
+      "loss": 0.6219,
+      "step": 640
+    },
+    {
+      "epoch": 0.6472361809045226,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00026916057874056063,
+      "loss": 0.6044,
+      "step": 644
+    },
+    {
+      "epoch": 0.6512562814070352,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00026877205640457195,
+      "loss": 0.5938,
+      "step": 648
+    },
+    {
+      "epoch": 0.6552763819095477,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00026838138614721294,
+      "loss": 0.6097,
+      "step": 652
+    },
+    {
+      "epoch": 0.6592964824120603,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002679885750335207,
+      "loss": 0.6163,
+      "step": 656
+    },
+    {
+      "epoch": 0.6633165829145728,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002675936301672485,
+      "loss": 0.6465,
+      "step": 660
+    },
+    {
+      "epoch": 0.6673366834170854,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002671965586907373,
+      "loss": 0.6238,
+      "step": 664
+    },
+    {
+      "epoch": 0.671356783919598,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0002667973677847865,
+      "loss": 0.6,
+      "step": 668
+    },
+    {
+      "epoch": 0.6753768844221105,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.000266396064668524,
+      "loss": 0.6435,
+      "step": 672
+    },
+    {
+      "epoch": 0.6793969849246231,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00026599265659927603,
+      "loss": 0.6182,
+      "step": 676
+    },
+    {
+      "epoch": 0.6834170854271356,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002655871508724353,
+      "loss": 0.6139,
+      "step": 680
+    },
+    {
+      "epoch": 0.6874371859296482,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00026517955482132955,
+      "loss": 0.6341,
+      "step": 684
+    },
+    {
+      "epoch": 0.6914572864321608,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002647698758170889,
+      "loss": 0.6273,
+      "step": 688
+    },
+    {
+      "epoch": 0.6954773869346733,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00026435812126851223,
+      "loss": 0.6282,
+      "step": 692
+    },
+    {
+      "epoch": 0.699497487437186,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002639442986219335,
+      "loss": 0.6421,
+      "step": 696
+    },
+    {
+      "epoch": 0.7035175879396985,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.000263528415361087,
+      "loss": 0.6504,
+      "step": 700
+    },
+    {
+      "epoch": 0.7075376884422111,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002631104790069719,
+      "loss": 0.615,
+      "step": 704
+    },
+    {
+      "epoch": 0.7115577889447237,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00026269049711771634,
+      "loss": 0.6939,
+      "step": 708
+    },
+    {
+      "epoch": 0.7155778894472362,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.00026226847728844083,
+      "loss": 0.5338,
+      "step": 712
+    },
+    {
+      "epoch": 0.7195979899497488,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00026184442715112074,
+      "loss": 0.6489,
+      "step": 716
+    },
+    {
+      "epoch": 0.7236180904522613,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0002614183543744484,
+      "loss": 0.6212,
+      "step": 720
+    },
+    {
+      "epoch": 0.7276381909547739,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002609902666636942,
+      "loss": 0.607,
+      "step": 724
+    },
+    {
+      "epoch": 0.7316582914572864,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0002605601717605676,
+      "loss": 0.5979,
+      "step": 728
+    },
+    {
+      "epoch": 0.735678391959799,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002601280774430768,
+      "loss": 0.6397,
+      "step": 732
+    },
+    {
+      "epoch": 0.7396984924623116,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00025969399152538824,
+      "loss": 0.6519,
+      "step": 736
+    },
+    {
+      "epoch": 0.7437185929648241,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002592579218576853,
+      "loss": 0.6131,
+      "step": 740
+    },
+    {
+      "epoch": 0.7477386934673367,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00025881987632602626,
+      "loss": 0.6793,
+      "step": 744
+    },
+    {
+      "epoch": 0.7517587939698492,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00025837986285220173,
+      "loss": 0.6477,
+      "step": 748
+    },
+    {
+      "epoch": 0.7557788944723618,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002579378893935913,
+      "loss": 0.6347,
+      "step": 752
+    },
+    {
+      "epoch": 0.7597989949748744,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0002574939639430198,
+      "loss": 0.6188,
+      "step": 756
+    },
+    {
+      "epoch": 0.7638190954773869,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00025704809452861254,
+      "loss": 0.6099,
+      "step": 760
+    },
+    {
+      "epoch": 0.7678391959798995,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002566002892136505,
+      "loss": 0.6358,
+      "step": 764
+    },
+    {
+      "epoch": 0.771859296482412,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00025615055609642387,
+      "loss": 0.6048,
+      "step": 768
+    },
+    {
+      "epoch": 0.7758793969849246,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002556989033100864,
+      "loss": 0.653,
+      "step": 772
+    },
+    {
+      "epoch": 0.7798994974874371,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002552453390225076,
+      "loss": 0.6311,
+      "step": 776
+    },
+    {
+      "epoch": 0.7839195979899497,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002547898714361255,
+      "loss": 0.6403,
+      "step": 780
+    },
+    {
+      "epoch": 0.7879396984924623,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002543325087877981,
+      "loss": 0.6325,
+      "step": 784
+    },
+    {
+      "epoch": 0.7919597989949749,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0002538732593486545,
+      "loss": 0.6364,
+      "step": 788
+    },
+    {
+      "epoch": 0.7959798994974875,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00025341213142394514,
+      "loss": 0.643,
+      "step": 792
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0002529491333528918,
+      "loss": 0.6523,
+      "step": 796
+    },
+    {
+      "epoch": 0.8040201005025126,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00025248427350853687,
+      "loss": 0.6685,
+      "step": 800
+    },
+    {
+      "epoch": 0.8080402010050252,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002520175602975917,
+      "loss": 0.6533,
+      "step": 804
+    },
+    {
+      "epoch": 0.8120603015075377,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00025154900216028465,
+      "loss": 0.5855,
+      "step": 808
+    },
+    {
+      "epoch": 0.8160804020100503,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.00025107860757020835,
+      "loss": 0.617,
+      "step": 812
+    },
+    {
+      "epoch": 0.8201005025125628,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0002506063850341669,
+      "loss": 0.6098,
+      "step": 816
+    },
+    {
+      "epoch": 0.8241206030150754,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00025013234309202134,
+      "loss": 0.631,
+      "step": 820
+    },
+    {
+      "epoch": 0.828140703517588,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002496564903165358,
+      "loss": 0.5969,
+      "step": 824
+    },
+    {
+      "epoch": 0.8321608040201005,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002491788353132222,
+      "loss": 0.6147,
+      "step": 828
+    },
+    {
+      "epoch": 0.8361809045226131,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00024869938672018464,
+      "loss": 0.6408,
+      "step": 832
+    },
+    {
+      "epoch": 0.8402010050251256,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00024821815320796327,
+      "loss": 0.6386,
+      "step": 836
+    },
+    {
+      "epoch": 0.8442211055276382,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00024773514347937726,
+      "loss": 0.6396,
+      "step": 840
+    },
+    {
+      "epoch": 0.8482412060301507,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002472503662693679,
+      "loss": 0.6181,
+      "step": 844
+    },
+    {
+      "epoch": 0.8522613065326633,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00024676383034484003,
+      "loss": 0.6569,
+      "step": 848
+    },
+    {
+      "epoch": 0.8562814070351759,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00024627554450450394,
+      "loss": 0.6382,
+      "step": 852
+    },
+    {
+      "epoch": 0.8603015075376884,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002457855175787161,
+      "loss": 0.6108,
+      "step": 856
+    },
+    {
+      "epoch": 0.864321608040201,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00024529375842931924,
+      "loss": 0.6555,
+      "step": 860
+    },
+    {
+      "epoch": 0.8683417085427135,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00024480027594948265,
+      "loss": 0.5887,
+      "step": 864
+    },
+    {
+      "epoch": 0.8723618090452261,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002443050790635408,
+      "loss": 0.6581,
+      "step": 868
+    },
+    {
+      "epoch": 0.8763819095477386,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00024380817672683234,
+      "loss": 0.6448,
+      "step": 872
+    },
+    {
+      "epoch": 0.8804020100502512,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0002433095779255377,
+      "loss": 0.6277,
+      "step": 876
+    },
+    {
+      "epoch": 0.8844221105527639,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002428092916765171,
+      "loss": 0.5841,
+      "step": 880
+    },
+    {
+      "epoch": 0.8884422110552764,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00024230732702714718,
+      "loss": 0.619,
+      "step": 884
+    },
+    {
+      "epoch": 0.892462311557789,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00024180369305515733,
+      "loss": 0.5848,
+      "step": 888
+    },
+    {
+      "epoch": 0.8964824120603015,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00024129839886846582,
+      "loss": 0.625,
+      "step": 892
+    },
+    {
+      "epoch": 0.9005025125628141,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00024079145360501473,
+      "loss": 0.5922,
+      "step": 896
+    },
+    {
+      "epoch": 0.9045226130653267,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00024028286643260503,
+      "loss": 0.6346,
+      "step": 900
+    },
+    {
+      "epoch": 0.9085427135678392,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00023977264654873048,
+      "loss": 0.6594,
+      "step": 904
+    },
+    {
+      "epoch": 0.9125628140703518,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002392608031804116,
+      "loss": 0.6271,
+      "step": 908
+    },
+    {
+      "epoch": 0.9165829145728643,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002387473455840285,
+      "loss": 0.6102,
+      "step": 912
+    },
+    {
+      "epoch": 0.9206030150753769,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00023823228304515373,
+      "loss": 0.6448,
+      "step": 916
+    },
+    {
+      "epoch": 0.9246231155778895,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00023771562487838425,
+      "loss": 0.5876,
+      "step": 920
+    },
+    {
+      "epoch": 0.928643216080402,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00023719738042717297,
+      "loss": 0.6417,
+      "step": 924
+    },
+    {
+      "epoch": 0.9326633165829146,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00023667755906365984,
+      "loss": 0.6126,
+      "step": 928
+    },
+    {
+      "epoch": 0.9366834170854271,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00023615617018850232,
+      "loss": 0.5917,
+      "step": 932
+    },
+    {
+      "epoch": 0.9407035175879397,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.00023563322323070528,
+      "loss": 0.627,
+      "step": 936
+    },
+    {
+      "epoch": 0.9447236180904522,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0002351087276474507,
+      "loss": 0.6283,
+      "step": 940
+    },
+    {
+      "epoch": 0.9487437185929648,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002345826929239265,
+      "loss": 0.6519,
+      "step": 944
+    },
+    {
+      "epoch": 0.9527638190954774,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.00023405512857315494,
+      "loss": 0.5933,
+      "step": 948
+    },
+    {
+      "epoch": 0.9567839195979899,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00023352604413582074,
+      "loss": 0.604,
+      "step": 952
+    },
+    {
+      "epoch": 0.9608040201005025,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00023299544918009858,
+      "loss": 0.6275,
+      "step": 956
+    },
+    {
+      "epoch": 0.964824120603015,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0002324633533014797,
+      "loss": 0.61,
+      "step": 960
+    },
+    {
+      "epoch": 0.9688442211055276,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0002319297661225989,
+      "loss": 0.6548,
+      "step": 964
+    },
+    {
+      "epoch": 0.9728643216080402,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00023139469729306007,
+      "loss": 0.6309,
+      "step": 968
+    },
+    {
+      "epoch": 0.9768844221105528,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00023085815648926194,
+      "loss": 0.5972,
+      "step": 972
+    },
+    {
+      "epoch": 0.9809045226130654,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00023032015341422295,
+      "loss": 0.6526,
+      "step": 976
+    },
+    {
+      "epoch": 0.9849246231155779,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00022978069779740597,
+      "loss": 0.6492,
+      "step": 980
+    },
+    {
+      "epoch": 0.9889447236180905,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00022923979939454202,
+      "loss": 0.6393,
+      "step": 984
+    },
+    {
+      "epoch": 0.992964824120603,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00022869746798745425,
+      "loss": 0.5946,
+      "step": 988
+    },
+    {
+      "epoch": 0.9969849246231156,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00022815371338388062,
+      "loss": 0.6341,
+      "step": 992
+    },
+    {
+      "epoch": 1.0010050251256282,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00022760854541729693,
+      "loss": 0.5532,
+      "step": 996
+    },
+    {
+      "epoch": 1.0050251256281406,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00022706197394673874,
+      "loss": 0.5303,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0090452261306533,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0002265140088566231,
+      "loss": 0.5229,
+      "step": 1004
+    },
+    {
+      "epoch": 1.0130653266331657,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00022596466005656983,
+      "loss": 0.528,
+      "step": 1008
+    },
+    {
+      "epoch": 1.0170854271356784,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00022541393748122234,
+      "loss": 0.4928,
+      "step": 1012
+    },
+    {
+      "epoch": 1.0211055276381908,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00022486185109006797,
+      "loss": 0.5461,
+      "step": 1016
+    },
+    {
+      "epoch": 1.0251256281407035,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0002243084108672578,
+      "loss": 0.5323,
+      "step": 1020
+    },
+    {
+      "epoch": 1.0291457286432162,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00022375362682142618,
+      "loss": 0.5196,
+      "step": 1024
+    },
+    {
+      "epoch": 1.0331658291457286,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.00022319750898550962,
+      "loss": 0.5131,
+      "step": 1028
+    },
+    {
+      "epoch": 1.0371859296482413,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0002226400674165656,
+      "loss": 0.5072,
+      "step": 1032
+    },
+    {
+      "epoch": 1.0412060301507537,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00022208131219559032,
+      "loss": 0.4853,
+      "step": 1036
+    },
+    {
+      "epoch": 1.0452261306532664,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00022152125342733673,
+      "loss": 0.505,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0492462311557789,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00022095990124013147,
+      "loss": 0.5292,
+      "step": 1044
+    },
+    {
+      "epoch": 1.0532663316582915,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00022039726578569212,
+      "loss": 0.5244,
+      "step": 1048
+    },
+    {
+      "epoch": 1.057286432160804,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0002198333572389432,
+      "loss": 0.4877,
+      "step": 1052
+    },
+    {
+      "epoch": 1.0613065326633166,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002192681857978324,
+      "loss": 0.5056,
+      "step": 1056
+    },
+    {
+      "epoch": 1.065326633165829,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0002187017616831461,
+      "loss": 0.5349,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0693467336683418,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.00021813409513832464,
+      "loss": 0.4781,
+      "step": 1064
+    },
+    {
+      "epoch": 1.0733668341708542,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00021756519642927665,
+      "loss": 0.523,
+      "step": 1068
+    },
+    {
+      "epoch": 1.0773869346733669,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0002169950758441941,
+      "loss": 0.5209,
+      "step": 1072
+    },
+    {
+      "epoch": 1.0814070351758793,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00021642374369336558,
+      "loss": 0.52,
+      "step": 1076
+    },
+    {
+      "epoch": 1.085427135678392,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00021585121030899014,
+      "loss": 0.5171,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0894472361809044,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00021527748604499062,
+      "loss": 0.525,
+      "step": 1084
+    },
+    {
+      "epoch": 1.0934673366834171,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.000214702581276826,
+      "loss": 0.5494,
+      "step": 1088
+    },
+    {
+      "epoch": 1.0974874371859296,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00021412650640130409,
+      "loss": 0.5679,
+      "step": 1092
+    },
+    {
+      "epoch": 1.1015075376884422,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00021354927183639326,
+      "loss": 0.5263,
+      "step": 1096
+    },
+    {
+      "epoch": 1.1055276381909547,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00021297088802103427,
+      "loss": 0.5461,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1095477386934673,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00021239136541495137,
+      "loss": 0.4837,
+      "step": 1104
+    },
+    {
+      "epoch": 1.11356783919598,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0002118107144984632,
+      "loss": 0.5085,
+      "step": 1108
+    },
+    {
+      "epoch": 1.1175879396984925,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00021122894577229307,
+      "loss": 0.5327,
+      "step": 1112
+    },
+    {
+      "epoch": 1.121608040201005,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00021064606975737933,
+      "loss": 0.5378,
+      "step": 1116
+    },
+    {
+      "epoch": 1.1256281407035176,
+      "grad_norm": 0.375,
+      "learning_rate": 0.000210062096994685,
+      "loss": 0.5782,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1296482412060302,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.000209477038045007,
+      "loss": 0.5379,
+      "step": 1124
+    },
+    {
+      "epoch": 1.1336683417085427,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0002088909034887854,
+      "loss": 0.5295,
+      "step": 1128
+    },
+    {
+      "epoch": 1.1376884422110554,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00020830370392591201,
+      "loss": 0.4973,
+      "step": 1132
+    },
+    {
+      "epoch": 1.1417085427135678,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0002077154499755384,
+      "loss": 0.5354,
+      "step": 1136
+    },
+    {
+      "epoch": 1.1457286432160805,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00020712615227588447,
+      "loss": 0.553,
+      "step": 1140
+    },
+    {
+      "epoch": 1.149748743718593,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00020653582148404538,
+      "loss": 0.4842,
+      "step": 1144
+    },
+    {
+      "epoch": 1.1537688442211056,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00020594446827579935,
+      "loss": 0.5194,
+      "step": 1148
+    },
+    {
+      "epoch": 1.157788944723618,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0002053521033454142,
+      "loss": 0.5314,
+      "step": 1152
+    },
+    {
+      "epoch": 1.1618090452261307,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00020475873740545444,
+      "loss": 0.4998,
+      "step": 1156
+    },
+    {
+      "epoch": 1.1658291457286432,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0002041643811865868,
+      "loss": 0.5061,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1698492462311558,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.000203569045437387,
+      "loss": 0.5047,
+      "step": 1164
+    },
+    {
+      "epoch": 1.1738693467336683,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00020297274092414484,
+      "loss": 0.5115,
+      "step": 1168
+    },
+    {
+      "epoch": 1.177889447236181,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0002023754784306695,
+      "loss": 0.5287,
+      "step": 1172
+    },
+    {
+      "epoch": 1.1819095477386934,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00020177726875809498,
+      "loss": 0.553,
+      "step": 1176
+    },
+    {
+      "epoch": 1.185929648241206,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00020117812272468408,
+      "loss": 0.5212,
+      "step": 1180
+    },
+    {
+      "epoch": 1.1899497487437185,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0002005780511656333,
+      "loss": 0.5433,
+      "step": 1184
+    },
+    {
+      "epoch": 1.1939698492462312,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00019997706493287686,
+      "loss": 0.5462,
+      "step": 1188
+    },
+    {
+      "epoch": 1.1979899497487438,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00019937517489489008,
+      "loss": 0.5251,
+      "step": 1192
+    },
+    {
+      "epoch": 1.2020100502512563,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00019877239193649303,
+      "loss": 0.522,
+      "step": 1196
+    },
+    {
+      "epoch": 1.2060301507537687,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001981687269586539,
+      "loss": 0.5207,
+      "step": 1200
+    },
+    {
+      "epoch": 1.2100502512562814,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.00019756419087829161,
+      "loss": 0.5119,
+      "step": 1204
+    },
+    {
+      "epoch": 1.214070351758794,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.00019695879462807835,
+      "loss": 0.5216,
+      "step": 1208
+    },
+    {
+      "epoch": 1.2180904522613065,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0001963525491562421,
+      "loss": 0.5006,
+      "step": 1212
+    },
+    {
+      "epoch": 1.2221105527638192,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001957454654263684,
+      "loss": 0.5248,
+      "step": 1216
+    },
+    {
+      "epoch": 1.2261306532663316,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001951375544172022,
+      "loss": 0.5105,
+      "step": 1220
+    },
+    {
+      "epoch": 1.2301507537688443,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00019452882712244935,
+      "loss": 0.5096,
+      "step": 1224
+    },
+    {
+      "epoch": 1.2341708542713568,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00019391929455057772,
+      "loss": 0.5608,
+      "step": 1228
+    },
+    {
+      "epoch": 1.2381909547738694,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00019330896772461813,
+      "loss": 0.5667,
+      "step": 1232
+    },
+    {
+      "epoch": 1.2422110552763819,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0001926978576819649,
+      "loss": 0.5181,
+      "step": 1236
+    },
+    {
+      "epoch": 1.2462311557788945,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0001920859754741766,
+      "loss": 0.5102,
+      "step": 1240
+    },
+    {
+      "epoch": 1.250251256281407,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001914733321667757,
+      "loss": 0.5188,
+      "step": 1244
+    },
+    {
+      "epoch": 1.2542713567839197,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00019085993883904878,
+      "loss": 0.539,
+      "step": 1248
+    },
+    {
+      "epoch": 1.258291457286432,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00019024580658384612,
+      "loss": 0.5216,
+      "step": 1252
+    },
+    {
+      "epoch": 1.2623115577889448,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0001896309465073811,
+      "loss": 0.5534,
+      "step": 1256
+    },
+    {
+      "epoch": 1.2663316582914572,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00018901536972902922,
+      "loss": 0.5503,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2703517587939699,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00018839908738112714,
+      "loss": 0.4965,
+      "step": 1264
+    },
+    {
+      "epoch": 1.2743718592964823,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00018778211060877127,
+      "loss": 0.519,
+      "step": 1268
+    },
+    {
+      "epoch": 1.278391959798995,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00018716445056961634,
+      "loss": 0.5164,
+      "step": 1272
+    },
+    {
+      "epoch": 1.2824120603015077,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0001865461184336736,
+      "loss": 0.5148,
+      "step": 1276
+    },
+    {
+      "epoch": 1.2864321608040201,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00018592712538310864,
+      "loss": 0.508,
+      "step": 1280
+    },
+    {
+      "epoch": 1.2904522613065326,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00018530748261203934,
+      "loss": 0.5248,
+      "step": 1284
+    },
+    {
+      "epoch": 1.2944723618090452,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00018468720132633337,
+      "loss": 0.5287,
+      "step": 1288
+    },
+    {
+      "epoch": 1.298492462311558,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00018406629274340564,
+      "loss": 0.5527,
+      "step": 1292
+    },
+    {
+      "epoch": 1.3025125628140704,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001834447680920153,
+      "loss": 0.4996,
+      "step": 1296
+    },
+    {
+      "epoch": 1.3065326633165828,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00018282263861206266,
+      "loss": 0.4831,
+      "step": 1300
+    },
+    {
+      "epoch": 1.3105527638190955,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0001821999155543861,
+      "loss": 0.5409,
+      "step": 1304
+    },
+    {
+      "epoch": 1.3145728643216081,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00018157661018055842,
+      "loss": 0.5095,
+      "step": 1308
+    },
+    {
+      "epoch": 1.3185929648241206,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00018095273376268333,
+      "loss": 0.5683,
+      "step": 1312
+    },
+    {
+      "epoch": 1.322613065326633,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00018032829758319146,
+      "loss": 0.4956,
+      "step": 1316
+    },
+    {
+      "epoch": 1.3266331658291457,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00017970331293463643,
+      "loss": 0.5346,
+      "step": 1320
+    },
+    {
+      "epoch": 1.3306532663316584,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00017907779111949054,
+      "loss": 0.5211,
+      "step": 1324
+    },
+    {
+      "epoch": 1.3346733668341708,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0001784517434499405,
+      "loss": 0.5293,
+      "step": 1328
+    },
+    {
+      "epoch": 1.3386934673366835,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00017782518124768282,
+      "loss": 0.5131,
+      "step": 1332
+    },
+    {
+      "epoch": 1.342713567839196,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00017719811584371886,
+      "loss": 0.5012,
+      "step": 1336
+    },
+    {
+      "epoch": 1.3467336683417086,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.00017657055857815018,
+      "loss": 0.513,
+      "step": 1340
+    },
+    {
+      "epoch": 1.350753768844221,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0001759425207999734,
+      "loss": 0.5691,
+      "step": 1344
+    },
+    {
+      "epoch": 1.3547738693467337,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00017531401386687492,
+      "loss": 0.5173,
+      "step": 1348
+    },
+    {
+      "epoch": 1.3587939698492462,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.00017468504914502542,
+      "loss": 0.4847,
+      "step": 1352
+    },
+    {
+      "epoch": 1.3628140703517588,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0001740556380088745,
+      "loss": 0.5451,
+      "step": 1356
+    },
+    {
+      "epoch": 1.3668341708542713,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001734257918409449,
+      "loss": 0.5408,
+      "step": 1360
+    },
+    {
+      "epoch": 1.370854271356784,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001727955220316265,
+      "loss": 0.4905,
+      "step": 1364
+    },
+    {
+      "epoch": 1.3748743718592964,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0001721648399789708,
+      "loss": 0.4882,
+      "step": 1368
+    },
+    {
+      "epoch": 1.378894472361809,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00017153375708848422,
+      "loss": 0.5087,
+      "step": 1372
+    },
+    {
+      "epoch": 1.3829145728643217,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00017090228477292202,
+      "loss": 0.5281,
+      "step": 1376
+    },
+    {
+      "epoch": 1.3869346733668342,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00017027043445208225,
+      "loss": 0.5353,
+      "step": 1380
+    },
+    {
+      "epoch": 1.3909547738693466,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.0001696382175525988,
+      "loss": 0.5069,
+      "step": 1384
+    },
+    {
+      "epoch": 1.3949748743718593,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001690056455077349,
+      "loss": 0.496,
+      "step": 1388
+    },
+    {
+      "epoch": 1.398994974874372,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00016837272975717642,
+      "loss": 0.5481,
+      "step": 1392
+    },
+    {
+      "epoch": 1.4030150753768844,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0001677394817468249,
+      "loss": 0.5071,
+      "step": 1396
+    },
+    {
+      "epoch": 1.4070351758793969,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00016710591292859063,
+      "loss": 0.51,
+      "step": 1400
+    },
+    {
+      "epoch": 1.4110552763819095,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.0001664720347601855,
+      "loss": 0.5155,
+      "step": 1404
+    },
+    {
+      "epoch": 1.4150753768844222,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00016583785870491588,
+      "loss": 0.5221,
+      "step": 1408
+    },
+    {
+      "epoch": 1.4190954773869346,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00016520339623147517,
+      "loss": 0.542,
+      "step": 1412
+    },
+    {
+      "epoch": 1.4231155778894473,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001645686588137365,
+      "loss": 0.4669,
+      "step": 1416
+    },
+    {
+      "epoch": 1.4271356783919598,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001639336579305451,
+      "loss": 0.5211,
+      "step": 1420
+    },
+    {
+      "epoch": 1.4311557788944724,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00016329840506551098,
+      "loss": 0.4785,
+      "step": 1424
+    },
+    {
+      "epoch": 1.4351758793969849,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001626629117068011,
+      "loss": 0.5316,
+      "step": 1428
+    },
+    {
+      "epoch": 1.4391959798994975,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.00016202718934693134,
+      "loss": 0.523,
+      "step": 1432
+    },
+    {
+      "epoch": 1.44321608040201,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.00016139124948255925,
+      "loss": 0.5512,
+      "step": 1436
+    },
+    {
+      "epoch": 1.4472361809045227,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00016075510361427564,
+      "loss": 0.5319,
+      "step": 1440
+    },
+    {
+      "epoch": 1.451256281407035,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0001601187632463968,
+      "loss": 0.4716,
+      "step": 1444
+    },
+    {
+      "epoch": 1.4552763819095478,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00015948223988675644,
+      "loss": 0.5194,
+      "step": 1448
+    },
+    {
+      "epoch": 1.4592964824120602,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00015884554504649764,
+      "loss": 0.5121,
+      "step": 1452
+    },
+    {
+      "epoch": 1.463316582914573,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00015820869023986444,
+      "loss": 0.51,
+      "step": 1456
+    },
+    {
+      "epoch": 1.4673366834170856,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00015757168698399387,
+      "loss": 0.5532,
+      "step": 1460
+    },
+    {
+      "epoch": 1.471356783919598,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00015693454679870772,
+      "loss": 0.4899,
+      "step": 1464
+    },
+    {
+      "epoch": 1.4753768844221105,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00015629728120630378,
+      "loss": 0.5007,
+      "step": 1468
+    },
+    {
+      "epoch": 1.4793969849246231,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.00015565990173134792,
+      "loss": 0.5182,
+      "step": 1472
+    },
+    {
+      "epoch": 1.4834170854271358,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.00015502241990046547,
+      "loss": 0.5502,
+      "step": 1476
+    },
+    {
+      "epoch": 1.4874371859296482,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00015438484724213287,
+      "loss": 0.5105,
+      "step": 1480
+    },
+    {
+      "epoch": 1.4914572864321607,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.00015374719528646907,
+      "loss": 0.5128,
+      "step": 1484
+    },
+    {
+      "epoch": 1.4954773869346734,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00015310947556502702,
+      "loss": 0.5355,
+      "step": 1488
+    },
+    {
+      "epoch": 1.499497487437186,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00015247169961058523,
+      "loss": 0.5161,
+      "step": 1492
+    },
+    {
+      "epoch": 1.5035175879396985,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00015183387895693911,
+      "loss": 0.5628,
+      "step": 1496
+    },
+    {
+      "epoch": 1.507537688442211,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00015119602513869249,
+      "loss": 0.5296,
+      "step": 1500
+    },
+    {
+      "epoch": 1.5115577889447236,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00015055814969104893,
+      "loss": 0.5178,
+      "step": 1504
+    },
+    {
+      "epoch": 1.5155778894472363,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00014992026414960313,
+      "loss": 0.53,
+      "step": 1508
+    },
+    {
+      "epoch": 1.5195979899497487,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0001492823800501323,
+      "loss": 0.5137,
+      "step": 1512
+    },
+    {
+      "epoch": 1.5236180904522612,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001486445089283877,
+      "loss": 0.4953,
+      "step": 1516
+    },
+    {
+      "epoch": 1.5276381909547738,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00014800666231988574,
+      "loss": 0.5631,
+      "step": 1520
+    },
+    {
+      "epoch": 1.5316582914572865,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0001473688517596996,
+      "loss": 0.5299,
+      "step": 1524
+    },
+    {
+      "epoch": 1.5356783919597992,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001467310887822506,
+      "loss": 0.509,
+      "step": 1528
+    },
+    {
+      "epoch": 1.5396984924623116,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00014609338492109944,
+      "loss": 0.5143,
+      "step": 1532
+    },
+    {
+      "epoch": 1.543718592964824,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00014545575170873777,
+      "loss": 0.5166,
+      "step": 1536
+    },
+    {
+      "epoch": 1.5477386934673367,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00014481820067637966,
+      "loss": 0.5479,
+      "step": 1540
+    },
+    {
+      "epoch": 1.5517587939698494,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.00014418074335375297,
+      "loss": 0.4979,
+      "step": 1544
+    },
+    {
+      "epoch": 1.5557788944723618,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00014354339126889084,
+      "loss": 0.5331,
+      "step": 1548
+    },
+    {
+      "epoch": 1.5597989949748743,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00014290615594792335,
+      "loss": 0.5257,
+      "step": 1552
+    },
+    {
+      "epoch": 1.563819095477387,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00014226904891486878,
+      "loss": 0.5104,
+      "step": 1556
+    },
+    {
+      "epoch": 1.5678391959798996,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001416320816914256,
+      "loss": 0.5514,
+      "step": 1560
+    },
+    {
+      "epoch": 1.571859296482412,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00014099526579676387,
+      "loss": 0.49,
+      "step": 1564
+    },
+    {
+      "epoch": 1.5758793969849245,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001403586127473168,
+      "loss": 0.5235,
+      "step": 1568
+    },
+    {
+      "epoch": 1.5798994974874372,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0001397221340565729,
+      "loss": 0.5119,
+      "step": 1572
+    },
+    {
+      "epoch": 1.5839195979899499,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.00013908584123486736,
+      "loss": 0.4947,
+      "step": 1576
+    },
+    {
+      "epoch": 1.5879396984924623,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00013844974578917395,
+      "loss": 0.4872,
+      "step": 1580
+    },
+    {
+      "epoch": 1.5919597989949748,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001378138592228971,
+      "loss": 0.547,
+      "step": 1584
+    },
+    {
+      "epoch": 1.5959798994974874,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001371781930356639,
+      "loss": 0.5132,
+      "step": 1588
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00013654275872311588,
+      "loss": 0.5251,
+      "step": 1592
+    },
+    {
+      "epoch": 1.6040201005025125,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00013590756777670133,
+      "loss": 0.506,
+      "step": 1596
+    },
+    {
+      "epoch": 1.608040201005025,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00013527263168346725,
+      "loss": 0.4885,
+      "step": 1600
+    },
+    {
+      "epoch": 1.6120603015075377,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00013463796192585197,
+      "loss": 0.5728,
+      "step": 1604
+    },
+    {
+      "epoch": 1.6160804020100503,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001340035699814772,
+      "loss": 0.4981,
+      "step": 1608
+    },
+    {
+      "epoch": 1.6201005025125628,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.0001333694673229406,
+      "loss": 0.5385,
+      "step": 1612
+    },
+    {
+      "epoch": 1.6241206030150752,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0001327356654176082,
+      "loss": 0.4765,
+      "step": 1616
+    },
+    {
+      "epoch": 1.6281407035175879,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00013210217572740725,
+      "loss": 0.517,
+      "step": 1620
+    },
+    {
+      "epoch": 1.6321608040201006,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00013146900970861856,
+      "loss": 0.5199,
+      "step": 1624
+    },
+    {
+      "epoch": 1.6361809045226132,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00013083617881166971,
+      "loss": 0.5128,
+      "step": 1628
+    },
+    {
+      "epoch": 1.6402010050251257,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0001302036944809277,
+      "loss": 0.5482,
+      "step": 1632
+    },
+    {
+      "epoch": 1.6442211055276381,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.00012957156815449216,
+      "loss": 0.4917,
+      "step": 1636
+    },
+    {
+      "epoch": 1.6482412060301508,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00012893981126398837,
+      "loss": 0.515,
+      "step": 1640
+    },
+    {
+      "epoch": 1.6522613065326635,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00012830843523436064,
+      "loss": 0.4891,
+      "step": 1644
+    },
+    {
+      "epoch": 1.656281407035176,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00012767745148366556,
+      "loss": 0.5319,
+      "step": 1648
+    },
+    {
+      "epoch": 1.6603015075376883,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.00012704687142286563,
+      "loss": 0.4826,
+      "step": 1652
+    },
+    {
+      "epoch": 1.664321608040201,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00012641670645562294,
+      "loss": 0.5107,
+      "step": 1656
+    },
+    {
+      "epoch": 1.6683417085427137,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.00012578696797809266,
+      "loss": 0.4988,
+      "step": 1660
+    },
+    {
+      "epoch": 1.6723618090452261,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.00012515766737871743,
+      "loss": 0.4736,
+      "step": 1664
+    },
+    {
+      "epoch": 1.6763819095477386,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.00012452881603802095,
+      "loss": 0.5087,
+      "step": 1668
+    },
+    {
+      "epoch": 1.6804020100502512,
+      "grad_norm": 0.375,
+      "learning_rate": 0.0001239004253284023,
+      "loss": 0.5101,
+      "step": 1672
+    },
+    {
+      "epoch": 1.684422110552764,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.00012327250661393037,
+      "loss": 0.5208,
+      "step": 1676
+    },
+    {
+      "epoch": 1.6884422110552764,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0001226450712501384,
+      "loss": 0.5393,
+      "step": 1680
+    },
+    {
+      "epoch": 1.6924623115577888,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.00012201813058381845,
+      "loss": 0.5202,
+      "step": 1684
+    },
+    {
+      "epoch": 1.6964824120603015,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00012139169595281624,
+      "loss": 0.4888,
+      "step": 1688
+    },
+    {
+      "epoch": 1.7005025125628142,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00012076577868582623,
+      "loss": 0.4949,
+      "step": 1692
+    },
+    {
+      "epoch": 1.7045226130653266,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00012014039010218651,
+      "loss": 0.4858,
+      "step": 1696
+    },
+    {
+      "epoch": 1.708542713567839,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00011951554151167443,
+      "loss": 0.5161,
+      "step": 1700
+    },
+    {
+      "epoch": 1.7125628140703517,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00011889124421430179,
+      "loss": 0.4953,
+      "step": 1704
+    },
+    {
+      "epoch": 1.7165829145728644,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00011826750950011057,
+      "loss": 0.5068,
+      "step": 1708
+    },
+    {
+      "epoch": 1.7206030150753768,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00011764434864896884,
+      "loss": 0.5539,
+      "step": 1712
+    },
+    {
+      "epoch": 1.7246231155778895,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.00011702177293036667,
+      "loss": 0.5099,
+      "step": 1716
+    },
+    {
+      "epoch": 1.728643216080402,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001163997936032123,
+      "loss": 0.5331,
+      "step": 1720
+    },
+    {
+      "epoch": 1.7326633165829146,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.00011577842191562864,
+      "loss": 0.5269,
+      "step": 1724
+    },
+    {
+      "epoch": 1.7366834170854273,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00011515766910474989,
+      "loss": 0.4833,
+      "step": 1728
+    },
+    {
+      "epoch": 1.7407035175879397,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.00011453754639651804,
+      "loss": 0.5658,
+      "step": 1732
+    },
+    {
+      "epoch": 1.7447236180904522,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00011391806500548021,
+      "loss": 0.5175,
+      "step": 1736
+    },
+    {
+      "epoch": 1.7487437185929648,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00011329923613458571,
+      "loss": 0.5106,
+      "step": 1740
+    },
+    {
+      "epoch": 1.7527638190954775,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.00011268107097498322,
+      "loss": 0.4766,
+      "step": 1744
+    },
+    {
+      "epoch": 1.75678391959799,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00011206358070581876,
+      "loss": 0.4809,
+      "step": 1748
+    },
+    {
+      "epoch": 1.7608040201005024,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00011144677649403329,
+      "loss": 0.4951,
+      "step": 1752
+    },
+    {
+      "epoch": 1.764824120603015,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.00011083066949416092,
+      "loss": 0.5347,
+      "step": 1756
+    },
+    {
+      "epoch": 1.7688442211055277,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.00011021527084812704,
+      "loss": 0.5442,
+      "step": 1760
+    },
+    {
+      "epoch": 1.7728643216080402,
+      "grad_norm": 0.375,
+      "learning_rate": 0.00010960059168504694,
+      "loss": 0.5528,
+      "step": 1764
+    },
+    {
+      "epoch": 1.7768844221105526,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.00010898664312102425,
+      "loss": 0.5639,
+      "step": 1768
+    },
+    {
+      "epoch": 1.7809045226130653,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.00010837343625895054,
+      "loss": 0.5303,
+      "step": 1772
+    },
+    {
+      "epoch": 1.784924623115578,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00010776098218830389,
+      "loss": 0.5272,
+      "step": 1776
+    },
+    {
+      "epoch": 1.7889447236180904,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.00010714929198494866,
+      "loss": 0.5386,
+      "step": 1780
+    },
+    {
+      "epoch": 1.7929648241206029,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.00010653837671093511,
+      "loss": 0.5112,
+      "step": 1784
+    },
+    {
+      "epoch": 1.7969849246231155,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.00010592824741429945,
+      "loss": 0.5354,
+      "step": 1788
+    },
+    {
+      "epoch": 1.8010050251256282,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.00010531891512886384,
+      "loss": 0.5214,
+      "step": 1792
+    },
+    {
+      "epoch": 1.8050251256281407,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.00010471039087403705,
+      "loss": 0.5422,
+      "step": 1796
+    },
+    {
+      "epoch": 1.809045226130653,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.00010410268565461506,
+      "loss": 0.5214,
+      "step": 1800
+    },
+    {
+      "epoch": 1.8130653266331658,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.00010349581046058215,
+      "loss": 0.5413,
+      "step": 1804
+    },
+    {
+      "epoch": 1.8170854271356784,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.00010288977626691202,
+      "loss": 0.5185,
+      "step": 1808
+    },
+    {
+      "epoch": 1.8211055276381911,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.00010228459403336941,
+      "loss": 0.538,
+      "step": 1812
+    },
+    {
+      "epoch": 1.8251256281407036,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00010168027470431189,
+      "loss": 0.5167,
+      "step": 1816
+    },
+    {
+      "epoch": 1.829145728643216,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.00010107682920849185,
+      "loss": 0.5226,
+      "step": 1820
+    },
+    {
+      "epoch": 1.8331658291457287,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.00010047426845885903,
+      "loss": 0.4987,
+      "step": 1824
+    },
+    {
+      "epoch": 1.8371859296482413,
+      "grad_norm": 0.39453125,
+      "learning_rate": 9.987260335236297e-05,
+      "loss": 0.5597,
+      "step": 1828
+    },
+    {
+      "epoch": 1.8412060301507538,
+      "grad_norm": 0.375,
+      "learning_rate": 9.92718447697562e-05,
+      "loss": 0.5334,
+      "step": 1832
+    },
+    {
+      "epoch": 1.8452261306532662,
+      "grad_norm": 0.384765625,
+      "learning_rate": 9.867200357539708e-05,
+      "loss": 0.4883,
+      "step": 1836
+    },
+    {
+      "epoch": 1.849246231155779,
+      "grad_norm": 0.35546875,
+      "learning_rate": 9.807309061705372e-05,
+      "loss": 0.5091,
+      "step": 1840
+    },
+    {
+      "epoch": 1.8532663316582916,
+      "grad_norm": 0.37890625,
+      "learning_rate": 9.747511672570755e-05,
+      "loss": 0.5562,
+      "step": 1844
+    },
+    {
+      "epoch": 1.857286432160804,
+      "grad_norm": 0.36328125,
+      "learning_rate": 9.687809271535762e-05,
+      "loss": 0.5292,
+      "step": 1848
+    },
+    {
+      "epoch": 1.8613065326633165,
+      "grad_norm": 0.365234375,
+      "learning_rate": 9.628202938282493e-05,
+      "loss": 0.4938,
+      "step": 1852
+    },
+    {
+      "epoch": 1.8653266331658291,
+      "grad_norm": 0.376953125,
+      "learning_rate": 9.568693750755723e-05,
+      "loss": 0.5205,
+      "step": 1856
+    },
+    {
+      "epoch": 1.8693467336683418,
+      "grad_norm": 0.357421875,
+      "learning_rate": 9.50928278514338e-05,
+      "loss": 0.5144,
+      "step": 1860
+    },
+    {
+      "epoch": 1.8733668341708543,
+      "grad_norm": 0.388671875,
+      "learning_rate": 9.449971115857143e-05,
+      "loss": 0.5513,
+      "step": 1864
+    },
+    {
+      "epoch": 1.8773869346733667,
+      "grad_norm": 0.3828125,
+      "learning_rate": 9.390759815512959e-05,
+      "loss": 0.5286,
+      "step": 1868
+    },
+    {
+      "epoch": 1.8814070351758794,
+      "grad_norm": 0.388671875,
+      "learning_rate": 9.331649954911662e-05,
+      "loss": 0.5448,
+      "step": 1872
+    },
+    {
+      "epoch": 1.885427135678392,
+      "grad_norm": 0.373046875,
+      "learning_rate": 9.272642603019611e-05,
+      "loss": 0.5206,
+      "step": 1876
+    },
+    {
+      "epoch": 1.8894472361809045,
+      "grad_norm": 0.359375,
+      "learning_rate": 9.213738826949364e-05,
+      "loss": 0.5171,
+      "step": 1880
+    },
+    {
+      "epoch": 1.893467336683417,
+      "grad_norm": 0.38671875,
+      "learning_rate": 9.154939691940357e-05,
+      "loss": 0.5161,
+      "step": 1884
+    },
+    {
+      "epoch": 1.8974874371859296,
+      "grad_norm": 0.369140625,
+      "learning_rate": 9.096246261339669e-05,
+      "loss": 0.5415,
+      "step": 1888
+    },
+    {
+      "epoch": 1.9015075376884423,
+      "grad_norm": 0.373046875,
+      "learning_rate": 9.03765959658277e-05,
+      "loss": 0.5354,
+      "step": 1892
+    },
+    {
+      "epoch": 1.9055276381909547,
+      "grad_norm": 0.3671875,
+      "learning_rate": 8.979180757174341e-05,
+      "loss": 0.5186,
+      "step": 1896
+    },
+    {
+      "epoch": 1.9095477386934674,
+      "grad_norm": 0.37890625,
+      "learning_rate": 8.920810800669098e-05,
+      "loss": 0.5099,
+      "step": 1900
+    },
+    {
+      "epoch": 1.9135678391959798,
+      "grad_norm": 0.380859375,
+      "learning_rate": 8.862550782652686e-05,
+      "loss": 0.4908,
+      "step": 1904
+    },
+    {
+      "epoch": 1.9175879396984925,
+      "grad_norm": 0.3828125,
+      "learning_rate": 8.804401756722564e-05,
+      "loss": 0.5253,
+      "step": 1908
+    },
+    {
+      "epoch": 1.9216080402010052,
+      "grad_norm": 0.37890625,
+      "learning_rate": 8.746364774468973e-05,
+      "loss": 0.5318,
+      "step": 1912
+    },
+    {
+      "epoch": 1.9256281407035176,
+      "grad_norm": 0.37890625,
+      "learning_rate": 8.688440885455922e-05,
+      "loss": 0.516,
+      "step": 1916
+    },
+    {
+      "epoch": 1.92964824120603,
+      "grad_norm": 0.392578125,
+      "learning_rate": 8.630631137202172e-05,
+      "loss": 0.5279,
+      "step": 1920
+    },
+    {
+      "epoch": 1.9336683417085427,
+      "grad_norm": 0.36328125,
+      "learning_rate": 8.572936575162345e-05,
+      "loss": 0.522,
+      "step": 1924
+    },
+    {
+      "epoch": 1.9376884422110554,
+      "grad_norm": 0.384765625,
+      "learning_rate": 8.515358242707971e-05,
+      "loss": 0.5256,
+      "step": 1928
+    },
+    {
+      "epoch": 1.9417085427135679,
+      "grad_norm": 0.3671875,
+      "learning_rate": 8.457897181108633e-05,
+      "loss": 0.5411,
+      "step": 1932
+    },
+    {
+      "epoch": 1.9457286432160803,
+      "grad_norm": 0.390625,
+      "learning_rate": 8.400554429513164e-05,
+      "loss": 0.5088,
+      "step": 1936
+    },
+    {
+      "epoch": 1.949748743718593,
+      "grad_norm": 0.37109375,
+      "learning_rate": 8.343331024930805e-05,
+      "loss": 0.5169,
+      "step": 1940
+    },
+    {
+      "epoch": 1.9537688442211056,
+      "grad_norm": 0.37890625,
+      "learning_rate": 8.286228002212506e-05,
+      "loss": 0.5431,
+      "step": 1944
+    },
+    {
+      "epoch": 1.957788944723618,
+      "grad_norm": 0.359375,
+      "learning_rate": 8.229246394032151e-05,
+      "loss": 0.4912,
+      "step": 1948
+    },
+    {
+      "epoch": 1.9618090452261305,
+      "grad_norm": 0.359375,
+      "learning_rate": 8.172387230867946e-05,
+      "loss": 0.504,
+      "step": 1952
+    },
+    {
+      "epoch": 1.9658291457286432,
+      "grad_norm": 0.37109375,
+      "learning_rate": 8.115651540983735e-05,
+      "loss": 0.5055,
+      "step": 1956
+    },
+    {
+      "epoch": 1.9698492462311559,
+      "grad_norm": 0.357421875,
+      "learning_rate": 8.059040350410414e-05,
+      "loss": 0.4912,
+      "step": 1960
+    },
+    {
+      "epoch": 1.9738693467336683,
+      "grad_norm": 0.40625,
+      "learning_rate": 8.00255468292741e-05,
+      "loss": 0.519,
+      "step": 1964
+    },
+    {
+      "epoch": 1.9778894472361808,
+      "grad_norm": 0.38671875,
+      "learning_rate": 7.946195560044113e-05,
+      "loss": 0.5418,
+      "step": 1968
+    },
+    {
+      "epoch": 1.9819095477386934,
+      "grad_norm": 0.37890625,
+      "learning_rate": 7.889964000981446e-05,
+      "loss": 0.5355,
+      "step": 1972
+    },
+    {
+      "epoch": 1.985929648241206,
+      "grad_norm": 0.359375,
+      "learning_rate": 7.833861022653428e-05,
+      "loss": 0.4856,
+      "step": 1976
+    },
+    {
+      "epoch": 1.9899497487437185,
+      "grad_norm": 0.375,
+      "learning_rate": 7.777887639648728e-05,
+      "loss": 0.5165,
+      "step": 1980
+    },
+    {
+      "epoch": 1.993969849246231,
+      "grad_norm": 0.400390625,
+      "learning_rate": 7.722044864212408e-05,
+      "loss": 0.5453,
+      "step": 1984
+    },
+    {
+      "epoch": 1.9979899497487437,
+      "grad_norm": 0.3359375,
+      "learning_rate": 7.666333706227556e-05,
+      "loss": 0.5159,
+      "step": 1988
+    },
+    {
+      "epoch": 2.0020100502512563,
+      "grad_norm": 0.337890625,
+      "learning_rate": 7.610755173197023e-05,
+      "loss": 0.4954,
+      "step": 1992
+    },
+    {
+      "epoch": 2.006030150753769,
+      "grad_norm": 0.328125,
+      "learning_rate": 7.555310270225238e-05,
+      "loss": 0.4156,
+      "step": 1996
+    },
+    {
+      "epoch": 2.0100502512562812,
+      "grad_norm": 0.353515625,
+      "learning_rate": 7.500000000000002e-05,
+      "loss": 0.4518,
+      "step": 2000
+    },
+    {
+      "epoch": 2.014070351758794,
+      "grad_norm": 0.345703125,
+      "learning_rate": 7.444825362774351e-05,
+      "loss": 0.4622,
+      "step": 2004
+    },
+    {
+      "epoch": 2.0180904522613066,
+      "grad_norm": 0.361328125,
+      "learning_rate": 7.389787356348506e-05,
+      "loss": 0.502,
+      "step": 2008
+    },
+    {
+      "epoch": 2.0221105527638192,
+      "grad_norm": 0.359375,
+      "learning_rate": 7.334886976051775e-05,
+      "loss": 0.4602,
+      "step": 2012
+    },
+    {
+      "epoch": 2.0261306532663315,
+      "grad_norm": 0.353515625,
+      "learning_rate": 7.280125214724605e-05,
+      "loss": 0.4535,
+      "step": 2016
+    },
+    {
+      "epoch": 2.030150753768844,
+      "grad_norm": 0.3359375,
+      "learning_rate": 7.22550306270058e-05,
+      "loss": 0.4341,
+      "step": 2020
+    },
+    {
+      "epoch": 2.034170854271357,
+      "grad_norm": 0.361328125,
+      "learning_rate": 7.171021507788542e-05,
+      "loss": 0.3997,
+      "step": 2024
+    },
+    {
+      "epoch": 2.0381909547738695,
+      "grad_norm": 0.37890625,
+      "learning_rate": 7.116681535254728e-05,
+      "loss": 0.4226,
+      "step": 2028
+    },
+    {
+      "epoch": 2.0422110552763817,
+      "grad_norm": 0.34375,
+      "learning_rate": 7.062484127804927e-05,
+      "loss": 0.4244,
+      "step": 2032
+    },
+    {
+      "epoch": 2.0462311557788944,
+      "grad_norm": 0.369140625,
+      "learning_rate": 7.008430265566738e-05,
+      "loss": 0.4473,
+      "step": 2036
+    },
+    {
+      "epoch": 2.050251256281407,
+      "grad_norm": 0.345703125,
+      "learning_rate": 6.954520926071815e-05,
+      "loss": 0.4669,
+      "step": 2040
+    },
+    {
+      "epoch": 2.0542713567839197,
+      "grad_norm": 0.373046875,
+      "learning_rate": 6.900757084238225e-05,
+      "loss": 0.4699,
+      "step": 2044
+    },
+    {
+      "epoch": 2.0582914572864324,
+      "grad_norm": 0.369140625,
+      "learning_rate": 6.847139712352783e-05,
+      "loss": 0.3894,
+      "step": 2048
+    },
+    {
+      "epoch": 2.0623115577889446,
+      "grad_norm": 0.349609375,
+      "learning_rate": 6.793669780053477e-05,
+      "loss": 0.43,
+      "step": 2052
+    },
+    {
+      "epoch": 2.0663316582914573,
+      "grad_norm": 0.35546875,
+      "learning_rate": 6.740348254311956e-05,
+      "loss": 0.4588,
+      "step": 2056
+    },
+    {
+      "epoch": 2.07035175879397,
+      "grad_norm": 0.361328125,
+      "learning_rate": 6.687176099416022e-05,
+      "loss": 0.477,
+      "step": 2060
+    },
+    {
+      "epoch": 2.0743718592964826,
+      "grad_norm": 0.3671875,
+      "learning_rate": 6.634154276952179e-05,
+      "loss": 0.4173,
+      "step": 2064
+    },
+    {
+      "epoch": 2.078391959798995,
+      "grad_norm": 0.365234375,
+      "learning_rate": 6.581283745788287e-05,
+      "loss": 0.4395,
+      "step": 2068
+    },
+    {
+      "epoch": 2.0824120603015075,
+      "grad_norm": 0.36328125,
+      "learning_rate": 6.528565462056154e-05,
+      "loss": 0.455,
+      "step": 2072
+    },
+    {
+      "epoch": 2.08643216080402,
+      "grad_norm": 0.3828125,
+      "learning_rate": 6.47600037913432e-05,
+      "loss": 0.4537,
+      "step": 2076
+    },
+    {
+      "epoch": 2.090452261306533,
+      "grad_norm": 0.3671875,
+      "learning_rate": 6.423589447630772e-05,
+      "loss": 0.4805,
+      "step": 2080
+    },
+    {
+      "epoch": 2.094472361809045,
+      "grad_norm": 0.35546875,
+      "learning_rate": 6.371333615365747e-05,
+      "loss": 0.447,
+      "step": 2084
+    },
+    {
+      "epoch": 2.0984924623115577,
+      "grad_norm": 0.392578125,
+      "learning_rate": 6.319233827354623e-05,
+      "loss": 0.4475,
+      "step": 2088
+    },
+    {
+      "epoch": 2.1025125628140704,
+      "grad_norm": 0.37109375,
+      "learning_rate": 6.267291025790803e-05,
+      "loss": 0.4904,
+      "step": 2092
+    },
+    {
+      "epoch": 2.106532663316583,
+      "grad_norm": 0.37109375,
+      "learning_rate": 6.215506150028676e-05,
+      "loss": 0.4489,
+      "step": 2096
+    },
+    {
+      "epoch": 2.1105527638190953,
+      "grad_norm": 0.34765625,
+      "learning_rate": 6.163880136566658e-05,
+      "loss": 0.4131,
+      "step": 2100
+    },
+    {
+      "epoch": 2.114572864321608,
+      "grad_norm": 0.373046875,
+      "learning_rate": 6.112413919030214e-05,
+      "loss": 0.4356,
+      "step": 2104
+    },
+    {
+      "epoch": 2.1185929648241206,
+      "grad_norm": 0.369140625,
+      "learning_rate": 6.061108428155014e-05,
+      "loss": 0.4512,
+      "step": 2108
+    },
+    {
+      "epoch": 2.1226130653266333,
+      "grad_norm": 0.361328125,
+      "learning_rate": 6.0099645917700655e-05,
+      "loss": 0.4485,
+      "step": 2112
+    },
+    {
+      "epoch": 2.1266331658291455,
+      "grad_norm": 0.384765625,
+      "learning_rate": 5.9589833347809726e-05,
+      "loss": 0.4486,
+      "step": 2116
+    },
+    {
+      "epoch": 2.130653266331658,
+      "grad_norm": 0.375,
+      "learning_rate": 5.908165579153169e-05,
+      "loss": 0.4717,
+      "step": 2120
+    },
+    {
+      "epoch": 2.134673366834171,
+      "grad_norm": 0.37890625,
+      "learning_rate": 5.857512243895267e-05,
+      "loss": 0.4452,
+      "step": 2124
+    },
+    {
+      "epoch": 2.1386934673366835,
+      "grad_norm": 0.3828125,
+      "learning_rate": 5.8070242450424495e-05,
+      "loss": 0.4552,
+      "step": 2128
+    },
+    {
+      "epoch": 2.1427135678391958,
+      "grad_norm": 0.37890625,
+      "learning_rate": 5.756702495639871e-05,
+      "loss": 0.4634,
+      "step": 2132
+    },
+    {
+      "epoch": 2.1467336683417084,
+      "grad_norm": 0.369140625,
+      "learning_rate": 5.706547905726178e-05,
+      "loss": 0.4308,
+      "step": 2136
+    },
+    {
+      "epoch": 2.150753768844221,
+      "grad_norm": 0.359375,
+      "learning_rate": 5.656561382317047e-05,
+      "loss": 0.4589,
+      "step": 2140
+    },
+    {
+      "epoch": 2.1547738693467338,
+      "grad_norm": 0.376953125,
+      "learning_rate": 5.6067438293887346e-05,
+      "loss": 0.4743,
+      "step": 2144
+    },
+    {
+      "epoch": 2.1587939698492464,
+      "grad_norm": 0.375,
+      "learning_rate": 5.557096147861804e-05,
+      "loss": 0.4762,
+      "step": 2148
+    },
+    {
+      "epoch": 2.1628140703517587,
+      "grad_norm": 0.3984375,
+      "learning_rate": 5.50761923558479e-05,
+      "loss": 0.4836,
+      "step": 2152
+    },
+    {
+      "epoch": 2.1668341708542713,
+      "grad_norm": 0.38671875,
+      "learning_rate": 5.458313987317952e-05,
+      "loss": 0.4698,
+      "step": 2156
+    },
+    {
+      "epoch": 2.170854271356784,
+      "grad_norm": 0.3828125,
+      "learning_rate": 5.4091812947171285e-05,
+      "loss": 0.4198,
+      "step": 2160
+    },
+    {
+      "epoch": 2.1748743718592967,
+      "grad_norm": 0.373046875,
+      "learning_rate": 5.3602220463175784e-05,
+      "loss": 0.4556,
+      "step": 2164
+    },
+    {
+      "epoch": 2.178894472361809,
+      "grad_norm": 0.404296875,
+      "learning_rate": 5.3114371275179254e-05,
+      "loss": 0.4487,
+      "step": 2168
+    },
+    {
+      "epoch": 2.1829145728643216,
+      "grad_norm": 0.390625,
+      "learning_rate": 5.262827420564162e-05,
+      "loss": 0.4466,
+      "step": 2172
+    },
+    {
+      "epoch": 2.1869346733668342,
+      "grad_norm": 0.36328125,
+      "learning_rate": 5.214393804533662e-05,
+      "loss": 0.4432,
+      "step": 2176
+    },
+    {
+      "epoch": 2.190954773869347,
+      "grad_norm": 0.384765625,
+      "learning_rate": 5.166137155319317e-05,
+      "loss": 0.4661,
+      "step": 2180
+    },
+    {
+      "epoch": 2.194974874371859,
+      "grad_norm": 0.373046875,
+      "learning_rate": 5.118058345613661e-05,
+      "loss": 0.4527,
+      "step": 2184
+    },
+    {
+      "epoch": 2.198994974874372,
+      "grad_norm": 0.357421875,
+      "learning_rate": 5.0701582448931284e-05,
+      "loss": 0.4467,
+      "step": 2188
+    },
+    {
+      "epoch": 2.2030150753768845,
+      "grad_norm": 0.37890625,
+      "learning_rate": 5.0224377194022936e-05,
+      "loss": 0.4384,
+      "step": 2192
+    },
+    {
+      "epoch": 2.207035175879397,
+      "grad_norm": 0.375,
+      "learning_rate": 4.974897632138219e-05,
+      "loss": 0.4337,
+      "step": 2196
+    },
+    {
+      "epoch": 2.2110552763819094,
+      "grad_norm": 0.361328125,
+      "learning_rate": 4.927538842834865e-05,
+      "loss": 0.427,
+      "step": 2200
+    },
+    {
+      "epoch": 2.215075376884422,
+      "grad_norm": 0.357421875,
+      "learning_rate": 4.880362207947512e-05,
+      "loss": 0.4358,
+      "step": 2204
+    },
+    {
+      "epoch": 2.2190954773869347,
+      "grad_norm": 0.373046875,
+      "learning_rate": 4.8333685806373014e-05,
+      "loss": 0.4364,
+      "step": 2208
+    },
+    {
+      "epoch": 2.2231155778894474,
+      "grad_norm": 0.369140625,
+      "learning_rate": 4.7865588107557773e-05,
+      "loss": 0.4491,
+      "step": 2212
+    },
+    {
+      "epoch": 2.22713567839196,
+      "grad_norm": 0.3671875,
+      "learning_rate": 4.7399337448295386e-05,
+      "loss": 0.4603,
+      "step": 2216
+    },
+    {
+      "epoch": 2.2311557788944723,
+      "grad_norm": 0.3984375,
+      "learning_rate": 4.6934942260449314e-05,
+      "loss": 0.4182,
+      "step": 2220
+    },
+    {
+      "epoch": 2.235175879396985,
+      "grad_norm": 0.365234375,
+      "learning_rate": 4.6472410942327806e-05,
+      "loss": 0.4323,
+      "step": 2224
+    },
+    {
+      "epoch": 2.2391959798994976,
+      "grad_norm": 0.353515625,
+      "learning_rate": 4.601175185853222e-05,
+      "loss": 0.434,
+      "step": 2228
+    },
+    {
+      "epoch": 2.24321608040201,
+      "grad_norm": 0.369140625,
+      "learning_rate": 4.5552973339805775e-05,
+      "loss": 0.435,
+      "step": 2232
+    },
+    {
+      "epoch": 2.2472361809045225,
+      "grad_norm": 0.36328125,
+      "learning_rate": 4.509608368288249e-05,
+      "loss": 0.4539,
+      "step": 2236
+    },
+    {
+      "epoch": 2.251256281407035,
+      "grad_norm": 0.38671875,
+      "learning_rate": 4.4641091150337774e-05,
+      "loss": 0.454,
+      "step": 2240
+    },
+    {
+      "epoch": 2.255276381909548,
+      "grad_norm": 0.40234375,
+      "learning_rate": 4.418800397043857e-05,
+      "loss": 0.4908,
+      "step": 2244
+    },
+    {
+      "epoch": 2.2592964824120605,
+      "grad_norm": 0.369140625,
+      "learning_rate": 4.373683033699459e-05,
+      "loss": 0.4652,
+      "step": 2248
+    },
+    {
+      "epoch": 2.2633165829145727,
+      "grad_norm": 0.365234375,
+      "learning_rate": 4.328757840921033e-05,
+      "loss": 0.4229,
+      "step": 2252
+    },
+    {
+      "epoch": 2.2673366834170854,
+      "grad_norm": 0.396484375,
+      "learning_rate": 4.2840256311537305e-05,
+      "loss": 0.462,
+      "step": 2256
+    },
+    {
+      "epoch": 2.271356783919598,
+      "grad_norm": 0.38671875,
+      "learning_rate": 4.239487213352716e-05,
+      "loss": 0.449,
+      "step": 2260
+    },
+    {
+      "epoch": 2.2753768844221107,
+      "grad_norm": 0.40625,
+      "learning_rate": 4.195143392968563e-05,
+      "loss": 0.4464,
+      "step": 2264
+    },
+    {
+      "epoch": 2.279396984924623,
+      "grad_norm": 0.373046875,
+      "learning_rate": 4.150994971932643e-05,
+      "loss": 0.4218,
+      "step": 2268
+    },
+    {
+      "epoch": 2.2834170854271356,
+      "grad_norm": 0.38671875,
+      "learning_rate": 4.1070427486426674e-05,
+      "loss": 0.4373,
+      "step": 2272
+    },
+    {
+      "epoch": 2.2874371859296483,
+      "grad_norm": 0.357421875,
+      "learning_rate": 4.0632875179482114e-05,
+      "loss": 0.4874,
+      "step": 2276
+    },
+    {
+      "epoch": 2.291457286432161,
+      "grad_norm": 0.359375,
+      "learning_rate": 4.019730071136379e-05,
+      "loss": 0.4531,
+      "step": 2280
+    },
+    {
+      "epoch": 2.295477386934673,
+      "grad_norm": 0.369140625,
+      "learning_rate": 3.97637119591745e-05,
+      "loss": 0.4166,
+      "step": 2284
+    },
+    {
+      "epoch": 2.299497487437186,
+      "grad_norm": 0.39453125,
+      "learning_rate": 3.933211676410664e-05,
+      "loss": 0.4153,
+      "step": 2288
+    },
+    {
+      "epoch": 2.3035175879396985,
+      "grad_norm": 0.392578125,
+      "learning_rate": 3.8902522931300416e-05,
+      "loss": 0.4459,
+      "step": 2292
+    },
+    {
+      "epoch": 2.307537688442211,
+      "grad_norm": 0.369140625,
+      "learning_rate": 3.847493822970241e-05,
+      "loss": 0.4454,
+      "step": 2296
+    },
+    {
+      "epoch": 2.3115577889447234,
+      "grad_norm": 0.3671875,
+      "learning_rate": 3.8049370391925424e-05,
+      "loss": 0.4405,
+      "step": 2300
+    },
+    {
+      "epoch": 2.315577889447236,
+      "grad_norm": 0.35546875,
+      "learning_rate": 3.7625827114108533e-05,
+      "loss": 0.4672,
+      "step": 2304
+    },
+    {
+      "epoch": 2.3195979899497488,
+      "grad_norm": 0.376953125,
+      "learning_rate": 3.72043160557776e-05,
+      "loss": 0.4154,
+      "step": 2308
+    },
+    {
+      "epoch": 2.3236180904522614,
+      "grad_norm": 0.365234375,
+      "learning_rate": 3.678484483970731e-05,
+      "loss": 0.4802,
+      "step": 2312
+    },
+    {
+      "epoch": 2.327638190954774,
+      "grad_norm": 0.37890625,
+      "learning_rate": 3.636742105178281e-05,
+      "loss": 0.4569,
+      "step": 2316
+    },
+    {
+      "epoch": 2.3316582914572863,
+      "grad_norm": 0.39453125,
+      "learning_rate": 3.5952052240862885e-05,
+      "loss": 0.4539,
+      "step": 2320
+    },
+    {
+      "epoch": 2.335678391959799,
+      "grad_norm": 0.361328125,
+      "learning_rate": 3.553874591864325e-05,
+      "loss": 0.4037,
+      "step": 2324
+    },
+    {
+      "epoch": 2.3396984924623117,
+      "grad_norm": 0.353515625,
+      "learning_rate": 3.5127509559520715e-05,
+      "loss": 0.4787,
+      "step": 2328
+    },
+    {
+      "epoch": 2.343718592964824,
+      "grad_norm": 0.3515625,
+      "learning_rate": 3.471835060045804e-05,
+      "loss": 0.3874,
+      "step": 2332
+    },
+    {
+      "epoch": 2.3477386934673365,
+      "grad_norm": 0.3828125,
+      "learning_rate": 3.431127644084953e-05,
+      "loss": 0.4441,
+      "step": 2336
+    },
+    {
+      "epoch": 2.351758793969849,
+      "grad_norm": 0.369140625,
+      "learning_rate": 3.390629444238704e-05,
+      "loss": 0.4572,
+      "step": 2340
+    },
+    {
+      "epoch": 2.355778894472362,
+      "grad_norm": 0.376953125,
+      "learning_rate": 3.350341192892708e-05,
+      "loss": 0.4298,
+      "step": 2344
+    },
+    {
+      "epoch": 2.3597989949748746,
+      "grad_norm": 0.345703125,
+      "learning_rate": 3.310263618635807e-05,
+      "loss": 0.4108,
+      "step": 2348
+    },
+    {
+      "epoch": 2.363819095477387,
+      "grad_norm": 0.388671875,
+      "learning_rate": 3.2703974462468896e-05,
+      "loss": 0.4497,
+      "step": 2352
+    },
+    {
+      "epoch": 2.3678391959798994,
+      "grad_norm": 0.365234375,
+      "learning_rate": 3.2307433966817594e-05,
+      "loss": 0.4525,
+      "step": 2356
+    },
+    {
+      "epoch": 2.371859296482412,
+      "grad_norm": 0.37890625,
+      "learning_rate": 3.191302187060103e-05,
+      "loss": 0.4586,
+      "step": 2360
+    },
+    {
+      "epoch": 2.375879396984925,
+      "grad_norm": 0.376953125,
+      "learning_rate": 3.152074530652539e-05,
+      "loss": 0.4659,
+      "step": 2364
+    },
+    {
+      "epoch": 2.379899497487437,
+      "grad_norm": 0.353515625,
+      "learning_rate": 3.113061136867685e-05,
+      "loss": 0.4744,
+      "step": 2368
+    },
+    {
+      "epoch": 2.3839195979899497,
+      "grad_norm": 0.345703125,
+      "learning_rate": 3.074262711239367e-05,
+      "loss": 0.4177,
+      "step": 2372
+    },
+    {
+      "epoch": 2.3879396984924623,
+      "grad_norm": 0.359375,
+      "learning_rate": 3.0356799554138256e-05,
+      "loss": 0.4621,
+      "step": 2376
+    },
+    {
+      "epoch": 2.391959798994975,
+      "grad_norm": 0.357421875,
+      "learning_rate": 2.9973135671370452e-05,
+      "loss": 0.4114,
+      "step": 2380
+    },
+    {
+      "epoch": 2.3959798994974877,
+      "grad_norm": 0.337890625,
+      "learning_rate": 2.959164240242145e-05,
+      "loss": 0.4193,
+      "step": 2384
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.365234375,
+      "learning_rate": 2.9212326646367995e-05,
+      "loss": 0.4469,
+      "step": 2388
+    },
+    {
+      "epoch": 2.4040201005025126,
+      "grad_norm": 0.357421875,
+      "learning_rate": 2.883519526290798e-05,
+      "loss": 0.4398,
+      "step": 2392
+    },
+    {
+      "epoch": 2.4080402010050252,
+      "grad_norm": 0.39453125,
+      "learning_rate": 2.8460255072236226e-05,
+      "loss": 0.4424,
+      "step": 2396
+    },
+    {
+      "epoch": 2.4120603015075375,
+      "grad_norm": 0.373046875,
+      "learning_rate": 2.8087512854921044e-05,
+      "loss": 0.4459,
+      "step": 2400
+    },
+    {
+      "epoch": 2.41608040201005,
+      "grad_norm": 0.37890625,
+      "learning_rate": 2.7716975351781772e-05,
+      "loss": 0.4363,
+      "step": 2404
+    },
+    {
+      "epoch": 2.420100502512563,
+      "grad_norm": 0.376953125,
+      "learning_rate": 2.734864926376677e-05,
+      "loss": 0.4646,
+      "step": 2408
+    },
+    {
+      "epoch": 2.4241206030150755,
+      "grad_norm": 0.357421875,
+      "learning_rate": 2.698254125183236e-05,
+      "loss": 0.4419,
+      "step": 2412
+    },
+    {
+      "epoch": 2.428140703517588,
+      "grad_norm": 0.353515625,
+      "learning_rate": 2.6618657936822308e-05,
+      "loss": 0.421,
+      "step": 2416
+    },
+    {
+      "epoch": 2.4321608040201004,
+      "grad_norm": 0.369140625,
+      "learning_rate": 2.6257005899347962e-05,
+      "loss": 0.4533,
+      "step": 2420
+    },
+    {
+      "epoch": 2.436180904522613,
+      "grad_norm": 0.357421875,
+      "learning_rate": 2.5897591679669367e-05,
+      "loss": 0.4531,
+      "step": 2424
+    },
+    {
+      "epoch": 2.4402010050251257,
+      "grad_norm": 0.341796875,
+      "learning_rate": 2.5540421777577114e-05,
+      "loss": 0.3943,
+      "step": 2428
+    },
+    {
+      "epoch": 2.4442211055276384,
+      "grad_norm": 0.361328125,
+      "learning_rate": 2.5185502652274475e-05,
+      "loss": 0.4607,
+      "step": 2432
+    },
+    {
+      "epoch": 2.4482412060301506,
+      "grad_norm": 0.388671875,
+      "learning_rate": 2.4832840722260915e-05,
+      "loss": 0.4372,
+      "step": 2436
+    },
+    {
+      "epoch": 2.4522613065326633,
+      "grad_norm": 0.36328125,
+      "learning_rate": 2.4482442365215788e-05,
+      "loss": 0.4316,
+      "step": 2440
+    },
+    {
+      "epoch": 2.456281407035176,
+      "grad_norm": 0.392578125,
+      "learning_rate": 2.413431391788317e-05,
+      "loss": 0.4207,
+      "step": 2444
+    },
+    {
+      "epoch": 2.4603015075376886,
+      "grad_norm": 0.369140625,
+      "learning_rate": 2.3788461675957094e-05,
+      "loss": 0.4368,
+      "step": 2448
+    },
+    {
+      "epoch": 2.464321608040201,
+      "grad_norm": 0.37890625,
+      "learning_rate": 2.3444891893967804e-05,
+      "loss": 0.4642,
+      "step": 2452
+    },
+    {
+      "epoch": 2.4683417085427135,
+      "grad_norm": 0.37890625,
+      "learning_rate": 2.3103610785168714e-05,
+      "loss": 0.4541,
+      "step": 2456
+    },
+    {
+      "epoch": 2.472361809045226,
+      "grad_norm": 0.375,
+      "learning_rate": 2.2764624521423824e-05,
+      "loss": 0.4488,
+      "step": 2460
+    },
+    {
+      "epoch": 2.476381909547739,
+      "grad_norm": 0.36328125,
+      "learning_rate": 2.2427939233096355e-05,
+      "loss": 0.446,
+      "step": 2464
+    },
+    {
+      "epoch": 2.480402010050251,
+      "grad_norm": 0.34375,
+      "learning_rate": 2.2093561008937703e-05,
+      "loss": 0.4293,
+      "step": 2468
+    },
+    {
+      "epoch": 2.4844221105527637,
+      "grad_norm": 0.369140625,
+      "learning_rate": 2.1761495895977372e-05,
+      "loss": 0.4381,
+      "step": 2472
+    },
+    {
+      "epoch": 2.4884422110552764,
+      "grad_norm": 0.373046875,
+      "learning_rate": 2.1431749899413726e-05,
+      "loss": 0.4362,
+      "step": 2476
+    },
+    {
+      "epoch": 2.492462311557789,
+      "grad_norm": 0.359375,
+      "learning_rate": 2.1104328982505185e-05,
+      "loss": 0.407,
+      "step": 2480
+    },
+    {
+      "epoch": 2.4964824120603017,
+      "grad_norm": 0.3828125,
+      "learning_rate": 2.0779239066462595e-05,
+      "loss": 0.4392,
+      "step": 2484
+    },
+    {
+      "epoch": 2.500502512562814,
+      "grad_norm": 0.392578125,
+      "learning_rate": 2.0456486030342057e-05,
+      "loss": 0.4642,
+      "step": 2488
+    },
+    {
+      "epoch": 2.5045226130653266,
+      "grad_norm": 0.384765625,
+      "learning_rate": 2.013607571093852e-05,
+      "loss": 0.4557,
+      "step": 2492
+    },
+    {
+      "epoch": 2.5085427135678393,
+      "grad_norm": 0.380859375,
+      "learning_rate": 1.981801390268034e-05,
+      "loss": 0.4448,
+      "step": 2496
+    },
+    {
+      "epoch": 2.5125628140703515,
+      "grad_norm": 0.376953125,
+      "learning_rate": 1.9502306357524443e-05,
+      "loss": 0.4698,
+      "step": 2500
+    }
+  ],
+  "logging_steps": 4,
+  "max_steps": 2985,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.6357403205555978e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}