{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 4395,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034129692832764505,
"grad_norm": 1.8359375,
"learning_rate": 4.545454545454545e-07,
"loss": 3.0499,
"step": 1
},
{
"epoch": 0.017064846416382253,
"grad_norm": 2.234375,
"learning_rate": 2.2727272727272728e-06,
"loss": 3.0434,
"step": 5
},
{
"epoch": 0.034129692832764506,
"grad_norm": 2.078125,
"learning_rate": 4.5454545454545455e-06,
"loss": 3.0699,
"step": 10
},
{
"epoch": 0.051194539249146756,
"grad_norm": 1.8515625,
"learning_rate": 6.818181818181818e-06,
"loss": 3.0656,
"step": 15
},
{
"epoch": 0.06825938566552901,
"grad_norm": 3.0,
"learning_rate": 9.090909090909091e-06,
"loss": 3.0526,
"step": 20
},
{
"epoch": 0.08532423208191127,
"grad_norm": 2.015625,
"learning_rate": 1.1363636363636365e-05,
"loss": 3.0382,
"step": 25
},
{
"epoch": 0.10238907849829351,
"grad_norm": 17.125,
"learning_rate": 1.3636363636363637e-05,
"loss": 2.982,
"step": 30
},
{
"epoch": 0.11945392491467577,
"grad_norm": 2.03125,
"learning_rate": 1.590909090909091e-05,
"loss": 2.9332,
"step": 35
},
{
"epoch": 0.13651877133105803,
"grad_norm": 3.140625,
"learning_rate": 1.8181818181818182e-05,
"loss": 2.8934,
"step": 40
},
{
"epoch": 0.15358361774744028,
"grad_norm": 3.25,
"learning_rate": 2.0454545454545457e-05,
"loss": 2.7804,
"step": 45
},
{
"epoch": 0.17064846416382254,
"grad_norm": 1.3984375,
"learning_rate": 2.272727272727273e-05,
"loss": 2.7194,
"step": 50
},
{
"epoch": 0.18771331058020477,
"grad_norm": 1.296875,
"learning_rate": 2.5e-05,
"loss": 2.5961,
"step": 55
},
{
"epoch": 0.20477815699658702,
"grad_norm": 1.6640625,
"learning_rate": 2.7272727272727273e-05,
"loss": 2.5046,
"step": 60
},
{
"epoch": 0.22184300341296928,
"grad_norm": 1.0234375,
"learning_rate": 2.954545454545455e-05,
"loss": 2.3975,
"step": 65
},
{
"epoch": 0.23890784982935154,
"grad_norm": 1.390625,
"learning_rate": 3.181818181818182e-05,
"loss": 2.3091,
"step": 70
},
{
"epoch": 0.25597269624573377,
"grad_norm": 1.1171875,
"learning_rate": 3.409090909090909e-05,
"loss": 2.2036,
"step": 75
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.84375,
"learning_rate": 3.6363636363636364e-05,
"loss": 2.113,
"step": 80
},
{
"epoch": 0.2901023890784983,
"grad_norm": 1.7421875,
"learning_rate": 3.8636363636363636e-05,
"loss": 2.043,
"step": 85
},
{
"epoch": 0.30716723549488056,
"grad_norm": 6.375,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.9568,
"step": 90
},
{
"epoch": 0.3242320819112628,
"grad_norm": 0.71484375,
"learning_rate": 4.318181818181819e-05,
"loss": 1.8927,
"step": 95
},
{
"epoch": 0.3412969283276451,
"grad_norm": 1.5078125,
"learning_rate": 4.545454545454546e-05,
"loss": 1.8394,
"step": 100
},
{
"epoch": 0.3583617747440273,
"grad_norm": 25.625,
"learning_rate": 4.772727272727273e-05,
"loss": 1.7808,
"step": 105
},
{
"epoch": 0.37542662116040953,
"grad_norm": 0.54296875,
"learning_rate": 5e-05,
"loss": 1.7467,
"step": 110
},
{
"epoch": 0.3924914675767918,
"grad_norm": 0.8671875,
"learning_rate": 5.2272727272727274e-05,
"loss": 1.6988,
"step": 115
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.609375,
"learning_rate": 5.4545454545454546e-05,
"loss": 1.6442,
"step": 120
},
{
"epoch": 0.42662116040955633,
"grad_norm": 0.5078125,
"learning_rate": 5.6818181818181825e-05,
"loss": 1.5875,
"step": 125
},
{
"epoch": 0.44368600682593856,
"grad_norm": 0.890625,
"learning_rate": 5.90909090909091e-05,
"loss": 1.5646,
"step": 130
},
{
"epoch": 0.46075085324232085,
"grad_norm": 0.53125,
"learning_rate": 6.136363636363636e-05,
"loss": 1.5244,
"step": 135
},
{
"epoch": 0.4778156996587031,
"grad_norm": 0.39453125,
"learning_rate": 6.363636363636364e-05,
"loss": 1.4945,
"step": 140
},
{
"epoch": 0.4948805460750853,
"grad_norm": 0.447265625,
"learning_rate": 6.59090909090909e-05,
"loss": 1.469,
"step": 145
},
{
"epoch": 0.5119453924914675,
"grad_norm": 0.75390625,
"learning_rate": 6.818181818181818e-05,
"loss": 1.4478,
"step": 150
},
{
"epoch": 0.5290102389078498,
"grad_norm": 0.392578125,
"learning_rate": 7.045454545454546e-05,
"loss": 1.4291,
"step": 155
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.314453125,
"learning_rate": 7.272727272727273e-05,
"loss": 1.4056,
"step": 160
},
{
"epoch": 0.5631399317406144,
"grad_norm": 0.326171875,
"learning_rate": 7.500000000000001e-05,
"loss": 1.3839,
"step": 165
},
{
"epoch": 0.5802047781569966,
"grad_norm": 0.283203125,
"learning_rate": 7.727272727272727e-05,
"loss": 1.3664,
"step": 170
},
{
"epoch": 0.5972696245733788,
"grad_norm": 0.39453125,
"learning_rate": 7.954545454545455e-05,
"loss": 1.3557,
"step": 175
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.33984375,
"learning_rate": 8.181818181818183e-05,
"loss": 1.3317,
"step": 180
},
{
"epoch": 0.6313993174061433,
"grad_norm": 0.3125,
"learning_rate": 8.40909090909091e-05,
"loss": 1.3323,
"step": 185
},
{
"epoch": 0.6484641638225256,
"grad_norm": 0.384765625,
"learning_rate": 8.636363636363637e-05,
"loss": 1.3129,
"step": 190
},
{
"epoch": 0.6655290102389079,
"grad_norm": 0.435546875,
"learning_rate": 8.863636363636364e-05,
"loss": 1.3194,
"step": 195
},
{
"epoch": 0.6825938566552902,
"grad_norm": 0.4140625,
"learning_rate": 9.090909090909092e-05,
"loss": 1.2992,
"step": 200
},
{
"epoch": 0.6996587030716723,
"grad_norm": 0.296875,
"learning_rate": 9.318181818181818e-05,
"loss": 1.2934,
"step": 205
},
{
"epoch": 0.7167235494880546,
"grad_norm": 0.515625,
"learning_rate": 9.545454545454546e-05,
"loss": 1.2759,
"step": 210
},
{
"epoch": 0.7337883959044369,
"grad_norm": 0.52734375,
"learning_rate": 9.772727272727274e-05,
"loss": 1.2775,
"step": 215
},
{
"epoch": 0.7508532423208191,
"grad_norm": 0.357421875,
"learning_rate": 0.0001,
"loss": 1.2696,
"step": 220
},
{
"epoch": 0.7679180887372014,
"grad_norm": 0.29296875,
"learning_rate": 0.00010227272727272727,
"loss": 1.2621,
"step": 225
},
{
"epoch": 0.7849829351535836,
"grad_norm": 0.3359375,
"learning_rate": 0.00010454545454545455,
"loss": 1.251,
"step": 230
},
{
"epoch": 0.8020477815699659,
"grad_norm": 0.419921875,
"learning_rate": 0.00010681818181818181,
"loss": 1.2544,
"step": 235
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.48046875,
"learning_rate": 0.00010909090909090909,
"loss": 1.2528,
"step": 240
},
{
"epoch": 0.8361774744027304,
"grad_norm": 0.5234375,
"learning_rate": 0.00011136363636363636,
"loss": 1.2459,
"step": 245
},
{
"epoch": 0.8532423208191127,
"grad_norm": 0.455078125,
"learning_rate": 0.00011363636363636365,
"loss": 1.2322,
"step": 250
},
{
"epoch": 0.8703071672354948,
"grad_norm": 0.451171875,
"learning_rate": 0.00011590909090909093,
"loss": 1.2154,
"step": 255
},
{
"epoch": 0.8873720136518771,
"grad_norm": 0.44140625,
"learning_rate": 0.0001181818181818182,
"loss": 1.2258,
"step": 260
},
{
"epoch": 0.9044368600682594,
"grad_norm": 0.56640625,
"learning_rate": 0.00012045454545454546,
"loss": 1.213,
"step": 265
},
{
"epoch": 0.9215017064846417,
"grad_norm": 0.46875,
"learning_rate": 0.00012272727272727272,
"loss": 1.224,
"step": 270
},
{
"epoch": 0.9385665529010239,
"grad_norm": 0.51171875,
"learning_rate": 0.000125,
"loss": 1.2093,
"step": 275
},
{
"epoch": 0.9556313993174061,
"grad_norm": 0.90234375,
"learning_rate": 0.00012727272727272728,
"loss": 1.2132,
"step": 280
},
{
"epoch": 0.9726962457337884,
"grad_norm": 0.63671875,
"learning_rate": 0.00012954545454545456,
"loss": 1.2083,
"step": 285
},
{
"epoch": 0.9897610921501706,
"grad_norm": 0.671875,
"learning_rate": 0.0001318181818181818,
"loss": 1.2085,
"step": 290
},
{
"epoch": 1.0,
"eval_loss": 2.486323833465576,
"eval_runtime": 0.5451,
"eval_samples_per_second": 18.345,
"eval_steps_per_second": 1.834,
"step": 293
},
{
"epoch": 1.006825938566553,
"grad_norm": 0.52734375,
"learning_rate": 0.0001340909090909091,
"loss": 1.1892,
"step": 295
},
{
"epoch": 1.023890784982935,
"grad_norm": 0.4296875,
"learning_rate": 0.00013636363636363637,
"loss": 1.191,
"step": 300
},
{
"epoch": 1.0409556313993173,
"grad_norm": 0.5390625,
"learning_rate": 0.00013863636363636365,
"loss": 1.18,
"step": 305
},
{
"epoch": 1.0580204778156996,
"grad_norm": 0.703125,
"learning_rate": 0.00014090909090909093,
"loss": 1.1964,
"step": 310
},
{
"epoch": 1.075085324232082,
"grad_norm": 0.435546875,
"learning_rate": 0.0001431818181818182,
"loss": 1.1877,
"step": 315
},
{
"epoch": 1.0921501706484642,
"grad_norm": 0.59375,
"learning_rate": 0.00014545454545454546,
"loss": 1.1846,
"step": 320
},
{
"epoch": 1.1092150170648465,
"grad_norm": 0.8125,
"learning_rate": 0.00014772727272727274,
"loss": 1.1833,
"step": 325
},
{
"epoch": 1.1262798634812285,
"grad_norm": 0.9296875,
"learning_rate": 0.00015000000000000001,
"loss": 1.1704,
"step": 330
},
{
"epoch": 1.1433447098976108,
"grad_norm": 1.5703125,
"learning_rate": 0.00015227272727272727,
"loss": 1.1886,
"step": 335
},
{
"epoch": 1.1604095563139931,
"grad_norm": 0.4609375,
"learning_rate": 0.00015454545454545454,
"loss": 1.1759,
"step": 340
},
{
"epoch": 1.1774744027303754,
"grad_norm": 0.337890625,
"learning_rate": 0.00015681818181818182,
"loss": 1.1712,
"step": 345
},
{
"epoch": 1.1945392491467577,
"grad_norm": 0.48046875,
"learning_rate": 0.0001590909090909091,
"loss": 1.1637,
"step": 350
},
{
"epoch": 1.21160409556314,
"grad_norm": 0.58203125,
"learning_rate": 0.00016136363636363635,
"loss": 1.1657,
"step": 355
},
{
"epoch": 1.2286689419795223,
"grad_norm": 0.54296875,
"learning_rate": 0.00016363636363636366,
"loss": 1.1745,
"step": 360
},
{
"epoch": 1.2457337883959045,
"grad_norm": 0.421875,
"learning_rate": 0.00016590909090909094,
"loss": 1.1496,
"step": 365
},
{
"epoch": 1.2627986348122868,
"grad_norm": 0.546875,
"learning_rate": 0.0001681818181818182,
"loss": 1.1653,
"step": 370
},
{
"epoch": 1.2798634812286689,
"grad_norm": 0.5078125,
"learning_rate": 0.00017045454545454547,
"loss": 1.1702,
"step": 375
},
{
"epoch": 1.2969283276450512,
"grad_norm": 0.53125,
"learning_rate": 0.00017272727272727275,
"loss": 1.15,
"step": 380
},
{
"epoch": 1.3139931740614335,
"grad_norm": 1.2421875,
"learning_rate": 0.000175,
"loss": 1.1615,
"step": 385
},
{
"epoch": 1.3310580204778157,
"grad_norm": 1.625,
"learning_rate": 0.00017727272727272728,
"loss": 1.1662,
"step": 390
},
{
"epoch": 1.348122866894198,
"grad_norm": 0.5,
"learning_rate": 0.00017954545454545456,
"loss": 1.1579,
"step": 395
},
{
"epoch": 1.36518771331058,
"grad_norm": 0.90234375,
"learning_rate": 0.00018181818181818183,
"loss": 1.1628,
"step": 400
},
{
"epoch": 1.3822525597269624,
"grad_norm": 0.3515625,
"learning_rate": 0.00018409090909090909,
"loss": 1.1521,
"step": 405
},
{
"epoch": 1.3993174061433447,
"grad_norm": 0.455078125,
"learning_rate": 0.00018636363636363636,
"loss": 1.1422,
"step": 410
},
{
"epoch": 1.416382252559727,
"grad_norm": 0.52734375,
"learning_rate": 0.00018863636363636364,
"loss": 1.1408,
"step": 415
},
{
"epoch": 1.4334470989761092,
"grad_norm": 0.53515625,
"learning_rate": 0.00019090909090909092,
"loss": 1.1356,
"step": 420
},
{
"epoch": 1.4505119453924915,
"grad_norm": 0.46875,
"learning_rate": 0.0001931818181818182,
"loss": 1.1497,
"step": 425
},
{
"epoch": 1.4675767918088738,
"grad_norm": 0.47265625,
"learning_rate": 0.00019545454545454548,
"loss": 1.1437,
"step": 430
},
{
"epoch": 1.484641638225256,
"grad_norm": 0.42578125,
"learning_rate": 0.00019772727272727273,
"loss": 1.1518,
"step": 435
},
{
"epoch": 1.5017064846416384,
"grad_norm": 0.38671875,
"learning_rate": 0.0002,
"loss": 1.1518,
"step": 440
},
{
"epoch": 1.5187713310580204,
"grad_norm": 0.3984375,
"learning_rate": 0.000199999211292062,
"loss": 1.1498,
"step": 445
},
{
"epoch": 1.5358361774744027,
"grad_norm": 0.388671875,
"learning_rate": 0.00019999684518068916,
"loss": 1.1378,
"step": 450
},
{
"epoch": 1.552901023890785,
"grad_norm": 0.87109375,
"learning_rate": 0.00019999290170320485,
"loss": 1.1434,
"step": 455
},
{
"epoch": 1.5699658703071673,
"grad_norm": 0.6953125,
"learning_rate": 0.00019998738092181421,
"loss": 1.1417,
"step": 460
},
{
"epoch": 1.5870307167235493,
"grad_norm": 0.79296875,
"learning_rate": 0.00019998028292360286,
"loss": 1.1329,
"step": 465
},
{
"epoch": 1.6040955631399316,
"grad_norm": 0.5625,
"learning_rate": 0.00019997160782053578,
"loss": 1.1339,
"step": 470
},
{
"epoch": 1.621160409556314,
"grad_norm": 0.384765625,
"learning_rate": 0.00019996135574945544,
"loss": 1.1273,
"step": 475
},
{
"epoch": 1.6382252559726962,
"grad_norm": 0.455078125,
"learning_rate": 0.00019994952687207954,
"loss": 1.1343,
"step": 480
},
{
"epoch": 1.6552901023890785,
"grad_norm": 0.69921875,
"learning_rate": 0.00019993612137499876,
"loss": 1.1374,
"step": 485
},
{
"epoch": 1.6723549488054608,
"grad_norm": 0.88671875,
"learning_rate": 0.00019992113946967353,
"loss": 1.1368,
"step": 490
},
{
"epoch": 1.689419795221843,
"grad_norm": 0.490234375,
"learning_rate": 0.00019990458139243077,
"loss": 1.1289,
"step": 495
},
{
"epoch": 1.7064846416382253,
"grad_norm": 0.53515625,
"learning_rate": 0.00019988644740446022,
"loss": 1.1255,
"step": 500
},
{
"epoch": 1.7235494880546076,
"grad_norm": 0.5234375,
"learning_rate": 0.00019986673779181033,
"loss": 1.1149,
"step": 505
},
{
"epoch": 1.74061433447099,
"grad_norm": 0.41015625,
"learning_rate": 0.0001998454528653836,
"loss": 1.1241,
"step": 510
},
{
"epoch": 1.757679180887372,
"grad_norm": 0.41796875,
"learning_rate": 0.0001998225929609319,
"loss": 1.1252,
"step": 515
},
{
"epoch": 1.7747440273037542,
"grad_norm": 0.458984375,
"learning_rate": 0.00019979815843905097,
"loss": 1.1292,
"step": 520
},
{
"epoch": 1.7918088737201365,
"grad_norm": 0.400390625,
"learning_rate": 0.0001997721496851748,
"loss": 1.1147,
"step": 525
},
{
"epoch": 1.8088737201365188,
"grad_norm": 0.53125,
"learning_rate": 0.00019974456710956964,
"loss": 1.1155,
"step": 530
},
{
"epoch": 1.8259385665529009,
"grad_norm": 0.546875,
"learning_rate": 0.00019971541114732741,
"loss": 1.1213,
"step": 535
},
{
"epoch": 1.8430034129692832,
"grad_norm": 0.40234375,
"learning_rate": 0.0001996846822583589,
"loss": 1.1257,
"step": 540
},
{
"epoch": 1.8600682593856654,
"grad_norm": 0.38671875,
"learning_rate": 0.00019965238092738643,
"loss": 1.1217,
"step": 545
},
{
"epoch": 1.8771331058020477,
"grad_norm": 0.5390625,
"learning_rate": 0.0001996185076639364,
"loss": 1.122,
"step": 550
},
{
"epoch": 1.89419795221843,
"grad_norm": 0.390625,
"learning_rate": 0.00019958306300233098,
"loss": 1.1236,
"step": 555
},
{
"epoch": 1.9112627986348123,
"grad_norm": 0.5390625,
"learning_rate": 0.00019954604750167993,
"loss": 1.122,
"step": 560
},
{
"epoch": 1.9283276450511946,
"grad_norm": 0.66796875,
"learning_rate": 0.00019950746174587163,
"loss": 1.1271,
"step": 565
},
{
"epoch": 1.9453924914675769,
"grad_norm": 0.47265625,
"learning_rate": 0.0001994673063435639,
"loss": 1.1064,
"step": 570
},
{
"epoch": 1.9624573378839592,
"grad_norm": 0.3359375,
"learning_rate": 0.0001994255819281744,
"loss": 1.1186,
"step": 575
},
{
"epoch": 1.9795221843003414,
"grad_norm": 0.63671875,
"learning_rate": 0.0001993822891578708,
"loss": 1.1054,
"step": 580
},
{
"epoch": 1.9965870307167235,
"grad_norm": 0.68359375,
"learning_rate": 0.00019933742871556,
"loss": 1.1135,
"step": 585
},
{
"epoch": 2.0,
"eval_loss": 2.4516425132751465,
"eval_runtime": 0.5387,
"eval_samples_per_second": 18.563,
"eval_steps_per_second": 1.856,
"step": 586
},
{
"epoch": 2.013651877133106,
"grad_norm": 0.66015625,
"learning_rate": 0.00019929100130887782,
"loss": 1.1079,
"step": 590
},
{
"epoch": 2.030716723549488,
"grad_norm": 0.94140625,
"learning_rate": 0.0001992430076701775,
"loss": 1.088,
"step": 595
},
{
"epoch": 2.04778156996587,
"grad_norm": 0.400390625,
"learning_rate": 0.00019919344855651833,
"loss": 1.0921,
"step": 600
},
{
"epoch": 2.0648464163822524,
"grad_norm": 0.59375,
"learning_rate": 0.00019914232474965365,
"loss": 1.0909,
"step": 605
},
{
"epoch": 2.0819112627986347,
"grad_norm": 0.42578125,
"learning_rate": 0.00019908963705601846,
"loss": 1.0986,
"step": 610
},
{
"epoch": 2.098976109215017,
"grad_norm": 0.435546875,
"learning_rate": 0.0001990353863067169,
"loss": 1.0925,
"step": 615
},
{
"epoch": 2.1160409556313993,
"grad_norm": 0.640625,
"learning_rate": 0.00019897957335750878,
"loss": 1.0887,
"step": 620
},
{
"epoch": 2.1331058020477816,
"grad_norm": 0.5078125,
"learning_rate": 0.00019892219908879653,
"loss": 1.0991,
"step": 625
},
{
"epoch": 2.150170648464164,
"grad_norm": 0.416015625,
"learning_rate": 0.00019886326440561093,
"loss": 1.0949,
"step": 630
},
{
"epoch": 2.167235494880546,
"grad_norm": 0.373046875,
"learning_rate": 0.00019880277023759702,
"loss": 1.0841,
"step": 635
},
{
"epoch": 2.1843003412969284,
"grad_norm": 0.78515625,
"learning_rate": 0.0001987407175389994,
"loss": 1.0947,
"step": 640
},
{
"epoch": 2.2013651877133107,
"grad_norm": 0.42578125,
"learning_rate": 0.0001986771072886472,
"loss": 1.1026,
"step": 645
},
{
"epoch": 2.218430034129693,
"grad_norm": 0.392578125,
"learning_rate": 0.00019861194048993863,
"loss": 1.0918,
"step": 650
},
{
"epoch": 2.2354948805460753,
"grad_norm": 0.41015625,
"learning_rate": 0.0001985452181708251,
"loss": 1.0903,
"step": 655
},
{
"epoch": 2.252559726962457,
"grad_norm": 0.7109375,
"learning_rate": 0.00019847694138379506,
"loss": 1.0978,
"step": 660
},
{
"epoch": 2.26962457337884,
"grad_norm": 0.4609375,
"learning_rate": 0.0001984071112058574,
"loss": 1.0864,
"step": 665
},
{
"epoch": 2.2866894197952217,
"grad_norm": 0.341796875,
"learning_rate": 0.00019833572873852444,
"loss": 1.0896,
"step": 670
},
{
"epoch": 2.303754266211604,
"grad_norm": 0.53125,
"learning_rate": 0.00019826279510779454,
"loss": 1.0962,
"step": 675
},
{
"epoch": 2.3208191126279862,
"grad_norm": 0.54296875,
"learning_rate": 0.00019818831146413434,
"loss": 1.0766,
"step": 680
},
{
"epoch": 2.3378839590443685,
"grad_norm": 0.337890625,
"learning_rate": 0.0001981122789824607,
"loss": 1.0853,
"step": 685
},
{
"epoch": 2.354948805460751,
"grad_norm": 0.69140625,
"learning_rate": 0.0001980346988621221,
"loss": 1.0788,
"step": 690
},
{
"epoch": 2.372013651877133,
"grad_norm": 0.96875,
"learning_rate": 0.00019795557232687956,
"loss": 1.0804,
"step": 695
},
{
"epoch": 2.3890784982935154,
"grad_norm": 0.470703125,
"learning_rate": 0.0001978749006248877,
"loss": 1.0674,
"step": 700
},
{
"epoch": 2.4061433447098977,
"grad_norm": 0.326171875,
"learning_rate": 0.00019779268502867473,
"loss": 1.0931,
"step": 705
},
{
"epoch": 2.42320819112628,
"grad_norm": 0.458984375,
"learning_rate": 0.0001977089268351225,
"loss": 1.0854,
"step": 710
},
{
"epoch": 2.4402730375426622,
"grad_norm": 0.43359375,
"learning_rate": 0.00019762362736544607,
"loss": 1.0858,
"step": 715
},
{
"epoch": 2.4573378839590445,
"grad_norm": 0.396484375,
"learning_rate": 0.00019753678796517282,
"loss": 1.0835,
"step": 720
},
{
"epoch": 2.474402730375427,
"grad_norm": 0.59375,
"learning_rate": 0.00019744841000412123,
"loss": 1.0881,
"step": 725
},
{
"epoch": 2.491467576791809,
"grad_norm": 0.6171875,
"learning_rate": 0.00019735849487637929,
"loss": 1.091,
"step": 730
},
{
"epoch": 2.508532423208191,
"grad_norm": 0.5625,
"learning_rate": 0.0001972670440002825,
"loss": 1.0877,
"step": 735
},
{
"epoch": 2.5255972696245736,
"grad_norm": 0.419921875,
"learning_rate": 0.00019717405881839145,
"loss": 1.0777,
"step": 740
},
{
"epoch": 2.5426621160409555,
"grad_norm": 0.380859375,
"learning_rate": 0.00019707954079746927,
"loss": 1.0934,
"step": 745
},
{
"epoch": 2.5597269624573378,
"grad_norm": 0.439453125,
"learning_rate": 0.00019698349142845814,
"loss": 1.085,
"step": 750
},
{
"epoch": 2.57679180887372,
"grad_norm": 0.38671875,
"learning_rate": 0.00019688591222645607,
"loss": 1.0744,
"step": 755
},
{
"epoch": 2.5938566552901023,
"grad_norm": 0.4375,
"learning_rate": 0.00019678680473069293,
"loss": 1.0818,
"step": 760
},
{
"epoch": 2.6109215017064846,
"grad_norm": 0.3984375,
"learning_rate": 0.00019668617050450603,
"loss": 1.0824,
"step": 765
},
{
"epoch": 2.627986348122867,
"grad_norm": 0.4921875,
"learning_rate": 0.00019658401113531565,
"loss": 1.0828,
"step": 770
},
{
"epoch": 2.645051194539249,
"grad_norm": 1.09375,
"learning_rate": 0.00019648032823459994,
"loss": 1.0884,
"step": 775
},
{
"epoch": 2.6621160409556315,
"grad_norm": 0.55859375,
"learning_rate": 0.00019637512343786937,
"loss": 1.0835,
"step": 780
},
{
"epoch": 2.6791808873720138,
"grad_norm": 0.484375,
"learning_rate": 0.00019626839840464119,
"loss": 1.0828,
"step": 785
},
{
"epoch": 2.696245733788396,
"grad_norm": 0.376953125,
"learning_rate": 0.0001961601548184129,
"loss": 1.0881,
"step": 790
},
{
"epoch": 2.7133105802047783,
"grad_norm": 0.35546875,
"learning_rate": 0.00019605039438663614,
"loss": 1.0772,
"step": 795
},
{
"epoch": 2.73037542662116,
"grad_norm": 0.349609375,
"learning_rate": 0.0001959391188406893,
"loss": 1.0677,
"step": 800
},
{
"epoch": 2.747440273037543,
"grad_norm": 0.486328125,
"learning_rate": 0.00019582632993585052,
"loss": 1.0815,
"step": 805
},
{
"epoch": 2.7645051194539247,
"grad_norm": 0.470703125,
"learning_rate": 0.00019571202945126994,
"loss": 1.0763,
"step": 810
},
{
"epoch": 2.781569965870307,
"grad_norm": 0.396484375,
"learning_rate": 0.0001955962191899415,
"loss": 1.0684,
"step": 815
},
{
"epoch": 2.7986348122866893,
"grad_norm": 0.373046875,
"learning_rate": 0.00019547890097867468,
"loss": 1.0847,
"step": 820
},
{
"epoch": 2.8156996587030716,
"grad_norm": 0.474609375,
"learning_rate": 0.00019536007666806556,
"loss": 1.071,
"step": 825
},
{
"epoch": 2.832764505119454,
"grad_norm": 0.380859375,
"learning_rate": 0.00019523974813246767,
"loss": 1.0873,
"step": 830
},
{
"epoch": 2.849829351535836,
"grad_norm": 0.40234375,
"learning_rate": 0.00019511791726996243,
"loss": 1.0676,
"step": 835
},
{
"epoch": 2.8668941979522184,
"grad_norm": 0.51953125,
"learning_rate": 0.0001949945860023292,
"loss": 1.0748,
"step": 840
},
{
"epoch": 2.8839590443686007,
"grad_norm": 0.384765625,
"learning_rate": 0.00019486975627501502,
"loss": 1.0716,
"step": 845
},
{
"epoch": 2.901023890784983,
"grad_norm": 0.38671875,
"learning_rate": 0.0001947434300571038,
"loss": 1.0777,
"step": 850
},
{
"epoch": 2.9180887372013653,
"grad_norm": 0.365234375,
"learning_rate": 0.00019461560934128533,
"loss": 1.0733,
"step": 855
},
{
"epoch": 2.9351535836177476,
"grad_norm": 0.42578125,
"learning_rate": 0.0001944862961438239,
"loss": 1.0582,
"step": 860
},
{
"epoch": 2.9522184300341294,
"grad_norm": 0.462890625,
"learning_rate": 0.00019435549250452645,
"loss": 1.0657,
"step": 865
},
{
"epoch": 2.969283276450512,
"grad_norm": 1.1171875,
"learning_rate": 0.0001942232004867103,
"loss": 1.0746,
"step": 870
},
{
"epoch": 2.986348122866894,
"grad_norm": 0.466796875,
"learning_rate": 0.0001940894221771708,
"loss": 1.0715,
"step": 875
},
{
"epoch": 3.0,
"eval_loss": 2.447284698486328,
"eval_runtime": 0.553,
"eval_samples_per_second": 18.083,
"eval_steps_per_second": 1.808,
"step": 879
},
{
"epoch": 3.0034129692832763,
"grad_norm": 0.80859375,
"learning_rate": 0.00019395415968614813,
"loss": 1.0736,
"step": 880
},
{
"epoch": 3.0204778156996586,
"grad_norm": 0.47265625,
"learning_rate": 0.00019381741514729443,
"loss": 1.0618,
"step": 885
},
{
"epoch": 3.037542662116041,
"grad_norm": 0.390625,
"learning_rate": 0.0001936791907176397,
"loss": 1.0571,
"step": 890
},
{
"epoch": 3.054607508532423,
"grad_norm": 0.46484375,
"learning_rate": 0.00019353948857755803,
"loss": 1.0626,
"step": 895
},
{
"epoch": 3.0716723549488054,
"grad_norm": 0.357421875,
"learning_rate": 0.00019339831093073318,
"loss": 1.053,
"step": 900
},
{
"epoch": 3.0887372013651877,
"grad_norm": 0.380859375,
"learning_rate": 0.00019325566000412376,
"loss": 1.06,
"step": 905
},
{
"epoch": 3.10580204778157,
"grad_norm": 0.38671875,
"learning_rate": 0.0001931115380479281,
"loss": 1.0452,
"step": 910
},
{
"epoch": 3.1228668941979523,
"grad_norm": 0.515625,
"learning_rate": 0.00019296594733554892,
"loss": 1.0642,
"step": 915
},
{
"epoch": 3.1399317406143346,
"grad_norm": 0.5,
"learning_rate": 0.0001928188901635571,
"loss": 1.0474,
"step": 920
},
{
"epoch": 3.156996587030717,
"grad_norm": 0.380859375,
"learning_rate": 0.00019267036885165588,
"loss": 1.0526,
"step": 925
},
{
"epoch": 3.174061433447099,
"grad_norm": 0.4296875,
"learning_rate": 0.00019252038574264405,
"loss": 1.061,
"step": 930
},
{
"epoch": 3.1911262798634814,
"grad_norm": 0.443359375,
"learning_rate": 0.00019236894320237894,
"loss": 1.0519,
"step": 935
},
{
"epoch": 3.2081911262798632,
"grad_norm": 0.458984375,
"learning_rate": 0.00019221604361973919,
"loss": 1.0479,
"step": 940
},
{
"epoch": 3.2252559726962455,
"grad_norm": 0.50390625,
"learning_rate": 0.00019206168940658712,
"loss": 1.049,
"step": 945
},
{
"epoch": 3.242320819112628,
"grad_norm": 0.462890625,
"learning_rate": 0.00019190588299773062,
"loss": 1.0474,
"step": 950
},
{
"epoch": 3.25938566552901,
"grad_norm": 0.462890625,
"learning_rate": 0.00019174862685088472,
"loss": 1.06,
"step": 955
},
{
"epoch": 3.2764505119453924,
"grad_norm": 0.373046875,
"learning_rate": 0.0001915899234466328,
"loss": 1.0464,
"step": 960
},
{
"epoch": 3.2935153583617747,
"grad_norm": 0.48046875,
"learning_rate": 0.00019142977528838762,
"loss": 1.0531,
"step": 965
},
{
"epoch": 3.310580204778157,
"grad_norm": 0.380859375,
"learning_rate": 0.0001912681849023516,
"loss": 1.0518,
"step": 970
},
{
"epoch": 3.3276450511945392,
"grad_norm": 0.447265625,
"learning_rate": 0.00019110515483747716,
"loss": 1.0535,
"step": 975
},
{
"epoch": 3.3447098976109215,
"grad_norm": 0.625,
"learning_rate": 0.0001909406876654264,
"loss": 1.0559,
"step": 980
},
{
"epoch": 3.361774744027304,
"grad_norm": 0.51953125,
"learning_rate": 0.00019077478598053063,
"loss": 1.0528,
"step": 985
},
{
"epoch": 3.378839590443686,
"grad_norm": 0.46875,
"learning_rate": 0.00019060745239974936,
"loss": 1.0431,
"step": 990
},
{
"epoch": 3.3959044368600684,
"grad_norm": 0.63671875,
"learning_rate": 0.0001904386895626291,
"loss": 1.0456,
"step": 995
},
{
"epoch": 3.4129692832764507,
"grad_norm": 0.48828125,
"learning_rate": 0.00019026850013126157,
"loss": 1.0579,
"step": 1000
},
{
"epoch": 3.430034129692833,
"grad_norm": 0.625,
"learning_rate": 0.0001900968867902419,
"loss": 1.0592,
"step": 1005
},
{
"epoch": 3.4470989761092152,
"grad_norm": 0.51171875,
"learning_rate": 0.00018992385224662623,
"loss": 1.0476,
"step": 1010
},
{
"epoch": 3.464163822525597,
"grad_norm": 0.470703125,
"learning_rate": 0.00018974939922988883,
"loss": 1.0517,
"step": 1015
},
{
"epoch": 3.4812286689419794,
"grad_norm": 0.423828125,
"learning_rate": 0.00018957353049187936,
"loss": 1.0607,
"step": 1020
},
{
"epoch": 3.4982935153583616,
"grad_norm": 0.4765625,
"learning_rate": 0.00018939624880677918,
"loss": 1.0502,
"step": 1025
},
{
"epoch": 3.515358361774744,
"grad_norm": 0.3671875,
"learning_rate": 0.0001892175569710577,
"loss": 1.041,
"step": 1030
},
{
"epoch": 3.532423208191126,
"grad_norm": 0.52734375,
"learning_rate": 0.00018903745780342839,
"loss": 1.0382,
"step": 1035
},
{
"epoch": 3.5494880546075085,
"grad_norm": 0.3984375,
"learning_rate": 0.00018885595414480405,
"loss": 1.0426,
"step": 1040
},
{
"epoch": 3.5665529010238908,
"grad_norm": 0.400390625,
"learning_rate": 0.0001886730488582522,
"loss": 1.0524,
"step": 1045
},
{
"epoch": 3.583617747440273,
"grad_norm": 0.58203125,
"learning_rate": 0.00018848874482894993,
"loss": 1.0371,
"step": 1050
},
{
"epoch": 3.6006825938566553,
"grad_norm": 0.412109375,
"learning_rate": 0.00018830304496413822,
"loss": 1.0571,
"step": 1055
},
{
"epoch": 3.6177474402730376,
"grad_norm": 0.33984375,
"learning_rate": 0.00018811595219307622,
"loss": 1.0458,
"step": 1060
},
{
"epoch": 3.63481228668942,
"grad_norm": 0.455078125,
"learning_rate": 0.000187927469466995,
"loss": 1.0474,
"step": 1065
},
{
"epoch": 3.651877133105802,
"grad_norm": 0.37109375,
"learning_rate": 0.00018773759975905098,
"loss": 1.0438,
"step": 1070
},
{
"epoch": 3.6689419795221845,
"grad_norm": 0.384765625,
"learning_rate": 0.00018754634606427914,
"loss": 1.0577,
"step": 1075
},
{
"epoch": 3.6860068259385663,
"grad_norm": 0.435546875,
"learning_rate": 0.00018735371139954558,
"loss": 1.0522,
"step": 1080
},
{
"epoch": 3.703071672354949,
"grad_norm": 0.55859375,
"learning_rate": 0.0001871596988035001,
"loss": 1.0622,
"step": 1085
},
{
"epoch": 3.720136518771331,
"grad_norm": 0.53125,
"learning_rate": 0.00018696431133652817,
"loss": 1.0404,
"step": 1090
},
{
"epoch": 3.737201365187713,
"grad_norm": 0.41796875,
"learning_rate": 0.00018676755208070275,
"loss": 1.0576,
"step": 1095
},
{
"epoch": 3.7542662116040955,
"grad_norm": 0.396484375,
"learning_rate": 0.00018656942413973555,
"loss": 1.0525,
"step": 1100
},
{
"epoch": 3.7713310580204777,
"grad_norm": 0.392578125,
"learning_rate": 0.0001863699306389282,
"loss": 1.047,
"step": 1105
},
{
"epoch": 3.78839590443686,
"grad_norm": 0.54296875,
"learning_rate": 0.0001861690747251228,
"loss": 1.0547,
"step": 1110
},
{
"epoch": 3.8054607508532423,
"grad_norm": 0.455078125,
"learning_rate": 0.00018596685956665245,
"loss": 1.0366,
"step": 1115
},
{
"epoch": 3.8225255972696246,
"grad_norm": 0.373046875,
"learning_rate": 0.00018576328835329117,
"loss": 1.0444,
"step": 1120
},
{
"epoch": 3.839590443686007,
"grad_norm": 0.498046875,
"learning_rate": 0.00018555836429620358,
"loss": 1.0428,
"step": 1125
},
{
"epoch": 3.856655290102389,
"grad_norm": 0.4453125,
"learning_rate": 0.00018535209062789433,
"loss": 1.0425,
"step": 1130
},
{
"epoch": 3.8737201365187715,
"grad_norm": 0.392578125,
"learning_rate": 0.00018514447060215698,
"loss": 1.0503,
"step": 1135
},
{
"epoch": 3.8907849829351537,
"grad_norm": 0.384765625,
"learning_rate": 0.00018493550749402278,
"loss": 1.0376,
"step": 1140
},
{
"epoch": 3.9078498293515356,
"grad_norm": 0.3984375,
"learning_rate": 0.00018472520459970898,
"loss": 1.054,
"step": 1145
},
{
"epoch": 3.9249146757679183,
"grad_norm": 0.44921875,
"learning_rate": 0.0001845135652365668,
"loss": 1.0491,
"step": 1150
},
{
"epoch": 3.9419795221843,
"grad_norm": 0.37890625,
"learning_rate": 0.00018430059274302917,
"loss": 1.0454,
"step": 1155
},
{
"epoch": 3.9590443686006824,
"grad_norm": 0.365234375,
"learning_rate": 0.00018408629047855804,
"loss": 1.0466,
"step": 1160
},
{
"epoch": 3.9761092150170647,
"grad_norm": 0.34765625,
"learning_rate": 0.00018387066182359133,
"loss": 1.0356,
"step": 1165
},
{
"epoch": 3.993174061433447,
"grad_norm": 0.357421875,
"learning_rate": 0.00018365371017948964,
"loss": 1.0471,
"step": 1170
},
{
"epoch": 4.0,
"eval_loss": 2.452413558959961,
"eval_runtime": 0.5427,
"eval_samples_per_second": 18.427,
"eval_steps_per_second": 1.843,
"step": 1172
},
{
"epoch": 4.010238907849829,
"grad_norm": 0.47265625,
"learning_rate": 0.00018343543896848273,
"loss": 1.0282,
"step": 1175
},
{
"epoch": 4.027303754266212,
"grad_norm": 0.41796875,
"learning_rate": 0.00018321585163361527,
"loss": 1.0262,
"step": 1180
},
{
"epoch": 4.044368600682594,
"grad_norm": 0.365234375,
"learning_rate": 0.00018299495163869275,
"loss": 1.0263,
"step": 1185
},
{
"epoch": 4.061433447098976,
"grad_norm": 0.359375,
"learning_rate": 0.0001827727424682268,
"loss": 1.0265,
"step": 1190
},
{
"epoch": 4.078498293515358,
"grad_norm": 0.375,
"learning_rate": 0.00018254922762738008,
"loss": 1.0266,
"step": 1195
},
{
"epoch": 4.09556313993174,
"grad_norm": 0.3828125,
"learning_rate": 0.00018232441064191125,
"loss": 1.0326,
"step": 1200
},
{
"epoch": 4.112627986348123,
"grad_norm": 0.3828125,
"learning_rate": 0.0001820982950581191,
"loss": 1.0278,
"step": 1205
},
{
"epoch": 4.129692832764505,
"grad_norm": 0.46484375,
"learning_rate": 0.00018187088444278674,
"loss": 1.0206,
"step": 1210
},
{
"epoch": 4.146757679180888,
"grad_norm": 0.4140625,
"learning_rate": 0.00018164218238312535,
"loss": 1.037,
"step": 1215
},
{
"epoch": 4.163822525597269,
"grad_norm": 0.3671875,
"learning_rate": 0.00018141219248671745,
"loss": 1.0229,
"step": 1220
},
{
"epoch": 4.180887372013652,
"grad_norm": 0.376953125,
"learning_rate": 0.00018118091838146029,
"loss": 1.0223,
"step": 1225
},
{
"epoch": 4.197952218430034,
"grad_norm": 0.373046875,
"learning_rate": 0.00018094836371550824,
"loss": 1.0175,
"step": 1230
},
{
"epoch": 4.215017064846417,
"grad_norm": 0.380859375,
"learning_rate": 0.00018071453215721554,
"loss": 1.0369,
"step": 1235
},
{
"epoch": 4.2320819112627985,
"grad_norm": 0.41015625,
"learning_rate": 0.00018047942739507836,
"loss": 1.0182,
"step": 1240
},
{
"epoch": 4.249146757679181,
"grad_norm": 0.421875,
"learning_rate": 0.00018024305313767646,
"loss": 1.0192,
"step": 1245
},
{
"epoch": 4.266211604095563,
"grad_norm": 0.40625,
"learning_rate": 0.000180005413113615,
"loss": 1.0427,
"step": 1250
},
{
"epoch": 4.283276450511945,
"grad_norm": 0.42578125,
"learning_rate": 0.00017976651107146533,
"loss": 1.0313,
"step": 1255
},
{
"epoch": 4.300341296928328,
"grad_norm": 0.359375,
"learning_rate": 0.0001795263507797063,
"loss": 1.0195,
"step": 1260
},
{
"epoch": 4.3174061433447095,
"grad_norm": 0.453125,
"learning_rate": 0.00017928493602666445,
"loss": 1.0222,
"step": 1265
},
{
"epoch": 4.334470989761092,
"grad_norm": 0.5546875,
"learning_rate": 0.00017904227062045437,
"loss": 1.0183,
"step": 1270
},
{
"epoch": 4.351535836177474,
"grad_norm": 0.6328125,
"learning_rate": 0.00017879835838891875,
"loss": 1.0321,
"step": 1275
},
{
"epoch": 4.368600682593857,
"grad_norm": 0.7265625,
"learning_rate": 0.00017855320317956784,
"loss": 1.0241,
"step": 1280
},
{
"epoch": 4.385665529010239,
"grad_norm": 0.380859375,
"learning_rate": 0.00017830680885951887,
"loss": 1.019,
"step": 1285
},
{
"epoch": 4.402730375426621,
"grad_norm": 0.7265625,
"learning_rate": 0.00017805917931543492,
"loss": 1.0291,
"step": 1290
},
{
"epoch": 4.419795221843003,
"grad_norm": 0.8671875,
"learning_rate": 0.00017781031845346375,
"loss": 1.0254,
"step": 1295
},
{
"epoch": 4.436860068259386,
"grad_norm": 0.38671875,
"learning_rate": 0.00017756023019917607,
"loss": 1.0232,
"step": 1300
},
{
"epoch": 4.453924914675768,
"grad_norm": 0.384765625,
"learning_rate": 0.00017730891849750377,
"loss": 1.0267,
"step": 1305
},
{
"epoch": 4.4709897610921505,
"grad_norm": 0.38671875,
"learning_rate": 0.0001770563873126775,
"loss": 1.0282,
"step": 1310
},
{
"epoch": 4.488054607508532,
"grad_norm": 0.357421875,
"learning_rate": 0.0001768026406281642,
"loss": 1.0384,
"step": 1315
},
{
"epoch": 4.505119453924914,
"grad_norm": 0.37109375,
"learning_rate": 0.00017654768244660448,
"loss": 1.0197,
"step": 1320
},
{
"epoch": 4.522184300341297,
"grad_norm": 0.458984375,
"learning_rate": 0.00017629151678974907,
"loss": 1.023,
"step": 1325
},
{
"epoch": 4.53924914675768,
"grad_norm": 0.359375,
"learning_rate": 0.00017603414769839577,
"loss": 1.0289,
"step": 1330
},
{
"epoch": 4.5563139931740615,
"grad_norm": 0.72265625,
"learning_rate": 0.00017577557923232546,
"loss": 1.0222,
"step": 1335
},
{
"epoch": 4.573378839590443,
"grad_norm": 0.5,
"learning_rate": 0.00017551581547023819,
"loss": 1.0285,
"step": 1340
},
{
"epoch": 4.590443686006826,
"grad_norm": 0.392578125,
"learning_rate": 0.00017525486050968875,
"loss": 1.0288,
"step": 1345
},
{
"epoch": 4.607508532423208,
"grad_norm": 0.37890625,
"learning_rate": 0.00017499271846702213,
"loss": 1.0302,
"step": 1350
},
{
"epoch": 4.624573378839591,
"grad_norm": 0.419921875,
"learning_rate": 0.00017472939347730856,
"loss": 1.0358,
"step": 1355
},
{
"epoch": 4.6416382252559725,
"grad_norm": 0.451171875,
"learning_rate": 0.0001744648896942782,
"loss": 1.0278,
"step": 1360
},
{
"epoch": 4.658703071672355,
"grad_norm": 0.38671875,
"learning_rate": 0.00017419921129025576,
"loss": 1.0171,
"step": 1365
},
{
"epoch": 4.675767918088737,
"grad_norm": 0.376953125,
"learning_rate": 0.0001739323624560945,
"loss": 1.0152,
"step": 1370
},
{
"epoch": 4.69283276450512,
"grad_norm": 0.384765625,
"learning_rate": 0.00017366434740111037,
"loss": 1.0247,
"step": 1375
},
{
"epoch": 4.709897610921502,
"grad_norm": 0.431640625,
"learning_rate": 0.00017339517035301532,
"loss": 1.0212,
"step": 1380
},
{
"epoch": 4.726962457337884,
"grad_norm": 0.3828125,
"learning_rate": 0.00017312483555785086,
"loss": 1.0309,
"step": 1385
},
{
"epoch": 4.744027303754266,
"grad_norm": 0.353515625,
"learning_rate": 0.000172853347279921,
"loss": 1.0298,
"step": 1390
},
{
"epoch": 4.761092150170649,
"grad_norm": 0.373046875,
"learning_rate": 0.00017258070980172494,
"loss": 1.0215,
"step": 1395
},
{
"epoch": 4.778156996587031,
"grad_norm": 0.4453125,
"learning_rate": 0.0001723069274238895,
"loss": 1.0249,
"step": 1400
},
{
"epoch": 4.795221843003413,
"grad_norm": 0.4921875,
"learning_rate": 0.0001720320044651014,
"loss": 1.0259,
"step": 1405
},
{
"epoch": 4.812286689419795,
"grad_norm": 0.380859375,
"learning_rate": 0.00017175594526203905,
"loss": 1.0215,
"step": 1410
},
{
"epoch": 4.829351535836177,
"grad_norm": 0.42578125,
"learning_rate": 0.00017147875416930416,
"loss": 1.0272,
"step": 1415
},
{
"epoch": 4.84641638225256,
"grad_norm": 0.34765625,
"learning_rate": 0.00017120043555935298,
"loss": 1.0365,
"step": 1420
},
{
"epoch": 4.863481228668942,
"grad_norm": 0.36328125,
"learning_rate": 0.00017092099382242748,
"loss": 1.02,
"step": 1425
},
{
"epoch": 4.8805460750853245,
"grad_norm": 0.455078125,
"learning_rate": 0.00017064043336648599,
"loss": 1.021,
"step": 1430
},
{
"epoch": 4.897610921501706,
"grad_norm": 0.400390625,
"learning_rate": 0.0001703587586171337,
"loss": 1.0156,
"step": 1435
},
{
"epoch": 4.914675767918089,
"grad_norm": 0.375,
"learning_rate": 0.00017007597401755276,
"loss": 1.0283,
"step": 1440
},
{
"epoch": 4.931740614334471,
"grad_norm": 0.443359375,
"learning_rate": 0.00016979208402843237,
"loss": 1.0194,
"step": 1445
},
{
"epoch": 4.948805460750854,
"grad_norm": 0.57421875,
"learning_rate": 0.00016950709312789833,
"loss": 1.0198,
"step": 1450
},
{
"epoch": 4.965870307167235,
"grad_norm": 0.37890625,
"learning_rate": 0.00016922100581144228,
"loss": 1.028,
"step": 1455
},
{
"epoch": 4.982935153583618,
"grad_norm": 0.4765625,
"learning_rate": 0.00016893382659185105,
"loss": 1.0157,
"step": 1460
},
{
"epoch": 5.0,
"grad_norm": 0.416015625,
"learning_rate": 0.00016864555999913518,
"loss": 1.0357,
"step": 1465
},
{
"epoch": 5.0,
"eval_loss": 2.468480110168457,
"eval_runtime": 0.549,
"eval_samples_per_second": 18.214,
"eval_steps_per_second": 1.821,
"step": 1465
},
{
"epoch": 5.017064846416382,
"grad_norm": 0.380859375,
"learning_rate": 0.0001683562105804577,
"loss": 1.0001,
"step": 1470
},
{
"epoch": 5.034129692832765,
"grad_norm": 0.5078125,
"learning_rate": 0.00016806578290006225,
"loss": 0.9998,
"step": 1475
},
{
"epoch": 5.051194539249146,
"grad_norm": 0.400390625,
"learning_rate": 0.0001677742815392012,
"loss": 0.9999,
"step": 1480
},
{
"epoch": 5.068259385665529,
"grad_norm": 0.43359375,
"learning_rate": 0.00016748171109606328,
"loss": 1.0085,
"step": 1485
},
{
"epoch": 5.085324232081911,
"grad_norm": 0.416015625,
"learning_rate": 0.00016718807618570106,
"loss": 1.0018,
"step": 1490
},
{
"epoch": 5.102389078498294,
"grad_norm": 0.453125,
"learning_rate": 0.00016689338143995833,
"loss": 0.9997,
"step": 1495
},
{
"epoch": 5.1194539249146755,
"grad_norm": 0.4453125,
"learning_rate": 0.00016659763150739677,
"loss": 1.009,
"step": 1500
},
{
"epoch": 5.136518771331058,
"grad_norm": 0.357421875,
"learning_rate": 0.00016630083105322266,
"loss": 1.0047,
"step": 1505
},
{
"epoch": 5.15358361774744,
"grad_norm": 0.33984375,
"learning_rate": 0.00016600298475921365,
"loss": 1.004,
"step": 1510
},
{
"epoch": 5.170648464163823,
"grad_norm": 0.400390625,
"learning_rate": 0.00016570409732364437,
"loss": 1.0022,
"step": 1515
},
{
"epoch": 5.187713310580205,
"grad_norm": 0.427734375,
"learning_rate": 0.0001654041734612127,
"loss": 1.0113,
"step": 1520
},
{
"epoch": 5.204778156996587,
"grad_norm": 0.3828125,
"learning_rate": 0.00016510321790296525,
"loss": 1.0171,
"step": 1525
},
{
"epoch": 5.221843003412969,
"grad_norm": 0.462890625,
"learning_rate": 0.00016480123539622281,
"loss": 1.0146,
"step": 1530
},
{
"epoch": 5.238907849829351,
"grad_norm": 0.38671875,
"learning_rate": 0.00016449823070450531,
"loss": 1.0005,
"step": 1535
},
{
"epoch": 5.255972696245734,
"grad_norm": 0.3515625,
"learning_rate": 0.00016419420860745699,
"loss": 1.0093,
"step": 1540
},
{
"epoch": 5.273037542662116,
"grad_norm": 0.439453125,
"learning_rate": 0.00016388917390077054,
"loss": 0.9965,
"step": 1545
},
{
"epoch": 5.290102389078498,
"grad_norm": 0.466796875,
"learning_rate": 0.00016358313139611195,
"loss": 1.0153,
"step": 1550
},
{
"epoch": 5.30716723549488,
"grad_norm": 0.376953125,
"learning_rate": 0.0001632760859210442,
"loss": 1.0094,
"step": 1555
},
{
"epoch": 5.324232081911263,
"grad_norm": 0.5234375,
"learning_rate": 0.00016296804231895142,
"loss": 0.9984,
"step": 1560
},
{
"epoch": 5.341296928327645,
"grad_norm": 0.37109375,
"learning_rate": 0.00016265900544896225,
"loss": 1.0066,
"step": 1565
},
{
"epoch": 5.3583617747440275,
"grad_norm": 0.470703125,
"learning_rate": 0.00016234898018587337,
"loss": 1.0027,
"step": 1570
},
{
"epoch": 5.375426621160409,
"grad_norm": 0.470703125,
"learning_rate": 0.0001620379714200725,
"loss": 1.014,
"step": 1575
},
{
"epoch": 5.392491467576792,
"grad_norm": 0.39453125,
"learning_rate": 0.00016172598405746124,
"loss": 1.0085,
"step": 1580
},
{
"epoch": 5.409556313993174,
"grad_norm": 0.51171875,
"learning_rate": 0.00016141302301937786,
"loss": 0.9999,
"step": 1585
},
{
"epoch": 5.426621160409557,
"grad_norm": 0.54296875,
"learning_rate": 0.0001610990932425194,
"loss": 1.0199,
"step": 1590
},
{
"epoch": 5.4436860068259385,
"grad_norm": 0.447265625,
"learning_rate": 0.00016078419967886402,
"loss": 1.0137,
"step": 1595
},
{
"epoch": 5.460750853242321,
"grad_norm": 0.408203125,
"learning_rate": 0.0001604683472955928,
"loss": 1.0057,
"step": 1600
},
{
"epoch": 5.477815699658703,
"grad_norm": 0.419921875,
"learning_rate": 0.00016015154107501133,
"loss": 1.0099,
"step": 1605
},
{
"epoch": 5.494880546075085,
"grad_norm": 0.455078125,
"learning_rate": 0.00015983378601447127,
"loss": 1.0066,
"step": 1610
},
{
"epoch": 5.511945392491468,
"grad_norm": 0.412109375,
"learning_rate": 0.0001595150871262914,
"loss": 1.0134,
"step": 1615
},
{
"epoch": 5.5290102389078495,
"grad_norm": 0.37890625,
"learning_rate": 0.00015919544943767856,
"loss": 1.0108,
"step": 1620
},
{
"epoch": 5.546075085324232,
"grad_norm": 0.40625,
"learning_rate": 0.00015887487799064838,
"loss": 1.0229,
"step": 1625
},
{
"epoch": 5.563139931740614,
"grad_norm": 0.56640625,
"learning_rate": 0.00015855337784194577,
"loss": 1.0126,
"step": 1630
},
{
"epoch": 5.580204778156997,
"grad_norm": 0.37890625,
"learning_rate": 0.00015823095406296514,
"loss": 0.9947,
"step": 1635
},
{
"epoch": 5.597269624573379,
"grad_norm": 0.373046875,
"learning_rate": 0.00015790761173967036,
"loss": 1.0063,
"step": 1640
},
{
"epoch": 5.614334470989761,
"grad_norm": 0.416015625,
"learning_rate": 0.00015758335597251458,
"loss": 1.0132,
"step": 1645
},
{
"epoch": 5.631399317406143,
"grad_norm": 0.4375,
"learning_rate": 0.00015725819187635968,
"loss": 1.0173,
"step": 1650
},
{
"epoch": 5.648464163822526,
"grad_norm": 0.388671875,
"learning_rate": 0.00015693212458039584,
"loss": 1.0115,
"step": 1655
},
{
"epoch": 5.665529010238908,
"grad_norm": 0.42578125,
"learning_rate": 0.00015660515922806027,
"loss": 0.9966,
"step": 1660
},
{
"epoch": 5.6825938566552905,
"grad_norm": 0.349609375,
"learning_rate": 0.00015627730097695638,
"loss": 1.0058,
"step": 1665
},
{
"epoch": 5.699658703071672,
"grad_norm": 0.427734375,
"learning_rate": 0.0001559485549987723,
"loss": 1.0143,
"step": 1670
},
{
"epoch": 5.716723549488055,
"grad_norm": 0.384765625,
"learning_rate": 0.0001556189264791992,
"loss": 1.0124,
"step": 1675
},
{
"epoch": 5.733788395904437,
"grad_norm": 0.40625,
"learning_rate": 0.0001552884206178498,
"loss": 1.0119,
"step": 1680
},
{
"epoch": 5.750853242320819,
"grad_norm": 0.412109375,
"learning_rate": 0.00015495704262817597,
"loss": 1.0061,
"step": 1685
},
{
"epoch": 5.7679180887372015,
"grad_norm": 0.3828125,
"learning_rate": 0.0001546247977373867,
"loss": 1.0054,
"step": 1690
},
{
"epoch": 5.784982935153583,
"grad_norm": 0.37109375,
"learning_rate": 0.00015429169118636566,
"loss": 1.0021,
"step": 1695
},
{
"epoch": 5.802047781569966,
"grad_norm": 0.392578125,
"learning_rate": 0.00015395772822958845,
"loss": 1.0037,
"step": 1700
},
{
"epoch": 5.819112627986348,
"grad_norm": 0.408203125,
"learning_rate": 0.00015362291413503984,
"loss": 1.0054,
"step": 1705
},
{
"epoch": 5.836177474402731,
"grad_norm": 0.345703125,
"learning_rate": 0.00015328725418413045,
"loss": 1.0132,
"step": 1710
},
{
"epoch": 5.853242320819112,
"grad_norm": 0.341796875,
"learning_rate": 0.00015295075367161367,
"loss": 1.0041,
"step": 1715
},
{
"epoch": 5.870307167235495,
"grad_norm": 0.34375,
"learning_rate": 0.00015261341790550196,
"loss": 1.001,
"step": 1720
},
{
"epoch": 5.887372013651877,
"grad_norm": 0.373046875,
"learning_rate": 0.0001522752522069833,
"loss": 1.0102,
"step": 1725
},
{
"epoch": 5.90443686006826,
"grad_norm": 0.38671875,
"learning_rate": 0.00015193626191033712,
"loss": 0.996,
"step": 1730
},
{
"epoch": 5.921501706484642,
"grad_norm": 0.37109375,
"learning_rate": 0.0001515964523628501,
"loss": 1.0052,
"step": 1735
},
{
"epoch": 5.938566552901024,
"grad_norm": 0.3984375,
"learning_rate": 0.00015125582892473204,
"loss": 1.0118,
"step": 1740
},
{
"epoch": 5.955631399317406,
"grad_norm": 0.392578125,
"learning_rate": 0.00015091439696903115,
"loss": 0.998,
"step": 1745
},
{
"epoch": 5.972696245733788,
"grad_norm": 0.388671875,
"learning_rate": 0.00015057216188154928,
"loss": 0.9925,
"step": 1750
},
{
"epoch": 5.989761092150171,
"grad_norm": 0.5,
"learning_rate": 0.00015022912906075702,
"loss": 0.993,
"step": 1755
},
{
"epoch": 6.0,
"eval_loss": 2.4702811241149902,
"eval_runtime": 0.5473,
"eval_samples_per_second": 18.272,
"eval_steps_per_second": 1.827,
"step": 1758
},
{
"epoch": 6.006825938566553,
"grad_norm": 0.48046875,
"learning_rate": 0.00014988530391770856,
"loss": 0.9939,
"step": 1760
},
{
"epoch": 6.023890784982935,
"grad_norm": 0.396484375,
"learning_rate": 0.00014954069187595633,
"loss": 0.9904,
"step": 1765
},
{
"epoch": 6.040955631399317,
"grad_norm": 0.396484375,
"learning_rate": 0.00014919529837146528,
"loss": 0.982,
"step": 1770
},
{
"epoch": 6.0580204778157,
"grad_norm": 0.421875,
"learning_rate": 0.0001488491288525275,
"loss": 0.9741,
"step": 1775
},
{
"epoch": 6.075085324232082,
"grad_norm": 0.439453125,
"learning_rate": 0.0001485021887796759,
"loss": 0.995,
"step": 1780
},
{
"epoch": 6.092150170648464,
"grad_norm": 0.39453125,
"learning_rate": 0.00014815448362559826,
"loss": 0.9931,
"step": 1785
},
{
"epoch": 6.109215017064846,
"grad_norm": 0.373046875,
"learning_rate": 0.00014780601887505088,
"loss": 1.0001,
"step": 1790
},
{
"epoch": 6.126279863481229,
"grad_norm": 0.384765625,
"learning_rate": 0.00014745680002477203,
"loss": 0.9913,
"step": 1795
},
{
"epoch": 6.143344709897611,
"grad_norm": 0.36328125,
"learning_rate": 0.00014710683258339536,
"loss": 0.9883,
"step": 1800
},
{
"epoch": 6.160409556313994,
"grad_norm": 0.427734375,
"learning_rate": 0.0001467561220713628,
"loss": 0.9835,
"step": 1805
},
{
"epoch": 6.177474402730375,
"grad_norm": 0.421875,
"learning_rate": 0.0001464046740208377,
"loss": 0.9894,
"step": 1810
},
{
"epoch": 6.194539249146757,
"grad_norm": 0.357421875,
"learning_rate": 0.00014605249397561736,
"loss": 0.9833,
"step": 1815
},
{
"epoch": 6.21160409556314,
"grad_norm": 0.4140625,
"learning_rate": 0.00014569958749104575,
"loss": 0.9942,
"step": 1820
},
{
"epoch": 6.228668941979522,
"grad_norm": 0.58203125,
"learning_rate": 0.00014534596013392575,
"loss": 0.9937,
"step": 1825
},
{
"epoch": 6.2457337883959045,
"grad_norm": 0.6328125,
"learning_rate": 0.00014499161748243147,
"loss": 0.9852,
"step": 1830
},
{
"epoch": 6.262798634812286,
"grad_norm": 0.6640625,
"learning_rate": 0.0001446365651260201,
"loss": 0.9886,
"step": 1835
},
{
"epoch": 6.279863481228669,
"grad_norm": 0.4375,
"learning_rate": 0.00014428080866534396,
"loss": 0.9893,
"step": 1840
},
{
"epoch": 6.296928327645051,
"grad_norm": 0.376953125,
"learning_rate": 0.00014392435371216185,
"loss": 0.9951,
"step": 1845
},
{
"epoch": 6.313993174061434,
"grad_norm": 0.34765625,
"learning_rate": 0.0001435672058892509,
"loss": 0.9877,
"step": 1850
},
{
"epoch": 6.3310580204778155,
"grad_norm": 0.390625,
"learning_rate": 0.00014320937083031748,
"loss": 0.9922,
"step": 1855
},
{
"epoch": 6.348122866894198,
"grad_norm": 0.38671875,
"learning_rate": 0.0001428508541799086,
"loss": 0.9939,
"step": 1860
},
{
"epoch": 6.36518771331058,
"grad_norm": 0.3828125,
"learning_rate": 0.0001424916615933229,
"loss": 0.994,
"step": 1865
},
{
"epoch": 6.382252559726963,
"grad_norm": 0.404296875,
"learning_rate": 0.00014213179873652127,
"loss": 0.993,
"step": 1870
},
{
"epoch": 6.399317406143345,
"grad_norm": 0.408203125,
"learning_rate": 0.00014177127128603745,
"loss": 0.9982,
"step": 1875
},
{
"epoch": 6.4163822525597265,
"grad_norm": 0.40625,
"learning_rate": 0.0001414100849288888,
"loss": 0.9926,
"step": 1880
},
{
"epoch": 6.433447098976109,
"grad_norm": 0.416015625,
"learning_rate": 0.00014104824536248614,
"loss": 0.995,
"step": 1885
},
{
"epoch": 6.450511945392491,
"grad_norm": 0.40625,
"learning_rate": 0.00014068575829454436,
"loss": 0.9894,
"step": 1890
},
{
"epoch": 6.467576791808874,
"grad_norm": 0.359375,
"learning_rate": 0.00014032262944299194,
"loss": 0.997,
"step": 1895
},
{
"epoch": 6.484641638225256,
"grad_norm": 0.392578125,
"learning_rate": 0.00013995886453588104,
"loss": 0.9861,
"step": 1900
},
{
"epoch": 6.501706484641638,
"grad_norm": 0.34765625,
"learning_rate": 0.00013959446931129704,
"loss": 0.9896,
"step": 1905
},
{
"epoch": 6.51877133105802,
"grad_norm": 0.380859375,
"learning_rate": 0.0001392294495172681,
"loss": 0.9969,
"step": 1910
},
{
"epoch": 6.535836177474403,
"grad_norm": 0.392578125,
"learning_rate": 0.0001388638109116744,
"loss": 0.9902,
"step": 1915
},
{
"epoch": 6.552901023890785,
"grad_norm": 0.375,
"learning_rate": 0.00013849755926215735,
"loss": 0.9995,
"step": 1920
},
{
"epoch": 6.5699658703071675,
"grad_norm": 0.384765625,
"learning_rate": 0.00013813070034602863,
"loss": 0.9935,
"step": 1925
},
{
"epoch": 6.587030716723549,
"grad_norm": 0.466796875,
"learning_rate": 0.00013776323995017898,
"loss": 0.9799,
"step": 1930
},
{
"epoch": 6.604095563139932,
"grad_norm": 0.345703125,
"learning_rate": 0.00013739518387098705,
"loss": 0.9959,
"step": 1935
},
{
"epoch": 6.621160409556314,
"grad_norm": 0.388671875,
"learning_rate": 0.0001370265379142279,
"loss": 0.9897,
"step": 1940
},
{
"epoch": 6.638225255972696,
"grad_norm": 0.443359375,
"learning_rate": 0.0001366573078949813,
"loss": 0.9829,
"step": 1945
},
{
"epoch": 6.6552901023890785,
"grad_norm": 0.44921875,
"learning_rate": 0.00013628749963754026,
"loss": 0.9963,
"step": 1950
},
{
"epoch": 6.672354948805461,
"grad_norm": 0.52734375,
"learning_rate": 0.0001359171189753189,
"loss": 0.999,
"step": 1955
},
{
"epoch": 6.689419795221843,
"grad_norm": 0.6484375,
"learning_rate": 0.00013554617175076062,
"loss": 0.9806,
"step": 1960
},
{
"epoch": 6.706484641638225,
"grad_norm": 0.388671875,
"learning_rate": 0.0001351746638152458,
"loss": 0.9903,
"step": 1965
},
{
"epoch": 6.723549488054608,
"grad_norm": 0.4765625,
"learning_rate": 0.00013480260102899966,
"loss": 1.0009,
"step": 1970
},
{
"epoch": 6.7406143344709895,
"grad_norm": 0.44140625,
"learning_rate": 0.0001344299892609996,
"loss": 0.9879,
"step": 1975
},
{
"epoch": 6.757679180887372,
"grad_norm": 0.392578125,
"learning_rate": 0.00013405683438888282,
"loss": 0.9966,
"step": 1980
},
{
"epoch": 6.774744027303754,
"grad_norm": 0.4140625,
"learning_rate": 0.00013368314229885347,
"loss": 0.988,
"step": 1985
},
{
"epoch": 6.791808873720137,
"grad_norm": 0.36328125,
"learning_rate": 0.00013330891888559002,
"loss": 0.9835,
"step": 1990
},
{
"epoch": 6.808873720136519,
"grad_norm": 0.421875,
"learning_rate": 0.00013293417005215188,
"loss": 0.9922,
"step": 1995
},
{
"epoch": 6.825938566552901,
"grad_norm": 0.40234375,
"learning_rate": 0.0001325589017098867,
"loss": 0.9893,
"step": 2000
},
{
"epoch": 6.843003412969283,
"grad_norm": 0.3828125,
"learning_rate": 0.00013218311977833687,
"loss": 0.9965,
"step": 2005
},
{
"epoch": 6.860068259385666,
"grad_norm": 0.365234375,
"learning_rate": 0.0001318068301851463,
"loss": 0.9843,
"step": 2010
},
{
"epoch": 6.877133105802048,
"grad_norm": 0.390625,
"learning_rate": 0.00013143003886596669,
"loss": 0.9845,
"step": 2015
},
{
"epoch": 6.8941979522184305,
"grad_norm": 0.3515625,
"learning_rate": 0.0001310527517643642,
"loss": 0.9909,
"step": 2020
},
{
"epoch": 6.911262798634812,
"grad_norm": 0.359375,
"learning_rate": 0.00013067497483172538,
"loss": 0.9885,
"step": 2025
},
{
"epoch": 6.928327645051194,
"grad_norm": 0.375,
"learning_rate": 0.00013029671402716366,
"loss": 0.9879,
"step": 2030
},
{
"epoch": 6.945392491467577,
"grad_norm": 0.380859375,
"learning_rate": 0.00012991797531742492,
"loss": 0.9891,
"step": 2035
},
{
"epoch": 6.962457337883959,
"grad_norm": 0.34375,
"learning_rate": 0.00012953876467679373,
"loss": 0.9972,
"step": 2040
},
{
"epoch": 6.979522184300341,
"grad_norm": 0.369140625,
"learning_rate": 0.00012915908808699893,
"loss": 0.9962,
"step": 2045
},
{
"epoch": 6.996587030716723,
"grad_norm": 0.44921875,
"learning_rate": 0.00012877895153711935,
"loss": 0.9941,
"step": 2050
},
{
"epoch": 7.0,
"eval_loss": 2.49063777923584,
"eval_runtime": 0.554,
"eval_samples_per_second": 18.051,
"eval_steps_per_second": 1.805,
"step": 2051
},
{
"epoch": 7.013651877133106,
"grad_norm": 0.52734375,
"learning_rate": 0.00012839836102348926,
"loss": 0.9759,
"step": 2055
},
{
"epoch": 7.030716723549488,
"grad_norm": 0.365234375,
"learning_rate": 0.00012801732254960388,
"loss": 0.9703,
"step": 2060
},
{
"epoch": 7.047781569965871,
"grad_norm": 0.375,
"learning_rate": 0.00012763584212602453,
"loss": 0.9643,
"step": 2065
},
{
"epoch": 7.064846416382252,
"grad_norm": 0.41796875,
"learning_rate": 0.00012725392577028402,
"loss": 0.9646,
"step": 2070
},
{
"epoch": 7.081911262798635,
"grad_norm": 0.400390625,
"learning_rate": 0.0001268715795067916,
"loss": 0.9732,
"step": 2075
},
{
"epoch": 7.098976109215017,
"grad_norm": 0.380859375,
"learning_rate": 0.00012648880936673787,
"loss": 0.9786,
"step": 2080
},
{
"epoch": 7.1160409556314,
"grad_norm": 0.423828125,
"learning_rate": 0.00012610562138799978,
"loss": 0.9733,
"step": 2085
},
{
"epoch": 7.1331058020477816,
"grad_norm": 0.357421875,
"learning_rate": 0.00012572202161504543,
"loss": 0.9808,
"step": 2090
},
{
"epoch": 7.150170648464163,
"grad_norm": 0.4609375,
"learning_rate": 0.00012533801609883842,
"loss": 0.9762,
"step": 2095
},
{
"epoch": 7.167235494880546,
"grad_norm": 0.38671875,
"learning_rate": 0.00012495361089674285,
"loss": 0.9809,
"step": 2100
},
{
"epoch": 7.184300341296928,
"grad_norm": 0.3984375,
"learning_rate": 0.00012456881207242732,
"loss": 0.9821,
"step": 2105
},
{
"epoch": 7.201365187713311,
"grad_norm": 0.400390625,
"learning_rate": 0.00012418362569576965,
"loss": 0.9873,
"step": 2110
},
{
"epoch": 7.2184300341296925,
"grad_norm": 0.55078125,
"learning_rate": 0.00012379805784276082,
"loss": 0.9727,
"step": 2115
},
{
"epoch": 7.235494880546075,
"grad_norm": 0.515625,
"learning_rate": 0.0001234121145954094,
"loss": 0.9827,
"step": 2120
},
{
"epoch": 7.252559726962457,
"grad_norm": 0.3828125,
"learning_rate": 0.00012302580204164541,
"loss": 0.9846,
"step": 2125
},
{
"epoch": 7.26962457337884,
"grad_norm": 0.42578125,
"learning_rate": 0.0001226391262752245,
"loss": 0.9736,
"step": 2130
},
{
"epoch": 7.286689419795222,
"grad_norm": 0.5078125,
"learning_rate": 0.00012225209339563145,
"loss": 0.9743,
"step": 2135
},
{
"epoch": 7.303754266211604,
"grad_norm": 0.419921875,
"learning_rate": 0.00012186470950798445,
"loss": 0.9787,
"step": 2140
},
{
"epoch": 7.320819112627986,
"grad_norm": 0.490234375,
"learning_rate": 0.00012147698072293842,
"loss": 0.9788,
"step": 2145
},
{
"epoch": 7.337883959044369,
"grad_norm": 0.380859375,
"learning_rate": 0.00012108891315658879,
"loss": 0.967,
"step": 2150
},
{
"epoch": 7.354948805460751,
"grad_norm": 0.396484375,
"learning_rate": 0.00012070051293037492,
"loss": 0.9792,
"step": 2155
},
{
"epoch": 7.372013651877133,
"grad_norm": 0.43359375,
"learning_rate": 0.00012031178617098371,
"loss": 0.9905,
"step": 2160
},
{
"epoch": 7.389078498293515,
"grad_norm": 0.400390625,
"learning_rate": 0.00011992273901025269,
"loss": 0.9873,
"step": 2165
},
{
"epoch": 7.406143344709897,
"grad_norm": 0.4453125,
"learning_rate": 0.0001195333775850736,
"loss": 0.9872,
"step": 2170
},
{
"epoch": 7.42320819112628,
"grad_norm": 0.44140625,
"learning_rate": 0.00011914370803729533,
"loss": 0.98,
"step": 2175
},
{
"epoch": 7.440273037542662,
"grad_norm": 0.361328125,
"learning_rate": 0.00011875373651362727,
"loss": 0.9827,
"step": 2180
},
{
"epoch": 7.4573378839590445,
"grad_norm": 0.474609375,
"learning_rate": 0.00011836346916554205,
"loss": 0.9738,
"step": 2185
},
{
"epoch": 7.474402730375426,
"grad_norm": 0.421875,
"learning_rate": 0.00011797291214917881,
"loss": 0.9762,
"step": 2190
},
{
"epoch": 7.491467576791809,
"grad_norm": 0.41796875,
"learning_rate": 0.00011758207162524598,
"loss": 0.9675,
"step": 2195
},
{
"epoch": 7.508532423208191,
"grad_norm": 0.384765625,
"learning_rate": 0.00011719095375892396,
"loss": 0.9923,
"step": 2200
},
{
"epoch": 7.525597269624574,
"grad_norm": 0.482421875,
"learning_rate": 0.00011679956471976814,
"loss": 0.9818,
"step": 2205
},
{
"epoch": 7.5426621160409555,
"grad_norm": 0.3671875,
"learning_rate": 0.0001164079106816113,
"loss": 0.9783,
"step": 2210
},
{
"epoch": 7.559726962457338,
"grad_norm": 0.376953125,
"learning_rate": 0.00011601599782246646,
"loss": 0.9735,
"step": 2215
},
{
"epoch": 7.57679180887372,
"grad_norm": 0.443359375,
"learning_rate": 0.00011562383232442926,
"loss": 0.9751,
"step": 2220
},
{
"epoch": 7.593856655290102,
"grad_norm": 0.3671875,
"learning_rate": 0.0001152314203735805,
"loss": 0.9734,
"step": 2225
},
{
"epoch": 7.610921501706485,
"grad_norm": 0.439453125,
"learning_rate": 0.00011483876815988867,
"loss": 0.9706,
"step": 2230
},
{
"epoch": 7.627986348122867,
"grad_norm": 0.44140625,
"learning_rate": 0.00011444588187711205,
"loss": 0.9727,
"step": 2235
},
{
"epoch": 7.645051194539249,
"grad_norm": 0.41796875,
"learning_rate": 0.00011405276772270126,
"loss": 0.9774,
"step": 2240
},
{
"epoch": 7.662116040955631,
"grad_norm": 0.353515625,
"learning_rate": 0.0001136594318977014,
"loss": 0.9815,
"step": 2245
},
{
"epoch": 7.679180887372014,
"grad_norm": 0.412109375,
"learning_rate": 0.0001132658806066542,
"loss": 0.9835,
"step": 2250
},
{
"epoch": 7.696245733788396,
"grad_norm": 0.384765625,
"learning_rate": 0.00011287212005750024,
"loss": 0.9773,
"step": 2255
},
{
"epoch": 7.713310580204778,
"grad_norm": 0.42578125,
"learning_rate": 0.00011247815646148087,
"loss": 0.9835,
"step": 2260
},
{
"epoch": 7.73037542662116,
"grad_norm": 0.56640625,
"learning_rate": 0.00011208399603304047,
"loss": 0.9832,
"step": 2265
},
{
"epoch": 7.747440273037543,
"grad_norm": 0.38671875,
"learning_rate": 0.00011168964498972818,
"loss": 0.9701,
"step": 2270
},
{
"epoch": 7.764505119453925,
"grad_norm": 0.3671875,
"learning_rate": 0.00011129510955209996,
"loss": 0.9832,
"step": 2275
},
{
"epoch": 7.7815699658703075,
"grad_norm": 0.546875,
"learning_rate": 0.00011090039594362045,
"loss": 0.9861,
"step": 2280
},
{
"epoch": 7.798634812286689,
"grad_norm": 0.5078125,
"learning_rate": 0.00011050551039056479,
"loss": 0.9881,
"step": 2285
},
{
"epoch": 7.815699658703072,
"grad_norm": 0.375,
"learning_rate": 0.00011011045912192035,
"loss": 0.9872,
"step": 2290
},
{
"epoch": 7.832764505119454,
"grad_norm": 0.373046875,
"learning_rate": 0.0001097152483692886,
"loss": 0.9819,
"step": 2295
},
{
"epoch": 7.849829351535837,
"grad_norm": 0.375,
"learning_rate": 0.00010931988436678666,
"loss": 0.9756,
"step": 2300
},
{
"epoch": 7.8668941979522184,
"grad_norm": 0.40234375,
"learning_rate": 0.00010892437335094912,
"loss": 0.9662,
"step": 2305
},
{
"epoch": 7.8839590443686,
"grad_norm": 0.427734375,
"learning_rate": 0.00010852872156062946,
"loss": 0.9669,
"step": 2310
},
{
"epoch": 7.901023890784983,
"grad_norm": 0.388671875,
"learning_rate": 0.00010813293523690191,
"loss": 0.9755,
"step": 2315
},
{
"epoch": 7.918088737201365,
"grad_norm": 0.423828125,
"learning_rate": 0.00010773702062296273,
"loss": 0.9916,
"step": 2320
},
{
"epoch": 7.935153583617748,
"grad_norm": 0.396484375,
"learning_rate": 0.00010734098396403192,
"loss": 0.9869,
"step": 2325
},
{
"epoch": 7.952218430034129,
"grad_norm": 0.447265625,
"learning_rate": 0.00010694483150725458,
"loss": 0.978,
"step": 2330
},
{
"epoch": 7.969283276450512,
"grad_norm": 0.451171875,
"learning_rate": 0.00010654856950160253,
"loss": 0.9711,
"step": 2335
},
{
"epoch": 7.986348122866894,
"grad_norm": 0.392578125,
"learning_rate": 0.00010615220419777548,
"loss": 0.9844,
"step": 2340
},
{
"epoch": 8.0,
"eval_loss": 2.489572525024414,
"eval_runtime": 0.5472,
"eval_samples_per_second": 18.276,
"eval_steps_per_second": 1.828,
"step": 2344
},
{
"epoch": 8.003412969283277,
"grad_norm": 0.44140625,
"learning_rate": 0.00010575574184810269,
"loss": 0.9713,
"step": 2345
},
{
"epoch": 8.020477815699659,
"grad_norm": 0.3984375,
"learning_rate": 0.0001053591887064442,
"loss": 0.9647,
"step": 2350
},
{
"epoch": 8.03754266211604,
"grad_norm": 0.453125,
"learning_rate": 0.00010496255102809223,
"loss": 0.9709,
"step": 2355
},
{
"epoch": 8.054607508532424,
"grad_norm": 0.431640625,
"learning_rate": 0.00010456583506967248,
"loss": 0.9701,
"step": 2360
},
{
"epoch": 8.071672354948806,
"grad_norm": 0.44921875,
"learning_rate": 0.00010416904708904548,
"loss": 0.9662,
"step": 2365
},
{
"epoch": 8.088737201365188,
"grad_norm": 0.46875,
"learning_rate": 0.00010377219334520783,
"loss": 0.9616,
"step": 2370
},
{
"epoch": 8.10580204778157,
"grad_norm": 0.4140625,
"learning_rate": 0.00010337528009819344,
"loss": 0.9609,
"step": 2375
},
{
"epoch": 8.122866894197951,
"grad_norm": 0.42578125,
"learning_rate": 0.00010297831360897492,
"loss": 0.9714,
"step": 2380
},
{
"epoch": 8.139931740614335,
"grad_norm": 0.40234375,
"learning_rate": 0.00010258130013936474,
"loss": 0.9718,
"step": 2385
},
{
"epoch": 8.156996587030717,
"grad_norm": 0.4296875,
"learning_rate": 0.00010218424595191631,
"loss": 0.963,
"step": 2390
},
{
"epoch": 8.174061433447099,
"grad_norm": 0.361328125,
"learning_rate": 0.00010178715730982549,
"loss": 0.9612,
"step": 2395
},
{
"epoch": 8.19112627986348,
"grad_norm": 0.451171875,
"learning_rate": 0.00010139004047683151,
"loss": 0.9757,
"step": 2400
},
{
"epoch": 8.208191126279864,
"grad_norm": 0.62890625,
"learning_rate": 0.00010099290171711841,
"loss": 0.961,
"step": 2405
},
{
"epoch": 8.225255972696246,
"grad_norm": 0.419921875,
"learning_rate": 0.00010059574729521595,
"loss": 0.962,
"step": 2410
},
{
"epoch": 8.242320819112628,
"grad_norm": 0.51171875,
"learning_rate": 0.0001001985834759011,
"loss": 0.9761,
"step": 2415
},
{
"epoch": 8.25938566552901,
"grad_norm": 0.390625,
"learning_rate": 9.980141652409895e-05,
"loss": 0.9718,
"step": 2420
},
{
"epoch": 8.276450511945393,
"grad_norm": 0.41796875,
"learning_rate": 9.940425270478407e-05,
"loss": 0.9672,
"step": 2425
},
{
"epoch": 8.293515358361775,
"grad_norm": 0.431640625,
"learning_rate": 9.900709828288164e-05,
"loss": 0.9658,
"step": 2430
},
{
"epoch": 8.310580204778157,
"grad_norm": 0.4140625,
"learning_rate": 9.860995952316851e-05,
"loss": 0.9776,
"step": 2435
},
{
"epoch": 8.327645051194539,
"grad_norm": 0.37890625,
"learning_rate": 9.821284269017455e-05,
"loss": 0.9664,
"step": 2440
},
{
"epoch": 8.344709897610922,
"grad_norm": 0.380859375,
"learning_rate": 9.781575404808371e-05,
"loss": 0.9672,
"step": 2445
},
{
"epoch": 8.361774744027304,
"grad_norm": 0.3828125,
"learning_rate": 9.741869986063526e-05,
"loss": 0.9778,
"step": 2450
},
{
"epoch": 8.378839590443686,
"grad_norm": 0.361328125,
"learning_rate": 9.702168639102509e-05,
"loss": 0.9659,
"step": 2455
},
{
"epoch": 8.395904436860068,
"grad_norm": 0.392578125,
"learning_rate": 9.662471990180657e-05,
"loss": 0.9623,
"step": 2460
},
{
"epoch": 8.41296928327645,
"grad_norm": 0.365234375,
"learning_rate": 9.622780665479222e-05,
"loss": 0.9657,
"step": 2465
},
{
"epoch": 8.430034129692833,
"grad_norm": 0.40234375,
"learning_rate": 9.583095291095453e-05,
"loss": 0.9679,
"step": 2470
},
{
"epoch": 8.447098976109215,
"grad_norm": 0.443359375,
"learning_rate": 9.543416493032757e-05,
"loss": 0.9686,
"step": 2475
},
{
"epoch": 8.464163822525597,
"grad_norm": 0.404296875,
"learning_rate": 9.503744897190778e-05,
"loss": 0.9679,
"step": 2480
},
{
"epoch": 8.481228668941979,
"grad_norm": 0.396484375,
"learning_rate": 9.464081129355586e-05,
"loss": 0.9588,
"step": 2485
},
{
"epoch": 8.498293515358363,
"grad_norm": 0.431640625,
"learning_rate": 9.424425815189733e-05,
"loss": 0.9775,
"step": 2490
},
{
"epoch": 8.515358361774744,
"grad_norm": 0.384765625,
"learning_rate": 9.384779580222453e-05,
"loss": 0.9668,
"step": 2495
},
{
"epoch": 8.532423208191126,
"grad_norm": 0.447265625,
"learning_rate": 9.345143049839749e-05,
"loss": 0.9677,
"step": 2500
},
{
"epoch": 8.549488054607508,
"grad_norm": 0.48828125,
"learning_rate": 9.305516849274541e-05,
"loss": 0.9603,
"step": 2505
},
{
"epoch": 8.56655290102389,
"grad_norm": 0.427734375,
"learning_rate": 9.265901603596811e-05,
"loss": 0.9688,
"step": 2510
},
{
"epoch": 8.583617747440274,
"grad_norm": 0.498046875,
"learning_rate": 9.226297937703728e-05,
"loss": 0.9645,
"step": 2515
},
{
"epoch": 8.600682593856655,
"grad_norm": 0.431640625,
"learning_rate": 9.186706476309812e-05,
"loss": 0.967,
"step": 2520
},
{
"epoch": 8.617747440273037,
"grad_norm": 0.423828125,
"learning_rate": 9.147127843937055e-05,
"loss": 0.9711,
"step": 2525
},
{
"epoch": 8.634812286689419,
"grad_norm": 0.455078125,
"learning_rate": 9.107562664905093e-05,
"loss": 0.971,
"step": 2530
},
{
"epoch": 8.651877133105803,
"grad_norm": 0.484375,
"learning_rate": 9.068011563321336e-05,
"loss": 0.9722,
"step": 2535
},
{
"epoch": 8.668941979522184,
"grad_norm": 0.435546875,
"learning_rate": 9.028475163071141e-05,
"loss": 0.9747,
"step": 2540
},
{
"epoch": 8.686006825938566,
"grad_norm": 0.4140625,
"learning_rate": 8.988954087807968e-05,
"loss": 0.9638,
"step": 2545
},
{
"epoch": 8.703071672354948,
"grad_norm": 0.400390625,
"learning_rate": 8.949448960943524e-05,
"loss": 0.9625,
"step": 2550
},
{
"epoch": 8.720136518771332,
"grad_norm": 0.49609375,
"learning_rate": 8.909960405637958e-05,
"loss": 0.9568,
"step": 2555
},
{
"epoch": 8.737201365187714,
"grad_norm": 0.435546875,
"learning_rate": 8.870489044790006e-05,
"loss": 0.9766,
"step": 2560
},
{
"epoch": 8.754266211604095,
"grad_norm": 0.41015625,
"learning_rate": 8.831035501027186e-05,
"loss": 0.967,
"step": 2565
},
{
"epoch": 8.771331058020477,
"grad_norm": 0.376953125,
"learning_rate": 8.791600396695954e-05,
"loss": 0.9686,
"step": 2570
},
{
"epoch": 8.788395904436861,
"grad_norm": 0.373046875,
"learning_rate": 8.752184353851916e-05,
"loss": 0.9684,
"step": 2575
},
{
"epoch": 8.805460750853243,
"grad_norm": 0.435546875,
"learning_rate": 8.712787994249979e-05,
"loss": 0.977,
"step": 2580
},
{
"epoch": 8.822525597269625,
"grad_norm": 0.419921875,
"learning_rate": 8.673411939334581e-05,
"loss": 0.9712,
"step": 2585
},
{
"epoch": 8.839590443686006,
"grad_norm": 0.478515625,
"learning_rate": 8.634056810229862e-05,
"loss": 0.9692,
"step": 2590
},
{
"epoch": 8.856655290102388,
"grad_norm": 0.404296875,
"learning_rate": 8.594723227729875e-05,
"loss": 0.9639,
"step": 2595
},
{
"epoch": 8.873720136518772,
"grad_norm": 0.447265625,
"learning_rate": 8.555411812288798e-05,
"loss": 0.974,
"step": 2600
},
{
"epoch": 8.890784982935154,
"grad_norm": 0.392578125,
"learning_rate": 8.516123184011135e-05,
"loss": 0.9589,
"step": 2605
},
{
"epoch": 8.907849829351536,
"grad_norm": 0.43359375,
"learning_rate": 8.47685796264195e-05,
"loss": 0.968,
"step": 2610
},
{
"epoch": 8.924914675767917,
"grad_norm": 0.396484375,
"learning_rate": 8.437616767557077e-05,
"loss": 0.9693,
"step": 2615
},
{
"epoch": 8.941979522184301,
"grad_norm": 0.5390625,
"learning_rate": 8.398400217753357e-05,
"loss": 0.9727,
"step": 2620
},
{
"epoch": 8.959044368600683,
"grad_norm": 0.419921875,
"learning_rate": 8.359208931838871e-05,
"loss": 0.9708,
"step": 2625
},
{
"epoch": 8.976109215017065,
"grad_norm": 0.427734375,
"learning_rate": 8.320043528023188e-05,
"loss": 0.9607,
"step": 2630
},
{
"epoch": 8.993174061433447,
"grad_norm": 0.455078125,
"learning_rate": 8.280904624107606e-05,
"loss": 0.9779,
"step": 2635
},
{
"epoch": 9.0,
"eval_loss": 2.502519130706787,
"eval_runtime": 0.5483,
"eval_samples_per_second": 18.238,
"eval_steps_per_second": 1.824,
"step": 2637
},
{
"epoch": 9.01023890784983,
"grad_norm": 0.3828125,
"learning_rate": 8.241792837475405e-05,
"loss": 0.9673,
"step": 2640
},
{
"epoch": 9.027303754266212,
"grad_norm": 0.42578125,
"learning_rate": 8.202708785082121e-05,
"loss": 0.9481,
"step": 2645
},
{
"epoch": 9.044368600682594,
"grad_norm": 0.39453125,
"learning_rate": 8.163653083445799e-05,
"loss": 0.9694,
"step": 2650
},
{
"epoch": 9.061433447098976,
"grad_norm": 0.392578125,
"learning_rate": 8.124626348637279e-05,
"loss": 0.9651,
"step": 2655
},
{
"epoch": 9.078498293515358,
"grad_norm": 0.376953125,
"learning_rate": 8.085629196270469e-05,
"loss": 0.9561,
"step": 2660
},
{
"epoch": 9.095563139931741,
"grad_norm": 0.408203125,
"learning_rate": 8.046662241492645e-05,
"loss": 0.9617,
"step": 2665
},
{
"epoch": 9.112627986348123,
"grad_norm": 0.408203125,
"learning_rate": 8.007726098974734e-05,
"loss": 0.9636,
"step": 2670
},
{
"epoch": 9.129692832764505,
"grad_norm": 0.390625,
"learning_rate": 7.96882138290163e-05,
"loss": 0.9661,
"step": 2675
},
{
"epoch": 9.146757679180887,
"grad_norm": 0.396484375,
"learning_rate": 7.929948706962508e-05,
"loss": 0.9577,
"step": 2680
},
{
"epoch": 9.16382252559727,
"grad_norm": 0.41796875,
"learning_rate": 7.891108684341121e-05,
"loss": 0.961,
"step": 2685
},
{
"epoch": 9.180887372013652,
"grad_norm": 0.37109375,
"learning_rate": 7.852301927706159e-05,
"loss": 0.9602,
"step": 2690
},
{
"epoch": 9.197952218430034,
"grad_norm": 0.396484375,
"learning_rate": 7.813529049201556e-05,
"loss": 0.9544,
"step": 2695
},
{
"epoch": 9.215017064846416,
"grad_norm": 0.470703125,
"learning_rate": 7.774790660436858e-05,
"loss": 0.9569,
"step": 2700
},
{
"epoch": 9.2320819112628,
"grad_norm": 0.375,
"learning_rate": 7.736087372477554e-05,
"loss": 0.9636,
"step": 2705
},
{
"epoch": 9.249146757679181,
"grad_norm": 0.37109375,
"learning_rate": 7.69741979583546e-05,
"loss": 0.9574,
"step": 2710
},
{
"epoch": 9.266211604095563,
"grad_norm": 0.390625,
"learning_rate": 7.658788540459062e-05,
"loss": 0.9536,
"step": 2715
},
{
"epoch": 9.283276450511945,
"grad_norm": 0.388671875,
"learning_rate": 7.620194215723919e-05,
"loss": 0.9598,
"step": 2720
},
{
"epoch": 9.300341296928327,
"grad_norm": 0.3828125,
"learning_rate": 7.581637430423037e-05,
"loss": 0.9657,
"step": 2725
},
{
"epoch": 9.31740614334471,
"grad_norm": 0.435546875,
"learning_rate": 7.543118792757266e-05,
"loss": 0.9639,
"step": 2730
},
{
"epoch": 9.334470989761092,
"grad_norm": 0.408203125,
"learning_rate": 7.504638910325717e-05,
"loss": 0.9625,
"step": 2735
},
{
"epoch": 9.351535836177474,
"grad_norm": 0.37109375,
"learning_rate": 7.466198390116158e-05,
"loss": 0.9585,
"step": 2740
},
{
"epoch": 9.368600682593856,
"grad_norm": 0.447265625,
"learning_rate": 7.427797838495463e-05,
"loss": 0.9634,
"step": 2745
},
{
"epoch": 9.38566552901024,
"grad_norm": 0.41796875,
"learning_rate": 7.389437861200024e-05,
"loss": 0.9624,
"step": 2750
},
{
"epoch": 9.402730375426621,
"grad_norm": 0.408203125,
"learning_rate": 7.35111906332622e-05,
"loss": 0.9555,
"step": 2755
},
{
"epoch": 9.419795221843003,
"grad_norm": 0.435546875,
"learning_rate": 7.312842049320844e-05,
"loss": 0.9575,
"step": 2760
},
{
"epoch": 9.436860068259385,
"grad_norm": 0.42578125,
"learning_rate": 7.2746074229716e-05,
"loss": 0.9598,
"step": 2765
},
{
"epoch": 9.453924914675769,
"grad_norm": 0.423828125,
"learning_rate": 7.236415787397548e-05,
"loss": 0.9594,
"step": 2770
},
{
"epoch": 9.47098976109215,
"grad_norm": 0.408203125,
"learning_rate": 7.198267745039612e-05,
"loss": 0.9571,
"step": 2775
},
{
"epoch": 9.488054607508532,
"grad_norm": 0.41015625,
"learning_rate": 7.160163897651075e-05,
"loss": 0.9582,
"step": 2780
},
{
"epoch": 9.505119453924914,
"grad_norm": 0.453125,
"learning_rate": 7.122104846288064e-05,
"loss": 0.9583,
"step": 2785
},
{
"epoch": 9.522184300341298,
"grad_norm": 0.474609375,
"learning_rate": 7.08409119130011e-05,
"loss": 0.9713,
"step": 2790
},
{
"epoch": 9.53924914675768,
"grad_norm": 0.388671875,
"learning_rate": 7.04612353232063e-05,
"loss": 0.9538,
"step": 2795
},
{
"epoch": 9.556313993174061,
"grad_norm": 0.41796875,
"learning_rate": 7.008202468257514e-05,
"loss": 0.9572,
"step": 2800
},
{
"epoch": 9.573378839590443,
"grad_norm": 0.41015625,
"learning_rate": 6.970328597283637e-05,
"loss": 0.9483,
"step": 2805
},
{
"epoch": 9.590443686006825,
"grad_norm": 0.40234375,
"learning_rate": 6.932502516827461e-05,
"loss": 0.9521,
"step": 2810
},
{
"epoch": 9.607508532423209,
"grad_norm": 0.38671875,
"learning_rate": 6.894724823563583e-05,
"loss": 0.9534,
"step": 2815
},
{
"epoch": 9.62457337883959,
"grad_norm": 0.41015625,
"learning_rate": 6.85699611340333e-05,
"loss": 0.9611,
"step": 2820
},
{
"epoch": 9.641638225255972,
"grad_norm": 0.369140625,
"learning_rate": 6.819316981485372e-05,
"loss": 0.9499,
"step": 2825
},
{
"epoch": 9.658703071672354,
"grad_norm": 0.361328125,
"learning_rate": 6.781688022166311e-05,
"loss": 0.9689,
"step": 2830
},
{
"epoch": 9.675767918088738,
"grad_norm": 0.40234375,
"learning_rate": 6.744109829011332e-05,
"loss": 0.9492,
"step": 2835
},
{
"epoch": 9.69283276450512,
"grad_norm": 0.384765625,
"learning_rate": 6.706582994784814e-05,
"loss": 0.9626,
"step": 2840
},
{
"epoch": 9.709897610921502,
"grad_norm": 0.408203125,
"learning_rate": 6.669108111441003e-05,
"loss": 0.9641,
"step": 2845
},
{
"epoch": 9.726962457337883,
"grad_norm": 0.40234375,
"learning_rate": 6.631685770114654e-05,
"loss": 0.9578,
"step": 2850
},
{
"epoch": 9.744027303754265,
"grad_norm": 0.37890625,
"learning_rate": 6.594316561111724e-05,
"loss": 0.9648,
"step": 2855
},
{
"epoch": 9.761092150170649,
"grad_norm": 0.390625,
"learning_rate": 6.557001073900044e-05,
"loss": 0.957,
"step": 2860
},
{
"epoch": 9.77815699658703,
"grad_norm": 0.375,
"learning_rate": 6.519739897100034e-05,
"loss": 0.9513,
"step": 2865
},
{
"epoch": 9.795221843003413,
"grad_norm": 0.453125,
"learning_rate": 6.482533618475422e-05,
"loss": 0.9591,
"step": 2870
},
{
"epoch": 9.812286689419794,
"grad_norm": 0.369140625,
"learning_rate": 6.445382824923938e-05,
"loss": 0.9625,
"step": 2875
},
{
"epoch": 9.829351535836178,
"grad_norm": 0.37109375,
"learning_rate": 6.408288102468113e-05,
"loss": 0.9606,
"step": 2880
},
{
"epoch": 9.84641638225256,
"grad_norm": 0.37890625,
"learning_rate": 6.371250036245976e-05,
"loss": 0.9662,
"step": 2885
},
{
"epoch": 9.863481228668942,
"grad_norm": 0.373046875,
"learning_rate": 6.334269210501875e-05,
"loss": 0.9635,
"step": 2890
},
{
"epoch": 9.880546075085324,
"grad_norm": 0.365234375,
"learning_rate": 6.297346208577213e-05,
"loss": 0.9649,
"step": 2895
},
{
"epoch": 9.897610921501707,
"grad_norm": 0.390625,
"learning_rate": 6.260481612901299e-05,
"loss": 0.9516,
"step": 2900
},
{
"epoch": 9.914675767918089,
"grad_norm": 0.3828125,
"learning_rate": 6.223676004982105e-05,
"loss": 0.9601,
"step": 2905
},
{
"epoch": 9.93174061433447,
"grad_norm": 0.5625,
"learning_rate": 6.18692996539714e-05,
"loss": 0.9611,
"step": 2910
},
{
"epoch": 9.948805460750853,
"grad_norm": 0.39453125,
"learning_rate": 6.150244073784266e-05,
"loss": 0.9742,
"step": 2915
},
{
"epoch": 9.965870307167236,
"grad_norm": 0.4296875,
"learning_rate": 6.113618908832561e-05,
"loss": 0.9666,
"step": 2920
},
{
"epoch": 9.982935153583618,
"grad_norm": 0.447265625,
"learning_rate": 6.0770550482731924e-05,
"loss": 0.9684,
"step": 2925
},
{
"epoch": 10.0,
"grad_norm": 0.41015625,
"learning_rate": 6.0405530688702986e-05,
"loss": 0.9639,
"step": 2930
},
{
"epoch": 10.0,
"eval_loss": 2.512617588043213,
"eval_runtime": 0.5446,
"eval_samples_per_second": 18.362,
"eval_steps_per_second": 1.836,
"step": 2930
},
{
"epoch": 10.017064846416382,
"grad_norm": 0.427734375,
"learning_rate": 6.0041135464119024e-05,
"loss": 0.9618,
"step": 2935
},
{
"epoch": 10.034129692832764,
"grad_norm": 0.384765625,
"learning_rate": 5.9677370557008104e-05,
"loss": 0.9433,
"step": 2940
},
{
"epoch": 10.051194539249147,
"grad_norm": 0.478515625,
"learning_rate": 5.9314241705455674e-05,
"loss": 0.9543,
"step": 2945
},
{
"epoch": 10.06825938566553,
"grad_norm": 0.408203125,
"learning_rate": 5.895175463751385e-05,
"loss": 0.9579,
"step": 2950
},
{
"epoch": 10.085324232081911,
"grad_norm": 0.380859375,
"learning_rate": 5.858991507111122e-05,
"loss": 0.9506,
"step": 2955
},
{
"epoch": 10.102389078498293,
"grad_norm": 0.3828125,
"learning_rate": 5.8228728713962543e-05,
"loss": 0.9582,
"step": 2960
},
{
"epoch": 10.119453924914676,
"grad_norm": 0.38671875,
"learning_rate": 5.786820126347876e-05,
"loss": 0.9576,
"step": 2965
},
{
"epoch": 10.136518771331058,
"grad_norm": 0.4140625,
"learning_rate": 5.750833840667711e-05,
"loss": 0.9506,
"step": 2970
},
{
"epoch": 10.15358361774744,
"grad_norm": 0.390625,
"learning_rate": 5.7149145820091385e-05,
"loss": 0.952,
"step": 2975
},
{
"epoch": 10.170648464163822,
"grad_norm": 0.38671875,
"learning_rate": 5.6790629169682564e-05,
"loss": 0.9532,
"step": 2980
},
{
"epoch": 10.187713310580206,
"grad_norm": 0.396484375,
"learning_rate": 5.6432794110749134e-05,
"loss": 0.9459,
"step": 2985
},
{
"epoch": 10.204778156996587,
"grad_norm": 0.490234375,
"learning_rate": 5.607564628783817e-05,
"loss": 0.9513,
"step": 2990
},
{
"epoch": 10.22184300341297,
"grad_norm": 0.41796875,
"learning_rate": 5.571919133465605e-05,
"loss": 0.9499,
"step": 2995
},
{
"epoch": 10.238907849829351,
"grad_norm": 0.392578125,
"learning_rate": 5.5363434873979903e-05,
"loss": 0.9481,
"step": 3000
},
{
"epoch": 10.255972696245733,
"grad_norm": 0.380859375,
"learning_rate": 5.500838251756857e-05,
"loss": 0.9501,
"step": 3005
},
{
"epoch": 10.273037542662117,
"grad_norm": 0.3671875,
"learning_rate": 5.465403986607426e-05,
"loss": 0.9498,
"step": 3010
},
{
"epoch": 10.290102389078498,
"grad_norm": 0.396484375,
"learning_rate": 5.430041250895428e-05,
"loss": 0.947,
"step": 3015
},
{
"epoch": 10.30716723549488,
"grad_norm": 0.42578125,
"learning_rate": 5.3947506024382665e-05,
"loss": 0.9581,
"step": 3020
},
{
"epoch": 10.324232081911262,
"grad_norm": 0.408203125,
"learning_rate": 5.359532597916233e-05,
"loss": 0.9549,
"step": 3025
},
{
"epoch": 10.341296928327646,
"grad_norm": 0.40625,
"learning_rate": 5.324387792863719e-05,
"loss": 0.968,
"step": 3030
},
{
"epoch": 10.358361774744028,
"grad_norm": 0.404296875,
"learning_rate": 5.289316741660466e-05,
"loss": 0.9499,
"step": 3035
},
{
"epoch": 10.37542662116041,
"grad_norm": 0.3828125,
"learning_rate": 5.254319997522796e-05,
"loss": 0.9639,
"step": 3040
},
{
"epoch": 10.392491467576791,
"grad_norm": 0.404296875,
"learning_rate": 5.21939811249492e-05,
"loss": 0.9555,
"step": 3045
},
{
"epoch": 10.409556313993175,
"grad_norm": 0.38671875,
"learning_rate": 5.1845516374401784e-05,
"loss": 0.9533,
"step": 3050
},
{
"epoch": 10.426621160409557,
"grad_norm": 0.421875,
"learning_rate": 5.14978112203241e-05,
"loss": 0.9632,
"step": 3055
},
{
"epoch": 10.443686006825939,
"grad_norm": 0.380859375,
"learning_rate": 5.11508711474725e-05,
"loss": 0.9596,
"step": 3060
},
{
"epoch": 10.46075085324232,
"grad_norm": 0.4140625,
"learning_rate": 5.080470162853472e-05,
"loss": 0.963,
"step": 3065
},
{
"epoch": 10.477815699658702,
"grad_norm": 0.412109375,
"learning_rate": 5.0459308124043715e-05,
"loss": 0.9602,
"step": 3070
},
{
"epoch": 10.494880546075086,
"grad_norm": 0.4375,
"learning_rate": 5.0114696082291425e-05,
"loss": 0.9429,
"step": 3075
},
{
"epoch": 10.511945392491468,
"grad_norm": 0.3828125,
"learning_rate": 4.9770870939242986e-05,
"loss": 0.9569,
"step": 3080
},
{
"epoch": 10.52901023890785,
"grad_norm": 0.396484375,
"learning_rate": 4.942783811845074e-05,
"loss": 0.945,
"step": 3085
},
{
"epoch": 10.546075085324231,
"grad_norm": 0.38671875,
"learning_rate": 4.908560303096887e-05,
"loss": 0.955,
"step": 3090
},
{
"epoch": 10.563139931740615,
"grad_norm": 0.404296875,
"learning_rate": 4.874417107526795e-05,
"loss": 0.9583,
"step": 3095
},
{
"epoch": 10.580204778156997,
"grad_norm": 0.38671875,
"learning_rate": 4.840354763714991e-05,
"loss": 0.9499,
"step": 3100
},
{
"epoch": 10.597269624573379,
"grad_norm": 0.41015625,
"learning_rate": 4.8063738089662926e-05,
"loss": 0.9528,
"step": 3105
},
{
"epoch": 10.61433447098976,
"grad_norm": 0.373046875,
"learning_rate": 4.772474779301669e-05,
"loss": 0.9581,
"step": 3110
},
{
"epoch": 10.631399317406144,
"grad_norm": 0.4140625,
"learning_rate": 4.738658209449805e-05,
"loss": 0.9456,
"step": 3115
},
{
"epoch": 10.648464163822526,
"grad_norm": 0.384765625,
"learning_rate": 4.704924632838636e-05,
"loss": 0.9507,
"step": 3120
},
{
"epoch": 10.665529010238908,
"grad_norm": 0.384765625,
"learning_rate": 4.671274581586958e-05,
"loss": 0.9586,
"step": 3125
},
{
"epoch": 10.68259385665529,
"grad_norm": 0.375,
"learning_rate": 4.637708586496018e-05,
"loss": 0.9487,
"step": 3130
},
{
"epoch": 10.699658703071673,
"grad_norm": 0.38671875,
"learning_rate": 4.604227177041156e-05,
"loss": 0.9511,
"step": 3135
},
{
"epoch": 10.716723549488055,
"grad_norm": 0.404296875,
"learning_rate": 4.570830881363439e-05,
"loss": 0.9529,
"step": 3140
},
{
"epoch": 10.733788395904437,
"grad_norm": 0.5078125,
"learning_rate": 4.537520226261333e-05,
"loss": 0.962,
"step": 3145
},
{
"epoch": 10.750853242320819,
"grad_norm": 0.396484375,
"learning_rate": 4.5042957371824057e-05,
"loss": 0.9551,
"step": 3150
},
{
"epoch": 10.7679180887372,
"grad_norm": 0.42578125,
"learning_rate": 4.471157938215017e-05,
"loss": 0.9537,
"step": 3155
},
{
"epoch": 10.784982935153584,
"grad_norm": 0.3984375,
"learning_rate": 4.438107352080076e-05,
"loss": 0.9573,
"step": 3160
},
{
"epoch": 10.802047781569966,
"grad_norm": 0.384765625,
"learning_rate": 4.405144500122772e-05,
"loss": 0.9615,
"step": 3165
},
{
"epoch": 10.819112627986348,
"grad_norm": 0.365234375,
"learning_rate": 4.372269902304363e-05,
"loss": 0.9592,
"step": 3170
},
{
"epoch": 10.83617747440273,
"grad_norm": 0.38671875,
"learning_rate": 4.339484077193974e-05,
"loss": 0.9518,
"step": 3175
},
{
"epoch": 10.853242320819113,
"grad_norm": 0.423828125,
"learning_rate": 4.3067875419604184e-05,
"loss": 0.953,
"step": 3180
},
{
"epoch": 10.870307167235495,
"grad_norm": 0.376953125,
"learning_rate": 4.2741808123640335e-05,
"loss": 0.9578,
"step": 3185
},
{
"epoch": 10.887372013651877,
"grad_norm": 0.36328125,
"learning_rate": 4.241664402748544e-05,
"loss": 0.9548,
"step": 3190
},
{
"epoch": 10.904436860068259,
"grad_norm": 0.361328125,
"learning_rate": 4.209238826032965e-05,
"loss": 0.955,
"step": 3195
},
{
"epoch": 10.921501706484642,
"grad_norm": 0.380859375,
"learning_rate": 4.1769045937034876e-05,
"loss": 0.9591,
"step": 3200
},
{
"epoch": 10.938566552901024,
"grad_norm": 0.43359375,
"learning_rate": 4.144662215805426e-05,
"loss": 0.9544,
"step": 3205
},
{
"epoch": 10.955631399317406,
"grad_norm": 0.58984375,
"learning_rate": 4.1125122009351634e-05,
"loss": 0.9539,
"step": 3210
},
{
"epoch": 10.972696245733788,
"grad_norm": 0.416015625,
"learning_rate": 4.080455056232147e-05,
"loss": 0.9497,
"step": 3215
},
{
"epoch": 10.98976109215017,
"grad_norm": 0.421875,
"learning_rate": 4.048491287370863e-05,
"loss": 0.952,
"step": 3220
},
{
"epoch": 11.0,
"eval_loss": 2.519228935241699,
"eval_runtime": 0.5351,
"eval_samples_per_second": 18.688,
"eval_steps_per_second": 1.869,
"step": 3223
},
{
"epoch": 11.006825938566553,
"grad_norm": 0.404296875,
"learning_rate": 4.016621398552877e-05,
"loss": 0.954,
"step": 3225
},
{
"epoch": 11.023890784982935,
"grad_norm": 0.390625,
"learning_rate": 3.9848458924988684e-05,
"loss": 0.9494,
"step": 3230
},
{
"epoch": 11.040955631399317,
"grad_norm": 0.404296875,
"learning_rate": 3.953165270440721e-05,
"loss": 0.9434,
"step": 3235
},
{
"epoch": 11.058020477815699,
"grad_norm": 0.38671875,
"learning_rate": 3.921580032113602e-05,
"loss": 0.9542,
"step": 3240
},
{
"epoch": 11.075085324232083,
"grad_norm": 0.388671875,
"learning_rate": 3.8900906757480614e-05,
"loss": 0.9519,
"step": 3245
},
{
"epoch": 11.092150170648464,
"grad_norm": 0.388671875,
"learning_rate": 3.858697698062217e-05,
"loss": 0.9597,
"step": 3250
},
{
"epoch": 11.109215017064846,
"grad_norm": 0.373046875,
"learning_rate": 3.8274015942538745e-05,
"loss": 0.9437,
"step": 3255
},
{
"epoch": 11.126279863481228,
"grad_norm": 0.37890625,
"learning_rate": 3.7962028579927555e-05,
"loss": 0.9545,
"step": 3260
},
{
"epoch": 11.143344709897612,
"grad_norm": 0.392578125,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.9524,
"step": 3265
},
{
"epoch": 11.160409556313994,
"grad_norm": 0.37890625,
"learning_rate": 3.734099455103779e-05,
"loss": 0.9591,
"step": 3270
},
{
"epoch": 11.177474402730375,
"grad_norm": 0.38671875,
"learning_rate": 3.7031957681048604e-05,
"loss": 0.9503,
"step": 3275
},
{
"epoch": 11.194539249146757,
"grad_norm": 0.384765625,
"learning_rate": 3.6723914078955825e-05,
"loss": 0.9456,
"step": 3280
},
{
"epoch": 11.211604095563139,
"grad_norm": 0.380859375,
"learning_rate": 3.64168686038881e-05,
"loss": 0.9426,
"step": 3285
},
{
"epoch": 11.228668941979523,
"grad_norm": 0.390625,
"learning_rate": 3.6110826099229453e-05,
"loss": 0.9496,
"step": 3290
},
{
"epoch": 11.245733788395905,
"grad_norm": 0.37109375,
"learning_rate": 3.580579139254303e-05,
"loss": 0.9515,
"step": 3295
},
{
"epoch": 11.262798634812286,
"grad_norm": 0.3828125,
"learning_rate": 3.550176929549468e-05,
"loss": 0.9535,
"step": 3300
},
{
"epoch": 11.279863481228668,
"grad_norm": 0.3671875,
"learning_rate": 3.5198764603777235e-05,
"loss": 0.9575,
"step": 3305
},
{
"epoch": 11.296928327645052,
"grad_norm": 0.376953125,
"learning_rate": 3.489678209703475e-05,
"loss": 0.9468,
"step": 3310
},
{
"epoch": 11.313993174061434,
"grad_norm": 0.396484375,
"learning_rate": 3.459582653878731e-05,
"loss": 0.9536,
"step": 3315
},
{
"epoch": 11.331058020477816,
"grad_norm": 0.39453125,
"learning_rate": 3.429590267635565e-05,
"loss": 0.9575,
"step": 3320
},
{
"epoch": 11.348122866894197,
"grad_norm": 0.38671875,
"learning_rate": 3.399701524078635e-05,
"loss": 0.9533,
"step": 3325
},
{
"epoch": 11.365187713310581,
"grad_norm": 0.380859375,
"learning_rate": 3.369916894677733e-05,
"loss": 0.9414,
"step": 3330
},
{
"epoch": 11.382252559726963,
"grad_norm": 0.421875,
"learning_rate": 3.340236849260324e-05,
"loss": 0.9494,
"step": 3335
},
{
"epoch": 11.399317406143345,
"grad_norm": 0.419921875,
"learning_rate": 3.31066185600417e-05,
"loss": 0.9457,
"step": 3340
},
{
"epoch": 11.416382252559726,
"grad_norm": 0.384765625,
"learning_rate": 3.281192381429894e-05,
"loss": 0.9403,
"step": 3345
},
{
"epoch": 11.43344709897611,
"grad_norm": 0.375,
"learning_rate": 3.251828890393677e-05,
"loss": 0.9489,
"step": 3350
},
{
"epoch": 11.450511945392492,
"grad_norm": 0.412109375,
"learning_rate": 3.222571846079881e-05,
"loss": 0.9525,
"step": 3355
},
{
"epoch": 11.467576791808874,
"grad_norm": 0.37109375,
"learning_rate": 3.193421709993779e-05,
"loss": 0.9574,
"step": 3360
},
{
"epoch": 11.484641638225256,
"grad_norm": 0.390625,
"learning_rate": 3.1643789419542324e-05,
"loss": 0.9453,
"step": 3365
},
{
"epoch": 11.501706484641637,
"grad_norm": 0.3828125,
"learning_rate": 3.135444000086485e-05,
"loss": 0.9462,
"step": 3370
},
{
"epoch": 11.518771331058021,
"grad_norm": 0.384765625,
"learning_rate": 3.1066173408148955e-05,
"loss": 0.9551,
"step": 3375
},
{
"epoch": 11.535836177474403,
"grad_norm": 0.404296875,
"learning_rate": 3.077899418855772e-05,
"loss": 0.9504,
"step": 3380
},
{
"epoch": 11.552901023890785,
"grad_norm": 0.400390625,
"learning_rate": 3.04929068721017e-05,
"loss": 0.9496,
"step": 3385
},
{
"epoch": 11.569965870307167,
"grad_norm": 0.380859375,
"learning_rate": 3.0207915971567624e-05,
"loss": 0.9426,
"step": 3390
},
{
"epoch": 11.58703071672355,
"grad_norm": 0.384765625,
"learning_rate": 2.992402598244727e-05,
"loss": 0.9458,
"step": 3395
},
{
"epoch": 11.604095563139932,
"grad_norm": 0.384765625,
"learning_rate": 2.9641241382866348e-05,
"loss": 0.9525,
"step": 3400
},
{
"epoch": 11.621160409556314,
"grad_norm": 0.400390625,
"learning_rate": 2.9359566633514037e-05,
"loss": 0.9449,
"step": 3405
},
{
"epoch": 11.638225255972696,
"grad_norm": 0.380859375,
"learning_rate": 2.907900617757252e-05,
"loss": 0.9526,
"step": 3410
},
{
"epoch": 11.655290102389078,
"grad_norm": 0.373046875,
"learning_rate": 2.879956444064703e-05,
"loss": 0.9598,
"step": 3415
},
{
"epoch": 11.672354948805461,
"grad_norm": 0.388671875,
"learning_rate": 2.8521245830695864e-05,
"loss": 0.9484,
"step": 3420
},
{
"epoch": 11.689419795221843,
"grad_norm": 0.3828125,
"learning_rate": 2.8244054737960935e-05,
"loss": 0.9431,
"step": 3425
},
{
"epoch": 11.706484641638225,
"grad_norm": 0.365234375,
"learning_rate": 2.7967995534898596e-05,
"loss": 0.9554,
"step": 3430
},
{
"epoch": 11.723549488054607,
"grad_norm": 0.390625,
"learning_rate": 2.7693072576110514e-05,
"loss": 0.9519,
"step": 3435
},
{
"epoch": 11.74061433447099,
"grad_norm": 0.365234375,
"learning_rate": 2.7419290198275095e-05,
"loss": 0.9509,
"step": 3440
},
{
"epoch": 11.757679180887372,
"grad_norm": 0.40234375,
"learning_rate": 2.7146652720079003e-05,
"loss": 0.9578,
"step": 3445
},
{
"epoch": 11.774744027303754,
"grad_norm": 0.376953125,
"learning_rate": 2.6875164442149147e-05,
"loss": 0.9449,
"step": 3450
},
{
"epoch": 11.791808873720136,
"grad_norm": 0.40625,
"learning_rate": 2.6604829646984686e-05,
"loss": 0.9505,
"step": 3455
},
{
"epoch": 11.80887372013652,
"grad_norm": 0.3984375,
"learning_rate": 2.6335652598889683e-05,
"loss": 0.9433,
"step": 3460
},
{
"epoch": 11.825938566552901,
"grad_norm": 0.380859375,
"learning_rate": 2.60676375439055e-05,
"loss": 0.9464,
"step": 3465
},
{
"epoch": 11.843003412969283,
"grad_norm": 0.384765625,
"learning_rate": 2.5800788709744227e-05,
"loss": 0.955,
"step": 3470
},
{
"epoch": 11.860068259385665,
"grad_norm": 0.380859375,
"learning_rate": 2.5535110305721776e-05,
"loss": 0.9458,
"step": 3475
},
{
"epoch": 11.877133105802049,
"grad_norm": 0.3828125,
"learning_rate": 2.5270606522691443e-05,
"loss": 0.9544,
"step": 3480
},
{
"epoch": 11.89419795221843,
"grad_norm": 0.408203125,
"learning_rate": 2.500728153297788e-05,
"loss": 0.9534,
"step": 3485
},
{
"epoch": 11.911262798634812,
"grad_norm": 0.373046875,
"learning_rate": 2.4745139490311254e-05,
"loss": 0.9521,
"step": 3490
},
{
"epoch": 11.928327645051194,
"grad_norm": 0.392578125,
"learning_rate": 2.4484184529761834e-05,
"loss": 0.948,
"step": 3495
},
{
"epoch": 11.945392491467576,
"grad_norm": 0.39453125,
"learning_rate": 2.4224420767674562e-05,
"loss": 0.9543,
"step": 3500
},
{
"epoch": 11.96245733788396,
"grad_norm": 0.375,
"learning_rate": 2.3965852301604254e-05,
"loss": 0.959,
"step": 3505
},
{
"epoch": 11.979522184300341,
"grad_norm": 0.375,
"learning_rate": 2.370848321025093e-05,
"loss": 0.9599,
"step": 3510
},
{
"epoch": 11.996587030716723,
"grad_norm": 0.37109375,
"learning_rate": 2.345231755339554e-05,
"loss": 0.9505,
"step": 3515
},
{
"epoch": 12.0,
"eval_loss": 2.520477771759033,
"eval_runtime": 0.5502,
"eval_samples_per_second": 18.175,
"eval_steps_per_second": 1.818,
"step": 3516
},
{
"epoch": 12.013651877133105,
"grad_norm": 0.43359375,
"learning_rate": 2.3197359371835802e-05,
"loss": 0.9615,
"step": 3520
},
{
"epoch": 12.030716723549489,
"grad_norm": 0.376953125,
"learning_rate": 2.2943612687322525e-05,
"loss": 0.9485,
"step": 3525
},
{
"epoch": 12.04778156996587,
"grad_norm": 0.384765625,
"learning_rate": 2.2691081502496246e-05,
"loss": 0.9475,
"step": 3530
},
{
"epoch": 12.064846416382252,
"grad_norm": 0.388671875,
"learning_rate": 2.243976980082394e-05,
"loss": 0.9393,
"step": 3535
},
{
"epoch": 12.081911262798634,
"grad_norm": 0.39453125,
"learning_rate": 2.218968154653629e-05,
"loss": 0.9466,
"step": 3540
},
{
"epoch": 12.098976109215018,
"grad_norm": 0.376953125,
"learning_rate": 2.194082068456509e-05,
"loss": 0.9537,
"step": 3545
},
{
"epoch": 12.1160409556314,
"grad_norm": 0.36328125,
"learning_rate": 2.169319114048114e-05,
"loss": 0.961,
"step": 3550
},
{
"epoch": 12.133105802047782,
"grad_norm": 0.38671875,
"learning_rate": 2.1446796820432167e-05,
"loss": 0.9493,
"step": 3555
},
{
"epoch": 12.150170648464163,
"grad_norm": 0.384765625,
"learning_rate": 2.1201641611081246e-05,
"loss": 0.948,
"step": 3560
},
{
"epoch": 12.167235494880545,
"grad_norm": 0.373046875,
"learning_rate": 2.0957729379545655e-05,
"loss": 0.9584,
"step": 3565
},
{
"epoch": 12.184300341296929,
"grad_norm": 0.380859375,
"learning_rate": 2.0715063973335568e-05,
"loss": 0.9503,
"step": 3570
},
{
"epoch": 12.20136518771331,
"grad_norm": 0.388671875,
"learning_rate": 2.04736492202937e-05,
"loss": 0.9498,
"step": 3575
},
{
"epoch": 12.218430034129693,
"grad_norm": 0.392578125,
"learning_rate": 2.0233488928534673e-05,
"loss": 0.9553,
"step": 3580
},
{
"epoch": 12.235494880546074,
"grad_norm": 0.396484375,
"learning_rate": 1.9994586886385046e-05,
"loss": 0.9438,
"step": 3585
},
{
"epoch": 12.252559726962458,
"grad_norm": 0.369140625,
"learning_rate": 1.9756946862323535e-05,
"loss": 0.9489,
"step": 3590
},
{
"epoch": 12.26962457337884,
"grad_norm": 0.369140625,
"learning_rate": 1.9520572604921672e-05,
"loss": 0.9477,
"step": 3595
},
{
"epoch": 12.286689419795222,
"grad_norm": 0.375,
"learning_rate": 1.9285467842784467e-05,
"loss": 0.9457,
"step": 3600
},
{
"epoch": 12.303754266211604,
"grad_norm": 0.380859375,
"learning_rate": 1.9051636284491757e-05,
"loss": 0.9541,
"step": 3605
},
{
"epoch": 12.320819112627987,
"grad_norm": 0.365234375,
"learning_rate": 1.8819081618539723e-05,
"loss": 0.9393,
"step": 3610
},
{
"epoch": 12.337883959044369,
"grad_norm": 0.375,
"learning_rate": 1.858780751328255e-05,
"loss": 0.949,
"step": 3615
},
{
"epoch": 12.35494880546075,
"grad_norm": 0.384765625,
"learning_rate": 1.8357817616874694e-05,
"loss": 0.9537,
"step": 3620
},
{
"epoch": 12.372013651877133,
"grad_norm": 0.3671875,
"learning_rate": 1.8129115557213262e-05,
"loss": 0.9505,
"step": 3625
},
{
"epoch": 12.389078498293514,
"grad_norm": 0.3671875,
"learning_rate": 1.7901704941880914e-05,
"loss": 0.9447,
"step": 3630
},
{
"epoch": 12.406143344709898,
"grad_norm": 0.3671875,
"learning_rate": 1.7675589358088763e-05,
"loss": 0.9526,
"step": 3635
},
{
"epoch": 12.42320819112628,
"grad_norm": 0.376953125,
"learning_rate": 1.745077237261994e-05,
"loss": 0.9592,
"step": 3640
},
{
"epoch": 12.440273037542662,
"grad_norm": 0.40234375,
"learning_rate": 1.7227257531773223e-05,
"loss": 0.9515,
"step": 3645
},
{
"epoch": 12.457337883959044,
"grad_norm": 0.408203125,
"learning_rate": 1.7005048361307262e-05,
"loss": 0.9504,
"step": 3650
},
{
"epoch": 12.474402730375427,
"grad_norm": 0.388671875,
"learning_rate": 1.6784148366384754e-05,
"loss": 0.9462,
"step": 3655
},
{
"epoch": 12.491467576791809,
"grad_norm": 0.384765625,
"learning_rate": 1.656456103151728e-05,
"loss": 0.9456,
"step": 3660
},
{
"epoch": 12.508532423208191,
"grad_norm": 0.375,
"learning_rate": 1.6346289820510363e-05,
"loss": 0.9475,
"step": 3665
},
{
"epoch": 12.525597269624573,
"grad_norm": 0.384765625,
"learning_rate": 1.612933817640868e-05,
"loss": 0.9478,
"step": 3670
},
{
"epoch": 12.542662116040956,
"grad_norm": 0.3671875,
"learning_rate": 1.5913709521441988e-05,
"loss": 0.9415,
"step": 3675
},
{
"epoch": 12.559726962457338,
"grad_norm": 0.375,
"learning_rate": 1.5699407256970833e-05,
"loss": 0.9452,
"step": 3680
},
{
"epoch": 12.57679180887372,
"grad_norm": 0.375,
"learning_rate": 1.5486434763433222e-05,
"loss": 0.9479,
"step": 3685
},
{
"epoch": 12.593856655290102,
"grad_norm": 0.38671875,
"learning_rate": 1.527479540029104e-05,
"loss": 0.9495,
"step": 3690
},
{
"epoch": 12.610921501706486,
"grad_norm": 0.3828125,
"learning_rate": 1.5064492505977234e-05,
"loss": 0.936,
"step": 3695
},
{
"epoch": 12.627986348122867,
"grad_norm": 0.392578125,
"learning_rate": 1.4855529397843038e-05,
"loss": 0.9476,
"step": 3700
},
{
"epoch": 12.64505119453925,
"grad_norm": 0.380859375,
"learning_rate": 1.4647909372105672e-05,
"loss": 0.9525,
"step": 3705
},
{
"epoch": 12.662116040955631,
"grad_norm": 0.41796875,
"learning_rate": 1.4441635703796408e-05,
"loss": 0.9477,
"step": 3710
},
{
"epoch": 12.679180887372013,
"grad_norm": 0.3984375,
"learning_rate": 1.4236711646708844e-05,
"loss": 0.9505,
"step": 3715
},
{
"epoch": 12.696245733788396,
"grad_norm": 0.384765625,
"learning_rate": 1.4033140433347569e-05,
"loss": 0.9464,
"step": 3720
},
{
"epoch": 12.713310580204778,
"grad_norm": 0.384765625,
"learning_rate": 1.3830925274877216e-05,
"loss": 0.9392,
"step": 3725
},
{
"epoch": 12.73037542662116,
"grad_norm": 0.37890625,
"learning_rate": 1.363006936107183e-05,
"loss": 0.9495,
"step": 3730
},
{
"epoch": 12.747440273037542,
"grad_norm": 0.3828125,
"learning_rate": 1.343057586026446e-05,
"loss": 0.9423,
"step": 3735
},
{
"epoch": 12.764505119453926,
"grad_norm": 0.416015625,
"learning_rate": 1.3232447919297274e-05,
"loss": 0.9448,
"step": 3740
},
{
"epoch": 12.781569965870307,
"grad_norm": 0.404296875,
"learning_rate": 1.3035688663471834e-05,
"loss": 0.9544,
"step": 3745
},
{
"epoch": 12.79863481228669,
"grad_norm": 0.37109375,
"learning_rate": 1.2840301196499893e-05,
"loss": 0.9548,
"step": 3750
},
{
"epoch": 12.815699658703071,
"grad_norm": 0.376953125,
"learning_rate": 1.2646288600454448e-05,
"loss": 0.9492,
"step": 3755
},
{
"epoch": 12.832764505119453,
"grad_norm": 0.373046875,
"learning_rate": 1.2453653935720867e-05,
"loss": 0.9506,
"step": 3760
},
{
"epoch": 12.849829351535837,
"grad_norm": 0.388671875,
"learning_rate": 1.2262400240949023e-05,
"loss": 0.9543,
"step": 3765
},
{
"epoch": 12.866894197952218,
"grad_norm": 0.369140625,
"learning_rate": 1.2072530533005012e-05,
"loss": 0.9418,
"step": 3770
},
{
"epoch": 12.8839590443686,
"grad_norm": 0.369140625,
"learning_rate": 1.1884047806923815e-05,
"loss": 0.9475,
"step": 3775
},
{
"epoch": 12.901023890784982,
"grad_norm": 0.39453125,
"learning_rate": 1.169695503586179e-05,
"loss": 0.9428,
"step": 3780
},
{
"epoch": 12.918088737201366,
"grad_norm": 0.38671875,
"learning_rate": 1.1511255171050084e-05,
"loss": 0.9529,
"step": 3785
},
{
"epoch": 12.935153583617748,
"grad_norm": 0.376953125,
"learning_rate": 1.1326951141747788e-05,
"loss": 0.9455,
"step": 3790
},
{
"epoch": 12.95221843003413,
"grad_norm": 0.376953125,
"learning_rate": 1.1144045855195973e-05,
"loss": 0.9537,
"step": 3795
},
{
"epoch": 12.969283276450511,
"grad_norm": 0.396484375,
"learning_rate": 1.0962542196571634e-05,
"loss": 0.9426,
"step": 3800
},
{
"epoch": 12.986348122866895,
"grad_norm": 0.373046875,
"learning_rate": 1.078244302894229e-05,
"loss": 0.9442,
"step": 3805
},
{
"epoch": 13.0,
"eval_loss": 2.522336959838867,
"eval_runtime": 0.5484,
"eval_samples_per_second": 18.236,
"eval_steps_per_second": 1.824,
"step": 3809
},
{
"epoch": 13.003412969283277,
"grad_norm": 0.376953125,
"learning_rate": 1.0603751193220846e-05,
"loss": 0.956,
"step": 3810
},
{
"epoch": 13.020477815699659,
"grad_norm": 0.392578125,
"learning_rate": 1.0426469508120662e-05,
"loss": 0.9449,
"step": 3815
},
{
"epoch": 13.03754266211604,
"grad_norm": 0.390625,
"learning_rate": 1.0250600770111185e-05,
"loss": 0.9479,
"step": 3820
},
{
"epoch": 13.054607508532424,
"grad_norm": 0.392578125,
"learning_rate": 1.0076147753373789e-05,
"loss": 0.953,
"step": 3825
},
{
"epoch": 13.071672354948806,
"grad_norm": 0.388671875,
"learning_rate": 9.903113209758096e-06,
"loss": 0.9436,
"step": 3830
},
{
"epoch": 13.088737201365188,
"grad_norm": 0.380859375,
"learning_rate": 9.731499868738447e-06,
"loss": 0.9454,
"step": 3835
},
{
"epoch": 13.10580204778157,
"grad_norm": 0.3828125,
"learning_rate": 9.561310437370907e-06,
"loss": 0.9556,
"step": 3840
},
{
"epoch": 13.122866894197951,
"grad_norm": 0.373046875,
"learning_rate": 9.392547600250634e-06,
"loss": 0.949,
"step": 3845
},
{
"epoch": 13.139931740614335,
"grad_norm": 0.380859375,
"learning_rate": 9.225214019469385e-06,
"loss": 0.9382,
"step": 3850
},
{
"epoch": 13.156996587030717,
"grad_norm": 0.40234375,
"learning_rate": 9.059312334573633e-06,
"loss": 0.943,
"step": 3855
},
{
"epoch": 13.174061433447099,
"grad_norm": 0.3828125,
"learning_rate": 8.89484516252287e-06,
"loss": 0.9534,
"step": 3860
},
{
"epoch": 13.19112627986348,
"grad_norm": 0.369140625,
"learning_rate": 8.731815097648433e-06,
"loss": 0.9526,
"step": 3865
},
{
"epoch": 13.208191126279864,
"grad_norm": 0.392578125,
"learning_rate": 8.570224711612385e-06,
"loss": 0.9419,
"step": 3870
},
{
"epoch": 13.225255972696246,
"grad_norm": 0.373046875,
"learning_rate": 8.410076553367208e-06,
"loss": 0.9511,
"step": 3875
},
{
"epoch": 13.242320819112628,
"grad_norm": 0.380859375,
"learning_rate": 8.251373149115293e-06,
"loss": 0.9489,
"step": 3880
},
{
"epoch": 13.25938566552901,
"grad_norm": 0.36328125,
"learning_rate": 8.094117002269363e-06,
"loss": 0.9428,
"step": 3885
},
{
"epoch": 13.276450511945393,
"grad_norm": 0.443359375,
"learning_rate": 7.938310593412879e-06,
"loss": 0.9485,
"step": 3890
},
{
"epoch": 13.293515358361775,
"grad_norm": 0.3671875,
"learning_rate": 7.783956380260837e-06,
"loss": 0.955,
"step": 3895
},
{
"epoch": 13.310580204778157,
"grad_norm": 0.384765625,
"learning_rate": 7.631056797621106e-06,
"loss": 0.9566,
"step": 3900
},
{
"epoch": 13.327645051194539,
"grad_norm": 0.369140625,
"learning_rate": 7.479614257355971e-06,
"loss": 0.9495,
"step": 3905
},
{
"epoch": 13.344709897610922,
"grad_norm": 0.376953125,
"learning_rate": 7.329631148344118e-06,
"loss": 0.9535,
"step": 3910
},
{
"epoch": 13.361774744027304,
"grad_norm": 0.375,
"learning_rate": 7.181109836442912e-06,
"loss": 0.9473,
"step": 3915
},
{
"epoch": 13.378839590443686,
"grad_norm": 0.37890625,
"learning_rate": 7.034052664451118e-06,
"loss": 0.946,
"step": 3920
},
{
"epoch": 13.395904436860068,
"grad_norm": 0.380859375,
"learning_rate": 6.88846195207189e-06,
"loss": 0.9526,
"step": 3925
},
{
"epoch": 13.41296928327645,
"grad_norm": 0.365234375,
"learning_rate": 6.7443399958762584e-06,
"loss": 0.9416,
"step": 3930
},
{
"epoch": 13.430034129692833,
"grad_norm": 0.365234375,
"learning_rate": 6.6016890692668364e-06,
"loss": 0.9529,
"step": 3935
},
{
"epoch": 13.447098976109215,
"grad_norm": 0.376953125,
"learning_rate": 6.460511422441984e-06,
"loss": 0.9427,
"step": 3940
},
{
"epoch": 13.464163822525597,
"grad_norm": 0.37890625,
"learning_rate": 6.320809282360319e-06,
"loss": 0.9516,
"step": 3945
},
{
"epoch": 13.481228668941979,
"grad_norm": 0.380859375,
"learning_rate": 6.1825848527055865e-06,
"loss": 0.9448,
"step": 3950
},
{
"epoch": 13.498293515358363,
"grad_norm": 0.384765625,
"learning_rate": 6.04584031385188e-06,
"loss": 0.9542,
"step": 3955
},
{
"epoch": 13.515358361774744,
"grad_norm": 0.376953125,
"learning_rate": 5.910577822829233e-06,
"loss": 0.9525,
"step": 3960
},
{
"epoch": 13.532423208191126,
"grad_norm": 0.3671875,
"learning_rate": 5.77679951328971e-06,
"loss": 0.9502,
"step": 3965
},
{
"epoch": 13.549488054607508,
"grad_norm": 0.373046875,
"learning_rate": 5.644507495473572e-06,
"loss": 0.9464,
"step": 3970
},
{
"epoch": 13.56655290102389,
"grad_norm": 0.37890625,
"learning_rate": 5.5137038561761115e-06,
"loss": 0.9531,
"step": 3975
},
{
"epoch": 13.583617747440274,
"grad_norm": 0.375,
"learning_rate": 5.3843906587146886e-06,
"loss": 0.9498,
"step": 3980
},
{
"epoch": 13.600682593856655,
"grad_norm": 0.37890625,
"learning_rate": 5.256569942896217e-06,
"loss": 0.945,
"step": 3985
},
{
"epoch": 13.617747440273037,
"grad_norm": 0.365234375,
"learning_rate": 5.130243724984995e-06,
"loss": 0.9468,
"step": 3990
},
{
"epoch": 13.634812286689419,
"grad_norm": 0.369140625,
"learning_rate": 5.005413997670816e-06,
"loss": 0.9517,
"step": 3995
},
{
"epoch": 13.651877133105803,
"grad_norm": 0.365234375,
"learning_rate": 4.8820827300376075e-06,
"loss": 0.9502,
"step": 4000
},
{
"epoch": 13.668941979522184,
"grad_norm": 0.369140625,
"learning_rate": 4.760251867532362e-06,
"loss": 0.9462,
"step": 4005
},
{
"epoch": 13.686006825938566,
"grad_norm": 0.384765625,
"learning_rate": 4.639923331934471e-06,
"loss": 0.9476,
"step": 4010
},
{
"epoch": 13.703071672354948,
"grad_norm": 0.369140625,
"learning_rate": 4.521099021325336e-06,
"loss": 0.9556,
"step": 4015
},
{
"epoch": 13.720136518771332,
"grad_norm": 0.390625,
"learning_rate": 4.403780810058511e-06,
"loss": 0.9438,
"step": 4020
},
{
"epoch": 13.737201365187714,
"grad_norm": 0.470703125,
"learning_rate": 4.287970548730069e-06,
"loss": 0.9495,
"step": 4025
},
{
"epoch": 13.754266211604095,
"grad_norm": 0.36328125,
"learning_rate": 4.173670064149482e-06,
"loss": 0.934,
"step": 4030
},
{
"epoch": 13.771331058020477,
"grad_norm": 0.384765625,
"learning_rate": 4.060881159310725e-06,
"loss": 0.9502,
"step": 4035
},
{
"epoch": 13.788395904436861,
"grad_norm": 0.388671875,
"learning_rate": 3.949605613363882e-06,
"loss": 0.939,
"step": 4040
},
{
"epoch": 13.805460750853243,
"grad_norm": 0.37890625,
"learning_rate": 3.839845181587098e-06,
"loss": 0.9559,
"step": 4045
},
{
"epoch": 13.822525597269625,
"grad_norm": 0.376953125,
"learning_rate": 3.7316015953588467e-06,
"loss": 0.9547,
"step": 4050
},
{
"epoch": 13.839590443686006,
"grad_norm": 0.384765625,
"learning_rate": 3.6248765621306414e-06,
"loss": 0.9463,
"step": 4055
},
{
"epoch": 13.856655290102388,
"grad_norm": 0.376953125,
"learning_rate": 3.519671765400079e-06,
"loss": 0.9454,
"step": 4060
},
{
"epoch": 13.873720136518772,
"grad_norm": 0.373046875,
"learning_rate": 3.4159888646843495e-06,
"loss": 0.9485,
"step": 4065
},
{
"epoch": 13.890784982935154,
"grad_norm": 0.375,
"learning_rate": 3.313829495493992e-06,
"loss": 0.9455,
"step": 4070
},
{
"epoch": 13.907849829351536,
"grad_norm": 0.37890625,
"learning_rate": 3.2131952693070898e-06,
"loss": 0.9409,
"step": 4075
},
{
"epoch": 13.924914675767917,
"grad_norm": 0.396484375,
"learning_rate": 3.1140877735439387e-06,
"loss": 0.9468,
"step": 4080
},
{
"epoch": 13.941979522184301,
"grad_norm": 0.375,
"learning_rate": 3.0165085715418763e-06,
"loss": 0.9434,
"step": 4085
},
{
"epoch": 13.959044368600683,
"grad_norm": 0.3671875,
"learning_rate": 2.9204592025307566e-06,
"loss": 0.9455,
"step": 4090
},
{
"epoch": 13.976109215017065,
"grad_norm": 0.369140625,
"learning_rate": 2.8259411816085492e-06,
"loss": 0.9437,
"step": 4095
},
{
"epoch": 13.993174061433447,
"grad_norm": 0.478515625,
"learning_rate": 2.732955999717546e-06,
"loss": 0.9469,
"step": 4100
},
{
"epoch": 14.0,
"eval_loss": 2.5227127075195312,
"eval_runtime": 0.542,
"eval_samples_per_second": 18.45,
"eval_steps_per_second": 1.845,
"step": 4102
},
{
"epoch": 14.01023890784983,
"grad_norm": 0.376953125,
"learning_rate": 2.6415051236207355e-06,
"loss": 0.9508,
"step": 4105
},
{
"epoch": 14.027303754266212,
"grad_norm": 0.375,
"learning_rate": 2.551589995878789e-06,
"loss": 0.9459,
"step": 4110
},
{
"epoch": 14.044368600682594,
"grad_norm": 0.380859375,
"learning_rate": 2.4632120348272003e-06,
"loss": 0.9465,
"step": 4115
},
{
"epoch": 14.061433447098976,
"grad_norm": 0.37890625,
"learning_rate": 2.376372634553936e-06,
"loss": 0.9475,
"step": 4120
},
{
"epoch": 14.078498293515358,
"grad_norm": 0.376953125,
"learning_rate": 2.291073164877511e-06,
"loss": 0.9435,
"step": 4125
},
{
"epoch": 14.095563139931741,
"grad_norm": 0.37890625,
"learning_rate": 2.207314971325292e-06,
"loss": 0.9546,
"step": 4130
},
{
"epoch": 14.112627986348123,
"grad_norm": 0.400390625,
"learning_rate": 2.125099375112316e-06,
"loss": 0.9496,
"step": 4135
},
{
"epoch": 14.129692832764505,
"grad_norm": 0.3671875,
"learning_rate": 2.0444276731204415e-06,
"loss": 0.9592,
"step": 4140
},
{
"epoch": 14.146757679180887,
"grad_norm": 0.37890625,
"learning_rate": 1.9653011378779283e-06,
"loss": 0.9446,
"step": 4145
},
{
"epoch": 14.16382252559727,
"grad_norm": 0.5625,
"learning_rate": 1.88772101753929e-06,
"loss": 0.9374,
"step": 4150
},
{
"epoch": 14.180887372013652,
"grad_norm": 0.37890625,
"learning_rate": 1.8116885358656744e-06,
"loss": 0.9543,
"step": 4155
},
{
"epoch": 14.197952218430034,
"grad_norm": 0.37109375,
"learning_rate": 1.7372048922054906e-06,
"loss": 0.9488,
"step": 4160
},
{
"epoch": 14.215017064846416,
"grad_norm": 0.373046875,
"learning_rate": 1.6642712614755695e-06,
"loss": 0.9466,
"step": 4165
},
{
"epoch": 14.2320819112628,
"grad_norm": 0.396484375,
"learning_rate": 1.5928887941426107e-06,
"loss": 0.9482,
"step": 4170
},
{
"epoch": 14.249146757679181,
"grad_norm": 0.373046875,
"learning_rate": 1.523058616204942e-06,
"loss": 0.9449,
"step": 4175
},
{
"epoch": 14.266211604095563,
"grad_norm": 0.3984375,
"learning_rate": 1.4547818291749115e-06,
"loss": 0.9562,
"step": 4180
},
{
"epoch": 14.283276450511945,
"grad_norm": 0.388671875,
"learning_rate": 1.3880595100613792e-06,
"loss": 0.9445,
"step": 4185
},
{
"epoch": 14.300341296928327,
"grad_norm": 0.376953125,
"learning_rate": 1.3228927113528189e-06,
"loss": 0.9457,
"step": 4190
},
{
"epoch": 14.31740614334471,
"grad_norm": 0.388671875,
"learning_rate": 1.2592824610006215e-06,
"loss": 0.9488,
"step": 4195
},
{
"epoch": 14.334470989761092,
"grad_norm": 0.38671875,
"learning_rate": 1.1972297624030072e-06,
"loss": 0.9437,
"step": 4200
},
{
"epoch": 14.351535836177474,
"grad_norm": 0.3671875,
"learning_rate": 1.1367355943890823e-06,
"loss": 0.9459,
"step": 4205
},
{
"epoch": 14.368600682593856,
"grad_norm": 0.396484375,
"learning_rate": 1.0778009112034748e-06,
"loss": 0.9477,
"step": 4210
},
{
"epoch": 14.38566552901024,
"grad_norm": 0.375,
"learning_rate": 1.0204266424912123e-06,
"loss": 0.95,
"step": 4215
},
{
"epoch": 14.402730375426621,
"grad_norm": 0.40625,
"learning_rate": 9.64613693283123e-07,
"loss": 0.9477,
"step": 4220
},
{
"epoch": 14.419795221843003,
"grad_norm": 0.375,
"learning_rate": 9.103629439815354e-07,
"loss": 0.9461,
"step": 4225
},
{
"epoch": 14.436860068259385,
"grad_norm": 0.3828125,
"learning_rate": 8.57675250346368e-07,
"loss": 0.9585,
"step": 4230
},
{
"epoch": 14.453924914675769,
"grad_norm": 0.443359375,
"learning_rate": 8.065514434816845e-07,
"loss": 0.9434,
"step": 4235
},
{
"epoch": 14.47098976109215,
"grad_norm": 0.396484375,
"learning_rate": 7.569923298225146e-07,
"loss": 0.941,
"step": 4240
},
{
"epoch": 14.488054607508532,
"grad_norm": 0.375,
"learning_rate": 7.08998691122198e-07,
"loss": 0.9527,
"step": 4245
},
{
"epoch": 14.505119453924914,
"grad_norm": 0.380859375,
"learning_rate": 6.625712844400056e-07,
"loss": 0.9484,
"step": 4250
},
{
"epoch": 14.522184300341298,
"grad_norm": 0.390625,
"learning_rate": 6.177108421292266e-07,
"loss": 0.9453,
"step": 4255
},
{
"epoch": 14.53924914675768,
"grad_norm": 0.404296875,
"learning_rate": 5.744180718255776e-07,
"loss": 0.9464,
"step": 4260
},
{
"epoch": 14.556313993174061,
"grad_norm": 0.375,
"learning_rate": 5.326936564361118e-07,
"loss": 0.943,
"step": 4265
},
{
"epoch": 14.573378839590443,
"grad_norm": 0.369140625,
"learning_rate": 4.92538254128383e-07,
"loss": 0.9422,
"step": 4270
},
{
"epoch": 14.590443686006825,
"grad_norm": 0.390625,
"learning_rate": 4.5395249832007604e-07,
"loss": 0.9591,
"step": 4275
},
{
"epoch": 14.607508532423209,
"grad_norm": 0.396484375,
"learning_rate": 4.1693699766902626e-07,
"loss": 0.9475,
"step": 4280
},
{
"epoch": 14.62457337883959,
"grad_norm": 0.369140625,
"learning_rate": 3.814923360636158e-07,
"loss": 0.9391,
"step": 4285
},
{
"epoch": 14.641638225255972,
"grad_norm": 0.36328125,
"learning_rate": 3.4761907261356976e-07,
"loss": 0.9574,
"step": 4290
},
{
"epoch": 14.658703071672354,
"grad_norm": 0.388671875,
"learning_rate": 3.1531774164111903e-07,
"loss": 0.9495,
"step": 4295
},
{
"epoch": 14.675767918088738,
"grad_norm": 0.373046875,
"learning_rate": 2.8458885267260705e-07,
"loss": 0.9537,
"step": 4300
},
{
"epoch": 14.69283276450512,
"grad_norm": 0.38671875,
"learning_rate": 2.554328904303738e-07,
"loss": 0.9435,
"step": 4305
},
{
"epoch": 14.709897610921502,
"grad_norm": 0.39453125,
"learning_rate": 2.2785031482521758e-07,
"loss": 0.9474,
"step": 4310
},
{
"epoch": 14.726962457337883,
"grad_norm": 0.376953125,
"learning_rate": 2.0184156094905648e-07,
"loss": 0.947,
"step": 4315
},
{
"epoch": 14.744027303754265,
"grad_norm": 0.3671875,
"learning_rate": 1.7740703906810042e-07,
"loss": 0.9431,
"step": 4320
},
{
"epoch": 14.761092150170649,
"grad_norm": 0.384765625,
"learning_rate": 1.545471346164007e-07,
"loss": 0.9431,
"step": 4325
},
{
"epoch": 14.77815699658703,
"grad_norm": 0.37890625,
"learning_rate": 1.3326220818968838e-07,
"loss": 0.9455,
"step": 4330
},
{
"epoch": 14.795221843003413,
"grad_norm": 0.375,
"learning_rate": 1.1355259553978981e-07,
"loss": 0.9512,
"step": 4335
},
{
"epoch": 14.812286689419794,
"grad_norm": 0.390625,
"learning_rate": 9.541860756925314e-08,
"loss": 0.9439,
"step": 4340
},
{
"epoch": 14.829351535836178,
"grad_norm": 0.37109375,
"learning_rate": 7.886053032649665e-08,
"loss": 0.9548,
"step": 4345
},
{
"epoch": 14.84641638225256,
"grad_norm": 0.4921875,
"learning_rate": 6.387862500125685e-08,
"loss": 0.9437,
"step": 4350
},
{
"epoch": 14.863481228668942,
"grad_norm": 0.380859375,
"learning_rate": 5.047312792046954e-08,
"loss": 0.9512,
"step": 4355
},
{
"epoch": 14.880546075085324,
"grad_norm": 0.39453125,
"learning_rate": 3.8644250544594975e-08,
"loss": 0.9478,
"step": 4360
},
{
"epoch": 14.897610921501707,
"grad_norm": 0.380859375,
"learning_rate": 2.839217946422057e-08,
"loss": 0.9362,
"step": 4365
},
{
"epoch": 14.914675767918089,
"grad_norm": 0.380859375,
"learning_rate": 1.971707639712994e-08,
"loss": 0.9507,
"step": 4370
},
{
"epoch": 14.93174061433447,
"grad_norm": 0.37109375,
"learning_rate": 1.2619078185793776e-08,
"loss": 0.948,
"step": 4375
},
{
"epoch": 14.948805460750853,
"grad_norm": 0.400390625,
"learning_rate": 7.098296795138293e-09,
"loss": 0.9524,
"step": 4380
},
{
"epoch": 14.965870307167236,
"grad_norm": 0.36328125,
"learning_rate": 3.154819310868806e-09,
"loss": 0.9497,
"step": 4385
},
{
"epoch": 14.982935153583618,
"grad_norm": 0.400390625,
"learning_rate": 7.887079380153317e-10,
"loss": 0.9536,
"step": 4390
},
{
"epoch": 15.0,
"grad_norm": 0.373046875,
"learning_rate": 0.0,
"loss": 0.9444,
"step": 4395
},
{
"epoch": 15.0,
"eval_loss": 2.523277521133423,
"eval_runtime": 0.5592,
"eval_samples_per_second": 17.883,
"eval_steps_per_second": 1.788,
"step": 4395
},
{
"epoch": 15.0,
"step": 4395,
"total_flos": 2.581505823377195e+18,
"train_loss": 1.0488379673203783,
"train_runtime": 23446.7186,
"train_samples_per_second": 8.983,
"train_steps_per_second": 0.187
}
],
"logging_steps": 5,
"max_steps": 4395,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.581505823377195e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}