{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2930,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034129692832764505,
"grad_norm": 3.671875,
"learning_rate": 6.825938566552902e-07,
"loss": 3.0499,
"step": 1
},
{
"epoch": 0.017064846416382253,
"grad_norm": 4.96875,
"learning_rate": 3.4129692832764506e-06,
"loss": 3.0421,
"step": 5
},
{
"epoch": 0.034129692832764506,
"grad_norm": 4.34375,
"learning_rate": 6.825938566552901e-06,
"loss": 3.0559,
"step": 10
},
{
"epoch": 0.051194539249146756,
"grad_norm": 3.09375,
"learning_rate": 1.0238907849829352e-05,
"loss": 2.9957,
"step": 15
},
{
"epoch": 0.06825938566552901,
"grad_norm": 2.71875,
"learning_rate": 1.3651877133105803e-05,
"loss": 2.8653,
"step": 20
},
{
"epoch": 0.08532423208191127,
"grad_norm": 3.3125,
"learning_rate": 1.7064846416382256e-05,
"loss": 2.7049,
"step": 25
},
{
"epoch": 0.10238907849829351,
"grad_norm": 18.5,
"learning_rate": 2.0477815699658705e-05,
"loss": 2.5238,
"step": 30
},
{
"epoch": 0.11945392491467577,
"grad_norm": 1.8828125,
"learning_rate": 2.3890784982935157e-05,
"loss": 2.3984,
"step": 35
},
{
"epoch": 0.13651877133105803,
"grad_norm": 1.3671875,
"learning_rate": 2.7303754266211605e-05,
"loss": 2.3001,
"step": 40
},
{
"epoch": 0.15358361774744028,
"grad_norm": 2.0625,
"learning_rate": 3.071672354948806e-05,
"loss": 2.1645,
"step": 45
},
{
"epoch": 0.17064846416382254,
"grad_norm": 1.25,
"learning_rate": 3.412969283276451e-05,
"loss": 2.0453,
"step": 50
},
{
"epoch": 0.18771331058020477,
"grad_norm": 0.8046875,
"learning_rate": 3.754266211604096e-05,
"loss": 1.8952,
"step": 55
},
{
"epoch": 0.20477815699658702,
"grad_norm": 1.609375,
"learning_rate": 4.095563139931741e-05,
"loss": 1.7862,
"step": 60
},
{
"epoch": 0.22184300341296928,
"grad_norm": 0.63671875,
"learning_rate": 4.436860068259386e-05,
"loss": 1.6922,
"step": 65
},
{
"epoch": 0.23890784982935154,
"grad_norm": 0.400390625,
"learning_rate": 4.778156996587031e-05,
"loss": 1.6006,
"step": 70
},
{
"epoch": 0.25597269624573377,
"grad_norm": 0.375,
"learning_rate": 5.119453924914676e-05,
"loss": 1.5335,
"step": 75
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.43359375,
"learning_rate": 5.460750853242321e-05,
"loss": 1.4832,
"step": 80
},
{
"epoch": 0.2901023890784983,
"grad_norm": 0.67578125,
"learning_rate": 5.802047781569966e-05,
"loss": 1.4393,
"step": 85
},
{
"epoch": 0.30716723549488056,
"grad_norm": 0.337890625,
"learning_rate": 6.143344709897612e-05,
"loss": 1.3951,
"step": 90
},
{
"epoch": 0.3242320819112628,
"grad_norm": 0.275390625,
"learning_rate": 6.484641638225257e-05,
"loss": 1.3594,
"step": 95
},
{
"epoch": 0.3412969283276451,
"grad_norm": 0.267578125,
"learning_rate": 6.825938566552902e-05,
"loss": 1.3456,
"step": 100
},
{
"epoch": 0.3583617747440273,
"grad_norm": 0.3671875,
"learning_rate": 7.167235494880547e-05,
"loss": 1.3174,
"step": 105
},
{
"epoch": 0.37542662116040953,
"grad_norm": 0.328125,
"learning_rate": 7.508532423208191e-05,
"loss": 1.3087,
"step": 110
},
{
"epoch": 0.3924914675767918,
"grad_norm": 0.5703125,
"learning_rate": 7.849829351535837e-05,
"loss": 1.3001,
"step": 115
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.30078125,
"learning_rate": 8.191126279863482e-05,
"loss": 1.2871,
"step": 120
},
{
"epoch": 0.42662116040955633,
"grad_norm": 0.65625,
"learning_rate": 8.532423208191128e-05,
"loss": 1.2567,
"step": 125
},
{
"epoch": 0.44368600682593856,
"grad_norm": 0.458984375,
"learning_rate": 8.873720136518772e-05,
"loss": 1.2582,
"step": 130
},
{
"epoch": 0.46075085324232085,
"grad_norm": 0.380859375,
"learning_rate": 9.215017064846417e-05,
"loss": 1.2471,
"step": 135
},
{
"epoch": 0.4778156996587031,
"grad_norm": 0.298828125,
"learning_rate": 9.556313993174063e-05,
"loss": 1.2357,
"step": 140
},
{
"epoch": 0.4948805460750853,
"grad_norm": 0.65625,
"learning_rate": 9.897610921501707e-05,
"loss": 1.2303,
"step": 145
},
{
"epoch": 0.5119453924914675,
"grad_norm": 0.423828125,
"learning_rate": 0.00010238907849829352,
"loss": 1.226,
"step": 150
},
{
"epoch": 0.5290102389078498,
"grad_norm": 0.5390625,
"learning_rate": 0.00010580204778156998,
"loss": 1.2251,
"step": 155
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.416015625,
"learning_rate": 0.00010921501706484642,
"loss": 1.2135,
"step": 160
},
{
"epoch": 0.5631399317406144,
"grad_norm": 0.3828125,
"learning_rate": 0.00011262798634812288,
"loss": 1.2069,
"step": 165
},
{
"epoch": 0.5802047781569966,
"grad_norm": 0.6796875,
"learning_rate": 0.00011604095563139932,
"loss": 1.2005,
"step": 170
},
{
"epoch": 0.5972696245733788,
"grad_norm": 0.41015625,
"learning_rate": 0.00011945392491467577,
"loss": 1.1944,
"step": 175
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.50390625,
"learning_rate": 0.00012286689419795224,
"loss": 1.1775,
"step": 180
},
{
"epoch": 0.6313993174061433,
"grad_norm": 0.38671875,
"learning_rate": 0.00012627986348122866,
"loss": 1.1844,
"step": 185
},
{
"epoch": 0.6484641638225256,
"grad_norm": 0.46484375,
"learning_rate": 0.00012969283276450513,
"loss": 1.1711,
"step": 190
},
{
"epoch": 0.6655290102389079,
"grad_norm": 0.72265625,
"learning_rate": 0.00013310580204778158,
"loss": 1.1824,
"step": 195
},
{
"epoch": 0.6825938566552902,
"grad_norm": 1.0859375,
"learning_rate": 0.00013651877133105805,
"loss": 1.169,
"step": 200
},
{
"epoch": 0.6996587030716723,
"grad_norm": 0.58203125,
"learning_rate": 0.00013993174061433447,
"loss": 1.1691,
"step": 205
},
{
"epoch": 0.7167235494880546,
"grad_norm": 0.42578125,
"learning_rate": 0.00014334470989761094,
"loss": 1.1573,
"step": 210
},
{
"epoch": 0.7337883959044369,
"grad_norm": 0.6328125,
"learning_rate": 0.00014675767918088738,
"loss": 1.1637,
"step": 215
},
{
"epoch": 0.7508532423208191,
"grad_norm": 0.68359375,
"learning_rate": 0.00015017064846416383,
"loss": 1.1605,
"step": 220
},
{
"epoch": 0.7679180887372014,
"grad_norm": 0.4140625,
"learning_rate": 0.00015358361774744027,
"loss": 1.1539,
"step": 225
},
{
"epoch": 0.7849829351535836,
"grad_norm": 0.37109375,
"learning_rate": 0.00015699658703071675,
"loss": 1.1458,
"step": 230
},
{
"epoch": 0.8020477815699659,
"grad_norm": 0.4140625,
"learning_rate": 0.0001604095563139932,
"loss": 1.1524,
"step": 235
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.51171875,
"learning_rate": 0.00016382252559726964,
"loss": 1.1505,
"step": 240
},
{
"epoch": 0.8361774744027304,
"grad_norm": 0.81640625,
"learning_rate": 0.00016723549488054608,
"loss": 1.1493,
"step": 245
},
{
"epoch": 0.8532423208191127,
"grad_norm": 0.59765625,
"learning_rate": 0.00017064846416382255,
"loss": 1.1391,
"step": 250
},
{
"epoch": 0.8703071672354948,
"grad_norm": 0.404296875,
"learning_rate": 0.00017406143344709897,
"loss": 1.1213,
"step": 255
},
{
"epoch": 0.8873720136518771,
"grad_norm": 0.443359375,
"learning_rate": 0.00017747440273037544,
"loss": 1.1311,
"step": 260
},
{
"epoch": 0.9044368600682594,
"grad_norm": 0.8046875,
"learning_rate": 0.0001808873720136519,
"loss": 1.1224,
"step": 265
},
{
"epoch": 0.9215017064846417,
"grad_norm": 0.36328125,
"learning_rate": 0.00018430034129692833,
"loss": 1.1369,
"step": 270
},
{
"epoch": 0.9385665529010239,
"grad_norm": 0.52734375,
"learning_rate": 0.00018771331058020478,
"loss": 1.1203,
"step": 275
},
{
"epoch": 0.9556313993174061,
"grad_norm": 1.171875,
"learning_rate": 0.00019112627986348125,
"loss": 1.1281,
"step": 280
},
{
"epoch": 0.9726962457337884,
"grad_norm": 0.8671875,
"learning_rate": 0.0001945392491467577,
"loss": 1.1231,
"step": 285
},
{
"epoch": 0.9897610921501706,
"grad_norm": 0.466796875,
"learning_rate": 0.00019795221843003414,
"loss": 1.1249,
"step": 290
},
{
"epoch": 1.0,
"eval_loss": 2.4640614986419678,
"eval_runtime": 0.5515,
"eval_samples_per_second": 18.133,
"eval_steps_per_second": 1.813,
"step": 293
},
{
"epoch": 1.006825938566553,
"grad_norm": 0.65625,
"learning_rate": 0.00019999971613668125,
"loss": 1.1028,
"step": 295
},
{
"epoch": 1.023890784982935,
"grad_norm": 0.87890625,
"learning_rate": 0.00019999652269285281,
"loss": 1.0985,
"step": 300
},
{
"epoch": 1.0409556313993173,
"grad_norm": 0.333984375,
"learning_rate": 0.00019998978108973762,
"loss": 1.0885,
"step": 305
},
{
"epoch": 1.0580204778156996,
"grad_norm": 0.34375,
"learning_rate": 0.00019997949156654686,
"loss": 1.1064,
"step": 310
},
{
"epoch": 1.075085324232082,
"grad_norm": 0.3828125,
"learning_rate": 0.00019996565448838176,
"loss": 1.0991,
"step": 315
},
{
"epoch": 1.0921501706484642,
"grad_norm": 1.03125,
"learning_rate": 0.0001999482703462211,
"loss": 1.0947,
"step": 320
},
{
"epoch": 1.1092150170648465,
"grad_norm": 0.478515625,
"learning_rate": 0.00019992733975690333,
"loss": 1.097,
"step": 325
},
{
"epoch": 1.1262798634812285,
"grad_norm": 0.451171875,
"learning_rate": 0.00019990286346310493,
"loss": 1.0835,
"step": 330
},
{
"epoch": 1.1433447098976108,
"grad_norm": 1.40625,
"learning_rate": 0.00019987484233331394,
"loss": 1.1033,
"step": 335
},
{
"epoch": 1.1604095563139931,
"grad_norm": 1.8515625,
"learning_rate": 0.00019984327736179936,
"loss": 1.1011,
"step": 340
},
{
"epoch": 1.1774744027303754,
"grad_norm": 1.6328125,
"learning_rate": 0.0001998081696685755,
"loss": 1.0986,
"step": 345
},
{
"epoch": 1.1945392491467577,
"grad_norm": 0.72265625,
"learning_rate": 0.0001997695204993626,
"loss": 1.0859,
"step": 350
},
{
"epoch": 1.21160409556314,
"grad_norm": 0.6875,
"learning_rate": 0.00019972733122554246,
"loss": 1.0867,
"step": 355
},
{
"epoch": 1.2286689419795223,
"grad_norm": 0.62109375,
"learning_rate": 0.00019968160334410975,
"loss": 1.0949,
"step": 360
},
{
"epoch": 1.2457337883959045,
"grad_norm": 0.376953125,
"learning_rate": 0.00019963233847761894,
"loss": 1.0683,
"step": 365
},
{
"epoch": 1.2627986348122868,
"grad_norm": 0.5,
"learning_rate": 0.00019957953837412677,
"loss": 1.0829,
"step": 370
},
{
"epoch": 1.2798634812286689,
"grad_norm": 0.306640625,
"learning_rate": 0.0001995232049071302,
"loss": 1.0878,
"step": 375
},
{
"epoch": 1.2969283276450512,
"grad_norm": 0.458984375,
"learning_rate": 0.00019946334007549978,
"loss": 1.0697,
"step": 380
},
{
"epoch": 1.3139931740614335,
"grad_norm": 0.54296875,
"learning_rate": 0.00019939994600340905,
"loss": 1.0765,
"step": 385
},
{
"epoch": 1.3310580204778157,
"grad_norm": 0.310546875,
"learning_rate": 0.00019933302494025884,
"loss": 1.0772,
"step": 390
},
{
"epoch": 1.348122866894198,
"grad_norm": 0.3515625,
"learning_rate": 0.00019926257926059768,
"loss": 1.0739,
"step": 395
},
{
"epoch": 1.36518771331058,
"grad_norm": 0.271484375,
"learning_rate": 0.00019918861146403733,
"loss": 1.0816,
"step": 400
},
{
"epoch": 1.3822525597269624,
"grad_norm": 0.30859375,
"learning_rate": 0.0001991111241751644,
"loss": 1.0711,
"step": 405
},
{
"epoch": 1.3993174061433447,
"grad_norm": 0.27734375,
"learning_rate": 0.00019903012014344686,
"loss": 1.0616,
"step": 410
},
{
"epoch": 1.416382252559727,
"grad_norm": 0.30859375,
"learning_rate": 0.00019894560224313678,
"loss": 1.0624,
"step": 415
},
{
"epoch": 1.4334470989761092,
"grad_norm": 0.341796875,
"learning_rate": 0.00019885757347316813,
"loss": 1.0572,
"step": 420
},
{
"epoch": 1.4505119453924915,
"grad_norm": 0.400390625,
"learning_rate": 0.0001987660369570505,
"loss": 1.0701,
"step": 425
},
{
"epoch": 1.4675767918088738,
"grad_norm": 0.578125,
"learning_rate": 0.00019867099594275827,
"loss": 1.0669,
"step": 430
},
{
"epoch": 1.484641638225256,
"grad_norm": 0.412109375,
"learning_rate": 0.00019857245380261525,
"loss": 1.0724,
"step": 435
},
{
"epoch": 1.5017064846416384,
"grad_norm": 0.365234375,
"learning_rate": 0.0001984704140331751,
"loss": 1.0728,
"step": 440
},
{
"epoch": 1.5187713310580204,
"grad_norm": 0.296875,
"learning_rate": 0.00019836488025509736,
"loss": 1.0712,
"step": 445
},
{
"epoch": 1.5358361774744027,
"grad_norm": 0.3125,
"learning_rate": 0.00019825585621301872,
"loss": 1.0569,
"step": 450
},
{
"epoch": 1.552901023890785,
"grad_norm": 0.486328125,
"learning_rate": 0.00019814334577542038,
"loss": 1.0638,
"step": 455
},
{
"epoch": 1.5699658703071673,
"grad_norm": 0.30078125,
"learning_rate": 0.0001980273529344907,
"loss": 1.0638,
"step": 460
},
{
"epoch": 1.5870307167235493,
"grad_norm": 0.310546875,
"learning_rate": 0.00019790788180598358,
"loss": 1.0556,
"step": 465
},
{
"epoch": 1.6040955631399316,
"grad_norm": 0.265625,
"learning_rate": 0.00019778493662907237,
"loss": 1.056,
"step": 470
},
{
"epoch": 1.621160409556314,
"grad_norm": 0.5390625,
"learning_rate": 0.00019765852176619944,
"loss": 1.0512,
"step": 475
},
{
"epoch": 1.6382252559726962,
"grad_norm": 0.404296875,
"learning_rate": 0.00019752864170292152,
"loss": 1.0585,
"step": 480
},
{
"epoch": 1.6552901023890785,
"grad_norm": 0.62890625,
"learning_rate": 0.00019739530104775032,
"loss": 1.0628,
"step": 485
},
{
"epoch": 1.6723549488054608,
"grad_norm": 0.3125,
"learning_rate": 0.00019725850453198925,
"loss": 1.0612,
"step": 490
},
{
"epoch": 1.689419795221843,
"grad_norm": 0.5234375,
"learning_rate": 0.00019711825700956536,
"loss": 1.0549,
"step": 495
},
{
"epoch": 1.7064846416382253,
"grad_norm": 0.42578125,
"learning_rate": 0.0001969745634568572,
"loss": 1.0506,
"step": 500
},
{
"epoch": 1.7235494880546076,
"grad_norm": 0.404296875,
"learning_rate": 0.00019682742897251818,
"loss": 1.0418,
"step": 505
},
{
"epoch": 1.74061433447099,
"grad_norm": 0.400390625,
"learning_rate": 0.0001966768587772957,
"loss": 1.0508,
"step": 510
},
{
"epoch": 1.757679180887372,
"grad_norm": 0.400390625,
"learning_rate": 0.00019652285821384596,
"loss": 1.0519,
"step": 515
},
{
"epoch": 1.7747440273037542,
"grad_norm": 0.404296875,
"learning_rate": 0.0001963654327465442,
"loss": 1.0554,
"step": 520
},
{
"epoch": 1.7918088737201365,
"grad_norm": 0.404296875,
"learning_rate": 0.00019620458796129104,
"loss": 1.0421,
"step": 525
},
{
"epoch": 1.8088737201365188,
"grad_norm": 0.375,
"learning_rate": 0.0001960403295653141,
"loss": 1.0421,
"step": 530
},
{
"epoch": 1.8259385665529009,
"grad_norm": 0.322265625,
"learning_rate": 0.00019587266338696565,
"loss": 1.046,
"step": 535
},
{
"epoch": 1.8430034129692832,
"grad_norm": 0.279296875,
"learning_rate": 0.00019570159537551552,
"loss": 1.0528,
"step": 540
},
{
"epoch": 1.8600682593856654,
"grad_norm": 0.31640625,
"learning_rate": 0.00019552713160094038,
"loss": 1.0481,
"step": 545
},
{
"epoch": 1.8771331058020477,
"grad_norm": 0.314453125,
"learning_rate": 0.00019534927825370815,
"loss": 1.0477,
"step": 550
},
{
"epoch": 1.89419795221843,
"grad_norm": 0.30078125,
"learning_rate": 0.00019516804164455826,
"loss": 1.0513,
"step": 555
},
{
"epoch": 1.9112627986348123,
"grad_norm": 0.455078125,
"learning_rate": 0.00019498342820427794,
"loss": 1.0505,
"step": 560
},
{
"epoch": 1.9283276450511946,
"grad_norm": 0.36328125,
"learning_rate": 0.00019479544448347392,
"loss": 1.0538,
"step": 565
},
{
"epoch": 1.9453924914675769,
"grad_norm": 0.283203125,
"learning_rate": 0.00019460409715233996,
"loss": 1.0332,
"step": 570
},
{
"epoch": 1.9624573378839592,
"grad_norm": 0.41796875,
"learning_rate": 0.00019440939300042028,
"loss": 1.047,
"step": 575
},
{
"epoch": 1.9795221843003414,
"grad_norm": 0.365234375,
"learning_rate": 0.00019421133893636854,
"loss": 1.0321,
"step": 580
},
{
"epoch": 1.9965870307167235,
"grad_norm": 0.298828125,
"learning_rate": 0.00019400994198770274,
"loss": 1.0415,
"step": 585
},
{
"epoch": 2.0,
"eval_loss": 2.451392650604248,
"eval_runtime": 0.5484,
"eval_samples_per_second": 18.236,
"eval_steps_per_second": 1.824,
"step": 586
},
{
"epoch": 2.013651877133106,
"grad_norm": 0.287109375,
"learning_rate": 0.00019380520930055602,
"loss": 1.0194,
"step": 590
},
{
"epoch": 2.030716723549488,
"grad_norm": 0.412109375,
"learning_rate": 0.0001935971481394227,
"loss": 0.9985,
"step": 595
},
{
"epoch": 2.04778156996587,
"grad_norm": 0.28515625,
"learning_rate": 0.00019338576588690104,
"loss": 1.0026,
"step": 600
},
{
"epoch": 2.0648464163822524,
"grad_norm": 0.34765625,
"learning_rate": 0.00019317107004343078,
"loss": 1.0018,
"step": 605
},
{
"epoch": 2.0819112627986347,
"grad_norm": 0.310546875,
"learning_rate": 0.0001929530682270274,
"loss": 1.0096,
"step": 610
},
{
"epoch": 2.098976109215017,
"grad_norm": 0.326171875,
"learning_rate": 0.0001927317681730115,
"loss": 1.0047,
"step": 615
},
{
"epoch": 2.1160409556313993,
"grad_norm": 0.4609375,
"learning_rate": 0.00019250717773373462,
"loss": 0.9998,
"step": 620
},
{
"epoch": 2.1331058020477816,
"grad_norm": 0.333984375,
"learning_rate": 0.00019227930487830035,
"loss": 1.0121,
"step": 625
},
{
"epoch": 2.150170648464164,
"grad_norm": 0.326171875,
"learning_rate": 0.00019204815769228176,
"loss": 1.0064,
"step": 630
},
{
"epoch": 2.167235494880546,
"grad_norm": 0.294921875,
"learning_rate": 0.00019181374437743438,
"loss": 0.9968,
"step": 635
},
{
"epoch": 2.1843003412969284,
"grad_norm": 0.2890625,
"learning_rate": 0.00019157607325140524,
"loss": 1.0046,
"step": 640
},
{
"epoch": 2.2013651877133107,
"grad_norm": 0.43359375,
"learning_rate": 0.00019133515274743771,
"loss": 1.0161,
"step": 645
},
{
"epoch": 2.218430034129693,
"grad_norm": 0.3203125,
"learning_rate": 0.00019109099141407233,
"loss": 1.004,
"step": 650
},
{
"epoch": 2.2354948805460753,
"grad_norm": 0.41015625,
"learning_rate": 0.0001908435979148434,
"loss": 1.0071,
"step": 655
},
{
"epoch": 2.252559726962457,
"grad_norm": 0.318359375,
"learning_rate": 0.00019059298102797146,
"loss": 1.0117,
"step": 660
},
{
"epoch": 2.26962457337884,
"grad_norm": 0.3671875,
"learning_rate": 0.0001903391496460522,
"loss": 0.9996,
"step": 665
},
{
"epoch": 2.2866894197952217,
"grad_norm": 0.31640625,
"learning_rate": 0.0001900821127757405,
"loss": 1.0038,
"step": 670
},
{
"epoch": 2.303754266211604,
"grad_norm": 0.5078125,
"learning_rate": 0.0001898218795374311,
"loss": 1.0105,
"step": 675
},
{
"epoch": 2.3208191126279862,
"grad_norm": 0.451171875,
"learning_rate": 0.0001895584591649349,
"loss": 0.9929,
"step": 680
},
{
"epoch": 2.3378839590443685,
"grad_norm": 0.333984375,
"learning_rate": 0.00018929186100515136,
"loss": 1.0018,
"step": 685
},
{
"epoch": 2.354948805460751,
"grad_norm": 0.498046875,
"learning_rate": 0.00018902209451773674,
"loss": 0.9955,
"step": 690
},
{
"epoch": 2.372013651877133,
"grad_norm": 0.4375,
"learning_rate": 0.0001887491692747686,
"loss": 0.9953,
"step": 695
},
{
"epoch": 2.3890784982935154,
"grad_norm": 0.37890625,
"learning_rate": 0.000188473094960406,
"loss": 0.9833,
"step": 700
},
{
"epoch": 2.4061433447098977,
"grad_norm": 0.291015625,
"learning_rate": 0.00018819388137054604,
"loss": 1.0089,
"step": 705
},
{
"epoch": 2.42320819112628,
"grad_norm": 0.287109375,
"learning_rate": 0.00018791153841247614,
"loss": 1.0031,
"step": 710
},
{
"epoch": 2.4402730375426622,
"grad_norm": 0.279296875,
"learning_rate": 0.00018762607610452254,
"loss": 1.002,
"step": 715
},
{
"epoch": 2.4573378839590445,
"grad_norm": 0.333984375,
"learning_rate": 0.00018733750457569485,
"loss": 1.0003,
"step": 720
},
{
"epoch": 2.474402730375427,
"grad_norm": 0.361328125,
"learning_rate": 0.00018704583406532662,
"loss": 1.004,
"step": 725
},
{
"epoch": 2.491467576791809,
"grad_norm": 0.31640625,
"learning_rate": 0.00018675107492271208,
"loss": 1.0075,
"step": 730
},
{
"epoch": 2.508532423208191,
"grad_norm": 0.2890625,
"learning_rate": 0.0001864532376067387,
"loss": 1.0035,
"step": 735
},
{
"epoch": 2.5255972696245736,
"grad_norm": 0.33203125,
"learning_rate": 0.00018615233268551643,
"loss": 0.9968,
"step": 740
},
{
"epoch": 2.5426621160409555,
"grad_norm": 0.294921875,
"learning_rate": 0.00018584837083600244,
"loss": 1.0124,
"step": 745
},
{
"epoch": 2.5597269624573378,
"grad_norm": 0.302734375,
"learning_rate": 0.00018554136284362237,
"loss": 1.0012,
"step": 750
},
{
"epoch": 2.57679180887372,
"grad_norm": 0.291015625,
"learning_rate": 0.00018523131960188755,
"loss": 0.9915,
"step": 755
},
{
"epoch": 2.5938566552901023,
"grad_norm": 0.365234375,
"learning_rate": 0.0001849182521120087,
"loss": 0.9996,
"step": 760
},
{
"epoch": 2.6109215017064846,
"grad_norm": 0.2890625,
"learning_rate": 0.00018460217148250524,
"loss": 0.9975,
"step": 765
},
{
"epoch": 2.627986348122867,
"grad_norm": 0.33203125,
"learning_rate": 0.0001842830889288114,
"loss": 1.0008,
"step": 770
},
{
"epoch": 2.645051194539249,
"grad_norm": 0.341796875,
"learning_rate": 0.00018396101577287813,
"loss": 1.0041,
"step": 775
},
{
"epoch": 2.6621160409556315,
"grad_norm": 0.33984375,
"learning_rate": 0.00018363596344277144,
"loss": 0.9995,
"step": 780
},
{
"epoch": 2.6791808873720138,
"grad_norm": 0.30078125,
"learning_rate": 0.0001833079434722668,
"loss": 1.002,
"step": 785
},
{
"epoch": 2.696245733788396,
"grad_norm": 0.36328125,
"learning_rate": 0.00018297696750044,
"loss": 1.0057,
"step": 790
},
{
"epoch": 2.7133105802047783,
"grad_norm": 0.30859375,
"learning_rate": 0.00018264304727125407,
"loss": 0.9966,
"step": 795
},
{
"epoch": 2.73037542662116,
"grad_norm": 0.384765625,
"learning_rate": 0.00018230619463314266,
"loss": 0.9887,
"step": 800
},
{
"epoch": 2.747440273037543,
"grad_norm": 0.373046875,
"learning_rate": 0.00018196642153858958,
"loss": 0.9993,
"step": 805
},
{
"epoch": 2.7645051194539247,
"grad_norm": 0.408203125,
"learning_rate": 0.00018162374004370463,
"loss": 0.9953,
"step": 810
},
{
"epoch": 2.781569965870307,
"grad_norm": 0.283203125,
"learning_rate": 0.0001812781623077959,
"loss": 0.9856,
"step": 815
},
{
"epoch": 2.7986348122866893,
"grad_norm": 0.294921875,
"learning_rate": 0.00018092970059293835,
"loss": 1.0029,
"step": 820
},
{
"epoch": 2.8156996587030716,
"grad_norm": 0.37109375,
"learning_rate": 0.0001805783672635386,
"loss": 0.991,
"step": 825
},
{
"epoch": 2.832764505119454,
"grad_norm": 0.298828125,
"learning_rate": 0.00018022417478589627,
"loss": 1.0053,
"step": 830
},
{
"epoch": 2.849829351535836,
"grad_norm": 0.3359375,
"learning_rate": 0.00017986713572776174,
"loss": 0.9865,
"step": 835
},
{
"epoch": 2.8668941979522184,
"grad_norm": 0.271484375,
"learning_rate": 0.00017950726275789,
"loss": 0.9948,
"step": 840
},
{
"epoch": 2.8839590443686007,
"grad_norm": 0.38671875,
"learning_rate": 0.00017914456864559126,
"loss": 0.9916,
"step": 845
},
{
"epoch": 2.901023890784983,
"grad_norm": 0.345703125,
"learning_rate": 0.0001787790662602779,
"loss": 0.9985,
"step": 850
},
{
"epoch": 2.9180887372013653,
"grad_norm": 0.34375,
"learning_rate": 0.00017841076857100767,
"loss": 0.994,
"step": 855
},
{
"epoch": 2.9351535836177476,
"grad_norm": 0.265625,
"learning_rate": 0.0001780396886460237,
"loss": 0.9811,
"step": 860
},
{
"epoch": 2.9522184300341294,
"grad_norm": 0.458984375,
"learning_rate": 0.00017766583965229065,
"loss": 0.9872,
"step": 865
},
{
"epoch": 2.969283276450512,
"grad_norm": 0.37109375,
"learning_rate": 0.00017728923485502759,
"loss": 0.9951,
"step": 870
},
{
"epoch": 2.986348122866894,
"grad_norm": 0.365234375,
"learning_rate": 0.00017690988761723725,
"loss": 0.9915,
"step": 875
},
{
"epoch": 3.0,
"eval_loss": 2.4749691486358643,
"eval_runtime": 0.5425,
"eval_samples_per_second": 18.434,
"eval_steps_per_second": 1.843,
"step": 879
},
{
"epoch": 3.0034129692832763,
"grad_norm": 0.39453125,
"learning_rate": 0.00017652781139923196,
"loss": 0.9883,
"step": 880
},
{
"epoch": 3.0204778156996586,
"grad_norm": 0.5078125,
"learning_rate": 0.000176143019758156,
"loss": 0.9611,
"step": 885
},
{
"epoch": 3.037542662116041,
"grad_norm": 0.359375,
"learning_rate": 0.0001757555263475044,
"loss": 0.9542,
"step": 890
},
{
"epoch": 3.054607508532423,
"grad_norm": 0.326171875,
"learning_rate": 0.00017536534491663873,
"loss": 0.9614,
"step": 895
},
{
"epoch": 3.0716723549488054,
"grad_norm": 0.349609375,
"learning_rate": 0.00017497248931029914,
"loss": 0.9538,
"step": 900
},
{
"epoch": 3.0887372013651877,
"grad_norm": 0.30859375,
"learning_rate": 0.000174576973468113,
"loss": 0.9581,
"step": 905
},
{
"epoch": 3.10580204778157,
"grad_norm": 0.31640625,
"learning_rate": 0.00017417881142410037,
"loss": 0.9466,
"step": 910
},
{
"epoch": 3.1228668941979523,
"grad_norm": 0.298828125,
"learning_rate": 0.00017377801730617613,
"loss": 0.9632,
"step": 915
},
{
"epoch": 3.1399317406143346,
"grad_norm": 0.298828125,
"learning_rate": 0.00017337460533564845,
"loss": 0.948,
"step": 920
},
{
"epoch": 3.156996587030717,
"grad_norm": 0.310546875,
"learning_rate": 0.00017296858982671442,
"loss": 0.9515,
"step": 925
},
{
"epoch": 3.174061433447099,
"grad_norm": 0.29296875,
"learning_rate": 0.00017255998518595194,
"loss": 0.9625,
"step": 930
},
{
"epoch": 3.1911262798634814,
"grad_norm": 0.28125,
"learning_rate": 0.00017214880591180873,
"loss": 0.9532,
"step": 935
},
{
"epoch": 3.2081911262798632,
"grad_norm": 0.326171875,
"learning_rate": 0.0001717350665940877,
"loss": 0.9499,
"step": 940
},
{
"epoch": 3.2252559726962455,
"grad_norm": 0.333984375,
"learning_rate": 0.00017131878191342932,
"loss": 0.9505,
"step": 945
},
{
"epoch": 3.242320819112628,
"grad_norm": 0.376953125,
"learning_rate": 0.00017089996664079084,
"loss": 0.9489,
"step": 950
},
{
"epoch": 3.25938566552901,
"grad_norm": 0.310546875,
"learning_rate": 0.00017047863563692198,
"loss": 0.9623,
"step": 955
},
{
"epoch": 3.2764505119453924,
"grad_norm": 0.267578125,
"learning_rate": 0.00017005480385183774,
"loss": 0.9474,
"step": 960
},
{
"epoch": 3.2935153583617747,
"grad_norm": 0.333984375,
"learning_rate": 0.00016962848632428795,
"loss": 0.9558,
"step": 965
},
{
"epoch": 3.310580204778157,
"grad_norm": 0.34765625,
"learning_rate": 0.00016919969818122345,
"loss": 0.9538,
"step": 970
},
{
"epoch": 3.3276450511945392,
"grad_norm": 0.5,
"learning_rate": 0.00016876845463725975,
"loss": 0.955,
"step": 975
},
{
"epoch": 3.3447098976109215,
"grad_norm": 1.8203125,
"learning_rate": 0.0001683347709941367,
"loss": 0.9615,
"step": 980
},
{
"epoch": 3.361774744027304,
"grad_norm": 0.44921875,
"learning_rate": 0.0001678986626401759,
"loss": 0.9591,
"step": 985
},
{
"epoch": 3.378839590443686,
"grad_norm": 0.3671875,
"learning_rate": 0.00016746014504973448,
"loss": 0.9479,
"step": 990
},
{
"epoch": 3.3959044368600684,
"grad_norm": 0.419921875,
"learning_rate": 0.00016701923378265615,
"loss": 0.9511,
"step": 995
},
{
"epoch": 3.4129692832764507,
"grad_norm": 0.3203125,
"learning_rate": 0.00016657594448371896,
"loss": 0.962,
"step": 1000
},
{
"epoch": 3.430034129692833,
"grad_norm": 0.4296875,
"learning_rate": 0.0001661302928820803,
"loss": 0.9612,
"step": 1005
},
{
"epoch": 3.4470989761092152,
"grad_norm": 0.455078125,
"learning_rate": 0.00016568229479071872,
"loss": 0.9524,
"step": 1010
},
{
"epoch": 3.464163822525597,
"grad_norm": 0.306640625,
"learning_rate": 0.0001652319661058729,
"loss": 0.9557,
"step": 1015
},
{
"epoch": 3.4812286689419794,
"grad_norm": 0.26953125,
"learning_rate": 0.00016477932280647747,
"loss": 0.9635,
"step": 1020
},
{
"epoch": 3.4982935153583616,
"grad_norm": 0.28515625,
"learning_rate": 0.00016432438095359623,
"loss": 0.9549,
"step": 1025
},
{
"epoch": 3.515358361774744,
"grad_norm": 0.28515625,
"learning_rate": 0.00016386715668985211,
"loss": 0.9456,
"step": 1030
},
{
"epoch": 3.532423208191126,
"grad_norm": 0.423828125,
"learning_rate": 0.00016340766623885438,
"loss": 0.945,
"step": 1035
},
{
"epoch": 3.5494880546075085,
"grad_norm": 0.330078125,
"learning_rate": 0.00016294592590462316,
"loss": 0.95,
"step": 1040
},
{
"epoch": 3.5665529010238908,
"grad_norm": 0.3359375,
"learning_rate": 0.0001624819520710107,
"loss": 0.9583,
"step": 1045
},
{
"epoch": 3.583617747440273,
"grad_norm": 0.36328125,
"learning_rate": 0.00016201576120112007,
"loss": 0.9443,
"step": 1050
},
{
"epoch": 3.6006825938566553,
"grad_norm": 0.345703125,
"learning_rate": 0.0001615473698367212,
"loss": 0.9635,
"step": 1055
},
{
"epoch": 3.6177474402730376,
"grad_norm": 0.51953125,
"learning_rate": 0.00016107679459766367,
"loss": 0.9524,
"step": 1060
},
{
"epoch": 3.63481228668942,
"grad_norm": 0.279296875,
"learning_rate": 0.0001606040521812872,
"loss": 0.9552,
"step": 1065
},
{
"epoch": 3.651877133105802,
"grad_norm": 0.3125,
"learning_rate": 0.00016012915936182892,
"loss": 0.9502,
"step": 1070
},
{
"epoch": 3.6689419795221845,
"grad_norm": 0.3125,
"learning_rate": 0.00015965213298982855,
"loss": 0.9629,
"step": 1075
},
{
"epoch": 3.6860068259385663,
"grad_norm": 0.361328125,
"learning_rate": 0.00015917298999153015,
"loss": 0.9591,
"step": 1080
},
{
"epoch": 3.703071672354949,
"grad_norm": 0.296875,
"learning_rate": 0.00015869174736828168,
"loss": 0.9699,
"step": 1085
},
{
"epoch": 3.720136518771331,
"grad_norm": 0.30078125,
"learning_rate": 0.00015820842219593182,
"loss": 0.9478,
"step": 1090
},
{
"epoch": 3.737201365187713,
"grad_norm": 0.33984375,
"learning_rate": 0.00015772303162422385,
"loss": 0.9646,
"step": 1095
},
{
"epoch": 3.7542662116040955,
"grad_norm": 0.404296875,
"learning_rate": 0.00015723559287618728,
"loss": 0.9601,
"step": 1100
},
{
"epoch": 3.7713310580204777,
"grad_norm": 0.421875,
"learning_rate": 0.00015674612324752683,
"loss": 0.9548,
"step": 1105
},
{
"epoch": 3.78839590443686,
"grad_norm": 0.32421875,
"learning_rate": 0.00015625464010600844,
"loss": 0.9625,
"step": 1110
},
{
"epoch": 3.8054607508532423,
"grad_norm": 0.287109375,
"learning_rate": 0.00015576116089084327,
"loss": 0.9448,
"step": 1115
},
{
"epoch": 3.8225255972696246,
"grad_norm": 0.28125,
"learning_rate": 0.00015526570311206884,
"loss": 0.9547,
"step": 1120
},
{
"epoch": 3.839590443686007,
"grad_norm": 0.275390625,
"learning_rate": 0.00015476828434992762,
"loss": 0.9527,
"step": 1125
},
{
"epoch": 3.856655290102389,
"grad_norm": 0.326171875,
"learning_rate": 0.00015426892225424337,
"loss": 0.9499,
"step": 1130
},
{
"epoch": 3.8737201365187715,
"grad_norm": 0.271484375,
"learning_rate": 0.00015376763454379478,
"loss": 0.9593,
"step": 1135
},
{
"epoch": 3.8907849829351537,
"grad_norm": 0.314453125,
"learning_rate": 0.0001532644390056868,
"loss": 0.9457,
"step": 1140
},
{
"epoch": 3.9078498293515356,
"grad_norm": 0.49609375,
"learning_rate": 0.00015275935349471959,
"loss": 0.9622,
"step": 1145
},
{
"epoch": 3.9249146757679183,
"grad_norm": 0.3125,
"learning_rate": 0.00015225239593275473,
"loss": 0.9584,
"step": 1150
},
{
"epoch": 3.9419795221843,
"grad_norm": 0.29296875,
"learning_rate": 0.00015174358430807957,
"loss": 0.9547,
"step": 1155
},
{
"epoch": 3.9590443686006824,
"grad_norm": 0.28515625,
"learning_rate": 0.00015123293667476887,
"loss": 0.9546,
"step": 1160
},
{
"epoch": 3.9761092150170647,
"grad_norm": 0.345703125,
"learning_rate": 0.00015072047115204397,
"loss": 0.945,
"step": 1165
},
{
"epoch": 3.993174061433447,
"grad_norm": 0.3203125,
"learning_rate": 0.00015020620592363034,
"loss": 0.9551,
"step": 1170
},
{
"epoch": 4.0,
"eval_loss": 2.529212474822998,
"eval_runtime": 0.5437,
"eval_samples_per_second": 18.394,
"eval_steps_per_second": 1.839,
"step": 1172
},
{
"epoch": 4.010238907849829,
"grad_norm": 0.31640625,
"learning_rate": 0.00014969015923711195,
"loss": 0.925,
"step": 1175
},
{
"epoch": 4.027303754266212,
"grad_norm": 0.306640625,
"learning_rate": 0.00014917234940328396,
"loss": 0.9111,
"step": 1180
},
{
"epoch": 4.044368600682594,
"grad_norm": 0.3203125,
"learning_rate": 0.00014865279479550292,
"loss": 0.9124,
"step": 1185
},
{
"epoch": 4.061433447098976,
"grad_norm": 0.3125,
"learning_rate": 0.00014813151384903493,
"loss": 0.912,
"step": 1190
},
{
"epoch": 4.078498293515358,
"grad_norm": 0.3125,
"learning_rate": 0.00014760852506040162,
"loss": 0.9113,
"step": 1195
},
{
"epoch": 4.09556313993174,
"grad_norm": 0.298828125,
"learning_rate": 0.0001470838469867234,
"loss": 0.9168,
"step": 1200
},
{
"epoch": 4.112627986348123,
"grad_norm": 0.310546875,
"learning_rate": 0.00014655749824506151,
"loss": 0.9152,
"step": 1205
},
{
"epoch": 4.129692832764505,
"grad_norm": 0.3046875,
"learning_rate": 0.00014602949751175713,
"loss": 0.9098,
"step": 1210
},
{
"epoch": 4.146757679180888,
"grad_norm": 0.333984375,
"learning_rate": 0.00014549986352176882,
"loss": 0.9213,
"step": 1215
},
{
"epoch": 4.163822525597269,
"grad_norm": 0.341796875,
"learning_rate": 0.00014496861506800758,
"loss": 0.9128,
"step": 1220
},
{
"epoch": 4.180887372013652,
"grad_norm": 0.29296875,
"learning_rate": 0.0001444357710006703,
"loss": 0.9102,
"step": 1225
},
{
"epoch": 4.197952218430034,
"grad_norm": 0.330078125,
"learning_rate": 0.0001439013502265707,
"loss": 0.9058,
"step": 1230
},
{
"epoch": 4.215017064846417,
"grad_norm": 0.380859375,
"learning_rate": 0.00014336537170846848,
"loss": 0.9233,
"step": 1235
},
{
"epoch": 4.2320819112627985,
"grad_norm": 0.404296875,
"learning_rate": 0.00014282785446439653,
"loss": 0.9092,
"step": 1240
},
{
"epoch": 4.249146757679181,
"grad_norm": 0.40234375,
"learning_rate": 0.00014228881756698603,
"loss": 0.9093,
"step": 1245
},
{
"epoch": 4.266211604095563,
"grad_norm": 0.306640625,
"learning_rate": 0.00014174828014278985,
"loss": 0.9271,
"step": 1250
},
{
"epoch": 4.283276450511945,
"grad_norm": 0.3203125,
"learning_rate": 0.00014120626137160375,
"loss": 0.9189,
"step": 1255
},
{
"epoch": 4.300341296928328,
"grad_norm": 0.28125,
"learning_rate": 0.00014066278048578584,
"loss": 0.9078,
"step": 1260
},
{
"epoch": 4.3174061433447095,
"grad_norm": 0.298828125,
"learning_rate": 0.00014011785676957422,
"loss": 0.9115,
"step": 1265
},
{
"epoch": 4.334470989761092,
"grad_norm": 0.33203125,
"learning_rate": 0.00013957150955840267,
"loss": 0.9099,
"step": 1270
},
{
"epoch": 4.351535836177474,
"grad_norm": 0.28125,
"learning_rate": 0.0001390237582382147,
"loss": 0.9208,
"step": 1275
},
{
"epoch": 4.368600682593857,
"grad_norm": 0.3125,
"learning_rate": 0.00013847462224477538,
"loss": 0.9133,
"step": 1280
},
{
"epoch": 4.385665529010239,
"grad_norm": 0.328125,
"learning_rate": 0.00013792412106298198,
"loss": 0.9088,
"step": 1285
},
{
"epoch": 4.402730375426621,
"grad_norm": 0.328125,
"learning_rate": 0.00013737227422617267,
"loss": 0.9176,
"step": 1290
},
{
"epoch": 4.419795221843003,
"grad_norm": 0.30078125,
"learning_rate": 0.00013681910131543309,
"loss": 0.9143,
"step": 1295
},
{
"epoch": 4.436860068259386,
"grad_norm": 0.328125,
"learning_rate": 0.00013626462195890168,
"loss": 0.9148,
"step": 1300
},
{
"epoch": 4.453924914675768,
"grad_norm": 0.29296875,
"learning_rate": 0.00013570885583107347,
"loss": 0.9165,
"step": 1305
},
{
"epoch": 4.4709897610921505,
"grad_norm": 0.3203125,
"learning_rate": 0.00013515182265210165,
"loss": 0.9198,
"step": 1310
},
{
"epoch": 4.488054607508532,
"grad_norm": 0.3046875,
"learning_rate": 0.00013459354218709794,
"loss": 0.9294,
"step": 1315
},
{
"epoch": 4.505119453924914,
"grad_norm": 0.287109375,
"learning_rate": 0.00013403403424543139,
"loss": 0.9137,
"step": 1320
},
{
"epoch": 4.522184300341297,
"grad_norm": 0.341796875,
"learning_rate": 0.00013347331868002527,
"loss": 0.9172,
"step": 1325
},
{
"epoch": 4.53924914675768,
"grad_norm": 0.294921875,
"learning_rate": 0.0001329114153866529,
"loss": 0.9237,
"step": 1330
},
{
"epoch": 4.5563139931740615,
"grad_norm": 0.32421875,
"learning_rate": 0.00013234834430323145,
"loss": 0.9144,
"step": 1335
},
{
"epoch": 4.573378839590443,
"grad_norm": 0.326171875,
"learning_rate": 0.00013178412540911457,
"loss": 0.9193,
"step": 1340
},
{
"epoch": 4.590443686006826,
"grad_norm": 0.322265625,
"learning_rate": 0.00013121877872438354,
"loss": 0.9217,
"step": 1345
},
{
"epoch": 4.607508532423208,
"grad_norm": 0.3359375,
"learning_rate": 0.00013065232430913676,
"loss": 0.9252,
"step": 1350
},
{
"epoch": 4.624573378839591,
"grad_norm": 0.294921875,
"learning_rate": 0.00013008478226277816,
"loss": 0.9265,
"step": 1355
},
{
"epoch": 4.6416382252559725,
"grad_norm": 0.298828125,
"learning_rate": 0.00012951617272330377,
"loss": 0.9221,
"step": 1360
},
{
"epoch": 4.658703071672355,
"grad_norm": 0.345703125,
"learning_rate": 0.00012894651586658736,
"loss": 0.9131,
"step": 1365
},
{
"epoch": 4.675767918088737,
"grad_norm": 0.337890625,
"learning_rate": 0.00012837583190566446,
"loss": 0.9109,
"step": 1370
},
{
"epoch": 4.69283276450512,
"grad_norm": 0.388671875,
"learning_rate": 0.00012780414109001518,
"loss": 0.9204,
"step": 1375
},
{
"epoch": 4.709897610921502,
"grad_norm": 0.306640625,
"learning_rate": 0.00012723146370484568,
"loss": 0.9154,
"step": 1380
},
{
"epoch": 4.726962457337884,
"grad_norm": 0.3515625,
"learning_rate": 0.00012665782007036835,
"loss": 0.9251,
"step": 1385
},
{
"epoch": 4.744027303754266,
"grad_norm": 0.3984375,
"learning_rate": 0.0001260832305410809,
"loss": 0.926,
"step": 1390
},
{
"epoch": 4.761092150170649,
"grad_norm": 0.369140625,
"learning_rate": 0.00012550771550504396,
"loss": 0.9137,
"step": 1395
},
{
"epoch": 4.778156996587031,
"grad_norm": 0.34375,
"learning_rate": 0.00012493129538315788,
"loss": 0.9181,
"step": 1400
},
{
"epoch": 4.795221843003413,
"grad_norm": 0.349609375,
"learning_rate": 0.00012435399062843796,
"loss": 0.9207,
"step": 1405
},
{
"epoch": 4.812286689419795,
"grad_norm": 0.287109375,
"learning_rate": 0.00012377582172528877,
"loss": 0.9156,
"step": 1410
},
{
"epoch": 4.829351535836177,
"grad_norm": 0.31640625,
"learning_rate": 0.00012319680918877732,
"loss": 0.9222,
"step": 1415
},
{
"epoch": 4.84641638225256,
"grad_norm": 0.29296875,
"learning_rate": 0.00012261697356390506,
"loss": 0.9297,
"step": 1420
},
{
"epoch": 4.863481228668942,
"grad_norm": 0.37109375,
"learning_rate": 0.00012203633542487907,
"loss": 0.9146,
"step": 1425
},
{
"epoch": 4.8805460750853245,
"grad_norm": 0.453125,
"learning_rate": 0.00012145491537438174,
"loss": 0.917,
"step": 1430
},
{
"epoch": 4.897610921501706,
"grad_norm": 0.33203125,
"learning_rate": 0.00012087273404284002,
"loss": 0.912,
"step": 1435
},
{
"epoch": 4.914675767918089,
"grad_norm": 0.3359375,
"learning_rate": 0.0001202898120876932,
"loss": 0.9224,
"step": 1440
},
{
"epoch": 4.931740614334471,
"grad_norm": 0.28515625,
"learning_rate": 0.00011970617019266,
"loss": 0.9167,
"step": 1445
},
{
"epoch": 4.948805460750854,
"grad_norm": 0.33203125,
"learning_rate": 0.00011912182906700466,
"loss": 0.9166,
"step": 1450
},
{
"epoch": 4.965870307167235,
"grad_norm": 0.294921875,
"learning_rate": 0.00011853680944480206,
"loss": 0.9243,
"step": 1455
},
{
"epoch": 4.982935153583618,
"grad_norm": 0.310546875,
"learning_rate": 0.00011795113208420208,
"loss": 0.9128,
"step": 1460
},
{
"epoch": 5.0,
"grad_norm": 0.373046875,
"learning_rate": 0.00011736481776669306,
"loss": 0.9287,
"step": 1465
},
{
"epoch": 5.0,
"eval_loss": 2.5924570560455322,
"eval_runtime": 0.5421,
"eval_samples_per_second": 18.446,
"eval_steps_per_second": 1.845,
"step": 1465
},
{
"epoch": 5.017064846416382,
"grad_norm": 0.3203125,
"learning_rate": 0.00011677788729636427,
"loss": 0.8743,
"step": 1470
},
{
"epoch": 5.034129692832765,
"grad_norm": 0.345703125,
"learning_rate": 0.0001161903614991679,
"loss": 0.8731,
"step": 1475
},
{
"epoch": 5.051194539249146,
"grad_norm": 0.341796875,
"learning_rate": 0.00011560226122218,
"loss": 0.8735,
"step": 1480
},
{
"epoch": 5.068259385665529,
"grad_norm": 0.3359375,
"learning_rate": 0.00011501360733286085,
"loss": 0.8808,
"step": 1485
},
{
"epoch": 5.085324232081911,
"grad_norm": 0.314453125,
"learning_rate": 0.00011442442071831434,
"loss": 0.8776,
"step": 1490
},
{
"epoch": 5.102389078498294,
"grad_norm": 0.333984375,
"learning_rate": 0.00011383472228454699,
"loss": 0.872,
"step": 1495
},
{
"epoch": 5.1194539249146755,
"grad_norm": 0.314453125,
"learning_rate": 0.00011324453295572618,
"loss": 0.8801,
"step": 1500
},
{
"epoch": 5.136518771331058,
"grad_norm": 0.310546875,
"learning_rate": 0.00011265387367343763,
"loss": 0.8767,
"step": 1505
},
{
"epoch": 5.15358361774744,
"grad_norm": 0.337890625,
"learning_rate": 0.00011206276539594221,
"loss": 0.8764,
"step": 1510
},
{
"epoch": 5.170648464163823,
"grad_norm": 0.333984375,
"learning_rate": 0.00011147122909743257,
"loss": 0.8768,
"step": 1515
},
{
"epoch": 5.187713310580205,
"grad_norm": 0.32421875,
"learning_rate": 0.00011087928576728865,
"loss": 0.8848,
"step": 1520
},
{
"epoch": 5.204778156996587,
"grad_norm": 0.337890625,
"learning_rate": 0.00011028695640933309,
"loss": 0.8905,
"step": 1525
},
{
"epoch": 5.221843003412969,
"grad_norm": 0.318359375,
"learning_rate": 0.00010969426204108583,
"loss": 0.8872,
"step": 1530
},
{
"epoch": 5.238907849829351,
"grad_norm": 0.310546875,
"learning_rate": 0.00010910122369301842,
"loss": 0.8749,
"step": 1535
},
{
"epoch": 5.255972696245734,
"grad_norm": 0.34765625,
"learning_rate": 0.00010850786240780786,
"loss": 0.884,
"step": 1540
},
{
"epoch": 5.273037542662116,
"grad_norm": 0.41015625,
"learning_rate": 0.00010791419923958976,
"loss": 0.8739,
"step": 1545
},
{
"epoch": 5.290102389078498,
"grad_norm": 0.330078125,
"learning_rate": 0.00010732025525321145,
"loss": 0.8902,
"step": 1550
},
{
"epoch": 5.30716723549488,
"grad_norm": 0.322265625,
"learning_rate": 0.00010672605152348449,
"loss": 0.8863,
"step": 1555
},
{
"epoch": 5.324232081911263,
"grad_norm": 0.341796875,
"learning_rate": 0.00010613160913443682,
"loss": 0.8752,
"step": 1560
},
{
"epoch": 5.341296928327645,
"grad_norm": 0.337890625,
"learning_rate": 0.00010553694917856478,
"loss": 0.8782,
"step": 1565
},
{
"epoch": 5.3583617747440275,
"grad_norm": 0.326171875,
"learning_rate": 0.00010494209275608455,
"loss": 0.8804,
"step": 1570
},
{
"epoch": 5.375426621160409,
"grad_norm": 0.32421875,
"learning_rate": 0.00010434706097418338,
"loss": 0.8889,
"step": 1575
},
{
"epoch": 5.392491467576792,
"grad_norm": 0.373046875,
"learning_rate": 0.00010375187494627098,
"loss": 0.8861,
"step": 1580
},
{
"epoch": 5.409556313993174,
"grad_norm": 0.3359375,
"learning_rate": 0.00010315655579123,
"loss": 0.878,
"step": 1585
},
{
"epoch": 5.426621160409557,
"grad_norm": 0.388671875,
"learning_rate": 0.00010256112463266687,
"loss": 0.893,
"step": 1590
},
{
"epoch": 5.4436860068259385,
"grad_norm": 0.3515625,
"learning_rate": 0.00010196560259816221,
"loss": 0.8913,
"step": 1595
},
{
"epoch": 5.460750853242321,
"grad_norm": 0.345703125,
"learning_rate": 0.00010137001081852113,
"loss": 0.8848,
"step": 1600
},
{
"epoch": 5.477815699658703,
"grad_norm": 0.353515625,
"learning_rate": 0.00010077437042702362,
"loss": 0.8867,
"step": 1605
},
{
"epoch": 5.494880546075085,
"grad_norm": 0.328125,
"learning_rate": 0.00010017870255867445,
"loss": 0.8843,
"step": 1610
},
{
"epoch": 5.511945392491468,
"grad_norm": 0.34375,
"learning_rate": 9.958302834945332e-05,
"loss": 0.8905,
"step": 1615
},
{
"epoch": 5.5290102389078495,
"grad_norm": 0.38671875,
"learning_rate": 9.898736893556502e-05,
"loss": 0.8903,
"step": 1620
},
{
"epoch": 5.546075085324232,
"grad_norm": 0.337890625,
"learning_rate": 9.839174545268931e-05,
"loss": 0.897,
"step": 1625
},
{
"epoch": 5.563139931740614,
"grad_norm": 0.326171875,
"learning_rate": 9.7796179035231e-05,
"loss": 0.8925,
"step": 1630
},
{
"epoch": 5.580204778156997,
"grad_norm": 0.310546875,
"learning_rate": 9.720069081557009e-05,
"loss": 0.8748,
"step": 1635
},
{
"epoch": 5.597269624573379,
"grad_norm": 0.31640625,
"learning_rate": 9.660530192331191e-05,
"loss": 0.8829,
"step": 1640
},
{
"epoch": 5.614334470989761,
"grad_norm": 0.30859375,
"learning_rate": 9.601003348453734e-05,
"loss": 0.8922,
"step": 1645
},
{
"epoch": 5.631399317406143,
"grad_norm": 0.3203125,
"learning_rate": 9.541490662105326e-05,
"loss": 0.8936,
"step": 1650
},
{
"epoch": 5.648464163822526,
"grad_norm": 0.337890625,
"learning_rate": 9.481994244964297e-05,
"loss": 0.8897,
"step": 1655
},
{
"epoch": 5.665529010238908,
"grad_norm": 0.30078125,
"learning_rate": 9.422516208131709e-05,
"loss": 0.8762,
"step": 1660
},
{
"epoch": 5.6825938566552905,
"grad_norm": 0.3046875,
"learning_rate": 9.363058662056443e-05,
"loss": 0.8842,
"step": 1665
},
{
"epoch": 5.699658703071672,
"grad_norm": 0.341796875,
"learning_rate": 9.303623716460297e-05,
"loss": 0.8906,
"step": 1670
},
{
"epoch": 5.716723549488055,
"grad_norm": 0.328125,
"learning_rate": 9.244213480263148e-05,
"loss": 0.8911,
"step": 1675
},
{
"epoch": 5.733788395904437,
"grad_norm": 0.333984375,
"learning_rate": 9.184830061508113e-05,
"loss": 0.8893,
"step": 1680
},
{
"epoch": 5.750853242320819,
"grad_norm": 0.3359375,
"learning_rate": 9.125475567286744e-05,
"loss": 0.8826,
"step": 1685
},
{
"epoch": 5.7679180887372015,
"grad_norm": 0.341796875,
"learning_rate": 9.066152103664283e-05,
"loss": 0.8845,
"step": 1690
},
{
"epoch": 5.784982935153583,
"grad_norm": 0.345703125,
"learning_rate": 9.006861775604904e-05,
"loss": 0.8808,
"step": 1695
},
{
"epoch": 5.802047781569966,
"grad_norm": 0.328125,
"learning_rate": 8.947606686897045e-05,
"loss": 0.8829,
"step": 1700
},
{
"epoch": 5.819112627986348,
"grad_norm": 0.31640625,
"learning_rate": 8.88838894007875e-05,
"loss": 0.8835,
"step": 1705
},
{
"epoch": 5.836177474402731,
"grad_norm": 0.326171875,
"learning_rate": 8.829210636363067e-05,
"loss": 0.8894,
"step": 1710
},
{
"epoch": 5.853242320819112,
"grad_norm": 0.318359375,
"learning_rate": 8.770073875563493e-05,
"loss": 0.8822,
"step": 1715
},
{
"epoch": 5.870307167235495,
"grad_norm": 0.3203125,
"learning_rate": 8.710980756019467e-05,
"loss": 0.8811,
"step": 1720
},
{
"epoch": 5.887372013651877,
"grad_norm": 0.32421875,
"learning_rate": 8.651933374521907e-05,
"loss": 0.8906,
"step": 1725
},
{
"epoch": 5.90443686006826,
"grad_norm": 0.3125,
"learning_rate": 8.592933826238818e-05,
"loss": 0.8773,
"step": 1730
},
{
"epoch": 5.921501706484642,
"grad_norm": 0.390625,
"learning_rate": 8.533984204640941e-05,
"loss": 0.8843,
"step": 1735
},
{
"epoch": 5.938566552901024,
"grad_norm": 0.314453125,
"learning_rate": 8.4750866014275e-05,
"loss": 0.8907,
"step": 1740
},
{
"epoch": 5.955631399317406,
"grad_norm": 0.353515625,
"learning_rate": 8.416243106451934e-05,
"loss": 0.8795,
"step": 1745
},
{
"epoch": 5.972696245733788,
"grad_norm": 0.30859375,
"learning_rate": 8.357455807647778e-05,
"loss": 0.8767,
"step": 1750
},
{
"epoch": 5.989761092150171,
"grad_norm": 0.34375,
"learning_rate": 8.29872679095457e-05,
"loss": 0.8733,
"step": 1755
},
{
"epoch": 6.0,
"eval_loss": 2.6554951667785645,
"eval_runtime": 0.5458,
"eval_samples_per_second": 18.321,
"eval_steps_per_second": 1.832,
"step": 1758
},
{
"epoch": 6.006825938566553,
"grad_norm": 0.3203125,
"learning_rate": 8.240058140243834e-05,
"loss": 0.8646,
"step": 1760
},
{
"epoch": 6.023890784982935,
"grad_norm": 0.34765625,
"learning_rate": 8.181451937245131e-05,
"loss": 0.8498,
"step": 1765
},
{
"epoch": 6.040955631399317,
"grad_norm": 0.349609375,
"learning_rate": 8.122910261472214e-05,
"loss": 0.8455,
"step": 1770
},
{
"epoch": 6.0580204778157,
"grad_norm": 0.365234375,
"learning_rate": 8.064435190149218e-05,
"loss": 0.8363,
"step": 1775
},
{
"epoch": 6.075085324232082,
"grad_norm": 0.361328125,
"learning_rate": 8.006028798136962e-05,
"loss": 0.855,
"step": 1780
},
{
"epoch": 6.092150170648464,
"grad_norm": 0.322265625,
"learning_rate": 7.947693157859337e-05,
"loss": 0.8556,
"step": 1785
},
{
"epoch": 6.109215017064846,
"grad_norm": 0.3515625,
"learning_rate": 7.889430339229754e-05,
"loss": 0.8606,
"step": 1790
},
{
"epoch": 6.126279863481229,
"grad_norm": 0.33203125,
"learning_rate": 7.831242409577716e-05,
"loss": 0.8535,
"step": 1795
},
{
"epoch": 6.143344709897611,
"grad_norm": 0.34765625,
"learning_rate": 7.773131433575444e-05,
"loss": 0.851,
"step": 1800
},
{
"epoch": 6.160409556313994,
"grad_norm": 0.369140625,
"learning_rate": 7.715099473164632e-05,
"loss": 0.8468,
"step": 1805
},
{
"epoch": 6.177474402730375,
"grad_norm": 0.361328125,
"learning_rate": 7.657148587483271e-05,
"loss": 0.8518,
"step": 1810
},
{
"epoch": 6.194539249146757,
"grad_norm": 0.330078125,
"learning_rate": 7.599280832792596e-05,
"loss": 0.8467,
"step": 1815
},
{
"epoch": 6.21160409556314,
"grad_norm": 0.322265625,
"learning_rate": 7.541498262404125e-05,
"loss": 0.8549,
"step": 1820
},
{
"epoch": 6.228668941979522,
"grad_norm": 0.32421875,
"learning_rate": 7.483802926606787e-05,
"loss": 0.8534,
"step": 1825
},
{
"epoch": 6.2457337883959045,
"grad_norm": 0.326171875,
"learning_rate": 7.426196872594182e-05,
"loss": 0.8491,
"step": 1830
},
{
"epoch": 6.262798634812286,
"grad_norm": 0.330078125,
"learning_rate": 7.368682144391944e-05,
"loss": 0.8503,
"step": 1835
},
{
"epoch": 6.279863481228669,
"grad_norm": 0.361328125,
"learning_rate": 7.311260782785207e-05,
"loss": 0.8528,
"step": 1840
},
{
"epoch": 6.296928327645051,
"grad_norm": 0.328125,
"learning_rate": 7.253934825246193e-05,
"loss": 0.8592,
"step": 1845
},
{
"epoch": 6.313993174061434,
"grad_norm": 0.39453125,
"learning_rate": 7.196706305861925e-05,
"loss": 0.8528,
"step": 1850
},
{
"epoch": 6.3310580204778155,
"grad_norm": 0.328125,
"learning_rate": 7.139577255262034e-05,
"loss": 0.8528,
"step": 1855
},
{
"epoch": 6.348122866894198,
"grad_norm": 0.34375,
"learning_rate": 7.082549700546726e-05,
"loss": 0.8561,
"step": 1860
},
{
"epoch": 6.36518771331058,
"grad_norm": 0.330078125,
"learning_rate": 7.025625665214844e-05,
"loss": 0.8562,
"step": 1865
},
{
"epoch": 6.382252559726963,
"grad_norm": 0.3203125,
"learning_rate": 6.968807169092059e-05,
"loss": 0.8561,
"step": 1870
},
{
"epoch": 6.399317406143345,
"grad_norm": 0.33203125,
"learning_rate": 6.912096228259236e-05,
"loss": 0.8598,
"step": 1875
},
{
"epoch": 6.4163822525597265,
"grad_norm": 0.337890625,
"learning_rate": 6.855494854980857e-05,
"loss": 0.8573,
"step": 1880
},
{
"epoch": 6.433447098976109,
"grad_norm": 0.359375,
"learning_rate": 6.799005057633644e-05,
"loss": 0.8576,
"step": 1885
},
{
"epoch": 6.450511945392491,
"grad_norm": 0.32421875,
"learning_rate": 6.742628840635284e-05,
"loss": 0.855,
"step": 1890
},
{
"epoch": 6.467576791808874,
"grad_norm": 0.33203125,
"learning_rate": 6.68636820437331e-05,
"loss": 0.8628,
"step": 1895
},
{
"epoch": 6.484641638225256,
"grad_norm": 0.39453125,
"learning_rate": 6.630225145134144e-05,
"loss": 0.8489,
"step": 1900
},
{
"epoch": 6.501706484641638,
"grad_norm": 0.34765625,
"learning_rate": 6.574201655032216e-05,
"loss": 0.8534,
"step": 1905
},
{
"epoch": 6.51877133105802,
"grad_norm": 0.361328125,
"learning_rate": 6.518299721939323e-05,
"loss": 0.8582,
"step": 1910
},
{
"epoch": 6.535836177474403,
"grad_norm": 0.34375,
"learning_rate": 6.462521329414066e-05,
"loss": 0.8561,
"step": 1915
},
{
"epoch": 6.552901023890785,
"grad_norm": 0.326171875,
"learning_rate": 6.406868456631483e-05,
"loss": 0.8618,
"step": 1920
},
{
"epoch": 6.5699658703071675,
"grad_norm": 0.36328125,
"learning_rate": 6.351343078312819e-05,
"loss": 0.8575,
"step": 1925
},
{
"epoch": 6.587030716723549,
"grad_norm": 0.376953125,
"learning_rate": 6.295947164655447e-05,
"loss": 0.8504,
"step": 1930
},
{
"epoch": 6.604095563139932,
"grad_norm": 0.3359375,
"learning_rate": 6.240682681262971e-05,
"loss": 0.8619,
"step": 1935
},
{
"epoch": 6.621160409556314,
"grad_norm": 0.322265625,
"learning_rate": 6.185551589075482e-05,
"loss": 0.8536,
"step": 1940
},
{
"epoch": 6.638225255972696,
"grad_norm": 0.33984375,
"learning_rate": 6.130555844299973e-05,
"loss": 0.8511,
"step": 1945
},
{
"epoch": 6.6552901023890785,
"grad_norm": 0.322265625,
"learning_rate": 6.075697398340913e-05,
"loss": 0.859,
"step": 1950
},
{
"epoch": 6.672354948805461,
"grad_norm": 0.330078125,
"learning_rate": 6.0209781977310486e-05,
"loss": 0.8617,
"step": 1955
},
{
"epoch": 6.689419795221843,
"grad_norm": 0.314453125,
"learning_rate": 5.9664001840622886e-05,
"loss": 0.8478,
"step": 1960
},
{
"epoch": 6.706484641638225,
"grad_norm": 0.314453125,
"learning_rate": 5.91196529391683e-05,
"loss": 0.8548,
"step": 1965
},
{
"epoch": 6.723549488054608,
"grad_norm": 0.33984375,
"learning_rate": 5.857675458798453e-05,
"loss": 0.8623,
"step": 1970
},
{
"epoch": 6.7406143344709895,
"grad_norm": 0.333984375,
"learning_rate": 5.8035326050639615e-05,
"loss": 0.853,
"step": 1975
},
{
"epoch": 6.757679180887372,
"grad_norm": 0.3515625,
"learning_rate": 5.749538653854861e-05,
"loss": 0.8594,
"step": 1980
},
{
"epoch": 6.774744027303754,
"grad_norm": 0.32421875,
"learning_rate": 5.695695521029163e-05,
"loss": 0.8528,
"step": 1985
},
{
"epoch": 6.791808873720137,
"grad_norm": 0.328125,
"learning_rate": 5.642005117093419e-05,
"loss": 0.8485,
"step": 1990
},
{
"epoch": 6.808873720136519,
"grad_norm": 0.333984375,
"learning_rate": 5.5884693471349256e-05,
"loss": 0.8578,
"step": 1995
},
{
"epoch": 6.825938566552901,
"grad_norm": 0.330078125,
"learning_rate": 5.535090110754131e-05,
"loss": 0.8549,
"step": 2000
},
{
"epoch": 6.843003412969283,
"grad_norm": 0.34375,
"learning_rate": 5.481869301997236e-05,
"loss": 0.8625,
"step": 2005
},
{
"epoch": 6.860068259385666,
"grad_norm": 0.3125,
"learning_rate": 5.428808809288975e-05,
"loss": 0.8529,
"step": 2010
},
{
"epoch": 6.877133105802048,
"grad_norm": 0.3359375,
"learning_rate": 5.37591051536561e-05,
"loss": 0.8505,
"step": 2015
},
{
"epoch": 6.8941979522184305,
"grad_norm": 0.345703125,
"learning_rate": 5.32317629720814e-05,
"loss": 0.8585,
"step": 2020
},
{
"epoch": 6.911262798634812,
"grad_norm": 0.341796875,
"learning_rate": 5.270608025975686e-05,
"loss": 0.8563,
"step": 2025
},
{
"epoch": 6.928327645051194,
"grad_norm": 0.326171875,
"learning_rate": 5.218207566939116e-05,
"loss": 0.8534,
"step": 2030
},
{
"epoch": 6.945392491467577,
"grad_norm": 0.330078125,
"learning_rate": 5.1659767794148316e-05,
"loss": 0.853,
"step": 2035
},
{
"epoch": 6.962457337883959,
"grad_norm": 0.33984375,
"learning_rate": 5.1139175166988187e-05,
"loss": 0.8622,
"step": 2040
},
{
"epoch": 6.979522184300341,
"grad_norm": 0.333984375,
"learning_rate": 5.062031626000873e-05,
"loss": 0.8602,
"step": 2045
},
{
"epoch": 6.996587030716723,
"grad_norm": 0.33984375,
"learning_rate": 5.0103209483790636e-05,
"loss": 0.8577,
"step": 2050
},
{
"epoch": 7.0,
"eval_loss": 2.731566905975342,
"eval_runtime": 0.5528,
"eval_samples_per_second": 18.088,
"eval_steps_per_second": 1.809,
"step": 2051
},
{
"epoch": 7.013651877133106,
"grad_norm": 0.322265625,
"learning_rate": 4.9587873186744025e-05,
"loss": 0.8366,
"step": 2055
},
{
"epoch": 7.030716723549488,
"grad_norm": 0.326171875,
"learning_rate": 4.9074325654457446e-05,
"loss": 0.8237,
"step": 2060
},
{
"epoch": 7.047781569965871,
"grad_norm": 0.333984375,
"learning_rate": 4.856258510904899e-05,
"loss": 0.8231,
"step": 2065
},
{
"epoch": 7.064846416382252,
"grad_norm": 0.3359375,
"learning_rate": 4.805266970851975e-05,
"loss": 0.8253,
"step": 2070
},
{
"epoch": 7.081911262798635,
"grad_norm": 0.333984375,
"learning_rate": 4.7544597546109514e-05,
"loss": 0.8313,
"step": 2075
},
{
"epoch": 7.098976109215017,
"grad_norm": 0.337890625,
"learning_rate": 4.7038386649654764e-05,
"loss": 0.8322,
"step": 2080
},
{
"epoch": 7.1160409556314,
"grad_norm": 0.33984375,
"learning_rate": 4.6534054980949113e-05,
"loss": 0.8317,
"step": 2085
},
{
"epoch": 7.1331058020477816,
"grad_norm": 0.328125,
"learning_rate": 4.603162043510566e-05,
"loss": 0.8356,
"step": 2090
},
{
"epoch": 7.150170648464163,
"grad_norm": 0.33984375,
"learning_rate": 4.553110083992237e-05,
"loss": 0.8289,
"step": 2095
},
{
"epoch": 7.167235494880546,
"grad_norm": 0.359375,
"learning_rate": 4.50325139552493e-05,
"loss": 0.8382,
"step": 2100
},
{
"epoch": 7.184300341296928,
"grad_norm": 0.35546875,
"learning_rate": 4.4535877472358466e-05,
"loss": 0.8363,
"step": 2105
},
{
"epoch": 7.201365187713311,
"grad_norm": 0.32421875,
"learning_rate": 4.404120901331618e-05,
"loss": 0.8388,
"step": 2110
},
{
"epoch": 7.2184300341296925,
"grad_norm": 0.34765625,
"learning_rate": 4.354852613035763e-05,
"loss": 0.8291,
"step": 2115
},
{
"epoch": 7.235494880546075,
"grad_norm": 0.328125,
"learning_rate": 4.305784630526416e-05,
"loss": 0.8361,
"step": 2120
},
{
"epoch": 7.252559726962457,
"grad_norm": 0.3359375,
"learning_rate": 4.2569186948743e-05,
"loss": 0.8416,
"step": 2125
},
{
"epoch": 7.26962457337884,
"grad_norm": 0.345703125,
"learning_rate": 4.2082565399809404e-05,
"loss": 0.8281,
"step": 2130
},
{
"epoch": 7.286689419795222,
"grad_norm": 0.326171875,
"learning_rate": 4.159799892517148e-05,
"loss": 0.8281,
"step": 2135
},
{
"epoch": 7.303754266211604,
"grad_norm": 0.349609375,
"learning_rate": 4.111550471861747e-05,
"loss": 0.8352,
"step": 2140
},
{
"epoch": 7.320819112627986,
"grad_norm": 0.359375,
"learning_rate": 4.06350999004057e-05,
"loss": 0.833,
"step": 2145
},
{
"epoch": 7.337883959044369,
"grad_norm": 0.353515625,
"learning_rate": 4.0156801516657095e-05,
"loss": 0.825,
"step": 2150
},
{
"epoch": 7.354948805460751,
"grad_norm": 0.3359375,
"learning_rate": 3.968062653875031e-05,
"loss": 0.8386,
"step": 2155
},
{
"epoch": 7.372013651877133,
"grad_norm": 0.3359375,
"learning_rate": 3.920659186271953e-05,
"loss": 0.8454,
"step": 2160
},
{
"epoch": 7.389078498293515,
"grad_norm": 0.349609375,
"learning_rate": 3.873471430865515e-05,
"loss": 0.8431,
"step": 2165
},
{
"epoch": 7.406143344709897,
"grad_norm": 0.345703125,
"learning_rate": 3.8265010620106533e-05,
"loss": 0.8392,
"step": 2170
},
{
"epoch": 7.42320819112628,
"grad_norm": 0.3359375,
"learning_rate": 3.779749746348831e-05,
"loss": 0.8362,
"step": 2175
},
{
"epoch": 7.440273037542662,
"grad_norm": 0.35546875,
"learning_rate": 3.7332191427488784e-05,
"loss": 0.8348,
"step": 2180
},
{
"epoch": 7.4573378839590445,
"grad_norm": 0.35546875,
"learning_rate": 3.6869109022481386e-05,
"loss": 0.831,
"step": 2185
},
{
"epoch": 7.474402730375426,
"grad_norm": 0.357421875,
"learning_rate": 3.640826667993891e-05,
"loss": 0.8314,
"step": 2190
},
{
"epoch": 7.491467576791809,
"grad_norm": 0.33203125,
"learning_rate": 3.59496807518503e-05,
"loss": 0.8258,
"step": 2195
},
{
"epoch": 7.508532423208191,
"grad_norm": 0.3359375,
"learning_rate": 3.549336751014057e-05,
"loss": 0.8482,
"step": 2200
},
{
"epoch": 7.525597269624574,
"grad_norm": 0.32421875,
"learning_rate": 3.503934314609343e-05,
"loss": 0.8387,
"step": 2205
},
{
"epoch": 7.5426621160409555,
"grad_norm": 0.3515625,
"learning_rate": 3.458762376977669e-05,
"loss": 0.8344,
"step": 2210
},
{
"epoch": 7.559726962457338,
"grad_norm": 0.353515625,
"learning_rate": 3.41382254094707e-05,
"loss": 0.8315,
"step": 2215
},
{
"epoch": 7.57679180887372,
"grad_norm": 0.345703125,
"learning_rate": 3.369116401109963e-05,
"loss": 0.8331,
"step": 2220
},
{
"epoch": 7.593856655290102,
"grad_norm": 0.34375,
"learning_rate": 3.3246455437665594e-05,
"loss": 0.8322,
"step": 2225
},
{
"epoch": 7.610921501706485,
"grad_norm": 0.326171875,
"learning_rate": 3.280411546868583e-05,
"loss": 0.8281,
"step": 2230
},
{
"epoch": 7.627986348122867,
"grad_norm": 0.34375,
"learning_rate": 3.2364159799632786e-05,
"loss": 0.8281,
"step": 2235
},
{
"epoch": 7.645051194539249,
"grad_norm": 0.333984375,
"learning_rate": 3.192660404137729e-05,
"loss": 0.832,
"step": 2240
},
{
"epoch": 7.662116040955631,
"grad_norm": 0.337890625,
"learning_rate": 3.14914637196345e-05,
"loss": 0.8361,
"step": 2245
},
{
"epoch": 7.679180887372014,
"grad_norm": 0.328125,
"learning_rate": 3.105875427441297e-05,
"loss": 0.837,
"step": 2250
},
{
"epoch": 7.696245733788396,
"grad_norm": 0.33984375,
"learning_rate": 3.0628491059467014e-05,
"loss": 0.8351,
"step": 2255
},
{
"epoch": 7.713310580204778,
"grad_norm": 0.328125,
"learning_rate": 3.020068934175171e-05,
"loss": 0.838,
"step": 2260
},
{
"epoch": 7.73037542662116,
"grad_norm": 0.333984375,
"learning_rate": 2.977536430088125e-05,
"loss": 0.8355,
"step": 2265
},
{
"epoch": 7.747440273037543,
"grad_norm": 0.326171875,
"learning_rate": 2.9352531028590424e-05,
"loss": 0.8261,
"step": 2270
},
{
"epoch": 7.764505119453925,
"grad_norm": 0.3359375,
"learning_rate": 2.8932204528198926e-05,
"loss": 0.8367,
"step": 2275
},
{
"epoch": 7.7815699658703075,
"grad_norm": 0.333984375,
"learning_rate": 2.8514399714079132e-05,
"loss": 0.8405,
"step": 2280
},
{
"epoch": 7.798634812286689,
"grad_norm": 0.328125,
"learning_rate": 2.8099131411126867e-05,
"loss": 0.8408,
"step": 2285
},
{
"epoch": 7.815699658703072,
"grad_norm": 0.326171875,
"learning_rate": 2.7686414354235356e-05,
"loss": 0.8397,
"step": 2290
},
{
"epoch": 7.832764505119454,
"grad_norm": 0.34375,
"learning_rate": 2.7276263187772423e-05,
"loss": 0.8385,
"step": 2295
},
{
"epoch": 7.849829351535837,
"grad_norm": 0.333984375,
"learning_rate": 2.6868692465060828e-05,
"loss": 0.8309,
"step": 2300
},
{
"epoch": 7.8668941979522184,
"grad_norm": 0.341796875,
"learning_rate": 2.6463716647861904e-05,
"loss": 0.8229,
"step": 2305
},
{
"epoch": 7.8839590443686,
"grad_norm": 0.34765625,
"learning_rate": 2.6061350105862382e-05,
"loss": 0.8226,
"step": 2310
},
{
"epoch": 7.901023890784983,
"grad_norm": 0.33203125,
"learning_rate": 2.5661607116164532e-05,
"loss": 0.8334,
"step": 2315
},
{
"epoch": 7.918088737201365,
"grad_norm": 0.34765625,
"learning_rate": 2.5264501862779667e-05,
"loss": 0.8444,
"step": 2320
},
{
"epoch": 7.935153583617748,
"grad_norm": 0.328125,
"learning_rate": 2.4870048436124595e-05,
"loss": 0.8403,
"step": 2325
},
{
"epoch": 7.952218430034129,
"grad_norm": 0.32421875,
"learning_rate": 2.4478260832521938e-05,
"loss": 0.8302,
"step": 2330
},
{
"epoch": 7.969283276450512,
"grad_norm": 0.33203125,
"learning_rate": 2.4089152953703332e-05,
"loss": 0.8265,
"step": 2335
},
{
"epoch": 7.986348122866894,
"grad_norm": 0.3359375,
"learning_rate": 2.37027386063162e-05,
"loss": 0.8364,
"step": 2340
},
{
"epoch": 8.0,
"eval_loss": 2.7742018699645996,
"eval_runtime": 0.5517,
"eval_samples_per_second": 18.125,
"eval_steps_per_second": 1.813,
"step": 2344
},
{
"epoch": 8.003412969283277,
"grad_norm": 0.33203125,
"learning_rate": 2.331903150143391e-05,
"loss": 0.83,
"step": 2345
},
{
"epoch": 8.020477815699659,
"grad_norm": 0.330078125,
"learning_rate": 2.293804525406915e-05,
"loss": 0.8208,
"step": 2350
},
{
"epoch": 8.03754266211604,
"grad_norm": 0.345703125,
"learning_rate": 2.255979338269093e-05,
"loss": 0.8288,
"step": 2355
},
{
"epoch": 8.054607508532424,
"grad_norm": 0.3515625,
"learning_rate": 2.2184289308744844e-05,
"loss": 0.8251,
"step": 2360
},
{
"epoch": 8.071672354948806,
"grad_norm": 0.33984375,
"learning_rate": 2.1811546356176872e-05,
"loss": 0.8202,
"step": 2365
},
{
"epoch": 8.088737201365188,
"grad_norm": 0.341796875,
"learning_rate": 2.144157775096063e-05,
"loss": 0.8191,
"step": 2370
},
{
"epoch": 8.10580204778157,
"grad_norm": 0.33984375,
"learning_rate": 2.1074396620628e-05,
"loss": 0.8161,
"step": 2375
},
{
"epoch": 8.122866894197951,
"grad_norm": 0.330078125,
"learning_rate": 2.0710015993803422e-05,
"loss": 0.8259,
"step": 2380
},
{
"epoch": 8.139931740614335,
"grad_norm": 0.34375,
"learning_rate": 2.0348448799741537e-05,
"loss": 0.8271,
"step": 2385
},
{
"epoch": 8.156996587030717,
"grad_norm": 0.33984375,
"learning_rate": 1.9989707867868425e-05,
"loss": 0.8222,
"step": 2390
},
{
"epoch": 8.174061433447099,
"grad_norm": 0.33984375,
"learning_rate": 1.9633805927326387e-05,
"loss": 0.8176,
"step": 2395
},
{
"epoch": 8.19112627986348,
"grad_norm": 0.341796875,
"learning_rate": 1.9280755606522384e-05,
"loss": 0.8303,
"step": 2400
},
{
"epoch": 8.208191126279864,
"grad_norm": 0.33984375,
"learning_rate": 1.893056943267969e-05,
"loss": 0.8179,
"step": 2405
},
{
"epoch": 8.225255972696246,
"grad_norm": 0.341796875,
"learning_rate": 1.8583259831393663e-05,
"loss": 0.8219,
"step": 2410
},
{
"epoch": 8.242320819112628,
"grad_norm": 0.337890625,
"learning_rate": 1.8238839126190686e-05,
"loss": 0.829,
"step": 2415
},
{
"epoch": 8.25938566552901,
"grad_norm": 0.34765625,
"learning_rate": 1.7897319538090962e-05,
"loss": 0.8233,
"step": 2420
},
{
"epoch": 8.276450511945393,
"grad_norm": 0.33203125,
"learning_rate": 1.755871318517488e-05,
"loss": 0.8224,
"step": 2425
},
{
"epoch": 8.293515358361775,
"grad_norm": 0.328125,
"learning_rate": 1.722303208215297e-05,
"loss": 0.8239,
"step": 2430
},
{
"epoch": 8.310580204778157,
"grad_norm": 0.33203125,
"learning_rate": 1.6890288139939625e-05,
"loss": 0.8324,
"step": 2435
},
{
"epoch": 8.327645051194539,
"grad_norm": 0.33984375,
"learning_rate": 1.6560493165230516e-05,
"loss": 0.8216,
"step": 2440
},
{
"epoch": 8.344709897610922,
"grad_norm": 0.337890625,
"learning_rate": 1.623365886008357e-05,
"loss": 0.8249,
"step": 2445
},
{
"epoch": 8.361774744027304,
"grad_norm": 0.3359375,
"learning_rate": 1.5909796821503785e-05,
"loss": 0.8327,
"step": 2450
},
{
"epoch": 8.378839590443686,
"grad_norm": 0.3359375,
"learning_rate": 1.5588918541031783e-05,
"loss": 0.8202,
"step": 2455
},
{
"epoch": 8.395904436860068,
"grad_norm": 0.337890625,
"learning_rate": 1.5271035404335954e-05,
"loss": 0.8213,
"step": 2460
},
{
"epoch": 8.41296928327645,
"grad_norm": 0.33203125,
"learning_rate": 1.4956158690808585e-05,
"loss": 0.8217,
"step": 2465
},
{
"epoch": 8.430034129692833,
"grad_norm": 0.359375,
"learning_rate": 1.464429957316552e-05,
"loss": 0.8235,
"step": 2470
},
{
"epoch": 8.447098976109215,
"grad_norm": 0.337890625,
"learning_rate": 1.433546911704977e-05,
"loss": 0.8257,
"step": 2475
},
{
"epoch": 8.464163822525597,
"grad_norm": 0.3359375,
"learning_rate": 1.402967828063897e-05,
"loss": 0.8228,
"step": 2480
},
{
"epoch": 8.481228668941979,
"grad_norm": 0.33203125,
"learning_rate": 1.37269379142563e-05,
"loss": 0.8155,
"step": 2485
},
{
"epoch": 8.498293515358363,
"grad_norm": 0.337890625,
"learning_rate": 1.3427258759985739e-05,
"loss": 0.8329,
"step": 2490
},
{
"epoch": 8.515358361774744,
"grad_norm": 0.337890625,
"learning_rate": 1.3130651451290798e-05,
"loss": 0.8224,
"step": 2495
},
{
"epoch": 8.532423208191126,
"grad_norm": 0.353515625,
"learning_rate": 1.2837126512637198e-05,
"loss": 0.8219,
"step": 2500
},
{
"epoch": 8.549488054607508,
"grad_norm": 0.330078125,
"learning_rate": 1.2546694359119493e-05,
"loss": 0.8151,
"step": 2505
},
{
"epoch": 8.56655290102389,
"grad_norm": 0.33984375,
"learning_rate": 1.2259365296091464e-05,
"loss": 0.8237,
"step": 2510
},
{
"epoch": 8.583617747440274,
"grad_norm": 0.34765625,
"learning_rate": 1.1975149518800454e-05,
"loss": 0.8207,
"step": 2515
},
{
"epoch": 8.600682593856655,
"grad_norm": 0.341796875,
"learning_rate": 1.1694057112025636e-05,
"loss": 0.8221,
"step": 2520
},
{
"epoch": 8.617747440273037,
"grad_norm": 0.33203125,
"learning_rate": 1.141609804972017e-05,
"loss": 0.828,
"step": 2525
},
{
"epoch": 8.634812286689419,
"grad_norm": 0.345703125,
"learning_rate": 1.1141282194657287e-05,
"loss": 0.8232,
"step": 2530
},
{
"epoch": 8.651877133105803,
"grad_norm": 0.353515625,
"learning_rate": 1.086961929808038e-05,
"loss": 0.8281,
"step": 2535
},
{
"epoch": 8.668941979522184,
"grad_norm": 0.34375,
"learning_rate": 1.0601118999356907e-05,
"loss": 0.8252,
"step": 2540
},
{
"epoch": 8.686006825938566,
"grad_norm": 0.333984375,
"learning_rate": 1.0335790825636449e-05,
"loss": 0.8225,
"step": 2545
},
{
"epoch": 8.703071672354948,
"grad_norm": 0.341796875,
"learning_rate": 1.00736441915126e-05,
"loss": 0.8199,
"step": 2550
},
{
"epoch": 8.720136518771332,
"grad_norm": 0.345703125,
"learning_rate": 9.814688398688998e-06,
"loss": 0.8146,
"step": 2555
},
{
"epoch": 8.737201365187714,
"grad_norm": 0.34375,
"learning_rate": 9.558932635649131e-06,
"loss": 0.8303,
"step": 2560
},
{
"epoch": 8.754266211604095,
"grad_norm": 0.328125,
"learning_rate": 9.306385977330411e-06,
"loss": 0.8224,
"step": 2565
},
{
"epoch": 8.771331058020477,
"grad_norm": 0.333984375,
"learning_rate": 9.057057384802181e-06,
"loss": 0.8228,
"step": 2570
},
{
"epoch": 8.788395904436861,
"grad_norm": 0.3359375,
"learning_rate": 8.810955704947666e-06,
"loss": 0.8231,
"step": 2575
},
{
"epoch": 8.805460750853243,
"grad_norm": 0.330078125,
"learning_rate": 8.568089670150115e-06,
"loss": 0.8278,
"step": 2580
},
{
"epoch": 8.822525597269625,
"grad_norm": 0.341796875,
"learning_rate": 8.328467897982995e-06,
"loss": 0.8248,
"step": 2585
},
{
"epoch": 8.839590443686006,
"grad_norm": 0.333984375,
"learning_rate": 8.092098890904098e-06,
"loss": 0.8195,
"step": 2590
},
{
"epoch": 8.856655290102388,
"grad_norm": 0.333984375,
"learning_rate": 7.858991035953944e-06,
"loss": 0.8203,
"step": 2595
},
{
"epoch": 8.873720136518772,
"grad_norm": 0.328125,
"learning_rate": 7.629152604458156e-06,
"loss": 0.8257,
"step": 2600
},
{
"epoch": 8.890784982935154,
"grad_norm": 0.34375,
"learning_rate": 7.402591751733989e-06,
"loss": 0.8128,
"step": 2605
},
{
"epoch": 8.907849829351536,
"grad_norm": 0.3359375,
"learning_rate": 7.179316516800894e-06,
"loss": 0.8251,
"step": 2610
},
{
"epoch": 8.924914675767917,
"grad_norm": 0.341796875,
"learning_rate": 6.959334822095354e-06,
"loss": 0.824,
"step": 2615
},
{
"epoch": 8.941979522184301,
"grad_norm": 0.3515625,
"learning_rate": 6.7426544731897245e-06,
"loss": 0.8287,
"step": 2620
},
{
"epoch": 8.959044368600683,
"grad_norm": 0.3359375,
"learning_rate": 6.529283158515276e-06,
"loss": 0.8264,
"step": 2625
},
{
"epoch": 8.976109215017065,
"grad_norm": 0.337890625,
"learning_rate": 6.319228449089376e-06,
"loss": 0.8179,
"step": 2630
},
{
"epoch": 8.993174061433447,
"grad_norm": 0.34375,
"learning_rate": 6.11249779824693e-06,
"loss": 0.8311,
"step": 2635
},
{
"epoch": 9.0,
"eval_loss": 2.7970776557922363,
"eval_runtime": 0.547,
"eval_samples_per_second": 18.282,
"eval_steps_per_second": 1.828,
"step": 2637
},
{
"epoch": 9.01023890784983,
"grad_norm": 0.33984375,
"learning_rate": 5.909098541375746e-06,
"loss": 0.827,
"step": 2640
},
{
"epoch": 9.027303754266212,
"grad_norm": 0.333984375,
"learning_rate": 5.7090378956564216e-06,
"loss": 0.8173,
"step": 2645
},
{
"epoch": 9.044368600682594,
"grad_norm": 0.341796875,
"learning_rate": 5.512322959806193e-06,
"loss": 0.8315,
"step": 2650
},
{
"epoch": 9.061433447098976,
"grad_norm": 0.345703125,
"learning_rate": 5.3189607138270255e-06,
"loss": 0.8278,
"step": 2655
},
{
"epoch": 9.078498293515358,
"grad_norm": 0.328125,
"learning_rate": 5.128958018758012e-06,
"loss": 0.821,
"step": 2660
},
{
"epoch": 9.095563139931741,
"grad_norm": 0.337890625,
"learning_rate": 4.942321616431833e-06,
"loss": 0.8261,
"step": 2665
},
{
"epoch": 9.112627986348123,
"grad_norm": 0.341796875,
"learning_rate": 4.7590581292356276e-06,
"loss": 0.8267,
"step": 2670
},
{
"epoch": 9.129692832764505,
"grad_norm": 0.3359375,
"learning_rate": 4.579174059875946e-06,
"loss": 0.8265,
"step": 2675
},
{
"epoch": 9.146757679180887,
"grad_norm": 0.33203125,
"learning_rate": 4.402675791148059e-06,
"loss": 0.8217,
"step": 2680
},
{
"epoch": 9.16382252559727,
"grad_norm": 0.330078125,
"learning_rate": 4.229569585709425e-06,
"loss": 0.8245,
"step": 2685
},
{
"epoch": 9.180887372013652,
"grad_norm": 0.3359375,
"learning_rate": 4.0598615858575605e-06,
"loss": 0.8211,
"step": 2690
},
{
"epoch": 9.197952218430034,
"grad_norm": 0.330078125,
"learning_rate": 3.89355781331201e-06,
"loss": 0.8162,
"step": 2695
},
{
"epoch": 9.215017064846416,
"grad_norm": 0.33203125,
"learning_rate": 3.730664169000708e-06,
"loss": 0.8154,
"step": 2700
},
{
"epoch": 9.2320819112628,
"grad_norm": 0.330078125,
"learning_rate": 3.571186432850626e-06,
"loss": 0.8245,
"step": 2705
},
{
"epoch": 9.249146757679181,
"grad_norm": 0.333984375,
"learning_rate": 3.415130263582611e-06,
"loss": 0.8198,
"step": 2710
},
{
"epoch": 9.266211604095563,
"grad_norm": 0.330078125,
"learning_rate": 3.2625011985107257e-06,
"loss": 0.8178,
"step": 2715
},
{
"epoch": 9.283276450511945,
"grad_norm": 0.337890625,
"learning_rate": 3.1133046533455947e-06,
"loss": 0.825,
"step": 2720
},
{
"epoch": 9.300341296928327,
"grad_norm": 0.3359375,
"learning_rate": 2.967545922002379e-06,
"loss": 0.8249,
"step": 2725
},
{
"epoch": 9.31740614334471,
"grad_norm": 0.337890625,
"learning_rate": 2.8252301764128962e-06,
"loss": 0.8228,
"step": 2730
},
{
"epoch": 9.334470989761092,
"grad_norm": 0.3359375,
"learning_rate": 2.686362466342085e-06,
"loss": 0.822,
"step": 2735
},
{
"epoch": 9.351535836177474,
"grad_norm": 0.3359375,
"learning_rate": 2.550947719208829e-06,
"loss": 0.8224,
"step": 2740
},
{
"epoch": 9.368600682593856,
"grad_norm": 0.34375,
"learning_rate": 2.4189907399111534e-06,
"loss": 0.8224,
"step": 2745
},
{
"epoch": 9.38566552901024,
"grad_norm": 0.33984375,
"learning_rate": 2.2904962106556793e-06,
"loss": 0.82,
"step": 2750
},
{
"epoch": 9.402730375426621,
"grad_norm": 0.337890625,
"learning_rate": 2.1654686907915167e-06,
"loss": 0.8183,
"step": 2755
},
{
"epoch": 9.419795221843003,
"grad_norm": 0.33984375,
"learning_rate": 2.0439126166485025e-06,
"loss": 0.8189,
"step": 2760
},
{
"epoch": 9.436860068259385,
"grad_norm": 0.333984375,
"learning_rate": 1.925832301379726e-06,
"loss": 0.8215,
"step": 2765
},
{
"epoch": 9.453924914675769,
"grad_norm": 0.341796875,
"learning_rate": 1.8112319348085771e-06,
"loss": 0.8235,
"step": 2770
},
{
"epoch": 9.47098976109215,
"grad_norm": 0.341796875,
"learning_rate": 1.700115583279993e-06,
"loss": 0.8157,
"step": 2775
},
{
"epoch": 9.488054607508532,
"grad_norm": 0.337890625,
"learning_rate": 1.592487189516212e-06,
"loss": 0.8192,
"step": 2780
},
{
"epoch": 9.505119453924914,
"grad_norm": 0.3359375,
"learning_rate": 1.4883505724768932e-06,
"loss": 0.8168,
"step": 2785
},
{
"epoch": 9.522184300341298,
"grad_norm": 0.341796875,
"learning_rate": 1.3877094272235712e-06,
"loss": 0.8296,
"step": 2790
},
{
"epoch": 9.53924914675768,
"grad_norm": 0.3359375,
"learning_rate": 1.2905673247885718e-06,
"loss": 0.8166,
"step": 2795
},
{
"epoch": 9.556313993174061,
"grad_norm": 0.3359375,
"learning_rate": 1.196927712048257e-06,
"loss": 0.817,
"step": 2800
},
{
"epoch": 9.573378839590443,
"grad_norm": 0.33984375,
"learning_rate": 1.1067939116008009e-06,
"loss": 0.813,
"step": 2805
},
{
"epoch": 9.590443686006825,
"grad_norm": 0.33203125,
"learning_rate": 1.020169121648218e-06,
"loss": 0.8114,
"step": 2810
},
{
"epoch": 9.607508532423209,
"grad_norm": 0.32421875,
"learning_rate": 9.370564158829087e-07,
"loss": 0.8146,
"step": 2815
},
{
"epoch": 9.62457337883959,
"grad_norm": 0.333984375,
"learning_rate": 8.574587433786363e-07,
"loss": 0.8216,
"step": 2820
},
{
"epoch": 9.641638225255972,
"grad_norm": 0.34375,
"learning_rate": 7.813789284857986e-07,
"loss": 0.8157,
"step": 2825
},
{
"epoch": 9.658703071672354,
"grad_norm": 0.333984375,
"learning_rate": 7.088196707312977e-07,
"loss": 0.8283,
"step": 2830
},
{
"epoch": 9.675767918088738,
"grad_norm": 0.330078125,
"learning_rate": 6.39783544722694e-07,
"loss": 0.8092,
"step": 2835
},
{
"epoch": 9.69283276450512,
"grad_norm": 0.330078125,
"learning_rate": 5.742730000568908e-07,
"loss": 0.8242,
"step": 2840
},
{
"epoch": 9.709897610921502,
"grad_norm": 0.341796875,
"learning_rate": 5.12290361233192e-07,
"loss": 0.8239,
"step": 2845
},
{
"epoch": 9.726962457337883,
"grad_norm": 0.33984375,
"learning_rate": 4.538378275708133e-07,
"loss": 0.8145,
"step": 2850
},
{
"epoch": 9.744027303754265,
"grad_norm": 0.3359375,
"learning_rate": 3.989174731308998e-07,
"loss": 0.8249,
"step": 2855
},
{
"epoch": 9.761092150170649,
"grad_norm": 0.3359375,
"learning_rate": 3.4753124664286265e-07,
"loss": 0.817,
"step": 2860
},
{
"epoch": 9.77815699658703,
"grad_norm": 0.33203125,
"learning_rate": 2.9968097143526775e-07,
"loss": 0.8115,
"step": 2865
},
{
"epoch": 9.795221843003413,
"grad_norm": 0.35546875,
"learning_rate": 2.5536834537114307e-07,
"loss": 0.8192,
"step": 2870
},
{
"epoch": 9.812286689419794,
"grad_norm": 0.328125,
"learning_rate": 2.145949407877157e-07,
"loss": 0.8181,
"step": 2875
},
{
"epoch": 9.829351535836178,
"grad_norm": 0.3359375,
"learning_rate": 1.7736220444064533e-07,
"loss": 0.8203,
"step": 2880
},
{
"epoch": 9.84641638225256,
"grad_norm": 0.333984375,
"learning_rate": 1.436714574526543e-07,
"loss": 0.826,
"step": 2885
},
{
"epoch": 9.863481228668942,
"grad_norm": 0.337890625,
"learning_rate": 1.1352389526668727e-07,
"loss": 0.8241,
"step": 2890
},
{
"epoch": 9.880546075085324,
"grad_norm": 0.3359375,
"learning_rate": 8.692058760345622e-08,
"loss": 0.8268,
"step": 2895
},
{
"epoch": 9.897610921501707,
"grad_norm": 0.330078125,
"learning_rate": 6.386247842353754e-08,
"loss": 0.8106,
"step": 2900
},
{
"epoch": 9.914675767918089,
"grad_norm": 0.330078125,
"learning_rate": 4.435038589380991e-08,
"loss": 0.8232,
"step": 2905
},
{
"epoch": 9.93174061433447,
"grad_norm": 0.3359375,
"learning_rate": 2.8385002358466418e-08,
"loss": 0.8187,
"step": 2910
},
{
"epoch": 9.948805460750853,
"grad_norm": 0.337890625,
"learning_rate": 1.5966894314456415e-08,
"loss": 0.8284,
"step": 2915
},
{
"epoch": 9.965870307167236,
"grad_norm": 0.333984375,
"learning_rate": 7.096502391346071e-09,
"loss": 0.8275,
"step": 2920
},
{
"epoch": 9.982935153583618,
"grad_norm": 0.33984375,
"learning_rate": 1.7741413357197368e-09,
"loss": 0.8271,
"step": 2925
},
{
"epoch": 10.0,
"grad_norm": 0.333984375,
"learning_rate": 0.0,
"loss": 0.8243,
"step": 2930
},
{
"epoch": 10.0,
"eval_loss": 2.7977683544158936,
"eval_runtime": 0.5422,
"eval_samples_per_second": 18.444,
"eval_steps_per_second": 1.844,
"step": 2930
},
{
"epoch": 10.0,
"step": 2930,
"total_flos": 1.7464232891960525e+18,
"train_loss": 0.9647074054125633,
"train_runtime": 17674.2713,
"train_samples_per_second": 7.945,
"train_steps_per_second": 0.166
}
],
"logging_steps": 5,
"max_steps": 2930,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7464232891960525e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}