gemma2b-summarize-gpt4o-256k / trainer_state.json

Model save

4393a03 verified 7 months ago

149 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 14.961571306575577,
	"eval_steps": 500,
	"global_step": 4380,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.0034158838599487617,
	"grad_norm": 1.8984375,
	"learning_rate": 4.5662100456621004e-07,
	"loss": 3.0658,
	"step": 1
	},
	{
	"epoch": 0.017079419299743808,
	"grad_norm": 1.9609375,
	"learning_rate": 2.2831050228310503e-06,
	"loss": 3.0722,
	"step": 5
	},
	{
	"epoch": 0.034158838599487616,
	"grad_norm": 1.8984375,
	"learning_rate": 4.566210045662101e-06,
	"loss": 3.0516,
	"step": 10
	},
	{
	"epoch": 0.05123825789923143,
	"grad_norm": 2.0625,
	"learning_rate": 6.849315068493151e-06,
	"loss": 3.0565,
	"step": 15
	},
	{
	"epoch": 0.06831767719897523,
	"grad_norm": 4.125,
	"learning_rate": 9.132420091324201e-06,
	"loss": 3.0491,
	"step": 20
	},
	{
	"epoch": 0.08539709649871904,
	"grad_norm": 1.8203125,
	"learning_rate": 1.1415525114155251e-05,
	"loss": 3.0567,
	"step": 25
	},
	{
	"epoch": 0.10247651579846286,
	"grad_norm": 2.1875,
	"learning_rate": 1.3698630136986302e-05,
	"loss": 2.9917,
	"step": 30
	},
	{
	"epoch": 0.11955593509820667,
	"grad_norm": 2.03125,
	"learning_rate": 1.5981735159817352e-05,
	"loss": 2.9399,
	"step": 35
	},
	{
	"epoch": 0.13663535439795046,
	"grad_norm": 1.6328125,
	"learning_rate": 1.8264840182648402e-05,
	"loss": 2.8463,
	"step": 40
	},
	{
	"epoch": 0.1537147736976943,
	"grad_norm": 1.59375,
	"learning_rate": 2.0547945205479453e-05,
	"loss": 2.7478,
	"step": 45
	},
	{
	"epoch": 0.1707941929974381,
	"grad_norm": 1.34375,
	"learning_rate": 2.2831050228310503e-05,
	"loss": 2.6397,
	"step": 50
	},
	{
	"epoch": 0.18787361229718189,
	"grad_norm": 1.3515625,
	"learning_rate": 2.5114155251141553e-05,
	"loss": 2.5394,
	"step": 55
	},
	{
	"epoch": 0.2049530315969257,
	"grad_norm": 2.828125,
	"learning_rate": 2.7397260273972603e-05,
	"loss": 2.4552,
	"step": 60
	},
	{
	"epoch": 0.2220324508966695,
	"grad_norm": 5.3125,
	"learning_rate": 2.9680365296803654e-05,
	"loss": 2.3562,
	"step": 65
	},
	{
	"epoch": 0.23911187019641333,
	"grad_norm": 1.9453125,
	"learning_rate": 3.1963470319634704e-05,
	"loss": 2.2596,
	"step": 70
	},
	{
	"epoch": 0.2561912894961571,
	"grad_norm": 1.1875,
	"learning_rate": 3.424657534246575e-05,
	"loss": 2.1869,
	"step": 75
	},
	{
	"epoch": 0.27327070879590093,
	"grad_norm": 4.15625,
	"learning_rate": 3.6529680365296805e-05,
	"loss": 2.1068,
	"step": 80
	},
	{
	"epoch": 0.29035012809564475,
	"grad_norm": 1.890625,
	"learning_rate": 3.881278538812785e-05,
	"loss": 2.0422,
	"step": 85
	},
	{
	"epoch": 0.3074295473953886,
	"grad_norm": 0.9453125,
	"learning_rate": 4.1095890410958905e-05,
	"loss": 1.9626,
	"step": 90
	},
	{
	"epoch": 0.32450896669513235,
	"grad_norm": 2.203125,
	"learning_rate": 4.337899543378995e-05,
	"loss": 1.8912,
	"step": 95
	},
	{
	"epoch": 0.3415883859948762,
	"grad_norm": 7.5,
	"learning_rate": 4.5662100456621006e-05,
	"loss": 1.8274,
	"step": 100
	},
	{
	"epoch": 0.35866780529462,
	"grad_norm": 1.0,
	"learning_rate": 4.794520547945205e-05,
	"loss": 1.779,
	"step": 105
	},
	{
	"epoch": 0.37574722459436377,
	"grad_norm": 1.09375,
	"learning_rate": 5.0228310502283106e-05,
	"loss": 1.7254,
	"step": 110
	},
	{
	"epoch": 0.3928266438941076,
	"grad_norm": 1.1484375,
	"learning_rate": 5.251141552511416e-05,
	"loss": 1.6696,
	"step": 115
	},
	{
	"epoch": 0.4099060631938514,
	"grad_norm": 0.474609375,
	"learning_rate": 5.479452054794521e-05,
	"loss": 1.6192,
	"step": 120
	},
	{
	"epoch": 0.4269854824935952,
	"grad_norm": 0.482421875,
	"learning_rate": 5.707762557077626e-05,
	"loss": 1.5722,
	"step": 125
	},
	{
	"epoch": 0.444064901793339,
	"grad_norm": 0.3515625,
	"learning_rate": 5.936073059360731e-05,
	"loss": 1.5393,
	"step": 130
	},
	{
	"epoch": 0.46114432109308284,
	"grad_norm": 0.34375,
	"learning_rate": 6.164383561643835e-05,
	"loss": 1.5068,
	"step": 135
	},
	{
	"epoch": 0.47822374039282667,
	"grad_norm": 0.412109375,
	"learning_rate": 6.392694063926941e-05,
	"loss": 1.4883,
	"step": 140
	},
	{
	"epoch": 0.49530315969257044,
	"grad_norm": 0.306640625,
	"learning_rate": 6.621004566210046e-05,
	"loss": 1.4554,
	"step": 145
	},
	{
	"epoch": 0.5123825789923142,
	"grad_norm": 0.408203125,
	"learning_rate": 6.84931506849315e-05,
	"loss": 1.4246,
	"step": 150
	},
	{
	"epoch": 0.5294619982920581,
	"grad_norm": 0.376953125,
	"learning_rate": 7.077625570776256e-05,
	"loss": 1.4059,
	"step": 155
	},
	{
	"epoch": 0.5465414175918019,
	"grad_norm": 0.3671875,
	"learning_rate": 7.305936073059361e-05,
	"loss": 1.3877,
	"step": 160
	},
	{
	"epoch": 0.5636208368915457,
	"grad_norm": 0.3046875,
	"learning_rate": 7.534246575342466e-05,
	"loss": 1.3705,
	"step": 165
	},
	{
	"epoch": 0.5807002561912895,
	"grad_norm": 0.375,
	"learning_rate": 7.76255707762557e-05,
	"loss": 1.3634,
	"step": 170
	},
	{
	"epoch": 0.5977796754910333,
	"grad_norm": 0.283203125,
	"learning_rate": 7.990867579908676e-05,
	"loss": 1.3377,
	"step": 175
	},
	{
	"epoch": 0.6148590947907772,
	"grad_norm": 0.50390625,
	"learning_rate": 8.219178082191781e-05,
	"loss": 1.3313,
	"step": 180
	},
	{
	"epoch": 0.6319385140905209,
	"grad_norm": 0.4765625,
	"learning_rate": 8.447488584474886e-05,
	"loss": 1.3172,
	"step": 185
	},
	{
	"epoch": 0.6490179333902647,
	"grad_norm": 0.2451171875,
	"learning_rate": 8.67579908675799e-05,
	"loss": 1.3015,
	"step": 190
	},
	{
	"epoch": 0.6660973526900086,
	"grad_norm": 0.349609375,
	"learning_rate": 8.904109589041096e-05,
	"loss": 1.3011,
	"step": 195
	},
	{
	"epoch": 0.6831767719897524,
	"grad_norm": 0.330078125,
	"learning_rate": 9.132420091324201e-05,
	"loss": 1.2912,
	"step": 200
	},
	{
	"epoch": 0.7002561912894961,
	"grad_norm": 0.36328125,
	"learning_rate": 9.360730593607307e-05,
	"loss": 1.2805,
	"step": 205
	},
	{
	"epoch": 0.71733561058924,
	"grad_norm": 0.392578125,
	"learning_rate": 9.58904109589041e-05,
	"loss": 1.2683,
	"step": 210
	},
	{
	"epoch": 0.7344150298889838,
	"grad_norm": 0.48046875,
	"learning_rate": 9.817351598173516e-05,
	"loss": 1.267,
	"step": 215
	},
	{
	"epoch": 0.7514944491887275,
	"grad_norm": 0.703125,
	"learning_rate": 0.00010045662100456621,
	"loss": 1.2562,
	"step": 220
	},
	{
	"epoch": 0.7685738684884714,
	"grad_norm": 0.5703125,
	"learning_rate": 0.00010273972602739728,
	"loss": 1.2553,
	"step": 225
	},
	{
	"epoch": 0.7856532877882152,
	"grad_norm": 0.419921875,
	"learning_rate": 0.00010502283105022832,
	"loss": 1.2405,
	"step": 230
	},
	{
	"epoch": 0.802732707087959,
	"grad_norm": 0.7421875,
	"learning_rate": 0.00010730593607305936,
	"loss": 1.2356,
	"step": 235
	},
	{
	"epoch": 0.8198121263877028,
	"grad_norm": 0.40625,
	"learning_rate": 0.00010958904109589041,
	"loss": 1.2318,
	"step": 240
	},
	{
	"epoch": 0.8368915456874466,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00011187214611872148,
	"loss": 1.2329,
	"step": 245
	},
	{
	"epoch": 0.8539709649871904,
	"grad_norm": 0.2353515625,
	"learning_rate": 0.00011415525114155252,
	"loss": 1.2222,
	"step": 250
	},
	{
	"epoch": 0.8710503842869343,
	"grad_norm": 0.255859375,
	"learning_rate": 0.00011643835616438356,
	"loss": 1.216,
	"step": 255
	},
	{
	"epoch": 0.888129803586678,
	"grad_norm": 0.4921875,
	"learning_rate": 0.00011872146118721462,
	"loss": 1.2133,
	"step": 260
	},
	{
	"epoch": 0.9052092228864219,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00012100456621004568,
	"loss": 1.2054,
	"step": 265
	},
	{
	"epoch": 0.9222886421861657,
	"grad_norm": 0.26171875,
	"learning_rate": 0.0001232876712328767,
	"loss": 1.2101,
	"step": 270
	},
	{
	"epoch": 0.9393680614859095,
	"grad_norm": 0.390625,
	"learning_rate": 0.00012557077625570778,
	"loss": 1.2071,
	"step": 275
	},
	{
	"epoch": 0.9564474807856533,
	"grad_norm": 0.318359375,
	"learning_rate": 0.00012785388127853882,
	"loss": 1.1957,
	"step": 280
	},
	{
	"epoch": 0.9735269000853971,
	"grad_norm": 0.453125,
	"learning_rate": 0.00013013698630136988,
	"loss": 1.1911,
	"step": 285
	},
	{
	"epoch": 0.9906063193851409,
	"grad_norm": 0.6796875,
	"learning_rate": 0.00013242009132420092,
	"loss": 1.1964,
	"step": 290
	},
	{
	"epoch": 0.9974380871050385,
	"eval_loss": 2.489213705062866,
	"eval_runtime": 0.8337,
	"eval_samples_per_second": 11.994,
	"eval_steps_per_second": 1.199,
	"step": 292
	},
	{
	"epoch": 1.0076857386848848,
	"grad_norm": 0.6171875,
	"learning_rate": 0.00013470319634703196,
	"loss": 1.187,
	"step": 295
	},
	{
	"epoch": 1.0247651579846284,
	"grad_norm": 0.65234375,
	"learning_rate": 0.000136986301369863,
	"loss": 1.1809,
	"step": 300
	},
	{
	"epoch": 1.0418445772843723,
	"grad_norm": 0.69921875,
	"learning_rate": 0.00013926940639269407,
	"loss": 1.1823,
	"step": 305
	},
	{
	"epoch": 1.0589239965841162,
	"grad_norm": 0.5546875,
	"learning_rate": 0.0001415525114155251,
	"loss": 1.1685,
	"step": 310
	},
	{
	"epoch": 1.0760034158838598,
	"grad_norm": 0.58984375,
	"learning_rate": 0.00014383561643835618,
	"loss": 1.1797,
	"step": 315
	},
	{
	"epoch": 1.0930828351836037,
	"grad_norm": 0.921875,
	"learning_rate": 0.00014611872146118722,
	"loss": 1.1798,
	"step": 320
	},
	{
	"epoch": 1.1101622544833476,
	"grad_norm": 0.625,
	"learning_rate": 0.00014840182648401829,
	"loss": 1.1722,
	"step": 325
	},
	{
	"epoch": 1.1272416737830913,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00015068493150684933,
	"loss": 1.1595,
	"step": 330
	},
	{
	"epoch": 1.1443210930828351,
	"grad_norm": 0.484375,
	"learning_rate": 0.00015296803652968037,
	"loss": 1.1633,
	"step": 335
	},
	{
	"epoch": 1.161400512382579,
	"grad_norm": 0.412109375,
	"learning_rate": 0.0001552511415525114,
	"loss": 1.1587,
	"step": 340
	},
	{
	"epoch": 1.1784799316823227,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00015753424657534247,
	"loss": 1.1557,
	"step": 345
	},
	{
	"epoch": 1.1955593509820666,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00015981735159817351,
	"loss": 1.1586,
	"step": 350
	},
	{
	"epoch": 1.2126387702818104,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00016210045662100458,
	"loss": 1.1559,
	"step": 355
	},
	{
	"epoch": 1.2297181895815543,
	"grad_norm": 0.375,
	"learning_rate": 0.00016438356164383562,
	"loss": 1.1575,
	"step": 360
	},
	{
	"epoch": 1.246797608881298,
	"grad_norm": 0.6015625,
	"learning_rate": 0.0001666666666666667,
	"loss": 1.1559,
	"step": 365
	},
	{
	"epoch": 1.2638770281810419,
	"grad_norm": 0.328125,
	"learning_rate": 0.00016894977168949773,
	"loss": 1.1528,
	"step": 370
	},
	{
	"epoch": 1.2809564474807855,
	"grad_norm": 0.404296875,
	"learning_rate": 0.00017123287671232877,
	"loss": 1.1541,
	"step": 375
	},
	{
	"epoch": 1.2980358667805294,
	"grad_norm": 0.4453125,
	"learning_rate": 0.0001735159817351598,
	"loss": 1.147,
	"step": 380
	},
	{
	"epoch": 1.3151152860802733,
	"grad_norm": 0.62890625,
	"learning_rate": 0.00017579908675799088,
	"loss": 1.1434,
	"step": 385
	},
	{
	"epoch": 1.332194705380017,
	"grad_norm": 0.369140625,
	"learning_rate": 0.00017808219178082192,
	"loss": 1.1408,
	"step": 390
	},
	{
	"epoch": 1.3492741246797608,
	"grad_norm": 0.51953125,
	"learning_rate": 0.00018036529680365298,
	"loss": 1.1357,
	"step": 395
	},
	{
	"epoch": 1.3663535439795047,
	"grad_norm": 0.48046875,
	"learning_rate": 0.00018264840182648402,
	"loss": 1.1451,
	"step": 400
	},
	{
	"epoch": 1.3834329632792486,
	"grad_norm": 0.84375,
	"learning_rate": 0.0001849315068493151,
	"loss": 1.1419,
	"step": 405
	},
	{
	"epoch": 1.4005123825789922,
	"grad_norm": 0.8515625,
	"learning_rate": 0.00018721461187214613,
	"loss": 1.1435,
	"step": 410
	},
	{
	"epoch": 1.4175918018787361,
	"grad_norm": 0.4453125,
	"learning_rate": 0.00018949771689497717,
	"loss": 1.139,
	"step": 415
	},
	{
	"epoch": 1.43467122117848,
	"grad_norm": 0.34375,
	"learning_rate": 0.0001917808219178082,
	"loss": 1.1381,
	"step": 420
	},
	{
	"epoch": 1.4517506404782237,
	"grad_norm": 0.640625,
	"learning_rate": 0.00019406392694063928,
	"loss": 1.1345,
	"step": 425
	},
	{
	"epoch": 1.4688300597779675,
	"grad_norm": 0.6328125,
	"learning_rate": 0.00019634703196347032,
	"loss": 1.1293,
	"step": 430
	},
	{
	"epoch": 1.4859094790777114,
	"grad_norm": 0.6796875,
	"learning_rate": 0.00019863013698630139,
	"loss": 1.1328,
	"step": 435
	},
	{
	"epoch": 1.5029888983774553,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00019999987297289245,
	"loss": 1.1359,
	"step": 440
	},
	{
	"epoch": 1.520068317677199,
	"grad_norm": 0.46875,
	"learning_rate": 0.00019999844392163855,
	"loss": 1.1271,
	"step": 445
	},
	{
	"epoch": 1.5371477369769426,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00019999542705801296,
	"loss": 1.1346,
	"step": 450
	},
	{
	"epoch": 1.5542271562766867,
	"grad_norm": 0.435546875,
	"learning_rate": 0.0001999908224299185,
	"loss": 1.1278,
	"step": 455
	},
	{
	"epoch": 1.5713065755764304,
	"grad_norm": 0.734375,
	"learning_rate": 0.00019998463011046926,
	"loss": 1.1303,
	"step": 460
	},
	{
	"epoch": 1.588385994876174,
	"grad_norm": 0.50390625,
	"learning_rate": 0.00019997685019798912,
	"loss": 1.1288,
	"step": 465
	},
	{
	"epoch": 1.6054654141759181,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00019996748281601038,
	"loss": 1.1171,
	"step": 470
	},
	{
	"epoch": 1.6225448334756618,
	"grad_norm": 0.62109375,
	"learning_rate": 0.00019995652811327186,
	"loss": 1.1199,
	"step": 475
	},
	{
	"epoch": 1.6396242527754057,
	"grad_norm": 0.58203125,
	"learning_rate": 0.00019994398626371643,
	"loss": 1.119,
	"step": 480
	},
	{
	"epoch": 1.6567036720751496,
	"grad_norm": 0.98828125,
	"learning_rate": 0.00019992985746648812,
	"loss": 1.1249,
	"step": 485
	},
	{
	"epoch": 1.6737830913748932,
	"grad_norm": 0.45703125,
	"learning_rate": 0.0001999141419459293,
	"loss": 1.1121,
	"step": 490
	},
	{
	"epoch": 1.690862510674637,
	"grad_norm": 0.5078125,
	"learning_rate": 0.00019989683995157677,
	"loss": 1.1038,
	"step": 495
	},
	{
	"epoch": 1.707941929974381,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00019987795175815807,
	"loss": 1.111,
	"step": 500
	},
	{
	"epoch": 1.7250213492741246,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00019985747766558692,
	"loss": 1.113,
	"step": 505
	},
	{
	"epoch": 1.7421007685738685,
	"grad_norm": 0.396484375,
	"learning_rate": 0.0001998354179989585,
	"loss": 1.1138,
	"step": 510
	},
	{
	"epoch": 1.7591801878736124,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00019981177310854448,
	"loss": 1.1114,
	"step": 515
	},
	{
	"epoch": 1.776259607173356,
	"grad_norm": 0.353515625,
	"learning_rate": 0.0001997865433697871,
	"loss": 1.1114,
	"step": 520
	},
	{
	"epoch": 1.7933390264731,
	"grad_norm": 0.44921875,
	"learning_rate": 0.00019975972918329356,
	"loss": 1.1058,
	"step": 525
	},
	{
	"epoch": 1.8104184457728438,
	"grad_norm": 0.47265625,
	"learning_rate": 0.00019973133097482947,
	"loss": 1.1036,
	"step": 530
	},
	{
	"epoch": 1.8274978650725875,
	"grad_norm": 0.427734375,
	"learning_rate": 0.00019970134919531206,
	"loss": 1.1033,
	"step": 535
	},
	{
	"epoch": 1.8445772843723314,
	"grad_norm": 0.3828125,
	"learning_rate": 0.00019966978432080316,
	"loss": 1.1096,
	"step": 540
	},
	{
	"epoch": 1.8616567036720753,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00019963663685250156,
	"loss": 1.0954,
	"step": 545
	},
	{
	"epoch": 1.878736122971819,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00019960190731673505,
	"loss": 1.1034,
	"step": 550
	},
	{
	"epoch": 1.8958155422715628,
	"grad_norm": 0.52734375,
	"learning_rate": 0.00019956559626495212,
	"loss": 1.1083,
	"step": 555
	},
	{
	"epoch": 1.9128949615713067,
	"grad_norm": 0.375,
	"learning_rate": 0.00019952770427371304,
	"loss": 1.1024,
	"step": 560
	},
	{
	"epoch": 1.9299743808710503,
	"grad_norm": 0.40625,
	"learning_rate": 0.0001994882319446809,
	"loss": 1.1005,
	"step": 565
	},
	{
	"epoch": 1.9470538001707942,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00019944717990461207,
	"loss": 1.1137,
	"step": 570
	},
	{
	"epoch": 1.964133219470538,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00019940454880534598,
	"loss": 1.094,
	"step": 575
	},
	{
	"epoch": 1.9812126387702818,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00019936033932379504,
	"loss": 1.097,
	"step": 580
	},
	{
	"epoch": 1.9982920580700256,
	"grad_norm": 0.55859375,
	"learning_rate": 0.00019931455216193382,
	"loss": 1.0954,
	"step": 585
	},
	{
	"epoch": 1.9982920580700256,
	"eval_loss": 2.4542269706726074,
	"eval_runtime": 0.5521,
	"eval_samples_per_second": 18.112,
	"eval_steps_per_second": 1.811,
	"step": 585
	},
	{
	"epoch": 2.0153714773697695,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00019926718804678785,
	"loss": 1.0848,
	"step": 590
	},
	{
	"epoch": 2.032450896669513,
	"grad_norm": 0.458984375,
	"learning_rate": 0.0001992182477304221,
	"loss": 1.0899,
	"step": 595
	},
	{
	"epoch": 2.049530315969257,
	"grad_norm": 0.375,
	"learning_rate": 0.000199167731989929,
	"loss": 1.0937,
	"step": 600
	},
	{
	"epoch": 2.066609735269001,
	"grad_norm": 0.74609375,
	"learning_rate": 0.00019911564162741633,
	"loss": 1.0866,
	"step": 605
	},
	{
	"epoch": 2.0836891545687446,
	"grad_norm": 0.55078125,
	"learning_rate": 0.00019906197746999408,
	"loss": 1.081,
	"step": 610
	},
	{
	"epoch": 2.1007685738684883,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00019900674036976173,
	"loss": 1.0857,
	"step": 615
	},
	{
	"epoch": 2.1178479931682324,
	"grad_norm": 0.359375,
	"learning_rate": 0.00019894993120379435,
	"loss": 1.0894,
	"step": 620
	},
	{
	"epoch": 2.134927412467976,
	"grad_norm": 0.28515625,
	"learning_rate": 0.000198891550874129,
	"loss": 1.0803,
	"step": 625
	},
	{
	"epoch": 2.1520068317677197,
	"grad_norm": 0.328125,
	"learning_rate": 0.00019883160030775016,
	"loss": 1.0899,
	"step": 630
	},
	{
	"epoch": 2.1690862510674638,
	"grad_norm": 0.423828125,
	"learning_rate": 0.0001987700804565752,
	"loss": 1.0826,
	"step": 635
	},
	{
	"epoch": 2.1861656703672074,
	"grad_norm": 0.447265625,
	"learning_rate": 0.00019870699229743911,
	"loss": 1.0842,
	"step": 640
	},
	{
	"epoch": 2.203245089666951,
	"grad_norm": 2.359375,
	"learning_rate": 0.00019864233683207906,
	"loss": 1.0816,
	"step": 645
	},
	{
	"epoch": 2.220324508966695,
	"grad_norm": 0.37890625,
	"learning_rate": 0.0001985761150871185,
	"loss": 1.0883,
	"step": 650
	},
	{
	"epoch": 2.237403928266439,
	"grad_norm": 0.4296875,
	"learning_rate": 0.00019850832811405087,
	"loss": 1.089,
	"step": 655
	},
	{
	"epoch": 2.2544833475661825,
	"grad_norm": 0.3125,
	"learning_rate": 0.00019843897698922284,
	"loss": 1.0905,
	"step": 660
	},
	{
	"epoch": 2.2715627668659266,
	"grad_norm": 0.376953125,
	"learning_rate": 0.00019836806281381733,
	"loss": 1.0817,
	"step": 665
	},
	{
	"epoch": 2.2886421861656703,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00019829558671383585,
	"loss": 1.0857,
	"step": 670
	},
	{
	"epoch": 2.305721605465414,
	"grad_norm": 0.37890625,
	"learning_rate": 0.00019822154984008088,
	"loss": 1.0824,
	"step": 675
	},
	{
	"epoch": 2.322801024765158,
	"grad_norm": 0.42578125,
	"learning_rate": 0.00019814595336813725,
	"loss": 1.0849,
	"step": 680
	},
	{
	"epoch": 2.3398804440649017,
	"grad_norm": 0.53125,
	"learning_rate": 0.0001980687984983538,
	"loss": 1.0807,
	"step": 685
	},
	{
	"epoch": 2.3569598633646454,
	"grad_norm": 0.578125,
	"learning_rate": 0.0001979900864558242,
	"loss": 1.0732,
	"step": 690
	},
	{
	"epoch": 2.3740392826643895,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00019790981849036746,
	"loss": 1.0855,
	"step": 695
	},
	{
	"epoch": 2.391118701964133,
	"grad_norm": 0.341796875,
	"learning_rate": 0.00019782799587650805,
	"loss": 1.0797,
	"step": 700
	},
	{
	"epoch": 2.408198121263877,
	"grad_norm": 0.341796875,
	"learning_rate": 0.00019774461991345577,
	"loss": 1.0728,
	"step": 705
	},
	{
	"epoch": 2.425277540563621,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00019765969192508508,
	"loss": 1.0805,
	"step": 710
	},
	{
	"epoch": 2.4423569598633645,
	"grad_norm": 0.375,
	"learning_rate": 0.00019757321325991414,
	"loss": 1.074,
	"step": 715
	},
	{
	"epoch": 2.4594363791631086,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00019748518529108316,
	"loss": 1.0794,
	"step": 720
	},
	{
	"epoch": 2.4765157984628523,
	"grad_norm": 0.56640625,
	"learning_rate": 0.00019739560941633294,
	"loss": 1.069,
	"step": 725
	},
	{
	"epoch": 2.493595217762596,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00019730448705798239,
	"loss": 1.0763,
	"step": 730
	},
	{
	"epoch": 2.5106746370623396,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00019721181966290613,
	"loss": 1.0798,
	"step": 735
	},
	{
	"epoch": 2.5277540563620837,
	"grad_norm": 0.67578125,
	"learning_rate": 0.00019711760870251143,
	"loss": 1.075,
	"step": 740
	},
	{
	"epoch": 2.5448334756618274,
	"grad_norm": 0.9296875,
	"learning_rate": 0.00019702185567271486,
	"loss": 1.0775,
	"step": 745
	},
	{
	"epoch": 2.561912894961571,
	"grad_norm": 0.5703125,
	"learning_rate": 0.00019692456209391846,
	"loss": 1.0773,
	"step": 750
	},
	{
	"epoch": 2.578992314261315,
	"grad_norm": 0.443359375,
	"learning_rate": 0.0001968257295109858,
	"loss": 1.0713,
	"step": 755
	},
	{
	"epoch": 2.596071733561059,
	"grad_norm": 0.328125,
	"learning_rate": 0.0001967253594932173,
	"loss": 1.0719,
	"step": 760
	},
	{
	"epoch": 2.6131511528608025,
	"grad_norm": 0.32421875,
	"learning_rate": 0.0001966234536343253,
	"loss": 1.0674,
	"step": 765
	},
	{
	"epoch": 2.6302305721605466,
	"grad_norm": 0.3828125,
	"learning_rate": 0.00019652001355240878,
	"loss": 1.0768,
	"step": 770
	},
	{
	"epoch": 2.6473099914602902,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00019641504088992778,
	"loss": 1.0785,
	"step": 775
	},
	{
	"epoch": 2.664389410760034,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00019630853731367713,
	"loss": 1.0716,
	"step": 780
	},
	{
	"epoch": 2.681468830059778,
	"grad_norm": 0.515625,
	"learning_rate": 0.00019620050451476007,
	"loss": 1.0674,
	"step": 785
	},
	{
	"epoch": 2.6985482493595216,
	"grad_norm": 0.298828125,
	"learning_rate": 0.0001960909442085615,
	"loss": 1.0658,
	"step": 790
	},
	{
	"epoch": 2.7156276686592657,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00019597985813472052,
	"loss": 1.0746,
	"step": 795
	},
	{
	"epoch": 2.7327070879590094,
	"grad_norm": 0.75,
	"learning_rate": 0.00019586724805710306,
	"loss": 1.0696,
	"step": 800
	},
	{
	"epoch": 2.749786507258753,
	"grad_norm": 0.5,
	"learning_rate": 0.00019575311576377366,
	"loss": 1.0695,
	"step": 805
	},
	{
	"epoch": 2.766865926558497,
	"grad_norm": 0.353515625,
	"learning_rate": 0.0001956374630669672,
	"loss": 1.0633,
	"step": 810
	},
	{
	"epoch": 2.783945345858241,
	"grad_norm": 0.330078125,
	"learning_rate": 0.0001955202918030601,
	"loss": 1.069,
	"step": 815
	},
	{
	"epoch": 2.8010247651579845,
	"grad_norm": 0.5,
	"learning_rate": 0.00019540160383254107,
	"loss": 1.0636,
	"step": 820
	},
	{
	"epoch": 2.8181041844577286,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00019528140103998177,
	"loss": 1.0681,
	"step": 825
	},
	{
	"epoch": 2.8351836037574722,
	"grad_norm": 0.375,
	"learning_rate": 0.00019515968533400673,
	"loss": 1.0637,
	"step": 830
	},
	{
	"epoch": 2.852263023057216,
	"grad_norm": 0.32421875,
	"learning_rate": 0.000195036458647263,
	"loss": 1.0677,
	"step": 835
	},
	{
	"epoch": 2.86934244235696,
	"grad_norm": 0.376953125,
	"learning_rate": 0.00019491172293638968,
	"loss": 1.0658,
	"step": 840
	},
	{
	"epoch": 2.8864218616567037,
	"grad_norm": 0.4453125,
	"learning_rate": 0.00019478548018198657,
	"loss": 1.0671,
	"step": 845
	},
	{
	"epoch": 2.9035012809564473,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00019465773238858298,
	"loss": 1.0637,
	"step": 850
	},
	{
	"epoch": 2.9205807002561914,
	"grad_norm": 0.55859375,
	"learning_rate": 0.0001945284815846057,
	"loss": 1.0638,
	"step": 855
	},
	{
	"epoch": 2.937660119555935,
	"grad_norm": 0.357421875,
	"learning_rate": 0.00019439772982234697,
	"loss": 1.0623,
	"step": 860
	},
	{
	"epoch": 2.9547395388556787,
	"grad_norm": 0.39453125,
	"learning_rate": 0.0001942654791779317,
	"loss": 1.0597,
	"step": 865
	},
	{
	"epoch": 2.971818958155423,
	"grad_norm": 0.482421875,
	"learning_rate": 0.00019413173175128473,
	"loss": 1.0629,
	"step": 870
	},
	{
	"epoch": 2.9888983774551665,
	"grad_norm": 0.5390625,
	"learning_rate": 0.0001939964896660972,
	"loss": 1.0621,
	"step": 875
	},
	{
	"epoch": 2.9991460290350127,
	"eval_loss": 2.453326463699341,
	"eval_runtime": 0.5495,
	"eval_samples_per_second": 18.197,
	"eval_steps_per_second": 1.82,
	"step": 878
	},
	{
	"epoch": 3.00597779675491,
	"grad_norm": 0.28515625,
	"learning_rate": 0.0001938597550697932,
	"loss": 1.0542,
	"step": 880
	},
	{
	"epoch": 3.0230572160546543,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00019372153013349523,
	"loss": 1.0482,
	"step": 885
	},
	{
	"epoch": 3.040136635354398,
	"grad_norm": 0.73046875,
	"learning_rate": 0.00019358181705199015,
	"loss": 1.0565,
	"step": 890
	},
	{
	"epoch": 3.0572160546541416,
	"grad_norm": 0.58203125,
	"learning_rate": 0.00019344061804369412,
	"loss": 1.0564,
	"step": 895
	},
	{
	"epoch": 3.0742954739538857,
	"grad_norm": 0.486328125,
	"learning_rate": 0.00019329793535061723,
	"loss": 1.0544,
	"step": 900
	},
	{
	"epoch": 3.0913748932536294,
	"grad_norm": 0.341796875,
	"learning_rate": 0.00019315377123832827,
	"loss": 1.042,
	"step": 905
	},
	{
	"epoch": 3.108454312553373,
	"grad_norm": 0.4453125,
	"learning_rate": 0.00019300812799591846,
	"loss": 1.0552,
	"step": 910
	},
	{
	"epoch": 3.125533731853117,
	"grad_norm": 0.306640625,
	"learning_rate": 0.0001928610079359652,
	"loss": 1.044,
	"step": 915
	},
	{
	"epoch": 3.1426131511528608,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00019271241339449536,
	"loss": 1.0546,
	"step": 920
	},
	{
	"epoch": 3.1596925704526044,
	"grad_norm": 0.412109375,
	"learning_rate": 0.00019256234673094814,
	"loss": 1.0553,
	"step": 925
	},
	{
	"epoch": 3.1767719897523485,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00019241081032813772,
	"loss": 1.0522,
	"step": 930
	},
	{
	"epoch": 3.193851409052092,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00019225780659221523,
	"loss": 1.0579,
	"step": 935
	},
	{
	"epoch": 3.210930828351836,
	"grad_norm": 0.34765625,
	"learning_rate": 0.00019210333795263075,
	"loss": 1.0516,
	"step": 940
	},
	{
	"epoch": 3.22801024765158,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00019194740686209464,
	"loss": 1.0428,
	"step": 945
	},
	{
	"epoch": 3.2450896669513236,
	"grad_norm": 0.61328125,
	"learning_rate": 0.00019179001579653853,
	"loss": 1.045,
	"step": 950
	},
	{
	"epoch": 3.2621690862510673,
	"grad_norm": 0.5390625,
	"learning_rate": 0.00019163116725507619,
	"loss": 1.0534,
	"step": 955
	},
	{
	"epoch": 3.2792485055508114,
	"grad_norm": 0.427734375,
	"learning_rate": 0.0001914708637599636,
	"loss": 1.0463,
	"step": 960
	},
	{
	"epoch": 3.296327924850555,
	"grad_norm": 0.37890625,
	"learning_rate": 0.00019130910785655907,
	"loss": 1.0482,
	"step": 965
	},
	{
	"epoch": 3.313407344150299,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00019114590211328288,
	"loss": 1.0431,
	"step": 970
	},
	{
	"epoch": 3.330486763450043,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00019098124912157632,
	"loss": 1.0487,
	"step": 975
	},
	{
	"epoch": 3.3475661827497865,
	"grad_norm": 0.5625,
	"learning_rate": 0.0001908151514958606,
	"loss": 1.0591,
	"step": 980
	},
	{
	"epoch": 3.3646456020495306,
	"grad_norm": 0.53125,
	"learning_rate": 0.00019064761187349548,
	"loss": 1.0458,
	"step": 985
	},
	{
	"epoch": 3.381725021349274,
	"grad_norm": 0.50390625,
	"learning_rate": 0.00019047863291473717,
	"loss": 1.0488,
	"step": 990
	},
	{
	"epoch": 3.398804440649018,
	"grad_norm": 0.5546875,
	"learning_rate": 0.00019030821730269624,
	"loss": 1.0472,
	"step": 995
	},
	{
	"epoch": 3.415883859948762,
	"grad_norm": 0.921875,
	"learning_rate": 0.00019013636774329495,
	"loss": 1.0506,
	"step": 1000
	},
	{
	"epoch": 3.4329632792485056,
	"grad_norm": 0.74609375,
	"learning_rate": 0.00018996308696522433,
	"loss": 1.0488,
	"step": 1005
	},
	{
	"epoch": 3.4500426985482493,
	"grad_norm": 0.60546875,
	"learning_rate": 0.00018978837771990085,
	"loss": 1.0425,
	"step": 1010
	},
	{
	"epoch": 3.4671221178479934,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00018961224278142268,
	"loss": 1.05,
	"step": 1015
	},
	{
	"epoch": 3.484201537147737,
	"grad_norm": 0.29296875,
	"learning_rate": 0.0001894346849465257,
	"loss": 1.0406,
	"step": 1020
	},
	{
	"epoch": 3.5012809564474807,
	"grad_norm": 0.390625,
	"learning_rate": 0.000189255707034539,
	"loss": 1.0502,
	"step": 1025
	},
	{
	"epoch": 3.518360375747225,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00018907531188734026,
	"loss": 1.0451,
	"step": 1030
	},
	{
	"epoch": 3.5354397950469685,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00018889350236931055,
	"loss": 1.041,
	"step": 1035
	},
	{
	"epoch": 3.552519214346712,
	"grad_norm": 0.404296875,
	"learning_rate": 0.00018871028136728874,
	"loss": 1.04,
	"step": 1040
	},
	{
	"epoch": 3.5695986336464562,
	"grad_norm": 0.466796875,
	"learning_rate": 0.0001885256517905259,
	"loss": 1.0432,
	"step": 1045
	},
	{
	"epoch": 3.5866780529462,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00018833961657063885,
	"loss": 1.0473,
	"step": 1050
	},
	{
	"epoch": 3.6037574722459436,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00018815217866156387,
	"loss": 1.0475,
	"step": 1055
	},
	{
	"epoch": 3.6208368915456877,
	"grad_norm": 0.470703125,
	"learning_rate": 0.0001879633410395095,
	"loss": 1.04,
	"step": 1060
	},
	{
	"epoch": 3.6379163108454313,
	"grad_norm": 0.359375,
	"learning_rate": 0.0001877731067029096,
	"loss": 1.0408,
	"step": 1065
	},
	{
	"epoch": 3.654995730145175,
	"grad_norm": 0.380859375,
	"learning_rate": 0.00018758147867237548,
	"loss": 1.0497,
	"step": 1070
	},
	{
	"epoch": 3.672075149444919,
	"grad_norm": 0.53125,
	"learning_rate": 0.000187388459990648,
	"loss": 1.0388,
	"step": 1075
	},
	{
	"epoch": 3.6891545687446627,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00018719405372254948,
	"loss": 1.0444,
	"step": 1080
	},
	{
	"epoch": 3.7062339880444064,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00018699826295493462,
	"loss": 1.0355,
	"step": 1085
	},
	{
	"epoch": 3.7233134073441505,
	"grad_norm": 0.328125,
	"learning_rate": 0.00018680109079664188,
	"loss": 1.044,
	"step": 1090
	},
	{
	"epoch": 3.740392826643894,
	"grad_norm": 0.490234375,
	"learning_rate": 0.00018660254037844388,
	"loss": 1.0379,
	"step": 1095
	},
	{
	"epoch": 3.757472245943638,
	"grad_norm": 0.388671875,
	"learning_rate": 0.0001864026148529978,
	"loss": 1.0523,
	"step": 1100
	},
	{
	"epoch": 3.774551665243382,
	"grad_norm": 0.291015625,
	"learning_rate": 0.00018620131739479525,
	"loss": 1.0454,
	"step": 1105
	},
	{
	"epoch": 3.7916310845431256,
	"grad_norm": 0.4140625,
	"learning_rate": 0.00018599865120011192,
	"loss": 1.039,
	"step": 1110
	},
	{
	"epoch": 3.8087105038428692,
	"grad_norm": 0.46484375,
	"learning_rate": 0.0001857946194869568,
	"loss": 1.0452,
	"step": 1115
	},
	{
	"epoch": 3.8257899231426133,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00018558922549502107,
	"loss": 1.0444,
	"step": 1120
	},
	{
	"epoch": 3.842869342442357,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00018538247248562674,
	"loss": 1.0351,
	"step": 1125
	},
	{
	"epoch": 3.8599487617421007,
	"grad_norm": 0.357421875,
	"learning_rate": 0.0001851743637416747,
	"loss": 1.041,
	"step": 1130
	},
	{
	"epoch": 3.8770281810418448,
	"grad_norm": 0.42578125,
	"learning_rate": 0.00018496490256759277,
	"loss": 1.0364,
	"step": 1135
	},
	{
	"epoch": 3.8941076003415884,
	"grad_norm": 0.4609375,
	"learning_rate": 0.00018475409228928312,
	"loss": 1.0476,
	"step": 1140
	},
	{
	"epoch": 3.911187019641332,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00018454193625406956,
	"loss": 1.0376,
	"step": 1145
	},
	{
	"epoch": 3.928266438941076,
	"grad_norm": 0.439453125,
	"learning_rate": 0.00018432843783064429,
	"loss": 1.0342,
	"step": 1150
	},
	{
	"epoch": 3.94534585824082,
	"grad_norm": 0.30859375,
	"learning_rate": 0.0001841136004090144,
	"loss": 1.0422,
	"step": 1155
	},
	{
	"epoch": 3.9624252775405635,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00018389742740044813,
	"loss": 1.0393,
	"step": 1160
	},
	{
	"epoch": 3.9795046968403076,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00018367992223742067,
	"loss": 1.0371,
	"step": 1165
	},
	{
	"epoch": 3.9965841161400513,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00018346108837355972,
	"loss": 1.0523,
	"step": 1170
	},
	{
	"epoch": 4.0,
	"eval_loss": 2.4546658992767334,
	"eval_runtime": 0.5459,
	"eval_samples_per_second": 18.317,
	"eval_steps_per_second": 1.832,
	"step": 1171
	},
	{
	"epoch": 4.013663535439795,
	"grad_norm": 0.48046875,
	"learning_rate": 0.00018324092928359041,
	"loss": 1.0323,
	"step": 1175
	},
	{
	"epoch": 4.030742954739539,
	"grad_norm": 0.41015625,
	"learning_rate": 0.00018301944846328049,
	"loss": 1.0219,
	"step": 1180
	},
	{
	"epoch": 4.047822374039282,
	"grad_norm": 0.3984375,
	"learning_rate": 0.00018279664942938447,
	"loss": 1.0262,
	"step": 1185
	},
	{
	"epoch": 4.064901793339026,
	"grad_norm": 0.314453125,
	"learning_rate": 0.0001825725357195881,
	"loss": 1.0191,
	"step": 1190
	},
	{
	"epoch": 4.0819812126387705,
	"grad_norm": 0.4375,
	"learning_rate": 0.0001823471108924519,
	"loss": 1.0331,
	"step": 1195
	},
	{
	"epoch": 4.099060631938514,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00018212037852735486,
	"loss": 1.0269,
	"step": 1200
	},
	{
	"epoch": 4.116140051238258,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00018189234222443763,
	"loss": 1.0282,
	"step": 1205
	},
	{
	"epoch": 4.133219470538002,
	"grad_norm": 0.287109375,
	"learning_rate": 0.0001816630056045451,
	"loss": 1.027,
	"step": 1210
	},
	{
	"epoch": 4.150298889837745,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001814323723091692,
	"loss": 1.026,
	"step": 1215
	},
	{
	"epoch": 4.167378309137489,
	"grad_norm": 0.333984375,
	"learning_rate": 0.0001812004460003909,
	"loss": 1.0228,
	"step": 1220
	},
	{
	"epoch": 4.184457728437233,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00018096723036082214,
	"loss": 1.0319,
	"step": 1225
	},
	{
	"epoch": 4.2015371477369765,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00018073272909354727,
	"loss": 1.027,
	"step": 1230
	},
	{
	"epoch": 4.218616567036721,
	"grad_norm": 0.5703125,
	"learning_rate": 0.0001804969459220644,
	"loss": 1.0217,
	"step": 1235
	},
	{
	"epoch": 4.235695986336465,
	"grad_norm": 0.462890625,
	"learning_rate": 0.0001802598845902262,
	"loss": 1.0332,
	"step": 1240
	},
	{
	"epoch": 4.252775405636209,
	"grad_norm": 0.421875,
	"learning_rate": 0.00018002154886218033,
	"loss": 1.0293,
	"step": 1245
	},
	{
	"epoch": 4.269854824935952,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00017978194252230985,
	"loss": 1.0259,
	"step": 1250
	},
	{
	"epoch": 4.286934244235696,
	"grad_norm": 0.2890625,
	"learning_rate": 0.00017954106937517316,
	"loss": 1.0222,
	"step": 1255
	},
	{
	"epoch": 4.304013663535439,
	"grad_norm": 0.271484375,
	"learning_rate": 0.00017929893324544332,
	"loss": 1.0259,
	"step": 1260
	},
	{
	"epoch": 4.3210930828351835,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00017905553797784759,
	"loss": 1.0195,
	"step": 1265
	},
	{
	"epoch": 4.3381725021349276,
	"grad_norm": 0.296875,
	"learning_rate": 0.0001788108874371063,
	"loss": 1.0139,
	"step": 1270
	},
	{
	"epoch": 4.355251921434672,
	"grad_norm": 0.34375,
	"learning_rate": 0.00017856498550787144,
	"loss": 1.0215,
	"step": 1275
	},
	{
	"epoch": 4.372331340734415,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00017831783609466504,
	"loss": 1.0332,
	"step": 1280
	},
	{
	"epoch": 4.389410760034159,
	"grad_norm": 0.2734375,
	"learning_rate": 0.0001780694431218171,
	"loss": 1.0242,
	"step": 1285
	},
	{
	"epoch": 4.406490179333902,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00017781981053340337,
	"loss": 1.0262,
	"step": 1290
	},
	{
	"epoch": 4.423569598633646,
	"grad_norm": 0.328125,
	"learning_rate": 0.00017756894229318263,
	"loss": 1.0323,
	"step": 1295
	},
	{
	"epoch": 4.44064901793339,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00017731684238453385,
	"loss": 1.0234,
	"step": 1300
	},
	{
	"epoch": 4.4577284372331345,
	"grad_norm": 0.341796875,
	"learning_rate": 0.00017706351481039284,
	"loss": 1.0224,
	"step": 1305
	},
	{
	"epoch": 4.474807856532878,
	"grad_norm": 0.412109375,
	"learning_rate": 0.0001768089635931887,
	"loss": 1.0277,
	"step": 1310
	},
	{
	"epoch": 4.491887275832622,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00017655319277478016,
	"loss": 1.0228,
	"step": 1315
	},
	{
	"epoch": 4.508966695132365,
	"grad_norm": 0.5390625,
	"learning_rate": 0.00017629620641639103,
	"loss": 1.028,
	"step": 1320
	},
	{
	"epoch": 4.526046114432109,
	"grad_norm": 0.4296875,
	"learning_rate": 0.000176038008598546,
	"loss": 1.0412,
	"step": 1325
	},
	{
	"epoch": 4.543125533731853,
	"grad_norm": 0.451171875,
	"learning_rate": 0.00017577860342100579,
	"loss": 1.0253,
	"step": 1330
	},
	{
	"epoch": 4.560204953031597,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00017551799500270198,
	"loss": 1.0233,
	"step": 1335
	},
	{
	"epoch": 4.577284372331341,
	"grad_norm": 0.369140625,
	"learning_rate": 0.0001752561874816717,
	"loss": 1.0259,
	"step": 1340
	},
	{
	"epoch": 4.594363791631085,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00017499318501499177,
	"loss": 1.0265,
	"step": 1345
	},
	{
	"epoch": 4.611443210930828,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00017472899177871297,
	"loss": 1.0229,
	"step": 1350
	},
	{
	"epoch": 4.628522630230572,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00017446361196779342,
	"loss": 1.0194,
	"step": 1355
	},
	{
	"epoch": 4.645602049530316,
	"grad_norm": 0.5703125,
	"learning_rate": 0.00017419704979603214,
	"loss": 1.0261,
	"step": 1360
	},
	{
	"epoch": 4.66268146883006,
	"grad_norm": 0.458984375,
	"learning_rate": 0.00017392930949600217,
	"loss": 1.0226,
	"step": 1365
	},
	{
	"epoch": 4.679760888129803,
	"grad_norm": 0.65625,
	"learning_rate": 0.00017366039531898326,
	"loss": 1.0319,
	"step": 1370
	},
	{
	"epoch": 4.6968403074295475,
	"grad_norm": 0.59765625,
	"learning_rate": 0.00017339031153489444,
	"loss": 1.0249,
	"step": 1375
	},
	{
	"epoch": 4.713919726729291,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00017311906243222614,
	"loss": 1.0244,
	"step": 1380
	},
	{
	"epoch": 4.730999146029035,
	"grad_norm": 0.466796875,
	"learning_rate": 0.00017284665231797223,
	"loss": 1.0273,
	"step": 1385
	},
	{
	"epoch": 4.748078565328779,
	"grad_norm": 0.435546875,
	"learning_rate": 0.0001725730855175615,
	"loss": 1.0294,
	"step": 1390
	},
	{
	"epoch": 4.765157984628523,
	"grad_norm": 0.439453125,
	"learning_rate": 0.00017229836637478902,
	"loss": 1.0283,
	"step": 1395
	},
	{
	"epoch": 4.782237403928266,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00017202249925174723,
	"loss": 1.0295,
	"step": 1400
	},
	{
	"epoch": 4.79931682322801,
	"grad_norm": 0.2734375,
	"learning_rate": 0.0001717454885287566,
	"loss": 1.0252,
	"step": 1405
	},
	{
	"epoch": 4.816396242527754,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00017146733860429612,
	"loss": 1.0219,
	"step": 1410
	},
	{
	"epoch": 4.833475661827498,
	"grad_norm": 0.34765625,
	"learning_rate": 0.0001711880538949334,
	"loss": 1.0245,
	"step": 1415
	},
	{
	"epoch": 4.850555081127242,
	"grad_norm": 0.314453125,
	"learning_rate": 0.0001709076388352546,
	"loss": 1.0266,
	"step": 1420
	},
	{
	"epoch": 4.867634500426986,
	"grad_norm": 0.392578125,
	"learning_rate": 0.00017062609787779403,
	"loss": 1.0193,
	"step": 1425
	},
	{
	"epoch": 4.884713919726729,
	"grad_norm": 0.53125,
	"learning_rate": 0.00017034343549296346,
	"loss": 1.024,
	"step": 1430
	},
	{
	"epoch": 4.901793339026473,
	"grad_norm": 0.412109375,
	"learning_rate": 0.00017005965616898096,
	"loss": 1.0272,
	"step": 1435
	},
	{
	"epoch": 4.918872758326217,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00016977476441179992,
	"loss": 1.0212,
	"step": 1440
	},
	{
	"epoch": 4.9359521776259605,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00016948876474503726,
	"loss": 1.0268,
	"step": 1445
	},
	{
	"epoch": 4.953031596925705,
	"grad_norm": 0.3125,
	"learning_rate": 0.0001692016617099018,
	"loss": 1.0238,
	"step": 1450
	},
	{
	"epoch": 4.970111016225449,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001689134598651219,
	"loss": 1.0161,
	"step": 1455
	},
	{
	"epoch": 4.987190435525192,
	"grad_norm": 0.408203125,
	"learning_rate": 0.0001686241637868734,
	"loss": 1.0188,
	"step": 1460
	},
	{
	"epoch": 4.997438087105039,
	"eval_loss": 2.4524176120758057,
	"eval_runtime": 0.5495,
	"eval_samples_per_second": 18.198,
	"eval_steps_per_second": 1.82,
	"step": 1463
	},
	{
	"epoch": 5.004269854824936,
	"grad_norm": 0.55859375,
	"learning_rate": 0.0001683337780687066,
	"loss": 1.0219,
	"step": 1465
	},
	{
	"epoch": 5.02134927412468,
	"grad_norm": 0.30078125,
	"learning_rate": 0.0001680423073214737,
	"loss": 1.0173,
	"step": 1470
	},
	{
	"epoch": 5.038428693424423,
	"grad_norm": 0.376953125,
	"learning_rate": 0.00016774975617325527,
	"loss": 1.0036,
	"step": 1475
	},
	{
	"epoch": 5.0555081127241674,
	"grad_norm": 0.4453125,
	"learning_rate": 0.00016745612926928694,
	"loss": 1.0119,
	"step": 1480
	},
	{
	"epoch": 5.0725875320239115,
	"grad_norm": 0.40234375,
	"learning_rate": 0.00016716143127188548,
	"loss": 1.0061,
	"step": 1485
	},
	{
	"epoch": 5.089666951323655,
	"grad_norm": 0.396484375,
	"learning_rate": 0.0001668656668603751,
	"loss": 1.0114,
	"step": 1490
	},
	{
	"epoch": 5.106746370623399,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00016656884073101266,
	"loss": 1.0145,
	"step": 1495
	},
	{
	"epoch": 5.123825789923143,
	"grad_norm": 0.287109375,
	"learning_rate": 0.00016627095759691362,
	"loss": 1.0101,
	"step": 1500
	},
	{
	"epoch": 5.140905209222886,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00016597202218797676,
	"loss": 1.0081,
	"step": 1505
	},
	{
	"epoch": 5.15798462852263,
	"grad_norm": 0.3359375,
	"learning_rate": 0.0001656720392508094,
	"loss": 1.0066,
	"step": 1510
	},
	{
	"epoch": 5.175064047822374,
	"grad_norm": 0.390625,
	"learning_rate": 0.0001653710135486518,
	"loss": 1.0109,
	"step": 1515
	},
	{
	"epoch": 5.192143467122118,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00016506894986130171,
	"loss": 1.007,
	"step": 1520
	},
	{
	"epoch": 5.209222886421862,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00016476585298503835,
	"loss": 1.0113,
	"step": 1525
	},
	{
	"epoch": 5.226302305721606,
	"grad_norm": 0.43359375,
	"learning_rate": 0.00016446172773254629,
	"loss": 1.0123,
	"step": 1530
	},
	{
	"epoch": 5.243381725021349,
	"grad_norm": 0.328125,
	"learning_rate": 0.0001641565789328391,
	"loss": 1.0168,
	"step": 1535
	},
	{
	"epoch": 5.260461144321093,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00016385041143118255,
	"loss": 1.0116,
	"step": 1540
	},
	{
	"epoch": 5.277540563620837,
	"grad_norm": 0.38671875,
	"learning_rate": 0.00016354323008901776,
	"loss": 1.0098,
	"step": 1545
	},
	{
	"epoch": 5.2946199829205804,
	"grad_norm": 0.388671875,
	"learning_rate": 0.000163235039783884,
	"loss": 1.0168,
	"step": 1550
	},
	{
	"epoch": 5.3116994022203246,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00016292584540934113,
	"loss": 1.007,
	"step": 1555
	},
	{
	"epoch": 5.328778821520069,
	"grad_norm": 0.3984375,
	"learning_rate": 0.0001626156518748922,
	"loss": 1.0133,
	"step": 1560
	},
	{
	"epoch": 5.345858240819812,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00016230446410590504,
	"loss": 1.0106,
	"step": 1565
	},
	{
	"epoch": 5.362937660119556,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00016199228704353455,
	"loss": 1.0024,
	"step": 1570
	},
	{
	"epoch": 5.3800170794193,
	"grad_norm": 0.40234375,
	"learning_rate": 0.00016167912564464383,
	"loss": 1.0121,
	"step": 1575
	},
	{
	"epoch": 5.397096498719043,
	"grad_norm": 0.404296875,
	"learning_rate": 0.00016136498488172568,
	"loss": 1.0089,
	"step": 1580
	},
	{
	"epoch": 5.414175918018787,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00016104986974282363,
	"loss": 1.0157,
	"step": 1585
	},
	{
	"epoch": 5.4312553373185315,
	"grad_norm": 0.29296875,
	"learning_rate": 0.0001607337852314527,
	"loss": 1.0051,
	"step": 1590
	},
	{
	"epoch": 5.448334756618275,
	"grad_norm": 0.390625,
	"learning_rate": 0.00016041673636651996,
	"loss": 1.0094,
	"step": 1595
	},
	{
	"epoch": 5.465414175918019,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00016009872818224485,
	"loss": 1.0125,
	"step": 1600
	},
	{
	"epoch": 5.482493595217763,
	"grad_norm": 0.28515625,
	"learning_rate": 0.0001597797657280792,
	"loss": 1.014,
	"step": 1605
	},
	{
	"epoch": 5.499573014517506,
	"grad_norm": 0.302734375,
	"learning_rate": 0.00015945985406862721,
	"loss": 1.0154,
	"step": 1610
	},
	{
	"epoch": 5.51665243381725,
	"grad_norm": 0.365234375,
	"learning_rate": 0.00015913899828356477,
	"loss": 1.0122,
	"step": 1615
	},
	{
	"epoch": 5.533731853116994,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00015881720346755905,
	"loss": 1.0133,
	"step": 1620
	},
	{
	"epoch": 5.5508112724167376,
	"grad_norm": 0.330078125,
	"learning_rate": 0.0001584944747301874,
	"loss": 1.0087,
	"step": 1625
	},
	{
	"epoch": 5.567890691716482,
	"grad_norm": 0.3203125,
	"learning_rate": 0.00015817081719585643,
	"loss": 1.0101,
	"step": 1630
	},
	{
	"epoch": 5.584970111016226,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00015784623600372042,
	"loss": 1.0118,
	"step": 1635
	},
	{
	"epoch": 5.602049530315969,
	"grad_norm": 0.48828125,
	"learning_rate": 0.00015752073630759998,
	"loss": 1.0133,
	"step": 1640
	},
	{
	"epoch": 5.619128949615713,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00015719432327589988,
	"loss": 1.0089,
	"step": 1645
	},
	{
	"epoch": 5.636208368915457,
	"grad_norm": 0.6171875,
	"learning_rate": 0.00015686700209152738,
	"loss": 1.007,
	"step": 1650
	},
	{
	"epoch": 5.6532877882152,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00015653877795180954,
	"loss": 1.0031,
	"step": 1655
	},
	{
	"epoch": 5.6703672075149445,
	"grad_norm": 0.40234375,
	"learning_rate": 0.00015620965606841098,
	"loss": 1.0099,
	"step": 1660
	},
	{
	"epoch": 5.687446626814689,
	"grad_norm": 0.41796875,
	"learning_rate": 0.00015587964166725095,
	"loss": 1.0127,
	"step": 1665
	},
	{
	"epoch": 5.704526046114432,
	"grad_norm": 0.423828125,
	"learning_rate": 0.0001555487399884206,
	"loss": 1.0049,
	"step": 1670
	},
	{
	"epoch": 5.721605465414176,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00015521695628609937,
	"loss": 1.0036,
	"step": 1675
	},
	{
	"epoch": 5.73868488471392,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00015488429582847192,
	"loss": 1.012,
	"step": 1680
	},
	{
	"epoch": 5.755764304013663,
	"grad_norm": 0.458984375,
	"learning_rate": 0.00015455076389764443,
	"loss": 1.0099,
	"step": 1685
	},
	{
	"epoch": 5.772843723313407,
	"grad_norm": 0.380859375,
	"learning_rate": 0.0001542163657895605,
	"loss": 1.0144,
	"step": 1690
	},
	{
	"epoch": 5.789923142613151,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00015388110681391725,
	"loss": 1.0082,
	"step": 1695
	},
	{
	"epoch": 5.807002561912895,
	"grad_norm": 0.328125,
	"learning_rate": 0.00015354499229408114,
	"loss": 1.013,
	"step": 1700
	},
	{
	"epoch": 5.824081981212639,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00015320802756700302,
	"loss": 1.0089,
	"step": 1705
	},
	{
	"epoch": 5.841161400512383,
	"grad_norm": 0.31640625,
	"learning_rate": 0.0001528702179831338,
	"loss": 1.0117,
	"step": 1710
	},
	{
	"epoch": 5.858240819812126,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00015253156890633935,
	"loss": 1.0087,
	"step": 1715
	},
	{
	"epoch": 5.87532023911187,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00015219208571381525,
	"loss": 1.0161,
	"step": 1720
	},
	{
	"epoch": 5.892399658411614,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00015185177379600152,
	"loss": 1.0109,
	"step": 1725
	},
	{
	"epoch": 5.9094790777113575,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00015151063855649698,
	"loss": 1.0131,
	"step": 1730
	},
	{
	"epoch": 5.926558497011102,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00015116868541197343,
	"loss": 1.0118,
	"step": 1735
	},
	{
	"epoch": 5.943637916310846,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00015082591979208976,
	"loss": 1.0126,
	"step": 1740
	},
	{
	"epoch": 5.960717335610589,
	"grad_norm": 0.365234375,
	"learning_rate": 0.0001504823471394055,
	"loss": 1.0065,
	"step": 1745
	},
	{
	"epoch": 5.977796754910333,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00015013797290929466,
	"loss": 1.0095,
	"step": 1750
	},
	{
	"epoch": 5.994876174210077,
	"grad_norm": 0.4609375,
	"learning_rate": 0.000149792802569859,
	"loss": 1.0119,
	"step": 1755
	},
	{
	"epoch": 5.998292058070025,
	"eval_loss": 2.454439640045166,
	"eval_runtime": 0.5526,
	"eval_samples_per_second": 18.096,
	"eval_steps_per_second": 1.81,
	"step": 1756
	},
	{
	"epoch": 6.01195559350982,
	"grad_norm": 0.4765625,
	"learning_rate": 0.00014944684160184108,
	"loss": 1.001,
	"step": 1760
	},
	{
	"epoch": 6.029035012809564,
	"grad_norm": 0.423828125,
	"learning_rate": 0.00014910009549853746,
	"loss": 0.9916,
	"step": 1765
	},
	{
	"epoch": 6.0461144321093085,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00014875256976571135,
	"loss": 0.9965,
	"step": 1770
	},
	{
	"epoch": 6.063193851409052,
	"grad_norm": 0.458984375,
	"learning_rate": 0.0001484042699215052,
	"loss": 1.009,
	"step": 1775
	},
	{
	"epoch": 6.080273270708796,
	"grad_norm": 0.34765625,
	"learning_rate": 0.00014805520149635307,
	"loss": 1.001,
	"step": 1780
	},
	{
	"epoch": 6.09735269000854,
	"grad_norm": 0.359375,
	"learning_rate": 0.0001477053700328929,
	"loss": 0.9998,
	"step": 1785
	},
	{
	"epoch": 6.114432109308283,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00014735478108587828,
	"loss": 1.008,
	"step": 1790
	},
	{
	"epoch": 6.131511528608027,
	"grad_norm": 0.322265625,
	"learning_rate": 0.0001470034402220906,
	"loss": 0.9872,
	"step": 1795
	},
	{
	"epoch": 6.148590947907771,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00014665135302025035,
	"loss": 0.9968,
	"step": 1800
	},
	{
	"epoch": 6.165670367207515,
	"grad_norm": 0.34375,
	"learning_rate": 0.00014629852507092866,
	"loss": 0.996,
	"step": 1805
	},
	{
	"epoch": 6.182749786507259,
	"grad_norm": 0.333984375,
	"learning_rate": 0.00014594496197645852,
	"loss": 0.9986,
	"step": 1810
	},
	{
	"epoch": 6.199829205807003,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00014559066935084588,
	"loss": 0.9966,
	"step": 1815
	},
	{
	"epoch": 6.216908625106746,
	"grad_norm": 0.341796875,
	"learning_rate": 0.0001452356528196804,
	"loss": 1.002,
	"step": 1820
	},
	{
	"epoch": 6.23398804440649,
	"grad_norm": 0.33984375,
	"learning_rate": 0.00014487991802004623,
	"loss": 1.0061,
	"step": 1825
	},
	{
	"epoch": 6.251067463706234,
	"grad_norm": 0.29296875,
	"learning_rate": 0.00014452347060043237,
	"loss": 1.0019,
	"step": 1830
	},
	{
	"epoch": 6.268146883005977,
	"grad_norm": 0.404296875,
	"learning_rate": 0.00014416631622064316,
	"loss": 0.9955,
	"step": 1835
	},
	{
	"epoch": 6.2852263023057215,
	"grad_norm": 0.94921875,
	"learning_rate": 0.00014380846055170828,
	"loss": 0.9979,
	"step": 1840
	},
	{
	"epoch": 6.302305721605466,
	"grad_norm": 0.328125,
	"learning_rate": 0.00014344990927579268,
	"loss": 1.0024,
	"step": 1845
	},
	{
	"epoch": 6.319385140905209,
	"grad_norm": 0.314453125,
	"learning_rate": 0.00014309066808610655,
	"loss": 1.0009,
	"step": 1850
	},
	{
	"epoch": 6.336464560204953,
	"grad_norm": 0.486328125,
	"learning_rate": 0.00014273074268681462,
	"loss": 1.0039,
	"step": 1855
	},
	{
	"epoch": 6.353543979504697,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001423701387929459,
	"loss": 0.9994,
	"step": 1860
	},
	{
	"epoch": 6.37062339880444,
	"grad_norm": 0.283203125,
	"learning_rate": 0.0001420088621303027,
	"loss": 1.0076,
	"step": 1865
	},
	{
	"epoch": 6.387702818104184,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00014164691843536982,
	"loss": 1.0011,
	"step": 1870
	},
	{
	"epoch": 6.4047822374039285,
	"grad_norm": 0.345703125,
	"learning_rate": 0.0001412843134552235,
	"loss": 0.9996,
	"step": 1875
	},
	{
	"epoch": 6.421861656703672,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00014092105294744,
	"loss": 0.9984,
	"step": 1880
	},
	{
	"epoch": 6.438941076003416,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00014055714268000445,
	"loss": 0.9947,
	"step": 1885
	},
	{
	"epoch": 6.45602049530316,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00014019258843121893,
	"loss": 1.0061,
	"step": 1890
	},
	{
	"epoch": 6.473099914602903,
	"grad_norm": 0.330078125,
	"learning_rate": 0.000139827395989611,
	"loss": 0.9986,
	"step": 1895
	},
	{
	"epoch": 6.490179333902647,
	"grad_norm": 0.357421875,
	"learning_rate": 0.0001394615711538417,
	"loss": 1.0003,
	"step": 1900
	},
	{
	"epoch": 6.507258753202391,
	"grad_norm": 0.353515625,
	"learning_rate": 0.0001390951197326134,
	"loss": 0.9985,
	"step": 1905
	},
	{
	"epoch": 6.5243381725021345,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00013872804754457759,
	"loss": 1.0071,
	"step": 1910
	},
	{
	"epoch": 6.541417591801879,
	"grad_norm": 0.32421875,
	"learning_rate": 0.00013836036041824264,
	"loss": 0.9921,
	"step": 1915
	},
	{
	"epoch": 6.558497011101623,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00013799206419188103,
	"loss": 1.0062,
	"step": 1920
	},
	{
	"epoch": 6.575576430401366,
	"grad_norm": 0.322265625,
	"learning_rate": 0.0001376231647134369,
	"loss": 1.0059,
	"step": 1925
	},
	{
	"epoch": 6.59265584970111,
	"grad_norm": 0.392578125,
	"learning_rate": 0.00013725366784043288,
	"loss": 1.0001,
	"step": 1930
	},
	{
	"epoch": 6.609735269000854,
	"grad_norm": 0.3515625,
	"learning_rate": 0.00013688357943987732,
	"loss": 0.9975,
	"step": 1935
	},
	{
	"epoch": 6.626814688300598,
	"grad_norm": 0.490234375,
	"learning_rate": 0.00013651290538817113,
	"loss": 1.0012,
	"step": 1940
	},
	{
	"epoch": 6.6438941076003415,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00013614165157101423,
	"loss": 0.9949,
	"step": 1945
	},
	{
	"epoch": 6.660973526900086,
	"grad_norm": 0.318359375,
	"learning_rate": 0.0001357698238833126,
	"loss": 1.0088,
	"step": 1950
	},
	{
	"epoch": 6.678052946199829,
	"grad_norm": 0.267578125,
	"learning_rate": 0.0001353974282290839,
	"loss": 0.9965,
	"step": 1955
	},
	{
	"epoch": 6.695132365499573,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00013502447052136455,
	"loss": 1.0063,
	"step": 1960
	},
	{
	"epoch": 6.712211784799317,
	"grad_norm": 0.3515625,
	"learning_rate": 0.0001346509566821153,
	"loss": 0.999,
	"step": 1965
	},
	{
	"epoch": 6.729291204099061,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00013427689264212738,
	"loss": 0.9998,
	"step": 1970
	},
	{
	"epoch": 6.746370623398804,
	"grad_norm": 0.57421875,
	"learning_rate": 0.00013390228434092833,
	"loss": 0.9977,
	"step": 1975
	},
	{
	"epoch": 6.763450042698548,
	"grad_norm": 0.337890625,
	"learning_rate": 0.00013352713772668765,
	"loss": 0.9991,
	"step": 1980
	},
	{
	"epoch": 6.780529461998292,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00013315145875612236,
	"loss": 0.9939,
	"step": 1985
	},
	{
	"epoch": 6.797608881298036,
	"grad_norm": 0.388671875,
	"learning_rate": 0.0001327752533944025,
	"loss": 0.9993,
	"step": 1990
	},
	{
	"epoch": 6.81468830059778,
	"grad_norm": 0.400390625,
	"learning_rate": 0.00013239852761505626,
	"loss": 1.0079,
	"step": 1995
	},
	{
	"epoch": 6.831767719897524,
	"grad_norm": 0.396484375,
	"learning_rate": 0.00013202128739987532,
	"loss": 0.9962,
	"step": 2000
	},
	{
	"epoch": 6.848847139197267,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00013164353873881961,
	"loss": 0.9982,
	"step": 2005
	},
	{
	"epoch": 6.865926558497011,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00013126528762992247,
	"loss": 0.9959,
	"step": 2010
	},
	{
	"epoch": 6.8830059777967545,
	"grad_norm": 0.369140625,
	"learning_rate": 0.0001308865400791953,
	"loss": 1.0043,
	"step": 2015
	},
	{
	"epoch": 6.900085397096499,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001305073021005321,
	"loss": 0.9966,
	"step": 2020
	},
	{
	"epoch": 6.917164816396243,
	"grad_norm": 0.453125,
	"learning_rate": 0.00013012757971561415,
	"loss": 0.9953,
	"step": 2025
	},
	{
	"epoch": 6.934244235695987,
	"grad_norm": 0.435546875,
	"learning_rate": 0.0001297473789538142,
	"loss": 0.9957,
	"step": 2030
	},
	{
	"epoch": 6.95132365499573,
	"grad_norm": 0.453125,
	"learning_rate": 0.00012936670585210103,
	"loss": 1.0004,
	"step": 2035
	},
	{
	"epoch": 6.968403074295474,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00012898556645494325,
	"loss": 0.9952,
	"step": 2040
	},
	{
	"epoch": 6.985482493595217,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00012860396681421354,
	"loss": 1.0028,
	"step": 2045
	},
	{
	"epoch": 6.999146029035013,
	"eval_loss": 2.4654781818389893,
	"eval_runtime": 0.5548,
	"eval_samples_per_second": 18.024,
	"eval_steps_per_second": 1.802,
	"step": 2049
	},
	{
	"epoch": 7.002561912894961,
	"grad_norm": 0.34765625,
	"learning_rate": 0.0001282219129890925,
	"loss": 0.9919,
	"step": 2050
	},
	{
	"epoch": 7.0196413321947055,
	"grad_norm": 0.2890625,
	"learning_rate": 0.0001278394110459724,
	"loss": 0.9953,
	"step": 2055
	},
	{
	"epoch": 7.036720751494449,
	"grad_norm": 0.3359375,
	"learning_rate": 0.00012745646705836097,
	"loss": 0.9879,
	"step": 2060
	},
	{
	"epoch": 7.053800170794193,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00012707308710678477,
	"loss": 0.9939,
	"step": 2065
	},
	{
	"epoch": 7.070879590093937,
	"grad_norm": 0.37890625,
	"learning_rate": 0.0001266892772786929,
	"loss": 0.9936,
	"step": 2070
	},
	{
	"epoch": 7.08795900939368,
	"grad_norm": 0.373046875,
	"learning_rate": 0.00012630504366836008,
	"loss": 0.987,
	"step": 2075
	},
	{
	"epoch": 7.105038428693424,
	"grad_norm": 0.3046875,
	"learning_rate": 0.0001259203923767901,
	"loss": 0.9919,
	"step": 2080
	},
	{
	"epoch": 7.122117847993168,
	"grad_norm": 0.318359375,
	"learning_rate": 0.0001255353295116187,
	"loss": 0.9909,
	"step": 2085
	},
	{
	"epoch": 7.1391972672929125,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00012514986118701695,
	"loss": 0.9868,
	"step": 2090
	},
	{
	"epoch": 7.156276686592656,
	"grad_norm": 0.27734375,
	"learning_rate": 0.00012476399352359376,
	"loss": 0.9881,
	"step": 2095
	},
	{
	"epoch": 7.1733561058924,
	"grad_norm": 0.3515625,
	"learning_rate": 0.00012437773264829897,
	"loss": 0.9939,
	"step": 2100
	},
	{
	"epoch": 7.190435525192143,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00012399108469432601,
	"loss": 0.9834,
	"step": 2105
	},
	{
	"epoch": 7.207514944491887,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00012360405580101448,
	"loss": 0.9847,
	"step": 2110
	},
	{
	"epoch": 7.224594363791631,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00012321665211375256,
	"loss": 0.9945,
	"step": 2115
	},
	{
	"epoch": 7.241673783091375,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00012282887978387976,
	"loss": 0.999,
	"step": 2120
	},
	{
	"epoch": 7.2587532023911185,
	"grad_norm": 0.310546875,
	"learning_rate": 0.00012244074496858888,
	"loss": 0.9854,
	"step": 2125
	},
	{
	"epoch": 7.275832621690863,
	"grad_norm": 0.3125,
	"learning_rate": 0.00012205225383082843,
	"loss": 0.9842,
	"step": 2130
	},
	{
	"epoch": 7.292912040990606,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00012166341253920472,
	"loss": 0.9928,
	"step": 2135
	},
	{
	"epoch": 7.30999146029035,
	"grad_norm": 0.318359375,
	"learning_rate": 0.000121274227267884,
	"loss": 0.9915,
	"step": 2140
	},
	{
	"epoch": 7.327070879590094,
	"grad_norm": 0.328125,
	"learning_rate": 0.00012088470419649432,
	"loss": 0.992,
	"step": 2145
	},
	{
	"epoch": 7.344150298889838,
	"grad_norm": 0.56640625,
	"learning_rate": 0.00012049484951002739,
	"loss": 0.9897,
	"step": 2150
	},
	{
	"epoch": 7.361229718189581,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00012010466939874053,
	"loss": 0.9968,
	"step": 2155
	},
	{
	"epoch": 7.3783091374893255,
	"grad_norm": 0.294921875,
	"learning_rate": 0.00011971417005805818,
	"loss": 0.9933,
	"step": 2160
	},
	{
	"epoch": 7.395388556789069,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00011932335768847371,
	"loss": 0.9953,
	"step": 2165
	},
	{
	"epoch": 7.412467976088813,
	"grad_norm": 0.330078125,
	"learning_rate": 0.00011893223849545084,
	"loss": 0.9918,
	"step": 2170
	},
	{
	"epoch": 7.429547395388557,
	"grad_norm": 0.40234375,
	"learning_rate": 0.0001185408186893251,
	"loss": 0.9896,
	"step": 2175
	},
	{
	"epoch": 7.446626814688301,
	"grad_norm": 0.35546875,
	"learning_rate": 0.00011814910448520536,
	"loss": 0.9906,
	"step": 2180
	},
	{
	"epoch": 7.463706233988044,
	"grad_norm": 0.376953125,
	"learning_rate": 0.00011775710210287492,
	"loss": 0.9844,
	"step": 2185
	},
	{
	"epoch": 7.480785653287788,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00011736481776669306,
	"loss": 0.99,
	"step": 2190
	},
	{
	"epoch": 7.497865072587532,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00011697225770549585,
	"loss": 0.9899,
	"step": 2195
	},
	{
	"epoch": 7.514944491887276,
	"grad_norm": 0.296875,
	"learning_rate": 0.00011657942815249754,
	"loss": 0.9888,
	"step": 2200
	},
	{
	"epoch": 7.53202391118702,
	"grad_norm": 0.37109375,
	"learning_rate": 0.00011618633534519141,
	"loss": 0.9935,
	"step": 2205
	},
	{
	"epoch": 7.549103330486764,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00011579298552525084,
	"loss": 0.9955,
	"step": 2210
	},
	{
	"epoch": 7.566182749786507,
	"grad_norm": 0.310546875,
	"learning_rate": 0.0001153993849384301,
	"loss": 0.9884,
	"step": 2215
	},
	{
	"epoch": 7.583262169086251,
	"grad_norm": 0.3046875,
	"learning_rate": 0.00011500553983446527,
	"loss": 0.9895,
	"step": 2220
	},
	{
	"epoch": 7.600341588385994,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00011461145646697495,
	"loss": 0.9874,
	"step": 2225
	},
	{
	"epoch": 7.6174210076857385,
	"grad_norm": 0.28125,
	"learning_rate": 0.00011421714109336097,
	"loss": 0.9885,
	"step": 2230
	},
	{
	"epoch": 7.634500426985483,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00011382259997470899,
	"loss": 0.9876,
	"step": 2235
	},
	{
	"epoch": 7.651579846285227,
	"grad_norm": 0.31640625,
	"learning_rate": 0.00011342783937568926,
	"loss": 0.9948,
	"step": 2240
	},
	{
	"epoch": 7.66865926558497,
	"grad_norm": 0.296875,
	"learning_rate": 0.00011303286556445694,
	"loss": 0.9883,
	"step": 2245
	},
	{
	"epoch": 7.685738684884714,
	"grad_norm": 0.328125,
	"learning_rate": 0.00011263768481255264,
	"loss": 0.996,
	"step": 2250
	},
	{
	"epoch": 7.702818104184458,
	"grad_norm": 0.349609375,
	"learning_rate": 0.00011224230339480284,
	"loss": 0.9896,
	"step": 2255
	},
	{
	"epoch": 7.719897523484201,
	"grad_norm": 0.384765625,
	"learning_rate": 0.00011184672758922034,
	"loss": 0.9856,
	"step": 2260
	},
	{
	"epoch": 7.736976942783945,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00011145096367690444,
	"loss": 0.9968,
	"step": 2265
	},
	{
	"epoch": 7.7540563620836895,
	"grad_norm": 0.322265625,
	"learning_rate": 0.00011105501794194131,
	"loss": 0.9938,
	"step": 2270
	},
	{
	"epoch": 7.771135781383433,
	"grad_norm": 0.326171875,
	"learning_rate": 0.00011065889667130414,
	"loss": 0.9924,
	"step": 2275
	},
	{
	"epoch": 7.788215200683177,
	"grad_norm": 0.32421875,
	"learning_rate": 0.00011026260615475333,
	"loss": 1.0,
	"step": 2280
	},
	{
	"epoch": 7.805294619982921,
	"grad_norm": 0.419921875,
	"learning_rate": 0.00010986615268473661,
	"loss": 0.9882,
	"step": 2285
	},
	{
	"epoch": 7.822374039282664,
	"grad_norm": 0.279296875,
	"learning_rate": 0.00010946954255628928,
	"loss": 0.9926,
	"step": 2290
	},
	{
	"epoch": 7.839453458582408,
	"grad_norm": 0.32421875,
	"learning_rate": 0.00010907278206693395,
	"loss": 0.9936,
	"step": 2295
	},
	{
	"epoch": 7.856532877882152,
	"grad_norm": 0.345703125,
	"learning_rate": 0.00010867587751658079,
	"loss": 0.9842,
	"step": 2300
	},
	{
	"epoch": 7.873612297181896,
	"grad_norm": 0.306640625,
	"learning_rate": 0.00010827883520742741,
	"loss": 0.9875,
	"step": 2305
	},
	{
	"epoch": 7.89069171648164,
	"grad_norm": 0.359375,
	"learning_rate": 0.00010788166144385888,
	"loss": 0.9893,
	"step": 2310
	},
	{
	"epoch": 7.907771135781384,
	"grad_norm": 0.50390625,
	"learning_rate": 0.00010748436253234742,
	"loss": 0.9815,
	"step": 2315
	},
	{
	"epoch": 7.924850555081127,
	"grad_norm": 0.373046875,
	"learning_rate": 0.0001070869447813525,
	"loss": 0.9863,
	"step": 2320
	},
	{
	"epoch": 7.941929974380871,
	"grad_norm": 0.353515625,
	"learning_rate": 0.00010668941450122055,
	"loss": 0.9943,
	"step": 2325
	},
	{
	"epoch": 7.959009393680615,
	"grad_norm": 0.3984375,
	"learning_rate": 0.0001062917780040847,
	"loss": 0.9886,
	"step": 2330
	},
	{
	"epoch": 7.976088812980358,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00010589404160376473,
	"loss": 0.9928,
	"step": 2335
	},
	{
	"epoch": 7.9931682322801025,
	"grad_norm": 0.3046875,
	"learning_rate": 0.0001054962116156667,
	"loss": 0.9914,
	"step": 2340
	},
	{
	"epoch": 8.0,
	"eval_loss": 2.468475341796875,
	"eval_runtime": 0.5517,
	"eval_samples_per_second": 18.125,
	"eval_steps_per_second": 1.812,
	"step": 2342
	},
	{
	"epoch": 8.010247651579846,
	"grad_norm": 0.359375,
	"learning_rate": 0.00010509829435668265,
	"loss": 0.9939,
	"step": 2345
	},
	{
	"epoch": 8.02732707087959,
	"grad_norm": 0.291015625,
	"learning_rate": 0.00010470029614509041,
	"loss": 0.9808,
	"step": 2350
	},
	{
	"epoch": 8.044406490179334,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00010430222330045304,
	"loss": 0.9807,
	"step": 2355
	},
	{
	"epoch": 8.061485909479078,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00010390408214351892,
	"loss": 0.9852,
	"step": 2360
	},
	{
	"epoch": 8.078565328778822,
	"grad_norm": 0.33203125,
	"learning_rate": 0.00010350587899612088,
	"loss": 0.9704,
	"step": 2365
	},
	{
	"epoch": 8.095644748078564,
	"grad_norm": 0.37109375,
	"learning_rate": 0.0001031076201810762,
	"loss": 0.9861,
	"step": 2370
	},
	{
	"epoch": 8.112724167378309,
	"grad_norm": 0.30859375,
	"learning_rate": 0.00010270931202208595,
	"loss": 0.9933,
	"step": 2375
	},
	{
	"epoch": 8.129803586678053,
	"grad_norm": 0.361328125,
	"learning_rate": 0.00010231096084363483,
	"loss": 0.9808,
	"step": 2380
	},
	{
	"epoch": 8.146883005977797,
	"grad_norm": 0.3671875,
	"learning_rate": 0.00010191257297089052,
	"loss": 0.9859,
	"step": 2385
	},
	{
	"epoch": 8.163962425277541,
	"grad_norm": 0.28515625,
	"learning_rate": 0.00010151415472960342,
	"loss": 0.9839,
	"step": 2390
	},
	{
	"epoch": 8.181041844577285,
	"grad_norm": 0.451171875,
	"learning_rate": 0.00010111571244600606,
	"loss": 0.9881,
	"step": 2395
	},
	{
	"epoch": 8.198121263877027,
	"grad_norm": 0.443359375,
	"learning_rate": 0.00010071725244671282,
	"loss": 0.9869,
	"step": 2400
	},
	{
	"epoch": 8.215200683176771,
	"grad_norm": 0.400390625,
	"learning_rate": 0.00010031878105861923,
	"loss": 0.979,
	"step": 2405
	},
	{
	"epoch": 8.232280102476516,
	"grad_norm": 0.330078125,
	"learning_rate": 9.992030460880181e-05,
	"loss": 0.9873,
	"step": 2410
	},
	{
	"epoch": 8.24935952177626,
	"grad_norm": 0.302734375,
	"learning_rate": 9.952182942441733e-05,
	"loss": 0.978,
	"step": 2415
	},
	{
	"epoch": 8.266438941076004,
	"grad_norm": 0.31640625,
	"learning_rate": 9.91233618326026e-05,
	"loss": 0.9911,
	"step": 2420
	},
	{
	"epoch": 8.283518360375748,
	"grad_norm": 0.365234375,
	"learning_rate": 9.872490816037372e-05,
	"loss": 0.9824,
	"step": 2425
	},
	{
	"epoch": 8.30059777967549,
	"grad_norm": 0.322265625,
	"learning_rate": 9.83264747345259e-05,
	"loss": 0.9845,
	"step": 2430
	},
	{
	"epoch": 8.317677198975234,
	"grad_norm": 0.34375,
	"learning_rate": 9.792806788153271e-05,
	"loss": 0.9752,
	"step": 2435
	},
	{
	"epoch": 8.334756618274978,
	"grad_norm": 0.322265625,
	"learning_rate": 9.752969392744606e-05,
	"loss": 0.9802,
	"step": 2440
	},
	{
	"epoch": 8.351836037574722,
	"grad_norm": 0.359375,
	"learning_rate": 9.713135919779515e-05,
	"loss": 0.9945,
	"step": 2445
	},
	{
	"epoch": 8.368915456874467,
	"grad_norm": 0.38671875,
	"learning_rate": 9.673307001748661e-05,
	"loss": 0.9825,
	"step": 2450
	},
	{
	"epoch": 8.38599487617421,
	"grad_norm": 0.349609375,
	"learning_rate": 9.633483271070366e-05,
	"loss": 0.9803,
	"step": 2455
	},
	{
	"epoch": 8.403074295473953,
	"grad_norm": 0.32421875,
	"learning_rate": 9.593665360080599e-05,
	"loss": 0.9841,
	"step": 2460
	},
	{
	"epoch": 8.420153714773697,
	"grad_norm": 0.380859375,
	"learning_rate": 9.553853901022913e-05,
	"loss": 0.9777,
	"step": 2465
	},
	{
	"epoch": 8.437233134073441,
	"grad_norm": 0.388671875,
	"learning_rate": 9.514049526038418e-05,
	"loss": 0.9871,
	"step": 2470
	},
	{
	"epoch": 8.454312553373185,
	"grad_norm": 0.328125,
	"learning_rate": 9.474252867155732e-05,
	"loss": 0.9821,
	"step": 2475
	},
	{
	"epoch": 8.47139197267293,
	"grad_norm": 0.287109375,
	"learning_rate": 9.43446455628097e-05,
	"loss": 0.983,
	"step": 2480
	},
	{
	"epoch": 8.488471391972674,
	"grad_norm": 0.318359375,
	"learning_rate": 9.394685225187683e-05,
	"loss": 0.9807,
	"step": 2485
	},
	{
	"epoch": 8.505550811272418,
	"grad_norm": 0.330078125,
	"learning_rate": 9.354915505506839e-05,
	"loss": 0.991,
	"step": 2490
	},
	{
	"epoch": 8.52263023057216,
	"grad_norm": 0.3046875,
	"learning_rate": 9.31515602871679e-05,
	"loss": 0.9914,
	"step": 2495
	},
	{
	"epoch": 8.539709649871904,
	"grad_norm": 0.3125,
	"learning_rate": 9.27540742613326e-05,
	"loss": 0.9886,
	"step": 2500
	},
	{
	"epoch": 8.556789069171648,
	"grad_norm": 0.296875,
	"learning_rate": 9.235670328899293e-05,
	"loss": 0.9841,
	"step": 2505
	},
	{
	"epoch": 8.573868488471392,
	"grad_norm": 0.341796875,
	"learning_rate": 9.195945367975256e-05,
	"loss": 0.9859,
	"step": 2510
	},
	{
	"epoch": 8.590947907771136,
	"grad_norm": 0.298828125,
	"learning_rate": 9.156233174128805e-05,
	"loss": 0.9863,
	"step": 2515
	},
	{
	"epoch": 8.608027327070879,
	"grad_norm": 0.3203125,
	"learning_rate": 9.116534377924883e-05,
	"loss": 0.9789,
	"step": 2520
	},
	{
	"epoch": 8.625106746370623,
	"grad_norm": 0.283203125,
	"learning_rate": 9.076849609715693e-05,
	"loss": 0.983,
	"step": 2525
	},
	{
	"epoch": 8.642186165670367,
	"grad_norm": 0.310546875,
	"learning_rate": 9.037179499630703e-05,
	"loss": 0.9922,
	"step": 2530
	},
	{
	"epoch": 8.659265584970111,
	"grad_norm": 0.310546875,
	"learning_rate": 8.997524677566627e-05,
	"loss": 0.9863,
	"step": 2535
	},
	{
	"epoch": 8.676345004269855,
	"grad_norm": 0.2890625,
	"learning_rate": 8.957885773177438e-05,
	"loss": 0.9845,
	"step": 2540
	},
	{
	"epoch": 8.6934244235696,
	"grad_norm": 0.291015625,
	"learning_rate": 8.918263415864354e-05,
	"loss": 0.9893,
	"step": 2545
	},
	{
	"epoch": 8.710503842869343,
	"grad_norm": 0.33984375,
	"learning_rate": 8.878658234765858e-05,
	"loss": 0.9812,
	"step": 2550
	},
	{
	"epoch": 8.727583262169086,
	"grad_norm": 0.283203125,
	"learning_rate": 8.839070858747697e-05,
	"loss": 0.9855,
	"step": 2555
	},
	{
	"epoch": 8.74466268146883,
	"grad_norm": 0.310546875,
	"learning_rate": 8.799501916392912e-05,
	"loss": 0.9797,
	"step": 2560
	},
	{
	"epoch": 8.761742100768574,
	"grad_norm": 0.333984375,
	"learning_rate": 8.759952035991844e-05,
	"loss": 0.9836,
	"step": 2565
	},
	{
	"epoch": 8.778821520068318,
	"grad_norm": 0.29296875,
	"learning_rate": 8.720421845532151e-05,
	"loss": 0.9848,
	"step": 2570
	},
	{
	"epoch": 8.795900939368062,
	"grad_norm": 0.306640625,
	"learning_rate": 8.680911972688855e-05,
	"loss": 0.9837,
	"step": 2575
	},
	{
	"epoch": 8.812980358667804,
	"grad_norm": 0.283203125,
	"learning_rate": 8.641423044814374e-05,
	"loss": 0.9823,
	"step": 2580
	},
	{
	"epoch": 8.830059777967548,
	"grad_norm": 0.310546875,
	"learning_rate": 8.601955688928545e-05,
	"loss": 0.9852,
	"step": 2585
	},
	{
	"epoch": 8.847139197267293,
	"grad_norm": 0.302734375,
	"learning_rate": 8.562510531708677e-05,
	"loss": 0.9804,
	"step": 2590
	},
	{
	"epoch": 8.864218616567037,
	"grad_norm": 0.291015625,
	"learning_rate": 8.5230881994796e-05,
	"loss": 0.9783,
	"step": 2595
	},
	{
	"epoch": 8.88129803586678,
	"grad_norm": 0.357421875,
	"learning_rate": 8.48368931820373e-05,
	"loss": 0.9783,
	"step": 2600
	},
	{
	"epoch": 8.898377455166525,
	"grad_norm": 0.322265625,
	"learning_rate": 8.444314513471107e-05,
	"loss": 0.9813,
	"step": 2605
	},
	{
	"epoch": 8.915456874466269,
	"grad_norm": 0.337890625,
	"learning_rate": 8.404964410489485e-05,
	"loss": 0.9846,
	"step": 2610
	},
	{
	"epoch": 8.932536293766011,
	"grad_norm": 0.302734375,
	"learning_rate": 8.365639634074382e-05,
	"loss": 0.9875,
	"step": 2615
	},
	{
	"epoch": 8.949615713065755,
	"grad_norm": 0.2890625,
	"learning_rate": 8.32634080863919e-05,
	"loss": 0.9764,
	"step": 2620
	},
	{
	"epoch": 8.9666951323655,
	"grad_norm": 0.353515625,
	"learning_rate": 8.287068558185225e-05,
	"loss": 0.9899,
	"step": 2625
	},
	{
	"epoch": 8.983774551665244,
	"grad_norm": 0.28515625,
	"learning_rate": 8.247823506291844e-05,
	"loss": 0.9813,
	"step": 2630
	},
	{
	"epoch": 8.997438087105039,
	"eval_loss": 2.474276065826416,
	"eval_runtime": 0.6699,
	"eval_samples_per_second": 14.929,
	"eval_steps_per_second": 1.493,
	"step": 2634
	},
	{
	"epoch": 9.000853970964988,
	"grad_norm": 0.31640625,
	"learning_rate": 8.208606276106528e-05,
	"loss": 0.9885,
	"step": 2635
	},
	{
	"epoch": 9.017933390264732,
	"grad_norm": 0.3359375,
	"learning_rate": 8.169417490335007e-05,
	"loss": 0.9783,
	"step": 2640
	},
	{
	"epoch": 9.035012809564474,
	"grad_norm": 0.322265625,
	"learning_rate": 8.130257771231348e-05,
	"loss": 0.9778,
	"step": 2645
	},
	{
	"epoch": 9.052092228864218,
	"grad_norm": 0.333984375,
	"learning_rate": 8.091127740588094e-05,
	"loss": 0.9793,
	"step": 2650
	},
	{
	"epoch": 9.069171648163962,
	"grad_norm": 0.328125,
	"learning_rate": 8.052028019726371e-05,
	"loss": 0.9762,
	"step": 2655
	},
	{
	"epoch": 9.086251067463706,
	"grad_norm": 0.291015625,
	"learning_rate": 8.012959229486061e-05,
	"loss": 0.9777,
	"step": 2660
	},
	{
	"epoch": 9.10333048676345,
	"grad_norm": 0.447265625,
	"learning_rate": 7.973921990215894e-05,
	"loss": 0.9871,
	"step": 2665
	},
	{
	"epoch": 9.120409906063195,
	"grad_norm": 0.306640625,
	"learning_rate": 7.934916921763628e-05,
	"loss": 0.9776,
	"step": 2670
	},
	{
	"epoch": 9.137489325362937,
	"grad_norm": 0.31640625,
	"learning_rate": 7.895944643466203e-05,
	"loss": 0.9795,
	"step": 2675
	},
	{
	"epoch": 9.154568744662681,
	"grad_norm": 0.318359375,
	"learning_rate": 7.857005774139907e-05,
	"loss": 0.9804,
	"step": 2680
	},
	{
	"epoch": 9.171648163962425,
	"grad_norm": 0.35546875,
	"learning_rate": 7.818100932070546e-05,
	"loss": 0.9759,
	"step": 2685
	},
	{
	"epoch": 9.18872758326217,
	"grad_norm": 0.28515625,
	"learning_rate": 7.779230735003628e-05,
	"loss": 0.9785,
	"step": 2690
	},
	{
	"epoch": 9.205807002561913,
	"grad_norm": 0.28515625,
	"learning_rate": 7.740395800134552e-05,
	"loss": 0.9751,
	"step": 2695
	},
	{
	"epoch": 9.222886421861658,
	"grad_norm": 0.30859375,
	"learning_rate": 7.701596744098818e-05,
	"loss": 0.9831,
	"step": 2700
	},
	{
	"epoch": 9.2399658411614,
	"grad_norm": 0.333984375,
	"learning_rate": 7.662834182962222e-05,
	"loss": 0.984,
	"step": 2705
	},
	{
	"epoch": 9.257045260461144,
	"grad_norm": 0.314453125,
	"learning_rate": 7.624108732211081e-05,
	"loss": 0.9777,
	"step": 2710
	},
	{
	"epoch": 9.274124679760888,
	"grad_norm": 0.337890625,
	"learning_rate": 7.585421006742463e-05,
	"loss": 0.9859,
	"step": 2715
	},
	{
	"epoch": 9.291204099060632,
	"grad_norm": 0.365234375,
	"learning_rate": 7.54677162085442e-05,
	"loss": 0.9765,
	"step": 2720
	},
	{
	"epoch": 9.308283518360376,
	"grad_norm": 0.29296875,
	"learning_rate": 7.508161188236232e-05,
	"loss": 0.9749,
	"step": 2725
	},
	{
	"epoch": 9.32536293766012,
	"grad_norm": 0.333984375,
	"learning_rate": 7.469590321958662e-05,
	"loss": 0.9815,
	"step": 2730
	},
	{
	"epoch": 9.342442356959863,
	"grad_norm": 0.287109375,
	"learning_rate": 7.431059634464229e-05,
	"loss": 0.9796,
	"step": 2735
	},
	{
	"epoch": 9.359521776259607,
	"grad_norm": 0.3125,
	"learning_rate": 7.392569737557474e-05,
	"loss": 0.9791,
	"step": 2740
	},
	{
	"epoch": 9.376601195559351,
	"grad_norm": 0.373046875,
	"learning_rate": 7.354121242395254e-05,
	"loss": 0.9854,
	"step": 2745
	},
	{
	"epoch": 9.393680614859095,
	"grad_norm": 0.328125,
	"learning_rate": 7.31571475947703e-05,
	"loss": 0.971,
	"step": 2750
	},
	{
	"epoch": 9.410760034158839,
	"grad_norm": 0.29296875,
	"learning_rate": 7.277350898635178e-05,
	"loss": 0.9789,
	"step": 2755
	},
	{
	"epoch": 9.427839453458583,
	"grad_norm": 0.357421875,
	"learning_rate": 7.239030269025311e-05,
	"loss": 0.9808,
	"step": 2760
	},
	{
	"epoch": 9.444918872758326,
	"grad_norm": 0.31640625,
	"learning_rate": 7.200753479116593e-05,
	"loss": 0.9712,
	"step": 2765
	},
	{
	"epoch": 9.46199829205807,
	"grad_norm": 0.2890625,
	"learning_rate": 7.162521136682085e-05,
	"loss": 0.975,
	"step": 2770
	},
	{
	"epoch": 9.479077711357814,
	"grad_norm": 0.306640625,
	"learning_rate": 7.124333848789091e-05,
	"loss": 0.9875,
	"step": 2775
	},
	{
	"epoch": 9.496157130657558,
	"grad_norm": 0.287109375,
	"learning_rate": 7.08619222178954e-05,
	"loss": 0.9806,
	"step": 2780
	},
	{
	"epoch": 9.513236549957302,
	"grad_norm": 0.353515625,
	"learning_rate": 7.048096861310322e-05,
	"loss": 0.9861,
	"step": 2785
	},
	{
	"epoch": 9.530315969257046,
	"grad_norm": 0.35546875,
	"learning_rate": 7.010048372243698e-05,
	"loss": 0.9823,
	"step": 2790
	},
	{
	"epoch": 9.547395388556788,
	"grad_norm": 0.29296875,
	"learning_rate": 6.972047358737681e-05,
	"loss": 0.9795,
	"step": 2795
	},
	{
	"epoch": 9.564474807856532,
	"grad_norm": 0.423828125,
	"learning_rate": 6.934094424186459e-05,
	"loss": 0.976,
	"step": 2800
	},
	{
	"epoch": 9.581554227156277,
	"grad_norm": 0.349609375,
	"learning_rate": 6.8961901712208e-05,
	"loss": 0.9709,
	"step": 2805
	},
	{
	"epoch": 9.59863364645602,
	"grad_norm": 0.3515625,
	"learning_rate": 6.858335201698485e-05,
	"loss": 0.9793,
	"step": 2810
	},
	{
	"epoch": 9.615713065755765,
	"grad_norm": 0.341796875,
	"learning_rate": 6.820530116694756e-05,
	"loss": 0.9741,
	"step": 2815
	},
	{
	"epoch": 9.632792485055509,
	"grad_norm": 0.30859375,
	"learning_rate": 6.782775516492771e-05,
	"loss": 0.9755,
	"step": 2820
	},
	{
	"epoch": 9.649871904355251,
	"grad_norm": 0.328125,
	"learning_rate": 6.745072000574075e-05,
	"loss": 0.9871,
	"step": 2825
	},
	{
	"epoch": 9.666951323654995,
	"grad_norm": 0.287109375,
	"learning_rate": 6.70742016760907e-05,
	"loss": 0.9754,
	"step": 2830
	},
	{
	"epoch": 9.68403074295474,
	"grad_norm": 0.29296875,
	"learning_rate": 6.669820615447522e-05,
	"loss": 0.9797,
	"step": 2835
	},
	{
	"epoch": 9.701110162254484,
	"grad_norm": 0.333984375,
	"learning_rate": 6.632273941109064e-05,
	"loss": 0.9813,
	"step": 2840
	},
	{
	"epoch": 9.718189581554228,
	"grad_norm": 0.298828125,
	"learning_rate": 6.594780740773712e-05,
	"loss": 0.9792,
	"step": 2845
	},
	{
	"epoch": 9.735269000853972,
	"grad_norm": 0.298828125,
	"learning_rate": 6.5573416097724e-05,
	"loss": 0.9728,
	"step": 2850
	},
	{
	"epoch": 9.752348420153714,
	"grad_norm": 0.3046875,
	"learning_rate": 6.519957142577535e-05,
	"loss": 0.9767,
	"step": 2855
	},
	{
	"epoch": 9.769427839453458,
	"grad_norm": 0.318359375,
	"learning_rate": 6.482627932793553e-05,
	"loss": 0.9822,
	"step": 2860
	},
	{
	"epoch": 9.786507258753202,
	"grad_norm": 0.31640625,
	"learning_rate": 6.445354573147484e-05,
	"loss": 0.9855,
	"step": 2865
	},
	{
	"epoch": 9.803586678052946,
	"grad_norm": 0.314453125,
	"learning_rate": 6.408137655479554e-05,
	"loss": 0.9796,
	"step": 2870
	},
	{
	"epoch": 9.82066609735269,
	"grad_norm": 0.294921875,
	"learning_rate": 6.370977770733777e-05,
	"loss": 0.9791,
	"step": 2875
	},
	{
	"epoch": 9.837745516652435,
	"grad_norm": 0.294921875,
	"learning_rate": 6.333875508948593e-05,
	"loss": 0.9791,
	"step": 2880
	},
	{
	"epoch": 9.854824935952177,
	"grad_norm": 0.3125,
	"learning_rate": 6.296831459247464e-05,
	"loss": 0.9791,
	"step": 2885
	},
	{
	"epoch": 9.871904355251921,
	"grad_norm": 0.33203125,
	"learning_rate": 6.259846209829551e-05,
	"loss": 0.9785,
	"step": 2890
	},
	{
	"epoch": 9.888983774551665,
	"grad_norm": 0.30078125,
	"learning_rate": 6.22292034796035e-05,
	"loss": 0.9827,
	"step": 2895
	},
	{
	"epoch": 9.90606319385141,
	"grad_norm": 0.298828125,
	"learning_rate": 6.186054459962399e-05,
	"loss": 0.9758,
	"step": 2900
	},
	{
	"epoch": 9.923142613151153,
	"grad_norm": 0.287109375,
	"learning_rate": 6.149249131205931e-05,
	"loss": 0.9788,
	"step": 2905
	},
	{
	"epoch": 9.940222032450897,
	"grad_norm": 0.287109375,
	"learning_rate": 6.112504946099604e-05,
	"loss": 0.9773,
	"step": 2910
	},
	{
	"epoch": 9.95730145175064,
	"grad_norm": 0.296875,
	"learning_rate": 6.075822488081213e-05,
	"loss": 0.9769,
	"step": 2915
	},
	{
	"epoch": 9.974380871050384,
	"grad_norm": 0.29296875,
	"learning_rate": 6.039202339608432e-05,
	"loss": 0.9809,
	"step": 2920
	},
	{
	"epoch": 9.991460290350128,
	"grad_norm": 0.384765625,
	"learning_rate": 6.0026450821495536e-05,
	"loss": 0.9756,
	"step": 2925
	},
	{
	"epoch": 9.998292058070026,
	"eval_loss": 2.4803388118743896,
	"eval_runtime": 0.5616,
	"eval_samples_per_second": 17.806,
	"eval_steps_per_second": 1.781,
	"step": 2927
	},
	{
	"epoch": 10.008539709649872,
	"grad_norm": 0.314453125,
	"learning_rate": 5.966151296174268e-05,
	"loss": 0.975,
	"step": 2930
	},
	{
	"epoch": 10.025619128949616,
	"grad_norm": 0.310546875,
	"learning_rate": 5.929721561144439e-05,
	"loss": 0.9813,
	"step": 2935
	},
	{
	"epoch": 10.04269854824936,
	"grad_norm": 0.291015625,
	"learning_rate": 5.8933564555049105e-05,
	"loss": 0.9799,
	"step": 2940
	},
	{
	"epoch": 10.059777967549103,
	"grad_norm": 0.29296875,
	"learning_rate": 5.857056556674313e-05,
	"loss": 0.9724,
	"step": 2945
	},
	{
	"epoch": 10.076857386848847,
	"grad_norm": 0.306640625,
	"learning_rate": 5.820822441035899e-05,
	"loss": 0.9755,
	"step": 2950
	},
	{
	"epoch": 10.09393680614859,
	"grad_norm": 0.296875,
	"learning_rate": 5.784654683928391e-05,
	"loss": 0.9773,
	"step": 2955
	},
	{
	"epoch": 10.111016225448335,
	"grad_norm": 0.287109375,
	"learning_rate": 5.7485538596368496e-05,
	"loss": 0.9698,
	"step": 2960
	},
	{
	"epoch": 10.128095644748079,
	"grad_norm": 0.28125,
	"learning_rate": 5.7125205413835504e-05,
	"loss": 0.9774,
	"step": 2965
	},
	{
	"epoch": 10.145175064047823,
	"grad_norm": 0.337890625,
	"learning_rate": 5.6765553013188766e-05,
	"loss": 0.9788,
	"step": 2970
	},
	{
	"epoch": 10.162254483347565,
	"grad_norm": 0.30859375,
	"learning_rate": 5.6406587105122475e-05,
	"loss": 0.9814,
	"step": 2975
	},
	{
	"epoch": 10.17933390264731,
	"grad_norm": 0.298828125,
	"learning_rate": 5.6048313389430484e-05,
	"loss": 0.9787,
	"step": 2980
	},
	{
	"epoch": 10.196413321947054,
	"grad_norm": 0.310546875,
	"learning_rate": 5.5690737554915604e-05,
	"loss": 0.9741,
	"step": 2985
	},
	{
	"epoch": 10.213492741246798,
	"grad_norm": 0.28515625,
	"learning_rate": 5.533386527929962e-05,
	"loss": 0.9735,
	"step": 2990
	},
	{
	"epoch": 10.230572160546542,
	"grad_norm": 0.296875,
	"learning_rate": 5.4977702229132745e-05,
	"loss": 0.9736,
	"step": 2995
	},
	{
	"epoch": 10.247651579846286,
	"grad_norm": 0.279296875,
	"learning_rate": 5.462225405970401e-05,
	"loss": 0.9754,
	"step": 3000
	},
	{
	"epoch": 10.264730999146028,
	"grad_norm": 0.2734375,
	"learning_rate": 5.4267526414951296e-05,
	"loss": 0.9734,
	"step": 3005
	},
	{
	"epoch": 10.281810418445772,
	"grad_norm": 0.298828125,
	"learning_rate": 5.391352492737157e-05,
	"loss": 0.9737,
	"step": 3010
	},
	{
	"epoch": 10.298889837745516,
	"grad_norm": 0.322265625,
	"learning_rate": 5.3560255217931785e-05,
	"loss": 0.9729,
	"step": 3015
	},
	{
	"epoch": 10.31596925704526,
	"grad_norm": 0.302734375,
	"learning_rate": 5.3207722895979406e-05,
	"loss": 0.9706,
	"step": 3020
	},
	{
	"epoch": 10.333048676345005,
	"grad_norm": 0.287109375,
	"learning_rate": 5.285593355915328e-05,
	"loss": 0.976,
	"step": 3025
	},
	{
	"epoch": 10.350128095644749,
	"grad_norm": 0.314453125,
	"learning_rate": 5.2504892793295e-05,
	"loss": 0.9657,
	"step": 3030
	},
	{
	"epoch": 10.367207514944491,
	"grad_norm": 0.341796875,
	"learning_rate": 5.215460617235993e-05,
	"loss": 0.9731,
	"step": 3035
	},
	{
	"epoch": 10.384286934244235,
	"grad_norm": 0.29296875,
	"learning_rate": 5.1805079258329056e-05,
	"loss": 0.978,
	"step": 3040
	},
	{
	"epoch": 10.40136635354398,
	"grad_norm": 0.404296875,
	"learning_rate": 5.145631760112022e-05,
	"loss": 0.9757,
	"step": 3045
	},
	{
	"epoch": 10.418445772843723,
	"grad_norm": 0.3125,
	"learning_rate": 5.110832673850039e-05,
	"loss": 0.9779,
	"step": 3050
	},
	{
	"epoch": 10.435525192143468,
	"grad_norm": 0.294921875,
	"learning_rate": 5.076111219599745e-05,
	"loss": 0.9718,
	"step": 3055
	},
	{
	"epoch": 10.452604611443212,
	"grad_norm": 0.359375,
	"learning_rate": 5.041467948681269e-05,
	"loss": 0.9796,
	"step": 3060
	},
	{
	"epoch": 10.469684030742954,
	"grad_norm": 0.291015625,
	"learning_rate": 5.0069034111733184e-05,
	"loss": 0.9741,
	"step": 3065
	},
	{
	"epoch": 10.486763450042698,
	"grad_norm": 0.283203125,
	"learning_rate": 4.9724181559044234e-05,
	"loss": 0.9757,
	"step": 3070
	},
	{
	"epoch": 10.503842869342442,
	"grad_norm": 0.29296875,
	"learning_rate": 4.9380127304442634e-05,
	"loss": 0.9777,
	"step": 3075
	},
	{
	"epoch": 10.520922288642186,
	"grad_norm": 0.27734375,
	"learning_rate": 4.903687681094942e-05,
	"loss": 0.973,
	"step": 3080
	},
	{
	"epoch": 10.53800170794193,
	"grad_norm": 0.279296875,
	"learning_rate": 4.8694435528823135e-05,
	"loss": 0.9822,
	"step": 3085
	},
	{
	"epoch": 10.555081127241674,
	"grad_norm": 0.29296875,
	"learning_rate": 4.835280889547351e-05,
	"loss": 0.9754,
	"step": 3090
	},
	{
	"epoch": 10.572160546541417,
	"grad_norm": 0.28125,
	"learning_rate": 4.801200233537483e-05,
	"loss": 0.9711,
	"step": 3095
	},
	{
	"epoch": 10.589239965841161,
	"grad_norm": 0.298828125,
	"learning_rate": 4.767202125998005e-05,
	"loss": 0.9782,
	"step": 3100
	},
	{
	"epoch": 10.606319385140905,
	"grad_norm": 0.30078125,
	"learning_rate": 4.733287106763481e-05,
	"loss": 0.9784,
	"step": 3105
	},
	{
	"epoch": 10.623398804440649,
	"grad_norm": 0.298828125,
	"learning_rate": 4.699455714349152e-05,
	"loss": 0.9724,
	"step": 3110
	},
	{
	"epoch": 10.640478223740393,
	"grad_norm": 0.287109375,
	"learning_rate": 4.665708485942417e-05,
	"loss": 0.9729,
	"step": 3115
	},
	{
	"epoch": 10.657557643040137,
	"grad_norm": 0.28515625,
	"learning_rate": 4.6320459573942856e-05,
	"loss": 0.9775,
	"step": 3120
	},
	{
	"epoch": 10.67463706233988,
	"grad_norm": 0.3046875,
	"learning_rate": 4.5984686632108585e-05,
	"loss": 0.9749,
	"step": 3125
	},
	{
	"epoch": 10.691716481639624,
	"grad_norm": 0.28515625,
	"learning_rate": 4.564977136544873e-05,
	"loss": 0.9796,
	"step": 3130
	},
	{
	"epoch": 10.708795900939368,
	"grad_norm": 0.279296875,
	"learning_rate": 4.531571909187197e-05,
	"loss": 0.9731,
	"step": 3135
	},
	{
	"epoch": 10.725875320239112,
	"grad_norm": 0.279296875,
	"learning_rate": 4.49825351155843e-05,
	"loss": 0.9734,
	"step": 3140
	},
	{
	"epoch": 10.742954739538856,
	"grad_norm": 0.302734375,
	"learning_rate": 4.4650224727004334e-05,
	"loss": 0.9709,
	"step": 3145
	},
	{
	"epoch": 10.7600341588386,
	"grad_norm": 0.28515625,
	"learning_rate": 4.431879320267972e-05,
	"loss": 0.9739,
	"step": 3150
	},
	{
	"epoch": 10.777113578138342,
	"grad_norm": 0.3203125,
	"learning_rate": 4.398824580520302e-05,
	"loss": 0.9762,
	"step": 3155
	},
	{
	"epoch": 10.794192997438087,
	"grad_norm": 0.287109375,
	"learning_rate": 4.3658587783128425e-05,
	"loss": 0.9749,
	"step": 3160
	},
	{
	"epoch": 10.81127241673783,
	"grad_norm": 0.29296875,
	"learning_rate": 4.332982437088825e-05,
	"loss": 0.9769,
	"step": 3165
	},
	{
	"epoch": 10.828351836037575,
	"grad_norm": 0.27734375,
	"learning_rate": 4.300196078870982e-05,
	"loss": 0.9783,
	"step": 3170
	},
	{
	"epoch": 10.845431255337319,
	"grad_norm": 0.2890625,
	"learning_rate": 4.267500224253269e-05,
	"loss": 0.9817,
	"step": 3175
	},
	{
	"epoch": 10.862510674637063,
	"grad_norm": 0.28125,
	"learning_rate": 4.2348953923925916e-05,
	"loss": 0.9776,
	"step": 3180
	},
	{
	"epoch": 10.879590093936805,
	"grad_norm": 0.2890625,
	"learning_rate": 4.202382101000554e-05,
	"loss": 0.9771,
	"step": 3185
	},
	{
	"epoch": 10.89666951323655,
	"grad_norm": 0.2890625,
	"learning_rate": 4.16996086633526e-05,
	"loss": 0.9716,
	"step": 3190
	},
	{
	"epoch": 10.913748932536294,
	"grad_norm": 0.28125,
	"learning_rate": 4.137632203193086e-05,
	"loss": 0.9845,
	"step": 3195
	},
	{
	"epoch": 10.930828351836038,
	"grad_norm": 0.330078125,
	"learning_rate": 4.105396624900538e-05,
	"loss": 0.9799,
	"step": 3200
	},
	{
	"epoch": 10.947907771135782,
	"grad_norm": 0.314453125,
	"learning_rate": 4.073254643306086e-05,
	"loss": 0.97,
	"step": 3205
	},
	{
	"epoch": 10.964987190435526,
	"grad_norm": 0.28515625,
	"learning_rate": 4.041206768772022e-05,
	"loss": 0.977,
	"step": 3210
	},
	{
	"epoch": 10.98206660973527,
	"grad_norm": 0.28125,
	"learning_rate": 4.009253510166386e-05,
	"loss": 0.9714,
	"step": 3215
	},
	{
	"epoch": 10.999146029035012,
	"grad_norm": 0.279296875,
	"learning_rate": 3.977395374854871e-05,
	"loss": 0.9815,
	"step": 3220
	},
	{
	"epoch": 10.999146029035012,
	"eval_loss": 2.4823217391967773,
	"eval_runtime": 0.5642,
	"eval_samples_per_second": 17.723,
	"eval_steps_per_second": 1.772,
	"step": 3220
	},
	{
	"epoch": 11.016225448334756,
	"grad_norm": 0.28125,
	"learning_rate": 3.9456328686927525e-05,
	"loss": 0.9669,
	"step": 3225
	},
	{
	"epoch": 11.0333048676345,
	"grad_norm": 0.275390625,
	"learning_rate": 3.913966496016891e-05,
	"loss": 0.9682,
	"step": 3230
	},
	{
	"epoch": 11.050384286934245,
	"grad_norm": 0.34375,
	"learning_rate": 3.88239675963768e-05,
	"loss": 0.9733,
	"step": 3235
	},
	{
	"epoch": 11.067463706233989,
	"grad_norm": 0.291015625,
	"learning_rate": 3.850924160831115e-05,
	"loss": 0.9732,
	"step": 3240
	},
	{
	"epoch": 11.084543125533731,
	"grad_norm": 0.283203125,
	"learning_rate": 3.819549199330784e-05,
	"loss": 0.9702,
	"step": 3245
	},
	{
	"epoch": 11.101622544833475,
	"grad_norm": 0.302734375,
	"learning_rate": 3.788272373319955e-05,
	"loss": 0.974,
	"step": 3250
	},
	{
	"epoch": 11.11870196413322,
	"grad_norm": 0.294921875,
	"learning_rate": 3.757094179423672e-05,
	"loss": 0.9747,
	"step": 3255
	},
	{
	"epoch": 11.135781383432963,
	"grad_norm": 0.306640625,
	"learning_rate": 3.726015112700859e-05,
	"loss": 0.9854,
	"step": 3260
	},
	{
	"epoch": 11.152860802732707,
	"grad_norm": 0.298828125,
	"learning_rate": 3.695035666636464e-05,
	"loss": 0.9724,
	"step": 3265
	},
	{
	"epoch": 11.169940222032452,
	"grad_norm": 0.298828125,
	"learning_rate": 3.6641563331336125e-05,
	"loss": 0.9668,
	"step": 3270
	},
	{
	"epoch": 11.187019641332194,
	"grad_norm": 0.28515625,
	"learning_rate": 3.633377602505815e-05,
	"loss": 0.9708,
	"step": 3275
	},
	{
	"epoch": 11.204099060631938,
	"grad_norm": 0.287109375,
	"learning_rate": 3.6026999634691725e-05,
	"loss": 0.9728,
	"step": 3280
	},
	{
	"epoch": 11.221178479931682,
	"grad_norm": 0.28515625,
	"learning_rate": 3.5721239031346066e-05,
	"loss": 0.987,
	"step": 3285
	},
	{
	"epoch": 11.238257899231426,
	"grad_norm": 0.302734375,
	"learning_rate": 3.541649907000147e-05,
	"loss": 0.9808,
	"step": 3290
	},
	{
	"epoch": 11.25533731853117,
	"grad_norm": 0.2890625,
	"learning_rate": 3.511278458943197e-05,
	"loss": 0.9693,
	"step": 3295
	},
	{
	"epoch": 11.272416737830914,
	"grad_norm": 0.279296875,
	"learning_rate": 3.4810100412128747e-05,
	"loss": 0.9766,
	"step": 3300
	},
	{
	"epoch": 11.289496157130657,
	"grad_norm": 0.310546875,
	"learning_rate": 3.4508451344223425e-05,
	"loss": 0.9756,
	"step": 3305
	},
	{
	"epoch": 11.3065755764304,
	"grad_norm": 0.298828125,
	"learning_rate": 3.42078421754117e-05,
	"loss": 0.9723,
	"step": 3310
	},
	{
	"epoch": 11.323654995730145,
	"grad_norm": 0.27734375,
	"learning_rate": 3.3908277678877445e-05,
	"loss": 0.972,
	"step": 3315
	},
	{
	"epoch": 11.340734415029889,
	"grad_norm": 0.283203125,
	"learning_rate": 3.360976261121684e-05,
	"loss": 0.9771,
	"step": 3320
	},
	{
	"epoch": 11.357813834329633,
	"grad_norm": 0.298828125,
	"learning_rate": 3.331230171236277e-05,
	"loss": 0.9764,
	"step": 3325
	},
	{
	"epoch": 11.374893253629377,
	"grad_norm": 0.29296875,
	"learning_rate": 3.3015899705509734e-05,
	"loss": 0.9716,
	"step": 3330
	},
	{
	"epoch": 11.391972672929121,
	"grad_norm": 0.279296875,
	"learning_rate": 3.272056129703861e-05,
	"loss": 0.9743,
	"step": 3335
	},
	{
	"epoch": 11.409052092228864,
	"grad_norm": 0.27734375,
	"learning_rate": 3.242629117644229e-05,
	"loss": 0.9731,
	"step": 3340
	},
	{
	"epoch": 11.426131511528608,
	"grad_norm": 0.306640625,
	"learning_rate": 3.21330940162508e-05,
	"loss": 0.9776,
	"step": 3345
	},
	{
	"epoch": 11.443210930828352,
	"grad_norm": 0.283203125,
	"learning_rate": 3.184097447195732e-05,
	"loss": 0.9695,
	"step": 3350
	},
	{
	"epoch": 11.460290350128096,
	"grad_norm": 0.302734375,
	"learning_rate": 3.1549937181944346e-05,
	"loss": 0.9792,
	"step": 3355
	},
	{
	"epoch": 11.47736976942784,
	"grad_norm": 0.29296875,
	"learning_rate": 3.125998676740987e-05,
	"loss": 0.9689,
	"step": 3360
	},
	{
	"epoch": 11.494449188727582,
	"grad_norm": 0.3046875,
	"learning_rate": 3.097112783229412e-05,
	"loss": 0.9706,
	"step": 3365
	},
	{
	"epoch": 11.511528608027326,
	"grad_norm": 0.28515625,
	"learning_rate": 3.068336496320631e-05,
	"loss": 0.975,
	"step": 3370
	},
	{
	"epoch": 11.52860802732707,
	"grad_norm": 0.2734375,
	"learning_rate": 3.0396702729352023e-05,
	"loss": 0.9742,
	"step": 3375
	},
	{
	"epoch": 11.545687446626815,
	"grad_norm": 0.279296875,
	"learning_rate": 3.0111145682460507e-05,
	"loss": 0.9709,
	"step": 3380
	},
	{
	"epoch": 11.562766865926559,
	"grad_norm": 0.28515625,
	"learning_rate": 2.9826698356712403e-05,
	"loss": 0.9664,
	"step": 3385
	},
	{
	"epoch": 11.579846285226303,
	"grad_norm": 0.30859375,
	"learning_rate": 2.9543365268667867e-05,
	"loss": 0.973,
	"step": 3390
	},
	{
	"epoch": 11.596925704526047,
	"grad_norm": 0.291015625,
	"learning_rate": 2.926115091719467e-05,
	"loss": 0.9723,
	"step": 3395
	},
	{
	"epoch": 11.61400512382579,
	"grad_norm": 0.29296875,
	"learning_rate": 2.8980059783396953e-05,
	"loss": 0.9707,
	"step": 3400
	},
	{
	"epoch": 11.631084543125533,
	"grad_norm": 0.279296875,
	"learning_rate": 2.8700096330544012e-05,
	"loss": 0.9702,
	"step": 3405
	},
	{
	"epoch": 11.648163962425278,
	"grad_norm": 0.283203125,
	"learning_rate": 2.8421265003999286e-05,
	"loss": 0.9788,
	"step": 3410
	},
	{
	"epoch": 11.665243381725022,
	"grad_norm": 0.298828125,
	"learning_rate": 2.8143570231150006e-05,
	"loss": 0.9708,
	"step": 3415
	},
	{
	"epoch": 11.682322801024766,
	"grad_norm": 0.291015625,
	"learning_rate": 2.7867016421336776e-05,
	"loss": 0.9803,
	"step": 3420
	},
	{
	"epoch": 11.699402220324508,
	"grad_norm": 0.283203125,
	"learning_rate": 2.759160796578347e-05,
	"loss": 0.9667,
	"step": 3425
	},
	{
	"epoch": 11.716481639624252,
	"grad_norm": 0.271484375,
	"learning_rate": 2.7317349237527724e-05,
	"loss": 0.9794,
	"step": 3430
	},
	{
	"epoch": 11.733561058923996,
	"grad_norm": 0.271484375,
	"learning_rate": 2.7044244591351232e-05,
	"loss": 0.9736,
	"step": 3435
	},
	{
	"epoch": 11.75064047822374,
	"grad_norm": 0.2890625,
	"learning_rate": 2.6772298363710956e-05,
	"loss": 0.976,
	"step": 3440
	},
	{
	"epoch": 11.767719897523484,
	"grad_norm": 0.275390625,
	"learning_rate": 2.6501514872669865e-05,
	"loss": 0.9744,
	"step": 3445
	},
	{
	"epoch": 11.784799316823229,
	"grad_norm": 0.287109375,
	"learning_rate": 2.6231898417828603e-05,
	"loss": 0.9771,
	"step": 3450
	},
	{
	"epoch": 11.801878736122973,
	"grad_norm": 0.28515625,
	"learning_rate": 2.5963453280257267e-05,
	"loss": 0.9744,
	"step": 3455
	},
	{
	"epoch": 11.818958155422715,
	"grad_norm": 0.28125,
	"learning_rate": 2.569618372242727e-05,
	"loss": 0.9797,
	"step": 3460
	},
	{
	"epoch": 11.836037574722459,
	"grad_norm": 0.302734375,
	"learning_rate": 2.5430093988143778e-05,
	"loss": 0.9775,
	"step": 3465
	},
	{
	"epoch": 11.853116994022203,
	"grad_norm": 0.27734375,
	"learning_rate": 2.5165188302478215e-05,
	"loss": 0.9702,
	"step": 3470
	},
	{
	"epoch": 11.870196413321947,
	"grad_norm": 0.275390625,
	"learning_rate": 2.4901470871701305e-05,
	"loss": 0.9738,
	"step": 3475
	},
	{
	"epoch": 11.887275832621691,
	"grad_norm": 0.279296875,
	"learning_rate": 2.4638945883216235e-05,
	"loss": 0.968,
	"step": 3480
	},
	{
	"epoch": 11.904355251921434,
	"grad_norm": 0.279296875,
	"learning_rate": 2.4377617505492046e-05,
	"loss": 0.9707,
	"step": 3485
	},
	{
	"epoch": 11.921434671221178,
	"grad_norm": 0.298828125,
	"learning_rate": 2.411748988799769e-05,
	"loss": 0.9727,
	"step": 3490
	},
	{
	"epoch": 11.938514090520922,
	"grad_norm": 0.28515625,
	"learning_rate": 2.385856716113587e-05,
	"loss": 0.9754,
	"step": 3495
	},
	{
	"epoch": 11.955593509820666,
	"grad_norm": 0.2890625,
	"learning_rate": 2.3600853436177672e-05,
	"loss": 0.9699,
	"step": 3500
	},
	{
	"epoch": 11.97267292912041,
	"grad_norm": 0.287109375,
	"learning_rate": 2.3344352805197212e-05,
	"loss": 0.9726,
	"step": 3505
	},
	{
	"epoch": 11.989752348420154,
	"grad_norm": 0.30078125,
	"learning_rate": 2.3089069341006565e-05,
	"loss": 0.9657,
	"step": 3510
	},
	{
	"epoch": 12.0,
	"eval_loss": 2.4844307899475098,
	"eval_runtime": 0.551,
	"eval_samples_per_second": 18.15,
	"eval_steps_per_second": 1.815,
	"step": 3513
	},
	{
	"epoch": 12.006831767719898,
	"grad_norm": 0.283203125,
	"learning_rate": 2.2835007097091267e-05,
	"loss": 0.9726,
	"step": 3515
	},
	{
	"epoch": 12.02391118701964,
	"grad_norm": 0.28125,
	"learning_rate": 2.2582170107545852e-05,
	"loss": 0.9707,
	"step": 3520
	},
	{
	"epoch": 12.040990606319385,
	"grad_norm": 0.271484375,
	"learning_rate": 2.2330562387009745e-05,
	"loss": 0.9734,
	"step": 3525
	},
	{
	"epoch": 12.058070025619129,
	"grad_norm": 0.28125,
	"learning_rate": 2.2080187930603668e-05,
	"loss": 0.9771,
	"step": 3530
	},
	{
	"epoch": 12.075149444918873,
	"grad_norm": 0.275390625,
	"learning_rate": 2.1831050713866007e-05,
	"loss": 0.9756,
	"step": 3535
	},
	{
	"epoch": 12.092228864218617,
	"grad_norm": 0.291015625,
	"learning_rate": 2.1583154692689976e-05,
	"loss": 0.9753,
	"step": 3540
	},
	{
	"epoch": 12.109308283518361,
	"grad_norm": 0.2734375,
	"learning_rate": 2.1336503803260456e-05,
	"loss": 0.9644,
	"step": 3545
	},
	{
	"epoch": 12.126387702818104,
	"grad_norm": 0.287109375,
	"learning_rate": 2.109110196199171e-05,
	"loss": 0.9691,
	"step": 3550
	},
	{
	"epoch": 12.143467122117848,
	"grad_norm": 0.2890625,
	"learning_rate": 2.08469530654652e-05,
	"loss": 0.9801,
	"step": 3555
	},
	{
	"epoch": 12.160546541417592,
	"grad_norm": 0.294921875,
	"learning_rate": 2.0604060990367624e-05,
	"loss": 0.966,
	"step": 3560
	},
	{
	"epoch": 12.177625960717336,
	"grad_norm": 0.28125,
	"learning_rate": 2.0362429593429432e-05,
	"loss": 0.9698,
	"step": 3565
	},
	{
	"epoch": 12.19470538001708,
	"grad_norm": 0.275390625,
	"learning_rate": 2.0122062711363532e-05,
	"loss": 0.9717,
	"step": 3570
	},
	{
	"epoch": 12.211784799316824,
	"grad_norm": 0.275390625,
	"learning_rate": 1.988296416080435e-05,
	"loss": 0.9793,
	"step": 3575
	},
	{
	"epoch": 12.228864218616566,
	"grad_norm": 0.294921875,
	"learning_rate": 1.9645137738247422e-05,
	"loss": 0.9743,
	"step": 3580
	},
	{
	"epoch": 12.24594363791631,
	"grad_norm": 0.287109375,
	"learning_rate": 1.9408587219988805e-05,
	"loss": 0.969,
	"step": 3585
	},
	{
	"epoch": 12.263023057216055,
	"grad_norm": 0.27734375,
	"learning_rate": 1.9173316362065384e-05,
	"loss": 0.9621,
	"step": 3590
	},
	{
	"epoch": 12.280102476515799,
	"grad_norm": 0.28125,
	"learning_rate": 1.893932890019503e-05,
	"loss": 0.9786,
	"step": 3595
	},
	{
	"epoch": 12.297181895815543,
	"grad_norm": 0.27734375,
	"learning_rate": 1.8706628549717452e-05,
	"loss": 0.971,
	"step": 3600
	},
	{
	"epoch": 12.314261315115287,
	"grad_norm": 0.279296875,
	"learning_rate": 1.8475219005535117e-05,
	"loss": 0.9797,
	"step": 3605
	},
	{
	"epoch": 12.33134073441503,
	"grad_norm": 0.291015625,
	"learning_rate": 1.824510394205453e-05,
	"loss": 0.9727,
	"step": 3610
	},
	{
	"epoch": 12.348420153714773,
	"grad_norm": 0.28515625,
	"learning_rate": 1.8016287013128018e-05,
	"loss": 0.9723,
	"step": 3615
	},
	{
	"epoch": 12.365499573014517,
	"grad_norm": 0.2734375,
	"learning_rate": 1.7788771851995655e-05,
	"loss": 0.9764,
	"step": 3620
	},
	{
	"epoch": 12.382578992314262,
	"grad_norm": 0.279296875,
	"learning_rate": 1.7562562071227474e-05,
	"loss": 0.9665,
	"step": 3625
	},
	{
	"epoch": 12.399658411614006,
	"grad_norm": 0.291015625,
	"learning_rate": 1.7337661262666294e-05,
	"loss": 0.9737,
	"step": 3630
	},
	{
	"epoch": 12.41673783091375,
	"grad_norm": 0.275390625,
	"learning_rate": 1.711407299737049e-05,
	"loss": 0.9671,
	"step": 3635
	},
	{
	"epoch": 12.433817250213492,
	"grad_norm": 0.283203125,
	"learning_rate": 1.6891800825557535e-05,
	"loss": 0.975,
	"step": 3640
	},
	{
	"epoch": 12.450896669513236,
	"grad_norm": 0.28515625,
	"learning_rate": 1.6670848276547334e-05,
	"loss": 0.9767,
	"step": 3645
	},
	{
	"epoch": 12.46797608881298,
	"grad_norm": 0.2734375,
	"learning_rate": 1.6451218858706374e-05,
	"loss": 0.9773,
	"step": 3650
	},
	{
	"epoch": 12.485055508112724,
	"grad_norm": 0.30078125,
	"learning_rate": 1.6232916059392e-05,
	"loss": 0.9752,
	"step": 3655
	},
	{
	"epoch": 12.502134927412468,
	"grad_norm": 0.279296875,
	"learning_rate": 1.601594334489702e-05,
	"loss": 0.9632,
	"step": 3660
	},
	{
	"epoch": 12.519214346712213,
	"grad_norm": 0.28125,
	"learning_rate": 1.5800304160394673e-05,
	"loss": 0.9749,
	"step": 3665
	},
	{
	"epoch": 12.536293766011955,
	"grad_norm": 0.279296875,
	"learning_rate": 1.5586001929883865e-05,
	"loss": 0.9746,
	"step": 3670
	},
	{
	"epoch": 12.553373185311699,
	"grad_norm": 0.287109375,
	"learning_rate": 1.5373040056134814e-05,
	"loss": 0.9783,
	"step": 3675
	},
	{
	"epoch": 12.570452604611443,
	"grad_norm": 0.283203125,
	"learning_rate": 1.516142192063521e-05,
	"loss": 0.974,
	"step": 3680
	},
	{
	"epoch": 12.587532023911187,
	"grad_norm": 0.2890625,
	"learning_rate": 1.4951150883536225e-05,
	"loss": 0.9802,
	"step": 3685
	},
	{
	"epoch": 12.604611443210931,
	"grad_norm": 0.279296875,
	"learning_rate": 1.474223028359939e-05,
	"loss": 0.9729,
	"step": 3690
	},
	{
	"epoch": 12.621690862510675,
	"grad_norm": 0.28125,
	"learning_rate": 1.4534663438143415e-05,
	"loss": 0.9711,
	"step": 3695
	},
	{
	"epoch": 12.638770281810418,
	"grad_norm": 0.29296875,
	"learning_rate": 1.4328453642991646e-05,
	"loss": 0.969,
	"step": 3700
	},
	{
	"epoch": 12.655849701110162,
	"grad_norm": 0.30078125,
	"learning_rate": 1.4123604172419713e-05,
	"loss": 0.9795,
	"step": 3705
	},
	{
	"epoch": 12.672929120409906,
	"grad_norm": 0.271484375,
	"learning_rate": 1.392011827910341e-05,
	"loss": 0.9715,
	"step": 3710
	},
	{
	"epoch": 12.69000853970965,
	"grad_norm": 0.296875,
	"learning_rate": 1.3717999194067232e-05,
	"loss": 0.9644,
	"step": 3715
	},
	{
	"epoch": 12.707087959009394,
	"grad_norm": 0.28125,
	"learning_rate": 1.3517250126632986e-05,
	"loss": 0.9717,
	"step": 3720
	},
	{
	"epoch": 12.724167378309138,
	"grad_norm": 0.28125,
	"learning_rate": 1.3317874264368734e-05,
	"loss": 0.9716,
	"step": 3725
	},
	{
	"epoch": 12.74124679760888,
	"grad_norm": 0.27734375,
	"learning_rate": 1.311987477303842e-05,
	"loss": 0.9772,
	"step": 3730
	},
	{
	"epoch": 12.758326216908625,
	"grad_norm": 0.275390625,
	"learning_rate": 1.292325479655131e-05,
	"loss": 0.9656,
	"step": 3735
	},
	{
	"epoch": 12.775405636208369,
	"grad_norm": 0.30078125,
	"learning_rate": 1.2728017456912344e-05,
	"loss": 0.9738,
	"step": 3740
	},
	{
	"epoch": 12.792485055508113,
	"grad_norm": 0.28515625,
	"learning_rate": 1.2534165854172397e-05,
	"loss": 0.9709,
	"step": 3745
	},
	{
	"epoch": 12.809564474807857,
	"grad_norm": 0.275390625,
	"learning_rate": 1.2341703066379074e-05,
	"loss": 0.9714,
	"step": 3750
	},
	{
	"epoch": 12.826643894107601,
	"grad_norm": 0.298828125,
	"learning_rate": 1.2150632149527886e-05,
	"loss": 0.9722,
	"step": 3755
	},
	{
	"epoch": 12.843723313407343,
	"grad_norm": 0.27734375,
	"learning_rate": 1.1960956137513701e-05,
	"loss": 0.978,
	"step": 3760
	},
	{
	"epoch": 12.860802732707088,
	"grad_norm": 0.279296875,
	"learning_rate": 1.1772678042082607e-05,
	"loss": 0.9668,
	"step": 3765
	},
	{
	"epoch": 12.877882152006832,
	"grad_norm": 0.283203125,
	"learning_rate": 1.158580085278398e-05,
	"loss": 0.9748,
	"step": 3770
	},
	{
	"epoch": 12.894961571306576,
	"grad_norm": 0.265625,
	"learning_rate": 1.1400327536923083e-05,
	"loss": 0.9702,
	"step": 3775
	},
	{
	"epoch": 12.91204099060632,
	"grad_norm": 0.318359375,
	"learning_rate": 1.1216261039514087e-05,
	"loss": 0.97,
	"step": 3780
	},
	{
	"epoch": 12.929120409906064,
	"grad_norm": 0.283203125,
	"learning_rate": 1.1033604283233035e-05,
	"loss": 0.968,
	"step": 3785
	},
	{
	"epoch": 12.946199829205806,
	"grad_norm": 0.2890625,
	"learning_rate": 1.0852360168371656e-05,
	"loss": 0.9732,
	"step": 3790
	},
	{
	"epoch": 12.96327924850555,
	"grad_norm": 0.28515625,
	"learning_rate": 1.0672531572791178e-05,
	"loss": 0.9739,
	"step": 3795
	},
	{
	"epoch": 12.980358667805294,
	"grad_norm": 0.2734375,
	"learning_rate": 1.049412135187675e-05,
	"loss": 0.9687,
	"step": 3800
	},
	{
	"epoch": 12.997438087105039,
	"grad_norm": 0.28125,
	"learning_rate": 1.0317132338492019e-05,
	"loss": 0.9694,
	"step": 3805
	},
	{
	"epoch": 12.997438087105039,
	"eval_loss": 2.4820432662963867,
	"eval_runtime": 0.553,
	"eval_samples_per_second": 18.083,
	"eval_steps_per_second": 1.808,
	"step": 3805
	},
	{
	"epoch": 13.014517506404783,
	"grad_norm": 0.26953125,
	"learning_rate": 1.0141567342934132e-05,
	"loss": 0.9741,
	"step": 3810
	},
	{
	"epoch": 13.031596925704527,
	"grad_norm": 0.283203125,
	"learning_rate": 9.967429152889208e-06,
	"loss": 0.9639,
	"step": 3815
	},
	{
	"epoch": 13.048676345004269,
	"grad_norm": 0.30078125,
	"learning_rate": 9.794720533388024e-06,
	"loss": 0.9852,
	"step": 3820
	},
	{
	"epoch": 13.065755764304013,
	"grad_norm": 0.27734375,
	"learning_rate": 9.623444226762035e-06,
	"loss": 0.9699,
	"step": 3825
	},
	{
	"epoch": 13.082835183603757,
	"grad_norm": 0.27734375,
	"learning_rate": 9.453602952599982e-06,
	"loss": 0.9677,
	"step": 3830
	},
	{
	"epoch": 13.099914602903501,
	"grad_norm": 0.28125,
	"learning_rate": 9.285199407704558e-06,
	"loss": 0.9795,
	"step": 3835
	},
	{
	"epoch": 13.116994022203246,
	"grad_norm": 0.267578125,
	"learning_rate": 9.118236266049707e-06,
	"loss": 0.9746,
	"step": 3840
	},
	{
	"epoch": 13.13407344150299,
	"grad_norm": 0.283203125,
	"learning_rate": 8.95271617873813e-06,
	"loss": 0.9759,
	"step": 3845
	},
	{
	"epoch": 13.151152860802732,
	"grad_norm": 0.2734375,
	"learning_rate": 8.788641773959105e-06,
	"loss": 0.966,
	"step": 3850
	},
	{
	"epoch": 13.168232280102476,
	"grad_norm": 0.2890625,
	"learning_rate": 8.626015656946895e-06,
	"loss": 0.9667,
	"step": 3855
	},
	{
	"epoch": 13.18531169940222,
	"grad_norm": 0.2734375,
	"learning_rate": 8.464840409939267e-06,
	"loss": 0.9725,
	"step": 3860
	},
	{
	"epoch": 13.202391118701964,
	"grad_norm": 0.279296875,
	"learning_rate": 8.305118592136597e-06,
	"loss": 0.9682,
	"step": 3865
	},
	{
	"epoch": 13.219470538001708,
	"grad_norm": 0.27734375,
	"learning_rate": 8.146852739661105e-06,
	"loss": 0.9727,
	"step": 3870
	},
	{
	"epoch": 13.236549957301452,
	"grad_norm": 0.26953125,
	"learning_rate": 7.99004536551664e-06,
	"loss": 0.9664,
	"step": 3875
	},
	{
	"epoch": 13.253629376601195,
	"grad_norm": 0.26953125,
	"learning_rate": 7.834698959548914e-06,
	"loss": 0.9708,
	"step": 3880
	},
	{
	"epoch": 13.270708795900939,
	"grad_norm": 0.267578125,
	"learning_rate": 7.6808159884057e-06,
	"loss": 0.9755,
	"step": 3885
	},
	{
	"epoch": 13.287788215200683,
	"grad_norm": 0.2890625,
	"learning_rate": 7.528398895497924e-06,
	"loss": 0.9687,
	"step": 3890
	},
	{
	"epoch": 13.304867634500427,
	"grad_norm": 0.271484375,
	"learning_rate": 7.377450100960648e-06,
	"loss": 0.9743,
	"step": 3895
	},
	{
	"epoch": 13.321947053800171,
	"grad_norm": 0.29296875,
	"learning_rate": 7.2279720016148244e-06,
	"loss": 0.9716,
	"step": 3900
	},
	{
	"epoch": 13.339026473099915,
	"grad_norm": 0.296875,
	"learning_rate": 7.079966970929175e-06,
	"loss": 0.9706,
	"step": 3905
	},
	{
	"epoch": 13.356105892399658,
	"grad_norm": 0.275390625,
	"learning_rate": 6.933437358982409e-06,
	"loss": 0.9724,
	"step": 3910
	},
	{
	"epoch": 13.373185311699402,
	"grad_norm": 0.283203125,
	"learning_rate": 6.788385492426053e-06,
	"loss": 0.9797,
	"step": 3915
	},
	{
	"epoch": 13.390264730999146,
	"grad_norm": 0.322265625,
	"learning_rate": 6.6448136744474474e-06,
	"loss": 0.9764,
	"step": 3920
	},
	{
	"epoch": 13.40734415029889,
	"grad_norm": 0.29296875,
	"learning_rate": 6.502724184733122e-06,
	"loss": 0.9751,
	"step": 3925
	},
	{
	"epoch": 13.424423569598634,
	"grad_norm": 0.279296875,
	"learning_rate": 6.36211927943271e-06,
	"loss": 0.9724,
	"step": 3930
	},
	{
	"epoch": 13.441502988898378,
	"grad_norm": 0.275390625,
	"learning_rate": 6.223001191123012e-06,
	"loss": 0.9804,
	"step": 3935
	},
	{
	"epoch": 13.45858240819812,
	"grad_norm": 0.279296875,
	"learning_rate": 6.085372128772637e-06,
	"loss": 0.9742,
	"step": 3940
	},
	{
	"epoch": 13.475661827497865,
	"grad_norm": 0.279296875,
	"learning_rate": 5.949234277706861e-06,
	"loss": 0.9763,
	"step": 3945
	},
	{
	"epoch": 13.492741246797609,
	"grad_norm": 0.283203125,
	"learning_rate": 5.814589799572956e-06,
	"loss": 0.9779,
	"step": 3950
	},
	{
	"epoch": 13.509820666097353,
	"grad_norm": 0.283203125,
	"learning_rate": 5.681440832305873e-06,
	"loss": 0.9769,
	"step": 3955
	},
	{
	"epoch": 13.526900085397097,
	"grad_norm": 0.287109375,
	"learning_rate": 5.549789490094304e-06,
	"loss": 0.9766,
	"step": 3960
	},
	{
	"epoch": 13.543979504696841,
	"grad_norm": 0.283203125,
	"learning_rate": 5.41963786334706e-06,
	"loss": 0.9655,
	"step": 3965
	},
	{
	"epoch": 13.561058923996583,
	"grad_norm": 0.28125,
	"learning_rate": 5.290988018659937e-06,
	"loss": 0.9738,
	"step": 3970
	},
	{
	"epoch": 13.578138343296327,
	"grad_norm": 0.333984375,
	"learning_rate": 5.163841998782837e-06,
	"loss": 0.9729,
	"step": 3975
	},
	{
	"epoch": 13.595217762596072,
	"grad_norm": 0.298828125,
	"learning_rate": 5.038201822587474e-06,
	"loss": 0.9685,
	"step": 3980
	},
	{
	"epoch": 13.612297181895816,
	"grad_norm": 0.271484375,
	"learning_rate": 4.914069485035111e-06,
	"loss": 0.9698,
	"step": 3985
	},
	{
	"epoch": 13.62937660119556,
	"grad_norm": 0.265625,
	"learning_rate": 4.79144695714504e-06,
	"loss": 0.9747,
	"step": 3990
	},
	{
	"epoch": 13.646456020495304,
	"grad_norm": 0.2734375,
	"learning_rate": 4.67033618596322e-06,
	"loss": 0.9716,
	"step": 3995
	},
	{
	"epoch": 13.663535439795046,
	"grad_norm": 0.2734375,
	"learning_rate": 4.550739094531386e-06,
	"loss": 0.9771,
	"step": 4000
	},
	{
	"epoch": 13.68061485909479,
	"grad_norm": 0.275390625,
	"learning_rate": 4.432657581856525e-06,
	"loss": 0.9703,
	"step": 4005
	},
	{
	"epoch": 13.697694278394534,
	"grad_norm": 0.30078125,
	"learning_rate": 4.316093522880648e-06,
	"loss": 0.9621,
	"step": 4010
	},
	{
	"epoch": 13.714773697694278,
	"grad_norm": 0.28515625,
	"learning_rate": 4.20104876845111e-06,
	"loss": 0.977,
	"step": 4015
	},
	{
	"epoch": 13.731853116994023,
	"grad_norm": 0.275390625,
	"learning_rate": 4.087525145291204e-06,
	"loss": 0.9682,
	"step": 4020
	},
	{
	"epoch": 13.748932536293767,
	"grad_norm": 0.2890625,
	"learning_rate": 3.97552445597108e-06,
	"loss": 0.9693,
	"step": 4025
	},
	{
	"epoch": 13.766011955593509,
	"grad_norm": 0.28515625,
	"learning_rate": 3.865048478879241e-06,
	"loss": 0.9684,
	"step": 4030
	},
	{
	"epoch": 13.783091374893253,
	"grad_norm": 0.287109375,
	"learning_rate": 3.7560989681941992e-06,
	"loss": 0.9724,
	"step": 4035
	},
	{
	"epoch": 13.800170794192997,
	"grad_norm": 0.26953125,
	"learning_rate": 3.6486776538566803e-06,
	"loss": 0.977,
	"step": 4040
	},
	{
	"epoch": 13.817250213492741,
	"grad_norm": 0.271484375,
	"learning_rate": 3.542786241542162e-06,
	"loss": 0.9676,
	"step": 4045
	},
	{
	"epoch": 13.834329632792485,
	"grad_norm": 0.27734375,
	"learning_rate": 3.4384264126337328e-06,
	"loss": 0.967,
	"step": 4050
	},
	{
	"epoch": 13.85140905209223,
	"grad_norm": 0.2734375,
	"learning_rate": 3.3355998241954678e-06,
	"loss": 0.971,
	"step": 4055
	},
	{
	"epoch": 13.868488471391974,
	"grad_norm": 0.27734375,
	"learning_rate": 3.2343081089460603e-06,
	"loss": 0.9702,
	"step": 4060
	},
	{
	"epoch": 13.885567890691716,
	"grad_norm": 0.275390625,
	"learning_rate": 3.1345528752329212e-06,
	"loss": 0.9733,
	"step": 4065
	},
	{
	"epoch": 13.90264730999146,
	"grad_norm": 0.28125,
	"learning_rate": 3.0363357070066544e-06,
	"loss": 0.9707,
	"step": 4070
	},
	{
	"epoch": 13.919726729291204,
	"grad_norm": 0.287109375,
	"learning_rate": 2.939658163795844e-06,
	"loss": 0.9696,
	"step": 4075
	},
	{
	"epoch": 13.936806148590948,
	"grad_norm": 0.275390625,
	"learning_rate": 2.8445217806824077e-06,
	"loss": 0.9683,
	"step": 4080
	},
	{
	"epoch": 13.953885567890692,
	"grad_norm": 0.27734375,
	"learning_rate": 2.750928068277081e-06,
	"loss": 0.9703,
	"step": 4085
	},
	{
	"epoch": 13.970964987190435,
	"grad_norm": 0.287109375,
	"learning_rate": 2.658878512695562e-06,
	"loss": 0.9775,
	"step": 4090
	},
	{
	"epoch": 13.988044406490179,
	"grad_norm": 0.28125,
	"learning_rate": 2.5683745755348044e-06,
	"loss": 0.968,
	"step": 4095
	},
	{
	"epoch": 13.998292058070026,
	"eval_loss": 2.482388734817505,
	"eval_runtime": 0.5602,
	"eval_samples_per_second": 17.85,
	"eval_steps_per_second": 1.785,
	"step": 4098
	},
	{
	"epoch": 14.005123825789923,
	"grad_norm": 0.28125,
	"learning_rate": 2.4794176938498837e-06,
	"loss": 0.9692,
	"step": 4100
	},
	{
	"epoch": 14.022203245089667,
	"grad_norm": 0.27734375,
	"learning_rate": 2.392009280131169e-06,
	"loss": 0.9693,
	"step": 4105
	},
	{
	"epoch": 14.039282664389411,
	"grad_norm": 0.275390625,
	"learning_rate": 2.30615072228183e-06,
	"loss": 0.9759,
	"step": 4110
	},
	{
	"epoch": 14.056362083689155,
	"grad_norm": 0.279296875,
	"learning_rate": 2.221843383595923e-06,
	"loss": 0.9748,
	"step": 4115
	},
	{
	"epoch": 14.073441502988898,
	"grad_norm": 0.291015625,
	"learning_rate": 2.139088602736616e-06,
	"loss": 0.9762,
	"step": 4120
	},
	{
	"epoch": 14.090520922288642,
	"grad_norm": 0.275390625,
	"learning_rate": 2.057887693714988e-06,
	"loss": 0.9779,
	"step": 4125
	},
	{
	"epoch": 14.107600341588386,
	"grad_norm": 0.271484375,
	"learning_rate": 1.9782419458692193e-06,
	"loss": 0.9726,
	"step": 4130
	},
	{
	"epoch": 14.12467976088813,
	"grad_norm": 0.267578125,
	"learning_rate": 1.900152623843987e-06,
	"loss": 0.964,
	"step": 4135
	},
	{
	"epoch": 14.141759180187874,
	"grad_norm": 0.2734375,
	"learning_rate": 1.8236209675705274e-06,
	"loss": 0.9724,
	"step": 4140
	},
	{
	"epoch": 14.158838599487618,
	"grad_norm": 0.263671875,
	"learning_rate": 1.7486481922468489e-06,
	"loss": 0.9744,
	"step": 4145
	},
	{
	"epoch": 14.17591801878736,
	"grad_norm": 0.27734375,
	"learning_rate": 1.6752354883184717e-06,
	"loss": 0.9797,
	"step": 4150
	},
	{
	"epoch": 14.192997438087104,
	"grad_norm": 0.271484375,
	"learning_rate": 1.6033840214595308e-06,
	"loss": 0.976,
	"step": 4155
	},
	{
	"epoch": 14.210076857386849,
	"grad_norm": 0.283203125,
	"learning_rate": 1.5330949325542797e-06,
	"loss": 0.9653,
	"step": 4160
	},
	{
	"epoch": 14.227156276686593,
	"grad_norm": 0.271484375,
	"learning_rate": 1.4643693376789058e-06,
	"loss": 0.9744,
	"step": 4165
	},
	{
	"epoch": 14.244235695986337,
	"grad_norm": 0.29296875,
	"learning_rate": 1.397208328083921e-06,
	"loss": 0.9619,
	"step": 4170
	},
	{
	"epoch": 14.26131511528608,
	"grad_norm": 0.279296875,
	"learning_rate": 1.3316129701766878e-06,
	"loss": 0.9673,
	"step": 4175
	},
	{
	"epoch": 14.278394534585825,
	"grad_norm": 0.2734375,
	"learning_rate": 1.2675843055046765e-06,
	"loss": 0.9703,
	"step": 4180
	},
	{
	"epoch": 14.295473953885567,
	"grad_norm": 0.27734375,
	"learning_rate": 1.205123350738746e-06,
	"loss": 0.9778,
	"step": 4185
	},
	{
	"epoch": 14.312553373185311,
	"grad_norm": 0.279296875,
	"learning_rate": 1.144231097657078e-06,
	"loss": 0.9707,
	"step": 4190
	},
	{
	"epoch": 14.329632792485056,
	"grad_norm": 0.28125,
	"learning_rate": 1.0849085131294678e-06,
	"loss": 0.966,
	"step": 4195
	},
	{
	"epoch": 14.3467122117848,
	"grad_norm": 0.26953125,
	"learning_rate": 1.0271565391018922e-06,
	"loss": 0.9732,
	"step": 4200
	},
	{
	"epoch": 14.363791631084544,
	"grad_norm": 0.296875,
	"learning_rate": 9.709760925816325e-07,
	"loss": 0.9701,
	"step": 4205
	},
	{
	"epoch": 14.380871050384286,
	"grad_norm": 0.283203125,
	"learning_rate": 9.163680656226303e-07,
	"loss": 0.9754,
	"step": 4210
	},
	{
	"epoch": 14.39795046968403,
	"grad_norm": 0.279296875,
	"learning_rate": 8.633333253113995e-07,
	"loss": 0.9676,
	"step": 4215
	},
	{
	"epoch": 14.415029888983774,
	"grad_norm": 0.283203125,
	"learning_rate": 8.118727137532034e-07,
	"loss": 0.9781,
	"step": 4220
	},
	{
	"epoch": 14.432109308283518,
	"grad_norm": 0.28125,
	"learning_rate": 7.619870480587099e-07,
	"loss": 0.9664,
	"step": 4225
	},
	{
	"epoch": 14.449188727583262,
	"grad_norm": 0.275390625,
	"learning_rate": 7.136771203310245e-07,
	"loss": 0.9772,
	"step": 4230
	},
	{
	"epoch": 14.466268146883007,
	"grad_norm": 0.2734375,
	"learning_rate": 6.669436976530885e-07,
	"loss": 0.9685,
	"step": 4235
	},
	{
	"epoch": 14.48334756618275,
	"grad_norm": 0.275390625,
	"learning_rate": 6.21787522075512e-07,
	"loss": 0.9722,
	"step": 4240
	},
	{
	"epoch": 14.500426985482493,
	"grad_norm": 0.2734375,
	"learning_rate": 5.782093106048159e-07,
	"loss": 0.969,
	"step": 4245
	},
	{
	"epoch": 14.517506404782237,
	"grad_norm": 0.279296875,
	"learning_rate": 5.362097551919631e-07,
	"loss": 0.9691,
	"step": 4250
	},
	{
	"epoch": 14.534585824081981,
	"grad_norm": 0.275390625,
	"learning_rate": 4.957895227215015e-07,
	"loss": 0.9683,
	"step": 4255
	},
	{
	"epoch": 14.551665243381725,
	"grad_norm": 0.27734375,
	"learning_rate": 4.569492550008603e-07,
	"loss": 0.9676,
	"step": 4260
	},
	{
	"epoch": 14.56874466268147,
	"grad_norm": 0.279296875,
	"learning_rate": 4.1968956875020336e-07,
	"loss": 0.9796,
	"step": 4265
	},
	{
	"epoch": 14.585824081981212,
	"grad_norm": 0.275390625,
	"learning_rate": 3.84011055592659e-07,
	"loss": 0.9671,
	"step": 4270
	},
	{
	"epoch": 14.602903501280956,
	"grad_norm": 0.275390625,
	"learning_rate": 3.49914282044872e-07,
	"loss": 0.9776,
	"step": 4275
	},
	{
	"epoch": 14.6199829205807,
	"grad_norm": 0.27734375,
	"learning_rate": 3.1739978950806603e-07,
	"loss": 0.9661,
	"step": 4280
	},
	{
	"epoch": 14.637062339880444,
	"grad_norm": 0.28125,
	"learning_rate": 2.864680942594178e-07,
	"loss": 0.9708,
	"step": 4285
	},
	{
	"epoch": 14.654141759180188,
	"grad_norm": 0.2890625,
	"learning_rate": 2.5711968744382974e-07,
	"loss": 0.9728,
	"step": 4290
	},
	{
	"epoch": 14.671221178479932,
	"grad_norm": 0.271484375,
	"learning_rate": 2.2935503506621436e-07,
	"loss": 0.9742,
	"step": 4295
	},
	{
	"epoch": 14.688300597779676,
	"grad_norm": 0.291015625,
	"learning_rate": 2.0317457798398888e-07,
	"loss": 0.9753,
	"step": 4300
	},
	{
	"epoch": 14.705380017079419,
	"grad_norm": 0.30859375,
	"learning_rate": 1.7857873190019192e-07,
	"loss": 0.9707,
	"step": 4305
	},
	{
	"epoch": 14.722459436379163,
	"grad_norm": 0.27734375,
	"learning_rate": 1.5556788735676675e-07,
	"loss": 0.971,
	"step": 4310
	},
	{
	"epoch": 14.739538855678907,
	"grad_norm": 0.271484375,
	"learning_rate": 1.3414240972843273e-07,
	"loss": 0.9695,
	"step": 4315
	},
	{
	"epoch": 14.756618274978651,
	"grad_norm": 0.271484375,
	"learning_rate": 1.143026392168789e-07,
	"loss": 0.9833,
	"step": 4320
	},
	{
	"epoch": 14.773697694278395,
	"grad_norm": 0.27734375,
	"learning_rate": 9.604889084532387e-08,
	"loss": 0.9728,
	"step": 4325
	},
	{
	"epoch": 14.790777113578137,
	"grad_norm": 0.30078125,
	"learning_rate": 7.938145445357536e-08,
	"loss": 0.9739,
	"step": 4330
	},
	{
	"epoch": 14.807856532877882,
	"grad_norm": 0.279296875,
	"learning_rate": 6.430059469334504e-08,
	"loss": 0.974,
	"step": 4335
	},
	{
	"epoch": 14.824935952177626,
	"grad_norm": 0.28125,
	"learning_rate": 5.0806551024129565e-08,
	"loss": 0.9763,
	"step": 4340
	},
	{
	"epoch": 14.84201537147737,
	"grad_norm": 0.28125,
	"learning_rate": 3.889953770935817e-08,
	"loss": 0.978,
	"step": 4345
	},
	{
	"epoch": 14.859094790777114,
	"grad_norm": 0.287109375,
	"learning_rate": 2.8579743813006432e-08,
	"loss": 0.9686,
	"step": 4350
	},
	{
	"epoch": 14.876174210076858,
	"grad_norm": 0.27734375,
	"learning_rate": 1.98473331965654e-08,
	"loss": 0.9723,
	"step": 4355
	},
	{
	"epoch": 14.893253629376602,
	"grad_norm": 0.287109375,
	"learning_rate": 1.270244451652136e-08,
	"loss": 0.9641,
	"step": 4360
	},
	{
	"epoch": 14.910333048676344,
	"grad_norm": 0.28125,
	"learning_rate": 7.145191222035497e-09,
	"loss": 0.968,
	"step": 4365
	},
	{
	"epoch": 14.927412467976088,
	"grad_norm": 0.2734375,
	"learning_rate": 3.175661553256326e-09,
	"loss": 0.9763,
	"step": 4370
	},
	{
	"epoch": 14.944491887275833,
	"grad_norm": 0.283203125,
	"learning_rate": 7.939185398431193e-10,
	"loss": 0.972,
	"step": 4375
	},
	{
	"epoch": 14.961571306575577,
	"grad_norm": 0.275390625,
	"learning_rate": 0.0,
	"loss": 0.9728,
	"step": 4380
	},
	{
	"epoch": 14.961571306575577,
	"eval_loss": 2.482285976409912,
	"eval_runtime": 0.5491,
	"eval_samples_per_second": 18.212,
	"eval_steps_per_second": 1.821,
	"step": 4380
	},
	{
	"epoch": 14.961571306575577,
	"step": 4380,
	"total_flos": 5.145390446595277e+18,
	"train_loss": 1.0581742508226333,
	"train_runtime": 45587.0719,
	"train_samples_per_second": 9.241,
	"train_steps_per_second": 0.096
	}
	],
	"logging_steps": 5,
	"max_steps": 4380,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 15,
	"save_steps": 100,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 5.145390446595277e+18,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}