prometheus-7b / trainer_state.json
terry69's picture
Model save
185566e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999850411368736,
"eval_steps": 500,
"global_step": 3342,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002991772625280479,
"grad_norm": 24.73757525746042,
"learning_rate": 2.985074626865672e-08,
"loss": 1.4467,
"step": 1
},
{
"epoch": 0.0014958863126402393,
"grad_norm": 25.170481305263397,
"learning_rate": 1.4925373134328358e-07,
"loss": 1.418,
"step": 5
},
{
"epoch": 0.0029917726252804786,
"grad_norm": 17.671227057151118,
"learning_rate": 2.9850746268656716e-07,
"loss": 1.4125,
"step": 10
},
{
"epoch": 0.004487658937920718,
"grad_norm": 8.909500017199079,
"learning_rate": 4.4776119402985074e-07,
"loss": 1.3226,
"step": 15
},
{
"epoch": 0.005983545250560957,
"grad_norm": 11.280714871339804,
"learning_rate": 5.970149253731343e-07,
"loss": 1.1982,
"step": 20
},
{
"epoch": 0.0074794315632011965,
"grad_norm": 8.398369640505729,
"learning_rate": 7.462686567164179e-07,
"loss": 1.0909,
"step": 25
},
{
"epoch": 0.008975317875841436,
"grad_norm": 2.9571179549426505,
"learning_rate": 8.955223880597015e-07,
"loss": 0.9951,
"step": 30
},
{
"epoch": 0.010471204188481676,
"grad_norm": 2.8894837770977224,
"learning_rate": 1.044776119402985e-06,
"loss": 0.9586,
"step": 35
},
{
"epoch": 0.011967090501121914,
"grad_norm": 2.324424480950711,
"learning_rate": 1.1940298507462686e-06,
"loss": 0.9305,
"step": 40
},
{
"epoch": 0.013462976813762155,
"grad_norm": 2.2619500315834866,
"learning_rate": 1.3432835820895524e-06,
"loss": 0.91,
"step": 45
},
{
"epoch": 0.014958863126402393,
"grad_norm": 2.3542199193134663,
"learning_rate": 1.4925373134328358e-06,
"loss": 0.8925,
"step": 50
},
{
"epoch": 0.016454749439042633,
"grad_norm": 2.1942393280485444,
"learning_rate": 1.6417910447761196e-06,
"loss": 0.8768,
"step": 55
},
{
"epoch": 0.01795063575168287,
"grad_norm": 2.1616387387297245,
"learning_rate": 1.791044776119403e-06,
"loss": 0.8723,
"step": 60
},
{
"epoch": 0.01944652206432311,
"grad_norm": 2.2022102428237957,
"learning_rate": 1.9402985074626867e-06,
"loss": 0.8639,
"step": 65
},
{
"epoch": 0.020942408376963352,
"grad_norm": 2.1621065249124585,
"learning_rate": 2.08955223880597e-06,
"loss": 0.8699,
"step": 70
},
{
"epoch": 0.02243829468960359,
"grad_norm": 2.3233760195216147,
"learning_rate": 2.238805970149254e-06,
"loss": 0.8603,
"step": 75
},
{
"epoch": 0.02393418100224383,
"grad_norm": 2.266487924942459,
"learning_rate": 2.3880597014925373e-06,
"loss": 0.8537,
"step": 80
},
{
"epoch": 0.025430067314884067,
"grad_norm": 2.2873594748046076,
"learning_rate": 2.537313432835821e-06,
"loss": 0.8487,
"step": 85
},
{
"epoch": 0.02692595362752431,
"grad_norm": 2.283742446960398,
"learning_rate": 2.686567164179105e-06,
"loss": 0.8413,
"step": 90
},
{
"epoch": 0.028421839940164548,
"grad_norm": 2.1774038405712486,
"learning_rate": 2.835820895522388e-06,
"loss": 0.8281,
"step": 95
},
{
"epoch": 0.029917726252804786,
"grad_norm": 2.342263870606748,
"learning_rate": 2.9850746268656716e-06,
"loss": 0.8305,
"step": 100
},
{
"epoch": 0.031413612565445025,
"grad_norm": 2.385361324640983,
"learning_rate": 3.1343283582089558e-06,
"loss": 0.8168,
"step": 105
},
{
"epoch": 0.032909498878085267,
"grad_norm": 2.500114420278517,
"learning_rate": 3.283582089552239e-06,
"loss": 0.8207,
"step": 110
},
{
"epoch": 0.0344053851907255,
"grad_norm": 2.4374602854285286,
"learning_rate": 3.4328358208955225e-06,
"loss": 0.8244,
"step": 115
},
{
"epoch": 0.03590127150336574,
"grad_norm": 2.44198563300893,
"learning_rate": 3.582089552238806e-06,
"loss": 0.8199,
"step": 120
},
{
"epoch": 0.037397157816005985,
"grad_norm": 2.2760332882107157,
"learning_rate": 3.73134328358209e-06,
"loss": 0.8216,
"step": 125
},
{
"epoch": 0.03889304412864622,
"grad_norm": 2.508508841712968,
"learning_rate": 3.8805970149253735e-06,
"loss": 0.8014,
"step": 130
},
{
"epoch": 0.04038893044128646,
"grad_norm": 2.3029396530303066,
"learning_rate": 4.029850746268657e-06,
"loss": 0.7989,
"step": 135
},
{
"epoch": 0.041884816753926704,
"grad_norm": 2.3322348191256594,
"learning_rate": 4.17910447761194e-06,
"loss": 0.7964,
"step": 140
},
{
"epoch": 0.04338070306656694,
"grad_norm": 2.331622885776369,
"learning_rate": 4.3283582089552236e-06,
"loss": 0.8013,
"step": 145
},
{
"epoch": 0.04487658937920718,
"grad_norm": 2.288860960559162,
"learning_rate": 4.477611940298508e-06,
"loss": 0.8045,
"step": 150
},
{
"epoch": 0.04637247569184742,
"grad_norm": 2.4508630826235778,
"learning_rate": 4.626865671641791e-06,
"loss": 0.7898,
"step": 155
},
{
"epoch": 0.04786836200448766,
"grad_norm": 2.3184684975983045,
"learning_rate": 4.7761194029850745e-06,
"loss": 0.7937,
"step": 160
},
{
"epoch": 0.0493642483171279,
"grad_norm": 2.1613921165346826,
"learning_rate": 4.925373134328359e-06,
"loss": 0.7911,
"step": 165
},
{
"epoch": 0.050860134629768135,
"grad_norm": 2.453275213296358,
"learning_rate": 5.074626865671642e-06,
"loss": 0.7857,
"step": 170
},
{
"epoch": 0.05235602094240838,
"grad_norm": 2.3284898790399353,
"learning_rate": 5.2238805970149255e-06,
"loss": 0.7793,
"step": 175
},
{
"epoch": 0.05385190725504862,
"grad_norm": 2.3201172049873686,
"learning_rate": 5.37313432835821e-06,
"loss": 0.7779,
"step": 180
},
{
"epoch": 0.055347793567688854,
"grad_norm": 2.357711081062365,
"learning_rate": 5.522388059701493e-06,
"loss": 0.7847,
"step": 185
},
{
"epoch": 0.056843679880329095,
"grad_norm": 2.7611374374886037,
"learning_rate": 5.671641791044776e-06,
"loss": 0.7706,
"step": 190
},
{
"epoch": 0.05833956619296934,
"grad_norm": 2.6271055846372513,
"learning_rate": 5.820895522388061e-06,
"loss": 0.7607,
"step": 195
},
{
"epoch": 0.05983545250560957,
"grad_norm": 2.376880891581398,
"learning_rate": 5.970149253731343e-06,
"loss": 0.7671,
"step": 200
},
{
"epoch": 0.061331338818249814,
"grad_norm": 2.3835041311189022,
"learning_rate": 6.119402985074627e-06,
"loss": 0.7586,
"step": 205
},
{
"epoch": 0.06282722513089005,
"grad_norm": 2.462749716678564,
"learning_rate": 6.2686567164179116e-06,
"loss": 0.7758,
"step": 210
},
{
"epoch": 0.0643231114435303,
"grad_norm": 2.4077847282275866,
"learning_rate": 6.417910447761194e-06,
"loss": 0.7638,
"step": 215
},
{
"epoch": 0.06581899775617053,
"grad_norm": 2.3747888389216616,
"learning_rate": 6.567164179104478e-06,
"loss": 0.761,
"step": 220
},
{
"epoch": 0.06731488406881077,
"grad_norm": 2.493092964041521,
"learning_rate": 6.7164179104477625e-06,
"loss": 0.7574,
"step": 225
},
{
"epoch": 0.068810770381451,
"grad_norm": 2.6477317119354984,
"learning_rate": 6.865671641791045e-06,
"loss": 0.7498,
"step": 230
},
{
"epoch": 0.07030665669409125,
"grad_norm": 2.563921275934862,
"learning_rate": 7.014925373134329e-06,
"loss": 0.761,
"step": 235
},
{
"epoch": 0.07180254300673149,
"grad_norm": 2.4406043810417257,
"learning_rate": 7.164179104477612e-06,
"loss": 0.7423,
"step": 240
},
{
"epoch": 0.07329842931937172,
"grad_norm": 2.2639780494151034,
"learning_rate": 7.313432835820896e-06,
"loss": 0.7478,
"step": 245
},
{
"epoch": 0.07479431563201197,
"grad_norm": 2.6132105098393628,
"learning_rate": 7.46268656716418e-06,
"loss": 0.7522,
"step": 250
},
{
"epoch": 0.0762902019446522,
"grad_norm": 2.5688006092201365,
"learning_rate": 7.611940298507463e-06,
"loss": 0.7409,
"step": 255
},
{
"epoch": 0.07778608825729244,
"grad_norm": 2.4383140799178564,
"learning_rate": 7.761194029850747e-06,
"loss": 0.7449,
"step": 260
},
{
"epoch": 0.07928197456993269,
"grad_norm": 2.3404023048993365,
"learning_rate": 7.91044776119403e-06,
"loss": 0.7306,
"step": 265
},
{
"epoch": 0.08077786088257292,
"grad_norm": 2.435305377918958,
"learning_rate": 8.059701492537314e-06,
"loss": 0.7464,
"step": 270
},
{
"epoch": 0.08227374719521316,
"grad_norm": 2.9299430373461433,
"learning_rate": 8.208955223880599e-06,
"loss": 0.7279,
"step": 275
},
{
"epoch": 0.08376963350785341,
"grad_norm": 2.387726009024013,
"learning_rate": 8.35820895522388e-06,
"loss": 0.7388,
"step": 280
},
{
"epoch": 0.08526551982049364,
"grad_norm": 2.268041368580347,
"learning_rate": 8.507462686567165e-06,
"loss": 0.7407,
"step": 285
},
{
"epoch": 0.08676140613313388,
"grad_norm": 2.408844552763582,
"learning_rate": 8.656716417910447e-06,
"loss": 0.7222,
"step": 290
},
{
"epoch": 0.08825729244577413,
"grad_norm": 2.401282808607445,
"learning_rate": 8.805970149253732e-06,
"loss": 0.7265,
"step": 295
},
{
"epoch": 0.08975317875841436,
"grad_norm": 2.4691438286047425,
"learning_rate": 8.955223880597016e-06,
"loss": 0.7239,
"step": 300
},
{
"epoch": 0.0912490650710546,
"grad_norm": 2.3268484057723673,
"learning_rate": 9.104477611940299e-06,
"loss": 0.7227,
"step": 305
},
{
"epoch": 0.09274495138369485,
"grad_norm": 2.402308618181149,
"learning_rate": 9.253731343283582e-06,
"loss": 0.7244,
"step": 310
},
{
"epoch": 0.09424083769633508,
"grad_norm": 2.3361311735184604,
"learning_rate": 9.402985074626867e-06,
"loss": 0.7263,
"step": 315
},
{
"epoch": 0.09573672400897532,
"grad_norm": 2.335173501165771,
"learning_rate": 9.552238805970149e-06,
"loss": 0.7215,
"step": 320
},
{
"epoch": 0.09723261032161555,
"grad_norm": 2.730322757042367,
"learning_rate": 9.701492537313434e-06,
"loss": 0.7332,
"step": 325
},
{
"epoch": 0.0987284966342558,
"grad_norm": 2.2835032077275312,
"learning_rate": 9.850746268656717e-06,
"loss": 0.7115,
"step": 330
},
{
"epoch": 0.10022438294689603,
"grad_norm": 2.4782326787594338,
"learning_rate": 1e-05,
"loss": 0.7142,
"step": 335
},
{
"epoch": 0.10172026925953627,
"grad_norm": 2.4128411535499454,
"learning_rate": 9.999931779967976e-06,
"loss": 0.7108,
"step": 340
},
{
"epoch": 0.10321615557217652,
"grad_norm": 2.230247650641441,
"learning_rate": 9.99972712173349e-06,
"loss": 0.7037,
"step": 345
},
{
"epoch": 0.10471204188481675,
"grad_norm": 2.4375274487235363,
"learning_rate": 9.999386030881264e-06,
"loss": 0.7052,
"step": 350
},
{
"epoch": 0.10620792819745699,
"grad_norm": 2.497148482639146,
"learning_rate": 9.998908516718984e-06,
"loss": 0.723,
"step": 355
},
{
"epoch": 0.10770381451009724,
"grad_norm": 2.7572554578379793,
"learning_rate": 9.998294592277064e-06,
"loss": 0.7089,
"step": 360
},
{
"epoch": 0.10919970082273747,
"grad_norm": 2.4811541468806175,
"learning_rate": 9.997544274308282e-06,
"loss": 0.7049,
"step": 365
},
{
"epoch": 0.11069558713537771,
"grad_norm": 2.3612905719078556,
"learning_rate": 9.996657583287326e-06,
"loss": 0.7112,
"step": 370
},
{
"epoch": 0.11219147344801796,
"grad_norm": 2.3269094749093346,
"learning_rate": 9.995634543410231e-06,
"loss": 0.7091,
"step": 375
},
{
"epoch": 0.11368735976065819,
"grad_norm": 2.1856434021067543,
"learning_rate": 9.99447518259372e-06,
"loss": 0.6945,
"step": 380
},
{
"epoch": 0.11518324607329843,
"grad_norm": 2.2818233887085677,
"learning_rate": 9.99317953247445e-06,
"loss": 0.695,
"step": 385
},
{
"epoch": 0.11667913238593867,
"grad_norm": 2.5644750677779267,
"learning_rate": 9.991747628408138e-06,
"loss": 0.6878,
"step": 390
},
{
"epoch": 0.11817501869857891,
"grad_norm": 2.485293875776813,
"learning_rate": 9.990179509468595e-06,
"loss": 0.6987,
"step": 395
},
{
"epoch": 0.11967090501121914,
"grad_norm": 2.291043542367887,
"learning_rate": 9.988475218446676e-06,
"loss": 0.6898,
"step": 400
},
{
"epoch": 0.1211667913238594,
"grad_norm": 2.4232767050288726,
"learning_rate": 9.986634801849093e-06,
"loss": 0.6963,
"step": 405
},
{
"epoch": 0.12266267763649963,
"grad_norm": 2.4211336275289512,
"learning_rate": 9.984658309897161e-06,
"loss": 0.6793,
"step": 410
},
{
"epoch": 0.12415856394913986,
"grad_norm": 2.5029979080579734,
"learning_rate": 9.982545796525416e-06,
"loss": 0.6773,
"step": 415
},
{
"epoch": 0.1256544502617801,
"grad_norm": 2.4668720551969487,
"learning_rate": 9.980297319380148e-06,
"loss": 0.6741,
"step": 420
},
{
"epoch": 0.12715033657442035,
"grad_norm": 2.1324057406570796,
"learning_rate": 9.977912939817833e-06,
"loss": 0.6717,
"step": 425
},
{
"epoch": 0.1286462228870606,
"grad_norm": 2.216513918034811,
"learning_rate": 9.97539272290345e-06,
"loss": 0.664,
"step": 430
},
{
"epoch": 0.13014210919970082,
"grad_norm": 2.3891946944700346,
"learning_rate": 9.97273673740871e-06,
"loss": 0.6779,
"step": 435
},
{
"epoch": 0.13163799551234107,
"grad_norm": 2.107297355911597,
"learning_rate": 9.96994505581018e-06,
"loss": 0.663,
"step": 440
},
{
"epoch": 0.13313388182498131,
"grad_norm": 2.2741240440107666,
"learning_rate": 9.967017754287303e-06,
"loss": 0.6628,
"step": 445
},
{
"epoch": 0.13462976813762154,
"grad_norm": 2.231118541487464,
"learning_rate": 9.963954912720319e-06,
"loss": 0.6805,
"step": 450
},
{
"epoch": 0.13612565445026178,
"grad_norm": 2.327411278722037,
"learning_rate": 9.960756614688089e-06,
"loss": 0.6572,
"step": 455
},
{
"epoch": 0.137621540762902,
"grad_norm": 2.2727646648145097,
"learning_rate": 9.957422947465814e-06,
"loss": 0.6682,
"step": 460
},
{
"epoch": 0.13911742707554225,
"grad_norm": 2.43427967377174,
"learning_rate": 9.953954002022643e-06,
"loss": 0.658,
"step": 465
},
{
"epoch": 0.1406133133881825,
"grad_norm": 2.203173002529278,
"learning_rate": 9.950349873019204e-06,
"loss": 0.6513,
"step": 470
},
{
"epoch": 0.14210919970082272,
"grad_norm": 2.159064147239943,
"learning_rate": 9.946610658805018e-06,
"loss": 0.6597,
"step": 475
},
{
"epoch": 0.14360508601346297,
"grad_norm": 2.2802374368293186,
"learning_rate": 9.94273646141581e-06,
"loss": 0.6642,
"step": 480
},
{
"epoch": 0.14510097232610322,
"grad_norm": 2.321550706239028,
"learning_rate": 9.938727386570727e-06,
"loss": 0.6525,
"step": 485
},
{
"epoch": 0.14659685863874344,
"grad_norm": 2.3398188402263105,
"learning_rate": 9.934583543669454e-06,
"loss": 0.6583,
"step": 490
},
{
"epoch": 0.1480927449513837,
"grad_norm": 2.1439110014914524,
"learning_rate": 9.93030504578923e-06,
"loss": 0.6413,
"step": 495
},
{
"epoch": 0.14958863126402394,
"grad_norm": 2.2275265346511377,
"learning_rate": 9.925892009681762e-06,
"loss": 0.6529,
"step": 500
},
{
"epoch": 0.15108451757666416,
"grad_norm": 2.3496939081419637,
"learning_rate": 9.921344555770033e-06,
"loss": 0.6437,
"step": 505
},
{
"epoch": 0.1525804038893044,
"grad_norm": 2.238484219281493,
"learning_rate": 9.916662808145023e-06,
"loss": 0.6452,
"step": 510
},
{
"epoch": 0.15407629020194466,
"grad_norm": 2.78908558811821,
"learning_rate": 9.911846894562325e-06,
"loss": 0.6436,
"step": 515
},
{
"epoch": 0.15557217651458488,
"grad_norm": 2.320928708686177,
"learning_rate": 9.906896946438646e-06,
"loss": 0.6336,
"step": 520
},
{
"epoch": 0.15706806282722513,
"grad_norm": 2.2586199846671686,
"learning_rate": 9.901813098848238e-06,
"loss": 0.6338,
"step": 525
},
{
"epoch": 0.15856394913986538,
"grad_norm": 2.3116521162760217,
"learning_rate": 9.896595490519196e-06,
"loss": 0.6414,
"step": 530
},
{
"epoch": 0.1600598354525056,
"grad_norm": 2.1311643830360767,
"learning_rate": 9.891244263829685e-06,
"loss": 0.64,
"step": 535
},
{
"epoch": 0.16155572176514585,
"grad_norm": 2.3201652793369605,
"learning_rate": 9.885759564804045e-06,
"loss": 0.6197,
"step": 540
},
{
"epoch": 0.1630516080777861,
"grad_norm": 2.1802123067545134,
"learning_rate": 9.880141543108816e-06,
"loss": 0.6354,
"step": 545
},
{
"epoch": 0.16454749439042632,
"grad_norm": 2.3111352831943086,
"learning_rate": 9.874390352048646e-06,
"loss": 0.6422,
"step": 550
},
{
"epoch": 0.16604338070306657,
"grad_norm": 2.3857931202103524,
"learning_rate": 9.868506148562107e-06,
"loss": 0.6255,
"step": 555
},
{
"epoch": 0.16753926701570682,
"grad_norm": 2.3118891681947518,
"learning_rate": 9.862489093217422e-06,
"loss": 0.6123,
"step": 560
},
{
"epoch": 0.16903515332834704,
"grad_norm": 2.3891897641974165,
"learning_rate": 9.856339350208073e-06,
"loss": 0.6426,
"step": 565
},
{
"epoch": 0.1705310396409873,
"grad_norm": 2.305906878734901,
"learning_rate": 9.850057087348328e-06,
"loss": 0.6199,
"step": 570
},
{
"epoch": 0.17202692595362754,
"grad_norm": 2.1960382748129432,
"learning_rate": 9.843642476068654e-06,
"loss": 0.6095,
"step": 575
},
{
"epoch": 0.17352281226626776,
"grad_norm": 2.0839495395902534,
"learning_rate": 9.837095691411047e-06,
"loss": 0.6131,
"step": 580
},
{
"epoch": 0.175018698578908,
"grad_norm": 2.4685394970589685,
"learning_rate": 9.83041691202425e-06,
"loss": 0.6257,
"step": 585
},
{
"epoch": 0.17651458489154825,
"grad_norm": 2.8548483464223957,
"learning_rate": 9.82360632015888e-06,
"loss": 0.5935,
"step": 590
},
{
"epoch": 0.17801047120418848,
"grad_norm": 2.5687866778693347,
"learning_rate": 9.816664101662458e-06,
"loss": 0.6176,
"step": 595
},
{
"epoch": 0.17950635751682872,
"grad_norm": 2.1643123544103497,
"learning_rate": 9.809590445974328e-06,
"loss": 0.6236,
"step": 600
},
{
"epoch": 0.18100224382946897,
"grad_norm": 2.1920911452788023,
"learning_rate": 9.802385546120498e-06,
"loss": 0.6149,
"step": 605
},
{
"epoch": 0.1824981301421092,
"grad_norm": 2.1719167623114046,
"learning_rate": 9.795049598708369e-06,
"loss": 0.6165,
"step": 610
},
{
"epoch": 0.18399401645474944,
"grad_norm": 2.045624267196742,
"learning_rate": 9.787582803921366e-06,
"loss": 0.6056,
"step": 615
},
{
"epoch": 0.1854899027673897,
"grad_norm": 2.1670193890658105,
"learning_rate": 9.77998536551348e-06,
"loss": 0.583,
"step": 620
},
{
"epoch": 0.1869857890800299,
"grad_norm": 2.143005021612413,
"learning_rate": 9.77225749080371e-06,
"loss": 0.6025,
"step": 625
},
{
"epoch": 0.18848167539267016,
"grad_norm": 2.2897606994593733,
"learning_rate": 9.764399390670401e-06,
"loss": 0.6044,
"step": 630
},
{
"epoch": 0.18997756170531038,
"grad_norm": 2.1407407791372304,
"learning_rate": 9.756411279545486e-06,
"loss": 0.6028,
"step": 635
},
{
"epoch": 0.19147344801795063,
"grad_norm": 2.1400040414477512,
"learning_rate": 9.748293375408647e-06,
"loss": 0.6008,
"step": 640
},
{
"epoch": 0.19296933433059088,
"grad_norm": 2.3487555741055646,
"learning_rate": 9.740045899781353e-06,
"loss": 0.5905,
"step": 645
},
{
"epoch": 0.1944652206432311,
"grad_norm": 2.211663714643132,
"learning_rate": 9.731669077720828e-06,
"loss": 0.5834,
"step": 650
},
{
"epoch": 0.19596110695587135,
"grad_norm": 2.188161715718423,
"learning_rate": 9.723163137813898e-06,
"loss": 0.5855,
"step": 655
},
{
"epoch": 0.1974569932685116,
"grad_norm": 2.133955120338045,
"learning_rate": 9.714528312170762e-06,
"loss": 0.5944,
"step": 660
},
{
"epoch": 0.19895287958115182,
"grad_norm": 2.2340780975578527,
"learning_rate": 9.705764836418648e-06,
"loss": 0.583,
"step": 665
},
{
"epoch": 0.20044876589379207,
"grad_norm": 2.3292781920189936,
"learning_rate": 9.696872949695399e-06,
"loss": 0.5827,
"step": 670
},
{
"epoch": 0.20194465220643232,
"grad_norm": 2.3176955302107647,
"learning_rate": 9.687852894642932e-06,
"loss": 0.584,
"step": 675
},
{
"epoch": 0.20344053851907254,
"grad_norm": 2.2410986216187863,
"learning_rate": 9.678704917400628e-06,
"loss": 0.5702,
"step": 680
},
{
"epoch": 0.2049364248317128,
"grad_norm": 2.2113552696479766,
"learning_rate": 9.669429267598603e-06,
"loss": 0.5656,
"step": 685
},
{
"epoch": 0.20643231114435304,
"grad_norm": 2.1894234586204613,
"learning_rate": 9.660026198350906e-06,
"loss": 0.5688,
"step": 690
},
{
"epoch": 0.20792819745699326,
"grad_norm": 2.2894157314528183,
"learning_rate": 9.650495966248618e-06,
"loss": 0.5563,
"step": 695
},
{
"epoch": 0.2094240837696335,
"grad_norm": 2.2231586059805863,
"learning_rate": 9.64083883135283e-06,
"loss": 0.5642,
"step": 700
},
{
"epoch": 0.21091997008227376,
"grad_norm": 2.227615707267463,
"learning_rate": 9.631055057187564e-06,
"loss": 0.5788,
"step": 705
},
{
"epoch": 0.21241585639491398,
"grad_norm": 2.155741018622304,
"learning_rate": 9.621144910732573e-06,
"loss": 0.5634,
"step": 710
},
{
"epoch": 0.21391174270755423,
"grad_norm": 2.396343334926677,
"learning_rate": 9.611108662416064e-06,
"loss": 0.5655,
"step": 715
},
{
"epoch": 0.21540762902019447,
"grad_norm": 2.331449791458783,
"learning_rate": 9.600946586107306e-06,
"loss": 0.5739,
"step": 720
},
{
"epoch": 0.2169035153328347,
"grad_norm": 2.2507152546219924,
"learning_rate": 9.590658959109168e-06,
"loss": 0.5768,
"step": 725
},
{
"epoch": 0.21839940164547494,
"grad_norm": 2.164980578292193,
"learning_rate": 9.58024606215055e-06,
"loss": 0.5517,
"step": 730
},
{
"epoch": 0.2198952879581152,
"grad_norm": 2.2186056393230484,
"learning_rate": 9.569708179378716e-06,
"loss": 0.5773,
"step": 735
},
{
"epoch": 0.22139117427075541,
"grad_norm": 2.1412265933937245,
"learning_rate": 9.559045598351544e-06,
"loss": 0.5597,
"step": 740
},
{
"epoch": 0.22288706058339566,
"grad_norm": 2.113998854082962,
"learning_rate": 9.548258610029684e-06,
"loss": 0.5602,
"step": 745
},
{
"epoch": 0.2243829468960359,
"grad_norm": 2.1066935794719823,
"learning_rate": 9.537347508768613e-06,
"loss": 0.553,
"step": 750
},
{
"epoch": 0.22587883320867613,
"grad_norm": 2.1269652854319285,
"learning_rate": 9.526312592310597e-06,
"loss": 0.5462,
"step": 755
},
{
"epoch": 0.22737471952131638,
"grad_norm": 2.1421869014604966,
"learning_rate": 9.515154161776584e-06,
"loss": 0.5508,
"step": 760
},
{
"epoch": 0.22887060583395663,
"grad_norm": 2.116284198421969,
"learning_rate": 9.503872521657964e-06,
"loss": 0.549,
"step": 765
},
{
"epoch": 0.23036649214659685,
"grad_norm": 2.0774732327342673,
"learning_rate": 9.49246797980828e-06,
"loss": 0.5485,
"step": 770
},
{
"epoch": 0.2318623784592371,
"grad_norm": 2.276120847003367,
"learning_rate": 9.480940847434814e-06,
"loss": 0.5553,
"step": 775
},
{
"epoch": 0.23335826477187735,
"grad_norm": 2.1356056201671882,
"learning_rate": 9.469291439090104e-06,
"loss": 0.5465,
"step": 780
},
{
"epoch": 0.23485415108451757,
"grad_norm": 2.048373811826588,
"learning_rate": 9.457520072663353e-06,
"loss": 0.5396,
"step": 785
},
{
"epoch": 0.23635003739715782,
"grad_norm": 2.2466734007706397,
"learning_rate": 9.445627069371758e-06,
"loss": 0.5688,
"step": 790
},
{
"epoch": 0.23784592370979807,
"grad_norm": 2.3976619549715292,
"learning_rate": 9.433612753751748e-06,
"loss": 0.5496,
"step": 795
},
{
"epoch": 0.2393418100224383,
"grad_norm": 2.0982203268057793,
"learning_rate": 9.421477453650118e-06,
"loss": 0.5482,
"step": 800
},
{
"epoch": 0.24083769633507854,
"grad_norm": 2.1926594347223936,
"learning_rate": 9.409221500215096e-06,
"loss": 0.5281,
"step": 805
},
{
"epoch": 0.2423335826477188,
"grad_norm": 2.046500172753204,
"learning_rate": 9.396845227887295e-06,
"loss": 0.5495,
"step": 810
},
{
"epoch": 0.243829468960359,
"grad_norm": 2.116270403530158,
"learning_rate": 9.38434897439059e-06,
"loss": 0.5333,
"step": 815
},
{
"epoch": 0.24532535527299926,
"grad_norm": 2.1427393113292026,
"learning_rate": 9.371733080722911e-06,
"loss": 0.5314,
"step": 820
},
{
"epoch": 0.24682124158563948,
"grad_norm": 2.2287931226941766,
"learning_rate": 9.358997891146924e-06,
"loss": 0.5389,
"step": 825
},
{
"epoch": 0.24831712789827973,
"grad_norm": 2.183511996335904,
"learning_rate": 9.346143753180646e-06,
"loss": 0.5332,
"step": 830
},
{
"epoch": 0.24981301421091998,
"grad_norm": 2.1563125330336077,
"learning_rate": 9.333171017587956e-06,
"loss": 0.5278,
"step": 835
},
{
"epoch": 0.2513089005235602,
"grad_norm": 2.384672087516804,
"learning_rate": 9.320080038369032e-06,
"loss": 0.5321,
"step": 840
},
{
"epoch": 0.25280478683620045,
"grad_norm": 2.2250998536771154,
"learning_rate": 9.30687117275068e-06,
"loss": 0.5237,
"step": 845
},
{
"epoch": 0.2543006731488407,
"grad_norm": 2.3295538202244237,
"learning_rate": 9.293544781176598e-06,
"loss": 0.5238,
"step": 850
},
{
"epoch": 0.25579655946148094,
"grad_norm": 2.46386287871832,
"learning_rate": 9.280101227297526e-06,
"loss": 0.5274,
"step": 855
},
{
"epoch": 0.2572924457741212,
"grad_norm": 2.2480305463427865,
"learning_rate": 9.266540877961337e-06,
"loss": 0.535,
"step": 860
},
{
"epoch": 0.2587883320867614,
"grad_norm": 2.1850110027540826,
"learning_rate": 9.252864103203015e-06,
"loss": 0.5216,
"step": 865
},
{
"epoch": 0.26028421839940163,
"grad_norm": 2.1759114077528845,
"learning_rate": 9.239071276234568e-06,
"loss": 0.5162,
"step": 870
},
{
"epoch": 0.2617801047120419,
"grad_norm": 2.1338769320741515,
"learning_rate": 9.225162773434831e-06,
"loss": 0.5143,
"step": 875
},
{
"epoch": 0.26327599102468213,
"grad_norm": 2.1659203361390063,
"learning_rate": 9.21113897433921e-06,
"loss": 0.5103,
"step": 880
},
{
"epoch": 0.2647718773373224,
"grad_norm": 2.122282430960376,
"learning_rate": 9.197000261629314e-06,
"loss": 0.5081,
"step": 885
},
{
"epoch": 0.26626776364996263,
"grad_norm": 2.056748593014802,
"learning_rate": 9.182747021122516e-06,
"loss": 0.5117,
"step": 890
},
{
"epoch": 0.2677636499626028,
"grad_norm": 2.203097118962648,
"learning_rate": 9.168379641761425e-06,
"loss": 0.5166,
"step": 895
},
{
"epoch": 0.26925953627524307,
"grad_norm": 2.248299702751712,
"learning_rate": 9.153898515603272e-06,
"loss": 0.5121,
"step": 900
},
{
"epoch": 0.2707554225878833,
"grad_norm": 2.215308947297488,
"learning_rate": 9.139304037809216e-06,
"loss": 0.5151,
"step": 905
},
{
"epoch": 0.27225130890052357,
"grad_norm": 2.115586467592,
"learning_rate": 9.124596606633551e-06,
"loss": 0.5083,
"step": 910
},
{
"epoch": 0.2737471952131638,
"grad_norm": 2.2977950459018017,
"learning_rate": 9.10977662341285e-06,
"loss": 0.5153,
"step": 915
},
{
"epoch": 0.275243081525804,
"grad_norm": 2.205780583800523,
"learning_rate": 9.094844492555004e-06,
"loss": 0.5123,
"step": 920
},
{
"epoch": 0.27673896783844426,
"grad_norm": 2.227802917043228,
"learning_rate": 9.07980062152819e-06,
"loss": 0.5117,
"step": 925
},
{
"epoch": 0.2782348541510845,
"grad_norm": 2.2359783620231632,
"learning_rate": 9.064645420849754e-06,
"loss": 0.5022,
"step": 930
},
{
"epoch": 0.27973074046372476,
"grad_norm": 2.1642613110172366,
"learning_rate": 9.049379304075009e-06,
"loss": 0.4907,
"step": 935
},
{
"epoch": 0.281226626776365,
"grad_norm": 2.2277389804733447,
"learning_rate": 9.03400268778594e-06,
"loss": 0.5011,
"step": 940
},
{
"epoch": 0.28272251308900526,
"grad_norm": 2.1493583853918907,
"learning_rate": 9.018515991579851e-06,
"loss": 0.5019,
"step": 945
},
{
"epoch": 0.28421839940164545,
"grad_norm": 2.4395894627674073,
"learning_rate": 9.002919638057908e-06,
"loss": 0.5033,
"step": 950
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.2370400153506806,
"learning_rate": 8.987214052813605e-06,
"loss": 0.5045,
"step": 955
},
{
"epoch": 0.28721017202692595,
"grad_norm": 2.078576437577485,
"learning_rate": 8.971399664421154e-06,
"loss": 0.5009,
"step": 960
},
{
"epoch": 0.2887060583395662,
"grad_norm": 2.2142839400817937,
"learning_rate": 8.955476904423785e-06,
"loss": 0.5023,
"step": 965
},
{
"epoch": 0.29020194465220644,
"grad_norm": 2.14232609513754,
"learning_rate": 8.939446207321982e-06,
"loss": 0.477,
"step": 970
},
{
"epoch": 0.2916978309648467,
"grad_norm": 2.21107323554905,
"learning_rate": 8.923308010561608e-06,
"loss": 0.4994,
"step": 975
},
{
"epoch": 0.2931937172774869,
"grad_norm": 2.1386395431438054,
"learning_rate": 8.907062754521985e-06,
"loss": 0.5023,
"step": 980
},
{
"epoch": 0.29468960359012714,
"grad_norm": 2.1332355719651037,
"learning_rate": 8.89071088250387e-06,
"loss": 0.4843,
"step": 985
},
{
"epoch": 0.2961854899027674,
"grad_norm": 2.0749503641930276,
"learning_rate": 8.87425284071735e-06,
"loss": 0.4942,
"step": 990
},
{
"epoch": 0.29768137621540763,
"grad_norm": 2.159991846647922,
"learning_rate": 8.857689078269688e-06,
"loss": 0.5108,
"step": 995
},
{
"epoch": 0.2991772625280479,
"grad_norm": 2.1267522505598446,
"learning_rate": 8.841020047153039e-06,
"loss": 0.4935,
"step": 1000
},
{
"epoch": 0.30067314884068813,
"grad_norm": 2.1642503588715245,
"learning_rate": 8.824246202232142e-06,
"loss": 0.4907,
"step": 1005
},
{
"epoch": 0.3021690351533283,
"grad_norm": 2.084991570149356,
"learning_rate": 8.80736800123189e-06,
"loss": 0.4781,
"step": 1010
},
{
"epoch": 0.3036649214659686,
"grad_norm": 2.1035440822771223,
"learning_rate": 8.790385904724848e-06,
"loss": 0.4845,
"step": 1015
},
{
"epoch": 0.3051608077786088,
"grad_norm": 2.1736909744601687,
"learning_rate": 8.773300376118685e-06,
"loss": 0.4801,
"step": 1020
},
{
"epoch": 0.30665669409124907,
"grad_norm": 2.2520314938860815,
"learning_rate": 8.75611188164352e-06,
"loss": 0.4893,
"step": 1025
},
{
"epoch": 0.3081525804038893,
"grad_norm": 2.1104641749948403,
"learning_rate": 8.738820890339217e-06,
"loss": 0.4938,
"step": 1030
},
{
"epoch": 0.30964846671652957,
"grad_norm": 2.0838403753220986,
"learning_rate": 8.721427874042563e-06,
"loss": 0.4835,
"step": 1035
},
{
"epoch": 0.31114435302916976,
"grad_norm": 2.0711510810184266,
"learning_rate": 8.703933307374413e-06,
"loss": 0.4725,
"step": 1040
},
{
"epoch": 0.31264023934181,
"grad_norm": 2.1063779245743888,
"learning_rate": 8.686337667726723e-06,
"loss": 0.4892,
"step": 1045
},
{
"epoch": 0.31413612565445026,
"grad_norm": 2.1105067703269422,
"learning_rate": 8.668641435249534e-06,
"loss": 0.4825,
"step": 1050
},
{
"epoch": 0.3156320119670905,
"grad_norm": 2.102573408737706,
"learning_rate": 8.650845092837867e-06,
"loss": 0.4885,
"step": 1055
},
{
"epoch": 0.31712789827973076,
"grad_norm": 2.2988609972066274,
"learning_rate": 8.632949126118538e-06,
"loss": 0.4752,
"step": 1060
},
{
"epoch": 0.318623784592371,
"grad_norm": 2.122502919871484,
"learning_rate": 8.61495402343692e-06,
"loss": 0.4769,
"step": 1065
},
{
"epoch": 0.3201196709050112,
"grad_norm": 2.165018274340972,
"learning_rate": 8.596860275843602e-06,
"loss": 0.4671,
"step": 1070
},
{
"epoch": 0.32161555721765145,
"grad_norm": 1.9717223958070753,
"learning_rate": 8.578668377081001e-06,
"loss": 0.4675,
"step": 1075
},
{
"epoch": 0.3231114435302917,
"grad_norm": 2.1031743583556803,
"learning_rate": 8.560378823569886e-06,
"loss": 0.4713,
"step": 1080
},
{
"epoch": 0.32460732984293195,
"grad_norm": 2.0178473800411307,
"learning_rate": 8.541992114395825e-06,
"loss": 0.4715,
"step": 1085
},
{
"epoch": 0.3261032161555722,
"grad_norm": 2.0225831073597007,
"learning_rate": 8.523508751295574e-06,
"loss": 0.4772,
"step": 1090
},
{
"epoch": 0.3275991024682124,
"grad_norm": 2.087877364586164,
"learning_rate": 8.504929238643381e-06,
"loss": 0.4688,
"step": 1095
},
{
"epoch": 0.32909498878085264,
"grad_norm": 2.160270876260719,
"learning_rate": 8.486254083437227e-06,
"loss": 0.4665,
"step": 1100
},
{
"epoch": 0.3305908750934929,
"grad_norm": 2.060627567407879,
"learning_rate": 8.467483795284987e-06,
"loss": 0.4617,
"step": 1105
},
{
"epoch": 0.33208676140613314,
"grad_norm": 2.1150833498354893,
"learning_rate": 8.448618886390523e-06,
"loss": 0.4676,
"step": 1110
},
{
"epoch": 0.3335826477187734,
"grad_norm": 2.026961078510351,
"learning_rate": 8.429659871539709e-06,
"loss": 0.4772,
"step": 1115
},
{
"epoch": 0.33507853403141363,
"grad_norm": 2.6128907908421852,
"learning_rate": 8.410607268086388e-06,
"loss": 0.4678,
"step": 1120
},
{
"epoch": 0.3365744203440538,
"grad_norm": 2.1162204368840185,
"learning_rate": 8.391461595938245e-06,
"loss": 0.4728,
"step": 1125
},
{
"epoch": 0.3380703066566941,
"grad_norm": 2.0236924938571095,
"learning_rate": 8.372223377542631e-06,
"loss": 0.4556,
"step": 1130
},
{
"epoch": 0.3395661929693343,
"grad_norm": 2.0470011404134345,
"learning_rate": 8.352893137872292e-06,
"loss": 0.4476,
"step": 1135
},
{
"epoch": 0.3410620792819746,
"grad_norm": 2.100726525573022,
"learning_rate": 8.333471404411054e-06,
"loss": 0.458,
"step": 1140
},
{
"epoch": 0.3425579655946148,
"grad_norm": 2.162727675316811,
"learning_rate": 8.313958707139434e-06,
"loss": 0.4751,
"step": 1145
},
{
"epoch": 0.34405385190725507,
"grad_norm": 2.0116970709952495,
"learning_rate": 8.29435557852016e-06,
"loss": 0.4647,
"step": 1150
},
{
"epoch": 0.34554973821989526,
"grad_norm": 1.9814728402387116,
"learning_rate": 8.274662553483662e-06,
"loss": 0.4441,
"step": 1155
},
{
"epoch": 0.3470456245325355,
"grad_norm": 2.0453867973962607,
"learning_rate": 8.254880169413455e-06,
"loss": 0.4613,
"step": 1160
},
{
"epoch": 0.34854151084517576,
"grad_norm": 2.0854663750868268,
"learning_rate": 8.235008966131492e-06,
"loss": 0.456,
"step": 1165
},
{
"epoch": 0.350037397157816,
"grad_norm": 2.1204249951123706,
"learning_rate": 8.215049485883419e-06,
"loss": 0.4526,
"step": 1170
},
{
"epoch": 0.35153328347045626,
"grad_norm": 2.125080372850005,
"learning_rate": 8.195002273323792e-06,
"loss": 0.4442,
"step": 1175
},
{
"epoch": 0.3530291697830965,
"grad_norm": 2.2626876387499224,
"learning_rate": 8.174867875501203e-06,
"loss": 0.4491,
"step": 1180
},
{
"epoch": 0.3545250560957367,
"grad_norm": 2.17644103793076,
"learning_rate": 8.154646841843358e-06,
"loss": 0.449,
"step": 1185
},
{
"epoch": 0.35602094240837695,
"grad_norm": 1.9934405786856697,
"learning_rate": 8.134339724142083e-06,
"loss": 0.4491,
"step": 1190
},
{
"epoch": 0.3575168287210172,
"grad_norm": 1.9811124546772585,
"learning_rate": 8.113947076538264e-06,
"loss": 0.4412,
"step": 1195
},
{
"epoch": 0.35901271503365745,
"grad_norm": 2.1197485018681785,
"learning_rate": 8.093469455506731e-06,
"loss": 0.4448,
"step": 1200
},
{
"epoch": 0.3605086013462977,
"grad_norm": 2.0582968984341967,
"learning_rate": 8.07290741984107e-06,
"loss": 0.4397,
"step": 1205
},
{
"epoch": 0.36200448765893795,
"grad_norm": 1.9803742197531462,
"learning_rate": 8.052261530638375e-06,
"loss": 0.4486,
"step": 1210
},
{
"epoch": 0.36350037397157814,
"grad_norm": 1.9763814917893987,
"learning_rate": 8.03153235128393e-06,
"loss": 0.4379,
"step": 1215
},
{
"epoch": 0.3649962602842184,
"grad_norm": 2.177684384739003,
"learning_rate": 8.01072044743585e-06,
"loss": 0.4448,
"step": 1220
},
{
"epoch": 0.36649214659685864,
"grad_norm": 2.2177683069308047,
"learning_rate": 7.989826387009634e-06,
"loss": 0.4398,
"step": 1225
},
{
"epoch": 0.3679880329094989,
"grad_norm": 2.0614298881537416,
"learning_rate": 7.96885074016267e-06,
"loss": 0.438,
"step": 1230
},
{
"epoch": 0.36948391922213913,
"grad_norm": 2.063175118233129,
"learning_rate": 7.947794079278678e-06,
"loss": 0.4353,
"step": 1235
},
{
"epoch": 0.3709798055347794,
"grad_norm": 2.0902885795644943,
"learning_rate": 7.926656978952089e-06,
"loss": 0.4369,
"step": 1240
},
{
"epoch": 0.3724756918474196,
"grad_norm": 2.081819065453435,
"learning_rate": 7.905440015972372e-06,
"loss": 0.4392,
"step": 1245
},
{
"epoch": 0.3739715781600598,
"grad_norm": 1.9635390617281576,
"learning_rate": 7.884143769308276e-06,
"loss": 0.4318,
"step": 1250
},
{
"epoch": 0.3754674644727001,
"grad_norm": 2.010397135845292,
"learning_rate": 7.862768820092061e-06,
"loss": 0.4294,
"step": 1255
},
{
"epoch": 0.3769633507853403,
"grad_norm": 2.120029095014225,
"learning_rate": 7.84131575160361e-06,
"loss": 0.4367,
"step": 1260
},
{
"epoch": 0.37845923709798057,
"grad_norm": 2.047223712557703,
"learning_rate": 7.819785149254534e-06,
"loss": 0.4247,
"step": 1265
},
{
"epoch": 0.37995512341062077,
"grad_norm": 2.1565665198769546,
"learning_rate": 7.798177600572184e-06,
"loss": 0.4545,
"step": 1270
},
{
"epoch": 0.381451009723261,
"grad_norm": 1.9698630282226646,
"learning_rate": 7.776493695183623e-06,
"loss": 0.4327,
"step": 1275
},
{
"epoch": 0.38294689603590126,
"grad_norm": 2.027501209185265,
"learning_rate": 7.754734024799544e-06,
"loss": 0.4378,
"step": 1280
},
{
"epoch": 0.3844427823485415,
"grad_norm": 1.9336783003915325,
"learning_rate": 7.732899183198108e-06,
"loss": 0.4199,
"step": 1285
},
{
"epoch": 0.38593866866118176,
"grad_norm": 2.074909881667748,
"learning_rate": 7.710989766208751e-06,
"loss": 0.431,
"step": 1290
},
{
"epoch": 0.387434554973822,
"grad_norm": 2.08466673344805,
"learning_rate": 7.689006371695928e-06,
"loss": 0.436,
"step": 1295
},
{
"epoch": 0.3889304412864622,
"grad_norm": 2.0101045976441334,
"learning_rate": 7.666949599542788e-06,
"loss": 0.4363,
"step": 1300
},
{
"epoch": 0.39042632759910245,
"grad_norm": 2.1388630620219304,
"learning_rate": 7.644820051634813e-06,
"loss": 0.4353,
"step": 1305
},
{
"epoch": 0.3919222139117427,
"grad_norm": 1.9897181694789714,
"learning_rate": 7.62261833184339e-06,
"loss": 0.4321,
"step": 1310
},
{
"epoch": 0.39341810022438295,
"grad_norm": 2.069750404086554,
"learning_rate": 7.60034504600933e-06,
"loss": 0.4166,
"step": 1315
},
{
"epoch": 0.3949139865370232,
"grad_norm": 2.0828214162126564,
"learning_rate": 7.5780008019263465e-06,
"loss": 0.4309,
"step": 1320
},
{
"epoch": 0.39640987284966345,
"grad_norm": 2.1311064881304183,
"learning_rate": 7.555586209324455e-06,
"loss": 0.42,
"step": 1325
},
{
"epoch": 0.39790575916230364,
"grad_norm": 2.0067032988225715,
"learning_rate": 7.533101879853348e-06,
"loss": 0.4247,
"step": 1330
},
{
"epoch": 0.3994016454749439,
"grad_norm": 2.1601395941384514,
"learning_rate": 7.510548427065693e-06,
"loss": 0.4103,
"step": 1335
},
{
"epoch": 0.40089753178758414,
"grad_norm": 2.0545268261654166,
"learning_rate": 7.487926466400403e-06,
"loss": 0.418,
"step": 1340
},
{
"epoch": 0.4023934181002244,
"grad_norm": 2.029856636678106,
"learning_rate": 7.465236615165826e-06,
"loss": 0.4265,
"step": 1345
},
{
"epoch": 0.40388930441286464,
"grad_norm": 1.9396811090214083,
"learning_rate": 7.4424794925229175e-06,
"loss": 0.4241,
"step": 1350
},
{
"epoch": 0.4053851907255049,
"grad_norm": 2.073788987162284,
"learning_rate": 7.4196557194683265e-06,
"loss": 0.4039,
"step": 1355
},
{
"epoch": 0.4068810770381451,
"grad_norm": 2.070263015501858,
"learning_rate": 7.3967659188174676e-06,
"loss": 0.4331,
"step": 1360
},
{
"epoch": 0.4083769633507853,
"grad_norm": 1.957024406881209,
"learning_rate": 7.373810715187516e-06,
"loss": 0.4198,
"step": 1365
},
{
"epoch": 0.4098728496634256,
"grad_norm": 2.0021094595131705,
"learning_rate": 7.350790734980359e-06,
"loss": 0.4138,
"step": 1370
},
{
"epoch": 0.4113687359760658,
"grad_norm": 2.038893591791927,
"learning_rate": 7.327706606365512e-06,
"loss": 0.4099,
"step": 1375
},
{
"epoch": 0.4128646222887061,
"grad_norm": 2.091182328954734,
"learning_rate": 7.304558959262973e-06,
"loss": 0.4091,
"step": 1380
},
{
"epoch": 0.4143605086013463,
"grad_norm": 2.005484469630839,
"learning_rate": 7.281348425326034e-06,
"loss": 0.4071,
"step": 1385
},
{
"epoch": 0.4158563949139865,
"grad_norm": 2.000171729890043,
"learning_rate": 7.258075637924039e-06,
"loss": 0.4077,
"step": 1390
},
{
"epoch": 0.41735228122662676,
"grad_norm": 1.88335343776708,
"learning_rate": 7.234741232125111e-06,
"loss": 0.4106,
"step": 1395
},
{
"epoch": 0.418848167539267,
"grad_norm": 2.041697368575073,
"learning_rate": 7.211345844678816e-06,
"loss": 0.4124,
"step": 1400
},
{
"epoch": 0.42034405385190726,
"grad_norm": 2.1120074891606313,
"learning_rate": 7.1878901139987826e-06,
"loss": 0.414,
"step": 1405
},
{
"epoch": 0.4218399401645475,
"grad_norm": 2.017409414749495,
"learning_rate": 7.164374680145293e-06,
"loss": 0.4038,
"step": 1410
},
{
"epoch": 0.42333582647718776,
"grad_norm": 2.0432465019716144,
"learning_rate": 7.140800184807805e-06,
"loss": 0.4073,
"step": 1415
},
{
"epoch": 0.42483171278982795,
"grad_norm": 2.060077990063716,
"learning_rate": 7.117167271287453e-06,
"loss": 0.4068,
"step": 1420
},
{
"epoch": 0.4263275991024682,
"grad_norm": 2.027592571205212,
"learning_rate": 7.09347658447948e-06,
"loss": 0.4042,
"step": 1425
},
{
"epoch": 0.42782348541510845,
"grad_norm": 2.040823863949173,
"learning_rate": 7.069728770855652e-06,
"loss": 0.4034,
"step": 1430
},
{
"epoch": 0.4293193717277487,
"grad_norm": 2.1465152715010722,
"learning_rate": 7.0459244784466115e-06,
"loss": 0.4049,
"step": 1435
},
{
"epoch": 0.43081525804038895,
"grad_norm": 2.017024929241199,
"learning_rate": 7.022064356824196e-06,
"loss": 0.4051,
"step": 1440
},
{
"epoch": 0.4323111443530292,
"grad_norm": 1.9756966229288817,
"learning_rate": 6.998149057083711e-06,
"loss": 0.3991,
"step": 1445
},
{
"epoch": 0.4338070306656694,
"grad_norm": 1.9869718270881975,
"learning_rate": 6.9741792318261585e-06,
"loss": 0.4029,
"step": 1450
},
{
"epoch": 0.43530291697830964,
"grad_norm": 2.1015981628011136,
"learning_rate": 6.950155535140439e-06,
"loss": 0.3998,
"step": 1455
},
{
"epoch": 0.4367988032909499,
"grad_norm": 2.1512869214406174,
"learning_rate": 6.926078622585496e-06,
"loss": 0.4001,
"step": 1460
},
{
"epoch": 0.43829468960359014,
"grad_norm": 2.0152270376530677,
"learning_rate": 6.901949151172427e-06,
"loss": 0.4047,
"step": 1465
},
{
"epoch": 0.4397905759162304,
"grad_norm": 2.11665136839116,
"learning_rate": 6.877767779346556e-06,
"loss": 0.4064,
"step": 1470
},
{
"epoch": 0.4412864622288706,
"grad_norm": 2.231208727114714,
"learning_rate": 6.8535351669694694e-06,
"loss": 0.3884,
"step": 1475
},
{
"epoch": 0.44278234854151083,
"grad_norm": 1.9444993004804072,
"learning_rate": 6.829251975301003e-06,
"loss": 0.3949,
"step": 1480
},
{
"epoch": 0.4442782348541511,
"grad_norm": 1.98272069907838,
"learning_rate": 6.8049188669812024e-06,
"loss": 0.395,
"step": 1485
},
{
"epoch": 0.4457741211667913,
"grad_norm": 1.9120999593676538,
"learning_rate": 6.7805365060122386e-06,
"loss": 0.3968,
"step": 1490
},
{
"epoch": 0.4472700074794316,
"grad_norm": 2.0053365034386186,
"learning_rate": 6.756105557740289e-06,
"loss": 0.402,
"step": 1495
},
{
"epoch": 0.4487658937920718,
"grad_norm": 1.9514629474872618,
"learning_rate": 6.731626688837387e-06,
"loss": 0.3836,
"step": 1500
},
{
"epoch": 0.450261780104712,
"grad_norm": 2.1087506038221955,
"learning_rate": 6.707100567283217e-06,
"loss": 0.3843,
"step": 1505
},
{
"epoch": 0.45175766641735227,
"grad_norm": 2.1300871436189306,
"learning_rate": 6.682527862346898e-06,
"loss": 0.3996,
"step": 1510
},
{
"epoch": 0.4532535527299925,
"grad_norm": 1.9854434493239195,
"learning_rate": 6.657909244568721e-06,
"loss": 0.4011,
"step": 1515
},
{
"epoch": 0.45474943904263276,
"grad_norm": 1.9814246083045182,
"learning_rate": 6.6332453857418375e-06,
"loss": 0.4012,
"step": 1520
},
{
"epoch": 0.456245325355273,
"grad_norm": 2.023928605650618,
"learning_rate": 6.608536958893948e-06,
"loss": 0.3962,
"step": 1525
},
{
"epoch": 0.45774121166791326,
"grad_norm": 2.012248063709598,
"learning_rate": 6.583784638268919e-06,
"loss": 0.4001,
"step": 1530
},
{
"epoch": 0.45923709798055345,
"grad_norm": 1.9208473033828253,
"learning_rate": 6.5589890993083934e-06,
"loss": 0.3965,
"step": 1535
},
{
"epoch": 0.4607329842931937,
"grad_norm": 2.1713469007968476,
"learning_rate": 6.534151018633355e-06,
"loss": 0.3962,
"step": 1540
},
{
"epoch": 0.46222887060583395,
"grad_norm": 2.079357026566145,
"learning_rate": 6.509271074025668e-06,
"loss": 0.3913,
"step": 1545
},
{
"epoch": 0.4637247569184742,
"grad_norm": 1.9619607602752462,
"learning_rate": 6.484349944409579e-06,
"loss": 0.3885,
"step": 1550
},
{
"epoch": 0.46522064323111445,
"grad_norm": 2.025383663450042,
"learning_rate": 6.459388309833193e-06,
"loss": 0.396,
"step": 1555
},
{
"epoch": 0.4667165295437547,
"grad_norm": 1.9926140480691588,
"learning_rate": 6.434386851449914e-06,
"loss": 0.3978,
"step": 1560
},
{
"epoch": 0.4682124158563949,
"grad_norm": 2.0001783057698677,
"learning_rate": 6.409346251499859e-06,
"loss": 0.3889,
"step": 1565
},
{
"epoch": 0.46970830216903514,
"grad_norm": 1.9977538629610117,
"learning_rate": 6.384267193291238e-06,
"loss": 0.3872,
"step": 1570
},
{
"epoch": 0.4712041884816754,
"grad_norm": 1.9725560974868908,
"learning_rate": 6.3591503611817155e-06,
"loss": 0.39,
"step": 1575
},
{
"epoch": 0.47270007479431564,
"grad_norm": 1.9326640130746877,
"learning_rate": 6.333996440559726e-06,
"loss": 0.3815,
"step": 1580
},
{
"epoch": 0.4741959611069559,
"grad_norm": 1.9055402813860574,
"learning_rate": 6.308806117825777e-06,
"loss": 0.3801,
"step": 1585
},
{
"epoch": 0.47569184741959614,
"grad_norm": 1.9722002752461958,
"learning_rate": 6.283580080373721e-06,
"loss": 0.3804,
"step": 1590
},
{
"epoch": 0.47718773373223633,
"grad_norm": 2.0082373206027526,
"learning_rate": 6.25831901657199e-06,
"loss": 0.3775,
"step": 1595
},
{
"epoch": 0.4786836200448766,
"grad_norm": 1.9266846313881612,
"learning_rate": 6.233023615744813e-06,
"loss": 0.3883,
"step": 1600
},
{
"epoch": 0.4801795063575168,
"grad_norm": 2.088263839026747,
"learning_rate": 6.207694568153418e-06,
"loss": 0.389,
"step": 1605
},
{
"epoch": 0.4816753926701571,
"grad_norm": 2.1042065332247555,
"learning_rate": 6.182332564977174e-06,
"loss": 0.3792,
"step": 1610
},
{
"epoch": 0.4831712789827973,
"grad_norm": 1.9720059814432505,
"learning_rate": 6.156938298294752e-06,
"loss": 0.3706,
"step": 1615
},
{
"epoch": 0.4846671652954376,
"grad_norm": 1.9180314469419848,
"learning_rate": 6.131512461065227e-06,
"loss": 0.377,
"step": 1620
},
{
"epoch": 0.48616305160807777,
"grad_norm": 2.2323059057893775,
"learning_rate": 6.106055747109169e-06,
"loss": 0.3737,
"step": 1625
},
{
"epoch": 0.487658937920718,
"grad_norm": 1.9094100696871863,
"learning_rate": 6.080568851089717e-06,
"loss": 0.381,
"step": 1630
},
{
"epoch": 0.48915482423335827,
"grad_norm": 1.8740312923707445,
"learning_rate": 6.055052468493614e-06,
"loss": 0.3712,
"step": 1635
},
{
"epoch": 0.4906507105459985,
"grad_norm": 2.2274282125289364,
"learning_rate": 6.029507295612235e-06,
"loss": 0.3818,
"step": 1640
},
{
"epoch": 0.49214659685863876,
"grad_norm": 1.9574498076717952,
"learning_rate": 6.0039340295225845e-06,
"loss": 0.3808,
"step": 1645
},
{
"epoch": 0.49364248317127896,
"grad_norm": 2.019106731639845,
"learning_rate": 5.978333368068278e-06,
"loss": 0.3739,
"step": 1650
},
{
"epoch": 0.4951383694839192,
"grad_norm": 2.003448813298111,
"learning_rate": 5.952706009840491e-06,
"loss": 0.3801,
"step": 1655
},
{
"epoch": 0.49663425579655945,
"grad_norm": 2.033251099732251,
"learning_rate": 5.9270526541589025e-06,
"loss": 0.3719,
"step": 1660
},
{
"epoch": 0.4981301421091997,
"grad_norm": 1.9736607384350244,
"learning_rate": 5.901374001052614e-06,
"loss": 0.3647,
"step": 1665
},
{
"epoch": 0.49962602842183995,
"grad_norm": 1.8450885851841383,
"learning_rate": 5.875670751241036e-06,
"loss": 0.3694,
"step": 1670
},
{
"epoch": 0.5011219147344802,
"grad_norm": 1.9550125676086019,
"learning_rate": 5.849943606114782e-06,
"loss": 0.3765,
"step": 1675
},
{
"epoch": 0.5026178010471204,
"grad_norm": 1.9088487115244133,
"learning_rate": 5.824193267716517e-06,
"loss": 0.3628,
"step": 1680
},
{
"epoch": 0.5041136873597607,
"grad_norm": 2.0236760384942887,
"learning_rate": 5.798420438721804e-06,
"loss": 0.3681,
"step": 1685
},
{
"epoch": 0.5056095736724009,
"grad_norm": 1.8739105083496626,
"learning_rate": 5.772625822419933e-06,
"loss": 0.3626,
"step": 1690
},
{
"epoch": 0.5071054599850411,
"grad_norm": 2.0538294961575048,
"learning_rate": 5.74681012269472e-06,
"loss": 0.3664,
"step": 1695
},
{
"epoch": 0.5086013462976814,
"grad_norm": 1.9510478793415906,
"learning_rate": 5.720974044005314e-06,
"loss": 0.3687,
"step": 1700
},
{
"epoch": 0.5100972326103216,
"grad_norm": 2.0027682706640206,
"learning_rate": 5.695118291366959e-06,
"loss": 0.3791,
"step": 1705
},
{
"epoch": 0.5115931189229619,
"grad_norm": 2.017422075178467,
"learning_rate": 5.669243570331766e-06,
"loss": 0.3592,
"step": 1710
},
{
"epoch": 0.5130890052356021,
"grad_norm": 1.9298103072373924,
"learning_rate": 5.643350586969453e-06,
"loss": 0.3624,
"step": 1715
},
{
"epoch": 0.5145848915482424,
"grad_norm": 1.828229384099037,
"learning_rate": 5.617440047848081e-06,
"loss": 0.3693,
"step": 1720
},
{
"epoch": 0.5160807778608826,
"grad_norm": 1.9835871613164413,
"learning_rate": 5.591512660014773e-06,
"loss": 0.367,
"step": 1725
},
{
"epoch": 0.5175766641735228,
"grad_norm": 1.906111097537283,
"learning_rate": 5.5655691309764225e-06,
"loss": 0.3698,
"step": 1730
},
{
"epoch": 0.5190725504861631,
"grad_norm": 1.8980070836105973,
"learning_rate": 5.539610168680381e-06,
"loss": 0.3617,
"step": 1735
},
{
"epoch": 0.5205684367988033,
"grad_norm": 1.929980721348062,
"learning_rate": 5.513636481495143e-06,
"loss": 0.3603,
"step": 1740
},
{
"epoch": 0.5220643231114436,
"grad_norm": 1.91015401663393,
"learning_rate": 5.487648778191021e-06,
"loss": 0.3533,
"step": 1745
},
{
"epoch": 0.5235602094240838,
"grad_norm": 1.9455506909545779,
"learning_rate": 5.4616477679207906e-06,
"loss": 0.3746,
"step": 1750
},
{
"epoch": 0.525056095736724,
"grad_norm": 1.8552115044332138,
"learning_rate": 5.435634160200355e-06,
"loss": 0.3583,
"step": 1755
},
{
"epoch": 0.5265519820493643,
"grad_norm": 1.913776238110964,
"learning_rate": 5.409608664889376e-06,
"loss": 0.3571,
"step": 1760
},
{
"epoch": 0.5280478683620045,
"grad_norm": 1.9566204864416041,
"learning_rate": 5.383571992171904e-06,
"loss": 0.3681,
"step": 1765
},
{
"epoch": 0.5295437546746448,
"grad_norm": 2.0484694098984813,
"learning_rate": 5.357524852536996e-06,
"loss": 0.3579,
"step": 1770
},
{
"epoch": 0.531039640987285,
"grad_norm": 1.9124761975111415,
"learning_rate": 5.331467956759331e-06,
"loss": 0.3508,
"step": 1775
},
{
"epoch": 0.5325355272999253,
"grad_norm": 1.9151628917936323,
"learning_rate": 5.305402015879817e-06,
"loss": 0.3582,
"step": 1780
},
{
"epoch": 0.5340314136125655,
"grad_norm": 1.8760817819604374,
"learning_rate": 5.279327741186179e-06,
"loss": 0.3607,
"step": 1785
},
{
"epoch": 0.5355272999252056,
"grad_norm": 1.961131431192389,
"learning_rate": 5.253245844193564e-06,
"loss": 0.3517,
"step": 1790
},
{
"epoch": 0.537023186237846,
"grad_norm": 1.971571895204417,
"learning_rate": 5.227157036625108e-06,
"loss": 0.3456,
"step": 1795
},
{
"epoch": 0.5385190725504861,
"grad_norm": 1.8838335367241383,
"learning_rate": 5.2010620303925275e-06,
"loss": 0.3519,
"step": 1800
},
{
"epoch": 0.5400149588631264,
"grad_norm": 1.829377568323147,
"learning_rate": 5.174961537576685e-06,
"loss": 0.3564,
"step": 1805
},
{
"epoch": 0.5415108451757666,
"grad_norm": 1.8522486080816014,
"learning_rate": 5.148856270408163e-06,
"loss": 0.3568,
"step": 1810
},
{
"epoch": 0.5430067314884068,
"grad_norm": 1.938579795945218,
"learning_rate": 5.122746941247828e-06,
"loss": 0.3607,
"step": 1815
},
{
"epoch": 0.5445026178010471,
"grad_norm": 1.8962553032833915,
"learning_rate": 5.096634262567388e-06,
"loss": 0.3578,
"step": 1820
},
{
"epoch": 0.5459985041136873,
"grad_norm": 1.7953489371783111,
"learning_rate": 5.070518946929954e-06,
"loss": 0.3495,
"step": 1825
},
{
"epoch": 0.5474943904263276,
"grad_norm": 1.9518780090135102,
"learning_rate": 5.044401706970592e-06,
"loss": 0.3558,
"step": 1830
},
{
"epoch": 0.5489902767389678,
"grad_norm": 2.029523910683152,
"learning_rate": 5.018283255376882e-06,
"loss": 0.3505,
"step": 1835
},
{
"epoch": 0.550486163051608,
"grad_norm": 1.9831397143651377,
"learning_rate": 4.992164304869464e-06,
"loss": 0.3569,
"step": 1840
},
{
"epoch": 0.5519820493642483,
"grad_norm": 2.1279272506945075,
"learning_rate": 4.966045568182596e-06,
"loss": 0.3372,
"step": 1845
},
{
"epoch": 0.5534779356768885,
"grad_norm": 1.9637293854690605,
"learning_rate": 4.939927758044698e-06,
"loss": 0.3518,
"step": 1850
},
{
"epoch": 0.5549738219895288,
"grad_norm": 2.0828701986556695,
"learning_rate": 4.913811587158908e-06,
"loss": 0.3443,
"step": 1855
},
{
"epoch": 0.556469708302169,
"grad_norm": 1.981630887644782,
"learning_rate": 4.887697768183633e-06,
"loss": 0.3444,
"step": 1860
},
{
"epoch": 0.5579655946148093,
"grad_norm": 1.8743980292802156,
"learning_rate": 4.861587013713096e-06,
"loss": 0.346,
"step": 1865
},
{
"epoch": 0.5594614809274495,
"grad_norm": 1.9064350126377236,
"learning_rate": 4.835480036257904e-06,
"loss": 0.3467,
"step": 1870
},
{
"epoch": 0.5609573672400897,
"grad_norm": 1.8972548935569284,
"learning_rate": 4.809377548225589e-06,
"loss": 0.3388,
"step": 1875
},
{
"epoch": 0.56245325355273,
"grad_norm": 1.8677668781712522,
"learning_rate": 4.783280261901179e-06,
"loss": 0.3442,
"step": 1880
},
{
"epoch": 0.5639491398653702,
"grad_norm": 1.900487755648876,
"learning_rate": 4.757188889427761e-06,
"loss": 0.3389,
"step": 1885
},
{
"epoch": 0.5654450261780105,
"grad_norm": 1.9420787190248043,
"learning_rate": 4.731104142787035e-06,
"loss": 0.3472,
"step": 1890
},
{
"epoch": 0.5669409124906507,
"grad_norm": 1.927314753260138,
"learning_rate": 4.7050267337799074e-06,
"loss": 0.3481,
"step": 1895
},
{
"epoch": 0.5684367988032909,
"grad_norm": 1.9207634340998982,
"learning_rate": 4.678957374007046e-06,
"loss": 0.3424,
"step": 1900
},
{
"epoch": 0.5699326851159312,
"grad_norm": 1.9212470848002643,
"learning_rate": 4.652896774849477e-06,
"loss": 0.3358,
"step": 1905
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.924739770896096,
"learning_rate": 4.626845647449161e-06,
"loss": 0.3353,
"step": 1910
},
{
"epoch": 0.5729244577412117,
"grad_norm": 1.9350839334038696,
"learning_rate": 4.600804702689598e-06,
"loss": 0.3348,
"step": 1915
},
{
"epoch": 0.5744203440538519,
"grad_norm": 1.8695042520523082,
"learning_rate": 4.57477465117642e-06,
"loss": 0.338,
"step": 1920
},
{
"epoch": 0.5759162303664922,
"grad_norm": 1.9312558535320394,
"learning_rate": 4.54875620321801e-06,
"loss": 0.343,
"step": 1925
},
{
"epoch": 0.5774121166791324,
"grad_norm": 1.8821605226871228,
"learning_rate": 4.522750068806107e-06,
"loss": 0.3407,
"step": 1930
},
{
"epoch": 0.5789080029917726,
"grad_norm": 2.0267756257950906,
"learning_rate": 4.496756957596438e-06,
"loss": 0.332,
"step": 1935
},
{
"epoch": 0.5804038893044129,
"grad_norm": 1.87047157167879,
"learning_rate": 4.4707775788893586e-06,
"loss": 0.3377,
"step": 1940
},
{
"epoch": 0.5818997756170531,
"grad_norm": 1.8334500325846965,
"learning_rate": 4.444812641610482e-06,
"loss": 0.3331,
"step": 1945
},
{
"epoch": 0.5833956619296934,
"grad_norm": 1.865940227461524,
"learning_rate": 4.418862854291356e-06,
"loss": 0.3336,
"step": 1950
},
{
"epoch": 0.5848915482423336,
"grad_norm": 1.8290658280068524,
"learning_rate": 4.392928925050106e-06,
"loss": 0.3237,
"step": 1955
},
{
"epoch": 0.5863874345549738,
"grad_norm": 1.8622254708709993,
"learning_rate": 4.3670115615721265e-06,
"loss": 0.3376,
"step": 1960
},
{
"epoch": 0.5878833208676141,
"grad_norm": 1.9201843032013242,
"learning_rate": 4.341111471090762e-06,
"loss": 0.3459,
"step": 1965
},
{
"epoch": 0.5893792071802543,
"grad_norm": 1.8796680590731187,
"learning_rate": 4.315229360368014e-06,
"loss": 0.3278,
"step": 1970
},
{
"epoch": 0.5908750934928946,
"grad_norm": 1.8016030738978284,
"learning_rate": 4.289365935675255e-06,
"loss": 0.3268,
"step": 1975
},
{
"epoch": 0.5923709798055348,
"grad_norm": 1.9190193806693643,
"learning_rate": 4.263521902773944e-06,
"loss": 0.3333,
"step": 1980
},
{
"epoch": 0.5938668661181751,
"grad_norm": 1.8784476290504393,
"learning_rate": 4.237697966896385e-06,
"loss": 0.3271,
"step": 1985
},
{
"epoch": 0.5953627524308153,
"grad_norm": 1.8712250384764961,
"learning_rate": 4.211894832726471e-06,
"loss": 0.3342,
"step": 1990
},
{
"epoch": 0.5968586387434555,
"grad_norm": 1.9036575376553382,
"learning_rate": 4.1861132043804555e-06,
"loss": 0.335,
"step": 1995
},
{
"epoch": 0.5983545250560958,
"grad_norm": 1.9581740636617746,
"learning_rate": 4.160353785387746e-06,
"loss": 0.324,
"step": 2000
},
{
"epoch": 0.599850411368736,
"grad_norm": 1.834158258904465,
"learning_rate": 4.134617278671694e-06,
"loss": 0.3278,
"step": 2005
},
{
"epoch": 0.6013462976813763,
"grad_norm": 1.864212034584157,
"learning_rate": 4.108904386530429e-06,
"loss": 0.3293,
"step": 2010
},
{
"epoch": 0.6028421839940165,
"grad_norm": 1.7424366923402765,
"learning_rate": 4.083215810617678e-06,
"loss": 0.327,
"step": 2015
},
{
"epoch": 0.6043380703066566,
"grad_norm": 1.8772680059539715,
"learning_rate": 4.057552251923633e-06,
"loss": 0.3327,
"step": 2020
},
{
"epoch": 0.605833956619297,
"grad_norm": 1.8850386701103279,
"learning_rate": 4.031914410755809e-06,
"loss": 0.327,
"step": 2025
},
{
"epoch": 0.6073298429319371,
"grad_norm": 1.8735991544459796,
"learning_rate": 4.0063029867199455e-06,
"loss": 0.3278,
"step": 2030
},
{
"epoch": 0.6088257292445775,
"grad_norm": 2.0742483586745952,
"learning_rate": 3.980718678700909e-06,
"loss": 0.3295,
"step": 2035
},
{
"epoch": 0.6103216155572176,
"grad_norm": 1.805997806919521,
"learning_rate": 3.955162184843625e-06,
"loss": 0.318,
"step": 2040
},
{
"epoch": 0.6118175018698578,
"grad_norm": 1.9482369327485018,
"learning_rate": 3.929634202534026e-06,
"loss": 0.3303,
"step": 2045
},
{
"epoch": 0.6133133881824981,
"grad_norm": 1.8643741778263954,
"learning_rate": 3.904135428380019e-06,
"loss": 0.3221,
"step": 2050
},
{
"epoch": 0.6148092744951383,
"grad_norm": 1.9119914679721755,
"learning_rate": 3.8786665581924805e-06,
"loss": 0.3259,
"step": 2055
},
{
"epoch": 0.6163051608077786,
"grad_norm": 2.0294178588740808,
"learning_rate": 3.853228286966265e-06,
"loss": 0.3114,
"step": 2060
},
{
"epoch": 0.6178010471204188,
"grad_norm": 1.8598282314437558,
"learning_rate": 3.827821308861244e-06,
"loss": 0.3242,
"step": 2065
},
{
"epoch": 0.6192969334330591,
"grad_norm": 1.9818198802388973,
"learning_rate": 3.8024463171833636e-06,
"loss": 0.3252,
"step": 2070
},
{
"epoch": 0.6207928197456993,
"grad_norm": 1.9439228162479631,
"learning_rate": 3.777104004365721e-06,
"loss": 0.3258,
"step": 2075
},
{
"epoch": 0.6222887060583395,
"grad_norm": 1.9406393323579751,
"learning_rate": 3.7517950619496713e-06,
"loss": 0.327,
"step": 2080
},
{
"epoch": 0.6237845923709798,
"grad_norm": 1.8702196116833902,
"learning_rate": 3.7265201805659618e-06,
"loss": 0.3274,
"step": 2085
},
{
"epoch": 0.62528047868362,
"grad_norm": 1.8206045884367064,
"learning_rate": 3.701280049915877e-06,
"loss": 0.3087,
"step": 2090
},
{
"epoch": 0.6267763649962603,
"grad_norm": 1.8946084974764223,
"learning_rate": 3.676075358752426e-06,
"loss": 0.3227,
"step": 2095
},
{
"epoch": 0.6282722513089005,
"grad_norm": 1.8425099562092453,
"learning_rate": 3.6509067948615464e-06,
"loss": 0.3091,
"step": 2100
},
{
"epoch": 0.6297681376215407,
"grad_norm": 1.833988306261615,
"learning_rate": 3.6257750450433284e-06,
"loss": 0.3158,
"step": 2105
},
{
"epoch": 0.631264023934181,
"grad_norm": 1.848102418513888,
"learning_rate": 3.6006807950932867e-06,
"loss": 0.3231,
"step": 2110
},
{
"epoch": 0.6327599102468212,
"grad_norm": 1.8597295350064236,
"learning_rate": 3.575624729783632e-06,
"loss": 0.317,
"step": 2115
},
{
"epoch": 0.6342557965594615,
"grad_norm": 1.8571116178437028,
"learning_rate": 3.550607532844596e-06,
"loss": 0.3185,
"step": 2120
},
{
"epoch": 0.6357516828721017,
"grad_norm": 1.850039717310936,
"learning_rate": 3.5256298869457715e-06,
"loss": 0.3153,
"step": 2125
},
{
"epoch": 0.637247569184742,
"grad_norm": 1.8517187441330423,
"learning_rate": 3.5006924736774793e-06,
"loss": 0.3231,
"step": 2130
},
{
"epoch": 0.6387434554973822,
"grad_norm": 1.886804887794377,
"learning_rate": 3.47579597353217e-06,
"loss": 0.3132,
"step": 2135
},
{
"epoch": 0.6402393418100224,
"grad_norm": 1.8207891498106763,
"learning_rate": 3.4509410658858606e-06,
"loss": 0.3239,
"step": 2140
},
{
"epoch": 0.6417352281226627,
"grad_norm": 1.9080722925799685,
"learning_rate": 3.426128428979589e-06,
"loss": 0.3127,
"step": 2145
},
{
"epoch": 0.6432311144353029,
"grad_norm": 1.7978167092374475,
"learning_rate": 3.4013587399009073e-06,
"loss": 0.3112,
"step": 2150
},
{
"epoch": 0.6447270007479432,
"grad_norm": 1.8462499082396047,
"learning_rate": 3.376632674565411e-06,
"loss": 0.3168,
"step": 2155
},
{
"epoch": 0.6462228870605834,
"grad_norm": 1.856553229309688,
"learning_rate": 3.351950907698285e-06,
"loss": 0.3065,
"step": 2160
},
{
"epoch": 0.6477187733732236,
"grad_norm": 1.7800004213781706,
"learning_rate": 3.3273141128159005e-06,
"loss": 0.3132,
"step": 2165
},
{
"epoch": 0.6492146596858639,
"grad_norm": 1.9132965188669029,
"learning_rate": 3.3027229622074335e-06,
"loss": 0.3179,
"step": 2170
},
{
"epoch": 0.6507105459985041,
"grad_norm": 1.7650226022206836,
"learning_rate": 3.278178126916515e-06,
"loss": 0.3137,
"step": 2175
},
{
"epoch": 0.6522064323111444,
"grad_norm": 1.951509417973989,
"learning_rate": 3.2536802767229243e-06,
"loss": 0.3084,
"step": 2180
},
{
"epoch": 0.6537023186237846,
"grad_norm": 1.772116366162939,
"learning_rate": 3.2292300801243133e-06,
"loss": 0.3102,
"step": 2185
},
{
"epoch": 0.6551982049364248,
"grad_norm": 1.8140401176421401,
"learning_rate": 3.20482820431796e-06,
"loss": 0.3056,
"step": 2190
},
{
"epoch": 0.6566940912490651,
"grad_norm": 1.8243620700136636,
"learning_rate": 3.180475315182563e-06,
"loss": 0.3033,
"step": 2195
},
{
"epoch": 0.6581899775617053,
"grad_norm": 1.8380166168759837,
"learning_rate": 3.1561720772600736e-06,
"loss": 0.304,
"step": 2200
},
{
"epoch": 0.6596858638743456,
"grad_norm": 1.8336050039462124,
"learning_rate": 3.1319191537375577e-06,
"loss": 0.3143,
"step": 2205
},
{
"epoch": 0.6611817501869858,
"grad_norm": 1.8667890213032734,
"learning_rate": 3.107717206429105e-06,
"loss": 0.3031,
"step": 2210
},
{
"epoch": 0.6626776364996261,
"grad_norm": 1.7638159112909835,
"learning_rate": 3.0835668957577636e-06,
"loss": 0.3013,
"step": 2215
},
{
"epoch": 0.6641735228122663,
"grad_norm": 1.900781665691589,
"learning_rate": 3.059468880737519e-06,
"loss": 0.3073,
"step": 2220
},
{
"epoch": 0.6656694091249065,
"grad_norm": 1.943524014415726,
"learning_rate": 3.035423818955316e-06,
"loss": 0.3087,
"step": 2225
},
{
"epoch": 0.6671652954375468,
"grad_norm": 1.736021065342517,
"learning_rate": 3.0114323665531066e-06,
"loss": 0.2979,
"step": 2230
},
{
"epoch": 0.668661181750187,
"grad_norm": 1.746010053168365,
"learning_rate": 2.987495178209951e-06,
"loss": 0.307,
"step": 2235
},
{
"epoch": 0.6701570680628273,
"grad_norm": 1.8018064213578624,
"learning_rate": 2.9636129071241515e-06,
"loss": 0.3126,
"step": 2240
},
{
"epoch": 0.6716529543754675,
"grad_norm": 1.8077932770071266,
"learning_rate": 2.9397862049954307e-06,
"loss": 0.3004,
"step": 2245
},
{
"epoch": 0.6731488406881077,
"grad_norm": 1.7048569088891747,
"learning_rate": 2.916015722007137e-06,
"loss": 0.3066,
"step": 2250
},
{
"epoch": 0.674644727000748,
"grad_norm": 1.7988871113907166,
"learning_rate": 2.892302106808519e-06,
"loss": 0.3052,
"step": 2255
},
{
"epoch": 0.6761406133133882,
"grad_norm": 1.8715481375394143,
"learning_rate": 2.8686460064970078e-06,
"loss": 0.3085,
"step": 2260
},
{
"epoch": 0.6776364996260285,
"grad_norm": 1.8258948545382783,
"learning_rate": 2.8450480666005743e-06,
"loss": 0.3023,
"step": 2265
},
{
"epoch": 0.6791323859386686,
"grad_norm": 1.7183769572814935,
"learning_rate": 2.821508931060104e-06,
"loss": 0.3169,
"step": 2270
},
{
"epoch": 0.680628272251309,
"grad_norm": 1.8087144140013556,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.3,
"step": 2275
},
{
"epoch": 0.6821241585639491,
"grad_norm": 1.9721800720444596,
"learning_rate": 2.7746096407698004e-06,
"loss": 0.3029,
"step": 2280
},
{
"epoch": 0.6836200448765893,
"grad_norm": 1.8344419672931702,
"learning_rate": 2.7512507658083996e-06,
"loss": 0.2996,
"step": 2285
},
{
"epoch": 0.6851159311892296,
"grad_norm": 1.757267551606752,
"learning_rate": 2.7279532547449083e-06,
"loss": 0.3033,
"step": 2290
},
{
"epoch": 0.6866118175018698,
"grad_norm": 1.8575093423008022,
"learning_rate": 2.704717743322104e-06,
"loss": 0.2873,
"step": 2295
},
{
"epoch": 0.6881077038145101,
"grad_norm": 1.761502547654336,
"learning_rate": 2.681544865590926e-06,
"loss": 0.2999,
"step": 2300
},
{
"epoch": 0.6896035901271503,
"grad_norm": 1.958074773552565,
"learning_rate": 2.6584352538931523e-06,
"loss": 0.3023,
"step": 2305
},
{
"epoch": 0.6910994764397905,
"grad_norm": 1.7604780827427178,
"learning_rate": 2.635389538844166e-06,
"loss": 0.2923,
"step": 2310
},
{
"epoch": 0.6925953627524308,
"grad_norm": 1.860509876291064,
"learning_rate": 2.612408349315734e-06,
"loss": 0.2968,
"step": 2315
},
{
"epoch": 0.694091249065071,
"grad_norm": 1.8116830542415268,
"learning_rate": 2.5894923124188498e-06,
"loss": 0.2911,
"step": 2320
},
{
"epoch": 0.6955871353777113,
"grad_norm": 1.816773761816662,
"learning_rate": 2.5666420534866256e-06,
"loss": 0.3017,
"step": 2325
},
{
"epoch": 0.6970830216903515,
"grad_norm": 1.810456487051493,
"learning_rate": 2.543858196057214e-06,
"loss": 0.3045,
"step": 2330
},
{
"epoch": 0.6985789080029918,
"grad_norm": 1.8462477832363797,
"learning_rate": 2.5211413618568114e-06,
"loss": 0.2979,
"step": 2335
},
{
"epoch": 0.700074794315632,
"grad_norm": 1.749680469906487,
"learning_rate": 2.4984921707826805e-06,
"loss": 0.298,
"step": 2340
},
{
"epoch": 0.7015706806282722,
"grad_norm": 1.7715319612256217,
"learning_rate": 2.4759112408862366e-06,
"loss": 0.2905,
"step": 2345
},
{
"epoch": 0.7030665669409125,
"grad_norm": 1.9011349884243633,
"learning_rate": 2.4533991883561868e-06,
"loss": 0.2938,
"step": 2350
},
{
"epoch": 0.7045624532535527,
"grad_norm": 1.7509668722553002,
"learning_rate": 2.4309566275017027e-06,
"loss": 0.2931,
"step": 2355
},
{
"epoch": 0.706058339566193,
"grad_norm": 1.7463279622870067,
"learning_rate": 2.4085841707356787e-06,
"loss": 0.2948,
"step": 2360
},
{
"epoch": 0.7075542258788332,
"grad_norm": 1.7457958614044327,
"learning_rate": 2.386282428558001e-06,
"loss": 0.2935,
"step": 2365
},
{
"epoch": 0.7090501121914734,
"grad_norm": 1.8306487338719184,
"learning_rate": 2.364052009538892e-06,
"loss": 0.3029,
"step": 2370
},
{
"epoch": 0.7105459985041137,
"grad_norm": 1.8902782477754185,
"learning_rate": 2.341893520302313e-06,
"loss": 0.2937,
"step": 2375
},
{
"epoch": 0.7120418848167539,
"grad_norm": 1.7948687484011157,
"learning_rate": 2.3198075655094023e-06,
"loss": 0.2925,
"step": 2380
},
{
"epoch": 0.7135377711293942,
"grad_norm": 1.8682547497864384,
"learning_rate": 2.297794747841976e-06,
"loss": 0.2992,
"step": 2385
},
{
"epoch": 0.7150336574420344,
"grad_norm": 1.7985072864408282,
"learning_rate": 2.275855667986086e-06,
"loss": 0.2992,
"step": 2390
},
{
"epoch": 0.7165295437546746,
"grad_norm": 1.6780824098442955,
"learning_rate": 2.2539909246156257e-06,
"loss": 0.2902,
"step": 2395
},
{
"epoch": 0.7180254300673149,
"grad_norm": 1.9327685022447323,
"learning_rate": 2.232201114375988e-06,
"loss": 0.2879,
"step": 2400
},
{
"epoch": 0.7195213163799551,
"grad_norm": 1.8312593750432005,
"learning_rate": 2.2104868318677963e-06,
"loss": 0.2967,
"step": 2405
},
{
"epoch": 0.7210172026925954,
"grad_norm": 1.8041698028281294,
"learning_rate": 2.1888486696306706e-06,
"loss": 0.2849,
"step": 2410
},
{
"epoch": 0.7225130890052356,
"grad_norm": 1.8021876820178402,
"learning_rate": 2.1672872181270575e-06,
"loss": 0.2918,
"step": 2415
},
{
"epoch": 0.7240089753178759,
"grad_norm": 1.807836863115144,
"learning_rate": 2.1458030657261235e-06,
"loss": 0.282,
"step": 2420
},
{
"epoch": 0.7255048616305161,
"grad_norm": 1.7515999717106407,
"learning_rate": 2.1243967986876933e-06,
"loss": 0.2922,
"step": 2425
},
{
"epoch": 0.7270007479431563,
"grad_norm": 1.8149872804694056,
"learning_rate": 2.1030690011462567e-06,
"loss": 0.2912,
"step": 2430
},
{
"epoch": 0.7284966342557966,
"grad_norm": 1.7878582875336215,
"learning_rate": 2.081820255095028e-06,
"loss": 0.2886,
"step": 2435
},
{
"epoch": 0.7299925205684368,
"grad_norm": 1.7664930533873893,
"learning_rate": 2.0606511403700575e-06,
"loss": 0.2964,
"step": 2440
},
{
"epoch": 0.7314884068810771,
"grad_norm": 1.7856577814800616,
"learning_rate": 2.0395622346344213e-06,
"loss": 0.2849,
"step": 2445
},
{
"epoch": 0.7329842931937173,
"grad_norm": 1.7620387064486105,
"learning_rate": 2.018554113362449e-06,
"loss": 0.2811,
"step": 2450
},
{
"epoch": 0.7344801795063575,
"grad_norm": 1.746148787119175,
"learning_rate": 1.9976273498240234e-06,
"loss": 0.2866,
"step": 2455
},
{
"epoch": 0.7359760658189978,
"grad_norm": 1.759195000248038,
"learning_rate": 1.976782515068938e-06,
"loss": 0.294,
"step": 2460
},
{
"epoch": 0.737471952131638,
"grad_norm": 1.6081462651916374,
"learning_rate": 1.9560201779113056e-06,
"loss": 0.2821,
"step": 2465
},
{
"epoch": 0.7389678384442783,
"grad_norm": 1.8127282683936143,
"learning_rate": 1.9353409049140515e-06,
"loss": 0.2827,
"step": 2470
},
{
"epoch": 0.7404637247569185,
"grad_norm": 1.7928349569557254,
"learning_rate": 1.9147452603734402e-06,
"loss": 0.2889,
"step": 2475
},
{
"epoch": 0.7419596110695588,
"grad_norm": 1.7519180416889486,
"learning_rate": 1.894233806303689e-06,
"loss": 0.2816,
"step": 2480
},
{
"epoch": 0.743455497382199,
"grad_norm": 1.792648064853805,
"learning_rate": 1.8738071024216141e-06,
"loss": 0.2843,
"step": 2485
},
{
"epoch": 0.7449513836948392,
"grad_norm": 1.7815734013272622,
"learning_rate": 1.8534657061313744e-06,
"loss": 0.2742,
"step": 2490
},
{
"epoch": 0.7464472700074795,
"grad_norm": 1.825180595387709,
"learning_rate": 1.8332101725092522e-06,
"loss": 0.2816,
"step": 2495
},
{
"epoch": 0.7479431563201197,
"grad_norm": 1.8420097876440362,
"learning_rate": 1.8130410542885084e-06,
"loss": 0.2808,
"step": 2500
},
{
"epoch": 0.74943904263276,
"grad_norm": 1.8442353488656769,
"learning_rate": 1.7929589018443016e-06,
"loss": 0.2923,
"step": 2505
},
{
"epoch": 0.7509349289454001,
"grad_norm": 1.876793012170064,
"learning_rate": 1.7729642631786613e-06,
"loss": 0.2872,
"step": 2510
},
{
"epoch": 0.7524308152580403,
"grad_norm": 1.7511287142130798,
"learning_rate": 1.7530576839055453e-06,
"loss": 0.2822,
"step": 2515
},
{
"epoch": 0.7539267015706806,
"grad_norm": 1.8394555324866848,
"learning_rate": 1.7332397072359435e-06,
"loss": 0.2765,
"step": 2520
},
{
"epoch": 0.7554225878833208,
"grad_norm": 1.773080627419537,
"learning_rate": 1.7135108739630573e-06,
"loss": 0.2772,
"step": 2525
},
{
"epoch": 0.7569184741959611,
"grad_norm": 1.7397840701003071,
"learning_rate": 1.693871722447542e-06,
"loss": 0.2748,
"step": 2530
},
{
"epoch": 0.7584143605086013,
"grad_norm": 1.8139047134561623,
"learning_rate": 1.6743227886028152e-06,
"loss": 0.2809,
"step": 2535
},
{
"epoch": 0.7599102468212415,
"grad_norm": 1.723146398169513,
"learning_rate": 1.6548646058804347e-06,
"loss": 0.277,
"step": 2540
},
{
"epoch": 0.7614061331338818,
"grad_norm": 1.755509982892445,
"learning_rate": 1.6354977052555393e-06,
"loss": 0.2845,
"step": 2545
},
{
"epoch": 0.762902019446522,
"grad_norm": 1.7634745348399379,
"learning_rate": 1.6162226152123633e-06,
"loss": 0.2845,
"step": 2550
},
{
"epoch": 0.7643979057591623,
"grad_norm": 1.8539062432851583,
"learning_rate": 1.5970398617298078e-06,
"loss": 0.2828,
"step": 2555
},
{
"epoch": 0.7658937920718025,
"grad_norm": 1.8053358835812254,
"learning_rate": 1.5779499682670963e-06,
"loss": 0.2774,
"step": 2560
},
{
"epoch": 0.7673896783844428,
"grad_norm": 1.8014531312640616,
"learning_rate": 1.5589534557494868e-06,
"loss": 0.2841,
"step": 2565
},
{
"epoch": 0.768885564697083,
"grad_norm": 1.735571527942806,
"learning_rate": 1.5400508425540562e-06,
"loss": 0.2746,
"step": 2570
},
{
"epoch": 0.7703814510097232,
"grad_norm": 1.8540824858023373,
"learning_rate": 1.5212426444955569e-06,
"loss": 0.2807,
"step": 2575
},
{
"epoch": 0.7718773373223635,
"grad_norm": 1.7139393419525597,
"learning_rate": 1.5025293748123354e-06,
"loss": 0.2815,
"step": 2580
},
{
"epoch": 0.7733732236350037,
"grad_norm": 1.6431033212935895,
"learning_rate": 1.4839115441523355e-06,
"loss": 0.2696,
"step": 2585
},
{
"epoch": 0.774869109947644,
"grad_norm": 1.7227778483828726,
"learning_rate": 1.4653896605591584e-06,
"loss": 0.2732,
"step": 2590
},
{
"epoch": 0.7763649962602842,
"grad_norm": 1.7527519060060008,
"learning_rate": 1.4469642294582048e-06,
"loss": 0.2748,
"step": 2595
},
{
"epoch": 0.7778608825729244,
"grad_norm": 1.6997524796558416,
"learning_rate": 1.4286357536428696e-06,
"loss": 0.2729,
"step": 2600
},
{
"epoch": 0.7793567688855647,
"grad_norm": 1.7807204337692575,
"learning_rate": 1.4104047332608379e-06,
"loss": 0.2755,
"step": 2605
},
{
"epoch": 0.7808526551982049,
"grad_norm": 1.7182846099936764,
"learning_rate": 1.392271665800427e-06,
"loss": 0.2777,
"step": 2610
},
{
"epoch": 0.7823485415108452,
"grad_norm": 1.7302301084436003,
"learning_rate": 1.3742370460770144e-06,
"loss": 0.2762,
"step": 2615
},
{
"epoch": 0.7838444278234854,
"grad_norm": 1.711106037244554,
"learning_rate": 1.3563013662195356e-06,
"loss": 0.2737,
"step": 2620
},
{
"epoch": 0.7853403141361257,
"grad_norm": 1.8191358842574659,
"learning_rate": 1.3384651156570483e-06,
"loss": 0.2732,
"step": 2625
},
{
"epoch": 0.7868362004487659,
"grad_norm": 1.751260410944088,
"learning_rate": 1.3207287811053893e-06,
"loss": 0.2771,
"step": 2630
},
{
"epoch": 0.7883320867614061,
"grad_norm": 1.7320253510102213,
"learning_rate": 1.3030928465538822e-06,
"loss": 0.27,
"step": 2635
},
{
"epoch": 0.7898279730740464,
"grad_norm": 1.7406452518990843,
"learning_rate": 1.2855577932521352e-06,
"loss": 0.2703,
"step": 2640
},
{
"epoch": 0.7913238593866866,
"grad_norm": 1.8538751789457641,
"learning_rate": 1.2681240996969085e-06,
"loss": 0.2776,
"step": 2645
},
{
"epoch": 0.7928197456993269,
"grad_norm": 1.740887599672242,
"learning_rate": 1.250792241619051e-06,
"loss": 0.2736,
"step": 2650
},
{
"epoch": 0.7943156320119671,
"grad_norm": 1.8281991178787242,
"learning_rate": 1.233562691970533e-06,
"loss": 0.2749,
"step": 2655
},
{
"epoch": 0.7958115183246073,
"grad_norm": 1.6556477939621426,
"learning_rate": 1.2164359209115235e-06,
"loss": 0.2776,
"step": 2660
},
{
"epoch": 0.7973074046372476,
"grad_norm": 1.695787778492541,
"learning_rate": 1.1994123957975722e-06,
"loss": 0.2702,
"step": 2665
},
{
"epoch": 0.7988032909498878,
"grad_norm": 1.7707776645975837,
"learning_rate": 1.1824925811668485e-06,
"loss": 0.2627,
"step": 2670
},
{
"epoch": 0.8002991772625281,
"grad_norm": 1.8300425136047838,
"learning_rate": 1.1656769387274714e-06,
"loss": 0.2688,
"step": 2675
},
{
"epoch": 0.8017950635751683,
"grad_norm": 1.6906589157556278,
"learning_rate": 1.1489659273449073e-06,
"loss": 0.2672,
"step": 2680
},
{
"epoch": 0.8032909498878086,
"grad_norm": 1.7718115103968484,
"learning_rate": 1.132360003029449e-06,
"loss": 0.2673,
"step": 2685
},
{
"epoch": 0.8047868362004488,
"grad_norm": 1.7597119643475179,
"learning_rate": 1.115859618923773e-06,
"loss": 0.2744,
"step": 2690
},
{
"epoch": 0.806282722513089,
"grad_norm": 1.7801333538259148,
"learning_rate": 1.0994652252905695e-06,
"loss": 0.2662,
"step": 2695
},
{
"epoch": 0.8077786088257293,
"grad_norm": 1.6866429011639965,
"learning_rate": 1.083177269500264e-06,
"loss": 0.2675,
"step": 2700
},
{
"epoch": 0.8092744951383695,
"grad_norm": 1.9195992948000482,
"learning_rate": 1.0669961960188008e-06,
"loss": 0.2739,
"step": 2705
},
{
"epoch": 0.8107703814510098,
"grad_norm": 1.8220041781840073,
"learning_rate": 1.0509224463955249e-06,
"loss": 0.2604,
"step": 2710
},
{
"epoch": 0.81226626776365,
"grad_norm": 1.7303540258737908,
"learning_rate": 1.0349564592511162e-06,
"loss": 0.2743,
"step": 2715
},
{
"epoch": 0.8137621540762902,
"grad_norm": 1.6406056857804932,
"learning_rate": 1.0190986702656403e-06,
"loss": 0.2719,
"step": 2720
},
{
"epoch": 0.8152580403889305,
"grad_norm": 1.8590839739169418,
"learning_rate": 1.0033495121666442e-06,
"loss": 0.273,
"step": 2725
},
{
"epoch": 0.8167539267015707,
"grad_norm": 1.7341252368355093,
"learning_rate": 9.877094147173566e-07,
"loss": 0.2712,
"step": 2730
},
{
"epoch": 0.818249813014211,
"grad_norm": 1.7272695337289556,
"learning_rate": 9.721788047049586e-07,
"loss": 0.2628,
"step": 2735
},
{
"epoch": 0.8197456993268512,
"grad_norm": 1.7050895419647492,
"learning_rate": 9.567581059289322e-07,
"loss": 0.2678,
"step": 2740
},
{
"epoch": 0.8212415856394913,
"grad_norm": 1.7258978187627068,
"learning_rate": 9.414477391895044e-07,
"loss": 0.2715,
"step": 2745
},
{
"epoch": 0.8227374719521316,
"grad_norm": 1.8460755537922702,
"learning_rate": 9.262481222761588e-07,
"loss": 0.2716,
"step": 2750
},
{
"epoch": 0.8242333582647718,
"grad_norm": 1.7677837124955216,
"learning_rate": 9.11159669956237e-07,
"loss": 0.2725,
"step": 2755
},
{
"epoch": 0.8257292445774121,
"grad_norm": 1.7183389424616196,
"learning_rate": 8.961827939636198e-07,
"loss": 0.2683,
"step": 2760
},
{
"epoch": 0.8272251308900523,
"grad_norm": 1.8851170229714924,
"learning_rate": 8.813179029874874e-07,
"loss": 0.2588,
"step": 2765
},
{
"epoch": 0.8287210172026926,
"grad_norm": 1.8163919089444864,
"learning_rate": 8.665654026611797e-07,
"loss": 0.2631,
"step": 2770
},
{
"epoch": 0.8302169035153328,
"grad_norm": 1.7098860990754234,
"learning_rate": 8.51925695551113e-07,
"loss": 0.2679,
"step": 2775
},
{
"epoch": 0.831712789827973,
"grad_norm": 1.7663056355635183,
"learning_rate": 8.373991811458027e-07,
"loss": 0.2652,
"step": 2780
},
{
"epoch": 0.8332086761406133,
"grad_norm": 1.7186868648976898,
"learning_rate": 8.229862558449592e-07,
"loss": 0.2661,
"step": 2785
},
{
"epoch": 0.8347045624532535,
"grad_norm": 1.8059879215165224,
"learning_rate": 8.086873129486722e-07,
"loss": 0.2686,
"step": 2790
},
{
"epoch": 0.8362004487658938,
"grad_norm": 1.7374284001547664,
"learning_rate": 7.945027426466801e-07,
"loss": 0.2708,
"step": 2795
},
{
"epoch": 0.837696335078534,
"grad_norm": 1.6598096486422094,
"learning_rate": 7.804329320077181e-07,
"loss": 0.2653,
"step": 2800
},
{
"epoch": 0.8391922213911742,
"grad_norm": 1.676734657625906,
"learning_rate": 7.664782649689611e-07,
"loss": 0.2563,
"step": 2805
},
{
"epoch": 0.8406881077038145,
"grad_norm": 1.7941246676620155,
"learning_rate": 7.526391223255386e-07,
"loss": 0.2643,
"step": 2810
},
{
"epoch": 0.8421839940164547,
"grad_norm": 1.7441327844730907,
"learning_rate": 7.389158817201541e-07,
"loss": 0.2663,
"step": 2815
},
{
"epoch": 0.843679880329095,
"grad_norm": 1.6764728143369185,
"learning_rate": 7.253089176327738e-07,
"loss": 0.2631,
"step": 2820
},
{
"epoch": 0.8451757666417352,
"grad_norm": 1.7090343355435693,
"learning_rate": 7.118186013704065e-07,
"loss": 0.2579,
"step": 2825
},
{
"epoch": 0.8466716529543755,
"grad_norm": 1.723034589615204,
"learning_rate": 6.984453010569758e-07,
"loss": 0.2718,
"step": 2830
},
{
"epoch": 0.8481675392670157,
"grad_norm": 1.7083769223090157,
"learning_rate": 6.851893816232729e-07,
"loss": 0.259,
"step": 2835
},
{
"epoch": 0.8496634255796559,
"grad_norm": 1.6983173618906942,
"learning_rate": 6.720512047969957e-07,
"loss": 0.2655,
"step": 2840
},
{
"epoch": 0.8511593118922962,
"grad_norm": 1.6008652695866359,
"learning_rate": 6.590311290928825e-07,
"loss": 0.2661,
"step": 2845
},
{
"epoch": 0.8526551982049364,
"grad_norm": 1.723592329316595,
"learning_rate": 6.461295098029269e-07,
"loss": 0.2548,
"step": 2850
},
{
"epoch": 0.8541510845175767,
"grad_norm": 1.8054575785485054,
"learning_rate": 6.333466989866787e-07,
"loss": 0.264,
"step": 2855
},
{
"epoch": 0.8556469708302169,
"grad_norm": 1.7902077125134892,
"learning_rate": 6.206830454616447e-07,
"loss": 0.266,
"step": 2860
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.7147769185915753,
"learning_rate": 6.08138894793765e-07,
"loss": 0.2654,
"step": 2865
},
{
"epoch": 0.8586387434554974,
"grad_norm": 1.7518112730752275,
"learning_rate": 5.957145892879829e-07,
"loss": 0.2594,
"step": 2870
},
{
"epoch": 0.8601346297681376,
"grad_norm": 1.811592287193994,
"learning_rate": 5.834104679789077e-07,
"loss": 0.2647,
"step": 2875
},
{
"epoch": 0.8616305160807779,
"grad_norm": 1.773326433422328,
"learning_rate": 5.712268666215559e-07,
"loss": 0.264,
"step": 2880
},
{
"epoch": 0.8631264023934181,
"grad_norm": 1.68178039725722,
"learning_rate": 5.591641176822005e-07,
"loss": 0.2614,
"step": 2885
},
{
"epoch": 0.8646222887060584,
"grad_norm": 1.6842479909967625,
"learning_rate": 5.472225503292883e-07,
"loss": 0.2621,
"step": 2890
},
{
"epoch": 0.8661181750186986,
"grad_norm": 1.7449782410599817,
"learning_rate": 5.354024904244632e-07,
"loss": 0.2522,
"step": 2895
},
{
"epoch": 0.8676140613313388,
"grad_norm": 1.7259602780620398,
"learning_rate": 5.237042605136689e-07,
"loss": 0.2614,
"step": 2900
},
{
"epoch": 0.8691099476439791,
"grad_norm": 1.688101500268341,
"learning_rate": 5.121281798183547e-07,
"loss": 0.2611,
"step": 2905
},
{
"epoch": 0.8706058339566193,
"grad_norm": 1.7726586716734274,
"learning_rate": 5.00674564226758e-07,
"loss": 0.2544,
"step": 2910
},
{
"epoch": 0.8721017202692596,
"grad_norm": 1.6935216955087868,
"learning_rate": 4.893437262852885e-07,
"loss": 0.2523,
"step": 2915
},
{
"epoch": 0.8735976065818998,
"grad_norm": 1.878804856678552,
"learning_rate": 4.781359751899984e-07,
"loss": 0.2538,
"step": 2920
},
{
"epoch": 0.87509349289454,
"grad_norm": 1.64770700770445,
"learning_rate": 4.6705161677814024e-07,
"loss": 0.2569,
"step": 2925
},
{
"epoch": 0.8765893792071803,
"grad_norm": 1.860024134107886,
"learning_rate": 4.560909535198299e-07,
"loss": 0.2576,
"step": 2930
},
{
"epoch": 0.8780852655198205,
"grad_norm": 1.675929796569693,
"learning_rate": 4.4525428450978627e-07,
"loss": 0.2539,
"step": 2935
},
{
"epoch": 0.8795811518324608,
"grad_norm": 1.6649509488101208,
"learning_rate": 4.3454190545917317e-07,
"loss": 0.2654,
"step": 2940
},
{
"epoch": 0.881077038145101,
"grad_norm": 1.7894352860083609,
"learning_rate": 4.239541086875265e-07,
"loss": 0.2647,
"step": 2945
},
{
"epoch": 0.8825729244577412,
"grad_norm": 1.688537215035147,
"learning_rate": 4.134911831147798e-07,
"loss": 0.2563,
"step": 2950
},
{
"epoch": 0.8840688107703815,
"grad_norm": 1.7422422459372517,
"learning_rate": 4.031534142533816e-07,
"loss": 0.2517,
"step": 2955
},
{
"epoch": 0.8855646970830217,
"grad_norm": 1.832964243427611,
"learning_rate": 3.9294108420049935e-07,
"loss": 0.2664,
"step": 2960
},
{
"epoch": 0.887060583395662,
"grad_norm": 1.7932779810454953,
"learning_rate": 3.828544716303284e-07,
"loss": 0.2543,
"step": 2965
},
{
"epoch": 0.8885564697083022,
"grad_norm": 1.8073243004592312,
"learning_rate": 3.728938517864794e-07,
"loss": 0.2601,
"step": 2970
},
{
"epoch": 0.8900523560209425,
"grad_norm": 1.714561248097055,
"learning_rate": 3.6305949647447545e-07,
"loss": 0.2564,
"step": 2975
},
{
"epoch": 0.8915482423335827,
"grad_norm": 1.647575871046988,
"learning_rate": 3.5335167405433024e-07,
"loss": 0.2607,
"step": 2980
},
{
"epoch": 0.8930441286462228,
"grad_norm": 1.6977984176077578,
"learning_rate": 3.437706494332266e-07,
"loss": 0.2522,
"step": 2985
},
{
"epoch": 0.8945400149588631,
"grad_norm": 1.7141499596339997,
"learning_rate": 3.3431668405828675e-07,
"loss": 0.2558,
"step": 2990
},
{
"epoch": 0.8960359012715033,
"grad_norm": 1.6494105719449952,
"learning_rate": 3.249900359094388e-07,
"loss": 0.256,
"step": 2995
},
{
"epoch": 0.8975317875841436,
"grad_norm": 1.6630293618544516,
"learning_rate": 3.1579095949237584e-07,
"loss": 0.2508,
"step": 3000
},
{
"epoch": 0.8990276738967838,
"grad_norm": 1.7346655505039537,
"learning_rate": 3.067197058316157e-07,
"loss": 0.2614,
"step": 3005
},
{
"epoch": 0.900523560209424,
"grad_norm": 1.7107296935219805,
"learning_rate": 2.9777652246364306e-07,
"loss": 0.2538,
"step": 3010
},
{
"epoch": 0.9020194465220643,
"grad_norm": 1.6491436991741326,
"learning_rate": 2.889616534301598e-07,
"loss": 0.2521,
"step": 3015
},
{
"epoch": 0.9035153328347045,
"grad_norm": 1.7323747022001885,
"learning_rate": 2.8027533927142525e-07,
"loss": 0.2593,
"step": 3020
},
{
"epoch": 0.9050112191473448,
"grad_norm": 1.7534706658955106,
"learning_rate": 2.717178170196916e-07,
"loss": 0.249,
"step": 3025
},
{
"epoch": 0.906507105459985,
"grad_norm": 1.7068949519667596,
"learning_rate": 2.6328932019273556e-07,
"loss": 0.2625,
"step": 3030
},
{
"epoch": 0.9080029917726253,
"grad_norm": 1.7466561136363379,
"learning_rate": 2.549900787874876e-07,
"loss": 0.2572,
"step": 3035
},
{
"epoch": 0.9094988780852655,
"grad_norm": 1.6487218463492848,
"learning_rate": 2.468203192737512e-07,
"loss": 0.2618,
"step": 3040
},
{
"epoch": 0.9109947643979057,
"grad_norm": 1.5699982289102938,
"learning_rate": 2.3878026458803047e-07,
"loss": 0.2559,
"step": 3045
},
{
"epoch": 0.912490650710546,
"grad_norm": 1.678827851691801,
"learning_rate": 2.3087013412743998e-07,
"loss": 0.2504,
"step": 3050
},
{
"epoch": 0.9139865370231862,
"grad_norm": 1.732344143690627,
"learning_rate": 2.2309014374372106e-07,
"loss": 0.2556,
"step": 3055
},
{
"epoch": 0.9154824233358265,
"grad_norm": 1.6563146141875156,
"learning_rate": 2.1544050573735153e-07,
"loss": 0.2555,
"step": 3060
},
{
"epoch": 0.9169783096484667,
"grad_norm": 1.8096543479163172,
"learning_rate": 2.079214288517506e-07,
"loss": 0.2553,
"step": 3065
},
{
"epoch": 0.9184741959611069,
"grad_norm": 1.690741820888644,
"learning_rate": 2.0053311826758458e-07,
"loss": 0.256,
"step": 3070
},
{
"epoch": 0.9199700822737472,
"grad_norm": 1.7615351195511213,
"learning_rate": 1.9327577559716815e-07,
"loss": 0.2562,
"step": 3075
},
{
"epoch": 0.9214659685863874,
"grad_norm": 1.8449608271118088,
"learning_rate": 1.8614959887896078e-07,
"loss": 0.2549,
"step": 3080
},
{
"epoch": 0.9229618548990277,
"grad_norm": 1.7775694545753302,
"learning_rate": 1.79154782572164e-07,
"loss": 0.247,
"step": 3085
},
{
"epoch": 0.9244577412116679,
"grad_norm": 1.6956013013917148,
"learning_rate": 1.7229151755141394e-07,
"loss": 0.2571,
"step": 3090
},
{
"epoch": 0.9259536275243081,
"grad_norm": 1.6373082200013647,
"learning_rate": 1.655599911015754e-07,
"loss": 0.2547,
"step": 3095
},
{
"epoch": 0.9274495138369484,
"grad_norm": 1.7078162984487721,
"learning_rate": 1.5896038691262772e-07,
"loss": 0.2592,
"step": 3100
},
{
"epoch": 0.9289454001495886,
"grad_norm": 1.6367425145666301,
"learning_rate": 1.52492885074656e-07,
"loss": 0.2561,
"step": 3105
},
{
"epoch": 0.9304412864622289,
"grad_norm": 1.5872236691558035,
"learning_rate": 1.4615766207293157e-07,
"loss": 0.2518,
"step": 3110
},
{
"epoch": 0.9319371727748691,
"grad_norm": 1.714675291765629,
"learning_rate": 1.3995489078310055e-07,
"loss": 0.2633,
"step": 3115
},
{
"epoch": 0.9334330590875094,
"grad_norm": 1.619406307330865,
"learning_rate": 1.338847404664667e-07,
"loss": 0.2548,
"step": 3120
},
{
"epoch": 0.9349289454001496,
"grad_norm": 1.5539413237386495,
"learning_rate": 1.2794737676536993e-07,
"loss": 0.2527,
"step": 3125
},
{
"epoch": 0.9364248317127898,
"grad_norm": 1.7495161399355714,
"learning_rate": 1.2214296169866578e-07,
"loss": 0.2515,
"step": 3130
},
{
"epoch": 0.9379207180254301,
"grad_norm": 1.641652885536429,
"learning_rate": 1.164716536573074e-07,
"loss": 0.2501,
"step": 3135
},
{
"epoch": 0.9394166043380703,
"grad_norm": 1.753141085715687,
"learning_rate": 1.1093360740002057e-07,
"loss": 0.2515,
"step": 3140
},
{
"epoch": 0.9409124906507106,
"grad_norm": 1.7530034719134988,
"learning_rate": 1.0552897404908391e-07,
"loss": 0.2559,
"step": 3145
},
{
"epoch": 0.9424083769633508,
"grad_norm": 1.5804220071987112,
"learning_rate": 1.0025790108620092e-07,
"loss": 0.2483,
"step": 3150
},
{
"epoch": 0.943904263275991,
"grad_norm": 1.822783043661551,
"learning_rate": 9.512053234847774e-08,
"loss": 0.258,
"step": 3155
},
{
"epoch": 0.9454001495886313,
"grad_norm": 1.7776638404626903,
"learning_rate": 9.01170080244984e-08,
"loss": 0.2463,
"step": 3160
},
{
"epoch": 0.9468960359012715,
"grad_norm": 1.7244463932526486,
"learning_rate": 8.52474646504986e-08,
"loss": 0.2506,
"step": 3165
},
{
"epoch": 0.9483919222139118,
"grad_norm": 1.7184065717867174,
"learning_rate": 8.05120351066413e-08,
"loss": 0.2605,
"step": 3170
},
{
"epoch": 0.949887808526552,
"grad_norm": 1.7978606844090408,
"learning_rate": 7.591084861338749e-08,
"loss": 0.2503,
"step": 3175
},
{
"epoch": 0.9513836948391923,
"grad_norm": 1.6764242072572402,
"learning_rate": 7.144403072797346e-08,
"loss": 0.2523,
"step": 3180
},
{
"epoch": 0.9528795811518325,
"grad_norm": 1.6752659598734612,
"learning_rate": 6.711170334098294e-08,
"loss": 0.2566,
"step": 3185
},
{
"epoch": 0.9543754674644727,
"grad_norm": 1.7696006414428376,
"learning_rate": 6.291398467302146e-08,
"loss": 0.2579,
"step": 3190
},
{
"epoch": 0.955871353777113,
"grad_norm": 1.6541063626129755,
"learning_rate": 5.885098927148947e-08,
"loss": 0.2505,
"step": 3195
},
{
"epoch": 0.9573672400897532,
"grad_norm": 1.791951476550907,
"learning_rate": 5.492282800745707e-08,
"loss": 0.252,
"step": 3200
},
{
"epoch": 0.9588631264023935,
"grad_norm": 1.6998940151846391,
"learning_rate": 5.112960807263978e-08,
"loss": 0.2602,
"step": 3205
},
{
"epoch": 0.9603590127150337,
"grad_norm": 1.739892053817991,
"learning_rate": 4.7471432976471944e-08,
"loss": 0.264,
"step": 3210
},
{
"epoch": 0.9618548990276738,
"grad_norm": 1.5429992279839573,
"learning_rate": 4.3948402543282366e-08,
"loss": 0.2543,
"step": 3215
},
{
"epoch": 0.9633507853403142,
"grad_norm": 1.772813294024904,
"learning_rate": 4.056061290956981e-08,
"loss": 0.2524,
"step": 3220
},
{
"epoch": 0.9648466716529543,
"grad_norm": 1.5751929247313246,
"learning_rate": 3.730815652138231e-08,
"loss": 0.2525,
"step": 3225
},
{
"epoch": 0.9663425579655947,
"grad_norm": 1.7360588122089868,
"learning_rate": 3.4191122131790324e-08,
"loss": 0.255,
"step": 3230
},
{
"epoch": 0.9678384442782348,
"grad_norm": 1.7743122424766984,
"learning_rate": 3.120959479846919e-08,
"loss": 0.2584,
"step": 3235
},
{
"epoch": 0.9693343305908751,
"grad_norm": 1.574467195657007,
"learning_rate": 2.8363655881374906e-08,
"loss": 0.2558,
"step": 3240
},
{
"epoch": 0.9708302169035153,
"grad_norm": 1.80180199036063,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.2568,
"step": 3245
},
{
"epoch": 0.9723261032161555,
"grad_norm": 1.6886241273143858,
"learning_rate": 2.3078850233878015e-08,
"loss": 0.2466,
"step": 3250
},
{
"epoch": 0.9738219895287958,
"grad_norm": 1.7815633691396229,
"learning_rate": 2.064012771532009e-08,
"loss": 0.2536,
"step": 3255
},
{
"epoch": 0.975317875841436,
"grad_norm": 1.6956109134529065,
"learning_rate": 1.83372820327421e-08,
"loss": 0.2592,
"step": 3260
},
{
"epoch": 0.9768137621540763,
"grad_norm": 1.7584432907260417,
"learning_rate": 1.6170376026226065e-08,
"loss": 0.2647,
"step": 3265
},
{
"epoch": 0.9783096484667165,
"grad_norm": 1.687889717075937,
"learning_rate": 1.4139468826331327e-08,
"loss": 0.2529,
"step": 3270
},
{
"epoch": 0.9798055347793567,
"grad_norm": 1.6973842345080912,
"learning_rate": 1.2244615852479158e-08,
"loss": 0.2586,
"step": 3275
},
{
"epoch": 0.981301421091997,
"grad_norm": 1.7860998582475756,
"learning_rate": 1.0485868811441757e-08,
"loss": 0.2596,
"step": 3280
},
{
"epoch": 0.9827973074046372,
"grad_norm": 1.7444036807918029,
"learning_rate": 8.86327569593115e-09,
"loss": 0.253,
"step": 3285
},
{
"epoch": 0.9842931937172775,
"grad_norm": 1.7876798673093501,
"learning_rate": 7.376880783289131e-09,
"loss": 0.2551,
"step": 3290
},
{
"epoch": 0.9857890800299177,
"grad_norm": 1.66622892909602,
"learning_rate": 6.026724634279335e-09,
"loss": 0.2557,
"step": 3295
},
{
"epoch": 0.9872849663425579,
"grad_norm": 1.7386422804846284,
"learning_rate": 4.8128440919792405e-09,
"loss": 0.253,
"step": 3300
},
{
"epoch": 0.9887808526551982,
"grad_norm": 1.5376218727713236,
"learning_rate": 3.73527228077708e-09,
"loss": 0.2501,
"step": 3305
},
{
"epoch": 0.9902767389678384,
"grad_norm": 1.7638583930274379,
"learning_rate": 2.7940386054664537e-09,
"loss": 0.262,
"step": 3310
},
{
"epoch": 0.9917726252804787,
"grad_norm": 1.9162749151140541,
"learning_rate": 1.9891687504436373e-09,
"loss": 0.2575,
"step": 3315
},
{
"epoch": 0.9932685115931189,
"grad_norm": 1.880864088528354,
"learning_rate": 1.320684679008144e-09,
"loss": 0.2602,
"step": 3320
},
{
"epoch": 0.9947643979057592,
"grad_norm": 1.7803280986620529,
"learning_rate": 7.886046327609809e-10,
"loss": 0.2543,
"step": 3325
},
{
"epoch": 0.9962602842183994,
"grad_norm": 1.5859942056150071,
"learning_rate": 3.929431311094911e-10,
"loss": 0.2563,
"step": 3330
},
{
"epoch": 0.9977561705310396,
"grad_norm": 1.6644206261602157,
"learning_rate": 1.337109708704487e-10,
"loss": 0.2515,
"step": 3335
},
{
"epoch": 0.9992520568436799,
"grad_norm": 1.6728854762026557,
"learning_rate": 1.091522597362893e-11,
"loss": 0.2602,
"step": 3340
},
{
"epoch": 0.999850411368736,
"eval_loss": 0.25460532307624817,
"eval_runtime": 342.3221,
"eval_samples_per_second": 3.152,
"eval_steps_per_second": 0.789,
"step": 3342
},
{
"epoch": 0.999850411368736,
"step": 3342,
"total_flos": 699694464368640.0,
"train_loss": 0.43199071664427796,
"train_runtime": 75290.8899,
"train_samples_per_second": 1.421,
"train_steps_per_second": 0.044
}
],
"logging_steps": 5,
"max_steps": 3342,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 699694464368640.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}