{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1626,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024622960911049553,
"grad_norm": 1.5390625,
"learning_rate": 2.040816326530612e-10,
"loss": 1.3865270614624023,
"step": 2
},
{
"epoch": 0.0049245921822099106,
"grad_norm": 4.375,
"learning_rate": 6.122448979591837e-10,
"loss": 1.8760377168655396,
"step": 4
},
{
"epoch": 0.007386888273314866,
"grad_norm": 2.359375,
"learning_rate": 1.020408163265306e-09,
"loss": 1.1314038038253784,
"step": 6
},
{
"epoch": 0.009849184364419821,
"grad_norm": 5.71875,
"learning_rate": 1.4285714285714286e-09,
"loss": 1.8253700733184814,
"step": 8
},
{
"epoch": 0.012311480455524777,
"grad_norm": 12.625,
"learning_rate": 1.8367346938775511e-09,
"loss": 2.2051210403442383,
"step": 10
},
{
"epoch": 0.014773776546629732,
"grad_norm": 20.375,
"learning_rate": 2.2448979591836736e-09,
"loss": 2.4439101219177246,
"step": 12
},
{
"epoch": 0.017236072637734686,
"grad_norm": 3.578125,
"learning_rate": 2.653061224489796e-09,
"loss": 1.3878843784332275,
"step": 14
},
{
"epoch": 0.019698368728839642,
"grad_norm": 1.765625,
"learning_rate": 3.0612244897959187e-09,
"loss": 1.1822748184204102,
"step": 16
},
{
"epoch": 0.0221606648199446,
"grad_norm": 2.53125,
"learning_rate": 3.4693877551020408e-09,
"loss": 1.1794735193252563,
"step": 18
},
{
"epoch": 0.024622960911049555,
"grad_norm": 14.625,
"learning_rate": 3.877551020408163e-09,
"loss": 2.3212547302246094,
"step": 20
},
{
"epoch": 0.02708525700215451,
"grad_norm": 5.625,
"learning_rate": 4.285714285714286e-09,
"loss": 1.7700073719024658,
"step": 22
},
{
"epoch": 0.029547553093259463,
"grad_norm": 14.25,
"learning_rate": 4.693877551020409e-09,
"loss": 2.191647529602051,
"step": 24
},
{
"epoch": 0.03200984918436442,
"grad_norm": 4.15625,
"learning_rate": 5.102040816326531e-09,
"loss": 1.7301385402679443,
"step": 26
},
{
"epoch": 0.03447214527546937,
"grad_norm": 14.1875,
"learning_rate": 5.510204081632653e-09,
"loss": 2.343463659286499,
"step": 28
},
{
"epoch": 0.03693444136657433,
"grad_norm": 5.90625,
"learning_rate": 5.918367346938776e-09,
"loss": 1.2581849098205566,
"step": 30
},
{
"epoch": 0.039396737457679284,
"grad_norm": 5.1875,
"learning_rate": 6.326530612244899e-09,
"loss": 1.9037660360336304,
"step": 32
},
{
"epoch": 0.041859033548784244,
"grad_norm": 6.25,
"learning_rate": 6.73469387755102e-09,
"loss": 1.8926417827606201,
"step": 34
},
{
"epoch": 0.0443213296398892,
"grad_norm": 4.15625,
"learning_rate": 7.142857142857143e-09,
"loss": 1.494161605834961,
"step": 36
},
{
"epoch": 0.04678362573099415,
"grad_norm": 72.5,
"learning_rate": 7.551020408163264e-09,
"loss": 2.4310765266418457,
"step": 38
},
{
"epoch": 0.04924592182209911,
"grad_norm": 13.1875,
"learning_rate": 7.959183673469387e-09,
"loss": 2.401200294494629,
"step": 40
},
{
"epoch": 0.05170821791320406,
"grad_norm": 17.875,
"learning_rate": 8.36734693877551e-09,
"loss": 2.269543170928955,
"step": 42
},
{
"epoch": 0.05417051400430902,
"grad_norm": 6.375,
"learning_rate": 8.775510204081633e-09,
"loss": 1.880392074584961,
"step": 44
},
{
"epoch": 0.056632810095413974,
"grad_norm": 10.9375,
"learning_rate": 9.183673469387756e-09,
"loss": 2.2891359329223633,
"step": 46
},
{
"epoch": 0.05909510618651893,
"grad_norm": 2.953125,
"learning_rate": 9.591836734693877e-09,
"loss": 1.245388150215149,
"step": 48
},
{
"epoch": 0.061557402277623886,
"grad_norm": 14.0625,
"learning_rate": 1e-08,
"loss": 1.8519728183746338,
"step": 50
},
{
"epoch": 0.06401969836872884,
"grad_norm": 12.3125,
"learning_rate": 9.99996825131286e-09,
"loss": 2.678940773010254,
"step": 52
},
{
"epoch": 0.0664819944598338,
"grad_norm": 13.4375,
"learning_rate": 9.999873005755431e-09,
"loss": 2.3168435096740723,
"step": 54
},
{
"epoch": 0.06894429055093874,
"grad_norm": 23.125,
"learning_rate": 9.999714264839672e-09,
"loss": 2.218395233154297,
"step": 56
},
{
"epoch": 0.0714065866420437,
"grad_norm": 3.265625,
"learning_rate": 9.999492031085492e-09,
"loss": 1.2967658042907715,
"step": 58
},
{
"epoch": 0.07386888273314866,
"grad_norm": 8.4375,
"learning_rate": 9.999206308020707e-09,
"loss": 2.0597116947174072,
"step": 60
},
{
"epoch": 0.07633117882425362,
"grad_norm": 3.984375,
"learning_rate": 9.99885710018098e-09,
"loss": 1.6437733173370361,
"step": 62
},
{
"epoch": 0.07879347491535857,
"grad_norm": 6.9375,
"learning_rate": 9.99844441310976e-09,
"loss": 1.878865122795105,
"step": 64
},
{
"epoch": 0.08125577100646353,
"grad_norm": 5.34375,
"learning_rate": 9.997968253358178e-09,
"loss": 1.8909335136413574,
"step": 66
},
{
"epoch": 0.08371806709756849,
"grad_norm": 15.9375,
"learning_rate": 9.997428628484963e-09,
"loss": 2.290242910385132,
"step": 68
},
{
"epoch": 0.08618036318867343,
"grad_norm": 8.9375,
"learning_rate": 9.996825547056302e-09,
"loss": 2.0678482055664062,
"step": 70
},
{
"epoch": 0.0886426592797784,
"grad_norm": 5.75,
"learning_rate": 9.996159018645721e-09,
"loss": 1.8928303718566895,
"step": 72
},
{
"epoch": 0.09110495537088335,
"grad_norm": 7.53125,
"learning_rate": 9.995429053833917e-09,
"loss": 1.9023447036743164,
"step": 74
},
{
"epoch": 0.0935672514619883,
"grad_norm": 7.59375,
"learning_rate": 9.994635664208602e-09,
"loss": 1.914489507675171,
"step": 76
},
{
"epoch": 0.09602954755309326,
"grad_norm": 11.125,
"learning_rate": 9.99377886236432e-09,
"loss": 2.057431221008301,
"step": 78
},
{
"epoch": 0.09849184364419822,
"grad_norm": 7.21875,
"learning_rate": 9.992858661902233e-09,
"loss": 1.9636759757995605,
"step": 80
},
{
"epoch": 0.10095413973530316,
"grad_norm": 4.15625,
"learning_rate": 9.99187507742992e-09,
"loss": 1.298654317855835,
"step": 82
},
{
"epoch": 0.10341643582640812,
"grad_norm": 2.953125,
"learning_rate": 9.990828124561143e-09,
"loss": 1.1845377683639526,
"step": 84
},
{
"epoch": 0.10587873191751308,
"grad_norm": 12.0625,
"learning_rate": 9.989717819915584e-09,
"loss": 2.3120527267456055,
"step": 86
},
{
"epoch": 0.10834102800861804,
"grad_norm": 6.75,
"learning_rate": 9.988544181118608e-09,
"loss": 1.792182445526123,
"step": 88
},
{
"epoch": 0.11080332409972299,
"grad_norm": 4.03125,
"learning_rate": 9.987307226800957e-09,
"loss": 1.4169440269470215,
"step": 90
},
{
"epoch": 0.11326562019082795,
"grad_norm": 19.375,
"learning_rate": 9.98600697659847e-09,
"loss": 2.2629003524780273,
"step": 92
},
{
"epoch": 0.11572791628193291,
"grad_norm": 4.65625,
"learning_rate": 9.984643451151764e-09,
"loss": 1.8561232089996338,
"step": 94
},
{
"epoch": 0.11819021237303785,
"grad_norm": 5.03125,
"learning_rate": 9.98321667210591e-09,
"loss": 1.8327598571777344,
"step": 96
},
{
"epoch": 0.12065250846414281,
"grad_norm": 3.34375,
"learning_rate": 9.98172666211009e-09,
"loss": 1.2463821172714233,
"step": 98
},
{
"epoch": 0.12311480455524777,
"grad_norm": 6.375,
"learning_rate": 9.980173444817238e-09,
"loss": 1.351346731185913,
"step": 100
},
{
"epoch": 0.12557710064635272,
"grad_norm": 4.0625,
"learning_rate": 9.978557044883651e-09,
"loss": 1.2666093111038208,
"step": 102
},
{
"epoch": 0.12803939673745768,
"grad_norm": 4.78125,
"learning_rate": 9.976877487968623e-09,
"loss": 1.905246615409851,
"step": 104
},
{
"epoch": 0.13050169282856264,
"grad_norm": 3.0,
"learning_rate": 9.975134800734015e-09,
"loss": 1.1379789113998413,
"step": 106
},
{
"epoch": 0.1329639889196676,
"grad_norm": 4.65625,
"learning_rate": 9.973329010843847e-09,
"loss": 1.8731987476348877,
"step": 108
},
{
"epoch": 0.13542628501077256,
"grad_norm": 10.3125,
"learning_rate": 9.97146014696384e-09,
"loss": 1.897504448890686,
"step": 110
},
{
"epoch": 0.1378885811018775,
"grad_norm": 2.375,
"learning_rate": 9.96952823876099e-09,
"loss": 1.1055809259414673,
"step": 112
},
{
"epoch": 0.14035087719298245,
"grad_norm": 14.5625,
"learning_rate": 9.967533316903066e-09,
"loss": 2.4285759925842285,
"step": 114
},
{
"epoch": 0.1428131732840874,
"grad_norm": 6.0625,
"learning_rate": 9.965475413058142e-09,
"loss": 1.8401623964309692,
"step": 116
},
{
"epoch": 0.14527546937519237,
"grad_norm": 3.625,
"learning_rate": 9.963354559894099e-09,
"loss": 1.2698298692703247,
"step": 118
},
{
"epoch": 0.14773776546629733,
"grad_norm": 1.6875,
"learning_rate": 9.961170791078078e-09,
"loss": 1.1040065288543701,
"step": 120
},
{
"epoch": 0.1502000615574023,
"grad_norm": 16.375,
"learning_rate": 9.958924141275982e-09,
"loss": 1.8983745574951172,
"step": 122
},
{
"epoch": 0.15266235764850725,
"grad_norm": 5.125,
"learning_rate": 9.956614646151903e-09,
"loss": 1.9957232475280762,
"step": 124
},
{
"epoch": 0.15512465373961218,
"grad_norm": 12.75,
"learning_rate": 9.954242342367555e-09,
"loss": 2.3904964923858643,
"step": 126
},
{
"epoch": 0.15758694983071714,
"grad_norm": 5.0625,
"learning_rate": 9.951807267581707e-09,
"loss": 1.8866188526153564,
"step": 128
},
{
"epoch": 0.1600492459218221,
"grad_norm": 1.984375,
"learning_rate": 9.94930946044957e-09,
"loss": 1.2808419466018677,
"step": 130
},
{
"epoch": 0.16251154201292706,
"grad_norm": 2.484375,
"learning_rate": 9.946748960622197e-09,
"loss": 1.3167526721954346,
"step": 132
},
{
"epoch": 0.16497383810403202,
"grad_norm": 2.4375,
"learning_rate": 9.944125808745837e-09,
"loss": 1.2127764225006104,
"step": 134
},
{
"epoch": 0.16743613419513698,
"grad_norm": 4.5,
"learning_rate": 9.941440046461305e-09,
"loss": 1.9335191249847412,
"step": 136
},
{
"epoch": 0.1698984302862419,
"grad_norm": 9.0,
"learning_rate": 9.938691716403316e-09,
"loss": 1.9803462028503418,
"step": 138
},
{
"epoch": 0.17236072637734687,
"grad_norm": 4.65625,
"learning_rate": 9.935880862199809e-09,
"loss": 1.820433259010315,
"step": 140
},
{
"epoch": 0.17482302246845183,
"grad_norm": 5.78125,
"learning_rate": 9.93300752847124e-09,
"loss": 1.9337809085845947,
"step": 142
},
{
"epoch": 0.1772853185595568,
"grad_norm": 5.28125,
"learning_rate": 9.930071760829904e-09,
"loss": 1.8973931074142456,
"step": 144
},
{
"epoch": 0.17974761465066175,
"grad_norm": 5.40625,
"learning_rate": 9.927073605879185e-09,
"loss": 1.9531124830245972,
"step": 146
},
{
"epoch": 0.1822099107417667,
"grad_norm": 5.75,
"learning_rate": 9.924013111212818e-09,
"loss": 1.9310762882232666,
"step": 148
},
{
"epoch": 0.18467220683287167,
"grad_norm": 9.375,
"learning_rate": 9.920890325414153e-09,
"loss": 2.008820056915283,
"step": 150
},
{
"epoch": 0.1871345029239766,
"grad_norm": 82.5,
"learning_rate": 9.917705298055361e-09,
"loss": 3.0185141563415527,
"step": 152
},
{
"epoch": 0.18959679901508156,
"grad_norm": 8.625,
"learning_rate": 9.914458079696664e-09,
"loss": 2.008962631225586,
"step": 154
},
{
"epoch": 0.19205909510618652,
"grad_norm": 9.25,
"learning_rate": 9.91114872188552e-09,
"loss": 1.6197317838668823,
"step": 156
},
{
"epoch": 0.19452139119729148,
"grad_norm": 4.53125,
"learning_rate": 9.907777277155811e-09,
"loss": 1.8305246829986572,
"step": 158
},
{
"epoch": 0.19698368728839644,
"grad_norm": 9.75,
"learning_rate": 9.904343799027012e-09,
"loss": 1.9033877849578857,
"step": 160
},
{
"epoch": 0.1994459833795014,
"grad_norm": 8.5,
"learning_rate": 9.90084834200333e-09,
"loss": 1.9224884510040283,
"step": 162
},
{
"epoch": 0.20190827947060633,
"grad_norm": 5.5,
"learning_rate": 9.897290961572854e-09,
"loss": 1.5109963417053223,
"step": 164
},
{
"epoch": 0.2043705755617113,
"grad_norm": 6.0625,
"learning_rate": 9.893671714206662e-09,
"loss": 1.9377520084381104,
"step": 166
},
{
"epoch": 0.20683287165281625,
"grad_norm": 5.03125,
"learning_rate": 9.889990657357933e-09,
"loss": 1.6958491802215576,
"step": 168
},
{
"epoch": 0.2092951677439212,
"grad_norm": 5.1875,
"learning_rate": 9.886247849461023e-09,
"loss": 1.320851445198059,
"step": 170
},
{
"epoch": 0.21175746383502617,
"grad_norm": 17.375,
"learning_rate": 9.882443349930552e-09,
"loss": 2.529175281524658,
"step": 172
},
{
"epoch": 0.21421975992613113,
"grad_norm": 5.53125,
"learning_rate": 9.878577219160456e-09,
"loss": 1.9636085033416748,
"step": 174
},
{
"epoch": 0.21668205601723609,
"grad_norm": 5.84375,
"learning_rate": 9.87464951852302e-09,
"loss": 1.9693580865859985,
"step": 176
},
{
"epoch": 0.21914435210834102,
"grad_norm": 8.5,
"learning_rate": 9.870660310367915e-09,
"loss": 1.955024242401123,
"step": 178
},
{
"epoch": 0.22160664819944598,
"grad_norm": 11.5,
"learning_rate": 9.866609658021202e-09,
"loss": 2.3577377796173096,
"step": 180
},
{
"epoch": 0.22406894429055094,
"grad_norm": 14.1875,
"learning_rate": 9.862497625784324e-09,
"loss": 2.3302321434020996,
"step": 182
},
{
"epoch": 0.2265312403816559,
"grad_norm": 5.40625,
"learning_rate": 9.8583242789331e-09,
"loss": 1.872032642364502,
"step": 184
},
{
"epoch": 0.22899353647276086,
"grad_norm": 8.1875,
"learning_rate": 9.854089683716666e-09,
"loss": 1.9843339920043945,
"step": 186
},
{
"epoch": 0.23145583256386582,
"grad_norm": 6.375,
"learning_rate": 9.849793907356444e-09,
"loss": 1.8600096702575684,
"step": 188
},
{
"epoch": 0.23391812865497075,
"grad_norm": 11.0,
"learning_rate": 9.845437018045063e-09,
"loss": 2.281198024749756,
"step": 190
},
{
"epoch": 0.2363804247460757,
"grad_norm": 4.34375,
"learning_rate": 9.841019084945281e-09,
"loss": 1.8489793539047241,
"step": 192
},
{
"epoch": 0.23884272083718067,
"grad_norm": 4.40625,
"learning_rate": 9.836540178188888e-09,
"loss": 1.8184915781021118,
"step": 194
},
{
"epoch": 0.24130501692828563,
"grad_norm": 39.5,
"learning_rate": 9.832000368875586e-09,
"loss": 2.5119130611419678,
"step": 196
},
{
"epoch": 0.24376731301939059,
"grad_norm": 42.25,
"learning_rate": 9.82739972907187e-09,
"loss": 1.7983183860778809,
"step": 198
},
{
"epoch": 0.24622960911049555,
"grad_norm": 7.9375,
"learning_rate": 9.822738331809873e-09,
"loss": 1.8701186180114746,
"step": 200
},
{
"epoch": 0.2486919052016005,
"grad_norm": 10.8125,
"learning_rate": 9.818016251086222e-09,
"loss": 2.0227789878845215,
"step": 202
},
{
"epoch": 0.25115420129270544,
"grad_norm": 9.625,
"learning_rate": 9.813233561860844e-09,
"loss": 2.185953140258789,
"step": 204
},
{
"epoch": 0.2536164973838104,
"grad_norm": 5.0,
"learning_rate": 9.808390340055792e-09,
"loss": 1.850534439086914,
"step": 206
},
{
"epoch": 0.25607879347491536,
"grad_norm": 5.125,
"learning_rate": 9.803486662554038e-09,
"loss": 1.9469786882400513,
"step": 208
},
{
"epoch": 0.2585410895660203,
"grad_norm": 4.03125,
"learning_rate": 9.798522607198235e-09,
"loss": 1.7527638673782349,
"step": 210
},
{
"epoch": 0.2610033856571253,
"grad_norm": 4.75,
"learning_rate": 9.79349825278951e-09,
"loss": 1.9203780889511108,
"step": 212
},
{
"epoch": 0.2634656817482302,
"grad_norm": 4.53125,
"learning_rate": 9.788413679086188e-09,
"loss": 1.8700388669967651,
"step": 214
},
{
"epoch": 0.2659279778393352,
"grad_norm": 5.78125,
"learning_rate": 9.783268966802539e-09,
"loss": 2.030698299407959,
"step": 216
},
{
"epoch": 0.2683902739304401,
"grad_norm": 7.375,
"learning_rate": 9.778064197607495e-09,
"loss": 1.936469316482544,
"step": 218
},
{
"epoch": 0.2708525700215451,
"grad_norm": 18.875,
"learning_rate": 9.772799454123349e-09,
"loss": 2.471208095550537,
"step": 220
},
{
"epoch": 0.27331486611265005,
"grad_norm": 13.25,
"learning_rate": 9.767474819924447e-09,
"loss": 2.437526226043701,
"step": 222
},
{
"epoch": 0.275777162203755,
"grad_norm": 6.5625,
"learning_rate": 9.762090379535862e-09,
"loss": 2.013521909713745,
"step": 224
},
{
"epoch": 0.27823945829485996,
"grad_norm": 14.0625,
"learning_rate": 9.756646218432053e-09,
"loss": 2.0168678760528564,
"step": 226
},
{
"epoch": 0.2807017543859649,
"grad_norm": 5.8125,
"learning_rate": 9.751142423035501e-09,
"loss": 1.995202660560608,
"step": 228
},
{
"epoch": 0.2831640504770699,
"grad_norm": 42.5,
"learning_rate": 9.74557908071535e-09,
"loss": 1.953993320465088,
"step": 230
},
{
"epoch": 0.2856263465681748,
"grad_norm": 2.46875,
"learning_rate": 9.739956279786e-09,
"loss": 1.149980068206787,
"step": 232
},
{
"epoch": 0.2880886426592798,
"grad_norm": 4.21875,
"learning_rate": 9.734274109505729e-09,
"loss": 1.7589616775512695,
"step": 234
},
{
"epoch": 0.29055093875038474,
"grad_norm": 5.0625,
"learning_rate": 9.72853266007526e-09,
"loss": 1.9171326160430908,
"step": 236
},
{
"epoch": 0.29301323484148967,
"grad_norm": 11.4375,
"learning_rate": 9.722732022636333e-09,
"loss": 1.6742775440216064,
"step": 238
},
{
"epoch": 0.29547553093259465,
"grad_norm": 4.78125,
"learning_rate": 9.716872289270262e-09,
"loss": 1.7873895168304443,
"step": 240
},
{
"epoch": 0.2979378270236996,
"grad_norm": 4.40625,
"learning_rate": 9.710953552996464e-09,
"loss": 1.9001209735870361,
"step": 242
},
{
"epoch": 0.3004001231148046,
"grad_norm": 4.78125,
"learning_rate": 9.704975907770995e-09,
"loss": 1.869600534439087,
"step": 244
},
{
"epoch": 0.3028624192059095,
"grad_norm": 3.46875,
"learning_rate": 9.69893944848505e-09,
"loss": 1.5148907899856567,
"step": 246
},
{
"epoch": 0.3053247152970145,
"grad_norm": 14.6875,
"learning_rate": 9.69284427096345e-09,
"loss": 1.914973497390747,
"step": 248
},
{
"epoch": 0.3077870113881194,
"grad_norm": 13.125,
"learning_rate": 9.686690471963147e-09,
"loss": 2.230684757232666,
"step": 250
},
{
"epoch": 0.31024930747922436,
"grad_norm": 7.34375,
"learning_rate": 9.680478149171657e-09,
"loss": 2.0974578857421875,
"step": 252
},
{
"epoch": 0.31271160357032934,
"grad_norm": 13.5625,
"learning_rate": 9.674207401205524e-09,
"loss": 2.2117700576782227,
"step": 254
},
{
"epoch": 0.3151738996614343,
"grad_norm": 5.25,
"learning_rate": 9.667878327608756e-09,
"loss": 1.8505613803863525,
"step": 256
},
{
"epoch": 0.31763619575253926,
"grad_norm": 14.25,
"learning_rate": 9.661491028851246e-09,
"loss": 1.7967166900634766,
"step": 258
},
{
"epoch": 0.3200984918436442,
"grad_norm": 4.0625,
"learning_rate": 9.655045606327165e-09,
"loss": 1.869051456451416,
"step": 260
},
{
"epoch": 0.3225607879347491,
"grad_norm": 9.0625,
"learning_rate": 9.648542162353366e-09,
"loss": 1.876924753189087,
"step": 262
},
{
"epoch": 0.3250230840258541,
"grad_norm": 5.21875,
"learning_rate": 9.64198080016775e-09,
"loss": 2.0315141677856445,
"step": 264
},
{
"epoch": 0.32748538011695905,
"grad_norm": 8.5625,
"learning_rate": 9.635361623927643e-09,
"loss": 2.1542179584503174,
"step": 266
},
{
"epoch": 0.32994767620806403,
"grad_norm": 3.140625,
"learning_rate": 9.62868473870811e-09,
"loss": 1.1597316265106201,
"step": 268
},
{
"epoch": 0.33240997229916897,
"grad_norm": 10.5,
"learning_rate": 9.621950250500333e-09,
"loss": 2.637326717376709,
"step": 270
},
{
"epoch": 0.33487226839027395,
"grad_norm": 2.859375,
"learning_rate": 9.615158266209887e-09,
"loss": 1.283077597618103,
"step": 272
},
{
"epoch": 0.3373345644813789,
"grad_norm": 7.125,
"learning_rate": 9.608308893655061e-09,
"loss": 2.046065092086792,
"step": 274
},
{
"epoch": 0.3397968605724838,
"grad_norm": 2.953125,
"learning_rate": 9.601402241565154e-09,
"loss": 1.1603574752807617,
"step": 276
},
{
"epoch": 0.3422591566635888,
"grad_norm": 5.34375,
"learning_rate": 9.59443841957873e-09,
"loss": 1.7637038230895996,
"step": 278
},
{
"epoch": 0.34472145275469374,
"grad_norm": 5.21875,
"learning_rate": 9.587417538241892e-09,
"loss": 1.938485860824585,
"step": 280
},
{
"epoch": 0.3471837488457987,
"grad_norm": 29.0,
"learning_rate": 9.580339709006524e-09,
"loss": 2.3233187198638916,
"step": 282
},
{
"epoch": 0.34964604493690365,
"grad_norm": 6.0,
"learning_rate": 9.573205044228518e-09,
"loss": 1.4073760509490967,
"step": 284
},
{
"epoch": 0.35210834102800864,
"grad_norm": 6.375,
"learning_rate": 9.566013657165994e-09,
"loss": 1.3963334560394287,
"step": 286
},
{
"epoch": 0.3545706371191136,
"grad_norm": 6.8125,
"learning_rate": 9.558765661977503e-09,
"loss": 1.9514954090118408,
"step": 288
},
{
"epoch": 0.3570329332102185,
"grad_norm": 5.75,
"learning_rate": 9.551461173720208e-09,
"loss": 2.0840539932250977,
"step": 290
},
{
"epoch": 0.3594952293013235,
"grad_norm": 11.9375,
"learning_rate": 9.544100308348067e-09,
"loss": 2.2709197998046875,
"step": 292
},
{
"epoch": 0.3619575253924284,
"grad_norm": 12.3125,
"learning_rate": 9.536683182709986e-09,
"loss": 2.443535327911377,
"step": 294
},
{
"epoch": 0.3644198214835334,
"grad_norm": 18.875,
"learning_rate": 9.529209914547962e-09,
"loss": 2.240347385406494,
"step": 296
},
{
"epoch": 0.36688211757463834,
"grad_norm": 12.375,
"learning_rate": 9.521680622495228e-09,
"loss": 2.1307570934295654,
"step": 298
},
{
"epoch": 0.36934441366574333,
"grad_norm": 11.8125,
"learning_rate": 9.514095426074347e-09,
"loss": 2.510369062423706,
"step": 300
},
{
"epoch": 0.37180670975684826,
"grad_norm": 5.03125,
"learning_rate": 9.506454445695337e-09,
"loss": 1.9031611680984497,
"step": 302
},
{
"epoch": 0.3742690058479532,
"grad_norm": 2.484375,
"learning_rate": 9.498757802653741e-09,
"loss": 1.2329223155975342,
"step": 304
},
{
"epoch": 0.3767313019390582,
"grad_norm": 5.28125,
"learning_rate": 9.491005619128721e-09,
"loss": 1.8155068159103394,
"step": 306
},
{
"epoch": 0.3791935980301631,
"grad_norm": 7.625,
"learning_rate": 9.483198018181099e-09,
"loss": 1.736093282699585,
"step": 308
},
{
"epoch": 0.3816558941212681,
"grad_norm": 13.5625,
"learning_rate": 9.475335123751412e-09,
"loss": 1.9234977960586548,
"step": 310
},
{
"epoch": 0.38411819021237303,
"grad_norm": 8.5,
"learning_rate": 9.467417060657952e-09,
"loss": 1.9270076751708984,
"step": 312
},
{
"epoch": 0.38658048630347797,
"grad_norm": 4.0625,
"learning_rate": 9.459443954594769e-09,
"loss": 1.350337028503418,
"step": 314
},
{
"epoch": 0.38904278239458295,
"grad_norm": 2.609375,
"learning_rate": 9.451415932129692e-09,
"loss": 1.1429853439331055,
"step": 316
},
{
"epoch": 0.3915050784856879,
"grad_norm": 4.90625,
"learning_rate": 9.443333120702307e-09,
"loss": 1.8531888723373413,
"step": 318
},
{
"epoch": 0.3939673745767929,
"grad_norm": 3.0625,
"learning_rate": 9.435195648621935e-09,
"loss": 1.3913381099700928,
"step": 320
},
{
"epoch": 0.3964296706678978,
"grad_norm": 5.15625,
"learning_rate": 9.42700364506561e-09,
"loss": 1.8761987686157227,
"step": 322
},
{
"epoch": 0.3988919667590028,
"grad_norm": 4.4375,
"learning_rate": 9.418757240076008e-09,
"loss": 1.9191958904266357,
"step": 324
},
{
"epoch": 0.4013542628501077,
"grad_norm": 2.75,
"learning_rate": 9.410456564559393e-09,
"loss": 1.175315260887146,
"step": 326
},
{
"epoch": 0.40381655894121266,
"grad_norm": 12.375,
"learning_rate": 9.402101750283545e-09,
"loss": 2.3216049671173096,
"step": 328
},
{
"epoch": 0.40627885503231764,
"grad_norm": 2.265625,
"learning_rate": 9.39369292987565e-09,
"loss": 1.1453694105148315,
"step": 330
},
{
"epoch": 0.4087411511234226,
"grad_norm": 5.71875,
"learning_rate": 9.38523023682022e-09,
"loss": 1.9262512922286987,
"step": 332
},
{
"epoch": 0.41120344721452756,
"grad_norm": 10.375,
"learning_rate": 9.376713805456945e-09,
"loss": 2.126582622528076,
"step": 334
},
{
"epoch": 0.4136657433056325,
"grad_norm": 2.609375,
"learning_rate": 9.368143770978586e-09,
"loss": 1.1786751747131348,
"step": 336
},
{
"epoch": 0.4161280393967375,
"grad_norm": 8.125,
"learning_rate": 9.359520269428812e-09,
"loss": 2.126143217086792,
"step": 338
},
{
"epoch": 0.4185903354878424,
"grad_norm": 2.390625,
"learning_rate": 9.350843437700052e-09,
"loss": 1.245577335357666,
"step": 340
},
{
"epoch": 0.42105263157894735,
"grad_norm": 29.625,
"learning_rate": 9.342113413531315e-09,
"loss": 2.009819507598877,
"step": 342
},
{
"epoch": 0.42351492767005233,
"grad_norm": 3.875,
"learning_rate": 9.333330335506001e-09,
"loss": 1.1387863159179688,
"step": 344
},
{
"epoch": 0.42597722376115726,
"grad_norm": 12.5625,
"learning_rate": 9.324494343049707e-09,
"loss": 2.2192680835723877,
"step": 346
},
{
"epoch": 0.42843951985226225,
"grad_norm": 11.0,
"learning_rate": 9.315605576428018e-09,
"loss": 1.939860463142395,
"step": 348
},
{
"epoch": 0.4309018159433672,
"grad_norm": 13.8125,
"learning_rate": 9.306664176744266e-09,
"loss": 2.318619728088379,
"step": 350
},
{
"epoch": 0.43336411203447217,
"grad_norm": 3.15625,
"learning_rate": 9.297670285937303e-09,
"loss": 1.0619254112243652,
"step": 352
},
{
"epoch": 0.4358264081255771,
"grad_norm": 5.4375,
"learning_rate": 9.288624046779241e-09,
"loss": 1.834202766418457,
"step": 354
},
{
"epoch": 0.43828870421668203,
"grad_norm": 9.8125,
"learning_rate": 9.279525602873189e-09,
"loss": 1.9926815032958984,
"step": 356
},
{
"epoch": 0.440751000307787,
"grad_norm": 4.8125,
"learning_rate": 9.27037509865097e-09,
"loss": 1.9792507886886597,
"step": 358
},
{
"epoch": 0.44321329639889195,
"grad_norm": 9.125,
"learning_rate": 9.26117267937083e-09,
"loss": 1.5881253480911255,
"step": 360
},
{
"epoch": 0.44567559248999694,
"grad_norm": 15.5,
"learning_rate": 9.251918491115142e-09,
"loss": 2.488168239593506,
"step": 362
},
{
"epoch": 0.4481378885811019,
"grad_norm": 4.8125,
"learning_rate": 9.242612680788061e-09,
"loss": 1.9684348106384277,
"step": 364
},
{
"epoch": 0.45060018467220686,
"grad_norm": 9.375,
"learning_rate": 9.233255396113223e-09,
"loss": 2.305130958557129,
"step": 366
},
{
"epoch": 0.4530624807633118,
"grad_norm": 16.75,
"learning_rate": 9.223846785631378e-09,
"loss": 2.335341215133667,
"step": 368
},
{
"epoch": 0.4555247768544167,
"grad_norm": 5.25,
"learning_rate": 9.214386998698039e-09,
"loss": 1.7638440132141113,
"step": 370
},
{
"epoch": 0.4579870729455217,
"grad_norm": 5.53125,
"learning_rate": 9.20487618548112e-09,
"loss": 1.7996431589126587,
"step": 372
},
{
"epoch": 0.46044936903662664,
"grad_norm": 4.6875,
"learning_rate": 9.195314496958531e-09,
"loss": 1.7842280864715576,
"step": 374
},
{
"epoch": 0.46291166512773163,
"grad_norm": 38.0,
"learning_rate": 9.185702084915805e-09,
"loss": 2.152765989303589,
"step": 376
},
{
"epoch": 0.46537396121883656,
"grad_norm": 5.125,
"learning_rate": 9.176039101943672e-09,
"loss": 1.7519220113754272,
"step": 378
},
{
"epoch": 0.4678362573099415,
"grad_norm": 45.0,
"learning_rate": 9.166325701435644e-09,
"loss": 2.9101526737213135,
"step": 380
},
{
"epoch": 0.4702985534010465,
"grad_norm": 12.9375,
"learning_rate": 9.156562037585576e-09,
"loss": 2.2048463821411133,
"step": 382
},
{
"epoch": 0.4727608494921514,
"grad_norm": 5.4375,
"learning_rate": 9.146748265385223e-09,
"loss": 1.8226771354675293,
"step": 384
},
{
"epoch": 0.4752231455832564,
"grad_norm": 16.0,
"learning_rate": 9.13688454062178e-09,
"loss": 2.297773838043213,
"step": 386
},
{
"epoch": 0.47768544167436133,
"grad_norm": 16.375,
"learning_rate": 9.126971019875397e-09,
"loss": 2.2794573307037354,
"step": 388
},
{
"epoch": 0.4801477377654663,
"grad_norm": 8.1875,
"learning_rate": 9.117007860516713e-09,
"loss": 1.2689777612686157,
"step": 390
},
{
"epoch": 0.48261003385657125,
"grad_norm": 10.4375,
"learning_rate": 9.106995220704344e-09,
"loss": 2.273574113845825,
"step": 392
},
{
"epoch": 0.4850723299476762,
"grad_norm": 4.3125,
"learning_rate": 9.09693325938237e-09,
"loss": 1.7581639289855957,
"step": 394
},
{
"epoch": 0.48753462603878117,
"grad_norm": 4.25,
"learning_rate": 9.08682213627782e-09,
"loss": 1.8824234008789062,
"step": 396
},
{
"epoch": 0.4899969221298861,
"grad_norm": 40.0,
"learning_rate": 9.076662011898145e-09,
"loss": 2.692976951599121,
"step": 398
},
{
"epoch": 0.4924592182209911,
"grad_norm": 5.0625,
"learning_rate": 9.066453047528642e-09,
"loss": 1.951959490776062,
"step": 400
},
{
"epoch": 0.494921514312096,
"grad_norm": 19.125,
"learning_rate": 9.056195405229922e-09,
"loss": 2.419041156768799,
"step": 402
},
{
"epoch": 0.497383810403201,
"grad_norm": 4.3125,
"learning_rate": 9.045889247835322e-09,
"loss": 1.7131880521774292,
"step": 404
},
{
"epoch": 0.49984610649430594,
"grad_norm": 2.875,
"learning_rate": 9.035534738948328e-09,
"loss": 1.2638614177703857,
"step": 406
},
{
"epoch": 0.5023084025854109,
"grad_norm": 6.90625,
"learning_rate": 9.02513204293997e-09,
"loss": 1.8727983236312866,
"step": 408
},
{
"epoch": 0.5047706986765158,
"grad_norm": 2.203125,
"learning_rate": 9.014681324946216e-09,
"loss": 1.1091878414154053,
"step": 410
},
{
"epoch": 0.5072329947676208,
"grad_norm": 5.5625,
"learning_rate": 9.004182750865357e-09,
"loss": 2.032684326171875,
"step": 412
},
{
"epoch": 0.5096952908587258,
"grad_norm": 3.25,
"learning_rate": 8.993636487355366e-09,
"loss": 1.4393967390060425,
"step": 414
},
{
"epoch": 0.5121575869498307,
"grad_norm": 15.4375,
"learning_rate": 8.98304270183125e-09,
"loss": 2.364288806915283,
"step": 416
},
{
"epoch": 0.5146198830409356,
"grad_norm": 6.84375,
"learning_rate": 8.9724015624624e-09,
"loss": 1.4677906036376953,
"step": 418
},
{
"epoch": 0.5170821791320406,
"grad_norm": 4.53125,
"learning_rate": 8.961713238169922e-09,
"loss": 1.9610824584960938,
"step": 420
},
{
"epoch": 0.5195444752231456,
"grad_norm": 6.1875,
"learning_rate": 8.950977898623947e-09,
"loss": 1.8107311725616455,
"step": 422
},
{
"epoch": 0.5220067713142506,
"grad_norm": 2.859375,
"learning_rate": 8.940195714240937e-09,
"loss": 1.2439892292022705,
"step": 424
},
{
"epoch": 0.5244690674053555,
"grad_norm": 9.75,
"learning_rate": 8.929366856181003e-09,
"loss": 1.985514521598816,
"step": 426
},
{
"epoch": 0.5269313634964604,
"grad_norm": 3.703125,
"learning_rate": 8.918491496345149e-09,
"loss": 1.8395881652832031,
"step": 428
},
{
"epoch": 0.5293936595875655,
"grad_norm": 3.421875,
"learning_rate": 8.907569807372576e-09,
"loss": 1.2282559871673584,
"step": 430
},
{
"epoch": 0.5318559556786704,
"grad_norm": 4.75,
"learning_rate": 8.896601962637927e-09,
"loss": 1.9522662162780762,
"step": 432
},
{
"epoch": 0.5343182517697753,
"grad_norm": 6.4375,
"learning_rate": 8.885588136248539e-09,
"loss": 1.831364631652832,
"step": 434
},
{
"epoch": 0.5367805478608803,
"grad_norm": 3.21875,
"learning_rate": 8.874528503041674e-09,
"loss": 1.3392367362976074,
"step": 436
},
{
"epoch": 0.5392428439519852,
"grad_norm": 2.03125,
"learning_rate": 8.86342323858175e-09,
"loss": 1.154931664466858,
"step": 438
},
{
"epoch": 0.5417051400430902,
"grad_norm": 2.84375,
"learning_rate": 8.852272519157554e-09,
"loss": 1.1106712818145752,
"step": 440
},
{
"epoch": 0.5441674361341952,
"grad_norm": 12.6875,
"learning_rate": 8.841076521779431e-09,
"loss": 2.266367197036743,
"step": 442
},
{
"epoch": 0.5466297322253001,
"grad_norm": 6.78125,
"learning_rate": 8.829835424176495e-09,
"loss": 1.9257324934005737,
"step": 444
},
{
"epoch": 0.549092028316405,
"grad_norm": 7.6875,
"learning_rate": 8.81854940479379e-09,
"loss": 1.2584561109542847,
"step": 446
},
{
"epoch": 0.55155432440751,
"grad_norm": 8.3125,
"learning_rate": 8.807218642789463e-09,
"loss": 2.150424003601074,
"step": 448
},
{
"epoch": 0.554016620498615,
"grad_norm": 3.6875,
"learning_rate": 8.795843318031926e-09,
"loss": 1.100125789642334,
"step": 450
},
{
"epoch": 0.5564789165897199,
"grad_norm": 4.71875,
"learning_rate": 8.78442361109699e-09,
"loss": 1.8502240180969238,
"step": 452
},
{
"epoch": 0.5589412126808249,
"grad_norm": 4.625,
"learning_rate": 8.772959703265008e-09,
"loss": 1.7188208103179932,
"step": 454
},
{
"epoch": 0.5614035087719298,
"grad_norm": 2.25,
"learning_rate": 8.76145177651799e-09,
"loss": 1.1569561958312988,
"step": 456
},
{
"epoch": 0.5638658048630347,
"grad_norm": 13.5,
"learning_rate": 8.74990001353672e-09,
"loss": 2.2237837314605713,
"step": 458
},
{
"epoch": 0.5663281009541398,
"grad_norm": 2.625,
"learning_rate": 8.738304597697855e-09,
"loss": 1.2278821468353271,
"step": 460
},
{
"epoch": 0.5687903970452447,
"grad_norm": 2.984375,
"learning_rate": 8.726665713071004e-09,
"loss": 1.4073512554168701,
"step": 462
},
{
"epoch": 0.5712526931363496,
"grad_norm": 12.375,
"learning_rate": 8.714983544415824e-09,
"loss": 2.3128976821899414,
"step": 464
},
{
"epoch": 0.5737149892274546,
"grad_norm": 13.3125,
"learning_rate": 8.703258277179076e-09,
"loss": 2.249760627746582,
"step": 466
},
{
"epoch": 0.5761772853185596,
"grad_norm": 5.75,
"learning_rate": 8.691490097491676e-09,
"loss": 1.949746012687683,
"step": 468
},
{
"epoch": 0.5786395814096645,
"grad_norm": 8.0625,
"learning_rate": 8.679679192165755e-09,
"loss": 2.0255026817321777,
"step": 470
},
{
"epoch": 0.5811018775007695,
"grad_norm": 2.953125,
"learning_rate": 8.667825748691678e-09,
"loss": 1.172034502029419,
"step": 472
},
{
"epoch": 0.5835641735918744,
"grad_norm": 13.4375,
"learning_rate": 8.655929955235084e-09,
"loss": 1.7464905977249146,
"step": 474
},
{
"epoch": 0.5860264696829793,
"grad_norm": 4.875,
"learning_rate": 8.643992000633882e-09,
"loss": 1.7516231536865234,
"step": 476
},
{
"epoch": 0.5884887657740844,
"grad_norm": 13.6875,
"learning_rate": 8.632012074395267e-09,
"loss": 1.9086973667144775,
"step": 478
},
{
"epoch": 0.5909510618651893,
"grad_norm": 20.375,
"learning_rate": 8.619990366692703e-09,
"loss": 1.120478630065918,
"step": 480
},
{
"epoch": 0.5934133579562942,
"grad_norm": 5.40625,
"learning_rate": 8.607927068362909e-09,
"loss": 1.8365321159362793,
"step": 482
},
{
"epoch": 0.5958756540473992,
"grad_norm": 4.21875,
"learning_rate": 8.595822370902824e-09,
"loss": 1.8781213760375977,
"step": 484
},
{
"epoch": 0.5983379501385041,
"grad_norm": 5.09375,
"learning_rate": 8.583676466466578e-09,
"loss": 1.8990083932876587,
"step": 486
},
{
"epoch": 0.6008002462296091,
"grad_norm": 9.25,
"learning_rate": 8.571489547862432e-09,
"loss": 2.005687713623047,
"step": 488
},
{
"epoch": 0.6032625423207141,
"grad_norm": 11.75,
"learning_rate": 8.559261808549717e-09,
"loss": 2.288544178009033,
"step": 490
},
{
"epoch": 0.605724838411819,
"grad_norm": 12.0625,
"learning_rate": 8.546993442635767e-09,
"loss": 1.9239308834075928,
"step": 492
},
{
"epoch": 0.6081871345029239,
"grad_norm": 3.203125,
"learning_rate": 8.534684644872836e-09,
"loss": 1.2520358562469482,
"step": 494
},
{
"epoch": 0.610649430594029,
"grad_norm": 7.65625,
"learning_rate": 8.522335610655014e-09,
"loss": 2.1090569496154785,
"step": 496
},
{
"epoch": 0.6131117266851339,
"grad_norm": 10.3125,
"learning_rate": 8.509946536015109e-09,
"loss": 2.2030882835388184,
"step": 498
},
{
"epoch": 0.6155740227762388,
"grad_norm": 18.75,
"learning_rate": 8.497517617621549e-09,
"loss": 2.205538034439087,
"step": 500
},
{
"epoch": 0.6180363188673438,
"grad_norm": 3.484375,
"learning_rate": 8.485049052775255e-09,
"loss": 1.5225834846496582,
"step": 502
},
{
"epoch": 0.6204986149584487,
"grad_norm": 4.8125,
"learning_rate": 8.472541039406509e-09,
"loss": 1.8662419319152832,
"step": 504
},
{
"epoch": 0.6229609110495538,
"grad_norm": 3.3125,
"learning_rate": 8.459993776071815e-09,
"loss": 1.5459778308868408,
"step": 506
},
{
"epoch": 0.6254232071406587,
"grad_norm": 2.359375,
"learning_rate": 8.44740746195074e-09,
"loss": 1.2113550901412964,
"step": 508
},
{
"epoch": 0.6278855032317636,
"grad_norm": 3.078125,
"learning_rate": 8.434782296842755e-09,
"loss": 1.2501018047332764,
"step": 510
},
{
"epoch": 0.6303477993228686,
"grad_norm": 5.46875,
"learning_rate": 8.422118481164076e-09,
"loss": 1.3121228218078613,
"step": 512
},
{
"epoch": 0.6328100954139735,
"grad_norm": 8.875,
"learning_rate": 8.409416215944459e-09,
"loss": 2.0257339477539062,
"step": 514
},
{
"epoch": 0.6352723915050785,
"grad_norm": 2.828125,
"learning_rate": 8.396675702824026e-09,
"loss": 1.249032974243164,
"step": 516
},
{
"epoch": 0.6377346875961835,
"grad_norm": 2.40625,
"learning_rate": 8.38389714405006e-09,
"loss": 1.089784026145935,
"step": 518
},
{
"epoch": 0.6401969836872884,
"grad_norm": 3.078125,
"learning_rate": 8.371080742473797e-09,
"loss": 1.107433795928955,
"step": 520
},
{
"epoch": 0.6426592797783933,
"grad_norm": 24.25,
"learning_rate": 8.358226701547196e-09,
"loss": 2.397225856781006,
"step": 522
},
{
"epoch": 0.6451215758694983,
"grad_norm": 30.625,
"learning_rate": 8.345335225319716e-09,
"loss": 2.917544364929199,
"step": 524
},
{
"epoch": 0.6475838719606033,
"grad_norm": 5.3125,
"learning_rate": 8.332406518435087e-09,
"loss": 1.9733543395996094,
"step": 526
},
{
"epoch": 0.6500461680517082,
"grad_norm": 11.5625,
"learning_rate": 8.319440786128039e-09,
"loss": 2.30487060546875,
"step": 528
},
{
"epoch": 0.6525084641428132,
"grad_norm": 14.125,
"learning_rate": 8.306438234221058e-09,
"loss": 2.489694118499756,
"step": 530
},
{
"epoch": 0.6549707602339181,
"grad_norm": 4.90625,
"learning_rate": 8.293399069121128e-09,
"loss": 1.7912418842315674,
"step": 532
},
{
"epoch": 0.6574330563250231,
"grad_norm": 9.4375,
"learning_rate": 8.280323497816431e-09,
"loss": 1.935392141342163,
"step": 534
},
{
"epoch": 0.6598953524161281,
"grad_norm": 6.09375,
"learning_rate": 8.267211727873078e-09,
"loss": 1.9411722421646118,
"step": 536
},
{
"epoch": 0.662357648507233,
"grad_norm": 3.765625,
"learning_rate": 8.254063967431816e-09,
"loss": 1.7723370790481567,
"step": 538
},
{
"epoch": 0.6648199445983379,
"grad_norm": 10.75,
"learning_rate": 8.240880425204702e-09,
"loss": 2.3154473304748535,
"step": 540
},
{
"epoch": 0.6672822406894429,
"grad_norm": 6.25,
"learning_rate": 8.22766131047182e-09,
"loss": 1.941293716430664,
"step": 542
},
{
"epoch": 0.6697445367805479,
"grad_norm": 5.15625,
"learning_rate": 8.21440683307794e-09,
"loss": 1.8273173570632935,
"step": 544
},
{
"epoch": 0.6722068328716528,
"grad_norm": 5.75,
"learning_rate": 8.201117203429187e-09,
"loss": 1.917323112487793,
"step": 546
},
{
"epoch": 0.6746691289627578,
"grad_norm": 4.53125,
"learning_rate": 8.18779263248971e-09,
"loss": 1.5516306161880493,
"step": 548
},
{
"epoch": 0.6771314250538627,
"grad_norm": 5.71875,
"learning_rate": 8.174433331778322e-09,
"loss": 2.0121002197265625,
"step": 550
},
{
"epoch": 0.6795937211449676,
"grad_norm": 4.34375,
"learning_rate": 8.161039513365158e-09,
"loss": 1.2636222839355469,
"step": 552
},
{
"epoch": 0.6820560172360727,
"grad_norm": 3.8125,
"learning_rate": 8.147611389868293e-09,
"loss": 1.3448388576507568,
"step": 554
},
{
"epoch": 0.6845183133271776,
"grad_norm": 8.0625,
"learning_rate": 8.13414917445037e-09,
"loss": 2.0951576232910156,
"step": 556
},
{
"epoch": 0.6869806094182825,
"grad_norm": 10.875,
"learning_rate": 8.120653080815219e-09,
"loss": 2.3154006004333496,
"step": 558
},
{
"epoch": 0.6894429055093875,
"grad_norm": 2.96875,
"learning_rate": 8.107123323204473e-09,
"loss": 1.1850239038467407,
"step": 560
},
{
"epoch": 0.6919052016004925,
"grad_norm": 4.4375,
"learning_rate": 8.093560116394149e-09,
"loss": 1.9023423194885254,
"step": 562
},
{
"epoch": 0.6943674976915974,
"grad_norm": 5.5,
"learning_rate": 8.079963675691255e-09,
"loss": 1.9364053010940552,
"step": 564
},
{
"epoch": 0.6968297937827024,
"grad_norm": 8.1875,
"learning_rate": 8.06633421693036e-09,
"loss": 1.8559212684631348,
"step": 566
},
{
"epoch": 0.6992920898738073,
"grad_norm": 12.0,
"learning_rate": 8.052671956470177e-09,
"loss": 1.9172155857086182,
"step": 568
},
{
"epoch": 0.7017543859649122,
"grad_norm": 5.1875,
"learning_rate": 8.038977111190119e-09,
"loss": 1.7878023386001587,
"step": 570
},
{
"epoch": 0.7042166820560173,
"grad_norm": 5.28125,
"learning_rate": 8.025249898486866e-09,
"loss": 1.9518636465072632,
"step": 572
},
{
"epoch": 0.7066789781471222,
"grad_norm": 4.875,
"learning_rate": 8.011490536270911e-09,
"loss": 1.7933154106140137,
"step": 574
},
{
"epoch": 0.7091412742382271,
"grad_norm": 4.75,
"learning_rate": 7.997699242963094e-09,
"loss": 1.7392499446868896,
"step": 576
},
{
"epoch": 0.7116035703293321,
"grad_norm": 3.734375,
"learning_rate": 7.983876237491148e-09,
"loss": 1.403039813041687,
"step": 578
},
{
"epoch": 0.714065866420437,
"grad_norm": 2.921875,
"learning_rate": 7.970021739286207e-09,
"loss": 1.1680914163589478,
"step": 580
},
{
"epoch": 0.716528162511542,
"grad_norm": 2.234375,
"learning_rate": 7.956135968279335e-09,
"loss": 1.1165484189987183,
"step": 582
},
{
"epoch": 0.718990458602647,
"grad_norm": 10.9375,
"learning_rate": 7.942219144898033e-09,
"loss": 2.342836856842041,
"step": 584
},
{
"epoch": 0.7214527546937519,
"grad_norm": 3.953125,
"learning_rate": 7.928271490062737e-09,
"loss": 1.8495182991027832,
"step": 586
},
{
"epoch": 0.7239150507848569,
"grad_norm": 5.875,
"learning_rate": 7.914293225183313e-09,
"loss": 1.9028046131134033,
"step": 588
},
{
"epoch": 0.7263773468759618,
"grad_norm": 10.0625,
"learning_rate": 7.900284572155538e-09,
"loss": 1.9208589792251587,
"step": 590
},
{
"epoch": 0.7288396429670668,
"grad_norm": 4.59375,
"learning_rate": 7.886245753357586e-09,
"loss": 1.8670642375946045,
"step": 592
},
{
"epoch": 0.7313019390581718,
"grad_norm": 65.0,
"learning_rate": 7.872176991646488e-09,
"loss": 1.555503487586975,
"step": 594
},
{
"epoch": 0.7337642351492767,
"grad_norm": 5.46875,
"learning_rate": 7.858078510354597e-09,
"loss": 1.9539310932159424,
"step": 596
},
{
"epoch": 0.7362265312403816,
"grad_norm": 2.703125,
"learning_rate": 7.843950533286057e-09,
"loss": 1.2128690481185913,
"step": 598
},
{
"epoch": 0.7386888273314867,
"grad_norm": 4.46875,
"learning_rate": 7.829793284713224e-09,
"loss": 1.873086929321289,
"step": 600
},
{
"epoch": 0.7411511234225916,
"grad_norm": 2.578125,
"learning_rate": 7.81560698937313e-09,
"loss": 1.1673393249511719,
"step": 602
},
{
"epoch": 0.7436134195136965,
"grad_norm": 12.8125,
"learning_rate": 7.801391872463896e-09,
"loss": 2.315310001373291,
"step": 604
},
{
"epoch": 0.7460757156048015,
"grad_norm": 11.3125,
"learning_rate": 7.787148159641176e-09,
"loss": 2.4388017654418945,
"step": 606
},
{
"epoch": 0.7485380116959064,
"grad_norm": 8.75,
"learning_rate": 7.77287607701456e-09,
"loss": 2.1161627769470215,
"step": 608
},
{
"epoch": 0.7510003077870114,
"grad_norm": 3.921875,
"learning_rate": 7.758575851143987e-09,
"loss": 1.1796162128448486,
"step": 610
},
{
"epoch": 0.7534626038781164,
"grad_norm": 4.90625,
"learning_rate": 7.744247709036165e-09,
"loss": 1.3470849990844727,
"step": 612
},
{
"epoch": 0.7559248999692213,
"grad_norm": 12.3125,
"learning_rate": 7.729891878140936e-09,
"loss": 2.33459734916687,
"step": 614
},
{
"epoch": 0.7583871960603262,
"grad_norm": 6.96875,
"learning_rate": 7.715508586347695e-09,
"loss": 1.9637078046798706,
"step": 616
},
{
"epoch": 0.7608494921514312,
"grad_norm": 4.34375,
"learning_rate": 7.701098061981757e-09,
"loss": 1.9413955211639404,
"step": 618
},
{
"epoch": 0.7633117882425362,
"grad_norm": 9.6875,
"learning_rate": 7.686660533800736e-09,
"loss": 1.9719551801681519,
"step": 620
},
{
"epoch": 0.7657740843336411,
"grad_norm": 3.71875,
"learning_rate": 7.672196230990918e-09,
"loss": 1.3401029109954834,
"step": 622
},
{
"epoch": 0.7682363804247461,
"grad_norm": 6.59375,
"learning_rate": 7.65770538316361e-09,
"loss": 1.7963333129882812,
"step": 624
},
{
"epoch": 0.770698676515851,
"grad_norm": 7.96875,
"learning_rate": 7.643188220351516e-09,
"loss": 2.0712432861328125,
"step": 626
},
{
"epoch": 0.7731609726069559,
"grad_norm": 11.125,
"learning_rate": 7.628644973005061e-09,
"loss": 2.3805270195007324,
"step": 628
},
{
"epoch": 0.775623268698061,
"grad_norm": 7.34375,
"learning_rate": 7.61407587198875e-09,
"loss": 1.2845838069915771,
"step": 630
},
{
"epoch": 0.7780855647891659,
"grad_norm": 20.875,
"learning_rate": 7.5994811485775e-09,
"loss": 2.2516846656799316,
"step": 632
},
{
"epoch": 0.7805478608802708,
"grad_norm": 5.0,
"learning_rate": 7.584861034452963e-09,
"loss": 1.964002251625061,
"step": 634
},
{
"epoch": 0.7830101569713758,
"grad_norm": 3.046875,
"learning_rate": 7.570215761699855e-09,
"loss": 1.3124688863754272,
"step": 636
},
{
"epoch": 0.7854724530624808,
"grad_norm": 11.6875,
"learning_rate": 7.55554556280227e-09,
"loss": 2.2107834815979004,
"step": 638
},
{
"epoch": 0.7879347491535857,
"grad_norm": 5.6875,
"learning_rate": 7.540850670639978e-09,
"loss": 1.9630699157714844,
"step": 640
},
{
"epoch": 0.7903970452446907,
"grad_norm": 5.75,
"learning_rate": 7.526131318484753e-09,
"loss": 1.9335198402404785,
"step": 642
},
{
"epoch": 0.7928593413357956,
"grad_norm": 3.765625,
"learning_rate": 7.511387739996644e-09,
"loss": 1.2916162014007568,
"step": 644
},
{
"epoch": 0.7953216374269005,
"grad_norm": 14.5625,
"learning_rate": 7.496620169220286e-09,
"loss": 2.1263046264648438,
"step": 646
},
{
"epoch": 0.7977839335180056,
"grad_norm": 5.78125,
"learning_rate": 7.481828840581164e-09,
"loss": 1.8862347602844238,
"step": 648
},
{
"epoch": 0.8002462296091105,
"grad_norm": 11.75,
"learning_rate": 7.46701398888192e-09,
"loss": 2.1435751914978027,
"step": 650
},
{
"epoch": 0.8027085257002154,
"grad_norm": 36.25,
"learning_rate": 7.45217584929859e-09,
"loss": 1.8985021114349365,
"step": 652
},
{
"epoch": 0.8051708217913204,
"grad_norm": 3.96875,
"learning_rate": 7.437314657376906e-09,
"loss": 1.255218267440796,
"step": 654
},
{
"epoch": 0.8076331178824253,
"grad_norm": 6.71875,
"learning_rate": 7.422430649028533e-09,
"loss": 1.8039145469665527,
"step": 656
},
{
"epoch": 0.8100954139735304,
"grad_norm": 2.828125,
"learning_rate": 7.407524060527333e-09,
"loss": 1.2014645338058472,
"step": 658
},
{
"epoch": 0.8125577100646353,
"grad_norm": 9.5625,
"learning_rate": 7.3925951285056146e-09,
"loss": 2.114205837249756,
"step": 660
},
{
"epoch": 0.8150200061557402,
"grad_norm": 18.0,
"learning_rate": 7.377644089950371e-09,
"loss": 2.3271141052246094,
"step": 662
},
{
"epoch": 0.8174823022468451,
"grad_norm": 4.59375,
"learning_rate": 7.362671182199527e-09,
"loss": 1.9512523412704468,
"step": 664
},
{
"epoch": 0.8199445983379502,
"grad_norm": 4.875,
"learning_rate": 7.347676642938163e-09,
"loss": 1.875675082206726,
"step": 666
},
{
"epoch": 0.8224068944290551,
"grad_norm": 7.28125,
"learning_rate": 7.332660710194749e-09,
"loss": 2.120806932449341,
"step": 668
},
{
"epoch": 0.8248691905201601,
"grad_norm": 12.1875,
"learning_rate": 7.3176236223373595e-09,
"loss": 2.482332229614258,
"step": 670
},
{
"epoch": 0.827331486611265,
"grad_norm": 5.34375,
"learning_rate": 7.302565618069894e-09,
"loss": 1.932433843612671,
"step": 672
},
{
"epoch": 0.8297937827023699,
"grad_norm": 2.296875,
"learning_rate": 7.287486936428282e-09,
"loss": 1.1869601011276245,
"step": 674
},
{
"epoch": 0.832256078793475,
"grad_norm": 2.40625,
"learning_rate": 7.272387816776704e-09,
"loss": 1.2416247129440308,
"step": 676
},
{
"epoch": 0.8347183748845799,
"grad_norm": 6.34375,
"learning_rate": 7.257268498803767e-09,
"loss": 1.4887652397155762,
"step": 678
},
{
"epoch": 0.8371806709756848,
"grad_norm": 5.34375,
"learning_rate": 7.2421292225187186e-09,
"loss": 1.833484411239624,
"step": 680
},
{
"epoch": 0.8396429670667898,
"grad_norm": 13.8125,
"learning_rate": 7.2269702282476335e-09,
"loss": 2.041853904724121,
"step": 682
},
{
"epoch": 0.8421052631578947,
"grad_norm": 14.625,
"learning_rate": 7.211791756629598e-09,
"loss": 2.366133689880371,
"step": 684
},
{
"epoch": 0.8445675592489997,
"grad_norm": 10.875,
"learning_rate": 7.196594048612881e-09,
"loss": 1.9250491857528687,
"step": 686
},
{
"epoch": 0.8470298553401047,
"grad_norm": 10.3125,
"learning_rate": 7.1813773454511215e-09,
"loss": 2.2896928787231445,
"step": 688
},
{
"epoch": 0.8494921514312096,
"grad_norm": 5.40625,
"learning_rate": 7.166141888699495e-09,
"loss": 1.9879870414733887,
"step": 690
},
{
"epoch": 0.8519544475223145,
"grad_norm": 11.625,
"learning_rate": 7.150887920210878e-09,
"loss": 2.2236876487731934,
"step": 692
},
{
"epoch": 0.8544167436134195,
"grad_norm": 10.0,
"learning_rate": 7.135615682132004e-09,
"loss": 1.4050698280334473,
"step": 694
},
{
"epoch": 0.8568790397045245,
"grad_norm": 22.25,
"learning_rate": 7.120325416899629e-09,
"loss": 2.2749319076538086,
"step": 696
},
{
"epoch": 0.8593413357956294,
"grad_norm": 15.75,
"learning_rate": 7.105017367236675e-09,
"loss": 2.3958988189697266,
"step": 698
},
{
"epoch": 0.8618036318867344,
"grad_norm": 11.0,
"learning_rate": 7.089691776148384e-09,
"loss": 2.313142776489258,
"step": 700
},
{
"epoch": 0.8642659279778393,
"grad_norm": 11.625,
"learning_rate": 7.0743488869184535e-09,
"loss": 2.3592798709869385,
"step": 702
},
{
"epoch": 0.8667282240689443,
"grad_norm": 8.5625,
"learning_rate": 7.058988943105175e-09,
"loss": 2.11894154548645,
"step": 704
},
{
"epoch": 0.8691905201600493,
"grad_norm": 2.34375,
"learning_rate": 7.04361218853758e-09,
"loss": 1.3712561130523682,
"step": 706
},
{
"epoch": 0.8716528162511542,
"grad_norm": 13.4375,
"learning_rate": 7.0282188673115514e-09,
"loss": 2.092770576477051,
"step": 708
},
{
"epoch": 0.8741151123422591,
"grad_norm": 15.0625,
"learning_rate": 7.012809223785957e-09,
"loss": 1.9357192516326904,
"step": 710
},
{
"epoch": 0.8765774084333641,
"grad_norm": 2.953125,
"learning_rate": 6.9973835025787715e-09,
"loss": 1.2680325508117676,
"step": 712
},
{
"epoch": 0.8790397045244691,
"grad_norm": 7.125,
"learning_rate": 6.981941948563198e-09,
"loss": 1.7719722986221313,
"step": 714
},
{
"epoch": 0.881502000615574,
"grad_norm": 5.0625,
"learning_rate": 6.966484806863764e-09,
"loss": 1.8633275032043457,
"step": 716
},
{
"epoch": 0.883964296706679,
"grad_norm": 3.296875,
"learning_rate": 6.9510123228524545e-09,
"loss": 1.4539438486099243,
"step": 718
},
{
"epoch": 0.8864265927977839,
"grad_norm": 13.25,
"learning_rate": 6.935524742144792e-09,
"loss": 2.2359728813171387,
"step": 720
},
{
"epoch": 0.8888888888888888,
"grad_norm": 6.78125,
"learning_rate": 6.920022310595953e-09,
"loss": 1.8414530754089355,
"step": 722
},
{
"epoch": 0.8913511849799939,
"grad_norm": 3.84375,
"learning_rate": 6.904505274296864e-09,
"loss": 1.2079766988754272,
"step": 724
},
{
"epoch": 0.8938134810710988,
"grad_norm": 8.625,
"learning_rate": 6.88897387957029e-09,
"loss": 1.9165315628051758,
"step": 726
},
{
"epoch": 0.8962757771622037,
"grad_norm": 3.34375,
"learning_rate": 6.87342837296693e-09,
"loss": 1.2759442329406738,
"step": 728
},
{
"epoch": 0.8987380732533087,
"grad_norm": 5.34375,
"learning_rate": 6.857869001261491e-09,
"loss": 1.2644639015197754,
"step": 730
},
{
"epoch": 0.9012003693444137,
"grad_norm": 12.75,
"learning_rate": 6.842296011448788e-09,
"loss": 2.2167718410491943,
"step": 732
},
{
"epoch": 0.9036626654355187,
"grad_norm": 7.1875,
"learning_rate": 6.826709650739812e-09,
"loss": 1.402853012084961,
"step": 734
},
{
"epoch": 0.9061249615266236,
"grad_norm": 9.25,
"learning_rate": 6.811110166557809e-09,
"loss": 2.0942487716674805,
"step": 736
},
{
"epoch": 0.9085872576177285,
"grad_norm": 4.40625,
"learning_rate": 6.795497806534348e-09,
"loss": 1.8234786987304688,
"step": 738
},
{
"epoch": 0.9110495537088334,
"grad_norm": 16.5,
"learning_rate": 6.779872818505397e-09,
"loss": 1.8784126043319702,
"step": 740
},
{
"epoch": 0.9135118497999385,
"grad_norm": 9.5,
"learning_rate": 6.7642354505073835e-09,
"loss": 2.2190794944763184,
"step": 742
},
{
"epoch": 0.9159741458910434,
"grad_norm": 4.8125,
"learning_rate": 6.748585950773263e-09,
"loss": 1.9413115978240967,
"step": 744
},
{
"epoch": 0.9184364419821484,
"grad_norm": 3.109375,
"learning_rate": 6.732924567728566e-09,
"loss": 1.3823771476745605,
"step": 746
},
{
"epoch": 0.9208987380732533,
"grad_norm": 5.03125,
"learning_rate": 6.7172515499874705e-09,
"loss": 1.9463045597076416,
"step": 748
},
{
"epoch": 0.9233610341643582,
"grad_norm": 6.71875,
"learning_rate": 6.701567146348843e-09,
"loss": 2.0039689540863037,
"step": 750
},
{
"epoch": 0.9258233302554633,
"grad_norm": 3.828125,
"learning_rate": 6.685871605792301e-09,
"loss": 1.438122272491455,
"step": 752
},
{
"epoch": 0.9282856263465682,
"grad_norm": 34.25,
"learning_rate": 6.670165177474241e-09,
"loss": 1.7374298572540283,
"step": 754
},
{
"epoch": 0.9307479224376731,
"grad_norm": 2.796875,
"learning_rate": 6.6544481107239054e-09,
"loss": 1.4571634531021118,
"step": 756
},
{
"epoch": 0.9332102185287781,
"grad_norm": 4.78125,
"learning_rate": 6.638720655039412e-09,
"loss": 1.7221906185150146,
"step": 758
},
{
"epoch": 0.935672514619883,
"grad_norm": 22.25,
"learning_rate": 6.622983060083796e-09,
"loss": 1.344387173652649,
"step": 760
},
{
"epoch": 0.938134810710988,
"grad_norm": 2.4375,
"learning_rate": 6.607235575681045e-09,
"loss": 1.2809216976165771,
"step": 762
},
{
"epoch": 0.940597106802093,
"grad_norm": 2.609375,
"learning_rate": 6.591478451812138e-09,
"loss": 1.1766109466552734,
"step": 764
},
{
"epoch": 0.9430594028931979,
"grad_norm": 3.765625,
"learning_rate": 6.575711938611073e-09,
"loss": 1.3128526210784912,
"step": 766
},
{
"epoch": 0.9455216989843028,
"grad_norm": 5.625,
"learning_rate": 6.559936286360897e-09,
"loss": 1.8674499988555908,
"step": 768
},
{
"epoch": 0.9479839950754079,
"grad_norm": 5.28125,
"learning_rate": 6.544151745489735e-09,
"loss": 1.934564471244812,
"step": 770
},
{
"epoch": 0.9504462911665128,
"grad_norm": 7.625,
"learning_rate": 6.52835856656681e-09,
"loss": 2.1300408840179443,
"step": 772
},
{
"epoch": 0.9529085872576177,
"grad_norm": 10.3125,
"learning_rate": 6.512557000298471e-09,
"loss": 2.284024715423584,
"step": 774
},
{
"epoch": 0.9553708833487227,
"grad_norm": 5.15625,
"learning_rate": 6.49674729752421e-09,
"loss": 1.9190423488616943,
"step": 776
},
{
"epoch": 0.9578331794398276,
"grad_norm": 9.0,
"learning_rate": 6.480929709212682e-09,
"loss": 2.2223734855651855,
"step": 778
},
{
"epoch": 0.9602954755309326,
"grad_norm": 5.5,
"learning_rate": 6.465104486457718e-09,
"loss": 1.9598147869110107,
"step": 780
},
{
"epoch": 0.9627577716220376,
"grad_norm": 6.59375,
"learning_rate": 6.4492718804743365e-09,
"loss": 2.041882276535034,
"step": 782
},
{
"epoch": 0.9652200677131425,
"grad_norm": 2.125,
"learning_rate": 6.433432142594771e-09,
"loss": 1.2188262939453125,
"step": 784
},
{
"epoch": 0.9676823638042474,
"grad_norm": 11.375,
"learning_rate": 6.4175855242644575e-09,
"loss": 2.208829879760742,
"step": 786
},
{
"epoch": 0.9701446598953524,
"grad_norm": 5.0,
"learning_rate": 6.401732277038063e-09,
"loss": 2.0125837326049805,
"step": 788
},
{
"epoch": 0.9726069559864574,
"grad_norm": 8.75,
"learning_rate": 6.3858726525754814e-09,
"loss": 2.2643885612487793,
"step": 790
},
{
"epoch": 0.9750692520775623,
"grad_norm": 7.0625,
"learning_rate": 6.370006902637836e-09,
"loss": 1.9207779169082642,
"step": 792
},
{
"epoch": 0.9775315481686673,
"grad_norm": 2.59375,
"learning_rate": 6.354135279083497e-09,
"loss": 1.2121376991271973,
"step": 794
},
{
"epoch": 0.9799938442597722,
"grad_norm": 10.9375,
"learning_rate": 6.338258033864067e-09,
"loss": 2.1134583950042725,
"step": 796
},
{
"epoch": 0.9824561403508771,
"grad_norm": 18.125,
"learning_rate": 6.3223754190203895e-09,
"loss": 2.3652374744415283,
"step": 798
},
{
"epoch": 0.9849184364419822,
"grad_norm": 11.6875,
"learning_rate": 6.306487686678556e-09,
"loss": 1.956110954284668,
"step": 800
},
{
"epoch": 0.9873807325330871,
"grad_norm": 5.21875,
"learning_rate": 6.290595089045882e-09,
"loss": 1.993713140487671,
"step": 802
},
{
"epoch": 0.989843028624192,
"grad_norm": 19.25,
"learning_rate": 6.274697878406925e-09,
"loss": 1.3555768728256226,
"step": 804
},
{
"epoch": 0.992305324715297,
"grad_norm": 14.9375,
"learning_rate": 6.2587963071194695e-09,
"loss": 1.7694034576416016,
"step": 806
},
{
"epoch": 0.994767620806402,
"grad_norm": 14.0,
"learning_rate": 6.242890627610518e-09,
"loss": 2.2126145362854004,
"step": 808
},
{
"epoch": 0.997229916897507,
"grad_norm": 5.46875,
"learning_rate": 6.226981092372297e-09,
"loss": 1.7438420057296753,
"step": 810
},
{
"epoch": 0.9996922129886119,
"grad_norm": 3.671875,
"learning_rate": 6.211067953958229e-09,
"loss": 1.237831711769104,
"step": 812
},
{
"epoch": 1.0012311480455525,
"grad_norm": 2.15625,
"learning_rate": 6.195151464978945e-09,
"loss": 1.2776278257369995,
"step": 814
},
{
"epoch": 1.0036934441366574,
"grad_norm": 5.625,
"learning_rate": 6.179231878098257e-09,
"loss": 1.6098976135253906,
"step": 816
},
{
"epoch": 1.0061557402277623,
"grad_norm": 1.875,
"learning_rate": 6.163309446029157e-09,
"loss": 1.5421602725982666,
"step": 818
},
{
"epoch": 1.0086180363188673,
"grad_norm": 5.3125,
"learning_rate": 6.1473844215298045e-09,
"loss": 1.4228730201721191,
"step": 820
},
{
"epoch": 1.0110803324099722,
"grad_norm": 14.75,
"learning_rate": 6.131457057399506e-09,
"loss": 2.0147526264190674,
"step": 822
},
{
"epoch": 1.0135426285010773,
"grad_norm": 13.9375,
"learning_rate": 6.115527606474713e-09,
"loss": 2.301534652709961,
"step": 824
},
{
"epoch": 1.0160049245921823,
"grad_norm": 6.75,
"learning_rate": 6.099596321625005e-09,
"loss": 1.9000599384307861,
"step": 826
},
{
"epoch": 1.0184672206832872,
"grad_norm": 1.4140625,
"learning_rate": 6.083663455749068e-09,
"loss": 1.2694454193115234,
"step": 828
},
{
"epoch": 1.0209295167743921,
"grad_norm": 2.453125,
"learning_rate": 6.0677292617706915e-09,
"loss": 1.1476200819015503,
"step": 830
},
{
"epoch": 1.023391812865497,
"grad_norm": 15.125,
"learning_rate": 6.051793992634741e-09,
"loss": 1.685870885848999,
"step": 832
},
{
"epoch": 1.025854108956602,
"grad_norm": 5.15625,
"learning_rate": 6.035857901303159e-09,
"loss": 2.1021130084991455,
"step": 834
},
{
"epoch": 1.028316405047707,
"grad_norm": 9.25,
"learning_rate": 6.019921240750932e-09,
"loss": 1.9393489360809326,
"step": 836
},
{
"epoch": 1.0307787011388119,
"grad_norm": 3.640625,
"learning_rate": 6.0039842639620844e-09,
"loss": 1.9408633708953857,
"step": 838
},
{
"epoch": 1.0332409972299168,
"grad_norm": 16.875,
"learning_rate": 5.988047223925661e-09,
"loss": 2.042579174041748,
"step": 840
},
{
"epoch": 1.035703293321022,
"grad_norm": 2.328125,
"learning_rate": 5.9721103736317114e-09,
"loss": 1.7358704805374146,
"step": 842
},
{
"epoch": 1.0381655894121269,
"grad_norm": 7.53125,
"learning_rate": 5.956173966067275e-09,
"loss": 1.5867335796356201,
"step": 844
},
{
"epoch": 1.0406278855032318,
"grad_norm": 4.34375,
"learning_rate": 5.940238254212358e-09,
"loss": 1.8849399089813232,
"step": 846
},
{
"epoch": 1.0430901815943368,
"grad_norm": 4.84375,
"learning_rate": 5.924303491035925e-09,
"loss": 1.643231987953186,
"step": 848
},
{
"epoch": 1.0455524776854417,
"grad_norm": 14.0625,
"learning_rate": 5.9083699294918835e-09,
"loss": 2.0420408248901367,
"step": 850
},
{
"epoch": 1.0480147737765466,
"grad_norm": 10.5,
"learning_rate": 5.89243782251506e-09,
"loss": 2.353334903717041,
"step": 852
},
{
"epoch": 1.0504770698676515,
"grad_norm": 12.625,
"learning_rate": 5.876507423017199e-09,
"loss": 2.2866880893707275,
"step": 854
},
{
"epoch": 1.0529393659587565,
"grad_norm": 5.09375,
"learning_rate": 5.8605789838829335e-09,
"loss": 2.091262102127075,
"step": 856
},
{
"epoch": 1.0554016620498614,
"grad_norm": 15.1875,
"learning_rate": 5.844652757965778e-09,
"loss": 2.1091365814208984,
"step": 858
},
{
"epoch": 1.0578639581409663,
"grad_norm": 2.4375,
"learning_rate": 5.828728998084117e-09,
"loss": 1.6677895784378052,
"step": 860
},
{
"epoch": 1.0603262542320715,
"grad_norm": 4.4375,
"learning_rate": 5.812807957017181e-09,
"loss": 1.5235992670059204,
"step": 862
},
{
"epoch": 1.0627885503231764,
"grad_norm": 12.1875,
"learning_rate": 5.796889887501051e-09,
"loss": 2.279834270477295,
"step": 864
},
{
"epoch": 1.0652508464142814,
"grad_norm": 9.125,
"learning_rate": 5.780975042224629e-09,
"loss": 2.450547456741333,
"step": 866
},
{
"epoch": 1.0677131425053863,
"grad_norm": 61.25,
"learning_rate": 5.765063673825634e-09,
"loss": 2.2601470947265625,
"step": 868
},
{
"epoch": 1.0701754385964912,
"grad_norm": 3.140625,
"learning_rate": 5.749156034886602e-09,
"loss": 1.6974682807922363,
"step": 870
},
{
"epoch": 1.0726377346875962,
"grad_norm": 5.75,
"learning_rate": 5.733252377930853e-09,
"loss": 1.7504122257232666,
"step": 872
},
{
"epoch": 1.075100030778701,
"grad_norm": 3.640625,
"learning_rate": 5.7173529554185045e-09,
"loss": 1.7744596004486084,
"step": 874
},
{
"epoch": 1.077562326869806,
"grad_norm": 5.0625,
"learning_rate": 5.701458019742448e-09,
"loss": 1.8063809871673584,
"step": 876
},
{
"epoch": 1.080024622960911,
"grad_norm": 4.75,
"learning_rate": 5.685567823224358e-09,
"loss": 1.8798420429229736,
"step": 878
},
{
"epoch": 1.082486919052016,
"grad_norm": 12.75,
"learning_rate": 5.669682618110672e-09,
"loss": 2.0758848190307617,
"step": 880
},
{
"epoch": 1.084949215143121,
"grad_norm": 12.5,
"learning_rate": 5.653802656568592e-09,
"loss": 2.1326591968536377,
"step": 882
},
{
"epoch": 1.087411511234226,
"grad_norm": 4.8125,
"learning_rate": 5.637928190682084e-09,
"loss": 1.9486507177352905,
"step": 884
},
{
"epoch": 1.089873807325331,
"grad_norm": 6.75,
"learning_rate": 5.622059472447876e-09,
"loss": 1.9365224838256836,
"step": 886
},
{
"epoch": 1.0923361034164358,
"grad_norm": 6.4375,
"learning_rate": 5.606196753771449e-09,
"loss": 1.8881072998046875,
"step": 888
},
{
"epoch": 1.0947983995075408,
"grad_norm": 7.21875,
"learning_rate": 5.590340286463054e-09,
"loss": 1.9489333629608154,
"step": 890
},
{
"epoch": 1.0972606955986457,
"grad_norm": 8.5,
"learning_rate": 5.574490322233697e-09,
"loss": 1.9946143627166748,
"step": 892
},
{
"epoch": 1.0997229916897506,
"grad_norm": 3.484375,
"learning_rate": 5.558647112691158e-09,
"loss": 1.6062787771224976,
"step": 894
},
{
"epoch": 1.1021852877808556,
"grad_norm": 2.859375,
"learning_rate": 5.542810909335987e-09,
"loss": 1.2802103757858276,
"step": 896
},
{
"epoch": 1.1046475838719605,
"grad_norm": 17.0,
"learning_rate": 5.526981963557518e-09,
"loss": 1.7315878868103027,
"step": 898
},
{
"epoch": 1.1071098799630656,
"grad_norm": 9.0,
"learning_rate": 5.511160526629875e-09,
"loss": 1.9750934839248657,
"step": 900
},
{
"epoch": 1.1095721760541706,
"grad_norm": 3.515625,
"learning_rate": 5.495346849707981e-09,
"loss": 1.6797375679016113,
"step": 902
},
{
"epoch": 1.1120344721452755,
"grad_norm": 10.75,
"learning_rate": 5.479541183823578e-09,
"loss": 1.8305199146270752,
"step": 904
},
{
"epoch": 1.1144967682363804,
"grad_norm": 4.84375,
"learning_rate": 5.463743779881238e-09,
"loss": 1.9975595474243164,
"step": 906
},
{
"epoch": 1.1169590643274854,
"grad_norm": 4.65625,
"learning_rate": 5.447954888654378e-09,
"loss": 1.7815577983856201,
"step": 908
},
{
"epoch": 1.1194213604185903,
"grad_norm": 3.109375,
"learning_rate": 5.432174760781281e-09,
"loss": 1.5837122201919556,
"step": 910
},
{
"epoch": 1.1218836565096952,
"grad_norm": 2.25,
"learning_rate": 5.416403646761119e-09,
"loss": 1.2701913118362427,
"step": 912
},
{
"epoch": 1.1243459526008002,
"grad_norm": 2.890625,
"learning_rate": 5.400641796949976e-09,
"loss": 1.3599649667739868,
"step": 914
},
{
"epoch": 1.1268082486919053,
"grad_norm": 6.34375,
"learning_rate": 5.384889461556868e-09,
"loss": 1.5575028657913208,
"step": 916
},
{
"epoch": 1.1292705447830103,
"grad_norm": 3.34375,
"learning_rate": 5.36914689063978e-09,
"loss": 1.4743753671646118,
"step": 918
},
{
"epoch": 1.1317328408741152,
"grad_norm": 5.25,
"learning_rate": 5.353414334101692e-09,
"loss": 1.5236045122146606,
"step": 920
},
{
"epoch": 1.1341951369652201,
"grad_norm": 4.4375,
"learning_rate": 5.337692041686615e-09,
"loss": 1.891930341720581,
"step": 922
},
{
"epoch": 1.136657433056325,
"grad_norm": 2.046875,
"learning_rate": 5.321980262975614e-09,
"loss": 1.522653341293335,
"step": 924
},
{
"epoch": 1.13911972914743,
"grad_norm": 15.625,
"learning_rate": 5.306279247382867e-09,
"loss": 1.66744065284729,
"step": 926
},
{
"epoch": 1.141582025238535,
"grad_norm": 16.875,
"learning_rate": 5.290589244151689e-09,
"loss": 2.157740592956543,
"step": 928
},
{
"epoch": 1.1440443213296398,
"grad_norm": 2.390625,
"learning_rate": 5.274910502350581e-09,
"loss": 1.5675222873687744,
"step": 930
},
{
"epoch": 1.1465066174207448,
"grad_norm": 4.84375,
"learning_rate": 5.259243270869276e-09,
"loss": 1.1499652862548828,
"step": 932
},
{
"epoch": 1.1489689135118497,
"grad_norm": 12.75,
"learning_rate": 5.243587798414792e-09,
"loss": 1.5367200374603271,
"step": 934
},
{
"epoch": 1.1514312096029546,
"grad_norm": 5.34375,
"learning_rate": 5.227944333507477e-09,
"loss": 1.9310216903686523,
"step": 936
},
{
"epoch": 1.1538935056940598,
"grad_norm": 11.5,
"learning_rate": 5.212313124477067e-09,
"loss": 2.123908519744873,
"step": 938
},
{
"epoch": 1.1563558017851647,
"grad_norm": 7.28125,
"learning_rate": 5.196694419458744e-09,
"loss": 2.1816015243530273,
"step": 940
},
{
"epoch": 1.1588180978762697,
"grad_norm": 1.84375,
"learning_rate": 5.1810884663891986e-09,
"loss": 1.5526807308197021,
"step": 942
},
{
"epoch": 1.1612803939673746,
"grad_norm": 1.8671875,
"learning_rate": 5.165495513002691e-09,
"loss": 1.3024842739105225,
"step": 944
},
{
"epoch": 1.1637426900584795,
"grad_norm": 2.796875,
"learning_rate": 5.149915806827121e-09,
"loss": 1.2783153057098389,
"step": 946
},
{
"epoch": 1.1662049861495845,
"grad_norm": 5.125,
"learning_rate": 5.134349595180094e-09,
"loss": 1.5641247034072876,
"step": 948
},
{
"epoch": 1.1686672822406894,
"grad_norm": 7.0,
"learning_rate": 5.1187971251650065e-09,
"loss": 1.9546620845794678,
"step": 950
},
{
"epoch": 1.1711295783317943,
"grad_norm": 4.4375,
"learning_rate": 5.10325864366711e-09,
"loss": 1.87162446975708,
"step": 952
},
{
"epoch": 1.1735918744228995,
"grad_norm": 11.5,
"learning_rate": 5.087734397349596e-09,
"loss": 1.8723485469818115,
"step": 954
},
{
"epoch": 1.1760541705140044,
"grad_norm": 5.21875,
"learning_rate": 5.072224632649684e-09,
"loss": 1.91074538230896,
"step": 956
},
{
"epoch": 1.1785164666051093,
"grad_norm": 5.25,
"learning_rate": 5.056729595774712e-09,
"loss": 1.9009315967559814,
"step": 958
},
{
"epoch": 1.1809787626962143,
"grad_norm": 7.3125,
"learning_rate": 5.041249532698214e-09,
"loss": 1.9836119413375854,
"step": 960
},
{
"epoch": 1.1834410587873192,
"grad_norm": 9.375,
"learning_rate": 5.025784689156032e-09,
"loss": 1.9037981033325195,
"step": 962
},
{
"epoch": 1.1859033548784241,
"grad_norm": 27.875,
"learning_rate": 5.0103353106424065e-09,
"loss": 2.551020622253418,
"step": 964
},
{
"epoch": 1.188365650969529,
"grad_norm": 12.75,
"learning_rate": 4.994901642406078e-09,
"loss": 2.474264144897461,
"step": 966
},
{
"epoch": 1.190827947060634,
"grad_norm": 11.5625,
"learning_rate": 4.979483929446398e-09,
"loss": 1.7837506532669067,
"step": 968
},
{
"epoch": 1.193290243151739,
"grad_norm": 3.65625,
"learning_rate": 4.964082416509442e-09,
"loss": 1.760176181793213,
"step": 970
},
{
"epoch": 1.1957525392428439,
"grad_norm": 17.75,
"learning_rate": 4.948697348084115e-09,
"loss": 1.9721624851226807,
"step": 972
},
{
"epoch": 1.1982148353339488,
"grad_norm": 6.6875,
"learning_rate": 4.933328968398283e-09,
"loss": 1.8035709857940674,
"step": 974
},
{
"epoch": 1.200677131425054,
"grad_norm": 5.21875,
"learning_rate": 4.9179775214148806e-09,
"loss": 1.6362351179122925,
"step": 976
},
{
"epoch": 1.2031394275161589,
"grad_norm": 5.90625,
"learning_rate": 4.902643250828055e-09,
"loss": 1.7732539176940918,
"step": 978
},
{
"epoch": 1.2056017236072638,
"grad_norm": 4.875,
"learning_rate": 4.887326400059283e-09,
"loss": 1.7590731382369995,
"step": 980
},
{
"epoch": 1.2080640196983687,
"grad_norm": 2.421875,
"learning_rate": 4.8720272122535195e-09,
"loss": 1.590978980064392,
"step": 982
},
{
"epoch": 1.2105263157894737,
"grad_norm": 22.875,
"learning_rate": 4.8567459302753234e-09,
"loss": 1.8453547954559326,
"step": 984
},
{
"epoch": 1.2129886118805786,
"grad_norm": 6.71875,
"learning_rate": 4.841482796705019e-09,
"loss": 2.2472167015075684,
"step": 986
},
{
"epoch": 1.2154509079716835,
"grad_norm": 5.0625,
"learning_rate": 4.826238053834831e-09,
"loss": 1.9840574264526367,
"step": 988
},
{
"epoch": 1.2179132040627885,
"grad_norm": 9.3125,
"learning_rate": 4.811011943665047e-09,
"loss": 1.930182695388794,
"step": 990
},
{
"epoch": 1.2203755001538936,
"grad_norm": 15.875,
"learning_rate": 4.795804707900169e-09,
"loss": 2.222364664077759,
"step": 992
},
{
"epoch": 1.2228377962449986,
"grad_norm": 10.9375,
"learning_rate": 4.780616587945083e-09,
"loss": 2.241105079650879,
"step": 994
},
{
"epoch": 1.2253000923361035,
"grad_norm": 6.09375,
"learning_rate": 4.765447824901222e-09,
"loss": 2.1059789657592773,
"step": 996
},
{
"epoch": 1.2277623884272084,
"grad_norm": 5.0625,
"learning_rate": 4.750298659562745e-09,
"loss": 1.9286503791809082,
"step": 998
},
{
"epoch": 1.2302246845183133,
"grad_norm": 4.84375,
"learning_rate": 4.735169332412704e-09,
"loss": 1.8667454719543457,
"step": 1000
},
{
"epoch": 1.2326869806094183,
"grad_norm": 9.4375,
"learning_rate": 4.720060083619239e-09,
"loss": 2.0463290214538574,
"step": 1002
},
{
"epoch": 1.2351492767005232,
"grad_norm": 6.28125,
"learning_rate": 4.7049711530317564e-09,
"loss": 2.106719970703125,
"step": 1004
},
{
"epoch": 1.2376115727916281,
"grad_norm": 3.8125,
"learning_rate": 4.6899027801771234e-09,
"loss": 1.829174518585205,
"step": 1006
},
{
"epoch": 1.240073868882733,
"grad_norm": 47.5,
"learning_rate": 4.6748552042558664e-09,
"loss": 2.110135555267334,
"step": 1008
},
{
"epoch": 1.242536164973838,
"grad_norm": 15.9375,
"learning_rate": 4.659828664138378e-09,
"loss": 2.152853012084961,
"step": 1010
},
{
"epoch": 1.244998461064943,
"grad_norm": 10.75,
"learning_rate": 4.6448233983611165e-09,
"loss": 1.862748622894287,
"step": 1012
},
{
"epoch": 1.247460757156048,
"grad_norm": 20.375,
"learning_rate": 4.629839645122828e-09,
"loss": 2.054180860519409,
"step": 1014
},
{
"epoch": 1.249923053247153,
"grad_norm": 10.5,
"learning_rate": 4.614877642280759e-09,
"loss": 2.0183398723602295,
"step": 1016
},
{
"epoch": 1.252385349338258,
"grad_norm": 4.3125,
"learning_rate": 4.59993762734688e-09,
"loss": 1.9448716640472412,
"step": 1018
},
{
"epoch": 1.254847645429363,
"grad_norm": 5.3125,
"learning_rate": 4.585019837484127e-09,
"loss": 1.909618854522705,
"step": 1020
},
{
"epoch": 1.2573099415204678,
"grad_norm": 4.4375,
"learning_rate": 4.5701245095026175e-09,
"loss": 1.8093581199645996,
"step": 1022
},
{
"epoch": 1.2597722376115728,
"grad_norm": 4.375,
"learning_rate": 4.555251879855905e-09,
"loss": 1.8561820983886719,
"step": 1024
},
{
"epoch": 1.2622345337026777,
"grad_norm": 5.71875,
"learning_rate": 4.540402184637225e-09,
"loss": 1.9136399030685425,
"step": 1026
},
{
"epoch": 1.2646968297937828,
"grad_norm": 6.1875,
"learning_rate": 4.525575659575739e-09,
"loss": 1.922465443611145,
"step": 1028
},
{
"epoch": 1.2671591258848878,
"grad_norm": 6.125,
"learning_rate": 4.510772540032801e-09,
"loss": 1.945884346961975,
"step": 1030
},
{
"epoch": 1.2696214219759927,
"grad_norm": 11.6875,
"learning_rate": 4.495993060998216e-09,
"loss": 2.1394665241241455,
"step": 1032
},
{
"epoch": 1.2720837180670976,
"grad_norm": 12.875,
"learning_rate": 4.481237457086511e-09,
"loss": 2.548738479614258,
"step": 1034
},
{
"epoch": 1.2745460141582026,
"grad_norm": 6.65625,
"learning_rate": 4.466505962533216e-09,
"loss": 2.148568868637085,
"step": 1036
},
{
"epoch": 1.2770083102493075,
"grad_norm": 143.0,
"learning_rate": 4.451798811191132e-09,
"loss": 2.0206987857818604,
"step": 1038
},
{
"epoch": 1.2794706063404124,
"grad_norm": 4.78125,
"learning_rate": 4.437116236526635e-09,
"loss": 2.025409698486328,
"step": 1040
},
{
"epoch": 1.2819329024315174,
"grad_norm": 14.875,
"learning_rate": 4.42245847161596e-09,
"loss": 1.8983882665634155,
"step": 1042
},
{
"epoch": 1.2843951985226223,
"grad_norm": 1.8515625,
"learning_rate": 4.4078257491415e-09,
"loss": 1.594254732131958,
"step": 1044
},
{
"epoch": 1.2868574946137272,
"grad_norm": 3.75,
"learning_rate": 4.393218301388123e-09,
"loss": 1.4578649997711182,
"step": 1046
},
{
"epoch": 1.2893197907048322,
"grad_norm": 6.0625,
"learning_rate": 4.378636360239471e-09,
"loss": 1.8163200616836548,
"step": 1048
},
{
"epoch": 1.291782086795937,
"grad_norm": 21.625,
"learning_rate": 4.364080157174287e-09,
"loss": 1.811424732208252,
"step": 1050
},
{
"epoch": 1.2942443828870422,
"grad_norm": 6.46875,
"learning_rate": 4.349549923262743e-09,
"loss": 1.6952979564666748,
"step": 1052
},
{
"epoch": 1.2967066789781472,
"grad_norm": 8.9375,
"learning_rate": 4.33504588916276e-09,
"loss": 1.85584557056427,
"step": 1054
},
{
"epoch": 1.299168975069252,
"grad_norm": 6.25,
"learning_rate": 4.320568285116362e-09,
"loss": 1.8780372142791748,
"step": 1056
},
{
"epoch": 1.301631271160357,
"grad_norm": 3.265625,
"learning_rate": 4.306117340946008e-09,
"loss": 1.694900393486023,
"step": 1058
},
{
"epoch": 1.304093567251462,
"grad_norm": 5.40625,
"learning_rate": 4.291693286050951e-09,
"loss": 1.7237621545791626,
"step": 1060
},
{
"epoch": 1.306555863342567,
"grad_norm": 7.8125,
"learning_rate": 4.277296349403592e-09,
"loss": 1.9782402515411377,
"step": 1062
},
{
"epoch": 1.3090181594336718,
"grad_norm": 11.625,
"learning_rate": 4.262926759545853e-09,
"loss": 2.2806496620178223,
"step": 1064
},
{
"epoch": 1.311480455524777,
"grad_norm": 14.9375,
"learning_rate": 4.2485847445855384e-09,
"loss": 2.0329091548919678,
"step": 1066
},
{
"epoch": 1.313942751615882,
"grad_norm": 7.8125,
"learning_rate": 4.234270532192722e-09,
"loss": 1.996172308921814,
"step": 1068
},
{
"epoch": 1.3164050477069869,
"grad_norm": 5.4375,
"learning_rate": 4.219984349596131e-09,
"loss": 1.7426702976226807,
"step": 1070
},
{
"epoch": 1.3188673437980918,
"grad_norm": 4.09375,
"learning_rate": 4.205726423579531e-09,
"loss": 1.9689075946807861,
"step": 1072
},
{
"epoch": 1.3213296398891967,
"grad_norm": 4.375,
"learning_rate": 4.1914969804781435e-09,
"loss": 1.851407766342163,
"step": 1074
},
{
"epoch": 1.3237919359803016,
"grad_norm": 4.5625,
"learning_rate": 4.177296246175035e-09,
"loss": 1.9321177005767822,
"step": 1076
},
{
"epoch": 1.3262542320714066,
"grad_norm": 10.75,
"learning_rate": 4.1631244460975395e-09,
"loss": 2.1217970848083496,
"step": 1078
},
{
"epoch": 1.3287165281625115,
"grad_norm": 2.34375,
"learning_rate": 4.148981805213683e-09,
"loss": 1.6175642013549805,
"step": 1080
},
{
"epoch": 1.3311788242536164,
"grad_norm": 9.9375,
"learning_rate": 4.134868548028603e-09,
"loss": 1.8694862127304077,
"step": 1082
},
{
"epoch": 1.3336411203447214,
"grad_norm": 3.9375,
"learning_rate": 4.120784898580994e-09,
"loss": 1.9671717882156372,
"step": 1084
},
{
"epoch": 1.3361034164358263,
"grad_norm": 5.9375,
"learning_rate": 4.106731080439549e-09,
"loss": 1.6825287342071533,
"step": 1086
},
{
"epoch": 1.3385657125269312,
"grad_norm": 3.03125,
"learning_rate": 4.092707316699403e-09,
"loss": 1.5507920980453491,
"step": 1088
},
{
"epoch": 1.3410280086180364,
"grad_norm": 6.03125,
"learning_rate": 4.078713829978599e-09,
"loss": 1.4552762508392334,
"step": 1090
},
{
"epoch": 1.3434903047091413,
"grad_norm": 7.09375,
"learning_rate": 4.064750842414555e-09,
"loss": 1.8754684925079346,
"step": 1092
},
{
"epoch": 1.3459526008002463,
"grad_norm": 94.5,
"learning_rate": 4.050818575660528e-09,
"loss": 2.175379753112793,
"step": 1094
},
{
"epoch": 1.3484148968913512,
"grad_norm": 2.921875,
"learning_rate": 4.0369172508821154e-09,
"loss": 1.8554493188858032,
"step": 1096
},
{
"epoch": 1.3508771929824561,
"grad_norm": 4.5625,
"learning_rate": 4.023047088753718e-09,
"loss": 1.2790199518203735,
"step": 1098
},
{
"epoch": 1.353339489073561,
"grad_norm": 4.75,
"learning_rate": 4.009208309455052e-09,
"loss": 1.7523287534713745,
"step": 1100
},
{
"epoch": 1.355801785164666,
"grad_norm": 7.9375,
"learning_rate": 3.9954011326676595e-09,
"loss": 2.061239242553711,
"step": 1102
},
{
"epoch": 1.3582640812557711,
"grad_norm": 8.875,
"learning_rate": 3.981625777571407e-09,
"loss": 2.029423713684082,
"step": 1104
},
{
"epoch": 1.360726377346876,
"grad_norm": 22.125,
"learning_rate": 3.967882462841013e-09,
"loss": 2.4487719535827637,
"step": 1106
},
{
"epoch": 1.363188673437981,
"grad_norm": 13.4375,
"learning_rate": 3.954171406642579e-09,
"loss": 2.2747087478637695,
"step": 1108
},
{
"epoch": 1.365650969529086,
"grad_norm": 8.875,
"learning_rate": 3.940492826630122e-09,
"loss": 2.142123222351074,
"step": 1110
},
{
"epoch": 1.3681132656201909,
"grad_norm": 13.3125,
"learning_rate": 3.926846939942119e-09,
"loss": 2.411155939102173,
"step": 1112
},
{
"epoch": 1.3705755617112958,
"grad_norm": 6.96875,
"learning_rate": 3.913233963198062e-09,
"loss": 2.1852264404296875,
"step": 1114
},
{
"epoch": 1.3730378578024007,
"grad_norm": 2.71875,
"learning_rate": 3.899654112495024e-09,
"loss": 1.5160444974899292,
"step": 1116
},
{
"epoch": 1.3755001538935057,
"grad_norm": 4.59375,
"learning_rate": 3.886107603404221e-09,
"loss": 1.5113252401351929,
"step": 1118
},
{
"epoch": 1.3779624499846106,
"grad_norm": 4.71875,
"learning_rate": 3.872594650967591e-09,
"loss": 1.700373649597168,
"step": 1120
},
{
"epoch": 1.3804247460757155,
"grad_norm": 9.5625,
"learning_rate": 3.859115469694385e-09,
"loss": 1.9584300518035889,
"step": 1122
},
{
"epoch": 1.3828870421668205,
"grad_norm": 5.5,
"learning_rate": 3.845670273557754e-09,
"loss": 1.8532516956329346,
"step": 1124
},
{
"epoch": 1.3853493382579254,
"grad_norm": 4.21875,
"learning_rate": 3.832259275991365e-09,
"loss": 1.640071988105774,
"step": 1126
},
{
"epoch": 1.3878116343490305,
"grad_norm": 3.390625,
"learning_rate": 3.818882689885998e-09,
"loss": 1.2326576709747314,
"step": 1128
},
{
"epoch": 1.3902739304401355,
"grad_norm": 4.375,
"learning_rate": 3.80554072758618e-09,
"loss": 1.5156090259552002,
"step": 1130
},
{
"epoch": 1.3927362265312404,
"grad_norm": 2.625,
"learning_rate": 3.7922336008868e-09,
"loss": 1.5685241222381592,
"step": 1132
},
{
"epoch": 1.3951985226223453,
"grad_norm": 5.09375,
"learning_rate": 3.778961521029762e-09,
"loss": 1.6617923974990845,
"step": 1134
},
{
"epoch": 1.3976608187134503,
"grad_norm": 6.46875,
"learning_rate": 3.765724698700621e-09,
"loss": 1.8906147480010986,
"step": 1136
},
{
"epoch": 1.4001231148045552,
"grad_norm": 2.875,
"learning_rate": 3.752523344025243e-09,
"loss": 1.545287847518921,
"step": 1138
},
{
"epoch": 1.4025854108956601,
"grad_norm": 7.78125,
"learning_rate": 3.7393576665664675e-09,
"loss": 1.732557773590088,
"step": 1140
},
{
"epoch": 1.4050477069867653,
"grad_norm": 2.25,
"learning_rate": 3.7262278753207815e-09,
"loss": 1.72062087059021,
"step": 1142
},
{
"epoch": 1.4075100030778702,
"grad_norm": 8.75,
"learning_rate": 3.7131341787150018e-09,
"loss": 1.5638048648834229,
"step": 1144
},
{
"epoch": 1.4099722991689752,
"grad_norm": 25.0,
"learning_rate": 3.7000767846029665e-09,
"loss": 2.013415575027466,
"step": 1146
},
{
"epoch": 1.41243459526008,
"grad_norm": 2.46875,
"learning_rate": 3.687055900262238e-09,
"loss": 1.5985221862792969,
"step": 1148
},
{
"epoch": 1.414896891351185,
"grad_norm": 12.1875,
"learning_rate": 3.6740717323908046e-09,
"loss": 1.7952547073364258,
"step": 1150
},
{
"epoch": 1.41735918744229,
"grad_norm": 2.9375,
"learning_rate": 3.6611244871038118e-09,
"loss": 1.5459375381469727,
"step": 1152
},
{
"epoch": 1.4198214835333949,
"grad_norm": 6.84375,
"learning_rate": 3.648214369930278e-09,
"loss": 1.641556739807129,
"step": 1154
},
{
"epoch": 1.4222837796244998,
"grad_norm": 2.109375,
"learning_rate": 3.635341585809837e-09,
"loss": 1.5961995124816895,
"step": 1156
},
{
"epoch": 1.4247460757156047,
"grad_norm": 9.125,
"learning_rate": 3.6225063390894896e-09,
"loss": 1.6079602241516113,
"step": 1158
},
{
"epoch": 1.4272083718067097,
"grad_norm": 4.84375,
"learning_rate": 3.609708833520351e-09,
"loss": 2.1076085567474365,
"step": 1160
},
{
"epoch": 1.4296706678978146,
"grad_norm": 19.125,
"learning_rate": 3.5969492722544207e-09,
"loss": 2.1435282230377197,
"step": 1162
},
{
"epoch": 1.4321329639889195,
"grad_norm": 1.796875,
"learning_rate": 3.5842278578413577e-09,
"loss": 1.6422967910766602,
"step": 1164
},
{
"epoch": 1.4345952600800247,
"grad_norm": 4.1875,
"learning_rate": 3.5715447922252655e-09,
"loss": 1.4160196781158447,
"step": 1166
},
{
"epoch": 1.4370575561711296,
"grad_norm": 7.78125,
"learning_rate": 3.558900276741485e-09,
"loss": 1.9306385517120361,
"step": 1168
},
{
"epoch": 1.4395198522622346,
"grad_norm": 6.625,
"learning_rate": 3.5462945121134016e-09,
"loss": 2.028043508529663,
"step": 1170
},
{
"epoch": 1.4419821483533395,
"grad_norm": 18.125,
"learning_rate": 3.533727698449252e-09,
"loss": 1.7561140060424805,
"step": 1172
},
{
"epoch": 1.4444444444444444,
"grad_norm": 11.6875,
"learning_rate": 3.521200035238954e-09,
"loss": 1.9722295999526978,
"step": 1174
},
{
"epoch": 1.4469067405355494,
"grad_norm": 5.40625,
"learning_rate": 3.5087117213509367e-09,
"loss": 2.2334213256835938,
"step": 1176
},
{
"epoch": 1.4493690366266543,
"grad_norm": 10.1875,
"learning_rate": 3.4962629550289858e-09,
"loss": 2.2049357891082764,
"step": 1178
},
{
"epoch": 1.4518313327177594,
"grad_norm": 11.0625,
"learning_rate": 3.4838539338890964e-09,
"loss": 2.2469396591186523,
"step": 1180
},
{
"epoch": 1.4542936288088644,
"grad_norm": 5.59375,
"learning_rate": 3.4714848549163314e-09,
"loss": 2.023268938064575,
"step": 1182
},
{
"epoch": 1.4567559248999693,
"grad_norm": 3.671875,
"learning_rate": 3.4591559144617014e-09,
"loss": 1.8120558261871338,
"step": 1184
},
{
"epoch": 1.4592182209910742,
"grad_norm": 5.65625,
"learning_rate": 3.4468673082390432e-09,
"loss": 1.7612297534942627,
"step": 1186
},
{
"epoch": 1.4616805170821792,
"grad_norm": 23.5,
"learning_rate": 3.434619231321912e-09,
"loss": 1.9972333908081055,
"step": 1188
},
{
"epoch": 1.464142813173284,
"grad_norm": 4.3125,
"learning_rate": 3.4224118781404923e-09,
"loss": 1.8834655284881592,
"step": 1190
},
{
"epoch": 1.466605109264389,
"grad_norm": 35.25,
"learning_rate": 3.4102454424784997e-09,
"loss": 2.4007821083068848,
"step": 1192
},
{
"epoch": 1.469067405355494,
"grad_norm": 9.0,
"learning_rate": 3.398120117470115e-09,
"loss": 2.477167844772339,
"step": 1194
},
{
"epoch": 1.471529701446599,
"grad_norm": 8.625,
"learning_rate": 3.3860360955969127e-09,
"loss": 2.0541319847106934,
"step": 1196
},
{
"epoch": 1.4739919975377038,
"grad_norm": 11.3125,
"learning_rate": 3.373993568684808e-09,
"loss": 2.007800579071045,
"step": 1198
},
{
"epoch": 1.4764542936288088,
"grad_norm": 13.125,
"learning_rate": 3.36199272790101e-09,
"loss": 2.2932679653167725,
"step": 1200
},
{
"epoch": 1.4789165897199137,
"grad_norm": 2.8125,
"learning_rate": 3.350033763750989e-09,
"loss": 1.7902061939239502,
"step": 1202
},
{
"epoch": 1.4813788858110188,
"grad_norm": 15.0625,
"learning_rate": 3.3381168660754523e-09,
"loss": 1.8084830045700073,
"step": 1204
},
{
"epoch": 1.4838411819021238,
"grad_norm": 5.46875,
"learning_rate": 3.3262422240473268e-09,
"loss": 1.930219054222107,
"step": 1206
},
{
"epoch": 1.4863034779932287,
"grad_norm": 4.65625,
"learning_rate": 3.314410026168757e-09,
"loss": 1.8515759706497192,
"step": 1208
},
{
"epoch": 1.4887657740843336,
"grad_norm": 20.875,
"learning_rate": 3.30262046026812e-09,
"loss": 2.1966378688812256,
"step": 1210
},
{
"epoch": 1.4912280701754386,
"grad_norm": 5.0,
"learning_rate": 3.2908737134970367e-09,
"loss": 2.388540744781494,
"step": 1212
},
{
"epoch": 1.4936903662665435,
"grad_norm": 10.375,
"learning_rate": 3.2791699723273984e-09,
"loss": 2.1200718879699707,
"step": 1214
},
{
"epoch": 1.4961526623576484,
"grad_norm": 3.515625,
"learning_rate": 3.2675094225484135e-09,
"loss": 2.037621021270752,
"step": 1216
},
{
"epoch": 1.4986149584487536,
"grad_norm": 3.234375,
"learning_rate": 3.2558922492636578e-09,
"loss": 1.5640082359313965,
"step": 1218
},
{
"epoch": 1.5010772545398585,
"grad_norm": 6.59375,
"learning_rate": 3.2443186368881287e-09,
"loss": 1.5967392921447754,
"step": 1220
},
{
"epoch": 1.5035395506309635,
"grad_norm": 1.1875,
"learning_rate": 3.2327887691453277e-09,
"loss": 1.4248828887939453,
"step": 1222
},
{
"epoch": 1.5060018467220684,
"grad_norm": 5.84375,
"learning_rate": 3.2213028290643363e-09,
"loss": 1.5917315483093262,
"step": 1224
},
{
"epoch": 1.5084641428131733,
"grad_norm": 5.59375,
"learning_rate": 3.2098609989769122e-09,
"loss": 1.761174201965332,
"step": 1226
},
{
"epoch": 1.5109264389042782,
"grad_norm": 13.8125,
"learning_rate": 3.198463460514598e-09,
"loss": 1.7805390357971191,
"step": 1228
},
{
"epoch": 1.5133887349953832,
"grad_norm": 3.125,
"learning_rate": 3.1871103946058343e-09,
"loss": 2.06949782371521,
"step": 1230
},
{
"epoch": 1.515851031086488,
"grad_norm": 8.0625,
"learning_rate": 3.1758019814730902e-09,
"loss": 1.6458537578582764,
"step": 1232
},
{
"epoch": 1.518313327177593,
"grad_norm": 5.90625,
"learning_rate": 3.1645384006300033e-09,
"loss": 1.8969038724899292,
"step": 1234
},
{
"epoch": 1.520775623268698,
"grad_norm": 2.53125,
"learning_rate": 3.153319830878523e-09,
"loss": 1.5056371688842773,
"step": 1236
},
{
"epoch": 1.523237919359803,
"grad_norm": 25.5,
"learning_rate": 3.142146450306082e-09,
"loss": 1.7204036712646484,
"step": 1238
},
{
"epoch": 1.5257002154509078,
"grad_norm": 5.5625,
"learning_rate": 3.1310184362827594e-09,
"loss": 1.7970688343048096,
"step": 1240
},
{
"epoch": 1.5281625115420128,
"grad_norm": 2.75,
"learning_rate": 3.1199359654584756e-09,
"loss": 1.5522937774658203,
"step": 1242
},
{
"epoch": 1.530624807633118,
"grad_norm": 5.46875,
"learning_rate": 3.1088992137601797e-09,
"loss": 1.5566771030426025,
"step": 1244
},
{
"epoch": 1.5330871037242229,
"grad_norm": 4.875,
"learning_rate": 3.097908356389059e-09,
"loss": 1.8924975395202637,
"step": 1246
},
{
"epoch": 1.5355493998153278,
"grad_norm": 2.234375,
"learning_rate": 3.08696356781776e-09,
"loss": 1.5438798666000366,
"step": 1248
},
{
"epoch": 1.5380116959064327,
"grad_norm": 1.8515625,
"learning_rate": 3.0760650217876174e-09,
"loss": 1.286960482597351,
"step": 1250
},
{
"epoch": 1.5404739919975377,
"grad_norm": 3.140625,
"learning_rate": 3.0652128913058935e-09,
"loss": 1.1232177019119263,
"step": 1252
},
{
"epoch": 1.5429362880886428,
"grad_norm": 10.0625,
"learning_rate": 3.0544073486430396e-09,
"loss": 1.7119476795196533,
"step": 1254
},
{
"epoch": 1.5453985841797477,
"grad_norm": 4.84375,
"learning_rate": 3.0436485653299487e-09,
"loss": 2.0494632720947266,
"step": 1256
},
{
"epoch": 1.5478608802708527,
"grad_norm": 3.1875,
"learning_rate": 3.032936712155246e-09,
"loss": 1.5645394325256348,
"step": 1258
},
{
"epoch": 1.5503231763619576,
"grad_norm": 11.1875,
"learning_rate": 3.022271959162567e-09,
"loss": 1.7430448532104492,
"step": 1260
},
{
"epoch": 1.5527854724530625,
"grad_norm": 3.25,
"learning_rate": 3.0116544756478663e-09,
"loss": 1.6215105056762695,
"step": 1262
},
{
"epoch": 1.5552477685441675,
"grad_norm": 5.40625,
"learning_rate": 3.001084430156724e-09,
"loss": 1.4022070169448853,
"step": 1264
},
{
"epoch": 1.5577100646352724,
"grad_norm": 4.3125,
"learning_rate": 2.990561990481675e-09,
"loss": 1.7849698066711426,
"step": 1266
},
{
"epoch": 1.5601723607263773,
"grad_norm": 2.90625,
"learning_rate": 2.9800873236595416e-09,
"loss": 1.514677882194519,
"step": 1268
},
{
"epoch": 1.5626346568174823,
"grad_norm": 10.0,
"learning_rate": 2.9696605959687833e-09,
"loss": 1.529390573501587,
"step": 1270
},
{
"epoch": 1.5650969529085872,
"grad_norm": 2.5625,
"learning_rate": 2.9592819729268566e-09,
"loss": 1.8093581199645996,
"step": 1272
},
{
"epoch": 1.5675592489996921,
"grad_norm": 10.0625,
"learning_rate": 2.948951619287592e-09,
"loss": 1.3842357397079468,
"step": 1274
},
{
"epoch": 1.570021545090797,
"grad_norm": 14.5,
"learning_rate": 2.938669699038571e-09,
"loss": 1.85842764377594,
"step": 1276
},
{
"epoch": 1.572483841181902,
"grad_norm": 29.0,
"learning_rate": 2.928436375398528e-09,
"loss": 2.2186334133148193,
"step": 1278
},
{
"epoch": 1.574946137273007,
"grad_norm": 7.625,
"learning_rate": 2.9182518108147588e-09,
"loss": 2.11116361618042,
"step": 1280
},
{
"epoch": 1.577408433364112,
"grad_norm": 10.5625,
"learning_rate": 2.9081161669605395e-09,
"loss": 2.039137363433838,
"step": 1282
},
{
"epoch": 1.579870729455217,
"grad_norm": 1.7578125,
"learning_rate": 2.8980296047325638e-09,
"loss": 1.548026204109192,
"step": 1284
},
{
"epoch": 1.582333025546322,
"grad_norm": 6.34375,
"learning_rate": 2.8879922842483867e-09,
"loss": 1.4916882514953613,
"step": 1286
},
{
"epoch": 1.5847953216374269,
"grad_norm": 4.5,
"learning_rate": 2.8780043648438818e-09,
"loss": 1.6858062744140625,
"step": 1288
},
{
"epoch": 1.587257617728532,
"grad_norm": 6.84375,
"learning_rate": 2.868066005070713e-09,
"loss": 1.8366402387619019,
"step": 1290
},
{
"epoch": 1.589719913819637,
"grad_norm": 3.15625,
"learning_rate": 2.8581773626938166e-09,
"loss": 1.4952478408813477,
"step": 1292
},
{
"epoch": 1.5921822099107419,
"grad_norm": 4.3125,
"learning_rate": 2.8483385946889017e-09,
"loss": 1.4701340198516846,
"step": 1294
},
{
"epoch": 1.5946445060018468,
"grad_norm": 5.25,
"learning_rate": 2.8385498572399503e-09,
"loss": 1.8555335998535156,
"step": 1296
},
{
"epoch": 1.5971068020929517,
"grad_norm": 5.0,
"learning_rate": 2.828811305736743e-09,
"loss": 1.8610620498657227,
"step": 1298
},
{
"epoch": 1.5995690981840567,
"grad_norm": 7.09375,
"learning_rate": 2.8191230947723945e-09,
"loss": 1.883762240409851,
"step": 1300
},
{
"epoch": 1.6020313942751616,
"grad_norm": 14.5625,
"learning_rate": 2.809485378140893e-09,
"loss": 2.238772392272949,
"step": 1302
},
{
"epoch": 1.6044936903662665,
"grad_norm": 6.25,
"learning_rate": 2.7998983088346625e-09,
"loss": 2.1114282608032227,
"step": 1304
},
{
"epoch": 1.6069559864573715,
"grad_norm": 1.9140625,
"learning_rate": 2.7903620390421363e-09,
"loss": 1.6002395153045654,
"step": 1306
},
{
"epoch": 1.6094182825484764,
"grad_norm": 9.4375,
"learning_rate": 2.7808767201453376e-09,
"loss": 1.6772760152816772,
"step": 1308
},
{
"epoch": 1.6118805786395813,
"grad_norm": 10.4375,
"learning_rate": 2.771442502717478e-09,
"loss": 2.111185073852539,
"step": 1310
},
{
"epoch": 1.6143428747306863,
"grad_norm": 14.125,
"learning_rate": 2.7620595365205627e-09,
"loss": 2.0705718994140625,
"step": 1312
},
{
"epoch": 1.6168051708217912,
"grad_norm": 4.46875,
"learning_rate": 2.752727970503024e-09,
"loss": 1.95082426071167,
"step": 1314
},
{
"epoch": 1.6192674669128961,
"grad_norm": 5.03125,
"learning_rate": 2.7434479527973477e-09,
"loss": 1.7210240364074707,
"step": 1316
},
{
"epoch": 1.621729763004001,
"grad_norm": 3.515625,
"learning_rate": 2.7342196307177214e-09,
"loss": 1.6697207689285278,
"step": 1318
},
{
"epoch": 1.6241920590951062,
"grad_norm": 2.65625,
"learning_rate": 2.7250431507577004e-09,
"loss": 1.4422950744628906,
"step": 1320
},
{
"epoch": 1.6266543551862112,
"grad_norm": 2.84375,
"learning_rate": 2.7159186585878816e-09,
"loss": 1.1386830806732178,
"step": 1322
},
{
"epoch": 1.629116651277316,
"grad_norm": 3.015625,
"learning_rate": 2.7068462990535863e-09,
"loss": 1.2971214056015015,
"step": 1324
},
{
"epoch": 1.631578947368421,
"grad_norm": 19.875,
"learning_rate": 2.697826216172569e-09,
"loss": 1.638606309890747,
"step": 1326
},
{
"epoch": 1.6340412434595262,
"grad_norm": 3.109375,
"learning_rate": 2.688858553132723e-09,
"loss": 1.6914677619934082,
"step": 1328
},
{
"epoch": 1.636503539550631,
"grad_norm": 2.28125,
"learning_rate": 2.6799434522898126e-09,
"loss": 1.1819281578063965,
"step": 1330
},
{
"epoch": 1.638965835641736,
"grad_norm": 2.140625,
"learning_rate": 2.6710810551652133e-09,
"loss": 1.1034936904907227,
"step": 1332
},
{
"epoch": 1.641428131732841,
"grad_norm": 34.5,
"learning_rate": 2.66227150244366e-09,
"loss": 1.6707381010055542,
"step": 1334
},
{
"epoch": 1.643890427823946,
"grad_norm": 25.5,
"learning_rate": 2.6535149339710184e-09,
"loss": 2.70631742477417,
"step": 1336
},
{
"epoch": 1.6463527239150508,
"grad_norm": 30.75,
"learning_rate": 2.644811488752068e-09,
"loss": 2.4394781589508057,
"step": 1338
},
{
"epoch": 1.6488150200061558,
"grad_norm": 13.625,
"learning_rate": 2.636161304948286e-09,
"loss": 2.2337255477905273,
"step": 1340
},
{
"epoch": 1.6512773160972607,
"grad_norm": 13.0,
"learning_rate": 2.627564519875663e-09,
"loss": 2.295048236846924,
"step": 1342
},
{
"epoch": 1.6537396121883656,
"grad_norm": 20.0,
"learning_rate": 2.6190212700025183e-09,
"loss": 2.110807418823242,
"step": 1344
},
{
"epoch": 1.6562019082794706,
"grad_norm": 4.84375,
"learning_rate": 2.6105316909473364e-09,
"loss": 1.8732104301452637,
"step": 1346
},
{
"epoch": 1.6586642043705755,
"grad_norm": 8.125,
"learning_rate": 2.6020959174766106e-09,
"loss": 1.9254186153411865,
"step": 1348
},
{
"epoch": 1.6611265004616804,
"grad_norm": 6.15625,
"learning_rate": 2.5937140835027097e-09,
"loss": 1.8715019226074219,
"step": 1350
},
{
"epoch": 1.6635887965527854,
"grad_norm": 9.8125,
"learning_rate": 2.5853863220817436e-09,
"loss": 1.9434764385223389,
"step": 1352
},
{
"epoch": 1.6660510926438903,
"grad_norm": 5.25,
"learning_rate": 2.577112765411459e-09,
"loss": 2.207705497741699,
"step": 1354
},
{
"epoch": 1.6685133887349952,
"grad_norm": 12.625,
"learning_rate": 2.568893544829136e-09,
"loss": 1.880719780921936,
"step": 1356
},
{
"epoch": 1.6709756848261004,
"grad_norm": 9.5625,
"learning_rate": 2.560728790809509e-09,
"loss": 1.8875178098678589,
"step": 1358
},
{
"epoch": 1.6734379809172053,
"grad_norm": 5.4375,
"learning_rate": 2.5526186329626865e-09,
"loss": 1.6963284015655518,
"step": 1360
},
{
"epoch": 1.6759002770083102,
"grad_norm": 5.90625,
"learning_rate": 2.5445632000320995e-09,
"loss": 1.791224718093872,
"step": 1362
},
{
"epoch": 1.6783625730994152,
"grad_norm": 3.890625,
"learning_rate": 2.5365626198924598e-09,
"loss": 1.6278963088989258,
"step": 1364
},
{
"epoch": 1.6808248691905203,
"grad_norm": 3.375,
"learning_rate": 2.528617019547723e-09,
"loss": 1.3288359642028809,
"step": 1366
},
{
"epoch": 1.6832871652816253,
"grad_norm": 9.0625,
"learning_rate": 2.5207265251290823e-09,
"loss": 1.6888291835784912,
"step": 1368
},
{
"epoch": 1.6857494613727302,
"grad_norm": 13.375,
"learning_rate": 2.512891261892955e-09,
"loss": 2.285770893096924,
"step": 1370
},
{
"epoch": 1.6882117574638351,
"grad_norm": 3.1875,
"learning_rate": 2.505111354219002e-09,
"loss": 1.671492099761963,
"step": 1372
},
{
"epoch": 1.69067405355494,
"grad_norm": 6.25,
"learning_rate": 2.49738692560815e-09,
"loss": 1.5187859535217285,
"step": 1374
},
{
"epoch": 1.693136349646045,
"grad_norm": 7.0625,
"learning_rate": 2.4897180986806322e-09,
"loss": 1.9461727142333984,
"step": 1376
},
{
"epoch": 1.69559864573715,
"grad_norm": 7.53125,
"learning_rate": 2.482104995174044e-09,
"loss": 1.8825700283050537,
"step": 1378
},
{
"epoch": 1.6980609418282548,
"grad_norm": 5.28125,
"learning_rate": 2.474547735941405e-09,
"loss": 1.8659740686416626,
"step": 1380
},
{
"epoch": 1.7005232379193598,
"grad_norm": 5.59375,
"learning_rate": 2.4670464409492447e-09,
"loss": 1.7924315929412842,
"step": 1382
},
{
"epoch": 1.7029855340104647,
"grad_norm": 13.4375,
"learning_rate": 2.459601229275697e-09,
"loss": 1.9610867500305176,
"step": 1384
},
{
"epoch": 1.7054478301015696,
"grad_norm": 8.5,
"learning_rate": 2.4522122191086104e-09,
"loss": 1.836552381515503,
"step": 1386
},
{
"epoch": 1.7079101261926746,
"grad_norm": 8.8125,
"learning_rate": 2.4448795277436698e-09,
"loss": 1.7403874397277832,
"step": 1388
},
{
"epoch": 1.7103724222837795,
"grad_norm": 4.625,
"learning_rate": 2.4376032715825386e-09,
"loss": 1.5626749992370605,
"step": 1390
},
{
"epoch": 1.7128347183748844,
"grad_norm": 3.625,
"learning_rate": 2.4303835661310066e-09,
"loss": 1.3395249843597412,
"step": 1392
},
{
"epoch": 1.7152970144659896,
"grad_norm": 13.125,
"learning_rate": 2.4232205259971584e-09,
"loss": 1.0826705694198608,
"step": 1394
},
{
"epoch": 1.7177593105570945,
"grad_norm": 12.875,
"learning_rate": 2.4161142648895533e-09,
"loss": 1.810969352722168,
"step": 1396
},
{
"epoch": 1.7202216066481995,
"grad_norm": 9.0,
"learning_rate": 2.4090648956154223e-09,
"loss": 2.039994239807129,
"step": 1398
},
{
"epoch": 1.7226839027393044,
"grad_norm": 7.625,
"learning_rate": 2.402072530078876e-09,
"loss": 1.8878741264343262,
"step": 1400
},
{
"epoch": 1.7251461988304093,
"grad_norm": 4.5625,
"learning_rate": 2.395137279279127e-09,
"loss": 1.8724961280822754,
"step": 1402
},
{
"epoch": 1.7276084949215145,
"grad_norm": 4.0,
"learning_rate": 2.3882592533087286e-09,
"loss": 1.9301607608795166,
"step": 1404
},
{
"epoch": 1.7300707910126194,
"grad_norm": 24.125,
"learning_rate": 2.3814385613518284e-09,
"loss": 1.6868252754211426,
"step": 1406
},
{
"epoch": 1.7325330871037243,
"grad_norm": 6.78125,
"learning_rate": 2.374675311682433e-09,
"loss": 1.7913291454315186,
"step": 1408
},
{
"epoch": 1.7349953831948293,
"grad_norm": 2.59375,
"learning_rate": 2.3679696116626936e-09,
"loss": 1.5577332973480225,
"step": 1410
},
{
"epoch": 1.7374576792859342,
"grad_norm": 4.875,
"learning_rate": 2.3613215677411944e-09,
"loss": 1.5362656116485596,
"step": 1412
},
{
"epoch": 1.7399199753770391,
"grad_norm": 1.75,
"learning_rate": 2.354731285451268e-09,
"loss": 1.5279173851013184,
"step": 1414
},
{
"epoch": 1.742382271468144,
"grad_norm": 10.6875,
"learning_rate": 2.348198869409322e-09,
"loss": 1.696439504623413,
"step": 1416
},
{
"epoch": 1.744844567559249,
"grad_norm": 18.5,
"learning_rate": 2.341724423313171e-09,
"loss": 2.554849147796631,
"step": 1418
},
{
"epoch": 1.747306863650354,
"grad_norm": 13.0625,
"learning_rate": 2.335308049940398e-09,
"loss": 2.1925854682922363,
"step": 1420
},
{
"epoch": 1.7497691597414589,
"grad_norm": 3.46875,
"learning_rate": 2.328949851146718e-09,
"loss": 1.593017816543579,
"step": 1422
},
{
"epoch": 1.7522314558325638,
"grad_norm": 4.0,
"learning_rate": 2.322649927864363e-09,
"loss": 1.229564905166626,
"step": 1424
},
{
"epoch": 1.7546937519236687,
"grad_norm": 15.6875,
"learning_rate": 2.3164083801004798e-09,
"loss": 1.9423973560333252,
"step": 1426
},
{
"epoch": 1.7571560480147737,
"grad_norm": 5.75,
"learning_rate": 2.3102253069355413e-09,
"loss": 2.0594370365142822,
"step": 1428
},
{
"epoch": 1.7596183441058786,
"grad_norm": 6.53125,
"learning_rate": 2.3041008065217754e-09,
"loss": 1.9393881559371948,
"step": 1430
},
{
"epoch": 1.7620806401969837,
"grad_norm": 7.90625,
"learning_rate": 2.298034976081607e-09,
"loss": 1.8895037174224854,
"step": 1432
},
{
"epoch": 1.7645429362880887,
"grad_norm": 8.125,
"learning_rate": 2.292027911906112e-09,
"loss": 1.7276127338409424,
"step": 1434
},
{
"epoch": 1.7670052323791936,
"grad_norm": 6.125,
"learning_rate": 2.286079709353491e-09,
"loss": 1.5182913541793823,
"step": 1436
},
{
"epoch": 1.7694675284702985,
"grad_norm": 8.6875,
"learning_rate": 2.2801904628475545e-09,
"loss": 1.845018982887268,
"step": 1438
},
{
"epoch": 1.7719298245614035,
"grad_norm": 13.0625,
"learning_rate": 2.274360265876225e-09,
"loss": 2.4570071697235107,
"step": 1440
},
{
"epoch": 1.7743921206525086,
"grad_norm": 6.53125,
"learning_rate": 2.268589210990052e-09,
"loss": 1.779624342918396,
"step": 1442
},
{
"epoch": 1.7768544167436136,
"grad_norm": 11.0,
"learning_rate": 2.262877389800745e-09,
"loss": 1.5919256210327148,
"step": 1444
},
{
"epoch": 1.7793167128347185,
"grad_norm": 5.96875,
"learning_rate": 2.257224892979714e-09,
"loss": 2.230924129486084,
"step": 1446
},
{
"epoch": 1.7817790089258234,
"grad_norm": 2.296875,
"learning_rate": 2.2516318102566373e-09,
"loss": 1.6709070205688477,
"step": 1448
},
{
"epoch": 1.7842413050169283,
"grad_norm": 7.1875,
"learning_rate": 2.24609823041803e-09,
"loss": 1.5729997158050537,
"step": 1450
},
{
"epoch": 1.7867036011080333,
"grad_norm": 23.125,
"learning_rate": 2.240624241305841e-09,
"loss": 2.22371768951416,
"step": 1452
},
{
"epoch": 1.7891658971991382,
"grad_norm": 9.0,
"learning_rate": 2.2352099298160545e-09,
"loss": 1.9387813806533813,
"step": 1454
},
{
"epoch": 1.7916281932902431,
"grad_norm": 6.96875,
"learning_rate": 2.2298553818973096e-09,
"loss": 1.6565120220184326,
"step": 1456
},
{
"epoch": 1.794090489381348,
"grad_norm": 24.0,
"learning_rate": 2.2245606825495408e-09,
"loss": 1.6322071552276611,
"step": 1458
},
{
"epoch": 1.796552785472453,
"grad_norm": 6.0625,
"learning_rate": 2.219325915822624e-09,
"loss": 2.004333257675171,
"step": 1460
},
{
"epoch": 1.799015081563558,
"grad_norm": 11.625,
"learning_rate": 2.214151164815044e-09,
"loss": 2.2140424251556396,
"step": 1462
},
{
"epoch": 1.8014773776546629,
"grad_norm": 5.90625,
"learning_rate": 2.2090365116725787e-09,
"loss": 1.876783847808838,
"step": 1464
},
{
"epoch": 1.8039396737457678,
"grad_norm": 2.921875,
"learning_rate": 2.203982037586988e-09,
"loss": 1.5903770923614502,
"step": 1466
},
{
"epoch": 1.8064019698368727,
"grad_norm": 5.78125,
"learning_rate": 2.1989878227947297e-09,
"loss": 1.4093436002731323,
"step": 1468
},
{
"epoch": 1.8088642659279779,
"grad_norm": 5.4375,
"learning_rate": 2.1940539465756848e-09,
"loss": 1.5252522230148315,
"step": 1470
},
{
"epoch": 1.8113265620190828,
"grad_norm": 11.1875,
"learning_rate": 2.1891804872519013e-09,
"loss": 1.6333411931991577,
"step": 1472
},
{
"epoch": 1.8137888581101878,
"grad_norm": 12.125,
"learning_rate": 2.1843675221863456e-09,
"loss": 2.395686626434326,
"step": 1474
},
{
"epoch": 1.8162511542012927,
"grad_norm": 6.40625,
"learning_rate": 2.179615127781678e-09,
"loss": 2.011446475982666,
"step": 1476
},
{
"epoch": 1.8187134502923976,
"grad_norm": 27.75,
"learning_rate": 2.1749233794790424e-09,
"loss": 1.9201209545135498,
"step": 1478
},
{
"epoch": 1.8211757463835028,
"grad_norm": 8.75,
"learning_rate": 2.1702923517568608e-09,
"loss": 1.9654639959335327,
"step": 1480
},
{
"epoch": 1.8236380424746077,
"grad_norm": 14.4375,
"learning_rate": 2.1657221181296596e-09,
"loss": 2.4255740642547607,
"step": 1482
},
{
"epoch": 1.8261003385657126,
"grad_norm": 5.46875,
"learning_rate": 2.161212751146898e-09,
"loss": 2.1441259384155273,
"step": 1484
},
{
"epoch": 1.8285626346568176,
"grad_norm": 3.03125,
"learning_rate": 2.1567643223918164e-09,
"loss": 1.5081210136413574,
"step": 1486
},
{
"epoch": 1.8310249307479225,
"grad_norm": 3.15625,
"learning_rate": 2.1523769024803013e-09,
"loss": 1.219706416130066,
"step": 1488
},
{
"epoch": 1.8334872268390274,
"grad_norm": 3.296875,
"learning_rate": 2.148050561059763e-09,
"loss": 1.3154406547546387,
"step": 1490
},
{
"epoch": 1.8359495229301324,
"grad_norm": 4.84375,
"learning_rate": 2.1437853668080316e-09,
"loss": 1.663912057876587,
"step": 1492
},
{
"epoch": 1.8384118190212373,
"grad_norm": 5.5,
"learning_rate": 2.139581387432267e-09,
"loss": 1.9996685981750488,
"step": 1494
},
{
"epoch": 1.8408741151123422,
"grad_norm": 9.125,
"learning_rate": 2.135438689667882e-09,
"loss": 2.1527910232543945,
"step": 1496
},
{
"epoch": 1.8433364112034472,
"grad_norm": 5.4375,
"learning_rate": 2.1313573392774835e-09,
"loss": 2.181238889694214,
"step": 1498
},
{
"epoch": 1.845798707294552,
"grad_norm": 26.625,
"learning_rate": 2.1273374010498306e-09,
"loss": 2.07470965385437,
"step": 1500
},
{
"epoch": 1.848261003385657,
"grad_norm": 7.375,
"learning_rate": 2.123378938798803e-09,
"loss": 2.180095672607422,
"step": 1502
},
{
"epoch": 1.850723299476762,
"grad_norm": 10.25,
"learning_rate": 2.119482015362392e-09,
"loss": 2.023428440093994,
"step": 1504
},
{
"epoch": 1.8531855955678669,
"grad_norm": 6.03125,
"learning_rate": 2.1156466926016974e-09,
"loss": 1.9310382604599,
"step": 1506
},
{
"epoch": 1.855647891658972,
"grad_norm": 10.9375,
"learning_rate": 2.1118730313999516e-09,
"loss": 1.7410407066345215,
"step": 1508
},
{
"epoch": 1.858110187750077,
"grad_norm": 14.9375,
"learning_rate": 2.108161091661548e-09,
"loss": 2.463320732116699,
"step": 1510
},
{
"epoch": 1.860572483841182,
"grad_norm": 10.1875,
"learning_rate": 2.1045109323110943e-09,
"loss": 2.164478302001953,
"step": 1512
},
{
"epoch": 1.8630347799322868,
"grad_norm": 11.0,
"learning_rate": 2.1009226112924727e-09,
"loss": 2.304097890853882,
"step": 1514
},
{
"epoch": 1.8654970760233918,
"grad_norm": 11.4375,
"learning_rate": 2.097396185567926e-09,
"loss": 2.384671688079834,
"step": 1516
},
{
"epoch": 1.867959372114497,
"grad_norm": 11.875,
"learning_rate": 2.0939317111171467e-09,
"loss": 1.752406358718872,
"step": 1518
},
{
"epoch": 1.8704216682056019,
"grad_norm": 19.875,
"learning_rate": 2.090529242936392e-09,
"loss": 1.5490081310272217,
"step": 1520
},
{
"epoch": 1.8728839642967068,
"grad_norm": 5.90625,
"learning_rate": 2.087188835037611e-09,
"loss": 2.0984854698181152,
"step": 1522
},
{
"epoch": 1.8753462603878117,
"grad_norm": 2.890625,
"learning_rate": 2.0839105404475866e-09,
"loss": 1.6633992195129395,
"step": 1524
},
{
"epoch": 1.8778085564789166,
"grad_norm": 3.6875,
"learning_rate": 2.080694411207094e-09,
"loss": 1.4255918264389038,
"step": 1526
},
{
"epoch": 1.8802708525700216,
"grad_norm": 4.84375,
"learning_rate": 2.0775404983700724e-09,
"loss": 1.845369577407837,
"step": 1528
},
{
"epoch": 1.8827331486611265,
"grad_norm": 4.40625,
"learning_rate": 2.074448852002819e-09,
"loss": 1.7371915578842163,
"step": 1530
},
{
"epoch": 1.8851954447522314,
"grad_norm": 13.3125,
"learning_rate": 2.07141952118319e-09,
"loss": 1.805029034614563,
"step": 1532
},
{
"epoch": 1.8876577408433364,
"grad_norm": 6.65625,
"learning_rate": 2.068452553999822e-09,
"loss": 2.060267448425293,
"step": 1534
},
{
"epoch": 1.8901200369344413,
"grad_norm": 3.625,
"learning_rate": 2.065547997551375e-09,
"loss": 1.525952935218811,
"step": 1536
},
{
"epoch": 1.8925823330255462,
"grad_norm": 7.46875,
"learning_rate": 2.062705897945773e-09,
"loss": 1.4751570224761963,
"step": 1538
},
{
"epoch": 1.8950446291166512,
"grad_norm": 5.0625,
"learning_rate": 2.059926300299483e-09,
"loss": 1.6626102924346924,
"step": 1540
},
{
"epoch": 1.897506925207756,
"grad_norm": 5.65625,
"learning_rate": 2.057209248736792e-09,
"loss": 1.2773092985153198,
"step": 1542
},
{
"epoch": 1.899969221298861,
"grad_norm": 13.0625,
"learning_rate": 2.054554786389111e-09,
"loss": 1.6589457988739014,
"step": 1544
},
{
"epoch": 1.9024315173899662,
"grad_norm": 6.25,
"learning_rate": 2.051962955394286e-09,
"loss": 1.9413405656814575,
"step": 1546
},
{
"epoch": 1.9048938134810711,
"grad_norm": 10.25,
"learning_rate": 2.0494337968959344e-09,
"loss": 1.6395326852798462,
"step": 1548
},
{
"epoch": 1.907356109572176,
"grad_norm": 5.21875,
"learning_rate": 2.0469673510427865e-09,
"loss": 1.9667985439300537,
"step": 1550
},
{
"epoch": 1.909818405663281,
"grad_norm": 4.90625,
"learning_rate": 2.0445636569880505e-09,
"loss": 1.8468351364135742,
"step": 1552
},
{
"epoch": 1.912280701754386,
"grad_norm": 11.25,
"learning_rate": 2.0422227528887923e-09,
"loss": 2.118504524230957,
"step": 1554
},
{
"epoch": 1.914742997845491,
"grad_norm": 10.375,
"learning_rate": 2.0399446759053274e-09,
"loss": 2.0504517555236816,
"step": 1556
},
{
"epoch": 1.917205293936596,
"grad_norm": 5.25,
"learning_rate": 2.037729462200633e-09,
"loss": 1.661136507987976,
"step": 1558
},
{
"epoch": 1.919667590027701,
"grad_norm": 6.03125,
"learning_rate": 2.0355771469397726e-09,
"loss": 1.5671418905258179,
"step": 1560
},
{
"epoch": 1.9221298861188059,
"grad_norm": 5.34375,
"learning_rate": 2.0334877642893373e-09,
"loss": 2.0463449954986572,
"step": 1562
},
{
"epoch": 1.9245921822099108,
"grad_norm": 3.96875,
"learning_rate": 2.0314613474169064e-09,
"loss": 1.7543866634368896,
"step": 1564
},
{
"epoch": 1.9270544783010157,
"grad_norm": 23.375,
"learning_rate": 2.029497928490516e-09,
"loss": 1.5825181007385254,
"step": 1566
},
{
"epoch": 1.9295167743921207,
"grad_norm": 8.6875,
"learning_rate": 2.027597538678154e-09,
"loss": 1.5585989952087402,
"step": 1568
},
{
"epoch": 1.9319790704832256,
"grad_norm": 10.0625,
"learning_rate": 2.0257602081472603e-09,
"loss": 1.5373648405075073,
"step": 1570
},
{
"epoch": 1.9344413665743305,
"grad_norm": 3.296875,
"learning_rate": 2.023985966064252e-09,
"loss": 1.638904333114624,
"step": 1572
},
{
"epoch": 1.9369036626654355,
"grad_norm": 2.71875,
"learning_rate": 2.0222748405940567e-09,
"loss": 1.3301455974578857,
"step": 1574
},
{
"epoch": 1.9393659587565404,
"grad_norm": 2.734375,
"learning_rate": 2.0206268588996686e-09,
"loss": 1.1727893352508545,
"step": 1576
},
{
"epoch": 1.9418282548476453,
"grad_norm": 4.46875,
"learning_rate": 2.019042047141714e-09,
"loss": 1.2285372018814087,
"step": 1578
},
{
"epoch": 1.9442905509387503,
"grad_norm": 5.0625,
"learning_rate": 2.0175204304780413e-09,
"loss": 1.5906985998153687,
"step": 1580
},
{
"epoch": 1.9467528470298552,
"grad_norm": 18.875,
"learning_rate": 2.016062033063314e-09,
"loss": 1.8927161693572998,
"step": 1582
},
{
"epoch": 1.9492151431209603,
"grad_norm": 11.4375,
"learning_rate": 2.0146668780486356e-09,
"loss": 2.0817370414733887,
"step": 1584
},
{
"epoch": 1.9516774392120653,
"grad_norm": 8.4375,
"learning_rate": 2.0133349875811752e-09,
"loss": 2.1541638374328613,
"step": 1586
},
{
"epoch": 1.9541397353031702,
"grad_norm": 6.03125,
"learning_rate": 2.0120663828038197e-09,
"loss": 2.136171340942383,
"step": 1588
},
{
"epoch": 1.9566020313942751,
"grad_norm": 8.8125,
"learning_rate": 2.010861083854838e-09,
"loss": 2.047274112701416,
"step": 1590
},
{
"epoch": 1.95906432748538,
"grad_norm": 5.4375,
"learning_rate": 2.009719109867558e-09,
"loss": 2.093939781188965,
"step": 1592
},
{
"epoch": 1.9615266235764852,
"grad_norm": 8.0625,
"learning_rate": 2.0086404789700686e-09,
"loss": 1.9545447826385498,
"step": 1594
},
{
"epoch": 1.9639889196675901,
"grad_norm": 4.03125,
"learning_rate": 2.0076252082849266e-09,
"loss": 1.710350751876831,
"step": 1596
},
{
"epoch": 1.966451215758695,
"grad_norm": 9.8125,
"learning_rate": 2.006673313928888e-09,
"loss": 1.6602602005004883,
"step": 1598
},
{
"epoch": 1.9689135118498,
"grad_norm": 6.96875,
"learning_rate": 2.0057848110126513e-09,
"loss": 2.073413848876953,
"step": 1600
},
{
"epoch": 1.971375807940905,
"grad_norm": 18.75,
"learning_rate": 2.0049597136406157e-09,
"loss": 2.155198574066162,
"step": 1602
},
{
"epoch": 1.9738381040320099,
"grad_norm": 7.4375,
"learning_rate": 2.004198034910662e-09,
"loss": 2.1142520904541016,
"step": 1604
},
{
"epoch": 1.9763004001231148,
"grad_norm": 2.6875,
"learning_rate": 2.003499786913938e-09,
"loss": 1.6299633979797363,
"step": 1606
},
{
"epoch": 1.9787626962142197,
"grad_norm": 11.3125,
"learning_rate": 2.0028649807346742e-09,
"loss": 1.5626764297485352,
"step": 1608
},
{
"epoch": 1.9812249923053247,
"grad_norm": 16.875,
"learning_rate": 2.0022936264500017e-09,
"loss": 2.2909412384033203,
"step": 1610
},
{
"epoch": 1.9836872883964296,
"grad_norm": 11.25,
"learning_rate": 2.0017857331297935e-09,
"loss": 2.1796622276306152,
"step": 1612
},
{
"epoch": 1.9861495844875345,
"grad_norm": 5.375,
"learning_rate": 2.001341308836524e-09,
"loss": 1.9472308158874512,
"step": 1614
},
{
"epoch": 1.9886118805786395,
"grad_norm": 8.5625,
"learning_rate": 2.000960360625136e-09,
"loss": 1.743130087852478,
"step": 1616
},
{
"epoch": 1.9910741766697444,
"grad_norm": 10.1875,
"learning_rate": 2.0006428945429335e-09,
"loss": 1.43598210811615,
"step": 1618
},
{
"epoch": 1.9935364727608493,
"grad_norm": 12.8125,
"learning_rate": 2.0003889156294813e-09,
"loss": 1.9119551181793213,
"step": 1620
},
{
"epoch": 1.9959987688519545,
"grad_norm": 5.71875,
"learning_rate": 2.0001984279165285e-09,
"loss": 2.036318302154541,
"step": 1622
},
{
"epoch": 1.9984610649430594,
"grad_norm": 5.28125,
"learning_rate": 2.0000714344279417e-09,
"loss": 1.577465295791626,
"step": 1624
},
{
"epoch": 2.0,
"grad_norm": 3.578125,
"learning_rate": 2.00000793717966e-09,
"loss": 1.1681241989135742,
"step": 1626
},
{
"epoch": 2.0,
"step": 1626,
"total_flos": 2.5753569883429274e+18,
"train_loss": 1.8335715001506265,
"train_runtime": 15477.0683,
"train_samples_per_second": 1.679,
"train_steps_per_second": 0.105
}
],
"logging_steps": 2,
"max_steps": 1626,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 9999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5753569883429274e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}