atsuki-yamaguchi's picture
Upload folder using huggingface_hub
0e8b670 verified
raw
history blame
73.1 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1257,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002386634844868735,
"grad_norm": 129.18601989746094,
"learning_rate": 2.3809523809523808e-06,
"loss": 14.4042,
"step": 3
},
{
"epoch": 0.00477326968973747,
"grad_norm": 55.742095947265625,
"learning_rate": 4.7619047619047615e-06,
"loss": 12.0437,
"step": 6
},
{
"epoch": 0.007159904534606206,
"grad_norm": 39.26912307739258,
"learning_rate": 7.142857142857143e-06,
"loss": 8.1864,
"step": 9
},
{
"epoch": 0.00954653937947494,
"grad_norm": 10.621451377868652,
"learning_rate": 9.523809523809523e-06,
"loss": 4.9227,
"step": 12
},
{
"epoch": 0.011933174224343675,
"grad_norm": 10.118457794189453,
"learning_rate": 1.1904761904761905e-05,
"loss": 4.8046,
"step": 15
},
{
"epoch": 0.014319809069212411,
"grad_norm": 9.227359771728516,
"learning_rate": 1.4285714285714285e-05,
"loss": 4.813,
"step": 18
},
{
"epoch": 0.016706443914081145,
"grad_norm": 7.128821849822998,
"learning_rate": 1.6666666666666667e-05,
"loss": 4.551,
"step": 21
},
{
"epoch": 0.01909307875894988,
"grad_norm": 9.488492012023926,
"learning_rate": 1.9047619047619046e-05,
"loss": 4.484,
"step": 24
},
{
"epoch": 0.021479713603818614,
"grad_norm": 9.141919136047363,
"learning_rate": 2.1428571428571428e-05,
"loss": 4.5506,
"step": 27
},
{
"epoch": 0.02386634844868735,
"grad_norm": 6.18914270401001,
"learning_rate": 2.380952380952381e-05,
"loss": 4.5217,
"step": 30
},
{
"epoch": 0.026252983293556086,
"grad_norm": 7.893601894378662,
"learning_rate": 2.6190476190476192e-05,
"loss": 4.2932,
"step": 33
},
{
"epoch": 0.028639618138424822,
"grad_norm": 5.094172954559326,
"learning_rate": 2.857142857142857e-05,
"loss": 4.2497,
"step": 36
},
{
"epoch": 0.031026252983293555,
"grad_norm": 5.149319171905518,
"learning_rate": 3.095238095238095e-05,
"loss": 4.3677,
"step": 39
},
{
"epoch": 0.03341288782816229,
"grad_norm": 4.346240520477295,
"learning_rate": 3.3333333333333335e-05,
"loss": 4.1722,
"step": 42
},
{
"epoch": 0.03579952267303103,
"grad_norm": 3.216545581817627,
"learning_rate": 3.571428571428572e-05,
"loss": 4.1916,
"step": 45
},
{
"epoch": 0.03818615751789976,
"grad_norm": 3.108903408050537,
"learning_rate": 3.809523809523809e-05,
"loss": 4.0829,
"step": 48
},
{
"epoch": 0.0405727923627685,
"grad_norm": 3.1567437648773193,
"learning_rate": 4.047619047619048e-05,
"loss": 4.0211,
"step": 51
},
{
"epoch": 0.04295942720763723,
"grad_norm": 12.768291473388672,
"learning_rate": 4.2857142857142856e-05,
"loss": 4.1494,
"step": 54
},
{
"epoch": 0.045346062052505964,
"grad_norm": 3.211005926132202,
"learning_rate": 4.523809523809524e-05,
"loss": 4.1356,
"step": 57
},
{
"epoch": 0.0477326968973747,
"grad_norm": 5.850863456726074,
"learning_rate": 4.761904761904762e-05,
"loss": 4.2211,
"step": 60
},
{
"epoch": 0.050119331742243436,
"grad_norm": 4.25229024887085,
"learning_rate": 5e-05,
"loss": 4.1235,
"step": 63
},
{
"epoch": 0.05250596658711217,
"grad_norm": 2.7204909324645996,
"learning_rate": 5.2380952380952384e-05,
"loss": 3.9658,
"step": 66
},
{
"epoch": 0.05489260143198091,
"grad_norm": 3.8032662868499756,
"learning_rate": 5.4761904761904766e-05,
"loss": 3.8705,
"step": 69
},
{
"epoch": 0.057279236276849645,
"grad_norm": 3.7930989265441895,
"learning_rate": 5.714285714285714e-05,
"loss": 4.0484,
"step": 72
},
{
"epoch": 0.059665871121718374,
"grad_norm": 3.6324238777160645,
"learning_rate": 5.9523809523809524e-05,
"loss": 3.9811,
"step": 75
},
{
"epoch": 0.06205250596658711,
"grad_norm": 3.116438388824463,
"learning_rate": 6.19047619047619e-05,
"loss": 3.9441,
"step": 78
},
{
"epoch": 0.06443914081145585,
"grad_norm": 3.774524688720703,
"learning_rate": 6.428571428571429e-05,
"loss": 3.9418,
"step": 81
},
{
"epoch": 0.06682577565632458,
"grad_norm": 4.456262588500977,
"learning_rate": 6.666666666666667e-05,
"loss": 4.0214,
"step": 84
},
{
"epoch": 0.06921241050119331,
"grad_norm": 4.311483860015869,
"learning_rate": 6.904761904761905e-05,
"loss": 3.9148,
"step": 87
},
{
"epoch": 0.07159904534606205,
"grad_norm": 3.6151814460754395,
"learning_rate": 7.142857142857143e-05,
"loss": 3.8497,
"step": 90
},
{
"epoch": 0.07398568019093078,
"grad_norm": 3.262566328048706,
"learning_rate": 7.380952380952382e-05,
"loss": 3.8559,
"step": 93
},
{
"epoch": 0.07637231503579953,
"grad_norm": 4.0872907638549805,
"learning_rate": 7.619047619047618e-05,
"loss": 3.7604,
"step": 96
},
{
"epoch": 0.07875894988066826,
"grad_norm": 4.191164493560791,
"learning_rate": 7.857142857142858e-05,
"loss": 3.6232,
"step": 99
},
{
"epoch": 0.081145584725537,
"grad_norm": 3.834773540496826,
"learning_rate": 8.095238095238096e-05,
"loss": 3.5658,
"step": 102
},
{
"epoch": 0.08353221957040573,
"grad_norm": 4.667291164398193,
"learning_rate": 8.333333333333334e-05,
"loss": 3.6749,
"step": 105
},
{
"epoch": 0.08591885441527446,
"grad_norm": 3.7559292316436768,
"learning_rate": 8.571428571428571e-05,
"loss": 3.5708,
"step": 108
},
{
"epoch": 0.0883054892601432,
"grad_norm": 4.605666160583496,
"learning_rate": 8.80952380952381e-05,
"loss": 3.5868,
"step": 111
},
{
"epoch": 0.09069212410501193,
"grad_norm": 3.557979106903076,
"learning_rate": 9.047619047619048e-05,
"loss": 3.5797,
"step": 114
},
{
"epoch": 0.09307875894988067,
"grad_norm": 3.252253770828247,
"learning_rate": 9.285714285714286e-05,
"loss": 3.4353,
"step": 117
},
{
"epoch": 0.0954653937947494,
"grad_norm": 7.634258270263672,
"learning_rate": 9.523809523809524e-05,
"loss": 3.3768,
"step": 120
},
{
"epoch": 0.09785202863961814,
"grad_norm": 4.522894859313965,
"learning_rate": 9.761904761904762e-05,
"loss": 3.4649,
"step": 123
},
{
"epoch": 0.10023866348448687,
"grad_norm": 3.660895824432373,
"learning_rate": 0.0001,
"loss": 3.3314,
"step": 126
},
{
"epoch": 0.1026252983293556,
"grad_norm": 3.2496635913848877,
"learning_rate": 9.999961058466053e-05,
"loss": 3.3748,
"step": 129
},
{
"epoch": 0.10501193317422435,
"grad_norm": 3.1348578929901123,
"learning_rate": 9.999844234470782e-05,
"loss": 3.1998,
"step": 132
},
{
"epoch": 0.10739856801909307,
"grad_norm": 4.37357759475708,
"learning_rate": 9.999649529833915e-05,
"loss": 3.2159,
"step": 135
},
{
"epoch": 0.10978520286396182,
"grad_norm": 3.638875961303711,
"learning_rate": 9.999376947588288e-05,
"loss": 3.2499,
"step": 138
},
{
"epoch": 0.11217183770883055,
"grad_norm": 4.375198841094971,
"learning_rate": 9.999026491979808e-05,
"loss": 3.1967,
"step": 141
},
{
"epoch": 0.11455847255369929,
"grad_norm": 3.1584630012512207,
"learning_rate": 9.99859816846739e-05,
"loss": 3.2014,
"step": 144
},
{
"epoch": 0.11694510739856802,
"grad_norm": 3.404482126235962,
"learning_rate": 9.998091983722863e-05,
"loss": 3.1216,
"step": 147
},
{
"epoch": 0.11933174224343675,
"grad_norm": 3.8131189346313477,
"learning_rate": 9.99750794563087e-05,
"loss": 3.0097,
"step": 150
},
{
"epoch": 0.12171837708830549,
"grad_norm": 3.831827163696289,
"learning_rate": 9.996846063288747e-05,
"loss": 3.1055,
"step": 153
},
{
"epoch": 0.12410501193317422,
"grad_norm": 3.281343698501587,
"learning_rate": 9.996106347006379e-05,
"loss": 3.1001,
"step": 156
},
{
"epoch": 0.12649164677804295,
"grad_norm": 3.8899145126342773,
"learning_rate": 9.99528880830604e-05,
"loss": 3.0959,
"step": 159
},
{
"epoch": 0.1288782816229117,
"grad_norm": 2.8943607807159424,
"learning_rate": 9.994393459922218e-05,
"loss": 3.0305,
"step": 162
},
{
"epoch": 0.13126491646778043,
"grad_norm": 3.5095627307891846,
"learning_rate": 9.993420315801406e-05,
"loss": 2.9394,
"step": 165
},
{
"epoch": 0.13365155131264916,
"grad_norm": 3.2103517055511475,
"learning_rate": 9.992369391101895e-05,
"loss": 2.999,
"step": 168
},
{
"epoch": 0.1360381861575179,
"grad_norm": 2.3885068893432617,
"learning_rate": 9.991240702193532e-05,
"loss": 3.1839,
"step": 171
},
{
"epoch": 0.13842482100238662,
"grad_norm": 3.747015953063965,
"learning_rate": 9.990034266657467e-05,
"loss": 2.9744,
"step": 174
},
{
"epoch": 0.14081145584725538,
"grad_norm": 3.1654574871063232,
"learning_rate": 9.988750103285883e-05,
"loss": 2.9318,
"step": 177
},
{
"epoch": 0.1431980906921241,
"grad_norm": 3.403306007385254,
"learning_rate": 9.987388232081694e-05,
"loss": 2.9445,
"step": 180
},
{
"epoch": 0.14558472553699284,
"grad_norm": 3.077636480331421,
"learning_rate": 9.985948674258243e-05,
"loss": 2.9306,
"step": 183
},
{
"epoch": 0.14797136038186157,
"grad_norm": 3.725022554397583,
"learning_rate": 9.984431452238967e-05,
"loss": 2.8871,
"step": 186
},
{
"epoch": 0.15035799522673032,
"grad_norm": 4.53774356842041,
"learning_rate": 9.982836589657043e-05,
"loss": 2.9007,
"step": 189
},
{
"epoch": 0.15274463007159905,
"grad_norm": 2.8278591632843018,
"learning_rate": 9.981164111355035e-05,
"loss": 2.9678,
"step": 192
},
{
"epoch": 0.15513126491646778,
"grad_norm": 3.511444568634033,
"learning_rate": 9.979414043384485e-05,
"loss": 2.9657,
"step": 195
},
{
"epoch": 0.1575178997613365,
"grad_norm": 2.6156179904937744,
"learning_rate": 9.977586413005531e-05,
"loss": 2.9317,
"step": 198
},
{
"epoch": 0.15990453460620524,
"grad_norm": 3.9150912761688232,
"learning_rate": 9.975681248686461e-05,
"loss": 2.9199,
"step": 201
},
{
"epoch": 0.162291169451074,
"grad_norm": 3.9258997440338135,
"learning_rate": 9.973698580103285e-05,
"loss": 2.82,
"step": 204
},
{
"epoch": 0.16467780429594273,
"grad_norm": 2.9867348670959473,
"learning_rate": 9.971638438139266e-05,
"loss": 2.8516,
"step": 207
},
{
"epoch": 0.16706443914081145,
"grad_norm": 3.0659570693969727,
"learning_rate": 9.96950085488444e-05,
"loss": 2.6836,
"step": 210
},
{
"epoch": 0.16945107398568018,
"grad_norm": 2.818599224090576,
"learning_rate": 9.967285863635112e-05,
"loss": 2.9212,
"step": 213
},
{
"epoch": 0.1718377088305489,
"grad_norm": 2.9934067726135254,
"learning_rate": 9.964993498893349e-05,
"loss": 2.7864,
"step": 216
},
{
"epoch": 0.17422434367541767,
"grad_norm": 2.358505964279175,
"learning_rate": 9.962623796366429e-05,
"loss": 2.7296,
"step": 219
},
{
"epoch": 0.1766109785202864,
"grad_norm": 2.630748748779297,
"learning_rate": 9.960176792966289e-05,
"loss": 2.74,
"step": 222
},
{
"epoch": 0.17899761336515513,
"grad_norm": 2.7040629386901855,
"learning_rate": 9.95765252680896e-05,
"loss": 2.8416,
"step": 225
},
{
"epoch": 0.18138424821002386,
"grad_norm": 2.9643092155456543,
"learning_rate": 9.95505103721396e-05,
"loss": 2.7412,
"step": 228
},
{
"epoch": 0.18377088305489261,
"grad_norm": 3.3539974689483643,
"learning_rate": 9.952372364703687e-05,
"loss": 2.7819,
"step": 231
},
{
"epoch": 0.18615751789976134,
"grad_norm": 3.3523247241973877,
"learning_rate": 9.949616551002787e-05,
"loss": 2.7698,
"step": 234
},
{
"epoch": 0.18854415274463007,
"grad_norm": 2.5793633460998535,
"learning_rate": 9.946783639037504e-05,
"loss": 2.7085,
"step": 237
},
{
"epoch": 0.1909307875894988,
"grad_norm": 3.2276599407196045,
"learning_rate": 9.943873672935014e-05,
"loss": 2.7572,
"step": 240
},
{
"epoch": 0.19331742243436753,
"grad_norm": 3.1655075550079346,
"learning_rate": 9.940886698022734e-05,
"loss": 2.6529,
"step": 243
},
{
"epoch": 0.1957040572792363,
"grad_norm": 3.296135663986206,
"learning_rate": 9.93782276082762e-05,
"loss": 2.7917,
"step": 246
},
{
"epoch": 0.19809069212410502,
"grad_norm": 2.3760223388671875,
"learning_rate": 9.934681909075434e-05,
"loss": 2.6912,
"step": 249
},
{
"epoch": 0.20047732696897375,
"grad_norm": 2.8032619953155518,
"learning_rate": 9.931464191690015e-05,
"loss": 2.8242,
"step": 252
},
{
"epoch": 0.20286396181384247,
"grad_norm": 2.846773386001587,
"learning_rate": 9.928169658792498e-05,
"loss": 2.6749,
"step": 255
},
{
"epoch": 0.2052505966587112,
"grad_norm": 2.901726245880127,
"learning_rate": 9.924798361700553e-05,
"loss": 2.6449,
"step": 258
},
{
"epoch": 0.20763723150357996,
"grad_norm": 3.291959524154663,
"learning_rate": 9.92135035292757e-05,
"loss": 2.6156,
"step": 261
},
{
"epoch": 0.2100238663484487,
"grad_norm": 2.4123713970184326,
"learning_rate": 9.91782568618185e-05,
"loss": 2.7662,
"step": 264
},
{
"epoch": 0.21241050119331742,
"grad_norm": 2.883798837661743,
"learning_rate": 9.914224416365764e-05,
"loss": 2.7688,
"step": 267
},
{
"epoch": 0.21479713603818615,
"grad_norm": 2.7051072120666504,
"learning_rate": 9.910546599574902e-05,
"loss": 2.8533,
"step": 270
},
{
"epoch": 0.2171837708830549,
"grad_norm": 2.8238630294799805,
"learning_rate": 9.906792293097194e-05,
"loss": 2.6326,
"step": 273
},
{
"epoch": 0.21957040572792363,
"grad_norm": 2.51338529586792,
"learning_rate": 9.90296155541202e-05,
"loss": 2.7023,
"step": 276
},
{
"epoch": 0.22195704057279236,
"grad_norm": 2.7575676441192627,
"learning_rate": 9.899054446189304e-05,
"loss": 2.648,
"step": 279
},
{
"epoch": 0.2243436754176611,
"grad_norm": 2.750290870666504,
"learning_rate": 9.895071026288574e-05,
"loss": 2.6133,
"step": 282
},
{
"epoch": 0.22673031026252982,
"grad_norm": 2.5514075756073,
"learning_rate": 9.891011357758022e-05,
"loss": 2.569,
"step": 285
},
{
"epoch": 0.22911694510739858,
"grad_norm": 2.7157366275787354,
"learning_rate": 9.886875503833536e-05,
"loss": 2.693,
"step": 288
},
{
"epoch": 0.2315035799522673,
"grad_norm": 3.114830255508423,
"learning_rate": 9.882663528937717e-05,
"loss": 2.596,
"step": 291
},
{
"epoch": 0.23389021479713604,
"grad_norm": 2.7666478157043457,
"learning_rate": 9.87837549867887e-05,
"loss": 2.5166,
"step": 294
},
{
"epoch": 0.23627684964200477,
"grad_norm": 2.94891095161438,
"learning_rate": 9.87401147984998e-05,
"loss": 2.7035,
"step": 297
},
{
"epoch": 0.2386634844868735,
"grad_norm": 2.5706005096435547,
"learning_rate": 9.869571540427689e-05,
"loss": 2.6282,
"step": 300
},
{
"epoch": 0.24105011933174225,
"grad_norm": 2.8076047897338867,
"learning_rate": 9.865055749571215e-05,
"loss": 2.7397,
"step": 303
},
{
"epoch": 0.24343675417661098,
"grad_norm": 2.445986032485962,
"learning_rate": 9.860464177621284e-05,
"loss": 2.4252,
"step": 306
},
{
"epoch": 0.2458233890214797,
"grad_norm": 2.591123580932617,
"learning_rate": 9.855796896099045e-05,
"loss": 2.6638,
"step": 309
},
{
"epoch": 0.24821002386634844,
"grad_norm": 2.5579094886779785,
"learning_rate": 9.851053977704931e-05,
"loss": 2.5365,
"step": 312
},
{
"epoch": 0.25059665871121717,
"grad_norm": 2.701978921890259,
"learning_rate": 9.846235496317555e-05,
"loss": 2.48,
"step": 315
},
{
"epoch": 0.2529832935560859,
"grad_norm": 2.6815707683563232,
"learning_rate": 9.841341526992536e-05,
"loss": 2.5878,
"step": 318
},
{
"epoch": 0.2553699284009546,
"grad_norm": 2.3997161388397217,
"learning_rate": 9.836372145961345e-05,
"loss": 2.6202,
"step": 321
},
{
"epoch": 0.2577565632458234,
"grad_norm": 2.5097908973693848,
"learning_rate": 9.83132743063011e-05,
"loss": 2.5595,
"step": 324
},
{
"epoch": 0.26014319809069214,
"grad_norm": 2.3788466453552246,
"learning_rate": 9.826207459578411e-05,
"loss": 2.4104,
"step": 327
},
{
"epoch": 0.26252983293556087,
"grad_norm": 2.3941385746002197,
"learning_rate": 9.821012312558058e-05,
"loss": 2.4048,
"step": 330
},
{
"epoch": 0.2649164677804296,
"grad_norm": 2.558845281600952,
"learning_rate": 9.815742070491852e-05,
"loss": 2.5062,
"step": 333
},
{
"epoch": 0.26730310262529833,
"grad_norm": 2.1203112602233887,
"learning_rate": 9.810396815472314e-05,
"loss": 2.4535,
"step": 336
},
{
"epoch": 0.26968973747016706,
"grad_norm": 2.465987205505371,
"learning_rate": 9.804976630760419e-05,
"loss": 2.4999,
"step": 339
},
{
"epoch": 0.2720763723150358,
"grad_norm": 2.280484914779663,
"learning_rate": 9.799481600784286e-05,
"loss": 2.403,
"step": 342
},
{
"epoch": 0.2744630071599045,
"grad_norm": 2.6242918968200684,
"learning_rate": 9.793911811137875e-05,
"loss": 2.4721,
"step": 345
},
{
"epoch": 0.27684964200477324,
"grad_norm": 2.8333303928375244,
"learning_rate": 9.788267348579648e-05,
"loss": 2.5081,
"step": 348
},
{
"epoch": 0.27923627684964203,
"grad_norm": 2.4998323917388916,
"learning_rate": 9.782548301031217e-05,
"loss": 2.4825,
"step": 351
},
{
"epoch": 0.28162291169451076,
"grad_norm": 2.535447835922241,
"learning_rate": 9.776754757575975e-05,
"loss": 2.4639,
"step": 354
},
{
"epoch": 0.2840095465393795,
"grad_norm": 2.608652114868164,
"learning_rate": 9.770886808457709e-05,
"loss": 2.5209,
"step": 357
},
{
"epoch": 0.2863961813842482,
"grad_norm": 2.5548174381256104,
"learning_rate": 9.764944545079196e-05,
"loss": 2.5707,
"step": 360
},
{
"epoch": 0.28878281622911695,
"grad_norm": 2.3992514610290527,
"learning_rate": 9.758928060000778e-05,
"loss": 2.515,
"step": 363
},
{
"epoch": 0.2911694510739857,
"grad_norm": 3.199773073196411,
"learning_rate": 9.752837446938915e-05,
"loss": 2.4069,
"step": 366
},
{
"epoch": 0.2935560859188544,
"grad_norm": 2.689730405807495,
"learning_rate": 9.746672800764735e-05,
"loss": 2.4314,
"step": 369
},
{
"epoch": 0.29594272076372313,
"grad_norm": 2.357332944869995,
"learning_rate": 9.740434217502547e-05,
"loss": 2.3856,
"step": 372
},
{
"epoch": 0.29832935560859186,
"grad_norm": 2.409001111984253,
"learning_rate": 9.734121794328357e-05,
"loss": 2.4423,
"step": 375
},
{
"epoch": 0.30071599045346065,
"grad_norm": 2.4118542671203613,
"learning_rate": 9.727735629568336e-05,
"loss": 2.5059,
"step": 378
},
{
"epoch": 0.3031026252983294,
"grad_norm": 2.4173245429992676,
"learning_rate": 9.721275822697306e-05,
"loss": 2.386,
"step": 381
},
{
"epoch": 0.3054892601431981,
"grad_norm": 2.1414527893066406,
"learning_rate": 9.714742474337186e-05,
"loss": 2.5249,
"step": 384
},
{
"epoch": 0.30787589498806683,
"grad_norm": 2.1543893814086914,
"learning_rate": 9.708135686255416e-05,
"loss": 2.5626,
"step": 387
},
{
"epoch": 0.31026252983293556,
"grad_norm": 3.0734686851501465,
"learning_rate": 9.701455561363379e-05,
"loss": 2.3829,
"step": 390
},
{
"epoch": 0.3126491646778043,
"grad_norm": 2.2871782779693604,
"learning_rate": 9.6947022037148e-05,
"loss": 2.4011,
"step": 393
},
{
"epoch": 0.315035799522673,
"grad_norm": 2.4067625999450684,
"learning_rate": 9.687875718504126e-05,
"loss": 2.4074,
"step": 396
},
{
"epoch": 0.31742243436754175,
"grad_norm": 3.624891757965088,
"learning_rate": 9.680976212064874e-05,
"loss": 2.4101,
"step": 399
},
{
"epoch": 0.3198090692124105,
"grad_norm": 2.027580499649048,
"learning_rate": 9.674003791867991e-05,
"loss": 2.4197,
"step": 402
},
{
"epoch": 0.3221957040572792,
"grad_norm": 2.436666965484619,
"learning_rate": 9.666958566520174e-05,
"loss": 2.4592,
"step": 405
},
{
"epoch": 0.324582338902148,
"grad_norm": 2.1806681156158447,
"learning_rate": 9.659840645762175e-05,
"loss": 2.4362,
"step": 408
},
{
"epoch": 0.3269689737470167,
"grad_norm": 2.1825525760650635,
"learning_rate": 9.652650140467093e-05,
"loss": 2.4333,
"step": 411
},
{
"epoch": 0.32935560859188545,
"grad_norm": 2.0613701343536377,
"learning_rate": 9.645387162638652e-05,
"loss": 2.4313,
"step": 414
},
{
"epoch": 0.3317422434367542,
"grad_norm": 2.278815746307373,
"learning_rate": 9.638051825409453e-05,
"loss": 2.5416,
"step": 417
},
{
"epoch": 0.3341288782816229,
"grad_norm": 2.1906769275665283,
"learning_rate": 9.630644243039207e-05,
"loss": 2.3424,
"step": 420
},
{
"epoch": 0.33651551312649164,
"grad_norm": 2.412473678588867,
"learning_rate": 9.623164530912963e-05,
"loss": 2.4984,
"step": 423
},
{
"epoch": 0.33890214797136037,
"grad_norm": 2.44022274017334,
"learning_rate": 9.615612805539305e-05,
"loss": 2.3352,
"step": 426
},
{
"epoch": 0.3412887828162291,
"grad_norm": 2.2409064769744873,
"learning_rate": 9.607989184548543e-05,
"loss": 2.435,
"step": 429
},
{
"epoch": 0.3436754176610978,
"grad_norm": 1.8773585557937622,
"learning_rate": 9.600293786690872e-05,
"loss": 2.3689,
"step": 432
},
{
"epoch": 0.3460620525059666,
"grad_norm": 2.026489496231079,
"learning_rate": 9.592526731834537e-05,
"loss": 2.4707,
"step": 435
},
{
"epoch": 0.34844868735083534,
"grad_norm": 1.9061930179595947,
"learning_rate": 9.584688140963944e-05,
"loss": 2.1919,
"step": 438
},
{
"epoch": 0.35083532219570407,
"grad_norm": 1.9705097675323486,
"learning_rate": 9.576778136177798e-05,
"loss": 2.3773,
"step": 441
},
{
"epoch": 0.3532219570405728,
"grad_norm": 2.263256788253784,
"learning_rate": 9.568796840687184e-05,
"loss": 2.2805,
"step": 444
},
{
"epoch": 0.3556085918854415,
"grad_norm": 2.4777801036834717,
"learning_rate": 9.560744378813659e-05,
"loss": 2.4083,
"step": 447
},
{
"epoch": 0.35799522673031026,
"grad_norm": 2.305082082748413,
"learning_rate": 9.552620875987311e-05,
"loss": 2.2696,
"step": 450
},
{
"epoch": 0.360381861575179,
"grad_norm": 2.2283718585968018,
"learning_rate": 9.544426458744804e-05,
"loss": 2.2327,
"step": 453
},
{
"epoch": 0.3627684964200477,
"grad_norm": 2.152367115020752,
"learning_rate": 9.536161254727408e-05,
"loss": 2.2686,
"step": 456
},
{
"epoch": 0.36515513126491644,
"grad_norm": 2.222525119781494,
"learning_rate": 9.527825392679012e-05,
"loss": 2.3051,
"step": 459
},
{
"epoch": 0.36754176610978523,
"grad_norm": 2.0518105030059814,
"learning_rate": 9.51941900244412e-05,
"loss": 2.2914,
"step": 462
},
{
"epoch": 0.36992840095465396,
"grad_norm": 1.8403911590576172,
"learning_rate": 9.51094221496582e-05,
"loss": 2.2763,
"step": 465
},
{
"epoch": 0.3723150357995227,
"grad_norm": 2.0134127140045166,
"learning_rate": 9.502395162283759e-05,
"loss": 2.1292,
"step": 468
},
{
"epoch": 0.3747016706443914,
"grad_norm": 2.2648725509643555,
"learning_rate": 9.493777977532072e-05,
"loss": 2.2943,
"step": 471
},
{
"epoch": 0.37708830548926014,
"grad_norm": 2.598200559616089,
"learning_rate": 9.485090794937319e-05,
"loss": 2.2811,
"step": 474
},
{
"epoch": 0.3794749403341289,
"grad_norm": 2.188739776611328,
"learning_rate": 9.476333749816382e-05,
"loss": 2.2055,
"step": 477
},
{
"epoch": 0.3818615751789976,
"grad_norm": 2.106133460998535,
"learning_rate": 9.467506978574371e-05,
"loss": 2.1652,
"step": 480
},
{
"epoch": 0.38424821002386633,
"grad_norm": 2.550200939178467,
"learning_rate": 9.45861061870249e-05,
"loss": 2.3259,
"step": 483
},
{
"epoch": 0.38663484486873506,
"grad_norm": 2.1999354362487793,
"learning_rate": 9.449644808775902e-05,
"loss": 2.2477,
"step": 486
},
{
"epoch": 0.38902147971360385,
"grad_norm": 1.8450204133987427,
"learning_rate": 9.44060968845156e-05,
"loss": 2.1774,
"step": 489
},
{
"epoch": 0.3914081145584726,
"grad_norm": 2.097698211669922,
"learning_rate": 9.431505398466045e-05,
"loss": 2.3338,
"step": 492
},
{
"epoch": 0.3937947494033413,
"grad_norm": 2.0692379474639893,
"learning_rate": 9.42233208063336e-05,
"loss": 2.1268,
"step": 495
},
{
"epoch": 0.39618138424821003,
"grad_norm": 2.289555788040161,
"learning_rate": 9.413089877842736e-05,
"loss": 2.3312,
"step": 498
},
{
"epoch": 0.39856801909307876,
"grad_norm": 2.145961284637451,
"learning_rate": 9.403778934056391e-05,
"loss": 2.3026,
"step": 501
},
{
"epoch": 0.4009546539379475,
"grad_norm": 1.9669148921966553,
"learning_rate": 9.394399394307303e-05,
"loss": 2.4298,
"step": 504
},
{
"epoch": 0.4033412887828162,
"grad_norm": 2.412475824356079,
"learning_rate": 9.384951404696933e-05,
"loss": 2.1335,
"step": 507
},
{
"epoch": 0.40572792362768495,
"grad_norm": 1.9955891370773315,
"learning_rate": 9.375435112392969e-05,
"loss": 2.2833,
"step": 510
},
{
"epoch": 0.4081145584725537,
"grad_norm": 2.0198495388031006,
"learning_rate": 9.365850665627016e-05,
"loss": 2.2196,
"step": 513
},
{
"epoch": 0.4105011933174224,
"grad_norm": 1.874887228012085,
"learning_rate": 9.356198213692297e-05,
"loss": 2.1938,
"step": 516
},
{
"epoch": 0.4128878281622912,
"grad_norm": 1.9954278469085693,
"learning_rate": 9.346477906941331e-05,
"loss": 2.1082,
"step": 519
},
{
"epoch": 0.4152744630071599,
"grad_norm": 1.9344687461853027,
"learning_rate": 9.336689896783573e-05,
"loss": 2.1265,
"step": 522
},
{
"epoch": 0.41766109785202865,
"grad_norm": 2.1499831676483154,
"learning_rate": 9.32683433568308e-05,
"loss": 2.2211,
"step": 525
},
{
"epoch": 0.4200477326968974,
"grad_norm": 2.411583662033081,
"learning_rate": 9.316911377156117e-05,
"loss": 2.2793,
"step": 528
},
{
"epoch": 0.4224343675417661,
"grad_norm": 2.0739586353302,
"learning_rate": 9.306921175768775e-05,
"loss": 2.1653,
"step": 531
},
{
"epoch": 0.42482100238663484,
"grad_norm": 2.0313665866851807,
"learning_rate": 9.29686388713456e-05,
"loss": 2.3611,
"step": 534
},
{
"epoch": 0.42720763723150357,
"grad_norm": 1.8887441158294678,
"learning_rate": 9.286739667911972e-05,
"loss": 2.201,
"step": 537
},
{
"epoch": 0.4295942720763723,
"grad_norm": 1.928774118423462,
"learning_rate": 9.276548675802059e-05,
"loss": 2.2504,
"step": 540
},
{
"epoch": 0.431980906921241,
"grad_norm": 1.7933377027511597,
"learning_rate": 9.266291069545972e-05,
"loss": 2.2219,
"step": 543
},
{
"epoch": 0.4343675417661098,
"grad_norm": 1.877989649772644,
"learning_rate": 9.255967008922474e-05,
"loss": 2.2547,
"step": 546
},
{
"epoch": 0.43675417661097854,
"grad_norm": 1.6794133186340332,
"learning_rate": 9.245576654745471e-05,
"loss": 2.3152,
"step": 549
},
{
"epoch": 0.43914081145584727,
"grad_norm": 2.024888515472412,
"learning_rate": 9.235120168861496e-05,
"loss": 2.0559,
"step": 552
},
{
"epoch": 0.441527446300716,
"grad_norm": 2.1907565593719482,
"learning_rate": 9.224597714147186e-05,
"loss": 2.2414,
"step": 555
},
{
"epoch": 0.4439140811455847,
"grad_norm": 1.9804993867874146,
"learning_rate": 9.214009454506753e-05,
"loss": 2.0826,
"step": 558
},
{
"epoch": 0.44630071599045346,
"grad_norm": 2.1654927730560303,
"learning_rate": 9.203355554869428e-05,
"loss": 2.2048,
"step": 561
},
{
"epoch": 0.4486873508353222,
"grad_norm": 2.4933547973632812,
"learning_rate": 9.192636181186888e-05,
"loss": 2.0166,
"step": 564
},
{
"epoch": 0.4510739856801909,
"grad_norm": 2.0309464931488037,
"learning_rate": 9.181851500430673e-05,
"loss": 2.0729,
"step": 567
},
{
"epoch": 0.45346062052505964,
"grad_norm": 2.191399574279785,
"learning_rate": 9.171001680589588e-05,
"loss": 2.0675,
"step": 570
},
{
"epoch": 0.45584725536992843,
"grad_norm": 2.108330249786377,
"learning_rate": 9.160086890667086e-05,
"loss": 2.1535,
"step": 573
},
{
"epoch": 0.45823389021479716,
"grad_norm": 2.01182222366333,
"learning_rate": 9.14910730067863e-05,
"loss": 2.1768,
"step": 576
},
{
"epoch": 0.4606205250596659,
"grad_norm": 1.80693519115448,
"learning_rate": 9.138063081649051e-05,
"loss": 2.2843,
"step": 579
},
{
"epoch": 0.4630071599045346,
"grad_norm": 1.9676529169082642,
"learning_rate": 9.126954405609882e-05,
"loss": 2.1211,
"step": 582
},
{
"epoch": 0.46539379474940334,
"grad_norm": 1.9550760984420776,
"learning_rate": 9.115781445596676e-05,
"loss": 2.0111,
"step": 585
},
{
"epoch": 0.4677804295942721,
"grad_norm": 2.1613616943359375,
"learning_rate": 9.104544375646313e-05,
"loss": 2.3365,
"step": 588
},
{
"epoch": 0.4701670644391408,
"grad_norm": 2.0168464183807373,
"learning_rate": 9.093243370794291e-05,
"loss": 2.1486,
"step": 591
},
{
"epoch": 0.47255369928400953,
"grad_norm": 2.0406484603881836,
"learning_rate": 9.081878607071996e-05,
"loss": 2.2437,
"step": 594
},
{
"epoch": 0.47494033412887826,
"grad_norm": 2.1495018005371094,
"learning_rate": 9.07045026150396e-05,
"loss": 2.122,
"step": 597
},
{
"epoch": 0.477326968973747,
"grad_norm": 2.0007262229919434,
"learning_rate": 9.058958512105104e-05,
"loss": 2.121,
"step": 600
},
{
"epoch": 0.4797136038186158,
"grad_norm": 2.7373595237731934,
"learning_rate": 9.047403537877971e-05,
"loss": 2.083,
"step": 603
},
{
"epoch": 0.4821002386634845,
"grad_norm": 1.8298907279968262,
"learning_rate": 9.035785518809927e-05,
"loss": 2.1547,
"step": 606
},
{
"epoch": 0.48448687350835323,
"grad_norm": 1.9181143045425415,
"learning_rate": 9.024104635870368e-05,
"loss": 2.0938,
"step": 609
},
{
"epoch": 0.48687350835322196,
"grad_norm": 2.0003180503845215,
"learning_rate": 9.012361071007891e-05,
"loss": 2.1254,
"step": 612
},
{
"epoch": 0.4892601431980907,
"grad_norm": 1.9533605575561523,
"learning_rate": 9.000555007147469e-05,
"loss": 2.1393,
"step": 615
},
{
"epoch": 0.4916467780429594,
"grad_norm": 2.0818209648132324,
"learning_rate": 8.988686628187597e-05,
"loss": 2.1339,
"step": 618
},
{
"epoch": 0.49403341288782815,
"grad_norm": 1.762052059173584,
"learning_rate": 8.976756118997427e-05,
"loss": 2.1497,
"step": 621
},
{
"epoch": 0.4964200477326969,
"grad_norm": 2.371490001678467,
"learning_rate": 8.964763665413893e-05,
"loss": 1.9532,
"step": 624
},
{
"epoch": 0.4988066825775656,
"grad_norm": 2.1913626194000244,
"learning_rate": 8.952709454238808e-05,
"loss": 2.1899,
"step": 627
},
{
"epoch": 0.5011933174224343,
"grad_norm": 1.85502290725708,
"learning_rate": 8.940593673235962e-05,
"loss": 2.1804,
"step": 630
},
{
"epoch": 0.5035799522673031,
"grad_norm": 1.9168248176574707,
"learning_rate": 8.928416511128195e-05,
"loss": 2.0406,
"step": 633
},
{
"epoch": 0.5059665871121718,
"grad_norm": 1.9352686405181885,
"learning_rate": 8.916178157594453e-05,
"loss": 2.0952,
"step": 636
},
{
"epoch": 0.5083532219570406,
"grad_norm": 2.1342227458953857,
"learning_rate": 8.903878803266841e-05,
"loss": 2.2643,
"step": 639
},
{
"epoch": 0.5107398568019093,
"grad_norm": 1.8305190801620483,
"learning_rate": 8.891518639727649e-05,
"loss": 1.9711,
"step": 642
},
{
"epoch": 0.513126491646778,
"grad_norm": 1.9878753423690796,
"learning_rate": 8.879097859506372e-05,
"loss": 2.1531,
"step": 645
},
{
"epoch": 0.5155131264916468,
"grad_norm": 1.7866030931472778,
"learning_rate": 8.866616656076696e-05,
"loss": 2.0549,
"step": 648
},
{
"epoch": 0.5178997613365155,
"grad_norm": 1.8742599487304688,
"learning_rate": 8.854075223853508e-05,
"loss": 2.1491,
"step": 651
},
{
"epoch": 0.5202863961813843,
"grad_norm": 2.051337242126465,
"learning_rate": 8.841473758189854e-05,
"loss": 2.0055,
"step": 654
},
{
"epoch": 0.522673031026253,
"grad_norm": 2.0818710327148438,
"learning_rate": 8.828812455373891e-05,
"loss": 2.1989,
"step": 657
},
{
"epoch": 0.5250596658711217,
"grad_norm": 1.859626293182373,
"learning_rate": 8.816091512625843e-05,
"loss": 2.129,
"step": 660
},
{
"epoch": 0.5274463007159904,
"grad_norm": 2.454052686691284,
"learning_rate": 8.803311128094918e-05,
"loss": 2.1351,
"step": 663
},
{
"epoch": 0.5298329355608592,
"grad_norm": 2.2742836475372314,
"learning_rate": 8.790471500856228e-05,
"loss": 2.2181,
"step": 666
},
{
"epoch": 0.5322195704057279,
"grad_norm": 1.7958076000213623,
"learning_rate": 8.777572830907684e-05,
"loss": 2.2685,
"step": 669
},
{
"epoch": 0.5346062052505967,
"grad_norm": 1.7897841930389404,
"learning_rate": 8.764615319166886e-05,
"loss": 2.1735,
"step": 672
},
{
"epoch": 0.5369928400954654,
"grad_norm": 1.8910062313079834,
"learning_rate": 8.751599167467985e-05,
"loss": 1.9897,
"step": 675
},
{
"epoch": 0.5393794749403341,
"grad_norm": 1.875690221786499,
"learning_rate": 8.738524578558547e-05,
"loss": 2.05,
"step": 678
},
{
"epoch": 0.5417661097852029,
"grad_norm": 1.935346007347107,
"learning_rate": 8.72539175609639e-05,
"loss": 1.9948,
"step": 681
},
{
"epoch": 0.5441527446300716,
"grad_norm": 1.7295023202896118,
"learning_rate": 8.712200904646416e-05,
"loss": 1.9492,
"step": 684
},
{
"epoch": 0.5465393794749404,
"grad_norm": 1.7163113355636597,
"learning_rate": 8.698952229677422e-05,
"loss": 2.0111,
"step": 687
},
{
"epoch": 0.548926014319809,
"grad_norm": 2.101062536239624,
"learning_rate": 8.685645937558896e-05,
"loss": 2.2221,
"step": 690
},
{
"epoch": 0.5513126491646778,
"grad_norm": 1.937011480331421,
"learning_rate": 8.67228223555781e-05,
"loss": 2.0276,
"step": 693
},
{
"epoch": 0.5536992840095465,
"grad_norm": 1.9230318069458008,
"learning_rate": 8.658861331835385e-05,
"loss": 2.0541,
"step": 696
},
{
"epoch": 0.5560859188544153,
"grad_norm": 1.91681969165802,
"learning_rate": 8.645383435443852e-05,
"loss": 2.0046,
"step": 699
},
{
"epoch": 0.5584725536992841,
"grad_norm": 1.711011528968811,
"learning_rate": 8.631848756323197e-05,
"loss": 2.0765,
"step": 702
},
{
"epoch": 0.5608591885441527,
"grad_norm": 1.817881464958191,
"learning_rate": 8.618257505297886e-05,
"loss": 2.0712,
"step": 705
},
{
"epoch": 0.5632458233890215,
"grad_norm": 1.8954594135284424,
"learning_rate": 8.604609894073584e-05,
"loss": 1.8753,
"step": 708
},
{
"epoch": 0.5656324582338902,
"grad_norm": 1.7681952714920044,
"learning_rate": 8.590906135233854e-05,
"loss": 2.005,
"step": 711
},
{
"epoch": 0.568019093078759,
"grad_norm": 1.9686832427978516,
"learning_rate": 8.577146442236857e-05,
"loss": 2.0602,
"step": 714
},
{
"epoch": 0.5704057279236276,
"grad_norm": 1.82900071144104,
"learning_rate": 8.563331029412012e-05,
"loss": 2.1201,
"step": 717
},
{
"epoch": 0.5727923627684964,
"grad_norm": 1.9767824411392212,
"learning_rate": 8.549460111956664e-05,
"loss": 2.0324,
"step": 720
},
{
"epoch": 0.5751789976133651,
"grad_norm": 1.8764835596084595,
"learning_rate": 8.535533905932738e-05,
"loss": 2.0167,
"step": 723
},
{
"epoch": 0.5775656324582339,
"grad_norm": 2.0710604190826416,
"learning_rate": 8.521552628263362e-05,
"loss": 2.0242,
"step": 726
},
{
"epoch": 0.5799522673031027,
"grad_norm": 1.8737850189208984,
"learning_rate": 8.507516496729495e-05,
"loss": 1.9975,
"step": 729
},
{
"epoch": 0.5823389021479713,
"grad_norm": 1.7871778011322021,
"learning_rate": 8.493425729966534e-05,
"loss": 2.0898,
"step": 732
},
{
"epoch": 0.5847255369928401,
"grad_norm": 1.8007279634475708,
"learning_rate": 8.479280547460907e-05,
"loss": 1.9798,
"step": 735
},
{
"epoch": 0.5871121718377088,
"grad_norm": 1.6072956323623657,
"learning_rate": 8.465081169546659e-05,
"loss": 2.1404,
"step": 738
},
{
"epoch": 0.5894988066825776,
"grad_norm": 1.7656550407409668,
"learning_rate": 8.450827817402011e-05,
"loss": 2.1144,
"step": 741
},
{
"epoch": 0.5918854415274463,
"grad_norm": 2.1177773475646973,
"learning_rate": 8.436520713045922e-05,
"loss": 1.9443,
"step": 744
},
{
"epoch": 0.594272076372315,
"grad_norm": 1.5557399988174438,
"learning_rate": 8.422160079334628e-05,
"loss": 1.9939,
"step": 747
},
{
"epoch": 0.5966587112171837,
"grad_norm": 2.5375919342041016,
"learning_rate": 8.40774613995817e-05,
"loss": 2.0561,
"step": 750
},
{
"epoch": 0.5990453460620525,
"grad_norm": 2.0415713787078857,
"learning_rate": 8.393279119436912e-05,
"loss": 2.0833,
"step": 753
},
{
"epoch": 0.6014319809069213,
"grad_norm": 1.6955374479293823,
"learning_rate": 8.378759243118044e-05,
"loss": 2.2045,
"step": 756
},
{
"epoch": 0.60381861575179,
"grad_norm": 1.6944271326065063,
"learning_rate": 8.364186737172068e-05,
"loss": 1.9618,
"step": 759
},
{
"epoch": 0.6062052505966588,
"grad_norm": 1.7665637731552124,
"learning_rate": 8.349561828589277e-05,
"loss": 2.1562,
"step": 762
},
{
"epoch": 0.6085918854415274,
"grad_norm": 1.9578077793121338,
"learning_rate": 8.33488474517622e-05,
"loss": 1.9817,
"step": 765
},
{
"epoch": 0.6109785202863962,
"grad_norm": 1.7413527965545654,
"learning_rate": 8.320155715552155e-05,
"loss": 1.9573,
"step": 768
},
{
"epoch": 0.6133651551312649,
"grad_norm": 2.020991325378418,
"learning_rate": 8.305374969145488e-05,
"loss": 2.107,
"step": 771
},
{
"epoch": 0.6157517899761337,
"grad_norm": 1.8580269813537598,
"learning_rate": 8.290542736190188e-05,
"loss": 1.8968,
"step": 774
},
{
"epoch": 0.6181384248210023,
"grad_norm": 1.6956955194473267,
"learning_rate": 8.275659247722222e-05,
"loss": 1.9221,
"step": 777
},
{
"epoch": 0.6205250596658711,
"grad_norm": 1.873700499534607,
"learning_rate": 8.260724735575933e-05,
"loss": 2.0205,
"step": 780
},
{
"epoch": 0.6229116945107399,
"grad_norm": 1.998443603515625,
"learning_rate": 8.24573943238045e-05,
"loss": 2.0767,
"step": 783
},
{
"epoch": 0.6252983293556086,
"grad_norm": 1.5726591348648071,
"learning_rate": 8.230703571556048e-05,
"loss": 1.887,
"step": 786
},
{
"epoch": 0.6276849642004774,
"grad_norm": 1.5653208494186401,
"learning_rate": 8.215617387310524e-05,
"loss": 1.9488,
"step": 789
},
{
"epoch": 0.630071599045346,
"grad_norm": 1.8755910396575928,
"learning_rate": 8.200481114635536e-05,
"loss": 1.9843,
"step": 792
},
{
"epoch": 0.6324582338902148,
"grad_norm": 1.6157357692718506,
"learning_rate": 8.185294989302958e-05,
"loss": 1.9286,
"step": 795
},
{
"epoch": 0.6348448687350835,
"grad_norm": 1.6673681735992432,
"learning_rate": 8.170059247861194e-05,
"loss": 1.9185,
"step": 798
},
{
"epoch": 0.6372315035799523,
"grad_norm": 2.0018393993377686,
"learning_rate": 8.154774127631501e-05,
"loss": 1.8868,
"step": 801
},
{
"epoch": 0.639618138424821,
"grad_norm": 1.6845289468765259,
"learning_rate": 8.139439866704293e-05,
"loss": 1.973,
"step": 804
},
{
"epoch": 0.6420047732696897,
"grad_norm": 2.231600522994995,
"learning_rate": 8.124056703935423e-05,
"loss": 1.9579,
"step": 807
},
{
"epoch": 0.6443914081145584,
"grad_norm": 1.8767783641815186,
"learning_rate": 8.108624878942477e-05,
"loss": 1.9769,
"step": 810
},
{
"epoch": 0.6467780429594272,
"grad_norm": 1.685530424118042,
"learning_rate": 8.093144632101026e-05,
"loss": 1.8638,
"step": 813
},
{
"epoch": 0.649164677804296,
"grad_norm": 1.8997292518615723,
"learning_rate": 8.077616204540897e-05,
"loss": 1.955,
"step": 816
},
{
"epoch": 0.6515513126491647,
"grad_norm": 1.982273817062378,
"learning_rate": 8.062039838142402e-05,
"loss": 1.9263,
"step": 819
},
{
"epoch": 0.6539379474940334,
"grad_norm": 2.659235954284668,
"learning_rate": 8.046415775532585e-05,
"loss": 1.9621,
"step": 822
},
{
"epoch": 0.6563245823389021,
"grad_norm": 1.7202818393707275,
"learning_rate": 8.030744260081426e-05,
"loss": 1.9706,
"step": 825
},
{
"epoch": 0.6587112171837709,
"grad_norm": 1.7414380311965942,
"learning_rate": 8.015025535898073e-05,
"loss": 2.0403,
"step": 828
},
{
"epoch": 0.6610978520286396,
"grad_norm": 1.7690259218215942,
"learning_rate": 7.999259847827015e-05,
"loss": 2.0361,
"step": 831
},
{
"epoch": 0.6634844868735084,
"grad_norm": 1.6158711910247803,
"learning_rate": 7.983447441444281e-05,
"loss": 1.8728,
"step": 834
},
{
"epoch": 0.665871121718377,
"grad_norm": 2.032346487045288,
"learning_rate": 7.967588563053616e-05,
"loss": 1.9029,
"step": 837
},
{
"epoch": 0.6682577565632458,
"grad_norm": 1.7840015888214111,
"learning_rate": 7.951683459682641e-05,
"loss": 1.9341,
"step": 840
},
{
"epoch": 0.6706443914081146,
"grad_norm": 1.7461919784545898,
"learning_rate": 7.935732379079008e-05,
"loss": 2.1495,
"step": 843
},
{
"epoch": 0.6730310262529833,
"grad_norm": 1.7609453201293945,
"learning_rate": 7.919735569706533e-05,
"loss": 2.0172,
"step": 846
},
{
"epoch": 0.6754176610978521,
"grad_norm": 1.7367274761199951,
"learning_rate": 7.903693280741331e-05,
"loss": 1.9978,
"step": 849
},
{
"epoch": 0.6778042959427207,
"grad_norm": 1.6769988536834717,
"learning_rate": 7.887605762067945e-05,
"loss": 1.871,
"step": 852
},
{
"epoch": 0.6801909307875895,
"grad_norm": 1.6532725095748901,
"learning_rate": 7.871473264275429e-05,
"loss": 1.8663,
"step": 855
},
{
"epoch": 0.6825775656324582,
"grad_norm": 1.6891016960144043,
"learning_rate": 7.855296038653475e-05,
"loss": 2.1671,
"step": 858
},
{
"epoch": 0.684964200477327,
"grad_norm": 1.6832776069641113,
"learning_rate": 7.83907433718847e-05,
"loss": 1.9273,
"step": 861
},
{
"epoch": 0.6873508353221957,
"grad_norm": 1.6840589046478271,
"learning_rate": 7.82280841255959e-05,
"loss": 1.8862,
"step": 864
},
{
"epoch": 0.6897374701670644,
"grad_norm": 1.644808292388916,
"learning_rate": 7.80649851813486e-05,
"loss": 2.0491,
"step": 867
},
{
"epoch": 0.6921241050119332,
"grad_norm": 1.915778636932373,
"learning_rate": 7.790144907967201e-05,
"loss": 1.9142,
"step": 870
},
{
"epoch": 0.6945107398568019,
"grad_norm": 1.957671880722046,
"learning_rate": 7.773747836790481e-05,
"loss": 2.1069,
"step": 873
},
{
"epoch": 0.6968973747016707,
"grad_norm": 1.8935115337371826,
"learning_rate": 7.757307560015538e-05,
"loss": 1.9058,
"step": 876
},
{
"epoch": 0.6992840095465394,
"grad_norm": 2.2118947505950928,
"learning_rate": 7.740824333726213e-05,
"loss": 1.8754,
"step": 879
},
{
"epoch": 0.7016706443914081,
"grad_norm": 1.820563793182373,
"learning_rate": 7.724298414675353e-05,
"loss": 1.9056,
"step": 882
},
{
"epoch": 0.7040572792362768,
"grad_norm": 2.0297231674194336,
"learning_rate": 7.707730060280812e-05,
"loss": 2.043,
"step": 885
},
{
"epoch": 0.7064439140811456,
"grad_norm": 1.5307203531265259,
"learning_rate": 7.691119528621444e-05,
"loss": 1.9592,
"step": 888
},
{
"epoch": 0.7088305489260143,
"grad_norm": 1.9287526607513428,
"learning_rate": 7.674467078433081e-05,
"loss": 2.0573,
"step": 891
},
{
"epoch": 0.711217183770883,
"grad_norm": 1.9057129621505737,
"learning_rate": 7.657772969104508e-05,
"loss": 1.78,
"step": 894
},
{
"epoch": 0.7136038186157518,
"grad_norm": 1.8515708446502686,
"learning_rate": 7.641037460673412e-05,
"loss": 1.7595,
"step": 897
},
{
"epoch": 0.7159904534606205,
"grad_norm": 1.5945055484771729,
"learning_rate": 7.624260813822342e-05,
"loss": 1.8427,
"step": 900
},
{
"epoch": 0.7183770883054893,
"grad_norm": 1.8130457401275635,
"learning_rate": 7.607443289874642e-05,
"loss": 1.9802,
"step": 903
},
{
"epoch": 0.720763723150358,
"grad_norm": 1.740313172340393,
"learning_rate": 7.590585150790389e-05,
"loss": 2.0377,
"step": 906
},
{
"epoch": 0.7231503579952268,
"grad_norm": 1.4544378519058228,
"learning_rate": 7.573686659162293e-05,
"loss": 1.9641,
"step": 909
},
{
"epoch": 0.7255369928400954,
"grad_norm": 1.7715884447097778,
"learning_rate": 7.556748078211635e-05,
"loss": 2.0572,
"step": 912
},
{
"epoch": 0.7279236276849642,
"grad_norm": 1.5698533058166504,
"learning_rate": 7.53976967178414e-05,
"loss": 1.9866,
"step": 915
},
{
"epoch": 0.7303102625298329,
"grad_norm": 1.4722906351089478,
"learning_rate": 7.522751704345887e-05,
"loss": 1.9815,
"step": 918
},
{
"epoch": 0.7326968973747017,
"grad_norm": 1.628419280052185,
"learning_rate": 7.505694440979178e-05,
"loss": 1.9551,
"step": 921
},
{
"epoch": 0.7350835322195705,
"grad_norm": 1.8405951261520386,
"learning_rate": 7.488598147378416e-05,
"loss": 1.82,
"step": 924
},
{
"epoch": 0.7374701670644391,
"grad_norm": 1.9501157999038696,
"learning_rate": 7.471463089845956e-05,
"loss": 1.8727,
"step": 927
},
{
"epoch": 0.7398568019093079,
"grad_norm": 1.797590970993042,
"learning_rate": 7.454289535287968e-05,
"loss": 1.8462,
"step": 930
},
{
"epoch": 0.7422434367541766,
"grad_norm": 1.7332159280776978,
"learning_rate": 7.437077751210279e-05,
"loss": 2.0425,
"step": 933
},
{
"epoch": 0.7446300715990454,
"grad_norm": 1.8471993207931519,
"learning_rate": 7.419828005714194e-05,
"loss": 1.9414,
"step": 936
},
{
"epoch": 0.747016706443914,
"grad_norm": 1.7861772775650024,
"learning_rate": 7.402540567492337e-05,
"loss": 1.9029,
"step": 939
},
{
"epoch": 0.7494033412887828,
"grad_norm": 1.5510649681091309,
"learning_rate": 7.385215705824449e-05,
"loss": 2.1046,
"step": 942
},
{
"epoch": 0.7517899761336515,
"grad_norm": 1.687177300453186,
"learning_rate": 7.367853690573208e-05,
"loss": 1.7673,
"step": 945
},
{
"epoch": 0.7541766109785203,
"grad_norm": 1.8639237880706787,
"learning_rate": 7.350454792180016e-05,
"loss": 1.859,
"step": 948
},
{
"epoch": 0.7565632458233891,
"grad_norm": 1.7479451894760132,
"learning_rate": 7.333019281660789e-05,
"loss": 2.043,
"step": 951
},
{
"epoch": 0.7589498806682577,
"grad_norm": 1.6814374923706055,
"learning_rate": 7.31554743060174e-05,
"loss": 1.8431,
"step": 954
},
{
"epoch": 0.7613365155131265,
"grad_norm": 1.6872406005859375,
"learning_rate": 7.298039511155138e-05,
"loss": 1.9233,
"step": 957
},
{
"epoch": 0.7637231503579952,
"grad_norm": 1.6230123043060303,
"learning_rate": 7.280495796035079e-05,
"loss": 1.9329,
"step": 960
},
{
"epoch": 0.766109785202864,
"grad_norm": 1.5747705698013306,
"learning_rate": 7.262916558513237e-05,
"loss": 1.7736,
"step": 963
},
{
"epoch": 0.7684964200477327,
"grad_norm": 1.9275933504104614,
"learning_rate": 7.245302072414601e-05,
"loss": 1.9087,
"step": 966
},
{
"epoch": 0.7708830548926014,
"grad_norm": 1.7358119487762451,
"learning_rate": 7.227652612113213e-05,
"loss": 1.7724,
"step": 969
},
{
"epoch": 0.7732696897374701,
"grad_norm": 1.5825779438018799,
"learning_rate": 7.209968452527896e-05,
"loss": 1.8674,
"step": 972
},
{
"epoch": 0.7756563245823389,
"grad_norm": 1.8964987993240356,
"learning_rate": 7.192249869117971e-05,
"loss": 1.9597,
"step": 975
},
{
"epoch": 0.7780429594272077,
"grad_norm": 1.873633861541748,
"learning_rate": 7.174497137878966e-05,
"loss": 1.8599,
"step": 978
},
{
"epoch": 0.7804295942720764,
"grad_norm": 1.6681768894195557,
"learning_rate": 7.156710535338312e-05,
"loss": 2.0096,
"step": 981
},
{
"epoch": 0.7828162291169452,
"grad_norm": 1.7456769943237305,
"learning_rate": 7.138890338551048e-05,
"loss": 1.9242,
"step": 984
},
{
"epoch": 0.7852028639618138,
"grad_norm": 1.683982491493225,
"learning_rate": 7.121036825095492e-05,
"loss": 1.8168,
"step": 987
},
{
"epoch": 0.7875894988066826,
"grad_norm": 1.7995425462722778,
"learning_rate": 7.103150273068921e-05,
"loss": 1.8701,
"step": 990
},
{
"epoch": 0.7899761336515513,
"grad_norm": 1.594572901725769,
"learning_rate": 7.085230961083249e-05,
"loss": 1.9488,
"step": 993
},
{
"epoch": 0.7923627684964201,
"grad_norm": 2.3273961544036865,
"learning_rate": 7.067279168260671e-05,
"loss": 1.9518,
"step": 996
},
{
"epoch": 0.7947494033412887,
"grad_norm": 1.6921088695526123,
"learning_rate": 7.04929517422933e-05,
"loss": 1.8953,
"step": 999
},
{
"epoch": 0.7971360381861575,
"grad_norm": 1.571298360824585,
"learning_rate": 7.031279259118946e-05,
"loss": 1.7606,
"step": 1002
},
{
"epoch": 0.7995226730310262,
"grad_norm": 2.253617286682129,
"learning_rate": 7.013231703556471e-05,
"loss": 1.9815,
"step": 1005
},
{
"epoch": 0.801909307875895,
"grad_norm": 1.8123805522918701,
"learning_rate": 6.995152788661705e-05,
"loss": 1.9012,
"step": 1008
},
{
"epoch": 0.8042959427207638,
"grad_norm": 1.5949262380599976,
"learning_rate": 6.977042796042917e-05,
"loss": 1.862,
"step": 1011
},
{
"epoch": 0.8066825775656324,
"grad_norm": 1.508523941040039,
"learning_rate": 6.958902007792466e-05,
"loss": 1.8586,
"step": 1014
},
{
"epoch": 0.8090692124105012,
"grad_norm": 2.1280345916748047,
"learning_rate": 6.940730706482399e-05,
"loss": 1.9284,
"step": 1017
},
{
"epoch": 0.8114558472553699,
"grad_norm": 1.6476510763168335,
"learning_rate": 6.922529175160054e-05,
"loss": 1.8232,
"step": 1020
},
{
"epoch": 0.8138424821002387,
"grad_norm": 1.483221173286438,
"learning_rate": 6.904297697343655e-05,
"loss": 2.0177,
"step": 1023
},
{
"epoch": 0.8162291169451074,
"grad_norm": 1.637446403503418,
"learning_rate": 6.886036557017881e-05,
"loss": 1.9592,
"step": 1026
},
{
"epoch": 0.8186157517899761,
"grad_norm": 1.7579176425933838,
"learning_rate": 6.867746038629462e-05,
"loss": 2.0381,
"step": 1029
},
{
"epoch": 0.8210023866348448,
"grad_norm": 1.6292965412139893,
"learning_rate": 6.849426427082735e-05,
"loss": 1.8949,
"step": 1032
},
{
"epoch": 0.8233890214797136,
"grad_norm": 1.5425759553909302,
"learning_rate": 6.83107800773521e-05,
"loss": 1.9462,
"step": 1035
},
{
"epoch": 0.8257756563245824,
"grad_norm": 1.5830105543136597,
"learning_rate": 6.812701066393124e-05,
"loss": 1.8403,
"step": 1038
},
{
"epoch": 0.8281622911694511,
"grad_norm": 2.093899726867676,
"learning_rate": 6.79429588930699e-05,
"loss": 1.8663,
"step": 1041
},
{
"epoch": 0.8305489260143198,
"grad_norm": 2.133967161178589,
"learning_rate": 6.775862763167142e-05,
"loss": 1.8503,
"step": 1044
},
{
"epoch": 0.8329355608591885,
"grad_norm": 1.640627145767212,
"learning_rate": 6.757401975099262e-05,
"loss": 1.7844,
"step": 1047
},
{
"epoch": 0.8353221957040573,
"grad_norm": 3.505713939666748,
"learning_rate": 6.738913812659912e-05,
"loss": 2.0046,
"step": 1050
},
{
"epoch": 0.837708830548926,
"grad_norm": 1.8108558654785156,
"learning_rate": 6.720398563832055e-05,
"loss": 1.8705,
"step": 1053
},
{
"epoch": 0.8400954653937948,
"grad_norm": 1.6313164234161377,
"learning_rate": 6.701856517020565e-05,
"loss": 1.9745,
"step": 1056
},
{
"epoch": 0.8424821002386634,
"grad_norm": 1.5467928647994995,
"learning_rate": 6.683287961047742e-05,
"loss": 2.0668,
"step": 1059
},
{
"epoch": 0.8448687350835322,
"grad_norm": 1.6917724609375,
"learning_rate": 6.664693185148807e-05,
"loss": 1.8494,
"step": 1062
},
{
"epoch": 0.847255369928401,
"grad_norm": 1.695573329925537,
"learning_rate": 6.646072478967397e-05,
"loss": 2.0077,
"step": 1065
},
{
"epoch": 0.8496420047732697,
"grad_norm": 1.6147156953811646,
"learning_rate": 6.627426132551058e-05,
"loss": 1.8638,
"step": 1068
},
{
"epoch": 0.8520286396181385,
"grad_norm": 1.6783572435379028,
"learning_rate": 6.608754436346725e-05,
"loss": 1.8168,
"step": 1071
},
{
"epoch": 0.8544152744630071,
"grad_norm": 2.0277252197265625,
"learning_rate": 6.590057681196191e-05,
"loss": 1.7963,
"step": 1074
},
{
"epoch": 0.8568019093078759,
"grad_norm": 1.7208962440490723,
"learning_rate": 6.571336158331589e-05,
"loss": 2.0736,
"step": 1077
},
{
"epoch": 0.8591885441527446,
"grad_norm": 1.736999750137329,
"learning_rate": 6.552590159370844e-05,
"loss": 1.7492,
"step": 1080
},
{
"epoch": 0.8615751789976134,
"grad_norm": 1.6817790269851685,
"learning_rate": 6.53381997631314e-05,
"loss": 1.9629,
"step": 1083
},
{
"epoch": 0.863961813842482,
"grad_norm": 1.6716388463974,
"learning_rate": 6.515025901534364e-05,
"loss": 1.8712,
"step": 1086
},
{
"epoch": 0.8663484486873508,
"grad_norm": 1.500181794166565,
"learning_rate": 6.496208227782556e-05,
"loss": 1.8531,
"step": 1089
},
{
"epoch": 0.8687350835322196,
"grad_norm": 1.6236196756362915,
"learning_rate": 6.477367248173352e-05,
"loss": 1.8949,
"step": 1092
},
{
"epoch": 0.8711217183770883,
"grad_norm": 1.5496482849121094,
"learning_rate": 6.458503256185404e-05,
"loss": 1.8663,
"step": 1095
},
{
"epoch": 0.8735083532219571,
"grad_norm": 1.5112017393112183,
"learning_rate": 6.439616545655834e-05,
"loss": 1.7794,
"step": 1098
},
{
"epoch": 0.8758949880668258,
"grad_norm": 1.5853941440582275,
"learning_rate": 6.420707410775626e-05,
"loss": 1.9137,
"step": 1101
},
{
"epoch": 0.8782816229116945,
"grad_norm": 1.7348613739013672,
"learning_rate": 6.401776146085072e-05,
"loss": 2.181,
"step": 1104
},
{
"epoch": 0.8806682577565632,
"grad_norm": 1.8379839658737183,
"learning_rate": 6.382823046469167e-05,
"loss": 1.9456,
"step": 1107
},
{
"epoch": 0.883054892601432,
"grad_norm": 1.6896251440048218,
"learning_rate": 6.363848407153016e-05,
"loss": 1.7248,
"step": 1110
},
{
"epoch": 0.8854415274463007,
"grad_norm": 1.8158091306686401,
"learning_rate": 6.344852523697247e-05,
"loss": 1.9658,
"step": 1113
},
{
"epoch": 0.8878281622911695,
"grad_norm": 1.7598294019699097,
"learning_rate": 6.325835691993394e-05,
"loss": 1.6611,
"step": 1116
},
{
"epoch": 0.8902147971360382,
"grad_norm": 1.7450791597366333,
"learning_rate": 6.306798208259297e-05,
"loss": 1.8354,
"step": 1119
},
{
"epoch": 0.8926014319809069,
"grad_norm": 1.71793532371521,
"learning_rate": 6.287740369034485e-05,
"loss": 1.6597,
"step": 1122
},
{
"epoch": 0.8949880668257757,
"grad_norm": 1.8093537092208862,
"learning_rate": 6.26866247117555e-05,
"loss": 1.716,
"step": 1125
},
{
"epoch": 0.8973747016706444,
"grad_norm": 1.7055341005325317,
"learning_rate": 6.249564811851543e-05,
"loss": 1.9221,
"step": 1128
},
{
"epoch": 0.8997613365155132,
"grad_norm": 1.5543915033340454,
"learning_rate": 6.230447688539316e-05,
"loss": 1.7363,
"step": 1131
},
{
"epoch": 0.9021479713603818,
"grad_norm": 1.7169188261032104,
"learning_rate": 6.211311399018916e-05,
"loss": 1.8639,
"step": 1134
},
{
"epoch": 0.9045346062052506,
"grad_norm": 1.5219560861587524,
"learning_rate": 6.192156241368929e-05,
"loss": 1.8671,
"step": 1137
},
{
"epoch": 0.9069212410501193,
"grad_norm": 1.6446306705474854,
"learning_rate": 6.172982513961845e-05,
"loss": 1.8123,
"step": 1140
},
{
"epoch": 0.9093078758949881,
"grad_norm": 1.7986334562301636,
"learning_rate": 6.153790515459404e-05,
"loss": 1.7454,
"step": 1143
},
{
"epoch": 0.9116945107398569,
"grad_norm": 1.6070222854614258,
"learning_rate": 6.13458054480795e-05,
"loss": 1.9428,
"step": 1146
},
{
"epoch": 0.9140811455847255,
"grad_norm": 1.6163593530654907,
"learning_rate": 6.115352901233779e-05,
"loss": 1.9039,
"step": 1149
},
{
"epoch": 0.9164677804295943,
"grad_norm": 1.5768218040466309,
"learning_rate": 6.096107884238458e-05,
"loss": 1.7472,
"step": 1152
},
{
"epoch": 0.918854415274463,
"grad_norm": 1.4955110549926758,
"learning_rate": 6.0768457935941817e-05,
"loss": 1.8869,
"step": 1155
},
{
"epoch": 0.9212410501193318,
"grad_norm": 1.939584493637085,
"learning_rate": 6.0575669293390954e-05,
"loss": 1.908,
"step": 1158
},
{
"epoch": 0.9236276849642004,
"grad_norm": 1.7790099382400513,
"learning_rate": 6.038271591772615e-05,
"loss": 1.9309,
"step": 1161
},
{
"epoch": 0.9260143198090692,
"grad_norm": 1.8958290815353394,
"learning_rate": 6.0189600814507604e-05,
"loss": 1.9888,
"step": 1164
},
{
"epoch": 0.9284009546539379,
"grad_norm": 1.4420616626739502,
"learning_rate": 5.9996326991814654e-05,
"loss": 1.7128,
"step": 1167
},
{
"epoch": 0.9307875894988067,
"grad_norm": 1.9376964569091797,
"learning_rate": 5.980289746019892e-05,
"loss": 1.9666,
"step": 1170
},
{
"epoch": 0.9331742243436754,
"grad_norm": 1.6446843147277832,
"learning_rate": 5.9609315232637483e-05,
"loss": 1.6969,
"step": 1173
},
{
"epoch": 0.9355608591885441,
"grad_norm": 1.5497177839279175,
"learning_rate": 5.941558332448589e-05,
"loss": 1.7452,
"step": 1176
},
{
"epoch": 0.9379474940334129,
"grad_norm": 1.5692472457885742,
"learning_rate": 5.922170475343125e-05,
"loss": 1.7873,
"step": 1179
},
{
"epoch": 0.9403341288782816,
"grad_norm": 1.8818715810775757,
"learning_rate": 5.9027682539445104e-05,
"loss": 1.8012,
"step": 1182
},
{
"epoch": 0.9427207637231504,
"grad_norm": 1.5524557828903198,
"learning_rate": 5.883351970473654e-05,
"loss": 1.9377,
"step": 1185
},
{
"epoch": 0.9451073985680191,
"grad_norm": 1.6942998170852661,
"learning_rate": 5.863921927370498e-05,
"loss": 1.8297,
"step": 1188
},
{
"epoch": 0.9474940334128878,
"grad_norm": 1.5411245822906494,
"learning_rate": 5.8444784272893175e-05,
"loss": 1.7801,
"step": 1191
},
{
"epoch": 0.9498806682577565,
"grad_norm": 1.609163522720337,
"learning_rate": 5.8250217730939973e-05,
"loss": 1.7861,
"step": 1194
},
{
"epoch": 0.9522673031026253,
"grad_norm": 1.5381650924682617,
"learning_rate": 5.8055522678533225e-05,
"loss": 1.7624,
"step": 1197
},
{
"epoch": 0.954653937947494,
"grad_norm": 1.7099480628967285,
"learning_rate": 5.786070214836254e-05,
"loss": 1.732,
"step": 1200
},
{
"epoch": 0.9570405727923628,
"grad_norm": 1.4979294538497925,
"learning_rate": 5.7665759175072034e-05,
"loss": 1.9665,
"step": 1203
},
{
"epoch": 0.9594272076372315,
"grad_norm": 1.5802431106567383,
"learning_rate": 5.747069679521305e-05,
"loss": 1.8585,
"step": 1206
},
{
"epoch": 0.9618138424821002,
"grad_norm": 1.6456499099731445,
"learning_rate": 5.727551804719693e-05,
"loss": 1.8085,
"step": 1209
},
{
"epoch": 0.964200477326969,
"grad_norm": 1.7286982536315918,
"learning_rate": 5.708022597124758e-05,
"loss": 1.7829,
"step": 1212
},
{
"epoch": 0.9665871121718377,
"grad_norm": 1.9187157154083252,
"learning_rate": 5.688482360935423e-05,
"loss": 1.9609,
"step": 1215
},
{
"epoch": 0.9689737470167065,
"grad_norm": 1.4411349296569824,
"learning_rate": 5.668931400522396e-05,
"loss": 1.8859,
"step": 1218
},
{
"epoch": 0.9713603818615751,
"grad_norm": 1.7264066934585571,
"learning_rate": 5.649370020423431e-05,
"loss": 1.7933,
"step": 1221
},
{
"epoch": 0.9737470167064439,
"grad_norm": 1.6054697036743164,
"learning_rate": 5.629798525338589e-05,
"loss": 1.8167,
"step": 1224
},
{
"epoch": 0.9761336515513126,
"grad_norm": 1.7534525394439697,
"learning_rate": 5.6102172201254835e-05,
"loss": 1.8407,
"step": 1227
},
{
"epoch": 0.9785202863961814,
"grad_norm": 1.4455212354660034,
"learning_rate": 5.5906264097945407e-05,
"loss": 1.8722,
"step": 1230
},
{
"epoch": 0.9809069212410502,
"grad_norm": 1.5002778768539429,
"learning_rate": 5.5710263995042434e-05,
"loss": 1.9403,
"step": 1233
},
{
"epoch": 0.9832935560859188,
"grad_norm": 1.5572600364685059,
"learning_rate": 5.551417494556376e-05,
"loss": 1.8539,
"step": 1236
},
{
"epoch": 0.9856801909307876,
"grad_norm": 1.4347857236862183,
"learning_rate": 5.531800000391275e-05,
"loss": 1.8457,
"step": 1239
},
{
"epoch": 0.9880668257756563,
"grad_norm": 1.5811883211135864,
"learning_rate": 5.5121742225830665e-05,
"loss": 1.9307,
"step": 1242
},
{
"epoch": 0.9904534606205251,
"grad_norm": 1.734519124031067,
"learning_rate": 5.4925404668349076e-05,
"loss": 1.8851,
"step": 1245
},
{
"epoch": 0.9928400954653938,
"grad_norm": 1.4903361797332764,
"learning_rate": 5.472899038974225e-05,
"loss": 1.8052,
"step": 1248
},
{
"epoch": 0.9952267303102625,
"grad_norm": 1.638721227645874,
"learning_rate": 5.45325024494795e-05,
"loss": 1.8556,
"step": 1251
},
{
"epoch": 0.9976133651551312,
"grad_norm": 1.70747971534729,
"learning_rate": 5.433594390817756e-05,
"loss": 1.9593,
"step": 1254
},
{
"epoch": 1.0,
"grad_norm": 2.315592050552368,
"learning_rate": 5.413931782755283e-05,
"loss": 1.9452,
"step": 1257
}
],
"logging_steps": 3,
"max_steps": 2514,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1257,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.127117763309732e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}