code-full-hard / trainer_state.json
cterdam's picture
Upload 10 files
7e9add2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 10000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 17.069135665893555,
"learning_rate": 3.96e-06,
"loss": 1.9267,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 7.49233341217041,
"learning_rate": 7.960000000000002e-06,
"loss": 1.7814,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 10.066174507141113,
"learning_rate": 1.196e-05,
"loss": 1.7612,
"step": 300
},
{
"epoch": 0.02,
"grad_norm": 8.212127685546875,
"learning_rate": 1.5960000000000003e-05,
"loss": 1.7866,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 5.494411945343018,
"learning_rate": 1.9960000000000002e-05,
"loss": 1.7809,
"step": 500
},
{
"epoch": 0.03,
"grad_norm": 6.4578728675842285,
"learning_rate": 1.9980100502512564e-05,
"loss": 1.7893,
"step": 600
},
{
"epoch": 0.04,
"grad_norm": 4.508376121520996,
"learning_rate": 1.996020100502513e-05,
"loss": 1.7851,
"step": 700
},
{
"epoch": 0.04,
"grad_norm": 6.78291654586792,
"learning_rate": 1.9940100502512564e-05,
"loss": 1.758,
"step": 800
},
{
"epoch": 0.04,
"grad_norm": 3.534212589263916,
"learning_rate": 1.9920000000000002e-05,
"loss": 1.7659,
"step": 900
},
{
"epoch": 0.05,
"grad_norm": 6.295835018157959,
"learning_rate": 1.9899899497487437e-05,
"loss": 1.7201,
"step": 1000
},
{
"epoch": 0.06,
"grad_norm": 5.502695083618164,
"learning_rate": 1.987979899497488e-05,
"loss": 1.7634,
"step": 1100
},
{
"epoch": 0.06,
"grad_norm": 6.535002708435059,
"learning_rate": 1.9859698492462313e-05,
"loss": 1.7322,
"step": 1200
},
{
"epoch": 0.07,
"grad_norm": 5.8399834632873535,
"learning_rate": 1.983959798994975e-05,
"loss": 1.7306,
"step": 1300
},
{
"epoch": 0.07,
"grad_norm": 3.5027928352355957,
"learning_rate": 1.9819497487437185e-05,
"loss": 1.6763,
"step": 1400
},
{
"epoch": 0.07,
"grad_norm": 6.470935821533203,
"learning_rate": 1.9799396984924623e-05,
"loss": 1.687,
"step": 1500
},
{
"epoch": 0.08,
"grad_norm": 6.546024799346924,
"learning_rate": 1.977929648241206e-05,
"loss": 1.6944,
"step": 1600
},
{
"epoch": 0.09,
"grad_norm": 6.186180591583252,
"learning_rate": 1.97591959798995e-05,
"loss": 1.7085,
"step": 1700
},
{
"epoch": 0.09,
"grad_norm": 3.819445848464966,
"learning_rate": 1.9739095477386937e-05,
"loss": 1.6765,
"step": 1800
},
{
"epoch": 0.1,
"grad_norm": 4.2868876457214355,
"learning_rate": 1.9718994974874372e-05,
"loss": 1.6888,
"step": 1900
},
{
"epoch": 0.1,
"grad_norm": 6.690129280090332,
"learning_rate": 1.969889447236181e-05,
"loss": 1.675,
"step": 2000
},
{
"epoch": 0.1,
"grad_norm": 7.435989856719971,
"learning_rate": 1.9678793969849248e-05,
"loss": 1.6913,
"step": 2100
},
{
"epoch": 0.11,
"grad_norm": 6.209521293640137,
"learning_rate": 1.9658693467336686e-05,
"loss": 1.7021,
"step": 2200
},
{
"epoch": 0.12,
"grad_norm": 7.704258441925049,
"learning_rate": 1.963859296482412e-05,
"loss": 1.6973,
"step": 2300
},
{
"epoch": 0.12,
"grad_norm": 6.3551025390625,
"learning_rate": 1.9618492462311562e-05,
"loss": 1.6951,
"step": 2400
},
{
"epoch": 0.12,
"grad_norm": 4.278153419494629,
"learning_rate": 1.9598391959798996e-05,
"loss": 1.6749,
"step": 2500
},
{
"epoch": 0.13,
"grad_norm": 5.053964614868164,
"learning_rate": 1.9578291457286434e-05,
"loss": 1.6839,
"step": 2600
},
{
"epoch": 0.14,
"grad_norm": 5.125670909881592,
"learning_rate": 1.955819095477387e-05,
"loss": 1.6734,
"step": 2700
},
{
"epoch": 0.14,
"grad_norm": 3.6489973068237305,
"learning_rate": 1.953809045226131e-05,
"loss": 1.6436,
"step": 2800
},
{
"epoch": 0.14,
"grad_norm": 6.31420373916626,
"learning_rate": 1.9517989949748745e-05,
"loss": 1.6781,
"step": 2900
},
{
"epoch": 0.15,
"grad_norm": 4.799983501434326,
"learning_rate": 1.9497889447236183e-05,
"loss": 1.6766,
"step": 3000
},
{
"epoch": 0.15,
"grad_norm": 3.4388930797576904,
"learning_rate": 1.9477788944723618e-05,
"loss": 1.6613,
"step": 3100
},
{
"epoch": 0.16,
"grad_norm": 6.100885391235352,
"learning_rate": 1.945768844221106e-05,
"loss": 1.6885,
"step": 3200
},
{
"epoch": 0.17,
"grad_norm": 5.916701793670654,
"learning_rate": 1.9437587939698493e-05,
"loss": 1.6705,
"step": 3300
},
{
"epoch": 0.17,
"grad_norm": 4.234365463256836,
"learning_rate": 1.941748743718593e-05,
"loss": 1.6803,
"step": 3400
},
{
"epoch": 0.17,
"grad_norm": 5.592808723449707,
"learning_rate": 1.939738693467337e-05,
"loss": 1.6222,
"step": 3500
},
{
"epoch": 0.18,
"grad_norm": 6.33900260925293,
"learning_rate": 1.9377286432160804e-05,
"loss": 1.6275,
"step": 3600
},
{
"epoch": 0.18,
"grad_norm": 5.385664939880371,
"learning_rate": 1.9357185929648242e-05,
"loss": 1.6234,
"step": 3700
},
{
"epoch": 0.19,
"grad_norm": 6.868696689605713,
"learning_rate": 1.933708542713568e-05,
"loss": 1.6584,
"step": 3800
},
{
"epoch": 0.2,
"grad_norm": 5.236993789672852,
"learning_rate": 1.9316984924623118e-05,
"loss": 1.6342,
"step": 3900
},
{
"epoch": 0.2,
"grad_norm": 5.226047515869141,
"learning_rate": 1.9296884422110552e-05,
"loss": 1.6109,
"step": 4000
},
{
"epoch": 0.2,
"grad_norm": 6.2877655029296875,
"learning_rate": 1.9276783919597994e-05,
"loss": 1.632,
"step": 4100
},
{
"epoch": 0.21,
"grad_norm": 4.26241397857666,
"learning_rate": 1.925668341708543e-05,
"loss": 1.5813,
"step": 4200
},
{
"epoch": 0.21,
"grad_norm": 4.126381874084473,
"learning_rate": 1.9236582914572866e-05,
"loss": 1.6013,
"step": 4300
},
{
"epoch": 0.22,
"grad_norm": 4.520874977111816,
"learning_rate": 1.92164824120603e-05,
"loss": 1.6499,
"step": 4400
},
{
"epoch": 0.23,
"grad_norm": 4.288824558258057,
"learning_rate": 1.9196381909547742e-05,
"loss": 1.6151,
"step": 4500
},
{
"epoch": 0.23,
"grad_norm": 5.139670372009277,
"learning_rate": 1.9176281407035177e-05,
"loss": 1.61,
"step": 4600
},
{
"epoch": 0.23,
"grad_norm": 5.32205867767334,
"learning_rate": 1.9156180904522615e-05,
"loss": 1.6125,
"step": 4700
},
{
"epoch": 0.24,
"grad_norm": 3.354684829711914,
"learning_rate": 1.913608040201005e-05,
"loss": 1.6219,
"step": 4800
},
{
"epoch": 0.24,
"grad_norm": 4.567023754119873,
"learning_rate": 1.911597989949749e-05,
"loss": 1.6326,
"step": 4900
},
{
"epoch": 0.25,
"grad_norm": 5.753734588623047,
"learning_rate": 1.9095879396984925e-05,
"loss": 1.5992,
"step": 5000
},
{
"epoch": 0.26,
"grad_norm": 4.663640975952148,
"learning_rate": 1.9075778894472363e-05,
"loss": 1.5792,
"step": 5100
},
{
"epoch": 0.26,
"grad_norm": 5.779416561126709,
"learning_rate": 1.90556783919598e-05,
"loss": 1.6072,
"step": 5200
},
{
"epoch": 0.27,
"grad_norm": 5.422142505645752,
"learning_rate": 1.903557788944724e-05,
"loss": 1.6244,
"step": 5300
},
{
"epoch": 0.27,
"grad_norm": 4.999263286590576,
"learning_rate": 1.9015477386934674e-05,
"loss": 1.6056,
"step": 5400
},
{
"epoch": 0.28,
"grad_norm": 7.253634452819824,
"learning_rate": 1.8995376884422112e-05,
"loss": 1.5862,
"step": 5500
},
{
"epoch": 0.28,
"grad_norm": 5.289574146270752,
"learning_rate": 1.897527638190955e-05,
"loss": 1.6188,
"step": 5600
},
{
"epoch": 0.28,
"grad_norm": 4.021580219268799,
"learning_rate": 1.8955175879396988e-05,
"loss": 1.5506,
"step": 5700
},
{
"epoch": 0.29,
"grad_norm": 4.235142707824707,
"learning_rate": 1.893527638190955e-05,
"loss": 1.5975,
"step": 5800
},
{
"epoch": 0.29,
"grad_norm": 5.086207866668701,
"learning_rate": 1.8915175879396988e-05,
"loss": 1.5796,
"step": 5900
},
{
"epoch": 0.3,
"grad_norm": 5.470453262329102,
"learning_rate": 1.8895075376884423e-05,
"loss": 1.5871,
"step": 6000
},
{
"epoch": 0.3,
"grad_norm": 6.826024532318115,
"learning_rate": 1.887497487437186e-05,
"loss": 1.5398,
"step": 6100
},
{
"epoch": 0.31,
"grad_norm": 4.6210761070251465,
"learning_rate": 1.88548743718593e-05,
"loss": 1.6186,
"step": 6200
},
{
"epoch": 0.32,
"grad_norm": 5.72627592086792,
"learning_rate": 1.8834773869346733e-05,
"loss": 1.5377,
"step": 6300
},
{
"epoch": 0.32,
"grad_norm": 6.252579212188721,
"learning_rate": 1.881467336683417e-05,
"loss": 1.6209,
"step": 6400
},
{
"epoch": 0.33,
"grad_norm": 5.615140914916992,
"learning_rate": 1.879457286432161e-05,
"loss": 1.5482,
"step": 6500
},
{
"epoch": 0.33,
"grad_norm": 3.57523512840271,
"learning_rate": 1.8774472361809047e-05,
"loss": 1.5903,
"step": 6600
},
{
"epoch": 0.34,
"grad_norm": 7.865254878997803,
"learning_rate": 1.8754371859296482e-05,
"loss": 1.596,
"step": 6700
},
{
"epoch": 0.34,
"grad_norm": 5.929252624511719,
"learning_rate": 1.8734271356783923e-05,
"loss": 1.538,
"step": 6800
},
{
"epoch": 0.34,
"grad_norm": 3.798943281173706,
"learning_rate": 1.8714170854271358e-05,
"loss": 1.5793,
"step": 6900
},
{
"epoch": 0.35,
"grad_norm": 6.028155326843262,
"learning_rate": 1.8694070351758796e-05,
"loss": 1.5486,
"step": 7000
},
{
"epoch": 0.35,
"grad_norm": 4.279233932495117,
"learning_rate": 1.867396984924623e-05,
"loss": 1.5538,
"step": 7100
},
{
"epoch": 0.36,
"grad_norm": 3.5549476146698,
"learning_rate": 1.8653869346733672e-05,
"loss": 1.571,
"step": 7200
},
{
"epoch": 0.36,
"grad_norm": 6.669614315032959,
"learning_rate": 1.8633768844221106e-05,
"loss": 1.5943,
"step": 7300
},
{
"epoch": 0.37,
"grad_norm": 5.953083515167236,
"learning_rate": 1.8613668341708544e-05,
"loss": 1.5622,
"step": 7400
},
{
"epoch": 0.38,
"grad_norm": 4.570889472961426,
"learning_rate": 1.8593567839195982e-05,
"loss": 1.5548,
"step": 7500
},
{
"epoch": 0.38,
"grad_norm": 6.883716583251953,
"learning_rate": 1.857346733668342e-05,
"loss": 1.5523,
"step": 7600
},
{
"epoch": 0.39,
"grad_norm": 5.618870258331299,
"learning_rate": 1.8553366834170855e-05,
"loss": 1.573,
"step": 7700
},
{
"epoch": 0.39,
"grad_norm": 6.305469989776611,
"learning_rate": 1.8533266331658293e-05,
"loss": 1.5568,
"step": 7800
},
{
"epoch": 0.4,
"grad_norm": 5.411215782165527,
"learning_rate": 1.851316582914573e-05,
"loss": 1.5381,
"step": 7900
},
{
"epoch": 0.4,
"grad_norm": 4.859926700592041,
"learning_rate": 1.849306532663317e-05,
"loss": 1.5554,
"step": 8000
},
{
"epoch": 0.41,
"grad_norm": 3.7912240028381348,
"learning_rate": 1.8473165829145728e-05,
"loss": 1.4991,
"step": 8100
},
{
"epoch": 0.41,
"grad_norm": 5.7156901359558105,
"learning_rate": 1.845306532663317e-05,
"loss": 1.567,
"step": 8200
},
{
"epoch": 0.41,
"grad_norm": 5.494976043701172,
"learning_rate": 1.8432964824120604e-05,
"loss": 1.5879,
"step": 8300
},
{
"epoch": 0.42,
"grad_norm": 4.401707172393799,
"learning_rate": 1.8412864321608042e-05,
"loss": 1.5464,
"step": 8400
},
{
"epoch": 0.42,
"grad_norm": 6.136995315551758,
"learning_rate": 1.839276381909548e-05,
"loss": 1.5808,
"step": 8500
},
{
"epoch": 0.43,
"grad_norm": 5.482088565826416,
"learning_rate": 1.8372663316582918e-05,
"loss": 1.5014,
"step": 8600
},
{
"epoch": 0.43,
"grad_norm": 5.293674945831299,
"learning_rate": 1.8352562814070352e-05,
"loss": 1.5409,
"step": 8700
},
{
"epoch": 0.44,
"grad_norm": 6.073010444641113,
"learning_rate": 1.833246231155779e-05,
"loss": 1.5292,
"step": 8800
},
{
"epoch": 0.45,
"grad_norm": 5.509156703948975,
"learning_rate": 1.8312361809045228e-05,
"loss": 1.5243,
"step": 8900
},
{
"epoch": 0.45,
"grad_norm": 7.400144577026367,
"learning_rate": 1.8292261306532663e-05,
"loss": 1.5577,
"step": 9000
},
{
"epoch": 0.46,
"grad_norm": 6.070242881774902,
"learning_rate": 1.8272160804020104e-05,
"loss": 1.5394,
"step": 9100
},
{
"epoch": 0.46,
"grad_norm": 5.7604804039001465,
"learning_rate": 1.825206030150754e-05,
"loss": 1.5335,
"step": 9200
},
{
"epoch": 0.47,
"grad_norm": 4.665848731994629,
"learning_rate": 1.8231959798994977e-05,
"loss": 1.5484,
"step": 9300
},
{
"epoch": 0.47,
"grad_norm": 3.539947271347046,
"learning_rate": 1.821185929648241e-05,
"loss": 1.5424,
"step": 9400
},
{
"epoch": 0.47,
"grad_norm": 6.976469039916992,
"learning_rate": 1.8191758793969853e-05,
"loss": 1.5128,
"step": 9500
},
{
"epoch": 0.48,
"grad_norm": 4.112757682800293,
"learning_rate": 1.8171658291457287e-05,
"loss": 1.5085,
"step": 9600
},
{
"epoch": 0.48,
"grad_norm": 4.922628402709961,
"learning_rate": 1.8151557788944725e-05,
"loss": 1.5107,
"step": 9700
},
{
"epoch": 0.49,
"grad_norm": 6.697315692901611,
"learning_rate": 1.813145728643216e-05,
"loss": 1.5258,
"step": 9800
},
{
"epoch": 0.49,
"grad_norm": 7.6928253173828125,
"learning_rate": 1.81113567839196e-05,
"loss": 1.525,
"step": 9900
},
{
"epoch": 0.5,
"grad_norm": 3.736105442047119,
"learning_rate": 1.8091256281407036e-05,
"loss": 1.5156,
"step": 10000
},
{
"epoch": 0.5,
"eval_loss": 1.5033862590789795,
"eval_runtime": 21.8773,
"eval_samples_per_second": 45.709,
"eval_steps_per_second": 5.714,
"step": 10000
},
{
"epoch": 0.51,
"grad_norm": 6.009729385375977,
"learning_rate": 1.8071155778894474e-05,
"loss": 1.564,
"step": 10100
},
{
"epoch": 0.51,
"grad_norm": 5.207152366638184,
"learning_rate": 1.8051256281407036e-05,
"loss": 1.5506,
"step": 10200
},
{
"epoch": 0.52,
"grad_norm": 4.1969685554504395,
"learning_rate": 1.8031155778894474e-05,
"loss": 1.508,
"step": 10300
},
{
"epoch": 0.52,
"grad_norm": 4.609541416168213,
"learning_rate": 1.801105527638191e-05,
"loss": 1.5556,
"step": 10400
},
{
"epoch": 0.53,
"grad_norm": 4.768685340881348,
"learning_rate": 1.799095477386935e-05,
"loss": 1.5357,
"step": 10500
},
{
"epoch": 0.53,
"grad_norm": 6.3910651206970215,
"learning_rate": 1.7970854271356785e-05,
"loss": 1.5377,
"step": 10600
},
{
"epoch": 0.54,
"grad_norm": 4.312323570251465,
"learning_rate": 1.7950753768844223e-05,
"loss": 1.5454,
"step": 10700
},
{
"epoch": 0.54,
"grad_norm": 3.9479427337646484,
"learning_rate": 1.793065326633166e-05,
"loss": 1.4863,
"step": 10800
},
{
"epoch": 0.55,
"grad_norm": 5.744295120239258,
"learning_rate": 1.79105527638191e-05,
"loss": 1.5507,
"step": 10900
},
{
"epoch": 0.55,
"grad_norm": 4.2211833000183105,
"learning_rate": 1.7890452261306533e-05,
"loss": 1.5182,
"step": 11000
},
{
"epoch": 0.56,
"grad_norm": 4.841630458831787,
"learning_rate": 1.787035175879397e-05,
"loss": 1.4814,
"step": 11100
},
{
"epoch": 0.56,
"grad_norm": 6.714913845062256,
"learning_rate": 1.785025125628141e-05,
"loss": 1.4904,
"step": 11200
},
{
"epoch": 0.56,
"grad_norm": 6.587597846984863,
"learning_rate": 1.7830150753768847e-05,
"loss": 1.5045,
"step": 11300
},
{
"epoch": 0.57,
"grad_norm": 4.343375205993652,
"learning_rate": 1.7810050251256285e-05,
"loss": 1.5143,
"step": 11400
},
{
"epoch": 0.57,
"grad_norm": 7.786270618438721,
"learning_rate": 1.778994974874372e-05,
"loss": 1.496,
"step": 11500
},
{
"epoch": 0.58,
"grad_norm": 7.0261054039001465,
"learning_rate": 1.7769849246231158e-05,
"loss": 1.5113,
"step": 11600
},
{
"epoch": 0.58,
"grad_norm": 5.448154449462891,
"learning_rate": 1.7749748743718592e-05,
"loss": 1.4699,
"step": 11700
},
{
"epoch": 0.59,
"grad_norm": 5.4564361572265625,
"learning_rate": 1.7729648241206034e-05,
"loss": 1.4909,
"step": 11800
},
{
"epoch": 0.59,
"grad_norm": 5.704242706298828,
"learning_rate": 1.7709547738693468e-05,
"loss": 1.4931,
"step": 11900
},
{
"epoch": 0.6,
"grad_norm": 4.819602966308594,
"learning_rate": 1.7689447236180906e-05,
"loss": 1.4981,
"step": 12000
},
{
"epoch": 0.6,
"grad_norm": 2.80843186378479,
"learning_rate": 1.766934673366834e-05,
"loss": 1.4753,
"step": 12100
},
{
"epoch": 0.61,
"grad_norm": 4.010366439819336,
"learning_rate": 1.7649246231155782e-05,
"loss": 1.4899,
"step": 12200
},
{
"epoch": 0.61,
"grad_norm": 6.8596391677856445,
"learning_rate": 1.7629145728643217e-05,
"loss": 1.4951,
"step": 12300
},
{
"epoch": 0.62,
"grad_norm": 5.7791643142700195,
"learning_rate": 1.7609246231155782e-05,
"loss": 1.4841,
"step": 12400
},
{
"epoch": 0.62,
"grad_norm": 4.629549503326416,
"learning_rate": 1.7589145728643217e-05,
"loss": 1.4918,
"step": 12500
},
{
"epoch": 0.63,
"grad_norm": 4.676841735839844,
"learning_rate": 1.7569045226130655e-05,
"loss": 1.4934,
"step": 12600
},
{
"epoch": 0.64,
"grad_norm": 5.469869613647461,
"learning_rate": 1.754894472361809e-05,
"loss": 1.4767,
"step": 12700
},
{
"epoch": 0.64,
"grad_norm": 4.605990886688232,
"learning_rate": 1.752884422110553e-05,
"loss": 1.4635,
"step": 12800
},
{
"epoch": 0.65,
"grad_norm": 5.055588722229004,
"learning_rate": 1.7508743718592966e-05,
"loss": 1.4925,
"step": 12900
},
{
"epoch": 0.65,
"grad_norm": 4.2058916091918945,
"learning_rate": 1.7488643216080404e-05,
"loss": 1.4988,
"step": 13000
},
{
"epoch": 0.66,
"grad_norm": 13.90904426574707,
"learning_rate": 1.7468542713567838e-05,
"loss": 1.4883,
"step": 13100
},
{
"epoch": 0.66,
"grad_norm": 5.441417694091797,
"learning_rate": 1.744844221105528e-05,
"loss": 1.4817,
"step": 13200
},
{
"epoch": 0.67,
"grad_norm": 4.476733684539795,
"learning_rate": 1.7428341708542714e-05,
"loss": 1.5099,
"step": 13300
},
{
"epoch": 0.67,
"grad_norm": 4.831094264984131,
"learning_rate": 1.7408241206030152e-05,
"loss": 1.4817,
"step": 13400
},
{
"epoch": 0.68,
"grad_norm": 6.846999645233154,
"learning_rate": 1.738814070351759e-05,
"loss": 1.473,
"step": 13500
},
{
"epoch": 0.68,
"grad_norm": 4.799276351928711,
"learning_rate": 1.7368040201005028e-05,
"loss": 1.4969,
"step": 13600
},
{
"epoch": 0.69,
"grad_norm": 7.644506931304932,
"learning_rate": 1.7347939698492463e-05,
"loss": 1.4836,
"step": 13700
},
{
"epoch": 0.69,
"grad_norm": 4.794766426086426,
"learning_rate": 1.73278391959799e-05,
"loss": 1.4993,
"step": 13800
},
{
"epoch": 0.69,
"grad_norm": 5.148614406585693,
"learning_rate": 1.730773869346734e-05,
"loss": 1.4697,
"step": 13900
},
{
"epoch": 0.7,
"grad_norm": 3.6471970081329346,
"learning_rate": 1.7287638190954777e-05,
"loss": 1.4811,
"step": 14000
},
{
"epoch": 0.7,
"grad_norm": 5.793773174285889,
"learning_rate": 1.7267537688442214e-05,
"loss": 1.4988,
"step": 14100
},
{
"epoch": 0.71,
"grad_norm": 4.239154815673828,
"learning_rate": 1.724743718592965e-05,
"loss": 1.4673,
"step": 14200
},
{
"epoch": 0.71,
"grad_norm": 5.415383815765381,
"learning_rate": 1.7227336683417087e-05,
"loss": 1.4966,
"step": 14300
},
{
"epoch": 0.72,
"grad_norm": 4.704416275024414,
"learning_rate": 1.720723618090452e-05,
"loss": 1.4996,
"step": 14400
},
{
"epoch": 0.72,
"grad_norm": 3.5393893718719482,
"learning_rate": 1.7187336683417087e-05,
"loss": 1.4677,
"step": 14500
},
{
"epoch": 0.73,
"grad_norm": 3.5352883338928223,
"learning_rate": 1.7167236180904522e-05,
"loss": 1.4739,
"step": 14600
},
{
"epoch": 0.73,
"grad_norm": 5.679812431335449,
"learning_rate": 1.7147135678391963e-05,
"loss": 1.4556,
"step": 14700
},
{
"epoch": 0.74,
"grad_norm": Infinity,
"learning_rate": 1.7127236180904526e-05,
"loss": 1.4665,
"step": 14800
},
{
"epoch": 0.74,
"grad_norm": 5.901428699493408,
"learning_rate": 1.710713567839196e-05,
"loss": 1.464,
"step": 14900
},
{
"epoch": 0.75,
"grad_norm": 4.5120744705200195,
"learning_rate": 1.70870351758794e-05,
"loss": 1.4752,
"step": 15000
},
{
"epoch": 0.76,
"grad_norm": 4.715979099273682,
"learning_rate": 1.7066934673366836e-05,
"loss": 1.4632,
"step": 15100
},
{
"epoch": 0.76,
"grad_norm": 5.309842109680176,
"learning_rate": 1.704683417085427e-05,
"loss": 1.4474,
"step": 15200
},
{
"epoch": 0.77,
"grad_norm": 4.106339454650879,
"learning_rate": 1.7026733668341712e-05,
"loss": 1.4706,
"step": 15300
},
{
"epoch": 0.77,
"grad_norm": 9.052672386169434,
"learning_rate": 1.7006633165829147e-05,
"loss": 1.4662,
"step": 15400
},
{
"epoch": 0.78,
"grad_norm": 5.6153059005737305,
"learning_rate": 1.6986532663316585e-05,
"loss": 1.485,
"step": 15500
},
{
"epoch": 0.78,
"grad_norm": 4.049362659454346,
"learning_rate": 1.696643216080402e-05,
"loss": 1.4847,
"step": 15600
},
{
"epoch": 0.79,
"grad_norm": 4.916749477386475,
"learning_rate": 1.694633165829146e-05,
"loss": 1.4487,
"step": 15700
},
{
"epoch": 0.79,
"grad_norm": 4.682246685028076,
"learning_rate": 1.6926231155778895e-05,
"loss": 1.4227,
"step": 15800
},
{
"epoch": 0.8,
"grad_norm": 3.3342747688293457,
"learning_rate": 1.6906130653266333e-05,
"loss": 1.4564,
"step": 15900
},
{
"epoch": 0.8,
"grad_norm": 5.837874412536621,
"learning_rate": 1.6886030150753768e-05,
"loss": 1.4759,
"step": 16000
},
{
"epoch": 0.81,
"grad_norm": 4.549025058746338,
"learning_rate": 1.686592964824121e-05,
"loss": 1.4381,
"step": 16100
},
{
"epoch": 0.81,
"grad_norm": 6.344630718231201,
"learning_rate": 1.6845829145728644e-05,
"loss": 1.4824,
"step": 16200
},
{
"epoch": 0.81,
"grad_norm": 4.8240485191345215,
"learning_rate": 1.6825728643216082e-05,
"loss": 1.4631,
"step": 16300
},
{
"epoch": 0.82,
"grad_norm": 4.205628395080566,
"learning_rate": 1.680562814070352e-05,
"loss": 1.4683,
"step": 16400
},
{
"epoch": 0.82,
"grad_norm": 4.132819652557373,
"learning_rate": 1.6785527638190958e-05,
"loss": 1.4209,
"step": 16500
},
{
"epoch": 0.83,
"grad_norm": 4.3151140213012695,
"learning_rate": 1.6765427135678392e-05,
"loss": 1.4468,
"step": 16600
},
{
"epoch": 0.83,
"grad_norm": 5.677152633666992,
"learning_rate": 1.674532663316583e-05,
"loss": 1.4198,
"step": 16700
},
{
"epoch": 0.84,
"grad_norm": 4.871775150299072,
"learning_rate": 1.672522613065327e-05,
"loss": 1.4904,
"step": 16800
},
{
"epoch": 0.84,
"grad_norm": 5.693517208099365,
"learning_rate": 1.6705125628140706e-05,
"loss": 1.4592,
"step": 16900
},
{
"epoch": 0.85,
"grad_norm": 4.093634605407715,
"learning_rate": 1.668502512562814e-05,
"loss": 1.4561,
"step": 17000
},
{
"epoch": 0.85,
"grad_norm": 6.508328437805176,
"learning_rate": 1.666492462311558e-05,
"loss": 1.456,
"step": 17100
},
{
"epoch": 0.86,
"grad_norm": 8.67950439453125,
"learning_rate": 1.6644824120603017e-05,
"loss": 1.4432,
"step": 17200
},
{
"epoch": 0.86,
"grad_norm": 6.435894012451172,
"learning_rate": 1.662472361809045e-05,
"loss": 1.4807,
"step": 17300
},
{
"epoch": 0.87,
"grad_norm": 4.387815952301025,
"learning_rate": 1.6604623115577893e-05,
"loss": 1.4171,
"step": 17400
},
{
"epoch": 0.88,
"grad_norm": 5.042853832244873,
"learning_rate": 1.6584522613065327e-05,
"loss": 1.4361,
"step": 17500
},
{
"epoch": 0.88,
"grad_norm": 4.579937934875488,
"learning_rate": 1.6564422110552765e-05,
"loss": 1.4752,
"step": 17600
},
{
"epoch": 0.89,
"grad_norm": 8.229300498962402,
"learning_rate": 1.65443216080402e-05,
"loss": 1.4058,
"step": 17700
},
{
"epoch": 0.89,
"grad_norm": 5.82681131362915,
"learning_rate": 1.652422110552764e-05,
"loss": 1.4353,
"step": 17800
},
{
"epoch": 0.9,
"grad_norm": 3.8094637393951416,
"learning_rate": 1.6504120603015076e-05,
"loss": 1.4043,
"step": 17900
},
{
"epoch": 0.9,
"grad_norm": 5.825170993804932,
"learning_rate": 1.6484020100502514e-05,
"loss": 1.458,
"step": 18000
},
{
"epoch": 0.91,
"grad_norm": 5.906398773193359,
"learning_rate": 1.6463919597989952e-05,
"loss": 1.4373,
"step": 18100
},
{
"epoch": 0.91,
"grad_norm": 4.367284774780273,
"learning_rate": 1.644381909547739e-05,
"loss": 1.4119,
"step": 18200
},
{
"epoch": 0.92,
"grad_norm": 4.767496585845947,
"learning_rate": 1.6423718592964824e-05,
"loss": 1.4378,
"step": 18300
},
{
"epoch": 0.92,
"grad_norm": 2.3912670612335205,
"learning_rate": 1.6403618090452262e-05,
"loss": 1.4452,
"step": 18400
},
{
"epoch": 0.93,
"grad_norm": 4.836172580718994,
"learning_rate": 1.63835175879397e-05,
"loss": 1.4054,
"step": 18500
},
{
"epoch": 0.93,
"grad_norm": 7.216467380523682,
"learning_rate": 1.636341708542714e-05,
"loss": 1.4604,
"step": 18600
},
{
"epoch": 0.94,
"grad_norm": 4.356799602508545,
"learning_rate": 1.6343316582914573e-05,
"loss": 1.4552,
"step": 18700
},
{
"epoch": 0.94,
"grad_norm": 3.337068557739258,
"learning_rate": 1.632321608040201e-05,
"loss": 1.4597,
"step": 18800
},
{
"epoch": 0.94,
"grad_norm": 4.059195518493652,
"learning_rate": 1.630311557788945e-05,
"loss": 1.467,
"step": 18900
},
{
"epoch": 0.95,
"grad_norm": 3.697249412536621,
"learning_rate": 1.6283015075376887e-05,
"loss": 1.445,
"step": 19000
},
{
"epoch": 0.95,
"grad_norm": 6.429022789001465,
"learning_rate": 1.6262914572864325e-05,
"loss": 1.4536,
"step": 19100
},
{
"epoch": 0.96,
"grad_norm": 5.085973739624023,
"learning_rate": 1.624281407035176e-05,
"loss": 1.4507,
"step": 19200
},
{
"epoch": 0.96,
"grad_norm": 4.309168815612793,
"learning_rate": 1.6222713567839197e-05,
"loss": 1.4319,
"step": 19300
},
{
"epoch": 0.97,
"grad_norm": 5.077241897583008,
"learning_rate": 1.6202613065326635e-05,
"loss": 1.4103,
"step": 19400
},
{
"epoch": 0.97,
"grad_norm": 3.984090566635132,
"learning_rate": 1.6182512562814073e-05,
"loss": 1.4104,
"step": 19500
},
{
"epoch": 0.98,
"grad_norm": 4.95877742767334,
"learning_rate": 1.6162412060301508e-05,
"loss": 1.4656,
"step": 19600
},
{
"epoch": 0.98,
"grad_norm": 4.357282638549805,
"learning_rate": 1.6142311557788946e-05,
"loss": 1.4453,
"step": 19700
},
{
"epoch": 0.99,
"grad_norm": 5.499750137329102,
"learning_rate": 1.6122211055276384e-05,
"loss": 1.409,
"step": 19800
},
{
"epoch": 0.99,
"grad_norm": 4.081977367401123,
"learning_rate": 1.6102110552763822e-05,
"loss": 1.41,
"step": 19900
},
{
"epoch": 1.0,
"grad_norm": 5.961399078369141,
"learning_rate": 1.6082010050251256e-05,
"loss": 1.393,
"step": 20000
},
{
"epoch": 1.0,
"eval_loss": 1.431087613105774,
"eval_runtime": 21.8687,
"eval_samples_per_second": 45.727,
"eval_steps_per_second": 5.716,
"step": 20000
},
{
"epoch": 1.0,
"grad_norm": 6.539051532745361,
"learning_rate": 1.6062110552763822e-05,
"loss": 1.4339,
"step": 20100
},
{
"epoch": 1.01,
"grad_norm": 7.362614631652832,
"learning_rate": 1.6042010050251257e-05,
"loss": 1.4047,
"step": 20200
},
{
"epoch": 1.01,
"grad_norm": 4.155520439147949,
"learning_rate": 1.6021909547738695e-05,
"loss": 1.3943,
"step": 20300
},
{
"epoch": 1.02,
"grad_norm": 4.347718715667725,
"learning_rate": 1.600180904522613e-05,
"loss": 1.432,
"step": 20400
},
{
"epoch": 1.02,
"grad_norm": 4.478184700012207,
"learning_rate": 1.598170854271357e-05,
"loss": 1.4201,
"step": 20500
},
{
"epoch": 1.03,
"grad_norm": 8.239706993103027,
"learning_rate": 1.5961608040201005e-05,
"loss": 1.3944,
"step": 20600
},
{
"epoch": 1.03,
"grad_norm": 6.581277370452881,
"learning_rate": 1.5941507537688443e-05,
"loss": 1.359,
"step": 20700
},
{
"epoch": 1.04,
"grad_norm": 4.083044528961182,
"learning_rate": 1.592140703517588e-05,
"loss": 1.4004,
"step": 20800
},
{
"epoch": 1.04,
"grad_norm": 5.052839756011963,
"learning_rate": 1.590130653266332e-05,
"loss": 1.4157,
"step": 20900
},
{
"epoch": 1.05,
"grad_norm": 3.8107857704162598,
"learning_rate": 1.5881206030150754e-05,
"loss": 1.3823,
"step": 21000
},
{
"epoch": 1.05,
"grad_norm": 6.1900954246521,
"learning_rate": 1.5861105527638192e-05,
"loss": 1.4194,
"step": 21100
},
{
"epoch": 1.06,
"grad_norm": 4.510327339172363,
"learning_rate": 1.584100502512563e-05,
"loss": 1.4173,
"step": 21200
},
{
"epoch": 1.06,
"grad_norm": 6.412552833557129,
"learning_rate": 1.5820904522613068e-05,
"loss": 1.3996,
"step": 21300
},
{
"epoch": 1.07,
"grad_norm": 5.167262077331543,
"learning_rate": 1.5800804020100506e-05,
"loss": 1.4007,
"step": 21400
},
{
"epoch": 1.07,
"grad_norm": 4.012689590454102,
"learning_rate": 1.578070351758794e-05,
"loss": 1.3875,
"step": 21500
},
{
"epoch": 1.08,
"grad_norm": 6.7843017578125,
"learning_rate": 1.5760603015075378e-05,
"loss": 1.4192,
"step": 21600
},
{
"epoch": 1.08,
"grad_norm": 5.4605207443237305,
"learning_rate": 1.5740502512562816e-05,
"loss": 1.421,
"step": 21700
},
{
"epoch": 1.09,
"grad_norm": 8.303611755371094,
"learning_rate": 1.5720402010050254e-05,
"loss": 1.45,
"step": 21800
},
{
"epoch": 1.09,
"grad_norm": 4.898472309112549,
"learning_rate": 1.570030150753769e-05,
"loss": 1.3982,
"step": 21900
},
{
"epoch": 1.1,
"grad_norm": 6.471447467803955,
"learning_rate": 1.5680201005025127e-05,
"loss": 1.4272,
"step": 22000
},
{
"epoch": 1.1,
"grad_norm": 7.5459885597229,
"learning_rate": 1.5660100502512565e-05,
"loss": 1.3861,
"step": 22100
},
{
"epoch": 1.11,
"grad_norm": 7.108932971954346,
"learning_rate": 1.5640000000000003e-05,
"loss": 1.3946,
"step": 22200
},
{
"epoch": 1.11,
"grad_norm": 5.083498954772949,
"learning_rate": 1.5619899497487437e-05,
"loss": 1.4006,
"step": 22300
},
{
"epoch": 1.12,
"grad_norm": 6.796627998352051,
"learning_rate": 1.5599798994974875e-05,
"loss": 1.4266,
"step": 22400
},
{
"epoch": 1.12,
"grad_norm": 5.1619367599487305,
"learning_rate": 1.5579698492462313e-05,
"loss": 1.3567,
"step": 22500
},
{
"epoch": 1.13,
"grad_norm": 5.548572063446045,
"learning_rate": 1.555959798994975e-05,
"loss": 1.4193,
"step": 22600
},
{
"epoch": 1.14,
"grad_norm": 4.019492149353027,
"learning_rate": 1.5539497487437186e-05,
"loss": 1.3988,
"step": 22700
},
{
"epoch": 1.14,
"grad_norm": 5.359696865081787,
"learning_rate": 1.5519396984924624e-05,
"loss": 1.4046,
"step": 22800
},
{
"epoch": 1.15,
"grad_norm": 3.4442272186279297,
"learning_rate": 1.5499296482412062e-05,
"loss": 1.3924,
"step": 22900
},
{
"epoch": 1.15,
"grad_norm": 5.838873863220215,
"learning_rate": 1.5479396984924624e-05,
"loss": 1.4141,
"step": 23000
},
{
"epoch": 1.16,
"grad_norm": 5.01621150970459,
"learning_rate": 1.5459296482412062e-05,
"loss": 1.3658,
"step": 23100
},
{
"epoch": 1.16,
"grad_norm": 5.7665205001831055,
"learning_rate": 1.54391959798995e-05,
"loss": 1.4245,
"step": 23200
},
{
"epoch": 1.17,
"grad_norm": 3.1527726650238037,
"learning_rate": 1.5419095477386935e-05,
"loss": 1.3695,
"step": 23300
},
{
"epoch": 1.17,
"grad_norm": 6.3304924964904785,
"learning_rate": 1.5398994974874373e-05,
"loss": 1.3978,
"step": 23400
},
{
"epoch": 1.18,
"grad_norm": 7.042291164398193,
"learning_rate": 1.537889447236181e-05,
"loss": 1.385,
"step": 23500
},
{
"epoch": 1.18,
"grad_norm": 4.397637844085693,
"learning_rate": 1.535879396984925e-05,
"loss": 1.3726,
"step": 23600
},
{
"epoch": 1.19,
"grad_norm": 3.7688262462615967,
"learning_rate": 1.5338693467336687e-05,
"loss": 1.3896,
"step": 23700
},
{
"epoch": 1.19,
"grad_norm": 4.961839199066162,
"learning_rate": 1.531859296482412e-05,
"loss": 1.3792,
"step": 23800
},
{
"epoch": 1.2,
"grad_norm": 4.08626127243042,
"learning_rate": 1.529849246231156e-05,
"loss": 1.3507,
"step": 23900
},
{
"epoch": 1.2,
"grad_norm": 4.055938243865967,
"learning_rate": 1.5278391959798997e-05,
"loss": 1.3873,
"step": 24000
},
{
"epoch": 1.21,
"grad_norm": 5.093524932861328,
"learning_rate": 1.5258291457286433e-05,
"loss": 1.3763,
"step": 24100
},
{
"epoch": 1.21,
"grad_norm": 5.755058288574219,
"learning_rate": 1.523819095477387e-05,
"loss": 1.4164,
"step": 24200
},
{
"epoch": 1.22,
"grad_norm": 4.845275402069092,
"learning_rate": 1.5218090452261308e-05,
"loss": 1.3103,
"step": 24300
},
{
"epoch": 1.22,
"grad_norm": 5.180044174194336,
"learning_rate": 1.5197989949748746e-05,
"loss": 1.3739,
"step": 24400
},
{
"epoch": 1.23,
"grad_norm": 5.913352012634277,
"learning_rate": 1.5178090452261306e-05,
"loss": 1.3946,
"step": 24500
},
{
"epoch": 1.23,
"grad_norm": 6.644520282745361,
"learning_rate": 1.5157989949748746e-05,
"loss": 1.3683,
"step": 24600
},
{
"epoch": 1.23,
"grad_norm": 4.617815017700195,
"learning_rate": 1.5137889447236182e-05,
"loss": 1.3588,
"step": 24700
},
{
"epoch": 1.24,
"grad_norm": 5.690709114074707,
"learning_rate": 1.5117788944723619e-05,
"loss": 1.4102,
"step": 24800
},
{
"epoch": 1.25,
"grad_norm": 4.181049823760986,
"learning_rate": 1.5097688442211057e-05,
"loss": 1.3777,
"step": 24900
},
{
"epoch": 1.25,
"grad_norm": 5.829825401306152,
"learning_rate": 1.5077587939698495e-05,
"loss": 1.4278,
"step": 25000
},
{
"epoch": 1.25,
"grad_norm": 4.409423351287842,
"learning_rate": 1.505748743718593e-05,
"loss": 1.3751,
"step": 25100
},
{
"epoch": 1.26,
"grad_norm": 5.290346145629883,
"learning_rate": 1.5037386934673369e-05,
"loss": 1.3752,
"step": 25200
},
{
"epoch": 1.27,
"grad_norm": 6.922583103179932,
"learning_rate": 1.5017286432160805e-05,
"loss": 1.3756,
"step": 25300
},
{
"epoch": 1.27,
"grad_norm": 4.969797134399414,
"learning_rate": 1.4997185929648241e-05,
"loss": 1.3808,
"step": 25400
},
{
"epoch": 1.27,
"grad_norm": 4.4493184089660645,
"learning_rate": 1.4977085427135681e-05,
"loss": 1.3836,
"step": 25500
},
{
"epoch": 1.28,
"grad_norm": 3.4044313430786133,
"learning_rate": 1.4956984924623117e-05,
"loss": 1.3737,
"step": 25600
},
{
"epoch": 1.28,
"grad_norm": 3.3968327045440674,
"learning_rate": 1.4936884422110554e-05,
"loss": 1.3776,
"step": 25700
},
{
"epoch": 1.29,
"grad_norm": 3.016774892807007,
"learning_rate": 1.491678391959799e-05,
"loss": 1.3642,
"step": 25800
},
{
"epoch": 1.29,
"grad_norm": 6.324804306030273,
"learning_rate": 1.489668341708543e-05,
"loss": 1.3838,
"step": 25900
},
{
"epoch": 1.3,
"grad_norm": 3.8945064544677734,
"learning_rate": 1.4876582914572866e-05,
"loss": 1.339,
"step": 26000
},
{
"epoch": 1.3,
"grad_norm": 6.601470470428467,
"learning_rate": 1.4856482412060302e-05,
"loss": 1.3609,
"step": 26100
},
{
"epoch": 1.31,
"grad_norm": 5.291379928588867,
"learning_rate": 1.4836381909547738e-05,
"loss": 1.3829,
"step": 26200
},
{
"epoch": 1.31,
"grad_norm": 5.4891462326049805,
"learning_rate": 1.4816281407035178e-05,
"loss": 1.3733,
"step": 26300
},
{
"epoch": 1.32,
"grad_norm": 4.400446891784668,
"learning_rate": 1.4796180904522614e-05,
"loss": 1.3601,
"step": 26400
},
{
"epoch": 1.32,
"grad_norm": 5.4064860343933105,
"learning_rate": 1.477608040201005e-05,
"loss": 1.3922,
"step": 26500
},
{
"epoch": 1.33,
"grad_norm": 6.4848737716674805,
"learning_rate": 1.4755979899497489e-05,
"loss": 1.3839,
"step": 26600
},
{
"epoch": 1.33,
"grad_norm": 3.8651046752929688,
"learning_rate": 1.4735879396984927e-05,
"loss": 1.3798,
"step": 26700
},
{
"epoch": 1.34,
"grad_norm": 6.21872615814209,
"learning_rate": 1.4715778894472363e-05,
"loss": 1.3771,
"step": 26800
},
{
"epoch": 1.34,
"grad_norm": 4.698353290557861,
"learning_rate": 1.46956783919598e-05,
"loss": 1.3794,
"step": 26900
},
{
"epoch": 1.35,
"grad_norm": 6.222665309906006,
"learning_rate": 1.4675577889447237e-05,
"loss": 1.3603,
"step": 27000
},
{
"epoch": 1.35,
"grad_norm": 5.659895896911621,
"learning_rate": 1.4655477386934675e-05,
"loss": 1.4091,
"step": 27100
},
{
"epoch": 1.36,
"grad_norm": 5.340900897979736,
"learning_rate": 1.4635376884422113e-05,
"loss": 1.385,
"step": 27200
},
{
"epoch": 1.36,
"grad_norm": 2.827996253967285,
"learning_rate": 1.461527638190955e-05,
"loss": 1.3724,
"step": 27300
},
{
"epoch": 1.37,
"grad_norm": 5.637544631958008,
"learning_rate": 1.4595175879396986e-05,
"loss": 1.3911,
"step": 27400
},
{
"epoch": 1.38,
"grad_norm": 3.459794282913208,
"learning_rate": 1.4575075376884422e-05,
"loss": 1.3946,
"step": 27500
},
{
"epoch": 1.38,
"grad_norm": 6.612933158874512,
"learning_rate": 1.4554974874371862e-05,
"loss": 1.404,
"step": 27600
},
{
"epoch": 1.39,
"grad_norm": 4.6960577964782715,
"learning_rate": 1.4534874371859298e-05,
"loss": 1.3571,
"step": 27700
},
{
"epoch": 1.39,
"grad_norm": 3.677015781402588,
"learning_rate": 1.4514773869346734e-05,
"loss": 1.4166,
"step": 27800
},
{
"epoch": 1.4,
"grad_norm": 4.411760330200195,
"learning_rate": 1.449467336683417e-05,
"loss": 1.3666,
"step": 27900
},
{
"epoch": 1.4,
"grad_norm": 4.236432075500488,
"learning_rate": 1.447457286432161e-05,
"loss": 1.3738,
"step": 28000
},
{
"epoch": 1.41,
"grad_norm": 7.484130859375,
"learning_rate": 1.4454472361809046e-05,
"loss": 1.3454,
"step": 28100
},
{
"epoch": 1.41,
"grad_norm": 4.38557243347168,
"learning_rate": 1.4434371859296483e-05,
"loss": 1.3842,
"step": 28200
},
{
"epoch": 1.42,
"grad_norm": 5.947939395904541,
"learning_rate": 1.441427135678392e-05,
"loss": 1.3391,
"step": 28300
},
{
"epoch": 1.42,
"grad_norm": 4.879386901855469,
"learning_rate": 1.4394170854271359e-05,
"loss": 1.3337,
"step": 28400
},
{
"epoch": 1.43,
"grad_norm": 5.369794845581055,
"learning_rate": 1.4374070351758795e-05,
"loss": 1.371,
"step": 28500
},
{
"epoch": 1.43,
"grad_norm": 6.312124252319336,
"learning_rate": 1.4353969849246233e-05,
"loss": 1.3321,
"step": 28600
},
{
"epoch": 1.44,
"grad_norm": 5.254230976104736,
"learning_rate": 1.4333869346733669e-05,
"loss": 1.3835,
"step": 28700
},
{
"epoch": 1.44,
"grad_norm": 5.279263496398926,
"learning_rate": 1.4313768844221107e-05,
"loss": 1.341,
"step": 28800
},
{
"epoch": 1.45,
"grad_norm": 3.9145216941833496,
"learning_rate": 1.4293668341708545e-05,
"loss": 1.3457,
"step": 28900
},
{
"epoch": 1.45,
"grad_norm": 18.773277282714844,
"learning_rate": 1.4273768844221107e-05,
"loss": 1.3617,
"step": 29000
},
{
"epoch": 1.46,
"grad_norm": 4.489799499511719,
"learning_rate": 1.4253668341708544e-05,
"loss": 1.3604,
"step": 29100
},
{
"epoch": 1.46,
"grad_norm": 3.820908784866333,
"learning_rate": 1.423356783919598e-05,
"loss": 1.3688,
"step": 29200
},
{
"epoch": 1.47,
"grad_norm": 5.470434188842773,
"learning_rate": 1.4213467336683418e-05,
"loss": 1.4041,
"step": 29300
},
{
"epoch": 1.47,
"grad_norm": 2.9653820991516113,
"learning_rate": 1.4193366834170856e-05,
"loss": 1.3593,
"step": 29400
},
{
"epoch": 1.48,
"grad_norm": 4.433176517486572,
"learning_rate": 1.4173266331658292e-05,
"loss": 1.3402,
"step": 29500
},
{
"epoch": 1.48,
"grad_norm": 3.5363194942474365,
"learning_rate": 1.415316582914573e-05,
"loss": 1.376,
"step": 29600
},
{
"epoch": 1.48,
"grad_norm": 6.819579601287842,
"learning_rate": 1.4133065326633166e-05,
"loss": 1.3203,
"step": 29700
},
{
"epoch": 1.49,
"grad_norm": 5.506997585296631,
"learning_rate": 1.4112964824120604e-05,
"loss": 1.3234,
"step": 29800
},
{
"epoch": 1.5,
"grad_norm": 6.012782573699951,
"learning_rate": 1.4092864321608042e-05,
"loss": 1.3625,
"step": 29900
},
{
"epoch": 1.5,
"grad_norm": 2.309823751449585,
"learning_rate": 1.4072763819095479e-05,
"loss": 1.351,
"step": 30000
},
{
"epoch": 1.5,
"eval_loss": 1.3539750576019287,
"eval_runtime": 21.8688,
"eval_samples_per_second": 45.727,
"eval_steps_per_second": 5.716,
"step": 30000
},
{
"epoch": 1.5,
"grad_norm": 5.257068157196045,
"learning_rate": 1.4052663316582915e-05,
"loss": 1.348,
"step": 30100
},
{
"epoch": 1.51,
"grad_norm": 13.082318305969238,
"learning_rate": 1.4032562814070351e-05,
"loss": 1.3791,
"step": 30200
},
{
"epoch": 1.52,
"grad_norm": 4.944590091705322,
"learning_rate": 1.4012462311557791e-05,
"loss": 1.3805,
"step": 30300
},
{
"epoch": 1.52,
"grad_norm": 4.780072212219238,
"learning_rate": 1.3992361809045227e-05,
"loss": 1.3651,
"step": 30400
},
{
"epoch": 1.52,
"grad_norm": 4.359679698944092,
"learning_rate": 1.3972261306532664e-05,
"loss": 1.3118,
"step": 30500
},
{
"epoch": 1.53,
"grad_norm": 4.789872646331787,
"learning_rate": 1.3952160804020101e-05,
"loss": 1.3662,
"step": 30600
},
{
"epoch": 1.54,
"grad_norm": 4.301767349243164,
"learning_rate": 1.393206030150754e-05,
"loss": 1.3564,
"step": 30700
},
{
"epoch": 1.54,
"grad_norm": 4.046327590942383,
"learning_rate": 1.3911959798994976e-05,
"loss": 1.3386,
"step": 30800
},
{
"epoch": 1.54,
"grad_norm": 6.321465969085693,
"learning_rate": 1.3891859296482412e-05,
"loss": 1.3544,
"step": 30900
},
{
"epoch": 1.55,
"grad_norm": 5.538000106811523,
"learning_rate": 1.387175879396985e-05,
"loss": 1.3466,
"step": 31000
},
{
"epoch": 1.56,
"grad_norm": 6.25814151763916,
"learning_rate": 1.3851658291457288e-05,
"loss": 1.3246,
"step": 31100
},
{
"epoch": 1.56,
"grad_norm": 5.343544006347656,
"learning_rate": 1.3831758793969849e-05,
"loss": 1.3629,
"step": 31200
},
{
"epoch": 1.56,
"grad_norm": 4.390071868896484,
"learning_rate": 1.3811658291457288e-05,
"loss": 1.3211,
"step": 31300
},
{
"epoch": 1.57,
"grad_norm": 5.539604187011719,
"learning_rate": 1.3791557788944725e-05,
"loss": 1.3297,
"step": 31400
},
{
"epoch": 1.57,
"grad_norm": 4.06265115737915,
"learning_rate": 1.3771457286432161e-05,
"loss": 1.37,
"step": 31500
},
{
"epoch": 1.58,
"grad_norm": 4.819797515869141,
"learning_rate": 1.3751356783919599e-05,
"loss": 1.3279,
"step": 31600
},
{
"epoch": 1.58,
"grad_norm": 4.675261497497559,
"learning_rate": 1.3731256281407037e-05,
"loss": 1.321,
"step": 31700
},
{
"epoch": 1.59,
"grad_norm": 6.112530708312988,
"learning_rate": 1.3711155778894473e-05,
"loss": 1.3321,
"step": 31800
},
{
"epoch": 1.59,
"grad_norm": 4.85811185836792,
"learning_rate": 1.3691055276381911e-05,
"loss": 1.36,
"step": 31900
},
{
"epoch": 1.6,
"grad_norm": 3.9626624584198,
"learning_rate": 1.3670954773869347e-05,
"loss": 1.3446,
"step": 32000
},
{
"epoch": 1.6,
"grad_norm": 4.470461845397949,
"learning_rate": 1.3650854271356785e-05,
"loss": 1.3503,
"step": 32100
},
{
"epoch": 1.61,
"grad_norm": 3.1345880031585693,
"learning_rate": 1.3630753768844223e-05,
"loss": 1.322,
"step": 32200
},
{
"epoch": 1.61,
"grad_norm": 4.220657825469971,
"learning_rate": 1.361065326633166e-05,
"loss": 1.2922,
"step": 32300
},
{
"epoch": 1.62,
"grad_norm": 4.827053546905518,
"learning_rate": 1.3590552763819096e-05,
"loss": 1.3663,
"step": 32400
},
{
"epoch": 1.62,
"grad_norm": 3.613919496536255,
"learning_rate": 1.3570452261306536e-05,
"loss": 1.359,
"step": 32500
},
{
"epoch": 1.63,
"grad_norm": 6.170840263366699,
"learning_rate": 1.3550351758793972e-05,
"loss": 1.3076,
"step": 32600
},
{
"epoch": 1.64,
"grad_norm": 5.604345321655273,
"learning_rate": 1.3530251256281408e-05,
"loss": 1.3335,
"step": 32700
},
{
"epoch": 1.64,
"grad_norm": 3.617830514907837,
"learning_rate": 1.3510150753768844e-05,
"loss": 1.3601,
"step": 32800
},
{
"epoch": 1.65,
"grad_norm": 5.692416191101074,
"learning_rate": 1.349005025125628e-05,
"loss": 1.3117,
"step": 32900
},
{
"epoch": 1.65,
"grad_norm": 6.292971611022949,
"learning_rate": 1.346994974874372e-05,
"loss": 1.3427,
"step": 33000
},
{
"epoch": 1.66,
"grad_norm": 4.1335906982421875,
"learning_rate": 1.3449849246231157e-05,
"loss": 1.3593,
"step": 33100
},
{
"epoch": 1.66,
"grad_norm": 3.7386412620544434,
"learning_rate": 1.3429748743718593e-05,
"loss": 1.3509,
"step": 33200
},
{
"epoch": 1.67,
"grad_norm": 4.687217712402344,
"learning_rate": 1.3409648241206031e-05,
"loss": 1.3331,
"step": 33300
},
{
"epoch": 1.67,
"grad_norm": 3.9221270084381104,
"learning_rate": 1.3389547738693469e-05,
"loss": 1.3299,
"step": 33400
},
{
"epoch": 1.68,
"grad_norm": 4.383951663970947,
"learning_rate": 1.3369447236180905e-05,
"loss": 1.3455,
"step": 33500
},
{
"epoch": 1.68,
"grad_norm": 4.3801140785217285,
"learning_rate": 1.3349346733668343e-05,
"loss": 1.3092,
"step": 33600
},
{
"epoch": 1.69,
"grad_norm": 11.051312446594238,
"learning_rate": 1.332924623115578e-05,
"loss": 1.3164,
"step": 33700
},
{
"epoch": 1.69,
"grad_norm": 5.16642951965332,
"learning_rate": 1.3309346733668342e-05,
"loss": 1.3463,
"step": 33800
},
{
"epoch": 1.69,
"grad_norm": 4.878006935119629,
"learning_rate": 1.328924623115578e-05,
"loss": 1.3452,
"step": 33900
},
{
"epoch": 1.7,
"grad_norm": 3.8694944381713867,
"learning_rate": 1.3269145728643218e-05,
"loss": 1.3393,
"step": 34000
},
{
"epoch": 1.71,
"grad_norm": 3.1603684425354004,
"learning_rate": 1.3249045226130654e-05,
"loss": 1.3608,
"step": 34100
},
{
"epoch": 1.71,
"grad_norm": 7.4358086585998535,
"learning_rate": 1.3228944723618092e-05,
"loss": 1.334,
"step": 34200
},
{
"epoch": 1.71,
"grad_norm": 3.47105073928833,
"learning_rate": 1.3208844221105528e-05,
"loss": 1.3394,
"step": 34300
},
{
"epoch": 1.72,
"grad_norm": 6.943812847137451,
"learning_rate": 1.3188743718592966e-05,
"loss": 1.3316,
"step": 34400
},
{
"epoch": 1.73,
"grad_norm": 3.4260807037353516,
"learning_rate": 1.3168643216080404e-05,
"loss": 1.2544,
"step": 34500
},
{
"epoch": 1.73,
"grad_norm": 5.455039978027344,
"learning_rate": 1.314854271356784e-05,
"loss": 1.3122,
"step": 34600
},
{
"epoch": 1.73,
"grad_norm": 5.853464603424072,
"learning_rate": 1.3128442211055277e-05,
"loss": 1.3533,
"step": 34700
},
{
"epoch": 1.74,
"grad_norm": 6.121273994445801,
"learning_rate": 1.3108341708542715e-05,
"loss": 1.3144,
"step": 34800
},
{
"epoch": 1.75,
"grad_norm": 4.645484447479248,
"learning_rate": 1.3088241206030153e-05,
"loss": 1.3154,
"step": 34900
},
{
"epoch": 1.75,
"grad_norm": 5.509624004364014,
"learning_rate": 1.3068140703517589e-05,
"loss": 1.3335,
"step": 35000
},
{
"epoch": 1.75,
"grad_norm": 4.298194885253906,
"learning_rate": 1.3048040201005025e-05,
"loss": 1.3374,
"step": 35100
},
{
"epoch": 1.76,
"grad_norm": 3.8274903297424316,
"learning_rate": 1.3027939698492465e-05,
"loss": 1.3577,
"step": 35200
},
{
"epoch": 1.77,
"grad_norm": 3.6994845867156982,
"learning_rate": 1.3007839195979901e-05,
"loss": 1.3439,
"step": 35300
},
{
"epoch": 1.77,
"grad_norm": 5.096419334411621,
"learning_rate": 1.2987738693467338e-05,
"loss": 1.3425,
"step": 35400
},
{
"epoch": 1.77,
"grad_norm": 5.432124614715576,
"learning_rate": 1.2967638190954774e-05,
"loss": 1.3089,
"step": 35500
},
{
"epoch": 1.78,
"grad_norm": 6.754784107208252,
"learning_rate": 1.2947537688442212e-05,
"loss": 1.3577,
"step": 35600
},
{
"epoch": 1.79,
"grad_norm": 5.67179012298584,
"learning_rate": 1.292743718592965e-05,
"loss": 1.3158,
"step": 35700
},
{
"epoch": 1.79,
"grad_norm": 5.4324517250061035,
"learning_rate": 1.2907336683417086e-05,
"loss": 1.3423,
"step": 35800
},
{
"epoch": 1.79,
"grad_norm": 6.366858959197998,
"learning_rate": 1.2887236180904524e-05,
"loss": 1.348,
"step": 35900
},
{
"epoch": 1.8,
"grad_norm": 4.625099182128906,
"learning_rate": 1.286713567839196e-05,
"loss": 1.344,
"step": 36000
},
{
"epoch": 1.81,
"grad_norm": 4.306463718414307,
"learning_rate": 1.2847035175879398e-05,
"loss": 1.3471,
"step": 36100
},
{
"epoch": 1.81,
"grad_norm": 5.759509086608887,
"learning_rate": 1.2826934673366835e-05,
"loss": 1.2994,
"step": 36200
},
{
"epoch": 1.81,
"grad_norm": 6.741412162780762,
"learning_rate": 1.2806834170854273e-05,
"loss": 1.3385,
"step": 36300
},
{
"epoch": 1.82,
"grad_norm": 8.009258270263672,
"learning_rate": 1.2786733668341709e-05,
"loss": 1.307,
"step": 36400
},
{
"epoch": 1.82,
"grad_norm": 4.115825653076172,
"learning_rate": 1.2766633165829147e-05,
"loss": 1.3514,
"step": 36500
},
{
"epoch": 1.83,
"grad_norm": 6.015181541442871,
"learning_rate": 1.2746532663316585e-05,
"loss": 1.3126,
"step": 36600
},
{
"epoch": 1.83,
"grad_norm": 5.512998104095459,
"learning_rate": 1.2726633165829147e-05,
"loss": 1.3288,
"step": 36700
},
{
"epoch": 1.84,
"grad_norm": 4.820090293884277,
"learning_rate": 1.2706532663316583e-05,
"loss": 1.322,
"step": 36800
},
{
"epoch": 1.84,
"grad_norm": 5.174429893493652,
"learning_rate": 1.2686432160804021e-05,
"loss": 1.3451,
"step": 36900
},
{
"epoch": 1.85,
"grad_norm": 5.226765155792236,
"learning_rate": 1.2666331658291458e-05,
"loss": 1.3362,
"step": 37000
},
{
"epoch": 1.85,
"grad_norm": 7.082357406616211,
"learning_rate": 1.2646231155778896e-05,
"loss": 1.3053,
"step": 37100
},
{
"epoch": 1.86,
"grad_norm": 5.663987636566162,
"learning_rate": 1.2626130653266334e-05,
"loss": 1.3312,
"step": 37200
},
{
"epoch": 1.86,
"grad_norm": 4.99432897567749,
"learning_rate": 1.260603015075377e-05,
"loss": 1.2969,
"step": 37300
},
{
"epoch": 1.87,
"grad_norm": 3.6237847805023193,
"learning_rate": 1.2585929648241206e-05,
"loss": 1.3284,
"step": 37400
},
{
"epoch": 1.88,
"grad_norm": 4.317774772644043,
"learning_rate": 1.2565829145728646e-05,
"loss": 1.2724,
"step": 37500
},
{
"epoch": 1.88,
"grad_norm": 28.135671615600586,
"learning_rate": 1.2545728643216082e-05,
"loss": 1.301,
"step": 37600
},
{
"epoch": 1.89,
"grad_norm": 6.461686611175537,
"learning_rate": 1.2525628140703518e-05,
"loss": 1.311,
"step": 37700
},
{
"epoch": 1.89,
"grad_norm": 5.122781276702881,
"learning_rate": 1.2505527638190955e-05,
"loss": 1.3124,
"step": 37800
},
{
"epoch": 1.9,
"grad_norm": 7.070276737213135,
"learning_rate": 1.2485628140703519e-05,
"loss": 1.3042,
"step": 37900
},
{
"epoch": 1.9,
"grad_norm": 5.5672607421875,
"learning_rate": 1.2465527638190955e-05,
"loss": 1.2705,
"step": 38000
},
{
"epoch": 1.91,
"grad_norm": 5.857154369354248,
"learning_rate": 1.2445427135678395e-05,
"loss": 1.2842,
"step": 38100
},
{
"epoch": 1.91,
"grad_norm": 7.3641133308410645,
"learning_rate": 1.2425326633165831e-05,
"loss": 1.3284,
"step": 38200
},
{
"epoch": 1.92,
"grad_norm": 7.004215717315674,
"learning_rate": 1.2405226130653267e-05,
"loss": 1.3106,
"step": 38300
},
{
"epoch": 1.92,
"grad_norm": 6.171952247619629,
"learning_rate": 1.2385125628140704e-05,
"loss": 1.2711,
"step": 38400
},
{
"epoch": 1.93,
"grad_norm": 4.094524383544922,
"learning_rate": 1.236502512562814e-05,
"loss": 1.2879,
"step": 38500
},
{
"epoch": 1.93,
"grad_norm": 6.0571980476379395,
"learning_rate": 1.234492462311558e-05,
"loss": 1.3454,
"step": 38600
},
{
"epoch": 1.94,
"grad_norm": 4.057672023773193,
"learning_rate": 1.2324824120603016e-05,
"loss": 1.3275,
"step": 38700
},
{
"epoch": 1.94,
"grad_norm": 6.506563663482666,
"learning_rate": 1.2304723618090452e-05,
"loss": 1.2866,
"step": 38800
},
{
"epoch": 1.94,
"grad_norm": 5.117976188659668,
"learning_rate": 1.228462311557789e-05,
"loss": 1.2811,
"step": 38900
},
{
"epoch": 1.95,
"grad_norm": 4.050692558288574,
"learning_rate": 1.2264522613065328e-05,
"loss": 1.2682,
"step": 39000
},
{
"epoch": 1.96,
"grad_norm": 4.216948509216309,
"learning_rate": 1.2244422110552764e-05,
"loss": 1.3079,
"step": 39100
},
{
"epoch": 1.96,
"grad_norm": 5.064427375793457,
"learning_rate": 1.2224321608040202e-05,
"loss": 1.3022,
"step": 39200
},
{
"epoch": 1.96,
"grad_norm": 6.869637489318848,
"learning_rate": 1.2204221105527639e-05,
"loss": 1.294,
"step": 39300
},
{
"epoch": 1.97,
"grad_norm": 4.062154769897461,
"learning_rate": 1.2184120603015077e-05,
"loss": 1.3249,
"step": 39400
},
{
"epoch": 1.98,
"grad_norm": 5.1579976081848145,
"learning_rate": 1.2164020100502515e-05,
"loss": 1.3006,
"step": 39500
},
{
"epoch": 1.98,
"grad_norm": 6.146691799163818,
"learning_rate": 1.214391959798995e-05,
"loss": 1.317,
"step": 39600
},
{
"epoch": 1.98,
"grad_norm": 5.503583908081055,
"learning_rate": 1.2123819095477387e-05,
"loss": 1.2803,
"step": 39700
},
{
"epoch": 1.99,
"grad_norm": 5.574082374572754,
"learning_rate": 1.2103718592964827e-05,
"loss": 1.3149,
"step": 39800
},
{
"epoch": 2.0,
"grad_norm": 7.74934720993042,
"learning_rate": 1.2083618090452263e-05,
"loss": 1.3085,
"step": 39900
},
{
"epoch": 2.0,
"grad_norm": 5.324882984161377,
"learning_rate": 1.20635175879397e-05,
"loss": 1.2662,
"step": 40000
},
{
"epoch": 2.0,
"eval_loss": 1.311901330947876,
"eval_runtime": 21.8453,
"eval_samples_per_second": 45.776,
"eval_steps_per_second": 5.722,
"step": 40000
},
{
"epoch": 2.0,
"grad_norm": 4.58272647857666,
"learning_rate": 1.2043417085427136e-05,
"loss": 1.251,
"step": 40100
},
{
"epoch": 2.01,
"grad_norm": 6.063676357269287,
"learning_rate": 1.2023316582914575e-05,
"loss": 1.3191,
"step": 40200
},
{
"epoch": 2.02,
"grad_norm": 6.219407558441162,
"learning_rate": 1.2003216080402012e-05,
"loss": 1.2535,
"step": 40300
},
{
"epoch": 2.02,
"grad_norm": 4.881536960601807,
"learning_rate": 1.1983115577889448e-05,
"loss": 1.3005,
"step": 40400
},
{
"epoch": 2.02,
"grad_norm": 2.7239251136779785,
"learning_rate": 1.1963015075376884e-05,
"loss": 1.2162,
"step": 40500
},
{
"epoch": 2.03,
"grad_norm": 4.018795490264893,
"learning_rate": 1.1942914572864324e-05,
"loss": 1.2869,
"step": 40600
},
{
"epoch": 2.04,
"grad_norm": 5.884511947631836,
"learning_rate": 1.192281407035176e-05,
"loss": 1.3242,
"step": 40700
},
{
"epoch": 2.04,
"grad_norm": 6.765765190124512,
"learning_rate": 1.1902713567839196e-05,
"loss": 1.3089,
"step": 40800
},
{
"epoch": 2.04,
"grad_norm": 7.366244316101074,
"learning_rate": 1.1882613065326634e-05,
"loss": 1.2574,
"step": 40900
},
{
"epoch": 2.05,
"grad_norm": 4.718629837036133,
"learning_rate": 1.186251256281407e-05,
"loss": 1.2967,
"step": 41000
},
{
"epoch": 2.06,
"grad_norm": 3.110818386077881,
"learning_rate": 1.1842412060301509e-05,
"loss": 1.2619,
"step": 41100
},
{
"epoch": 2.06,
"grad_norm": 5.5501298904418945,
"learning_rate": 1.1822311557788947e-05,
"loss": 1.2895,
"step": 41200
},
{
"epoch": 2.06,
"grad_norm": 4.115253925323486,
"learning_rate": 1.1802412060301509e-05,
"loss": 1.2978,
"step": 41300
},
{
"epoch": 2.07,
"grad_norm": 4.711161136627197,
"learning_rate": 1.1782311557788945e-05,
"loss": 1.2846,
"step": 41400
},
{
"epoch": 2.08,
"grad_norm": 4.220121383666992,
"learning_rate": 1.1762211055276383e-05,
"loss": 1.2873,
"step": 41500
},
{
"epoch": 2.08,
"grad_norm": 2.8940072059631348,
"learning_rate": 1.174211055276382e-05,
"loss": 1.2794,
"step": 41600
},
{
"epoch": 2.08,
"grad_norm": 4.298203468322754,
"learning_rate": 1.1722010050251257e-05,
"loss": 1.2412,
"step": 41700
},
{
"epoch": 2.09,
"grad_norm": 2.983751058578491,
"learning_rate": 1.1701909547738694e-05,
"loss": 1.2379,
"step": 41800
},
{
"epoch": 2.1,
"grad_norm": 5.402541637420654,
"learning_rate": 1.1681809045226132e-05,
"loss": 1.2579,
"step": 41900
},
{
"epoch": 2.1,
"grad_norm": 4.412992000579834,
"learning_rate": 1.1661708542713568e-05,
"loss": 1.2576,
"step": 42000
},
{
"epoch": 2.1,
"grad_norm": 3.6417946815490723,
"learning_rate": 1.1641608040201006e-05,
"loss": 1.2711,
"step": 42100
},
{
"epoch": 2.11,
"grad_norm": 4.4454216957092285,
"learning_rate": 1.1621507537688444e-05,
"loss": 1.2823,
"step": 42200
},
{
"epoch": 2.12,
"grad_norm": 5.299724578857422,
"learning_rate": 1.160140703517588e-05,
"loss": 1.2559,
"step": 42300
},
{
"epoch": 2.12,
"grad_norm": 4.227545261383057,
"learning_rate": 1.1581306532663317e-05,
"loss": 1.2321,
"step": 42400
},
{
"epoch": 2.12,
"grad_norm": 5.005281925201416,
"learning_rate": 1.1561206030150756e-05,
"loss": 1.2988,
"step": 42500
},
{
"epoch": 2.13,
"grad_norm": 5.402487754821777,
"learning_rate": 1.1541105527638192e-05,
"loss": 1.2935,
"step": 42600
},
{
"epoch": 2.13,
"grad_norm": 4.937114715576172,
"learning_rate": 1.1521005025125629e-05,
"loss": 1.2786,
"step": 42700
},
{
"epoch": 2.14,
"grad_norm": 5.078079700469971,
"learning_rate": 1.1500904522613065e-05,
"loss": 1.2735,
"step": 42800
},
{
"epoch": 2.15,
"grad_norm": 3.767037868499756,
"learning_rate": 1.1480804020100505e-05,
"loss": 1.2557,
"step": 42900
},
{
"epoch": 2.15,
"grad_norm": 5.843142032623291,
"learning_rate": 1.1460703517587941e-05,
"loss": 1.2338,
"step": 43000
},
{
"epoch": 2.15,
"grad_norm": 7.4397053718566895,
"learning_rate": 1.1440804020100505e-05,
"loss": 1.2841,
"step": 43100
},
{
"epoch": 2.16,
"grad_norm": 5.660872459411621,
"learning_rate": 1.1420703517587941e-05,
"loss": 1.245,
"step": 43200
},
{
"epoch": 2.17,
"grad_norm": 5.597433567047119,
"learning_rate": 1.1400603015075378e-05,
"loss": 1.2267,
"step": 43300
},
{
"epoch": 2.17,
"grad_norm": 5.053484916687012,
"learning_rate": 1.1380502512562814e-05,
"loss": 1.2734,
"step": 43400
},
{
"epoch": 2.17,
"grad_norm": 4.610946178436279,
"learning_rate": 1.1360402010050254e-05,
"loss": 1.2443,
"step": 43500
},
{
"epoch": 2.18,
"grad_norm": 4.457014083862305,
"learning_rate": 1.134030150753769e-05,
"loss": 1.2665,
"step": 43600
},
{
"epoch": 2.19,
"grad_norm": 4.798270225524902,
"learning_rate": 1.1320201005025126e-05,
"loss": 1.3062,
"step": 43700
},
{
"epoch": 2.19,
"grad_norm": 6.110182762145996,
"learning_rate": 1.1300100502512562e-05,
"loss": 1.2652,
"step": 43800
},
{
"epoch": 2.19,
"grad_norm": 5.528191089630127,
"learning_rate": 1.128e-05,
"loss": 1.2886,
"step": 43900
},
{
"epoch": 2.2,
"grad_norm": 6.199995994567871,
"learning_rate": 1.1259899497487438e-05,
"loss": 1.2431,
"step": 44000
},
{
"epoch": 2.21,
"grad_norm": 4.476943492889404,
"learning_rate": 1.1239798994974875e-05,
"loss": 1.292,
"step": 44100
},
{
"epoch": 2.21,
"grad_norm": 4.873668670654297,
"learning_rate": 1.1219698492462313e-05,
"loss": 1.3071,
"step": 44200
},
{
"epoch": 2.21,
"grad_norm": 4.769585609436035,
"learning_rate": 1.1199597989949749e-05,
"loss": 1.2568,
"step": 44300
},
{
"epoch": 2.22,
"grad_norm": 3.5369479656219482,
"learning_rate": 1.1179497487437187e-05,
"loss": 1.2472,
"step": 44400
},
{
"epoch": 2.23,
"grad_norm": 3.9671337604522705,
"learning_rate": 1.1159396984924625e-05,
"loss": 1.2917,
"step": 44500
},
{
"epoch": 2.23,
"grad_norm": 5.720705986022949,
"learning_rate": 1.1139296482412061e-05,
"loss": 1.235,
"step": 44600
},
{
"epoch": 2.23,
"grad_norm": 5.337419509887695,
"learning_rate": 1.1119195979899497e-05,
"loss": 1.3176,
"step": 44700
},
{
"epoch": 2.24,
"grad_norm": 5.101902961730957,
"learning_rate": 1.1099095477386937e-05,
"loss": 1.3028,
"step": 44800
},
{
"epoch": 2.25,
"grad_norm": 4.49253511428833,
"learning_rate": 1.1078994974874373e-05,
"loss": 1.2426,
"step": 44900
},
{
"epoch": 2.25,
"grad_norm": 6.949131011962891,
"learning_rate": 1.105889447236181e-05,
"loss": 1.2352,
"step": 45000
},
{
"epoch": 2.25,
"grad_norm": 3.002480983734131,
"learning_rate": 1.1038793969849246e-05,
"loss": 1.2544,
"step": 45100
},
{
"epoch": 2.26,
"grad_norm": 4.93209171295166,
"learning_rate": 1.1018693467336686e-05,
"loss": 1.2528,
"step": 45200
},
{
"epoch": 2.27,
"grad_norm": 3.5885465145111084,
"learning_rate": 1.0998592964824122e-05,
"loss": 1.2395,
"step": 45300
},
{
"epoch": 2.27,
"grad_norm": 6.264705657958984,
"learning_rate": 1.0978492462311558e-05,
"loss": 1.2468,
"step": 45400
},
{
"epoch": 2.27,
"grad_norm": 4.435594081878662,
"learning_rate": 1.0958391959798994e-05,
"loss": 1.2523,
"step": 45500
},
{
"epoch": 2.28,
"grad_norm": 2.9626126289367676,
"learning_rate": 1.0938291457286434e-05,
"loss": 1.2618,
"step": 45600
},
{
"epoch": 2.29,
"grad_norm": 4.376198768615723,
"learning_rate": 1.091819095477387e-05,
"loss": 1.2738,
"step": 45700
},
{
"epoch": 2.29,
"grad_norm": 4.058696269989014,
"learning_rate": 1.0898090452261307e-05,
"loss": 1.2787,
"step": 45800
},
{
"epoch": 2.29,
"grad_norm": 5.347177982330322,
"learning_rate": 1.087819095477387e-05,
"loss": 1.2778,
"step": 45900
},
{
"epoch": 2.3,
"grad_norm": 3.85967755317688,
"learning_rate": 1.0858090452261307e-05,
"loss": 1.2746,
"step": 46000
},
{
"epoch": 2.31,
"grad_norm": 5.233943939208984,
"learning_rate": 1.0837989949748743e-05,
"loss": 1.283,
"step": 46100
},
{
"epoch": 2.31,
"grad_norm": 6.364080429077148,
"learning_rate": 1.0817889447236183e-05,
"loss": 1.2685,
"step": 46200
},
{
"epoch": 2.31,
"grad_norm": 5.601933479309082,
"learning_rate": 1.079778894472362e-05,
"loss": 1.2214,
"step": 46300
},
{
"epoch": 2.32,
"grad_norm": 7.273884296417236,
"learning_rate": 1.0777688442211056e-05,
"loss": 1.2522,
"step": 46400
},
{
"epoch": 2.33,
"grad_norm": 4.99397611618042,
"learning_rate": 1.0757587939698494e-05,
"loss": 1.2779,
"step": 46500
},
{
"epoch": 2.33,
"grad_norm": 6.805306434631348,
"learning_rate": 1.073748743718593e-05,
"loss": 1.2605,
"step": 46600
},
{
"epoch": 2.33,
"grad_norm": 5.773606777191162,
"learning_rate": 1.0717386934673368e-05,
"loss": 1.2236,
"step": 46700
},
{
"epoch": 2.34,
"grad_norm": 5.045441150665283,
"learning_rate": 1.0697286432160806e-05,
"loss": 1.2629,
"step": 46800
},
{
"epoch": 2.34,
"grad_norm": 6.011552333831787,
"learning_rate": 1.0677185929648242e-05,
"loss": 1.2061,
"step": 46900
},
{
"epoch": 2.35,
"grad_norm": 6.2456817626953125,
"learning_rate": 1.0657085427135678e-05,
"loss": 1.2532,
"step": 47000
},
{
"epoch": 2.35,
"grad_norm": 4.08701229095459,
"learning_rate": 1.0636984924623116e-05,
"loss": 1.2701,
"step": 47100
},
{
"epoch": 2.36,
"grad_norm": 4.472239017486572,
"learning_rate": 1.0616884422110554e-05,
"loss": 1.261,
"step": 47200
},
{
"epoch": 2.37,
"grad_norm": 5.726062297821045,
"learning_rate": 1.059678391959799e-05,
"loss": 1.2506,
"step": 47300
},
{
"epoch": 2.37,
"grad_norm": 5.6060686111450195,
"learning_rate": 1.0576683417085427e-05,
"loss": 1.2472,
"step": 47400
},
{
"epoch": 2.38,
"grad_norm": 5.226354122161865,
"learning_rate": 1.0556582914572867e-05,
"loss": 1.2599,
"step": 47500
},
{
"epoch": 2.38,
"grad_norm": 6.913018703460693,
"learning_rate": 1.0536482412060303e-05,
"loss": 1.2555,
"step": 47600
},
{
"epoch": 2.38,
"grad_norm": 4.932835578918457,
"learning_rate": 1.0516381909547739e-05,
"loss": 1.2658,
"step": 47700
},
{
"epoch": 2.39,
"grad_norm": 5.260751724243164,
"learning_rate": 1.0496281407035175e-05,
"loss": 1.3025,
"step": 47800
},
{
"epoch": 2.4,
"grad_norm": 4.677539825439453,
"learning_rate": 1.0476180904522615e-05,
"loss": 1.2367,
"step": 47900
},
{
"epoch": 2.4,
"grad_norm": 5.679705619812012,
"learning_rate": 1.0456080402010051e-05,
"loss": 1.2693,
"step": 48000
},
{
"epoch": 2.41,
"grad_norm": 5.315084934234619,
"learning_rate": 1.0435979899497488e-05,
"loss": 1.3188,
"step": 48100
},
{
"epoch": 2.41,
"grad_norm": 5.218776702880859,
"learning_rate": 1.0415879396984926e-05,
"loss": 1.25,
"step": 48200
},
{
"epoch": 2.42,
"grad_norm": 3.9905712604522705,
"learning_rate": 1.0395778894472364e-05,
"loss": 1.2318,
"step": 48300
},
{
"epoch": 2.42,
"grad_norm": 5.051150798797607,
"learning_rate": 1.03756783919598e-05,
"loss": 1.2564,
"step": 48400
},
{
"epoch": 2.42,
"grad_norm": 4.899648666381836,
"learning_rate": 1.0355577889447238e-05,
"loss": 1.2616,
"step": 48500
},
{
"epoch": 2.43,
"grad_norm": 5.728534698486328,
"learning_rate": 1.0335477386934674e-05,
"loss": 1.2387,
"step": 48600
},
{
"epoch": 2.44,
"grad_norm": 4.747395038604736,
"learning_rate": 1.0315376884422112e-05,
"loss": 1.2352,
"step": 48700
},
{
"epoch": 2.44,
"grad_norm": 3.0430312156677246,
"learning_rate": 1.0295276381909548e-05,
"loss": 1.2302,
"step": 48800
},
{
"epoch": 2.44,
"grad_norm": 4.847692012786865,
"learning_rate": 1.0275175879396986e-05,
"loss": 1.23,
"step": 48900
},
{
"epoch": 2.45,
"grad_norm": 4.695472240447998,
"learning_rate": 1.0255075376884423e-05,
"loss": 1.2543,
"step": 49000
},
{
"epoch": 2.46,
"grad_norm": 4.463906764984131,
"learning_rate": 1.0234974874371859e-05,
"loss": 1.2596,
"step": 49100
},
{
"epoch": 2.46,
"grad_norm": 4.5606770515441895,
"learning_rate": 1.0214874371859299e-05,
"loss": 1.235,
"step": 49200
},
{
"epoch": 2.46,
"grad_norm": 5.294122219085693,
"learning_rate": 1.0194773869346735e-05,
"loss": 1.2505,
"step": 49300
},
{
"epoch": 2.47,
"grad_norm": 3.5599803924560547,
"learning_rate": 1.0174673366834171e-05,
"loss": 1.2817,
"step": 49400
},
{
"epoch": 2.48,
"grad_norm": 3.720597982406616,
"learning_rate": 1.0154572864321607e-05,
"loss": 1.2628,
"step": 49500
},
{
"epoch": 2.48,
"grad_norm": 4.774421215057373,
"learning_rate": 1.0134472361809047e-05,
"loss": 1.2705,
"step": 49600
},
{
"epoch": 2.48,
"grad_norm": 5.8144330978393555,
"learning_rate": 1.0114371859296483e-05,
"loss": 1.2982,
"step": 49700
},
{
"epoch": 2.49,
"grad_norm": 6.6790385246276855,
"learning_rate": 1.009427135678392e-05,
"loss": 1.2301,
"step": 49800
},
{
"epoch": 2.5,
"grad_norm": 4.106939792633057,
"learning_rate": 1.0074170854271358e-05,
"loss": 1.2355,
"step": 49900
},
{
"epoch": 2.5,
"grad_norm": 3.354093551635742,
"learning_rate": 1.0054070351758796e-05,
"loss": 1.2459,
"step": 50000
},
{
"epoch": 2.5,
"eval_loss": 1.3116555213928223,
"eval_runtime": 21.8654,
"eval_samples_per_second": 45.734,
"eval_steps_per_second": 5.717,
"step": 50000
},
{
"epoch": 2.5,
"grad_norm": 5.311431407928467,
"learning_rate": 1.0033969849246232e-05,
"loss": 1.2402,
"step": 50100
},
{
"epoch": 2.51,
"grad_norm": 5.163784503936768,
"learning_rate": 1.0013869346733668e-05,
"loss": 1.2329,
"step": 50200
},
{
"epoch": 2.52,
"grad_norm": 5.684957981109619,
"learning_rate": 9.993768844221106e-06,
"loss": 1.2403,
"step": 50300
},
{
"epoch": 2.52,
"grad_norm": 6.14009952545166,
"learning_rate": 9.973668341708544e-06,
"loss": 1.2217,
"step": 50400
},
{
"epoch": 2.52,
"grad_norm": 6.6271796226501465,
"learning_rate": 9.95356783919598e-06,
"loss": 1.2122,
"step": 50500
},
{
"epoch": 2.53,
"grad_norm": 4.011633396148682,
"learning_rate": 9.933467336683418e-06,
"loss": 1.2242,
"step": 50600
},
{
"epoch": 2.54,
"grad_norm": 4.849850177764893,
"learning_rate": 9.913366834170856e-06,
"loss": 1.2349,
"step": 50700
},
{
"epoch": 2.54,
"grad_norm": 3.841789484024048,
"learning_rate": 9.893266331658293e-06,
"loss": 1.2556,
"step": 50800
},
{
"epoch": 2.54,
"grad_norm": 7.193374156951904,
"learning_rate": 9.87316582914573e-06,
"loss": 1.2119,
"step": 50900
},
{
"epoch": 2.55,
"grad_norm": 3.556542158126831,
"learning_rate": 9.853065326633167e-06,
"loss": 1.2426,
"step": 51000
},
{
"epoch": 2.56,
"grad_norm": 6.54746150970459,
"learning_rate": 9.832964824120603e-06,
"loss": 1.1936,
"step": 51100
},
{
"epoch": 2.56,
"grad_norm": 4.4405951499938965,
"learning_rate": 9.812864321608041e-06,
"loss": 1.2591,
"step": 51200
},
{
"epoch": 2.56,
"grad_norm": 5.398285865783691,
"learning_rate": 9.792763819095477e-06,
"loss": 1.2375,
"step": 51300
},
{
"epoch": 2.57,
"grad_norm": 5.835482120513916,
"learning_rate": 9.772864321608041e-06,
"loss": 1.2626,
"step": 51400
},
{
"epoch": 2.58,
"grad_norm": 5.3824543952941895,
"learning_rate": 9.752763819095478e-06,
"loss": 1.2145,
"step": 51500
},
{
"epoch": 2.58,
"grad_norm": 4.847600936889648,
"learning_rate": 9.732663316582916e-06,
"loss": 1.2699,
"step": 51600
},
{
"epoch": 2.58,
"grad_norm": 4.644218921661377,
"learning_rate": 9.712562814070352e-06,
"loss": 1.2669,
"step": 51700
},
{
"epoch": 2.59,
"grad_norm": 5.046612739562988,
"learning_rate": 9.69246231155779e-06,
"loss": 1.2425,
"step": 51800
},
{
"epoch": 2.59,
"grad_norm": 3.9644362926483154,
"learning_rate": 9.672361809045226e-06,
"loss": 1.2802,
"step": 51900
},
{
"epoch": 2.6,
"grad_norm": 4.848786354064941,
"learning_rate": 9.652261306532664e-06,
"loss": 1.2257,
"step": 52000
},
{
"epoch": 2.6,
"grad_norm": 5.159448623657227,
"learning_rate": 9.6321608040201e-06,
"loss": 1.2156,
"step": 52100
},
{
"epoch": 2.61,
"grad_norm": 6.021341800689697,
"learning_rate": 9.612060301507538e-06,
"loss": 1.2122,
"step": 52200
},
{
"epoch": 2.62,
"grad_norm": 7.450246334075928,
"learning_rate": 9.591959798994975e-06,
"loss": 1.2452,
"step": 52300
},
{
"epoch": 2.62,
"grad_norm": 4.6322503089904785,
"learning_rate": 9.571859296482413e-06,
"loss": 1.2615,
"step": 52400
},
{
"epoch": 2.62,
"grad_norm": 9.119827270507812,
"learning_rate": 9.551758793969849e-06,
"loss": 1.2484,
"step": 52500
},
{
"epoch": 2.63,
"grad_norm": 7.022191047668457,
"learning_rate": 9.531658291457287e-06,
"loss": 1.2413,
"step": 52600
},
{
"epoch": 2.63,
"grad_norm": 6.008714199066162,
"learning_rate": 9.511557788944725e-06,
"loss": 1.2545,
"step": 52700
},
{
"epoch": 2.64,
"grad_norm": 4.883365631103516,
"learning_rate": 9.491457286432161e-06,
"loss": 1.256,
"step": 52800
},
{
"epoch": 2.65,
"grad_norm": 4.666494846343994,
"learning_rate": 9.4713567839196e-06,
"loss": 1.212,
"step": 52900
},
{
"epoch": 2.65,
"grad_norm": 6.942872524261475,
"learning_rate": 9.451256281407035e-06,
"loss": 1.2539,
"step": 53000
},
{
"epoch": 2.66,
"grad_norm": 8.57226848602295,
"learning_rate": 9.431155778894473e-06,
"loss": 1.238,
"step": 53100
},
{
"epoch": 2.66,
"grad_norm": 3.5034737586975098,
"learning_rate": 9.411055276381911e-06,
"loss": 1.1608,
"step": 53200
},
{
"epoch": 2.67,
"grad_norm": 4.17569637298584,
"learning_rate": 9.390954773869348e-06,
"loss": 1.2335,
"step": 53300
},
{
"epoch": 2.67,
"grad_norm": 5.2211012840271,
"learning_rate": 9.370854271356786e-06,
"loss": 1.2664,
"step": 53400
},
{
"epoch": 2.67,
"grad_norm": 3.3811118602752686,
"learning_rate": 9.350753768844222e-06,
"loss": 1.2317,
"step": 53500
},
{
"epoch": 2.68,
"grad_norm": 6.415603160858154,
"learning_rate": 9.33065326633166e-06,
"loss": 1.2613,
"step": 53600
},
{
"epoch": 2.69,
"grad_norm": 4.720609188079834,
"learning_rate": 9.310552763819096e-06,
"loss": 1.2124,
"step": 53700
},
{
"epoch": 2.69,
"grad_norm": 5.3697710037231445,
"learning_rate": 9.290452261306533e-06,
"loss": 1.2388,
"step": 53800
},
{
"epoch": 2.69,
"grad_norm": 4.376136302947998,
"learning_rate": 9.27035175879397e-06,
"loss": 1.2495,
"step": 53900
},
{
"epoch": 2.7,
"grad_norm": 3.973159074783325,
"learning_rate": 9.250251256281407e-06,
"loss": 1.2389,
"step": 54000
},
{
"epoch": 2.71,
"grad_norm": 6.894681930541992,
"learning_rate": 9.230150753768845e-06,
"loss": 1.2056,
"step": 54100
},
{
"epoch": 2.71,
"grad_norm": 4.781852722167969,
"learning_rate": 9.210050251256281e-06,
"loss": 1.2316,
"step": 54200
},
{
"epoch": 2.71,
"grad_norm": 4.408322334289551,
"learning_rate": 9.189949748743719e-06,
"loss": 1.2945,
"step": 54300
},
{
"epoch": 2.72,
"grad_norm": 4.875626564025879,
"learning_rate": 9.169849246231157e-06,
"loss": 1.2458,
"step": 54400
},
{
"epoch": 2.73,
"grad_norm": 5.88706111907959,
"learning_rate": 9.149748743718593e-06,
"loss": 1.2257,
"step": 54500
},
{
"epoch": 2.73,
"grad_norm": 4.785450458526611,
"learning_rate": 9.129648241206031e-06,
"loss": 1.2372,
"step": 54600
},
{
"epoch": 2.73,
"grad_norm": 4.651752948760986,
"learning_rate": 9.109547738693468e-06,
"loss": 1.2146,
"step": 54700
},
{
"epoch": 2.74,
"grad_norm": 5.303548336029053,
"learning_rate": 9.089447236180905e-06,
"loss": 1.2307,
"step": 54800
},
{
"epoch": 2.75,
"grad_norm": 4.032742977142334,
"learning_rate": 9.069346733668343e-06,
"loss": 1.2467,
"step": 54900
},
{
"epoch": 2.75,
"grad_norm": 4.288597583770752,
"learning_rate": 9.04924623115578e-06,
"loss": 1.248,
"step": 55000
},
{
"epoch": 2.75,
"grad_norm": 5.981525897979736,
"learning_rate": 9.029145728643218e-06,
"loss": 1.2344,
"step": 55100
},
{
"epoch": 2.76,
"grad_norm": 4.837640762329102,
"learning_rate": 9.009045226130654e-06,
"loss": 1.2305,
"step": 55200
},
{
"epoch": 2.77,
"grad_norm": 5.082337856292725,
"learning_rate": 8.988944723618092e-06,
"loss": 1.2199,
"step": 55300
},
{
"epoch": 2.77,
"grad_norm": 5.879444599151611,
"learning_rate": 8.968844221105528e-06,
"loss": 1.2158,
"step": 55400
},
{
"epoch": 2.77,
"grad_norm": 4.926747798919678,
"learning_rate": 8.948944723618092e-06,
"loss": 1.2575,
"step": 55500
},
{
"epoch": 2.78,
"grad_norm": 4.744166851043701,
"learning_rate": 8.928844221105529e-06,
"loss": 1.2325,
"step": 55600
},
{
"epoch": 2.79,
"grad_norm": 5.994776725769043,
"learning_rate": 8.908743718592967e-06,
"loss": 1.2199,
"step": 55700
},
{
"epoch": 2.79,
"grad_norm": 4.0552215576171875,
"learning_rate": 8.888643216080403e-06,
"loss": 1.2214,
"step": 55800
},
{
"epoch": 2.79,
"grad_norm": 6.152566432952881,
"learning_rate": 8.868542713567841e-06,
"loss": 1.2565,
"step": 55900
},
{
"epoch": 2.8,
"grad_norm": 5.718895435333252,
"learning_rate": 8.848442211055277e-06,
"loss": 1.2452,
"step": 56000
},
{
"epoch": 2.81,
"grad_norm": 6.39285135269165,
"learning_rate": 8.828341708542715e-06,
"loss": 1.2,
"step": 56100
},
{
"epoch": 2.81,
"grad_norm": 6.312516689300537,
"learning_rate": 8.808241206030151e-06,
"loss": 1.2739,
"step": 56200
},
{
"epoch": 2.81,
"grad_norm": 5.090448379516602,
"learning_rate": 8.78814070351759e-06,
"loss": 1.2405,
"step": 56300
},
{
"epoch": 2.82,
"grad_norm": 3.6773719787597656,
"learning_rate": 8.768040201005026e-06,
"loss": 1.2231,
"step": 56400
},
{
"epoch": 2.83,
"grad_norm": 3.831404209136963,
"learning_rate": 8.747939698492462e-06,
"loss": 1.2068,
"step": 56500
},
{
"epoch": 2.83,
"grad_norm": 6.460518836975098,
"learning_rate": 8.7278391959799e-06,
"loss": 1.2585,
"step": 56600
},
{
"epoch": 2.83,
"grad_norm": 5.495750427246094,
"learning_rate": 8.707738693467336e-06,
"loss": 1.2489,
"step": 56700
},
{
"epoch": 2.84,
"grad_norm": 3.785914421081543,
"learning_rate": 8.687638190954774e-06,
"loss": 1.1986,
"step": 56800
},
{
"epoch": 2.84,
"grad_norm": 3.443301200866699,
"learning_rate": 8.667537688442212e-06,
"loss": 1.233,
"step": 56900
},
{
"epoch": 2.85,
"grad_norm": 7.463976860046387,
"learning_rate": 8.647437185929648e-06,
"loss": 1.214,
"step": 57000
},
{
"epoch": 2.85,
"grad_norm": 5.533621788024902,
"learning_rate": 8.627336683417086e-06,
"loss": 1.2116,
"step": 57100
},
{
"epoch": 2.86,
"grad_norm": 5.238498210906982,
"learning_rate": 8.607236180904524e-06,
"loss": 1.1792,
"step": 57200
},
{
"epoch": 2.87,
"grad_norm": 5.2816996574401855,
"learning_rate": 8.58713567839196e-06,
"loss": 1.2359,
"step": 57300
},
{
"epoch": 2.87,
"grad_norm": 4.712213516235352,
"learning_rate": 8.567035175879399e-06,
"loss": 1.1824,
"step": 57400
},
{
"epoch": 2.88,
"grad_norm": 3.9056365489959717,
"learning_rate": 8.546934673366835e-06,
"loss": 1.2206,
"step": 57500
},
{
"epoch": 2.88,
"grad_norm": 6.587601184844971,
"learning_rate": 8.527035175879397e-06,
"loss": 1.2542,
"step": 57600
},
{
"epoch": 2.88,
"grad_norm": 4.349347114562988,
"learning_rate": 8.506934673366835e-06,
"loss": 1.2516,
"step": 57700
},
{
"epoch": 2.89,
"grad_norm": 4.775893211364746,
"learning_rate": 8.486834170854272e-06,
"loss": 1.18,
"step": 57800
},
{
"epoch": 2.9,
"grad_norm": 4.952343940734863,
"learning_rate": 8.46673366834171e-06,
"loss": 1.2287,
"step": 57900
},
{
"epoch": 2.9,
"grad_norm": 5.0424089431762695,
"learning_rate": 8.446834170854272e-06,
"loss": 1.2044,
"step": 58000
},
{
"epoch": 2.91,
"grad_norm": 5.429243564605713,
"learning_rate": 8.426733668341708e-06,
"loss": 1.1936,
"step": 58100
},
{
"epoch": 2.91,
"grad_norm": 4.514014720916748,
"learning_rate": 8.406633165829146e-06,
"loss": 1.2417,
"step": 58200
},
{
"epoch": 2.92,
"grad_norm": 4.364452362060547,
"learning_rate": 8.386532663316584e-06,
"loss": 1.2004,
"step": 58300
},
{
"epoch": 2.92,
"grad_norm": 3.5190353393554688,
"learning_rate": 8.36643216080402e-06,
"loss": 1.2144,
"step": 58400
},
{
"epoch": 2.92,
"grad_norm": 5.794633865356445,
"learning_rate": 8.346331658291458e-06,
"loss": 1.2072,
"step": 58500
},
{
"epoch": 2.93,
"grad_norm": 4.060710430145264,
"learning_rate": 8.326231155778895e-06,
"loss": 1.225,
"step": 58600
},
{
"epoch": 2.94,
"grad_norm": 4.3035664558410645,
"learning_rate": 8.306130653266333e-06,
"loss": 1.1935,
"step": 58700
},
{
"epoch": 2.94,
"grad_norm": 4.8658246994018555,
"learning_rate": 8.28603015075377e-06,
"loss": 1.2316,
"step": 58800
},
{
"epoch": 2.94,
"grad_norm": 3.5524916648864746,
"learning_rate": 8.265929648241207e-06,
"loss": 1.2119,
"step": 58900
},
{
"epoch": 2.95,
"grad_norm": 3.966935157775879,
"learning_rate": 8.245829145728645e-06,
"loss": 1.2427,
"step": 59000
},
{
"epoch": 2.96,
"grad_norm": 5.453131675720215,
"learning_rate": 8.225728643216081e-06,
"loss": 1.1948,
"step": 59100
},
{
"epoch": 2.96,
"grad_norm": 6.029975414276123,
"learning_rate": 8.20562814070352e-06,
"loss": 1.1688,
"step": 59200
},
{
"epoch": 2.96,
"grad_norm": 3.311718225479126,
"learning_rate": 8.185527638190955e-06,
"loss": 1.2195,
"step": 59300
},
{
"epoch": 2.97,
"grad_norm": 3.63813853263855,
"learning_rate": 8.165427135678393e-06,
"loss": 1.2288,
"step": 59400
},
{
"epoch": 2.98,
"grad_norm": 4.470839500427246,
"learning_rate": 8.14532663316583e-06,
"loss": 1.2273,
"step": 59500
},
{
"epoch": 2.98,
"grad_norm": 4.462855815887451,
"learning_rate": 8.125226130653266e-06,
"loss": 1.2066,
"step": 59600
},
{
"epoch": 2.98,
"grad_norm": 4.757040023803711,
"learning_rate": 8.105125628140704e-06,
"loss": 1.2159,
"step": 59700
},
{
"epoch": 2.99,
"grad_norm": 5.637049674987793,
"learning_rate": 8.08502512562814e-06,
"loss": 1.2449,
"step": 59800
},
{
"epoch": 3.0,
"grad_norm": 5.578622341156006,
"learning_rate": 8.064924623115578e-06,
"loss": 1.2401,
"step": 59900
},
{
"epoch": 3.0,
"grad_norm": 6.322601318359375,
"learning_rate": 8.044824120603014e-06,
"loss": 1.2546,
"step": 60000
},
{
"epoch": 3.0,
"eval_loss": 1.232736587524414,
"eval_runtime": 21.8587,
"eval_samples_per_second": 45.748,
"eval_steps_per_second": 5.719,
"step": 60000
},
{
"epoch": 3.0,
"grad_norm": 4.387004375457764,
"learning_rate": 8.024723618090452e-06,
"loss": 1.1591,
"step": 60100
},
{
"epoch": 3.01,
"grad_norm": 3.7655532360076904,
"learning_rate": 8.00462311557789e-06,
"loss": 1.2148,
"step": 60200
},
{
"epoch": 3.02,
"grad_norm": 4.917843341827393,
"learning_rate": 7.984522613065327e-06,
"loss": 1.1924,
"step": 60300
},
{
"epoch": 3.02,
"grad_norm": 4.71078634262085,
"learning_rate": 7.964422110552765e-06,
"loss": 1.1649,
"step": 60400
},
{
"epoch": 3.02,
"grad_norm": 6.106967449188232,
"learning_rate": 7.944321608040203e-06,
"loss": 1.2001,
"step": 60500
},
{
"epoch": 3.03,
"grad_norm": 5.224365711212158,
"learning_rate": 7.924221105527639e-06,
"loss": 1.1912,
"step": 60600
},
{
"epoch": 3.04,
"grad_norm": 6.110058784484863,
"learning_rate": 7.904120603015077e-06,
"loss": 1.2004,
"step": 60700
},
{
"epoch": 3.04,
"grad_norm": 4.606750965118408,
"learning_rate": 7.884020100502513e-06,
"loss": 1.1824,
"step": 60800
},
{
"epoch": 3.04,
"grad_norm": 4.328644275665283,
"learning_rate": 7.863919597989951e-06,
"loss": 1.1818,
"step": 60900
},
{
"epoch": 3.05,
"grad_norm": 5.017879009246826,
"learning_rate": 7.843819095477387e-06,
"loss": 1.1843,
"step": 61000
},
{
"epoch": 3.06,
"grad_norm": 6.072721481323242,
"learning_rate": 7.823718592964825e-06,
"loss": 1.1876,
"step": 61100
},
{
"epoch": 3.06,
"grad_norm": 5.169823169708252,
"learning_rate": 7.803618090452262e-06,
"loss": 1.1762,
"step": 61200
},
{
"epoch": 3.06,
"grad_norm": 5.349250793457031,
"learning_rate": 7.7835175879397e-06,
"loss": 1.1541,
"step": 61300
},
{
"epoch": 3.07,
"grad_norm": 5.824612140655518,
"learning_rate": 7.763417085427136e-06,
"loss": 1.1696,
"step": 61400
},
{
"epoch": 3.08,
"grad_norm": 6.2018938064575195,
"learning_rate": 7.743316582914574e-06,
"loss": 1.1621,
"step": 61500
},
{
"epoch": 3.08,
"grad_norm": 4.709869384765625,
"learning_rate": 7.72321608040201e-06,
"loss": 1.1777,
"step": 61600
},
{
"epoch": 3.08,
"grad_norm": 4.259114742279053,
"learning_rate": 7.703115577889448e-06,
"loss": 1.1883,
"step": 61700
},
{
"epoch": 3.09,
"grad_norm": 5.505044460296631,
"learning_rate": 7.683015075376884e-06,
"loss": 1.1896,
"step": 61800
},
{
"epoch": 3.1,
"grad_norm": 5.121050834655762,
"learning_rate": 7.662914572864322e-06,
"loss": 1.1533,
"step": 61900
},
{
"epoch": 3.1,
"grad_norm": 3.265988349914551,
"learning_rate": 7.642814070351759e-06,
"loss": 1.1746,
"step": 62000
},
{
"epoch": 3.1,
"grad_norm": 4.327176094055176,
"learning_rate": 7.622713567839196e-06,
"loss": 1.2136,
"step": 62100
},
{
"epoch": 3.11,
"grad_norm": 6.783113479614258,
"learning_rate": 7.602613065326634e-06,
"loss": 1.1538,
"step": 62200
},
{
"epoch": 3.12,
"grad_norm": 6.27109956741333,
"learning_rate": 7.582512562814071e-06,
"loss": 1.1995,
"step": 62300
},
{
"epoch": 3.12,
"grad_norm": 6.903465270996094,
"learning_rate": 7.562412060301508e-06,
"loss": 1.1468,
"step": 62400
},
{
"epoch": 3.12,
"grad_norm": 4.696254253387451,
"learning_rate": 7.542311557788945e-06,
"loss": 1.199,
"step": 62500
},
{
"epoch": 3.13,
"grad_norm": 4.280835151672363,
"learning_rate": 7.522211055276382e-06,
"loss": 1.1616,
"step": 62600
},
{
"epoch": 3.13,
"grad_norm": 5.0677924156188965,
"learning_rate": 7.5021105527638195e-06,
"loss": 1.1575,
"step": 62700
},
{
"epoch": 3.14,
"grad_norm": 6.276374816894531,
"learning_rate": 7.4820100502512574e-06,
"loss": 1.137,
"step": 62800
},
{
"epoch": 3.15,
"grad_norm": 4.777525424957275,
"learning_rate": 7.461909547738694e-06,
"loss": 1.201,
"step": 62900
},
{
"epoch": 3.15,
"grad_norm": 4.285521030426025,
"learning_rate": 7.441809045226132e-06,
"loss": 1.1924,
"step": 63000
},
{
"epoch": 3.15,
"grad_norm": 6.18180513381958,
"learning_rate": 7.421708542713568e-06,
"loss": 1.1347,
"step": 63100
},
{
"epoch": 3.16,
"grad_norm": 6.091145038604736,
"learning_rate": 7.401608040201006e-06,
"loss": 1.1795,
"step": 63200
},
{
"epoch": 3.17,
"grad_norm": 5.018629550933838,
"learning_rate": 7.381507537688442e-06,
"loss": 1.1755,
"step": 63300
},
{
"epoch": 3.17,
"grad_norm": 5.406840801239014,
"learning_rate": 7.36140703517588e-06,
"loss": 1.1869,
"step": 63400
},
{
"epoch": 3.17,
"grad_norm": 4.727605819702148,
"learning_rate": 7.341306532663317e-06,
"loss": 1.1772,
"step": 63500
},
{
"epoch": 3.18,
"grad_norm": 5.590334892272949,
"learning_rate": 7.3212060301507544e-06,
"loss": 1.1686,
"step": 63600
},
{
"epoch": 3.19,
"grad_norm": 5.156420707702637,
"learning_rate": 7.3011055276381916e-06,
"loss": 1.1696,
"step": 63700
},
{
"epoch": 3.19,
"grad_norm": 6.83641242980957,
"learning_rate": 7.2810050251256296e-06,
"loss": 1.1732,
"step": 63800
},
{
"epoch": 3.19,
"grad_norm": 4.086230278015137,
"learning_rate": 7.260904522613066e-06,
"loss": 1.1536,
"step": 63900
},
{
"epoch": 3.2,
"grad_norm": 7.394796371459961,
"learning_rate": 7.241005025125629e-06,
"loss": 1.1636,
"step": 64000
},
{
"epoch": 3.21,
"grad_norm": 6.290234088897705,
"learning_rate": 7.220904522613066e-06,
"loss": 1.1486,
"step": 64100
},
{
"epoch": 3.21,
"grad_norm": 5.5817036628723145,
"learning_rate": 7.200804020100503e-06,
"loss": 1.1424,
"step": 64200
},
{
"epoch": 3.21,
"grad_norm": 5.681445598602295,
"learning_rate": 7.1807035175879405e-06,
"loss": 1.1591,
"step": 64300
},
{
"epoch": 3.22,
"grad_norm": 6.768691062927246,
"learning_rate": 7.160603015075377e-06,
"loss": 1.1293,
"step": 64400
},
{
"epoch": 3.23,
"grad_norm": 5.4178595542907715,
"learning_rate": 7.140502512562815e-06,
"loss": 1.1729,
"step": 64500
},
{
"epoch": 3.23,
"grad_norm": 4.215164661407471,
"learning_rate": 7.120402010050251e-06,
"loss": 1.1783,
"step": 64600
},
{
"epoch": 3.23,
"grad_norm": 5.66365385055542,
"learning_rate": 7.100301507537689e-06,
"loss": 1.1967,
"step": 64700
},
{
"epoch": 3.24,
"grad_norm": 5.554622650146484,
"learning_rate": 7.080201005025126e-06,
"loss": 1.1466,
"step": 64800
},
{
"epoch": 3.25,
"grad_norm": 5.001458644866943,
"learning_rate": 7.060100502512563e-06,
"loss": 1.1732,
"step": 64900
},
{
"epoch": 3.25,
"grad_norm": 2.8027803897857666,
"learning_rate": 7.04e-06,
"loss": 1.152,
"step": 65000
},
{
"epoch": 3.25,
"grad_norm": 4.733661651611328,
"learning_rate": 7.019899497487438e-06,
"loss": 1.1155,
"step": 65100
},
{
"epoch": 3.26,
"grad_norm": 7.268320083618164,
"learning_rate": 6.999798994974875e-06,
"loss": 1.1703,
"step": 65200
},
{
"epoch": 3.27,
"grad_norm": 6.752691745758057,
"learning_rate": 6.979698492462313e-06,
"loss": 1.1986,
"step": 65300
},
{
"epoch": 3.27,
"grad_norm": 5.244182586669922,
"learning_rate": 6.959597989949749e-06,
"loss": 1.148,
"step": 65400
},
{
"epoch": 3.27,
"grad_norm": 5.481043815612793,
"learning_rate": 6.939497487437187e-06,
"loss": 1.1811,
"step": 65500
},
{
"epoch": 3.28,
"grad_norm": 5.892518997192383,
"learning_rate": 6.919396984924623e-06,
"loss": 1.1574,
"step": 65600
},
{
"epoch": 3.29,
"grad_norm": 5.347742557525635,
"learning_rate": 6.899296482412061e-06,
"loss": 1.1353,
"step": 65700
},
{
"epoch": 3.29,
"grad_norm": 5.078448295593262,
"learning_rate": 6.879195979899498e-06,
"loss": 1.1398,
"step": 65800
},
{
"epoch": 3.29,
"grad_norm": 4.15362548828125,
"learning_rate": 6.859095477386935e-06,
"loss": 1.1556,
"step": 65900
},
{
"epoch": 3.3,
"grad_norm": 4.748194694519043,
"learning_rate": 6.8389949748743725e-06,
"loss": 1.1929,
"step": 66000
},
{
"epoch": 3.31,
"grad_norm": 5.56561803817749,
"learning_rate": 6.81889447236181e-06,
"loss": 1.1907,
"step": 66100
},
{
"epoch": 3.31,
"grad_norm": 4.8242316246032715,
"learning_rate": 6.798793969849247e-06,
"loss": 1.166,
"step": 66200
},
{
"epoch": 3.31,
"grad_norm": 5.7045087814331055,
"learning_rate": 6.778693467336685e-06,
"loss": 1.1579,
"step": 66300
},
{
"epoch": 3.32,
"grad_norm": 4.583883285522461,
"learning_rate": 6.758592964824121e-06,
"loss": 1.1259,
"step": 66400
},
{
"epoch": 3.33,
"grad_norm": 5.085745811462402,
"learning_rate": 6.738492462311559e-06,
"loss": 1.1431,
"step": 66500
},
{
"epoch": 3.33,
"grad_norm": 4.655329704284668,
"learning_rate": 6.718391959798995e-06,
"loss": 1.1681,
"step": 66600
},
{
"epoch": 3.33,
"grad_norm": 4.375367164611816,
"learning_rate": 6.698291457286433e-06,
"loss": 1.1428,
"step": 66700
},
{
"epoch": 3.34,
"grad_norm": 5.143255710601807,
"learning_rate": 6.6781909547738695e-06,
"loss": 1.1676,
"step": 66800
},
{
"epoch": 3.34,
"grad_norm": 5.463631629943848,
"learning_rate": 6.658090452261307e-06,
"loss": 1.1742,
"step": 66900
},
{
"epoch": 3.35,
"grad_norm": 5.112860679626465,
"learning_rate": 6.637989949748745e-06,
"loss": 1.1655,
"step": 67000
},
{
"epoch": 3.35,
"grad_norm": 3.88566517829895,
"learning_rate": 6.617889447236181e-06,
"loss": 1.1428,
"step": 67100
},
{
"epoch": 3.36,
"grad_norm": 5.075991153717041,
"learning_rate": 6.597788944723619e-06,
"loss": 1.1853,
"step": 67200
},
{
"epoch": 3.37,
"grad_norm": 6.4206061363220215,
"learning_rate": 6.577688442211055e-06,
"loss": 1.1622,
"step": 67300
},
{
"epoch": 3.37,
"grad_norm": 4.789801597595215,
"learning_rate": 6.557587939698493e-06,
"loss": 1.2021,
"step": 67400
},
{
"epoch": 3.38,
"grad_norm": 4.722198486328125,
"learning_rate": 6.53748743718593e-06,
"loss": 1.1515,
"step": 67500
},
{
"epoch": 3.38,
"grad_norm": 4.774144649505615,
"learning_rate": 6.517386934673367e-06,
"loss": 1.1429,
"step": 67600
},
{
"epoch": 3.38,
"grad_norm": 3.848876953125,
"learning_rate": 6.49748743718593e-06,
"loss": 1.1719,
"step": 67700
},
{
"epoch": 3.39,
"grad_norm": 6.2731804847717285,
"learning_rate": 6.477386934673368e-06,
"loss": 1.1513,
"step": 67800
},
{
"epoch": 3.4,
"grad_norm": 6.135923385620117,
"learning_rate": 6.457286432160804e-06,
"loss": 1.1758,
"step": 67900
},
{
"epoch": 3.4,
"grad_norm": 5.437047958374023,
"learning_rate": 6.437185929648242e-06,
"loss": 1.174,
"step": 68000
},
{
"epoch": 3.41,
"grad_norm": 5.043646335601807,
"learning_rate": 6.417085427135678e-06,
"loss": 1.171,
"step": 68100
},
{
"epoch": 3.41,
"grad_norm": 4.104462623596191,
"learning_rate": 6.396984924623116e-06,
"loss": 1.1712,
"step": 68200
},
{
"epoch": 3.42,
"grad_norm": 2.740678310394287,
"learning_rate": 6.376884422110553e-06,
"loss": 1.1127,
"step": 68300
},
{
"epoch": 3.42,
"grad_norm": 5.4752936363220215,
"learning_rate": 6.3567839195979905e-06,
"loss": 1.1971,
"step": 68400
},
{
"epoch": 3.42,
"grad_norm": 7.34414529800415,
"learning_rate": 6.336683417085428e-06,
"loss": 1.1714,
"step": 68500
},
{
"epoch": 3.43,
"grad_norm": 3.3866333961486816,
"learning_rate": 6.316582914572866e-06,
"loss": 1.1604,
"step": 68600
},
{
"epoch": 3.44,
"grad_norm": 5.284789085388184,
"learning_rate": 6.296482412060302e-06,
"loss": 1.1792,
"step": 68700
},
{
"epoch": 3.44,
"grad_norm": 4.47866678237915,
"learning_rate": 6.27638190954774e-06,
"loss": 1.1343,
"step": 68800
},
{
"epoch": 3.44,
"grad_norm": 6.508190631866455,
"learning_rate": 6.256281407035176e-06,
"loss": 1.1286,
"step": 68900
},
{
"epoch": 3.45,
"grad_norm": 5.973139762878418,
"learning_rate": 6.236180904522614e-06,
"loss": 1.1774,
"step": 69000
},
{
"epoch": 3.46,
"grad_norm": 4.717242240905762,
"learning_rate": 6.21608040201005e-06,
"loss": 1.1273,
"step": 69100
},
{
"epoch": 3.46,
"grad_norm": 5.430871486663818,
"learning_rate": 6.195979899497488e-06,
"loss": 1.1764,
"step": 69200
},
{
"epoch": 3.46,
"grad_norm": 4.484432697296143,
"learning_rate": 6.1758793969849255e-06,
"loss": 1.154,
"step": 69300
},
{
"epoch": 3.47,
"grad_norm": 4.041011333465576,
"learning_rate": 6.155778894472362e-06,
"loss": 1.1614,
"step": 69400
},
{
"epoch": 3.48,
"grad_norm": 4.026901721954346,
"learning_rate": 6.1356783919598e-06,
"loss": 1.1376,
"step": 69500
},
{
"epoch": 3.48,
"grad_norm": 6.2372660636901855,
"learning_rate": 6.115577889447236e-06,
"loss": 1.1556,
"step": 69600
},
{
"epoch": 3.48,
"grad_norm": 5.324029445648193,
"learning_rate": 6.095477386934674e-06,
"loss": 1.1533,
"step": 69700
},
{
"epoch": 3.49,
"grad_norm": 7.105170726776123,
"learning_rate": 6.07537688442211e-06,
"loss": 1.1895,
"step": 69800
},
{
"epoch": 3.5,
"grad_norm": 23.556692123413086,
"learning_rate": 6.055276381909548e-06,
"loss": 1.1569,
"step": 69900
},
{
"epoch": 3.5,
"grad_norm": 5.792446613311768,
"learning_rate": 6.035175879396985e-06,
"loss": 1.1488,
"step": 70000
},
{
"epoch": 3.5,
"eval_loss": 1.2055375576019287,
"eval_runtime": 21.7807,
"eval_samples_per_second": 45.912,
"eval_steps_per_second": 5.739,
"step": 70000
},
{
"epoch": 3.5,
"grad_norm": 5.223493576049805,
"learning_rate": 6.0150753768844225e-06,
"loss": 1.144,
"step": 70100
},
{
"epoch": 3.51,
"grad_norm": 4.998097896575928,
"learning_rate": 5.99497487437186e-06,
"loss": 1.154,
"step": 70200
},
{
"epoch": 3.52,
"grad_norm": 6.396411895751953,
"learning_rate": 5.975075376884423e-06,
"loss": 1.1398,
"step": 70300
},
{
"epoch": 3.52,
"grad_norm": 4.5480427742004395,
"learning_rate": 5.954974874371859e-06,
"loss": 1.1753,
"step": 70400
},
{
"epoch": 3.52,
"grad_norm": 5.8266167640686035,
"learning_rate": 5.934874371859297e-06,
"loss": 1.1481,
"step": 70500
},
{
"epoch": 3.53,
"grad_norm": 3.3341739177703857,
"learning_rate": 5.9147738693467334e-06,
"loss": 1.1801,
"step": 70600
},
{
"epoch": 3.54,
"grad_norm": 5.731110572814941,
"learning_rate": 5.894673366834171e-06,
"loss": 1.1945,
"step": 70700
},
{
"epoch": 3.54,
"grad_norm": 5.0252251625061035,
"learning_rate": 5.8745728643216085e-06,
"loss": 1.1171,
"step": 70800
},
{
"epoch": 3.54,
"grad_norm": 6.497035503387451,
"learning_rate": 5.854472361809046e-06,
"loss": 1.1299,
"step": 70900
},
{
"epoch": 3.55,
"grad_norm": 3.014439582824707,
"learning_rate": 5.834371859296483e-06,
"loss": 1.1135,
"step": 71000
},
{
"epoch": 3.56,
"grad_norm": 6.145033359527588,
"learning_rate": 5.814271356783921e-06,
"loss": 1.1544,
"step": 71100
},
{
"epoch": 3.56,
"grad_norm": 4.730653285980225,
"learning_rate": 5.794170854271357e-06,
"loss": 1.1501,
"step": 71200
},
{
"epoch": 3.56,
"grad_norm": 4.570452690124512,
"learning_rate": 5.774070351758795e-06,
"loss": 1.1638,
"step": 71300
},
{
"epoch": 3.57,
"grad_norm": 3.947618007659912,
"learning_rate": 5.753969849246231e-06,
"loss": 1.1436,
"step": 71400
},
{
"epoch": 3.58,
"grad_norm": 6.833681106567383,
"learning_rate": 5.733869346733669e-06,
"loss": 1.1271,
"step": 71500
},
{
"epoch": 3.58,
"grad_norm": 4.837987422943115,
"learning_rate": 5.7137688442211056e-06,
"loss": 1.1503,
"step": 71600
},
{
"epoch": 3.58,
"grad_norm": 4.822892189025879,
"learning_rate": 5.6936683417085435e-06,
"loss": 1.1089,
"step": 71700
},
{
"epoch": 3.59,
"grad_norm": 5.022984027862549,
"learning_rate": 5.673567839195981e-06,
"loss": 1.1562,
"step": 71800
},
{
"epoch": 3.59,
"grad_norm": 5.105147838592529,
"learning_rate": 5.653467336683418e-06,
"loss": 1.1174,
"step": 71900
},
{
"epoch": 3.6,
"grad_norm": 4.37985372543335,
"learning_rate": 5.633366834170855e-06,
"loss": 1.1627,
"step": 72000
},
{
"epoch": 3.6,
"grad_norm": 3.854820966720581,
"learning_rate": 5.613266331658291e-06,
"loss": 1.1316,
"step": 72100
},
{
"epoch": 3.61,
"grad_norm": 7.305357933044434,
"learning_rate": 5.593165829145729e-06,
"loss": 1.1827,
"step": 72200
},
{
"epoch": 3.62,
"grad_norm": 7.693294048309326,
"learning_rate": 5.5730653266331654e-06,
"loss": 1.1649,
"step": 72300
},
{
"epoch": 3.62,
"grad_norm": 4.14479398727417,
"learning_rate": 5.552964824120603e-06,
"loss": 1.1844,
"step": 72400
},
{
"epoch": 3.62,
"grad_norm": 6.665209770202637,
"learning_rate": 5.5328643216080405e-06,
"loss": 1.1304,
"step": 72500
},
{
"epoch": 3.63,
"grad_norm": 4.629899978637695,
"learning_rate": 5.512763819095478e-06,
"loss": 1.1318,
"step": 72600
},
{
"epoch": 3.63,
"grad_norm": 3.4445009231567383,
"learning_rate": 5.492663316582915e-06,
"loss": 1.1532,
"step": 72700
},
{
"epoch": 3.64,
"grad_norm": 3.96806001663208,
"learning_rate": 5.472562814070353e-06,
"loss": 1.1332,
"step": 72800
},
{
"epoch": 3.65,
"grad_norm": 7.256198406219482,
"learning_rate": 5.452462311557789e-06,
"loss": 1.1339,
"step": 72900
},
{
"epoch": 3.65,
"grad_norm": 4.4425458908081055,
"learning_rate": 5.432361809045227e-06,
"loss": 1.1615,
"step": 73000
},
{
"epoch": 3.66,
"grad_norm": 6.114246368408203,
"learning_rate": 5.412261306532663e-06,
"loss": 1.1359,
"step": 73100
},
{
"epoch": 3.66,
"grad_norm": 3.9182209968566895,
"learning_rate": 5.392160804020101e-06,
"loss": 1.1574,
"step": 73200
},
{
"epoch": 3.67,
"grad_norm": 6.088989734649658,
"learning_rate": 5.3720603015075376e-06,
"loss": 1.0938,
"step": 73300
},
{
"epoch": 3.67,
"grad_norm": 6.2887163162231445,
"learning_rate": 5.3519597989949755e-06,
"loss": 1.1546,
"step": 73400
},
{
"epoch": 3.67,
"grad_norm": 5.033719539642334,
"learning_rate": 5.331859296482413e-06,
"loss": 1.1177,
"step": 73500
},
{
"epoch": 3.68,
"grad_norm": 6.611480236053467,
"learning_rate": 5.31175879396985e-06,
"loss": 1.1238,
"step": 73600
},
{
"epoch": 3.69,
"grad_norm": 7.612136363983154,
"learning_rate": 5.291658291457287e-06,
"loss": 1.1494,
"step": 73700
},
{
"epoch": 3.69,
"grad_norm": 3.842085599899292,
"learning_rate": 5.271557788944725e-06,
"loss": 1.1585,
"step": 73800
},
{
"epoch": 3.69,
"grad_norm": 4.859694004058838,
"learning_rate": 5.251457286432161e-06,
"loss": 1.1299,
"step": 73900
},
{
"epoch": 3.7,
"grad_norm": 6.1673808097839355,
"learning_rate": 5.231356783919599e-06,
"loss": 1.1242,
"step": 74000
},
{
"epoch": 3.71,
"grad_norm": 4.361232280731201,
"learning_rate": 5.2112562814070354e-06,
"loss": 1.1545,
"step": 74100
},
{
"epoch": 3.71,
"grad_norm": 5.18151330947876,
"learning_rate": 5.191155778894473e-06,
"loss": 1.0794,
"step": 74200
},
{
"epoch": 3.71,
"grad_norm": 5.049399375915527,
"learning_rate": 5.17105527638191e-06,
"loss": 1.136,
"step": 74300
},
{
"epoch": 3.72,
"grad_norm": 4.28516149520874,
"learning_rate": 5.150954773869347e-06,
"loss": 1.177,
"step": 74400
},
{
"epoch": 3.73,
"grad_norm": 3.71186900138855,
"learning_rate": 5.13105527638191e-06,
"loss": 1.1747,
"step": 74500
},
{
"epoch": 3.73,
"grad_norm": 8.269830703735352,
"learning_rate": 5.110954773869348e-06,
"loss": 1.1494,
"step": 74600
},
{
"epoch": 3.73,
"grad_norm": 4.453472137451172,
"learning_rate": 5.09105527638191e-06,
"loss": 1.1511,
"step": 74700
},
{
"epoch": 3.74,
"grad_norm": 6.132487773895264,
"learning_rate": 5.070954773869348e-06,
"loss": 1.1604,
"step": 74800
},
{
"epoch": 3.75,
"grad_norm": 4.084425926208496,
"learning_rate": 5.050854271356785e-06,
"loss": 1.1582,
"step": 74900
},
{
"epoch": 3.75,
"grad_norm": 4.719122886657715,
"learning_rate": 5.030753768844222e-06,
"loss": 1.0998,
"step": 75000
},
{
"epoch": 3.75,
"grad_norm": 6.886116981506348,
"learning_rate": 5.010854271356784e-06,
"loss": 1.1402,
"step": 75100
},
{
"epoch": 3.76,
"grad_norm": 3.9882471561431885,
"learning_rate": 4.990753768844221e-06,
"loss": 1.1163,
"step": 75200
},
{
"epoch": 3.77,
"grad_norm": 7.415084362030029,
"learning_rate": 4.9706532663316585e-06,
"loss": 1.1145,
"step": 75300
},
{
"epoch": 3.77,
"grad_norm": 4.16004753112793,
"learning_rate": 4.950552763819096e-06,
"loss": 1.1071,
"step": 75400
},
{
"epoch": 3.77,
"grad_norm": 5.258670806884766,
"learning_rate": 4.930452261306533e-06,
"loss": 1.1242,
"step": 75500
},
{
"epoch": 3.78,
"grad_norm": 5.320291519165039,
"learning_rate": 4.910351758793971e-06,
"loss": 1.1388,
"step": 75600
},
{
"epoch": 3.79,
"grad_norm": 4.200166702270508,
"learning_rate": 4.890251256281408e-06,
"loss": 1.1395,
"step": 75700
},
{
"epoch": 3.79,
"grad_norm": 4.569030284881592,
"learning_rate": 4.870150753768845e-06,
"loss": 1.1397,
"step": 75800
},
{
"epoch": 3.79,
"grad_norm": 4.229086875915527,
"learning_rate": 4.850050251256282e-06,
"loss": 1.1705,
"step": 75900
},
{
"epoch": 3.8,
"grad_norm": 5.543234825134277,
"learning_rate": 4.829949748743719e-06,
"loss": 1.1403,
"step": 76000
},
{
"epoch": 3.81,
"grad_norm": 4.94819974899292,
"learning_rate": 4.809849246231156e-06,
"loss": 1.1395,
"step": 76100
},
{
"epoch": 3.81,
"grad_norm": 3.889681577682495,
"learning_rate": 4.7897487437185935e-06,
"loss": 1.137,
"step": 76200
},
{
"epoch": 3.81,
"grad_norm": 5.113078594207764,
"learning_rate": 4.769648241206031e-06,
"loss": 1.1171,
"step": 76300
},
{
"epoch": 3.82,
"grad_norm": 4.799468994140625,
"learning_rate": 4.749547738693468e-06,
"loss": 1.1235,
"step": 76400
},
{
"epoch": 3.83,
"grad_norm": 4.0734453201293945,
"learning_rate": 4.729447236180905e-06,
"loss": 1.1283,
"step": 76500
},
{
"epoch": 3.83,
"grad_norm": 4.847392559051514,
"learning_rate": 4.709346733668342e-06,
"loss": 1.1932,
"step": 76600
},
{
"epoch": 3.83,
"grad_norm": 6.197125434875488,
"learning_rate": 4.689246231155779e-06,
"loss": 1.144,
"step": 76700
},
{
"epoch": 3.84,
"grad_norm": 5.93238639831543,
"learning_rate": 4.669145728643216e-06,
"loss": 1.0989,
"step": 76800
},
{
"epoch": 3.84,
"grad_norm": 5.202603340148926,
"learning_rate": 4.649045226130653e-06,
"loss": 1.1312,
"step": 76900
},
{
"epoch": 3.85,
"grad_norm": 5.290008544921875,
"learning_rate": 4.6289447236180905e-06,
"loss": 1.096,
"step": 77000
},
{
"epoch": 3.85,
"grad_norm": 4.39517879486084,
"learning_rate": 4.608844221105528e-06,
"loss": 1.1948,
"step": 77100
},
{
"epoch": 3.86,
"grad_norm": 6.785552024841309,
"learning_rate": 4.588743718592965e-06,
"loss": 1.139,
"step": 77200
},
{
"epoch": 3.87,
"grad_norm": 5.2934370040893555,
"learning_rate": 4.568643216080402e-06,
"loss": 1.1152,
"step": 77300
},
{
"epoch": 3.87,
"grad_norm": 5.2376604080200195,
"learning_rate": 4.54854271356784e-06,
"loss": 1.1464,
"step": 77400
},
{
"epoch": 3.88,
"grad_norm": 6.19007682800293,
"learning_rate": 4.528442211055277e-06,
"loss": 1.1199,
"step": 77500
},
{
"epoch": 3.88,
"grad_norm": 5.796671390533447,
"learning_rate": 4.508341708542714e-06,
"loss": 1.1399,
"step": 77600
},
{
"epoch": 3.88,
"grad_norm": 5.685388565063477,
"learning_rate": 4.488241206030151e-06,
"loss": 1.1485,
"step": 77700
},
{
"epoch": 3.89,
"grad_norm": 6.502816677093506,
"learning_rate": 4.468140703517588e-06,
"loss": 1.15,
"step": 77800
},
{
"epoch": 3.9,
"grad_norm": 4.437497138977051,
"learning_rate": 4.4480402010050255e-06,
"loss": 1.1344,
"step": 77900
},
{
"epoch": 3.9,
"grad_norm": 6.776554107666016,
"learning_rate": 4.427939698492463e-06,
"loss": 1.1486,
"step": 78000
},
{
"epoch": 3.91,
"grad_norm": 3.9491071701049805,
"learning_rate": 4.4078391959799e-06,
"loss": 1.1269,
"step": 78100
},
{
"epoch": 3.91,
"grad_norm": 5.485203266143799,
"learning_rate": 4.387738693467337e-06,
"loss": 1.1539,
"step": 78200
},
{
"epoch": 3.92,
"grad_norm": 7.40858793258667,
"learning_rate": 4.367638190954774e-06,
"loss": 1.1281,
"step": 78300
},
{
"epoch": 3.92,
"grad_norm": 4.498636722564697,
"learning_rate": 4.347537688442212e-06,
"loss": 1.1912,
"step": 78400
},
{
"epoch": 3.92,
"grad_norm": 6.4472856521606445,
"learning_rate": 4.327437185929649e-06,
"loss": 1.1276,
"step": 78500
},
{
"epoch": 3.93,
"grad_norm": 6.126656532287598,
"learning_rate": 4.307336683417086e-06,
"loss": 1.1329,
"step": 78600
},
{
"epoch": 3.94,
"grad_norm": 5.280217170715332,
"learning_rate": 4.287236180904523e-06,
"loss": 1.093,
"step": 78700
},
{
"epoch": 3.94,
"grad_norm": 6.618311405181885,
"learning_rate": 4.2671356783919605e-06,
"loss": 1.1215,
"step": 78800
},
{
"epoch": 3.94,
"grad_norm": 5.225731372833252,
"learning_rate": 4.247035175879397e-06,
"loss": 1.1252,
"step": 78900
},
{
"epoch": 3.95,
"grad_norm": 6.615197658538818,
"learning_rate": 4.226934673366834e-06,
"loss": 1.0734,
"step": 79000
},
{
"epoch": 3.96,
"grad_norm": 5.426534175872803,
"learning_rate": 4.206834170854272e-06,
"loss": 1.0851,
"step": 79100
},
{
"epoch": 3.96,
"grad_norm": 3.9240429401397705,
"learning_rate": 4.186733668341709e-06,
"loss": 1.1033,
"step": 79200
},
{
"epoch": 3.96,
"grad_norm": 4.4714250564575195,
"learning_rate": 4.166633165829146e-06,
"loss": 1.1347,
"step": 79300
},
{
"epoch": 3.97,
"grad_norm": 10.831876754760742,
"learning_rate": 4.146532663316583e-06,
"loss": 1.0971,
"step": 79400
},
{
"epoch": 3.98,
"grad_norm": 5.725513458251953,
"learning_rate": 4.12643216080402e-06,
"loss": 1.1346,
"step": 79500
},
{
"epoch": 3.98,
"grad_norm": 6.024409770965576,
"learning_rate": 4.1063316582914575e-06,
"loss": 1.1551,
"step": 79600
},
{
"epoch": 3.98,
"grad_norm": 3.672581195831299,
"learning_rate": 4.086231155778895e-06,
"loss": 1.1365,
"step": 79700
},
{
"epoch": 3.99,
"grad_norm": 5.520185470581055,
"learning_rate": 4.066130653266332e-06,
"loss": 1.1321,
"step": 79800
},
{
"epoch": 4.0,
"grad_norm": 6.262924671173096,
"learning_rate": 4.046030150753769e-06,
"loss": 1.1062,
"step": 79900
},
{
"epoch": 4.0,
"grad_norm": 6.248514652252197,
"learning_rate": 4.025929648241206e-06,
"loss": 1.0963,
"step": 80000
},
{
"epoch": 4.0,
"eval_loss": 1.1962475776672363,
"eval_runtime": 21.7746,
"eval_samples_per_second": 45.925,
"eval_steps_per_second": 5.741,
"step": 80000
},
{
"epoch": 4.0,
"grad_norm": 4.919951915740967,
"learning_rate": 4.005829145728643e-06,
"loss": 1.0946,
"step": 80100
},
{
"epoch": 4.01,
"grad_norm": 4.365044593811035,
"learning_rate": 3.985728643216081e-06,
"loss": 1.0987,
"step": 80200
},
{
"epoch": 4.01,
"grad_norm": 9.315962791442871,
"learning_rate": 3.965628140703518e-06,
"loss": 1.0575,
"step": 80300
},
{
"epoch": 4.02,
"grad_norm": 10.50012493133545,
"learning_rate": 3.945527638190955e-06,
"loss": 1.069,
"step": 80400
},
{
"epoch": 4.03,
"grad_norm": 8.853384017944336,
"learning_rate": 3.9254271356783925e-06,
"loss": 1.0638,
"step": 80500
},
{
"epoch": 4.03,
"grad_norm": 5.290276527404785,
"learning_rate": 3.90532663316583e-06,
"loss": 1.0422,
"step": 80600
},
{
"epoch": 4.04,
"grad_norm": 5.553296089172363,
"learning_rate": 3.885226130653267e-06,
"loss": 1.0517,
"step": 80700
},
{
"epoch": 4.04,
"grad_norm": 5.262092113494873,
"learning_rate": 3.865125628140704e-06,
"loss": 1.1184,
"step": 80800
},
{
"epoch": 4.04,
"grad_norm": 8.444523811340332,
"learning_rate": 3.845025125628141e-06,
"loss": 1.0861,
"step": 80900
},
{
"epoch": 4.05,
"grad_norm": 5.442756652832031,
"learning_rate": 3.824924623115578e-06,
"loss": 1.0559,
"step": 81000
},
{
"epoch": 4.05,
"grad_norm": 5.926506519317627,
"learning_rate": 3.8050251256281414e-06,
"loss": 1.0482,
"step": 81100
},
{
"epoch": 4.06,
"grad_norm": 5.596924781799316,
"learning_rate": 3.7849246231155785e-06,
"loss": 1.071,
"step": 81200
},
{
"epoch": 4.07,
"grad_norm": 8.789709091186523,
"learning_rate": 3.7648241206030156e-06,
"loss": 1.0796,
"step": 81300
},
{
"epoch": 4.07,
"grad_norm": 5.048920631408691,
"learning_rate": 3.7447236180904528e-06,
"loss": 1.0602,
"step": 81400
},
{
"epoch": 4.08,
"grad_norm": 6.4080705642700195,
"learning_rate": 3.72462311557789e-06,
"loss": 1.0481,
"step": 81500
},
{
"epoch": 4.08,
"grad_norm": 6.361392974853516,
"learning_rate": 3.7045226130653266e-06,
"loss": 1.0393,
"step": 81600
},
{
"epoch": 4.08,
"grad_norm": 7.596906661987305,
"learning_rate": 3.6844221105527637e-06,
"loss": 1.0212,
"step": 81700
},
{
"epoch": 4.09,
"grad_norm": 5.352015972137451,
"learning_rate": 3.6643216080402013e-06,
"loss": 1.0159,
"step": 81800
},
{
"epoch": 4.09,
"grad_norm": 5.569393157958984,
"learning_rate": 3.6442211055276384e-06,
"loss": 1.0516,
"step": 81900
},
{
"epoch": 4.1,
"grad_norm": 5.463687419891357,
"learning_rate": 3.6241206030150755e-06,
"loss": 1.0729,
"step": 82000
},
{
"epoch": 4.11,
"grad_norm": 11.406126976013184,
"learning_rate": 3.6040201005025127e-06,
"loss": 1.0389,
"step": 82100
},
{
"epoch": 4.11,
"grad_norm": 6.264597415924072,
"learning_rate": 3.58391959798995e-06,
"loss": 1.0613,
"step": 82200
},
{
"epoch": 4.12,
"grad_norm": 6.288965225219727,
"learning_rate": 3.5638190954773873e-06,
"loss": 1.0486,
"step": 82300
},
{
"epoch": 4.12,
"grad_norm": 5.622555732727051,
"learning_rate": 3.5437185929648245e-06,
"loss": 1.0665,
"step": 82400
},
{
"epoch": 4.12,
"grad_norm": 4.834249973297119,
"learning_rate": 3.5236180904522616e-06,
"loss": 1.074,
"step": 82500
},
{
"epoch": 4.13,
"grad_norm": 8.10172176361084,
"learning_rate": 3.5035175879396987e-06,
"loss": 1.0272,
"step": 82600
},
{
"epoch": 4.13,
"grad_norm": 5.477063179016113,
"learning_rate": 3.483417085427136e-06,
"loss": 1.0405,
"step": 82700
},
{
"epoch": 4.14,
"grad_norm": 4.51005220413208,
"learning_rate": 3.463316582914573e-06,
"loss": 1.0339,
"step": 82800
},
{
"epoch": 4.14,
"grad_norm": 7.996946811676025,
"learning_rate": 3.4432160804020105e-06,
"loss": 1.0159,
"step": 82900
},
{
"epoch": 4.15,
"grad_norm": 6.292382717132568,
"learning_rate": 3.4231155778894477e-06,
"loss": 1.085,
"step": 83000
},
{
"epoch": 4.16,
"grad_norm": 8.48137092590332,
"learning_rate": 3.4030150753768848e-06,
"loss": 1.047,
"step": 83100
},
{
"epoch": 4.16,
"grad_norm": 6.187898635864258,
"learning_rate": 3.382914572864322e-06,
"loss": 1.0577,
"step": 83200
},
{
"epoch": 4.17,
"grad_norm": 4.8612775802612305,
"learning_rate": 3.3630150753768847e-06,
"loss": 1.0273,
"step": 83300
},
{
"epoch": 4.17,
"grad_norm": 5.060107231140137,
"learning_rate": 3.342914572864322e-06,
"loss": 1.0998,
"step": 83400
},
{
"epoch": 4.17,
"grad_norm": 4.8297505378723145,
"learning_rate": 3.322814070351759e-06,
"loss": 1.0821,
"step": 83500
},
{
"epoch": 4.18,
"grad_norm": 4.36783504486084,
"learning_rate": 3.3027135678391965e-06,
"loss": 1.0591,
"step": 83600
},
{
"epoch": 4.18,
"grad_norm": 7.225709438323975,
"learning_rate": 3.2826130653266337e-06,
"loss": 1.0245,
"step": 83700
},
{
"epoch": 4.19,
"grad_norm": 5.521580219268799,
"learning_rate": 3.262512562814071e-06,
"loss": 1.0131,
"step": 83800
},
{
"epoch": 4.2,
"grad_norm": 4.925292491912842,
"learning_rate": 3.242412060301508e-06,
"loss": 1.023,
"step": 83900
},
{
"epoch": 4.2,
"grad_norm": 5.887296676635742,
"learning_rate": 3.222311557788945e-06,
"loss": 1.0233,
"step": 84000
},
{
"epoch": 4.21,
"grad_norm": 8.528314590454102,
"learning_rate": 3.2022110552763826e-06,
"loss": 1.0593,
"step": 84100
},
{
"epoch": 4.21,
"grad_norm": 5.414628982543945,
"learning_rate": 3.182110552763819e-06,
"loss": 1.0526,
"step": 84200
},
{
"epoch": 4.21,
"grad_norm": 8.320696830749512,
"learning_rate": 3.1620100502512564e-06,
"loss": 1.061,
"step": 84300
},
{
"epoch": 4.22,
"grad_norm": 7.6390767097473145,
"learning_rate": 3.1419095477386936e-06,
"loss": 1.0789,
"step": 84400
},
{
"epoch": 4.22,
"grad_norm": 6.141530990600586,
"learning_rate": 3.1218090452261307e-06,
"loss": 1.0438,
"step": 84500
},
{
"epoch": 4.23,
"grad_norm": 5.907855987548828,
"learning_rate": 3.101708542713568e-06,
"loss": 1.068,
"step": 84600
},
{
"epoch": 4.24,
"grad_norm": 8.13024616241455,
"learning_rate": 3.081608040201005e-06,
"loss": 1.0612,
"step": 84700
},
{
"epoch": 4.24,
"grad_norm": 4.439888954162598,
"learning_rate": 3.0615075376884425e-06,
"loss": 1.0589,
"step": 84800
},
{
"epoch": 4.25,
"grad_norm": 5.636837005615234,
"learning_rate": 3.0414070351758796e-06,
"loss": 1.0306,
"step": 84900
},
{
"epoch": 4.25,
"grad_norm": 4.6407389640808105,
"learning_rate": 3.0213065326633168e-06,
"loss": 1.0509,
"step": 85000
},
{
"epoch": 4.25,
"grad_norm": 5.515079975128174,
"learning_rate": 3.001206030150754e-06,
"loss": 1.0273,
"step": 85100
},
{
"epoch": 4.26,
"grad_norm": 5.22351598739624,
"learning_rate": 2.981105527638191e-06,
"loss": 1.0579,
"step": 85200
},
{
"epoch": 4.26,
"grad_norm": 6.4437479972839355,
"learning_rate": 2.961206030150754e-06,
"loss": 1.0435,
"step": 85300
},
{
"epoch": 4.27,
"grad_norm": 6.850719928741455,
"learning_rate": 2.941105527638191e-06,
"loss": 1.0487,
"step": 85400
},
{
"epoch": 4.28,
"grad_norm": 4.1141276359558105,
"learning_rate": 2.9210050251256285e-06,
"loss": 1.0124,
"step": 85500
},
{
"epoch": 4.28,
"grad_norm": 6.114626884460449,
"learning_rate": 2.9009045226130656e-06,
"loss": 1.0146,
"step": 85600
},
{
"epoch": 4.29,
"grad_norm": 5.943951606750488,
"learning_rate": 2.8808040201005028e-06,
"loss": 1.0484,
"step": 85700
},
{
"epoch": 4.29,
"grad_norm": 6.745482444763184,
"learning_rate": 2.86070351758794e-06,
"loss": 1.0353,
"step": 85800
},
{
"epoch": 4.29,
"grad_norm": 6.165544033050537,
"learning_rate": 2.840603015075377e-06,
"loss": 1.0572,
"step": 85900
},
{
"epoch": 4.3,
"grad_norm": 5.070137023925781,
"learning_rate": 2.8205025125628146e-06,
"loss": 1.0607,
"step": 86000
},
{
"epoch": 4.3,
"grad_norm": 5.177759647369385,
"learning_rate": 2.8004020100502517e-06,
"loss": 1.0589,
"step": 86100
},
{
"epoch": 4.31,
"grad_norm": 5.926203727722168,
"learning_rate": 2.780301507537689e-06,
"loss": 1.0728,
"step": 86200
},
{
"epoch": 4.32,
"grad_norm": 4.766726970672607,
"learning_rate": 2.760201005025126e-06,
"loss": 1.066,
"step": 86300
},
{
"epoch": 4.32,
"grad_norm": 7.0791401863098145,
"learning_rate": 2.740100502512563e-06,
"loss": 1.0381,
"step": 86400
},
{
"epoch": 4.33,
"grad_norm": 6.904074668884277,
"learning_rate": 2.7200000000000002e-06,
"loss": 1.0341,
"step": 86500
},
{
"epoch": 4.33,
"grad_norm": 7.680102348327637,
"learning_rate": 2.700100502512563e-06,
"loss": 1.0288,
"step": 86600
},
{
"epoch": 4.33,
"grad_norm": 7.589695930480957,
"learning_rate": 2.68e-06,
"loss": 1.0677,
"step": 86700
},
{
"epoch": 4.34,
"grad_norm": 4.686061382293701,
"learning_rate": 2.6598994974874377e-06,
"loss": 1.0213,
"step": 86800
},
{
"epoch": 4.34,
"grad_norm": 5.1108269691467285,
"learning_rate": 2.639798994974875e-06,
"loss": 1.0501,
"step": 86900
},
{
"epoch": 4.35,
"grad_norm": 6.551150798797607,
"learning_rate": 2.619698492462312e-06,
"loss": 1.0598,
"step": 87000
},
{
"epoch": 4.36,
"grad_norm": 6.834580898284912,
"learning_rate": 2.599597989949749e-06,
"loss": 1.0757,
"step": 87100
},
{
"epoch": 4.36,
"grad_norm": 4.995261192321777,
"learning_rate": 2.579497487437186e-06,
"loss": 1.0653,
"step": 87200
},
{
"epoch": 4.37,
"grad_norm": 5.614989757537842,
"learning_rate": 2.559396984924623e-06,
"loss": 1.0412,
"step": 87300
},
{
"epoch": 4.37,
"grad_norm": 12.592680931091309,
"learning_rate": 2.5392964824120605e-06,
"loss": 1.1065,
"step": 87400
},
{
"epoch": 4.38,
"grad_norm": 7.026024341583252,
"learning_rate": 2.5191959798994976e-06,
"loss": 1.064,
"step": 87500
},
{
"epoch": 4.38,
"grad_norm": 6.075479507446289,
"learning_rate": 2.499095477386935e-06,
"loss": 1.0559,
"step": 87600
},
{
"epoch": 4.38,
"grad_norm": 7.670622825622559,
"learning_rate": 2.4789949748743723e-06,
"loss": 1.0173,
"step": 87700
},
{
"epoch": 4.39,
"grad_norm": 6.775985240936279,
"learning_rate": 2.458894472361809e-06,
"loss": 1.0192,
"step": 87800
},
{
"epoch": 4.39,
"grad_norm": 6.7632222175598145,
"learning_rate": 2.438793969849246e-06,
"loss": 1.0403,
"step": 87900
},
{
"epoch": 4.4,
"grad_norm": 6.712850570678711,
"learning_rate": 2.4186934673366837e-06,
"loss": 1.0408,
"step": 88000
},
{
"epoch": 4.41,
"grad_norm": 4.842583656311035,
"learning_rate": 2.398592964824121e-06,
"loss": 1.0711,
"step": 88100
},
{
"epoch": 4.41,
"grad_norm": 6.134751796722412,
"learning_rate": 2.378492462311558e-06,
"loss": 1.0085,
"step": 88200
},
{
"epoch": 4.42,
"grad_norm": 5.06552791595459,
"learning_rate": 2.358391959798995e-06,
"loss": 1.0088,
"step": 88300
},
{
"epoch": 4.42,
"grad_norm": 6.872971057891846,
"learning_rate": 2.338291457286432e-06,
"loss": 1.0492,
"step": 88400
},
{
"epoch": 4.42,
"grad_norm": 5.2886528968811035,
"learning_rate": 2.3181909547738697e-06,
"loss": 1.0699,
"step": 88500
},
{
"epoch": 4.43,
"grad_norm": 7.221102237701416,
"learning_rate": 2.298090452261307e-06,
"loss": 1.0505,
"step": 88600
},
{
"epoch": 4.43,
"grad_norm": 7.388364791870117,
"learning_rate": 2.277989949748744e-06,
"loss": 1.0127,
"step": 88700
},
{
"epoch": 4.44,
"grad_norm": 11.533583641052246,
"learning_rate": 2.257889447236181e-06,
"loss": 1.055,
"step": 88800
},
{
"epoch": 4.45,
"grad_norm": 5.406151294708252,
"learning_rate": 2.2377889447236182e-06,
"loss": 1.0848,
"step": 88900
},
{
"epoch": 4.45,
"grad_norm": 8.093517303466797,
"learning_rate": 2.2176884422110554e-06,
"loss": 1.0058,
"step": 89000
},
{
"epoch": 4.46,
"grad_norm": 7.073362827301025,
"learning_rate": 2.1975879396984925e-06,
"loss": 1.0233,
"step": 89100
},
{
"epoch": 4.46,
"grad_norm": 6.263842582702637,
"learning_rate": 2.1776884422110558e-06,
"loss": 1.0259,
"step": 89200
},
{
"epoch": 4.46,
"grad_norm": 4.668169975280762,
"learning_rate": 2.1575879396984925e-06,
"loss": 1.0641,
"step": 89300
},
{
"epoch": 4.47,
"grad_norm": 5.550297737121582,
"learning_rate": 2.1374874371859296e-06,
"loss": 1.0768,
"step": 89400
},
{
"epoch": 4.47,
"grad_norm": 6.35700798034668,
"learning_rate": 2.1173869346733667e-06,
"loss": 1.0237,
"step": 89500
},
{
"epoch": 4.48,
"grad_norm": 8.346928596496582,
"learning_rate": 2.0972864321608043e-06,
"loss": 1.0265,
"step": 89600
},
{
"epoch": 4.49,
"grad_norm": 5.672070026397705,
"learning_rate": 2.0771859296482414e-06,
"loss": 1.0309,
"step": 89700
},
{
"epoch": 4.49,
"grad_norm": 7.057281494140625,
"learning_rate": 2.0570854271356785e-06,
"loss": 1.0131,
"step": 89800
},
{
"epoch": 4.5,
"grad_norm": 5.305558204650879,
"learning_rate": 2.0369849246231156e-06,
"loss": 1.075,
"step": 89900
},
{
"epoch": 4.5,
"grad_norm": 7.687346935272217,
"learning_rate": 2.0168844221105528e-06,
"loss": 1.0525,
"step": 90000
},
{
"epoch": 4.5,
"eval_loss": 1.240631341934204,
"eval_runtime": 21.7808,
"eval_samples_per_second": 45.912,
"eval_steps_per_second": 5.739,
"step": 90000
},
{
"epoch": 4.5,
"grad_norm": 4.474082946777344,
"learning_rate": 1.9967839195979903e-06,
"loss": 1.0245,
"step": 90100
},
{
"epoch": 4.51,
"grad_norm": 7.113109588623047,
"learning_rate": 1.9766834170854275e-06,
"loss": 1.0475,
"step": 90200
},
{
"epoch": 4.51,
"grad_norm": 5.945183753967285,
"learning_rate": 1.9565829145728646e-06,
"loss": 1.066,
"step": 90300
},
{
"epoch": 4.52,
"grad_norm": 5.280254364013672,
"learning_rate": 1.9364824120603017e-06,
"loss": 1.0428,
"step": 90400
},
{
"epoch": 4.53,
"grad_norm": 4.7843098640441895,
"learning_rate": 1.916381909547739e-06,
"loss": 1.0468,
"step": 90500
},
{
"epoch": 4.53,
"grad_norm": 6.668610572814941,
"learning_rate": 1.896281407035176e-06,
"loss": 1.0357,
"step": 90600
},
{
"epoch": 4.54,
"grad_norm": 6.481570720672607,
"learning_rate": 1.876180904522613e-06,
"loss": 1.0502,
"step": 90700
},
{
"epoch": 4.54,
"grad_norm": 6.947873115539551,
"learning_rate": 1.8560804020100504e-06,
"loss": 1.0129,
"step": 90800
},
{
"epoch": 4.54,
"grad_norm": 5.446567058563232,
"learning_rate": 1.8359798994974876e-06,
"loss": 1.0222,
"step": 90900
},
{
"epoch": 4.55,
"grad_norm": 6.875363349914551,
"learning_rate": 1.8158793969849247e-06,
"loss": 1.0444,
"step": 91000
},
{
"epoch": 4.55,
"grad_norm": 5.002197742462158,
"learning_rate": 1.795778894472362e-06,
"loss": 1.0339,
"step": 91100
},
{
"epoch": 4.56,
"grad_norm": 6.167435169219971,
"learning_rate": 1.7756783919597991e-06,
"loss": 0.9916,
"step": 91200
},
{
"epoch": 4.56,
"grad_norm": 4.765905380249023,
"learning_rate": 1.7555778894472365e-06,
"loss": 1.0356,
"step": 91300
},
{
"epoch": 4.57,
"grad_norm": 9.28632926940918,
"learning_rate": 1.7354773869346736e-06,
"loss": 1.052,
"step": 91400
},
{
"epoch": 4.58,
"grad_norm": 5.864830017089844,
"learning_rate": 1.7153768844221107e-06,
"loss": 1.014,
"step": 91500
},
{
"epoch": 4.58,
"grad_norm": 10.017616271972656,
"learning_rate": 1.6952763819095477e-06,
"loss": 1.0347,
"step": 91600
},
{
"epoch": 4.58,
"grad_norm": 6.52495813369751,
"learning_rate": 1.675175879396985e-06,
"loss": 1.0094,
"step": 91700
},
{
"epoch": 4.59,
"grad_norm": 5.506436347961426,
"learning_rate": 1.6550753768844221e-06,
"loss": 1.0306,
"step": 91800
},
{
"epoch": 4.59,
"grad_norm": 6.378169059753418,
"learning_rate": 1.6349748743718595e-06,
"loss": 1.0481,
"step": 91900
},
{
"epoch": 4.6,
"grad_norm": 7.866368293762207,
"learning_rate": 1.6148743718592966e-06,
"loss": 1.0433,
"step": 92000
},
{
"epoch": 4.61,
"grad_norm": 3.3074705600738525,
"learning_rate": 1.5947738693467337e-06,
"loss": 1.0255,
"step": 92100
},
{
"epoch": 4.61,
"grad_norm": 5.663514137268066,
"learning_rate": 1.574673366834171e-06,
"loss": 1.0066,
"step": 92200
},
{
"epoch": 4.62,
"grad_norm": 5.797703266143799,
"learning_rate": 1.5545728643216082e-06,
"loss": 1.0223,
"step": 92300
},
{
"epoch": 4.62,
"grad_norm": 5.430294513702393,
"learning_rate": 1.5344723618090453e-06,
"loss": 1.0361,
"step": 92400
},
{
"epoch": 4.62,
"grad_norm": 7.778083324432373,
"learning_rate": 1.5143718592964826e-06,
"loss": 1.0128,
"step": 92500
},
{
"epoch": 4.63,
"grad_norm": 8.762657165527344,
"learning_rate": 1.4942713567839198e-06,
"loss": 0.9764,
"step": 92600
},
{
"epoch": 4.63,
"grad_norm": 6.608733654022217,
"learning_rate": 1.4741708542713571e-06,
"loss": 1.0355,
"step": 92700
},
{
"epoch": 4.64,
"grad_norm": 4.251249313354492,
"learning_rate": 1.454070351758794e-06,
"loss": 1.0383,
"step": 92800
},
{
"epoch": 4.64,
"grad_norm": 5.131290912628174,
"learning_rate": 1.4339698492462312e-06,
"loss": 1.0118,
"step": 92900
},
{
"epoch": 4.65,
"grad_norm": 5.982537746429443,
"learning_rate": 1.4138693467336683e-06,
"loss": 1.0031,
"step": 93000
},
{
"epoch": 4.66,
"grad_norm": 5.640321254730225,
"learning_rate": 1.3937688442211056e-06,
"loss": 0.9834,
"step": 93100
},
{
"epoch": 4.66,
"grad_norm": 4.641716003417969,
"learning_rate": 1.3738693467336682e-06,
"loss": 1.0313,
"step": 93200
},
{
"epoch": 4.67,
"grad_norm": 4.14018440246582,
"learning_rate": 1.3537688442211056e-06,
"loss": 1.0132,
"step": 93300
},
{
"epoch": 4.67,
"grad_norm": 5.720813274383545,
"learning_rate": 1.3336683417085427e-06,
"loss": 1.0398,
"step": 93400
},
{
"epoch": 4.67,
"grad_norm": 6.412359714508057,
"learning_rate": 1.31356783919598e-06,
"loss": 0.9922,
"step": 93500
},
{
"epoch": 4.68,
"grad_norm": 6.955712795257568,
"learning_rate": 1.2934673366834172e-06,
"loss": 1.0396,
"step": 93600
},
{
"epoch": 4.69,
"grad_norm": 9.27692699432373,
"learning_rate": 1.2733668341708543e-06,
"loss": 1.0883,
"step": 93700
},
{
"epoch": 4.69,
"grad_norm": 5.846386909484863,
"learning_rate": 1.2532663316582916e-06,
"loss": 0.9981,
"step": 93800
},
{
"epoch": 4.7,
"grad_norm": 7.700655937194824,
"learning_rate": 1.2331658291457288e-06,
"loss": 1.0766,
"step": 93900
},
{
"epoch": 4.7,
"grad_norm": 6.861111640930176,
"learning_rate": 1.213065326633166e-06,
"loss": 1.0677,
"step": 94000
},
{
"epoch": 4.71,
"grad_norm": 7.702520370483398,
"learning_rate": 1.192964824120603e-06,
"loss": 1.092,
"step": 94100
},
{
"epoch": 4.71,
"grad_norm": 7.238519668579102,
"learning_rate": 1.1728643216080404e-06,
"loss": 1.0195,
"step": 94200
},
{
"epoch": 4.71,
"grad_norm": 5.9995646476745605,
"learning_rate": 1.1527638190954775e-06,
"loss": 1.0125,
"step": 94300
},
{
"epoch": 4.72,
"grad_norm": 8.999128341674805,
"learning_rate": 1.1326633165829146e-06,
"loss": 1.0342,
"step": 94400
},
{
"epoch": 4.72,
"grad_norm": 8.37474536895752,
"learning_rate": 1.112562814070352e-06,
"loss": 1.0151,
"step": 94500
},
{
"epoch": 4.73,
"grad_norm": 7.018558979034424,
"learning_rate": 1.092462311557789e-06,
"loss": 0.9854,
"step": 94600
},
{
"epoch": 4.74,
"grad_norm": 5.188572883605957,
"learning_rate": 1.0723618090452262e-06,
"loss": 0.9934,
"step": 94700
},
{
"epoch": 4.74,
"grad_norm": 5.260889530181885,
"learning_rate": 1.0522613065326633e-06,
"loss": 0.9982,
"step": 94800
},
{
"epoch": 4.75,
"grad_norm": 8.045933723449707,
"learning_rate": 1.0321608040201007e-06,
"loss": 1.0473,
"step": 94900
},
{
"epoch": 4.75,
"grad_norm": 8.305715560913086,
"learning_rate": 1.0120603015075378e-06,
"loss": 0.9831,
"step": 95000
},
{
"epoch": 4.75,
"grad_norm": 7.344651222229004,
"learning_rate": 9.91959798994975e-07,
"loss": 1.0251,
"step": 95100
},
{
"epoch": 4.76,
"grad_norm": 8.983135223388672,
"learning_rate": 9.71859296482412e-07,
"loss": 1.0084,
"step": 95200
},
{
"epoch": 4.76,
"grad_norm": 8.123686790466309,
"learning_rate": 9.51959798994975e-07,
"loss": 1.0136,
"step": 95300
},
{
"epoch": 4.77,
"grad_norm": 6.7493462562561035,
"learning_rate": 9.318592964824122e-07,
"loss": 0.9987,
"step": 95400
},
{
"epoch": 4.78,
"grad_norm": 8.338164329528809,
"learning_rate": 9.117587939698493e-07,
"loss": 0.99,
"step": 95500
},
{
"epoch": 4.78,
"grad_norm": 4.540625095367432,
"learning_rate": 8.916582914572865e-07,
"loss": 1.0015,
"step": 95600
},
{
"epoch": 4.79,
"grad_norm": 4.909175872802734,
"learning_rate": 8.715577889447237e-07,
"loss": 1.0395,
"step": 95700
},
{
"epoch": 4.79,
"grad_norm": 7.736073970794678,
"learning_rate": 8.514572864321608e-07,
"loss": 1.0005,
"step": 95800
},
{
"epoch": 4.79,
"grad_norm": 5.298911094665527,
"learning_rate": 8.313567839195981e-07,
"loss": 1.0283,
"step": 95900
},
{
"epoch": 4.8,
"grad_norm": 8.024383544921875,
"learning_rate": 8.112562814070353e-07,
"loss": 1.0152,
"step": 96000
},
{
"epoch": 4.8,
"grad_norm": 6.19573974609375,
"learning_rate": 7.911557788944723e-07,
"loss": 1.0195,
"step": 96100
},
{
"epoch": 4.81,
"grad_norm": 7.0770182609558105,
"learning_rate": 7.710552763819096e-07,
"loss": 1.0285,
"step": 96200
},
{
"epoch": 4.81,
"grad_norm": 5.578373908996582,
"learning_rate": 7.509547738693468e-07,
"loss": 0.9974,
"step": 96300
},
{
"epoch": 4.82,
"grad_norm": 6.602869033813477,
"learning_rate": 7.30854271356784e-07,
"loss": 1.0019,
"step": 96400
},
{
"epoch": 4.83,
"grad_norm": 8.442864418029785,
"learning_rate": 7.107537688442212e-07,
"loss": 1.035,
"step": 96500
},
{
"epoch": 4.83,
"grad_norm": 7.632187366485596,
"learning_rate": 6.906532663316584e-07,
"loss": 1.0316,
"step": 96600
},
{
"epoch": 4.83,
"grad_norm": 9.547595024108887,
"learning_rate": 6.705527638190955e-07,
"loss": 0.9944,
"step": 96700
},
{
"epoch": 4.84,
"grad_norm": 7.9466938972473145,
"learning_rate": 6.504522613065326e-07,
"loss": 1.0426,
"step": 96800
},
{
"epoch": 4.84,
"grad_norm": 4.955949783325195,
"learning_rate": 6.303517587939699e-07,
"loss": 1.0797,
"step": 96900
},
{
"epoch": 4.85,
"grad_norm": 8.598955154418945,
"learning_rate": 6.102512562814071e-07,
"loss": 1.0233,
"step": 97000
},
{
"epoch": 4.86,
"grad_norm": 5.320690631866455,
"learning_rate": 5.901507537688442e-07,
"loss": 1.0028,
"step": 97100
},
{
"epoch": 4.86,
"grad_norm": 7.243544101715088,
"learning_rate": 5.700502512562815e-07,
"loss": 0.9832,
"step": 97200
},
{
"epoch": 4.87,
"grad_norm": 6.318332672119141,
"learning_rate": 5.501507537688443e-07,
"loss": 1.0295,
"step": 97300
},
{
"epoch": 4.87,
"grad_norm": 5.297775745391846,
"learning_rate": 5.300502512562814e-07,
"loss": 1.0161,
"step": 97400
},
{
"epoch": 4.88,
"grad_norm": 5.717208385467529,
"learning_rate": 5.099497487437187e-07,
"loss": 1.0508,
"step": 97500
},
{
"epoch": 4.88,
"grad_norm": 6.545378684997559,
"learning_rate": 4.898492462311558e-07,
"loss": 1.039,
"step": 97600
},
{
"epoch": 4.88,
"grad_norm": 7.0295867919921875,
"learning_rate": 4.69748743718593e-07,
"loss": 0.9963,
"step": 97700
},
{
"epoch": 4.89,
"grad_norm": 6.685591697692871,
"learning_rate": 4.4964824120603015e-07,
"loss": 0.9845,
"step": 97800
},
{
"epoch": 4.89,
"grad_norm": 8.319560050964355,
"learning_rate": 4.295477386934674e-07,
"loss": 0.9951,
"step": 97900
},
{
"epoch": 4.9,
"grad_norm": 5.060975551605225,
"learning_rate": 4.094472361809045e-07,
"loss": 0.9893,
"step": 98000
},
{
"epoch": 4.91,
"grad_norm": 7.493504524230957,
"learning_rate": 3.8934673366834175e-07,
"loss": 0.9722,
"step": 98100
},
{
"epoch": 4.91,
"grad_norm": 6.903368949890137,
"learning_rate": 3.6924623115577893e-07,
"loss": 1.0248,
"step": 98200
},
{
"epoch": 4.92,
"grad_norm": 7.836544036865234,
"learning_rate": 3.4914572864321606e-07,
"loss": 1.0075,
"step": 98300
},
{
"epoch": 4.92,
"grad_norm": 7.111424446105957,
"learning_rate": 3.290452261306533e-07,
"loss": 0.9776,
"step": 98400
},
{
"epoch": 4.92,
"grad_norm": 5.424601078033447,
"learning_rate": 3.0894472361809047e-07,
"loss": 1.0235,
"step": 98500
},
{
"epoch": 4.93,
"grad_norm": 5.190994739532471,
"learning_rate": 2.8884422110552765e-07,
"loss": 1.0375,
"step": 98600
},
{
"epoch": 4.94,
"grad_norm": 7.971541404724121,
"learning_rate": 2.6874371859296483e-07,
"loss": 1.0049,
"step": 98700
},
{
"epoch": 4.94,
"grad_norm": 8.369765281677246,
"learning_rate": 2.48643216080402e-07,
"loss": 1.0028,
"step": 98800
},
{
"epoch": 4.95,
"grad_norm": 5.76533842086792,
"learning_rate": 2.2854271356783922e-07,
"loss": 0.9663,
"step": 98900
},
{
"epoch": 4.95,
"grad_norm": 8.707147598266602,
"learning_rate": 2.084422110552764e-07,
"loss": 1.0048,
"step": 99000
},
{
"epoch": 4.96,
"grad_norm": 7.287750720977783,
"learning_rate": 1.8834170854271358e-07,
"loss": 0.9777,
"step": 99100
},
{
"epoch": 4.96,
"grad_norm": 6.303613185882568,
"learning_rate": 1.6824120603015078e-07,
"loss": 1.0266,
"step": 99200
},
{
"epoch": 4.96,
"grad_norm": 6.6418776512146,
"learning_rate": 1.4814070351758796e-07,
"loss": 1.0179,
"step": 99300
},
{
"epoch": 4.97,
"grad_norm": 7.093784332275391,
"learning_rate": 1.2824120603015077e-07,
"loss": 1.0051,
"step": 99400
},
{
"epoch": 4.97,
"grad_norm": 9.0274658203125,
"learning_rate": 1.0814070351758795e-07,
"loss": 1.0022,
"step": 99500
},
{
"epoch": 4.98,
"grad_norm": 9.27753734588623,
"learning_rate": 8.804020100502513e-08,
"loss": 1.0176,
"step": 99600
},
{
"epoch": 4.99,
"grad_norm": 7.9141364097595215,
"learning_rate": 6.793969849246231e-08,
"loss": 0.9695,
"step": 99700
},
{
"epoch": 4.99,
"grad_norm": 6.786277770996094,
"learning_rate": 4.7839195979899497e-08,
"loss": 1.0346,
"step": 99800
},
{
"epoch": 5.0,
"grad_norm": 6.178213119506836,
"learning_rate": 2.7738693467336683e-08,
"loss": 0.987,
"step": 99900
},
{
"epoch": 5.0,
"grad_norm": 5.494964599609375,
"learning_rate": 7.63819095477387e-09,
"loss": 0.9787,
"step": 100000
},
{
"epoch": 5.0,
"eval_loss": 1.162515640258789,
"eval_runtime": 21.775,
"eval_samples_per_second": 45.924,
"eval_steps_per_second": 5.741,
"step": 100000
}
],
"logging_steps": 100,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10000,
"total_flos": 1.1800273747968e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}