{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 40.0,
"eval_steps": 500,
"global_step": 160320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.124750499001996,
"grad_norm": 13.176804542541504,
"learning_rate": 1.9937624750499e-06,
"loss": 0.2137,
"step": 500
},
{
"epoch": 0.249500998003992,
"grad_norm": 52.68854904174805,
"learning_rate": 1.9875249500998005e-06,
"loss": 0.2463,
"step": 1000
},
{
"epoch": 0.37425149700598803,
"grad_norm": 9.197150230407715,
"learning_rate": 1.9812874251497004e-06,
"loss": 0.2316,
"step": 1500
},
{
"epoch": 0.499001996007984,
"grad_norm": 23.94010353088379,
"learning_rate": 1.9750499001996007e-06,
"loss": 0.2095,
"step": 2000
},
{
"epoch": 0.6237524950099801,
"grad_norm": 25.69223976135254,
"learning_rate": 1.968812375249501e-06,
"loss": 0.2102,
"step": 2500
},
{
"epoch": 0.7485029940119761,
"grad_norm": 14.870789527893066,
"learning_rate": 1.9625748502994013e-06,
"loss": 0.2335,
"step": 3000
},
{
"epoch": 0.873253493013972,
"grad_norm": 19.752464294433594,
"learning_rate": 1.9563373253493016e-06,
"loss": 0.2065,
"step": 3500
},
{
"epoch": 0.998003992015968,
"grad_norm": 7.356762409210205,
"learning_rate": 1.9500998003992014e-06,
"loss": 0.215,
"step": 4000
},
{
"epoch": 1.0,
"eval_loss": 0.4418031871318817,
"eval_runtime": 50.9423,
"eval_samples_per_second": 62.934,
"eval_steps_per_second": 15.743,
"step": 4008
},
{
"epoch": 1.122754491017964,
"grad_norm": 1.049210786819458,
"learning_rate": 1.9438622754491017e-06,
"loss": 0.1786,
"step": 4500
},
{
"epoch": 1.24750499001996,
"grad_norm": 33.95945358276367,
"learning_rate": 1.937624750499002e-06,
"loss": 0.2107,
"step": 5000
},
{
"epoch": 1.372255489021956,
"grad_norm": 3.9420273303985596,
"learning_rate": 1.931387225548902e-06,
"loss": 0.1877,
"step": 5500
},
{
"epoch": 1.4970059880239521,
"grad_norm": 15.459404945373535,
"learning_rate": 1.925149700598802e-06,
"loss": 0.1808,
"step": 6000
},
{
"epoch": 1.621756487025948,
"grad_norm": 0.35231631994247437,
"learning_rate": 1.9189121756487025e-06,
"loss": 0.1842,
"step": 6500
},
{
"epoch": 1.746506986027944,
"grad_norm": 22.17848014831543,
"learning_rate": 1.9126746506986028e-06,
"loss": 0.2165,
"step": 7000
},
{
"epoch": 1.8712574850299402,
"grad_norm": 31.21565055847168,
"learning_rate": 1.906437125748503e-06,
"loss": 0.1879,
"step": 7500
},
{
"epoch": 1.996007984031936,
"grad_norm": 1.7563763856887817,
"learning_rate": 1.9001996007984032e-06,
"loss": 0.1913,
"step": 8000
},
{
"epoch": 2.0,
"eval_loss": 0.4956786632537842,
"eval_runtime": 48.4339,
"eval_samples_per_second": 66.193,
"eval_steps_per_second": 16.559,
"step": 8016
},
{
"epoch": 2.1207584830339323,
"grad_norm": 0.0910625234246254,
"learning_rate": 1.8939620758483032e-06,
"loss": 0.1712,
"step": 8500
},
{
"epoch": 2.245508982035928,
"grad_norm": 30.4615421295166,
"learning_rate": 1.8877245508982035e-06,
"loss": 0.1579,
"step": 9000
},
{
"epoch": 2.370259481037924,
"grad_norm": 29.169662475585938,
"learning_rate": 1.8814870259481036e-06,
"loss": 0.1701,
"step": 9500
},
{
"epoch": 2.49500998003992,
"grad_norm": 0.9950535893440247,
"learning_rate": 1.875249500998004e-06,
"loss": 0.1717,
"step": 10000
},
{
"epoch": 2.6197604790419162,
"grad_norm": 0.30978772044181824,
"learning_rate": 1.8690119760479042e-06,
"loss": 0.1778,
"step": 10500
},
{
"epoch": 2.744510978043912,
"grad_norm": 1.4617693424224854,
"learning_rate": 1.8627744510978043e-06,
"loss": 0.1772,
"step": 11000
},
{
"epoch": 2.8692614770459084,
"grad_norm": 13.257425308227539,
"learning_rate": 1.8565369261477044e-06,
"loss": 0.1697,
"step": 11500
},
{
"epoch": 2.9940119760479043,
"grad_norm": 11.522214889526367,
"learning_rate": 1.8502994011976047e-06,
"loss": 0.1651,
"step": 12000
},
{
"epoch": 3.0,
"eval_loss": 0.4591982960700989,
"eval_runtime": 48.8032,
"eval_samples_per_second": 65.692,
"eval_steps_per_second": 16.433,
"step": 12024
},
{
"epoch": 3.1187624750499,
"grad_norm": 20.58974266052246,
"learning_rate": 1.844061876247505e-06,
"loss": 0.1607,
"step": 12500
},
{
"epoch": 3.243512974051896,
"grad_norm": 54.52241516113281,
"learning_rate": 1.8378243512974053e-06,
"loss": 0.1527,
"step": 13000
},
{
"epoch": 3.3682634730538923,
"grad_norm": 12.846843719482422,
"learning_rate": 1.8315868263473054e-06,
"loss": 0.1484,
"step": 13500
},
{
"epoch": 3.493013972055888,
"grad_norm": 0.6479830145835876,
"learning_rate": 1.8253493013972054e-06,
"loss": 0.1621,
"step": 14000
},
{
"epoch": 3.6177644710578845,
"grad_norm": 0.7256312370300293,
"learning_rate": 1.8191117764471057e-06,
"loss": 0.1428,
"step": 14500
},
{
"epoch": 3.7425149700598803,
"grad_norm": 12.274479866027832,
"learning_rate": 1.8128742514970058e-06,
"loss": 0.1433,
"step": 15000
},
{
"epoch": 3.867265469061876,
"grad_norm": 35.40715408325195,
"learning_rate": 1.8066367265469061e-06,
"loss": 0.161,
"step": 15500
},
{
"epoch": 3.992015968063872,
"grad_norm": 0.78450608253479,
"learning_rate": 1.8003992015968064e-06,
"loss": 0.171,
"step": 16000
},
{
"epoch": 4.0,
"eval_loss": 0.4495457410812378,
"eval_runtime": 49.2624,
"eval_samples_per_second": 65.08,
"eval_steps_per_second": 16.28,
"step": 16032
},
{
"epoch": 4.116766467065868,
"grad_norm": 0.06905636936426163,
"learning_rate": 1.7941616766467065e-06,
"loss": 0.1475,
"step": 16500
},
{
"epoch": 4.241516966067865,
"grad_norm": 34.77931213378906,
"learning_rate": 1.7879241516966066e-06,
"loss": 0.1365,
"step": 17000
},
{
"epoch": 4.3662674650698605,
"grad_norm": 0.5809102058410645,
"learning_rate": 1.7816866267465069e-06,
"loss": 0.1366,
"step": 17500
},
{
"epoch": 4.491017964071856,
"grad_norm": 66.70156860351562,
"learning_rate": 1.775449101796407e-06,
"loss": 0.1526,
"step": 18000
},
{
"epoch": 4.615768463073852,
"grad_norm": 29.423938751220703,
"learning_rate": 1.7692115768463075e-06,
"loss": 0.135,
"step": 18500
},
{
"epoch": 4.740518962075848,
"grad_norm": 0.48827868700027466,
"learning_rate": 1.7629740518962075e-06,
"loss": 0.1444,
"step": 19000
},
{
"epoch": 4.865269461077844,
"grad_norm": 8.966581344604492,
"learning_rate": 1.7567365269461076e-06,
"loss": 0.1295,
"step": 19500
},
{
"epoch": 4.99001996007984,
"grad_norm": 3.6332414150238037,
"learning_rate": 1.750499001996008e-06,
"loss": 0.1407,
"step": 20000
},
{
"epoch": 5.0,
"eval_loss": 0.5054113268852234,
"eval_runtime": 46.022,
"eval_samples_per_second": 69.662,
"eval_steps_per_second": 17.426,
"step": 20040
},
{
"epoch": 5.114770459081837,
"grad_norm": 26.99722671508789,
"learning_rate": 1.744261477045908e-06,
"loss": 0.1307,
"step": 20500
},
{
"epoch": 5.2395209580838324,
"grad_norm": 0.7371481657028198,
"learning_rate": 1.7380239520958083e-06,
"loss": 0.1153,
"step": 21000
},
{
"epoch": 5.364271457085828,
"grad_norm": 0.3232800364494324,
"learning_rate": 1.7317864271457086e-06,
"loss": 0.1154,
"step": 21500
},
{
"epoch": 5.489021956087824,
"grad_norm": 1.8309438228607178,
"learning_rate": 1.7255489021956087e-06,
"loss": 0.1331,
"step": 22000
},
{
"epoch": 5.61377245508982,
"grad_norm": 0.4226222038269043,
"learning_rate": 1.719311377245509e-06,
"loss": 0.1206,
"step": 22500
},
{
"epoch": 5.738522954091817,
"grad_norm": 1.4337540864944458,
"learning_rate": 1.713073852295409e-06,
"loss": 0.13,
"step": 23000
},
{
"epoch": 5.863273453093813,
"grad_norm": 47.5312614440918,
"learning_rate": 1.7068363273453091e-06,
"loss": 0.1285,
"step": 23500
},
{
"epoch": 5.9880239520958085,
"grad_norm": 1.092816710472107,
"learning_rate": 1.7005988023952097e-06,
"loss": 0.1412,
"step": 24000
},
{
"epoch": 6.0,
"eval_loss": 0.4939550459384918,
"eval_runtime": 45.0298,
"eval_samples_per_second": 71.197,
"eval_steps_per_second": 17.81,
"step": 24048
},
{
"epoch": 6.112774451097804,
"grad_norm": 0.03936842083930969,
"learning_rate": 1.6943612774451097e-06,
"loss": 0.1134,
"step": 24500
},
{
"epoch": 6.2375249500998,
"grad_norm": 3.047616481781006,
"learning_rate": 1.6881237524950098e-06,
"loss": 0.1066,
"step": 25000
},
{
"epoch": 6.362275449101796,
"grad_norm": 16.7564754486084,
"learning_rate": 1.6818862275449101e-06,
"loss": 0.1615,
"step": 25500
},
{
"epoch": 6.487025948103792,
"grad_norm": 21.36778450012207,
"learning_rate": 1.6756487025948102e-06,
"loss": 0.1645,
"step": 26000
},
{
"epoch": 6.611776447105789,
"grad_norm": 78.45208740234375,
"learning_rate": 1.6694111776447105e-06,
"loss": 0.1675,
"step": 26500
},
{
"epoch": 6.736526946107785,
"grad_norm": 7.212148666381836,
"learning_rate": 1.6631736526946108e-06,
"loss": 0.146,
"step": 27000
},
{
"epoch": 6.86127744510978,
"grad_norm": 9.503207206726074,
"learning_rate": 1.6569361277445109e-06,
"loss": 0.1606,
"step": 27500
},
{
"epoch": 6.986027944111776,
"grad_norm": 0.4464740753173828,
"learning_rate": 1.6506986027944112e-06,
"loss": 0.1429,
"step": 28000
},
{
"epoch": 7.0,
"eval_loss": 0.4717544615268707,
"eval_runtime": 47.386,
"eval_samples_per_second": 67.657,
"eval_steps_per_second": 16.925,
"step": 28056
},
{
"epoch": 7.110778443113772,
"grad_norm": 0.42686018347740173,
"learning_rate": 1.6444610778443113e-06,
"loss": 0.1207,
"step": 28500
},
{
"epoch": 7.235528942115769,
"grad_norm": 24.92848014831543,
"learning_rate": 1.6382235528942113e-06,
"loss": 0.1351,
"step": 29000
},
{
"epoch": 7.360279441117765,
"grad_norm": 7.397327423095703,
"learning_rate": 1.6319860279441118e-06,
"loss": 0.1543,
"step": 29500
},
{
"epoch": 7.485029940119761,
"grad_norm": 0.43539106845855713,
"learning_rate": 1.625748502994012e-06,
"loss": 0.1494,
"step": 30000
},
{
"epoch": 7.6097804391217565,
"grad_norm": 14.456055641174316,
"learning_rate": 1.619510978043912e-06,
"loss": 0.1419,
"step": 30500
},
{
"epoch": 7.734530938123752,
"grad_norm": 9.563997268676758,
"learning_rate": 1.6132734530938123e-06,
"loss": 0.1357,
"step": 31000
},
{
"epoch": 7.859281437125748,
"grad_norm": 1.7568217515945435,
"learning_rate": 1.6070359281437124e-06,
"loss": 0.1369,
"step": 31500
},
{
"epoch": 7.984031936127744,
"grad_norm": 2.780186653137207,
"learning_rate": 1.600798403193613e-06,
"loss": 0.1451,
"step": 32000
},
{
"epoch": 8.0,
"eval_loss": 0.45947006344795227,
"eval_runtime": 44.2941,
"eval_samples_per_second": 72.38,
"eval_steps_per_second": 18.106,
"step": 32064
},
{
"epoch": 8.10878243512974,
"grad_norm": 10.451217651367188,
"learning_rate": 1.594560878243513e-06,
"loss": 0.1136,
"step": 32500
},
{
"epoch": 8.233532934131736,
"grad_norm": 0.18200552463531494,
"learning_rate": 1.588323353293413e-06,
"loss": 0.1259,
"step": 33000
},
{
"epoch": 8.358283433133732,
"grad_norm": 1.9428528547286987,
"learning_rate": 1.5820858283433134e-06,
"loss": 0.1279,
"step": 33500
},
{
"epoch": 8.48303393213573,
"grad_norm": 1.7016535997390747,
"learning_rate": 1.5758483033932135e-06,
"loss": 0.1231,
"step": 34000
},
{
"epoch": 8.607784431137725,
"grad_norm": 0.7158037424087524,
"learning_rate": 1.5696107784431135e-06,
"loss": 0.1446,
"step": 34500
},
{
"epoch": 8.732534930139721,
"grad_norm": 0.4712078273296356,
"learning_rate": 1.563373253493014e-06,
"loss": 0.1344,
"step": 35000
},
{
"epoch": 8.857285429141717,
"grad_norm": 24.5105037689209,
"learning_rate": 1.5571357285429141e-06,
"loss": 0.1331,
"step": 35500
},
{
"epoch": 8.982035928143713,
"grad_norm": 27.750621795654297,
"learning_rate": 1.5508982035928142e-06,
"loss": 0.1296,
"step": 36000
},
{
"epoch": 9.0,
"eval_loss": 0.47351646423339844,
"eval_runtime": 41.3902,
"eval_samples_per_second": 77.458,
"eval_steps_per_second": 19.377,
"step": 36072
},
{
"epoch": 9.106786427145709,
"grad_norm": 28.095134735107422,
"learning_rate": 1.5446606786427145e-06,
"loss": 0.119,
"step": 36500
},
{
"epoch": 9.231536926147704,
"grad_norm": 0.07204411178827286,
"learning_rate": 1.5384231536926146e-06,
"loss": 0.1098,
"step": 37000
},
{
"epoch": 9.3562874251497,
"grad_norm": 0.2767297327518463,
"learning_rate": 1.532185628742515e-06,
"loss": 0.1224,
"step": 37500
},
{
"epoch": 9.481037924151696,
"grad_norm": 0.14060889184474945,
"learning_rate": 1.5259481037924152e-06,
"loss": 0.1247,
"step": 38000
},
{
"epoch": 9.605788423153692,
"grad_norm": 32.673011779785156,
"learning_rate": 1.5197105788423153e-06,
"loss": 0.122,
"step": 38500
},
{
"epoch": 9.730538922155688,
"grad_norm": 0.21247480809688568,
"learning_rate": 1.5134730538922156e-06,
"loss": 0.1233,
"step": 39000
},
{
"epoch": 9.855289421157684,
"grad_norm": 0.4861377775669098,
"learning_rate": 1.5072355289421156e-06,
"loss": 0.1286,
"step": 39500
},
{
"epoch": 9.980039920159681,
"grad_norm": 11.489697456359863,
"learning_rate": 1.5009980039920157e-06,
"loss": 0.1203,
"step": 40000
},
{
"epoch": 10.0,
"eval_loss": 0.44174808263778687,
"eval_runtime": 40.4714,
"eval_samples_per_second": 79.216,
"eval_steps_per_second": 19.816,
"step": 40080
},
{
"epoch": 10.104790419161677,
"grad_norm": 0.06284382939338684,
"learning_rate": 1.4947604790419162e-06,
"loss": 0.1176,
"step": 40500
},
{
"epoch": 10.229540918163673,
"grad_norm": 0.8282334804534912,
"learning_rate": 1.4885229540918163e-06,
"loss": 0.1133,
"step": 41000
},
{
"epoch": 10.354291417165669,
"grad_norm": 0.675163984298706,
"learning_rate": 1.4822854291417164e-06,
"loss": 0.0977,
"step": 41500
},
{
"epoch": 10.479041916167665,
"grad_norm": 6.970102310180664,
"learning_rate": 1.4760479041916167e-06,
"loss": 0.1113,
"step": 42000
},
{
"epoch": 10.60379241516966,
"grad_norm": 8.85517406463623,
"learning_rate": 1.4698103792415168e-06,
"loss": 0.1164,
"step": 42500
},
{
"epoch": 10.728542914171657,
"grad_norm": 0.9282238483428955,
"learning_rate": 1.4635728542914173e-06,
"loss": 0.1167,
"step": 43000
},
{
"epoch": 10.853293413173652,
"grad_norm": 9.984148979187012,
"learning_rate": 1.4573353293413174e-06,
"loss": 0.1261,
"step": 43500
},
{
"epoch": 10.978043912175648,
"grad_norm": 0.20773719251155853,
"learning_rate": 1.4510978043912175e-06,
"loss": 0.1132,
"step": 44000
},
{
"epoch": 11.0,
"eval_loss": 0.49900639057159424,
"eval_runtime": 43.241,
"eval_samples_per_second": 74.143,
"eval_steps_per_second": 18.547,
"step": 44088
},
{
"epoch": 11.102794411177644,
"grad_norm": 12.603593826293945,
"learning_rate": 1.4448602794411178e-06,
"loss": 0.1061,
"step": 44500
},
{
"epoch": 11.22754491017964,
"grad_norm": 51.32432174682617,
"learning_rate": 1.4386227544910178e-06,
"loss": 0.1079,
"step": 45000
},
{
"epoch": 11.352295409181636,
"grad_norm": 10.22624397277832,
"learning_rate": 1.432385229540918e-06,
"loss": 0.1166,
"step": 45500
},
{
"epoch": 11.477045908183634,
"grad_norm": 11.041003227233887,
"learning_rate": 1.4261477045908184e-06,
"loss": 0.105,
"step": 46000
},
{
"epoch": 11.60179640718563,
"grad_norm": 35.79409408569336,
"learning_rate": 1.4199101796407185e-06,
"loss": 0.1124,
"step": 46500
},
{
"epoch": 11.726546906187625,
"grad_norm": 0.18676696717739105,
"learning_rate": 1.4136726546906188e-06,
"loss": 0.0928,
"step": 47000
},
{
"epoch": 11.851297405189621,
"grad_norm": 1.4925884008407593,
"learning_rate": 1.4074351297405189e-06,
"loss": 0.1098,
"step": 47500
},
{
"epoch": 11.976047904191617,
"grad_norm": 0.32953181862831116,
"learning_rate": 1.401197604790419e-06,
"loss": 0.1117,
"step": 48000
},
{
"epoch": 12.0,
"eval_loss": 0.4872562289237976,
"eval_runtime": 41.6872,
"eval_samples_per_second": 76.906,
"eval_steps_per_second": 19.239,
"step": 48096
},
{
"epoch": 12.100798403193613,
"grad_norm": 0.027937307953834534,
"learning_rate": 1.3949600798403195e-06,
"loss": 0.0992,
"step": 48500
},
{
"epoch": 12.225548902195609,
"grad_norm": 0.29068148136138916,
"learning_rate": 1.3887225548902196e-06,
"loss": 0.0921,
"step": 49000
},
{
"epoch": 12.350299401197605,
"grad_norm": 0.127395898103714,
"learning_rate": 1.3824850299401197e-06,
"loss": 0.0933,
"step": 49500
},
{
"epoch": 12.4750499001996,
"grad_norm": 0.09435238689184189,
"learning_rate": 1.37624750499002e-06,
"loss": 0.116,
"step": 50000
},
{
"epoch": 12.599800399201596,
"grad_norm": 39.19729232788086,
"learning_rate": 1.37000998003992e-06,
"loss": 0.1052,
"step": 50500
},
{
"epoch": 12.724550898203592,
"grad_norm": 0.28930047154426575,
"learning_rate": 1.3637724550898201e-06,
"loss": 0.1038,
"step": 51000
},
{
"epoch": 12.849301397205588,
"grad_norm": 0.15510033071041107,
"learning_rate": 1.3575349301397206e-06,
"loss": 0.0983,
"step": 51500
},
{
"epoch": 12.974051896207584,
"grad_norm": 81.58076477050781,
"learning_rate": 1.3512974051896207e-06,
"loss": 0.1117,
"step": 52000
},
{
"epoch": 13.0,
"eval_loss": 0.45387548208236694,
"eval_runtime": 43.4012,
"eval_samples_per_second": 73.869,
"eval_steps_per_second": 18.479,
"step": 52104
},
{
"epoch": 13.098802395209582,
"grad_norm": 4.060844421386719,
"learning_rate": 1.345059880239521e-06,
"loss": 0.0983,
"step": 52500
},
{
"epoch": 13.223552894211577,
"grad_norm": 33.315853118896484,
"learning_rate": 1.338822355289421e-06,
"loss": 0.0941,
"step": 53000
},
{
"epoch": 13.348303393213573,
"grad_norm": 0.1183587834239006,
"learning_rate": 1.3325848303393212e-06,
"loss": 0.0973,
"step": 53500
},
{
"epoch": 13.47305389221557,
"grad_norm": 40.30908966064453,
"learning_rate": 1.3263473053892215e-06,
"loss": 0.0871,
"step": 54000
},
{
"epoch": 13.597804391217565,
"grad_norm": 0.619777262210846,
"learning_rate": 1.3201097804391218e-06,
"loss": 0.1001,
"step": 54500
},
{
"epoch": 13.72255489021956,
"grad_norm": 0.2705942392349243,
"learning_rate": 1.3138722554890218e-06,
"loss": 0.0983,
"step": 55000
},
{
"epoch": 13.847305389221557,
"grad_norm": 6.151524066925049,
"learning_rate": 1.3076347305389221e-06,
"loss": 0.0793,
"step": 55500
},
{
"epoch": 13.972055888223553,
"grad_norm": 2.340573787689209,
"learning_rate": 1.3013972055888222e-06,
"loss": 0.099,
"step": 56000
},
{
"epoch": 14.0,
"eval_loss": 0.47363531589508057,
"eval_runtime": 42.3279,
"eval_samples_per_second": 75.742,
"eval_steps_per_second": 18.947,
"step": 56112
},
{
"epoch": 14.096806387225548,
"grad_norm": 2.052589178085327,
"learning_rate": 1.2951596806387225e-06,
"loss": 0.0875,
"step": 56500
},
{
"epoch": 14.221556886227544,
"grad_norm": 1.2925941944122314,
"learning_rate": 1.2889221556886228e-06,
"loss": 0.0812,
"step": 57000
},
{
"epoch": 14.34630738522954,
"grad_norm": 0.062304213643074036,
"learning_rate": 1.282684630738523e-06,
"loss": 0.1017,
"step": 57500
},
{
"epoch": 14.471057884231538,
"grad_norm": 0.1741693764925003,
"learning_rate": 1.2764471057884232e-06,
"loss": 0.0836,
"step": 58000
},
{
"epoch": 14.595808383233534,
"grad_norm": 0.6444254517555237,
"learning_rate": 1.2702095808383233e-06,
"loss": 0.0804,
"step": 58500
},
{
"epoch": 14.72055888223553,
"grad_norm": 2.0034759044647217,
"learning_rate": 1.2639720558882234e-06,
"loss": 0.0953,
"step": 59000
},
{
"epoch": 14.845309381237525,
"grad_norm": 52.82548522949219,
"learning_rate": 1.2577345309381237e-06,
"loss": 0.0996,
"step": 59500
},
{
"epoch": 14.970059880239521,
"grad_norm": 6.955111503601074,
"learning_rate": 1.251497005988024e-06,
"loss": 0.0857,
"step": 60000
},
{
"epoch": 15.0,
"eval_loss": 0.45942702889442444,
"eval_runtime": 43.409,
"eval_samples_per_second": 73.856,
"eval_steps_per_second": 18.475,
"step": 60120
},
{
"epoch": 15.094810379241517,
"grad_norm": 3.2324092388153076,
"learning_rate": 1.245259481037924e-06,
"loss": 0.0849,
"step": 60500
},
{
"epoch": 15.219560878243513,
"grad_norm": 61.83153533935547,
"learning_rate": 1.2390219560878243e-06,
"loss": 0.0798,
"step": 61000
},
{
"epoch": 15.344311377245509,
"grad_norm": 0.015876924619078636,
"learning_rate": 1.2327844311377244e-06,
"loss": 0.0785,
"step": 61500
},
{
"epoch": 15.469061876247505,
"grad_norm": 3.0025134086608887,
"learning_rate": 1.2265469061876247e-06,
"loss": 0.0881,
"step": 62000
},
{
"epoch": 15.5938123752495,
"grad_norm": 12.912367820739746,
"learning_rate": 1.220309381237525e-06,
"loss": 0.0802,
"step": 62500
},
{
"epoch": 15.718562874251496,
"grad_norm": 0.3600245714187622,
"learning_rate": 1.214071856287425e-06,
"loss": 0.0849,
"step": 63000
},
{
"epoch": 15.843313373253492,
"grad_norm": 0.21024100482463837,
"learning_rate": 1.2078343313373254e-06,
"loss": 0.078,
"step": 63500
},
{
"epoch": 15.968063872255488,
"grad_norm": 9.392132759094238,
"learning_rate": 1.2015968063872255e-06,
"loss": 0.0865,
"step": 64000
},
{
"epoch": 16.0,
"eval_loss": 0.48642057180404663,
"eval_runtime": 46.7976,
"eval_samples_per_second": 68.508,
"eval_steps_per_second": 17.138,
"step": 64128
},
{
"epoch": 16.092814371257486,
"grad_norm": 0.5227041244506836,
"learning_rate": 1.1953592814371256e-06,
"loss": 0.0722,
"step": 64500
},
{
"epoch": 16.21756487025948,
"grad_norm": 25.282564163208008,
"learning_rate": 1.1891217564870259e-06,
"loss": 0.0981,
"step": 65000
},
{
"epoch": 16.342315369261478,
"grad_norm": 0.6670591235160828,
"learning_rate": 1.1828842315369261e-06,
"loss": 0.0787,
"step": 65500
},
{
"epoch": 16.46706586826347,
"grad_norm": 22.668352127075195,
"learning_rate": 1.1766467065868262e-06,
"loss": 0.0764,
"step": 66000
},
{
"epoch": 16.59181636726547,
"grad_norm": 0.22597374022006989,
"learning_rate": 1.1704091816367265e-06,
"loss": 0.078,
"step": 66500
},
{
"epoch": 16.716566866267463,
"grad_norm": 21.123409271240234,
"learning_rate": 1.1641716566866266e-06,
"loss": 0.0766,
"step": 67000
},
{
"epoch": 16.84131736526946,
"grad_norm": 0.04259370267391205,
"learning_rate": 1.157934131736527e-06,
"loss": 0.0765,
"step": 67500
},
{
"epoch": 16.96606786427146,
"grad_norm": 0.021560240536928177,
"learning_rate": 1.1516966067864272e-06,
"loss": 0.0785,
"step": 68000
},
{
"epoch": 17.0,
"eval_loss": 0.4793809652328491,
"eval_runtime": 45.0906,
"eval_samples_per_second": 71.101,
"eval_steps_per_second": 17.786,
"step": 68136
},
{
"epoch": 17.090818363273453,
"grad_norm": 9.094868659973145,
"learning_rate": 1.1454590818363273e-06,
"loss": 0.0647,
"step": 68500
},
{
"epoch": 17.21556886227545,
"grad_norm": 0.195833221077919,
"learning_rate": 1.1392215568862276e-06,
"loss": 0.0698,
"step": 69000
},
{
"epoch": 17.340319361277444,
"grad_norm": 0.18507197499275208,
"learning_rate": 1.1329840319361277e-06,
"loss": 0.0712,
"step": 69500
},
{
"epoch": 17.465069860279442,
"grad_norm": 0.9911601543426514,
"learning_rate": 1.1267465069860278e-06,
"loss": 0.0752,
"step": 70000
},
{
"epoch": 17.589820359281436,
"grad_norm": 1.9703953266143799,
"learning_rate": 1.120508982035928e-06,
"loss": 0.0675,
"step": 70500
},
{
"epoch": 17.714570858283434,
"grad_norm": 41.10940933227539,
"learning_rate": 1.1142714570858283e-06,
"loss": 0.0705,
"step": 71000
},
{
"epoch": 17.839321357285428,
"grad_norm": 15.87336254119873,
"learning_rate": 1.1080339321357286e-06,
"loss": 0.0763,
"step": 71500
},
{
"epoch": 17.964071856287426,
"grad_norm": 0.060888275504112244,
"learning_rate": 1.1017964071856287e-06,
"loss": 0.0784,
"step": 72000
},
{
"epoch": 18.0,
"eval_loss": 0.4715409278869629,
"eval_runtime": 44.1035,
"eval_samples_per_second": 72.693,
"eval_steps_per_second": 18.184,
"step": 72144
},
{
"epoch": 18.08882235528942,
"grad_norm": 2.47182035446167,
"learning_rate": 1.0955588822355288e-06,
"loss": 0.0747,
"step": 72500
},
{
"epoch": 18.213572854291417,
"grad_norm": 40.5880126953125,
"learning_rate": 1.089321357285429e-06,
"loss": 0.0678,
"step": 73000
},
{
"epoch": 18.338323353293415,
"grad_norm": 0.4340246915817261,
"learning_rate": 1.0830838323353294e-06,
"loss": 0.0713,
"step": 73500
},
{
"epoch": 18.46307385229541,
"grad_norm": 4.4763312339782715,
"learning_rate": 1.0768463073852295e-06,
"loss": 0.065,
"step": 74000
},
{
"epoch": 18.587824351297407,
"grad_norm": 0.1397508829832077,
"learning_rate": 1.0706087824351298e-06,
"loss": 0.0727,
"step": 74500
},
{
"epoch": 18.7125748502994,
"grad_norm": 7.134496212005615,
"learning_rate": 1.0643712574850299e-06,
"loss": 0.0605,
"step": 75000
},
{
"epoch": 18.8373253493014,
"grad_norm": 0.05227530747652054,
"learning_rate": 1.05813373253493e-06,
"loss": 0.0764,
"step": 75500
},
{
"epoch": 18.962075848303392,
"grad_norm": 17.22441864013672,
"learning_rate": 1.0518962075848302e-06,
"loss": 0.0696,
"step": 76000
},
{
"epoch": 19.0,
"eval_loss": 0.4802711308002472,
"eval_runtime": 45.7109,
"eval_samples_per_second": 70.136,
"eval_steps_per_second": 17.545,
"step": 76152
},
{
"epoch": 19.08682634730539,
"grad_norm": 0.02889215387403965,
"learning_rate": 1.0456586826347305e-06,
"loss": 0.0625,
"step": 76500
},
{
"epoch": 19.211576846307384,
"grad_norm": 128.0497283935547,
"learning_rate": 1.0394211576846308e-06,
"loss": 0.0548,
"step": 77000
},
{
"epoch": 19.336327345309382,
"grad_norm": 0.22108981013298035,
"learning_rate": 1.033183632734531e-06,
"loss": 0.0695,
"step": 77500
},
{
"epoch": 19.461077844311376,
"grad_norm": 55.13557815551758,
"learning_rate": 1.026946107784431e-06,
"loss": 0.0679,
"step": 78000
},
{
"epoch": 19.585828343313374,
"grad_norm": 3.5990562438964844,
"learning_rate": 1.0207085828343313e-06,
"loss": 0.0697,
"step": 78500
},
{
"epoch": 19.710578842315368,
"grad_norm": 3.9640650749206543,
"learning_rate": 1.0144710578842316e-06,
"loss": 0.0699,
"step": 79000
},
{
"epoch": 19.835329341317365,
"grad_norm": 0.3529013395309448,
"learning_rate": 1.0082335329341317e-06,
"loss": 0.0676,
"step": 79500
},
{
"epoch": 19.960079840319363,
"grad_norm": 1.3875175714492798,
"learning_rate": 1.001996007984032e-06,
"loss": 0.0683,
"step": 80000
},
{
"epoch": 20.0,
"eval_loss": 0.5128437280654907,
"eval_runtime": 46.0282,
"eval_samples_per_second": 69.653,
"eval_steps_per_second": 17.424,
"step": 80160
},
{
"epoch": 20.084830339321357,
"grad_norm": 6.171479225158691,
"learning_rate": 9.95758483033932e-07,
"loss": 0.0698,
"step": 80500
},
{
"epoch": 20.209580838323355,
"grad_norm": 0.012239497154951096,
"learning_rate": 9.895209580838323e-07,
"loss": 0.0532,
"step": 81000
},
{
"epoch": 20.33433133732535,
"grad_norm": 7.920960426330566,
"learning_rate": 9.832834331337324e-07,
"loss": 0.0609,
"step": 81500
},
{
"epoch": 20.459081836327346,
"grad_norm": 59.41933822631836,
"learning_rate": 9.770459081836327e-07,
"loss": 0.0653,
"step": 82000
},
{
"epoch": 20.58383233532934,
"grad_norm": 0.10031065344810486,
"learning_rate": 9.708083832335328e-07,
"loss": 0.0497,
"step": 82500
},
{
"epoch": 20.708582834331338,
"grad_norm": 5.42900276184082,
"learning_rate": 9.645708582834331e-07,
"loss": 0.061,
"step": 83000
},
{
"epoch": 20.833333333333332,
"grad_norm": 20.380285263061523,
"learning_rate": 9.583333333333334e-07,
"loss": 0.0717,
"step": 83500
},
{
"epoch": 20.95808383233533,
"grad_norm": 0.10651753097772598,
"learning_rate": 9.520958083832335e-07,
"loss": 0.0638,
"step": 84000
},
{
"epoch": 21.0,
"eval_loss": 0.4833807945251465,
"eval_runtime": 46.5592,
"eval_samples_per_second": 68.859,
"eval_steps_per_second": 17.225,
"step": 84168
},
{
"epoch": 21.082834331337324,
"grad_norm": 0.3842374086380005,
"learning_rate": 9.458582834331337e-07,
"loss": 0.0603,
"step": 84500
},
{
"epoch": 21.20758483033932,
"grad_norm": 51.563140869140625,
"learning_rate": 9.396207584830339e-07,
"loss": 0.06,
"step": 85000
},
{
"epoch": 21.33233532934132,
"grad_norm": 0.037806153297424316,
"learning_rate": 9.333832335329342e-07,
"loss": 0.0612,
"step": 85500
},
{
"epoch": 21.457085828343313,
"grad_norm": 0.11586946994066238,
"learning_rate": 9.271457085828342e-07,
"loss": 0.0664,
"step": 86000
},
{
"epoch": 21.58183632734531,
"grad_norm": 0.34262338280677795,
"learning_rate": 9.209081836327344e-07,
"loss": 0.0602,
"step": 86500
},
{
"epoch": 21.706586826347305,
"grad_norm": 0.11894870549440384,
"learning_rate": 9.146706586826347e-07,
"loss": 0.0522,
"step": 87000
},
{
"epoch": 21.831337325349303,
"grad_norm": 0.1180167868733406,
"learning_rate": 9.084331337325349e-07,
"loss": 0.0616,
"step": 87500
},
{
"epoch": 21.956087824351297,
"grad_norm": 0.09437087923288345,
"learning_rate": 9.02195608782435e-07,
"loss": 0.0607,
"step": 88000
},
{
"epoch": 22.0,
"eval_loss": 0.4958905279636383,
"eval_runtime": 44.4581,
"eval_samples_per_second": 72.113,
"eval_steps_per_second": 18.039,
"step": 88176
},
{
"epoch": 22.080838323353294,
"grad_norm": 0.5892271399497986,
"learning_rate": 8.959580838323353e-07,
"loss": 0.058,
"step": 88500
},
{
"epoch": 22.20558882235529,
"grad_norm": 1.0569002628326416,
"learning_rate": 8.897205588822355e-07,
"loss": 0.0559,
"step": 89000
},
{
"epoch": 22.330339321357286,
"grad_norm": 50.68812561035156,
"learning_rate": 8.834830339321357e-07,
"loss": 0.05,
"step": 89500
},
{
"epoch": 22.45508982035928,
"grad_norm": 0.08090469241142273,
"learning_rate": 8.772455089820359e-07,
"loss": 0.0595,
"step": 90000
},
{
"epoch": 22.579840319361278,
"grad_norm": 14.62991714477539,
"learning_rate": 8.710079840319361e-07,
"loss": 0.059,
"step": 90500
},
{
"epoch": 22.704590818363272,
"grad_norm": 0.2893312871456146,
"learning_rate": 8.647704590818364e-07,
"loss": 0.0518,
"step": 91000
},
{
"epoch": 22.82934131736527,
"grad_norm": 22.239938735961914,
"learning_rate": 8.585329341317364e-07,
"loss": 0.0493,
"step": 91500
},
{
"epoch": 22.954091816367267,
"grad_norm": 0.09933929890394211,
"learning_rate": 8.522954091816366e-07,
"loss": 0.0536,
"step": 92000
},
{
"epoch": 23.0,
"eval_loss": 0.48672357201576233,
"eval_runtime": 44.6856,
"eval_samples_per_second": 71.746,
"eval_steps_per_second": 17.948,
"step": 92184
},
{
"epoch": 23.07884231536926,
"grad_norm": 0.821902871131897,
"learning_rate": 8.460578842315369e-07,
"loss": 0.0553,
"step": 92500
},
{
"epoch": 23.20359281437126,
"grad_norm": 0.2537296414375305,
"learning_rate": 8.398203592814371e-07,
"loss": 0.046,
"step": 93000
},
{
"epoch": 23.328343313373253,
"grad_norm": 0.198989599943161,
"learning_rate": 8.335828343313372e-07,
"loss": 0.0496,
"step": 93500
},
{
"epoch": 23.45309381237525,
"grad_norm": 14.523540496826172,
"learning_rate": 8.273453093812375e-07,
"loss": 0.0465,
"step": 94000
},
{
"epoch": 23.577844311377245,
"grad_norm": 0.3473449945449829,
"learning_rate": 8.211077844311377e-07,
"loss": 0.048,
"step": 94500
},
{
"epoch": 23.702594810379242,
"grad_norm": 4.4253129959106445,
"learning_rate": 8.14870259481038e-07,
"loss": 0.0489,
"step": 95000
},
{
"epoch": 23.827345309381236,
"grad_norm": 159.51025390625,
"learning_rate": 8.086327345309381e-07,
"loss": 0.0552,
"step": 95500
},
{
"epoch": 23.952095808383234,
"grad_norm": 0.31450316309928894,
"learning_rate": 8.023952095808383e-07,
"loss": 0.0537,
"step": 96000
},
{
"epoch": 24.0,
"eval_loss": 0.5026536583900452,
"eval_runtime": 46.0362,
"eval_samples_per_second": 69.641,
"eval_steps_per_second": 17.421,
"step": 96192
},
{
"epoch": 24.076846307385228,
"grad_norm": 1.8670942783355713,
"learning_rate": 7.961576846307386e-07,
"loss": 0.0556,
"step": 96500
},
{
"epoch": 24.201596806387226,
"grad_norm": 0.4119631052017212,
"learning_rate": 7.899201596806386e-07,
"loss": 0.0427,
"step": 97000
},
{
"epoch": 24.32634730538922,
"grad_norm": 4.47167444229126,
"learning_rate": 7.836826347305388e-07,
"loss": 0.0579,
"step": 97500
},
{
"epoch": 24.451097804391217,
"grad_norm": 0.940743625164032,
"learning_rate": 7.774451097804391e-07,
"loss": 0.0462,
"step": 98000
},
{
"epoch": 24.575848303393215,
"grad_norm": 4.091241359710693,
"learning_rate": 7.712075848303393e-07,
"loss": 0.0524,
"step": 98500
},
{
"epoch": 24.70059880239521,
"grad_norm": 11.099757194519043,
"learning_rate": 7.649700598802394e-07,
"loss": 0.0549,
"step": 99000
},
{
"epoch": 24.825349301397207,
"grad_norm": 2.001067876815796,
"learning_rate": 7.587325349301397e-07,
"loss": 0.0485,
"step": 99500
},
{
"epoch": 24.9500998003992,
"grad_norm": 0.15496690571308136,
"learning_rate": 7.524950099800399e-07,
"loss": 0.0537,
"step": 100000
},
{
"epoch": 25.0,
"eval_loss": 0.48970088362693787,
"eval_runtime": 48.0502,
"eval_samples_per_second": 66.722,
"eval_steps_per_second": 16.691,
"step": 100200
},
{
"epoch": 25.0748502994012,
"grad_norm": 5.718461513519287,
"learning_rate": 7.462574850299402e-07,
"loss": 0.0471,
"step": 100500
},
{
"epoch": 25.199600798403193,
"grad_norm": 53.097293853759766,
"learning_rate": 7.400199600798403e-07,
"loss": 0.0467,
"step": 101000
},
{
"epoch": 25.32435129740519,
"grad_norm": 70.51046752929688,
"learning_rate": 7.337824351297404e-07,
"loss": 0.0464,
"step": 101500
},
{
"epoch": 25.449101796407184,
"grad_norm": 6.485039234161377,
"learning_rate": 7.275449101796407e-07,
"loss": 0.0501,
"step": 102000
},
{
"epoch": 25.573852295409182,
"grad_norm": 0.2076825648546219,
"learning_rate": 7.213073852295409e-07,
"loss": 0.05,
"step": 102500
},
{
"epoch": 25.698602794411176,
"grad_norm": 40.60255432128906,
"learning_rate": 7.15069860279441e-07,
"loss": 0.0374,
"step": 103000
},
{
"epoch": 25.823353293413174,
"grad_norm": 1.1958940029144287,
"learning_rate": 7.088323353293413e-07,
"loss": 0.0533,
"step": 103500
},
{
"epoch": 25.948103792415168,
"grad_norm": 11.201072692871094,
"learning_rate": 7.025948103792415e-07,
"loss": 0.0388,
"step": 104000
},
{
"epoch": 26.0,
"eval_loss": 0.48730549216270447,
"eval_runtime": 48.7336,
"eval_samples_per_second": 65.786,
"eval_steps_per_second": 16.457,
"step": 104208
},
{
"epoch": 26.072854291417165,
"grad_norm": 0.08899884670972824,
"learning_rate": 6.963572854291417e-07,
"loss": 0.0482,
"step": 104500
},
{
"epoch": 26.197604790419163,
"grad_norm": 0.08736108243465424,
"learning_rate": 6.901197604790419e-07,
"loss": 0.042,
"step": 105000
},
{
"epoch": 26.322355289421157,
"grad_norm": 0.050059039145708084,
"learning_rate": 6.838822355289421e-07,
"loss": 0.0443,
"step": 105500
},
{
"epoch": 26.447105788423155,
"grad_norm": 0.3098917603492737,
"learning_rate": 6.776447105788423e-07,
"loss": 0.0431,
"step": 106000
},
{
"epoch": 26.57185628742515,
"grad_norm": 0.601845920085907,
"learning_rate": 6.714071856287425e-07,
"loss": 0.0474,
"step": 106500
},
{
"epoch": 26.696606786427147,
"grad_norm": 43.90340805053711,
"learning_rate": 6.651696606786426e-07,
"loss": 0.0546,
"step": 107000
},
{
"epoch": 26.82135728542914,
"grad_norm": 0.1658441424369812,
"learning_rate": 6.589321357285429e-07,
"loss": 0.0463,
"step": 107500
},
{
"epoch": 26.94610778443114,
"grad_norm": 0.7097954154014587,
"learning_rate": 6.526946107784431e-07,
"loss": 0.0413,
"step": 108000
},
{
"epoch": 27.0,
"eval_loss": 0.49195966124534607,
"eval_runtime": 48.5815,
"eval_samples_per_second": 65.992,
"eval_steps_per_second": 16.508,
"step": 108216
},
{
"epoch": 27.070858283433132,
"grad_norm": 0.12945351004600525,
"learning_rate": 6.464570858283432e-07,
"loss": 0.0514,
"step": 108500
},
{
"epoch": 27.19560878243513,
"grad_norm": 0.09241262078285217,
"learning_rate": 6.402195608782435e-07,
"loss": 0.0454,
"step": 109000
},
{
"epoch": 27.320359281437124,
"grad_norm": 0.07145562022924423,
"learning_rate": 6.339820359281437e-07,
"loss": 0.0381,
"step": 109500
},
{
"epoch": 27.44510978043912,
"grad_norm": 0.003607134334743023,
"learning_rate": 6.277445109780439e-07,
"loss": 0.0476,
"step": 110000
},
{
"epoch": 27.56986027944112,
"grad_norm": 10.220846176147461,
"learning_rate": 6.215069860279441e-07,
"loss": 0.0441,
"step": 110500
},
{
"epoch": 27.694610778443113,
"grad_norm": 0.18386581540107727,
"learning_rate": 6.152694610778443e-07,
"loss": 0.0461,
"step": 111000
},
{
"epoch": 27.81936127744511,
"grad_norm": 0.26254481077194214,
"learning_rate": 6.090319361277445e-07,
"loss": 0.0367,
"step": 111500
},
{
"epoch": 27.944111776447105,
"grad_norm": 68.7042007446289,
"learning_rate": 6.027944111776448e-07,
"loss": 0.0471,
"step": 112000
},
{
"epoch": 28.0,
"eval_loss": 0.4870954751968384,
"eval_runtime": 45.0714,
"eval_samples_per_second": 71.132,
"eval_steps_per_second": 17.794,
"step": 112224
},
{
"epoch": 28.068862275449103,
"grad_norm": 0.0271464716643095,
"learning_rate": 5.965568862275448e-07,
"loss": 0.0433,
"step": 112500
},
{
"epoch": 28.193612774451097,
"grad_norm": 0.0086235161870718,
"learning_rate": 5.903193612774451e-07,
"loss": 0.0475,
"step": 113000
},
{
"epoch": 28.318363273453095,
"grad_norm": 0.11506126821041107,
"learning_rate": 5.840818363273453e-07,
"loss": 0.0353,
"step": 113500
},
{
"epoch": 28.44311377245509,
"grad_norm": 10.355070114135742,
"learning_rate": 5.778443113772454e-07,
"loss": 0.0416,
"step": 114000
},
{
"epoch": 28.567864271457086,
"grad_norm": 0.2200528234243393,
"learning_rate": 5.716067864271457e-07,
"loss": 0.0325,
"step": 114500
},
{
"epoch": 28.69261477045908,
"grad_norm": 0.05802537873387337,
"learning_rate": 5.653692614770459e-07,
"loss": 0.0468,
"step": 115000
},
{
"epoch": 28.817365269461078,
"grad_norm": 0.10829133540391922,
"learning_rate": 5.591317365269461e-07,
"loss": 0.042,
"step": 115500
},
{
"epoch": 28.942115768463076,
"grad_norm": 0.162460595369339,
"learning_rate": 5.528942115768463e-07,
"loss": 0.049,
"step": 116000
},
{
"epoch": 29.0,
"eval_loss": 0.4795687198638916,
"eval_runtime": 45.1647,
"eval_samples_per_second": 70.985,
"eval_steps_per_second": 17.757,
"step": 116232
},
{
"epoch": 29.06686626746507,
"grad_norm": 134.6587677001953,
"learning_rate": 5.466566866267465e-07,
"loss": 0.0416,
"step": 116500
},
{
"epoch": 29.191616766467067,
"grad_norm": 0.09312257915735245,
"learning_rate": 5.404191616766467e-07,
"loss": 0.0287,
"step": 117000
},
{
"epoch": 29.31636726546906,
"grad_norm": 0.3530866503715515,
"learning_rate": 5.341816367265469e-07,
"loss": 0.0384,
"step": 117500
},
{
"epoch": 29.44111776447106,
"grad_norm": 0.033993642777204514,
"learning_rate": 5.27944111776447e-07,
"loss": 0.043,
"step": 118000
},
{
"epoch": 29.565868263473053,
"grad_norm": 0.3124711513519287,
"learning_rate": 5.217065868263473e-07,
"loss": 0.04,
"step": 118500
},
{
"epoch": 29.69061876247505,
"grad_norm": 10.49288272857666,
"learning_rate": 5.154690618762475e-07,
"loss": 0.0463,
"step": 119000
},
{
"epoch": 29.815369261477045,
"grad_norm": 0.024224599823355675,
"learning_rate": 5.092315369261477e-07,
"loss": 0.0411,
"step": 119500
},
{
"epoch": 29.940119760479043,
"grad_norm": 3.9215731620788574,
"learning_rate": 5.029940119760479e-07,
"loss": 0.0408,
"step": 120000
},
{
"epoch": 30.0,
"eval_loss": 0.492553174495697,
"eval_runtime": 46.2042,
"eval_samples_per_second": 69.388,
"eval_steps_per_second": 17.358,
"step": 120240
},
{
"epoch": 30.064870259481037,
"grad_norm": 0.021667474880814552,
"learning_rate": 4.967564870259481e-07,
"loss": 0.0374,
"step": 120500
},
{
"epoch": 30.189620758483034,
"grad_norm": 0.5888983011245728,
"learning_rate": 4.905189620758483e-07,
"loss": 0.0463,
"step": 121000
},
{
"epoch": 30.31437125748503,
"grad_norm": 0.09637131541967392,
"learning_rate": 4.842814371257485e-07,
"loss": 0.033,
"step": 121500
},
{
"epoch": 30.439121756487026,
"grad_norm": 0.23179832100868225,
"learning_rate": 4.780439121756487e-07,
"loss": 0.0402,
"step": 122000
},
{
"epoch": 30.563872255489024,
"grad_norm": 0.14170564711093903,
"learning_rate": 4.718063872255489e-07,
"loss": 0.0395,
"step": 122500
},
{
"epoch": 30.688622754491018,
"grad_norm": 0.006093321368098259,
"learning_rate": 4.6556886227544903e-07,
"loss": 0.0356,
"step": 123000
},
{
"epoch": 30.813373253493015,
"grad_norm": 0.1018219068646431,
"learning_rate": 4.593313373253493e-07,
"loss": 0.0419,
"step": 123500
},
{
"epoch": 30.93812375249501,
"grad_norm": 2.9131383895874023,
"learning_rate": 4.5309381237524947e-07,
"loss": 0.0378,
"step": 124000
},
{
"epoch": 31.0,
"eval_loss": 0.5052226781845093,
"eval_runtime": 43.1611,
"eval_samples_per_second": 74.28,
"eval_steps_per_second": 18.582,
"step": 124248
},
{
"epoch": 31.062874251497007,
"grad_norm": 11.588695526123047,
"learning_rate": 4.468562874251497e-07,
"loss": 0.0346,
"step": 124500
},
{
"epoch": 31.187624750499,
"grad_norm": 0.2488149255514145,
"learning_rate": 4.4061876247504985e-07,
"loss": 0.0351,
"step": 125000
},
{
"epoch": 31.312375249501,
"grad_norm": 12.691544532775879,
"learning_rate": 4.343812375249501e-07,
"loss": 0.0323,
"step": 125500
},
{
"epoch": 31.437125748502993,
"grad_norm": 0.004168800078332424,
"learning_rate": 4.281437125748503e-07,
"loss": 0.033,
"step": 126000
},
{
"epoch": 31.56187624750499,
"grad_norm": 0.042690277099609375,
"learning_rate": 4.219061876247505e-07,
"loss": 0.039,
"step": 126500
},
{
"epoch": 31.686626746506985,
"grad_norm": 1.1096973419189453,
"learning_rate": 4.1566866267465066e-07,
"loss": 0.0349,
"step": 127000
},
{
"epoch": 31.811377245508982,
"grad_norm": 0.2642970085144043,
"learning_rate": 4.094311377245509e-07,
"loss": 0.0338,
"step": 127500
},
{
"epoch": 31.936127744510976,
"grad_norm": 0.21338249742984772,
"learning_rate": 4.031936127744511e-07,
"loss": 0.0349,
"step": 128000
},
{
"epoch": 32.0,
"eval_loss": 0.4927305281162262,
"eval_runtime": 43.7641,
"eval_samples_per_second": 73.256,
"eval_steps_per_second": 18.326,
"step": 128256
},
{
"epoch": 32.060878243512974,
"grad_norm": 0.1497274786233902,
"learning_rate": 3.969560878243513e-07,
"loss": 0.0403,
"step": 128500
},
{
"epoch": 32.18562874251497,
"grad_norm": 0.5848351120948792,
"learning_rate": 3.9071856287425147e-07,
"loss": 0.037,
"step": 129000
},
{
"epoch": 32.31037924151697,
"grad_norm": 0.11372077465057373,
"learning_rate": 3.8448103792415166e-07,
"loss": 0.0383,
"step": 129500
},
{
"epoch": 32.43512974051896,
"grad_norm": 0.1047956719994545,
"learning_rate": 3.782435129740519e-07,
"loss": 0.0315,
"step": 130000
},
{
"epoch": 32.55988023952096,
"grad_norm": 0.2975727617740631,
"learning_rate": 3.7200598802395204e-07,
"loss": 0.0264,
"step": 130500
},
{
"epoch": 32.684630738522955,
"grad_norm": 0.2123280167579651,
"learning_rate": 3.657684630738523e-07,
"loss": 0.0341,
"step": 131000
},
{
"epoch": 32.80938123752495,
"grad_norm": 27.63080596923828,
"learning_rate": 3.5953093812375247e-07,
"loss": 0.0368,
"step": 131500
},
{
"epoch": 32.93413173652694,
"grad_norm": 0.034935545176267624,
"learning_rate": 3.5329341317365266e-07,
"loss": 0.0394,
"step": 132000
},
{
"epoch": 33.0,
"eval_loss": 0.4937605559825897,
"eval_runtime": 39.9355,
"eval_samples_per_second": 80.279,
"eval_steps_per_second": 20.082,
"step": 132264
},
{
"epoch": 33.05888223552894,
"grad_norm": 0.003380158683285117,
"learning_rate": 3.4705588822355285e-07,
"loss": 0.0394,
"step": 132500
},
{
"epoch": 33.18363273453094,
"grad_norm": 2.721451997756958,
"learning_rate": 3.408183632734531e-07,
"loss": 0.0365,
"step": 133000
},
{
"epoch": 33.308383233532936,
"grad_norm": 0.4309988021850586,
"learning_rate": 3.345808383233533e-07,
"loss": 0.0302,
"step": 133500
},
{
"epoch": 33.43313373253493,
"grad_norm": 0.24694228172302246,
"learning_rate": 3.283433133732535e-07,
"loss": 0.037,
"step": 134000
},
{
"epoch": 33.557884231536924,
"grad_norm": 0.34988418221473694,
"learning_rate": 3.2210578842315366e-07,
"loss": 0.0258,
"step": 134500
},
{
"epoch": 33.68263473053892,
"grad_norm": 0.19452495872974396,
"learning_rate": 3.158682634730539e-07,
"loss": 0.035,
"step": 135000
},
{
"epoch": 33.80738522954092,
"grad_norm": 0.006651519797742367,
"learning_rate": 3.096307385229541e-07,
"loss": 0.0368,
"step": 135500
},
{
"epoch": 33.93213572854292,
"grad_norm": 0.04128989204764366,
"learning_rate": 3.033932135728543e-07,
"loss": 0.0301,
"step": 136000
},
{
"epoch": 34.0,
"eval_loss": 0.4872666597366333,
"eval_runtime": 37.6035,
"eval_samples_per_second": 85.258,
"eval_steps_per_second": 21.328,
"step": 136272
},
{
"epoch": 34.05688622754491,
"grad_norm": 0.05333876982331276,
"learning_rate": 2.971556886227545e-07,
"loss": 0.0349,
"step": 136500
},
{
"epoch": 34.181636726546905,
"grad_norm": 1.3579726219177246,
"learning_rate": 2.909181636726547e-07,
"loss": 0.0285,
"step": 137000
},
{
"epoch": 34.3063872255489,
"grad_norm": 0.6725994348526001,
"learning_rate": 2.8468063872255486e-07,
"loss": 0.0361,
"step": 137500
},
{
"epoch": 34.4311377245509,
"grad_norm": 0.03919246420264244,
"learning_rate": 2.7844311377245504e-07,
"loss": 0.0274,
"step": 138000
},
{
"epoch": 34.55588822355289,
"grad_norm": 35.5837287902832,
"learning_rate": 2.722055888223553e-07,
"loss": 0.0363,
"step": 138500
},
{
"epoch": 34.68063872255489,
"grad_norm": 0.007728968746960163,
"learning_rate": 2.659680638722555e-07,
"loss": 0.0391,
"step": 139000
},
{
"epoch": 34.80538922155689,
"grad_norm": 0.07272203266620636,
"learning_rate": 2.5973053892215567e-07,
"loss": 0.0268,
"step": 139500
},
{
"epoch": 34.930139720558884,
"grad_norm": 0.33094656467437744,
"learning_rate": 2.5349301397205586e-07,
"loss": 0.0365,
"step": 140000
},
{
"epoch": 35.0,
"eval_loss": 0.4920032024383545,
"eval_runtime": 40.4781,
"eval_samples_per_second": 79.203,
"eval_steps_per_second": 19.813,
"step": 140280
},
{
"epoch": 35.054890219560875,
"grad_norm": 191.99266052246094,
"learning_rate": 2.472554890219561e-07,
"loss": 0.0333,
"step": 140500
},
{
"epoch": 35.17964071856287,
"grad_norm": 0.002573936013504863,
"learning_rate": 2.410179640718563e-07,
"loss": 0.0327,
"step": 141000
},
{
"epoch": 35.30439121756487,
"grad_norm": 0.04750495404005051,
"learning_rate": 2.3478043912175645e-07,
"loss": 0.0345,
"step": 141500
},
{
"epoch": 35.42914171656687,
"grad_norm": 193.8626251220703,
"learning_rate": 2.2854291417165667e-07,
"loss": 0.0321,
"step": 142000
},
{
"epoch": 35.553892215568865,
"grad_norm": 0.0009173236903734505,
"learning_rate": 2.2230538922155686e-07,
"loss": 0.0359,
"step": 142500
},
{
"epoch": 35.678642714570856,
"grad_norm": 0.12355954945087433,
"learning_rate": 2.1606786427145708e-07,
"loss": 0.0347,
"step": 143000
},
{
"epoch": 35.80339321357285,
"grad_norm": 0.24140344560146332,
"learning_rate": 2.0983033932135726e-07,
"loss": 0.031,
"step": 143500
},
{
"epoch": 35.92814371257485,
"grad_norm": 0.007129414472728968,
"learning_rate": 2.0359281437125748e-07,
"loss": 0.0214,
"step": 144000
},
{
"epoch": 36.0,
"eval_loss": 0.4941750466823578,
"eval_runtime": 38.7085,
"eval_samples_per_second": 82.824,
"eval_steps_per_second": 20.719,
"step": 144288
},
{
"epoch": 36.05289421157685,
"grad_norm": 0.27973344922065735,
"learning_rate": 1.9735528942115767e-07,
"loss": 0.0331,
"step": 144500
},
{
"epoch": 36.17764471057884,
"grad_norm": 0.05331612005829811,
"learning_rate": 1.911177644710579e-07,
"loss": 0.0303,
"step": 145000
},
{
"epoch": 36.30239520958084,
"grad_norm": 1.8135106563568115,
"learning_rate": 1.8488023952095808e-07,
"loss": 0.0349,
"step": 145500
},
{
"epoch": 36.427145708582835,
"grad_norm": 0.13009090721607208,
"learning_rate": 1.7864271457085827e-07,
"loss": 0.0405,
"step": 146000
},
{
"epoch": 36.55189620758483,
"grad_norm": 0.07144490629434586,
"learning_rate": 1.7240518962075848e-07,
"loss": 0.0377,
"step": 146500
},
{
"epoch": 36.67664670658683,
"grad_norm": 74.39689636230469,
"learning_rate": 1.6616766467065867e-07,
"loss": 0.0278,
"step": 147000
},
{
"epoch": 36.80139720558882,
"grad_norm": 0.08526595681905746,
"learning_rate": 1.599301397205589e-07,
"loss": 0.0306,
"step": 147500
},
{
"epoch": 36.92614770459082,
"grad_norm": 12.262850761413574,
"learning_rate": 1.5369261477045908e-07,
"loss": 0.0314,
"step": 148000
},
{
"epoch": 37.0,
"eval_loss": 0.49442577362060547,
"eval_runtime": 42.7404,
"eval_samples_per_second": 75.011,
"eval_steps_per_second": 18.764,
"step": 148296
},
{
"epoch": 37.050898203592816,
"grad_norm": 0.02493446320295334,
"learning_rate": 1.474550898203593e-07,
"loss": 0.0262,
"step": 148500
},
{
"epoch": 37.17564870259481,
"grad_norm": 0.14130648970603943,
"learning_rate": 1.4121756487025949e-07,
"loss": 0.0281,
"step": 149000
},
{
"epoch": 37.300399201596804,
"grad_norm": 0.035768117755651474,
"learning_rate": 1.3498003992015965e-07,
"loss": 0.0255,
"step": 149500
},
{
"epoch": 37.4251497005988,
"grad_norm": 0.18820720911026,
"learning_rate": 1.2874251497005986e-07,
"loss": 0.032,
"step": 150000
},
{
"epoch": 37.5499001996008,
"grad_norm": 0.37001463770866394,
"learning_rate": 1.2250499001996008e-07,
"loss": 0.0301,
"step": 150500
},
{
"epoch": 37.6746506986028,
"grad_norm": 0.06626907736063004,
"learning_rate": 1.1626746506986028e-07,
"loss": 0.0238,
"step": 151000
},
{
"epoch": 37.79940119760479,
"grad_norm": 19.17169189453125,
"learning_rate": 1.1002994011976049e-07,
"loss": 0.0385,
"step": 151500
},
{
"epoch": 37.924151696606785,
"grad_norm": 4.972864627838135,
"learning_rate": 1.0379241516966066e-07,
"loss": 0.0337,
"step": 152000
},
{
"epoch": 38.0,
"eval_loss": 0.48605817556381226,
"eval_runtime": 40.1954,
"eval_samples_per_second": 79.76,
"eval_steps_per_second": 19.953,
"step": 152304
},
{
"epoch": 38.04890219560878,
"grad_norm": 0.002587054157629609,
"learning_rate": 9.755489021956087e-08,
"loss": 0.0334,
"step": 152500
},
{
"epoch": 38.17365269461078,
"grad_norm": 70.7108383178711,
"learning_rate": 9.131736526946107e-08,
"loss": 0.0319,
"step": 153000
},
{
"epoch": 38.29840319361278,
"grad_norm": 0.5694107413291931,
"learning_rate": 8.507984031936127e-08,
"loss": 0.0313,
"step": 153500
},
{
"epoch": 38.42315369261477,
"grad_norm": 0.003176228841766715,
"learning_rate": 7.884231536926148e-08,
"loss": 0.0298,
"step": 154000
},
{
"epoch": 38.547904191616766,
"grad_norm": 0.004230498801916838,
"learning_rate": 7.260479041916168e-08,
"loss": 0.0284,
"step": 154500
},
{
"epoch": 38.672654690618764,
"grad_norm": 0.13844607770442963,
"learning_rate": 6.636726546906188e-08,
"loss": 0.0305,
"step": 155000
},
{
"epoch": 38.79740518962076,
"grad_norm": 0.05394995957612991,
"learning_rate": 6.012974051896207e-08,
"loss": 0.0269,
"step": 155500
},
{
"epoch": 38.92215568862275,
"grad_norm": 0.11763022094964981,
"learning_rate": 5.3892215568862274e-08,
"loss": 0.0279,
"step": 156000
},
{
"epoch": 39.0,
"eval_loss": 0.4873499870300293,
"eval_runtime": 44.0281,
"eval_samples_per_second": 72.817,
"eval_steps_per_second": 18.216,
"step": 156312
},
{
"epoch": 39.04690618762475,
"grad_norm": 0.22139760851860046,
"learning_rate": 4.765469061876248e-08,
"loss": 0.0255,
"step": 156500
},
{
"epoch": 39.17165668662675,
"grad_norm": 0.002428988926112652,
"learning_rate": 4.1417165668662674e-08,
"loss": 0.0302,
"step": 157000
},
{
"epoch": 39.296407185628745,
"grad_norm": 0.07879871129989624,
"learning_rate": 3.517964071856287e-08,
"loss": 0.027,
"step": 157500
},
{
"epoch": 39.421157684630735,
"grad_norm": 0.03594490885734558,
"learning_rate": 2.8942115768463073e-08,
"loss": 0.033,
"step": 158000
},
{
"epoch": 39.54590818363273,
"grad_norm": 0.12444847822189331,
"learning_rate": 2.2704590818363273e-08,
"loss": 0.0271,
"step": 158500
},
{
"epoch": 39.67065868263473,
"grad_norm": 47.82669448852539,
"learning_rate": 1.6467065868263473e-08,
"loss": 0.0276,
"step": 159000
},
{
"epoch": 39.79540918163673,
"grad_norm": 0.1385308802127838,
"learning_rate": 1.0229540918163672e-08,
"loss": 0.03,
"step": 159500
},
{
"epoch": 39.920159680638726,
"grad_norm": 0.1429419070482254,
"learning_rate": 3.992015968063871e-09,
"loss": 0.0303,
"step": 160000
}
],
"logging_steps": 500,
"max_steps": 160320,
"num_input_tokens_seen": 0,
"num_train_epochs": 40,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.21770798769152e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}