MohamedAhmedAE's picture
Training in progress, step 87200, checkpoint
aac52d8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.12129203840172702,
"eval_steps": 200,
"global_step": 87200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00021075215336012696,
"grad_norm": 0.5226424932479858,
"learning_rate": 1.9999997880573555e-05,
"loss": 2.036,
"step": 300
},
{
"epoch": 0.0004215043067202539,
"grad_norm": 0.38226085901260376,
"learning_rate": 1.9999991377996364e-05,
"loss": 1.7571,
"step": 600
},
{
"epoch": 0.0006322564600803809,
"grad_norm": 0.563613772392273,
"learning_rate": 1.9999980491662426e-05,
"loss": 1.8127,
"step": 900
},
{
"epoch": 0.0008430086134405078,
"grad_norm": 0.5952104330062866,
"learning_rate": 1.999996522157651e-05,
"loss": 1.7384,
"step": 1200
},
{
"epoch": 0.0010537607668006349,
"grad_norm": 0.5547693967819214,
"learning_rate": 1.999994556774531e-05,
"loss": 1.7468,
"step": 1500
},
{
"epoch": 0.0012645129201607618,
"grad_norm": 0.43776935338974,
"learning_rate": 1.9999921530177446e-05,
"loss": 1.758,
"step": 1800
},
{
"epoch": 0.0014752650735208887,
"grad_norm": 0.5462590456008911,
"learning_rate": 1.999989310888345e-05,
"loss": 1.7594,
"step": 2100
},
{
"epoch": 0.0016860172268810157,
"grad_norm": 0.8200376033782959,
"learning_rate": 1.9999860303875793e-05,
"loss": 1.7043,
"step": 2400
},
{
"epoch": 0.0018967693802411426,
"grad_norm": 0.7284532189369202,
"learning_rate": 1.9999823115168838e-05,
"loss": 1.7438,
"step": 2700
},
{
"epoch": 0.0021075215336012697,
"grad_norm": 0.33967018127441406,
"learning_rate": 1.9999781542778898e-05,
"loss": 1.714,
"step": 3000
},
{
"epoch": 0.0023182736869613967,
"grad_norm": 0.5259618759155273,
"learning_rate": 1.9999735586724198e-05,
"loss": 1.7203,
"step": 3300
},
{
"epoch": 0.0025290258403215236,
"grad_norm": 0.43303459882736206,
"learning_rate": 1.9999685247024884e-05,
"loss": 1.727,
"step": 3600
},
{
"epoch": 0.0027397779936816505,
"grad_norm": 0.46599942445755005,
"learning_rate": 1.999963052370302e-05,
"loss": 1.7606,
"step": 3900
},
{
"epoch": 0.0029505301470417774,
"grad_norm": 0.3022066652774811,
"learning_rate": 1.9999571416782594e-05,
"loss": 1.6982,
"step": 4200
},
{
"epoch": 0.0031612823004019044,
"grad_norm": 0.6201561093330383,
"learning_rate": 1.999950792628952e-05,
"loss": 1.7679,
"step": 4500
},
{
"epoch": 0.0033720344537620313,
"grad_norm": 0.49575334787368774,
"learning_rate": 1.9999440052251636e-05,
"loss": 1.7536,
"step": 4800
},
{
"epoch": 0.0035827866071221582,
"grad_norm": 0.5179804563522339,
"learning_rate": 1.999936779469869e-05,
"loss": 1.6947,
"step": 5100
},
{
"epoch": 0.003793538760482285,
"grad_norm": 0.7314251661300659,
"learning_rate": 1.9999291153662357e-05,
"loss": 1.778,
"step": 5400
},
{
"epoch": 0.004004290913842412,
"grad_norm": 0.6501708626747131,
"learning_rate": 1.999921012917624e-05,
"loss": 1.7251,
"step": 5700
},
{
"epoch": 0.0042150430672025395,
"grad_norm": 0.4130661189556122,
"learning_rate": 1.9999124721275855e-05,
"loss": 1.7343,
"step": 6000
},
{
"epoch": 0.004425795220562666,
"grad_norm": 0.8185787200927734,
"learning_rate": 1.999903492999864e-05,
"loss": 1.7199,
"step": 6300
},
{
"epoch": 0.004636547373922793,
"grad_norm": 0.8078156113624573,
"learning_rate": 1.999894075538396e-05,
"loss": 1.6994,
"step": 6600
},
{
"epoch": 0.00484729952728292,
"grad_norm": 0.5087378621101379,
"learning_rate": 1.99988421974731e-05,
"loss": 1.7405,
"step": 6900
},
{
"epoch": 0.005058051680643047,
"grad_norm": 0.41504907608032227,
"learning_rate": 1.9998739256309265e-05,
"loss": 1.7625,
"step": 7200
},
{
"epoch": 0.005268803834003174,
"grad_norm": 0.6914900541305542,
"learning_rate": 1.9998631931937582e-05,
"loss": 1.7243,
"step": 7500
},
{
"epoch": 0.005479555987363301,
"grad_norm": 0.7715939283370972,
"learning_rate": 1.99985202244051e-05,
"loss": 1.7325,
"step": 7800
},
{
"epoch": 0.0056903081407234275,
"grad_norm": 0.7590686678886414,
"learning_rate": 1.9998404133760786e-05,
"loss": 1.7087,
"step": 8100
},
{
"epoch": 0.005901060294083555,
"grad_norm": 0.4283329248428345,
"learning_rate": 1.999828366005553e-05,
"loss": 1.6673,
"step": 8400
},
{
"epoch": 0.006111812447443681,
"grad_norm": 0.340751051902771,
"learning_rate": 1.9998158803342154e-05,
"loss": 1.7053,
"step": 8700
},
{
"epoch": 0.006322564600803809,
"grad_norm": 0.691828191280365,
"learning_rate": 1.999802956367538e-05,
"loss": 1.6969,
"step": 9000
},
{
"epoch": 0.006533316754163935,
"grad_norm": 0.690230131149292,
"learning_rate": 1.9997895941111877e-05,
"loss": 1.6877,
"step": 9300
},
{
"epoch": 0.006744068907524063,
"grad_norm": 0.493589848279953,
"learning_rate": 1.999775793571021e-05,
"loss": 1.7322,
"step": 9600
},
{
"epoch": 0.00695482106088419,
"grad_norm": 0.3544859290122986,
"learning_rate": 1.9997615547530883e-05,
"loss": 1.6785,
"step": 9900
},
{
"epoch": 0.0071655732142443165,
"grad_norm": 0.385469913482666,
"learning_rate": 1.9997468776636312e-05,
"loss": 1.6838,
"step": 10200
},
{
"epoch": 0.007376325367604444,
"grad_norm": 0.5788832902908325,
"learning_rate": 1.9997317623090845e-05,
"loss": 1.6544,
"step": 10500
},
{
"epoch": 0.00758707752096457,
"grad_norm": 0.5985785722732544,
"learning_rate": 1.9997162086960737e-05,
"loss": 1.7223,
"step": 10800
},
{
"epoch": 0.007797829674324698,
"grad_norm": 0.316755473613739,
"learning_rate": 1.999700216831417e-05,
"loss": 1.69,
"step": 11100
},
{
"epoch": 0.008008581827684824,
"grad_norm": 0.38338717818260193,
"learning_rate": 1.999683786722126e-05,
"loss": 1.6609,
"step": 11400
},
{
"epoch": 0.00821933398104495,
"grad_norm": 0.5168259143829346,
"learning_rate": 1.999666918375402e-05,
"loss": 1.6581,
"step": 11700
},
{
"epoch": 0.008430086134405079,
"grad_norm": 0.534095823764801,
"learning_rate": 1.9996496117986404e-05,
"loss": 1.6349,
"step": 12000
},
{
"epoch": 0.008640838287765205,
"grad_norm": 0.3490186333656311,
"learning_rate": 1.9996318669994275e-05,
"loss": 1.6453,
"step": 12300
},
{
"epoch": 0.008851590441125332,
"grad_norm": 0.8970847129821777,
"learning_rate": 1.9996136839855426e-05,
"loss": 1.7177,
"step": 12600
},
{
"epoch": 0.009062342594485458,
"grad_norm": 0.26147884130477905,
"learning_rate": 1.9995950627649567e-05,
"loss": 1.7142,
"step": 12900
},
{
"epoch": 0.009273094747845587,
"grad_norm": 0.2811965048313141,
"learning_rate": 1.9995760033458323e-05,
"loss": 1.6708,
"step": 13200
},
{
"epoch": 0.009483846901205713,
"grad_norm": 0.38605690002441406,
"learning_rate": 1.9995565057365255e-05,
"loss": 1.7134,
"step": 13500
},
{
"epoch": 0.00969459905456584,
"grad_norm": 0.3690793514251709,
"learning_rate": 1.999536569945583e-05,
"loss": 1.7195,
"step": 13800
},
{
"epoch": 0.009905351207925968,
"grad_norm": 0.6093018054962158,
"learning_rate": 1.9995161959817442e-05,
"loss": 1.7219,
"step": 14100
},
{
"epoch": 0.010116103361286094,
"grad_norm": 0.5218012928962708,
"learning_rate": 1.9994953838539408e-05,
"loss": 1.7018,
"step": 14400
},
{
"epoch": 0.01032685551464622,
"grad_norm": 0.49458399415016174,
"learning_rate": 1.9994741335712963e-05,
"loss": 1.6825,
"step": 14700
},
{
"epoch": 0.010537607668006347,
"grad_norm": 0.559432327747345,
"learning_rate": 1.999452445143126e-05,
"loss": 1.7167,
"step": 15000
},
{
"epoch": 0.010748359821366476,
"grad_norm": 0.30228278040885925,
"learning_rate": 1.999430318578938e-05,
"loss": 1.6662,
"step": 15300
},
{
"epoch": 0.010959111974726602,
"grad_norm": 0.7081576585769653,
"learning_rate": 1.9994077538884315e-05,
"loss": 1.7021,
"step": 15600
},
{
"epoch": 0.011169864128086729,
"grad_norm": 0.35427266359329224,
"learning_rate": 1.9993847510814986e-05,
"loss": 1.7281,
"step": 15900
},
{
"epoch": 0.011380616281446855,
"grad_norm": 0.5683895349502563,
"learning_rate": 1.9993613101682236e-05,
"loss": 1.7239,
"step": 16200
},
{
"epoch": 0.011591368434806983,
"grad_norm": 0.38721588253974915,
"learning_rate": 1.999337431158882e-05,
"loss": 1.6956,
"step": 16500
},
{
"epoch": 0.01180212058816711,
"grad_norm": 0.5694710612297058,
"learning_rate": 1.9993131140639417e-05,
"loss": 1.7068,
"step": 16800
},
{
"epoch": 0.012012872741527236,
"grad_norm": 0.6137599945068359,
"learning_rate": 1.9992883588940636e-05,
"loss": 1.6573,
"step": 17100
},
{
"epoch": 0.012223624894887363,
"grad_norm": 0.40165194869041443,
"learning_rate": 1.999263165660098e-05,
"loss": 1.7253,
"step": 17400
},
{
"epoch": 0.012434377048247491,
"grad_norm": 0.6021196246147156,
"learning_rate": 1.9992375343730913e-05,
"loss": 1.7137,
"step": 17700
},
{
"epoch": 0.012645129201607617,
"grad_norm": 0.26746639609336853,
"learning_rate": 1.9992114650442776e-05,
"loss": 1.6879,
"step": 18000
},
{
"epoch": 0.012855881354967744,
"grad_norm": 0.2976267635822296,
"learning_rate": 1.999184957685086e-05,
"loss": 1.7046,
"step": 18300
},
{
"epoch": 0.01306663350832787,
"grad_norm": 0.7765467166900635,
"learning_rate": 1.999158012307137e-05,
"loss": 1.659,
"step": 18600
},
{
"epoch": 0.013277385661687999,
"grad_norm": 0.3900175392627716,
"learning_rate": 1.999130628922242e-05,
"loss": 1.6311,
"step": 18900
},
{
"epoch": 0.013488137815048125,
"grad_norm": 0.5595497488975525,
"learning_rate": 1.9991028075424058e-05,
"loss": 1.7031,
"step": 19200
},
{
"epoch": 0.013698889968408252,
"grad_norm": 0.4682876765727997,
"learning_rate": 1.999074548179824e-05,
"loss": 1.678,
"step": 19500
},
{
"epoch": 0.01390964212176838,
"grad_norm": 0.29408198595046997,
"learning_rate": 1.999045850846886e-05,
"loss": 1.6933,
"step": 19800
},
{
"epoch": 0.014120394275128506,
"grad_norm": 0.5688738226890564,
"learning_rate": 1.999016715556171e-05,
"loss": 1.7094,
"step": 20100
},
{
"epoch": 0.014331146428488633,
"grad_norm": 0.33922871947288513,
"learning_rate": 1.9989871423204515e-05,
"loss": 1.7241,
"step": 20400
},
{
"epoch": 0.01454189858184876,
"grad_norm": 0.5533545613288879,
"learning_rate": 1.9989571311526917e-05,
"loss": 1.6947,
"step": 20700
},
{
"epoch": 0.014752650735208888,
"grad_norm": 0.4148263931274414,
"learning_rate": 1.9989266820660477e-05,
"loss": 1.6968,
"step": 21000
},
{
"epoch": 0.014963402888569014,
"grad_norm": 0.41911232471466064,
"learning_rate": 1.9988957950738678e-05,
"loss": 1.6583,
"step": 21300
},
{
"epoch": 0.01517415504192914,
"grad_norm": 0.6024166941642761,
"learning_rate": 1.9988644701896922e-05,
"loss": 1.7029,
"step": 21600
},
{
"epoch": 0.015384907195289267,
"grad_norm": 1.0043667554855347,
"learning_rate": 1.9988327074272528e-05,
"loss": 1.6589,
"step": 21900
},
{
"epoch": 0.015595659348649395,
"grad_norm": 0.2308570146560669,
"learning_rate": 1.9988005068004734e-05,
"loss": 1.6768,
"step": 22200
},
{
"epoch": 0.01580641150200952,
"grad_norm": 0.37441328167915344,
"learning_rate": 1.9987678683234707e-05,
"loss": 1.6844,
"step": 22500
},
{
"epoch": 0.01601716365536965,
"grad_norm": 0.5360813736915588,
"learning_rate": 1.9987347920105517e-05,
"loss": 1.6552,
"step": 22800
},
{
"epoch": 0.016227915808729777,
"grad_norm": 0.6203900575637817,
"learning_rate": 1.9987012778762173e-05,
"loss": 1.6582,
"step": 23100
},
{
"epoch": 0.0164386679620899,
"grad_norm": 0.44901394844055176,
"learning_rate": 1.9986673259351584e-05,
"loss": 1.6519,
"step": 23400
},
{
"epoch": 0.01664942011545003,
"grad_norm": 0.35824868083000183,
"learning_rate": 1.998632936202259e-05,
"loss": 1.6868,
"step": 23700
},
{
"epoch": 0.016860172268810158,
"grad_norm": 0.2986483871936798,
"learning_rate": 1.998598108692595e-05,
"loss": 1.681,
"step": 24000
},
{
"epoch": 0.017070924422170283,
"grad_norm": 0.24179548025131226,
"learning_rate": 1.9985628434214334e-05,
"loss": 1.6609,
"step": 24300
},
{
"epoch": 0.01728167657553041,
"grad_norm": 0.3813832402229309,
"learning_rate": 1.9985271404042343e-05,
"loss": 1.7062,
"step": 24600
},
{
"epoch": 0.01749242872889054,
"grad_norm": 0.729052722454071,
"learning_rate": 1.9984909996566487e-05,
"loss": 1.6842,
"step": 24900
},
{
"epoch": 0.017703180882250664,
"grad_norm": 0.4519130289554596,
"learning_rate": 1.99845442119452e-05,
"loss": 1.6985,
"step": 25200
},
{
"epoch": 0.017913933035610792,
"grad_norm": 0.8739224672317505,
"learning_rate": 1.998417405033883e-05,
"loss": 1.7079,
"step": 25500
},
{
"epoch": 0.018124685188970917,
"grad_norm": 0.6653211712837219,
"learning_rate": 1.998379951190965e-05,
"loss": 1.7096,
"step": 25800
},
{
"epoch": 0.018335437342331045,
"grad_norm": 0.284675270318985,
"learning_rate": 1.9983420596821848e-05,
"loss": 1.6631,
"step": 26100
},
{
"epoch": 0.018546189495691173,
"grad_norm": 0.3598421514034271,
"learning_rate": 1.998303730524153e-05,
"loss": 1.675,
"step": 26400
},
{
"epoch": 0.018756941649051298,
"grad_norm": 0.4144167900085449,
"learning_rate": 1.9982649637336722e-05,
"loss": 1.6568,
"step": 26700
},
{
"epoch": 0.018967693802411426,
"grad_norm": 0.3359282314777374,
"learning_rate": 1.998225759327737e-05,
"loss": 1.7094,
"step": 27000
},
{
"epoch": 0.019178445955771554,
"grad_norm": 0.513903021812439,
"learning_rate": 1.9981861173235337e-05,
"loss": 1.7124,
"step": 27300
},
{
"epoch": 0.01938919810913168,
"grad_norm": 0.6693827509880066,
"learning_rate": 1.9981460377384402e-05,
"loss": 1.7058,
"step": 27600
},
{
"epoch": 0.019599950262491807,
"grad_norm": 0.5340267419815063,
"learning_rate": 1.9981055205900263e-05,
"loss": 1.6654,
"step": 27900
},
{
"epoch": 0.019810702415851936,
"grad_norm": 0.5445379018783569,
"learning_rate": 1.9980645658960543e-05,
"loss": 1.7163,
"step": 28200
},
{
"epoch": 0.02002145456921206,
"grad_norm": 0.4673166275024414,
"learning_rate": 1.9980231736744772e-05,
"loss": 1.6885,
"step": 28500
},
{
"epoch": 0.02023220672257219,
"grad_norm": 0.3964649438858032,
"learning_rate": 1.997981343943441e-05,
"loss": 1.7349,
"step": 28800
},
{
"epoch": 0.020442958875932313,
"grad_norm": 0.2978706955909729,
"learning_rate": 1.9979390767212818e-05,
"loss": 1.7043,
"step": 29100
},
{
"epoch": 0.02065371102929244,
"grad_norm": 0.5393288135528564,
"learning_rate": 1.9978963720265297e-05,
"loss": 1.6713,
"step": 29400
},
{
"epoch": 0.02086446318265257,
"grad_norm": 0.5377795100212097,
"learning_rate": 1.9978532298779047e-05,
"loss": 1.7386,
"step": 29700
},
{
"epoch": 0.021075215336012695,
"grad_norm": 0.9713721871376038,
"learning_rate": 1.997809650294319e-05,
"loss": 1.666,
"step": 30000
},
{
"epoch": 0.021285967489372823,
"grad_norm": 0.5377500057220459,
"learning_rate": 1.9977656332948783e-05,
"loss": 1.6333,
"step": 30300
},
{
"epoch": 0.02149671964273295,
"grad_norm": 0.7222330570220947,
"learning_rate": 1.997721178898877e-05,
"loss": 1.647,
"step": 30600
},
{
"epoch": 0.021707471796093076,
"grad_norm": 0.7166306972503662,
"learning_rate": 1.9976762871258036e-05,
"loss": 1.6702,
"step": 30900
},
{
"epoch": 0.021918223949453204,
"grad_norm": 0.5046064257621765,
"learning_rate": 1.9976309579953374e-05,
"loss": 1.6834,
"step": 31200
},
{
"epoch": 0.02212897610281333,
"grad_norm": 0.45860910415649414,
"learning_rate": 1.9975851915273495e-05,
"loss": 1.6752,
"step": 31500
},
{
"epoch": 0.022339728256173457,
"grad_norm": 0.29681679606437683,
"learning_rate": 1.9975389877419033e-05,
"loss": 1.6773,
"step": 31800
},
{
"epoch": 0.022550480409533585,
"grad_norm": 0.3104653060436249,
"learning_rate": 1.9974923466592528e-05,
"loss": 1.6612,
"step": 32100
},
{
"epoch": 0.02276123256289371,
"grad_norm": 0.3063599467277527,
"learning_rate": 1.9974452682998446e-05,
"loss": 1.6728,
"step": 32400
},
{
"epoch": 0.02297198471625384,
"grad_norm": 0.5225020051002502,
"learning_rate": 1.9973977526843173e-05,
"loss": 1.6858,
"step": 32700
},
{
"epoch": 0.023182736869613967,
"grad_norm": 0.5143675208091736,
"learning_rate": 1.9973497998334993e-05,
"loss": 1.6589,
"step": 33000
},
{
"epoch": 0.02339348902297409,
"grad_norm": 0.38197314739227295,
"learning_rate": 1.9973014097684134e-05,
"loss": 1.6792,
"step": 33300
},
{
"epoch": 0.02360424117633422,
"grad_norm": 0.3367295563220978,
"learning_rate": 1.9972525825102716e-05,
"loss": 1.699,
"step": 33600
},
{
"epoch": 0.023814993329694348,
"grad_norm": 0.7394578456878662,
"learning_rate": 1.997203318080479e-05,
"loss": 1.6465,
"step": 33900
},
{
"epoch": 0.024025745483054473,
"grad_norm": 0.474181205034256,
"learning_rate": 1.9971536165006323e-05,
"loss": 1.6735,
"step": 34200
},
{
"epoch": 0.0242364976364146,
"grad_norm": 0.23586097359657288,
"learning_rate": 1.997103477792519e-05,
"loss": 1.673,
"step": 34500
},
{
"epoch": 0.024447249789774726,
"grad_norm": 0.6340083479881287,
"learning_rate": 1.9970529019781188e-05,
"loss": 1.6962,
"step": 34800
},
{
"epoch": 0.024658001943134854,
"grad_norm": 0.6163159608840942,
"learning_rate": 1.997001889079603e-05,
"loss": 1.7392,
"step": 35100
},
{
"epoch": 0.024868754096494982,
"grad_norm": 0.493683397769928,
"learning_rate": 1.996950439119334e-05,
"loss": 1.69,
"step": 35400
},
{
"epoch": 0.025079506249855107,
"grad_norm": 0.5327634215354919,
"learning_rate": 1.996898552119867e-05,
"loss": 1.6646,
"step": 35700
},
{
"epoch": 0.025290258403215235,
"grad_norm": 0.31081724166870117,
"learning_rate": 1.9968462281039477e-05,
"loss": 1.6577,
"step": 36000
},
{
"epoch": 0.025501010556575363,
"grad_norm": 0.35480156540870667,
"learning_rate": 1.9967934670945133e-05,
"loss": 1.7186,
"step": 36300
},
{
"epoch": 0.025711762709935488,
"grad_norm": 0.6122684478759766,
"learning_rate": 1.996740269114694e-05,
"loss": 1.6977,
"step": 36600
},
{
"epoch": 0.025922514863295616,
"grad_norm": 0.5439947843551636,
"learning_rate": 1.996686634187809e-05,
"loss": 1.707,
"step": 36900
},
{
"epoch": 0.02613326701665574,
"grad_norm": 0.2414426952600479,
"learning_rate": 1.996632562337372e-05,
"loss": 1.685,
"step": 37200
},
{
"epoch": 0.02634401917001587,
"grad_norm": 0.5094660520553589,
"learning_rate": 1.9965780535870857e-05,
"loss": 1.7265,
"step": 37500
},
{
"epoch": 0.026554771323375997,
"grad_norm": 0.47010356187820435,
"learning_rate": 1.996523107960846e-05,
"loss": 1.7038,
"step": 37800
},
{
"epoch": 0.026765523476736122,
"grad_norm": 0.8133333325386047,
"learning_rate": 1.99646772548274e-05,
"loss": 1.6871,
"step": 38100
},
{
"epoch": 0.02697627563009625,
"grad_norm": 0.38821008801460266,
"learning_rate": 1.9964119061770457e-05,
"loss": 1.708,
"step": 38400
},
{
"epoch": 0.02718702778345638,
"grad_norm": 0.4741418957710266,
"learning_rate": 1.9963556500682326e-05,
"loss": 1.6105,
"step": 38700
},
{
"epoch": 0.027397779936816503,
"grad_norm": 0.29607221484184265,
"learning_rate": 1.9962989571809627e-05,
"loss": 1.6973,
"step": 39000
},
{
"epoch": 0.02760853209017663,
"grad_norm": 0.6345324516296387,
"learning_rate": 1.9962418275400887e-05,
"loss": 1.6509,
"step": 39300
},
{
"epoch": 0.02781928424353676,
"grad_norm": 0.45032599568367004,
"learning_rate": 1.996184261170654e-05,
"loss": 1.7075,
"step": 39600
},
{
"epoch": 0.028030036396896885,
"grad_norm": 0.5050271153450012,
"learning_rate": 1.9961262580978954e-05,
"loss": 1.7109,
"step": 39900
},
{
"epoch": 0.028240788550257013,
"grad_norm": 0.41781535744667053,
"learning_rate": 1.9960678183472398e-05,
"loss": 1.6649,
"step": 40200
},
{
"epoch": 0.028451540703617138,
"grad_norm": 0.26590654253959656,
"learning_rate": 1.9960089419443054e-05,
"loss": 1.6784,
"step": 40500
},
{
"epoch": 0.028662292856977266,
"grad_norm": 0.2450639009475708,
"learning_rate": 1.9959496289149025e-05,
"loss": 1.6668,
"step": 40800
},
{
"epoch": 0.028873045010337394,
"grad_norm": 0.4967813789844513,
"learning_rate": 1.9958898792850324e-05,
"loss": 1.6743,
"step": 41100
},
{
"epoch": 0.02908379716369752,
"grad_norm": 0.6676028966903687,
"learning_rate": 1.995829693080888e-05,
"loss": 1.7498,
"step": 41400
},
{
"epoch": 0.029294549317057647,
"grad_norm": 0.5373163819313049,
"learning_rate": 1.995769070328854e-05,
"loss": 1.6637,
"step": 41700
},
{
"epoch": 0.029505301470417775,
"grad_norm": 0.519572377204895,
"learning_rate": 1.9957080110555046e-05,
"loss": 1.6743,
"step": 42000
},
{
"epoch": 0.0297160536237779,
"grad_norm": 0.4347212612628937,
"learning_rate": 1.9956465152876076e-05,
"loss": 1.6588,
"step": 42300
},
{
"epoch": 0.02992680577713803,
"grad_norm": 1.0741196870803833,
"learning_rate": 1.9955845830521215e-05,
"loss": 1.6594,
"step": 42600
},
{
"epoch": 0.030137557930498157,
"grad_norm": 0.29438555240631104,
"learning_rate": 1.9955222143761954e-05,
"loss": 1.6742,
"step": 42900
},
{
"epoch": 0.03034831008385828,
"grad_norm": 0.2416359782218933,
"learning_rate": 1.995459409287171e-05,
"loss": 1.6244,
"step": 43200
},
{
"epoch": 0.03055906223721841,
"grad_norm": 0.3811536431312561,
"learning_rate": 1.995396167812579e-05,
"loss": 1.6768,
"step": 43500
},
{
"epoch": 0.030769814390578534,
"grad_norm": 0.52689528465271,
"learning_rate": 1.995332489980145e-05,
"loss": 1.6966,
"step": 43800
},
{
"epoch": 0.030980566543938663,
"grad_norm": 0.7789379358291626,
"learning_rate": 1.9952683758177822e-05,
"loss": 1.7243,
"step": 44100
},
{
"epoch": 0.03119131869729879,
"grad_norm": 0.5525240898132324,
"learning_rate": 1.9952038253535974e-05,
"loss": 1.6921,
"step": 44400
},
{
"epoch": 0.031402070850658916,
"grad_norm": 0.3712736666202545,
"learning_rate": 1.995138838615888e-05,
"loss": 1.7013,
"step": 44700
},
{
"epoch": 0.03161282300401904,
"grad_norm": 0.9973317980766296,
"learning_rate": 1.9950734156331425e-05,
"loss": 1.6312,
"step": 45000
},
{
"epoch": 0.03182357515737917,
"grad_norm": 0.47756654024124146,
"learning_rate": 1.9950075564340406e-05,
"loss": 1.6871,
"step": 45300
},
{
"epoch": 0.0320343273107393,
"grad_norm": 0.6799851655960083,
"learning_rate": 1.9949412610474533e-05,
"loss": 1.6863,
"step": 45600
},
{
"epoch": 0.03224507946409942,
"grad_norm": 0.4427473843097687,
"learning_rate": 1.9948745295024436e-05,
"loss": 1.6623,
"step": 45900
},
{
"epoch": 0.03245583161745955,
"grad_norm": 0.39299216866493225,
"learning_rate": 1.9948073618282646e-05,
"loss": 1.6714,
"step": 46200
},
{
"epoch": 0.03266658377081968,
"grad_norm": 0.49504077434539795,
"learning_rate": 1.994739758054361e-05,
"loss": 1.6637,
"step": 46500
},
{
"epoch": 0.0328773359241798,
"grad_norm": 0.8702055215835571,
"learning_rate": 1.9946717182103684e-05,
"loss": 1.6592,
"step": 46800
},
{
"epoch": 0.033088088077539934,
"grad_norm": 0.4576627314090729,
"learning_rate": 1.994603242326114e-05,
"loss": 1.6766,
"step": 47100
},
{
"epoch": 0.03329884023090006,
"grad_norm": 0.4096079170703888,
"learning_rate": 1.9945343304316168e-05,
"loss": 1.6465,
"step": 47400
},
{
"epoch": 0.033509592384260184,
"grad_norm": 0.20980533957481384,
"learning_rate": 1.9944649825570846e-05,
"loss": 1.642,
"step": 47700
},
{
"epoch": 0.033720344537620316,
"grad_norm": 0.2815381586551666,
"learning_rate": 1.994395198732919e-05,
"loss": 1.6655,
"step": 48000
},
{
"epoch": 0.03393109669098044,
"grad_norm": 0.3039368987083435,
"learning_rate": 1.9943249789897115e-05,
"loss": 1.6321,
"step": 48300
},
{
"epoch": 0.034141848844340565,
"grad_norm": 0.3904463052749634,
"learning_rate": 1.9942543233582442e-05,
"loss": 1.7152,
"step": 48600
},
{
"epoch": 0.0343526009977007,
"grad_norm": 0.44439107179641724,
"learning_rate": 1.994183231869491e-05,
"loss": 1.6447,
"step": 48900
},
{
"epoch": 0.03456335315106082,
"grad_norm": 0.49679499864578247,
"learning_rate": 1.9941117045546172e-05,
"loss": 1.7256,
"step": 49200
},
{
"epoch": 0.034774105304420946,
"grad_norm": 0.5180505514144897,
"learning_rate": 1.994039741444978e-05,
"loss": 1.6376,
"step": 49500
},
{
"epoch": 0.03498485745778108,
"grad_norm": 0.425346314907074,
"learning_rate": 1.9939673425721203e-05,
"loss": 1.7076,
"step": 49800
},
{
"epoch": 0.0351956096111412,
"grad_norm": 0.6802691221237183,
"learning_rate": 1.9938945079677827e-05,
"loss": 1.7051,
"step": 50100
},
{
"epoch": 0.03540636176450133,
"grad_norm": 0.3185325562953949,
"learning_rate": 1.9938212376638937e-05,
"loss": 1.6973,
"step": 50400
},
{
"epoch": 0.03561711391786145,
"grad_norm": 0.493886262178421,
"learning_rate": 1.9937475316925734e-05,
"loss": 1.7147,
"step": 50700
},
{
"epoch": 0.035827866071221584,
"grad_norm": 0.3434235155582428,
"learning_rate": 1.9936733900861324e-05,
"loss": 1.7198,
"step": 51000
},
{
"epoch": 0.03603861822458171,
"grad_norm": 0.40676742792129517,
"learning_rate": 1.993598812877073e-05,
"loss": 1.7005,
"step": 51300
},
{
"epoch": 0.036249370377941834,
"grad_norm": 0.311731219291687,
"learning_rate": 1.993523800098088e-05,
"loss": 1.6024,
"step": 51600
},
{
"epoch": 0.036460122531301965,
"grad_norm": 0.3956526517868042,
"learning_rate": 1.993448351782061e-05,
"loss": 1.6885,
"step": 51900
},
{
"epoch": 0.03667087468466209,
"grad_norm": 0.5735802054405212,
"learning_rate": 1.993372467962067e-05,
"loss": 1.6835,
"step": 52200
},
{
"epoch": 0.036881626838022215,
"grad_norm": 0.35788044333457947,
"learning_rate": 1.993296148671371e-05,
"loss": 1.641,
"step": 52500
},
{
"epoch": 0.037092378991382347,
"grad_norm": 0.5725888013839722,
"learning_rate": 1.9932193939434304e-05,
"loss": 1.6075,
"step": 52800
},
{
"epoch": 0.03686056482904,
"grad_norm": 0.37512072920799255,
"learning_rate": 1.993303783123765e-05,
"loss": 1.6621,
"step": 53000
},
{
"epoch": 0.03699966130009299,
"grad_norm": 0.46840205788612366,
"learning_rate": 1.993253202515022e-05,
"loss": 1.6933,
"step": 53200
},
{
"epoch": 0.03713875777114597,
"grad_norm": 0.6212956309318542,
"learning_rate": 1.9932024322378897e-05,
"loss": 1.6746,
"step": 53400
},
{
"epoch": 0.03727785424219895,
"grad_norm": 0.44953858852386475,
"learning_rate": 1.9931514723020624e-05,
"loss": 1.6428,
"step": 53600
},
{
"epoch": 0.03741695071325193,
"grad_norm": 0.30623960494995117,
"learning_rate": 1.993100322717272e-05,
"loss": 1.5969,
"step": 53800
},
{
"epoch": 0.037556047184304914,
"grad_norm": 0.2042389065027237,
"learning_rate": 1.9930489834932853e-05,
"loss": 1.6739,
"step": 54000
},
{
"epoch": 0.03769514365535789,
"grad_norm": 1.5293948650360107,
"learning_rate": 1.9929974546399056e-05,
"loss": 1.5911,
"step": 54200
},
{
"epoch": 0.03783424012641087,
"grad_norm": 0.2645045220851898,
"learning_rate": 1.9929457361669735e-05,
"loss": 1.7516,
"step": 54400
},
{
"epoch": 0.03797333659746385,
"grad_norm": 0.47690504789352417,
"learning_rate": 1.992893828084365e-05,
"loss": 1.667,
"step": 54600
},
{
"epoch": 0.03811243306851683,
"grad_norm": 0.683849573135376,
"learning_rate": 1.9928417304019915e-05,
"loss": 1.6974,
"step": 54800
},
{
"epoch": 0.03825152953956982,
"grad_norm": 0.5895049571990967,
"learning_rate": 1.9927894431298016e-05,
"loss": 1.6562,
"step": 55000
},
{
"epoch": 0.0383906260106228,
"grad_norm": 0.41334617137908936,
"learning_rate": 1.9927369662777806e-05,
"loss": 1.6587,
"step": 55200
},
{
"epoch": 0.03852972248167578,
"grad_norm": 0.4091622531414032,
"learning_rate": 1.992684299855949e-05,
"loss": 1.6477,
"step": 55400
},
{
"epoch": 0.03866881895272876,
"grad_norm": 0.28317388892173767,
"learning_rate": 1.992631443874363e-05,
"loss": 1.6445,
"step": 55600
},
{
"epoch": 0.038807915423781744,
"grad_norm": 0.4005356431007385,
"learning_rate": 1.992578398343117e-05,
"loss": 1.6825,
"step": 55800
},
{
"epoch": 0.038947011894834724,
"grad_norm": 0.44340595602989197,
"learning_rate": 1.9925251632723396e-05,
"loss": 1.6805,
"step": 56000
},
{
"epoch": 0.0390861083658877,
"grad_norm": 0.2671707570552826,
"learning_rate": 1.9924717386721964e-05,
"loss": 1.6489,
"step": 56200
},
{
"epoch": 0.03922520483694068,
"grad_norm": 0.5683106780052185,
"learning_rate": 1.9924181245528898e-05,
"loss": 1.6548,
"step": 56400
},
{
"epoch": 0.03936430130799366,
"grad_norm": 0.3212202787399292,
"learning_rate": 1.9923643209246575e-05,
"loss": 1.6416,
"step": 56600
},
{
"epoch": 0.03950339777904665,
"grad_norm": 0.2874471843242645,
"learning_rate": 1.9923103277977735e-05,
"loss": 1.6848,
"step": 56800
},
{
"epoch": 0.03964249425009963,
"grad_norm": 0.38741543889045715,
"learning_rate": 1.992256145182548e-05,
"loss": 1.7068,
"step": 57000
},
{
"epoch": 0.03978159072115261,
"grad_norm": 1.436324954032898,
"learning_rate": 1.9922017730893278e-05,
"loss": 1.6259,
"step": 57200
},
{
"epoch": 0.03992068719220559,
"grad_norm": 0.3459469676017761,
"learning_rate": 1.9921472115284957e-05,
"loss": 1.7207,
"step": 57400
},
{
"epoch": 0.040059783663258575,
"grad_norm": 1.227866768836975,
"learning_rate": 1.9920924605104708e-05,
"loss": 1.6709,
"step": 57600
},
{
"epoch": 0.040198880134311554,
"grad_norm": 0.35972604155540466,
"learning_rate": 1.9920375200457074e-05,
"loss": 1.6679,
"step": 57800
},
{
"epoch": 0.040337976605364534,
"grad_norm": 0.32493817806243896,
"learning_rate": 1.9919823901446976e-05,
"loss": 1.6326,
"step": 58000
},
{
"epoch": 0.040477073076417514,
"grad_norm": 0.4151223599910736,
"learning_rate": 1.9919270708179682e-05,
"loss": 1.709,
"step": 58200
},
{
"epoch": 0.04061616954747049,
"grad_norm": 0.4418564438819885,
"learning_rate": 1.991871562076083e-05,
"loss": 1.6295,
"step": 58400
},
{
"epoch": 0.04075526601852348,
"grad_norm": 0.826331377029419,
"learning_rate": 1.991815863929642e-05,
"loss": 1.6223,
"step": 58600
},
{
"epoch": 0.04089436248957646,
"grad_norm": 0.592551589012146,
"learning_rate": 1.991759976389281e-05,
"loss": 1.6334,
"step": 58800
},
{
"epoch": 0.04103345896062944,
"grad_norm": 0.5953264832496643,
"learning_rate": 1.9917038994656715e-05,
"loss": 1.7214,
"step": 59000
},
{
"epoch": 0.04117255543168242,
"grad_norm": 0.37218430638313293,
"learning_rate": 1.9916476331695228e-05,
"loss": 1.6912,
"step": 59200
},
{
"epoch": 0.041311651902735405,
"grad_norm": 0.5334930419921875,
"learning_rate": 1.9915911775115785e-05,
"loss": 1.6132,
"step": 59400
},
{
"epoch": 0.041450748373788385,
"grad_norm": 0.2905101180076599,
"learning_rate": 1.9915345325026195e-05,
"loss": 1.6618,
"step": 59600
},
{
"epoch": 0.041589844844841364,
"grad_norm": 0.3982454240322113,
"learning_rate": 1.9914776981534633e-05,
"loss": 1.6848,
"step": 59800
},
{
"epoch": 0.041728941315894344,
"grad_norm": 0.4498521387577057,
"learning_rate": 1.9914206744749612e-05,
"loss": 1.6858,
"step": 60000
},
{
"epoch": 0.041868037786947324,
"grad_norm": 0.4296523630619049,
"learning_rate": 1.991363461478004e-05,
"loss": 1.6255,
"step": 60200
},
{
"epoch": 0.04200713425800031,
"grad_norm": 0.35572549700737,
"learning_rate": 1.991306059173515e-05,
"loss": 1.7085,
"step": 60400
},
{
"epoch": 0.04214623072905329,
"grad_norm": 0.45710158348083496,
"learning_rate": 1.9912484675724575e-05,
"loss": 1.6886,
"step": 60600
},
{
"epoch": 0.04228532720010627,
"grad_norm": 0.6244034171104431,
"learning_rate": 1.9911906866858276e-05,
"loss": 1.6668,
"step": 60800
},
{
"epoch": 0.04242442367115925,
"grad_norm": 0.3790908753871918,
"learning_rate": 1.9911327165246598e-05,
"loss": 1.6264,
"step": 61000
},
{
"epoch": 0.04256352014221223,
"grad_norm": 0.37423181533813477,
"learning_rate": 1.991074557100023e-05,
"loss": 1.6972,
"step": 61200
},
{
"epoch": 0.042702616613265215,
"grad_norm": 0.3610687255859375,
"learning_rate": 1.991016208423024e-05,
"loss": 1.7148,
"step": 61400
},
{
"epoch": 0.042841713084318195,
"grad_norm": 0.36563995480537415,
"learning_rate": 1.9909576705048048e-05,
"loss": 1.7165,
"step": 61600
},
{
"epoch": 0.042980809555371174,
"grad_norm": 0.6822441816329956,
"learning_rate": 1.990898943356543e-05,
"loss": 1.6602,
"step": 61800
},
{
"epoch": 0.043119906026424154,
"grad_norm": 0.5291351675987244,
"learning_rate": 1.9908400269894534e-05,
"loss": 1.6277,
"step": 62000
},
{
"epoch": 0.04325900249747714,
"grad_norm": 0.5071974992752075,
"learning_rate": 1.9907809214147863e-05,
"loss": 1.6528,
"step": 62200
},
{
"epoch": 0.04339809896853012,
"grad_norm": 0.3188241422176361,
"learning_rate": 1.990721626643828e-05,
"loss": 1.6864,
"step": 62400
},
{
"epoch": 0.0435371954395831,
"grad_norm": 0.323962926864624,
"learning_rate": 1.9906621426879026e-05,
"loss": 1.7026,
"step": 62600
},
{
"epoch": 0.04367629191063608,
"grad_norm": 0.3724607527256012,
"learning_rate": 1.9906024695583674e-05,
"loss": 1.6503,
"step": 62800
},
{
"epoch": 0.04381538838168906,
"grad_norm": 0.6592810750007629,
"learning_rate": 1.990542607266618e-05,
"loss": 1.6498,
"step": 63000
},
{
"epoch": 0.043954484852742046,
"grad_norm": 0.7135905027389526,
"learning_rate": 1.9904825558240853e-05,
"loss": 1.6419,
"step": 63200
},
{
"epoch": 0.044093581323795025,
"grad_norm": 0.5441883206367493,
"learning_rate": 1.990422315242237e-05,
"loss": 1.6208,
"step": 63400
},
{
"epoch": 0.044232677794848005,
"grad_norm": 0.6606343984603882,
"learning_rate": 1.9903618855325762e-05,
"loss": 1.7041,
"step": 63600
},
{
"epoch": 0.044371774265900985,
"grad_norm": 0.26841533184051514,
"learning_rate": 1.990301266706642e-05,
"loss": 1.6535,
"step": 63800
},
{
"epoch": 0.04451087073695397,
"grad_norm": 0.5965221524238586,
"learning_rate": 1.9902404587760108e-05,
"loss": 1.6738,
"step": 64000
},
{
"epoch": 0.04464996720800695,
"grad_norm": 0.7239671349525452,
"learning_rate": 1.9901794617522933e-05,
"loss": 1.6598,
"step": 64200
},
{
"epoch": 0.04478906367905993,
"grad_norm": 0.505463182926178,
"learning_rate": 1.990118275647138e-05,
"loss": 1.6669,
"step": 64400
},
{
"epoch": 0.08985625780678401,
"grad_norm": 0.28192538022994995,
"learning_rate": 1.9604251136197576e-05,
"loss": 1.6758,
"step": 64600
},
{
"epoch": 0.09013445055541182,
"grad_norm": 0.2823716402053833,
"learning_rate": 1.9601813114871574e-05,
"loss": 1.6962,
"step": 64800
},
{
"epoch": 0.09041264330403964,
"grad_norm": 0.19964300096035004,
"learning_rate": 1.9599367759381548e-05,
"loss": 1.6809,
"step": 65000
},
{
"epoch": 0.09069083605266745,
"grad_norm": 0.3048658072948456,
"learning_rate": 1.9596915071595334e-05,
"loss": 1.6526,
"step": 65200
},
{
"epoch": 0.09096902880129526,
"grad_norm": 0.21376414597034454,
"learning_rate": 1.9594455053386376e-05,
"loss": 1.6969,
"step": 65400
},
{
"epoch": 0.09124722154992308,
"grad_norm": 0.46542856097221375,
"learning_rate": 1.9591987706633712e-05,
"loss": 1.7277,
"step": 65600
},
{
"epoch": 0.09152541429855089,
"grad_norm": 0.35781970620155334,
"learning_rate": 1.9589513033221976e-05,
"loss": 1.674,
"step": 65800
},
{
"epoch": 0.09180360704717871,
"grad_norm": 0.36667361855506897,
"learning_rate": 1.9587031035041403e-05,
"loss": 1.6689,
"step": 66000
},
{
"epoch": 0.09208179979580652,
"grad_norm": 0.28317973017692566,
"learning_rate": 1.9584541713987823e-05,
"loss": 1.6812,
"step": 66200
},
{
"epoch": 0.09235999254443433,
"grad_norm": 0.3756510615348816,
"learning_rate": 1.9582045071962648e-05,
"loss": 1.712,
"step": 66400
},
{
"epoch": 0.09263818529306216,
"grad_norm": 0.29829922318458557,
"learning_rate": 1.9579541110872903e-05,
"loss": 1.7115,
"step": 66600
},
{
"epoch": 0.09291637804168996,
"grad_norm": 0.2587492763996124,
"learning_rate": 1.957702983263118e-05,
"loss": 1.7049,
"step": 66800
},
{
"epoch": 0.09319457079031777,
"grad_norm": 0.220120370388031,
"learning_rate": 1.9574511239155677e-05,
"loss": 1.6979,
"step": 67000
},
{
"epoch": 0.0934727635389456,
"grad_norm": 0.4158194959163666,
"learning_rate": 1.9571985332370176e-05,
"loss": 1.715,
"step": 67200
},
{
"epoch": 0.0937509562875734,
"grad_norm": 0.3217870593070984,
"learning_rate": 1.956945211420404e-05,
"loss": 1.6753,
"step": 67400
},
{
"epoch": 0.09402914903620123,
"grad_norm": 0.3095886707305908,
"learning_rate": 1.956691158659222e-05,
"loss": 1.6879,
"step": 67600
},
{
"epoch": 0.09430734178482904,
"grad_norm": 0.31036627292633057,
"learning_rate": 1.956436375147525e-05,
"loss": 1.6815,
"step": 67800
},
{
"epoch": 0.09458553453345685,
"grad_norm": 0.2732967138290405,
"learning_rate": 1.9561808610799255e-05,
"loss": 1.7006,
"step": 68000
},
{
"epoch": 0.09486372728208467,
"grad_norm": 0.465425044298172,
"learning_rate": 1.9559246166515917e-05,
"loss": 1.6962,
"step": 68200
},
{
"epoch": 0.09514192003071248,
"grad_norm": 0.2841686010360718,
"learning_rate": 1.9556676420582517e-05,
"loss": 1.6894,
"step": 68400
},
{
"epoch": 0.09542011277934029,
"grad_norm": 0.30286672711372375,
"learning_rate": 1.955409937496191e-05,
"loss": 1.6704,
"step": 68600
},
{
"epoch": 0.09569830552796811,
"grad_norm": 0.2298436164855957,
"learning_rate": 1.955151503162252e-05,
"loss": 1.7096,
"step": 68800
},
{
"epoch": 0.09597649827659592,
"grad_norm": 0.24763423204421997,
"learning_rate": 1.9548923392538346e-05,
"loss": 1.7543,
"step": 69000
},
{
"epoch": 0.09625469102522374,
"grad_norm": 0.31084269285202026,
"learning_rate": 1.9546324459688967e-05,
"loss": 1.6615,
"step": 69200
},
{
"epoch": 0.09653288377385155,
"grad_norm": 0.43397852778434753,
"learning_rate": 1.954371823505953e-05,
"loss": 1.7036,
"step": 69400
},
{
"epoch": 0.09681107652247936,
"grad_norm": 0.31707778573036194,
"learning_rate": 1.954110472064075e-05,
"loss": 1.6716,
"step": 69600
},
{
"epoch": 0.09708926927110718,
"grad_norm": 0.7093697190284729,
"learning_rate": 1.953848391842891e-05,
"loss": 1.7113,
"step": 69800
},
{
"epoch": 0.097367462019735,
"grad_norm": 0.2799917161464691,
"learning_rate": 1.9535855830425857e-05,
"loss": 1.704,
"step": 70000
},
{
"epoch": 0.0976456547683628,
"grad_norm": 0.3946918845176697,
"learning_rate": 1.9533220458639013e-05,
"loss": 1.7351,
"step": 70200
},
{
"epoch": 0.09792384751699063,
"grad_norm": 0.2851293981075287,
"learning_rate": 1.953057780508135e-05,
"loss": 1.686,
"step": 70400
},
{
"epoch": 0.09820204026561843,
"grad_norm": 0.32784244418144226,
"learning_rate": 1.9527927871771416e-05,
"loss": 1.7294,
"step": 70600
},
{
"epoch": 0.09848023301424626,
"grad_norm": 0.3254486918449402,
"learning_rate": 1.9525270660733304e-05,
"loss": 1.7089,
"step": 70800
},
{
"epoch": 0.09875842576287407,
"grad_norm": 0.35317400097846985,
"learning_rate": 1.9522606173996683e-05,
"loss": 1.7065,
"step": 71000
},
{
"epoch": 0.09903661851150188,
"grad_norm": 0.43726846575737,
"learning_rate": 1.9519934413596768e-05,
"loss": 1.6938,
"step": 71200
},
{
"epoch": 0.0993148112601297,
"grad_norm": 0.3208189904689789,
"learning_rate": 1.9517255381574337e-05,
"loss": 1.6987,
"step": 71400
},
{
"epoch": 0.09959300400875751,
"grad_norm": 0.5465477108955383,
"learning_rate": 1.9514569079975705e-05,
"loss": 1.6968,
"step": 71600
},
{
"epoch": 0.09987119675738532,
"grad_norm": 0.3042430877685547,
"learning_rate": 1.951187551085277e-05,
"loss": 1.6851,
"step": 71800
},
{
"epoch": 0.10014938950601314,
"grad_norm": 1.3345911502838135,
"learning_rate": 1.950917467626295e-05,
"loss": 1.747,
"step": 72000
},
{
"epoch": 0.10042758225464095,
"grad_norm": 0.37963107228279114,
"learning_rate": 1.9506466578269238e-05,
"loss": 1.6539,
"step": 72200
},
{
"epoch": 0.10070577500326877,
"grad_norm": 0.2997443974018097,
"learning_rate": 1.9503751218940152e-05,
"loss": 1.6752,
"step": 72400
},
{
"epoch": 0.10098396775189658,
"grad_norm": 0.29101336002349854,
"learning_rate": 1.9501028600349775e-05,
"loss": 1.6912,
"step": 72600
},
{
"epoch": 0.10126216050052439,
"grad_norm": 0.32952240109443665,
"learning_rate": 1.949829872457773e-05,
"loss": 1.6751,
"step": 72800
},
{
"epoch": 0.10154035324915221,
"grad_norm": 0.6925126910209656,
"learning_rate": 1.9495561593709174e-05,
"loss": 1.6948,
"step": 73000
},
{
"epoch": 0.10181854599778002,
"grad_norm": 0.3063753545284271,
"learning_rate": 1.949281720983482e-05,
"loss": 1.6583,
"step": 73200
},
{
"epoch": 0.10209673874640783,
"grad_norm": 0.1839706301689148,
"learning_rate": 1.949006557505091e-05,
"loss": 1.7303,
"step": 73400
},
{
"epoch": 0.10237493149503565,
"grad_norm": 0.3848772943019867,
"learning_rate": 1.948730669145923e-05,
"loss": 1.6642,
"step": 73600
},
{
"epoch": 0.10265312424366346,
"grad_norm": 0.3976518213748932,
"learning_rate": 1.94845405611671e-05,
"loss": 1.7244,
"step": 73800
},
{
"epoch": 0.10293131699229129,
"grad_norm": 0.4992425739765167,
"learning_rate": 1.9481767186287377e-05,
"loss": 1.7002,
"step": 74000
},
{
"epoch": 0.1032095097409191,
"grad_norm": 0.36821067333221436,
"learning_rate": 1.947898656893845e-05,
"loss": 1.7047,
"step": 74200
},
{
"epoch": 0.1034877024895469,
"grad_norm": 0.5008091926574707,
"learning_rate": 1.9476198711244245e-05,
"loss": 1.6921,
"step": 74400
},
{
"epoch": 0.10376589523817473,
"grad_norm": 0.4330016076564789,
"learning_rate": 1.947340361533421e-05,
"loss": 1.7168,
"step": 74600
},
{
"epoch": 0.10404408798680254,
"grad_norm": 0.342879980802536,
"learning_rate": 1.9470601283343328e-05,
"loss": 1.7438,
"step": 74800
},
{
"epoch": 0.10432228073543034,
"grad_norm": 0.4086737632751465,
"learning_rate": 1.9467791717412107e-05,
"loss": 1.6795,
"step": 75000
},
{
"epoch": 0.10460047348405817,
"grad_norm": 0.3051975965499878,
"learning_rate": 1.9464974919686585e-05,
"loss": 1.715,
"step": 75200
},
{
"epoch": 0.10487866623268598,
"grad_norm": 0.40126869082450867,
"learning_rate": 1.9462150892318315e-05,
"loss": 1.7332,
"step": 75400
},
{
"epoch": 0.1051568589813138,
"grad_norm": 0.2848926782608032,
"learning_rate": 1.9459319637464376e-05,
"loss": 1.715,
"step": 75600
},
{
"epoch": 0.10543505172994161,
"grad_norm": 0.40455424785614014,
"learning_rate": 1.945648115728737e-05,
"loss": 1.7237,
"step": 75800
},
{
"epoch": 0.10571324447856942,
"grad_norm": 0.40959736704826355,
"learning_rate": 1.9453635453955425e-05,
"loss": 1.7048,
"step": 76000
},
{
"epoch": 0.10599143722719724,
"grad_norm": 0.30568796396255493,
"learning_rate": 1.9450782529642164e-05,
"loss": 1.7276,
"step": 76200
},
{
"epoch": 0.10626962997582505,
"grad_norm": 0.29840388894081116,
"learning_rate": 1.9447922386526752e-05,
"loss": 1.7084,
"step": 76400
},
{
"epoch": 0.10654782272445286,
"grad_norm": 0.2814345359802246,
"learning_rate": 1.9445055026793846e-05,
"loss": 1.6686,
"step": 76600
},
{
"epoch": 0.10682601547308068,
"grad_norm": 0.5545098185539246,
"learning_rate": 1.9442180452633628e-05,
"loss": 1.6811,
"step": 76800
},
{
"epoch": 0.10710420822170849,
"grad_norm": 0.18126177787780762,
"learning_rate": 1.943929866624179e-05,
"loss": 1.6961,
"step": 77000
},
{
"epoch": 0.10738240097033631,
"grad_norm": 0.5672417879104614,
"learning_rate": 1.9436409669819527e-05,
"loss": 1.7022,
"step": 77200
},
{
"epoch": 0.10766059371896412,
"grad_norm": 0.26212358474731445,
"learning_rate": 1.9433513465573545e-05,
"loss": 1.6836,
"step": 77400
},
{
"epoch": 0.10793878646759193,
"grad_norm": 0.24932098388671875,
"learning_rate": 1.9430610055716056e-05,
"loss": 1.7245,
"step": 77600
},
{
"epoch": 0.10821697921621976,
"grad_norm": 0.26785990595817566,
"learning_rate": 1.9427699442464774e-05,
"loss": 1.7242,
"step": 77800
},
{
"epoch": 0.10849517196484756,
"grad_norm": 0.310847669839859,
"learning_rate": 1.9424781628042915e-05,
"loss": 1.736,
"step": 78000
},
{
"epoch": 0.10877336471347537,
"grad_norm": 0.2551518380641937,
"learning_rate": 1.94218566146792e-05,
"loss": 1.706,
"step": 78200
},
{
"epoch": 0.1090515574621032,
"grad_norm": 0.3088129162788391,
"learning_rate": 1.9418924404607842e-05,
"loss": 1.7089,
"step": 78400
},
{
"epoch": 0.109329750210731,
"grad_norm": 0.3498319983482361,
"learning_rate": 1.9415985000068556e-05,
"loss": 1.6846,
"step": 78600
},
{
"epoch": 0.10960794295935881,
"grad_norm": 0.2993554174900055,
"learning_rate": 1.941303840330655e-05,
"loss": 1.7197,
"step": 78800
},
{
"epoch": 0.10988613570798664,
"grad_norm": 0.23916451632976532,
"learning_rate": 1.941008461657253e-05,
"loss": 1.6994,
"step": 79000
},
{
"epoch": 0.11016432845661445,
"grad_norm": 0.5682498216629028,
"learning_rate": 1.9407123642122686e-05,
"loss": 1.6827,
"step": 79200
},
{
"epoch": 0.11044252120524227,
"grad_norm": 0.25092509388923645,
"learning_rate": 1.9404155482218703e-05,
"loss": 1.6798,
"step": 79400
},
{
"epoch": 0.11072071395387008,
"grad_norm": 0.4485166072845459,
"learning_rate": 1.9401180139127755e-05,
"loss": 1.6785,
"step": 79600
},
{
"epoch": 0.11099890670249789,
"grad_norm": 0.33716893196105957,
"learning_rate": 1.9398197615122504e-05,
"loss": 1.6885,
"step": 79800
},
{
"epoch": 0.11127709945112571,
"grad_norm": 0.2410348504781723,
"learning_rate": 1.939520791248109e-05,
"loss": 1.7109,
"step": 80000
},
{
"epoch": 0.11155529219975352,
"grad_norm": 0.37753862142562866,
"learning_rate": 1.939221103348714e-05,
"loss": 1.6944,
"step": 80200
},
{
"epoch": 0.11183348494838133,
"grad_norm": 0.31450897455215454,
"learning_rate": 1.938920698042977e-05,
"loss": 1.6899,
"step": 80400
},
{
"epoch": 0.11211167769700915,
"grad_norm": 0.31832966208457947,
"learning_rate": 1.9386195755603568e-05,
"loss": 1.6958,
"step": 80600
},
{
"epoch": 0.11238987044563696,
"grad_norm": 0.24022148549556732,
"learning_rate": 1.9383177361308597e-05,
"loss": 1.7042,
"step": 80800
},
{
"epoch": 0.11266806319426478,
"grad_norm": 0.3811296820640564,
"learning_rate": 1.9380151799850404e-05,
"loss": 1.699,
"step": 81000
},
{
"epoch": 0.11294625594289259,
"grad_norm": 0.23990687727928162,
"learning_rate": 1.9377119073540007e-05,
"loss": 1.7091,
"step": 81200
},
{
"epoch": 0.1132244486915204,
"grad_norm": 0.265990287065506,
"learning_rate": 1.9374079184693898e-05,
"loss": 1.6931,
"step": 81400
},
{
"epoch": 0.11350264144014822,
"grad_norm": 0.2768077850341797,
"learning_rate": 1.9371032135634033e-05,
"loss": 1.7058,
"step": 81600
},
{
"epoch": 0.11378083418877603,
"grad_norm": 0.6246830821037292,
"learning_rate": 1.936797792868785e-05,
"loss": 1.7078,
"step": 81800
},
{
"epoch": 0.11405902693740384,
"grad_norm": 0.28057006001472473,
"learning_rate": 1.9364916566188242e-05,
"loss": 1.712,
"step": 82000
},
{
"epoch": 0.11433721968603167,
"grad_norm": 0.31619200110435486,
"learning_rate": 1.9361848050473578e-05,
"loss": 1.7087,
"step": 82200
},
{
"epoch": 0.11461541243465947,
"grad_norm": 0.45818835496902466,
"learning_rate": 1.9358772383887683e-05,
"loss": 1.672,
"step": 82400
},
{
"epoch": 0.1148936051832873,
"grad_norm": 0.2376299947500229,
"learning_rate": 1.9355689568779844e-05,
"loss": 1.678,
"step": 82600
},
{
"epoch": 0.11517179793191511,
"grad_norm": 0.28687921166419983,
"learning_rate": 1.935259960750482e-05,
"loss": 1.6904,
"step": 82800
},
{
"epoch": 0.11544999068054292,
"grad_norm": 0.3678719997406006,
"learning_rate": 1.934950250242281e-05,
"loss": 1.6848,
"step": 83000
},
{
"epoch": 0.11572818342917074,
"grad_norm": 0.3978765606880188,
"learning_rate": 1.9346398255899485e-05,
"loss": 1.6557,
"step": 83200
},
{
"epoch": 0.11600637617779855,
"grad_norm": 0.34782731533050537,
"learning_rate": 1.9343286870305964e-05,
"loss": 1.6658,
"step": 83400
},
{
"epoch": 0.11628456892642636,
"grad_norm": 0.384232759475708,
"learning_rate": 1.9340168348018822e-05,
"loss": 1.7144,
"step": 83600
},
{
"epoch": 0.11656276167505418,
"grad_norm": 0.6080924272537231,
"learning_rate": 1.933704269142008e-05,
"loss": 1.6828,
"step": 83800
},
{
"epoch": 0.11684095442368199,
"grad_norm": 0.3429834246635437,
"learning_rate": 1.9333909902897212e-05,
"loss": 1.7374,
"step": 84000
},
{
"epoch": 0.11711914717230981,
"grad_norm": 0.34908148646354675,
"learning_rate": 1.9330769984843144e-05,
"loss": 1.7273,
"step": 84200
},
{
"epoch": 0.11739733992093762,
"grad_norm": 0.47220101952552795,
"learning_rate": 1.932762293965624e-05,
"loss": 1.6758,
"step": 84400
},
{
"epoch": 0.11767553266956543,
"grad_norm": 0.5649632215499878,
"learning_rate": 1.9324468769740307e-05,
"loss": 1.6967,
"step": 84600
},
{
"epoch": 0.11795372541819325,
"grad_norm": 0.3771503269672394,
"learning_rate": 1.932130747750461e-05,
"loss": 1.7156,
"step": 84800
},
{
"epoch": 0.11823191816682106,
"grad_norm": 0.3423559367656708,
"learning_rate": 1.9318139065363826e-05,
"loss": 1.6854,
"step": 85000
},
{
"epoch": 0.11851011091544887,
"grad_norm": 0.4594859182834625,
"learning_rate": 1.93149635357381e-05,
"loss": 1.7195,
"step": 85200
},
{
"epoch": 0.1187883036640767,
"grad_norm": 0.29249799251556396,
"learning_rate": 1.9311780891052998e-05,
"loss": 1.6567,
"step": 85400
},
{
"epoch": 0.1190664964127045,
"grad_norm": 0.23755620419979095,
"learning_rate": 1.930859113373952e-05,
"loss": 1.7054,
"step": 85600
},
{
"epoch": 0.11934468916133233,
"grad_norm": 0.29518914222717285,
"learning_rate": 1.9305394266234104e-05,
"loss": 1.6406,
"step": 85800
},
{
"epoch": 0.11962288190996014,
"grad_norm": 0.5197004675865173,
"learning_rate": 1.9302190290978622e-05,
"loss": 1.6807,
"step": 86000
},
{
"epoch": 0.11990107465858794,
"grad_norm": 0.2740679979324341,
"learning_rate": 1.929897921042036e-05,
"loss": 1.6977,
"step": 86200
},
{
"epoch": 0.12017926740721577,
"grad_norm": 0.33021771907806396,
"learning_rate": 1.9295761027012046e-05,
"loss": 1.6943,
"step": 86400
},
{
"epoch": 0.12045746015584358,
"grad_norm": 0.32778891921043396,
"learning_rate": 1.929253574321183e-05,
"loss": 1.6941,
"step": 86600
},
{
"epoch": 0.12073565290447139,
"grad_norm": 0.3531610369682312,
"learning_rate": 1.9289303361483284e-05,
"loss": 1.7031,
"step": 86800
},
{
"epoch": 0.12101384565309921,
"grad_norm": 0.4716193377971649,
"learning_rate": 1.9286063884295397e-05,
"loss": 1.668,
"step": 87000
},
{
"epoch": 0.12129203840172702,
"grad_norm": 0.35484832525253296,
"learning_rate": 1.928281731412259e-05,
"loss": 1.7128,
"step": 87200
}
],
"logging_steps": 200,
"max_steps": 718926,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7288134689589985e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}