t5-big-scratch-custom-iwslt2017 / trainer_state.json
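Trainer state written at the end of training (100 epochs, 364,600 optimizer steps, with a log entry every 500 steps). As a quick reference, the sketch below shows one way the logged loss curve could be read back from this file; it is only illustrative, and it assumes the JSON that follows is saved locally under the name trainer_state.json (the filename the Hugging Face Trainer normally uses) and that matplotlib is available.

    # Minimal sketch (not part of the checkpoint itself): load the trainer
    # state and plot the training loss against the global step.
    import json

    import matplotlib.pyplot as plt

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Each log_history entry records the step, epoch, loss, grad_norm and
    # learning_rate logged every 500 optimizer steps during this run.
    steps = [e["step"] for e in state["log_history"] if "loss" in e]
    losses = [e["loss"] for e in state["log_history"] if "loss" in e]

    plt.plot(steps, losses)
    plt.xlabel("global step")
    plt.ylabel("training loss")
    plt.title("t5-big-scratch-custom-iwslt2017")
    plt.show()
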
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 100.0,
"eval_steps": 500,
"global_step": 364600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13713658804168952,
"grad_norm": 2.6796674728393555,
"learning_rate": 4.993143170597916e-05,
"loss": 6.6471,
"step": 500
},
{
"epoch": 0.27427317608337903,
"grad_norm": 2.4919307231903076,
"learning_rate": 4.986286341195831e-05,
"loss": 5.8655,
"step": 1000
},
{
"epoch": 0.4114097641250686,
"grad_norm": 2.4707608222961426,
"learning_rate": 4.9794295117937464e-05,
"loss": 5.5305,
"step": 1500
},
{
"epoch": 0.5485463521667581,
"grad_norm": 2.745155096054077,
"learning_rate": 4.972572682391663e-05,
"loss": 5.3124,
"step": 2000
},
{
"epoch": 0.6856829402084477,
"grad_norm": 2.7914953231811523,
"learning_rate": 4.965715852989578e-05,
"loss": 5.1281,
"step": 2500
},
{
"epoch": 0.8228195282501372,
"grad_norm": 2.9527249336242676,
"learning_rate": 4.958859023587493e-05,
"loss": 4.9858,
"step": 3000
},
{
"epoch": 0.9599561162918266,
"grad_norm": 2.5611634254455566,
"learning_rate": 4.952002194185409e-05,
"loss": 4.8776,
"step": 3500
},
{
"epoch": 1.0970927043335161,
"grad_norm": 3.1537859439849854,
"learning_rate": 4.9451453647833245e-05,
"loss": 4.7393,
"step": 4000
},
{
"epoch": 1.2342292923752056,
"grad_norm": 2.9716005325317383,
"learning_rate": 4.9382885353812394e-05,
"loss": 4.67,
"step": 4500
},
{
"epoch": 1.3713658804168953,
"grad_norm": 3.2810921669006348,
"learning_rate": 4.931431705979155e-05,
"loss": 4.5978,
"step": 5000
},
{
"epoch": 1.5085024684585848,
"grad_norm": 3.3998653888702393,
"learning_rate": 4.9245748765770713e-05,
"loss": 4.5642,
"step": 5500
},
{
"epoch": 1.6456390565002743,
"grad_norm": 3.440001964569092,
"learning_rate": 4.917718047174987e-05,
"loss": 4.5106,
"step": 6000
},
{
"epoch": 1.7827756445419638,
"grad_norm": 3.469703435897827,
"learning_rate": 4.910861217772902e-05,
"loss": 4.4556,
"step": 6500
},
{
"epoch": 1.9199122325836533,
"grad_norm": 3.6554129123687744,
"learning_rate": 4.9040043883708175e-05,
"loss": 4.418,
"step": 7000
},
{
"epoch": 2.0570488206253428,
"grad_norm": 3.7117955684661865,
"learning_rate": 4.897147558968733e-05,
"loss": 4.3516,
"step": 7500
},
{
"epoch": 2.1941854086670323,
"grad_norm": 3.9816272258758545,
"learning_rate": 4.890290729566648e-05,
"loss": 4.2883,
"step": 8000
},
{
"epoch": 2.3313219967087218,
"grad_norm": 3.5806946754455566,
"learning_rate": 4.8834339001645644e-05,
"loss": 4.2377,
"step": 8500
},
{
"epoch": 2.4684585847504112,
"grad_norm": 3.8521053791046143,
"learning_rate": 4.87657707076248e-05,
"loss": 4.2398,
"step": 9000
},
{
"epoch": 2.6055951727921007,
"grad_norm": 3.8129312992095947,
"learning_rate": 4.8697202413603956e-05,
"loss": 4.2187,
"step": 9500
},
{
"epoch": 2.7427317608337907,
"grad_norm": 3.7518739700317383,
"learning_rate": 4.8628634119583105e-05,
"loss": 4.1825,
"step": 10000
},
{
"epoch": 2.8798683488754797,
"grad_norm": 4.074005603790283,
"learning_rate": 4.856006582556226e-05,
"loss": 4.1529,
"step": 10500
},
{
"epoch": 3.0170049369171696,
"grad_norm": 3.723891258239746,
"learning_rate": 4.849149753154142e-05,
"loss": 4.1431,
"step": 11000
},
{
"epoch": 3.154141524958859,
"grad_norm": 4.242587089538574,
"learning_rate": 4.842292923752057e-05,
"loss": 4.027,
"step": 11500
},
{
"epoch": 3.2912781130005486,
"grad_norm": 4.178415775299072,
"learning_rate": 4.835436094349973e-05,
"loss": 4.0418,
"step": 12000
},
{
"epoch": 3.428414701042238,
"grad_norm": 4.147921085357666,
"learning_rate": 4.8285792649478886e-05,
"loss": 4.0158,
"step": 12500
},
{
"epoch": 3.5655512890839276,
"grad_norm": 4.463027000427246,
"learning_rate": 4.821722435545804e-05,
"loss": 4.014,
"step": 13000
},
{
"epoch": 3.702687877125617,
"grad_norm": 3.6492116451263428,
"learning_rate": 4.814865606143719e-05,
"loss": 3.9874,
"step": 13500
},
{
"epoch": 3.8398244651673066,
"grad_norm": 4.560110092163086,
"learning_rate": 4.808008776741635e-05,
"loss": 3.99,
"step": 14000
},
{
"epoch": 3.976961053208996,
"grad_norm": 3.869370222091675,
"learning_rate": 4.8011519473395504e-05,
"loss": 3.9618,
"step": 14500
},
{
"epoch": 4.1140976412506856,
"grad_norm": 4.145757675170898,
"learning_rate": 4.794295117937466e-05,
"loss": 3.8752,
"step": 15000
},
{
"epoch": 4.2512342292923755,
"grad_norm": 3.9991416931152344,
"learning_rate": 4.7874382885353817e-05,
"loss": 3.8646,
"step": 15500
},
{
"epoch": 4.3883708173340645,
"grad_norm": 4.006019115447998,
"learning_rate": 4.780581459133297e-05,
"loss": 3.8571,
"step": 16000
},
{
"epoch": 4.5255074053757545,
"grad_norm": 3.8029978275299072,
"learning_rate": 4.773724629731213e-05,
"loss": 3.8605,
"step": 16500
},
{
"epoch": 4.6626439934174435,
"grad_norm": 4.239439010620117,
"learning_rate": 4.766867800329128e-05,
"loss": 3.8272,
"step": 17000
},
{
"epoch": 4.799780581459133,
"grad_norm": 4.278761863708496,
"learning_rate": 4.7600109709270434e-05,
"loss": 3.8232,
"step": 17500
},
{
"epoch": 4.9369171695008225,
"grad_norm": 4.346251964569092,
"learning_rate": 4.753154141524959e-05,
"loss": 3.8141,
"step": 18000
},
{
"epoch": 5.074053757542512,
"grad_norm": 4.909966468811035,
"learning_rate": 4.746297312122875e-05,
"loss": 3.7563,
"step": 18500
},
{
"epoch": 5.2111903455842015,
"grad_norm": 4.102847099304199,
"learning_rate": 4.73944048272079e-05,
"loss": 3.7147,
"step": 19000
},
{
"epoch": 5.348326933625891,
"grad_norm": 4.887523174285889,
"learning_rate": 4.732583653318706e-05,
"loss": 3.6985,
"step": 19500
},
{
"epoch": 5.485463521667581,
"grad_norm": 4.481743812561035,
"learning_rate": 4.7257268239166215e-05,
"loss": 3.7095,
"step": 20000
},
{
"epoch": 5.62260010970927,
"grad_norm": 4.673679828643799,
"learning_rate": 4.7188699945145365e-05,
"loss": 3.7015,
"step": 20500
},
{
"epoch": 5.75973669775096,
"grad_norm": 4.764498233795166,
"learning_rate": 4.712013165112452e-05,
"loss": 3.6906,
"step": 21000
},
{
"epoch": 5.896873285792649,
"grad_norm": 4.535381317138672,
"learning_rate": 4.705156335710368e-05,
"loss": 3.6911,
"step": 21500
},
{
"epoch": 6.034009873834339,
"grad_norm": 4.882272720336914,
"learning_rate": 4.698299506308283e-05,
"loss": 3.6483,
"step": 22000
},
{
"epoch": 6.171146461876028,
"grad_norm": 4.5370354652404785,
"learning_rate": 4.691442676906199e-05,
"loss": 3.5739,
"step": 22500
},
{
"epoch": 6.308283049917718,
"grad_norm": 4.514719486236572,
"learning_rate": 4.6845858475041146e-05,
"loss": 3.5716,
"step": 23000
},
{
"epoch": 6.445419637959407,
"grad_norm": 5.22483491897583,
"learning_rate": 4.67772901810203e-05,
"loss": 3.5767,
"step": 23500
},
{
"epoch": 6.582556226001097,
"grad_norm": 4.764497756958008,
"learning_rate": 4.670872188699945e-05,
"loss": 3.5682,
"step": 24000
},
{
"epoch": 6.719692814042786,
"grad_norm": 4.6897406578063965,
"learning_rate": 4.664015359297861e-05,
"loss": 3.5883,
"step": 24500
},
{
"epoch": 6.856829402084476,
"grad_norm": 4.739509105682373,
"learning_rate": 4.6571585298957763e-05,
"loss": 3.5627,
"step": 25000
},
{
"epoch": 6.993965990126165,
"grad_norm": 4.331806182861328,
"learning_rate": 4.650301700493692e-05,
"loss": 3.5664,
"step": 25500
},
{
"epoch": 7.131102578167855,
"grad_norm": 4.402791500091553,
"learning_rate": 4.6434448710916076e-05,
"loss": 3.4629,
"step": 26000
},
{
"epoch": 7.268239166209545,
"grad_norm": 4.822177410125732,
"learning_rate": 4.636588041689523e-05,
"loss": 3.4411,
"step": 26500
},
{
"epoch": 7.405375754251234,
"grad_norm": 4.601207733154297,
"learning_rate": 4.629731212287439e-05,
"loss": 3.4691,
"step": 27000
},
{
"epoch": 7.542512342292924,
"grad_norm": 5.0039215087890625,
"learning_rate": 4.622874382885354e-05,
"loss": 3.4672,
"step": 27500
},
{
"epoch": 7.679648930334613,
"grad_norm": 4.404879093170166,
"learning_rate": 4.6160175534832694e-05,
"loss": 3.465,
"step": 28000
},
{
"epoch": 7.816785518376303,
"grad_norm": 4.750667095184326,
"learning_rate": 4.609160724081185e-05,
"loss": 3.4425,
"step": 28500
},
{
"epoch": 7.953922106417992,
"grad_norm": 5.396721363067627,
"learning_rate": 4.6023038946791006e-05,
"loss": 3.4686,
"step": 29000
},
{
"epoch": 8.091058694459681,
"grad_norm": 4.806807518005371,
"learning_rate": 4.595447065277016e-05,
"loss": 3.3741,
"step": 29500
},
{
"epoch": 8.228195282501371,
"grad_norm": 4.791159629821777,
"learning_rate": 4.588590235874932e-05,
"loss": 3.336,
"step": 30000
},
{
"epoch": 8.365331870543061,
"grad_norm": 5.24031925201416,
"learning_rate": 4.5817334064728475e-05,
"loss": 3.3489,
"step": 30500
},
{
"epoch": 8.502468458584751,
"grad_norm": 4.839347839355469,
"learning_rate": 4.5748765770707624e-05,
"loss": 3.3387,
"step": 31000
},
{
"epoch": 8.63960504662644,
"grad_norm": 5.201210021972656,
"learning_rate": 4.568019747668678e-05,
"loss": 3.3739,
"step": 31500
},
{
"epoch": 8.776741634668129,
"grad_norm": 4.874946117401123,
"learning_rate": 4.5611629182665936e-05,
"loss": 3.3543,
"step": 32000
},
{
"epoch": 8.913878222709819,
"grad_norm": 4.564042091369629,
"learning_rate": 4.554306088864509e-05,
"loss": 3.3685,
"step": 32500
},
{
"epoch": 9.051014810751509,
"grad_norm": 5.104782581329346,
"learning_rate": 4.547449259462425e-05,
"loss": 3.3079,
"step": 33000
},
{
"epoch": 9.188151398793199,
"grad_norm": 5.117952823638916,
"learning_rate": 4.5405924300603405e-05,
"loss": 3.2292,
"step": 33500
},
{
"epoch": 9.325287986834887,
"grad_norm": 5.0324387550354,
"learning_rate": 4.533735600658256e-05,
"loss": 3.2464,
"step": 34000
},
{
"epoch": 9.462424574876577,
"grad_norm": 5.019642353057861,
"learning_rate": 4.526878771256171e-05,
"loss": 3.2425,
"step": 34500
},
{
"epoch": 9.599561162918267,
"grad_norm": 4.830804824829102,
"learning_rate": 4.5200219418540867e-05,
"loss": 3.257,
"step": 35000
},
{
"epoch": 9.736697750959957,
"grad_norm": 4.778350830078125,
"learning_rate": 4.513165112452002e-05,
"loss": 3.256,
"step": 35500
},
{
"epoch": 9.873834339001645,
"grad_norm": 5.261332988739014,
"learning_rate": 4.506308283049918e-05,
"loss": 3.2568,
"step": 36000
},
{
"epoch": 10.010970927043335,
"grad_norm": 5.060239315032959,
"learning_rate": 4.4994514536478335e-05,
"loss": 3.2595,
"step": 36500
},
{
"epoch": 10.148107515085025,
"grad_norm": 4.848392009735107,
"learning_rate": 4.492594624245749e-05,
"loss": 3.1429,
"step": 37000
},
{
"epoch": 10.285244103126715,
"grad_norm": 5.145500183105469,
"learning_rate": 4.485737794843665e-05,
"loss": 3.1512,
"step": 37500
},
{
"epoch": 10.422380691168403,
"grad_norm": 4.9423041343688965,
"learning_rate": 4.47888096544158e-05,
"loss": 3.1507,
"step": 38000
},
{
"epoch": 10.559517279210093,
"grad_norm": 4.46920108795166,
"learning_rate": 4.472024136039495e-05,
"loss": 3.1502,
"step": 38500
},
{
"epoch": 10.696653867251783,
"grad_norm": 4.90908670425415,
"learning_rate": 4.465167306637411e-05,
"loss": 3.1577,
"step": 39000
},
{
"epoch": 10.833790455293473,
"grad_norm": 4.850174903869629,
"learning_rate": 4.4583104772353265e-05,
"loss": 3.1667,
"step": 39500
},
{
"epoch": 10.970927043335163,
"grad_norm": 4.480921268463135,
"learning_rate": 4.451453647833242e-05,
"loss": 3.1676,
"step": 40000
},
{
"epoch": 11.10806363137685,
"grad_norm": 4.8438801765441895,
"learning_rate": 4.444596818431158e-05,
"loss": 3.0481,
"step": 40500
},
{
"epoch": 11.24520021941854,
"grad_norm": 5.078440189361572,
"learning_rate": 4.4377399890290734e-05,
"loss": 3.0403,
"step": 41000
},
{
"epoch": 11.38233680746023,
"grad_norm": 4.893128395080566,
"learning_rate": 4.430883159626989e-05,
"loss": 3.0463,
"step": 41500
},
{
"epoch": 11.51947339550192,
"grad_norm": 5.149147033691406,
"learning_rate": 4.424026330224904e-05,
"loss": 3.0649,
"step": 42000
},
{
"epoch": 11.656609983543609,
"grad_norm": 4.947761058807373,
"learning_rate": 4.4171695008228196e-05,
"loss": 3.0579,
"step": 42500
},
{
"epoch": 11.793746571585299,
"grad_norm": 5.356738567352295,
"learning_rate": 4.410312671420735e-05,
"loss": 3.0662,
"step": 43000
},
{
"epoch": 11.930883159626989,
"grad_norm": 5.635279655456543,
"learning_rate": 4.403455842018651e-05,
"loss": 3.0744,
"step": 43500
},
{
"epoch": 12.068019747668679,
"grad_norm": 5.142524242401123,
"learning_rate": 4.3965990126165664e-05,
"loss": 3.0006,
"step": 44000
},
{
"epoch": 12.205156335710367,
"grad_norm": 4.920190334320068,
"learning_rate": 4.389742183214482e-05,
"loss": 2.9331,
"step": 44500
},
{
"epoch": 12.342292923752057,
"grad_norm": 5.261963367462158,
"learning_rate": 4.3828853538123976e-05,
"loss": 2.9692,
"step": 45000
},
{
"epoch": 12.479429511793747,
"grad_norm": 5.450014114379883,
"learning_rate": 4.3760285244103126e-05,
"loss": 2.9458,
"step": 45500
},
{
"epoch": 12.616566099835437,
"grad_norm": 5.4277520179748535,
"learning_rate": 4.369171695008228e-05,
"loss": 2.9547,
"step": 46000
},
{
"epoch": 12.753702687877126,
"grad_norm": 5.046356201171875,
"learning_rate": 4.362314865606144e-05,
"loss": 2.953,
"step": 46500
},
{
"epoch": 12.890839275918815,
"grad_norm": 4.98581075668335,
"learning_rate": 4.3554580362040594e-05,
"loss": 2.9482,
"step": 47000
},
{
"epoch": 13.027975863960505,
"grad_norm": 5.0181450843811035,
"learning_rate": 4.348601206801975e-05,
"loss": 2.9263,
"step": 47500
},
{
"epoch": 13.165112452002194,
"grad_norm": 5.356304168701172,
"learning_rate": 4.341744377399891e-05,
"loss": 2.8199,
"step": 48000
},
{
"epoch": 13.302249040043884,
"grad_norm": 5.0527825355529785,
"learning_rate": 4.334887547997806e-05,
"loss": 2.8353,
"step": 48500
},
{
"epoch": 13.439385628085573,
"grad_norm": 5.287441253662109,
"learning_rate": 4.328030718595721e-05,
"loss": 2.8235,
"step": 49000
},
{
"epoch": 13.576522216127263,
"grad_norm": 5.292849540710449,
"learning_rate": 4.321173889193637e-05,
"loss": 2.8426,
"step": 49500
},
{
"epoch": 13.713658804168952,
"grad_norm": 5.380087852478027,
"learning_rate": 4.3143170597915525e-05,
"loss": 2.8293,
"step": 50000
},
{
"epoch": 13.850795392210642,
"grad_norm": 5.534645080566406,
"learning_rate": 4.307460230389468e-05,
"loss": 2.8535,
"step": 50500
},
{
"epoch": 13.98793198025233,
"grad_norm": 5.294557571411133,
"learning_rate": 4.300603400987384e-05,
"loss": 2.8432,
"step": 51000
},
{
"epoch": 14.12506856829402,
"grad_norm": 5.039003849029541,
"learning_rate": 4.293746571585299e-05,
"loss": 2.7162,
"step": 51500
},
{
"epoch": 14.26220515633571,
"grad_norm": 5.455623149871826,
"learning_rate": 4.286889742183215e-05,
"loss": 2.712,
"step": 52000
},
{
"epoch": 14.3993417443774,
"grad_norm": 5.256813049316406,
"learning_rate": 4.28003291278113e-05,
"loss": 2.7246,
"step": 52500
},
{
"epoch": 14.53647833241909,
"grad_norm": 5.521039962768555,
"learning_rate": 4.2731760833790455e-05,
"loss": 2.7471,
"step": 53000
},
{
"epoch": 14.673614920460778,
"grad_norm": 5.75991153717041,
"learning_rate": 4.266319253976961e-05,
"loss": 2.7177,
"step": 53500
},
{
"epoch": 14.810751508502468,
"grad_norm": 4.9295759201049805,
"learning_rate": 4.259462424574877e-05,
"loss": 2.7111,
"step": 54000
},
{
"epoch": 14.947888096544158,
"grad_norm": 4.961513042449951,
"learning_rate": 4.252605595172792e-05,
"loss": 2.7263,
"step": 54500
},
{
"epoch": 15.085024684585848,
"grad_norm": 4.933211803436279,
"learning_rate": 4.245748765770708e-05,
"loss": 2.6599,
"step": 55000
},
{
"epoch": 15.222161272627536,
"grad_norm": 5.510207176208496,
"learning_rate": 4.2388919363686236e-05,
"loss": 2.6078,
"step": 55500
},
{
"epoch": 15.359297860669226,
"grad_norm": 5.186633110046387,
"learning_rate": 4.2320351069665385e-05,
"loss": 2.6238,
"step": 56000
},
{
"epoch": 15.496434448710916,
"grad_norm": 5.6987690925598145,
"learning_rate": 4.225178277564454e-05,
"loss": 2.6189,
"step": 56500
},
{
"epoch": 15.633571036752606,
"grad_norm": 5.060766696929932,
"learning_rate": 4.21832144816237e-05,
"loss": 2.6261,
"step": 57000
},
{
"epoch": 15.770707624794294,
"grad_norm": 5.581600666046143,
"learning_rate": 4.2114646187602854e-05,
"loss": 2.6096,
"step": 57500
},
{
"epoch": 15.907844212835984,
"grad_norm": 5.272013187408447,
"learning_rate": 4.204607789358201e-05,
"loss": 2.6243,
"step": 58000
},
{
"epoch": 16.044980800877674,
"grad_norm": 5.0031538009643555,
"learning_rate": 4.1977509599561166e-05,
"loss": 2.5654,
"step": 58500
},
{
"epoch": 16.182117388919362,
"grad_norm": 5.4185872077941895,
"learning_rate": 4.190894130554032e-05,
"loss": 2.4769,
"step": 59000
},
{
"epoch": 16.319253976961054,
"grad_norm": 5.633464336395264,
"learning_rate": 4.184037301151947e-05,
"loss": 2.4867,
"step": 59500
},
{
"epoch": 16.456390565002742,
"grad_norm": 5.207147598266602,
"learning_rate": 4.177180471749863e-05,
"loss": 2.5209,
"step": 60000
},
{
"epoch": 16.593527153044434,
"grad_norm": 5.337882995605469,
"learning_rate": 4.170323642347779e-05,
"loss": 2.5095,
"step": 60500
},
{
"epoch": 16.730663741086122,
"grad_norm": 5.710779666900635,
"learning_rate": 4.163466812945694e-05,
"loss": 2.5256,
"step": 61000
},
{
"epoch": 16.86780032912781,
"grad_norm": 4.833573818206787,
"learning_rate": 4.1566099835436096e-05,
"loss": 2.5221,
"step": 61500
},
{
"epoch": 17.004936917169502,
"grad_norm": 4.590396404266357,
"learning_rate": 4.149753154141525e-05,
"loss": 2.5274,
"step": 62000
},
{
"epoch": 17.14207350521119,
"grad_norm": 5.467580318450928,
"learning_rate": 4.142896324739441e-05,
"loss": 2.3651,
"step": 62500
},
{
"epoch": 17.27921009325288,
"grad_norm": 5.374948024749756,
"learning_rate": 4.136039495337356e-05,
"loss": 2.3904,
"step": 63000
},
{
"epoch": 17.41634668129457,
"grad_norm": 5.345193386077881,
"learning_rate": 4.1291826659352714e-05,
"loss": 2.4162,
"step": 63500
},
{
"epoch": 17.553483269336258,
"grad_norm": 5.317601680755615,
"learning_rate": 4.122325836533188e-05,
"loss": 2.4228,
"step": 64000
},
{
"epoch": 17.69061985737795,
"grad_norm": 5.649726390838623,
"learning_rate": 4.1154690071311026e-05,
"loss": 2.398,
"step": 64500
},
{
"epoch": 17.827756445419638,
"grad_norm": 4.870903015136719,
"learning_rate": 4.108612177729018e-05,
"loss": 2.4126,
"step": 65000
},
{
"epoch": 17.964893033461326,
"grad_norm": 5.537862300872803,
"learning_rate": 4.101755348326934e-05,
"loss": 2.4315,
"step": 65500
},
{
"epoch": 18.102029621503018,
"grad_norm": 5.414814472198486,
"learning_rate": 4.0948985189248495e-05,
"loss": 2.3059,
"step": 66000
},
{
"epoch": 18.239166209544706,
"grad_norm": 5.167638301849365,
"learning_rate": 4.0880416895227644e-05,
"loss": 2.2847,
"step": 66500
},
{
"epoch": 18.376302797586398,
"grad_norm": 5.151243209838867,
"learning_rate": 4.08118486012068e-05,
"loss": 2.2914,
"step": 67000
},
{
"epoch": 18.513439385628086,
"grad_norm": 5.785707473754883,
"learning_rate": 4.0743280307185963e-05,
"loss": 2.3047,
"step": 67500
},
{
"epoch": 18.650575973669774,
"grad_norm": 4.904608249664307,
"learning_rate": 4.067471201316512e-05,
"loss": 2.3021,
"step": 68000
},
{
"epoch": 18.787712561711466,
"grad_norm": 5.454782009124756,
"learning_rate": 4.060614371914427e-05,
"loss": 2.3305,
"step": 68500
},
{
"epoch": 18.924849149753154,
"grad_norm": 5.2010650634765625,
"learning_rate": 4.0537575425123425e-05,
"loss": 2.3182,
"step": 69000
},
{
"epoch": 19.061985737794842,
"grad_norm": 5.094666481018066,
"learning_rate": 4.046900713110258e-05,
"loss": 2.2601,
"step": 69500
},
{
"epoch": 19.199122325836534,
"grad_norm": 5.217191696166992,
"learning_rate": 4.040043883708173e-05,
"loss": 2.1853,
"step": 70000
},
{
"epoch": 19.336258913878222,
"grad_norm": 5.011998653411865,
"learning_rate": 4.033187054306089e-05,
"loss": 2.1981,
"step": 70500
},
{
"epoch": 19.473395501919914,
"grad_norm": 5.134762287139893,
"learning_rate": 4.026330224904005e-05,
"loss": 2.2176,
"step": 71000
},
{
"epoch": 19.610532089961602,
"grad_norm": 5.362982273101807,
"learning_rate": 4.0194733955019206e-05,
"loss": 2.2141,
"step": 71500
},
{
"epoch": 19.74766867800329,
"grad_norm": 5.136562347412109,
"learning_rate": 4.0126165660998355e-05,
"loss": 2.2004,
"step": 72000
},
{
"epoch": 19.88480526604498,
"grad_norm": 5.2206220626831055,
"learning_rate": 4.005759736697751e-05,
"loss": 2.2177,
"step": 72500
},
{
"epoch": 20.02194185408667,
"grad_norm": 5.294692516326904,
"learning_rate": 3.998902907295667e-05,
"loss": 2.1987,
"step": 73000
},
{
"epoch": 20.15907844212836,
"grad_norm": 5.572756767272949,
"learning_rate": 3.992046077893582e-05,
"loss": 2.0699,
"step": 73500
},
{
"epoch": 20.29621503017005,
"grad_norm": 5.801488876342773,
"learning_rate": 3.985189248491497e-05,
"loss": 2.0924,
"step": 74000
},
{
"epoch": 20.433351618211738,
"grad_norm": 5.149176120758057,
"learning_rate": 3.9783324190894136e-05,
"loss": 2.114,
"step": 74500
},
{
"epoch": 20.57048820625343,
"grad_norm": 5.202007293701172,
"learning_rate": 3.971475589687329e-05,
"loss": 2.1022,
"step": 75000
},
{
"epoch": 20.707624794295118,
"grad_norm": 4.496254920959473,
"learning_rate": 3.964618760285244e-05,
"loss": 2.1327,
"step": 75500
},
{
"epoch": 20.844761382336806,
"grad_norm": 5.123493194580078,
"learning_rate": 3.95776193088316e-05,
"loss": 2.1265,
"step": 76000
},
{
"epoch": 20.981897970378498,
"grad_norm": 5.082859516143799,
"learning_rate": 3.9509051014810754e-05,
"loss": 2.1341,
"step": 76500
},
{
"epoch": 21.119034558420186,
"grad_norm": 4.651580810546875,
"learning_rate": 3.9440482720789904e-05,
"loss": 2.0016,
"step": 77000
},
{
"epoch": 21.256171146461877,
"grad_norm": 5.409528732299805,
"learning_rate": 3.9371914426769066e-05,
"loss": 2.0007,
"step": 77500
},
{
"epoch": 21.393307734503566,
"grad_norm": 5.502586841583252,
"learning_rate": 3.930334613274822e-05,
"loss": 2.0057,
"step": 78000
},
{
"epoch": 21.530444322545254,
"grad_norm": 5.030213356018066,
"learning_rate": 3.923477783872738e-05,
"loss": 2.0159,
"step": 78500
},
{
"epoch": 21.667580910586945,
"grad_norm": 4.999740123748779,
"learning_rate": 3.916620954470653e-05,
"loss": 2.0237,
"step": 79000
},
{
"epoch": 21.804717498628634,
"grad_norm": 5.182149887084961,
"learning_rate": 3.9097641250685684e-05,
"loss": 2.0275,
"step": 79500
},
{
"epoch": 21.941854086670325,
"grad_norm": 5.282116889953613,
"learning_rate": 3.902907295666484e-05,
"loss": 2.0473,
"step": 80000
},
{
"epoch": 22.078990674712013,
"grad_norm": 4.748703956604004,
"learning_rate": 3.896050466264399e-05,
"loss": 1.9621,
"step": 80500
},
{
"epoch": 22.2161272627537,
"grad_norm": 4.832570552825928,
"learning_rate": 3.889193636862315e-05,
"loss": 1.9008,
"step": 81000
},
{
"epoch": 22.353263850795393,
"grad_norm": 5.199923992156982,
"learning_rate": 3.882336807460231e-05,
"loss": 1.9096,
"step": 81500
},
{
"epoch": 22.49040043883708,
"grad_norm": 5.1267499923706055,
"learning_rate": 3.8754799780581465e-05,
"loss": 1.9247,
"step": 82000
},
{
"epoch": 22.62753702687877,
"grad_norm": 4.7476606369018555,
"learning_rate": 3.8686231486560615e-05,
"loss": 1.9216,
"step": 82500
},
{
"epoch": 22.76467361492046,
"grad_norm": 5.416210174560547,
"learning_rate": 3.861766319253977e-05,
"loss": 1.9493,
"step": 83000
},
{
"epoch": 22.90181020296215,
"grad_norm": 5.211349010467529,
"learning_rate": 3.854909489851893e-05,
"loss": 1.9391,
"step": 83500
},
{
"epoch": 23.03894679100384,
"grad_norm": 5.296257495880127,
"learning_rate": 3.8480526604498076e-05,
"loss": 1.9047,
"step": 84000
},
{
"epoch": 23.17608337904553,
"grad_norm": 5.259824752807617,
"learning_rate": 3.841195831047724e-05,
"loss": 1.803,
"step": 84500
},
{
"epoch": 23.313219967087218,
"grad_norm": 4.756730079650879,
"learning_rate": 3.8343390016456395e-05,
"loss": 1.8172,
"step": 85000
},
{
"epoch": 23.45035655512891,
"grad_norm": 5.009732723236084,
"learning_rate": 3.827482172243555e-05,
"loss": 1.8314,
"step": 85500
},
{
"epoch": 23.587493143170597,
"grad_norm": 5.3414082527160645,
"learning_rate": 3.82062534284147e-05,
"loss": 1.8489,
"step": 86000
},
{
"epoch": 23.72462973121229,
"grad_norm": 4.76619815826416,
"learning_rate": 3.813768513439386e-05,
"loss": 1.8542,
"step": 86500
},
{
"epoch": 23.861766319253977,
"grad_norm": 5.249925136566162,
"learning_rate": 3.806911684037301e-05,
"loss": 1.8639,
"step": 87000
},
{
"epoch": 23.998902907295665,
"grad_norm": 4.97225284576416,
"learning_rate": 3.800054854635216e-05,
"loss": 1.8802,
"step": 87500
},
{
"epoch": 24.136039495337357,
"grad_norm": 5.291701793670654,
"learning_rate": 3.7931980252331326e-05,
"loss": 1.7144,
"step": 88000
},
{
"epoch": 24.273176083379045,
"grad_norm": 5.1743340492248535,
"learning_rate": 3.786341195831048e-05,
"loss": 1.7312,
"step": 88500
},
{
"epoch": 24.410312671420733,
"grad_norm": 5.3917646408081055,
"learning_rate": 3.779484366428964e-05,
"loss": 1.7488,
"step": 89000
},
{
"epoch": 24.547449259462425,
"grad_norm": 4.806937217712402,
"learning_rate": 3.772627537026879e-05,
"loss": 1.7637,
"step": 89500
},
{
"epoch": 24.684585847504113,
"grad_norm": 5.0730156898498535,
"learning_rate": 3.7657707076247944e-05,
"loss": 1.7668,
"step": 90000
},
{
"epoch": 24.821722435545805,
"grad_norm": 4.786214828491211,
"learning_rate": 3.75891387822271e-05,
"loss": 1.7769,
"step": 90500
},
{
"epoch": 24.958859023587493,
"grad_norm": 5.39318323135376,
"learning_rate": 3.752057048820625e-05,
"loss": 1.7903,
"step": 91000
},
{
"epoch": 25.09599561162918,
"grad_norm": 4.981703281402588,
"learning_rate": 3.745200219418541e-05,
"loss": 1.6718,
"step": 91500
},
{
"epoch": 25.233132199670873,
"grad_norm": 4.901900291442871,
"learning_rate": 3.738343390016457e-05,
"loss": 1.6542,
"step": 92000
},
{
"epoch": 25.37026878771256,
"grad_norm": 5.158128261566162,
"learning_rate": 3.7314865606143724e-05,
"loss": 1.6573,
"step": 92500
},
{
"epoch": 25.507405375754253,
"grad_norm": 4.649386882781982,
"learning_rate": 3.7246297312122874e-05,
"loss": 1.6773,
"step": 93000
},
{
"epoch": 25.64454196379594,
"grad_norm": 4.9402666091918945,
"learning_rate": 3.717772901810203e-05,
"loss": 1.687,
"step": 93500
},
{
"epoch": 25.78167855183763,
"grad_norm": 5.1116180419921875,
"learning_rate": 3.7109160724081186e-05,
"loss": 1.701,
"step": 94000
},
{
"epoch": 25.91881513987932,
"grad_norm": 5.389803886413574,
"learning_rate": 3.704059243006034e-05,
"loss": 1.7042,
"step": 94500
},
{
"epoch": 26.05595172792101,
"grad_norm": 5.371042251586914,
"learning_rate": 3.69720241360395e-05,
"loss": 1.637,
"step": 95000
},
{
"epoch": 26.193088315962697,
"grad_norm": 5.292448997497559,
"learning_rate": 3.6903455842018655e-05,
"loss": 1.5579,
"step": 95500
},
{
"epoch": 26.33022490400439,
"grad_norm": 5.034709453582764,
"learning_rate": 3.683488754799781e-05,
"loss": 1.5781,
"step": 96000
},
{
"epoch": 26.467361492046077,
"grad_norm": 4.979785919189453,
"learning_rate": 3.676631925397696e-05,
"loss": 1.6006,
"step": 96500
},
{
"epoch": 26.60449808008777,
"grad_norm": 4.940494537353516,
"learning_rate": 3.6697750959956116e-05,
"loss": 1.6032,
"step": 97000
},
{
"epoch": 26.741634668129457,
"grad_norm": 5.339479923248291,
"learning_rate": 3.662918266593527e-05,
"loss": 1.6248,
"step": 97500
},
{
"epoch": 26.878771256171145,
"grad_norm": 5.139049530029297,
"learning_rate": 3.656061437191443e-05,
"loss": 1.6189,
"step": 98000
},
{
"epoch": 27.015907844212837,
"grad_norm": 4.733531951904297,
"learning_rate": 3.6492046077893585e-05,
"loss": 1.6215,
"step": 98500
},
{
"epoch": 27.153044432254525,
"grad_norm": 5.294017791748047,
"learning_rate": 3.642347778387274e-05,
"loss": 1.4842,
"step": 99000
},
{
"epoch": 27.290181020296217,
"grad_norm": 5.071205139160156,
"learning_rate": 3.63549094898519e-05,
"loss": 1.5071,
"step": 99500
},
{
"epoch": 27.427317608337905,
"grad_norm": 5.08548641204834,
"learning_rate": 3.628634119583105e-05,
"loss": 1.5179,
"step": 100000
},
{
"epoch": 27.564454196379593,
"grad_norm": 5.183330059051514,
"learning_rate": 3.62177729018102e-05,
"loss": 1.5282,
"step": 100500
},
{
"epoch": 27.701590784421285,
"grad_norm": 4.851142406463623,
"learning_rate": 3.614920460778936e-05,
"loss": 1.5419,
"step": 101000
},
{
"epoch": 27.838727372462973,
"grad_norm": 4.878331661224365,
"learning_rate": 3.6080636313768515e-05,
"loss": 1.5537,
"step": 101500
},
{
"epoch": 27.97586396050466,
"grad_norm": 5.406539440155029,
"learning_rate": 3.601206801974767e-05,
"loss": 1.5562,
"step": 102000
},
{
"epoch": 28.113000548546353,
"grad_norm": 5.543664455413818,
"learning_rate": 3.594349972572683e-05,
"loss": 1.4368,
"step": 102500
},
{
"epoch": 28.25013713658804,
"grad_norm": 5.570579528808594,
"learning_rate": 3.5874931431705984e-05,
"loss": 1.4252,
"step": 103000
},
{
"epoch": 28.387273724629733,
"grad_norm": 4.777440547943115,
"learning_rate": 3.580636313768513e-05,
"loss": 1.4427,
"step": 103500
},
{
"epoch": 28.52441031267142,
"grad_norm": 4.820840835571289,
"learning_rate": 3.573779484366429e-05,
"loss": 1.4574,
"step": 104000
},
{
"epoch": 28.66154690071311,
"grad_norm": 4.499929904937744,
"learning_rate": 3.5669226549643445e-05,
"loss": 1.4651,
"step": 104500
},
{
"epoch": 28.7986834887548,
"grad_norm": 4.876035213470459,
"learning_rate": 3.56006582556226e-05,
"loss": 1.4721,
"step": 105000
},
{
"epoch": 28.93582007679649,
"grad_norm": 5.974823951721191,
"learning_rate": 3.553208996160176e-05,
"loss": 1.4854,
"step": 105500
},
{
"epoch": 29.07295666483818,
"grad_norm": 5.119105815887451,
"learning_rate": 3.5463521667580914e-05,
"loss": 1.4078,
"step": 106000
},
{
"epoch": 29.21009325287987,
"grad_norm": 4.832869052886963,
"learning_rate": 3.539495337356007e-05,
"loss": 1.3521,
"step": 106500
},
{
"epoch": 29.347229840921557,
"grad_norm": 5.020029544830322,
"learning_rate": 3.532638507953922e-05,
"loss": 1.3673,
"step": 107000
},
{
"epoch": 29.48436642896325,
"grad_norm": 5.573171615600586,
"learning_rate": 3.5257816785518376e-05,
"loss": 1.3926,
"step": 107500
},
{
"epoch": 29.621503017004937,
"grad_norm": 5.574306488037109,
"learning_rate": 3.518924849149753e-05,
"loss": 1.4032,
"step": 108000
},
{
"epoch": 29.758639605046625,
"grad_norm": 5.316165924072266,
"learning_rate": 3.512068019747669e-05,
"loss": 1.4039,
"step": 108500
},
{
"epoch": 29.895776193088317,
"grad_norm": 5.210799217224121,
"learning_rate": 3.5052111903455844e-05,
"loss": 1.4099,
"step": 109000
},
{
"epoch": 30.032912781130005,
"grad_norm": 4.973813056945801,
"learning_rate": 3.4983543609435e-05,
"loss": 1.3881,
"step": 109500
},
{
"epoch": 30.170049369171696,
"grad_norm": 4.447306156158447,
"learning_rate": 3.4914975315414157e-05,
"loss": 1.2853,
"step": 110000
},
{
"epoch": 30.307185957213385,
"grad_norm": 5.200187683105469,
"learning_rate": 3.4846407021393306e-05,
"loss": 1.2992,
"step": 110500
},
{
"epoch": 30.444322545255073,
"grad_norm": 5.067360877990723,
"learning_rate": 3.477783872737246e-05,
"loss": 1.3248,
"step": 111000
},
{
"epoch": 30.581459133296764,
"grad_norm": 5.2174391746521,
"learning_rate": 3.470927043335162e-05,
"loss": 1.326,
"step": 111500
},
{
"epoch": 30.718595721338453,
"grad_norm": 5.6121392250061035,
"learning_rate": 3.4640702139330774e-05,
"loss": 1.3509,
"step": 112000
},
{
"epoch": 30.855732309380144,
"grad_norm": 5.090517997741699,
"learning_rate": 3.457213384530993e-05,
"loss": 1.3437,
"step": 112500
},
{
"epoch": 30.992868897421832,
"grad_norm": 4.977377414703369,
"learning_rate": 3.450356555128909e-05,
"loss": 1.3577,
"step": 113000
},
{
"epoch": 31.13000548546352,
"grad_norm": 5.1490478515625,
"learning_rate": 3.443499725726824e-05,
"loss": 1.2348,
"step": 113500
},
{
"epoch": 31.267142073505212,
"grad_norm": 4.903263092041016,
"learning_rate": 3.436642896324739e-05,
"loss": 1.2272,
"step": 114000
},
{
"epoch": 31.4042786615469,
"grad_norm": 5.068541049957275,
"learning_rate": 3.429786066922655e-05,
"loss": 1.2484,
"step": 114500
},
{
"epoch": 31.54141524958859,
"grad_norm": 5.064205169677734,
"learning_rate": 3.4229292375205705e-05,
"loss": 1.2704,
"step": 115000
},
{
"epoch": 31.67855183763028,
"grad_norm": 5.463748455047607,
"learning_rate": 3.416072408118486e-05,
"loss": 1.2753,
"step": 115500
},
{
"epoch": 31.81568842567197,
"grad_norm": 4.637465476989746,
"learning_rate": 3.409215578716402e-05,
"loss": 1.2875,
"step": 116000
},
{
"epoch": 31.95282501371366,
"grad_norm": 4.767406463623047,
"learning_rate": 3.402358749314317e-05,
"loss": 1.2919,
"step": 116500
},
{
"epoch": 32.08996160175535,
"grad_norm": 4.907227993011475,
"learning_rate": 3.395501919912233e-05,
"loss": 1.2129,
"step": 117000
},
{
"epoch": 32.22709818979704,
"grad_norm": 4.724886417388916,
"learning_rate": 3.388645090510148e-05,
"loss": 1.1743,
"step": 117500
},
{
"epoch": 32.364234777838725,
"grad_norm": 5.002569198608398,
"learning_rate": 3.3817882611080635e-05,
"loss": 1.1865,
"step": 118000
},
{
"epoch": 32.501371365880416,
"grad_norm": 4.655109405517578,
"learning_rate": 3.374931431705979e-05,
"loss": 1.2088,
"step": 118500
},
{
"epoch": 32.63850795392211,
"grad_norm": 5.301872730255127,
"learning_rate": 3.368074602303895e-05,
"loss": 1.2133,
"step": 119000
},
{
"epoch": 32.77564454196379,
"grad_norm": 5.0290846824646,
"learning_rate": 3.3612177729018103e-05,
"loss": 1.224,
"step": 119500
},
{
"epoch": 32.912781130005484,
"grad_norm": 5.073773384094238,
"learning_rate": 3.354360943499726e-05,
"loss": 1.2292,
"step": 120000
},
{
"epoch": 33.049917718047176,
"grad_norm": 5.129011631011963,
"learning_rate": 3.3475041140976416e-05,
"loss": 1.1887,
"step": 120500
},
{
"epoch": 33.18705430608887,
"grad_norm": 5.234120845794678,
"learning_rate": 3.340647284695557e-05,
"loss": 1.1145,
"step": 121000
},
{
"epoch": 33.32419089413055,
"grad_norm": 5.61316442489624,
"learning_rate": 3.333790455293472e-05,
"loss": 1.1246,
"step": 121500
},
{
"epoch": 33.461327482172244,
"grad_norm": 5.373575210571289,
"learning_rate": 3.326933625891388e-05,
"loss": 1.1392,
"step": 122000
},
{
"epoch": 33.598464070213936,
"grad_norm": 5.573062419891357,
"learning_rate": 3.3200767964893034e-05,
"loss": 1.1565,
"step": 122500
},
{
"epoch": 33.73560065825562,
"grad_norm": 5.016828536987305,
"learning_rate": 3.313219967087219e-05,
"loss": 1.1566,
"step": 123000
},
{
"epoch": 33.87273724629731,
"grad_norm": 5.493660926818848,
"learning_rate": 3.3063631376851346e-05,
"loss": 1.1775,
"step": 123500
},
{
"epoch": 34.009873834339004,
"grad_norm": 4.961755275726318,
"learning_rate": 3.29950630828305e-05,
"loss": 1.182,
"step": 124000
},
{
"epoch": 34.14701042238069,
"grad_norm": 5.307010173797607,
"learning_rate": 3.292649478880966e-05,
"loss": 1.0434,
"step": 124500
},
{
"epoch": 34.28414701042238,
"grad_norm": 5.011436462402344,
"learning_rate": 3.285792649478881e-05,
"loss": 1.0732,
"step": 125000
},
{
"epoch": 34.42128359846407,
"grad_norm": 5.172646522521973,
"learning_rate": 3.2789358200767964e-05,
"loss": 1.0886,
"step": 125500
},
{
"epoch": 34.55842018650576,
"grad_norm": 5.302252769470215,
"learning_rate": 3.272078990674713e-05,
"loss": 1.0978,
"step": 126000
},
{
"epoch": 34.69555677454745,
"grad_norm": 5.635678768157959,
"learning_rate": 3.2652221612726276e-05,
"loss": 1.1098,
"step": 126500
},
{
"epoch": 34.83269336258914,
"grad_norm": 5.217731475830078,
"learning_rate": 3.258365331870543e-05,
"loss": 1.1163,
"step": 127000
},
{
"epoch": 34.96982995063083,
"grad_norm": 5.012636661529541,
"learning_rate": 3.251508502468459e-05,
"loss": 1.1212,
"step": 127500
},
{
"epoch": 35.106966538672516,
"grad_norm": 4.536286354064941,
"learning_rate": 3.2446516730663745e-05,
"loss": 1.0247,
"step": 128000
},
{
"epoch": 35.24410312671421,
"grad_norm": 5.208780288696289,
"learning_rate": 3.2377948436642894e-05,
"loss": 1.0127,
"step": 128500
},
{
"epoch": 35.3812397147559,
"grad_norm": 5.084893226623535,
"learning_rate": 3.230938014262205e-05,
"loss": 1.0279,
"step": 129000
},
{
"epoch": 35.518376302797584,
"grad_norm": 4.847336769104004,
"learning_rate": 3.224081184860121e-05,
"loss": 1.0429,
"step": 129500
},
{
"epoch": 35.655512890839276,
"grad_norm": 5.661252021789551,
"learning_rate": 3.217224355458036e-05,
"loss": 1.0581,
"step": 130000
},
{
"epoch": 35.79264947888097,
"grad_norm": 5.379410743713379,
"learning_rate": 3.210367526055952e-05,
"loss": 1.0666,
"step": 130500
},
{
"epoch": 35.92978606692265,
"grad_norm": 5.579956531524658,
"learning_rate": 3.2035106966538675e-05,
"loss": 1.0739,
"step": 131000
},
{
"epoch": 36.066922654964344,
"grad_norm": 5.134979248046875,
"learning_rate": 3.196653867251783e-05,
"loss": 1.01,
"step": 131500
},
{
"epoch": 36.204059243006036,
"grad_norm": 5.556998252868652,
"learning_rate": 3.189797037849698e-05,
"loss": 0.9573,
"step": 132000
},
{
"epoch": 36.34119583104772,
"grad_norm": 5.259885787963867,
"learning_rate": 3.182940208447614e-05,
"loss": 0.9718,
"step": 132500
},
{
"epoch": 36.47833241908941,
"grad_norm": 5.2222208976745605,
"learning_rate": 3.17608337904553e-05,
"loss": 0.9924,
"step": 133000
},
{
"epoch": 36.615469007131104,
"grad_norm": 5.009335041046143,
"learning_rate": 3.169226549643445e-05,
"loss": 0.9983,
"step": 133500
},
{
"epoch": 36.752605595172795,
"grad_norm": 4.928483009338379,
"learning_rate": 3.1623697202413605e-05,
"loss": 1.0232,
"step": 134000
},
{
"epoch": 36.88974218321448,
"grad_norm": 5.5725226402282715,
"learning_rate": 3.155512890839276e-05,
"loss": 1.0157,
"step": 134500
},
{
"epoch": 37.02687877125617,
"grad_norm": 5.25609827041626,
"learning_rate": 3.148656061437192e-05,
"loss": 1.0,
"step": 135000
},
{
"epoch": 37.16401535929786,
"grad_norm": 5.325344085693359,
"learning_rate": 3.141799232035107e-05,
"loss": 0.9007,
"step": 135500
},
{
"epoch": 37.30115194733955,
"grad_norm": 5.14201021194458,
"learning_rate": 3.134942402633022e-05,
"loss": 0.9298,
"step": 136000
},
{
"epoch": 37.43828853538124,
"grad_norm": 5.08565092086792,
"learning_rate": 3.1280855732309386e-05,
"loss": 0.941,
"step": 136500
},
{
"epoch": 37.57542512342293,
"grad_norm": 5.582076549530029,
"learning_rate": 3.1212287438288536e-05,
"loss": 0.9441,
"step": 137000
},
{
"epoch": 37.712561711464616,
"grad_norm": 5.214138031005859,
"learning_rate": 3.114371914426769e-05,
"loss": 0.9642,
"step": 137500
},
{
"epoch": 37.84969829950631,
"grad_norm": 5.962718963623047,
"learning_rate": 3.107515085024685e-05,
"loss": 0.9764,
"step": 138000
},
{
"epoch": 37.986834887548,
"grad_norm": 5.05949592590332,
"learning_rate": 3.1006582556226004e-05,
"loss": 0.9785,
"step": 138500
},
{
"epoch": 38.123971475589684,
"grad_norm": 5.427227020263672,
"learning_rate": 3.0938014262205153e-05,
"loss": 0.8783,
"step": 139000
},
{
"epoch": 38.261108063631376,
"grad_norm": 5.215878009796143,
"learning_rate": 3.086944596818431e-05,
"loss": 0.8717,
"step": 139500
},
{
"epoch": 38.39824465167307,
"grad_norm": 5.433798789978027,
"learning_rate": 3.080087767416347e-05,
"loss": 0.8879,
"step": 140000
},
{
"epoch": 38.53538123971476,
"grad_norm": 5.417360782623291,
"learning_rate": 3.073230938014262e-05,
"loss": 0.9018,
"step": 140500
},
{
"epoch": 38.672517827756444,
"grad_norm": 5.553948879241943,
"learning_rate": 3.066374108612178e-05,
"loss": 0.9187,
"step": 141000
},
{
"epoch": 38.809654415798136,
"grad_norm": 5.142756938934326,
"learning_rate": 3.0595172792100934e-05,
"loss": 0.9246,
"step": 141500
},
{
"epoch": 38.94679100383983,
"grad_norm": 5.797046184539795,
"learning_rate": 3.052660449808009e-05,
"loss": 0.9304,
"step": 142000
},
{
"epoch": 39.08392759188151,
"grad_norm": 4.4715986251831055,
"learning_rate": 3.0458036204059243e-05,
"loss": 0.859,
"step": 142500
},
{
"epoch": 39.221064179923204,
"grad_norm": 4.92647123336792,
"learning_rate": 3.03894679100384e-05,
"loss": 0.8293,
"step": 143000
},
{
"epoch": 39.358200767964895,
"grad_norm": 5.064645767211914,
"learning_rate": 3.0320899616017556e-05,
"loss": 0.8431,
"step": 143500
},
{
"epoch": 39.49533735600658,
"grad_norm": 5.243420600891113,
"learning_rate": 3.025233132199671e-05,
"loss": 0.8615,
"step": 144000
},
{
"epoch": 39.63247394404827,
"grad_norm": 6.133671760559082,
"learning_rate": 3.0183763027975865e-05,
"loss": 0.8714,
"step": 144500
},
{
"epoch": 39.76961053208996,
"grad_norm": 5.261296272277832,
"learning_rate": 3.011519473395502e-05,
"loss": 0.8801,
"step": 145000
},
{
"epoch": 39.90674712013165,
"grad_norm": 5.25457239151001,
"learning_rate": 3.0046626439934177e-05,
"loss": 0.8869,
"step": 145500
},
{
"epoch": 40.04388370817334,
"grad_norm": 5.886989116668701,
"learning_rate": 2.997805814591333e-05,
"loss": 0.8569,
"step": 146000
},
{
"epoch": 40.18102029621503,
"grad_norm": 4.354552745819092,
"learning_rate": 2.9909489851892486e-05,
"loss": 0.7847,
"step": 146500
},
{
"epoch": 40.31815688425672,
"grad_norm": 5.114023208618164,
"learning_rate": 2.9840921557871642e-05,
"loss": 0.8053,
"step": 147000
},
{
"epoch": 40.45529347229841,
"grad_norm": 5.665450572967529,
"learning_rate": 2.9772353263850798e-05,
"loss": 0.8053,
"step": 147500
},
{
"epoch": 40.5924300603401,
"grad_norm": 4.803800106048584,
"learning_rate": 2.970378496982995e-05,
"loss": 0.8227,
"step": 148000
},
{
"epoch": 40.72956664838179,
"grad_norm": 5.80670690536499,
"learning_rate": 2.9635216675809107e-05,
"loss": 0.8375,
"step": 148500
},
{
"epoch": 40.866703236423476,
"grad_norm": 5.025584697723389,
"learning_rate": 2.9566648381788263e-05,
"loss": 0.8358,
"step": 149000
},
{
"epoch": 41.00383982446517,
"grad_norm": 4.726833343505859,
"learning_rate": 2.9498080087767416e-05,
"loss": 0.8472,
"step": 149500
},
{
"epoch": 41.14097641250686,
"grad_norm": 5.068787097930908,
"learning_rate": 2.9429511793746572e-05,
"loss": 0.7477,
"step": 150000
},
{
"epoch": 41.278113000548544,
"grad_norm": 4.701972484588623,
"learning_rate": 2.936094349972573e-05,
"loss": 0.7578,
"step": 150500
},
{
"epoch": 41.415249588590235,
"grad_norm": 4.899438858032227,
"learning_rate": 2.9292375205704885e-05,
"loss": 0.7685,
"step": 151000
},
{
"epoch": 41.55238617663193,
"grad_norm": 5.0015482902526855,
"learning_rate": 2.9223806911684037e-05,
"loss": 0.7812,
"step": 151500
},
{
"epoch": 41.68952276467361,
"grad_norm": 4.952108860015869,
"learning_rate": 2.9155238617663194e-05,
"loss": 0.7886,
"step": 152000
},
{
"epoch": 41.8266593527153,
"grad_norm": 5.88131046295166,
"learning_rate": 2.908667032364235e-05,
"loss": 0.7972,
"step": 152500
},
{
"epoch": 41.963795940756995,
"grad_norm": 5.144876956939697,
"learning_rate": 2.9018102029621502e-05,
"loss": 0.8097,
"step": 153000
},
{
"epoch": 42.10093252879869,
"grad_norm": 5.848343849182129,
"learning_rate": 2.894953373560066e-05,
"loss": 0.7389,
"step": 153500
},
{
"epoch": 42.23806911684037,
"grad_norm": 5.04640007019043,
"learning_rate": 2.8880965441579815e-05,
"loss": 0.7163,
"step": 154000
},
{
"epoch": 42.37520570488206,
"grad_norm": 5.1840128898620605,
"learning_rate": 2.881239714755897e-05,
"loss": 0.732,
"step": 154500
},
{
"epoch": 42.512342292923755,
"grad_norm": 5.124771595001221,
"learning_rate": 2.8743828853538124e-05,
"loss": 0.7431,
"step": 155000
},
{
"epoch": 42.64947888096544,
"grad_norm": 4.6923089027404785,
"learning_rate": 2.867526055951728e-05,
"loss": 0.7582,
"step": 155500
},
{
"epoch": 42.78661546900713,
"grad_norm": 5.027599334716797,
"learning_rate": 2.8606692265496436e-05,
"loss": 0.7561,
"step": 156000
},
{
"epoch": 42.92375205704882,
"grad_norm": 4.931192398071289,
"learning_rate": 2.853812397147559e-05,
"loss": 0.7642,
"step": 156500
},
{
"epoch": 43.06088864509051,
"grad_norm": 4.427544593811035,
"learning_rate": 2.8469555677454745e-05,
"loss": 0.7286,
"step": 157000
},
{
"epoch": 43.1980252331322,
"grad_norm": 5.119362831115723,
"learning_rate": 2.84009873834339e-05,
"loss": 0.6782,
"step": 157500
},
{
"epoch": 43.33516182117389,
"grad_norm": 4.8863749504089355,
"learning_rate": 2.833241908941306e-05,
"loss": 0.6933,
"step": 158000
},
{
"epoch": 43.472298409215576,
"grad_norm": 5.453842639923096,
"learning_rate": 2.826385079539221e-05,
"loss": 0.7039,
"step": 158500
},
{
"epoch": 43.60943499725727,
"grad_norm": 4.8158721923828125,
"learning_rate": 2.8195282501371366e-05,
"loss": 0.711,
"step": 159000
},
{
"epoch": 43.74657158529896,
"grad_norm": 5.3100905418396,
"learning_rate": 2.8126714207350523e-05,
"loss": 0.7238,
"step": 159500
},
{
"epoch": 43.88370817334065,
"grad_norm": 4.8812031745910645,
"learning_rate": 2.8058145913329675e-05,
"loss": 0.7283,
"step": 160000
},
{
"epoch": 44.020844761382335,
"grad_norm": 5.003659725189209,
"learning_rate": 2.798957761930883e-05,
"loss": 0.7262,
"step": 160500
},
{
"epoch": 44.15798134942403,
"grad_norm": 5.185481548309326,
"learning_rate": 2.7921009325287988e-05,
"loss": 0.6417,
"step": 161000
},
{
"epoch": 44.29511793746572,
"grad_norm": 4.771406173706055,
"learning_rate": 2.7852441031267147e-05,
"loss": 0.6564,
"step": 161500
},
{
"epoch": 44.4322545255074,
"grad_norm": 5.313647270202637,
"learning_rate": 2.7783872737246297e-05,
"loss": 0.6727,
"step": 162000
},
{
"epoch": 44.569391113549095,
"grad_norm": 5.134614944458008,
"learning_rate": 2.7715304443225453e-05,
"loss": 0.6784,
"step": 162500
},
{
"epoch": 44.70652770159079,
"grad_norm": 4.888493537902832,
"learning_rate": 2.764673614920461e-05,
"loss": 0.6869,
"step": 163000
},
{
"epoch": 44.84366428963247,
"grad_norm": 5.336511135101318,
"learning_rate": 2.7578167855183762e-05,
"loss": 0.692,
"step": 163500
},
{
"epoch": 44.98080087767416,
"grad_norm": 5.053600311279297,
"learning_rate": 2.7509599561162918e-05,
"loss": 0.7041,
"step": 164000
},
{
"epoch": 45.117937465715855,
"grad_norm": 4.778295993804932,
"learning_rate": 2.7441031267142074e-05,
"loss": 0.6192,
"step": 164500
},
{
"epoch": 45.25507405375754,
"grad_norm": 5.197367191314697,
"learning_rate": 2.7372462973121234e-05,
"loss": 0.6225,
"step": 165000
},
{
"epoch": 45.39221064179923,
"grad_norm": 5.395830154418945,
"learning_rate": 2.7303894679100383e-05,
"loss": 0.6342,
"step": 165500
},
{
"epoch": 45.52934722984092,
"grad_norm": 5.031848430633545,
"learning_rate": 2.723532638507954e-05,
"loss": 0.6448,
"step": 166000
},
{
"epoch": 45.666483817882614,
"grad_norm": 5.896296977996826,
"learning_rate": 2.71667580910587e-05,
"loss": 0.6532,
"step": 166500
},
{
"epoch": 45.8036204059243,
"grad_norm": 5.266870021820068,
"learning_rate": 2.7098189797037848e-05,
"loss": 0.6583,
"step": 167000
},
{
"epoch": 45.94075699396599,
"grad_norm": 5.225521087646484,
"learning_rate": 2.7029621503017004e-05,
"loss": 0.6648,
"step": 167500
},
{
"epoch": 46.07789358200768,
"grad_norm": 5.04818058013916,
"learning_rate": 2.696105320899616e-05,
"loss": 0.6178,
"step": 168000
},
{
"epoch": 46.21503017004937,
"grad_norm": 4.635532855987549,
"learning_rate": 2.689248491497532e-05,
"loss": 0.5913,
"step": 168500
},
{
"epoch": 46.35216675809106,
"grad_norm": 5.3561906814575195,
"learning_rate": 2.682391662095447e-05,
"loss": 0.5949,
"step": 169000
},
{
"epoch": 46.48930334613275,
"grad_norm": 5.117276191711426,
"learning_rate": 2.6755348326933626e-05,
"loss": 0.6108,
"step": 169500
},
{
"epoch": 46.626439934174435,
"grad_norm": 5.213390350341797,
"learning_rate": 2.6686780032912785e-05,
"loss": 0.6224,
"step": 170000
},
{
"epoch": 46.76357652221613,
"grad_norm": 5.088405609130859,
"learning_rate": 2.6618211738891935e-05,
"loss": 0.6281,
"step": 170500
},
{
"epoch": 46.90071311025782,
"grad_norm": 5.051976680755615,
"learning_rate": 2.654964344487109e-05,
"loss": 0.6368,
"step": 171000
},
{
"epoch": 47.0378496982995,
"grad_norm": 4.881986141204834,
"learning_rate": 2.648107515085025e-05,
"loss": 0.6152,
"step": 171500
},
{
"epoch": 47.174986286341195,
"grad_norm": 5.066763401031494,
"learning_rate": 2.6412506856829406e-05,
"loss": 0.564,
"step": 172000
},
{
"epoch": 47.31212287438289,
"grad_norm": 4.73757791519165,
"learning_rate": 2.6343938562808556e-05,
"loss": 0.5697,
"step": 172500
},
{
"epoch": 47.44925946242458,
"grad_norm": 4.839804172515869,
"learning_rate": 2.6275370268787712e-05,
"loss": 0.5825,
"step": 173000
},
{
"epoch": 47.58639605046626,
"grad_norm": 5.461195945739746,
"learning_rate": 2.620680197476687e-05,
"loss": 0.5853,
"step": 173500
},
{
"epoch": 47.723532638507955,
"grad_norm": 4.896440029144287,
"learning_rate": 2.613823368074602e-05,
"loss": 0.593,
"step": 174000
},
{
"epoch": 47.860669226549646,
"grad_norm": 4.847322463989258,
"learning_rate": 2.6069665386725177e-05,
"loss": 0.6,
"step": 174500
},
{
"epoch": 47.99780581459133,
"grad_norm": 4.478647708892822,
"learning_rate": 2.6001097092704337e-05,
"loss": 0.6104,
"step": 175000
},
{
"epoch": 48.13494240263302,
"grad_norm": 5.172453880310059,
"learning_rate": 2.5932528798683493e-05,
"loss": 0.5294,
"step": 175500
},
{
"epoch": 48.272078990674714,
"grad_norm": 4.307365894317627,
"learning_rate": 2.5863960504662642e-05,
"loss": 0.5392,
"step": 176000
},
{
"epoch": 48.4092155787164,
"grad_norm": 4.813899517059326,
"learning_rate": 2.57953922106418e-05,
"loss": 0.5521,
"step": 176500
},
{
"epoch": 48.54635216675809,
"grad_norm": 5.233691215515137,
"learning_rate": 2.5726823916620958e-05,
"loss": 0.5565,
"step": 177000
},
{
"epoch": 48.68348875479978,
"grad_norm": 5.3576979637146,
"learning_rate": 2.5658255622600114e-05,
"loss": 0.5663,
"step": 177500
},
{
"epoch": 48.82062534284147,
"grad_norm": 5.31622314453125,
"learning_rate": 2.5589687328579264e-05,
"loss": 0.5754,
"step": 178000
},
{
"epoch": 48.95776193088316,
"grad_norm": 5.2634148597717285,
"learning_rate": 2.5521119034558423e-05,
"loss": 0.58,
"step": 178500
},
{
"epoch": 49.09489851892485,
"grad_norm": 4.982797622680664,
"learning_rate": 2.545255074053758e-05,
"loss": 0.5266,
"step": 179000
},
{
"epoch": 49.23203510696654,
"grad_norm": 4.663660526275635,
"learning_rate": 2.538398244651673e-05,
"loss": 0.5101,
"step": 179500
},
{
"epoch": 49.36917169500823,
"grad_norm": 4.653820991516113,
"learning_rate": 2.5315414152495888e-05,
"loss": 0.5206,
"step": 180000
},
{
"epoch": 49.50630828304992,
"grad_norm": 4.846981048583984,
"learning_rate": 2.5246845858475044e-05,
"loss": 0.5349,
"step": 180500
},
{
"epoch": 49.64344487109161,
"grad_norm": 4.962299346923828,
"learning_rate": 2.51782775644542e-05,
"loss": 0.5378,
"step": 181000
},
{
"epoch": 49.780581459133295,
"grad_norm": 4.924633979797363,
"learning_rate": 2.510970927043335e-05,
"loss": 0.5446,
"step": 181500
},
{
"epoch": 49.917718047174986,
"grad_norm": 5.435749053955078,
"learning_rate": 2.504114097641251e-05,
"loss": 0.5535,
"step": 182000
},
{
"epoch": 50.05485463521668,
"grad_norm": 4.581083297729492,
"learning_rate": 2.4972572682391662e-05,
"loss": 0.5224,
"step": 182500
},
{
"epoch": 50.19199122325836,
"grad_norm": 4.435048580169678,
"learning_rate": 2.490400438837082e-05,
"loss": 0.4925,
"step": 183000
},
{
"epoch": 50.329127811300054,
"grad_norm": 4.9870710372924805,
"learning_rate": 2.4835436094349975e-05,
"loss": 0.4966,
"step": 183500
},
{
"epoch": 50.466264399341746,
"grad_norm": 4.312280178070068,
"learning_rate": 2.476686780032913e-05,
"loss": 0.505,
"step": 184000
},
{
"epoch": 50.60340098738343,
"grad_norm": 4.78123664855957,
"learning_rate": 2.4698299506308284e-05,
"loss": 0.508,
"step": 184500
},
{
"epoch": 50.74053757542512,
"grad_norm": 5.319374084472656,
"learning_rate": 2.462973121228744e-05,
"loss": 0.5149,
"step": 185000
},
{
"epoch": 50.877674163466814,
"grad_norm": 4.26421594619751,
"learning_rate": 2.4561162918266596e-05,
"loss": 0.5278,
"step": 185500
},
{
"epoch": 51.014810751508506,
"grad_norm": 4.891973495483398,
"learning_rate": 2.449259462424575e-05,
"loss": 0.522,
"step": 186000
},
{
"epoch": 51.15194733955019,
"grad_norm": 5.03622579574585,
"learning_rate": 2.4424026330224905e-05,
"loss": 0.4602,
"step": 186500
},
{
"epoch": 51.28908392759188,
"grad_norm": 4.524442195892334,
"learning_rate": 2.435545803620406e-05,
"loss": 0.4689,
"step": 187000
},
{
"epoch": 51.426220515633574,
"grad_norm": 4.18233060836792,
"learning_rate": 2.4286889742183217e-05,
"loss": 0.4786,
"step": 187500
},
{
"epoch": 51.56335710367526,
"grad_norm": 4.806675434112549,
"learning_rate": 2.421832144816237e-05,
"loss": 0.4886,
"step": 188000
},
{
"epoch": 51.70049369171695,
"grad_norm": 4.611050128936768,
"learning_rate": 2.4149753154141526e-05,
"loss": 0.4901,
"step": 188500
},
{
"epoch": 51.83763027975864,
"grad_norm": 5.323733806610107,
"learning_rate": 2.4081184860120682e-05,
"loss": 0.5021,
"step": 189000
},
{
"epoch": 51.97476686780033,
"grad_norm": 4.821100234985352,
"learning_rate": 2.4012616566099835e-05,
"loss": 0.5053,
"step": 189500
},
{
"epoch": 52.11190345584202,
"grad_norm": 4.823397159576416,
"learning_rate": 2.394404827207899e-05,
"loss": 0.4498,
"step": 190000
},
{
"epoch": 52.24904004388371,
"grad_norm": 4.650783061981201,
"learning_rate": 2.3875479978058147e-05,
"loss": 0.4506,
"step": 190500
},
{
"epoch": 52.386176631925395,
"grad_norm": 5.3509697914123535,
"learning_rate": 2.3806911684037304e-05,
"loss": 0.4492,
"step": 191000
},
{
"epoch": 52.523313219967086,
"grad_norm": 5.251642227172852,
"learning_rate": 2.3738343390016456e-05,
"loss": 0.4665,
"step": 191500
},
{
"epoch": 52.66044980800878,
"grad_norm": 4.471257209777832,
"learning_rate": 2.3669775095995613e-05,
"loss": 0.4686,
"step": 192000
},
{
"epoch": 52.79758639605047,
"grad_norm": 4.814416885375977,
"learning_rate": 2.360120680197477e-05,
"loss": 0.4761,
"step": 192500
},
{
"epoch": 52.934722984092154,
"grad_norm": 5.369185924530029,
"learning_rate": 2.353263850795392e-05,
"loss": 0.4826,
"step": 193000
},
{
"epoch": 53.071859572133846,
"grad_norm": 4.826727867126465,
"learning_rate": 2.3464070213933078e-05,
"loss": 0.4517,
"step": 193500
},
{
"epoch": 53.20899616017554,
"grad_norm": 4.9067583084106445,
"learning_rate": 2.3395501919912234e-05,
"loss": 0.4241,
"step": 194000
},
{
"epoch": 53.34613274821722,
"grad_norm": 5.361186981201172,
"learning_rate": 2.332693362589139e-05,
"loss": 0.4334,
"step": 194500
},
{
"epoch": 53.483269336258914,
"grad_norm": 4.9540300369262695,
"learning_rate": 2.3258365331870543e-05,
"loss": 0.4379,
"step": 195000
},
{
"epoch": 53.620405924300606,
"grad_norm": 5.23082971572876,
"learning_rate": 2.31897970378497e-05,
"loss": 0.4495,
"step": 195500
},
{
"epoch": 53.75754251234229,
"grad_norm": 4.608271598815918,
"learning_rate": 2.3121228743828855e-05,
"loss": 0.4521,
"step": 196000
},
{
"epoch": 53.89467910038398,
"grad_norm": 4.835067272186279,
"learning_rate": 2.305266044980801e-05,
"loss": 0.4616,
"step": 196500
},
{
"epoch": 54.031815688425674,
"grad_norm": 4.397408485412598,
"learning_rate": 2.2984092155787164e-05,
"loss": 0.4483,
"step": 197000
},
{
"epoch": 54.16895227646736,
"grad_norm": 4.769198894500732,
"learning_rate": 2.291552386176632e-05,
"loss": 0.4038,
"step": 197500
},
{
"epoch": 54.30608886450905,
"grad_norm": 4.403786659240723,
"learning_rate": 2.2846955567745476e-05,
"loss": 0.4146,
"step": 198000
},
{
"epoch": 54.44322545255074,
"grad_norm": 4.6071696281433105,
"learning_rate": 2.277838727372463e-05,
"loss": 0.4199,
"step": 198500
},
{
"epoch": 54.58036204059243,
"grad_norm": 4.638876438140869,
"learning_rate": 2.2709818979703785e-05,
"loss": 0.4268,
"step": 199000
},
{
"epoch": 54.71749862863412,
"grad_norm": 4.671108722686768,
"learning_rate": 2.264125068568294e-05,
"loss": 0.4302,
"step": 199500
},
{
"epoch": 54.85463521667581,
"grad_norm": 5.265748977661133,
"learning_rate": 2.2572682391662098e-05,
"loss": 0.44,
"step": 200000
},
{
"epoch": 54.9917718047175,
"grad_norm": 5.179275989532471,
"learning_rate": 2.250411409764125e-05,
"loss": 0.4381,
"step": 200500
},
{
"epoch": 55.128908392759186,
"grad_norm": 4.084758758544922,
"learning_rate": 2.2435545803620407e-05,
"loss": 0.3888,
"step": 201000
},
{
"epoch": 55.26604498080088,
"grad_norm": 4.465928554534912,
"learning_rate": 2.2366977509599563e-05,
"loss": 0.3935,
"step": 201500
},
{
"epoch": 55.40318156884257,
"grad_norm": 4.657350540161133,
"learning_rate": 2.2298409215578716e-05,
"loss": 0.3963,
"step": 202000
},
{
"epoch": 55.540318156884254,
"grad_norm": 4.591371059417725,
"learning_rate": 2.2229840921557872e-05,
"loss": 0.4052,
"step": 202500
},
{
"epoch": 55.677454744925946,
"grad_norm": 4.821173191070557,
"learning_rate": 2.2161272627537028e-05,
"loss": 0.4135,
"step": 203000
},
{
"epoch": 55.81459133296764,
"grad_norm": 4.650514125823975,
"learning_rate": 2.2092704333516184e-05,
"loss": 0.4171,
"step": 203500
},
{
"epoch": 55.95172792100932,
"grad_norm": 4.952467441558838,
"learning_rate": 2.2024136039495337e-05,
"loss": 0.4238,
"step": 204000
},
{
"epoch": 56.088864509051014,
"grad_norm": 4.717243671417236,
"learning_rate": 2.1955567745474493e-05,
"loss": 0.3891,
"step": 204500
},
{
"epoch": 56.226001097092706,
"grad_norm": 4.069623947143555,
"learning_rate": 2.188699945145365e-05,
"loss": 0.373,
"step": 205000
},
{
"epoch": 56.3631376851344,
"grad_norm": 4.447889804840088,
"learning_rate": 2.1818431157432802e-05,
"loss": 0.3799,
"step": 205500
},
{
"epoch": 56.50027427317608,
"grad_norm": 4.514695644378662,
"learning_rate": 2.174986286341196e-05,
"loss": 0.39,
"step": 206000
},
{
"epoch": 56.637410861217774,
"grad_norm": 5.111133098602295,
"learning_rate": 2.1681294569391114e-05,
"loss": 0.3919,
"step": 206500
},
{
"epoch": 56.774547449259465,
"grad_norm": 4.48080587387085,
"learning_rate": 2.161272627537027e-05,
"loss": 0.3965,
"step": 207000
},
{
"epoch": 56.91168403730115,
"grad_norm": 4.876768589019775,
"learning_rate": 2.1544157981349423e-05,
"loss": 0.4046,
"step": 207500
},
{
"epoch": 57.04882062534284,
"grad_norm": 4.336927890777588,
"learning_rate": 2.1475589687328583e-05,
"loss": 0.3817,
"step": 208000
},
{
"epoch": 57.18595721338453,
"grad_norm": 4.570804595947266,
"learning_rate": 2.1407021393307736e-05,
"loss": 0.357,
"step": 208500
},
{
"epoch": 57.32309380142622,
"grad_norm": 4.742151737213135,
"learning_rate": 2.133845309928689e-05,
"loss": 0.3648,
"step": 209000
},
{
"epoch": 57.46023038946791,
"grad_norm": 4.641295433044434,
"learning_rate": 2.1269884805266048e-05,
"loss": 0.3716,
"step": 209500
},
{
"epoch": 57.5973669775096,
"grad_norm": 5.475332736968994,
"learning_rate": 2.12013165112452e-05,
"loss": 0.3731,
"step": 210000
},
{
"epoch": 57.734503565551286,
"grad_norm": 4.712151050567627,
"learning_rate": 2.1132748217224357e-05,
"loss": 0.3809,
"step": 210500
},
{
"epoch": 57.87164015359298,
"grad_norm": 4.409310817718506,
"learning_rate": 2.106417992320351e-05,
"loss": 0.3812,
"step": 211000
},
{
"epoch": 58.00877674163467,
"grad_norm": 4.162150859832764,
"learning_rate": 2.099561162918267e-05,
"loss": 0.383,
"step": 211500
},
{
"epoch": 58.14591332967636,
"grad_norm": 4.273313999176025,
"learning_rate": 2.0927043335161822e-05,
"loss": 0.3407,
"step": 212000
},
{
"epoch": 58.283049917718046,
"grad_norm": 4.508772850036621,
"learning_rate": 2.0858475041140975e-05,
"loss": 0.3489,
"step": 212500
},
{
"epoch": 58.42018650575974,
"grad_norm": 5.550928592681885,
"learning_rate": 2.0789906747120134e-05,
"loss": 0.3526,
"step": 213000
},
{
"epoch": 58.55732309380143,
"grad_norm": 4.722227096557617,
"learning_rate": 2.0721338453099287e-05,
"loss": 0.3576,
"step": 213500
},
{
"epoch": 58.694459681843114,
"grad_norm": 4.649284839630127,
"learning_rate": 2.0652770159078443e-05,
"loss": 0.3605,
"step": 214000
},
{
"epoch": 58.831596269884805,
"grad_norm": 4.80319881439209,
"learning_rate": 2.05842018650576e-05,
"loss": 0.3655,
"step": 214500
},
{
"epoch": 58.9687328579265,
"grad_norm": 5.22609806060791,
"learning_rate": 2.0515633571036756e-05,
"loss": 0.3714,
"step": 215000
},
{
"epoch": 59.10586944596818,
"grad_norm": 5.241272926330566,
"learning_rate": 2.044706527701591e-05,
"loss": 0.3345,
"step": 215500
},
{
"epoch": 59.24300603400987,
"grad_norm": 4.466114044189453,
"learning_rate": 2.037849698299506e-05,
"loss": 0.3314,
"step": 216000
},
{
"epoch": 59.380142622051565,
"grad_norm": 4.289991855621338,
"learning_rate": 2.030992868897422e-05,
"loss": 0.3353,
"step": 216500
},
{
"epoch": 59.51727921009325,
"grad_norm": 4.458993911743164,
"learning_rate": 2.0241360394953374e-05,
"loss": 0.3385,
"step": 217000
},
{
"epoch": 59.65441579813494,
"grad_norm": 4.6871724128723145,
"learning_rate": 2.017279210093253e-05,
"loss": 0.3452,
"step": 217500
},
{
"epoch": 59.79155238617663,
"grad_norm": 4.503798484802246,
"learning_rate": 2.0104223806911686e-05,
"loss": 0.3476,
"step": 218000
},
{
"epoch": 59.928688974218325,
"grad_norm": 5.342411041259766,
"learning_rate": 2.0035655512890842e-05,
"loss": 0.3553,
"step": 218500
},
{
"epoch": 60.06582556226001,
"grad_norm": 5.1502180099487305,
"learning_rate": 1.9967087218869995e-05,
"loss": 0.3331,
"step": 219000
},
{
"epoch": 60.2029621503017,
"grad_norm": 4.446504592895508,
"learning_rate": 1.9898518924849148e-05,
"loss": 0.3166,
"step": 219500
},
{
"epoch": 60.34009873834339,
"grad_norm": 4.185482025146484,
"learning_rate": 1.9829950630828307e-05,
"loss": 0.3204,
"step": 220000
},
{
"epoch": 60.47723532638508,
"grad_norm": 4.356864929199219,
"learning_rate": 1.976138233680746e-05,
"loss": 0.3262,
"step": 220500
},
{
"epoch": 60.61437191442677,
"grad_norm": 4.678393840789795,
"learning_rate": 1.9692814042786616e-05,
"loss": 0.3311,
"step": 221000
},
{
"epoch": 60.75150850246846,
"grad_norm": 5.001060962677002,
"learning_rate": 1.9624245748765772e-05,
"loss": 0.3354,
"step": 221500
},
{
"epoch": 60.888645090510146,
"grad_norm": 5.079350471496582,
"learning_rate": 1.955567745474493e-05,
"loss": 0.3404,
"step": 222000
},
{
"epoch": 61.02578167855184,
"grad_norm": 4.419836044311523,
"learning_rate": 1.948710916072408e-05,
"loss": 0.331,
"step": 222500
},
{
"epoch": 61.16291826659353,
"grad_norm": 4.383386611938477,
"learning_rate": 1.9418540866703238e-05,
"loss": 0.3028,
"step": 223000
},
{
"epoch": 61.30005485463521,
"grad_norm": 4.333778381347656,
"learning_rate": 1.9349972572682394e-05,
"loss": 0.3055,
"step": 223500
},
{
"epoch": 61.437191442676905,
"grad_norm": 4.988595008850098,
"learning_rate": 1.9281404278661547e-05,
"loss": 0.3112,
"step": 224000
},
{
"epoch": 61.5743280307186,
"grad_norm": 5.163971900939941,
"learning_rate": 1.9212835984640703e-05,
"loss": 0.3153,
"step": 224500
},
{
"epoch": 61.71146461876029,
"grad_norm": 3.907899856567383,
"learning_rate": 1.914426769061986e-05,
"loss": 0.3228,
"step": 225000
},
{
"epoch": 61.84860120680197,
"grad_norm": 4.212146282196045,
"learning_rate": 1.9075699396599015e-05,
"loss": 0.325,
"step": 225500
},
{
"epoch": 61.985737794843665,
"grad_norm": 4.616479873657227,
"learning_rate": 1.9007131102578168e-05,
"loss": 0.3263,
"step": 226000
},
{
"epoch": 62.12287438288536,
"grad_norm": 4.422669887542725,
"learning_rate": 1.8938562808557324e-05,
"loss": 0.2962,
"step": 226500
},
{
"epoch": 62.26001097092704,
"grad_norm": 4.242331027984619,
"learning_rate": 1.886999451453648e-05,
"loss": 0.2926,
"step": 227000
},
{
"epoch": 62.39714755896873,
"grad_norm": 4.647274494171143,
"learning_rate": 1.8801426220515633e-05,
"loss": 0.2985,
"step": 227500
},
{
"epoch": 62.534284147010425,
"grad_norm": 4.557641983032227,
"learning_rate": 1.873285792649479e-05,
"loss": 0.3027,
"step": 228000
},
{
"epoch": 62.67142073505211,
"grad_norm": 4.458461284637451,
"learning_rate": 1.8664289632473945e-05,
"loss": 0.3088,
"step": 228500
},
{
"epoch": 62.8085573230938,
"grad_norm": 4.6789727210998535,
"learning_rate": 1.85957213384531e-05,
"loss": 0.3105,
"step": 229000
},
{
"epoch": 62.94569391113549,
"grad_norm": 4.642698287963867,
"learning_rate": 1.8527153044432254e-05,
"loss": 0.3154,
"step": 229500
},
{
"epoch": 63.08283049917718,
"grad_norm": 4.549673557281494,
"learning_rate": 1.845858475041141e-05,
"loss": 0.2929,
"step": 230000
},
{
"epoch": 63.21996708721887,
"grad_norm": 4.2093119621276855,
"learning_rate": 1.8390016456390567e-05,
"loss": 0.2838,
"step": 230500
},
{
"epoch": 63.35710367526056,
"grad_norm": 4.682537078857422,
"learning_rate": 1.832144816236972e-05,
"loss": 0.2853,
"step": 231000
},
{
"epoch": 63.49424026330225,
"grad_norm": 4.815731048583984,
"learning_rate": 1.8252879868348876e-05,
"loss": 0.2885,
"step": 231500
},
{
"epoch": 63.63137685134394,
"grad_norm": 5.170729160308838,
"learning_rate": 1.818431157432803e-05,
"loss": 0.2957,
"step": 232000
},
{
"epoch": 63.76851343938563,
"grad_norm": 4.020371913909912,
"learning_rate": 1.8115743280307188e-05,
"loss": 0.2985,
"step": 232500
},
{
"epoch": 63.90565002742732,
"grad_norm": 4.983353137969971,
"learning_rate": 1.804717498628634e-05,
"loss": 0.3007,
"step": 233000
},
{
"epoch": 64.04278661546901,
"grad_norm": 4.521115303039551,
"learning_rate": 1.7978606692265497e-05,
"loss": 0.2887,
"step": 233500
},
{
"epoch": 64.1799232035107,
"grad_norm": 4.261961460113525,
"learning_rate": 1.7910038398244653e-05,
"loss": 0.2695,
"step": 234000
},
{
"epoch": 64.31705979155238,
"grad_norm": 4.490432262420654,
"learning_rate": 1.7841470104223806e-05,
"loss": 0.2726,
"step": 234500
},
{
"epoch": 64.45419637959408,
"grad_norm": 4.353551864624023,
"learning_rate": 1.7772901810202962e-05,
"loss": 0.2774,
"step": 235000
},
{
"epoch": 64.59133296763576,
"grad_norm": 4.908097743988037,
"learning_rate": 1.7704333516182118e-05,
"loss": 0.2837,
"step": 235500
},
{
"epoch": 64.72846955567745,
"grad_norm": 4.305734157562256,
"learning_rate": 1.7635765222161274e-05,
"loss": 0.2836,
"step": 236000
},
{
"epoch": 64.86560614371915,
"grad_norm": 5.043435096740723,
"learning_rate": 1.7567196928140427e-05,
"loss": 0.2897,
"step": 236500
},
{
"epoch": 65.00274273176083,
"grad_norm": 4.230961322784424,
"learning_rate": 1.7498628634119583e-05,
"loss": 0.2911,
"step": 237000
},
{
"epoch": 65.13987931980252,
"grad_norm": 4.4803056716918945,
"learning_rate": 1.743006034009874e-05,
"loss": 0.259,
"step": 237500
},
{
"epoch": 65.27701590784422,
"grad_norm": 3.8626787662506104,
"learning_rate": 1.7361492046077896e-05,
"loss": 0.2618,
"step": 238000
},
{
"epoch": 65.4141524958859,
"grad_norm": 4.65452766418457,
"learning_rate": 1.729292375205705e-05,
"loss": 0.265,
"step": 238500
},
{
"epoch": 65.55128908392759,
"grad_norm": 4.291559219360352,
"learning_rate": 1.7224355458036205e-05,
"loss": 0.2709,
"step": 239000
},
{
"epoch": 65.68842567196928,
"grad_norm": 4.508846282958984,
"learning_rate": 1.715578716401536e-05,
"loss": 0.2721,
"step": 239500
},
{
"epoch": 65.82556226001097,
"grad_norm": 5.093057632446289,
"learning_rate": 1.7087218869994513e-05,
"loss": 0.2769,
"step": 240000
},
{
"epoch": 65.96269884805265,
"grad_norm": 4.549623012542725,
"learning_rate": 1.7018650575973673e-05,
"loss": 0.2773,
"step": 240500
},
{
"epoch": 66.09983543609435,
"grad_norm": 4.05508279800415,
"learning_rate": 1.6950082281952826e-05,
"loss": 0.2553,
"step": 241000
},
{
"epoch": 66.23697202413604,
"grad_norm": 3.7369630336761475,
"learning_rate": 1.6881513987931982e-05,
"loss": 0.2528,
"step": 241500
},
{
"epoch": 66.37410861217774,
"grad_norm": 3.813990831375122,
"learning_rate": 1.6812945693911135e-05,
"loss": 0.2526,
"step": 242000
},
{
"epoch": 66.51124520021942,
"grad_norm": 3.993372917175293,
"learning_rate": 1.674437739989029e-05,
"loss": 0.2592,
"step": 242500
},
{
"epoch": 66.6483817882611,
"grad_norm": 4.96673059463501,
"learning_rate": 1.6675809105869447e-05,
"loss": 0.2611,
"step": 243000
},
{
"epoch": 66.7855183763028,
"grad_norm": 4.10557746887207,
"learning_rate": 1.66072408118486e-05,
"loss": 0.2648,
"step": 243500
},
{
"epoch": 66.92265496434449,
"grad_norm": 4.813425064086914,
"learning_rate": 1.653867251782776e-05,
"loss": 0.2656,
"step": 244000
},
{
"epoch": 67.05979155238617,
"grad_norm": 4.064112186431885,
"learning_rate": 1.6470104223806912e-05,
"loss": 0.2536,
"step": 244500
},
{
"epoch": 67.19692814042787,
"grad_norm": 4.719504356384277,
"learning_rate": 1.640153592978607e-05,
"loss": 0.2411,
"step": 245000
},
{
"epoch": 67.33406472846956,
"grad_norm": 4.4745588302612305,
"learning_rate": 1.633296763576522e-05,
"loss": 0.2505,
"step": 245500
},
{
"epoch": 67.47120131651124,
"grad_norm": 4.499454021453857,
"learning_rate": 1.6264399341744377e-05,
"loss": 0.2493,
"step": 246000
},
{
"epoch": 67.60833790455294,
"grad_norm": 3.987778663635254,
"learning_rate": 1.6195831047723534e-05,
"loss": 0.2503,
"step": 246500
},
{
"epoch": 67.74547449259462,
"grad_norm": 4.4290618896484375,
"learning_rate": 1.6127262753702686e-05,
"loss": 0.2552,
"step": 247000
},
{
"epoch": 67.88261108063631,
"grad_norm": 4.531731605529785,
"learning_rate": 1.6058694459681846e-05,
"loss": 0.2579,
"step": 247500
},
{
"epoch": 68.01974766867801,
"grad_norm": 3.8032639026641846,
"learning_rate": 1.5990126165661e-05,
"loss": 0.2562,
"step": 248000
},
{
"epoch": 68.15688425671969,
"grad_norm": 3.864058017730713,
"learning_rate": 1.5921557871640155e-05,
"loss": 0.234,
"step": 248500
},
{
"epoch": 68.29402084476138,
"grad_norm": 3.7496285438537598,
"learning_rate": 1.585298957761931e-05,
"loss": 0.2362,
"step": 249000
},
{
"epoch": 68.43115743280308,
"grad_norm": 3.9640090465545654,
"learning_rate": 1.5784421283598467e-05,
"loss": 0.2406,
"step": 249500
},
{
"epoch": 68.56829402084476,
"grad_norm": 4.273751258850098,
"learning_rate": 1.571585298957762e-05,
"loss": 0.2402,
"step": 250000
},
{
"epoch": 68.70543060888645,
"grad_norm": 3.934805393218994,
"learning_rate": 1.5647284695556773e-05,
"loss": 0.2437,
"step": 250500
},
{
"epoch": 68.84256719692814,
"grad_norm": 3.652498245239258,
"learning_rate": 1.5578716401535932e-05,
"loss": 0.2467,
"step": 251000
},
{
"epoch": 68.97970378496983,
"grad_norm": 3.7606563568115234,
"learning_rate": 1.5510148107515085e-05,
"loss": 0.2489,
"step": 251500
},
{
"epoch": 69.11684037301151,
"grad_norm": 4.354647636413574,
"learning_rate": 1.544157981349424e-05,
"loss": 0.2272,
"step": 252000
},
{
"epoch": 69.25397696105321,
"grad_norm": 3.411524772644043,
"learning_rate": 1.5373011519473397e-05,
"loss": 0.2274,
"step": 252500
},
{
"epoch": 69.3911135490949,
"grad_norm": 4.171504020690918,
"learning_rate": 1.5304443225452554e-05,
"loss": 0.2305,
"step": 253000
},
{
"epoch": 69.52825013713658,
"grad_norm": 4.308210372924805,
"learning_rate": 1.5235874931431706e-05,
"loss": 0.2319,
"step": 253500
},
{
"epoch": 69.66538672517828,
"grad_norm": 4.150519847869873,
"learning_rate": 1.516730663741086e-05,
"loss": 0.2344,
"step": 254000
},
{
"epoch": 69.80252331321996,
"grad_norm": 4.316656112670898,
"learning_rate": 1.5098738343390017e-05,
"loss": 0.2391,
"step": 254500
},
{
"epoch": 69.93965990126166,
"grad_norm": 4.44851541519165,
"learning_rate": 1.5030170049369171e-05,
"loss": 0.2385,
"step": 255000
},
{
"epoch": 70.07679648930335,
"grad_norm": 4.209973335266113,
"learning_rate": 1.4961601755348328e-05,
"loss": 0.225,
"step": 255500
},
{
"epoch": 70.21393307734503,
"grad_norm": 4.037484169006348,
"learning_rate": 1.4893033461327482e-05,
"loss": 0.2179,
"step": 256000
},
{
"epoch": 70.35106966538673,
"grad_norm": 3.6946587562561035,
"learning_rate": 1.482446516730664e-05,
"loss": 0.2222,
"step": 256500
},
{
"epoch": 70.48820625342842,
"grad_norm": 4.2428717613220215,
"learning_rate": 1.4755896873285793e-05,
"loss": 0.2211,
"step": 257000
},
{
"epoch": 70.6253428414701,
"grad_norm": 3.7683310508728027,
"learning_rate": 1.4687328579264947e-05,
"loss": 0.2259,
"step": 257500
},
{
"epoch": 70.7624794295118,
"grad_norm": 4.147058486938477,
"learning_rate": 1.4618760285244103e-05,
"loss": 0.2283,
"step": 258000
},
{
"epoch": 70.89961601755348,
"grad_norm": 4.305523872375488,
"learning_rate": 1.4550191991223258e-05,
"loss": 0.2316,
"step": 258500
},
{
"epoch": 71.03675260559517,
"grad_norm": 4.284609317779541,
"learning_rate": 1.4481623697202416e-05,
"loss": 0.2254,
"step": 259000
},
{
"epoch": 71.17388919363687,
"grad_norm": 3.876636028289795,
"learning_rate": 1.4413055403181569e-05,
"loss": 0.2084,
"step": 259500
},
{
"epoch": 71.31102578167855,
"grad_norm": 4.208460330963135,
"learning_rate": 1.4344487109160726e-05,
"loss": 0.2141,
"step": 260000
},
{
"epoch": 71.44816236972024,
"grad_norm": 3.976590156555176,
"learning_rate": 1.427591881513988e-05,
"loss": 0.2146,
"step": 260500
},
{
"epoch": 71.58529895776194,
"grad_norm": 3.778451442718506,
"learning_rate": 1.4207350521119034e-05,
"loss": 0.2163,
"step": 261000
},
{
"epoch": 71.72243554580362,
"grad_norm": 4.75286340713501,
"learning_rate": 1.4138782227098192e-05,
"loss": 0.2194,
"step": 261500
},
{
"epoch": 71.8595721338453,
"grad_norm": 3.755993366241455,
"learning_rate": 1.4070213933077344e-05,
"loss": 0.2236,
"step": 262000
},
{
"epoch": 71.996708721887,
"grad_norm": 4.23431396484375,
"learning_rate": 1.4001645639056502e-05,
"loss": 0.224,
"step": 262500
},
{
"epoch": 72.13384530992869,
"grad_norm": 4.001950263977051,
"learning_rate": 1.3933077345035655e-05,
"loss": 0.2022,
"step": 263000
},
{
"epoch": 72.27098189797037,
"grad_norm": 3.7588768005371094,
"learning_rate": 1.3864509051014813e-05,
"loss": 0.2043,
"step": 263500
},
{
"epoch": 72.40811848601207,
"grad_norm": 4.171288013458252,
"learning_rate": 1.3795940756993966e-05,
"loss": 0.2065,
"step": 264000
},
{
"epoch": 72.54525507405376,
"grad_norm": 4.1884636878967285,
"learning_rate": 1.3727372462973123e-05,
"loss": 0.2081,
"step": 264500
},
{
"epoch": 72.68239166209544,
"grad_norm": 4.019055366516113,
"learning_rate": 1.3658804168952278e-05,
"loss": 0.212,
"step": 265000
},
{
"epoch": 72.81952825013714,
"grad_norm": 3.9061167240142822,
"learning_rate": 1.359023587493143e-05,
"loss": 0.2128,
"step": 265500
},
{
"epoch": 72.95666483817882,
"grad_norm": 4.590092182159424,
"learning_rate": 1.3521667580910589e-05,
"loss": 0.2149,
"step": 266000
},
{
"epoch": 73.09380142622051,
"grad_norm": 4.069841384887695,
"learning_rate": 1.3453099286889741e-05,
"loss": 0.2016,
"step": 266500
},
{
"epoch": 73.23093801426221,
"grad_norm": 3.7650821208953857,
"learning_rate": 1.33845309928689e-05,
"loss": 0.1969,
"step": 267000
},
{
"epoch": 73.36807460230389,
"grad_norm": 3.8244950771331787,
"learning_rate": 1.3315962698848054e-05,
"loss": 0.1984,
"step": 267500
},
{
"epoch": 73.50521119034559,
"grad_norm": 3.6921212673187256,
"learning_rate": 1.324739440482721e-05,
"loss": 0.2026,
"step": 268000
},
{
"epoch": 73.64234777838728,
"grad_norm": 4.225021839141846,
"learning_rate": 1.3178826110806364e-05,
"loss": 0.2036,
"step": 268500
},
{
"epoch": 73.77948436642896,
"grad_norm": 4.311788082122803,
"learning_rate": 1.3110257816785517e-05,
"loss": 0.2052,
"step": 269000
},
{
"epoch": 73.91662095447066,
"grad_norm": 4.360690116882324,
"learning_rate": 1.3041689522764675e-05,
"loss": 0.2081,
"step": 269500
},
{
"epoch": 74.05375754251234,
"grad_norm": 3.889430522918701,
"learning_rate": 1.297312122874383e-05,
"loss": 0.2006,
"step": 270000
},
{
"epoch": 74.19089413055403,
"grad_norm": 4.069758892059326,
"learning_rate": 1.2904552934722986e-05,
"loss": 0.1903,
"step": 270500
},
{
"epoch": 74.32803071859573,
"grad_norm": 3.5697872638702393,
"learning_rate": 1.283598464070214e-05,
"loss": 0.1921,
"step": 271000
},
{
"epoch": 74.46516730663741,
"grad_norm": 4.888301849365234,
"learning_rate": 1.2767416346681296e-05,
"loss": 0.1976,
"step": 271500
},
{
"epoch": 74.6023038946791,
"grad_norm": 4.195688247680664,
"learning_rate": 1.269884805266045e-05,
"loss": 0.1956,
"step": 272000
},
{
"epoch": 74.7394404827208,
"grad_norm": 3.5373120307922363,
"learning_rate": 1.2630279758639604e-05,
"loss": 0.1976,
"step": 272500
},
{
"epoch": 74.87657707076248,
"grad_norm": 4.081260681152344,
"learning_rate": 1.2561711464618761e-05,
"loss": 0.1993,
"step": 273000
},
{
"epoch": 75.01371365880416,
"grad_norm": 3.637251615524292,
"learning_rate": 1.2493143170597916e-05,
"loss": 0.1993,
"step": 273500
},
{
"epoch": 75.15085024684586,
"grad_norm": 4.6371355056762695,
"learning_rate": 1.2424574876577072e-05,
"loss": 0.1847,
"step": 274000
},
{
"epoch": 75.28798683488755,
"grad_norm": 3.781407594680786,
"learning_rate": 1.2356006582556227e-05,
"loss": 0.1862,
"step": 274500
},
{
"epoch": 75.42512342292923,
"grad_norm": 3.249769926071167,
"learning_rate": 1.2287438288535381e-05,
"loss": 0.189,
"step": 275000
},
{
"epoch": 75.56226001097093,
"grad_norm": 3.62080717086792,
"learning_rate": 1.2218869994514537e-05,
"loss": 0.1904,
"step": 275500
},
{
"epoch": 75.69939659901262,
"grad_norm": 3.6299779415130615,
"learning_rate": 1.2150301700493692e-05,
"loss": 0.1913,
"step": 276000
},
{
"epoch": 75.8365331870543,
"grad_norm": 4.178566932678223,
"learning_rate": 1.2081733406472848e-05,
"loss": 0.1916,
"step": 276500
},
{
"epoch": 75.973669775096,
"grad_norm": 3.7569074630737305,
"learning_rate": 1.2013165112452002e-05,
"loss": 0.1932,
"step": 277000
},
{
"epoch": 76.11080636313768,
"grad_norm": 3.6671714782714844,
"learning_rate": 1.1944596818431158e-05,
"loss": 0.1789,
"step": 277500
},
{
"epoch": 76.24794295117937,
"grad_norm": 4.360944747924805,
"learning_rate": 1.1876028524410313e-05,
"loss": 0.1799,
"step": 278000
},
{
"epoch": 76.38507953922107,
"grad_norm": 4.378243446350098,
"learning_rate": 1.1807460230389467e-05,
"loss": 0.1815,
"step": 278500
},
{
"epoch": 76.52221612726275,
"grad_norm": 3.7712574005126953,
"learning_rate": 1.1738891936368624e-05,
"loss": 0.1849,
"step": 279000
},
{
"epoch": 76.65935271530444,
"grad_norm": 3.6135239601135254,
"learning_rate": 1.1670323642347778e-05,
"loss": 0.1851,
"step": 279500
},
{
"epoch": 76.79648930334614,
"grad_norm": 4.262831687927246,
"learning_rate": 1.1601755348326934e-05,
"loss": 0.187,
"step": 280000
},
{
"epoch": 76.93362589138782,
"grad_norm": 3.7981927394866943,
"learning_rate": 1.153318705430609e-05,
"loss": 0.1867,
"step": 280500
},
{
"epoch": 77.07076247942952,
"grad_norm": 3.799161434173584,
"learning_rate": 1.1464618760285245e-05,
"loss": 0.1781,
"step": 281000
},
{
"epoch": 77.2078990674712,
"grad_norm": 3.511946201324463,
"learning_rate": 1.1396050466264401e-05,
"loss": 0.1721,
"step": 281500
},
{
"epoch": 77.34503565551289,
"grad_norm": 3.6062841415405273,
"learning_rate": 1.1327482172243554e-05,
"loss": 0.1768,
"step": 282000
},
{
"epoch": 77.48217224355459,
"grad_norm": 3.6229002475738525,
"learning_rate": 1.125891387822271e-05,
"loss": 0.1761,
"step": 282500
},
{
"epoch": 77.61930883159627,
"grad_norm": 4.036831378936768,
"learning_rate": 1.1190345584201866e-05,
"loss": 0.1775,
"step": 283000
},
{
"epoch": 77.75644541963796,
"grad_norm": 3.842072010040283,
"learning_rate": 1.112177729018102e-05,
"loss": 0.1794,
"step": 283500
},
{
"epoch": 77.89358200767965,
"grad_norm": 4.432040691375732,
"learning_rate": 1.1053208996160177e-05,
"loss": 0.1809,
"step": 284000
},
{
"epoch": 78.03071859572134,
"grad_norm": 3.7242350578308105,
"learning_rate": 1.0984640702139331e-05,
"loss": 0.1777,
"step": 284500
},
{
"epoch": 78.16785518376302,
"grad_norm": 3.5870072841644287,
"learning_rate": 1.0916072408118487e-05,
"loss": 0.1685,
"step": 285000
},
{
"epoch": 78.30499177180472,
"grad_norm": 4.315713405609131,
"learning_rate": 1.0847504114097642e-05,
"loss": 0.1691,
"step": 285500
},
{
"epoch": 78.44212835984641,
"grad_norm": 4.229913234710693,
"learning_rate": 1.0778935820076796e-05,
"loss": 0.1694,
"step": 286000
},
{
"epoch": 78.57926494788809,
"grad_norm": 4.238448143005371,
"learning_rate": 1.0710367526055953e-05,
"loss": 0.1722,
"step": 286500
},
{
"epoch": 78.71640153592979,
"grad_norm": 3.810060739517212,
"learning_rate": 1.0641799232035107e-05,
"loss": 0.1739,
"step": 287000
},
{
"epoch": 78.85353812397148,
"grad_norm": 3.8846802711486816,
"learning_rate": 1.0573230938014263e-05,
"loss": 0.1743,
"step": 287500
},
{
"epoch": 78.99067471201316,
"grad_norm": 3.194765567779541,
"learning_rate": 1.0504662643993418e-05,
"loss": 0.1771,
"step": 288000
},
{
"epoch": 79.12781130005486,
"grad_norm": 3.9391047954559326,
"learning_rate": 1.0436094349972574e-05,
"loss": 0.1636,
"step": 288500
},
{
"epoch": 79.26494788809654,
"grad_norm": 4.282817840576172,
"learning_rate": 1.0367526055951728e-05,
"loss": 0.1637,
"step": 289000
},
{
"epoch": 79.40208447613823,
"grad_norm": 3.725553512573242,
"learning_rate": 1.0298957761930883e-05,
"loss": 0.167,
"step": 289500
},
{
"epoch": 79.53922106417993,
"grad_norm": 3.7785303592681885,
"learning_rate": 1.0230389467910039e-05,
"loss": 0.1674,
"step": 290000
},
{
"epoch": 79.67635765222161,
"grad_norm": 3.667619228363037,
"learning_rate": 1.0161821173889193e-05,
"loss": 0.1661,
"step": 290500
},
{
"epoch": 79.8134942402633,
"grad_norm": 3.732048273086548,
"learning_rate": 1.009325287986835e-05,
"loss": 0.1696,
"step": 291000
},
{
"epoch": 79.950630828305,
"grad_norm": 4.32537841796875,
"learning_rate": 1.0024684585847504e-05,
"loss": 0.1697,
"step": 291500
},
{
"epoch": 80.08776741634668,
"grad_norm": 3.7802329063415527,
"learning_rate": 9.95611629182666e-06,
"loss": 0.1632,
"step": 292000
},
{
"epoch": 80.22490400438836,
"grad_norm": 4.236711025238037,
"learning_rate": 9.887547997805815e-06,
"loss": 0.1587,
"step": 292500
},
{
"epoch": 80.36204059243006,
"grad_norm": 3.8807108402252197,
"learning_rate": 9.818979703784971e-06,
"loss": 0.1588,
"step": 293000
},
{
"epoch": 80.49917718047175,
"grad_norm": 3.935448408126831,
"learning_rate": 9.750411409764125e-06,
"loss": 0.1598,
"step": 293500
},
{
"epoch": 80.63631376851345,
"grad_norm": 3.9982056617736816,
"learning_rate": 9.68184311574328e-06,
"loss": 0.1623,
"step": 294000
},
{
"epoch": 80.77345035655513,
"grad_norm": 4.14504337310791,
"learning_rate": 9.613274821722436e-06,
"loss": 0.1641,
"step": 294500
},
{
"epoch": 80.91058694459682,
"grad_norm": 3.4991772174835205,
"learning_rate": 9.54470652770159e-06,
"loss": 0.1647,
"step": 295000
},
{
"epoch": 81.04772353263851,
"grad_norm": 3.483520030975342,
"learning_rate": 9.476138233680747e-06,
"loss": 0.1608,
"step": 295500
},
{
"epoch": 81.1848601206802,
"grad_norm": 3.9691319465637207,
"learning_rate": 9.407569939659903e-06,
"loss": 0.1542,
"step": 296000
},
{
"epoch": 81.32199670872188,
"grad_norm": 4.031587600708008,
"learning_rate": 9.339001645639057e-06,
"loss": 0.1551,
"step": 296500
},
{
"epoch": 81.45913329676358,
"grad_norm": 3.9734628200531006,
"learning_rate": 9.270433351618212e-06,
"loss": 0.1533,
"step": 297000
},
{
"epoch": 81.59626988480527,
"grad_norm": 3.245915651321411,
"learning_rate": 9.201865057597366e-06,
"loss": 0.1573,
"step": 297500
},
{
"epoch": 81.73340647284695,
"grad_norm": 3.699833393096924,
"learning_rate": 9.133296763576522e-06,
"loss": 0.1575,
"step": 298000
},
{
"epoch": 81.87054306088865,
"grad_norm": 3.8309028148651123,
"learning_rate": 9.064728469555677e-06,
"loss": 0.1581,
"step": 298500
},
{
"epoch": 82.00767964893033,
"grad_norm": 4.079482078552246,
"learning_rate": 8.996160175534833e-06,
"loss": 0.1593,
"step": 299000
},
{
"epoch": 82.14481623697202,
"grad_norm": 3.2036027908325195,
"learning_rate": 8.92759188151399e-06,
"loss": 0.1485,
"step": 299500
},
{
"epoch": 82.28195282501372,
"grad_norm": 3.7567873001098633,
"learning_rate": 8.859023587493144e-06,
"loss": 0.1502,
"step": 300000
},
{
"epoch": 82.4190894130554,
"grad_norm": 4.391474723815918,
"learning_rate": 8.7904552934723e-06,
"loss": 0.1517,
"step": 300500
},
{
"epoch": 82.55622600109709,
"grad_norm": 2.887322187423706,
"learning_rate": 8.721886999451453e-06,
"loss": 0.1535,
"step": 301000
},
{
"epoch": 82.69336258913879,
"grad_norm": 3.5882978439331055,
"learning_rate": 8.653318705430609e-06,
"loss": 0.1525,
"step": 301500
},
{
"epoch": 82.83049917718047,
"grad_norm": 3.38724946975708,
"learning_rate": 8.584750411409765e-06,
"loss": 0.1541,
"step": 302000
},
{
"epoch": 82.96763576522216,
"grad_norm": 3.582343578338623,
"learning_rate": 8.51618211738892e-06,
"loss": 0.1539,
"step": 302500
},
{
"epoch": 83.10477235326385,
"grad_norm": 3.700831413269043,
"learning_rate": 8.447613823368076e-06,
"loss": 0.1472,
"step": 303000
},
{
"epoch": 83.24190894130554,
"grad_norm": 3.810107707977295,
"learning_rate": 8.37904552934723e-06,
"loss": 0.1438,
"step": 303500
},
{
"epoch": 83.37904552934722,
"grad_norm": 3.461057424545288,
"learning_rate": 8.310477235326386e-06,
"loss": 0.1468,
"step": 304000
},
{
"epoch": 83.51618211738892,
"grad_norm": 3.1016461849212646,
"learning_rate": 8.24190894130554e-06,
"loss": 0.147,
"step": 304500
},
{
"epoch": 83.6533187054306,
"grad_norm": 3.615780830383301,
"learning_rate": 8.173340647284695e-06,
"loss": 0.1484,
"step": 305000
},
{
"epoch": 83.79045529347229,
"grad_norm": 3.3265013694763184,
"learning_rate": 8.104772353263851e-06,
"loss": 0.1468,
"step": 305500
},
{
"epoch": 83.92759188151399,
"grad_norm": 3.722999334335327,
"learning_rate": 8.036204059243006e-06,
"loss": 0.1503,
"step": 306000
},
{
"epoch": 84.06472846955567,
"grad_norm": 3.4315872192382812,
"learning_rate": 7.967635765222162e-06,
"loss": 0.1465,
"step": 306500
},
{
"epoch": 84.20186505759737,
"grad_norm": 3.664315700531006,
"learning_rate": 7.899067471201317e-06,
"loss": 0.1414,
"step": 307000
},
{
"epoch": 84.33900164563906,
"grad_norm": 2.992607831954956,
"learning_rate": 7.830499177180473e-06,
"loss": 0.1412,
"step": 307500
},
{
"epoch": 84.47613823368074,
"grad_norm": 3.560657024383545,
"learning_rate": 7.761930883159627e-06,
"loss": 0.1425,
"step": 308000
},
{
"epoch": 84.61327482172244,
"grad_norm": 4.001883506774902,
"learning_rate": 7.693362589138782e-06,
"loss": 0.145,
"step": 308500
},
{
"epoch": 84.75041140976413,
"grad_norm": 3.371948480606079,
"learning_rate": 7.624794295117937e-06,
"loss": 0.1476,
"step": 309000
},
{
"epoch": 84.88754799780581,
"grad_norm": 3.9280834197998047,
"learning_rate": 7.556226001097093e-06,
"loss": 0.1455,
"step": 309500
},
{
"epoch": 85.02468458584751,
"grad_norm": 3.2914552688598633,
"learning_rate": 7.4876577070762485e-06,
"loss": 0.1434,
"step": 310000
},
{
"epoch": 85.1618211738892,
"grad_norm": 3.4161980152130127,
"learning_rate": 7.419089413055404e-06,
"loss": 0.138,
"step": 310500
},
{
"epoch": 85.29895776193088,
"grad_norm": 3.9036171436309814,
"learning_rate": 7.350521119034559e-06,
"loss": 0.1393,
"step": 311000
},
{
"epoch": 85.43609434997258,
"grad_norm": 3.8328452110290527,
"learning_rate": 7.2819528250137145e-06,
"loss": 0.1389,
"step": 311500
},
{
"epoch": 85.57323093801426,
"grad_norm": 3.2638742923736572,
"learning_rate": 7.21338453099287e-06,
"loss": 0.1387,
"step": 312000
},
{
"epoch": 85.71036752605595,
"grad_norm": 3.8440749645233154,
"learning_rate": 7.144816236972024e-06,
"loss": 0.1413,
"step": 312500
},
{
"epoch": 85.84750411409765,
"grad_norm": 4.172990798950195,
"learning_rate": 7.07624794295118e-06,
"loss": 0.1409,
"step": 313000
},
{
"epoch": 85.98464070213933,
"grad_norm": 3.7025864124298096,
"learning_rate": 7.007679648930335e-06,
"loss": 0.1412,
"step": 313500
},
{
"epoch": 86.12177729018102,
"grad_norm": 2.4466094970703125,
"learning_rate": 6.93911135490949e-06,
"loss": 0.1346,
"step": 314000
},
{
"epoch": 86.25891387822271,
"grad_norm": 3.610511541366577,
"learning_rate": 6.870543060888646e-06,
"loss": 0.1346,
"step": 314500
},
{
"epoch": 86.3960504662644,
"grad_norm": 3.2303617000579834,
"learning_rate": 6.801974766867801e-06,
"loss": 0.1357,
"step": 315000
},
{
"epoch": 86.53318705430608,
"grad_norm": 3.74819016456604,
"learning_rate": 6.733406472846956e-06,
"loss": 0.1377,
"step": 315500
},
{
"epoch": 86.67032364234778,
"grad_norm": 3.3001086711883545,
"learning_rate": 6.664838178826111e-06,
"loss": 0.1374,
"step": 316000
},
{
"epoch": 86.80746023038947,
"grad_norm": 3.8687660694122314,
"learning_rate": 6.596269884805266e-06,
"loss": 0.1381,
"step": 316500
},
{
"epoch": 86.94459681843115,
"grad_norm": 3.627427101135254,
"learning_rate": 6.527701590784421e-06,
"loss": 0.1374,
"step": 317000
},
{
"epoch": 87.08173340647285,
"grad_norm": 3.2286431789398193,
"learning_rate": 6.459133296763577e-06,
"loss": 0.1327,
"step": 317500
},
{
"epoch": 87.21886999451453,
"grad_norm": 2.8570611476898193,
"learning_rate": 6.390565002742732e-06,
"loss": 0.1322,
"step": 318000
},
{
"epoch": 87.35600658255622,
"grad_norm": 3.3692467212677,
"learning_rate": 6.321996708721887e-06,
"loss": 0.1315,
"step": 318500
},
{
"epoch": 87.49314317059792,
"grad_norm": 3.5185604095458984,
"learning_rate": 6.253428414701043e-06,
"loss": 0.1327,
"step": 319000
},
{
"epoch": 87.6302797586396,
"grad_norm": 3.416106700897217,
"learning_rate": 6.184860120680198e-06,
"loss": 0.1328,
"step": 319500
},
{
"epoch": 87.7674163466813,
"grad_norm": 2.7670998573303223,
"learning_rate": 6.116291826659353e-06,
"loss": 0.1325,
"step": 320000
},
{
"epoch": 87.90455293472299,
"grad_norm": 3.5294463634490967,
"learning_rate": 6.047723532638509e-06,
"loss": 0.1355,
"step": 320500
},
{
"epoch": 88.04168952276467,
"grad_norm": 2.728625535964966,
"learning_rate": 5.979155238617663e-06,
"loss": 0.1316,
"step": 321000
},
{
"epoch": 88.17882611080637,
"grad_norm": 3.675401449203491,
"learning_rate": 5.9105869445968184e-06,
"loss": 0.1279,
"step": 321500
},
{
"epoch": 88.31596269884805,
"grad_norm": 3.3878486156463623,
"learning_rate": 5.842018650575974e-06,
"loss": 0.1286,
"step": 322000
},
{
"epoch": 88.45309928688974,
"grad_norm": 3.215028762817383,
"learning_rate": 5.773450356555129e-06,
"loss": 0.1285,
"step": 322500
},
{
"epoch": 88.59023587493144,
"grad_norm": 3.3920953273773193,
"learning_rate": 5.704882062534284e-06,
"loss": 0.1309,
"step": 323000
},
{
"epoch": 88.72737246297312,
"grad_norm": 4.03735876083374,
"learning_rate": 5.63631376851344e-06,
"loss": 0.1288,
"step": 323500
},
{
"epoch": 88.8645090510148,
"grad_norm": 3.8700907230377197,
"learning_rate": 5.567745474492595e-06,
"loss": 0.1314,
"step": 324000
},
{
"epoch": 89.0016456390565,
"grad_norm": 3.8290393352508545,
"learning_rate": 5.4991771804717495e-06,
"loss": 0.1296,
"step": 324500
},
{
"epoch": 89.13878222709819,
"grad_norm": 3.1456034183502197,
"learning_rate": 5.430608886450905e-06,
"loss": 0.1225,
"step": 325000
},
{
"epoch": 89.27591881513987,
"grad_norm": 3.4296352863311768,
"learning_rate": 5.362040592430061e-06,
"loss": 0.1241,
"step": 325500
},
{
"epoch": 89.41305540318157,
"grad_norm": 3.2781150341033936,
"learning_rate": 5.293472298409216e-06,
"loss": 0.1245,
"step": 326000
},
{
"epoch": 89.55019199122326,
"grad_norm": 2.664435625076294,
"learning_rate": 5.224904004388371e-06,
"loss": 0.1266,
"step": 326500
},
{
"epoch": 89.68732857926494,
"grad_norm": 4.348361015319824,
"learning_rate": 5.156335710367526e-06,
"loss": 0.127,
"step": 327000
},
{
"epoch": 89.82446516730664,
"grad_norm": 3.075655698776245,
"learning_rate": 5.0877674163466815e-06,
"loss": 0.1255,
"step": 327500
},
{
"epoch": 89.96160175534833,
"grad_norm": 3.5324909687042236,
"learning_rate": 5.019199122325837e-06,
"loss": 0.1269,
"step": 328000
},
{
"epoch": 90.09873834339001,
"grad_norm": 2.883422374725342,
"learning_rate": 4.950630828304992e-06,
"loss": 0.1237,
"step": 328500
},
{
"epoch": 90.23587493143171,
"grad_norm": 3.225177049636841,
"learning_rate": 4.8820625342841474e-06,
"loss": 0.1221,
"step": 329000
},
{
"epoch": 90.3730115194734,
"grad_norm": 3.199986457824707,
"learning_rate": 4.813494240263303e-06,
"loss": 0.1225,
"step": 329500
},
{
"epoch": 90.51014810751508,
"grad_norm": 3.1552860736846924,
"learning_rate": 4.744925946242457e-06,
"loss": 0.1231,
"step": 330000
},
{
"epoch": 90.64728469555678,
"grad_norm": 3.491950750350952,
"learning_rate": 4.6763576522216126e-06,
"loss": 0.1227,
"step": 330500
},
{
"epoch": 90.78442128359846,
"grad_norm": 3.0924017429351807,
"learning_rate": 4.607789358200768e-06,
"loss": 0.1235,
"step": 331000
},
{
"epoch": 90.92155787164015,
"grad_norm": 3.1873390674591064,
"learning_rate": 4.539221064179924e-06,
"loss": 0.1217,
"step": 331500
},
{
"epoch": 91.05869445968185,
"grad_norm": 3.9850494861602783,
"learning_rate": 4.4706527701590785e-06,
"loss": 0.122,
"step": 332000
},
{
"epoch": 91.19583104772353,
"grad_norm": 3.238954782485962,
"learning_rate": 4.402084476138234e-06,
"loss": 0.1207,
"step": 332500
},
{
"epoch": 91.33296763576523,
"grad_norm": 2.354977607727051,
"learning_rate": 4.333516182117389e-06,
"loss": 0.1207,
"step": 333000
},
{
"epoch": 91.47010422380691,
"grad_norm": 3.9573888778686523,
"learning_rate": 4.2649478880965445e-06,
"loss": 0.1195,
"step": 333500
},
{
"epoch": 91.6072408118486,
"grad_norm": 3.099452495574951,
"learning_rate": 4.196379594075699e-06,
"loss": 0.1199,
"step": 334000
},
{
"epoch": 91.7443773998903,
"grad_norm": 2.886826753616333,
"learning_rate": 4.127811300054855e-06,
"loss": 0.1191,
"step": 334500
},
{
"epoch": 91.88151398793198,
"grad_norm": 2.478618860244751,
"learning_rate": 4.0592430060340105e-06,
"loss": 0.121,
"step": 335000
},
{
"epoch": 92.01865057597367,
"grad_norm": 3.46500301361084,
"learning_rate": 3.990674712013166e-06,
"loss": 0.1201,
"step": 335500
},
{
"epoch": 92.15578716401536,
"grad_norm": 2.743831157684326,
"learning_rate": 3.92210641799232e-06,
"loss": 0.1162,
"step": 336000
},
{
"epoch": 92.29292375205705,
"grad_norm": 3.4375343322753906,
"learning_rate": 3.853538123971476e-06,
"loss": 0.1159,
"step": 336500
},
{
"epoch": 92.43006034009873,
"grad_norm": 3.173588991165161,
"learning_rate": 3.7849698299506313e-06,
"loss": 0.1183,
"step": 337000
},
{
"epoch": 92.56719692814043,
"grad_norm": 3.2577898502349854,
"learning_rate": 3.7164015359297867e-06,
"loss": 0.1167,
"step": 337500
},
{
"epoch": 92.70433351618212,
"grad_norm": 3.3100554943084717,
"learning_rate": 3.647833241908941e-06,
"loss": 0.1173,
"step": 338000
},
{
"epoch": 92.8414701042238,
"grad_norm": 3.179342269897461,
"learning_rate": 3.579264947888097e-06,
"loss": 0.1165,
"step": 338500
},
{
"epoch": 92.9786066922655,
"grad_norm": 3.096334218978882,
"learning_rate": 3.510696653867252e-06,
"loss": 0.1176,
"step": 339000
},
{
"epoch": 93.11574328030719,
"grad_norm": 2.9532058238983154,
"learning_rate": 3.4421283598464067e-06,
"loss": 0.1142,
"step": 339500
},
{
"epoch": 93.25287986834887,
"grad_norm": 3.717654228210449,
"learning_rate": 3.3735600658255624e-06,
"loss": 0.1143,
"step": 340000
},
{
"epoch": 93.39001645639057,
"grad_norm": 3.084181308746338,
"learning_rate": 3.3049917718047177e-06,
"loss": 0.1146,
"step": 340500
},
{
"epoch": 93.52715304443225,
"grad_norm": 3.636079788208008,
"learning_rate": 3.236423477783873e-06,
"loss": 0.1147,
"step": 341000
},
{
"epoch": 93.66428963247394,
"grad_norm": 2.80279278755188,
"learning_rate": 3.167855183763028e-06,
"loss": 0.1153,
"step": 341500
},
{
"epoch": 93.80142622051564,
"grad_norm": 2.7597951889038086,
"learning_rate": 3.0992868897421833e-06,
"loss": 0.1133,
"step": 342000
},
{
"epoch": 93.93856280855732,
"grad_norm": 3.1757214069366455,
"learning_rate": 3.0307185957213386e-06,
"loss": 0.1159,
"step": 342500
},
{
"epoch": 94.075699396599,
"grad_norm": 3.245447874069214,
"learning_rate": 2.962150301700494e-06,
"loss": 0.1129,
"step": 343000
},
{
"epoch": 94.2128359846407,
"grad_norm": 2.7797350883483887,
"learning_rate": 2.8935820076796493e-06,
"loss": 0.1117,
"step": 343500
},
{
"epoch": 94.34997257268239,
"grad_norm": 3.2236897945404053,
"learning_rate": 2.825013713658804e-06,
"loss": 0.112,
"step": 344000
},
{
"epoch": 94.48710916072407,
"grad_norm": 3.792973756790161,
"learning_rate": 2.7564454196379595e-06,
"loss": 0.1118,
"step": 344500
},
{
"epoch": 94.62424574876577,
"grad_norm": 2.6465868949890137,
"learning_rate": 2.687877125617115e-06,
"loss": 0.1124,
"step": 345000
},
{
"epoch": 94.76138233680746,
"grad_norm": 2.944362163543701,
"learning_rate": 2.61930883159627e-06,
"loss": 0.113,
"step": 345500
},
{
"epoch": 94.89851892484916,
"grad_norm": 3.0111756324768066,
"learning_rate": 2.550740537575425e-06,
"loss": 0.1114,
"step": 346000
},
{
"epoch": 95.03565551289084,
"grad_norm": 3.691293954849243,
"learning_rate": 2.4821722435545808e-06,
"loss": 0.1119,
"step": 346500
},
{
"epoch": 95.17279210093253,
"grad_norm": 2.5828378200531006,
"learning_rate": 2.4136039495337357e-06,
"loss": 0.1106,
"step": 347000
},
{
"epoch": 95.30992868897422,
"grad_norm": 3.733536720275879,
"learning_rate": 2.3450356555128906e-06,
"loss": 0.1091,
"step": 347500
},
{
"epoch": 95.44706527701591,
"grad_norm": 3.203916311264038,
"learning_rate": 2.2764673614920463e-06,
"loss": 0.1102,
"step": 348000
},
{
"epoch": 95.5842018650576,
"grad_norm": 2.8628923892974854,
"learning_rate": 2.2078990674712012e-06,
"loss": 0.1123,
"step": 348500
},
{
"epoch": 95.72133845309929,
"grad_norm": 3.761380195617676,
"learning_rate": 2.1393307734503565e-06,
"loss": 0.1097,
"step": 349000
},
{
"epoch": 95.85847504114098,
"grad_norm": 2.951045036315918,
"learning_rate": 2.070762479429512e-06,
"loss": 0.1106,
"step": 349500
},
{
"epoch": 95.99561162918266,
"grad_norm": 3.6867475509643555,
"learning_rate": 2.002194185408667e-06,
"loss": 0.1098,
"step": 350000
},
{
"epoch": 96.13274821722436,
"grad_norm": 3.162787675857544,
"learning_rate": 1.933625891387822e-06,
"loss": 0.1077,
"step": 350500
},
{
"epoch": 96.26988480526605,
"grad_norm": 3.5869784355163574,
"learning_rate": 1.8650575973669776e-06,
"loss": 0.1084,
"step": 351000
},
{
"epoch": 96.40702139330773,
"grad_norm": 3.4423720836639404,
"learning_rate": 1.7964893033461327e-06,
"loss": 0.1075,
"step": 351500
},
{
"epoch": 96.54415798134943,
"grad_norm": 3.4415297508239746,
"learning_rate": 1.727921009325288e-06,
"loss": 0.1081,
"step": 352000
},
{
"epoch": 96.68129456939111,
"grad_norm": 2.9299986362457275,
"learning_rate": 1.6593527153044432e-06,
"loss": 0.1088,
"step": 352500
},
{
"epoch": 96.8184311574328,
"grad_norm": 3.395812511444092,
"learning_rate": 1.5907844212835987e-06,
"loss": 0.1073,
"step": 353000
},
{
"epoch": 96.9555677454745,
"grad_norm": 3.1126651763916016,
"learning_rate": 1.5222161272627538e-06,
"loss": 0.1086,
"step": 353500
},
{
"epoch": 97.09270433351618,
"grad_norm": 2.898881435394287,
"learning_rate": 1.453647833241909e-06,
"loss": 0.1067,
"step": 354000
},
{
"epoch": 97.22984092155787,
"grad_norm": 3.300261974334717,
"learning_rate": 1.3850795392210643e-06,
"loss": 0.1082,
"step": 354500
},
{
"epoch": 97.36697750959956,
"grad_norm": 3.1039366722106934,
"learning_rate": 1.3165112452002194e-06,
"loss": 0.1073,
"step": 355000
},
{
"epoch": 97.50411409764125,
"grad_norm": 3.544015645980835,
"learning_rate": 1.2479429511793747e-06,
"loss": 0.1075,
"step": 355500
},
{
"epoch": 97.64125068568293,
"grad_norm": 2.692314624786377,
"learning_rate": 1.17937465715853e-06,
"loss": 0.1075,
"step": 356000
},
{
"epoch": 97.77838727372463,
"grad_norm": 2.966008186340332,
"learning_rate": 1.1108063631376851e-06,
"loss": 0.1077,
"step": 356500
},
{
"epoch": 97.91552386176632,
"grad_norm": 2.9783902168273926,
"learning_rate": 1.0422380691168404e-06,
"loss": 0.1076,
"step": 357000
},
{
"epoch": 98.052660449808,
"grad_norm": 3.1313674449920654,
"learning_rate": 9.736697750959958e-07,
"loss": 0.1052,
"step": 357500
},
{
"epoch": 98.1897970378497,
"grad_norm": 3.143101453781128,
"learning_rate": 9.05101481075151e-07,
"loss": 0.1038,
"step": 358000
},
{
"epoch": 98.32693362589139,
"grad_norm": 3.30661940574646,
"learning_rate": 8.365331870543062e-07,
"loss": 0.1059,
"step": 358500
},
{
"epoch": 98.46407021393308,
"grad_norm": 3.1109259128570557,
"learning_rate": 7.679648930334613e-07,
"loss": 0.1062,
"step": 359000
},
{
"epoch": 98.60120680197477,
"grad_norm": 3.4787518978118896,
"learning_rate": 6.993965990126166e-07,
"loss": 0.1037,
"step": 359500
},
{
"epoch": 98.73834339001645,
"grad_norm": 3.0321710109710693,
"learning_rate": 6.308283049917719e-07,
"loss": 0.1056,
"step": 360000
},
{
"epoch": 98.87547997805815,
"grad_norm": 3.13843035697937,
"learning_rate": 5.622600109709271e-07,
"loss": 0.1058,
"step": 360500
},
{
"epoch": 99.01261656609984,
"grad_norm": 2.8458125591278076,
"learning_rate": 4.936917169500823e-07,
"loss": 0.1051,
"step": 361000
},
{
"epoch": 99.14975315414152,
"grad_norm": 2.781649589538574,
"learning_rate": 4.2512342292923756e-07,
"loss": 0.1039,
"step": 361500
},
{
"epoch": 99.28688974218322,
"grad_norm": 3.680230140686035,
"learning_rate": 3.565551289083928e-07,
"loss": 0.1046,
"step": 362000
},
{
"epoch": 99.4240263302249,
"grad_norm": 3.4057164192199707,
"learning_rate": 2.87986834887548e-07,
"loss": 0.1064,
"step": 362500
},
{
"epoch": 99.56116291826659,
"grad_norm": 3.2353737354278564,
"learning_rate": 2.1941854086670326e-07,
"loss": 0.1056,
"step": 363000
},
{
"epoch": 99.69829950630829,
"grad_norm": 3.273487091064453,
"learning_rate": 1.5085024684585848e-07,
"loss": 0.1034,
"step": 363500
},
{
"epoch": 99.83543609434997,
"grad_norm": 2.404613733291626,
"learning_rate": 8.228195282501371e-08,
"loss": 0.1058,
"step": 364000
},
{
"epoch": 99.97257268239166,
"grad_norm": 3.081162691116333,
"learning_rate": 1.3713658804168954e-08,
"loss": 0.1047,
"step": 364500
},
{
"epoch": 100.0,
"step": 364600,
"total_flos": 1.1071101974803907e+18,
"train_loss": 1.123260397659967,
"train_runtime": 96380.4057,
"train_samples_per_second": 121.035,
"train_steps_per_second": 3.783
}
],
"logging_steps": 500,
"max_steps": 364600,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1071101974803907e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}