tinybert-javanese / trainer_state.json
akahana's picture
End of training
34bd5ab verified
raw
history blame
84.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 238700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10473397570171764,
"grad_norm": 1.1384308338165283,
"learning_rate": 4.989526602429829e-05,
"loss": 8.9672,
"step": 500
},
{
"epoch": 0.20946795140343527,
"grad_norm": 0.9985808730125427,
"learning_rate": 4.979053204859657e-05,
"loss": 7.7253,
"step": 1000
},
{
"epoch": 0.31420192710515293,
"grad_norm": 1.0521348714828491,
"learning_rate": 4.968579807289485e-05,
"loss": 7.5614,
"step": 1500
},
{
"epoch": 0.41893590280687054,
"grad_norm": 1.0602227449417114,
"learning_rate": 4.958106409719313e-05,
"loss": 7.5037,
"step": 2000
},
{
"epoch": 0.5236698785085881,
"grad_norm": 1.6102268695831299,
"learning_rate": 4.9476330121491414e-05,
"loss": 7.4595,
"step": 2500
},
{
"epoch": 0.6284038542103059,
"grad_norm": 1.335976004600525,
"learning_rate": 4.9371596145789694e-05,
"loss": 7.4267,
"step": 3000
},
{
"epoch": 0.7331378299120235,
"grad_norm": 1.340728998184204,
"learning_rate": 4.926686217008798e-05,
"loss": 7.392,
"step": 3500
},
{
"epoch": 0.8378718056137411,
"grad_norm": 1.4520059823989868,
"learning_rate": 4.916212819438626e-05,
"loss": 7.3253,
"step": 4000
},
{
"epoch": 0.9426057813154587,
"grad_norm": 1.7685532569885254,
"learning_rate": 4.905760368663595e-05,
"loss": 7.3204,
"step": 4500
},
{
"epoch": 1.0473397570171763,
"grad_norm": 1.396130084991455,
"learning_rate": 4.8952869710934226e-05,
"loss": 7.2682,
"step": 5000
},
{
"epoch": 1.1520737327188941,
"grad_norm": 1.5962079763412476,
"learning_rate": 4.884813573523251e-05,
"loss": 7.2441,
"step": 5500
},
{
"epoch": 1.2568077084206117,
"grad_norm": 1.6328166723251343,
"learning_rate": 4.874340175953079e-05,
"loss": 7.2216,
"step": 6000
},
{
"epoch": 1.3615416841223293,
"grad_norm": 1.8534362316131592,
"learning_rate": 4.863887725178048e-05,
"loss": 7.1957,
"step": 6500
},
{
"epoch": 1.466275659824047,
"grad_norm": 1.4871692657470703,
"learning_rate": 4.8534143276078766e-05,
"loss": 7.1561,
"step": 7000
},
{
"epoch": 1.5710096355257646,
"grad_norm": 1.8590672016143799,
"learning_rate": 4.8429409300377045e-05,
"loss": 7.1397,
"step": 7500
},
{
"epoch": 1.6757436112274822,
"grad_norm": 1.7009446620941162,
"learning_rate": 4.8324675324675325e-05,
"loss": 7.1149,
"step": 8000
},
{
"epoch": 1.7804775869291998,
"grad_norm": 1.9020010232925415,
"learning_rate": 4.822015081692501e-05,
"loss": 7.1215,
"step": 8500
},
{
"epoch": 1.8852115626309174,
"grad_norm": 2.912442445755005,
"learning_rate": 4.811541684122329e-05,
"loss": 7.0616,
"step": 9000
},
{
"epoch": 1.989945538332635,
"grad_norm": 2.654263496398926,
"learning_rate": 4.801068286552158e-05,
"loss": 7.0508,
"step": 9500
},
{
"epoch": 2.0946795140343526,
"grad_norm": 2.1642003059387207,
"learning_rate": 4.790594888981986e-05,
"loss": 7.0251,
"step": 10000
},
{
"epoch": 2.19941348973607,
"grad_norm": 1.9420874118804932,
"learning_rate": 4.7801424382069544e-05,
"loss": 6.9806,
"step": 10500
},
{
"epoch": 2.3041474654377883,
"grad_norm": 2.2306201457977295,
"learning_rate": 4.769669040636783e-05,
"loss": 6.9721,
"step": 11000
},
{
"epoch": 2.4088814411395054,
"grad_norm": 2.8180389404296875,
"learning_rate": 4.759195643066611e-05,
"loss": 6.9582,
"step": 11500
},
{
"epoch": 2.5136154168412235,
"grad_norm": 2.387949228286743,
"learning_rate": 4.748722245496439e-05,
"loss": 6.9211,
"step": 12000
},
{
"epoch": 2.618349392542941,
"grad_norm": 3.3709394931793213,
"learning_rate": 4.7382697947214076e-05,
"loss": 6.9183,
"step": 12500
},
{
"epoch": 2.7230833682446587,
"grad_norm": 2.567798376083374,
"learning_rate": 4.727796397151236e-05,
"loss": 6.8732,
"step": 13000
},
{
"epoch": 2.8278173439463763,
"grad_norm": 2.6373414993286133,
"learning_rate": 4.717322999581064e-05,
"loss": 6.8658,
"step": 13500
},
{
"epoch": 2.932551319648094,
"grad_norm": 2.2950875759124756,
"learning_rate": 4.706849602010893e-05,
"loss": 6.8436,
"step": 14000
},
{
"epoch": 3.0372852953498115,
"grad_norm": 4.0021514892578125,
"learning_rate": 4.696397151235861e-05,
"loss": 6.8496,
"step": 14500
},
{
"epoch": 3.142019271051529,
"grad_norm": 3.289193630218506,
"learning_rate": 4.685923753665689e-05,
"loss": 6.8047,
"step": 15000
},
{
"epoch": 3.2467532467532467,
"grad_norm": 2.9973654747009277,
"learning_rate": 4.6754503560955175e-05,
"loss": 6.7651,
"step": 15500
},
{
"epoch": 3.3514872224549643,
"grad_norm": 2.9979376792907715,
"learning_rate": 4.664976958525346e-05,
"loss": 6.7768,
"step": 16000
},
{
"epoch": 3.456221198156682,
"grad_norm": 3.263784885406494,
"learning_rate": 4.654524507750315e-05,
"loss": 6.7617,
"step": 16500
},
{
"epoch": 3.5609551738583995,
"grad_norm": 3.330116033554077,
"learning_rate": 4.644051110180143e-05,
"loss": 6.7417,
"step": 17000
},
{
"epoch": 3.665689149560117,
"grad_norm": 3.224337339401245,
"learning_rate": 4.6335777126099714e-05,
"loss": 6.7028,
"step": 17500
},
{
"epoch": 3.7704231252618348,
"grad_norm": 3.21891450881958,
"learning_rate": 4.623104315039799e-05,
"loss": 6.7101,
"step": 18000
},
{
"epoch": 3.875157100963553,
"grad_norm": 2.3559324741363525,
"learning_rate": 4.6126518642647674e-05,
"loss": 6.6892,
"step": 18500
},
{
"epoch": 3.97989107666527,
"grad_norm": 3.1527633666992188,
"learning_rate": 4.602178466694596e-05,
"loss": 6.6851,
"step": 19000
},
{
"epoch": 4.084625052366988,
"grad_norm": 2.9760189056396484,
"learning_rate": 4.591705069124424e-05,
"loss": 6.6704,
"step": 19500
},
{
"epoch": 4.189359028068705,
"grad_norm": 2.8135318756103516,
"learning_rate": 4.5812316715542526e-05,
"loss": 6.6621,
"step": 20000
},
{
"epoch": 4.294093003770423,
"grad_norm": 3.060316324234009,
"learning_rate": 4.570779220779221e-05,
"loss": 6.6298,
"step": 20500
},
{
"epoch": 4.39882697947214,
"grad_norm": 2.7130279541015625,
"learning_rate": 4.560305823209049e-05,
"loss": 6.628,
"step": 21000
},
{
"epoch": 4.5035609551738585,
"grad_norm": 3.156386613845825,
"learning_rate": 4.549853372434017e-05,
"loss": 6.6423,
"step": 21500
},
{
"epoch": 4.6082949308755765,
"grad_norm": 3.039471387863159,
"learning_rate": 4.539379974863846e-05,
"loss": 6.6343,
"step": 22000
},
{
"epoch": 4.713028906577294,
"grad_norm": 3.976949453353882,
"learning_rate": 4.5289065772936745e-05,
"loss": 6.606,
"step": 22500
},
{
"epoch": 4.817762882279011,
"grad_norm": 3.310382604598999,
"learning_rate": 4.5184331797235025e-05,
"loss": 6.5956,
"step": 23000
},
{
"epoch": 4.922496857980729,
"grad_norm": 3.5924322605133057,
"learning_rate": 4.507959782153331e-05,
"loss": 6.5965,
"step": 23500
},
{
"epoch": 5.027230833682447,
"grad_norm": 2.616468667984009,
"learning_rate": 4.497486384583159e-05,
"loss": 6.5788,
"step": 24000
},
{
"epoch": 5.131964809384164,
"grad_norm": 3.3178062438964844,
"learning_rate": 4.487012987012987e-05,
"loss": 6.5684,
"step": 24500
},
{
"epoch": 5.236698785085882,
"grad_norm": 3.7108089923858643,
"learning_rate": 4.476539589442815e-05,
"loss": 6.5756,
"step": 25000
},
{
"epoch": 5.341432760787599,
"grad_norm": 3.396498918533325,
"learning_rate": 4.466087138667784e-05,
"loss": 6.5678,
"step": 25500
},
{
"epoch": 5.446166736489317,
"grad_norm": 3.7245748043060303,
"learning_rate": 4.4556137410976123e-05,
"loss": 6.5578,
"step": 26000
},
{
"epoch": 5.5509007121910345,
"grad_norm": 3.6525135040283203,
"learning_rate": 4.44514034352744e-05,
"loss": 6.5385,
"step": 26500
},
{
"epoch": 5.655634687892753,
"grad_norm": 3.4302523136138916,
"learning_rate": 4.434666945957269e-05,
"loss": 6.5143,
"step": 27000
},
{
"epoch": 5.76036866359447,
"grad_norm": 3.762871503829956,
"learning_rate": 4.4242144951822376e-05,
"loss": 6.52,
"step": 27500
},
{
"epoch": 5.865102639296188,
"grad_norm": 2.8195388317108154,
"learning_rate": 4.4137410976120656e-05,
"loss": 6.5213,
"step": 28000
},
{
"epoch": 5.969836614997905,
"grad_norm": 3.013187885284424,
"learning_rate": 4.4032677000418936e-05,
"loss": 6.5052,
"step": 28500
},
{
"epoch": 6.074570590699623,
"grad_norm": 2.9772274494171143,
"learning_rate": 4.392794302471722e-05,
"loss": 6.502,
"step": 29000
},
{
"epoch": 6.17930456640134,
"grad_norm": 3.2228713035583496,
"learning_rate": 4.38232090490155e-05,
"loss": 6.4889,
"step": 29500
},
{
"epoch": 6.284038542103058,
"grad_norm": 3.824286937713623,
"learning_rate": 4.371847507331379e-05,
"loss": 6.4792,
"step": 30000
},
{
"epoch": 6.388772517804776,
"grad_norm": 3.100308656692505,
"learning_rate": 4.361374109761207e-05,
"loss": 6.4816,
"step": 30500
},
{
"epoch": 6.4935064935064934,
"grad_norm": 3.4449245929718018,
"learning_rate": 4.350900712191035e-05,
"loss": 6.4786,
"step": 31000
},
{
"epoch": 6.5982404692082115,
"grad_norm": 3.6803085803985596,
"learning_rate": 4.3404482614160034e-05,
"loss": 6.4778,
"step": 31500
},
{
"epoch": 6.702974444909929,
"grad_norm": 3.6413722038269043,
"learning_rate": 4.329974863845832e-05,
"loss": 6.4782,
"step": 32000
},
{
"epoch": 6.807708420611647,
"grad_norm": 3.482905626296997,
"learning_rate": 4.31950146627566e-05,
"loss": 6.4719,
"step": 32500
},
{
"epoch": 6.912442396313364,
"grad_norm": 3.4605376720428467,
"learning_rate": 4.3090280687054887e-05,
"loss": 6.4458,
"step": 33000
},
{
"epoch": 7.017176372015082,
"grad_norm": 3.814375877380371,
"learning_rate": 4.2985965647255974e-05,
"loss": 6.4539,
"step": 33500
},
{
"epoch": 7.121910347716799,
"grad_norm": 4.238844871520996,
"learning_rate": 4.288123167155425e-05,
"loss": 6.4522,
"step": 34000
},
{
"epoch": 7.226644323418517,
"grad_norm": 2.8327670097351074,
"learning_rate": 4.277649769585253e-05,
"loss": 6.4383,
"step": 34500
},
{
"epoch": 7.331378299120234,
"grad_norm": 3.1451475620269775,
"learning_rate": 4.267176372015082e-05,
"loss": 6.4451,
"step": 35000
},
{
"epoch": 7.436112274821952,
"grad_norm": 3.6858575344085693,
"learning_rate": 4.2567239212400506e-05,
"loss": 6.4313,
"step": 35500
},
{
"epoch": 7.5408462505236695,
"grad_norm": 4.258295059204102,
"learning_rate": 4.2462505236698786e-05,
"loss": 6.4143,
"step": 36000
},
{
"epoch": 7.645580226225388,
"grad_norm": 4.3574676513671875,
"learning_rate": 4.235777126099707e-05,
"loss": 6.4061,
"step": 36500
},
{
"epoch": 7.750314201927106,
"grad_norm": 3.8001816272735596,
"learning_rate": 4.225303728529535e-05,
"loss": 6.3994,
"step": 37000
},
{
"epoch": 7.855048177628823,
"grad_norm": 3.487893581390381,
"learning_rate": 4.214851277754504e-05,
"loss": 6.4192,
"step": 37500
},
{
"epoch": 7.95978215333054,
"grad_norm": 3.9729723930358887,
"learning_rate": 4.204377880184332e-05,
"loss": 6.407,
"step": 38000
},
{
"epoch": 8.064516129032258,
"grad_norm": 3.4465062618255615,
"learning_rate": 4.1939044826141604e-05,
"loss": 6.3867,
"step": 38500
},
{
"epoch": 8.169250104733976,
"grad_norm": 3.706404685974121,
"learning_rate": 4.1834310850439884e-05,
"loss": 6.3877,
"step": 39000
},
{
"epoch": 8.273984080435694,
"grad_norm": 3.8204259872436523,
"learning_rate": 4.172957687473817e-05,
"loss": 6.3921,
"step": 39500
},
{
"epoch": 8.37871805613741,
"grad_norm": 3.4868948459625244,
"learning_rate": 4.162505236698786e-05,
"loss": 6.3729,
"step": 40000
},
{
"epoch": 8.483452031839128,
"grad_norm": 3.5007081031799316,
"learning_rate": 4.152031839128614e-05,
"loss": 6.3886,
"step": 40500
},
{
"epoch": 8.588186007540846,
"grad_norm": 2.937894582748413,
"learning_rate": 4.1415584415584417e-05,
"loss": 6.3814,
"step": 41000
},
{
"epoch": 8.692919983242565,
"grad_norm": 3.529237985610962,
"learning_rate": 4.1310850439882696e-05,
"loss": 6.3722,
"step": 41500
},
{
"epoch": 8.79765395894428,
"grad_norm": 3.883575677871704,
"learning_rate": 4.120611646418098e-05,
"loss": 6.3655,
"step": 42000
},
{
"epoch": 8.902387934645999,
"grad_norm": 4.439103603363037,
"learning_rate": 4.110159195643067e-05,
"loss": 6.3673,
"step": 42500
},
{
"epoch": 9.007121910347717,
"grad_norm": 4.103298664093018,
"learning_rate": 4.099685798072895e-05,
"loss": 6.3659,
"step": 43000
},
{
"epoch": 9.111855886049435,
"grad_norm": 3.491204023361206,
"learning_rate": 4.0892124005027235e-05,
"loss": 6.3744,
"step": 43500
},
{
"epoch": 9.216589861751151,
"grad_norm": 3.441976547241211,
"learning_rate": 4.0787390029325515e-05,
"loss": 6.3573,
"step": 44000
},
{
"epoch": 9.32132383745287,
"grad_norm": 3.58134126663208,
"learning_rate": 4.0682656053623795e-05,
"loss": 6.3407,
"step": 44500
},
{
"epoch": 9.426057813154587,
"grad_norm": 3.274592638015747,
"learning_rate": 4.057813154587348e-05,
"loss": 6.3373,
"step": 45000
},
{
"epoch": 9.530791788856305,
"grad_norm": 4.296390533447266,
"learning_rate": 4.047339757017177e-05,
"loss": 6.3499,
"step": 45500
},
{
"epoch": 9.635525764558023,
"grad_norm": 3.5000336170196533,
"learning_rate": 4.036866359447005e-05,
"loss": 6.3199,
"step": 46000
},
{
"epoch": 9.74025974025974,
"grad_norm": 3.4947054386138916,
"learning_rate": 4.0263929618768334e-05,
"loss": 6.3474,
"step": 46500
},
{
"epoch": 9.844993715961458,
"grad_norm": 3.3658857345581055,
"learning_rate": 4.0159195643066614e-05,
"loss": 6.3296,
"step": 47000
},
{
"epoch": 9.949727691663176,
"grad_norm": 2.9811642169952393,
"learning_rate": 4.0054671135316294e-05,
"loss": 6.345,
"step": 47500
},
{
"epoch": 10.054461667364894,
"grad_norm": 4.165875434875488,
"learning_rate": 3.994993715961458e-05,
"loss": 6.3209,
"step": 48000
},
{
"epoch": 10.15919564306661,
"grad_norm": 3.6118202209472656,
"learning_rate": 3.9845203183912866e-05,
"loss": 6.3175,
"step": 48500
},
{
"epoch": 10.263929618768328,
"grad_norm": 3.930669069290161,
"learning_rate": 3.9740469208211146e-05,
"loss": 6.3231,
"step": 49000
},
{
"epoch": 10.368663594470046,
"grad_norm": 3.1688554286956787,
"learning_rate": 3.963594470046083e-05,
"loss": 6.309,
"step": 49500
},
{
"epoch": 10.473397570171764,
"grad_norm": 3.6746394634246826,
"learning_rate": 3.953121072475911e-05,
"loss": 6.3077,
"step": 50000
},
{
"epoch": 10.57813154587348,
"grad_norm": 3.5134785175323486,
"learning_rate": 3.942647674905739e-05,
"loss": 6.3299,
"step": 50500
},
{
"epoch": 10.682865521575199,
"grad_norm": 3.2903287410736084,
"learning_rate": 3.932174277335568e-05,
"loss": 6.3178,
"step": 51000
},
{
"epoch": 10.787599497276917,
"grad_norm": 3.5344769954681396,
"learning_rate": 3.921700879765396e-05,
"loss": 6.3139,
"step": 51500
},
{
"epoch": 10.892333472978635,
"grad_norm": 3.5710573196411133,
"learning_rate": 3.9112274821952245e-05,
"loss": 6.306,
"step": 52000
},
{
"epoch": 10.997067448680351,
"grad_norm": 3.4193336963653564,
"learning_rate": 3.9007540846250524e-05,
"loss": 6.3129,
"step": 52500
},
{
"epoch": 11.101801424382069,
"grad_norm": 3.683143377304077,
"learning_rate": 3.890301633850021e-05,
"loss": 6.3084,
"step": 53000
},
{
"epoch": 11.206535400083787,
"grad_norm": 3.214221239089966,
"learning_rate": 3.879828236279849e-05,
"loss": 6.302,
"step": 53500
},
{
"epoch": 11.311269375785505,
"grad_norm": 3.5691747665405273,
"learning_rate": 3.869354838709678e-05,
"loss": 6.3164,
"step": 54000
},
{
"epoch": 11.416003351487223,
"grad_norm": 3.2734036445617676,
"learning_rate": 3.858881441139506e-05,
"loss": 6.2889,
"step": 54500
},
{
"epoch": 11.52073732718894,
"grad_norm": 4.049854278564453,
"learning_rate": 3.848428990364474e-05,
"loss": 6.2957,
"step": 55000
},
{
"epoch": 11.625471302890658,
"grad_norm": 3.837921380996704,
"learning_rate": 3.837955592794303e-05,
"loss": 6.2806,
"step": 55500
},
{
"epoch": 11.730205278592376,
"grad_norm": 3.4606828689575195,
"learning_rate": 3.827482195224131e-05,
"loss": 6.2896,
"step": 56000
},
{
"epoch": 11.834939254294094,
"grad_norm": 4.859198093414307,
"learning_rate": 3.8170087976539596e-05,
"loss": 6.273,
"step": 56500
},
{
"epoch": 11.93967322999581,
"grad_norm": 4.689023494720459,
"learning_rate": 3.806535400083787e-05,
"loss": 6.2776,
"step": 57000
},
{
"epoch": 12.044407205697528,
"grad_norm": 4.234752178192139,
"learning_rate": 3.7960829493087555e-05,
"loss": 6.282,
"step": 57500
},
{
"epoch": 12.149141181399246,
"grad_norm": 3.950773239135742,
"learning_rate": 3.785609551738584e-05,
"loss": 6.283,
"step": 58000
},
{
"epoch": 12.253875157100964,
"grad_norm": 4.1780548095703125,
"learning_rate": 3.775136154168412e-05,
"loss": 6.2635,
"step": 58500
},
{
"epoch": 12.35860913280268,
"grad_norm": 3.2049672603607178,
"learning_rate": 3.764662756598241e-05,
"loss": 6.2858,
"step": 59000
},
{
"epoch": 12.463343108504398,
"grad_norm": 3.863649606704712,
"learning_rate": 3.7541893590280694e-05,
"loss": 6.2609,
"step": 59500
},
{
"epoch": 12.568077084206116,
"grad_norm": 3.881343364715576,
"learning_rate": 3.743715961457897e-05,
"loss": 6.2695,
"step": 60000
},
{
"epoch": 12.672811059907835,
"grad_norm": 3.522132635116577,
"learning_rate": 3.7332635106828654e-05,
"loss": 6.2523,
"step": 60500
},
{
"epoch": 12.777545035609553,
"grad_norm": 4.043595790863037,
"learning_rate": 3.722790113112694e-05,
"loss": 6.2546,
"step": 61000
},
{
"epoch": 12.882279011311269,
"grad_norm": 3.4860141277313232,
"learning_rate": 3.712316715542522e-05,
"loss": 6.2468,
"step": 61500
},
{
"epoch": 12.987012987012987,
"grad_norm": 3.9201574325561523,
"learning_rate": 3.7018433179723506e-05,
"loss": 6.2615,
"step": 62000
},
{
"epoch": 13.091746962714705,
"grad_norm": 3.5582118034362793,
"learning_rate": 3.6913699204021786e-05,
"loss": 6.2529,
"step": 62500
},
{
"epoch": 13.196480938416423,
"grad_norm": 3.1254093647003174,
"learning_rate": 3.680917469627147e-05,
"loss": 6.2418,
"step": 63000
},
{
"epoch": 13.30121491411814,
"grad_norm": 4.058616638183594,
"learning_rate": 3.670444072056975e-05,
"loss": 6.243,
"step": 63500
},
{
"epoch": 13.405948889819857,
"grad_norm": 3.5146963596343994,
"learning_rate": 3.659970674486803e-05,
"loss": 6.2595,
"step": 64000
},
{
"epoch": 13.510682865521575,
"grad_norm": 3.804818630218506,
"learning_rate": 3.649497276916632e-05,
"loss": 6.2438,
"step": 64500
},
{
"epoch": 13.615416841223293,
"grad_norm": 3.591214179992676,
"learning_rate": 3.6390238793464605e-05,
"loss": 6.2266,
"step": 65000
},
{
"epoch": 13.72015081692501,
"grad_norm": 4.973635196685791,
"learning_rate": 3.6285504817762885e-05,
"loss": 6.2501,
"step": 65500
},
{
"epoch": 13.824884792626728,
"grad_norm": 4.189575672149658,
"learning_rate": 3.618098031001257e-05,
"loss": 6.2341,
"step": 66000
},
{
"epoch": 13.929618768328446,
"grad_norm": 4.408186912536621,
"learning_rate": 3.607624633431085e-05,
"loss": 6.227,
"step": 66500
},
{
"epoch": 14.034352744030164,
"grad_norm": 4.066199779510498,
"learning_rate": 3.597151235860913e-05,
"loss": 6.2316,
"step": 67000
},
{
"epoch": 14.139086719731882,
"grad_norm": 3.8263683319091797,
"learning_rate": 3.586677838290742e-05,
"loss": 6.2322,
"step": 67500
},
{
"epoch": 14.243820695433598,
"grad_norm": 4.787020206451416,
"learning_rate": 3.57620444072057e-05,
"loss": 6.2099,
"step": 68000
},
{
"epoch": 14.348554671135316,
"grad_norm": 3.5196545124053955,
"learning_rate": 3.565751989945538e-05,
"loss": 6.2425,
"step": 68500
},
{
"epoch": 14.453288646837034,
"grad_norm": 4.1746439933776855,
"learning_rate": 3.555278592375367e-05,
"loss": 6.2373,
"step": 69000
},
{
"epoch": 14.558022622538752,
"grad_norm": 4.07820463180542,
"learning_rate": 3.544805194805195e-05,
"loss": 6.2099,
"step": 69500
},
{
"epoch": 14.662756598240469,
"grad_norm": 3.400038242340088,
"learning_rate": 3.534331797235023e-05,
"loss": 6.216,
"step": 70000
},
{
"epoch": 14.767490573942187,
"grad_norm": 4.578042030334473,
"learning_rate": 3.5238793464599916e-05,
"loss": 6.2091,
"step": 70500
},
{
"epoch": 14.872224549643905,
"grad_norm": 3.6254208087921143,
"learning_rate": 3.51340594888982e-05,
"loss": 6.2258,
"step": 71000
},
{
"epoch": 14.976958525345623,
"grad_norm": 3.496166467666626,
"learning_rate": 3.502932551319648e-05,
"loss": 6.2138,
"step": 71500
},
{
"epoch": 15.081692501047339,
"grad_norm": 3.5367865562438965,
"learning_rate": 3.492459153749477e-05,
"loss": 6.213,
"step": 72000
},
{
"epoch": 15.186426476749057,
"grad_norm": 3.4754440784454346,
"learning_rate": 3.481985756179305e-05,
"loss": 6.2153,
"step": 72500
},
{
"epoch": 15.291160452450775,
"grad_norm": 4.432271957397461,
"learning_rate": 3.4715333054042735e-05,
"loss": 6.2007,
"step": 73000
},
{
"epoch": 15.395894428152493,
"grad_norm": 3.8427770137786865,
"learning_rate": 3.4610599078341014e-05,
"loss": 6.2071,
"step": 73500
},
{
"epoch": 15.50062840385421,
"grad_norm": 3.9617857933044434,
"learning_rate": 3.4505865102639294e-05,
"loss": 6.2142,
"step": 74000
},
{
"epoch": 15.605362379555928,
"grad_norm": 3.769693613052368,
"learning_rate": 3.440113112693758e-05,
"loss": 6.2065,
"step": 74500
},
{
"epoch": 15.710096355257646,
"grad_norm": 3.825507402420044,
"learning_rate": 3.429639715123587e-05,
"loss": 6.2072,
"step": 75000
},
{
"epoch": 15.814830330959364,
"grad_norm": 3.982872724533081,
"learning_rate": 3.4191872643485554e-05,
"loss": 6.2003,
"step": 75500
},
{
"epoch": 15.91956430666108,
"grad_norm": 3.9958648681640625,
"learning_rate": 3.408713866778383e-05,
"loss": 6.1913,
"step": 76000
},
{
"epoch": 16.0242982823628,
"grad_norm": 3.947957754135132,
"learning_rate": 3.398240469208211e-05,
"loss": 6.2019,
"step": 76500
},
{
"epoch": 16.129032258064516,
"grad_norm": 3.8135411739349365,
"learning_rate": 3.387767071638039e-05,
"loss": 6.1944,
"step": 77000
},
{
"epoch": 16.233766233766232,
"grad_norm": 3.940861701965332,
"learning_rate": 3.377293674067868e-05,
"loss": 6.1893,
"step": 77500
},
{
"epoch": 16.338500209467952,
"grad_norm": 5.24894905090332,
"learning_rate": 3.3668412232928366e-05,
"loss": 6.1984,
"step": 78000
},
{
"epoch": 16.44323418516967,
"grad_norm": 4.470870494842529,
"learning_rate": 3.3563678257226645e-05,
"loss": 6.1958,
"step": 78500
},
{
"epoch": 16.547968160871388,
"grad_norm": 3.699892282485962,
"learning_rate": 3.345894428152493e-05,
"loss": 6.1952,
"step": 79000
},
{
"epoch": 16.652702136573104,
"grad_norm": 4.136711120605469,
"learning_rate": 3.335421030582321e-05,
"loss": 6.1896,
"step": 79500
},
{
"epoch": 16.75743611227482,
"grad_norm": 4.904257297515869,
"learning_rate": 3.324968579807289e-05,
"loss": 6.1715,
"step": 80000
},
{
"epoch": 16.86217008797654,
"grad_norm": 4.219280242919922,
"learning_rate": 3.314495182237118e-05,
"loss": 6.1829,
"step": 80500
},
{
"epoch": 16.966904063678257,
"grad_norm": 4.426414489746094,
"learning_rate": 3.3040217846669464e-05,
"loss": 6.1782,
"step": 81000
},
{
"epoch": 17.071638039379973,
"grad_norm": 4.792020797729492,
"learning_rate": 3.2935483870967744e-05,
"loss": 6.1675,
"step": 81500
},
{
"epoch": 17.176372015081693,
"grad_norm": 3.9796903133392334,
"learning_rate": 3.283095936321743e-05,
"loss": 6.179,
"step": 82000
},
{
"epoch": 17.28110599078341,
"grad_norm": 4.554388046264648,
"learning_rate": 3.272622538751571e-05,
"loss": 6.1756,
"step": 82500
},
{
"epoch": 17.38583996648513,
"grad_norm": 4.024316787719727,
"learning_rate": 3.262149141181399e-05,
"loss": 6.177,
"step": 83000
},
{
"epoch": 17.490573942186845,
"grad_norm": 4.059772968292236,
"learning_rate": 3.2516757436112276e-05,
"loss": 6.1818,
"step": 83500
},
{
"epoch": 17.59530791788856,
"grad_norm": 4.296391487121582,
"learning_rate": 3.241223292836196e-05,
"loss": 6.1866,
"step": 84000
},
{
"epoch": 17.70004189359028,
"grad_norm": 4.008220672607422,
"learning_rate": 3.230749895266024e-05,
"loss": 6.172,
"step": 84500
},
{
"epoch": 17.804775869291998,
"grad_norm": 4.639082908630371,
"learning_rate": 3.220276497695853e-05,
"loss": 6.169,
"step": 85000
},
{
"epoch": 17.909509844993718,
"grad_norm": 4.635848522186279,
"learning_rate": 3.209803100125681e-05,
"loss": 6.1721,
"step": 85500
},
{
"epoch": 18.014243820695434,
"grad_norm": 4.662270545959473,
"learning_rate": 3.199329702555509e-05,
"loss": 6.1575,
"step": 86000
},
{
"epoch": 18.11897779639715,
"grad_norm": 4.280701637268066,
"learning_rate": 3.1888772517804775e-05,
"loss": 6.1482,
"step": 86500
},
{
"epoch": 18.22371177209887,
"grad_norm": 3.8602380752563477,
"learning_rate": 3.178403854210306e-05,
"loss": 6.1565,
"step": 87000
},
{
"epoch": 18.328445747800586,
"grad_norm": 4.634263515472412,
"learning_rate": 3.167930456640134e-05,
"loss": 6.1587,
"step": 87500
},
{
"epoch": 18.433179723502302,
"grad_norm": 4.115392208099365,
"learning_rate": 3.157457059069963e-05,
"loss": 6.1436,
"step": 88000
},
{
"epoch": 18.537913699204022,
"grad_norm": 3.6665916442871094,
"learning_rate": 3.146983661499791e-05,
"loss": 6.1442,
"step": 88500
},
{
"epoch": 18.64264767490574,
"grad_norm": 4.444345951080322,
"learning_rate": 3.1365312107247594e-05,
"loss": 6.1571,
"step": 89000
},
{
"epoch": 18.74738165060746,
"grad_norm": 3.792792558670044,
"learning_rate": 3.1260578131545873e-05,
"loss": 6.1535,
"step": 89500
},
{
"epoch": 18.852115626309175,
"grad_norm": 3.904019832611084,
"learning_rate": 3.115584415584415e-05,
"loss": 6.1501,
"step": 90000
},
{
"epoch": 18.95684960201089,
"grad_norm": 4.531284332275391,
"learning_rate": 3.105111018014244e-05,
"loss": 6.1592,
"step": 90500
},
{
"epoch": 19.06158357771261,
"grad_norm": 3.7976317405700684,
"learning_rate": 3.0946376204440726e-05,
"loss": 6.1474,
"step": 91000
},
{
"epoch": 19.166317553414327,
"grad_norm": 3.8021469116210938,
"learning_rate": 3.084185169669041e-05,
"loss": 6.1408,
"step": 91500
},
{
"epoch": 19.271051529116047,
"grad_norm": 4.194758892059326,
"learning_rate": 3.073711772098869e-05,
"loss": 6.1476,
"step": 92000
},
{
"epoch": 19.375785504817763,
"grad_norm": 4.084668159484863,
"learning_rate": 3.063238374528697e-05,
"loss": 6.1443,
"step": 92500
},
{
"epoch": 19.48051948051948,
"grad_norm": 4.383222579956055,
"learning_rate": 3.052764976958525e-05,
"loss": 6.1422,
"step": 93000
},
{
"epoch": 19.5852534562212,
"grad_norm": 4.250995635986328,
"learning_rate": 3.042312526183494e-05,
"loss": 6.1375,
"step": 93500
},
{
"epoch": 19.689987431922916,
"grad_norm": 4.78529691696167,
"learning_rate": 3.0318391286133225e-05,
"loss": 6.1368,
"step": 94000
},
{
"epoch": 19.794721407624632,
"grad_norm": 3.4997754096984863,
"learning_rate": 3.0213657310431504e-05,
"loss": 6.1432,
"step": 94500
},
{
"epoch": 19.89945538332635,
"grad_norm": 4.723648548126221,
"learning_rate": 3.0108923334729787e-05,
"loss": 6.1263,
"step": 95000
},
{
"epoch": 20.004189359028068,
"grad_norm": 3.930859088897705,
"learning_rate": 3.0004398826979474e-05,
"loss": 6.1387,
"step": 95500
},
{
"epoch": 20.108923334729788,
"grad_norm": 4.286599159240723,
"learning_rate": 2.9899664851277754e-05,
"loss": 6.1187,
"step": 96000
},
{
"epoch": 20.213657310431504,
"grad_norm": 3.8475680351257324,
"learning_rate": 2.9794930875576037e-05,
"loss": 6.1312,
"step": 96500
},
{
"epoch": 20.31839128613322,
"grad_norm": 4.844906806945801,
"learning_rate": 2.9690196899874323e-05,
"loss": 6.1457,
"step": 97000
},
{
"epoch": 20.42312526183494,
"grad_norm": 4.691315174102783,
"learning_rate": 2.958567239212401e-05,
"loss": 6.1351,
"step": 97500
},
{
"epoch": 20.527859237536656,
"grad_norm": 6.15250825881958,
"learning_rate": 2.9480938416422286e-05,
"loss": 6.1275,
"step": 98000
},
{
"epoch": 20.632593213238373,
"grad_norm": 3.8872599601745605,
"learning_rate": 2.9376204440720573e-05,
"loss": 6.1275,
"step": 98500
},
{
"epoch": 20.737327188940093,
"grad_norm": 4.541051864624023,
"learning_rate": 2.9271470465018852e-05,
"loss": 6.1472,
"step": 99000
},
{
"epoch": 20.84206116464181,
"grad_norm": 4.556408405303955,
"learning_rate": 2.9166736489317135e-05,
"loss": 6.1369,
"step": 99500
},
{
"epoch": 20.94679514034353,
"grad_norm": 4.5567498207092285,
"learning_rate": 2.9062211981566822e-05,
"loss": 6.1148,
"step": 100000
},
{
"epoch": 21.051529116045245,
"grad_norm": 4.647518634796143,
"learning_rate": 2.8957478005865102e-05,
"loss": 6.1281,
"step": 100500
},
{
"epoch": 21.15626309174696,
"grad_norm": 4.372421741485596,
"learning_rate": 2.8852744030163388e-05,
"loss": 6.1226,
"step": 101000
},
{
"epoch": 21.26099706744868,
"grad_norm": 4.270533084869385,
"learning_rate": 2.874801005446167e-05,
"loss": 6.123,
"step": 101500
},
{
"epoch": 21.365731043150397,
"grad_norm": 3.5596370697021484,
"learning_rate": 2.8643485546711358e-05,
"loss": 6.1402,
"step": 102000
},
{
"epoch": 21.470465018852117,
"grad_norm": 5.230384826660156,
"learning_rate": 2.8538751571009638e-05,
"loss": 6.1199,
"step": 102500
},
{
"epoch": 21.575198994553833,
"grad_norm": 3.9881417751312256,
"learning_rate": 2.843401759530792e-05,
"loss": 6.1244,
"step": 103000
},
{
"epoch": 21.67993297025555,
"grad_norm": 4.617568016052246,
"learning_rate": 2.83292836196062e-05,
"loss": 6.1154,
"step": 103500
},
{
"epoch": 21.78466694595727,
"grad_norm": 4.641009330749512,
"learning_rate": 2.8224759111855887e-05,
"loss": 6.113,
"step": 104000
},
{
"epoch": 21.889400921658986,
"grad_norm": 4.005772113800049,
"learning_rate": 2.812002513615417e-05,
"loss": 6.1163,
"step": 104500
},
{
"epoch": 21.994134897360702,
"grad_norm": 4.2611799240112305,
"learning_rate": 2.801529116045245e-05,
"loss": 6.1023,
"step": 105000
},
{
"epoch": 22.098868873062422,
"grad_norm": 4.568357467651367,
"learning_rate": 2.7910557184750736e-05,
"loss": 6.1169,
"step": 105500
},
{
"epoch": 22.203602848764138,
"grad_norm": 4.323103427886963,
"learning_rate": 2.780603267700042e-05,
"loss": 6.1226,
"step": 106000
},
{
"epoch": 22.308336824465858,
"grad_norm": 4.507444381713867,
"learning_rate": 2.77012987012987e-05,
"loss": 6.1052,
"step": 106500
},
{
"epoch": 22.413070800167574,
"grad_norm": 4.301244735717773,
"learning_rate": 2.7596564725596985e-05,
"loss": 6.0944,
"step": 107000
},
{
"epoch": 22.51780477586929,
"grad_norm": 4.984853267669678,
"learning_rate": 2.749183074989527e-05,
"loss": 6.1125,
"step": 107500
},
{
"epoch": 22.62253875157101,
"grad_norm": 4.682931423187256,
"learning_rate": 2.7387096774193548e-05,
"loss": 6.1158,
"step": 108000
},
{
"epoch": 22.727272727272727,
"grad_norm": 4.494015693664551,
"learning_rate": 2.7282572266443235e-05,
"loss": 6.1035,
"step": 108500
},
{
"epoch": 22.832006702974446,
"grad_norm": 3.880779981613159,
"learning_rate": 2.717783829074152e-05,
"loss": 6.1084,
"step": 109000
},
{
"epoch": 22.936740678676163,
"grad_norm": 4.154653072357178,
"learning_rate": 2.7073104315039798e-05,
"loss": 6.0945,
"step": 109500
},
{
"epoch": 23.04147465437788,
"grad_norm": 5.443271160125732,
"learning_rate": 2.6968370339338084e-05,
"loss": 6.1016,
"step": 110000
},
{
"epoch": 23.1462086300796,
"grad_norm": 4.298133373260498,
"learning_rate": 2.686384583158777e-05,
"loss": 6.103,
"step": 110500
},
{
"epoch": 23.250942605781315,
"grad_norm": 4.379884243011475,
"learning_rate": 2.675911185588605e-05,
"loss": 6.0814,
"step": 111000
},
{
"epoch": 23.35567658148303,
"grad_norm": 6.175398349761963,
"learning_rate": 2.6654377880184333e-05,
"loss": 6.1088,
"step": 111500
},
{
"epoch": 23.46041055718475,
"grad_norm": 4.121715068817139,
"learning_rate": 2.6549643904482613e-05,
"loss": 6.0966,
"step": 112000
},
{
"epoch": 23.565144532886467,
"grad_norm": 5.040287494659424,
"learning_rate": 2.6444909928780896e-05,
"loss": 6.098,
"step": 112500
},
{
"epoch": 23.669878508588187,
"grad_norm": 4.766879081726074,
"learning_rate": 2.6340175953079182e-05,
"loss": 6.0957,
"step": 113000
},
{
"epoch": 23.774612484289904,
"grad_norm": 5.87930965423584,
"learning_rate": 2.623565144532887e-05,
"loss": 6.1089,
"step": 113500
},
{
"epoch": 23.87934645999162,
"grad_norm": 5.318653583526611,
"learning_rate": 2.613091746962715e-05,
"loss": 6.0761,
"step": 114000
},
{
"epoch": 23.98408043569334,
"grad_norm": 4.465319633483887,
"learning_rate": 2.6026183493925432e-05,
"loss": 6.0826,
"step": 114500
},
{
"epoch": 24.088814411395056,
"grad_norm": 4.640571594238281,
"learning_rate": 2.592144951822371e-05,
"loss": 6.0805,
"step": 115000
},
{
"epoch": 24.193548387096776,
"grad_norm": 4.252554416656494,
"learning_rate": 2.5816925010473398e-05,
"loss": 6.0701,
"step": 115500
},
{
"epoch": 24.298282362798492,
"grad_norm": 4.704644203186035,
"learning_rate": 2.571219103477168e-05,
"loss": 6.0936,
"step": 116000
},
{
"epoch": 24.40301633850021,
"grad_norm": 4.601324558258057,
"learning_rate": 2.560745705906996e-05,
"loss": 6.0758,
"step": 116500
},
{
"epoch": 24.507750314201928,
"grad_norm": 4.380444526672363,
"learning_rate": 2.5502723083368247e-05,
"loss": 6.0871,
"step": 117000
},
{
"epoch": 24.612484289903644,
"grad_norm": 4.119806289672852,
"learning_rate": 2.5397989107666527e-05,
"loss": 6.102,
"step": 117500
},
{
"epoch": 24.71721826560536,
"grad_norm": 3.9712698459625244,
"learning_rate": 2.5293464599916217e-05,
"loss": 6.0999,
"step": 118000
},
{
"epoch": 24.82195224130708,
"grad_norm": 5.146612167358398,
"learning_rate": 2.5188730624214497e-05,
"loss": 6.0947,
"step": 118500
},
{
"epoch": 24.926686217008797,
"grad_norm": 4.406741142272949,
"learning_rate": 2.508399664851278e-05,
"loss": 6.0829,
"step": 119000
},
{
"epoch": 25.031420192710517,
"grad_norm": 5.4739766120910645,
"learning_rate": 2.497926267281106e-05,
"loss": 6.0598,
"step": 119500
},
{
"epoch": 25.136154168412233,
"grad_norm": 4.6231794357299805,
"learning_rate": 2.4874738165060746e-05,
"loss": 6.072,
"step": 120000
},
{
"epoch": 25.24088814411395,
"grad_norm": 4.47750186920166,
"learning_rate": 2.477000418935903e-05,
"loss": 6.0664,
"step": 120500
},
{
"epoch": 25.34562211981567,
"grad_norm": 5.023014068603516,
"learning_rate": 2.4665270213657312e-05,
"loss": 6.0894,
"step": 121000
},
{
"epoch": 25.450356095517385,
"grad_norm": 5.687644004821777,
"learning_rate": 2.4560536237955595e-05,
"loss": 6.0936,
"step": 121500
},
{
"epoch": 25.555090071219105,
"grad_norm": 4.534958362579346,
"learning_rate": 2.4455802262253878e-05,
"loss": 6.0794,
"step": 122000
},
{
"epoch": 25.65982404692082,
"grad_norm": 5.563751697540283,
"learning_rate": 2.435127775450356e-05,
"loss": 6.081,
"step": 122500
},
{
"epoch": 25.764558022622538,
"grad_norm": 4.613626956939697,
"learning_rate": 2.4246543778801845e-05,
"loss": 6.0982,
"step": 123000
},
{
"epoch": 25.869291998324258,
"grad_norm": 4.645818710327148,
"learning_rate": 2.4141809803100128e-05,
"loss": 6.0665,
"step": 123500
},
{
"epoch": 25.974025974025974,
"grad_norm": 4.9156928062438965,
"learning_rate": 2.4037075827398407e-05,
"loss": 6.0674,
"step": 124000
},
{
"epoch": 26.07875994972769,
"grad_norm": 4.7342305183410645,
"learning_rate": 2.3932551319648094e-05,
"loss": 6.0741,
"step": 124500
},
{
"epoch": 26.18349392542941,
"grad_norm": 4.607081413269043,
"learning_rate": 2.3827817343946377e-05,
"loss": 6.0626,
"step": 125000
},
{
"epoch": 26.288227901131126,
"grad_norm": 4.820442199707031,
"learning_rate": 2.372308336824466e-05,
"loss": 6.0936,
"step": 125500
},
{
"epoch": 26.392961876832846,
"grad_norm": 4.549975395202637,
"learning_rate": 2.3618349392542943e-05,
"loss": 6.0804,
"step": 126000
},
{
"epoch": 26.497695852534562,
"grad_norm": 4.722150802612305,
"learning_rate": 2.351382488479263e-05,
"loss": 6.0555,
"step": 126500
},
{
"epoch": 26.60242982823628,
"grad_norm": 4.2948408126831055,
"learning_rate": 2.340909090909091e-05,
"loss": 6.0626,
"step": 127000
},
{
"epoch": 26.707163803938,
"grad_norm": 4.246878623962402,
"learning_rate": 2.3304356933389193e-05,
"loss": 6.0577,
"step": 127500
},
{
"epoch": 26.811897779639715,
"grad_norm": 4.165809154510498,
"learning_rate": 2.3199622957687476e-05,
"loss": 6.0608,
"step": 128000
},
{
"epoch": 26.916631755341434,
"grad_norm": 4.3159894943237305,
"learning_rate": 2.309488898198576e-05,
"loss": 6.0806,
"step": 128500
},
{
"epoch": 27.02136573104315,
"grad_norm": 4.15300989151001,
"learning_rate": 2.2990364474235442e-05,
"loss": 6.0654,
"step": 129000
},
{
"epoch": 27.126099706744867,
"grad_norm": 4.730154991149902,
"learning_rate": 2.2885630498533725e-05,
"loss": 6.0567,
"step": 129500
},
{
"epoch": 27.230833682446587,
"grad_norm": 4.300974369049072,
"learning_rate": 2.2780896522832008e-05,
"loss": 6.0449,
"step": 130000
},
{
"epoch": 27.335567658148303,
"grad_norm": 4.148283958435059,
"learning_rate": 2.2676162547130288e-05,
"loss": 6.0491,
"step": 130500
},
{
"epoch": 27.44030163385002,
"grad_norm": 4.9924421310424805,
"learning_rate": 2.2571638039379974e-05,
"loss": 6.0491,
"step": 131000
},
{
"epoch": 27.54503560955174,
"grad_norm": 5.713706016540527,
"learning_rate": 2.246690406367826e-05,
"loss": 6.0396,
"step": 131500
},
{
"epoch": 27.649769585253456,
"grad_norm": 5.007369518280029,
"learning_rate": 2.236217008797654e-05,
"loss": 6.0378,
"step": 132000
},
{
"epoch": 27.754503560955175,
"grad_norm": 4.500640392303467,
"learning_rate": 2.2257436112274823e-05,
"loss": 6.0313,
"step": 132500
},
{
"epoch": 27.85923753665689,
"grad_norm": 4.709275722503662,
"learning_rate": 2.2152702136573107e-05,
"loss": 6.0278,
"step": 133000
},
{
"epoch": 27.963971512358608,
"grad_norm": 4.891386032104492,
"learning_rate": 2.204817762882279e-05,
"loss": 6.0267,
"step": 133500
},
{
"epoch": 28.068705488060328,
"grad_norm": 4.82666540145874,
"learning_rate": 2.1943443653121073e-05,
"loss": 5.986,
"step": 134000
},
{
"epoch": 28.173439463762044,
"grad_norm": 4.489607810974121,
"learning_rate": 2.1838709677419356e-05,
"loss": 6.0209,
"step": 134500
},
{
"epoch": 28.278173439463764,
"grad_norm": 4.719301700592041,
"learning_rate": 2.173397570171764e-05,
"loss": 5.9998,
"step": 135000
},
{
"epoch": 28.38290741516548,
"grad_norm": 5.639565467834473,
"learning_rate": 2.1629451193967322e-05,
"loss": 5.9881,
"step": 135500
},
{
"epoch": 28.487641390867196,
"grad_norm": 4.745512008666992,
"learning_rate": 2.1524717218265605e-05,
"loss": 6.0002,
"step": 136000
},
{
"epoch": 28.592375366568916,
"grad_norm": 5.661725997924805,
"learning_rate": 2.141998324256389e-05,
"loss": 5.9967,
"step": 136500
},
{
"epoch": 28.697109342270632,
"grad_norm": 5.3391194343566895,
"learning_rate": 2.131524926686217e-05,
"loss": 6.0024,
"step": 137000
},
{
"epoch": 28.80184331797235,
"grad_norm": 5.1614089012146,
"learning_rate": 2.1210724759111858e-05,
"loss": 5.9992,
"step": 137500
},
{
"epoch": 28.90657729367407,
"grad_norm": 5.429248332977295,
"learning_rate": 2.110599078341014e-05,
"loss": 5.9929,
"step": 138000
},
{
"epoch": 29.011311269375785,
"grad_norm": 5.1270012855529785,
"learning_rate": 2.100125680770842e-05,
"loss": 5.9883,
"step": 138500
},
{
"epoch": 29.116045245077505,
"grad_norm": 5.027891159057617,
"learning_rate": 2.0896522832006704e-05,
"loss": 5.963,
"step": 139000
},
{
"epoch": 29.22077922077922,
"grad_norm": 5.712099552154541,
"learning_rate": 2.079199832425639e-05,
"loss": 5.9811,
"step": 139500
},
{
"epoch": 29.325513196480937,
"grad_norm": 4.954220294952393,
"learning_rate": 2.0687264348554674e-05,
"loss": 5.9808,
"step": 140000
},
{
"epoch": 29.430247172182657,
"grad_norm": 5.713419437408447,
"learning_rate": 2.0582530372852953e-05,
"loss": 5.9887,
"step": 140500
},
{
"epoch": 29.534981147884373,
"grad_norm": 4.683711528778076,
"learning_rate": 2.0477796397151236e-05,
"loss": 5.9612,
"step": 141000
},
{
"epoch": 29.63971512358609,
"grad_norm": 5.164538383483887,
"learning_rate": 2.0373271889400923e-05,
"loss": 5.993,
"step": 141500
},
{
"epoch": 29.74444909928781,
"grad_norm": 5.386078357696533,
"learning_rate": 2.0268537913699203e-05,
"loss": 5.9735,
"step": 142000
},
{
"epoch": 29.849183074989526,
"grad_norm": 4.4406418800354,
"learning_rate": 2.016380393799749e-05,
"loss": 5.9672,
"step": 142500
},
{
"epoch": 29.953917050691246,
"grad_norm": 5.029815673828125,
"learning_rate": 2.0059069962295772e-05,
"loss": 5.961,
"step": 143000
},
{
"epoch": 30.058651026392962,
"grad_norm": 4.666591167449951,
"learning_rate": 1.9954335986594052e-05,
"loss": 5.9505,
"step": 143500
},
{
"epoch": 30.163385002094678,
"grad_norm": 6.975547790527344,
"learning_rate": 1.9849602010892335e-05,
"loss": 5.956,
"step": 144000
},
{
"epoch": 30.268118977796398,
"grad_norm": 4.687684535980225,
"learning_rate": 1.974507750314202e-05,
"loss": 5.9475,
"step": 144500
},
{
"epoch": 30.372852953498114,
"grad_norm": 5.594231605529785,
"learning_rate": 1.96403435274403e-05,
"loss": 5.9496,
"step": 145000
},
{
"epoch": 30.477586929199834,
"grad_norm": 4.879722595214844,
"learning_rate": 1.9535609551738584e-05,
"loss": 5.9577,
"step": 145500
},
{
"epoch": 30.58232090490155,
"grad_norm": 5.470447540283203,
"learning_rate": 1.9430875576036867e-05,
"loss": 5.9672,
"step": 146000
},
{
"epoch": 30.687054880603267,
"grad_norm": 5.818385124206543,
"learning_rate": 1.932614160033515e-05,
"loss": 5.9501,
"step": 146500
},
{
"epoch": 30.791788856304986,
"grad_norm": 5.907487392425537,
"learning_rate": 1.9221617092584834e-05,
"loss": 5.9458,
"step": 147000
},
{
"epoch": 30.896522832006703,
"grad_norm": 4.739224433898926,
"learning_rate": 1.911688311688312e-05,
"loss": 5.935,
"step": 147500
},
{
"epoch": 31.00125680770842,
"grad_norm": 4.57131814956665,
"learning_rate": 1.90121491411814e-05,
"loss": 5.945,
"step": 148000
},
{
"epoch": 31.10599078341014,
"grad_norm": 5.128586769104004,
"learning_rate": 1.8907415165479683e-05,
"loss": 5.9494,
"step": 148500
},
{
"epoch": 31.210724759111855,
"grad_norm": 4.871676921844482,
"learning_rate": 1.880289065772937e-05,
"loss": 5.9415,
"step": 149000
},
{
"epoch": 31.315458734813575,
"grad_norm": 5.380068778991699,
"learning_rate": 1.8698156682027652e-05,
"loss": 5.939,
"step": 149500
},
{
"epoch": 31.42019271051529,
"grad_norm": 5.430812835693359,
"learning_rate": 1.8593422706325932e-05,
"loss": 5.9276,
"step": 150000
},
{
"epoch": 31.524926686217007,
"grad_norm": 4.7710442543029785,
"learning_rate": 1.8488688730624215e-05,
"loss": 5.9413,
"step": 150500
},
{
"epoch": 31.629660661918727,
"grad_norm": 5.183919906616211,
"learning_rate": 1.8383954754922498e-05,
"loss": 5.9257,
"step": 151000
},
{
"epoch": 31.734394637620444,
"grad_norm": 4.851598739624023,
"learning_rate": 1.827943024717218e-05,
"loss": 5.9251,
"step": 151500
},
{
"epoch": 31.839128613322163,
"grad_norm": 4.835882663726807,
"learning_rate": 1.8174696271470464e-05,
"loss": 5.92,
"step": 152000
},
{
"epoch": 31.94386258902388,
"grad_norm": 5.428823947906494,
"learning_rate": 1.8069962295768748e-05,
"loss": 5.9146,
"step": 152500
},
{
"epoch": 32.0485965647256,
"grad_norm": 6.11329984664917,
"learning_rate": 1.796522832006703e-05,
"loss": 5.9179,
"step": 153000
},
{
"epoch": 32.15333054042731,
"grad_norm": 4.836859226226807,
"learning_rate": 1.7860703812316717e-05,
"loss": 5.9189,
"step": 153500
},
{
"epoch": 32.25806451612903,
"grad_norm": 4.598475456237793,
"learning_rate": 1.7755969836615e-05,
"loss": 5.9207,
"step": 154000
},
{
"epoch": 32.36279849183075,
"grad_norm": 4.638394832611084,
"learning_rate": 1.765123586091328e-05,
"loss": 5.915,
"step": 154500
},
{
"epoch": 32.467532467532465,
"grad_norm": 5.637279987335205,
"learning_rate": 1.7546501885211563e-05,
"loss": 5.9228,
"step": 155000
},
{
"epoch": 32.572266443234184,
"grad_norm": 4.516068458557129,
"learning_rate": 1.7441767909509846e-05,
"loss": 5.9322,
"step": 155500
},
{
"epoch": 32.677000418935904,
"grad_norm": 4.652084827423096,
"learning_rate": 1.7337243401759533e-05,
"loss": 5.9395,
"step": 156000
},
{
"epoch": 32.78173439463762,
"grad_norm": 5.667607307434082,
"learning_rate": 1.7232509426057812e-05,
"loss": 5.9172,
"step": 156500
},
{
"epoch": 32.88646837033934,
"grad_norm": 4.980391025543213,
"learning_rate": 1.7127775450356095e-05,
"loss": 5.934,
"step": 157000
},
{
"epoch": 32.99120234604106,
"grad_norm": 4.39646053314209,
"learning_rate": 1.702304147465438e-05,
"loss": 5.914,
"step": 157500
},
{
"epoch": 33.095936321742776,
"grad_norm": 5.263533115386963,
"learning_rate": 1.6918516966904062e-05,
"loss": 5.9094,
"step": 158000
},
{
"epoch": 33.20067029744449,
"grad_norm": 4.661126136779785,
"learning_rate": 1.6813782991202348e-05,
"loss": 5.9189,
"step": 158500
},
{
"epoch": 33.30540427314621,
"grad_norm": 4.935306549072266,
"learning_rate": 1.670904901550063e-05,
"loss": 5.8944,
"step": 159000
},
{
"epoch": 33.41013824884793,
"grad_norm": 5.82065486907959,
"learning_rate": 1.660431503979891e-05,
"loss": 5.892,
"step": 159500
},
{
"epoch": 33.51487222454964,
"grad_norm": 4.6220927238464355,
"learning_rate": 1.6499581064097194e-05,
"loss": 5.9022,
"step": 160000
},
{
"epoch": 33.61960620025136,
"grad_norm": 5.109046936035156,
"learning_rate": 1.639505655634688e-05,
"loss": 5.8933,
"step": 160500
},
{
"epoch": 33.72434017595308,
"grad_norm": 5.230437278747559,
"learning_rate": 1.6290322580645164e-05,
"loss": 5.9107,
"step": 161000
},
{
"epoch": 33.829074151654794,
"grad_norm": 6.466080188751221,
"learning_rate": 1.6185588604943443e-05,
"loss": 5.9077,
"step": 161500
},
{
"epoch": 33.933808127356514,
"grad_norm": 4.655428409576416,
"learning_rate": 1.6080854629241726e-05,
"loss": 5.9186,
"step": 162000
},
{
"epoch": 34.038542103058234,
"grad_norm": 6.136354923248291,
"learning_rate": 1.597612065354001e-05,
"loss": 5.8815,
"step": 162500
},
{
"epoch": 34.143276078759946,
"grad_norm": 5.668376445770264,
"learning_rate": 1.587138667783829e-05,
"loss": 5.8872,
"step": 163000
},
{
"epoch": 34.248010054461666,
"grad_norm": 5.579314708709717,
"learning_rate": 1.5766862170087976e-05,
"loss": 5.8966,
"step": 163500
},
{
"epoch": 34.352744030163386,
"grad_norm": 5.474893569946289,
"learning_rate": 1.5662128194386262e-05,
"loss": 5.8865,
"step": 164000
},
{
"epoch": 34.457478005865106,
"grad_norm": 4.853377342224121,
"learning_rate": 1.5557394218684542e-05,
"loss": 5.8718,
"step": 164500
},
{
"epoch": 34.56221198156682,
"grad_norm": 4.8684563636779785,
"learning_rate": 1.5452660242982825e-05,
"loss": 5.8896,
"step": 165000
},
{
"epoch": 34.66694595726854,
"grad_norm": 4.851233959197998,
"learning_rate": 1.5347926267281108e-05,
"loss": 5.8908,
"step": 165500
},
{
"epoch": 34.77167993297026,
"grad_norm": 6.647628307342529,
"learning_rate": 1.524319229157939e-05,
"loss": 5.8972,
"step": 166000
},
{
"epoch": 34.87641390867197,
"grad_norm": 5.826745986938477,
"learning_rate": 1.5138667783829074e-05,
"loss": 5.8852,
"step": 166500
},
{
"epoch": 34.98114788437369,
"grad_norm": 5.109675407409668,
"learning_rate": 1.5033933808127357e-05,
"loss": 5.8971,
"step": 167000
},
{
"epoch": 35.08588186007541,
"grad_norm": 5.743017196655273,
"learning_rate": 1.4929199832425639e-05,
"loss": 5.8892,
"step": 167500
},
{
"epoch": 35.19061583577712,
"grad_norm": 4.862270355224609,
"learning_rate": 1.482446585672392e-05,
"loss": 5.8976,
"step": 168000
},
{
"epoch": 35.29534981147884,
"grad_norm": 5.532686233520508,
"learning_rate": 1.4719941348973607e-05,
"loss": 5.8716,
"step": 168500
},
{
"epoch": 35.40008378718056,
"grad_norm": 5.498019695281982,
"learning_rate": 1.4615207373271891e-05,
"loss": 5.8768,
"step": 169000
},
{
"epoch": 35.504817762882276,
"grad_norm": 4.7324042320251465,
"learning_rate": 1.4510473397570173e-05,
"loss": 5.8739,
"step": 169500
},
{
"epoch": 35.609551738583995,
"grad_norm": 4.973413944244385,
"learning_rate": 1.4405739421868456e-05,
"loss": 5.8801,
"step": 170000
},
{
"epoch": 35.714285714285715,
"grad_norm": 4.977658271789551,
"learning_rate": 1.430121491411814e-05,
"loss": 5.8644,
"step": 170500
},
{
"epoch": 35.819019689987435,
"grad_norm": 5.551715850830078,
"learning_rate": 1.4196480938416424e-05,
"loss": 5.8789,
"step": 171000
},
{
"epoch": 35.92375366568915,
"grad_norm": 5.135740756988525,
"learning_rate": 1.4091746962714705e-05,
"loss": 5.8862,
"step": 171500
},
{
"epoch": 36.02848764139087,
"grad_norm": 5.068655967712402,
"learning_rate": 1.3987012987012987e-05,
"loss": 5.879,
"step": 172000
},
{
"epoch": 36.13322161709259,
"grad_norm": 5.393857479095459,
"learning_rate": 1.388227901131127e-05,
"loss": 5.8544,
"step": 172500
},
{
"epoch": 36.2379555927943,
"grad_norm": 5.854538440704346,
"learning_rate": 1.3777545035609551e-05,
"loss": 5.8824,
"step": 173000
},
{
"epoch": 36.34268956849602,
"grad_norm": 5.566401481628418,
"learning_rate": 1.3673020527859238e-05,
"loss": 5.8587,
"step": 173500
},
{
"epoch": 36.44742354419774,
"grad_norm": 6.091250896453857,
"learning_rate": 1.3568286552157519e-05,
"loss": 5.8624,
"step": 174000
},
{
"epoch": 36.55215751989945,
"grad_norm": 4.826417922973633,
"learning_rate": 1.3463552576455804e-05,
"loss": 5.879,
"step": 174500
},
{
"epoch": 36.65689149560117,
"grad_norm": 5.28770637512207,
"learning_rate": 1.3358818600754087e-05,
"loss": 5.8632,
"step": 175000
},
{
"epoch": 36.76162547130289,
"grad_norm": 5.072086811065674,
"learning_rate": 1.3254084625052368e-05,
"loss": 5.8698,
"step": 175500
},
{
"epoch": 36.866359447004605,
"grad_norm": 6.194067001342773,
"learning_rate": 1.3149560117302053e-05,
"loss": 5.8701,
"step": 176000
},
{
"epoch": 36.971093422706325,
"grad_norm": 5.250491142272949,
"learning_rate": 1.3044826141600336e-05,
"loss": 5.855,
"step": 176500
},
{
"epoch": 37.075827398408045,
"grad_norm": 4.9726080894470215,
"learning_rate": 1.2940092165898618e-05,
"loss": 5.8613,
"step": 177000
},
{
"epoch": 37.180561374109764,
"grad_norm": 5.526548385620117,
"learning_rate": 1.28353581901969e-05,
"loss": 5.8519,
"step": 177500
},
{
"epoch": 37.28529534981148,
"grad_norm": 5.5989861488342285,
"learning_rate": 1.2730624214495182e-05,
"loss": 5.8642,
"step": 178000
},
{
"epoch": 37.3900293255132,
"grad_norm": 5.138686180114746,
"learning_rate": 1.2625890238793465e-05,
"loss": 5.852,
"step": 178500
},
{
"epoch": 37.49476330121492,
"grad_norm": 5.0514326095581055,
"learning_rate": 1.252115626309175e-05,
"loss": 5.8484,
"step": 179000
},
{
"epoch": 37.59949727691663,
"grad_norm": 4.9300360679626465,
"learning_rate": 1.241642228739003e-05,
"loss": 5.849,
"step": 179500
},
{
"epoch": 37.70423125261835,
"grad_norm": 5.487224102020264,
"learning_rate": 1.2311897779639716e-05,
"loss": 5.8562,
"step": 180000
},
{
"epoch": 37.80896522832007,
"grad_norm": 5.826539516448975,
"learning_rate": 1.2207163803937999e-05,
"loss": 5.8665,
"step": 180500
},
{
"epoch": 37.91369920402178,
"grad_norm": 5.733819961547852,
"learning_rate": 1.2102639296187684e-05,
"loss": 5.8569,
"step": 181000
},
{
"epoch": 38.0184331797235,
"grad_norm": 4.917960166931152,
"learning_rate": 1.1997905320485967e-05,
"loss": 5.8395,
"step": 181500
},
{
"epoch": 38.12316715542522,
"grad_norm": 5.337119102478027,
"learning_rate": 1.1893171344784248e-05,
"loss": 5.854,
"step": 182000
},
{
"epoch": 38.227901131126934,
"grad_norm": 5.299139022827148,
"learning_rate": 1.178843736908253e-05,
"loss": 5.8433,
"step": 182500
},
{
"epoch": 38.332635106828654,
"grad_norm": 5.900153160095215,
"learning_rate": 1.1683703393380813e-05,
"loss": 5.8535,
"step": 183000
},
{
"epoch": 38.437369082530374,
"grad_norm": 6.776584625244141,
"learning_rate": 1.15791788856305e-05,
"loss": 5.8454,
"step": 183500
},
{
"epoch": 38.542103058232094,
"grad_norm": 6.258368015289307,
"learning_rate": 1.1474444909928783e-05,
"loss": 5.8354,
"step": 184000
},
{
"epoch": 38.64683703393381,
"grad_norm": 5.288670539855957,
"learning_rate": 1.1369710934227064e-05,
"loss": 5.8458,
"step": 184500
},
{
"epoch": 38.751571009635526,
"grad_norm": 5.596650123596191,
"learning_rate": 1.1264976958525345e-05,
"loss": 5.8387,
"step": 185000
},
{
"epoch": 38.856304985337246,
"grad_norm": 5.121638774871826,
"learning_rate": 1.1160452450775032e-05,
"loss": 5.8268,
"step": 185500
},
{
"epoch": 38.96103896103896,
"grad_norm": 4.5758442878723145,
"learning_rate": 1.1055718475073313e-05,
"loss": 5.83,
"step": 186000
},
{
"epoch": 39.06577293674068,
"grad_norm": 5.161282539367676,
"learning_rate": 1.0950984499371596e-05,
"loss": 5.8544,
"step": 186500
},
{
"epoch": 39.1705069124424,
"grad_norm": 4.628884315490723,
"learning_rate": 1.084625052366988e-05,
"loss": 5.8474,
"step": 187000
},
{
"epoch": 39.27524088814411,
"grad_norm": 5.854598045349121,
"learning_rate": 1.074151654796816e-05,
"loss": 5.8501,
"step": 187500
},
{
"epoch": 39.37997486384583,
"grad_norm": 5.315525054931641,
"learning_rate": 1.0636782572266444e-05,
"loss": 5.8265,
"step": 188000
},
{
"epoch": 39.48470883954755,
"grad_norm": 6.078185081481934,
"learning_rate": 1.0532048596564727e-05,
"loss": 5.8419,
"step": 188500
},
{
"epoch": 39.589442815249264,
"grad_norm": 5.223086357116699,
"learning_rate": 1.0427524088814412e-05,
"loss": 5.8197,
"step": 189000
},
{
"epoch": 39.69417679095098,
"grad_norm": 5.235757827758789,
"learning_rate": 1.0322790113112695e-05,
"loss": 5.8245,
"step": 189500
},
{
"epoch": 39.7989107666527,
"grad_norm": 5.124643325805664,
"learning_rate": 1.0218056137410976e-05,
"loss": 5.839,
"step": 190000
},
{
"epoch": 39.90364474235442,
"grad_norm": 5.613321304321289,
"learning_rate": 1.011332216170926e-05,
"loss": 5.844,
"step": 190500
},
{
"epoch": 40.008378718056136,
"grad_norm": 5.873430252075195,
"learning_rate": 1.0008588186007542e-05,
"loss": 5.837,
"step": 191000
},
{
"epoch": 40.113112693757856,
"grad_norm": 5.089309215545654,
"learning_rate": 9.903854210305824e-06,
"loss": 5.8384,
"step": 191500
},
{
"epoch": 40.217846669459576,
"grad_norm": 7.3569817543029785,
"learning_rate": 9.79932970255551e-06,
"loss": 5.8232,
"step": 192000
},
{
"epoch": 40.32258064516129,
"grad_norm": 6.024489402770996,
"learning_rate": 9.694595726853792e-06,
"loss": 5.8292,
"step": 192500
},
{
"epoch": 40.42731462086301,
"grad_norm": 5.7150983810424805,
"learning_rate": 9.589861751152073e-06,
"loss": 5.8614,
"step": 193000
},
{
"epoch": 40.53204859656473,
"grad_norm": 4.717107772827148,
"learning_rate": 9.485127775450356e-06,
"loss": 5.8092,
"step": 193500
},
{
"epoch": 40.63678257226644,
"grad_norm": 4.9722490310668945,
"learning_rate": 9.380603267700043e-06,
"loss": 5.8231,
"step": 194000
},
{
"epoch": 40.74151654796816,
"grad_norm": 5.593094825744629,
"learning_rate": 9.275869291998326e-06,
"loss": 5.8339,
"step": 194500
},
{
"epoch": 40.84625052366988,
"grad_norm": 5.731310844421387,
"learning_rate": 9.171135316296607e-06,
"loss": 5.8381,
"step": 195000
},
{
"epoch": 40.95098449937159,
"grad_norm": 5.072065353393555,
"learning_rate": 9.066401340594889e-06,
"loss": 5.8367,
"step": 195500
},
{
"epoch": 41.05571847507331,
"grad_norm": 5.219040870666504,
"learning_rate": 8.961667364893172e-06,
"loss": 5.8234,
"step": 196000
},
{
"epoch": 41.16045245077503,
"grad_norm": 5.844238758087158,
"learning_rate": 8.856933389191455e-06,
"loss": 5.8347,
"step": 196500
},
{
"epoch": 41.26518642647675,
"grad_norm": 6.088447093963623,
"learning_rate": 8.752199413489736e-06,
"loss": 5.8178,
"step": 197000
},
{
"epoch": 41.369920402178465,
"grad_norm": 5.14108943939209,
"learning_rate": 8.647465437788019e-06,
"loss": 5.8248,
"step": 197500
},
{
"epoch": 41.474654377880185,
"grad_norm": 5.424249172210693,
"learning_rate": 8.542940930037704e-06,
"loss": 5.8113,
"step": 198000
},
{
"epoch": 41.579388353581905,
"grad_norm": 4.888121604919434,
"learning_rate": 8.43841642228739e-06,
"loss": 5.8111,
"step": 198500
},
{
"epoch": 41.68412232928362,
"grad_norm": 4.9909515380859375,
"learning_rate": 8.333682446585672e-06,
"loss": 5.8276,
"step": 199000
},
{
"epoch": 41.78885630498534,
"grad_norm": 5.032175540924072,
"learning_rate": 8.228948470883955e-06,
"loss": 5.8332,
"step": 199500
},
{
"epoch": 41.89359028068706,
"grad_norm": 5.116880416870117,
"learning_rate": 8.124214495182238e-06,
"loss": 5.8233,
"step": 200000
},
{
"epoch": 41.99832425638877,
"grad_norm": 5.235647678375244,
"learning_rate": 8.019689987431923e-06,
"loss": 5.8297,
"step": 200500
},
{
"epoch": 42.10305823209049,
"grad_norm": 5.445380210876465,
"learning_rate": 7.914956011730206e-06,
"loss": 5.8159,
"step": 201000
},
{
"epoch": 42.20779220779221,
"grad_norm": 4.979036331176758,
"learning_rate": 7.810222036028488e-06,
"loss": 5.809,
"step": 201500
},
{
"epoch": 42.31252618349392,
"grad_norm": 5.359362602233887,
"learning_rate": 7.70548806032677e-06,
"loss": 5.8346,
"step": 202000
},
{
"epoch": 42.41726015919564,
"grad_norm": 5.264519214630127,
"learning_rate": 7.600754084625053e-06,
"loss": 5.8089,
"step": 202500
},
{
"epoch": 42.52199413489736,
"grad_norm": 5.985982894897461,
"learning_rate": 7.496020108923335e-06,
"loss": 5.8192,
"step": 203000
},
{
"epoch": 42.626728110599075,
"grad_norm": 5.505626201629639,
"learning_rate": 7.391286133221617e-06,
"loss": 5.8095,
"step": 203500
},
{
"epoch": 42.731462086300795,
"grad_norm": 5.069738388061523,
"learning_rate": 7.286552157519899e-06,
"loss": 5.8186,
"step": 204000
},
{
"epoch": 42.836196062002514,
"grad_norm": 6.004745960235596,
"learning_rate": 7.182027649769586e-06,
"loss": 5.8136,
"step": 204500
},
{
"epoch": 42.940930037704234,
"grad_norm": 6.299502372741699,
"learning_rate": 7.077293674067868e-06,
"loss": 5.8213,
"step": 205000
},
{
"epoch": 43.04566401340595,
"grad_norm": 6.302718162536621,
"learning_rate": 6.97255969836615e-06,
"loss": 5.8075,
"step": 205500
},
{
"epoch": 43.15039798910767,
"grad_norm": 5.921250343322754,
"learning_rate": 6.867825722664433e-06,
"loss": 5.8276,
"step": 206000
},
{
"epoch": 43.25513196480939,
"grad_norm": 5.123110771179199,
"learning_rate": 6.763091746962715e-06,
"loss": 5.7965,
"step": 206500
},
{
"epoch": 43.3598659405111,
"grad_norm": 5.187294006347656,
"learning_rate": 6.658357771260998e-06,
"loss": 5.8137,
"step": 207000
},
{
"epoch": 43.46459991621282,
"grad_norm": 5.407510757446289,
"learning_rate": 6.55362379555928e-06,
"loss": 5.8305,
"step": 207500
},
{
"epoch": 43.56933389191454,
"grad_norm": 5.892600059509277,
"learning_rate": 6.449099287808966e-06,
"loss": 5.8167,
"step": 208000
},
{
"epoch": 43.67406786761625,
"grad_norm": 5.39382266998291,
"learning_rate": 6.344365312107248e-06,
"loss": 5.8185,
"step": 208500
},
{
"epoch": 43.77880184331797,
"grad_norm": 5.608034133911133,
"learning_rate": 6.23963133640553e-06,
"loss": 5.8051,
"step": 209000
},
{
"epoch": 43.88353581901969,
"grad_norm": 6.069722652435303,
"learning_rate": 6.1348973607038125e-06,
"loss": 5.8101,
"step": 209500
},
{
"epoch": 43.988269794721404,
"grad_norm": 5.938599109649658,
"learning_rate": 6.0301633850020955e-06,
"loss": 5.807,
"step": 210000
},
{
"epoch": 44.093003770423124,
"grad_norm": 5.808456897735596,
"learning_rate": 5.925429409300378e-06,
"loss": 5.8246,
"step": 210500
},
{
"epoch": 44.197737746124844,
"grad_norm": 5.229996681213379,
"learning_rate": 5.820904901550063e-06,
"loss": 5.8029,
"step": 211000
},
{
"epoch": 44.302471721826564,
"grad_norm": 5.295706748962402,
"learning_rate": 5.716170925848346e-06,
"loss": 5.8094,
"step": 211500
},
{
"epoch": 44.407205697528276,
"grad_norm": 5.649194240570068,
"learning_rate": 5.611436950146628e-06,
"loss": 5.811,
"step": 212000
},
{
"epoch": 44.511939673229996,
"grad_norm": 6.5928521156311035,
"learning_rate": 5.50670297444491e-06,
"loss": 5.7974,
"step": 212500
},
{
"epoch": 44.616673648931716,
"grad_norm": 6.246605396270752,
"learning_rate": 5.401968998743192e-06,
"loss": 5.8011,
"step": 213000
},
{
"epoch": 44.72140762463343,
"grad_norm": 5.312093734741211,
"learning_rate": 5.2972350230414745e-06,
"loss": 5.7819,
"step": 213500
},
{
"epoch": 44.82614160033515,
"grad_norm": 5.348554611206055,
"learning_rate": 5.19271051529116e-06,
"loss": 5.8027,
"step": 214000
},
{
"epoch": 44.93087557603687,
"grad_norm": 5.95352029800415,
"learning_rate": 5.087976539589443e-06,
"loss": 5.8046,
"step": 214500
},
{
"epoch": 45.03560955173858,
"grad_norm": 5.978014945983887,
"learning_rate": 4.983242563887726e-06,
"loss": 5.8021,
"step": 215000
},
{
"epoch": 45.1403435274403,
"grad_norm": 5.595849990844727,
"learning_rate": 4.878508588186008e-06,
"loss": 5.7996,
"step": 215500
},
{
"epoch": 45.24507750314202,
"grad_norm": 5.570345401763916,
"learning_rate": 4.77377461248429e-06,
"loss": 5.7973,
"step": 216000
},
{
"epoch": 45.34981147884373,
"grad_norm": 5.320748805999756,
"learning_rate": 4.669040636782573e-06,
"loss": 5.7886,
"step": 216500
},
{
"epoch": 45.45454545454545,
"grad_norm": 4.676185607910156,
"learning_rate": 4.564516129032258e-06,
"loss": 5.7874,
"step": 217000
},
{
"epoch": 45.55927943024717,
"grad_norm": 5.7768473625183105,
"learning_rate": 4.45978215333054e-06,
"loss": 5.7875,
"step": 217500
},
{
"epoch": 45.66401340594889,
"grad_norm": 5.668895244598389,
"learning_rate": 4.355048177628823e-06,
"loss": 5.8121,
"step": 218000
},
{
"epoch": 45.768747381650606,
"grad_norm": 5.033557891845703,
"learning_rate": 4.2503142019271055e-06,
"loss": 5.8137,
"step": 218500
},
{
"epoch": 45.873481357352325,
"grad_norm": 6.15772819519043,
"learning_rate": 4.145580226225388e-06,
"loss": 5.8137,
"step": 219000
},
{
"epoch": 45.978215333054045,
"grad_norm": 6.617910861968994,
"learning_rate": 4.04084625052367e-06,
"loss": 5.8079,
"step": 219500
},
{
"epoch": 46.08294930875576,
"grad_norm": 5.210205554962158,
"learning_rate": 3.936112274821952e-06,
"loss": 5.8189,
"step": 220000
},
{
"epoch": 46.18768328445748,
"grad_norm": 5.630945205688477,
"learning_rate": 3.831587767071639e-06,
"loss": 5.795,
"step": 220500
},
{
"epoch": 46.2924172601592,
"grad_norm": 5.8690032958984375,
"learning_rate": 3.7268537913699205e-06,
"loss": 5.8019,
"step": 221000
},
{
"epoch": 46.39715123586091,
"grad_norm": 5.787112712860107,
"learning_rate": 3.6221198156682027e-06,
"loss": 5.7901,
"step": 221500
},
{
"epoch": 46.50188521156263,
"grad_norm": 5.568469524383545,
"learning_rate": 3.5173858399664853e-06,
"loss": 5.803,
"step": 222000
},
{
"epoch": 46.60661918726435,
"grad_norm": 5.671326637268066,
"learning_rate": 3.4126518642647675e-06,
"loss": 5.7931,
"step": 222500
},
{
"epoch": 46.71135316296606,
"grad_norm": 5.4085307121276855,
"learning_rate": 3.30791788856305e-06,
"loss": 5.7903,
"step": 223000
},
{
"epoch": 46.81608713866778,
"grad_norm": 5.7440571784973145,
"learning_rate": 3.203393380812736e-06,
"loss": 5.7897,
"step": 223500
},
{
"epoch": 46.9208211143695,
"grad_norm": 5.144542217254639,
"learning_rate": 3.098659405111018e-06,
"loss": 5.8145,
"step": 224000
},
{
"epoch": 47.02555509007122,
"grad_norm": 5.842213153839111,
"learning_rate": 2.9939254294093008e-06,
"loss": 5.8046,
"step": 224500
},
{
"epoch": 47.130289065772935,
"grad_norm": 6.161410331726074,
"learning_rate": 2.8891914537075826e-06,
"loss": 5.7965,
"step": 225000
},
{
"epoch": 47.235023041474655,
"grad_norm": 6.173724174499512,
"learning_rate": 2.784457478005865e-06,
"loss": 5.7854,
"step": 225500
},
{
"epoch": 47.339757017176375,
"grad_norm": 5.132796287536621,
"learning_rate": 2.679932970255551e-06,
"loss": 5.791,
"step": 226000
},
{
"epoch": 47.44449099287809,
"grad_norm": 6.053417205810547,
"learning_rate": 2.5751989945538332e-06,
"loss": 5.7935,
"step": 226500
},
{
"epoch": 47.54922496857981,
"grad_norm": 5.08466911315918,
"learning_rate": 2.470465018852116e-06,
"loss": 5.8006,
"step": 227000
},
{
"epoch": 47.65395894428153,
"grad_norm": 6.060305595397949,
"learning_rate": 2.365731043150398e-06,
"loss": 5.7839,
"step": 227500
},
{
"epoch": 47.75869291998324,
"grad_norm": 5.808520317077637,
"learning_rate": 2.2609970674486806e-06,
"loss": 5.7933,
"step": 228000
},
{
"epoch": 47.86342689568496,
"grad_norm": 5.413971424102783,
"learning_rate": 2.156472559698366e-06,
"loss": 5.7985,
"step": 228500
},
{
"epoch": 47.96816087138668,
"grad_norm": 6.4786529541015625,
"learning_rate": 2.0517385839966487e-06,
"loss": 5.8059,
"step": 229000
},
{
"epoch": 48.07289484708839,
"grad_norm": 5.606147289276123,
"learning_rate": 1.947004608294931e-06,
"loss": 5.7934,
"step": 229500
},
{
"epoch": 48.17762882279011,
"grad_norm": 5.827240467071533,
"learning_rate": 1.8422706325932133e-06,
"loss": 5.8007,
"step": 230000
},
{
"epoch": 48.28236279849183,
"grad_norm": 5.678854465484619,
"learning_rate": 1.7375366568914957e-06,
"loss": 5.8038,
"step": 230500
},
{
"epoch": 48.38709677419355,
"grad_norm": 5.42138671875,
"learning_rate": 1.632802681189778e-06,
"loss": 5.7976,
"step": 231000
},
{
"epoch": 48.491830749895264,
"grad_norm": 6.03090238571167,
"learning_rate": 1.5280687054880603e-06,
"loss": 5.7874,
"step": 231500
},
{
"epoch": 48.596564725596984,
"grad_norm": 4.660221099853516,
"learning_rate": 1.4233347297863427e-06,
"loss": 5.7742,
"step": 232000
},
{
"epoch": 48.701298701298704,
"grad_norm": 6.032063961029053,
"learning_rate": 1.3188102220360285e-06,
"loss": 5.7866,
"step": 232500
},
{
"epoch": 48.80603267700042,
"grad_norm": 6.296925067901611,
"learning_rate": 1.2142857142857144e-06,
"loss": 5.7922,
"step": 233000
},
{
"epoch": 48.91076665270214,
"grad_norm": 5.2991437911987305,
"learning_rate": 1.1095517385839968e-06,
"loss": 5.7873,
"step": 233500
},
{
"epoch": 49.015500628403856,
"grad_norm": 6.130777835845947,
"learning_rate": 1.0048177628822792e-06,
"loss": 5.8074,
"step": 234000
},
{
"epoch": 49.12023460410557,
"grad_norm": 6.305094242095947,
"learning_rate": 9.000837871805614e-07,
"loss": 5.7847,
"step": 234500
},
{
"epoch": 49.22496857980729,
"grad_norm": 6.156949996948242,
"learning_rate": 7.953498114788438e-07,
"loss": 5.7741,
"step": 235000
},
{
"epoch": 49.32970255550901,
"grad_norm": 6.475966930389404,
"learning_rate": 6.908253037285296e-07,
"loss": 5.8083,
"step": 235500
},
{
"epoch": 49.43443653121072,
"grad_norm": 5.687895774841309,
"learning_rate": 5.86091328026812e-07,
"loss": 5.7976,
"step": 236000
},
{
"epoch": 49.53917050691244,
"grad_norm": 5.615401744842529,
"learning_rate": 4.813573523250943e-07,
"loss": 5.7907,
"step": 236500
},
{
"epoch": 49.64390448261416,
"grad_norm": 6.177160263061523,
"learning_rate": 3.7662337662337666e-07,
"loss": 5.792,
"step": 237000
},
{
"epoch": 49.74863845831588,
"grad_norm": 5.604287624359131,
"learning_rate": 2.71889400921659e-07,
"loss": 5.7837,
"step": 237500
},
{
"epoch": 49.853372434017594,
"grad_norm": 4.826539039611816,
"learning_rate": 1.673648931713448e-07,
"loss": 5.7808,
"step": 238000
},
{
"epoch": 49.95810640971931,
"grad_norm": 5.6649250984191895,
"learning_rate": 6.263091746962715e-08,
"loss": 5.7888,
"step": 238500
},
{
"epoch": 50.0,
"step": 238700,
"total_flos": 5161725447936000.0,
"train_loss": 6.140905278960581,
"train_runtime": 7883.0646,
"train_samples_per_second": 484.444,
"train_steps_per_second": 30.28
}
],
"logging_steps": 500,
"max_steps": 238700,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5161725447936000.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}