{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.989841527834214,
"eval_steps": 500,
"global_step": 1535,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016253555465258026,
"grad_norm": 2.077186346054077,
"learning_rate": 4.9998691031433496e-05,
"loss": 2.8778,
"step": 5
},
{
"epoch": 0.03250711093051605,
"grad_norm": 2.073504686355591,
"learning_rate": 4.999476426280588e-05,
"loss": 2.6432,
"step": 10
},
{
"epoch": 0.048760666395774074,
"grad_norm": 2.1281931400299072,
"learning_rate": 4.998822010531848e-05,
"loss": 2.3167,
"step": 15
},
{
"epoch": 0.0650142218610321,
"grad_norm": 1.090219497680664,
"learning_rate": 4.997905924425903e-05,
"loss": 2.1533,
"step": 20
},
{
"epoch": 0.08126777732629012,
"grad_norm": 1.007896900177002,
"learning_rate": 4.996728263892985e-05,
"loss": 2.0267,
"step": 25
},
{
"epoch": 0.09752133279154815,
"grad_norm": 0.9982665777206421,
"learning_rate": 4.995289152254744e-05,
"loss": 1.9352,
"step": 30
},
{
"epoch": 0.11377488825680618,
"grad_norm": 0.8844298720359802,
"learning_rate": 4.9935887402113315e-05,
"loss": 1.9486,
"step": 35
},
{
"epoch": 0.1300284437220642,
"grad_norm": 0.9337536692619324,
"learning_rate": 4.991627205825621e-05,
"loss": 1.9228,
"step": 40
},
{
"epoch": 0.14628199918732224,
"grad_norm": 0.9377800822257996,
"learning_rate": 4.9894047545045605e-05,
"loss": 1.835,
"step": 45
},
{
"epoch": 0.16253555465258024,
"grad_norm": 0.8525241017341614,
"learning_rate": 4.986921618977664e-05,
"loss": 1.8157,
"step": 50
},
{
"epoch": 0.17878911011783827,
"grad_norm": 0.872871458530426,
"learning_rate": 4.984178059272638e-05,
"loss": 1.8811,
"step": 55
},
{
"epoch": 0.1950426655830963,
"grad_norm": 0.9125804305076599,
"learning_rate": 4.981174362688158e-05,
"loss": 1.8242,
"step": 60
},
{
"epoch": 0.21129622104835433,
"grad_norm": 0.7474733591079712,
"learning_rate": 4.977910843763777e-05,
"loss": 1.7808,
"step": 65
},
{
"epoch": 0.22754977651361236,
"grad_norm": 1.0248199701309204,
"learning_rate": 4.974387844246987e-05,
"loss": 1.8512,
"step": 70
},
{
"epoch": 0.24380333197887039,
"grad_norm": 0.9032835960388184,
"learning_rate": 4.970605733057441e-05,
"loss": 1.8172,
"step": 75
},
{
"epoch": 0.2600568874441284,
"grad_norm": 0.8550340533256531,
"learning_rate": 4.9665649062483115e-05,
"loss": 1.7418,
"step": 80
},
{
"epoch": 0.27631044290938644,
"grad_norm": 0.9429016709327698,
"learning_rate": 4.96226578696482e-05,
"loss": 1.8265,
"step": 85
},
{
"epoch": 0.2925639983746445,
"grad_norm": 0.975885272026062,
"learning_rate": 4.957708825399927e-05,
"loss": 1.7943,
"step": 90
},
{
"epoch": 0.3088175538399025,
"grad_norm": 0.827629804611206,
"learning_rate": 4.9528944987471884e-05,
"loss": 1.8067,
"step": 95
},
{
"epoch": 0.3250711093051605,
"grad_norm": 1.0019093751907349,
"learning_rate": 4.9478233111507856e-05,
"loss": 1.7874,
"step": 100
},
{
"epoch": 0.3413246647704185,
"grad_norm": 1.0811312198638916,
"learning_rate": 4.9424957936527295e-05,
"loss": 1.7395,
"step": 105
},
{
"epoch": 0.35757822023567654,
"grad_norm": 0.9461565613746643,
"learning_rate": 4.936912504137257e-05,
"loss": 1.7833,
"step": 110
},
{
"epoch": 0.37383177570093457,
"grad_norm": 1.0386009216308594,
"learning_rate": 4.9310740272724055e-05,
"loss": 1.7569,
"step": 115
},
{
"epoch": 0.3900853311661926,
"grad_norm": 0.9916568994522095,
"learning_rate": 4.924980974448791e-05,
"loss": 1.7584,
"step": 120
},
{
"epoch": 0.4063388866314506,
"grad_norm": 1.240400791168213,
"learning_rate": 4.918633983715581e-05,
"loss": 1.7438,
"step": 125
},
{
"epoch": 0.42259244209670865,
"grad_norm": 1.1441287994384766,
"learning_rate": 4.912033719713687e-05,
"loss": 1.7046,
"step": 130
},
{
"epoch": 0.4388459975619667,
"grad_norm": 1.1687984466552734,
"learning_rate": 4.905180873606157e-05,
"loss": 1.738,
"step": 135
},
{
"epoch": 0.4550995530272247,
"grad_norm": 0.9126319885253906,
"learning_rate": 4.8980761630058014e-05,
"loss": 1.7738,
"step": 140
},
{
"epoch": 0.47135310849248274,
"grad_norm": 1.172781229019165,
"learning_rate": 4.8907203319000495e-05,
"loss": 1.6642,
"step": 145
},
{
"epoch": 0.48760666395774077,
"grad_norm": 1.1217776536941528,
"learning_rate": 4.883114150573037e-05,
"loss": 1.7359,
"step": 150
},
{
"epoch": 0.5038602194229987,
"grad_norm": 1.1024829149246216,
"learning_rate": 4.8752584155249444e-05,
"loss": 1.6707,
"step": 155
},
{
"epoch": 0.5201137748882568,
"grad_norm": 1.042017936706543,
"learning_rate": 4.8671539493885934e-05,
"loss": 1.709,
"step": 160
},
{
"epoch": 0.5363673303535148,
"grad_norm": 1.1834771633148193,
"learning_rate": 4.8588016008432945e-05,
"loss": 1.7282,
"step": 165
},
{
"epoch": 0.5526208858187729,
"grad_norm": 1.0311099290847778,
"learning_rate": 4.850202244525987e-05,
"loss": 1.774,
"step": 170
},
{
"epoch": 0.5688744412840309,
"grad_norm": 1.2995566129684448,
"learning_rate": 4.8413567809396376e-05,
"loss": 1.714,
"step": 175
},
{
"epoch": 0.585127996749289,
"grad_norm": 1.23491370677948,
"learning_rate": 4.8322661363589507e-05,
"loss": 1.7717,
"step": 180
},
{
"epoch": 0.6013815522145469,
"grad_norm": 1.1809489727020264,
"learning_rate": 4.822931262733367e-05,
"loss": 1.722,
"step": 185
},
{
"epoch": 0.617635107679805,
"grad_norm": 0.988572359085083,
"learning_rate": 4.813353137587377e-05,
"loss": 1.7007,
"step": 190
},
{
"epoch": 0.633888663145063,
"grad_norm": 1.0803310871124268,
"learning_rate": 4.803532763918162e-05,
"loss": 1.6824,
"step": 195
},
{
"epoch": 0.650142218610321,
"grad_norm": 1.02713143825531,
"learning_rate": 4.793471170090555e-05,
"loss": 1.6453,
"step": 200
},
{
"epoch": 0.666395774075579,
"grad_norm": 1.2023831605911255,
"learning_rate": 4.783169409729362e-05,
"loss": 1.6974,
"step": 205
},
{
"epoch": 0.682649329540837,
"grad_norm": 1.0009326934814453,
"learning_rate": 4.772628561609022e-05,
"loss": 1.7094,
"step": 210
},
{
"epoch": 0.6989028850060951,
"grad_norm": 1.269020438194275,
"learning_rate": 4.761849729540643e-05,
"loss": 1.6832,
"step": 215
},
{
"epoch": 0.7151564404713531,
"grad_norm": 1.3129128217697144,
"learning_rate": 4.7508340422564134e-05,
"loss": 1.6847,
"step": 220
},
{
"epoch": 0.7314099959366112,
"grad_norm": 1.0837377309799194,
"learning_rate": 4.7395826532914054e-05,
"loss": 1.7215,
"step": 225
},
{
"epoch": 0.7476635514018691,
"grad_norm": 1.1211832761764526,
"learning_rate": 4.728096740862778e-05,
"loss": 1.7658,
"step": 230
},
{
"epoch": 0.7639171068671272,
"grad_norm": 1.1422042846679688,
"learning_rate": 4.716377507746397e-05,
"loss": 1.7098,
"step": 235
},
{
"epoch": 0.7801706623323852,
"grad_norm": 1.2925424575805664,
"learning_rate": 4.704426181150884e-05,
"loss": 1.7504,
"step": 240
},
{
"epoch": 0.7964242177976433,
"grad_norm": 1.282771348953247,
"learning_rate": 4.6922440125891064e-05,
"loss": 1.72,
"step": 245
},
{
"epoch": 0.8126777732629012,
"grad_norm": 1.2494529485702515,
"learning_rate": 4.6798322777471216e-05,
"loss": 1.6581,
"step": 250
},
{
"epoch": 0.8289313287281593,
"grad_norm": 1.2736073732376099,
"learning_rate": 4.667192276350591e-05,
"loss": 1.6825,
"step": 255
},
{
"epoch": 0.8451848841934173,
"grad_norm": 1.1477563381195068,
"learning_rate": 4.654325332028676e-05,
"loss": 1.6889,
"step": 260
},
{
"epoch": 0.8614384396586753,
"grad_norm": 1.2415803670883179,
"learning_rate": 4.641232792175428e-05,
"loss": 1.6586,
"step": 265
},
{
"epoch": 0.8776919951239334,
"grad_norm": 1.3159024715423584,
"learning_rate": 4.6279160278086994e-05,
"loss": 1.7102,
"step": 270
},
{
"epoch": 0.8939455505891913,
"grad_norm": 1.2742581367492676,
"learning_rate": 4.614376433426565e-05,
"loss": 1.7476,
"step": 275
},
{
"epoch": 0.9101991060544494,
"grad_norm": 1.34221351146698,
"learning_rate": 4.6006154268613014e-05,
"loss": 1.7497,
"step": 280
},
{
"epoch": 0.9264526615197074,
"grad_norm": 1.1669361591339111,
"learning_rate": 4.586634449130911e-05,
"loss": 1.6693,
"step": 285
},
{
"epoch": 0.9427062169849655,
"grad_norm": 1.2765443325042725,
"learning_rate": 4.572434964288226e-05,
"loss": 1.5862,
"step": 290
},
{
"epoch": 0.9589597724502235,
"grad_norm": 1.2429810762405396,
"learning_rate": 4.55801845926759e-05,
"loss": 1.6875,
"step": 295
},
{
"epoch": 0.9752133279154815,
"grad_norm": 1.2103233337402344,
"learning_rate": 4.543386443729157e-05,
"loss": 1.7294,
"step": 300
},
{
"epoch": 0.9914668833807395,
"grad_norm": 1.3512628078460693,
"learning_rate": 4.528540449900798e-05,
"loss": 1.709,
"step": 305
},
{
"epoch": 1.0077204388459975,
"grad_norm": 1.2039848566055298,
"learning_rate": 4.513482032417656e-05,
"loss": 1.6632,
"step": 310
},
{
"epoch": 1.0239739943112556,
"grad_norm": 1.3805475234985352,
"learning_rate": 4.498212768159341e-05,
"loss": 1.695,
"step": 315
},
{
"epoch": 1.0402275497765137,
"grad_norm": 1.3265151977539062,
"learning_rate": 4.48273425608481e-05,
"loss": 1.6653,
"step": 320
},
{
"epoch": 1.0564811052417715,
"grad_norm": 1.015641212463379,
"learning_rate": 4.4670481170649214e-05,
"loss": 1.652,
"step": 325
},
{
"epoch": 1.0727346607070296,
"grad_norm": 1.2359689474105835,
"learning_rate": 4.451155993712711e-05,
"loss": 1.6457,
"step": 330
},
{
"epoch": 1.0889882161722877,
"grad_norm": 1.3188092708587646,
"learning_rate": 4.435059550211371e-05,
"loss": 1.6512,
"step": 335
},
{
"epoch": 1.1052417716375458,
"grad_norm": 1.323879361152649,
"learning_rate": 4.4187604721399877e-05,
"loss": 1.6338,
"step": 340
},
{
"epoch": 1.1214953271028036,
"grad_norm": 1.263047218322754,
"learning_rate": 4.40226046629703e-05,
"loss": 1.6508,
"step": 345
},
{
"epoch": 1.1377488825680617,
"grad_norm": 1.265649676322937,
"learning_rate": 4.3855612605216174e-05,
"loss": 1.6447,
"step": 350
},
{
"epoch": 1.1540024380333198,
"grad_norm": 1.2379087209701538,
"learning_rate": 4.3686646035125855e-05,
"loss": 1.6522,
"step": 355
},
{
"epoch": 1.170255993498578,
"grad_norm": 1.200422763824463,
"learning_rate": 4.351572264645366e-05,
"loss": 1.6935,
"step": 360
},
{
"epoch": 1.1865095489638358,
"grad_norm": 1.3982837200164795,
"learning_rate": 4.334286033786704e-05,
"loss": 1.6792,
"step": 365
},
{
"epoch": 1.2027631044290938,
"grad_norm": 1.3108696937561035,
"learning_rate": 4.3168077211072263e-05,
"loss": 1.6498,
"step": 370
},
{
"epoch": 1.219016659894352,
"grad_norm": 1.3713555335998535,
"learning_rate": 4.2991391568918825e-05,
"loss": 1.6675,
"step": 375
},
{
"epoch": 1.23527021535961,
"grad_norm": 1.406441330909729,
"learning_rate": 4.281282191348289e-05,
"loss": 1.6328,
"step": 380
},
{
"epoch": 1.2515237708248679,
"grad_norm": 1.3209528923034668,
"learning_rate": 4.2632386944129707e-05,
"loss": 1.7064,
"step": 385
},
{
"epoch": 1.267777326290126,
"grad_norm": 1.4870327711105347,
"learning_rate": 4.245010555555554e-05,
"loss": 1.7161,
"step": 390
},
{
"epoch": 1.284030881755384,
"grad_norm": 1.4530566930770874,
"learning_rate": 4.2265996835809016e-05,
"loss": 1.6267,
"step": 395
},
{
"epoch": 1.300284437220642,
"grad_norm": 1.4400850534439087,
"learning_rate": 4.2080080064292304e-05,
"loss": 1.6029,
"step": 400
},
{
"epoch": 1.3165379926859,
"grad_norm": 1.389769434928894,
"learning_rate": 4.189237470974219e-05,
"loss": 1.6298,
"step": 405
},
{
"epoch": 1.332791548151158,
"grad_norm": 1.2579468488693237,
"learning_rate": 4.1702900428191374e-05,
"loss": 1.6965,
"step": 410
},
{
"epoch": 1.3490451036164162,
"grad_norm": 1.4243718385696411,
"learning_rate": 4.151167706091017e-05,
"loss": 1.5955,
"step": 415
},
{
"epoch": 1.3652986590816742,
"grad_norm": 1.395219087600708,
"learning_rate": 4.131872463232872e-05,
"loss": 1.6369,
"step": 420
},
{
"epoch": 1.3815522145469321,
"grad_norm": 1.4617631435394287,
"learning_rate": 4.1124063347940135e-05,
"loss": 1.6105,
"step": 425
},
{
"epoch": 1.3978057700121902,
"grad_norm": 1.1881122589111328,
"learning_rate": 4.092771359218461e-05,
"loss": 1.6372,
"step": 430
},
{
"epoch": 1.414059325477448,
"grad_norm": 1.330495834350586,
"learning_rate": 4.0729695926314815e-05,
"loss": 1.5817,
"step": 435
},
{
"epoch": 1.4303128809427061,
"grad_norm": 1.3312103748321533,
"learning_rate": 4.053003108624276e-05,
"loss": 1.6602,
"step": 440
},
{
"epoch": 1.4465664364079642,
"grad_norm": 1.3933234214782715,
"learning_rate": 4.03287399803684e-05,
"loss": 1.63,
"step": 445
},
{
"epoch": 1.4628199918732223,
"grad_norm": 1.600183129310608,
"learning_rate": 4.0125843687390166e-05,
"loss": 1.6614,
"step": 450
},
{
"epoch": 1.4790735473384804,
"grad_norm": 1.4344749450683594,
"learning_rate": 3.992136345409765e-05,
"loss": 1.6492,
"step": 455
},
{
"epoch": 1.4953271028037383,
"grad_norm": 1.4546074867248535,
"learning_rate": 3.9715320693146655e-05,
"loss": 1.6063,
"step": 460
},
{
"epoch": 1.5115806582689963,
"grad_norm": 1.2904654741287231,
"learning_rate": 3.9507736980817e-05,
"loss": 1.6618,
"step": 465
},
{
"epoch": 1.5278342137342542,
"grad_norm": 1.8645012378692627,
"learning_rate": 3.9298634054753026e-05,
"loss": 1.697,
"step": 470
},
{
"epoch": 1.5440877691995123,
"grad_norm": 1.4428379535675049,
"learning_rate": 3.908803381168732e-05,
"loss": 1.6741,
"step": 475
},
{
"epoch": 1.5603413246647704,
"grad_norm": 1.4418410062789917,
"learning_rate": 3.887595830514775e-05,
"loss": 1.7026,
"step": 480
},
{
"epoch": 1.5765948801300285,
"grad_norm": 1.3783613443374634,
"learning_rate": 3.8662429743148046e-05,
"loss": 1.6692,
"step": 485
},
{
"epoch": 1.5928484355952865,
"grad_norm": 1.3248358964920044,
"learning_rate": 3.844747048586228e-05,
"loss": 1.6315,
"step": 490
},
{
"epoch": 1.6091019910605446,
"grad_norm": 1.3751637935638428,
"learning_rate": 3.823110304328331e-05,
"loss": 1.6716,
"step": 495
},
{
"epoch": 1.6253555465258025,
"grad_norm": 1.3097206354141235,
"learning_rate": 3.801335007286564e-05,
"loss": 1.5766,
"step": 500
},
{
"epoch": 1.6416091019910606,
"grad_norm": 1.402269959449768,
"learning_rate": 3.779423437715274e-05,
"loss": 1.6405,
"step": 505
},
{
"epoch": 1.6578626574563184,
"grad_norm": 1.6543242931365967,
"learning_rate": 3.757377890138927e-05,
"loss": 1.6307,
"step": 510
},
{
"epoch": 1.6741162129215765,
"grad_norm": 1.4377750158309937,
"learning_rate": 3.7352006731118264e-05,
"loss": 1.588,
"step": 515
},
{
"epoch": 1.6903697683868346,
"grad_norm": 1.578800916671753,
"learning_rate": 3.712894108976372e-05,
"loss": 1.6307,
"step": 520
},
{
"epoch": 1.7066233238520927,
"grad_norm": 1.4415676593780518,
"learning_rate": 3.690460533619866e-05,
"loss": 1.6854,
"step": 525
},
{
"epoch": 1.7228768793173508,
"grad_norm": 1.5168665647506714,
"learning_rate": 3.667902296229905e-05,
"loss": 1.599,
"step": 530
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.3709524869918823,
"learning_rate": 3.645221759048384e-05,
"loss": 1.6427,
"step": 535
},
{
"epoch": 1.7553839902478667,
"grad_norm": 1.4154709577560425,
"learning_rate": 3.622421297124122e-05,
"loss": 1.5486,
"step": 540
},
{
"epoch": 1.7716375457131246,
"grad_norm": 1.5655745267868042,
"learning_rate": 3.599503298064154e-05,
"loss": 1.6065,
"step": 545
},
{
"epoch": 1.7878911011783827,
"grad_norm": 1.5691167116165161,
"learning_rate": 3.576470161783712e-05,
"loss": 1.6194,
"step": 550
},
{
"epoch": 1.8041446566436408,
"grad_norm": 1.5229332447052002,
"learning_rate": 3.5533243002549046e-05,
"loss": 1.6312,
"step": 555
},
{
"epoch": 1.8203982121088988,
"grad_norm": 2.0803072452545166,
"learning_rate": 3.5300681372541476e-05,
"loss": 1.5872,
"step": 560
},
{
"epoch": 1.836651767574157,
"grad_norm": 1.4445246458053589,
"learning_rate": 3.5067041081083496e-05,
"loss": 1.6226,
"step": 565
},
{
"epoch": 1.852905323039415,
"grad_norm": 1.559645652770996,
"learning_rate": 3.483234659439889e-05,
"loss": 1.5703,
"step": 570
},
{
"epoch": 1.8691588785046729,
"grad_norm": 1.3045098781585693,
"learning_rate": 3.459662248910411e-05,
"loss": 1.5574,
"step": 575
},
{
"epoch": 1.885412433969931,
"grad_norm": 1.410343050956726,
"learning_rate": 3.435989344963471e-05,
"loss": 1.6836,
"step": 580
},
{
"epoch": 1.9016659894351888,
"grad_norm": 1.5801265239715576,
"learning_rate": 3.41221842656604e-05,
"loss": 1.6366,
"step": 585
},
{
"epoch": 1.917919544900447,
"grad_norm": 1.440772533416748,
"learning_rate": 3.3883519829489154e-05,
"loss": 1.6395,
"step": 590
},
{
"epoch": 1.934173100365705,
"grad_norm": 1.3845847845077515,
"learning_rate": 3.3643925133460564e-05,
"loss": 1.6139,
"step": 595
},
{
"epoch": 1.950426655830963,
"grad_norm": 1.3879539966583252,
"learning_rate": 3.3403425267328715e-05,
"loss": 1.5887,
"step": 600
},
{
"epoch": 1.9666802112962212,
"grad_norm": 1.3762508630752563,
"learning_rate": 3.316204541563479e-05,
"loss": 1.636,
"step": 605
},
{
"epoch": 1.982933766761479,
"grad_norm": 1.6764074563980103,
"learning_rate": 3.2919810855069864e-05,
"loss": 1.6699,
"step": 610
},
{
"epoch": 1.9991873222267371,
"grad_norm": 1.5237112045288086,
"learning_rate": 3.267674695182798e-05,
"loss": 1.6194,
"step": 615
},
{
"epoch": 2.015440877691995,
"grad_norm": 1.7229068279266357,
"learning_rate": 3.243287915894987e-05,
"loss": 1.6077,
"step": 620
},
{
"epoch": 2.031694433157253,
"grad_norm": 1.5503779649734497,
"learning_rate": 3.218823301365754e-05,
"loss": 1.6024,
"step": 625
},
{
"epoch": 2.047947988622511,
"grad_norm": 1.584635853767395,
"learning_rate": 3.1942834134680124e-05,
"loss": 1.5911,
"step": 630
},
{
"epoch": 2.0642015440877692,
"grad_norm": 1.4763562679290771,
"learning_rate": 3.1696708219571126e-05,
"loss": 1.6143,
"step": 635
},
{
"epoch": 2.0804550995530273,
"grad_norm": 1.5131165981292725,
"learning_rate": 3.144988104201745e-05,
"loss": 1.6069,
"step": 640
},
{
"epoch": 2.0967086550182854,
"grad_norm": 1.595162272453308,
"learning_rate": 3.120237844914044e-05,
"loss": 1.6128,
"step": 645
},
{
"epoch": 2.112962210483543,
"grad_norm": 1.9360445737838745,
"learning_rate": 3.095422635878923e-05,
"loss": 1.6004,
"step": 650
},
{
"epoch": 2.129215765948801,
"grad_norm": 1.4424818754196167,
"learning_rate": 3.0705450756826706e-05,
"loss": 1.6084,
"step": 655
},
{
"epoch": 2.145469321414059,
"grad_norm": 1.6683636903762817,
"learning_rate": 3.045607769440829e-05,
"loss": 1.6063,
"step": 660
},
{
"epoch": 2.1617228768793173,
"grad_norm": 1.9396454095840454,
"learning_rate": 3.0206133285254017e-05,
"loss": 1.6374,
"step": 665
},
{
"epoch": 2.1779764323445754,
"grad_norm": 1.9183346033096313,
"learning_rate": 2.9955643702913865e-05,
"loss": 1.529,
"step": 670
},
{
"epoch": 2.1942299878098335,
"grad_norm": 1.61932373046875,
"learning_rate": 2.9704635178027012e-05,
"loss": 1.5607,
"step": 675
},
{
"epoch": 2.2104835432750916,
"grad_norm": 1.7259231805801392,
"learning_rate": 2.9453133995574955e-05,
"loss": 1.5854,
"step": 680
},
{
"epoch": 2.2267370987403496,
"grad_norm": 1.675529956817627,
"learning_rate": 2.9201166492129088e-05,
"loss": 1.5563,
"step": 685
},
{
"epoch": 2.2429906542056073,
"grad_norm": 1.628509283065796,
"learning_rate": 2.8948759053092754e-05,
"loss": 1.5781,
"step": 690
},
{
"epoch": 2.2592442096708654,
"grad_norm": 1.5093809366226196,
"learning_rate": 2.869593810993824e-05,
"loss": 1.5759,
"step": 695
},
{
"epoch": 2.2754977651361235,
"grad_norm": 1.8209433555603027,
"learning_rate": 2.844273013743896e-05,
"loss": 1.6089,
"step": 700
},
{
"epoch": 2.2917513206013815,
"grad_norm": 2.2926626205444336,
"learning_rate": 2.8189161650897045e-05,
"loss": 1.578,
"step": 705
},
{
"epoch": 2.3080048760666396,
"grad_norm": 1.6229182481765747,
"learning_rate": 2.7935259203366777e-05,
"loss": 1.5563,
"step": 710
},
{
"epoch": 2.3242584315318977,
"grad_norm": 1.8527780771255493,
"learning_rate": 2.7681049382873965e-05,
"loss": 1.5463,
"step": 715
},
{
"epoch": 2.340511986997156,
"grad_norm": 1.877462387084961,
"learning_rate": 2.7426558809631748e-05,
"loss": 1.5227,
"step": 720
},
{
"epoch": 2.356765542462414,
"grad_norm": 1.5895532369613647,
"learning_rate": 2.7171814133253015e-05,
"loss": 1.6244,
"step": 725
},
{
"epoch": 2.3730190979276715,
"grad_norm": 1.5113598108291626,
"learning_rate": 2.691684202995966e-05,
"loss": 1.5977,
"step": 730
},
{
"epoch": 2.3892726533929296,
"grad_norm": 1.582775592803955,
"learning_rate": 2.6661669199789174e-05,
"loss": 1.6314,
"step": 735
},
{
"epoch": 2.4055262088581877,
"grad_norm": 1.6638239622116089,
"learning_rate": 2.6406322363798657e-05,
"loss": 1.5892,
"step": 740
},
{
"epoch": 2.4217797643234458,
"grad_norm": 1.538442850112915,
"learning_rate": 2.6150828261266642e-05,
"loss": 1.5512,
"step": 745
},
{
"epoch": 2.438033319788704,
"grad_norm": 1.6044487953186035,
"learning_rate": 2.589521364689308e-05,
"loss": 1.5288,
"step": 750
},
{
"epoch": 2.454286875253962,
"grad_norm": 1.8566341400146484,
"learning_rate": 2.5639505287997583e-05,
"loss": 1.6311,
"step": 755
},
{
"epoch": 2.47054043071922,
"grad_norm": 1.7828178405761719,
"learning_rate": 2.5383729961716483e-05,
"loss": 1.5705,
"step": 760
},
{
"epoch": 2.4867939861844777,
"grad_norm": 1.8720645904541016,
"learning_rate": 2.512791445219876e-05,
"loss": 1.5955,
"step": 765
},
{
"epoch": 2.5030475416497358,
"grad_norm": 1.5776010751724243,
"learning_rate": 2.487208554780125e-05,
"loss": 1.6158,
"step": 770
},
{
"epoch": 2.519301097114994,
"grad_norm": 1.802860140800476,
"learning_rate": 2.461627003828352e-05,
"loss": 1.5844,
"step": 775
},
{
"epoch": 2.535554652580252,
"grad_norm": 1.8349519968032837,
"learning_rate": 2.4360494712002423e-05,
"loss": 1.5486,
"step": 780
},
{
"epoch": 2.55180820804551,
"grad_norm": 1.77409029006958,
"learning_rate": 2.4104786353106926e-05,
"loss": 1.5918,
"step": 785
},
{
"epoch": 2.568061763510768,
"grad_norm": 1.7446224689483643,
"learning_rate": 2.384917173873336e-05,
"loss": 1.5554,
"step": 790
},
{
"epoch": 2.584315318976026,
"grad_norm": 1.7830730676651,
"learning_rate": 2.359367763620135e-05,
"loss": 1.6061,
"step": 795
},
{
"epoch": 2.600568874441284,
"grad_norm": 1.542494773864746,
"learning_rate": 2.3338330800210828e-05,
"loss": 1.6328,
"step": 800
},
{
"epoch": 2.616822429906542,
"grad_norm": 1.7287977933883667,
"learning_rate": 2.3083157970040344e-05,
"loss": 1.5585,
"step": 805
},
{
"epoch": 2.6330759853718,
"grad_norm": 1.6727949380874634,
"learning_rate": 2.282818586674699e-05,
"loss": 1.6686,
"step": 810
},
{
"epoch": 2.649329540837058,
"grad_norm": 1.8130536079406738,
"learning_rate": 2.2573441190368255e-05,
"loss": 1.606,
"step": 815
},
{
"epoch": 2.665583096302316,
"grad_norm": 1.8258434534072876,
"learning_rate": 2.2318950617126044e-05,
"loss": 1.5736,
"step": 820
},
{
"epoch": 2.6818366517675742,
"grad_norm": 1.8970004320144653,
"learning_rate": 2.206474079663323e-05,
"loss": 1.5835,
"step": 825
},
{
"epoch": 2.6980902072328323,
"grad_norm": 1.6587392091751099,
"learning_rate": 2.181083834910296e-05,
"loss": 1.6072,
"step": 830
},
{
"epoch": 2.71434376269809,
"grad_norm": 2.0163655281066895,
"learning_rate": 2.1557269862561043e-05,
"loss": 1.6178,
"step": 835
},
{
"epoch": 2.7305973181633485,
"grad_norm": 1.669689416885376,
"learning_rate": 2.1304061890061762e-05,
"loss": 1.5864,
"step": 840
},
{
"epoch": 2.746850873628606,
"grad_norm": 1.7801822423934937,
"learning_rate": 2.105124094690725e-05,
"loss": 1.5186,
"step": 845
},
{
"epoch": 2.7631044290938642,
"grad_norm": 1.7704806327819824,
"learning_rate": 2.0798833507870918e-05,
"loss": 1.5555,
"step": 850
},
{
"epoch": 2.7793579845591223,
"grad_norm": 1.954834222793579,
"learning_rate": 2.0546866004425054e-05,
"loss": 1.6356,
"step": 855
},
{
"epoch": 2.7956115400243804,
"grad_norm": 1.6739940643310547,
"learning_rate": 2.0295364821972997e-05,
"loss": 1.5781,
"step": 860
},
{
"epoch": 2.8118650954896385,
"grad_norm": 1.758725881576538,
"learning_rate": 2.0044356297086134e-05,
"loss": 1.6308,
"step": 865
},
{
"epoch": 2.828118650954896,
"grad_norm": 1.7557764053344727,
"learning_rate": 1.9793866714745978e-05,
"loss": 1.5366,
"step": 870
},
{
"epoch": 2.8443722064201546,
"grad_norm": 1.6240283250808716,
"learning_rate": 1.9543922305591704e-05,
"loss": 1.6104,
"step": 875
},
{
"epoch": 2.8606257618854123,
"grad_norm": 1.7991812229156494,
"learning_rate": 1.9294549243173303e-05,
"loss": 1.6576,
"step": 880
},
{
"epoch": 2.8768793173506704,
"grad_norm": 1.8438798189163208,
"learning_rate": 1.904577364121077e-05,
"loss": 1.5634,
"step": 885
},
{
"epoch": 2.8931328728159285,
"grad_norm": 2.0317084789276123,
"learning_rate": 1.879762155085956e-05,
"loss": 1.5538,
"step": 890
},
{
"epoch": 2.9093864282811865,
"grad_norm": 1.6778262853622437,
"learning_rate": 1.8550118957982547e-05,
"loss": 1.6398,
"step": 895
},
{
"epoch": 2.9256399837464446,
"grad_norm": 1.8134307861328125,
"learning_rate": 1.8303291780428876e-05,
"loss": 1.4889,
"step": 900
},
{
"epoch": 2.9418935392117027,
"grad_norm": 1.9006396532058716,
"learning_rate": 1.8057165865319882e-05,
"loss": 1.582,
"step": 905
},
{
"epoch": 2.958147094676961,
"grad_norm": 1.8433765172958374,
"learning_rate": 1.781176698634246e-05,
"loss": 1.5856,
"step": 910
},
{
"epoch": 2.9744006501422184,
"grad_norm": 1.6938343048095703,
"learning_rate": 1.7567120841050135e-05,
"loss": 1.599,
"step": 915
},
{
"epoch": 2.9906542056074765,
"grad_norm": 1.868025302886963,
"learning_rate": 1.7323253048172013e-05,
"loss": 1.5799,
"step": 920
},
{
"epoch": 3.0069077610727346,
"grad_norm": 1.767774224281311,
"learning_rate": 1.7080189144930135e-05,
"loss": 1.6014,
"step": 925
},
{
"epoch": 3.0231613165379927,
"grad_norm": 1.7357462644577026,
"learning_rate": 1.6837954584365216e-05,
"loss": 1.5911,
"step": 930
},
{
"epoch": 3.039414872003251,
"grad_norm": 1.7004544734954834,
"learning_rate": 1.6596574732671287e-05,
"loss": 1.5304,
"step": 935
},
{
"epoch": 3.055668427468509,
"grad_norm": 1.7053595781326294,
"learning_rate": 1.6356074866539435e-05,
"loss": 1.548,
"step": 940
},
{
"epoch": 3.071921982933767,
"grad_norm": 1.7751458883285522,
"learning_rate": 1.6116480170510852e-05,
"loss": 1.6273,
"step": 945
},
{
"epoch": 3.0881755383990246,
"grad_norm": 1.8386896848678589,
"learning_rate": 1.5877815734339608e-05,
"loss": 1.5394,
"step": 950
},
{
"epoch": 3.1044290938642827,
"grad_norm": 1.6355414390563965,
"learning_rate": 1.5640106550365298e-05,
"loss": 1.5259,
"step": 955
},
{
"epoch": 3.1206826493295408,
"grad_norm": 1.7519521713256836,
"learning_rate": 1.54033775108959e-05,
"loss": 1.6105,
"step": 960
},
{
"epoch": 3.136936204794799,
"grad_norm": 1.690624713897705,
"learning_rate": 1.5167653405601124e-05,
"loss": 1.5488,
"step": 965
},
{
"epoch": 3.153189760260057,
"grad_norm": 1.7758216857910156,
"learning_rate": 1.4932958918916513e-05,
"loss": 1.5906,
"step": 970
},
{
"epoch": 3.169443315725315,
"grad_norm": 1.7740142345428467,
"learning_rate": 1.469931862745853e-05,
"loss": 1.5361,
"step": 975
},
{
"epoch": 3.185696871190573,
"grad_norm": 1.8564815521240234,
"learning_rate": 1.446675699745097e-05,
"loss": 1.5407,
"step": 980
},
{
"epoch": 3.2019504266558307,
"grad_norm": 1.829458236694336,
"learning_rate": 1.4235298382162899e-05,
"loss": 1.5962,
"step": 985
},
{
"epoch": 3.218203982121089,
"grad_norm": 1.9702229499816895,
"learning_rate": 1.4004967019358469e-05,
"loss": 1.5905,
"step": 990
},
{
"epoch": 3.234457537586347,
"grad_norm": 1.8121120929718018,
"learning_rate": 1.3775787028758799e-05,
"loss": 1.5726,
"step": 995
},
{
"epoch": 3.250711093051605,
"grad_norm": 2.0297420024871826,
"learning_rate": 1.354778240951617e-05,
"loss": 1.5484,
"step": 1000
},
{
"epoch": 3.266964648516863,
"grad_norm": 1.9334895610809326,
"learning_rate": 1.3320977037700952e-05,
"loss": 1.5091,
"step": 1005
},
{
"epoch": 3.283218203982121,
"grad_norm": 1.7220382690429688,
"learning_rate": 1.3095394663801347e-05,
"loss": 1.5658,
"step": 1010
},
{
"epoch": 3.2994717594473792,
"grad_norm": 1.8911561965942383,
"learning_rate": 1.2871058910236292e-05,
"loss": 1.5774,
"step": 1015
},
{
"epoch": 3.315725314912637,
"grad_norm": 1.8349742889404297,
"learning_rate": 1.2647993268881745e-05,
"loss": 1.5463,
"step": 1020
},
{
"epoch": 3.331978870377895,
"grad_norm": 1.8155410289764404,
"learning_rate": 1.2426221098610741e-05,
"loss": 1.55,
"step": 1025
},
{
"epoch": 3.348232425843153,
"grad_norm": 2.0373146533966064,
"learning_rate": 1.2205765622847274e-05,
"loss": 1.5677,
"step": 1030
},
{
"epoch": 3.364485981308411,
"grad_norm": 1.8565573692321777,
"learning_rate": 1.1986649927134372e-05,
"loss": 1.594,
"step": 1035
},
{
"epoch": 3.3807395367736692,
"grad_norm": 1.7318943738937378,
"learning_rate": 1.1768896956716693e-05,
"loss": 1.5615,
"step": 1040
},
{
"epoch": 3.3969930922389273,
"grad_norm": 2.03358793258667,
"learning_rate": 1.1552529514137733e-05,
"loss": 1.5068,
"step": 1045
},
{
"epoch": 3.4132466477041854,
"grad_norm": 1.943980097770691,
"learning_rate": 1.1337570256851963e-05,
"loss": 1.5696,
"step": 1050
},
{
"epoch": 3.4295002031694435,
"grad_norm": 1.7781481742858887,
"learning_rate": 1.1124041694852258e-05,
"loss": 1.5515,
"step": 1055
},
{
"epoch": 3.4457537586347016,
"grad_norm": 1.72038733959198,
"learning_rate": 1.091196618831268e-05,
"loss": 1.5536,
"step": 1060
},
{
"epoch": 3.462007314099959,
"grad_norm": 1.7962324619293213,
"learning_rate": 1.070136594524698e-05,
"loss": 1.578,
"step": 1065
},
{
"epoch": 3.4782608695652173,
"grad_norm": 2.4699037075042725,
"learning_rate": 1.0492263019183002e-05,
"loss": 1.5386,
"step": 1070
},
{
"epoch": 3.4945144250304754,
"grad_norm": 1.756534218788147,
"learning_rate": 1.0284679306853343e-05,
"loss": 1.5757,
"step": 1075
},
{
"epoch": 3.5107679804957335,
"grad_norm": 1.8124436140060425,
"learning_rate": 1.0078636545902362e-05,
"loss": 1.5579,
"step": 1080
},
{
"epoch": 3.5270215359609915,
"grad_norm": 1.7683898210525513,
"learning_rate": 9.874156312609836e-06,
"loss": 1.5616,
"step": 1085
},
{
"epoch": 3.5432750914262496,
"grad_norm": 1.870835542678833,
"learning_rate": 9.671260019631604e-06,
"loss": 1.5243,
"step": 1090
},
{
"epoch": 3.5595286468915077,
"grad_norm": 1.8171908855438232,
"learning_rate": 9.469968913757254e-06,
"loss": 1.6215,
"step": 1095
},
{
"epoch": 3.5757822023567654,
"grad_norm": 1.8751733303070068,
"learning_rate": 9.270304073685193e-06,
"loss": 1.5294,
"step": 1100
},
{
"epoch": 3.5920357578220234,
"grad_norm": 1.6688753366470337,
"learning_rate": 9.07228640781539e-06,
"loss": 1.546,
"step": 1105
},
{
"epoch": 3.6082893132872815,
"grad_norm": 1.7186001539230347,
"learning_rate": 8.875936652059872e-06,
"loss": 1.5659,
"step": 1110
},
{
"epoch": 3.6245428687525396,
"grad_norm": 1.6870884895324707,
"learning_rate": 8.681275367671287e-06,
"loss": 1.5799,
"step": 1115
},
{
"epoch": 3.6407964242177977,
"grad_norm": 1.8477848768234253,
"learning_rate": 8.488322939089838e-06,
"loss": 1.6147,
"step": 1120
},
{
"epoch": 3.657049979683056,
"grad_norm": 1.7299193143844604,
"learning_rate": 8.297099571808625e-06,
"loss": 1.5244,
"step": 1125
},
{
"epoch": 3.673303535148314,
"grad_norm": 1.8795167207717896,
"learning_rate": 8.10762529025782e-06,
"loss": 1.62,
"step": 1130
},
{
"epoch": 3.6895570906135715,
"grad_norm": 2.0962882041931152,
"learning_rate": 7.9199199357077e-06,
"loss": 1.5168,
"step": 1135
},
{
"epoch": 3.7058106460788296,
"grad_norm": 1.949979305267334,
"learning_rate": 7.734003164190983e-06,
"loss": 1.5378,
"step": 1140
},
{
"epoch": 3.7220642015440877,
"grad_norm": 1.9429444074630737,
"learning_rate": 7.549894444444469e-06,
"loss": 1.5231,
"step": 1145
},
{
"epoch": 3.7383177570093458,
"grad_norm": 2.2284860610961914,
"learning_rate": 7.367613055870301e-06,
"loss": 1.5921,
"step": 1150
},
{
"epoch": 3.754571312474604,
"grad_norm": 1.9204776287078857,
"learning_rate": 7.187178086517116e-06,
"loss": 1.539,
"step": 1155
},
{
"epoch": 3.770824867939862,
"grad_norm": 1.7024307250976562,
"learning_rate": 7.008608431081179e-06,
"loss": 1.58,
"step": 1160
},
{
"epoch": 3.78707842340512,
"grad_norm": 2.035949945449829,
"learning_rate": 6.831922788927744e-06,
"loss": 1.5792,
"step": 1165
},
{
"epoch": 3.8033319788703777,
"grad_norm": 1.947672963142395,
"learning_rate": 6.657139662132961e-06,
"loss": 1.5228,
"step": 1170
},
{
"epoch": 3.819585534335636,
"grad_norm": 1.8621673583984375,
"learning_rate": 6.4842773535463416e-06,
"loss": 1.5181,
"step": 1175
},
{
"epoch": 3.835839089800894,
"grad_norm": 1.968684434890747,
"learning_rate": 6.313353964874155e-06,
"loss": 1.5622,
"step": 1180
},
{
"epoch": 3.852092645266152,
"grad_norm": 1.8336701393127441,
"learning_rate": 6.14438739478383e-06,
"loss": 1.5889,
"step": 1185
},
{
"epoch": 3.86834620073141,
"grad_norm": 2.0393776893615723,
"learning_rate": 5.977395337029701e-06,
"loss": 1.5254,
"step": 1190
},
{
"epoch": 3.884599756196668,
"grad_norm": 1.9161821603775024,
"learning_rate": 5.8123952786001276e-06,
"loss": 1.5271,
"step": 1195
},
{
"epoch": 3.900853311661926,
"grad_norm": 2.1005172729492188,
"learning_rate": 5.6494044978862906e-06,
"loss": 1.553,
"step": 1200
},
{
"epoch": 3.917106867127184,
"grad_norm": 2.095838785171509,
"learning_rate": 5.488440062872891e-06,
"loss": 1.5279,
"step": 1205
},
{
"epoch": 3.9333604225924423,
"grad_norm": 2.2288739681243896,
"learning_rate": 5.329518829350788e-06,
"loss": 1.6242,
"step": 1210
},
{
"epoch": 3.9496139780577,
"grad_norm": 1.9176870584487915,
"learning_rate": 5.172657439151912e-06,
"loss": 1.4934,
"step": 1215
},
{
"epoch": 3.965867533522958,
"grad_norm": 1.945634126663208,
"learning_rate": 5.017872318406594e-06,
"loss": 1.5367,
"step": 1220
},
{
"epoch": 3.982121088988216,
"grad_norm": 1.921211838722229,
"learning_rate": 4.865179675823442e-06,
"loss": 1.512,
"step": 1225
},
{
"epoch": 3.9983746444534742,
"grad_norm": 2.0986344814300537,
"learning_rate": 4.7145955009920195e-06,
"loss": 1.5012,
"step": 1230
},
{
"epoch": 4.014628199918732,
"grad_norm": 2.0708072185516357,
"learning_rate": 4.5661355627084374e-06,
"loss": 1.6165,
"step": 1235
},
{
"epoch": 4.03088175538399,
"grad_norm": 1.9316012859344482,
"learning_rate": 4.419815407324102e-06,
"loss": 1.57,
"step": 1240
},
{
"epoch": 4.0471353108492485,
"grad_norm": 1.8041713237762451,
"learning_rate": 4.275650357117747e-06,
"loss": 1.555,
"step": 1245
},
{
"epoch": 4.063388866314506,
"grad_norm": 2.017209529876709,
"learning_rate": 4.13365550869089e-06,
"loss": 1.567,
"step": 1250
},
{
"epoch": 4.079642421779765,
"grad_norm": 1.708092212677002,
"learning_rate": 3.993845731386991e-06,
"loss": 1.5679,
"step": 1255
},
{
"epoch": 4.095895977245022,
"grad_norm": 1.8569332361221313,
"learning_rate": 3.856235665734359e-06,
"loss": 1.6,
"step": 1260
},
{
"epoch": 4.11214953271028,
"grad_norm": 1.8322447538375854,
"learning_rate": 3.720839721913011e-06,
"loss": 1.568,
"step": 1265
},
{
"epoch": 4.1284030881755385,
"grad_norm": 1.730568289756775,
"learning_rate": 3.587672078245716e-06,
"loss": 1.5105,
"step": 1270
},
{
"epoch": 4.144656643640796,
"grad_norm": 2.135589361190796,
"learning_rate": 3.456746679713238e-06,
"loss": 1.5713,
"step": 1275
},
{
"epoch": 4.160910199106055,
"grad_norm": 1.7022476196289062,
"learning_rate": 3.328077236494087e-06,
"loss": 1.614,
"step": 1280
},
{
"epoch": 4.177163754571312,
"grad_norm": 1.968857765197754,
"learning_rate": 3.2016772225287844e-06,
"loss": 1.5023,
"step": 1285
},
{
"epoch": 4.193417310036571,
"grad_norm": 2.244847536087036,
"learning_rate": 3.0775598741089372e-06,
"loss": 1.5558,
"step": 1290
},
{
"epoch": 4.2096708655018285,
"grad_norm": 1.9722509384155273,
"learning_rate": 2.955738188491167e-06,
"loss": 1.495,
"step": 1295
},
{
"epoch": 4.225924420967086,
"grad_norm": 2.2794189453125,
"learning_rate": 2.836224922536035e-06,
"loss": 1.4822,
"step": 1300
},
{
"epoch": 4.242177976432345,
"grad_norm": 1.8010648488998413,
"learning_rate": 2.7190325913722196e-06,
"loss": 1.5623,
"step": 1305
},
{
"epoch": 4.258431531897602,
"grad_norm": 2.321396589279175,
"learning_rate": 2.6041734670859493e-06,
"loss": 1.4989,
"step": 1310
},
{
"epoch": 4.274685087362861,
"grad_norm": 1.797762393951416,
"learning_rate": 2.4916595774358705e-06,
"loss": 1.6575,
"step": 1315
},
{
"epoch": 4.290938642828118,
"grad_norm": 1.9985977411270142,
"learning_rate": 2.3815027045935774e-06,
"loss": 1.5394,
"step": 1320
},
{
"epoch": 4.307192198293377,
"grad_norm": 1.9368855953216553,
"learning_rate": 2.273714383909789e-06,
"loss": 1.5128,
"step": 1325
},
{
"epoch": 4.323445753758635,
"grad_norm": 1.9567540884017944,
"learning_rate": 2.1683059027063828e-06,
"loss": 1.4683,
"step": 1330
},
{
"epoch": 4.339699309223893,
"grad_norm": 1.9031387567520142,
"learning_rate": 2.0652882990944534e-06,
"loss": 1.5042,
"step": 1335
},
{
"epoch": 4.355952864689151,
"grad_norm": 2.1097919940948486,
"learning_rate": 1.964672360818387e-06,
"loss": 1.5478,
"step": 1340
},
{
"epoch": 4.372206420154408,
"grad_norm": 2.0269947052001953,
"learning_rate": 1.8664686241262358e-06,
"loss": 1.4658,
"step": 1345
},
{
"epoch": 4.388459975619667,
"grad_norm": 1.8135507106781006,
"learning_rate": 1.7706873726663381e-06,
"loss": 1.5633,
"step": 1350
},
{
"epoch": 4.404713531084925,
"grad_norm": 2.005018949508667,
"learning_rate": 1.6773386364104971e-06,
"loss": 1.535,
"step": 1355
},
{
"epoch": 4.420967086550183,
"grad_norm": 1.8301798105239868,
"learning_rate": 1.5864321906036262e-06,
"loss": 1.524,
"step": 1360
},
{
"epoch": 4.437220642015441,
"grad_norm": 2.0094285011291504,
"learning_rate": 1.4979775547401376e-06,
"loss": 1.5157,
"step": 1365
},
{
"epoch": 4.453474197480699,
"grad_norm": 1.9530905485153198,
"learning_rate": 1.4119839915670562e-06,
"loss": 1.5491,
"step": 1370
},
{
"epoch": 4.469727752945957,
"grad_norm": 2.0504391193389893,
"learning_rate": 1.3284605061140764e-06,
"loss": 1.5194,
"step": 1375
},
{
"epoch": 4.485981308411215,
"grad_norm": 2.2431955337524414,
"learning_rate": 1.2474158447505601e-06,
"loss": 1.533,
"step": 1380
},
{
"epoch": 4.502234863876473,
"grad_norm": 1.8665050268173218,
"learning_rate": 1.1688584942696368e-06,
"loss": 1.4757,
"step": 1385
},
{
"epoch": 4.518488419341731,
"grad_norm": 1.950392246246338,
"learning_rate": 1.0927966809995084e-06,
"loss": 1.5358,
"step": 1390
},
{
"epoch": 4.534741974806989,
"grad_norm": 2.056180000305176,
"learning_rate": 1.0192383699419911e-06,
"loss": 1.4977,
"step": 1395
},
{
"epoch": 4.550995530272247,
"grad_norm": 1.7911049127578735,
"learning_rate": 9.481912639384389e-07,
"loss": 1.5788,
"step": 1400
},
{
"epoch": 4.567249085737505,
"grad_norm": 2.2223970890045166,
"learning_rate": 8.796628028631321e-07,
"loss": 1.513,
"step": 1405
},
{
"epoch": 4.583502641202763,
"grad_norm": 1.851468801498413,
"learning_rate": 8.136601628441876e-07,
"loss": 1.6059,
"step": 1410
},
{
"epoch": 4.599756196668022,
"grad_norm": 1.991767406463623,
"learning_rate": 7.501902555120982e-07,
"loss": 1.5495,
"step": 1415
},
{
"epoch": 4.616009752133279,
"grad_norm": 1.99836003780365,
"learning_rate": 6.892597272759482e-07,
"loss": 1.582,
"step": 1420
},
{
"epoch": 4.632263307598537,
"grad_norm": 1.8228671550750732,
"learning_rate": 6.308749586274309e-07,
"loss": 1.587,
"step": 1425
},
{
"epoch": 4.648516863063795,
"grad_norm": 2.050323009490967,
"learning_rate": 5.750420634727082e-07,
"loss": 1.6031,
"step": 1430
},
{
"epoch": 4.664770418529053,
"grad_norm": 1.8385125398635864,
"learning_rate": 5.217668884921506e-07,
"loss": 1.5689,
"step": 1435
},
{
"epoch": 4.681023973994312,
"grad_norm": 1.8687756061553955,
"learning_rate": 4.7105501252811546e-07,
"loss": 1.5067,
"step": 1440
},
{
"epoch": 4.697277529459569,
"grad_norm": 1.8358879089355469,
"learning_rate": 4.229117460007342e-07,
"loss": 1.5941,
"step": 1445
},
{
"epoch": 4.713531084924828,
"grad_norm": 2.0827765464782715,
"learning_rate": 3.7734213035180435e-07,
"loss": 1.593,
"step": 1450
},
{
"epoch": 4.729784640390085,
"grad_norm": 1.8851794004440308,
"learning_rate": 3.343509375168863e-07,
"loss": 1.4914,
"step": 1455
},
{
"epoch": 4.746038195855343,
"grad_norm": 1.9744462966918945,
"learning_rate": 2.939426694255898e-07,
"loss": 1.5901,
"step": 1460
},
{
"epoch": 4.762291751320602,
"grad_norm": 1.7084238529205322,
"learning_rate": 2.561215575301312e-07,
"loss": 1.631,
"step": 1465
},
{
"epoch": 4.778545306785859,
"grad_norm": 1.8658472299575806,
"learning_rate": 2.2089156236224096e-07,
"loss": 1.544,
"step": 1470
},
{
"epoch": 4.794798862251118,
"grad_norm": 1.8390461206436157,
"learning_rate": 1.8825637311841726e-07,
"loss": 1.5495,
"step": 1475
},
{
"epoch": 4.811052417716375,
"grad_norm": 2.16766619682312,
"learning_rate": 1.5821940727361874e-07,
"loss": 1.5227,
"step": 1480
},
{
"epoch": 4.827305973181634,
"grad_norm": 2.022632122039795,
"learning_rate": 1.3078381022336715e-07,
"loss": 1.5021,
"step": 1485
},
{
"epoch": 4.8435595286468915,
"grad_norm": 1.9457296133041382,
"learning_rate": 1.0595245495439999e-07,
"loss": 1.4985,
"step": 1490
},
{
"epoch": 4.859813084112149,
"grad_norm": 2.0490968227386475,
"learning_rate": 8.372794174379417e-08,
"loss": 1.5625,
"step": 1495
},
{
"epoch": 4.876066639577408,
"grad_norm": 2.0088553428649902,
"learning_rate": 6.411259788668966e-08,
"loss": 1.5457,
"step": 1500
},
{
"epoch": 4.892320195042665,
"grad_norm": 2.013977289199829,
"learning_rate": 4.7108477452562084e-08,
"loss": 1.5011,
"step": 1505
},
{
"epoch": 4.908573750507924,
"grad_norm": 1.8730751276016235,
"learning_rate": 3.271736107015033e-08,
"loss": 1.5281,
"step": 1510
},
{
"epoch": 4.9248273059731815,
"grad_norm": 1.82501220703125,
"learning_rate": 2.0940755740969652e-08,
"loss": 1.5072,
"step": 1515
},
{
"epoch": 4.94108086143844,
"grad_norm": 1.6899996995925903,
"learning_rate": 1.1779894681515635e-08,
"loss": 1.5403,
"step": 1520
},
{
"epoch": 4.957334416903698,
"grad_norm": 1.7991502285003662,
"learning_rate": 5.235737194120294e-09,
"loss": 1.5469,
"step": 1525
},
{
"epoch": 4.973587972368955,
"grad_norm": 1.9792165756225586,
"learning_rate": 1.3089685665046426e-09,
"loss": 1.4977,
"step": 1530
},
{
"epoch": 4.989841527834214,
"grad_norm": 1.802415370941162,
"learning_rate": 0.0,
"loss": 1.5761,
"step": 1535
},
{
"epoch": 4.989841527834214,
"step": 1535,
"total_flos": 7.916227959595991e+17,
"train_loss": 1.6256803264058763,
"train_runtime": 27276.7889,
"train_samples_per_second": 0.902,
"train_steps_per_second": 0.056
}
],
"logging_steps": 5,
"max_steps": 1535,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.916227959595991e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}