zephyr-7b-sft-full / trainer_state.json
RikkiXu's picture
Training in progress, step 500
9488186 verified
raw
history blame
99.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2825,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017699115044247787,
"grad_norm": 5.395828211145053,
"learning_rate": 7.06713780918728e-08,
"loss": 0.8769,
"step": 1
},
{
"epoch": 0.008849557522123894,
"grad_norm": 5.078223153245108,
"learning_rate": 3.53356890459364e-07,
"loss": 0.8957,
"step": 5
},
{
"epoch": 0.017699115044247787,
"grad_norm": 4.3462255239787675,
"learning_rate": 7.06713780918728e-07,
"loss": 0.8679,
"step": 10
},
{
"epoch": 0.02654867256637168,
"grad_norm": 2.7342805368137206,
"learning_rate": 1.060070671378092e-06,
"loss": 0.849,
"step": 15
},
{
"epoch": 0.035398230088495575,
"grad_norm": 2.245119548796511,
"learning_rate": 1.413427561837456e-06,
"loss": 0.8632,
"step": 20
},
{
"epoch": 0.04424778761061947,
"grad_norm": 1.9300825255667216,
"learning_rate": 1.76678445229682e-06,
"loss": 0.8287,
"step": 25
},
{
"epoch": 0.05309734513274336,
"grad_norm": 2.0548676134373025,
"learning_rate": 2.120141342756184e-06,
"loss": 0.8216,
"step": 30
},
{
"epoch": 0.061946902654867256,
"grad_norm": 2.107520072503226,
"learning_rate": 2.473498233215548e-06,
"loss": 0.8336,
"step": 35
},
{
"epoch": 0.07079646017699115,
"grad_norm": 2.012421705138888,
"learning_rate": 2.826855123674912e-06,
"loss": 0.8106,
"step": 40
},
{
"epoch": 0.07964601769911504,
"grad_norm": 2.0477120819165138,
"learning_rate": 3.1802120141342757e-06,
"loss": 0.8035,
"step": 45
},
{
"epoch": 0.08849557522123894,
"grad_norm": 1.94045311291632,
"learning_rate": 3.53356890459364e-06,
"loss": 0.8212,
"step": 50
},
{
"epoch": 0.09734513274336283,
"grad_norm": 1.9781807437344778,
"learning_rate": 3.886925795053004e-06,
"loss": 0.8167,
"step": 55
},
{
"epoch": 0.10619469026548672,
"grad_norm": 2.1588067674308955,
"learning_rate": 4.240282685512368e-06,
"loss": 0.8159,
"step": 60
},
{
"epoch": 0.11504424778761062,
"grad_norm": 2.0574710215669936,
"learning_rate": 4.593639575971732e-06,
"loss": 0.7809,
"step": 65
},
{
"epoch": 0.12389380530973451,
"grad_norm": 2.0440859301429692,
"learning_rate": 4.946996466431096e-06,
"loss": 0.7735,
"step": 70
},
{
"epoch": 0.13274336283185842,
"grad_norm": 2.1450366358677857,
"learning_rate": 5.300353356890459e-06,
"loss": 0.775,
"step": 75
},
{
"epoch": 0.1415929203539823,
"grad_norm": 2.0239595981615444,
"learning_rate": 5.653710247349824e-06,
"loss": 0.7739,
"step": 80
},
{
"epoch": 0.1504424778761062,
"grad_norm": 2.248989868043356,
"learning_rate": 6.0070671378091885e-06,
"loss": 0.7629,
"step": 85
},
{
"epoch": 0.1592920353982301,
"grad_norm": 2.113620356465933,
"learning_rate": 6.360424028268551e-06,
"loss": 0.7617,
"step": 90
},
{
"epoch": 0.168141592920354,
"grad_norm": 2.1063718938021987,
"learning_rate": 6.713780918727916e-06,
"loss": 0.7741,
"step": 95
},
{
"epoch": 0.17699115044247787,
"grad_norm": 2.0994149998808074,
"learning_rate": 7.06713780918728e-06,
"loss": 0.7657,
"step": 100
},
{
"epoch": 0.18584070796460178,
"grad_norm": 2.0741285650614785,
"learning_rate": 7.420494699646644e-06,
"loss": 0.7564,
"step": 105
},
{
"epoch": 0.19469026548672566,
"grad_norm": 1.8812346063525685,
"learning_rate": 7.773851590106007e-06,
"loss": 0.764,
"step": 110
},
{
"epoch": 0.20353982300884957,
"grad_norm": 1.6875596687855905,
"learning_rate": 8.127208480565372e-06,
"loss": 0.7395,
"step": 115
},
{
"epoch": 0.21238938053097345,
"grad_norm": 1.7946446429435303,
"learning_rate": 8.480565371024736e-06,
"loss": 0.7507,
"step": 120
},
{
"epoch": 0.22123893805309736,
"grad_norm": 1.5175436501935706,
"learning_rate": 8.8339222614841e-06,
"loss": 0.7197,
"step": 125
},
{
"epoch": 0.23008849557522124,
"grad_norm": 1.5846428467972697,
"learning_rate": 9.187279151943464e-06,
"loss": 0.7422,
"step": 130
},
{
"epoch": 0.23893805309734514,
"grad_norm": 1.9811600560411595,
"learning_rate": 9.540636042402828e-06,
"loss": 0.7305,
"step": 135
},
{
"epoch": 0.24778761061946902,
"grad_norm": 2.0603096951143822,
"learning_rate": 9.893992932862191e-06,
"loss": 0.7266,
"step": 140
},
{
"epoch": 0.25663716814159293,
"grad_norm": 1.79227194384631,
"learning_rate": 1.0247349823321556e-05,
"loss": 0.7273,
"step": 145
},
{
"epoch": 0.26548672566371684,
"grad_norm": 1.935925432369111,
"learning_rate": 1.0600706713780919e-05,
"loss": 0.7337,
"step": 150
},
{
"epoch": 0.2743362831858407,
"grad_norm": 1.6735245424795926,
"learning_rate": 1.0954063604240283e-05,
"loss": 0.726,
"step": 155
},
{
"epoch": 0.2831858407079646,
"grad_norm": 1.6786739654407825,
"learning_rate": 1.1307420494699648e-05,
"loss": 0.7442,
"step": 160
},
{
"epoch": 0.2920353982300885,
"grad_norm": 1.4647520145743216,
"learning_rate": 1.1660777385159012e-05,
"loss": 0.7104,
"step": 165
},
{
"epoch": 0.3008849557522124,
"grad_norm": 1.6975078748459098,
"learning_rate": 1.2014134275618377e-05,
"loss": 0.7202,
"step": 170
},
{
"epoch": 0.30973451327433627,
"grad_norm": 2.1939246563522925,
"learning_rate": 1.2367491166077738e-05,
"loss": 0.7135,
"step": 175
},
{
"epoch": 0.3185840707964602,
"grad_norm": 1.7970007273551845,
"learning_rate": 1.2720848056537103e-05,
"loss": 0.7404,
"step": 180
},
{
"epoch": 0.3274336283185841,
"grad_norm": 2.2094978373729,
"learning_rate": 1.3074204946996467e-05,
"loss": 0.7395,
"step": 185
},
{
"epoch": 0.336283185840708,
"grad_norm": 1.6148149490160606,
"learning_rate": 1.3427561837455832e-05,
"loss": 0.7236,
"step": 190
},
{
"epoch": 0.34513274336283184,
"grad_norm": 1.5716018292237066,
"learning_rate": 1.3780918727915195e-05,
"loss": 0.7176,
"step": 195
},
{
"epoch": 0.35398230088495575,
"grad_norm": 1.56394291086092,
"learning_rate": 1.413427561837456e-05,
"loss": 0.7077,
"step": 200
},
{
"epoch": 0.36283185840707965,
"grad_norm": 1.5944555462298586,
"learning_rate": 1.4487632508833924e-05,
"loss": 0.732,
"step": 205
},
{
"epoch": 0.37168141592920356,
"grad_norm": 3.0644131086145716,
"learning_rate": 1.4840989399293289e-05,
"loss": 0.7226,
"step": 210
},
{
"epoch": 0.3805309734513274,
"grad_norm": 48.89141497644312,
"learning_rate": 1.519434628975265e-05,
"loss": 0.9165,
"step": 215
},
{
"epoch": 0.3893805309734513,
"grad_norm": 63.82971563129597,
"learning_rate": 1.5547703180212014e-05,
"loss": 2.8435,
"step": 220
},
{
"epoch": 0.39823008849557523,
"grad_norm": 63.166528581176436,
"learning_rate": 1.590106007067138e-05,
"loss": 1.8489,
"step": 225
},
{
"epoch": 0.40707964601769914,
"grad_norm": 11.532232434183182,
"learning_rate": 1.6254416961130744e-05,
"loss": 1.1145,
"step": 230
},
{
"epoch": 0.415929203539823,
"grad_norm": 8.320428887121134,
"learning_rate": 1.6607773851590106e-05,
"loss": 1.0432,
"step": 235
},
{
"epoch": 0.4247787610619469,
"grad_norm": 5.49695556302703,
"learning_rate": 1.6961130742049473e-05,
"loss": 0.9249,
"step": 240
},
{
"epoch": 0.4336283185840708,
"grad_norm": 3.8086780643307954,
"learning_rate": 1.7314487632508836e-05,
"loss": 0.901,
"step": 245
},
{
"epoch": 0.4424778761061947,
"grad_norm": 3.0530888165680254,
"learning_rate": 1.76678445229682e-05,
"loss": 0.8318,
"step": 250
},
{
"epoch": 0.45132743362831856,
"grad_norm": 2.051374080224868,
"learning_rate": 1.802120141342756e-05,
"loss": 0.8123,
"step": 255
},
{
"epoch": 0.46017699115044247,
"grad_norm": 1.9860592805013697,
"learning_rate": 1.8374558303886928e-05,
"loss": 0.8145,
"step": 260
},
{
"epoch": 0.4690265486725664,
"grad_norm": 1.9585131132246008,
"learning_rate": 1.872791519434629e-05,
"loss": 0.8015,
"step": 265
},
{
"epoch": 0.4778761061946903,
"grad_norm": 2.0807728265740937,
"learning_rate": 1.9081272084805657e-05,
"loss": 0.7572,
"step": 270
},
{
"epoch": 0.48672566371681414,
"grad_norm": 8.726925488702674,
"learning_rate": 1.943462897526502e-05,
"loss": 0.7968,
"step": 275
},
{
"epoch": 0.49557522123893805,
"grad_norm": 3.0506762781221926,
"learning_rate": 1.9787985865724383e-05,
"loss": 0.7859,
"step": 280
},
{
"epoch": 0.504424778761062,
"grad_norm": 20.477250938697214,
"learning_rate": 1.999996945230629e-05,
"loss": 0.7726,
"step": 285
},
{
"epoch": 0.5132743362831859,
"grad_norm": 18.602610432760407,
"learning_rate": 1.9999625792895357e-05,
"loss": 0.7615,
"step": 290
},
{
"epoch": 0.5221238938053098,
"grad_norm": 1.9360445735897074,
"learning_rate": 1.9998900302622567e-05,
"loss": 0.7346,
"step": 295
},
{
"epoch": 0.5309734513274337,
"grad_norm": 1.9907733223798811,
"learning_rate": 1.9997793009190403e-05,
"loss": 0.7257,
"step": 300
},
{
"epoch": 0.5398230088495575,
"grad_norm": 2.649430391779402,
"learning_rate": 1.999630395488034e-05,
"loss": 0.7398,
"step": 305
},
{
"epoch": 0.5486725663716814,
"grad_norm": 2.295494291640341,
"learning_rate": 1.9994433196551183e-05,
"loss": 0.7404,
"step": 310
},
{
"epoch": 0.5575221238938053,
"grad_norm": 1.6045394414529988,
"learning_rate": 1.9992180805636936e-05,
"loss": 0.7157,
"step": 315
},
{
"epoch": 0.5663716814159292,
"grad_norm": 1.7545617453249185,
"learning_rate": 1.998954686814406e-05,
"loss": 0.722,
"step": 320
},
{
"epoch": 0.5752212389380531,
"grad_norm": 2.20743008915746,
"learning_rate": 1.998653148464817e-05,
"loss": 0.7133,
"step": 325
},
{
"epoch": 0.584070796460177,
"grad_norm": 1.8768620308499198,
"learning_rate": 1.9983134770290232e-05,
"loss": 0.7247,
"step": 330
},
{
"epoch": 0.5929203539823009,
"grad_norm": 2.4610134324640396,
"learning_rate": 1.9979356854772128e-05,
"loss": 0.6939,
"step": 335
},
{
"epoch": 0.6017699115044248,
"grad_norm": 1.5013573865061423,
"learning_rate": 1.997519788235174e-05,
"loss": 0.7184,
"step": 340
},
{
"epoch": 0.6106194690265486,
"grad_norm": 1.607524878532932,
"learning_rate": 1.9970658011837404e-05,
"loss": 0.7206,
"step": 345
},
{
"epoch": 0.6194690265486725,
"grad_norm": 1.57447888430762,
"learning_rate": 1.996573741658188e-05,
"loss": 0.7082,
"step": 350
},
{
"epoch": 0.6283185840707964,
"grad_norm": 1.5581830775430778,
"learning_rate": 1.9960436284475712e-05,
"loss": 0.6727,
"step": 355
},
{
"epoch": 0.6371681415929203,
"grad_norm": 1.5066025396716325,
"learning_rate": 1.9954754817940054e-05,
"loss": 0.708,
"step": 360
},
{
"epoch": 0.6460176991150443,
"grad_norm": 1.4817421328262548,
"learning_rate": 1.994869323391895e-05,
"loss": 0.6863,
"step": 365
},
{
"epoch": 0.6548672566371682,
"grad_norm": 1.4055734979132297,
"learning_rate": 1.9942251763871056e-05,
"loss": 0.7108,
"step": 370
},
{
"epoch": 0.6637168141592921,
"grad_norm": 1.391086495332054,
"learning_rate": 1.9935430653760772e-05,
"loss": 0.6608,
"step": 375
},
{
"epoch": 0.672566371681416,
"grad_norm": 1.5585496911243557,
"learning_rate": 1.9928230164048885e-05,
"loss": 0.6968,
"step": 380
},
{
"epoch": 0.6814159292035398,
"grad_norm": 1.5660850244776638,
"learning_rate": 1.99206505696826e-05,
"loss": 0.6892,
"step": 385
},
{
"epoch": 0.6902654867256637,
"grad_norm": 1.6030826428695462,
"learning_rate": 1.9912692160085054e-05,
"loss": 0.707,
"step": 390
},
{
"epoch": 0.6991150442477876,
"grad_norm": 1.4696166964471262,
"learning_rate": 1.990435523914426e-05,
"loss": 0.6773,
"step": 395
},
{
"epoch": 0.7079646017699115,
"grad_norm": 1.4674837525767894,
"learning_rate": 1.9895640125201498e-05,
"loss": 0.7007,
"step": 400
},
{
"epoch": 0.7168141592920354,
"grad_norm": 1.6349741965597524,
"learning_rate": 1.988654715103917e-05,
"loss": 0.6884,
"step": 405
},
{
"epoch": 0.7256637168141593,
"grad_norm": 1.6317810785917048,
"learning_rate": 1.9877076663868084e-05,
"loss": 0.6761,
"step": 410
},
{
"epoch": 0.7345132743362832,
"grad_norm": 1.7519745537508327,
"learning_rate": 1.9867229025314204e-05,
"loss": 0.6843,
"step": 415
},
{
"epoch": 0.7433628318584071,
"grad_norm": 1.707801199033859,
"learning_rate": 1.9857004611404825e-05,
"loss": 0.6735,
"step": 420
},
{
"epoch": 0.7522123893805309,
"grad_norm": 1.687330084394135,
"learning_rate": 1.984640381255424e-05,
"loss": 0.6661,
"step": 425
},
{
"epoch": 0.7610619469026548,
"grad_norm": 1.7043833380846305,
"learning_rate": 1.9835427033548807e-05,
"loss": 0.6794,
"step": 430
},
{
"epoch": 0.7699115044247787,
"grad_norm": 1.5675248939760222,
"learning_rate": 1.982407469353152e-05,
"loss": 0.6864,
"step": 435
},
{
"epoch": 0.7787610619469026,
"grad_norm": 1.4561869480484766,
"learning_rate": 1.9812347225985966e-05,
"loss": 0.657,
"step": 440
},
{
"epoch": 0.7876106194690266,
"grad_norm": 1.3651968406362616,
"learning_rate": 1.9800245078719814e-05,
"loss": 0.6725,
"step": 445
},
{
"epoch": 0.7964601769911505,
"grad_norm": 1.7068501163645904,
"learning_rate": 1.9787768713847685e-05,
"loss": 0.6907,
"step": 450
},
{
"epoch": 0.8053097345132744,
"grad_norm": 1.8186666681748165,
"learning_rate": 1.9774918607773524e-05,
"loss": 0.6666,
"step": 455
},
{
"epoch": 0.8141592920353983,
"grad_norm": 52.762622436160925,
"learning_rate": 1.9761695251172398e-05,
"loss": 0.6903,
"step": 460
},
{
"epoch": 0.8230088495575221,
"grad_norm": 2.785296160404902,
"learning_rate": 1.9748099148971766e-05,
"loss": 0.682,
"step": 465
},
{
"epoch": 0.831858407079646,
"grad_norm": 1.8158247940222614,
"learning_rate": 1.97341308203322e-05,
"loss": 0.6654,
"step": 470
},
{
"epoch": 0.8407079646017699,
"grad_norm": 3.2136675982919427,
"learning_rate": 1.9719790798627555e-05,
"loss": 0.6875,
"step": 475
},
{
"epoch": 0.8495575221238938,
"grad_norm": 1.9905381473001953,
"learning_rate": 1.9705079631424605e-05,
"loss": 0.6785,
"step": 480
},
{
"epoch": 0.8584070796460177,
"grad_norm": 2.788997906081037,
"learning_rate": 1.9689997880462134e-05,
"loss": 0.6614,
"step": 485
},
{
"epoch": 0.8672566371681416,
"grad_norm": 1.947320603121196,
"learning_rate": 1.9674546121629495e-05,
"loss": 0.6612,
"step": 490
},
{
"epoch": 0.8761061946902655,
"grad_norm": 16.64653233306791,
"learning_rate": 1.9658724944944597e-05,
"loss": 0.6755,
"step": 495
},
{
"epoch": 0.8849557522123894,
"grad_norm": 2.445435643330624,
"learning_rate": 1.964253495453141e-05,
"loss": 0.6489,
"step": 500
},
{
"epoch": 0.8938053097345132,
"grad_norm": 1.8384904388034906,
"learning_rate": 1.9625976768596862e-05,
"loss": 0.6832,
"step": 505
},
{
"epoch": 0.9026548672566371,
"grad_norm": 1.7969557638918616,
"learning_rate": 1.9609051019407254e-05,
"loss": 0.6624,
"step": 510
},
{
"epoch": 0.911504424778761,
"grad_norm": 1.7456317363852023,
"learning_rate": 1.9591758353264106e-05,
"loss": 0.6573,
"step": 515
},
{
"epoch": 0.9203539823008849,
"grad_norm": 1.4813878883475884,
"learning_rate": 1.9574099430479498e-05,
"loss": 0.659,
"step": 520
},
{
"epoch": 0.9292035398230089,
"grad_norm": 4.494324308598284,
"learning_rate": 1.9556074925350826e-05,
"loss": 0.6811,
"step": 525
},
{
"epoch": 0.9380530973451328,
"grad_norm": 7.919549938124173,
"learning_rate": 1.9537685526135088e-05,
"loss": 0.6812,
"step": 530
},
{
"epoch": 0.9469026548672567,
"grad_norm": 15.115841458620135,
"learning_rate": 1.951893193502256e-05,
"loss": 0.6774,
"step": 535
},
{
"epoch": 0.9557522123893806,
"grad_norm": 5.495425735869417,
"learning_rate": 1.9499814868110035e-05,
"loss": 0.6889,
"step": 540
},
{
"epoch": 0.9646017699115044,
"grad_norm": 12.34480108912751,
"learning_rate": 1.9480335055373444e-05,
"loss": 0.689,
"step": 545
},
{
"epoch": 0.9734513274336283,
"grad_norm": 15.091035487093771,
"learning_rate": 1.9460493240639985e-05,
"loss": 0.6907,
"step": 550
},
{
"epoch": 0.9823008849557522,
"grad_norm": 2.6410011813854815,
"learning_rate": 1.9440290181559737e-05,
"loss": 0.6728,
"step": 555
},
{
"epoch": 0.9911504424778761,
"grad_norm": 3.807048913327981,
"learning_rate": 1.9419726649576707e-05,
"loss": 0.6699,
"step": 560
},
{
"epoch": 1.0,
"grad_norm": 11.71551051866065,
"learning_rate": 1.93988034298994e-05,
"loss": 0.6823,
"step": 565
},
{
"epoch": 1.0,
"eval_loss": 0.6282660961151123,
"eval_runtime": 346.0891,
"eval_samples_per_second": 21.731,
"eval_steps_per_second": 0.341,
"step": 565
},
{
"epoch": 1.008849557522124,
"grad_norm": 5.445947511608255,
"learning_rate": 1.9377521321470806e-05,
"loss": 0.5764,
"step": 570
},
{
"epoch": 1.0176991150442478,
"grad_norm": 5.441443896405208,
"learning_rate": 1.935588113693792e-05,
"loss": 0.5922,
"step": 575
},
{
"epoch": 1.0265486725663717,
"grad_norm": 4.764716514583245,
"learning_rate": 1.9333883702620692e-05,
"loss": 0.5688,
"step": 580
},
{
"epoch": 1.0353982300884956,
"grad_norm": 6.234212565235311,
"learning_rate": 1.9311529858480488e-05,
"loss": 0.5627,
"step": 585
},
{
"epoch": 1.0442477876106195,
"grad_norm": 3.275772180265634,
"learning_rate": 1.9288820458088004e-05,
"loss": 0.5405,
"step": 590
},
{
"epoch": 1.0530973451327434,
"grad_norm": 2.4494316038791752,
"learning_rate": 1.926575636859068e-05,
"loss": 0.5353,
"step": 595
},
{
"epoch": 1.0619469026548674,
"grad_norm": 1.964088050973173,
"learning_rate": 1.924233847067959e-05,
"loss": 0.5409,
"step": 600
},
{
"epoch": 1.0707964601769913,
"grad_norm": 1.7637636165900412,
"learning_rate": 1.9218567658555813e-05,
"loss": 0.5477,
"step": 605
},
{
"epoch": 1.079646017699115,
"grad_norm": 2.3214954304344433,
"learning_rate": 1.919444483989628e-05,
"loss": 0.5519,
"step": 610
},
{
"epoch": 1.0884955752212389,
"grad_norm": 2.0536428402479134,
"learning_rate": 1.9169970935819123e-05,
"loss": 0.5218,
"step": 615
},
{
"epoch": 1.0973451327433628,
"grad_norm": 1.670923852239301,
"learning_rate": 1.9145146880848505e-05,
"loss": 0.531,
"step": 620
},
{
"epoch": 1.1061946902654867,
"grad_norm": 1.9728407984682095,
"learning_rate": 1.9119973622878928e-05,
"loss": 0.5402,
"step": 625
},
{
"epoch": 1.1150442477876106,
"grad_norm": 8.142996728944224,
"learning_rate": 1.9094452123139034e-05,
"loss": 0.5656,
"step": 630
},
{
"epoch": 1.1238938053097345,
"grad_norm": 1.890805938276718,
"learning_rate": 1.9068583356154917e-05,
"loss": 0.539,
"step": 635
},
{
"epoch": 1.1327433628318584,
"grad_norm": 2.1078644912267968,
"learning_rate": 1.9042368309712906e-05,
"loss": 0.5461,
"step": 640
},
{
"epoch": 1.1415929203539823,
"grad_norm": 1.8971067802854567,
"learning_rate": 1.9015807984821827e-05,
"loss": 0.5494,
"step": 645
},
{
"epoch": 1.1504424778761062,
"grad_norm": 27.881164380475635,
"learning_rate": 1.8988903395674814e-05,
"loss": 0.535,
"step": 650
},
{
"epoch": 1.1592920353982301,
"grad_norm": 3.4980726529367576,
"learning_rate": 1.8961655569610557e-05,
"loss": 0.531,
"step": 655
},
{
"epoch": 1.168141592920354,
"grad_norm": 2.2255039588879066,
"learning_rate": 1.8934065547074077e-05,
"loss": 0.5369,
"step": 660
},
{
"epoch": 1.176991150442478,
"grad_norm": 1.7665125865767044,
"learning_rate": 1.8906134381577008e-05,
"loss": 0.5231,
"step": 665
},
{
"epoch": 1.1858407079646018,
"grad_norm": 1.7187220901255802,
"learning_rate": 1.887786313965736e-05,
"loss": 0.5205,
"step": 670
},
{
"epoch": 1.1946902654867257,
"grad_norm": 1.793384142575199,
"learning_rate": 1.8849252900838795e-05,
"loss": 0.5307,
"step": 675
},
{
"epoch": 1.2035398230088497,
"grad_norm": 1.789651406311183,
"learning_rate": 1.8820304757589406e-05,
"loss": 0.5259,
"step": 680
},
{
"epoch": 1.2123893805309733,
"grad_norm": 1.9237494570492129,
"learning_rate": 1.8791019815280015e-05,
"loss": 0.5262,
"step": 685
},
{
"epoch": 1.2212389380530975,
"grad_norm": 1.6656226502649325,
"learning_rate": 1.8761399192141933e-05,
"loss": 0.5681,
"step": 690
},
{
"epoch": 1.2300884955752212,
"grad_norm": 1.8325987162210478,
"learning_rate": 1.8731444019224296e-05,
"loss": 0.5373,
"step": 695
},
{
"epoch": 1.238938053097345,
"grad_norm": 1.772645490498923,
"learning_rate": 1.8701155440350854e-05,
"loss": 0.5274,
"step": 700
},
{
"epoch": 1.247787610619469,
"grad_norm": 1.7076814876614839,
"learning_rate": 1.8670534612076304e-05,
"loss": 0.5345,
"step": 705
},
{
"epoch": 1.2566371681415929,
"grad_norm": 2.8366257516212925,
"learning_rate": 1.863958270364213e-05,
"loss": 0.5448,
"step": 710
},
{
"epoch": 1.2654867256637168,
"grad_norm": 1.6438063307669566,
"learning_rate": 1.8608300896931935e-05,
"loss": 0.5345,
"step": 715
},
{
"epoch": 1.2743362831858407,
"grad_norm": 7.910992161903767,
"learning_rate": 1.857669038642635e-05,
"loss": 0.5771,
"step": 720
},
{
"epoch": 1.2831858407079646,
"grad_norm": 5.320203779573794,
"learning_rate": 1.8544752379157383e-05,
"loss": 0.5889,
"step": 725
},
{
"epoch": 1.2920353982300885,
"grad_norm": 4.200987040185538,
"learning_rate": 1.851248809466236e-05,
"loss": 0.5572,
"step": 730
},
{
"epoch": 1.3008849557522124,
"grad_norm": 3.6844001075774564,
"learning_rate": 1.847989876493733e-05,
"loss": 0.5729,
"step": 735
},
{
"epoch": 1.3097345132743363,
"grad_norm": 2.6959513168479003,
"learning_rate": 1.8446985634390056e-05,
"loss": 0.5438,
"step": 740
},
{
"epoch": 1.3185840707964602,
"grad_norm": 2.0110192321282074,
"learning_rate": 1.841374995979246e-05,
"loss": 0.5346,
"step": 745
},
{
"epoch": 1.3274336283185841,
"grad_norm": 2.4149221674637142,
"learning_rate": 1.8380193010232664e-05,
"loss": 0.5443,
"step": 750
},
{
"epoch": 1.336283185840708,
"grad_norm": 3.216072321253876,
"learning_rate": 1.834631606706651e-05,
"loss": 0.5388,
"step": 755
},
{
"epoch": 1.3451327433628317,
"grad_norm": 1.7562091306971943,
"learning_rate": 1.831212042386865e-05,
"loss": 0.5332,
"step": 760
},
{
"epoch": 1.3539823008849559,
"grad_norm": 1.7681274800133804,
"learning_rate": 1.8277607386383134e-05,
"loss": 0.5531,
"step": 765
},
{
"epoch": 1.3628318584070795,
"grad_norm": 1.6184444020633955,
"learning_rate": 1.8242778272473566e-05,
"loss": 0.5288,
"step": 770
},
{
"epoch": 1.3716814159292037,
"grad_norm": 1.8584096368775243,
"learning_rate": 1.8207634412072765e-05,
"loss": 0.5134,
"step": 775
},
{
"epoch": 1.3805309734513274,
"grad_norm": 2.3984599706556504,
"learning_rate": 1.8172177147132e-05,
"loss": 0.5293,
"step": 780
},
{
"epoch": 1.3893805309734513,
"grad_norm": 1.6184621514028006,
"learning_rate": 1.8136407831569748e-05,
"loss": 0.5332,
"step": 785
},
{
"epoch": 1.3982300884955752,
"grad_norm": 1.659497791050273,
"learning_rate": 1.8100327831219968e-05,
"loss": 0.5499,
"step": 790
},
{
"epoch": 1.407079646017699,
"grad_norm": 2.080085747337152,
"learning_rate": 1.806393852377998e-05,
"loss": 0.5373,
"step": 795
},
{
"epoch": 1.415929203539823,
"grad_norm": 1.7412321009147458,
"learning_rate": 1.802724129875784e-05,
"loss": 0.5237,
"step": 800
},
{
"epoch": 1.424778761061947,
"grad_norm": 1.5117367250487712,
"learning_rate": 1.7990237557419298e-05,
"loss": 0.5212,
"step": 805
},
{
"epoch": 1.4336283185840708,
"grad_norm": 1.5677828815765256,
"learning_rate": 1.7952928712734266e-05,
"loss": 0.5293,
"step": 810
},
{
"epoch": 1.4424778761061947,
"grad_norm": 1.6191863845989973,
"learning_rate": 1.791531618932289e-05,
"loss": 0.5108,
"step": 815
},
{
"epoch": 1.4513274336283186,
"grad_norm": 1.6356183497375685,
"learning_rate": 1.7877401423401134e-05,
"loss": 0.535,
"step": 820
},
{
"epoch": 1.4601769911504425,
"grad_norm": 2.120698903964094,
"learning_rate": 1.7839185862725953e-05,
"loss": 0.5276,
"step": 825
},
{
"epoch": 1.4690265486725664,
"grad_norm": 1.7162725245546222,
"learning_rate": 1.7800670966539997e-05,
"loss": 0.5157,
"step": 830
},
{
"epoch": 1.4778761061946903,
"grad_norm": 1.8342947704435748,
"learning_rate": 1.7761858205515904e-05,
"loss": 0.503,
"step": 835
},
{
"epoch": 1.4867256637168142,
"grad_norm": 1.6080172248542548,
"learning_rate": 1.7722749061700122e-05,
"loss": 0.5164,
"step": 840
},
{
"epoch": 1.495575221238938,
"grad_norm": 1.7310863566366472,
"learning_rate": 1.7683345028456357e-05,
"loss": 0.5144,
"step": 845
},
{
"epoch": 1.504424778761062,
"grad_norm": 1.5259860237803888,
"learning_rate": 1.7643647610408507e-05,
"loss": 0.5144,
"step": 850
},
{
"epoch": 1.5132743362831858,
"grad_norm": 1.949693136998924,
"learning_rate": 1.760365832338322e-05,
"loss": 0.5208,
"step": 855
},
{
"epoch": 1.5221238938053099,
"grad_norm": 1.4606753283332923,
"learning_rate": 1.7563378694352038e-05,
"loss": 0.514,
"step": 860
},
{
"epoch": 1.5309734513274336,
"grad_norm": 1.500515219738256,
"learning_rate": 1.752281026137306e-05,
"loss": 0.5105,
"step": 865
},
{
"epoch": 1.5398230088495575,
"grad_norm": 1.6809093610034818,
"learning_rate": 1.7481954573532233e-05,
"loss": 0.5246,
"step": 870
},
{
"epoch": 1.5486725663716814,
"grad_norm": 1.8505913851080076,
"learning_rate": 1.7440813190884177e-05,
"loss": 0.5263,
"step": 875
},
{
"epoch": 1.5575221238938053,
"grad_norm": 1.5042921112971175,
"learning_rate": 1.7399387684392643e-05,
"loss": 0.5078,
"step": 880
},
{
"epoch": 1.5663716814159292,
"grad_norm": 1.7603274810661258,
"learning_rate": 1.7357679635870504e-05,
"loss": 0.5152,
"step": 885
},
{
"epoch": 1.575221238938053,
"grad_norm": 1.9907861198097643,
"learning_rate": 1.731569063791937e-05,
"loss": 0.517,
"step": 890
},
{
"epoch": 1.584070796460177,
"grad_norm": 2.287444737461443,
"learning_rate": 1.727342229386877e-05,
"loss": 0.5118,
"step": 895
},
{
"epoch": 1.592920353982301,
"grad_norm": 1.7114791811335306,
"learning_rate": 1.723087621771492e-05,
"loss": 0.512,
"step": 900
},
{
"epoch": 1.6017699115044248,
"grad_norm": 1.6387744190074889,
"learning_rate": 1.718805403405911e-05,
"loss": 0.5151,
"step": 905
},
{
"epoch": 1.6106194690265485,
"grad_norm": 1.9126014391813266,
"learning_rate": 1.7144957378045656e-05,
"loss": 0.5072,
"step": 910
},
{
"epoch": 1.6194690265486726,
"grad_norm": 1.5534492075843847,
"learning_rate": 1.7101587895299463e-05,
"loss": 0.5139,
"step": 915
},
{
"epoch": 1.6283185840707963,
"grad_norm": 3.324022735746321,
"learning_rate": 1.7057947241863207e-05,
"loss": 0.486,
"step": 920
},
{
"epoch": 1.6371681415929205,
"grad_norm": 2.5161147424579413,
"learning_rate": 1.7014037084134076e-05,
"loss": 0.5127,
"step": 925
},
{
"epoch": 1.6460176991150441,
"grad_norm": 2.6183686325273814,
"learning_rate": 1.696985909880015e-05,
"loss": 0.5103,
"step": 930
},
{
"epoch": 1.6548672566371683,
"grad_norm": 1.8311730816272584,
"learning_rate": 1.692541497277637e-05,
"loss": 0.51,
"step": 935
},
{
"epoch": 1.663716814159292,
"grad_norm": 2.077861904241967,
"learning_rate": 1.6880706403140146e-05,
"loss": 0.5082,
"step": 940
},
{
"epoch": 1.672566371681416,
"grad_norm": 2.0643493814936282,
"learning_rate": 1.6835735097066524e-05,
"loss": 0.5199,
"step": 945
},
{
"epoch": 1.6814159292035398,
"grad_norm": 1.5582651497341313,
"learning_rate": 1.6790502771763018e-05,
"loss": 0.5014,
"step": 950
},
{
"epoch": 1.6902654867256637,
"grad_norm": 1.6655179770008597,
"learning_rate": 1.6745011154404037e-05,
"loss": 0.4854,
"step": 955
},
{
"epoch": 1.6991150442477876,
"grad_norm": 1.4915522872459333,
"learning_rate": 1.669926198206493e-05,
"loss": 0.5132,
"step": 960
},
{
"epoch": 1.7079646017699115,
"grad_norm": 1.6252968757056179,
"learning_rate": 1.6653257001655652e-05,
"loss": 0.5016,
"step": 965
},
{
"epoch": 1.7168141592920354,
"grad_norm": 2.4097499030817096,
"learning_rate": 1.6606997969854087e-05,
"loss": 0.5227,
"step": 970
},
{
"epoch": 1.7256637168141593,
"grad_norm": 1.631370053795557,
"learning_rate": 1.6560486653038916e-05,
"loss": 0.5119,
"step": 975
},
{
"epoch": 1.7345132743362832,
"grad_norm": 2.442759512892503,
"learning_rate": 1.6513724827222225e-05,
"loss": 0.4912,
"step": 980
},
{
"epoch": 1.7433628318584071,
"grad_norm": 12.222832820637523,
"learning_rate": 1.6466714277981656e-05,
"loss": 0.5224,
"step": 985
},
{
"epoch": 1.752212389380531,
"grad_norm": 1.699557253240237,
"learning_rate": 1.641945680039223e-05,
"loss": 0.52,
"step": 990
},
{
"epoch": 1.7610619469026547,
"grad_norm": 1.6614165390235756,
"learning_rate": 1.6371954198957823e-05,
"loss": 0.5118,
"step": 995
},
{
"epoch": 1.7699115044247788,
"grad_norm": 5.006976593399911,
"learning_rate": 1.6324208287542228e-05,
"loss": 0.4785,
"step": 1000
},
{
"epoch": 1.7787610619469025,
"grad_norm": 2.501687669574672,
"learning_rate": 1.6276220889299918e-05,
"loss": 0.494,
"step": 1005
},
{
"epoch": 1.7876106194690267,
"grad_norm": 1.593500017019138,
"learning_rate": 1.622799383660643e-05,
"loss": 0.5184,
"step": 1010
},
{
"epoch": 1.7964601769911503,
"grad_norm": 1.8062411504712435,
"learning_rate": 1.617952897098839e-05,
"loss": 0.4905,
"step": 1015
},
{
"epoch": 1.8053097345132745,
"grad_norm": 1.7405963191869815,
"learning_rate": 1.6130828143053173e-05,
"loss": 0.4826,
"step": 1020
},
{
"epoch": 1.8141592920353982,
"grad_norm": 2.046715193192915,
"learning_rate": 1.6081893212418292e-05,
"loss": 0.4923,
"step": 1025
},
{
"epoch": 1.823008849557522,
"grad_norm": 2.016184225830357,
"learning_rate": 1.6032726047640336e-05,
"loss": 0.5014,
"step": 1030
},
{
"epoch": 1.831858407079646,
"grad_norm": 35.09896301727333,
"learning_rate": 1.5983328526143653e-05,
"loss": 0.4711,
"step": 1035
},
{
"epoch": 1.8407079646017699,
"grad_norm": 2.493655284729721,
"learning_rate": 1.5933702534148648e-05,
"loss": 0.5138,
"step": 1040
},
{
"epoch": 1.8495575221238938,
"grad_norm": 5.918180177995943,
"learning_rate": 1.588384996659976e-05,
"loss": 0.5389,
"step": 1045
},
{
"epoch": 1.8584070796460177,
"grad_norm": 20.200037687607217,
"learning_rate": 1.583377272709311e-05,
"loss": 0.5038,
"step": 1050
},
{
"epoch": 1.8672566371681416,
"grad_norm": 11.981522736747214,
"learning_rate": 1.5783472727803796e-05,
"loss": 0.5098,
"step": 1055
},
{
"epoch": 1.8761061946902655,
"grad_norm": 2.493450240622104,
"learning_rate": 1.5732951889412905e-05,
"loss": 0.5068,
"step": 1060
},
{
"epoch": 1.8849557522123894,
"grad_norm": 84.35037035076192,
"learning_rate": 1.5682212141034153e-05,
"loss": 0.5365,
"step": 1065
},
{
"epoch": 1.893805309734513,
"grad_norm": 5.882151543248406,
"learning_rate": 1.5631255420140225e-05,
"loss": 0.5275,
"step": 1070
},
{
"epoch": 1.9026548672566372,
"grad_norm": 4.244653752423409,
"learning_rate": 1.55800836724888e-05,
"loss": 0.5221,
"step": 1075
},
{
"epoch": 1.911504424778761,
"grad_norm": 2.318077082885742,
"learning_rate": 1.5528698852048247e-05,
"loss": 0.5034,
"step": 1080
},
{
"epoch": 1.920353982300885,
"grad_norm": 1.9732014714384287,
"learning_rate": 1.547710292092301e-05,
"loss": 0.522,
"step": 1085
},
{
"epoch": 1.9292035398230087,
"grad_norm": 1.650645415727171,
"learning_rate": 1.5425297849278714e-05,
"loss": 0.511,
"step": 1090
},
{
"epoch": 1.9380530973451329,
"grad_norm": 1.8379847429115603,
"learning_rate": 1.5373285615266884e-05,
"loss": 0.5234,
"step": 1095
},
{
"epoch": 1.9469026548672566,
"grad_norm": 1.8823141679209345,
"learning_rate": 1.5321068204949465e-05,
"loss": 0.494,
"step": 1100
},
{
"epoch": 1.9557522123893807,
"grad_norm": 1.7345679128452123,
"learning_rate": 1.526864761222294e-05,
"loss": 0.4964,
"step": 1105
},
{
"epoch": 1.9646017699115044,
"grad_norm": 2.076946865200806,
"learning_rate": 1.5216025838742226e-05,
"loss": 0.487,
"step": 1110
},
{
"epoch": 1.9734513274336283,
"grad_norm": 1.6316830069632373,
"learning_rate": 1.5163204893844223e-05,
"loss": 0.4799,
"step": 1115
},
{
"epoch": 1.9823008849557522,
"grad_norm": 1.5027611303212294,
"learning_rate": 1.5110186794471105e-05,
"loss": 0.5015,
"step": 1120
},
{
"epoch": 1.991150442477876,
"grad_norm": 1.4539381914768303,
"learning_rate": 1.505697356509328e-05,
"loss": 0.4817,
"step": 1125
},
{
"epoch": 2.0,
"grad_norm": 1.6088305066129869,
"learning_rate": 1.5003567237632113e-05,
"loss": 0.4922,
"step": 1130
},
{
"epoch": 2.0,
"eval_loss": 0.385861873626709,
"eval_runtime": 342.0399,
"eval_samples_per_second": 21.989,
"eval_steps_per_second": 0.345,
"step": 1130
},
{
"epoch": 2.0088495575221237,
"grad_norm": 2.7761717129790804,
"learning_rate": 1.4949969851382315e-05,
"loss": 0.3518,
"step": 1135
},
{
"epoch": 2.017699115044248,
"grad_norm": 1.7619273167046978,
"learning_rate": 1.4896183452934087e-05,
"loss": 0.3277,
"step": 1140
},
{
"epoch": 2.0265486725663715,
"grad_norm": 1.7864597766278332,
"learning_rate": 1.4842210096094984e-05,
"loss": 0.3257,
"step": 1145
},
{
"epoch": 2.0353982300884956,
"grad_norm": 2.7271140490759724,
"learning_rate": 1.478805184181145e-05,
"loss": 0.3358,
"step": 1150
},
{
"epoch": 2.0442477876106193,
"grad_norm": 1.6882000707315932,
"learning_rate": 1.4733710758090175e-05,
"loss": 0.3295,
"step": 1155
},
{
"epoch": 2.0530973451327434,
"grad_norm": 1.6306905404465186,
"learning_rate": 1.4679188919919076e-05,
"loss": 0.3355,
"step": 1160
},
{
"epoch": 2.061946902654867,
"grad_norm": 1.7785685190803522,
"learning_rate": 1.4624488409188116e-05,
"loss": 0.329,
"step": 1165
},
{
"epoch": 2.0707964601769913,
"grad_norm": 1.7355960995540745,
"learning_rate": 1.4569611314609767e-05,
"loss": 0.3384,
"step": 1170
},
{
"epoch": 2.079646017699115,
"grad_norm": 4.525568032764957,
"learning_rate": 1.4514559731639273e-05,
"loss": 0.3318,
"step": 1175
},
{
"epoch": 2.088495575221239,
"grad_norm": 28.29868813913778,
"learning_rate": 1.4459335762394637e-05,
"loss": 0.3307,
"step": 1180
},
{
"epoch": 2.0973451327433628,
"grad_norm": 3.34159347381774,
"learning_rate": 1.4403941515576344e-05,
"loss": 0.331,
"step": 1185
},
{
"epoch": 2.106194690265487,
"grad_norm": 2.279005617470207,
"learning_rate": 1.434837910638685e-05,
"loss": 0.3506,
"step": 1190
},
{
"epoch": 2.1150442477876106,
"grad_norm": 1.818174492390053,
"learning_rate": 1.42926506564498e-05,
"loss": 0.335,
"step": 1195
},
{
"epoch": 2.1238938053097347,
"grad_norm": 2.21280984926424,
"learning_rate": 1.4236758293729034e-05,
"loss": 0.3383,
"step": 1200
},
{
"epoch": 2.1327433628318584,
"grad_norm": 2.6931991751982154,
"learning_rate": 1.4180704152447322e-05,
"loss": 0.3431,
"step": 1205
},
{
"epoch": 2.1415929203539825,
"grad_norm": 3.1061095288545504,
"learning_rate": 1.4124490373004864e-05,
"loss": 0.3485,
"step": 1210
},
{
"epoch": 2.150442477876106,
"grad_norm": 1.8692302359437503,
"learning_rate": 1.4068119101897568e-05,
"loss": 0.3482,
"step": 1215
},
{
"epoch": 2.15929203539823,
"grad_norm": 4.590488061615715,
"learning_rate": 1.4011592491635088e-05,
"loss": 0.3349,
"step": 1220
},
{
"epoch": 2.168141592920354,
"grad_norm": 1.7406001344065656,
"learning_rate": 1.3954912700658626e-05,
"loss": 0.33,
"step": 1225
},
{
"epoch": 2.1769911504424777,
"grad_norm": 2.147923934228603,
"learning_rate": 1.389808189325851e-05,
"loss": 0.3384,
"step": 1230
},
{
"epoch": 2.185840707964602,
"grad_norm": 1.980447202723778,
"learning_rate": 1.3841102239491567e-05,
"loss": 0.3409,
"step": 1235
},
{
"epoch": 2.1946902654867255,
"grad_norm": 1.754785548742135,
"learning_rate": 1.3783975915098244e-05,
"loss": 0.3267,
"step": 1240
},
{
"epoch": 2.2035398230088497,
"grad_norm": 1.685034715782438,
"learning_rate": 1.3726705101419538e-05,
"loss": 0.3173,
"step": 1245
},
{
"epoch": 2.2123893805309733,
"grad_norm": 1.697282116117874,
"learning_rate": 1.3669291985313695e-05,
"loss": 0.3422,
"step": 1250
},
{
"epoch": 2.2212389380530975,
"grad_norm": 1.7593945507869428,
"learning_rate": 1.3611738759072712e-05,
"loss": 0.33,
"step": 1255
},
{
"epoch": 2.230088495575221,
"grad_norm": 1.7917883090230355,
"learning_rate": 1.3554047620338629e-05,
"loss": 0.3305,
"step": 1260
},
{
"epoch": 2.2389380530973453,
"grad_norm": 1.8630367840502837,
"learning_rate": 1.3496220772019597e-05,
"loss": 0.3331,
"step": 1265
},
{
"epoch": 2.247787610619469,
"grad_norm": 1.4789663707909015,
"learning_rate": 1.3438260422205779e-05,
"loss": 0.3388,
"step": 1270
},
{
"epoch": 2.256637168141593,
"grad_norm": 1.605944505641843,
"learning_rate": 1.3380168784085028e-05,
"loss": 0.3366,
"step": 1275
},
{
"epoch": 2.265486725663717,
"grad_norm": 1.6963565599437993,
"learning_rate": 1.3321948075858377e-05,
"loss": 0.3563,
"step": 1280
},
{
"epoch": 2.274336283185841,
"grad_norm": 1.6228304534956426,
"learning_rate": 1.3263600520655333e-05,
"loss": 0.3365,
"step": 1285
},
{
"epoch": 2.2831858407079646,
"grad_norm": 1.76469997194976,
"learning_rate": 1.3205128346449003e-05,
"loss": 0.3443,
"step": 1290
},
{
"epoch": 2.2920353982300883,
"grad_norm": 1.645969241835665,
"learning_rate": 1.3146533785970997e-05,
"loss": 0.3288,
"step": 1295
},
{
"epoch": 2.3008849557522124,
"grad_norm": 1.6481237108795372,
"learning_rate": 1.3087819076626201e-05,
"loss": 0.3314,
"step": 1300
},
{
"epoch": 2.309734513274336,
"grad_norm": 1.672673680905006,
"learning_rate": 1.3028986460407312e-05,
"loss": 0.3142,
"step": 1305
},
{
"epoch": 2.3185840707964602,
"grad_norm": 1.6126633305212483,
"learning_rate": 1.297003818380926e-05,
"loss": 0.3331,
"step": 1310
},
{
"epoch": 2.327433628318584,
"grad_norm": 1.6511744854665646,
"learning_rate": 1.2910976497743389e-05,
"loss": 0.321,
"step": 1315
},
{
"epoch": 2.336283185840708,
"grad_norm": 1.6614647687728916,
"learning_rate": 1.2851803657451554e-05,
"loss": 0.34,
"step": 1320
},
{
"epoch": 2.3451327433628317,
"grad_norm": 1.5556961404816416,
"learning_rate": 1.2792521922419958e-05,
"loss": 0.3378,
"step": 1325
},
{
"epoch": 2.353982300884956,
"grad_norm": 1.607457047378207,
"learning_rate": 1.2733133556292914e-05,
"loss": 0.3277,
"step": 1330
},
{
"epoch": 2.3628318584070795,
"grad_norm": 1.6195875099518586,
"learning_rate": 1.2673640826786378e-05,
"loss": 0.3268,
"step": 1335
},
{
"epoch": 2.3716814159292037,
"grad_norm": 1.5352915106052365,
"learning_rate": 1.2614046005601377e-05,
"loss": 0.3186,
"step": 1340
},
{
"epoch": 2.3805309734513274,
"grad_norm": 1.5797503971889433,
"learning_rate": 1.2554351368337262e-05,
"loss": 0.3344,
"step": 1345
},
{
"epoch": 2.3893805309734515,
"grad_norm": 2.5943284467596253,
"learning_rate": 1.2494559194404809e-05,
"loss": 0.3468,
"step": 1350
},
{
"epoch": 2.398230088495575,
"grad_norm": 1.7257039415600963,
"learning_rate": 1.2434671766939184e-05,
"loss": 0.3348,
"step": 1355
},
{
"epoch": 2.4070796460176993,
"grad_norm": 1.892133636249101,
"learning_rate": 1.2374691372712761e-05,
"loss": 0.3276,
"step": 1360
},
{
"epoch": 2.415929203539823,
"grad_norm": 1.959589712843592,
"learning_rate": 1.2314620302047818e-05,
"loss": 0.3273,
"step": 1365
},
{
"epoch": 2.4247787610619467,
"grad_norm": 1.5186200162272043,
"learning_rate": 1.2254460848729046e-05,
"loss": 0.3274,
"step": 1370
},
{
"epoch": 2.433628318584071,
"grad_norm": 1.5473486484026677,
"learning_rate": 1.2194215309916005e-05,
"loss": 0.3443,
"step": 1375
},
{
"epoch": 2.442477876106195,
"grad_norm": 1.6097900368392104,
"learning_rate": 1.2133885986055379e-05,
"loss": 0.3179,
"step": 1380
},
{
"epoch": 2.4513274336283186,
"grad_norm": 1.59719321690748,
"learning_rate": 1.2073475180793144e-05,
"loss": 0.324,
"step": 1385
},
{
"epoch": 2.4601769911504423,
"grad_norm": 1.657560283923696,
"learning_rate": 1.2012985200886602e-05,
"loss": 0.3279,
"step": 1390
},
{
"epoch": 2.4690265486725664,
"grad_norm": 1.6312886292352506,
"learning_rate": 1.1952418356116309e-05,
"loss": 0.342,
"step": 1395
},
{
"epoch": 2.47787610619469,
"grad_norm": 1.673413598360019,
"learning_rate": 1.1891776959197854e-05,
"loss": 0.3325,
"step": 1400
},
{
"epoch": 2.4867256637168142,
"grad_norm": 1.5641257680673357,
"learning_rate": 1.1831063325693578e-05,
"loss": 0.33,
"step": 1405
},
{
"epoch": 2.495575221238938,
"grad_norm": 1.557582800520083,
"learning_rate": 1.1770279773924133e-05,
"loss": 0.3229,
"step": 1410
},
{
"epoch": 2.504424778761062,
"grad_norm": 2.0076002123451033,
"learning_rate": 1.1709428624879971e-05,
"loss": 0.338,
"step": 1415
},
{
"epoch": 2.5132743362831858,
"grad_norm": 1.7747271993836249,
"learning_rate": 1.1648512202132705e-05,
"loss": 0.3312,
"step": 1420
},
{
"epoch": 2.52212389380531,
"grad_norm": 1.7345249327456702,
"learning_rate": 1.15875328317464e-05,
"loss": 0.3324,
"step": 1425
},
{
"epoch": 2.5309734513274336,
"grad_norm": 1.5526595051928562,
"learning_rate": 1.1526492842188746e-05,
"loss": 0.3183,
"step": 1430
},
{
"epoch": 2.5398230088495577,
"grad_norm": 1.7188069365366003,
"learning_rate": 1.1465394564242142e-05,
"loss": 0.3382,
"step": 1435
},
{
"epoch": 2.5486725663716814,
"grad_norm": 1.5515396732118238,
"learning_rate": 1.1404240330914706e-05,
"loss": 0.3214,
"step": 1440
},
{
"epoch": 2.557522123893805,
"grad_norm": 1.7016705926250035,
"learning_rate": 1.1343032477351183e-05,
"loss": 0.341,
"step": 1445
},
{
"epoch": 2.566371681415929,
"grad_norm": 1.700802699390158,
"learning_rate": 1.128177334074377e-05,
"loss": 0.3206,
"step": 1450
},
{
"epoch": 2.5752212389380533,
"grad_norm": 1.5960912094984059,
"learning_rate": 1.122046526024291e-05,
"loss": 0.3155,
"step": 1455
},
{
"epoch": 2.584070796460177,
"grad_norm": 1.7339788975369348,
"learning_rate": 1.1159110576867915e-05,
"loss": 0.3239,
"step": 1460
},
{
"epoch": 2.5929203539823007,
"grad_norm": 1.6059957404958596,
"learning_rate": 1.1097711633417623e-05,
"loss": 0.3221,
"step": 1465
},
{
"epoch": 2.601769911504425,
"grad_norm": 1.8427719621309488,
"learning_rate": 1.1036270774380906e-05,
"loss": 0.3304,
"step": 1470
},
{
"epoch": 2.6106194690265485,
"grad_norm": 1.631329628154416,
"learning_rate": 1.0974790345847187e-05,
"loss": 0.3202,
"step": 1475
},
{
"epoch": 2.6194690265486726,
"grad_norm": 1.588519061268432,
"learning_rate": 1.0913272695416807e-05,
"loss": 0.3262,
"step": 1480
},
{
"epoch": 2.6283185840707963,
"grad_norm": 1.6832971532585765,
"learning_rate": 1.085172017211142e-05,
"loss": 0.343,
"step": 1485
},
{
"epoch": 2.6371681415929205,
"grad_norm": 1.6201470424053184,
"learning_rate": 1.0790135126284275e-05,
"loss": 0.3173,
"step": 1490
},
{
"epoch": 2.646017699115044,
"grad_norm": 1.6068035539370134,
"learning_rate": 1.072851990953049e-05,
"loss": 0.3302,
"step": 1495
},
{
"epoch": 2.6548672566371683,
"grad_norm": 1.6527582221176542,
"learning_rate": 1.0666876874597235e-05,
"loss": 0.317,
"step": 1500
},
{
"epoch": 2.663716814159292,
"grad_norm": 1.6872940097146771,
"learning_rate": 1.0605208375293905e-05,
"loss": 0.3327,
"step": 1505
},
{
"epoch": 2.672566371681416,
"grad_norm": 1.9993336890377256,
"learning_rate": 1.0543516766402245e-05,
"loss": 0.327,
"step": 1510
},
{
"epoch": 2.6814159292035398,
"grad_norm": 1.8925758992460178,
"learning_rate": 1.0481804403586421e-05,
"loss": 0.3232,
"step": 1515
},
{
"epoch": 2.6902654867256635,
"grad_norm": 1.7506417492910218,
"learning_rate": 1.0420073643303085e-05,
"loss": 0.3236,
"step": 1520
},
{
"epoch": 2.6991150442477876,
"grad_norm": 1.7254760531768936,
"learning_rate": 1.0358326842711383e-05,
"loss": 0.3376,
"step": 1525
},
{
"epoch": 2.7079646017699117,
"grad_norm": 1.6056971874044912,
"learning_rate": 1.0296566359582951e-05,
"loss": 0.3197,
"step": 1530
},
{
"epoch": 2.7168141592920354,
"grad_norm": 1.692264180914051,
"learning_rate": 1.023479455221189e-05,
"loss": 0.317,
"step": 1535
},
{
"epoch": 2.725663716814159,
"grad_norm": 1.755215116123095,
"learning_rate": 1.0173013779324714e-05,
"loss": 0.3309,
"step": 1540
},
{
"epoch": 2.734513274336283,
"grad_norm": 1.6225703848243365,
"learning_rate": 1.0111226399990267e-05,
"loss": 0.3247,
"step": 1545
},
{
"epoch": 2.7433628318584073,
"grad_norm": 1.574502935573898,
"learning_rate": 1.0049434773529678e-05,
"loss": 0.3193,
"step": 1550
},
{
"epoch": 2.752212389380531,
"grad_norm": 1.642771315319423,
"learning_rate": 9.98764125942623e-06,
"loss": 0.3304,
"step": 1555
},
{
"epoch": 2.7610619469026547,
"grad_norm": 1.5731646251204943,
"learning_rate": 9.9258482172353e-06,
"loss": 0.3438,
"step": 1560
},
{
"epoch": 2.769911504424779,
"grad_norm": 1.8184099244247223,
"learning_rate": 9.864058006494237e-06,
"loss": 0.3278,
"step": 1565
},
{
"epoch": 2.7787610619469025,
"grad_norm": 1.9834453880133907,
"learning_rate": 9.80227298663227e-06,
"loss": 0.3305,
"step": 1570
},
{
"epoch": 2.7876106194690267,
"grad_norm": 1.9264046453250971,
"learning_rate": 9.740495516880428e-06,
"loss": 0.3158,
"step": 1575
},
{
"epoch": 2.7964601769911503,
"grad_norm": 2.1903532218624857,
"learning_rate": 9.678727956181438e-06,
"loss": 0.3267,
"step": 1580
},
{
"epoch": 2.8053097345132745,
"grad_norm": 1.8158272144124583,
"learning_rate": 9.616972663099648e-06,
"loss": 0.342,
"step": 1585
},
{
"epoch": 2.814159292035398,
"grad_norm": 2.7692218225078724,
"learning_rate": 9.55523199573098e-06,
"loss": 0.3258,
"step": 1590
},
{
"epoch": 2.823008849557522,
"grad_norm": 1.6127123082950938,
"learning_rate": 9.493508311612874e-06,
"loss": 0.3214,
"step": 1595
},
{
"epoch": 2.831858407079646,
"grad_norm": 1.7692693875404215,
"learning_rate": 9.431803967634284e-06,
"loss": 0.337,
"step": 1600
},
{
"epoch": 2.84070796460177,
"grad_norm": 1.5722273068604184,
"learning_rate": 9.370121319945657e-06,
"loss": 0.3354,
"step": 1605
},
{
"epoch": 2.849557522123894,
"grad_norm": 1.5455898969077757,
"learning_rate": 9.308462723868987e-06,
"loss": 0.3203,
"step": 1610
},
{
"epoch": 2.8584070796460175,
"grad_norm": 1.6331654733208454,
"learning_rate": 9.246830533807857e-06,
"loss": 0.3215,
"step": 1615
},
{
"epoch": 2.8672566371681416,
"grad_norm": 1.6276033261489797,
"learning_rate": 9.185227103157573e-06,
"loss": 0.3152,
"step": 1620
},
{
"epoch": 2.8761061946902657,
"grad_norm": 1.6614797915917234,
"learning_rate": 9.12365478421525e-06,
"loss": 0.3214,
"step": 1625
},
{
"epoch": 2.8849557522123894,
"grad_norm": 1.732426911950826,
"learning_rate": 9.062115928090036e-06,
"loss": 0.3068,
"step": 1630
},
{
"epoch": 2.893805309734513,
"grad_norm": 1.518118162492201,
"learning_rate": 9.000612884613306e-06,
"loss": 0.3126,
"step": 1635
},
{
"epoch": 2.9026548672566372,
"grad_norm": 1.6702030494722016,
"learning_rate": 8.939148002248954e-06,
"loss": 0.3348,
"step": 1640
},
{
"epoch": 2.911504424778761,
"grad_norm": 1.7443283818272917,
"learning_rate": 8.877723628003703e-06,
"loss": 0.3266,
"step": 1645
},
{
"epoch": 2.920353982300885,
"grad_norm": 1.6887892210021154,
"learning_rate": 8.816342107337501e-06,
"loss": 0.331,
"step": 1650
},
{
"epoch": 2.9292035398230087,
"grad_norm": 1.6444996183942215,
"learning_rate": 8.755005784073948e-06,
"loss": 0.3078,
"step": 1655
},
{
"epoch": 2.938053097345133,
"grad_norm": 1.5829832005281133,
"learning_rate": 8.693717000310801e-06,
"loss": 0.3071,
"step": 1660
},
{
"epoch": 2.9469026548672566,
"grad_norm": 1.7157739006910784,
"learning_rate": 8.632478096330559e-06,
"loss": 0.3255,
"step": 1665
},
{
"epoch": 2.9557522123893807,
"grad_norm": 1.5977542612636495,
"learning_rate": 8.571291410511063e-06,
"loss": 0.3176,
"step": 1670
},
{
"epoch": 2.9646017699115044,
"grad_norm": 1.6999676018113206,
"learning_rate": 8.510159279236244e-06,
"loss": 0.3275,
"step": 1675
},
{
"epoch": 2.9734513274336285,
"grad_norm": 1.8724555229847881,
"learning_rate": 8.449084036806893e-06,
"loss": 0.3201,
"step": 1680
},
{
"epoch": 2.982300884955752,
"grad_norm": 1.980909859882966,
"learning_rate": 8.388068015351521e-06,
"loss": 0.3105,
"step": 1685
},
{
"epoch": 2.991150442477876,
"grad_norm": 1.797933699630095,
"learning_rate": 8.327113544737325e-06,
"loss": 0.3207,
"step": 1690
},
{
"epoch": 3.0,
"grad_norm": 1.6572798098507155,
"learning_rate": 8.2662229524812e-06,
"loss": 0.3003,
"step": 1695
},
{
"epoch": 3.0,
"eval_loss": 0.235044464468956,
"eval_runtime": 341.1312,
"eval_samples_per_second": 22.047,
"eval_steps_per_second": 0.346,
"step": 1695
},
{
"epoch": 3.0088495575221237,
"grad_norm": 2.353752696189,
"learning_rate": 8.205398563660886e-06,
"loss": 0.179,
"step": 1700
},
{
"epoch": 3.017699115044248,
"grad_norm": 2.3653722927763288,
"learning_rate": 8.144642700826182e-06,
"loss": 0.1704,
"step": 1705
},
{
"epoch": 3.0265486725663715,
"grad_norm": 1.9259007577930276,
"learning_rate": 8.08395768391024e-06,
"loss": 0.1707,
"step": 1710
},
{
"epoch": 3.0353982300884956,
"grad_norm": 1.893459900305888,
"learning_rate": 8.02334583014101e-06,
"loss": 0.1675,
"step": 1715
},
{
"epoch": 3.0442477876106193,
"grad_norm": 2.0717584069847037,
"learning_rate": 7.96280945395273e-06,
"loss": 0.1839,
"step": 1720
},
{
"epoch": 3.0530973451327434,
"grad_norm": 2.114700678563183,
"learning_rate": 7.902350866897573e-06,
"loss": 0.1793,
"step": 1725
},
{
"epoch": 3.061946902654867,
"grad_norm": 1.8731510987373088,
"learning_rate": 7.841972377557366e-06,
"loss": 0.1846,
"step": 1730
},
{
"epoch": 3.0707964601769913,
"grad_norm": 1.722257787492872,
"learning_rate": 7.78167629145545e-06,
"loss": 0.1697,
"step": 1735
},
{
"epoch": 3.079646017699115,
"grad_norm": 1.705359298929866,
"learning_rate": 7.721464910968628e-06,
"loss": 0.1687,
"step": 1740
},
{
"epoch": 3.088495575221239,
"grad_norm": 1.8344884220833366,
"learning_rate": 7.661340535239266e-06,
"loss": 0.1724,
"step": 1745
},
{
"epoch": 3.0973451327433628,
"grad_norm": 1.7566442049461333,
"learning_rate": 7.6013054600875005e-06,
"loss": 0.1754,
"step": 1750
},
{
"epoch": 3.106194690265487,
"grad_norm": 1.6129096564380034,
"learning_rate": 7.541361977923564e-06,
"loss": 0.1667,
"step": 1755
},
{
"epoch": 3.1150442477876106,
"grad_norm": 1.7068252140891824,
"learning_rate": 7.481512377660251e-06,
"loss": 0.1667,
"step": 1760
},
{
"epoch": 3.1238938053097347,
"grad_norm": 1.6531864536593857,
"learning_rate": 7.421758944625528e-06,
"loss": 0.1785,
"step": 1765
},
{
"epoch": 3.1327433628318584,
"grad_norm": 1.821254758412697,
"learning_rate": 7.362103960475258e-06,
"loss": 0.1698,
"step": 1770
},
{
"epoch": 3.1415929203539825,
"grad_norm": 1.7780006102159944,
"learning_rate": 7.302549703106084e-06,
"loss": 0.1828,
"step": 1775
},
{
"epoch": 3.150442477876106,
"grad_norm": 2.39713947955002,
"learning_rate": 7.243098446568442e-06,
"loss": 0.1736,
"step": 1780
},
{
"epoch": 3.15929203539823,
"grad_norm": 1.6290436678999984,
"learning_rate": 7.183752460979737e-06,
"loss": 0.1699,
"step": 1785
},
{
"epoch": 3.168141592920354,
"grad_norm": 1.582870471036582,
"learning_rate": 7.124514012437645e-06,
"loss": 0.1718,
"step": 1790
},
{
"epoch": 3.1769911504424777,
"grad_norm": 1.5585723392749353,
"learning_rate": 7.065385362933603e-06,
"loss": 0.166,
"step": 1795
},
{
"epoch": 3.185840707964602,
"grad_norm": 1.6356629585103508,
"learning_rate": 7.006368770266421e-06,
"loss": 0.1738,
"step": 1800
},
{
"epoch": 3.1946902654867255,
"grad_norm": 1.8092852534302832,
"learning_rate": 6.947466487956067e-06,
"loss": 0.184,
"step": 1805
},
{
"epoch": 3.2035398230088497,
"grad_norm": 1.763331264313237,
"learning_rate": 6.88868076515763e-06,
"loss": 0.1747,
"step": 1810
},
{
"epoch": 3.2123893805309733,
"grad_norm": 1.6618660989856646,
"learning_rate": 6.83001384657543e-06,
"loss": 0.1753,
"step": 1815
},
{
"epoch": 3.2212389380530975,
"grad_norm": 1.587869179848222,
"learning_rate": 6.7714679723772996e-06,
"loss": 0.177,
"step": 1820
},
{
"epoch": 3.230088495575221,
"grad_norm": 1.7012507664894645,
"learning_rate": 6.713045378109058e-06,
"loss": 0.182,
"step": 1825
},
{
"epoch": 3.2389380530973453,
"grad_norm": 3.092871482624404,
"learning_rate": 6.654748294609137e-06,
"loss": 0.1749,
"step": 1830
},
{
"epoch": 3.247787610619469,
"grad_norm": 5.246672357838822,
"learning_rate": 6.596578947923395e-06,
"loss": 0.1852,
"step": 1835
},
{
"epoch": 3.256637168141593,
"grad_norm": 2.6149168080932492,
"learning_rate": 6.538539559220141e-06,
"loss": 0.1717,
"step": 1840
},
{
"epoch": 3.265486725663717,
"grad_norm": 2.298194261929981,
"learning_rate": 6.480632344705274e-06,
"loss": 0.1827,
"step": 1845
},
{
"epoch": 3.274336283185841,
"grad_norm": 2.0137609080864083,
"learning_rate": 6.422859515537709e-06,
"loss": 0.1783,
"step": 1850
},
{
"epoch": 3.2831858407079646,
"grad_norm": 1.9067096587439358,
"learning_rate": 6.365223277744907e-06,
"loss": 0.1762,
"step": 1855
},
{
"epoch": 3.2920353982300883,
"grad_norm": 1.749958280136199,
"learning_rate": 6.3077258321386604e-06,
"loss": 0.1666,
"step": 1860
},
{
"epoch": 3.3008849557522124,
"grad_norm": 1.6753892133330817,
"learning_rate": 6.25036937423105e-06,
"loss": 0.1817,
"step": 1865
},
{
"epoch": 3.309734513274336,
"grad_norm": 1.770861004923346,
"learning_rate": 6.1931560941506055e-06,
"loss": 0.1753,
"step": 1870
},
{
"epoch": 3.3185840707964602,
"grad_norm": 1.6941989909031387,
"learning_rate": 6.136088176558683e-06,
"loss": 0.1683,
"step": 1875
},
{
"epoch": 3.327433628318584,
"grad_norm": 1.7818744676278702,
"learning_rate": 6.07916780056604e-06,
"loss": 0.1819,
"step": 1880
},
{
"epoch": 3.336283185840708,
"grad_norm": 1.7169221713713938,
"learning_rate": 6.022397139649636e-06,
"loss": 0.1753,
"step": 1885
},
{
"epoch": 3.3451327433628317,
"grad_norm": 1.7383379208379126,
"learning_rate": 5.96577836156963e-06,
"loss": 0.1772,
"step": 1890
},
{
"epoch": 3.353982300884956,
"grad_norm": 10.593484473821267,
"learning_rate": 5.9093136282866014e-06,
"loss": 0.1776,
"step": 1895
},
{
"epoch": 3.3628318584070795,
"grad_norm": 7.892581375758647,
"learning_rate": 5.853005095879015e-06,
"loss": 0.177,
"step": 1900
},
{
"epoch": 3.3716814159292037,
"grad_norm": 3.632404735066819,
"learning_rate": 5.796854914460873e-06,
"loss": 0.1819,
"step": 1905
},
{
"epoch": 3.3805309734513274,
"grad_norm": 2.48312406396527,
"learning_rate": 5.740865228099621e-06,
"loss": 0.1765,
"step": 1910
},
{
"epoch": 3.3893805309734515,
"grad_norm": 2.2660397170129993,
"learning_rate": 5.68503817473429e-06,
"loss": 0.1833,
"step": 1915
},
{
"epoch": 3.398230088495575,
"grad_norm": 1.9157388795853385,
"learning_rate": 5.629375886093835e-06,
"loss": 0.1735,
"step": 1920
},
{
"epoch": 3.4070796460176993,
"grad_norm": 1.9732798667035623,
"learning_rate": 5.573880487615755e-06,
"loss": 0.1776,
"step": 1925
},
{
"epoch": 3.415929203539823,
"grad_norm": 1.694802471034398,
"learning_rate": 5.518554098364932e-06,
"loss": 0.1723,
"step": 1930
},
{
"epoch": 3.4247787610619467,
"grad_norm": 1.607717285267641,
"learning_rate": 5.463398830952714e-06,
"loss": 0.1699,
"step": 1935
},
{
"epoch": 3.433628318584071,
"grad_norm": 1.742187593317827,
"learning_rate": 5.408416791456239e-06,
"loss": 0.1829,
"step": 1940
},
{
"epoch": 3.442477876106195,
"grad_norm": 1.6295575263043325,
"learning_rate": 5.3536100793380234e-06,
"loss": 0.168,
"step": 1945
},
{
"epoch": 3.4513274336283186,
"grad_norm": 1.5116611834425375,
"learning_rate": 5.298980787365785e-06,
"loss": 0.1733,
"step": 1950
},
{
"epoch": 3.4601769911504423,
"grad_norm": 1.7614935769749696,
"learning_rate": 5.244531001532558e-06,
"loss": 0.1639,
"step": 1955
},
{
"epoch": 3.4690265486725664,
"grad_norm": 1.4882500944214951,
"learning_rate": 5.190262800977007e-06,
"loss": 0.1623,
"step": 1960
},
{
"epoch": 3.47787610619469,
"grad_norm": 1.7919227523392,
"learning_rate": 5.136178257904048e-06,
"loss": 0.1793,
"step": 1965
},
{
"epoch": 3.4867256637168142,
"grad_norm": 1.5720819445780152,
"learning_rate": 5.082279437505739e-06,
"loss": 0.1814,
"step": 1970
},
{
"epoch": 3.495575221238938,
"grad_norm": 1.777272679047606,
"learning_rate": 5.028568397882397e-06,
"loss": 0.1732,
"step": 1975
},
{
"epoch": 3.504424778761062,
"grad_norm": 1.661227110456758,
"learning_rate": 4.975047189964027e-06,
"loss": 0.1681,
"step": 1980
},
{
"epoch": 3.5132743362831858,
"grad_norm": 1.5442922066155755,
"learning_rate": 4.921717857431997e-06,
"loss": 0.165,
"step": 1985
},
{
"epoch": 3.52212389380531,
"grad_norm": 1.7475725868687078,
"learning_rate": 4.868582436641006e-06,
"loss": 0.1654,
"step": 1990
},
{
"epoch": 3.5309734513274336,
"grad_norm": 1.6819734436503684,
"learning_rate": 4.81564295654134e-06,
"loss": 0.1689,
"step": 1995
},
{
"epoch": 3.5398230088495577,
"grad_norm": 1.5921158221510299,
"learning_rate": 4.762901438601368e-06,
"loss": 0.1712,
"step": 2000
},
{
"epoch": 3.5486725663716814,
"grad_norm": 1.7221909524561103,
"learning_rate": 4.710359896730379e-06,
"loss": 0.1761,
"step": 2005
},
{
"epoch": 3.557522123893805,
"grad_norm": 1.6159991650918777,
"learning_rate": 4.658020337201666e-06,
"loss": 0.1779,
"step": 2010
},
{
"epoch": 3.566371681415929,
"grad_norm": 1.6617020529449875,
"learning_rate": 4.6058847585759335e-06,
"loss": 0.1805,
"step": 2015
},
{
"epoch": 3.5752212389380533,
"grad_norm": 1.6141307718438942,
"learning_rate": 4.5539551516249735e-06,
"loss": 0.181,
"step": 2020
},
{
"epoch": 3.584070796460177,
"grad_norm": 1.613159926564658,
"learning_rate": 4.502233499255641e-06,
"loss": 0.1812,
"step": 2025
},
{
"epoch": 3.5929203539823007,
"grad_norm": 1.5123746738195785,
"learning_rate": 4.450721776434152e-06,
"loss": 0.1737,
"step": 2030
},
{
"epoch": 3.601769911504425,
"grad_norm": 1.59229310691719,
"learning_rate": 4.399421950110657e-06,
"loss": 0.156,
"step": 2035
},
{
"epoch": 3.6106194690265485,
"grad_norm": 1.5956185168711787,
"learning_rate": 4.348335979144158e-06,
"loss": 0.1739,
"step": 2040
},
{
"epoch": 3.6194690265486726,
"grad_norm": 1.5890271518636088,
"learning_rate": 4.297465814227678e-06,
"loss": 0.1682,
"step": 2045
},
{
"epoch": 3.6283185840707963,
"grad_norm": 1.5754061195856448,
"learning_rate": 4.2468133978137945e-06,
"loss": 0.172,
"step": 2050
},
{
"epoch": 3.6371681415929205,
"grad_norm": 1.5165865020871168,
"learning_rate": 4.196380664040468e-06,
"loss": 0.1514,
"step": 2055
},
{
"epoch": 3.646017699115044,
"grad_norm": 1.47915973391418,
"learning_rate": 4.146169538657185e-06,
"loss": 0.1685,
"step": 2060
},
{
"epoch": 3.6548672566371683,
"grad_norm": 1.6368988601716563,
"learning_rate": 4.096181938951419e-06,
"loss": 0.163,
"step": 2065
},
{
"epoch": 3.663716814159292,
"grad_norm": 1.6225383306108707,
"learning_rate": 4.046419773675421e-06,
"loss": 0.1709,
"step": 2070
},
{
"epoch": 3.672566371681416,
"grad_norm": 1.4634676872930676,
"learning_rate": 3.9968849429733396e-06,
"loss": 0.1617,
"step": 2075
},
{
"epoch": 3.6814159292035398,
"grad_norm": 1.6544557711131658,
"learning_rate": 3.94757933830867e-06,
"loss": 0.1675,
"step": 2080
},
{
"epoch": 3.6902654867256635,
"grad_norm": 1.6512592824076686,
"learning_rate": 3.898504842392017e-06,
"loss": 0.1722,
"step": 2085
},
{
"epoch": 3.6991150442477876,
"grad_norm": 1.5886810130145228,
"learning_rate": 3.849663329109206e-06,
"loss": 0.1726,
"step": 2090
},
{
"epoch": 3.7079646017699117,
"grad_norm": 1.631842537467322,
"learning_rate": 3.801056663449737e-06,
"loss": 0.1598,
"step": 2095
},
{
"epoch": 3.7168141592920354,
"grad_norm": 1.5134893226587467,
"learning_rate": 3.7526867014355685e-06,
"loss": 0.1647,
"step": 2100
},
{
"epoch": 3.725663716814159,
"grad_norm": 1.5640824884747317,
"learning_rate": 3.70455529005025e-06,
"loss": 0.1739,
"step": 2105
},
{
"epoch": 3.734513274336283,
"grad_norm": 1.50638388767751,
"learning_rate": 3.6566642671683806e-06,
"loss": 0.1644,
"step": 2110
},
{
"epoch": 3.7433628318584073,
"grad_norm": 1.6707015790458888,
"learning_rate": 3.6090154614854432e-06,
"loss": 0.1623,
"step": 2115
},
{
"epoch": 3.752212389380531,
"grad_norm": 1.6269953969553939,
"learning_rate": 3.561610692447982e-06,
"loss": 0.1603,
"step": 2120
},
{
"epoch": 3.7610619469026547,
"grad_norm": 1.6173678776254214,
"learning_rate": 3.514451770184113e-06,
"loss": 0.1751,
"step": 2125
},
{
"epoch": 3.769911504424779,
"grad_norm": 2.901697583641324,
"learning_rate": 3.467540495434415e-06,
"loss": 0.1599,
"step": 2130
},
{
"epoch": 3.7787610619469025,
"grad_norm": 1.6837282238012816,
"learning_rate": 3.420878659483161e-06,
"loss": 0.1636,
"step": 2135
},
{
"epoch": 3.7876106194690267,
"grad_norm": 1.4175543694096393,
"learning_rate": 3.374468044089937e-06,
"loss": 0.1649,
"step": 2140
},
{
"epoch": 3.7964601769911503,
"grad_norm": 1.7233110576016455,
"learning_rate": 3.328310421421579e-06,
"loss": 0.1641,
"step": 2145
},
{
"epoch": 3.8053097345132745,
"grad_norm": 1.5605391560510777,
"learning_rate": 3.2824075539845334e-06,
"loss": 0.1705,
"step": 2150
},
{
"epoch": 3.814159292035398,
"grad_norm": 1.5377565176020294,
"learning_rate": 3.2367611945575308e-06,
"loss": 0.1539,
"step": 2155
},
{
"epoch": 3.823008849557522,
"grad_norm": 1.5868929794341793,
"learning_rate": 3.191373086124666e-06,
"loss": 0.1709,
"step": 2160
},
{
"epoch": 3.831858407079646,
"grad_norm": 1.5622595706222757,
"learning_rate": 3.1462449618088576e-06,
"loss": 0.1559,
"step": 2165
},
{
"epoch": 3.84070796460177,
"grad_norm": 1.7270166919786822,
"learning_rate": 3.1013785448056454e-06,
"loss": 0.1748,
"step": 2170
},
{
"epoch": 3.849557522123894,
"grad_norm": 1.5748266845975978,
"learning_rate": 3.0567755483174043e-06,
"loss": 0.1726,
"step": 2175
},
{
"epoch": 3.8584070796460175,
"grad_norm": 1.5844961174820655,
"learning_rate": 3.0124376754879305e-06,
"loss": 0.1586,
"step": 2180
},
{
"epoch": 3.8672566371681416,
"grad_norm": 2.1519156556967776,
"learning_rate": 2.968366619337394e-06,
"loss": 0.1619,
"step": 2185
},
{
"epoch": 3.8761061946902657,
"grad_norm": 1.5353376420507088,
"learning_rate": 2.9245640626977012e-06,
"loss": 0.1582,
"step": 2190
},
{
"epoch": 3.8849557522123894,
"grad_norm": 1.604740201034412,
"learning_rate": 2.881031678148244e-06,
"loss": 0.1632,
"step": 2195
},
{
"epoch": 3.893805309734513,
"grad_norm": 1.5866339698891518,
"learning_rate": 2.837771127952007e-06,
"loss": 0.1564,
"step": 2200
},
{
"epoch": 3.9026548672566372,
"grad_norm": 2.295835726400093,
"learning_rate": 2.7947840639921308e-06,
"loss": 0.163,
"step": 2205
},
{
"epoch": 3.911504424778761,
"grad_norm": 1.5757417684634734,
"learning_rate": 2.7520721277088023e-06,
"loss": 0.1596,
"step": 2210
},
{
"epoch": 3.920353982300885,
"grad_norm": 1.8301488972667728,
"learning_rate": 2.709636950036597e-06,
"loss": 0.1609,
"step": 2215
},
{
"epoch": 3.9292035398230087,
"grad_norm": 1.5193689628299787,
"learning_rate": 2.6674801513421945e-06,
"loss": 0.166,
"step": 2220
},
{
"epoch": 3.938053097345133,
"grad_norm": 1.514470929679295,
"learning_rate": 2.6256033413625136e-06,
"loss": 0.1579,
"step": 2225
},
{
"epoch": 3.9469026548672566,
"grad_norm": 1.5711723708738545,
"learning_rate": 2.584008119143234e-06,
"loss": 0.1636,
"step": 2230
},
{
"epoch": 3.9557522123893807,
"grad_norm": 1.5856863391487659,
"learning_rate": 2.5426960729777496e-06,
"loss": 0.1656,
"step": 2235
},
{
"epoch": 3.9646017699115044,
"grad_norm": 1.511859910396652,
"learning_rate": 2.5016687803465033e-06,
"loss": 0.1583,
"step": 2240
},
{
"epoch": 3.9734513274336285,
"grad_norm": 1.5002228905494757,
"learning_rate": 2.460927807856778e-06,
"loss": 0.1602,
"step": 2245
},
{
"epoch": 3.982300884955752,
"grad_norm": 1.4446988734661728,
"learning_rate": 2.4204747111828463e-06,
"loss": 0.1587,
"step": 2250
},
{
"epoch": 3.991150442477876,
"grad_norm": 1.549269756232167,
"learning_rate": 2.3803110350065884e-06,
"loss": 0.1696,
"step": 2255
},
{
"epoch": 4.0,
"grad_norm": 1.641037232857028,
"learning_rate": 2.3404383129585018e-06,
"loss": 0.1776,
"step": 2260
},
{
"epoch": 4.0,
"eval_loss": 0.1632937490940094,
"eval_runtime": 341.9824,
"eval_samples_per_second": 21.992,
"eval_steps_per_second": 0.345,
"step": 2260
},
{
"epoch": 4.008849557522124,
"grad_norm": 2.475219520944848,
"learning_rate": 2.3008580675591462e-06,
"loss": 0.0861,
"step": 2265
},
{
"epoch": 4.017699115044247,
"grad_norm": 1.729540631610469,
"learning_rate": 2.2615718101609986e-06,
"loss": 0.0782,
"step": 2270
},
{
"epoch": 4.0265486725663715,
"grad_norm": 1.467284022386594,
"learning_rate": 2.222581040890741e-06,
"loss": 0.0792,
"step": 2275
},
{
"epoch": 4.035398230088496,
"grad_norm": 1.4386951051586159,
"learning_rate": 2.183887248591996e-06,
"loss": 0.0799,
"step": 2280
},
{
"epoch": 4.04424778761062,
"grad_norm": 1.3926757930334368,
"learning_rate": 2.1454919107684615e-06,
"loss": 0.0824,
"step": 2285
},
{
"epoch": 4.053097345132743,
"grad_norm": 1.3315563224559441,
"learning_rate": 2.107396493527489e-06,
"loss": 0.0765,
"step": 2290
},
{
"epoch": 4.061946902654867,
"grad_norm": 1.36195332075,
"learning_rate": 2.069602451524114e-06,
"loss": 0.0752,
"step": 2295
},
{
"epoch": 4.070796460176991,
"grad_norm": 1.350156993277736,
"learning_rate": 2.0321112279055e-06,
"loss": 0.0791,
"step": 2300
},
{
"epoch": 4.079646017699115,
"grad_norm": 1.448441545074319,
"learning_rate": 1.9949242542558466e-06,
"loss": 0.0737,
"step": 2305
},
{
"epoch": 4.088495575221239,
"grad_norm": 1.3217457960916599,
"learning_rate": 1.9580429505417054e-06,
"loss": 0.083,
"step": 2310
},
{
"epoch": 4.097345132743363,
"grad_norm": 1.33752273930371,
"learning_rate": 1.9214687250577766e-06,
"loss": 0.0833,
"step": 2315
},
{
"epoch": 4.106194690265487,
"grad_norm": 1.3522253645938511,
"learning_rate": 1.8852029743731203e-06,
"loss": 0.0778,
"step": 2320
},
{
"epoch": 4.115044247787611,
"grad_norm": 1.3575033220346027,
"learning_rate": 1.8492470832778442e-06,
"loss": 0.0812,
"step": 2325
},
{
"epoch": 4.123893805309734,
"grad_norm": 1.2823684094843621,
"learning_rate": 1.8136024247302152e-06,
"loss": 0.0763,
"step": 2330
},
{
"epoch": 4.132743362831858,
"grad_norm": 1.3765176805961667,
"learning_rate": 1.7782703598042327e-06,
"loss": 0.0758,
"step": 2335
},
{
"epoch": 4.1415929203539825,
"grad_norm": 1.2766052485378403,
"learning_rate": 1.7432522376376637e-06,
"loss": 0.0824,
"step": 2340
},
{
"epoch": 4.150442477876107,
"grad_norm": 1.3056775652487742,
"learning_rate": 1.7085493953805187e-06,
"loss": 0.0788,
"step": 2345
},
{
"epoch": 4.15929203539823,
"grad_norm": 1.2765927958394436,
"learning_rate": 1.6741631581440066e-06,
"loss": 0.0792,
"step": 2350
},
{
"epoch": 4.168141592920354,
"grad_norm": 1.2224525666208792,
"learning_rate": 1.6400948389499194e-06,
"loss": 0.0767,
"step": 2355
},
{
"epoch": 4.176991150442478,
"grad_norm": 1.3375837067028162,
"learning_rate": 1.6063457386805004e-06,
"loss": 0.0734,
"step": 2360
},
{
"epoch": 4.185840707964601,
"grad_norm": 1.336866928045873,
"learning_rate": 1.572917146028783e-06,
"loss": 0.0812,
"step": 2365
},
{
"epoch": 4.1946902654867255,
"grad_norm": 1.3309435734104655,
"learning_rate": 1.539810337449369e-06,
"loss": 0.079,
"step": 2370
},
{
"epoch": 4.20353982300885,
"grad_norm": 1.32678570412564,
"learning_rate": 1.507026577109686e-06,
"loss": 0.0796,
"step": 2375
},
{
"epoch": 4.212389380530974,
"grad_norm": 1.3156430424303722,
"learning_rate": 1.4745671168417265e-06,
"loss": 0.0777,
"step": 2380
},
{
"epoch": 4.221238938053097,
"grad_norm": 1.2973545964944124,
"learning_rate": 1.442433196094236e-06,
"loss": 0.0827,
"step": 2385
},
{
"epoch": 4.230088495575221,
"grad_norm": 1.311408962321446,
"learning_rate": 1.4106260418854033e-06,
"loss": 0.0775,
"step": 2390
},
{
"epoch": 4.238938053097345,
"grad_norm": 1.3792578224833796,
"learning_rate": 1.379146868755985e-06,
"loss": 0.0804,
"step": 2395
},
{
"epoch": 4.247787610619469,
"grad_norm": 1.2720416544972974,
"learning_rate": 1.3479968787229402e-06,
"loss": 0.0811,
"step": 2400
},
{
"epoch": 4.256637168141593,
"grad_norm": 1.2865117421029972,
"learning_rate": 1.3171772612335332e-06,
"loss": 0.076,
"step": 2405
},
{
"epoch": 4.265486725663717,
"grad_norm": 1.3809883790356827,
"learning_rate": 1.2866891931199132e-06,
"loss": 0.0797,
"step": 2410
},
{
"epoch": 4.274336283185841,
"grad_norm": 1.339742153054439,
"learning_rate": 1.2565338385541792e-06,
"loss": 0.0773,
"step": 2415
},
{
"epoch": 4.283185840707965,
"grad_norm": 1.369494397621055,
"learning_rate": 1.2267123490039201e-06,
"loss": 0.0803,
"step": 2420
},
{
"epoch": 4.292035398230088,
"grad_norm": 1.2994483999857558,
"learning_rate": 1.1972258631882527e-06,
"loss": 0.076,
"step": 2425
},
{
"epoch": 4.300884955752212,
"grad_norm": 1.3824019299588173,
"learning_rate": 1.168075507034341e-06,
"loss": 0.0779,
"step": 2430
},
{
"epoch": 4.3097345132743365,
"grad_norm": 1.3842282775051018,
"learning_rate": 1.1392623936343994e-06,
"loss": 0.08,
"step": 2435
},
{
"epoch": 4.31858407079646,
"grad_norm": 1.359855304821783,
"learning_rate": 1.110787623203189e-06,
"loss": 0.08,
"step": 2440
},
{
"epoch": 4.327433628318584,
"grad_norm": 1.3510428064785265,
"learning_rate": 1.0826522830360087e-06,
"loss": 0.0814,
"step": 2445
},
{
"epoch": 4.336283185840708,
"grad_norm": 1.3365703898748194,
"learning_rate": 1.0548574474671835e-06,
"loss": 0.0791,
"step": 2450
},
{
"epoch": 4.345132743362832,
"grad_norm": 1.391541582481427,
"learning_rate": 1.027404177829031e-06,
"loss": 0.0827,
"step": 2455
},
{
"epoch": 4.353982300884955,
"grad_norm": 1.324819487757073,
"learning_rate": 1.0002935224113387e-06,
"loss": 0.0796,
"step": 2460
},
{
"epoch": 4.3628318584070795,
"grad_norm": 1.403599449784847,
"learning_rate": 9.735265164213349e-07,
"loss": 0.0806,
"step": 2465
},
{
"epoch": 4.371681415929204,
"grad_norm": 1.254012872145482,
"learning_rate": 9.471041819441673e-07,
"loss": 0.0762,
"step": 2470
},
{
"epoch": 4.380530973451328,
"grad_norm": 1.3727040572468259,
"learning_rate": 9.210275279038638e-07,
"loss": 0.0773,
"step": 2475
},
{
"epoch": 4.389380530973451,
"grad_norm": 1.269212054809803,
"learning_rate": 8.952975500248129e-07,
"loss": 0.0789,
"step": 2480
},
{
"epoch": 4.398230088495575,
"grad_norm": 1.2901290730207529,
"learning_rate": 8.69915230793742e-07,
"loss": 0.075,
"step": 2485
},
{
"epoch": 4.407079646017699,
"grad_norm": 1.2318338508628874,
"learning_rate": 8.448815394222043e-07,
"loss": 0.0813,
"step": 2490
},
{
"epoch": 4.415929203539823,
"grad_norm": 1.2925512164200812,
"learning_rate": 8.20197431809564e-07,
"loss": 0.0755,
"step": 2495
},
{
"epoch": 4.424778761061947,
"grad_norm": 1.3632740857909407,
"learning_rate": 7.958638505065031e-07,
"loss": 0.077,
"step": 2500
},
{
"epoch": 4.433628318584071,
"grad_norm": 1.2727197193668072,
"learning_rate": 7.718817246790222e-07,
"loss": 0.0756,
"step": 2505
},
{
"epoch": 4.442477876106195,
"grad_norm": 1.283160067311808,
"learning_rate": 7.48251970072964e-07,
"loss": 0.0771,
"step": 2510
},
{
"epoch": 4.451327433628318,
"grad_norm": 1.3266085812679953,
"learning_rate": 7.249754889790539e-07,
"loss": 0.0779,
"step": 2515
},
{
"epoch": 4.460176991150442,
"grad_norm": 1.3553958194021836,
"learning_rate": 7.020531701984334e-07,
"loss": 0.0815,
"step": 2520
},
{
"epoch": 4.469026548672566,
"grad_norm": 1.4498354656139365,
"learning_rate": 6.794858890087275e-07,
"loss": 0.0804,
"step": 2525
},
{
"epoch": 4.477876106194691,
"grad_norm": 1.33445017208115,
"learning_rate": 6.572745071306286e-07,
"loss": 0.0825,
"step": 2530
},
{
"epoch": 4.486725663716814,
"grad_norm": 1.2094739845924918,
"learning_rate": 6.3541987269498e-07,
"loss": 0.0724,
"step": 2535
},
{
"epoch": 4.495575221238938,
"grad_norm": 1.2915737532727858,
"learning_rate": 6.139228202104008e-07,
"loss": 0.0745,
"step": 2540
},
{
"epoch": 4.504424778761062,
"grad_norm": 1.3914894835662084,
"learning_rate": 5.927841705314175e-07,
"loss": 0.0796,
"step": 2545
},
{
"epoch": 4.513274336283186,
"grad_norm": 1.292394593993158,
"learning_rate": 5.720047308271149e-07,
"loss": 0.078,
"step": 2550
},
{
"epoch": 4.522123893805309,
"grad_norm": 1.3661686769110382,
"learning_rate": 5.515852945503241e-07,
"loss": 0.0811,
"step": 2555
},
{
"epoch": 4.530973451327434,
"grad_norm": 1.2989196151190672,
"learning_rate": 5.315266414073161e-07,
"loss": 0.077,
"step": 2560
},
{
"epoch": 4.539823008849558,
"grad_norm": 1.415979853747286,
"learning_rate": 5.118295373280335e-07,
"loss": 0.0812,
"step": 2565
},
{
"epoch": 4.548672566371682,
"grad_norm": 1.3239654771249467,
"learning_rate": 4.924947344368448e-07,
"loss": 0.079,
"step": 2570
},
{
"epoch": 4.557522123893805,
"grad_norm": 1.4440576212335767,
"learning_rate": 4.7352297102382317e-07,
"loss": 0.0747,
"step": 2575
},
{
"epoch": 4.566371681415929,
"grad_norm": 1.3092000545668983,
"learning_rate": 4.549149715165546e-07,
"loss": 0.0754,
"step": 2580
},
{
"epoch": 4.575221238938053,
"grad_norm": 1.2120991458734485,
"learning_rate": 4.3667144645247463e-07,
"loss": 0.0782,
"step": 2585
},
{
"epoch": 4.584070796460177,
"grad_norm": 1.2333368517726013,
"learning_rate": 4.187930924517436e-07,
"loss": 0.076,
"step": 2590
},
{
"epoch": 4.592920353982301,
"grad_norm": 1.2863679695017827,
"learning_rate": 4.012805921906393e-07,
"loss": 0.0751,
"step": 2595
},
{
"epoch": 4.601769911504425,
"grad_norm": 1.3354897817488147,
"learning_rate": 3.8413461437549203e-07,
"loss": 0.0774,
"step": 2600
},
{
"epoch": 4.610619469026549,
"grad_norm": 1.3165459456119635,
"learning_rate": 3.673558137171496e-07,
"loss": 0.0758,
"step": 2605
},
{
"epoch": 4.619469026548672,
"grad_norm": 1.448724213864104,
"learning_rate": 3.5094483090597706e-07,
"loss": 0.0772,
"step": 2610
},
{
"epoch": 4.628318584070796,
"grad_norm": 1.401376773315023,
"learning_rate": 3.3490229258739794e-07,
"loss": 0.0799,
"step": 2615
},
{
"epoch": 4.6371681415929205,
"grad_norm": 1.3907713234287105,
"learning_rate": 3.1922881133795827e-07,
"loss": 0.0784,
"step": 2620
},
{
"epoch": 4.646017699115045,
"grad_norm": 1.2597928306220052,
"learning_rate": 3.0392498564193685e-07,
"loss": 0.076,
"step": 2625
},
{
"epoch": 4.654867256637168,
"grad_norm": 1.3365628143767583,
"learning_rate": 2.889913998684979e-07,
"loss": 0.0772,
"step": 2630
},
{
"epoch": 4.663716814159292,
"grad_norm": 1.284544234891874,
"learning_rate": 2.744286242493721e-07,
"loss": 0.0782,
"step": 2635
},
{
"epoch": 4.672566371681416,
"grad_norm": 1.2551721985478679,
"learning_rate": 2.602372148570864e-07,
"loss": 0.0811,
"step": 2640
},
{
"epoch": 4.68141592920354,
"grad_norm": 1.2553558965086657,
"learning_rate": 2.4641771358372537e-07,
"loss": 0.0807,
"step": 2645
},
{
"epoch": 4.6902654867256635,
"grad_norm": 1.2348755466860069,
"learning_rate": 2.329706481202443e-07,
"loss": 0.0801,
"step": 2650
},
{
"epoch": 4.699115044247788,
"grad_norm": 1.1907724591359092,
"learning_rate": 2.1989653193631667e-07,
"loss": 0.0738,
"step": 2655
},
{
"epoch": 4.707964601769912,
"grad_norm": 1.3510071207779715,
"learning_rate": 2.0719586426072858e-07,
"loss": 0.084,
"step": 2660
},
{
"epoch": 4.716814159292035,
"grad_norm": 1.295370917721196,
"learning_rate": 1.9486913006231846e-07,
"loss": 0.0776,
"step": 2665
},
{
"epoch": 4.725663716814159,
"grad_norm": 1.3413694479416487,
"learning_rate": 1.8291680003145074e-07,
"loss": 0.0797,
"step": 2670
},
{
"epoch": 4.734513274336283,
"grad_norm": 1.4581808262991165,
"learning_rate": 1.7133933056205366e-07,
"loss": 0.0804,
"step": 2675
},
{
"epoch": 4.743362831858407,
"grad_norm": 1.2741635529778859,
"learning_rate": 1.601371637341864e-07,
"loss": 0.0752,
"step": 2680
},
{
"epoch": 4.752212389380531,
"grad_norm": 1.2782881439793918,
"learning_rate": 1.49310727297155e-07,
"loss": 0.0774,
"step": 2685
},
{
"epoch": 4.761061946902655,
"grad_norm": 1.3120371018641195,
"learning_rate": 1.3886043465318522e-07,
"loss": 0.0813,
"step": 2690
},
{
"epoch": 4.769911504424779,
"grad_norm": 1.1988340814149363,
"learning_rate": 1.2878668484163303e-07,
"loss": 0.0777,
"step": 2695
},
{
"epoch": 4.778761061946903,
"grad_norm": 1.321759520377253,
"learning_rate": 1.1908986252375243e-07,
"loss": 0.078,
"step": 2700
},
{
"epoch": 4.787610619469026,
"grad_norm": 1.4194270019959467,
"learning_rate": 1.097703379679993e-07,
"loss": 0.0775,
"step": 2705
},
{
"epoch": 4.79646017699115,
"grad_norm": 1.2788598484535139,
"learning_rate": 1.0082846703590055e-07,
"loss": 0.0765,
"step": 2710
},
{
"epoch": 4.8053097345132745,
"grad_norm": 1.3148204824038185,
"learning_rate": 9.226459116846054e-08,
"loss": 0.0751,
"step": 2715
},
{
"epoch": 4.814159292035399,
"grad_norm": 1.344867899591603,
"learning_rate": 8.407903737312929e-08,
"loss": 0.0808,
"step": 2720
},
{
"epoch": 4.823008849557522,
"grad_norm": 1.278318794310315,
"learning_rate": 7.627211821130576e-08,
"loss": 0.0756,
"step": 2725
},
{
"epoch": 4.831858407079646,
"grad_norm": 1.2101543880815988,
"learning_rate": 6.884413178641414e-08,
"loss": 0.0728,
"step": 2730
},
{
"epoch": 4.84070796460177,
"grad_norm": 1.4344243105198062,
"learning_rate": 6.179536173251399e-08,
"loss": 0.0844,
"step": 2735
},
{
"epoch": 4.849557522123893,
"grad_norm": 1.3276414122745406,
"learning_rate": 5.5126077203471186e-08,
"loss": 0.0808,
"step": 2740
},
{
"epoch": 4.8584070796460175,
"grad_norm": 1.3506069083893197,
"learning_rate": 4.883653286268164e-08,
"loss": 0.0833,
"step": 2745
},
{
"epoch": 4.867256637168142,
"grad_norm": 1.3682014190493492,
"learning_rate": 4.292696887334691e-08,
"loss": 0.0771,
"step": 2750
},
{
"epoch": 4.876106194690266,
"grad_norm": 1.2828761325152347,
"learning_rate": 3.7397610889300384e-08,
"loss": 0.0801,
"step": 2755
},
{
"epoch": 4.88495575221239,
"grad_norm": 1.3118235861689838,
"learning_rate": 3.224867004639642e-08,
"loss": 0.0775,
"step": 2760
},
{
"epoch": 4.893805309734513,
"grad_norm": 1.2599575243275205,
"learning_rate": 2.7480342954444572e-08,
"loss": 0.0771,
"step": 2765
},
{
"epoch": 4.902654867256637,
"grad_norm": 1.2689219424528377,
"learning_rate": 2.309281168970223e-08,
"loss": 0.0745,
"step": 2770
},
{
"epoch": 4.911504424778761,
"grad_norm": 1.3933156024565598,
"learning_rate": 1.9086243787922453e-08,
"loss": 0.0754,
"step": 2775
},
{
"epoch": 4.920353982300885,
"grad_norm": 1.321697338465418,
"learning_rate": 1.5460792237960154e-08,
"loss": 0.0757,
"step": 2780
},
{
"epoch": 4.929203539823009,
"grad_norm": 1.2224130906716606,
"learning_rate": 1.2216595475921245e-08,
"loss": 0.0804,
"step": 2785
},
{
"epoch": 4.938053097345133,
"grad_norm": 1.3113541450877073,
"learning_rate": 9.353777379889073e-09,
"loss": 0.0774,
"step": 2790
},
{
"epoch": 4.946902654867257,
"grad_norm": 1.3170038581923675,
"learning_rate": 6.8724472651815474e-09,
"loss": 0.0793,
"step": 2795
},
{
"epoch": 4.95575221238938,
"grad_norm": 1.2713773209438923,
"learning_rate": 4.772699880187804e-09,
"loss": 0.0763,
"step": 2800
},
{
"epoch": 4.964601769911504,
"grad_norm": 1.201702049473955,
"learning_rate": 3.054615402743322e-09,
"loss": 0.0733,
"step": 2805
},
{
"epoch": 4.9734513274336285,
"grad_norm": 1.272942179409837,
"learning_rate": 1.7182594370701577e-09,
"loss": 0.0772,
"step": 2810
},
{
"epoch": 4.982300884955752,
"grad_norm": 1.480916856542288,
"learning_rate": 7.636830112733862e-10,
"loss": 0.0793,
"step": 2815
},
{
"epoch": 4.991150442477876,
"grad_norm": 1.328167260480697,
"learning_rate": 1.9092257538932956e-10,
"loss": 0.0787,
"step": 2820
},
{
"epoch": 5.0,
"grad_norm": 1.2170466610329072,
"learning_rate": 0.0,
"loss": 0.0793,
"step": 2825
},
{
"epoch": 5.0,
"eval_loss": 0.1581079065799713,
"eval_runtime": 341.788,
"eval_samples_per_second": 22.005,
"eval_steps_per_second": 0.345,
"step": 2825
},
{
"epoch": 5.0,
"step": 2825,
"total_flos": 2365990109184000.0,
"train_loss": 0.3740393664457102,
"train_runtime": 65777.1761,
"train_samples_per_second": 5.497,
"train_steps_per_second": 0.043
}
],
"logging_steps": 5,
"max_steps": 2825,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"total_flos": 2365990109184000.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}