{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.352441613588111,
"eval_steps": 500,
"global_step": 4100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005307855626326964,
"grad_norm": 0.8387855291366577,
"learning_rate": 4.999986097031132e-05,
"loss": 1.9588,
"step": 5
},
{
"epoch": 0.010615711252653927,
"grad_norm": 0.645732581615448,
"learning_rate": 4.999944388279162e-05,
"loss": 2.1093,
"step": 10
},
{
"epoch": 0.01592356687898089,
"grad_norm": 0.7065261602401733,
"learning_rate": 4.9998748742079904e-05,
"loss": 1.8366,
"step": 15
},
{
"epoch": 0.021231422505307854,
"grad_norm": 2.4020400047302246,
"learning_rate": 4.999777555590779e-05,
"loss": 2.298,
"step": 20
},
{
"epoch": 0.02653927813163482,
"grad_norm": 0.9261078834533691,
"learning_rate": 4.99965243350994e-05,
"loss": 1.854,
"step": 25
},
{
"epoch": 0.03184713375796178,
"grad_norm": 4.6172943115234375,
"learning_rate": 4.9994995093571314e-05,
"loss": 1.8602,
"step": 30
},
{
"epoch": 0.037154989384288746,
"grad_norm": 0.7544531226158142,
"learning_rate": 4.9993187848332315e-05,
"loss": 1.7065,
"step": 35
},
{
"epoch": 0.04246284501061571,
"grad_norm": 0.9400565028190613,
"learning_rate": 4.9991102619483254e-05,
"loss": 1.7744,
"step": 40
},
{
"epoch": 0.04777070063694268,
"grad_norm": 0.8333051204681396,
"learning_rate": 4.9988739430216834e-05,
"loss": 1.6745,
"step": 45
},
{
"epoch": 0.05307855626326964,
"grad_norm": 0.9034268260002136,
"learning_rate": 4.998609830681734e-05,
"loss": 1.7685,
"step": 50
},
{
"epoch": 0.058386411889596604,
"grad_norm": 5.156125068664551,
"learning_rate": 4.998317927866033e-05,
"loss": 1.822,
"step": 55
},
{
"epoch": 0.06369426751592357,
"grad_norm": 3.3585586547851562,
"learning_rate": 4.997998237821233e-05,
"loss": 1.8087,
"step": 60
},
{
"epoch": 0.06900212314225053,
"grad_norm": 1.1148409843444824,
"learning_rate": 4.9976507641030466e-05,
"loss": 1.5799,
"step": 65
},
{
"epoch": 0.07430997876857749,
"grad_norm": 2.8555004596710205,
"learning_rate": 4.997275510576207e-05,
"loss": 1.6364,
"step": 70
},
{
"epoch": 0.07961783439490445,
"grad_norm": 1.1666771173477173,
"learning_rate": 4.996872481414425e-05,
"loss": 1.6141,
"step": 75
},
{
"epoch": 0.08492569002123142,
"grad_norm": 1.0667269229888916,
"learning_rate": 4.9964416811003414e-05,
"loss": 1.7928,
"step": 80
},
{
"epoch": 0.09023354564755838,
"grad_norm": 1.2639814615249634,
"learning_rate": 4.9959831144254794e-05,
"loss": 1.4345,
"step": 85
},
{
"epoch": 0.09554140127388536,
"grad_norm": 1.1917731761932373,
"learning_rate": 4.995496786490189e-05,
"loss": 1.7151,
"step": 90
},
{
"epoch": 0.10084925690021232,
"grad_norm": 1.2275704145431519,
"learning_rate": 4.9949827027035924e-05,
"loss": 1.8297,
"step": 95
},
{
"epoch": 0.10615711252653928,
"grad_norm": 1.3175028562545776,
"learning_rate": 4.994440868783522e-05,
"loss": 1.6928,
"step": 100
},
{
"epoch": 0.11146496815286625,
"grad_norm": 1.2463750839233398,
"learning_rate": 4.993871290756459e-05,
"loss": 1.7687,
"step": 105
},
{
"epoch": 0.11677282377919321,
"grad_norm": 4.9171624183654785,
"learning_rate": 4.993273974957463e-05,
"loss": 1.6187,
"step": 110
},
{
"epoch": 0.12208067940552017,
"grad_norm": 6.226306438446045,
"learning_rate": 4.992648928030103e-05,
"loss": 1.7059,
"step": 115
},
{
"epoch": 0.12738853503184713,
"grad_norm": 1.2201507091522217,
"learning_rate": 4.991996156926387e-05,
"loss": 1.6283,
"step": 120
},
{
"epoch": 0.1326963906581741,
"grad_norm": 1.2521358728408813,
"learning_rate": 4.9913156689066806e-05,
"loss": 1.5449,
"step": 125
},
{
"epoch": 0.13800424628450106,
"grad_norm": 4.661998271942139,
"learning_rate": 4.990607471539626e-05,
"loss": 1.8635,
"step": 130
},
{
"epoch": 0.14331210191082802,
"grad_norm": 2.3003995418548584,
"learning_rate": 4.9898715727020594e-05,
"loss": 1.6994,
"step": 135
},
{
"epoch": 0.14861995753715498,
"grad_norm": 1.2704275846481323,
"learning_rate": 4.989107980578924e-05,
"loss": 1.6886,
"step": 140
},
{
"epoch": 0.15392781316348195,
"grad_norm": 5.455511093139648,
"learning_rate": 4.988316703663179e-05,
"loss": 1.7095,
"step": 145
},
{
"epoch": 0.1592356687898089,
"grad_norm": 1.3058491945266724,
"learning_rate": 4.987497750755702e-05,
"loss": 1.6245,
"step": 150
},
{
"epoch": 0.16454352441613587,
"grad_norm": 1.2686774730682373,
"learning_rate": 4.986651130965194e-05,
"loss": 1.7859,
"step": 155
},
{
"epoch": 0.16985138004246284,
"grad_norm": 1.4865705966949463,
"learning_rate": 4.9857768537080784e-05,
"loss": 1.6112,
"step": 160
},
{
"epoch": 0.1751592356687898,
"grad_norm": 1.1640167236328125,
"learning_rate": 4.9848749287083945e-05,
"loss": 1.736,
"step": 165
},
{
"epoch": 0.18046709129511676,
"grad_norm": 1.2125452756881714,
"learning_rate": 4.983945365997691e-05,
"loss": 1.6853,
"step": 170
},
{
"epoch": 0.18577494692144372,
"grad_norm": 1.4915114641189575,
"learning_rate": 4.9829881759149135e-05,
"loss": 1.6422,
"step": 175
},
{
"epoch": 0.1910828025477707,
"grad_norm": 3.3147950172424316,
"learning_rate": 4.982003369106287e-05,
"loss": 1.5487,
"step": 180
},
{
"epoch": 0.19639065817409768,
"grad_norm": 1.2265527248382568,
"learning_rate": 4.980990956525205e-05,
"loss": 1.6864,
"step": 185
},
{
"epoch": 0.20169851380042464,
"grad_norm": 1.448042631149292,
"learning_rate": 4.979950949432098e-05,
"loss": 1.5778,
"step": 190
},
{
"epoch": 0.2070063694267516,
"grad_norm": 1.3541866540908813,
"learning_rate": 4.9788833593943166e-05,
"loss": 1.6342,
"step": 195
},
{
"epoch": 0.21231422505307856,
"grad_norm": 1.1465802192687988,
"learning_rate": 4.977788198285995e-05,
"loss": 1.6218,
"step": 200
},
{
"epoch": 0.21762208067940553,
"grad_norm": 1.4200804233551025,
"learning_rate": 4.976665478287929e-05,
"loss": 1.6393,
"step": 205
},
{
"epoch": 0.2229299363057325,
"grad_norm": 3.8200623989105225,
"learning_rate": 4.9755152118874294e-05,
"loss": 1.7447,
"step": 210
},
{
"epoch": 0.22823779193205945,
"grad_norm": 1.822286605834961,
"learning_rate": 4.974337411878191e-05,
"loss": 1.4881,
"step": 215
},
{
"epoch": 0.23354564755838642,
"grad_norm": 1.3040127754211426,
"learning_rate": 4.9731320913601474e-05,
"loss": 1.6864,
"step": 220
},
{
"epoch": 0.23885350318471338,
"grad_norm": 1.3640131950378418,
"learning_rate": 4.9718992637393256e-05,
"loss": 1.5177,
"step": 225
},
{
"epoch": 0.24416135881104034,
"grad_norm": 1.1982786655426025,
"learning_rate": 4.970638942727698e-05,
"loss": 1.6818,
"step": 230
},
{
"epoch": 0.2494692144373673,
"grad_norm": 1.3077235221862793,
"learning_rate": 4.969351142343025e-05,
"loss": 1.5376,
"step": 235
},
{
"epoch": 0.25477707006369427,
"grad_norm": 1.420879602432251,
"learning_rate": 4.9680358769087076e-05,
"loss": 1.5622,
"step": 240
},
{
"epoch": 0.26008492569002123,
"grad_norm": 1.5823885202407837,
"learning_rate": 4.966693161053621e-05,
"loss": 1.6004,
"step": 245
},
{
"epoch": 0.2653927813163482,
"grad_norm": 1.6158925294876099,
"learning_rate": 4.965323009711954e-05,
"loss": 1.9304,
"step": 250
},
{
"epoch": 0.27070063694267515,
"grad_norm": 1.4248143434524536,
"learning_rate": 4.963925438123044e-05,
"loss": 1.682,
"step": 255
},
{
"epoch": 0.2760084925690021,
"grad_norm": 1.3910802602767944,
"learning_rate": 4.962500461831207e-05,
"loss": 1.8035,
"step": 260
},
{
"epoch": 0.2813163481953291,
"grad_norm": 1.4260281324386597,
"learning_rate": 4.9610480966855625e-05,
"loss": 1.5745,
"step": 265
},
{
"epoch": 0.28662420382165604,
"grad_norm": 9.649313926696777,
"learning_rate": 4.959568358839861e-05,
"loss": 1.5141,
"step": 270
},
{
"epoch": 0.291932059447983,
"grad_norm": 1.6886736154556274,
"learning_rate": 4.958061264752303e-05,
"loss": 1.6512,
"step": 275
},
{
"epoch": 0.29723991507430997,
"grad_norm": 1.3794057369232178,
"learning_rate": 4.956526831185353e-05,
"loss": 1.5219,
"step": 280
},
{
"epoch": 0.30254777070063693,
"grad_norm": 1.388073205947876,
"learning_rate": 4.9549650752055564e-05,
"loss": 1.6123,
"step": 285
},
{
"epoch": 0.3078556263269639,
"grad_norm": 1.3487626314163208,
"learning_rate": 4.9533760141833506e-05,
"loss": 1.5851,
"step": 290
},
{
"epoch": 0.31316348195329086,
"grad_norm": 1.3812495470046997,
"learning_rate": 4.9517596657928665e-05,
"loss": 1.6599,
"step": 295
},
{
"epoch": 0.3184713375796178,
"grad_norm": 1.5697531700134277,
"learning_rate": 4.950116048011739e-05,
"loss": 1.5878,
"step": 300
},
{
"epoch": 0.3237791932059448,
"grad_norm": 1.511542558670044,
"learning_rate": 4.9484451791209e-05,
"loss": 1.5842,
"step": 305
},
{
"epoch": 0.32908704883227174,
"grad_norm": 1.4298487901687622,
"learning_rate": 4.9467470777043806e-05,
"loss": 1.624,
"step": 310
},
{
"epoch": 0.3343949044585987,
"grad_norm": 1.5230979919433594,
"learning_rate": 4.9450217626491016e-05,
"loss": 1.478,
"step": 315
},
{
"epoch": 0.33970276008492567,
"grad_norm": 1.4259607791900635,
"learning_rate": 4.943269253144664e-05,
"loss": 1.606,
"step": 320
},
{
"epoch": 0.34501061571125263,
"grad_norm": 1.4630590677261353,
"learning_rate": 4.9414895686831376e-05,
"loss": 1.6398,
"step": 325
},
{
"epoch": 0.3503184713375796,
"grad_norm": 6.577169895172119,
"learning_rate": 4.939682729058839e-05,
"loss": 1.6031,
"step": 330
},
{
"epoch": 0.35562632696390656,
"grad_norm": 1.5499671697616577,
"learning_rate": 4.9378487543681154e-05,
"loss": 1.5839,
"step": 335
},
{
"epoch": 0.3609341825902335,
"grad_norm": 1.4330203533172607,
"learning_rate": 4.935987665009123e-05,
"loss": 1.6147,
"step": 340
},
{
"epoch": 0.3662420382165605,
"grad_norm": 1.4522225856781006,
"learning_rate": 4.9340994816815946e-05,
"loss": 1.5507,
"step": 345
},
{
"epoch": 0.37154989384288745,
"grad_norm": 1.4307374954223633,
"learning_rate": 4.9321842253866136e-05,
"loss": 1.617,
"step": 350
},
{
"epoch": 0.37685774946921446,
"grad_norm": 1.288672685623169,
"learning_rate": 4.930241917426379e-05,
"loss": 1.5612,
"step": 355
},
{
"epoch": 0.3821656050955414,
"grad_norm": 4.4466938972473145,
"learning_rate": 4.928272579403969e-05,
"loss": 1.6811,
"step": 360
},
{
"epoch": 0.3874734607218684,
"grad_norm": 1.278381586074829,
"learning_rate": 4.9262762332230996e-05,
"loss": 1.6635,
"step": 365
},
{
"epoch": 0.39278131634819535,
"grad_norm": 1.3399561643600464,
"learning_rate": 4.924252901087881e-05,
"loss": 1.5004,
"step": 370
},
{
"epoch": 0.3980891719745223,
"grad_norm": 1.4491037130355835,
"learning_rate": 4.922202605502573e-05,
"loss": 1.5217,
"step": 375
},
{
"epoch": 0.4033970276008493,
"grad_norm": 1.376114845275879,
"learning_rate": 4.920125369271332e-05,
"loss": 1.5523,
"step": 380
},
{
"epoch": 0.40870488322717624,
"grad_norm": 1.5188900232315063,
"learning_rate": 4.918021215497958e-05,
"loss": 1.6177,
"step": 385
},
{
"epoch": 0.4140127388535032,
"grad_norm": 3.417870044708252,
"learning_rate": 4.9158901675856395e-05,
"loss": 1.6203,
"step": 390
},
{
"epoch": 0.41932059447983017,
"grad_norm": 1.3470820188522339,
"learning_rate": 4.913732249236689e-05,
"loss": 1.4859,
"step": 395
},
{
"epoch": 0.42462845010615713,
"grad_norm": 1.4560948610305786,
"learning_rate": 4.911547484452286e-05,
"loss": 1.6273,
"step": 400
},
{
"epoch": 0.4299363057324841,
"grad_norm": 1.5756402015686035,
"learning_rate": 4.909335897532202e-05,
"loss": 1.7351,
"step": 405
},
{
"epoch": 0.43524416135881105,
"grad_norm": 1.5693994760513306,
"learning_rate": 4.9070975130745387e-05,
"loss": 1.4738,
"step": 410
},
{
"epoch": 0.440552016985138,
"grad_norm": 1.669011116027832,
"learning_rate": 4.904832355975445e-05,
"loss": 1.6116,
"step": 415
},
{
"epoch": 0.445859872611465,
"grad_norm": 1.4707449674606323,
"learning_rate": 4.902540451428849e-05,
"loss": 1.5715,
"step": 420
},
{
"epoch": 0.45116772823779194,
"grad_norm": 1.2995195388793945,
"learning_rate": 4.900221824926173e-05,
"loss": 1.6486,
"step": 425
},
{
"epoch": 0.4564755838641189,
"grad_norm": 3.4517061710357666,
"learning_rate": 4.89787650225605e-05,
"loss": 1.6355,
"step": 430
},
{
"epoch": 0.46178343949044587,
"grad_norm": 1.5337308645248413,
"learning_rate": 4.895504509504039e-05,
"loss": 1.6102,
"step": 435
},
{
"epoch": 0.46709129511677283,
"grad_norm": 3.0765092372894287,
"learning_rate": 4.893105873052333e-05,
"loss": 1.6678,
"step": 440
},
{
"epoch": 0.4723991507430998,
"grad_norm": 1.5984159708023071,
"learning_rate": 4.8906806195794655e-05,
"loss": 1.6586,
"step": 445
},
{
"epoch": 0.47770700636942676,
"grad_norm": 1.6139107942581177,
"learning_rate": 4.888228776060016e-05,
"loss": 1.447,
"step": 450
},
{
"epoch": 0.4830148619957537,
"grad_norm": 2.6926217079162598,
"learning_rate": 4.8857503697643094e-05,
"loss": 1.6684,
"step": 455
},
{
"epoch": 0.4883227176220807,
"grad_norm": 7.389099597930908,
"learning_rate": 4.883245428258107e-05,
"loss": 1.6146,
"step": 460
},
{
"epoch": 0.49363057324840764,
"grad_norm": 1.4098495244979858,
"learning_rate": 4.880713979402311e-05,
"loss": 1.4861,
"step": 465
},
{
"epoch": 0.4989384288747346,
"grad_norm": 1.5015437602996826,
"learning_rate": 4.8781560513526414e-05,
"loss": 1.7288,
"step": 470
},
{
"epoch": 0.5042462845010616,
"grad_norm": 1.5533912181854248,
"learning_rate": 4.875571672559337e-05,
"loss": 1.5165,
"step": 475
},
{
"epoch": 0.5095541401273885,
"grad_norm": 1.6178662776947021,
"learning_rate": 4.8729608717668265e-05,
"loss": 1.4429,
"step": 480
},
{
"epoch": 0.5148619957537155,
"grad_norm": 1.4526007175445557,
"learning_rate": 4.870323678013415e-05,
"loss": 1.5218,
"step": 485
},
{
"epoch": 0.5201698513800425,
"grad_norm": 1.5645301342010498,
"learning_rate": 4.867660120630962e-05,
"loss": 1.5621,
"step": 490
},
{
"epoch": 0.5254777070063694,
"grad_norm": 1.4064879417419434,
"learning_rate": 4.864970229244552e-05,
"loss": 1.5439,
"step": 495
},
{
"epoch": 0.5307855626326964,
"grad_norm": 1.5387187004089355,
"learning_rate": 4.862254033772164e-05,
"loss": 1.5439,
"step": 500
},
{
"epoch": 0.5360934182590233,
"grad_norm": 15.129183769226074,
"learning_rate": 4.859511564424345e-05,
"loss": 1.7019,
"step": 505
},
{
"epoch": 0.5414012738853503,
"grad_norm": 1.44954252243042,
"learning_rate": 4.856742851703866e-05,
"loss": 1.4983,
"step": 510
},
{
"epoch": 0.5467091295116773,
"grad_norm": 1.40328049659729,
"learning_rate": 4.8539479264053896e-05,
"loss": 1.5446,
"step": 515
},
{
"epoch": 0.5520169851380042,
"grad_norm": 1.4867098331451416,
"learning_rate": 4.8511268196151224e-05,
"loss": 1.5093,
"step": 520
},
{
"epoch": 0.5573248407643312,
"grad_norm": 1.2966865301132202,
"learning_rate": 4.848279562710474e-05,
"loss": 1.5193,
"step": 525
},
{
"epoch": 0.5626326963906582,
"grad_norm": 1.4985305070877075,
"learning_rate": 4.845406187359701e-05,
"loss": 1.5356,
"step": 530
},
{
"epoch": 0.5679405520169851,
"grad_norm": 1.6481366157531738,
"learning_rate": 4.842506725521565e-05,
"loss": 1.5552,
"step": 535
},
{
"epoch": 0.5732484076433121,
"grad_norm": 1.7323246002197266,
"learning_rate": 4.839581209444966e-05,
"loss": 1.6082,
"step": 540
},
{
"epoch": 0.578556263269639,
"grad_norm": 1.4038372039794922,
"learning_rate": 4.8366296716685914e-05,
"loss": 1.7123,
"step": 545
},
{
"epoch": 0.583864118895966,
"grad_norm": 1.4740629196166992,
"learning_rate": 4.833652145020551e-05,
"loss": 1.5231,
"step": 550
},
{
"epoch": 0.589171974522293,
"grad_norm": 1.6231038570404053,
"learning_rate": 4.830648662618015e-05,
"loss": 1.3732,
"step": 555
},
{
"epoch": 0.5944798301486199,
"grad_norm": 1.4000989198684692,
"learning_rate": 4.827619257866839e-05,
"loss": 1.7253,
"step": 560
},
{
"epoch": 0.5997876857749469,
"grad_norm": 3.50241756439209,
"learning_rate": 4.8245639644612006e-05,
"loss": 1.4861,
"step": 565
},
{
"epoch": 0.6050955414012739,
"grad_norm": 2.493551731109619,
"learning_rate": 4.821482816383218e-05,
"loss": 1.5129,
"step": 570
},
{
"epoch": 0.6104033970276008,
"grad_norm": 1.4700591564178467,
"learning_rate": 4.818375847902577e-05,
"loss": 1.4915,
"step": 575
},
{
"epoch": 0.6157112526539278,
"grad_norm": 1.4178653955459595,
"learning_rate": 4.8152430935761456e-05,
"loss": 1.5438,
"step": 580
},
{
"epoch": 0.6210191082802548,
"grad_norm": 1.6205229759216309,
"learning_rate": 4.812084588247592e-05,
"loss": 1.666,
"step": 585
},
{
"epoch": 0.6263269639065817,
"grad_norm": 1.566666841506958,
"learning_rate": 4.808900367046999e-05,
"loss": 1.7644,
"step": 590
},
{
"epoch": 0.6316348195329087,
"grad_norm": 1.8027448654174805,
"learning_rate": 4.8056904653904666e-05,
"loss": 1.6192,
"step": 595
},
{
"epoch": 0.6369426751592356,
"grad_norm": 1.7948691844940186,
"learning_rate": 4.8024549189797276e-05,
"loss": 1.5361,
"step": 600
},
{
"epoch": 0.6422505307855626,
"grad_norm": 1.4708564281463623,
"learning_rate": 4.7991937638017415e-05,
"loss": 1.7171,
"step": 605
},
{
"epoch": 0.6475583864118896,
"grad_norm": 4.92915678024292,
"learning_rate": 4.795907036128299e-05,
"loss": 1.5913,
"step": 610
},
{
"epoch": 0.6528662420382165,
"grad_norm": 1.3035740852355957,
"learning_rate": 4.792594772515619e-05,
"loss": 1.7267,
"step": 615
},
{
"epoch": 0.6581740976645435,
"grad_norm": 1.4440399408340454,
"learning_rate": 4.78925700980394e-05,
"loss": 1.6977,
"step": 620
},
{
"epoch": 0.6634819532908705,
"grad_norm": 1.6491578817367554,
"learning_rate": 4.78589378511711e-05,
"loss": 1.6391,
"step": 625
},
{
"epoch": 0.6687898089171974,
"grad_norm": 1.6024360656738281,
"learning_rate": 4.782505135862176e-05,
"loss": 1.6311,
"step": 630
},
{
"epoch": 0.6740976645435244,
"grad_norm": 1.5361950397491455,
"learning_rate": 4.7790910997289664e-05,
"loss": 1.5929,
"step": 635
},
{
"epoch": 0.6794055201698513,
"grad_norm": 1.4991101026535034,
"learning_rate": 4.77565171468967e-05,
"loss": 1.5092,
"step": 640
},
{
"epoch": 0.6847133757961783,
"grad_norm": 4.35531759262085,
"learning_rate": 4.77218701899842e-05,
"loss": 1.6607,
"step": 645
},
{
"epoch": 0.6900212314225053,
"grad_norm": 1.4146044254302979,
"learning_rate": 4.7686970511908594e-05,
"loss": 1.5518,
"step": 650
},
{
"epoch": 0.6953290870488322,
"grad_norm": 1.5280144214630127,
"learning_rate": 4.7651818500837184e-05,
"loss": 1.7207,
"step": 655
},
{
"epoch": 0.7006369426751592,
"grad_norm": 1.5810437202453613,
"learning_rate": 4.761641454774386e-05,
"loss": 1.4195,
"step": 660
},
{
"epoch": 0.7059447983014862,
"grad_norm": 1.454335331916809,
"learning_rate": 4.758075904640463e-05,
"loss": 1.4806,
"step": 665
},
{
"epoch": 0.7112526539278131,
"grad_norm": 1.6834053993225098,
"learning_rate": 4.7544852393393375e-05,
"loss": 1.4771,
"step": 670
},
{
"epoch": 0.7165605095541401,
"grad_norm": 1.5010559558868408,
"learning_rate": 4.750869498807735e-05,
"loss": 1.5019,
"step": 675
},
{
"epoch": 0.721868365180467,
"grad_norm": 1.5334972143173218,
"learning_rate": 4.747228723261278e-05,
"loss": 1.4645,
"step": 680
},
{
"epoch": 0.727176220806794,
"grad_norm": 1.3904098272323608,
"learning_rate": 4.743562953194039e-05,
"loss": 1.4856,
"step": 685
},
{
"epoch": 0.732484076433121,
"grad_norm": 5.131705284118652,
"learning_rate": 4.739872229378085e-05,
"loss": 1.6691,
"step": 690
},
{
"epoch": 0.7377919320594479,
"grad_norm": 1.4987908601760864,
"learning_rate": 4.736156592863032e-05,
"loss": 1.581,
"step": 695
},
{
"epoch": 0.7430997876857749,
"grad_norm": 1.5452443361282349,
"learning_rate": 4.732416084975585e-05,
"loss": 1.5531,
"step": 700
},
{
"epoch": 0.7484076433121019,
"grad_norm": 1.5594438314437866,
"learning_rate": 4.7286507473190736e-05,
"loss": 1.5902,
"step": 705
},
{
"epoch": 0.7537154989384289,
"grad_norm": 1.5028551816940308,
"learning_rate": 4.724860621772995e-05,
"loss": 1.4885,
"step": 710
},
{
"epoch": 0.7590233545647559,
"grad_norm": 3.062858819961548,
"learning_rate": 4.721045750492549e-05,
"loss": 1.5931,
"step": 715
},
{
"epoch": 0.7643312101910829,
"grad_norm": 1.6405315399169922,
"learning_rate": 4.717206175908164e-05,
"loss": 1.3859,
"step": 720
},
{
"epoch": 0.7696390658174098,
"grad_norm": 1.4577491283416748,
"learning_rate": 4.713341940725029e-05,
"loss": 1.5765,
"step": 725
},
{
"epoch": 0.7749469214437368,
"grad_norm": 1.5505036115646362,
"learning_rate": 4.7094530879226166e-05,
"loss": 1.6068,
"step": 730
},
{
"epoch": 0.7802547770700637,
"grad_norm": 1.4415700435638428,
"learning_rate": 4.705539660754207e-05,
"loss": 1.5555,
"step": 735
},
{
"epoch": 0.7855626326963907,
"grad_norm": 1.699349045753479,
"learning_rate": 4.701601702746405e-05,
"loss": 1.4237,
"step": 740
},
{
"epoch": 0.7908704883227177,
"grad_norm": 1.6142672300338745,
"learning_rate": 4.697639257698657e-05,
"loss": 1.5193,
"step": 745
},
{
"epoch": 0.7961783439490446,
"grad_norm": 1.498228907585144,
"learning_rate": 4.6936523696827615e-05,
"loss": 1.548,
"step": 750
},
{
"epoch": 0.8014861995753716,
"grad_norm": 1.5121357440948486,
"learning_rate": 4.6896410830423845e-05,
"loss": 1.546,
"step": 755
},
{
"epoch": 0.8067940552016986,
"grad_norm": 1.4195791482925415,
"learning_rate": 4.685605442392559e-05,
"loss": 1.5297,
"step": 760
},
{
"epoch": 0.8121019108280255,
"grad_norm": 1.5095983743667603,
"learning_rate": 4.681545492619195e-05,
"loss": 1.6098,
"step": 765
},
{
"epoch": 0.8174097664543525,
"grad_norm": 1.597701072692871,
"learning_rate": 4.677461278878577e-05,
"loss": 1.7606,
"step": 770
},
{
"epoch": 0.8227176220806794,
"grad_norm": 1.5142344236373901,
"learning_rate": 4.673352846596861e-05,
"loss": 1.4081,
"step": 775
},
{
"epoch": 0.8280254777070064,
"grad_norm": 1.5927125215530396,
"learning_rate": 4.669220241469573e-05,
"loss": 1.4579,
"step": 780
},
{
"epoch": 0.8333333333333334,
"grad_norm": 1.527633547782898,
"learning_rate": 4.665063509461097e-05,
"loss": 1.5787,
"step": 785
},
{
"epoch": 0.8386411889596603,
"grad_norm": 1.558786153793335,
"learning_rate": 4.660882696804165e-05,
"loss": 1.5751,
"step": 790
},
{
"epoch": 0.8439490445859873,
"grad_norm": 1.5467716455459595,
"learning_rate": 4.656677849999345e-05,
"loss": 1.4025,
"step": 795
},
{
"epoch": 0.8492569002123143,
"grad_norm": 1.6665401458740234,
"learning_rate": 4.652449015814518e-05,
"loss": 1.5634,
"step": 800
},
{
"epoch": 0.8545647558386412,
"grad_norm": 1.736045479774475,
"learning_rate": 4.648196241284367e-05,
"loss": 1.5068,
"step": 805
},
{
"epoch": 0.8598726114649682,
"grad_norm": 14.421595573425293,
"learning_rate": 4.643919573709843e-05,
"loss": 1.5791,
"step": 810
},
{
"epoch": 0.8651804670912951,
"grad_norm": 3.725691080093384,
"learning_rate": 4.639619060657648e-05,
"loss": 1.5196,
"step": 815
},
{
"epoch": 0.8704883227176221,
"grad_norm": 1.8308895826339722,
"learning_rate": 4.6352947499597024e-05,
"loss": 1.5593,
"step": 820
},
{
"epoch": 0.8757961783439491,
"grad_norm": 1.6904733180999756,
"learning_rate": 4.630946689712609e-05,
"loss": 1.568,
"step": 825
},
{
"epoch": 0.881104033970276,
"grad_norm": 1.5767687559127808,
"learning_rate": 4.626574928277127e-05,
"loss": 1.5503,
"step": 830
},
{
"epoch": 0.886411889596603,
"grad_norm": 1.6126394271850586,
"learning_rate": 4.622179514277626e-05,
"loss": 1.6526,
"step": 835
},
{
"epoch": 0.89171974522293,
"grad_norm": 2.0911881923675537,
"learning_rate": 4.618646186075468e-05,
"loss": 1.6366,
"step": 840
},
{
"epoch": 0.8970276008492569,
"grad_norm": 1.9342654943466187,
"learning_rate": 4.614208320833528e-05,
"loss": 1.5226,
"step": 845
},
{
"epoch": 0.9023354564755839,
"grad_norm": 2.9547078609466553,
"learning_rate": 4.6097469405736174e-05,
"loss": 1.5154,
"step": 850
},
{
"epoch": 0.9076433121019108,
"grad_norm": 1.7412949800491333,
"learning_rate": 4.605262094916878e-05,
"loss": 1.4203,
"step": 855
},
{
"epoch": 0.9129511677282378,
"grad_norm": 1.4583709239959717,
"learning_rate": 4.6007538337454464e-05,
"loss": 1.4819,
"step": 860
},
{
"epoch": 0.9182590233545648,
"grad_norm": 1.556915521621704,
"learning_rate": 4.5962222072018955e-05,
"loss": 1.4277,
"step": 865
},
{
"epoch": 0.9235668789808917,
"grad_norm": 1.6333413124084473,
"learning_rate": 4.5916672656886746e-05,
"loss": 1.48,
"step": 870
},
{
"epoch": 0.9288747346072187,
"grad_norm": 1.5821317434310913,
"learning_rate": 4.587089059867552e-05,
"loss": 1.532,
"step": 875
},
{
"epoch": 0.9341825902335457,
"grad_norm": 1.4887222051620483,
"learning_rate": 4.58248764065905e-05,
"loss": 1.6305,
"step": 880
},
{
"epoch": 0.9394904458598726,
"grad_norm": 1.8513277769088745,
"learning_rate": 4.577863059241879e-05,
"loss": 1.6394,
"step": 885
},
{
"epoch": 0.9447983014861996,
"grad_norm": 1.4932013750076294,
"learning_rate": 4.573215367052369e-05,
"loss": 1.7202,
"step": 890
},
{
"epoch": 0.9501061571125266,
"grad_norm": 8.590271949768066,
"learning_rate": 4.568544615783894e-05,
"loss": 1.4357,
"step": 895
},
{
"epoch": 0.9554140127388535,
"grad_norm": 1.458540439605713,
"learning_rate": 4.5638508573863035e-05,
"loss": 1.6818,
"step": 900
},
{
"epoch": 0.9607218683651805,
"grad_norm": 1.7310667037963867,
"learning_rate": 4.559134144065338e-05,
"loss": 1.6905,
"step": 905
},
{
"epoch": 0.9660297239915074,
"grad_norm": 1.4106065034866333,
"learning_rate": 4.554394528282052e-05,
"loss": 1.5248,
"step": 910
},
{
"epoch": 0.9713375796178344,
"grad_norm": 1.5425328016281128,
"learning_rate": 4.549632062752231e-05,
"loss": 1.5851,
"step": 915
},
{
"epoch": 0.9766454352441614,
"grad_norm": 1.6904933452606201,
"learning_rate": 4.5448468004458025e-05,
"loss": 1.434,
"step": 920
},
{
"epoch": 0.9819532908704883,
"grad_norm": 12.340048789978027,
"learning_rate": 4.5400387945862486e-05,
"loss": 1.567,
"step": 925
},
{
"epoch": 0.9872611464968153,
"grad_norm": 8.410961151123047,
"learning_rate": 4.5352080986500135e-05,
"loss": 1.5363,
"step": 930
},
{
"epoch": 0.9925690021231423,
"grad_norm": 1.6052480936050415,
"learning_rate": 4.530354766365911e-05,
"loss": 1.6247,
"step": 935
},
{
"epoch": 0.9978768577494692,
"grad_norm": 1.6782705783843994,
"learning_rate": 4.525478851714522e-05,
"loss": 1.4887,
"step": 940
},
{
"epoch": 1.0031847133757963,
"grad_norm": 1.480660080909729,
"learning_rate": 4.5205804089275976e-05,
"loss": 1.441,
"step": 945
},
{
"epoch": 1.0084925690021231,
"grad_norm": 4.800995349884033,
"learning_rate": 4.5156594924874575e-05,
"loss": 1.5609,
"step": 950
},
{
"epoch": 1.0138004246284502,
"grad_norm": 1.552259087562561,
"learning_rate": 4.510716157126379e-05,
"loss": 1.5113,
"step": 955
},
{
"epoch": 1.019108280254777,
"grad_norm": 1.4873735904693604,
"learning_rate": 4.5057504578259924e-05,
"loss": 1.5546,
"step": 960
},
{
"epoch": 1.0244161358811041,
"grad_norm": 1.6509064435958862,
"learning_rate": 4.500762449816668e-05,
"loss": 1.4914,
"step": 965
},
{
"epoch": 1.029723991507431,
"grad_norm": 2.9540882110595703,
"learning_rate": 4.495752188576902e-05,
"loss": 1.3561,
"step": 970
},
{
"epoch": 1.035031847133758,
"grad_norm": 1.5996639728546143,
"learning_rate": 4.4907197298327e-05,
"loss": 1.6173,
"step": 975
},
{
"epoch": 1.040339702760085,
"grad_norm": 1.5103893280029297,
"learning_rate": 4.485665129556954e-05,
"loss": 1.6103,
"step": 980
},
{
"epoch": 1.045647558386412,
"grad_norm": 1.7204993963241577,
"learning_rate": 4.4805884439688244e-05,
"loss": 1.5181,
"step": 985
},
{
"epoch": 1.0509554140127388,
"grad_norm": 1.6108498573303223,
"learning_rate": 4.475489729533114e-05,
"loss": 1.5974,
"step": 990
},
{
"epoch": 1.056263269639066,
"grad_norm": 1.7573896646499634,
"learning_rate": 4.470369042959637e-05,
"loss": 1.4313,
"step": 995
},
{
"epoch": 1.0615711252653928,
"grad_norm": 1.7552425861358643,
"learning_rate": 4.465226441202589e-05,
"loss": 1.4407,
"step": 1000
},
{
"epoch": 1.0668789808917198,
"grad_norm": 1.6480523347854614,
"learning_rate": 4.460061981459917e-05,
"loss": 1.597,
"step": 1005
},
{
"epoch": 1.0721868365180467,
"grad_norm": 2.603396415710449,
"learning_rate": 4.454875721172679e-05,
"loss": 1.5327,
"step": 1010
},
{
"epoch": 1.0774946921443738,
"grad_norm": 1.5323936939239502,
"learning_rate": 4.4496677180244065e-05,
"loss": 1.5541,
"step": 1015
},
{
"epoch": 1.0828025477707006,
"grad_norm": 1.6930556297302246,
"learning_rate": 4.444438029940465e-05,
"loss": 1.5251,
"step": 1020
},
{
"epoch": 1.0881104033970277,
"grad_norm": 1.7261557579040527,
"learning_rate": 4.439186715087406e-05,
"loss": 1.603,
"step": 1025
},
{
"epoch": 1.0934182590233545,
"grad_norm": 3.680421829223633,
"learning_rate": 4.4339138318723246e-05,
"loss": 1.529,
"step": 1030
},
{
"epoch": 1.0987261146496816,
"grad_norm": 1.6117990016937256,
"learning_rate": 4.428619438942204e-05,
"loss": 1.6533,
"step": 1035
},
{
"epoch": 1.1040339702760085,
"grad_norm": 1.8256531953811646,
"learning_rate": 4.42330359518327e-05,
"loss": 1.5175,
"step": 1040
},
{
"epoch": 1.1093418259023355,
"grad_norm": 1.751794457435608,
"learning_rate": 4.417966359720329e-05,
"loss": 1.5462,
"step": 1045
},
{
"epoch": 1.1146496815286624,
"grad_norm": 1.6888757944107056,
"learning_rate": 4.4126077919161165e-05,
"loss": 1.5416,
"step": 1050
},
{
"epoch": 1.1199575371549895,
"grad_norm": 1.6523631811141968,
"learning_rate": 4.407227951370635e-05,
"loss": 1.5035,
"step": 1055
},
{
"epoch": 1.1252653927813163,
"grad_norm": 1.532172441482544,
"learning_rate": 4.401826897920487e-05,
"loss": 1.5502,
"step": 1060
},
{
"epoch": 1.1305732484076434,
"grad_norm": 1.8720592260360718,
"learning_rate": 4.396404691638215e-05,
"loss": 1.5217,
"step": 1065
},
{
"epoch": 1.1358811040339702,
"grad_norm": 1.553623080253601,
"learning_rate": 4.390961392831633e-05,
"loss": 1.4841,
"step": 1070
},
{
"epoch": 1.1411889596602973,
"grad_norm": 3.326525926589966,
"learning_rate": 4.38549706204315e-05,
"loss": 1.5746,
"step": 1075
},
{
"epoch": 1.1464968152866242,
"grad_norm": 1.6932830810546875,
"learning_rate": 4.380011760049104e-05,
"loss": 1.4295,
"step": 1080
},
{
"epoch": 1.1518046709129512,
"grad_norm": 1.6742303371429443,
"learning_rate": 4.37450554785908e-05,
"loss": 1.6131,
"step": 1085
},
{
"epoch": 1.157112526539278,
"grad_norm": 1.4667819738388062,
"learning_rate": 4.368978486715237e-05,
"loss": 1.5901,
"step": 1090
},
{
"epoch": 1.1624203821656052,
"grad_norm": 1.6580276489257812,
"learning_rate": 4.363430638091621e-05,
"loss": 1.4339,
"step": 1095
},
{
"epoch": 1.167728237791932,
"grad_norm": 1.791914939880371,
"learning_rate": 4.357862063693486e-05,
"loss": 1.6448,
"step": 1100
},
{
"epoch": 1.173036093418259,
"grad_norm": 1.6610525846481323,
"learning_rate": 4.352272825456605e-05,
"loss": 1.4427,
"step": 1105
},
{
"epoch": 1.178343949044586,
"grad_norm": 1.6194666624069214,
"learning_rate": 4.346662985546581e-05,
"loss": 1.5659,
"step": 1110
},
{
"epoch": 1.183651804670913,
"grad_norm": 1.7261152267456055,
"learning_rate": 4.34103260635816e-05,
"loss": 1.7018,
"step": 1115
},
{
"epoch": 1.1889596602972399,
"grad_norm": 1.4662343263626099,
"learning_rate": 4.335381750514529e-05,
"loss": 1.3376,
"step": 1120
},
{
"epoch": 1.194267515923567,
"grad_norm": 1.6291650533676147,
"learning_rate": 4.329710480866627e-05,
"loss": 1.5875,
"step": 1125
},
{
"epoch": 1.1995753715498938,
"grad_norm": 1.7333427667617798,
"learning_rate": 4.3240188604924436e-05,
"loss": 1.6739,
"step": 1130
},
{
"epoch": 1.2048832271762209,
"grad_norm": 1.6119394302368164,
"learning_rate": 4.3183069526963135e-05,
"loss": 1.5353,
"step": 1135
},
{
"epoch": 1.2101910828025477,
"grad_norm": 1.6907188892364502,
"learning_rate": 4.312574821008219e-05,
"loss": 1.4782,
"step": 1140
},
{
"epoch": 1.2154989384288748,
"grad_norm": 1.735899567604065,
"learning_rate": 4.30682252918308e-05,
"loss": 1.6236,
"step": 1145
},
{
"epoch": 1.2208067940552016,
"grad_norm": 1.4847278594970703,
"learning_rate": 4.301050141200041e-05,
"loss": 1.5917,
"step": 1150
},
{
"epoch": 1.2261146496815287,
"grad_norm": 1.5689457654953003,
"learning_rate": 4.295257721261768e-05,
"loss": 1.4878,
"step": 1155
},
{
"epoch": 1.2314225053078556,
"grad_norm": 1.6647305488586426,
"learning_rate": 4.289445333793728e-05,
"loss": 1.4494,
"step": 1160
},
{
"epoch": 1.2367303609341826,
"grad_norm": 1.6868528127670288,
"learning_rate": 4.283613043443474e-05,
"loss": 1.4505,
"step": 1165
},
{
"epoch": 1.2420382165605095,
"grad_norm": 1.5382746458053589,
"learning_rate": 4.277760915079928e-05,
"loss": 1.4367,
"step": 1170
},
{
"epoch": 1.2473460721868366,
"grad_norm": 1.7457520961761475,
"learning_rate": 4.271889013792656e-05,
"loss": 1.5249,
"step": 1175
},
{
"epoch": 1.2526539278131634,
"grad_norm": 1.7499550580978394,
"learning_rate": 4.2659974048911474e-05,
"loss": 1.4727,
"step": 1180
},
{
"epoch": 1.2579617834394905,
"grad_norm": 1.7218068838119507,
"learning_rate": 4.2600861539040845e-05,
"loss": 1.47,
"step": 1185
},
{
"epoch": 1.2632696390658174,
"grad_norm": 1.6986812353134155,
"learning_rate": 4.254155326578621e-05,
"loss": 1.4663,
"step": 1190
},
{
"epoch": 1.2685774946921444,
"grad_norm": 1.8053547143936157,
"learning_rate": 4.2482049888796406e-05,
"loss": 1.5941,
"step": 1195
},
{
"epoch": 1.2738853503184713,
"grad_norm": 1.7940459251403809,
"learning_rate": 4.242235206989032e-05,
"loss": 1.4495,
"step": 1200
},
{
"epoch": 1.2791932059447984,
"grad_norm": 1.8331998586654663,
"learning_rate": 4.236246047304949e-05,
"loss": 1.4658,
"step": 1205
},
{
"epoch": 1.2845010615711252,
"grad_norm": 1.748568058013916,
"learning_rate": 4.2302375764410706e-05,
"loss": 1.5562,
"step": 1210
},
{
"epoch": 1.2898089171974523,
"grad_norm": 3.9075028896331787,
"learning_rate": 4.224209861225865e-05,
"loss": 1.6023,
"step": 1215
},
{
"epoch": 1.2951167728237791,
"grad_norm": 1.7952510118484497,
"learning_rate": 4.218162968701842e-05,
"loss": 1.5116,
"step": 1220
},
{
"epoch": 1.3004246284501062,
"grad_norm": 9.50228500366211,
"learning_rate": 4.212096966124807e-05,
"loss": 1.6317,
"step": 1225
},
{
"epoch": 1.305732484076433,
"grad_norm": 1.3332840204238892,
"learning_rate": 4.206011920963117e-05,
"loss": 1.3332,
"step": 1230
},
{
"epoch": 1.3110403397027601,
"grad_norm": 1.7881704568862915,
"learning_rate": 4.1999079008969264e-05,
"loss": 1.5414,
"step": 1235
},
{
"epoch": 1.316348195329087,
"grad_norm": 1.5981987714767456,
"learning_rate": 4.1937849738174364e-05,
"loss": 1.2791,
"step": 1240
},
{
"epoch": 1.321656050955414,
"grad_norm": 1.5826878547668457,
"learning_rate": 4.187643207826137e-05,
"loss": 1.5198,
"step": 1245
},
{
"epoch": 1.326963906581741,
"grad_norm": 1.590627670288086,
"learning_rate": 4.181482671234056e-05,
"loss": 1.5467,
"step": 1250
},
{
"epoch": 1.332271762208068,
"grad_norm": 2.814920425415039,
"learning_rate": 4.17530343256099e-05,
"loss": 1.4888,
"step": 1255
},
{
"epoch": 1.3375796178343948,
"grad_norm": 1.7269675731658936,
"learning_rate": 4.16910556053475e-05,
"loss": 1.5527,
"step": 1260
},
{
"epoch": 1.342887473460722,
"grad_norm": 1.7376699447631836,
"learning_rate": 4.162889124090394e-05,
"loss": 1.5451,
"step": 1265
},
{
"epoch": 1.3481953290870488,
"grad_norm": 1.6660183668136597,
"learning_rate": 4.1566541923694594e-05,
"loss": 1.5799,
"step": 1270
},
{
"epoch": 1.3535031847133758,
"grad_norm": 1.6107293367385864,
"learning_rate": 4.150400834719195e-05,
"loss": 1.5069,
"step": 1275
},
{
"epoch": 1.3588110403397027,
"grad_norm": 1.662691593170166,
"learning_rate": 4.144129120691791e-05,
"loss": 1.5886,
"step": 1280
},
{
"epoch": 1.3641188959660298,
"grad_norm": 1.9366472959518433,
"learning_rate": 4.137839120043603e-05,
"loss": 1.4109,
"step": 1285
},
{
"epoch": 1.3694267515923566,
"grad_norm": 1.5821161270141602,
"learning_rate": 4.1315309027343774e-05,
"loss": 1.4114,
"step": 1290
},
{
"epoch": 1.3747346072186837,
"grad_norm": 1.6319890022277832,
"learning_rate": 4.125204538926474e-05,
"loss": 1.5181,
"step": 1295
},
{
"epoch": 1.3800424628450108,
"grad_norm": 5.780959129333496,
"learning_rate": 4.118860098984083e-05,
"loss": 1.5228,
"step": 1300
},
{
"epoch": 1.3853503184713376,
"grad_norm": 1.6391093730926514,
"learning_rate": 4.112497653472446e-05,
"loss": 1.5428,
"step": 1305
},
{
"epoch": 1.3906581740976645,
"grad_norm": 4.674883842468262,
"learning_rate": 4.106117273157068e-05,
"loss": 1.553,
"step": 1310
},
{
"epoch": 1.3959660297239915,
"grad_norm": 1.5224674940109253,
"learning_rate": 4.099719029002932e-05,
"loss": 1.4488,
"step": 1315
},
{
"epoch": 1.4012738853503186,
"grad_norm": 1.7352584600448608,
"learning_rate": 4.09330299217371e-05,
"loss": 1.4543,
"step": 1320
},
{
"epoch": 1.4065817409766455,
"grad_norm": 1.31924569606781,
"learning_rate": 4.086869234030969e-05,
"loss": 1.488,
"step": 1325
},
{
"epoch": 1.4118895966029723,
"grad_norm": 1.72556471824646,
"learning_rate": 4.0804178261333826e-05,
"loss": 1.5535,
"step": 1330
},
{
"epoch": 1.4171974522292994,
"grad_norm": 1.706010341644287,
"learning_rate": 4.073948840235928e-05,
"loss": 1.6833,
"step": 1335
},
{
"epoch": 1.4225053078556265,
"grad_norm": 1.762577772140503,
"learning_rate": 4.067462348289092e-05,
"loss": 1.4062,
"step": 1340
},
{
"epoch": 1.4278131634819533,
"grad_norm": 1.6299678087234497,
"learning_rate": 4.060958422438072e-05,
"loss": 1.4231,
"step": 1345
},
{
"epoch": 1.4331210191082802,
"grad_norm": 2.0553765296936035,
"learning_rate": 4.0544371350219716e-05,
"loss": 1.5546,
"step": 1350
},
{
"epoch": 1.4384288747346072,
"grad_norm": 1.8095347881317139,
"learning_rate": 4.0478985585729946e-05,
"loss": 1.3377,
"step": 1355
},
{
"epoch": 1.4437367303609343,
"grad_norm": 1.8389967679977417,
"learning_rate": 4.041342765815641e-05,
"loss": 1.5391,
"step": 1360
},
{
"epoch": 1.4490445859872612,
"grad_norm": 3.814575433731079,
"learning_rate": 4.0347698296658966e-05,
"loss": 1.6319,
"step": 1365
},
{
"epoch": 1.454352441613588,
"grad_norm": 1.799367070198059,
"learning_rate": 4.028179823230423e-05,
"loss": 1.5563,
"step": 1370
},
{
"epoch": 1.459660297239915,
"grad_norm": 1.7153081893920898,
"learning_rate": 4.021572819805744e-05,
"loss": 1.5521,
"step": 1375
},
{
"epoch": 1.4649681528662422,
"grad_norm": 1.6066639423370361,
"learning_rate": 4.014948892877429e-05,
"loss": 1.4111,
"step": 1380
},
{
"epoch": 1.470276008492569,
"grad_norm": 1.7587813138961792,
"learning_rate": 4.008308116119279e-05,
"loss": 1.5377,
"step": 1385
},
{
"epoch": 1.4755838641188959,
"grad_norm": 1.7587300539016724,
"learning_rate": 4.001650563392504e-05,
"loss": 1.5002,
"step": 1390
},
{
"epoch": 1.480891719745223,
"grad_norm": 1.636374592781067,
"learning_rate": 3.994976308744901e-05,
"loss": 1.5244,
"step": 1395
},
{
"epoch": 1.48619957537155,
"grad_norm": 8.85874080657959,
"learning_rate": 3.988285426410036e-05,
"loss": 1.5959,
"step": 1400
},
{
"epoch": 1.4915074309978769,
"grad_norm": 1.6643364429473877,
"learning_rate": 3.98157799080641e-05,
"loss": 1.4482,
"step": 1405
},
{
"epoch": 1.4968152866242037,
"grad_norm": 1.6281925439834595,
"learning_rate": 3.974854076536639e-05,
"loss": 1.377,
"step": 1410
},
{
"epoch": 1.5021231422505308,
"grad_norm": 1.9073539972305298,
"learning_rate": 3.968113758386619e-05,
"loss": 1.4558,
"step": 1415
},
{
"epoch": 1.5074309978768579,
"grad_norm": 1.6521536111831665,
"learning_rate": 3.9613571113246974e-05,
"loss": 1.5093,
"step": 1420
},
{
"epoch": 1.5127388535031847,
"grad_norm": 1.5043442249298096,
"learning_rate": 3.954584210500837e-05,
"loss": 1.3886,
"step": 1425
},
{
"epoch": 1.5180467091295116,
"grad_norm": 1.8989366292953491,
"learning_rate": 3.94779513124578e-05,
"loss": 1.5509,
"step": 1430
},
{
"epoch": 1.5233545647558386,
"grad_norm": 1.6416149139404297,
"learning_rate": 3.940989949070214e-05,
"loss": 1.4652,
"step": 1435
},
{
"epoch": 1.5286624203821657,
"grad_norm": 2.7976372241973877,
"learning_rate": 3.934168739663927e-05,
"loss": 1.363,
"step": 1440
},
{
"epoch": 1.5339702760084926,
"grad_norm": 1.847293734550476,
"learning_rate": 3.9273315788949686e-05,
"loss": 1.4779,
"step": 1445
},
{
"epoch": 1.5392781316348194,
"grad_norm": 1.6267356872558594,
"learning_rate": 3.920478542808805e-05,
"loss": 1.4931,
"step": 1450
},
{
"epoch": 1.5445859872611465,
"grad_norm": 1.7588109970092773,
"learning_rate": 3.913609707627476e-05,
"loss": 1.4393,
"step": 1455
},
{
"epoch": 1.5498938428874736,
"grad_norm": 1.7333145141601562,
"learning_rate": 3.906725149748741e-05,
"loss": 1.5746,
"step": 1460
},
{
"epoch": 1.5552016985138004,
"grad_norm": 1.7868926525115967,
"learning_rate": 3.899824945745236e-05,
"loss": 1.4401,
"step": 1465
},
{
"epoch": 1.5605095541401273,
"grad_norm": 1.4039024114608765,
"learning_rate": 3.892909172363617e-05,
"loss": 1.3735,
"step": 1470
},
{
"epoch": 1.5658174097664543,
"grad_norm": 1.9331457614898682,
"learning_rate": 3.8859779065237115e-05,
"loss": 1.543,
"step": 1475
},
{
"epoch": 1.5711252653927814,
"grad_norm": 1.744489312171936,
"learning_rate": 3.879031225317656e-05,
"loss": 1.5235,
"step": 1480
},
{
"epoch": 1.5764331210191083,
"grad_norm": 1.8111132383346558,
"learning_rate": 3.872069206009047e-05,
"loss": 1.4448,
"step": 1485
},
{
"epoch": 1.5817409766454351,
"grad_norm": 1.7649325132369995,
"learning_rate": 3.865091926032072e-05,
"loss": 1.4324,
"step": 1490
},
{
"epoch": 1.5870488322717622,
"grad_norm": 1.977229118347168,
"learning_rate": 3.858099462990658e-05,
"loss": 1.458,
"step": 1495
},
{
"epoch": 1.5923566878980893,
"grad_norm": 1.7111470699310303,
"learning_rate": 3.851091894657601e-05,
"loss": 1.5631,
"step": 1500
},
{
"epoch": 1.5976645435244161,
"grad_norm": 1.9638622999191284,
"learning_rate": 3.8440692989737044e-05,
"loss": 1.6272,
"step": 1505
},
{
"epoch": 1.602972399150743,
"grad_norm": 1.7026537656784058,
"learning_rate": 3.837031754046911e-05,
"loss": 1.4667,
"step": 1510
},
{
"epoch": 1.60828025477707,
"grad_norm": 1.6524882316589355,
"learning_rate": 3.829979338151437e-05,
"loss": 1.3998,
"step": 1515
},
{
"epoch": 1.6135881104033971,
"grad_norm": 1.5562883615493774,
"learning_rate": 3.822912129726896e-05,
"loss": 1.5495,
"step": 1520
},
{
"epoch": 1.618895966029724,
"grad_norm": 1.3875113725662231,
"learning_rate": 3.815830207377431e-05,
"loss": 1.4045,
"step": 1525
},
{
"epoch": 1.6242038216560508,
"grad_norm": 2.9664599895477295,
"learning_rate": 3.808733649870839e-05,
"loss": 1.3617,
"step": 1530
},
{
"epoch": 1.629511677282378,
"grad_norm": 1.9497058391571045,
"learning_rate": 3.801622536137694e-05,
"loss": 1.6036,
"step": 1535
},
{
"epoch": 1.634819532908705,
"grad_norm": 1.8934732675552368,
"learning_rate": 3.794496945270471e-05,
"loss": 1.4382,
"step": 1540
},
{
"epoch": 1.6401273885350318,
"grad_norm": 2.006883382797241,
"learning_rate": 3.787356956522665e-05,
"loss": 1.4724,
"step": 1545
},
{
"epoch": 1.6454352441613587,
"grad_norm": 1.51792573928833,
"learning_rate": 3.780202649307907e-05,
"loss": 1.3992,
"step": 1550
},
{
"epoch": 1.6507430997876857,
"grad_norm": 1.7015622854232788,
"learning_rate": 3.7730341031990875e-05,
"loss": 1.5489,
"step": 1555
},
{
"epoch": 1.6560509554140128,
"grad_norm": 1.560760259628296,
"learning_rate": 3.765851397927463e-05,
"loss": 1.4211,
"step": 1560
},
{
"epoch": 1.6613588110403397,
"grad_norm": 1.7241127490997314,
"learning_rate": 3.758654613381778e-05,
"loss": 1.506,
"step": 1565
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.5885661840438843,
"learning_rate": 3.751443829607368e-05,
"loss": 1.4212,
"step": 1570
},
{
"epoch": 1.6719745222929936,
"grad_norm": 1.660274624824524,
"learning_rate": 3.744219126805276e-05,
"loss": 1.2287,
"step": 1575
},
{
"epoch": 1.6772823779193207,
"grad_norm": 1.8777093887329102,
"learning_rate": 3.736980585331355e-05,
"loss": 1.52,
"step": 1580
},
{
"epoch": 1.6825902335456475,
"grad_norm": 1.9632158279418945,
"learning_rate": 3.729728285695381e-05,
"loss": 1.4532,
"step": 1585
},
{
"epoch": 1.6878980891719744,
"grad_norm": 1.859124779701233,
"learning_rate": 3.7224623085601474e-05,
"loss": 1.6036,
"step": 1590
},
{
"epoch": 1.6932059447983014,
"grad_norm": 3.9819421768188477,
"learning_rate": 3.7151827347405806e-05,
"loss": 1.7094,
"step": 1595
},
{
"epoch": 1.6985138004246285,
"grad_norm": 1.9998877048492432,
"learning_rate": 3.707889645202829e-05,
"loss": 1.393,
"step": 1600
},
{
"epoch": 1.7038216560509554,
"grad_norm": 1.7848412990570068,
"learning_rate": 3.700583121063371e-05,
"loss": 1.4604,
"step": 1605
},
{
"epoch": 1.7091295116772822,
"grad_norm": 2.515498638153076,
"learning_rate": 3.693263243588109e-05,
"loss": 1.465,
"step": 1610
},
{
"epoch": 1.7144373673036093,
"grad_norm": 1.8479849100112915,
"learning_rate": 3.6859300941914645e-05,
"loss": 1.6931,
"step": 1615
},
{
"epoch": 1.7197452229299364,
"grad_norm": 1.7097549438476562,
"learning_rate": 3.6785837544354774e-05,
"loss": 1.547,
"step": 1620
},
{
"epoch": 1.7250530785562632,
"grad_norm": 1.6838785409927368,
"learning_rate": 3.671224306028893e-05,
"loss": 1.3985,
"step": 1625
},
{
"epoch": 1.73036093418259,
"grad_norm": 1.7739403247833252,
"learning_rate": 3.6638518308262565e-05,
"loss": 1.4027,
"step": 1630
},
{
"epoch": 1.7356687898089171,
"grad_norm": 1.8597843647003174,
"learning_rate": 3.656466410827004e-05,
"loss": 1.492,
"step": 1635
},
{
"epoch": 1.7409766454352442,
"grad_norm": 2.0825037956237793,
"learning_rate": 3.649068128174546e-05,
"loss": 1.5483,
"step": 1640
},
{
"epoch": 1.746284501061571,
"grad_norm": 6.958364486694336,
"learning_rate": 3.641657065155358e-05,
"loss": 1.5487,
"step": 1645
},
{
"epoch": 1.7515923566878981,
"grad_norm": 1.7793304920196533,
"learning_rate": 3.634233304198061e-05,
"loss": 1.3823,
"step": 1650
},
{
"epoch": 1.7569002123142252,
"grad_norm": 1.587827444076538,
"learning_rate": 3.626796927872511e-05,
"loss": 1.506,
"step": 1655
},
{
"epoch": 1.762208067940552,
"grad_norm": 1.9246413707733154,
"learning_rate": 3.619348018888873e-05,
"loss": 1.5549,
"step": 1660
},
{
"epoch": 1.767515923566879,
"grad_norm": 1.54891836643219,
"learning_rate": 3.611886660096709e-05,
"loss": 1.5131,
"step": 1665
},
{
"epoch": 1.772823779193206,
"grad_norm": 1.9341977834701538,
"learning_rate": 3.604412934484048e-05,
"loss": 1.584,
"step": 1670
},
{
"epoch": 1.778131634819533,
"grad_norm": 1.5830014944076538,
"learning_rate": 3.5969269251764704e-05,
"loss": 1.5922,
"step": 1675
},
{
"epoch": 1.78343949044586,
"grad_norm": 1.724741816520691,
"learning_rate": 3.58942871543618e-05,
"loss": 1.3407,
"step": 1680
},
{
"epoch": 1.7887473460721868,
"grad_norm": 1.831621766090393,
"learning_rate": 3.581918388661078e-05,
"loss": 1.5302,
"step": 1685
},
{
"epoch": 1.7940552016985138,
"grad_norm": 1.8564783334732056,
"learning_rate": 3.5743960283838355e-05,
"loss": 1.5634,
"step": 1690
},
{
"epoch": 1.799363057324841,
"grad_norm": 1.8462448120117188,
"learning_rate": 3.566861718270966e-05,
"loss": 1.4205,
"step": 1695
},
{
"epoch": 1.8046709129511678,
"grad_norm": 1.8261650800704956,
"learning_rate": 3.5593155421218914e-05,
"loss": 1.4333,
"step": 1700
},
{
"epoch": 1.8099787685774946,
"grad_norm": 2.0608906745910645,
"learning_rate": 3.5517575838680144e-05,
"loss": 1.427,
"step": 1705
},
{
"epoch": 1.8152866242038217,
"grad_norm": 1.8263474702835083,
"learning_rate": 3.544187927571781e-05,
"loss": 1.4824,
"step": 1710
},
{
"epoch": 1.8205944798301488,
"grad_norm": 1.9386931657791138,
"learning_rate": 3.5366066574257486e-05,
"loss": 1.3078,
"step": 1715
},
{
"epoch": 1.8259023354564756,
"grad_norm": 1.8082537651062012,
"learning_rate": 3.5290138577516455e-05,
"loss": 1.4363,
"step": 1720
},
{
"epoch": 1.8312101910828025,
"grad_norm": 1.8778493404388428,
"learning_rate": 3.52140961299944e-05,
"loss": 1.3782,
"step": 1725
},
{
"epoch": 1.8365180467091295,
"grad_norm": 3.402137279510498,
"learning_rate": 3.513794007746394e-05,
"loss": 1.5746,
"step": 1730
},
{
"epoch": 1.8418259023354566,
"grad_norm": 1.8941349983215332,
"learning_rate": 3.506167126696125e-05,
"loss": 1.4293,
"step": 1735
},
{
"epoch": 1.8471337579617835,
"grad_norm": 1.9133306741714478,
"learning_rate": 3.498529054677665e-05,
"loss": 1.5387,
"step": 1740
},
{
"epoch": 1.8524416135881103,
"grad_norm": 1.6595460176467896,
"learning_rate": 3.4908798766445163e-05,
"loss": 1.4309,
"step": 1745
},
{
"epoch": 1.8577494692144374,
"grad_norm": 1.7001606225967407,
"learning_rate": 3.483219677673706e-05,
"loss": 1.444,
"step": 1750
},
{
"epoch": 1.8630573248407645,
"grad_norm": 1.6574691534042358,
"learning_rate": 3.4755485429648404e-05,
"loss": 1.4694,
"step": 1755
},
{
"epoch": 1.8683651804670913,
"grad_norm": 1.6714574098587036,
"learning_rate": 3.467866557839157e-05,
"loss": 1.5645,
"step": 1760
},
{
"epoch": 1.8736730360934182,
"grad_norm": 1.8455125093460083,
"learning_rate": 3.4601738077385765e-05,
"loss": 1.3651,
"step": 1765
},
{
"epoch": 1.8789808917197452,
"grad_norm": 1.527896761894226,
"learning_rate": 3.452470378224749e-05,
"loss": 1.3828,
"step": 1770
},
{
"epoch": 1.8842887473460723,
"grad_norm": 1.6732441186904907,
"learning_rate": 3.4447563549781104e-05,
"loss": 1.422,
"step": 1775
},
{
"epoch": 1.8895966029723992,
"grad_norm": 2.028780698776245,
"learning_rate": 3.437031823796918e-05,
"loss": 1.6961,
"step": 1780
},
{
"epoch": 1.894904458598726,
"grad_norm": 1.666979432106018,
"learning_rate": 3.4292968705963057e-05,
"loss": 1.4066,
"step": 1785
},
{
"epoch": 1.900212314225053,
"grad_norm": 1.9285733699798584,
"learning_rate": 3.4215515814073254e-05,
"loss": 1.3729,
"step": 1790
},
{
"epoch": 1.9055201698513802,
"grad_norm": 1.876514196395874,
"learning_rate": 3.413796042375987e-05,
"loss": 1.5066,
"step": 1795
},
{
"epoch": 1.910828025477707,
"grad_norm": 1.6777039766311646,
"learning_rate": 3.4060303397623054e-05,
"loss": 1.5205,
"step": 1800
},
{
"epoch": 1.9161358811040339,
"grad_norm": 1.6733899116516113,
"learning_rate": 3.398254559939339e-05,
"loss": 1.4899,
"step": 1805
},
{
"epoch": 1.921443736730361,
"grad_norm": 1.869210124015808,
"learning_rate": 3.390468789392226e-05,
"loss": 1.2822,
"step": 1810
},
{
"epoch": 1.926751592356688,
"grad_norm": 1.5251469612121582,
"learning_rate": 3.382673114717228e-05,
"loss": 1.4774,
"step": 1815
},
{
"epoch": 1.9320594479830149,
"grad_norm": 1.7720097303390503,
"learning_rate": 3.3748676226207615e-05,
"loss": 1.4899,
"step": 1820
},
{
"epoch": 1.9373673036093417,
"grad_norm": 1.8252582550048828,
"learning_rate": 3.367052399918439e-05,
"loss": 1.5476,
"step": 1825
},
{
"epoch": 1.9426751592356688,
"grad_norm": 1.5934362411499023,
"learning_rate": 3.359227533534097e-05,
"loss": 1.491,
"step": 1830
},
{
"epoch": 1.9479830148619959,
"grad_norm": 1.8928519487380981,
"learning_rate": 3.3513931104988374e-05,
"loss": 1.4503,
"step": 1835
},
{
"epoch": 1.9532908704883227,
"grad_norm": 2.1186277866363525,
"learning_rate": 3.3435492179500485e-05,
"loss": 1.5802,
"step": 1840
},
{
"epoch": 1.9585987261146496,
"grad_norm": 1.6814011335372925,
"learning_rate": 3.3356959431304474e-05,
"loss": 1.5618,
"step": 1845
},
{
"epoch": 1.9639065817409767,
"grad_norm": 1.7327566146850586,
"learning_rate": 3.327833373387101e-05,
"loss": 1.5079,
"step": 1850
},
{
"epoch": 1.9692144373673037,
"grad_norm": 1.7963391542434692,
"learning_rate": 3.3199615961704614e-05,
"loss": 1.4489,
"step": 1855
},
{
"epoch": 1.9745222929936306,
"grad_norm": 1.9455621242523193,
"learning_rate": 3.312080699033386e-05,
"loss": 1.4823,
"step": 1860
},
{
"epoch": 1.9798301486199574,
"grad_norm": 1.7423186302185059,
"learning_rate": 3.304190769630169e-05,
"loss": 1.51,
"step": 1865
},
{
"epoch": 1.9851380042462845,
"grad_norm": 1.8353419303894043,
"learning_rate": 3.2962918957155645e-05,
"loss": 1.5076,
"step": 1870
},
{
"epoch": 1.9904458598726116,
"grad_norm": 1.960813283920288,
"learning_rate": 3.288384165143811e-05,
"loss": 1.4509,
"step": 1875
},
{
"epoch": 1.9957537154989384,
"grad_norm": 1.7254458665847778,
"learning_rate": 3.280467665867654e-05,
"loss": 1.4408,
"step": 1880
},
{
"epoch": 2.0010615711252653,
"grad_norm": 1.7819510698318481,
"learning_rate": 3.272542485937369e-05,
"loss": 1.4863,
"step": 1885
},
{
"epoch": 2.0063694267515926,
"grad_norm": 1.7719825506210327,
"learning_rate": 3.2646087134997784e-05,
"loss": 1.4181,
"step": 1890
},
{
"epoch": 2.0116772823779194,
"grad_norm": 2.0865135192871094,
"learning_rate": 3.256666436797276e-05,
"loss": 1.5429,
"step": 1895
},
{
"epoch": 2.0169851380042463,
"grad_norm": 3.6327757835388184,
"learning_rate": 3.2487157441668415e-05,
"loss": 1.5906,
"step": 1900
},
{
"epoch": 2.022292993630573,
"grad_norm": 1.6347745656967163,
"learning_rate": 3.240756724039062e-05,
"loss": 1.4776,
"step": 1905
},
{
"epoch": 2.0276008492569004,
"grad_norm": 1.8393852710723877,
"learning_rate": 3.2327894649371435e-05,
"loss": 1.3918,
"step": 1910
},
{
"epoch": 2.0329087048832273,
"grad_norm": 1.8218520879745483,
"learning_rate": 3.224814055475932e-05,
"loss": 1.4173,
"step": 1915
},
{
"epoch": 2.038216560509554,
"grad_norm": 1.8101989030838013,
"learning_rate": 3.21683058436092e-05,
"loss": 1.4378,
"step": 1920
},
{
"epoch": 2.043524416135881,
"grad_norm": 1.9730515480041504,
"learning_rate": 3.208839140387271e-05,
"loss": 1.3769,
"step": 1925
},
{
"epoch": 2.0488322717622083,
"grad_norm": 2.008378505706787,
"learning_rate": 3.200839812438821e-05,
"loss": 1.3861,
"step": 1930
},
{
"epoch": 2.054140127388535,
"grad_norm": 2.0991263389587402,
"learning_rate": 3.192832689487095e-05,
"loss": 1.5276,
"step": 1935
},
{
"epoch": 2.059447983014862,
"grad_norm": 1.616385579109192,
"learning_rate": 3.184817860590319e-05,
"loss": 1.4331,
"step": 1940
},
{
"epoch": 2.064755838641189,
"grad_norm": 1.628811240196228,
"learning_rate": 3.176795414892427e-05,
"loss": 1.3525,
"step": 1945
},
{
"epoch": 2.070063694267516,
"grad_norm": 2.066753625869751,
"learning_rate": 3.1687654416220666e-05,
"loss": 1.3573,
"step": 1950
},
{
"epoch": 2.075371549893843,
"grad_norm": 1.9191819429397583,
"learning_rate": 3.160728030091616e-05,
"loss": 1.5621,
"step": 1955
},
{
"epoch": 2.08067940552017,
"grad_norm": 2.1518616676330566,
"learning_rate": 3.152683269696179e-05,
"loss": 1.4343,
"step": 1960
},
{
"epoch": 2.0859872611464967,
"grad_norm": 1.8996518850326538,
"learning_rate": 3.1446312499125986e-05,
"loss": 1.4251,
"step": 1965
},
{
"epoch": 2.091295116772824,
"grad_norm": 1.9120466709136963,
"learning_rate": 3.1365720602984586e-05,
"loss": 1.4178,
"step": 1970
},
{
"epoch": 2.096602972399151,
"grad_norm": 1.6327637434005737,
"learning_rate": 3.12850579049109e-05,
"loss": 1.4255,
"step": 1975
},
{
"epoch": 2.1019108280254777,
"grad_norm": 1.9733164310455322,
"learning_rate": 3.120432530206569e-05,
"loss": 1.5019,
"step": 1980
},
{
"epoch": 2.1072186836518045,
"grad_norm": 1.8080165386199951,
"learning_rate": 3.112352369238728e-05,
"loss": 1.4403,
"step": 1985
},
{
"epoch": 2.112526539278132,
"grad_norm": 1.841697096824646,
"learning_rate": 3.104265397458146e-05,
"loss": 1.4666,
"step": 1990
},
{
"epoch": 2.1178343949044587,
"grad_norm": 1.9103078842163086,
"learning_rate": 3.096171704811156e-05,
"loss": 1.3622,
"step": 1995
},
{
"epoch": 2.1231422505307855,
"grad_norm": 2.059898614883423,
"learning_rate": 3.088071381318845e-05,
"loss": 1.4161,
"step": 2000
},
{
"epoch": 2.1284501061571124,
"grad_norm": 2.0670325756073,
"learning_rate": 3.0799645170760486e-05,
"loss": 1.4092,
"step": 2005
},
{
"epoch": 2.1337579617834397,
"grad_norm": 1.5905953645706177,
"learning_rate": 3.071851202250352e-05,
"loss": 1.4295,
"step": 2010
},
{
"epoch": 2.1390658174097665,
"grad_norm": 1.9497631788253784,
"learning_rate": 3.063731527081086e-05,
"loss": 1.4194,
"step": 2015
},
{
"epoch": 2.1443736730360934,
"grad_norm": 4.793402671813965,
"learning_rate": 3.055605581878322e-05,
"loss": 1.4232,
"step": 2020
},
{
"epoch": 2.1496815286624202,
"grad_norm": 2.07564115524292,
"learning_rate": 3.0474734570218732e-05,
"loss": 1.4475,
"step": 2025
},
{
"epoch": 2.1549893842887475,
"grad_norm": 1.9304168224334717,
"learning_rate": 3.03933524296028e-05,
"loss": 1.4494,
"step": 2030
},
{
"epoch": 2.1602972399150744,
"grad_norm": 2.044576644897461,
"learning_rate": 3.031191030209814e-05,
"loss": 1.5126,
"step": 2035
},
{
"epoch": 2.1656050955414012,
"grad_norm": 1.8235676288604736,
"learning_rate": 3.0230409093534622e-05,
"loss": 1.4385,
"step": 2040
},
{
"epoch": 2.170912951167728,
"grad_norm": 1.870332956314087,
"learning_rate": 3.0148849710399278e-05,
"loss": 1.4701,
"step": 2045
},
{
"epoch": 2.1762208067940554,
"grad_norm": 4.544968128204346,
"learning_rate": 3.0067233059826143e-05,
"loss": 1.5572,
"step": 2050
},
{
"epoch": 2.1815286624203822,
"grad_norm": 1.9169080257415771,
"learning_rate": 2.9985560049586237e-05,
"loss": 1.4814,
"step": 2055
},
{
"epoch": 2.186836518046709,
"grad_norm": 2.15110445022583,
"learning_rate": 2.9903831588077392e-05,
"loss": 1.6031,
"step": 2060
},
{
"epoch": 2.192144373673036,
"grad_norm": 2.1074917316436768,
"learning_rate": 2.9822048584314228e-05,
"loss": 1.3167,
"step": 2065
},
{
"epoch": 2.1974522292993632,
"grad_norm": 1.8479692935943604,
"learning_rate": 2.9740211947917984e-05,
"loss": 1.3893,
"step": 2070
},
{
"epoch": 2.20276008492569,
"grad_norm": 1.86372971534729,
"learning_rate": 2.965832258910643e-05,
"loss": 1.5014,
"step": 2075
},
{
"epoch": 2.208067940552017,
"grad_norm": 2.080585479736328,
"learning_rate": 2.957638141868373e-05,
"loss": 1.5324,
"step": 2080
},
{
"epoch": 2.213375796178344,
"grad_norm": 2.2292301654815674,
"learning_rate": 2.9494389348030317e-05,
"loss": 1.2817,
"step": 2085
},
{
"epoch": 2.218683651804671,
"grad_norm": 1.86923348903656,
"learning_rate": 2.941234728909275e-05,
"loss": 1.4919,
"step": 2090
},
{
"epoch": 2.223991507430998,
"grad_norm": 1.9480481147766113,
"learning_rate": 2.9330256154373593e-05,
"loss": 1.3585,
"step": 2095
},
{
"epoch": 2.229299363057325,
"grad_norm": 2.1231307983398438,
"learning_rate": 2.9248116856921226e-05,
"loss": 1.5803,
"step": 2100
},
{
"epoch": 2.2346072186836516,
"grad_norm": 2.2283520698547363,
"learning_rate": 2.9165930310319733e-05,
"loss": 1.502,
"step": 2105
},
{
"epoch": 2.239915074309979,
"grad_norm": 1.9217607975006104,
"learning_rate": 2.9083697428678712e-05,
"loss": 1.442,
"step": 2110
},
{
"epoch": 2.245222929936306,
"grad_norm": 4.379934310913086,
"learning_rate": 2.9001419126623113e-05,
"loss": 1.5073,
"step": 2115
},
{
"epoch": 2.2505307855626326,
"grad_norm": 2.02557635307312,
"learning_rate": 2.8919096319283084e-05,
"loss": 1.3755,
"step": 2120
},
{
"epoch": 2.2558386411889595,
"grad_norm": 2.0024921894073486,
"learning_rate": 2.8836729922283755e-05,
"loss": 1.5393,
"step": 2125
},
{
"epoch": 2.261146496815287,
"grad_norm": 1.9053192138671875,
"learning_rate": 2.8754320851735107e-05,
"loss": 1.3441,
"step": 2130
},
{
"epoch": 2.2664543524416136,
"grad_norm": 2.073275327682495,
"learning_rate": 2.8671870024221707e-05,
"loss": 1.3883,
"step": 2135
},
{
"epoch": 2.2717622080679405,
"grad_norm": 1.8114712238311768,
"learning_rate": 2.8589378356792606e-05,
"loss": 1.3674,
"step": 2140
},
{
"epoch": 2.2770700636942673,
"grad_norm": 12.008658409118652,
"learning_rate": 2.8506846766951063e-05,
"loss": 1.4504,
"step": 2145
},
{
"epoch": 2.2823779193205946,
"grad_norm": 1.860350489616394,
"learning_rate": 2.8424276172644382e-05,
"loss": 1.4243,
"step": 2150
},
{
"epoch": 2.2876857749469215,
"grad_norm": 2.3256890773773193,
"learning_rate": 2.8341667492253675e-05,
"loss": 1.4229,
"step": 2155
},
{
"epoch": 2.2929936305732483,
"grad_norm": 2.0510356426239014,
"learning_rate": 2.825902164458369e-05,
"loss": 1.3562,
"step": 2160
},
{
"epoch": 2.298301486199575,
"grad_norm": 1.8248239755630493,
"learning_rate": 2.817633954885252e-05,
"loss": 1.5125,
"step": 2165
},
{
"epoch": 2.3036093418259025,
"grad_norm": 1.8396949768066406,
"learning_rate": 2.8093622124681473e-05,
"loss": 1.4886,
"step": 2170
},
{
"epoch": 2.3089171974522293,
"grad_norm": 1.9355541467666626,
"learning_rate": 2.8010870292084744e-05,
"loss": 1.466,
"step": 2175
},
{
"epoch": 2.314225053078556,
"grad_norm": 1.9316900968551636,
"learning_rate": 2.7928084971459272e-05,
"loss": 1.4423,
"step": 2180
},
{
"epoch": 2.319532908704883,
"grad_norm": 2.041689395904541,
"learning_rate": 2.7845267083574432e-05,
"loss": 1.4992,
"step": 2185
},
{
"epoch": 2.3248407643312103,
"grad_norm": 1.89895761013031,
"learning_rate": 2.7762417549561858e-05,
"loss": 1.3173,
"step": 2190
},
{
"epoch": 2.330148619957537,
"grad_norm": 1.6639972925186157,
"learning_rate": 2.7679537290905117e-05,
"loss": 1.4519,
"step": 2195
},
{
"epoch": 2.335456475583864,
"grad_norm": 1.8617371320724487,
"learning_rate": 2.7596627229429556e-05,
"loss": 1.2956,
"step": 2200
},
{
"epoch": 2.340764331210191,
"grad_norm": 2.1502444744110107,
"learning_rate": 2.751368828729196e-05,
"loss": 1.5061,
"step": 2205
},
{
"epoch": 2.346072186836518,
"grad_norm": 2.0051639080047607,
"learning_rate": 2.7430721386970372e-05,
"loss": 1.6049,
"step": 2210
},
{
"epoch": 2.351380042462845,
"grad_norm": 2.1966779232025146,
"learning_rate": 2.7347727451253763e-05,
"loss": 1.5206,
"step": 2215
},
{
"epoch": 2.356687898089172,
"grad_norm": 2.000822067260742,
"learning_rate": 2.7264707403231826e-05,
"loss": 1.533,
"step": 2220
},
{
"epoch": 2.3619957537154987,
"grad_norm": 2.175576686859131,
"learning_rate": 2.718166216628466e-05,
"loss": 1.5238,
"step": 2225
},
{
"epoch": 2.367303609341826,
"grad_norm": 1.9840582609176636,
"learning_rate": 2.7098592664072563e-05,
"loss": 1.3994,
"step": 2230
},
{
"epoch": 2.372611464968153,
"grad_norm": 1.7856370210647583,
"learning_rate": 2.701549982052568e-05,
"loss": 1.5091,
"step": 2235
},
{
"epoch": 2.3779193205944797,
"grad_norm": 2.5901758670806885,
"learning_rate": 2.6932384559833795e-05,
"loss": 1.4364,
"step": 2240
},
{
"epoch": 2.3832271762208066,
"grad_norm": 2.039409637451172,
"learning_rate": 2.6849247806436002e-05,
"loss": 1.6041,
"step": 2245
},
{
"epoch": 2.388535031847134,
"grad_norm": 2.1011273860931396,
"learning_rate": 2.676609048501047e-05,
"loss": 1.3286,
"step": 2250
},
{
"epoch": 2.3938428874734607,
"grad_norm": 1.7810633182525635,
"learning_rate": 2.6682913520464104e-05,
"loss": 1.4414,
"step": 2255
},
{
"epoch": 2.3991507430997876,
"grad_norm": 2.1423192024230957,
"learning_rate": 2.6599717837922324e-05,
"loss": 1.3732,
"step": 2260
},
{
"epoch": 2.404458598726115,
"grad_norm": 1.6819944381713867,
"learning_rate": 2.6516504362718692e-05,
"loss": 1.4819,
"step": 2265
},
{
"epoch": 2.4097664543524417,
"grad_norm": 4.543319225311279,
"learning_rate": 2.6433274020384717e-05,
"loss": 1.3511,
"step": 2270
},
{
"epoch": 2.4150743099787686,
"grad_norm": 2.027402639389038,
"learning_rate": 2.6350027736639466e-05,
"loss": 1.4949,
"step": 2275
},
{
"epoch": 2.4203821656050954,
"grad_norm": 2.225890636444092,
"learning_rate": 2.6266766437379348e-05,
"loss": 1.5223,
"step": 2280
},
{
"epoch": 2.4256900212314223,
"grad_norm": 1.709800362586975,
"learning_rate": 2.6183491048667748e-05,
"loss": 1.3139,
"step": 2285
},
{
"epoch": 2.4309978768577496,
"grad_norm": 1.9229758977890015,
"learning_rate": 2.610020249672479e-05,
"loss": 1.4932,
"step": 2290
},
{
"epoch": 2.4363057324840764,
"grad_norm": 1.7151269912719727,
"learning_rate": 2.601690170791698e-05,
"loss": 1.3308,
"step": 2295
},
{
"epoch": 2.4416135881104033,
"grad_norm": 2.1811819076538086,
"learning_rate": 2.5933589608746945e-05,
"loss": 1.4028,
"step": 2300
},
{
"epoch": 2.4469214437367306,
"grad_norm": 2.236459970474243,
"learning_rate": 2.585026712584309e-05,
"loss": 1.5397,
"step": 2305
},
{
"epoch": 2.4522292993630574,
"grad_norm": 2.1036362648010254,
"learning_rate": 2.576693518594934e-05,
"loss": 1.4838,
"step": 2310
},
{
"epoch": 2.4575371549893843,
"grad_norm": 2.25015926361084,
"learning_rate": 2.568359471591477e-05,
"loss": 1.4518,
"step": 2315
},
{
"epoch": 2.462845010615711,
"grad_norm": 2.4356729984283447,
"learning_rate": 2.5600246642683367e-05,
"loss": 1.4599,
"step": 2320
},
{
"epoch": 2.468152866242038,
"grad_norm": 1.9419898986816406,
"learning_rate": 2.5516891893283645e-05,
"loss": 1.4831,
"step": 2325
},
{
"epoch": 2.4734607218683653,
"grad_norm": 1.9871810674667358,
"learning_rate": 2.543353139481841e-05,
"loss": 1.4965,
"step": 2330
},
{
"epoch": 2.478768577494692,
"grad_norm": 2.024142026901245,
"learning_rate": 2.535016607445438e-05,
"loss": 1.5143,
"step": 2335
},
{
"epoch": 2.484076433121019,
"grad_norm": 1.7300843000411987,
"learning_rate": 2.526679685941193e-05,
"loss": 1.4122,
"step": 2340
},
{
"epoch": 2.4893842887473463,
"grad_norm": 1.9679033756256104,
"learning_rate": 2.518342467695473e-05,
"loss": 1.4179,
"step": 2345
},
{
"epoch": 2.494692144373673,
"grad_norm": 2.0890605449676514,
"learning_rate": 2.5100050454379475e-05,
"loss": 1.4879,
"step": 2350
},
{
"epoch": 2.5,
"grad_norm": 2.1922872066497803,
"learning_rate": 2.501667511900554e-05,
"loss": 1.252,
"step": 2355
},
{
"epoch": 2.505307855626327,
"grad_norm": 2.26338529586792,
"learning_rate": 2.4933299598164674e-05,
"loss": 1.3662,
"step": 2360
},
{
"epoch": 2.5106157112526537,
"grad_norm": 2.192429542541504,
"learning_rate": 2.4849924819190696e-05,
"loss": 1.4638,
"step": 2365
},
{
"epoch": 2.515923566878981,
"grad_norm": 1.9536056518554688,
"learning_rate": 2.4766551709409172e-05,
"loss": 1.5399,
"step": 2370
},
{
"epoch": 2.521231422505308,
"grad_norm": 3.6337201595306396,
"learning_rate": 2.46831811961271e-05,
"loss": 1.4925,
"step": 2375
},
{
"epoch": 2.5265392781316347,
"grad_norm": 3.1741602420806885,
"learning_rate": 2.4599814206622604e-05,
"loss": 1.4498,
"step": 2380
},
{
"epoch": 2.531847133757962,
"grad_norm": 1.8435138463974,
"learning_rate": 2.451645166813461e-05,
"loss": 1.2496,
"step": 2385
},
{
"epoch": 2.537154989384289,
"grad_norm": 2.1668882369995117,
"learning_rate": 2.4433094507852537e-05,
"loss": 1.3713,
"step": 2390
},
{
"epoch": 2.5424628450106157,
"grad_norm": 2.1070964336395264,
"learning_rate": 2.434974365290599e-05,
"loss": 1.481,
"step": 2395
},
{
"epoch": 2.5477707006369426,
"grad_norm": 1.9351259469985962,
"learning_rate": 2.4266400030354444e-05,
"loss": 1.5247,
"step": 2400
},
{
"epoch": 2.5530785562632694,
"grad_norm": 2.2230396270751953,
"learning_rate": 2.4183064567176928e-05,
"loss": 1.3355,
"step": 2405
},
{
"epoch": 2.5583864118895967,
"grad_norm": 8.01842975616455,
"learning_rate": 2.409973819026173e-05,
"loss": 1.4554,
"step": 2410
},
{
"epoch": 2.5636942675159236,
"grad_norm": 2.270322322845459,
"learning_rate": 2.401642182639605e-05,
"loss": 1.4627,
"step": 2415
},
{
"epoch": 2.5690021231422504,
"grad_norm": 2.0427427291870117,
"learning_rate": 2.3933116402255764e-05,
"loss": 1.4061,
"step": 2420
},
{
"epoch": 2.5743099787685777,
"grad_norm": 2.2200124263763428,
"learning_rate": 2.384982284439503e-05,
"loss": 1.4439,
"step": 2425
},
{
"epoch": 2.5796178343949046,
"grad_norm": 1.9936960935592651,
"learning_rate": 2.3766542079236048e-05,
"loss": 1.6219,
"step": 2430
},
{
"epoch": 2.5849256900212314,
"grad_norm": 2.3527231216430664,
"learning_rate": 2.368327503305872e-05,
"loss": 1.5253,
"step": 2435
},
{
"epoch": 2.5902335456475583,
"grad_norm": 1.7423584461212158,
"learning_rate": 2.3600022631990372e-05,
"loss": 1.299,
"step": 2440
},
{
"epoch": 2.595541401273885,
"grad_norm": 1.7755167484283447,
"learning_rate": 2.3516785801995433e-05,
"loss": 1.4781,
"step": 2445
},
{
"epoch": 2.6008492569002124,
"grad_norm": 6.36676549911499,
"learning_rate": 2.3433565468865157e-05,
"loss": 1.5042,
"step": 2450
},
{
"epoch": 2.6061571125265393,
"grad_norm": 1.851580023765564,
"learning_rate": 2.335036255820729e-05,
"loss": 1.4587,
"step": 2455
},
{
"epoch": 2.611464968152866,
"grad_norm": 2.0467379093170166,
"learning_rate": 2.3267177995435824e-05,
"loss": 1.6473,
"step": 2460
},
{
"epoch": 2.6167728237791934,
"grad_norm": 2.056533098220825,
"learning_rate": 2.3184012705760662e-05,
"loss": 1.4673,
"step": 2465
},
{
"epoch": 2.6220806794055203,
"grad_norm": 1.8680130243301392,
"learning_rate": 2.3100867614177353e-05,
"loss": 1.3721,
"step": 2470
},
{
"epoch": 2.627388535031847,
"grad_norm": 2.404651641845703,
"learning_rate": 2.3017743645456794e-05,
"loss": 1.4524,
"step": 2475
},
{
"epoch": 2.632696390658174,
"grad_norm": 2.0899250507354736,
"learning_rate": 2.293464172413495e-05,
"loss": 1.5181,
"step": 2480
},
{
"epoch": 2.638004246284501,
"grad_norm": 2.262739658355713,
"learning_rate": 2.2851562774502542e-05,
"loss": 1.5211,
"step": 2485
},
{
"epoch": 2.643312101910828,
"grad_norm": 4.4608941078186035,
"learning_rate": 2.276850772059483e-05,
"loss": 1.599,
"step": 2490
},
{
"epoch": 2.648619957537155,
"grad_norm": 2.067124128341675,
"learning_rate": 2.2685477486181267e-05,
"loss": 1.37,
"step": 2495
},
{
"epoch": 2.653927813163482,
"grad_norm": 2.236569404602051,
"learning_rate": 2.2602472994755276e-05,
"loss": 1.4943,
"step": 2500
},
{
"epoch": 2.659235668789809,
"grad_norm": 7.286528587341309,
"learning_rate": 2.2519495169523924e-05,
"loss": 1.459,
"step": 2505
},
{
"epoch": 2.664543524416136,
"grad_norm": 2.1355056762695312,
"learning_rate": 2.243654493339773e-05,
"loss": 1.4789,
"step": 2510
},
{
"epoch": 2.669851380042463,
"grad_norm": 2.1914258003234863,
"learning_rate": 2.2353623208980316e-05,
"loss": 1.3678,
"step": 2515
},
{
"epoch": 2.6751592356687897,
"grad_norm": 1.906111717224121,
"learning_rate": 2.227073091855822e-05,
"loss": 1.4229,
"step": 2520
},
{
"epoch": 2.6804670912951165,
"grad_norm": 2.0272438526153564,
"learning_rate": 2.2187868984090577e-05,
"loss": 1.3161,
"step": 2525
},
{
"epoch": 2.685774946921444,
"grad_norm": 2.4848039150238037,
"learning_rate": 2.2105038327198914e-05,
"loss": 1.3172,
"step": 2530
},
{
"epoch": 2.6910828025477707,
"grad_norm": 1.991502285003662,
"learning_rate": 2.202223986915685e-05,
"loss": 1.4735,
"step": 2535
},
{
"epoch": 2.6963906581740975,
"grad_norm": 2.0094003677368164,
"learning_rate": 2.193947453087991e-05,
"loss": 1.4223,
"step": 2540
},
{
"epoch": 2.701698513800425,
"grad_norm": 3.0732345581054688,
"learning_rate": 2.185674323291522e-05,
"loss": 1.4143,
"step": 2545
},
{
"epoch": 2.7070063694267517,
"grad_norm": 9.969552040100098,
"learning_rate": 2.1774046895431317e-05,
"loss": 1.465,
"step": 2550
},
{
"epoch": 2.7123142250530785,
"grad_norm": 1.9031027555465698,
"learning_rate": 2.1691386438207873e-05,
"loss": 1.5055,
"step": 2555
},
{
"epoch": 2.7176220806794054,
"grad_norm": 2.1032540798187256,
"learning_rate": 2.160876278062551e-05,
"loss": 1.4889,
"step": 2560
},
{
"epoch": 2.722929936305732,
"grad_norm": 2.103361129760742,
"learning_rate": 2.1526176841655533e-05,
"loss": 1.5629,
"step": 2565
},
{
"epoch": 2.7282377919320595,
"grad_norm": 1.7875380516052246,
"learning_rate": 2.1443629539849735e-05,
"loss": 1.438,
"step": 2570
},
{
"epoch": 2.7335456475583864,
"grad_norm": 4.726676940917969,
"learning_rate": 2.136112179333017e-05,
"loss": 1.3722,
"step": 2575
},
{
"epoch": 2.738853503184713,
"grad_norm": 2.1944401264190674,
"learning_rate": 2.1278654519778947e-05,
"loss": 1.4818,
"step": 2580
},
{
"epoch": 2.7441613588110405,
"grad_norm": 2.1532351970672607,
"learning_rate": 2.1196228636428002e-05,
"loss": 1.5619,
"step": 2585
},
{
"epoch": 2.7494692144373674,
"grad_norm": 1.9567017555236816,
"learning_rate": 2.111384506004894e-05,
"loss": 1.4255,
"step": 2590
},
{
"epoch": 2.754777070063694,
"grad_norm": 2.1782784461975098,
"learning_rate": 2.10315047069428e-05,
"loss": 1.5677,
"step": 2595
},
{
"epoch": 2.7600849256900215,
"grad_norm": 6.628244400024414,
"learning_rate": 2.0949208492929866e-05,
"loss": 1.4233,
"step": 2600
},
{
"epoch": 2.7653927813163484,
"grad_norm": 2.323992967605591,
"learning_rate": 2.08669573333395e-05,
"loss": 1.7152,
"step": 2605
},
{
"epoch": 2.770700636942675,
"grad_norm": 1.6532930135726929,
"learning_rate": 2.078475214299996e-05,
"loss": 1.1821,
"step": 2610
},
{
"epoch": 2.776008492569002,
"grad_norm": 2.089218854904175,
"learning_rate": 2.0702593836228196e-05,
"loss": 1.3794,
"step": 2615
},
{
"epoch": 2.781316348195329,
"grad_norm": 2.067755699157715,
"learning_rate": 2.062048332681972e-05,
"loss": 1.406,
"step": 2620
},
{
"epoch": 2.786624203821656,
"grad_norm": 1.9614882469177246,
"learning_rate": 2.053842152803842e-05,
"loss": 1.5471,
"step": 2625
},
{
"epoch": 2.791932059447983,
"grad_norm": 2.23738956451416,
"learning_rate": 2.0456409352606396e-05,
"loss": 1.5058,
"step": 2630
},
{
"epoch": 2.79723991507431,
"grad_norm": 2.063555955886841,
"learning_rate": 2.037444771269382e-05,
"loss": 1.3531,
"step": 2635
},
{
"epoch": 2.802547770700637,
"grad_norm": 2.0748682022094727,
"learning_rate": 2.0292537519908817e-05,
"loss": 1.2844,
"step": 2640
},
{
"epoch": 2.807855626326964,
"grad_norm": 2.197343587875366,
"learning_rate": 2.0210679685287248e-05,
"loss": 1.5082,
"step": 2645
},
{
"epoch": 2.813163481953291,
"grad_norm": 4.911831378936768,
"learning_rate": 2.0128875119282674e-05,
"loss": 1.5497,
"step": 2650
},
{
"epoch": 2.8184713375796178,
"grad_norm": 1.9611250162124634,
"learning_rate": 2.004712473175615e-05,
"loss": 1.4158,
"step": 2655
},
{
"epoch": 2.8237791932059446,
"grad_norm": 2.2171881198883057,
"learning_rate": 1.996542943196616e-05,
"loss": 1.3746,
"step": 2660
},
{
"epoch": 2.829087048832272,
"grad_norm": 2.2292375564575195,
"learning_rate": 1.9883790128558463e-05,
"loss": 1.5202,
"step": 2665
},
{
"epoch": 2.8343949044585988,
"grad_norm": 2.012502431869507,
"learning_rate": 1.980220772955602e-05,
"loss": 1.5072,
"step": 2670
},
{
"epoch": 2.8397027600849256,
"grad_norm": 2.0411856174468994,
"learning_rate": 1.9720683142348873e-05,
"loss": 1.61,
"step": 2675
},
{
"epoch": 2.845010615711253,
"grad_norm": 2.1242668628692627,
"learning_rate": 1.963921727368406e-05,
"loss": 1.4123,
"step": 2680
},
{
"epoch": 2.8503184713375798,
"grad_norm": 1.9053332805633545,
"learning_rate": 1.9557811029655522e-05,
"loss": 1.4463,
"step": 2685
},
{
"epoch": 2.8556263269639066,
"grad_norm": 2.0949225425720215,
"learning_rate": 1.9476465315694055e-05,
"loss": 1.5502,
"step": 2690
},
{
"epoch": 2.8609341825902335,
"grad_norm": 2.2040441036224365,
"learning_rate": 1.9395181036557188e-05,
"loss": 1.4678,
"step": 2695
},
{
"epoch": 2.8662420382165603,
"grad_norm": 2.0631675720214844,
"learning_rate": 1.9313959096319175e-05,
"loss": 1.3414,
"step": 2700
},
{
"epoch": 2.8715498938428876,
"grad_norm": 4.133503437042236,
"learning_rate": 1.923280039836089e-05,
"loss": 1.4198,
"step": 2705
},
{
"epoch": 2.8768577494692145,
"grad_norm": 2.2318644523620605,
"learning_rate": 1.9151705845359825e-05,
"loss": 1.3251,
"step": 2710
},
{
"epoch": 2.8821656050955413,
"grad_norm": 1.9430187940597534,
"learning_rate": 1.9070676339280004e-05,
"loss": 1.5425,
"step": 2715
},
{
"epoch": 2.8874734607218686,
"grad_norm": 2.3634932041168213,
"learning_rate": 1.8989712781361997e-05,
"loss": 1.4142,
"step": 2720
},
{
"epoch": 2.8927813163481955,
"grad_norm": 1.9859676361083984,
"learning_rate": 1.8908816072112856e-05,
"loss": 1.4577,
"step": 2725
},
{
"epoch": 2.8980891719745223,
"grad_norm": 1.9253382682800293,
"learning_rate": 1.882798711129613e-05,
"loss": 1.4094,
"step": 2730
},
{
"epoch": 2.903397027600849,
"grad_norm": 1.818198323249817,
"learning_rate": 1.8747226797921845e-05,
"loss": 1.4304,
"step": 2735
},
{
"epoch": 2.908704883227176,
"grad_norm": 2.1580026149749756,
"learning_rate": 1.866653603023649e-05,
"loss": 1.1938,
"step": 2740
},
{
"epoch": 2.9140127388535033,
"grad_norm": 2.2599189281463623,
"learning_rate": 1.858591570571306e-05,
"loss": 1.5586,
"step": 2745
},
{
"epoch": 2.91932059447983,
"grad_norm": 2.2263216972351074,
"learning_rate": 1.8505366721041033e-05,
"loss": 1.5277,
"step": 2750
},
{
"epoch": 2.924628450106157,
"grad_norm": 2.088515043258667,
"learning_rate": 1.842488997211644e-05,
"loss": 1.5115,
"step": 2755
},
{
"epoch": 2.9299363057324843,
"grad_norm": 2.1377217769622803,
"learning_rate": 1.834448635403186e-05,
"loss": 1.5497,
"step": 2760
},
{
"epoch": 2.935244161358811,
"grad_norm": 2.0695080757141113,
"learning_rate": 1.82641567610665e-05,
"loss": 1.4421,
"step": 2765
},
{
"epoch": 2.940552016985138,
"grad_norm": 4.7147674560546875,
"learning_rate": 1.8183902086676217e-05,
"loss": 1.556,
"step": 2770
},
{
"epoch": 2.945859872611465,
"grad_norm": 2.120251178741455,
"learning_rate": 1.810372322348361e-05,
"loss": 1.4685,
"step": 2775
},
{
"epoch": 2.9511677282377917,
"grad_norm": 1.9467816352844238,
"learning_rate": 1.8023621063268064e-05,
"loss": 1.4662,
"step": 2780
},
{
"epoch": 2.956475583864119,
"grad_norm": 2.197115659713745,
"learning_rate": 1.7943596496955854e-05,
"loss": 1.347,
"step": 2785
},
{
"epoch": 2.961783439490446,
"grad_norm": 2.2167131900787354,
"learning_rate": 1.7863650414610223e-05,
"loss": 1.5102,
"step": 2790
},
{
"epoch": 2.9670912951167727,
"grad_norm": 3.7643346786499023,
"learning_rate": 1.7783783705421487e-05,
"loss": 1.2618,
"step": 2795
},
{
"epoch": 2.9723991507431,
"grad_norm": 2.3326447010040283,
"learning_rate": 1.7703997257697137e-05,
"loss": 1.3727,
"step": 2800
},
{
"epoch": 2.977707006369427,
"grad_norm": 2.2277841567993164,
"learning_rate": 1.762429195885198e-05,
"loss": 1.447,
"step": 2805
},
{
"epoch": 2.9830148619957537,
"grad_norm": 2.2077393531799316,
"learning_rate": 1.754466869539824e-05,
"loss": 1.4006,
"step": 2810
},
{
"epoch": 2.9883227176220806,
"grad_norm": 2.2370176315307617,
"learning_rate": 1.7465128352935732e-05,
"loss": 1.4167,
"step": 2815
},
{
"epoch": 2.9936305732484074,
"grad_norm": 2.0863046646118164,
"learning_rate": 1.7385671816141963e-05,
"loss": 1.5003,
"step": 2820
},
{
"epoch": 2.9989384288747347,
"grad_norm": 2.1493258476257324,
"learning_rate": 1.730629996876235e-05,
"loss": 1.2646,
"step": 2825
},
{
"epoch": 3.0042462845010616,
"grad_norm": 2.2664546966552734,
"learning_rate": 1.7227013693600347e-05,
"loss": 1.4217,
"step": 2830
},
{
"epoch": 3.0095541401273884,
"grad_norm": 2.0606799125671387,
"learning_rate": 1.7147813872507654e-05,
"loss": 1.3851,
"step": 2835
},
{
"epoch": 3.0148619957537157,
"grad_norm": 2.0941736698150635,
"learning_rate": 1.7068701386374374e-05,
"loss": 1.4804,
"step": 2840
},
{
"epoch": 3.0201698513800426,
"grad_norm": 2.1622138023376465,
"learning_rate": 1.6989677115119267e-05,
"loss": 1.3979,
"step": 2845
},
{
"epoch": 3.0254777070063694,
"grad_norm": 2.0956408977508545,
"learning_rate": 1.691074193767991e-05,
"loss": 1.4387,
"step": 2850
},
{
"epoch": 3.0307855626326963,
"grad_norm": 2.141923666000366,
"learning_rate": 1.683189673200296e-05,
"loss": 1.2867,
"step": 2855
},
{
"epoch": 3.0360934182590236,
"grad_norm": 2.4103381633758545,
"learning_rate": 1.675314237503436e-05,
"loss": 1.409,
"step": 2860
},
{
"epoch": 3.0414012738853504,
"grad_norm": 2.0725698471069336,
"learning_rate": 1.667447974270962e-05,
"loss": 1.434,
"step": 2865
},
{
"epoch": 3.0467091295116773,
"grad_norm": 2.1921072006225586,
"learning_rate": 1.6595909709944035e-05,
"loss": 1.278,
"step": 2870
},
{
"epoch": 3.052016985138004,
"grad_norm": 2.077505588531494,
"learning_rate": 1.651743315062299e-05,
"loss": 1.4423,
"step": 2875
},
{
"epoch": 3.0573248407643314,
"grad_norm": 2.065654993057251,
"learning_rate": 1.64390509375922e-05,
"loss": 1.4783,
"step": 2880
},
{
"epoch": 3.0626326963906583,
"grad_norm": 2.1610610485076904,
"learning_rate": 1.6360763942648056e-05,
"loss": 1.4743,
"step": 2885
},
{
"epoch": 3.067940552016985,
"grad_norm": 2.189526319503784,
"learning_rate": 1.628257303652786e-05,
"loss": 1.444,
"step": 2890
},
{
"epoch": 3.073248407643312,
"grad_norm": 2.0830533504486084,
"learning_rate": 1.620447908890022e-05,
"loss": 1.3342,
"step": 2895
},
{
"epoch": 3.0785562632696393,
"grad_norm": 2.0287106037139893,
"learning_rate": 1.61264829683553e-05,
"loss": 1.4212,
"step": 2900
},
{
"epoch": 3.083864118895966,
"grad_norm": 1.9034909009933472,
"learning_rate": 1.604858554239521e-05,
"loss": 1.4566,
"step": 2905
},
{
"epoch": 3.089171974522293,
"grad_norm": 2.098928451538086,
"learning_rate": 1.597078767742434e-05,
"loss": 1.4257,
"step": 2910
},
{
"epoch": 3.09447983014862,
"grad_norm": 2.230280637741089,
"learning_rate": 1.589309023873974e-05,
"loss": 1.4142,
"step": 2915
},
{
"epoch": 3.099787685774947,
"grad_norm": 2.3032426834106445,
"learning_rate": 1.581549409052145e-05,
"loss": 1.4048,
"step": 2920
},
{
"epoch": 3.105095541401274,
"grad_norm": 2.5375571250915527,
"learning_rate": 1.5738000095822948e-05,
"loss": 1.3517,
"step": 2925
},
{
"epoch": 3.110403397027601,
"grad_norm": 1.7900398969650269,
"learning_rate": 1.5660609116561493e-05,
"loss": 1.3255,
"step": 2930
},
{
"epoch": 3.1157112526539277,
"grad_norm": 2.3134543895721436,
"learning_rate": 1.5583322013508604e-05,
"loss": 1.4345,
"step": 2935
},
{
"epoch": 3.121019108280255,
"grad_norm": 2.1048059463500977,
"learning_rate": 1.5506139646280427e-05,
"loss": 1.1971,
"step": 2940
},
{
"epoch": 3.126326963906582,
"grad_norm": 2.4140281677246094,
"learning_rate": 1.5429062873328194e-05,
"loss": 1.4395,
"step": 2945
},
{
"epoch": 3.1316348195329087,
"grad_norm": 2.161916971206665,
"learning_rate": 1.535209255192869e-05,
"loss": 1.3569,
"step": 2950
},
{
"epoch": 3.1369426751592355,
"grad_norm": 2.458399534225464,
"learning_rate": 1.52752295381747e-05,
"loss": 1.4479,
"step": 2955
},
{
"epoch": 3.142250530785563,
"grad_norm": 2.1411702632904053,
"learning_rate": 1.5198474686965495e-05,
"loss": 1.4861,
"step": 2960
},
{
"epoch": 3.1475583864118897,
"grad_norm": 2.147722005844116,
"learning_rate": 1.5121828851997319e-05,
"loss": 1.3144,
"step": 2965
},
{
"epoch": 3.1528662420382165,
"grad_norm": 2.137479782104492,
"learning_rate": 1.5045292885753894e-05,
"loss": 1.4583,
"step": 2970
},
{
"epoch": 3.1581740976645434,
"grad_norm": 2.3600449562072754,
"learning_rate": 1.4968867639496956e-05,
"loss": 1.4061,
"step": 2975
},
{
"epoch": 3.1634819532908707,
"grad_norm": 2.391444206237793,
"learning_rate": 1.4892553963256745e-05,
"loss": 1.5864,
"step": 2980
},
{
"epoch": 3.1687898089171975,
"grad_norm": 2.2082204818725586,
"learning_rate": 1.4816352705822612e-05,
"loss": 1.2608,
"step": 2985
},
{
"epoch": 3.1740976645435244,
"grad_norm": 2.3388137817382812,
"learning_rate": 1.4740264714733504e-05,
"loss": 1.5217,
"step": 2990
},
{
"epoch": 3.1794055201698512,
"grad_norm": 2.470644950866699,
"learning_rate": 1.4664290836268613e-05,
"loss": 1.471,
"step": 2995
},
{
"epoch": 3.1847133757961785,
"grad_norm": 2.1400668621063232,
"learning_rate": 1.4588431915437906e-05,
"loss": 1.4109,
"step": 3000
},
{
"epoch": 3.1900212314225054,
"grad_norm": 2.219452142715454,
"learning_rate": 1.4512688795972756e-05,
"loss": 1.5468,
"step": 3005
},
{
"epoch": 3.1953290870488322,
"grad_norm": 2.131532907485962,
"learning_rate": 1.4437062320316558e-05,
"loss": 1.3587,
"step": 3010
},
{
"epoch": 3.200636942675159,
"grad_norm": 2.2039008140563965,
"learning_rate": 1.4361553329615324e-05,
"loss": 1.4387,
"step": 3015
},
{
"epoch": 3.2059447983014864,
"grad_norm": 2.1974165439605713,
"learning_rate": 1.428616266370838e-05,
"loss": 1.4334,
"step": 3020
},
{
"epoch": 3.2112526539278132,
"grad_norm": 2.232050895690918,
"learning_rate": 1.4210891161118992e-05,
"loss": 1.4696,
"step": 3025
},
{
"epoch": 3.21656050955414,
"grad_norm": 2.1611685752868652,
"learning_rate": 1.4135739659045053e-05,
"loss": 1.3681,
"step": 3030
},
{
"epoch": 3.221868365180467,
"grad_norm": 2.1293656826019287,
"learning_rate": 1.4060708993349738e-05,
"loss": 1.3466,
"step": 3035
},
{
"epoch": 3.2271762208067942,
"grad_norm": 2.123840570449829,
"learning_rate": 1.3985799998552267e-05,
"loss": 1.4368,
"step": 3040
},
{
"epoch": 3.232484076433121,
"grad_norm": 2.4953863620758057,
"learning_rate": 1.3911013507818581e-05,
"loss": 1.3601,
"step": 3045
},
{
"epoch": 3.237791932059448,
"grad_norm": 2.3729612827301025,
"learning_rate": 1.3836350352952085e-05,
"loss": 1.4593,
"step": 3050
},
{
"epoch": 3.243099787685775,
"grad_norm": 2.4012491703033447,
"learning_rate": 1.3761811364384378e-05,
"loss": 1.4123,
"step": 3055
},
{
"epoch": 3.248407643312102,
"grad_norm": 2.420816659927368,
"learning_rate": 1.3687397371166055e-05,
"loss": 1.426,
"step": 3060
},
{
"epoch": 3.253715498938429,
"grad_norm": 2.3156495094299316,
"learning_rate": 1.3613109200957469e-05,
"loss": 1.396,
"step": 3065
},
{
"epoch": 3.259023354564756,
"grad_norm": 2.5136337280273438,
"learning_rate": 1.3538947680019514e-05,
"loss": 1.3537,
"step": 3070
},
{
"epoch": 3.2643312101910826,
"grad_norm": 2.1792850494384766,
"learning_rate": 1.3464913633204434e-05,
"loss": 1.4983,
"step": 3075
},
{
"epoch": 3.26963906581741,
"grad_norm": 2.5874316692352295,
"learning_rate": 1.3391007883946669e-05,
"loss": 1.4165,
"step": 3080
},
{
"epoch": 3.274946921443737,
"grad_norm": 2.1205999851226807,
"learning_rate": 1.3317231254253687e-05,
"loss": 1.3347,
"step": 3085
},
{
"epoch": 3.2802547770700636,
"grad_norm": 2.6099026203155518,
"learning_rate": 1.3243584564696848e-05,
"loss": 1.4395,
"step": 3090
},
{
"epoch": 3.2855626326963905,
"grad_norm": 2.8517844676971436,
"learning_rate": 1.3170068634402236e-05,
"loss": 1.4585,
"step": 3095
},
{
"epoch": 3.290870488322718,
"grad_norm": 2.137800931930542,
"learning_rate": 1.3096684281041613e-05,
"loss": 1.4397,
"step": 3100
},
{
"epoch": 3.2961783439490446,
"grad_norm": 2.1291749477386475,
"learning_rate": 1.3023432320823287e-05,
"loss": 1.4387,
"step": 3105
},
{
"epoch": 3.3014861995753715,
"grad_norm": 1.9797881841659546,
"learning_rate": 1.2950313568483036e-05,
"loss": 1.266,
"step": 3110
},
{
"epoch": 3.3067940552016983,
"grad_norm": 2.3177640438079834,
"learning_rate": 1.2877328837275044e-05,
"loss": 1.3107,
"step": 3115
},
{
"epoch": 3.3121019108280256,
"grad_norm": 2.499016046524048,
"learning_rate": 1.2804478938962867e-05,
"loss": 1.397,
"step": 3120
},
{
"epoch": 3.3174097664543525,
"grad_norm": 2.1002349853515625,
"learning_rate": 1.2731764683810398e-05,
"loss": 1.3943,
"step": 3125
},
{
"epoch": 3.3227176220806793,
"grad_norm": 2.446040391921997,
"learning_rate": 1.265918688057288e-05,
"loss": 1.4008,
"step": 3130
},
{
"epoch": 3.328025477707006,
"grad_norm": 2.4573974609375,
"learning_rate": 1.2586746336487835e-05,
"loss": 1.533,
"step": 3135
},
{
"epoch": 3.3333333333333335,
"grad_norm": 2.1954808235168457,
"learning_rate": 1.2514443857266175e-05,
"loss": 1.3354,
"step": 3140
},
{
"epoch": 3.3386411889596603,
"grad_norm": 2.187451124191284,
"learning_rate": 1.2442280247083198e-05,
"loss": 1.3633,
"step": 3145
},
{
"epoch": 3.343949044585987,
"grad_norm": 5.623300552368164,
"learning_rate": 1.2370256308569656e-05,
"loss": 1.4056,
"step": 3150
},
{
"epoch": 3.349256900212314,
"grad_norm": 2.10553240776062,
"learning_rate": 1.2298372842802786e-05,
"loss": 1.4899,
"step": 3155
},
{
"epoch": 3.3545647558386413,
"grad_norm": 2.105638265609741,
"learning_rate": 1.2226630649297466e-05,
"loss": 1.4447,
"step": 3160
},
{
"epoch": 3.359872611464968,
"grad_norm": 3.875722885131836,
"learning_rate": 1.2155030525997286e-05,
"loss": 1.3026,
"step": 3165
},
{
"epoch": 3.365180467091295,
"grad_norm": 2.115307092666626,
"learning_rate": 1.208357326926568e-05,
"loss": 1.3592,
"step": 3170
},
{
"epoch": 3.370488322717622,
"grad_norm": 3.39106822013855,
"learning_rate": 1.2012259673877046e-05,
"loss": 1.2692,
"step": 3175
},
{
"epoch": 3.375796178343949,
"grad_norm": 1.9469239711761475,
"learning_rate": 1.1941090533007948e-05,
"loss": 1.4078,
"step": 3180
},
{
"epoch": 3.381104033970276,
"grad_norm": 2.1060116291046143,
"learning_rate": 1.1870066638228264e-05,
"loss": 1.3936,
"step": 3185
},
{
"epoch": 3.386411889596603,
"grad_norm": 2.3175833225250244,
"learning_rate": 1.1799188779492406e-05,
"loss": 1.3777,
"step": 3190
},
{
"epoch": 3.3917197452229297,
"grad_norm": 2.24416446685791,
"learning_rate": 1.1728457745130483e-05,
"loss": 1.4643,
"step": 3195
},
{
"epoch": 3.397027600849257,
"grad_norm": 2.251556158065796,
"learning_rate": 1.1657874321839602e-05,
"loss": 1.434,
"step": 3200
},
{
"epoch": 3.402335456475584,
"grad_norm": 2.4189612865448,
"learning_rate": 1.1587439294675068e-05,
"loss": 1.4088,
"step": 3205
},
{
"epoch": 3.4076433121019107,
"grad_norm": 2.213326930999756,
"learning_rate": 1.1517153447041687e-05,
"loss": 1.4231,
"step": 3210
},
{
"epoch": 3.412951167728238,
"grad_norm": 2.072181463241577,
"learning_rate": 1.1447017560684996e-05,
"loss": 1.4474,
"step": 3215
},
{
"epoch": 3.418259023354565,
"grad_norm": 2.4454848766326904,
"learning_rate": 1.1377032415682648e-05,
"loss": 1.4199,
"step": 3220
},
{
"epoch": 3.4235668789808917,
"grad_norm": 2.853790283203125,
"learning_rate": 1.130719879043567e-05,
"loss": 1.4094,
"step": 3225
},
{
"epoch": 3.4288747346072186,
"grad_norm": 2.308567762374878,
"learning_rate": 1.1237517461659846e-05,
"loss": 1.3363,
"step": 3230
},
{
"epoch": 3.4341825902335454,
"grad_norm": 2.467379093170166,
"learning_rate": 1.1167989204377036e-05,
"loss": 1.4102,
"step": 3235
},
{
"epoch": 3.4394904458598727,
"grad_norm": 2.1049180030822754,
"learning_rate": 1.1098614791906606e-05,
"loss": 1.2906,
"step": 3240
},
{
"epoch": 3.4447983014861996,
"grad_norm": 2.4943039417266846,
"learning_rate": 1.1029394995856792e-05,
"loss": 1.2913,
"step": 3245
},
{
"epoch": 3.4501061571125264,
"grad_norm": 2.1266067028045654,
"learning_rate": 1.0960330586116138e-05,
"loss": 1.4101,
"step": 3250
},
{
"epoch": 3.4554140127388537,
"grad_norm": 2.2753243446350098,
"learning_rate": 1.08914223308449e-05,
"loss": 1.4643,
"step": 3255
},
{
"epoch": 3.4607218683651806,
"grad_norm": 2.324915885925293,
"learning_rate": 1.0822670996466547e-05,
"loss": 1.4543,
"step": 3260
},
{
"epoch": 3.4660297239915074,
"grad_norm": 2.1122493743896484,
"learning_rate": 1.0754077347659208e-05,
"loss": 1.5137,
"step": 3265
},
{
"epoch": 3.4713375796178343,
"grad_norm": 2.3270649909973145,
"learning_rate": 1.0685642147347183e-05,
"loss": 1.4333,
"step": 3270
},
{
"epoch": 3.476645435244161,
"grad_norm": 2.105161190032959,
"learning_rate": 1.0617366156692423e-05,
"loss": 1.4143,
"step": 3275
},
{
"epoch": 3.4819532908704884,
"grad_norm": 2.328491687774658,
"learning_rate": 1.0549250135086114e-05,
"loss": 1.4786,
"step": 3280
},
{
"epoch": 3.4872611464968153,
"grad_norm": 2.6255910396575928,
"learning_rate": 1.0481294840140199e-05,
"loss": 1.3698,
"step": 3285
},
{
"epoch": 3.492569002123142,
"grad_norm": 2.3305420875549316,
"learning_rate": 1.0413501027678965e-05,
"loss": 1.3544,
"step": 3290
},
{
"epoch": 3.4978768577494694,
"grad_norm": 2.7587552070617676,
"learning_rate": 1.0345869451730608e-05,
"loss": 1.4469,
"step": 3295
},
{
"epoch": 3.5031847133757963,
"grad_norm": 16.741992950439453,
"learning_rate": 1.0278400864518892e-05,
"loss": 1.5186,
"step": 3300
},
{
"epoch": 3.508492569002123,
"grad_norm": 2.2464590072631836,
"learning_rate": 1.0211096016454749e-05,
"loss": 1.4908,
"step": 3305
},
{
"epoch": 3.51380042462845,
"grad_norm": 2.071381092071533,
"learning_rate": 1.0143955656127957e-05,
"loss": 1.3258,
"step": 3310
},
{
"epoch": 3.519108280254777,
"grad_norm": 2.1464314460754395,
"learning_rate": 1.0076980530298769e-05,
"loss": 1.4744,
"step": 3315
},
{
"epoch": 3.524416135881104,
"grad_norm": 1.953969120979309,
"learning_rate": 1.0010171383889664e-05,
"loss": 1.4532,
"step": 3320
},
{
"epoch": 3.529723991507431,
"grad_norm": 2.3309433460235596,
"learning_rate": 9.943528959977027e-06,
"loss": 1.4666,
"step": 3325
},
{
"epoch": 3.535031847133758,
"grad_norm": 2.290804862976074,
"learning_rate": 9.877053999782907e-06,
"loss": 1.3826,
"step": 3330
},
{
"epoch": 3.540339702760085,
"grad_norm": 2.3279714584350586,
"learning_rate": 9.81074724266672e-06,
"loss": 1.4758,
"step": 3335
},
{
"epoch": 3.545647558386412,
"grad_norm": 2.5002360343933105,
"learning_rate": 9.74460942611711e-06,
"loss": 1.4526,
"step": 3340
},
{
"epoch": 3.550955414012739,
"grad_norm": 2.25785756111145,
"learning_rate": 9.678641285743673e-06,
"loss": 1.3462,
"step": 3345
},
{
"epoch": 3.5562632696390657,
"grad_norm": 2.0230438709259033,
"learning_rate": 9.612843555268813e-06,
"loss": 1.2952,
"step": 3350
},
{
"epoch": 3.5615711252653925,
"grad_norm": 2.1412694454193115,
"learning_rate": 9.547216966519577e-06,
"loss": 1.3239,
"step": 3355
},
{
"epoch": 3.56687898089172,
"grad_norm": 14.145221710205078,
"learning_rate": 9.481762249419482e-06,
"loss": 1.4509,
"step": 3360
},
{
"epoch": 3.5721868365180467,
"grad_norm": 2.359635353088379,
"learning_rate": 9.416480131980455e-06,
"loss": 1.3237,
"step": 3365
},
{
"epoch": 3.5774946921443735,
"grad_norm": 2.2716643810272217,
"learning_rate": 9.35137134029469e-06,
"loss": 1.3844,
"step": 3370
},
{
"epoch": 3.582802547770701,
"grad_norm": 2.952258825302124,
"learning_rate": 9.286436598526601e-06,
"loss": 1.3404,
"step": 3375
},
{
"epoch": 3.5881104033970277,
"grad_norm": 2.1978461742401123,
"learning_rate": 9.221676628904724e-06,
"loss": 1.4622,
"step": 3380
},
{
"epoch": 3.5934182590233545,
"grad_norm": 2.3678669929504395,
"learning_rate": 9.157092151713742e-06,
"loss": 1.3749,
"step": 3385
},
{
"epoch": 3.5987261146496814,
"grad_norm": 5.995536804199219,
"learning_rate": 9.092683885286438e-06,
"loss": 1.4892,
"step": 3390
},
{
"epoch": 3.6040339702760082,
"grad_norm": 2.5272786617279053,
"learning_rate": 9.028452545995714e-06,
"loss": 1.4309,
"step": 3395
},
{
"epoch": 3.6093418259023355,
"grad_norm": 2.5360288619995117,
"learning_rate": 8.964398848246603e-06,
"loss": 1.3416,
"step": 3400
},
{
"epoch": 3.6146496815286624,
"grad_norm": 2.71073842048645,
"learning_rate": 8.900523504468366e-06,
"loss": 1.3541,
"step": 3405
},
{
"epoch": 3.6199575371549892,
"grad_norm": 2.1415514945983887,
"learning_rate": 8.836827225106536e-06,
"loss": 1.4829,
"step": 3410
},
{
"epoch": 3.6252653927813165,
"grad_norm": 2.409083604812622,
"learning_rate": 8.773310718615036e-06,
"loss": 1.4849,
"step": 3415
},
{
"epoch": 3.6305732484076434,
"grad_norm": 2.1981987953186035,
"learning_rate": 8.709974691448253e-06,
"loss": 1.4821,
"step": 3420
},
{
"epoch": 3.6358811040339702,
"grad_norm": 2.5032248497009277,
"learning_rate": 8.64681984805325e-06,
"loss": 1.3284,
"step": 3425
},
{
"epoch": 3.641188959660297,
"grad_norm": 2.0410494804382324,
"learning_rate": 8.583846890861886e-06,
"loss": 1.4164,
"step": 3430
},
{
"epoch": 3.646496815286624,
"grad_norm": 2.2673985958099365,
"learning_rate": 8.521056520283017e-06,
"loss": 1.4347,
"step": 3435
},
{
"epoch": 3.6518046709129512,
"grad_norm": 2.1072731018066406,
"learning_rate": 8.458449434694679e-06,
"loss": 1.4396,
"step": 3440
},
{
"epoch": 3.657112526539278,
"grad_norm": 2.3610174655914307,
"learning_rate": 8.396026330436374e-06,
"loss": 1.4165,
"step": 3445
},
{
"epoch": 3.662420382165605,
"grad_norm": 2.143754482269287,
"learning_rate": 8.333787901801279e-06,
"loss": 1.3376,
"step": 3450
},
{
"epoch": 3.6677282377919322,
"grad_norm": 2.567183017730713,
"learning_rate": 8.271734841028553e-06,
"loss": 1.3744,
"step": 3455
},
{
"epoch": 3.673036093418259,
"grad_norm": 2.2236099243164062,
"learning_rate": 8.209867838295596e-06,
"loss": 1.3606,
"step": 3460
},
{
"epoch": 3.678343949044586,
"grad_norm": 2.2553422451019287,
"learning_rate": 8.148187581710423e-06,
"loss": 1.355,
"step": 3465
},
{
"epoch": 3.683651804670913,
"grad_norm": 2.2291765213012695,
"learning_rate": 8.086694757303991e-06,
"loss": 1.2048,
"step": 3470
},
{
"epoch": 3.6889596602972397,
"grad_norm": 2.602382183074951,
"learning_rate": 8.025390049022562e-06,
"loss": 1.3158,
"step": 3475
},
{
"epoch": 3.694267515923567,
"grad_norm": 2.460057258605957,
"learning_rate": 7.964274138720081e-06,
"loss": 1.4712,
"step": 3480
},
{
"epoch": 3.699575371549894,
"grad_norm": 2.3745005130767822,
"learning_rate": 7.903347706150636e-06,
"loss": 1.3811,
"step": 3485
},
{
"epoch": 3.7048832271762207,
"grad_norm": 2.286000967025757,
"learning_rate": 7.842611428960861e-06,
"loss": 1.573,
"step": 3490
},
{
"epoch": 3.710191082802548,
"grad_norm": 2.2103700637817383,
"learning_rate": 7.782065982682423e-06,
"loss": 1.4452,
"step": 3495
},
{
"epoch": 3.715498938428875,
"grad_norm": 2.0639681816101074,
"learning_rate": 7.721712040724469e-06,
"loss": 1.5869,
"step": 3500
},
{
"epoch": 3.7208067940552016,
"grad_norm": 1.9713996648788452,
"learning_rate": 7.661550274366189e-06,
"loss": 1.4913,
"step": 3505
},
{
"epoch": 3.7261146496815285,
"grad_norm": 2.4420719146728516,
"learning_rate": 7.601581352749309e-06,
"loss": 1.5407,
"step": 3510
},
{
"epoch": 3.7314225053078554,
"grad_norm": 2.3371787071228027,
"learning_rate": 7.5418059428706865e-06,
"loss": 1.4896,
"step": 3515
},
{
"epoch": 3.7367303609341826,
"grad_norm": 5.171130657196045,
"learning_rate": 7.482224709574829e-06,
"loss": 1.4157,
"step": 3520
},
{
"epoch": 3.7420382165605095,
"grad_norm": 2.2224090099334717,
"learning_rate": 7.4228383155465705e-06,
"loss": 1.4068,
"step": 3525
},
{
"epoch": 3.7473460721868364,
"grad_norm": 2.283038854598999,
"learning_rate": 7.363647421303666e-06,
"loss": 1.3288,
"step": 3530
},
{
"epoch": 3.7526539278131636,
"grad_norm": 2.160877227783203,
"learning_rate": 7.304652685189434e-06,
"loss": 1.4058,
"step": 3535
},
{
"epoch": 3.7579617834394905,
"grad_norm": 2.3264095783233643,
"learning_rate": 7.2458547633654675e-06,
"loss": 1.4104,
"step": 3540
},
{
"epoch": 3.7632696390658174,
"grad_norm": 2.3588404655456543,
"learning_rate": 7.1872543098043035e-06,
"loss": 1.5082,
"step": 3545
},
{
"epoch": 3.7685774946921446,
"grad_norm": 2.169647216796875,
"learning_rate": 7.128851976282172e-06,
"loss": 1.3996,
"step": 3550
},
{
"epoch": 3.7738853503184715,
"grad_norm": 2.3636116981506348,
"learning_rate": 7.070648412371725e-06,
"loss": 1.3886,
"step": 3555
},
{
"epoch": 3.7791932059447984,
"grad_norm": 2.253528118133545,
"learning_rate": 7.012644265434834e-06,
"loss": 1.4162,
"step": 3560
},
{
"epoch": 3.784501061571125,
"grad_norm": 2.413527727127075,
"learning_rate": 6.95484018061538e-06,
"loss": 1.4416,
"step": 3565
},
{
"epoch": 3.789808917197452,
"grad_norm": 2.313779354095459,
"learning_rate": 6.897236800832082e-06,
"loss": 1.4022,
"step": 3570
},
{
"epoch": 3.7951167728237793,
"grad_norm": 2.22562575340271,
"learning_rate": 6.8398347667713246e-06,
"loss": 1.4004,
"step": 3575
},
{
"epoch": 3.800424628450106,
"grad_norm": 2.1902172565460205,
"learning_rate": 6.782634716880068e-06,
"loss": 1.4268,
"step": 3580
},
{
"epoch": 3.805732484076433,
"grad_norm": 2.4533374309539795,
"learning_rate": 6.725637287358724e-06,
"loss": 1.3675,
"step": 3585
},
{
"epoch": 3.8110403397027603,
"grad_norm": 2.2463905811309814,
"learning_rate": 6.668843112154088e-06,
"loss": 1.3991,
"step": 3590
},
{
"epoch": 3.816348195329087,
"grad_norm": 2.4668920040130615,
"learning_rate": 6.612252822952267e-06,
"loss": 1.4839,
"step": 3595
},
{
"epoch": 3.821656050955414,
"grad_norm": 2.244965076446533,
"learning_rate": 6.555867049171688e-06,
"loss": 1.525,
"step": 3600
},
{
"epoch": 3.826963906581741,
"grad_norm": 2.3175225257873535,
"learning_rate": 6.499686417956083e-06,
"loss": 1.4035,
"step": 3605
},
{
"epoch": 3.8322717622080678,
"grad_norm": 2.565484046936035,
"learning_rate": 6.443711554167506e-06,
"loss": 1.4154,
"step": 3610
},
{
"epoch": 3.837579617834395,
"grad_norm": 2.11671781539917,
"learning_rate": 6.38794308037938e-06,
"loss": 1.2861,
"step": 3615
},
{
"epoch": 3.842887473460722,
"grad_norm": 2.795337438583374,
"learning_rate": 6.332381616869593e-06,
"loss": 1.4971,
"step": 3620
},
{
"epoch": 3.8481953290870488,
"grad_norm": 2.443300247192383,
"learning_rate": 6.2770277816135814e-06,
"loss": 1.4387,
"step": 3625
},
{
"epoch": 3.853503184713376,
"grad_norm": 2.138455629348755,
"learning_rate": 6.221882190277472e-06,
"loss": 1.3503,
"step": 3630
},
{
"epoch": 3.858811040339703,
"grad_norm": 2.1262552738189697,
"learning_rate": 6.166945456211204e-06,
"loss": 1.4727,
"step": 3635
},
{
"epoch": 3.8641188959660298,
"grad_norm": 2.3657939434051514,
"learning_rate": 6.112218190441746e-06,
"loss": 1.4265,
"step": 3640
},
{
"epoch": 3.8694267515923566,
"grad_norm": 2.300050735473633,
"learning_rate": 6.057701001666275e-06,
"loss": 1.2813,
"step": 3645
},
{
"epoch": 3.8747346072186835,
"grad_norm": 2.3793447017669678,
"learning_rate": 6.0033944962454205e-06,
"loss": 1.4436,
"step": 3650
},
{
"epoch": 3.8800424628450108,
"grad_norm": 2.3621938228607178,
"learning_rate": 5.949299278196493e-06,
"loss": 1.2759,
"step": 3655
},
{
"epoch": 3.8853503184713376,
"grad_norm": 2.4751360416412354,
"learning_rate": 5.8954159491868085e-06,
"loss": 1.2852,
"step": 3660
},
{
"epoch": 3.8906581740976645,
"grad_norm": 2.046066999435425,
"learning_rate": 5.841745108526958e-06,
"loss": 1.3721,
"step": 3665
},
{
"epoch": 3.8959660297239918,
"grad_norm": 2.4861631393432617,
"learning_rate": 5.7882873531641705e-06,
"loss": 1.4705,
"step": 3670
},
{
"epoch": 3.9012738853503186,
"grad_norm": 2.280402421951294,
"learning_rate": 5.735043277675642e-06,
"loss": 1.5398,
"step": 3675
},
{
"epoch": 3.9065817409766455,
"grad_norm": 9.118425369262695,
"learning_rate": 5.682013474261957e-06,
"loss": 1.5439,
"step": 3680
},
{
"epoch": 3.9118895966029723,
"grad_norm": 2.5025949478149414,
"learning_rate": 5.629198532740482e-06,
"loss": 1.3915,
"step": 3685
},
{
"epoch": 3.917197452229299,
"grad_norm": 2.5477778911590576,
"learning_rate": 5.576599040538813e-06,
"loss": 1.4886,
"step": 3690
},
{
"epoch": 3.9225053078556265,
"grad_norm": 2.4313480854034424,
"learning_rate": 5.524215582688216e-06,
"loss": 1.4189,
"step": 3695
},
{
"epoch": 3.9278131634819533,
"grad_norm": 2.327481269836426,
"learning_rate": 5.472048741817165e-06,
"loss": 1.4373,
"step": 3700
},
{
"epoch": 3.93312101910828,
"grad_norm": 2.1773698329925537,
"learning_rate": 5.4200990981448375e-06,
"loss": 1.2943,
"step": 3705
},
{
"epoch": 3.9384288747346075,
"grad_norm": 2.5786592960357666,
"learning_rate": 5.368367229474655e-06,
"loss": 1.5092,
"step": 3710
},
{
"epoch": 3.9437367303609343,
"grad_norm": 2.2224104404449463,
"learning_rate": 5.316853711187858e-06,
"loss": 1.426,
"step": 3715
},
{
"epoch": 3.949044585987261,
"grad_norm": 2.4788384437561035,
"learning_rate": 5.265559116237123e-06,
"loss": 1.2464,
"step": 3720
},
{
"epoch": 3.954352441613588,
"grad_norm": 2.4742305278778076,
"learning_rate": 5.214484015140178e-06,
"loss": 1.4523,
"step": 3725
},
{
"epoch": 3.959660297239915,
"grad_norm": 2.3287322521209717,
"learning_rate": 5.163628975973458e-06,
"loss": 1.5333,
"step": 3730
},
{
"epoch": 3.964968152866242,
"grad_norm": 2.1568734645843506,
"learning_rate": 5.112994564365775e-06,
"loss": 1.3845,
"step": 3735
},
{
"epoch": 3.970276008492569,
"grad_norm": 2.289116144180298,
"learning_rate": 5.062581343492051e-06,
"loss": 1.382,
"step": 3740
},
{
"epoch": 3.975583864118896,
"grad_norm": 2.5056374073028564,
"learning_rate": 5.012389874067039e-06,
"loss": 1.3863,
"step": 3745
},
{
"epoch": 3.980891719745223,
"grad_norm": 2.403012990951538,
"learning_rate": 4.962420714339094e-06,
"loss": 1.3269,
"step": 3750
},
{
"epoch": 3.98619957537155,
"grad_norm": 5.482332229614258,
"learning_rate": 4.9126744200839456e-06,
"loss": 1.5098,
"step": 3755
},
{
"epoch": 3.991507430997877,
"grad_norm": 2.4922893047332764,
"learning_rate": 4.8631515445985404e-06,
"loss": 1.4779,
"step": 3760
},
{
"epoch": 3.9968152866242037,
"grad_norm": 2.413883686065674,
"learning_rate": 4.813852638694874e-06,
"loss": 1.4107,
"step": 3765
},
{
"epoch": 4.002123142250531,
"grad_norm": 2.4241840839385986,
"learning_rate": 4.76477825069388e-06,
"loss": 1.5316,
"step": 3770
},
{
"epoch": 4.007430997876858,
"grad_norm": 2.759788751602173,
"learning_rate": 4.715928926419292e-06,
"loss": 1.2973,
"step": 3775
},
{
"epoch": 4.012738853503185,
"grad_norm": 2.396528959274292,
"learning_rate": 4.6673052091916276e-06,
"loss": 1.3054,
"step": 3780
},
{
"epoch": 4.018046709129512,
"grad_norm": 2.296591281890869,
"learning_rate": 4.618907639822107e-06,
"loss": 1.3189,
"step": 3785
},
{
"epoch": 4.023354564755839,
"grad_norm": 1.9780110120773315,
"learning_rate": 4.570736756606659e-06,
"loss": 1.4215,
"step": 3790
},
{
"epoch": 4.028662420382165,
"grad_norm": 2.2614572048187256,
"learning_rate": 4.522793095319899e-06,
"loss": 1.4185,
"step": 3795
},
{
"epoch": 4.033970276008493,
"grad_norm": 2.025022029876709,
"learning_rate": 4.475077189209218e-06,
"loss": 1.4099,
"step": 3800
},
{
"epoch": 4.03927813163482,
"grad_norm": 2.2896909713745117,
"learning_rate": 4.427589568988824e-06,
"loss": 1.4216,
"step": 3805
},
{
"epoch": 4.044585987261146,
"grad_norm": 2.432157516479492,
"learning_rate": 4.380330762833848e-06,
"loss": 1.3377,
"step": 3810
},
{
"epoch": 4.049893842887474,
"grad_norm": 2.016071081161499,
"learning_rate": 4.333301296374442e-06,
"loss": 1.3515,
"step": 3815
},
{
"epoch": 4.055201698513801,
"grad_norm": 1.8489809036254883,
"learning_rate": 4.286501692689984e-06,
"loss": 1.2477,
"step": 3820
},
{
"epoch": 4.060509554140127,
"grad_norm": 2.454228162765503,
"learning_rate": 4.239932472303215e-06,
"loss": 1.4972,
"step": 3825
},
{
"epoch": 4.065817409766455,
"grad_norm": 2.3903145790100098,
"learning_rate": 4.193594153174485e-06,
"loss": 1.2439,
"step": 3830
},
{
"epoch": 4.071125265392781,
"grad_norm": 2.542217254638672,
"learning_rate": 4.1474872506959416e-06,
"loss": 1.406,
"step": 3835
},
{
"epoch": 4.076433121019108,
"grad_norm": 2.2244298458099365,
"learning_rate": 4.101612277685856e-06,
"loss": 1.3496,
"step": 3840
},
{
"epoch": 4.081740976645436,
"grad_norm": 2.2349693775177,
"learning_rate": 4.0559697443828895e-06,
"loss": 1.4276,
"step": 3845
},
{
"epoch": 4.087048832271762,
"grad_norm": 2.4000797271728516,
"learning_rate": 4.0105601584404214e-06,
"loss": 1.4705,
"step": 3850
},
{
"epoch": 4.092356687898089,
"grad_norm": 2.4222750663757324,
"learning_rate": 3.965384024920885e-06,
"loss": 1.3891,
"step": 3855
},
{
"epoch": 4.097664543524417,
"grad_norm": 2.3922958374023438,
"learning_rate": 3.920441846290193e-06,
"loss": 1.4135,
"step": 3860
},
{
"epoch": 4.102972399150743,
"grad_norm": 2.559807538986206,
"learning_rate": 3.8757341224121085e-06,
"loss": 1.4211,
"step": 3865
},
{
"epoch": 4.10828025477707,
"grad_norm": 2.560068130493164,
"learning_rate": 3.831261350542712e-06,
"loss": 1.4407,
"step": 3870
},
{
"epoch": 4.113588110403397,
"grad_norm": 2.4018898010253906,
"learning_rate": 3.7870240253248563e-06,
"loss": 1.3578,
"step": 3875
},
{
"epoch": 4.118895966029724,
"grad_norm": 2.588036298751831,
"learning_rate": 3.7430226387826535e-06,
"loss": 1.5336,
"step": 3880
},
{
"epoch": 4.124203821656051,
"grad_norm": 2.196746587753296,
"learning_rate": 3.6992576803160374e-06,
"loss": 1.2888,
"step": 3885
},
{
"epoch": 4.129511677282378,
"grad_norm": 2.288543701171875,
"learning_rate": 3.6557296366952878e-06,
"loss": 1.3779,
"step": 3890
},
{
"epoch": 4.134819532908705,
"grad_norm": 2.3535075187683105,
"learning_rate": 3.6124389920556445e-06,
"loss": 1.3858,
"step": 3895
},
{
"epoch": 4.140127388535032,
"grad_norm": 2.0725769996643066,
"learning_rate": 3.5693862278918797e-06,
"loss": 1.5102,
"step": 3900
},
{
"epoch": 4.145435244161359,
"grad_norm": 2.4607093334198,
"learning_rate": 3.526571823052993e-06,
"loss": 1.2617,
"step": 3905
},
{
"epoch": 4.150743099787686,
"grad_norm": 2.3538119792938232,
"learning_rate": 3.4839962537368516e-06,
"loss": 1.3718,
"step": 3910
},
{
"epoch": 4.156050955414012,
"grad_norm": 2.226527214050293,
"learning_rate": 3.4416599934849162e-06,
"loss": 1.4393,
"step": 3915
},
{
"epoch": 4.16135881104034,
"grad_norm": 2.4096546173095703,
"learning_rate": 3.3995635131769428e-06,
"loss": 1.4414,
"step": 3920
},
{
"epoch": 4.166666666666667,
"grad_norm": 2.072619676589966,
"learning_rate": 3.3577072810257766e-06,
"loss": 1.4062,
"step": 3925
},
{
"epoch": 4.171974522292993,
"grad_norm": 2.6206490993499756,
"learning_rate": 3.3160917625721376e-06,
"loss": 1.3057,
"step": 3930
},
{
"epoch": 4.177282377919321,
"grad_norm": 2.2108707427978516,
"learning_rate": 3.2747174206794295e-06,
"loss": 1.5061,
"step": 3935
},
{
"epoch": 4.182590233545648,
"grad_norm": 2.2947421073913574,
"learning_rate": 3.233584715528601e-06,
"loss": 1.3381,
"step": 3940
},
{
"epoch": 4.187898089171974,
"grad_norm": 2.2240118980407715,
"learning_rate": 3.1926941046130225e-06,
"loss": 1.347,
"step": 3945
},
{
"epoch": 4.193205944798302,
"grad_norm": 2.6407012939453125,
"learning_rate": 3.152046042733414e-06,
"loss": 1.3313,
"step": 3950
},
{
"epoch": 4.198513800424628,
"grad_norm": 2.520167112350464,
"learning_rate": 3.1116409819927695e-06,
"loss": 1.409,
"step": 3955
},
{
"epoch": 4.203821656050955,
"grad_norm": 2.168043851852417,
"learning_rate": 3.071479371791322e-06,
"loss": 1.3983,
"step": 3960
},
{
"epoch": 4.209129511677283,
"grad_norm": 2.150791883468628,
"learning_rate": 3.0315616588215635e-06,
"loss": 1.376,
"step": 3965
},
{
"epoch": 4.214437367303609,
"grad_norm": 2.6289355754852295,
"learning_rate": 2.991888287063277e-06,
"loss": 1.3597,
"step": 3970
},
{
"epoch": 4.219745222929936,
"grad_norm": 2.2437193393707275,
"learning_rate": 2.9524596977785867e-06,
"loss": 1.3839,
"step": 3975
},
{
"epoch": 4.225053078556264,
"grad_norm": 2.4536490440368652,
"learning_rate": 2.913276329507042e-06,
"loss": 1.403,
"step": 3980
},
{
"epoch": 4.23036093418259,
"grad_norm": 2.498270034790039,
"learning_rate": 2.874338618060765e-06,
"loss": 1.3648,
"step": 3985
},
{
"epoch": 4.235668789808917,
"grad_norm": 2.479475975036621,
"learning_rate": 2.835646996519595e-06,
"loss": 1.2893,
"step": 3990
},
{
"epoch": 4.240976645435244,
"grad_norm": 2.214371681213379,
"learning_rate": 2.7972018952262563e-06,
"loss": 1.2914,
"step": 3995
},
{
"epoch": 4.246284501061571,
"grad_norm": 2.644470691680908,
"learning_rate": 2.7590037417815824e-06,
"loss": 1.4778,
"step": 4000
},
{
"epoch": 4.251592356687898,
"grad_norm": 2.93762469291687,
"learning_rate": 2.721052961039766e-06,
"loss": 1.3126,
"step": 4005
},
{
"epoch": 4.256900212314225,
"grad_norm": 2.3123199939727783,
"learning_rate": 2.6833499751036247e-06,
"loss": 1.3483,
"step": 4010
},
{
"epoch": 4.262208067940552,
"grad_norm": 2.590116262435913,
"learning_rate": 2.6458952033199176e-06,
"loss": 1.3834,
"step": 4015
},
{
"epoch": 4.267515923566879,
"grad_norm": 2.3344857692718506,
"learning_rate": 2.6086890622746526e-06,
"loss": 1.3848,
"step": 4020
},
{
"epoch": 4.272823779193206,
"grad_norm": 2.472461223602295,
"learning_rate": 2.571731965788496e-06,
"loss": 1.2809,
"step": 4025
},
{
"epoch": 4.278131634819533,
"grad_norm": 2.2091104984283447,
"learning_rate": 2.535024324912133e-06,
"loss": 1.367,
"step": 4030
},
{
"epoch": 4.2834394904458595,
"grad_norm": 2.2722952365875244,
"learning_rate": 2.4985665479217213e-06,
"loss": 1.4383,
"step": 4035
},
{
"epoch": 4.288747346072187,
"grad_norm": 2.44112491607666,
"learning_rate": 2.4623590403143187e-06,
"loss": 1.3626,
"step": 4040
},
{
"epoch": 4.294055201698514,
"grad_norm": 2.245461940765381,
"learning_rate": 2.4264022048034155e-06,
"loss": 1.3627,
"step": 4045
},
{
"epoch": 4.2993630573248405,
"grad_norm": 2.2949862480163574,
"learning_rate": 2.3906964413144215e-06,
"loss": 1.3443,
"step": 4050
},
{
"epoch": 4.304670912951168,
"grad_norm": 2.4080650806427,
"learning_rate": 2.3552421469802354e-06,
"loss": 1.3183,
"step": 4055
},
{
"epoch": 4.309978768577495,
"grad_norm": 2.6764564514160156,
"learning_rate": 2.320039716136807e-06,
"loss": 1.5511,
"step": 4060
},
{
"epoch": 4.3152866242038215,
"grad_norm": 2.4801034927368164,
"learning_rate": 2.2850895403187856e-06,
"loss": 1.5182,
"step": 4065
},
{
"epoch": 4.320594479830149,
"grad_norm": 2.4023561477661133,
"learning_rate": 2.250392008255131e-06,
"loss": 1.3622,
"step": 4070
},
{
"epoch": 4.325902335456475,
"grad_norm": 2.485903739929199,
"learning_rate": 2.215947505864818e-06,
"loss": 1.3739,
"step": 4075
},
{
"epoch": 4.3312101910828025,
"grad_norm": 2.491157293319702,
"learning_rate": 2.181756416252512e-06,
"loss": 1.5562,
"step": 4080
},
{
"epoch": 4.33651804670913,
"grad_norm": 2.497166395187378,
"learning_rate": 2.147819119704339e-06,
"loss": 1.3947,
"step": 4085
},
{
"epoch": 4.341825902335456,
"grad_norm": 2.6698756217956543,
"learning_rate": 2.1141359936836414e-06,
"loss": 1.3789,
"step": 4090
},
{
"epoch": 4.3471337579617835,
"grad_norm": 2.5180952548980713,
"learning_rate": 2.0807074128267876e-06,
"loss": 1.422,
"step": 4095
},
{
"epoch": 4.352441613588111,
"grad_norm": 2.275834321975708,
"learning_rate": 2.0475337489389846e-06,
"loss": 1.3095,
"step": 4100
}
],
"logging_steps": 5,
"max_steps": 4710,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7323402312577434e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}