OpenELM-1_1B-SFT-2 / trainer_state.json
CharlesLi's picture
Model save
81464ae verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 4566,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004380201489268506,
"grad_norm": 244.058147496229,
"learning_rate": 4.37636761487965e-08,
"loss": 8.7812,
"step": 1
},
{
"epoch": 0.002190100744634253,
"grad_norm": 242.66751047909037,
"learning_rate": 2.188183807439825e-07,
"loss": 8.8047,
"step": 5
},
{
"epoch": 0.004380201489268506,
"grad_norm": 224.45025961352744,
"learning_rate": 4.37636761487965e-07,
"loss": 8.7406,
"step": 10
},
{
"epoch": 0.006570302233902759,
"grad_norm": 202.25250028934636,
"learning_rate": 6.564551422319475e-07,
"loss": 8.6438,
"step": 15
},
{
"epoch": 0.008760402978537012,
"grad_norm": 194.17623442189665,
"learning_rate": 8.7527352297593e-07,
"loss": 8.3891,
"step": 20
},
{
"epoch": 0.010950503723171266,
"grad_norm": 110.65671385292053,
"learning_rate": 1.0940919037199126e-06,
"loss": 7.8109,
"step": 25
},
{
"epoch": 0.013140604467805518,
"grad_norm": 65.62760076712048,
"learning_rate": 1.312910284463895e-06,
"loss": 7.175,
"step": 30
},
{
"epoch": 0.015330705212439772,
"grad_norm": 53.67520132013994,
"learning_rate": 1.5317286652078775e-06,
"loss": 6.6734,
"step": 35
},
{
"epoch": 0.017520805957074025,
"grad_norm": 37.22908552601946,
"learning_rate": 1.75054704595186e-06,
"loss": 5.9109,
"step": 40
},
{
"epoch": 0.01971090670170828,
"grad_norm": 33.17042166374813,
"learning_rate": 1.9693654266958425e-06,
"loss": 5.2781,
"step": 45
},
{
"epoch": 0.021901007446342532,
"grad_norm": 24.17805937125211,
"learning_rate": 2.188183807439825e-06,
"loss": 4.6617,
"step": 50
},
{
"epoch": 0.024091108190976786,
"grad_norm": 15.533482402019871,
"learning_rate": 2.4070021881838077e-06,
"loss": 3.6313,
"step": 55
},
{
"epoch": 0.026281208935611037,
"grad_norm": 13.054298424254013,
"learning_rate": 2.62582056892779e-06,
"loss": 3.0438,
"step": 60
},
{
"epoch": 0.02847130968024529,
"grad_norm": 10.046368116334548,
"learning_rate": 2.8446389496717725e-06,
"loss": 2.5609,
"step": 65
},
{
"epoch": 0.030661410424879545,
"grad_norm": 3.63825446752271,
"learning_rate": 3.063457330415755e-06,
"loss": 2.1242,
"step": 70
},
{
"epoch": 0.0328515111695138,
"grad_norm": 2.5205471934789943,
"learning_rate": 3.2822757111597377e-06,
"loss": 1.8531,
"step": 75
},
{
"epoch": 0.03504161191414805,
"grad_norm": 1.5353534970314644,
"learning_rate": 3.50109409190372e-06,
"loss": 1.7168,
"step": 80
},
{
"epoch": 0.03723171265878231,
"grad_norm": 0.9320337937023512,
"learning_rate": 3.7199124726477025e-06,
"loss": 1.6402,
"step": 85
},
{
"epoch": 0.03942181340341656,
"grad_norm": 0.7724916688386967,
"learning_rate": 3.938730853391685e-06,
"loss": 1.5934,
"step": 90
},
{
"epoch": 0.04161191414805081,
"grad_norm": 0.5985215738037675,
"learning_rate": 4.157549234135668e-06,
"loss": 1.5066,
"step": 95
},
{
"epoch": 0.043802014892685065,
"grad_norm": 0.5238425867378174,
"learning_rate": 4.37636761487965e-06,
"loss": 1.4219,
"step": 100
},
{
"epoch": 0.045992115637319315,
"grad_norm": 0.4867641557206586,
"learning_rate": 4.595185995623633e-06,
"loss": 1.4422,
"step": 105
},
{
"epoch": 0.04818221638195357,
"grad_norm": 0.4337246016382216,
"learning_rate": 4.8140043763676155e-06,
"loss": 1.418,
"step": 110
},
{
"epoch": 0.05037231712658782,
"grad_norm": 0.40770721306392865,
"learning_rate": 5.032822757111597e-06,
"loss": 1.3773,
"step": 115
},
{
"epoch": 0.052562417871222074,
"grad_norm": 0.4233334394962591,
"learning_rate": 5.25164113785558e-06,
"loss": 1.3613,
"step": 120
},
{
"epoch": 0.05475251861585633,
"grad_norm": 0.42178615143473047,
"learning_rate": 5.470459518599562e-06,
"loss": 1.391,
"step": 125
},
{
"epoch": 0.05694261936049058,
"grad_norm": 0.4006940117433896,
"learning_rate": 5.689277899343545e-06,
"loss": 1.3297,
"step": 130
},
{
"epoch": 0.05913272010512484,
"grad_norm": 0.3651420010557856,
"learning_rate": 5.908096280087528e-06,
"loss": 1.3313,
"step": 135
},
{
"epoch": 0.06132282084975909,
"grad_norm": 0.3747020330363452,
"learning_rate": 6.12691466083151e-06,
"loss": 1.3332,
"step": 140
},
{
"epoch": 0.06351292159439334,
"grad_norm": 0.3728552218659468,
"learning_rate": 6.345733041575493e-06,
"loss": 1.3004,
"step": 145
},
{
"epoch": 0.0657030223390276,
"grad_norm": 0.37018947461483953,
"learning_rate": 6.564551422319475e-06,
"loss": 1.3484,
"step": 150
},
{
"epoch": 0.06789312308366185,
"grad_norm": 0.3529067831254164,
"learning_rate": 6.783369803063458e-06,
"loss": 1.3023,
"step": 155
},
{
"epoch": 0.0700832238282961,
"grad_norm": 0.3518414768064083,
"learning_rate": 7.00218818380744e-06,
"loss": 1.3152,
"step": 160
},
{
"epoch": 0.07227332457293036,
"grad_norm": 0.34420293663926643,
"learning_rate": 7.221006564551422e-06,
"loss": 1.3258,
"step": 165
},
{
"epoch": 0.07446342531756461,
"grad_norm": 0.3406303226830583,
"learning_rate": 7.439824945295405e-06,
"loss": 1.2852,
"step": 170
},
{
"epoch": 0.07665352606219886,
"grad_norm": 0.3366612677735102,
"learning_rate": 7.658643326039388e-06,
"loss": 1.2812,
"step": 175
},
{
"epoch": 0.07884362680683311,
"grad_norm": 0.3437595783050795,
"learning_rate": 7.87746170678337e-06,
"loss": 1.3117,
"step": 180
},
{
"epoch": 0.08103372755146737,
"grad_norm": 0.35966923330984457,
"learning_rate": 8.096280087527353e-06,
"loss": 1.2727,
"step": 185
},
{
"epoch": 0.08322382829610162,
"grad_norm": 0.3341663091683573,
"learning_rate": 8.315098468271335e-06,
"loss": 1.2867,
"step": 190
},
{
"epoch": 0.08541392904073587,
"grad_norm": 0.35172271498862123,
"learning_rate": 8.533916849015318e-06,
"loss": 1.2664,
"step": 195
},
{
"epoch": 0.08760402978537013,
"grad_norm": 0.35659032911139527,
"learning_rate": 8.7527352297593e-06,
"loss": 1.2742,
"step": 200
},
{
"epoch": 0.08979413053000437,
"grad_norm": 0.3350456396328354,
"learning_rate": 8.971553610503283e-06,
"loss": 1.2781,
"step": 205
},
{
"epoch": 0.09198423127463863,
"grad_norm": 0.33449398836221755,
"learning_rate": 9.190371991247266e-06,
"loss": 1.2613,
"step": 210
},
{
"epoch": 0.09417433201927289,
"grad_norm": 0.3589466098486552,
"learning_rate": 9.409190371991248e-06,
"loss": 1.2559,
"step": 215
},
{
"epoch": 0.09636443276390715,
"grad_norm": 0.368082616736705,
"learning_rate": 9.628008752735231e-06,
"loss": 1.2852,
"step": 220
},
{
"epoch": 0.09855453350854139,
"grad_norm": 0.3512400018084896,
"learning_rate": 9.846827133479214e-06,
"loss": 1.2352,
"step": 225
},
{
"epoch": 0.10074463425317565,
"grad_norm": 0.3284893195501342,
"learning_rate": 1.0065645514223194e-05,
"loss": 1.2566,
"step": 230
},
{
"epoch": 0.1029347349978099,
"grad_norm": 0.35906080776203964,
"learning_rate": 1.0284463894967179e-05,
"loss": 1.2617,
"step": 235
},
{
"epoch": 0.10512483574244415,
"grad_norm": 0.3270591540034968,
"learning_rate": 1.050328227571116e-05,
"loss": 1.2375,
"step": 240
},
{
"epoch": 0.1073149364870784,
"grad_norm": 0.3532550836201906,
"learning_rate": 1.0722100656455144e-05,
"loss": 1.2086,
"step": 245
},
{
"epoch": 0.10950503723171266,
"grad_norm": 0.3237648295967401,
"learning_rate": 1.0940919037199125e-05,
"loss": 1.2398,
"step": 250
},
{
"epoch": 0.1116951379763469,
"grad_norm": 0.35822141700690474,
"learning_rate": 1.1159737417943109e-05,
"loss": 1.2355,
"step": 255
},
{
"epoch": 0.11388523872098116,
"grad_norm": 0.3563825451290871,
"learning_rate": 1.137855579868709e-05,
"loss": 1.2617,
"step": 260
},
{
"epoch": 0.11607533946561542,
"grad_norm": 0.3225396305771842,
"learning_rate": 1.1597374179431074e-05,
"loss": 1.2105,
"step": 265
},
{
"epoch": 0.11826544021024968,
"grad_norm": 0.3122994394367223,
"learning_rate": 1.1816192560175055e-05,
"loss": 1.2051,
"step": 270
},
{
"epoch": 0.12045554095488392,
"grad_norm": 0.33769463403346195,
"learning_rate": 1.2035010940919038e-05,
"loss": 1.2402,
"step": 275
},
{
"epoch": 0.12264564169951818,
"grad_norm": 0.323094652208957,
"learning_rate": 1.225382932166302e-05,
"loss": 1.2363,
"step": 280
},
{
"epoch": 0.12483574244415244,
"grad_norm": 0.32685421614831767,
"learning_rate": 1.2472647702407003e-05,
"loss": 1.2297,
"step": 285
},
{
"epoch": 0.12702584318878668,
"grad_norm": 0.3406152378644622,
"learning_rate": 1.2691466083150986e-05,
"loss": 1.2293,
"step": 290
},
{
"epoch": 0.12921594393342092,
"grad_norm": 0.31234607391709834,
"learning_rate": 1.2910284463894968e-05,
"loss": 1.2,
"step": 295
},
{
"epoch": 0.1314060446780552,
"grad_norm": 0.36244801139958155,
"learning_rate": 1.312910284463895e-05,
"loss": 1.2113,
"step": 300
},
{
"epoch": 0.13359614542268944,
"grad_norm": 0.3158237405680623,
"learning_rate": 1.3347921225382933e-05,
"loss": 1.2008,
"step": 305
},
{
"epoch": 0.1357862461673237,
"grad_norm": 0.35444090899453623,
"learning_rate": 1.3566739606126916e-05,
"loss": 1.2273,
"step": 310
},
{
"epoch": 0.13797634691195795,
"grad_norm": 0.3233212629925759,
"learning_rate": 1.3785557986870899e-05,
"loss": 1.191,
"step": 315
},
{
"epoch": 0.1401664476565922,
"grad_norm": 0.3625756011702934,
"learning_rate": 1.400437636761488e-05,
"loss": 1.173,
"step": 320
},
{
"epoch": 0.14235654840122647,
"grad_norm": 0.30517921759823324,
"learning_rate": 1.4223194748358864e-05,
"loss": 1.2137,
"step": 325
},
{
"epoch": 0.1445466491458607,
"grad_norm": 0.31776810415254086,
"learning_rate": 1.4442013129102845e-05,
"loss": 1.1977,
"step": 330
},
{
"epoch": 0.14673674989049496,
"grad_norm": 0.3141513649782591,
"learning_rate": 1.4660831509846829e-05,
"loss": 1.1906,
"step": 335
},
{
"epoch": 0.14892685063512923,
"grad_norm": 0.326520661370947,
"learning_rate": 1.487964989059081e-05,
"loss": 1.1848,
"step": 340
},
{
"epoch": 0.15111695137976347,
"grad_norm": 0.3245323516153073,
"learning_rate": 1.5098468271334794e-05,
"loss": 1.2023,
"step": 345
},
{
"epoch": 0.1533070521243977,
"grad_norm": 0.3106198673900871,
"learning_rate": 1.5317286652078775e-05,
"loss": 1.2039,
"step": 350
},
{
"epoch": 0.15549715286903198,
"grad_norm": 0.31738937212374596,
"learning_rate": 1.553610503282276e-05,
"loss": 1.1891,
"step": 355
},
{
"epoch": 0.15768725361366623,
"grad_norm": 0.3185701621576024,
"learning_rate": 1.575492341356674e-05,
"loss": 1.1965,
"step": 360
},
{
"epoch": 0.15987735435830047,
"grad_norm": 0.3376037242993364,
"learning_rate": 1.5973741794310725e-05,
"loss": 1.1895,
"step": 365
},
{
"epoch": 0.16206745510293474,
"grad_norm": 0.31121755906355864,
"learning_rate": 1.6192560175054705e-05,
"loss": 1.2,
"step": 370
},
{
"epoch": 0.164257555847569,
"grad_norm": 0.3217495152934365,
"learning_rate": 1.641137855579869e-05,
"loss": 1.1836,
"step": 375
},
{
"epoch": 0.16644765659220323,
"grad_norm": 0.2966935919095498,
"learning_rate": 1.663019693654267e-05,
"loss": 1.1887,
"step": 380
},
{
"epoch": 0.1686377573368375,
"grad_norm": 0.31806956377186374,
"learning_rate": 1.6849015317286655e-05,
"loss": 1.2281,
"step": 385
},
{
"epoch": 0.17082785808147175,
"grad_norm": 0.3110118252043724,
"learning_rate": 1.7067833698030636e-05,
"loss": 1.2066,
"step": 390
},
{
"epoch": 0.173017958826106,
"grad_norm": 0.3125792284015961,
"learning_rate": 1.728665207877462e-05,
"loss": 1.2027,
"step": 395
},
{
"epoch": 0.17520805957074026,
"grad_norm": 0.31848490451529393,
"learning_rate": 1.75054704595186e-05,
"loss": 1.2059,
"step": 400
},
{
"epoch": 0.1773981603153745,
"grad_norm": 0.29444069603586365,
"learning_rate": 1.7724288840262585e-05,
"loss": 1.1617,
"step": 405
},
{
"epoch": 0.17958826106000875,
"grad_norm": 0.2980836708326727,
"learning_rate": 1.7943107221006566e-05,
"loss": 1.1824,
"step": 410
},
{
"epoch": 0.18177836180464302,
"grad_norm": 0.31147307412521985,
"learning_rate": 1.816192560175055e-05,
"loss": 1.1695,
"step": 415
},
{
"epoch": 0.18396846254927726,
"grad_norm": 0.3101768840043127,
"learning_rate": 1.838074398249453e-05,
"loss": 1.177,
"step": 420
},
{
"epoch": 0.18615856329391153,
"grad_norm": 0.3203742924172763,
"learning_rate": 1.8599562363238512e-05,
"loss": 1.1992,
"step": 425
},
{
"epoch": 0.18834866403854578,
"grad_norm": 0.3109730786465228,
"learning_rate": 1.8818380743982497e-05,
"loss": 1.1965,
"step": 430
},
{
"epoch": 0.19053876478318002,
"grad_norm": 0.3283883952699153,
"learning_rate": 1.9037199124726478e-05,
"loss": 1.191,
"step": 435
},
{
"epoch": 0.1927288655278143,
"grad_norm": 0.310293473406716,
"learning_rate": 1.9256017505470462e-05,
"loss": 1.1687,
"step": 440
},
{
"epoch": 0.19491896627244854,
"grad_norm": 0.33957719240402645,
"learning_rate": 1.9474835886214443e-05,
"loss": 1.1785,
"step": 445
},
{
"epoch": 0.19710906701708278,
"grad_norm": 0.3097291115799612,
"learning_rate": 1.9693654266958427e-05,
"loss": 1.1879,
"step": 450
},
{
"epoch": 0.19929916776171705,
"grad_norm": 0.3466959419750145,
"learning_rate": 1.9912472647702408e-05,
"loss": 1.1793,
"step": 455
},
{
"epoch": 0.2014892685063513,
"grad_norm": 0.33898635574038505,
"learning_rate": 1.9999973694910354e-05,
"loss": 1.1668,
"step": 460
},
{
"epoch": 0.20367936925098554,
"grad_norm": 0.3227952712343219,
"learning_rate": 1.9999812942085888e-05,
"loss": 1.1633,
"step": 465
},
{
"epoch": 0.2058694699956198,
"grad_norm": 0.31482981716458963,
"learning_rate": 1.9999506052722038e-05,
"loss": 1.166,
"step": 470
},
{
"epoch": 0.20805957074025405,
"grad_norm": 0.3014515240380428,
"learning_rate": 1.9999053031303655e-05,
"loss": 1.1484,
"step": 475
},
{
"epoch": 0.2102496714848883,
"grad_norm": 0.2864843044702426,
"learning_rate": 1.9998453884451173e-05,
"loss": 1.1711,
"step": 480
},
{
"epoch": 0.21243977222952257,
"grad_norm": 0.31537308079673587,
"learning_rate": 1.9997708620920465e-05,
"loss": 1.1602,
"step": 485
},
{
"epoch": 0.2146298729741568,
"grad_norm": 0.2910479831367777,
"learning_rate": 1.9996817251602773e-05,
"loss": 1.1699,
"step": 490
},
{
"epoch": 0.21681997371879105,
"grad_norm": 0.29723625812900056,
"learning_rate": 1.9995779789524494e-05,
"loss": 1.1738,
"step": 495
},
{
"epoch": 0.21901007446342532,
"grad_norm": 0.32928417993263026,
"learning_rate": 1.9994596249847024e-05,
"loss": 1.1469,
"step": 500
},
{
"epoch": 0.22120017520805957,
"grad_norm": 0.318545598564101,
"learning_rate": 1.999326664986653e-05,
"loss": 1.1529,
"step": 505
},
{
"epoch": 0.2233902759526938,
"grad_norm": 0.3053735425350909,
"learning_rate": 1.9991791009013687e-05,
"loss": 1.1863,
"step": 510
},
{
"epoch": 0.22558037669732808,
"grad_norm": 0.3294132278225112,
"learning_rate": 1.99901693488534e-05,
"loss": 1.168,
"step": 515
},
{
"epoch": 0.22777047744196233,
"grad_norm": 0.30255802988311753,
"learning_rate": 1.9988401693084502e-05,
"loss": 1.1641,
"step": 520
},
{
"epoch": 0.22996057818659657,
"grad_norm": 0.31614096881549886,
"learning_rate": 1.9986488067539378e-05,
"loss": 1.1695,
"step": 525
},
{
"epoch": 0.23215067893123084,
"grad_norm": 0.29107544403138813,
"learning_rate": 1.9984428500183616e-05,
"loss": 1.1879,
"step": 530
},
{
"epoch": 0.23434077967586509,
"grad_norm": 0.3064129449057793,
"learning_rate": 1.998222302111558e-05,
"loss": 1.1453,
"step": 535
},
{
"epoch": 0.23653088042049936,
"grad_norm": 0.298260787884317,
"learning_rate": 1.9979871662565982e-05,
"loss": 1.15,
"step": 540
},
{
"epoch": 0.2387209811651336,
"grad_norm": 0.29812547479564855,
"learning_rate": 1.9977374458897408e-05,
"loss": 1.1465,
"step": 545
},
{
"epoch": 0.24091108190976784,
"grad_norm": 0.3198201131884772,
"learning_rate": 1.9974731446603805e-05,
"loss": 1.1531,
"step": 550
},
{
"epoch": 0.24310118265440211,
"grad_norm": 0.3026280375781704,
"learning_rate": 1.997194266430997e-05,
"loss": 1.1732,
"step": 555
},
{
"epoch": 0.24529128339903636,
"grad_norm": 0.3030023979215963,
"learning_rate": 1.996900815277096e-05,
"loss": 1.1793,
"step": 560
},
{
"epoch": 0.2474813841436706,
"grad_norm": 0.3121701164570895,
"learning_rate": 1.9965927954871516e-05,
"loss": 1.157,
"step": 565
},
{
"epoch": 0.24967148488830487,
"grad_norm": 0.3037543042825315,
"learning_rate": 1.996270211562542e-05,
"loss": 1.1588,
"step": 570
},
{
"epoch": 0.2518615856329391,
"grad_norm": 0.31986392002534736,
"learning_rate": 1.9959330682174863e-05,
"loss": 1.1359,
"step": 575
},
{
"epoch": 0.25405168637757336,
"grad_norm": 0.30039161810050113,
"learning_rate": 1.9955813703789717e-05,
"loss": 1.1492,
"step": 580
},
{
"epoch": 0.25624178712220763,
"grad_norm": 0.30051328246651055,
"learning_rate": 1.9952151231866858e-05,
"loss": 1.1404,
"step": 585
},
{
"epoch": 0.25843188786684185,
"grad_norm": 0.2993163815080819,
"learning_rate": 1.9948343319929377e-05,
"loss": 1.1797,
"step": 590
},
{
"epoch": 0.2606219886114761,
"grad_norm": 0.2925299399789088,
"learning_rate": 1.9944390023625827e-05,
"loss": 1.1727,
"step": 595
},
{
"epoch": 0.2628120893561104,
"grad_norm": 0.28733870474954115,
"learning_rate": 1.9940291400729385e-05,
"loss": 1.1449,
"step": 600
},
{
"epoch": 0.26500219010074466,
"grad_norm": 0.3166965904102066,
"learning_rate": 1.993604751113704e-05,
"loss": 1.1453,
"step": 605
},
{
"epoch": 0.2671922908453789,
"grad_norm": 0.2938712170029366,
"learning_rate": 1.9931658416868677e-05,
"loss": 1.1637,
"step": 610
},
{
"epoch": 0.26938239159001315,
"grad_norm": 0.2919926264580296,
"learning_rate": 1.9927124182066205e-05,
"loss": 1.1449,
"step": 615
},
{
"epoch": 0.2715724923346474,
"grad_norm": 0.2911062410962241,
"learning_rate": 1.9922444872992604e-05,
"loss": 1.1533,
"step": 620
},
{
"epoch": 0.27376259307928164,
"grad_norm": 0.285422153507294,
"learning_rate": 1.991762055803095e-05,
"loss": 1.1707,
"step": 625
},
{
"epoch": 0.2759526938239159,
"grad_norm": 0.2994545512735193,
"learning_rate": 1.9912651307683433e-05,
"loss": 1.1574,
"step": 630
},
{
"epoch": 0.2781427945685502,
"grad_norm": 0.2832612329809898,
"learning_rate": 1.9907537194570315e-05,
"loss": 1.1527,
"step": 635
},
{
"epoch": 0.2803328953131844,
"grad_norm": 0.283860545913851,
"learning_rate": 1.9902278293428883e-05,
"loss": 1.1613,
"step": 640
},
{
"epoch": 0.28252299605781866,
"grad_norm": 0.2857818677163762,
"learning_rate": 1.9896874681112323e-05,
"loss": 1.159,
"step": 645
},
{
"epoch": 0.28471309680245294,
"grad_norm": 0.28550178083083544,
"learning_rate": 1.989132643658864e-05,
"loss": 1.1516,
"step": 650
},
{
"epoch": 0.28690319754708715,
"grad_norm": 0.301608563919381,
"learning_rate": 1.9885633640939475e-05,
"loss": 1.1594,
"step": 655
},
{
"epoch": 0.2890932982917214,
"grad_norm": 0.302035660129087,
"learning_rate": 1.987979637735893e-05,
"loss": 1.165,
"step": 660
},
{
"epoch": 0.2912833990363557,
"grad_norm": 0.2867968194946794,
"learning_rate": 1.9873814731152346e-05,
"loss": 1.1492,
"step": 665
},
{
"epoch": 0.2934734997809899,
"grad_norm": 0.2866293446184061,
"learning_rate": 1.9867688789735075e-05,
"loss": 1.1445,
"step": 670
},
{
"epoch": 0.2956636005256242,
"grad_norm": 0.3014820377030796,
"learning_rate": 1.9861418642631173e-05,
"loss": 1.1387,
"step": 675
},
{
"epoch": 0.29785370127025845,
"grad_norm": 0.27958880178191026,
"learning_rate": 1.9855004381472113e-05,
"loss": 1.1537,
"step": 680
},
{
"epoch": 0.30004380201489267,
"grad_norm": 0.3044820311596824,
"learning_rate": 1.984844609999544e-05,
"loss": 1.1414,
"step": 685
},
{
"epoch": 0.30223390275952694,
"grad_norm": 0.28640742624947424,
"learning_rate": 1.9841743894043412e-05,
"loss": 1.125,
"step": 690
},
{
"epoch": 0.3044240035041612,
"grad_norm": 0.27345860578003156,
"learning_rate": 1.9834897861561572e-05,
"loss": 1.1301,
"step": 695
},
{
"epoch": 0.3066141042487954,
"grad_norm": 0.2789976740559993,
"learning_rate": 1.9827908102597342e-05,
"loss": 1.1637,
"step": 700
},
{
"epoch": 0.3088042049934297,
"grad_norm": 0.29865777361212825,
"learning_rate": 1.9820774719298553e-05,
"loss": 1.1555,
"step": 705
},
{
"epoch": 0.31099430573806397,
"grad_norm": 0.29781806230277835,
"learning_rate": 1.981349781591195e-05,
"loss": 1.1422,
"step": 710
},
{
"epoch": 0.3131844064826982,
"grad_norm": 0.28285008218859387,
"learning_rate": 1.9806077498781667e-05,
"loss": 1.1375,
"step": 715
},
{
"epoch": 0.31537450722733246,
"grad_norm": 0.299690829321009,
"learning_rate": 1.9798513876347686e-05,
"loss": 1.1324,
"step": 720
},
{
"epoch": 0.31756460797196673,
"grad_norm": 0.29123536907457154,
"learning_rate": 1.9790807059144224e-05,
"loss": 1.1406,
"step": 725
},
{
"epoch": 0.31975470871660094,
"grad_norm": 0.28343901662871235,
"learning_rate": 1.978295715979816e-05,
"loss": 1.159,
"step": 730
},
{
"epoch": 0.3219448094612352,
"grad_norm": 0.2983120464171406,
"learning_rate": 1.9774964293027343e-05,
"loss": 1.1172,
"step": 735
},
{
"epoch": 0.3241349102058695,
"grad_norm": 0.30619911851808346,
"learning_rate": 1.976682857563895e-05,
"loss": 1.1316,
"step": 740
},
{
"epoch": 0.3263250109505037,
"grad_norm": 0.29955540950645765,
"learning_rate": 1.9758550126527763e-05,
"loss": 1.1332,
"step": 745
},
{
"epoch": 0.328515111695138,
"grad_norm": 0.3225950257313704,
"learning_rate": 1.975012906667444e-05,
"loss": 1.1285,
"step": 750
},
{
"epoch": 0.33070521243977224,
"grad_norm": 0.30076771908251354,
"learning_rate": 1.974156551914373e-05,
"loss": 1.1582,
"step": 755
},
{
"epoch": 0.33289531318440646,
"grad_norm": 0.26730681052415284,
"learning_rate": 1.9732859609082703e-05,
"loss": 1.1363,
"step": 760
},
{
"epoch": 0.33508541392904073,
"grad_norm": 0.296320423317251,
"learning_rate": 1.9724011463718886e-05,
"loss": 1.1475,
"step": 765
},
{
"epoch": 0.337275514673675,
"grad_norm": 0.2831250199600264,
"learning_rate": 1.971502121235844e-05,
"loss": 1.1504,
"step": 770
},
{
"epoch": 0.3394656154183092,
"grad_norm": 0.30579339517624693,
"learning_rate": 1.9705888986384237e-05,
"loss": 1.149,
"step": 775
},
{
"epoch": 0.3416557161629435,
"grad_norm": 0.279787903018849,
"learning_rate": 1.969661491925397e-05,
"loss": 1.1285,
"step": 780
},
{
"epoch": 0.34384581690757776,
"grad_norm": 0.2984680376613341,
"learning_rate": 1.9687199146498184e-05,
"loss": 1.1326,
"step": 785
},
{
"epoch": 0.346035917652212,
"grad_norm": 0.303788532910488,
"learning_rate": 1.9677641805718287e-05,
"loss": 1.1332,
"step": 790
},
{
"epoch": 0.34822601839684625,
"grad_norm": 0.28544564287464336,
"learning_rate": 1.9667943036584572e-05,
"loss": 1.1332,
"step": 795
},
{
"epoch": 0.3504161191414805,
"grad_norm": 0.302702688121673,
"learning_rate": 1.9658102980834147e-05,
"loss": 1.1271,
"step": 800
},
{
"epoch": 0.35260621988611474,
"grad_norm": 0.2861342406686224,
"learning_rate": 1.9648121782268862e-05,
"loss": 1.127,
"step": 805
},
{
"epoch": 0.354796320630749,
"grad_norm": 0.285579818423958,
"learning_rate": 1.9637999586753236e-05,
"loss": 1.1344,
"step": 810
},
{
"epoch": 0.3569864213753833,
"grad_norm": 0.2747392171436459,
"learning_rate": 1.9627736542212292e-05,
"loss": 1.1566,
"step": 815
},
{
"epoch": 0.3591765221200175,
"grad_norm": 0.2836525027135021,
"learning_rate": 1.961733279862942e-05,
"loss": 1.1314,
"step": 820
},
{
"epoch": 0.36136662286465177,
"grad_norm": 0.2814212118759052,
"learning_rate": 1.9606788508044176e-05,
"loss": 1.1271,
"step": 825
},
{
"epoch": 0.36355672360928604,
"grad_norm": 0.2751373320963047,
"learning_rate": 1.959610382455005e-05,
"loss": 1.1324,
"step": 830
},
{
"epoch": 0.3657468243539203,
"grad_norm": 0.29671327096904027,
"learning_rate": 1.9585278904292232e-05,
"loss": 1.1238,
"step": 835
},
{
"epoch": 0.3679369250985545,
"grad_norm": 0.2807910552149839,
"learning_rate": 1.9574313905465317e-05,
"loss": 1.1473,
"step": 840
},
{
"epoch": 0.3701270258431888,
"grad_norm": 0.2750707155460053,
"learning_rate": 1.956320898831101e-05,
"loss": 1.0916,
"step": 845
},
{
"epoch": 0.37231712658782307,
"grad_norm": 0.28600560772614464,
"learning_rate": 1.9551964315115755e-05,
"loss": 1.1438,
"step": 850
},
{
"epoch": 0.3745072273324573,
"grad_norm": 0.2945488706090824,
"learning_rate": 1.954058005020839e-05,
"loss": 1.1252,
"step": 855
},
{
"epoch": 0.37669732807709155,
"grad_norm": 0.2821523000109209,
"learning_rate": 1.952905635995773e-05,
"loss": 1.1215,
"step": 860
},
{
"epoch": 0.3788874288217258,
"grad_norm": 0.2809319217223354,
"learning_rate": 1.9517393412770154e-05,
"loss": 1.1438,
"step": 865
},
{
"epoch": 0.38107752956636004,
"grad_norm": 0.29258642255243755,
"learning_rate": 1.9505591379087126e-05,
"loss": 1.1406,
"step": 870
},
{
"epoch": 0.3832676303109943,
"grad_norm": 0.2782515672234601,
"learning_rate": 1.9493650431382702e-05,
"loss": 1.127,
"step": 875
},
{
"epoch": 0.3854577310556286,
"grad_norm": 0.2651516436947355,
"learning_rate": 1.9481570744161024e-05,
"loss": 1.1293,
"step": 880
},
{
"epoch": 0.3876478318002628,
"grad_norm": 0.2778922300040792,
"learning_rate": 1.9469352493953767e-05,
"loss": 1.1621,
"step": 885
},
{
"epoch": 0.38983793254489707,
"grad_norm": 0.277626897098439,
"learning_rate": 1.945699585931755e-05,
"loss": 1.1348,
"step": 890
},
{
"epoch": 0.39202803328953134,
"grad_norm": 0.28606581063244546,
"learning_rate": 1.944450102083133e-05,
"loss": 1.1,
"step": 895
},
{
"epoch": 0.39421813403416556,
"grad_norm": 0.2766274824029753,
"learning_rate": 1.9431868161093773e-05,
"loss": 1.1344,
"step": 900
},
{
"epoch": 0.39640823477879983,
"grad_norm": 0.28411728929167873,
"learning_rate": 1.941909746472057e-05,
"loss": 1.1352,
"step": 905
},
{
"epoch": 0.3985983355234341,
"grad_norm": 0.28207030065486227,
"learning_rate": 1.9406189118341752e-05,
"loss": 1.1338,
"step": 910
},
{
"epoch": 0.4007884362680683,
"grad_norm": 0.2889032305193362,
"learning_rate": 1.939314331059895e-05,
"loss": 1.1449,
"step": 915
},
{
"epoch": 0.4029785370127026,
"grad_norm": 0.2607847605615642,
"learning_rate": 1.9379960232142655e-05,
"loss": 1.1332,
"step": 920
},
{
"epoch": 0.40516863775733686,
"grad_norm": 0.27283416214879747,
"learning_rate": 1.936664007562941e-05,
"loss": 1.1187,
"step": 925
},
{
"epoch": 0.4073587385019711,
"grad_norm": 0.27360396328426034,
"learning_rate": 1.9353183035719027e-05,
"loss": 1.1111,
"step": 930
},
{
"epoch": 0.40954883924660535,
"grad_norm": 0.27641067744827497,
"learning_rate": 1.9339589309071694e-05,
"loss": 1.1146,
"step": 935
},
{
"epoch": 0.4117389399912396,
"grad_norm": 0.28012688463597135,
"learning_rate": 1.932585909434515e-05,
"loss": 1.1273,
"step": 940
},
{
"epoch": 0.41392904073587383,
"grad_norm": 0.2986899749731716,
"learning_rate": 1.931199259219176e-05,
"loss": 1.1516,
"step": 945
},
{
"epoch": 0.4161191414805081,
"grad_norm": 0.2746086383673798,
"learning_rate": 1.929799000525557e-05,
"loss": 1.1383,
"step": 950
},
{
"epoch": 0.4183092422251424,
"grad_norm": 0.2705485338360411,
"learning_rate": 1.9283851538169376e-05,
"loss": 1.1281,
"step": 955
},
{
"epoch": 0.4204993429697766,
"grad_norm": 0.2693102198514296,
"learning_rate": 1.9269577397551698e-05,
"loss": 1.1297,
"step": 960
},
{
"epoch": 0.42268944371441086,
"grad_norm": 0.2812249914983643,
"learning_rate": 1.9255167792003803e-05,
"loss": 1.1488,
"step": 965
},
{
"epoch": 0.42487954445904513,
"grad_norm": 0.2780616972935151,
"learning_rate": 1.9240622932106606e-05,
"loss": 1.1148,
"step": 970
},
{
"epoch": 0.42706964520367935,
"grad_norm": 0.2811437839638983,
"learning_rate": 1.922594303041764e-05,
"loss": 1.1242,
"step": 975
},
{
"epoch": 0.4292597459483136,
"grad_norm": 0.268208376235582,
"learning_rate": 1.9211128301467913e-05,
"loss": 1.1516,
"step": 980
},
{
"epoch": 0.4314498466929479,
"grad_norm": 0.2777452313433181,
"learning_rate": 1.919617896175881e-05,
"loss": 1.1273,
"step": 985
},
{
"epoch": 0.4336399474375821,
"grad_norm": 0.28289566775790254,
"learning_rate": 1.918109522975888e-05,
"loss": 1.1387,
"step": 990
},
{
"epoch": 0.4358300481822164,
"grad_norm": 0.28637508184497684,
"learning_rate": 1.9165877325900696e-05,
"loss": 1.1275,
"step": 995
},
{
"epoch": 0.43802014892685065,
"grad_norm": 0.2798034291448599,
"learning_rate": 1.91505254725776e-05,
"loss": 1.1135,
"step": 1000
},
{
"epoch": 0.44021024967148487,
"grad_norm": 0.27447002943494403,
"learning_rate": 1.9135039894140446e-05,
"loss": 1.1383,
"step": 1005
},
{
"epoch": 0.44240035041611914,
"grad_norm": 0.2645078006001802,
"learning_rate": 1.911942081689437e-05,
"loss": 1.134,
"step": 1010
},
{
"epoch": 0.4445904511607534,
"grad_norm": 0.27970364517188895,
"learning_rate": 1.910366846909542e-05,
"loss": 1.1195,
"step": 1015
},
{
"epoch": 0.4467805519053876,
"grad_norm": 0.27465951372753394,
"learning_rate": 1.9087783080947263e-05,
"loss": 1.1332,
"step": 1020
},
{
"epoch": 0.4489706526500219,
"grad_norm": 0.26634601919762435,
"learning_rate": 1.9071764884597812e-05,
"loss": 1.1236,
"step": 1025
},
{
"epoch": 0.45116075339465617,
"grad_norm": 0.2794646407733667,
"learning_rate": 1.905561411413582e-05,
"loss": 1.1582,
"step": 1030
},
{
"epoch": 0.4533508541392904,
"grad_norm": 0.28173898357125154,
"learning_rate": 1.903933100558747e-05,
"loss": 1.1332,
"step": 1035
},
{
"epoch": 0.45554095488392465,
"grad_norm": 0.2736898887292246,
"learning_rate": 1.902291579691293e-05,
"loss": 1.1434,
"step": 1040
},
{
"epoch": 0.4577310556285589,
"grad_norm": 0.27191810888639717,
"learning_rate": 1.9006368728002864e-05,
"loss": 1.1275,
"step": 1045
},
{
"epoch": 0.45992115637319314,
"grad_norm": 0.2828367794791594,
"learning_rate": 1.8989690040674937e-05,
"loss": 1.1555,
"step": 1050
},
{
"epoch": 0.4621112571178274,
"grad_norm": 0.27719265415892497,
"learning_rate": 1.897287997867027e-05,
"loss": 1.1092,
"step": 1055
},
{
"epoch": 0.4643013578624617,
"grad_norm": 0.2872145245485876,
"learning_rate": 1.8955938787649896e-05,
"loss": 1.1672,
"step": 1060
},
{
"epoch": 0.4664914586070959,
"grad_norm": 0.28308043695745805,
"learning_rate": 1.8938866715191137e-05,
"loss": 1.1285,
"step": 1065
},
{
"epoch": 0.46868155935173017,
"grad_norm": 0.28356329044611883,
"learning_rate": 1.892166401078402e-05,
"loss": 1.1186,
"step": 1070
},
{
"epoch": 0.47087166009636444,
"grad_norm": 0.2870427948470036,
"learning_rate": 1.8904330925827628e-05,
"loss": 1.1344,
"step": 1075
},
{
"epoch": 0.4730617608409987,
"grad_norm": 0.2711551541427653,
"learning_rate": 1.8886867713626393e-05,
"loss": 1.1174,
"step": 1080
},
{
"epoch": 0.47525186158563293,
"grad_norm": 0.29791754553385197,
"learning_rate": 1.8869274629386433e-05,
"loss": 1.1207,
"step": 1085
},
{
"epoch": 0.4774419623302672,
"grad_norm": 0.2843662761979316,
"learning_rate": 1.8851551930211803e-05,
"loss": 1.1449,
"step": 1090
},
{
"epoch": 0.47963206307490147,
"grad_norm": 0.27143984381476377,
"learning_rate": 1.8833699875100735e-05,
"loss": 1.1035,
"step": 1095
},
{
"epoch": 0.4818221638195357,
"grad_norm": 0.2757462217218276,
"learning_rate": 1.881571872494187e-05,
"loss": 1.1105,
"step": 1100
},
{
"epoch": 0.48401226456416996,
"grad_norm": 0.2765481966040783,
"learning_rate": 1.879760874251043e-05,
"loss": 1.1156,
"step": 1105
},
{
"epoch": 0.48620236530880423,
"grad_norm": 0.282978908681222,
"learning_rate": 1.8779370192464378e-05,
"loss": 1.1201,
"step": 1110
},
{
"epoch": 0.48839246605343845,
"grad_norm": 0.27284223511573796,
"learning_rate": 1.876100334134056e-05,
"loss": 1.1266,
"step": 1115
},
{
"epoch": 0.4905825667980727,
"grad_norm": 0.280578986759108,
"learning_rate": 1.8742508457550804e-05,
"loss": 1.1117,
"step": 1120
},
{
"epoch": 0.492772667542707,
"grad_norm": 0.27600914760958584,
"learning_rate": 1.8723885811377998e-05,
"loss": 1.125,
"step": 1125
},
{
"epoch": 0.4949627682873412,
"grad_norm": 0.2792414213857675,
"learning_rate": 1.8705135674972133e-05,
"loss": 1.1043,
"step": 1130
},
{
"epoch": 0.4971528690319755,
"grad_norm": 0.29355325689986594,
"learning_rate": 1.868625832234635e-05,
"loss": 1.1283,
"step": 1135
},
{
"epoch": 0.49934296977660975,
"grad_norm": 0.2959742597776297,
"learning_rate": 1.8667254029372898e-05,
"loss": 1.1379,
"step": 1140
},
{
"epoch": 0.501533070521244,
"grad_norm": 0.2803561452383429,
"learning_rate": 1.8648123073779136e-05,
"loss": 1.1008,
"step": 1145
},
{
"epoch": 0.5037231712658782,
"grad_norm": 0.28730614143136535,
"learning_rate": 1.8628865735143464e-05,
"loss": 1.0941,
"step": 1150
},
{
"epoch": 0.5059132720105125,
"grad_norm": 0.2730078243750208,
"learning_rate": 1.860948229489122e-05,
"loss": 1.1369,
"step": 1155
},
{
"epoch": 0.5081033727551467,
"grad_norm": 0.28244458331532185,
"learning_rate": 1.8589973036290597e-05,
"loss": 1.124,
"step": 1160
},
{
"epoch": 0.5102934734997809,
"grad_norm": 0.27033517409253727,
"learning_rate": 1.857033824444848e-05,
"loss": 1.1002,
"step": 1165
},
{
"epoch": 0.5124835742444153,
"grad_norm": 0.27997442688862706,
"learning_rate": 1.855057820630629e-05,
"loss": 1.127,
"step": 1170
},
{
"epoch": 0.5146736749890495,
"grad_norm": 0.2924889952428353,
"learning_rate": 1.8530693210635785e-05,
"loss": 1.1121,
"step": 1175
},
{
"epoch": 0.5168637757336837,
"grad_norm": 0.27255392506521753,
"learning_rate": 1.8510683548034853e-05,
"loss": 1.1289,
"step": 1180
},
{
"epoch": 0.519053876478318,
"grad_norm": 0.27789463711596296,
"learning_rate": 1.8490549510923243e-05,
"loss": 1.1313,
"step": 1185
},
{
"epoch": 0.5212439772229522,
"grad_norm": 0.270522153270926,
"learning_rate": 1.8470291393538308e-05,
"loss": 1.148,
"step": 1190
},
{
"epoch": 0.5234340779675866,
"grad_norm": 0.2903581607172235,
"learning_rate": 1.8449909491930707e-05,
"loss": 1.1336,
"step": 1195
},
{
"epoch": 0.5256241787122208,
"grad_norm": 0.28130785295367783,
"learning_rate": 1.8429404103960068e-05,
"loss": 1.0943,
"step": 1200
},
{
"epoch": 0.527814279456855,
"grad_norm": 0.2847352994330959,
"learning_rate": 1.840877552929064e-05,
"loss": 1.1072,
"step": 1205
},
{
"epoch": 0.5300043802014893,
"grad_norm": 0.27594764699978475,
"learning_rate": 1.8388024069386913e-05,
"loss": 1.1031,
"step": 1210
},
{
"epoch": 0.5321944809461235,
"grad_norm": 0.27259818678775977,
"learning_rate": 1.836715002750921e-05,
"loss": 1.1352,
"step": 1215
},
{
"epoch": 0.5343845816907578,
"grad_norm": 0.28100802685276305,
"learning_rate": 1.8346153708709267e-05,
"loss": 1.1191,
"step": 1220
},
{
"epoch": 0.5365746824353921,
"grad_norm": 0.2691241413663141,
"learning_rate": 1.832503541982576e-05,
"loss": 1.0869,
"step": 1225
},
{
"epoch": 0.5387647831800263,
"grad_norm": 0.2709716330941766,
"learning_rate": 1.8303795469479824e-05,
"loss": 1.1207,
"step": 1230
},
{
"epoch": 0.5409548839246605,
"grad_norm": 0.2941920285023787,
"learning_rate": 1.8282434168070554e-05,
"loss": 1.1223,
"step": 1235
},
{
"epoch": 0.5431449846692948,
"grad_norm": 0.290222410153825,
"learning_rate": 1.826095182777045e-05,
"loss": 1.1059,
"step": 1240
},
{
"epoch": 0.545335085413929,
"grad_norm": 0.2806597130704421,
"learning_rate": 1.8239348762520877e-05,
"loss": 1.1268,
"step": 1245
},
{
"epoch": 0.5475251861585633,
"grad_norm": 0.26150041135869034,
"learning_rate": 1.8217625288027453e-05,
"loss": 1.1258,
"step": 1250
},
{
"epoch": 0.5497152869031976,
"grad_norm": 0.2685954735341038,
"learning_rate": 1.8195781721755464e-05,
"loss": 1.1062,
"step": 1255
},
{
"epoch": 0.5519053876478318,
"grad_norm": 0.281704971443474,
"learning_rate": 1.8173818382925196e-05,
"loss": 1.1219,
"step": 1260
},
{
"epoch": 0.554095488392466,
"grad_norm": 0.28159951912225467,
"learning_rate": 1.8151735592507285e-05,
"loss": 1.0965,
"step": 1265
},
{
"epoch": 0.5562855891371004,
"grad_norm": 0.2706729959241479,
"learning_rate": 1.8129533673218026e-05,
"loss": 1.109,
"step": 1270
},
{
"epoch": 0.5584756898817346,
"grad_norm": 0.2885774782422386,
"learning_rate": 1.8107212949514648e-05,
"loss": 1.1039,
"step": 1275
},
{
"epoch": 0.5606657906263688,
"grad_norm": 0.27545326234938333,
"learning_rate": 1.8084773747590594e-05,
"loss": 1.1096,
"step": 1280
},
{
"epoch": 0.5628558913710031,
"grad_norm": 0.2689447608381848,
"learning_rate": 1.8062216395370723e-05,
"loss": 1.1244,
"step": 1285
},
{
"epoch": 0.5650459921156373,
"grad_norm": 0.2697532238591275,
"learning_rate": 1.8039541222506544e-05,
"loss": 1.1002,
"step": 1290
},
{
"epoch": 0.5672360928602715,
"grad_norm": 0.2671203153481593,
"learning_rate": 1.801674856037138e-05,
"loss": 1.1066,
"step": 1295
},
{
"epoch": 0.5694261936049059,
"grad_norm": 0.2761723227472548,
"learning_rate": 1.7993838742055544e-05,
"loss": 1.11,
"step": 1300
},
{
"epoch": 0.5716162943495401,
"grad_norm": 0.28615486924844014,
"learning_rate": 1.7970812102361455e-05,
"loss": 1.1371,
"step": 1305
},
{
"epoch": 0.5738063950941743,
"grad_norm": 0.2747844784537908,
"learning_rate": 1.7947668977798748e-05,
"loss": 1.1049,
"step": 1310
},
{
"epoch": 0.5759964958388086,
"grad_norm": 0.28073244882701426,
"learning_rate": 1.7924409706579366e-05,
"loss": 1.1221,
"step": 1315
},
{
"epoch": 0.5781865965834428,
"grad_norm": 0.2943655545530222,
"learning_rate": 1.7901034628612603e-05,
"loss": 1.1371,
"step": 1320
},
{
"epoch": 0.5803766973280771,
"grad_norm": 0.2824289348253016,
"learning_rate": 1.7877544085500156e-05,
"loss": 1.107,
"step": 1325
},
{
"epoch": 0.5825667980727114,
"grad_norm": 0.26639957586943036,
"learning_rate": 1.785393842053111e-05,
"loss": 1.1281,
"step": 1330
},
{
"epoch": 0.5847568988173456,
"grad_norm": 0.2846585279548851,
"learning_rate": 1.7830217978676935e-05,
"loss": 1.0809,
"step": 1335
},
{
"epoch": 0.5869469995619798,
"grad_norm": 0.27391331375101896,
"learning_rate": 1.780638310658645e-05,
"loss": 1.1395,
"step": 1340
},
{
"epoch": 0.5891371003066141,
"grad_norm": 0.28513106577718356,
"learning_rate": 1.778243415258074e-05,
"loss": 1.118,
"step": 1345
},
{
"epoch": 0.5913272010512484,
"grad_norm": 0.26859438508124023,
"learning_rate": 1.7758371466648076e-05,
"loss": 1.1133,
"step": 1350
},
{
"epoch": 0.5935173017958826,
"grad_norm": 0.27262765520782745,
"learning_rate": 1.7734195400438804e-05,
"loss": 1.1105,
"step": 1355
},
{
"epoch": 0.5957074025405169,
"grad_norm": 0.26578867382487387,
"learning_rate": 1.7709906307260193e-05,
"loss": 1.109,
"step": 1360
},
{
"epoch": 0.5978975032851511,
"grad_norm": 0.2739484520721144,
"learning_rate": 1.7685504542071294e-05,
"loss": 1.1168,
"step": 1365
},
{
"epoch": 0.6000876040297853,
"grad_norm": 0.2613962343706309,
"learning_rate": 1.7660990461477717e-05,
"loss": 1.1187,
"step": 1370
},
{
"epoch": 0.6022777047744197,
"grad_norm": 0.27907516560188245,
"learning_rate": 1.7636364423726468e-05,
"loss": 1.1059,
"step": 1375
},
{
"epoch": 0.6044678055190539,
"grad_norm": 0.2692785993705207,
"learning_rate": 1.7611626788700658e-05,
"loss": 1.1176,
"step": 1380
},
{
"epoch": 0.6066579062636881,
"grad_norm": 0.2826591838293577,
"learning_rate": 1.75867779179143e-05,
"loss": 1.1215,
"step": 1385
},
{
"epoch": 0.6088480070083224,
"grad_norm": 0.2585536815951757,
"learning_rate": 1.7561818174506976e-05,
"loss": 1.1041,
"step": 1390
},
{
"epoch": 0.6110381077529566,
"grad_norm": 0.2690038568592748,
"learning_rate": 1.7536747923238566e-05,
"loss": 1.0959,
"step": 1395
},
{
"epoch": 0.6132282084975909,
"grad_norm": 0.2794428452568368,
"learning_rate": 1.75115675304839e-05,
"loss": 1.1203,
"step": 1400
},
{
"epoch": 0.6154183092422252,
"grad_norm": 0.2712753678084759,
"learning_rate": 1.7486277364227406e-05,
"loss": 1.1428,
"step": 1405
},
{
"epoch": 0.6176084099868594,
"grad_norm": 0.30076453779446854,
"learning_rate": 1.7460877794057736e-05,
"loss": 1.1297,
"step": 1410
},
{
"epoch": 0.6197985107314936,
"grad_norm": 0.27142722196615265,
"learning_rate": 1.7435369191162357e-05,
"loss": 1.1109,
"step": 1415
},
{
"epoch": 0.6219886114761279,
"grad_norm": 0.27765521985915914,
"learning_rate": 1.7409751928322143e-05,
"loss": 1.1043,
"step": 1420
},
{
"epoch": 0.6241787122207622,
"grad_norm": 0.2736713650444242,
"learning_rate": 1.738402637990591e-05,
"loss": 1.1289,
"step": 1425
},
{
"epoch": 0.6263688129653964,
"grad_norm": 0.27068657267510915,
"learning_rate": 1.7358192921864955e-05,
"loss": 1.0949,
"step": 1430
},
{
"epoch": 0.6285589137100307,
"grad_norm": 0.2731706398238531,
"learning_rate": 1.7332251931727547e-05,
"loss": 1.1039,
"step": 1435
},
{
"epoch": 0.6307490144546649,
"grad_norm": 0.27367550283243264,
"learning_rate": 1.7306203788593436e-05,
"loss": 1.0953,
"step": 1440
},
{
"epoch": 0.6329391151992991,
"grad_norm": 0.2738816381357539,
"learning_rate": 1.7280048873128296e-05,
"loss": 1.1121,
"step": 1445
},
{
"epoch": 0.6351292159439335,
"grad_norm": 0.2698317289481176,
"learning_rate": 1.7253787567558152e-05,
"loss": 1.0783,
"step": 1450
},
{
"epoch": 0.6373193166885677,
"grad_norm": 0.2769632550292509,
"learning_rate": 1.722742025566382e-05,
"loss": 1.1086,
"step": 1455
},
{
"epoch": 0.6395094174332019,
"grad_norm": 0.2897254427604234,
"learning_rate": 1.7200947322775276e-05,
"loss": 1.1316,
"step": 1460
},
{
"epoch": 0.6416995181778362,
"grad_norm": 0.2640477629321144,
"learning_rate": 1.7174369155766037e-05,
"loss": 1.1121,
"step": 1465
},
{
"epoch": 0.6438896189224704,
"grad_norm": 0.2773590006369379,
"learning_rate": 1.7147686143047507e-05,
"loss": 1.1031,
"step": 1470
},
{
"epoch": 0.6460797196671046,
"grad_norm": 0.2837149025424185,
"learning_rate": 1.7120898674563292e-05,
"loss": 1.109,
"step": 1475
},
{
"epoch": 0.648269820411739,
"grad_norm": 0.28232081520676006,
"learning_rate": 1.7094007141783512e-05,
"loss": 1.0969,
"step": 1480
},
{
"epoch": 0.6504599211563732,
"grad_norm": 0.27647777357776215,
"learning_rate": 1.7067011937699067e-05,
"loss": 1.0973,
"step": 1485
},
{
"epoch": 0.6526500219010074,
"grad_norm": 0.272043266444083,
"learning_rate": 1.703991345681591e-05,
"loss": 1.1156,
"step": 1490
},
{
"epoch": 0.6548401226456417,
"grad_norm": 0.27378936375046253,
"learning_rate": 1.7012712095149268e-05,
"loss": 1.1102,
"step": 1495
},
{
"epoch": 0.657030223390276,
"grad_norm": 0.275330797988678,
"learning_rate": 1.6985408250217866e-05,
"loss": 1.1324,
"step": 1500
},
{
"epoch": 0.6592203241349102,
"grad_norm": 0.2713748379437795,
"learning_rate": 1.6958002321038106e-05,
"loss": 1.1156,
"step": 1505
},
{
"epoch": 0.6614104248795445,
"grad_norm": 0.28092417423165894,
"learning_rate": 1.693049470811825e-05,
"loss": 1.1201,
"step": 1510
},
{
"epoch": 0.6636005256241787,
"grad_norm": 0.27389632704021605,
"learning_rate": 1.690288581345255e-05,
"loss": 1.1156,
"step": 1515
},
{
"epoch": 0.6657906263688129,
"grad_norm": 0.27561700781744836,
"learning_rate": 1.6875176040515383e-05,
"loss": 1.0928,
"step": 1520
},
{
"epoch": 0.6679807271134472,
"grad_norm": 0.262557233485991,
"learning_rate": 1.6847365794255363e-05,
"loss": 1.1049,
"step": 1525
},
{
"epoch": 0.6701708278580815,
"grad_norm": 0.3001616781731598,
"learning_rate": 1.68194554810894e-05,
"loss": 1.1039,
"step": 1530
},
{
"epoch": 0.6723609286027157,
"grad_norm": 0.27415291968349004,
"learning_rate": 1.6791445508896784e-05,
"loss": 1.1135,
"step": 1535
},
{
"epoch": 0.67455102934735,
"grad_norm": 0.2732543195254785,
"learning_rate": 1.6763336287013216e-05,
"loss": 1.1082,
"step": 1540
},
{
"epoch": 0.6767411300919842,
"grad_norm": 0.26943767063054014,
"learning_rate": 1.6735128226224816e-05,
"loss": 1.0922,
"step": 1545
},
{
"epoch": 0.6789312308366184,
"grad_norm": 0.277744541655017,
"learning_rate": 1.6706821738762138e-05,
"loss": 1.0938,
"step": 1550
},
{
"epoch": 0.6811213315812528,
"grad_norm": 0.2719454837646402,
"learning_rate": 1.6678417238294128e-05,
"loss": 1.0947,
"step": 1555
},
{
"epoch": 0.683311432325887,
"grad_norm": 0.2630586272470944,
"learning_rate": 1.6649915139922093e-05,
"loss": 1.1033,
"step": 1560
},
{
"epoch": 0.6855015330705212,
"grad_norm": 0.2724307550324142,
"learning_rate": 1.6621315860173627e-05,
"loss": 1.1441,
"step": 1565
},
{
"epoch": 0.6876916338151555,
"grad_norm": 0.2708643111258457,
"learning_rate": 1.659261981699653e-05,
"loss": 1.0734,
"step": 1570
},
{
"epoch": 0.6898817345597897,
"grad_norm": 0.2769771935758939,
"learning_rate": 1.656382742975268e-05,
"loss": 1.1176,
"step": 1575
},
{
"epoch": 0.692071835304424,
"grad_norm": 0.265923617478963,
"learning_rate": 1.6534939119211935e-05,
"loss": 1.1164,
"step": 1580
},
{
"epoch": 0.6942619360490583,
"grad_norm": 0.26879461120855397,
"learning_rate": 1.6505955307545972e-05,
"loss": 1.0928,
"step": 1585
},
{
"epoch": 0.6964520367936925,
"grad_norm": 0.275380991512307,
"learning_rate": 1.64768764183221e-05,
"loss": 1.1559,
"step": 1590
},
{
"epoch": 0.6986421375383267,
"grad_norm": 0.27286451730113803,
"learning_rate": 1.6447702876497097e-05,
"loss": 1.0912,
"step": 1595
},
{
"epoch": 0.700832238282961,
"grad_norm": 0.2599528062396287,
"learning_rate": 1.641843510841098e-05,
"loss": 1.1156,
"step": 1600
},
{
"epoch": 0.7030223390275953,
"grad_norm": 0.2825057968855528,
"learning_rate": 1.6389073541780784e-05,
"loss": 1.1156,
"step": 1605
},
{
"epoch": 0.7052124397722295,
"grad_norm": 0.26388995603859694,
"learning_rate": 1.635961860569431e-05,
"loss": 1.1191,
"step": 1610
},
{
"epoch": 0.7074025405168638,
"grad_norm": 0.2736402329338208,
"learning_rate": 1.633007073060385e-05,
"loss": 1.0949,
"step": 1615
},
{
"epoch": 0.709592641261498,
"grad_norm": 0.2738242842182442,
"learning_rate": 1.6300430348319903e-05,
"loss": 1.115,
"step": 1620
},
{
"epoch": 0.7117827420061322,
"grad_norm": 0.27283103691493066,
"learning_rate": 1.627069789200487e-05,
"loss": 1.1236,
"step": 1625
},
{
"epoch": 0.7139728427507666,
"grad_norm": 0.27042925057968625,
"learning_rate": 1.6240873796166696e-05,
"loss": 1.1115,
"step": 1630
},
{
"epoch": 0.7161629434954008,
"grad_norm": 0.2648740003276731,
"learning_rate": 1.621095849665255e-05,
"loss": 1.1135,
"step": 1635
},
{
"epoch": 0.718353044240035,
"grad_norm": 0.2699312698156825,
"learning_rate": 1.6180952430642452e-05,
"loss": 1.0814,
"step": 1640
},
{
"epoch": 0.7205431449846693,
"grad_norm": 0.2727113593417385,
"learning_rate": 1.615085603664286e-05,
"loss": 1.1227,
"step": 1645
},
{
"epoch": 0.7227332457293035,
"grad_norm": 0.261310750518031,
"learning_rate": 1.6120669754480295e-05,
"loss": 1.1074,
"step": 1650
},
{
"epoch": 0.7249233464739377,
"grad_norm": 0.2572311428806488,
"learning_rate": 1.6090394025294885e-05,
"loss": 1.1078,
"step": 1655
},
{
"epoch": 0.7271134472185721,
"grad_norm": 0.2626062536380091,
"learning_rate": 1.606002929153394e-05,
"loss": 1.0889,
"step": 1660
},
{
"epoch": 0.7293035479632063,
"grad_norm": 0.27208847516673046,
"learning_rate": 1.602957599694547e-05,
"loss": 1.1285,
"step": 1665
},
{
"epoch": 0.7314936487078406,
"grad_norm": 0.25895874544574315,
"learning_rate": 1.5999034586571705e-05,
"loss": 1.0859,
"step": 1670
},
{
"epoch": 0.7336837494524748,
"grad_norm": 0.26323986195539395,
"learning_rate": 1.59684055067426e-05,
"loss": 1.0869,
"step": 1675
},
{
"epoch": 0.735873850197109,
"grad_norm": 0.26562354048699777,
"learning_rate": 1.5937689205069304e-05,
"loss": 1.1176,
"step": 1680
},
{
"epoch": 0.7380639509417434,
"grad_norm": 0.2685404295636409,
"learning_rate": 1.5906886130437606e-05,
"loss": 1.1092,
"step": 1685
},
{
"epoch": 0.7402540516863776,
"grad_norm": 0.26206885479027087,
"learning_rate": 1.5875996733001405e-05,
"loss": 1.1057,
"step": 1690
},
{
"epoch": 0.7424441524310118,
"grad_norm": 0.2684814010119893,
"learning_rate": 1.5845021464176114e-05,
"loss": 1.1266,
"step": 1695
},
{
"epoch": 0.7446342531756461,
"grad_norm": 0.270738097510599,
"learning_rate": 1.581396077663206e-05,
"loss": 1.1207,
"step": 1700
},
{
"epoch": 0.7468243539202803,
"grad_norm": 0.26716036294720524,
"learning_rate": 1.5782815124287867e-05,
"loss": 1.1014,
"step": 1705
},
{
"epoch": 0.7490144546649146,
"grad_norm": 0.26694423569483944,
"learning_rate": 1.575158496230383e-05,
"loss": 1.1039,
"step": 1710
},
{
"epoch": 0.7512045554095489,
"grad_norm": 0.2815172017782616,
"learning_rate": 1.5720270747075277e-05,
"loss": 1.0941,
"step": 1715
},
{
"epoch": 0.7533946561541831,
"grad_norm": 0.2624397530715869,
"learning_rate": 1.568887293622587e-05,
"loss": 1.0947,
"step": 1720
},
{
"epoch": 0.7555847568988173,
"grad_norm": 0.26315263302354647,
"learning_rate": 1.565739198860093e-05,
"loss": 1.101,
"step": 1725
},
{
"epoch": 0.7577748576434516,
"grad_norm": 0.26819487927961905,
"learning_rate": 1.562582836426074e-05,
"loss": 1.1062,
"step": 1730
},
{
"epoch": 0.7599649583880859,
"grad_norm": 0.2752388588069863,
"learning_rate": 1.559418252447381e-05,
"loss": 1.118,
"step": 1735
},
{
"epoch": 0.7621550591327201,
"grad_norm": 0.2743015464537201,
"learning_rate": 1.5562454931710146e-05,
"loss": 1.1061,
"step": 1740
},
{
"epoch": 0.7643451598773544,
"grad_norm": 0.2606577851181012,
"learning_rate": 1.5530646049634473e-05,
"loss": 1.0871,
"step": 1745
},
{
"epoch": 0.7665352606219886,
"grad_norm": 0.26480322824428465,
"learning_rate": 1.5498756343099495e-05,
"loss": 1.1221,
"step": 1750
},
{
"epoch": 0.7687253613666228,
"grad_norm": 0.2668874639775243,
"learning_rate": 1.5466786278139054e-05,
"loss": 1.1434,
"step": 1755
},
{
"epoch": 0.7709154621112572,
"grad_norm": 0.26968985298684783,
"learning_rate": 1.543473632196136e-05,
"loss": 1.1061,
"step": 1760
},
{
"epoch": 0.7731055628558914,
"grad_norm": 0.270316749677291,
"learning_rate": 1.540260694294214e-05,
"loss": 1.1309,
"step": 1765
},
{
"epoch": 0.7752956636005256,
"grad_norm": 0.27316189703503946,
"learning_rate": 1.5370398610617804e-05,
"loss": 1.1191,
"step": 1770
},
{
"epoch": 0.7774857643451599,
"grad_norm": 0.27324029056852334,
"learning_rate": 1.5338111795678585e-05,
"loss": 1.1016,
"step": 1775
},
{
"epoch": 0.7796758650897941,
"grad_norm": 0.26275659139758095,
"learning_rate": 1.530574696996164e-05,
"loss": 1.091,
"step": 1780
},
{
"epoch": 0.7818659658344284,
"grad_norm": 0.2608113586761751,
"learning_rate": 1.5273304606444185e-05,
"loss": 1.1047,
"step": 1785
},
{
"epoch": 0.7840560665790627,
"grad_norm": 0.2669656071169941,
"learning_rate": 1.5240785179236556e-05,
"loss": 1.1051,
"step": 1790
},
{
"epoch": 0.7862461673236969,
"grad_norm": 0.2787846832682112,
"learning_rate": 1.5208189163575306e-05,
"loss": 1.0857,
"step": 1795
},
{
"epoch": 0.7884362680683311,
"grad_norm": 0.2681859587142459,
"learning_rate": 1.5175517035816236e-05,
"loss": 1.0865,
"step": 1800
},
{
"epoch": 0.7906263688129654,
"grad_norm": 0.2647278576482758,
"learning_rate": 1.5142769273427445e-05,
"loss": 1.0879,
"step": 1805
},
{
"epoch": 0.7928164695575997,
"grad_norm": 0.26988725688107346,
"learning_rate": 1.5109946354982352e-05,
"loss": 1.1203,
"step": 1810
},
{
"epoch": 0.7950065703022339,
"grad_norm": 0.26671062187636824,
"learning_rate": 1.5077048760152701e-05,
"loss": 1.102,
"step": 1815
},
{
"epoch": 0.7971966710468682,
"grad_norm": 0.26783415660796006,
"learning_rate": 1.5044076969701551e-05,
"loss": 1.0967,
"step": 1820
},
{
"epoch": 0.7993867717915024,
"grad_norm": 0.26130280226328173,
"learning_rate": 1.5011031465476249e-05,
"loss": 1.1078,
"step": 1825
},
{
"epoch": 0.8015768725361366,
"grad_norm": 0.27999326528902496,
"learning_rate": 1.4977912730401397e-05,
"loss": 1.0814,
"step": 1830
},
{
"epoch": 0.803766973280771,
"grad_norm": 0.25954065164515167,
"learning_rate": 1.4944721248471776e-05,
"loss": 1.1092,
"step": 1835
},
{
"epoch": 0.8059570740254052,
"grad_norm": 0.2664277831190814,
"learning_rate": 1.491145750474529e-05,
"loss": 1.1137,
"step": 1840
},
{
"epoch": 0.8081471747700394,
"grad_norm": 0.30116014543803976,
"learning_rate": 1.4878121985335879e-05,
"loss": 1.0996,
"step": 1845
},
{
"epoch": 0.8103372755146737,
"grad_norm": 0.2634337997526123,
"learning_rate": 1.484471517740639e-05,
"loss": 1.0869,
"step": 1850
},
{
"epoch": 0.8125273762593079,
"grad_norm": 0.2666163201346335,
"learning_rate": 1.4811237569161491e-05,
"loss": 1.0826,
"step": 1855
},
{
"epoch": 0.8147174770039421,
"grad_norm": 0.28026478782735814,
"learning_rate": 1.4777689649840518e-05,
"loss": 1.0844,
"step": 1860
},
{
"epoch": 0.8169075777485765,
"grad_norm": 0.2618418896808571,
"learning_rate": 1.4744071909710323e-05,
"loss": 1.1223,
"step": 1865
},
{
"epoch": 0.8190976784932107,
"grad_norm": 0.2785664585456496,
"learning_rate": 1.4710384840058114e-05,
"loss": 1.1012,
"step": 1870
},
{
"epoch": 0.8212877792378449,
"grad_norm": 0.2690351334092324,
"learning_rate": 1.4676628933184278e-05,
"loss": 1.1176,
"step": 1875
},
{
"epoch": 0.8234778799824792,
"grad_norm": 0.27324737770550867,
"learning_rate": 1.4642804682395186e-05,
"loss": 1.0885,
"step": 1880
},
{
"epoch": 0.8256679807271134,
"grad_norm": 0.2579358276970761,
"learning_rate": 1.4608912581995982e-05,
"loss": 1.1043,
"step": 1885
},
{
"epoch": 0.8278580814717477,
"grad_norm": 0.26105385274922205,
"learning_rate": 1.4574953127283353e-05,
"loss": 1.102,
"step": 1890
},
{
"epoch": 0.830048182216382,
"grad_norm": 0.2673842799065729,
"learning_rate": 1.4540926814538303e-05,
"loss": 1.0922,
"step": 1895
},
{
"epoch": 0.8322382829610162,
"grad_norm": 0.26252808529604915,
"learning_rate": 1.4506834141018895e-05,
"loss": 1.0918,
"step": 1900
},
{
"epoch": 0.8344283837056504,
"grad_norm": 0.26910609526126184,
"learning_rate": 1.4472675604952979e-05,
"loss": 1.0902,
"step": 1905
},
{
"epoch": 0.8366184844502847,
"grad_norm": 0.26133364932976627,
"learning_rate": 1.443845170553092e-05,
"loss": 1.0889,
"step": 1910
},
{
"epoch": 0.838808585194919,
"grad_norm": 0.27193238728414193,
"learning_rate": 1.440416294289829e-05,
"loss": 1.091,
"step": 1915
},
{
"epoch": 0.8409986859395532,
"grad_norm": 0.262829311113063,
"learning_rate": 1.4369809818148586e-05,
"loss": 1.0875,
"step": 1920
},
{
"epoch": 0.8431887866841875,
"grad_norm": 0.26524372560054243,
"learning_rate": 1.4335392833315862e-05,
"loss": 1.1064,
"step": 1925
},
{
"epoch": 0.8453788874288217,
"grad_norm": 0.2584636235528877,
"learning_rate": 1.430091249136744e-05,
"loss": 1.1074,
"step": 1930
},
{
"epoch": 0.8475689881734559,
"grad_norm": 0.2775481255437133,
"learning_rate": 1.4266369296196532e-05,
"loss": 1.0938,
"step": 1935
},
{
"epoch": 0.8497590889180903,
"grad_norm": 0.26138245663872295,
"learning_rate": 1.4231763752614876e-05,
"loss": 1.1285,
"step": 1940
},
{
"epoch": 0.8519491896627245,
"grad_norm": 0.2707757340268956,
"learning_rate": 1.4197096366345372e-05,
"loss": 1.1139,
"step": 1945
},
{
"epoch": 0.8541392904073587,
"grad_norm": 0.26782566168738436,
"learning_rate": 1.4162367644014683e-05,
"loss": 1.107,
"step": 1950
},
{
"epoch": 0.856329391151993,
"grad_norm": 0.26526168698615754,
"learning_rate": 1.4127578093145833e-05,
"loss": 1.1086,
"step": 1955
},
{
"epoch": 0.8585194918966272,
"grad_norm": 0.263651586567362,
"learning_rate": 1.4092728222150789e-05,
"loss": 1.107,
"step": 1960
},
{
"epoch": 0.8607095926412615,
"grad_norm": 0.2753196350258517,
"learning_rate": 1.4057818540323034e-05,
"loss": 1.1176,
"step": 1965
},
{
"epoch": 0.8628996933858958,
"grad_norm": 0.2844631244822961,
"learning_rate": 1.4022849557830113e-05,
"loss": 1.0777,
"step": 1970
},
{
"epoch": 0.86508979413053,
"grad_norm": 0.2934373431409096,
"learning_rate": 1.3987821785706206e-05,
"loss": 1.1,
"step": 1975
},
{
"epoch": 0.8672798948751642,
"grad_norm": 0.25800409226074444,
"learning_rate": 1.395273573584462e-05,
"loss": 1.1039,
"step": 1980
},
{
"epoch": 0.8694699956197985,
"grad_norm": 0.27027320276142963,
"learning_rate": 1.3917591920990339e-05,
"loss": 1.1207,
"step": 1985
},
{
"epoch": 0.8716600963644328,
"grad_norm": 0.2693164308545569,
"learning_rate": 1.3882390854732518e-05,
"loss": 1.0965,
"step": 1990
},
{
"epoch": 0.873850197109067,
"grad_norm": 0.2606202597234289,
"learning_rate": 1.3847133051496981e-05,
"loss": 1.1125,
"step": 1995
},
{
"epoch": 0.8760402978537013,
"grad_norm": 0.2694454731907674,
"learning_rate": 1.3811819026538702e-05,
"loss": 1.0926,
"step": 2000
},
{
"epoch": 0.8782303985983355,
"grad_norm": 0.26631733983589906,
"learning_rate": 1.3776449295934274e-05,
"loss": 1.0949,
"step": 2005
},
{
"epoch": 0.8804204993429697,
"grad_norm": 0.26589757058511515,
"learning_rate": 1.3741024376574369e-05,
"loss": 1.098,
"step": 2010
},
{
"epoch": 0.8826106000876041,
"grad_norm": 0.2736792848915224,
"learning_rate": 1.3705544786156183e-05,
"loss": 1.1156,
"step": 2015
},
{
"epoch": 0.8848007008322383,
"grad_norm": 0.2658722341229626,
"learning_rate": 1.3670011043175871e-05,
"loss": 1.1164,
"step": 2020
},
{
"epoch": 0.8869908015768725,
"grad_norm": 0.26178245994021004,
"learning_rate": 1.3634423666920968e-05,
"loss": 1.1336,
"step": 2025
},
{
"epoch": 0.8891809023215068,
"grad_norm": 0.27553942746876153,
"learning_rate": 1.3598783177462807e-05,
"loss": 1.1238,
"step": 2030
},
{
"epoch": 0.891371003066141,
"grad_norm": 0.2681764008089611,
"learning_rate": 1.3563090095648907e-05,
"loss": 1.1117,
"step": 2035
},
{
"epoch": 0.8935611038107752,
"grad_norm": 0.2669696710935724,
"learning_rate": 1.3527344943095374e-05,
"loss": 1.1184,
"step": 2040
},
{
"epoch": 0.8957512045554096,
"grad_norm": 0.2573801447434789,
"learning_rate": 1.3491548242179267e-05,
"loss": 1.0996,
"step": 2045
},
{
"epoch": 0.8979413053000438,
"grad_norm": 0.2746222462070939,
"learning_rate": 1.345570051603097e-05,
"loss": 1.102,
"step": 2050
},
{
"epoch": 0.900131406044678,
"grad_norm": 0.2671374820083521,
"learning_rate": 1.3419802288526551e-05,
"loss": 1.0863,
"step": 2055
},
{
"epoch": 0.9023215067893123,
"grad_norm": 0.26407337641785006,
"learning_rate": 1.3383854084280088e-05,
"loss": 1.0873,
"step": 2060
},
{
"epoch": 0.9045116075339465,
"grad_norm": 0.26664290685548253,
"learning_rate": 1.3347856428636037e-05,
"loss": 1.0939,
"step": 2065
},
{
"epoch": 0.9067017082785808,
"grad_norm": 0.27742565755152865,
"learning_rate": 1.3311809847661512e-05,
"loss": 1.1012,
"step": 2070
},
{
"epoch": 0.9088918090232151,
"grad_norm": 0.26844433618793634,
"learning_rate": 1.3275714868138629e-05,
"loss": 1.0992,
"step": 2075
},
{
"epoch": 0.9110819097678493,
"grad_norm": 0.31156760423493884,
"learning_rate": 1.3239572017556792e-05,
"loss": 1.127,
"step": 2080
},
{
"epoch": 0.9132720105124835,
"grad_norm": 0.2737693950564783,
"learning_rate": 1.3203381824105001e-05,
"loss": 1.0795,
"step": 2085
},
{
"epoch": 0.9154621112571178,
"grad_norm": 0.2655988793911631,
"learning_rate": 1.31671448166641e-05,
"loss": 1.0982,
"step": 2090
},
{
"epoch": 0.9176522120017521,
"grad_norm": 0.278932714958382,
"learning_rate": 1.3130861524799091e-05,
"loss": 1.0846,
"step": 2095
},
{
"epoch": 0.9198423127463863,
"grad_norm": 0.2613274740773185,
"learning_rate": 1.309453247875136e-05,
"loss": 1.1,
"step": 2100
},
{
"epoch": 0.9220324134910206,
"grad_norm": 0.2713075516855544,
"learning_rate": 1.3058158209430944e-05,
"loss": 1.1074,
"step": 2105
},
{
"epoch": 0.9242225142356548,
"grad_norm": 0.2609833323859274,
"learning_rate": 1.3021739248408776e-05,
"loss": 1.107,
"step": 2110
},
{
"epoch": 0.926412614980289,
"grad_norm": 0.2977024683808482,
"learning_rate": 1.2985276127908897e-05,
"loss": 1.1176,
"step": 2115
},
{
"epoch": 0.9286027157249234,
"grad_norm": 0.27394749452645756,
"learning_rate": 1.2948769380800706e-05,
"loss": 1.0777,
"step": 2120
},
{
"epoch": 0.9307928164695576,
"grad_norm": 0.2566984919681051,
"learning_rate": 1.2912219540591145e-05,
"loss": 1.1031,
"step": 2125
},
{
"epoch": 0.9329829172141918,
"grad_norm": 0.2648683843482838,
"learning_rate": 1.2875627141416926e-05,
"loss": 1.1004,
"step": 2130
},
{
"epoch": 0.9351730179588261,
"grad_norm": 0.27158910578936263,
"learning_rate": 1.2838992718036707e-05,
"loss": 1.1086,
"step": 2135
},
{
"epoch": 0.9373631187034603,
"grad_norm": 0.28448174882950567,
"learning_rate": 1.2802316805823293e-05,
"loss": 1.0879,
"step": 2140
},
{
"epoch": 0.9395532194480947,
"grad_norm": 0.2662017189971071,
"learning_rate": 1.27655999407558e-05,
"loss": 1.1258,
"step": 2145
},
{
"epoch": 0.9417433201927289,
"grad_norm": 0.27194322611948335,
"learning_rate": 1.2728842659411815e-05,
"loss": 1.091,
"step": 2150
},
{
"epoch": 0.9439334209373631,
"grad_norm": 0.25534754454963765,
"learning_rate": 1.2692045498959584e-05,
"loss": 1.0658,
"step": 2155
},
{
"epoch": 0.9461235216819974,
"grad_norm": 0.26353467479376913,
"learning_rate": 1.2655208997150134e-05,
"loss": 1.084,
"step": 2160
},
{
"epoch": 0.9483136224266316,
"grad_norm": 0.2627027873198173,
"learning_rate": 1.2618333692309426e-05,
"loss": 1.1117,
"step": 2165
},
{
"epoch": 0.9505037231712659,
"grad_norm": 0.2689357563785859,
"learning_rate": 1.2581420123330476e-05,
"loss": 1.1363,
"step": 2170
},
{
"epoch": 0.9526938239159002,
"grad_norm": 0.26046024725339567,
"learning_rate": 1.2544468829665503e-05,
"loss": 1.1094,
"step": 2175
},
{
"epoch": 0.9548839246605344,
"grad_norm": 0.2598928969406913,
"learning_rate": 1.2507480351318032e-05,
"loss": 1.1092,
"step": 2180
},
{
"epoch": 0.9570740254051686,
"grad_norm": 0.2698599669899258,
"learning_rate": 1.2470455228834987e-05,
"loss": 1.1307,
"step": 2185
},
{
"epoch": 0.9592641261498029,
"grad_norm": 0.28035682075399004,
"learning_rate": 1.243339400329882e-05,
"loss": 1.1156,
"step": 2190
},
{
"epoch": 0.9614542268944372,
"grad_norm": 0.2645856709181594,
"learning_rate": 1.2396297216319588e-05,
"loss": 1.0924,
"step": 2195
},
{
"epoch": 0.9636443276390714,
"grad_norm": 0.263618320158081,
"learning_rate": 1.2359165410027038e-05,
"loss": 1.0682,
"step": 2200
},
{
"epoch": 0.9658344283837057,
"grad_norm": 0.2650009200265081,
"learning_rate": 1.232199912706269e-05,
"loss": 1.1012,
"step": 2205
},
{
"epoch": 0.9680245291283399,
"grad_norm": 0.26416661065971203,
"learning_rate": 1.2284798910571898e-05,
"loss": 1.1186,
"step": 2210
},
{
"epoch": 0.9702146298729741,
"grad_norm": 0.25736813068625275,
"learning_rate": 1.2247565304195924e-05,
"loss": 1.0914,
"step": 2215
},
{
"epoch": 0.9724047306176085,
"grad_norm": 0.2586875575139371,
"learning_rate": 1.2210298852063984e-05,
"loss": 1.1008,
"step": 2220
},
{
"epoch": 0.9745948313622427,
"grad_norm": 0.26173267154631147,
"learning_rate": 1.2173000098785299e-05,
"loss": 1.0914,
"step": 2225
},
{
"epoch": 0.9767849321068769,
"grad_norm": 0.26864230049294224,
"learning_rate": 1.2135669589441141e-05,
"loss": 1.0641,
"step": 2230
},
{
"epoch": 0.9789750328515112,
"grad_norm": 0.2655742223524583,
"learning_rate": 1.2098307869576858e-05,
"loss": 1.1055,
"step": 2235
},
{
"epoch": 0.9811651335961454,
"grad_norm": 0.2671231847391253,
"learning_rate": 1.2060915485193907e-05,
"loss": 1.0871,
"step": 2240
},
{
"epoch": 0.9833552343407796,
"grad_norm": 0.26788579163236637,
"learning_rate": 1.2023492982741875e-05,
"loss": 1.1068,
"step": 2245
},
{
"epoch": 0.985545335085414,
"grad_norm": 0.28062285715180646,
"learning_rate": 1.1986040909110494e-05,
"loss": 1.1516,
"step": 2250
},
{
"epoch": 0.9877354358300482,
"grad_norm": 0.27291257622616144,
"learning_rate": 1.1948559811621645e-05,
"loss": 1.0926,
"step": 2255
},
{
"epoch": 0.9899255365746824,
"grad_norm": 0.2688314222045062,
"learning_rate": 1.1911050238021362e-05,
"loss": 1.0813,
"step": 2260
},
{
"epoch": 0.9921156373193167,
"grad_norm": 0.26864057707347405,
"learning_rate": 1.1873512736471829e-05,
"loss": 1.0973,
"step": 2265
},
{
"epoch": 0.994305738063951,
"grad_norm": 0.2594131843312388,
"learning_rate": 1.1835947855543356e-05,
"loss": 1.0924,
"step": 2270
},
{
"epoch": 0.9964958388085852,
"grad_norm": 0.26027744632734207,
"learning_rate": 1.1798356144206395e-05,
"loss": 1.1008,
"step": 2275
},
{
"epoch": 0.9986859395532195,
"grad_norm": 0.2723136597045715,
"learning_rate": 1.1760738151823474e-05,
"loss": 1.0941,
"step": 2280
},
{
"epoch": 1.0008760402978536,
"grad_norm": 0.26592399363964636,
"learning_rate": 1.1723094428141202e-05,
"loss": 1.0813,
"step": 2285
},
{
"epoch": 1.003066141042488,
"grad_norm": 0.2741757414547693,
"learning_rate": 1.1685425523282218e-05,
"loss": 1.0701,
"step": 2290
},
{
"epoch": 1.0052562417871223,
"grad_norm": 0.2785639167681659,
"learning_rate": 1.1647731987737163e-05,
"loss": 1.0855,
"step": 2295
},
{
"epoch": 1.0074463425317564,
"grad_norm": 0.25921909631409684,
"learning_rate": 1.1610014372356621e-05,
"loss": 1.05,
"step": 2300
},
{
"epoch": 1.0096364432763907,
"grad_norm": 0.26730561024556904,
"learning_rate": 1.1572273228343085e-05,
"loss": 1.0709,
"step": 2305
},
{
"epoch": 1.011826544021025,
"grad_norm": 0.2666226352769794,
"learning_rate": 1.1534509107242886e-05,
"loss": 1.0814,
"step": 2310
},
{
"epoch": 1.0140166447656591,
"grad_norm": 0.26883958965423044,
"learning_rate": 1.1496722560938147e-05,
"loss": 1.0398,
"step": 2315
},
{
"epoch": 1.0162067455102934,
"grad_norm": 0.26461136654012524,
"learning_rate": 1.1458914141638708e-05,
"loss": 1.0699,
"step": 2320
},
{
"epoch": 1.0183968462549278,
"grad_norm": 0.2621701797623292,
"learning_rate": 1.1421084401874063e-05,
"loss": 1.0684,
"step": 2325
},
{
"epoch": 1.0205869469995619,
"grad_norm": 0.2638410480632078,
"learning_rate": 1.1383233894485278e-05,
"loss": 1.0539,
"step": 2330
},
{
"epoch": 1.0227770477441962,
"grad_norm": 0.2773665371526558,
"learning_rate": 1.134536317261691e-05,
"loss": 1.0666,
"step": 2335
},
{
"epoch": 1.0249671484888305,
"grad_norm": 0.27738246423485236,
"learning_rate": 1.1307472789708942e-05,
"loss": 1.0904,
"step": 2340
},
{
"epoch": 1.0271572492334646,
"grad_norm": 0.26555678082360085,
"learning_rate": 1.1269563299488673e-05,
"loss": 1.0764,
"step": 2345
},
{
"epoch": 1.029347349978099,
"grad_norm": 0.2702748541117833,
"learning_rate": 1.123163525596264e-05,
"loss": 1.0652,
"step": 2350
},
{
"epoch": 1.0315374507227333,
"grad_norm": 0.2832727236287343,
"learning_rate": 1.119368921340851e-05,
"loss": 1.0742,
"step": 2355
},
{
"epoch": 1.0337275514673676,
"grad_norm": 0.2749938354036527,
"learning_rate": 1.1155725726366992e-05,
"loss": 1.0711,
"step": 2360
},
{
"epoch": 1.0359176522120017,
"grad_norm": 0.26230244379974577,
"learning_rate": 1.1117745349633723e-05,
"loss": 1.0717,
"step": 2365
},
{
"epoch": 1.038107752956636,
"grad_norm": 0.2664760872899082,
"learning_rate": 1.1079748638251175e-05,
"loss": 1.0805,
"step": 2370
},
{
"epoch": 1.0402978537012704,
"grad_norm": 0.27367024800241485,
"learning_rate": 1.1041736147500522e-05,
"loss": 1.0707,
"step": 2375
},
{
"epoch": 1.0424879544459045,
"grad_norm": 0.27181089092067195,
"learning_rate": 1.1003708432893531e-05,
"loss": 1.0416,
"step": 2380
},
{
"epoch": 1.0446780551905388,
"grad_norm": 0.2665172852872497,
"learning_rate": 1.0965666050164477e-05,
"loss": 1.0686,
"step": 2385
},
{
"epoch": 1.0468681559351731,
"grad_norm": 0.27115445836831914,
"learning_rate": 1.0927609555261962e-05,
"loss": 1.0721,
"step": 2390
},
{
"epoch": 1.0490582566798072,
"grad_norm": 0.26962744736855904,
"learning_rate": 1.088953950434084e-05,
"loss": 1.073,
"step": 2395
},
{
"epoch": 1.0512483574244416,
"grad_norm": 0.2682198503489364,
"learning_rate": 1.0851456453754068e-05,
"loss": 1.0363,
"step": 2400
},
{
"epoch": 1.0534384581690759,
"grad_norm": 0.26286369163664847,
"learning_rate": 1.0813360960044579e-05,
"loss": 1.0785,
"step": 2405
},
{
"epoch": 1.05562855891371,
"grad_norm": 0.2669382343591095,
"learning_rate": 1.0775253579937148e-05,
"loss": 1.075,
"step": 2410
},
{
"epoch": 1.0578186596583443,
"grad_norm": 0.2752037755375215,
"learning_rate": 1.0737134870330255e-05,
"loss": 1.068,
"step": 2415
},
{
"epoch": 1.0600087604029786,
"grad_norm": 0.271664654445446,
"learning_rate": 1.0699005388287959e-05,
"loss": 1.1002,
"step": 2420
},
{
"epoch": 1.0621988611476127,
"grad_norm": 0.272588814423304,
"learning_rate": 1.0660865691031729e-05,
"loss": 1.0742,
"step": 2425
},
{
"epoch": 1.064388961892247,
"grad_norm": 0.266723825837105,
"learning_rate": 1.0622716335932336e-05,
"loss": 1.0908,
"step": 2430
},
{
"epoch": 1.0665790626368814,
"grad_norm": 0.2633628906535874,
"learning_rate": 1.0584557880501678e-05,
"loss": 1.0723,
"step": 2435
},
{
"epoch": 1.0687691633815155,
"grad_norm": 0.2715087145721187,
"learning_rate": 1.0546390882384649e-05,
"loss": 1.0924,
"step": 2440
},
{
"epoch": 1.0709592641261498,
"grad_norm": 0.2643851262115833,
"learning_rate": 1.0508215899350986e-05,
"loss": 1.0527,
"step": 2445
},
{
"epoch": 1.0731493648707842,
"grad_norm": 0.264344202748604,
"learning_rate": 1.0470033489287115e-05,
"loss": 1.0697,
"step": 2450
},
{
"epoch": 1.0753394656154183,
"grad_norm": 0.27016171316360305,
"learning_rate": 1.0431844210188e-05,
"loss": 1.067,
"step": 2455
},
{
"epoch": 1.0775295663600526,
"grad_norm": 0.26462744675653105,
"learning_rate": 1.039364862014899e-05,
"loss": 1.0553,
"step": 2460
},
{
"epoch": 1.079719667104687,
"grad_norm": 0.25957249915751707,
"learning_rate": 1.0355447277357667e-05,
"loss": 1.0662,
"step": 2465
},
{
"epoch": 1.081909767849321,
"grad_norm": 0.2682650123947121,
"learning_rate": 1.0317240740085666e-05,
"loss": 1.0496,
"step": 2470
},
{
"epoch": 1.0840998685939554,
"grad_norm": 0.27783105584443907,
"learning_rate": 1.0279029566680556e-05,
"loss": 1.0521,
"step": 2475
},
{
"epoch": 1.0862899693385897,
"grad_norm": 0.26626718276311767,
"learning_rate": 1.024081431555764e-05,
"loss": 1.0441,
"step": 2480
},
{
"epoch": 1.0884800700832238,
"grad_norm": 0.2629927537429945,
"learning_rate": 1.0202595545191827e-05,
"loss": 1.0576,
"step": 2485
},
{
"epoch": 1.090670170827858,
"grad_norm": 0.2620068996693131,
"learning_rate": 1.0164373814109439e-05,
"loss": 1.0625,
"step": 2490
},
{
"epoch": 1.0928602715724924,
"grad_norm": 0.2653845655607556,
"learning_rate": 1.0126149680880084e-05,
"loss": 1.0582,
"step": 2495
},
{
"epoch": 1.0950503723171265,
"grad_norm": 0.27299625220300644,
"learning_rate": 1.0087923704108462e-05,
"loss": 1.0934,
"step": 2500
},
{
"epoch": 1.0972404730617609,
"grad_norm": 0.26498736474890866,
"learning_rate": 1.0049696442426222e-05,
"loss": 1.058,
"step": 2505
},
{
"epoch": 1.0994305738063952,
"grad_norm": 0.2709488000586329,
"learning_rate": 1.0011468454483781e-05,
"loss": 1.0719,
"step": 2510
},
{
"epoch": 1.1016206745510293,
"grad_norm": 0.26135794077525687,
"learning_rate": 9.973240298942187e-06,
"loss": 1.0545,
"step": 2515
},
{
"epoch": 1.1038107752956636,
"grad_norm": 0.2582671924491208,
"learning_rate": 9.93501253446491e-06,
"loss": 1.0656,
"step": 2520
},
{
"epoch": 1.106000876040298,
"grad_norm": 0.2649770195518186,
"learning_rate": 9.896785719709735e-06,
"loss": 1.0789,
"step": 2525
},
{
"epoch": 1.108190976784932,
"grad_norm": 0.26940514713119723,
"learning_rate": 9.85856041332055e-06,
"loss": 1.0754,
"step": 2530
},
{
"epoch": 1.1103810775295664,
"grad_norm": 0.26672673745336384,
"learning_rate": 9.820337173919201e-06,
"loss": 1.0426,
"step": 2535
},
{
"epoch": 1.1125711782742007,
"grad_norm": 0.26266521432117856,
"learning_rate": 9.782116560097334e-06,
"loss": 1.0736,
"step": 2540
},
{
"epoch": 1.1147612790188348,
"grad_norm": 0.2661017661171288,
"learning_rate": 9.743899130408216e-06,
"loss": 1.0596,
"step": 2545
},
{
"epoch": 1.1169513797634691,
"grad_norm": 0.26244326034773946,
"learning_rate": 9.705685443358586e-06,
"loss": 1.0768,
"step": 2550
},
{
"epoch": 1.1191414805081035,
"grad_norm": 0.26228295242456245,
"learning_rate": 9.667476057400492e-06,
"loss": 1.0412,
"step": 2555
},
{
"epoch": 1.1213315812527376,
"grad_norm": 0.2629461038584397,
"learning_rate": 9.629271530923126e-06,
"loss": 1.0465,
"step": 2560
},
{
"epoch": 1.123521681997372,
"grad_norm": 0.271093061900036,
"learning_rate": 9.591072422244654e-06,
"loss": 1.0406,
"step": 2565
},
{
"epoch": 1.1257117827420062,
"grad_norm": 0.2822471803911114,
"learning_rate": 9.552879289604075e-06,
"loss": 1.074,
"step": 2570
},
{
"epoch": 1.1279018834866403,
"grad_norm": 0.26861806302874436,
"learning_rate": 9.514692691153058e-06,
"loss": 1.0783,
"step": 2575
},
{
"epoch": 1.1300919842312747,
"grad_norm": 0.27817880338620504,
"learning_rate": 9.476513184947769e-06,
"loss": 1.066,
"step": 2580
},
{
"epoch": 1.132282084975909,
"grad_norm": 0.270732129128024,
"learning_rate": 9.438341328940742e-06,
"loss": 1.0664,
"step": 2585
},
{
"epoch": 1.134472185720543,
"grad_norm": 0.26430623776114975,
"learning_rate": 9.400177680972696e-06,
"loss": 1.0818,
"step": 2590
},
{
"epoch": 1.1366622864651774,
"grad_norm": 0.2696174677769872,
"learning_rate": 9.362022798764424e-06,
"loss": 1.0467,
"step": 2595
},
{
"epoch": 1.1388523872098117,
"grad_norm": 0.2739729791232298,
"learning_rate": 9.323877239908587e-06,
"loss": 1.0523,
"step": 2600
},
{
"epoch": 1.1410424879544458,
"grad_norm": 0.26804487327058313,
"learning_rate": 9.28574156186162e-06,
"loss": 1.0672,
"step": 2605
},
{
"epoch": 1.1432325886990802,
"grad_norm": 0.2591521070265315,
"learning_rate": 9.247616321935539e-06,
"loss": 1.0719,
"step": 2610
},
{
"epoch": 1.1454226894437145,
"grad_norm": 0.25951317593104206,
"learning_rate": 9.209502077289836e-06,
"loss": 1.0447,
"step": 2615
},
{
"epoch": 1.1476127901883486,
"grad_norm": 0.2652460820102422,
"learning_rate": 9.17139938492331e-06,
"loss": 1.0521,
"step": 2620
},
{
"epoch": 1.149802890932983,
"grad_norm": 0.2602152514750744,
"learning_rate": 9.133308801665937e-06,
"loss": 1.0523,
"step": 2625
},
{
"epoch": 1.1519929916776173,
"grad_norm": 0.2705995487451913,
"learning_rate": 9.095230884170727e-06,
"loss": 1.0396,
"step": 2630
},
{
"epoch": 1.1541830924222514,
"grad_norm": 0.2604370066458989,
"learning_rate": 9.057166188905604e-06,
"loss": 1.0588,
"step": 2635
},
{
"epoch": 1.1563731931668857,
"grad_norm": 0.2737060731254108,
"learning_rate": 9.01911527214526e-06,
"loss": 1.0502,
"step": 2640
},
{
"epoch": 1.15856329391152,
"grad_norm": 0.27030630333355876,
"learning_rate": 8.98107868996302e-06,
"loss": 1.0725,
"step": 2645
},
{
"epoch": 1.1607533946561541,
"grad_norm": 0.26859131666568503,
"learning_rate": 8.943056998222733e-06,
"loss": 1.0652,
"step": 2650
},
{
"epoch": 1.1629434954007885,
"grad_norm": 0.2641393090266361,
"learning_rate": 8.905050752570637e-06,
"loss": 1.0668,
"step": 2655
},
{
"epoch": 1.1651335961454228,
"grad_norm": 0.2756537145265912,
"learning_rate": 8.867060508427245e-06,
"loss": 1.0738,
"step": 2660
},
{
"epoch": 1.1673236968900569,
"grad_norm": 0.26685077848408933,
"learning_rate": 8.82908682097922e-06,
"loss": 1.0604,
"step": 2665
},
{
"epoch": 1.1695137976346912,
"grad_norm": 0.2753178085823332,
"learning_rate": 8.791130245171278e-06,
"loss": 1.0676,
"step": 2670
},
{
"epoch": 1.1717038983793255,
"grad_norm": 0.2732703952765648,
"learning_rate": 8.753191335698049e-06,
"loss": 1.0744,
"step": 2675
},
{
"epoch": 1.1738939991239596,
"grad_norm": 0.2603013658010823,
"learning_rate": 8.715270646996002e-06,
"loss": 1.0529,
"step": 2680
},
{
"epoch": 1.176084099868594,
"grad_norm": 0.2634391048696047,
"learning_rate": 8.677368733235328e-06,
"loss": 1.0801,
"step": 2685
},
{
"epoch": 1.1782742006132283,
"grad_norm": 0.27467962055701084,
"learning_rate": 8.639486148311833e-06,
"loss": 1.0725,
"step": 2690
},
{
"epoch": 1.1804643013578624,
"grad_norm": 0.2721967843702617,
"learning_rate": 8.601623445838865e-06,
"loss": 1.0406,
"step": 2695
},
{
"epoch": 1.1826544021024967,
"grad_norm": 0.25984926483960213,
"learning_rate": 8.563781179139202e-06,
"loss": 1.0504,
"step": 2700
},
{
"epoch": 1.184844502847131,
"grad_norm": 0.2632769236380217,
"learning_rate": 8.525959901236975e-06,
"loss": 1.052,
"step": 2705
},
{
"epoch": 1.1870346035917652,
"grad_norm": 0.2752690229086118,
"learning_rate": 8.488160164849596e-06,
"loss": 1.0611,
"step": 2710
},
{
"epoch": 1.1892247043363995,
"grad_norm": 0.25299421098700864,
"learning_rate": 8.450382522379668e-06,
"loss": 1.0732,
"step": 2715
},
{
"epoch": 1.1914148050810338,
"grad_norm": 0.26715997535203595,
"learning_rate": 8.412627525906902e-06,
"loss": 1.0445,
"step": 2720
},
{
"epoch": 1.193604905825668,
"grad_norm": 0.284833895697399,
"learning_rate": 8.374895727180079e-06,
"loss": 1.0334,
"step": 2725
},
{
"epoch": 1.1957950065703022,
"grad_norm": 0.26556547667567937,
"learning_rate": 8.33718767760896e-06,
"loss": 1.0762,
"step": 2730
},
{
"epoch": 1.1979851073149366,
"grad_norm": 0.27538602694646613,
"learning_rate": 8.299503928256238e-06,
"loss": 1.0738,
"step": 2735
},
{
"epoch": 1.2001752080595707,
"grad_norm": 0.2705036393436721,
"learning_rate": 8.261845029829488e-06,
"loss": 1.0736,
"step": 2740
},
{
"epoch": 1.202365308804205,
"grad_norm": 0.26006945720175584,
"learning_rate": 8.224211532673117e-06,
"loss": 1.0613,
"step": 2745
},
{
"epoch": 1.2045554095488393,
"grad_norm": 0.2723552029684137,
"learning_rate": 8.186603986760316e-06,
"loss": 1.0287,
"step": 2750
},
{
"epoch": 1.2067455102934734,
"grad_norm": 0.25851377292505096,
"learning_rate": 8.149022941685023e-06,
"loss": 1.0803,
"step": 2755
},
{
"epoch": 1.2089356110381078,
"grad_norm": 0.26629820551366745,
"learning_rate": 8.111468946653901e-06,
"loss": 1.073,
"step": 2760
},
{
"epoch": 1.211125711782742,
"grad_norm": 0.2699109190074932,
"learning_rate": 8.073942550478307e-06,
"loss": 1.0629,
"step": 2765
},
{
"epoch": 1.2133158125273762,
"grad_norm": 0.25618392505342463,
"learning_rate": 8.03644430156626e-06,
"loss": 1.0521,
"step": 2770
},
{
"epoch": 1.2155059132720105,
"grad_norm": 0.2694808758637472,
"learning_rate": 7.998974747914449e-06,
"loss": 1.0732,
"step": 2775
},
{
"epoch": 1.2176960140166448,
"grad_norm": 0.27962922484388153,
"learning_rate": 7.9615344371002e-06,
"loss": 1.0859,
"step": 2780
},
{
"epoch": 1.219886114761279,
"grad_norm": 0.27750174727788907,
"learning_rate": 7.924123916273504e-06,
"loss": 1.0875,
"step": 2785
},
{
"epoch": 1.2220762155059133,
"grad_norm": 0.2592875926566742,
"learning_rate": 7.886743732148986e-06,
"loss": 1.0826,
"step": 2790
},
{
"epoch": 1.2242663162505476,
"grad_norm": 0.2654730019317536,
"learning_rate": 7.849394430997941e-06,
"loss": 1.0619,
"step": 2795
},
{
"epoch": 1.2264564169951817,
"grad_norm": 0.26072631883327624,
"learning_rate": 7.81207655864034e-06,
"loss": 1.0437,
"step": 2800
},
{
"epoch": 1.228646517739816,
"grad_norm": 0.26825325034393344,
"learning_rate": 7.774790660436857e-06,
"loss": 1.0676,
"step": 2805
},
{
"epoch": 1.2308366184844504,
"grad_norm": 0.2566995101810123,
"learning_rate": 7.7375372812809e-06,
"loss": 1.0529,
"step": 2810
},
{
"epoch": 1.2330267192290845,
"grad_norm": 0.2604309539637625,
"learning_rate": 7.700316965590638e-06,
"loss": 1.0619,
"step": 2815
},
{
"epoch": 1.2352168199737188,
"grad_norm": 0.2625246921961206,
"learning_rate": 7.663130257301064e-06,
"loss": 1.0891,
"step": 2820
},
{
"epoch": 1.2374069207183531,
"grad_norm": 0.25855563736281145,
"learning_rate": 7.62597769985603e-06,
"loss": 1.0566,
"step": 2825
},
{
"epoch": 1.2395970214629872,
"grad_norm": 0.26232370362085894,
"learning_rate": 7.588859836200309e-06,
"loss": 1.0561,
"step": 2830
},
{
"epoch": 1.2417871222076216,
"grad_norm": 0.26917013720230804,
"learning_rate": 7.551777208771659e-06,
"loss": 1.0471,
"step": 2835
},
{
"epoch": 1.2439772229522559,
"grad_norm": 0.26375355389890487,
"learning_rate": 7.514730359492905e-06,
"loss": 1.0592,
"step": 2840
},
{
"epoch": 1.24616732369689,
"grad_norm": 0.26648356006456103,
"learning_rate": 7.477719829764008e-06,
"loss": 1.0469,
"step": 2845
},
{
"epoch": 1.2483574244415243,
"grad_norm": 0.2728857993840234,
"learning_rate": 7.440746160454156e-06,
"loss": 1.0646,
"step": 2850
},
{
"epoch": 1.2505475251861586,
"grad_norm": 0.26552837575529725,
"learning_rate": 7.403809891893865e-06,
"loss": 1.0557,
"step": 2855
},
{
"epoch": 1.2527376259307927,
"grad_norm": 0.2688170697408606,
"learning_rate": 7.366911563867086e-06,
"loss": 1.0533,
"step": 2860
},
{
"epoch": 1.254927726675427,
"grad_norm": 0.2738164410511349,
"learning_rate": 7.330051715603295e-06,
"loss": 1.0559,
"step": 2865
},
{
"epoch": 1.2571178274200614,
"grad_norm": 0.2619756596211397,
"learning_rate": 7.293230885769638e-06,
"loss": 1.052,
"step": 2870
},
{
"epoch": 1.2593079281646955,
"grad_norm": 0.2729439802092583,
"learning_rate": 7.2564496124630455e-06,
"loss": 1.0621,
"step": 2875
},
{
"epoch": 1.2614980289093298,
"grad_norm": 0.2618676111912421,
"learning_rate": 7.219708433202368e-06,
"loss": 1.0527,
"step": 2880
},
{
"epoch": 1.2636881296539642,
"grad_norm": 0.2575770131741819,
"learning_rate": 7.183007884920534e-06,
"loss": 1.0813,
"step": 2885
},
{
"epoch": 1.2658782303985983,
"grad_norm": 0.268317818796291,
"learning_rate": 7.14634850395668e-06,
"loss": 1.0744,
"step": 2890
},
{
"epoch": 1.2680683311432326,
"grad_norm": 0.25989715732532104,
"learning_rate": 7.109730826048344e-06,
"loss": 1.0586,
"step": 2895
},
{
"epoch": 1.270258431887867,
"grad_norm": 0.2632165775296998,
"learning_rate": 7.073155386323602e-06,
"loss": 1.0463,
"step": 2900
},
{
"epoch": 1.272448532632501,
"grad_norm": 0.2561543218662453,
"learning_rate": 7.036622719293278e-06,
"loss": 1.0414,
"step": 2905
},
{
"epoch": 1.2746386333771353,
"grad_norm": 0.25666741513918634,
"learning_rate": 7.0001333588431055e-06,
"loss": 1.0973,
"step": 2910
},
{
"epoch": 1.2768287341217697,
"grad_norm": 0.2720271858166436,
"learning_rate": 6.963687838225948e-06,
"loss": 1.0404,
"step": 2915
},
{
"epoch": 1.2790188348664038,
"grad_norm": 0.2664079925854839,
"learning_rate": 6.927286690053996e-06,
"loss": 1.0627,
"step": 2920
},
{
"epoch": 1.281208935611038,
"grad_norm": 0.2616457737793427,
"learning_rate": 6.890930446290976e-06,
"loss": 1.076,
"step": 2925
},
{
"epoch": 1.2833990363556724,
"grad_norm": 0.26503317946827504,
"learning_rate": 6.854619638244399e-06,
"loss": 1.0523,
"step": 2930
},
{
"epoch": 1.2855891371003065,
"grad_norm": 0.2643470479220203,
"learning_rate": 6.8183547965577735e-06,
"loss": 1.0723,
"step": 2935
},
{
"epoch": 1.2877792378449409,
"grad_norm": 0.2547905467870185,
"learning_rate": 6.782136451202857e-06,
"loss": 1.0273,
"step": 2940
},
{
"epoch": 1.2899693385895752,
"grad_norm": 0.261647302996271,
"learning_rate": 6.745965131471915e-06,
"loss": 1.0414,
"step": 2945
},
{
"epoch": 1.2921594393342093,
"grad_norm": 0.24989074380882761,
"learning_rate": 6.709841365969989e-06,
"loss": 1.0352,
"step": 2950
},
{
"epoch": 1.2943495400788436,
"grad_norm": 0.259577860201249,
"learning_rate": 6.673765682607155e-06,
"loss": 1.0445,
"step": 2955
},
{
"epoch": 1.296539640823478,
"grad_norm": 0.2644082431413658,
"learning_rate": 6.637738608590831e-06,
"loss": 1.0443,
"step": 2960
},
{
"epoch": 1.298729741568112,
"grad_norm": 0.2618054773070754,
"learning_rate": 6.6017606704180555e-06,
"loss": 1.0693,
"step": 2965
},
{
"epoch": 1.3009198423127464,
"grad_norm": 0.26097876761733935,
"learning_rate": 6.565832393867808e-06,
"loss": 1.0543,
"step": 2970
},
{
"epoch": 1.3031099430573807,
"grad_norm": 0.2594806373702017,
"learning_rate": 6.529954303993305e-06,
"loss": 1.0521,
"step": 2975
},
{
"epoch": 1.3053000438020148,
"grad_norm": 0.25960390582097226,
"learning_rate": 6.494126925114341e-06,
"loss": 1.0398,
"step": 2980
},
{
"epoch": 1.3074901445466491,
"grad_norm": 0.26253903837359144,
"learning_rate": 6.458350780809634e-06,
"loss": 1.042,
"step": 2985
},
{
"epoch": 1.3096802452912835,
"grad_norm": 0.2654661460334143,
"learning_rate": 6.422626393909151e-06,
"loss": 1.0592,
"step": 2990
},
{
"epoch": 1.3118703460359176,
"grad_norm": 0.2492587439304439,
"learning_rate": 6.386954286486485e-06,
"loss": 1.0678,
"step": 2995
},
{
"epoch": 1.314060446780552,
"grad_norm": 0.26726116413928985,
"learning_rate": 6.351334979851218e-06,
"loss": 1.0725,
"step": 3000
},
{
"epoch": 1.3162505475251862,
"grad_norm": 0.26088237090419825,
"learning_rate": 6.315768994541316e-06,
"loss": 1.0654,
"step": 3005
},
{
"epoch": 1.3184406482698203,
"grad_norm": 0.26118786607312605,
"learning_rate": 6.280256850315496e-06,
"loss": 1.05,
"step": 3010
},
{
"epoch": 1.3206307490144547,
"grad_norm": 0.26714886343958494,
"learning_rate": 6.244799066145657e-06,
"loss": 1.0594,
"step": 3015
},
{
"epoch": 1.322820849759089,
"grad_norm": 0.2697209076699916,
"learning_rate": 6.209396160209275e-06,
"loss": 1.0604,
"step": 3020
},
{
"epoch": 1.325010950503723,
"grad_norm": 0.2592918928971803,
"learning_rate": 6.1740486498818454e-06,
"loss": 1.0617,
"step": 3025
},
{
"epoch": 1.3272010512483574,
"grad_norm": 0.2744613652569151,
"learning_rate": 6.138757051729316e-06,
"loss": 1.0656,
"step": 3030
},
{
"epoch": 1.3293911519929917,
"grad_norm": 0.26102063894859023,
"learning_rate": 6.103521881500531e-06,
"loss": 1.0643,
"step": 3035
},
{
"epoch": 1.3315812527376258,
"grad_norm": 0.26308106234970585,
"learning_rate": 6.068343654119711e-06,
"loss": 1.0703,
"step": 3040
},
{
"epoch": 1.3337713534822602,
"grad_norm": 0.27250738827855164,
"learning_rate": 6.033222883678915e-06,
"loss": 1.0545,
"step": 3045
},
{
"epoch": 1.3359614542268945,
"grad_norm": 0.25548627781504274,
"learning_rate": 5.998160083430529e-06,
"loss": 1.0406,
"step": 3050
},
{
"epoch": 1.3381515549715286,
"grad_norm": 0.27175999771187154,
"learning_rate": 5.963155765779762e-06,
"loss": 1.0582,
"step": 3055
},
{
"epoch": 1.340341655716163,
"grad_norm": 0.2766896501373885,
"learning_rate": 5.928210442277176e-06,
"loss": 1.0434,
"step": 3060
},
{
"epoch": 1.3425317564607973,
"grad_norm": 0.26421628687562304,
"learning_rate": 5.893324623611178e-06,
"loss": 1.05,
"step": 3065
},
{
"epoch": 1.3447218572054314,
"grad_norm": 0.26784348877239994,
"learning_rate": 5.858498819600591e-06,
"loss": 1.0545,
"step": 3070
},
{
"epoch": 1.3469119579500657,
"grad_norm": 0.26753929682342953,
"learning_rate": 5.823733539187184e-06,
"loss": 1.0752,
"step": 3075
},
{
"epoch": 1.3491020586947,
"grad_norm": 0.2876683161118842,
"learning_rate": 5.789029290428234e-06,
"loss": 1.0551,
"step": 3080
},
{
"epoch": 1.3512921594393341,
"grad_norm": 0.2655828094593293,
"learning_rate": 5.754386580489118e-06,
"loss": 1.0725,
"step": 3085
},
{
"epoch": 1.3534822601839684,
"grad_norm": 0.2642140873534744,
"learning_rate": 5.719805915635872e-06,
"loss": 1.0303,
"step": 3090
},
{
"epoch": 1.3556723609286028,
"grad_norm": 0.2662184748201957,
"learning_rate": 5.685287801227819e-06,
"loss": 1.0682,
"step": 3095
},
{
"epoch": 1.3578624616732369,
"grad_norm": 0.2602261042223262,
"learning_rate": 5.65083274171018e-06,
"loss": 1.0734,
"step": 3100
},
{
"epoch": 1.3600525624178712,
"grad_norm": 0.26422024768061875,
"learning_rate": 5.616441240606685e-06,
"loss": 1.0686,
"step": 3105
},
{
"epoch": 1.3622426631625055,
"grad_norm": 0.26487517368734986,
"learning_rate": 5.5821138005122275e-06,
"loss": 1.0586,
"step": 3110
},
{
"epoch": 1.3644327639071396,
"grad_norm": 0.2665684900016343,
"learning_rate": 5.547850923085525e-06,
"loss": 1.0486,
"step": 3115
},
{
"epoch": 1.366622864651774,
"grad_norm": 0.26846022882339726,
"learning_rate": 5.513653109041784e-06,
"loss": 1.0537,
"step": 3120
},
{
"epoch": 1.3688129653964083,
"grad_norm": 0.2667169718942767,
"learning_rate": 5.479520858145366e-06,
"loss": 1.0777,
"step": 3125
},
{
"epoch": 1.3710030661410424,
"grad_norm": 0.26370064314585917,
"learning_rate": 5.4454546692025014e-06,
"loss": 1.0596,
"step": 3130
},
{
"epoch": 1.3731931668856767,
"grad_norm": 0.25983283968870546,
"learning_rate": 5.411455040054008e-06,
"loss": 1.0668,
"step": 3135
},
{
"epoch": 1.375383267630311,
"grad_norm": 0.2694060610420749,
"learning_rate": 5.377522467567988e-06,
"loss": 1.0311,
"step": 3140
},
{
"epoch": 1.3775733683749452,
"grad_norm": 0.2708644298850711,
"learning_rate": 5.343657447632593e-06,
"loss": 1.0604,
"step": 3145
},
{
"epoch": 1.3797634691195795,
"grad_norm": 0.2663715611937321,
"learning_rate": 5.30986047514875e-06,
"loss": 1.0434,
"step": 3150
},
{
"epoch": 1.3819535698642138,
"grad_norm": 0.2683831238281801,
"learning_rate": 5.276132044022976e-06,
"loss": 1.0648,
"step": 3155
},
{
"epoch": 1.384143670608848,
"grad_norm": 0.2710281366615512,
"learning_rate": 5.242472647160104e-06,
"loss": 1.0621,
"step": 3160
},
{
"epoch": 1.3863337713534822,
"grad_norm": 0.26635212498540184,
"learning_rate": 5.208882776456112e-06,
"loss": 1.0344,
"step": 3165
},
{
"epoch": 1.3885238720981166,
"grad_norm": 0.2646040753621649,
"learning_rate": 5.175362922790925e-06,
"loss": 1.0584,
"step": 3170
},
{
"epoch": 1.3907139728427507,
"grad_norm": 0.2606068539201707,
"learning_rate": 5.1419135760212546e-06,
"loss": 1.0566,
"step": 3175
},
{
"epoch": 1.392904073587385,
"grad_norm": 0.2692058993237337,
"learning_rate": 5.108535224973421e-06,
"loss": 1.084,
"step": 3180
},
{
"epoch": 1.3950941743320193,
"grad_norm": 0.2692100394219459,
"learning_rate": 5.075228357436215e-06,
"loss": 1.0594,
"step": 3185
},
{
"epoch": 1.3972842750766534,
"grad_norm": 0.2590485616992936,
"learning_rate": 5.04199346015378e-06,
"loss": 1.0588,
"step": 3190
},
{
"epoch": 1.3994743758212878,
"grad_norm": 0.2600923581263787,
"learning_rate": 5.0088310188184954e-06,
"loss": 1.0795,
"step": 3195
},
{
"epoch": 1.401664476565922,
"grad_norm": 0.26786057407617736,
"learning_rate": 4.975741518063863e-06,
"loss": 1.0332,
"step": 3200
},
{
"epoch": 1.4038545773105562,
"grad_norm": 0.2553993419762132,
"learning_rate": 4.9427254414574355e-06,
"loss": 1.0643,
"step": 3205
},
{
"epoch": 1.4060446780551905,
"grad_norm": 0.2650885102717858,
"learning_rate": 4.909783271493768e-06,
"loss": 1.0574,
"step": 3210
},
{
"epoch": 1.4082347787998248,
"grad_norm": 0.2690705321050312,
"learning_rate": 4.87691548958733e-06,
"loss": 1.076,
"step": 3215
},
{
"epoch": 1.410424879544459,
"grad_norm": 0.2635061966804039,
"learning_rate": 4.844122576065494e-06,
"loss": 1.0605,
"step": 3220
},
{
"epoch": 1.4126149802890933,
"grad_norm": 0.2707741119898313,
"learning_rate": 4.81140501016152e-06,
"loss": 1.0809,
"step": 3225
},
{
"epoch": 1.4148050810337276,
"grad_norm": 0.2648037571113503,
"learning_rate": 4.77876327000754e-06,
"loss": 1.0645,
"step": 3230
},
{
"epoch": 1.4169951817783617,
"grad_norm": 0.2639644819174903,
"learning_rate": 4.7461978326275686e-06,
"loss": 1.0477,
"step": 3235
},
{
"epoch": 1.419185282522996,
"grad_norm": 0.2719513637037621,
"learning_rate": 4.7137091739305356e-06,
"loss": 1.0807,
"step": 3240
},
{
"epoch": 1.4213753832676304,
"grad_norm": 0.26159805380609785,
"learning_rate": 4.681297768703346e-06,
"loss": 1.0668,
"step": 3245
},
{
"epoch": 1.4235654840122645,
"grad_norm": 0.25856122844361584,
"learning_rate": 4.648964090603913e-06,
"loss": 1.0664,
"step": 3250
},
{
"epoch": 1.4257555847568988,
"grad_norm": 0.26782272235769733,
"learning_rate": 4.616708612154258e-06,
"loss": 1.0363,
"step": 3255
},
{
"epoch": 1.4279456855015331,
"grad_norm": 0.259074751270484,
"learning_rate": 4.5845318047336e-06,
"loss": 1.0656,
"step": 3260
},
{
"epoch": 1.4301357862461672,
"grad_norm": 0.26806061151907784,
"learning_rate": 4.5524341385714675e-06,
"loss": 1.0324,
"step": 3265
},
{
"epoch": 1.4323258869908015,
"grad_norm": 0.2713642276306849,
"learning_rate": 4.520416082740816e-06,
"loss": 1.0514,
"step": 3270
},
{
"epoch": 1.4345159877354359,
"grad_norm": 0.2517619955218095,
"learning_rate": 4.4884781051511835e-06,
"loss": 1.0453,
"step": 3275
},
{
"epoch": 1.43670608848007,
"grad_norm": 0.2573589678247352,
"learning_rate": 4.456620672541859e-06,
"loss": 1.0744,
"step": 3280
},
{
"epoch": 1.4388961892247043,
"grad_norm": 0.26411247973588675,
"learning_rate": 4.424844250475043e-06,
"loss": 1.077,
"step": 3285
},
{
"epoch": 1.4410862899693386,
"grad_norm": 0.25817254393906147,
"learning_rate": 4.39314930332906e-06,
"loss": 1.067,
"step": 3290
},
{
"epoch": 1.4432763907139727,
"grad_norm": 0.255853414459153,
"learning_rate": 4.361536294291555e-06,
"loss": 1.075,
"step": 3295
},
{
"epoch": 1.445466491458607,
"grad_norm": 0.2621787716237385,
"learning_rate": 4.330005685352751e-06,
"loss": 1.0625,
"step": 3300
},
{
"epoch": 1.4476565922032414,
"grad_norm": 0.25777071723483336,
"learning_rate": 4.29855793729868e-06,
"loss": 1.0596,
"step": 3305
},
{
"epoch": 1.4498466929478755,
"grad_norm": 0.26127021300557535,
"learning_rate": 4.267193509704438e-06,
"loss": 1.0771,
"step": 3310
},
{
"epoch": 1.4520367936925098,
"grad_norm": 0.25960438092364824,
"learning_rate": 4.235912860927489e-06,
"loss": 1.0543,
"step": 3315
},
{
"epoch": 1.4542268944371441,
"grad_norm": 0.27211721141661616,
"learning_rate": 4.204716448100967e-06,
"loss": 1.0865,
"step": 3320
},
{
"epoch": 1.4564169951817783,
"grad_norm": 0.25962589139875863,
"learning_rate": 4.173604727126978e-06,
"loss": 1.06,
"step": 3325
},
{
"epoch": 1.4586070959264126,
"grad_norm": 0.2563513023704075,
"learning_rate": 4.142578152669946e-06,
"loss": 1.0633,
"step": 3330
},
{
"epoch": 1.460797196671047,
"grad_norm": 0.2575843897502921,
"learning_rate": 4.111637178149978e-06,
"loss": 1.0527,
"step": 3335
},
{
"epoch": 1.462987297415681,
"grad_norm": 0.26535361865514706,
"learning_rate": 4.0807822557362305e-06,
"loss": 1.0498,
"step": 3340
},
{
"epoch": 1.4651773981603153,
"grad_norm": 0.2666398926972173,
"learning_rate": 4.050013836340294e-06,
"loss": 1.0668,
"step": 3345
},
{
"epoch": 1.4673674989049497,
"grad_norm": 0.27206398230995277,
"learning_rate": 4.019332369609608e-06,
"loss": 1.0471,
"step": 3350
},
{
"epoch": 1.4695575996495838,
"grad_norm": 0.26515734844142613,
"learning_rate": 3.9887383039209045e-06,
"loss": 1.0787,
"step": 3355
},
{
"epoch": 1.471747700394218,
"grad_norm": 0.2627505561616997,
"learning_rate": 3.9582320863736315e-06,
"loss": 1.0564,
"step": 3360
},
{
"epoch": 1.4739378011388524,
"grad_norm": 0.2602708632102437,
"learning_rate": 3.927814162783431e-06,
"loss": 1.0602,
"step": 3365
},
{
"epoch": 1.4761279018834865,
"grad_norm": 0.2707223435716907,
"learning_rate": 3.897484977675634e-06,
"loss": 1.0525,
"step": 3370
},
{
"epoch": 1.4783180026281209,
"grad_norm": 0.2627452484071492,
"learning_rate": 3.867244974278741e-06,
"loss": 1.0527,
"step": 3375
},
{
"epoch": 1.4805081033727552,
"grad_norm": 0.26390966503788216,
"learning_rate": 3.83709459451797e-06,
"loss": 1.0367,
"step": 3380
},
{
"epoch": 1.4826982041173893,
"grad_norm": 0.25536736348425665,
"learning_rate": 3.80703427900877e-06,
"loss": 1.0518,
"step": 3385
},
{
"epoch": 1.4848883048620236,
"grad_norm": 0.2773298314887989,
"learning_rate": 3.777064467050415e-06,
"loss": 1.0789,
"step": 3390
},
{
"epoch": 1.487078405606658,
"grad_norm": 0.2728892928772842,
"learning_rate": 3.7471855966195556e-06,
"loss": 1.0832,
"step": 3395
},
{
"epoch": 1.489268506351292,
"grad_norm": 0.26553829473522816,
"learning_rate": 3.7173981043638317e-06,
"loss": 1.0607,
"step": 3400
},
{
"epoch": 1.4914586070959264,
"grad_norm": 0.2694225835399303,
"learning_rate": 3.687702425595485e-06,
"loss": 1.0824,
"step": 3405
},
{
"epoch": 1.4936487078405607,
"grad_norm": 0.26543120146820526,
"learning_rate": 3.658098994285011e-06,
"loss": 1.0738,
"step": 3410
},
{
"epoch": 1.4958388085851948,
"grad_norm": 0.26713945932896793,
"learning_rate": 3.628588243054807e-06,
"loss": 1.048,
"step": 3415
},
{
"epoch": 1.4980289093298291,
"grad_norm": 0.26162062775036266,
"learning_rate": 3.59917060317284e-06,
"loss": 1.0436,
"step": 3420
},
{
"epoch": 1.5002190100744635,
"grad_norm": 0.26485362804622087,
"learning_rate": 3.5698465045463594e-06,
"loss": 1.0617,
"step": 3425
},
{
"epoch": 1.5024091108190976,
"grad_norm": 0.2531141962202315,
"learning_rate": 3.5406163757156177e-06,
"loss": 1.0281,
"step": 3430
},
{
"epoch": 1.5045992115637319,
"grad_norm": 0.26406689492465707,
"learning_rate": 3.511480643847588e-06,
"loss": 1.042,
"step": 3435
},
{
"epoch": 1.5067893123083662,
"grad_norm": 0.26405636994206744,
"learning_rate": 3.4824397347297356e-06,
"loss": 1.0633,
"step": 3440
},
{
"epoch": 1.5089794130530003,
"grad_norm": 0.26746329470551067,
"learning_rate": 3.453494072763801e-06,
"loss": 1.0584,
"step": 3445
},
{
"epoch": 1.5111695137976346,
"grad_norm": 0.2617239083579329,
"learning_rate": 3.4246440809595782e-06,
"loss": 1.0818,
"step": 3450
},
{
"epoch": 1.513359614542269,
"grad_norm": 0.2577442031154383,
"learning_rate": 3.395890180928756e-06,
"loss": 1.0635,
"step": 3455
},
{
"epoch": 1.515549715286903,
"grad_norm": 0.25802319557110337,
"learning_rate": 3.367232792878733e-06,
"loss": 1.0387,
"step": 3460
},
{
"epoch": 1.5177398160315374,
"grad_norm": 0.26395535667353776,
"learning_rate": 3.338672335606501e-06,
"loss": 1.0463,
"step": 3465
},
{
"epoch": 1.5199299167761717,
"grad_norm": 0.2675824852661886,
"learning_rate": 3.3102092264925034e-06,
"loss": 1.0412,
"step": 3470
},
{
"epoch": 1.5221200175208058,
"grad_norm": 0.2635866153828133,
"learning_rate": 3.2818438814945443e-06,
"loss": 1.0436,
"step": 3475
},
{
"epoch": 1.5243101182654402,
"grad_norm": 0.2634172643790584,
"learning_rate": 3.2535767151417196e-06,
"loss": 1.0939,
"step": 3480
},
{
"epoch": 1.5265002190100745,
"grad_norm": 0.26747117493686146,
"learning_rate": 3.2254081405283368e-06,
"loss": 1.0701,
"step": 3485
},
{
"epoch": 1.5286903197547086,
"grad_norm": 0.26083933140484994,
"learning_rate": 3.1973385693079053e-06,
"loss": 1.0594,
"step": 3490
},
{
"epoch": 1.530880420499343,
"grad_norm": 0.2631801552676203,
"learning_rate": 3.1693684116870915e-06,
"loss": 1.059,
"step": 3495
},
{
"epoch": 1.5330705212439772,
"grad_norm": 0.2628999198155687,
"learning_rate": 3.141498076419751e-06,
"loss": 1.066,
"step": 3500
},
{
"epoch": 1.5352606219886114,
"grad_norm": 0.2673654861785837,
"learning_rate": 3.113727970800935e-06,
"loss": 1.0768,
"step": 3505
},
{
"epoch": 1.5374507227332457,
"grad_norm": 0.2670303119820206,
"learning_rate": 3.0860585006609476e-06,
"loss": 1.0506,
"step": 3510
},
{
"epoch": 1.53964082347788,
"grad_norm": 0.2579786497495102,
"learning_rate": 3.0584900703594124e-06,
"loss": 1.0613,
"step": 3515
},
{
"epoch": 1.541830924222514,
"grad_norm": 0.26312720898642156,
"learning_rate": 3.0310230827793698e-06,
"loss": 1.0754,
"step": 3520
},
{
"epoch": 1.5440210249671484,
"grad_norm": 0.27154475514562654,
"learning_rate": 3.0036579393213738e-06,
"loss": 1.0521,
"step": 3525
},
{
"epoch": 1.5462111257117828,
"grad_norm": 0.2660044823931567,
"learning_rate": 2.9763950398976494e-06,
"loss": 1.0717,
"step": 3530
},
{
"epoch": 1.5484012264564169,
"grad_norm": 0.25584411175808575,
"learning_rate": 2.949234782926218e-06,
"loss": 1.0689,
"step": 3535
},
{
"epoch": 1.5505913272010512,
"grad_norm": 0.26504323222247284,
"learning_rate": 2.9221775653251094e-06,
"loss": 1.0561,
"step": 3540
},
{
"epoch": 1.5527814279456855,
"grad_norm": 0.2621451181651719,
"learning_rate": 2.89522378250653e-06,
"loss": 1.076,
"step": 3545
},
{
"epoch": 1.5549715286903196,
"grad_norm": 0.2633161653877227,
"learning_rate": 2.8683738283711007e-06,
"loss": 1.0471,
"step": 3550
},
{
"epoch": 1.557161629434954,
"grad_norm": 0.2607111198003952,
"learning_rate": 2.8416280953021036e-06,
"loss": 1.0451,
"step": 3555
},
{
"epoch": 1.5593517301795883,
"grad_norm": 0.25674063766814414,
"learning_rate": 2.8149869741597323e-06,
"loss": 1.032,
"step": 3560
},
{
"epoch": 1.5615418309242224,
"grad_norm": 0.26291694066848564,
"learning_rate": 2.7884508542754008e-06,
"loss": 1.0594,
"step": 3565
},
{
"epoch": 1.563731931668857,
"grad_norm": 0.26018069572571767,
"learning_rate": 2.7620201234460296e-06,
"loss": 1.0637,
"step": 3570
},
{
"epoch": 1.565922032413491,
"grad_norm": 0.2598568109349574,
"learning_rate": 2.735695167928405e-06,
"loss": 1.0525,
"step": 3575
},
{
"epoch": 1.5681121331581251,
"grad_norm": 0.2698254467493303,
"learning_rate": 2.7094763724335084e-06,
"loss": 1.0754,
"step": 3580
},
{
"epoch": 1.5703022339027597,
"grad_norm": 0.26635873399369575,
"learning_rate": 2.6833641201209083e-06,
"loss": 1.0693,
"step": 3585
},
{
"epoch": 1.5724923346473938,
"grad_norm": 0.26869051331331034,
"learning_rate": 2.6573587925931676e-06,
"loss": 1.0598,
"step": 3590
},
{
"epoch": 1.574682435392028,
"grad_norm": 0.2659077562702536,
"learning_rate": 2.631460769890248e-06,
"loss": 1.0475,
"step": 3595
},
{
"epoch": 1.5768725361366625,
"grad_norm": 0.2621915748487364,
"learning_rate": 2.605670430483975e-06,
"loss": 1.0725,
"step": 3600
},
{
"epoch": 1.5790626368812966,
"grad_norm": 0.25199719840433615,
"learning_rate": 2.579988151272489e-06,
"loss": 1.035,
"step": 3605
},
{
"epoch": 1.5812527376259307,
"grad_norm": 0.2576740526950345,
"learning_rate": 2.5544143075747563e-06,
"loss": 1.0592,
"step": 3610
},
{
"epoch": 1.5834428383705652,
"grad_norm": 0.25887381760680483,
"learning_rate": 2.5289492731250665e-06,
"loss": 1.0541,
"step": 3615
},
{
"epoch": 1.5856329391151993,
"grad_norm": 0.2630343324716044,
"learning_rate": 2.503593420067579e-06,
"loss": 1.0375,
"step": 3620
},
{
"epoch": 1.5878230398598334,
"grad_norm": 0.2532971997224149,
"learning_rate": 2.4783471189508945e-06,
"loss": 1.0594,
"step": 3625
},
{
"epoch": 1.590013140604468,
"grad_norm": 0.25821760865745746,
"learning_rate": 2.4532107387226176e-06,
"loss": 1.059,
"step": 3630
},
{
"epoch": 1.592203241349102,
"grad_norm": 0.26225107109059886,
"learning_rate": 2.42818464672398e-06,
"loss": 1.0518,
"step": 3635
},
{
"epoch": 1.5943933420937362,
"grad_norm": 0.2622443538659936,
"learning_rate": 2.4032692086844755e-06,
"loss": 1.043,
"step": 3640
},
{
"epoch": 1.5965834428383707,
"grad_norm": 0.2638282719915154,
"learning_rate": 2.378464788716498e-06,
"loss": 1.059,
"step": 3645
},
{
"epoch": 1.5987735435830048,
"grad_norm": 0.2697815422805798,
"learning_rate": 2.3537717493100455e-06,
"loss": 1.0818,
"step": 3650
},
{
"epoch": 1.600963644327639,
"grad_norm": 0.2571237597778176,
"learning_rate": 2.3291904513273976e-06,
"loss": 1.0768,
"step": 3655
},
{
"epoch": 1.6031537450722735,
"grad_norm": 0.2591593934526409,
"learning_rate": 2.3047212539978515e-06,
"loss": 1.0314,
"step": 3660
},
{
"epoch": 1.6053438458169076,
"grad_norm": 0.25288794506056167,
"learning_rate": 2.2803645149124853e-06,
"loss": 1.0463,
"step": 3665
},
{
"epoch": 1.6075339465615417,
"grad_norm": 0.26274198683291683,
"learning_rate": 2.2561205900189064e-06,
"loss": 1.0641,
"step": 3670
},
{
"epoch": 1.6097240473061762,
"grad_norm": 0.2585964477343107,
"learning_rate": 2.2319898336160782e-06,
"loss": 1.0406,
"step": 3675
},
{
"epoch": 1.6119141480508103,
"grad_norm": 0.26454251839814774,
"learning_rate": 2.207972598349114e-06,
"loss": 1.0473,
"step": 3680
},
{
"epoch": 1.6141042487954445,
"grad_norm": 0.26167402823368013,
"learning_rate": 2.184069235204149e-06,
"loss": 1.0555,
"step": 3685
},
{
"epoch": 1.616294349540079,
"grad_norm": 0.2617176344796563,
"learning_rate": 2.160280093503193e-06,
"loss": 1.0537,
"step": 3690
},
{
"epoch": 1.618484450284713,
"grad_norm": 0.2557415990045297,
"learning_rate": 2.136605520899029e-06,
"loss": 1.0576,
"step": 3695
},
{
"epoch": 1.6206745510293472,
"grad_norm": 0.26094617139109133,
"learning_rate": 2.113045863370148e-06,
"loss": 1.0738,
"step": 3700
},
{
"epoch": 1.6228646517739818,
"grad_norm": 0.27449534463755304,
"learning_rate": 2.0896014652156673e-06,
"loss": 1.0518,
"step": 3705
},
{
"epoch": 1.6250547525186159,
"grad_norm": 0.26830283731724036,
"learning_rate": 2.0662726690503153e-06,
"loss": 1.0445,
"step": 3710
},
{
"epoch": 1.62724485326325,
"grad_norm": 0.26178239860462027,
"learning_rate": 2.0430598157994263e-06,
"loss": 1.0457,
"step": 3715
},
{
"epoch": 1.6294349540078845,
"grad_norm": 0.26997488755625093,
"learning_rate": 2.0199632446939523e-06,
"loss": 1.0572,
"step": 3720
},
{
"epoch": 1.6316250547525186,
"grad_norm": 0.26067940678020124,
"learning_rate": 1.996983293265502e-06,
"loss": 1.0721,
"step": 3725
},
{
"epoch": 1.6338151554971527,
"grad_norm": 0.26125656461287955,
"learning_rate": 1.9741202973414133e-06,
"loss": 1.0678,
"step": 3730
},
{
"epoch": 1.6360052562417873,
"grad_norm": 0.2640142873058846,
"learning_rate": 1.9513745910398494e-06,
"loss": 1.0588,
"step": 3735
},
{
"epoch": 1.6381953569864214,
"grad_norm": 0.25790596663792437,
"learning_rate": 1.928746506764909e-06,
"loss": 1.05,
"step": 3740
},
{
"epoch": 1.6403854577310555,
"grad_norm": 0.2610429685420584,
"learning_rate": 1.9062363752017666e-06,
"loss": 1.0607,
"step": 3745
},
{
"epoch": 1.64257555847569,
"grad_norm": 0.2625385391766983,
"learning_rate": 1.883844525311851e-06,
"loss": 1.068,
"step": 3750
},
{
"epoch": 1.6447656592203241,
"grad_norm": 0.26068949311696277,
"learning_rate": 1.861571284328032e-06,
"loss": 1.0451,
"step": 3755
},
{
"epoch": 1.6469557599649582,
"grad_norm": 0.26311841661647273,
"learning_rate": 1.8394169777498306e-06,
"loss": 1.0588,
"step": 3760
},
{
"epoch": 1.6491458607095928,
"grad_norm": 0.2618370787264574,
"learning_rate": 1.817381929338673e-06,
"loss": 1.0596,
"step": 3765
},
{
"epoch": 1.651335961454227,
"grad_norm": 0.2624185244357276,
"learning_rate": 1.7954664611131522e-06,
"loss": 1.0578,
"step": 3770
},
{
"epoch": 1.653526062198861,
"grad_norm": 0.2553246881271621,
"learning_rate": 1.7736708933443335e-06,
"loss": 1.0367,
"step": 3775
},
{
"epoch": 1.6557161629434956,
"grad_norm": 0.2540067566235003,
"learning_rate": 1.7519955445510562e-06,
"loss": 1.073,
"step": 3780
},
{
"epoch": 1.6579062636881297,
"grad_norm": 0.2606857530240499,
"learning_rate": 1.7304407314952898e-06,
"loss": 1.0631,
"step": 3785
},
{
"epoch": 1.6600963644327638,
"grad_norm": 0.2613143652543251,
"learning_rate": 1.709006769177508e-06,
"loss": 1.0648,
"step": 3790
},
{
"epoch": 1.6622864651773983,
"grad_norm": 0.2561120275008954,
"learning_rate": 1.6876939708320806e-06,
"loss": 1.0453,
"step": 3795
},
{
"epoch": 1.6644765659220324,
"grad_norm": 0.2606335593974457,
"learning_rate": 1.6665026479226908e-06,
"loss": 1.0299,
"step": 3800
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.27032112797171276,
"learning_rate": 1.6454331101377875e-06,
"loss": 1.0395,
"step": 3805
},
{
"epoch": 1.668856767411301,
"grad_norm": 0.2592791939318623,
"learning_rate": 1.6244856653860696e-06,
"loss": 1.0395,
"step": 3810
},
{
"epoch": 1.6710468681559352,
"grad_norm": 0.2573885200586874,
"learning_rate": 1.6036606197919703e-06,
"loss": 1.0248,
"step": 3815
},
{
"epoch": 1.6732369689005693,
"grad_norm": 0.2632705169188632,
"learning_rate": 1.582958277691189e-06,
"loss": 1.0627,
"step": 3820
},
{
"epoch": 1.6754270696452038,
"grad_norm": 0.2585213551771273,
"learning_rate": 1.5623789416262513e-06,
"loss": 1.0445,
"step": 3825
},
{
"epoch": 1.677617170389838,
"grad_norm": 0.2638970821888065,
"learning_rate": 1.5419229123420799e-06,
"loss": 1.0396,
"step": 3830
},
{
"epoch": 1.679807271134472,
"grad_norm": 0.25489306481434604,
"learning_rate": 1.5215904887815969e-06,
"loss": 1.0545,
"step": 3835
},
{
"epoch": 1.6819973718791066,
"grad_norm": 0.2570139660401704,
"learning_rate": 1.5013819680813602e-06,
"loss": 1.0393,
"step": 3840
},
{
"epoch": 1.6841874726237407,
"grad_norm": 0.26986499526339125,
"learning_rate": 1.4812976455672278e-06,
"loss": 1.0561,
"step": 3845
},
{
"epoch": 1.6863775733683748,
"grad_norm": 0.2539719372535899,
"learning_rate": 1.4613378147500257e-06,
"loss": 1.0738,
"step": 3850
},
{
"epoch": 1.6885676741130093,
"grad_norm": 0.25998054837825935,
"learning_rate": 1.4415027673212712e-06,
"loss": 1.0789,
"step": 3855
},
{
"epoch": 1.6907577748576434,
"grad_norm": 0.2683313676445113,
"learning_rate": 1.4217927931488996e-06,
"loss": 1.0592,
"step": 3860
},
{
"epoch": 1.6929478756022776,
"grad_norm": 0.25855774771508333,
"learning_rate": 1.4022081802730503e-06,
"loss": 1.0514,
"step": 3865
},
{
"epoch": 1.695137976346912,
"grad_norm": 0.2657756452696875,
"learning_rate": 1.3827492149018285e-06,
"loss": 1.073,
"step": 3870
},
{
"epoch": 1.6973280770915462,
"grad_norm": 0.25795267450071285,
"learning_rate": 1.363416181407139e-06,
"loss": 1.0561,
"step": 3875
},
{
"epoch": 1.6995181778361803,
"grad_norm": 0.2604007114404956,
"learning_rate": 1.3442093623205243e-06,
"loss": 1.0592,
"step": 3880
},
{
"epoch": 1.7017082785808149,
"grad_norm": 0.2577655438176319,
"learning_rate": 1.3251290383290493e-06,
"loss": 1.0547,
"step": 3885
},
{
"epoch": 1.703898379325449,
"grad_norm": 0.2594147352663477,
"learning_rate": 1.3061754882711775e-06,
"loss": 1.0514,
"step": 3890
},
{
"epoch": 1.706088480070083,
"grad_norm": 0.26762797623037493,
"learning_rate": 1.2873489891327096e-06,
"loss": 1.0592,
"step": 3895
},
{
"epoch": 1.7082785808147176,
"grad_norm": 0.25919122876440764,
"learning_rate": 1.2686498160427384e-06,
"loss": 1.0568,
"step": 3900
},
{
"epoch": 1.7104686815593517,
"grad_norm": 0.25374064783512074,
"learning_rate": 1.2500782422696211e-06,
"loss": 1.0443,
"step": 3905
},
{
"epoch": 1.7126587823039858,
"grad_norm": 0.2544658335365746,
"learning_rate": 1.231634539216986e-06,
"loss": 1.0439,
"step": 3910
},
{
"epoch": 1.7148488830486204,
"grad_norm": 0.2602599493437291,
"learning_rate": 1.2133189764197661e-06,
"loss": 1.0725,
"step": 3915
},
{
"epoch": 1.7170389837932545,
"grad_norm": 0.2544098863630771,
"learning_rate": 1.1951318215402674e-06,
"loss": 1.0559,
"step": 3920
},
{
"epoch": 1.7192290845378886,
"grad_norm": 0.2666420834689722,
"learning_rate": 1.1770733403642498e-06,
"loss": 1.0688,
"step": 3925
},
{
"epoch": 1.7214191852825231,
"grad_norm": 0.2571596859481218,
"learning_rate": 1.1591437967970399e-06,
"loss": 1.0619,
"step": 3930
},
{
"epoch": 1.7236092860271572,
"grad_norm": 0.25709888915380735,
"learning_rate": 1.1413434528596879e-06,
"loss": 1.059,
"step": 3935
},
{
"epoch": 1.7257993867717913,
"grad_norm": 0.265414809756402,
"learning_rate": 1.1236725686851268e-06,
"loss": 1.092,
"step": 3940
},
{
"epoch": 1.727989487516426,
"grad_norm": 0.261455574018407,
"learning_rate": 1.106131402514372e-06,
"loss": 1.0342,
"step": 3945
},
{
"epoch": 1.73017958826106,
"grad_norm": 0.2546962789705564,
"learning_rate": 1.0887202106927485e-06,
"loss": 1.035,
"step": 3950
},
{
"epoch": 1.732369689005694,
"grad_norm": 0.25295413120526167,
"learning_rate": 1.0714392476661518e-06,
"loss": 1.0451,
"step": 3955
},
{
"epoch": 1.7345597897503287,
"grad_norm": 0.26081673975148323,
"learning_rate": 1.054288765977317e-06,
"loss": 1.0471,
"step": 3960
},
{
"epoch": 1.7367498904949628,
"grad_norm": 0.2622781961728633,
"learning_rate": 1.0372690162621368e-06,
"loss": 1.0703,
"step": 3965
},
{
"epoch": 1.7389399912395969,
"grad_norm": 0.2639733492692751,
"learning_rate": 1.0203802472459934e-06,
"loss": 1.0783,
"step": 3970
},
{
"epoch": 1.7411300919842314,
"grad_norm": 0.26422423738917283,
"learning_rate": 1.003622705740136e-06,
"loss": 1.0549,
"step": 3975
},
{
"epoch": 1.7433201927288655,
"grad_norm": 0.25474550459007383,
"learning_rate": 9.869966366380546e-07,
"loss": 1.0742,
"step": 3980
},
{
"epoch": 1.7455102934734996,
"grad_norm": 0.25885409380951035,
"learning_rate": 9.70502282911915e-07,
"loss": 1.0559,
"step": 3985
},
{
"epoch": 1.7477003942181342,
"grad_norm": 0.26987519340353844,
"learning_rate": 9.54139885609e-07,
"loss": 1.0602,
"step": 3990
},
{
"epoch": 1.7498904949627683,
"grad_norm": 0.2606876106625255,
"learning_rate": 9.379096838481993e-07,
"loss": 1.067,
"step": 3995
},
{
"epoch": 1.7520805957074024,
"grad_norm": 0.2703733043459076,
"learning_rate": 9.218119148165006e-07,
"loss": 1.0918,
"step": 4000
},
{
"epoch": 1.754270696452037,
"grad_norm": 0.2594581284532797,
"learning_rate": 9.058468137655251e-07,
"loss": 1.0652,
"step": 4005
},
{
"epoch": 1.756460797196671,
"grad_norm": 0.25391150674998686,
"learning_rate": 8.900146140081045e-07,
"loss": 1.0443,
"step": 4010
},
{
"epoch": 1.7586508979413051,
"grad_norm": 0.26854247497016676,
"learning_rate": 8.743155469148556e-07,
"loss": 1.0961,
"step": 4015
},
{
"epoch": 1.7608409986859397,
"grad_norm": 0.25915592104435276,
"learning_rate": 8.587498419108009e-07,
"loss": 1.035,
"step": 4020
},
{
"epoch": 1.7630310994305738,
"grad_norm": 0.26010447740581344,
"learning_rate": 8.433177264720205e-07,
"loss": 1.0627,
"step": 4025
},
{
"epoch": 1.765221200175208,
"grad_norm": 0.2610936282674673,
"learning_rate": 8.280194261223318e-07,
"loss": 1.0422,
"step": 4030
},
{
"epoch": 1.7674113009198424,
"grad_norm": 0.25929224377828625,
"learning_rate": 8.128551644299809e-07,
"loss": 1.0715,
"step": 4035
},
{
"epoch": 1.7696014016644765,
"grad_norm": 0.2786652487296327,
"learning_rate": 7.978251630043854e-07,
"loss": 1.0668,
"step": 4040
},
{
"epoch": 1.7717915024091109,
"grad_norm": 0.2625714433234073,
"learning_rate": 7.829296414928944e-07,
"loss": 1.0465,
"step": 4045
},
{
"epoch": 1.7739816031537452,
"grad_norm": 0.2516813690067178,
"learning_rate": 7.681688175775792e-07,
"loss": 1.0225,
"step": 4050
},
{
"epoch": 1.7761717038983793,
"grad_norm": 0.2562341997538078,
"learning_rate": 7.535429069720446e-07,
"loss": 1.0557,
"step": 4055
},
{
"epoch": 1.7783618046430136,
"grad_norm": 0.2660365664245146,
"learning_rate": 7.390521234182835e-07,
"loss": 1.0758,
"step": 4060
},
{
"epoch": 1.780551905387648,
"grad_norm": 0.2611768825441285,
"learning_rate": 7.246966786835563e-07,
"loss": 1.0311,
"step": 4065
},
{
"epoch": 1.782742006132282,
"grad_norm": 0.25377711508428263,
"learning_rate": 7.104767825572878e-07,
"loss": 1.0586,
"step": 4070
},
{
"epoch": 1.7849321068769164,
"grad_norm": 0.25560505594472754,
"learning_rate": 6.96392642848005e-07,
"loss": 1.0762,
"step": 4075
},
{
"epoch": 1.7871222076215507,
"grad_norm": 0.2540736575686416,
"learning_rate": 6.82444465380303e-07,
"loss": 1.0582,
"step": 4080
},
{
"epoch": 1.7893123083661848,
"grad_norm": 0.26520844557122947,
"learning_rate": 6.686324539918343e-07,
"loss": 1.0639,
"step": 4085
},
{
"epoch": 1.7915024091108191,
"grad_norm": 0.2605220753523269,
"learning_rate": 6.549568105303283e-07,
"loss": 1.0344,
"step": 4090
},
{
"epoch": 1.7936925098554535,
"grad_norm": 0.2646533823325038,
"learning_rate": 6.414177348506423e-07,
"loss": 1.0582,
"step": 4095
},
{
"epoch": 1.7958826106000876,
"grad_norm": 0.26517750832452447,
"learning_rate": 6.280154248118475e-07,
"loss": 1.0656,
"step": 4100
},
{
"epoch": 1.798072711344722,
"grad_norm": 0.25435449716739056,
"learning_rate": 6.147500762743263e-07,
"loss": 1.0574,
"step": 4105
},
{
"epoch": 1.8002628120893562,
"grad_norm": 0.2649605554429642,
"learning_rate": 6.0162188309692e-07,
"loss": 1.0723,
"step": 4110
},
{
"epoch": 1.8024529128339903,
"grad_norm": 0.2594557118791885,
"learning_rate": 5.886310371340853e-07,
"loss": 1.0688,
"step": 4115
},
{
"epoch": 1.8046430135786247,
"grad_norm": 0.2547123772715979,
"learning_rate": 5.757777282331034e-07,
"loss": 1.0748,
"step": 4120
},
{
"epoch": 1.806833114323259,
"grad_norm": 0.2549766396239622,
"learning_rate": 5.630621442312978e-07,
"loss": 1.0711,
"step": 4125
},
{
"epoch": 1.809023215067893,
"grad_norm": 0.2552394207988368,
"learning_rate": 5.504844709532864e-07,
"loss": 1.0443,
"step": 4130
},
{
"epoch": 1.8112133158125274,
"grad_norm": 0.2568508170285463,
"learning_rate": 5.380448922082726e-07,
"loss": 1.0686,
"step": 4135
},
{
"epoch": 1.8134034165571618,
"grad_norm": 0.26793785186152136,
"learning_rate": 5.257435897873564e-07,
"loss": 1.0627,
"step": 4140
},
{
"epoch": 1.8155935173017959,
"grad_norm": 0.2556345040237505,
"learning_rate": 5.135807434608764e-07,
"loss": 1.0824,
"step": 4145
},
{
"epoch": 1.8177836180464302,
"grad_norm": 0.2522974784155926,
"learning_rate": 5.015565309757841e-07,
"loss": 1.0561,
"step": 4150
},
{
"epoch": 1.8199737187910645,
"grad_norm": 0.25993666225105416,
"learning_rate": 4.896711280530475e-07,
"loss": 1.0658,
"step": 4155
},
{
"epoch": 1.8221638195356986,
"grad_norm": 0.25711695701360565,
"learning_rate": 4.779247083850814e-07,
"loss": 1.0594,
"step": 4160
},
{
"epoch": 1.824353920280333,
"grad_norm": 0.2582563617934845,
"learning_rate": 4.6631744363320964e-07,
"loss": 1.0373,
"step": 4165
},
{
"epoch": 1.8265440210249673,
"grad_norm": 0.2575689764561888,
"learning_rate": 4.548495034251521e-07,
"loss": 1.0604,
"step": 4170
},
{
"epoch": 1.8287341217696014,
"grad_norm": 0.2599323407971448,
"learning_rate": 4.435210553525571e-07,
"loss": 1.0646,
"step": 4175
},
{
"epoch": 1.8309242225142357,
"grad_norm": 0.2598200040268689,
"learning_rate": 4.323322649685391e-07,
"loss": 1.0629,
"step": 4180
},
{
"epoch": 1.83311432325887,
"grad_norm": 0.2617388138565943,
"learning_rate": 4.2128329578526636e-07,
"loss": 1.065,
"step": 4185
},
{
"epoch": 1.8353044240035041,
"grad_norm": 0.2525471022943904,
"learning_rate": 4.1037430927157507e-07,
"loss": 1.0395,
"step": 4190
},
{
"epoch": 1.8374945247481385,
"grad_norm": 0.2673015600105575,
"learning_rate": 3.996054648505965e-07,
"loss": 1.0555,
"step": 4195
},
{
"epoch": 1.8396846254927728,
"grad_norm": 0.25746822209162834,
"learning_rate": 3.8897691989744467e-07,
"loss": 1.0723,
"step": 4200
},
{
"epoch": 1.841874726237407,
"grad_norm": 0.259721450178856,
"learning_rate": 3.784888297368994e-07,
"loss": 1.0324,
"step": 4205
},
{
"epoch": 1.8440648269820412,
"grad_norm": 0.2557254088083905,
"learning_rate": 3.6814134764114997e-07,
"loss": 1.0424,
"step": 4210
},
{
"epoch": 1.8462549277266755,
"grad_norm": 0.26821986701418704,
"learning_rate": 3.5793462482754613e-07,
"loss": 1.0525,
"step": 4215
},
{
"epoch": 1.8484450284713096,
"grad_norm": 0.26526821999725914,
"learning_rate": 3.478688104563943e-07,
"loss": 1.0332,
"step": 4220
},
{
"epoch": 1.850635129215944,
"grad_norm": 0.2657264529801212,
"learning_rate": 3.379440516287724e-07,
"loss": 1.0424,
"step": 4225
},
{
"epoch": 1.8528252299605783,
"grad_norm": 0.2622695643818941,
"learning_rate": 3.281604933843852e-07,
"loss": 1.067,
"step": 4230
},
{
"epoch": 1.8550153307052124,
"grad_norm": 0.2621090614558729,
"learning_rate": 3.185182786994423e-07,
"loss": 1.0805,
"step": 4235
},
{
"epoch": 1.8572054314498467,
"grad_norm": 0.26019161340403885,
"learning_rate": 3.090175484845681e-07,
"loss": 1.066,
"step": 4240
},
{
"epoch": 1.859395532194481,
"grad_norm": 0.2526519420768719,
"learning_rate": 2.996584415827419e-07,
"loss": 1.0564,
"step": 4245
},
{
"epoch": 1.8615856329391152,
"grad_norm": 0.2623067668598087,
"learning_rate": 2.904410947672731e-07,
"loss": 1.0965,
"step": 4250
},
{
"epoch": 1.8637757336837495,
"grad_norm": 0.2665677286699009,
"learning_rate": 2.8136564273979816e-07,
"loss": 1.0484,
"step": 4255
},
{
"epoch": 1.8659658344283838,
"grad_norm": 0.2570070789968646,
"learning_rate": 2.724322181283112e-07,
"loss": 1.0926,
"step": 4260
},
{
"epoch": 1.868155935173018,
"grad_norm": 0.25969876794186386,
"learning_rate": 2.6364095148523114e-07,
"loss": 1.0557,
"step": 4265
},
{
"epoch": 1.8703460359176522,
"grad_norm": 0.25841316628132993,
"learning_rate": 2.549919712854909e-07,
"loss": 1.0721,
"step": 4270
},
{
"epoch": 1.8725361366622866,
"grad_norm": 0.2622532841527044,
"learning_rate": 2.4648540392465783e-07,
"loss": 1.042,
"step": 4275
},
{
"epoch": 1.8747262374069207,
"grad_norm": 0.262227535263612,
"learning_rate": 2.3812137371708732e-07,
"loss": 1.075,
"step": 4280
},
{
"epoch": 1.876916338151555,
"grad_norm": 0.26352495433091844,
"learning_rate": 2.299000028941112e-07,
"loss": 1.0654,
"step": 4285
},
{
"epoch": 1.8791064388961893,
"grad_norm": 0.26358888945101755,
"learning_rate": 2.2182141160224325e-07,
"loss": 1.035,
"step": 4290
},
{
"epoch": 1.8812965396408234,
"grad_norm": 0.2653279390073888,
"learning_rate": 2.1388571790142865e-07,
"loss": 1.0668,
"step": 4295
},
{
"epoch": 1.8834866403854578,
"grad_norm": 0.257677335409702,
"learning_rate": 2.0609303776332078e-07,
"loss": 1.0529,
"step": 4300
},
{
"epoch": 1.885676741130092,
"grad_norm": 0.26428455446729876,
"learning_rate": 1.9844348506957824e-07,
"loss": 1.0467,
"step": 4305
},
{
"epoch": 1.8878668418747262,
"grad_norm": 0.26294409986685496,
"learning_rate": 1.909371716102093e-07,
"loss": 1.0568,
"step": 4310
},
{
"epoch": 1.8900569426193605,
"grad_norm": 0.26329149601630425,
"learning_rate": 1.835742070819335e-07,
"loss": 1.0672,
"step": 4315
},
{
"epoch": 1.8922470433639949,
"grad_norm": 0.26651307963750537,
"learning_rate": 1.7635469908657832e-07,
"loss": 1.0617,
"step": 4320
},
{
"epoch": 1.894437144108629,
"grad_norm": 0.2663947225115879,
"learning_rate": 1.6927875312950927e-07,
"loss": 1.0506,
"step": 4325
},
{
"epoch": 1.8966272448532633,
"grad_norm": 0.2534729439266378,
"learning_rate": 1.6234647261808678e-07,
"loss": 1.0457,
"step": 4330
},
{
"epoch": 1.8988173455978976,
"grad_norm": 0.25515531687003673,
"learning_rate": 1.555579588601519e-07,
"loss": 1.0537,
"step": 4335
},
{
"epoch": 1.9010074463425317,
"grad_norm": 0.2598580190509047,
"learning_rate": 1.489133110625529e-07,
"loss": 1.0549,
"step": 4340
},
{
"epoch": 1.903197547087166,
"grad_norm": 0.25780277018916953,
"learning_rate": 1.42412626329691e-07,
"loss": 1.0813,
"step": 4345
},
{
"epoch": 1.9053876478318004,
"grad_norm": 0.25921414992499353,
"learning_rate": 1.3605599966209803e-07,
"loss": 1.0811,
"step": 4350
},
{
"epoch": 1.9075777485764345,
"grad_norm": 0.2632565173184216,
"learning_rate": 1.298435239550544e-07,
"loss": 1.0566,
"step": 4355
},
{
"epoch": 1.9097678493210688,
"grad_norm": 0.25942353454065054,
"learning_rate": 1.2377528999723e-07,
"loss": 1.0555,
"step": 4360
},
{
"epoch": 1.9119579500657031,
"grad_norm": 0.25904309238444745,
"learning_rate": 1.1785138646935313e-07,
"loss": 1.0555,
"step": 4365
},
{
"epoch": 1.9141480508103372,
"grad_norm": 0.2592095253843778,
"learning_rate": 1.1207189994291934e-07,
"loss": 1.0545,
"step": 4370
},
{
"epoch": 1.9163381515549716,
"grad_norm": 0.2575477136605174,
"learning_rate": 1.0643691487892572e-07,
"loss": 1.0414,
"step": 4375
},
{
"epoch": 1.9185282522996059,
"grad_norm": 0.256629839507532,
"learning_rate": 1.0094651362663299e-07,
"loss": 1.0807,
"step": 4380
},
{
"epoch": 1.92071835304424,
"grad_norm": 0.2631594825530231,
"learning_rate": 9.560077642236765e-08,
"loss": 1.0635,
"step": 4385
},
{
"epoch": 1.9229084537888743,
"grad_norm": 0.25757118647856275,
"learning_rate": 9.039978138834282e-08,
"loss": 1.0559,
"step": 4390
},
{
"epoch": 1.9250985545335086,
"grad_norm": 0.25635783319220967,
"learning_rate": 8.534360453152369e-08,
"loss": 1.0736,
"step": 4395
},
{
"epoch": 1.9272886552781427,
"grad_norm": 0.26497079430057574,
"learning_rate": 8.043231974250942e-08,
"loss": 1.0393,
"step": 4400
},
{
"epoch": 1.929478756022777,
"grad_norm": 0.2574631334108864,
"learning_rate": 7.566599879445968e-08,
"loss": 1.0646,
"step": 4405
},
{
"epoch": 1.9316688567674114,
"grad_norm": 0.25966994233199864,
"learning_rate": 7.104471134204205e-08,
"loss": 1.0629,
"step": 4410
},
{
"epoch": 1.9338589575120455,
"grad_norm": 0.259903869118688,
"learning_rate": 6.656852492041621e-08,
"loss": 1.0799,
"step": 4415
},
{
"epoch": 1.9360490582566798,
"grad_norm": 0.26803973886426774,
"learning_rate": 6.223750494424363e-08,
"loss": 1.0887,
"step": 4420
},
{
"epoch": 1.9382391590013142,
"grad_norm": 0.254641347090638,
"learning_rate": 5.8051714706737203e-08,
"loss": 1.048,
"step": 4425
},
{
"epoch": 1.9404292597459483,
"grad_norm": 0.2561371140140117,
"learning_rate": 5.401121537872978e-08,
"loss": 1.0555,
"step": 4430
},
{
"epoch": 1.9426193604905826,
"grad_norm": 0.25821846272770893,
"learning_rate": 5.011606600778596e-08,
"loss": 1.0459,
"step": 4435
},
{
"epoch": 1.944809461235217,
"grad_norm": 0.2713573159900797,
"learning_rate": 4.636632351733394e-08,
"loss": 1.0768,
"step": 4440
},
{
"epoch": 1.946999561979851,
"grad_norm": 0.24621746440958298,
"learning_rate": 4.27620427058395e-08,
"loss": 1.0227,
"step": 4445
},
{
"epoch": 1.9491896627244854,
"grad_norm": 0.26617144432411904,
"learning_rate": 3.930327624599994e-08,
"loss": 1.0582,
"step": 4450
},
{
"epoch": 1.9513797634691197,
"grad_norm": 0.2588098327195591,
"learning_rate": 3.599007468397586e-08,
"loss": 1.0408,
"step": 4455
},
{
"epoch": 1.9535698642137538,
"grad_norm": 0.2588717220296684,
"learning_rate": 3.282248643865504e-08,
"loss": 1.0371,
"step": 4460
},
{
"epoch": 1.955759964958388,
"grad_norm": 0.2612067459198219,
"learning_rate": 2.9800557800941932e-08,
"loss": 1.06,
"step": 4465
},
{
"epoch": 1.9579500657030224,
"grad_norm": 0.26040948454378465,
"learning_rate": 2.69243329330815e-08,
"loss": 1.0809,
"step": 4470
},
{
"epoch": 1.9601401664476565,
"grad_norm": 0.2587545182578282,
"learning_rate": 2.4193853868014206e-08,
"loss": 1.0627,
"step": 4475
},
{
"epoch": 1.9623302671922909,
"grad_norm": 0.2601426689640191,
"learning_rate": 2.160916050876427e-08,
"loss": 1.0479,
"step": 4480
},
{
"epoch": 1.9645203679369252,
"grad_norm": 0.27136109068065567,
"learning_rate": 1.9170290627851253e-08,
"loss": 1.0672,
"step": 4485
},
{
"epoch": 1.9667104686815593,
"grad_norm": 0.25835360666843193,
"learning_rate": 1.68772798667427e-08,
"loss": 1.0355,
"step": 4490
},
{
"epoch": 1.9689005694261936,
"grad_norm": 0.25394949222511376,
"learning_rate": 1.4730161735331262e-08,
"loss": 1.0496,
"step": 4495
},
{
"epoch": 1.971090670170828,
"grad_norm": 0.25972727406913415,
"learning_rate": 1.2728967611445042e-08,
"loss": 1.0676,
"step": 4500
},
{
"epoch": 1.973280770915462,
"grad_norm": 0.28013541605837033,
"learning_rate": 1.0873726740390223e-08,
"loss": 1.0594,
"step": 4505
},
{
"epoch": 1.9754708716600964,
"grad_norm": 0.2582001095059419,
"learning_rate": 9.164466234521385e-09,
"loss": 1.0531,
"step": 4510
},
{
"epoch": 1.9776609724047307,
"grad_norm": 0.2637625227746746,
"learning_rate": 7.601211072846282e-09,
"loss": 1.0441,
"step": 4515
},
{
"epoch": 1.9798510731493648,
"grad_norm": 0.2656010194757306,
"learning_rate": 6.183984100663898e-09,
"loss": 1.0732,
"step": 4520
},
{
"epoch": 1.9820411738939991,
"grad_norm": 0.2557112398087727,
"learning_rate": 4.912806029225836e-09,
"loss": 1.049,
"step": 4525
},
{
"epoch": 1.9842312746386335,
"grad_norm": 0.2640281308017009,
"learning_rate": 3.787695435434336e-09,
"loss": 1.0367,
"step": 4530
},
{
"epoch": 1.9864213753832676,
"grad_norm": 0.2550981215142995,
"learning_rate": 2.808668761576927e-09,
"loss": 1.0549,
"step": 4535
},
{
"epoch": 1.988611476127902,
"grad_norm": 0.2611486955899783,
"learning_rate": 1.975740315075525e-09,
"loss": 1.0496,
"step": 4540
},
{
"epoch": 1.9908015768725362,
"grad_norm": 0.26343127951064566,
"learning_rate": 1.2889222682865854e-09,
"loss": 1.048,
"step": 4545
},
{
"epoch": 1.9929916776171703,
"grad_norm": 0.2617943886231076,
"learning_rate": 7.482246583201402e-10,
"loss": 1.0607,
"step": 4550
},
{
"epoch": 1.9951817783618047,
"grad_norm": 0.267332987436094,
"learning_rate": 3.5365538689102754e-10,
"loss": 1.0668,
"step": 4555
},
{
"epoch": 1.997371879106439,
"grad_norm": 0.25204221832833196,
"learning_rate": 1.0522022020564848e-10,
"loss": 1.0562,
"step": 4560
},
{
"epoch": 1.999561979851073,
"grad_norm": 0.25875087028369437,
"learning_rate": 2.9227888775906764e-12,
"loss": 1.0584,
"step": 4565
},
{
"epoch": 2.0,
"step": 4566,
"total_flos": 432050892963840.0,
"train_loss": 1.1768484963589576,
"train_runtime": 15595.6601,
"train_samples_per_second": 18.73,
"train_steps_per_second": 0.293
}
],
"logging_steps": 5,
"max_steps": 4566,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 432050892963840.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}