distilroberta-base-edu-classifier / trainer_state.json
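The remainder of this page is the raw trainer_state.json written by the transformers Trainer: "log_history" holds one record per logging step (epoch, grad_norm, learning_rate, loss, step) plus an evaluation record every eval_steps=100 steps (eval_loss, eval_mse, runtime stats). Below is a minimal sketch, assuming a complete local copy of this file and matplotlib installed (neither is part of the original page), of how the train and eval loss curves could be pulled out of it:

# Sketch only: parse trainer_state.json and plot logged losses.
import json
import matplotlib.pyplot as plt  # assumed available

with open("trainer_state.json") as f:          # hypothetical local path
    state = json.load(f)

train_steps, train_loss = [], []
eval_steps, eval_loss = [], []
for entry in state["log_history"]:
    if "loss" in entry:                         # training log (every 5 steps in this run)
        train_steps.append(entry["step"])
        train_loss.append(entry["loss"])
    elif "eval_loss" in entry:                  # eval log (every 100 steps in this run)
        eval_steps.append(entry["step"])
        eval_loss.append(entry["eval_loss"])

plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, label="eval loss (MSE)")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.show()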
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999279383151978,
"eval_steps": 100,
"global_step": 3469,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014412336960438136,
"grad_norm": 22.562244415283203,
"learning_rate": 2.8735632183908047e-07,
"loss": 2.0425,
"step": 5
},
{
"epoch": 0.0028824673920876272,
"grad_norm": 21.29774284362793,
"learning_rate": 5.747126436781609e-07,
"loss": 1.8066,
"step": 10
},
{
"epoch": 0.004323701088131441,
"grad_norm": 21.9217586517334,
"learning_rate": 8.620689655172415e-07,
"loss": 1.8381,
"step": 15
},
{
"epoch": 0.0057649347841752544,
"grad_norm": 21.61351776123047,
"learning_rate": 1.1494252873563219e-06,
"loss": 2.0628,
"step": 20
},
{
"epoch": 0.007206168480219067,
"grad_norm": 19.952226638793945,
"learning_rate": 1.4367816091954023e-06,
"loss": 1.9157,
"step": 25
},
{
"epoch": 0.008647402176262881,
"grad_norm": 21.637529373168945,
"learning_rate": 1.724137931034483e-06,
"loss": 1.7546,
"step": 30
},
{
"epoch": 0.010088635872306694,
"grad_norm": 17.36451530456543,
"learning_rate": 2.0114942528735633e-06,
"loss": 1.5482,
"step": 35
},
{
"epoch": 0.011529869568350509,
"grad_norm": 19.64199447631836,
"learning_rate": 2.2988505747126437e-06,
"loss": 1.4528,
"step": 40
},
{
"epoch": 0.012971103264394322,
"grad_norm": 19.360851287841797,
"learning_rate": 2.5862068965517246e-06,
"loss": 1.5399,
"step": 45
},
{
"epoch": 0.014412336960438135,
"grad_norm": 20.030967712402344,
"learning_rate": 2.8735632183908046e-06,
"loss": 1.607,
"step": 50
},
{
"epoch": 0.01585357065648195,
"grad_norm": 20.46259117126465,
"learning_rate": 3.1609195402298854e-06,
"loss": 1.2416,
"step": 55
},
{
"epoch": 0.017294804352525762,
"grad_norm": 9.801645278930664,
"learning_rate": 3.448275862068966e-06,
"loss": 1.0737,
"step": 60
},
{
"epoch": 0.018736038048569575,
"grad_norm": 12.106565475463867,
"learning_rate": 3.7356321839080462e-06,
"loss": 0.949,
"step": 65
},
{
"epoch": 0.02017727174461339,
"grad_norm": 6.313957214355469,
"learning_rate": 4.022988505747127e-06,
"loss": 0.7346,
"step": 70
},
{
"epoch": 0.0216185054406572,
"grad_norm": 10.853752136230469,
"learning_rate": 4.310344827586207e-06,
"loss": 0.721,
"step": 75
},
{
"epoch": 0.023059739136701018,
"grad_norm": 3.9490792751312256,
"learning_rate": 4.5977011494252875e-06,
"loss": 0.6074,
"step": 80
},
{
"epoch": 0.02450097283274483,
"grad_norm": 1.7576261758804321,
"learning_rate": 4.885057471264369e-06,
"loss": 0.5638,
"step": 85
},
{
"epoch": 0.025942206528788644,
"grad_norm": 2.3737521171569824,
"learning_rate": 5.172413793103449e-06,
"loss": 0.7098,
"step": 90
},
{
"epoch": 0.027383440224832457,
"grad_norm": 2.490168333053589,
"learning_rate": 5.45977011494253e-06,
"loss": 0.4592,
"step": 95
},
{
"epoch": 0.02882467392087627,
"grad_norm": 3.4372336864471436,
"learning_rate": 5.747126436781609e-06,
"loss": 0.5276,
"step": 100
},
{
"epoch": 0.02882467392087627,
"eval_loss": 0.5012353658676147,
"eval_mse": 0.5012353515625,
"eval_runtime": 3.7775,
"eval_samples_per_second": 264.724,
"eval_steps_per_second": 16.678,
"step": 100
},
{
"epoch": 0.030265907616920083,
"grad_norm": 6.628878116607666,
"learning_rate": 6.03448275862069e-06,
"loss": 0.6036,
"step": 105
},
{
"epoch": 0.0317071413129639,
"grad_norm": 5.881587505340576,
"learning_rate": 6.321839080459771e-06,
"loss": 0.4636,
"step": 110
},
{
"epoch": 0.03314837500900771,
"grad_norm": 3.570418119430542,
"learning_rate": 6.609195402298851e-06,
"loss": 0.4917,
"step": 115
},
{
"epoch": 0.034589608705051525,
"grad_norm": 4.549156665802002,
"learning_rate": 6.896551724137932e-06,
"loss": 0.4228,
"step": 120
},
{
"epoch": 0.03603084240109534,
"grad_norm": 6.016390800476074,
"learning_rate": 7.183908045977011e-06,
"loss": 0.3889,
"step": 125
},
{
"epoch": 0.03747207609713915,
"grad_norm": 3.7417054176330566,
"learning_rate": 7.4712643678160925e-06,
"loss": 0.3697,
"step": 130
},
{
"epoch": 0.038913309793182964,
"grad_norm": 2.5890052318573,
"learning_rate": 7.758620689655173e-06,
"loss": 0.3944,
"step": 135
},
{
"epoch": 0.04035454348922678,
"grad_norm": 7.596370220184326,
"learning_rate": 8.045977011494253e-06,
"loss": 0.3378,
"step": 140
},
{
"epoch": 0.04179577718527059,
"grad_norm": 2.7181951999664307,
"learning_rate": 8.333333333333334e-06,
"loss": 0.3575,
"step": 145
},
{
"epoch": 0.0432370108813144,
"grad_norm": 3.7169361114501953,
"learning_rate": 8.620689655172414e-06,
"loss": 0.339,
"step": 150
},
{
"epoch": 0.044678244577358216,
"grad_norm": 7.888335227966309,
"learning_rate": 8.908045977011495e-06,
"loss": 0.35,
"step": 155
},
{
"epoch": 0.046119478273402036,
"grad_norm": 12.077622413635254,
"learning_rate": 9.195402298850575e-06,
"loss": 0.3382,
"step": 160
},
{
"epoch": 0.04756071196944585,
"grad_norm": 16.399019241333008,
"learning_rate": 9.482758620689655e-06,
"loss": 0.34,
"step": 165
},
{
"epoch": 0.04900194566548966,
"grad_norm": 7.357732772827148,
"learning_rate": 9.770114942528738e-06,
"loss": 0.331,
"step": 170
},
{
"epoch": 0.050443179361533474,
"grad_norm": 10.48930549621582,
"learning_rate": 9.996965098634295e-06,
"loss": 0.3058,
"step": 175
},
{
"epoch": 0.05188441305757729,
"grad_norm": 8.28653621673584,
"learning_rate": 9.981790591805767e-06,
"loss": 0.3416,
"step": 180
},
{
"epoch": 0.0533256467536211,
"grad_norm": 3.4774363040924072,
"learning_rate": 9.966616084977238e-06,
"loss": 0.2923,
"step": 185
},
{
"epoch": 0.05476688044966491,
"grad_norm": 17.32866096496582,
"learning_rate": 9.951441578148711e-06,
"loss": 0.3346,
"step": 190
},
{
"epoch": 0.056208114145708726,
"grad_norm": 6.1874003410339355,
"learning_rate": 9.936267071320182e-06,
"loss": 0.3512,
"step": 195
},
{
"epoch": 0.05764934784175254,
"grad_norm": 7.11575984954834,
"learning_rate": 9.921092564491654e-06,
"loss": 0.3307,
"step": 200
},
{
"epoch": 0.05764934784175254,
"eval_loss": 0.3466827869415283,
"eval_mse": 0.3466828079223633,
"eval_runtime": 3.6079,
"eval_samples_per_second": 277.17,
"eval_steps_per_second": 17.462,
"step": 200
},
{
"epoch": 0.05909058153779635,
"grad_norm": 9.548958778381348,
"learning_rate": 9.905918057663127e-06,
"loss": 0.2987,
"step": 205
},
{
"epoch": 0.060531815233840165,
"grad_norm": 5.386926651000977,
"learning_rate": 9.890743550834598e-06,
"loss": 0.3275,
"step": 210
},
{
"epoch": 0.06197304892988398,
"grad_norm": 3.712883710861206,
"learning_rate": 9.87556904400607e-06,
"loss": 0.2812,
"step": 215
},
{
"epoch": 0.0634142826259278,
"grad_norm": 3.6099259853363037,
"learning_rate": 9.860394537177543e-06,
"loss": 0.2938,
"step": 220
},
{
"epoch": 0.0648555163219716,
"grad_norm": 4.330367088317871,
"learning_rate": 9.845220030349014e-06,
"loss": 0.3534,
"step": 225
},
{
"epoch": 0.06629675001801542,
"grad_norm": 3.1101043224334717,
"learning_rate": 9.830045523520486e-06,
"loss": 0.2815,
"step": 230
},
{
"epoch": 0.06773798371405923,
"grad_norm": 2.8998003005981445,
"learning_rate": 9.814871016691959e-06,
"loss": 0.3151,
"step": 235
},
{
"epoch": 0.06917921741010305,
"grad_norm": 3.0117270946502686,
"learning_rate": 9.79969650986343e-06,
"loss": 0.2948,
"step": 240
},
{
"epoch": 0.07062045110614686,
"grad_norm": 3.8122599124908447,
"learning_rate": 9.784522003034902e-06,
"loss": 0.303,
"step": 245
},
{
"epoch": 0.07206168480219068,
"grad_norm": 9.389573097229004,
"learning_rate": 9.769347496206375e-06,
"loss": 0.268,
"step": 250
},
{
"epoch": 0.07350291849823448,
"grad_norm": 4.838011264801025,
"learning_rate": 9.754172989377846e-06,
"loss": 0.2573,
"step": 255
},
{
"epoch": 0.0749441521942783,
"grad_norm": 3.3773627281188965,
"learning_rate": 9.738998482549317e-06,
"loss": 0.3056,
"step": 260
},
{
"epoch": 0.07638538589032212,
"grad_norm": 8.778388977050781,
"learning_rate": 9.72382397572079e-06,
"loss": 0.3394,
"step": 265
},
{
"epoch": 0.07782661958636593,
"grad_norm": 9.080647468566895,
"learning_rate": 9.708649468892262e-06,
"loss": 0.3213,
"step": 270
},
{
"epoch": 0.07926785328240975,
"grad_norm": 5.629511833190918,
"learning_rate": 9.693474962063733e-06,
"loss": 0.3378,
"step": 275
},
{
"epoch": 0.08070908697845355,
"grad_norm": 5.277593612670898,
"learning_rate": 9.678300455235205e-06,
"loss": 0.2955,
"step": 280
},
{
"epoch": 0.08215032067449737,
"grad_norm": 12.505846977233887,
"learning_rate": 9.663125948406678e-06,
"loss": 0.3378,
"step": 285
},
{
"epoch": 0.08359155437054118,
"grad_norm": 20.066940307617188,
"learning_rate": 9.64795144157815e-06,
"loss": 0.3333,
"step": 290
},
{
"epoch": 0.085032788066585,
"grad_norm": 9.962249755859375,
"learning_rate": 9.63277693474962e-06,
"loss": 0.3223,
"step": 295
},
{
"epoch": 0.0864740217626288,
"grad_norm": 8.705531120300293,
"learning_rate": 9.617602427921094e-06,
"loss": 0.2994,
"step": 300
},
{
"epoch": 0.0864740217626288,
"eval_loss": 0.29482540488243103,
"eval_mse": 0.29482541751861574,
"eval_runtime": 3.5313,
"eval_samples_per_second": 283.184,
"eval_steps_per_second": 17.841,
"step": 300
},
{
"epoch": 0.08791525545867263,
"grad_norm": 3.253650426864624,
"learning_rate": 9.602427921092565e-06,
"loss": 0.2902,
"step": 305
},
{
"epoch": 0.08935648915471643,
"grad_norm": 2.954401969909668,
"learning_rate": 9.587253414264037e-06,
"loss": 0.2631,
"step": 310
},
{
"epoch": 0.09079772285076025,
"grad_norm": 5.84781551361084,
"learning_rate": 9.57207890743551e-06,
"loss": 0.3064,
"step": 315
},
{
"epoch": 0.09223895654680407,
"grad_norm": 6.994048595428467,
"learning_rate": 9.556904400606981e-06,
"loss": 0.2599,
"step": 320
},
{
"epoch": 0.09368019024284788,
"grad_norm": 2.8386964797973633,
"learning_rate": 9.541729893778452e-06,
"loss": 0.2409,
"step": 325
},
{
"epoch": 0.0951214239388917,
"grad_norm": 8.831405639648438,
"learning_rate": 9.526555386949926e-06,
"loss": 0.2823,
"step": 330
},
{
"epoch": 0.0965626576349355,
"grad_norm": 5.2568840980529785,
"learning_rate": 9.511380880121397e-06,
"loss": 0.2995,
"step": 335
},
{
"epoch": 0.09800389133097932,
"grad_norm": 8.514307022094727,
"learning_rate": 9.496206373292868e-06,
"loss": 0.2998,
"step": 340
},
{
"epoch": 0.09944512502702313,
"grad_norm": 7.257457256317139,
"learning_rate": 9.481031866464341e-06,
"loss": 0.2657,
"step": 345
},
{
"epoch": 0.10088635872306695,
"grad_norm": 16.146055221557617,
"learning_rate": 9.465857359635813e-06,
"loss": 0.3007,
"step": 350
},
{
"epoch": 0.10232759241911075,
"grad_norm": 9.077672958374023,
"learning_rate": 9.450682852807284e-06,
"loss": 0.2855,
"step": 355
},
{
"epoch": 0.10376882611515457,
"grad_norm": 2.436467170715332,
"learning_rate": 9.435508345978757e-06,
"loss": 0.2809,
"step": 360
},
{
"epoch": 0.10521005981119838,
"grad_norm": 3.1497037410736084,
"learning_rate": 9.420333839150229e-06,
"loss": 0.2773,
"step": 365
},
{
"epoch": 0.1066512935072422,
"grad_norm": 11.72987174987793,
"learning_rate": 9.4051593323217e-06,
"loss": 0.2475,
"step": 370
},
{
"epoch": 0.108092527203286,
"grad_norm": 7.571060657501221,
"learning_rate": 9.389984825493173e-06,
"loss": 0.3025,
"step": 375
},
{
"epoch": 0.10953376089932983,
"grad_norm": 4.485091686248779,
"learning_rate": 9.374810318664645e-06,
"loss": 0.2742,
"step": 380
},
{
"epoch": 0.11097499459537365,
"grad_norm": 9.997501373291016,
"learning_rate": 9.359635811836116e-06,
"loss": 0.2845,
"step": 385
},
{
"epoch": 0.11241622829141745,
"grad_norm": 8.28499984741211,
"learning_rate": 9.344461305007587e-06,
"loss": 0.2786,
"step": 390
},
{
"epoch": 0.11385746198746127,
"grad_norm": 7.368741512298584,
"learning_rate": 9.32928679817906e-06,
"loss": 0.2712,
"step": 395
},
{
"epoch": 0.11529869568350508,
"grad_norm": 2.7350659370422363,
"learning_rate": 9.314112291350532e-06,
"loss": 0.2813,
"step": 400
},
{
"epoch": 0.11529869568350508,
"eval_loss": 0.27986839413642883,
"eval_mse": 0.27986840057373047,
"eval_runtime": 3.5237,
"eval_samples_per_second": 283.794,
"eval_steps_per_second": 17.879,
"step": 400
},
{
"epoch": 0.1167399293795489,
"grad_norm": 6.319039344787598,
"learning_rate": 9.298937784522003e-06,
"loss": 0.2822,
"step": 405
},
{
"epoch": 0.1181811630755927,
"grad_norm": 3.2821357250213623,
"learning_rate": 9.283763277693477e-06,
"loss": 0.2529,
"step": 410
},
{
"epoch": 0.11962239677163652,
"grad_norm": 10.913554191589355,
"learning_rate": 9.268588770864948e-06,
"loss": 0.2906,
"step": 415
},
{
"epoch": 0.12106363046768033,
"grad_norm": 2.793478488922119,
"learning_rate": 9.25341426403642e-06,
"loss": 0.2947,
"step": 420
},
{
"epoch": 0.12250486416372415,
"grad_norm": 7.466792583465576,
"learning_rate": 9.238239757207892e-06,
"loss": 0.3253,
"step": 425
},
{
"epoch": 0.12394609785976796,
"grad_norm": 3.626875877380371,
"learning_rate": 9.223065250379364e-06,
"loss": 0.2886,
"step": 430
},
{
"epoch": 0.12538733155581178,
"grad_norm": 3.203030586242676,
"learning_rate": 9.207890743550835e-06,
"loss": 0.2855,
"step": 435
},
{
"epoch": 0.1268285652518556,
"grad_norm": 5.602776050567627,
"learning_rate": 9.192716236722308e-06,
"loss": 0.2634,
"step": 440
},
{
"epoch": 0.1282697989478994,
"grad_norm": 4.7141804695129395,
"learning_rate": 9.17754172989378e-06,
"loss": 0.2729,
"step": 445
},
{
"epoch": 0.1297110326439432,
"grad_norm": 3.62369704246521,
"learning_rate": 9.162367223065251e-06,
"loss": 0.2595,
"step": 450
},
{
"epoch": 0.13115226633998703,
"grad_norm": 7.390626907348633,
"learning_rate": 9.147192716236724e-06,
"loss": 0.2729,
"step": 455
},
{
"epoch": 0.13259350003603085,
"grad_norm": 5.839654445648193,
"learning_rate": 9.132018209408196e-06,
"loss": 0.3046,
"step": 460
},
{
"epoch": 0.13403473373207467,
"grad_norm": 5.694056987762451,
"learning_rate": 9.116843702579667e-06,
"loss": 0.315,
"step": 465
},
{
"epoch": 0.13547596742811846,
"grad_norm": 5.082228660583496,
"learning_rate": 9.10166919575114e-06,
"loss": 0.2727,
"step": 470
},
{
"epoch": 0.13691720112416228,
"grad_norm": 16.798309326171875,
"learning_rate": 9.08649468892261e-06,
"loss": 0.2771,
"step": 475
},
{
"epoch": 0.1383584348202061,
"grad_norm": 6.507099151611328,
"learning_rate": 9.071320182094083e-06,
"loss": 0.2788,
"step": 480
},
{
"epoch": 0.13979966851624992,
"grad_norm": 6.645677089691162,
"learning_rate": 9.056145675265554e-06,
"loss": 0.2655,
"step": 485
},
{
"epoch": 0.1412409022122937,
"grad_norm": 8.834277153015137,
"learning_rate": 9.040971168437026e-06,
"loss": 0.306,
"step": 490
},
{
"epoch": 0.14268213590833753,
"grad_norm": 7.980821132659912,
"learning_rate": 9.025796661608497e-06,
"loss": 0.2498,
"step": 495
},
{
"epoch": 0.14412336960438135,
"grad_norm": 6.141432285308838,
"learning_rate": 9.01062215477997e-06,
"loss": 0.2707,
"step": 500
},
{
"epoch": 0.14412336960438135,
"eval_loss": 0.3016921281814575,
"eval_mse": 0.30169212198257445,
"eval_runtime": 3.5058,
"eval_samples_per_second": 285.241,
"eval_steps_per_second": 17.97,
"step": 500
},
{
"epoch": 0.14556460330042517,
"grad_norm": 3.130420446395874,
"learning_rate": 8.995447647951442e-06,
"loss": 0.2848,
"step": 505
},
{
"epoch": 0.14700583699646896,
"grad_norm": 3.3286259174346924,
"learning_rate": 8.980273141122913e-06,
"loss": 0.2534,
"step": 510
},
{
"epoch": 0.14844707069251278,
"grad_norm": 4.090871334075928,
"learning_rate": 8.965098634294386e-06,
"loss": 0.2423,
"step": 515
},
{
"epoch": 0.1498883043885566,
"grad_norm": 3.3191046714782715,
"learning_rate": 8.949924127465858e-06,
"loss": 0.2641,
"step": 520
},
{
"epoch": 0.15132953808460042,
"grad_norm": 4.689390182495117,
"learning_rate": 8.934749620637329e-06,
"loss": 0.2966,
"step": 525
},
{
"epoch": 0.15277077178064424,
"grad_norm": 12.92320728302002,
"learning_rate": 8.919575113808802e-06,
"loss": 0.2754,
"step": 530
},
{
"epoch": 0.15421200547668804,
"grad_norm": 6.917271137237549,
"learning_rate": 8.904400606980273e-06,
"loss": 0.2606,
"step": 535
},
{
"epoch": 0.15565323917273186,
"grad_norm": 11.144198417663574,
"learning_rate": 8.889226100151745e-06,
"loss": 0.2724,
"step": 540
},
{
"epoch": 0.15709447286877568,
"grad_norm": 8.833760261535645,
"learning_rate": 8.874051593323218e-06,
"loss": 0.2824,
"step": 545
},
{
"epoch": 0.1585357065648195,
"grad_norm": 10.618779182434082,
"learning_rate": 8.85887708649469e-06,
"loss": 0.2735,
"step": 550
},
{
"epoch": 0.1599769402608633,
"grad_norm": 6.135279655456543,
"learning_rate": 8.84370257966616e-06,
"loss": 0.2704,
"step": 555
},
{
"epoch": 0.1614181739569071,
"grad_norm": 9.724879264831543,
"learning_rate": 8.828528072837634e-06,
"loss": 0.3164,
"step": 560
},
{
"epoch": 0.16285940765295093,
"grad_norm": 19.34429168701172,
"learning_rate": 8.813353566009105e-06,
"loss": 0.3358,
"step": 565
},
{
"epoch": 0.16430064134899475,
"grad_norm": 3.7202112674713135,
"learning_rate": 8.798179059180577e-06,
"loss": 0.2559,
"step": 570
},
{
"epoch": 0.16574187504503854,
"grad_norm": 14.137849807739258,
"learning_rate": 8.78300455235205e-06,
"loss": 0.3265,
"step": 575
},
{
"epoch": 0.16718310874108236,
"grad_norm": 13.450931549072266,
"learning_rate": 8.767830045523521e-06,
"loss": 0.2847,
"step": 580
},
{
"epoch": 0.16862434243712618,
"grad_norm": 11.542911529541016,
"learning_rate": 8.752655538694993e-06,
"loss": 0.2993,
"step": 585
},
{
"epoch": 0.17006557613317,
"grad_norm": 2.6248021125793457,
"learning_rate": 8.737481031866466e-06,
"loss": 0.2237,
"step": 590
},
{
"epoch": 0.17150680982921382,
"grad_norm": 2.968738079071045,
"learning_rate": 8.722306525037937e-06,
"loss": 0.2788,
"step": 595
},
{
"epoch": 0.1729480435252576,
"grad_norm": 3.263702869415283,
"learning_rate": 8.707132018209408e-06,
"loss": 0.2506,
"step": 600
},
{
"epoch": 0.1729480435252576,
"eval_loss": 0.26987460255622864,
"eval_mse": 0.26987459245696666,
"eval_runtime": 3.7034,
"eval_samples_per_second": 270.023,
"eval_steps_per_second": 17.011,
"step": 600
},
{
"epoch": 0.17438927722130143,
"grad_norm": 4.830836772918701,
"learning_rate": 8.69195751138088e-06,
"loss": 0.2604,
"step": 605
},
{
"epoch": 0.17583051091734525,
"grad_norm": 9.858766555786133,
"learning_rate": 8.676783004552353e-06,
"loss": 0.2707,
"step": 610
},
{
"epoch": 0.17727174461338907,
"grad_norm": 14.384836196899414,
"learning_rate": 8.661608497723824e-06,
"loss": 0.2804,
"step": 615
},
{
"epoch": 0.17871297830943286,
"grad_norm": 3.5715346336364746,
"learning_rate": 8.646433990895296e-06,
"loss": 0.2705,
"step": 620
},
{
"epoch": 0.18015421200547668,
"grad_norm": 3.551644802093506,
"learning_rate": 8.631259484066769e-06,
"loss": 0.2455,
"step": 625
},
{
"epoch": 0.1815954457015205,
"grad_norm": 5.014893054962158,
"learning_rate": 8.61608497723824e-06,
"loss": 0.2518,
"step": 630
},
{
"epoch": 0.18303667939756432,
"grad_norm": 8.507904052734375,
"learning_rate": 8.600910470409712e-06,
"loss": 0.2794,
"step": 635
},
{
"epoch": 0.18447791309360814,
"grad_norm": 3.0522546768188477,
"learning_rate": 8.585735963581185e-06,
"loss": 0.2676,
"step": 640
},
{
"epoch": 0.18591914678965193,
"grad_norm": 5.574901580810547,
"learning_rate": 8.570561456752656e-06,
"loss": 0.2909,
"step": 645
},
{
"epoch": 0.18736038048569575,
"grad_norm": 4.334587097167969,
"learning_rate": 8.555386949924128e-06,
"loss": 0.2427,
"step": 650
},
{
"epoch": 0.18880161418173957,
"grad_norm": 3.7379872798919678,
"learning_rate": 8.5402124430956e-06,
"loss": 0.2761,
"step": 655
},
{
"epoch": 0.1902428478777834,
"grad_norm": 12.186504364013672,
"learning_rate": 8.525037936267072e-06,
"loss": 0.2533,
"step": 660
},
{
"epoch": 0.1916840815738272,
"grad_norm": 3.4297823905944824,
"learning_rate": 8.509863429438544e-06,
"loss": 0.2627,
"step": 665
},
{
"epoch": 0.193125315269871,
"grad_norm": 6.119399547576904,
"learning_rate": 8.494688922610017e-06,
"loss": 0.284,
"step": 670
},
{
"epoch": 0.19456654896591483,
"grad_norm": 3.3624227046966553,
"learning_rate": 8.479514415781488e-06,
"loss": 0.232,
"step": 675
},
{
"epoch": 0.19600778266195865,
"grad_norm": 3.575873374938965,
"learning_rate": 8.46433990895296e-06,
"loss": 0.2472,
"step": 680
},
{
"epoch": 0.19744901635800244,
"grad_norm": 4.15811014175415,
"learning_rate": 8.449165402124433e-06,
"loss": 0.2802,
"step": 685
},
{
"epoch": 0.19889025005404626,
"grad_norm": 10.572412490844727,
"learning_rate": 8.433990895295904e-06,
"loss": 0.239,
"step": 690
},
{
"epoch": 0.20033148375009008,
"grad_norm": 4.094681262969971,
"learning_rate": 8.418816388467375e-06,
"loss": 0.2569,
"step": 695
},
{
"epoch": 0.2017727174461339,
"grad_norm": 7.369691371917725,
"learning_rate": 8.403641881638848e-06,
"loss": 0.2584,
"step": 700
},
{
"epoch": 0.2017727174461339,
"eval_loss": 0.2632690966129303,
"eval_mse": 0.26326909382641317,
"eval_runtime": 3.532,
"eval_samples_per_second": 283.128,
"eval_steps_per_second": 17.837,
"step": 700
},
{
"epoch": 0.20321395114217772,
"grad_norm": 5.504802703857422,
"learning_rate": 8.38846737481032e-06,
"loss": 0.2701,
"step": 705
},
{
"epoch": 0.2046551848382215,
"grad_norm": 2.7906060218811035,
"learning_rate": 8.373292867981791e-06,
"loss": 0.2547,
"step": 710
},
{
"epoch": 0.20609641853426533,
"grad_norm": 4.351840496063232,
"learning_rate": 8.358118361153263e-06,
"loss": 0.2734,
"step": 715
},
{
"epoch": 0.20753765223030915,
"grad_norm": 4.175307750701904,
"learning_rate": 8.342943854324736e-06,
"loss": 0.282,
"step": 720
},
{
"epoch": 0.20897888592635297,
"grad_norm": 6.214290142059326,
"learning_rate": 8.327769347496207e-06,
"loss": 0.2786,
"step": 725
},
{
"epoch": 0.21042011962239676,
"grad_norm": 4.1190505027771,
"learning_rate": 8.312594840667679e-06,
"loss": 0.2671,
"step": 730
},
{
"epoch": 0.21186135331844058,
"grad_norm": 9.297944068908691,
"learning_rate": 8.297420333839152e-06,
"loss": 0.2467,
"step": 735
},
{
"epoch": 0.2133025870144844,
"grad_norm": 10.469265937805176,
"learning_rate": 8.282245827010623e-06,
"loss": 0.2941,
"step": 740
},
{
"epoch": 0.21474382071052822,
"grad_norm": 8.109935760498047,
"learning_rate": 8.267071320182094e-06,
"loss": 0.2759,
"step": 745
},
{
"epoch": 0.216185054406572,
"grad_norm": 7.648597717285156,
"learning_rate": 8.251896813353568e-06,
"loss": 0.2568,
"step": 750
},
{
"epoch": 0.21762628810261583,
"grad_norm": 5.525659084320068,
"learning_rate": 8.236722306525039e-06,
"loss": 0.275,
"step": 755
},
{
"epoch": 0.21906752179865965,
"grad_norm": 4.415822505950928,
"learning_rate": 8.22154779969651e-06,
"loss": 0.2663,
"step": 760
},
{
"epoch": 0.22050875549470347,
"grad_norm": 5.961898326873779,
"learning_rate": 8.206373292867983e-06,
"loss": 0.2702,
"step": 765
},
{
"epoch": 0.2219499891907473,
"grad_norm": 7.850853443145752,
"learning_rate": 8.191198786039455e-06,
"loss": 0.2699,
"step": 770
},
{
"epoch": 0.22339122288679109,
"grad_norm": 7.393783092498779,
"learning_rate": 8.176024279210926e-06,
"loss": 0.2571,
"step": 775
},
{
"epoch": 0.2248324565828349,
"grad_norm": 4.719627380371094,
"learning_rate": 8.1608497723824e-06,
"loss": 0.2531,
"step": 780
},
{
"epoch": 0.22627369027887873,
"grad_norm": 2.988987684249878,
"learning_rate": 8.14567526555387e-06,
"loss": 0.2608,
"step": 785
},
{
"epoch": 0.22771492397492255,
"grad_norm": 3.3279573917388916,
"learning_rate": 8.130500758725342e-06,
"loss": 0.255,
"step": 790
},
{
"epoch": 0.22915615767096634,
"grad_norm": 5.147717475891113,
"learning_rate": 8.115326251896815e-06,
"loss": 0.2765,
"step": 795
},
{
"epoch": 0.23059739136701016,
"grad_norm": 3.649806022644043,
"learning_rate": 8.100151745068287e-06,
"loss": 0.2603,
"step": 800
},
{
"epoch": 0.23059739136701016,
"eval_loss": 0.2433857023715973,
"eval_mse": 0.2433856954583898,
"eval_runtime": 3.5848,
"eval_samples_per_second": 278.954,
"eval_steps_per_second": 17.574,
"step": 800
},
{
"epoch": 0.23203862506305398,
"grad_norm": 6.429337501525879,
"learning_rate": 8.084977238239758e-06,
"loss": 0.2499,
"step": 805
},
{
"epoch": 0.2334798587590978,
"grad_norm": 12.670327186584473,
"learning_rate": 8.06980273141123e-06,
"loss": 0.2675,
"step": 810
},
{
"epoch": 0.2349210924551416,
"grad_norm": 8.6768159866333,
"learning_rate": 8.054628224582701e-06,
"loss": 0.2406,
"step": 815
},
{
"epoch": 0.2363623261511854,
"grad_norm": 7.374031066894531,
"learning_rate": 8.039453717754174e-06,
"loss": 0.2762,
"step": 820
},
{
"epoch": 0.23780355984722923,
"grad_norm": 5.246163845062256,
"learning_rate": 8.024279210925645e-06,
"loss": 0.2587,
"step": 825
},
{
"epoch": 0.23924479354327305,
"grad_norm": 4.083150386810303,
"learning_rate": 8.009104704097117e-06,
"loss": 0.2795,
"step": 830
},
{
"epoch": 0.24068602723931687,
"grad_norm": 4.8549065589904785,
"learning_rate": 7.993930197268588e-06,
"loss": 0.2625,
"step": 835
},
{
"epoch": 0.24212726093536066,
"grad_norm": 2.8547518253326416,
"learning_rate": 7.978755690440061e-06,
"loss": 0.273,
"step": 840
},
{
"epoch": 0.24356849463140448,
"grad_norm": 3.599306106567383,
"learning_rate": 7.963581183611533e-06,
"loss": 0.2595,
"step": 845
},
{
"epoch": 0.2450097283274483,
"grad_norm": 4.009678840637207,
"learning_rate": 7.948406676783004e-06,
"loss": 0.2725,
"step": 850
},
{
"epoch": 0.24645096202349212,
"grad_norm": 2.623904228210449,
"learning_rate": 7.933232169954477e-06,
"loss": 0.2577,
"step": 855
},
{
"epoch": 0.2478921957195359,
"grad_norm": 3.4143848419189453,
"learning_rate": 7.918057663125949e-06,
"loss": 0.2345,
"step": 860
},
{
"epoch": 0.24933342941557973,
"grad_norm": 3.2756216526031494,
"learning_rate": 7.90288315629742e-06,
"loss": 0.2609,
"step": 865
},
{
"epoch": 0.25077466311162355,
"grad_norm": 6.703480243682861,
"learning_rate": 7.887708649468893e-06,
"loss": 0.273,
"step": 870
},
{
"epoch": 0.2522158968076674,
"grad_norm": 3.215533494949341,
"learning_rate": 7.872534142640365e-06,
"loss": 0.2448,
"step": 875
},
{
"epoch": 0.2536571305037112,
"grad_norm": 3.3647091388702393,
"learning_rate": 7.857359635811836e-06,
"loss": 0.239,
"step": 880
},
{
"epoch": 0.255098364199755,
"grad_norm": 5.089346408843994,
"learning_rate": 7.842185128983309e-06,
"loss": 0.26,
"step": 885
},
{
"epoch": 0.2565395978957988,
"grad_norm": 3.7883851528167725,
"learning_rate": 7.82701062215478e-06,
"loss": 0.2573,
"step": 890
},
{
"epoch": 0.2579808315918426,
"grad_norm": 3.3582770824432373,
"learning_rate": 7.811836115326252e-06,
"loss": 0.2462,
"step": 895
},
{
"epoch": 0.2594220652878864,
"grad_norm": 3.166255235671997,
"learning_rate": 7.796661608497725e-06,
"loss": 0.2973,
"step": 900
},
{
"epoch": 0.2594220652878864,
"eval_loss": 0.2394440621137619,
"eval_mse": 0.23944407220184802,
"eval_runtime": 3.4937,
"eval_samples_per_second": 286.233,
"eval_steps_per_second": 18.033,
"step": 900
},
{
"epoch": 0.26086329898393024,
"grad_norm": 9.669307708740234,
"learning_rate": 7.781487101669196e-06,
"loss": 0.2588,
"step": 905
},
{
"epoch": 0.26230453267997406,
"grad_norm": 3.7549054622650146,
"learning_rate": 7.766312594840668e-06,
"loss": 0.2646,
"step": 910
},
{
"epoch": 0.2637457663760179,
"grad_norm": 8.603379249572754,
"learning_rate": 7.75113808801214e-06,
"loss": 0.2425,
"step": 915
},
{
"epoch": 0.2651870000720617,
"grad_norm": 3.408053398132324,
"learning_rate": 7.735963581183612e-06,
"loss": 0.2323,
"step": 920
},
{
"epoch": 0.2666282337681055,
"grad_norm": 14.35401725769043,
"learning_rate": 7.720789074355084e-06,
"loss": 0.251,
"step": 925
},
{
"epoch": 0.26806946746414934,
"grad_norm": 6.7734599113464355,
"learning_rate": 7.705614567526557e-06,
"loss": 0.2523,
"step": 930
},
{
"epoch": 0.2695107011601931,
"grad_norm": 7.644830226898193,
"learning_rate": 7.690440060698028e-06,
"loss": 0.2574,
"step": 935
},
{
"epoch": 0.2709519348562369,
"grad_norm": 4.449583053588867,
"learning_rate": 7.6752655538695e-06,
"loss": 0.2885,
"step": 940
},
{
"epoch": 0.27239316855228074,
"grad_norm": 3.3101038932800293,
"learning_rate": 7.660091047040971e-06,
"loss": 0.2893,
"step": 945
},
{
"epoch": 0.27383440224832456,
"grad_norm": 11.52640151977539,
"learning_rate": 7.644916540212444e-06,
"loss": 0.305,
"step": 950
},
{
"epoch": 0.2752756359443684,
"grad_norm": 11.3882474899292,
"learning_rate": 7.629742033383915e-06,
"loss": 0.2692,
"step": 955
},
{
"epoch": 0.2767168696404122,
"grad_norm": 4.39008903503418,
"learning_rate": 7.614567526555388e-06,
"loss": 0.2551,
"step": 960
},
{
"epoch": 0.278158103336456,
"grad_norm": 7.699972629547119,
"learning_rate": 7.599393019726859e-06,
"loss": 0.2684,
"step": 965
},
{
"epoch": 0.27959933703249984,
"grad_norm": 10.91154956817627,
"learning_rate": 7.584218512898331e-06,
"loss": 0.3027,
"step": 970
},
{
"epoch": 0.28104057072854366,
"grad_norm": 3.525420904159546,
"learning_rate": 7.569044006069804e-06,
"loss": 0.2406,
"step": 975
},
{
"epoch": 0.2824818044245874,
"grad_norm": 12.457540512084961,
"learning_rate": 7.553869499241275e-06,
"loss": 0.2964,
"step": 980
},
{
"epoch": 0.28392303812063124,
"grad_norm": 10.155010223388672,
"learning_rate": 7.538694992412747e-06,
"loss": 0.2561,
"step": 985
},
{
"epoch": 0.28536427181667506,
"grad_norm": 4.487485408782959,
"learning_rate": 7.5235204855842195e-06,
"loss": 0.2575,
"step": 990
},
{
"epoch": 0.2868055055127189,
"grad_norm": 3.443803071975708,
"learning_rate": 7.508345978755691e-06,
"loss": 0.2915,
"step": 995
},
{
"epoch": 0.2882467392087627,
"grad_norm": 3.3720641136169434,
"learning_rate": 7.493171471927163e-06,
"loss": 0.2541,
"step": 1000
},
{
"epoch": 0.2882467392087627,
"eval_loss": 0.23559316992759705,
"eval_mse": 0.23559318256378173,
"eval_runtime": 3.5793,
"eval_samples_per_second": 279.383,
"eval_steps_per_second": 17.601,
"step": 1000
},
{
"epoch": 0.2896879729048065,
"grad_norm": 2.868006467819214,
"learning_rate": 7.477996965098635e-06,
"loss": 0.2541,
"step": 1005
},
{
"epoch": 0.29112920660085034,
"grad_norm": 8.352180480957031,
"learning_rate": 7.462822458270107e-06,
"loss": 0.347,
"step": 1010
},
{
"epoch": 0.29257044029689416,
"grad_norm": 3.547825813293457,
"learning_rate": 7.447647951441579e-06,
"loss": 0.2627,
"step": 1015
},
{
"epoch": 0.2940116739929379,
"grad_norm": 7.52193021774292,
"learning_rate": 7.4324734446130505e-06,
"loss": 0.2261,
"step": 1020
},
{
"epoch": 0.29545290768898175,
"grad_norm": 3.890550374984741,
"learning_rate": 7.417298937784523e-06,
"loss": 0.2755,
"step": 1025
},
{
"epoch": 0.29689414138502557,
"grad_norm": 3.059263229370117,
"learning_rate": 7.402124430955995e-06,
"loss": 0.2643,
"step": 1030
},
{
"epoch": 0.2983353750810694,
"grad_norm": 3.5128462314605713,
"learning_rate": 7.386949924127466e-06,
"loss": 0.2388,
"step": 1035
},
{
"epoch": 0.2997766087771132,
"grad_norm": 4.26191520690918,
"learning_rate": 7.371775417298939e-06,
"loss": 0.2605,
"step": 1040
},
{
"epoch": 0.301217842473157,
"grad_norm": 3.458613157272339,
"learning_rate": 7.356600910470411e-06,
"loss": 0.2461,
"step": 1045
},
{
"epoch": 0.30265907616920085,
"grad_norm": 11.277898788452148,
"learning_rate": 7.341426403641882e-06,
"loss": 0.24,
"step": 1050
},
{
"epoch": 0.30410030986524467,
"grad_norm": 3.508758544921875,
"learning_rate": 7.3262518968133545e-06,
"loss": 0.2536,
"step": 1055
},
{
"epoch": 0.3055415435612885,
"grad_norm": 6.12369441986084,
"learning_rate": 7.311077389984827e-06,
"loss": 0.2553,
"step": 1060
},
{
"epoch": 0.30698277725733225,
"grad_norm": 8.630524635314941,
"learning_rate": 7.295902883156298e-06,
"loss": 0.239,
"step": 1065
},
{
"epoch": 0.30842401095337607,
"grad_norm": 6.543661117553711,
"learning_rate": 7.2807283763277704e-06,
"loss": 0.2436,
"step": 1070
},
{
"epoch": 0.3098652446494199,
"grad_norm": 2.481372594833374,
"learning_rate": 7.265553869499242e-06,
"loss": 0.2497,
"step": 1075
},
{
"epoch": 0.3113064783454637,
"grad_norm": 2.966479539871216,
"learning_rate": 7.250379362670714e-06,
"loss": 0.2792,
"step": 1080
},
{
"epoch": 0.31274771204150753,
"grad_norm": 2.8353137969970703,
"learning_rate": 7.235204855842186e-06,
"loss": 0.2745,
"step": 1085
},
{
"epoch": 0.31418894573755135,
"grad_norm": 3.3317484855651855,
"learning_rate": 7.220030349013658e-06,
"loss": 0.2748,
"step": 1090
},
{
"epoch": 0.31563017943359517,
"grad_norm": 9.185257911682129,
"learning_rate": 7.20485584218513e-06,
"loss": 0.2659,
"step": 1095
},
{
"epoch": 0.317071413129639,
"grad_norm": 3.2732787132263184,
"learning_rate": 7.189681335356602e-06,
"loss": 0.2837,
"step": 1100
},
{
"epoch": 0.317071413129639,
"eval_loss": 0.2436872273683548,
"eval_mse": 0.24368724367022515,
"eval_runtime": 3.6657,
"eval_samples_per_second": 272.802,
"eval_steps_per_second": 17.187,
"step": 1100
},
{
"epoch": 0.3185126468256828,
"grad_norm": 7.124091625213623,
"learning_rate": 7.174506828528074e-06,
"loss": 0.2601,
"step": 1105
},
{
"epoch": 0.3199538805217266,
"grad_norm": 2.949673652648926,
"learning_rate": 7.159332321699546e-06,
"loss": 0.2378,
"step": 1110
},
{
"epoch": 0.3213951142177704,
"grad_norm": 6.16537618637085,
"learning_rate": 7.144157814871018e-06,
"loss": 0.2681,
"step": 1115
},
{
"epoch": 0.3228363479138142,
"grad_norm": 10.283601760864258,
"learning_rate": 7.1289833080424896e-06,
"loss": 0.2537,
"step": 1120
},
{
"epoch": 0.32427758160985803,
"grad_norm": 8.933276176452637,
"learning_rate": 7.113808801213962e-06,
"loss": 0.2522,
"step": 1125
},
{
"epoch": 0.32571881530590185,
"grad_norm": 6.161346435546875,
"learning_rate": 7.098634294385432e-06,
"loss": 0.2717,
"step": 1130
},
{
"epoch": 0.3271600490019457,
"grad_norm": 3.492527961730957,
"learning_rate": 7.083459787556905e-06,
"loss": 0.2691,
"step": 1135
},
{
"epoch": 0.3286012826979895,
"grad_norm": 11.494355201721191,
"learning_rate": 7.068285280728376e-06,
"loss": 0.2508,
"step": 1140
},
{
"epoch": 0.3300425163940333,
"grad_norm": 5.227607250213623,
"learning_rate": 7.053110773899848e-06,
"loss": 0.2791,
"step": 1145
},
{
"epoch": 0.3314837500900771,
"grad_norm": 9.131232261657715,
"learning_rate": 7.0379362670713205e-06,
"loss": 0.2617,
"step": 1150
},
{
"epoch": 0.3329249837861209,
"grad_norm": 8.7017183303833,
"learning_rate": 7.022761760242792e-06,
"loss": 0.2456,
"step": 1155
},
{
"epoch": 0.3343662174821647,
"grad_norm": 10.420509338378906,
"learning_rate": 7.007587253414264e-06,
"loss": 0.2494,
"step": 1160
},
{
"epoch": 0.33580745117820854,
"grad_norm": 10.890824317932129,
"learning_rate": 6.9924127465857364e-06,
"loss": 0.2571,
"step": 1165
},
{
"epoch": 0.33724868487425236,
"grad_norm": 7.0846428871154785,
"learning_rate": 6.977238239757208e-06,
"loss": 0.2786,
"step": 1170
},
{
"epoch": 0.3386899185702962,
"grad_norm": 11.748177528381348,
"learning_rate": 6.96206373292868e-06,
"loss": 0.243,
"step": 1175
},
{
"epoch": 0.34013115226634,
"grad_norm": 3.483414888381958,
"learning_rate": 6.946889226100152e-06,
"loss": 0.2426,
"step": 1180
},
{
"epoch": 0.3415723859623838,
"grad_norm": 8.315218925476074,
"learning_rate": 6.931714719271624e-06,
"loss": 0.2457,
"step": 1185
},
{
"epoch": 0.34301361965842764,
"grad_norm": 3.617290496826172,
"learning_rate": 6.916540212443096e-06,
"loss": 0.2278,
"step": 1190
},
{
"epoch": 0.3444548533544714,
"grad_norm": 3.7248239517211914,
"learning_rate": 6.901365705614567e-06,
"loss": 0.2359,
"step": 1195
},
{
"epoch": 0.3458960870505152,
"grad_norm": 4.484503269195557,
"learning_rate": 6.88619119878604e-06,
"loss": 0.242,
"step": 1200
},
{
"epoch": 0.3458960870505152,
"eval_loss": 0.23793897032737732,
"eval_mse": 0.237938963919878,
"eval_runtime": 3.5407,
"eval_samples_per_second": 282.427,
"eval_steps_per_second": 17.793,
"step": 1200
},
{
"epoch": 0.34733732074655904,
"grad_norm": 3.734285593032837,
"learning_rate": 6.871016691957512e-06,
"loss": 0.2488,
"step": 1205
},
{
"epoch": 0.34877855444260286,
"grad_norm": 2.971663475036621,
"learning_rate": 6.855842185128983e-06,
"loss": 0.2463,
"step": 1210
},
{
"epoch": 0.3502197881386467,
"grad_norm": 20.05165672302246,
"learning_rate": 6.8406676783004556e-06,
"loss": 0.2428,
"step": 1215
},
{
"epoch": 0.3516610218346905,
"grad_norm": 2.946723461151123,
"learning_rate": 6.825493171471928e-06,
"loss": 0.252,
"step": 1220
},
{
"epoch": 0.3531022555307343,
"grad_norm": 3.993689775466919,
"learning_rate": 6.810318664643399e-06,
"loss": 0.2588,
"step": 1225
},
{
"epoch": 0.35454348922677814,
"grad_norm": 3.7553279399871826,
"learning_rate": 6.7951441578148715e-06,
"loss": 0.2432,
"step": 1230
},
{
"epoch": 0.35598472292282196,
"grad_norm": 10.069433212280273,
"learning_rate": 6.779969650986343e-06,
"loss": 0.24,
"step": 1235
},
{
"epoch": 0.3574259566188657,
"grad_norm": 3.12581205368042,
"learning_rate": 6.764795144157815e-06,
"loss": 0.2396,
"step": 1240
},
{
"epoch": 0.35886719031490955,
"grad_norm": 3.955724000930786,
"learning_rate": 6.749620637329287e-06,
"loss": 0.2582,
"step": 1245
},
{
"epoch": 0.36030842401095337,
"grad_norm": 5.299466133117676,
"learning_rate": 6.734446130500759e-06,
"loss": 0.2537,
"step": 1250
},
{
"epoch": 0.3617496577069972,
"grad_norm": 3.8833680152893066,
"learning_rate": 6.719271623672231e-06,
"loss": 0.2178,
"step": 1255
},
{
"epoch": 0.363190891403041,
"grad_norm": 3.391704559326172,
"learning_rate": 6.704097116843703e-06,
"loss": 0.2616,
"step": 1260
},
{
"epoch": 0.3646321250990848,
"grad_norm": 5.185522079467773,
"learning_rate": 6.688922610015175e-06,
"loss": 0.2529,
"step": 1265
},
{
"epoch": 0.36607335879512864,
"grad_norm": 7.748248100280762,
"learning_rate": 6.673748103186647e-06,
"loss": 0.2918,
"step": 1270
},
{
"epoch": 0.36751459249117246,
"grad_norm": 6.879116535186768,
"learning_rate": 6.658573596358119e-06,
"loss": 0.2522,
"step": 1275
},
{
"epoch": 0.3689558261872163,
"grad_norm": 5.699965476989746,
"learning_rate": 6.643399089529591e-06,
"loss": 0.2467,
"step": 1280
},
{
"epoch": 0.37039705988326005,
"grad_norm": 6.8762617111206055,
"learning_rate": 6.628224582701063e-06,
"loss": 0.247,
"step": 1285
},
{
"epoch": 0.37183829357930387,
"grad_norm": 3.2006216049194336,
"learning_rate": 6.613050075872534e-06,
"loss": 0.235,
"step": 1290
},
{
"epoch": 0.3732795272753477,
"grad_norm": 5.277589321136475,
"learning_rate": 6.5978755690440065e-06,
"loss": 0.2583,
"step": 1295
},
{
"epoch": 0.3747207609713915,
"grad_norm": 3.9958581924438477,
"learning_rate": 6.582701062215479e-06,
"loss": 0.2379,
"step": 1300
},
{
"epoch": 0.3747207609713915,
"eval_loss": 0.2270413339138031,
"eval_mse": 0.2270413387455046,
"eval_runtime": 3.4467,
"eval_samples_per_second": 290.129,
"eval_steps_per_second": 18.278,
"step": 1300
},
{
"epoch": 0.37616199466743533,
"grad_norm": 3.3491854667663574,
"learning_rate": 6.56752655538695e-06,
"loss": 0.2653,
"step": 1305
},
{
"epoch": 0.37760322836347915,
"grad_norm": 5.074587345123291,
"learning_rate": 6.552352048558422e-06,
"loss": 0.2377,
"step": 1310
},
{
"epoch": 0.37904446205952297,
"grad_norm": 7.423491477966309,
"learning_rate": 6.537177541729895e-06,
"loss": 0.2854,
"step": 1315
},
{
"epoch": 0.3804856957555668,
"grad_norm": 14.916816711425781,
"learning_rate": 6.522003034901366e-06,
"loss": 0.2513,
"step": 1320
},
{
"epoch": 0.38192692945161055,
"grad_norm": 5.700295448303223,
"learning_rate": 6.506828528072838e-06,
"loss": 0.2609,
"step": 1325
},
{
"epoch": 0.3833681631476544,
"grad_norm": 3.671921968460083,
"learning_rate": 6.4916540212443106e-06,
"loss": 0.2536,
"step": 1330
},
{
"epoch": 0.3848093968436982,
"grad_norm": 7.694835186004639,
"learning_rate": 6.476479514415782e-06,
"loss": 0.2482,
"step": 1335
},
{
"epoch": 0.386250630539742,
"grad_norm": 6.534417629241943,
"learning_rate": 6.461305007587254e-06,
"loss": 0.2571,
"step": 1340
},
{
"epoch": 0.38769186423578583,
"grad_norm": 4.938977241516113,
"learning_rate": 6.446130500758726e-06,
"loss": 0.2513,
"step": 1345
},
{
"epoch": 0.38913309793182965,
"grad_norm": 4.277524471282959,
"learning_rate": 6.430955993930198e-06,
"loss": 0.2433,
"step": 1350
},
{
"epoch": 0.3905743316278735,
"grad_norm": 10.038058280944824,
"learning_rate": 6.41578148710167e-06,
"loss": 0.2622,
"step": 1355
},
{
"epoch": 0.3920155653239173,
"grad_norm": 7.558711051940918,
"learning_rate": 6.4006069802731415e-06,
"loss": 0.2434,
"step": 1360
},
{
"epoch": 0.3934567990199611,
"grad_norm": 9.363914489746094,
"learning_rate": 6.385432473444614e-06,
"loss": 0.2491,
"step": 1365
},
{
"epoch": 0.3948980327160049,
"grad_norm": 7.465854644775391,
"learning_rate": 6.370257966616086e-06,
"loss": 0.2545,
"step": 1370
},
{
"epoch": 0.3963392664120487,
"grad_norm": 6.826533794403076,
"learning_rate": 6.3550834597875574e-06,
"loss": 0.2912,
"step": 1375
},
{
"epoch": 0.3977805001080925,
"grad_norm": 6.965171813964844,
"learning_rate": 6.33990895295903e-06,
"loss": 0.2773,
"step": 1380
},
{
"epoch": 0.39922173380413634,
"grad_norm": 3.642481565475464,
"learning_rate": 6.324734446130502e-06,
"loss": 0.245,
"step": 1385
},
{
"epoch": 0.40066296750018016,
"grad_norm": 3.2742958068847656,
"learning_rate": 6.309559939301973e-06,
"loss": 0.2422,
"step": 1390
},
{
"epoch": 0.402104201196224,
"grad_norm": 3.7424025535583496,
"learning_rate": 6.294385432473446e-06,
"loss": 0.247,
"step": 1395
},
{
"epoch": 0.4035454348922678,
"grad_norm": 8.01791763305664,
"learning_rate": 6.279210925644917e-06,
"loss": 0.23,
"step": 1400
},
{
"epoch": 0.4035454348922678,
"eval_loss": 0.23571762442588806,
"eval_mse": 0.23571763192489742,
"eval_runtime": 3.6319,
"eval_samples_per_second": 275.335,
"eval_steps_per_second": 17.346,
"step": 1400
},
{
"epoch": 0.4049866685883116,
"grad_norm": 11.114055633544922,
"learning_rate": 6.264036418816389e-06,
"loss": 0.2732,
"step": 1405
},
{
"epoch": 0.40642790228435544,
"grad_norm": 4.4829020500183105,
"learning_rate": 6.2488619119878615e-06,
"loss": 0.2467,
"step": 1410
},
{
"epoch": 0.4078691359803992,
"grad_norm": 9.322883605957031,
"learning_rate": 6.233687405159333e-06,
"loss": 0.2525,
"step": 1415
},
{
"epoch": 0.409310369676443,
"grad_norm": 4.860049247741699,
"learning_rate": 6.218512898330805e-06,
"loss": 0.2567,
"step": 1420
},
{
"epoch": 0.41075160337248684,
"grad_norm": 3.1241393089294434,
"learning_rate": 6.203338391502277e-06,
"loss": 0.2244,
"step": 1425
},
{
"epoch": 0.41219283706853066,
"grad_norm": 5.4952263832092285,
"learning_rate": 6.188163884673749e-06,
"loss": 0.2841,
"step": 1430
},
{
"epoch": 0.4136340707645745,
"grad_norm": 11.66360092163086,
"learning_rate": 6.172989377845221e-06,
"loss": 0.2669,
"step": 1435
},
{
"epoch": 0.4150753044606183,
"grad_norm": 5.991528511047363,
"learning_rate": 6.157814871016693e-06,
"loss": 0.2819,
"step": 1440
},
{
"epoch": 0.4165165381566621,
"grad_norm": 4.639535903930664,
"learning_rate": 6.142640364188165e-06,
"loss": 0.2536,
"step": 1445
},
{
"epoch": 0.41795777185270594,
"grad_norm": 15.960262298583984,
"learning_rate": 6.127465857359637e-06,
"loss": 0.2553,
"step": 1450
},
{
"epoch": 0.4193990055487497,
"grad_norm": 2.875967025756836,
"learning_rate": 6.112291350531108e-06,
"loss": 0.2521,
"step": 1455
},
{
"epoch": 0.4208402392447935,
"grad_norm": 8.322701454162598,
"learning_rate": 6.09711684370258e-06,
"loss": 0.2637,
"step": 1460
},
{
"epoch": 0.42228147294083734,
"grad_norm": 10.47356128692627,
"learning_rate": 6.081942336874051e-06,
"loss": 0.247,
"step": 1465
},
{
"epoch": 0.42372270663688116,
"grad_norm": 4.5936598777771,
"learning_rate": 6.0667678300455234e-06,
"loss": 0.2284,
"step": 1470
},
{
"epoch": 0.425163940332925,
"grad_norm": 6.534598350524902,
"learning_rate": 6.051593323216996e-06,
"loss": 0.2431,
"step": 1475
},
{
"epoch": 0.4266051740289688,
"grad_norm": 5.094123363494873,
"learning_rate": 6.036418816388467e-06,
"loss": 0.2582,
"step": 1480
},
{
"epoch": 0.4280464077250126,
"grad_norm": 6.307914733886719,
"learning_rate": 6.021244309559939e-06,
"loss": 0.2723,
"step": 1485
},
{
"epoch": 0.42948764142105644,
"grad_norm": 3.139702081680298,
"learning_rate": 6.006069802731412e-06,
"loss": 0.2624,
"step": 1490
},
{
"epoch": 0.43092887511710026,
"grad_norm": 8.088966369628906,
"learning_rate": 5.990895295902883e-06,
"loss": 0.2114,
"step": 1495
},
{
"epoch": 0.432370108813144,
"grad_norm": 5.096198558807373,
"learning_rate": 5.975720789074355e-06,
"loss": 0.2345,
"step": 1500
},
{
"epoch": 0.432370108813144,
"eval_loss": 0.24168826639652252,
"eval_mse": 0.24168826324585826,
"eval_runtime": 3.4764,
"eval_samples_per_second": 287.655,
"eval_steps_per_second": 18.122,
"step": 1500
},
{
"epoch": 0.43381134250918785,
"grad_norm": 4.601346015930176,
"learning_rate": 5.9605462822458275e-06,
"loss": 0.2741,
"step": 1505
},
{
"epoch": 0.43525257620523167,
"grad_norm": 3.155237913131714,
"learning_rate": 5.945371775417299e-06,
"loss": 0.2726,
"step": 1510
},
{
"epoch": 0.4366938099012755,
"grad_norm": 10.061796188354492,
"learning_rate": 5.930197268588771e-06,
"loss": 0.2424,
"step": 1515
},
{
"epoch": 0.4381350435973193,
"grad_norm": 3.544672966003418,
"learning_rate": 5.9150227617602426e-06,
"loss": 0.2538,
"step": 1520
},
{
"epoch": 0.4395762772933631,
"grad_norm": 16.151227951049805,
"learning_rate": 5.899848254931715e-06,
"loss": 0.279,
"step": 1525
},
{
"epoch": 0.44101751098940695,
"grad_norm": 6.2358598709106445,
"learning_rate": 5.884673748103187e-06,
"loss": 0.2487,
"step": 1530
},
{
"epoch": 0.44245874468545077,
"grad_norm": 4.63312292098999,
"learning_rate": 5.8694992412746585e-06,
"loss": 0.232,
"step": 1535
},
{
"epoch": 0.4438999783814946,
"grad_norm": 6.458159923553467,
"learning_rate": 5.854324734446131e-06,
"loss": 0.2283,
"step": 1540
},
{
"epoch": 0.44534121207753835,
"grad_norm": 5.087281227111816,
"learning_rate": 5.839150227617603e-06,
"loss": 0.2657,
"step": 1545
},
{
"epoch": 0.44678244577358217,
"grad_norm": 5.383090972900391,
"learning_rate": 5.823975720789074e-06,
"loss": 0.2505,
"step": 1550
},
{
"epoch": 0.448223679469626,
"grad_norm": 3.1167826652526855,
"learning_rate": 5.808801213960547e-06,
"loss": 0.2581,
"step": 1555
},
{
"epoch": 0.4496649131656698,
"grad_norm": 3.727855682373047,
"learning_rate": 5.793626707132019e-06,
"loss": 0.232,
"step": 1560
},
{
"epoch": 0.45110614686171363,
"grad_norm": 4.394417762756348,
"learning_rate": 5.77845220030349e-06,
"loss": 0.2481,
"step": 1565
},
{
"epoch": 0.45254738055775745,
"grad_norm": 8.402990341186523,
"learning_rate": 5.7632776934749625e-06,
"loss": 0.2528,
"step": 1570
},
{
"epoch": 0.45398861425380127,
"grad_norm": 6.247002601623535,
"learning_rate": 5.748103186646434e-06,
"loss": 0.255,
"step": 1575
},
{
"epoch": 0.4554298479498451,
"grad_norm": 4.592987060546875,
"learning_rate": 5.732928679817906e-06,
"loss": 0.2551,
"step": 1580
},
{
"epoch": 0.45687108164588885,
"grad_norm": 5.450745105743408,
"learning_rate": 5.7177541729893784e-06,
"loss": 0.2292,
"step": 1585
},
{
"epoch": 0.4583123153419327,
"grad_norm": 2.5550763607025146,
"learning_rate": 5.70257966616085e-06,
"loss": 0.2342,
"step": 1590
},
{
"epoch": 0.4597535490379765,
"grad_norm": 4.07436990737915,
"learning_rate": 5.687405159332322e-06,
"loss": 0.2504,
"step": 1595
},
{
"epoch": 0.4611947827340203,
"grad_norm": 14.1714506149292,
"learning_rate": 5.672230652503794e-06,
"loss": 0.2574,
"step": 1600
},
{
"epoch": 0.4611947827340203,
"eval_loss": 0.2556320130825043,
"eval_mse": 0.25563200883287934,
"eval_runtime": 3.4708,
"eval_samples_per_second": 288.122,
"eval_steps_per_second": 18.152,
"step": 1600
},
{
"epoch": 0.46263601643006413,
"grad_norm": 2.311433792114258,
"learning_rate": 5.657056145675266e-06,
"loss": 0.2406,
"step": 1605
},
{
"epoch": 0.46407725012610795,
"grad_norm": 5.986364841461182,
"learning_rate": 5.641881638846738e-06,
"loss": 0.2466,
"step": 1610
},
{
"epoch": 0.4655184838221518,
"grad_norm": 11.425749778747559,
"learning_rate": 5.62670713201821e-06,
"loss": 0.2676,
"step": 1615
},
{
"epoch": 0.4669597175181956,
"grad_norm": 4.642142295837402,
"learning_rate": 5.611532625189682e-06,
"loss": 0.2226,
"step": 1620
},
{
"epoch": 0.4684009512142394,
"grad_norm": 3.2466318607330322,
"learning_rate": 5.596358118361154e-06,
"loss": 0.2508,
"step": 1625
},
{
"epoch": 0.4698421849102832,
"grad_norm": 5.736583232879639,
"learning_rate": 5.581183611532625e-06,
"loss": 0.2495,
"step": 1630
},
{
"epoch": 0.471283418606327,
"grad_norm": 3.0507047176361084,
"learning_rate": 5.5660091047040976e-06,
"loss": 0.2345,
"step": 1635
},
{
"epoch": 0.4727246523023708,
"grad_norm": 7.51447868347168,
"learning_rate": 5.55083459787557e-06,
"loss": 0.259,
"step": 1640
},
{
"epoch": 0.47416588599841464,
"grad_norm": 4.414575576782227,
"learning_rate": 5.535660091047041e-06,
"loss": 0.2347,
"step": 1645
},
{
"epoch": 0.47560711969445846,
"grad_norm": 5.172975540161133,
"learning_rate": 5.5204855842185135e-06,
"loss": 0.2588,
"step": 1650
},
{
"epoch": 0.4770483533905023,
"grad_norm": 3.5544350147247314,
"learning_rate": 5.505311077389986e-06,
"loss": 0.243,
"step": 1655
},
{
"epoch": 0.4784895870865461,
"grad_norm": 6.046934127807617,
"learning_rate": 5.490136570561457e-06,
"loss": 0.2428,
"step": 1660
},
{
"epoch": 0.4799308207825899,
"grad_norm": 12.10718059539795,
"learning_rate": 5.474962063732929e-06,
"loss": 0.2343,
"step": 1665
},
{
"epoch": 0.48137205447863374,
"grad_norm": 5.280265808105469,
"learning_rate": 5.459787556904402e-06,
"loss": 0.2546,
"step": 1670
},
{
"epoch": 0.4828132881746775,
"grad_norm": 2.352365255355835,
"learning_rate": 5.444613050075873e-06,
"loss": 0.2445,
"step": 1675
},
{
"epoch": 0.4842545218707213,
"grad_norm": 3.669762372970581,
"learning_rate": 5.429438543247345e-06,
"loss": 0.2468,
"step": 1680
},
{
"epoch": 0.48569575556676514,
"grad_norm": 2.9449355602264404,
"learning_rate": 5.414264036418817e-06,
"loss": 0.239,
"step": 1685
},
{
"epoch": 0.48713698926280896,
"grad_norm": 6.094593048095703,
"learning_rate": 5.399089529590289e-06,
"loss": 0.2371,
"step": 1690
},
{
"epoch": 0.4885782229588528,
"grad_norm": 4.748923301696777,
"learning_rate": 5.383915022761761e-06,
"loss": 0.2494,
"step": 1695
},
{
"epoch": 0.4900194566548966,
"grad_norm": 3.353111743927002,
"learning_rate": 5.368740515933233e-06,
"loss": 0.264,
"step": 1700
},
{
"epoch": 0.4900194566548966,
"eval_loss": 0.24517062306404114,
"eval_mse": 0.2451706298738718,
"eval_runtime": 3.6449,
"eval_samples_per_second": 274.356,
"eval_steps_per_second": 17.284,
"step": 1700
},
{
"epoch": 0.4914606903509404,
"grad_norm": 3.940274238586426,
"learning_rate": 5.353566009104705e-06,
"loss": 0.2538,
"step": 1705
},
{
"epoch": 0.49290192404698424,
"grad_norm": 5.480809688568115,
"learning_rate": 5.338391502276177e-06,
"loss": 0.2316,
"step": 1710
},
{
"epoch": 0.494343157743028,
"grad_norm": 3.5788941383361816,
"learning_rate": 5.3232169954476485e-06,
"loss": 0.2452,
"step": 1715
},
{
"epoch": 0.4957843914390718,
"grad_norm": 2.8091890811920166,
"learning_rate": 5.308042488619121e-06,
"loss": 0.2649,
"step": 1720
},
{
"epoch": 0.49722562513511565,
"grad_norm": 5.758116722106934,
"learning_rate": 5.292867981790593e-06,
"loss": 0.23,
"step": 1725
},
{
"epoch": 0.49866685883115947,
"grad_norm": 5.408109188079834,
"learning_rate": 5.277693474962064e-06,
"loss": 0.2686,
"step": 1730
},
{
"epoch": 0.5001080925272033,
"grad_norm": 3.6406140327453613,
"learning_rate": 5.262518968133537e-06,
"loss": 0.2427,
"step": 1735
},
{
"epoch": 0.5015493262232471,
"grad_norm": 3.9658877849578857,
"learning_rate": 5.247344461305008e-06,
"loss": 0.231,
"step": 1740
},
{
"epoch": 0.5029905599192909,
"grad_norm": 2.78668212890625,
"learning_rate": 5.23216995447648e-06,
"loss": 0.2545,
"step": 1745
},
{
"epoch": 0.5044317936153347,
"grad_norm": 3.2075002193450928,
"learning_rate": 5.2169954476479526e-06,
"loss": 0.2527,
"step": 1750
},
{
"epoch": 0.5058730273113785,
"grad_norm": 4.351621150970459,
"learning_rate": 5.201820940819424e-06,
"loss": 0.2538,
"step": 1755
},
{
"epoch": 0.5073142610074224,
"grad_norm": 8.305010795593262,
"learning_rate": 5.186646433990896e-06,
"loss": 0.2959,
"step": 1760
},
{
"epoch": 0.5087554947034661,
"grad_norm": 6.016130447387695,
"learning_rate": 5.1714719271623685e-06,
"loss": 0.2508,
"step": 1765
},
{
"epoch": 0.51019672839951,
"grad_norm": 3.914813995361328,
"learning_rate": 5.15629742033384e-06,
"loss": 0.256,
"step": 1770
},
{
"epoch": 0.5116379620955538,
"grad_norm": 7.858732223510742,
"learning_rate": 5.141122913505312e-06,
"loss": 0.2762,
"step": 1775
},
{
"epoch": 0.5130791957915976,
"grad_norm": 5.898010730743408,
"learning_rate": 5.125948406676784e-06,
"loss": 0.2777,
"step": 1780
},
{
"epoch": 0.5145204294876414,
"grad_norm": 2.817342758178711,
"learning_rate": 5.110773899848256e-06,
"loss": 0.2631,
"step": 1785
},
{
"epoch": 0.5159616631836852,
"grad_norm": 3.927349328994751,
"learning_rate": 5.095599393019726e-06,
"loss": 0.225,
"step": 1790
},
{
"epoch": 0.5174028968797291,
"grad_norm": 9.196603775024414,
"learning_rate": 5.080424886191199e-06,
"loss": 0.2591,
"step": 1795
},
{
"epoch": 0.5188441305757728,
"grad_norm": 2.575307607650757,
"learning_rate": 5.065250379362671e-06,
"loss": 0.2596,
"step": 1800
},
{
"epoch": 0.5188441305757728,
"eval_loss": 0.22148238122463226,
"eval_mse": 0.2214823840931058,
"eval_runtime": 3.4856,
"eval_samples_per_second": 286.898,
"eval_steps_per_second": 18.075,
"step": 1800
},
{
"epoch": 0.5202853642718167,
"grad_norm": 4.5332350730896,
"learning_rate": 5.050075872534142e-06,
"loss": 0.2515,
"step": 1805
},
{
"epoch": 0.5217265979678605,
"grad_norm": 4.379772663116455,
"learning_rate": 5.0349013657056145e-06,
"loss": 0.2285,
"step": 1810
},
{
"epoch": 0.5231678316639043,
"grad_norm": 4.068484783172607,
"learning_rate": 5.019726858877087e-06,
"loss": 0.2309,
"step": 1815
},
{
"epoch": 0.5246090653599481,
"grad_norm": 6.4193525314331055,
"learning_rate": 5.004552352048558e-06,
"loss": 0.2637,
"step": 1820
},
{
"epoch": 0.5260502990559919,
"grad_norm": 5.752609729766846,
"learning_rate": 4.989377845220031e-06,
"loss": 0.2531,
"step": 1825
},
{
"epoch": 0.5274915327520358,
"grad_norm": 7.622284889221191,
"learning_rate": 4.974203338391503e-06,
"loss": 0.2417,
"step": 1830
},
{
"epoch": 0.5289327664480795,
"grad_norm": 3.0805881023406982,
"learning_rate": 4.959028831562975e-06,
"loss": 0.2608,
"step": 1835
},
{
"epoch": 0.5303740001441234,
"grad_norm": 6.141805648803711,
"learning_rate": 4.943854324734446e-06,
"loss": 0.2381,
"step": 1840
},
{
"epoch": 0.5318152338401672,
"grad_norm": 3.3085532188415527,
"learning_rate": 4.9286798179059185e-06,
"loss": 0.2315,
"step": 1845
},
{
"epoch": 0.533256467536211,
"grad_norm": 4.580583095550537,
"learning_rate": 4.913505311077391e-06,
"loss": 0.2621,
"step": 1850
},
{
"epoch": 0.5346977012322548,
"grad_norm": 4.834953308105469,
"learning_rate": 4.898330804248862e-06,
"loss": 0.2486,
"step": 1855
},
{
"epoch": 0.5361389349282987,
"grad_norm": 5.170509338378906,
"learning_rate": 4.8831562974203345e-06,
"loss": 0.2621,
"step": 1860
},
{
"epoch": 0.5375801686243424,
"grad_norm": 2.793675184249878,
"learning_rate": 4.867981790591807e-06,
"loss": 0.2469,
"step": 1865
},
{
"epoch": 0.5390214023203862,
"grad_norm": 11.32933521270752,
"learning_rate": 4.852807283763278e-06,
"loss": 0.2499,
"step": 1870
},
{
"epoch": 0.5404626360164301,
"grad_norm": 7.267145156860352,
"learning_rate": 4.8376327769347495e-06,
"loss": 0.2429,
"step": 1875
},
{
"epoch": 0.5419038697124738,
"grad_norm": 3.166592597961426,
"learning_rate": 4.822458270106222e-06,
"loss": 0.2465,
"step": 1880
},
{
"epoch": 0.5433451034085177,
"grad_norm": 2.622795820236206,
"learning_rate": 4.807283763277694e-06,
"loss": 0.2472,
"step": 1885
},
{
"epoch": 0.5447863371045615,
"grad_norm": 8.5530424118042,
"learning_rate": 4.792109256449165e-06,
"loss": 0.2758,
"step": 1890
},
{
"epoch": 0.5462275708006054,
"grad_norm": 3.895925521850586,
"learning_rate": 4.776934749620638e-06,
"loss": 0.2346,
"step": 1895
},
{
"epoch": 0.5476688044966491,
"grad_norm": 3.4110894203186035,
"learning_rate": 4.761760242792109e-06,
"loss": 0.244,
"step": 1900
},
{
"epoch": 0.5476688044966491,
"eval_loss": 0.22688308358192444,
"eval_mse": 0.22688308400684037,
"eval_runtime": 3.4433,
"eval_samples_per_second": 290.417,
"eval_steps_per_second": 18.296,
"step": 1900
},
{
"epoch": 0.549110038192693,
"grad_norm": 4.175052165985107,
"learning_rate": 4.746585735963581e-06,
"loss": 0.2459,
"step": 1905
},
{
"epoch": 0.5505512718887368,
"grad_norm": 4.640260219573975,
"learning_rate": 4.731411229135054e-06,
"loss": 0.2269,
"step": 1910
},
{
"epoch": 0.5519925055847805,
"grad_norm": 3.7191929817199707,
"learning_rate": 4.716236722306525e-06,
"loss": 0.2627,
"step": 1915
},
{
"epoch": 0.5534337392808244,
"grad_norm": 5.770391941070557,
"learning_rate": 4.701062215477997e-06,
"loss": 0.2484,
"step": 1920
},
{
"epoch": 0.5548749729768682,
"grad_norm": 2.703814744949341,
"learning_rate": 4.6858877086494695e-06,
"loss": 0.2298,
"step": 1925
},
{
"epoch": 0.556316206672912,
"grad_norm": 3.706900119781494,
"learning_rate": 4.670713201820941e-06,
"loss": 0.2493,
"step": 1930
},
{
"epoch": 0.5577574403689558,
"grad_norm": 2.6456151008605957,
"learning_rate": 4.655538694992413e-06,
"loss": 0.2533,
"step": 1935
},
{
"epoch": 0.5591986740649997,
"grad_norm": 5.848125457763672,
"learning_rate": 4.640364188163885e-06,
"loss": 0.2217,
"step": 1940
},
{
"epoch": 0.5606399077610434,
"grad_norm": 2.838711738586426,
"learning_rate": 4.625189681335357e-06,
"loss": 0.2255,
"step": 1945
},
{
"epoch": 0.5620811414570873,
"grad_norm": 5.126152992248535,
"learning_rate": 4.610015174506829e-06,
"loss": 0.2732,
"step": 1950
},
{
"epoch": 0.5635223751531311,
"grad_norm": 3.699435234069824,
"learning_rate": 4.5948406676783005e-06,
"loss": 0.2703,
"step": 1955
},
{
"epoch": 0.5649636088491748,
"grad_norm": 3.9639079570770264,
"learning_rate": 4.579666160849773e-06,
"loss": 0.2552,
"step": 1960
},
{
"epoch": 0.5664048425452187,
"grad_norm": 12.423652648925781,
"learning_rate": 4.564491654021245e-06,
"loss": 0.2473,
"step": 1965
},
{
"epoch": 0.5678460762412625,
"grad_norm": 9.687994003295898,
"learning_rate": 4.549317147192716e-06,
"loss": 0.2383,
"step": 1970
},
{
"epoch": 0.5692873099373064,
"grad_norm": 4.642548561096191,
"learning_rate": 4.534142640364189e-06,
"loss": 0.2444,
"step": 1975
},
{
"epoch": 0.5707285436333501,
"grad_norm": 3.047471523284912,
"learning_rate": 4.518968133535661e-06,
"loss": 0.2159,
"step": 1980
},
{
"epoch": 0.572169777329394,
"grad_norm": 5.581223011016846,
"learning_rate": 4.503793626707132e-06,
"loss": 0.2415,
"step": 1985
},
{
"epoch": 0.5736110110254378,
"grad_norm": 5.823172092437744,
"learning_rate": 4.4886191198786045e-06,
"loss": 0.2445,
"step": 1990
},
{
"epoch": 0.5750522447214816,
"grad_norm": 5.474643707275391,
"learning_rate": 4.473444613050077e-06,
"loss": 0.2253,
"step": 1995
},
{
"epoch": 0.5764934784175254,
"grad_norm": 5.476977825164795,
"learning_rate": 4.458270106221548e-06,
"loss": 0.2225,
"step": 2000
},
{
"epoch": 0.5764934784175254,
"eval_loss": 0.23420479893684387,
"eval_mse": 0.23420481357816605,
"eval_runtime": 3.5455,
"eval_samples_per_second": 282.05,
"eval_steps_per_second": 17.769,
"step": 2000
},
{
"epoch": 0.5779347121135692,
"grad_norm": 4.759982109069824,
"learning_rate": 4.44309559939302e-06,
"loss": 0.2449,
"step": 2005
},
{
"epoch": 0.579375945809613,
"grad_norm": 5.200133323669434,
"learning_rate": 4.427921092564492e-06,
"loss": 0.2382,
"step": 2010
},
{
"epoch": 0.5808171795056568,
"grad_norm": 5.744959831237793,
"learning_rate": 4.412746585735964e-06,
"loss": 0.2606,
"step": 2015
},
{
"epoch": 0.5822584132017007,
"grad_norm": 2.84411358833313,
"learning_rate": 4.397572078907436e-06,
"loss": 0.2506,
"step": 2020
},
{
"epoch": 0.5836996468977445,
"grad_norm": 3.2607600688934326,
"learning_rate": 4.382397572078908e-06,
"loss": 0.2516,
"step": 2025
},
{
"epoch": 0.5851408805937883,
"grad_norm": 2.637953519821167,
"learning_rate": 4.36722306525038e-06,
"loss": 0.2389,
"step": 2030
},
{
"epoch": 0.5865821142898321,
"grad_norm": 4.82386589050293,
"learning_rate": 4.352048558421852e-06,
"loss": 0.272,
"step": 2035
},
{
"epoch": 0.5880233479858759,
"grad_norm": 10.822226524353027,
"learning_rate": 4.336874051593324e-06,
"loss": 0.2727,
"step": 2040
},
{
"epoch": 0.5894645816819197,
"grad_norm": 4.939727783203125,
"learning_rate": 4.321699544764795e-06,
"loss": 0.2575,
"step": 2045
},
{
"epoch": 0.5909058153779635,
"grad_norm": 4.966119289398193,
"learning_rate": 4.306525037936267e-06,
"loss": 0.2289,
"step": 2050
},
{
"epoch": 0.5923470490740074,
"grad_norm": 7.144054889678955,
"learning_rate": 4.2913505311077395e-06,
"loss": 0.2519,
"step": 2055
},
{
"epoch": 0.5937882827700511,
"grad_norm": 2.4000091552734375,
"learning_rate": 4.276176024279211e-06,
"loss": 0.22,
"step": 2060
},
{
"epoch": 0.595229516466095,
"grad_norm": 3.6598424911499023,
"learning_rate": 4.261001517450683e-06,
"loss": 0.2355,
"step": 2065
},
{
"epoch": 0.5966707501621388,
"grad_norm": 5.501832485198975,
"learning_rate": 4.245827010622155e-06,
"loss": 0.2197,
"step": 2070
},
{
"epoch": 0.5981119838581826,
"grad_norm": 5.9661688804626465,
"learning_rate": 4.230652503793627e-06,
"loss": 0.2239,
"step": 2075
},
{
"epoch": 0.5995532175542264,
"grad_norm": 7.844972610473633,
"learning_rate": 4.215477996965099e-06,
"loss": 0.235,
"step": 2080
},
{
"epoch": 0.6009944512502702,
"grad_norm": 3.511651039123535,
"learning_rate": 4.2003034901365705e-06,
"loss": 0.2096,
"step": 2085
},
{
"epoch": 0.602435684946314,
"grad_norm": 13.119202613830566,
"learning_rate": 4.185128983308043e-06,
"loss": 0.2381,
"step": 2090
},
{
"epoch": 0.6038769186423578,
"grad_norm": 3.2882187366485596,
"learning_rate": 4.169954476479515e-06,
"loss": 0.2296,
"step": 2095
},
{
"epoch": 0.6053181523384017,
"grad_norm": 8.070931434631348,
"learning_rate": 4.154779969650986e-06,
"loss": 0.2475,
"step": 2100
},
{
"epoch": 0.6053181523384017,
"eval_loss": 0.24032321572303772,
"eval_mse": 0.2403232262916863,
"eval_runtime": 3.6198,
"eval_samples_per_second": 276.258,
"eval_steps_per_second": 17.404,
"step": 2100
},
{
"epoch": 0.6067593860344455,
"grad_norm": 4.749974727630615,
"learning_rate": 4.139605462822459e-06,
"loss": 0.232,
"step": 2105
},
{
"epoch": 0.6082006197304893,
"grad_norm": 10.697235107421875,
"learning_rate": 4.124430955993931e-06,
"loss": 0.2725,
"step": 2110
},
{
"epoch": 0.6096418534265331,
"grad_norm": 3.0119149684906006,
"learning_rate": 4.109256449165402e-06,
"loss": 0.2379,
"step": 2115
},
{
"epoch": 0.611083087122577,
"grad_norm": 12.935966491699219,
"learning_rate": 4.0940819423368746e-06,
"loss": 0.2514,
"step": 2120
},
{
"epoch": 0.6125243208186207,
"grad_norm": 3.34983229637146,
"learning_rate": 4.078907435508346e-06,
"loss": 0.2616,
"step": 2125
},
{
"epoch": 0.6139655545146645,
"grad_norm": 3.52375864982605,
"learning_rate": 4.063732928679818e-06,
"loss": 0.2353,
"step": 2130
},
{
"epoch": 0.6154067882107084,
"grad_norm": 2.455650806427002,
"learning_rate": 4.0485584218512905e-06,
"loss": 0.238,
"step": 2135
},
{
"epoch": 0.6168480219067521,
"grad_norm": 6.911266803741455,
"learning_rate": 4.033383915022762e-06,
"loss": 0.2177,
"step": 2140
},
{
"epoch": 0.618289255602796,
"grad_norm": 3.4753849506378174,
"learning_rate": 4.018209408194234e-06,
"loss": 0.244,
"step": 2145
},
{
"epoch": 0.6197304892988398,
"grad_norm": 4.0766520500183105,
"learning_rate": 4.003034901365706e-06,
"loss": 0.2446,
"step": 2150
},
{
"epoch": 0.6211717229948837,
"grad_norm": 3.6783783435821533,
"learning_rate": 3.987860394537178e-06,
"loss": 0.2342,
"step": 2155
},
{
"epoch": 0.6226129566909274,
"grad_norm": 4.708898544311523,
"learning_rate": 3.97268588770865e-06,
"loss": 0.2444,
"step": 2160
},
{
"epoch": 0.6240541903869713,
"grad_norm": 7.179166316986084,
"learning_rate": 3.957511380880122e-06,
"loss": 0.2562,
"step": 2165
},
{
"epoch": 0.6254954240830151,
"grad_norm": 7.478214740753174,
"learning_rate": 3.942336874051594e-06,
"loss": 0.2508,
"step": 2170
},
{
"epoch": 0.6269366577790588,
"grad_norm": 3.4198427200317383,
"learning_rate": 3.927162367223066e-06,
"loss": 0.2601,
"step": 2175
},
{
"epoch": 0.6283778914751027,
"grad_norm": 10.68290901184082,
"learning_rate": 3.911987860394537e-06,
"loss": 0.2632,
"step": 2180
},
{
"epoch": 0.6298191251711465,
"grad_norm": 3.058023452758789,
"learning_rate": 3.89681335356601e-06,
"loss": 0.226,
"step": 2185
},
{
"epoch": 0.6312603588671903,
"grad_norm": 6.483073711395264,
"learning_rate": 3.881638846737482e-06,
"loss": 0.2929,
"step": 2190
},
{
"epoch": 0.6327015925632341,
"grad_norm": 9.623044967651367,
"learning_rate": 3.866464339908953e-06,
"loss": 0.2588,
"step": 2195
},
{
"epoch": 0.634142826259278,
"grad_norm": 3.588764190673828,
"learning_rate": 3.8512898330804255e-06,
"loss": 0.253,
"step": 2200
},
{
"epoch": 0.634142826259278,
"eval_loss": 0.23261044919490814,
"eval_mse": 0.23261045941268094,
"eval_runtime": 3.5178,
"eval_samples_per_second": 284.272,
"eval_steps_per_second": 17.909,
"step": 2200
},
{
"epoch": 0.6355840599553217,
"grad_norm": 5.355119705200195,
"learning_rate": 3.836115326251897e-06,
"loss": 0.2447,
"step": 2205
},
{
"epoch": 0.6370252936513656,
"grad_norm": 4.401165008544922,
"learning_rate": 3.820940819423369e-06,
"loss": 0.2484,
"step": 2210
},
{
"epoch": 0.6384665273474094,
"grad_norm": 3.379408597946167,
"learning_rate": 3.8057663125948406e-06,
"loss": 0.2141,
"step": 2215
},
{
"epoch": 0.6399077610434531,
"grad_norm": 4.606984615325928,
"learning_rate": 3.790591805766313e-06,
"loss": 0.2521,
"step": 2220
},
{
"epoch": 0.641348994739497,
"grad_norm": 6.997488975524902,
"learning_rate": 3.7754172989377846e-06,
"loss": 0.2429,
"step": 2225
},
{
"epoch": 0.6427902284355408,
"grad_norm": 5.074779987335205,
"learning_rate": 3.7602427921092565e-06,
"loss": 0.2337,
"step": 2230
},
{
"epoch": 0.6442314621315847,
"grad_norm": 4.323925971984863,
"learning_rate": 3.7450682852807287e-06,
"loss": 0.2238,
"step": 2235
},
{
"epoch": 0.6456726958276284,
"grad_norm": 2.953834056854248,
"learning_rate": 3.7298937784522006e-06,
"loss": 0.2502,
"step": 2240
},
{
"epoch": 0.6471139295236723,
"grad_norm": 7.569371223449707,
"learning_rate": 3.7147192716236724e-06,
"loss": 0.2641,
"step": 2245
},
{
"epoch": 0.6485551632197161,
"grad_norm": 3.7635698318481445,
"learning_rate": 3.699544764795144e-06,
"loss": 0.2396,
"step": 2250
},
{
"epoch": 0.6499963969157599,
"grad_norm": 5.548856735229492,
"learning_rate": 3.6843702579666165e-06,
"loss": 0.2391,
"step": 2255
},
{
"epoch": 0.6514376306118037,
"grad_norm": 5.984725475311279,
"learning_rate": 3.6691957511380883e-06,
"loss": 0.1981,
"step": 2260
},
{
"epoch": 0.6528788643078475,
"grad_norm": 3.8673911094665527,
"learning_rate": 3.65402124430956e-06,
"loss": 0.2753,
"step": 2265
},
{
"epoch": 0.6543200980038913,
"grad_norm": 3.2690844535827637,
"learning_rate": 3.638846737481032e-06,
"loss": 0.2352,
"step": 2270
},
{
"epoch": 0.6557613316999351,
"grad_norm": 6.600918292999268,
"learning_rate": 3.623672230652504e-06,
"loss": 0.2107,
"step": 2275
},
{
"epoch": 0.657202565395979,
"grad_norm": 4.141593933105469,
"learning_rate": 3.608497723823976e-06,
"loss": 0.2366,
"step": 2280
},
{
"epoch": 0.6586437990920228,
"grad_norm": 3.730548143386841,
"learning_rate": 3.593323216995448e-06,
"loss": 0.2665,
"step": 2285
},
{
"epoch": 0.6600850327880666,
"grad_norm": 4.160298824310303,
"learning_rate": 3.57814871016692e-06,
"loss": 0.2794,
"step": 2290
},
{
"epoch": 0.6615262664841104,
"grad_norm": 3.7238965034484863,
"learning_rate": 3.562974203338392e-06,
"loss": 0.2177,
"step": 2295
},
{
"epoch": 0.6629675001801542,
"grad_norm": 3.0806283950805664,
"learning_rate": 3.5477996965098638e-06,
"loss": 0.2435,
"step": 2300
},
{
"epoch": 0.6629675001801542,
"eval_loss": 0.21606960892677307,
"eval_mse": 0.2160696000645403,
"eval_runtime": 3.6091,
"eval_samples_per_second": 277.081,
"eval_steps_per_second": 17.456,
"step": 2300
},
{
"epoch": 0.664408733876198,
"grad_norm": 2.149531126022339,
"learning_rate": 3.5326251896813356e-06,
"loss": 0.2111,
"step": 2305
},
{
"epoch": 0.6658499675722418,
"grad_norm": 3.548011541366577,
"learning_rate": 3.517450682852808e-06,
"loss": 0.2549,
"step": 2310
},
{
"epoch": 0.6672912012682857,
"grad_norm": 3.9522154331207275,
"learning_rate": 3.5022761760242797e-06,
"loss": 0.2398,
"step": 2315
},
{
"epoch": 0.6687324349643294,
"grad_norm": 5.864704132080078,
"learning_rate": 3.4871016691957515e-06,
"loss": 0.2703,
"step": 2320
},
{
"epoch": 0.6701736686603733,
"grad_norm": 2.8301804065704346,
"learning_rate": 3.4719271623672233e-06,
"loss": 0.2537,
"step": 2325
},
{
"epoch": 0.6716149023564171,
"grad_norm": 6.02512264251709,
"learning_rate": 3.4567526555386956e-06,
"loss": 0.2673,
"step": 2330
},
{
"epoch": 0.673056136052461,
"grad_norm": 5.798670768737793,
"learning_rate": 3.4415781487101674e-06,
"loss": 0.2269,
"step": 2335
},
{
"epoch": 0.6744973697485047,
"grad_norm": 3.2476484775543213,
"learning_rate": 3.4264036418816392e-06,
"loss": 0.2453,
"step": 2340
},
{
"epoch": 0.6759386034445485,
"grad_norm": 2.640549659729004,
"learning_rate": 3.4112291350531115e-06,
"loss": 0.2378,
"step": 2345
},
{
"epoch": 0.6773798371405924,
"grad_norm": 2.7928059101104736,
"learning_rate": 3.3960546282245833e-06,
"loss": 0.2269,
"step": 2350
},
{
"epoch": 0.6788210708366361,
"grad_norm": 3.8571255207061768,
"learning_rate": 3.380880121396055e-06,
"loss": 0.2241,
"step": 2355
},
{
"epoch": 0.68026230453268,
"grad_norm": 6.047236442565918,
"learning_rate": 3.365705614567527e-06,
"loss": 0.2427,
"step": 2360
},
{
"epoch": 0.6817035382287238,
"grad_norm": 2.478774070739746,
"learning_rate": 3.350531107738999e-06,
"loss": 0.2397,
"step": 2365
},
{
"epoch": 0.6831447719247676,
"grad_norm": 5.034613609313965,
"learning_rate": 3.3353566009104706e-06,
"loss": 0.2545,
"step": 2370
},
{
"epoch": 0.6845860056208114,
"grad_norm": 8.068209648132324,
"learning_rate": 3.3201820940819424e-06,
"loss": 0.2692,
"step": 2375
},
{
"epoch": 0.6860272393168553,
"grad_norm": 4.195474624633789,
"learning_rate": 3.3050075872534143e-06,
"loss": 0.2334,
"step": 2380
},
{
"epoch": 0.687468473012899,
"grad_norm": 2.976191759109497,
"learning_rate": 3.289833080424886e-06,
"loss": 0.2519,
"step": 2385
},
{
"epoch": 0.6889097067089428,
"grad_norm": 3.802194118499756,
"learning_rate": 3.2746585735963583e-06,
"loss": 0.2178,
"step": 2390
},
{
"epoch": 0.6903509404049867,
"grad_norm": 3.7784626483917236,
"learning_rate": 3.25948406676783e-06,
"loss": 0.2381,
"step": 2395
},
{
"epoch": 0.6917921741010304,
"grad_norm": 4.433982849121094,
"learning_rate": 3.244309559939302e-06,
"loss": 0.2865,
"step": 2400
},
{
"epoch": 0.6917921741010304,
"eval_loss": 0.22648707032203674,
"eval_mse": 0.2264870922613336,
"eval_runtime": 3.5396,
"eval_samples_per_second": 282.517,
"eval_steps_per_second": 17.799,
"step": 2400
},
{
"epoch": 0.6932334077970743,
"grad_norm": 6.46554708480835,
"learning_rate": 3.229135053110774e-06,
"loss": 0.2477,
"step": 2405
},
{
"epoch": 0.6946746414931181,
"grad_norm": 4.361721992492676,
"learning_rate": 3.213960546282246e-06,
"loss": 0.2319,
"step": 2410
},
{
"epoch": 0.696115875189162,
"grad_norm": 6.774047374725342,
"learning_rate": 3.198786039453718e-06,
"loss": 0.2454,
"step": 2415
},
{
"epoch": 0.6975571088852057,
"grad_norm": 8.801002502441406,
"learning_rate": 3.1836115326251897e-06,
"loss": 0.2417,
"step": 2420
},
{
"epoch": 0.6989983425812496,
"grad_norm": 3.7401864528656006,
"learning_rate": 3.168437025796662e-06,
"loss": 0.2472,
"step": 2425
},
{
"epoch": 0.7004395762772934,
"grad_norm": 4.724398136138916,
"learning_rate": 3.153262518968134e-06,
"loss": 0.2413,
"step": 2430
},
{
"epoch": 0.7018808099733371,
"grad_norm": 4.55704927444458,
"learning_rate": 3.1380880121396056e-06,
"loss": 0.2486,
"step": 2435
},
{
"epoch": 0.703322043669381,
"grad_norm": 4.389082431793213,
"learning_rate": 3.1229135053110775e-06,
"loss": 0.2163,
"step": 2440
},
{
"epoch": 0.7047632773654248,
"grad_norm": 2.8571767807006836,
"learning_rate": 3.1077389984825497e-06,
"loss": 0.2638,
"step": 2445
},
{
"epoch": 0.7062045110614686,
"grad_norm": 4.311636447906494,
"learning_rate": 3.0925644916540215e-06,
"loss": 0.234,
"step": 2450
},
{
"epoch": 0.7076457447575124,
"grad_norm": 2.979557752609253,
"learning_rate": 3.0773899848254934e-06,
"loss": 0.2644,
"step": 2455
},
{
"epoch": 0.7090869784535563,
"grad_norm": 3.3358638286590576,
"learning_rate": 3.062215477996965e-06,
"loss": 0.2606,
"step": 2460
},
{
"epoch": 0.7105282121496,
"grad_norm": 2.8190908432006836,
"learning_rate": 3.0470409711684375e-06,
"loss": 0.2195,
"step": 2465
},
{
"epoch": 0.7119694458456439,
"grad_norm": 4.225391387939453,
"learning_rate": 3.0318664643399093e-06,
"loss": 0.2236,
"step": 2470
},
{
"epoch": 0.7134106795416877,
"grad_norm": 8.34825611114502,
"learning_rate": 3.016691957511381e-06,
"loss": 0.2587,
"step": 2475
},
{
"epoch": 0.7148519132377315,
"grad_norm": 3.4229488372802734,
"learning_rate": 3.0015174506828534e-06,
"loss": 0.2353,
"step": 2480
},
{
"epoch": 0.7162931469337753,
"grad_norm": 13.701717376708984,
"learning_rate": 2.986342943854325e-06,
"loss": 0.2678,
"step": 2485
},
{
"epoch": 0.7177343806298191,
"grad_norm": 3.0285487174987793,
"learning_rate": 2.971168437025797e-06,
"loss": 0.2346,
"step": 2490
},
{
"epoch": 0.719175614325863,
"grad_norm": 6.881831645965576,
"learning_rate": 2.955993930197269e-06,
"loss": 0.2431,
"step": 2495
},
{
"epoch": 0.7206168480219067,
"grad_norm": 4.84171199798584,
"learning_rate": 2.940819423368741e-06,
"loss": 0.2351,
"step": 2500
},
{
"epoch": 0.7206168480219067,
"eval_loss": 0.23427650332450867,
"eval_mse": 0.2342765065105632,
"eval_runtime": 3.5318,
"eval_samples_per_second": 283.14,
"eval_steps_per_second": 17.838,
"step": 2500
},
{
"epoch": 0.7220580817179506,
"grad_norm": 4.6257548332214355,
"learning_rate": 2.925644916540213e-06,
"loss": 0.2287,
"step": 2505
},
{
"epoch": 0.7234993154139944,
"grad_norm": 3.9493324756622314,
"learning_rate": 2.9104704097116847e-06,
"loss": 0.2516,
"step": 2510
},
{
"epoch": 0.7249405491100382,
"grad_norm": 2.4812047481536865,
"learning_rate": 2.8952959028831566e-06,
"loss": 0.2903,
"step": 2515
},
{
"epoch": 0.726381782806082,
"grad_norm": 6.331687927246094,
"learning_rate": 2.880121396054629e-06,
"loss": 0.2332,
"step": 2520
},
{
"epoch": 0.7278230165021258,
"grad_norm": 4.676543712615967,
"learning_rate": 2.8649468892261007e-06,
"loss": 0.2508,
"step": 2525
},
{
"epoch": 0.7292642501981697,
"grad_norm": 4.514668941497803,
"learning_rate": 2.849772382397572e-06,
"loss": 0.2277,
"step": 2530
},
{
"epoch": 0.7307054838942134,
"grad_norm": 4.100338459014893,
"learning_rate": 2.834597875569044e-06,
"loss": 0.2104,
"step": 2535
},
{
"epoch": 0.7321467175902573,
"grad_norm": 4.366134166717529,
"learning_rate": 2.8194233687405157e-06,
"loss": 0.2409,
"step": 2540
},
{
"epoch": 0.733587951286301,
"grad_norm": 2.7329518795013428,
"learning_rate": 2.804248861911988e-06,
"loss": 0.2085,
"step": 2545
},
{
"epoch": 0.7350291849823449,
"grad_norm": 5.565128803253174,
"learning_rate": 2.78907435508346e-06,
"loss": 0.2284,
"step": 2550
},
{
"epoch": 0.7364704186783887,
"grad_norm": 2.5566251277923584,
"learning_rate": 2.7738998482549316e-06,
"loss": 0.2087,
"step": 2555
},
{
"epoch": 0.7379116523744326,
"grad_norm": 3.898559331893921,
"learning_rate": 2.758725341426404e-06,
"loss": 0.2541,
"step": 2560
},
{
"epoch": 0.7393528860704763,
"grad_norm": 5.346958160400391,
"learning_rate": 2.7435508345978757e-06,
"loss": 0.2322,
"step": 2565
},
{
"epoch": 0.7407941197665201,
"grad_norm": 9.296390533447266,
"learning_rate": 2.7283763277693475e-06,
"loss": 0.2522,
"step": 2570
},
{
"epoch": 0.742235353462564,
"grad_norm": 3.838529348373413,
"learning_rate": 2.7132018209408194e-06,
"loss": 0.2562,
"step": 2575
},
{
"epoch": 0.7436765871586077,
"grad_norm": 2.6012656688690186,
"learning_rate": 2.6980273141122916e-06,
"loss": 0.2637,
"step": 2580
},
{
"epoch": 0.7451178208546516,
"grad_norm": 5.522032737731934,
"learning_rate": 2.6828528072837634e-06,
"loss": 0.2537,
"step": 2585
},
{
"epoch": 0.7465590545506954,
"grad_norm": 5.43183708190918,
"learning_rate": 2.6676783004552353e-06,
"loss": 0.2322,
"step": 2590
},
{
"epoch": 0.7480002882467393,
"grad_norm": 3.3480212688446045,
"learning_rate": 2.652503793626707e-06,
"loss": 0.2349,
"step": 2595
},
{
"epoch": 0.749441521942783,
"grad_norm": 3.7614428997039795,
"learning_rate": 2.6373292867981793e-06,
"loss": 0.2582,
"step": 2600
},
{
"epoch": 0.749441521942783,
"eval_loss": 0.2341535985469818,
"eval_mse": 0.23415358681604267,
"eval_runtime": 3.5332,
"eval_samples_per_second": 283.03,
"eval_steps_per_second": 17.831,
"step": 2600
},
{
"epoch": 0.7508827556388268,
"grad_norm": 9.034743309020996,
"learning_rate": 2.622154779969651e-06,
"loss": 0.2608,
"step": 2605
},
{
"epoch": 0.7523239893348707,
"grad_norm": 7.9333062171936035,
"learning_rate": 2.606980273141123e-06,
"loss": 0.2403,
"step": 2610
},
{
"epoch": 0.7537652230309144,
"grad_norm": 2.5331106185913086,
"learning_rate": 2.5918057663125952e-06,
"loss": 0.2326,
"step": 2615
},
{
"epoch": 0.7552064567269583,
"grad_norm": 4.369472026824951,
"learning_rate": 2.576631259484067e-06,
"loss": 0.2631,
"step": 2620
},
{
"epoch": 0.7566476904230021,
"grad_norm": 4.154514789581299,
"learning_rate": 2.561456752655539e-06,
"loss": 0.222,
"step": 2625
},
{
"epoch": 0.7580889241190459,
"grad_norm": 3.586236000061035,
"learning_rate": 2.5462822458270107e-06,
"loss": 0.2319,
"step": 2630
},
{
"epoch": 0.7595301578150897,
"grad_norm": 4.088715076446533,
"learning_rate": 2.531107738998483e-06,
"loss": 0.2207,
"step": 2635
},
{
"epoch": 0.7609713915111336,
"grad_norm": 3.201357364654541,
"learning_rate": 2.515933232169955e-06,
"loss": 0.2472,
"step": 2640
},
{
"epoch": 0.7624126252071773,
"grad_norm": 11.038939476013184,
"learning_rate": 2.5007587253414266e-06,
"loss": 0.2467,
"step": 2645
},
{
"epoch": 0.7638538589032211,
"grad_norm": 2.891178846359253,
"learning_rate": 2.4855842185128985e-06,
"loss": 0.2492,
"step": 2650
},
{
"epoch": 0.765295092599265,
"grad_norm": 5.062381267547607,
"learning_rate": 2.4704097116843703e-06,
"loss": 0.2564,
"step": 2655
},
{
"epoch": 0.7667363262953087,
"grad_norm": 7.891109943389893,
"learning_rate": 2.455235204855842e-06,
"loss": 0.2163,
"step": 2660
},
{
"epoch": 0.7681775599913526,
"grad_norm": 2.734602451324463,
"learning_rate": 2.4400606980273144e-06,
"loss": 0.2392,
"step": 2665
},
{
"epoch": 0.7696187936873964,
"grad_norm": 3.1891098022460938,
"learning_rate": 2.424886191198786e-06,
"loss": 0.236,
"step": 2670
},
{
"epoch": 0.7710600273834403,
"grad_norm": 2.7162604331970215,
"learning_rate": 2.409711684370258e-06,
"loss": 0.2412,
"step": 2675
},
{
"epoch": 0.772501261079484,
"grad_norm": 9.879570960998535,
"learning_rate": 2.39453717754173e-06,
"loss": 0.2362,
"step": 2680
},
{
"epoch": 0.7739424947755279,
"grad_norm": 5.890117645263672,
"learning_rate": 2.379362670713202e-06,
"loss": 0.2489,
"step": 2685
},
{
"epoch": 0.7753837284715717,
"grad_norm": 4.443430423736572,
"learning_rate": 2.364188163884674e-06,
"loss": 0.2402,
"step": 2690
},
{
"epoch": 0.7768249621676154,
"grad_norm": 8.724933624267578,
"learning_rate": 2.3490136570561458e-06,
"loss": 0.2343,
"step": 2695
},
{
"epoch": 0.7782661958636593,
"grad_norm": 5.362820148468018,
"learning_rate": 2.333839150227618e-06,
"loss": 0.2167,
"step": 2700
},
{
"epoch": 0.7782661958636593,
"eval_loss": 0.23367303609848022,
"eval_mse": 0.23367304604459788,
"eval_runtime": 3.4711,
"eval_samples_per_second": 288.093,
"eval_steps_per_second": 18.15,
"step": 2700
},
{
"epoch": 0.7797074295597031,
"grad_norm": 4.84460973739624,
"learning_rate": 2.31866464339909e-06,
"loss": 0.2462,
"step": 2705
},
{
"epoch": 0.781148663255747,
"grad_norm": 3.910109043121338,
"learning_rate": 2.3034901365705617e-06,
"loss": 0.2238,
"step": 2710
},
{
"epoch": 0.7825898969517907,
"grad_norm": 5.745606422424316,
"learning_rate": 2.2883156297420335e-06,
"loss": 0.2653,
"step": 2715
},
{
"epoch": 0.7840311306478346,
"grad_norm": 3.182253837585449,
"learning_rate": 2.2731411229135057e-06,
"loss": 0.2527,
"step": 2720
},
{
"epoch": 0.7854723643438783,
"grad_norm": 5.271828651428223,
"learning_rate": 2.2579666160849776e-06,
"loss": 0.2354,
"step": 2725
},
{
"epoch": 0.7869135980399222,
"grad_norm": 10.707908630371094,
"learning_rate": 2.2427921092564494e-06,
"loss": 0.2249,
"step": 2730
},
{
"epoch": 0.788354831735966,
"grad_norm": 8.2650785446167,
"learning_rate": 2.2276176024279212e-06,
"loss": 0.2293,
"step": 2735
},
{
"epoch": 0.7897960654320098,
"grad_norm": 5.198644161224365,
"learning_rate": 2.212443095599393e-06,
"loss": 0.2669,
"step": 2740
},
{
"epoch": 0.7912372991280536,
"grad_norm": 3.3063764572143555,
"learning_rate": 2.197268588770865e-06,
"loss": 0.2177,
"step": 2745
},
{
"epoch": 0.7926785328240974,
"grad_norm": 7.53934383392334,
"learning_rate": 2.182094081942337e-06,
"loss": 0.2339,
"step": 2750
},
{
"epoch": 0.7941197665201413,
"grad_norm": 2.6870410442352295,
"learning_rate": 2.166919575113809e-06,
"loss": 0.2357,
"step": 2755
},
{
"epoch": 0.795561000216185,
"grad_norm": 6.6113810539245605,
"learning_rate": 2.1517450682852808e-06,
"loss": 0.2345,
"step": 2760
},
{
"epoch": 0.7970022339122289,
"grad_norm": 6.589913368225098,
"learning_rate": 2.1365705614567526e-06,
"loss": 0.2586,
"step": 2765
},
{
"epoch": 0.7984434676082727,
"grad_norm": 3.827324867248535,
"learning_rate": 2.121396054628225e-06,
"loss": 0.2378,
"step": 2770
},
{
"epoch": 0.7998847013043165,
"grad_norm": 4.446374893188477,
"learning_rate": 2.1062215477996967e-06,
"loss": 0.2181,
"step": 2775
},
{
"epoch": 0.8013259350003603,
"grad_norm": 3.70227313041687,
"learning_rate": 2.0910470409711685e-06,
"loss": 0.2358,
"step": 2780
},
{
"epoch": 0.8027671686964041,
"grad_norm": 3.6692309379577637,
"learning_rate": 2.0758725341426408e-06,
"loss": 0.2394,
"step": 2785
},
{
"epoch": 0.804208402392448,
"grad_norm": 3.3776164054870605,
"learning_rate": 2.0606980273141126e-06,
"loss": 0.2564,
"step": 2790
},
{
"epoch": 0.8056496360884917,
"grad_norm": 3.1166326999664307,
"learning_rate": 2.0455235204855844e-06,
"loss": 0.2523,
"step": 2795
},
{
"epoch": 0.8070908697845356,
"grad_norm": 5.939455986022949,
"learning_rate": 2.0303490136570563e-06,
"loss": 0.2495,
"step": 2800
},
{
"epoch": 0.8070908697845356,
"eval_loss": 0.22727453708648682,
"eval_mse": 0.22727454181946813,
"eval_runtime": 3.4934,
"eval_samples_per_second": 286.258,
"eval_steps_per_second": 18.034,
"step": 2800
},
{
"epoch": 0.8085321034805794,
"grad_norm": 5.415030002593994,
"learning_rate": 2.0151745068285285e-06,
"loss": 0.2211,
"step": 2805
},
{
"epoch": 0.8099733371766232,
"grad_norm": 7.75388240814209,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2482,
"step": 2810
},
{
"epoch": 0.811414570872667,
"grad_norm": 4.087270736694336,
"learning_rate": 1.984825493171472e-06,
"loss": 0.2305,
"step": 2815
},
{
"epoch": 0.8128558045687109,
"grad_norm": 5.57621955871582,
"learning_rate": 1.969650986342944e-06,
"loss": 0.2371,
"step": 2820
},
{
"epoch": 0.8142970382647546,
"grad_norm": 3.64129638671875,
"learning_rate": 1.954476479514416e-06,
"loss": 0.2188,
"step": 2825
},
{
"epoch": 0.8157382719607984,
"grad_norm": 5.807697772979736,
"learning_rate": 1.9393019726858876e-06,
"loss": 0.2417,
"step": 2830
},
{
"epoch": 0.8171795056568423,
"grad_norm": 3.478083610534668,
"learning_rate": 1.92412746585736e-06,
"loss": 0.2443,
"step": 2835
},
{
"epoch": 0.818620739352886,
"grad_norm": 4.476086139678955,
"learning_rate": 1.9089529590288317e-06,
"loss": 0.2196,
"step": 2840
},
{
"epoch": 0.8200619730489299,
"grad_norm": 3.3738081455230713,
"learning_rate": 1.8937784522003035e-06,
"loss": 0.222,
"step": 2845
},
{
"epoch": 0.8215032067449737,
"grad_norm": 6.0412163734436035,
"learning_rate": 1.8786039453717756e-06,
"loss": 0.2297,
"step": 2850
},
{
"epoch": 0.8229444404410176,
"grad_norm": 4.279974937438965,
"learning_rate": 1.8634294385432474e-06,
"loss": 0.2597,
"step": 2855
},
{
"epoch": 0.8243856741370613,
"grad_norm": 7.096454620361328,
"learning_rate": 1.8482549317147195e-06,
"loss": 0.2466,
"step": 2860
},
{
"epoch": 0.8258269078331051,
"grad_norm": 2.768843412399292,
"learning_rate": 1.8330804248861913e-06,
"loss": 0.2363,
"step": 2865
},
{
"epoch": 0.827268141529149,
"grad_norm": 4.103148460388184,
"learning_rate": 1.8179059180576633e-06,
"loss": 0.2451,
"step": 2870
},
{
"epoch": 0.8287093752251927,
"grad_norm": 6.537170886993408,
"learning_rate": 1.8027314112291352e-06,
"loss": 0.2394,
"step": 2875
},
{
"epoch": 0.8301506089212366,
"grad_norm": 3.861488103866577,
"learning_rate": 1.7875569044006072e-06,
"loss": 0.2407,
"step": 2880
},
{
"epoch": 0.8315918426172804,
"grad_norm": 4.997881889343262,
"learning_rate": 1.7723823975720792e-06,
"loss": 0.2369,
"step": 2885
},
{
"epoch": 0.8330330763133242,
"grad_norm": 3.4017415046691895,
"learning_rate": 1.757207890743551e-06,
"loss": 0.2272,
"step": 2890
},
{
"epoch": 0.834474310009368,
"grad_norm": 3.679633140563965,
"learning_rate": 1.742033383915023e-06,
"loss": 0.223,
"step": 2895
},
{
"epoch": 0.8359155437054119,
"grad_norm": 3.0705554485321045,
"learning_rate": 1.726858877086495e-06,
"loss": 0.2364,
"step": 2900
},
{
"epoch": 0.8359155437054119,
"eval_loss": 0.22982758283615112,
"eval_mse": 0.229827576600248,
"eval_runtime": 3.5078,
"eval_samples_per_second": 285.082,
"eval_steps_per_second": 17.96,
"step": 2900
},
{
"epoch": 0.8373567774014556,
"grad_norm": 3.1811139583587646,
"learning_rate": 1.7116843702579665e-06,
"loss": 0.2422,
"step": 2905
},
{
"epoch": 0.8387980110974994,
"grad_norm": 2.974726915359497,
"learning_rate": 1.6965098634294386e-06,
"loss": 0.2405,
"step": 2910
},
{
"epoch": 0.8402392447935433,
"grad_norm": 9.266916275024414,
"learning_rate": 1.6813353566009106e-06,
"loss": 0.2312,
"step": 2915
},
{
"epoch": 0.841680478489587,
"grad_norm": 6.627812385559082,
"learning_rate": 1.6661608497723824e-06,
"loss": 0.2605,
"step": 2920
},
{
"epoch": 0.8431217121856309,
"grad_norm": 3.63108491897583,
"learning_rate": 1.6509863429438545e-06,
"loss": 0.197,
"step": 2925
},
{
"epoch": 0.8445629458816747,
"grad_norm": 3.545898914337158,
"learning_rate": 1.6358118361153263e-06,
"loss": 0.231,
"step": 2930
},
{
"epoch": 0.8460041795777186,
"grad_norm": 4.493469715118408,
"learning_rate": 1.6206373292867984e-06,
"loss": 0.2516,
"step": 2935
},
{
"epoch": 0.8474454132737623,
"grad_norm": 3.4782025814056396,
"learning_rate": 1.6054628224582702e-06,
"loss": 0.2234,
"step": 2940
},
{
"epoch": 0.8488866469698062,
"grad_norm": 3.453437566757202,
"learning_rate": 1.5902883156297422e-06,
"loss": 0.2201,
"step": 2945
},
{
"epoch": 0.85032788066585,
"grad_norm": 3.0764260292053223,
"learning_rate": 1.575113808801214e-06,
"loss": 0.2397,
"step": 2950
},
{
"epoch": 0.8517691143618937,
"grad_norm": 4.054821014404297,
"learning_rate": 1.559939301972686e-06,
"loss": 0.2319,
"step": 2955
},
{
"epoch": 0.8532103480579376,
"grad_norm": 4.305934429168701,
"learning_rate": 1.544764795144158e-06,
"loss": 0.257,
"step": 2960
},
{
"epoch": 0.8546515817539814,
"grad_norm": 4.810431480407715,
"learning_rate": 1.52959028831563e-06,
"loss": 0.229,
"step": 2965
},
{
"epoch": 0.8560928154500252,
"grad_norm": 3.385409116744995,
"learning_rate": 1.514415781487102e-06,
"loss": 0.2382,
"step": 2970
},
{
"epoch": 0.857534049146069,
"grad_norm": 3.6940057277679443,
"learning_rate": 1.4992412746585738e-06,
"loss": 0.2206,
"step": 2975
},
{
"epoch": 0.8589752828421129,
"grad_norm": 4.147489547729492,
"learning_rate": 1.4840667678300459e-06,
"loss": 0.2429,
"step": 2980
},
{
"epoch": 0.8604165165381567,
"grad_norm": 2.8008570671081543,
"learning_rate": 1.4688922610015175e-06,
"loss": 0.229,
"step": 2985
},
{
"epoch": 0.8618577502342005,
"grad_norm": 3.3151803016662598,
"learning_rate": 1.4537177541729893e-06,
"loss": 0.2556,
"step": 2990
},
{
"epoch": 0.8632989839302443,
"grad_norm": 3.3559606075286865,
"learning_rate": 1.4385432473444613e-06,
"loss": 0.2048,
"step": 2995
},
{
"epoch": 0.864740217626288,
"grad_norm": 7.827072620391846,
"learning_rate": 1.4233687405159332e-06,
"loss": 0.2236,
"step": 3000
},
{
"epoch": 0.864740217626288,
"eval_loss": 0.21699398756027222,
"eval_mse": 0.21699400277157838,
"eval_runtime": 3.6477,
"eval_samples_per_second": 274.144,
"eval_steps_per_second": 17.271,
"step": 3000
},
{
"epoch": 0.8661814513223319,
"grad_norm": 4.992173194885254,
"learning_rate": 1.4081942336874052e-06,
"loss": 0.2542,
"step": 3005
},
{
"epoch": 0.8676226850183757,
"grad_norm": 3.356741428375244,
"learning_rate": 1.3930197268588772e-06,
"loss": 0.2368,
"step": 3010
},
{
"epoch": 0.8690639187144196,
"grad_norm": 5.88596773147583,
"learning_rate": 1.377845220030349e-06,
"loss": 0.2497,
"step": 3015
},
{
"epoch": 0.8705051524104633,
"grad_norm": 4.098241806030273,
"learning_rate": 1.3626707132018211e-06,
"loss": 0.2639,
"step": 3020
},
{
"epoch": 0.8719463861065072,
"grad_norm": 5.0098347663879395,
"learning_rate": 1.347496206373293e-06,
"loss": 0.2341,
"step": 3025
},
{
"epoch": 0.873387619802551,
"grad_norm": 4.149919033050537,
"learning_rate": 1.332321699544765e-06,
"loss": 0.2148,
"step": 3030
},
{
"epoch": 0.8748288534985948,
"grad_norm": 3.0215213298797607,
"learning_rate": 1.3171471927162368e-06,
"loss": 0.2426,
"step": 3035
},
{
"epoch": 0.8762700871946386,
"grad_norm": 3.6104202270507812,
"learning_rate": 1.3019726858877088e-06,
"loss": 0.2096,
"step": 3040
},
{
"epoch": 0.8777113208906824,
"grad_norm": 6.203906059265137,
"learning_rate": 1.2867981790591807e-06,
"loss": 0.2306,
"step": 3045
},
{
"epoch": 0.8791525545867263,
"grad_norm": 9.710282325744629,
"learning_rate": 1.2716236722306527e-06,
"loss": 0.2262,
"step": 3050
},
{
"epoch": 0.88059378828277,
"grad_norm": 6.790435791015625,
"learning_rate": 1.2564491654021245e-06,
"loss": 0.2329,
"step": 3055
},
{
"epoch": 0.8820350219788139,
"grad_norm": 5.565480709075928,
"learning_rate": 1.2412746585735964e-06,
"loss": 0.2614,
"step": 3060
},
{
"epoch": 0.8834762556748577,
"grad_norm": 3.470287322998047,
"learning_rate": 1.2261001517450684e-06,
"loss": 0.2437,
"step": 3065
},
{
"epoch": 0.8849174893709015,
"grad_norm": 3.8940699100494385,
"learning_rate": 1.2109256449165402e-06,
"loss": 0.2458,
"step": 3070
},
{
"epoch": 0.8863587230669453,
"grad_norm": 3.414523124694824,
"learning_rate": 1.1957511380880123e-06,
"loss": 0.2401,
"step": 3075
},
{
"epoch": 0.8877999567629892,
"grad_norm": 3.306699752807617,
"learning_rate": 1.1805766312594843e-06,
"loss": 0.266,
"step": 3080
},
{
"epoch": 0.8892411904590329,
"grad_norm": 5.706084251403809,
"learning_rate": 1.1654021244309561e-06,
"loss": 0.2496,
"step": 3085
},
{
"epoch": 0.8906824241550767,
"grad_norm": 4.052499771118164,
"learning_rate": 1.150227617602428e-06,
"loss": 0.2488,
"step": 3090
},
{
"epoch": 0.8921236578511206,
"grad_norm": 4.791928291320801,
"learning_rate": 1.1350531107738998e-06,
"loss": 0.2431,
"step": 3095
},
{
"epoch": 0.8935648915471643,
"grad_norm": 2.6201539039611816,
"learning_rate": 1.1198786039453718e-06,
"loss": 0.231,
"step": 3100
},
{
"epoch": 0.8935648915471643,
"eval_loss": 0.22340208292007446,
"eval_mse": 0.22340208877553233,
"eval_runtime": 3.6046,
"eval_samples_per_second": 277.421,
"eval_steps_per_second": 17.477,
"step": 3100
},
{
"epoch": 0.8950061252432082,
"grad_norm": 3.4521875381469727,
"learning_rate": 1.1047040971168439e-06,
"loss": 0.2156,
"step": 3105
},
{
"epoch": 0.896447358939252,
"grad_norm": 3.851257085800171,
"learning_rate": 1.0895295902883157e-06,
"loss": 0.222,
"step": 3110
},
{
"epoch": 0.8978885926352959,
"grad_norm": 3.1783621311187744,
"learning_rate": 1.0743550834597877e-06,
"loss": 0.2571,
"step": 3115
},
{
"epoch": 0.8993298263313396,
"grad_norm": 6.458141326904297,
"learning_rate": 1.0591805766312596e-06,
"loss": 0.2194,
"step": 3120
},
{
"epoch": 0.9007710600273834,
"grad_norm": 6.427337169647217,
"learning_rate": 1.0440060698027316e-06,
"loss": 0.2468,
"step": 3125
},
{
"epoch": 0.9022122937234273,
"grad_norm": 3.6548845767974854,
"learning_rate": 1.0288315629742034e-06,
"loss": 0.2646,
"step": 3130
},
{
"epoch": 0.903653527419471,
"grad_norm": 10.87209415435791,
"learning_rate": 1.0136570561456753e-06,
"loss": 0.2473,
"step": 3135
},
{
"epoch": 0.9050947611155149,
"grad_norm": 3.511836290359497,
"learning_rate": 9.984825493171473e-07,
"loss": 0.211,
"step": 3140
},
{
"epoch": 0.9065359948115587,
"grad_norm": 6.754063129425049,
"learning_rate": 9.833080424886191e-07,
"loss": 0.2296,
"step": 3145
},
{
"epoch": 0.9079772285076025,
"grad_norm": 3.369685173034668,
"learning_rate": 9.681335356600912e-07,
"loss": 0.2473,
"step": 3150
},
{
"epoch": 0.9094184622036463,
"grad_norm": 3.534219741821289,
"learning_rate": 9.529590288315631e-07,
"loss": 0.2478,
"step": 3155
},
{
"epoch": 0.9108596958996902,
"grad_norm": 2.9221699237823486,
"learning_rate": 9.37784522003035e-07,
"loss": 0.2464,
"step": 3160
},
{
"epoch": 0.912300929595734,
"grad_norm": 3.83062481880188,
"learning_rate": 9.22610015174507e-07,
"loss": 0.2334,
"step": 3165
},
{
"epoch": 0.9137421632917777,
"grad_norm": 2.8956944942474365,
"learning_rate": 9.074355083459788e-07,
"loss": 0.2328,
"step": 3170
},
{
"epoch": 0.9151833969878216,
"grad_norm": 5.482194900512695,
"learning_rate": 8.922610015174507e-07,
"loss": 0.2301,
"step": 3175
},
{
"epoch": 0.9166246306838653,
"grad_norm": 4.709765434265137,
"learning_rate": 8.770864946889227e-07,
"loss": 0.2486,
"step": 3180
},
{
"epoch": 0.9180658643799092,
"grad_norm": 4.38163948059082,
"learning_rate": 8.619119878603946e-07,
"loss": 0.2463,
"step": 3185
},
{
"epoch": 0.919507098075953,
"grad_norm": 8.512566566467285,
"learning_rate": 8.467374810318665e-07,
"loss": 0.2662,
"step": 3190
},
{
"epoch": 0.9209483317719969,
"grad_norm": 4.098446369171143,
"learning_rate": 8.315629742033385e-07,
"loss": 0.2466,
"step": 3195
},
{
"epoch": 0.9223895654680406,
"grad_norm": 3.8778812885284424,
"learning_rate": 8.163884673748104e-07,
"loss": 0.2474,
"step": 3200
},
{
"epoch": 0.9223895654680406,
"eval_loss": 0.22270123660564423,
"eval_mse": 0.2227012378773652,
"eval_runtime": 3.5199,
"eval_samples_per_second": 284.101,
"eval_steps_per_second": 17.898,
"step": 3200
},
{
"epoch": 0.9238307991640845,
"grad_norm": 4.652983665466309,
"learning_rate": 8.012139605462823e-07,
"loss": 0.2426,
"step": 3205
},
{
"epoch": 0.9252720328601283,
"grad_norm": 2.9939351081848145,
"learning_rate": 7.860394537177542e-07,
"loss": 0.2258,
"step": 3210
},
{
"epoch": 0.926713266556172,
"grad_norm": 3.162224531173706,
"learning_rate": 7.708649468892261e-07,
"loss": 0.2489,
"step": 3215
},
{
"epoch": 0.9281545002522159,
"grad_norm": 6.1820807456970215,
"learning_rate": 7.55690440060698e-07,
"loss": 0.244,
"step": 3220
},
{
"epoch": 0.9295957339482597,
"grad_norm": 3.0030033588409424,
"learning_rate": 7.4051593323217e-07,
"loss": 0.2482,
"step": 3225
},
{
"epoch": 0.9310369676443035,
"grad_norm": 2.886375904083252,
"learning_rate": 7.253414264036419e-07,
"loss": 0.2329,
"step": 3230
},
{
"epoch": 0.9324782013403473,
"grad_norm": 4.034816741943359,
"learning_rate": 7.101669195751138e-07,
"loss": 0.2241,
"step": 3235
},
{
"epoch": 0.9339194350363912,
"grad_norm": 3.043692111968994,
"learning_rate": 6.949924127465859e-07,
"loss": 0.2271,
"step": 3240
},
{
"epoch": 0.935360668732435,
"grad_norm": 3.8499319553375244,
"learning_rate": 6.798179059180578e-07,
"loss": 0.237,
"step": 3245
},
{
"epoch": 0.9368019024284788,
"grad_norm": 5.247629165649414,
"learning_rate": 6.646433990895297e-07,
"loss": 0.2495,
"step": 3250
},
{
"epoch": 0.9382431361245226,
"grad_norm": 2.750441312789917,
"learning_rate": 6.494688922610016e-07,
"loss": 0.2398,
"step": 3255
},
{
"epoch": 0.9396843698205664,
"grad_norm": 3.8449652194976807,
"learning_rate": 6.342943854324735e-07,
"loss": 0.2387,
"step": 3260
},
{
"epoch": 0.9411256035166102,
"grad_norm": 3.128649950027466,
"learning_rate": 6.191198786039454e-07,
"loss": 0.2164,
"step": 3265
},
{
"epoch": 0.942566837212654,
"grad_norm": 7.363992691040039,
"learning_rate": 6.039453717754174e-07,
"loss": 0.2351,
"step": 3270
},
{
"epoch": 0.9440080709086979,
"grad_norm": 4.601086616516113,
"learning_rate": 5.887708649468893e-07,
"loss": 0.2237,
"step": 3275
},
{
"epoch": 0.9454493046047416,
"grad_norm": 5.33753776550293,
"learning_rate": 5.735963581183612e-07,
"loss": 0.2281,
"step": 3280
},
{
"epoch": 0.9468905383007855,
"grad_norm": 3.417290449142456,
"learning_rate": 5.584218512898331e-07,
"loss": 0.2377,
"step": 3285
},
{
"epoch": 0.9483317719968293,
"grad_norm": 3.4570603370666504,
"learning_rate": 5.43247344461305e-07,
"loss": 0.2203,
"step": 3290
},
{
"epoch": 0.9497730056928732,
"grad_norm": 2.8994665145874023,
"learning_rate": 5.28072837632777e-07,
"loss": 0.2236,
"step": 3295
},
{
"epoch": 0.9512142393889169,
"grad_norm": 3.222146511077881,
"learning_rate": 5.12898330804249e-07,
"loss": 0.2333,
"step": 3300
},
{
"epoch": 0.9512142393889169,
"eval_loss": 0.2240542769432068,
"eval_mse": 0.22405428479570036,
"eval_runtime": 3.5952,
"eval_samples_per_second": 278.147,
"eval_steps_per_second": 17.523,
"step": 3300
},
{
"epoch": 0.9526554730849607,
"grad_norm": 3.720475673675537,
"learning_rate": 4.977238239757208e-07,
"loss": 0.2587,
"step": 3305
},
{
"epoch": 0.9540967067810046,
"grad_norm": 2.5176913738250732,
"learning_rate": 4.825493171471927e-07,
"loss": 0.2155,
"step": 3310
},
{
"epoch": 0.9555379404770483,
"grad_norm": 4.59359884262085,
"learning_rate": 4.673748103186647e-07,
"loss": 0.2151,
"step": 3315
},
{
"epoch": 0.9569791741730922,
"grad_norm": 5.00642728805542,
"learning_rate": 4.5220030349013665e-07,
"loss": 0.215,
"step": 3320
},
{
"epoch": 0.958420407869136,
"grad_norm": 3.947004556655884,
"learning_rate": 4.3702579666160853e-07,
"loss": 0.2259,
"step": 3325
},
{
"epoch": 0.9598616415651798,
"grad_norm": 7.238663196563721,
"learning_rate": 4.2185128983308046e-07,
"loss": 0.2444,
"step": 3330
},
{
"epoch": 0.9613028752612236,
"grad_norm": 5.583510398864746,
"learning_rate": 4.066767830045524e-07,
"loss": 0.2388,
"step": 3335
},
{
"epoch": 0.9627441089572675,
"grad_norm": 4.246332168579102,
"learning_rate": 3.9150227617602433e-07,
"loss": 0.2343,
"step": 3340
},
{
"epoch": 0.9641853426533112,
"grad_norm": 2.9725794792175293,
"learning_rate": 3.763277693474962e-07,
"loss": 0.2356,
"step": 3345
},
{
"epoch": 0.965626576349355,
"grad_norm": 3.7293708324432373,
"learning_rate": 3.6115326251896814e-07,
"loss": 0.2317,
"step": 3350
},
{
"epoch": 0.9670678100453989,
"grad_norm": 3.179847002029419,
"learning_rate": 3.459787556904401e-07,
"loss": 0.1998,
"step": 3355
},
{
"epoch": 0.9685090437414426,
"grad_norm": 7.184839725494385,
"learning_rate": 3.3080424886191206e-07,
"loss": 0.2042,
"step": 3360
},
{
"epoch": 0.9699502774374865,
"grad_norm": 3.559626817703247,
"learning_rate": 3.156297420333839e-07,
"loss": 0.2339,
"step": 3365
},
{
"epoch": 0.9713915111335303,
"grad_norm": 8.27930736541748,
"learning_rate": 3.004552352048559e-07,
"loss": 0.2266,
"step": 3370
},
{
"epoch": 0.9728327448295742,
"grad_norm": 9.843157768249512,
"learning_rate": 2.852807283763278e-07,
"loss": 0.2377,
"step": 3375
},
{
"epoch": 0.9742739785256179,
"grad_norm": 4.575967311859131,
"learning_rate": 2.701062215477997e-07,
"loss": 0.229,
"step": 3380
},
{
"epoch": 0.9757152122216617,
"grad_norm": 6.101067543029785,
"learning_rate": 2.549317147192716e-07,
"loss": 0.237,
"step": 3385
},
{
"epoch": 0.9771564459177056,
"grad_norm": 5.833640098571777,
"learning_rate": 2.3975720789074356e-07,
"loss": 0.2441,
"step": 3390
},
{
"epoch": 0.9785976796137493,
"grad_norm": 6.027054309844971,
"learning_rate": 2.245827010622155e-07,
"loss": 0.2194,
"step": 3395
},
{
"epoch": 0.9800389133097932,
"grad_norm": 4.420630931854248,
"learning_rate": 2.0940819423368745e-07,
"loss": 0.2265,
"step": 3400
},
{
"epoch": 0.9800389133097932,
"eval_loss": 0.21972429752349854,
"eval_mse": 0.2197243231460452,
"eval_runtime": 3.6053,
"eval_samples_per_second": 277.37,
"eval_steps_per_second": 17.474,
"step": 3400
},
{
"epoch": 0.981480147005837,
"grad_norm": 5.711348056793213,
"learning_rate": 1.9423368740515936e-07,
"loss": 0.2362,
"step": 3405
},
{
"epoch": 0.9829213807018808,
"grad_norm": 4.06050968170166,
"learning_rate": 1.790591805766313e-07,
"loss": 0.2623,
"step": 3410
},
{
"epoch": 0.9843626143979246,
"grad_norm": 3.5431113243103027,
"learning_rate": 1.638846737481032e-07,
"loss": 0.241,
"step": 3415
},
{
"epoch": 0.9858038480939685,
"grad_norm": 7.887821674346924,
"learning_rate": 1.4871016691957513e-07,
"loss": 0.2662,
"step": 3420
},
{
"epoch": 0.9872450817900122,
"grad_norm": 3.6304845809936523,
"learning_rate": 1.3353566009104704e-07,
"loss": 0.2154,
"step": 3425
},
{
"epoch": 0.988686315486056,
"grad_norm": 2.7374932765960693,
"learning_rate": 1.1836115326251897e-07,
"loss": 0.2107,
"step": 3430
},
{
"epoch": 0.9901275491820999,
"grad_norm": 3.0526680946350098,
"learning_rate": 1.031866464339909e-07,
"loss": 0.2426,
"step": 3435
},
{
"epoch": 0.9915687828781437,
"grad_norm": 4.16213321685791,
"learning_rate": 8.801213960546283e-08,
"loss": 0.2483,
"step": 3440
},
{
"epoch": 0.9930100165741875,
"grad_norm": 2.5456955432891846,
"learning_rate": 7.283763277693476e-08,
"loss": 0.199,
"step": 3445
},
{
"epoch": 0.9944512502702313,
"grad_norm": 3.8972580432891846,
"learning_rate": 5.7663125948406686e-08,
"loss": 0.2504,
"step": 3450
},
{
"epoch": 0.9958924839662752,
"grad_norm": 4.49379301071167,
"learning_rate": 4.248861911987861e-08,
"loss": 0.2226,
"step": 3455
},
{
"epoch": 0.9973337176623189,
"grad_norm": 3.561121702194214,
"learning_rate": 2.7314112291350533e-08,
"loss": 0.2555,
"step": 3460
},
{
"epoch": 0.9987749513583628,
"grad_norm": 4.0361647605896,
"learning_rate": 1.213960546282246e-08,
"loss": 0.2121,
"step": 3465
},
{
"epoch": 0.9999279383151978,
"step": 3469,
"total_flos": 5.881871499303322e+16,
"train_loss": 0.2843814373497456,
"train_runtime": 1907.7343,
"train_samples_per_second": 232.764,
"train_steps_per_second": 1.818
}
],
"logging_steps": 5,
"max_steps": 3469,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.881871499303322e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}