gemma-7b-tr-instruct-test-lora / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9929408114188193,
"eval_steps": 500,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 20.729074478149414,
"learning_rate": 2.5e-06,
"loss": 8.0612,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 16.320600509643555,
"learning_rate": 5e-06,
"loss": 7.3007,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 17.508378982543945,
"learning_rate": 7.5e-06,
"loss": 7.7541,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 18.7609920501709,
"learning_rate": 1e-05,
"loss": 7.0762,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 10.039741516113281,
"learning_rate": 1.25e-05,
"loss": 6.3794,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 10.681583404541016,
"learning_rate": 1.5e-05,
"loss": 5.7463,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 8.521218299865723,
"learning_rate": 1.75e-05,
"loss": 5.1425,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 8.024609565734863,
"learning_rate": 2e-05,
"loss": 4.8565,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 6.419050216674805,
"learning_rate": 2.25e-05,
"loss": 4.4552,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": 7.1052398681640625,
"learning_rate": 2.5e-05,
"loss": 4.1432,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 7.79315710067749,
"learning_rate": 2.7500000000000004e-05,
"loss": 3.9919,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 5.008393287658691,
"learning_rate": 3e-05,
"loss": 3.3339,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 8.750615119934082,
"learning_rate": 3.2500000000000004e-05,
"loss": 3.3154,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 5.283076286315918,
"learning_rate": 3.5e-05,
"loss": 2.8296,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 6.005578517913818,
"learning_rate": 3.7500000000000003e-05,
"loss": 2.8239,
"step": 75
},
{
"epoch": 0.02,
"grad_norm": 7.009499549865723,
"learning_rate": 4e-05,
"loss": 3.0532,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 5.712557315826416,
"learning_rate": 4.25e-05,
"loss": 2.8819,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": 4.914234638214111,
"learning_rate": 4.5e-05,
"loss": 2.8031,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 7.396793842315674,
"learning_rate": 4.75e-05,
"loss": 2.6904,
"step": 95
},
{
"epoch": 0.03,
"grad_norm": 5.087535381317139,
"learning_rate": 5e-05,
"loss": 2.772,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 6.230583190917969,
"learning_rate": 4.9999683566063894e-05,
"loss": 2.6301,
"step": 105
},
{
"epoch": 0.03,
"grad_norm": 4.741369724273682,
"learning_rate": 4.9998734272266e-05,
"loss": 2.5966,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 4.758203506469727,
"learning_rate": 4.9997152142637426e-05,
"loss": 2.4406,
"step": 115
},
{
"epoch": 0.04,
"grad_norm": 4.093080997467041,
"learning_rate": 4.999493721722933e-05,
"loss": 2.6457,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 5.253550052642822,
"learning_rate": 4.999208955211192e-05,
"loss": 2.5449,
"step": 125
},
{
"epoch": 0.04,
"grad_norm": 5.3556294441223145,
"learning_rate": 4.998860921937302e-05,
"loss": 2.5182,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 3.888378620147705,
"learning_rate": 4.998449630711627e-05,
"loss": 2.6575,
"step": 135
},
{
"epoch": 0.04,
"grad_norm": 4.9733967781066895,
"learning_rate": 4.997975091945886e-05,
"loss": 2.5669,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 3.3941574096679688,
"learning_rate": 4.997437317652894e-05,
"loss": 2.5628,
"step": 145
},
{
"epoch": 0.05,
"grad_norm": 3.743703842163086,
"learning_rate": 4.996836321446253e-05,
"loss": 2.6051,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 3.359017848968506,
"learning_rate": 4.99617211854001e-05,
"loss": 2.2357,
"step": 155
},
{
"epoch": 0.05,
"grad_norm": 4.703392028808594,
"learning_rate": 4.995444725748274e-05,
"loss": 2.4146,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 4.182121753692627,
"learning_rate": 4.994654161484784e-05,
"loss": 2.4228,
"step": 165
},
{
"epoch": 0.05,
"grad_norm": 4.623451232910156,
"learning_rate": 4.993800445762451e-05,
"loss": 2.4149,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 3.7832231521606445,
"learning_rate": 4.992883600192844e-05,
"loss": 2.4566,
"step": 175
},
{
"epoch": 0.06,
"grad_norm": 3.907249689102173,
"learning_rate": 4.991903647985646e-05,
"loss": 2.403,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 3.4823191165924072,
"learning_rate": 4.990860613948071e-05,
"loss": 2.518,
"step": 185
},
{
"epoch": 0.06,
"grad_norm": 4.531657695770264,
"learning_rate": 4.989754524484225e-05,
"loss": 2.4007,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 4.945577621459961,
"learning_rate": 4.988585407594449e-05,
"loss": 2.3891,
"step": 195
},
{
"epoch": 0.06,
"grad_norm": 3.9174554347991943,
"learning_rate": 4.9873532928746036e-05,
"loss": 2.2904,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 3.8385236263275146,
"learning_rate": 4.986058211515321e-05,
"loss": 2.2802,
"step": 205
},
{
"epoch": 0.07,
"grad_norm": 4.326376914978027,
"learning_rate": 4.9847001963012176e-05,
"loss": 2.295,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 5.581832408905029,
"learning_rate": 4.9832792816100605e-05,
"loss": 2.4895,
"step": 215
},
{
"epoch": 0.07,
"grad_norm": 3.5401458740234375,
"learning_rate": 4.981795503411901e-05,
"loss": 2.3254,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 4.960626125335693,
"learning_rate": 4.9802488992681594e-05,
"loss": 2.2977,
"step": 225
},
{
"epoch": 0.07,
"grad_norm": 4.908995628356934,
"learning_rate": 4.978639508330681e-05,
"loss": 2.3534,
"step": 230
},
{
"epoch": 0.07,
"grad_norm": 4.865789890289307,
"learning_rate": 4.976967371340736e-05,
"loss": 2.3781,
"step": 235
},
{
"epoch": 0.07,
"grad_norm": 4.27896785736084,
"learning_rate": 4.975232530627998e-05,
"loss": 2.3221,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 3.9018704891204834,
"learning_rate": 4.973435030109463e-05,
"loss": 2.407,
"step": 245
},
{
"epoch": 0.08,
"grad_norm": 3.4363269805908203,
"learning_rate": 4.971574915288345e-05,
"loss": 2.3857,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 4.802529335021973,
"learning_rate": 4.9696522332529205e-05,
"loss": 2.183,
"step": 255
},
{
"epoch": 0.08,
"grad_norm": 4.064101696014404,
"learning_rate": 4.967667032675337e-05,
"loss": 2.2134,
"step": 260
},
{
"epoch": 0.08,
"grad_norm": 5.066267490386963,
"learning_rate": 4.965619363810381e-05,
"loss": 2.2722,
"step": 265
},
{
"epoch": 0.08,
"grad_norm": 4.149215221405029,
"learning_rate": 4.9635092784942064e-05,
"loss": 2.3393,
"step": 270
},
{
"epoch": 0.09,
"grad_norm": 3.8846592903137207,
"learning_rate": 4.9613368301430194e-05,
"loss": 2.2163,
"step": 275
},
{
"epoch": 0.09,
"grad_norm": 4.181525230407715,
"learning_rate": 4.9591020737517335e-05,
"loss": 2.4478,
"step": 280
},
{
"epoch": 0.09,
"grad_norm": 3.1801464557647705,
"learning_rate": 4.956805065892568e-05,
"loss": 2.2887,
"step": 285
},
{
"epoch": 0.09,
"grad_norm": 5.8738250732421875,
"learning_rate": 4.954445864713622e-05,
"loss": 2.29,
"step": 290
},
{
"epoch": 0.09,
"grad_norm": 4.968664646148682,
"learning_rate": 4.9520245299374014e-05,
"loss": 2.2801,
"step": 295
},
{
"epoch": 0.09,
"grad_norm": 5.4960784912109375,
"learning_rate": 4.949541122859305e-05,
"loss": 2.3109,
"step": 300
},
{
"epoch": 0.09,
"grad_norm": 3.6677656173706055,
"learning_rate": 4.9469957063460747e-05,
"loss": 2.2748,
"step": 305
},
{
"epoch": 0.1,
"grad_norm": 2.90336275100708,
"learning_rate": 4.944388344834205e-05,
"loss": 2.2016,
"step": 310
},
{
"epoch": 0.1,
"grad_norm": 3.515296459197998,
"learning_rate": 4.9417191043283086e-05,
"loss": 2.3607,
"step": 315
},
{
"epoch": 0.1,
"grad_norm": 3.070936679840088,
"learning_rate": 4.938988052399447e-05,
"loss": 2.3314,
"step": 320
},
{
"epoch": 0.1,
"grad_norm": 3.801671028137207,
"learning_rate": 4.936195258183422e-05,
"loss": 2.2395,
"step": 325
},
{
"epoch": 0.1,
"grad_norm": 4.183629035949707,
"learning_rate": 4.933340792379023e-05,
"loss": 2.4527,
"step": 330
},
{
"epoch": 0.1,
"grad_norm": 3.9023029804229736,
"learning_rate": 4.930424727246238e-05,
"loss": 2.2828,
"step": 335
},
{
"epoch": 0.11,
"grad_norm": 3.6366467475891113,
"learning_rate": 4.927447136604424e-05,
"loss": 2.2859,
"step": 340
},
{
"epoch": 0.11,
"grad_norm": 3.219228506088257,
"learning_rate": 4.924408095830439e-05,
"loss": 2.3497,
"step": 345
},
{
"epoch": 0.11,
"grad_norm": 3.768355369567871,
"learning_rate": 4.921307681856735e-05,
"loss": 2.1229,
"step": 350
},
{
"epoch": 0.11,
"grad_norm": 3.8723647594451904,
"learning_rate": 4.9181459731694054e-05,
"loss": 2.3544,
"step": 355
},
{
"epoch": 0.11,
"grad_norm": 3.512420892715454,
"learning_rate": 4.914923049806207e-05,
"loss": 1.9489,
"step": 360
},
{
"epoch": 0.11,
"grad_norm": 3.77095627784729,
"learning_rate": 4.911638993354524e-05,
"loss": 2.2499,
"step": 365
},
{
"epoch": 0.11,
"grad_norm": 3.8103721141815186,
"learning_rate": 4.90829388694931e-05,
"loss": 2.1032,
"step": 370
},
{
"epoch": 0.12,
"grad_norm": 3.6579902172088623,
"learning_rate": 4.9048878152709785e-05,
"loss": 2.2104,
"step": 375
},
{
"epoch": 0.12,
"grad_norm": 5.087968826293945,
"learning_rate": 4.901420864543265e-05,
"loss": 2.2601,
"step": 380
},
{
"epoch": 0.12,
"grad_norm": 3.773608684539795,
"learning_rate": 4.8978931225310375e-05,
"loss": 2.1831,
"step": 385
},
{
"epoch": 0.12,
"grad_norm": 6.229213714599609,
"learning_rate": 4.8943046785380795e-05,
"loss": 2.2507,
"step": 390
},
{
"epoch": 0.12,
"grad_norm": 5.113283634185791,
"learning_rate": 4.890655623404828e-05,
"loss": 2.2868,
"step": 395
},
{
"epoch": 0.12,
"grad_norm": 3.9976158142089844,
"learning_rate": 4.8869460495060726e-05,
"loss": 2.264,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 4.450018405914307,
"learning_rate": 4.883176050748619e-05,
"loss": 2.2319,
"step": 405
},
{
"epoch": 0.13,
"grad_norm": 3.610208511352539,
"learning_rate": 4.879345722568911e-05,
"loss": 2.1011,
"step": 410
},
{
"epoch": 0.13,
"grad_norm": 3.5385842323303223,
"learning_rate": 4.875455161930614e-05,
"loss": 2.2372,
"step": 415
},
{
"epoch": 0.13,
"grad_norm": 3.5152907371520996,
"learning_rate": 4.871504467322162e-05,
"loss": 2.3424,
"step": 420
},
{
"epoch": 0.13,
"grad_norm": 3.0804309844970703,
"learning_rate": 4.867493738754263e-05,
"loss": 1.9902,
"step": 425
},
{
"epoch": 0.13,
"grad_norm": 4.568037033081055,
"learning_rate": 4.8634230777573655e-05,
"loss": 2.216,
"step": 430
},
{
"epoch": 0.13,
"grad_norm": 3.0766966342926025,
"learning_rate": 4.859292587379094e-05,
"loss": 2.2049,
"step": 435
},
{
"epoch": 0.14,
"grad_norm": 3.8717846870422363,
"learning_rate": 4.855102372181634e-05,
"loss": 2.179,
"step": 440
},
{
"epoch": 0.14,
"grad_norm": 3.963639497756958,
"learning_rate": 4.8508525382390876e-05,
"loss": 2.3567,
"step": 445
},
{
"epoch": 0.14,
"grad_norm": 3.3204896450042725,
"learning_rate": 4.8465431931347904e-05,
"loss": 2.1157,
"step": 450
},
{
"epoch": 0.14,
"grad_norm": 4.884645938873291,
"learning_rate": 4.842174445958585e-05,
"loss": 2.192,
"step": 455
},
{
"epoch": 0.14,
"grad_norm": 5.058561325073242,
"learning_rate": 4.837746407304061e-05,
"loss": 2.2785,
"step": 460
},
{
"epoch": 0.14,
"grad_norm": 4.240612983703613,
"learning_rate": 4.833259189265753e-05,
"loss": 2.3115,
"step": 465
},
{
"epoch": 0.15,
"grad_norm": 3.628058433532715,
"learning_rate": 4.8287129054363076e-05,
"loss": 2.3267,
"step": 470
},
{
"epoch": 0.15,
"grad_norm": 3.4856207370758057,
"learning_rate": 4.8241076709036036e-05,
"loss": 2.1803,
"step": 475
},
{
"epoch": 0.15,
"grad_norm": 4.317348480224609,
"learning_rate": 4.8194436022478404e-05,
"loss": 2.1224,
"step": 480
},
{
"epoch": 0.15,
"grad_norm": 3.6160759925842285,
"learning_rate": 4.814720817538585e-05,
"loss": 2.1848,
"step": 485
},
{
"epoch": 0.15,
"grad_norm": 3.2244794368743896,
"learning_rate": 4.809939436331786e-05,
"loss": 2.2176,
"step": 490
},
{
"epoch": 0.15,
"grad_norm": 3.645427942276001,
"learning_rate": 4.805099579666748e-05,
"loss": 2.1778,
"step": 495
},
{
"epoch": 0.16,
"grad_norm": 3.9020988941192627,
"learning_rate": 4.800201370063059e-05,
"loss": 2.2817,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 4.484887599945068,
"learning_rate": 4.7952449315174996e-05,
"loss": 1.9207,
"step": 505
},
{
"epoch": 0.16,
"grad_norm": 4.281662464141846,
"learning_rate": 4.790230389500901e-05,
"loss": 2.2251,
"step": 510
},
{
"epoch": 0.16,
"grad_norm": 3.9683914184570312,
"learning_rate": 4.785157870954961e-05,
"loss": 2.22,
"step": 515
},
{
"epoch": 0.16,
"grad_norm": 3.39128041267395,
"learning_rate": 4.780027504289042e-05,
"loss": 2.3237,
"step": 520
},
{
"epoch": 0.16,
"grad_norm": 3.148158550262451,
"learning_rate": 4.774839419376914e-05,
"loss": 2.1838,
"step": 525
},
{
"epoch": 0.16,
"grad_norm": 4.339906692504883,
"learning_rate": 4.769593747553468e-05,
"loss": 2.0075,
"step": 530
},
{
"epoch": 0.17,
"grad_norm": 3.3067688941955566,
"learning_rate": 4.764290621611388e-05,
"loss": 2.1666,
"step": 535
},
{
"epoch": 0.17,
"grad_norm": 4.491573810577393,
"learning_rate": 4.758930175797797e-05,
"loss": 2.3295,
"step": 540
},
{
"epoch": 0.17,
"grad_norm": 3.894711494445801,
"learning_rate": 4.753512545810851e-05,
"loss": 2.1021,
"step": 545
},
{
"epoch": 0.17,
"grad_norm": 2.7983195781707764,
"learning_rate": 4.7480378687963114e-05,
"loss": 2.2335,
"step": 550
},
{
"epoch": 0.17,
"grad_norm": 3.40674090385437,
"learning_rate": 4.7425062833440634e-05,
"loss": 2.0456,
"step": 555
},
{
"epoch": 0.17,
"grad_norm": 3.834815263748169,
"learning_rate": 4.736917929484616e-05,
"loss": 2.3161,
"step": 560
},
{
"epoch": 0.18,
"grad_norm": 3.8907999992370605,
"learning_rate": 4.731272948685554e-05,
"loss": 2.1104,
"step": 565
},
{
"epoch": 0.18,
"grad_norm": 3.7746763229370117,
"learning_rate": 4.725571483847958e-05,
"loss": 2.0498,
"step": 570
},
{
"epoch": 0.18,
"grad_norm": 4.495760917663574,
"learning_rate": 4.719813679302784e-05,
"loss": 2.231,
"step": 575
},
{
"epoch": 0.18,
"grad_norm": 3.9231886863708496,
"learning_rate": 4.713999680807211e-05,
"loss": 2.1878,
"step": 580
},
{
"epoch": 0.18,
"grad_norm": 4.197574138641357,
"learning_rate": 4.708129635540955e-05,
"loss": 2.1897,
"step": 585
},
{
"epoch": 0.18,
"grad_norm": 4.721147060394287,
"learning_rate": 4.702203692102539e-05,
"loss": 2.1359,
"step": 590
},
{
"epoch": 0.18,
"grad_norm": 2.4958722591400146,
"learning_rate": 4.696222000505529e-05,
"loss": 2.1873,
"step": 595
},
{
"epoch": 0.19,
"grad_norm": 3.5209269523620605,
"learning_rate": 4.6901847121747455e-05,
"loss": 2.0386,
"step": 600
},
{
"epoch": 0.19,
"grad_norm": 3.6823954582214355,
"learning_rate": 4.6840919799424186e-05,
"loss": 2.0325,
"step": 605
},
{
"epoch": 0.19,
"grad_norm": 4.033428192138672,
"learning_rate": 4.677943958044329e-05,
"loss": 2.13,
"step": 610
},
{
"epoch": 0.19,
"grad_norm": 3.907592535018921,
"learning_rate": 4.671740802115897e-05,
"loss": 2.0553,
"step": 615
},
{
"epoch": 0.19,
"grad_norm": 3.318100690841675,
"learning_rate": 4.665482669188248e-05,
"loss": 2.0218,
"step": 620
},
{
"epoch": 0.19,
"grad_norm": 4.057621479034424,
"learning_rate": 4.659169717684232e-05,
"loss": 2.1056,
"step": 625
},
{
"epoch": 0.2,
"grad_norm": 4.882345199584961,
"learning_rate": 4.6528021074144165e-05,
"loss": 2.1249,
"step": 630
},
{
"epoch": 0.2,
"grad_norm": 4.954129219055176,
"learning_rate": 4.646379999573039e-05,
"loss": 2.1942,
"step": 635
},
{
"epoch": 0.2,
"grad_norm": 4.156874656677246,
"learning_rate": 4.639903556733931e-05,
"loss": 2.175,
"step": 640
},
{
"epoch": 0.2,
"grad_norm": 4.1573710441589355,
"learning_rate": 4.633372942846393e-05,
"loss": 2.0856,
"step": 645
},
{
"epoch": 0.2,
"grad_norm": 5.385977745056152,
"learning_rate": 4.6267883232310575e-05,
"loss": 2.2399,
"step": 650
},
{
"epoch": 0.2,
"grad_norm": 4.143659591674805,
"learning_rate": 4.620149864575689e-05,
"loss": 2.17,
"step": 655
},
{
"epoch": 0.2,
"grad_norm": 3.286294460296631,
"learning_rate": 4.613457734930978e-05,
"loss": 2.0458,
"step": 660
},
{
"epoch": 0.21,
"grad_norm": 4.520682334899902,
"learning_rate": 4.606712103706278e-05,
"loss": 2.1244,
"step": 665
},
{
"epoch": 0.21,
"grad_norm": 3.6921236515045166,
"learning_rate": 4.59991314166532e-05,
"loss": 2.0801,
"step": 670
},
{
"epoch": 0.21,
"grad_norm": 3.1880507469177246,
"learning_rate": 4.593061020921889e-05,
"loss": 2.3062,
"step": 675
},
{
"epoch": 0.21,
"grad_norm": 3.380157709121704,
"learning_rate": 4.586155914935469e-05,
"loss": 2.0267,
"step": 680
},
{
"epoch": 0.21,
"grad_norm": 3.0647785663604736,
"learning_rate": 4.57919799850685e-05,
"loss": 2.1566,
"step": 685
},
{
"epoch": 0.21,
"grad_norm": 3.353318691253662,
"learning_rate": 4.5721874477737006e-05,
"loss": 2.0618,
"step": 690
},
{
"epoch": 0.22,
"grad_norm": 3.342336654663086,
"learning_rate": 4.5651244402061144e-05,
"loss": 1.9534,
"step": 695
},
{
"epoch": 0.22,
"grad_norm": 4.064236640930176,
"learning_rate": 4.558009154602115e-05,
"loss": 2.1573,
"step": 700
},
{
"epoch": 0.22,
"grad_norm": 3.5223772525787354,
"learning_rate": 4.550841771083129e-05,
"loss": 2.0089,
"step": 705
},
{
"epoch": 0.22,
"grad_norm": 4.3469557762146,
"learning_rate": 4.543622471089426e-05,
"loss": 2.1214,
"step": 710
},
{
"epoch": 0.22,
"grad_norm": 3.922893762588501,
"learning_rate": 4.536351437375526e-05,
"loss": 2.0982,
"step": 715
},
{
"epoch": 0.22,
"grad_norm": 3.053823947906494,
"learning_rate": 4.529028854005576e-05,
"loss": 2.0791,
"step": 720
},
{
"epoch": 0.22,
"grad_norm": 3.636437177658081,
"learning_rate": 4.521654906348687e-05,
"loss": 2.1326,
"step": 725
},
{
"epoch": 0.23,
"grad_norm": 4.3226318359375,
"learning_rate": 4.51422978107424e-05,
"loss": 2.2037,
"step": 730
},
{
"epoch": 0.23,
"grad_norm": 4.59119987487793,
"learning_rate": 4.506753666147163e-05,
"loss": 2.1187,
"step": 735
},
{
"epoch": 0.23,
"grad_norm": 5.592061996459961,
"learning_rate": 4.499226750823177e-05,
"loss": 2.3031,
"step": 740
},
{
"epoch": 0.23,
"grad_norm": 4.18353271484375,
"learning_rate": 4.491649225643996e-05,
"loss": 2.0337,
"step": 745
},
{
"epoch": 0.23,
"grad_norm": 3.2864906787872314,
"learning_rate": 4.484021282432509e-05,
"loss": 2.0575,
"step": 750
},
{
"epoch": 0.23,
"grad_norm": 3.3072474002838135,
"learning_rate": 4.476343114287924e-05,
"loss": 2.0173,
"step": 755
},
{
"epoch": 0.24,
"grad_norm": 4.088031768798828,
"learning_rate": 4.468614915580879e-05,
"loss": 2.1929,
"step": 760
},
{
"epoch": 0.24,
"grad_norm": 4.264316082000732,
"learning_rate": 4.4608368819485204e-05,
"loss": 2.0457,
"step": 765
},
{
"epoch": 0.24,
"grad_norm": 4.678459644317627,
"learning_rate": 4.453009210289551e-05,
"loss": 2.031,
"step": 770
},
{
"epoch": 0.24,
"grad_norm": 3.3418045043945312,
"learning_rate": 4.445132098759249e-05,
"loss": 2.1464,
"step": 775
},
{
"epoch": 0.24,
"grad_norm": 3.89583420753479,
"learning_rate": 4.4372057467644455e-05,
"loss": 2.1509,
"step": 780
},
{
"epoch": 0.24,
"grad_norm": 2.6973416805267334,
"learning_rate": 4.4292303549584816e-05,
"loss": 2.072,
"step": 785
},
{
"epoch": 0.25,
"grad_norm": 4.848878383636475,
"learning_rate": 4.421206125236128e-05,
"loss": 2.166,
"step": 790
},
{
"epoch": 0.25,
"grad_norm": 3.48630428314209,
"learning_rate": 4.4131332607284706e-05,
"loss": 1.9686,
"step": 795
},
{
"epoch": 0.25,
"grad_norm": 3.4183597564697266,
"learning_rate": 4.405011965797775e-05,
"loss": 2.0781,
"step": 800
},
{
"epoch": 0.25,
"grad_norm": 3.5883586406707764,
"learning_rate": 4.3968424460323047e-05,
"loss": 2.0631,
"step": 805
},
{
"epoch": 0.25,
"grad_norm": 3.683375835418701,
"learning_rate": 4.388624908241124e-05,
"loss": 2.0533,
"step": 810
},
{
"epoch": 0.25,
"grad_norm": 3.0786943435668945,
"learning_rate": 4.3803595604488595e-05,
"loss": 1.8946,
"step": 815
},
{
"epoch": 0.25,
"grad_norm": 3.2280662059783936,
"learning_rate": 4.372046611890434e-05,
"loss": 2.0221,
"step": 820
},
{
"epoch": 0.26,
"grad_norm": 3.1918365955352783,
"learning_rate": 4.36368627300577e-05,
"loss": 2.0023,
"step": 825
},
{
"epoch": 0.26,
"grad_norm": 4.814984321594238,
"learning_rate": 4.3552787554344634e-05,
"loss": 2.0967,
"step": 830
},
{
"epoch": 0.26,
"grad_norm": 5.989580154418945,
"learning_rate": 4.346824272010423e-05,
"loss": 1.9698,
"step": 835
},
{
"epoch": 0.26,
"grad_norm": 3.2674803733825684,
"learning_rate": 4.338323036756488e-05,
"loss": 2.0381,
"step": 840
},
{
"epoch": 0.26,
"grad_norm": 3.6016860008239746,
"learning_rate": 4.3297752648790035e-05,
"loss": 2.0444,
"step": 845
},
{
"epoch": 0.26,
"grad_norm": 4.092184543609619,
"learning_rate": 4.321181172762379e-05,
"loss": 2.1514,
"step": 850
},
{
"epoch": 0.27,
"grad_norm": 3.5366742610931396,
"learning_rate": 4.312540977963604e-05,
"loss": 2.0518,
"step": 855
},
{
"epoch": 0.27,
"grad_norm": 4.222804069519043,
"learning_rate": 4.303854899206749e-05,
"loss": 1.9858,
"step": 860
},
{
"epoch": 0.27,
"grad_norm": 4.207810401916504,
"learning_rate": 4.295123156377419e-05,
"loss": 2.0067,
"step": 865
},
{
"epoch": 0.27,
"grad_norm": 3.15069842338562,
"learning_rate": 4.2863459705171945e-05,
"loss": 1.9234,
"step": 870
},
{
"epoch": 0.27,
"grad_norm": 3.337561845779419,
"learning_rate": 4.2775235638180344e-05,
"loss": 1.974,
"step": 875
},
{
"epoch": 0.27,
"grad_norm": 5.987912178039551,
"learning_rate": 4.2686561596166487e-05,
"loss": 2.1928,
"step": 880
},
{
"epoch": 0.27,
"grad_norm": 3.9456374645233154,
"learning_rate": 4.259743982388845e-05,
"loss": 2.023,
"step": 885
},
{
"epoch": 0.28,
"grad_norm": 4.308691501617432,
"learning_rate": 4.250787257743851e-05,
"loss": 2.1075,
"step": 890
},
{
"epoch": 0.28,
"grad_norm": 3.699410915374756,
"learning_rate": 4.2417862124185955e-05,
"loss": 2.0471,
"step": 895
},
{
"epoch": 0.28,
"grad_norm": 4.254593372344971,
"learning_rate": 4.232741074271977e-05,
"loss": 2.0331,
"step": 900
},
{
"epoch": 0.28,
"grad_norm": 3.2899739742279053,
"learning_rate": 4.2236520722790855e-05,
"loss": 2.0153,
"step": 905
},
{
"epoch": 0.28,
"grad_norm": 5.5724616050720215,
"learning_rate": 4.214519436525418e-05,
"loss": 2.1466,
"step": 910
},
{
"epoch": 0.28,
"grad_norm": 3.673755168914795,
"learning_rate": 4.2053433982010436e-05,
"loss": 2.1062,
"step": 915
},
{
"epoch": 0.29,
"grad_norm": 4.009172439575195,
"learning_rate": 4.1961241895947554e-05,
"loss": 2.013,
"step": 920
},
{
"epoch": 0.29,
"grad_norm": 3.0359890460968018,
"learning_rate": 4.1868620440881925e-05,
"loss": 2.1153,
"step": 925
},
{
"epoch": 0.29,
"grad_norm": 4.953378200531006,
"learning_rate": 4.177557196149927e-05,
"loss": 2.0847,
"step": 930
},
{
"epoch": 0.29,
"grad_norm": 3.580415964126587,
"learning_rate": 4.168209881329531e-05,
"loss": 1.9907,
"step": 935
},
{
"epoch": 0.29,
"grad_norm": 3.3144888877868652,
"learning_rate": 4.1588203362516153e-05,
"loss": 2.0741,
"step": 940
},
{
"epoch": 0.29,
"grad_norm": 4.115612983703613,
"learning_rate": 4.149388798609836e-05,
"loss": 1.9596,
"step": 945
},
{
"epoch": 0.29,
"grad_norm": 5.178717613220215,
"learning_rate": 4.1399155071608774e-05,
"loss": 2.142,
"step": 950
},
{
"epoch": 0.3,
"grad_norm": 3.350316286087036,
"learning_rate": 4.1304007017184146e-05,
"loss": 2.06,
"step": 955
},
{
"epoch": 0.3,
"grad_norm": 4.030082702636719,
"learning_rate": 4.120844623147033e-05,
"loss": 2.0618,
"step": 960
},
{
"epoch": 0.3,
"grad_norm": 5.1543707847595215,
"learning_rate": 4.1112475133561376e-05,
"loss": 2.3692,
"step": 965
},
{
"epoch": 0.3,
"grad_norm": 3.9695091247558594,
"learning_rate": 4.101609615293827e-05,
"loss": 2.0065,
"step": 970
},
{
"epoch": 0.3,
"grad_norm": 3.1106691360473633,
"learning_rate": 4.0919311729407416e-05,
"loss": 2.0318,
"step": 975
},
{
"epoch": 0.3,
"grad_norm": 3.532636880874634,
"learning_rate": 4.0822124313038904e-05,
"loss": 2.139,
"step": 980
},
{
"epoch": 0.31,
"grad_norm": 4.04263162612915,
"learning_rate": 4.072453636410448e-05,
"loss": 2.1352,
"step": 985
},
{
"epoch": 0.31,
"grad_norm": 4.174222946166992,
"learning_rate": 4.0626550353015236e-05,
"loss": 2.0269,
"step": 990
},
{
"epoch": 0.31,
"grad_norm": 4.390026569366455,
"learning_rate": 4.052816876025912e-05,
"loss": 2.0775,
"step": 995
},
{
"epoch": 0.31,
"grad_norm": 4.04339075088501,
"learning_rate": 4.042939407633808e-05,
"loss": 2.0042,
"step": 1000
},
{
"epoch": 0.31,
"grad_norm": 3.5550975799560547,
"learning_rate": 4.03302288017051e-05,
"loss": 1.9624,
"step": 1005
},
{
"epoch": 0.31,
"grad_norm": 4.015019416809082,
"learning_rate": 4.023067544670082e-05,
"loss": 2.142,
"step": 1010
},
{
"epoch": 0.31,
"grad_norm": 3.452937126159668,
"learning_rate": 4.013073653149005e-05,
"loss": 2.0798,
"step": 1015
},
{
"epoch": 0.32,
"grad_norm": 4.2777509689331055,
"learning_rate": 4.0030414585997925e-05,
"loss": 2.0245,
"step": 1020
},
{
"epoch": 0.32,
"grad_norm": 5.5015459060668945,
"learning_rate": 3.99297121498459e-05,
"loss": 2.0897,
"step": 1025
},
{
"epoch": 0.32,
"grad_norm": 4.524988651275635,
"learning_rate": 3.982863177228743e-05,
"loss": 2.182,
"step": 1030
},
{
"epoch": 0.32,
"grad_norm": 4.300734043121338,
"learning_rate": 3.972717601214345e-05,
"loss": 2.0477,
"step": 1035
},
{
"epoch": 0.32,
"grad_norm": 3.456317186355591,
"learning_rate": 3.962534743773761e-05,
"loss": 2.1261,
"step": 1040
},
{
"epoch": 0.32,
"grad_norm": 3.567162275314331,
"learning_rate": 3.9523148626831234e-05,
"loss": 2.119,
"step": 1045
},
{
"epoch": 0.33,
"grad_norm": 3.5200531482696533,
"learning_rate": 3.942058216655808e-05,
"loss": 1.9731,
"step": 1050
},
{
"epoch": 0.33,
"grad_norm": 4.380658149719238,
"learning_rate": 3.931765065335886e-05,
"loss": 1.9642,
"step": 1055
},
{
"epoch": 0.33,
"grad_norm": 4.44472074508667,
"learning_rate": 3.921435669291547e-05,
"loss": 1.8666,
"step": 1060
},
{
"epoch": 0.33,
"grad_norm": 5.24396276473999,
"learning_rate": 3.9110702900085064e-05,
"loss": 2.0983,
"step": 1065
},
{
"epoch": 0.33,
"grad_norm": 4.166001319885254,
"learning_rate": 3.900669189883386e-05,
"loss": 1.9032,
"step": 1070
},
{
"epoch": 0.33,
"grad_norm": 3.893059730529785,
"learning_rate": 3.890232632217071e-05,
"loss": 1.9269,
"step": 1075
},
{
"epoch": 0.34,
"grad_norm": 3.5707895755767822,
"learning_rate": 3.879760881208042e-05,
"loss": 1.9055,
"step": 1080
},
{
"epoch": 0.34,
"grad_norm": 4.270632743835449,
"learning_rate": 3.869254201945692e-05,
"loss": 1.9936,
"step": 1085
},
{
"epoch": 0.34,
"grad_norm": 4.152591228485107,
"learning_rate": 3.858712860403608e-05,
"loss": 2.1007,
"step": 1090
},
{
"epoch": 0.34,
"grad_norm": 3.5370168685913086,
"learning_rate": 3.848137123432848e-05,
"loss": 2.1225,
"step": 1095
},
{
"epoch": 0.34,
"grad_norm": 3.657259941101074,
"learning_rate": 3.837527258755177e-05,
"loss": 1.9526,
"step": 1100
},
{
"epoch": 0.34,
"grad_norm": 4.236551761627197,
"learning_rate": 3.8268835349562946e-05,
"loss": 1.9357,
"step": 1105
},
{
"epoch": 0.34,
"grad_norm": 3.312053680419922,
"learning_rate": 3.816206221479034e-05,
"loss": 1.9833,
"step": 1110
},
{
"epoch": 0.35,
"grad_norm": 3.346323013305664,
"learning_rate": 3.8054955886165427e-05,
"loss": 1.9351,
"step": 1115
},
{
"epoch": 0.35,
"grad_norm": 3.557433843612671,
"learning_rate": 3.7947519075054364e-05,
"loss": 2.0037,
"step": 1120
},
{
"epoch": 0.35,
"grad_norm": 3.824169635772705,
"learning_rate": 3.7839754501189406e-05,
"loss": 2.1035,
"step": 1125
},
{
"epoch": 0.35,
"grad_norm": 4.1984968185424805,
"learning_rate": 3.7731664892600004e-05,
"loss": 1.9416,
"step": 1130
},
{
"epoch": 0.35,
"grad_norm": 2.998347520828247,
"learning_rate": 3.762325298554379e-05,
"loss": 1.9615,
"step": 1135
},
{
"epoch": 0.35,
"grad_norm": 4.985104560852051,
"learning_rate": 3.751452152443728e-05,
"loss": 1.912,
"step": 1140
},
{
"epoch": 0.36,
"grad_norm": 3.560026168823242,
"learning_rate": 3.74054732617864e-05,
"loss": 1.9317,
"step": 1145
},
{
"epoch": 0.36,
"grad_norm": 3.894937515258789,
"learning_rate": 3.7296110958116844e-05,
"loss": 1.9516,
"step": 1150
},
{
"epoch": 0.36,
"grad_norm": 3.1330158710479736,
"learning_rate": 3.718643738190414e-05,
"loss": 1.8787,
"step": 1155
},
{
"epoch": 0.36,
"grad_norm": 3.924584150314331,
"learning_rate": 3.707645530950361e-05,
"loss": 1.9294,
"step": 1160
},
{
"epoch": 0.36,
"grad_norm": 3.2176225185394287,
"learning_rate": 3.6966167525080056e-05,
"loss": 2.1003,
"step": 1165
},
{
"epoch": 0.36,
"grad_norm": 3.9685873985290527,
"learning_rate": 3.6855576820537277e-05,
"loss": 1.9088,
"step": 1170
},
{
"epoch": 0.36,
"grad_norm": 4.544212818145752,
"learning_rate": 3.674468599544746e-05,
"loss": 2.0211,
"step": 1175
},
{
"epoch": 0.37,
"grad_norm": 3.6609127521514893,
"learning_rate": 3.663349785698021e-05,
"loss": 2.0021,
"step": 1180
},
{
"epoch": 0.37,
"grad_norm": 4.17726469039917,
"learning_rate": 3.6522015219831546e-05,
"loss": 2.0828,
"step": 1185
},
{
"epoch": 0.37,
"grad_norm": 3.6899638175964355,
"learning_rate": 3.641024090615265e-05,
"loss": 1.9462,
"step": 1190
},
{
"epoch": 0.37,
"grad_norm": 3.7764229774475098,
"learning_rate": 3.62981777454784e-05,
"loss": 2.0825,
"step": 1195
},
{
"epoch": 0.37,
"grad_norm": 4.037018775939941,
"learning_rate": 3.6185828574655766e-05,
"loss": 1.8715,
"step": 1200
},
{
"epoch": 0.37,
"grad_norm": 3.727513074874878,
"learning_rate": 3.607319623777196e-05,
"loss": 1.9394,
"step": 1205
},
{
"epoch": 0.38,
"grad_norm": 4.162086009979248,
"learning_rate": 3.59602835860825e-05,
"loss": 1.89,
"step": 1210
},
{
"epoch": 0.38,
"grad_norm": 3.546518564224243,
"learning_rate": 3.5847093477938956e-05,
"loss": 1.8102,
"step": 1215
},
{
"epoch": 0.38,
"grad_norm": 4.054803371429443,
"learning_rate": 3.5733628778716646e-05,
"loss": 1.8825,
"step": 1220
},
{
"epoch": 0.38,
"grad_norm": 3.638885498046875,
"learning_rate": 3.5619892360742075e-05,
"loss": 2.0755,
"step": 1225
},
{
"epoch": 0.38,
"grad_norm": 3.433565378189087,
"learning_rate": 3.5505887103220254e-05,
"loss": 2.0261,
"step": 1230
},
{
"epoch": 0.38,
"grad_norm": 3.5785629749298096,
"learning_rate": 3.5391615892161754e-05,
"loss": 2.1362,
"step": 1235
},
{
"epoch": 0.38,
"grad_norm": 3.4514031410217285,
"learning_rate": 3.527708162030971e-05,
"loss": 1.8821,
"step": 1240
},
{
"epoch": 0.39,
"grad_norm": 4.2519073486328125,
"learning_rate": 3.516228718706656e-05,
"loss": 2.112,
"step": 1245
},
{
"epoch": 0.39,
"grad_norm": 3.0281126499176025,
"learning_rate": 3.504723549842066e-05,
"loss": 1.8516,
"step": 1250
},
{
"epoch": 0.39,
"grad_norm": 3.3636157512664795,
"learning_rate": 3.4931929466872685e-05,
"loss": 1.9612,
"step": 1255
},
{
"epoch": 0.39,
"grad_norm": 3.7413578033447266,
"learning_rate": 3.481637201136197e-05,
"loss": 1.9865,
"step": 1260
},
{
"epoch": 0.39,
"grad_norm": 3.007408618927002,
"learning_rate": 3.4700566057192544e-05,
"loss": 1.9493,
"step": 1265
},
{
"epoch": 0.39,
"grad_norm": 4.331480979919434,
"learning_rate": 3.4584514535959114e-05,
"loss": 2.1174,
"step": 1270
},
{
"epoch": 0.4,
"grad_norm": 4.286431312561035,
"learning_rate": 3.446822038547287e-05,
"loss": 1.883,
"step": 1275
},
{
"epoch": 0.4,
"grad_norm": 3.356170177459717,
"learning_rate": 3.435168654968706e-05,
"loss": 1.9707,
"step": 1280
},
{
"epoch": 0.4,
"grad_norm": 3.436434507369995,
"learning_rate": 3.423491597862251e-05,
"loss": 1.8922,
"step": 1285
},
{
"epoch": 0.4,
"grad_norm": 3.307274580001831,
"learning_rate": 3.411791162829294e-05,
"loss": 2.0583,
"step": 1290
},
{
"epoch": 0.4,
"grad_norm": 4.032553195953369,
"learning_rate": 3.4000676460630126e-05,
"loss": 2.0121,
"step": 1295
},
{
"epoch": 0.4,
"grad_norm": 3.4915122985839844,
"learning_rate": 3.3883213443408903e-05,
"loss": 1.9361,
"step": 1300
},
{
"epoch": 0.4,
"grad_norm": 3.969005823135376,
"learning_rate": 3.3765525550172066e-05,
"loss": 1.8782,
"step": 1305
},
{
"epoch": 0.41,
"grad_norm": 3.772780179977417,
"learning_rate": 3.364761576015507e-05,
"loss": 2.0914,
"step": 1310
},
{
"epoch": 0.41,
"grad_norm": 2.9640040397644043,
"learning_rate": 3.352948705821065e-05,
"loss": 1.9143,
"step": 1315
},
{
"epoch": 0.41,
"grad_norm": 5.698980331420898,
"learning_rate": 3.341114243473319e-05,
"loss": 1.9417,
"step": 1320
},
{
"epoch": 0.41,
"grad_norm": 3.4275810718536377,
"learning_rate": 3.3292584885583114e-05,
"loss": 1.9053,
"step": 1325
},
{
"epoch": 0.41,
"grad_norm": 3.2752602100372314,
"learning_rate": 3.317381741201097e-05,
"loss": 2.0126,
"step": 1330
},
{
"epoch": 0.41,
"grad_norm": 4.166382312774658,
"learning_rate": 3.305484302058148e-05,
"loss": 1.9256,
"step": 1335
},
{
"epoch": 0.42,
"grad_norm": 3.7549707889556885,
"learning_rate": 3.293566472309746e-05,
"loss": 2.0742,
"step": 1340
},
{
"epoch": 0.42,
"grad_norm": 3.449774980545044,
"learning_rate": 3.2816285536523515e-05,
"loss": 1.9322,
"step": 1345
},
{
"epoch": 0.42,
"grad_norm": 3.590756416320801,
"learning_rate": 3.269670848290973e-05,
"loss": 1.9619,
"step": 1350
},
{
"epoch": 0.42,
"grad_norm": 4.403102874755859,
"learning_rate": 3.2576936589315124e-05,
"loss": 1.9513,
"step": 1355
},
{
"epoch": 0.42,
"grad_norm": 4.1176676750183105,
"learning_rate": 3.245697288773102e-05,
"loss": 2.0274,
"step": 1360
},
{
"epoch": 0.42,
"grad_norm": 4.0299859046936035,
"learning_rate": 3.233682041500433e-05,
"loss": 1.9853,
"step": 1365
},
{
"epoch": 0.43,
"grad_norm": 4.306421279907227,
"learning_rate": 3.2216482212760646e-05,
"loss": 1.949,
"step": 1370
},
{
"epoch": 0.43,
"grad_norm": 3.9233736991882324,
"learning_rate": 3.209596132732725e-05,
"loss": 1.9009,
"step": 1375
},
{
"epoch": 0.43,
"grad_norm": 3.82336163520813,
"learning_rate": 3.197526080965598e-05,
"loss": 2.1035,
"step": 1380
},
{
"epoch": 0.43,
"grad_norm": 3.946753740310669,
"learning_rate": 3.185438371524605e-05,
"loss": 1.9775,
"step": 1385
},
{
"epoch": 0.43,
"grad_norm": 4.122159481048584,
"learning_rate": 3.173333310406662e-05,
"loss": 1.7694,
"step": 1390
},
{
"epoch": 0.43,
"grad_norm": 3.5491435527801514,
"learning_rate": 3.161211204047943e-05,
"loss": 2.0022,
"step": 1395
},
{
"epoch": 0.43,
"grad_norm": 4.0456438064575195,
"learning_rate": 3.1490723593161096e-05,
"loss": 2.1332,
"step": 1400
},
{
"epoch": 0.44,
"grad_norm": 3.476616621017456,
"learning_rate": 3.1369170835025594e-05,
"loss": 1.9567,
"step": 1405
},
{
"epoch": 0.44,
"grad_norm": 3.3506128787994385,
"learning_rate": 3.124745684314633e-05,
"loss": 2.1015,
"step": 1410
},
{
"epoch": 0.44,
"grad_norm": 3.737765312194824,
"learning_rate": 3.112558469867829e-05,
"loss": 1.9677,
"step": 1415
},
{
"epoch": 0.44,
"grad_norm": 3.6628215312957764,
"learning_rate": 3.100355748678009e-05,
"loss": 2.1167,
"step": 1420
},
{
"epoch": 0.44,
"grad_norm": 3.3631627559661865,
"learning_rate": 3.0881378296535784e-05,
"loss": 1.928,
"step": 1425
},
{
"epoch": 0.44,
"grad_norm": 4.281042575836182,
"learning_rate": 3.075905022087675e-05,
"loss": 1.9394,
"step": 1430
},
{
"epoch": 0.45,
"grad_norm": 3.994631290435791,
"learning_rate": 3.063657635650335e-05,
"loss": 1.8533,
"step": 1435
},
{
"epoch": 0.45,
"grad_norm": 5.131731033325195,
"learning_rate": 3.0513959803806526e-05,
"loss": 1.9484,
"step": 1440
},
{
"epoch": 0.45,
"grad_norm": 3.4644176959991455,
"learning_rate": 3.039120366678937e-05,
"loss": 1.9492,
"step": 1445
},
{
"epoch": 0.45,
"grad_norm": 3.832453966140747,
"learning_rate": 3.0268311052988473e-05,
"loss": 1.869,
"step": 1450
},
{
"epoch": 0.45,
"grad_norm": 3.8497562408447266,
"learning_rate": 3.0145285073395334e-05,
"loss": 1.8965,
"step": 1455
},
{
"epoch": 0.45,
"grad_norm": 3.4898972511291504,
"learning_rate": 3.0022128842377534e-05,
"loss": 2.0029,
"step": 1460
},
{
"epoch": 0.45,
"grad_norm": 4.340991020202637,
"learning_rate": 2.9898845477599963e-05,
"loss": 1.9139,
"step": 1465
},
{
"epoch": 0.46,
"grad_norm": 5.687810897827148,
"learning_rate": 2.9775438099945836e-05,
"loss": 2.0196,
"step": 1470
},
{
"epoch": 0.46,
"grad_norm": 3.468388795852661,
"learning_rate": 2.965190983343774e-05,
"loss": 2.0382,
"step": 1475
},
{
"epoch": 0.46,
"grad_norm": 3.2167277336120605,
"learning_rate": 2.9528263805158524e-05,
"loss": 2.0924,
"step": 1480
},
{
"epoch": 0.46,
"grad_norm": 4.481842041015625,
"learning_rate": 2.940450314517214e-05,
"loss": 2.0535,
"step": 1485
},
{
"epoch": 0.46,
"grad_norm": 4.334501266479492,
"learning_rate": 2.92806309864444e-05,
"loss": 1.9523,
"step": 1490
},
{
"epoch": 0.46,
"grad_norm": 4.137599945068359,
"learning_rate": 2.9156650464763713e-05,
"loss": 2.0247,
"step": 1495
},
{
"epoch": 0.47,
"grad_norm": 3.5023269653320312,
"learning_rate": 2.9032564718661603e-05,
"loss": 2.0151,
"step": 1500
},
{
"epoch": 0.47,
"grad_norm": 4.225565433502197,
"learning_rate": 2.8908376889333376e-05,
"loss": 1.9438,
"step": 1505
},
{
"epoch": 0.47,
"grad_norm": 3.86175799369812,
"learning_rate": 2.8784090120558515e-05,
"loss": 2.0108,
"step": 1510
},
{
"epoch": 0.47,
"grad_norm": 2.7544214725494385,
"learning_rate": 2.865970755862114e-05,
"loss": 1.943,
"step": 1515
},
{
"epoch": 0.47,
"grad_norm": 3.8477399349212646,
"learning_rate": 2.8535232352230345e-05,
"loss": 1.891,
"step": 1520
},
{
"epoch": 0.47,
"grad_norm": 3.7875800132751465,
"learning_rate": 2.8410667652440482e-05,
"loss": 1.9343,
"step": 1525
},
{
"epoch": 0.47,
"grad_norm": 3.8977842330932617,
"learning_rate": 2.828601661257142e-05,
"loss": 1.8978,
"step": 1530
},
{
"epoch": 0.48,
"grad_norm": 3.39017915725708,
"learning_rate": 2.8161282388128696e-05,
"loss": 1.9368,
"step": 1535
},
{
"epoch": 0.48,
"grad_norm": 4.3148322105407715,
"learning_rate": 2.8036468136723627e-05,
"loss": 1.9393,
"step": 1540
},
{
"epoch": 0.48,
"grad_norm": 3.528031587600708,
"learning_rate": 2.7911577017993412e-05,
"loss": 1.831,
"step": 1545
},
{
"epoch": 0.48,
"grad_norm": 4.506915092468262,
"learning_rate": 2.778661219352111e-05,
"loss": 2.1384,
"step": 1550
},
{
"epoch": 0.48,
"grad_norm": 4.252208709716797,
"learning_rate": 2.766157682675562e-05,
"loss": 1.9593,
"step": 1555
},
{
"epoch": 0.48,
"grad_norm": 3.718641996383667,
"learning_rate": 2.753647408293161e-05,
"loss": 1.9347,
"step": 1560
},
{
"epoch": 0.49,
"grad_norm": 3.7793309688568115,
"learning_rate": 2.7411307128989368e-05,
"loss": 1.9519,
"step": 1565
},
{
"epoch": 0.49,
"grad_norm": 3.7921085357666016,
"learning_rate": 2.728607913349464e-05,
"loss": 1.8966,
"step": 1570
},
{
"epoch": 0.49,
"grad_norm": 3.735579252243042,
"learning_rate": 2.7160793266558443e-05,
"loss": 1.8972,
"step": 1575
},
{
"epoch": 0.49,
"grad_norm": 4.979485511779785,
"learning_rate": 2.7035452699756768e-05,
"loss": 1.9879,
"step": 1580
},
{
"epoch": 0.49,
"grad_norm": 3.672161102294922,
"learning_rate": 2.6910060606050324e-05,
"loss": 1.895,
"step": 1585
},
{
"epoch": 0.49,
"grad_norm": 3.2381715774536133,
"learning_rate": 2.6784620159704222e-05,
"loss": 1.9259,
"step": 1590
},
{
"epoch": 0.49,
"grad_norm": 5.407585620880127,
"learning_rate": 2.6659134536207587e-05,
"loss": 1.9021,
"step": 1595
},
{
"epoch": 0.5,
"grad_norm": 3.894399642944336,
"learning_rate": 2.6533606912193216e-05,
"loss": 2.0666,
"step": 1600
},
{
"epoch": 0.5,
"grad_norm": 3.4516754150390625,
"learning_rate": 2.6408040465357097e-05,
"loss": 1.9388,
"step": 1605
},
{
"epoch": 0.5,
"grad_norm": 5.389581203460693,
"learning_rate": 2.628243837437806e-05,
"loss": 1.9731,
"step": 1610
},
{
"epoch": 0.5,
"grad_norm": 3.623656988143921,
"learning_rate": 2.6156803818837204e-05,
"loss": 1.8931,
"step": 1615
},
{
"epoch": 0.5,
"grad_norm": 3.5042312145233154,
"learning_rate": 2.6031139979137492e-05,
"loss": 1.8365,
"step": 1620
},
{
"epoch": 0.5,
"grad_norm": 5.07073974609375,
"learning_rate": 2.59054500364232e-05,
"loss": 2.0215,
"step": 1625
},
{
"epoch": 0.51,
"grad_norm": 4.199176788330078,
"learning_rate": 2.5779737172499396e-05,
"loss": 1.967,
"step": 1630
},
{
"epoch": 0.51,
"grad_norm": 4.009402751922607,
"learning_rate": 2.565400456975138e-05,
"loss": 2.0154,
"step": 1635
},
{
"epoch": 0.51,
"grad_norm": 3.114271640777588,
"learning_rate": 2.552825541106414e-05,
"loss": 1.9405,
"step": 1640
},
{
"epoch": 0.51,
"grad_norm": 3.4758782386779785,
"learning_rate": 2.540249287974178e-05,
"loss": 1.94,
"step": 1645
},
{
"epoch": 0.51,
"grad_norm": 6.038011074066162,
"learning_rate": 2.527672015942693e-05,
"loss": 2.1653,
"step": 1650
},
{
"epoch": 0.51,
"grad_norm": 3.370410203933716,
"learning_rate": 2.5150940434020132e-05,
"loss": 1.9588,
"step": 1655
},
{
"epoch": 0.52,
"grad_norm": 3.766829252243042,
"learning_rate": 2.5025156887599288e-05,
"loss": 1.8133,
"step": 1660
},
{
"epoch": 0.52,
"grad_norm": 3.650520086288452,
"learning_rate": 2.489937270433901e-05,
"loss": 1.9111,
"step": 1665
},
{
"epoch": 0.52,
"grad_norm": 3.1080238819122314,
"learning_rate": 2.4773591068430018e-05,
"loss": 1.8758,
"step": 1670
},
{
"epoch": 0.52,
"grad_norm": 3.3637783527374268,
"learning_rate": 2.4647815163998585e-05,
"loss": 1.7589,
"step": 1675
},
{
"epoch": 0.52,
"grad_norm": 4.043179988861084,
"learning_rate": 2.452204817502587e-05,
"loss": 1.9339,
"step": 1680
},
{
"epoch": 0.52,
"grad_norm": 4.033404350280762,
"learning_rate": 2.4396293285267327e-05,
"loss": 1.9412,
"step": 1685
},
{
"epoch": 0.52,
"grad_norm": 4.043616771697998,
"learning_rate": 2.427055367817214e-05,
"loss": 1.8728,
"step": 1690
},
{
"epoch": 0.53,
"grad_norm": 4.840696811676025,
"learning_rate": 2.4144832536802628e-05,
"loss": 1.9966,
"step": 1695
},
{
"epoch": 0.53,
"grad_norm": 4.977992057800293,
"learning_rate": 2.4019133043753628e-05,
"loss": 1.9621,
"step": 1700
},
{
"epoch": 0.53,
"grad_norm": 3.1471240520477295,
"learning_rate": 2.3893458381071964e-05,
"loss": 2.0315,
"step": 1705
},
{
"epoch": 0.53,
"grad_norm": 5.21504020690918,
"learning_rate": 2.376781173017589e-05,
"loss": 1.9859,
"step": 1710
},
{
"epoch": 0.53,
"grad_norm": 3.4117472171783447,
"learning_rate": 2.3642196271774568e-05,
"loss": 1.905,
"step": 1715
},
{
"epoch": 0.53,
"grad_norm": 3.8640167713165283,
"learning_rate": 2.3516615185787494e-05,
"loss": 2.0321,
"step": 1720
},
{
"epoch": 0.54,
"grad_norm": 3.5830259323120117,
"learning_rate": 2.3391071651264064e-05,
"loss": 1.9936,
"step": 1725
},
{
"epoch": 0.54,
"grad_norm": 5.528283596038818,
"learning_rate": 2.3265568846303054e-05,
"loss": 1.8955,
"step": 1730
},
{
"epoch": 0.54,
"grad_norm": 3.968691110610962,
"learning_rate": 2.3140109947972204e-05,
"loss": 1.9137,
"step": 1735
},
{
"epoch": 0.54,
"grad_norm": 3.56799054145813,
"learning_rate": 2.3014698132227735e-05,
"loss": 1.9854,
"step": 1740
},
{
"epoch": 0.54,
"grad_norm": 4.353531360626221,
"learning_rate": 2.2889336573834027e-05,
"loss": 1.8967,
"step": 1745
},
{
"epoch": 0.54,
"grad_norm": 3.8630661964416504,
"learning_rate": 2.276402844628317e-05,
"loss": 1.8833,
"step": 1750
},
{
"epoch": 0.54,
"grad_norm": 3.5117268562316895,
"learning_rate": 2.2638776921714696e-05,
"loss": 1.8493,
"step": 1755
},
{
"epoch": 0.55,
"grad_norm": 4.000200271606445,
"learning_rate": 2.251358517083524e-05,
"loss": 1.8717,
"step": 1760
},
{
"epoch": 0.55,
"grad_norm": 3.0542423725128174,
"learning_rate": 2.2388456362838283e-05,
"loss": 1.9941,
"step": 1765
},
{
"epoch": 0.55,
"grad_norm": 4.117686748504639,
"learning_rate": 2.2263393665323907e-05,
"loss": 2.0925,
"step": 1770
},
{
"epoch": 0.55,
"grad_norm": 5.376316070556641,
"learning_rate": 2.2138400244218665e-05,
"loss": 2.0568,
"step": 1775
},
{
"epoch": 0.55,
"grad_norm": 3.879211187362671,
"learning_rate": 2.2013479263695368e-05,
"loss": 1.9256,
"step": 1780
},
{
"epoch": 0.55,
"grad_norm": 4.660920143127441,
"learning_rate": 2.1888633886093017e-05,
"loss": 2.092,
"step": 1785
},
{
"epoch": 0.56,
"grad_norm": 3.143937587738037,
"learning_rate": 2.176386727183676e-05,
"loss": 1.7624,
"step": 1790
},
{
"epoch": 0.56,
"grad_norm": 4.354220390319824,
"learning_rate": 2.1639182579357846e-05,
"loss": 1.8961,
"step": 1795
},
{
"epoch": 0.56,
"grad_norm": 5.339317798614502,
"learning_rate": 2.151458296501374e-05,
"loss": 1.9361,
"step": 1800
},
{
"epoch": 0.56,
"grad_norm": 3.080310344696045,
"learning_rate": 2.139007158300814e-05,
"loss": 1.8459,
"step": 1805
},
{
"epoch": 0.56,
"grad_norm": 3.5018744468688965,
"learning_rate": 2.126565158531119e-05,
"loss": 1.9086,
"step": 1810
},
{
"epoch": 0.56,
"grad_norm": 5.1605072021484375,
"learning_rate": 2.1141326121579638e-05,
"loss": 1.9395,
"step": 1815
},
{
"epoch": 0.56,
"grad_norm": 4.0767998695373535,
"learning_rate": 2.1017098339077176e-05,
"loss": 2.005,
"step": 1820
},
{
"epoch": 0.57,
"grad_norm": 4.308762073516846,
"learning_rate": 2.0892971382594694e-05,
"loss": 1.8772,
"step": 1825
},
{
"epoch": 0.57,
"grad_norm": 3.049802541732788,
"learning_rate": 2.0768948394370702e-05,
"loss": 1.9591,
"step": 1830
},
{
"epoch": 0.57,
"grad_norm": 3.853872060775757,
"learning_rate": 2.0645032514011773e-05,
"loss": 1.8408,
"step": 1835
},
{
"epoch": 0.57,
"grad_norm": 3.8186545372009277,
"learning_rate": 2.052122687841311e-05,
"loss": 1.9765,
"step": 1840
},
{
"epoch": 0.57,
"grad_norm": 3.238193988800049,
"learning_rate": 2.0397534621679075e-05,
"loss": 1.931,
"step": 1845
},
{
"epoch": 0.57,
"grad_norm": 3.316253662109375,
"learning_rate": 2.0273958875043874e-05,
"loss": 1.9787,
"step": 1850
},
{
"epoch": 0.58,
"grad_norm": 4.303181171417236,
"learning_rate": 2.0150502766792298e-05,
"loss": 1.9991,
"step": 1855
},
{
"epoch": 0.58,
"grad_norm": 3.6812000274658203,
"learning_rate": 2.0027169422180546e-05,
"loss": 1.8782,
"step": 1860
},
{
"epoch": 0.58,
"grad_norm": 5.033133506774902,
"learning_rate": 1.990396196335706e-05,
"loss": 1.8406,
"step": 1865
},
{
"epoch": 0.58,
"grad_norm": 4.612210750579834,
"learning_rate": 1.9780883509283526e-05,
"loss": 2.0226,
"step": 1870
},
{
"epoch": 0.58,
"grad_norm": 4.63312292098999,
"learning_rate": 1.9657937175655922e-05,
"loss": 1.9403,
"step": 1875
},
{
"epoch": 0.58,
"grad_norm": 3.5263733863830566,
"learning_rate": 1.9535126074825647e-05,
"loss": 1.9812,
"step": 1880
},
{
"epoch": 0.58,
"grad_norm": 3.100794792175293,
"learning_rate": 1.941245331572068e-05,
"loss": 1.8332,
"step": 1885
},
{
"epoch": 0.59,
"grad_norm": 4.041380405426025,
"learning_rate": 1.9289922003766962e-05,
"loss": 1.9352,
"step": 1890
},
{
"epoch": 0.59,
"grad_norm": 3.329756736755371,
"learning_rate": 1.9167535240809703e-05,
"loss": 1.9084,
"step": 1895
},
{
"epoch": 0.59,
"grad_norm": 3.596053123474121,
"learning_rate": 1.904529612503493e-05,
"loss": 1.8971,
"step": 1900
},
{
"epoch": 0.59,
"grad_norm": 3.9134511947631836,
"learning_rate": 1.8923207750890992e-05,
"loss": 2.0642,
"step": 1905
},
{
"epoch": 0.59,
"grad_norm": 3.707994222640991,
"learning_rate": 1.8801273209010284e-05,
"loss": 1.8276,
"step": 1910
},
{
"epoch": 0.59,
"grad_norm": 4.338993072509766,
"learning_rate": 1.8679495586130952e-05,
"loss": 1.9576,
"step": 1915
},
{
"epoch": 0.6,
"grad_norm": 3.758429765701294,
"learning_rate": 1.8557877965018817e-05,
"loss": 1.9956,
"step": 1920
},
{
"epoch": 0.6,
"grad_norm": 3.7816905975341797,
"learning_rate": 1.843642342438928e-05,
"loss": 1.9079,
"step": 1925
},
{
"epoch": 0.6,
"grad_norm": 5.009194850921631,
"learning_rate": 1.8315135038829406e-05,
"loss": 1.9509,
"step": 1930
},
{
"epoch": 0.6,
"grad_norm": 3.4465157985687256,
"learning_rate": 1.8194015878720084e-05,
"loss": 2.0019,
"step": 1935
},
{
"epoch": 0.6,
"grad_norm": 3.6948273181915283,
"learning_rate": 1.8073069010158334e-05,
"loss": 2.0043,
"step": 1940
},
{
"epoch": 0.6,
"grad_norm": 3.3850791454315186,
"learning_rate": 1.795229749487965e-05,
"loss": 1.9031,
"step": 1945
},
{
"epoch": 0.61,
"grad_norm": 5.051716327667236,
"learning_rate": 1.7831704390180498e-05,
"loss": 1.8958,
"step": 1950
},
{
"epoch": 0.61,
"grad_norm": 2.8910887241363525,
"learning_rate": 1.7711292748840943e-05,
"loss": 1.8856,
"step": 1955
},
{
"epoch": 0.61,
"grad_norm": 3.8123810291290283,
"learning_rate": 1.759106561904737e-05,
"loss": 1.8229,
"step": 1960
},
{
"epoch": 0.61,
"grad_norm": 4.154626369476318,
"learning_rate": 1.747102604431528e-05,
"loss": 1.9509,
"step": 1965
},
{
"epoch": 0.61,
"grad_norm": 4.20812463760376,
"learning_rate": 1.7351177063412276e-05,
"loss": 1.9501,
"step": 1970
},
{
"epoch": 0.61,
"grad_norm": 3.2041704654693604,
"learning_rate": 1.723152171028114e-05,
"loss": 1.9888,
"step": 1975
},
{
"epoch": 0.61,
"grad_norm": 3.133105754852295,
"learning_rate": 1.7112063013963044e-05,
"loss": 2.0086,
"step": 1980
},
{
"epoch": 0.62,
"grad_norm": 4.227274417877197,
"learning_rate": 1.6992803998520794e-05,
"loss": 1.9373,
"step": 1985
},
{
"epoch": 0.62,
"grad_norm": 3.2231645584106445,
"learning_rate": 1.6873747682962394e-05,
"loss": 1.7439,
"step": 1990
},
{
"epoch": 0.62,
"grad_norm": 2.90924334526062,
"learning_rate": 1.67548970811645e-05,
"loss": 1.8914,
"step": 1995
},
{
"epoch": 0.62,
"grad_norm": 3.2363147735595703,
"learning_rate": 1.6636255201796237e-05,
"loss": 1.9674,
"step": 2000
},
{
"epoch": 0.62,
"grad_norm": 4.925014019012451,
"learning_rate": 1.6517825048242936e-05,
"loss": 1.8693,
"step": 2005
},
{
"epoch": 0.62,
"grad_norm": 3.2326242923736572,
"learning_rate": 1.6399609618530183e-05,
"loss": 1.8776,
"step": 2010
},
{
"epoch": 0.63,
"grad_norm": 3.984081506729126,
"learning_rate": 1.6281611905247855e-05,
"loss": 1.881,
"step": 2015
},
{
"epoch": 0.63,
"grad_norm": 3.8823959827423096,
"learning_rate": 1.6163834895474445e-05,
"loss": 1.9769,
"step": 2020
},
{
"epoch": 0.63,
"grad_norm": 4.131060600280762,
"learning_rate": 1.604628157070136e-05,
"loss": 1.9811,
"step": 2025
},
{
"epoch": 0.63,
"grad_norm": 4.516271591186523,
"learning_rate": 1.5928954906757515e-05,
"loss": 1.995,
"step": 2030
},
{
"epoch": 0.63,
"grad_norm": 3.9269816875457764,
"learning_rate": 1.5811857873733942e-05,
"loss": 1.8224,
"step": 2035
},
{
"epoch": 0.63,
"grad_norm": 3.7068333625793457,
"learning_rate": 1.5694993435908646e-05,
"loss": 1.8799,
"step": 2040
},
{
"epoch": 0.63,
"grad_norm": 4.0933756828308105,
"learning_rate": 1.557836455167157e-05,
"loss": 1.9251,
"step": 2045
},
{
"epoch": 0.64,
"grad_norm": 4.189598560333252,
"learning_rate": 1.546197417344965e-05,
"loss": 2.032,
"step": 2050
},
{
"epoch": 0.64,
"grad_norm": 3.609545946121216,
"learning_rate": 1.5345825247632135e-05,
"loss": 1.9399,
"step": 2055
},
{
"epoch": 0.64,
"grad_norm": 3.9929699897766113,
"learning_rate": 1.5229920714495948e-05,
"loss": 1.8803,
"step": 2060
},
{
"epoch": 0.64,
"grad_norm": 3.578582286834717,
"learning_rate": 1.5114263508131327e-05,
"loss": 1.8303,
"step": 2065
},
{
"epoch": 0.64,
"grad_norm": 3.167156457901001,
"learning_rate": 1.499885655636746e-05,
"loss": 2.0741,
"step": 2070
},
{
"epoch": 0.64,
"grad_norm": 3.376950263977051,
"learning_rate": 1.4883702780698433e-05,
"loss": 1.8935,
"step": 2075
},
{
"epoch": 0.65,
"grad_norm": 7.022952556610107,
"learning_rate": 1.4768805096209231e-05,
"loss": 1.9285,
"step": 2080
},
{
"epoch": 0.65,
"grad_norm": 4.465900897979736,
"learning_rate": 1.4654166411502002e-05,
"loss": 1.9464,
"step": 2085
},
{
"epoch": 0.65,
"grad_norm": 2.990349292755127,
"learning_rate": 1.4539789628622347e-05,
"loss": 1.8252,
"step": 2090
},
{
"epoch": 0.65,
"grad_norm": 3.1683619022369385,
"learning_rate": 1.4425677642985924e-05,
"loss": 1.8346,
"step": 2095
},
{
"epoch": 0.65,
"grad_norm": 3.782841444015503,
"learning_rate": 1.4311833343305097e-05,
"loss": 1.8584,
"step": 2100
},
{
"epoch": 0.65,
"grad_norm": 3.302788257598877,
"learning_rate": 1.4198259611515886e-05,
"loss": 1.9615,
"step": 2105
},
{
"epoch": 0.65,
"grad_norm": 4.179065227508545,
"learning_rate": 1.4084959322704893e-05,
"loss": 2.0387,
"step": 2110
},
{
"epoch": 0.66,
"grad_norm": 3.3860225677490234,
"learning_rate": 1.3971935345036657e-05,
"loss": 1.7267,
"step": 2115
},
{
"epoch": 0.66,
"grad_norm": 4.326015472412109,
"learning_rate": 1.3859190539680927e-05,
"loss": 1.9828,
"step": 2120
},
{
"epoch": 0.66,
"grad_norm": 3.4805123805999756,
"learning_rate": 1.3746727760740328e-05,
"loss": 1.8873,
"step": 2125
},
{
"epoch": 0.66,
"grad_norm": 2.8176207542419434,
"learning_rate": 1.3634549855178028e-05,
"loss": 2.0302,
"step": 2130
},
{
"epoch": 0.66,
"grad_norm": 2.756837844848633,
"learning_rate": 1.3522659662745723e-05,
"loss": 1.9893,
"step": 2135
},
{
"epoch": 0.66,
"grad_norm": 4.258969783782959,
"learning_rate": 1.3411060015911734e-05,
"loss": 1.847,
"step": 2140
},
{
"epoch": 0.67,
"grad_norm": 5.707541465759277,
"learning_rate": 1.32997537397893e-05,
"loss": 1.8802,
"step": 2145
},
{
"epoch": 0.67,
"grad_norm": 3.7876532077789307,
"learning_rate": 1.3188743652065083e-05,
"loss": 1.9015,
"step": 2150
},
{
"epoch": 0.67,
"grad_norm": 3.91947340965271,
"learning_rate": 1.3078032562927788e-05,
"loss": 1.8293,
"step": 2155
},
{
"epoch": 0.67,
"grad_norm": 4.129434108734131,
"learning_rate": 1.296762327499707e-05,
"loss": 1.786,
"step": 2160
},
{
"epoch": 0.67,
"grad_norm": 3.0605030059814453,
"learning_rate": 1.2857518583252587e-05,
"loss": 1.9754,
"step": 2165
},
{
"epoch": 0.67,
"grad_norm": 3.6712772846221924,
"learning_rate": 1.2747721274963214e-05,
"loss": 1.8931,
"step": 2170
},
{
"epoch": 0.67,
"grad_norm": 3.6777453422546387,
"learning_rate": 1.2638234129616488e-05,
"loss": 1.9122,
"step": 2175
},
{
"epoch": 0.68,
"grad_norm": 3.1498284339904785,
"learning_rate": 1.2529059918848296e-05,
"loss": 1.8041,
"step": 2180
},
{
"epoch": 0.68,
"grad_norm": 3.7665841579437256,
"learning_rate": 1.2420201406372662e-05,
"loss": 1.7802,
"step": 2185
},
{
"epoch": 0.68,
"grad_norm": 3.147603988647461,
"learning_rate": 1.2311661347911783e-05,
"loss": 1.9658,
"step": 2190
},
{
"epoch": 0.68,
"grad_norm": 3.327116012573242,
"learning_rate": 1.220344249112629e-05,
"loss": 1.8795,
"step": 2195
},
{
"epoch": 0.68,
"grad_norm": 3.689382553100586,
"learning_rate": 1.2095547575545686e-05,
"loss": 1.942,
"step": 2200
},
{
"epoch": 0.68,
"grad_norm": 3.967803955078125,
"learning_rate": 1.1987979332499011e-05,
"loss": 1.8653,
"step": 2205
},
{
"epoch": 0.69,
"grad_norm": 3.113976001739502,
"learning_rate": 1.1880740485045649e-05,
"loss": 1.8737,
"step": 2210
},
{
"epoch": 0.69,
"grad_norm": 3.3383049964904785,
"learning_rate": 1.1773833747906471e-05,
"loss": 1.9163,
"step": 2215
},
{
"epoch": 0.69,
"grad_norm": 3.971327304840088,
"learning_rate": 1.1667261827395035e-05,
"loss": 2.0355,
"step": 2220
},
{
"epoch": 0.69,
"grad_norm": 3.8071823120117188,
"learning_rate": 1.1561027421349117e-05,
"loss": 1.7467,
"step": 2225
},
{
"epoch": 0.69,
"grad_norm": 3.7409048080444336,
"learning_rate": 1.145513321906243e-05,
"loss": 1.847,
"step": 2230
},
{
"epoch": 0.69,
"grad_norm": 5.195309162139893,
"learning_rate": 1.1349581901216514e-05,
"loss": 2.0805,
"step": 2235
},
{
"epoch": 0.7,
"grad_norm": 2.922433376312256,
"learning_rate": 1.1244376139812867e-05,
"loss": 1.7545,
"step": 2240
},
{
"epoch": 0.7,
"grad_norm": 5.311805725097656,
"learning_rate": 1.1139518598105358e-05,
"loss": 1.9093,
"step": 2245
},
{
"epoch": 0.7,
"grad_norm": 3.9856057167053223,
"learning_rate": 1.1035011930532771e-05,
"loss": 1.8777,
"step": 2250
},
{
"epoch": 0.7,
"grad_norm": 3.006605386734009,
"learning_rate": 1.0930858782651585e-05,
"loss": 1.9631,
"step": 2255
},
{
"epoch": 0.7,
"grad_norm": 3.3158912658691406,
"learning_rate": 1.0827061791069045e-05,
"loss": 1.8097,
"step": 2260
},
{
"epoch": 0.7,
"grad_norm": 4.086146831512451,
"learning_rate": 1.0723623583376392e-05,
"loss": 1.9171,
"step": 2265
},
{
"epoch": 0.7,
"grad_norm": 4.822931289672852,
"learning_rate": 1.062054677808238e-05,
"loss": 2.1704,
"step": 2270
},
{
"epoch": 0.71,
"grad_norm": 3.8096282482147217,
"learning_rate": 1.0517833984546923e-05,
"loss": 1.9599,
"step": 2275
},
{
"epoch": 0.71,
"grad_norm": 5.096799373626709,
"learning_rate": 1.0415487802915133e-05,
"loss": 1.9463,
"step": 2280
},
{
"epoch": 0.71,
"grad_norm": 3.9913666248321533,
"learning_rate": 1.0313510824051393e-05,
"loss": 1.9045,
"step": 2285
},
{
"epoch": 0.71,
"grad_norm": 3.0718228816986084,
"learning_rate": 1.0211905629473866e-05,
"loss": 1.7678,
"step": 2290
},
{
"epoch": 0.71,
"grad_norm": 5.186037540435791,
"learning_rate": 1.0110674791289079e-05,
"loss": 1.9355,
"step": 2295
},
{
"epoch": 0.71,
"grad_norm": 3.739786386489868,
"learning_rate": 1.0009820872126835e-05,
"loss": 2.015,
"step": 2300
},
{
"epoch": 0.72,
"grad_norm": 3.730051040649414,
"learning_rate": 9.909346425075335e-06,
"loss": 1.9639,
"step": 2305
},
{
"epoch": 0.72,
"grad_norm": 4.366475582122803,
"learning_rate": 9.809253993616569e-06,
"loss": 2.1142,
"step": 2310
},
{
"epoch": 0.72,
"grad_norm": 2.9198176860809326,
"learning_rate": 9.709546111561913e-06,
"loss": 1.8616,
"step": 2315
},
{
"epoch": 0.72,
"grad_norm": 3.5179014205932617,
"learning_rate": 9.610225302987961e-06,
"loss": 1.8651,
"step": 2320
},
{
"epoch": 0.72,
"grad_norm": 3.9303548336029053,
"learning_rate": 9.511294082172653e-06,
"loss": 2.0002,
"step": 2325
},
{
"epoch": 0.72,
"grad_norm": 3.435821771621704,
"learning_rate": 9.412754953531663e-06,
"loss": 1.8817,
"step": 2330
},
{
"epoch": 0.72,
"grad_norm": 4.4535932540893555,
"learning_rate": 9.314610411554925e-06,
"loss": 1.8213,
"step": 2335
},
{
"epoch": 0.73,
"grad_norm": 3.345769166946411,
"learning_rate": 9.216862940743529e-06,
"loss": 1.8374,
"step": 2340
},
{
"epoch": 0.73,
"grad_norm": 4.314777851104736,
"learning_rate": 9.119515015546836e-06,
"loss": 2.0438,
"step": 2345
},
{
"epoch": 0.73,
"grad_norm": 4.599632263183594,
"learning_rate": 9.02256910029983e-06,
"loss": 1.8459,
"step": 2350
},
{
"epoch": 0.73,
"grad_norm": 3.590637683868408,
"learning_rate": 8.926027649160704e-06,
"loss": 1.8009,
"step": 2355
},
{
"epoch": 0.73,
"grad_norm": 3.119189500808716,
"learning_rate": 8.82989310604877e-06,
"loss": 1.9651,
"step": 2360
},
{
"epoch": 0.73,
"grad_norm": 3.1386303901672363,
"learning_rate": 8.734167904582566e-06,
"loss": 1.7791,
"step": 2365
},
{
"epoch": 0.74,
"grad_norm": 3.6528995037078857,
"learning_rate": 8.638854468018296e-06,
"loss": 1.9259,
"step": 2370
},
{
"epoch": 0.74,
"grad_norm": 4.182424545288086,
"learning_rate": 8.543955209188412e-06,
"loss": 1.8853,
"step": 2375
},
{
"epoch": 0.74,
"grad_norm": 5.662861347198486,
"learning_rate": 8.449472530440612e-06,
"loss": 1.9349,
"step": 2380
},
{
"epoch": 0.74,
"grad_norm": 4.169982433319092,
"learning_rate": 8.355408823576951e-06,
"loss": 1.9554,
"step": 2385
},
{
"epoch": 0.74,
"grad_norm": 3.808478832244873,
"learning_rate": 8.261766469793373e-06,
"loss": 1.8309,
"step": 2390
},
{
"epoch": 0.74,
"grad_norm": 3.801201343536377,
"learning_rate": 8.168547839619352e-06,
"loss": 1.8714,
"step": 2395
},
{
"epoch": 0.74,
"grad_norm": 3.8212218284606934,
"learning_rate": 8.075755292857933e-06,
"loss": 1.844,
"step": 2400
},
{
"epoch": 0.75,
"grad_norm": 4.7147650718688965,
"learning_rate": 7.983391178525979e-06,
"loss": 1.9004,
"step": 2405
},
{
"epoch": 0.75,
"grad_norm": 3.4768807888031006,
"learning_rate": 7.89145783479471e-06,
"loss": 1.947,
"step": 2410
},
{
"epoch": 0.75,
"grad_norm": 3.307199478149414,
"learning_rate": 7.799957588930523e-06,
"loss": 1.9069,
"step": 2415
},
{
"epoch": 0.75,
"grad_norm": 4.613658905029297,
"learning_rate": 7.708892757236047e-06,
"loss": 1.917,
"step": 2420
},
{
"epoch": 0.75,
"grad_norm": 2.8293955326080322,
"learning_rate": 7.618265644991535e-06,
"loss": 1.8854,
"step": 2425
},
{
"epoch": 0.75,
"grad_norm": 3.302823066711426,
"learning_rate": 7.528078546396481e-06,
"loss": 2.0073,
"step": 2430
},
{
"epoch": 0.76,
"grad_norm": 2.862478494644165,
"learning_rate": 7.438333744511591e-06,
"loss": 1.9243,
"step": 2435
},
{
"epoch": 0.76,
"grad_norm": 4.1902899742126465,
"learning_rate": 7.3490335112009225e-06,
"loss": 1.8696,
"step": 2440
},
{
"epoch": 0.76,
"grad_norm": 3.4848709106445312,
"learning_rate": 7.260180107074438e-06,
"loss": 2.0236,
"step": 2445
},
{
"epoch": 0.76,
"grad_norm": 2.9219446182250977,
"learning_rate": 7.171775781430712e-06,
"loss": 1.9218,
"step": 2450
},
{
"epoch": 0.76,
"grad_norm": 3.458622694015503,
"learning_rate": 7.083822772200058e-06,
"loss": 1.9155,
"step": 2455
},
{
"epoch": 0.76,
"grad_norm": 3.5859556198120117,
"learning_rate": 6.996323305887822e-06,
"loss": 1.9701,
"step": 2460
},
{
"epoch": 0.76,
"grad_norm": 3.7645373344421387,
"learning_rate": 6.909279597518048e-06,
"loss": 1.9555,
"step": 2465
},
{
"epoch": 0.77,
"grad_norm": 5.934003829956055,
"learning_rate": 6.822693850577385e-06,
"loss": 1.9963,
"step": 2470
},
{
"epoch": 0.77,
"grad_norm": 4.152750015258789,
"learning_rate": 6.7365682569593496e-06,
"loss": 1.8777,
"step": 2475
},
{
"epoch": 0.77,
"grad_norm": 3.7498714923858643,
"learning_rate": 6.6509049969087715e-06,
"loss": 1.9313,
"step": 2480
},
{
"epoch": 0.77,
"grad_norm": 2.86311411857605,
"learning_rate": 6.565706238966671e-06,
"loss": 1.7692,
"step": 2485
},
{
"epoch": 0.77,
"grad_norm": 4.296627521514893,
"learning_rate": 6.480974139915297e-06,
"loss": 1.942,
"step": 2490
},
{
"epoch": 0.77,
"grad_norm": 3.102341890335083,
"learning_rate": 6.396710844723597e-06,
"loss": 1.9011,
"step": 2495
},
{
"epoch": 0.78,
"grad_norm": 4.467423439025879,
"learning_rate": 6.312918486492855e-06,
"loss": 1.8276,
"step": 2500
},
{
"epoch": 0.78,
"grad_norm": 4.662038803100586,
"learning_rate": 6.229599186402729e-06,
"loss": 1.8927,
"step": 2505
},
{
"epoch": 0.78,
"grad_norm": 6.194324493408203,
"learning_rate": 6.146755053657541e-06,
"loss": 1.8046,
"step": 2510
},
{
"epoch": 0.78,
"grad_norm": 3.2271151542663574,
"learning_rate": 6.064388185432898e-06,
"loss": 1.7897,
"step": 2515
},
{
"epoch": 0.78,
"grad_norm": 3.0152978897094727,
"learning_rate": 5.9825006668225905e-06,
"loss": 1.8203,
"step": 2520
},
{
"epoch": 0.78,
"grad_norm": 3.5677027702331543,
"learning_rate": 5.901094570785798e-06,
"loss": 1.9312,
"step": 2525
},
{
"epoch": 0.79,
"grad_norm": 3.464501142501831,
"learning_rate": 5.820171958094628e-06,
"loss": 1.9227,
"step": 2530
},
{
"epoch": 0.79,
"grad_norm": 4.184050559997559,
"learning_rate": 5.73973487728196e-06,
"loss": 1.8542,
"step": 2535
},
{
"epoch": 0.79,
"grad_norm": 3.7280945777893066,
"learning_rate": 5.659785364589556e-06,
"loss": 2.0387,
"step": 2540
},
{
"epoch": 0.79,
"grad_norm": 3.863532543182373,
"learning_rate": 5.580325443916526e-06,
"loss": 1.8824,
"step": 2545
},
{
"epoch": 0.79,
"grad_norm": 3.403118133544922,
"learning_rate": 5.501357126768117e-06,
"loss": 1.8999,
"step": 2550
},
{
"epoch": 0.79,
"grad_norm": 3.203178644180298,
"learning_rate": 5.422882412204766e-06,
"loss": 2.0521,
"step": 2555
},
{
"epoch": 0.79,
"grad_norm": 3.8374898433685303,
"learning_rate": 5.344903286791494e-06,
"loss": 1.8838,
"step": 2560
},
{
"epoch": 0.8,
"grad_norm": 3.570945978164673,
"learning_rate": 5.267421724547627e-06,
"loss": 1.9615,
"step": 2565
},
{
"epoch": 0.8,
"grad_norm": 6.397089004516602,
"learning_rate": 5.1904396868968195e-06,
"loss": 1.9624,
"step": 2570
},
{
"epoch": 0.8,
"grad_norm": 3.234090805053711,
"learning_rate": 5.113959122617412e-06,
"loss": 1.9239,
"step": 2575
},
{
"epoch": 0.8,
"grad_norm": 3.1682183742523193,
"learning_rate": 5.037981967793076e-06,
"loss": 1.8498,
"step": 2580
},
{
"epoch": 0.8,
"grad_norm": 4.0839152336120605,
"learning_rate": 4.9625101457638376e-06,
"loss": 1.9856,
"step": 2585
},
{
"epoch": 0.8,
"grad_norm": 3.629542589187622,
"learning_rate": 4.887545567077337e-06,
"loss": 1.8867,
"step": 2590
},
{
"epoch": 0.81,
"grad_norm": 4.0674638748168945,
"learning_rate": 4.8130901294405255e-06,
"loss": 2.0402,
"step": 2595
},
{
"epoch": 0.81,
"grad_norm": 3.093059539794922,
"learning_rate": 4.739145717671572e-06,
"loss": 1.9107,
"step": 2600
},
{
"epoch": 0.81,
"grad_norm": 6.425740718841553,
"learning_rate": 4.665714203652177e-06,
"loss": 1.8893,
"step": 2605
},
{
"epoch": 0.81,
"grad_norm": 3.764960765838623,
"learning_rate": 4.592797446280178e-06,
"loss": 1.8649,
"step": 2610
},
{
"epoch": 0.81,
"grad_norm": 3.2027156352996826,
"learning_rate": 4.520397291422501e-06,
"loss": 1.991,
"step": 2615
},
{
"epoch": 0.81,
"grad_norm": 4.535457134246826,
"learning_rate": 4.448515571868434e-06,
"loss": 1.8798,
"step": 2620
},
{
"epoch": 0.81,
"grad_norm": 3.6848881244659424,
"learning_rate": 4.3771541072832045e-06,
"loss": 1.9349,
"step": 2625
},
{
"epoch": 0.82,
"grad_norm": 3.817534923553467,
"learning_rate": 4.306314704161937e-06,
"loss": 1.8637,
"step": 2630
},
{
"epoch": 0.82,
"grad_norm": 3.4655098915100098,
"learning_rate": 4.23599915578394e-06,
"loss": 1.8615,
"step": 2635
},
{
"epoch": 0.82,
"grad_norm": 2.829066276550293,
"learning_rate": 4.16620924216726e-06,
"loss": 1.7928,
"step": 2640
},
{
"epoch": 0.82,
"grad_norm": 4.525213241577148,
"learning_rate": 4.096946730023662e-06,
"loss": 1.903,
"step": 2645
},
{
"epoch": 0.82,
"grad_norm": 3.8306119441986084,
"learning_rate": 4.028213372713904e-06,
"loss": 1.9473,
"step": 2650
},
{
"epoch": 0.82,
"grad_norm": 4.448178768157959,
"learning_rate": 3.960010910203319e-06,
"loss": 1.959,
"step": 2655
},
{
"epoch": 0.83,
"grad_norm": 3.6487441062927246,
"learning_rate": 3.892341069017808e-06,
"loss": 1.9932,
"step": 2660
},
{
"epoch": 0.83,
"grad_norm": 3.487689256668091,
"learning_rate": 3.825205562200101e-06,
"loss": 1.9578,
"step": 2665
},
{
"epoch": 0.83,
"grad_norm": 3.0234782695770264,
"learning_rate": 3.75860608926642e-06,
"loss": 1.9083,
"step": 2670
},
{
"epoch": 0.83,
"grad_norm": 3.328275203704834,
"learning_rate": 3.69254433616342e-06,
"loss": 2.0128,
"step": 2675
},
{
"epoch": 0.83,
"grad_norm": 2.9996497631073,
"learning_rate": 3.627021975225553e-06,
"loss": 1.633,
"step": 2680
},
{
"epoch": 0.83,
"grad_norm": 3.9526045322418213,
"learning_rate": 3.562040665132715e-06,
"loss": 1.8948,
"step": 2685
},
{
"epoch": 0.83,
"grad_norm": 4.027220249176025,
"learning_rate": 3.4976020508682344e-06,
"loss": 1.8918,
"step": 2690
},
{
"epoch": 0.84,
"grad_norm": 4.6429829597473145,
"learning_rate": 3.4337077636772547e-06,
"loss": 1.8865,
"step": 2695
},
{
"epoch": 0.84,
"grad_norm": 4.5367865562438965,
"learning_rate": 3.3703594210254487e-06,
"loss": 1.895,
"step": 2700
},
{
"epoch": 0.84,
"grad_norm": 3.4687774181365967,
"learning_rate": 3.3075586265580494e-06,
"loss": 1.8908,
"step": 2705
},
{
"epoch": 0.84,
"grad_norm": 4.654914855957031,
"learning_rate": 3.24530697005925e-06,
"loss": 1.7785,
"step": 2710
},
{
"epoch": 0.84,
"grad_norm": 4.516482353210449,
"learning_rate": 3.183606027411998e-06,
"loss": 1.7936,
"step": 2715
},
{
"epoch": 0.84,
"grad_norm": 4.209545135498047,
"learning_rate": 3.1224573605580648e-06,
"loss": 1.9851,
"step": 2720
},
{
"epoch": 0.85,
"grad_norm": 4.1666178703308105,
"learning_rate": 3.061862517458519e-06,
"loss": 1.858,
"step": 2725
},
{
"epoch": 0.85,
"grad_norm": 5.190033912658691,
"learning_rate": 3.001823032054532e-06,
"loss": 1.9802,
"step": 2730
},
{
"epoch": 0.85,
"grad_norm": 4.3511528968811035,
"learning_rate": 2.942340424228554e-06,
"loss": 1.9403,
"step": 2735
},
{
"epoch": 0.85,
"grad_norm": 4.630067348480225,
"learning_rate": 2.8834161997658565e-06,
"loss": 1.7726,
"step": 2740
},
{
"epoch": 0.85,
"grad_norm": 3.705087184906006,
"learning_rate": 2.825051850316371e-06,
"loss": 1.8286,
"step": 2745
},
{
"epoch": 0.85,
"grad_norm": 3.315842628479004,
"learning_rate": 2.767248853356971e-06,
"loss": 1.8397,
"step": 2750
},
{
"epoch": 0.85,
"grad_norm": 5.60033655166626,
"learning_rate": 2.710008672154035e-06,
"loss": 1.994,
"step": 2755
},
{
"epoch": 0.86,
"grad_norm": 4.465238571166992,
"learning_rate": 2.65333275572644e-06,
"loss": 1.9824,
"step": 2760
},
{
"epoch": 0.86,
"grad_norm": 3.8040528297424316,
"learning_rate": 2.5972225388088497e-06,
"loss": 1.8507,
"step": 2765
},
{
"epoch": 0.86,
"grad_norm": 3.2600059509277344,
"learning_rate": 2.5416794418154035e-06,
"loss": 1.992,
"step": 2770
},
{
"epoch": 0.86,
"grad_norm": 4.9075703620910645,
"learning_rate": 2.486704870803763e-06,
"loss": 1.8189,
"step": 2775
},
{
"epoch": 0.86,
"grad_norm": 4.047214508056641,
"learning_rate": 2.432300217439526e-06,
"loss": 1.9156,
"step": 2780
},
{
"epoch": 0.86,
"grad_norm": 4.082090854644775,
"learning_rate": 2.3784668589609814e-06,
"loss": 1.8582,
"step": 2785
},
{
"epoch": 0.87,
"grad_norm": 3.8980605602264404,
"learning_rate": 2.3252061581442496e-06,
"loss": 1.8418,
"step": 2790
},
{
"epoch": 0.87,
"grad_norm": 4.5113372802734375,
"learning_rate": 2.2725194632687795e-06,
"loss": 1.8942,
"step": 2795
},
{
"epoch": 0.87,
"grad_norm": 4.78348445892334,
"learning_rate": 2.220408108083244e-06,
"loss": 1.868,
"step": 2800
},
{
"epoch": 0.87,
"grad_norm": 3.327033281326294,
"learning_rate": 2.1688734117717295e-06,
"loss": 1.9177,
"step": 2805
},
{
"epoch": 0.87,
"grad_norm": 3.6453311443328857,
"learning_rate": 2.117916678920384e-06,
"loss": 1.8282,
"step": 2810
},
{
"epoch": 0.87,
"grad_norm": 3.0697853565216064,
"learning_rate": 2.0675391994843695e-06,
"loss": 1.8374,
"step": 2815
},
{
"epoch": 0.88,
"grad_norm": 3.6173019409179688,
"learning_rate": 2.017742248755225e-06,
"loss": 1.9797,
"step": 2820
},
{
"epoch": 0.88,
"grad_norm": 3.858684539794922,
"learning_rate": 1.9685270873285505e-06,
"loss": 1.9083,
"step": 2825
},
{
"epoch": 0.88,
"grad_norm": 3.6615593433380127,
"learning_rate": 1.9198949610721273e-06,
"loss": 2.0119,
"step": 2830
},
{
"epoch": 0.88,
"grad_norm": 4.125614643096924,
"learning_rate": 1.8718471010943623e-06,
"loss": 1.8927,
"step": 2835
},
{
"epoch": 0.88,
"grad_norm": 3.79669451713562,
"learning_rate": 1.8243847237131406e-06,
"loss": 1.8407,
"step": 2840
},
{
"epoch": 0.88,
"grad_norm": 3.5093576908111572,
"learning_rate": 1.7775090304250065e-06,
"loss": 1.9293,
"step": 2845
},
{
"epoch": 0.88,
"grad_norm": 3.6266543865203857,
"learning_rate": 1.7312212078747781e-06,
"loss": 1.6496,
"step": 2850
},
{
"epoch": 0.89,
"grad_norm": 4.086301326751709,
"learning_rate": 1.6855224278254812e-06,
"loss": 1.9496,
"step": 2855
},
{
"epoch": 0.89,
"grad_norm": 3.14742374420166,
"learning_rate": 1.6404138471286966e-06,
"loss": 1.8646,
"step": 2860
},
{
"epoch": 0.89,
"grad_norm": 2.868939161300659,
"learning_rate": 1.5958966076952992e-06,
"loss": 1.9593,
"step": 2865
},
{
"epoch": 0.89,
"grad_norm": 3.424562931060791,
"learning_rate": 1.5519718364665009e-06,
"loss": 1.7344,
"step": 2870
},
{
"epoch": 0.89,
"grad_norm": 3.9741764068603516,
"learning_rate": 1.5086406453853646e-06,
"loss": 1.7876,
"step": 2875
},
{
"epoch": 0.89,
"grad_norm": 4.209314346313477,
"learning_rate": 1.4659041313686366e-06,
"loss": 2.1263,
"step": 2880
},
{
"epoch": 0.9,
"grad_norm": 4.095180034637451,
"learning_rate": 1.4237633762789942e-06,
"loss": 1.7563,
"step": 2885
},
{
"epoch": 0.9,
"grad_norm": 4.4438066482543945,
"learning_rate": 1.3822194468976284e-06,
"loss": 1.8099,
"step": 2890
},
{
"epoch": 0.9,
"grad_norm": 4.844168663024902,
"learning_rate": 1.3412733948972688e-06,
"loss": 1.8867,
"step": 2895
},
{
"epoch": 0.9,
"grad_norm": 3.2806739807128906,
"learning_rate": 1.300926256815546e-06,
"loss": 1.9385,
"step": 2900
},
{
"epoch": 0.9,
"grad_norm": 3.7914087772369385,
"learning_rate": 1.2611790540287633e-06,
"loss": 1.7425,
"step": 2905
},
{
"epoch": 0.9,
"grad_norm": 4.138453960418701,
"learning_rate": 1.2220327927260161e-06,
"loss": 1.9172,
"step": 2910
},
{
"epoch": 0.9,
"grad_norm": 3.3346848487854004,
"learning_rate": 1.1834884638837613e-06,
"loss": 1.9754,
"step": 2915
},
{
"epoch": 0.91,
"grad_norm": 3.6204893589019775,
"learning_rate": 1.1455470432406829e-06,
"loss": 1.7101,
"step": 2920
},
{
"epoch": 0.91,
"grad_norm": 4.972575664520264,
"learning_rate": 1.108209491273035e-06,
"loss": 1.8861,
"step": 2925
},
{
"epoch": 0.91,
"grad_norm": 3.620809316635132,
"learning_rate": 1.0714767531702973e-06,
"loss": 1.8525,
"step": 2930
},
{
"epoch": 0.91,
"grad_norm": 3.33205509185791,
"learning_rate": 1.035349758811263e-06,
"loss": 1.8453,
"step": 2935
},
{
"epoch": 0.91,
"grad_norm": 3.7018685340881348,
"learning_rate": 9.998294227404863e-07,
"loss": 2.0806,
"step": 2940
},
{
"epoch": 0.91,
"grad_norm": 4.9941864013671875,
"learning_rate": 9.649166441451557e-07,
"loss": 1.94,
"step": 2945
},
{
"epoch": 0.92,
"grad_norm": 4.217085361480713,
"learning_rate": 9.306123068323097e-07,
"loss": 1.9168,
"step": 2950
},
{
"epoch": 0.92,
"grad_norm": 3.2208547592163086,
"learning_rate": 8.969172792064634e-07,
"loss": 1.8819,
"step": 2955
},
{
"epoch": 0.92,
"grad_norm": 3.9018375873565674,
"learning_rate": 8.638324142476284e-07,
"loss": 1.9311,
"step": 2960
},
{
"epoch": 0.92,
"grad_norm": 3.776543140411377,
"learning_rate": 8.313585494897385e-07,
"loss": 1.762,
"step": 2965
},
{
"epoch": 0.92,
"grad_norm": 6.1161603927612305,
"learning_rate": 7.994965069994142e-07,
"loss": 1.8604,
"step": 2970
},
{
"epoch": 0.92,
"grad_norm": 3.6044158935546875,
"learning_rate": 7.682470933551761e-07,
"loss": 1.7736,
"step": 2975
},
{
"epoch": 0.92,
"grad_norm": 4.38954496383667,
"learning_rate": 7.376110996270281e-07,
"loss": 1.9429,
"step": 2980
},
{
"epoch": 0.93,
"grad_norm": 4.361955165863037,
"learning_rate": 7.075893013564123e-07,
"loss": 1.8157,
"step": 2985
},
{
"epoch": 0.93,
"grad_norm": 3.799809217453003,
"learning_rate": 6.781824585365915e-07,
"loss": 1.9094,
"step": 2990
},
{
"epoch": 0.93,
"grad_norm": 4.269566059112549,
"learning_rate": 6.493913155934117e-07,
"loss": 1.9207,
"step": 2995
},
{
"epoch": 0.93,
"grad_norm": 4.451285362243652,
"learning_rate": 6.212166013664422e-07,
"loss": 1.6652,
"step": 3000
},
{
"epoch": 0.93,
"grad_norm": 3.91097092628479,
"learning_rate": 5.93659029090543e-07,
"loss": 1.9185,
"step": 3005
},
{
"epoch": 0.93,
"grad_norm": 3.952296257019043,
"learning_rate": 5.667192963778017e-07,
"loss": 1.7982,
"step": 3010
},
{
"epoch": 0.94,
"grad_norm": 3.8603575229644775,
"learning_rate": 5.403980851998669e-07,
"loss": 1.8665,
"step": 3015
},
{
"epoch": 0.94,
"grad_norm": 4.040564060211182,
"learning_rate": 5.146960618706981e-07,
"loss": 1.8744,
"step": 3020
},
{
"epoch": 0.94,
"grad_norm": 3.266788959503174,
"learning_rate": 4.896138770296876e-07,
"loss": 1.8463,
"step": 3025
},
{
"epoch": 0.94,
"grad_norm": 3.374309539794922,
"learning_rate": 4.6515216562519615e-07,
"loss": 1.8195,
"step": 3030
},
{
"epoch": 0.94,
"grad_norm": 3.7271621227264404,
"learning_rate": 4.41311546898468e-07,
"loss": 1.788,
"step": 3035
},
{
"epoch": 0.94,
"grad_norm": 3.1484320163726807,
"learning_rate": 4.180926243679689e-07,
"loss": 1.8316,
"step": 3040
},
{
"epoch": 0.94,
"grad_norm": 3.443974256515503,
"learning_rate": 3.954959858141066e-07,
"loss": 1.9071,
"step": 3045
},
{
"epoch": 0.95,
"grad_norm": 3.8171606063842773,
"learning_rate": 3.735222032643426e-07,
"loss": 2.1321,
"step": 3050
},
{
"epoch": 0.95,
"grad_norm": 3.141526699066162,
"learning_rate": 3.521718329787177e-07,
"loss": 1.8597,
"step": 3055
},
{
"epoch": 0.95,
"grad_norm": 3.848994255065918,
"learning_rate": 3.314454154357688e-07,
"loss": 1.9906,
"step": 3060
},
{
"epoch": 0.95,
"grad_norm": 3.9238314628601074,
"learning_rate": 3.1134347531884267e-07,
"loss": 1.9433,
"step": 3065
},
{
"epoch": 0.95,
"grad_norm": 4.169834136962891,
"learning_rate": 2.9186652150282603e-07,
"loss": 1.7679,
"step": 3070
},
{
"epoch": 0.95,
"grad_norm": 6.12147331237793,
"learning_rate": 2.7301504704125016e-07,
"loss": 1.6556,
"step": 3075
},
{
"epoch": 0.96,
"grad_norm": 3.5053157806396484,
"learning_rate": 2.547895291538177e-07,
"loss": 1.9142,
"step": 3080
},
{
"epoch": 0.96,
"grad_norm": 4.274362087249756,
"learning_rate": 2.371904292143151e-07,
"loss": 1.8754,
"step": 3085
},
{
"epoch": 0.96,
"grad_norm": 3.843151569366455,
"learning_rate": 2.2021819273894127e-07,
"loss": 1.7239,
"step": 3090
},
{
"epoch": 0.96,
"grad_norm": 3.5693886280059814,
"learning_rate": 2.0387324937502505e-07,
"loss": 1.8063,
"step": 3095
},
{
"epoch": 0.96,
"grad_norm": 4.155526161193848,
"learning_rate": 1.8815601289014496e-07,
"loss": 1.8008,
"step": 3100
},
{
"epoch": 0.96,
"grad_norm": 4.957355499267578,
"learning_rate": 1.730668811616598e-07,
"loss": 1.9108,
"step": 3105
},
{
"epoch": 0.97,
"grad_norm": 5.035935878753662,
"learning_rate": 1.5860623616664184e-07,
"loss": 2.0325,
"step": 3110
},
{
"epoch": 0.97,
"grad_norm": 4.176791667938232,
"learning_rate": 1.4477444397219542e-07,
"loss": 1.8947,
"step": 3115
},
{
"epoch": 0.97,
"grad_norm": 3.648829460144043,
"learning_rate": 1.3157185472619516e-07,
"loss": 1.8535,
"step": 3120
},
{
"epoch": 0.97,
"grad_norm": 3.8320178985595703,
"learning_rate": 1.1899880264842068e-07,
"loss": 1.8678,
"step": 3125
},
{
"epoch": 0.97,
"grad_norm": 3.046886682510376,
"learning_rate": 1.0705560602210784e-07,
"loss": 1.8263,
"step": 3130
},
{
"epoch": 0.97,
"grad_norm": 5.341119766235352,
"learning_rate": 9.574256718586639e-08,
"loss": 1.9319,
"step": 3135
},
{
"epoch": 0.97,
"grad_norm": 3.0084095001220703,
"learning_rate": 8.505997252605258e-08,
"loss": 1.7669,
"step": 3140
},
{
"epoch": 0.98,
"grad_norm": 3.5134646892547607,
"learning_rate": 7.500809246950569e-08,
"loss": 1.824,
"step": 3145
},
{
"epoch": 0.98,
"grad_norm": 3.576869249343872,
"learning_rate": 6.558718147670339e-08,
"loss": 1.8971,
"step": 3150
},
{
"epoch": 0.98,
"grad_norm": 3.1408050060272217,
"learning_rate": 5.679747803531699e-08,
"loss": 1.9365,
"step": 3155
},
{
"epoch": 0.98,
"grad_norm": 4.063467979431152,
"learning_rate": 4.863920465418836e-08,
"loss": 1.8272,
"step": 3160
},
{
"epoch": 0.98,
"grad_norm": 3.66452693939209,
"learning_rate": 4.111256785767903e-08,
"loss": 1.7885,
"step": 3165
},
{
"epoch": 0.98,
"grad_norm": 3.7975409030914307,
"learning_rate": 3.421775818045481e-08,
"loss": 1.879,
"step": 3170
},
{
"epoch": 0.99,
"grad_norm": 4.497860908508301,
"learning_rate": 2.7954950162656367e-08,
"loss": 1.828,
"step": 3175
},
{
"epoch": 0.99,
"grad_norm": 3.815382242202759,
"learning_rate": 2.2324302345483327e-08,
"loss": 1.9715,
"step": 3180
},
{
"epoch": 0.99,
"grad_norm": 5.165794849395752,
"learning_rate": 1.7325957267180782e-08,
"loss": 1.8856,
"step": 3185
},
{
"epoch": 0.99,
"grad_norm": 4.661296367645264,
"learning_rate": 1.2960041459425532e-08,
"loss": 1.9542,
"step": 3190
},
{
"epoch": 0.99,
"grad_norm": 4.152047157287598,
"learning_rate": 9.226665444136973e-09,
"loss": 1.9453,
"step": 3195
},
{
"epoch": 0.99,
"grad_norm": 3.161618232727051,
"learning_rate": 6.1259237306599e-09,
"loss": 1.7805,
"step": 3200
}
],
"logging_steps": 5,
"max_steps": 3222,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 4.797270917531566e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}