{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999207397622193,
"eval_steps": 500,
"global_step": 2838,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 54.60578280011217,
"learning_rate": 3.4482758620689656e-07,
"loss": 2.0852,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 42.54745106429926,
"learning_rate": 1.724137931034483e-06,
"loss": 2.0381,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 9.212853479586382,
"learning_rate": 3.448275862068966e-06,
"loss": 1.6632,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 5.314912275996213,
"learning_rate": 5.172413793103449e-06,
"loss": 1.3468,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 4.03433890035307,
"learning_rate": 6.896551724137932e-06,
"loss": 1.2538,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 2.6447411660183016,
"learning_rate": 8.620689655172414e-06,
"loss": 1.2604,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 2.1857065034640795,
"learning_rate": 9.999996872939885e-06,
"loss": 1.1935,
"step": 30
},
{
"epoch": 0.04,
"grad_norm": 1.7526909326796276,
"learning_rate": 9.999887426246524e-06,
"loss": 1.1939,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 1.5378704654179458,
"learning_rate": 9.999621630458743e-06,
"loss": 1.1626,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 1.3071739468687105,
"learning_rate": 9.999199493888118e-06,
"loss": 1.122,
"step": 45
},
{
"epoch": 0.05,
"grad_norm": 1.1982413397341172,
"learning_rate": 9.998621029735082e-06,
"loss": 1.1191,
"step": 50
},
{
"epoch": 0.06,
"grad_norm": 1.0956550608235818,
"learning_rate": 9.997886256088507e-06,
"loss": 1.123,
"step": 55
},
{
"epoch": 0.06,
"grad_norm": 1.0821639142212824,
"learning_rate": 9.996995195925152e-06,
"loss": 1.0751,
"step": 60
},
{
"epoch": 0.07,
"grad_norm": 1.0865870956103822,
"learning_rate": 9.995947877108933e-06,
"loss": 1.114,
"step": 65
},
{
"epoch": 0.07,
"grad_norm": 1.0582468505041362,
"learning_rate": 9.99474433239006e-06,
"loss": 1.0609,
"step": 70
},
{
"epoch": 0.08,
"grad_norm": 1.0556893008132,
"learning_rate": 9.993384599404001e-06,
"loss": 1.0861,
"step": 75
},
{
"epoch": 0.08,
"grad_norm": 1.053104954690895,
"learning_rate": 9.991868720670322e-06,
"loss": 1.0792,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 1.069931327549148,
"learning_rate": 9.990196743591341e-06,
"loss": 1.0722,
"step": 85
},
{
"epoch": 0.1,
"grad_norm": 1.0793525694046096,
"learning_rate": 9.988368720450656e-06,
"loss": 1.0561,
"step": 90
},
{
"epoch": 0.1,
"grad_norm": 1.0449093375062701,
"learning_rate": 9.986384708411507e-06,
"loss": 1.0675,
"step": 95
},
{
"epoch": 0.11,
"grad_norm": 0.9947694351867048,
"learning_rate": 9.984244769514988e-06,
"loss": 1.0395,
"step": 100
},
{
"epoch": 0.11,
"grad_norm": 1.0281907914775885,
"learning_rate": 9.981948970678107e-06,
"loss": 1.0455,
"step": 105
},
{
"epoch": 0.12,
"grad_norm": 1.0429607059919377,
"learning_rate": 9.979497383691695e-06,
"loss": 1.014,
"step": 110
},
{
"epoch": 0.12,
"grad_norm": 1.1551790238118493,
"learning_rate": 9.976890085218157e-06,
"loss": 1.0335,
"step": 115
},
{
"epoch": 0.13,
"grad_norm": 1.0958017153997939,
"learning_rate": 9.974127156789082e-06,
"loss": 1.0566,
"step": 120
},
{
"epoch": 0.13,
"grad_norm": 1.0545164230640043,
"learning_rate": 9.971208684802686e-06,
"loss": 1.0234,
"step": 125
},
{
"epoch": 0.14,
"grad_norm": 1.1119539198620567,
"learning_rate": 9.968134760521114e-06,
"loss": 0.9956,
"step": 130
},
{
"epoch": 0.14,
"grad_norm": 1.045532676163788,
"learning_rate": 9.964905480067585e-06,
"loss": 1.0103,
"step": 135
},
{
"epoch": 0.15,
"grad_norm": 1.0818099938062198,
"learning_rate": 9.96152094442339e-06,
"loss": 0.987,
"step": 140
},
{
"epoch": 0.15,
"grad_norm": 1.06916616510137,
"learning_rate": 9.957981259424724e-06,
"loss": 1.0189,
"step": 145
},
{
"epoch": 0.16,
"grad_norm": 1.1000812098052206,
"learning_rate": 9.954286535759394e-06,
"loss": 1.0025,
"step": 150
},
{
"epoch": 0.16,
"grad_norm": 1.0740685860653156,
"learning_rate": 9.950436888963337e-06,
"loss": 1.0394,
"step": 155
},
{
"epoch": 0.17,
"grad_norm": 1.0578416601226404,
"learning_rate": 9.946432439417021e-06,
"loss": 1.0419,
"step": 160
},
{
"epoch": 0.17,
"grad_norm": 1.1378367934770748,
"learning_rate": 9.942273312341679e-06,
"loss": 1.04,
"step": 165
},
{
"epoch": 0.18,
"grad_norm": 1.106141894903122,
"learning_rate": 9.937959637795389e-06,
"loss": 1.0112,
"step": 170
},
{
"epoch": 0.18,
"grad_norm": 1.0459501547982482,
"learning_rate": 9.93349155066901e-06,
"loss": 0.9959,
"step": 175
},
{
"epoch": 0.19,
"grad_norm": 1.1420602608538855,
"learning_rate": 9.928869190681964e-06,
"loss": 0.9952,
"step": 180
},
{
"epoch": 0.2,
"grad_norm": 1.0748374838181862,
"learning_rate": 9.924092702377863e-06,
"loss": 1.0094,
"step": 185
},
{
"epoch": 0.2,
"grad_norm": 1.0535011085546289,
"learning_rate": 9.919162235119996e-06,
"loss": 1.0054,
"step": 190
},
{
"epoch": 0.21,
"grad_norm": 1.0310625793824704,
"learning_rate": 9.91407794308665e-06,
"loss": 1.0117,
"step": 195
},
{
"epoch": 0.21,
"grad_norm": 1.0359842004906923,
"learning_rate": 9.908839985266297e-06,
"loss": 0.9982,
"step": 200
},
{
"epoch": 0.22,
"grad_norm": 1.084059570369228,
"learning_rate": 9.903448525452618e-06,
"loss": 1.0127,
"step": 205
},
{
"epoch": 0.22,
"grad_norm": 1.1227120329409497,
"learning_rate": 9.89790373223938e-06,
"loss": 1.048,
"step": 210
},
{
"epoch": 0.23,
"grad_norm": 1.0615021518173307,
"learning_rate": 9.892205779015167e-06,
"loss": 1.0021,
"step": 215
},
{
"epoch": 0.23,
"grad_norm": 1.070318683802529,
"learning_rate": 9.886354843957953e-06,
"loss": 1.0043,
"step": 220
},
{
"epoch": 0.24,
"grad_norm": 1.0419755132096296,
"learning_rate": 9.88035111002954e-06,
"loss": 0.9743,
"step": 225
},
{
"epoch": 0.24,
"grad_norm": 1.1796172322040084,
"learning_rate": 9.874194764969827e-06,
"loss": 0.9957,
"step": 230
},
{
"epoch": 0.25,
"grad_norm": 1.0933963352790785,
"learning_rate": 9.867886001290943e-06,
"loss": 0.9814,
"step": 235
},
{
"epoch": 0.25,
"grad_norm": 1.1066280030775704,
"learning_rate": 9.861425016271227e-06,
"loss": 0.9832,
"step": 240
},
{
"epoch": 0.26,
"grad_norm": 1.1138948008724274,
"learning_rate": 9.854812011949059e-06,
"loss": 0.9871,
"step": 245
},
{
"epoch": 0.26,
"grad_norm": 1.0644401239508805,
"learning_rate": 9.848047195116543e-06,
"loss": 0.9951,
"step": 250
},
{
"epoch": 0.27,
"grad_norm": 1.1880183474724784,
"learning_rate": 9.841130777313039e-06,
"loss": 0.9902,
"step": 255
},
{
"epoch": 0.27,
"grad_norm": 1.0747113009717828,
"learning_rate": 9.834062974818547e-06,
"loss": 0.9433,
"step": 260
},
{
"epoch": 0.28,
"grad_norm": 1.1442114734348945,
"learning_rate": 9.826844008646949e-06,
"loss": 0.9703,
"step": 265
},
{
"epoch": 0.29,
"grad_norm": 1.0895758630826766,
"learning_rate": 9.81947410453909e-06,
"loss": 1.0236,
"step": 270
},
{
"epoch": 0.29,
"grad_norm": 0.996075250542336,
"learning_rate": 9.811953492955728e-06,
"loss": 0.9577,
"step": 275
},
{
"epoch": 0.3,
"grad_norm": 1.1734623195649692,
"learning_rate": 9.80428240907032e-06,
"loss": 0.9752,
"step": 280
},
{
"epoch": 0.3,
"grad_norm": 1.282701051609298,
"learning_rate": 9.796461092761668e-06,
"loss": 0.987,
"step": 285
},
{
"epoch": 0.31,
"grad_norm": 1.0721992980205135,
"learning_rate": 9.788489788606423e-06,
"loss": 0.944,
"step": 290
},
{
"epoch": 0.31,
"grad_norm": 1.105694230535082,
"learning_rate": 9.780368745871438e-06,
"loss": 0.9804,
"step": 295
},
{
"epoch": 0.32,
"grad_norm": 1.1121587653939105,
"learning_rate": 9.772098218505963e-06,
"loss": 1.0099,
"step": 300
},
{
"epoch": 0.32,
"grad_norm": 1.1073177873687883,
"learning_rate": 9.763678465133712e-06,
"loss": 0.9887,
"step": 305
},
{
"epoch": 0.33,
"grad_norm": 1.1986141459298305,
"learning_rate": 9.755109749044781e-06,
"loss": 0.9749,
"step": 310
},
{
"epoch": 0.33,
"grad_norm": 1.0864391212895972,
"learning_rate": 9.7463923381874e-06,
"loss": 0.9767,
"step": 315
},
{
"epoch": 0.34,
"grad_norm": 1.0595953209575595,
"learning_rate": 9.737526505159564e-06,
"loss": 0.9297,
"step": 320
},
{
"epoch": 0.34,
"grad_norm": 1.083224438455533,
"learning_rate": 9.728512527200509e-06,
"loss": 0.9498,
"step": 325
},
{
"epoch": 0.35,
"grad_norm": 1.1306776282190978,
"learning_rate": 9.719350686182041e-06,
"loss": 0.982,
"step": 330
},
{
"epoch": 0.35,
"grad_norm": 1.07939319367538,
"learning_rate": 9.710041268599718e-06,
"loss": 0.9669,
"step": 335
},
{
"epoch": 0.36,
"grad_norm": 1.1100410279851476,
"learning_rate": 9.700584565563897e-06,
"loss": 0.956,
"step": 340
},
{
"epoch": 0.36,
"grad_norm": 1.0917533373255544,
"learning_rate": 9.690980872790627e-06,
"loss": 0.9878,
"step": 345
},
{
"epoch": 0.37,
"grad_norm": 1.1287494016251205,
"learning_rate": 9.681230490592403e-06,
"loss": 0.9604,
"step": 350
},
{
"epoch": 0.38,
"grad_norm": 1.0366025693971206,
"learning_rate": 9.671333723868773e-06,
"loss": 0.9809,
"step": 355
},
{
"epoch": 0.38,
"grad_norm": 1.1876939558601538,
"learning_rate": 9.66129088209681e-06,
"loss": 0.9324,
"step": 360
},
{
"epoch": 0.39,
"grad_norm": 1.1296469706806582,
"learning_rate": 9.651102279321429e-06,
"loss": 0.98,
"step": 365
},
{
"epoch": 0.39,
"grad_norm": 1.0920615981549329,
"learning_rate": 9.640768234145563e-06,
"loss": 0.9474,
"step": 370
},
{
"epoch": 0.4,
"grad_norm": 1.045353192143218,
"learning_rate": 9.630289069720213e-06,
"loss": 0.9416,
"step": 375
},
{
"epoch": 0.4,
"grad_norm": 1.0546831730532094,
"learning_rate": 9.619665113734327e-06,
"loss": 0.9583,
"step": 380
},
{
"epoch": 0.41,
"grad_norm": 1.120397617115956,
"learning_rate": 9.608896698404567e-06,
"loss": 0.9739,
"step": 385
},
{
"epoch": 0.41,
"grad_norm": 1.0897789727469696,
"learning_rate": 9.597984160464908e-06,
"loss": 0.9882,
"step": 390
},
{
"epoch": 0.42,
"grad_norm": 1.0655227440534312,
"learning_rate": 9.586927841156121e-06,
"loss": 0.973,
"step": 395
},
{
"epoch": 0.42,
"grad_norm": 1.024445190271631,
"learning_rate": 9.575728086215093e-06,
"loss": 0.9488,
"step": 400
},
{
"epoch": 0.43,
"grad_norm": 1.0957551302719917,
"learning_rate": 9.564385245864015e-06,
"loss": 0.9395,
"step": 405
},
{
"epoch": 0.43,
"grad_norm": 1.0348921383964815,
"learning_rate": 9.552899674799438e-06,
"loss": 0.9618,
"step": 410
},
{
"epoch": 0.44,
"grad_norm": 1.1320917241343242,
"learning_rate": 9.541271732181174e-06,
"loss": 0.9737,
"step": 415
},
{
"epoch": 0.44,
"grad_norm": 1.0955620287950987,
"learning_rate": 9.52950178162107e-06,
"loss": 0.9765,
"step": 420
},
{
"epoch": 0.45,
"grad_norm": 1.0865957472837047,
"learning_rate": 9.517590191171638e-06,
"loss": 0.9402,
"step": 425
},
{
"epoch": 0.45,
"grad_norm": 1.0608004961340336,
"learning_rate": 9.505537333314534e-06,
"loss": 0.938,
"step": 430
},
{
"epoch": 0.46,
"grad_norm": 1.0436288259170787,
"learning_rate": 9.493343584948931e-06,
"loss": 0.9495,
"step": 435
},
{
"epoch": 0.46,
"grad_norm": 1.0827000850655668,
"learning_rate": 9.481009327379714e-06,
"loss": 0.9505,
"step": 440
},
{
"epoch": 0.47,
"grad_norm": 1.0958366892000795,
"learning_rate": 9.46853494630557e-06,
"loss": 0.9536,
"step": 445
},
{
"epoch": 0.48,
"grad_norm": 1.0431220913897328,
"learning_rate": 9.455920831806917e-06,
"loss": 0.942,
"step": 450
},
{
"epoch": 0.48,
"grad_norm": 1.1372655798293543,
"learning_rate": 9.443167378333711e-06,
"loss": 0.9447,
"step": 455
},
{
"epoch": 0.49,
"grad_norm": 1.0890187843066097,
"learning_rate": 9.43027498469311e-06,
"loss": 0.9291,
"step": 460
},
{
"epoch": 0.49,
"grad_norm": 1.128255566030822,
"learning_rate": 9.41724405403701e-06,
"loss": 0.9418,
"step": 465
},
{
"epoch": 0.5,
"grad_norm": 1.0200134644324146,
"learning_rate": 9.404074993849421e-06,
"loss": 0.927,
"step": 470
},
{
"epoch": 0.5,
"grad_norm": 1.0912622433950008,
"learning_rate": 9.390768215933746e-06,
"loss": 0.943,
"step": 475
},
{
"epoch": 0.51,
"grad_norm": 1.1784430852167105,
"learning_rate": 9.377324136399887e-06,
"loss": 0.9409,
"step": 480
},
{
"epoch": 0.51,
"grad_norm": 1.0732445497397998,
"learning_rate": 9.36374317565124e-06,
"loss": 0.9401,
"step": 485
},
{
"epoch": 0.52,
"grad_norm": 1.1241973380928443,
"learning_rate": 9.350025758371554e-06,
"loss": 0.9188,
"step": 490
},
{
"epoch": 0.52,
"grad_norm": 1.0680249447424572,
"learning_rate": 9.336172313511636e-06,
"loss": 0.9304,
"step": 495
},
{
"epoch": 0.53,
"grad_norm": 1.0400938648362148,
"learning_rate": 9.322183274275954e-06,
"loss": 0.9465,
"step": 500
},
{
"epoch": 0.53,
"grad_norm": 1.1484166178621282,
"learning_rate": 9.308059078109078e-06,
"loss": 0.9431,
"step": 505
},
{
"epoch": 0.54,
"grad_norm": 1.0928763685485705,
"learning_rate": 9.29380016668201e-06,
"loss": 0.9368,
"step": 510
},
{
"epoch": 0.54,
"grad_norm": 1.0470334802413224,
"learning_rate": 9.279406985878367e-06,
"loss": 0.9529,
"step": 515
},
{
"epoch": 0.55,
"grad_norm": 1.055693577627048,
"learning_rate": 9.264879985780436e-06,
"loss": 0.9237,
"step": 520
},
{
"epoch": 0.55,
"grad_norm": 1.0582407523485609,
"learning_rate": 9.250219620655112e-06,
"loss": 0.9455,
"step": 525
},
{
"epoch": 0.56,
"grad_norm": 1.0392740863841614,
"learning_rate": 9.235426348939674e-06,
"loss": 0.9866,
"step": 530
},
{
"epoch": 0.57,
"grad_norm": 1.087021743413759,
"learning_rate": 9.220500633227467e-06,
"loss": 0.9797,
"step": 535
},
{
"epoch": 0.57,
"grad_norm": 1.0905659766649087,
"learning_rate": 9.205442940253426e-06,
"loss": 0.9231,
"step": 540
},
{
"epoch": 0.58,
"grad_norm": 1.0838061353931883,
"learning_rate": 9.190253740879484e-06,
"loss": 0.9155,
"step": 545
},
{
"epoch": 0.58,
"grad_norm": 1.1721559515157844,
"learning_rate": 9.174933510079847e-06,
"loss": 0.9132,
"step": 550
},
{
"epoch": 0.59,
"grad_norm": 1.0711291424853389,
"learning_rate": 9.159482726926147e-06,
"loss": 0.9368,
"step": 555
},
{
"epoch": 0.59,
"grad_norm": 1.0906836737125443,
"learning_rate": 9.14390187457245e-06,
"loss": 0.9652,
"step": 560
},
{
"epoch": 0.6,
"grad_norm": 1.2147816750505283,
"learning_rate": 9.128191440240159e-06,
"loss": 0.922,
"step": 565
},
{
"epoch": 0.6,
"grad_norm": 1.0745698856829782,
"learning_rate": 9.11235191520277e-06,
"loss": 0.9267,
"step": 570
},
{
"epoch": 0.61,
"grad_norm": 1.1107563079565528,
"learning_rate": 9.096383794770513e-06,
"loss": 0.9403,
"step": 575
},
{
"epoch": 0.61,
"grad_norm": 1.0645734678937102,
"learning_rate": 9.080287578274866e-06,
"loss": 0.9149,
"step": 580
},
{
"epoch": 0.62,
"grad_norm": 1.1729380707889032,
"learning_rate": 9.064063769052933e-06,
"loss": 0.9236,
"step": 585
},
{
"epoch": 0.62,
"grad_norm": 1.0634029251400858,
"learning_rate": 9.047712874431716e-06,
"loss": 0.9264,
"step": 590
},
{
"epoch": 0.63,
"grad_norm": 1.185148731024843,
"learning_rate": 9.031235405712239e-06,
"loss": 0.9632,
"step": 595
},
{
"epoch": 0.63,
"grad_norm": 1.1238661801404854,
"learning_rate": 9.014631878153564e-06,
"loss": 0.9364,
"step": 600
},
{
"epoch": 0.64,
"grad_norm": 1.1101591200426506,
"learning_rate": 8.997902810956682e-06,
"loss": 0.9121,
"step": 605
},
{
"epoch": 0.64,
"grad_norm": 1.1328306862765927,
"learning_rate": 8.98104872724827e-06,
"loss": 0.9637,
"step": 610
},
{
"epoch": 0.65,
"grad_norm": 1.1182389860600772,
"learning_rate": 8.964070154064343e-06,
"loss": 0.9431,
"step": 615
},
{
"epoch": 0.66,
"grad_norm": 1.2315329373588069,
"learning_rate": 8.94696762233376e-06,
"loss": 0.9261,
"step": 620
},
{
"epoch": 0.66,
"grad_norm": 1.0785263989248792,
"learning_rate": 8.92974166686163e-06,
"loss": 0.9218,
"step": 625
},
{
"epoch": 0.67,
"grad_norm": 1.0293877329539916,
"learning_rate": 8.912392826312595e-06,
"loss": 0.9516,
"step": 630
},
{
"epoch": 0.67,
"grad_norm": 1.0797961930582287,
"learning_rate": 8.894921643193966e-06,
"loss": 0.94,
"step": 635
},
{
"epoch": 0.68,
"grad_norm": 1.0052477432214972,
"learning_rate": 8.877328663838776e-06,
"loss": 0.9207,
"step": 640
},
{
"epoch": 0.68,
"grad_norm": 1.0126272743426095,
"learning_rate": 8.85961443838869e-06,
"loss": 0.9292,
"step": 645
},
{
"epoch": 0.69,
"grad_norm": 1.0166858946265631,
"learning_rate": 8.841779520776803e-06,
"loss": 0.9171,
"step": 650
},
{
"epoch": 0.69,
"grad_norm": 1.0674058891203713,
"learning_rate": 8.823824468710312e-06,
"loss": 0.9238,
"step": 655
},
{
"epoch": 0.7,
"grad_norm": 1.0826543746678357,
"learning_rate": 8.805749843653086e-06,
"loss": 0.8903,
"step": 660
},
{
"epoch": 0.7,
"grad_norm": 1.0474293060948185,
"learning_rate": 8.787556210808101e-06,
"loss": 0.8952,
"step": 665
},
{
"epoch": 0.71,
"grad_norm": 1.1092322508696293,
"learning_rate": 8.769244139099774e-06,
"loss": 0.9191,
"step": 670
},
{
"epoch": 0.71,
"grad_norm": 1.0453618423472522,
"learning_rate": 8.750814201156157e-06,
"loss": 0.9287,
"step": 675
},
{
"epoch": 0.72,
"grad_norm": 1.0150902528617922,
"learning_rate": 8.732266973291053e-06,
"loss": 0.9005,
"step": 680
},
{
"epoch": 0.72,
"grad_norm": 1.111573072134849,
"learning_rate": 8.713603035485972e-06,
"loss": 0.9061,
"step": 685
},
{
"epoch": 0.73,
"grad_norm": 1.0266552996471214,
"learning_rate": 8.694822971372012e-06,
"loss": 0.8981,
"step": 690
},
{
"epoch": 0.73,
"grad_norm": 1.026959416886306,
"learning_rate": 8.675927368211599e-06,
"loss": 0.9119,
"step": 695
},
{
"epoch": 0.74,
"grad_norm": 0.990879098356618,
"learning_rate": 8.656916816880122e-06,
"loss": 0.934,
"step": 700
},
{
"epoch": 0.75,
"grad_norm": 1.016936193517629,
"learning_rate": 8.637791911847462e-06,
"loss": 0.9031,
"step": 705
},
{
"epoch": 0.75,
"grad_norm": 1.0105346034407392,
"learning_rate": 8.618553251159405e-06,
"loss": 0.8918,
"step": 710
},
{
"epoch": 0.76,
"grad_norm": 1.0219526658502593,
"learning_rate": 8.599201436418927e-06,
"loss": 0.9202,
"step": 715
},
{
"epoch": 0.76,
"grad_norm": 1.0611008297726183,
"learning_rate": 8.579737072767396e-06,
"loss": 0.8956,
"step": 720
},
{
"epoch": 0.77,
"grad_norm": 1.0532525094762688,
"learning_rate": 8.560160768865642e-06,
"loss": 0.8782,
"step": 725
},
{
"epoch": 0.77,
"grad_norm": 1.0472370063073,
"learning_rate": 8.540473136874926e-06,
"loss": 0.9215,
"step": 730
},
{
"epoch": 0.78,
"grad_norm": 1.0503901600633805,
"learning_rate": 8.520674792437793e-06,
"loss": 0.905,
"step": 735
},
{
"epoch": 0.78,
"grad_norm": 1.0699401745712223,
"learning_rate": 8.50076635465883e-06,
"loss": 0.8914,
"step": 740
},
{
"epoch": 0.79,
"grad_norm": 1.1604934245734189,
"learning_rate": 8.480748446085293e-06,
"loss": 0.923,
"step": 745
},
{
"epoch": 0.79,
"grad_norm": 1.0575469862405844,
"learning_rate": 8.460621692687656e-06,
"loss": 0.91,
"step": 750
},
{
"epoch": 0.8,
"grad_norm": 1.1861862918344839,
"learning_rate": 8.44038672384002e-06,
"loss": 0.9183,
"step": 755
},
{
"epoch": 0.8,
"grad_norm": 1.0866238920331526,
"learning_rate": 8.420044172300443e-06,
"loss": 0.9012,
"step": 760
},
{
"epoch": 0.81,
"grad_norm": 1.0963030089254635,
"learning_rate": 8.399594674191147e-06,
"loss": 0.8867,
"step": 765
},
{
"epoch": 0.81,
"grad_norm": 1.0516263694748806,
"learning_rate": 8.379038868978635e-06,
"loss": 0.9204,
"step": 770
},
{
"epoch": 0.82,
"grad_norm": 1.0602404388082067,
"learning_rate": 8.358377399453684e-06,
"loss": 0.8975,
"step": 775
},
{
"epoch": 0.82,
"grad_norm": 1.0524212623827451,
"learning_rate": 8.337610911711248e-06,
"loss": 0.9182,
"step": 780
},
{
"epoch": 0.83,
"grad_norm": 1.0486851629524967,
"learning_rate": 8.316740055130263e-06,
"loss": 0.8996,
"step": 785
},
{
"epoch": 0.83,
"grad_norm": 1.0382393662171674,
"learning_rate": 8.295765482353326e-06,
"loss": 0.8898,
"step": 790
},
{
"epoch": 0.84,
"grad_norm": 1.0801053233779676,
"learning_rate": 8.274687849266295e-06,
"loss": 0.8942,
"step": 795
},
{
"epoch": 0.85,
"grad_norm": 1.082914632918619,
"learning_rate": 8.253507814977779e-06,
"loss": 0.9335,
"step": 800
},
{
"epoch": 0.85,
"grad_norm": 1.115797305584172,
"learning_rate": 8.232226041798528e-06,
"loss": 0.8733,
"step": 805
},
{
"epoch": 0.86,
"grad_norm": 1.0758274816242523,
"learning_rate": 8.210843195220717e-06,
"loss": 0.9121,
"step": 810
},
{
"epoch": 0.86,
"grad_norm": 0.9966437564306923,
"learning_rate": 8.189359943897137e-06,
"loss": 0.9126,
"step": 815
},
{
"epoch": 0.87,
"grad_norm": 1.1254388184304862,
"learning_rate": 8.167776959620298e-06,
"loss": 0.9113,
"step": 820
},
{
"epoch": 0.87,
"grad_norm": 1.033615919920944,
"learning_rate": 8.1460949173014e-06,
"loss": 0.8863,
"step": 825
},
{
"epoch": 0.88,
"grad_norm": 1.0126421627367477,
"learning_rate": 8.124314494949247e-06,
"loss": 0.9044,
"step": 830
},
{
"epoch": 0.88,
"grad_norm": 1.0545539629522227,
"learning_rate": 8.102436373649029e-06,
"loss": 0.8942,
"step": 835
},
{
"epoch": 0.89,
"grad_norm": 1.004956283976033,
"learning_rate": 8.080461237541049e-06,
"loss": 0.9255,
"step": 840
},
{
"epoch": 0.89,
"grad_norm": 1.0862660155528163,
"learning_rate": 8.0583897737993e-06,
"loss": 0.9275,
"step": 845
},
{
"epoch": 0.9,
"grad_norm": 1.0697124134441602,
"learning_rate": 8.036222672609994e-06,
"loss": 0.9161,
"step": 850
},
{
"epoch": 0.9,
"grad_norm": 1.0639070724236763,
"learning_rate": 8.013960627149981e-06,
"loss": 0.8874,
"step": 855
},
{
"epoch": 0.91,
"grad_norm": 1.166900094582672,
"learning_rate": 7.991604333565062e-06,
"loss": 0.8897,
"step": 860
},
{
"epoch": 0.91,
"grad_norm": 1.1335592965754175,
"learning_rate": 7.969154490948225e-06,
"loss": 0.8964,
"step": 865
},
{
"epoch": 0.92,
"grad_norm": 1.0520381511921073,
"learning_rate": 7.946611801317794e-06,
"loss": 0.8736,
"step": 870
},
{
"epoch": 0.92,
"grad_norm": 1.16753848747216,
"learning_rate": 7.923976969595459e-06,
"loss": 0.9112,
"step": 875
},
{
"epoch": 0.93,
"grad_norm": 1.0772133099773151,
"learning_rate": 7.901250703584245e-06,
"loss": 0.9155,
"step": 880
},
{
"epoch": 0.94,
"grad_norm": 1.1464686627860388,
"learning_rate": 7.878433713946373e-06,
"loss": 0.8962,
"step": 885
},
{
"epoch": 0.94,
"grad_norm": 1.0835779136854178,
"learning_rate": 7.855526714181041e-06,
"loss": 0.9058,
"step": 890
},
{
"epoch": 0.95,
"grad_norm": 1.171366478493349,
"learning_rate": 7.832530420602113e-06,
"loss": 0.8756,
"step": 895
},
{
"epoch": 0.95,
"grad_norm": 1.040168900901505,
"learning_rate": 7.809445552315714e-06,
"loss": 0.8594,
"step": 900
},
{
"epoch": 0.96,
"grad_norm": 1.02166560480321,
"learning_rate": 7.786272831197745e-06,
"loss": 0.8935,
"step": 905
},
{
"epoch": 0.96,
"grad_norm": 1.1107392454183416,
"learning_rate": 7.763012981871314e-06,
"loss": 0.904,
"step": 910
},
{
"epoch": 0.97,
"grad_norm": 0.9896358057101541,
"learning_rate": 7.739666731684073e-06,
"loss": 0.9068,
"step": 915
},
{
"epoch": 0.97,
"grad_norm": 0.9788741930391702,
"learning_rate": 7.716234810685476e-06,
"loss": 0.8846,
"step": 920
},
{
"epoch": 0.98,
"grad_norm": 0.9931045191442167,
"learning_rate": 7.692717951603942e-06,
"loss": 0.8584,
"step": 925
},
{
"epoch": 0.98,
"grad_norm": 1.0645481368236074,
"learning_rate": 7.669116889823955e-06,
"loss": 0.8992,
"step": 930
},
{
"epoch": 0.99,
"grad_norm": 0.9816731950451545,
"learning_rate": 7.645432363363057e-06,
"loss": 0.8851,
"step": 935
},
{
"epoch": 0.99,
"grad_norm": 0.9899142833993008,
"learning_rate": 7.621665112848776e-06,
"loss": 0.8845,
"step": 940
},
{
"epoch": 1.0,
"grad_norm": 1.0638888300871174,
"learning_rate": 7.597815881495465e-06,
"loss": 0.8773,
"step": 945
},
{
"epoch": 1.0,
"grad_norm": 1.031662431521578,
"learning_rate": 7.573885415081059e-06,
"loss": 0.8258,
"step": 950
},
{
"epoch": 1.01,
"grad_norm": 1.040426497974828,
"learning_rate": 7.54987446192376e-06,
"loss": 0.7907,
"step": 955
},
{
"epoch": 1.01,
"grad_norm": 0.9887566903005512,
"learning_rate": 7.525783772858624e-06,
"loss": 0.8091,
"step": 960
},
{
"epoch": 1.02,
"grad_norm": 1.0542179478307365,
"learning_rate": 7.5016141012141e-06,
"loss": 0.7815,
"step": 965
},
{
"epoch": 1.03,
"grad_norm": 1.0738731959256824,
"learning_rate": 7.477366202788456e-06,
"loss": 0.7734,
"step": 970
},
{
"epoch": 1.03,
"grad_norm": 0.9975806760235982,
"learning_rate": 7.45304083582616e-06,
"loss": 0.7824,
"step": 975
},
{
"epoch": 1.04,
"grad_norm": 1.005274019925314,
"learning_rate": 7.4286387609941544e-06,
"loss": 0.769,
"step": 980
},
{
"epoch": 1.04,
"grad_norm": 1.0937329481520819,
"learning_rate": 7.40416074135808e-06,
"loss": 0.791,
"step": 985
},
{
"epoch": 1.05,
"grad_norm": 0.9987999174071854,
"learning_rate": 7.379607542358414e-06,
"loss": 0.7983,
"step": 990
},
{
"epoch": 1.05,
"grad_norm": 1.074721973505265,
"learning_rate": 7.3549799317865235e-06,
"loss": 0.8264,
"step": 995
},
{
"epoch": 1.06,
"grad_norm": 1.0023766389640552,
"learning_rate": 7.330278679760673e-06,
"loss": 0.8166,
"step": 1000
},
{
"epoch": 1.06,
"grad_norm": 1.0263488491446793,
"learning_rate": 7.3055045587019315e-06,
"loss": 0.7756,
"step": 1005
},
{
"epoch": 1.07,
"grad_norm": 1.222252310199244,
"learning_rate": 7.280658343310016e-06,
"loss": 0.8113,
"step": 1010
},
{
"epoch": 1.07,
"grad_norm": 1.0803171037496995,
"learning_rate": 7.255740810539078e-06,
"loss": 0.7773,
"step": 1015
},
{
"epoch": 1.08,
"grad_norm": 1.0429385720996782,
"learning_rate": 7.230752739573398e-06,
"loss": 0.7959,
"step": 1020
},
{
"epoch": 1.08,
"grad_norm": 1.0525788357504489,
"learning_rate": 7.205694911803019e-06,
"loss": 0.7962,
"step": 1025
},
{
"epoch": 1.09,
"grad_norm": 0.986228023483833,
"learning_rate": 7.18056811079932e-06,
"loss": 0.79,
"step": 1030
},
{
"epoch": 1.09,
"grad_norm": 1.031179895714868,
"learning_rate": 7.155373122290508e-06,
"loss": 0.8101,
"step": 1035
},
{
"epoch": 1.1,
"grad_norm": 1.0379629517770603,
"learning_rate": 7.13011073413705e-06,
"loss": 0.781,
"step": 1040
},
{
"epoch": 1.1,
"grad_norm": 1.033153108919124,
"learning_rate": 7.1047817363070325e-06,
"loss": 0.8418,
"step": 1045
},
{
"epoch": 1.11,
"grad_norm": 1.0357203376239867,
"learning_rate": 7.079386920851466e-06,
"loss": 0.8065,
"step": 1050
},
{
"epoch": 1.11,
"grad_norm": 1.0540192082846203,
"learning_rate": 7.053927081879505e-06,
"loss": 0.7956,
"step": 1055
},
{
"epoch": 1.12,
"grad_norm": 1.0552828635725824,
"learning_rate": 7.0284030155336315e-06,
"loss": 0.7945,
"step": 1060
},
{
"epoch": 1.13,
"grad_norm": 0.9810627289945896,
"learning_rate": 7.002815519964745e-06,
"loss": 0.7965,
"step": 1065
},
{
"epoch": 1.13,
"grad_norm": 1.0916102744452092,
"learning_rate": 6.977165395307215e-06,
"loss": 0.7991,
"step": 1070
},
{
"epoch": 1.14,
"grad_norm": 1.1543690326062077,
"learning_rate": 6.951453443653852e-06,
"loss": 0.7896,
"step": 1075
},
{
"epoch": 1.14,
"grad_norm": 1.1170103600405488,
"learning_rate": 6.9256804690308276e-06,
"loss": 0.7828,
"step": 1080
},
{
"epoch": 1.15,
"grad_norm": 1.0526733296614392,
"learning_rate": 6.899847277372538e-06,
"loss": 0.7923,
"step": 1085
},
{
"epoch": 1.15,
"grad_norm": 1.0770254342023697,
"learning_rate": 6.873954676496395e-06,
"loss": 0.8128,
"step": 1090
},
{
"epoch": 1.16,
"grad_norm": 1.037705594081886,
"learning_rate": 6.848003476077567e-06,
"loss": 0.7856,
"step": 1095
},
{
"epoch": 1.16,
"grad_norm": 1.0319807068181204,
"learning_rate": 6.8219944876236645e-06,
"loss": 0.7949,
"step": 1100
},
{
"epoch": 1.17,
"grad_norm": 1.0927555007584646,
"learning_rate": 6.795928524449354e-06,
"loss": 0.7941,
"step": 1105
},
{
"epoch": 1.17,
"grad_norm": 0.9869897993273156,
"learning_rate": 6.769806401650936e-06,
"loss": 0.7667,
"step": 1110
},
{
"epoch": 1.18,
"grad_norm": 1.0055956062759406,
"learning_rate": 6.743628936080852e-06,
"loss": 0.7855,
"step": 1115
},
{
"epoch": 1.18,
"grad_norm": 1.0283367881989096,
"learning_rate": 6.717396946322137e-06,
"loss": 0.7745,
"step": 1120
},
{
"epoch": 1.19,
"grad_norm": 1.0345829389670045,
"learning_rate": 6.6911112526628295e-06,
"loss": 0.7842,
"step": 1125
},
{
"epoch": 1.19,
"grad_norm": 1.0711135328845822,
"learning_rate": 6.664772677070316e-06,
"loss": 0.7558,
"step": 1130
},
{
"epoch": 1.2,
"grad_norm": 0.9877769296594265,
"learning_rate": 6.638382043165628e-06,
"loss": 0.7788,
"step": 1135
},
{
"epoch": 1.2,
"grad_norm": 1.131836138091609,
"learning_rate": 6.611940176197688e-06,
"loss": 0.7901,
"step": 1140
},
{
"epoch": 1.21,
"grad_norm": 1.058249641590972,
"learning_rate": 6.585447903017506e-06,
"loss": 0.7936,
"step": 1145
},
{
"epoch": 1.22,
"grad_norm": 1.073971008814511,
"learning_rate": 6.558906052052314e-06,
"loss": 0.7835,
"step": 1150
},
{
"epoch": 1.22,
"grad_norm": 1.0491301969369466,
"learning_rate": 6.532315453279673e-06,
"loss": 0.7902,
"step": 1155
},
{
"epoch": 1.23,
"grad_norm": 1.046297097483487,
"learning_rate": 6.505676938201512e-06,
"loss": 0.7767,
"step": 1160
},
{
"epoch": 1.23,
"grad_norm": 1.046022517875942,
"learning_rate": 6.478991339818128e-06,
"loss": 0.8091,
"step": 1165
},
{
"epoch": 1.24,
"grad_norm": 1.0086633248074561,
"learning_rate": 6.4522594926021355e-06,
"loss": 0.7797,
"step": 1170
},
{
"epoch": 1.24,
"grad_norm": 1.0965955454651117,
"learning_rate": 6.425482232472377e-06,
"loss": 0.7702,
"step": 1175
},
{
"epoch": 1.25,
"grad_norm": 1.0362189192150881,
"learning_rate": 6.3986603967677805e-06,
"loss": 0.7931,
"step": 1180
},
{
"epoch": 1.25,
"grad_norm": 1.110468197330772,
"learning_rate": 6.371794824221173e-06,
"loss": 0.7917,
"step": 1185
},
{
"epoch": 1.26,
"grad_norm": 1.0163659020071605,
"learning_rate": 6.344886354933058e-06,
"loss": 0.7886,
"step": 1190
},
{
"epoch": 1.26,
"grad_norm": 1.0115549227695064,
"learning_rate": 6.3179358303453386e-06,
"loss": 0.7511,
"step": 1195
},
{
"epoch": 1.27,
"grad_norm": 1.0872016119161863,
"learning_rate": 6.290944093215016e-06,
"loss": 0.8036,
"step": 1200
},
{
"epoch": 1.27,
"grad_norm": 1.0553500518484338,
"learning_rate": 6.263911987587822e-06,
"loss": 0.7938,
"step": 1205
},
{
"epoch": 1.28,
"grad_norm": 0.993815270148442,
"learning_rate": 6.236840358771837e-06,
"loss": 0.7788,
"step": 1210
},
{
"epoch": 1.28,
"grad_norm": 1.0605675582324252,
"learning_rate": 6.20973005331105e-06,
"loss": 0.7781,
"step": 1215
},
{
"epoch": 1.29,
"grad_norm": 1.0965085071552372,
"learning_rate": 6.1825819189588885e-06,
"loss": 0.7872,
"step": 1220
},
{
"epoch": 1.29,
"grad_norm": 1.040866195350916,
"learning_rate": 6.155396804651714e-06,
"loss": 0.7966,
"step": 1225
},
{
"epoch": 1.3,
"grad_norm": 1.0593376609536802,
"learning_rate": 6.128175560482264e-06,
"loss": 0.7832,
"step": 1230
},
{
"epoch": 1.31,
"grad_norm": 1.0081718313330637,
"learning_rate": 6.1009190376730785e-06,
"loss": 0.7772,
"step": 1235
},
{
"epoch": 1.31,
"grad_norm": 0.9892554397828908,
"learning_rate": 6.07362808854988e-06,
"loss": 0.7856,
"step": 1240
},
{
"epoch": 1.32,
"grad_norm": 1.0515874983049542,
"learning_rate": 6.046303566514919e-06,
"loss": 0.7812,
"step": 1245
},
{
"epoch": 1.32,
"grad_norm": 1.01738547568124,
"learning_rate": 6.018946326020287e-06,
"loss": 0.7824,
"step": 1250
},
{
"epoch": 1.33,
"grad_norm": 0.992994982201507,
"learning_rate": 5.991557222541201e-06,
"loss": 0.7842,
"step": 1255
},
{
"epoch": 1.33,
"grad_norm": 0.9928822859609259,
"learning_rate": 5.964137112549251e-06,
"loss": 0.7906,
"step": 1260
},
{
"epoch": 1.34,
"grad_norm": 1.0673862770846931,
"learning_rate": 5.9366868534856115e-06,
"loss": 0.7896,
"step": 1265
},
{
"epoch": 1.34,
"grad_norm": 1.0627251705995355,
"learning_rate": 5.909207303734241e-06,
"loss": 0.7965,
"step": 1270
},
{
"epoch": 1.35,
"grad_norm": 1.0050051635503012,
"learning_rate": 5.881699322595031e-06,
"loss": 0.7775,
"step": 1275
},
{
"epoch": 1.35,
"grad_norm": 1.0049258262531797,
"learning_rate": 5.854163770256934e-06,
"loss": 0.7659,
"step": 1280
},
{
"epoch": 1.36,
"grad_norm": 1.1097225296353777,
"learning_rate": 5.826601507771073e-06,
"loss": 0.7699,
"step": 1285
},
{
"epoch": 1.36,
"grad_norm": 1.0610730723756006,
"learning_rate": 5.799013397023806e-06,
"loss": 0.7996,
"step": 1290
},
{
"epoch": 1.37,
"grad_norm": 1.0285633823079718,
"learning_rate": 5.771400300709785e-06,
"loss": 0.7829,
"step": 1295
},
{
"epoch": 1.37,
"grad_norm": 1.0484599021027985,
"learning_rate": 5.743763082304973e-06,
"loss": 0.7619,
"step": 1300
},
{
"epoch": 1.38,
"grad_norm": 1.0137701786577156,
"learning_rate": 5.7161026060396375e-06,
"loss": 0.798,
"step": 1305
},
{
"epoch": 1.38,
"grad_norm": 1.0289414598602742,
"learning_rate": 5.688419736871341e-06,
"loss": 0.7827,
"step": 1310
},
{
"epoch": 1.39,
"grad_norm": 1.058376335913828,
"learning_rate": 5.660715340457874e-06,
"loss": 0.7921,
"step": 1315
},
{
"epoch": 1.39,
"grad_norm": 1.0011219088912342,
"learning_rate": 5.632990283130204e-06,
"loss": 0.781,
"step": 1320
},
{
"epoch": 1.4,
"grad_norm": 0.984264955084216,
"learning_rate": 5.605245431865368e-06,
"loss": 0.7772,
"step": 1325
},
{
"epoch": 1.41,
"grad_norm": 1.0151072044919451,
"learning_rate": 5.577481654259377e-06,
"loss": 0.7735,
"step": 1330
},
{
"epoch": 1.41,
"grad_norm": 1.063533843295668,
"learning_rate": 5.549699818500074e-06,
"loss": 0.7682,
"step": 1335
},
{
"epoch": 1.42,
"grad_norm": 1.0434635789190496,
"learning_rate": 5.521900793339989e-06,
"loss": 0.7915,
"step": 1340
},
{
"epoch": 1.42,
"grad_norm": 1.0587561050751115,
"learning_rate": 5.494085448069181e-06,
"loss": 0.7997,
"step": 1345
},
{
"epoch": 1.43,
"grad_norm": 1.0758864296233028,
"learning_rate": 5.466254652488036e-06,
"loss": 0.7964,
"step": 1350
},
{
"epoch": 1.43,
"grad_norm": 1.0556751372323996,
"learning_rate": 5.438409276880089e-06,
"loss": 0.8062,
"step": 1355
},
{
"epoch": 1.44,
"grad_norm": 0.9792859835280993,
"learning_rate": 5.410550191984798e-06,
"loss": 0.787,
"step": 1360
},
{
"epoch": 1.44,
"grad_norm": 1.0231438624972786,
"learning_rate": 5.3826782689703115e-06,
"loss": 0.7803,
"step": 1365
},
{
"epoch": 1.45,
"grad_norm": 1.0660534726358564,
"learning_rate": 5.354794379406242e-06,
"loss": 0.78,
"step": 1370
},
{
"epoch": 1.45,
"grad_norm": 0.9527414539128428,
"learning_rate": 5.3268993952363936e-06,
"loss": 0.796,
"step": 1375
},
{
"epoch": 1.46,
"grad_norm": 0.9870931434726852,
"learning_rate": 5.29899418875151e-06,
"loss": 0.7652,
"step": 1380
},
{
"epoch": 1.46,
"grad_norm": 1.0537299945885146,
"learning_rate": 5.271079632561992e-06,
"loss": 0.7854,
"step": 1385
},
{
"epoch": 1.47,
"grad_norm": 1.1396368040574916,
"learning_rate": 5.243156599570606e-06,
"loss": 0.7617,
"step": 1390
},
{
"epoch": 1.47,
"grad_norm": 1.0924704024745873,
"learning_rate": 5.2152259629451986e-06,
"loss": 0.7713,
"step": 1395
},
{
"epoch": 1.48,
"grad_norm": 1.021493417245078,
"learning_rate": 5.18728859609138e-06,
"loss": 0.7609,
"step": 1400
},
{
"epoch": 1.48,
"grad_norm": 1.0148194958691719,
"learning_rate": 5.159345372625223e-06,
"loss": 0.7788,
"step": 1405
},
{
"epoch": 1.49,
"grad_norm": 1.0402765811164951,
"learning_rate": 5.131397166345938e-06,
"loss": 0.7599,
"step": 1410
},
{
"epoch": 1.5,
"grad_norm": 0.9966250584272072,
"learning_rate": 5.103444851208549e-06,
"loss": 0.7874,
"step": 1415
},
{
"epoch": 1.5,
"grad_norm": 0.9871275158697829,
"learning_rate": 5.075489301296567e-06,
"loss": 0.7566,
"step": 1420
},
{
"epoch": 1.51,
"grad_norm": 1.0896451679213162,
"learning_rate": 5.047531390794661e-06,
"loss": 0.7699,
"step": 1425
},
{
"epoch": 1.51,
"grad_norm": 1.1203863877988638,
"learning_rate": 5.019571993961307e-06,
"loss": 0.8088,
"step": 1430
},
{
"epoch": 1.52,
"grad_norm": 1.03311513179617,
"learning_rate": 4.9916119851014664e-06,
"loss": 0.7739,
"step": 1435
},
{
"epoch": 1.52,
"grad_norm": 1.0389351009988612,
"learning_rate": 4.96365223853924e-06,
"loss": 0.7816,
"step": 1440
},
{
"epoch": 1.53,
"grad_norm": 0.9960641498632878,
"learning_rate": 4.93569362859052e-06,
"loss": 0.775,
"step": 1445
},
{
"epoch": 1.53,
"grad_norm": 0.9388823495229471,
"learning_rate": 4.907737029535664e-06,
"loss": 0.756,
"step": 1450
},
{
"epoch": 1.54,
"grad_norm": 1.0662538022442485,
"learning_rate": 4.8797833155921396e-06,
"loss": 0.7992,
"step": 1455
},
{
"epoch": 1.54,
"grad_norm": 1.0350212904727674,
"learning_rate": 4.8518333608872015e-06,
"loss": 0.7595,
"step": 1460
},
{
"epoch": 1.55,
"grad_norm": 0.9967538128228846,
"learning_rate": 4.823888039430551e-06,
"loss": 0.7582,
"step": 1465
},
{
"epoch": 1.55,
"grad_norm": 1.0139079612075497,
"learning_rate": 4.795948225087001e-06,
"loss": 0.7709,
"step": 1470
},
{
"epoch": 1.56,
"grad_norm": 1.0510044388149635,
"learning_rate": 4.7680147915491585e-06,
"loss": 0.7692,
"step": 1475
},
{
"epoch": 1.56,
"grad_norm": 1.0641353890612333,
"learning_rate": 4.740088612310096e-06,
"loss": 0.7847,
"step": 1480
},
{
"epoch": 1.57,
"grad_norm": 1.0192435995305715,
"learning_rate": 4.7121705606360424e-06,
"loss": 0.7732,
"step": 1485
},
{
"epoch": 1.57,
"grad_norm": 1.0076325415256413,
"learning_rate": 4.684261509539072e-06,
"loss": 0.7701,
"step": 1490
},
{
"epoch": 1.58,
"grad_norm": 0.9707102286396411,
"learning_rate": 4.65636233174981e-06,
"loss": 0.77,
"step": 1495
},
{
"epoch": 1.59,
"grad_norm": 1.0835636202474823,
"learning_rate": 4.628473899690133e-06,
"loss": 0.7849,
"step": 1500
},
{
"epoch": 1.59,
"grad_norm": 1.0157410126136626,
"learning_rate": 4.600597085445894e-06,
"loss": 0.784,
"step": 1505
},
{
"epoch": 1.6,
"grad_norm": 1.0616186913926178,
"learning_rate": 4.572732760739653e-06,
"loss": 0.7785,
"step": 1510
},
{
"epoch": 1.6,
"grad_norm": 1.006516145178769,
"learning_rate": 4.5448817969034165e-06,
"loss": 0.7753,
"step": 1515
},
{
"epoch": 1.61,
"grad_norm": 1.0480529823653495,
"learning_rate": 4.517045064851386e-06,
"loss": 0.7989,
"step": 1520
},
{
"epoch": 1.61,
"grad_norm": 1.0432567441250045,
"learning_rate": 4.489223435052732e-06,
"loss": 0.7946,
"step": 1525
},
{
"epoch": 1.62,
"grad_norm": 1.0461342178531015,
"learning_rate": 4.461417777504363e-06,
"loss": 0.7676,
"step": 1530
},
{
"epoch": 1.62,
"grad_norm": 1.0045382622138492,
"learning_rate": 4.433628961703733e-06,
"loss": 0.7651,
"step": 1535
},
{
"epoch": 1.63,
"grad_norm": 0.9890094489435823,
"learning_rate": 4.405857856621644e-06,
"loss": 0.7943,
"step": 1540
},
{
"epoch": 1.63,
"grad_norm": 1.0127639919495397,
"learning_rate": 4.378105330675074e-06,
"loss": 0.7895,
"step": 1545
},
{
"epoch": 1.64,
"grad_norm": 1.0398544121817734,
"learning_rate": 4.350372251700025e-06,
"loss": 0.8004,
"step": 1550
},
{
"epoch": 1.64,
"grad_norm": 1.037857459368961,
"learning_rate": 4.322659486924373e-06,
"loss": 0.7963,
"step": 1555
},
{
"epoch": 1.65,
"grad_norm": 1.106103919813531,
"learning_rate": 4.294967902940768e-06,
"loss": 0.787,
"step": 1560
},
{
"epoch": 1.65,
"grad_norm": 1.0865617469424886,
"learning_rate": 4.267298365679522e-06,
"loss": 0.788,
"step": 1565
},
{
"epoch": 1.66,
"grad_norm": 1.0303226290700802,
"learning_rate": 4.239651740381534e-06,
"loss": 0.7642,
"step": 1570
},
{
"epoch": 1.66,
"grad_norm": 1.0512505166055992,
"learning_rate": 4.212028891571237e-06,
"loss": 0.7832,
"step": 1575
},
{
"epoch": 1.67,
"grad_norm": 1.0750316874597787,
"learning_rate": 4.184430683029552e-06,
"loss": 0.7599,
"step": 1580
},
{
"epoch": 1.68,
"grad_norm": 1.0622608820174235,
"learning_rate": 4.156857977766896e-06,
"loss": 0.7841,
"step": 1585
},
{
"epoch": 1.68,
"grad_norm": 1.0023528643121005,
"learning_rate": 4.129311637996182e-06,
"loss": 0.7845,
"step": 1590
},
{
"epoch": 1.69,
"grad_norm": 1.0597451506484419,
"learning_rate": 4.101792525105857e-06,
"loss": 0.7802,
"step": 1595
},
{
"epoch": 1.69,
"grad_norm": 0.9622973096022323,
"learning_rate": 4.0743014996329764e-06,
"loss": 0.7678,
"step": 1600
},
{
"epoch": 1.7,
"grad_norm": 1.051095411122212,
"learning_rate": 4.046839421236276e-06,
"loss": 0.7972,
"step": 1605
},
{
"epoch": 1.7,
"grad_norm": 1.0082128589578265,
"learning_rate": 4.019407148669312e-06,
"loss": 0.7948,
"step": 1610
},
{
"epoch": 1.71,
"grad_norm": 1.0901759578931909,
"learning_rate": 3.992005539753592e-06,
"loss": 0.7914,
"step": 1615
},
{
"epoch": 1.71,
"grad_norm": 1.0584302499373435,
"learning_rate": 3.964635451351758e-06,
"loss": 0.7821,
"step": 1620
},
{
"epoch": 1.72,
"grad_norm": 1.043189384648134,
"learning_rate": 3.937297739340783e-06,
"loss": 0.778,
"step": 1625
},
{
"epoch": 1.72,
"grad_norm": 1.0245392793145456,
"learning_rate": 3.909993258585219e-06,
"loss": 0.7908,
"step": 1630
},
{
"epoch": 1.73,
"grad_norm": 1.0082519645854728,
"learning_rate": 3.882722862910458e-06,
"loss": 0.7793,
"step": 1635
},
{
"epoch": 1.73,
"grad_norm": 1.0211341337802105,
"learning_rate": 3.8554874050760345e-06,
"loss": 0.8042,
"step": 1640
},
{
"epoch": 1.74,
"grad_norm": 0.9920127978660441,
"learning_rate": 3.828287736748957e-06,
"loss": 0.758,
"step": 1645
},
{
"epoch": 1.74,
"grad_norm": 1.0187229111502758,
"learning_rate": 3.8011247084770754e-06,
"loss": 0.7986,
"step": 1650
},
{
"epoch": 1.75,
"grad_norm": 0.9982295207578855,
"learning_rate": 3.773999169662489e-06,
"loss": 0.7623,
"step": 1655
},
{
"epoch": 1.75,
"grad_norm": 1.025180441312379,
"learning_rate": 3.746911968534982e-06,
"loss": 0.7454,
"step": 1660
},
{
"epoch": 1.76,
"grad_norm": 0.9884338430346545,
"learning_rate": 3.7198639521254988e-06,
"loss": 0.7671,
"step": 1665
},
{
"epoch": 1.76,
"grad_norm": 0.9685352318412103,
"learning_rate": 3.6928559662396574e-06,
"loss": 0.7583,
"step": 1670
},
{
"epoch": 1.77,
"grad_norm": 1.029404957630594,
"learning_rate": 3.6658888554312967e-06,
"loss": 0.7868,
"step": 1675
},
{
"epoch": 1.78,
"grad_norm": 0.9921023940146521,
"learning_rate": 3.6389634629760763e-06,
"loss": 0.7555,
"step": 1680
},
{
"epoch": 1.78,
"grad_norm": 1.017350986680598,
"learning_rate": 3.612080630845096e-06,
"loss": 0.7905,
"step": 1685
},
{
"epoch": 1.79,
"grad_norm": 1.0430603602540587,
"learning_rate": 3.5852411996785776e-06,
"loss": 0.7947,
"step": 1690
},
{
"epoch": 1.79,
"grad_norm": 0.9737056004061376,
"learning_rate": 3.558446008759569e-06,
"loss": 0.7789,
"step": 1695
},
{
"epoch": 1.8,
"grad_norm": 1.0212119960635129,
"learning_rate": 3.5316958959876985e-06,
"loss": 0.7671,
"step": 1700
},
{
"epoch": 1.8,
"grad_norm": 1.0072141418910243,
"learning_rate": 3.504991697852983e-06,
"loss": 0.7844,
"step": 1705
},
{
"epoch": 1.81,
"grad_norm": 1.059809521658242,
"learning_rate": 3.4783342494096627e-06,
"loss": 0.7845,
"step": 1710
},
{
"epoch": 1.81,
"grad_norm": 1.032182317108509,
"learning_rate": 3.451724384250091e-06,
"loss": 0.7792,
"step": 1715
},
{
"epoch": 1.82,
"grad_norm": 0.9779053888998924,
"learning_rate": 3.4251629344786675e-06,
"loss": 0.7591,
"step": 1720
},
{
"epoch": 1.82,
"grad_norm": 1.0116163318504925,
"learning_rate": 3.398650730685813e-06,
"loss": 0.7556,
"step": 1725
},
{
"epoch": 1.83,
"grad_norm": 1.0511489470052602,
"learning_rate": 3.372188601922006e-06,
"loss": 0.7637,
"step": 1730
},
{
"epoch": 1.83,
"grad_norm": 1.0172930500825146,
"learning_rate": 3.3457773756718513e-06,
"loss": 0.7696,
"step": 1735
},
{
"epoch": 1.84,
"grad_norm": 1.039493994412079,
"learning_rate": 3.3194178778282046e-06,
"loss": 0.7931,
"step": 1740
},
{
"epoch": 1.84,
"grad_norm": 1.033662637919394,
"learning_rate": 3.293110932666349e-06,
"loss": 0.7692,
"step": 1745
},
{
"epoch": 1.85,
"grad_norm": 1.0584694868797393,
"learning_rate": 3.2668573628182145e-06,
"loss": 0.7792,
"step": 1750
},
{
"epoch": 1.85,
"grad_norm": 0.994626270021195,
"learning_rate": 3.2406579892466582e-06,
"loss": 0.7682,
"step": 1755
},
{
"epoch": 1.86,
"grad_norm": 0.9270237802993908,
"learning_rate": 3.2145136312197943e-06,
"loss": 0.7552,
"step": 1760
},
{
"epoch": 1.87,
"grad_norm": 2.0595234604236357,
"learning_rate": 3.18842510628537e-06,
"loss": 0.7749,
"step": 1765
},
{
"epoch": 1.87,
"grad_norm": 1.0396319816767299,
"learning_rate": 3.162393230245203e-06,
"loss": 0.804,
"step": 1770
},
{
"epoch": 1.88,
"grad_norm": 1.0214462086054552,
"learning_rate": 3.1364188171296677e-06,
"loss": 0.7744,
"step": 1775
},
{
"epoch": 1.88,
"grad_norm": 1.0145502545771508,
"learning_rate": 3.110502679172246e-06,
"loss": 0.7824,
"step": 1780
},
{
"epoch": 1.89,
"grad_norm": 1.0196641711891408,
"learning_rate": 3.084645626784124e-06,
"loss": 0.7745,
"step": 1785
},
{
"epoch": 1.89,
"grad_norm": 1.0197064636159427,
"learning_rate": 3.058848468528852e-06,
"loss": 0.8031,
"step": 1790
},
{
"epoch": 1.9,
"grad_norm": 0.9907125667454302,
"learning_rate": 3.03311201109706e-06,
"loss": 0.7919,
"step": 1795
},
{
"epoch": 1.9,
"grad_norm": 1.017942513059757,
"learning_rate": 3.0074370592812286e-06,
"loss": 0.7907,
"step": 1800
},
{
"epoch": 1.91,
"grad_norm": 1.0821499695866912,
"learning_rate": 2.9818244159505265e-06,
"loss": 0.7901,
"step": 1805
},
{
"epoch": 1.91,
"grad_norm": 0.9934394662674368,
"learning_rate": 2.956274882025706e-06,
"loss": 0.7638,
"step": 1810
},
{
"epoch": 1.92,
"grad_norm": 1.0313411208961847,
"learning_rate": 2.930789256454052e-06,
"loss": 0.7553,
"step": 1815
},
{
"epoch": 1.92,
"grad_norm": 0.9950833531614097,
"learning_rate": 2.905368336184406e-06,
"loss": 0.7576,
"step": 1820
},
{
"epoch": 1.93,
"grad_norm": 0.9936896686220547,
"learning_rate": 2.8800129161422365e-06,
"loss": 0.7671,
"step": 1825
},
{
"epoch": 1.93,
"grad_norm": 0.9909860465997411,
"learning_rate": 2.8547237892047852e-06,
"loss": 0.74,
"step": 1830
},
{
"epoch": 1.94,
"grad_norm": 0.9788752840880554,
"learning_rate": 2.8295017461762806e-06,
"loss": 0.767,
"step": 1835
},
{
"epoch": 1.94,
"grad_norm": 0.9764110020200104,
"learning_rate": 2.804347575763193e-06,
"loss": 0.7668,
"step": 1840
},
{
"epoch": 1.95,
"grad_norm": 0.9772254707929505,
"learning_rate": 2.7792620645495917e-06,
"loss": 0.7425,
"step": 1845
},
{
"epoch": 1.96,
"grad_norm": 1.0000854462976456,
"learning_rate": 2.7542459969725215e-06,
"loss": 0.7466,
"step": 1850
},
{
"epoch": 1.96,
"grad_norm": 1.0352323998365711,
"learning_rate": 2.729300155297504e-06,
"loss": 0.771,
"step": 1855
},
{
"epoch": 1.97,
"grad_norm": 0.9811051893834364,
"learning_rate": 2.704425319594049e-06,
"loss": 0.7778,
"step": 1860
},
{
"epoch": 1.97,
"grad_norm": 1.0284677234046133,
"learning_rate": 2.6796222677112825e-06,
"loss": 0.7796,
"step": 1865
},
{
"epoch": 1.98,
"grad_norm": 0.9664217044137716,
"learning_rate": 2.6548917752535997e-06,
"loss": 0.771,
"step": 1870
},
{
"epoch": 1.98,
"grad_norm": 1.0008524753186703,
"learning_rate": 2.6302346155564385e-06,
"loss": 0.7963,
"step": 1875
},
{
"epoch": 1.99,
"grad_norm": 1.0088045948631796,
"learning_rate": 2.6056515596620715e-06,
"loss": 0.7571,
"step": 1880
},
{
"epoch": 1.99,
"grad_norm": 0.9727997698934588,
"learning_rate": 2.581143376295516e-06,
"loss": 0.7968,
"step": 1885
},
{
"epoch": 2.0,
"grad_norm": 0.9760428822299934,
"learning_rate": 2.556710831840481e-06,
"loss": 0.7829,
"step": 1890
},
{
"epoch": 2.0,
"grad_norm": 1.1893585643467264,
"learning_rate": 2.5323546903154074e-06,
"loss": 0.7363,
"step": 1895
},
{
"epoch": 2.01,
"grad_norm": 1.0408498899558132,
"learning_rate": 2.508075713349575e-06,
"loss": 0.683,
"step": 1900
},
{
"epoch": 2.01,
"grad_norm": 1.0852218097728863,
"learning_rate": 2.483874660159294e-06,
"loss": 0.6388,
"step": 1905
},
{
"epoch": 2.02,
"grad_norm": 1.0636193658435114,
"learning_rate": 2.45975228752415e-06,
"loss": 0.6785,
"step": 1910
},
{
"epoch": 2.02,
"grad_norm": 1.05164052954354,
"learning_rate": 2.435709349763354e-06,
"loss": 0.7024,
"step": 1915
},
{
"epoch": 2.03,
"grad_norm": 1.0744751292672923,
"learning_rate": 2.4117465987121357e-06,
"loss": 0.6714,
"step": 1920
},
{
"epoch": 2.03,
"grad_norm": 1.0221167769747221,
"learning_rate": 2.387864783698258e-06,
"loss": 0.6441,
"step": 1925
},
{
"epoch": 2.04,
"grad_norm": 1.0453109653021675,
"learning_rate": 2.3640646515185596e-06,
"loss": 0.6668,
"step": 1930
},
{
"epoch": 2.04,
"grad_norm": 1.0035196656143317,
"learning_rate": 2.3403469464156235e-06,
"loss": 0.6711,
"step": 1935
},
{
"epoch": 2.05,
"grad_norm": 1.0614923887712562,
"learning_rate": 2.31671241005449e-06,
"loss": 0.6801,
"step": 1940
},
{
"epoch": 2.06,
"grad_norm": 1.0457688195463548,
"learning_rate": 2.2931617814994704e-06,
"loss": 0.6676,
"step": 1945
},
{
"epoch": 2.06,
"grad_norm": 1.094973586743587,
"learning_rate": 2.269695797191032e-06,
"loss": 0.6467,
"step": 1950
},
{
"epoch": 2.07,
"grad_norm": 1.0312304548353073,
"learning_rate": 2.2463151909227804e-06,
"loss": 0.6626,
"step": 1955
},
{
"epoch": 2.07,
"grad_norm": 1.0435526510546405,
"learning_rate": 2.223020693818495e-06,
"loss": 0.6565,
"step": 1960
},
{
"epoch": 2.08,
"grad_norm": 1.0361388218534178,
"learning_rate": 2.1998130343092866e-06,
"loss": 0.655,
"step": 1965
},
{
"epoch": 2.08,
"grad_norm": 1.071971382261616,
"learning_rate": 2.176692938110801e-06,
"loss": 0.6628,
"step": 1970
},
{
"epoch": 2.09,
"grad_norm": 1.0449189624346316,
"learning_rate": 2.1536611282005374e-06,
"loss": 0.6742,
"step": 1975
},
{
"epoch": 2.09,
"grad_norm": 1.0076278447431801,
"learning_rate": 2.130718324795234e-06,
"loss": 0.6615,
"step": 1980
},
{
"epoch": 2.1,
"grad_norm": 1.044357139317297,
"learning_rate": 2.107865245328354e-06,
"loss": 0.6707,
"step": 1985
},
{
"epoch": 2.1,
"grad_norm": 1.0155250644507565,
"learning_rate": 2.0851026044276405e-06,
"loss": 0.6701,
"step": 1990
},
{
"epoch": 2.11,
"grad_norm": 1.012020172763002,
"learning_rate": 2.0624311138927795e-06,
"loss": 0.6531,
"step": 1995
},
{
"epoch": 2.11,
"grad_norm": 1.0209851165233697,
"learning_rate": 2.0398514826731326e-06,
"loss": 0.6685,
"step": 2000
},
{
"epoch": 2.12,
"grad_norm": 1.0147123852944229,
"learning_rate": 2.017364416845579e-06,
"loss": 0.6506,
"step": 2005
},
{
"epoch": 2.12,
"grad_norm": 1.06994559921509,
"learning_rate": 1.9949706195924235e-06,
"loss": 0.6743,
"step": 2010
},
{
"epoch": 2.13,
"grad_norm": 0.9930487524595831,
"learning_rate": 1.97267079117942e-06,
"loss": 0.6596,
"step": 2015
},
{
"epoch": 2.13,
"grad_norm": 1.0334858708046972,
"learning_rate": 1.950465628933863e-06,
"loss": 0.6679,
"step": 2020
},
{
"epoch": 2.14,
"grad_norm": 1.060064879245556,
"learning_rate": 1.9283558272227866e-06,
"loss": 0.6749,
"step": 2025
},
{
"epoch": 2.15,
"grad_norm": 1.0171368650427,
"learning_rate": 1.9063420774312509e-06,
"loss": 0.6703,
"step": 2030
},
{
"epoch": 2.15,
"grad_norm": 0.9646165360014197,
"learning_rate": 1.8844250679407272e-06,
"loss": 0.6878,
"step": 2035
},
{
"epoch": 2.16,
"grad_norm": 1.0209055430674492,
"learning_rate": 1.862605484107562e-06,
"loss": 0.7052,
"step": 2040
},
{
"epoch": 2.16,
"grad_norm": 1.0216869737250995,
"learning_rate": 1.840884008241549e-06,
"loss": 0.6778,
"step": 2045
},
{
"epoch": 2.17,
"grad_norm": 0.990030094537176,
"learning_rate": 1.819261319584602e-06,
"loss": 0.675,
"step": 2050
},
{
"epoch": 2.17,
"grad_norm": 0.9972968188321764,
"learning_rate": 1.7977380942895007e-06,
"loss": 0.6832,
"step": 2055
},
{
"epoch": 2.18,
"grad_norm": 1.002919858574642,
"learning_rate": 1.7763150053987532e-06,
"loss": 0.6669,
"step": 2060
},
{
"epoch": 2.18,
"grad_norm": 1.040641077805689,
"learning_rate": 1.7549927228235547e-06,
"loss": 0.6874,
"step": 2065
},
{
"epoch": 2.19,
"grad_norm": 1.0136593089712416,
"learning_rate": 1.7337719133228308e-06,
"loss": 0.6662,
"step": 2070
},
{
"epoch": 2.19,
"grad_norm": 1.0032381970613455,
"learning_rate": 1.7126532404823898e-06,
"loss": 0.657,
"step": 2075
},
{
"epoch": 2.2,
"grad_norm": 1.0107311218156156,
"learning_rate": 1.6916373646941774e-06,
"loss": 0.6706,
"step": 2080
},
{
"epoch": 2.2,
"grad_norm": 1.0313882769598175,
"learning_rate": 1.6707249431356188e-06,
"loss": 0.6803,
"step": 2085
},
{
"epoch": 2.21,
"grad_norm": 1.0013867402651844,
"learning_rate": 1.6499166297490716e-06,
"loss": 0.6896,
"step": 2090
},
{
"epoch": 2.21,
"grad_norm": 0.9974367112606389,
"learning_rate": 1.6292130752213747e-06,
"loss": 0.6773,
"step": 2095
},
{
"epoch": 2.22,
"grad_norm": 1.0457782650116,
"learning_rate": 1.6086149269635081e-06,
"loss": 0.668,
"step": 2100
},
{
"epoch": 2.22,
"grad_norm": 0.9930241935385495,
"learning_rate": 1.5881228290903367e-06,
"loss": 0.6508,
"step": 2105
},
{
"epoch": 2.23,
"grad_norm": 1.0059354322817335,
"learning_rate": 1.5677374224004793e-06,
"loss": 0.6529,
"step": 2110
},
{
"epoch": 2.24,
"grad_norm": 1.0338579100235163,
"learning_rate": 1.547459344356262e-06,
"loss": 0.6614,
"step": 2115
},
{
"epoch": 2.24,
"grad_norm": 1.0203126239591027,
"learning_rate": 1.5272892290637892e-06,
"loss": 0.6749,
"step": 2120
},
{
"epoch": 2.25,
"grad_norm": 0.983643586611109,
"learning_rate": 1.5072277072531127e-06,
"loss": 0.6517,
"step": 2125
},
{
"epoch": 2.25,
"grad_norm": 1.0203957676102433,
"learning_rate": 1.4872754062585126e-06,
"loss": 0.6716,
"step": 2130
},
{
"epoch": 2.26,
"grad_norm": 1.036201909144992,
"learning_rate": 1.4674329499988737e-06,
"loss": 0.6574,
"step": 2135
},
{
"epoch": 2.26,
"grad_norm": 1.0277085537623492,
"learning_rate": 1.4477009589581787e-06,
"loss": 0.6593,
"step": 2140
},
{
"epoch": 2.27,
"grad_norm": 0.9713425669443266,
"learning_rate": 1.4280800501661057e-06,
"loss": 0.6621,
"step": 2145
},
{
"epoch": 2.27,
"grad_norm": 1.028497947768737,
"learning_rate": 1.408570837178735e-06,
"loss": 0.6656,
"step": 2150
},
{
"epoch": 2.28,
"grad_norm": 1.0565632370972053,
"learning_rate": 1.3891739300593559e-06,
"loss": 0.6644,
"step": 2155
},
{
"epoch": 2.28,
"grad_norm": 1.0043346444991121,
"learning_rate": 1.369889935359402e-06,
"loss": 0.6539,
"step": 2160
},
{
"epoch": 2.29,
"grad_norm": 1.0294689299797029,
"learning_rate": 1.3507194560994657e-06,
"loss": 0.6666,
"step": 2165
},
{
"epoch": 2.29,
"grad_norm": 1.0123495429792864,
"learning_rate": 1.331663091750463e-06,
"loss": 0.6928,
"step": 2170
},
{
"epoch": 2.3,
"grad_norm": 0.9951164224382856,
"learning_rate": 1.312721438214869e-06,
"loss": 0.6501,
"step": 2175
},
{
"epoch": 2.3,
"grad_norm": 1.025832661356824,
"learning_rate": 1.293895087808098e-06,
"loss": 0.6658,
"step": 2180
},
{
"epoch": 2.31,
"grad_norm": 0.9888366700648139,
"learning_rate": 1.2751846292399705e-06,
"loss": 0.6592,
"step": 2185
},
{
"epoch": 2.31,
"grad_norm": 1.0208359350524125,
"learning_rate": 1.2565906475963102e-06,
"loss": 0.6483,
"step": 2190
},
{
"epoch": 2.32,
"grad_norm": 1.0568986951058392,
"learning_rate": 1.2381137243206455e-06,
"loss": 0.6557,
"step": 2195
},
{
"epoch": 2.32,
"grad_norm": 0.9849389521844061,
"learning_rate": 1.2197544371960317e-06,
"loss": 0.6488,
"step": 2200
},
{
"epoch": 2.33,
"grad_norm": 1.0466426799607875,
"learning_rate": 1.2015133603269753e-06,
"loss": 0.6596,
"step": 2205
},
{
"epoch": 2.34,
"grad_norm": 0.9985742048846067,
"learning_rate": 1.183391064121493e-06,
"loss": 0.6572,
"step": 2210
},
{
"epoch": 2.34,
"grad_norm": 0.9661312369342807,
"learning_rate": 1.1653881152732582e-06,
"loss": 0.6439,
"step": 2215
},
{
"epoch": 2.35,
"grad_norm": 1.0327058718249167,
"learning_rate": 1.1475050767439e-06,
"loss": 0.6811,
"step": 2220
},
{
"epoch": 2.35,
"grad_norm": 1.0365200638536969,
"learning_rate": 1.129742507745382e-06,
"loss": 0.6588,
"step": 2225
},
{
"epoch": 2.36,
"grad_norm": 0.9804079029045045,
"learning_rate": 1.1121009637225283e-06,
"loss": 0.6783,
"step": 2230
},
{
"epoch": 2.36,
"grad_norm": 1.0326866018136251,
"learning_rate": 1.0945809963356442e-06,
"loss": 0.6705,
"step": 2235
},
{
"epoch": 2.37,
"grad_norm": 1.0314679157662048,
"learning_rate": 1.0771831534432714e-06,
"loss": 0.6353,
"step": 2240
},
{
"epoch": 2.37,
"grad_norm": 0.9589889108924486,
"learning_rate": 1.0599079790850542e-06,
"loss": 0.655,
"step": 2245
},
{
"epoch": 2.38,
"grad_norm": 0.9894914192305704,
"learning_rate": 1.0427560134647308e-06,
"loss": 0.643,
"step": 2250
},
{
"epoch": 2.38,
"grad_norm": 1.0693419775513076,
"learning_rate": 1.0257277929332332e-06,
"loss": 0.6611,
"step": 2255
},
{
"epoch": 2.39,
"grad_norm": 0.9951590219864285,
"learning_rate": 1.0088238499719254e-06,
"loss": 0.6403,
"step": 2260
},
{
"epoch": 2.39,
"grad_norm": 1.0105626202971048,
"learning_rate": 9.920447131759392e-07,
"loss": 0.6707,
"step": 2265
},
{
"epoch": 2.4,
"grad_norm": 1.0186289750333066,
"learning_rate": 9.753909072376594e-07,
"loss": 0.6809,
"step": 2270
},
{
"epoch": 2.4,
"grad_norm": 1.0267980845318398,
"learning_rate": 9.58862952930304e-07,
"loss": 0.6642,
"step": 2275
},
{
"epoch": 2.41,
"grad_norm": 1.0314667402705489,
"learning_rate": 9.424613670916499e-07,
"loss": 0.6815,
"step": 2280
},
{
"epoch": 2.41,
"grad_norm": 0.9818510396592551,
"learning_rate": 9.261866626078625e-07,
"loss": 0.6579,
"step": 2285
},
{
"epoch": 2.42,
"grad_norm": 0.998040916561116,
"learning_rate": 9.100393483974612e-07,
"loss": 0.6815,
"step": 2290
},
{
"epoch": 2.43,
"grad_norm": 1.007529165875462,
"learning_rate": 8.940199293954033e-07,
"loss": 0.6609,
"step": 2295
},
{
"epoch": 2.43,
"grad_norm": 1.0489165413908048,
"learning_rate": 8.781289065373016e-07,
"loss": 0.6661,
"step": 2300
},
{
"epoch": 2.44,
"grad_norm": 1.0586483881635766,
"learning_rate": 8.623667767437483e-07,
"loss": 0.6494,
"step": 2305
},
{
"epoch": 2.44,
"grad_norm": 0.970861929985865,
"learning_rate": 8.467340329047874e-07,
"loss": 0.6403,
"step": 2310
},
{
"epoch": 2.45,
"grad_norm": 1.0315170437890622,
"learning_rate": 8.312311638644888e-07,
"loss": 0.6802,
"step": 2315
},
{
"epoch": 2.45,
"grad_norm": 1.018615901485097,
"learning_rate": 8.158586544056791e-07,
"loss": 0.6813,
"step": 2320
},
{
"epoch": 2.46,
"grad_norm": 0.9991739019084611,
"learning_rate": 8.00616985234764e-07,
"loss": 0.6757,
"step": 2325
},
{
"epoch": 2.46,
"grad_norm": 1.039226698329409,
"learning_rate": 7.855066329667121e-07,
"loss": 0.6421,
"step": 2330
},
{
"epoch": 2.47,
"grad_norm": 1.0505394427255816,
"learning_rate": 7.705280701101392e-07,
"loss": 0.6655,
"step": 2335
},
{
"epoch": 2.47,
"grad_norm": 0.9750027460632938,
"learning_rate": 7.556817650525383e-07,
"loss": 0.6526,
"step": 2340
},
{
"epoch": 2.48,
"grad_norm": 0.989246982143368,
"learning_rate": 7.409681820456315e-07,
"loss": 0.667,
"step": 2345
},
{
"epoch": 2.48,
"grad_norm": 0.9977414734019189,
"learning_rate": 7.263877811908553e-07,
"loss": 0.6647,
"step": 2350
},
{
"epoch": 2.49,
"grad_norm": 0.9875292562685886,
"learning_rate": 7.11941018424967e-07,
"loss": 0.667,
"step": 2355
},
{
"epoch": 2.49,
"grad_norm": 0.9932801930288735,
"learning_rate": 6.97628345505797e-07,
"loss": 0.6511,
"step": 2360
},
{
"epoch": 2.5,
"grad_norm": 1.0199295886729471,
"learning_rate": 6.83450209998106e-07,
"loss": 0.6556,
"step": 2365
},
{
"epoch": 2.5,
"grad_norm": 1.0279710885988984,
"learning_rate": 6.694070552596105e-07,
"loss": 0.6676,
"step": 2370
},
{
"epoch": 2.51,
"grad_norm": 1.0221845787587531,
"learning_rate": 6.554993204270993e-07,
"loss": 0.6512,
"step": 2375
},
{
"epoch": 2.52,
"grad_norm": 0.9597530531552908,
"learning_rate": 6.417274404027163e-07,
"loss": 0.6482,
"step": 2380
},
{
"epoch": 2.52,
"grad_norm": 1.0201542647464452,
"learning_rate": 6.280918458403506e-07,
"loss": 0.6623,
"step": 2385
},
{
"epoch": 2.53,
"grad_norm": 0.9818765108255797,
"learning_rate": 6.14592963132174e-07,
"loss": 0.6599,
"step": 2390
},
{
"epoch": 2.53,
"grad_norm": 1.0020031777534095,
"learning_rate": 6.012312143953075e-07,
"loss": 0.6818,
"step": 2395
},
{
"epoch": 2.54,
"grad_norm": 1.020601700800406,
"learning_rate": 5.880070174586228e-07,
"loss": 0.6794,
"step": 2400
},
{
"epoch": 2.54,
"grad_norm": 0.9781529112263975,
"learning_rate": 5.74920785849673e-07,
"loss": 0.6612,
"step": 2405
},
{
"epoch": 2.55,
"grad_norm": 1.020456830272749,
"learning_rate": 5.619729287817621e-07,
"loss": 0.6638,
"step": 2410
},
{
"epoch": 2.55,
"grad_norm": 1.0134058298180835,
"learning_rate": 5.49163851141154e-07,
"loss": 0.6468,
"step": 2415
},
{
"epoch": 2.56,
"grad_norm": 1.0051724307379968,
"learning_rate": 5.36493953474404e-07,
"loss": 0.6411,
"step": 2420
},
{
"epoch": 2.56,
"grad_norm": 0.9963926377815217,
"learning_rate": 5.239636319758356e-07,
"loss": 0.668,
"step": 2425
},
{
"epoch": 2.57,
"grad_norm": 0.9731428272925532,
"learning_rate": 5.115732784751576e-07,
"loss": 0.6444,
"step": 2430
},
{
"epoch": 2.57,
"grad_norm": 1.0185774017291327,
"learning_rate": 4.993232804252018e-07,
"loss": 0.6529,
"step": 2435
},
{
"epoch": 2.58,
"grad_norm": 1.00711656230006,
"learning_rate": 4.872140208898118e-07,
"loss": 0.6539,
"step": 2440
},
{
"epoch": 2.58,
"grad_norm": 1.0045164786035452,
"learning_rate": 4.7524587853186866e-07,
"loss": 0.6629,
"step": 2445
},
{
"epoch": 2.59,
"grad_norm": 0.9961645157673277,
"learning_rate": 4.634192276014399e-07,
"loss": 0.6738,
"step": 2450
},
{
"epoch": 2.59,
"grad_norm": 1.0214318273829783,
"learning_rate": 4.5173443792408625e-07,
"loss": 0.6552,
"step": 2455
},
{
"epoch": 2.6,
"grad_norm": 1.0163355618069994,
"learning_rate": 4.4019187488928914e-07,
"loss": 0.6638,
"step": 2460
},
{
"epoch": 2.61,
"grad_norm": 1.032574771687925,
"learning_rate": 4.2879189943903335e-07,
"loss": 0.6877,
"step": 2465
},
{
"epoch": 2.61,
"grad_norm": 0.9930486578442914,
"learning_rate": 4.1753486805651e-07,
"loss": 0.6832,
"step": 2470
},
{
"epoch": 2.62,
"grad_norm": 0.969259241462703,
"learning_rate": 4.064211327549794e-07,
"loss": 0.6738,
"step": 2475
},
{
"epoch": 2.62,
"grad_norm": 1.018380412495952,
"learning_rate": 3.95451041066755e-07,
"loss": 0.671,
"step": 2480
},
{
"epoch": 2.63,
"grad_norm": 0.9735720562840744,
"learning_rate": 3.8462493603234064e-07,
"loss": 0.6433,
"step": 2485
},
{
"epoch": 2.63,
"grad_norm": 1.023935871901339,
"learning_rate": 3.739431561897011e-07,
"loss": 0.6593,
"step": 2490
},
{
"epoch": 2.64,
"grad_norm": 0.9931869209408388,
"learning_rate": 3.634060355636798e-07,
"loss": 0.6647,
"step": 2495
},
{
"epoch": 2.64,
"grad_norm": 1.0007736035504975,
"learning_rate": 3.53013903655548e-07,
"loss": 0.6683,
"step": 2500
},
{
"epoch": 2.65,
"grad_norm": 0.9926593135266999,
"learning_rate": 3.427670854327042e-07,
"loss": 0.6668,
"step": 2505
},
{
"epoch": 2.65,
"grad_norm": 0.9870259704326787,
"learning_rate": 3.3266590131851296e-07,
"loss": 0.6583,
"step": 2510
},
{
"epoch": 2.66,
"grad_norm": 1.0298553599069395,
"learning_rate": 3.227106671822849e-07,
"loss": 0.6835,
"step": 2515
},
{
"epoch": 2.66,
"grad_norm": 0.9915918166378904,
"learning_rate": 3.1290169432939556e-07,
"loss": 0.6428,
"step": 2520
},
{
"epoch": 2.67,
"grad_norm": 1.060474012796049,
"learning_rate": 3.03239289491557e-07,
"loss": 0.6571,
"step": 2525
},
{
"epoch": 2.67,
"grad_norm": 1.0203183687136719,
"learning_rate": 2.937237548172206e-07,
"loss": 0.6511,
"step": 2530
},
{
"epoch": 2.68,
"grad_norm": 0.989507237700814,
"learning_rate": 2.8435538786213134e-07,
"loss": 0.6746,
"step": 2535
},
{
"epoch": 2.68,
"grad_norm": 0.9853274639882493,
"learning_rate": 2.7513448158002334e-07,
"loss": 0.6657,
"step": 2540
},
{
"epoch": 2.69,
"grad_norm": 0.9957797339050202,
"learning_rate": 2.66061324313458e-07,
"loss": 0.6496,
"step": 2545
},
{
"epoch": 2.69,
"grad_norm": 1.0073836211394178,
"learning_rate": 2.5713619978480653e-07,
"loss": 0.6596,
"step": 2550
},
{
"epoch": 2.7,
"grad_norm": 0.9798969178233458,
"learning_rate": 2.483593870873829e-07,
"loss": 0.654,
"step": 2555
},
{
"epoch": 2.71,
"grad_norm": 0.9936847658098146,
"learning_rate": 2.3973116067670665e-07,
"loss": 0.6457,
"step": 2560
},
{
"epoch": 2.71,
"grad_norm": 1.0224466038654803,
"learning_rate": 2.3125179036193214e-07,
"loss": 0.6572,
"step": 2565
},
{
"epoch": 2.72,
"grad_norm": 1.0378183041017084,
"learning_rate": 2.2292154129740117e-07,
"loss": 0.6554,
"step": 2570
},
{
"epoch": 2.72,
"grad_norm": 0.9787357607930246,
"learning_rate": 2.147406739743596e-07,
"loss": 0.6689,
"step": 2575
},
{
"epoch": 2.73,
"grad_norm": 1.003947207260689,
"learning_rate": 2.0670944421280646e-07,
"loss": 0.6458,
"step": 2580
},
{
"epoch": 2.73,
"grad_norm": 1.0063190015667964,
"learning_rate": 1.9882810315349554e-07,
"loss": 0.6648,
"step": 2585
},
{
"epoch": 2.74,
"grad_norm": 1.0148103533053272,
"learning_rate": 1.9109689725008317e-07,
"loss": 0.6738,
"step": 2590
},
{
"epoch": 2.74,
"grad_norm": 1.0122729219524842,
"learning_rate": 1.8351606826142176e-07,
"loss": 0.6796,
"step": 2595
},
{
"epoch": 2.75,
"grad_norm": 1.0170129872933447,
"learning_rate": 1.7608585324399684e-07,
"loss": 0.6798,
"step": 2600
},
{
"epoch": 2.75,
"grad_norm": 0.992464215850126,
"learning_rate": 1.688064845445192e-07,
"loss": 0.6695,
"step": 2605
},
{
"epoch": 2.76,
"grad_norm": 0.9778375876093532,
"learning_rate": 1.6167818979265282e-07,
"loss": 0.6563,
"step": 2610
},
{
"epoch": 2.76,
"grad_norm": 1.0165595693382412,
"learning_rate": 1.5470119189390342e-07,
"loss": 0.6709,
"step": 2615
},
{
"epoch": 2.77,
"grad_norm": 0.9846854115443192,
"learning_rate": 1.4787570902264293e-07,
"loss": 0.6468,
"step": 2620
},
{
"epoch": 2.77,
"grad_norm": 1.0226129803358943,
"learning_rate": 1.4120195461529097e-07,
"loss": 0.6699,
"step": 2625
},
{
"epoch": 2.78,
"grad_norm": 1.0082916511837874,
"learning_rate": 1.3468013736363694e-07,
"loss": 0.6516,
"step": 2630
},
{
"epoch": 2.78,
"grad_norm": 1.0086534086914538,
"learning_rate": 1.2831046120831692e-07,
"loss": 0.6483,
"step": 2635
},
{
"epoch": 2.79,
"grad_norm": 0.9957571698657345,
"learning_rate": 1.2209312533243535e-07,
"loss": 0.6632,
"step": 2640
},
{
"epoch": 2.8,
"grad_norm": 1.0298383480420663,
"learning_rate": 1.1602832415533616e-07,
"loss": 0.6645,
"step": 2645
},
{
"epoch": 2.8,
"grad_norm": 1.0188314052602203,
"learning_rate": 1.1011624732652437e-07,
"loss": 0.6752,
"step": 2650
},
{
"epoch": 2.81,
"grad_norm": 1.0019681746822835,
"learning_rate": 1.0435707971973297e-07,
"loss": 0.6573,
"step": 2655
},
{
"epoch": 2.81,
"grad_norm": 0.9926022445477827,
"learning_rate": 9.875100142714478e-08,
"loss": 0.6396,
"step": 2660
},
{
"epoch": 2.82,
"grad_norm": 0.9847567872289796,
"learning_rate": 9.329818775376088e-08,
"loss": 0.672,
"step": 2665
},
{
"epoch": 2.82,
"grad_norm": 1.0103069579844817,
"learning_rate": 8.79988092119144e-08,
"loss": 0.678,
"step": 2670
},
{
"epoch": 2.83,
"grad_norm": 1.0092463732513441,
"learning_rate": 8.285303151594537e-08,
"loss": 0.6837,
"step": 2675
},
{
"epoch": 2.83,
"grad_norm": 1.0032753352403014,
"learning_rate": 7.786101557701209e-08,
"loss": 0.6494,
"step": 2680
},
{
"epoch": 2.84,
"grad_norm": 1.0278927407365124,
"learning_rate": 7.302291749806345e-08,
"loss": 0.6597,
"step": 2685
},
{
"epoch": 2.84,
"grad_norm": 0.9985234255556347,
"learning_rate": 6.833888856895676e-08,
"loss": 0.6672,
"step": 2690
},
{
"epoch": 2.85,
"grad_norm": 1.0086435046290338,
"learning_rate": 6.380907526172597e-08,
"loss": 0.6768,
"step": 2695
},
{
"epoch": 2.85,
"grad_norm": 0.9639413787477988,
"learning_rate": 5.943361922600255e-08,
"loss": 0.6346,
"step": 2700
},
{
"epoch": 2.86,
"grad_norm": 0.9898392259409212,
"learning_rate": 5.521265728458347e-08,
"loss": 0.6655,
"step": 2705
},
{
"epoch": 2.86,
"grad_norm": 1.0000733408715612,
"learning_rate": 5.114632142915687e-08,
"loss": 0.638,
"step": 2710
},
{
"epoch": 2.87,
"grad_norm": 0.990452054352071,
"learning_rate": 4.723473881617147e-08,
"loss": 0.6583,
"step": 2715
},
{
"epoch": 2.87,
"grad_norm": 0.988717000145255,
"learning_rate": 4.347803176286025e-08,
"loss": 0.6708,
"step": 2720
},
{
"epoch": 2.88,
"grad_norm": 0.9868081897157113,
"learning_rate": 3.98763177434186e-08,
"loss": 0.6583,
"step": 2725
},
{
"epoch": 2.89,
"grad_norm": 1.001603936622736,
"learning_rate": 3.642970938532553e-08,
"loss": 0.6754,
"step": 2730
},
{
"epoch": 2.89,
"grad_norm": 1.0028854813842756,
"learning_rate": 3.313831446582816e-08,
"loss": 0.6784,
"step": 2735
},
{
"epoch": 2.9,
"grad_norm": 0.9840591494137083,
"learning_rate": 3.000223590856666e-08,
"loss": 0.6651,
"step": 2740
},
{
"epoch": 2.9,
"grad_norm": 1.0425902900408417,
"learning_rate": 2.7021571780356804e-08,
"loss": 0.6489,
"step": 2745
},
{
"epoch": 2.91,
"grad_norm": 1.0016271763738829,
"learning_rate": 2.419641528812522e-08,
"loss": 0.6501,
"step": 2750
},
{
"epoch": 2.91,
"grad_norm": 0.9875844742537229,
"learning_rate": 2.1526854775992255e-08,
"loss": 0.667,
"step": 2755
},
{
"epoch": 2.92,
"grad_norm": 0.9909068409835267,
"learning_rate": 1.901297372251143e-08,
"loss": 0.6649,
"step": 2760
},
{
"epoch": 2.92,
"grad_norm": 1.0200770120528766,
"learning_rate": 1.665485073805817e-08,
"loss": 0.6542,
"step": 2765
},
{
"epoch": 2.93,
"grad_norm": 0.9699214260408161,
"learning_rate": 1.4452559562370683e-08,
"loss": 0.6644,
"step": 2770
},
{
"epoch": 2.93,
"grad_norm": 0.9962874170809767,
"learning_rate": 1.2406169062246232e-08,
"loss": 0.6502,
"step": 2775
},
{
"epoch": 2.94,
"grad_norm": 1.0264867036759864,
"learning_rate": 1.0515743229385645e-08,
"loss": 0.6698,
"step": 2780
},
{
"epoch": 2.94,
"grad_norm": 1.0133222133442825,
"learning_rate": 8.781341178393244e-09,
"loss": 0.6723,
"step": 2785
},
{
"epoch": 2.95,
"grad_norm": 1.0159129157737807,
"learning_rate": 7.203017144927771e-09,
"loss": 0.6561,
"step": 2790
},
{
"epoch": 2.95,
"grad_norm": 0.9931795490054022,
"learning_rate": 5.780820484007632e-09,
"loss": 0.6563,
"step": 2795
},
{
"epoch": 2.96,
"grad_norm": 1.0195254872888724,
"learning_rate": 4.514795668466576e-09,
"loss": 0.6808,
"step": 2800
},
{
"epoch": 2.96,
"grad_norm": 1.0210108366337896,
"learning_rate": 3.4049822875614757e-09,
"loss": 0.6723,
"step": 2805
},
{
"epoch": 2.97,
"grad_norm": 0.9891130306027911,
"learning_rate": 2.4514150457377594e-09,
"loss": 0.6763,
"step": 2810
},
{
"epoch": 2.97,
"grad_norm": 0.9876265686294937,
"learning_rate": 1.654123761541393e-09,
"loss": 0.6652,
"step": 2815
},
{
"epoch": 2.98,
"grad_norm": 0.9719073327336301,
"learning_rate": 1.0131333666885124e-09,
"loss": 0.6793,
"step": 2820
},
{
"epoch": 2.99,
"grad_norm": 1.004648101535836,
"learning_rate": 5.284639052832718e-10,
"loss": 0.6643,
"step": 2825
},
{
"epoch": 2.99,
"grad_norm": 1.0172517540637482,
"learning_rate": 2.0013053319334341e-10,
"loss": 0.6768,
"step": 2830
},
{
"epoch": 3.0,
"grad_norm": 0.9650966122076953,
"learning_rate": 2.814351757529643e-11,
"loss": 0.6356,
"step": 2835
},
{
"epoch": 3.0,
"step": 2838,
"total_flos": 1471706245890048.0,
"train_loss": 0.8058284866381398,
"train_runtime": 31310.8966,
"train_samples_per_second": 5.802,
"train_steps_per_second": 0.091
}
],
"logging_steps": 5,
"max_steps": 2838,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1471706245890048.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}