llama3-meta_material-3epochs / trainer_state.json
Jackie999's picture
Model save
ad55c8d verified
raw
history blame contribute delete
No virus
128 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3957,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.08993126249565843,
"learning_rate": 5.050505050505052e-07,
"loss": 1.9018,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.07331289037109069,
"learning_rate": 2.5252525252525253e-06,
"loss": 1.764,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 0.07765988729484478,
"learning_rate": 5.050505050505051e-06,
"loss": 1.6755,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.09328963425207142,
"learning_rate": 7.5757575757575764e-06,
"loss": 1.7942,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 0.08439300726187475,
"learning_rate": 1.0101010101010101e-05,
"loss": 1.9255,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.09969799609843567,
"learning_rate": 1.2626262626262628e-05,
"loss": 1.6785,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.08101062126150285,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.6021,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 0.09324937558599246,
"learning_rate": 1.7676767676767676e-05,
"loss": 1.8021,
"step": 35
},
{
"epoch": 0.03,
"grad_norm": 0.08809158633106223,
"learning_rate": 2.0202020202020203e-05,
"loss": 1.8128,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.12040981734567617,
"learning_rate": 2.272727272727273e-05,
"loss": 1.9518,
"step": 45
},
{
"epoch": 0.04,
"grad_norm": 0.10564260792758735,
"learning_rate": 2.5252525252525256e-05,
"loss": 1.844,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 0.11067902372614258,
"learning_rate": 2.777777777777778e-05,
"loss": 1.7309,
"step": 55
},
{
"epoch": 0.05,
"grad_norm": 0.17208150270588693,
"learning_rate": 3.0303030303030306e-05,
"loss": 1.83,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 0.17753444313487116,
"learning_rate": 3.282828282828283e-05,
"loss": 1.7143,
"step": 65
},
{
"epoch": 0.05,
"grad_norm": 0.11795959596262973,
"learning_rate": 3.535353535353535e-05,
"loss": 1.6863,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 0.1604849588266011,
"learning_rate": 3.787878787878788e-05,
"loss": 1.7895,
"step": 75
},
{
"epoch": 0.06,
"grad_norm": 0.14614002357667696,
"learning_rate": 4.0404040404040405e-05,
"loss": 1.7037,
"step": 80
},
{
"epoch": 0.06,
"grad_norm": 0.17612584653207902,
"learning_rate": 4.292929292929293e-05,
"loss": 1.7624,
"step": 85
},
{
"epoch": 0.07,
"grad_norm": 0.1725623148760258,
"learning_rate": 4.545454545454546e-05,
"loss": 1.7826,
"step": 90
},
{
"epoch": 0.07,
"grad_norm": 0.20023707797673052,
"learning_rate": 4.797979797979798e-05,
"loss": 1.7551,
"step": 95
},
{
"epoch": 0.08,
"grad_norm": 0.19340080776803978,
"learning_rate": 5.050505050505051e-05,
"loss": 1.8434,
"step": 100
},
{
"epoch": 0.08,
"grad_norm": 0.17540911026085398,
"learning_rate": 5.303030303030303e-05,
"loss": 1.8444,
"step": 105
},
{
"epoch": 0.08,
"grad_norm": 0.17607693067428137,
"learning_rate": 5.555555555555556e-05,
"loss": 1.7179,
"step": 110
},
{
"epoch": 0.09,
"grad_norm": 0.18623446289553894,
"learning_rate": 5.808080808080808e-05,
"loss": 1.8005,
"step": 115
},
{
"epoch": 0.09,
"grad_norm": 0.22653423501586226,
"learning_rate": 6.060606060606061e-05,
"loss": 1.8171,
"step": 120
},
{
"epoch": 0.09,
"grad_norm": 0.19917898944232487,
"learning_rate": 6.313131313131313e-05,
"loss": 1.7935,
"step": 125
},
{
"epoch": 0.1,
"grad_norm": 0.17977021019465064,
"learning_rate": 6.565656565656566e-05,
"loss": 1.847,
"step": 130
},
{
"epoch": 0.1,
"grad_norm": 0.21882213186080465,
"learning_rate": 6.818181818181818e-05,
"loss": 1.7249,
"step": 135
},
{
"epoch": 0.11,
"grad_norm": 0.19872378885041136,
"learning_rate": 7.07070707070707e-05,
"loss": 1.8317,
"step": 140
},
{
"epoch": 0.11,
"grad_norm": 0.18503126257452687,
"learning_rate": 7.323232323232324e-05,
"loss": 1.7036,
"step": 145
},
{
"epoch": 0.11,
"grad_norm": 0.19374257378242796,
"learning_rate": 7.575757575757576e-05,
"loss": 1.7408,
"step": 150
},
{
"epoch": 0.12,
"grad_norm": 0.20435751977610797,
"learning_rate": 7.828282828282829e-05,
"loss": 1.7453,
"step": 155
},
{
"epoch": 0.12,
"grad_norm": 0.18626365580812038,
"learning_rate": 8.080808080808081e-05,
"loss": 1.7538,
"step": 160
},
{
"epoch": 0.13,
"grad_norm": 0.22638414276196805,
"learning_rate": 8.333333333333334e-05,
"loss": 1.7755,
"step": 165
},
{
"epoch": 0.13,
"grad_norm": 0.19644895370384188,
"learning_rate": 8.585858585858586e-05,
"loss": 1.8446,
"step": 170
},
{
"epoch": 0.13,
"grad_norm": 0.19159413735540007,
"learning_rate": 8.83838383838384e-05,
"loss": 1.6323,
"step": 175
},
{
"epoch": 0.14,
"grad_norm": 0.17020103839194523,
"learning_rate": 9.090909090909092e-05,
"loss": 1.7957,
"step": 180
},
{
"epoch": 0.14,
"grad_norm": 0.19164694691999767,
"learning_rate": 9.343434343434344e-05,
"loss": 1.9204,
"step": 185
},
{
"epoch": 0.14,
"grad_norm": 0.19378174604020243,
"learning_rate": 9.595959595959596e-05,
"loss": 1.6792,
"step": 190
},
{
"epoch": 0.15,
"grad_norm": 0.24199163008867994,
"learning_rate": 9.848484848484849e-05,
"loss": 1.7818,
"step": 195
},
{
"epoch": 0.15,
"grad_norm": 0.1791702851062047,
"learning_rate": 0.00010101010101010102,
"loss": 1.6407,
"step": 200
},
{
"epoch": 0.16,
"grad_norm": 0.2127448005277486,
"learning_rate": 0.00010353535353535353,
"loss": 1.8173,
"step": 205
},
{
"epoch": 0.16,
"grad_norm": 0.18625979651987537,
"learning_rate": 0.00010606060606060606,
"loss": 1.7401,
"step": 210
},
{
"epoch": 0.16,
"grad_norm": 0.2602576963144457,
"learning_rate": 0.0001085858585858586,
"loss": 1.8104,
"step": 215
},
{
"epoch": 0.17,
"grad_norm": 0.19387518149584881,
"learning_rate": 0.00011111111111111112,
"loss": 1.8442,
"step": 220
},
{
"epoch": 0.17,
"grad_norm": 0.22413096563678928,
"learning_rate": 0.00011363636363636365,
"loss": 1.6172,
"step": 225
},
{
"epoch": 0.17,
"grad_norm": 0.21913536165908545,
"learning_rate": 0.00011616161616161616,
"loss": 1.6973,
"step": 230
},
{
"epoch": 0.18,
"grad_norm": 0.2083524734994055,
"learning_rate": 0.00011868686868686869,
"loss": 1.7593,
"step": 235
},
{
"epoch": 0.18,
"grad_norm": 0.22803196006710846,
"learning_rate": 0.00012121212121212122,
"loss": 1.731,
"step": 240
},
{
"epoch": 0.19,
"grad_norm": 0.16039461658504198,
"learning_rate": 0.00012373737373737374,
"loss": 1.5913,
"step": 245
},
{
"epoch": 0.19,
"grad_norm": 0.2185859764067758,
"learning_rate": 0.00012626262626262626,
"loss": 1.637,
"step": 250
},
{
"epoch": 0.19,
"grad_norm": 0.19449925000530618,
"learning_rate": 0.00012878787878787878,
"loss": 1.5634,
"step": 255
},
{
"epoch": 0.2,
"grad_norm": 0.18094727231062543,
"learning_rate": 0.00013131313131313133,
"loss": 1.6769,
"step": 260
},
{
"epoch": 0.2,
"grad_norm": 0.2362383395641708,
"learning_rate": 0.00013383838383838385,
"loss": 1.7723,
"step": 265
},
{
"epoch": 0.2,
"grad_norm": 0.1756303905738309,
"learning_rate": 0.00013636363636363637,
"loss": 1.7622,
"step": 270
},
{
"epoch": 0.21,
"grad_norm": 0.18784556886056825,
"learning_rate": 0.0001388888888888889,
"loss": 1.648,
"step": 275
},
{
"epoch": 0.21,
"grad_norm": 0.23195176017229427,
"learning_rate": 0.0001414141414141414,
"loss": 1.846,
"step": 280
},
{
"epoch": 0.22,
"grad_norm": 0.22241261139284105,
"learning_rate": 0.00014393939393939396,
"loss": 1.6282,
"step": 285
},
{
"epoch": 0.22,
"grad_norm": 0.1959378752266171,
"learning_rate": 0.00014646464646464648,
"loss": 1.7298,
"step": 290
},
{
"epoch": 0.22,
"grad_norm": 0.18110574265575713,
"learning_rate": 0.000148989898989899,
"loss": 1.6463,
"step": 295
},
{
"epoch": 0.23,
"grad_norm": 0.19727075597861782,
"learning_rate": 0.00015151515151515152,
"loss": 1.7925,
"step": 300
},
{
"epoch": 0.23,
"grad_norm": 0.20574697015902954,
"learning_rate": 0.00015404040404040406,
"loss": 1.6835,
"step": 305
},
{
"epoch": 0.24,
"grad_norm": 0.18178501884804188,
"learning_rate": 0.00015656565656565658,
"loss": 1.8534,
"step": 310
},
{
"epoch": 0.24,
"grad_norm": 0.20396286221213047,
"learning_rate": 0.0001590909090909091,
"loss": 1.9553,
"step": 315
},
{
"epoch": 0.24,
"grad_norm": 0.19731656072570272,
"learning_rate": 0.00016161616161616162,
"loss": 1.7907,
"step": 320
},
{
"epoch": 0.25,
"grad_norm": 0.15745281662564334,
"learning_rate": 0.00016414141414141414,
"loss": 1.7516,
"step": 325
},
{
"epoch": 0.25,
"grad_norm": 0.17389045576146547,
"learning_rate": 0.0001666666666666667,
"loss": 1.6362,
"step": 330
},
{
"epoch": 0.25,
"grad_norm": 0.2055099842458337,
"learning_rate": 0.00016919191919191918,
"loss": 1.711,
"step": 335
},
{
"epoch": 0.26,
"grad_norm": 0.16967943859036833,
"learning_rate": 0.00017171717171717173,
"loss": 1.7327,
"step": 340
},
{
"epoch": 0.26,
"grad_norm": 0.20493364192749108,
"learning_rate": 0.00017424242424242425,
"loss": 1.7575,
"step": 345
},
{
"epoch": 0.27,
"grad_norm": 0.22713873700263487,
"learning_rate": 0.0001767676767676768,
"loss": 1.6266,
"step": 350
},
{
"epoch": 0.27,
"grad_norm": 0.22661135493794904,
"learning_rate": 0.00017929292929292931,
"loss": 1.5738,
"step": 355
},
{
"epoch": 0.27,
"grad_norm": 0.2181059846275241,
"learning_rate": 0.00018181818181818183,
"loss": 1.6742,
"step": 360
},
{
"epoch": 0.28,
"grad_norm": 0.17088148508773793,
"learning_rate": 0.00018434343434343435,
"loss": 1.7483,
"step": 365
},
{
"epoch": 0.28,
"grad_norm": 0.2533679574468662,
"learning_rate": 0.00018686868686868687,
"loss": 1.8377,
"step": 370
},
{
"epoch": 0.28,
"grad_norm": 0.19728510430536142,
"learning_rate": 0.00018939393939393942,
"loss": 1.6412,
"step": 375
},
{
"epoch": 0.29,
"grad_norm": 0.1846496893995934,
"learning_rate": 0.00019191919191919191,
"loss": 1.6605,
"step": 380
},
{
"epoch": 0.29,
"grad_norm": 0.20677111282109845,
"learning_rate": 0.00019444444444444446,
"loss": 1.9151,
"step": 385
},
{
"epoch": 0.3,
"grad_norm": 0.1843158891748435,
"learning_rate": 0.00019696969696969698,
"loss": 1.6697,
"step": 390
},
{
"epoch": 0.3,
"grad_norm": 0.19889363775332344,
"learning_rate": 0.0001994949494949495,
"loss": 1.7293,
"step": 395
},
{
"epoch": 0.3,
"grad_norm": 0.18003384908917786,
"learning_rate": 0.00019999937734807612,
"loss": 1.8024,
"step": 400
},
{
"epoch": 0.31,
"grad_norm": 0.18260287569380637,
"learning_rate": 0.00019999684783792443,
"loss": 1.6779,
"step": 405
},
{
"epoch": 0.31,
"grad_norm": 0.21622290040357123,
"learning_rate": 0.00019999237260298072,
"loss": 1.6577,
"step": 410
},
{
"epoch": 0.31,
"grad_norm": 0.19728853094941184,
"learning_rate": 0.00019998595173032347,
"loss": 1.6211,
"step": 415
},
{
"epoch": 0.32,
"grad_norm": 0.16427481018358323,
"learning_rate": 0.00019997758534488915,
"loss": 1.6793,
"step": 420
},
{
"epoch": 0.32,
"grad_norm": 0.1464512551401983,
"learning_rate": 0.00019996727360946972,
"loss": 1.731,
"step": 425
},
{
"epoch": 0.33,
"grad_norm": 0.1895744669006413,
"learning_rate": 0.00019995501672470951,
"loss": 1.7024,
"step": 430
},
{
"epoch": 0.33,
"grad_norm": 0.13688692966034832,
"learning_rate": 0.00019994081492910124,
"loss": 1.8371,
"step": 435
},
{
"epoch": 0.33,
"grad_norm": 0.20101726127225358,
"learning_rate": 0.0001999246684989815,
"loss": 1.7473,
"step": 440
},
{
"epoch": 0.34,
"grad_norm": 0.20241522090213954,
"learning_rate": 0.00019990657774852534,
"loss": 1.7423,
"step": 445
},
{
"epoch": 0.34,
"grad_norm": 0.1767592377256186,
"learning_rate": 0.00019988654302974,
"loss": 1.8304,
"step": 450
},
{
"epoch": 0.34,
"grad_norm": 0.2373965969657545,
"learning_rate": 0.00019986456473245826,
"loss": 1.8509,
"step": 455
},
{
"epoch": 0.35,
"grad_norm": 0.21300866974991087,
"learning_rate": 0.00019984064328433084,
"loss": 1.7339,
"step": 460
},
{
"epoch": 0.35,
"grad_norm": 0.185425069119908,
"learning_rate": 0.00019981477915081793,
"loss": 1.7523,
"step": 465
},
{
"epoch": 0.36,
"grad_norm": 0.1955299660793198,
"learning_rate": 0.00019978697283518023,
"loss": 1.804,
"step": 470
},
{
"epoch": 0.36,
"grad_norm": 0.19829821726437152,
"learning_rate": 0.00019975722487846918,
"loss": 1.8378,
"step": 475
},
{
"epoch": 0.36,
"grad_norm": 0.13816451624075418,
"learning_rate": 0.0001997255358595164,
"loss": 1.791,
"step": 480
},
{
"epoch": 0.37,
"grad_norm": 0.17416550139224937,
"learning_rate": 0.00019969190639492244,
"loss": 1.6882,
"step": 485
},
{
"epoch": 0.37,
"grad_norm": 0.19361009276270708,
"learning_rate": 0.00019965633713904472,
"loss": 1.7448,
"step": 490
},
{
"epoch": 0.38,
"grad_norm": 0.20655351119978135,
"learning_rate": 0.00019961882878398492,
"loss": 1.7804,
"step": 495
},
{
"epoch": 0.38,
"grad_norm": 0.2104318907698028,
"learning_rate": 0.0001995793820595754,
"loss": 1.7399,
"step": 500
},
{
"epoch": 0.38,
"grad_norm": 0.1970506865196183,
"learning_rate": 0.00019953799773336507,
"loss": 1.662,
"step": 505
},
{
"epoch": 0.39,
"grad_norm": 0.12911497323739385,
"learning_rate": 0.00019949467661060433,
"loss": 1.6589,
"step": 510
},
{
"epoch": 0.39,
"grad_norm": 0.18727055645023982,
"learning_rate": 0.00019944941953422968,
"loss": 1.7437,
"step": 515
},
{
"epoch": 0.39,
"grad_norm": 0.21063285499774953,
"learning_rate": 0.000199402227384847,
"loss": 1.837,
"step": 520
},
{
"epoch": 0.4,
"grad_norm": 0.16895692207829008,
"learning_rate": 0.00019935310108071453,
"loss": 1.7406,
"step": 525
},
{
"epoch": 0.4,
"grad_norm": 0.2316031917603028,
"learning_rate": 0.00019930204157772515,
"loss": 1.8237,
"step": 530
},
{
"epoch": 0.41,
"grad_norm": 0.14077975973845075,
"learning_rate": 0.00019924904986938754,
"loss": 1.8804,
"step": 535
},
{
"epoch": 0.41,
"grad_norm": 0.25152961069767266,
"learning_rate": 0.000199194126986807,
"loss": 1.7984,
"step": 540
},
{
"epoch": 0.41,
"grad_norm": 0.18475665649785333,
"learning_rate": 0.00019913727399866545,
"loss": 1.7,
"step": 545
},
{
"epoch": 0.42,
"grad_norm": 0.15993162566307856,
"learning_rate": 0.00019907849201120033,
"loss": 1.8694,
"step": 550
},
{
"epoch": 0.42,
"grad_norm": 0.21887423989587396,
"learning_rate": 0.00019901778216818345,
"loss": 1.699,
"step": 555
},
{
"epoch": 0.42,
"grad_norm": 0.18385555657977046,
"learning_rate": 0.00019895514565089855,
"loss": 1.7936,
"step": 560
},
{
"epoch": 0.43,
"grad_norm": 0.15762946661816535,
"learning_rate": 0.00019889058367811822,
"loss": 1.6613,
"step": 565
},
{
"epoch": 0.43,
"grad_norm": 0.19204775302628793,
"learning_rate": 0.0001988240975060804,
"loss": 1.5856,
"step": 570
},
{
"epoch": 0.44,
"grad_norm": 0.1697199863146858,
"learning_rate": 0.00019875568842846382,
"loss": 1.672,
"step": 575
},
{
"epoch": 0.44,
"grad_norm": 0.1410887592852674,
"learning_rate": 0.0001986853577763628,
"loss": 1.6269,
"step": 580
},
{
"epoch": 0.44,
"grad_norm": 0.1783222763204088,
"learning_rate": 0.00019861310691826143,
"loss": 1.8029,
"step": 585
},
{
"epoch": 0.45,
"grad_norm": 0.20484278901882244,
"learning_rate": 0.00019853893726000683,
"loss": 1.6194,
"step": 590
},
{
"epoch": 0.45,
"grad_norm": 0.1808969694192384,
"learning_rate": 0.00019846285024478202,
"loss": 1.7084,
"step": 595
},
{
"epoch": 0.45,
"grad_norm": 0.1965951187170914,
"learning_rate": 0.00019838484735307748,
"loss": 1.706,
"step": 600
},
{
"epoch": 0.46,
"grad_norm": 0.1555012346720015,
"learning_rate": 0.0001983049301026627,
"loss": 1.464,
"step": 605
},
{
"epoch": 0.46,
"grad_norm": 0.2229027944987823,
"learning_rate": 0.00019822310004855652,
"loss": 1.673,
"step": 610
},
{
"epoch": 0.47,
"grad_norm": 0.169635050611861,
"learning_rate": 0.00019813935878299662,
"loss": 1.6593,
"step": 615
},
{
"epoch": 0.47,
"grad_norm": 0.16624303946845476,
"learning_rate": 0.0001980537079354091,
"loss": 1.7164,
"step": 620
},
{
"epoch": 0.47,
"grad_norm": 0.1544208624543807,
"learning_rate": 0.00019796614917237616,
"loss": 1.5616,
"step": 625
},
{
"epoch": 0.48,
"grad_norm": 0.1900272509930039,
"learning_rate": 0.00019787668419760408,
"loss": 1.6552,
"step": 630
},
{
"epoch": 0.48,
"grad_norm": 0.20362594606792483,
"learning_rate": 0.00019778531475188996,
"loss": 1.7175,
"step": 635
},
{
"epoch": 0.49,
"grad_norm": 0.15933464850430776,
"learning_rate": 0.00019769204261308774,
"loss": 1.6599,
"step": 640
},
{
"epoch": 0.49,
"grad_norm": 0.15846354449923994,
"learning_rate": 0.00019759686959607383,
"loss": 1.7152,
"step": 645
},
{
"epoch": 0.49,
"grad_norm": 0.1731064728813603,
"learning_rate": 0.00019749979755271155,
"loss": 1.8006,
"step": 650
},
{
"epoch": 0.5,
"grad_norm": 0.2036118054344575,
"learning_rate": 0.00019740082837181526,
"loss": 1.6992,
"step": 655
},
{
"epoch": 0.5,
"grad_norm": 0.20595935892977982,
"learning_rate": 0.00019729996397911356,
"loss": 1.7571,
"step": 660
},
{
"epoch": 0.5,
"grad_norm": 0.1816167430276872,
"learning_rate": 0.00019719720633721178,
"loss": 1.8058,
"step": 665
},
{
"epoch": 0.51,
"grad_norm": 0.2139611683255453,
"learning_rate": 0.00019709255744555389,
"loss": 1.8398,
"step": 670
},
{
"epoch": 0.51,
"grad_norm": 0.18514013236898805,
"learning_rate": 0.0001969860193403835,
"loss": 1.7307,
"step": 675
},
{
"epoch": 0.52,
"grad_norm": 0.17288244809213096,
"learning_rate": 0.00019687759409470426,
"loss": 1.7242,
"step": 680
},
{
"epoch": 0.52,
"grad_norm": 0.15953349037713735,
"learning_rate": 0.00019676728381823956,
"loss": 1.6435,
"step": 685
},
{
"epoch": 0.52,
"grad_norm": 0.20963390389942183,
"learning_rate": 0.00019665509065739149,
"loss": 1.6791,
"step": 690
},
{
"epoch": 0.53,
"grad_norm": 0.19075148330166494,
"learning_rate": 0.000196541016795199,
"loss": 1.505,
"step": 695
},
{
"epoch": 0.53,
"grad_norm": 0.22817672978454195,
"learning_rate": 0.00019642506445129545,
"loss": 1.8361,
"step": 700
},
{
"epoch": 0.53,
"grad_norm": 0.1925013343867196,
"learning_rate": 0.00019630723588186545,
"loss": 1.7126,
"step": 705
},
{
"epoch": 0.54,
"grad_norm": 0.16780528759294142,
"learning_rate": 0.000196187533379601,
"loss": 1.6649,
"step": 710
},
{
"epoch": 0.54,
"grad_norm": 0.17707927803137202,
"learning_rate": 0.00019606595927365675,
"loss": 1.6551,
"step": 715
},
{
"epoch": 0.55,
"grad_norm": 0.22525846033337887,
"learning_rate": 0.00019594251592960479,
"loss": 1.7401,
"step": 720
},
{
"epoch": 0.55,
"grad_norm": 0.1953310514707257,
"learning_rate": 0.0001958172057493886,
"loss": 1.6944,
"step": 725
},
{
"epoch": 0.55,
"grad_norm": 0.2085121645512001,
"learning_rate": 0.0001956900311712763,
"loss": 1.663,
"step": 730
},
{
"epoch": 0.56,
"grad_norm": 0.17093646250615369,
"learning_rate": 0.0001955609946698131,
"loss": 1.772,
"step": 735
},
{
"epoch": 0.56,
"grad_norm": 0.19564116222725914,
"learning_rate": 0.00019543009875577346,
"loss": 1.6328,
"step": 740
},
{
"epoch": 0.56,
"grad_norm": 0.215195812549034,
"learning_rate": 0.0001952973459761118,
"loss": 1.6438,
"step": 745
},
{
"epoch": 0.57,
"grad_norm": 0.19377558972597342,
"learning_rate": 0.0001951627389139134,
"loss": 1.7442,
"step": 750
},
{
"epoch": 0.57,
"grad_norm": 0.1792011980095539,
"learning_rate": 0.00019502628018834372,
"loss": 1.7518,
"step": 755
},
{
"epoch": 0.58,
"grad_norm": 0.18977603295326154,
"learning_rate": 0.00019488797245459773,
"loss": 1.688,
"step": 760
},
{
"epoch": 0.58,
"grad_norm": 0.19341540153355985,
"learning_rate": 0.00019474781840384816,
"loss": 1.7562,
"step": 765
},
{
"epoch": 0.58,
"grad_norm": 0.14738398424312027,
"learning_rate": 0.00019460582076319302,
"loss": 1.7244,
"step": 770
},
{
"epoch": 0.59,
"grad_norm": 0.1496446026997031,
"learning_rate": 0.00019446198229560276,
"loss": 1.7083,
"step": 775
},
{
"epoch": 0.59,
"grad_norm": 0.2151992641933425,
"learning_rate": 0.00019431630579986632,
"loss": 1.7078,
"step": 780
},
{
"epoch": 0.6,
"grad_norm": 0.1972075447483379,
"learning_rate": 0.00019416879411053673,
"loss": 1.7665,
"step": 785
},
{
"epoch": 0.6,
"grad_norm": 0.20871968848692934,
"learning_rate": 0.00019401945009787594,
"loss": 1.6636,
"step": 790
},
{
"epoch": 0.6,
"grad_norm": 0.2047491094137733,
"learning_rate": 0.0001938682766677991,
"loss": 1.8061,
"step": 795
},
{
"epoch": 0.61,
"grad_norm": 0.1622522396758859,
"learning_rate": 0.00019371527676181777,
"loss": 1.8645,
"step": 800
},
{
"epoch": 0.61,
"grad_norm": 0.1714969472958251,
"learning_rate": 0.00019356045335698296,
"loss": 1.8266,
"step": 805
},
{
"epoch": 0.61,
"grad_norm": 0.1971306915514917,
"learning_rate": 0.00019340380946582695,
"loss": 1.7205,
"step": 810
},
{
"epoch": 0.62,
"grad_norm": 0.17020111323913545,
"learning_rate": 0.00019324534813630487,
"loss": 1.7339,
"step": 815
},
{
"epoch": 0.62,
"grad_norm": 0.18250825908624654,
"learning_rate": 0.00019308507245173527,
"loss": 1.5188,
"step": 820
},
{
"epoch": 0.63,
"grad_norm": 0.18593221945740382,
"learning_rate": 0.0001929229855307402,
"loss": 1.654,
"step": 825
},
{
"epoch": 0.63,
"grad_norm": 0.22850385689556876,
"learning_rate": 0.00019275909052718447,
"loss": 1.7814,
"step": 830
},
{
"epoch": 0.63,
"grad_norm": 0.19759950903326942,
"learning_rate": 0.00019259339063011432,
"loss": 1.744,
"step": 835
},
{
"epoch": 0.64,
"grad_norm": 0.17215997030755548,
"learning_rate": 0.00019242588906369536,
"loss": 1.8283,
"step": 840
},
{
"epoch": 0.64,
"grad_norm": 0.19136317284315416,
"learning_rate": 0.00019225658908714983,
"loss": 1.6163,
"step": 845
},
{
"epoch": 0.64,
"grad_norm": 0.254426335434924,
"learning_rate": 0.00019208549399469318,
"loss": 1.7618,
"step": 850
},
{
"epoch": 0.65,
"grad_norm": 0.21881435842952657,
"learning_rate": 0.00019191260711547001,
"loss": 1.7315,
"step": 855
},
{
"epoch": 0.65,
"grad_norm": 0.20799528199612635,
"learning_rate": 0.0001917379318134892,
"loss": 1.7859,
"step": 860
},
{
"epoch": 0.66,
"grad_norm": 0.17796834357588534,
"learning_rate": 0.00019156147148755855,
"loss": 1.7345,
"step": 865
},
{
"epoch": 0.66,
"grad_norm": 0.1855849493493474,
"learning_rate": 0.0001913832295712186,
"loss": 1.6232,
"step": 870
},
{
"epoch": 0.66,
"grad_norm": 0.20017349406812152,
"learning_rate": 0.00019120320953267586,
"loss": 1.7546,
"step": 875
},
{
"epoch": 0.67,
"grad_norm": 0.2146332192452092,
"learning_rate": 0.0001910214148747352,
"loss": 1.6231,
"step": 880
},
{
"epoch": 0.67,
"grad_norm": 0.15896122862532144,
"learning_rate": 0.0001908378491347319,
"loss": 1.5535,
"step": 885
},
{
"epoch": 0.67,
"grad_norm": 0.17416174476856394,
"learning_rate": 0.00019065251588446265,
"loss": 1.6337,
"step": 890
},
{
"epoch": 0.68,
"grad_norm": 0.23962933659259386,
"learning_rate": 0.0001904654187301161,
"loss": 1.8581,
"step": 895
},
{
"epoch": 0.68,
"grad_norm": 0.17002626630746845,
"learning_rate": 0.0001902765613122028,
"loss": 1.6537,
"step": 900
},
{
"epoch": 0.69,
"grad_norm": 0.23553588791103638,
"learning_rate": 0.0001900859473054841,
"loss": 1.7497,
"step": 905
},
{
"epoch": 0.69,
"grad_norm": 0.17184692025828147,
"learning_rate": 0.00018989358041890094,
"loss": 1.6305,
"step": 910
},
{
"epoch": 0.69,
"grad_norm": 0.19276600455036005,
"learning_rate": 0.00018969946439550148,
"loss": 1.6965,
"step": 915
},
{
"epoch": 0.7,
"grad_norm": 0.2266174277702017,
"learning_rate": 0.0001895036030123684,
"loss": 1.7845,
"step": 920
},
{
"epoch": 0.7,
"grad_norm": 0.15948175213103422,
"learning_rate": 0.0001893060000805453,
"loss": 1.582,
"step": 925
},
{
"epoch": 0.71,
"grad_norm": 0.20415510379076665,
"learning_rate": 0.00018910665944496264,
"loss": 1.6576,
"step": 930
},
{
"epoch": 0.71,
"grad_norm": 0.20826411615417578,
"learning_rate": 0.00018890558498436282,
"loss": 1.7243,
"step": 935
},
{
"epoch": 0.71,
"grad_norm": 0.2535182644413305,
"learning_rate": 0.00018870278061122484,
"loss": 1.5795,
"step": 940
},
{
"epoch": 0.72,
"grad_norm": 0.17063517897800512,
"learning_rate": 0.00018849825027168803,
"loss": 1.6361,
"step": 945
},
{
"epoch": 0.72,
"grad_norm": 0.1898841623155248,
"learning_rate": 0.00018829199794547535,
"loss": 1.7526,
"step": 950
},
{
"epoch": 0.72,
"grad_norm": 0.19033639448531828,
"learning_rate": 0.00018808402764581596,
"loss": 1.5943,
"step": 955
},
{
"epoch": 0.73,
"grad_norm": 0.1647576077524525,
"learning_rate": 0.0001878743434193671,
"loss": 1.7575,
"step": 960
},
{
"epoch": 0.73,
"grad_norm": 0.2070226518164384,
"learning_rate": 0.00018766294934613535,
"loss": 1.741,
"step": 965
},
{
"epoch": 0.74,
"grad_norm": 0.21633633400820462,
"learning_rate": 0.00018744984953939726,
"loss": 1.6967,
"step": 970
},
{
"epoch": 0.74,
"grad_norm": 0.2039504411965307,
"learning_rate": 0.0001872350481456193,
"loss": 1.6825,
"step": 975
},
{
"epoch": 0.74,
"grad_norm": 0.19382809212719235,
"learning_rate": 0.0001870185493443772,
"loss": 1.7494,
"step": 980
},
{
"epoch": 0.75,
"grad_norm": 0.17836311560595738,
"learning_rate": 0.0001868003573482746,
"loss": 1.6326,
"step": 985
},
{
"epoch": 0.75,
"grad_norm": 0.18940985276826594,
"learning_rate": 0.0001865804764028611,
"loss": 1.6823,
"step": 990
},
{
"epoch": 0.75,
"grad_norm": 0.15827883706638377,
"learning_rate": 0.0001863589107865496,
"loss": 1.8507,
"step": 995
},
{
"epoch": 0.76,
"grad_norm": 0.2024112582787964,
"learning_rate": 0.00018613566481053315,
"loss": 1.6737,
"step": 1000
},
{
"epoch": 0.76,
"grad_norm": 0.18631332115379975,
"learning_rate": 0.00018591074281870099,
"loss": 1.6391,
"step": 1005
},
{
"epoch": 0.77,
"grad_norm": 0.2322177223268837,
"learning_rate": 0.00018568414918755397,
"loss": 1.7185,
"step": 1010
},
{
"epoch": 0.77,
"grad_norm": 0.19585063603806546,
"learning_rate": 0.00018545588832611956,
"loss": 1.8829,
"step": 1015
},
{
"epoch": 0.77,
"grad_norm": 0.19046268057109556,
"learning_rate": 0.00018522596467586598,
"loss": 1.6889,
"step": 1020
},
{
"epoch": 0.78,
"grad_norm": 0.2319521660184869,
"learning_rate": 0.00018499438271061568,
"loss": 1.7148,
"step": 1025
},
{
"epoch": 0.78,
"grad_norm": 0.18401426887501984,
"learning_rate": 0.0001847611469364584,
"loss": 1.6355,
"step": 1030
},
{
"epoch": 0.78,
"grad_norm": 0.16467673844089234,
"learning_rate": 0.00018452626189166345,
"loss": 1.5748,
"step": 1035
},
{
"epoch": 0.79,
"grad_norm": 0.21515271715044545,
"learning_rate": 0.0001842897321465915,
"loss": 1.7172,
"step": 1040
},
{
"epoch": 0.79,
"grad_norm": 0.20010536585072475,
"learning_rate": 0.0001840515623036055,
"loss": 1.7331,
"step": 1045
},
{
"epoch": 0.8,
"grad_norm": 0.15220183718369495,
"learning_rate": 0.0001838117569969812,
"loss": 1.7703,
"step": 1050
},
{
"epoch": 0.8,
"grad_norm": 0.19249950248721495,
"learning_rate": 0.00018357032089281702,
"loss": 1.7356,
"step": 1055
},
{
"epoch": 0.8,
"grad_norm": 0.15685889188495356,
"learning_rate": 0.00018332725868894313,
"loss": 1.5789,
"step": 1060
},
{
"epoch": 0.81,
"grad_norm": 0.22123166856945198,
"learning_rate": 0.00018308257511483018,
"loss": 1.7449,
"step": 1065
},
{
"epoch": 0.81,
"grad_norm": 0.21921162541787237,
"learning_rate": 0.00018283627493149721,
"loss": 1.592,
"step": 1070
},
{
"epoch": 0.82,
"grad_norm": 0.15892072068340937,
"learning_rate": 0.00018258836293141907,
"loss": 1.6588,
"step": 1075
},
{
"epoch": 0.82,
"grad_norm": 0.2129268440643301,
"learning_rate": 0.000182338843938433,
"loss": 1.6687,
"step": 1080
},
{
"epoch": 0.82,
"grad_norm": 0.18558886049158316,
"learning_rate": 0.000182087722807645,
"loss": 1.6204,
"step": 1085
},
{
"epoch": 0.83,
"grad_norm": 0.21759739469279235,
"learning_rate": 0.00018183500442533514,
"loss": 1.7012,
"step": 1090
},
{
"epoch": 0.83,
"grad_norm": 0.16739812153050462,
"learning_rate": 0.00018158069370886266,
"loss": 1.7749,
"step": 1095
},
{
"epoch": 0.83,
"grad_norm": 0.2120028175464506,
"learning_rate": 0.0001813247956065702,
"loss": 1.7076,
"step": 1100
},
{
"epoch": 0.84,
"grad_norm": 0.21506301830058508,
"learning_rate": 0.00018106731509768753,
"loss": 1.6561,
"step": 1105
},
{
"epoch": 0.84,
"grad_norm": 0.21374007008875692,
"learning_rate": 0.00018080825719223468,
"loss": 1.7721,
"step": 1110
},
{
"epoch": 0.85,
"grad_norm": 0.21473556112453085,
"learning_rate": 0.00018054762693092444,
"loss": 1.5391,
"step": 1115
},
{
"epoch": 0.85,
"grad_norm": 0.1928094864305794,
"learning_rate": 0.00018028542938506426,
"loss": 1.7297,
"step": 1120
},
{
"epoch": 0.85,
"grad_norm": 0.22195616352181186,
"learning_rate": 0.0001800216696564576,
"loss": 1.6239,
"step": 1125
},
{
"epoch": 0.86,
"grad_norm": 0.2493704349381919,
"learning_rate": 0.00017975635287730473,
"loss": 1.7736,
"step": 1130
},
{
"epoch": 0.86,
"grad_norm": 0.1871166430212898,
"learning_rate": 0.00017948948421010264,
"loss": 1.67,
"step": 1135
},
{
"epoch": 0.86,
"grad_norm": 0.16460126336549072,
"learning_rate": 0.00017922106884754488,
"loss": 1.7331,
"step": 1140
},
{
"epoch": 0.87,
"grad_norm": 0.18707990225784327,
"learning_rate": 0.0001789511120124203,
"loss": 1.5608,
"step": 1145
},
{
"epoch": 0.87,
"grad_norm": 0.21751770239029078,
"learning_rate": 0.00017867961895751163,
"loss": 1.721,
"step": 1150
},
{
"epoch": 0.88,
"grad_norm": 0.1674742118307801,
"learning_rate": 0.00017840659496549298,
"loss": 1.7339,
"step": 1155
},
{
"epoch": 0.88,
"grad_norm": 0.19173527103482793,
"learning_rate": 0.00017813204534882738,
"loss": 1.7348,
"step": 1160
},
{
"epoch": 0.88,
"grad_norm": 0.18468049289167895,
"learning_rate": 0.0001778559754496631,
"loss": 1.6823,
"step": 1165
},
{
"epoch": 0.89,
"grad_norm": 0.2069730744593729,
"learning_rate": 0.00017757839063972997,
"loss": 1.8253,
"step": 1170
},
{
"epoch": 0.89,
"grad_norm": 0.2139312404074137,
"learning_rate": 0.00017729929632023472,
"loss": 1.7013,
"step": 1175
},
{
"epoch": 0.89,
"grad_norm": 0.1764736094502213,
"learning_rate": 0.00017701869792175593,
"loss": 1.8235,
"step": 1180
},
{
"epoch": 0.9,
"grad_norm": 0.21944309103277923,
"learning_rate": 0.00017673660090413823,
"loss": 1.8237,
"step": 1185
},
{
"epoch": 0.9,
"grad_norm": 0.20268987883171422,
"learning_rate": 0.00017645301075638634,
"loss": 1.6992,
"step": 1190
},
{
"epoch": 0.91,
"grad_norm": 0.19400968339090352,
"learning_rate": 0.00017616793299655794,
"loss": 1.8662,
"step": 1195
},
{
"epoch": 0.91,
"grad_norm": 0.18489832863809344,
"learning_rate": 0.00017588137317165657,
"loss": 1.6986,
"step": 1200
},
{
"epoch": 0.91,
"grad_norm": 0.17738333103257395,
"learning_rate": 0.0001755933368575235,
"loss": 1.6783,
"step": 1205
},
{
"epoch": 0.92,
"grad_norm": 0.17926192606119037,
"learning_rate": 0.0001753038296587294,
"loss": 1.7627,
"step": 1210
},
{
"epoch": 0.92,
"grad_norm": 0.20194075183870522,
"learning_rate": 0.00017501285720846523,
"loss": 1.7846,
"step": 1215
},
{
"epoch": 0.92,
"grad_norm": 0.19331786071311133,
"learning_rate": 0.0001747204251684325,
"loss": 1.7143,
"step": 1220
},
{
"epoch": 0.93,
"grad_norm": 0.23530188097310437,
"learning_rate": 0.00017442653922873327,
"loss": 1.7296,
"step": 1225
},
{
"epoch": 0.93,
"grad_norm": 0.17594594764152405,
"learning_rate": 0.0001741312051077594,
"loss": 1.7335,
"step": 1230
},
{
"epoch": 0.94,
"grad_norm": 0.20934249136020208,
"learning_rate": 0.00017383442855208124,
"loss": 1.6646,
"step": 1235
},
{
"epoch": 0.94,
"grad_norm": 0.2111005617028846,
"learning_rate": 0.00017353621533633583,
"loss": 1.5756,
"step": 1240
},
{
"epoch": 0.94,
"grad_norm": 0.21413727626671644,
"learning_rate": 0.00017323657126311454,
"loss": 1.4917,
"step": 1245
},
{
"epoch": 0.95,
"grad_norm": 0.2391299536210697,
"learning_rate": 0.0001729355021628502,
"loss": 1.7283,
"step": 1250
},
{
"epoch": 0.95,
"grad_norm": 0.19381232926045663,
"learning_rate": 0.00017263301389370362,
"loss": 1.7907,
"step": 1255
},
{
"epoch": 0.96,
"grad_norm": 0.21223075585900172,
"learning_rate": 0.0001723291123414495,
"loss": 1.7412,
"step": 1260
},
{
"epoch": 0.96,
"grad_norm": 0.18560634331207926,
"learning_rate": 0.00017202380341936212,
"loss": 1.7287,
"step": 1265
},
{
"epoch": 0.96,
"grad_norm": 0.18941317978765862,
"learning_rate": 0.00017171709306810012,
"loss": 1.5956,
"step": 1270
},
{
"epoch": 0.97,
"grad_norm": 0.17108900888850623,
"learning_rate": 0.000171408987255591,
"loss": 1.7789,
"step": 1275
},
{
"epoch": 0.97,
"grad_norm": 0.19233373904164977,
"learning_rate": 0.00017109949197691485,
"loss": 1.7397,
"step": 1280
},
{
"epoch": 0.97,
"grad_norm": 0.1697480170006848,
"learning_rate": 0.00017078861325418797,
"loss": 1.5765,
"step": 1285
},
{
"epoch": 0.98,
"grad_norm": 0.17575403691888572,
"learning_rate": 0.00017047635713644528,
"loss": 1.8137,
"step": 1290
},
{
"epoch": 0.98,
"grad_norm": 0.19700603487956356,
"learning_rate": 0.00017016272969952304,
"loss": 1.8248,
"step": 1295
},
{
"epoch": 0.99,
"grad_norm": 0.25577968967800774,
"learning_rate": 0.0001698477370459405,
"loss": 1.5227,
"step": 1300
},
{
"epoch": 0.99,
"grad_norm": 0.21182493743068362,
"learning_rate": 0.00016953138530478092,
"loss": 1.6463,
"step": 1305
},
{
"epoch": 0.99,
"grad_norm": 0.24187008234174068,
"learning_rate": 0.0001692136806315726,
"loss": 1.677,
"step": 1310
},
{
"epoch": 1.0,
"grad_norm": 0.23079613874981772,
"learning_rate": 0.00016889462920816902,
"loss": 1.6987,
"step": 1315
},
{
"epoch": 1.0,
"grad_norm": 0.18959747421576906,
"learning_rate": 0.00016857423724262849,
"loss": 1.6143,
"step": 1320
},
{
"epoch": 1.0,
"grad_norm": 0.19193767521915664,
"learning_rate": 0.00016825251096909343,
"loss": 1.6523,
"step": 1325
},
{
"epoch": 1.01,
"grad_norm": 0.1851789336505185,
"learning_rate": 0.00016792945664766907,
"loss": 1.5728,
"step": 1330
},
{
"epoch": 1.01,
"grad_norm": 0.14492627661875204,
"learning_rate": 0.00016760508056430152,
"loss": 1.5701,
"step": 1335
},
{
"epoch": 1.02,
"grad_norm": 0.2700845196747031,
"learning_rate": 0.0001672793890306556,
"loss": 1.8245,
"step": 1340
},
{
"epoch": 1.02,
"grad_norm": 0.1983440671335701,
"learning_rate": 0.00016695238838399206,
"loss": 1.7108,
"step": 1345
},
{
"epoch": 1.02,
"grad_norm": 0.17701113866794518,
"learning_rate": 0.0001666240849870441,
"loss": 1.5517,
"step": 1350
},
{
"epoch": 1.03,
"grad_norm": 0.16944238367848802,
"learning_rate": 0.0001662944852278936,
"loss": 1.7263,
"step": 1355
},
{
"epoch": 1.03,
"grad_norm": 0.20201061964568917,
"learning_rate": 0.00016596359551984704,
"loss": 1.6212,
"step": 1360
},
{
"epoch": 1.03,
"grad_norm": 0.16272112017898177,
"learning_rate": 0.0001656314223013104,
"loss": 1.6557,
"step": 1365
},
{
"epoch": 1.04,
"grad_norm": 0.2050184080142653,
"learning_rate": 0.00016529797203566405,
"loss": 1.6203,
"step": 1370
},
{
"epoch": 1.04,
"grad_norm": 0.18868029622446703,
"learning_rate": 0.00016496325121113706,
"loss": 1.5994,
"step": 1375
},
{
"epoch": 1.05,
"grad_norm": 0.18530725289838731,
"learning_rate": 0.00016462726634068075,
"loss": 1.661,
"step": 1380
},
{
"epoch": 1.05,
"grad_norm": 0.22254932266214475,
"learning_rate": 0.00016429002396184215,
"loss": 1.5779,
"step": 1385
},
{
"epoch": 1.05,
"grad_norm": 0.35454879952816054,
"learning_rate": 0.00016395153063663667,
"loss": 1.4926,
"step": 1390
},
{
"epoch": 1.06,
"grad_norm": 0.2083539962991777,
"learning_rate": 0.00016361179295142046,
"loss": 1.668,
"step": 1395
},
{
"epoch": 1.06,
"grad_norm": 0.20105783428150303,
"learning_rate": 0.00016327081751676227,
"loss": 1.7475,
"step": 1400
},
{
"epoch": 1.07,
"grad_norm": 0.19073307130103012,
"learning_rate": 0.0001629286109673148,
"loss": 1.6726,
"step": 1405
},
{
"epoch": 1.07,
"grad_norm": 0.21132776602726958,
"learning_rate": 0.00016258517996168564,
"loss": 1.745,
"step": 1410
},
{
"epoch": 1.07,
"grad_norm": 0.23336177448110548,
"learning_rate": 0.0001622405311823076,
"loss": 1.7185,
"step": 1415
},
{
"epoch": 1.08,
"grad_norm": 0.19045792239686193,
"learning_rate": 0.00016189467133530884,
"loss": 1.6369,
"step": 1420
},
{
"epoch": 1.08,
"grad_norm": 0.1470674402518224,
"learning_rate": 0.0001615476071503823,
"loss": 1.6593,
"step": 1425
},
{
"epoch": 1.08,
"grad_norm": 0.1895764202504411,
"learning_rate": 0.0001611993453806547,
"loss": 1.5879,
"step": 1430
},
{
"epoch": 1.09,
"grad_norm": 0.21781610564885606,
"learning_rate": 0.0001608498928025553,
"loss": 1.6377,
"step": 1435
},
{
"epoch": 1.09,
"grad_norm": 0.21082770226036582,
"learning_rate": 0.00016049925621568382,
"loss": 1.5626,
"step": 1440
},
{
"epoch": 1.1,
"grad_norm": 0.2288377931408156,
"learning_rate": 0.00016014744244267833,
"loss": 1.7531,
"step": 1445
},
{
"epoch": 1.1,
"grad_norm": 0.1822265551052057,
"learning_rate": 0.00015979445832908242,
"loss": 1.691,
"step": 1450
},
{
"epoch": 1.1,
"grad_norm": 0.23100268355259115,
"learning_rate": 0.00015944031074321204,
"loss": 1.7622,
"step": 1455
},
{
"epoch": 1.11,
"grad_norm": 0.18624779842903288,
"learning_rate": 0.00015908500657602174,
"loss": 1.5919,
"step": 1460
},
{
"epoch": 1.11,
"grad_norm": 0.20357926913176824,
"learning_rate": 0.0001587285527409707,
"loss": 1.6288,
"step": 1465
},
{
"epoch": 1.11,
"grad_norm": 0.20919686630022472,
"learning_rate": 0.00015837095617388827,
"loss": 1.6705,
"step": 1470
},
{
"epoch": 1.12,
"grad_norm": 0.1993582841062667,
"learning_rate": 0.0001580122238328387,
"loss": 1.6516,
"step": 1475
},
{
"epoch": 1.12,
"grad_norm": 0.2547942602076731,
"learning_rate": 0.00015765236269798627,
"loss": 1.5036,
"step": 1480
},
{
"epoch": 1.13,
"grad_norm": 0.1807424509361345,
"learning_rate": 0.00015729137977145893,
"loss": 1.6089,
"step": 1485
},
{
"epoch": 1.13,
"grad_norm": 0.18264437292208377,
"learning_rate": 0.0001569292820772124,
"loss": 1.7353,
"step": 1490
},
{
"epoch": 1.13,
"grad_norm": 0.21311009253554458,
"learning_rate": 0.00015656607666089334,
"loss": 1.6574,
"step": 1495
},
{
"epoch": 1.14,
"grad_norm": 0.18453680363642788,
"learning_rate": 0.0001562017705897024,
"loss": 1.5736,
"step": 1500
},
{
"epoch": 1.14,
"grad_norm": 0.23644760312940358,
"learning_rate": 0.00015583637095225656,
"loss": 1.7076,
"step": 1505
},
{
"epoch": 1.14,
"grad_norm": 0.19899933139163767,
"learning_rate": 0.00015546988485845125,
"loss": 1.665,
"step": 1510
},
{
"epoch": 1.15,
"grad_norm": 0.23202505382527974,
"learning_rate": 0.0001551023194393221,
"loss": 1.7191,
"step": 1515
},
{
"epoch": 1.15,
"grad_norm": 0.21073033640879407,
"learning_rate": 0.00015473368184690597,
"loss": 1.6123,
"step": 1520
},
{
"epoch": 1.16,
"grad_norm": 0.22019036120363472,
"learning_rate": 0.00015436397925410201,
"loss": 1.6909,
"step": 1525
},
{
"epoch": 1.16,
"grad_norm": 0.20817813902248655,
"learning_rate": 0.00015399321885453202,
"loss": 1.7648,
"step": 1530
},
{
"epoch": 1.16,
"grad_norm": 0.21714232280510767,
"learning_rate": 0.00015362140786240035,
"loss": 1.6718,
"step": 1535
},
{
"epoch": 1.17,
"grad_norm": 0.20478633851375716,
"learning_rate": 0.00015324855351235372,
"loss": 1.7586,
"step": 1540
},
{
"epoch": 1.17,
"grad_norm": 0.19046880552839732,
"learning_rate": 0.00015287466305934037,
"loss": 1.695,
"step": 1545
},
{
"epoch": 1.18,
"grad_norm": 0.23309832393442634,
"learning_rate": 0.0001524997437784689,
"loss": 1.584,
"step": 1550
},
{
"epoch": 1.18,
"grad_norm": 0.23887396172176847,
"learning_rate": 0.00015212380296486652,
"loss": 1.5742,
"step": 1555
},
{
"epoch": 1.18,
"grad_norm": 0.18128179052277552,
"learning_rate": 0.0001517468479335376,
"loss": 1.6802,
"step": 1560
},
{
"epoch": 1.19,
"grad_norm": 0.22086322507654135,
"learning_rate": 0.00015136888601922072,
"loss": 1.7222,
"step": 1565
},
{
"epoch": 1.19,
"grad_norm": 0.18870219517815454,
"learning_rate": 0.0001509899245762464,
"loss": 1.5664,
"step": 1570
},
{
"epoch": 1.19,
"grad_norm": 0.2276718864826248,
"learning_rate": 0.00015060997097839386,
"loss": 1.7565,
"step": 1575
},
{
"epoch": 1.2,
"grad_norm": 0.20329327239158157,
"learning_rate": 0.00015022903261874748,
"loss": 1.6774,
"step": 1580
},
{
"epoch": 1.2,
"grad_norm": 0.18898688137814482,
"learning_rate": 0.00014984711690955297,
"loss": 1.6518,
"step": 1585
},
{
"epoch": 1.21,
"grad_norm": 0.22865474882055875,
"learning_rate": 0.00014946423128207322,
"loss": 1.7247,
"step": 1590
},
{
"epoch": 1.21,
"grad_norm": 0.21027592834116465,
"learning_rate": 0.00014908038318644373,
"loss": 1.7849,
"step": 1595
},
{
"epoch": 1.21,
"grad_norm": 0.20948671991840284,
"learning_rate": 0.0001486955800915274,
"loss": 1.5386,
"step": 1600
},
{
"epoch": 1.22,
"grad_norm": 0.21227729763658884,
"learning_rate": 0.0001483098294847695,
"loss": 1.602,
"step": 1605
},
{
"epoch": 1.22,
"grad_norm": 0.21630672435558576,
"learning_rate": 0.00014792313887205182,
"loss": 1.6772,
"step": 1610
},
{
"epoch": 1.22,
"grad_norm": 0.21541507503873228,
"learning_rate": 0.00014753551577754664,
"loss": 1.6862,
"step": 1615
},
{
"epoch": 1.23,
"grad_norm": 0.2480903001762983,
"learning_rate": 0.0001471469677435704,
"loss": 1.5916,
"step": 1620
},
{
"epoch": 1.23,
"grad_norm": 0.20716645798924263,
"learning_rate": 0.00014675750233043679,
"loss": 1.7072,
"step": 1625
},
{
"epoch": 1.24,
"grad_norm": 0.22397565488829696,
"learning_rate": 0.00014636712711630978,
"loss": 1.6036,
"step": 1630
},
{
"epoch": 1.24,
"grad_norm": 0.19584834615434676,
"learning_rate": 0.00014597584969705616,
"loss": 1.6366,
"step": 1635
},
{
"epoch": 1.24,
"grad_norm": 0.22273274810197669,
"learning_rate": 0.00014558367768609766,
"loss": 1.6545,
"step": 1640
},
{
"epoch": 1.25,
"grad_norm": 0.30141032612570196,
"learning_rate": 0.00014519061871426286,
"loss": 1.6668,
"step": 1645
},
{
"epoch": 1.25,
"grad_norm": 0.2508746414625482,
"learning_rate": 0.0001447966804296387,
"loss": 1.5583,
"step": 1650
},
{
"epoch": 1.25,
"grad_norm": 0.2656543660091513,
"learning_rate": 0.00014440187049742165,
"loss": 1.6114,
"step": 1655
},
{
"epoch": 1.26,
"grad_norm": 0.22762072721537044,
"learning_rate": 0.00014400619659976863,
"loss": 1.5218,
"step": 1660
},
{
"epoch": 1.26,
"grad_norm": 0.21625802298436558,
"learning_rate": 0.00014360966643564747,
"loss": 1.6282,
"step": 1665
},
{
"epoch": 1.27,
"grad_norm": 0.18758356388629857,
"learning_rate": 0.00014321228772068702,
"loss": 1.5724,
"step": 1670
},
{
"epoch": 1.27,
"grad_norm": 0.22894089207752852,
"learning_rate": 0.0001428140681870272,
"loss": 1.5875,
"step": 1675
},
{
"epoch": 1.27,
"grad_norm": 0.25952806547918694,
"learning_rate": 0.0001424150155831685,
"loss": 1.6728,
"step": 1680
},
{
"epoch": 1.28,
"grad_norm": 0.3304544222948505,
"learning_rate": 0.00014201513767382108,
"loss": 1.6944,
"step": 1685
},
{
"epoch": 1.28,
"grad_norm": 0.21745874371742022,
"learning_rate": 0.00014161444223975383,
"loss": 1.5649,
"step": 1690
},
{
"epoch": 1.29,
"grad_norm": 0.18668861489627886,
"learning_rate": 0.0001412129370776429,
"loss": 1.6646,
"step": 1695
},
{
"epoch": 1.29,
"grad_norm": 0.2514658628873574,
"learning_rate": 0.00014081062999992005,
"loss": 1.6427,
"step": 1700
},
{
"epoch": 1.29,
"grad_norm": 0.23075565689636676,
"learning_rate": 0.0001404075288346206,
"loss": 1.7089,
"step": 1705
},
{
"epoch": 1.3,
"grad_norm": 0.2005453142298327,
"learning_rate": 0.00014000364142523103,
"loss": 1.7236,
"step": 1710
},
{
"epoch": 1.3,
"grad_norm": 0.21925735664261978,
"learning_rate": 0.00013959897563053662,
"loss": 1.7193,
"step": 1715
},
{
"epoch": 1.3,
"grad_norm": 0.22755950679993744,
"learning_rate": 0.00013919353932446822,
"loss": 1.6178,
"step": 1720
},
{
"epoch": 1.31,
"grad_norm": 0.24575725474371382,
"learning_rate": 0.0001387873403959492,
"loss": 1.6914,
"step": 1725
},
{
"epoch": 1.31,
"grad_norm": 0.22868287217989744,
"learning_rate": 0.00013838038674874193,
"loss": 1.6021,
"step": 1730
},
{
"epoch": 1.32,
"grad_norm": 0.21889496061933156,
"learning_rate": 0.00013797268630129413,
"loss": 1.8092,
"step": 1735
},
{
"epoch": 1.32,
"grad_norm": 0.19238702480865116,
"learning_rate": 0.0001375642469865844,
"loss": 1.54,
"step": 1740
},
{
"epoch": 1.32,
"grad_norm": 0.24437133183257548,
"learning_rate": 0.00013715507675196836,
"loss": 1.5477,
"step": 1745
},
{
"epoch": 1.33,
"grad_norm": 0.21331661362588805,
"learning_rate": 0.0001367451835590237,
"loss": 1.6229,
"step": 1750
},
{
"epoch": 1.33,
"grad_norm": 0.22934227073111574,
"learning_rate": 0.00013633457538339514,
"loss": 1.7056,
"step": 1755
},
{
"epoch": 1.33,
"grad_norm": 0.21991726124527775,
"learning_rate": 0.00013592326021463977,
"loss": 1.7322,
"step": 1760
},
{
"epoch": 1.34,
"grad_norm": 0.2279246851535844,
"learning_rate": 0.00013551124605607097,
"loss": 1.5663,
"step": 1765
},
{
"epoch": 1.34,
"grad_norm": 0.21252716182463233,
"learning_rate": 0.00013509854092460312,
"loss": 1.6308,
"step": 1770
},
{
"epoch": 1.35,
"grad_norm": 0.19276878334978295,
"learning_rate": 0.0001346851528505954,
"loss": 1.629,
"step": 1775
},
{
"epoch": 1.35,
"grad_norm": 0.20349606898831232,
"learning_rate": 0.00013427108987769566,
"loss": 1.6323,
"step": 1780
},
{
"epoch": 1.35,
"grad_norm": 0.280403908850998,
"learning_rate": 0.00013385636006268368,
"loss": 1.5647,
"step": 1785
},
{
"epoch": 1.36,
"grad_norm": 0.204649437629767,
"learning_rate": 0.00013344097147531469,
"loss": 1.6706,
"step": 1790
},
{
"epoch": 1.36,
"grad_norm": 0.2355526525352747,
"learning_rate": 0.00013302493219816223,
"loss": 1.6661,
"step": 1795
},
{
"epoch": 1.36,
"grad_norm": 0.23955342033240548,
"learning_rate": 0.00013260825032646083,
"loss": 1.7684,
"step": 1800
},
{
"epoch": 1.37,
"grad_norm": 0.1782918443154143,
"learning_rate": 0.00013219093396794852,
"loss": 1.7357,
"step": 1805
},
{
"epoch": 1.37,
"grad_norm": 0.20676511108669285,
"learning_rate": 0.00013177299124270911,
"loss": 1.7935,
"step": 1810
},
{
"epoch": 1.38,
"grad_norm": 0.24468072304122832,
"learning_rate": 0.0001313544302830142,
"loss": 1.6357,
"step": 1815
},
{
"epoch": 1.38,
"grad_norm": 0.3442798924803141,
"learning_rate": 0.00013093525923316482,
"loss": 1.7283,
"step": 1820
},
{
"epoch": 1.38,
"grad_norm": 0.18543047699982895,
"learning_rate": 0.00013051548624933314,
"loss": 1.6756,
"step": 1825
},
{
"epoch": 1.39,
"grad_norm": 0.18961104598393633,
"learning_rate": 0.00013009511949940358,
"loss": 1.6258,
"step": 1830
},
{
"epoch": 1.39,
"grad_norm": 0.23772840081980506,
"learning_rate": 0.00012967416716281414,
"loss": 1.6197,
"step": 1835
},
{
"epoch": 1.39,
"grad_norm": 0.20599306112898513,
"learning_rate": 0.00012925263743039693,
"loss": 1.6155,
"step": 1840
},
{
"epoch": 1.4,
"grad_norm": 0.17872981947947883,
"learning_rate": 0.00012883053850421897,
"loss": 1.817,
"step": 1845
},
{
"epoch": 1.4,
"grad_norm": 0.21082979842365093,
"learning_rate": 0.00012840787859742266,
"loss": 1.7045,
"step": 1850
},
{
"epoch": 1.41,
"grad_norm": 0.21065592453908275,
"learning_rate": 0.00012798466593406583,
"loss": 1.5825,
"step": 1855
},
{
"epoch": 1.41,
"grad_norm": 0.21798103821826761,
"learning_rate": 0.00012756090874896172,
"loss": 1.7622,
"step": 1860
},
{
"epoch": 1.41,
"grad_norm": 0.22916268453103483,
"learning_rate": 0.00012713661528751888,
"loss": 1.5324,
"step": 1865
},
{
"epoch": 1.42,
"grad_norm": 0.2668875410933402,
"learning_rate": 0.00012671179380558062,
"loss": 1.647,
"step": 1870
},
{
"epoch": 1.42,
"grad_norm": 0.19627830855058848,
"learning_rate": 0.00012628645256926438,
"loss": 1.5994,
"step": 1875
},
{
"epoch": 1.43,
"grad_norm": 0.21241423084048555,
"learning_rate": 0.0001258605998548009,
"loss": 1.622,
"step": 1880
},
{
"epoch": 1.43,
"grad_norm": 0.2546778643093178,
"learning_rate": 0.0001254342439483733,
"loss": 1.6916,
"step": 1885
},
{
"epoch": 1.43,
"grad_norm": 0.20610950008732792,
"learning_rate": 0.00012500739314595563,
"loss": 1.7455,
"step": 1890
},
{
"epoch": 1.44,
"grad_norm": 0.2219569529434739,
"learning_rate": 0.00012458005575315147,
"loss": 1.6683,
"step": 1895
},
{
"epoch": 1.44,
"grad_norm": 0.20787095642170883,
"learning_rate": 0.0001241522400850327,
"loss": 1.6202,
"step": 1900
},
{
"epoch": 1.44,
"grad_norm": 0.2275845179745845,
"learning_rate": 0.0001237239544659771,
"loss": 1.8088,
"step": 1905
},
{
"epoch": 1.45,
"grad_norm": 0.24655110446766015,
"learning_rate": 0.0001232952072295069,
"loss": 1.5618,
"step": 1910
},
{
"epoch": 1.45,
"grad_norm": 0.23084716022254811,
"learning_rate": 0.0001228660067181263,
"loss": 1.7204,
"step": 1915
},
{
"epoch": 1.46,
"grad_norm": 0.2420965499906573,
"learning_rate": 0.00012243636128315939,
"loss": 1.5581,
"step": 1920
},
{
"epoch": 1.46,
"grad_norm": 0.25054116126933823,
"learning_rate": 0.0001220062792845873,
"loss": 1.5808,
"step": 1925
},
{
"epoch": 1.46,
"grad_norm": 0.24876893838844386,
"learning_rate": 0.00012157576909088599,
"loss": 1.6291,
"step": 1930
},
{
"epoch": 1.47,
"grad_norm": 0.22724411732153027,
"learning_rate": 0.00012114483907886308,
"loss": 1.7218,
"step": 1935
},
{
"epoch": 1.47,
"grad_norm": 0.23781633823944948,
"learning_rate": 0.00012071349763349484,
"loss": 1.6696,
"step": 1940
},
{
"epoch": 1.47,
"grad_norm": 0.2611267676195103,
"learning_rate": 0.00012028175314776344,
"loss": 1.7099,
"step": 1945
},
{
"epoch": 1.48,
"grad_norm": 0.25342034309056527,
"learning_rate": 0.00011984961402249311,
"loss": 1.6931,
"step": 1950
},
{
"epoch": 1.48,
"grad_norm": 0.20391686876564638,
"learning_rate": 0.00011941708866618697,
"loss": 1.7043,
"step": 1955
},
{
"epoch": 1.49,
"grad_norm": 0.2005457898894919,
"learning_rate": 0.0001189841854948634,
"loss": 1.5758,
"step": 1960
},
{
"epoch": 1.49,
"grad_norm": 0.19157508121631642,
"learning_rate": 0.00011855091293189234,
"loss": 1.5831,
"step": 1965
},
{
"epoch": 1.49,
"grad_norm": 0.23409302527114853,
"learning_rate": 0.00011811727940783108,
"loss": 1.6668,
"step": 1970
},
{
"epoch": 1.5,
"grad_norm": 0.19820344277697435,
"learning_rate": 0.00011768329336026062,
"loss": 1.6894,
"step": 1975
},
{
"epoch": 1.5,
"grad_norm": 0.23641920754497897,
"learning_rate": 0.0001172489632336213,
"loss": 1.8362,
"step": 1980
},
{
"epoch": 1.5,
"grad_norm": 0.20503090615743924,
"learning_rate": 0.00011681429747904842,
"loss": 1.6885,
"step": 1985
},
{
"epoch": 1.51,
"grad_norm": 0.18474233550647523,
"learning_rate": 0.00011637930455420798,
"loss": 1.7196,
"step": 1990
},
{
"epoch": 1.51,
"grad_norm": 0.2775657036754379,
"learning_rate": 0.00011594399292313192,
"loss": 1.7362,
"step": 1995
},
{
"epoch": 1.52,
"grad_norm": 0.23760102898739513,
"learning_rate": 0.00011550837105605354,
"loss": 1.5986,
"step": 2000
},
{
"epoch": 1.52,
"grad_norm": 0.18850041877265183,
"learning_rate": 0.00011507244742924274,
"loss": 1.7116,
"step": 2005
},
{
"epoch": 1.52,
"grad_norm": 0.2164959021230041,
"learning_rate": 0.000114636230524841,
"loss": 1.578,
"step": 2010
},
{
"epoch": 1.53,
"grad_norm": 0.269300085641628,
"learning_rate": 0.00011419972883069623,
"loss": 1.5605,
"step": 2015
},
{
"epoch": 1.53,
"grad_norm": 0.24787445167484887,
"learning_rate": 0.00011376295084019792,
"loss": 1.6663,
"step": 2020
},
{
"epoch": 1.54,
"grad_norm": 0.21140623194389616,
"learning_rate": 0.00011332590505211159,
"loss": 1.658,
"step": 2025
},
{
"epoch": 1.54,
"grad_norm": 0.25921900302870593,
"learning_rate": 0.00011288859997041353,
"loss": 1.6459,
"step": 2030
},
{
"epoch": 1.54,
"grad_norm": 0.2608666502284525,
"learning_rate": 0.00011245104410412537,
"loss": 1.6928,
"step": 2035
},
{
"epoch": 1.55,
"grad_norm": 0.22406449938146802,
"learning_rate": 0.00011201324596714844,
"loss": 1.4791,
"step": 2040
},
{
"epoch": 1.55,
"grad_norm": 0.19647960391415928,
"learning_rate": 0.00011157521407809815,
"loss": 1.698,
"step": 2045
},
{
"epoch": 1.55,
"grad_norm": 0.1897962583849219,
"learning_rate": 0.00011113695696013824,
"loss": 1.8167,
"step": 2050
},
{
"epoch": 1.56,
"grad_norm": 0.20712759197533817,
"learning_rate": 0.0001106984831408149,
"loss": 1.7501,
"step": 2055
},
{
"epoch": 1.56,
"grad_norm": 0.23079961827033185,
"learning_rate": 0.00011025980115189086,
"loss": 1.5934,
"step": 2060
},
{
"epoch": 1.57,
"grad_norm": 0.22104873487185864,
"learning_rate": 0.00010982091952917943,
"loss": 1.6686,
"step": 2065
},
{
"epoch": 1.57,
"grad_norm": 0.20639504694734737,
"learning_rate": 0.00010938184681237833,
"loss": 1.7136,
"step": 2070
},
{
"epoch": 1.57,
"grad_norm": 0.2417721960073701,
"learning_rate": 0.00010894259154490354,
"loss": 1.6702,
"step": 2075
},
{
"epoch": 1.58,
"grad_norm": 0.21810729625691397,
"learning_rate": 0.00010850316227372312,
"loss": 1.7477,
"step": 2080
},
{
"epoch": 1.58,
"grad_norm": 0.23170201171415503,
"learning_rate": 0.00010806356754919091,
"loss": 1.6943,
"step": 2085
},
{
"epoch": 1.58,
"grad_norm": 0.22093119739393355,
"learning_rate": 0.00010762381592488002,
"loss": 1.623,
"step": 2090
},
{
"epoch": 1.59,
"grad_norm": 0.21034721922753088,
"learning_rate": 0.00010718391595741657,
"loss": 1.6084,
"step": 2095
},
{
"epoch": 1.59,
"grad_norm": 0.22443726771939806,
"learning_rate": 0.00010674387620631308,
"loss": 1.5536,
"step": 2100
},
{
"epoch": 1.6,
"grad_norm": 0.22568508558473213,
"learning_rate": 0.00010630370523380202,
"loss": 1.469,
"step": 2105
},
{
"epoch": 1.6,
"grad_norm": 0.3332888137498032,
"learning_rate": 0.00010586341160466904,
"loss": 1.6488,
"step": 2110
},
{
"epoch": 1.6,
"grad_norm": 0.2129808005413702,
"learning_rate": 0.00010542300388608652,
"loss": 1.6101,
"step": 2115
},
{
"epoch": 1.61,
"grad_norm": 0.20553693555408575,
"learning_rate": 0.00010498249064744679,
"loss": 1.4872,
"step": 2120
},
{
"epoch": 1.61,
"grad_norm": 0.2445112542992352,
"learning_rate": 0.00010454188046019524,
"loss": 1.7005,
"step": 2125
},
{
"epoch": 1.61,
"grad_norm": 0.20844778510756687,
"learning_rate": 0.00010410118189766387,
"loss": 1.5589,
"step": 2130
},
{
"epoch": 1.62,
"grad_norm": 0.2223212290874802,
"learning_rate": 0.0001036604035349041,
"loss": 1.6621,
"step": 2135
},
{
"epoch": 1.62,
"grad_norm": 0.20479585313872112,
"learning_rate": 0.00010321955394852018,
"loss": 1.7061,
"step": 2140
},
{
"epoch": 1.63,
"grad_norm": 0.17606184812861142,
"learning_rate": 0.0001027786417165022,
"loss": 1.5607,
"step": 2145
},
{
"epoch": 1.63,
"grad_norm": 0.2676349610853098,
"learning_rate": 0.0001023376754180592,
"loss": 1.6232,
"step": 2150
},
{
"epoch": 1.63,
"grad_norm": 0.2068560787418325,
"learning_rate": 0.00010189666363345223,
"loss": 1.5724,
"step": 2155
},
{
"epoch": 1.64,
"grad_norm": 0.19641973239797275,
"learning_rate": 0.00010145561494382742,
"loss": 1.5305,
"step": 2160
},
{
"epoch": 1.64,
"grad_norm": 0.2574797520893005,
"learning_rate": 0.00010101453793104898,
"loss": 1.6025,
"step": 2165
},
{
"epoch": 1.65,
"grad_norm": 0.2827194584842853,
"learning_rate": 0.00010057344117753222,
"loss": 1.5882,
"step": 2170
},
{
"epoch": 1.65,
"grad_norm": 0.19936180521947827,
"learning_rate": 0.00010013233326607661,
"loss": 1.5706,
"step": 2175
},
{
"epoch": 1.65,
"grad_norm": 0.21819696462759022,
"learning_rate": 9.969122277969865e-05,
"loss": 1.6623,
"step": 2180
},
{
"epoch": 1.66,
"grad_norm": 0.225417352707018,
"learning_rate": 9.9250118301465e-05,
"loss": 1.6255,
"step": 2185
},
{
"epoch": 1.66,
"grad_norm": 0.3143651738447285,
"learning_rate": 9.880902841432544e-05,
"loss": 1.4905,
"step": 2190
},
{
"epoch": 1.66,
"grad_norm": 0.23749234423783855,
"learning_rate": 9.836796170094571e-05,
"loss": 1.6156,
"step": 2195
},
{
"epoch": 1.67,
"grad_norm": 0.23579593383210742,
"learning_rate": 9.792692674354079e-05,
"loss": 1.6963,
"step": 2200
},
{
"epoch": 1.67,
"grad_norm": 0.2032329245708717,
"learning_rate": 9.748593212370773e-05,
"loss": 1.6733,
"step": 2205
},
{
"epoch": 1.68,
"grad_norm": 0.20661047812325195,
"learning_rate": 9.704498642225856e-05,
"loss": 1.622,
"step": 2210
},
{
"epoch": 1.68,
"grad_norm": 0.18970352315906064,
"learning_rate": 9.660409821905363e-05,
"loss": 1.7834,
"step": 2215
},
{
"epoch": 1.68,
"grad_norm": 0.17832580771616308,
"learning_rate": 9.616327609283445e-05,
"loss": 1.6989,
"step": 2220
},
{
"epoch": 1.69,
"grad_norm": 0.21859704299949706,
"learning_rate": 9.572252862105673e-05,
"loss": 1.7946,
"step": 2225
},
{
"epoch": 1.69,
"grad_norm": 0.24897942412148671,
"learning_rate": 9.528186437972368e-05,
"loss": 1.564,
"step": 2230
},
{
"epoch": 1.69,
"grad_norm": 0.20109922805508615,
"learning_rate": 9.484129194321896e-05,
"loss": 1.6594,
"step": 2235
},
{
"epoch": 1.7,
"grad_norm": 0.19546463521855884,
"learning_rate": 9.440081988413987e-05,
"loss": 1.542,
"step": 2240
},
{
"epoch": 1.7,
"grad_norm": 0.20254596218430737,
"learning_rate": 9.396045677313067e-05,
"loss": 1.8142,
"step": 2245
},
{
"epoch": 1.71,
"grad_norm": 0.1936135057396683,
"learning_rate": 9.352021117871574e-05,
"loss": 1.5564,
"step": 2250
},
{
"epoch": 1.71,
"grad_norm": 0.2096445430714542,
"learning_rate": 9.308009166713263e-05,
"loss": 1.6735,
"step": 2255
},
{
"epoch": 1.71,
"grad_norm": 0.22672329152194862,
"learning_rate": 9.264010680216583e-05,
"loss": 1.6761,
"step": 2260
},
{
"epoch": 1.72,
"grad_norm": 0.24482242735211057,
"learning_rate": 9.220026514497983e-05,
"loss": 1.5988,
"step": 2265
},
{
"epoch": 1.72,
"grad_norm": 0.24736418279884478,
"learning_rate": 9.176057525395252e-05,
"loss": 1.5844,
"step": 2270
},
{
"epoch": 1.72,
"grad_norm": 0.1987944867199659,
"learning_rate": 9.132104568450879e-05,
"loss": 1.6997,
"step": 2275
},
{
"epoch": 1.73,
"grad_norm": 0.1850913674566201,
"learning_rate": 9.088168498895408e-05,
"loss": 1.5696,
"step": 2280
},
{
"epoch": 1.73,
"grad_norm": 0.24393794217168674,
"learning_rate": 9.044250171630778e-05,
"loss": 1.7403,
"step": 2285
},
{
"epoch": 1.74,
"grad_norm": 0.19475525279873163,
"learning_rate": 9.000350441213708e-05,
"loss": 1.5984,
"step": 2290
},
{
"epoch": 1.74,
"grad_norm": 0.2218761532729913,
"learning_rate": 8.956470161839072e-05,
"loss": 1.6681,
"step": 2295
},
{
"epoch": 1.74,
"grad_norm": 0.24957778768532196,
"learning_rate": 8.912610187323248e-05,
"loss": 1.6169,
"step": 2300
},
{
"epoch": 1.75,
"grad_norm": 0.2510725868859042,
"learning_rate": 8.868771371087539e-05,
"loss": 1.639,
"step": 2305
},
{
"epoch": 1.75,
"grad_norm": 0.19643293153400068,
"learning_rate": 8.82495456614155e-05,
"loss": 1.7237,
"step": 2310
},
{
"epoch": 1.76,
"grad_norm": 0.26450396919742597,
"learning_rate": 8.781160625066588e-05,
"loss": 1.6528,
"step": 2315
},
{
"epoch": 1.76,
"grad_norm": 0.22179001551390587,
"learning_rate": 8.737390399999086e-05,
"loss": 1.5533,
"step": 2320
},
{
"epoch": 1.76,
"grad_norm": 0.2346687653947156,
"learning_rate": 8.693644742614017e-05,
"loss": 1.6104,
"step": 2325
},
{
"epoch": 1.77,
"grad_norm": 0.25806483606045055,
"learning_rate": 8.649924504108302e-05,
"loss": 1.6052,
"step": 2330
},
{
"epoch": 1.77,
"grad_norm": 0.1786075330646357,
"learning_rate": 8.606230535184283e-05,
"loss": 1.5603,
"step": 2335
},
{
"epoch": 1.77,
"grad_norm": 0.254068816191818,
"learning_rate": 8.562563686033145e-05,
"loss": 1.7643,
"step": 2340
},
{
"epoch": 1.78,
"grad_norm": 0.21344041020108453,
"learning_rate": 8.518924806318378e-05,
"loss": 1.6584,
"step": 2345
},
{
"epoch": 1.78,
"grad_norm": 0.2082041629797306,
"learning_rate": 8.47531474515925e-05,
"loss": 1.7992,
"step": 2350
},
{
"epoch": 1.79,
"grad_norm": 0.2645099180130053,
"learning_rate": 8.431734351114284e-05,
"loss": 1.6361,
"step": 2355
},
{
"epoch": 1.79,
"grad_norm": 0.22698336003173047,
"learning_rate": 8.388184472164736e-05,
"loss": 1.646,
"step": 2360
},
{
"epoch": 1.79,
"grad_norm": 0.24003288864061173,
"learning_rate": 8.34466595569811e-05,
"loss": 1.6379,
"step": 2365
},
{
"epoch": 1.8,
"grad_norm": 0.19443085064409085,
"learning_rate": 8.301179648491669e-05,
"loss": 1.73,
"step": 2370
},
{
"epoch": 1.8,
"grad_norm": 0.24311509067570025,
"learning_rate": 8.257726396695933e-05,
"loss": 1.6802,
"step": 2375
},
{
"epoch": 1.8,
"grad_norm": 0.24648929428851593,
"learning_rate": 8.214307045818254e-05,
"loss": 1.7708,
"step": 2380
},
{
"epoch": 1.81,
"grad_norm": 0.1940516179768531,
"learning_rate": 8.17092244070634e-05,
"loss": 1.5857,
"step": 2385
},
{
"epoch": 1.81,
"grad_norm": 0.2361070277608161,
"learning_rate": 8.127573425531814e-05,
"loss": 1.6411,
"step": 2390
},
{
"epoch": 1.82,
"grad_norm": 0.2835364928454071,
"learning_rate": 8.084260843773799e-05,
"loss": 1.7818,
"step": 2395
},
{
"epoch": 1.82,
"grad_norm": 0.18047213778922655,
"learning_rate": 8.040985538202505e-05,
"loss": 1.587,
"step": 2400
},
{
"epoch": 1.82,
"grad_norm": 0.21940093931140764,
"learning_rate": 7.997748350862822e-05,
"loss": 1.6795,
"step": 2405
},
{
"epoch": 1.83,
"grad_norm": 0.20557324059132212,
"learning_rate": 7.954550123057939e-05,
"loss": 1.638,
"step": 2410
},
{
"epoch": 1.83,
"grad_norm": 0.23522437885683956,
"learning_rate": 7.911391695332988e-05,
"loss": 1.6176,
"step": 2415
},
{
"epoch": 1.83,
"grad_norm": 0.20227659422834685,
"learning_rate": 7.868273907458661e-05,
"loss": 1.5562,
"step": 2420
},
{
"epoch": 1.84,
"grad_norm": 0.17957107180807144,
"learning_rate": 7.825197598414895e-05,
"loss": 1.6577,
"step": 2425
},
{
"epoch": 1.84,
"grad_norm": 0.21134479099989728,
"learning_rate": 7.782163606374536e-05,
"loss": 1.5407,
"step": 2430
},
{
"epoch": 1.85,
"grad_norm": 0.2190101821746382,
"learning_rate": 7.739172768687028e-05,
"loss": 1.6901,
"step": 2435
},
{
"epoch": 1.85,
"grad_norm": 0.22909832831883262,
"learning_rate": 7.696225921862126e-05,
"loss": 1.6517,
"step": 2440
},
{
"epoch": 1.85,
"grad_norm": 0.1922087104118847,
"learning_rate": 7.653323901553625e-05,
"loss": 1.5558,
"step": 2445
},
{
"epoch": 1.86,
"grad_norm": 0.2535390902934386,
"learning_rate": 7.610467542543073e-05,
"loss": 1.7802,
"step": 2450
},
{
"epoch": 1.86,
"grad_norm": 0.20264859592749507,
"learning_rate": 7.567657678723565e-05,
"loss": 1.6141,
"step": 2455
},
{
"epoch": 1.87,
"grad_norm": 0.2534081482654566,
"learning_rate": 7.52489514308349e-05,
"loss": 1.6593,
"step": 2460
},
{
"epoch": 1.87,
"grad_norm": 0.24401202904206418,
"learning_rate": 7.482180767690334e-05,
"loss": 1.5982,
"step": 2465
},
{
"epoch": 1.87,
"grad_norm": 0.2805376490259695,
"learning_rate": 7.439515383674485e-05,
"loss": 1.7126,
"step": 2470
},
{
"epoch": 1.88,
"grad_norm": 0.24585333566417664,
"learning_rate": 7.396899821213072e-05,
"loss": 1.5644,
"step": 2475
},
{
"epoch": 1.88,
"grad_norm": 0.22491029483008115,
"learning_rate": 7.354334909513791e-05,
"loss": 1.6765,
"step": 2480
},
{
"epoch": 1.88,
"grad_norm": 0.23458997274256846,
"learning_rate": 7.311821476798789e-05,
"loss": 1.6122,
"step": 2485
},
{
"epoch": 1.89,
"grad_norm": 0.17595992796667512,
"learning_rate": 7.269360350288547e-05,
"loss": 1.8356,
"step": 2490
},
{
"epoch": 1.89,
"grad_norm": 0.18759163970832302,
"learning_rate": 7.226952356185765e-05,
"loss": 1.4984,
"step": 2495
},
{
"epoch": 1.9,
"grad_norm": 0.236927434671597,
"learning_rate": 7.184598319659317e-05,
"loss": 1.6798,
"step": 2500
},
{
"epoch": 1.9,
"grad_norm": 0.26802038257147875,
"learning_rate": 7.142299064828169e-05,
"loss": 1.5844,
"step": 2505
},
{
"epoch": 1.9,
"grad_norm": 0.1751974293734832,
"learning_rate": 7.100055414745346e-05,
"loss": 1.6365,
"step": 2510
},
{
"epoch": 1.91,
"grad_norm": 0.23254005323825433,
"learning_rate": 7.057868191381936e-05,
"loss": 1.4657,
"step": 2515
},
{
"epoch": 1.91,
"grad_norm": 0.264348812986722,
"learning_rate": 7.015738215611079e-05,
"loss": 1.7816,
"step": 2520
},
{
"epoch": 1.91,
"grad_norm": 0.27530320883320614,
"learning_rate": 6.973666307191996e-05,
"loss": 1.6751,
"step": 2525
},
{
"epoch": 1.92,
"grad_norm": 0.19339613251333393,
"learning_rate": 6.931653284754042e-05,
"loss": 1.7293,
"step": 2530
},
{
"epoch": 1.92,
"grad_norm": 0.2151392309486146,
"learning_rate": 6.889699965780787e-05,
"loss": 1.7334,
"step": 2535
},
{
"epoch": 1.93,
"grad_norm": 0.22448766537331677,
"learning_rate": 6.847807166594083e-05,
"loss": 1.6827,
"step": 2540
},
{
"epoch": 1.93,
"grad_norm": 0.2286115948636003,
"learning_rate": 6.805975702338208e-05,
"loss": 1.6562,
"step": 2545
},
{
"epoch": 1.93,
"grad_norm": 0.2118908130790939,
"learning_rate": 6.764206386963991e-05,
"loss": 1.6091,
"step": 2550
},
{
"epoch": 1.94,
"grad_norm": 0.240925966059138,
"learning_rate": 6.722500033212974e-05,
"loss": 1.6314,
"step": 2555
},
{
"epoch": 1.94,
"grad_norm": 0.2271694074825516,
"learning_rate": 6.680857452601598e-05,
"loss": 1.7589,
"step": 2560
},
{
"epoch": 1.94,
"grad_norm": 0.2168118018671656,
"learning_rate": 6.639279455405432e-05,
"loss": 1.6201,
"step": 2565
},
{
"epoch": 1.95,
"grad_norm": 0.21224810091098364,
"learning_rate": 6.597766850643361e-05,
"loss": 1.5842,
"step": 2570
},
{
"epoch": 1.95,
"grad_norm": 0.19581859607212743,
"learning_rate": 6.556320446061902e-05,
"loss": 1.5586,
"step": 2575
},
{
"epoch": 1.96,
"grad_norm": 0.20327112477714954,
"learning_rate": 6.514941048119435e-05,
"loss": 1.6303,
"step": 2580
},
{
"epoch": 1.96,
"grad_norm": 0.22810086515914976,
"learning_rate": 6.47362946197055e-05,
"loss": 1.7332,
"step": 2585
},
{
"epoch": 1.96,
"grad_norm": 0.22278333474431392,
"learning_rate": 6.432386491450361e-05,
"loss": 1.6293,
"step": 2590
},
{
"epoch": 1.97,
"grad_norm": 0.23128655487134384,
"learning_rate": 6.391212939058861e-05,
"loss": 1.6937,
"step": 2595
},
{
"epoch": 1.97,
"grad_norm": 0.24641830926598107,
"learning_rate": 6.350109605945323e-05,
"loss": 1.4982,
"step": 2600
},
{
"epoch": 1.97,
"grad_norm": 0.24123146757419323,
"learning_rate": 6.309077291892702e-05,
"loss": 1.5107,
"step": 2605
},
{
"epoch": 1.98,
"grad_norm": 0.24138969338364216,
"learning_rate": 6.268116795302068e-05,
"loss": 1.5448,
"step": 2610
},
{
"epoch": 1.98,
"grad_norm": 0.2515434111696446,
"learning_rate": 6.227228913177081e-05,
"loss": 1.559,
"step": 2615
},
{
"epoch": 1.99,
"grad_norm": 0.2554427971564699,
"learning_rate": 6.186414441108487e-05,
"loss": 1.6211,
"step": 2620
},
{
"epoch": 1.99,
"grad_norm": 0.20773791558688393,
"learning_rate": 6.14567417325861e-05,
"loss": 1.6058,
"step": 2625
},
{
"epoch": 1.99,
"grad_norm": 0.20109572317054908,
"learning_rate": 6.105008902345935e-05,
"loss": 1.5911,
"step": 2630
},
{
"epoch": 2.0,
"grad_norm": 0.21186779196561445,
"learning_rate": 6.064419419629662e-05,
"loss": 1.6227,
"step": 2635
},
{
"epoch": 2.0,
"grad_norm": 0.2150487580932417,
"learning_rate": 6.023906514894313e-05,
"loss": 1.5839,
"step": 2640
},
{
"epoch": 2.01,
"grad_norm": 0.24636199955981808,
"learning_rate": 5.983470976434369e-05,
"loss": 1.5764,
"step": 2645
},
{
"epoch": 2.01,
"grad_norm": 0.22093610448062864,
"learning_rate": 5.943113591038928e-05,
"loss": 1.7157,
"step": 2650
},
{
"epoch": 2.01,
"grad_norm": 0.21359568862552614,
"learning_rate": 5.902835143976393e-05,
"loss": 1.6359,
"step": 2655
},
{
"epoch": 2.02,
"grad_norm": 0.2219633405623727,
"learning_rate": 5.862636418979198e-05,
"loss": 1.6484,
"step": 2660
},
{
"epoch": 2.02,
"grad_norm": 0.24148935530595134,
"learning_rate": 5.822518198228565e-05,
"loss": 1.52,
"step": 2665
},
{
"epoch": 2.02,
"grad_norm": 0.22871052628894134,
"learning_rate": 5.782481262339261e-05,
"loss": 1.5583,
"step": 2670
},
{
"epoch": 2.03,
"grad_norm": 0.18016152517949127,
"learning_rate": 5.742526390344427e-05,
"loss": 1.7094,
"step": 2675
},
{
"epoch": 2.03,
"grad_norm": 0.27927714573640977,
"learning_rate": 5.702654359680428e-05,
"loss": 1.7229,
"step": 2680
},
{
"epoch": 2.04,
"grad_norm": 0.20272089890919007,
"learning_rate": 5.662865946171696e-05,
"loss": 1.7436,
"step": 2685
},
{
"epoch": 2.04,
"grad_norm": 0.25187946618078394,
"learning_rate": 5.6231619240156694e-05,
"loss": 1.5926,
"step": 2690
},
{
"epoch": 2.04,
"grad_norm": 0.23619447456603418,
"learning_rate": 5.5835430657676976e-05,
"loss": 1.5177,
"step": 2695
},
{
"epoch": 2.05,
"grad_norm": 0.23076862233533377,
"learning_rate": 5.544010142326026e-05,
"loss": 1.6432,
"step": 2700
},
{
"epoch": 2.05,
"grad_norm": 0.2509266079111979,
"learning_rate": 5.504563922916799e-05,
"loss": 1.6125,
"step": 2705
},
{
"epoch": 2.05,
"grad_norm": 0.26527998507107736,
"learning_rate": 5.4652051750790825e-05,
"loss": 1.5384,
"step": 2710
},
{
"epoch": 2.06,
"grad_norm": 0.24254486560490685,
"learning_rate": 5.425934664649921e-05,
"loss": 1.6641,
"step": 2715
},
{
"epoch": 2.06,
"grad_norm": 0.22497341374372068,
"learning_rate": 5.3867531557494674e-05,
"loss": 1.4442,
"step": 2720
},
{
"epoch": 2.07,
"grad_norm": 0.22811203680708553,
"learning_rate": 5.347661410766087e-05,
"loss": 1.6313,
"step": 2725
},
{
"epoch": 2.07,
"grad_norm": 0.2193211927138723,
"learning_rate": 5.308660190341528e-05,
"loss": 1.4835,
"step": 2730
},
{
"epoch": 2.07,
"grad_norm": 0.23158894991713072,
"learning_rate": 5.2697502533561226e-05,
"loss": 1.5765,
"step": 2735
},
{
"epoch": 2.08,
"grad_norm": 0.2160152191509828,
"learning_rate": 5.230932356914032e-05,
"loss": 1.6395,
"step": 2740
},
{
"epoch": 2.08,
"grad_norm": 0.23138300560468752,
"learning_rate": 5.1922072563284986e-05,
"loss": 1.6645,
"step": 2745
},
{
"epoch": 2.08,
"grad_norm": 0.27219186986752913,
"learning_rate": 5.153575705107152e-05,
"loss": 1.5842,
"step": 2750
},
{
"epoch": 2.09,
"grad_norm": 0.24365055871265076,
"learning_rate": 5.115038454937362e-05,
"loss": 1.7234,
"step": 2755
},
{
"epoch": 2.09,
"grad_norm": 0.22921672259925305,
"learning_rate": 5.076596255671592e-05,
"loss": 1.5756,
"step": 2760
},
{
"epoch": 2.1,
"grad_norm": 0.2538431765730713,
"learning_rate": 5.0382498553128265e-05,
"loss": 1.6491,
"step": 2765
},
{
"epoch": 2.1,
"grad_norm": 0.25913968900209966,
"learning_rate": 5.000000000000002e-05,
"loss": 1.5438,
"step": 2770
},
{
"epoch": 2.1,
"grad_norm": 0.291257818004918,
"learning_rate": 4.9618474339934916e-05,
"loss": 1.5995,
"step": 2775
},
{
"epoch": 2.11,
"grad_norm": 0.24432948267207238,
"learning_rate": 4.9237928996606384e-05,
"loss": 1.5999,
"step": 2780
},
{
"epoch": 2.11,
"grad_norm": 0.26418330324646966,
"learning_rate": 4.88583713746129e-05,
"loss": 1.7175,
"step": 2785
},
{
"epoch": 2.12,
"grad_norm": 0.2647804130194954,
"learning_rate": 4.8479808859333964e-05,
"loss": 1.5083,
"step": 2790
},
{
"epoch": 2.12,
"grad_norm": 0.23990236642151055,
"learning_rate": 4.810224881678652e-05,
"loss": 1.5032,
"step": 2795
},
{
"epoch": 2.12,
"grad_norm": 0.22406476212806528,
"learning_rate": 4.772569859348156e-05,
"loss": 1.6183,
"step": 2800
},
{
"epoch": 2.13,
"grad_norm": 0.17599248862626268,
"learning_rate": 4.735016551628095e-05,
"loss": 1.694,
"step": 2805
},
{
"epoch": 2.13,
"grad_norm": 0.27545889362059484,
"learning_rate": 4.697565689225528e-05,
"loss": 1.6074,
"step": 2810
},
{
"epoch": 2.13,
"grad_norm": 0.27997532830437954,
"learning_rate": 4.660218000854143e-05,
"loss": 1.5062,
"step": 2815
},
{
"epoch": 2.14,
"grad_norm": 0.2803170335965896,
"learning_rate": 4.6229742132200746e-05,
"loss": 1.6516,
"step": 2820
},
{
"epoch": 2.14,
"grad_norm": 0.22582531196940026,
"learning_rate": 4.585835051007774e-05,
"loss": 1.6168,
"step": 2825
},
{
"epoch": 2.15,
"grad_norm": 0.22856148303418752,
"learning_rate": 4.548801236865912e-05,
"loss": 1.5435,
"step": 2830
},
{
"epoch": 2.15,
"grad_norm": 0.2764784030904549,
"learning_rate": 4.511873491393304e-05,
"loss": 1.6409,
"step": 2835
},
{
"epoch": 2.15,
"grad_norm": 0.21257264261069672,
"learning_rate": 4.475052533124893e-05,
"loss": 1.5581,
"step": 2840
},
{
"epoch": 2.16,
"grad_norm": 0.21196439275175047,
"learning_rate": 4.438339078517785e-05,
"loss": 1.5538,
"step": 2845
},
{
"epoch": 2.16,
"grad_norm": 0.2832145647608719,
"learning_rate": 4.401733841937279e-05,
"loss": 1.724,
"step": 2850
},
{
"epoch": 2.16,
"grad_norm": 0.27147849615384506,
"learning_rate": 4.3652375356429974e-05,
"loss": 1.5014,
"step": 2855
},
{
"epoch": 2.17,
"grad_norm": 0.2610576760484019,
"learning_rate": 4.328850869775001e-05,
"loss": 1.6749,
"step": 2860
},
{
"epoch": 2.17,
"grad_norm": 0.23914287887699434,
"learning_rate": 4.292574552339981e-05,
"loss": 1.5328,
"step": 2865
},
{
"epoch": 2.18,
"grad_norm": 0.24065502762902322,
"learning_rate": 4.256409289197495e-05,
"loss": 1.5942,
"step": 2870
},
{
"epoch": 2.18,
"grad_norm": 0.2083191016158885,
"learning_rate": 4.2203557840462214e-05,
"loss": 1.5539,
"step": 2875
},
{
"epoch": 2.18,
"grad_norm": 0.20639182389301813,
"learning_rate": 4.184414738410248e-05,
"loss": 1.5646,
"step": 2880
},
{
"epoch": 2.19,
"grad_norm": 0.23727403239283584,
"learning_rate": 4.148586851625461e-05,
"loss": 1.5353,
"step": 2885
},
{
"epoch": 2.19,
"grad_norm": 0.24508287577637505,
"learning_rate": 4.112872820825915e-05,
"loss": 1.4418,
"step": 2890
},
{
"epoch": 2.19,
"grad_norm": 0.2475936795575314,
"learning_rate": 4.077273340930263e-05,
"loss": 1.6643,
"step": 2895
},
{
"epoch": 2.2,
"grad_norm": 0.2505899184192717,
"learning_rate": 4.041789104628241e-05,
"loss": 1.5577,
"step": 2900
},
{
"epoch": 2.2,
"grad_norm": 0.24093576954008833,
"learning_rate": 4.006420802367205e-05,
"loss": 1.6784,
"step": 2905
},
{
"epoch": 2.21,
"grad_norm": 0.2561236323684272,
"learning_rate": 3.971169122338668e-05,
"loss": 1.6165,
"step": 2910
},
{
"epoch": 2.21,
"grad_norm": 0.24280603594696593,
"learning_rate": 3.936034750464927e-05,
"loss": 1.6695,
"step": 2915
},
{
"epoch": 2.21,
"grad_norm": 0.2602730047803284,
"learning_rate": 3.901018370385724e-05,
"loss": 1.5697,
"step": 2920
},
{
"epoch": 2.22,
"grad_norm": 0.21146640994821633,
"learning_rate": 3.866120663444914e-05,
"loss": 1.5399,
"step": 2925
},
{
"epoch": 2.22,
"grad_norm": 0.24075711387924426,
"learning_rate": 3.831342308677247e-05,
"loss": 1.5597,
"step": 2930
},
{
"epoch": 2.23,
"grad_norm": 0.24793331779495362,
"learning_rate": 3.7966839827951196e-05,
"loss": 1.6434,
"step": 2935
},
{
"epoch": 2.23,
"grad_norm": 0.19558506394109187,
"learning_rate": 3.762146360175427e-05,
"loss": 1.6499,
"step": 2940
},
{
"epoch": 2.23,
"grad_norm": 0.35587028915030966,
"learning_rate": 3.727730112846444e-05,
"loss": 1.5089,
"step": 2945
},
{
"epoch": 2.24,
"grad_norm": 0.2570330063437446,
"learning_rate": 3.693435910474732e-05,
"loss": 1.6548,
"step": 2950
},
{
"epoch": 2.24,
"grad_norm": 0.28077059284475103,
"learning_rate": 3.659264420352122e-05,
"loss": 1.6528,
"step": 2955
},
{
"epoch": 2.24,
"grad_norm": 0.23035257395244374,
"learning_rate": 3.6252163073827294e-05,
"loss": 1.4482,
"step": 2960
},
{
"epoch": 2.25,
"grad_norm": 0.2051186918638722,
"learning_rate": 3.5912922340700206e-05,
"loss": 1.5015,
"step": 2965
},
{
"epoch": 2.25,
"grad_norm": 0.22455945185810877,
"learning_rate": 3.557492860503893e-05,
"loss": 1.5176,
"step": 2970
},
{
"epoch": 2.26,
"grad_norm": 0.23453638209680727,
"learning_rate": 3.5238188443478795e-05,
"loss": 1.6343,
"step": 2975
},
{
"epoch": 2.26,
"grad_norm": 0.24470156257503126,
"learning_rate": 3.4902708408263066e-05,
"loss": 1.7663,
"step": 2980
},
{
"epoch": 2.26,
"grad_norm": 0.23135832322132918,
"learning_rate": 3.45684950271158e-05,
"loss": 1.5837,
"step": 2985
},
{
"epoch": 2.27,
"grad_norm": 0.2608640064079802,
"learning_rate": 3.423555480311457e-05,
"loss": 1.6173,
"step": 2990
},
{
"epoch": 2.27,
"grad_norm": 0.31078928098679404,
"learning_rate": 3.3903894214564026e-05,
"loss": 1.5177,
"step": 2995
},
{
"epoch": 2.27,
"grad_norm": 0.26258430453244713,
"learning_rate": 3.3573519714869914e-05,
"loss": 1.6865,
"step": 3000
},
{
"epoch": 2.28,
"grad_norm": 0.2733284038434726,
"learning_rate": 3.324443773241349e-05,
"loss": 1.3619,
"step": 3005
},
{
"epoch": 2.28,
"grad_norm": 0.2369163548191094,
"learning_rate": 3.291665467042618e-05,
"loss": 1.6509,
"step": 3010
},
{
"epoch": 2.29,
"grad_norm": 0.2664340527286697,
"learning_rate": 3.25901769068654e-05,
"loss": 1.6038,
"step": 3015
},
{
"epoch": 2.29,
"grad_norm": 0.23398120063750877,
"learning_rate": 3.2265010794290195e-05,
"loss": 1.663,
"step": 3020
},
{
"epoch": 2.29,
"grad_norm": 0.2781275708933271,
"learning_rate": 3.1941162659737647e-05,
"loss": 1.6429,
"step": 3025
},
{
"epoch": 2.3,
"grad_norm": 0.2687866606825216,
"learning_rate": 3.16186388045998e-05,
"loss": 1.6853,
"step": 3030
},
{
"epoch": 2.3,
"grad_norm": 0.23644485510225058,
"learning_rate": 3.129744550450113e-05,
"loss": 1.6027,
"step": 3035
},
{
"epoch": 2.3,
"grad_norm": 0.24644290933624716,
"learning_rate": 3.09775890091763e-05,
"loss": 1.6018,
"step": 3040
},
{
"epoch": 2.31,
"grad_norm": 0.2259139537131363,
"learning_rate": 3.065907554234858e-05,
"loss": 1.6607,
"step": 3045
},
{
"epoch": 2.31,
"grad_norm": 0.24004959008038543,
"learning_rate": 3.034191130160887e-05,
"loss": 1.5377,
"step": 3050
},
{
"epoch": 2.32,
"grad_norm": 0.2213008661812979,
"learning_rate": 3.0026102458294924e-05,
"loss": 1.5613,
"step": 3055
},
{
"epoch": 2.32,
"grad_norm": 0.2079094581228579,
"learning_rate": 2.9711655157371443e-05,
"loss": 1.5085,
"step": 3060
},
{
"epoch": 2.32,
"grad_norm": 0.2527748569210639,
"learning_rate": 2.9398575517310355e-05,
"loss": 1.5855,
"step": 3065
},
{
"epoch": 2.33,
"grad_norm": 0.2141370968928817,
"learning_rate": 2.9086869629971836e-05,
"loss": 1.5732,
"step": 3070
},
{
"epoch": 2.33,
"grad_norm": 0.24493685886391817,
"learning_rate": 2.8776543560485857e-05,
"loss": 1.6197,
"step": 3075
},
{
"epoch": 2.34,
"grad_norm": 0.2316788505534105,
"learning_rate": 2.8467603347133997e-05,
"loss": 1.648,
"step": 3080
},
{
"epoch": 2.34,
"grad_norm": 0.25146411778731,
"learning_rate": 2.816005500123203e-05,
"loss": 1.5525,
"step": 3085
},
{
"epoch": 2.34,
"grad_norm": 0.22407696629199808,
"learning_rate": 2.785390450701303e-05,
"loss": 1.7218,
"step": 3090
},
{
"epoch": 2.35,
"grad_norm": 0.27013300544460844,
"learning_rate": 2.7549157821510885e-05,
"loss": 1.5804,
"step": 3095
},
{
"epoch": 2.35,
"grad_norm": 0.25388595748221704,
"learning_rate": 2.7245820874444272e-05,
"loss": 1.7398,
"step": 3100
},
{
"epoch": 2.35,
"grad_norm": 0.19843759758285218,
"learning_rate": 2.6943899568101405e-05,
"loss": 1.6999,
"step": 3105
},
{
"epoch": 2.36,
"grad_norm": 0.20783915655026464,
"learning_rate": 2.6643399777225232e-05,
"loss": 1.6114,
"step": 3110
},
{
"epoch": 2.36,
"grad_norm": 0.2496800397125067,
"learning_rate": 2.6344327348898958e-05,
"loss": 1.5217,
"step": 3115
},
{
"epoch": 2.37,
"grad_norm": 0.22235249882770752,
"learning_rate": 2.6046688102432382e-05,
"loss": 1.6871,
"step": 3120
},
{
"epoch": 2.37,
"grad_norm": 0.2462186333352102,
"learning_rate": 2.5750487829248726e-05,
"loss": 1.7788,
"step": 3125
},
{
"epoch": 2.37,
"grad_norm": 0.20018170839209692,
"learning_rate": 2.545573229277175e-05,
"loss": 1.6076,
"step": 3130
},
{
"epoch": 2.38,
"grad_norm": 0.2704237119402894,
"learning_rate": 2.5162427228313857e-05,
"loss": 1.6456,
"step": 3135
},
{
"epoch": 2.38,
"grad_norm": 0.2735737465777087,
"learning_rate": 2.4870578342964245e-05,
"loss": 1.6402,
"step": 3140
},
{
"epoch": 2.38,
"grad_norm": 0.2188413596766906,
"learning_rate": 2.458019131547803e-05,
"loss": 1.5193,
"step": 3145
},
{
"epoch": 2.39,
"grad_norm": 0.2821633184600081,
"learning_rate": 2.429127179616575e-05,
"loss": 1.6363,
"step": 3150
},
{
"epoch": 2.39,
"grad_norm": 0.20714886526036308,
"learning_rate": 2.4003825406783308e-05,
"loss": 1.669,
"step": 3155
},
{
"epoch": 2.4,
"grad_norm": 0.2661408497359453,
"learning_rate": 2.3717857740422644e-05,
"loss": 1.5488,
"step": 3160
},
{
"epoch": 2.4,
"grad_norm": 0.2535527034852724,
"learning_rate": 2.343337436140295e-05,
"loss": 1.5851,
"step": 3165
},
{
"epoch": 2.4,
"grad_norm": 0.2629746106882043,
"learning_rate": 2.3150380805162418e-05,
"loss": 1.5467,
"step": 3170
},
{
"epoch": 2.41,
"grad_norm": 0.24098285831571226,
"learning_rate": 2.2868882578150285e-05,
"loss": 1.6417,
"step": 3175
},
{
"epoch": 2.41,
"grad_norm": 0.28638431202213366,
"learning_rate": 2.258888515772005e-05,
"loss": 1.6915,
"step": 3180
},
{
"epoch": 2.41,
"grad_norm": 0.319171053435643,
"learning_rate": 2.2310393992022704e-05,
"loss": 1.6324,
"step": 3185
},
{
"epoch": 2.42,
"grad_norm": 0.2054749944090956,
"learning_rate": 2.2033414499900685e-05,
"loss": 1.5694,
"step": 3190
},
{
"epoch": 2.42,
"grad_norm": 0.2515694982134836,
"learning_rate": 2.1757952070782504e-05,
"loss": 1.598,
"step": 3195
},
{
"epoch": 2.43,
"grad_norm": 0.23267628383812705,
"learning_rate": 2.148401206457793e-05,
"loss": 1.4513,
"step": 3200
},
{
"epoch": 2.43,
"grad_norm": 0.25390773868938254,
"learning_rate": 2.121159981157359e-05,
"loss": 1.5906,
"step": 3205
},
{
"epoch": 2.43,
"grad_norm": 0.251154990702733,
"learning_rate": 2.0940720612329258e-05,
"loss": 1.4707,
"step": 3210
},
{
"epoch": 2.44,
"grad_norm": 0.24909067323121328,
"learning_rate": 2.067137973757489e-05,
"loss": 1.6214,
"step": 3215
},
{
"epoch": 2.44,
"grad_norm": 0.23515254331621996,
"learning_rate": 2.0403582428107792e-05,
"loss": 1.3762,
"step": 3220
},
{
"epoch": 2.45,
"grad_norm": 0.24320094875542947,
"learning_rate": 2.0137333894690912e-05,
"loss": 1.4732,
"step": 3225
},
{
"epoch": 2.45,
"grad_norm": 0.26976839590657536,
"learning_rate": 1.987263931795126e-05,
"loss": 1.5325,
"step": 3230
},
{
"epoch": 2.45,
"grad_norm": 0.2480855244121356,
"learning_rate": 1.9609503848279144e-05,
"loss": 1.6336,
"step": 3235
},
{
"epoch": 2.46,
"grad_norm": 0.23767732175608752,
"learning_rate": 1.9347932605728093e-05,
"loss": 1.564,
"step": 3240
},
{
"epoch": 2.46,
"grad_norm": 0.2727265524309786,
"learning_rate": 1.9087930679915023e-05,
"loss": 1.6079,
"step": 3245
},
{
"epoch": 2.46,
"grad_norm": 0.22286517973459688,
"learning_rate": 1.882950312992131e-05,
"loss": 1.4002,
"step": 3250
},
{
"epoch": 2.47,
"grad_norm": 0.2456900771009275,
"learning_rate": 1.8572654984194392e-05,
"loss": 1.5994,
"step": 3255
},
{
"epoch": 2.47,
"grad_norm": 0.2771873738066393,
"learning_rate": 1.8317391240449876e-05,
"loss": 1.6214,
"step": 3260
},
{
"epoch": 2.48,
"grad_norm": 0.2770007292533942,
"learning_rate": 1.8063716865574266e-05,
"loss": 1.4663,
"step": 3265
},
{
"epoch": 2.48,
"grad_norm": 0.24034756553369535,
"learning_rate": 1.781163679552831e-05,
"loss": 1.6507,
"step": 3270
},
{
"epoch": 2.48,
"grad_norm": 0.2286386450562912,
"learning_rate": 1.7561155935251094e-05,
"loss": 1.5512,
"step": 3275
},
{
"epoch": 2.49,
"grad_norm": 0.2594167587325395,
"learning_rate": 1.7312279158564415e-05,
"loss": 1.6027,
"step": 3280
},
{
"epoch": 2.49,
"grad_norm": 0.2127951461073897,
"learning_rate": 1.706501130807806e-05,
"loss": 1.6896,
"step": 3285
},
{
"epoch": 2.49,
"grad_norm": 0.2796245456905501,
"learning_rate": 1.6819357195095597e-05,
"loss": 1.6376,
"step": 3290
},
{
"epoch": 2.5,
"grad_norm": 0.23110557133342613,
"learning_rate": 1.657532159952062e-05,
"loss": 1.5277,
"step": 3295
},
{
"epoch": 2.5,
"grad_norm": 0.24542029689976314,
"learning_rate": 1.6332909269763953e-05,
"loss": 1.7143,
"step": 3300
},
{
"epoch": 2.51,
"grad_norm": 0.23539162074782163,
"learning_rate": 1.609212492265103e-05,
"loss": 1.7028,
"step": 3305
},
{
"epoch": 2.51,
"grad_norm": 0.2629785684260658,
"learning_rate": 1.585297324333027e-05,
"loss": 1.4392,
"step": 3310
},
{
"epoch": 2.51,
"grad_norm": 0.2933973664128153,
"learning_rate": 1.561545888518192e-05,
"loss": 1.7234,
"step": 3315
},
{
"epoch": 2.52,
"grad_norm": 0.23954470728817145,
"learning_rate": 1.537958646972737e-05,
"loss": 1.4944,
"step": 3320
},
{
"epoch": 2.52,
"grad_norm": 0.23980954615598538,
"learning_rate": 1.5145360586539336e-05,
"loss": 1.5851,
"step": 3325
},
{
"epoch": 2.52,
"grad_norm": 0.21087011175193957,
"learning_rate": 1.4912785793152583e-05,
"loss": 1.5208,
"step": 3330
},
{
"epoch": 2.53,
"grad_norm": 0.23976449951280604,
"learning_rate": 1.4681866614975227e-05,
"loss": 1.5722,
"step": 3335
},
{
"epoch": 2.53,
"grad_norm": 0.22800754377440097,
"learning_rate": 1.4452607545200492e-05,
"loss": 1.6206,
"step": 3340
},
{
"epoch": 2.54,
"grad_norm": 0.21262175469660566,
"learning_rate": 1.4225013044719615e-05,
"loss": 1.5784,
"step": 3345
},
{
"epoch": 2.54,
"grad_norm": 0.2436947408558131,
"learning_rate": 1.3999087542034817e-05,
"loss": 1.5594,
"step": 3350
},
{
"epoch": 2.54,
"grad_norm": 0.23425763672194239,
"learning_rate": 1.3774835433173172e-05,
"loss": 1.6784,
"step": 3355
},
{
"epoch": 2.55,
"grad_norm": 0.22447628853598572,
"learning_rate": 1.3552261081601091e-05,
"loss": 1.6606,
"step": 3360
},
{
"epoch": 2.55,
"grad_norm": 0.2410908531230671,
"learning_rate": 1.3331368818139445e-05,
"loss": 1.5011,
"step": 3365
},
{
"epoch": 2.55,
"grad_norm": 0.2107811996936363,
"learning_rate": 1.3112162940879225e-05,
"loss": 1.6211,
"step": 3370
},
{
"epoch": 2.56,
"grad_norm": 0.23349707446690013,
"learning_rate": 1.289464771509804e-05,
"loss": 1.4912,
"step": 3375
},
{
"epoch": 2.56,
"grad_norm": 0.23161951144663487,
"learning_rate": 1.2678827373176894e-05,
"loss": 1.5809,
"step": 3380
},
{
"epoch": 2.57,
"grad_norm": 0.23879959809777346,
"learning_rate": 1.2464706114518088e-05,
"loss": 1.6276,
"step": 3385
},
{
"epoch": 2.57,
"grad_norm": 0.2421829350347233,
"learning_rate": 1.2252288105463405e-05,
"loss": 1.6212,
"step": 3390
},
{
"epoch": 2.57,
"grad_norm": 0.21883362123837063,
"learning_rate": 1.2041577479212963e-05,
"loss": 1.6288,
"step": 3395
},
{
"epoch": 2.58,
"grad_norm": 0.22802087223126732,
"learning_rate": 1.1832578335744882e-05,
"loss": 1.6313,
"step": 3400
},
{
"epoch": 2.58,
"grad_norm": 0.26121795799269726,
"learning_rate": 1.1625294741735526e-05,
"loss": 1.656,
"step": 3405
},
{
"epoch": 2.59,
"grad_norm": 0.24409811650460989,
"learning_rate": 1.1419730730480305e-05,
"loss": 1.618,
"step": 3410
},
{
"epoch": 2.59,
"grad_norm": 0.21817658760534914,
"learning_rate": 1.1215890301815201e-05,
"loss": 1.5273,
"step": 3415
},
{
"epoch": 2.59,
"grad_norm": 0.21593460359416924,
"learning_rate": 1.101377742203903e-05,
"loss": 1.5447,
"step": 3420
},
{
"epoch": 2.6,
"grad_norm": 0.2425820551445123,
"learning_rate": 1.0813396023836142e-05,
"loss": 1.5712,
"step": 3425
},
{
"epoch": 2.6,
"grad_norm": 0.2625166861814797,
"learning_rate": 1.0614750006200014e-05,
"loss": 1.6605,
"step": 3430
},
{
"epoch": 2.6,
"grad_norm": 0.2615766591081601,
"learning_rate": 1.0417843234357282e-05,
"loss": 1.5986,
"step": 3435
},
{
"epoch": 2.61,
"grad_norm": 0.2581808450444552,
"learning_rate": 1.022267953969257e-05,
"loss": 1.641,
"step": 3440
},
{
"epoch": 2.61,
"grad_norm": 0.2746391682993569,
"learning_rate": 1.0029262719674015e-05,
"loss": 1.6293,
"step": 3445
},
{
"epoch": 2.62,
"grad_norm": 0.20322726411468045,
"learning_rate": 9.837596537779237e-06,
"loss": 1.5418,
"step": 3450
},
{
"epoch": 2.62,
"grad_norm": 0.24234603845064367,
"learning_rate": 9.647684723422213e-06,
"loss": 1.6451,
"step": 3455
},
{
"epoch": 2.62,
"grad_norm": 0.24585798612987492,
"learning_rate": 9.459530971880681e-06,
"loss": 1.5217,
"step": 3460
},
{
"epoch": 2.63,
"grad_norm": 0.24090792198358563,
"learning_rate": 9.27313894422428e-06,
"loss": 1.7077,
"step": 3465
},
{
"epoch": 2.63,
"grad_norm": 0.2794920103874086,
"learning_rate": 9.088512267243143e-06,
"loss": 1.7315,
"step": 3470
},
{
"epoch": 2.63,
"grad_norm": 0.21561282445650495,
"learning_rate": 8.905654533377583e-06,
"loss": 1.6059,
"step": 3475
},
{
"epoch": 2.64,
"grad_norm": 0.2348946654088957,
"learning_rate": 8.724569300648034e-06,
"loss": 1.7123,
"step": 3480
},
{
"epoch": 2.64,
"grad_norm": 0.29915508239234007,
"learning_rate": 8.545260092585805e-06,
"loss": 1.6167,
"step": 3485
},
{
"epoch": 2.65,
"grad_norm": 0.2609864197177778,
"learning_rate": 8.367730398164574e-06,
"loss": 1.6634,
"step": 3490
},
{
"epoch": 2.65,
"grad_norm": 0.32383778611721475,
"learning_rate": 8.19198367173255e-06,
"loss": 1.631,
"step": 3495
},
{
"epoch": 2.65,
"grad_norm": 0.24225506413140852,
"learning_rate": 8.018023332945112e-06,
"loss": 1.5466,
"step": 3500
},
{
"epoch": 2.66,
"grad_norm": 0.2686708766662986,
"learning_rate": 7.845852766698426e-06,
"loss": 1.5889,
"step": 3505
},
{
"epoch": 2.66,
"grad_norm": 0.24593185494412043,
"learning_rate": 7.675475323063475e-06,
"loss": 1.5796,
"step": 3510
},
{
"epoch": 2.66,
"grad_norm": 0.27604151432217455,
"learning_rate": 7.5068943172209025e-06,
"loss": 1.6281,
"step": 3515
},
{
"epoch": 2.67,
"grad_norm": 0.2211924967690316,
"learning_rate": 7.340113029396567e-06,
"loss": 1.5407,
"step": 3520
},
{
"epoch": 2.67,
"grad_norm": 0.2549240432311639,
"learning_rate": 7.175134704797592e-06,
"loss": 1.6782,
"step": 3525
},
{
"epoch": 2.68,
"grad_norm": 0.22194169097137223,
"learning_rate": 7.011962553549345e-06,
"loss": 1.639,
"step": 3530
},
{
"epoch": 2.68,
"grad_norm": 0.24109057602354814,
"learning_rate": 6.8505997506329024e-06,
"loss": 1.6421,
"step": 3535
},
{
"epoch": 2.68,
"grad_norm": 0.298662548692409,
"learning_rate": 6.691049435823327e-06,
"loss": 1.5672,
"step": 3540
},
{
"epoch": 2.69,
"grad_norm": 0.21821362720901652,
"learning_rate": 6.533314713628458e-06,
"loss": 1.5832,
"step": 3545
},
{
"epoch": 2.69,
"grad_norm": 0.28320776205122955,
"learning_rate": 6.377398653228661e-06,
"loss": 1.5686,
"step": 3550
},
{
"epoch": 2.7,
"grad_norm": 0.2647885175395758,
"learning_rate": 6.22330428841702e-06,
"loss": 1.3694,
"step": 3555
},
{
"epoch": 2.7,
"grad_norm": 0.23693055785085496,
"learning_rate": 6.071034617540294e-06,
"loss": 1.4096,
"step": 3560
},
{
"epoch": 2.7,
"grad_norm": 0.2811316512655128,
"learning_rate": 5.9205926034406e-06,
"loss": 1.7223,
"step": 3565
},
{
"epoch": 2.71,
"grad_norm": 0.25921259473226205,
"learning_rate": 5.771981173397811e-06,
"loss": 1.6491,
"step": 3570
},
{
"epoch": 2.71,
"grad_norm": 0.2384614576434685,
"learning_rate": 5.625203219072495e-06,
"loss": 1.5796,
"step": 3575
},
{
"epoch": 2.71,
"grad_norm": 0.24972301573225342,
"learning_rate": 5.480261596449698e-06,
"loss": 1.6484,
"step": 3580
},
{
"epoch": 2.72,
"grad_norm": 0.2790877252142927,
"learning_rate": 5.337159125783453e-06,
"loss": 1.6747,
"step": 3585
},
{
"epoch": 2.72,
"grad_norm": 0.26526860024829096,
"learning_rate": 5.195898591541748e-06,
"loss": 1.631,
"step": 3590
},
{
"epoch": 2.73,
"grad_norm": 0.2359453480631305,
"learning_rate": 5.056482742352486e-06,
"loss": 1.5224,
"step": 3595
},
{
"epoch": 2.73,
"grad_norm": 0.2749211055400865,
"learning_rate": 4.9189142909498945e-06,
"loss": 1.5348,
"step": 3600
},
{
"epoch": 2.73,
"grad_norm": 0.20381886685736014,
"learning_rate": 4.783195914121818e-06,
"loss": 1.6092,
"step": 3605
},
{
"epoch": 2.74,
"grad_norm": 0.27694673318367125,
"learning_rate": 4.649330252657613e-06,
"loss": 1.5524,
"step": 3610
},
{
"epoch": 2.74,
"grad_norm": 0.24104039690274334,
"learning_rate": 4.517319911296747e-06,
"loss": 1.6131,
"step": 3615
},
{
"epoch": 2.74,
"grad_norm": 0.21654379256196502,
"learning_rate": 4.387167458678121e-06,
"loss": 1.5537,
"step": 3620
},
{
"epoch": 2.75,
"grad_norm": 0.22452456642122062,
"learning_rate": 4.2588754272900985e-06,
"loss": 1.5051,
"step": 3625
},
{
"epoch": 2.75,
"grad_norm": 0.2338963199910069,
"learning_rate": 4.132446313421246e-06,
"loss": 1.6882,
"step": 3630
},
{
"epoch": 2.76,
"grad_norm": 0.2584997684032959,
"learning_rate": 4.00788257711171e-06,
"loss": 1.5014,
"step": 3635
},
{
"epoch": 2.76,
"grad_norm": 0.2908213514842905,
"learning_rate": 3.885186642105376e-06,
"loss": 1.6277,
"step": 3640
},
{
"epoch": 2.76,
"grad_norm": 0.19233168126321265,
"learning_rate": 3.7643608958027543e-06,
"loss": 1.7565,
"step": 3645
},
{
"epoch": 2.77,
"grad_norm": 0.270149572768491,
"learning_rate": 3.6454076892144418e-06,
"loss": 1.6004,
"step": 3650
},
{
"epoch": 2.77,
"grad_norm": 0.2403966892086897,
"learning_rate": 3.5283293369154036e-06,
"loss": 1.5425,
"step": 3655
},
{
"epoch": 2.77,
"grad_norm": 0.1959498671099479,
"learning_rate": 3.4131281170000083e-06,
"loss": 1.6043,
"step": 3660
},
{
"epoch": 2.78,
"grad_norm": 0.2291068939021477,
"learning_rate": 3.2998062710375864e-06,
"loss": 1.6167,
"step": 3665
},
{
"epoch": 2.78,
"grad_norm": 0.2669658026144572,
"learning_rate": 3.188366004028931e-06,
"loss": 1.6093,
"step": 3670
},
{
"epoch": 2.79,
"grad_norm": 0.21167755912643296,
"learning_rate": 3.0788094843632655e-06,
"loss": 1.6288,
"step": 3675
},
{
"epoch": 2.79,
"grad_norm": 0.27869409187190786,
"learning_rate": 2.9711388437761445e-06,
"loss": 1.5781,
"step": 3680
},
{
"epoch": 2.79,
"grad_norm": 0.21975752059084885,
"learning_rate": 2.8653561773079764e-06,
"loss": 1.6193,
"step": 3685
},
{
"epoch": 2.8,
"grad_norm": 0.22758480435581485,
"learning_rate": 2.7614635432632097e-06,
"loss": 1.7111,
"step": 3690
},
{
"epoch": 2.8,
"grad_norm": 0.253380043181827,
"learning_rate": 2.6594629631702783e-06,
"loss": 1.6528,
"step": 3695
},
{
"epoch": 2.81,
"grad_norm": 0.22066629732671186,
"learning_rate": 2.5593564217423314e-06,
"loss": 1.5717,
"step": 3700
},
{
"epoch": 2.81,
"grad_norm": 0.20545309594834268,
"learning_rate": 2.461145866838599e-06,
"loss": 1.5816,
"step": 3705
},
{
"epoch": 2.81,
"grad_norm": 0.2298245690861381,
"learning_rate": 2.364833209426376e-06,
"loss": 1.5273,
"step": 3710
},
{
"epoch": 2.82,
"grad_norm": 0.28452051640315046,
"learning_rate": 2.270420323544009e-06,
"loss": 1.5568,
"step": 3715
},
{
"epoch": 2.82,
"grad_norm": 0.20796391559506347,
"learning_rate": 2.177909046264348e-06,
"loss": 1.6991,
"step": 3720
},
{
"epoch": 2.82,
"grad_norm": 0.26581893695586506,
"learning_rate": 2.0873011776589957e-06,
"loss": 1.517,
"step": 3725
},
{
"epoch": 2.83,
"grad_norm": 0.22796087968697157,
"learning_rate": 1.998598480763247e-06,
"loss": 1.7992,
"step": 3730
},
{
"epoch": 2.83,
"grad_norm": 0.2336977716987997,
"learning_rate": 1.911802681541919e-06,
"loss": 1.513,
"step": 3735
},
{
"epoch": 2.84,
"grad_norm": 0.2497871821535283,
"learning_rate": 1.8269154688556056e-06,
"loss": 1.5704,
"step": 3740
},
{
"epoch": 2.84,
"grad_norm": 0.23620748564286875,
"learning_rate": 1.7439384944279213e-06,
"loss": 1.4392,
"step": 3745
},
{
"epoch": 2.84,
"grad_norm": 0.23378030531695476,
"learning_rate": 1.6628733728133227e-06,
"loss": 1.5813,
"step": 3750
},
{
"epoch": 2.85,
"grad_norm": 0.2146995504847581,
"learning_rate": 1.5837216813656908e-06,
"loss": 1.5966,
"step": 3755
},
{
"epoch": 2.85,
"grad_norm": 0.2382092413535891,
"learning_rate": 1.506484960207677e-06,
"loss": 1.553,
"step": 3760
},
{
"epoch": 2.85,
"grad_norm": 0.19096405564266747,
"learning_rate": 1.4311647122006721e-06,
"loss": 1.5538,
"step": 3765
},
{
"epoch": 2.86,
"grad_norm": 0.2877533644995907,
"learning_rate": 1.3577624029155966e-06,
"loss": 1.5703,
"step": 3770
},
{
"epoch": 2.86,
"grad_norm": 0.25619739351479454,
"learning_rate": 1.2862794606044337e-06,
"loss": 1.4537,
"step": 3775
},
{
"epoch": 2.87,
"grad_norm": 0.17985034388431237,
"learning_rate": 1.216717276172341e-06,
"loss": 1.7393,
"step": 3780
},
{
"epoch": 2.87,
"grad_norm": 0.24865342548144834,
"learning_rate": 1.1490772031506392e-06,
"loss": 1.681,
"step": 3785
},
{
"epoch": 2.87,
"grad_norm": 0.2682691239787855,
"learning_rate": 1.0833605576705096e-06,
"loss": 1.7253,
"step": 3790
},
{
"epoch": 2.88,
"grad_norm": 0.2694273976328748,
"learning_rate": 1.0195686184373166e-06,
"loss": 1.5678,
"step": 3795
},
{
"epoch": 2.88,
"grad_norm": 0.21203591608829347,
"learning_rate": 9.577026267057476e-07,
"loss": 1.615,
"step": 3800
},
{
"epoch": 2.88,
"grad_norm": 0.24345143310387468,
"learning_rate": 8.97763786255712e-07,
"loss": 1.5338,
"step": 3805
},
{
"epoch": 2.89,
"grad_norm": 0.2579598936883795,
"learning_rate": 8.397532633688254e-07,
"loss": 1.5515,
"step": 3810
},
{
"epoch": 2.89,
"grad_norm": 0.2434697228699266,
"learning_rate": 7.836721868058061e-07,
"loss": 1.7675,
"step": 3815
},
{
"epoch": 2.9,
"grad_norm": 0.2440247854883621,
"learning_rate": 7.295216477844702e-07,
"loss": 1.6179,
"step": 3820
},
{
"epoch": 2.9,
"grad_norm": 0.25830362738889806,
"learning_rate": 6.773026999584708e-07,
"loss": 1.698,
"step": 3825
},
{
"epoch": 2.9,
"grad_norm": 0.2622736838540384,
"learning_rate": 6.270163593968703e-07,
"loss": 1.6485,
"step": 3830
},
{
"epoch": 2.91,
"grad_norm": 0.2065165799768009,
"learning_rate": 5.786636045643112e-07,
"loss": 1.6278,
"step": 3835
},
{
"epoch": 2.91,
"grad_norm": 0.22985561856815895,
"learning_rate": 5.322453763019653e-07,
"loss": 1.5524,
"step": 3840
},
{
"epoch": 2.92,
"grad_norm": 0.2727208719350262,
"learning_rate": 4.877625778092809e-07,
"loss": 1.6646,
"step": 3845
},
{
"epoch": 2.92,
"grad_norm": 0.2664985065225481,
"learning_rate": 4.4521607462640893e-07,
"loss": 1.5143,
"step": 3850
},
{
"epoch": 2.92,
"grad_norm": 0.24545453500067022,
"learning_rate": 4.046066946172822e-07,
"loss": 1.6567,
"step": 3855
},
{
"epoch": 2.93,
"grad_norm": 0.24678453590974866,
"learning_rate": 3.659352279535733e-07,
"loss": 1.6106,
"step": 3860
},
{
"epoch": 2.93,
"grad_norm": 0.28024299756070853,
"learning_rate": 3.292024270993399e-07,
"loss": 1.4444,
"step": 3865
},
{
"epoch": 2.93,
"grad_norm": 0.21189400608157236,
"learning_rate": 2.9440900679631457e-07,
"loss": 1.5323,
"step": 3870
},
{
"epoch": 2.94,
"grad_norm": 0.24352825534009542,
"learning_rate": 2.615556440500377e-07,
"loss": 1.6129,
"step": 3875
},
{
"epoch": 2.94,
"grad_norm": 0.24710743453957604,
"learning_rate": 2.306429781166908e-07,
"loss": 1.6064,
"step": 3880
},
{
"epoch": 2.95,
"grad_norm": 0.2939949777477219,
"learning_rate": 2.016716104906391e-07,
"loss": 1.5547,
"step": 3885
},
{
"epoch": 2.95,
"grad_norm": 0.24855903276634353,
"learning_rate": 1.7464210489273047e-07,
"loss": 1.4292,
"step": 3890
},
{
"epoch": 2.95,
"grad_norm": 0.29838466749808656,
"learning_rate": 1.4955498725932604e-07,
"loss": 1.6796,
"step": 3895
},
{
"epoch": 2.96,
"grad_norm": 0.24063497135231618,
"learning_rate": 1.2641074573209733e-07,
"loss": 1.6524,
"step": 3900
},
{
"epoch": 2.96,
"grad_norm": 0.266868894727991,
"learning_rate": 1.0520983064847833e-07,
"loss": 1.6033,
"step": 3905
},
{
"epoch": 2.96,
"grad_norm": 0.2468474483223204,
"learning_rate": 8.595265453292811e-08,
"loss": 1.5643,
"step": 3910
},
{
"epoch": 2.97,
"grad_norm": 0.20433798961816185,
"learning_rate": 6.86395920889149e-08,
"loss": 1.5261,
"step": 3915
},
{
"epoch": 2.97,
"grad_norm": 0.24801643390934636,
"learning_rate": 5.327098019159982e-08,
"loss": 1.7088,
"step": 3920
},
{
"epoch": 2.98,
"grad_norm": 0.25316391379721404,
"learning_rate": 3.9847117881308685e-08,
"loss": 1.6097,
"step": 3925
},
{
"epoch": 2.98,
"grad_norm": 0.2589299223311166,
"learning_rate": 2.8368266357681194e-08,
"loss": 1.7897,
"step": 3930
},
{
"epoch": 2.98,
"grad_norm": 0.2615270175678582,
"learning_rate": 1.8834648974630497e-08,
"loss": 1.7153,
"step": 3935
},
{
"epoch": 2.99,
"grad_norm": 0.23820419945511057,
"learning_rate": 1.12464512359578e-08,
"loss": 1.489,
"step": 3940
},
{
"epoch": 2.99,
"grad_norm": 0.24057944682864915,
"learning_rate": 5.603820791755254e-09,
"loss": 1.4223,
"step": 3945
},
{
"epoch": 2.99,
"grad_norm": 0.28827145571809903,
"learning_rate": 1.9068674355415815e-09,
"loss": 1.6161,
"step": 3950
},
{
"epoch": 3.0,
"grad_norm": 0.4006550408742116,
"learning_rate": 1.5566310213044333e-10,
"loss": 1.556,
"step": 3955
},
{
"epoch": 3.0,
"step": 3957,
"total_flos": 1.244366244937728e+16,
"train_loss": 1.656136889695097,
"train_runtime": 24205.6115,
"train_samples_per_second": 0.654,
"train_steps_per_second": 0.163
}
],
"logging_steps": 5,
"max_steps": 3957,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 1.244366244937728e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}