{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.991755976916735,
"eval_steps": 500,
"global_step": 6060,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016488046166529267,
"grad_norm": 380.0,
"learning_rate": 3.300330033003301e-07,
"loss": 35.4867,
"step": 1
},
{
"epoch": 0.008244023083264633,
"grad_norm": 308.0,
"learning_rate": 1.65016501650165e-06,
"loss": 34.8873,
"step": 5
},
{
"epoch": 0.016488046166529265,
"grad_norm": 306.0,
"learning_rate": 3.3003300330033e-06,
"loss": 34.9252,
"step": 10
},
{
"epoch": 0.0247320692497939,
"grad_norm": 163.0,
"learning_rate": 4.950495049504951e-06,
"loss": 31.7188,
"step": 15
},
{
"epoch": 0.03297609233305853,
"grad_norm": 110.5,
"learning_rate": 6.6006600660066e-06,
"loss": 28.5443,
"step": 20
},
{
"epoch": 0.041220115416323165,
"grad_norm": 72.5,
"learning_rate": 8.250825082508252e-06,
"loss": 24.1835,
"step": 25
},
{
"epoch": 0.0494641384995878,
"grad_norm": 39.5,
"learning_rate": 9.900990099009901e-06,
"loss": 21.6514,
"step": 30
},
{
"epoch": 0.057708161582852434,
"grad_norm": 19.0,
"learning_rate": 1.155115511551155e-05,
"loss": 19.5766,
"step": 35
},
{
"epoch": 0.06595218466611706,
"grad_norm": 16.25,
"learning_rate": 1.32013201320132e-05,
"loss": 18.5587,
"step": 40
},
{
"epoch": 0.0741962077493817,
"grad_norm": 13.125,
"learning_rate": 1.4851485148514851e-05,
"loss": 17.2984,
"step": 45
},
{
"epoch": 0.08244023083264633,
"grad_norm": 9.375,
"learning_rate": 1.6501650165016504e-05,
"loss": 16.2291,
"step": 50
},
{
"epoch": 0.09068425391591096,
"grad_norm": 7.5,
"learning_rate": 1.8151815181518153e-05,
"loss": 15.5459,
"step": 55
},
{
"epoch": 0.0989282769991756,
"grad_norm": 5.1875,
"learning_rate": 1.9801980198019803e-05,
"loss": 15.0494,
"step": 60
},
{
"epoch": 0.10717230008244023,
"grad_norm": 3.546875,
"learning_rate": 2.1452145214521452e-05,
"loss": 14.5263,
"step": 65
},
{
"epoch": 0.11541632316570487,
"grad_norm": 3.21875,
"learning_rate": 2.31023102310231e-05,
"loss": 14.0492,
"step": 70
},
{
"epoch": 0.1236603462489695,
"grad_norm": 3.34375,
"learning_rate": 2.4752475247524754e-05,
"loss": 14.0437,
"step": 75
},
{
"epoch": 0.13190436933223412,
"grad_norm": 4.03125,
"learning_rate": 2.64026402640264e-05,
"loss": 13.4269,
"step": 80
},
{
"epoch": 0.14014839241549876,
"grad_norm": 4.46875,
"learning_rate": 2.8052805280528056e-05,
"loss": 13.1438,
"step": 85
},
{
"epoch": 0.1483924154987634,
"grad_norm": 5.4375,
"learning_rate": 2.9702970297029702e-05,
"loss": 12.8682,
"step": 90
},
{
"epoch": 0.15663643858202803,
"grad_norm": 7.5625,
"learning_rate": 3.135313531353136e-05,
"loss": 12.1682,
"step": 95
},
{
"epoch": 0.16488046166529266,
"grad_norm": 9.6875,
"learning_rate": 3.300330033003301e-05,
"loss": 11.6708,
"step": 100
},
{
"epoch": 0.1731244847485573,
"grad_norm": 13.875,
"learning_rate": 3.465346534653465e-05,
"loss": 10.6173,
"step": 105
},
{
"epoch": 0.18136850783182193,
"grad_norm": 20.375,
"learning_rate": 3.6303630363036307e-05,
"loss": 9.3535,
"step": 110
},
{
"epoch": 0.18961253091508656,
"grad_norm": 23.0,
"learning_rate": 3.7953795379537956e-05,
"loss": 7.0394,
"step": 115
},
{
"epoch": 0.1978565539983512,
"grad_norm": 19.0,
"learning_rate": 3.9603960396039605e-05,
"loss": 4.5084,
"step": 120
},
{
"epoch": 0.20610057708161583,
"grad_norm": 5.34375,
"learning_rate": 4.1254125412541255e-05,
"loss": 2.7198,
"step": 125
},
{
"epoch": 0.21434460016488047,
"grad_norm": 1.8671875,
"learning_rate": 4.2904290429042904e-05,
"loss": 1.9952,
"step": 130
},
{
"epoch": 0.2225886232481451,
"grad_norm": 1.0078125,
"learning_rate": 4.455445544554456e-05,
"loss": 1.7411,
"step": 135
},
{
"epoch": 0.23083264633140974,
"grad_norm": 0.95703125,
"learning_rate": 4.62046204620462e-05,
"loss": 1.5998,
"step": 140
},
{
"epoch": 0.23907666941467437,
"grad_norm": 1.0859375,
"learning_rate": 4.785478547854786e-05,
"loss": 1.5183,
"step": 145
},
{
"epoch": 0.247320692497939,
"grad_norm": 1.28125,
"learning_rate": 4.950495049504951e-05,
"loss": 1.4489,
"step": 150
},
{
"epoch": 0.25556471558120364,
"grad_norm": 1.09375,
"learning_rate": 5.115511551155116e-05,
"loss": 1.3924,
"step": 155
},
{
"epoch": 0.26380873866446825,
"grad_norm": 0.703125,
"learning_rate": 5.28052805280528e-05,
"loss": 1.3648,
"step": 160
},
{
"epoch": 0.2720527617477329,
"grad_norm": 0.71484375,
"learning_rate": 5.445544554455446e-05,
"loss": 1.3461,
"step": 165
},
{
"epoch": 0.2802967848309975,
"grad_norm": 0.671875,
"learning_rate": 5.610561056105611e-05,
"loss": 1.3065,
"step": 170
},
{
"epoch": 0.2885408079142622,
"grad_norm": 0.98828125,
"learning_rate": 5.7755775577557755e-05,
"loss": 1.2809,
"step": 175
},
{
"epoch": 0.2967848309975268,
"grad_norm": 0.640625,
"learning_rate": 5.9405940594059404e-05,
"loss": 1.2647,
"step": 180
},
{
"epoch": 0.30502885408079145,
"grad_norm": 1.296875,
"learning_rate": 6.105610561056106e-05,
"loss": 1.2387,
"step": 185
},
{
"epoch": 0.31327287716405605,
"grad_norm": 1.1171875,
"learning_rate": 6.270627062706272e-05,
"loss": 1.24,
"step": 190
},
{
"epoch": 0.3215169002473207,
"grad_norm": 1.4765625,
"learning_rate": 6.435643564356436e-05,
"loss": 1.2108,
"step": 195
},
{
"epoch": 0.3297609233305853,
"grad_norm": 1.7578125,
"learning_rate": 6.600660066006602e-05,
"loss": 1.2026,
"step": 200
},
{
"epoch": 0.33800494641385,
"grad_norm": 1.78125,
"learning_rate": 6.765676567656766e-05,
"loss": 1.1894,
"step": 205
},
{
"epoch": 0.3462489694971146,
"grad_norm": 1.5234375,
"learning_rate": 6.93069306930693e-05,
"loss": 1.2093,
"step": 210
},
{
"epoch": 0.35449299258037925,
"grad_norm": 1.0703125,
"learning_rate": 7.095709570957097e-05,
"loss": 1.1768,
"step": 215
},
{
"epoch": 0.36273701566364386,
"grad_norm": 1.6171875,
"learning_rate": 7.260726072607261e-05,
"loss": 1.1946,
"step": 220
},
{
"epoch": 0.37098103874690846,
"grad_norm": 2.828125,
"learning_rate": 7.425742574257426e-05,
"loss": 1.1678,
"step": 225
},
{
"epoch": 0.3792250618301731,
"grad_norm": 1.5,
"learning_rate": 7.590759075907591e-05,
"loss": 1.1618,
"step": 230
},
{
"epoch": 0.38746908491343773,
"grad_norm": 0.84375,
"learning_rate": 7.755775577557755e-05,
"loss": 1.1585,
"step": 235
},
{
"epoch": 0.3957131079967024,
"grad_norm": 1.5546875,
"learning_rate": 7.920792079207921e-05,
"loss": 1.1519,
"step": 240
},
{
"epoch": 0.403957131079967,
"grad_norm": 1.84375,
"learning_rate": 8.085808580858087e-05,
"loss": 1.1408,
"step": 245
},
{
"epoch": 0.41220115416323166,
"grad_norm": 1.40625,
"learning_rate": 8.250825082508251e-05,
"loss": 1.138,
"step": 250
},
{
"epoch": 0.42044517724649627,
"grad_norm": 0.80859375,
"learning_rate": 8.415841584158417e-05,
"loss": 1.1375,
"step": 255
},
{
"epoch": 0.42868920032976093,
"grad_norm": 1.4296875,
"learning_rate": 8.580858085808581e-05,
"loss": 1.1193,
"step": 260
},
{
"epoch": 0.43693322341302554,
"grad_norm": 2.15625,
"learning_rate": 8.745874587458746e-05,
"loss": 1.1178,
"step": 265
},
{
"epoch": 0.4451772464962902,
"grad_norm": 0.984375,
"learning_rate": 8.910891089108912e-05,
"loss": 1.1038,
"step": 270
},
{
"epoch": 0.4534212695795548,
"grad_norm": 1.5546875,
"learning_rate": 9.075907590759076e-05,
"loss": 1.1148,
"step": 275
},
{
"epoch": 0.46166529266281947,
"grad_norm": 0.84765625,
"learning_rate": 9.24092409240924e-05,
"loss": 1.112,
"step": 280
},
{
"epoch": 0.4699093157460841,
"grad_norm": 1.1640625,
"learning_rate": 9.405940594059406e-05,
"loss": 1.0882,
"step": 285
},
{
"epoch": 0.47815333882934874,
"grad_norm": 3.5625,
"learning_rate": 9.570957095709572e-05,
"loss": 1.0873,
"step": 290
},
{
"epoch": 0.48639736191261335,
"grad_norm": 0.80078125,
"learning_rate": 9.735973597359736e-05,
"loss": 1.0982,
"step": 295
},
{
"epoch": 0.494641384995878,
"grad_norm": 1.171875,
"learning_rate": 9.900990099009902e-05,
"loss": 1.074,
"step": 300
},
{
"epoch": 0.5028854080791426,
"grad_norm": 1.0078125,
"learning_rate": 0.00010066006600660067,
"loss": 1.0719,
"step": 305
},
{
"epoch": 0.5111294311624073,
"grad_norm": 4.96875,
"learning_rate": 0.00010231023102310232,
"loss": 1.0816,
"step": 310
},
{
"epoch": 0.5193734542456719,
"grad_norm": 0.95703125,
"learning_rate": 0.00010396039603960397,
"loss": 1.0681,
"step": 315
},
{
"epoch": 0.5276174773289365,
"grad_norm": 5.6875,
"learning_rate": 0.0001056105610561056,
"loss": 1.0689,
"step": 320
},
{
"epoch": 0.5358615004122012,
"grad_norm": 1.1328125,
"learning_rate": 0.00010726072607260727,
"loss": 1.0712,
"step": 325
},
{
"epoch": 0.5441055234954658,
"grad_norm": 0.9375,
"learning_rate": 0.00010891089108910893,
"loss": 1.063,
"step": 330
},
{
"epoch": 0.5523495465787304,
"grad_norm": 0.8125,
"learning_rate": 0.00011056105610561056,
"loss": 1.0622,
"step": 335
},
{
"epoch": 0.560593569661995,
"grad_norm": 5.0,
"learning_rate": 0.00011221122112211223,
"loss": 1.0614,
"step": 340
},
{
"epoch": 0.5688375927452597,
"grad_norm": 2.0,
"learning_rate": 0.00011386138613861385,
"loss": 1.0611,
"step": 345
},
{
"epoch": 0.5770816158285244,
"grad_norm": 1.75,
"learning_rate": 0.00011551155115511551,
"loss": 1.0451,
"step": 350
},
{
"epoch": 0.5853256389117889,
"grad_norm": 2.359375,
"learning_rate": 0.00011716171617161718,
"loss": 1.0506,
"step": 355
},
{
"epoch": 0.5935696619950536,
"grad_norm": 1.1796875,
"learning_rate": 0.00011881188118811881,
"loss": 1.0414,
"step": 360
},
{
"epoch": 0.6018136850783182,
"grad_norm": 2.703125,
"learning_rate": 0.00012046204620462047,
"loss": 1.0334,
"step": 365
},
{
"epoch": 0.6100577081615829,
"grad_norm": 1.3828125,
"learning_rate": 0.00012211221122112212,
"loss": 1.0388,
"step": 370
},
{
"epoch": 0.6183017312448474,
"grad_norm": 0.86328125,
"learning_rate": 0.00012376237623762376,
"loss": 1.0251,
"step": 375
},
{
"epoch": 0.6265457543281121,
"grad_norm": 2.234375,
"learning_rate": 0.00012541254125412543,
"loss": 1.0315,
"step": 380
},
{
"epoch": 0.6347897774113768,
"grad_norm": 1.5078125,
"learning_rate": 0.00012706270627062708,
"loss": 1.0342,
"step": 385
},
{
"epoch": 0.6430338004946414,
"grad_norm": 1.609375,
"learning_rate": 0.00012871287128712872,
"loss": 1.0258,
"step": 390
},
{
"epoch": 0.651277823577906,
"grad_norm": 1.53125,
"learning_rate": 0.00013036303630363036,
"loss": 1.02,
"step": 395
},
{
"epoch": 0.6595218466611706,
"grad_norm": 1.3515625,
"learning_rate": 0.00013201320132013203,
"loss": 1.0053,
"step": 400
},
{
"epoch": 0.6677658697444353,
"grad_norm": 2.09375,
"learning_rate": 0.00013366336633663367,
"loss": 1.0217,
"step": 405
},
{
"epoch": 0.6760098928277,
"grad_norm": 1.796875,
"learning_rate": 0.00013531353135313532,
"loss": 1.0066,
"step": 410
},
{
"epoch": 0.6842539159109645,
"grad_norm": 1.7421875,
"learning_rate": 0.00013696369636963699,
"loss": 1.0141,
"step": 415
},
{
"epoch": 0.6924979389942292,
"grad_norm": 1.46875,
"learning_rate": 0.0001386138613861386,
"loss": 1.0028,
"step": 420
},
{
"epoch": 0.7007419620774938,
"grad_norm": 3.875,
"learning_rate": 0.00014026402640264027,
"loss": 1.0207,
"step": 425
},
{
"epoch": 0.7089859851607585,
"grad_norm": 1.0,
"learning_rate": 0.00014191419141914194,
"loss": 1.0122,
"step": 430
},
{
"epoch": 0.717230008244023,
"grad_norm": 2.359375,
"learning_rate": 0.00014356435643564356,
"loss": 1.0145,
"step": 435
},
{
"epoch": 0.7254740313272877,
"grad_norm": 1.9609375,
"learning_rate": 0.00014521452145214523,
"loss": 1.0031,
"step": 440
},
{
"epoch": 0.7337180544105524,
"grad_norm": 1.2578125,
"learning_rate": 0.00014686468646864687,
"loss": 0.9987,
"step": 445
},
{
"epoch": 0.7419620774938169,
"grad_norm": 4.59375,
"learning_rate": 0.0001485148514851485,
"loss": 1.0024,
"step": 450
},
{
"epoch": 0.7502061005770816,
"grad_norm": 1.1875,
"learning_rate": 0.00015016501650165018,
"loss": 1.0048,
"step": 455
},
{
"epoch": 0.7584501236603463,
"grad_norm": 3.5,
"learning_rate": 0.00015181518151815182,
"loss": 1.0039,
"step": 460
},
{
"epoch": 0.7666941467436109,
"grad_norm": 5.59375,
"learning_rate": 0.00015346534653465347,
"loss": 1.0092,
"step": 465
},
{
"epoch": 0.7749381698268755,
"grad_norm": 3.578125,
"learning_rate": 0.0001551155115511551,
"loss": 1.0078,
"step": 470
},
{
"epoch": 0.7831821929101401,
"grad_norm": 2.1875,
"learning_rate": 0.00015676567656765678,
"loss": 1.0005,
"step": 475
},
{
"epoch": 0.7914262159934048,
"grad_norm": 3.765625,
"learning_rate": 0.00015841584158415842,
"loss": 0.9895,
"step": 480
},
{
"epoch": 0.7996702390766695,
"grad_norm": 1.6484375,
"learning_rate": 0.00016006600660066006,
"loss": 0.9923,
"step": 485
},
{
"epoch": 0.807914262159934,
"grad_norm": 2.703125,
"learning_rate": 0.00016171617161716173,
"loss": 0.9996,
"step": 490
},
{
"epoch": 0.8161582852431987,
"grad_norm": 1.4765625,
"learning_rate": 0.00016336633663366338,
"loss": 0.9955,
"step": 495
},
{
"epoch": 0.8244023083264633,
"grad_norm": 4.03125,
"learning_rate": 0.00016501650165016502,
"loss": 0.9931,
"step": 500
},
{
"epoch": 0.832646331409728,
"grad_norm": 4.15625,
"learning_rate": 0.0001666666666666667,
"loss": 1.0061,
"step": 505
},
{
"epoch": 0.8408903544929925,
"grad_norm": 2.640625,
"learning_rate": 0.00016831683168316833,
"loss": 1.0086,
"step": 510
},
{
"epoch": 0.8491343775762572,
"grad_norm": 1.296875,
"learning_rate": 0.00016996699669966997,
"loss": 0.9886,
"step": 515
},
{
"epoch": 0.8573784006595219,
"grad_norm": 14.4375,
"learning_rate": 0.00017161716171617162,
"loss": 0.9933,
"step": 520
},
{
"epoch": 0.8656224237427865,
"grad_norm": 1.96875,
"learning_rate": 0.00017326732673267329,
"loss": 1.0033,
"step": 525
},
{
"epoch": 0.8738664468260511,
"grad_norm": 0.68359375,
"learning_rate": 0.00017491749174917493,
"loss": 0.9905,
"step": 530
},
{
"epoch": 0.8821104699093157,
"grad_norm": 1.84375,
"learning_rate": 0.00017656765676567657,
"loss": 0.9717,
"step": 535
},
{
"epoch": 0.8903544929925804,
"grad_norm": 1.390625,
"learning_rate": 0.00017821782178217824,
"loss": 0.9656,
"step": 540
},
{
"epoch": 0.8985985160758451,
"grad_norm": 3.625,
"learning_rate": 0.00017986798679867986,
"loss": 0.9827,
"step": 545
},
{
"epoch": 0.9068425391591096,
"grad_norm": 3.453125,
"learning_rate": 0.00018151815181518153,
"loss": 0.9865,
"step": 550
},
{
"epoch": 0.9150865622423743,
"grad_norm": 1.0078125,
"learning_rate": 0.0001831683168316832,
"loss": 0.9815,
"step": 555
},
{
"epoch": 0.9233305853256389,
"grad_norm": 1.578125,
"learning_rate": 0.0001848184818481848,
"loss": 0.9799,
"step": 560
},
{
"epoch": 0.9315746084089035,
"grad_norm": 3.4375,
"learning_rate": 0.00018646864686468648,
"loss": 0.9611,
"step": 565
},
{
"epoch": 0.9398186314921682,
"grad_norm": 1.046875,
"learning_rate": 0.00018811881188118812,
"loss": 0.9652,
"step": 570
},
{
"epoch": 0.9480626545754328,
"grad_norm": 9.3125,
"learning_rate": 0.00018976897689768977,
"loss": 0.9676,
"step": 575
},
{
"epoch": 0.9563066776586975,
"grad_norm": 1.3125,
"learning_rate": 0.00019141914191419144,
"loss": 0.9692,
"step": 580
},
{
"epoch": 0.964550700741962,
"grad_norm": 6.4375,
"learning_rate": 0.00019306930693069308,
"loss": 0.9694,
"step": 585
},
{
"epoch": 0.9727947238252267,
"grad_norm": 1.25,
"learning_rate": 0.00019471947194719472,
"loss": 0.9823,
"step": 590
},
{
"epoch": 0.9810387469084914,
"grad_norm": 2.28125,
"learning_rate": 0.00019636963696369636,
"loss": 0.97,
"step": 595
},
{
"epoch": 0.989282769991756,
"grad_norm": 1.34375,
"learning_rate": 0.00019801980198019803,
"loss": 0.9746,
"step": 600
},
{
"epoch": 0.9975267930750206,
"grad_norm": 1.96875,
"learning_rate": 0.00019966996699669968,
"loss": 0.964,
"step": 605
},
{
"epoch": 0.9991755976916735,
"eval_loss": 2.485042095184326,
"eval_runtime": 0.2808,
"eval_samples_per_second": 35.608,
"eval_steps_per_second": 3.561,
"step": 606
},
{
"epoch": 1.0057708161582852,
"grad_norm": 1.3984375,
"learning_rate": 0.00019999973456433681,
"loss": 0.9535,
"step": 610
},
{
"epoch": 1.0140148392415498,
"grad_norm": 2.6875,
"learning_rate": 0.00019999865623437013,
"loss": 0.9553,
"step": 615
},
{
"epoch": 1.0222588623248146,
"grad_norm": 4.96875,
"learning_rate": 0.00019999674842930876,
"loss": 0.9556,
"step": 620
},
{
"epoch": 1.030502885408079,
"grad_norm": 1.9453125,
"learning_rate": 0.00019999401116497763,
"loss": 0.9746,
"step": 625
},
{
"epoch": 1.0387469084913439,
"grad_norm": 1.1953125,
"learning_rate": 0.000199990444464082,
"loss": 0.9639,
"step": 630
},
{
"epoch": 1.0469909315746084,
"grad_norm": 1.65625,
"learning_rate": 0.00019998604835620717,
"loss": 0.9585,
"step": 635
},
{
"epoch": 1.055234954657873,
"grad_norm": 1.6953125,
"learning_rate": 0.00019998082287781826,
"loss": 0.9563,
"step": 640
},
{
"epoch": 1.0634789777411378,
"grad_norm": 1.6171875,
"learning_rate": 0.00019997476807225985,
"loss": 0.9489,
"step": 645
},
{
"epoch": 1.0717230008244023,
"grad_norm": 4.15625,
"learning_rate": 0.00019996788398975578,
"loss": 0.9474,
"step": 650
},
{
"epoch": 1.0799670239076669,
"grad_norm": 5.53125,
"learning_rate": 0.0001999601706874085,
"loss": 0.9407,
"step": 655
},
{
"epoch": 1.0882110469909316,
"grad_norm": 2.875,
"learning_rate": 0.00019995162822919883,
"loss": 0.9514,
"step": 660
},
{
"epoch": 1.0964550700741962,
"grad_norm": 1.4140625,
"learning_rate": 0.00019994225668598526,
"loss": 0.9502,
"step": 665
},
{
"epoch": 1.104699093157461,
"grad_norm": 2.796875,
"learning_rate": 0.0001999320561355035,
"loss": 0.9502,
"step": 670
},
{
"epoch": 1.1129431162407255,
"grad_norm": 2.0,
"learning_rate": 0.00019992102666236566,
"loss": 0.9455,
"step": 675
},
{
"epoch": 1.12118713932399,
"grad_norm": 0.90625,
"learning_rate": 0.00019990916835805974,
"loss": 0.9429,
"step": 680
},
{
"epoch": 1.1294311624072548,
"grad_norm": 0.74609375,
"learning_rate": 0.00019989648132094873,
"loss": 0.9348,
"step": 685
},
{
"epoch": 1.1376751854905194,
"grad_norm": 0.76171875,
"learning_rate": 0.00019988296565626987,
"loss": 0.939,
"step": 690
},
{
"epoch": 1.145919208573784,
"grad_norm": 0.93359375,
"learning_rate": 0.0001998686214761337,
"loss": 0.9374,
"step": 695
},
{
"epoch": 1.1541632316570487,
"grad_norm": 1.375,
"learning_rate": 0.00019985344889952327,
"loss": 0.9326,
"step": 700
},
{
"epoch": 1.1624072547403133,
"grad_norm": 0.91015625,
"learning_rate": 0.00019983744805229296,
"loss": 0.9308,
"step": 705
},
{
"epoch": 1.1706512778235778,
"grad_norm": 1.75,
"learning_rate": 0.00019982061906716764,
"loss": 0.9436,
"step": 710
},
{
"epoch": 1.1788953009068426,
"grad_norm": 1.2734375,
"learning_rate": 0.00019980296208374143,
"loss": 0.9369,
"step": 715
},
{
"epoch": 1.1871393239901071,
"grad_norm": 20.0,
"learning_rate": 0.00019978447724847652,
"loss": 0.9334,
"step": 720
},
{
"epoch": 1.195383347073372,
"grad_norm": 1.46875,
"learning_rate": 0.00019976516471470216,
"loss": 0.9416,
"step": 725
},
{
"epoch": 1.2036273701566365,
"grad_norm": 8.75,
"learning_rate": 0.0001997450246426131,
"loss": 0.9382,
"step": 730
},
{
"epoch": 1.211871393239901,
"grad_norm": 0.86328125,
"learning_rate": 0.0001997240571992685,
"loss": 0.9315,
"step": 735
},
{
"epoch": 1.2201154163231658,
"grad_norm": 0.98046875,
"learning_rate": 0.00019970226255859038,
"loss": 0.9266,
"step": 740
},
{
"epoch": 1.2283594394064303,
"grad_norm": 1.5234375,
"learning_rate": 0.0001996796409013623,
"loss": 0.9299,
"step": 745
},
{
"epoch": 1.2366034624896949,
"grad_norm": 1.0,
"learning_rate": 0.0001996561924152278,
"loss": 0.9202,
"step": 750
},
{
"epoch": 1.2448474855729597,
"grad_norm": 1.078125,
"learning_rate": 0.00019963191729468888,
"loss": 0.9149,
"step": 755
},
{
"epoch": 1.2530915086562242,
"grad_norm": 1.0703125,
"learning_rate": 0.00019960681574110426,
"loss": 0.9165,
"step": 760
},
{
"epoch": 1.2613355317394888,
"grad_norm": 2.484375,
"learning_rate": 0.00019958088796268793,
"loss": 0.9188,
"step": 765
},
{
"epoch": 1.2695795548227535,
"grad_norm": 1.5546875,
"learning_rate": 0.0001995541341745072,
"loss": 0.9274,
"step": 770
},
{
"epoch": 1.277823577906018,
"grad_norm": 3.21875,
"learning_rate": 0.0001995265545984811,
"loss": 0.9136,
"step": 775
},
{
"epoch": 1.2860676009892829,
"grad_norm": 1.8203125,
"learning_rate": 0.00019949814946337838,
"loss": 0.9251,
"step": 780
},
{
"epoch": 1.2943116240725474,
"grad_norm": 3.078125,
"learning_rate": 0.00019946891900481578,
"loss": 0.9176,
"step": 785
},
{
"epoch": 1.302555647155812,
"grad_norm": 0.66796875,
"learning_rate": 0.0001994388634652559,
"loss": 0.9283,
"step": 790
},
{
"epoch": 1.3107996702390767,
"grad_norm": 1.953125,
"learning_rate": 0.00019940798309400526,
"loss": 0.9221,
"step": 795
},
{
"epoch": 1.3190436933223413,
"grad_norm": 1.421875,
"learning_rate": 0.00019937627814721237,
"loss": 0.9199,
"step": 800
},
{
"epoch": 1.327287716405606,
"grad_norm": 1.2890625,
"learning_rate": 0.00019934374888786537,
"loss": 0.9163,
"step": 805
},
{
"epoch": 1.3355317394888706,
"grad_norm": 1.5546875,
"learning_rate": 0.00019931039558578997,
"loss": 0.9181,
"step": 810
},
{
"epoch": 1.3437757625721352,
"grad_norm": 1.9765625,
"learning_rate": 0.00019927621851764725,
"loss": 0.9276,
"step": 815
},
{
"epoch": 1.3520197856553997,
"grad_norm": 1.4921875,
"learning_rate": 0.00019924121796693127,
"loss": 0.9199,
"step": 820
},
{
"epoch": 1.3602638087386645,
"grad_norm": 1.078125,
"learning_rate": 0.0001992053942239668,
"loss": 0.9213,
"step": 825
},
{
"epoch": 1.368507831821929,
"grad_norm": 1.296875,
"learning_rate": 0.00019916874758590684,
"loss": 0.9228,
"step": 830
},
{
"epoch": 1.3767518549051938,
"grad_norm": 1.3125,
"learning_rate": 0.00019913127835673023,
"loss": 0.9149,
"step": 835
},
{
"epoch": 1.3849958779884584,
"grad_norm": 0.73828125,
"learning_rate": 0.00019909298684723904,
"loss": 0.9086,
"step": 840
},
{
"epoch": 1.393239901071723,
"grad_norm": 1.125,
"learning_rate": 0.00019905387337505612,
"loss": 0.9092,
"step": 845
},
{
"epoch": 1.4014839241549877,
"grad_norm": 2.828125,
"learning_rate": 0.0001990139382646223,
"loss": 0.9041,
"step": 850
},
{
"epoch": 1.4097279472382522,
"grad_norm": 1.3203125,
"learning_rate": 0.00019897318184719385,
"loss": 0.9093,
"step": 855
},
{
"epoch": 1.417971970321517,
"grad_norm": 1.109375,
"learning_rate": 0.00019893160446083963,
"loss": 0.909,
"step": 860
},
{
"epoch": 1.4262159934047816,
"grad_norm": 1.0390625,
"learning_rate": 0.00019888920645043831,
"loss": 0.9014,
"step": 865
},
{
"epoch": 1.434460016488046,
"grad_norm": 1.8203125,
"learning_rate": 0.00019884598816767563,
"loss": 0.9036,
"step": 870
},
{
"epoch": 1.4427040395713109,
"grad_norm": 2.234375,
"learning_rate": 0.00019880194997104123,
"loss": 0.8999,
"step": 875
},
{
"epoch": 1.4509480626545754,
"grad_norm": 2.3125,
"learning_rate": 0.00019875709222582594,
"loss": 0.9,
"step": 880
},
{
"epoch": 1.45919208573784,
"grad_norm": 1.5390625,
"learning_rate": 0.00019871141530411853,
"loss": 0.8955,
"step": 885
},
{
"epoch": 1.4674361088211048,
"grad_norm": 1.65625,
"learning_rate": 0.00019866491958480284,
"loss": 0.9042,
"step": 890
},
{
"epoch": 1.4756801319043693,
"grad_norm": 0.96875,
"learning_rate": 0.00019861760545355442,
"loss": 0.9177,
"step": 895
},
{
"epoch": 1.4839241549876339,
"grad_norm": 4.5625,
"learning_rate": 0.00019856947330283752,
"loss": 0.8974,
"step": 900
},
{
"epoch": 1.4921681780708986,
"grad_norm": 1.3671875,
"learning_rate": 0.00019852052353190166,
"loss": 0.9064,
"step": 905
},
{
"epoch": 1.5004122011541632,
"grad_norm": 3.796875,
"learning_rate": 0.0001984707565467785,
"loss": 0.9086,
"step": 910
},
{
"epoch": 1.508656224237428,
"grad_norm": 6.6875,
"learning_rate": 0.00019842017276027832,
"loss": 0.9069,
"step": 915
},
{
"epoch": 1.5169002473206925,
"grad_norm": 1.3203125,
"learning_rate": 0.00019836877259198662,
"loss": 0.898,
"step": 920
},
{
"epoch": 1.525144270403957,
"grad_norm": 2.484375,
"learning_rate": 0.0001983165564682608,
"loss": 0.8999,
"step": 925
},
{
"epoch": 1.5333882934872216,
"grad_norm": 1.34375,
"learning_rate": 0.00019826352482222638,
"loss": 0.8987,
"step": 930
},
{
"epoch": 1.5416323165704864,
"grad_norm": 1.421875,
"learning_rate": 0.00019820967809377357,
"loss": 0.8791,
"step": 935
},
{
"epoch": 1.5498763396537512,
"grad_norm": 0.80859375,
"learning_rate": 0.00019815501672955358,
"loss": 0.8887,
"step": 940
},
{
"epoch": 1.5581203627370157,
"grad_norm": 6.0,
"learning_rate": 0.0001980995411829749,
"loss": 0.8955,
"step": 945
},
{
"epoch": 1.5663643858202803,
"grad_norm": 0.8984375,
"learning_rate": 0.00019804325191419956,
"loss": 0.8991,
"step": 950
},
{
"epoch": 1.5746084089035448,
"grad_norm": 1.4921875,
"learning_rate": 0.00019798614939013932,
"loss": 0.8916,
"step": 955
},
{
"epoch": 1.5828524319868096,
"grad_norm": 1.3984375,
"learning_rate": 0.00019792823408445174,
"loss": 0.9048,
"step": 960
},
{
"epoch": 1.5910964550700744,
"grad_norm": 1.1015625,
"learning_rate": 0.0001978695064775363,
"loss": 0.8828,
"step": 965
},
{
"epoch": 1.599340478153339,
"grad_norm": 0.96875,
"learning_rate": 0.00019780996705653044,
"loss": 0.8864,
"step": 970
},
{
"epoch": 1.6075845012366035,
"grad_norm": 0.99609375,
"learning_rate": 0.00019774961631530545,
"loss": 0.8908,
"step": 975
},
{
"epoch": 1.615828524319868,
"grad_norm": 1.0390625,
"learning_rate": 0.0001976884547544624,
"loss": 0.8853,
"step": 980
},
{
"epoch": 1.6240725474031328,
"grad_norm": 2.84375,
"learning_rate": 0.0001976264828813281,
"loss": 0.8835,
"step": 985
},
{
"epoch": 1.6323165704863973,
"grad_norm": 2.296875,
"learning_rate": 0.00019756370120995066,
"loss": 0.8817,
"step": 990
},
{
"epoch": 1.640560593569662,
"grad_norm": 27.25,
"learning_rate": 0.0001975001102610954,
"loss": 0.8972,
"step": 995
},
{
"epoch": 1.6488046166529267,
"grad_norm": 9.75,
"learning_rate": 0.0001974357105622405,
"loss": 0.9076,
"step": 1000
},
{
"epoch": 1.6570486397361912,
"grad_norm": 0.71484375,
"learning_rate": 0.0001973705026475726,
"loss": 0.9001,
"step": 1005
},
{
"epoch": 1.6652926628194558,
"grad_norm": 1.984375,
"learning_rate": 0.00019730448705798239,
"loss": 0.9172,
"step": 1010
},
{
"epoch": 1.6735366859027205,
"grad_norm": 1.375,
"learning_rate": 0.0001972376643410601,
"loss": 0.8945,
"step": 1015
},
{
"epoch": 1.6817807089859853,
"grad_norm": 2.71875,
"learning_rate": 0.00019717003505109095,
"loss": 0.8857,
"step": 1020
},
{
"epoch": 1.6900247320692499,
"grad_norm": 1.4375,
"learning_rate": 0.00019710159974905064,
"loss": 0.8852,
"step": 1025
},
{
"epoch": 1.6982687551525144,
"grad_norm": 2.984375,
"learning_rate": 0.00019703235900260055,
"loss": 0.8795,
"step": 1030
},
{
"epoch": 1.706512778235779,
"grad_norm": 1.2578125,
"learning_rate": 0.00019696231338608316,
"loss": 0.8926,
"step": 1035
},
{
"epoch": 1.7147568013190437,
"grad_norm": 4.90625,
"learning_rate": 0.00019689146348051719,
"loss": 0.8927,
"step": 1040
},
{
"epoch": 1.7230008244023083,
"grad_norm": 1.765625,
"learning_rate": 0.0001968198098735929,
"loss": 0.8762,
"step": 1045
},
{
"epoch": 1.731244847485573,
"grad_norm": 6.75,
"learning_rate": 0.0001967473531596671,
"loss": 0.8886,
"step": 1050
},
{
"epoch": 1.7394888705688376,
"grad_norm": 12.125,
"learning_rate": 0.00019667409393975822,
"loss": 0.8865,
"step": 1055
},
{
"epoch": 1.7477328936521022,
"grad_norm": 1.171875,
"learning_rate": 0.00019660003282154147,
"loss": 0.887,
"step": 1060
},
{
"epoch": 1.7559769167353667,
"grad_norm": 0.84765625,
"learning_rate": 0.00019652517041934356,
"loss": 0.8669,
"step": 1065
},
{
"epoch": 1.7642209398186315,
"grad_norm": 0.7890625,
"learning_rate": 0.00019644950735413788,
"loss": 0.8774,
"step": 1070
},
{
"epoch": 1.7724649629018963,
"grad_norm": 0.98828125,
"learning_rate": 0.00019637304425353916,
"loss": 0.8717,
"step": 1075
},
{
"epoch": 1.7807089859851608,
"grad_norm": 0.7578125,
"learning_rate": 0.0001962957817517982,
"loss": 0.8769,
"step": 1080
},
{
"epoch": 1.7889530090684254,
"grad_norm": 4.59375,
"learning_rate": 0.0001962177204897969,
"loss": 0.872,
"step": 1085
},
{
"epoch": 1.79719703215169,
"grad_norm": 0.69140625,
"learning_rate": 0.0001961388611150427,
"loss": 0.8727,
"step": 1090
},
{
"epoch": 1.8054410552349547,
"grad_norm": 2.171875,
"learning_rate": 0.00019605920428166323,
"loss": 0.8671,
"step": 1095
},
{
"epoch": 1.8136850783182195,
"grad_norm": 7.78125,
"learning_rate": 0.00019597875065040094,
"loss": 0.8927,
"step": 1100
},
{
"epoch": 1.821929101401484,
"grad_norm": 10.9375,
"learning_rate": 0.00019589750088860766,
"loss": 0.881,
"step": 1105
},
{
"epoch": 1.8301731244847486,
"grad_norm": 1.1328125,
"learning_rate": 0.000195815455670239,
"loss": 0.8793,
"step": 1110
},
{
"epoch": 1.838417147568013,
"grad_norm": 3.890625,
"learning_rate": 0.00019573261567584874,
"loss": 0.8795,
"step": 1115
},
{
"epoch": 1.8466611706512777,
"grad_norm": 1.1171875,
"learning_rate": 0.00019564898159258324,
"loss": 0.8933,
"step": 1120
},
{
"epoch": 1.8549051937345424,
"grad_norm": 0.921875,
"learning_rate": 0.00019556455411417573,
"loss": 0.8626,
"step": 1125
},
{
"epoch": 1.8631492168178072,
"grad_norm": 1.5625,
"learning_rate": 0.0001954793339409405,
"loss": 0.8616,
"step": 1130
},
{
"epoch": 1.8713932399010718,
"grad_norm": 2.625,
"learning_rate": 0.00019539332177976714,
"loss": 0.8693,
"step": 1135
},
{
"epoch": 1.8796372629843363,
"grad_norm": 0.875,
"learning_rate": 0.00019530651834411474,
"loss": 0.8659,
"step": 1140
},
{
"epoch": 1.8878812860676009,
"grad_norm": 6.0,
"learning_rate": 0.00019521892435400587,
"loss": 0.8666,
"step": 1145
},
{
"epoch": 1.8961253091508656,
"grad_norm": 1.1484375,
"learning_rate": 0.00019513054053602055,
"loss": 0.8601,
"step": 1150
},
{
"epoch": 1.9043693322341304,
"grad_norm": 2.125,
"learning_rate": 0.00019504136762329047,
"loss": 0.8631,
"step": 1155
},
{
"epoch": 1.912613355317395,
"grad_norm": 3.296875,
"learning_rate": 0.00019495140635549261,
"loss": 0.8833,
"step": 1160
},
{
"epoch": 1.9208573784006595,
"grad_norm": 2.4375,
"learning_rate": 0.00019486065747884333,
"loss": 0.8555,
"step": 1165
},
{
"epoch": 1.929101401483924,
"grad_norm": 1.2734375,
"learning_rate": 0.0001947691217460921,
"loss": 0.8602,
"step": 1170
},
{
"epoch": 1.9373454245671888,
"grad_norm": 1.546875,
"learning_rate": 0.0001946767999165152,
"loss": 0.8553,
"step": 1175
},
{
"epoch": 1.9455894476504534,
"grad_norm": 0.94921875,
"learning_rate": 0.00019458369275590954,
"loss": 0.8588,
"step": 1180
},
{
"epoch": 1.9538334707337182,
"grad_norm": 2.21875,
"learning_rate": 0.00019448980103658613,
"loss": 0.8529,
"step": 1185
},
{
"epoch": 1.9620774938169827,
"grad_norm": 8.6875,
"learning_rate": 0.00019439512553736394,
"loss": 0.8441,
"step": 1190
},
{
"epoch": 1.9703215169002473,
"grad_norm": 0.83984375,
"learning_rate": 0.0001942996670435632,
"loss": 0.8526,
"step": 1195
},
{
"epoch": 1.9785655399835118,
"grad_norm": 6.0625,
"learning_rate": 0.0001942034263469989,
"loss": 0.8547,
"step": 1200
},
{
"epoch": 1.9868095630667766,
"grad_norm": 13.0625,
"learning_rate": 0.0001941064042459745,
"loss": 0.8686,
"step": 1205
},
{
"epoch": 1.9950535861500414,
"grad_norm": 0.7734375,
"learning_rate": 0.00019400860154527493,
"loss": 0.8499,
"step": 1210
},
{
"epoch": 2.0,
"eval_loss": 2.4393434524536133,
"eval_runtime": 0.2359,
"eval_samples_per_second": 42.391,
"eval_steps_per_second": 4.239,
"step": 1213
},
{
"epoch": 2.003297609233306,
"grad_norm": 1.1328125,
"learning_rate": 0.0001939100190561601,
"loss": 0.8486,
"step": 1215
},
{
"epoch": 2.0115416323165705,
"grad_norm": 2.515625,
"learning_rate": 0.00019381065759635822,
"loss": 0.8375,
"step": 1220
},
{
"epoch": 2.019785655399835,
"grad_norm": 1.046875,
"learning_rate": 0.0001937105179900589,
"loss": 0.8531,
"step": 1225
},
{
"epoch": 2.0280296784830996,
"grad_norm": 1.75,
"learning_rate": 0.00019360960106790643,
"loss": 0.8369,
"step": 1230
},
{
"epoch": 2.0362737015663646,
"grad_norm": 0.58203125,
"learning_rate": 0.00019350790766699282,
"loss": 0.8276,
"step": 1235
},
{
"epoch": 2.044517724649629,
"grad_norm": 1.0390625,
"learning_rate": 0.0001934054386308508,
"loss": 0.8289,
"step": 1240
},
{
"epoch": 2.0527617477328937,
"grad_norm": 0.57421875,
"learning_rate": 0.00019330219480944694,
"loss": 0.8292,
"step": 1245
},
{
"epoch": 2.061005770816158,
"grad_norm": 0.828125,
"learning_rate": 0.0001931981770591745,
"loss": 0.8305,
"step": 1250
},
{
"epoch": 2.0692497938994228,
"grad_norm": 0.77734375,
"learning_rate": 0.00019309338624284644,
"loss": 0.8243,
"step": 1255
},
{
"epoch": 2.0774938169826878,
"grad_norm": 1.265625,
"learning_rate": 0.00019298782322968815,
"loss": 0.8225,
"step": 1260
},
{
"epoch": 2.0857378400659523,
"grad_norm": 4.03125,
"learning_rate": 0.0001928814888953303,
"loss": 0.8212,
"step": 1265
},
{
"epoch": 2.093981863149217,
"grad_norm": 2.015625,
"learning_rate": 0.0001927743841218016,
"loss": 0.8188,
"step": 1270
},
{
"epoch": 2.1022258862324814,
"grad_norm": 1.015625,
"learning_rate": 0.00019266650979752136,
"loss": 0.8209,
"step": 1275
},
{
"epoch": 2.110469909315746,
"grad_norm": 8.25,
"learning_rate": 0.00019255786681729225,
"loss": 0.8242,
"step": 1280
},
{
"epoch": 2.1187139323990105,
"grad_norm": 5.53125,
"learning_rate": 0.00019244845608229293,
"loss": 0.828,
"step": 1285
},
{
"epoch": 2.1269579554822755,
"grad_norm": 0.6953125,
"learning_rate": 0.00019233827850007027,
"loss": 0.8159,
"step": 1290
},
{
"epoch": 2.13520197856554,
"grad_norm": 7.03125,
"learning_rate": 0.00019222733498453222,
"loss": 0.8196,
"step": 1295
},
{
"epoch": 2.1434460016488046,
"grad_norm": 0.84765625,
"learning_rate": 0.00019211562645594002,
"loss": 0.8231,
"step": 1300
},
{
"epoch": 2.151690024732069,
"grad_norm": 0.474609375,
"learning_rate": 0.00019200315384090044,
"loss": 0.8073,
"step": 1305
},
{
"epoch": 2.1599340478153337,
"grad_norm": 1.484375,
"learning_rate": 0.00019188991807235844,
"loss": 0.8255,
"step": 1310
},
{
"epoch": 2.1681780708985987,
"grad_norm": 0.5859375,
"learning_rate": 0.0001917759200895891,
"loss": 0.8185,
"step": 1315
},
{
"epoch": 2.1764220939818633,
"grad_norm": 8.0,
"learning_rate": 0.00019166116083819002,
"loss": 0.8174,
"step": 1320
},
{
"epoch": 2.184666117065128,
"grad_norm": 0.96875,
"learning_rate": 0.00019154564127007336,
"loss": 0.8263,
"step": 1325
},
{
"epoch": 2.1929101401483924,
"grad_norm": 1.171875,
"learning_rate": 0.0001914293623434581,
"loss": 0.8333,
"step": 1330
},
{
"epoch": 2.201154163231657,
"grad_norm": 2.546875,
"learning_rate": 0.00019131232502286188,
"loss": 0.8227,
"step": 1335
},
{
"epoch": 2.209398186314922,
"grad_norm": 1.1171875,
"learning_rate": 0.00019119453027909323,
"loss": 0.8123,
"step": 1340
},
{
"epoch": 2.2176422093981865,
"grad_norm": 0.96484375,
"learning_rate": 0.0001910759790892433,
"loss": 0.8129,
"step": 1345
},
{
"epoch": 2.225886232481451,
"grad_norm": 0.90625,
"learning_rate": 0.0001909566724366779,
"loss": 0.8101,
"step": 1350
},
{
"epoch": 2.2341302555647156,
"grad_norm": 2.203125,
"learning_rate": 0.00019083661131102933,
"loss": 0.8205,
"step": 1355
},
{
"epoch": 2.24237427864798,
"grad_norm": 0.9921875,
"learning_rate": 0.00019071579670818808,
"loss": 0.8228,
"step": 1360
},
{
"epoch": 2.2506183017312447,
"grad_norm": 0.546875,
"learning_rate": 0.00019059422963029464,
"loss": 0.8123,
"step": 1365
},
{
"epoch": 2.2588623248145097,
"grad_norm": 0.7421875,
"learning_rate": 0.00019047191108573125,
"loss": 0.8227,
"step": 1370
},
{
"epoch": 2.267106347897774,
"grad_norm": 1.4609375,
"learning_rate": 0.00019034884208911335,
"loss": 0.814,
"step": 1375
},
{
"epoch": 2.2753503709810388,
"grad_norm": 0.78515625,
"learning_rate": 0.00019022502366128135,
"loss": 0.819,
"step": 1380
},
{
"epoch": 2.2835943940643033,
"grad_norm": 0.6484375,
"learning_rate": 0.00019010045682929213,
"loss": 0.8074,
"step": 1385
},
{
"epoch": 2.291838417147568,
"grad_norm": 0.71484375,
"learning_rate": 0.00018997514262641035,
"loss": 0.8224,
"step": 1390
},
{
"epoch": 2.300082440230833,
"grad_norm": 0.61328125,
"learning_rate": 0.0001898490820921001,
"loss": 0.8096,
"step": 1395
},
{
"epoch": 2.3083264633140974,
"grad_norm": 0.51953125,
"learning_rate": 0.00018972227627201617,
"loss": 0.8102,
"step": 1400
},
{
"epoch": 2.316570486397362,
"grad_norm": 0.482421875,
"learning_rate": 0.0001895947262179954,
"loss": 0.8113,
"step": 1405
},
{
"epoch": 2.3248145094806265,
"grad_norm": 0.52734375,
"learning_rate": 0.00018946643298804793,
"loss": 0.8109,
"step": 1410
},
{
"epoch": 2.333058532563891,
"grad_norm": 0.474609375,
"learning_rate": 0.00018933739764634847,
"loss": 0.809,
"step": 1415
},
{
"epoch": 2.3413025556471556,
"grad_norm": 0.54296875,
"learning_rate": 0.0001892076212632274,
"loss": 0.8153,
"step": 1420
},
{
"epoch": 2.3495465787304206,
"grad_norm": 0.578125,
"learning_rate": 0.00018907710491516199,
"loss": 0.8161,
"step": 1425
},
{
"epoch": 2.357790601813685,
"grad_norm": 0.60546875,
"learning_rate": 0.00018894584968476733,
"loss": 0.8141,
"step": 1430
},
{
"epoch": 2.3660346248969497,
"grad_norm": 0.6328125,
"learning_rate": 0.00018881385666078755,
"loss": 0.8102,
"step": 1435
},
{
"epoch": 2.3742786479802143,
"grad_norm": 0.4921875,
"learning_rate": 0.00018868112693808665,
"loss": 0.8124,
"step": 1440
},
{
"epoch": 2.382522671063479,
"grad_norm": 0.609375,
"learning_rate": 0.00018854766161763932,
"loss": 0.8033,
"step": 1445
},
{
"epoch": 2.390766694146744,
"grad_norm": 0.59765625,
"learning_rate": 0.00018841346180652213,
"loss": 0.812,
"step": 1450
},
{
"epoch": 2.3990107172300084,
"grad_norm": 0.46875,
"learning_rate": 0.00018827852861790398,
"loss": 0.8059,
"step": 1455
},
{
"epoch": 2.407254740313273,
"grad_norm": 0.70703125,
"learning_rate": 0.00018814286317103714,
"loss": 0.8021,
"step": 1460
},
{
"epoch": 2.4154987633965375,
"grad_norm": 1.4921875,
"learning_rate": 0.00018800646659124782,
"loss": 0.8036,
"step": 1465
},
{
"epoch": 2.423742786479802,
"grad_norm": 0.6484375,
"learning_rate": 0.00018786934000992688,
"loss": 0.8045,
"step": 1470
},
{
"epoch": 2.4319868095630666,
"grad_norm": 0.58984375,
"learning_rate": 0.00018773148456452046,
"loss": 0.8108,
"step": 1475
},
{
"epoch": 2.4402308326463316,
"grad_norm": 0.73828125,
"learning_rate": 0.00018759290139852048,
"loss": 0.8097,
"step": 1480
},
{
"epoch": 2.448474855729596,
"grad_norm": 0.79296875,
"learning_rate": 0.00018745359166145523,
"loss": 0.8052,
"step": 1485
},
{
"epoch": 2.4567188788128607,
"grad_norm": 1.203125,
"learning_rate": 0.00018731355650887985,
"loss": 0.8016,
"step": 1490
},
{
"epoch": 2.464962901896125,
"grad_norm": 0.4453125,
"learning_rate": 0.00018717279710236666,
"loss": 0.8077,
"step": 1495
},
{
"epoch": 2.4732069249793898,
"grad_norm": 0.9921875,
"learning_rate": 0.00018703131460949554,
"loss": 0.8031,
"step": 1500
},
{
"epoch": 2.4814509480626548,
"grad_norm": 5.46875,
"learning_rate": 0.00018688911020384432,
"loss": 0.8062,
"step": 1505
},
{
"epoch": 2.4896949711459193,
"grad_norm": 0.7421875,
"learning_rate": 0.000186746185064979,
"loss": 0.8156,
"step": 1510
},
{
"epoch": 2.497938994229184,
"grad_norm": 0.77734375,
"learning_rate": 0.00018660254037844388,
"loss": 0.8083,
"step": 1515
},
{
"epoch": 2.5061830173124484,
"grad_norm": 0.70703125,
"learning_rate": 0.00018645817733575193,
"loss": 0.812,
"step": 1520
},
{
"epoch": 2.514427040395713,
"grad_norm": 3.671875,
"learning_rate": 0.00018631309713437467,
"loss": 0.796,
"step": 1525
},
{
"epoch": 2.5226710634789775,
"grad_norm": 0.6484375,
"learning_rate": 0.0001861673009777325,
"loss": 0.7988,
"step": 1530
},
{
"epoch": 2.5309150865622425,
"grad_norm": 1.546875,
"learning_rate": 0.00018602079007518438,
"loss": 0.7988,
"step": 1535
},
{
"epoch": 2.539159109645507,
"grad_norm": 0.4375,
"learning_rate": 0.00018587356564201817,
"loss": 0.8045,
"step": 1540
},
{
"epoch": 2.5474031327287716,
"grad_norm": 0.44140625,
"learning_rate": 0.0001857256288994402,
"loss": 0.8112,
"step": 1545
},
{
"epoch": 2.555647155812036,
"grad_norm": 0.56640625,
"learning_rate": 0.00018557698107456549,
"loss": 0.808,
"step": 1550
},
{
"epoch": 2.563891178895301,
"grad_norm": 0.453125,
"learning_rate": 0.00018542762340040722,
"loss": 0.7958,
"step": 1555
},
{
"epoch": 2.5721352019785657,
"grad_norm": 0.859375,
"learning_rate": 0.00018527755711586678,
"loss": 0.8008,
"step": 1560
},
{
"epoch": 2.5803792250618303,
"grad_norm": 0.462890625,
"learning_rate": 0.00018512678346572337,
"loss": 0.7995,
"step": 1565
},
{
"epoch": 2.588623248145095,
"grad_norm": 0.734375,
"learning_rate": 0.00018497530370062363,
"loss": 0.7974,
"step": 1570
},
{
"epoch": 2.5968672712283594,
"grad_norm": 0.51171875,
"learning_rate": 0.0001848231190770714,
"loss": 0.7929,
"step": 1575
},
{
"epoch": 2.605111294311624,
"grad_norm": 0.78125,
"learning_rate": 0.00018467023085741717,
"loss": 0.8014,
"step": 1580
},
{
"epoch": 2.6133553173948885,
"grad_norm": 0.9140625,
"learning_rate": 0.00018451664030984773,
"loss": 0.7944,
"step": 1585
},
{
"epoch": 2.6215993404781535,
"grad_norm": 0.4453125,
"learning_rate": 0.00018436234870837547,
"loss": 0.7937,
"step": 1590
},
{
"epoch": 2.629843363561418,
"grad_norm": 1.0703125,
"learning_rate": 0.00018420735733282807,
"loss": 0.7983,
"step": 1595
},
{
"epoch": 2.6380873866446826,
"grad_norm": 0.455078125,
"learning_rate": 0.00018405166746883762,
"loss": 0.7924,
"step": 1600
},
{
"epoch": 2.646331409727947,
"grad_norm": 0.474609375,
"learning_rate": 0.00018389528040783012,
"loss": 0.7953,
"step": 1605
},
{
"epoch": 2.654575432811212,
"grad_norm": 0.578125,
"learning_rate": 0.00018373819744701476,
"loss": 0.7893,
"step": 1610
},
{
"epoch": 2.6628194558944767,
"grad_norm": 0.412109375,
"learning_rate": 0.00018358041988937305,
"loss": 0.7945,
"step": 1615
},
{
"epoch": 2.671063478977741,
"grad_norm": 0.8125,
"learning_rate": 0.00018342194904364813,
"loss": 0.7894,
"step": 1620
},
{
"epoch": 2.6793075020610058,
"grad_norm": 0.64453125,
"learning_rate": 0.00018326278622433386,
"loss": 0.7925,
"step": 1625
},
{
"epoch": 2.6875515251442703,
"grad_norm": 0.5390625,
"learning_rate": 0.00018310293275166392,
"loss": 0.7978,
"step": 1630
},
{
"epoch": 2.695795548227535,
"grad_norm": 0.63671875,
"learning_rate": 0.00018294238995160094,
"loss": 0.792,
"step": 1635
},
{
"epoch": 2.7040395713107994,
"grad_norm": 0.671875,
"learning_rate": 0.00018278115915582526,
"loss": 0.8069,
"step": 1640
},
{
"epoch": 2.7122835943940644,
"grad_norm": 1.515625,
"learning_rate": 0.0001826192417017242,
"loss": 0.8048,
"step": 1645
},
{
"epoch": 2.720527617477329,
"grad_norm": 0.54296875,
"learning_rate": 0.00018245663893238075,
"loss": 0.8009,
"step": 1650
},
{
"epoch": 2.7287716405605935,
"grad_norm": 0.6640625,
"learning_rate": 0.0001822933521965625,
"loss": 0.7903,
"step": 1655
},
{
"epoch": 2.737015663643858,
"grad_norm": 0.48046875,
"learning_rate": 0.00018212938284871047,
"loss": 0.7917,
"step": 1660
},
{
"epoch": 2.745259686727123,
"grad_norm": 0.58203125,
"learning_rate": 0.00018196473224892784,
"loss": 0.7886,
"step": 1665
},
{
"epoch": 2.7535037098103876,
"grad_norm": 0.62890625,
"learning_rate": 0.0001817994017629687,
"loss": 0.7933,
"step": 1670
},
{
"epoch": 2.761747732893652,
"grad_norm": 0.78515625,
"learning_rate": 0.00018163339276222666,
"loss": 0.792,
"step": 1675
},
{
"epoch": 2.7699917559769167,
"grad_norm": 0.65625,
"learning_rate": 0.00018146670662372354,
"loss": 0.7825,
"step": 1680
},
{
"epoch": 2.7782357790601813,
"grad_norm": 1.0234375,
"learning_rate": 0.0001812993447300979,
"loss": 0.7929,
"step": 1685
},
{
"epoch": 2.786479802143446,
"grad_norm": 0.6171875,
"learning_rate": 0.00018113130846959368,
"loss": 0.7925,
"step": 1690
},
{
"epoch": 2.7947238252267104,
"grad_norm": 0.48828125,
"learning_rate": 0.0001809625992360485,
"loss": 0.7888,
"step": 1695
},
{
"epoch": 2.8029678483099754,
"grad_norm": 0.400390625,
"learning_rate": 0.00018079321842888227,
"loss": 0.7995,
"step": 1700
},
{
"epoch": 2.81121187139324,
"grad_norm": 0.48828125,
"learning_rate": 0.00018062316745308542,
"loss": 0.7939,
"step": 1705
},
{
"epoch": 2.8194558944765045,
"grad_norm": 0.45703125,
"learning_rate": 0.0001804524477192075,
"loss": 0.79,
"step": 1710
},
{
"epoch": 2.827699917559769,
"grad_norm": 0.462890625,
"learning_rate": 0.0001802810606433451,
"loss": 0.7927,
"step": 1715
},
{
"epoch": 2.835943940643034,
"grad_norm": 0.4609375,
"learning_rate": 0.00018010900764713048,
"loss": 0.796,
"step": 1720
},
{
"epoch": 2.8441879637262986,
"grad_norm": 0.75,
"learning_rate": 0.0001799362901577196,
"loss": 0.7921,
"step": 1725
},
{
"epoch": 2.852431986809563,
"grad_norm": 0.482421875,
"learning_rate": 0.00017976290960778024,
"loss": 0.79,
"step": 1730
},
{
"epoch": 2.8606760098928277,
"grad_norm": 0.71484375,
"learning_rate": 0.0001795888674354802,
"loss": 0.7927,
"step": 1735
},
{
"epoch": 2.868920032976092,
"grad_norm": 0.458984375,
"learning_rate": 0.00017941416508447536,
"loss": 0.7917,
"step": 1740
},
{
"epoch": 2.8771640560593568,
"grad_norm": 1.2265625,
"learning_rate": 0.0001792388040038977,
"loss": 0.7905,
"step": 1745
},
{
"epoch": 2.8854080791426218,
"grad_norm": 0.7578125,
"learning_rate": 0.00017906278564834324,
"loss": 0.7934,
"step": 1750
},
{
"epoch": 2.8936521022258863,
"grad_norm": 0.4296875,
"learning_rate": 0.00017888611147786002,
"loss": 0.7957,
"step": 1755
},
{
"epoch": 2.901896125309151,
"grad_norm": 0.55078125,
"learning_rate": 0.00017870878295793598,
"loss": 0.7793,
"step": 1760
},
{
"epoch": 2.9101401483924154,
"grad_norm": 0.7421875,
"learning_rate": 0.0001785308015594868,
"loss": 0.7912,
"step": 1765
},
{
"epoch": 2.91838417147568,
"grad_norm": 0.447265625,
"learning_rate": 0.00017835216875884368,
"loss": 0.7842,
"step": 1770
},
{
"epoch": 2.926628194558945,
"grad_norm": 0.6640625,
"learning_rate": 0.00017817288603774116,
"loss": 0.784,
"step": 1775
},
{
"epoch": 2.9348722176422095,
"grad_norm": 0.828125,
"learning_rate": 0.00017799295488330467,
"loss": 0.7934,
"step": 1780
},
{
"epoch": 2.943116240725474,
"grad_norm": 0.53515625,
"learning_rate": 0.00017781237678803847,
"loss": 0.7867,
"step": 1785
},
{
"epoch": 2.9513602638087386,
"grad_norm": 0.470703125,
"learning_rate": 0.00017763115324981294,
"loss": 0.7911,
"step": 1790
},
{
"epoch": 2.959604286892003,
"grad_norm": 0.703125,
"learning_rate": 0.00017744928577185243,
"loss": 0.7914,
"step": 1795
},
{
"epoch": 2.9678483099752677,
"grad_norm": 0.62109375,
"learning_rate": 0.00017726677586272263,
"loss": 0.7917,
"step": 1800
},
{
"epoch": 2.9760923330585327,
"grad_norm": 0.455078125,
"learning_rate": 0.00017708362503631814,
"loss": 0.7819,
"step": 1805
},
{
"epoch": 2.9843363561417973,
"grad_norm": 0.419921875,
"learning_rate": 0.00017689983481184989,
"loss": 0.7842,
"step": 1810
},
{
"epoch": 2.992580379225062,
"grad_norm": 0.5078125,
"learning_rate": 0.00017671540671383243,
"loss": 0.7939,
"step": 1815
},
{
"epoch": 2.9991755976916736,
"eval_loss": 2.4241690635681152,
"eval_runtime": 0.2578,
"eval_samples_per_second": 38.793,
"eval_steps_per_second": 3.879,
"step": 1819
},
{
"epoch": 3.0008244023083264,
"grad_norm": 0.416015625,
"learning_rate": 0.00017653034227207152,
"loss": 0.7885,
"step": 1820
},
{
"epoch": 3.009068425391591,
"grad_norm": 1.0390625,
"learning_rate": 0.00017634464302165124,
"loss": 0.772,
"step": 1825
},
{
"epoch": 3.017312448474856,
"grad_norm": 0.4765625,
"learning_rate": 0.0001761583105029213,
"loss": 0.7668,
"step": 1830
},
{
"epoch": 3.0255564715581205,
"grad_norm": 1.1484375,
"learning_rate": 0.00017597134626148427,
"loss": 0.77,
"step": 1835
},
{
"epoch": 3.033800494641385,
"grad_norm": 0.88671875,
"learning_rate": 0.0001757837518481829,
"loss": 0.7713,
"step": 1840
},
{
"epoch": 3.0420445177246496,
"grad_norm": 0.74609375,
"learning_rate": 0.00017559552881908695,
"loss": 0.7748,
"step": 1845
},
{
"epoch": 3.050288540807914,
"grad_norm": 0.6015625,
"learning_rate": 0.00017540667873548063,
"loss": 0.7653,
"step": 1850
},
{
"epoch": 3.058532563891179,
"grad_norm": 0.50390625,
"learning_rate": 0.00017521720316384935,
"loss": 0.7706,
"step": 1855
},
{
"epoch": 3.0667765869744437,
"grad_norm": 0.9140625,
"learning_rate": 0.00017502710367586687,
"loss": 0.7633,
"step": 1860
},
{
"epoch": 3.075020610057708,
"grad_norm": 0.384765625,
"learning_rate": 0.00017483638184838239,
"loss": 0.7568,
"step": 1865
},
{
"epoch": 3.0832646331409728,
"grad_norm": 0.5390625,
"learning_rate": 0.0001746450392634071,
"loss": 0.757,
"step": 1870
},
{
"epoch": 3.0915086562242373,
"grad_norm": 0.44140625,
"learning_rate": 0.0001744530775081015,
"loss": 0.7701,
"step": 1875
},
{
"epoch": 3.099752679307502,
"grad_norm": 0.44140625,
"learning_rate": 0.00017426049817476197,
"loss": 0.7717,
"step": 1880
},
{
"epoch": 3.107996702390767,
"grad_norm": 0.52734375,
"learning_rate": 0.00017406730286080753,
"loss": 0.7647,
"step": 1885
},
{
"epoch": 3.1162407254740314,
"grad_norm": 0.5,
"learning_rate": 0.00017387349316876666,
"loss": 0.7618,
"step": 1890
},
{
"epoch": 3.124484748557296,
"grad_norm": 0.443359375,
"learning_rate": 0.00017367907070626424,
"loss": 0.7712,
"step": 1895
},
{
"epoch": 3.1327287716405605,
"grad_norm": 0.51953125,
"learning_rate": 0.00017348403708600772,
"loss": 0.7635,
"step": 1900
},
{
"epoch": 3.140972794723825,
"grad_norm": 0.58203125,
"learning_rate": 0.0001732883939257742,
"loss": 0.7591,
"step": 1905
},
{
"epoch": 3.14921681780709,
"grad_norm": 0.48046875,
"learning_rate": 0.00017309214284839678,
"loss": 0.7664,
"step": 1910
},
{
"epoch": 3.1574608408903546,
"grad_norm": 0.486328125,
"learning_rate": 0.00017289528548175114,
"loss": 0.7633,
"step": 1915
},
{
"epoch": 3.165704863973619,
"grad_norm": 0.482421875,
"learning_rate": 0.00017269782345874203,
"loss": 0.7676,
"step": 1920
},
{
"epoch": 3.1739488870568837,
"grad_norm": 0.45703125,
"learning_rate": 0.0001724997584172898,
"loss": 0.7712,
"step": 1925
},
{
"epoch": 3.1821929101401483,
"grad_norm": 0.48046875,
"learning_rate": 0.00017230109200031668,
"loss": 0.7631,
"step": 1930
},
{
"epoch": 3.190436933223413,
"grad_norm": 0.412109375,
"learning_rate": 0.00017210182585573327,
"loss": 0.7664,
"step": 1935
},
{
"epoch": 3.198680956306678,
"grad_norm": 0.6484375,
"learning_rate": 0.00017190196163642483,
"loss": 0.7653,
"step": 1940
},
{
"epoch": 3.2069249793899424,
"grad_norm": 0.60546875,
"learning_rate": 0.0001717015010002376,
"loss": 0.7677,
"step": 1945
},
{
"epoch": 3.215169002473207,
"grad_norm": 0.5234375,
"learning_rate": 0.00017150044560996488,
"loss": 0.7628,
"step": 1950
},
{
"epoch": 3.2234130255564715,
"grad_norm": 0.52734375,
"learning_rate": 0.00017129879713333356,
"loss": 0.7604,
"step": 1955
},
{
"epoch": 3.231657048639736,
"grad_norm": 0.419921875,
"learning_rate": 0.00017109655724298995,
"loss": 0.7664,
"step": 1960
},
{
"epoch": 3.239901071723001,
"grad_norm": 0.6484375,
"learning_rate": 0.00017089372761648616,
"loss": 0.7679,
"step": 1965
},
{
"epoch": 3.2481450948062656,
"grad_norm": 0.5234375,
"learning_rate": 0.00017069030993626603,
"loss": 0.7621,
"step": 1970
},
{
"epoch": 3.25638911788953,
"grad_norm": 0.703125,
"learning_rate": 0.00017048630588965117,
"loss": 0.7747,
"step": 1975
},
{
"epoch": 3.2646331409727947,
"grad_norm": 0.625,
"learning_rate": 0.00017028171716882714,
"loss": 0.7655,
"step": 1980
},
{
"epoch": 3.272877164056059,
"grad_norm": 0.61328125,
"learning_rate": 0.00017007654547082922,
"loss": 0.768,
"step": 1985
},
{
"epoch": 3.281121187139324,
"grad_norm": 0.6796875,
"learning_rate": 0.00016987079249752843,
"loss": 0.7631,
"step": 1990
},
{
"epoch": 3.2893652102225888,
"grad_norm": 0.7421875,
"learning_rate": 0.00016966445995561727,
"loss": 0.7686,
"step": 1995
},
{
"epoch": 3.2976092333058533,
"grad_norm": 0.54296875,
"learning_rate": 0.00016945754955659595,
"loss": 0.7695,
"step": 2000
},
{
"epoch": 3.305853256389118,
"grad_norm": 0.59765625,
"learning_rate": 0.00016925006301675763,
"loss": 0.7548,
"step": 2005
},
{
"epoch": 3.3140972794723824,
"grad_norm": 0.4140625,
"learning_rate": 0.0001690420020571747,
"loss": 0.7642,
"step": 2010
},
{
"epoch": 3.322341302555647,
"grad_norm": 0.431640625,
"learning_rate": 0.00016883336840368412,
"loss": 0.7706,
"step": 2015
},
{
"epoch": 3.330585325638912,
"grad_norm": 0.4375,
"learning_rate": 0.0001686241637868734,
"loss": 0.7693,
"step": 2020
},
{
"epoch": 3.3388293487221765,
"grad_norm": 0.50390625,
"learning_rate": 0.00016841438994206595,
"loss": 0.7616,
"step": 2025
},
{
"epoch": 3.347073371805441,
"grad_norm": 0.99609375,
"learning_rate": 0.0001682040486093071,
"loss": 0.7661,
"step": 2030
},
{
"epoch": 3.3553173948887056,
"grad_norm": 0.65234375,
"learning_rate": 0.00016799314153334916,
"loss": 0.7543,
"step": 2035
},
{
"epoch": 3.36356141797197,
"grad_norm": 0.92578125,
"learning_rate": 0.00016778167046363734,
"loss": 0.757,
"step": 2040
},
{
"epoch": 3.371805441055235,
"grad_norm": 1.1640625,
"learning_rate": 0.00016756963715429502,
"loss": 0.7647,
"step": 2045
},
{
"epoch": 3.3800494641384997,
"grad_norm": 0.5234375,
"learning_rate": 0.00016735704336410943,
"loss": 0.7562,
"step": 2050
},
{
"epoch": 3.3882934872217643,
"grad_norm": 0.70703125,
"learning_rate": 0.0001671438908565167,
"loss": 0.7573,
"step": 2055
},
{
"epoch": 3.396537510305029,
"grad_norm": 0.50390625,
"learning_rate": 0.00016693018139958763,
"loss": 0.7585,
"step": 2060
},
{
"epoch": 3.4047815333882934,
"grad_norm": 1.1953125,
"learning_rate": 0.00016671591676601272,
"loss": 0.7538,
"step": 2065
},
{
"epoch": 3.413025556471558,
"grad_norm": 0.458984375,
"learning_rate": 0.00016650109873308765,
"loss": 0.7635,
"step": 2070
},
{
"epoch": 3.421269579554823,
"grad_norm": 0.443359375,
"learning_rate": 0.00016628572908269841,
"loss": 0.7605,
"step": 2075
},
{
"epoch": 3.4295136026380875,
"grad_norm": 0.421875,
"learning_rate": 0.00016606980960130665,
"loss": 0.7511,
"step": 2080
},
{
"epoch": 3.437757625721352,
"grad_norm": 0.61328125,
"learning_rate": 0.00016585334207993476,
"loss": 0.757,
"step": 2085
},
{
"epoch": 3.4460016488046166,
"grad_norm": 0.7109375,
"learning_rate": 0.00016563632831415102,
"loss": 0.7616,
"step": 2090
},
{
"epoch": 3.454245671887881,
"grad_norm": 0.423828125,
"learning_rate": 0.00016541877010405477,
"loss": 0.7605,
"step": 2095
},
{
"epoch": 3.462489694971146,
"grad_norm": 0.52734375,
"learning_rate": 0.00016520066925426144,
"loss": 0.7564,
"step": 2100
},
{
"epoch": 3.4707337180544107,
"grad_norm": 0.59375,
"learning_rate": 0.00016498202757388758,
"loss": 0.7627,
"step": 2105
},
{
"epoch": 3.478977741137675,
"grad_norm": 0.55859375,
"learning_rate": 0.0001647628468765358,
"loss": 0.7514,
"step": 2110
},
{
"epoch": 3.4872217642209398,
"grad_norm": 0.640625,
"learning_rate": 0.0001645431289802799,
"loss": 0.7616,
"step": 2115
},
{
"epoch": 3.4954657873042043,
"grad_norm": 0.546875,
"learning_rate": 0.00016432287570764952,
"loss": 0.7639,
"step": 2120
},
{
"epoch": 3.503709810387469,
"grad_norm": 0.56640625,
"learning_rate": 0.0001641020888856153,
"loss": 0.7642,
"step": 2125
},
{
"epoch": 3.511953833470734,
"grad_norm": 0.609375,
"learning_rate": 0.00016388077034557355,
"loss": 0.7511,
"step": 2130
},
{
"epoch": 3.5201978565539984,
"grad_norm": 0.65234375,
"learning_rate": 0.0001636589219233311,
"loss": 0.7513,
"step": 2135
},
{
"epoch": 3.528441879637263,
"grad_norm": 0.458984375,
"learning_rate": 0.00016343654545909007,
"loss": 0.7568,
"step": 2140
},
{
"epoch": 3.5366859027205275,
"grad_norm": 0.435546875,
"learning_rate": 0.00016321364279743266,
"loss": 0.7562,
"step": 2145
},
{
"epoch": 3.5449299258037925,
"grad_norm": 0.70703125,
"learning_rate": 0.00016299021578730579,
"loss": 0.7591,
"step": 2150
},
{
"epoch": 3.553173948887057,
"grad_norm": 0.59375,
"learning_rate": 0.00016276626628200568,
"loss": 0.7665,
"step": 2155
},
{
"epoch": 3.5614179719703216,
"grad_norm": 0.5234375,
"learning_rate": 0.00016254179613916278,
"loss": 0.7604,
"step": 2160
},
{
"epoch": 3.569661995053586,
"grad_norm": 1.0,
"learning_rate": 0.000162316807220726,
"loss": 0.7504,
"step": 2165
},
{
"epoch": 3.5779060181368507,
"grad_norm": 0.65234375,
"learning_rate": 0.00016209130139294744,
"loss": 0.7646,
"step": 2170
},
{
"epoch": 3.5861500412201153,
"grad_norm": 0.69921875,
"learning_rate": 0.00016186528052636692,
"loss": 0.7562,
"step": 2175
},
{
"epoch": 3.59439406430338,
"grad_norm": 0.63671875,
"learning_rate": 0.00016163874649579647,
"loss": 0.7501,
"step": 2180
},
{
"epoch": 3.602638087386645,
"grad_norm": 0.482421875,
"learning_rate": 0.00016141170118030463,
"loss": 0.7548,
"step": 2185
},
{
"epoch": 3.6108821104699094,
"grad_norm": 0.453125,
"learning_rate": 0.0001611841464632011,
"loss": 0.7582,
"step": 2190
},
{
"epoch": 3.619126133553174,
"grad_norm": 0.50390625,
"learning_rate": 0.00016095608423202098,
"loss": 0.7517,
"step": 2195
},
{
"epoch": 3.6273701566364385,
"grad_norm": 0.392578125,
"learning_rate": 0.00016072751637850904,
"loss": 0.7563,
"step": 2200
},
{
"epoch": 3.6356141797197035,
"grad_norm": 0.451171875,
"learning_rate": 0.00016049844479860422,
"loss": 0.7566,
"step": 2205
},
{
"epoch": 3.643858202802968,
"grad_norm": 0.41796875,
"learning_rate": 0.00016026887139242372,
"loss": 0.7515,
"step": 2210
},
{
"epoch": 3.6521022258862326,
"grad_norm": 0.49609375,
"learning_rate": 0.0001600387980642474,
"loss": 0.754,
"step": 2215
},
{
"epoch": 3.660346248969497,
"grad_norm": 0.74609375,
"learning_rate": 0.0001598082267225018,
"loss": 0.7608,
"step": 2220
},
{
"epoch": 3.6685902720527617,
"grad_norm": 0.5546875,
"learning_rate": 0.0001595771592797445,
"loss": 0.7574,
"step": 2225
},
{
"epoch": 3.676834295136026,
"grad_norm": 0.59765625,
"learning_rate": 0.0001593455976526482,
"loss": 0.7526,
"step": 2230
},
{
"epoch": 3.6850783182192908,
"grad_norm": 0.40625,
"learning_rate": 0.0001591135437619847,
"loss": 0.7546,
"step": 2235
},
{
"epoch": 3.6933223413025558,
"grad_norm": 0.478515625,
"learning_rate": 0.00015888099953260905,
"loss": 0.7574,
"step": 2240
},
{
"epoch": 3.7015663643858203,
"grad_norm": 0.6171875,
"learning_rate": 0.0001586479668934437,
"loss": 0.7548,
"step": 2245
},
{
"epoch": 3.709810387469085,
"grad_norm": 0.7421875,
"learning_rate": 0.0001584144477774623,
"loss": 0.7519,
"step": 2250
},
{
"epoch": 3.7180544105523494,
"grad_norm": 0.75,
"learning_rate": 0.0001581804441216738,
"loss": 0.761,
"step": 2255
},
{
"epoch": 3.7262984336356144,
"grad_norm": 0.86328125,
"learning_rate": 0.00015794595786710632,
"loss": 0.7552,
"step": 2260
},
{
"epoch": 3.734542456718879,
"grad_norm": 0.58984375,
"learning_rate": 0.00015771099095879108,
"loss": 0.7573,
"step": 2265
},
{
"epoch": 3.7427864798021435,
"grad_norm": 1.1484375,
"learning_rate": 0.00015747554534574626,
"loss": 0.753,
"step": 2270
},
{
"epoch": 3.751030502885408,
"grad_norm": 0.46875,
"learning_rate": 0.0001572396229809608,
"loss": 0.7587,
"step": 2275
},
{
"epoch": 3.7592745259686726,
"grad_norm": 0.5859375,
"learning_rate": 0.00015700322582137827,
"loss": 0.7505,
"step": 2280
},
{
"epoch": 3.767518549051937,
"grad_norm": 0.423828125,
"learning_rate": 0.0001567663558278806,
"loss": 0.747,
"step": 2285
},
{
"epoch": 3.7757625721352017,
"grad_norm": 0.6328125,
"learning_rate": 0.0001565290149652718,
"loss": 0.763,
"step": 2290
},
{
"epoch": 3.7840065952184667,
"grad_norm": 0.640625,
"learning_rate": 0.00015629120520226165,
"loss": 0.7547,
"step": 2295
},
{
"epoch": 3.7922506183017313,
"grad_norm": 0.63671875,
"learning_rate": 0.00015605292851144942,
"loss": 0.7537,
"step": 2300
},
{
"epoch": 3.800494641384996,
"grad_norm": 0.5078125,
"learning_rate": 0.00015581418686930743,
"loss": 0.754,
"step": 2305
},
{
"epoch": 3.8087386644682604,
"grad_norm": 0.470703125,
"learning_rate": 0.00015557498225616487,
"loss": 0.7407,
"step": 2310
},
{
"epoch": 3.8169826875515254,
"grad_norm": 0.546875,
"learning_rate": 0.00015533531665619098,
"loss": 0.7556,
"step": 2315
},
{
"epoch": 3.82522671063479,
"grad_norm": 0.97265625,
"learning_rate": 0.00015509519205737896,
"loss": 0.7516,
"step": 2320
},
{
"epoch": 3.8334707337180545,
"grad_norm": 0.6328125,
"learning_rate": 0.0001548546104515294,
"loss": 0.7506,
"step": 2325
},
{
"epoch": 3.841714756801319,
"grad_norm": 0.486328125,
"learning_rate": 0.0001546135738342335,
"loss": 0.7524,
"step": 2330
},
{
"epoch": 3.8499587798845836,
"grad_norm": 0.81640625,
"learning_rate": 0.0001543720842048569,
"loss": 0.748,
"step": 2335
},
{
"epoch": 3.858202802967848,
"grad_norm": 0.443359375,
"learning_rate": 0.00015413014356652286,
"loss": 0.7503,
"step": 2340
},
{
"epoch": 3.8664468260511127,
"grad_norm": 0.486328125,
"learning_rate": 0.00015388775392609564,
"loss": 0.754,
"step": 2345
},
{
"epoch": 3.8746908491343777,
"grad_norm": 0.439453125,
"learning_rate": 0.000153644917294164,
"loss": 0.7511,
"step": 2350
},
{
"epoch": 3.882934872217642,
"grad_norm": 0.5078125,
"learning_rate": 0.0001534016356850244,
"loss": 0.7492,
"step": 2355
},
{
"epoch": 3.8911788953009068,
"grad_norm": 0.4140625,
"learning_rate": 0.00015315791111666425,
"loss": 0.7529,
"step": 2360
},
{
"epoch": 3.8994229183841713,
"grad_norm": 0.58984375,
"learning_rate": 0.00015291374561074536,
"loss": 0.7481,
"step": 2365
},
{
"epoch": 3.9076669414674363,
"grad_norm": 0.431640625,
"learning_rate": 0.000152669141192587,
"loss": 0.752,
"step": 2370
},
{
"epoch": 3.915910964550701,
"grad_norm": 0.41015625,
"learning_rate": 0.00015242409989114916,
"loss": 0.7389,
"step": 2375
},
{
"epoch": 3.9241549876339654,
"grad_norm": 0.46484375,
"learning_rate": 0.00015217862373901575,
"loss": 0.7521,
"step": 2380
},
{
"epoch": 3.93239901071723,
"grad_norm": 0.5546875,
"learning_rate": 0.0001519327147723776,
"loss": 0.742,
"step": 2385
},
{
"epoch": 3.9406430338004945,
"grad_norm": 0.73046875,
"learning_rate": 0.00015168637503101584,
"loss": 0.7499,
"step": 2390
},
{
"epoch": 3.948887056883759,
"grad_norm": 0.486328125,
"learning_rate": 0.00015143960655828468,
"loss": 0.7516,
"step": 2395
},
{
"epoch": 3.957131079967024,
"grad_norm": 0.384765625,
"learning_rate": 0.00015119241140109467,
"loss": 0.7493,
"step": 2400
},
{
"epoch": 3.9653751030502886,
"grad_norm": 0.458984375,
"learning_rate": 0.0001509447916098956,
"loss": 0.7445,
"step": 2405
},
{
"epoch": 3.973619126133553,
"grad_norm": 0.40625,
"learning_rate": 0.0001506967492386596,
"loss": 0.7535,
"step": 2410
},
{
"epoch": 3.9818631492168177,
"grad_norm": 0.466796875,
"learning_rate": 0.000150448286344864,
"loss": 0.7411,
"step": 2415
},
{
"epoch": 3.9901071723000827,
"grad_norm": 0.87890625,
"learning_rate": 0.00015019940498947428,
"loss": 0.7484,
"step": 2420
},
{
"epoch": 3.9983511953833473,
"grad_norm": 0.439453125,
"learning_rate": 0.00014995010723692714,
"loss": 0.7465,
"step": 2425
},
{
"epoch": 4.0,
"eval_loss": 2.436275005340576,
"eval_runtime": 0.2365,
"eval_samples_per_second": 42.283,
"eval_steps_per_second": 4.228,
"step": 2426
},
{
"epoch": 4.006595218466612,
"grad_norm": 0.47265625,
"learning_rate": 0.00014970039515511304,
"loss": 0.7483,
"step": 2430
},
{
"epoch": 4.014839241549876,
"grad_norm": 0.439453125,
"learning_rate": 0.00014945027081535937,
"loss": 0.7256,
"step": 2435
},
{
"epoch": 4.023083264633141,
"grad_norm": 0.6171875,
"learning_rate": 0.00014919973629241314,
"loss": 0.7386,
"step": 2440
},
{
"epoch": 4.0313272877164055,
"grad_norm": 0.4765625,
"learning_rate": 0.0001489487936644237,
"loss": 0.7329,
"step": 2445
},
{
"epoch": 4.03957131079967,
"grad_norm": 0.84765625,
"learning_rate": 0.00014869744501292561,
"loss": 0.7317,
"step": 2450
},
{
"epoch": 4.047815333882935,
"grad_norm": 0.4375,
"learning_rate": 0.00014844569242282148,
"loss": 0.7278,
"step": 2455
},
{
"epoch": 4.056059356966199,
"grad_norm": 0.5234375,
"learning_rate": 0.00014819353798236427,
"loss": 0.73,
"step": 2460
},
{
"epoch": 4.064303380049465,
"grad_norm": 0.91796875,
"learning_rate": 0.0001479409837831404,
"loss": 0.7357,
"step": 2465
},
{
"epoch": 4.072547403132729,
"grad_norm": 0.78125,
"learning_rate": 0.00014768803192005223,
"loss": 0.7341,
"step": 2470
},
{
"epoch": 4.080791426215994,
"grad_norm": 0.404296875,
"learning_rate": 0.00014743468449130063,
"loss": 0.7367,
"step": 2475
},
{
"epoch": 4.089035449299258,
"grad_norm": 0.53125,
"learning_rate": 0.00014718094359836772,
"loss": 0.7322,
"step": 2480
},
{
"epoch": 4.097279472382523,
"grad_norm": 0.453125,
"learning_rate": 0.00014692681134599925,
"loss": 0.73,
"step": 2485
},
{
"epoch": 4.105523495465787,
"grad_norm": 0.44140625,
"learning_rate": 0.0001466722898421873,
"loss": 0.7364,
"step": 2490
},
{
"epoch": 4.113767518549052,
"grad_norm": 0.4375,
"learning_rate": 0.00014641738119815266,
"loss": 0.7267,
"step": 2495
},
{
"epoch": 4.122011541632316,
"grad_norm": 0.412109375,
"learning_rate": 0.00014616208752832758,
"loss": 0.7282,
"step": 2500
},
{
"epoch": 4.130255564715581,
"grad_norm": 0.431640625,
"learning_rate": 0.00014590641095033787,
"loss": 0.7251,
"step": 2505
},
{
"epoch": 4.1384995877988455,
"grad_norm": 0.5234375,
"learning_rate": 0.0001456503535849855,
"loss": 0.7391,
"step": 2510
},
{
"epoch": 4.14674361088211,
"grad_norm": 0.546875,
"learning_rate": 0.0001453939175562312,
"loss": 0.7346,
"step": 2515
},
{
"epoch": 4.1549876339653755,
"grad_norm": 0.6328125,
"learning_rate": 0.00014513710499117647,
"loss": 0.7362,
"step": 2520
},
{
"epoch": 4.16323165704864,
"grad_norm": 0.451171875,
"learning_rate": 0.00014487991802004623,
"loss": 0.731,
"step": 2525
},
{
"epoch": 4.171475680131905,
"grad_norm": 0.484375,
"learning_rate": 0.00014462235877617098,
"loss": 0.7285,
"step": 2530
},
{
"epoch": 4.179719703215169,
"grad_norm": 0.52734375,
"learning_rate": 0.0001443644293959693,
"loss": 0.7386,
"step": 2535
},
{
"epoch": 4.187963726298434,
"grad_norm": 0.494140625,
"learning_rate": 0.00014410613201892985,
"loss": 0.7376,
"step": 2540
},
{
"epoch": 4.196207749381698,
"grad_norm": 0.4765625,
"learning_rate": 0.0001438474687875938,
"loss": 0.731,
"step": 2545
},
{
"epoch": 4.204451772464963,
"grad_norm": 0.384765625,
"learning_rate": 0.00014358844184753712,
"loss": 0.7238,
"step": 2550
},
{
"epoch": 4.212695795548227,
"grad_norm": 0.45703125,
"learning_rate": 0.00014332905334735261,
"loss": 0.7246,
"step": 2555
},
{
"epoch": 4.220939818631492,
"grad_norm": 0.5625,
"learning_rate": 0.00014306930543863219,
"loss": 0.7394,
"step": 2560
},
{
"epoch": 4.2291838417147565,
"grad_norm": 0.47265625,
"learning_rate": 0.00014280920027594907,
"loss": 0.7306,
"step": 2565
},
{
"epoch": 4.237427864798021,
"grad_norm": 0.57421875,
"learning_rate": 0.00014254874001683976,
"loss": 0.7418,
"step": 2570
},
{
"epoch": 4.2456718878812865,
"grad_norm": 0.45703125,
"learning_rate": 0.00014228792682178623,
"loss": 0.7291,
"step": 2575
},
{
"epoch": 4.253915910964551,
"grad_norm": 0.43359375,
"learning_rate": 0.00014202676285419812,
"loss": 0.7273,
"step": 2580
},
{
"epoch": 4.262159934047816,
"grad_norm": 0.50390625,
"learning_rate": 0.00014176525028039452,
"loss": 0.7311,
"step": 2585
},
{
"epoch": 4.27040395713108,
"grad_norm": 0.423828125,
"learning_rate": 0.00014150339126958633,
"loss": 0.7214,
"step": 2590
},
{
"epoch": 4.278647980214345,
"grad_norm": 0.43359375,
"learning_rate": 0.00014124118799385796,
"loss": 0.7324,
"step": 2595
},
{
"epoch": 4.286892003297609,
"grad_norm": 0.66015625,
"learning_rate": 0.00014097864262814955,
"loss": 0.7397,
"step": 2600
},
{
"epoch": 4.295136026380874,
"grad_norm": 0.625,
"learning_rate": 0.00014071575735023875,
"loss": 0.7382,
"step": 2605
},
{
"epoch": 4.303380049464138,
"grad_norm": 0.46875,
"learning_rate": 0.0001404525343407228,
"loss": 0.7324,
"step": 2610
},
{
"epoch": 4.311624072547403,
"grad_norm": 0.41015625,
"learning_rate": 0.00014018897578300035,
"loss": 0.7327,
"step": 2615
},
{
"epoch": 4.319868095630667,
"grad_norm": 0.43359375,
"learning_rate": 0.0001399250838632533,
"loss": 0.7419,
"step": 2620
},
{
"epoch": 4.328112118713932,
"grad_norm": 0.4921875,
"learning_rate": 0.0001396608607704289,
"loss": 0.738,
"step": 2625
},
{
"epoch": 4.336356141797197,
"grad_norm": 0.70703125,
"learning_rate": 0.00013939630869622133,
"loss": 0.7412,
"step": 2630
},
{
"epoch": 4.344600164880462,
"grad_norm": 0.42578125,
"learning_rate": 0.00013913142983505364,
"loss": 0.7336,
"step": 2635
},
{
"epoch": 4.3528441879637265,
"grad_norm": 0.53515625,
"learning_rate": 0.00013886622638405952,
"loss": 0.7282,
"step": 2640
},
{
"epoch": 4.361088211046991,
"grad_norm": 0.400390625,
"learning_rate": 0.00013860070054306516,
"loss": 0.7306,
"step": 2645
},
{
"epoch": 4.369332234130256,
"grad_norm": 0.84765625,
"learning_rate": 0.0001383348545145708,
"loss": 0.7279,
"step": 2650
},
{
"epoch": 4.37757625721352,
"grad_norm": 0.4765625,
"learning_rate": 0.0001380686905037327,
"loss": 0.7355,
"step": 2655
},
{
"epoch": 4.385820280296785,
"grad_norm": 1.0546875,
"learning_rate": 0.00013780221071834476,
"loss": 0.7336,
"step": 2660
},
{
"epoch": 4.394064303380049,
"grad_norm": 0.423828125,
"learning_rate": 0.0001375354173688201,
"loss": 0.7314,
"step": 2665
},
{
"epoch": 4.402308326463314,
"grad_norm": 0.58984375,
"learning_rate": 0.00013726831266817278,
"loss": 0.7344,
"step": 2670
},
{
"epoch": 4.410552349546578,
"grad_norm": 0.61328125,
"learning_rate": 0.00013700089883199966,
"loss": 0.7361,
"step": 2675
},
{
"epoch": 4.418796372629844,
"grad_norm": 0.57421875,
"learning_rate": 0.0001367331780784616,
"loss": 0.7322,
"step": 2680
},
{
"epoch": 4.427040395713108,
"grad_norm": 0.5234375,
"learning_rate": 0.00013646515262826552,
"loss": 0.7332,
"step": 2685
},
{
"epoch": 4.435284418796373,
"grad_norm": 0.46875,
"learning_rate": 0.00013619682470464558,
"loss": 0.7321,
"step": 2690
},
{
"epoch": 4.4435284418796375,
"grad_norm": 0.71484375,
"learning_rate": 0.00013592819653334505,
"loss": 0.7262,
"step": 2695
},
{
"epoch": 4.451772464962902,
"grad_norm": 0.443359375,
"learning_rate": 0.0001356592703425976,
"loss": 0.7273,
"step": 2700
},
{
"epoch": 4.460016488046167,
"grad_norm": 0.61328125,
"learning_rate": 0.00013539004836310894,
"loss": 0.7378,
"step": 2705
},
{
"epoch": 4.468260511129431,
"grad_norm": 0.51171875,
"learning_rate": 0.0001351205328280385,
"loss": 0.7254,
"step": 2710
},
{
"epoch": 4.476504534212696,
"grad_norm": 0.53125,
"learning_rate": 0.00013485072597298038,
"loss": 0.729,
"step": 2715
},
{
"epoch": 4.48474855729596,
"grad_norm": 0.443359375,
"learning_rate": 0.00013458063003594543,
"loss": 0.7375,
"step": 2720
},
{
"epoch": 4.492992580379225,
"grad_norm": 0.44140625,
"learning_rate": 0.0001343102472573423,
"loss": 0.7278,
"step": 2725
},
{
"epoch": 4.501236603462489,
"grad_norm": 0.4609375,
"learning_rate": 0.00013403957987995882,
"loss": 0.7363,
"step": 2730
},
{
"epoch": 4.509480626545754,
"grad_norm": 0.515625,
"learning_rate": 0.00013376863014894375,
"loss": 0.7341,
"step": 2735
},
{
"epoch": 4.517724649629019,
"grad_norm": 0.423828125,
"learning_rate": 0.00013349740031178784,
"loss": 0.7325,
"step": 2740
},
{
"epoch": 4.525968672712284,
"grad_norm": 0.447265625,
"learning_rate": 0.00013322589261830517,
"loss": 0.7376,
"step": 2745
},
{
"epoch": 4.534212695795548,
"grad_norm": 0.4375,
"learning_rate": 0.00013295410932061478,
"loss": 0.727,
"step": 2750
},
{
"epoch": 4.542456718878813,
"grad_norm": 0.431640625,
"learning_rate": 0.00013268205267312174,
"loss": 0.729,
"step": 2755
},
{
"epoch": 4.5507007419620775,
"grad_norm": 0.412109375,
"learning_rate": 0.00013240972493249847,
"loss": 0.7355,
"step": 2760
},
{
"epoch": 4.558944765045342,
"grad_norm": 0.4921875,
"learning_rate": 0.00013213712835766607,
"loss": 0.7362,
"step": 2765
},
{
"epoch": 4.567188788128607,
"grad_norm": 0.4609375,
"learning_rate": 0.0001318642652097757,
"loss": 0.7319,
"step": 2770
},
{
"epoch": 4.575432811211871,
"grad_norm": 0.384765625,
"learning_rate": 0.00013159113775218964,
"loss": 0.7265,
"step": 2775
},
{
"epoch": 4.583676834295136,
"grad_norm": 0.39453125,
"learning_rate": 0.00013131774825046245,
"loss": 0.7343,
"step": 2780
},
{
"epoch": 4.5919208573784,
"grad_norm": 0.447265625,
"learning_rate": 0.00013104409897232258,
"loss": 0.7231,
"step": 2785
},
{
"epoch": 4.600164880461666,
"grad_norm": 0.4609375,
"learning_rate": 0.00013077019218765305,
"loss": 0.7305,
"step": 2790
},
{
"epoch": 4.60840890354493,
"grad_norm": 0.40625,
"learning_rate": 0.00013049603016847296,
"loss": 0.7311,
"step": 2795
},
{
"epoch": 4.616652926628195,
"grad_norm": 0.57421875,
"learning_rate": 0.00013022161518891855,
"loss": 0.7347,
"step": 2800
},
{
"epoch": 4.624896949711459,
"grad_norm": 0.421875,
"learning_rate": 0.00012994694952522435,
"loss": 0.7395,
"step": 2805
},
{
"epoch": 4.633140972794724,
"grad_norm": 0.40625,
"learning_rate": 0.00012967203545570418,
"loss": 0.7332,
"step": 2810
},
{
"epoch": 4.6413849958779885,
"grad_norm": 0.455078125,
"learning_rate": 0.0001293968752607325,
"loss": 0.7326,
"step": 2815
},
{
"epoch": 4.649629018961253,
"grad_norm": 0.53515625,
"learning_rate": 0.00012912147122272523,
"loss": 0.7317,
"step": 2820
},
{
"epoch": 4.657873042044518,
"grad_norm": 0.6953125,
"learning_rate": 0.00012884582562612095,
"loss": 0.7336,
"step": 2825
},
{
"epoch": 4.666117065127782,
"grad_norm": 0.41796875,
"learning_rate": 0.00012856994075736197,
"loss": 0.7283,
"step": 2830
},
{
"epoch": 4.674361088211047,
"grad_norm": 0.5390625,
"learning_rate": 0.00012829381890487536,
"loss": 0.7366,
"step": 2835
},
{
"epoch": 4.682605111294311,
"grad_norm": 0.8515625,
"learning_rate": 0.00012801746235905384,
"loss": 0.7377,
"step": 2840
},
{
"epoch": 4.690849134377576,
"grad_norm": 0.40625,
"learning_rate": 0.00012774087341223695,
"loss": 0.7357,
"step": 2845
},
{
"epoch": 4.699093157460841,
"grad_norm": 0.490234375,
"learning_rate": 0.00012746405435869198,
"loss": 0.7307,
"step": 2850
},
{
"epoch": 4.707337180544106,
"grad_norm": 0.40234375,
"learning_rate": 0.00012718700749459486,
"loss": 0.7307,
"step": 2855
},
{
"epoch": 4.71558120362737,
"grad_norm": 0.5625,
"learning_rate": 0.0001269097351180112,
"loss": 0.7244,
"step": 2860
},
{
"epoch": 4.723825226710635,
"grad_norm": 0.3984375,
"learning_rate": 0.00012663223952887723,
"loss": 0.7321,
"step": 2865
},
{
"epoch": 4.732069249793899,
"grad_norm": 0.40234375,
"learning_rate": 0.0001263545230289807,
"loss": 0.7243,
"step": 2870
},
{
"epoch": 4.740313272877164,
"grad_norm": 0.4140625,
"learning_rate": 0.00012607658792194174,
"loss": 0.7282,
"step": 2875
},
{
"epoch": 4.7485572959604285,
"grad_norm": 0.4921875,
"learning_rate": 0.0001257984365131938,
"loss": 0.7239,
"step": 2880
},
{
"epoch": 4.756801319043693,
"grad_norm": 0.6640625,
"learning_rate": 0.00012552007110996463,
"loss": 0.7273,
"step": 2885
},
{
"epoch": 4.765045342126958,
"grad_norm": 0.65625,
"learning_rate": 0.00012524149402125685,
"loss": 0.7251,
"step": 2890
},
{
"epoch": 4.773289365210223,
"grad_norm": 0.50390625,
"learning_rate": 0.00012496270755782914,
"loss": 0.739,
"step": 2895
},
{
"epoch": 4.781533388293488,
"grad_norm": 0.42578125,
"learning_rate": 0.00012468371403217684,
"loss": 0.7344,
"step": 2900
},
{
"epoch": 4.789777411376752,
"grad_norm": 0.57421875,
"learning_rate": 0.00012440451575851285,
"loss": 0.7314,
"step": 2905
},
{
"epoch": 4.798021434460017,
"grad_norm": 0.5,
"learning_rate": 0.00012412511505274844,
"loss": 0.7269,
"step": 2910
},
{
"epoch": 4.806265457543281,
"grad_norm": 0.58203125,
"learning_rate": 0.00012384551423247407,
"loss": 0.7292,
"step": 2915
},
{
"epoch": 4.814509480626546,
"grad_norm": 0.6171875,
"learning_rate": 0.00012356571561693996,
"loss": 0.7227,
"step": 2920
},
{
"epoch": 4.82275350370981,
"grad_norm": 0.4921875,
"learning_rate": 0.00012328572152703725,
"loss": 0.7311,
"step": 2925
},
{
"epoch": 4.830997526793075,
"grad_norm": 0.59375,
"learning_rate": 0.00012300553428527832,
"loss": 0.7315,
"step": 2930
},
{
"epoch": 4.8392415498763395,
"grad_norm": 0.6796875,
"learning_rate": 0.00012272515621577782,
"loss": 0.7376,
"step": 2935
},
{
"epoch": 4.847485572959604,
"grad_norm": 0.65234375,
"learning_rate": 0.00012244458964423327,
"loss": 0.7305,
"step": 2940
},
{
"epoch": 4.855729596042869,
"grad_norm": 0.515625,
"learning_rate": 0.00012216383689790574,
"loss": 0.7279,
"step": 2945
},
{
"epoch": 4.863973619126133,
"grad_norm": 0.443359375,
"learning_rate": 0.00012188290030560063,
"loss": 0.7299,
"step": 2950
},
{
"epoch": 4.872217642209399,
"grad_norm": 0.44921875,
"learning_rate": 0.00012160178219764837,
"loss": 0.7253,
"step": 2955
},
{
"epoch": 4.880461665292663,
"grad_norm": 0.56640625,
"learning_rate": 0.00012132048490588492,
"loss": 0.7291,
"step": 2960
},
{
"epoch": 4.888705688375928,
"grad_norm": 0.462890625,
"learning_rate": 0.00012103901076363269,
"loss": 0.7244,
"step": 2965
},
{
"epoch": 4.896949711459192,
"grad_norm": 0.53125,
"learning_rate": 0.0001207573621056809,
"loss": 0.7279,
"step": 2970
},
{
"epoch": 4.905193734542457,
"grad_norm": 0.55078125,
"learning_rate": 0.00012047554126826643,
"loss": 0.7297,
"step": 2975
},
{
"epoch": 4.913437757625721,
"grad_norm": 0.53515625,
"learning_rate": 0.00012019355058905435,
"loss": 0.7285,
"step": 2980
},
{
"epoch": 4.921681780708986,
"grad_norm": 0.4296875,
"learning_rate": 0.00011991139240711857,
"loss": 0.7312,
"step": 2985
},
{
"epoch": 4.92992580379225,
"grad_norm": 0.5390625,
"learning_rate": 0.00011962906906292238,
"loss": 0.7284,
"step": 2990
},
{
"epoch": 4.938169826875515,
"grad_norm": 0.423828125,
"learning_rate": 0.00011934658289829902,
"loss": 0.7336,
"step": 2995
},
{
"epoch": 4.9464138499587795,
"grad_norm": 0.404296875,
"learning_rate": 0.00011906393625643244,
"loss": 0.7281,
"step": 3000
},
{
"epoch": 4.954657873042045,
"grad_norm": 0.41015625,
"learning_rate": 0.00011878113148183758,
"loss": 0.7271,
"step": 3005
},
{
"epoch": 4.9629018961253095,
"grad_norm": 0.5,
"learning_rate": 0.00011849817092034118,
"loss": 0.7229,
"step": 3010
},
{
"epoch": 4.971145919208574,
"grad_norm": 0.4375,
"learning_rate": 0.00011821505691906216,
"loss": 0.7318,
"step": 3015
},
{
"epoch": 4.979389942291839,
"grad_norm": 0.39453125,
"learning_rate": 0.00011793179182639218,
"loss": 0.7366,
"step": 3020
},
{
"epoch": 4.987633965375103,
"grad_norm": 0.421875,
"learning_rate": 0.00011764837799197622,
"loss": 0.7337,
"step": 3025
},
{
"epoch": 4.995877988458368,
"grad_norm": 0.48828125,
"learning_rate": 0.00011736481776669306,
"loss": 0.7312,
"step": 3030
},
{
"epoch": 4.999175597691673,
"eval_loss": 2.439051389694214,
"eval_runtime": 0.2596,
"eval_samples_per_second": 38.523,
"eval_steps_per_second": 3.852,
"step": 3032
},
{
"epoch": 5.004122011541632,
"grad_norm": 0.427734375,
"learning_rate": 0.0001170811135026357,
"loss": 0.7263,
"step": 3035
},
{
"epoch": 5.012366034624897,
"grad_norm": 0.6640625,
"learning_rate": 0.00011679726755309205,
"loss": 0.7183,
"step": 3040
},
{
"epoch": 5.020610057708161,
"grad_norm": 0.51171875,
"learning_rate": 0.00011651328227252517,
"loss": 0.723,
"step": 3045
},
{
"epoch": 5.028854080791426,
"grad_norm": 0.5234375,
"learning_rate": 0.00011622916001655388,
"loss": 0.7185,
"step": 3050
},
{
"epoch": 5.0370981038746905,
"grad_norm": 0.546875,
"learning_rate": 0.00011594490314193323,
"loss": 0.7132,
"step": 3055
},
{
"epoch": 5.045342126957956,
"grad_norm": 0.416015625,
"learning_rate": 0.00011566051400653486,
"loss": 0.7054,
"step": 3060
},
{
"epoch": 5.0535861500412205,
"grad_norm": 0.421875,
"learning_rate": 0.00011537599496932752,
"loss": 0.7197,
"step": 3065
},
{
"epoch": 5.061830173124485,
"grad_norm": 0.43359375,
"learning_rate": 0.00011509134839035748,
"loss": 0.7157,
"step": 3070
},
{
"epoch": 5.07007419620775,
"grad_norm": 0.458984375,
"learning_rate": 0.00011480657663072896,
"loss": 0.7093,
"step": 3075
},
{
"epoch": 5.078318219291014,
"grad_norm": 0.59375,
"learning_rate": 0.0001145216820525845,
"loss": 0.7286,
"step": 3080
},
{
"epoch": 5.086562242374279,
"grad_norm": 0.421875,
"learning_rate": 0.00011423666701908547,
"loss": 0.7105,
"step": 3085
},
{
"epoch": 5.094806265457543,
"grad_norm": 0.39453125,
"learning_rate": 0.00011395153389439233,
"loss": 0.7072,
"step": 3090
},
{
"epoch": 5.103050288540808,
"grad_norm": 0.40625,
"learning_rate": 0.00011366628504364509,
"loss": 0.7156,
"step": 3095
},
{
"epoch": 5.111294311624072,
"grad_norm": 0.4609375,
"learning_rate": 0.00011338092283294377,
"loss": 0.7052,
"step": 3100
},
{
"epoch": 5.119538334707337,
"grad_norm": 0.51953125,
"learning_rate": 0.00011309544962932862,
"loss": 0.7197,
"step": 3105
},
{
"epoch": 5.127782357790601,
"grad_norm": 0.41796875,
"learning_rate": 0.00011280986780076057,
"loss": 0.7195,
"step": 3110
},
{
"epoch": 5.136026380873867,
"grad_norm": 0.40234375,
"learning_rate": 0.00011252417971610163,
"loss": 0.7062,
"step": 3115
},
{
"epoch": 5.144270403957131,
"grad_norm": 0.546875,
"learning_rate": 0.00011223838774509514,
"loss": 0.7225,
"step": 3120
},
{
"epoch": 5.152514427040396,
"grad_norm": 0.4140625,
"learning_rate": 0.00011195249425834615,
"loss": 0.7106,
"step": 3125
},
{
"epoch": 5.1607584501236605,
"grad_norm": 0.47265625,
"learning_rate": 0.00011166650162730188,
"loss": 0.7174,
"step": 3130
},
{
"epoch": 5.169002473206925,
"grad_norm": 0.76953125,
"learning_rate": 0.00011138041222423177,
"loss": 0.7208,
"step": 3135
},
{
"epoch": 5.17724649629019,
"grad_norm": 0.7265625,
"learning_rate": 0.00011109422842220805,
"loss": 0.716,
"step": 3140
},
{
"epoch": 5.185490519373454,
"grad_norm": 0.49609375,
"learning_rate": 0.00011080795259508608,
"loss": 0.717,
"step": 3145
},
{
"epoch": 5.193734542456719,
"grad_norm": 0.3984375,
"learning_rate": 0.00011052158711748434,
"loss": 0.7093,
"step": 3150
},
{
"epoch": 5.201978565539983,
"grad_norm": 0.427734375,
"learning_rate": 0.00011023513436476511,
"loss": 0.7129,
"step": 3155
},
{
"epoch": 5.210222588623248,
"grad_norm": 0.400390625,
"learning_rate": 0.00010994859671301462,
"loss": 0.7168,
"step": 3160
},
{
"epoch": 5.218466611706512,
"grad_norm": 0.419921875,
"learning_rate": 0.0001096619765390232,
"loss": 0.7158,
"step": 3165
},
{
"epoch": 5.226710634789778,
"grad_norm": 0.42578125,
"learning_rate": 0.00010937527622026575,
"loss": 0.7229,
"step": 3170
},
{
"epoch": 5.234954657873042,
"grad_norm": 0.62109375,
"learning_rate": 0.00010908849813488203,
"loss": 0.7151,
"step": 3175
},
{
"epoch": 5.243198680956307,
"grad_norm": 0.40625,
"learning_rate": 0.00010880164466165674,
"loss": 0.7185,
"step": 3180
},
{
"epoch": 5.2514427040395715,
"grad_norm": 0.431640625,
"learning_rate": 0.00010851471817999997,
"loss": 0.7113,
"step": 3185
},
{
"epoch": 5.259686727122836,
"grad_norm": 0.4140625,
"learning_rate": 0.00010822772106992747,
"loss": 0.7178,
"step": 3190
},
{
"epoch": 5.267930750206101,
"grad_norm": 0.51171875,
"learning_rate": 0.00010794065571204072,
"loss": 0.7106,
"step": 3195
},
{
"epoch": 5.276174773289365,
"grad_norm": 0.484375,
"learning_rate": 0.0001076535244875074,
"loss": 0.7136,
"step": 3200
},
{
"epoch": 5.28441879637263,
"grad_norm": 0.59765625,
"learning_rate": 0.00010736632977804149,
"loss": 0.7138,
"step": 3205
},
{
"epoch": 5.292662819455894,
"grad_norm": 0.65625,
"learning_rate": 0.00010707907396588361,
"loss": 0.7192,
"step": 3210
},
{
"epoch": 5.300906842539159,
"grad_norm": 0.55859375,
"learning_rate": 0.00010679175943378119,
"loss": 0.7068,
"step": 3215
},
{
"epoch": 5.309150865622423,
"grad_norm": 0.427734375,
"learning_rate": 0.00010650438856496872,
"loss": 0.7095,
"step": 3220
},
{
"epoch": 5.317394888705689,
"grad_norm": 0.462890625,
"learning_rate": 0.00010621696374314807,
"loss": 0.7118,
"step": 3225
},
{
"epoch": 5.325638911788953,
"grad_norm": 0.474609375,
"learning_rate": 0.00010592948735246854,
"loss": 0.711,
"step": 3230
},
{
"epoch": 5.333882934872218,
"grad_norm": 0.53515625,
"learning_rate": 0.00010564196177750725,
"loss": 0.7172,
"step": 3235
},
{
"epoch": 5.342126957955482,
"grad_norm": 0.435546875,
"learning_rate": 0.0001053543894032493,
"loss": 0.7171,
"step": 3240
},
{
"epoch": 5.350370981038747,
"grad_norm": 0.52734375,
"learning_rate": 0.00010506677261506797,
"loss": 0.7153,
"step": 3245
},
{
"epoch": 5.3586150041220115,
"grad_norm": 0.408203125,
"learning_rate": 0.00010477911379870488,
"loss": 0.7162,
"step": 3250
},
{
"epoch": 5.366859027205276,
"grad_norm": 0.423828125,
"learning_rate": 0.00010449141534025045,
"loss": 0.7067,
"step": 3255
},
{
"epoch": 5.375103050288541,
"grad_norm": 0.55859375,
"learning_rate": 0.00010420367962612372,
"loss": 0.7117,
"step": 3260
},
{
"epoch": 5.383347073371805,
"grad_norm": 0.416015625,
"learning_rate": 0.00010391590904305284,
"loss": 0.7175,
"step": 3265
},
{
"epoch": 5.39159109645507,
"grad_norm": 0.41796875,
"learning_rate": 0.00010362810597805526,
"loss": 0.7109,
"step": 3270
},
{
"epoch": 5.399835119538334,
"grad_norm": 0.46484375,
"learning_rate": 0.00010334027281841781,
"loss": 0.7136,
"step": 3275
},
{
"epoch": 5.4080791426216,
"grad_norm": 0.412109375,
"learning_rate": 0.00010305241195167687,
"loss": 0.7123,
"step": 3280
},
{
"epoch": 5.416323165704864,
"grad_norm": 0.408203125,
"learning_rate": 0.00010276452576559879,
"loss": 0.7132,
"step": 3285
},
{
"epoch": 5.424567188788129,
"grad_norm": 0.4296875,
"learning_rate": 0.00010247661664815986,
"loss": 0.7161,
"step": 3290
},
{
"epoch": 5.432811211871393,
"grad_norm": 0.50390625,
"learning_rate": 0.00010218868698752658,
"loss": 0.7122,
"step": 3295
},
{
"epoch": 5.441055234954658,
"grad_norm": 0.48046875,
"learning_rate": 0.00010190073917203589,
"loss": 0.7167,
"step": 3300
},
{
"epoch": 5.4492992580379225,
"grad_norm": 0.51953125,
"learning_rate": 0.00010161277559017528,
"loss": 0.7143,
"step": 3305
},
{
"epoch": 5.457543281121187,
"grad_norm": 0.44921875,
"learning_rate": 0.00010132479863056303,
"loss": 0.7163,
"step": 3310
},
{
"epoch": 5.465787304204452,
"grad_norm": 0.46484375,
"learning_rate": 0.00010103681068192845,
"loss": 0.7173,
"step": 3315
},
{
"epoch": 5.474031327287716,
"grad_norm": 0.484375,
"learning_rate": 0.00010074881413309193,
"loss": 0.714,
"step": 3320
},
{
"epoch": 5.482275350370981,
"grad_norm": 0.486328125,
"learning_rate": 0.00010046081137294516,
"loss": 0.7128,
"step": 3325
},
{
"epoch": 5.490519373454246,
"grad_norm": 0.486328125,
"learning_rate": 0.00010017280479043147,
"loss": 0.7242,
"step": 3330
},
{
"epoch": 5.498763396537511,
"grad_norm": 0.421875,
"learning_rate": 9.988479677452584e-05,
"loss": 0.7196,
"step": 3335
},
{
"epoch": 5.507007419620775,
"grad_norm": 0.40625,
"learning_rate": 9.959678971421508e-05,
"loss": 0.714,
"step": 3340
},
{
"epoch": 5.51525144270404,
"grad_norm": 0.412109375,
"learning_rate": 9.930878599847821e-05,
"loss": 0.7173,
"step": 3345
},
{
"epoch": 5.523495465787304,
"grad_norm": 0.46484375,
"learning_rate": 9.902078801626636e-05,
"loss": 0.7137,
"step": 3350
},
{
"epoch": 5.531739488870569,
"grad_norm": 0.423828125,
"learning_rate": 9.873279815648318e-05,
"loss": 0.7125,
"step": 3355
},
{
"epoch": 5.539983511953833,
"grad_norm": 0.451171875,
"learning_rate": 9.844481880796491e-05,
"loss": 0.7173,
"step": 3360
},
{
"epoch": 5.548227535037098,
"grad_norm": 0.451171875,
"learning_rate": 9.815685235946068e-05,
"loss": 0.7134,
"step": 3365
},
{
"epoch": 5.5564715581203625,
"grad_norm": 0.470703125,
"learning_rate": 9.786890119961253e-05,
"loss": 0.7199,
"step": 3370
},
{
"epoch": 5.564715581203627,
"grad_norm": 0.419921875,
"learning_rate": 9.758096771693573e-05,
"loss": 0.7116,
"step": 3375
},
{
"epoch": 5.572959604286892,
"grad_norm": 0.42578125,
"learning_rate": 9.729305429979887e-05,
"loss": 0.7131,
"step": 3380
},
{
"epoch": 5.581203627370156,
"grad_norm": 0.47265625,
"learning_rate": 9.700516333640415e-05,
"loss": 0.7172,
"step": 3385
},
{
"epoch": 5.589447650453422,
"grad_norm": 0.427734375,
"learning_rate": 9.671729721476746e-05,
"loss": 0.7121,
"step": 3390
},
{
"epoch": 5.597691673536686,
"grad_norm": 0.4375,
"learning_rate": 9.642945832269874e-05,
"loss": 0.7187,
"step": 3395
},
{
"epoch": 5.605935696619951,
"grad_norm": 0.42578125,
"learning_rate": 9.614164904778196e-05,
"loss": 0.7108,
"step": 3400
},
{
"epoch": 5.614179719703215,
"grad_norm": 0.421875,
"learning_rate": 9.585387177735547e-05,
"loss": 0.7099,
"step": 3405
},
{
"epoch": 5.62242374278648,
"grad_norm": 0.447265625,
"learning_rate": 9.556612889849214e-05,
"loss": 0.7169,
"step": 3410
},
{
"epoch": 5.630667765869744,
"grad_norm": 0.5390625,
"learning_rate": 9.527842279797953e-05,
"loss": 0.7118,
"step": 3415
},
{
"epoch": 5.638911788953009,
"grad_norm": 0.396484375,
"learning_rate": 9.499075586230013e-05,
"loss": 0.7148,
"step": 3420
},
{
"epoch": 5.6471558120362735,
"grad_norm": 0.39453125,
"learning_rate": 9.470313047761167e-05,
"loss": 0.7166,
"step": 3425
},
{
"epoch": 5.655399835119538,
"grad_norm": 0.52734375,
"learning_rate": 9.44155490297271e-05,
"loss": 0.7156,
"step": 3430
},
{
"epoch": 5.663643858202803,
"grad_norm": 0.4765625,
"learning_rate": 9.412801390409497e-05,
"loss": 0.707,
"step": 3435
},
{
"epoch": 5.671887881286068,
"grad_norm": 0.4296875,
"learning_rate": 9.38405274857796e-05,
"loss": 0.7125,
"step": 3440
},
{
"epoch": 5.680131904369333,
"grad_norm": 0.453125,
"learning_rate": 9.355309215944124e-05,
"loss": 0.7153,
"step": 3445
},
{
"epoch": 5.688375927452597,
"grad_norm": 0.45703125,
"learning_rate": 9.326571030931637e-05,
"loss": 0.7143,
"step": 3450
},
{
"epoch": 5.696619950535862,
"grad_norm": 0.42578125,
"learning_rate": 9.297838431919794e-05,
"loss": 0.7192,
"step": 3455
},
{
"epoch": 5.704863973619126,
"grad_norm": 0.4765625,
"learning_rate": 9.269111657241548e-05,
"loss": 0.7151,
"step": 3460
},
{
"epoch": 5.713107996702391,
"grad_norm": 0.62109375,
"learning_rate": 9.240390945181543e-05,
"loss": 0.7171,
"step": 3465
},
{
"epoch": 5.721352019785655,
"grad_norm": 0.42578125,
"learning_rate": 9.211676533974131e-05,
"loss": 0.7111,
"step": 3470
},
{
"epoch": 5.72959604286892,
"grad_norm": 0.47265625,
"learning_rate": 9.182968661801412e-05,
"loss": 0.7111,
"step": 3475
},
{
"epoch": 5.737840065952184,
"grad_norm": 0.408203125,
"learning_rate": 9.154267566791223e-05,
"loss": 0.7211,
"step": 3480
},
{
"epoch": 5.746084089035449,
"grad_norm": 0.43359375,
"learning_rate": 9.125573487015203e-05,
"loss": 0.7165,
"step": 3485
},
{
"epoch": 5.7543281121187135,
"grad_norm": 0.404296875,
"learning_rate": 9.096886660486797e-05,
"loss": 0.7082,
"step": 3490
},
{
"epoch": 5.762572135201978,
"grad_norm": 0.41015625,
"learning_rate": 9.068207325159284e-05,
"loss": 0.7136,
"step": 3495
},
{
"epoch": 5.7708161582852435,
"grad_norm": 0.4140625,
"learning_rate": 9.039535718923804e-05,
"loss": 0.714,
"step": 3500
},
{
"epoch": 5.779060181368508,
"grad_norm": 0.41015625,
"learning_rate": 9.01087207960739e-05,
"loss": 0.7174,
"step": 3505
},
{
"epoch": 5.787304204451773,
"grad_norm": 0.54296875,
"learning_rate": 8.982216644970979e-05,
"loss": 0.7071,
"step": 3510
},
{
"epoch": 5.795548227535037,
"grad_norm": 0.43359375,
"learning_rate": 8.953569652707459e-05,
"loss": 0.7081,
"step": 3515
},
{
"epoch": 5.803792250618302,
"grad_norm": 0.44140625,
"learning_rate": 8.924931340439694e-05,
"loss": 0.7124,
"step": 3520
},
{
"epoch": 5.812036273701566,
"grad_norm": 0.41796875,
"learning_rate": 8.896301945718541e-05,
"loss": 0.7115,
"step": 3525
},
{
"epoch": 5.820280296784831,
"grad_norm": 0.396484375,
"learning_rate": 8.867681706020894e-05,
"loss": 0.7134,
"step": 3530
},
{
"epoch": 5.828524319868095,
"grad_norm": 0.40234375,
"learning_rate": 8.839070858747697e-05,
"loss": 0.7169,
"step": 3535
},
{
"epoch": 5.83676834295136,
"grad_norm": 0.58203125,
"learning_rate": 8.810469641222001e-05,
"loss": 0.7154,
"step": 3540
},
{
"epoch": 5.845012366034625,
"grad_norm": 0.51171875,
"learning_rate": 8.781878290686959e-05,
"loss": 0.7182,
"step": 3545
},
{
"epoch": 5.85325638911789,
"grad_norm": 0.45703125,
"learning_rate": 8.753297044303896e-05,
"loss": 0.7128,
"step": 3550
},
{
"epoch": 5.8615004122011545,
"grad_norm": 0.44140625,
"learning_rate": 8.724726139150318e-05,
"loss": 0.7083,
"step": 3555
},
{
"epoch": 5.869744435284419,
"grad_norm": 0.421875,
"learning_rate": 8.696165812217953e-05,
"loss": 0.7175,
"step": 3560
},
{
"epoch": 5.877988458367684,
"grad_norm": 0.404296875,
"learning_rate": 8.667616300410778e-05,
"loss": 0.7174,
"step": 3565
},
{
"epoch": 5.886232481450948,
"grad_norm": 0.46875,
"learning_rate": 8.639077840543077e-05,
"loss": 0.7173,
"step": 3570
},
{
"epoch": 5.894476504534213,
"grad_norm": 0.388671875,
"learning_rate": 8.610550669337433e-05,
"loss": 0.7147,
"step": 3575
},
{
"epoch": 5.902720527617477,
"grad_norm": 0.39453125,
"learning_rate": 8.582035023422815e-05,
"loss": 0.7169,
"step": 3580
},
{
"epoch": 5.910964550700742,
"grad_norm": 0.484375,
"learning_rate": 8.553531139332582e-05,
"loss": 0.7237,
"step": 3585
},
{
"epoch": 5.919208573784006,
"grad_norm": 0.3984375,
"learning_rate": 8.525039253502529e-05,
"loss": 0.7134,
"step": 3590
},
{
"epoch": 5.927452596867271,
"grad_norm": 0.443359375,
"learning_rate": 8.496559602268928e-05,
"loss": 0.7189,
"step": 3595
},
{
"epoch": 5.935696619950535,
"grad_norm": 0.50390625,
"learning_rate": 8.468092421866573e-05,
"loss": 0.717,
"step": 3600
},
{
"epoch": 5.943940643033801,
"grad_norm": 0.38671875,
"learning_rate": 8.439637948426801e-05,
"loss": 0.7094,
"step": 3605
},
{
"epoch": 5.952184666117065,
"grad_norm": 0.40234375,
"learning_rate": 8.411196417975558e-05,
"loss": 0.7019,
"step": 3610
},
{
"epoch": 5.96042868920033,
"grad_norm": 0.40625,
"learning_rate": 8.382768066431425e-05,
"loss": 0.7127,
"step": 3615
},
{
"epoch": 5.9686727122835945,
"grad_norm": 0.54296875,
"learning_rate": 8.354353129603668e-05,
"loss": 0.7133,
"step": 3620
},
{
"epoch": 5.976916735366859,
"grad_norm": 0.427734375,
"learning_rate": 8.325951843190274e-05,
"loss": 0.7182,
"step": 3625
},
{
"epoch": 5.985160758450124,
"grad_norm": 0.40234375,
"learning_rate": 8.297564442776014e-05,
"loss": 0.7053,
"step": 3630
},
{
"epoch": 5.993404781533388,
"grad_norm": 0.44140625,
"learning_rate": 8.269191163830467e-05,
"loss": 0.7253,
"step": 3635
},
{
"epoch": 6.0,
"eval_loss": 2.459299325942993,
"eval_runtime": 0.2463,
"eval_samples_per_second": 40.595,
"eval_steps_per_second": 4.059,
"step": 3639
},
{
"epoch": 6.001648804616653,
"grad_norm": 0.408203125,
"learning_rate": 8.240832241706068e-05,
"loss": 0.7144,
"step": 3640
},
{
"epoch": 6.009892827699917,
"grad_norm": 0.5625,
"learning_rate": 8.212487911636184e-05,
"loss": 0.7102,
"step": 3645
},
{
"epoch": 6.018136850783182,
"grad_norm": 0.640625,
"learning_rate": 8.184158408733131e-05,
"loss": 0.7073,
"step": 3650
},
{
"epoch": 6.026380873866446,
"grad_norm": 0.421875,
"learning_rate": 8.155843967986236e-05,
"loss": 0.6914,
"step": 3655
},
{
"epoch": 6.034624896949712,
"grad_norm": 0.421875,
"learning_rate": 8.127544824259889e-05,
"loss": 0.7095,
"step": 3660
},
{
"epoch": 6.042868920032976,
"grad_norm": 0.427734375,
"learning_rate": 8.099261212291601e-05,
"loss": 0.7006,
"step": 3665
},
{
"epoch": 6.051112943116241,
"grad_norm": 0.408203125,
"learning_rate": 8.070993366690029e-05,
"loss": 0.6983,
"step": 3670
},
{
"epoch": 6.0593569661995055,
"grad_norm": 0.412109375,
"learning_rate": 8.042741521933071e-05,
"loss": 0.7086,
"step": 3675
},
{
"epoch": 6.06760098928277,
"grad_norm": 0.41015625,
"learning_rate": 8.014505912365893e-05,
"loss": 0.7039,
"step": 3680
},
{
"epoch": 6.075845012366035,
"grad_norm": 0.435546875,
"learning_rate": 7.986286772198986e-05,
"loss": 0.7056,
"step": 3685
},
{
"epoch": 6.084089035449299,
"grad_norm": 0.41015625,
"learning_rate": 7.958084335506239e-05,
"loss": 0.6957,
"step": 3690
},
{
"epoch": 6.092333058532564,
"grad_norm": 0.416015625,
"learning_rate": 7.929898836222983e-05,
"loss": 0.7052,
"step": 3695
},
{
"epoch": 6.100577081615828,
"grad_norm": 0.46875,
"learning_rate": 7.90173050814406e-05,
"loss": 0.6982,
"step": 3700
},
{
"epoch": 6.108821104699093,
"grad_norm": 0.50390625,
"learning_rate": 7.873579584921869e-05,
"loss": 0.7029,
"step": 3705
},
{
"epoch": 6.117065127782358,
"grad_norm": 0.451171875,
"learning_rate": 7.84544630006445e-05,
"loss": 0.7015,
"step": 3710
},
{
"epoch": 6.125309150865623,
"grad_norm": 0.427734375,
"learning_rate": 7.817330886933527e-05,
"loss": 0.7073,
"step": 3715
},
{
"epoch": 6.133553173948887,
"grad_norm": 0.416015625,
"learning_rate": 7.789233578742582e-05,
"loss": 0.7092,
"step": 3720
},
{
"epoch": 6.141797197032152,
"grad_norm": 0.490234375,
"learning_rate": 7.761154608554927e-05,
"loss": 0.7025,
"step": 3725
},
{
"epoch": 6.150041220115416,
"grad_norm": 0.412109375,
"learning_rate": 7.733094209281756e-05,
"loss": 0.7048,
"step": 3730
},
{
"epoch": 6.158285243198681,
"grad_norm": 0.404296875,
"learning_rate": 7.705052613680211e-05,
"loss": 0.7029,
"step": 3735
},
{
"epoch": 6.1665292662819455,
"grad_norm": 0.453125,
"learning_rate": 7.677030054351477e-05,
"loss": 0.701,
"step": 3740
},
{
"epoch": 6.17477328936521,
"grad_norm": 0.439453125,
"learning_rate": 7.649026763738827e-05,
"loss": 0.7067,
"step": 3745
},
{
"epoch": 6.183017312448475,
"grad_norm": 0.451171875,
"learning_rate": 7.6210429741257e-05,
"loss": 0.7055,
"step": 3750
},
{
"epoch": 6.191261335531739,
"grad_norm": 0.423828125,
"learning_rate": 7.593078917633787e-05,
"loss": 0.7104,
"step": 3755
},
{
"epoch": 6.199505358615004,
"grad_norm": 0.380859375,
"learning_rate": 7.565134826221083e-05,
"loss": 0.703,
"step": 3760
},
{
"epoch": 6.207749381698269,
"grad_norm": 0.431640625,
"learning_rate": 7.537210931679987e-05,
"loss": 0.6998,
"step": 3765
},
{
"epoch": 6.215993404781534,
"grad_norm": 0.427734375,
"learning_rate": 7.509307465635358e-05,
"loss": 0.6976,
"step": 3770
},
{
"epoch": 6.224237427864798,
"grad_norm": 0.42578125,
"learning_rate": 7.481424659542609e-05,
"loss": 0.7025,
"step": 3775
},
{
"epoch": 6.232481450948063,
"grad_norm": 0.421875,
"learning_rate": 7.453562744685778e-05,
"loss": 0.6971,
"step": 3780
},
{
"epoch": 6.240725474031327,
"grad_norm": 0.3984375,
"learning_rate": 7.425721952175618e-05,
"loss": 0.6984,
"step": 3785
},
{
"epoch": 6.248969497114592,
"grad_norm": 0.59765625,
"learning_rate": 7.39790251294767e-05,
"loss": 0.7012,
"step": 3790
},
{
"epoch": 6.2572135201978565,
"grad_norm": 0.466796875,
"learning_rate": 7.370104657760361e-05,
"loss": 0.7012,
"step": 3795
},
{
"epoch": 6.265457543281121,
"grad_norm": 0.439453125,
"learning_rate": 7.342328617193067e-05,
"loss": 0.7069,
"step": 3800
},
{
"epoch": 6.273701566364386,
"grad_norm": 0.408203125,
"learning_rate": 7.314574621644225e-05,
"loss": 0.6998,
"step": 3805
},
{
"epoch": 6.28194558944765,
"grad_norm": 0.427734375,
"learning_rate": 7.286842901329412e-05,
"loss": 0.695,
"step": 3810
},
{
"epoch": 6.290189612530915,
"grad_norm": 0.421875,
"learning_rate": 7.259133686279429e-05,
"loss": 0.7045,
"step": 3815
},
{
"epoch": 6.29843363561418,
"grad_norm": 0.478515625,
"learning_rate": 7.231447206338407e-05,
"loss": 0.7062,
"step": 3820
},
{
"epoch": 6.306677658697445,
"grad_norm": 0.408203125,
"learning_rate": 7.203783691161883e-05,
"loss": 0.6975,
"step": 3825
},
{
"epoch": 6.314921681780709,
"grad_norm": 0.41796875,
"learning_rate": 7.176143370214914e-05,
"loss": 0.7035,
"step": 3830
},
{
"epoch": 6.323165704863974,
"grad_norm": 0.46484375,
"learning_rate": 7.148526472770154e-05,
"loss": 0.7071,
"step": 3835
},
{
"epoch": 6.331409727947238,
"grad_norm": 0.49609375,
"learning_rate": 7.12093322790597e-05,
"loss": 0.7022,
"step": 3840
},
{
"epoch": 6.339653751030503,
"grad_norm": 0.515625,
"learning_rate": 7.09336386450453e-05,
"loss": 0.7104,
"step": 3845
},
{
"epoch": 6.347897774113767,
"grad_norm": 0.423828125,
"learning_rate": 7.065818611249915e-05,
"loss": 0.7028,
"step": 3850
},
{
"epoch": 6.356141797197032,
"grad_norm": 0.43359375,
"learning_rate": 7.038297696626206e-05,
"loss": 0.7049,
"step": 3855
},
{
"epoch": 6.3643858202802965,
"grad_norm": 0.421875,
"learning_rate": 7.010801348915608e-05,
"loss": 0.7074,
"step": 3860
},
{
"epoch": 6.372629843363561,
"grad_norm": 0.423828125,
"learning_rate": 6.983329796196534e-05,
"loss": 0.7001,
"step": 3865
},
{
"epoch": 6.380873866446826,
"grad_norm": 0.5703125,
"learning_rate": 6.955883266341741e-05,
"loss": 0.7006,
"step": 3870
},
{
"epoch": 6.389117889530091,
"grad_norm": 0.4140625,
"learning_rate": 6.928461987016413e-05,
"loss": 0.7113,
"step": 3875
},
{
"epoch": 6.397361912613356,
"grad_norm": 0.416015625,
"learning_rate": 6.901066185676295e-05,
"loss": 0.6964,
"step": 3880
},
{
"epoch": 6.40560593569662,
"grad_norm": 0.42578125,
"learning_rate": 6.873696089565786e-05,
"loss": 0.7086,
"step": 3885
},
{
"epoch": 6.413849958779885,
"grad_norm": 0.5078125,
"learning_rate": 6.846351925716068e-05,
"loss": 0.698,
"step": 3890
},
{
"epoch": 6.422093981863149,
"grad_norm": 0.443359375,
"learning_rate": 6.819033920943219e-05,
"loss": 0.6997,
"step": 3895
},
{
"epoch": 6.430338004946414,
"grad_norm": 0.4765625,
"learning_rate": 6.791742301846326e-05,
"loss": 0.7031,
"step": 3900
},
{
"epoch": 6.438582028029678,
"grad_norm": 0.427734375,
"learning_rate": 6.764477294805615e-05,
"loss": 0.7026,
"step": 3905
},
{
"epoch": 6.446826051112943,
"grad_norm": 0.458984375,
"learning_rate": 6.737239125980573e-05,
"loss": 0.7006,
"step": 3910
},
{
"epoch": 6.4550700741962075,
"grad_norm": 0.412109375,
"learning_rate": 6.710028021308061e-05,
"loss": 0.6971,
"step": 3915
},
{
"epoch": 6.463314097279472,
"grad_norm": 0.400390625,
"learning_rate": 6.682844206500445e-05,
"loss": 0.7028,
"step": 3920
},
{
"epoch": 6.471558120362737,
"grad_norm": 0.392578125,
"learning_rate": 6.655687907043734e-05,
"loss": 0.7053,
"step": 3925
},
{
"epoch": 6.479802143446002,
"grad_norm": 0.4296875,
"learning_rate": 6.62855934819569e-05,
"loss": 0.6995,
"step": 3930
},
{
"epoch": 6.488046166529267,
"grad_norm": 0.40234375,
"learning_rate": 6.601458754983978e-05,
"loss": 0.6971,
"step": 3935
},
{
"epoch": 6.496290189612531,
"grad_norm": 0.462890625,
"learning_rate": 6.574386352204289e-05,
"loss": 0.7029,
"step": 3940
},
{
"epoch": 6.504534212695796,
"grad_norm": 0.408203125,
"learning_rate": 6.547342364418481e-05,
"loss": 0.7011,
"step": 3945
},
{
"epoch": 6.51277823577906,
"grad_norm": 0.4296875,
"learning_rate": 6.520327015952713e-05,
"loss": 0.699,
"step": 3950
},
{
"epoch": 6.521022258862325,
"grad_norm": 0.431640625,
"learning_rate": 6.493340530895583e-05,
"loss": 0.6987,
"step": 3955
},
{
"epoch": 6.529266281945589,
"grad_norm": 0.3828125,
"learning_rate": 6.466383133096267e-05,
"loss": 0.7095,
"step": 3960
},
{
"epoch": 6.537510305028854,
"grad_norm": 0.443359375,
"learning_rate": 6.439455046162677e-05,
"loss": 0.704,
"step": 3965
},
{
"epoch": 6.545754328112118,
"grad_norm": 0.474609375,
"learning_rate": 6.412556493459581e-05,
"loss": 0.7127,
"step": 3970
},
{
"epoch": 6.553998351195383,
"grad_norm": 0.431640625,
"learning_rate": 6.385687698106781e-05,
"loss": 0.7019,
"step": 3975
},
{
"epoch": 6.562242374278648,
"grad_norm": 0.3984375,
"learning_rate": 6.358848882977233e-05,
"loss": 0.702,
"step": 3980
},
{
"epoch": 6.570486397361913,
"grad_norm": 0.443359375,
"learning_rate": 6.332040270695219e-05,
"loss": 0.7086,
"step": 3985
},
{
"epoch": 6.5787304204451775,
"grad_norm": 0.408203125,
"learning_rate": 6.305262083634488e-05,
"loss": 0.7086,
"step": 3990
},
{
"epoch": 6.586974443528442,
"grad_norm": 0.443359375,
"learning_rate": 6.278514543916415e-05,
"loss": 0.7087,
"step": 3995
},
{
"epoch": 6.595218466611707,
"grad_norm": 0.41015625,
"learning_rate": 6.251797873408161e-05,
"loss": 0.6976,
"step": 4000
},
{
"epoch": 6.603462489694971,
"grad_norm": 0.478515625,
"learning_rate": 6.225112293720836e-05,
"loss": 0.6968,
"step": 4005
},
{
"epoch": 6.611706512778236,
"grad_norm": 0.41796875,
"learning_rate": 6.198458026207652e-05,
"loss": 0.7039,
"step": 4010
},
{
"epoch": 6.6199505358615,
"grad_norm": 0.40234375,
"learning_rate": 6.171835291962088e-05,
"loss": 0.702,
"step": 4015
},
{
"epoch": 6.628194558944765,
"grad_norm": 0.4296875,
"learning_rate": 6.145244311816063e-05,
"loss": 0.7004,
"step": 4020
},
{
"epoch": 6.636438582028029,
"grad_norm": 0.5,
"learning_rate": 6.1186853063381e-05,
"loss": 0.6988,
"step": 4025
},
{
"epoch": 6.644682605111294,
"grad_norm": 0.427734375,
"learning_rate": 6.092158495831486e-05,
"loss": 0.7019,
"step": 4030
},
{
"epoch": 6.6529266281945585,
"grad_norm": 0.4609375,
"learning_rate": 6.065664100332478e-05,
"loss": 0.7082,
"step": 4035
},
{
"epoch": 6.661170651277824,
"grad_norm": 0.4375,
"learning_rate": 6.039202339608432e-05,
"loss": 0.7008,
"step": 4040
},
{
"epoch": 6.6694146743610885,
"grad_norm": 0.44921875,
"learning_rate": 6.012773433156017e-05,
"loss": 0.7022,
"step": 4045
},
{
"epoch": 6.677658697444353,
"grad_norm": 0.404296875,
"learning_rate": 5.986377600199371e-05,
"loss": 0.6986,
"step": 4050
},
{
"epoch": 6.685902720527618,
"grad_norm": 0.42578125,
"learning_rate": 5.9600150596883066e-05,
"loss": 0.6989,
"step": 4055
},
{
"epoch": 6.694146743610882,
"grad_norm": 0.43359375,
"learning_rate": 5.933686030296459e-05,
"loss": 0.6993,
"step": 4060
},
{
"epoch": 6.702390766694147,
"grad_norm": 0.419921875,
"learning_rate": 5.907390730419507e-05,
"loss": 0.6977,
"step": 4065
},
{
"epoch": 6.710634789777411,
"grad_norm": 0.41015625,
"learning_rate": 5.881129378173347e-05,
"loss": 0.7019,
"step": 4070
},
{
"epoch": 6.718878812860676,
"grad_norm": 0.40625,
"learning_rate": 5.854902191392284e-05,
"loss": 0.6936,
"step": 4075
},
{
"epoch": 6.72712283594394,
"grad_norm": 0.494140625,
"learning_rate": 5.828709387627218e-05,
"loss": 0.7002,
"step": 4080
},
{
"epoch": 6.735366859027205,
"grad_norm": 0.435546875,
"learning_rate": 5.802551184143865e-05,
"loss": 0.7026,
"step": 4085
},
{
"epoch": 6.74361088211047,
"grad_norm": 0.404296875,
"learning_rate": 5.7764277979209094e-05,
"loss": 0.7151,
"step": 4090
},
{
"epoch": 6.751854905193735,
"grad_norm": 0.416015625,
"learning_rate": 5.750339445648252e-05,
"loss": 0.7055,
"step": 4095
},
{
"epoch": 6.760098928276999,
"grad_norm": 0.4140625,
"learning_rate": 5.724286343725185e-05,
"loss": 0.7032,
"step": 4100
},
{
"epoch": 6.768342951360264,
"grad_norm": 0.421875,
"learning_rate": 5.6982687082585994e-05,
"loss": 0.7008,
"step": 4105
},
{
"epoch": 6.7765869744435285,
"grad_norm": 0.400390625,
"learning_rate": 5.6722867550612116e-05,
"loss": 0.6998,
"step": 4110
},
{
"epoch": 6.784830997526793,
"grad_norm": 0.404296875,
"learning_rate": 5.6463406996497456e-05,
"loss": 0.6961,
"step": 4115
},
{
"epoch": 6.793075020610058,
"grad_norm": 0.412109375,
"learning_rate": 5.620430757243156e-05,
"loss": 0.6963,
"step": 4120
},
{
"epoch": 6.801319043693322,
"grad_norm": 0.40234375,
"learning_rate": 5.5945571427608526e-05,
"loss": 0.7083,
"step": 4125
},
{
"epoch": 6.809563066776587,
"grad_norm": 0.419921875,
"learning_rate": 5.5687200708209076e-05,
"loss": 0.704,
"step": 4130
},
{
"epoch": 6.817807089859851,
"grad_norm": 0.4140625,
"learning_rate": 5.542919755738275e-05,
"loss": 0.7061,
"step": 4135
},
{
"epoch": 6.826051112943116,
"grad_norm": 0.478515625,
"learning_rate": 5.5171564115230254e-05,
"loss": 0.7037,
"step": 4140
},
{
"epoch": 6.83429513602638,
"grad_norm": 0.408203125,
"learning_rate": 5.491430251878551e-05,
"loss": 0.715,
"step": 4145
},
{
"epoch": 6.842539159109646,
"grad_norm": 0.421875,
"learning_rate": 5.4657414901998095e-05,
"loss": 0.7023,
"step": 4150
},
{
"epoch": 6.85078318219291,
"grad_norm": 0.4453125,
"learning_rate": 5.4400903395715366e-05,
"loss": 0.6967,
"step": 4155
},
{
"epoch": 6.859027205276175,
"grad_norm": 0.40625,
"learning_rate": 5.4144770127665024e-05,
"loss": 0.7073,
"step": 4160
},
{
"epoch": 6.8672712283594395,
"grad_norm": 0.43359375,
"learning_rate": 5.388901722243724e-05,
"loss": 0.6954,
"step": 4165
},
{
"epoch": 6.875515251442704,
"grad_norm": 0.400390625,
"learning_rate": 5.363364680146725e-05,
"loss": 0.7044,
"step": 4170
},
{
"epoch": 6.883759274525969,
"grad_norm": 0.412109375,
"learning_rate": 5.3378660983017536e-05,
"loss": 0.7045,
"step": 4175
},
{
"epoch": 6.892003297609233,
"grad_norm": 0.404296875,
"learning_rate": 5.31240618821604e-05,
"loss": 0.7029,
"step": 4180
},
{
"epoch": 6.900247320692498,
"grad_norm": 0.396484375,
"learning_rate": 5.286985161076029e-05,
"loss": 0.7018,
"step": 4185
},
{
"epoch": 6.908491343775762,
"grad_norm": 0.4140625,
"learning_rate": 5.2616032277456463e-05,
"loss": 0.7102,
"step": 4190
},
{
"epoch": 6.916735366859028,
"grad_norm": 0.3828125,
"learning_rate": 5.236260598764535e-05,
"loss": 0.7078,
"step": 4195
},
{
"epoch": 6.924979389942292,
"grad_norm": 0.51953125,
"learning_rate": 5.210957484346314e-05,
"loss": 0.7055,
"step": 4200
},
{
"epoch": 6.933223413025557,
"grad_norm": 0.423828125,
"learning_rate": 5.185694094376843e-05,
"loss": 0.7068,
"step": 4205
},
{
"epoch": 6.941467436108821,
"grad_norm": 0.412109375,
"learning_rate": 5.160470638412461e-05,
"loss": 0.6911,
"step": 4210
},
{
"epoch": 6.949711459192086,
"grad_norm": 0.388671875,
"learning_rate": 5.135287325678271e-05,
"loss": 0.7047,
"step": 4215
},
{
"epoch": 6.95795548227535,
"grad_norm": 0.435546875,
"learning_rate": 5.1101443650663764e-05,
"loss": 0.6989,
"step": 4220
},
{
"epoch": 6.966199505358615,
"grad_norm": 0.416015625,
"learning_rate": 5.085041965134183e-05,
"loss": 0.6975,
"step": 4225
},
{
"epoch": 6.9744435284418795,
"grad_norm": 0.412109375,
"learning_rate": 5.059980334102637e-05,
"loss": 0.7055,
"step": 4230
},
{
"epoch": 6.982687551525144,
"grad_norm": 0.39453125,
"learning_rate": 5.034959679854532e-05,
"loss": 0.6983,
"step": 4235
},
{
"epoch": 6.990931574608409,
"grad_norm": 0.4296875,
"learning_rate": 5.009980209932743e-05,
"loss": 0.7046,
"step": 4240
},
{
"epoch": 6.999175597691673,
"grad_norm": 0.38671875,
"learning_rate": 4.985042131538545e-05,
"loss": 0.7042,
"step": 4245
},
{
"epoch": 6.999175597691673,
"eval_loss": 2.4711008071899414,
"eval_runtime": 0.2631,
"eval_samples_per_second": 38.011,
"eval_steps_per_second": 3.801,
"step": 4245
},
{
"epoch": 7.007419620774938,
"grad_norm": 0.49609375,
"learning_rate": 4.960145651529856e-05,
"loss": 0.6792,
"step": 4250
},
{
"epoch": 7.015663643858203,
"grad_norm": 0.53125,
"learning_rate": 4.9352909764195576e-05,
"loss": 0.6999,
"step": 4255
},
{
"epoch": 7.023907666941468,
"grad_norm": 0.4296875,
"learning_rate": 4.9104783123737566e-05,
"loss": 0.6999,
"step": 4260
},
{
"epoch": 7.032151690024732,
"grad_norm": 0.3984375,
"learning_rate": 4.885707865210093e-05,
"loss": 0.7018,
"step": 4265
},
{
"epoch": 7.040395713107997,
"grad_norm": 0.400390625,
"learning_rate": 4.860979840396016e-05,
"loss": 0.6912,
"step": 4270
},
{
"epoch": 7.048639736191261,
"grad_norm": 0.45703125,
"learning_rate": 4.836294443047088e-05,
"loss": 0.6945,
"step": 4275
},
{
"epoch": 7.056883759274526,
"grad_norm": 0.44921875,
"learning_rate": 4.8116518779252885e-05,
"loss": 0.6905,
"step": 4280
},
{
"epoch": 7.0651277823577905,
"grad_norm": 0.39453125,
"learning_rate": 4.787052349437295e-05,
"loss": 0.691,
"step": 4285
},
{
"epoch": 7.073371805441055,
"grad_norm": 0.408203125,
"learning_rate": 4.762496061632814e-05,
"loss": 0.6843,
"step": 4290
},
{
"epoch": 7.08161582852432,
"grad_norm": 0.388671875,
"learning_rate": 4.7379832182028814e-05,
"loss": 0.6951,
"step": 4295
},
{
"epoch": 7.089859851607584,
"grad_norm": 0.408203125,
"learning_rate": 4.713514022478155e-05,
"loss": 0.6893,
"step": 4300
},
{
"epoch": 7.09810387469085,
"grad_norm": 0.4453125,
"learning_rate": 4.689088677427249e-05,
"loss": 0.6952,
"step": 4305
},
{
"epoch": 7.106347897774114,
"grad_norm": 0.427734375,
"learning_rate": 4.6647073856550415e-05,
"loss": 0.6958,
"step": 4310
},
{
"epoch": 7.114591920857379,
"grad_norm": 0.4140625,
"learning_rate": 4.6403703494009875e-05,
"loss": 0.6946,
"step": 4315
},
{
"epoch": 7.122835943940643,
"grad_norm": 0.427734375,
"learning_rate": 4.6160777705374524e-05,
"loss": 0.6996,
"step": 4320
},
{
"epoch": 7.131079967023908,
"grad_norm": 0.408203125,
"learning_rate": 4.591829850568046e-05,
"loss": 0.6969,
"step": 4325
},
{
"epoch": 7.139323990107172,
"grad_norm": 0.439453125,
"learning_rate": 4.567626790625921e-05,
"loss": 0.6868,
"step": 4330
},
{
"epoch": 7.147568013190437,
"grad_norm": 0.4609375,
"learning_rate": 4.543468791472131e-05,
"loss": 0.69,
"step": 4335
},
{
"epoch": 7.155812036273701,
"grad_norm": 0.40625,
"learning_rate": 4.519356053493958e-05,
"loss": 0.6979,
"step": 4340
},
{
"epoch": 7.164056059356966,
"grad_norm": 0.408203125,
"learning_rate": 4.495288776703241e-05,
"loss": 0.7022,
"step": 4345
},
{
"epoch": 7.1723000824402305,
"grad_norm": 0.41015625,
"learning_rate": 4.471267160734731e-05,
"loss": 0.6874,
"step": 4350
},
{
"epoch": 7.180544105523495,
"grad_norm": 0.40625,
"learning_rate": 4.447291404844424e-05,
"loss": 0.6982,
"step": 4355
},
{
"epoch": 7.18878812860676,
"grad_norm": 0.404296875,
"learning_rate": 4.4233617079079236e-05,
"loss": 0.7015,
"step": 4360
},
{
"epoch": 7.197032151690025,
"grad_norm": 0.39453125,
"learning_rate": 4.399478268418771e-05,
"loss": 0.6919,
"step": 4365
},
{
"epoch": 7.20527617477329,
"grad_norm": 0.412109375,
"learning_rate": 4.375641284486808e-05,
"loss": 0.6867,
"step": 4370
},
{
"epoch": 7.213520197856554,
"grad_norm": 0.3984375,
"learning_rate": 4.3518509538365425e-05,
"loss": 0.6929,
"step": 4375
},
{
"epoch": 7.221764220939819,
"grad_norm": 0.40234375,
"learning_rate": 4.328107473805487e-05,
"loss": 0.7013,
"step": 4380
},
{
"epoch": 7.230008244023083,
"grad_norm": 0.4140625,
"learning_rate": 4.3044110413425395e-05,
"loss": 0.6879,
"step": 4385
},
{
"epoch": 7.238252267106348,
"grad_norm": 0.439453125,
"learning_rate": 4.2807618530063565e-05,
"loss": 0.6918,
"step": 4390
},
{
"epoch": 7.246496290189612,
"grad_norm": 0.39453125,
"learning_rate": 4.257160104963696e-05,
"loss": 0.6965,
"step": 4395
},
{
"epoch": 7.254740313272877,
"grad_norm": 0.41015625,
"learning_rate": 4.23360599298781e-05,
"loss": 0.6963,
"step": 4400
},
{
"epoch": 7.2629843363561415,
"grad_norm": 0.408203125,
"learning_rate": 4.210099712456822e-05,
"loss": 0.69,
"step": 4405
},
{
"epoch": 7.271228359439406,
"grad_norm": 0.396484375,
"learning_rate": 4.1866414583520877e-05,
"loss": 0.6955,
"step": 4410
},
{
"epoch": 7.2794723825226715,
"grad_norm": 0.408203125,
"learning_rate": 4.163231425256595e-05,
"loss": 0.6888,
"step": 4415
},
{
"epoch": 7.287716405605936,
"grad_norm": 0.408203125,
"learning_rate": 4.139869807353357e-05,
"loss": 0.6998,
"step": 4420
},
{
"epoch": 7.295960428689201,
"grad_norm": 0.396484375,
"learning_rate": 4.1165567984237764e-05,
"loss": 0.6963,
"step": 4425
},
{
"epoch": 7.304204451772465,
"grad_norm": 0.38671875,
"learning_rate": 4.0932925918460516e-05,
"loss": 0.6922,
"step": 4430
},
{
"epoch": 7.31244847485573,
"grad_norm": 0.40625,
"learning_rate": 4.070077380593579e-05,
"loss": 0.6969,
"step": 4435
},
{
"epoch": 7.320692497938994,
"grad_norm": 0.39453125,
"learning_rate": 4.046911357233343e-05,
"loss": 0.6893,
"step": 4440
},
{
"epoch": 7.328936521022259,
"grad_norm": 0.412109375,
"learning_rate": 4.02379471392431e-05,
"loss": 0.6902,
"step": 4445
},
{
"epoch": 7.337180544105523,
"grad_norm": 0.419921875,
"learning_rate": 4.000727642415867e-05,
"loss": 0.7053,
"step": 4450
},
{
"epoch": 7.345424567188788,
"grad_norm": 0.4140625,
"learning_rate": 3.977710334046193e-05,
"loss": 0.6942,
"step": 4455
},
{
"epoch": 7.353668590272052,
"grad_norm": 0.404296875,
"learning_rate": 3.954742979740695e-05,
"loss": 0.7078,
"step": 4460
},
{
"epoch": 7.361912613355317,
"grad_norm": 0.4375,
"learning_rate": 3.9318257700104174e-05,
"loss": 0.6932,
"step": 4465
},
{
"epoch": 7.370156636438582,
"grad_norm": 0.470703125,
"learning_rate": 3.9089588949504655e-05,
"loss": 0.6955,
"step": 4470
},
{
"epoch": 7.378400659521847,
"grad_norm": 0.478515625,
"learning_rate": 3.8861425442384135e-05,
"loss": 0.6969,
"step": 4475
},
{
"epoch": 7.3866446826051115,
"grad_norm": 0.4140625,
"learning_rate": 3.863376907132752e-05,
"loss": 0.6949,
"step": 4480
},
{
"epoch": 7.394888705688376,
"grad_norm": 0.396484375,
"learning_rate": 3.840662172471315e-05,
"loss": 0.7005,
"step": 4485
},
{
"epoch": 7.403132728771641,
"grad_norm": 0.453125,
"learning_rate": 3.8179985286696986e-05,
"loss": 0.6935,
"step": 4490
},
{
"epoch": 7.411376751854905,
"grad_norm": 0.40234375,
"learning_rate": 3.7953861637197085e-05,
"loss": 0.6923,
"step": 4495
},
{
"epoch": 7.41962077493817,
"grad_norm": 0.390625,
"learning_rate": 3.772825265187802e-05,
"loss": 0.6923,
"step": 4500
},
{
"epoch": 7.427864798021434,
"grad_norm": 0.421875,
"learning_rate": 3.75031602021353e-05,
"loss": 0.6979,
"step": 4505
},
{
"epoch": 7.436108821104699,
"grad_norm": 0.390625,
"learning_rate": 3.727858615507974e-05,
"loss": 0.6977,
"step": 4510
},
{
"epoch": 7.444352844187963,
"grad_norm": 0.41015625,
"learning_rate": 3.705453237352227e-05,
"loss": 0.7043,
"step": 4515
},
{
"epoch": 7.452596867271229,
"grad_norm": 0.404296875,
"learning_rate": 3.683100071595813e-05,
"loss": 0.6956,
"step": 4520
},
{
"epoch": 7.460840890354493,
"grad_norm": 0.40234375,
"learning_rate": 3.660799303655166e-05,
"loss": 0.6974,
"step": 4525
},
{
"epoch": 7.469084913437758,
"grad_norm": 0.43359375,
"learning_rate": 3.638551118512089e-05,
"loss": 0.7013,
"step": 4530
},
{
"epoch": 7.4773289365210225,
"grad_norm": 0.435546875,
"learning_rate": 3.616355700712221e-05,
"loss": 0.6966,
"step": 4535
},
{
"epoch": 7.485572959604287,
"grad_norm": 0.400390625,
"learning_rate": 3.594213234363486e-05,
"loss": 0.6964,
"step": 4540
},
{
"epoch": 7.493816982687552,
"grad_norm": 0.412109375,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.6922,
"step": 4545
},
{
"epoch": 7.502061005770816,
"grad_norm": 0.419921875,
"learning_rate": 3.550087890253544e-05,
"loss": 0.6948,
"step": 4550
},
{
"epoch": 7.510305028854081,
"grad_norm": 0.423828125,
"learning_rate": 3.5281053785059925e-05,
"loss": 0.695,
"step": 4555
},
{
"epoch": 7.518549051937345,
"grad_norm": 0.39453125,
"learning_rate": 3.506176550233863e-05,
"loss": 0.6949,
"step": 4560
},
{
"epoch": 7.52679307502061,
"grad_norm": 0.44921875,
"learning_rate": 3.484301587333772e-05,
"loss": 0.6903,
"step": 4565
},
{
"epoch": 7.535037098103874,
"grad_norm": 0.404296875,
"learning_rate": 3.462480671255515e-05,
"loss": 0.6983,
"step": 4570
},
{
"epoch": 7.543281121187139,
"grad_norm": 0.416015625,
"learning_rate": 3.440713983000601e-05,
"loss": 0.6964,
"step": 4575
},
{
"epoch": 7.551525144270404,
"grad_norm": 0.412109375,
"learning_rate": 3.419001703120709e-05,
"loss": 0.6934,
"step": 4580
},
{
"epoch": 7.559769167353669,
"grad_norm": 0.392578125,
"learning_rate": 3.397344011716216e-05,
"loss": 0.7035,
"step": 4585
},
{
"epoch": 7.568013190436933,
"grad_norm": 0.40625,
"learning_rate": 3.3757410884346894e-05,
"loss": 0.6827,
"step": 4590
},
{
"epoch": 7.576257213520198,
"grad_norm": 0.39453125,
"learning_rate": 3.354193112469407e-05,
"loss": 0.6979,
"step": 4595
},
{
"epoch": 7.5845012366034625,
"grad_norm": 0.419921875,
"learning_rate": 3.332700262557864e-05,
"loss": 0.7002,
"step": 4600
},
{
"epoch": 7.592745259686727,
"grad_norm": 0.419921875,
"learning_rate": 3.3112627169802946e-05,
"loss": 0.6996,
"step": 4605
},
{
"epoch": 7.600989282769992,
"grad_norm": 0.4140625,
"learning_rate": 3.289880653558188e-05,
"loss": 0.6942,
"step": 4610
},
{
"epoch": 7.609233305853256,
"grad_norm": 0.4296875,
"learning_rate": 3.2685542496528185e-05,
"loss": 0.7002,
"step": 4615
},
{
"epoch": 7.617477328936521,
"grad_norm": 0.38671875,
"learning_rate": 3.2472836821637744e-05,
"loss": 0.6953,
"step": 4620
},
{
"epoch": 7.625721352019785,
"grad_norm": 0.40234375,
"learning_rate": 3.2260691275274835e-05,
"loss": 0.7001,
"step": 4625
},
{
"epoch": 7.633965375103051,
"grad_norm": 0.4140625,
"learning_rate": 3.204910761715763e-05,
"loss": 0.6935,
"step": 4630
},
{
"epoch": 7.642209398186315,
"grad_norm": 0.40625,
"learning_rate": 3.1838087602343344e-05,
"loss": 0.6973,
"step": 4635
},
{
"epoch": 7.65045342126958,
"grad_norm": 0.392578125,
"learning_rate": 3.162763298121408e-05,
"loss": 0.6962,
"step": 4640
},
{
"epoch": 7.658697444352844,
"grad_norm": 0.396484375,
"learning_rate": 3.1417745499461934e-05,
"loss": 0.6986,
"step": 4645
},
{
"epoch": 7.666941467436109,
"grad_norm": 0.40625,
"learning_rate": 3.120842689807468e-05,
"loss": 0.7008,
"step": 4650
},
{
"epoch": 7.6751854905193735,
"grad_norm": 0.396484375,
"learning_rate": 3.099967891332132e-05,
"loss": 0.698,
"step": 4655
},
{
"epoch": 7.683429513602638,
"grad_norm": 0.400390625,
"learning_rate": 3.079150327673766e-05,
"loss": 0.6996,
"step": 4660
},
{
"epoch": 7.691673536685903,
"grad_norm": 0.458984375,
"learning_rate": 3.058390171511196e-05,
"loss": 0.6973,
"step": 4665
},
{
"epoch": 7.699917559769167,
"grad_norm": 0.419921875,
"learning_rate": 3.0376875950470617e-05,
"loss": 0.6972,
"step": 4670
},
{
"epoch": 7.708161582852432,
"grad_norm": 0.447265625,
"learning_rate": 3.0170427700063873e-05,
"loss": 0.6962,
"step": 4675
},
{
"epoch": 7.716405605935696,
"grad_norm": 0.4140625,
"learning_rate": 2.996455867635155e-05,
"loss": 0.7006,
"step": 4680
},
{
"epoch": 7.724649629018961,
"grad_norm": 0.408203125,
"learning_rate": 2.9759270586988865e-05,
"loss": 0.7017,
"step": 4685
},
{
"epoch": 7.732893652102226,
"grad_norm": 0.427734375,
"learning_rate": 2.9554565134812294e-05,
"loss": 0.7051,
"step": 4690
},
{
"epoch": 7.741137675185491,
"grad_norm": 0.392578125,
"learning_rate": 2.9350444017825385e-05,
"loss": 0.6909,
"step": 4695
},
{
"epoch": 7.749381698268755,
"grad_norm": 0.400390625,
"learning_rate": 2.9146908929184713e-05,
"loss": 0.6939,
"step": 4700
},
{
"epoch": 7.75762572135202,
"grad_norm": 0.435546875,
"learning_rate": 2.894396155718585e-05,
"loss": 0.6956,
"step": 4705
},
{
"epoch": 7.765869744435284,
"grad_norm": 0.419921875,
"learning_rate": 2.874160358524931e-05,
"loss": 0.6962,
"step": 4710
},
{
"epoch": 7.774113767518549,
"grad_norm": 0.451171875,
"learning_rate": 2.853983669190664e-05,
"loss": 0.6911,
"step": 4715
},
{
"epoch": 7.7823577906018135,
"grad_norm": 0.423828125,
"learning_rate": 2.8338662550786443e-05,
"loss": 0.6954,
"step": 4720
},
{
"epoch": 7.790601813685078,
"grad_norm": 0.404296875,
"learning_rate": 2.8138082830600554e-05,
"loss": 0.694,
"step": 4725
},
{
"epoch": 7.798845836768343,
"grad_norm": 0.40625,
"learning_rate": 2.7938099195130153e-05,
"loss": 0.6935,
"step": 4730
},
{
"epoch": 7.807089859851608,
"grad_norm": 0.416015625,
"learning_rate": 2.7738713303211982e-05,
"loss": 0.6885,
"step": 4735
},
{
"epoch": 7.815333882934873,
"grad_norm": 0.4375,
"learning_rate": 2.753992680872457e-05,
"loss": 0.7002,
"step": 4740
},
{
"epoch": 7.823577906018137,
"grad_norm": 0.3984375,
"learning_rate": 2.7341741360574548e-05,
"loss": 0.6928,
"step": 4745
},
{
"epoch": 7.831821929101402,
"grad_norm": 0.466796875,
"learning_rate": 2.7144158602682924e-05,
"loss": 0.6959,
"step": 4750
},
{
"epoch": 7.840065952184666,
"grad_norm": 0.40625,
"learning_rate": 2.6947180173971508e-05,
"loss": 0.6907,
"step": 4755
},
{
"epoch": 7.848309975267931,
"grad_norm": 0.435546875,
"learning_rate": 2.6750807708349267e-05,
"loss": 0.6982,
"step": 4760
},
{
"epoch": 7.856553998351195,
"grad_norm": 0.40234375,
"learning_rate": 2.6555042834698773e-05,
"loss": 0.6945,
"step": 4765
},
{
"epoch": 7.86479802143446,
"grad_norm": 0.427734375,
"learning_rate": 2.6359887176862718e-05,
"loss": 0.695,
"step": 4770
},
{
"epoch": 7.8730420445177245,
"grad_norm": 0.396484375,
"learning_rate": 2.6165342353630428e-05,
"loss": 0.694,
"step": 4775
},
{
"epoch": 7.881286067600989,
"grad_norm": 0.412109375,
"learning_rate": 2.5971409978724458e-05,
"loss": 0.6986,
"step": 4780
},
{
"epoch": 7.889530090684254,
"grad_norm": 0.419921875,
"learning_rate": 2.577809166078716e-05,
"loss": 0.6935,
"step": 4785
},
{
"epoch": 7.897774113767518,
"grad_norm": 0.421875,
"learning_rate": 2.558538900336741e-05,
"loss": 0.6991,
"step": 4790
},
{
"epoch": 7.906018136850783,
"grad_norm": 0.384765625,
"learning_rate": 2.5393303604907205e-05,
"loss": 0.6974,
"step": 4795
},
{
"epoch": 7.914262159934048,
"grad_norm": 0.396484375,
"learning_rate": 2.5201837058728505e-05,
"loss": 0.6956,
"step": 4800
},
{
"epoch": 7.922506183017313,
"grad_norm": 0.40625,
"learning_rate": 2.5010990953019975e-05,
"loss": 0.6927,
"step": 4805
},
{
"epoch": 7.930750206100577,
"grad_norm": 0.3984375,
"learning_rate": 2.4820766870823807e-05,
"loss": 0.688,
"step": 4810
},
{
"epoch": 7.938994229183842,
"grad_norm": 0.40625,
"learning_rate": 2.4631166390022574e-05,
"loss": 0.695,
"step": 4815
},
{
"epoch": 7.947238252267106,
"grad_norm": 0.4296875,
"learning_rate": 2.4442191083326195e-05,
"loss": 0.7014,
"step": 4820
},
{
"epoch": 7.955482275350371,
"grad_norm": 0.396484375,
"learning_rate": 2.425384251825882e-05,
"loss": 0.6955,
"step": 4825
},
{
"epoch": 7.963726298433635,
"grad_norm": 0.42578125,
"learning_rate": 2.4066122257145894e-05,
"loss": 0.6934,
"step": 4830
},
{
"epoch": 7.9719703215169,
"grad_norm": 0.388671875,
"learning_rate": 2.387903185710115e-05,
"loss": 0.6909,
"step": 4835
},
{
"epoch": 7.9802143446001645,
"grad_norm": 0.419921875,
"learning_rate": 2.3692572870013718e-05,
"loss": 0.691,
"step": 4840
},
{
"epoch": 7.98845836768343,
"grad_norm": 0.41796875,
"learning_rate": 2.3506746842535242e-05,
"loss": 0.6929,
"step": 4845
},
{
"epoch": 7.9967023907666945,
"grad_norm": 0.40625,
"learning_rate": 2.3321555316067045e-05,
"loss": 0.6928,
"step": 4850
},
{
"epoch": 8.0,
"eval_loss": 2.471337080001831,
"eval_runtime": 0.2361,
"eval_samples_per_second": 42.357,
"eval_steps_per_second": 4.236,
"step": 4852
},
{
"epoch": 8.004946413849959,
"grad_norm": 0.423828125,
"learning_rate": 2.313699982674736e-05,
"loss": 0.6913,
"step": 4855
},
{
"epoch": 8.013190436933224,
"grad_norm": 0.427734375,
"learning_rate": 2.295308190543859e-05,
"loss": 0.6943,
"step": 4860
},
{
"epoch": 8.021434460016488,
"grad_norm": 0.453125,
"learning_rate": 2.276980307771458e-05,
"loss": 0.6958,
"step": 4865
},
{
"epoch": 8.029678483099753,
"grad_norm": 0.3984375,
"learning_rate": 2.2587164863847975e-05,
"loss": 0.6957,
"step": 4870
},
{
"epoch": 8.037922506183017,
"grad_norm": 0.392578125,
"learning_rate": 2.2405168778797646e-05,
"loss": 0.6914,
"step": 4875
},
{
"epoch": 8.046166529266282,
"grad_norm": 0.408203125,
"learning_rate": 2.222381633219608e-05,
"loss": 0.6904,
"step": 4880
},
{
"epoch": 8.054410552349546,
"grad_norm": 0.435546875,
"learning_rate": 2.204310902833685e-05,
"loss": 0.6921,
"step": 4885
},
{
"epoch": 8.062654575432811,
"grad_norm": 0.435546875,
"learning_rate": 2.1863048366162208e-05,
"loss": 0.6926,
"step": 4890
},
{
"epoch": 8.070898598516075,
"grad_norm": 0.380859375,
"learning_rate": 2.1683635839250537e-05,
"loss": 0.6938,
"step": 4895
},
{
"epoch": 8.07914262159934,
"grad_norm": 0.408203125,
"learning_rate": 2.15048729358041e-05,
"loss": 0.6936,
"step": 4900
},
{
"epoch": 8.087386644682605,
"grad_norm": 0.392578125,
"learning_rate": 2.1326761138636553e-05,
"loss": 0.6959,
"step": 4905
},
{
"epoch": 8.09563066776587,
"grad_norm": 0.400390625,
"learning_rate": 2.114930192516076e-05,
"loss": 0.6883,
"step": 4910
},
{
"epoch": 8.103874690849134,
"grad_norm": 0.404296875,
"learning_rate": 2.097249676737648e-05,
"loss": 0.6989,
"step": 4915
},
{
"epoch": 8.112118713932398,
"grad_norm": 0.44921875,
"learning_rate": 2.0796347131858186e-05,
"loss": 0.6915,
"step": 4920
},
{
"epoch": 8.120362737015663,
"grad_norm": 0.408203125,
"learning_rate": 2.0620854479742834e-05,
"loss": 0.6893,
"step": 4925
},
{
"epoch": 8.12860676009893,
"grad_norm": 0.3984375,
"learning_rate": 2.044602026671786e-05,
"loss": 0.699,
"step": 4930
},
{
"epoch": 8.136850783182194,
"grad_norm": 0.3984375,
"learning_rate": 2.027184594300898e-05,
"loss": 0.6962,
"step": 4935
},
{
"epoch": 8.145094806265458,
"grad_norm": 0.40625,
"learning_rate": 2.0098332953368272e-05,
"loss": 0.6869,
"step": 4940
},
{
"epoch": 8.153338829348723,
"grad_norm": 0.40625,
"learning_rate": 1.9925482737062085e-05,
"loss": 0.6957,
"step": 4945
},
{
"epoch": 8.161582852431987,
"grad_norm": 0.40234375,
"learning_rate": 1.9753296727859195e-05,
"loss": 0.692,
"step": 4950
},
{
"epoch": 8.169826875515252,
"grad_norm": 0.39453125,
"learning_rate": 1.9581776354018854e-05,
"loss": 0.6985,
"step": 4955
},
{
"epoch": 8.178070898598516,
"grad_norm": 0.41015625,
"learning_rate": 1.941092303827896e-05,
"loss": 0.6876,
"step": 4960
},
{
"epoch": 8.186314921681781,
"grad_norm": 0.392578125,
"learning_rate": 1.9240738197844278e-05,
"loss": 0.6863,
"step": 4965
},
{
"epoch": 8.194558944765046,
"grad_norm": 0.40234375,
"learning_rate": 1.9071223244374614e-05,
"loss": 0.694,
"step": 4970
},
{
"epoch": 8.20280296784831,
"grad_norm": 0.39453125,
"learning_rate": 1.8902379583973208e-05,
"loss": 0.6936,
"step": 4975
},
{
"epoch": 8.211046990931575,
"grad_norm": 0.404296875,
"learning_rate": 1.8734208617174988e-05,
"loss": 0.6926,
"step": 4980
},
{
"epoch": 8.21929101401484,
"grad_norm": 0.3984375,
"learning_rate": 1.856671173893497e-05,
"loss": 0.6921,
"step": 4985
},
{
"epoch": 8.227535037098104,
"grad_norm": 0.408203125,
"learning_rate": 1.839989033861673e-05,
"loss": 0.6893,
"step": 4990
},
{
"epoch": 8.235779060181368,
"grad_norm": 0.4140625,
"learning_rate": 1.8233745799980817e-05,
"loss": 0.6931,
"step": 4995
},
{
"epoch": 8.244023083264633,
"grad_norm": 0.408203125,
"learning_rate": 1.8068279501173335e-05,
"loss": 0.6842,
"step": 5000
},
{
"epoch": 8.252267106347897,
"grad_norm": 0.400390625,
"learning_rate": 1.790349281471445e-05,
"loss": 0.6998,
"step": 5005
},
{
"epoch": 8.260511129431162,
"grad_norm": 0.404296875,
"learning_rate": 1.773938710748706e-05,
"loss": 0.6946,
"step": 5010
},
{
"epoch": 8.268755152514426,
"grad_norm": 0.39453125,
"learning_rate": 1.757596374072543e-05,
"loss": 0.6901,
"step": 5015
},
{
"epoch": 8.276999175597691,
"grad_norm": 0.412109375,
"learning_rate": 1.741322407000391e-05,
"loss": 0.6938,
"step": 5020
},
{
"epoch": 8.285243198680956,
"grad_norm": 0.416015625,
"learning_rate": 1.7251169445225657e-05,
"loss": 0.6922,
"step": 5025
},
{
"epoch": 8.29348722176422,
"grad_norm": 0.392578125,
"learning_rate": 1.70898012106115e-05,
"loss": 0.6844,
"step": 5030
},
{
"epoch": 8.301731244847485,
"grad_norm": 0.404296875,
"learning_rate": 1.692912070468874e-05,
"loss": 0.6968,
"step": 5035
},
{
"epoch": 8.309975267930751,
"grad_norm": 0.390625,
"learning_rate": 1.676912926028007e-05,
"loss": 0.6977,
"step": 5040
},
{
"epoch": 8.318219291014016,
"grad_norm": 0.40625,
"learning_rate": 1.660982820449247e-05,
"loss": 0.6995,
"step": 5045
},
{
"epoch": 8.32646331409728,
"grad_norm": 0.38671875,
"learning_rate": 1.6451218858706374e-05,
"loss": 0.6934,
"step": 5050
},
{
"epoch": 8.334707337180545,
"grad_norm": 0.400390625,
"learning_rate": 1.6293302538564382e-05,
"loss": 0.6954,
"step": 5055
},
{
"epoch": 8.34295136026381,
"grad_norm": 0.423828125,
"learning_rate": 1.6136080553960687e-05,
"loss": 0.6942,
"step": 5060
},
{
"epoch": 8.351195383347074,
"grad_norm": 0.408203125,
"learning_rate": 1.5979554209030024e-05,
"loss": 0.6887,
"step": 5065
},
{
"epoch": 8.359439406430338,
"grad_norm": 0.388671875,
"learning_rate": 1.5823724802136865e-05,
"loss": 0.6948,
"step": 5070
},
{
"epoch": 8.367683429513603,
"grad_norm": 0.404296875,
"learning_rate": 1.5668593625864715e-05,
"loss": 0.695,
"step": 5075
},
{
"epoch": 8.375927452596867,
"grad_norm": 0.396484375,
"learning_rate": 1.5514161967005337e-05,
"loss": 0.7057,
"step": 5080
},
{
"epoch": 8.384171475680132,
"grad_norm": 0.419921875,
"learning_rate": 1.536043110654809e-05,
"loss": 0.6906,
"step": 5085
},
{
"epoch": 8.392415498763397,
"grad_norm": 0.4140625,
"learning_rate": 1.5207402319669306e-05,
"loss": 0.6909,
"step": 5090
},
{
"epoch": 8.400659521846661,
"grad_norm": 0.40625,
"learning_rate": 1.505507687572173e-05,
"loss": 0.6841,
"step": 5095
},
{
"epoch": 8.408903544929926,
"grad_norm": 0.392578125,
"learning_rate": 1.4903456038223939e-05,
"loss": 0.6889,
"step": 5100
},
{
"epoch": 8.41714756801319,
"grad_norm": 0.38671875,
"learning_rate": 1.4752541064849946e-05,
"loss": 0.6908,
"step": 5105
},
{
"epoch": 8.425391591096455,
"grad_norm": 0.392578125,
"learning_rate": 1.4602333207418651e-05,
"loss": 0.6949,
"step": 5110
},
{
"epoch": 8.43363561417972,
"grad_norm": 0.400390625,
"learning_rate": 1.4452833711883628e-05,
"loss": 0.691,
"step": 5115
},
{
"epoch": 8.441879637262984,
"grad_norm": 0.388671875,
"learning_rate": 1.4304043818322565e-05,
"loss": 0.6855,
"step": 5120
},
{
"epoch": 8.450123660346248,
"grad_norm": 0.404296875,
"learning_rate": 1.4155964760927176e-05,
"loss": 0.6937,
"step": 5125
},
{
"epoch": 8.458367683429513,
"grad_norm": 0.390625,
"learning_rate": 1.4008597767992871e-05,
"loss": 0.6922,
"step": 5130
},
{
"epoch": 8.466611706512778,
"grad_norm": 0.423828125,
"learning_rate": 1.3861944061908583e-05,
"loss": 0.6929,
"step": 5135
},
{
"epoch": 8.474855729596042,
"grad_norm": 0.39453125,
"learning_rate": 1.3716004859146592e-05,
"loss": 0.6898,
"step": 5140
},
{
"epoch": 8.483099752679308,
"grad_norm": 0.3984375,
"learning_rate": 1.3570781370252582e-05,
"loss": 0.6851,
"step": 5145
},
{
"epoch": 8.491343775762573,
"grad_norm": 0.416015625,
"learning_rate": 1.3426274799835337e-05,
"loss": 0.6846,
"step": 5150
},
{
"epoch": 8.499587798845837,
"grad_norm": 0.412109375,
"learning_rate": 1.328248634655701e-05,
"loss": 0.7024,
"step": 5155
},
{
"epoch": 8.507831821929102,
"grad_norm": 0.419921875,
"learning_rate": 1.3139417203123027e-05,
"loss": 0.6881,
"step": 5160
},
{
"epoch": 8.516075845012367,
"grad_norm": 0.404296875,
"learning_rate": 1.2997068556272263e-05,
"loss": 0.7002,
"step": 5165
},
{
"epoch": 8.524319868095631,
"grad_norm": 0.40234375,
"learning_rate": 1.2855441586767113e-05,
"loss": 0.6909,
"step": 5170
},
{
"epoch": 8.532563891178896,
"grad_norm": 0.39453125,
"learning_rate": 1.2714537469383858e-05,
"loss": 0.6878,
"step": 5175
},
{
"epoch": 8.54080791426216,
"grad_norm": 0.390625,
"learning_rate": 1.2574357372902767e-05,
"loss": 0.6917,
"step": 5180
},
{
"epoch": 8.549051937345425,
"grad_norm": 0.40234375,
"learning_rate": 1.243490246009842e-05,
"loss": 0.689,
"step": 5185
},
{
"epoch": 8.55729596042869,
"grad_norm": 0.41015625,
"learning_rate": 1.2296173887730123e-05,
"loss": 0.6859,
"step": 5190
},
{
"epoch": 8.565539983511954,
"grad_norm": 0.392578125,
"learning_rate": 1.215817280653232e-05,
"loss": 0.6858,
"step": 5195
},
{
"epoch": 8.573784006595218,
"grad_norm": 0.412109375,
"learning_rate": 1.2020900361204968e-05,
"loss": 0.6894,
"step": 5200
},
{
"epoch": 8.582028029678483,
"grad_norm": 0.396484375,
"learning_rate": 1.1884357690404158e-05,
"loss": 0.6886,
"step": 5205
},
{
"epoch": 8.590272052761748,
"grad_norm": 0.396484375,
"learning_rate": 1.1748545926732535e-05,
"loss": 0.6903,
"step": 5210
},
{
"epoch": 8.598516075845012,
"grad_norm": 0.392578125,
"learning_rate": 1.1613466196729984e-05,
"loss": 0.7021,
"step": 5215
},
{
"epoch": 8.606760098928277,
"grad_norm": 0.408203125,
"learning_rate": 1.1479119620864276e-05,
"loss": 0.6826,
"step": 5220
},
{
"epoch": 8.615004122011541,
"grad_norm": 0.40234375,
"learning_rate": 1.1345507313521786e-05,
"loss": 0.6954,
"step": 5225
},
{
"epoch": 8.623248145094806,
"grad_norm": 0.40625,
"learning_rate": 1.1212630382998213e-05,
"loss": 0.6877,
"step": 5230
},
{
"epoch": 8.63149216817807,
"grad_norm": 0.388671875,
"learning_rate": 1.1080489931489391e-05,
"loss": 0.696,
"step": 5235
},
{
"epoch": 8.639736191261335,
"grad_norm": 0.392578125,
"learning_rate": 1.0949087055082252e-05,
"loss": 0.6977,
"step": 5240
},
{
"epoch": 8.6479802143446,
"grad_norm": 0.38671875,
"learning_rate": 1.0818422843745512e-05,
"loss": 0.6924,
"step": 5245
},
{
"epoch": 8.656224237427864,
"grad_norm": 0.40234375,
"learning_rate": 1.0688498381320855e-05,
"loss": 0.6941,
"step": 5250
},
{
"epoch": 8.664468260511129,
"grad_norm": 0.390625,
"learning_rate": 1.0559314745513805e-05,
"loss": 0.6878,
"step": 5255
},
{
"epoch": 8.672712283594395,
"grad_norm": 0.41015625,
"learning_rate": 1.0430873007884857e-05,
"loss": 0.6975,
"step": 5260
},
{
"epoch": 8.68095630667766,
"grad_norm": 0.40625,
"learning_rate": 1.0303174233840528e-05,
"loss": 0.6863,
"step": 5265
},
{
"epoch": 8.689200329760924,
"grad_norm": 0.41015625,
"learning_rate": 1.0176219482624616e-05,
"loss": 0.7022,
"step": 5270
},
{
"epoch": 8.697444352844188,
"grad_norm": 0.4140625,
"learning_rate": 1.0050009807309325e-05,
"loss": 0.6892,
"step": 5275
},
{
"epoch": 8.705688375927453,
"grad_norm": 0.39453125,
"learning_rate": 9.924546254786493e-06,
"loss": 0.6839,
"step": 5280
},
{
"epoch": 8.713932399010718,
"grad_norm": 0.41796875,
"learning_rate": 9.799829865759069e-06,
"loss": 0.6821,
"step": 5285
},
{
"epoch": 8.722176422093982,
"grad_norm": 0.388671875,
"learning_rate": 9.675861674732312e-06,
"loss": 0.6885,
"step": 5290
},
{
"epoch": 8.730420445177247,
"grad_norm": 0.421875,
"learning_rate": 9.552642710005299e-06,
"loss": 0.6965,
"step": 5295
},
{
"epoch": 8.738664468260511,
"grad_norm": 0.404296875,
"learning_rate": 9.430173993662451e-06,
"loss": 0.6971,
"step": 5300
},
{
"epoch": 8.746908491343776,
"grad_norm": 0.396484375,
"learning_rate": 9.308456541564881e-06,
"loss": 0.6847,
"step": 5305
},
{
"epoch": 8.75515251442704,
"grad_norm": 0.404296875,
"learning_rate": 9.187491363342093e-06,
"loss": 0.6982,
"step": 5310
},
{
"epoch": 8.763396537510305,
"grad_norm": 0.443359375,
"learning_rate": 9.067279462383615e-06,
"loss": 0.6906,
"step": 5315
},
{
"epoch": 8.77164056059357,
"grad_norm": 0.41015625,
"learning_rate": 8.947821835830616e-06,
"loss": 0.6981,
"step": 5320
},
{
"epoch": 8.779884583676834,
"grad_norm": 0.416015625,
"learning_rate": 8.829119474567671e-06,
"loss": 0.6972,
"step": 5325
},
{
"epoch": 8.788128606760099,
"grad_norm": 0.408203125,
"learning_rate": 8.711173363214553e-06,
"loss": 0.6875,
"step": 5330
},
{
"epoch": 8.796372629843363,
"grad_norm": 0.43359375,
"learning_rate": 8.593984480118011e-06,
"loss": 0.6904,
"step": 5335
},
{
"epoch": 8.804616652926628,
"grad_norm": 0.412109375,
"learning_rate": 8.47755379734373e-06,
"loss": 0.6886,
"step": 5340
},
{
"epoch": 8.812860676009892,
"grad_norm": 0.431640625,
"learning_rate": 8.361882280668165e-06,
"loss": 0.6919,
"step": 5345
},
{
"epoch": 8.821104699093157,
"grad_norm": 0.388671875,
"learning_rate": 8.24697088957066e-06,
"loss": 0.6934,
"step": 5350
},
{
"epoch": 8.829348722176421,
"grad_norm": 0.38671875,
"learning_rate": 8.132820577225387e-06,
"loss": 0.6882,
"step": 5355
},
{
"epoch": 8.837592745259688,
"grad_norm": 0.390625,
"learning_rate": 8.019432290493457e-06,
"loss": 0.7015,
"step": 5360
},
{
"epoch": 8.845836768342952,
"grad_norm": 0.39453125,
"learning_rate": 7.906806969915148e-06,
"loss": 0.689,
"step": 5365
},
{
"epoch": 8.854080791426217,
"grad_norm": 0.400390625,
"learning_rate": 7.794945549701993e-06,
"loss": 0.6866,
"step": 5370
},
{
"epoch": 8.862324814509481,
"grad_norm": 0.40234375,
"learning_rate": 7.683848957729056e-06,
"loss": 0.696,
"step": 5375
},
{
"epoch": 8.870568837592746,
"grad_norm": 0.4140625,
"learning_rate": 7.573518115527289e-06,
"loss": 0.6824,
"step": 5380
},
{
"epoch": 8.87881286067601,
"grad_norm": 0.39453125,
"learning_rate": 7.463953938275858e-06,
"loss": 0.6941,
"step": 5385
},
{
"epoch": 8.887056883759275,
"grad_norm": 0.390625,
"learning_rate": 7.355157334794516e-06,
"loss": 0.6901,
"step": 5390
},
{
"epoch": 8.89530090684254,
"grad_norm": 0.404296875,
"learning_rate": 7.247129207536152e-06,
"loss": 0.688,
"step": 5395
},
{
"epoch": 8.903544929925804,
"grad_norm": 0.39453125,
"learning_rate": 7.1398704525792e-06,
"loss": 0.6906,
"step": 5400
},
{
"epoch": 8.911788953009069,
"grad_norm": 0.42578125,
"learning_rate": 7.0333819596203e-06,
"loss": 0.6878,
"step": 5405
},
{
"epoch": 8.920032976092333,
"grad_norm": 0.404296875,
"learning_rate": 6.927664611966811e-06,
"loss": 0.6965,
"step": 5410
},
{
"epoch": 8.928276999175598,
"grad_norm": 0.40234375,
"learning_rate": 6.8227192865295995e-06,
"loss": 0.69,
"step": 5415
},
{
"epoch": 8.936521022258862,
"grad_norm": 0.390625,
"learning_rate": 6.718546853815688e-06,
"loss": 0.6857,
"step": 5420
},
{
"epoch": 8.944765045342127,
"grad_norm": 0.4140625,
"learning_rate": 6.6151481779211155e-06,
"loss": 0.6922,
"step": 5425
},
{
"epoch": 8.953009068425391,
"grad_norm": 0.40234375,
"learning_rate": 6.512524116523633e-06,
"loss": 0.6885,
"step": 5430
},
{
"epoch": 8.961253091508656,
"grad_norm": 0.4375,
"learning_rate": 6.410675520875742e-06,
"loss": 0.6854,
"step": 5435
},
{
"epoch": 8.96949711459192,
"grad_norm": 0.40625,
"learning_rate": 6.30960323579749e-06,
"loss": 0.6966,
"step": 5440
},
{
"epoch": 8.977741137675185,
"grad_norm": 0.392578125,
"learning_rate": 6.209308099669597e-06,
"loss": 0.6962,
"step": 5445
},
{
"epoch": 8.98598516075845,
"grad_norm": 0.408203125,
"learning_rate": 6.109790944426397e-06,
"loss": 0.707,
"step": 5450
},
{
"epoch": 8.994229183841714,
"grad_norm": 0.41015625,
"learning_rate": 6.011052595549038e-06,
"loss": 0.6924,
"step": 5455
},
{
"epoch": 8.999175597691673,
"eval_loss": 2.4814510345458984,
"eval_runtime": 0.2587,
"eval_samples_per_second": 38.654,
"eval_steps_per_second": 3.865,
"step": 5458
},
{
"epoch": 9.002473206924979,
"grad_norm": 0.3984375,
"learning_rate": 5.913093872058528e-06,
"loss": 0.6875,
"step": 5460
},
{
"epoch": 9.010717230008243,
"grad_norm": 0.404296875,
"learning_rate": 5.81591558650898e-06,
"loss": 0.6871,
"step": 5465
},
{
"epoch": 9.01896125309151,
"grad_norm": 0.396484375,
"learning_rate": 5.719518544980929e-06,
"loss": 0.6887,
"step": 5470
},
{
"epoch": 9.027205276174774,
"grad_norm": 0.4453125,
"learning_rate": 5.623903547074549e-06,
"loss": 0.7051,
"step": 5475
},
{
"epoch": 9.035449299258039,
"grad_norm": 0.40625,
"learning_rate": 5.529071385903084e-06,
"loss": 0.694,
"step": 5480
},
{
"epoch": 9.043693322341303,
"grad_norm": 0.40234375,
"learning_rate": 5.43502284808628e-06,
"loss": 0.6839,
"step": 5485
},
{
"epoch": 9.051937345424568,
"grad_norm": 0.396484375,
"learning_rate": 5.341758713743828e-06,
"loss": 0.6906,
"step": 5490
},
{
"epoch": 9.060181368507832,
"grad_norm": 0.39453125,
"learning_rate": 5.249279756488878e-06,
"loss": 0.6895,
"step": 5495
},
{
"epoch": 9.068425391591097,
"grad_norm": 0.396484375,
"learning_rate": 5.157586743421672e-06,
"loss": 0.6937,
"step": 5500
},
{
"epoch": 9.076669414674361,
"grad_norm": 0.392578125,
"learning_rate": 5.066680435123106e-06,
"loss": 0.7007,
"step": 5505
},
{
"epoch": 9.084913437757626,
"grad_norm": 0.384765625,
"learning_rate": 4.976561585648509e-06,
"loss": 0.6864,
"step": 5510
},
{
"epoch": 9.09315746084089,
"grad_norm": 0.3984375,
"learning_rate": 4.887230942521337e-06,
"loss": 0.6886,
"step": 5515
},
{
"epoch": 9.101401483924155,
"grad_norm": 0.404296875,
"learning_rate": 4.798689246727006e-06,
"loss": 0.6965,
"step": 5520
},
{
"epoch": 9.10964550700742,
"grad_norm": 0.3984375,
"learning_rate": 4.710937232706691e-06,
"loss": 0.6888,
"step": 5525
},
{
"epoch": 9.117889530090684,
"grad_norm": 0.404296875,
"learning_rate": 4.623975628351273e-06,
"loss": 0.6937,
"step": 5530
},
{
"epoch": 9.126133553173949,
"grad_norm": 0.396484375,
"learning_rate": 4.537805154995278e-06,
"loss": 0.6989,
"step": 5535
},
{
"epoch": 9.134377576257213,
"grad_norm": 0.408203125,
"learning_rate": 4.452426527410947e-06,
"loss": 0.69,
"step": 5540
},
{
"epoch": 9.142621599340478,
"grad_norm": 0.4296875,
"learning_rate": 4.36784045380223e-06,
"loss": 0.6952,
"step": 5545
},
{
"epoch": 9.150865622423742,
"grad_norm": 0.39453125,
"learning_rate": 4.2840476357989825e-06,
"loss": 0.6909,
"step": 5550
},
{
"epoch": 9.159109645507007,
"grad_norm": 0.39453125,
"learning_rate": 4.20104876845111e-06,
"loss": 0.6835,
"step": 5555
},
{
"epoch": 9.167353668590271,
"grad_norm": 0.404296875,
"learning_rate": 4.118844540222788e-06,
"loss": 0.7042,
"step": 5560
},
{
"epoch": 9.175597691673536,
"grad_norm": 0.404296875,
"learning_rate": 4.037435632986786e-06,
"loss": 0.693,
"step": 5565
},
{
"epoch": 9.1838417147568,
"grad_norm": 0.39453125,
"learning_rate": 3.95682272201876e-06,
"loss": 0.6854,
"step": 5570
},
{
"epoch": 9.192085737840065,
"grad_norm": 0.392578125,
"learning_rate": 3.877006475991729e-06,
"loss": 0.6937,
"step": 5575
},
{
"epoch": 9.200329760923331,
"grad_norm": 0.3984375,
"learning_rate": 3.797987556970495e-06,
"loss": 0.6968,
"step": 5580
},
{
"epoch": 9.208573784006596,
"grad_norm": 0.400390625,
"learning_rate": 3.7197666204060955e-06,
"loss": 0.6902,
"step": 5585
},
{
"epoch": 9.21681780708986,
"grad_norm": 0.400390625,
"learning_rate": 3.6423443151304526e-06,
"loss": 0.6896,
"step": 5590
},
{
"epoch": 9.225061830173125,
"grad_norm": 0.41796875,
"learning_rate": 3.565721283350931e-06,
"loss": 0.696,
"step": 5595
},
{
"epoch": 9.23330585325639,
"grad_norm": 0.408203125,
"learning_rate": 3.4898981606450333e-06,
"loss": 0.6895,
"step": 5600
},
{
"epoch": 9.241549876339654,
"grad_norm": 0.39453125,
"learning_rate": 3.414875575955101e-06,
"loss": 0.6845,
"step": 5605
},
{
"epoch": 9.249793899422919,
"grad_norm": 0.400390625,
"learning_rate": 3.3406541515832003e-06,
"loss": 0.6908,
"step": 5610
},
{
"epoch": 9.258037922506183,
"grad_norm": 0.396484375,
"learning_rate": 3.267234503185823e-06,
"loss": 0.6885,
"step": 5615
},
{
"epoch": 9.266281945589448,
"grad_norm": 0.3984375,
"learning_rate": 3.1946172397688267e-06,
"loss": 0.6921,
"step": 5620
},
{
"epoch": 9.274525968672712,
"grad_norm": 0.404296875,
"learning_rate": 3.1228029636824475e-06,
"loss": 0.6927,
"step": 5625
},
{
"epoch": 9.282769991755977,
"grad_norm": 0.39453125,
"learning_rate": 3.051792270616216e-06,
"loss": 0.689,
"step": 5630
},
{
"epoch": 9.291014014839241,
"grad_norm": 0.416015625,
"learning_rate": 2.981585749594051e-06,
"loss": 0.6962,
"step": 5635
},
{
"epoch": 9.299258037922506,
"grad_norm": 0.39453125,
"learning_rate": 2.912183982969385e-06,
"loss": 0.6873,
"step": 5640
},
{
"epoch": 9.30750206100577,
"grad_norm": 0.39453125,
"learning_rate": 2.8435875464203343e-06,
"loss": 0.6839,
"step": 5645
},
{
"epoch": 9.315746084089035,
"grad_norm": 0.3828125,
"learning_rate": 2.7757970089449024e-06,
"loss": 0.6884,
"step": 5650
},
{
"epoch": 9.3239901071723,
"grad_norm": 0.3984375,
"learning_rate": 2.708812932856253e-06,
"loss": 0.6865,
"step": 5655
},
{
"epoch": 9.332234130255564,
"grad_norm": 0.396484375,
"learning_rate": 2.6426358737781098e-06,
"loss": 0.6944,
"step": 5660
},
{
"epoch": 9.340478153338829,
"grad_norm": 0.39453125,
"learning_rate": 2.577266380640053e-06,
"loss": 0.6866,
"step": 5665
},
{
"epoch": 9.348722176422093,
"grad_norm": 0.40234375,
"learning_rate": 2.5127049956730207e-06,
"loss": 0.6917,
"step": 5670
},
{
"epoch": 9.356966199505358,
"grad_norm": 0.384765625,
"learning_rate": 2.448952254404846e-06,
"loss": 0.6984,
"step": 5675
},
{
"epoch": 9.365210222588622,
"grad_norm": 0.39453125,
"learning_rate": 2.3860086856557383e-06,
"loss": 0.6881,
"step": 5680
},
{
"epoch": 9.373454245671887,
"grad_norm": 0.41015625,
"learning_rate": 2.3238748115339324e-06,
"loss": 0.689,
"step": 5685
},
{
"epoch": 9.381698268755153,
"grad_norm": 0.39453125,
"learning_rate": 2.2625511474313685e-06,
"loss": 0.6968,
"step": 5690
},
{
"epoch": 9.389942291838418,
"grad_norm": 0.43359375,
"learning_rate": 2.2020382020194074e-06,
"loss": 0.6923,
"step": 5695
},
{
"epoch": 9.398186314921682,
"grad_norm": 0.439453125,
"learning_rate": 2.1423364772445887e-06,
"loss": 0.6929,
"step": 5700
},
{
"epoch": 9.406430338004947,
"grad_norm": 0.41015625,
"learning_rate": 2.0834464683245346e-06,
"loss": 0.6948,
"step": 5705
},
{
"epoch": 9.414674361088212,
"grad_norm": 0.40234375,
"learning_rate": 2.025368663743743e-06,
"loss": 0.6956,
"step": 5710
},
{
"epoch": 9.422918384171476,
"grad_norm": 0.435546875,
"learning_rate": 1.968103545249611e-06,
"loss": 0.6857,
"step": 5715
},
{
"epoch": 9.43116240725474,
"grad_norm": 0.427734375,
"learning_rate": 1.91165158784844e-06,
"loss": 0.6871,
"step": 5720
},
{
"epoch": 9.439406430338005,
"grad_norm": 0.390625,
"learning_rate": 1.8560132598014368e-06,
"loss": 0.6864,
"step": 5725
},
{
"epoch": 9.44765045342127,
"grad_norm": 0.400390625,
"learning_rate": 1.8011890226208527e-06,
"loss": 0.6922,
"step": 5730
},
{
"epoch": 9.455894476504534,
"grad_norm": 0.3984375,
"learning_rate": 1.7471793310662287e-06,
"loss": 0.6973,
"step": 5735
},
{
"epoch": 9.464138499587799,
"grad_norm": 0.396484375,
"learning_rate": 1.6939846331405108e-06,
"loss": 0.6954,
"step": 5740
},
{
"epoch": 9.472382522671063,
"grad_norm": 0.392578125,
"learning_rate": 1.6416053700863964e-06,
"loss": 0.6983,
"step": 5745
},
{
"epoch": 9.480626545754328,
"grad_norm": 0.408203125,
"learning_rate": 1.5900419763826614e-06,
"loss": 0.6904,
"step": 5750
},
{
"epoch": 9.488870568837593,
"grad_norm": 0.41015625,
"learning_rate": 1.5392948797405827e-06,
"loss": 0.7001,
"step": 5755
},
{
"epoch": 9.497114591920857,
"grad_norm": 0.42578125,
"learning_rate": 1.489364501100332e-06,
"loss": 0.6978,
"step": 5760
},
{
"epoch": 9.505358615004122,
"grad_norm": 0.3984375,
"learning_rate": 1.4402512546275114e-06,
"loss": 0.6974,
"step": 5765
},
{
"epoch": 9.513602638087386,
"grad_norm": 0.42578125,
"learning_rate": 1.3919555477097668e-06,
"loss": 0.6885,
"step": 5770
},
{
"epoch": 9.52184666117065,
"grad_norm": 0.416015625,
"learning_rate": 1.344477780953346e-06,
"loss": 0.6884,
"step": 5775
},
{
"epoch": 9.530090684253915,
"grad_norm": 0.400390625,
"learning_rate": 1.2978183481797801e-06,
"loss": 0.6899,
"step": 5780
},
{
"epoch": 9.53833470733718,
"grad_norm": 0.392578125,
"learning_rate": 1.251977636422641e-06,
"loss": 0.6897,
"step": 5785
},
{
"epoch": 9.546578730420444,
"grad_norm": 0.390625,
"learning_rate": 1.2069560259243328e-06,
"loss": 0.6933,
"step": 5790
},
{
"epoch": 9.55482275350371,
"grad_norm": 0.4140625,
"learning_rate": 1.1627538901329172e-06,
"loss": 0.6868,
"step": 5795
},
{
"epoch": 9.563066776586975,
"grad_norm": 0.39453125,
"learning_rate": 1.1193715956990258e-06,
"loss": 0.6855,
"step": 5800
},
{
"epoch": 9.57131079967024,
"grad_norm": 0.400390625,
"learning_rate": 1.076809502472831e-06,
"loss": 0.6977,
"step": 5805
},
{
"epoch": 9.579554822753504,
"grad_norm": 0.396484375,
"learning_rate": 1.035067963501024e-06,
"loss": 0.6969,
"step": 5810
},
{
"epoch": 9.587798845836769,
"grad_norm": 0.400390625,
"learning_rate": 9.94147325023953e-07,
"loss": 0.6982,
"step": 5815
},
{
"epoch": 9.596042868920033,
"grad_norm": 0.388671875,
"learning_rate": 9.540479264726676e-07,
"loss": 0.6865,
"step": 5820
},
{
"epoch": 9.604286892003298,
"grad_norm": 0.40625,
"learning_rate": 9.147701004661446e-07,
"loss": 0.6897,
"step": 5825
},
{
"epoch": 9.612530915086563,
"grad_norm": 0.404296875,
"learning_rate": 8.763141728085789e-07,
"loss": 0.6837,
"step": 5830
},
{
"epoch": 9.620774938169827,
"grad_norm": 0.396484375,
"learning_rate": 8.386804624865851e-07,
"loss": 0.6865,
"step": 5835
},
{
"epoch": 9.629018961253092,
"grad_norm": 0.39453125,
"learning_rate": 8.018692816666118e-07,
"loss": 0.6907,
"step": 5840
},
{
"epoch": 9.637262984336356,
"grad_norm": 0.39453125,
"learning_rate": 7.658809356923424e-07,
"loss": 0.6902,
"step": 5845
},
{
"epoch": 9.64550700741962,
"grad_norm": 0.39453125,
"learning_rate": 7.307157230821426e-07,
"loss": 0.6925,
"step": 5850
},
{
"epoch": 9.653751030502885,
"grad_norm": 0.3984375,
"learning_rate": 6.963739355266286e-07,
"loss": 0.6911,
"step": 5855
},
{
"epoch": 9.66199505358615,
"grad_norm": 0.39453125,
"learning_rate": 6.628558578862021e-07,
"loss": 0.6838,
"step": 5860
},
{
"epoch": 9.670239076669414,
"grad_norm": 0.388671875,
"learning_rate": 6.301617681886863e-07,
"loss": 0.6883,
"step": 5865
},
{
"epoch": 9.678483099752679,
"grad_norm": 0.408203125,
"learning_rate": 5.982919376270823e-07,
"loss": 0.6908,
"step": 5870
},
{
"epoch": 9.686727122835944,
"grad_norm": 0.416015625,
"learning_rate": 5.672466305572388e-07,
"loss": 0.6908,
"step": 5875
},
{
"epoch": 9.694971145919208,
"grad_norm": 0.408203125,
"learning_rate": 5.370261044956971e-07,
"loss": 0.6962,
"step": 5880
},
{
"epoch": 9.703215169002473,
"grad_norm": 0.396484375,
"learning_rate": 5.07630610117582e-07,
"loss": 0.6932,
"step": 5885
},
{
"epoch": 9.711459192085737,
"grad_norm": 0.390625,
"learning_rate": 4.790603912544489e-07,
"loss": 0.6878,
"step": 5890
},
{
"epoch": 9.719703215169002,
"grad_norm": 0.400390625,
"learning_rate": 4.5131568489236166e-07,
"loss": 0.6946,
"step": 5895
},
{
"epoch": 9.727947238252266,
"grad_norm": 0.4296875,
"learning_rate": 4.2439672116982855e-07,
"loss": 0.6853,
"step": 5900
},
{
"epoch": 9.73619126133553,
"grad_norm": 0.396484375,
"learning_rate": 3.983037233759368e-07,
"loss": 0.6914,
"step": 5905
},
{
"epoch": 9.744435284418797,
"grad_norm": 0.404296875,
"learning_rate": 3.73036907948543e-07,
"loss": 0.6898,
"step": 5910
},
{
"epoch": 9.752679307502062,
"grad_norm": 0.388671875,
"learning_rate": 3.485964844723744e-07,
"loss": 0.6888,
"step": 5915
},
{
"epoch": 9.760923330585326,
"grad_norm": 0.412109375,
"learning_rate": 3.2498265567739717e-07,
"loss": 0.6824,
"step": 5920
},
{
"epoch": 9.76916735366859,
"grad_norm": 0.3984375,
"learning_rate": 3.0219561743707326e-07,
"loss": 0.691,
"step": 5925
},
{
"epoch": 9.777411376751855,
"grad_norm": 0.40234375,
"learning_rate": 2.8023555876673937e-07,
"loss": 0.6862,
"step": 5930
},
{
"epoch": 9.78565539983512,
"grad_norm": 0.396484375,
"learning_rate": 2.5910266182207486e-07,
"loss": 0.6933,
"step": 5935
},
{
"epoch": 9.793899422918384,
"grad_norm": 0.400390625,
"learning_rate": 2.3879710189753656e-07,
"loss": 0.6926,
"step": 5940
},
{
"epoch": 9.802143446001649,
"grad_norm": 0.3984375,
"learning_rate": 2.1931904742495957e-07,
"loss": 0.6807,
"step": 5945
},
{
"epoch": 9.810387469084914,
"grad_norm": 0.388671875,
"learning_rate": 2.0066865997212525e-07,
"loss": 0.6923,
"step": 5950
},
{
"epoch": 9.818631492168178,
"grad_norm": 0.39453125,
"learning_rate": 1.8284609424142895e-07,
"loss": 0.6885,
"step": 5955
},
{
"epoch": 9.826875515251443,
"grad_norm": 0.392578125,
"learning_rate": 1.6585149806860324e-07,
"loss": 0.6862,
"step": 5960
},
{
"epoch": 9.835119538334707,
"grad_norm": 0.4140625,
"learning_rate": 1.4968501242148547e-07,
"loss": 0.6955,
"step": 5965
},
{
"epoch": 9.843363561417972,
"grad_norm": 0.404296875,
"learning_rate": 1.3434677139885222e-07,
"loss": 0.6957,
"step": 5970
},
{
"epoch": 9.851607584501236,
"grad_norm": 0.419921875,
"learning_rate": 1.1983690222929778e-07,
"loss": 0.6915,
"step": 5975
},
{
"epoch": 9.8598516075845,
"grad_norm": 0.39453125,
"learning_rate": 1.0615552527017958e-07,
"loss": 0.701,
"step": 5980
},
{
"epoch": 9.868095630667765,
"grad_norm": 0.40234375,
"learning_rate": 9.330275400666332e-08,
"loss": 0.6959,
"step": 5985
},
{
"epoch": 9.87633965375103,
"grad_norm": 0.396484375,
"learning_rate": 8.127869505069053e-08,
"loss": 0.6885,
"step": 5990
},
{
"epoch": 9.884583676834295,
"grad_norm": 0.40234375,
"learning_rate": 7.00834481402013e-08,
"loss": 0.6842,
"step": 5995
},
{
"epoch": 9.892827699917559,
"grad_norm": 0.38671875,
"learning_rate": 5.971710613821291e-08,
"loss": 0.6956,
"step": 6000
},
{
"epoch": 9.901071723000824,
"grad_norm": 0.3984375,
"learning_rate": 5.0179755032109253e-08,
"loss": 0.6898,
"step": 6005
},
{
"epoch": 9.90931574608409,
"grad_norm": 0.3828125,
"learning_rate": 4.147147393290807e-08,
"loss": 0.6899,
"step": 6010
},
{
"epoch": 9.917559769167354,
"grad_norm": 0.404296875,
"learning_rate": 3.359233507459481e-08,
"loss": 0.697,
"step": 6015
},
{
"epoch": 9.925803792250619,
"grad_norm": 0.408203125,
"learning_rate": 2.6542403813545334e-08,
"loss": 0.6938,
"step": 6020
},
{
"epoch": 9.934047815333884,
"grad_norm": 0.3828125,
"learning_rate": 2.0321738627981923e-08,
"loss": 0.686,
"step": 6025
},
{
"epoch": 9.942291838417148,
"grad_norm": 0.40234375,
"learning_rate": 1.4930391117451426e-08,
"loss": 0.6874,
"step": 6030
},
{
"epoch": 9.950535861500413,
"grad_norm": 0.404296875,
"learning_rate": 1.0368406002436715e-08,
"loss": 0.6934,
"step": 6035
},
{
"epoch": 9.958779884583677,
"grad_norm": 0.400390625,
"learning_rate": 6.635821124001406e-09,
"loss": 0.6913,
"step": 6040
},
{
"epoch": 9.967023907666942,
"grad_norm": 0.388671875,
"learning_rate": 3.732667443390181e-09,
"loss": 0.6895,
"step": 6045
},
{
"epoch": 9.975267930750206,
"grad_norm": 0.3984375,
"learning_rate": 1.6589690418955528e-09,
"loss": 0.6968,
"step": 6050
},
{
"epoch": 9.983511953833471,
"grad_norm": 0.3984375,
"learning_rate": 4.147431205359098e-10,
"loss": 0.6946,
"step": 6055
},
{
"epoch": 9.991755976916735,
"grad_norm": 0.376953125,
"learning_rate": 0.0,
"loss": 0.6936,
"step": 6060
},
{
"epoch": 9.991755976916735,
"eval_loss": 2.4860482215881348,
"eval_runtime": 0.2343,
"eval_samples_per_second": 42.675,
"eval_steps_per_second": 4.267,
"step": 6060
},
{
"epoch": 9.991755976916735,
"step": 6060,
"total_flos": 1.8500974249565487e+19,
"train_loss": 1.1020318522705104,
"train_runtime": 14653.0399,
"train_samples_per_second": 26.478,
"train_steps_per_second": 0.414
}
],
"logging_steps": 5,
"max_steps": 6060,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 1.8500974249565487e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}