{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.991755976916735, "eval_steps": 500, "global_step": 6060, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016488046166529267, "grad_norm": 380.0, "learning_rate": 3.300330033003301e-07, "loss": 35.4867, "step": 1 }, { "epoch": 0.008244023083264633, "grad_norm": 308.0, "learning_rate": 1.65016501650165e-06, "loss": 34.8873, "step": 5 }, { "epoch": 0.016488046166529265, "grad_norm": 306.0, "learning_rate": 3.3003300330033e-06, "loss": 34.9252, "step": 10 }, { "epoch": 0.0247320692497939, "grad_norm": 163.0, "learning_rate": 4.950495049504951e-06, "loss": 31.7188, "step": 15 }, { "epoch": 0.03297609233305853, "grad_norm": 110.5, "learning_rate": 6.6006600660066e-06, "loss": 28.5443, "step": 20 }, { "epoch": 0.041220115416323165, "grad_norm": 72.5, "learning_rate": 8.250825082508252e-06, "loss": 24.1835, "step": 25 }, { "epoch": 0.0494641384995878, "grad_norm": 39.5, "learning_rate": 9.900990099009901e-06, "loss": 21.6514, "step": 30 }, { "epoch": 0.057708161582852434, "grad_norm": 19.0, "learning_rate": 1.155115511551155e-05, "loss": 19.5766, "step": 35 }, { "epoch": 0.06595218466611706, "grad_norm": 16.25, "learning_rate": 1.32013201320132e-05, "loss": 18.5587, "step": 40 }, { "epoch": 0.0741962077493817, "grad_norm": 13.125, "learning_rate": 1.4851485148514851e-05, "loss": 17.2984, "step": 45 }, { "epoch": 0.08244023083264633, "grad_norm": 9.375, "learning_rate": 1.6501650165016504e-05, "loss": 16.2291, "step": 50 }, { "epoch": 0.09068425391591096, "grad_norm": 7.5, "learning_rate": 1.8151815181518153e-05, "loss": 15.5459, "step": 55 }, { "epoch": 0.0989282769991756, "grad_norm": 5.1875, "learning_rate": 1.9801980198019803e-05, "loss": 15.0494, "step": 60 }, { "epoch": 0.10717230008244023, "grad_norm": 3.546875, "learning_rate": 2.1452145214521452e-05, "loss": 14.5263, "step": 65 }, { "epoch": 0.11541632316570487, "grad_norm": 3.21875, "learning_rate": 2.31023102310231e-05, "loss": 14.0492, "step": 70 }, { "epoch": 0.1236603462489695, "grad_norm": 3.34375, "learning_rate": 2.4752475247524754e-05, "loss": 14.0437, "step": 75 }, { "epoch": 0.13190436933223412, "grad_norm": 4.03125, "learning_rate": 2.64026402640264e-05, "loss": 13.4269, "step": 80 }, { "epoch": 0.14014839241549876, "grad_norm": 4.46875, "learning_rate": 2.8052805280528056e-05, "loss": 13.1438, "step": 85 }, { "epoch": 0.1483924154987634, "grad_norm": 5.4375, "learning_rate": 2.9702970297029702e-05, "loss": 12.8682, "step": 90 }, { "epoch": 0.15663643858202803, "grad_norm": 7.5625, "learning_rate": 3.135313531353136e-05, "loss": 12.1682, "step": 95 }, { "epoch": 0.16488046166529266, "grad_norm": 9.6875, "learning_rate": 3.300330033003301e-05, "loss": 11.6708, "step": 100 }, { "epoch": 0.1731244847485573, "grad_norm": 13.875, "learning_rate": 3.465346534653465e-05, "loss": 10.6173, "step": 105 }, { "epoch": 0.18136850783182193, "grad_norm": 20.375, "learning_rate": 3.6303630363036307e-05, "loss": 9.3535, "step": 110 }, { "epoch": 0.18961253091508656, "grad_norm": 23.0, "learning_rate": 3.7953795379537956e-05, "loss": 7.0394, "step": 115 }, { "epoch": 0.1978565539983512, "grad_norm": 19.0, "learning_rate": 3.9603960396039605e-05, "loss": 4.5084, "step": 120 }, { "epoch": 0.20610057708161583, "grad_norm": 5.34375, "learning_rate": 4.1254125412541255e-05, "loss": 2.7198, "step": 125 }, { "epoch": 0.21434460016488047, "grad_norm": 1.8671875, "learning_rate": 4.2904290429042904e-05, "loss": 1.9952, "step": 130 }, { "epoch": 0.2225886232481451, "grad_norm": 1.0078125, "learning_rate": 4.455445544554456e-05, "loss": 1.7411, "step": 135 }, { "epoch": 0.23083264633140974, "grad_norm": 0.95703125, "learning_rate": 4.62046204620462e-05, "loss": 1.5998, "step": 140 }, { "epoch": 0.23907666941467437, "grad_norm": 1.0859375, "learning_rate": 4.785478547854786e-05, "loss": 1.5183, "step": 145 }, { "epoch": 0.247320692497939, "grad_norm": 1.28125, "learning_rate": 4.950495049504951e-05, "loss": 1.4489, "step": 150 }, { "epoch": 0.25556471558120364, "grad_norm": 1.09375, "learning_rate": 5.115511551155116e-05, "loss": 1.3924, "step": 155 }, { "epoch": 0.26380873866446825, "grad_norm": 0.703125, "learning_rate": 5.28052805280528e-05, "loss": 1.3648, "step": 160 }, { "epoch": 0.2720527617477329, "grad_norm": 0.71484375, "learning_rate": 5.445544554455446e-05, "loss": 1.3461, "step": 165 }, { "epoch": 0.2802967848309975, "grad_norm": 0.671875, "learning_rate": 5.610561056105611e-05, "loss": 1.3065, "step": 170 }, { "epoch": 0.2885408079142622, "grad_norm": 0.98828125, "learning_rate": 5.7755775577557755e-05, "loss": 1.2809, "step": 175 }, { "epoch": 0.2967848309975268, "grad_norm": 0.640625, "learning_rate": 5.9405940594059404e-05, "loss": 1.2647, "step": 180 }, { "epoch": 0.30502885408079145, "grad_norm": 1.296875, "learning_rate": 6.105610561056106e-05, "loss": 1.2387, "step": 185 }, { "epoch": 0.31327287716405605, "grad_norm": 1.1171875, "learning_rate": 6.270627062706272e-05, "loss": 1.24, "step": 190 }, { "epoch": 0.3215169002473207, "grad_norm": 1.4765625, "learning_rate": 6.435643564356436e-05, "loss": 1.2108, "step": 195 }, { "epoch": 0.3297609233305853, "grad_norm": 1.7578125, "learning_rate": 6.600660066006602e-05, "loss": 1.2026, "step": 200 }, { "epoch": 0.33800494641385, "grad_norm": 1.78125, "learning_rate": 6.765676567656766e-05, "loss": 1.1894, "step": 205 }, { "epoch": 0.3462489694971146, "grad_norm": 1.5234375, "learning_rate": 6.93069306930693e-05, "loss": 1.2093, "step": 210 }, { "epoch": 0.35449299258037925, "grad_norm": 1.0703125, "learning_rate": 7.095709570957097e-05, "loss": 1.1768, "step": 215 }, { "epoch": 0.36273701566364386, "grad_norm": 1.6171875, "learning_rate": 7.260726072607261e-05, "loss": 1.1946, "step": 220 }, { "epoch": 0.37098103874690846, "grad_norm": 2.828125, "learning_rate": 7.425742574257426e-05, "loss": 1.1678, "step": 225 }, { "epoch": 0.3792250618301731, "grad_norm": 1.5, "learning_rate": 7.590759075907591e-05, "loss": 1.1618, "step": 230 }, { "epoch": 0.38746908491343773, "grad_norm": 0.84375, "learning_rate": 7.755775577557755e-05, "loss": 1.1585, "step": 235 }, { "epoch": 0.3957131079967024, "grad_norm": 1.5546875, "learning_rate": 7.920792079207921e-05, "loss": 1.1519, "step": 240 }, { "epoch": 0.403957131079967, "grad_norm": 1.84375, "learning_rate": 8.085808580858087e-05, "loss": 1.1408, "step": 245 }, { "epoch": 0.41220115416323166, "grad_norm": 1.40625, "learning_rate": 8.250825082508251e-05, "loss": 1.138, "step": 250 }, { "epoch": 0.42044517724649627, "grad_norm": 0.80859375, "learning_rate": 8.415841584158417e-05, "loss": 1.1375, "step": 255 }, { "epoch": 0.42868920032976093, "grad_norm": 1.4296875, "learning_rate": 8.580858085808581e-05, "loss": 1.1193, "step": 260 }, { "epoch": 0.43693322341302554, "grad_norm": 2.15625, "learning_rate": 8.745874587458746e-05, "loss": 1.1178, "step": 265 }, { "epoch": 0.4451772464962902, "grad_norm": 0.984375, "learning_rate": 8.910891089108912e-05, "loss": 1.1038, "step": 270 }, { "epoch": 0.4534212695795548, "grad_norm": 1.5546875, "learning_rate": 9.075907590759076e-05, "loss": 1.1148, "step": 275 }, { "epoch": 0.46166529266281947, "grad_norm": 0.84765625, "learning_rate": 9.24092409240924e-05, "loss": 1.112, "step": 280 }, { "epoch": 0.4699093157460841, "grad_norm": 1.1640625, "learning_rate": 9.405940594059406e-05, "loss": 1.0882, "step": 285 }, { "epoch": 0.47815333882934874, "grad_norm": 3.5625, "learning_rate": 9.570957095709572e-05, "loss": 1.0873, "step": 290 }, { "epoch": 0.48639736191261335, "grad_norm": 0.80078125, "learning_rate": 9.735973597359736e-05, "loss": 1.0982, "step": 295 }, { "epoch": 0.494641384995878, "grad_norm": 1.171875, "learning_rate": 9.900990099009902e-05, "loss": 1.074, "step": 300 }, { "epoch": 0.5028854080791426, "grad_norm": 1.0078125, "learning_rate": 0.00010066006600660067, "loss": 1.0719, "step": 305 }, { "epoch": 0.5111294311624073, "grad_norm": 4.96875, "learning_rate": 0.00010231023102310232, "loss": 1.0816, "step": 310 }, { "epoch": 0.5193734542456719, "grad_norm": 0.95703125, "learning_rate": 0.00010396039603960397, "loss": 1.0681, "step": 315 }, { "epoch": 0.5276174773289365, "grad_norm": 5.6875, "learning_rate": 0.0001056105610561056, "loss": 1.0689, "step": 320 }, { "epoch": 0.5358615004122012, "grad_norm": 1.1328125, "learning_rate": 0.00010726072607260727, "loss": 1.0712, "step": 325 }, { "epoch": 0.5441055234954658, "grad_norm": 0.9375, "learning_rate": 0.00010891089108910893, "loss": 1.063, "step": 330 }, { "epoch": 0.5523495465787304, "grad_norm": 0.8125, "learning_rate": 0.00011056105610561056, "loss": 1.0622, "step": 335 }, { "epoch": 0.560593569661995, "grad_norm": 5.0, "learning_rate": 0.00011221122112211223, "loss": 1.0614, "step": 340 }, { "epoch": 0.5688375927452597, "grad_norm": 2.0, "learning_rate": 0.00011386138613861385, "loss": 1.0611, "step": 345 }, { "epoch": 0.5770816158285244, "grad_norm": 1.75, "learning_rate": 0.00011551155115511551, "loss": 1.0451, "step": 350 }, { "epoch": 0.5853256389117889, "grad_norm": 2.359375, "learning_rate": 0.00011716171617161718, "loss": 1.0506, "step": 355 }, { "epoch": 0.5935696619950536, "grad_norm": 1.1796875, "learning_rate": 0.00011881188118811881, "loss": 1.0414, "step": 360 }, { "epoch": 0.6018136850783182, "grad_norm": 2.703125, "learning_rate": 0.00012046204620462047, "loss": 1.0334, "step": 365 }, { "epoch": 0.6100577081615829, "grad_norm": 1.3828125, "learning_rate": 0.00012211221122112212, "loss": 1.0388, "step": 370 }, { "epoch": 0.6183017312448474, "grad_norm": 0.86328125, "learning_rate": 0.00012376237623762376, "loss": 1.0251, "step": 375 }, { "epoch": 0.6265457543281121, "grad_norm": 2.234375, "learning_rate": 0.00012541254125412543, "loss": 1.0315, "step": 380 }, { "epoch": 0.6347897774113768, "grad_norm": 1.5078125, "learning_rate": 0.00012706270627062708, "loss": 1.0342, "step": 385 }, { "epoch": 0.6430338004946414, "grad_norm": 1.609375, "learning_rate": 0.00012871287128712872, "loss": 1.0258, "step": 390 }, { "epoch": 0.651277823577906, "grad_norm": 1.53125, "learning_rate": 0.00013036303630363036, "loss": 1.02, "step": 395 }, { "epoch": 0.6595218466611706, "grad_norm": 1.3515625, "learning_rate": 0.00013201320132013203, "loss": 1.0053, "step": 400 }, { "epoch": 0.6677658697444353, "grad_norm": 2.09375, "learning_rate": 0.00013366336633663367, "loss": 1.0217, "step": 405 }, { "epoch": 0.6760098928277, "grad_norm": 1.796875, "learning_rate": 0.00013531353135313532, "loss": 1.0066, "step": 410 }, { "epoch": 0.6842539159109645, "grad_norm": 1.7421875, "learning_rate": 0.00013696369636963699, "loss": 1.0141, "step": 415 }, { "epoch": 0.6924979389942292, "grad_norm": 1.46875, "learning_rate": 0.0001386138613861386, "loss": 1.0028, "step": 420 }, { "epoch": 0.7007419620774938, "grad_norm": 3.875, "learning_rate": 0.00014026402640264027, "loss": 1.0207, "step": 425 }, { "epoch": 0.7089859851607585, "grad_norm": 1.0, "learning_rate": 0.00014191419141914194, "loss": 1.0122, "step": 430 }, { "epoch": 0.717230008244023, "grad_norm": 2.359375, "learning_rate": 0.00014356435643564356, "loss": 1.0145, "step": 435 }, { "epoch": 0.7254740313272877, "grad_norm": 1.9609375, "learning_rate": 0.00014521452145214523, "loss": 1.0031, "step": 440 }, { "epoch": 0.7337180544105524, "grad_norm": 1.2578125, "learning_rate": 0.00014686468646864687, "loss": 0.9987, "step": 445 }, { "epoch": 0.7419620774938169, "grad_norm": 4.59375, "learning_rate": 0.0001485148514851485, "loss": 1.0024, "step": 450 }, { "epoch": 0.7502061005770816, "grad_norm": 1.1875, "learning_rate": 0.00015016501650165018, "loss": 1.0048, "step": 455 }, { "epoch": 0.7584501236603463, "grad_norm": 3.5, "learning_rate": 0.00015181518151815182, "loss": 1.0039, "step": 460 }, { "epoch": 0.7666941467436109, "grad_norm": 5.59375, "learning_rate": 0.00015346534653465347, "loss": 1.0092, "step": 465 }, { "epoch": 0.7749381698268755, "grad_norm": 3.578125, "learning_rate": 0.0001551155115511551, "loss": 1.0078, "step": 470 }, { "epoch": 0.7831821929101401, "grad_norm": 2.1875, "learning_rate": 0.00015676567656765678, "loss": 1.0005, "step": 475 }, { "epoch": 0.7914262159934048, "grad_norm": 3.765625, "learning_rate": 0.00015841584158415842, "loss": 0.9895, "step": 480 }, { "epoch": 0.7996702390766695, "grad_norm": 1.6484375, "learning_rate": 0.00016006600660066006, "loss": 0.9923, "step": 485 }, { "epoch": 0.807914262159934, "grad_norm": 2.703125, "learning_rate": 0.00016171617161716173, "loss": 0.9996, "step": 490 }, { "epoch": 0.8161582852431987, "grad_norm": 1.4765625, "learning_rate": 0.00016336633663366338, "loss": 0.9955, "step": 495 }, { "epoch": 0.8244023083264633, "grad_norm": 4.03125, "learning_rate": 0.00016501650165016502, "loss": 0.9931, "step": 500 }, { "epoch": 0.832646331409728, "grad_norm": 4.15625, "learning_rate": 0.0001666666666666667, "loss": 1.0061, "step": 505 }, { "epoch": 0.8408903544929925, "grad_norm": 2.640625, "learning_rate": 0.00016831683168316833, "loss": 1.0086, "step": 510 }, { "epoch": 0.8491343775762572, "grad_norm": 1.296875, "learning_rate": 0.00016996699669966997, "loss": 0.9886, "step": 515 }, { "epoch": 0.8573784006595219, "grad_norm": 14.4375, "learning_rate": 0.00017161716171617162, "loss": 0.9933, "step": 520 }, { "epoch": 0.8656224237427865, "grad_norm": 1.96875, "learning_rate": 0.00017326732673267329, "loss": 1.0033, "step": 525 }, { "epoch": 0.8738664468260511, "grad_norm": 0.68359375, "learning_rate": 0.00017491749174917493, "loss": 0.9905, "step": 530 }, { "epoch": 0.8821104699093157, "grad_norm": 1.84375, "learning_rate": 0.00017656765676567657, "loss": 0.9717, "step": 535 }, { "epoch": 0.8903544929925804, "grad_norm": 1.390625, "learning_rate": 0.00017821782178217824, "loss": 0.9656, "step": 540 }, { "epoch": 0.8985985160758451, "grad_norm": 3.625, "learning_rate": 0.00017986798679867986, "loss": 0.9827, "step": 545 }, { "epoch": 0.9068425391591096, "grad_norm": 3.453125, "learning_rate": 0.00018151815181518153, "loss": 0.9865, "step": 550 }, { "epoch": 0.9150865622423743, "grad_norm": 1.0078125, "learning_rate": 0.0001831683168316832, "loss": 0.9815, "step": 555 }, { "epoch": 0.9233305853256389, "grad_norm": 1.578125, "learning_rate": 0.0001848184818481848, "loss": 0.9799, "step": 560 }, { "epoch": 0.9315746084089035, "grad_norm": 3.4375, "learning_rate": 0.00018646864686468648, "loss": 0.9611, "step": 565 }, { "epoch": 0.9398186314921682, "grad_norm": 1.046875, "learning_rate": 0.00018811881188118812, "loss": 0.9652, "step": 570 }, { "epoch": 0.9480626545754328, "grad_norm": 9.3125, "learning_rate": 0.00018976897689768977, "loss": 0.9676, "step": 575 }, { "epoch": 0.9563066776586975, "grad_norm": 1.3125, "learning_rate": 0.00019141914191419144, "loss": 0.9692, "step": 580 }, { "epoch": 0.964550700741962, "grad_norm": 6.4375, "learning_rate": 0.00019306930693069308, "loss": 0.9694, "step": 585 }, { "epoch": 0.9727947238252267, "grad_norm": 1.25, "learning_rate": 0.00019471947194719472, "loss": 0.9823, "step": 590 }, { "epoch": 0.9810387469084914, "grad_norm": 2.28125, "learning_rate": 0.00019636963696369636, "loss": 0.97, "step": 595 }, { "epoch": 0.989282769991756, "grad_norm": 1.34375, "learning_rate": 0.00019801980198019803, "loss": 0.9746, "step": 600 }, { "epoch": 0.9975267930750206, "grad_norm": 1.96875, "learning_rate": 0.00019966996699669968, "loss": 0.964, "step": 605 }, { "epoch": 0.9991755976916735, "eval_loss": 2.485042095184326, "eval_runtime": 0.2808, "eval_samples_per_second": 35.608, "eval_steps_per_second": 3.561, "step": 606 }, { "epoch": 1.0057708161582852, "grad_norm": 1.3984375, "learning_rate": 0.00019999973456433681, "loss": 0.9535, "step": 610 }, { "epoch": 1.0140148392415498, "grad_norm": 2.6875, "learning_rate": 0.00019999865623437013, "loss": 0.9553, "step": 615 }, { "epoch": 1.0222588623248146, "grad_norm": 4.96875, "learning_rate": 0.00019999674842930876, "loss": 0.9556, "step": 620 }, { "epoch": 1.030502885408079, "grad_norm": 1.9453125, "learning_rate": 0.00019999401116497763, "loss": 0.9746, "step": 625 }, { "epoch": 1.0387469084913439, "grad_norm": 1.1953125, "learning_rate": 0.000199990444464082, "loss": 0.9639, "step": 630 }, { "epoch": 1.0469909315746084, "grad_norm": 1.65625, "learning_rate": 0.00019998604835620717, "loss": 0.9585, "step": 635 }, { "epoch": 1.055234954657873, "grad_norm": 1.6953125, "learning_rate": 0.00019998082287781826, "loss": 0.9563, "step": 640 }, { "epoch": 1.0634789777411378, "grad_norm": 1.6171875, "learning_rate": 0.00019997476807225985, "loss": 0.9489, "step": 645 }, { "epoch": 1.0717230008244023, "grad_norm": 4.15625, "learning_rate": 0.00019996788398975578, "loss": 0.9474, "step": 650 }, { "epoch": 1.0799670239076669, "grad_norm": 5.53125, "learning_rate": 0.0001999601706874085, "loss": 0.9407, "step": 655 }, { "epoch": 1.0882110469909316, "grad_norm": 2.875, "learning_rate": 0.00019995162822919883, "loss": 0.9514, "step": 660 }, { "epoch": 1.0964550700741962, "grad_norm": 1.4140625, "learning_rate": 0.00019994225668598526, "loss": 0.9502, "step": 665 }, { "epoch": 1.104699093157461, "grad_norm": 2.796875, "learning_rate": 0.0001999320561355035, "loss": 0.9502, "step": 670 }, { "epoch": 1.1129431162407255, "grad_norm": 2.0, "learning_rate": 0.00019992102666236566, "loss": 0.9455, "step": 675 }, { "epoch": 1.12118713932399, "grad_norm": 0.90625, "learning_rate": 0.00019990916835805974, "loss": 0.9429, "step": 680 }, { "epoch": 1.1294311624072548, "grad_norm": 0.74609375, "learning_rate": 0.00019989648132094873, "loss": 0.9348, "step": 685 }, { "epoch": 1.1376751854905194, "grad_norm": 0.76171875, "learning_rate": 0.00019988296565626987, "loss": 0.939, "step": 690 }, { "epoch": 1.145919208573784, "grad_norm": 0.93359375, "learning_rate": 0.0001998686214761337, "loss": 0.9374, "step": 695 }, { "epoch": 1.1541632316570487, "grad_norm": 1.375, "learning_rate": 0.00019985344889952327, "loss": 0.9326, "step": 700 }, { "epoch": 1.1624072547403133, "grad_norm": 0.91015625, "learning_rate": 0.00019983744805229296, "loss": 0.9308, "step": 705 }, { "epoch": 1.1706512778235778, "grad_norm": 1.75, "learning_rate": 0.00019982061906716764, "loss": 0.9436, "step": 710 }, { "epoch": 1.1788953009068426, "grad_norm": 1.2734375, "learning_rate": 0.00019980296208374143, "loss": 0.9369, "step": 715 }, { "epoch": 1.1871393239901071, "grad_norm": 20.0, "learning_rate": 0.00019978447724847652, "loss": 0.9334, "step": 720 }, { "epoch": 1.195383347073372, "grad_norm": 1.46875, "learning_rate": 0.00019976516471470216, "loss": 0.9416, "step": 725 }, { "epoch": 1.2036273701566365, "grad_norm": 8.75, "learning_rate": 0.0001997450246426131, "loss": 0.9382, "step": 730 }, { "epoch": 1.211871393239901, "grad_norm": 0.86328125, "learning_rate": 0.0001997240571992685, "loss": 0.9315, "step": 735 }, { "epoch": 1.2201154163231658, "grad_norm": 0.98046875, "learning_rate": 0.00019970226255859038, "loss": 0.9266, "step": 740 }, { "epoch": 1.2283594394064303, "grad_norm": 1.5234375, "learning_rate": 0.0001996796409013623, "loss": 0.9299, "step": 745 }, { "epoch": 1.2366034624896949, "grad_norm": 1.0, "learning_rate": 0.0001996561924152278, "loss": 0.9202, "step": 750 }, { "epoch": 1.2448474855729597, "grad_norm": 1.078125, "learning_rate": 0.00019963191729468888, "loss": 0.9149, "step": 755 }, { "epoch": 1.2530915086562242, "grad_norm": 1.0703125, "learning_rate": 0.00019960681574110426, "loss": 0.9165, "step": 760 }, { "epoch": 1.2613355317394888, "grad_norm": 2.484375, "learning_rate": 0.00019958088796268793, "loss": 0.9188, "step": 765 }, { "epoch": 1.2695795548227535, "grad_norm": 1.5546875, "learning_rate": 0.0001995541341745072, "loss": 0.9274, "step": 770 }, { "epoch": 1.277823577906018, "grad_norm": 3.21875, "learning_rate": 0.0001995265545984811, "loss": 0.9136, "step": 775 }, { "epoch": 1.2860676009892829, "grad_norm": 1.8203125, "learning_rate": 0.00019949814946337838, "loss": 0.9251, "step": 780 }, { "epoch": 1.2943116240725474, "grad_norm": 3.078125, "learning_rate": 0.00019946891900481578, "loss": 0.9176, "step": 785 }, { "epoch": 1.302555647155812, "grad_norm": 0.66796875, "learning_rate": 0.0001994388634652559, "loss": 0.9283, "step": 790 }, { "epoch": 1.3107996702390767, "grad_norm": 1.953125, "learning_rate": 0.00019940798309400526, "loss": 0.9221, "step": 795 }, { "epoch": 1.3190436933223413, "grad_norm": 1.421875, "learning_rate": 0.00019937627814721237, "loss": 0.9199, "step": 800 }, { "epoch": 1.327287716405606, "grad_norm": 1.2890625, "learning_rate": 0.00019934374888786537, "loss": 0.9163, "step": 805 }, { "epoch": 1.3355317394888706, "grad_norm": 1.5546875, "learning_rate": 0.00019931039558578997, "loss": 0.9181, "step": 810 }, { "epoch": 1.3437757625721352, "grad_norm": 1.9765625, "learning_rate": 0.00019927621851764725, "loss": 0.9276, "step": 815 }, { "epoch": 1.3520197856553997, "grad_norm": 1.4921875, "learning_rate": 0.00019924121796693127, "loss": 0.9199, "step": 820 }, { "epoch": 1.3602638087386645, "grad_norm": 1.078125, "learning_rate": 0.0001992053942239668, "loss": 0.9213, "step": 825 }, { "epoch": 1.368507831821929, "grad_norm": 1.296875, "learning_rate": 0.00019916874758590684, "loss": 0.9228, "step": 830 }, { "epoch": 1.3767518549051938, "grad_norm": 1.3125, "learning_rate": 0.00019913127835673023, "loss": 0.9149, "step": 835 }, { "epoch": 1.3849958779884584, "grad_norm": 0.73828125, "learning_rate": 0.00019909298684723904, "loss": 0.9086, "step": 840 }, { "epoch": 1.393239901071723, "grad_norm": 1.125, "learning_rate": 0.00019905387337505612, "loss": 0.9092, "step": 845 }, { "epoch": 1.4014839241549877, "grad_norm": 2.828125, "learning_rate": 0.0001990139382646223, "loss": 0.9041, "step": 850 }, { "epoch": 1.4097279472382522, "grad_norm": 1.3203125, "learning_rate": 0.00019897318184719385, "loss": 0.9093, "step": 855 }, { "epoch": 1.417971970321517, "grad_norm": 1.109375, "learning_rate": 0.00019893160446083963, "loss": 0.909, "step": 860 }, { "epoch": 1.4262159934047816, "grad_norm": 1.0390625, "learning_rate": 0.00019888920645043831, "loss": 0.9014, "step": 865 }, { "epoch": 1.434460016488046, "grad_norm": 1.8203125, "learning_rate": 0.00019884598816767563, "loss": 0.9036, "step": 870 }, { "epoch": 1.4427040395713109, "grad_norm": 2.234375, "learning_rate": 0.00019880194997104123, "loss": 0.8999, "step": 875 }, { "epoch": 1.4509480626545754, "grad_norm": 2.3125, "learning_rate": 0.00019875709222582594, "loss": 0.9, "step": 880 }, { "epoch": 1.45919208573784, "grad_norm": 1.5390625, "learning_rate": 0.00019871141530411853, "loss": 0.8955, "step": 885 }, { "epoch": 1.4674361088211048, "grad_norm": 1.65625, "learning_rate": 0.00019866491958480284, "loss": 0.9042, "step": 890 }, { "epoch": 1.4756801319043693, "grad_norm": 0.96875, "learning_rate": 0.00019861760545355442, "loss": 0.9177, "step": 895 }, { "epoch": 1.4839241549876339, "grad_norm": 4.5625, "learning_rate": 0.00019856947330283752, "loss": 0.8974, "step": 900 }, { "epoch": 1.4921681780708986, "grad_norm": 1.3671875, "learning_rate": 0.00019852052353190166, "loss": 0.9064, "step": 905 }, { "epoch": 1.5004122011541632, "grad_norm": 3.796875, "learning_rate": 0.0001984707565467785, "loss": 0.9086, "step": 910 }, { "epoch": 1.508656224237428, "grad_norm": 6.6875, "learning_rate": 0.00019842017276027832, "loss": 0.9069, "step": 915 }, { "epoch": 1.5169002473206925, "grad_norm": 1.3203125, "learning_rate": 0.00019836877259198662, "loss": 0.898, "step": 920 }, { "epoch": 1.525144270403957, "grad_norm": 2.484375, "learning_rate": 0.0001983165564682608, "loss": 0.8999, "step": 925 }, { "epoch": 1.5333882934872216, "grad_norm": 1.34375, "learning_rate": 0.00019826352482222638, "loss": 0.8987, "step": 930 }, { "epoch": 1.5416323165704864, "grad_norm": 1.421875, "learning_rate": 0.00019820967809377357, "loss": 0.8791, "step": 935 }, { "epoch": 1.5498763396537512, "grad_norm": 0.80859375, "learning_rate": 0.00019815501672955358, "loss": 0.8887, "step": 940 }, { "epoch": 1.5581203627370157, "grad_norm": 6.0, "learning_rate": 0.0001980995411829749, "loss": 0.8955, "step": 945 }, { "epoch": 1.5663643858202803, "grad_norm": 0.8984375, "learning_rate": 0.00019804325191419956, "loss": 0.8991, "step": 950 }, { "epoch": 1.5746084089035448, "grad_norm": 1.4921875, "learning_rate": 0.00019798614939013932, "loss": 0.8916, "step": 955 }, { "epoch": 1.5828524319868096, "grad_norm": 1.3984375, "learning_rate": 0.00019792823408445174, "loss": 0.9048, "step": 960 }, { "epoch": 1.5910964550700744, "grad_norm": 1.1015625, "learning_rate": 0.0001978695064775363, "loss": 0.8828, "step": 965 }, { "epoch": 1.599340478153339, "grad_norm": 0.96875, "learning_rate": 0.00019780996705653044, "loss": 0.8864, "step": 970 }, { "epoch": 1.6075845012366035, "grad_norm": 0.99609375, "learning_rate": 0.00019774961631530545, "loss": 0.8908, "step": 975 }, { "epoch": 1.615828524319868, "grad_norm": 1.0390625, "learning_rate": 0.0001976884547544624, "loss": 0.8853, "step": 980 }, { "epoch": 1.6240725474031328, "grad_norm": 2.84375, "learning_rate": 0.0001976264828813281, "loss": 0.8835, "step": 985 }, { "epoch": 1.6323165704863973, "grad_norm": 2.296875, "learning_rate": 0.00019756370120995066, "loss": 0.8817, "step": 990 }, { "epoch": 1.640560593569662, "grad_norm": 27.25, "learning_rate": 0.0001975001102610954, "loss": 0.8972, "step": 995 }, { "epoch": 1.6488046166529267, "grad_norm": 9.75, "learning_rate": 0.0001974357105622405, "loss": 0.9076, "step": 1000 }, { "epoch": 1.6570486397361912, "grad_norm": 0.71484375, "learning_rate": 0.0001973705026475726, "loss": 0.9001, "step": 1005 }, { "epoch": 1.6652926628194558, "grad_norm": 1.984375, "learning_rate": 0.00019730448705798239, "loss": 0.9172, "step": 1010 }, { "epoch": 1.6735366859027205, "grad_norm": 1.375, "learning_rate": 0.0001972376643410601, "loss": 0.8945, "step": 1015 }, { "epoch": 1.6817807089859853, "grad_norm": 2.71875, "learning_rate": 0.00019717003505109095, "loss": 0.8857, "step": 1020 }, { "epoch": 1.6900247320692499, "grad_norm": 1.4375, "learning_rate": 0.00019710159974905064, "loss": 0.8852, "step": 1025 }, { "epoch": 1.6982687551525144, "grad_norm": 2.984375, "learning_rate": 0.00019703235900260055, "loss": 0.8795, "step": 1030 }, { "epoch": 1.706512778235779, "grad_norm": 1.2578125, "learning_rate": 0.00019696231338608316, "loss": 0.8926, "step": 1035 }, { "epoch": 1.7147568013190437, "grad_norm": 4.90625, "learning_rate": 0.00019689146348051719, "loss": 0.8927, "step": 1040 }, { "epoch": 1.7230008244023083, "grad_norm": 1.765625, "learning_rate": 0.0001968198098735929, "loss": 0.8762, "step": 1045 }, { "epoch": 1.731244847485573, "grad_norm": 6.75, "learning_rate": 0.0001967473531596671, "loss": 0.8886, "step": 1050 }, { "epoch": 1.7394888705688376, "grad_norm": 12.125, "learning_rate": 0.00019667409393975822, "loss": 0.8865, "step": 1055 }, { "epoch": 1.7477328936521022, "grad_norm": 1.171875, "learning_rate": 0.00019660003282154147, "loss": 0.887, "step": 1060 }, { "epoch": 1.7559769167353667, "grad_norm": 0.84765625, "learning_rate": 0.00019652517041934356, "loss": 0.8669, "step": 1065 }, { "epoch": 1.7642209398186315, "grad_norm": 0.7890625, "learning_rate": 0.00019644950735413788, "loss": 0.8774, "step": 1070 }, { "epoch": 1.7724649629018963, "grad_norm": 0.98828125, "learning_rate": 0.00019637304425353916, "loss": 0.8717, "step": 1075 }, { "epoch": 1.7807089859851608, "grad_norm": 0.7578125, "learning_rate": 0.0001962957817517982, "loss": 0.8769, "step": 1080 }, { "epoch": 1.7889530090684254, "grad_norm": 4.59375, "learning_rate": 0.0001962177204897969, "loss": 0.872, "step": 1085 }, { "epoch": 1.79719703215169, "grad_norm": 0.69140625, "learning_rate": 0.0001961388611150427, "loss": 0.8727, "step": 1090 }, { "epoch": 1.8054410552349547, "grad_norm": 2.171875, "learning_rate": 0.00019605920428166323, "loss": 0.8671, "step": 1095 }, { "epoch": 1.8136850783182195, "grad_norm": 7.78125, "learning_rate": 0.00019597875065040094, "loss": 0.8927, "step": 1100 }, { "epoch": 1.821929101401484, "grad_norm": 10.9375, "learning_rate": 0.00019589750088860766, "loss": 0.881, "step": 1105 }, { "epoch": 1.8301731244847486, "grad_norm": 1.1328125, "learning_rate": 0.000195815455670239, "loss": 0.8793, "step": 1110 }, { "epoch": 1.838417147568013, "grad_norm": 3.890625, "learning_rate": 0.00019573261567584874, "loss": 0.8795, "step": 1115 }, { "epoch": 1.8466611706512777, "grad_norm": 1.1171875, "learning_rate": 0.00019564898159258324, "loss": 0.8933, "step": 1120 }, { "epoch": 1.8549051937345424, "grad_norm": 0.921875, "learning_rate": 0.00019556455411417573, "loss": 0.8626, "step": 1125 }, { "epoch": 1.8631492168178072, "grad_norm": 1.5625, "learning_rate": 0.0001954793339409405, "loss": 0.8616, "step": 1130 }, { "epoch": 1.8713932399010718, "grad_norm": 2.625, "learning_rate": 0.00019539332177976714, "loss": 0.8693, "step": 1135 }, { "epoch": 1.8796372629843363, "grad_norm": 0.875, "learning_rate": 0.00019530651834411474, "loss": 0.8659, "step": 1140 }, { "epoch": 1.8878812860676009, "grad_norm": 6.0, "learning_rate": 0.00019521892435400587, "loss": 0.8666, "step": 1145 }, { "epoch": 1.8961253091508656, "grad_norm": 1.1484375, "learning_rate": 0.00019513054053602055, "loss": 0.8601, "step": 1150 }, { "epoch": 1.9043693322341304, "grad_norm": 2.125, "learning_rate": 0.00019504136762329047, "loss": 0.8631, "step": 1155 }, { "epoch": 1.912613355317395, "grad_norm": 3.296875, "learning_rate": 0.00019495140635549261, "loss": 0.8833, "step": 1160 }, { "epoch": 1.9208573784006595, "grad_norm": 2.4375, "learning_rate": 0.00019486065747884333, "loss": 0.8555, "step": 1165 }, { "epoch": 1.929101401483924, "grad_norm": 1.2734375, "learning_rate": 0.0001947691217460921, "loss": 0.8602, "step": 1170 }, { "epoch": 1.9373454245671888, "grad_norm": 1.546875, "learning_rate": 0.0001946767999165152, "loss": 0.8553, "step": 1175 }, { "epoch": 1.9455894476504534, "grad_norm": 0.94921875, "learning_rate": 0.00019458369275590954, "loss": 0.8588, "step": 1180 }, { "epoch": 1.9538334707337182, "grad_norm": 2.21875, "learning_rate": 0.00019448980103658613, "loss": 0.8529, "step": 1185 }, { "epoch": 1.9620774938169827, "grad_norm": 8.6875, "learning_rate": 0.00019439512553736394, "loss": 0.8441, "step": 1190 }, { "epoch": 1.9703215169002473, "grad_norm": 0.83984375, "learning_rate": 0.0001942996670435632, "loss": 0.8526, "step": 1195 }, { "epoch": 1.9785655399835118, "grad_norm": 6.0625, "learning_rate": 0.0001942034263469989, "loss": 0.8547, "step": 1200 }, { "epoch": 1.9868095630667766, "grad_norm": 13.0625, "learning_rate": 0.0001941064042459745, "loss": 0.8686, "step": 1205 }, { "epoch": 1.9950535861500414, "grad_norm": 0.7734375, "learning_rate": 0.00019400860154527493, "loss": 0.8499, "step": 1210 }, { "epoch": 2.0, "eval_loss": 2.4393434524536133, "eval_runtime": 0.2359, "eval_samples_per_second": 42.391, "eval_steps_per_second": 4.239, "step": 1213 }, { "epoch": 2.003297609233306, "grad_norm": 1.1328125, "learning_rate": 0.0001939100190561601, "loss": 0.8486, "step": 1215 }, { "epoch": 2.0115416323165705, "grad_norm": 2.515625, "learning_rate": 0.00019381065759635822, "loss": 0.8375, "step": 1220 }, { "epoch": 2.019785655399835, "grad_norm": 1.046875, "learning_rate": 0.0001937105179900589, "loss": 0.8531, "step": 1225 }, { "epoch": 2.0280296784830996, "grad_norm": 1.75, "learning_rate": 0.00019360960106790643, "loss": 0.8369, "step": 1230 }, { "epoch": 2.0362737015663646, "grad_norm": 0.58203125, "learning_rate": 0.00019350790766699282, "loss": 0.8276, "step": 1235 }, { "epoch": 2.044517724649629, "grad_norm": 1.0390625, "learning_rate": 0.0001934054386308508, "loss": 0.8289, "step": 1240 }, { "epoch": 2.0527617477328937, "grad_norm": 0.57421875, "learning_rate": 0.00019330219480944694, "loss": 0.8292, "step": 1245 }, { "epoch": 2.061005770816158, "grad_norm": 0.828125, "learning_rate": 0.0001931981770591745, "loss": 0.8305, "step": 1250 }, { "epoch": 2.0692497938994228, "grad_norm": 0.77734375, "learning_rate": 0.00019309338624284644, "loss": 0.8243, "step": 1255 }, { "epoch": 2.0774938169826878, "grad_norm": 1.265625, "learning_rate": 0.00019298782322968815, "loss": 0.8225, "step": 1260 }, { "epoch": 2.0857378400659523, "grad_norm": 4.03125, "learning_rate": 0.0001928814888953303, "loss": 0.8212, "step": 1265 }, { "epoch": 2.093981863149217, "grad_norm": 2.015625, "learning_rate": 0.0001927743841218016, "loss": 0.8188, "step": 1270 }, { "epoch": 2.1022258862324814, "grad_norm": 1.015625, "learning_rate": 0.00019266650979752136, "loss": 0.8209, "step": 1275 }, { "epoch": 2.110469909315746, "grad_norm": 8.25, "learning_rate": 0.00019255786681729225, "loss": 0.8242, "step": 1280 }, { "epoch": 2.1187139323990105, "grad_norm": 5.53125, "learning_rate": 0.00019244845608229293, "loss": 0.828, "step": 1285 }, { "epoch": 2.1269579554822755, "grad_norm": 0.6953125, "learning_rate": 0.00019233827850007027, "loss": 0.8159, "step": 1290 }, { "epoch": 2.13520197856554, "grad_norm": 7.03125, "learning_rate": 0.00019222733498453222, "loss": 0.8196, "step": 1295 }, { "epoch": 2.1434460016488046, "grad_norm": 0.84765625, "learning_rate": 0.00019211562645594002, "loss": 0.8231, "step": 1300 }, { "epoch": 2.151690024732069, "grad_norm": 0.474609375, "learning_rate": 0.00019200315384090044, "loss": 0.8073, "step": 1305 }, { "epoch": 2.1599340478153337, "grad_norm": 1.484375, "learning_rate": 0.00019188991807235844, "loss": 0.8255, "step": 1310 }, { "epoch": 2.1681780708985987, "grad_norm": 0.5859375, "learning_rate": 0.0001917759200895891, "loss": 0.8185, "step": 1315 }, { "epoch": 2.1764220939818633, "grad_norm": 8.0, "learning_rate": 0.00019166116083819002, "loss": 0.8174, "step": 1320 }, { "epoch": 2.184666117065128, "grad_norm": 0.96875, "learning_rate": 0.00019154564127007336, "loss": 0.8263, "step": 1325 }, { "epoch": 2.1929101401483924, "grad_norm": 1.171875, "learning_rate": 0.0001914293623434581, "loss": 0.8333, "step": 1330 }, { "epoch": 2.201154163231657, "grad_norm": 2.546875, "learning_rate": 0.00019131232502286188, "loss": 0.8227, "step": 1335 }, { "epoch": 2.209398186314922, "grad_norm": 1.1171875, "learning_rate": 0.00019119453027909323, "loss": 0.8123, "step": 1340 }, { "epoch": 2.2176422093981865, "grad_norm": 0.96484375, "learning_rate": 0.0001910759790892433, "loss": 0.8129, "step": 1345 }, { "epoch": 2.225886232481451, "grad_norm": 0.90625, "learning_rate": 0.0001909566724366779, "loss": 0.8101, "step": 1350 }, { "epoch": 2.2341302555647156, "grad_norm": 2.203125, "learning_rate": 0.00019083661131102933, "loss": 0.8205, "step": 1355 }, { "epoch": 2.24237427864798, "grad_norm": 0.9921875, "learning_rate": 0.00019071579670818808, "loss": 0.8228, "step": 1360 }, { "epoch": 2.2506183017312447, "grad_norm": 0.546875, "learning_rate": 0.00019059422963029464, "loss": 0.8123, "step": 1365 }, { "epoch": 2.2588623248145097, "grad_norm": 0.7421875, "learning_rate": 0.00019047191108573125, "loss": 0.8227, "step": 1370 }, { "epoch": 2.267106347897774, "grad_norm": 1.4609375, "learning_rate": 0.00019034884208911335, "loss": 0.814, "step": 1375 }, { "epoch": 2.2753503709810388, "grad_norm": 0.78515625, "learning_rate": 0.00019022502366128135, "loss": 0.819, "step": 1380 }, { "epoch": 2.2835943940643033, "grad_norm": 0.6484375, "learning_rate": 0.00019010045682929213, "loss": 0.8074, "step": 1385 }, { "epoch": 2.291838417147568, "grad_norm": 0.71484375, "learning_rate": 0.00018997514262641035, "loss": 0.8224, "step": 1390 }, { "epoch": 2.300082440230833, "grad_norm": 0.61328125, "learning_rate": 0.0001898490820921001, "loss": 0.8096, "step": 1395 }, { "epoch": 2.3083264633140974, "grad_norm": 0.51953125, "learning_rate": 0.00018972227627201617, "loss": 0.8102, "step": 1400 }, { "epoch": 2.316570486397362, "grad_norm": 0.482421875, "learning_rate": 0.0001895947262179954, "loss": 0.8113, "step": 1405 }, { "epoch": 2.3248145094806265, "grad_norm": 0.52734375, "learning_rate": 0.00018946643298804793, "loss": 0.8109, "step": 1410 }, { "epoch": 2.333058532563891, "grad_norm": 0.474609375, "learning_rate": 0.00018933739764634847, "loss": 0.809, "step": 1415 }, { "epoch": 2.3413025556471556, "grad_norm": 0.54296875, "learning_rate": 0.0001892076212632274, "loss": 0.8153, "step": 1420 }, { "epoch": 2.3495465787304206, "grad_norm": 0.578125, "learning_rate": 0.00018907710491516199, "loss": 0.8161, "step": 1425 }, { "epoch": 2.357790601813685, "grad_norm": 0.60546875, "learning_rate": 0.00018894584968476733, "loss": 0.8141, "step": 1430 }, { "epoch": 2.3660346248969497, "grad_norm": 0.6328125, "learning_rate": 0.00018881385666078755, "loss": 0.8102, "step": 1435 }, { "epoch": 2.3742786479802143, "grad_norm": 0.4921875, "learning_rate": 0.00018868112693808665, "loss": 0.8124, "step": 1440 }, { "epoch": 2.382522671063479, "grad_norm": 0.609375, "learning_rate": 0.00018854766161763932, "loss": 0.8033, "step": 1445 }, { "epoch": 2.390766694146744, "grad_norm": 0.59765625, "learning_rate": 0.00018841346180652213, "loss": 0.812, "step": 1450 }, { "epoch": 2.3990107172300084, "grad_norm": 0.46875, "learning_rate": 0.00018827852861790398, "loss": 0.8059, "step": 1455 }, { "epoch": 2.407254740313273, "grad_norm": 0.70703125, "learning_rate": 0.00018814286317103714, "loss": 0.8021, "step": 1460 }, { "epoch": 2.4154987633965375, "grad_norm": 1.4921875, "learning_rate": 0.00018800646659124782, "loss": 0.8036, "step": 1465 }, { "epoch": 2.423742786479802, "grad_norm": 0.6484375, "learning_rate": 0.00018786934000992688, "loss": 0.8045, "step": 1470 }, { "epoch": 2.4319868095630666, "grad_norm": 0.58984375, "learning_rate": 0.00018773148456452046, "loss": 0.8108, "step": 1475 }, { "epoch": 2.4402308326463316, "grad_norm": 0.73828125, "learning_rate": 0.00018759290139852048, "loss": 0.8097, "step": 1480 }, { "epoch": 2.448474855729596, "grad_norm": 0.79296875, "learning_rate": 0.00018745359166145523, "loss": 0.8052, "step": 1485 }, { "epoch": 2.4567188788128607, "grad_norm": 1.203125, "learning_rate": 0.00018731355650887985, "loss": 0.8016, "step": 1490 }, { "epoch": 2.464962901896125, "grad_norm": 0.4453125, "learning_rate": 0.00018717279710236666, "loss": 0.8077, "step": 1495 }, { "epoch": 2.4732069249793898, "grad_norm": 0.9921875, "learning_rate": 0.00018703131460949554, "loss": 0.8031, "step": 1500 }, { "epoch": 2.4814509480626548, "grad_norm": 5.46875, "learning_rate": 0.00018688911020384432, "loss": 0.8062, "step": 1505 }, { "epoch": 2.4896949711459193, "grad_norm": 0.7421875, "learning_rate": 0.000186746185064979, "loss": 0.8156, "step": 1510 }, { "epoch": 2.497938994229184, "grad_norm": 0.77734375, "learning_rate": 0.00018660254037844388, "loss": 0.8083, "step": 1515 }, { "epoch": 2.5061830173124484, "grad_norm": 0.70703125, "learning_rate": 0.00018645817733575193, "loss": 0.812, "step": 1520 }, { "epoch": 2.514427040395713, "grad_norm": 3.671875, "learning_rate": 0.00018631309713437467, "loss": 0.796, "step": 1525 }, { "epoch": 2.5226710634789775, "grad_norm": 0.6484375, "learning_rate": 0.0001861673009777325, "loss": 0.7988, "step": 1530 }, { "epoch": 2.5309150865622425, "grad_norm": 1.546875, "learning_rate": 0.00018602079007518438, "loss": 0.7988, "step": 1535 }, { "epoch": 2.539159109645507, "grad_norm": 0.4375, "learning_rate": 0.00018587356564201817, "loss": 0.8045, "step": 1540 }, { "epoch": 2.5474031327287716, "grad_norm": 0.44140625, "learning_rate": 0.0001857256288994402, "loss": 0.8112, "step": 1545 }, { "epoch": 2.555647155812036, "grad_norm": 0.56640625, "learning_rate": 0.00018557698107456549, "loss": 0.808, "step": 1550 }, { "epoch": 2.563891178895301, "grad_norm": 0.453125, "learning_rate": 0.00018542762340040722, "loss": 0.7958, "step": 1555 }, { "epoch": 2.5721352019785657, "grad_norm": 0.859375, "learning_rate": 0.00018527755711586678, "loss": 0.8008, "step": 1560 }, { "epoch": 2.5803792250618303, "grad_norm": 0.462890625, "learning_rate": 0.00018512678346572337, "loss": 0.7995, "step": 1565 }, { "epoch": 2.588623248145095, "grad_norm": 0.734375, "learning_rate": 0.00018497530370062363, "loss": 0.7974, "step": 1570 }, { "epoch": 2.5968672712283594, "grad_norm": 0.51171875, "learning_rate": 0.0001848231190770714, "loss": 0.7929, "step": 1575 }, { "epoch": 2.605111294311624, "grad_norm": 0.78125, "learning_rate": 0.00018467023085741717, "loss": 0.8014, "step": 1580 }, { "epoch": 2.6133553173948885, "grad_norm": 0.9140625, "learning_rate": 0.00018451664030984773, "loss": 0.7944, "step": 1585 }, { "epoch": 2.6215993404781535, "grad_norm": 0.4453125, "learning_rate": 0.00018436234870837547, "loss": 0.7937, "step": 1590 }, { "epoch": 2.629843363561418, "grad_norm": 1.0703125, "learning_rate": 0.00018420735733282807, "loss": 0.7983, "step": 1595 }, { "epoch": 2.6380873866446826, "grad_norm": 0.455078125, "learning_rate": 0.00018405166746883762, "loss": 0.7924, "step": 1600 }, { "epoch": 2.646331409727947, "grad_norm": 0.474609375, "learning_rate": 0.00018389528040783012, "loss": 0.7953, "step": 1605 }, { "epoch": 2.654575432811212, "grad_norm": 0.578125, "learning_rate": 0.00018373819744701476, "loss": 0.7893, "step": 1610 }, { "epoch": 2.6628194558944767, "grad_norm": 0.412109375, "learning_rate": 0.00018358041988937305, "loss": 0.7945, "step": 1615 }, { "epoch": 2.671063478977741, "grad_norm": 0.8125, "learning_rate": 0.00018342194904364813, "loss": 0.7894, "step": 1620 }, { "epoch": 2.6793075020610058, "grad_norm": 0.64453125, "learning_rate": 0.00018326278622433386, "loss": 0.7925, "step": 1625 }, { "epoch": 2.6875515251442703, "grad_norm": 0.5390625, "learning_rate": 0.00018310293275166392, "loss": 0.7978, "step": 1630 }, { "epoch": 2.695795548227535, "grad_norm": 0.63671875, "learning_rate": 0.00018294238995160094, "loss": 0.792, "step": 1635 }, { "epoch": 2.7040395713107994, "grad_norm": 0.671875, "learning_rate": 0.00018278115915582526, "loss": 0.8069, "step": 1640 }, { "epoch": 2.7122835943940644, "grad_norm": 1.515625, "learning_rate": 0.0001826192417017242, "loss": 0.8048, "step": 1645 }, { "epoch": 2.720527617477329, "grad_norm": 0.54296875, "learning_rate": 0.00018245663893238075, "loss": 0.8009, "step": 1650 }, { "epoch": 2.7287716405605935, "grad_norm": 0.6640625, "learning_rate": 0.0001822933521965625, "loss": 0.7903, "step": 1655 }, { "epoch": 2.737015663643858, "grad_norm": 0.48046875, "learning_rate": 0.00018212938284871047, "loss": 0.7917, "step": 1660 }, { "epoch": 2.745259686727123, "grad_norm": 0.58203125, "learning_rate": 0.00018196473224892784, "loss": 0.7886, "step": 1665 }, { "epoch": 2.7535037098103876, "grad_norm": 0.62890625, "learning_rate": 0.0001817994017629687, "loss": 0.7933, "step": 1670 }, { "epoch": 2.761747732893652, "grad_norm": 0.78515625, "learning_rate": 0.00018163339276222666, "loss": 0.792, "step": 1675 }, { "epoch": 2.7699917559769167, "grad_norm": 0.65625, "learning_rate": 0.00018146670662372354, "loss": 0.7825, "step": 1680 }, { "epoch": 2.7782357790601813, "grad_norm": 1.0234375, "learning_rate": 0.0001812993447300979, "loss": 0.7929, "step": 1685 }, { "epoch": 2.786479802143446, "grad_norm": 0.6171875, "learning_rate": 0.00018113130846959368, "loss": 0.7925, "step": 1690 }, { "epoch": 2.7947238252267104, "grad_norm": 0.48828125, "learning_rate": 0.0001809625992360485, "loss": 0.7888, "step": 1695 }, { "epoch": 2.8029678483099754, "grad_norm": 0.400390625, "learning_rate": 0.00018079321842888227, "loss": 0.7995, "step": 1700 }, { "epoch": 2.81121187139324, "grad_norm": 0.48828125, "learning_rate": 0.00018062316745308542, "loss": 0.7939, "step": 1705 }, { "epoch": 2.8194558944765045, "grad_norm": 0.45703125, "learning_rate": 0.0001804524477192075, "loss": 0.79, "step": 1710 }, { "epoch": 2.827699917559769, "grad_norm": 0.462890625, "learning_rate": 0.0001802810606433451, "loss": 0.7927, "step": 1715 }, { "epoch": 2.835943940643034, "grad_norm": 0.4609375, "learning_rate": 0.00018010900764713048, "loss": 0.796, "step": 1720 }, { "epoch": 2.8441879637262986, "grad_norm": 0.75, "learning_rate": 0.0001799362901577196, "loss": 0.7921, "step": 1725 }, { "epoch": 2.852431986809563, "grad_norm": 0.482421875, "learning_rate": 0.00017976290960778024, "loss": 0.79, "step": 1730 }, { "epoch": 2.8606760098928277, "grad_norm": 0.71484375, "learning_rate": 0.0001795888674354802, "loss": 0.7927, "step": 1735 }, { "epoch": 2.868920032976092, "grad_norm": 0.458984375, "learning_rate": 0.00017941416508447536, "loss": 0.7917, "step": 1740 }, { "epoch": 2.8771640560593568, "grad_norm": 1.2265625, "learning_rate": 0.0001792388040038977, "loss": 0.7905, "step": 1745 }, { "epoch": 2.8854080791426218, "grad_norm": 0.7578125, "learning_rate": 0.00017906278564834324, "loss": 0.7934, "step": 1750 }, { "epoch": 2.8936521022258863, "grad_norm": 0.4296875, "learning_rate": 0.00017888611147786002, "loss": 0.7957, "step": 1755 }, { "epoch": 2.901896125309151, "grad_norm": 0.55078125, "learning_rate": 0.00017870878295793598, "loss": 0.7793, "step": 1760 }, { "epoch": 2.9101401483924154, "grad_norm": 0.7421875, "learning_rate": 0.0001785308015594868, "loss": 0.7912, "step": 1765 }, { "epoch": 2.91838417147568, "grad_norm": 0.447265625, "learning_rate": 0.00017835216875884368, "loss": 0.7842, "step": 1770 }, { "epoch": 2.926628194558945, "grad_norm": 0.6640625, "learning_rate": 0.00017817288603774116, "loss": 0.784, "step": 1775 }, { "epoch": 2.9348722176422095, "grad_norm": 0.828125, "learning_rate": 0.00017799295488330467, "loss": 0.7934, "step": 1780 }, { "epoch": 2.943116240725474, "grad_norm": 0.53515625, "learning_rate": 0.00017781237678803847, "loss": 0.7867, "step": 1785 }, { "epoch": 2.9513602638087386, "grad_norm": 0.470703125, "learning_rate": 0.00017763115324981294, "loss": 0.7911, "step": 1790 }, { "epoch": 2.959604286892003, "grad_norm": 0.703125, "learning_rate": 0.00017744928577185243, "loss": 0.7914, "step": 1795 }, { "epoch": 2.9678483099752677, "grad_norm": 0.62109375, "learning_rate": 0.00017726677586272263, "loss": 0.7917, "step": 1800 }, { "epoch": 2.9760923330585327, "grad_norm": 0.455078125, "learning_rate": 0.00017708362503631814, "loss": 0.7819, "step": 1805 }, { "epoch": 2.9843363561417973, "grad_norm": 0.419921875, "learning_rate": 0.00017689983481184989, "loss": 0.7842, "step": 1810 }, { "epoch": 2.992580379225062, "grad_norm": 0.5078125, "learning_rate": 0.00017671540671383243, "loss": 0.7939, "step": 1815 }, { "epoch": 2.9991755976916736, "eval_loss": 2.4241690635681152, "eval_runtime": 0.2578, "eval_samples_per_second": 38.793, "eval_steps_per_second": 3.879, "step": 1819 }, { "epoch": 3.0008244023083264, "grad_norm": 0.416015625, "learning_rate": 0.00017653034227207152, "loss": 0.7885, "step": 1820 }, { "epoch": 3.009068425391591, "grad_norm": 1.0390625, "learning_rate": 0.00017634464302165124, "loss": 0.772, "step": 1825 }, { "epoch": 3.017312448474856, "grad_norm": 0.4765625, "learning_rate": 0.0001761583105029213, "loss": 0.7668, "step": 1830 }, { "epoch": 3.0255564715581205, "grad_norm": 1.1484375, "learning_rate": 0.00017597134626148427, "loss": 0.77, "step": 1835 }, { "epoch": 3.033800494641385, "grad_norm": 0.88671875, "learning_rate": 0.0001757837518481829, "loss": 0.7713, "step": 1840 }, { "epoch": 3.0420445177246496, "grad_norm": 0.74609375, "learning_rate": 0.00017559552881908695, "loss": 0.7748, "step": 1845 }, { "epoch": 3.050288540807914, "grad_norm": 0.6015625, "learning_rate": 0.00017540667873548063, "loss": 0.7653, "step": 1850 }, { "epoch": 3.058532563891179, "grad_norm": 0.50390625, "learning_rate": 0.00017521720316384935, "loss": 0.7706, "step": 1855 }, { "epoch": 3.0667765869744437, "grad_norm": 0.9140625, "learning_rate": 0.00017502710367586687, "loss": 0.7633, "step": 1860 }, { "epoch": 3.075020610057708, "grad_norm": 0.384765625, "learning_rate": 0.00017483638184838239, "loss": 0.7568, "step": 1865 }, { "epoch": 3.0832646331409728, "grad_norm": 0.5390625, "learning_rate": 0.0001746450392634071, "loss": 0.757, "step": 1870 }, { "epoch": 3.0915086562242373, "grad_norm": 0.44140625, "learning_rate": 0.0001744530775081015, "loss": 0.7701, "step": 1875 }, { "epoch": 3.099752679307502, "grad_norm": 0.44140625, "learning_rate": 0.00017426049817476197, "loss": 0.7717, "step": 1880 }, { "epoch": 3.107996702390767, "grad_norm": 0.52734375, "learning_rate": 0.00017406730286080753, "loss": 0.7647, "step": 1885 }, { "epoch": 3.1162407254740314, "grad_norm": 0.5, "learning_rate": 0.00017387349316876666, "loss": 0.7618, "step": 1890 }, { "epoch": 3.124484748557296, "grad_norm": 0.443359375, "learning_rate": 0.00017367907070626424, "loss": 0.7712, "step": 1895 }, { "epoch": 3.1327287716405605, "grad_norm": 0.51953125, "learning_rate": 0.00017348403708600772, "loss": 0.7635, "step": 1900 }, { "epoch": 3.140972794723825, "grad_norm": 0.58203125, "learning_rate": 0.0001732883939257742, "loss": 0.7591, "step": 1905 }, { "epoch": 3.14921681780709, "grad_norm": 0.48046875, "learning_rate": 0.00017309214284839678, "loss": 0.7664, "step": 1910 }, { "epoch": 3.1574608408903546, "grad_norm": 0.486328125, "learning_rate": 0.00017289528548175114, "loss": 0.7633, "step": 1915 }, { "epoch": 3.165704863973619, "grad_norm": 0.482421875, "learning_rate": 0.00017269782345874203, "loss": 0.7676, "step": 1920 }, { "epoch": 3.1739488870568837, "grad_norm": 0.45703125, "learning_rate": 0.0001724997584172898, "loss": 0.7712, "step": 1925 }, { "epoch": 3.1821929101401483, "grad_norm": 0.48046875, "learning_rate": 0.00017230109200031668, "loss": 0.7631, "step": 1930 }, { "epoch": 3.190436933223413, "grad_norm": 0.412109375, "learning_rate": 0.00017210182585573327, "loss": 0.7664, "step": 1935 }, { "epoch": 3.198680956306678, "grad_norm": 0.6484375, "learning_rate": 0.00017190196163642483, "loss": 0.7653, "step": 1940 }, { "epoch": 3.2069249793899424, "grad_norm": 0.60546875, "learning_rate": 0.0001717015010002376, "loss": 0.7677, "step": 1945 }, { "epoch": 3.215169002473207, "grad_norm": 0.5234375, "learning_rate": 0.00017150044560996488, "loss": 0.7628, "step": 1950 }, { "epoch": 3.2234130255564715, "grad_norm": 0.52734375, "learning_rate": 0.00017129879713333356, "loss": 0.7604, "step": 1955 }, { "epoch": 3.231657048639736, "grad_norm": 0.419921875, "learning_rate": 0.00017109655724298995, "loss": 0.7664, "step": 1960 }, { "epoch": 3.239901071723001, "grad_norm": 0.6484375, "learning_rate": 0.00017089372761648616, "loss": 0.7679, "step": 1965 }, { "epoch": 3.2481450948062656, "grad_norm": 0.5234375, "learning_rate": 0.00017069030993626603, "loss": 0.7621, "step": 1970 }, { "epoch": 3.25638911788953, "grad_norm": 0.703125, "learning_rate": 0.00017048630588965117, "loss": 0.7747, "step": 1975 }, { "epoch": 3.2646331409727947, "grad_norm": 0.625, "learning_rate": 0.00017028171716882714, "loss": 0.7655, "step": 1980 }, { "epoch": 3.272877164056059, "grad_norm": 0.61328125, "learning_rate": 0.00017007654547082922, "loss": 0.768, "step": 1985 }, { "epoch": 3.281121187139324, "grad_norm": 0.6796875, "learning_rate": 0.00016987079249752843, "loss": 0.7631, "step": 1990 }, { "epoch": 3.2893652102225888, "grad_norm": 0.7421875, "learning_rate": 0.00016966445995561727, "loss": 0.7686, "step": 1995 }, { "epoch": 3.2976092333058533, "grad_norm": 0.54296875, "learning_rate": 0.00016945754955659595, "loss": 0.7695, "step": 2000 }, { "epoch": 3.305853256389118, "grad_norm": 0.59765625, "learning_rate": 0.00016925006301675763, "loss": 0.7548, "step": 2005 }, { "epoch": 3.3140972794723824, "grad_norm": 0.4140625, "learning_rate": 0.0001690420020571747, "loss": 0.7642, "step": 2010 }, { "epoch": 3.322341302555647, "grad_norm": 0.431640625, "learning_rate": 0.00016883336840368412, "loss": 0.7706, "step": 2015 }, { "epoch": 3.330585325638912, "grad_norm": 0.4375, "learning_rate": 0.0001686241637868734, "loss": 0.7693, "step": 2020 }, { "epoch": 3.3388293487221765, "grad_norm": 0.50390625, "learning_rate": 0.00016841438994206595, "loss": 0.7616, "step": 2025 }, { "epoch": 3.347073371805441, "grad_norm": 0.99609375, "learning_rate": 0.0001682040486093071, "loss": 0.7661, "step": 2030 }, { "epoch": 3.3553173948887056, "grad_norm": 0.65234375, "learning_rate": 0.00016799314153334916, "loss": 0.7543, "step": 2035 }, { "epoch": 3.36356141797197, "grad_norm": 0.92578125, "learning_rate": 0.00016778167046363734, "loss": 0.757, "step": 2040 }, { "epoch": 3.371805441055235, "grad_norm": 1.1640625, "learning_rate": 0.00016756963715429502, "loss": 0.7647, "step": 2045 }, { "epoch": 3.3800494641384997, "grad_norm": 0.5234375, "learning_rate": 0.00016735704336410943, "loss": 0.7562, "step": 2050 }, { "epoch": 3.3882934872217643, "grad_norm": 0.70703125, "learning_rate": 0.0001671438908565167, "loss": 0.7573, "step": 2055 }, { "epoch": 3.396537510305029, "grad_norm": 0.50390625, "learning_rate": 0.00016693018139958763, "loss": 0.7585, "step": 2060 }, { "epoch": 3.4047815333882934, "grad_norm": 1.1953125, "learning_rate": 0.00016671591676601272, "loss": 0.7538, "step": 2065 }, { "epoch": 3.413025556471558, "grad_norm": 0.458984375, "learning_rate": 0.00016650109873308765, "loss": 0.7635, "step": 2070 }, { "epoch": 3.421269579554823, "grad_norm": 0.443359375, "learning_rate": 0.00016628572908269841, "loss": 0.7605, "step": 2075 }, { "epoch": 3.4295136026380875, "grad_norm": 0.421875, "learning_rate": 0.00016606980960130665, "loss": 0.7511, "step": 2080 }, { "epoch": 3.437757625721352, "grad_norm": 0.61328125, "learning_rate": 0.00016585334207993476, "loss": 0.757, "step": 2085 }, { "epoch": 3.4460016488046166, "grad_norm": 0.7109375, "learning_rate": 0.00016563632831415102, "loss": 0.7616, "step": 2090 }, { "epoch": 3.454245671887881, "grad_norm": 0.423828125, "learning_rate": 0.00016541877010405477, "loss": 0.7605, "step": 2095 }, { "epoch": 3.462489694971146, "grad_norm": 0.52734375, "learning_rate": 0.00016520066925426144, "loss": 0.7564, "step": 2100 }, { "epoch": 3.4707337180544107, "grad_norm": 0.59375, "learning_rate": 0.00016498202757388758, "loss": 0.7627, "step": 2105 }, { "epoch": 3.478977741137675, "grad_norm": 0.55859375, "learning_rate": 0.0001647628468765358, "loss": 0.7514, "step": 2110 }, { "epoch": 3.4872217642209398, "grad_norm": 0.640625, "learning_rate": 0.0001645431289802799, "loss": 0.7616, "step": 2115 }, { "epoch": 3.4954657873042043, "grad_norm": 0.546875, "learning_rate": 0.00016432287570764952, "loss": 0.7639, "step": 2120 }, { "epoch": 3.503709810387469, "grad_norm": 0.56640625, "learning_rate": 0.0001641020888856153, "loss": 0.7642, "step": 2125 }, { "epoch": 3.511953833470734, "grad_norm": 0.609375, "learning_rate": 0.00016388077034557355, "loss": 0.7511, "step": 2130 }, { "epoch": 3.5201978565539984, "grad_norm": 0.65234375, "learning_rate": 0.0001636589219233311, "loss": 0.7513, "step": 2135 }, { "epoch": 3.528441879637263, "grad_norm": 0.458984375, "learning_rate": 0.00016343654545909007, "loss": 0.7568, "step": 2140 }, { "epoch": 3.5366859027205275, "grad_norm": 0.435546875, "learning_rate": 0.00016321364279743266, "loss": 0.7562, "step": 2145 }, { "epoch": 3.5449299258037925, "grad_norm": 0.70703125, "learning_rate": 0.00016299021578730579, "loss": 0.7591, "step": 2150 }, { "epoch": 3.553173948887057, "grad_norm": 0.59375, "learning_rate": 0.00016276626628200568, "loss": 0.7665, "step": 2155 }, { "epoch": 3.5614179719703216, "grad_norm": 0.5234375, "learning_rate": 0.00016254179613916278, "loss": 0.7604, "step": 2160 }, { "epoch": 3.569661995053586, "grad_norm": 1.0, "learning_rate": 0.000162316807220726, "loss": 0.7504, "step": 2165 }, { "epoch": 3.5779060181368507, "grad_norm": 0.65234375, "learning_rate": 0.00016209130139294744, "loss": 0.7646, "step": 2170 }, { "epoch": 3.5861500412201153, "grad_norm": 0.69921875, "learning_rate": 0.00016186528052636692, "loss": 0.7562, "step": 2175 }, { "epoch": 3.59439406430338, "grad_norm": 0.63671875, "learning_rate": 0.00016163874649579647, "loss": 0.7501, "step": 2180 }, { "epoch": 3.602638087386645, "grad_norm": 0.482421875, "learning_rate": 0.00016141170118030463, "loss": 0.7548, "step": 2185 }, { "epoch": 3.6108821104699094, "grad_norm": 0.453125, "learning_rate": 0.0001611841464632011, "loss": 0.7582, "step": 2190 }, { "epoch": 3.619126133553174, "grad_norm": 0.50390625, "learning_rate": 0.00016095608423202098, "loss": 0.7517, "step": 2195 }, { "epoch": 3.6273701566364385, "grad_norm": 0.392578125, "learning_rate": 0.00016072751637850904, "loss": 0.7563, "step": 2200 }, { "epoch": 3.6356141797197035, "grad_norm": 0.451171875, "learning_rate": 0.00016049844479860422, "loss": 0.7566, "step": 2205 }, { "epoch": 3.643858202802968, "grad_norm": 0.41796875, "learning_rate": 0.00016026887139242372, "loss": 0.7515, "step": 2210 }, { "epoch": 3.6521022258862326, "grad_norm": 0.49609375, "learning_rate": 0.0001600387980642474, "loss": 0.754, "step": 2215 }, { "epoch": 3.660346248969497, "grad_norm": 0.74609375, "learning_rate": 0.0001598082267225018, "loss": 0.7608, "step": 2220 }, { "epoch": 3.6685902720527617, "grad_norm": 0.5546875, "learning_rate": 0.0001595771592797445, "loss": 0.7574, "step": 2225 }, { "epoch": 3.676834295136026, "grad_norm": 0.59765625, "learning_rate": 0.0001593455976526482, "loss": 0.7526, "step": 2230 }, { "epoch": 3.6850783182192908, "grad_norm": 0.40625, "learning_rate": 0.0001591135437619847, "loss": 0.7546, "step": 2235 }, { "epoch": 3.6933223413025558, "grad_norm": 0.478515625, "learning_rate": 0.00015888099953260905, "loss": 0.7574, "step": 2240 }, { "epoch": 3.7015663643858203, "grad_norm": 0.6171875, "learning_rate": 0.0001586479668934437, "loss": 0.7548, "step": 2245 }, { "epoch": 3.709810387469085, "grad_norm": 0.7421875, "learning_rate": 0.0001584144477774623, "loss": 0.7519, "step": 2250 }, { "epoch": 3.7180544105523494, "grad_norm": 0.75, "learning_rate": 0.0001581804441216738, "loss": 0.761, "step": 2255 }, { "epoch": 3.7262984336356144, "grad_norm": 0.86328125, "learning_rate": 0.00015794595786710632, "loss": 0.7552, "step": 2260 }, { "epoch": 3.734542456718879, "grad_norm": 0.58984375, "learning_rate": 0.00015771099095879108, "loss": 0.7573, "step": 2265 }, { "epoch": 3.7427864798021435, "grad_norm": 1.1484375, "learning_rate": 0.00015747554534574626, "loss": 0.753, "step": 2270 }, { "epoch": 3.751030502885408, "grad_norm": 0.46875, "learning_rate": 0.0001572396229809608, "loss": 0.7587, "step": 2275 }, { "epoch": 3.7592745259686726, "grad_norm": 0.5859375, "learning_rate": 0.00015700322582137827, "loss": 0.7505, "step": 2280 }, { "epoch": 3.767518549051937, "grad_norm": 0.423828125, "learning_rate": 0.0001567663558278806, "loss": 0.747, "step": 2285 }, { "epoch": 3.7757625721352017, "grad_norm": 0.6328125, "learning_rate": 0.0001565290149652718, "loss": 0.763, "step": 2290 }, { "epoch": 3.7840065952184667, "grad_norm": 0.640625, "learning_rate": 0.00015629120520226165, "loss": 0.7547, "step": 2295 }, { "epoch": 3.7922506183017313, "grad_norm": 0.63671875, "learning_rate": 0.00015605292851144942, "loss": 0.7537, "step": 2300 }, { "epoch": 3.800494641384996, "grad_norm": 0.5078125, "learning_rate": 0.00015581418686930743, "loss": 0.754, "step": 2305 }, { "epoch": 3.8087386644682604, "grad_norm": 0.470703125, "learning_rate": 0.00015557498225616487, "loss": 0.7407, "step": 2310 }, { "epoch": 3.8169826875515254, "grad_norm": 0.546875, "learning_rate": 0.00015533531665619098, "loss": 0.7556, "step": 2315 }, { "epoch": 3.82522671063479, "grad_norm": 0.97265625, "learning_rate": 0.00015509519205737896, "loss": 0.7516, "step": 2320 }, { "epoch": 3.8334707337180545, "grad_norm": 0.6328125, "learning_rate": 0.0001548546104515294, "loss": 0.7506, "step": 2325 }, { "epoch": 3.841714756801319, "grad_norm": 0.486328125, "learning_rate": 0.0001546135738342335, "loss": 0.7524, "step": 2330 }, { "epoch": 3.8499587798845836, "grad_norm": 0.81640625, "learning_rate": 0.0001543720842048569, "loss": 0.748, "step": 2335 }, { "epoch": 3.858202802967848, "grad_norm": 0.443359375, "learning_rate": 0.00015413014356652286, "loss": 0.7503, "step": 2340 }, { "epoch": 3.8664468260511127, "grad_norm": 0.486328125, "learning_rate": 0.00015388775392609564, "loss": 0.754, "step": 2345 }, { "epoch": 3.8746908491343777, "grad_norm": 0.439453125, "learning_rate": 0.000153644917294164, "loss": 0.7511, "step": 2350 }, { "epoch": 3.882934872217642, "grad_norm": 0.5078125, "learning_rate": 0.0001534016356850244, "loss": 0.7492, "step": 2355 }, { "epoch": 3.8911788953009068, "grad_norm": 0.4140625, "learning_rate": 0.00015315791111666425, "loss": 0.7529, "step": 2360 }, { "epoch": 3.8994229183841713, "grad_norm": 0.58984375, "learning_rate": 0.00015291374561074536, "loss": 0.7481, "step": 2365 }, { "epoch": 3.9076669414674363, "grad_norm": 0.431640625, "learning_rate": 0.000152669141192587, "loss": 0.752, "step": 2370 }, { "epoch": 3.915910964550701, "grad_norm": 0.41015625, "learning_rate": 0.00015242409989114916, "loss": 0.7389, "step": 2375 }, { "epoch": 3.9241549876339654, "grad_norm": 0.46484375, "learning_rate": 0.00015217862373901575, "loss": 0.7521, "step": 2380 }, { "epoch": 3.93239901071723, "grad_norm": 0.5546875, "learning_rate": 0.0001519327147723776, "loss": 0.742, "step": 2385 }, { "epoch": 3.9406430338004945, "grad_norm": 0.73046875, "learning_rate": 0.00015168637503101584, "loss": 0.7499, "step": 2390 }, { "epoch": 3.948887056883759, "grad_norm": 0.486328125, "learning_rate": 0.00015143960655828468, "loss": 0.7516, "step": 2395 }, { "epoch": 3.957131079967024, "grad_norm": 0.384765625, "learning_rate": 0.00015119241140109467, "loss": 0.7493, "step": 2400 }, { "epoch": 3.9653751030502886, "grad_norm": 0.458984375, "learning_rate": 0.0001509447916098956, "loss": 0.7445, "step": 2405 }, { "epoch": 3.973619126133553, "grad_norm": 0.40625, "learning_rate": 0.0001506967492386596, "loss": 0.7535, "step": 2410 }, { "epoch": 3.9818631492168177, "grad_norm": 0.466796875, "learning_rate": 0.000150448286344864, "loss": 0.7411, "step": 2415 }, { "epoch": 3.9901071723000827, "grad_norm": 0.87890625, "learning_rate": 0.00015019940498947428, "loss": 0.7484, "step": 2420 }, { "epoch": 3.9983511953833473, "grad_norm": 0.439453125, "learning_rate": 0.00014995010723692714, "loss": 0.7465, "step": 2425 }, { "epoch": 4.0, "eval_loss": 2.436275005340576, "eval_runtime": 0.2365, "eval_samples_per_second": 42.283, "eval_steps_per_second": 4.228, "step": 2426 }, { "epoch": 4.006595218466612, "grad_norm": 0.47265625, "learning_rate": 0.00014970039515511304, "loss": 0.7483, "step": 2430 }, { "epoch": 4.014839241549876, "grad_norm": 0.439453125, "learning_rate": 0.00014945027081535937, "loss": 0.7256, "step": 2435 }, { "epoch": 4.023083264633141, "grad_norm": 0.6171875, "learning_rate": 0.00014919973629241314, "loss": 0.7386, "step": 2440 }, { "epoch": 4.0313272877164055, "grad_norm": 0.4765625, "learning_rate": 0.0001489487936644237, "loss": 0.7329, "step": 2445 }, { "epoch": 4.03957131079967, "grad_norm": 0.84765625, "learning_rate": 0.00014869744501292561, "loss": 0.7317, "step": 2450 }, { "epoch": 4.047815333882935, "grad_norm": 0.4375, "learning_rate": 0.00014844569242282148, "loss": 0.7278, "step": 2455 }, { "epoch": 4.056059356966199, "grad_norm": 0.5234375, "learning_rate": 0.00014819353798236427, "loss": 0.73, "step": 2460 }, { "epoch": 4.064303380049465, "grad_norm": 0.91796875, "learning_rate": 0.0001479409837831404, "loss": 0.7357, "step": 2465 }, { "epoch": 4.072547403132729, "grad_norm": 0.78125, "learning_rate": 0.00014768803192005223, "loss": 0.7341, "step": 2470 }, { "epoch": 4.080791426215994, "grad_norm": 0.404296875, "learning_rate": 0.00014743468449130063, "loss": 0.7367, "step": 2475 }, { "epoch": 4.089035449299258, "grad_norm": 0.53125, "learning_rate": 0.00014718094359836772, "loss": 0.7322, "step": 2480 }, { "epoch": 4.097279472382523, "grad_norm": 0.453125, "learning_rate": 0.00014692681134599925, "loss": 0.73, "step": 2485 }, { "epoch": 4.105523495465787, "grad_norm": 0.44140625, "learning_rate": 0.0001466722898421873, "loss": 0.7364, "step": 2490 }, { "epoch": 4.113767518549052, "grad_norm": 0.4375, "learning_rate": 0.00014641738119815266, "loss": 0.7267, "step": 2495 }, { "epoch": 4.122011541632316, "grad_norm": 0.412109375, "learning_rate": 0.00014616208752832758, "loss": 0.7282, "step": 2500 }, { "epoch": 4.130255564715581, "grad_norm": 0.431640625, "learning_rate": 0.00014590641095033787, "loss": 0.7251, "step": 2505 }, { "epoch": 4.1384995877988455, "grad_norm": 0.5234375, "learning_rate": 0.0001456503535849855, "loss": 0.7391, "step": 2510 }, { "epoch": 4.14674361088211, "grad_norm": 0.546875, "learning_rate": 0.0001453939175562312, "loss": 0.7346, "step": 2515 }, { "epoch": 4.1549876339653755, "grad_norm": 0.6328125, "learning_rate": 0.00014513710499117647, "loss": 0.7362, "step": 2520 }, { "epoch": 4.16323165704864, "grad_norm": 0.451171875, "learning_rate": 0.00014487991802004623, "loss": 0.731, "step": 2525 }, { "epoch": 4.171475680131905, "grad_norm": 0.484375, "learning_rate": 0.00014462235877617098, "loss": 0.7285, "step": 2530 }, { "epoch": 4.179719703215169, "grad_norm": 0.52734375, "learning_rate": 0.0001443644293959693, "loss": 0.7386, "step": 2535 }, { "epoch": 4.187963726298434, "grad_norm": 0.494140625, "learning_rate": 0.00014410613201892985, "loss": 0.7376, "step": 2540 }, { "epoch": 4.196207749381698, "grad_norm": 0.4765625, "learning_rate": 0.0001438474687875938, "loss": 0.731, "step": 2545 }, { "epoch": 4.204451772464963, "grad_norm": 0.384765625, "learning_rate": 0.00014358844184753712, "loss": 0.7238, "step": 2550 }, { "epoch": 4.212695795548227, "grad_norm": 0.45703125, "learning_rate": 0.00014332905334735261, "loss": 0.7246, "step": 2555 }, { "epoch": 4.220939818631492, "grad_norm": 0.5625, "learning_rate": 0.00014306930543863219, "loss": 0.7394, "step": 2560 }, { "epoch": 4.2291838417147565, "grad_norm": 0.47265625, "learning_rate": 0.00014280920027594907, "loss": 0.7306, "step": 2565 }, { "epoch": 4.237427864798021, "grad_norm": 0.57421875, "learning_rate": 0.00014254874001683976, "loss": 0.7418, "step": 2570 }, { "epoch": 4.2456718878812865, "grad_norm": 0.45703125, "learning_rate": 0.00014228792682178623, "loss": 0.7291, "step": 2575 }, { "epoch": 4.253915910964551, "grad_norm": 0.43359375, "learning_rate": 0.00014202676285419812, "loss": 0.7273, "step": 2580 }, { "epoch": 4.262159934047816, "grad_norm": 0.50390625, "learning_rate": 0.00014176525028039452, "loss": 0.7311, "step": 2585 }, { "epoch": 4.27040395713108, "grad_norm": 0.423828125, "learning_rate": 0.00014150339126958633, "loss": 0.7214, "step": 2590 }, { "epoch": 4.278647980214345, "grad_norm": 0.43359375, "learning_rate": 0.00014124118799385796, "loss": 0.7324, "step": 2595 }, { "epoch": 4.286892003297609, "grad_norm": 0.66015625, "learning_rate": 0.00014097864262814955, "loss": 0.7397, "step": 2600 }, { "epoch": 4.295136026380874, "grad_norm": 0.625, "learning_rate": 0.00014071575735023875, "loss": 0.7382, "step": 2605 }, { "epoch": 4.303380049464138, "grad_norm": 0.46875, "learning_rate": 0.0001404525343407228, "loss": 0.7324, "step": 2610 }, { "epoch": 4.311624072547403, "grad_norm": 0.41015625, "learning_rate": 0.00014018897578300035, "loss": 0.7327, "step": 2615 }, { "epoch": 4.319868095630667, "grad_norm": 0.43359375, "learning_rate": 0.0001399250838632533, "loss": 0.7419, "step": 2620 }, { "epoch": 4.328112118713932, "grad_norm": 0.4921875, "learning_rate": 0.0001396608607704289, "loss": 0.738, "step": 2625 }, { "epoch": 4.336356141797197, "grad_norm": 0.70703125, "learning_rate": 0.00013939630869622133, "loss": 0.7412, "step": 2630 }, { "epoch": 4.344600164880462, "grad_norm": 0.42578125, "learning_rate": 0.00013913142983505364, "loss": 0.7336, "step": 2635 }, { "epoch": 4.3528441879637265, "grad_norm": 0.53515625, "learning_rate": 0.00013886622638405952, "loss": 0.7282, "step": 2640 }, { "epoch": 4.361088211046991, "grad_norm": 0.400390625, "learning_rate": 0.00013860070054306516, "loss": 0.7306, "step": 2645 }, { "epoch": 4.369332234130256, "grad_norm": 0.84765625, "learning_rate": 0.0001383348545145708, "loss": 0.7279, "step": 2650 }, { "epoch": 4.37757625721352, "grad_norm": 0.4765625, "learning_rate": 0.0001380686905037327, "loss": 0.7355, "step": 2655 }, { "epoch": 4.385820280296785, "grad_norm": 1.0546875, "learning_rate": 0.00013780221071834476, "loss": 0.7336, "step": 2660 }, { "epoch": 4.394064303380049, "grad_norm": 0.423828125, "learning_rate": 0.0001375354173688201, "loss": 0.7314, "step": 2665 }, { "epoch": 4.402308326463314, "grad_norm": 0.58984375, "learning_rate": 0.00013726831266817278, "loss": 0.7344, "step": 2670 }, { "epoch": 4.410552349546578, "grad_norm": 0.61328125, "learning_rate": 0.00013700089883199966, "loss": 0.7361, "step": 2675 }, { "epoch": 4.418796372629844, "grad_norm": 0.57421875, "learning_rate": 0.0001367331780784616, "loss": 0.7322, "step": 2680 }, { "epoch": 4.427040395713108, "grad_norm": 0.5234375, "learning_rate": 0.00013646515262826552, "loss": 0.7332, "step": 2685 }, { "epoch": 4.435284418796373, "grad_norm": 0.46875, "learning_rate": 0.00013619682470464558, "loss": 0.7321, "step": 2690 }, { "epoch": 4.4435284418796375, "grad_norm": 0.71484375, "learning_rate": 0.00013592819653334505, "loss": 0.7262, "step": 2695 }, { "epoch": 4.451772464962902, "grad_norm": 0.443359375, "learning_rate": 0.0001356592703425976, "loss": 0.7273, "step": 2700 }, { "epoch": 4.460016488046167, "grad_norm": 0.61328125, "learning_rate": 0.00013539004836310894, "loss": 0.7378, "step": 2705 }, { "epoch": 4.468260511129431, "grad_norm": 0.51171875, "learning_rate": 0.0001351205328280385, "loss": 0.7254, "step": 2710 }, { "epoch": 4.476504534212696, "grad_norm": 0.53125, "learning_rate": 0.00013485072597298038, "loss": 0.729, "step": 2715 }, { "epoch": 4.48474855729596, "grad_norm": 0.443359375, "learning_rate": 0.00013458063003594543, "loss": 0.7375, "step": 2720 }, { "epoch": 4.492992580379225, "grad_norm": 0.44140625, "learning_rate": 0.0001343102472573423, "loss": 0.7278, "step": 2725 }, { "epoch": 4.501236603462489, "grad_norm": 0.4609375, "learning_rate": 0.00013403957987995882, "loss": 0.7363, "step": 2730 }, { "epoch": 4.509480626545754, "grad_norm": 0.515625, "learning_rate": 0.00013376863014894375, "loss": 0.7341, "step": 2735 }, { "epoch": 4.517724649629019, "grad_norm": 0.423828125, "learning_rate": 0.00013349740031178784, "loss": 0.7325, "step": 2740 }, { "epoch": 4.525968672712284, "grad_norm": 0.447265625, "learning_rate": 0.00013322589261830517, "loss": 0.7376, "step": 2745 }, { "epoch": 4.534212695795548, "grad_norm": 0.4375, "learning_rate": 0.00013295410932061478, "loss": 0.727, "step": 2750 }, { "epoch": 4.542456718878813, "grad_norm": 0.431640625, "learning_rate": 0.00013268205267312174, "loss": 0.729, "step": 2755 }, { "epoch": 4.5507007419620775, "grad_norm": 0.412109375, "learning_rate": 0.00013240972493249847, "loss": 0.7355, "step": 2760 }, { "epoch": 4.558944765045342, "grad_norm": 0.4921875, "learning_rate": 0.00013213712835766607, "loss": 0.7362, "step": 2765 }, { "epoch": 4.567188788128607, "grad_norm": 0.4609375, "learning_rate": 0.0001318642652097757, "loss": 0.7319, "step": 2770 }, { "epoch": 4.575432811211871, "grad_norm": 0.384765625, "learning_rate": 0.00013159113775218964, "loss": 0.7265, "step": 2775 }, { "epoch": 4.583676834295136, "grad_norm": 0.39453125, "learning_rate": 0.00013131774825046245, "loss": 0.7343, "step": 2780 }, { "epoch": 4.5919208573784, "grad_norm": 0.447265625, "learning_rate": 0.00013104409897232258, "loss": 0.7231, "step": 2785 }, { "epoch": 4.600164880461666, "grad_norm": 0.4609375, "learning_rate": 0.00013077019218765305, "loss": 0.7305, "step": 2790 }, { "epoch": 4.60840890354493, "grad_norm": 0.40625, "learning_rate": 0.00013049603016847296, "loss": 0.7311, "step": 2795 }, { "epoch": 4.616652926628195, "grad_norm": 0.57421875, "learning_rate": 0.00013022161518891855, "loss": 0.7347, "step": 2800 }, { "epoch": 4.624896949711459, "grad_norm": 0.421875, "learning_rate": 0.00012994694952522435, "loss": 0.7395, "step": 2805 }, { "epoch": 4.633140972794724, "grad_norm": 0.40625, "learning_rate": 0.00012967203545570418, "loss": 0.7332, "step": 2810 }, { "epoch": 4.6413849958779885, "grad_norm": 0.455078125, "learning_rate": 0.0001293968752607325, "loss": 0.7326, "step": 2815 }, { "epoch": 4.649629018961253, "grad_norm": 0.53515625, "learning_rate": 0.00012912147122272523, "loss": 0.7317, "step": 2820 }, { "epoch": 4.657873042044518, "grad_norm": 0.6953125, "learning_rate": 0.00012884582562612095, "loss": 0.7336, "step": 2825 }, { "epoch": 4.666117065127782, "grad_norm": 0.41796875, "learning_rate": 0.00012856994075736197, "loss": 0.7283, "step": 2830 }, { "epoch": 4.674361088211047, "grad_norm": 0.5390625, "learning_rate": 0.00012829381890487536, "loss": 0.7366, "step": 2835 }, { "epoch": 4.682605111294311, "grad_norm": 0.8515625, "learning_rate": 0.00012801746235905384, "loss": 0.7377, "step": 2840 }, { "epoch": 4.690849134377576, "grad_norm": 0.40625, "learning_rate": 0.00012774087341223695, "loss": 0.7357, "step": 2845 }, { "epoch": 4.699093157460841, "grad_norm": 0.490234375, "learning_rate": 0.00012746405435869198, "loss": 0.7307, "step": 2850 }, { "epoch": 4.707337180544106, "grad_norm": 0.40234375, "learning_rate": 0.00012718700749459486, "loss": 0.7307, "step": 2855 }, { "epoch": 4.71558120362737, "grad_norm": 0.5625, "learning_rate": 0.0001269097351180112, "loss": 0.7244, "step": 2860 }, { "epoch": 4.723825226710635, "grad_norm": 0.3984375, "learning_rate": 0.00012663223952887723, "loss": 0.7321, "step": 2865 }, { "epoch": 4.732069249793899, "grad_norm": 0.40234375, "learning_rate": 0.0001263545230289807, "loss": 0.7243, "step": 2870 }, { "epoch": 4.740313272877164, "grad_norm": 0.4140625, "learning_rate": 0.00012607658792194174, "loss": 0.7282, "step": 2875 }, { "epoch": 4.7485572959604285, "grad_norm": 0.4921875, "learning_rate": 0.0001257984365131938, "loss": 0.7239, "step": 2880 }, { "epoch": 4.756801319043693, "grad_norm": 0.6640625, "learning_rate": 0.00012552007110996463, "loss": 0.7273, "step": 2885 }, { "epoch": 4.765045342126958, "grad_norm": 0.65625, "learning_rate": 0.00012524149402125685, "loss": 0.7251, "step": 2890 }, { "epoch": 4.773289365210223, "grad_norm": 0.50390625, "learning_rate": 0.00012496270755782914, "loss": 0.739, "step": 2895 }, { "epoch": 4.781533388293488, "grad_norm": 0.42578125, "learning_rate": 0.00012468371403217684, "loss": 0.7344, "step": 2900 }, { "epoch": 4.789777411376752, "grad_norm": 0.57421875, "learning_rate": 0.00012440451575851285, "loss": 0.7314, "step": 2905 }, { "epoch": 4.798021434460017, "grad_norm": 0.5, "learning_rate": 0.00012412511505274844, "loss": 0.7269, "step": 2910 }, { "epoch": 4.806265457543281, "grad_norm": 0.58203125, "learning_rate": 0.00012384551423247407, "loss": 0.7292, "step": 2915 }, { "epoch": 4.814509480626546, "grad_norm": 0.6171875, "learning_rate": 0.00012356571561693996, "loss": 0.7227, "step": 2920 }, { "epoch": 4.82275350370981, "grad_norm": 0.4921875, "learning_rate": 0.00012328572152703725, "loss": 0.7311, "step": 2925 }, { "epoch": 4.830997526793075, "grad_norm": 0.59375, "learning_rate": 0.00012300553428527832, "loss": 0.7315, "step": 2930 }, { "epoch": 4.8392415498763395, "grad_norm": 0.6796875, "learning_rate": 0.00012272515621577782, "loss": 0.7376, "step": 2935 }, { "epoch": 4.847485572959604, "grad_norm": 0.65234375, "learning_rate": 0.00012244458964423327, "loss": 0.7305, "step": 2940 }, { "epoch": 4.855729596042869, "grad_norm": 0.515625, "learning_rate": 0.00012216383689790574, "loss": 0.7279, "step": 2945 }, { "epoch": 4.863973619126133, "grad_norm": 0.443359375, "learning_rate": 0.00012188290030560063, "loss": 0.7299, "step": 2950 }, { "epoch": 4.872217642209399, "grad_norm": 0.44921875, "learning_rate": 0.00012160178219764837, "loss": 0.7253, "step": 2955 }, { "epoch": 4.880461665292663, "grad_norm": 0.56640625, "learning_rate": 0.00012132048490588492, "loss": 0.7291, "step": 2960 }, { "epoch": 4.888705688375928, "grad_norm": 0.462890625, "learning_rate": 0.00012103901076363269, "loss": 0.7244, "step": 2965 }, { "epoch": 4.896949711459192, "grad_norm": 0.53125, "learning_rate": 0.0001207573621056809, "loss": 0.7279, "step": 2970 }, { "epoch": 4.905193734542457, "grad_norm": 0.55078125, "learning_rate": 0.00012047554126826643, "loss": 0.7297, "step": 2975 }, { "epoch": 4.913437757625721, "grad_norm": 0.53515625, "learning_rate": 0.00012019355058905435, "loss": 0.7285, "step": 2980 }, { "epoch": 4.921681780708986, "grad_norm": 0.4296875, "learning_rate": 0.00011991139240711857, "loss": 0.7312, "step": 2985 }, { "epoch": 4.92992580379225, "grad_norm": 0.5390625, "learning_rate": 0.00011962906906292238, "loss": 0.7284, "step": 2990 }, { "epoch": 4.938169826875515, "grad_norm": 0.423828125, "learning_rate": 0.00011934658289829902, "loss": 0.7336, "step": 2995 }, { "epoch": 4.9464138499587795, "grad_norm": 0.404296875, "learning_rate": 0.00011906393625643244, "loss": 0.7281, "step": 3000 }, { "epoch": 4.954657873042045, "grad_norm": 0.41015625, "learning_rate": 0.00011878113148183758, "loss": 0.7271, "step": 3005 }, { "epoch": 4.9629018961253095, "grad_norm": 0.5, "learning_rate": 0.00011849817092034118, "loss": 0.7229, "step": 3010 }, { "epoch": 4.971145919208574, "grad_norm": 0.4375, "learning_rate": 0.00011821505691906216, "loss": 0.7318, "step": 3015 }, { "epoch": 4.979389942291839, "grad_norm": 0.39453125, "learning_rate": 0.00011793179182639218, "loss": 0.7366, "step": 3020 }, { "epoch": 4.987633965375103, "grad_norm": 0.421875, "learning_rate": 0.00011764837799197622, "loss": 0.7337, "step": 3025 }, { "epoch": 4.995877988458368, "grad_norm": 0.48828125, "learning_rate": 0.00011736481776669306, "loss": 0.7312, "step": 3030 }, { "epoch": 4.999175597691673, "eval_loss": 2.439051389694214, "eval_runtime": 0.2596, "eval_samples_per_second": 38.523, "eval_steps_per_second": 3.852, "step": 3032 }, { "epoch": 5.004122011541632, "grad_norm": 0.427734375, "learning_rate": 0.0001170811135026357, "loss": 0.7263, "step": 3035 }, { "epoch": 5.012366034624897, "grad_norm": 0.6640625, "learning_rate": 0.00011679726755309205, "loss": 0.7183, "step": 3040 }, { "epoch": 5.020610057708161, "grad_norm": 0.51171875, "learning_rate": 0.00011651328227252517, "loss": 0.723, "step": 3045 }, { "epoch": 5.028854080791426, "grad_norm": 0.5234375, "learning_rate": 0.00011622916001655388, "loss": 0.7185, "step": 3050 }, { "epoch": 5.0370981038746905, "grad_norm": 0.546875, "learning_rate": 0.00011594490314193323, "loss": 0.7132, "step": 3055 }, { "epoch": 5.045342126957956, "grad_norm": 0.416015625, "learning_rate": 0.00011566051400653486, "loss": 0.7054, "step": 3060 }, { "epoch": 5.0535861500412205, "grad_norm": 0.421875, "learning_rate": 0.00011537599496932752, "loss": 0.7197, "step": 3065 }, { "epoch": 5.061830173124485, "grad_norm": 0.43359375, "learning_rate": 0.00011509134839035748, "loss": 0.7157, "step": 3070 }, { "epoch": 5.07007419620775, "grad_norm": 0.458984375, "learning_rate": 0.00011480657663072896, "loss": 0.7093, "step": 3075 }, { "epoch": 5.078318219291014, "grad_norm": 0.59375, "learning_rate": 0.0001145216820525845, "loss": 0.7286, "step": 3080 }, { "epoch": 5.086562242374279, "grad_norm": 0.421875, "learning_rate": 0.00011423666701908547, "loss": 0.7105, "step": 3085 }, { "epoch": 5.094806265457543, "grad_norm": 0.39453125, "learning_rate": 0.00011395153389439233, "loss": 0.7072, "step": 3090 }, { "epoch": 5.103050288540808, "grad_norm": 0.40625, "learning_rate": 0.00011366628504364509, "loss": 0.7156, "step": 3095 }, { "epoch": 5.111294311624072, "grad_norm": 0.4609375, "learning_rate": 0.00011338092283294377, "loss": 0.7052, "step": 3100 }, { "epoch": 5.119538334707337, "grad_norm": 0.51953125, "learning_rate": 0.00011309544962932862, "loss": 0.7197, "step": 3105 }, { "epoch": 5.127782357790601, "grad_norm": 0.41796875, "learning_rate": 0.00011280986780076057, "loss": 0.7195, "step": 3110 }, { "epoch": 5.136026380873867, "grad_norm": 0.40234375, "learning_rate": 0.00011252417971610163, "loss": 0.7062, "step": 3115 }, { "epoch": 5.144270403957131, "grad_norm": 0.546875, "learning_rate": 0.00011223838774509514, "loss": 0.7225, "step": 3120 }, { "epoch": 5.152514427040396, "grad_norm": 0.4140625, "learning_rate": 0.00011195249425834615, "loss": 0.7106, "step": 3125 }, { "epoch": 5.1607584501236605, "grad_norm": 0.47265625, "learning_rate": 0.00011166650162730188, "loss": 0.7174, "step": 3130 }, { "epoch": 5.169002473206925, "grad_norm": 0.76953125, "learning_rate": 0.00011138041222423177, "loss": 0.7208, "step": 3135 }, { "epoch": 5.17724649629019, "grad_norm": 0.7265625, "learning_rate": 0.00011109422842220805, "loss": 0.716, "step": 3140 }, { "epoch": 5.185490519373454, "grad_norm": 0.49609375, "learning_rate": 0.00011080795259508608, "loss": 0.717, "step": 3145 }, { "epoch": 5.193734542456719, "grad_norm": 0.3984375, "learning_rate": 0.00011052158711748434, "loss": 0.7093, "step": 3150 }, { "epoch": 5.201978565539983, "grad_norm": 0.427734375, "learning_rate": 0.00011023513436476511, "loss": 0.7129, "step": 3155 }, { "epoch": 5.210222588623248, "grad_norm": 0.400390625, "learning_rate": 0.00010994859671301462, "loss": 0.7168, "step": 3160 }, { "epoch": 5.218466611706512, "grad_norm": 0.419921875, "learning_rate": 0.0001096619765390232, "loss": 0.7158, "step": 3165 }, { "epoch": 5.226710634789778, "grad_norm": 0.42578125, "learning_rate": 0.00010937527622026575, "loss": 0.7229, "step": 3170 }, { "epoch": 5.234954657873042, "grad_norm": 0.62109375, "learning_rate": 0.00010908849813488203, "loss": 0.7151, "step": 3175 }, { "epoch": 5.243198680956307, "grad_norm": 0.40625, "learning_rate": 0.00010880164466165674, "loss": 0.7185, "step": 3180 }, { "epoch": 5.2514427040395715, "grad_norm": 0.431640625, "learning_rate": 0.00010851471817999997, "loss": 0.7113, "step": 3185 }, { "epoch": 5.259686727122836, "grad_norm": 0.4140625, "learning_rate": 0.00010822772106992747, "loss": 0.7178, "step": 3190 }, { "epoch": 5.267930750206101, "grad_norm": 0.51171875, "learning_rate": 0.00010794065571204072, "loss": 0.7106, "step": 3195 }, { "epoch": 5.276174773289365, "grad_norm": 0.484375, "learning_rate": 0.0001076535244875074, "loss": 0.7136, "step": 3200 }, { "epoch": 5.28441879637263, "grad_norm": 0.59765625, "learning_rate": 0.00010736632977804149, "loss": 0.7138, "step": 3205 }, { "epoch": 5.292662819455894, "grad_norm": 0.65625, "learning_rate": 0.00010707907396588361, "loss": 0.7192, "step": 3210 }, { "epoch": 5.300906842539159, "grad_norm": 0.55859375, "learning_rate": 0.00010679175943378119, "loss": 0.7068, "step": 3215 }, { "epoch": 5.309150865622423, "grad_norm": 0.427734375, "learning_rate": 0.00010650438856496872, "loss": 0.7095, "step": 3220 }, { "epoch": 5.317394888705689, "grad_norm": 0.462890625, "learning_rate": 0.00010621696374314807, "loss": 0.7118, "step": 3225 }, { "epoch": 5.325638911788953, "grad_norm": 0.474609375, "learning_rate": 0.00010592948735246854, "loss": 0.711, "step": 3230 }, { "epoch": 5.333882934872218, "grad_norm": 0.53515625, "learning_rate": 0.00010564196177750725, "loss": 0.7172, "step": 3235 }, { "epoch": 5.342126957955482, "grad_norm": 0.435546875, "learning_rate": 0.0001053543894032493, "loss": 0.7171, "step": 3240 }, { "epoch": 5.350370981038747, "grad_norm": 0.52734375, "learning_rate": 0.00010506677261506797, "loss": 0.7153, "step": 3245 }, { "epoch": 5.3586150041220115, "grad_norm": 0.408203125, "learning_rate": 0.00010477911379870488, "loss": 0.7162, "step": 3250 }, { "epoch": 5.366859027205276, "grad_norm": 0.423828125, "learning_rate": 0.00010449141534025045, "loss": 0.7067, "step": 3255 }, { "epoch": 5.375103050288541, "grad_norm": 0.55859375, "learning_rate": 0.00010420367962612372, "loss": 0.7117, "step": 3260 }, { "epoch": 5.383347073371805, "grad_norm": 0.416015625, "learning_rate": 0.00010391590904305284, "loss": 0.7175, "step": 3265 }, { "epoch": 5.39159109645507, "grad_norm": 0.41796875, "learning_rate": 0.00010362810597805526, "loss": 0.7109, "step": 3270 }, { "epoch": 5.399835119538334, "grad_norm": 0.46484375, "learning_rate": 0.00010334027281841781, "loss": 0.7136, "step": 3275 }, { "epoch": 5.4080791426216, "grad_norm": 0.412109375, "learning_rate": 0.00010305241195167687, "loss": 0.7123, "step": 3280 }, { "epoch": 5.416323165704864, "grad_norm": 0.408203125, "learning_rate": 0.00010276452576559879, "loss": 0.7132, "step": 3285 }, { "epoch": 5.424567188788129, "grad_norm": 0.4296875, "learning_rate": 0.00010247661664815986, "loss": 0.7161, "step": 3290 }, { "epoch": 5.432811211871393, "grad_norm": 0.50390625, "learning_rate": 0.00010218868698752658, "loss": 0.7122, "step": 3295 }, { "epoch": 5.441055234954658, "grad_norm": 0.48046875, "learning_rate": 0.00010190073917203589, "loss": 0.7167, "step": 3300 }, { "epoch": 5.4492992580379225, "grad_norm": 0.51953125, "learning_rate": 0.00010161277559017528, "loss": 0.7143, "step": 3305 }, { "epoch": 5.457543281121187, "grad_norm": 0.44921875, "learning_rate": 0.00010132479863056303, "loss": 0.7163, "step": 3310 }, { "epoch": 5.465787304204452, "grad_norm": 0.46484375, "learning_rate": 0.00010103681068192845, "loss": 0.7173, "step": 3315 }, { "epoch": 5.474031327287716, "grad_norm": 0.484375, "learning_rate": 0.00010074881413309193, "loss": 0.714, "step": 3320 }, { "epoch": 5.482275350370981, "grad_norm": 0.486328125, "learning_rate": 0.00010046081137294516, "loss": 0.7128, "step": 3325 }, { "epoch": 5.490519373454246, "grad_norm": 0.486328125, "learning_rate": 0.00010017280479043147, "loss": 0.7242, "step": 3330 }, { "epoch": 5.498763396537511, "grad_norm": 0.421875, "learning_rate": 9.988479677452584e-05, "loss": 0.7196, "step": 3335 }, { "epoch": 5.507007419620775, "grad_norm": 0.40625, "learning_rate": 9.959678971421508e-05, "loss": 0.714, "step": 3340 }, { "epoch": 5.51525144270404, "grad_norm": 0.412109375, "learning_rate": 9.930878599847821e-05, "loss": 0.7173, "step": 3345 }, { "epoch": 5.523495465787304, "grad_norm": 0.46484375, "learning_rate": 9.902078801626636e-05, "loss": 0.7137, "step": 3350 }, { "epoch": 5.531739488870569, "grad_norm": 0.423828125, "learning_rate": 9.873279815648318e-05, "loss": 0.7125, "step": 3355 }, { "epoch": 5.539983511953833, "grad_norm": 0.451171875, "learning_rate": 9.844481880796491e-05, "loss": 0.7173, "step": 3360 }, { "epoch": 5.548227535037098, "grad_norm": 0.451171875, "learning_rate": 9.815685235946068e-05, "loss": 0.7134, "step": 3365 }, { "epoch": 5.5564715581203625, "grad_norm": 0.470703125, "learning_rate": 9.786890119961253e-05, "loss": 0.7199, "step": 3370 }, { "epoch": 5.564715581203627, "grad_norm": 0.419921875, "learning_rate": 9.758096771693573e-05, "loss": 0.7116, "step": 3375 }, { "epoch": 5.572959604286892, "grad_norm": 0.42578125, "learning_rate": 9.729305429979887e-05, "loss": 0.7131, "step": 3380 }, { "epoch": 5.581203627370156, "grad_norm": 0.47265625, "learning_rate": 9.700516333640415e-05, "loss": 0.7172, "step": 3385 }, { "epoch": 5.589447650453422, "grad_norm": 0.427734375, "learning_rate": 9.671729721476746e-05, "loss": 0.7121, "step": 3390 }, { "epoch": 5.597691673536686, "grad_norm": 0.4375, "learning_rate": 9.642945832269874e-05, "loss": 0.7187, "step": 3395 }, { "epoch": 5.605935696619951, "grad_norm": 0.42578125, "learning_rate": 9.614164904778196e-05, "loss": 0.7108, "step": 3400 }, { "epoch": 5.614179719703215, "grad_norm": 0.421875, "learning_rate": 9.585387177735547e-05, "loss": 0.7099, "step": 3405 }, { "epoch": 5.62242374278648, "grad_norm": 0.447265625, "learning_rate": 9.556612889849214e-05, "loss": 0.7169, "step": 3410 }, { "epoch": 5.630667765869744, "grad_norm": 0.5390625, "learning_rate": 9.527842279797953e-05, "loss": 0.7118, "step": 3415 }, { "epoch": 5.638911788953009, "grad_norm": 0.396484375, "learning_rate": 9.499075586230013e-05, "loss": 0.7148, "step": 3420 }, { "epoch": 5.6471558120362735, "grad_norm": 0.39453125, "learning_rate": 9.470313047761167e-05, "loss": 0.7166, "step": 3425 }, { "epoch": 5.655399835119538, "grad_norm": 0.52734375, "learning_rate": 9.44155490297271e-05, "loss": 0.7156, "step": 3430 }, { "epoch": 5.663643858202803, "grad_norm": 0.4765625, "learning_rate": 9.412801390409497e-05, "loss": 0.707, "step": 3435 }, { "epoch": 5.671887881286068, "grad_norm": 0.4296875, "learning_rate": 9.38405274857796e-05, "loss": 0.7125, "step": 3440 }, { "epoch": 5.680131904369333, "grad_norm": 0.453125, "learning_rate": 9.355309215944124e-05, "loss": 0.7153, "step": 3445 }, { "epoch": 5.688375927452597, "grad_norm": 0.45703125, "learning_rate": 9.326571030931637e-05, "loss": 0.7143, "step": 3450 }, { "epoch": 5.696619950535862, "grad_norm": 0.42578125, "learning_rate": 9.297838431919794e-05, "loss": 0.7192, "step": 3455 }, { "epoch": 5.704863973619126, "grad_norm": 0.4765625, "learning_rate": 9.269111657241548e-05, "loss": 0.7151, "step": 3460 }, { "epoch": 5.713107996702391, "grad_norm": 0.62109375, "learning_rate": 9.240390945181543e-05, "loss": 0.7171, "step": 3465 }, { "epoch": 5.721352019785655, "grad_norm": 0.42578125, "learning_rate": 9.211676533974131e-05, "loss": 0.7111, "step": 3470 }, { "epoch": 5.72959604286892, "grad_norm": 0.47265625, "learning_rate": 9.182968661801412e-05, "loss": 0.7111, "step": 3475 }, { "epoch": 5.737840065952184, "grad_norm": 0.408203125, "learning_rate": 9.154267566791223e-05, "loss": 0.7211, "step": 3480 }, { "epoch": 5.746084089035449, "grad_norm": 0.43359375, "learning_rate": 9.125573487015203e-05, "loss": 0.7165, "step": 3485 }, { "epoch": 5.7543281121187135, "grad_norm": 0.404296875, "learning_rate": 9.096886660486797e-05, "loss": 0.7082, "step": 3490 }, { "epoch": 5.762572135201978, "grad_norm": 0.41015625, "learning_rate": 9.068207325159284e-05, "loss": 0.7136, "step": 3495 }, { "epoch": 5.7708161582852435, "grad_norm": 0.4140625, "learning_rate": 9.039535718923804e-05, "loss": 0.714, "step": 3500 }, { "epoch": 5.779060181368508, "grad_norm": 0.41015625, "learning_rate": 9.01087207960739e-05, "loss": 0.7174, "step": 3505 }, { "epoch": 5.787304204451773, "grad_norm": 0.54296875, "learning_rate": 8.982216644970979e-05, "loss": 0.7071, "step": 3510 }, { "epoch": 5.795548227535037, "grad_norm": 0.43359375, "learning_rate": 8.953569652707459e-05, "loss": 0.7081, "step": 3515 }, { "epoch": 5.803792250618302, "grad_norm": 0.44140625, "learning_rate": 8.924931340439694e-05, "loss": 0.7124, "step": 3520 }, { "epoch": 5.812036273701566, "grad_norm": 0.41796875, "learning_rate": 8.896301945718541e-05, "loss": 0.7115, "step": 3525 }, { "epoch": 5.820280296784831, "grad_norm": 0.396484375, "learning_rate": 8.867681706020894e-05, "loss": 0.7134, "step": 3530 }, { "epoch": 5.828524319868095, "grad_norm": 0.40234375, "learning_rate": 8.839070858747697e-05, "loss": 0.7169, "step": 3535 }, { "epoch": 5.83676834295136, "grad_norm": 0.58203125, "learning_rate": 8.810469641222001e-05, "loss": 0.7154, "step": 3540 }, { "epoch": 5.845012366034625, "grad_norm": 0.51171875, "learning_rate": 8.781878290686959e-05, "loss": 0.7182, "step": 3545 }, { "epoch": 5.85325638911789, "grad_norm": 0.45703125, "learning_rate": 8.753297044303896e-05, "loss": 0.7128, "step": 3550 }, { "epoch": 5.8615004122011545, "grad_norm": 0.44140625, "learning_rate": 8.724726139150318e-05, "loss": 0.7083, "step": 3555 }, { "epoch": 5.869744435284419, "grad_norm": 0.421875, "learning_rate": 8.696165812217953e-05, "loss": 0.7175, "step": 3560 }, { "epoch": 5.877988458367684, "grad_norm": 0.404296875, "learning_rate": 8.667616300410778e-05, "loss": 0.7174, "step": 3565 }, { "epoch": 5.886232481450948, "grad_norm": 0.46875, "learning_rate": 8.639077840543077e-05, "loss": 0.7173, "step": 3570 }, { "epoch": 5.894476504534213, "grad_norm": 0.388671875, "learning_rate": 8.610550669337433e-05, "loss": 0.7147, "step": 3575 }, { "epoch": 5.902720527617477, "grad_norm": 0.39453125, "learning_rate": 8.582035023422815e-05, "loss": 0.7169, "step": 3580 }, { "epoch": 5.910964550700742, "grad_norm": 0.484375, "learning_rate": 8.553531139332582e-05, "loss": 0.7237, "step": 3585 }, { "epoch": 5.919208573784006, "grad_norm": 0.3984375, "learning_rate": 8.525039253502529e-05, "loss": 0.7134, "step": 3590 }, { "epoch": 5.927452596867271, "grad_norm": 0.443359375, "learning_rate": 8.496559602268928e-05, "loss": 0.7189, "step": 3595 }, { "epoch": 5.935696619950535, "grad_norm": 0.50390625, "learning_rate": 8.468092421866573e-05, "loss": 0.717, "step": 3600 }, { "epoch": 5.943940643033801, "grad_norm": 0.38671875, "learning_rate": 8.439637948426801e-05, "loss": 0.7094, "step": 3605 }, { "epoch": 5.952184666117065, "grad_norm": 0.40234375, "learning_rate": 8.411196417975558e-05, "loss": 0.7019, "step": 3610 }, { "epoch": 5.96042868920033, "grad_norm": 0.40625, "learning_rate": 8.382768066431425e-05, "loss": 0.7127, "step": 3615 }, { "epoch": 5.9686727122835945, "grad_norm": 0.54296875, "learning_rate": 8.354353129603668e-05, "loss": 0.7133, "step": 3620 }, { "epoch": 5.976916735366859, "grad_norm": 0.427734375, "learning_rate": 8.325951843190274e-05, "loss": 0.7182, "step": 3625 }, { "epoch": 5.985160758450124, "grad_norm": 0.40234375, "learning_rate": 8.297564442776014e-05, "loss": 0.7053, "step": 3630 }, { "epoch": 5.993404781533388, "grad_norm": 0.44140625, "learning_rate": 8.269191163830467e-05, "loss": 0.7253, "step": 3635 }, { "epoch": 6.0, "eval_loss": 2.459299325942993, "eval_runtime": 0.2463, "eval_samples_per_second": 40.595, "eval_steps_per_second": 4.059, "step": 3639 }, { "epoch": 6.001648804616653, "grad_norm": 0.408203125, "learning_rate": 8.240832241706068e-05, "loss": 0.7144, "step": 3640 }, { "epoch": 6.009892827699917, "grad_norm": 0.5625, "learning_rate": 8.212487911636184e-05, "loss": 0.7102, "step": 3645 }, { "epoch": 6.018136850783182, "grad_norm": 0.640625, "learning_rate": 8.184158408733131e-05, "loss": 0.7073, "step": 3650 }, { "epoch": 6.026380873866446, "grad_norm": 0.421875, "learning_rate": 8.155843967986236e-05, "loss": 0.6914, "step": 3655 }, { "epoch": 6.034624896949712, "grad_norm": 0.421875, "learning_rate": 8.127544824259889e-05, "loss": 0.7095, "step": 3660 }, { "epoch": 6.042868920032976, "grad_norm": 0.427734375, "learning_rate": 8.099261212291601e-05, "loss": 0.7006, "step": 3665 }, { "epoch": 6.051112943116241, "grad_norm": 0.408203125, "learning_rate": 8.070993366690029e-05, "loss": 0.6983, "step": 3670 }, { "epoch": 6.0593569661995055, "grad_norm": 0.412109375, "learning_rate": 8.042741521933071e-05, "loss": 0.7086, "step": 3675 }, { "epoch": 6.06760098928277, "grad_norm": 0.41015625, "learning_rate": 8.014505912365893e-05, "loss": 0.7039, "step": 3680 }, { "epoch": 6.075845012366035, "grad_norm": 0.435546875, "learning_rate": 7.986286772198986e-05, "loss": 0.7056, "step": 3685 }, { "epoch": 6.084089035449299, "grad_norm": 0.41015625, "learning_rate": 7.958084335506239e-05, "loss": 0.6957, "step": 3690 }, { "epoch": 6.092333058532564, "grad_norm": 0.416015625, "learning_rate": 7.929898836222983e-05, "loss": 0.7052, "step": 3695 }, { "epoch": 6.100577081615828, "grad_norm": 0.46875, "learning_rate": 7.90173050814406e-05, "loss": 0.6982, "step": 3700 }, { "epoch": 6.108821104699093, "grad_norm": 0.50390625, "learning_rate": 7.873579584921869e-05, "loss": 0.7029, "step": 3705 }, { "epoch": 6.117065127782358, "grad_norm": 0.451171875, "learning_rate": 7.84544630006445e-05, "loss": 0.7015, "step": 3710 }, { "epoch": 6.125309150865623, "grad_norm": 0.427734375, "learning_rate": 7.817330886933527e-05, "loss": 0.7073, "step": 3715 }, { "epoch": 6.133553173948887, "grad_norm": 0.416015625, "learning_rate": 7.789233578742582e-05, "loss": 0.7092, "step": 3720 }, { "epoch": 6.141797197032152, "grad_norm": 0.490234375, "learning_rate": 7.761154608554927e-05, "loss": 0.7025, "step": 3725 }, { "epoch": 6.150041220115416, "grad_norm": 0.412109375, "learning_rate": 7.733094209281756e-05, "loss": 0.7048, "step": 3730 }, { "epoch": 6.158285243198681, "grad_norm": 0.404296875, "learning_rate": 7.705052613680211e-05, "loss": 0.7029, "step": 3735 }, { "epoch": 6.1665292662819455, "grad_norm": 0.453125, "learning_rate": 7.677030054351477e-05, "loss": 0.701, "step": 3740 }, { "epoch": 6.17477328936521, "grad_norm": 0.439453125, "learning_rate": 7.649026763738827e-05, "loss": 0.7067, "step": 3745 }, { "epoch": 6.183017312448475, "grad_norm": 0.451171875, "learning_rate": 7.6210429741257e-05, "loss": 0.7055, "step": 3750 }, { "epoch": 6.191261335531739, "grad_norm": 0.423828125, "learning_rate": 7.593078917633787e-05, "loss": 0.7104, "step": 3755 }, { "epoch": 6.199505358615004, "grad_norm": 0.380859375, "learning_rate": 7.565134826221083e-05, "loss": 0.703, "step": 3760 }, { "epoch": 6.207749381698269, "grad_norm": 0.431640625, "learning_rate": 7.537210931679987e-05, "loss": 0.6998, "step": 3765 }, { "epoch": 6.215993404781534, "grad_norm": 0.427734375, "learning_rate": 7.509307465635358e-05, "loss": 0.6976, "step": 3770 }, { "epoch": 6.224237427864798, "grad_norm": 0.42578125, "learning_rate": 7.481424659542609e-05, "loss": 0.7025, "step": 3775 }, { "epoch": 6.232481450948063, "grad_norm": 0.421875, "learning_rate": 7.453562744685778e-05, "loss": 0.6971, "step": 3780 }, { "epoch": 6.240725474031327, "grad_norm": 0.3984375, "learning_rate": 7.425721952175618e-05, "loss": 0.6984, "step": 3785 }, { "epoch": 6.248969497114592, "grad_norm": 0.59765625, "learning_rate": 7.39790251294767e-05, "loss": 0.7012, "step": 3790 }, { "epoch": 6.2572135201978565, "grad_norm": 0.466796875, "learning_rate": 7.370104657760361e-05, "loss": 0.7012, "step": 3795 }, { "epoch": 6.265457543281121, "grad_norm": 0.439453125, "learning_rate": 7.342328617193067e-05, "loss": 0.7069, "step": 3800 }, { "epoch": 6.273701566364386, "grad_norm": 0.408203125, "learning_rate": 7.314574621644225e-05, "loss": 0.6998, "step": 3805 }, { "epoch": 6.28194558944765, "grad_norm": 0.427734375, "learning_rate": 7.286842901329412e-05, "loss": 0.695, "step": 3810 }, { "epoch": 6.290189612530915, "grad_norm": 0.421875, "learning_rate": 7.259133686279429e-05, "loss": 0.7045, "step": 3815 }, { "epoch": 6.29843363561418, "grad_norm": 0.478515625, "learning_rate": 7.231447206338407e-05, "loss": 0.7062, "step": 3820 }, { "epoch": 6.306677658697445, "grad_norm": 0.408203125, "learning_rate": 7.203783691161883e-05, "loss": 0.6975, "step": 3825 }, { "epoch": 6.314921681780709, "grad_norm": 0.41796875, "learning_rate": 7.176143370214914e-05, "loss": 0.7035, "step": 3830 }, { "epoch": 6.323165704863974, "grad_norm": 0.46484375, "learning_rate": 7.148526472770154e-05, "loss": 0.7071, "step": 3835 }, { "epoch": 6.331409727947238, "grad_norm": 0.49609375, "learning_rate": 7.12093322790597e-05, "loss": 0.7022, "step": 3840 }, { "epoch": 6.339653751030503, "grad_norm": 0.515625, "learning_rate": 7.09336386450453e-05, "loss": 0.7104, "step": 3845 }, { "epoch": 6.347897774113767, "grad_norm": 0.423828125, "learning_rate": 7.065818611249915e-05, "loss": 0.7028, "step": 3850 }, { "epoch": 6.356141797197032, "grad_norm": 0.43359375, "learning_rate": 7.038297696626206e-05, "loss": 0.7049, "step": 3855 }, { "epoch": 6.3643858202802965, "grad_norm": 0.421875, "learning_rate": 7.010801348915608e-05, "loss": 0.7074, "step": 3860 }, { "epoch": 6.372629843363561, "grad_norm": 0.423828125, "learning_rate": 6.983329796196534e-05, "loss": 0.7001, "step": 3865 }, { "epoch": 6.380873866446826, "grad_norm": 0.5703125, "learning_rate": 6.955883266341741e-05, "loss": 0.7006, "step": 3870 }, { "epoch": 6.389117889530091, "grad_norm": 0.4140625, "learning_rate": 6.928461987016413e-05, "loss": 0.7113, "step": 3875 }, { "epoch": 6.397361912613356, "grad_norm": 0.416015625, "learning_rate": 6.901066185676295e-05, "loss": 0.6964, "step": 3880 }, { "epoch": 6.40560593569662, "grad_norm": 0.42578125, "learning_rate": 6.873696089565786e-05, "loss": 0.7086, "step": 3885 }, { "epoch": 6.413849958779885, "grad_norm": 0.5078125, "learning_rate": 6.846351925716068e-05, "loss": 0.698, "step": 3890 }, { "epoch": 6.422093981863149, "grad_norm": 0.443359375, "learning_rate": 6.819033920943219e-05, "loss": 0.6997, "step": 3895 }, { "epoch": 6.430338004946414, "grad_norm": 0.4765625, "learning_rate": 6.791742301846326e-05, "loss": 0.7031, "step": 3900 }, { "epoch": 6.438582028029678, "grad_norm": 0.427734375, "learning_rate": 6.764477294805615e-05, "loss": 0.7026, "step": 3905 }, { "epoch": 6.446826051112943, "grad_norm": 0.458984375, "learning_rate": 6.737239125980573e-05, "loss": 0.7006, "step": 3910 }, { "epoch": 6.4550700741962075, "grad_norm": 0.412109375, "learning_rate": 6.710028021308061e-05, "loss": 0.6971, "step": 3915 }, { "epoch": 6.463314097279472, "grad_norm": 0.400390625, "learning_rate": 6.682844206500445e-05, "loss": 0.7028, "step": 3920 }, { "epoch": 6.471558120362737, "grad_norm": 0.392578125, "learning_rate": 6.655687907043734e-05, "loss": 0.7053, "step": 3925 }, { "epoch": 6.479802143446002, "grad_norm": 0.4296875, "learning_rate": 6.62855934819569e-05, "loss": 0.6995, "step": 3930 }, { "epoch": 6.488046166529267, "grad_norm": 0.40234375, "learning_rate": 6.601458754983978e-05, "loss": 0.6971, "step": 3935 }, { "epoch": 6.496290189612531, "grad_norm": 0.462890625, "learning_rate": 6.574386352204289e-05, "loss": 0.7029, "step": 3940 }, { "epoch": 6.504534212695796, "grad_norm": 0.408203125, "learning_rate": 6.547342364418481e-05, "loss": 0.7011, "step": 3945 }, { "epoch": 6.51277823577906, "grad_norm": 0.4296875, "learning_rate": 6.520327015952713e-05, "loss": 0.699, "step": 3950 }, { "epoch": 6.521022258862325, "grad_norm": 0.431640625, "learning_rate": 6.493340530895583e-05, "loss": 0.6987, "step": 3955 }, { "epoch": 6.529266281945589, "grad_norm": 0.3828125, "learning_rate": 6.466383133096267e-05, "loss": 0.7095, "step": 3960 }, { "epoch": 6.537510305028854, "grad_norm": 0.443359375, "learning_rate": 6.439455046162677e-05, "loss": 0.704, "step": 3965 }, { "epoch": 6.545754328112118, "grad_norm": 0.474609375, "learning_rate": 6.412556493459581e-05, "loss": 0.7127, "step": 3970 }, { "epoch": 6.553998351195383, "grad_norm": 0.431640625, "learning_rate": 6.385687698106781e-05, "loss": 0.7019, "step": 3975 }, { "epoch": 6.562242374278648, "grad_norm": 0.3984375, "learning_rate": 6.358848882977233e-05, "loss": 0.702, "step": 3980 }, { "epoch": 6.570486397361913, "grad_norm": 0.443359375, "learning_rate": 6.332040270695219e-05, "loss": 0.7086, "step": 3985 }, { "epoch": 6.5787304204451775, "grad_norm": 0.408203125, "learning_rate": 6.305262083634488e-05, "loss": 0.7086, "step": 3990 }, { "epoch": 6.586974443528442, "grad_norm": 0.443359375, "learning_rate": 6.278514543916415e-05, "loss": 0.7087, "step": 3995 }, { "epoch": 6.595218466611707, "grad_norm": 0.41015625, "learning_rate": 6.251797873408161e-05, "loss": 0.6976, "step": 4000 }, { "epoch": 6.603462489694971, "grad_norm": 0.478515625, "learning_rate": 6.225112293720836e-05, "loss": 0.6968, "step": 4005 }, { "epoch": 6.611706512778236, "grad_norm": 0.41796875, "learning_rate": 6.198458026207652e-05, "loss": 0.7039, "step": 4010 }, { "epoch": 6.6199505358615, "grad_norm": 0.40234375, "learning_rate": 6.171835291962088e-05, "loss": 0.702, "step": 4015 }, { "epoch": 6.628194558944765, "grad_norm": 0.4296875, "learning_rate": 6.145244311816063e-05, "loss": 0.7004, "step": 4020 }, { "epoch": 6.636438582028029, "grad_norm": 0.5, "learning_rate": 6.1186853063381e-05, "loss": 0.6988, "step": 4025 }, { "epoch": 6.644682605111294, "grad_norm": 0.427734375, "learning_rate": 6.092158495831486e-05, "loss": 0.7019, "step": 4030 }, { "epoch": 6.6529266281945585, "grad_norm": 0.4609375, "learning_rate": 6.065664100332478e-05, "loss": 0.7082, "step": 4035 }, { "epoch": 6.661170651277824, "grad_norm": 0.4375, "learning_rate": 6.039202339608432e-05, "loss": 0.7008, "step": 4040 }, { "epoch": 6.6694146743610885, "grad_norm": 0.44921875, "learning_rate": 6.012773433156017e-05, "loss": 0.7022, "step": 4045 }, { "epoch": 6.677658697444353, "grad_norm": 0.404296875, "learning_rate": 5.986377600199371e-05, "loss": 0.6986, "step": 4050 }, { "epoch": 6.685902720527618, "grad_norm": 0.42578125, "learning_rate": 5.9600150596883066e-05, "loss": 0.6989, "step": 4055 }, { "epoch": 6.694146743610882, "grad_norm": 0.43359375, "learning_rate": 5.933686030296459e-05, "loss": 0.6993, "step": 4060 }, { "epoch": 6.702390766694147, "grad_norm": 0.419921875, "learning_rate": 5.907390730419507e-05, "loss": 0.6977, "step": 4065 }, { "epoch": 6.710634789777411, "grad_norm": 0.41015625, "learning_rate": 5.881129378173347e-05, "loss": 0.7019, "step": 4070 }, { "epoch": 6.718878812860676, "grad_norm": 0.40625, "learning_rate": 5.854902191392284e-05, "loss": 0.6936, "step": 4075 }, { "epoch": 6.72712283594394, "grad_norm": 0.494140625, "learning_rate": 5.828709387627218e-05, "loss": 0.7002, "step": 4080 }, { "epoch": 6.735366859027205, "grad_norm": 0.435546875, "learning_rate": 5.802551184143865e-05, "loss": 0.7026, "step": 4085 }, { "epoch": 6.74361088211047, "grad_norm": 0.404296875, "learning_rate": 5.7764277979209094e-05, "loss": 0.7151, "step": 4090 }, { "epoch": 6.751854905193735, "grad_norm": 0.416015625, "learning_rate": 5.750339445648252e-05, "loss": 0.7055, "step": 4095 }, { "epoch": 6.760098928276999, "grad_norm": 0.4140625, "learning_rate": 5.724286343725185e-05, "loss": 0.7032, "step": 4100 }, { "epoch": 6.768342951360264, "grad_norm": 0.421875, "learning_rate": 5.6982687082585994e-05, "loss": 0.7008, "step": 4105 }, { "epoch": 6.7765869744435285, "grad_norm": 0.400390625, "learning_rate": 5.6722867550612116e-05, "loss": 0.6998, "step": 4110 }, { "epoch": 6.784830997526793, "grad_norm": 0.404296875, "learning_rate": 5.6463406996497456e-05, "loss": 0.6961, "step": 4115 }, { "epoch": 6.793075020610058, "grad_norm": 0.412109375, "learning_rate": 5.620430757243156e-05, "loss": 0.6963, "step": 4120 }, { "epoch": 6.801319043693322, "grad_norm": 0.40234375, "learning_rate": 5.5945571427608526e-05, "loss": 0.7083, "step": 4125 }, { "epoch": 6.809563066776587, "grad_norm": 0.419921875, "learning_rate": 5.5687200708209076e-05, "loss": 0.704, "step": 4130 }, { "epoch": 6.817807089859851, "grad_norm": 0.4140625, "learning_rate": 5.542919755738275e-05, "loss": 0.7061, "step": 4135 }, { "epoch": 6.826051112943116, "grad_norm": 0.478515625, "learning_rate": 5.5171564115230254e-05, "loss": 0.7037, "step": 4140 }, { "epoch": 6.83429513602638, "grad_norm": 0.408203125, "learning_rate": 5.491430251878551e-05, "loss": 0.715, "step": 4145 }, { "epoch": 6.842539159109646, "grad_norm": 0.421875, "learning_rate": 5.4657414901998095e-05, "loss": 0.7023, "step": 4150 }, { "epoch": 6.85078318219291, "grad_norm": 0.4453125, "learning_rate": 5.4400903395715366e-05, "loss": 0.6967, "step": 4155 }, { "epoch": 6.859027205276175, "grad_norm": 0.40625, "learning_rate": 5.4144770127665024e-05, "loss": 0.7073, "step": 4160 }, { "epoch": 6.8672712283594395, "grad_norm": 0.43359375, "learning_rate": 5.388901722243724e-05, "loss": 0.6954, "step": 4165 }, { "epoch": 6.875515251442704, "grad_norm": 0.400390625, "learning_rate": 5.363364680146725e-05, "loss": 0.7044, "step": 4170 }, { "epoch": 6.883759274525969, "grad_norm": 0.412109375, "learning_rate": 5.3378660983017536e-05, "loss": 0.7045, "step": 4175 }, { "epoch": 6.892003297609233, "grad_norm": 0.404296875, "learning_rate": 5.31240618821604e-05, "loss": 0.7029, "step": 4180 }, { "epoch": 6.900247320692498, "grad_norm": 0.396484375, "learning_rate": 5.286985161076029e-05, "loss": 0.7018, "step": 4185 }, { "epoch": 6.908491343775762, "grad_norm": 0.4140625, "learning_rate": 5.2616032277456463e-05, "loss": 0.7102, "step": 4190 }, { "epoch": 6.916735366859028, "grad_norm": 0.3828125, "learning_rate": 5.236260598764535e-05, "loss": 0.7078, "step": 4195 }, { "epoch": 6.924979389942292, "grad_norm": 0.51953125, "learning_rate": 5.210957484346314e-05, "loss": 0.7055, "step": 4200 }, { "epoch": 6.933223413025557, "grad_norm": 0.423828125, "learning_rate": 5.185694094376843e-05, "loss": 0.7068, "step": 4205 }, { "epoch": 6.941467436108821, "grad_norm": 0.412109375, "learning_rate": 5.160470638412461e-05, "loss": 0.6911, "step": 4210 }, { "epoch": 6.949711459192086, "grad_norm": 0.388671875, "learning_rate": 5.135287325678271e-05, "loss": 0.7047, "step": 4215 }, { "epoch": 6.95795548227535, "grad_norm": 0.435546875, "learning_rate": 5.1101443650663764e-05, "loss": 0.6989, "step": 4220 }, { "epoch": 6.966199505358615, "grad_norm": 0.416015625, "learning_rate": 5.085041965134183e-05, "loss": 0.6975, "step": 4225 }, { "epoch": 6.9744435284418795, "grad_norm": 0.412109375, "learning_rate": 5.059980334102637e-05, "loss": 0.7055, "step": 4230 }, { "epoch": 6.982687551525144, "grad_norm": 0.39453125, "learning_rate": 5.034959679854532e-05, "loss": 0.6983, "step": 4235 }, { "epoch": 6.990931574608409, "grad_norm": 0.4296875, "learning_rate": 5.009980209932743e-05, "loss": 0.7046, "step": 4240 }, { "epoch": 6.999175597691673, "grad_norm": 0.38671875, "learning_rate": 4.985042131538545e-05, "loss": 0.7042, "step": 4245 }, { "epoch": 6.999175597691673, "eval_loss": 2.4711008071899414, "eval_runtime": 0.2631, "eval_samples_per_second": 38.011, "eval_steps_per_second": 3.801, "step": 4245 }, { "epoch": 7.007419620774938, "grad_norm": 0.49609375, "learning_rate": 4.960145651529856e-05, "loss": 0.6792, "step": 4250 }, { "epoch": 7.015663643858203, "grad_norm": 0.53125, "learning_rate": 4.9352909764195576e-05, "loss": 0.6999, "step": 4255 }, { "epoch": 7.023907666941468, "grad_norm": 0.4296875, "learning_rate": 4.9104783123737566e-05, "loss": 0.6999, "step": 4260 }, { "epoch": 7.032151690024732, "grad_norm": 0.3984375, "learning_rate": 4.885707865210093e-05, "loss": 0.7018, "step": 4265 }, { "epoch": 7.040395713107997, "grad_norm": 0.400390625, "learning_rate": 4.860979840396016e-05, "loss": 0.6912, "step": 4270 }, { "epoch": 7.048639736191261, "grad_norm": 0.45703125, "learning_rate": 4.836294443047088e-05, "loss": 0.6945, "step": 4275 }, { "epoch": 7.056883759274526, "grad_norm": 0.44921875, "learning_rate": 4.8116518779252885e-05, "loss": 0.6905, "step": 4280 }, { "epoch": 7.0651277823577905, "grad_norm": 0.39453125, "learning_rate": 4.787052349437295e-05, "loss": 0.691, "step": 4285 }, { "epoch": 7.073371805441055, "grad_norm": 0.408203125, "learning_rate": 4.762496061632814e-05, "loss": 0.6843, "step": 4290 }, { "epoch": 7.08161582852432, "grad_norm": 0.388671875, "learning_rate": 4.7379832182028814e-05, "loss": 0.6951, "step": 4295 }, { "epoch": 7.089859851607584, "grad_norm": 0.408203125, "learning_rate": 4.713514022478155e-05, "loss": 0.6893, "step": 4300 }, { "epoch": 7.09810387469085, "grad_norm": 0.4453125, "learning_rate": 4.689088677427249e-05, "loss": 0.6952, "step": 4305 }, { "epoch": 7.106347897774114, "grad_norm": 0.427734375, "learning_rate": 4.6647073856550415e-05, "loss": 0.6958, "step": 4310 }, { "epoch": 7.114591920857379, "grad_norm": 0.4140625, "learning_rate": 4.6403703494009875e-05, "loss": 0.6946, "step": 4315 }, { "epoch": 7.122835943940643, "grad_norm": 0.427734375, "learning_rate": 4.6160777705374524e-05, "loss": 0.6996, "step": 4320 }, { "epoch": 7.131079967023908, "grad_norm": 0.408203125, "learning_rate": 4.591829850568046e-05, "loss": 0.6969, "step": 4325 }, { "epoch": 7.139323990107172, "grad_norm": 0.439453125, "learning_rate": 4.567626790625921e-05, "loss": 0.6868, "step": 4330 }, { "epoch": 7.147568013190437, "grad_norm": 0.4609375, "learning_rate": 4.543468791472131e-05, "loss": 0.69, "step": 4335 }, { "epoch": 7.155812036273701, "grad_norm": 0.40625, "learning_rate": 4.519356053493958e-05, "loss": 0.6979, "step": 4340 }, { "epoch": 7.164056059356966, "grad_norm": 0.408203125, "learning_rate": 4.495288776703241e-05, "loss": 0.7022, "step": 4345 }, { "epoch": 7.1723000824402305, "grad_norm": 0.41015625, "learning_rate": 4.471267160734731e-05, "loss": 0.6874, "step": 4350 }, { "epoch": 7.180544105523495, "grad_norm": 0.40625, "learning_rate": 4.447291404844424e-05, "loss": 0.6982, "step": 4355 }, { "epoch": 7.18878812860676, "grad_norm": 0.404296875, "learning_rate": 4.4233617079079236e-05, "loss": 0.7015, "step": 4360 }, { "epoch": 7.197032151690025, "grad_norm": 0.39453125, "learning_rate": 4.399478268418771e-05, "loss": 0.6919, "step": 4365 }, { "epoch": 7.20527617477329, "grad_norm": 0.412109375, "learning_rate": 4.375641284486808e-05, "loss": 0.6867, "step": 4370 }, { "epoch": 7.213520197856554, "grad_norm": 0.3984375, "learning_rate": 4.3518509538365425e-05, "loss": 0.6929, "step": 4375 }, { "epoch": 7.221764220939819, "grad_norm": 0.40234375, "learning_rate": 4.328107473805487e-05, "loss": 0.7013, "step": 4380 }, { "epoch": 7.230008244023083, "grad_norm": 0.4140625, "learning_rate": 4.3044110413425395e-05, "loss": 0.6879, "step": 4385 }, { "epoch": 7.238252267106348, "grad_norm": 0.439453125, "learning_rate": 4.2807618530063565e-05, "loss": 0.6918, "step": 4390 }, { "epoch": 7.246496290189612, "grad_norm": 0.39453125, "learning_rate": 4.257160104963696e-05, "loss": 0.6965, "step": 4395 }, { "epoch": 7.254740313272877, "grad_norm": 0.41015625, "learning_rate": 4.23360599298781e-05, "loss": 0.6963, "step": 4400 }, { "epoch": 7.2629843363561415, "grad_norm": 0.408203125, "learning_rate": 4.210099712456822e-05, "loss": 0.69, "step": 4405 }, { "epoch": 7.271228359439406, "grad_norm": 0.396484375, "learning_rate": 4.1866414583520877e-05, "loss": 0.6955, "step": 4410 }, { "epoch": 7.2794723825226715, "grad_norm": 0.408203125, "learning_rate": 4.163231425256595e-05, "loss": 0.6888, "step": 4415 }, { "epoch": 7.287716405605936, "grad_norm": 0.408203125, "learning_rate": 4.139869807353357e-05, "loss": 0.6998, "step": 4420 }, { "epoch": 7.295960428689201, "grad_norm": 0.396484375, "learning_rate": 4.1165567984237764e-05, "loss": 0.6963, "step": 4425 }, { "epoch": 7.304204451772465, "grad_norm": 0.38671875, "learning_rate": 4.0932925918460516e-05, "loss": 0.6922, "step": 4430 }, { "epoch": 7.31244847485573, "grad_norm": 0.40625, "learning_rate": 4.070077380593579e-05, "loss": 0.6969, "step": 4435 }, { "epoch": 7.320692497938994, "grad_norm": 0.39453125, "learning_rate": 4.046911357233343e-05, "loss": 0.6893, "step": 4440 }, { "epoch": 7.328936521022259, "grad_norm": 0.412109375, "learning_rate": 4.02379471392431e-05, "loss": 0.6902, "step": 4445 }, { "epoch": 7.337180544105523, "grad_norm": 0.419921875, "learning_rate": 4.000727642415867e-05, "loss": 0.7053, "step": 4450 }, { "epoch": 7.345424567188788, "grad_norm": 0.4140625, "learning_rate": 3.977710334046193e-05, "loss": 0.6942, "step": 4455 }, { "epoch": 7.353668590272052, "grad_norm": 0.404296875, "learning_rate": 3.954742979740695e-05, "loss": 0.7078, "step": 4460 }, { "epoch": 7.361912613355317, "grad_norm": 0.4375, "learning_rate": 3.9318257700104174e-05, "loss": 0.6932, "step": 4465 }, { "epoch": 7.370156636438582, "grad_norm": 0.470703125, "learning_rate": 3.9089588949504655e-05, "loss": 0.6955, "step": 4470 }, { "epoch": 7.378400659521847, "grad_norm": 0.478515625, "learning_rate": 3.8861425442384135e-05, "loss": 0.6969, "step": 4475 }, { "epoch": 7.3866446826051115, "grad_norm": 0.4140625, "learning_rate": 3.863376907132752e-05, "loss": 0.6949, "step": 4480 }, { "epoch": 7.394888705688376, "grad_norm": 0.396484375, "learning_rate": 3.840662172471315e-05, "loss": 0.7005, "step": 4485 }, { "epoch": 7.403132728771641, "grad_norm": 0.453125, "learning_rate": 3.8179985286696986e-05, "loss": 0.6935, "step": 4490 }, { "epoch": 7.411376751854905, "grad_norm": 0.40234375, "learning_rate": 3.7953861637197085e-05, "loss": 0.6923, "step": 4495 }, { "epoch": 7.41962077493817, "grad_norm": 0.390625, "learning_rate": 3.772825265187802e-05, "loss": 0.6923, "step": 4500 }, { "epoch": 7.427864798021434, "grad_norm": 0.421875, "learning_rate": 3.75031602021353e-05, "loss": 0.6979, "step": 4505 }, { "epoch": 7.436108821104699, "grad_norm": 0.390625, "learning_rate": 3.727858615507974e-05, "loss": 0.6977, "step": 4510 }, { "epoch": 7.444352844187963, "grad_norm": 0.41015625, "learning_rate": 3.705453237352227e-05, "loss": 0.7043, "step": 4515 }, { "epoch": 7.452596867271229, "grad_norm": 0.404296875, "learning_rate": 3.683100071595813e-05, "loss": 0.6956, "step": 4520 }, { "epoch": 7.460840890354493, "grad_norm": 0.40234375, "learning_rate": 3.660799303655166e-05, "loss": 0.6974, "step": 4525 }, { "epoch": 7.469084913437758, "grad_norm": 0.43359375, "learning_rate": 3.638551118512089e-05, "loss": 0.7013, "step": 4530 }, { "epoch": 7.4773289365210225, "grad_norm": 0.435546875, "learning_rate": 3.616355700712221e-05, "loss": 0.6966, "step": 4535 }, { "epoch": 7.485572959604287, "grad_norm": 0.400390625, "learning_rate": 3.594213234363486e-05, "loss": 0.6964, "step": 4540 }, { "epoch": 7.493816982687552, "grad_norm": 0.412109375, "learning_rate": 3.5721239031346066e-05, "loss": 0.6922, "step": 4545 }, { "epoch": 7.502061005770816, "grad_norm": 0.419921875, "learning_rate": 3.550087890253544e-05, "loss": 0.6948, "step": 4550 }, { "epoch": 7.510305028854081, "grad_norm": 0.423828125, "learning_rate": 3.5281053785059925e-05, "loss": 0.695, "step": 4555 }, { "epoch": 7.518549051937345, "grad_norm": 0.39453125, "learning_rate": 3.506176550233863e-05, "loss": 0.6949, "step": 4560 }, { "epoch": 7.52679307502061, "grad_norm": 0.44921875, "learning_rate": 3.484301587333772e-05, "loss": 0.6903, "step": 4565 }, { "epoch": 7.535037098103874, "grad_norm": 0.404296875, "learning_rate": 3.462480671255515e-05, "loss": 0.6983, "step": 4570 }, { "epoch": 7.543281121187139, "grad_norm": 0.416015625, "learning_rate": 3.440713983000601e-05, "loss": 0.6964, "step": 4575 }, { "epoch": 7.551525144270404, "grad_norm": 0.412109375, "learning_rate": 3.419001703120709e-05, "loss": 0.6934, "step": 4580 }, { "epoch": 7.559769167353669, "grad_norm": 0.392578125, "learning_rate": 3.397344011716216e-05, "loss": 0.7035, "step": 4585 }, { "epoch": 7.568013190436933, "grad_norm": 0.40625, "learning_rate": 3.3757410884346894e-05, "loss": 0.6827, "step": 4590 }, { "epoch": 7.576257213520198, "grad_norm": 0.39453125, "learning_rate": 3.354193112469407e-05, "loss": 0.6979, "step": 4595 }, { "epoch": 7.5845012366034625, "grad_norm": 0.419921875, "learning_rate": 3.332700262557864e-05, "loss": 0.7002, "step": 4600 }, { "epoch": 7.592745259686727, "grad_norm": 0.419921875, "learning_rate": 3.3112627169802946e-05, "loss": 0.6996, "step": 4605 }, { "epoch": 7.600989282769992, "grad_norm": 0.4140625, "learning_rate": 3.289880653558188e-05, "loss": 0.6942, "step": 4610 }, { "epoch": 7.609233305853256, "grad_norm": 0.4296875, "learning_rate": 3.2685542496528185e-05, "loss": 0.7002, "step": 4615 }, { "epoch": 7.617477328936521, "grad_norm": 0.38671875, "learning_rate": 3.2472836821637744e-05, "loss": 0.6953, "step": 4620 }, { "epoch": 7.625721352019785, "grad_norm": 0.40234375, "learning_rate": 3.2260691275274835e-05, "loss": 0.7001, "step": 4625 }, { "epoch": 7.633965375103051, "grad_norm": 0.4140625, "learning_rate": 3.204910761715763e-05, "loss": 0.6935, "step": 4630 }, { "epoch": 7.642209398186315, "grad_norm": 0.40625, "learning_rate": 3.1838087602343344e-05, "loss": 0.6973, "step": 4635 }, { "epoch": 7.65045342126958, "grad_norm": 0.392578125, "learning_rate": 3.162763298121408e-05, "loss": 0.6962, "step": 4640 }, { "epoch": 7.658697444352844, "grad_norm": 0.396484375, "learning_rate": 3.1417745499461934e-05, "loss": 0.6986, "step": 4645 }, { "epoch": 7.666941467436109, "grad_norm": 0.40625, "learning_rate": 3.120842689807468e-05, "loss": 0.7008, "step": 4650 }, { "epoch": 7.6751854905193735, "grad_norm": 0.396484375, "learning_rate": 3.099967891332132e-05, "loss": 0.698, "step": 4655 }, { "epoch": 7.683429513602638, "grad_norm": 0.400390625, "learning_rate": 3.079150327673766e-05, "loss": 0.6996, "step": 4660 }, { "epoch": 7.691673536685903, "grad_norm": 0.458984375, "learning_rate": 3.058390171511196e-05, "loss": 0.6973, "step": 4665 }, { "epoch": 7.699917559769167, "grad_norm": 0.419921875, "learning_rate": 3.0376875950470617e-05, "loss": 0.6972, "step": 4670 }, { "epoch": 7.708161582852432, "grad_norm": 0.447265625, "learning_rate": 3.0170427700063873e-05, "loss": 0.6962, "step": 4675 }, { "epoch": 7.716405605935696, "grad_norm": 0.4140625, "learning_rate": 2.996455867635155e-05, "loss": 0.7006, "step": 4680 }, { "epoch": 7.724649629018961, "grad_norm": 0.408203125, "learning_rate": 2.9759270586988865e-05, "loss": 0.7017, "step": 4685 }, { "epoch": 7.732893652102226, "grad_norm": 0.427734375, "learning_rate": 2.9554565134812294e-05, "loss": 0.7051, "step": 4690 }, { "epoch": 7.741137675185491, "grad_norm": 0.392578125, "learning_rate": 2.9350444017825385e-05, "loss": 0.6909, "step": 4695 }, { "epoch": 7.749381698268755, "grad_norm": 0.400390625, "learning_rate": 2.9146908929184713e-05, "loss": 0.6939, "step": 4700 }, { "epoch": 7.75762572135202, "grad_norm": 0.435546875, "learning_rate": 2.894396155718585e-05, "loss": 0.6956, "step": 4705 }, { "epoch": 7.765869744435284, "grad_norm": 0.419921875, "learning_rate": 2.874160358524931e-05, "loss": 0.6962, "step": 4710 }, { "epoch": 7.774113767518549, "grad_norm": 0.451171875, "learning_rate": 2.853983669190664e-05, "loss": 0.6911, "step": 4715 }, { "epoch": 7.7823577906018135, "grad_norm": 0.423828125, "learning_rate": 2.8338662550786443e-05, "loss": 0.6954, "step": 4720 }, { "epoch": 7.790601813685078, "grad_norm": 0.404296875, "learning_rate": 2.8138082830600554e-05, "loss": 0.694, "step": 4725 }, { "epoch": 7.798845836768343, "grad_norm": 0.40625, "learning_rate": 2.7938099195130153e-05, "loss": 0.6935, "step": 4730 }, { "epoch": 7.807089859851608, "grad_norm": 0.416015625, "learning_rate": 2.7738713303211982e-05, "loss": 0.6885, "step": 4735 }, { "epoch": 7.815333882934873, "grad_norm": 0.4375, "learning_rate": 2.753992680872457e-05, "loss": 0.7002, "step": 4740 }, { "epoch": 7.823577906018137, "grad_norm": 0.3984375, "learning_rate": 2.7341741360574548e-05, "loss": 0.6928, "step": 4745 }, { "epoch": 7.831821929101402, "grad_norm": 0.466796875, "learning_rate": 2.7144158602682924e-05, "loss": 0.6959, "step": 4750 }, { "epoch": 7.840065952184666, "grad_norm": 0.40625, "learning_rate": 2.6947180173971508e-05, "loss": 0.6907, "step": 4755 }, { "epoch": 7.848309975267931, "grad_norm": 0.435546875, "learning_rate": 2.6750807708349267e-05, "loss": 0.6982, "step": 4760 }, { "epoch": 7.856553998351195, "grad_norm": 0.40234375, "learning_rate": 2.6555042834698773e-05, "loss": 0.6945, "step": 4765 }, { "epoch": 7.86479802143446, "grad_norm": 0.427734375, "learning_rate": 2.6359887176862718e-05, "loss": 0.695, "step": 4770 }, { "epoch": 7.8730420445177245, "grad_norm": 0.396484375, "learning_rate": 2.6165342353630428e-05, "loss": 0.694, "step": 4775 }, { "epoch": 7.881286067600989, "grad_norm": 0.412109375, "learning_rate": 2.5971409978724458e-05, "loss": 0.6986, "step": 4780 }, { "epoch": 7.889530090684254, "grad_norm": 0.419921875, "learning_rate": 2.577809166078716e-05, "loss": 0.6935, "step": 4785 }, { "epoch": 7.897774113767518, "grad_norm": 0.421875, "learning_rate": 2.558538900336741e-05, "loss": 0.6991, "step": 4790 }, { "epoch": 7.906018136850783, "grad_norm": 0.384765625, "learning_rate": 2.5393303604907205e-05, "loss": 0.6974, "step": 4795 }, { "epoch": 7.914262159934048, "grad_norm": 0.396484375, "learning_rate": 2.5201837058728505e-05, "loss": 0.6956, "step": 4800 }, { "epoch": 7.922506183017313, "grad_norm": 0.40625, "learning_rate": 2.5010990953019975e-05, "loss": 0.6927, "step": 4805 }, { "epoch": 7.930750206100577, "grad_norm": 0.3984375, "learning_rate": 2.4820766870823807e-05, "loss": 0.688, "step": 4810 }, { "epoch": 7.938994229183842, "grad_norm": 0.40625, "learning_rate": 2.4631166390022574e-05, "loss": 0.695, "step": 4815 }, { "epoch": 7.947238252267106, "grad_norm": 0.4296875, "learning_rate": 2.4442191083326195e-05, "loss": 0.7014, "step": 4820 }, { "epoch": 7.955482275350371, "grad_norm": 0.396484375, "learning_rate": 2.425384251825882e-05, "loss": 0.6955, "step": 4825 }, { "epoch": 7.963726298433635, "grad_norm": 0.42578125, "learning_rate": 2.4066122257145894e-05, "loss": 0.6934, "step": 4830 }, { "epoch": 7.9719703215169, "grad_norm": 0.388671875, "learning_rate": 2.387903185710115e-05, "loss": 0.6909, "step": 4835 }, { "epoch": 7.9802143446001645, "grad_norm": 0.419921875, "learning_rate": 2.3692572870013718e-05, "loss": 0.691, "step": 4840 }, { "epoch": 7.98845836768343, "grad_norm": 0.41796875, "learning_rate": 2.3506746842535242e-05, "loss": 0.6929, "step": 4845 }, { "epoch": 7.9967023907666945, "grad_norm": 0.40625, "learning_rate": 2.3321555316067045e-05, "loss": 0.6928, "step": 4850 }, { "epoch": 8.0, "eval_loss": 2.471337080001831, "eval_runtime": 0.2361, "eval_samples_per_second": 42.357, "eval_steps_per_second": 4.236, "step": 4852 }, { "epoch": 8.004946413849959, "grad_norm": 0.423828125, "learning_rate": 2.313699982674736e-05, "loss": 0.6913, "step": 4855 }, { "epoch": 8.013190436933224, "grad_norm": 0.427734375, "learning_rate": 2.295308190543859e-05, "loss": 0.6943, "step": 4860 }, { "epoch": 8.021434460016488, "grad_norm": 0.453125, "learning_rate": 2.276980307771458e-05, "loss": 0.6958, "step": 4865 }, { "epoch": 8.029678483099753, "grad_norm": 0.3984375, "learning_rate": 2.2587164863847975e-05, "loss": 0.6957, "step": 4870 }, { "epoch": 8.037922506183017, "grad_norm": 0.392578125, "learning_rate": 2.2405168778797646e-05, "loss": 0.6914, "step": 4875 }, { "epoch": 8.046166529266282, "grad_norm": 0.408203125, "learning_rate": 2.222381633219608e-05, "loss": 0.6904, "step": 4880 }, { "epoch": 8.054410552349546, "grad_norm": 0.435546875, "learning_rate": 2.204310902833685e-05, "loss": 0.6921, "step": 4885 }, { "epoch": 8.062654575432811, "grad_norm": 0.435546875, "learning_rate": 2.1863048366162208e-05, "loss": 0.6926, "step": 4890 }, { "epoch": 8.070898598516075, "grad_norm": 0.380859375, "learning_rate": 2.1683635839250537e-05, "loss": 0.6938, "step": 4895 }, { "epoch": 8.07914262159934, "grad_norm": 0.408203125, "learning_rate": 2.15048729358041e-05, "loss": 0.6936, "step": 4900 }, { "epoch": 8.087386644682605, "grad_norm": 0.392578125, "learning_rate": 2.1326761138636553e-05, "loss": 0.6959, "step": 4905 }, { "epoch": 8.09563066776587, "grad_norm": 0.400390625, "learning_rate": 2.114930192516076e-05, "loss": 0.6883, "step": 4910 }, { "epoch": 8.103874690849134, "grad_norm": 0.404296875, "learning_rate": 2.097249676737648e-05, "loss": 0.6989, "step": 4915 }, { "epoch": 8.112118713932398, "grad_norm": 0.44921875, "learning_rate": 2.0796347131858186e-05, "loss": 0.6915, "step": 4920 }, { "epoch": 8.120362737015663, "grad_norm": 0.408203125, "learning_rate": 2.0620854479742834e-05, "loss": 0.6893, "step": 4925 }, { "epoch": 8.12860676009893, "grad_norm": 0.3984375, "learning_rate": 2.044602026671786e-05, "loss": 0.699, "step": 4930 }, { "epoch": 8.136850783182194, "grad_norm": 0.3984375, "learning_rate": 2.027184594300898e-05, "loss": 0.6962, "step": 4935 }, { "epoch": 8.145094806265458, "grad_norm": 0.40625, "learning_rate": 2.0098332953368272e-05, "loss": 0.6869, "step": 4940 }, { "epoch": 8.153338829348723, "grad_norm": 0.40625, "learning_rate": 1.9925482737062085e-05, "loss": 0.6957, "step": 4945 }, { "epoch": 8.161582852431987, "grad_norm": 0.40234375, "learning_rate": 1.9753296727859195e-05, "loss": 0.692, "step": 4950 }, { "epoch": 8.169826875515252, "grad_norm": 0.39453125, "learning_rate": 1.9581776354018854e-05, "loss": 0.6985, "step": 4955 }, { "epoch": 8.178070898598516, "grad_norm": 0.41015625, "learning_rate": 1.941092303827896e-05, "loss": 0.6876, "step": 4960 }, { "epoch": 8.186314921681781, "grad_norm": 0.392578125, "learning_rate": 1.9240738197844278e-05, "loss": 0.6863, "step": 4965 }, { "epoch": 8.194558944765046, "grad_norm": 0.40234375, "learning_rate": 1.9071223244374614e-05, "loss": 0.694, "step": 4970 }, { "epoch": 8.20280296784831, "grad_norm": 0.39453125, "learning_rate": 1.8902379583973208e-05, "loss": 0.6936, "step": 4975 }, { "epoch": 8.211046990931575, "grad_norm": 0.404296875, "learning_rate": 1.8734208617174988e-05, "loss": 0.6926, "step": 4980 }, { "epoch": 8.21929101401484, "grad_norm": 0.3984375, "learning_rate": 1.856671173893497e-05, "loss": 0.6921, "step": 4985 }, { "epoch": 8.227535037098104, "grad_norm": 0.408203125, "learning_rate": 1.839989033861673e-05, "loss": 0.6893, "step": 4990 }, { "epoch": 8.235779060181368, "grad_norm": 0.4140625, "learning_rate": 1.8233745799980817e-05, "loss": 0.6931, "step": 4995 }, { "epoch": 8.244023083264633, "grad_norm": 0.408203125, "learning_rate": 1.8068279501173335e-05, "loss": 0.6842, "step": 5000 }, { "epoch": 8.252267106347897, "grad_norm": 0.400390625, "learning_rate": 1.790349281471445e-05, "loss": 0.6998, "step": 5005 }, { "epoch": 8.260511129431162, "grad_norm": 0.404296875, "learning_rate": 1.773938710748706e-05, "loss": 0.6946, "step": 5010 }, { "epoch": 8.268755152514426, "grad_norm": 0.39453125, "learning_rate": 1.757596374072543e-05, "loss": 0.6901, "step": 5015 }, { "epoch": 8.276999175597691, "grad_norm": 0.412109375, "learning_rate": 1.741322407000391e-05, "loss": 0.6938, "step": 5020 }, { "epoch": 8.285243198680956, "grad_norm": 0.416015625, "learning_rate": 1.7251169445225657e-05, "loss": 0.6922, "step": 5025 }, { "epoch": 8.29348722176422, "grad_norm": 0.392578125, "learning_rate": 1.70898012106115e-05, "loss": 0.6844, "step": 5030 }, { "epoch": 8.301731244847485, "grad_norm": 0.404296875, "learning_rate": 1.692912070468874e-05, "loss": 0.6968, "step": 5035 }, { "epoch": 8.309975267930751, "grad_norm": 0.390625, "learning_rate": 1.676912926028007e-05, "loss": 0.6977, "step": 5040 }, { "epoch": 8.318219291014016, "grad_norm": 0.40625, "learning_rate": 1.660982820449247e-05, "loss": 0.6995, "step": 5045 }, { "epoch": 8.32646331409728, "grad_norm": 0.38671875, "learning_rate": 1.6451218858706374e-05, "loss": 0.6934, "step": 5050 }, { "epoch": 8.334707337180545, "grad_norm": 0.400390625, "learning_rate": 1.6293302538564382e-05, "loss": 0.6954, "step": 5055 }, { "epoch": 8.34295136026381, "grad_norm": 0.423828125, "learning_rate": 1.6136080553960687e-05, "loss": 0.6942, "step": 5060 }, { "epoch": 8.351195383347074, "grad_norm": 0.408203125, "learning_rate": 1.5979554209030024e-05, "loss": 0.6887, "step": 5065 }, { "epoch": 8.359439406430338, "grad_norm": 0.388671875, "learning_rate": 1.5823724802136865e-05, "loss": 0.6948, "step": 5070 }, { "epoch": 8.367683429513603, "grad_norm": 0.404296875, "learning_rate": 1.5668593625864715e-05, "loss": 0.695, "step": 5075 }, { "epoch": 8.375927452596867, "grad_norm": 0.396484375, "learning_rate": 1.5514161967005337e-05, "loss": 0.7057, "step": 5080 }, { "epoch": 8.384171475680132, "grad_norm": 0.419921875, "learning_rate": 1.536043110654809e-05, "loss": 0.6906, "step": 5085 }, { "epoch": 8.392415498763397, "grad_norm": 0.4140625, "learning_rate": 1.5207402319669306e-05, "loss": 0.6909, "step": 5090 }, { "epoch": 8.400659521846661, "grad_norm": 0.40625, "learning_rate": 1.505507687572173e-05, "loss": 0.6841, "step": 5095 }, { "epoch": 8.408903544929926, "grad_norm": 0.392578125, "learning_rate": 1.4903456038223939e-05, "loss": 0.6889, "step": 5100 }, { "epoch": 8.41714756801319, "grad_norm": 0.38671875, "learning_rate": 1.4752541064849946e-05, "loss": 0.6908, "step": 5105 }, { "epoch": 8.425391591096455, "grad_norm": 0.392578125, "learning_rate": 1.4602333207418651e-05, "loss": 0.6949, "step": 5110 }, { "epoch": 8.43363561417972, "grad_norm": 0.400390625, "learning_rate": 1.4452833711883628e-05, "loss": 0.691, "step": 5115 }, { "epoch": 8.441879637262984, "grad_norm": 0.388671875, "learning_rate": 1.4304043818322565e-05, "loss": 0.6855, "step": 5120 }, { "epoch": 8.450123660346248, "grad_norm": 0.404296875, "learning_rate": 1.4155964760927176e-05, "loss": 0.6937, "step": 5125 }, { "epoch": 8.458367683429513, "grad_norm": 0.390625, "learning_rate": 1.4008597767992871e-05, "loss": 0.6922, "step": 5130 }, { "epoch": 8.466611706512778, "grad_norm": 0.423828125, "learning_rate": 1.3861944061908583e-05, "loss": 0.6929, "step": 5135 }, { "epoch": 8.474855729596042, "grad_norm": 0.39453125, "learning_rate": 1.3716004859146592e-05, "loss": 0.6898, "step": 5140 }, { "epoch": 8.483099752679308, "grad_norm": 0.3984375, "learning_rate": 1.3570781370252582e-05, "loss": 0.6851, "step": 5145 }, { "epoch": 8.491343775762573, "grad_norm": 0.416015625, "learning_rate": 1.3426274799835337e-05, "loss": 0.6846, "step": 5150 }, { "epoch": 8.499587798845837, "grad_norm": 0.412109375, "learning_rate": 1.328248634655701e-05, "loss": 0.7024, "step": 5155 }, { "epoch": 8.507831821929102, "grad_norm": 0.419921875, "learning_rate": 1.3139417203123027e-05, "loss": 0.6881, "step": 5160 }, { "epoch": 8.516075845012367, "grad_norm": 0.404296875, "learning_rate": 1.2997068556272263e-05, "loss": 0.7002, "step": 5165 }, { "epoch": 8.524319868095631, "grad_norm": 0.40234375, "learning_rate": 1.2855441586767113e-05, "loss": 0.6909, "step": 5170 }, { "epoch": 8.532563891178896, "grad_norm": 0.39453125, "learning_rate": 1.2714537469383858e-05, "loss": 0.6878, "step": 5175 }, { "epoch": 8.54080791426216, "grad_norm": 0.390625, "learning_rate": 1.2574357372902767e-05, "loss": 0.6917, "step": 5180 }, { "epoch": 8.549051937345425, "grad_norm": 0.40234375, "learning_rate": 1.243490246009842e-05, "loss": 0.689, "step": 5185 }, { "epoch": 8.55729596042869, "grad_norm": 0.41015625, "learning_rate": 1.2296173887730123e-05, "loss": 0.6859, "step": 5190 }, { "epoch": 8.565539983511954, "grad_norm": 0.392578125, "learning_rate": 1.215817280653232e-05, "loss": 0.6858, "step": 5195 }, { "epoch": 8.573784006595218, "grad_norm": 0.412109375, "learning_rate": 1.2020900361204968e-05, "loss": 0.6894, "step": 5200 }, { "epoch": 8.582028029678483, "grad_norm": 0.396484375, "learning_rate": 1.1884357690404158e-05, "loss": 0.6886, "step": 5205 }, { "epoch": 8.590272052761748, "grad_norm": 0.396484375, "learning_rate": 1.1748545926732535e-05, "loss": 0.6903, "step": 5210 }, { "epoch": 8.598516075845012, "grad_norm": 0.392578125, "learning_rate": 1.1613466196729984e-05, "loss": 0.7021, "step": 5215 }, { "epoch": 8.606760098928277, "grad_norm": 0.408203125, "learning_rate": 1.1479119620864276e-05, "loss": 0.6826, "step": 5220 }, { "epoch": 8.615004122011541, "grad_norm": 0.40234375, "learning_rate": 1.1345507313521786e-05, "loss": 0.6954, "step": 5225 }, { "epoch": 8.623248145094806, "grad_norm": 0.40625, "learning_rate": 1.1212630382998213e-05, "loss": 0.6877, "step": 5230 }, { "epoch": 8.63149216817807, "grad_norm": 0.388671875, "learning_rate": 1.1080489931489391e-05, "loss": 0.696, "step": 5235 }, { "epoch": 8.639736191261335, "grad_norm": 0.392578125, "learning_rate": 1.0949087055082252e-05, "loss": 0.6977, "step": 5240 }, { "epoch": 8.6479802143446, "grad_norm": 0.38671875, "learning_rate": 1.0818422843745512e-05, "loss": 0.6924, "step": 5245 }, { "epoch": 8.656224237427864, "grad_norm": 0.40234375, "learning_rate": 1.0688498381320855e-05, "loss": 0.6941, "step": 5250 }, { "epoch": 8.664468260511129, "grad_norm": 0.390625, "learning_rate": 1.0559314745513805e-05, "loss": 0.6878, "step": 5255 }, { "epoch": 8.672712283594395, "grad_norm": 0.41015625, "learning_rate": 1.0430873007884857e-05, "loss": 0.6975, "step": 5260 }, { "epoch": 8.68095630667766, "grad_norm": 0.40625, "learning_rate": 1.0303174233840528e-05, "loss": 0.6863, "step": 5265 }, { "epoch": 8.689200329760924, "grad_norm": 0.41015625, "learning_rate": 1.0176219482624616e-05, "loss": 0.7022, "step": 5270 }, { "epoch": 8.697444352844188, "grad_norm": 0.4140625, "learning_rate": 1.0050009807309325e-05, "loss": 0.6892, "step": 5275 }, { "epoch": 8.705688375927453, "grad_norm": 0.39453125, "learning_rate": 9.924546254786493e-06, "loss": 0.6839, "step": 5280 }, { "epoch": 8.713932399010718, "grad_norm": 0.41796875, "learning_rate": 9.799829865759069e-06, "loss": 0.6821, "step": 5285 }, { "epoch": 8.722176422093982, "grad_norm": 0.388671875, "learning_rate": 9.675861674732312e-06, "loss": 0.6885, "step": 5290 }, { "epoch": 8.730420445177247, "grad_norm": 0.421875, "learning_rate": 9.552642710005299e-06, "loss": 0.6965, "step": 5295 }, { "epoch": 8.738664468260511, "grad_norm": 0.404296875, "learning_rate": 9.430173993662451e-06, "loss": 0.6971, "step": 5300 }, { "epoch": 8.746908491343776, "grad_norm": 0.396484375, "learning_rate": 9.308456541564881e-06, "loss": 0.6847, "step": 5305 }, { "epoch": 8.75515251442704, "grad_norm": 0.404296875, "learning_rate": 9.187491363342093e-06, "loss": 0.6982, "step": 5310 }, { "epoch": 8.763396537510305, "grad_norm": 0.443359375, "learning_rate": 9.067279462383615e-06, "loss": 0.6906, "step": 5315 }, { "epoch": 8.77164056059357, "grad_norm": 0.41015625, "learning_rate": 8.947821835830616e-06, "loss": 0.6981, "step": 5320 }, { "epoch": 8.779884583676834, "grad_norm": 0.416015625, "learning_rate": 8.829119474567671e-06, "loss": 0.6972, "step": 5325 }, { "epoch": 8.788128606760099, "grad_norm": 0.408203125, "learning_rate": 8.711173363214553e-06, "loss": 0.6875, "step": 5330 }, { "epoch": 8.796372629843363, "grad_norm": 0.43359375, "learning_rate": 8.593984480118011e-06, "loss": 0.6904, "step": 5335 }, { "epoch": 8.804616652926628, "grad_norm": 0.412109375, "learning_rate": 8.47755379734373e-06, "loss": 0.6886, "step": 5340 }, { "epoch": 8.812860676009892, "grad_norm": 0.431640625, "learning_rate": 8.361882280668165e-06, "loss": 0.6919, "step": 5345 }, { "epoch": 8.821104699093157, "grad_norm": 0.388671875, "learning_rate": 8.24697088957066e-06, "loss": 0.6934, "step": 5350 }, { "epoch": 8.829348722176421, "grad_norm": 0.38671875, "learning_rate": 8.132820577225387e-06, "loss": 0.6882, "step": 5355 }, { "epoch": 8.837592745259688, "grad_norm": 0.390625, "learning_rate": 8.019432290493457e-06, "loss": 0.7015, "step": 5360 }, { "epoch": 8.845836768342952, "grad_norm": 0.39453125, "learning_rate": 7.906806969915148e-06, "loss": 0.689, "step": 5365 }, { "epoch": 8.854080791426217, "grad_norm": 0.400390625, "learning_rate": 7.794945549701993e-06, "loss": 0.6866, "step": 5370 }, { "epoch": 8.862324814509481, "grad_norm": 0.40234375, "learning_rate": 7.683848957729056e-06, "loss": 0.696, "step": 5375 }, { "epoch": 8.870568837592746, "grad_norm": 0.4140625, "learning_rate": 7.573518115527289e-06, "loss": 0.6824, "step": 5380 }, { "epoch": 8.87881286067601, "grad_norm": 0.39453125, "learning_rate": 7.463953938275858e-06, "loss": 0.6941, "step": 5385 }, { "epoch": 8.887056883759275, "grad_norm": 0.390625, "learning_rate": 7.355157334794516e-06, "loss": 0.6901, "step": 5390 }, { "epoch": 8.89530090684254, "grad_norm": 0.404296875, "learning_rate": 7.247129207536152e-06, "loss": 0.688, "step": 5395 }, { "epoch": 8.903544929925804, "grad_norm": 0.39453125, "learning_rate": 7.1398704525792e-06, "loss": 0.6906, "step": 5400 }, { "epoch": 8.911788953009069, "grad_norm": 0.42578125, "learning_rate": 7.0333819596203e-06, "loss": 0.6878, "step": 5405 }, { "epoch": 8.920032976092333, "grad_norm": 0.404296875, "learning_rate": 6.927664611966811e-06, "loss": 0.6965, "step": 5410 }, { "epoch": 8.928276999175598, "grad_norm": 0.40234375, "learning_rate": 6.8227192865295995e-06, "loss": 0.69, "step": 5415 }, { "epoch": 8.936521022258862, "grad_norm": 0.390625, "learning_rate": 6.718546853815688e-06, "loss": 0.6857, "step": 5420 }, { "epoch": 8.944765045342127, "grad_norm": 0.4140625, "learning_rate": 6.6151481779211155e-06, "loss": 0.6922, "step": 5425 }, { "epoch": 8.953009068425391, "grad_norm": 0.40234375, "learning_rate": 6.512524116523633e-06, "loss": 0.6885, "step": 5430 }, { "epoch": 8.961253091508656, "grad_norm": 0.4375, "learning_rate": 6.410675520875742e-06, "loss": 0.6854, "step": 5435 }, { "epoch": 8.96949711459192, "grad_norm": 0.40625, "learning_rate": 6.30960323579749e-06, "loss": 0.6966, "step": 5440 }, { "epoch": 8.977741137675185, "grad_norm": 0.392578125, "learning_rate": 6.209308099669597e-06, "loss": 0.6962, "step": 5445 }, { "epoch": 8.98598516075845, "grad_norm": 0.408203125, "learning_rate": 6.109790944426397e-06, "loss": 0.707, "step": 5450 }, { "epoch": 8.994229183841714, "grad_norm": 0.41015625, "learning_rate": 6.011052595549038e-06, "loss": 0.6924, "step": 5455 }, { "epoch": 8.999175597691673, "eval_loss": 2.4814510345458984, "eval_runtime": 0.2587, "eval_samples_per_second": 38.654, "eval_steps_per_second": 3.865, "step": 5458 }, { "epoch": 9.002473206924979, "grad_norm": 0.3984375, "learning_rate": 5.913093872058528e-06, "loss": 0.6875, "step": 5460 }, { "epoch": 9.010717230008243, "grad_norm": 0.404296875, "learning_rate": 5.81591558650898e-06, "loss": 0.6871, "step": 5465 }, { "epoch": 9.01896125309151, "grad_norm": 0.396484375, "learning_rate": 5.719518544980929e-06, "loss": 0.6887, "step": 5470 }, { "epoch": 9.027205276174774, "grad_norm": 0.4453125, "learning_rate": 5.623903547074549e-06, "loss": 0.7051, "step": 5475 }, { "epoch": 9.035449299258039, "grad_norm": 0.40625, "learning_rate": 5.529071385903084e-06, "loss": 0.694, "step": 5480 }, { "epoch": 9.043693322341303, "grad_norm": 0.40234375, "learning_rate": 5.43502284808628e-06, "loss": 0.6839, "step": 5485 }, { "epoch": 9.051937345424568, "grad_norm": 0.396484375, "learning_rate": 5.341758713743828e-06, "loss": 0.6906, "step": 5490 }, { "epoch": 9.060181368507832, "grad_norm": 0.39453125, "learning_rate": 5.249279756488878e-06, "loss": 0.6895, "step": 5495 }, { "epoch": 9.068425391591097, "grad_norm": 0.396484375, "learning_rate": 5.157586743421672e-06, "loss": 0.6937, "step": 5500 }, { "epoch": 9.076669414674361, "grad_norm": 0.392578125, "learning_rate": 5.066680435123106e-06, "loss": 0.7007, "step": 5505 }, { "epoch": 9.084913437757626, "grad_norm": 0.384765625, "learning_rate": 4.976561585648509e-06, "loss": 0.6864, "step": 5510 }, { "epoch": 9.09315746084089, "grad_norm": 0.3984375, "learning_rate": 4.887230942521337e-06, "loss": 0.6886, "step": 5515 }, { "epoch": 9.101401483924155, "grad_norm": 0.404296875, "learning_rate": 4.798689246727006e-06, "loss": 0.6965, "step": 5520 }, { "epoch": 9.10964550700742, "grad_norm": 0.3984375, "learning_rate": 4.710937232706691e-06, "loss": 0.6888, "step": 5525 }, { "epoch": 9.117889530090684, "grad_norm": 0.404296875, "learning_rate": 4.623975628351273e-06, "loss": 0.6937, "step": 5530 }, { "epoch": 9.126133553173949, "grad_norm": 0.396484375, "learning_rate": 4.537805154995278e-06, "loss": 0.6989, "step": 5535 }, { "epoch": 9.134377576257213, "grad_norm": 0.408203125, "learning_rate": 4.452426527410947e-06, "loss": 0.69, "step": 5540 }, { "epoch": 9.142621599340478, "grad_norm": 0.4296875, "learning_rate": 4.36784045380223e-06, "loss": 0.6952, "step": 5545 }, { "epoch": 9.150865622423742, "grad_norm": 0.39453125, "learning_rate": 4.2840476357989825e-06, "loss": 0.6909, "step": 5550 }, { "epoch": 9.159109645507007, "grad_norm": 0.39453125, "learning_rate": 4.20104876845111e-06, "loss": 0.6835, "step": 5555 }, { "epoch": 9.167353668590271, "grad_norm": 0.404296875, "learning_rate": 4.118844540222788e-06, "loss": 0.7042, "step": 5560 }, { "epoch": 9.175597691673536, "grad_norm": 0.404296875, "learning_rate": 4.037435632986786e-06, "loss": 0.693, "step": 5565 }, { "epoch": 9.1838417147568, "grad_norm": 0.39453125, "learning_rate": 3.95682272201876e-06, "loss": 0.6854, "step": 5570 }, { "epoch": 9.192085737840065, "grad_norm": 0.392578125, "learning_rate": 3.877006475991729e-06, "loss": 0.6937, "step": 5575 }, { "epoch": 9.200329760923331, "grad_norm": 0.3984375, "learning_rate": 3.797987556970495e-06, "loss": 0.6968, "step": 5580 }, { "epoch": 9.208573784006596, "grad_norm": 0.400390625, "learning_rate": 3.7197666204060955e-06, "loss": 0.6902, "step": 5585 }, { "epoch": 9.21681780708986, "grad_norm": 0.400390625, "learning_rate": 3.6423443151304526e-06, "loss": 0.6896, "step": 5590 }, { "epoch": 9.225061830173125, "grad_norm": 0.41796875, "learning_rate": 3.565721283350931e-06, "loss": 0.696, "step": 5595 }, { "epoch": 9.23330585325639, "grad_norm": 0.408203125, "learning_rate": 3.4898981606450333e-06, "loss": 0.6895, "step": 5600 }, { "epoch": 9.241549876339654, "grad_norm": 0.39453125, "learning_rate": 3.414875575955101e-06, "loss": 0.6845, "step": 5605 }, { "epoch": 9.249793899422919, "grad_norm": 0.400390625, "learning_rate": 3.3406541515832003e-06, "loss": 0.6908, "step": 5610 }, { "epoch": 9.258037922506183, "grad_norm": 0.396484375, "learning_rate": 3.267234503185823e-06, "loss": 0.6885, "step": 5615 }, { "epoch": 9.266281945589448, "grad_norm": 0.3984375, "learning_rate": 3.1946172397688267e-06, "loss": 0.6921, "step": 5620 }, { "epoch": 9.274525968672712, "grad_norm": 0.404296875, "learning_rate": 3.1228029636824475e-06, "loss": 0.6927, "step": 5625 }, { "epoch": 9.282769991755977, "grad_norm": 0.39453125, "learning_rate": 3.051792270616216e-06, "loss": 0.689, "step": 5630 }, { "epoch": 9.291014014839241, "grad_norm": 0.416015625, "learning_rate": 2.981585749594051e-06, "loss": 0.6962, "step": 5635 }, { "epoch": 9.299258037922506, "grad_norm": 0.39453125, "learning_rate": 2.912183982969385e-06, "loss": 0.6873, "step": 5640 }, { "epoch": 9.30750206100577, "grad_norm": 0.39453125, "learning_rate": 2.8435875464203343e-06, "loss": 0.6839, "step": 5645 }, { "epoch": 9.315746084089035, "grad_norm": 0.3828125, "learning_rate": 2.7757970089449024e-06, "loss": 0.6884, "step": 5650 }, { "epoch": 9.3239901071723, "grad_norm": 0.3984375, "learning_rate": 2.708812932856253e-06, "loss": 0.6865, "step": 5655 }, { "epoch": 9.332234130255564, "grad_norm": 0.396484375, "learning_rate": 2.6426358737781098e-06, "loss": 0.6944, "step": 5660 }, { "epoch": 9.340478153338829, "grad_norm": 0.39453125, "learning_rate": 2.577266380640053e-06, "loss": 0.6866, "step": 5665 }, { "epoch": 9.348722176422093, "grad_norm": 0.40234375, "learning_rate": 2.5127049956730207e-06, "loss": 0.6917, "step": 5670 }, { "epoch": 9.356966199505358, "grad_norm": 0.384765625, "learning_rate": 2.448952254404846e-06, "loss": 0.6984, "step": 5675 }, { "epoch": 9.365210222588622, "grad_norm": 0.39453125, "learning_rate": 2.3860086856557383e-06, "loss": 0.6881, "step": 5680 }, { "epoch": 9.373454245671887, "grad_norm": 0.41015625, "learning_rate": 2.3238748115339324e-06, "loss": 0.689, "step": 5685 }, { "epoch": 9.381698268755153, "grad_norm": 0.39453125, "learning_rate": 2.2625511474313685e-06, "loss": 0.6968, "step": 5690 }, { "epoch": 9.389942291838418, "grad_norm": 0.43359375, "learning_rate": 2.2020382020194074e-06, "loss": 0.6923, "step": 5695 }, { "epoch": 9.398186314921682, "grad_norm": 0.439453125, "learning_rate": 2.1423364772445887e-06, "loss": 0.6929, "step": 5700 }, { "epoch": 9.406430338004947, "grad_norm": 0.41015625, "learning_rate": 2.0834464683245346e-06, "loss": 0.6948, "step": 5705 }, { "epoch": 9.414674361088212, "grad_norm": 0.40234375, "learning_rate": 2.025368663743743e-06, "loss": 0.6956, "step": 5710 }, { "epoch": 9.422918384171476, "grad_norm": 0.435546875, "learning_rate": 1.968103545249611e-06, "loss": 0.6857, "step": 5715 }, { "epoch": 9.43116240725474, "grad_norm": 0.427734375, "learning_rate": 1.91165158784844e-06, "loss": 0.6871, "step": 5720 }, { "epoch": 9.439406430338005, "grad_norm": 0.390625, "learning_rate": 1.8560132598014368e-06, "loss": 0.6864, "step": 5725 }, { "epoch": 9.44765045342127, "grad_norm": 0.400390625, "learning_rate": 1.8011890226208527e-06, "loss": 0.6922, "step": 5730 }, { "epoch": 9.455894476504534, "grad_norm": 0.3984375, "learning_rate": 1.7471793310662287e-06, "loss": 0.6973, "step": 5735 }, { "epoch": 9.464138499587799, "grad_norm": 0.396484375, "learning_rate": 1.6939846331405108e-06, "loss": 0.6954, "step": 5740 }, { "epoch": 9.472382522671063, "grad_norm": 0.392578125, "learning_rate": 1.6416053700863964e-06, "loss": 0.6983, "step": 5745 }, { "epoch": 9.480626545754328, "grad_norm": 0.408203125, "learning_rate": 1.5900419763826614e-06, "loss": 0.6904, "step": 5750 }, { "epoch": 9.488870568837593, "grad_norm": 0.41015625, "learning_rate": 1.5392948797405827e-06, "loss": 0.7001, "step": 5755 }, { "epoch": 9.497114591920857, "grad_norm": 0.42578125, "learning_rate": 1.489364501100332e-06, "loss": 0.6978, "step": 5760 }, { "epoch": 9.505358615004122, "grad_norm": 0.3984375, "learning_rate": 1.4402512546275114e-06, "loss": 0.6974, "step": 5765 }, { "epoch": 9.513602638087386, "grad_norm": 0.42578125, "learning_rate": 1.3919555477097668e-06, "loss": 0.6885, "step": 5770 }, { "epoch": 9.52184666117065, "grad_norm": 0.416015625, "learning_rate": 1.344477780953346e-06, "loss": 0.6884, "step": 5775 }, { "epoch": 9.530090684253915, "grad_norm": 0.400390625, "learning_rate": 1.2978183481797801e-06, "loss": 0.6899, "step": 5780 }, { "epoch": 9.53833470733718, "grad_norm": 0.392578125, "learning_rate": 1.251977636422641e-06, "loss": 0.6897, "step": 5785 }, { "epoch": 9.546578730420444, "grad_norm": 0.390625, "learning_rate": 1.2069560259243328e-06, "loss": 0.6933, "step": 5790 }, { "epoch": 9.55482275350371, "grad_norm": 0.4140625, "learning_rate": 1.1627538901329172e-06, "loss": 0.6868, "step": 5795 }, { "epoch": 9.563066776586975, "grad_norm": 0.39453125, "learning_rate": 1.1193715956990258e-06, "loss": 0.6855, "step": 5800 }, { "epoch": 9.57131079967024, "grad_norm": 0.400390625, "learning_rate": 1.076809502472831e-06, "loss": 0.6977, "step": 5805 }, { "epoch": 9.579554822753504, "grad_norm": 0.396484375, "learning_rate": 1.035067963501024e-06, "loss": 0.6969, "step": 5810 }, { "epoch": 9.587798845836769, "grad_norm": 0.400390625, "learning_rate": 9.94147325023953e-07, "loss": 0.6982, "step": 5815 }, { "epoch": 9.596042868920033, "grad_norm": 0.388671875, "learning_rate": 9.540479264726676e-07, "loss": 0.6865, "step": 5820 }, { "epoch": 9.604286892003298, "grad_norm": 0.40625, "learning_rate": 9.147701004661446e-07, "loss": 0.6897, "step": 5825 }, { "epoch": 9.612530915086563, "grad_norm": 0.404296875, "learning_rate": 8.763141728085789e-07, "loss": 0.6837, "step": 5830 }, { "epoch": 9.620774938169827, "grad_norm": 0.396484375, "learning_rate": 8.386804624865851e-07, "loss": 0.6865, "step": 5835 }, { "epoch": 9.629018961253092, "grad_norm": 0.39453125, "learning_rate": 8.018692816666118e-07, "loss": 0.6907, "step": 5840 }, { "epoch": 9.637262984336356, "grad_norm": 0.39453125, "learning_rate": 7.658809356923424e-07, "loss": 0.6902, "step": 5845 }, { "epoch": 9.64550700741962, "grad_norm": 0.39453125, "learning_rate": 7.307157230821426e-07, "loss": 0.6925, "step": 5850 }, { "epoch": 9.653751030502885, "grad_norm": 0.3984375, "learning_rate": 6.963739355266286e-07, "loss": 0.6911, "step": 5855 }, { "epoch": 9.66199505358615, "grad_norm": 0.39453125, "learning_rate": 6.628558578862021e-07, "loss": 0.6838, "step": 5860 }, { "epoch": 9.670239076669414, "grad_norm": 0.388671875, "learning_rate": 6.301617681886863e-07, "loss": 0.6883, "step": 5865 }, { "epoch": 9.678483099752679, "grad_norm": 0.408203125, "learning_rate": 5.982919376270823e-07, "loss": 0.6908, "step": 5870 }, { "epoch": 9.686727122835944, "grad_norm": 0.416015625, "learning_rate": 5.672466305572388e-07, "loss": 0.6908, "step": 5875 }, { "epoch": 9.694971145919208, "grad_norm": 0.408203125, "learning_rate": 5.370261044956971e-07, "loss": 0.6962, "step": 5880 }, { "epoch": 9.703215169002473, "grad_norm": 0.396484375, "learning_rate": 5.07630610117582e-07, "loss": 0.6932, "step": 5885 }, { "epoch": 9.711459192085737, "grad_norm": 0.390625, "learning_rate": 4.790603912544489e-07, "loss": 0.6878, "step": 5890 }, { "epoch": 9.719703215169002, "grad_norm": 0.400390625, "learning_rate": 4.5131568489236166e-07, "loss": 0.6946, "step": 5895 }, { "epoch": 9.727947238252266, "grad_norm": 0.4296875, "learning_rate": 4.2439672116982855e-07, "loss": 0.6853, "step": 5900 }, { "epoch": 9.73619126133553, "grad_norm": 0.396484375, "learning_rate": 3.983037233759368e-07, "loss": 0.6914, "step": 5905 }, { "epoch": 9.744435284418797, "grad_norm": 0.404296875, "learning_rate": 3.73036907948543e-07, "loss": 0.6898, "step": 5910 }, { "epoch": 9.752679307502062, "grad_norm": 0.388671875, "learning_rate": 3.485964844723744e-07, "loss": 0.6888, "step": 5915 }, { "epoch": 9.760923330585326, "grad_norm": 0.412109375, "learning_rate": 3.2498265567739717e-07, "loss": 0.6824, "step": 5920 }, { "epoch": 9.76916735366859, "grad_norm": 0.3984375, "learning_rate": 3.0219561743707326e-07, "loss": 0.691, "step": 5925 }, { "epoch": 9.777411376751855, "grad_norm": 0.40234375, "learning_rate": 2.8023555876673937e-07, "loss": 0.6862, "step": 5930 }, { "epoch": 9.78565539983512, "grad_norm": 0.396484375, "learning_rate": 2.5910266182207486e-07, "loss": 0.6933, "step": 5935 }, { "epoch": 9.793899422918384, "grad_norm": 0.400390625, "learning_rate": 2.3879710189753656e-07, "loss": 0.6926, "step": 5940 }, { "epoch": 9.802143446001649, "grad_norm": 0.3984375, "learning_rate": 2.1931904742495957e-07, "loss": 0.6807, "step": 5945 }, { "epoch": 9.810387469084914, "grad_norm": 0.388671875, "learning_rate": 2.0066865997212525e-07, "loss": 0.6923, "step": 5950 }, { "epoch": 9.818631492168178, "grad_norm": 0.39453125, "learning_rate": 1.8284609424142895e-07, "loss": 0.6885, "step": 5955 }, { "epoch": 9.826875515251443, "grad_norm": 0.392578125, "learning_rate": 1.6585149806860324e-07, "loss": 0.6862, "step": 5960 }, { "epoch": 9.835119538334707, "grad_norm": 0.4140625, "learning_rate": 1.4968501242148547e-07, "loss": 0.6955, "step": 5965 }, { "epoch": 9.843363561417972, "grad_norm": 0.404296875, "learning_rate": 1.3434677139885222e-07, "loss": 0.6957, "step": 5970 }, { "epoch": 9.851607584501236, "grad_norm": 0.419921875, "learning_rate": 1.1983690222929778e-07, "loss": 0.6915, "step": 5975 }, { "epoch": 9.8598516075845, "grad_norm": 0.39453125, "learning_rate": 1.0615552527017958e-07, "loss": 0.701, "step": 5980 }, { "epoch": 9.868095630667765, "grad_norm": 0.40234375, "learning_rate": 9.330275400666332e-08, "loss": 0.6959, "step": 5985 }, { "epoch": 9.87633965375103, "grad_norm": 0.396484375, "learning_rate": 8.127869505069053e-08, "loss": 0.6885, "step": 5990 }, { "epoch": 9.884583676834295, "grad_norm": 0.40234375, "learning_rate": 7.00834481402013e-08, "loss": 0.6842, "step": 5995 }, { "epoch": 9.892827699917559, "grad_norm": 0.38671875, "learning_rate": 5.971710613821291e-08, "loss": 0.6956, "step": 6000 }, { "epoch": 9.901071723000824, "grad_norm": 0.3984375, "learning_rate": 5.0179755032109253e-08, "loss": 0.6898, "step": 6005 }, { "epoch": 9.90931574608409, "grad_norm": 0.3828125, "learning_rate": 4.147147393290807e-08, "loss": 0.6899, "step": 6010 }, { "epoch": 9.917559769167354, "grad_norm": 0.404296875, "learning_rate": 3.359233507459481e-08, "loss": 0.697, "step": 6015 }, { "epoch": 9.925803792250619, "grad_norm": 0.408203125, "learning_rate": 2.6542403813545334e-08, "loss": 0.6938, "step": 6020 }, { "epoch": 9.934047815333884, "grad_norm": 0.3828125, "learning_rate": 2.0321738627981923e-08, "loss": 0.686, "step": 6025 }, { "epoch": 9.942291838417148, "grad_norm": 0.40234375, "learning_rate": 1.4930391117451426e-08, "loss": 0.6874, "step": 6030 }, { "epoch": 9.950535861500413, "grad_norm": 0.404296875, "learning_rate": 1.0368406002436715e-08, "loss": 0.6934, "step": 6035 }, { "epoch": 9.958779884583677, "grad_norm": 0.400390625, "learning_rate": 6.635821124001406e-09, "loss": 0.6913, "step": 6040 }, { "epoch": 9.967023907666942, "grad_norm": 0.388671875, "learning_rate": 3.732667443390181e-09, "loss": 0.6895, "step": 6045 }, { "epoch": 9.975267930750206, "grad_norm": 0.3984375, "learning_rate": 1.6589690418955528e-09, "loss": 0.6968, "step": 6050 }, { "epoch": 9.983511953833471, "grad_norm": 0.3984375, "learning_rate": 4.147431205359098e-10, "loss": 0.6946, "step": 6055 }, { "epoch": 9.991755976916735, "grad_norm": 0.376953125, "learning_rate": 0.0, "loss": 0.6936, "step": 6060 }, { "epoch": 9.991755976916735, "eval_loss": 2.4860482215881348, "eval_runtime": 0.2343, "eval_samples_per_second": 42.675, "eval_steps_per_second": 4.267, "step": 6060 }, { "epoch": 9.991755976916735, "step": 6060, "total_flos": 1.8500974249565487e+19, "train_loss": 1.1020318522705104, "train_runtime": 14653.0399, "train_samples_per_second": 26.478, "train_steps_per_second": 0.414 } ], "logging_steps": 5, "max_steps": 6060, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.8500974249565487e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }