{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3821579984474831, "eval_steps": 400, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00023884874902967696, "eval_loss": 1.5979785919189453, "eval_runtime": 224.9995, "eval_samples_per_second": 3.778, "eval_steps_per_second": 3.778, "step": 1 }, { "epoch": 0.0014330924941780617, "grad_norm": 20.875, "learning_rate": 6.000000000000001e-07, "loss": 1.8691, "step": 6 }, { "epoch": 0.0028661849883561234, "grad_norm": 14.0625, "learning_rate": 1.2000000000000002e-06, "loss": 1.8156, "step": 12 }, { "epoch": 0.004299277482534185, "grad_norm": 11.1875, "learning_rate": 1.8000000000000001e-06, "loss": 1.6925, "step": 18 }, { "epoch": 0.005732369976712247, "grad_norm": 7.15625, "learning_rate": 2.4000000000000003e-06, "loss": 1.612, "step": 24 }, { "epoch": 0.0071654624708903086, "grad_norm": 7.25, "learning_rate": 3e-06, "loss": 1.8222, "step": 30 }, { "epoch": 0.00859855496506837, "grad_norm": 5.71875, "learning_rate": 3.6000000000000003e-06, "loss": 1.6277, "step": 36 }, { "epoch": 0.010031647459246432, "grad_norm": 5.65625, "learning_rate": 4.2000000000000004e-06, "loss": 1.5655, "step": 42 }, { "epoch": 0.011464739953424494, "grad_norm": 6.90625, "learning_rate": 4.800000000000001e-06, "loss": 1.7691, "step": 48 }, { "epoch": 0.012897832447602555, "grad_norm": 6.96875, "learning_rate": 5.400000000000001e-06, "loss": 1.7085, "step": 54 }, { "epoch": 0.014330924941780617, "grad_norm": 5.3125, "learning_rate": 6e-06, "loss": 1.4649, "step": 60 }, { "epoch": 0.01576401743595868, "grad_norm": 15.8125, "learning_rate": 6.600000000000001e-06, "loss": 1.6534, "step": 66 }, { "epoch": 0.01719710993013674, "grad_norm": 42.75, "learning_rate": 7.2000000000000005e-06, "loss": 1.673, "step": 72 }, { "epoch": 0.018630202424314804, "grad_norm": 5.5, "learning_rate": 7.800000000000002e-06, "loss": 1.429, "step": 78 }, { "epoch": 0.020063294918492864, "grad_norm": 3.875, "learning_rate": 8.400000000000001e-06, "loss": 1.6067, "step": 84 }, { "epoch": 0.021496387412670927, "grad_norm": 4.53125, "learning_rate": 9e-06, "loss": 1.4336, "step": 90 }, { "epoch": 0.022929479906848987, "grad_norm": 4.40625, "learning_rate": 9.600000000000001e-06, "loss": 1.5998, "step": 96 }, { "epoch": 0.02436257240102705, "grad_norm": 5.40625, "learning_rate": 1.02e-05, "loss": 1.5259, "step": 102 }, { "epoch": 0.02579566489520511, "grad_norm": 9.0, "learning_rate": 1.0800000000000002e-05, "loss": 1.5255, "step": 108 }, { "epoch": 0.027228757389383174, "grad_norm": 5.34375, "learning_rate": 1.14e-05, "loss": 1.5375, "step": 114 }, { "epoch": 0.028661849883561234, "grad_norm": 4.625, "learning_rate": 1.2e-05, "loss": 1.4729, "step": 120 }, { "epoch": 0.030094942377739298, "grad_norm": 5.78125, "learning_rate": 1.2600000000000001e-05, "loss": 1.5446, "step": 126 }, { "epoch": 0.03152803487191736, "grad_norm": 5.15625, "learning_rate": 1.3200000000000002e-05, "loss": 1.6895, "step": 132 }, { "epoch": 0.03296112736609542, "grad_norm": 4.59375, "learning_rate": 1.38e-05, "loss": 1.6145, "step": 138 }, { "epoch": 0.03439421986027348, "grad_norm": 4.96875, "learning_rate": 1.4400000000000001e-05, "loss": 1.4316, "step": 144 }, { "epoch": 0.035827312354451545, "grad_norm": 4.71875, "learning_rate": 1.5000000000000002e-05, "loss": 1.5619, "step": 150 }, { "epoch": 0.03726040484862961, "grad_norm": 7.9375, "learning_rate": 1.5600000000000003e-05, "loss": 1.6608, "step": 156 }, { "epoch": 0.038693497342807665, "grad_norm": 4.34375, "learning_rate": 1.62e-05, "loss": 1.6418, "step": 162 }, { "epoch": 0.04012658983698573, "grad_norm": 4.8125, "learning_rate": 1.6800000000000002e-05, "loss": 1.5532, "step": 168 }, { "epoch": 0.04155968233116379, "grad_norm": 7.90625, "learning_rate": 1.7400000000000003e-05, "loss": 1.6124, "step": 174 }, { "epoch": 0.042992774825341855, "grad_norm": 5.90625, "learning_rate": 1.8e-05, "loss": 1.5629, "step": 180 }, { "epoch": 0.04442586731951991, "grad_norm": 9.4375, "learning_rate": 1.86e-05, "loss": 1.5727, "step": 186 }, { "epoch": 0.045858959813697975, "grad_norm": 6.34375, "learning_rate": 1.9200000000000003e-05, "loss": 1.4866, "step": 192 }, { "epoch": 0.04729205230787604, "grad_norm": 10.9375, "learning_rate": 1.98e-05, "loss": 1.6203, "step": 198 }, { "epoch": 0.0487251448020541, "grad_norm": 5.46875, "learning_rate": 1.9999756307053947e-05, "loss": 1.6003, "step": 204 }, { "epoch": 0.05015823729623216, "grad_norm": 7.34375, "learning_rate": 1.9998476951563914e-05, "loss": 1.7795, "step": 210 }, { "epoch": 0.05159132979041022, "grad_norm": 5.03125, "learning_rate": 1.9996101150403543e-05, "loss": 1.6262, "step": 216 }, { "epoch": 0.053024422284588285, "grad_norm": 6.03125, "learning_rate": 1.999262916410621e-05, "loss": 1.5033, "step": 222 }, { "epoch": 0.05445751477876635, "grad_norm": 6.375, "learning_rate": 1.9988061373414342e-05, "loss": 1.528, "step": 228 }, { "epoch": 0.055890607272944405, "grad_norm": 5.375, "learning_rate": 1.9982398279237657e-05, "loss": 1.6706, "step": 234 }, { "epoch": 0.05732369976712247, "grad_norm": 5.3125, "learning_rate": 1.9975640502598243e-05, "loss": 1.8826, "step": 240 }, { "epoch": 0.05875679226130053, "grad_norm": 7.21875, "learning_rate": 1.9967788784562474e-05, "loss": 1.6844, "step": 246 }, { "epoch": 0.060189884755478595, "grad_norm": 14.0, "learning_rate": 1.9958843986159705e-05, "loss": 1.6681, "step": 252 }, { "epoch": 0.06162297724965665, "grad_norm": 5.3125, "learning_rate": 1.9948807088287884e-05, "loss": 1.5271, "step": 258 }, { "epoch": 0.06305606974383472, "grad_norm": 5.78125, "learning_rate": 1.9937679191605964e-05, "loss": 1.5941, "step": 264 }, { "epoch": 0.06448916223801278, "grad_norm": 7.75, "learning_rate": 1.9925461516413224e-05, "loss": 1.6754, "step": 270 }, { "epoch": 0.06592225473219084, "grad_norm": 5.03125, "learning_rate": 1.991215540251542e-05, "loss": 1.6616, "step": 276 }, { "epoch": 0.0673553472263689, "grad_norm": 5.46875, "learning_rate": 1.989776230907789e-05, "loss": 1.7207, "step": 282 }, { "epoch": 0.06878843972054696, "grad_norm": 4.84375, "learning_rate": 1.988228381446553e-05, "loss": 1.6092, "step": 288 }, { "epoch": 0.07022153221472502, "grad_norm": 15.625, "learning_rate": 1.9865721616069695e-05, "loss": 1.6828, "step": 294 }, { "epoch": 0.07165462470890309, "grad_norm": 7.125, "learning_rate": 1.9848077530122083e-05, "loss": 1.7341, "step": 300 }, { "epoch": 0.07308771720308115, "grad_norm": 10.625, "learning_rate": 1.9829353491495545e-05, "loss": 1.6181, "step": 306 }, { "epoch": 0.07452080969725922, "grad_norm": 4.75, "learning_rate": 1.9809551553491918e-05, "loss": 1.548, "step": 312 }, { "epoch": 0.07595390219143727, "grad_norm": 6.9375, "learning_rate": 1.9788673887616852e-05, "loss": 1.5703, "step": 318 }, { "epoch": 0.07738699468561533, "grad_norm": 6.71875, "learning_rate": 1.9766722783341682e-05, "loss": 1.7147, "step": 324 }, { "epoch": 0.0788200871797934, "grad_norm": 6.8125, "learning_rate": 1.9743700647852356e-05, "loss": 1.7598, "step": 330 }, { "epoch": 0.08025317967397146, "grad_norm": 5.0625, "learning_rate": 1.9719610005785466e-05, "loss": 1.7136, "step": 336 }, { "epoch": 0.08168627216814951, "grad_norm": 6.03125, "learning_rate": 1.9694453498951392e-05, "loss": 1.7161, "step": 342 }, { "epoch": 0.08311936466232758, "grad_norm": 7.34375, "learning_rate": 1.9668233886044597e-05, "loss": 1.6319, "step": 348 }, { "epoch": 0.08455245715650564, "grad_norm": 5.21875, "learning_rate": 1.96409540423411e-05, "loss": 1.5857, "step": 354 }, { "epoch": 0.08598554965068371, "grad_norm": 10.6875, "learning_rate": 1.961261695938319e-05, "loss": 1.7632, "step": 360 }, { "epoch": 0.08741864214486177, "grad_norm": 6.21875, "learning_rate": 1.9583225744651334e-05, "loss": 1.4205, "step": 366 }, { "epoch": 0.08885173463903982, "grad_norm": 5.875, "learning_rate": 1.9552783621223437e-05, "loss": 1.7812, "step": 372 }, { "epoch": 0.0902848271332179, "grad_norm": 4.46875, "learning_rate": 1.9521293927421388e-05, "loss": 1.5759, "step": 378 }, { "epoch": 0.09171791962739595, "grad_norm": 6.53125, "learning_rate": 1.9488760116444966e-05, "loss": 1.6537, "step": 384 }, { "epoch": 0.09315101212157402, "grad_norm": 10.8125, "learning_rate": 1.945518575599317e-05, "loss": 1.4973, "step": 390 }, { "epoch": 0.09458410461575208, "grad_norm": 4.1875, "learning_rate": 1.942057452787297e-05, "loss": 1.578, "step": 396 }, { "epoch": 0.09553949961187078, "eval_loss": 1.4027706384658813, "eval_runtime": 224.2305, "eval_samples_per_second": 3.791, "eval_steps_per_second": 3.791, "step": 400 }, { "epoch": 0.09601719710993013, "grad_norm": 3.875, "learning_rate": 1.938493022759556e-05, "loss": 1.6032, "step": 402 }, { "epoch": 0.0974502896041082, "grad_norm": 6.125, "learning_rate": 1.9348256763960146e-05, "loss": 1.7055, "step": 408 }, { "epoch": 0.09888338209828626, "grad_norm": 5.84375, "learning_rate": 1.9310558158625286e-05, "loss": 1.7454, "step": 414 }, { "epoch": 0.10031647459246432, "grad_norm": 7.0625, "learning_rate": 1.9271838545667876e-05, "loss": 1.7345, "step": 420 }, { "epoch": 0.10174956708664239, "grad_norm": 6.125, "learning_rate": 1.923210217112981e-05, "loss": 1.6099, "step": 426 }, { "epoch": 0.10318265958082044, "grad_norm": 4.59375, "learning_rate": 1.9191353392552346e-05, "loss": 1.652, "step": 432 }, { "epoch": 0.10461575207499851, "grad_norm": 5.96875, "learning_rate": 1.914959667849825e-05, "loss": 1.7092, "step": 438 }, { "epoch": 0.10604884456917657, "grad_norm": 6.4375, "learning_rate": 1.910683660806177e-05, "loss": 1.7545, "step": 444 }, { "epoch": 0.10748193706335463, "grad_norm": 10.4375, "learning_rate": 1.9063077870366504e-05, "loss": 1.5287, "step": 450 }, { "epoch": 0.1089150295575327, "grad_norm": 7.84375, "learning_rate": 1.901832526405114e-05, "loss": 1.7219, "step": 456 }, { "epoch": 0.11034812205171075, "grad_norm": 9.5625, "learning_rate": 1.8972583696743284e-05, "loss": 1.665, "step": 462 }, { "epoch": 0.11178121454588881, "grad_norm": 10.0625, "learning_rate": 1.892585818452126e-05, "loss": 1.6363, "step": 468 }, { "epoch": 0.11321430704006688, "grad_norm": 5.78125, "learning_rate": 1.8878153851364013e-05, "loss": 1.543, "step": 474 }, { "epoch": 0.11464739953424494, "grad_norm": 6.125, "learning_rate": 1.8829475928589272e-05, "loss": 1.5826, "step": 480 }, { "epoch": 0.11608049202842301, "grad_norm": 4.8125, "learning_rate": 1.8779829754279806e-05, "loss": 1.581, "step": 486 }, { "epoch": 0.11751358452260106, "grad_norm": 9.75, "learning_rate": 1.8729220772698096e-05, "loss": 1.5841, "step": 492 }, { "epoch": 0.11894667701677912, "grad_norm": 13.3125, "learning_rate": 1.8677654533689287e-05, "loss": 1.6944, "step": 498 }, { "epoch": 0.12037976951095719, "grad_norm": 4.96875, "learning_rate": 1.8625136692072577e-05, "loss": 1.6203, "step": 504 }, { "epoch": 0.12181286200513525, "grad_norm": 6.3125, "learning_rate": 1.8571673007021124e-05, "loss": 1.5639, "step": 510 }, { "epoch": 0.1232459544993133, "grad_norm": 5.5, "learning_rate": 1.851726934143048e-05, "loss": 1.6397, "step": 516 }, { "epoch": 0.12467904699349137, "grad_norm": 5.125, "learning_rate": 1.8461931661275642e-05, "loss": 1.7315, "step": 522 }, { "epoch": 0.12611213948766944, "grad_norm": 6.25, "learning_rate": 1.8405666034956842e-05, "loss": 1.7201, "step": 528 }, { "epoch": 0.1275452319818475, "grad_norm": 8.9375, "learning_rate": 1.8348478632634067e-05, "loss": 1.6047, "step": 534 }, { "epoch": 0.12897832447602556, "grad_norm": 46.25, "learning_rate": 1.8290375725550417e-05, "loss": 1.6949, "step": 540 }, { "epoch": 0.13041141697020361, "grad_norm": 5.9375, "learning_rate": 1.8231363685344422e-05, "loss": 1.7245, "step": 546 }, { "epoch": 0.13184450946438167, "grad_norm": 5.78125, "learning_rate": 1.8171448983351284e-05, "loss": 1.641, "step": 552 }, { "epoch": 0.13327760195855975, "grad_norm": 24.125, "learning_rate": 1.8110638189893267e-05, "loss": 1.6125, "step": 558 }, { "epoch": 0.1347106944527378, "grad_norm": 6.4375, "learning_rate": 1.804893797355914e-05, "loss": 1.6647, "step": 564 }, { "epoch": 0.13614378694691587, "grad_norm": 6.34375, "learning_rate": 1.798635510047293e-05, "loss": 1.7073, "step": 570 }, { "epoch": 0.13757687944109392, "grad_norm": 6.1875, "learning_rate": 1.792289643355191e-05, "loss": 1.6271, "step": 576 }, { "epoch": 0.13900997193527198, "grad_norm": 5.0625, "learning_rate": 1.785856893175402e-05, "loss": 1.6317, "step": 582 }, { "epoch": 0.14044306442945004, "grad_norm": 4.6875, "learning_rate": 1.7793379649314743e-05, "loss": 1.6578, "step": 588 }, { "epoch": 0.14187615692362812, "grad_norm": 4.84375, "learning_rate": 1.7727335734973512e-05, "loss": 1.6554, "step": 594 }, { "epoch": 0.14330924941780618, "grad_norm": 6.1875, "learning_rate": 1.766044443118978e-05, "loss": 1.5523, "step": 600 }, { "epoch": 0.14474234191198423, "grad_norm": 23.375, "learning_rate": 1.759271307334881e-05, "loss": 1.616, "step": 606 }, { "epoch": 0.1461754344061623, "grad_norm": 6.9375, "learning_rate": 1.7524149088957244e-05, "loss": 1.7729, "step": 612 }, { "epoch": 0.14760852690034035, "grad_norm": 10.25, "learning_rate": 1.7454759996828622e-05, "loss": 1.5922, "step": 618 }, { "epoch": 0.14904161939451843, "grad_norm": 7.21875, "learning_rate": 1.7384553406258842e-05, "loss": 1.583, "step": 624 }, { "epoch": 0.1504747118886965, "grad_norm": 6.9375, "learning_rate": 1.7313537016191706e-05, "loss": 1.6019, "step": 630 }, { "epoch": 0.15190780438287455, "grad_norm": 11.5, "learning_rate": 1.7241718614374678e-05, "loss": 1.6195, "step": 636 }, { "epoch": 0.1533408968770526, "grad_norm": 5.5, "learning_rate": 1.716910607650483e-05, "loss": 1.5012, "step": 642 }, { "epoch": 0.15477398937123066, "grad_norm": 6.71875, "learning_rate": 1.709570736536521e-05, "loss": 1.7686, "step": 648 }, { "epoch": 0.15620708186540874, "grad_norm": 5.71875, "learning_rate": 1.7021530529951627e-05, "loss": 1.7922, "step": 654 }, { "epoch": 0.1576401743595868, "grad_norm": 7.8125, "learning_rate": 1.6946583704589973e-05, "loss": 1.623, "step": 660 }, { "epoch": 0.15907326685376486, "grad_norm": 6.34375, "learning_rate": 1.6870875108044233e-05, "loss": 1.6039, "step": 666 }, { "epoch": 0.1605063593479429, "grad_norm": 6.46875, "learning_rate": 1.6794413042615168e-05, "loss": 1.6392, "step": 672 }, { "epoch": 0.16193945184212097, "grad_norm": 5.4375, "learning_rate": 1.6717205893229904e-05, "loss": 1.5683, "step": 678 }, { "epoch": 0.16337254433629902, "grad_norm": 4.78125, "learning_rate": 1.6639262126522417e-05, "loss": 1.6165, "step": 684 }, { "epoch": 0.1648056368304771, "grad_norm": 5.4375, "learning_rate": 1.6560590289905074e-05, "loss": 1.5341, "step": 690 }, { "epoch": 0.16623872932465517, "grad_norm": 5.25, "learning_rate": 1.6481199010631312e-05, "loss": 1.6573, "step": 696 }, { "epoch": 0.16767182181883322, "grad_norm": 5.21875, "learning_rate": 1.6401096994849558e-05, "loss": 1.5056, "step": 702 }, { "epoch": 0.16910491431301128, "grad_norm": 12.625, "learning_rate": 1.632029302664851e-05, "loss": 1.5337, "step": 708 }, { "epoch": 0.17053800680718934, "grad_norm": 4.28125, "learning_rate": 1.6238795967093865e-05, "loss": 1.5038, "step": 714 }, { "epoch": 0.17197109930136742, "grad_norm": 6.96875, "learning_rate": 1.6156614753256583e-05, "loss": 1.5587, "step": 720 }, { "epoch": 0.17340419179554548, "grad_norm": 4.90625, "learning_rate": 1.607375839723287e-05, "loss": 1.563, "step": 726 }, { "epoch": 0.17483728428972353, "grad_norm": 5.34375, "learning_rate": 1.599023598515586e-05, "loss": 1.6058, "step": 732 }, { "epoch": 0.1762703767839016, "grad_norm": 5.25, "learning_rate": 1.5906056676199256e-05, "loss": 1.7244, "step": 738 }, { "epoch": 0.17770346927807965, "grad_norm": 4.5, "learning_rate": 1.5821229701572897e-05, "loss": 1.6587, "step": 744 }, { "epoch": 0.17913656177225773, "grad_norm": 12.75, "learning_rate": 1.573576436351046e-05, "loss": 1.6018, "step": 750 }, { "epoch": 0.1805696542664358, "grad_norm": 6.0, "learning_rate": 1.564967003424938e-05, "loss": 1.6205, "step": 756 }, { "epoch": 0.18200274676061384, "grad_norm": 5.59375, "learning_rate": 1.556295615500305e-05, "loss": 1.6345, "step": 762 }, { "epoch": 0.1834358392547919, "grad_norm": 4.59375, "learning_rate": 1.5475632234925505e-05, "loss": 1.5226, "step": 768 }, { "epoch": 0.18486893174896996, "grad_norm": 4.78125, "learning_rate": 1.5387707850068633e-05, "loss": 1.6488, "step": 774 }, { "epoch": 0.18630202424314804, "grad_norm": 4.28125, "learning_rate": 1.529919264233205e-05, "loss": 1.5393, "step": 780 }, { "epoch": 0.1877351167373261, "grad_norm": 7.625, "learning_rate": 1.5210096318405768e-05, "loss": 1.5374, "step": 786 }, { "epoch": 0.18916820923150415, "grad_norm": 4.21875, "learning_rate": 1.5120428648705716e-05, "loss": 1.4963, "step": 792 }, { "epoch": 0.1906013017256822, "grad_norm": 4.25, "learning_rate": 1.5030199466302354e-05, "loss": 1.5828, "step": 798 }, { "epoch": 0.19107899922374155, "eval_loss": 1.3809266090393066, "eval_runtime": 223.0505, "eval_samples_per_second": 3.811, "eval_steps_per_second": 3.811, "step": 800 }, { "epoch": 0.19203439421986027, "grad_norm": 6.21875, "learning_rate": 1.493941866584231e-05, "loss": 1.5799, "step": 804 }, { "epoch": 0.19346748671403832, "grad_norm": 8.5, "learning_rate": 1.4848096202463373e-05, "loss": 1.6519, "step": 810 }, { "epoch": 0.1949005792082164, "grad_norm": 4.59375, "learning_rate": 1.4756242090702756e-05, "loss": 1.5897, "step": 816 }, { "epoch": 0.19633367170239446, "grad_norm": 5.75, "learning_rate": 1.4663866403398915e-05, "loss": 1.6454, "step": 822 }, { "epoch": 0.19776676419657252, "grad_norm": 4.1875, "learning_rate": 1.4570979270586944e-05, "loss": 1.5361, "step": 828 }, { "epoch": 0.19919985669075058, "grad_norm": 5.375, "learning_rate": 1.4477590878387697e-05, "loss": 1.5086, "step": 834 }, { "epoch": 0.20063294918492863, "grad_norm": 4.375, "learning_rate": 1.4383711467890776e-05, "loss": 1.6474, "step": 840 }, { "epoch": 0.20206604167910672, "grad_norm": 4.6875, "learning_rate": 1.4289351334031461e-05, "loss": 1.465, "step": 846 }, { "epoch": 0.20349913417328477, "grad_norm": 8.6875, "learning_rate": 1.4194520824461773e-05, "loss": 1.5312, "step": 852 }, { "epoch": 0.20493222666746283, "grad_norm": 5.53125, "learning_rate": 1.4099230338415728e-05, "loss": 1.4775, "step": 858 }, { "epoch": 0.2063653191616409, "grad_norm": 9.8125, "learning_rate": 1.4003490325568953e-05, "loss": 1.8343, "step": 864 }, { "epoch": 0.20779841165581894, "grad_norm": 8.0625, "learning_rate": 1.3907311284892737e-05, "loss": 1.537, "step": 870 }, { "epoch": 0.20923150414999703, "grad_norm": 6.3125, "learning_rate": 1.3810703763502744e-05, "loss": 1.7239, "step": 876 }, { "epoch": 0.21066459664417508, "grad_norm": 5.75, "learning_rate": 1.371367835550235e-05, "loss": 1.5176, "step": 882 }, { "epoch": 0.21209768913835314, "grad_norm": 4.65625, "learning_rate": 1.3616245700820922e-05, "loss": 1.641, "step": 888 }, { "epoch": 0.2135307816325312, "grad_norm": 4.0625, "learning_rate": 1.3518416484047018e-05, "loss": 1.5882, "step": 894 }, { "epoch": 0.21496387412670925, "grad_norm": 5.09375, "learning_rate": 1.342020143325669e-05, "loss": 1.6042, "step": 900 }, { "epoch": 0.2163969666208873, "grad_norm": 5.84375, "learning_rate": 1.3321611318837033e-05, "loss": 1.5516, "step": 906 }, { "epoch": 0.2178300591150654, "grad_norm": 6.15625, "learning_rate": 1.3222656952305113e-05, "loss": 1.5349, "step": 912 }, { "epoch": 0.21926315160924345, "grad_norm": 5.21875, "learning_rate": 1.3123349185122328e-05, "loss": 1.6652, "step": 918 }, { "epoch": 0.2206962441034215, "grad_norm": 17.25, "learning_rate": 1.3023698907504447e-05, "loss": 1.7149, "step": 924 }, { "epoch": 0.22212933659759956, "grad_norm": 6.8125, "learning_rate": 1.2923717047227368e-05, "loss": 1.6285, "step": 930 }, { "epoch": 0.22356242909177762, "grad_norm": 4.1875, "learning_rate": 1.2823414568428767e-05, "loss": 1.5982, "step": 936 }, { "epoch": 0.2249955215859557, "grad_norm": 5.8125, "learning_rate": 1.2722802470405744e-05, "loss": 1.5901, "step": 942 }, { "epoch": 0.22642861408013376, "grad_norm": 4.75, "learning_rate": 1.2621891786408648e-05, "loss": 1.5705, "step": 948 }, { "epoch": 0.22786170657431182, "grad_norm": 10.1875, "learning_rate": 1.252069358243114e-05, "loss": 1.5263, "step": 954 }, { "epoch": 0.22929479906848987, "grad_norm": 3.671875, "learning_rate": 1.2419218955996677e-05, "loss": 1.5622, "step": 960 }, { "epoch": 0.23072789156266793, "grad_norm": 4.625, "learning_rate": 1.2317479034941572e-05, "loss": 1.5984, "step": 966 }, { "epoch": 0.23216098405684601, "grad_norm": 7.21875, "learning_rate": 1.2215484976194675e-05, "loss": 1.6465, "step": 972 }, { "epoch": 0.23359407655102407, "grad_norm": 6.59375, "learning_rate": 1.211324796455389e-05, "loss": 1.705, "step": 978 }, { "epoch": 0.23502716904520213, "grad_norm": 5.96875, "learning_rate": 1.2010779211459649e-05, "loss": 1.5316, "step": 984 }, { "epoch": 0.23646026153938018, "grad_norm": 5.3125, "learning_rate": 1.190808995376545e-05, "loss": 1.4676, "step": 990 }, { "epoch": 0.23789335403355824, "grad_norm": 4.9375, "learning_rate": 1.1805191452505602e-05, "loss": 1.5319, "step": 996 }, { "epoch": 0.2393264465277363, "grad_norm": 5.625, "learning_rate": 1.1702094991660326e-05, "loss": 1.6112, "step": 1002 }, { "epoch": 0.24075953902191438, "grad_norm": 4.71875, "learning_rate": 1.159881187691835e-05, "loss": 1.6341, "step": 1008 }, { "epoch": 0.24219263151609244, "grad_norm": 4.3125, "learning_rate": 1.1495353434437098e-05, "loss": 1.4623, "step": 1014 }, { "epoch": 0.2436257240102705, "grad_norm": 19.625, "learning_rate": 1.1391731009600655e-05, "loss": 1.4166, "step": 1020 }, { "epoch": 0.24505881650444855, "grad_norm": 4.0625, "learning_rate": 1.128795596577563e-05, "loss": 1.5813, "step": 1026 }, { "epoch": 0.2464919089986266, "grad_norm": 6.25, "learning_rate": 1.1184039683065014e-05, "loss": 1.5772, "step": 1032 }, { "epoch": 0.2479250014928047, "grad_norm": 5.53125, "learning_rate": 1.1079993557060228e-05, "loss": 1.401, "step": 1038 }, { "epoch": 0.24935809398698275, "grad_norm": 6.65625, "learning_rate": 1.0975828997591496e-05, "loss": 1.6248, "step": 1044 }, { "epoch": 0.2507911864811608, "grad_norm": 856.0, "learning_rate": 1.0871557427476585e-05, "loss": 1.775, "step": 1050 }, { "epoch": 0.2522242789753389, "grad_norm": 4.1875, "learning_rate": 1.0767190281268187e-05, "loss": 1.586, "step": 1056 }, { "epoch": 0.25365737146951695, "grad_norm": 3.53125, "learning_rate": 1.0662739004000005e-05, "loss": 1.5397, "step": 1062 }, { "epoch": 0.255090463963695, "grad_norm": 4.125, "learning_rate": 1.055821504993164e-05, "loss": 1.8712, "step": 1068 }, { "epoch": 0.25652355645787306, "grad_norm": 5.1875, "learning_rate": 1.0453629881292537e-05, "loss": 1.5357, "step": 1074 }, { "epoch": 0.2579566489520511, "grad_norm": 3.921875, "learning_rate": 1.0348994967025012e-05, "loss": 1.4033, "step": 1080 }, { "epoch": 0.25938974144622917, "grad_norm": 5.3125, "learning_rate": 1.0244321781526533e-05, "loss": 1.5611, "step": 1086 }, { "epoch": 0.26082283394040723, "grad_norm": 4.8125, "learning_rate": 1.0139621803391454e-05, "loss": 1.577, "step": 1092 }, { "epoch": 0.2622559264345853, "grad_norm": 5.46875, "learning_rate": 1.0034906514152239e-05, "loss": 1.5149, "step": 1098 }, { "epoch": 0.26368901892876334, "grad_norm": 6.4375, "learning_rate": 9.930187397020385e-06, "loss": 1.5796, "step": 1104 }, { "epoch": 0.2651221114229414, "grad_norm": 4.28125, "learning_rate": 9.825475935627165e-06, "loss": 1.5702, "step": 1110 }, { "epoch": 0.2665552039171195, "grad_norm": 5.34375, "learning_rate": 9.720783612764314e-06, "loss": 1.5354, "step": 1116 }, { "epoch": 0.26798829641129757, "grad_norm": 4.375, "learning_rate": 9.616121909124801e-06, "loss": 1.4122, "step": 1122 }, { "epoch": 0.2694213889054756, "grad_norm": 5.46875, "learning_rate": 9.511502302043867e-06, "loss": 1.6959, "step": 1128 }, { "epoch": 0.2708544813996537, "grad_norm": 8.4375, "learning_rate": 9.406936264240386e-06, "loss": 1.5493, "step": 1134 }, { "epoch": 0.27228757389383174, "grad_norm": 5.46875, "learning_rate": 9.302435262558748e-06, "loss": 1.4156, "step": 1140 }, { "epoch": 0.2737206663880098, "grad_norm": 720.0, "learning_rate": 9.198010756711413e-06, "loss": 1.567, "step": 1146 }, { "epoch": 0.27515375888218785, "grad_norm": 3.875, "learning_rate": 9.093674198022201e-06, "loss": 1.3814, "step": 1152 }, { "epoch": 0.2765868513763659, "grad_norm": 3.671875, "learning_rate": 8.989437028170537e-06, "loss": 1.4261, "step": 1158 }, { "epoch": 0.27801994387054396, "grad_norm": 10.375, "learning_rate": 8.885310677936746e-06, "loss": 1.506, "step": 1164 }, { "epoch": 0.279453036364722, "grad_norm": 3.46875, "learning_rate": 8.781306565948528e-06, "loss": 1.3967, "step": 1170 }, { "epoch": 0.2808861288589001, "grad_norm": 3.984375, "learning_rate": 8.677436097428775e-06, "loss": 1.5761, "step": 1176 }, { "epoch": 0.2823192213530782, "grad_norm": 3.484375, "learning_rate": 8.573710662944884e-06, "loss": 1.5428, "step": 1182 }, { "epoch": 0.28375231384725624, "grad_norm": 6.25, "learning_rate": 8.47014163715962e-06, "loss": 1.5426, "step": 1188 }, { "epoch": 0.2851854063414343, "grad_norm": 6.25, "learning_rate": 8.366740377583781e-06, "loss": 1.503, "step": 1194 }, { "epoch": 0.28661849883561236, "grad_norm": 3.828125, "learning_rate": 8.263518223330698e-06, "loss": 1.4355, "step": 1200 }, { "epoch": 0.28661849883561236, "eval_loss": 1.315157413482666, "eval_runtime": 223.8181, "eval_samples_per_second": 3.798, "eval_steps_per_second": 3.798, "step": 1200 }, { "epoch": 0.2880515913297904, "grad_norm": 5.625, "learning_rate": 8.1604864938728e-06, "loss": 1.4389, "step": 1206 }, { "epoch": 0.28948468382396847, "grad_norm": 5.0625, "learning_rate": 8.057656487800283e-06, "loss": 1.5346, "step": 1212 }, { "epoch": 0.2909177763181465, "grad_norm": 4.21875, "learning_rate": 7.955039481582098e-06, "loss": 1.4492, "step": 1218 }, { "epoch": 0.2923508688123246, "grad_norm": 4.9375, "learning_rate": 7.852646728329368e-06, "loss": 1.4305, "step": 1224 }, { "epoch": 0.29378396130650264, "grad_norm": 4.9375, "learning_rate": 7.750489456561351e-06, "loss": 1.607, "step": 1230 }, { "epoch": 0.2952170538006807, "grad_norm": 4.90625, "learning_rate": 7.6485788689741e-06, "loss": 1.3777, "step": 1236 }, { "epoch": 0.2966501462948588, "grad_norm": 5.875, "learning_rate": 7.546926141211975e-06, "loss": 1.5751, "step": 1242 }, { "epoch": 0.29808323878903686, "grad_norm": 4.8125, "learning_rate": 7.445542420642097e-06, "loss": 1.5106, "step": 1248 }, { "epoch": 0.2995163312832149, "grad_norm": 4.875, "learning_rate": 7.344438825131912e-06, "loss": 1.5982, "step": 1254 }, { "epoch": 0.300949423777393, "grad_norm": 5.09375, "learning_rate": 7.243626441830009e-06, "loss": 1.5328, "step": 1260 }, { "epoch": 0.30238251627157103, "grad_norm": 4.09375, "learning_rate": 7.143116325950266e-06, "loss": 1.6138, "step": 1266 }, { "epoch": 0.3038156087657491, "grad_norm": 3.8125, "learning_rate": 7.042919499559538e-06, "loss": 1.4547, "step": 1272 }, { "epoch": 0.30524870125992715, "grad_norm": 4.1875, "learning_rate": 6.943046950368944e-06, "loss": 1.4393, "step": 1278 }, { "epoch": 0.3066817937541052, "grad_norm": 5.34375, "learning_rate": 6.843509630528977e-06, "loss": 1.4009, "step": 1284 }, { "epoch": 0.30811488624828326, "grad_norm": 5.125, "learning_rate": 6.744318455428436e-06, "loss": 1.5134, "step": 1290 }, { "epoch": 0.3095479787424613, "grad_norm": 4.96875, "learning_rate": 6.645484302497452e-06, "loss": 1.5411, "step": 1296 }, { "epoch": 0.3109810712366394, "grad_norm": 4.9375, "learning_rate": 6.547018010014654e-06, "loss": 1.5058, "step": 1302 }, { "epoch": 0.3124141637308175, "grad_norm": 3.59375, "learning_rate": 6.448930375918632e-06, "loss": 1.4026, "step": 1308 }, { "epoch": 0.31384725622499554, "grad_norm": 4.78125, "learning_rate": 6.351232156623803e-06, "loss": 1.3993, "step": 1314 }, { "epoch": 0.3152803487191736, "grad_norm": 4.21875, "learning_rate": 6.25393406584088e-06, "loss": 1.6574, "step": 1320 }, { "epoch": 0.31671344121335165, "grad_norm": 4.40625, "learning_rate": 6.157046773401964e-06, "loss": 1.5233, "step": 1326 }, { "epoch": 0.3181465337075297, "grad_norm": 5.25, "learning_rate": 6.06058090409049e-06, "loss": 1.5095, "step": 1332 }, { "epoch": 0.31957962620170777, "grad_norm": 4.625, "learning_rate": 5.9645470364761e-06, "loss": 1.3797, "step": 1338 }, { "epoch": 0.3210127186958858, "grad_norm": 5.84375, "learning_rate": 5.868955701754584e-06, "loss": 1.6089, "step": 1344 }, { "epoch": 0.3224458111900639, "grad_norm": 3.71875, "learning_rate": 5.773817382593008e-06, "loss": 1.4297, "step": 1350 }, { "epoch": 0.32387890368424194, "grad_norm": 3.578125, "learning_rate": 5.679142511980176e-06, "loss": 1.327, "step": 1356 }, { "epoch": 0.32531199617842, "grad_norm": 4.6875, "learning_rate": 5.584941472082549e-06, "loss": 1.4878, "step": 1362 }, { "epoch": 0.32674508867259805, "grad_norm": 5.125, "learning_rate": 5.491224593105695e-06, "loss": 1.4593, "step": 1368 }, { "epoch": 0.32817818116677616, "grad_norm": 7.1875, "learning_rate": 5.398002152161484e-06, "loss": 1.5287, "step": 1374 }, { "epoch": 0.3296112736609542, "grad_norm": 5.71875, "learning_rate": 5.305284372141095e-06, "loss": 1.4808, "step": 1380 }, { "epoch": 0.3310443661551323, "grad_norm": 4.09375, "learning_rate": 5.213081420593933e-06, "loss": 1.4244, "step": 1386 }, { "epoch": 0.33247745864931033, "grad_norm": 9.5, "learning_rate": 5.121403408612672e-06, "loss": 1.5213, "step": 1392 }, { "epoch": 0.3339105511434884, "grad_norm": 5.09375, "learning_rate": 5.030260389724447e-06, "loss": 1.4455, "step": 1398 }, { "epoch": 0.33534364363766644, "grad_norm": 6.6875, "learning_rate": 4.939662358788364e-06, "loss": 1.5983, "step": 1404 }, { "epoch": 0.3367767361318445, "grad_norm": 4.96875, "learning_rate": 4.849619250899458e-06, "loss": 1.3544, "step": 1410 }, { "epoch": 0.33820982862602256, "grad_norm": 4.65625, "learning_rate": 4.76014094029921e-06, "loss": 1.4412, "step": 1416 }, { "epoch": 0.3396429211202006, "grad_norm": 6.40625, "learning_rate": 4.671237239292699e-06, "loss": 1.4463, "step": 1422 }, { "epoch": 0.34107601361437867, "grad_norm": 5.25, "learning_rate": 4.582917897172603e-06, "loss": 1.5306, "step": 1428 }, { "epoch": 0.3425091061085568, "grad_norm": 4.40625, "learning_rate": 4.495192599150045e-06, "loss": 1.5532, "step": 1434 }, { "epoch": 0.34394219860273484, "grad_norm": 5.15625, "learning_rate": 4.408070965292534e-06, "loss": 1.4818, "step": 1440 }, { "epoch": 0.3453752910969129, "grad_norm": 4.125, "learning_rate": 4.321562549468991e-06, "loss": 1.4144, "step": 1446 }, { "epoch": 0.34680838359109095, "grad_norm": 4.28125, "learning_rate": 4.235676838302069e-06, "loss": 1.4173, "step": 1452 }, { "epoch": 0.348241476085269, "grad_norm": 8.5, "learning_rate": 4.150423250127846e-06, "loss": 1.4121, "step": 1458 }, { "epoch": 0.34967456857944706, "grad_norm": 5.90625, "learning_rate": 4.065811133962987e-06, "loss": 1.4121, "step": 1464 }, { "epoch": 0.3511076610736251, "grad_norm": 4.625, "learning_rate": 3.981849768479516e-06, "loss": 1.3973, "step": 1470 }, { "epoch": 0.3525407535678032, "grad_norm": 5.1875, "learning_rate": 3.898548360987325e-06, "loss": 1.4554, "step": 1476 }, { "epoch": 0.35397384606198123, "grad_norm": 5.40625, "learning_rate": 3.81591604642446e-06, "loss": 1.4958, "step": 1482 }, { "epoch": 0.3554069385561593, "grad_norm": 5.28125, "learning_rate": 3.7339618863553983e-06, "loss": 1.4843, "step": 1488 }, { "epoch": 0.35684003105033735, "grad_norm": 5.96875, "learning_rate": 3.6526948679773256e-06, "loss": 1.6051, "step": 1494 }, { "epoch": 0.35827312354451546, "grad_norm": 3.6875, "learning_rate": 3.5721239031346067e-06, "loss": 1.4176, "step": 1500 }, { "epoch": 0.3597062160386935, "grad_norm": 4.375, "learning_rate": 3.492257827341492e-06, "loss": 1.4049, "step": 1506 }, { "epoch": 0.3611393085328716, "grad_norm": 3.71875, "learning_rate": 3.4131053988131947e-06, "loss": 1.5823, "step": 1512 }, { "epoch": 0.36257240102704963, "grad_norm": 6.0, "learning_rate": 3.3346752975054763e-06, "loss": 1.4469, "step": 1518 }, { "epoch": 0.3640054935212277, "grad_norm": 4.21875, "learning_rate": 3.2569761241627694e-06, "loss": 1.4373, "step": 1524 }, { "epoch": 0.36543858601540574, "grad_norm": 6.03125, "learning_rate": 3.1800163993750166e-06, "loss": 1.4823, "step": 1530 }, { "epoch": 0.3668716785095838, "grad_norm": 4.625, "learning_rate": 3.103804562643302e-06, "loss": 1.4585, "step": 1536 }, { "epoch": 0.36830477100376185, "grad_norm": 4.28125, "learning_rate": 3.028348971454356e-06, "loss": 1.4233, "step": 1542 }, { "epoch": 0.3697378634979399, "grad_norm": 14.625, "learning_rate": 2.953657900364053e-06, "loss": 1.4869, "step": 1548 }, { "epoch": 0.37117095599211797, "grad_norm": 4.1875, "learning_rate": 2.8797395400900362e-06, "loss": 1.5315, "step": 1554 }, { "epoch": 0.3726040484862961, "grad_norm": 4.125, "learning_rate": 2.8066019966134907e-06, "loss": 1.4887, "step": 1560 }, { "epoch": 0.37403714098047414, "grad_norm": 3.796875, "learning_rate": 2.7342532902902418e-06, "loss": 1.4533, "step": 1566 }, { "epoch": 0.3754702334746522, "grad_norm": 4.03125, "learning_rate": 2.6627013549712355e-06, "loss": 1.4017, "step": 1572 }, { "epoch": 0.37690332596883025, "grad_norm": 6.84375, "learning_rate": 2.5919540371325005e-06, "loss": 1.3971, "step": 1578 }, { "epoch": 0.3783364184630083, "grad_norm": 5.5625, "learning_rate": 2.522019095014683e-06, "loss": 1.5576, "step": 1584 }, { "epoch": 0.37976951095718636, "grad_norm": 10.875, "learning_rate": 2.45290419777228e-06, "loss": 1.4719, "step": 1590 }, { "epoch": 0.3812026034513644, "grad_norm": 5.15625, "learning_rate": 2.3846169246326345e-06, "loss": 1.4618, "step": 1596 }, { "epoch": 0.3821579984474831, "eval_loss": 1.2876688241958618, "eval_runtime": 226.2654, "eval_samples_per_second": 3.757, "eval_steps_per_second": 3.757, "step": 1600 } ], "logging_steps": 6, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "total_flos": 2.9553261973639004e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }