{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.434891346998684, "learning_rate": 1.801801801801802e-07, "loss": 1.1387, "step": 1 }, { "epoch": 0.0, "grad_norm": 6.054112603313701, "learning_rate": 9.00900900900901e-07, "loss": 1.0767, "step": 5 }, { "epoch": 0.01, "grad_norm": 2.720345942108079, "learning_rate": 1.801801801801802e-06, "loss": 1.0177, "step": 10 }, { "epoch": 0.01, "grad_norm": 3.1804731128565717, "learning_rate": 2.702702702702703e-06, "loss": 1.0144, "step": 15 }, { "epoch": 0.02, "grad_norm": 2.301849657938967, "learning_rate": 3.603603603603604e-06, "loss": 0.9791, "step": 20 }, { "epoch": 0.02, "grad_norm": 2.4337369010470637, "learning_rate": 4.504504504504505e-06, "loss": 0.9802, "step": 25 }, { "epoch": 0.03, "grad_norm": 1.9302145168774039, "learning_rate": 5.405405405405406e-06, "loss": 0.986, "step": 30 }, { "epoch": 0.03, "grad_norm": 1.8524872666819012, "learning_rate": 6.3063063063063065e-06, "loss": 0.9589, "step": 35 }, { "epoch": 0.04, "grad_norm": 1.8144517560069848, "learning_rate": 7.207207207207208e-06, "loss": 0.9633, "step": 40 }, { "epoch": 0.04, "grad_norm": 2.010937969129206, "learning_rate": 8.108108108108109e-06, "loss": 0.9687, "step": 45 }, { "epoch": 0.05, "grad_norm": 2.004039674415607, "learning_rate": 9.00900900900901e-06, "loss": 0.9419, "step": 50 }, { "epoch": 0.05, "grad_norm": 1.9619351066753616, "learning_rate": 9.90990990990991e-06, "loss": 0.9751, "step": 55 }, { "epoch": 0.05, "grad_norm": 2.117833280361577, "learning_rate": 1.0810810810810812e-05, "loss": 0.9631, "step": 60 }, { "epoch": 0.06, "grad_norm": 3.1049875306196606, "learning_rate": 1.1711711711711713e-05, "loss": 0.9668, "step": 65 }, { "epoch": 0.06, "grad_norm": 2.214128699414308, "learning_rate": 1.2612612612612613e-05, "loss": 0.9635, "step": 70 }, { "epoch": 0.07, "grad_norm": 1.737941228120663, "learning_rate": 1.3513513513513515e-05, "loss": 0.976, "step": 75 }, { "epoch": 0.07, "grad_norm": 1.8456119696518833, "learning_rate": 1.4414414414414416e-05, "loss": 0.9846, "step": 80 }, { "epoch": 0.08, "grad_norm": 1.7815352366071144, "learning_rate": 1.5315315315315316e-05, "loss": 0.9783, "step": 85 }, { "epoch": 0.08, "grad_norm": 2.0381827638345844, "learning_rate": 1.6216216216216218e-05, "loss": 0.9701, "step": 90 }, { "epoch": 0.09, "grad_norm": 3.8119526094398513, "learning_rate": 1.711711711711712e-05, "loss": 0.9867, "step": 95 }, { "epoch": 0.09, "grad_norm": 2.3244802021753816, "learning_rate": 1.801801801801802e-05, "loss": 0.9799, "step": 100 }, { "epoch": 0.09, "grad_norm": 2.193951315428562, "learning_rate": 1.891891891891892e-05, "loss": 1.0084, "step": 105 }, { "epoch": 0.1, "grad_norm": 2.2121294806446365, "learning_rate": 1.981981981981982e-05, "loss": 0.9731, "step": 110 }, { "epoch": 0.1, "grad_norm": 2.105745826240662, "learning_rate": 1.999920408755684e-05, "loss": 0.9968, "step": 115 }, { "epoch": 0.11, "grad_norm": 2.0624008245016165, "learning_rate": 1.9995970910394228e-05, "loss": 1.0007, "step": 120 }, { "epoch": 0.11, "grad_norm": 1.9216817238638413, "learning_rate": 1.9990251527524178e-05, "loss": 0.9864, "step": 125 }, { "epoch": 0.12, "grad_norm": 1.9416400115913934, "learning_rate": 1.998204736147608e-05, "loss": 0.982, "step": 130 }, { "epoch": 0.12, "grad_norm": 1.992023353075518, "learning_rate": 1.9971360452796523e-05, "loss": 0.9901, "step": 135 }, { "epoch": 0.13, "grad_norm": 2.082117848176388, "learning_rate": 1.9958193459541804e-05, "loss": 1.0065, "step": 140 }, { "epoch": 0.13, "grad_norm": 1.9564902575341125, "learning_rate": 1.994254965661679e-05, "loss": 1.0058, "step": 145 }, { "epoch": 0.14, "grad_norm": 25.696326656261338, "learning_rate": 1.9924432934960384e-05, "loss": 1.1023, "step": 150 }, { "epoch": 0.14, "grad_norm": 15.718602388026698, "learning_rate": 1.9903847800577777e-05, "loss": 1.1922, "step": 155 }, { "epoch": 0.14, "grad_norm": 6.361436623798567, "learning_rate": 1.9880799373419698e-05, "loss": 1.1022, "step": 160 }, { "epoch": 0.15, "grad_norm": 14.018808638392496, "learning_rate": 1.9855293386108995e-05, "loss": 1.0509, "step": 165 }, { "epoch": 0.15, "grad_norm": 2.396996729925699, "learning_rate": 1.982733618251478e-05, "loss": 1.0454, "step": 170 }, { "epoch": 0.16, "grad_norm": 2.0783372576733012, "learning_rate": 1.979693471617462e-05, "loss": 1.0209, "step": 175 }, { "epoch": 0.16, "grad_norm": 2.3172426926555207, "learning_rate": 1.976409654856501e-05, "loss": 1.0314, "step": 180 }, { "epoch": 0.17, "grad_norm": 2.0444492620330186, "learning_rate": 1.97288298472207e-05, "loss": 1.003, "step": 185 }, { "epoch": 0.17, "grad_norm": 1.881906410360296, "learning_rate": 1.969114338370324e-05, "loss": 1.0024, "step": 190 }, { "epoch": 0.18, "grad_norm": 1.6740200356982151, "learning_rate": 1.9651046531419335e-05, "loss": 1.0041, "step": 195 }, { "epoch": 0.18, "grad_norm": 1.744682737538121, "learning_rate": 1.960854926328946e-05, "loss": 1.0108, "step": 200 }, { "epoch": 0.19, "grad_norm": 1.7487489845353397, "learning_rate": 1.9563662149267405e-05, "loss": 1.0009, "step": 205 }, { "epoch": 0.19, "grad_norm": 1.7482821787721043, "learning_rate": 1.9516396353711297e-05, "loss": 1.008, "step": 210 }, { "epoch": 0.19, "grad_norm": 1.7209209399401664, "learning_rate": 1.946676363260679e-05, "loss": 0.9967, "step": 215 }, { "epoch": 0.2, "grad_norm": 1.6520280771666889, "learning_rate": 1.9414776330643126e-05, "loss": 0.991, "step": 220 }, { "epoch": 0.2, "grad_norm": 1.899715362613167, "learning_rate": 1.936044737814273e-05, "loss": 1.0021, "step": 225 }, { "epoch": 0.21, "grad_norm": 1.6989493561259785, "learning_rate": 1.9303790287845183e-05, "loss": 0.9902, "step": 230 }, { "epoch": 0.21, "grad_norm": 1.7638925291836884, "learning_rate": 1.9244819151546325e-05, "loss": 0.9976, "step": 235 }, { "epoch": 0.22, "grad_norm": 1.564735067537867, "learning_rate": 1.9183548636593322e-05, "loss": 0.9787, "step": 240 }, { "epoch": 0.22, "grad_norm": 1.7635286854536336, "learning_rate": 1.9119993982236608e-05, "loss": 0.9937, "step": 245 }, { "epoch": 0.23, "grad_norm": 1.5741256061070714, "learning_rate": 1.9054170995839546e-05, "loss": 0.9648, "step": 250 }, { "epoch": 0.23, "grad_norm": 1.7091189941765559, "learning_rate": 1.8986096048946826e-05, "loss": 0.9818, "step": 255 }, { "epoch": 0.23, "grad_norm": 1.5405427151225155, "learning_rate": 1.8915786073212508e-05, "loss": 0.9958, "step": 260 }, { "epoch": 0.24, "grad_norm": 1.5492700267103499, "learning_rate": 1.8843258556188787e-05, "loss": 0.9924, "step": 265 }, { "epoch": 0.24, "grad_norm": 1.6206079313144814, "learning_rate": 1.8768531536976452e-05, "loss": 0.9804, "step": 270 }, { "epoch": 0.25, "grad_norm": 1.5730537338272383, "learning_rate": 1.86916236017382e-05, "loss": 0.9847, "step": 275 }, { "epoch": 0.25, "grad_norm": 1.6501336616437525, "learning_rate": 1.8612553879075875e-05, "loss": 0.983, "step": 280 }, { "epoch": 0.26, "grad_norm": 1.5113060463908174, "learning_rate": 1.8531342035272768e-05, "loss": 0.981, "step": 285 }, { "epoch": 0.26, "grad_norm": 1.5929173745457115, "learning_rate": 1.844800826940223e-05, "loss": 0.9789, "step": 290 }, { "epoch": 0.27, "grad_norm": 1.598141976179977, "learning_rate": 1.836257330830372e-05, "loss": 1.0036, "step": 295 }, { "epoch": 0.27, "grad_norm": 1.8798731248860165, "learning_rate": 1.8275058401427622e-05, "loss": 0.9704, "step": 300 }, { "epoch": 0.28, "grad_norm": 2.195099845300529, "learning_rate": 1.8185485315550062e-05, "loss": 0.968, "step": 305 }, { "epoch": 0.28, "grad_norm": 1.6333370138254395, "learning_rate": 1.809387632935906e-05, "loss": 0.9884, "step": 310 }, { "epoch": 0.28, "grad_norm": 1.6383788031138038, "learning_rate": 1.8000254227913346e-05, "loss": 0.976, "step": 315 }, { "epoch": 0.29, "grad_norm": 1.59375997270626, "learning_rate": 1.7904642296975263e-05, "loss": 0.9752, "step": 320 }, { "epoch": 0.29, "grad_norm": 1.5470092112631526, "learning_rate": 1.7807064317219096e-05, "loss": 0.9684, "step": 325 }, { "epoch": 0.3, "grad_norm": 1.5761252347776267, "learning_rate": 1.7707544558316332e-05, "loss": 0.983, "step": 330 }, { "epoch": 0.3, "grad_norm": 1.4924968164344596, "learning_rate": 1.760610777289929e-05, "loss": 0.9843, "step": 335 }, { "epoch": 0.31, "grad_norm": 1.4359693377296177, "learning_rate": 1.7502779190404615e-05, "loss": 0.9634, "step": 340 }, { "epoch": 0.31, "grad_norm": 1.5878997655742073, "learning_rate": 1.7397584510798208e-05, "loss": 0.9758, "step": 345 }, { "epoch": 0.32, "grad_norm": 1.6814035465687263, "learning_rate": 1.7290549898183113e-05, "loss": 0.967, "step": 350 }, { "epoch": 0.32, "grad_norm": 1.4871838892694749, "learning_rate": 1.7181701974291927e-05, "loss": 0.953, "step": 355 }, { "epoch": 0.33, "grad_norm": 1.677903630677117, "learning_rate": 1.7071067811865477e-05, "loss": 0.9638, "step": 360 }, { "epoch": 0.33, "grad_norm": 1.7990895100977344, "learning_rate": 1.6958674927919213e-05, "loss": 0.9904, "step": 365 }, { "epoch": 0.33, "grad_norm": 1.577389298855143, "learning_rate": 1.6844551276899184e-05, "loss": 0.9714, "step": 370 }, { "epoch": 0.34, "grad_norm": 1.6428970048321387, "learning_rate": 1.672872524372919e-05, "loss": 0.9925, "step": 375 }, { "epoch": 0.34, "grad_norm": 1.6313220262054722, "learning_rate": 1.6611225636750838e-05, "loss": 0.9579, "step": 380 }, { "epoch": 0.35, "grad_norm": 1.4972553968322877, "learning_rate": 1.649208168055833e-05, "loss": 0.9747, "step": 385 }, { "epoch": 0.35, "grad_norm": 1.5447739295558658, "learning_rate": 1.637132300872969e-05, "loss": 0.9805, "step": 390 }, { "epoch": 0.36, "grad_norm": 1.5876140330526054, "learning_rate": 1.6248979656456273e-05, "loss": 0.9684, "step": 395 }, { "epoch": 0.36, "grad_norm": 1.5582777256666884, "learning_rate": 1.6125082053072408e-05, "loss": 0.957, "step": 400 }, { "epoch": 0.37, "grad_norm": 1.5476213766084626, "learning_rate": 1.5999661014486956e-05, "loss": 0.9861, "step": 405 }, { "epoch": 0.37, "grad_norm": 1.6383105709040227, "learning_rate": 1.58727477355188e-05, "loss": 0.9793, "step": 410 }, { "epoch": 0.37, "grad_norm": 1.5270291237304714, "learning_rate": 1.5744373782137993e-05, "loss": 0.9608, "step": 415 }, { "epoch": 0.38, "grad_norm": 1.5686224715893557, "learning_rate": 1.5614571083614683e-05, "loss": 0.975, "step": 420 }, { "epoch": 0.38, "grad_norm": 1.5293178485058705, "learning_rate": 1.5483371924577633e-05, "loss": 0.9632, "step": 425 }, { "epoch": 0.39, "grad_norm": 1.4815279637987373, "learning_rate": 1.535080893698435e-05, "loss": 0.9689, "step": 430 }, { "epoch": 0.39, "grad_norm": 1.5169260213036269, "learning_rate": 1.5216915092004847e-05, "loss": 0.9809, "step": 435 }, { "epoch": 0.4, "grad_norm": 1.4976729343178568, "learning_rate": 1.5081723691821029e-05, "loss": 0.9712, "step": 440 }, { "epoch": 0.4, "grad_norm": 1.4442693064244245, "learning_rate": 1.4945268361343747e-05, "loss": 0.9815, "step": 445 }, { "epoch": 0.41, "grad_norm": 1.55780608281581, "learning_rate": 1.4807583039849589e-05, "loss": 0.9872, "step": 450 }, { "epoch": 0.41, "grad_norm": 1.454973247549993, "learning_rate": 1.4668701972539459e-05, "loss": 0.953, "step": 455 }, { "epoch": 0.42, "grad_norm": 1.5744354457111398, "learning_rate": 1.4528659702021108e-05, "loss": 0.9569, "step": 460 }, { "epoch": 0.42, "grad_norm": 1.5374683976132577, "learning_rate": 1.4387491059717653e-05, "loss": 0.9544, "step": 465 }, { "epoch": 0.42, "grad_norm": 1.5356499554288368, "learning_rate": 1.4245231157204282e-05, "loss": 0.9762, "step": 470 }, { "epoch": 0.43, "grad_norm": 1.451816573803636, "learning_rate": 1.4101915377475275e-05, "loss": 0.9484, "step": 475 }, { "epoch": 0.43, "grad_norm": 1.4189149204667209, "learning_rate": 1.3957579366143521e-05, "loss": 0.9568, "step": 480 }, { "epoch": 0.44, "grad_norm": 1.4438844550057277, "learning_rate": 1.3812259022574717e-05, "loss": 0.9678, "step": 485 }, { "epoch": 0.44, "grad_norm": 1.4944146133812288, "learning_rate": 1.3665990490958438e-05, "loss": 0.9684, "step": 490 }, { "epoch": 0.45, "grad_norm": 1.567533543577245, "learning_rate": 1.351881015131833e-05, "loss": 0.9523, "step": 495 }, { "epoch": 0.45, "grad_norm": 1.4298833694464113, "learning_rate": 1.3370754610463655e-05, "loss": 0.9547, "step": 500 }, { "epoch": 0.46, "grad_norm": 1.586075303116762, "learning_rate": 1.3221860692884396e-05, "loss": 0.9621, "step": 505 }, { "epoch": 0.46, "grad_norm": 1.4390997480170529, "learning_rate": 1.307216543159225e-05, "loss": 0.9361, "step": 510 }, { "epoch": 0.47, "grad_norm": 1.5962792654589735, "learning_rate": 1.2921706058909757e-05, "loss": 0.952, "step": 515 }, { "epoch": 0.47, "grad_norm": 1.5112017838877818, "learning_rate": 1.2770519997209837e-05, "loss": 0.9501, "step": 520 }, { "epoch": 0.47, "grad_norm": 1.46430521195488, "learning_rate": 1.2618644849608068e-05, "loss": 0.9656, "step": 525 }, { "epoch": 0.48, "grad_norm": 1.472561859950697, "learning_rate": 1.246611839061002e-05, "loss": 0.9545, "step": 530 }, { "epoch": 0.48, "grad_norm": 1.5276258975583332, "learning_rate": 1.2312978556715934e-05, "loss": 0.9502, "step": 535 }, { "epoch": 0.49, "grad_norm": 1.5767752712595098, "learning_rate": 1.2159263436985139e-05, "loss": 0.9497, "step": 540 }, { "epoch": 0.49, "grad_norm": 1.4814084910286585, "learning_rate": 1.2005011263562514e-05, "loss": 0.953, "step": 545 }, { "epoch": 0.5, "grad_norm": 1.4811159993074694, "learning_rate": 1.185026040216934e-05, "loss": 0.9517, "step": 550 }, { "epoch": 0.5, "grad_norm": 1.5425020495972324, "learning_rate": 1.1695049342560969e-05, "loss": 0.9536, "step": 555 }, { "epoch": 0.51, "grad_norm": 1.4813121305496708, "learning_rate": 1.1539416688953613e-05, "loss": 0.9566, "step": 560 }, { "epoch": 0.51, "grad_norm": 1.5386521152381667, "learning_rate": 1.138340115042267e-05, "loss": 0.968, "step": 565 }, { "epoch": 0.51, "grad_norm": 1.4985329754887164, "learning_rate": 1.1227041531274978e-05, "loss": 0.9536, "step": 570 }, { "epoch": 0.52, "grad_norm": 1.7426405055010268, "learning_rate": 1.1070376721397374e-05, "loss": 0.9387, "step": 575 }, { "epoch": 0.52, "grad_norm": 1.4876962547232626, "learning_rate": 1.0913445686583974e-05, "loss": 0.9479, "step": 580 }, { "epoch": 0.53, "grad_norm": 1.4005947575155968, "learning_rate": 1.075628745884457e-05, "loss": 0.94, "step": 585 }, { "epoch": 0.53, "grad_norm": 1.4806276567215155, "learning_rate": 1.0598941126696545e-05, "loss": 0.9537, "step": 590 }, { "epoch": 0.54, "grad_norm": 1.3867191265630952, "learning_rate": 1.0441445825442773e-05, "loss": 0.9362, "step": 595 }, { "epoch": 0.54, "grad_norm": 1.4082068202931468, "learning_rate": 1.0283840727437832e-05, "loss": 0.9391, "step": 600 }, { "epoch": 0.55, "grad_norm": 1.4462569599659194, "learning_rate": 1.012616503234504e-05, "loss": 0.9655, "step": 605 }, { "epoch": 0.55, "grad_norm": 1.4280342668958195, "learning_rate": 9.968457957386663e-06, "loss": 0.9297, "step": 610 }, { "epoch": 0.56, "grad_norm": 1.4502290522153605, "learning_rate": 9.810758727589814e-06, "loss": 0.9486, "step": 615 }, { "epoch": 0.56, "grad_norm": 1.3531947713121897, "learning_rate": 9.65310656603033e-06, "loss": 0.9374, "step": 620 }, { "epoch": 0.56, "grad_norm": 1.5003587863241752, "learning_rate": 9.495540684077215e-06, "loss": 0.952, "step": 625 }, { "epoch": 0.57, "grad_norm": 1.4085207232358623, "learning_rate": 9.338100271639932e-06, "loss": 0.9211, "step": 630 }, { "epoch": 0.57, "grad_norm": 1.4167180963126849, "learning_rate": 9.180824487421077e-06, "loss": 0.9291, "step": 635 }, { "epoch": 0.58, "grad_norm": 1.444242534136093, "learning_rate": 9.023752449176773e-06, "loss": 0.9338, "step": 640 }, { "epoch": 0.58, "grad_norm": 1.474627058570353, "learning_rate": 8.866923223987303e-06, "loss": 0.932, "step": 645 }, { "epoch": 0.59, "grad_norm": 1.3786541919625397, "learning_rate": 8.71037581854028e-06, "loss": 0.9287, "step": 650 }, { "epoch": 0.59, "grad_norm": 1.4174870026814845, "learning_rate": 8.554149169428894e-06, "loss": 0.9396, "step": 655 }, { "epoch": 0.6, "grad_norm": 1.4346984007547974, "learning_rate": 8.398282133467579e-06, "loss": 0.9353, "step": 660 }, { "epoch": 0.6, "grad_norm": 1.3995616959967054, "learning_rate": 8.242813478027491e-06, "loss": 0.9451, "step": 665 }, { "epoch": 0.61, "grad_norm": 1.4298853430595138, "learning_rate": 8.087781871394281e-06, "loss": 0.9294, "step": 670 }, { "epoch": 0.61, "grad_norm": 4.062626134056569, "learning_rate": 7.93322587315047e-06, "loss": 0.9486, "step": 675 }, { "epoch": 0.61, "grad_norm": 2.593063209369072, "learning_rate": 7.7791839245849e-06, "loss": 0.9323, "step": 680 }, { "epoch": 0.62, "grad_norm": 1.4402410302679418, "learning_rate": 7.625694339131564e-06, "loss": 0.9208, "step": 685 }, { "epoch": 0.62, "grad_norm": 1.427719727713364, "learning_rate": 7.4727952928402695e-06, "loss": 0.9432, "step": 690 }, { "epoch": 0.63, "grad_norm": 1.4229770799618295, "learning_rate": 7.320524814881471e-06, "loss": 0.926, "step": 695 }, { "epoch": 0.63, "grad_norm": 1.4126909774748309, "learning_rate": 7.1689207780876026e-06, "loss": 0.9282, "step": 700 }, { "epoch": 0.64, "grad_norm": 1.4661343172163777, "learning_rate": 7.018020889533348e-06, "loss": 0.9245, "step": 705 }, { "epoch": 0.64, "grad_norm": 1.5393466522100154, "learning_rate": 6.867862681157067e-06, "loss": 0.9215, "step": 710 }, { "epoch": 0.65, "grad_norm": 1.412508168571422, "learning_rate": 6.718483500425868e-06, "loss": 0.9247, "step": 715 }, { "epoch": 0.65, "grad_norm": 1.4776147088210356, "learning_rate": 6.569920501046474e-06, "loss": 0.9219, "step": 720 }, { "epoch": 0.65, "grad_norm": 1.3425578015608433, "learning_rate": 6.42221063372436e-06, "loss": 0.9258, "step": 725 }, { "epoch": 0.66, "grad_norm": 1.4129522183319783, "learning_rate": 6.275390636973315e-06, "loss": 0.9192, "step": 730 }, { "epoch": 0.66, "grad_norm": 1.4289303694831434, "learning_rate": 6.129497027977829e-06, "loss": 0.9189, "step": 735 }, { "epoch": 0.67, "grad_norm": 1.3710954157535182, "learning_rate": 5.9845660935105084e-06, "loss": 0.9164, "step": 740 }, { "epoch": 0.67, "grad_norm": 1.4387200450753754, "learning_rate": 5.8406338809067874e-06, "loss": 0.9369, "step": 745 }, { "epoch": 0.68, "grad_norm": 1.4010474045719385, "learning_rate": 5.69773618909923e-06, "loss": 0.9244, "step": 750 }, { "epoch": 0.68, "grad_norm": 1.409153799110607, "learning_rate": 5.555908559713561e-06, "loss": 0.9118, "step": 755 }, { "epoch": 0.69, "grad_norm": 1.3657173927666795, "learning_rate": 5.4151862682287624e-06, "loss": 0.9142, "step": 760 }, { "epoch": 0.69, "grad_norm": 1.3963533437536293, "learning_rate": 5.2756043152032934e-06, "loss": 0.9176, "step": 765 }, { "epoch": 0.7, "grad_norm": 1.3216489386400923, "learning_rate": 5.137197417569739e-06, "loss": 0.908, "step": 770 }, { "epoch": 0.7, "grad_norm": 1.4412475309656017, "learning_rate": 5.000000000000003e-06, "loss": 0.9165, "step": 775 }, { "epoch": 0.7, "grad_norm": 1.4134533602820125, "learning_rate": 4.86404618634314e-06, "loss": 0.9279, "step": 780 }, { "epoch": 0.71, "grad_norm": 1.3837626714460547, "learning_rate": 4.729369791138085e-06, "loss": 0.9189, "step": 785 }, { "epoch": 0.71, "grad_norm": 1.397147185881214, "learning_rate": 4.596004311203243e-06, "loss": 0.9421, "step": 790 }, { "epoch": 0.72, "grad_norm": 1.3486469288795642, "learning_rate": 4.463982917305155e-06, "loss": 0.9156, "step": 795 }, { "epoch": 0.72, "grad_norm": 1.3475341827233354, "learning_rate": 4.333338445908225e-06, "loss": 0.9292, "step": 800 }, { "epoch": 0.73, "grad_norm": 1.3536202190201114, "learning_rate": 4.2041033910076235e-06, "loss": 0.8996, "step": 805 }, { "epoch": 0.73, "grad_norm": 1.3534435686443709, "learning_rate": 4.076309896047337e-06, "loss": 0.9357, "step": 810 }, { "epoch": 0.74, "grad_norm": 1.3961829341566565, "learning_rate": 3.9499897459254375e-06, "loss": 0.9233, "step": 815 }, { "epoch": 0.74, "grad_norm": 1.348649115175699, "learning_rate": 3.825174359088526e-06, "loss": 0.9097, "step": 820 }, { "epoch": 0.75, "grad_norm": 1.4476303062234663, "learning_rate": 3.7018947797172864e-06, "loss": 0.9274, "step": 825 }, { "epoch": 0.75, "grad_norm": 1.390535701834856, "learning_rate": 3.580181670005183e-06, "loss": 0.9184, "step": 830 }, { "epoch": 0.75, "grad_norm": 1.3785793159092763, "learning_rate": 3.4600653025321085e-06, "loss": 0.9055, "step": 835 }, { "epoch": 0.76, "grad_norm": 1.354137886395205, "learning_rate": 3.341575552734978e-06, "loss": 0.9109, "step": 840 }, { "epoch": 0.76, "grad_norm": 1.4067176317883785, "learning_rate": 3.224741891477096e-06, "loss": 0.9241, "step": 845 }, { "epoch": 0.77, "grad_norm": 1.374060716621096, "learning_rate": 3.1095933777181165e-06, "loss": 0.9118, "step": 850 }, { "epoch": 0.77, "grad_norm": 1.3942906283270295, "learning_rate": 2.9961586512864947e-06, "loss": 0.9, "step": 855 }, { "epoch": 0.78, "grad_norm": 1.4465506221807978, "learning_rate": 2.884465925756159e-06, "loss": 0.9242, "step": 860 }, { "epoch": 0.78, "grad_norm": 1.3396029078248526, "learning_rate": 2.7745429814292147e-06, "loss": 0.9241, "step": 865 }, { "epoch": 0.79, "grad_norm": 1.3098914704831672, "learning_rate": 2.666417158426393e-06, "loss": 0.9228, "step": 870 }, { "epoch": 0.79, "grad_norm": 1.434445807607541, "learning_rate": 2.5601153498870137e-06, "loss": 0.9191, "step": 875 }, { "epoch": 0.79, "grad_norm": 1.3696234318588858, "learning_rate": 2.4556639952800786e-06, "loss": 0.9216, "step": 880 }, { "epoch": 0.8, "grad_norm": 1.3799439546633658, "learning_rate": 2.353089073828255e-06, "loss": 0.9066, "step": 885 }, { "epoch": 0.8, "grad_norm": 1.3783696426493683, "learning_rate": 2.252416098046275e-06, "loss": 0.9102, "step": 890 }, { "epoch": 0.81, "grad_norm": 1.3692307220867967, "learning_rate": 2.153670107395456e-06, "loss": 0.8958, "step": 895 }, { "epoch": 0.81, "grad_norm": 1.3494441419685654, "learning_rate": 2.056875662055874e-06, "loss": 0.9144, "step": 900 }, { "epoch": 0.82, "grad_norm": 1.356794216227153, "learning_rate": 1.9620568368177183e-06, "loss": 0.8964, "step": 905 }, { "epoch": 0.82, "grad_norm": 1.3429980043030805, "learning_rate": 1.8692372150934113e-06, "loss": 0.9194, "step": 910 }, { "epoch": 0.83, "grad_norm": 1.339376480893687, "learning_rate": 1.7784398830519002e-06, "loss": 0.9093, "step": 915 }, { "epoch": 0.83, "grad_norm": 1.3355920171762052, "learning_rate": 1.6896874238766703e-06, "loss": 0.8913, "step": 920 }, { "epoch": 0.84, "grad_norm": 1.293120986101463, "learning_rate": 1.6030019121488227e-06, "loss": 0.9182, "step": 925 }, { "epoch": 0.84, "grad_norm": 1.4443458217709528, "learning_rate": 1.5184049083566688e-06, "loss": 0.9123, "step": 930 }, { "epoch": 0.84, "grad_norm": 1.3695869429477345, "learning_rate": 1.4359174535331998e-06, "loss": 0.9092, "step": 935 }, { "epoch": 0.85, "grad_norm": 1.339902530091704, "learning_rate": 1.3555600640227284e-06, "loss": 0.9254, "step": 940 }, { "epoch": 0.85, "grad_norm": 1.4388242190037084, "learning_rate": 1.2773527263780626e-06, "loss": 0.8972, "step": 945 }, { "epoch": 0.86, "grad_norm": 1.430786276692823, "learning_rate": 1.2013148923894213e-06, "loss": 0.9197, "step": 950 }, { "epoch": 0.86, "grad_norm": 1.3197356728810627, "learning_rate": 1.1274654742463842e-06, "loss": 0.897, "step": 955 }, { "epoch": 0.87, "grad_norm": 1.3361601981054119, "learning_rate": 1.0558228398340188e-06, "loss": 0.9094, "step": 960 }, { "epoch": 0.87, "grad_norm": 1.3263397846241956, "learning_rate": 9.86404808164426e-07, "loss": 0.8958, "step": 965 }, { "epoch": 0.88, "grad_norm": 1.3279928456335177, "learning_rate": 9.192286449447684e-07, "loss": 0.8967, "step": 970 }, { "epoch": 0.88, "grad_norm": 1.3258111261619026, "learning_rate": 8.543110582829272e-07, "loss": 0.9021, "step": 975 }, { "epoch": 0.89, "grad_norm": 1.2957246118366699, "learning_rate": 7.916681945318649e-07, "loss": 0.9083, "step": 980 }, { "epoch": 0.89, "grad_norm": 1.3406921495185697, "learning_rate": 7.313156342736738e-07, "loss": 0.898, "step": 985 }, { "epoch": 0.89, "grad_norm": 1.3197731448045178, "learning_rate": 6.732683884443736e-07, "loss": 0.8957, "step": 990 }, { "epoch": 0.9, "grad_norm": 1.3468993226799906, "learning_rate": 6.175408946003703e-07, "loss": 0.9035, "step": 995 }, { "epoch": 0.9, "grad_norm": 1.3353098250857058, "learning_rate": 5.641470133275473e-07, "loss": 0.894, "step": 1000 }, { "epoch": 0.91, "grad_norm": 1.3632991670701868, "learning_rate": 5.131000247938367e-07, "loss": 0.9149, "step": 1005 }, { "epoch": 0.91, "grad_norm": 1.3014286387338503, "learning_rate": 4.644126254461756e-07, "loss": 0.8919, "step": 1010 }, { "epoch": 0.92, "grad_norm": 1.3376939712764266, "learning_rate": 4.180969248526334e-07, "loss": 0.9151, "step": 1015 }, { "epoch": 0.92, "grad_norm": 1.316358705323747, "learning_rate": 3.7416444269050335e-07, "loss": 0.9109, "step": 1020 }, { "epoch": 0.93, "grad_norm": 1.351422091186982, "learning_rate": 3.326261058811331e-07, "loss": 0.9046, "step": 1025 }, { "epoch": 0.93, "grad_norm": 1.3147060361947367, "learning_rate": 2.9349224587215786e-07, "loss": 0.9036, "step": 1030 }, { "epoch": 0.93, "grad_norm": 1.376354495442706, "learning_rate": 2.5677259606786686e-07, "loss": 0.9109, "step": 1035 }, { "epoch": 0.94, "grad_norm": 1.318239765222968, "learning_rate": 2.2247628940829214e-07, "loss": 0.9011, "step": 1040 }, { "epoch": 0.94, "grad_norm": 1.334098597139642, "learning_rate": 1.9061185609766996e-07, "loss": 0.9158, "step": 1045 }, { "epoch": 0.95, "grad_norm": 1.311535708505789, "learning_rate": 1.6118722148278586e-07, "loss": 0.9069, "step": 1050 }, { "epoch": 0.95, "grad_norm": 1.3399145129480423, "learning_rate": 1.3420970408178912e-07, "loss": 0.9108, "step": 1055 }, { "epoch": 0.96, "grad_norm": 1.3369774366097849, "learning_rate": 1.0968601376391996e-07, "loss": 0.9022, "step": 1060 }, { "epoch": 0.96, "grad_norm": 1.327397654041781, "learning_rate": 8.762225008062675e-08, "loss": 0.9063, "step": 1065 }, { "epoch": 0.97, "grad_norm": 1.3398923885085607, "learning_rate": 6.802390074847731e-08, "loss": 0.92, "step": 1070 }, { "epoch": 0.97, "grad_norm": 1.3031847902685991, "learning_rate": 5.0895840284257424e-08, "loss": 0.8971, "step": 1075 }, { "epoch": 0.98, "grad_norm": 1.34860295723008, "learning_rate": 3.6242328792567286e-08, "loss": 0.9018, "step": 1080 }, { "epoch": 0.98, "grad_norm": 1.314620543755103, "learning_rate": 2.406701090625463e-08, "loss": 0.8847, "step": 1085 }, { "epoch": 0.98, "grad_norm": 1.315793020724077, "learning_rate": 1.4372914879909882e-08, "loss": 0.896, "step": 1090 }, { "epoch": 0.99, "grad_norm": 1.3160311821089112, "learning_rate": 7.162451836685291e-09, "loss": 0.9092, "step": 1095 }, { "epoch": 0.99, "grad_norm": 1.381770390973591, "learning_rate": 2.4374151685913063e-09, "loss": 0.9208, "step": 1100 }, { "epoch": 1.0, "grad_norm": 1.3304085867307816, "learning_rate": 1.989800904445005e-10, "loss": 0.9189, "step": 1105 }, { "epoch": 1.0, "eval_loss": 0.9156445264816284, "eval_runtime": 344.7542, "eval_samples_per_second": 45.482, "eval_steps_per_second": 0.711, "step": 1107 }, { "epoch": 1.0, "step": 1107, "total_flos": 463566557675520.0, "train_loss": 0.9518642601066596, "train_runtime": 13070.0796, "train_samples_per_second": 10.839, "train_steps_per_second": 0.085 } ], "logging_steps": 5, "max_steps": 1107, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 463566557675520.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }