{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 9.77339219775282, "learning_rate": 1.8348623853211012e-07, "loss": 1.1557, "step": 1 }, { "epoch": 0.0, "grad_norm": 7.7326824203360625, "learning_rate": 9.174311926605506e-07, "loss": 1.1259, "step": 5 }, { "epoch": 0.01, "grad_norm": 4.217943082141626, "learning_rate": 1.8348623853211011e-06, "loss": 1.0814, "step": 10 }, { "epoch": 0.01, "grad_norm": 3.1553293441150627, "learning_rate": 2.7522935779816517e-06, "loss": 1.0173, "step": 15 }, { "epoch": 0.02, "grad_norm": 2.624974536136745, "learning_rate": 3.6697247706422022e-06, "loss": 1.01, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.9346693086685613, "learning_rate": 4.587155963302753e-06, "loss": 0.9699, "step": 25 }, { "epoch": 0.03, "grad_norm": 2.0023694879238265, "learning_rate": 5.504587155963303e-06, "loss": 0.9871, "step": 30 }, { "epoch": 0.03, "grad_norm": 1.9243231029659136, "learning_rate": 6.422018348623854e-06, "loss": 0.9731, "step": 35 }, { "epoch": 0.04, "grad_norm": 2.3079689233656158, "learning_rate": 7.3394495412844045e-06, "loss": 0.9786, "step": 40 }, { "epoch": 0.04, "grad_norm": 1.9848142793089458, "learning_rate": 8.256880733944956e-06, "loss": 0.9683, "step": 45 }, { "epoch": 0.05, "grad_norm": 2.1601847540175347, "learning_rate": 9.174311926605506e-06, "loss": 0.9655, "step": 50 }, { "epoch": 0.05, "grad_norm": 1.995827350954353, "learning_rate": 1.0091743119266055e-05, "loss": 0.9763, "step": 55 }, { "epoch": 0.06, "grad_norm": 1.8778202478820005, "learning_rate": 1.1009174311926607e-05, "loss": 0.9828, "step": 60 }, { "epoch": 0.06, "grad_norm": 2.6097761971016706, "learning_rate": 1.1926605504587156e-05, "loss": 0.9917, "step": 65 }, { "epoch": 0.06, "grad_norm": 1.9438707878045542, "learning_rate": 1.2844036697247708e-05, "loss": 0.9781, "step": 70 }, { "epoch": 0.07, "grad_norm": 2.5283537508840412, "learning_rate": 1.3761467889908258e-05, "loss": 0.9766, "step": 75 }, { "epoch": 0.07, "grad_norm": 2.4698926435303807, "learning_rate": 1.4678899082568809e-05, "loss": 0.9744, "step": 80 }, { "epoch": 0.08, "grad_norm": 1.7950947398875594, "learning_rate": 1.559633027522936e-05, "loss": 0.9925, "step": 85 }, { "epoch": 0.08, "grad_norm": 1.849659361988556, "learning_rate": 1.6513761467889912e-05, "loss": 0.9951, "step": 90 }, { "epoch": 0.09, "grad_norm": 2.091607616206256, "learning_rate": 1.743119266055046e-05, "loss": 1.0061, "step": 95 }, { "epoch": 0.09, "grad_norm": 2.066221029636183, "learning_rate": 1.834862385321101e-05, "loss": 0.9985, "step": 100 }, { "epoch": 0.1, "grad_norm": 2.869454578032626, "learning_rate": 1.9266055045871563e-05, "loss": 0.9907, "step": 105 }, { "epoch": 0.1, "grad_norm": 2.2432123776551944, "learning_rate": 1.999994872196626e-05, "loss": 1.0027, "step": 110 }, { "epoch": 0.11, "grad_norm": 2.328668819896044, "learning_rate": 1.9998154046002822e-05, "loss": 0.9926, "step": 115 }, { "epoch": 0.11, "grad_norm": 1.9876127666903096, "learning_rate": 1.999379599421534e-05, "loss": 1.0011, "step": 120 }, { "epoch": 0.11, "grad_norm": 1.8724555785709804, "learning_rate": 1.9986875683942535e-05, "loss": 1.0242, "step": 125 }, { "epoch": 0.12, "grad_norm": 2.1894981771490443, "learning_rate": 1.9977394889447526e-05, "loss": 1.0032, "step": 130 }, { "epoch": 0.12, "grad_norm": 2.5051450231269605, "learning_rate": 1.9965356041462954e-05, "loss": 1.0154, "step": 135 }, { "epoch": 0.13, "grad_norm": 2.01438038230488, "learning_rate": 1.9950762226567783e-05, "loss": 1.0023, "step": 140 }, { "epoch": 0.13, "grad_norm": 1.974662211540992, "learning_rate": 1.9933617186395917e-05, "loss": 0.9996, "step": 145 }, { "epoch": 0.14, "grad_norm": 1.7299662712917598, "learning_rate": 1.9913925316676946e-05, "loss": 1.0024, "step": 150 }, { "epoch": 0.14, "grad_norm": 1.675351804082685, "learning_rate": 1.9891691666109112e-05, "loss": 0.9807, "step": 155 }, { "epoch": 0.15, "grad_norm": 1.7568442047779815, "learning_rate": 1.9866921935064907e-05, "loss": 1.0026, "step": 160 }, { "epoch": 0.15, "grad_norm": 1.8020826944560722, "learning_rate": 1.9839622474129595e-05, "loss": 0.9945, "step": 165 }, { "epoch": 0.16, "grad_norm": 1.880978608228776, "learning_rate": 1.9809800282473014e-05, "loss": 1.0131, "step": 170 }, { "epoch": 0.16, "grad_norm": 1.9533970099597644, "learning_rate": 1.977746300605507e-05, "loss": 1.0154, "step": 175 }, { "epoch": 0.17, "grad_norm": 1.8000655097775127, "learning_rate": 1.9742618935665478e-05, "loss": 0.9994, "step": 180 }, { "epoch": 0.17, "grad_norm": 1.6150313589958014, "learning_rate": 1.9705277004798072e-05, "loss": 1.0017, "step": 185 }, { "epoch": 0.17, "grad_norm": 1.6236825548036045, "learning_rate": 1.9665446787360444e-05, "loss": 1.0084, "step": 190 }, { "epoch": 0.18, "grad_norm": 1.681320814042372, "learning_rate": 1.9623138495219292e-05, "loss": 1.0185, "step": 195 }, { "epoch": 0.18, "grad_norm": 1.909108842556375, "learning_rate": 1.957836297558229e-05, "loss": 1.0128, "step": 200 }, { "epoch": 0.19, "grad_norm": 1.6634688766473966, "learning_rate": 1.9531131708217005e-05, "loss": 1.0145, "step": 205 }, { "epoch": 0.19, "grad_norm": 1.556894984467879, "learning_rate": 1.948145680250766e-05, "loss": 0.9924, "step": 210 }, { "epoch": 0.2, "grad_norm": 1.620552922782101, "learning_rate": 1.9429350994350483e-05, "loss": 1.0044, "step": 215 }, { "epoch": 0.2, "grad_norm": 1.635093025466642, "learning_rate": 1.93748276428884e-05, "loss": 0.9856, "step": 220 }, { "epoch": 0.21, "grad_norm": 1.674299362647312, "learning_rate": 1.931790072708596e-05, "loss": 0.9934, "step": 225 }, { "epoch": 0.21, "grad_norm": 1.6887591497181855, "learning_rate": 1.9258584842145342e-05, "loss": 1.0112, "step": 230 }, { "epoch": 0.22, "grad_norm": 1.645113774478941, "learning_rate": 1.9196895195764363e-05, "loss": 1.0066, "step": 235 }, { "epoch": 0.22, "grad_norm": 1.6845020042548609, "learning_rate": 1.913284760423745e-05, "loss": 1.0158, "step": 240 }, { "epoch": 0.22, "grad_norm": 1.647832284848899, "learning_rate": 1.9066458488400586e-05, "loss": 1.0086, "step": 245 }, { "epoch": 0.23, "grad_norm": 1.697080815625411, "learning_rate": 1.8997744869421248e-05, "loss": 1.0155, "step": 250 }, { "epoch": 0.23, "grad_norm": 1.7170796898803107, "learning_rate": 1.8926724364434447e-05, "loss": 1.0041, "step": 255 }, { "epoch": 0.24, "grad_norm": 1.706074031599582, "learning_rate": 1.8853415182025953e-05, "loss": 0.9922, "step": 260 }, { "epoch": 0.24, "grad_norm": 1.8989841472640996, "learning_rate": 1.8777836117563894e-05, "loss": 0.9941, "step": 265 }, { "epoch": 0.25, "grad_norm": 1.7121367648810273, "learning_rate": 1.8700006548379898e-05, "loss": 1.0023, "step": 270 }, { "epoch": 0.25, "grad_norm": 1.9417882651127933, "learning_rate": 1.861994642880105e-05, "loss": 1.0081, "step": 275 }, { "epoch": 0.26, "grad_norm": 1.7496252675391932, "learning_rate": 1.8537676285033886e-05, "loss": 1.0012, "step": 280 }, { "epoch": 0.26, "grad_norm": 1.5992757261270931, "learning_rate": 1.845321720990181e-05, "loss": 0.9882, "step": 285 }, { "epoch": 0.27, "grad_norm": 1.5982026868074015, "learning_rate": 1.8366590857437182e-05, "loss": 0.9935, "step": 290 }, { "epoch": 0.27, "grad_norm": 2.0122073912291203, "learning_rate": 1.8277819437329577e-05, "loss": 1.0075, "step": 295 }, { "epoch": 0.28, "grad_norm": 21.05626958615313, "learning_rate": 1.8186925709231534e-05, "loss": 1.0239, "step": 300 }, { "epoch": 0.28, "grad_norm": 5.626174907352815, "learning_rate": 1.809393297692334e-05, "loss": 1.1336, "step": 305 }, { "epoch": 0.28, "grad_norm": 3.5588583574219355, "learning_rate": 1.799886508233829e-05, "loss": 1.0703, "step": 310 }, { "epoch": 0.29, "grad_norm": 52.257959922882186, "learning_rate": 1.790174639944997e-05, "loss": 1.2478, "step": 315 }, { "epoch": 0.29, "grad_norm": 8.296874056632827, "learning_rate": 1.780260182802314e-05, "loss": 1.4918, "step": 320 }, { "epoch": 0.3, "grad_norm": 17.554936318206273, "learning_rate": 1.7701456787229805e-05, "loss": 1.2285, "step": 325 }, { "epoch": 0.3, "grad_norm": 8.45205253267018, "learning_rate": 1.7598337209132142e-05, "loss": 1.1503, "step": 330 }, { "epoch": 0.31, "grad_norm": 6.266748541585308, "learning_rate": 1.7493269532033882e-05, "loss": 1.1033, "step": 335 }, { "epoch": 0.31, "grad_norm": 111.52289151968921, "learning_rate": 1.738628069370195e-05, "loss": 1.1048, "step": 340 }, { "epoch": 0.32, "grad_norm": 7.688414643609198, "learning_rate": 1.7277398124460022e-05, "loss": 1.1117, "step": 345 }, { "epoch": 0.32, "grad_norm": 4.031319921903212, "learning_rate": 1.71666497401558e-05, "loss": 1.0816, "step": 350 }, { "epoch": 0.33, "grad_norm": 11.431111354306312, "learning_rate": 1.7054063935003813e-05, "loss": 1.0609, "step": 355 }, { "epoch": 0.33, "grad_norm": 2.2441387172030134, "learning_rate": 1.6939669574305565e-05, "loss": 1.0462, "step": 360 }, { "epoch": 0.33, "grad_norm": 2.0523696752811604, "learning_rate": 1.6823495987048922e-05, "loss": 1.0476, "step": 365 }, { "epoch": 0.34, "grad_norm": 1.8895154393147975, "learning_rate": 1.6705572958388576e-05, "loss": 1.0311, "step": 370 }, { "epoch": 0.34, "grad_norm": 1.8139444627013965, "learning_rate": 1.6585930722009602e-05, "loss": 1.0371, "step": 375 }, { "epoch": 0.35, "grad_norm": 1.7032921177242157, "learning_rate": 1.6464599952375998e-05, "loss": 1.0436, "step": 380 }, { "epoch": 0.35, "grad_norm": 1.7285341229493167, "learning_rate": 1.63416117568662e-05, "loss": 1.0119, "step": 385 }, { "epoch": 0.36, "grad_norm": 1.722890751549469, "learning_rate": 1.621699766779763e-05, "loss": 1.0317, "step": 390 }, { "epoch": 0.36, "grad_norm": 1.599051242781322, "learning_rate": 1.6090789634342278e-05, "loss": 1.0147, "step": 395 }, { "epoch": 0.37, "grad_norm": 2.011875168952266, "learning_rate": 1.5963020014335437e-05, "loss": 1.0109, "step": 400 }, { "epoch": 0.37, "grad_norm": 1.7745695288261167, "learning_rate": 1.583372156597961e-05, "loss": 1.0226, "step": 405 }, { "epoch": 0.38, "grad_norm": 1.7032574663832216, "learning_rate": 1.570292743944583e-05, "loss": 0.9938, "step": 410 }, { "epoch": 0.38, "grad_norm": 1.710578272048905, "learning_rate": 1.557067116837444e-05, "loss": 0.9883, "step": 415 }, { "epoch": 0.39, "grad_norm": 1.6957880003939496, "learning_rate": 1.5436986661277578e-05, "loss": 0.9908, "step": 420 }, { "epoch": 0.39, "grad_norm": 1.6008040760391928, "learning_rate": 1.530190819284555e-05, "loss": 1.0016, "step": 425 }, { "epoch": 0.39, "grad_norm": 1.7382539416393734, "learning_rate": 1.5165470395159314e-05, "loss": 0.9929, "step": 430 }, { "epoch": 0.4, "grad_norm": 1.8011281382514583, "learning_rate": 1.5027708248811331e-05, "loss": 0.9953, "step": 435 }, { "epoch": 0.4, "grad_norm": 1.6996850685052767, "learning_rate": 1.4888657073937077e-05, "loss": 0.9906, "step": 440 }, { "epoch": 0.41, "grad_norm": 1.618144358393542, "learning_rate": 1.4748352521159492e-05, "loss": 1.0065, "step": 445 }, { "epoch": 0.41, "grad_norm": 1.52251331168285, "learning_rate": 1.4606830562448692e-05, "loss": 0.9944, "step": 450 }, { "epoch": 0.42, "grad_norm": 1.9393071866897407, "learning_rate": 1.4464127481899312e-05, "loss": 0.9765, "step": 455 }, { "epoch": 0.42, "grad_norm": 1.6893250353047762, "learning_rate": 1.4320279866427798e-05, "loss": 1.0034, "step": 460 }, { "epoch": 0.43, "grad_norm": 1.5907871510849094, "learning_rate": 1.4175324596392075e-05, "loss": 0.9839, "step": 465 }, { "epoch": 0.43, "grad_norm": 1.6419259125990346, "learning_rate": 1.402929883613599e-05, "loss": 1.004, "step": 470 }, { "epoch": 0.44, "grad_norm": 1.5792169432621945, "learning_rate": 1.3882240024460928e-05, "loss": 0.9583, "step": 475 }, { "epoch": 0.44, "grad_norm": 1.5453338942420716, "learning_rate": 1.3734185865027061e-05, "loss": 0.9818, "step": 480 }, { "epoch": 0.44, "grad_norm": 1.6532156284337152, "learning_rate": 1.358517431668672e-05, "loss": 0.9812, "step": 485 }, { "epoch": 0.45, "grad_norm": 1.8030804926474087, "learning_rate": 1.3435243583752294e-05, "loss": 0.9799, "step": 490 }, { "epoch": 0.45, "grad_norm": 1.6418123185067888, "learning_rate": 1.3284432106201233e-05, "loss": 0.9916, "step": 495 }, { "epoch": 0.46, "grad_norm": 1.6258591855391478, "learning_rate": 1.313277854982062e-05, "loss": 0.9917, "step": 500 }, { "epoch": 0.46, "grad_norm": 2.02063585594681, "learning_rate": 1.2980321796293838e-05, "loss": 0.9821, "step": 505 }, { "epoch": 0.47, "grad_norm": 1.4641044058112402, "learning_rate": 1.2827100933231904e-05, "loss": 0.9669, "step": 510 }, { "epoch": 0.47, "grad_norm": 1.4734009074897523, "learning_rate": 1.2673155244151985e-05, "loss": 0.9887, "step": 515 }, { "epoch": 0.48, "grad_norm": 1.504049205573269, "learning_rate": 1.2518524198405699e-05, "loss": 0.9752, "step": 520 }, { "epoch": 0.48, "grad_norm": 1.5871801520121447, "learning_rate": 1.2363247441059775e-05, "loss": 0.986, "step": 525 }, { "epoch": 0.49, "grad_norm": 1.5031034877601523, "learning_rate": 1.2207364782731657e-05, "loss": 0.9827, "step": 530 }, { "epoch": 0.49, "grad_norm": 1.680552260004968, "learning_rate": 1.2050916189382646e-05, "loss": 0.977, "step": 535 }, { "epoch": 0.5, "grad_norm": 1.5439369139377679, "learning_rate": 1.189394177207125e-05, "loss": 0.9746, "step": 540 }, { "epoch": 0.5, "grad_norm": 1.4901425478827495, "learning_rate": 1.1736481776669307e-05, "loss": 0.9653, "step": 545 }, { "epoch": 0.5, "grad_norm": 1.468475262540055, "learning_rate": 1.1578576573543541e-05, "loss": 0.9887, "step": 550 }, { "epoch": 0.51, "grad_norm": 1.4398608054707567, "learning_rate": 1.1420266647205232e-05, "loss": 0.9761, "step": 555 }, { "epoch": 0.51, "grad_norm": 1.5422374491434532, "learning_rate": 1.1261592585930576e-05, "loss": 0.9582, "step": 560 }, { "epoch": 0.52, "grad_norm": 1.5286775911829815, "learning_rate": 1.1102595071354471e-05, "loss": 0.9738, "step": 565 }, { "epoch": 0.52, "grad_norm": 1.4817522889069266, "learning_rate": 1.0943314868040365e-05, "loss": 0.9813, "step": 570 }, { "epoch": 0.53, "grad_norm": 1.4293384597101042, "learning_rate": 1.0783792813028828e-05, "loss": 0.9674, "step": 575 }, { "epoch": 0.53, "grad_norm": 1.5261318491484366, "learning_rate": 1.0624069805367558e-05, "loss": 0.985, "step": 580 }, { "epoch": 0.54, "grad_norm": 1.5097693128941394, "learning_rate": 1.0464186795625481e-05, "loss": 0.9639, "step": 585 }, { "epoch": 0.54, "grad_norm": 1.5846698306417735, "learning_rate": 1.0304184775393642e-05, "loss": 0.9701, "step": 590 }, { "epoch": 0.55, "grad_norm": 1.4231247673906455, "learning_rate": 1.0144104766775574e-05, "loss": 0.9684, "step": 595 }, { "epoch": 0.55, "grad_norm": 1.4645243353450357, "learning_rate": 9.983987811869863e-06, "loss": 0.9726, "step": 600 }, { "epoch": 0.56, "grad_norm": 1.6188671105629062, "learning_rate": 9.823874962247565e-06, "loss": 0.9562, "step": 605 }, { "epoch": 0.56, "grad_norm": 1.4748824240542495, "learning_rate": 9.663807268427197e-06, "loss": 0.9677, "step": 610 }, { "epoch": 0.56, "grad_norm": 1.427861703011802, "learning_rate": 9.503825769350016e-06, "loss": 0.9645, "step": 615 }, { "epoch": 0.57, "grad_norm": 1.4151510701857246, "learning_rate": 9.343971481858246e-06, "loss": 0.9423, "step": 620 }, { "epoch": 0.57, "grad_norm": 1.4516242963328596, "learning_rate": 9.184285390178978e-06, "loss": 0.9656, "step": 625 }, { "epoch": 0.58, "grad_norm": 1.7062684460068769, "learning_rate": 9.024808435416435e-06, "loss": 0.9637, "step": 630 }, { "epoch": 0.58, "grad_norm": 1.4522675784960328, "learning_rate": 8.865581505055292e-06, "loss": 0.9786, "step": 635 }, { "epoch": 0.59, "grad_norm": 1.4328537035514395, "learning_rate": 8.706645422477739e-06, "loss": 0.948, "step": 640 }, { "epoch": 0.59, "grad_norm": 1.5567683964703176, "learning_rate": 8.548040936496989e-06, "loss": 0.9613, "step": 645 }, { "epoch": 0.6, "grad_norm": 1.5074863131808958, "learning_rate": 8.389808710909881e-06, "loss": 0.9514, "step": 650 }, { "epoch": 0.6, "grad_norm": 1.4273894196530337, "learning_rate": 8.231989314071318e-06, "loss": 0.9656, "step": 655 }, { "epoch": 0.61, "grad_norm": 1.471995206720774, "learning_rate": 8.07462320849313e-06, "loss": 0.9475, "step": 660 }, { "epoch": 0.61, "grad_norm": 1.568608987104535, "learning_rate": 7.917750740470116e-06, "loss": 0.9769, "step": 665 }, { "epoch": 0.61, "grad_norm": 1.3745365175696858, "learning_rate": 7.761412129735853e-06, "loss": 0.9603, "step": 670 }, { "epoch": 0.62, "grad_norm": 1.466522406606583, "learning_rate": 7.605647459150961e-06, "loss": 0.9332, "step": 675 }, { "epoch": 0.62, "grad_norm": 1.3463238948259868, "learning_rate": 7.4504966644264775e-06, "loss": 0.9436, "step": 680 }, { "epoch": 0.63, "grad_norm": 1.4997500356384885, "learning_rate": 7.295999523884921e-06, "loss": 0.9363, "step": 685 }, { "epoch": 0.63, "grad_norm": 1.4849900349578966, "learning_rate": 7.142195648261747e-06, "loss": 0.9699, "step": 690 }, { "epoch": 0.64, "grad_norm": 1.4732303903444304, "learning_rate": 6.989124470549746e-06, "loss": 0.9582, "step": 695 }, { "epoch": 0.64, "grad_norm": 1.3881607197451564, "learning_rate": 6.83682523588902e-06, "loss": 0.9457, "step": 700 }, { "epoch": 0.65, "grad_norm": 1.441527770787461, "learning_rate": 6.685336991505122e-06, "loss": 0.961, "step": 705 }, { "epoch": 0.65, "grad_norm": 1.5093653951413892, "learning_rate": 6.5346985766979384e-06, "loss": 0.9557, "step": 710 }, { "epoch": 0.66, "grad_norm": 1.3906632586548848, "learning_rate": 6.384948612883872e-06, "loss": 0.9274, "step": 715 }, { "epoch": 0.66, "grad_norm": 1.4296014203069103, "learning_rate": 6.2361254936939e-06, "loss": 0.9333, "step": 720 }, { "epoch": 0.67, "grad_norm": 1.4300856178576173, "learning_rate": 6.0882673751300235e-06, "loss": 0.9462, "step": 725 }, { "epoch": 0.67, "grad_norm": 1.3936601183930903, "learning_rate": 5.941412165782645e-06, "loss": 0.9479, "step": 730 }, { "epoch": 0.67, "grad_norm": 1.6621722765974563, "learning_rate": 5.79559751711138e-06, "loss": 0.9459, "step": 735 }, { "epoch": 0.68, "grad_norm": 1.4433022742228918, "learning_rate": 5.650860813791786e-06, "loss": 0.9308, "step": 740 }, { "epoch": 0.68, "grad_norm": 1.3734643883063014, "learning_rate": 5.507239164130501e-06, "loss": 0.9487, "step": 745 }, { "epoch": 0.69, "grad_norm": 1.387981074803405, "learning_rate": 5.364769390551225e-06, "loss": 0.9545, "step": 750 }, { "epoch": 0.69, "grad_norm": 1.391011395848312, "learning_rate": 5.223488020154028e-06, "loss": 0.9419, "step": 755 }, { "epoch": 0.7, "grad_norm": 1.4792090950991559, "learning_rate": 5.083431275350312e-06, "loss": 0.9426, "step": 760 }, { "epoch": 0.7, "grad_norm": 1.3955718332534048, "learning_rate": 4.9446350645759885e-06, "loss": 0.9303, "step": 765 }, { "epoch": 0.71, "grad_norm": 1.4552341272988114, "learning_rate": 4.807134973085036e-06, "loss": 0.9551, "step": 770 }, { "epoch": 0.71, "grad_norm": 1.4197472248094545, "learning_rate": 4.670966253826027e-06, "loss": 0.948, "step": 775 }, { "epoch": 0.72, "grad_norm": 1.3903141656921318, "learning_rate": 4.53616381840377e-06, "loss": 0.9404, "step": 780 }, { "epoch": 0.72, "grad_norm": 1.4683373587462776, "learning_rate": 4.402762228128531e-06, "loss": 0.9393, "step": 785 }, { "epoch": 0.72, "grad_norm": 1.3895342562056905, "learning_rate": 4.270795685155001e-06, "loss": 0.9295, "step": 790 }, { "epoch": 0.73, "grad_norm": 1.3645654190042658, "learning_rate": 4.140298023713416e-06, "loss": 0.9322, "step": 795 }, { "epoch": 0.73, "grad_norm": 1.3831028295258314, "learning_rate": 4.0113027014349374e-06, "loss": 0.9393, "step": 800 }, { "epoch": 0.74, "grad_norm": 1.402507806165818, "learning_rate": 3.883842790773647e-06, "loss": 0.9407, "step": 805 }, { "epoch": 0.74, "grad_norm": 1.3477220628229905, "learning_rate": 3.757950970527249e-06, "loss": 0.9399, "step": 810 }, { "epoch": 0.75, "grad_norm": 1.4105862702544307, "learning_rate": 3.633659517458736e-06, "loss": 0.9429, "step": 815 }, { "epoch": 0.75, "grad_norm": 1.4134155318530202, "learning_rate": 3.511000298021098e-06, "loss": 0.9267, "step": 820 }, { "epoch": 0.76, "grad_norm": 2.671468704931624, "learning_rate": 3.39000476018726e-06, "loss": 0.9405, "step": 825 }, { "epoch": 0.76, "grad_norm": 1.518363690298914, "learning_rate": 3.2707039253872796e-06, "loss": 0.9499, "step": 830 }, { "epoch": 0.77, "grad_norm": 1.391445796072115, "learning_rate": 3.153128380554941e-06, "loss": 0.9397, "step": 835 }, { "epoch": 0.77, "grad_norm": 1.4199198449516006, "learning_rate": 3.037308270285709e-06, "loss": 0.9374, "step": 840 }, { "epoch": 0.78, "grad_norm": 1.3725059360040226, "learning_rate": 2.923273289108115e-06, "loss": 0.9289, "step": 845 }, { "epoch": 0.78, "grad_norm": 1.402432038179815, "learning_rate": 2.8110526738705345e-06, "loss": 0.9399, "step": 850 }, { "epoch": 0.78, "grad_norm": 1.3538150837811775, "learning_rate": 2.700675196245288e-06, "loss": 0.9235, "step": 855 }, { "epoch": 0.79, "grad_norm": 1.3719901639969891, "learning_rate": 2.592169155352031e-06, "loss": 0.9305, "step": 860 }, { "epoch": 0.79, "grad_norm": 1.3417159608633782, "learning_rate": 2.485562370502279e-06, "loss": 0.93, "step": 865 }, { "epoch": 0.8, "grad_norm": 1.447083155674768, "learning_rate": 2.3808821740669608e-06, "loss": 0.9444, "step": 870 }, { "epoch": 0.8, "grad_norm": 1.457818919394179, "learning_rate": 2.2781554044688015e-06, "loss": 0.944, "step": 875 }, { "epoch": 0.81, "grad_norm": 1.3423569321240763, "learning_rate": 2.1774083993013715e-06, "loss": 0.9284, "step": 880 }, { "epoch": 0.81, "grad_norm": 1.4022698745517062, "learning_rate": 2.0786669885765044e-06, "loss": 0.943, "step": 885 }, { "epoch": 0.82, "grad_norm": 1.3695761420314987, "learning_rate": 1.981956488101898e-06, "loss": 0.9419, "step": 890 }, { "epoch": 0.82, "grad_norm": 1.351231324123156, "learning_rate": 1.8873016929904942e-06, "loss": 0.9315, "step": 895 }, { "epoch": 0.83, "grad_norm": 1.4420470493119792, "learning_rate": 1.7947268713034128e-06, "loss": 0.9247, "step": 900 }, { "epoch": 0.83, "grad_norm": 1.376296944069193, "learning_rate": 1.704255757827963e-06, "loss": 0.9443, "step": 905 }, { "epoch": 0.83, "grad_norm": 1.3432275033995362, "learning_rate": 1.6159115479924259e-06, "loss": 0.9453, "step": 910 }, { "epoch": 0.84, "grad_norm": 1.408037299795807, "learning_rate": 1.529716891919074e-06, "loss": 0.9168, "step": 915 }, { "epoch": 0.84, "grad_norm": 1.3321909436006458, "learning_rate": 1.4456938886170413e-06, "loss": 0.9284, "step": 920 }, { "epoch": 0.85, "grad_norm": 1.4347603275075604, "learning_rate": 1.3638640803164516e-06, "loss": 0.928, "step": 925 }, { "epoch": 0.85, "grad_norm": 1.414525145641175, "learning_rate": 1.2842484469453365e-06, "loss": 0.9247, "step": 930 }, { "epoch": 0.86, "grad_norm": 1.3556935778322545, "learning_rate": 1.2068674007506787e-06, "loss": 0.9212, "step": 935 }, { "epoch": 0.86, "grad_norm": 1.4388764825242009, "learning_rate": 1.1317407810650372e-06, "loss": 0.9168, "step": 940 }, { "epoch": 0.87, "grad_norm": 1.3989979018945433, "learning_rate": 1.0588878492200261e-06, "loss": 0.9288, "step": 945 }, { "epoch": 0.87, "grad_norm": 1.3566434474333582, "learning_rate": 9.883272836080116e-07, "loss": 0.926, "step": 950 }, { "epoch": 0.88, "grad_norm": 1.4319194447938426, "learning_rate": 9.200771748932513e-07, "loss": 0.9455, "step": 955 }, { "epoch": 0.88, "grad_norm": 1.3526304536258829, "learning_rate": 8.541550213737171e-07, "loss": 0.9261, "step": 960 }, { "epoch": 0.89, "grad_norm": 1.399241882005225, "learning_rate": 7.905777244947954e-07, "loss": 0.9316, "step": 965 }, { "epoch": 0.89, "grad_norm": 1.4094250992512831, "learning_rate": 7.293615845160196e-07, "loss": 0.9275, "step": 970 }, { "epoch": 0.89, "grad_norm": 1.3526180756349577, "learning_rate": 6.705222963319191e-07, "loss": 0.9363, "step": 975 }, { "epoch": 0.9, "grad_norm": 1.416809797785314, "learning_rate": 6.140749454480932e-07, "loss": 0.9297, "step": 980 }, { "epoch": 0.9, "grad_norm": 1.351887149750103, "learning_rate": 5.600340041135133e-07, "loss": 0.9393, "step": 985 }, { "epoch": 0.91, "grad_norm": 1.3895523431306795, "learning_rate": 5.0841332761005e-07, "loss": 0.9383, "step": 990 }, { "epoch": 0.91, "grad_norm": 1.3424090925296945, "learning_rate": 4.592261507001994e-07, "loss": 0.9215, "step": 995 }, { "epoch": 0.92, "grad_norm": 1.3838142145295658, "learning_rate": 4.124850842338779e-07, "loss": 0.9369, "step": 1000 }, { "epoch": 0.92, "grad_norm": 1.4019675844003927, "learning_rate": 3.6820211191520127e-07, "loss": 0.9269, "step": 1005 }, { "epoch": 0.93, "grad_norm": 1.4513095530474966, "learning_rate": 3.263885872300343e-07, "loss": 0.9507, "step": 1010 }, { "epoch": 0.93, "grad_norm": 1.361924533980129, "learning_rate": 2.870552305351382e-07, "loss": 0.9196, "step": 1015 }, { "epoch": 0.94, "grad_norm": 1.3927244960089797, "learning_rate": 2.5021212630962246e-07, "loss": 0.9328, "step": 1020 }, { "epoch": 0.94, "grad_norm": 1.3290530166960066, "learning_rate": 2.158687205694443e-07, "loss": 0.9405, "step": 1025 }, { "epoch": 0.94, "grad_norm": 1.3680987060349954, "learning_rate": 1.840338184455881e-07, "loss": 0.9276, "step": 1030 }, { "epoch": 0.95, "grad_norm": 1.3438546650666667, "learning_rate": 1.5471558192656776e-07, "loss": 0.9221, "step": 1035 }, { "epoch": 0.95, "grad_norm": 1.4512371380173261, "learning_rate": 1.279215277658097e-07, "loss": 0.9336, "step": 1040 }, { "epoch": 0.96, "grad_norm": 1.362018016831518, "learning_rate": 1.0365852555447642e-07, "loss": 0.9175, "step": 1045 }, { "epoch": 0.96, "grad_norm": 1.359560038203717, "learning_rate": 8.19327959602012e-08, "loss": 0.91, "step": 1050 }, { "epoch": 0.97, "grad_norm": 1.3944491658597338, "learning_rate": 6.274990913221035e-08, "loss": 0.946, "step": 1055 }, { "epoch": 0.97, "grad_norm": 1.3012436329306727, "learning_rate": 4.6114783273213395e-08, "loss": 0.9316, "step": 1060 }, { "epoch": 0.98, "grad_norm": 1.4375328358464377, "learning_rate": 3.203168337845508e-08, "loss": 0.9098, "step": 1065 }, { "epoch": 0.98, "grad_norm": 1.321439340704291, "learning_rate": 2.05042201422323e-08, "loss": 0.9084, "step": 1070 }, { "epoch": 0.99, "grad_norm": 1.4259870829955092, "learning_rate": 1.1535349032167908e-08, "loss": 0.9399, "step": 1075 }, { "epoch": 0.99, "grad_norm": 1.4012106970656215, "learning_rate": 5.127369531473525e-09, "loss": 0.9261, "step": 1080 }, { "epoch": 1.0, "grad_norm": 1.3831302460073325, "learning_rate": 1.2819245493955746e-09, "loss": 0.9336, "step": 1085 }, { "epoch": 1.0, "grad_norm": 1.3587054894081556, "learning_rate": 0.0, "loss": 0.9355, "step": 1090 }, { "epoch": 1.0, "eval_loss": 0.9393356442451477, "eval_runtime": 322.3924, "eval_samples_per_second": 47.864, "eval_steps_per_second": 0.751, "step": 1090 }, { "epoch": 1.0, "step": 1090, "total_flos": 456447649382400.0, "train_loss": 0.9795057065989993, "train_runtime": 15753.4108, "train_samples_per_second": 8.85, "train_steps_per_second": 0.069 } ], "logging_steps": 5, "max_steps": 1090, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 456447649382400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }