{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992892679459844, "eval_steps": 500, "global_step": 703, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.976748246707771, "learning_rate": 2.8169014084507043e-07, "loss": 1.2046, "step": 1 }, { "epoch": 0.01, "grad_norm": 9.141400774480953, "learning_rate": 1.4084507042253523e-06, "loss": 1.212, "step": 5 }, { "epoch": 0.01, "grad_norm": 9.49864441857399, "learning_rate": 2.8169014084507046e-06, "loss": 1.1487, "step": 10 }, { "epoch": 0.02, "grad_norm": 3.2926935702992415, "learning_rate": 4.225352112676057e-06, "loss": 1.0049, "step": 15 }, { "epoch": 0.03, "grad_norm": 1.7117549060658137, "learning_rate": 5.633802816901409e-06, "loss": 0.9194, "step": 20 }, { "epoch": 0.04, "grad_norm": 1.2735260163319009, "learning_rate": 7.042253521126761e-06, "loss": 0.8716, "step": 25 }, { "epoch": 0.04, "grad_norm": 1.0790301127605728, "learning_rate": 8.450704225352114e-06, "loss": 0.864, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.7414273564854603, "learning_rate": 9.859154929577466e-06, "loss": 0.8359, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.6656956199139998, "learning_rate": 1.1267605633802819e-05, "loss": 0.8614, "step": 40 }, { "epoch": 0.06, "grad_norm": 0.6590669608248467, "learning_rate": 1.2676056338028171e-05, "loss": 0.8375, "step": 45 }, { "epoch": 0.07, "grad_norm": 0.6842437677529626, "learning_rate": 1.4084507042253522e-05, "loss": 0.8306, "step": 50 }, { "epoch": 0.08, "grad_norm": 0.6080481338431981, "learning_rate": 1.5492957746478872e-05, "loss": 0.8391, "step": 55 }, { "epoch": 0.09, "grad_norm": 0.5829909243761902, "learning_rate": 1.6901408450704228e-05, "loss": 0.8211, "step": 60 }, { "epoch": 0.09, "grad_norm": 0.5774288402112024, "learning_rate": 1.830985915492958e-05, "loss": 0.8104, "step": 65 }, { "epoch": 0.1, "grad_norm": 0.6037500273154381, "learning_rate": 1.9718309859154933e-05, "loss": 0.7882, "step": 70 }, { "epoch": 0.11, "grad_norm": 0.5823339744513759, "learning_rate": 1.9998023297700656e-05, "loss": 0.8226, "step": 75 }, { "epoch": 0.11, "grad_norm": 0.5970199949943188, "learning_rate": 1.9989994283927287e-05, "loss": 0.7805, "step": 80 }, { "epoch": 0.12, "grad_norm": 0.5863046182186602, "learning_rate": 1.997579437055642e-05, "loss": 0.8011, "step": 85 }, { "epoch": 0.13, "grad_norm": 0.6404658915624578, "learning_rate": 1.9955432328988437e-05, "loss": 0.8092, "step": 90 }, { "epoch": 0.14, "grad_norm": 0.5889469279145607, "learning_rate": 1.9928920737019735e-05, "loss": 0.8111, "step": 95 }, { "epoch": 0.14, "grad_norm": 0.5999505405000287, "learning_rate": 1.9896275971073326e-05, "loss": 0.8034, "step": 100 }, { "epoch": 0.15, "grad_norm": 0.5958697553643331, "learning_rate": 1.9857518196082964e-05, "loss": 0.783, "step": 105 }, { "epoch": 0.16, "grad_norm": 0.6235891380868392, "learning_rate": 1.981267135303714e-05, "loss": 0.7986, "step": 110 }, { "epoch": 0.16, "grad_norm": 0.6386697510622403, "learning_rate": 1.976176314419051e-05, "loss": 0.8173, "step": 115 }, { "epoch": 0.17, "grad_norm": 0.6223044203965952, "learning_rate": 1.9704825015952005e-05, "loss": 0.7981, "step": 120 }, { "epoch": 0.18, "grad_norm": 0.6801656349332326, "learning_rate": 1.9641892139460133e-05, "loss": 0.8018, "step": 125 }, { "epoch": 0.18, "grad_norm": 0.5736017682468991, "learning_rate": 1.9573003388857476e-05, "loss": 0.7801, "step": 130 }, { "epoch": 0.19, "grad_norm": 0.6377284634315731, "learning_rate": 1.949820131727783e-05, "loss": 0.8233, "step": 135 }, { "epoch": 0.2, "grad_norm": 0.5834471085873422, "learning_rate": 1.9417532130560784e-05, "loss": 0.7792, "step": 140 }, { "epoch": 0.21, "grad_norm": 0.5750039228232332, "learning_rate": 1.933104565871001e-05, "loss": 0.79, "step": 145 }, { "epoch": 0.21, "grad_norm": 0.5788157023616078, "learning_rate": 1.9238795325112867e-05, "loss": 0.7823, "step": 150 }, { "epoch": 0.22, "grad_norm": 0.5905988672413683, "learning_rate": 1.9140838113540347e-05, "loss": 0.7907, "step": 155 }, { "epoch": 0.23, "grad_norm": 0.6036919615732501, "learning_rate": 1.9037234532947737e-05, "loss": 0.7859, "step": 160 }, { "epoch": 0.23, "grad_norm": 0.5744116424871404, "learning_rate": 1.8928048580097758e-05, "loss": 0.7902, "step": 165 }, { "epoch": 0.24, "grad_norm": 0.5858290882445865, "learning_rate": 1.8813347700029244e-05, "loss": 0.7756, "step": 170 }, { "epoch": 0.25, "grad_norm": 0.6203103361508707, "learning_rate": 1.869320274439583e-05, "loss": 0.7945, "step": 175 }, { "epoch": 0.26, "grad_norm": 0.6094752243332237, "learning_rate": 1.8567687927700255e-05, "loss": 0.8089, "step": 180 }, { "epoch": 0.26, "grad_norm": 0.6065177506682852, "learning_rate": 1.8436880781451545e-05, "loss": 0.7866, "step": 185 }, { "epoch": 0.27, "grad_norm": 0.6422514480326399, "learning_rate": 1.8300862106273113e-05, "loss": 0.7661, "step": 190 }, { "epoch": 0.28, "grad_norm": 0.5976731025044114, "learning_rate": 1.8159715921991612e-05, "loss": 0.7518, "step": 195 }, { "epoch": 0.28, "grad_norm": 0.5813547875268166, "learning_rate": 1.801352941573718e-05, "loss": 0.777, "step": 200 }, { "epoch": 0.29, "grad_norm": 0.6079636229603744, "learning_rate": 1.786239288808727e-05, "loss": 0.7517, "step": 205 }, { "epoch": 0.3, "grad_norm": 0.6116028297485965, "learning_rate": 1.770639969728726e-05, "loss": 0.7582, "step": 210 }, { "epoch": 0.31, "grad_norm": 0.6104535774475512, "learning_rate": 1.7545646201582304e-05, "loss": 0.7739, "step": 215 }, { "epoch": 0.31, "grad_norm": 0.6092743685177864, "learning_rate": 1.738023169969608e-05, "loss": 0.7747, "step": 220 }, { "epoch": 0.32, "grad_norm": 0.5879141856316753, "learning_rate": 1.721025836949317e-05, "loss": 0.7601, "step": 225 }, { "epoch": 0.33, "grad_norm": 0.5801521678476529, "learning_rate": 1.703583120486297e-05, "loss": 0.7831, "step": 230 }, { "epoch": 0.33, "grad_norm": 0.6538515323321731, "learning_rate": 1.6857057950864134e-05, "loss": 0.7792, "step": 235 }, { "epoch": 0.34, "grad_norm": 0.6283544032704685, "learning_rate": 1.6674049037169565e-05, "loss": 0.7699, "step": 240 }, { "epoch": 0.35, "grad_norm": 0.5680653535007143, "learning_rate": 1.648691750985314e-05, "loss": 0.7465, "step": 245 }, { "epoch": 0.36, "grad_norm": 0.6108554255786504, "learning_rate": 1.6295778961560242e-05, "loss": 0.7615, "step": 250 }, { "epoch": 0.36, "grad_norm": 0.5595610558009545, "learning_rate": 1.6100751460105244e-05, "loss": 0.7517, "step": 255 }, { "epoch": 0.37, "grad_norm": 0.5638534604629722, "learning_rate": 1.5901955475540087e-05, "loss": 0.7433, "step": 260 }, { "epoch": 0.38, "grad_norm": 0.6290842137244347, "learning_rate": 1.5699513805738942e-05, "loss": 0.7546, "step": 265 }, { "epoch": 0.38, "grad_norm": 0.5899201877158341, "learning_rate": 1.549355150054501e-05, "loss": 0.7586, "step": 270 }, { "epoch": 0.39, "grad_norm": 0.5848454901161203, "learning_rate": 1.5284195784526196e-05, "loss": 0.7435, "step": 275 }, { "epoch": 0.4, "grad_norm": 0.6441906832873957, "learning_rate": 1.5071575978387505e-05, "loss": 0.7557, "step": 280 }, { "epoch": 0.41, "grad_norm": 0.5889076952992223, "learning_rate": 1.4855823419088576e-05, "loss": 0.7523, "step": 285 }, { "epoch": 0.41, "grad_norm": 0.6138750994803515, "learning_rate": 1.4637071378715807e-05, "loss": 0.7466, "step": 290 }, { "epoch": 0.42, "grad_norm": 0.5460450262431842, "learning_rate": 1.4415454982159121e-05, "loss": 0.7575, "step": 295 }, { "epoch": 0.43, "grad_norm": 0.5958584774428966, "learning_rate": 1.419111112364422e-05, "loss": 0.7651, "step": 300 }, { "epoch": 0.43, "grad_norm": 0.5892595546498745, "learning_rate": 1.3964178382171942e-05, "loss": 0.7509, "step": 305 }, { "epoch": 0.44, "grad_norm": 0.5824299769787271, "learning_rate": 1.3734796935916888e-05, "loss": 0.7248, "step": 310 }, { "epoch": 0.45, "grad_norm": 0.564819905580984, "learning_rate": 1.3503108475638244e-05, "loss": 0.7288, "step": 315 }, { "epoch": 0.45, "grad_norm": 0.6263083055380363, "learning_rate": 1.326925611715627e-05, "loss": 0.7559, "step": 320 }, { "epoch": 0.46, "grad_norm": 0.5885749596082486, "learning_rate": 1.3033384312948487e-05, "loss": 0.7448, "step": 325 }, { "epoch": 0.47, "grad_norm": 0.6217412195838338, "learning_rate": 1.2795638762920254e-05, "loss": 0.742, "step": 330 }, { "epoch": 0.48, "grad_norm": 0.6379002045294617, "learning_rate": 1.2556166324404747e-05, "loss": 0.7099, "step": 335 }, { "epoch": 0.48, "grad_norm": 0.5539140087654301, "learning_rate": 1.2315114921448012e-05, "loss": 0.7362, "step": 340 }, { "epoch": 0.49, "grad_norm": 0.5709158729850722, "learning_rate": 1.2072633453435092e-05, "loss": 0.7259, "step": 345 }, { "epoch": 0.5, "grad_norm": 0.6103140402089173, "learning_rate": 1.1828871703113686e-05, "loss": 0.7525, "step": 350 }, { "epoch": 0.5, "grad_norm": 0.6006237395857831, "learning_rate": 1.158398024407215e-05, "loss": 0.7443, "step": 355 }, { "epoch": 0.51, "grad_norm": 0.5730036044160878, "learning_rate": 1.1338110347728973e-05, "loss": 0.7488, "step": 360 }, { "epoch": 0.52, "grad_norm": 0.6357913417816614, "learning_rate": 1.1091413889891211e-05, "loss": 0.7451, "step": 365 }, { "epoch": 0.53, "grad_norm": 0.6484616619442031, "learning_rate": 1.0844043256939585e-05, "loss": 0.7513, "step": 370 }, { "epoch": 0.53, "grad_norm": 0.6147553788096094, "learning_rate": 1.05961512516982e-05, "loss": 0.7547, "step": 375 }, { "epoch": 0.54, "grad_norm": 0.6026348227570094, "learning_rate": 1.0347890999046998e-05, "loss": 0.7291, "step": 380 }, { "epoch": 0.55, "grad_norm": 0.5607221603847458, "learning_rate": 1.00994158513353e-05, "loss": 0.729, "step": 385 }, { "epoch": 0.55, "grad_norm": 0.5908644383211163, "learning_rate": 9.850879293654829e-06, "loss": 0.7319, "step": 390 }, { "epoch": 0.56, "grad_norm": 0.5697397462576617, "learning_rate": 9.602434849030747e-06, "loss": 0.7522, "step": 395 }, { "epoch": 0.57, "grad_norm": 0.5939805294573912, "learning_rate": 9.354235983589229e-06, "loss": 0.7285, "step": 400 }, { "epoch": 0.58, "grad_norm": 0.5755564582128363, "learning_rate": 9.106436011760229e-06, "loss": 0.7102, "step": 405 }, { "epoch": 0.58, "grad_norm": 0.6451432716799574, "learning_rate": 8.859188001573916e-06, "loss": 0.7511, "step": 410 }, { "epoch": 0.59, "grad_norm": 0.5843168824282191, "learning_rate": 8.61264468010932e-06, "loss": 0.7022, "step": 415 }, { "epoch": 0.6, "grad_norm": 0.6112413168467156, "learning_rate": 8.3669583391536e-06, "loss": 0.7275, "step": 420 }, { "epoch": 0.6, "grad_norm": 0.581022238822059, "learning_rate": 8.122280741130177e-06, "loss": 0.737, "step": 425 }, { "epoch": 0.61, "grad_norm": 0.5858683287529846, "learning_rate": 7.878763025353875e-06, "loss": 0.7456, "step": 430 }, { "epoch": 0.62, "grad_norm": 0.5983662476814803, "learning_rate": 7.636555614670953e-06, "loss": 0.7443, "step": 435 }, { "epoch": 0.63, "grad_norm": 0.5853808275325328, "learning_rate": 7.395808122541697e-06, "loss": 0.7456, "step": 440 }, { "epoch": 0.63, "grad_norm": 0.6178373733573391, "learning_rate": 7.156669260622997e-06, "loss": 0.7272, "step": 445 }, { "epoch": 0.64, "grad_norm": 0.5767484461478896, "learning_rate": 6.9192867469079625e-06, "loss": 0.7355, "step": 450 }, { "epoch": 0.65, "grad_norm": 0.6095308999494778, "learning_rate": 6.683807214479323e-06, "loss": 0.737, "step": 455 }, { "epoch": 0.65, "grad_norm": 0.5757726348096623, "learning_rate": 6.450376120933008e-06, "loss": 0.725, "step": 460 }, { "epoch": 0.66, "grad_norm": 0.6084178997751468, "learning_rate": 6.219137658527819e-06, "loss": 0.7484, "step": 465 }, { "epoch": 0.67, "grad_norm": 0.5625041815811782, "learning_rate": 5.990234665116713e-06, "loss": 0.7322, "step": 470 }, { "epoch": 0.68, "grad_norm": 0.6160006018394228, "learning_rate": 5.7638085359147235e-06, "loss": 0.7235, "step": 475 }, { "epoch": 0.68, "grad_norm": 0.6035240580848604, "learning_rate": 5.539999136157977e-06, "loss": 0.7094, "step": 480 }, { "epoch": 0.69, "grad_norm": 0.5166781636185539, "learning_rate": 5.318944714707861e-06, "loss": 0.7209, "step": 485 }, { "epoch": 0.7, "grad_norm": 0.5822772862847312, "learning_rate": 5.100781818653549e-06, "loss": 0.7088, "step": 490 }, { "epoch": 0.7, "grad_norm": 0.6227541043433473, "learning_rate": 4.885645208965779e-06, "loss": 0.7295, "step": 495 }, { "epoch": 0.71, "grad_norm": 0.54846199474894, "learning_rate": 4.673667777253944e-06, "loss": 0.7452, "step": 500 }, { "epoch": 0.72, "grad_norm": 0.6040144782037625, "learning_rate": 4.464980463677846e-06, "loss": 0.736, "step": 505 }, { "epoch": 0.72, "grad_norm": 0.5847697359147894, "learning_rate": 4.25971217606493e-06, "loss": 0.7364, "step": 510 }, { "epoch": 0.73, "grad_norm": 0.597219775666177, "learning_rate": 4.057989710282897e-06, "loss": 0.7288, "step": 515 }, { "epoch": 0.74, "grad_norm": 0.5379233895836751, "learning_rate": 3.859937671916833e-06, "loss": 0.7383, "step": 520 }, { "epoch": 0.75, "grad_norm": 0.6251650029425307, "learning_rate": 3.6656783992993885e-06, "loss": 0.7264, "step": 525 }, { "epoch": 0.75, "grad_norm": 0.6054608357116987, "learning_rate": 3.475331887941388e-06, "loss": 0.7384, "step": 530 }, { "epoch": 0.76, "grad_norm": 0.5551297177270929, "learning_rate": 3.2890157164096315e-06, "loss": 0.7398, "step": 535 }, { "epoch": 0.77, "grad_norm": 0.6396725383662202, "learning_rate": 3.1068449736977015e-06, "loss": 0.7341, "step": 540 }, { "epoch": 0.77, "grad_norm": 0.5648152571609389, "learning_rate": 2.9289321881345257e-06, "loss": 0.7244, "step": 545 }, { "epoch": 0.78, "grad_norm": 0.57252018708692, "learning_rate": 2.755387257874764e-06, "loss": 0.7228, "step": 550 }, { "epoch": 0.79, "grad_norm": 0.5878135364575673, "learning_rate": 2.5863173830138212e-06, "loss": 0.7181, "step": 555 }, { "epoch": 0.8, "grad_norm": 0.5840059710891027, "learning_rate": 2.4218269993694733e-06, "loss": 0.7286, "step": 560 }, { "epoch": 0.8, "grad_norm": 0.5438192269829342, "learning_rate": 2.262017713971063e-06, "loss": 0.712, "step": 565 }, { "epoch": 0.81, "grad_norm": 0.5350040038892513, "learning_rate": 2.106988242295981e-06, "loss": 0.7312, "step": 570 }, { "epoch": 0.82, "grad_norm": 0.5423604435174225, "learning_rate": 1.9568343472923524e-06, "loss": 0.7155, "step": 575 }, { "epoch": 0.82, "grad_norm": 0.6113034501153347, "learning_rate": 1.8116487802254868e-06, "loss": 0.7282, "step": 580 }, { "epoch": 0.83, "grad_norm": 0.6023686669705883, "learning_rate": 1.6715212233846656e-06, "loss": 0.7159, "step": 585 }, { "epoch": 0.84, "grad_norm": 0.5618755550032292, "learning_rate": 1.5365382346857005e-06, "loss": 0.7349, "step": 590 }, { "epoch": 0.85, "grad_norm": 0.5579514856251387, "learning_rate": 1.4067831942033904e-06, "loss": 0.7106, "step": 595 }, { "epoch": 0.85, "grad_norm": 0.5910581925001689, "learning_rate": 1.2823362526669825e-06, "loss": 0.7074, "step": 600 }, { "epoch": 0.86, "grad_norm": 0.5595582019084373, "learning_rate": 1.1632742819504406e-06, "loss": 0.7244, "step": 605 }, { "epoch": 0.87, "grad_norm": 0.6034146234727371, "learning_rate": 1.0496708275880497e-06, "loss": 0.7237, "step": 610 }, { "epoch": 0.87, "grad_norm": 0.5913679291364223, "learning_rate": 9.415960633447674e-07, "loss": 0.7158, "step": 615 }, { "epoch": 0.88, "grad_norm": 0.5694936405039339, "learning_rate": 8.391167478693241e-07, "loss": 0.7209, "step": 620 }, { "epoch": 0.89, "grad_norm": 0.5805765466263503, "learning_rate": 7.422961834568565e-07, "loss": 0.7097, "step": 625 }, { "epoch": 0.9, "grad_norm": 0.5641375110299046, "learning_rate": 6.51194176946588e-07, "loss": 0.711, "step": 630 }, { "epoch": 0.9, "grad_norm": 0.5928177799223828, "learning_rate": 5.658670027786561e-07, "loss": 0.713, "step": 635 }, { "epoch": 0.91, "grad_norm": 0.6022673346018554, "learning_rate": 4.863673682329373e-07, "loss": 0.7395, "step": 640 }, { "epoch": 0.92, "grad_norm": 0.588777849626458, "learning_rate": 4.1274438087135273e-07, "loss": 0.7435, "step": 645 }, { "epoch": 0.92, "grad_norm": 0.52867680060555, "learning_rate": 3.450435182037104e-07, "loss": 0.6871, "step": 650 }, { "epoch": 0.93, "grad_norm": 0.5676798921047626, "learning_rate": 2.8330659959589944e-07, "loss": 0.727, "step": 655 }, { "epoch": 0.94, "grad_norm": 0.5666677388076765, "learning_rate": 2.275717604377292e-07, "loss": 0.7329, "step": 660 }, { "epoch": 0.95, "grad_norm": 0.5389907093110754, "learning_rate": 1.7787342858638589e-07, "loss": 0.7136, "step": 665 }, { "epoch": 0.95, "grad_norm": 0.571817712811333, "learning_rate": 1.3424230310007946e-07, "loss": 0.739, "step": 670 }, { "epoch": 0.96, "grad_norm": 0.5830917030629162, "learning_rate": 9.670533527498139e-08, "loss": 0.7209, "step": 675 }, { "epoch": 0.97, "grad_norm": 0.5837803782222483, "learning_rate": 6.528571199719502e-08, "loss": 0.7322, "step": 680 }, { "epoch": 0.97, "grad_norm": 0.5711509584660898, "learning_rate": 4.000284142003264e-08, "loss": 0.7113, "step": 685 }, { "epoch": 0.98, "grad_norm": 0.5826105062040772, "learning_rate": 2.0872340975438555e-08, "loss": 0.7327, "step": 690 }, { "epoch": 0.99, "grad_norm": 0.5724533629251033, "learning_rate": 7.906027726981568e-09, "loss": 0.715, "step": 695 }, { "epoch": 1.0, "grad_norm": 0.5578776974422313, "learning_rate": 1.111911070356131e-09, "loss": 0.7072, "step": 700 }, { "epoch": 1.0, "eval_loss": 0.7502214908599854, "eval_runtime": 9.4753, "eval_samples_per_second": 52.769, "eval_steps_per_second": 1.689, "step": 703 }, { "epoch": 1.0, "step": 703, "total_flos": 101830592102400.0, "train_loss": 0.7627222812701425, "train_runtime": 5862.0034, "train_samples_per_second": 15.353, "train_steps_per_second": 0.12 } ], "logging_steps": 5, "max_steps": 703, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 101830592102400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }