{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9810067285580444, "eval_steps": 500, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.9777716398239136, "learning_rate": 4.964866706413423e-05, "loss": 1.1427, "step": 500 }, { "epoch": 0.04, "grad_norm": 0.9765903353691101, "learning_rate": 4.929378531073446e-05, "loss": 0.4665, "step": 1000 }, { "epoch": 0.06, "grad_norm": 0.8451493382453918, "learning_rate": 4.89389035573347e-05, "loss": 0.4258, "step": 1500 }, { "epoch": 0.09, "grad_norm": 0.7855414748191833, "learning_rate": 4.8584021803934934e-05, "loss": 0.4124, "step": 2000 }, { "epoch": 0.11, "grad_norm": 0.7474697828292847, "learning_rate": 4.822914005053516e-05, "loss": 0.392, "step": 2500 }, { "epoch": 0.13, "grad_norm": 0.779282808303833, "learning_rate": 4.78742582971354e-05, "loss": 0.3807, "step": 3000 }, { "epoch": 0.15, "grad_norm": 0.723458468914032, "learning_rate": 4.751937654373563e-05, "loss": 0.3664, "step": 3500 }, { "epoch": 0.17, "grad_norm": 0.6058954000473022, "learning_rate": 4.7164494790335864e-05, "loss": 0.3646, "step": 4000 }, { "epoch": 0.19, "grad_norm": 0.6667686104774475, "learning_rate": 4.6809613036936094e-05, "loss": 0.3547, "step": 4500 }, { "epoch": 0.21, "grad_norm": 0.6711506843566895, "learning_rate": 4.645473128353633e-05, "loss": 0.3515, "step": 5000 }, { "epoch": 0.23, "grad_norm": 0.7532833814620972, "learning_rate": 4.6099849530136566e-05, "loss": 0.3473, "step": 5500 }, { "epoch": 0.26, "grad_norm": 0.6181535720825195, "learning_rate": 4.574496777673679e-05, "loss": 0.3409, "step": 6000 }, { "epoch": 0.28, "grad_norm": 0.6952047944068909, "learning_rate": 4.5390086023337024e-05, "loss": 0.3384, "step": 6500 }, { "epoch": 0.3, "grad_norm": 0.715155839920044, "learning_rate": 4.503520426993726e-05, "loss": 0.3385, "step": 7000 }, { "epoch": 0.32, "grad_norm": 0.8195307850837708, "learning_rate": 4.468032251653749e-05, "loss": 0.3292, "step": 7500 }, { "epoch": 0.34, "grad_norm": 0.6436219811439514, "learning_rate": 4.4325440763137726e-05, "loss": 0.3316, "step": 8000 }, { "epoch": 0.36, "grad_norm": 0.7924569845199585, "learning_rate": 4.3970559009737955e-05, "loss": 0.3217, "step": 8500 }, { "epoch": 0.38, "grad_norm": 0.5443514585494995, "learning_rate": 4.361567725633819e-05, "loss": 0.3201, "step": 9000 }, { "epoch": 0.4, "grad_norm": 0.7431919574737549, "learning_rate": 4.326079550293842e-05, "loss": 0.3236, "step": 9500 }, { "epoch": 0.43, "grad_norm": 0.664546549320221, "learning_rate": 4.2905913749538656e-05, "loss": 0.3161, "step": 10000 }, { "epoch": 0.45, "grad_norm": 0.6894972920417786, "learning_rate": 4.255103199613889e-05, "loss": 0.3149, "step": 10500 }, { "epoch": 0.47, "grad_norm": 0.7021653056144714, "learning_rate": 4.219615024273912e-05, "loss": 0.3109, "step": 11000 }, { "epoch": 0.49, "grad_norm": 0.7476543188095093, "learning_rate": 4.184126848933935e-05, "loss": 0.3161, "step": 11500 }, { "epoch": 0.51, "grad_norm": 0.6343545317649841, "learning_rate": 4.148638673593959e-05, "loss": 0.3131, "step": 12000 }, { "epoch": 0.53, "grad_norm": 0.5647464990615845, "learning_rate": 4.113150498253982e-05, "loss": 0.3065, "step": 12500 }, { "epoch": 0.55, "grad_norm": 0.6662362217903137, "learning_rate": 4.077662322914005e-05, "loss": 0.3039, "step": 13000 }, { "epoch": 0.57, "grad_norm": 0.6502272486686707, "learning_rate": 4.042174147574029e-05, "loss": 0.3037, "step": 13500 }, { "epoch": 0.6, 
"grad_norm": 0.6720430254936218, "learning_rate": 4.006685972234052e-05, "loss": 0.3077, "step": 14000 }, { "epoch": 0.62, "grad_norm": 0.5225039124488831, "learning_rate": 3.971268773244755e-05, "loss": 0.3014, "step": 14500 }, { "epoch": 0.64, "grad_norm": 0.567632794380188, "learning_rate": 3.935780597904779e-05, "loss": 0.2981, "step": 15000 }, { "epoch": 0.66, "grad_norm": 0.6883319616317749, "learning_rate": 3.9002924225648016e-05, "loss": 0.3001, "step": 15500 }, { "epoch": 0.68, "grad_norm": 0.6592826247215271, "learning_rate": 3.8648042472248246e-05, "loss": 0.3064, "step": 16000 }, { "epoch": 0.7, "grad_norm": 0.5845515727996826, "learning_rate": 3.829316071884848e-05, "loss": 0.2978, "step": 16500 }, { "epoch": 0.72, "grad_norm": 0.5542165637016296, "learning_rate": 3.793827896544872e-05, "loss": 0.2981, "step": 17000 }, { "epoch": 0.75, "grad_norm": 0.5974834561347961, "learning_rate": 3.758339721204895e-05, "loss": 0.2986, "step": 17500 }, { "epoch": 0.77, "grad_norm": 0.691500186920166, "learning_rate": 3.722851545864918e-05, "loss": 0.2982, "step": 18000 }, { "epoch": 0.79, "grad_norm": 0.5833417773246765, "learning_rate": 3.687363370524941e-05, "loss": 0.2971, "step": 18500 }, { "epoch": 0.81, "grad_norm": 0.5949202179908752, "learning_rate": 3.651875195184964e-05, "loss": 0.298, "step": 19000 }, { "epoch": 0.83, "grad_norm": 0.5094270706176758, "learning_rate": 3.616387019844988e-05, "loss": 0.2979, "step": 19500 }, { "epoch": 0.85, "grad_norm": 0.6082278490066528, "learning_rate": 3.5808988445050114e-05, "loss": 0.2911, "step": 20000 }, { "epoch": 0.87, "grad_norm": 0.4882521629333496, "learning_rate": 3.545481645515714e-05, "loss": 0.2948, "step": 20500 }, { "epoch": 0.89, "grad_norm": 0.5949715375900269, "learning_rate": 3.5099934701757377e-05, "loss": 0.2873, "step": 21000 }, { "epoch": 0.92, "grad_norm": 0.6041129231452942, "learning_rate": 3.4745052948357606e-05, "loss": 0.2909, "step": 21500 }, { "epoch": 0.94, "grad_norm": 0.47851431369781494, "learning_rate": 3.439017119495784e-05, "loss": 0.2837, "step": 22000 }, { "epoch": 0.96, "grad_norm": 0.5174229145050049, "learning_rate": 3.403528944155808e-05, "loss": 0.2888, "step": 22500 }, { "epoch": 0.98, "grad_norm": 0.5259119868278503, "learning_rate": 3.3681117451665105e-05, "loss": 0.2866, "step": 23000 }, { "epoch": 1.0, "eval_loss": 0.25954437255859375, "eval_runtime": 169.43, "eval_samples_per_second": 492.77, "eval_steps_per_second": 30.803, "step": 23482 }, { "epoch": 1.0, "grad_norm": 0.7060836553573608, "learning_rate": 3.3327655225278936e-05, "loss": 0.2907, "step": 23500 }, { "epoch": 1.02, "grad_norm": 0.6116435527801514, "learning_rate": 3.297277347187917e-05, "loss": 0.2635, "step": 24000 }, { "epoch": 1.04, "grad_norm": 0.8953403234481812, "learning_rate": 3.2618601481986205e-05, "loss": 0.2649, "step": 24500 }, { "epoch": 1.06, "grad_norm": 0.45276859402656555, "learning_rate": 3.226371972858644e-05, "loss": 0.2707, "step": 25000 }, { "epoch": 1.09, "grad_norm": 0.5363854169845581, "learning_rate": 3.190883797518667e-05, "loss": 0.2657, "step": 25500 }, { "epoch": 1.11, "grad_norm": 0.6412793397903442, "learning_rate": 3.15539562217869e-05, "loss": 0.2695, "step": 26000 }, { "epoch": 1.13, "grad_norm": 0.6201598644256592, "learning_rate": 3.1199074468387136e-05, "loss": 0.2655, "step": 26500 }, { "epoch": 1.15, "grad_norm": 0.5849852561950684, "learning_rate": 3.084419271498737e-05, "loss": 0.2738, "step": 27000 }, { "epoch": 1.17, "grad_norm": 0.5859747529029846, "learning_rate": 
3.04900207250944e-05, "loss": 0.2666, "step": 27500 }, { "epoch": 1.19, "grad_norm": 0.7325323820114136, "learning_rate": 3.0135848735201432e-05, "loss": 0.265, "step": 28000 }, { "epoch": 1.21, "grad_norm": 0.5811768770217896, "learning_rate": 2.9780966981801668e-05, "loss": 0.2679, "step": 28500 }, { "epoch": 1.23, "grad_norm": 0.5710467100143433, "learning_rate": 2.94260852284019e-05, "loss": 0.2631, "step": 29000 }, { "epoch": 1.26, "grad_norm": 0.49995923042297363, "learning_rate": 2.907120347500213e-05, "loss": 0.2693, "step": 29500 }, { "epoch": 1.28, "grad_norm": 0.4296848475933075, "learning_rate": 2.8716321721602363e-05, "loss": 0.2623, "step": 30000 }, { "epoch": 1.3, "grad_norm": 0.7181215286254883, "learning_rate": 2.8361439968202595e-05, "loss": 0.265, "step": 30500 }, { "epoch": 1.32, "grad_norm": 0.6369624137878418, "learning_rate": 2.800655821480283e-05, "loss": 0.2657, "step": 31000 }, { "epoch": 1.34, "grad_norm": 0.583540141582489, "learning_rate": 2.7651676461403064e-05, "loss": 0.2602, "step": 31500 }, { "epoch": 1.36, "grad_norm": 0.597145140171051, "learning_rate": 2.7296794708003293e-05, "loss": 0.2632, "step": 32000 }, { "epoch": 1.38, "grad_norm": 0.6519151329994202, "learning_rate": 2.6941912954603526e-05, "loss": 0.2649, "step": 32500 }, { "epoch": 1.41, "grad_norm": 0.5827385783195496, "learning_rate": 2.6587031201203762e-05, "loss": 0.2646, "step": 33000 }, { "epoch": 1.43, "grad_norm": 0.5920796990394592, "learning_rate": 2.6232149447803995e-05, "loss": 0.2652, "step": 33500 }, { "epoch": 1.45, "grad_norm": 0.5716321468353271, "learning_rate": 2.5877267694404227e-05, "loss": 0.2617, "step": 34000 }, { "epoch": 1.47, "grad_norm": 0.5394742488861084, "learning_rate": 2.5523805468018058e-05, "loss": 0.2657, "step": 34500 }, { "epoch": 1.49, "grad_norm": 0.7252212166786194, "learning_rate": 2.516892371461829e-05, "loss": 0.2593, "step": 35000 }, { "epoch": 1.51, "grad_norm": 0.5848865509033203, "learning_rate": 2.481475172472532e-05, "loss": 0.2663, "step": 35500 }, { "epoch": 1.53, "grad_norm": 0.5270236134529114, "learning_rate": 2.4459869971325557e-05, "loss": 0.2651, "step": 36000 }, { "epoch": 1.55, "grad_norm": 0.491560697555542, "learning_rate": 2.410498821792579e-05, "loss": 0.2617, "step": 36500 }, { "epoch": 1.58, "grad_norm": 0.7208006381988525, "learning_rate": 2.375010646452602e-05, "loss": 0.2618, "step": 37000 }, { "epoch": 1.6, "grad_norm": 0.5443432927131653, "learning_rate": 2.3395224711126255e-05, "loss": 0.2612, "step": 37500 }, { "epoch": 1.62, "grad_norm": 0.7115702033042908, "learning_rate": 2.3040342957726488e-05, "loss": 0.2606, "step": 38000 }, { "epoch": 1.64, "grad_norm": 0.6229639649391174, "learning_rate": 2.268546120432672e-05, "loss": 0.2576, "step": 38500 }, { "epoch": 1.66, "grad_norm": 0.4943016469478607, "learning_rate": 2.2330579450926953e-05, "loss": 0.2531, "step": 39000 }, { "epoch": 1.68, "grad_norm": 0.5038357973098755, "learning_rate": 2.1975697697527182e-05, "loss": 0.2544, "step": 39500 }, { "epoch": 1.7, "grad_norm": 0.539564311504364, "learning_rate": 2.1620815944127418e-05, "loss": 0.2553, "step": 40000 }, { "epoch": 1.72, "grad_norm": 0.5986380577087402, "learning_rate": 2.126593419072765e-05, "loss": 0.2547, "step": 40500 }, { "epoch": 1.75, "grad_norm": 0.5932564735412598, "learning_rate": 2.0911052437327884e-05, "loss": 0.2633, "step": 41000 }, { "epoch": 1.77, "grad_norm": 0.5432788133621216, "learning_rate": 2.0556170683928116e-05, "loss": 0.261, "step": 41500 }, { "epoch": 1.79, "grad_norm": 
0.5512367486953735, "learning_rate": 2.020128893052835e-05, "loss": 0.2619, "step": 42000 }, { "epoch": 1.81, "grad_norm": 0.6204352378845215, "learning_rate": 1.984640717712858e-05, "loss": 0.2583, "step": 42500 }, { "epoch": 1.83, "grad_norm": 0.5551834106445312, "learning_rate": 1.9491525423728814e-05, "loss": 0.259, "step": 43000 }, { "epoch": 1.85, "grad_norm": 0.5733679533004761, "learning_rate": 1.9136643670329047e-05, "loss": 0.2562, "step": 43500 }, { "epoch": 1.87, "grad_norm": 0.6171718239784241, "learning_rate": 1.878176191692928e-05, "loss": 0.2538, "step": 44000 }, { "epoch": 1.9, "grad_norm": 0.5973256826400757, "learning_rate": 1.8426880163529516e-05, "loss": 0.2539, "step": 44500 }, { "epoch": 1.92, "grad_norm": 0.4978584945201874, "learning_rate": 1.8071998410129745e-05, "loss": 0.2604, "step": 45000 }, { "epoch": 1.94, "grad_norm": 0.4526893198490143, "learning_rate": 1.771782642023678e-05, "loss": 0.2591, "step": 45500 }, { "epoch": 1.96, "grad_norm": 0.48809126019477844, "learning_rate": 1.736294466683701e-05, "loss": 0.2607, "step": 46000 }, { "epoch": 1.98, "grad_norm": 0.6709504723548889, "learning_rate": 1.7008062913437244e-05, "loss": 0.2582, "step": 46500 }, { "epoch": 2.0, "eval_loss": 0.2457771897315979, "eval_runtime": 174.1255, "eval_samples_per_second": 479.482, "eval_steps_per_second": 29.973, "step": 46964 }, { "epoch": 2.0, "grad_norm": 0.51618492603302, "learning_rate": 1.6653890923544277e-05, "loss": 0.2552, "step": 47000 }, { "epoch": 2.02, "grad_norm": 0.4965592324733734, "learning_rate": 1.6299009170144507e-05, "loss": 0.2364, "step": 47500 }, { "epoch": 2.04, "grad_norm": 0.5972116589546204, "learning_rate": 1.5944127416744743e-05, "loss": 0.2414, "step": 48000 }, { "epoch": 2.07, "grad_norm": 0.6402379274368286, "learning_rate": 1.5589245663344972e-05, "loss": 0.2387, "step": 48500 }, { "epoch": 2.09, "grad_norm": 0.4883173704147339, "learning_rate": 1.5234363909945206e-05, "loss": 0.2404, "step": 49000 }, { "epoch": 2.11, "grad_norm": 0.6644130945205688, "learning_rate": 1.487948215654544e-05, "loss": 0.241, "step": 49500 }, { "epoch": 2.13, "grad_norm": 0.530523419380188, "learning_rate": 1.4524600403145672e-05, "loss": 0.2367, "step": 50000 }, { "epoch": 2.15, "grad_norm": 0.6021013855934143, "learning_rate": 1.4169718649745906e-05, "loss": 0.241, "step": 50500 }, { "epoch": 2.17, "grad_norm": 0.7458603978157043, "learning_rate": 1.3814836896346139e-05, "loss": 0.2407, "step": 51000 }, { "epoch": 2.19, "grad_norm": 0.5217610597610474, "learning_rate": 1.3459955142946371e-05, "loss": 0.2372, "step": 51500 }, { "epoch": 2.21, "grad_norm": 0.5092360973358154, "learning_rate": 1.3105783153053403e-05, "loss": 0.2439, "step": 52000 }, { "epoch": 2.24, "grad_norm": 0.4981755018234253, "learning_rate": 1.2750901399653636e-05, "loss": 0.2371, "step": 52500 }, { "epoch": 2.26, "grad_norm": 0.5314450263977051, "learning_rate": 1.2397439173267468e-05, "loss": 0.2398, "step": 53000 }, { "epoch": 2.28, "grad_norm": 0.4607982635498047, "learning_rate": 1.2042557419867701e-05, "loss": 0.2416, "step": 53500 }, { "epoch": 2.3, "grad_norm": 0.551874577999115, "learning_rate": 1.1687675666467934e-05, "loss": 0.238, "step": 54000 }, { "epoch": 2.32, "grad_norm": 0.6242926120758057, "learning_rate": 1.1332793913068166e-05, "loss": 0.2391, "step": 54500 }, { "epoch": 2.34, "grad_norm": 0.6497530341148376, "learning_rate": 1.0978621923175198e-05, "loss": 0.2342, "step": 55000 }, { "epoch": 2.36, "grad_norm": 0.531732976436615, "learning_rate": 1.062374016977543e-05, 
"loss": 0.2406, "step": 55500 }, { "epoch": 2.38, "grad_norm": 0.6677132248878479, "learning_rate": 1.0268858416375663e-05, "loss": 0.2377, "step": 56000 }, { "epoch": 2.41, "grad_norm": 0.44013282656669617, "learning_rate": 9.913976662975896e-06, "loss": 0.2395, "step": 56500 }, { "epoch": 2.43, "grad_norm": 0.5449537038803101, "learning_rate": 9.55909490957613e-06, "loss": 0.2397, "step": 57000 }, { "epoch": 2.45, "grad_norm": 0.576964795589447, "learning_rate": 9.204922919683162e-06, "loss": 0.2385, "step": 57500 }, { "epoch": 2.47, "grad_norm": 0.5394704341888428, "learning_rate": 8.850041166283395e-06, "loss": 0.237, "step": 58000 }, { "epoch": 2.49, "grad_norm": 0.6450115442276001, "learning_rate": 8.495159412883627e-06, "loss": 0.2359, "step": 58500 }, { "epoch": 2.51, "grad_norm": 0.4756045341491699, "learning_rate": 8.14027765948386e-06, "loss": 0.2403, "step": 59000 }, { "epoch": 2.53, "grad_norm": 0.5701329708099365, "learning_rate": 7.785395906084093e-06, "loss": 0.2379, "step": 59500 }, { "epoch": 2.56, "grad_norm": 0.6203391551971436, "learning_rate": 7.4305141526843255e-06, "loss": 0.2376, "step": 60000 }, { "epoch": 2.58, "grad_norm": 0.4860347509384155, "learning_rate": 7.075632399284558e-06, "loss": 0.2399, "step": 60500 }, { "epoch": 2.6, "grad_norm": 0.5657050609588623, "learning_rate": 6.7207506458847925e-06, "loss": 0.2393, "step": 61000 }, { "epoch": 2.62, "grad_norm": 0.5417011380195618, "learning_rate": 6.366578655991824e-06, "loss": 0.2347, "step": 61500 }, { "epoch": 2.64, "grad_norm": 0.535099446773529, "learning_rate": 6.011696902592057e-06, "loss": 0.2391, "step": 62000 }, { "epoch": 2.66, "grad_norm": 0.44926902651786804, "learning_rate": 5.656815149192289e-06, "loss": 0.2404, "step": 62500 }, { "epoch": 2.68, "grad_norm": 0.504742443561554, "learning_rate": 5.301933395792522e-06, "loss": 0.2393, "step": 63000 }, { "epoch": 2.7, "grad_norm": 0.47763538360595703, "learning_rate": 4.947761405899554e-06, "loss": 0.242, "step": 63500 }, { "epoch": 2.73, "grad_norm": 0.6588559746742249, "learning_rate": 4.5928796524997875e-06, "loss": 0.2367, "step": 64000 }, { "epoch": 2.75, "grad_norm": 0.618839681148529, "learning_rate": 4.238707662606819e-06, "loss": 0.2348, "step": 64500 }, { "epoch": 2.77, "grad_norm": 0.5501250624656677, "learning_rate": 3.883825909207053e-06, "loss": 0.2383, "step": 65000 }, { "epoch": 2.79, "grad_norm": 0.594552755355835, "learning_rate": 3.528944155807285e-06, "loss": 0.2356, "step": 65500 }, { "epoch": 2.81, "grad_norm": 0.4626648724079132, "learning_rate": 3.174062402407518e-06, "loss": 0.2398, "step": 66000 }, { "epoch": 2.83, "grad_norm": 0.5078374743461609, "learning_rate": 2.819180649007751e-06, "loss": 0.2322, "step": 66500 }, { "epoch": 2.85, "grad_norm": 0.5001832246780396, "learning_rate": 2.4642988956079835e-06, "loss": 0.2381, "step": 67000 }, { "epoch": 2.87, "grad_norm": 0.4460981786251068, "learning_rate": 2.109417142208216e-06, "loss": 0.2349, "step": 67500 }, { "epoch": 2.9, "grad_norm": 0.5879847407341003, "learning_rate": 1.7545353888084493e-06, "loss": 0.2352, "step": 68000 }, { "epoch": 2.92, "grad_norm": 0.7035759687423706, "learning_rate": 1.3996536354086817e-06, "loss": 0.2365, "step": 68500 }, { "epoch": 2.94, "grad_norm": 0.6572188138961792, "learning_rate": 1.0447718820089146e-06, "loss": 0.2389, "step": 69000 }, { "epoch": 2.96, "grad_norm": 0.5322707891464233, "learning_rate": 6.90599892115947e-07, "loss": 0.2353, "step": 69500 }, { "epoch": 2.98, "grad_norm": 0.5399382710456848, "learning_rate": 
3.3571813871617976e-07, "loss": 0.2391, "step": 70000 } ], "logging_steps": 500, "max_steps": 70446, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.7072376936136704e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }
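The object above is the trainer state from a roughly 3-epoch fine-tuning run (70,446 total steps, per-device batch size 32): training loss is logged every 500 steps under "loss", and each epoch boundary adds an entry with "eval_loss" and eval throughput. A minimal sketch of how one might plot those logged curves is shown below, assuming the state is saved to a file named `trainer_state.json` and that matplotlib is available; both the filename and the plotting choice are assumptions for illustration, not part of the original log.

```python
import json

import matplotlib.pyplot as plt

# Load the trainer state (filename is an assumption; adjust to your checkpoint path).
with open("trainer_state.json") as f:
    state = json.load(f)

train_steps, train_loss = [], []
eval_steps, eval_loss = [], []
for entry in state["log_history"]:
    if "loss" in entry:           # periodic training-loss logs (every 500 steps here)
        train_steps.append(entry["step"])
        train_loss.append(entry["loss"])
    elif "eval_loss" in entry:    # end-of-epoch evaluation entries
        eval_steps.append(entry["step"])
        eval_loss.append(entry["eval_loss"])

plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, "o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.show()
```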