{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996825396825397, "eval_steps": 500, "global_step": 118000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012698412698412698, "grad_norm": 0.4761015474796295, "learning_rate": 1.9915343915343918e-05, "loss": 0.0794, "step": 500 }, { "epoch": 0.025396825396825397, "grad_norm": 0.43550318479537964, "learning_rate": 1.983068783068783e-05, "loss": 0.0811, "step": 1000 }, { "epoch": 0.0380952380952381, "grad_norm": 0.4672704339027405, "learning_rate": 1.9746031746031748e-05, "loss": 0.0819, "step": 1500 }, { "epoch": 0.050793650793650794, "grad_norm": 0.5426394939422607, "learning_rate": 1.9661375661375664e-05, "loss": 0.0829, "step": 2000 }, { "epoch": 0.06349206349206349, "grad_norm": 0.3974975645542145, "learning_rate": 1.9576719576719577e-05, "loss": 0.0816, "step": 2500 }, { "epoch": 0.0761904761904762, "grad_norm": 0.6599302887916565, "learning_rate": 1.9492063492063494e-05, "loss": 0.0815, "step": 3000 }, { "epoch": 0.08888888888888889, "grad_norm": 0.35329556465148926, "learning_rate": 1.9407407407407407e-05, "loss": 0.0841, "step": 3500 }, { "epoch": 0.10158730158730159, "grad_norm": 0.42421749234199524, "learning_rate": 1.9322751322751327e-05, "loss": 0.0833, "step": 4000 }, { "epoch": 0.11428571428571428, "grad_norm": 0.4479866325855255, "learning_rate": 1.923809523809524e-05, "loss": 0.0839, "step": 4500 }, { "epoch": 0.12698412698412698, "grad_norm": 0.372086763381958, "learning_rate": 1.9153439153439156e-05, "loss": 0.0835, "step": 5000 }, { "epoch": 0.13968253968253969, "grad_norm": 0.38730981945991516, "learning_rate": 1.906878306878307e-05, "loss": 0.0841, "step": 5500 }, { "epoch": 0.1523809523809524, "grad_norm": 0.5003937482833862, "learning_rate": 1.8984126984126986e-05, "loss": 0.0829, "step": 6000 }, { "epoch": 0.16507936507936508, "grad_norm": 0.42826735973358154, "learning_rate": 1.8899470899470903e-05, "loss": 0.0835, "step": 6500 }, { "epoch": 0.17777777777777778, "grad_norm": 0.49070820212364197, "learning_rate": 1.8814814814814816e-05, "loss": 0.0827, "step": 7000 }, { "epoch": 0.19047619047619047, "grad_norm": 0.4903796911239624, "learning_rate": 1.8730158730158732e-05, "loss": 0.0823, "step": 7500 }, { "epoch": 0.20317460317460317, "grad_norm": 0.4144362211227417, "learning_rate": 1.8645502645502645e-05, "loss": 0.0842, "step": 8000 }, { "epoch": 0.21587301587301588, "grad_norm": 0.6519999504089355, "learning_rate": 1.8560846560846562e-05, "loss": 0.0827, "step": 8500 }, { "epoch": 0.22857142857142856, "grad_norm": 0.37082576751708984, "learning_rate": 1.8476190476190478e-05, "loss": 0.0835, "step": 9000 }, { "epoch": 0.24126984126984127, "grad_norm": 0.319024920463562, "learning_rate": 1.8391534391534395e-05, "loss": 0.0829, "step": 9500 }, { "epoch": 0.25396825396825395, "grad_norm": 0.4173873960971832, "learning_rate": 1.8306878306878308e-05, "loss": 0.0814, "step": 10000 }, { "epoch": 0.26666666666666666, "grad_norm": 0.4521333873271942, "learning_rate": 1.8222222222222224e-05, "loss": 0.0825, "step": 10500 }, { "epoch": 0.27936507936507937, "grad_norm": 0.4372086822986603, "learning_rate": 1.8137566137566137e-05, "loss": 0.0844, "step": 11000 }, { "epoch": 0.2920634920634921, "grad_norm": 0.40673378109931946, "learning_rate": 1.8052910052910054e-05, "loss": 0.0846, "step": 11500 }, { "epoch": 0.3047619047619048, "grad_norm": 0.524502694606781, "learning_rate": 1.796825396825397e-05, "loss": 0.0843, "step": 12000 }, { "epoch": 0.31746031746031744, "grad_norm": 0.36854442954063416, "learning_rate": 1.7883597883597884e-05, "loss": 0.0838, "step": 12500 }, { "epoch": 0.33015873015873015, "grad_norm": 0.4694221019744873, "learning_rate": 1.77989417989418e-05, "loss": 0.0834, "step": 13000 }, { "epoch": 0.34285714285714286, "grad_norm": 0.384512335062027, "learning_rate": 1.7714285714285717e-05, "loss": 0.0825, "step": 13500 }, { "epoch": 0.35555555555555557, "grad_norm": 0.3776947855949402, "learning_rate": 1.7629629629629633e-05, "loss": 0.081, "step": 14000 }, { "epoch": 0.3682539682539683, "grad_norm": 0.44691145420074463, "learning_rate": 1.7544973544973546e-05, "loss": 0.0844, "step": 14500 }, { "epoch": 0.38095238095238093, "grad_norm": 0.38754552602767944, "learning_rate": 1.7460317460317463e-05, "loss": 0.0834, "step": 15000 }, { "epoch": 0.39365079365079364, "grad_norm": 0.3924926221370697, "learning_rate": 1.7375661375661376e-05, "loss": 0.0836, "step": 15500 }, { "epoch": 0.40634920634920635, "grad_norm": 0.41219380497932434, "learning_rate": 1.7291005291005292e-05, "loss": 0.0827, "step": 16000 }, { "epoch": 0.41904761904761906, "grad_norm": 0.36697277426719666, "learning_rate": 1.720634920634921e-05, "loss": 0.0833, "step": 16500 }, { "epoch": 0.43174603174603177, "grad_norm": 0.37833482027053833, "learning_rate": 1.7121693121693125e-05, "loss": 0.0831, "step": 17000 }, { "epoch": 0.4444444444444444, "grad_norm": 0.33408552408218384, "learning_rate": 1.7037037037037038e-05, "loss": 0.0818, "step": 17500 }, { "epoch": 0.45714285714285713, "grad_norm": 0.4245634377002716, "learning_rate": 1.6952380952380955e-05, "loss": 0.0838, "step": 18000 }, { "epoch": 0.46984126984126984, "grad_norm": 0.4424809217453003, "learning_rate": 1.6867724867724868e-05, "loss": 0.0828, "step": 18500 }, { "epoch": 0.48253968253968255, "grad_norm": 0.47369641065597534, "learning_rate": 1.6783068783068784e-05, "loss": 0.0828, "step": 19000 }, { "epoch": 0.49523809523809526, "grad_norm": 0.417057603597641, "learning_rate": 1.66984126984127e-05, "loss": 0.0839, "step": 19500 }, { "epoch": 0.5079365079365079, "grad_norm": 0.450612336397171, "learning_rate": 1.6613756613756614e-05, "loss": 0.0832, "step": 20000 }, { "epoch": 0.5206349206349207, "grad_norm": 0.35937097668647766, "learning_rate": 1.652910052910053e-05, "loss": 0.0816, "step": 20500 }, { "epoch": 0.5333333333333333, "grad_norm": 0.4366040527820587, "learning_rate": 1.6444444444444444e-05, "loss": 0.0817, "step": 21000 }, { "epoch": 0.546031746031746, "grad_norm": 0.3630824387073517, "learning_rate": 1.6359788359788363e-05, "loss": 0.0823, "step": 21500 }, { "epoch": 0.5587301587301587, "grad_norm": 0.45653077960014343, "learning_rate": 1.6275132275132277e-05, "loss": 0.0814, "step": 22000 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4124685525894165, "learning_rate": 1.6190476190476193e-05, "loss": 0.0828, "step": 22500 }, { "epoch": 0.5841269841269842, "grad_norm": 0.4182330071926117, "learning_rate": 1.6105820105820106e-05, "loss": 0.0825, "step": 23000 }, { "epoch": 0.5968253968253968, "grad_norm": 0.7457558512687683, "learning_rate": 1.6021164021164023e-05, "loss": 0.0828, "step": 23500 }, { "epoch": 0.6095238095238096, "grad_norm": 0.41049671173095703, "learning_rate": 1.5936507936507936e-05, "loss": 0.0831, "step": 24000 }, { "epoch": 0.6222222222222222, "grad_norm": 0.4230283498764038, "learning_rate": 1.5851851851851852e-05, "loss": 0.0823, "step": 24500 }, { "epoch": 0.6349206349206349, "grad_norm": 0.38568949699401855, "learning_rate": 1.576719576719577e-05, "loss": 0.0811, "step": 25000 }, { "epoch": 0.6476190476190476, "grad_norm": 0.42709481716156006, "learning_rate": 1.5682539682539685e-05, "loss": 0.0818, "step": 25500 }, { "epoch": 0.6603174603174603, "grad_norm": 0.37508589029312134, "learning_rate": 1.55978835978836e-05, "loss": 0.0828, "step": 26000 }, { "epoch": 0.6730158730158731, "grad_norm": 0.43134260177612305, "learning_rate": 1.5513227513227515e-05, "loss": 0.0824, "step": 26500 }, { "epoch": 0.6857142857142857, "grad_norm": 0.37693992257118225, "learning_rate": 1.542857142857143e-05, "loss": 0.0811, "step": 27000 }, { "epoch": 0.6984126984126984, "grad_norm": 0.34098678827285767, "learning_rate": 1.5343915343915344e-05, "loss": 0.0819, "step": 27500 }, { "epoch": 0.7111111111111111, "grad_norm": 0.47179728746414185, "learning_rate": 1.525925925925926e-05, "loss": 0.082, "step": 28000 }, { "epoch": 0.7238095238095238, "grad_norm": 0.4184609651565552, "learning_rate": 1.5174603174603176e-05, "loss": 0.0825, "step": 28500 }, { "epoch": 0.7365079365079366, "grad_norm": 0.3582792282104492, "learning_rate": 1.508994708994709e-05, "loss": 0.0821, "step": 29000 }, { "epoch": 0.7492063492063492, "grad_norm": 0.5200299620628357, "learning_rate": 1.5005291005291007e-05, "loss": 0.0817, "step": 29500 }, { "epoch": 0.7619047619047619, "grad_norm": 0.4461567997932434, "learning_rate": 1.4920634920634922e-05, "loss": 0.0814, "step": 30000 }, { "epoch": 0.7746031746031746, "grad_norm": 0.3920634388923645, "learning_rate": 1.4835978835978837e-05, "loss": 0.0819, "step": 30500 }, { "epoch": 0.7873015873015873, "grad_norm": 0.41001540422439575, "learning_rate": 1.4751322751322751e-05, "loss": 0.0802, "step": 31000 }, { "epoch": 0.8, "grad_norm": 0.4187995493412018, "learning_rate": 1.4666666666666666e-05, "loss": 0.0816, "step": 31500 }, { "epoch": 0.8126984126984127, "grad_norm": 0.39321765303611755, "learning_rate": 1.4582010582010584e-05, "loss": 0.0824, "step": 32000 }, { "epoch": 0.8253968253968254, "grad_norm": 0.3958302140235901, "learning_rate": 1.44973544973545e-05, "loss": 0.0801, "step": 32500 }, { "epoch": 0.8380952380952381, "grad_norm": 0.3932056725025177, "learning_rate": 1.4412698412698414e-05, "loss": 0.0808, "step": 33000 }, { "epoch": 0.8507936507936508, "grad_norm": 0.3314465284347534, "learning_rate": 1.4328042328042329e-05, "loss": 0.0827, "step": 33500 }, { "epoch": 0.8634920634920635, "grad_norm": 0.43675485253334045, "learning_rate": 1.4243386243386244e-05, "loss": 0.0811, "step": 34000 }, { "epoch": 0.8761904761904762, "grad_norm": 0.6284595131874084, "learning_rate": 1.415873015873016e-05, "loss": 0.0805, "step": 34500 }, { "epoch": 0.8888888888888888, "grad_norm": 0.39293691515922546, "learning_rate": 1.4074074074074075e-05, "loss": 0.0803, "step": 35000 }, { "epoch": 0.9015873015873016, "grad_norm": 0.4092639088630676, "learning_rate": 1.398941798941799e-05, "loss": 0.0813, "step": 35500 }, { "epoch": 0.9142857142857143, "grad_norm": 0.41005492210388184, "learning_rate": 1.3904761904761905e-05, "loss": 0.0811, "step": 36000 }, { "epoch": 0.926984126984127, "grad_norm": 0.5190646052360535, "learning_rate": 1.3820105820105821e-05, "loss": 0.0811, "step": 36500 }, { "epoch": 0.9396825396825397, "grad_norm": 0.32034316658973694, "learning_rate": 1.3735449735449738e-05, "loss": 0.0812, "step": 37000 }, { "epoch": 0.9523809523809523, "grad_norm": 0.4857613742351532, "learning_rate": 1.3650793650793652e-05, "loss": 0.0813, "step": 37500 }, { "epoch": 0.9650793650793651, "grad_norm": 0.4523787796497345, "learning_rate": 1.3566137566137567e-05, "loss": 0.0816, "step": 38000 }, { "epoch": 0.9777777777777777, "grad_norm": 0.4204433262348175, "learning_rate": 1.3481481481481482e-05, "loss": 0.0806, "step": 38500 }, { "epoch": 0.9904761904761905, "grad_norm": 0.4313475787639618, "learning_rate": 1.3396825396825397e-05, "loss": 0.0806, "step": 39000 }, { "epoch": 1.0, "eval_loss": 0.07647726684808731, "eval_runtime": 270.8786, "eval_samples_per_second": 516.837, "eval_steps_per_second": 64.605, "step": 39375 }, { "epoch": 1.0031746031746032, "grad_norm": 0.44939786195755005, "learning_rate": 1.3312169312169313e-05, "loss": 0.0795, "step": 39500 }, { "epoch": 1.0158730158730158, "grad_norm": 0.48013949394226074, "learning_rate": 1.322751322751323e-05, "loss": 0.0751, "step": 40000 }, { "epoch": 1.0285714285714285, "grad_norm": 0.4185923635959625, "learning_rate": 1.3142857142857145e-05, "loss": 0.0736, "step": 40500 }, { "epoch": 1.0412698412698413, "grad_norm": 0.397386759519577, "learning_rate": 1.305820105820106e-05, "loss": 0.0748, "step": 41000 }, { "epoch": 1.053968253968254, "grad_norm": 0.39524805545806885, "learning_rate": 1.2973544973544974e-05, "loss": 0.0735, "step": 41500 }, { "epoch": 1.0666666666666667, "grad_norm": 0.34505075216293335, "learning_rate": 1.288888888888889e-05, "loss": 0.0746, "step": 42000 }, { "epoch": 1.0793650793650793, "grad_norm": 0.37381839752197266, "learning_rate": 1.2804232804232805e-05, "loss": 0.0728, "step": 42500 }, { "epoch": 1.0920634920634922, "grad_norm": 0.6797782182693481, "learning_rate": 1.271957671957672e-05, "loss": 0.0741, "step": 43000 }, { "epoch": 1.1047619047619048, "grad_norm": 0.41272956132888794, "learning_rate": 1.2634920634920635e-05, "loss": 0.0738, "step": 43500 }, { "epoch": 1.1174603174603175, "grad_norm": 0.382468044757843, "learning_rate": 1.255026455026455e-05, "loss": 0.0738, "step": 44000 }, { "epoch": 1.1301587301587301, "grad_norm": 0.3978229761123657, "learning_rate": 1.2465608465608468e-05, "loss": 0.074, "step": 44500 }, { "epoch": 1.1428571428571428, "grad_norm": 0.3431326746940613, "learning_rate": 1.2380952380952383e-05, "loss": 0.0745, "step": 45000 }, { "epoch": 1.1555555555555554, "grad_norm": 0.3610997200012207, "learning_rate": 1.2296296296296298e-05, "loss": 0.0729, "step": 45500 }, { "epoch": 1.1682539682539683, "grad_norm": 0.49680083990097046, "learning_rate": 1.2211640211640212e-05, "loss": 0.0732, "step": 46000 }, { "epoch": 1.180952380952381, "grad_norm": 0.3833047151565552, "learning_rate": 1.2126984126984127e-05, "loss": 0.0732, "step": 46500 }, { "epoch": 1.1936507936507936, "grad_norm": 0.2808152139186859, "learning_rate": 1.2042328042328044e-05, "loss": 0.0733, "step": 47000 }, { "epoch": 1.2063492063492063, "grad_norm": 0.5429581999778748, "learning_rate": 1.1957671957671959e-05, "loss": 0.0729, "step": 47500 }, { "epoch": 1.2190476190476192, "grad_norm": 0.34248363971710205, "learning_rate": 1.1873015873015873e-05, "loss": 0.0746, "step": 48000 }, { "epoch": 1.2317460317460318, "grad_norm": 0.5099675059318542, "learning_rate": 1.1788359788359788e-05, "loss": 0.0739, "step": 48500 }, { "epoch": 1.2444444444444445, "grad_norm": 0.3858914375305176, "learning_rate": 1.1703703703703703e-05, "loss": 0.0721, "step": 49000 }, { "epoch": 1.2571428571428571, "grad_norm": 0.3453405201435089, "learning_rate": 1.1619047619047621e-05, "loss": 0.0737, "step": 49500 }, { "epoch": 1.2698412698412698, "grad_norm": 0.4647195637226105, "learning_rate": 1.1534391534391536e-05, "loss": 0.0736, "step": 50000 }, { "epoch": 1.2825396825396824, "grad_norm": 0.4548490345478058, "learning_rate": 1.144973544973545e-05, "loss": 0.0742, "step": 50500 }, { "epoch": 1.2952380952380953, "grad_norm": 0.4145970046520233, "learning_rate": 1.1365079365079366e-05, "loss": 0.0748, "step": 51000 }, { "epoch": 1.307936507936508, "grad_norm": 0.4032251536846161, "learning_rate": 1.128042328042328e-05, "loss": 0.073, "step": 51500 }, { "epoch": 1.3206349206349206, "grad_norm": 0.5053452849388123, "learning_rate": 1.1195767195767197e-05, "loss": 0.0742, "step": 52000 }, { "epoch": 1.3333333333333333, "grad_norm": 0.42281991243362427, "learning_rate": 1.1111111111111113e-05, "loss": 0.0728, "step": 52500 }, { "epoch": 1.3460317460317461, "grad_norm": 0.4088720679283142, "learning_rate": 1.1026455026455028e-05, "loss": 0.0737, "step": 53000 }, { "epoch": 1.3587301587301588, "grad_norm": 0.4682016968727112, "learning_rate": 1.0941798941798943e-05, "loss": 0.0754, "step": 53500 }, { "epoch": 1.3714285714285714, "grad_norm": 0.35886242985725403, "learning_rate": 1.0857142857142858e-05, "loss": 0.0739, "step": 54000 }, { "epoch": 1.384126984126984, "grad_norm": 0.5034026503562927, "learning_rate": 1.0772486772486774e-05, "loss": 0.0744, "step": 54500 }, { "epoch": 1.3968253968253967, "grad_norm": 0.6038418412208557, "learning_rate": 1.0687830687830689e-05, "loss": 0.073, "step": 55000 }, { "epoch": 1.4095238095238094, "grad_norm": 0.4263134002685547, "learning_rate": 1.0603174603174604e-05, "loss": 0.0743, "step": 55500 }, { "epoch": 1.4222222222222223, "grad_norm": 0.3092331886291504, "learning_rate": 1.0518518518518519e-05, "loss": 0.0747, "step": 56000 }, { "epoch": 1.434920634920635, "grad_norm": 0.41775885224342346, "learning_rate": 1.0433862433862433e-05, "loss": 0.0736, "step": 56500 }, { "epoch": 1.4476190476190476, "grad_norm": 0.3818839192390442, "learning_rate": 1.0349206349206352e-05, "loss": 0.0736, "step": 57000 }, { "epoch": 1.4603174603174602, "grad_norm": 0.42527565360069275, "learning_rate": 1.0264550264550266e-05, "loss": 0.0741, "step": 57500 }, { "epoch": 1.4730158730158731, "grad_norm": 0.37903305888175964, "learning_rate": 1.0179894179894181e-05, "loss": 0.0727, "step": 58000 }, { "epoch": 1.4857142857142858, "grad_norm": 0.41770797967910767, "learning_rate": 1.0095238095238096e-05, "loss": 0.0733, "step": 58500 }, { "epoch": 1.4984126984126984, "grad_norm": 0.6334396600723267, "learning_rate": 1.001058201058201e-05, "loss": 0.073, "step": 59000 }, { "epoch": 1.511111111111111, "grad_norm": 0.3735711872577667, "learning_rate": 9.925925925925927e-06, "loss": 0.0739, "step": 59500 }, { "epoch": 1.5238095238095237, "grad_norm": 0.40507185459136963, "learning_rate": 9.841269841269842e-06, "loss": 0.0731, "step": 60000 }, { "epoch": 1.5365079365079364, "grad_norm": 0.4952349066734314, "learning_rate": 9.756613756613757e-06, "loss": 0.0741, "step": 60500 }, { "epoch": 1.5492063492063493, "grad_norm": 0.4670361280441284, "learning_rate": 9.671957671957672e-06, "loss": 0.0736, "step": 61000 }, { "epoch": 1.561904761904762, "grad_norm": 0.2984641492366791, "learning_rate": 9.587301587301588e-06, "loss": 0.0732, "step": 61500 }, { "epoch": 1.5746031746031746, "grad_norm": 0.5101374983787537, "learning_rate": 9.502645502645503e-06, "loss": 0.0759, "step": 62000 }, { "epoch": 1.5873015873015874, "grad_norm": 0.38656944036483765, "learning_rate": 9.417989417989418e-06, "loss": 0.0741, "step": 62500 }, { "epoch": 1.6, "grad_norm": 0.508953869342804, "learning_rate": 9.333333333333334e-06, "loss": 0.0737, "step": 63000 }, { "epoch": 1.6126984126984127, "grad_norm": 0.49415382742881775, "learning_rate": 9.248677248677249e-06, "loss": 0.0736, "step": 63500 }, { "epoch": 1.6253968253968254, "grad_norm": 0.48334264755249023, "learning_rate": 9.164021164021166e-06, "loss": 0.0739, "step": 64000 }, { "epoch": 1.638095238095238, "grad_norm": 0.3960755467414856, "learning_rate": 9.07936507936508e-06, "loss": 0.0723, "step": 64500 }, { "epoch": 1.6507936507936507, "grad_norm": 0.4537145495414734, "learning_rate": 8.994708994708995e-06, "loss": 0.0723, "step": 65000 }, { "epoch": 1.6634920634920634, "grad_norm": 0.4759564697742462, "learning_rate": 8.910052910052912e-06, "loss": 0.0737, "step": 65500 }, { "epoch": 1.6761904761904762, "grad_norm": 0.564620316028595, "learning_rate": 8.825396825396827e-06, "loss": 0.0726, "step": 66000 }, { "epoch": 1.6888888888888889, "grad_norm": 0.3793913424015045, "learning_rate": 8.740740740740741e-06, "loss": 0.0725, "step": 66500 }, { "epoch": 1.7015873015873015, "grad_norm": 0.3748345673084259, "learning_rate": 8.656084656084656e-06, "loss": 0.0734, "step": 67000 }, { "epoch": 1.7142857142857144, "grad_norm": 0.31550857424736023, "learning_rate": 8.571428571428571e-06, "loss": 0.0728, "step": 67500 }, { "epoch": 1.726984126984127, "grad_norm": 0.39485469460487366, "learning_rate": 8.486772486772487e-06, "loss": 0.074, "step": 68000 }, { "epoch": 1.7396825396825397, "grad_norm": 0.3833816647529602, "learning_rate": 8.402116402116402e-06, "loss": 0.0727, "step": 68500 }, { "epoch": 1.7523809523809524, "grad_norm": 0.45526403188705444, "learning_rate": 8.317460317460319e-06, "loss": 0.0721, "step": 69000 }, { "epoch": 1.765079365079365, "grad_norm": 0.4437309801578522, "learning_rate": 8.232804232804234e-06, "loss": 0.0714, "step": 69500 }, { "epoch": 1.7777777777777777, "grad_norm": 0.3827795386314392, "learning_rate": 8.148148148148148e-06, "loss": 0.0736, "step": 70000 }, { "epoch": 1.7904761904761903, "grad_norm": 0.3821280896663666, "learning_rate": 8.063492063492065e-06, "loss": 0.0742, "step": 70500 }, { "epoch": 1.8031746031746032, "grad_norm": 0.3558200001716614, "learning_rate": 7.97883597883598e-06, "loss": 0.0733, "step": 71000 }, { "epoch": 1.8158730158730159, "grad_norm": 0.35507771372795105, "learning_rate": 7.894179894179896e-06, "loss": 0.073, "step": 71500 }, { "epoch": 1.8285714285714287, "grad_norm": 0.4878668785095215, "learning_rate": 7.809523809523811e-06, "loss": 0.0726, "step": 72000 }, { "epoch": 1.8412698412698414, "grad_norm": 0.46924230456352234, "learning_rate": 7.724867724867726e-06, "loss": 0.0729, "step": 72500 }, { "epoch": 1.853968253968254, "grad_norm": 0.5545886158943176, "learning_rate": 7.64021164021164e-06, "loss": 0.0728, "step": 73000 }, { "epoch": 1.8666666666666667, "grad_norm": 0.33820512890815735, "learning_rate": 7.555555555555556e-06, "loss": 0.0727, "step": 73500 }, { "epoch": 1.8793650793650793, "grad_norm": 0.4180295169353485, "learning_rate": 7.470899470899472e-06, "loss": 0.0722, "step": 74000 }, { "epoch": 1.892063492063492, "grad_norm": 0.41895756125450134, "learning_rate": 7.386243386243387e-06, "loss": 0.0721, "step": 74500 }, { "epoch": 1.9047619047619047, "grad_norm": 0.37801581621170044, "learning_rate": 7.301587301587301e-06, "loss": 0.0754, "step": 75000 }, { "epoch": 1.9174603174603173, "grad_norm": 0.42890599370002747, "learning_rate": 7.216931216931218e-06, "loss": 0.0727, "step": 75500 }, { "epoch": 1.9301587301587302, "grad_norm": 0.36311328411102295, "learning_rate": 7.132275132275133e-06, "loss": 0.0732, "step": 76000 }, { "epoch": 1.9428571428571428, "grad_norm": 0.4069361090660095, "learning_rate": 7.047619047619048e-06, "loss": 0.0731, "step": 76500 }, { "epoch": 1.9555555555555557, "grad_norm": 0.38275209069252014, "learning_rate": 6.962962962962964e-06, "loss": 0.0729, "step": 77000 }, { "epoch": 1.9682539682539684, "grad_norm": 0.3496081829071045, "learning_rate": 6.878306878306879e-06, "loss": 0.0725, "step": 77500 }, { "epoch": 1.980952380952381, "grad_norm": 0.37429070472717285, "learning_rate": 6.7936507936507944e-06, "loss": 0.0726, "step": 78000 }, { "epoch": 1.9936507936507937, "grad_norm": 0.4195725619792938, "learning_rate": 6.708994708994709e-06, "loss": 0.0724, "step": 78500 }, { "epoch": 2.0, "eval_loss": 0.0749795213341713, "eval_runtime": 269.1515, "eval_samples_per_second": 520.153, "eval_steps_per_second": 65.019, "step": 78750 }, { "epoch": 2.0063492063492063, "grad_norm": 0.4257189631462097, "learning_rate": 6.624338624338626e-06, "loss": 0.07, "step": 79000 }, { "epoch": 2.019047619047619, "grad_norm": 0.37472862005233765, "learning_rate": 6.5396825396825405e-06, "loss": 0.0664, "step": 79500 }, { "epoch": 2.0317460317460316, "grad_norm": 0.4728703796863556, "learning_rate": 6.455026455026455e-06, "loss": 0.0664, "step": 80000 }, { "epoch": 2.0444444444444443, "grad_norm": 0.42774897813796997, "learning_rate": 6.370370370370371e-06, "loss": 0.0661, "step": 80500 }, { "epoch": 2.057142857142857, "grad_norm": 0.4025447368621826, "learning_rate": 6.285714285714286e-06, "loss": 0.0679, "step": 81000 }, { "epoch": 2.06984126984127, "grad_norm": 0.41302409768104553, "learning_rate": 6.201058201058202e-06, "loss": 0.0662, "step": 81500 }, { "epoch": 2.0825396825396827, "grad_norm": 0.4339478611946106, "learning_rate": 6.116402116402117e-06, "loss": 0.0662, "step": 82000 }, { "epoch": 2.0952380952380953, "grad_norm": 0.38711288571357727, "learning_rate": 6.031746031746032e-06, "loss": 0.0677, "step": 82500 }, { "epoch": 2.107936507936508, "grad_norm": 0.44815394282341003, "learning_rate": 5.9470899470899475e-06, "loss": 0.0674, "step": 83000 }, { "epoch": 2.1206349206349207, "grad_norm": 0.4252176582813263, "learning_rate": 5.862433862433863e-06, "loss": 0.067, "step": 83500 }, { "epoch": 2.1333333333333333, "grad_norm": 0.4019823670387268, "learning_rate": 5.777777777777778e-06, "loss": 0.0676, "step": 84000 }, { "epoch": 2.146031746031746, "grad_norm": 0.37775805592536926, "learning_rate": 5.693121693121694e-06, "loss": 0.0671, "step": 84500 }, { "epoch": 2.1587301587301586, "grad_norm": 0.5179104208946228, "learning_rate": 5.6084656084656084e-06, "loss": 0.0671, "step": 85000 }, { "epoch": 2.1714285714285713, "grad_norm": 0.37160980701446533, "learning_rate": 5.523809523809525e-06, "loss": 0.0677, "step": 85500 }, { "epoch": 2.1841269841269844, "grad_norm": 0.4610843360424042, "learning_rate": 5.43915343915344e-06, "loss": 0.0671, "step": 86000 }, { "epoch": 2.196825396825397, "grad_norm": 0.4135109484195709, "learning_rate": 5.3544973544973545e-06, "loss": 0.0678, "step": 86500 }, { "epoch": 2.2095238095238097, "grad_norm": 0.38079920411109924, "learning_rate": 5.26984126984127e-06, "loss": 0.0678, "step": 87000 }, { "epoch": 2.2222222222222223, "grad_norm": 0.39888954162597656, "learning_rate": 5.185185185185185e-06, "loss": 0.0669, "step": 87500 }, { "epoch": 2.234920634920635, "grad_norm": 0.37562116980552673, "learning_rate": 5.1005291005291015e-06, "loss": 0.0661, "step": 88000 }, { "epoch": 2.2476190476190476, "grad_norm": 0.4394863247871399, "learning_rate": 5.015873015873016e-06, "loss": 0.0671, "step": 88500 }, { "epoch": 2.2603174603174603, "grad_norm": 0.4748270511627197, "learning_rate": 4.931216931216932e-06, "loss": 0.067, "step": 89000 }, { "epoch": 2.273015873015873, "grad_norm": 0.4593636095523834, "learning_rate": 4.846560846560847e-06, "loss": 0.067, "step": 89500 }, { "epoch": 2.2857142857142856, "grad_norm": 0.3517415225505829, "learning_rate": 4.761904761904762e-06, "loss": 0.0669, "step": 90000 }, { "epoch": 2.2984126984126982, "grad_norm": 0.40983742475509644, "learning_rate": 4.677248677248677e-06, "loss": 0.0681, "step": 90500 }, { "epoch": 2.311111111111111, "grad_norm": 0.46570950746536255, "learning_rate": 4.592592592592593e-06, "loss": 0.0672, "step": 91000 }, { "epoch": 2.323809523809524, "grad_norm": 0.4733307957649231, "learning_rate": 4.5079365079365085e-06, "loss": 0.0671, "step": 91500 }, { "epoch": 2.3365079365079366, "grad_norm": 0.38432806730270386, "learning_rate": 4.423280423280424e-06, "loss": 0.0672, "step": 92000 }, { "epoch": 2.3492063492063493, "grad_norm": 0.31346267461776733, "learning_rate": 4.338624338624339e-06, "loss": 0.066, "step": 92500 }, { "epoch": 2.361904761904762, "grad_norm": 0.5612916350364685, "learning_rate": 4.2539682539682546e-06, "loss": 0.0666, "step": 93000 }, { "epoch": 2.3746031746031746, "grad_norm": 0.3445761501789093, "learning_rate": 4.169312169312169e-06, "loss": 0.0675, "step": 93500 }, { "epoch": 2.3873015873015873, "grad_norm": 0.41335174441337585, "learning_rate": 4.084656084656085e-06, "loss": 0.0676, "step": 94000 }, { "epoch": 2.4, "grad_norm": 0.42691895365715027, "learning_rate": 4.000000000000001e-06, "loss": 0.0669, "step": 94500 }, { "epoch": 2.4126984126984126, "grad_norm": 0.44459807872772217, "learning_rate": 3.9153439153439155e-06, "loss": 0.0661, "step": 95000 }, { "epoch": 2.425396825396825, "grad_norm": 0.39611610770225525, "learning_rate": 3.830687830687831e-06, "loss": 0.0665, "step": 95500 }, { "epoch": 2.4380952380952383, "grad_norm": 0.41603508591651917, "learning_rate": 3.7460317460317463e-06, "loss": 0.067, "step": 96000 }, { "epoch": 2.450793650793651, "grad_norm": 0.45685020089149475, "learning_rate": 3.661375661375662e-06, "loss": 0.0664, "step": 96500 }, { "epoch": 2.4634920634920636, "grad_norm": 0.41426390409469604, "learning_rate": 3.5767195767195772e-06, "loss": 0.0665, "step": 97000 }, { "epoch": 2.4761904761904763, "grad_norm": 0.4311801791191101, "learning_rate": 3.492063492063492e-06, "loss": 0.0673, "step": 97500 }, { "epoch": 2.488888888888889, "grad_norm": 0.39366066455841064, "learning_rate": 3.4074074074074077e-06, "loss": 0.0676, "step": 98000 }, { "epoch": 2.5015873015873016, "grad_norm": 0.46240171790122986, "learning_rate": 3.322751322751323e-06, "loss": 0.0674, "step": 98500 }, { "epoch": 2.5142857142857142, "grad_norm": 0.45865318179130554, "learning_rate": 3.2380952380952385e-06, "loss": 0.0671, "step": 99000 }, { "epoch": 2.526984126984127, "grad_norm": 0.38405075669288635, "learning_rate": 3.1534391534391538e-06, "loss": 0.0678, "step": 99500 }, { "epoch": 2.5396825396825395, "grad_norm": 0.48667874932289124, "learning_rate": 3.068783068783069e-06, "loss": 0.0661, "step": 100000 }, { "epoch": 2.552380952380952, "grad_norm": 0.3919212818145752, "learning_rate": 2.984126984126984e-06, "loss": 0.0654, "step": 100500 }, { "epoch": 2.565079365079365, "grad_norm": 0.4081352651119232, "learning_rate": 2.8994708994709e-06, "loss": 0.0669, "step": 101000 }, { "epoch": 2.5777777777777775, "grad_norm": 0.33449599146842957, "learning_rate": 2.814814814814815e-06, "loss": 0.0655, "step": 101500 }, { "epoch": 2.5904761904761906, "grad_norm": 0.37508487701416016, "learning_rate": 2.7301587301587303e-06, "loss": 0.0659, "step": 102000 }, { "epoch": 2.6031746031746033, "grad_norm": 0.43301156163215637, "learning_rate": 2.6455026455026455e-06, "loss": 0.0684, "step": 102500 }, { "epoch": 2.615873015873016, "grad_norm": 0.31652727723121643, "learning_rate": 2.560846560846561e-06, "loss": 0.0674, "step": 103000 }, { "epoch": 2.6285714285714286, "grad_norm": 0.38132810592651367, "learning_rate": 2.4761904761904764e-06, "loss": 0.0665, "step": 103500 }, { "epoch": 2.641269841269841, "grad_norm": 0.4249517023563385, "learning_rate": 2.3915343915343916e-06, "loss": 0.0678, "step": 104000 }, { "epoch": 2.653968253968254, "grad_norm": 0.42605915665626526, "learning_rate": 2.3068783068783073e-06, "loss": 0.0659, "step": 104500 }, { "epoch": 2.6666666666666665, "grad_norm": 0.4002751111984253, "learning_rate": 2.222222222222222e-06, "loss": 0.0665, "step": 105000 }, { "epoch": 2.6793650793650796, "grad_norm": 0.5232521891593933, "learning_rate": 2.1375661375661377e-06, "loss": 0.0676, "step": 105500 }, { "epoch": 2.6920634920634923, "grad_norm": 0.409422367811203, "learning_rate": 2.0529100529100534e-06, "loss": 0.0658, "step": 106000 }, { "epoch": 2.704761904761905, "grad_norm": 0.3971617519855499, "learning_rate": 1.968253968253968e-06, "loss": 0.0655, "step": 106500 }, { "epoch": 2.7174603174603176, "grad_norm": 0.35877570509910583, "learning_rate": 1.8835978835978838e-06, "loss": 0.0673, "step": 107000 }, { "epoch": 2.7301587301587302, "grad_norm": 0.36749425530433655, "learning_rate": 1.798941798941799e-06, "loss": 0.0681, "step": 107500 }, { "epoch": 2.742857142857143, "grad_norm": 0.3727457821369171, "learning_rate": 1.7142857142857145e-06, "loss": 0.0665, "step": 108000 }, { "epoch": 2.7555555555555555, "grad_norm": 0.40977808833122253, "learning_rate": 1.62962962962963e-06, "loss": 0.0672, "step": 108500 }, { "epoch": 2.768253968253968, "grad_norm": 0.4265407621860504, "learning_rate": 1.5449735449735451e-06, "loss": 0.0666, "step": 109000 }, { "epoch": 2.780952380952381, "grad_norm": 0.3894596993923187, "learning_rate": 1.4603174603174606e-06, "loss": 0.0673, "step": 109500 }, { "epoch": 2.7936507936507935, "grad_norm": 0.526606023311615, "learning_rate": 1.3756613756613758e-06, "loss": 0.0676, "step": 110000 }, { "epoch": 2.806349206349206, "grad_norm": 0.2910812497138977, "learning_rate": 1.2910052910052912e-06, "loss": 0.0671, "step": 110500 }, { "epoch": 2.819047619047619, "grad_norm": 0.3701234757900238, "learning_rate": 1.2063492063492065e-06, "loss": 0.0666, "step": 111000 }, { "epoch": 2.831746031746032, "grad_norm": 0.3969452679157257, "learning_rate": 1.1216931216931217e-06, "loss": 0.0668, "step": 111500 }, { "epoch": 2.8444444444444446, "grad_norm": 0.4415270686149597, "learning_rate": 1.0370370370370371e-06, "loss": 0.0661, "step": 112000 }, { "epoch": 2.857142857142857, "grad_norm": 0.3490103483200073, "learning_rate": 9.523809523809525e-07, "loss": 0.0658, "step": 112500 }, { "epoch": 2.86984126984127, "grad_norm": 0.35733526945114136, "learning_rate": 8.677248677248679e-07, "loss": 0.0661, "step": 113000 }, { "epoch": 2.8825396825396825, "grad_norm": 0.4992692172527313, "learning_rate": 7.830687830687832e-07, "loss": 0.068, "step": 113500 }, { "epoch": 2.895238095238095, "grad_norm": 0.4047030508518219, "learning_rate": 6.984126984126984e-07, "loss": 0.0683, "step": 114000 }, { "epoch": 2.907936507936508, "grad_norm": 0.4468993544578552, "learning_rate": 6.137566137566138e-07, "loss": 0.0664, "step": 114500 }, { "epoch": 2.9206349206349205, "grad_norm": 0.41356751322746277, "learning_rate": 5.291005291005291e-07, "loss": 0.067, "step": 115000 }, { "epoch": 2.9333333333333336, "grad_norm": 0.4459340572357178, "learning_rate": 4.444444444444445e-07, "loss": 0.0671, "step": 115500 }, { "epoch": 2.9460317460317462, "grad_norm": 0.42610964179039, "learning_rate": 3.597883597883598e-07, "loss": 0.0664, "step": 116000 }, { "epoch": 2.958730158730159, "grad_norm": 0.5059521794319153, "learning_rate": 2.7513227513227515e-07, "loss": 0.0658, "step": 116500 }, { "epoch": 2.9714285714285715, "grad_norm": 0.3404170572757721, "learning_rate": 1.904761904761905e-07, "loss": 0.0667, "step": 117000 }, { "epoch": 2.984126984126984, "grad_norm": 0.4388870894908905, "learning_rate": 1.0582010582010582e-07, "loss": 0.0658, "step": 117500 }, { "epoch": 2.996825396825397, "grad_norm": 0.39170539379119873, "learning_rate": 2.1164021164021167e-08, "loss": 0.0665, "step": 118000 } ], "logging_steps": 500, "max_steps": 118125, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4389780414464e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }