{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9980031062791213, "eval_steps": 500, "global_step": 3378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 3.863227128982544, "learning_rate": 2.0414201183431953e-06, "loss": 10.8959, "step": 25 }, { "epoch": 0.04, "grad_norm": 3.9202582836151123, "learning_rate": 4.171597633136095e-06, "loss": 10.7254, "step": 50 }, { "epoch": 0.07, "grad_norm": 3.297222375869751, "learning_rate": 6.390532544378699e-06, "loss": 10.3548, "step": 75 }, { "epoch": 0.09, "grad_norm": 2.734739065170288, "learning_rate": 8.609467455621303e-06, "loss": 9.7686, "step": 100 }, { "epoch": 0.11, "grad_norm": 2.3171167373657227, "learning_rate": 1.0828402366863907e-05, "loss": 8.8642, "step": 125 }, { "epoch": 0.13, "grad_norm": 1.8683223724365234, "learning_rate": 1.304733727810651e-05, "loss": 7.7751, "step": 150 }, { "epoch": 0.16, "grad_norm": 1.3291046619415283, "learning_rate": 1.5266272189349113e-05, "loss": 7.0121, "step": 175 }, { "epoch": 0.18, "grad_norm": 1.4056954383850098, "learning_rate": 1.7485207100591714e-05, "loss": 6.3242, "step": 200 }, { "epoch": 0.2, "grad_norm": 1.1085917949676514, "learning_rate": 1.9704142011834322e-05, "loss": 5.7192, "step": 225 }, { "epoch": 0.22, "grad_norm": 1.3172917366027832, "learning_rate": 2.1923076923076924e-05, "loss": 5.0999, "step": 250 }, { "epoch": 0.24, "grad_norm": 1.0901626348495483, "learning_rate": 2.414201183431953e-05, "loss": 4.6548, "step": 275 }, { "epoch": 0.27, "grad_norm": 1.2347195148468018, "learning_rate": 2.6360946745562133e-05, "loss": 4.3473, "step": 300 }, { "epoch": 0.29, "grad_norm": 1.0249868631362915, "learning_rate": 2.8579881656804735e-05, "loss": 4.0633, "step": 325 }, { "epoch": 0.31, "grad_norm": 1.2612847089767456, "learning_rate": 2.9911184210526318e-05, "loss": 3.8203, "step": 350 }, { "epoch": 0.33, "grad_norm": 1.0956445932388306, "learning_rate": 2.9664473684210528e-05, "loss": 3.5213, "step": 375 }, { "epoch": 0.36, "grad_norm": 1.187477946281433, "learning_rate": 2.9417763157894738e-05, "loss": 3.4107, "step": 400 }, { "epoch": 0.38, "grad_norm": 1.2866570949554443, "learning_rate": 2.9171052631578948e-05, "loss": 3.2937, "step": 425 }, { "epoch": 0.4, "grad_norm": 1.3654049634933472, "learning_rate": 2.8924342105263158e-05, "loss": 3.148, "step": 450 }, { "epoch": 0.42, "grad_norm": 3.0226898193359375, "learning_rate": 2.8677631578947368e-05, "loss": 2.992, "step": 475 }, { "epoch": 0.44, "grad_norm": 1.858105182647705, "learning_rate": 2.843092105263158e-05, "loss": 2.8713, "step": 500 }, { "epoch": 0.47, "grad_norm": 2.1696932315826416, "learning_rate": 2.818421052631579e-05, "loss": 2.6683, "step": 525 }, { "epoch": 0.49, "grad_norm": 4.214972972869873, "learning_rate": 2.79375e-05, "loss": 2.5497, "step": 550 }, { "epoch": 0.51, "grad_norm": 2.712171792984009, "learning_rate": 2.769078947368421e-05, "loss": 2.5092, "step": 575 }, { "epoch": 0.53, "grad_norm": 3.073754072189331, "learning_rate": 2.744407894736842e-05, "loss": 2.3336, "step": 600 }, { "epoch": 0.55, "grad_norm": 3.3905112743377686, "learning_rate": 2.719736842105263e-05, "loss": 2.2231, "step": 625 }, { "epoch": 0.58, "grad_norm": 2.6293680667877197, "learning_rate": 2.695065789473684e-05, "loss": 2.149, "step": 650 }, { "epoch": 0.6, "grad_norm": 2.6342837810516357, "learning_rate": 2.6703947368421052e-05, "loss": 2.1733, "step": 675 }, { "epoch": 0.62, "grad_norm": 5.107319355010986, "learning_rate": 2.6457236842105262e-05, "loss": 2.0175, "step": 700 }, { "epoch": 0.64, "grad_norm": 3.6173806190490723, "learning_rate": 2.6210526315789475e-05, "loss": 1.9354, "step": 725 }, { "epoch": 0.67, "grad_norm": 2.709559440612793, "learning_rate": 2.5963815789473685e-05, "loss": 1.8339, "step": 750 }, { "epoch": 0.69, "grad_norm": 2.523397922515869, "learning_rate": 2.5717105263157895e-05, "loss": 1.9872, "step": 775 }, { "epoch": 0.71, "grad_norm": 2.3423144817352295, "learning_rate": 2.5470394736842105e-05, "loss": 1.8357, "step": 800 }, { "epoch": 0.73, "grad_norm": 2.349581241607666, "learning_rate": 2.5223684210526315e-05, "loss": 1.7938, "step": 825 }, { "epoch": 0.75, "grad_norm": 3.185107707977295, "learning_rate": 2.4976973684210526e-05, "loss": 1.7019, "step": 850 }, { "epoch": 0.78, "grad_norm": 1.9526580572128296, "learning_rate": 2.4730263157894736e-05, "loss": 1.7636, "step": 875 }, { "epoch": 0.8, "grad_norm": 2.120420455932617, "learning_rate": 2.4483552631578946e-05, "loss": 1.7237, "step": 900 }, { "epoch": 0.82, "grad_norm": 2.198755979537964, "learning_rate": 2.4236842105263156e-05, "loss": 1.6563, "step": 925 }, { "epoch": 0.84, "grad_norm": 2.3641481399536133, "learning_rate": 2.3990131578947366e-05, "loss": 1.7045, "step": 950 }, { "epoch": 0.87, "grad_norm": 2.742950916290283, "learning_rate": 2.3743421052631583e-05, "loss": 1.6134, "step": 975 }, { "epoch": 0.89, "grad_norm": 2.467517614364624, "learning_rate": 2.3496710526315793e-05, "loss": 1.5613, "step": 1000 }, { "epoch": 0.91, "grad_norm": 1.6223593950271606, "learning_rate": 2.3250000000000003e-05, "loss": 1.6858, "step": 1025 }, { "epoch": 0.93, "grad_norm": 2.8574585914611816, "learning_rate": 2.3003289473684213e-05, "loss": 1.5701, "step": 1050 }, { "epoch": 0.95, "grad_norm": 2.3951303958892822, "learning_rate": 2.2756578947368423e-05, "loss": 1.4515, "step": 1075 }, { "epoch": 0.98, "grad_norm": 2.0053939819335938, "learning_rate": 2.2509868421052633e-05, "loss": 1.4744, "step": 1100 }, { "epoch": 1.0, "grad_norm": 1.9645187854766846, "learning_rate": 2.2263157894736843e-05, "loss": 1.4719, "step": 1125 }, { "epoch": 1.02, "grad_norm": 1.9311598539352417, "learning_rate": 2.2016447368421053e-05, "loss": 1.5421, "step": 1150 }, { "epoch": 1.04, "grad_norm": 2.094773054122925, "learning_rate": 2.1769736842105263e-05, "loss": 1.4853, "step": 1175 }, { "epoch": 1.07, "grad_norm": 1.9455292224884033, "learning_rate": 2.1523026315789476e-05, "loss": 1.4912, "step": 1200 }, { "epoch": 1.09, "grad_norm": 2.0593693256378174, "learning_rate": 2.1276315789473687e-05, "loss": 1.3928, "step": 1225 }, { "epoch": 1.11, "grad_norm": 1.7141344547271729, "learning_rate": 2.1029605263157897e-05, "loss": 1.3613, "step": 1250 }, { "epoch": 1.13, "grad_norm": 2.213785409927368, "learning_rate": 2.0782894736842107e-05, "loss": 1.3813, "step": 1275 }, { "epoch": 1.15, "grad_norm": 2.477902412414551, "learning_rate": 2.0536184210526317e-05, "loss": 1.4228, "step": 1300 }, { "epoch": 1.18, "grad_norm": 1.996187686920166, "learning_rate": 2.0289473684210527e-05, "loss": 1.3857, "step": 1325 }, { "epoch": 1.2, "grad_norm": 1.7344149351119995, "learning_rate": 2.0042763157894737e-05, "loss": 1.341, "step": 1350 }, { "epoch": 1.22, "grad_norm": 2.0151705741882324, "learning_rate": 1.9796052631578947e-05, "loss": 1.3863, "step": 1375 }, { "epoch": 1.24, "grad_norm": 1.9713648557662964, "learning_rate": 1.9549342105263157e-05, "loss": 1.397, "step": 1400 }, { "epoch": 1.26, "grad_norm": 1.835036039352417, "learning_rate": 1.9302631578947367e-05, "loss": 1.4165, "step": 1425 }, { "epoch": 1.29, "grad_norm": 2.019484043121338, "learning_rate": 1.905592105263158e-05, "loss": 1.3714, "step": 1450 }, { "epoch": 1.31, "grad_norm": 1.829540491104126, "learning_rate": 1.880921052631579e-05, "loss": 1.3149, "step": 1475 }, { "epoch": 1.33, "grad_norm": 1.961879014968872, "learning_rate": 1.85625e-05, "loss": 1.3801, "step": 1500 }, { "epoch": 1.35, "grad_norm": 2.2035787105560303, "learning_rate": 1.831578947368421e-05, "loss": 1.3486, "step": 1525 }, { "epoch": 1.38, "grad_norm": 1.8642491102218628, "learning_rate": 1.806907894736842e-05, "loss": 1.3207, "step": 1550 }, { "epoch": 1.4, "grad_norm": 1.992825984954834, "learning_rate": 1.782236842105263e-05, "loss": 1.3175, "step": 1575 }, { "epoch": 1.42, "grad_norm": 2.385167121887207, "learning_rate": 1.757565789473684e-05, "loss": 1.286, "step": 1600 }, { "epoch": 1.44, "grad_norm": 1.328168272972107, "learning_rate": 1.732894736842105e-05, "loss": 1.2824, "step": 1625 }, { "epoch": 1.46, "grad_norm": 1.647040605545044, "learning_rate": 1.708223684210526e-05, "loss": 1.3084, "step": 1650 }, { "epoch": 1.49, "grad_norm": 1.4075373411178589, "learning_rate": 1.6835526315789474e-05, "loss": 1.3287, "step": 1675 }, { "epoch": 1.51, "grad_norm": 1.74918794631958, "learning_rate": 1.6588815789473684e-05, "loss": 1.3314, "step": 1700 }, { "epoch": 1.53, "grad_norm": 1.8052846193313599, "learning_rate": 1.6342105263157894e-05, "loss": 1.2832, "step": 1725 }, { "epoch": 1.55, "grad_norm": 1.4929707050323486, "learning_rate": 1.6095394736842105e-05, "loss": 1.2565, "step": 1750 }, { "epoch": 1.58, "grad_norm": 1.5688847303390503, "learning_rate": 1.5848684210526318e-05, "loss": 1.2811, "step": 1775 }, { "epoch": 1.6, "grad_norm": 1.8629792928695679, "learning_rate": 1.5601973684210528e-05, "loss": 1.3485, "step": 1800 }, { "epoch": 1.62, "grad_norm": 1.946643352508545, "learning_rate": 1.5355263157894738e-05, "loss": 1.3489, "step": 1825 }, { "epoch": 1.64, "grad_norm": 1.6834330558776855, "learning_rate": 1.5108552631578946e-05, "loss": 1.2513, "step": 1850 }, { "epoch": 1.66, "grad_norm": 2.3857834339141846, "learning_rate": 1.4861842105263158e-05, "loss": 1.263, "step": 1875 }, { "epoch": 1.69, "grad_norm": 1.413714051246643, "learning_rate": 1.4615131578947368e-05, "loss": 1.2398, "step": 1900 }, { "epoch": 1.71, "grad_norm": 1.75337815284729, "learning_rate": 1.4368421052631578e-05, "loss": 1.3227, "step": 1925 }, { "epoch": 1.73, "grad_norm": 2.1332099437713623, "learning_rate": 1.4121710526315788e-05, "loss": 1.3663, "step": 1950 }, { "epoch": 1.75, "grad_norm": 2.0604429244995117, "learning_rate": 1.3875000000000002e-05, "loss": 1.1767, "step": 1975 }, { "epoch": 1.78, "grad_norm": 1.452750325202942, "learning_rate": 1.3628289473684212e-05, "loss": 1.2854, "step": 2000 }, { "epoch": 1.8, "grad_norm": 1.8815256357192993, "learning_rate": 1.3381578947368422e-05, "loss": 1.2671, "step": 2025 }, { "epoch": 1.82, "grad_norm": 1.517378568649292, "learning_rate": 1.3134868421052632e-05, "loss": 1.2302, "step": 2050 }, { "epoch": 1.84, "grad_norm": 1.5040243864059448, "learning_rate": 1.2888157894736842e-05, "loss": 1.2802, "step": 2075 }, { "epoch": 1.86, "grad_norm": 1.7564135789871216, "learning_rate": 1.2641447368421054e-05, "loss": 1.2337, "step": 2100 }, { "epoch": 1.89, "grad_norm": 1.5423864126205444, "learning_rate": 1.2394736842105264e-05, "loss": 1.2374, "step": 2125 }, { "epoch": 1.91, "grad_norm": 1.9401406049728394, "learning_rate": 1.2148026315789474e-05, "loss": 1.2256, "step": 2150 }, { "epoch": 1.93, "grad_norm": 2.3875536918640137, "learning_rate": 1.1901315789473684e-05, "loss": 1.2301, "step": 2175 }, { "epoch": 1.95, "grad_norm": 1.7349203824996948, "learning_rate": 1.1654605263157894e-05, "loss": 1.2379, "step": 2200 }, { "epoch": 1.97, "grad_norm": 2.1179356575012207, "learning_rate": 1.1407894736842106e-05, "loss": 1.2672, "step": 2225 }, { "epoch": 2.0, "grad_norm": 1.8508802652359009, "learning_rate": 1.1161184210526316e-05, "loss": 1.2543, "step": 2250 }, { "epoch": 2.02, "grad_norm": 1.7846442461013794, "learning_rate": 1.0914473684210526e-05, "loss": 1.1996, "step": 2275 }, { "epoch": 2.04, "grad_norm": 1.691226601600647, "learning_rate": 1.0667763157894736e-05, "loss": 1.2433, "step": 2300 }, { "epoch": 2.06, "grad_norm": 1.7513465881347656, "learning_rate": 1.0421052631578948e-05, "loss": 1.2417, "step": 2325 }, { "epoch": 2.09, "grad_norm": 2.2930405139923096, "learning_rate": 1.0174342105263158e-05, "loss": 1.2202, "step": 2350 }, { "epoch": 2.11, "grad_norm": 1.8147985935211182, "learning_rate": 9.92763157894737e-06, "loss": 1.2027, "step": 2375 }, { "epoch": 2.13, "grad_norm": 2.118342161178589, "learning_rate": 9.68092105263158e-06, "loss": 1.2547, "step": 2400 }, { "epoch": 2.15, "grad_norm": 1.2660759687423706, "learning_rate": 9.43421052631579e-06, "loss": 1.185, "step": 2425 }, { "epoch": 2.17, "grad_norm": 1.54926598072052, "learning_rate": 9.187500000000001e-06, "loss": 1.2254, "step": 2450 }, { "epoch": 2.2, "grad_norm": 1.9687325954437256, "learning_rate": 8.940789473684211e-06, "loss": 1.217, "step": 2475 }, { "epoch": 2.22, "grad_norm": 1.6193515062332153, "learning_rate": 8.694078947368422e-06, "loss": 1.2075, "step": 2500 }, { "epoch": 2.24, "grad_norm": 1.7565534114837646, "learning_rate": 8.447368421052632e-06, "loss": 1.2187, "step": 2525 }, { "epoch": 2.26, "grad_norm": 1.8760586977005005, "learning_rate": 8.200657894736842e-06, "loss": 1.1957, "step": 2550 }, { "epoch": 2.29, "grad_norm": 5.671737194061279, "learning_rate": 7.963815789473685e-06, "loss": 1.156, "step": 2575 }, { "epoch": 2.31, "grad_norm": 2.1643118858337402, "learning_rate": 7.717105263157895e-06, "loss": 1.2163, "step": 2600 }, { "epoch": 2.33, "grad_norm": 1.6212981939315796, "learning_rate": 7.470394736842106e-06, "loss": 1.1774, "step": 2625 }, { "epoch": 2.35, "grad_norm": 1.421762466430664, "learning_rate": 7.223684210526316e-06, "loss": 1.2097, "step": 2650 }, { "epoch": 2.37, "grad_norm": 1.7923425436019897, "learning_rate": 6.976973684210526e-06, "loss": 1.2083, "step": 2675 }, { "epoch": 2.4, "grad_norm": 1.4467811584472656, "learning_rate": 6.730263157894737e-06, "loss": 1.1972, "step": 2700 }, { "epoch": 2.42, "grad_norm": 1.48775315284729, "learning_rate": 6.483552631578947e-06, "loss": 1.1967, "step": 2725 }, { "epoch": 2.44, "grad_norm": 1.6732630729675293, "learning_rate": 6.236842105263159e-06, "loss": 1.1978, "step": 2750 }, { "epoch": 2.46, "grad_norm": 1.5414642095565796, "learning_rate": 5.990131578947369e-06, "loss": 1.2124, "step": 2775 }, { "epoch": 2.49, "grad_norm": 1.6581816673278809, "learning_rate": 5.74342105263158e-06, "loss": 1.2193, "step": 2800 }, { "epoch": 2.51, "grad_norm": 1.688584327697754, "learning_rate": 5.49671052631579e-06, "loss": 1.1672, "step": 2825 }, { "epoch": 2.53, "grad_norm": 2.130878210067749, "learning_rate": 5.25e-06, "loss": 1.1778, "step": 2850 }, { "epoch": 2.55, "grad_norm": 2.325465679168701, "learning_rate": 5.003289473684211e-06, "loss": 1.2189, "step": 2875 }, { "epoch": 2.57, "grad_norm": 1.1922625303268433, "learning_rate": 4.756578947368421e-06, "loss": 1.1062, "step": 2900 }, { "epoch": 2.6, "grad_norm": 1.36138117313385, "learning_rate": 4.5098684210526316e-06, "loss": 1.2074, "step": 2925 }, { "epoch": 2.62, "grad_norm": 1.736745834350586, "learning_rate": 4.2631578947368425e-06, "loss": 1.1532, "step": 2950 }, { "epoch": 2.64, "grad_norm": 1.8273382186889648, "learning_rate": 4.016447368421053e-06, "loss": 1.1977, "step": 2975 }, { "epoch": 2.66, "grad_norm": 1.504334568977356, "learning_rate": 3.7697368421052634e-06, "loss": 1.1679, "step": 3000 }, { "epoch": 2.68, "grad_norm": 1.4739781618118286, "learning_rate": 3.523026315789474e-06, "loss": 1.1503, "step": 3025 }, { "epoch": 2.71, "grad_norm": 1.9392614364624023, "learning_rate": 3.2763157894736844e-06, "loss": 1.126, "step": 3050 }, { "epoch": 2.73, "grad_norm": 1.8378140926361084, "learning_rate": 3.0296052631578945e-06, "loss": 1.242, "step": 3075 }, { "epoch": 2.75, "grad_norm": 1.7033443450927734, "learning_rate": 2.7828947368421054e-06, "loss": 1.1892, "step": 3100 }, { "epoch": 2.77, "grad_norm": 1.5492761135101318, "learning_rate": 2.536184210526316e-06, "loss": 1.1594, "step": 3125 }, { "epoch": 2.8, "grad_norm": 1.5671361684799194, "learning_rate": 2.2894736842105263e-06, "loss": 1.179, "step": 3150 }, { "epoch": 2.82, "grad_norm": 1.7326922416687012, "learning_rate": 2.042763157894737e-06, "loss": 1.2256, "step": 3175 }, { "epoch": 2.84, "grad_norm": 1.4645403623580933, "learning_rate": 1.7960526315789473e-06, "loss": 1.1041, "step": 3200 }, { "epoch": 2.86, "grad_norm": 1.4518282413482666, "learning_rate": 1.549342105263158e-06, "loss": 1.1979, "step": 3225 }, { "epoch": 2.88, "grad_norm": 1.488720417022705, "learning_rate": 1.3026315789473685e-06, "loss": 1.1615, "step": 3250 }, { "epoch": 2.91, "grad_norm": 1.3193929195404053, "learning_rate": 1.055921052631579e-06, "loss": 1.1275, "step": 3275 }, { "epoch": 2.93, "grad_norm": 1.5246660709381104, "learning_rate": 8.092105263157895e-07, "loss": 1.1317, "step": 3300 }, { "epoch": 2.95, "grad_norm": 1.8375893831253052, "learning_rate": 5.625e-07, "loss": 1.1291, "step": 3325 }, { "epoch": 2.97, "grad_norm": 2.09460186958313, "learning_rate": 3.1578947368421055e-07, "loss": 1.2138, "step": 3350 }, { "epoch": 3.0, "grad_norm": 1.9802443981170654, "learning_rate": 6.907894736842104e-08, "loss": 1.1591, "step": 3375 } ], "logging_steps": 25, "max_steps": 3378, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 5.978827851300864e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }