|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.995418771290967, |
|
"eval_steps": 500, |
|
"global_step": 25500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 3.915426781519186e-06, |
|
"loss": 0.2741, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.038767650723457336, |
|
"eval_runtime": 118.2386, |
|
"eval_samples_per_second": 55.329, |
|
"eval_steps_per_second": 6.918, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 7.830853563038372e-06, |
|
"loss": 0.0414, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 0.02589261531829834, |
|
"eval_runtime": 118.8851, |
|
"eval_samples_per_second": 55.028, |
|
"eval_steps_per_second": 6.881, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 9.908086719973622e-06, |
|
"loss": 0.0321, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 0.023959027603268623, |
|
"eval_runtime": 119.5631, |
|
"eval_samples_per_second": 54.716, |
|
"eval_steps_per_second": 6.842, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 9.702003132470531e-06, |
|
"loss": 0.0291, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 0.022091126069426537, |
|
"eval_runtime": 118.9906, |
|
"eval_samples_per_second": 54.979, |
|
"eval_steps_per_second": 6.874, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 9.49591954496744e-06, |
|
"loss": 0.0289, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 0.021428581327199936, |
|
"eval_runtime": 119.0226, |
|
"eval_samples_per_second": 54.964, |
|
"eval_steps_per_second": 6.873, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 9.289835957464349e-06, |
|
"loss": 0.0262, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.021141424775123596, |
|
"eval_runtime": 118.9536, |
|
"eval_samples_per_second": 54.996, |
|
"eval_steps_per_second": 6.877, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 9.083752369961257e-06, |
|
"loss": 0.0247, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 0.01918744668364525, |
|
"eval_runtime": 119.1755, |
|
"eval_samples_per_second": 54.894, |
|
"eval_steps_per_second": 6.864, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 8.877668782458165e-06, |
|
"loss": 0.0242, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 0.01971018686890602, |
|
"eval_runtime": 118.8936, |
|
"eval_samples_per_second": 55.024, |
|
"eval_steps_per_second": 6.88, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 8.671585194955074e-06, |
|
"loss": 0.0226, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 0.019149309024214745, |
|
"eval_runtime": 118.5478, |
|
"eval_samples_per_second": 55.184, |
|
"eval_steps_per_second": 6.9, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 8.465501607451984e-06, |
|
"loss": 0.0241, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 0.019563956186175346, |
|
"eval_runtime": 117.7652, |
|
"eval_samples_per_second": 55.551, |
|
"eval_steps_per_second": 6.946, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 8.259418019948892e-06, |
|
"loss": 0.0233, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.020009223371744156, |
|
"eval_runtime": 118.0111, |
|
"eval_samples_per_second": 55.435, |
|
"eval_steps_per_second": 6.932, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 8.053334432445801e-06, |
|
"loss": 0.0224, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.019076339900493622, |
|
"eval_runtime": 118.6811, |
|
"eval_samples_per_second": 55.123, |
|
"eval_steps_per_second": 6.892, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 7.84725084494271e-06, |
|
"loss": 0.0225, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.018951497972011566, |
|
"eval_runtime": 118.9565, |
|
"eval_samples_per_second": 54.995, |
|
"eval_steps_per_second": 6.876, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 7.641167257439618e-06, |
|
"loss": 0.022, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 0.018673894926905632, |
|
"eval_runtime": 118.6206, |
|
"eval_samples_per_second": 55.151, |
|
"eval_steps_per_second": 6.896, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 7.435083669936527e-06, |
|
"loss": 0.0208, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.018179820850491524, |
|
"eval_runtime": 118.8159, |
|
"eval_samples_per_second": 55.06, |
|
"eval_steps_per_second": 6.885, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 7.229000082433436e-06, |
|
"loss": 0.022, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 0.01761673204600811, |
|
"eval_runtime": 119.0018, |
|
"eval_samples_per_second": 54.974, |
|
"eval_steps_per_second": 6.874, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 7.022916494930344e-06, |
|
"loss": 0.0225, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.01772836409509182, |
|
"eval_runtime": 119.0025, |
|
"eval_samples_per_second": 54.974, |
|
"eval_steps_per_second": 6.874, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 6.816832907427253e-06, |
|
"loss": 0.0174, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 0.01802617870271206, |
|
"eval_runtime": 118.9086, |
|
"eval_samples_per_second": 55.017, |
|
"eval_steps_per_second": 6.879, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 6.610749319924161e-06, |
|
"loss": 0.0172, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.017970656976103783, |
|
"eval_runtime": 118.9682, |
|
"eval_samples_per_second": 54.99, |
|
"eval_steps_per_second": 6.876, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 6.404665732421071e-06, |
|
"loss": 0.0167, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"eval_loss": 0.018082452937960625, |
|
"eval_runtime": 118.2637, |
|
"eval_samples_per_second": 55.317, |
|
"eval_steps_per_second": 6.917, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 6.1985821449179794e-06, |
|
"loss": 0.0169, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"eval_loss": 0.0184369795024395, |
|
"eval_runtime": 118.1503, |
|
"eval_samples_per_second": 55.37, |
|
"eval_steps_per_second": 6.923, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 5.992498557414887e-06, |
|
"loss": 0.0172, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"eval_loss": 0.017873156815767288, |
|
"eval_runtime": 118.4849, |
|
"eval_samples_per_second": 55.214, |
|
"eval_steps_per_second": 6.904, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 5.786414969911797e-06, |
|
"loss": 0.0165, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_loss": 0.018475396558642387, |
|
"eval_runtime": 118.2528, |
|
"eval_samples_per_second": 55.322, |
|
"eval_steps_per_second": 6.917, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 5.5803313824087056e-06, |
|
"loss": 0.0182, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"eval_loss": 0.01798292063176632, |
|
"eval_runtime": 118.8845, |
|
"eval_samples_per_second": 55.028, |
|
"eval_steps_per_second": 6.881, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 5.374247794905613e-06, |
|
"loss": 0.0168, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"eval_loss": 0.01794307678937912, |
|
"eval_runtime": 118.5742, |
|
"eval_samples_per_second": 55.172, |
|
"eval_steps_per_second": 6.899, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 5.168164207402523e-06, |
|
"loss": 0.0174, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"eval_loss": 0.017786763608455658, |
|
"eval_runtime": 118.992, |
|
"eval_samples_per_second": 54.978, |
|
"eval_steps_per_second": 6.874, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 4.962080619899432e-06, |
|
"loss": 0.0161, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_loss": 0.017610570415854454, |
|
"eval_runtime": 118.7482, |
|
"eval_samples_per_second": 55.091, |
|
"eval_steps_per_second": 6.889, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 4.75599703239634e-06, |
|
"loss": 0.0171, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_loss": 0.018315177410840988, |
|
"eval_runtime": 118.5603, |
|
"eval_samples_per_second": 55.179, |
|
"eval_steps_per_second": 6.899, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 4.549913444893249e-06, |
|
"loss": 0.0159, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.01768197864294052, |
|
"eval_runtime": 118.811, |
|
"eval_samples_per_second": 55.062, |
|
"eval_steps_per_second": 6.885, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 4.343829857390158e-06, |
|
"loss": 0.0157, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.01793872006237507, |
|
"eval_runtime": 117.8803, |
|
"eval_samples_per_second": 55.497, |
|
"eval_steps_per_second": 6.939, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 4.1377462698870665e-06, |
|
"loss": 0.0163, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"eval_loss": 0.0176596250385046, |
|
"eval_runtime": 117.7426, |
|
"eval_samples_per_second": 55.562, |
|
"eval_steps_per_second": 6.947, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 3.931662682383975e-06, |
|
"loss": 0.017, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 0.017588863149285316, |
|
"eval_runtime": 118.319, |
|
"eval_samples_per_second": 55.291, |
|
"eval_steps_per_second": 6.914, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 3.725579094880884e-06, |
|
"loss": 0.0163, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_loss": 0.0175350159406662, |
|
"eval_runtime": 118.676, |
|
"eval_samples_per_second": 55.125, |
|
"eval_steps_per_second": 6.893, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 3.5194955073777926e-06, |
|
"loss": 0.0162, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.017314020544290543, |
|
"eval_runtime": 118.3195, |
|
"eval_samples_per_second": 55.291, |
|
"eval_steps_per_second": 6.913, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"learning_rate": 3.3134119198747017e-06, |
|
"loss": 0.0144, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"eval_loss": 0.017457639798521996, |
|
"eval_runtime": 118.5977, |
|
"eval_samples_per_second": 55.161, |
|
"eval_steps_per_second": 6.897, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 3.1073283323716105e-06, |
|
"loss": 0.0139, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_loss": 0.01773645542562008, |
|
"eval_runtime": 118.6104, |
|
"eval_samples_per_second": 55.155, |
|
"eval_steps_per_second": 6.897, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 2.9012447448685187e-06, |
|
"loss": 0.0145, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"eval_loss": 0.0180921982973814, |
|
"eval_runtime": 118.3455, |
|
"eval_samples_per_second": 55.279, |
|
"eval_steps_per_second": 6.912, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 2.6951611573654274e-06, |
|
"loss": 0.0138, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_loss": 0.01829521358013153, |
|
"eval_runtime": 118.9023, |
|
"eval_samples_per_second": 55.02, |
|
"eval_steps_per_second": 6.88, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 2.489077569862336e-06, |
|
"loss": 0.0131, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"eval_loss": 0.01797698438167572, |
|
"eval_runtime": 118.4871, |
|
"eval_samples_per_second": 55.213, |
|
"eval_steps_per_second": 6.904, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 2.2829939823592453e-06, |
|
"loss": 0.0135, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"eval_loss": 0.018181076273322105, |
|
"eval_runtime": 117.5911, |
|
"eval_samples_per_second": 55.633, |
|
"eval_steps_per_second": 6.956, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"learning_rate": 2.076910394856154e-06, |
|
"loss": 0.0134, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"eval_loss": 0.017410971224308014, |
|
"eval_runtime": 117.7062, |
|
"eval_samples_per_second": 55.579, |
|
"eval_steps_per_second": 6.95, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 1.8708268073530625e-06, |
|
"loss": 0.0139, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"eval_loss": 0.01747230626642704, |
|
"eval_runtime": 117.9312, |
|
"eval_samples_per_second": 55.473, |
|
"eval_steps_per_second": 6.936, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 1.6647432198499714e-06, |
|
"loss": 0.013, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"eval_loss": 0.017725400626659393, |
|
"eval_runtime": 118.3008, |
|
"eval_samples_per_second": 55.3, |
|
"eval_steps_per_second": 6.915, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 1.45865963234688e-06, |
|
"loss": 0.0138, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"eval_loss": 0.017776617780327797, |
|
"eval_runtime": 118.343, |
|
"eval_samples_per_second": 55.28, |
|
"eval_steps_per_second": 6.912, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 1.2525760448437888e-06, |
|
"loss": 0.014, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"eval_loss": 0.017718419432640076, |
|
"eval_runtime": 118.1253, |
|
"eval_samples_per_second": 55.382, |
|
"eval_steps_per_second": 6.925, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"learning_rate": 1.0464924573406975e-06, |
|
"loss": 0.0134, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"eval_loss": 0.01749224029481411, |
|
"eval_runtime": 118.0122, |
|
"eval_samples_per_second": 55.435, |
|
"eval_steps_per_second": 6.931, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 8.404088698376061e-07, |
|
"loss": 0.0137, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_loss": 0.01760455034673214, |
|
"eval_runtime": 118.2868, |
|
"eval_samples_per_second": 55.306, |
|
"eval_steps_per_second": 6.915, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"learning_rate": 6.343252823345148e-07, |
|
"loss": 0.0131, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"eval_loss": 0.0176746416836977, |
|
"eval_runtime": 118.1089, |
|
"eval_samples_per_second": 55.39, |
|
"eval_steps_per_second": 6.926, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"learning_rate": 4.2824169483142364e-07, |
|
"loss": 0.0137, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_loss": 0.01762452907860279, |
|
"eval_runtime": 118.4177, |
|
"eval_samples_per_second": 55.245, |
|
"eval_steps_per_second": 6.908, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 2.221581073283324e-07, |
|
"loss": 0.0135, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"eval_loss": 0.01761581189930439, |
|
"eval_runtime": 117.528, |
|
"eval_samples_per_second": 55.663, |
|
"eval_steps_per_second": 6.96, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 1.6074519825241118e-08, |
|
"loss": 0.014, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.017628999426960945, |
|
"eval_runtime": 118.0406, |
|
"eval_samples_per_second": 55.422, |
|
"eval_steps_per_second": 6.93, |
|
"step": 25500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 25539, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 2.428545951977472e+16, |
|
"train_batch_size": 18, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|