{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995418771290967, "eval_steps": 500, "global_step": 25500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 3.915426781519186e-06, "loss": 0.2741, "step": 500 }, { "epoch": 0.06, "eval_loss": 0.038767650723457336, "eval_runtime": 118.2386, "eval_samples_per_second": 55.329, "eval_steps_per_second": 6.918, "step": 500 }, { "epoch": 0.12, "learning_rate": 7.830853563038372e-06, "loss": 0.0414, "step": 1000 }, { "epoch": 0.12, "eval_loss": 0.02589261531829834, "eval_runtime": 118.8851, "eval_samples_per_second": 55.028, "eval_steps_per_second": 6.881, "step": 1000 }, { "epoch": 0.18, "learning_rate": 9.908086719973622e-06, "loss": 0.0321, "step": 1500 }, { "epoch": 0.18, "eval_loss": 0.023959027603268623, "eval_runtime": 119.5631, "eval_samples_per_second": 54.716, "eval_steps_per_second": 6.842, "step": 1500 }, { "epoch": 0.23, "learning_rate": 9.702003132470531e-06, "loss": 0.0291, "step": 2000 }, { "epoch": 0.23, "eval_loss": 0.022091126069426537, "eval_runtime": 118.9906, "eval_samples_per_second": 54.979, "eval_steps_per_second": 6.874, "step": 2000 }, { "epoch": 0.29, "learning_rate": 9.49591954496744e-06, "loss": 0.0289, "step": 2500 }, { "epoch": 0.29, "eval_loss": 0.021428581327199936, "eval_runtime": 119.0226, "eval_samples_per_second": 54.964, "eval_steps_per_second": 6.873, "step": 2500 }, { "epoch": 0.35, "learning_rate": 9.289835957464349e-06, "loss": 0.0262, "step": 3000 }, { "epoch": 0.35, "eval_loss": 0.021141424775123596, "eval_runtime": 118.9536, "eval_samples_per_second": 54.996, "eval_steps_per_second": 6.877, "step": 3000 }, { "epoch": 0.41, "learning_rate": 9.083752369961257e-06, "loss": 0.0247, "step": 3500 }, { "epoch": 0.41, "eval_loss": 0.01918744668364525, "eval_runtime": 119.1755, "eval_samples_per_second": 54.894, "eval_steps_per_second": 6.864, "step": 3500 }, { "epoch": 0.47, "learning_rate": 8.877668782458165e-06, "loss": 0.0242, "step": 4000 }, { "epoch": 0.47, "eval_loss": 0.01971018686890602, "eval_runtime": 118.8936, "eval_samples_per_second": 55.024, "eval_steps_per_second": 6.88, "step": 4000 }, { "epoch": 0.53, "learning_rate": 8.671585194955074e-06, "loss": 0.0226, "step": 4500 }, { "epoch": 0.53, "eval_loss": 0.019149309024214745, "eval_runtime": 118.5478, "eval_samples_per_second": 55.184, "eval_steps_per_second": 6.9, "step": 4500 }, { "epoch": 0.59, "learning_rate": 8.465501607451984e-06, "loss": 0.0241, "step": 5000 }, { "epoch": 0.59, "eval_loss": 0.019563956186175346, "eval_runtime": 117.7652, "eval_samples_per_second": 55.551, "eval_steps_per_second": 6.946, "step": 5000 }, { "epoch": 0.65, "learning_rate": 8.259418019948892e-06, "loss": 0.0233, "step": 5500 }, { "epoch": 0.65, "eval_loss": 0.020009223371744156, "eval_runtime": 118.0111, "eval_samples_per_second": 55.435, "eval_steps_per_second": 6.932, "step": 5500 }, { "epoch": 0.7, "learning_rate": 8.053334432445801e-06, "loss": 0.0224, "step": 6000 }, { "epoch": 0.7, "eval_loss": 0.019076339900493622, "eval_runtime": 118.6811, "eval_samples_per_second": 55.123, "eval_steps_per_second": 6.892, "step": 6000 }, { "epoch": 0.76, "learning_rate": 7.84725084494271e-06, "loss": 0.0225, "step": 6500 }, { "epoch": 0.76, "eval_loss": 0.018951497972011566, "eval_runtime": 118.9565, "eval_samples_per_second": 54.995, "eval_steps_per_second": 6.876, "step": 6500 }, { "epoch": 0.82, "learning_rate": 7.641167257439618e-06, "loss": 0.022, "step": 7000 }, { "epoch": 0.82, "eval_loss": 0.018673894926905632, "eval_runtime": 118.6206, "eval_samples_per_second": 55.151, "eval_steps_per_second": 6.896, "step": 7000 }, { "epoch": 0.88, "learning_rate": 7.435083669936527e-06, "loss": 0.0208, "step": 7500 }, { "epoch": 0.88, "eval_loss": 0.018179820850491524, "eval_runtime": 118.8159, "eval_samples_per_second": 55.06, "eval_steps_per_second": 6.885, "step": 7500 }, { "epoch": 0.94, "learning_rate": 7.229000082433436e-06, "loss": 0.022, "step": 8000 }, { "epoch": 0.94, "eval_loss": 0.01761673204600811, "eval_runtime": 119.0018, "eval_samples_per_second": 54.974, "eval_steps_per_second": 6.874, "step": 8000 }, { "epoch": 1.0, "learning_rate": 7.022916494930344e-06, "loss": 0.0225, "step": 8500 }, { "epoch": 1.0, "eval_loss": 0.01772836409509182, "eval_runtime": 119.0025, "eval_samples_per_second": 54.974, "eval_steps_per_second": 6.874, "step": 8500 }, { "epoch": 1.06, "learning_rate": 6.816832907427253e-06, "loss": 0.0174, "step": 9000 }, { "epoch": 1.06, "eval_loss": 0.01802617870271206, "eval_runtime": 118.9086, "eval_samples_per_second": 55.017, "eval_steps_per_second": 6.879, "step": 9000 }, { "epoch": 1.12, "learning_rate": 6.610749319924161e-06, "loss": 0.0172, "step": 9500 }, { "epoch": 1.12, "eval_loss": 0.017970656976103783, "eval_runtime": 118.9682, "eval_samples_per_second": 54.99, "eval_steps_per_second": 6.876, "step": 9500 }, { "epoch": 1.17, "learning_rate": 6.404665732421071e-06, "loss": 0.0167, "step": 10000 }, { "epoch": 1.17, "eval_loss": 0.018082452937960625, "eval_runtime": 118.2637, "eval_samples_per_second": 55.317, "eval_steps_per_second": 6.917, "step": 10000 }, { "epoch": 1.23, "learning_rate": 6.1985821449179794e-06, "loss": 0.0169, "step": 10500 }, { "epoch": 1.23, "eval_loss": 0.0184369795024395, "eval_runtime": 118.1503, "eval_samples_per_second": 55.37, "eval_steps_per_second": 6.923, "step": 10500 }, { "epoch": 1.29, "learning_rate": 5.992498557414887e-06, "loss": 0.0172, "step": 11000 }, { "epoch": 1.29, "eval_loss": 0.017873156815767288, "eval_runtime": 118.4849, "eval_samples_per_second": 55.214, "eval_steps_per_second": 6.904, "step": 11000 }, { "epoch": 1.35, "learning_rate": 5.786414969911797e-06, "loss": 0.0165, "step": 11500 }, { "epoch": 1.35, "eval_loss": 0.018475396558642387, "eval_runtime": 118.2528, "eval_samples_per_second": 55.322, "eval_steps_per_second": 6.917, "step": 11500 }, { "epoch": 1.41, "learning_rate": 5.5803313824087056e-06, "loss": 0.0182, "step": 12000 }, { "epoch": 1.41, "eval_loss": 0.01798292063176632, "eval_runtime": 118.8845, "eval_samples_per_second": 55.028, "eval_steps_per_second": 6.881, "step": 12000 }, { "epoch": 1.47, "learning_rate": 5.374247794905613e-06, "loss": 0.0168, "step": 12500 }, { "epoch": 1.47, "eval_loss": 0.01794307678937912, "eval_runtime": 118.5742, "eval_samples_per_second": 55.172, "eval_steps_per_second": 6.899, "step": 12500 }, { "epoch": 1.53, "learning_rate": 5.168164207402523e-06, "loss": 0.0174, "step": 13000 }, { "epoch": 1.53, "eval_loss": 0.017786763608455658, "eval_runtime": 118.992, "eval_samples_per_second": 54.978, "eval_steps_per_second": 6.874, "step": 13000 }, { "epoch": 1.59, "learning_rate": 4.962080619899432e-06, "loss": 0.0161, "step": 13500 }, { "epoch": 1.59, "eval_loss": 0.017610570415854454, "eval_runtime": 118.7482, "eval_samples_per_second": 55.091, "eval_steps_per_second": 6.889, "step": 13500 }, { "epoch": 1.64, "learning_rate": 4.75599703239634e-06, "loss": 0.0171, "step": 14000 }, { "epoch": 1.64, "eval_loss": 0.018315177410840988, "eval_runtime": 118.5603, "eval_samples_per_second": 55.179, "eval_steps_per_second": 6.899, "step": 14000 }, { "epoch": 1.7, "learning_rate": 4.549913444893249e-06, "loss": 0.0159, "step": 14500 }, { "epoch": 1.7, "eval_loss": 0.01768197864294052, "eval_runtime": 118.811, "eval_samples_per_second": 55.062, "eval_steps_per_second": 6.885, "step": 14500 }, { "epoch": 1.76, "learning_rate": 4.343829857390158e-06, "loss": 0.0157, "step": 15000 }, { "epoch": 1.76, "eval_loss": 0.01793872006237507, "eval_runtime": 117.8803, "eval_samples_per_second": 55.497, "eval_steps_per_second": 6.939, "step": 15000 }, { "epoch": 1.82, "learning_rate": 4.1377462698870665e-06, "loss": 0.0163, "step": 15500 }, { "epoch": 1.82, "eval_loss": 0.0176596250385046, "eval_runtime": 117.7426, "eval_samples_per_second": 55.562, "eval_steps_per_second": 6.947, "step": 15500 }, { "epoch": 1.88, "learning_rate": 3.931662682383975e-06, "loss": 0.017, "step": 16000 }, { "epoch": 1.88, "eval_loss": 0.017588863149285316, "eval_runtime": 118.319, "eval_samples_per_second": 55.291, "eval_steps_per_second": 6.914, "step": 16000 }, { "epoch": 1.94, "learning_rate": 3.725579094880884e-06, "loss": 0.0163, "step": 16500 }, { "epoch": 1.94, "eval_loss": 0.0175350159406662, "eval_runtime": 118.676, "eval_samples_per_second": 55.125, "eval_steps_per_second": 6.893, "step": 16500 }, { "epoch": 2.0, "learning_rate": 3.5194955073777926e-06, "loss": 0.0162, "step": 17000 }, { "epoch": 2.0, "eval_loss": 0.017314020544290543, "eval_runtime": 118.3195, "eval_samples_per_second": 55.291, "eval_steps_per_second": 6.913, "step": 17000 }, { "epoch": 2.06, "learning_rate": 3.3134119198747017e-06, "loss": 0.0144, "step": 17500 }, { "epoch": 2.06, "eval_loss": 0.017457639798521996, "eval_runtime": 118.5977, "eval_samples_per_second": 55.161, "eval_steps_per_second": 6.897, "step": 17500 }, { "epoch": 2.11, "learning_rate": 3.1073283323716105e-06, "loss": 0.0139, "step": 18000 }, { "epoch": 2.11, "eval_loss": 0.01773645542562008, "eval_runtime": 118.6104, "eval_samples_per_second": 55.155, "eval_steps_per_second": 6.897, "step": 18000 }, { "epoch": 2.17, "learning_rate": 2.9012447448685187e-06, "loss": 0.0145, "step": 18500 }, { "epoch": 2.17, "eval_loss": 0.0180921982973814, "eval_runtime": 118.3455, "eval_samples_per_second": 55.279, "eval_steps_per_second": 6.912, "step": 18500 }, { "epoch": 2.23, "learning_rate": 2.6951611573654274e-06, "loss": 0.0138, "step": 19000 }, { "epoch": 2.23, "eval_loss": 0.01829521358013153, "eval_runtime": 118.9023, "eval_samples_per_second": 55.02, "eval_steps_per_second": 6.88, "step": 19000 }, { "epoch": 2.29, "learning_rate": 2.489077569862336e-06, "loss": 0.0131, "step": 19500 }, { "epoch": 2.29, "eval_loss": 0.01797698438167572, "eval_runtime": 118.4871, "eval_samples_per_second": 55.213, "eval_steps_per_second": 6.904, "step": 19500 }, { "epoch": 2.35, "learning_rate": 2.2829939823592453e-06, "loss": 0.0135, "step": 20000 }, { "epoch": 2.35, "eval_loss": 0.018181076273322105, "eval_runtime": 117.5911, "eval_samples_per_second": 55.633, "eval_steps_per_second": 6.956, "step": 20000 }, { "epoch": 2.41, "learning_rate": 2.076910394856154e-06, "loss": 0.0134, "step": 20500 }, { "epoch": 2.41, "eval_loss": 0.017410971224308014, "eval_runtime": 117.7062, "eval_samples_per_second": 55.579, "eval_steps_per_second": 6.95, "step": 20500 }, { "epoch": 2.47, "learning_rate": 1.8708268073530625e-06, "loss": 0.0139, "step": 21000 }, { "epoch": 2.47, "eval_loss": 0.01747230626642704, "eval_runtime": 117.9312, "eval_samples_per_second": 55.473, "eval_steps_per_second": 6.936, "step": 21000 }, { "epoch": 2.53, "learning_rate": 1.6647432198499714e-06, "loss": 0.013, "step": 21500 }, { "epoch": 2.53, "eval_loss": 0.017725400626659393, "eval_runtime": 118.3008, "eval_samples_per_second": 55.3, "eval_steps_per_second": 6.915, "step": 21500 }, { "epoch": 2.58, "learning_rate": 1.45865963234688e-06, "loss": 0.0138, "step": 22000 }, { "epoch": 2.58, "eval_loss": 0.017776617780327797, "eval_runtime": 118.343, "eval_samples_per_second": 55.28, "eval_steps_per_second": 6.912, "step": 22000 }, { "epoch": 2.64, "learning_rate": 1.2525760448437888e-06, "loss": 0.014, "step": 22500 }, { "epoch": 2.64, "eval_loss": 0.017718419432640076, "eval_runtime": 118.1253, "eval_samples_per_second": 55.382, "eval_steps_per_second": 6.925, "step": 22500 }, { "epoch": 2.7, "learning_rate": 1.0464924573406975e-06, "loss": 0.0134, "step": 23000 }, { "epoch": 2.7, "eval_loss": 0.01749224029481411, "eval_runtime": 118.0122, "eval_samples_per_second": 55.435, "eval_steps_per_second": 6.931, "step": 23000 }, { "epoch": 2.76, "learning_rate": 8.404088698376061e-07, "loss": 0.0137, "step": 23500 }, { "epoch": 2.76, "eval_loss": 0.01760455034673214, "eval_runtime": 118.2868, "eval_samples_per_second": 55.306, "eval_steps_per_second": 6.915, "step": 23500 }, { "epoch": 2.82, "learning_rate": 6.343252823345148e-07, "loss": 0.0131, "step": 24000 }, { "epoch": 2.82, "eval_loss": 0.0176746416836977, "eval_runtime": 118.1089, "eval_samples_per_second": 55.39, "eval_steps_per_second": 6.926, "step": 24000 }, { "epoch": 2.88, "learning_rate": 4.2824169483142364e-07, "loss": 0.0137, "step": 24500 }, { "epoch": 2.88, "eval_loss": 0.01762452907860279, "eval_runtime": 118.4177, "eval_samples_per_second": 55.245, "eval_steps_per_second": 6.908, "step": 24500 }, { "epoch": 2.94, "learning_rate": 2.221581073283324e-07, "loss": 0.0135, "step": 25000 }, { "epoch": 2.94, "eval_loss": 0.01761581189930439, "eval_runtime": 117.528, "eval_samples_per_second": 55.663, "eval_steps_per_second": 6.96, "step": 25000 }, { "epoch": 3.0, "learning_rate": 1.6074519825241118e-08, "loss": 0.014, "step": 25500 }, { "epoch": 3.0, "eval_loss": 0.017628999426960945, "eval_runtime": 118.0406, "eval_samples_per_second": 55.422, "eval_steps_per_second": 6.93, "step": 25500 } ], "logging_steps": 500, "max_steps": 25539, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.428545951977472e+16, "train_batch_size": 18, "trial_name": null, "trial_params": null }