{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9697377269670477, "eval_steps": 50, "global_step": 276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 4.2857142857142855e-06, "loss": 0.01, "step": 3 }, { "epoch": 0.06, "learning_rate": 8.571428571428571e-06, "loss": 0.0093, "step": 6 }, { "epoch": 0.1, "learning_rate": 1.2857142857142859e-05, "loss": 0.0089, "step": 9 }, { "epoch": 0.13, "learning_rate": 1.7142857142857142e-05, "loss": 0.0091, "step": 12 }, { "epoch": 0.16, "learning_rate": 1.9923664122137406e-05, "loss": 0.0095, "step": 15 }, { "epoch": 0.19, "learning_rate": 1.969465648854962e-05, "loss": 0.0081, "step": 18 }, { "epoch": 0.23, "learning_rate": 1.9465648854961833e-05, "loss": 0.0073, "step": 21 }, { "epoch": 0.26, "learning_rate": 1.923664122137405e-05, "loss": 0.0082, "step": 24 }, { "epoch": 0.29, "learning_rate": 1.900763358778626e-05, "loss": 0.0063, "step": 27 }, { "epoch": 0.32, "learning_rate": 1.8778625954198473e-05, "loss": 0.0077, "step": 30 }, { "epoch": 0.36, "learning_rate": 1.854961832061069e-05, "loss": 0.006, "step": 33 }, { "epoch": 0.39, "learning_rate": 1.83206106870229e-05, "loss": 0.0064, "step": 36 }, { "epoch": 0.42, "learning_rate": 1.8091603053435117e-05, "loss": 0.0056, "step": 39 }, { "epoch": 0.45, "learning_rate": 1.786259541984733e-05, "loss": 0.0055, "step": 42 }, { "epoch": 0.48, "learning_rate": 1.7633587786259544e-05, "loss": 0.0052, "step": 45 }, { "epoch": 0.52, "learning_rate": 1.7404580152671757e-05, "loss": 0.0064, "step": 48 }, { "epoch": 0.54, "eval_loss": 0.000551095581613481, "eval_mse": 0.0005510955593948523, "eval_runtime": 8.5427, "eval_samples_per_second": 36.64, "eval_steps_per_second": 9.248, "step": 50 }, { "epoch": 0.55, "learning_rate": 1.717557251908397e-05, "loss": 0.0061, "step": 51 }, { "epoch": 0.58, "learning_rate": 1.6946564885496184e-05, "loss": 0.0052, "step": 54 }, { "epoch": 0.61, "learning_rate": 1.6717557251908398e-05, "loss": 0.0053, "step": 57 }, { "epoch": 0.65, "learning_rate": 1.648854961832061e-05, "loss": 0.0043, "step": 60 }, { "epoch": 0.68, "learning_rate": 1.6259541984732825e-05, "loss": 0.0035, "step": 63 }, { "epoch": 0.71, "learning_rate": 1.6030534351145038e-05, "loss": 0.0041, "step": 66 }, { "epoch": 0.74, "learning_rate": 1.5801526717557255e-05, "loss": 0.0046, "step": 69 }, { "epoch": 0.77, "learning_rate": 1.5572519083969465e-05, "loss": 0.0036, "step": 72 }, { "epoch": 0.81, "learning_rate": 1.5343511450381682e-05, "loss": 0.0042, "step": 75 }, { "epoch": 0.84, "learning_rate": 1.5114503816793895e-05, "loss": 0.0037, "step": 78 }, { "epoch": 0.87, "learning_rate": 1.4885496183206107e-05, "loss": 0.0039, "step": 81 }, { "epoch": 0.9, "learning_rate": 1.4656488549618322e-05, "loss": 0.0027, "step": 84 }, { "epoch": 0.94, "learning_rate": 1.4427480916030536e-05, "loss": 0.003, "step": 87 }, { "epoch": 0.97, "learning_rate": 1.4198473282442749e-05, "loss": 0.0031, "step": 90 }, { "epoch": 1.0, "learning_rate": 1.3969465648854963e-05, "loss": 0.0032, "step": 93 }, { "epoch": 1.03, "learning_rate": 1.3740458015267178e-05, "loss": 0.0031, "step": 96 }, { "epoch": 1.07, "learning_rate": 1.351145038167939e-05, "loss": 0.0043, "step": 99 }, { "epoch": 1.08, "eval_loss": 0.0004679011180996895, "eval_mse": 0.00046790109512442456, "eval_runtime": 8.5901, "eval_samples_per_second": 36.437, "eval_steps_per_second": 9.197, "step": 100 }, { "epoch": 1.1, "learning_rate": 1.3282442748091605e-05, "loss": 0.0032, "step": 102 }, { "epoch": 1.13, "learning_rate": 1.3053435114503818e-05, "loss": 0.0037, "step": 105 }, { "epoch": 1.16, "learning_rate": 1.2824427480916032e-05, "loss": 0.0026, "step": 108 }, { "epoch": 1.19, "learning_rate": 1.2595419847328245e-05, "loss": 0.003, "step": 111 }, { "epoch": 1.23, "learning_rate": 1.236641221374046e-05, "loss": 0.0027, "step": 114 }, { "epoch": 1.26, "learning_rate": 1.2137404580152672e-05, "loss": 0.0034, "step": 117 }, { "epoch": 1.29, "learning_rate": 1.1908396946564887e-05, "loss": 0.0031, "step": 120 }, { "epoch": 1.32, "learning_rate": 1.16793893129771e-05, "loss": 0.0038, "step": 123 }, { "epoch": 1.36, "learning_rate": 1.1450381679389312e-05, "loss": 0.003, "step": 126 }, { "epoch": 1.39, "learning_rate": 1.1221374045801527e-05, "loss": 0.003, "step": 129 }, { "epoch": 1.42, "learning_rate": 1.0992366412213743e-05, "loss": 0.0032, "step": 132 }, { "epoch": 1.45, "learning_rate": 1.0763358778625954e-05, "loss": 0.0028, "step": 135 }, { "epoch": 1.48, "learning_rate": 1.0534351145038168e-05, "loss": 0.003, "step": 138 }, { "epoch": 1.52, "learning_rate": 1.0305343511450383e-05, "loss": 0.0025, "step": 141 }, { "epoch": 1.55, "learning_rate": 1.0076335877862595e-05, "loss": 0.0033, "step": 144 }, { "epoch": 1.58, "learning_rate": 9.84732824427481e-06, "loss": 0.0031, "step": 147 }, { "epoch": 1.61, "learning_rate": 9.618320610687025e-06, "loss": 0.0028, "step": 150 }, { "epoch": 1.61, "eval_loss": 0.0006046506459824741, "eval_mse": 0.0006046506115009975, "eval_runtime": 8.4067, "eval_samples_per_second": 37.232, "eval_steps_per_second": 9.397, "step": 150 }, { "epoch": 1.65, "learning_rate": 9.389312977099237e-06, "loss": 0.0029, "step": 153 }, { "epoch": 1.68, "learning_rate": 9.16030534351145e-06, "loss": 0.0029, "step": 156 }, { "epoch": 1.71, "learning_rate": 8.931297709923665e-06, "loss": 0.0021, "step": 159 }, { "epoch": 1.74, "learning_rate": 8.702290076335879e-06, "loss": 0.0029, "step": 162 }, { "epoch": 1.78, "learning_rate": 8.473282442748092e-06, "loss": 0.0026, "step": 165 }, { "epoch": 1.81, "learning_rate": 8.244274809160306e-06, "loss": 0.0028, "step": 168 }, { "epoch": 1.84, "learning_rate": 8.015267175572519e-06, "loss": 0.0031, "step": 171 }, { "epoch": 1.87, "learning_rate": 7.786259541984733e-06, "loss": 0.0024, "step": 174 }, { "epoch": 1.9, "learning_rate": 7.557251908396948e-06, "loss": 0.0021, "step": 177 }, { "epoch": 1.94, "learning_rate": 7.328244274809161e-06, "loss": 0.0024, "step": 180 }, { "epoch": 1.97, "learning_rate": 7.0992366412213746e-06, "loss": 0.0024, "step": 183 }, { "epoch": 2.0, "learning_rate": 6.870229007633589e-06, "loss": 0.0025, "step": 186 }, { "epoch": 2.03, "learning_rate": 6.641221374045802e-06, "loss": 0.0024, "step": 189 }, { "epoch": 2.07, "learning_rate": 6.412213740458016e-06, "loss": 0.0027, "step": 192 }, { "epoch": 2.1, "learning_rate": 6.18320610687023e-06, "loss": 0.0029, "step": 195 }, { "epoch": 2.13, "learning_rate": 5.9541984732824435e-06, "loss": 0.0025, "step": 198 }, { "epoch": 2.15, "eval_loss": 0.0004669851914513856, "eval_mse": 0.0004669852106191058, "eval_runtime": 8.5435, "eval_samples_per_second": 36.636, "eval_steps_per_second": 9.247, "step": 200 }, { "epoch": 2.16, "learning_rate": 5.725190839694656e-06, "loss": 0.002, "step": 201 }, { "epoch": 2.2, "learning_rate": 5.496183206106871e-06, "loss": 0.0026, "step": 204 }, { "epoch": 2.23, "learning_rate": 5.267175572519084e-06, "loss": 0.0024, "step": 207 }, { "epoch": 2.26, "learning_rate": 5.038167938931297e-06, "loss": 0.0025, "step": 210 }, { "epoch": 2.29, "learning_rate": 4.8091603053435125e-06, "loss": 0.0022, "step": 213 }, { "epoch": 2.32, "learning_rate": 4.580152671755725e-06, "loss": 0.002, "step": 216 }, { "epoch": 2.36, "learning_rate": 4.351145038167939e-06, "loss": 0.0024, "step": 219 }, { "epoch": 2.39, "learning_rate": 4.122137404580153e-06, "loss": 0.0021, "step": 222 }, { "epoch": 2.42, "learning_rate": 3.893129770992366e-06, "loss": 0.0019, "step": 225 }, { "epoch": 2.45, "learning_rate": 3.6641221374045806e-06, "loss": 0.0021, "step": 228 }, { "epoch": 2.49, "learning_rate": 3.4351145038167944e-06, "loss": 0.0027, "step": 231 }, { "epoch": 2.52, "learning_rate": 3.206106870229008e-06, "loss": 0.0024, "step": 234 }, { "epoch": 2.55, "learning_rate": 2.9770992366412218e-06, "loss": 0.0024, "step": 237 }, { "epoch": 2.58, "learning_rate": 2.7480916030534356e-06, "loss": 0.0021, "step": 240 }, { "epoch": 2.61, "learning_rate": 2.5190839694656487e-06, "loss": 0.0019, "step": 243 }, { "epoch": 2.65, "learning_rate": 2.2900763358778625e-06, "loss": 0.0019, "step": 246 }, { "epoch": 2.68, "learning_rate": 2.0610687022900764e-06, "loss": 0.0025, "step": 249 }, { "epoch": 2.69, "eval_loss": 0.00047420692862942815, "eval_mse": 0.00047420695769180793, "eval_runtime": 8.5496, "eval_samples_per_second": 36.61, "eval_steps_per_second": 9.24, "step": 250 }, { "epoch": 2.71, "learning_rate": 1.8320610687022903e-06, "loss": 0.0021, "step": 252 }, { "epoch": 2.74, "learning_rate": 1.603053435114504e-06, "loss": 0.0024, "step": 255 }, { "epoch": 2.78, "learning_rate": 1.3740458015267178e-06, "loss": 0.0024, "step": 258 }, { "epoch": 2.81, "learning_rate": 1.1450381679389313e-06, "loss": 0.0021, "step": 261 }, { "epoch": 2.84, "learning_rate": 9.160305343511451e-07, "loss": 0.0026, "step": 264 }, { "epoch": 2.87, "learning_rate": 6.870229007633589e-07, "loss": 0.0023, "step": 267 }, { "epoch": 2.91, "learning_rate": 4.5801526717557257e-07, "loss": 0.0021, "step": 270 }, { "epoch": 2.94, "learning_rate": 2.2900763358778629e-07, "loss": 0.0026, "step": 273 }, { "epoch": 2.97, "learning_rate": 0.0, "loss": 0.0024, "step": 276 }, { "epoch": 2.97, "step": 276, "total_flos": 4678301863342080.0, "train_loss": 0.0036955964937131257, "train_runtime": 2019.4918, "train_samples_per_second": 8.831, "train_steps_per_second": 0.137 } ], "logging_steps": 3, "max_steps": 276, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 4678301863342080.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }