{ "best_metric": 0.8397957682609558, "best_model_checkpoint": "./output/training_results/C021_random_sample_llama3-8b-base_instruct_20240505_135320/checkpoint-20", "epoch": 4.0, "eval_steps": 20, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020833333333333332, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.9347, "step": 1 }, { "epoch": 0.10416666666666667, "grad_norm": 16.63088915574003, "learning_rate": 1.5e-06, "loss": 0.9437, "step": 5 }, { "epoch": 0.20833333333333334, "grad_norm": 6.257240515905425, "learning_rate": 4.5e-06, "loss": 0.8846, "step": 10 }, { "epoch": 0.3125, "grad_norm": 5.37629022424091, "learning_rate": 8.25e-06, "loss": 0.8294, "step": 15 }, { "epoch": 0.4166666666666667, "grad_norm": 4.943891569973697, "learning_rate": 1.2e-05, "loss": 0.8056, "step": 20 }, { "epoch": 0.4166666666666667, "eval_loss": 0.8397957682609558, "eval_runtime": 1.9971, "eval_samples_per_second": 170.243, "eval_steps_per_second": 1.502, "step": 20 }, { "epoch": 0.5208333333333334, "grad_norm": 4.813118762922576, "learning_rate": 1.4071209905461127e-05, "loss": 0.8125, "step": 25 }, { "epoch": 0.625, "grad_norm": 4.128443703273486, "learning_rate": 1.0166196232101288e-05, "loss": 0.8944, "step": 30 }, { "epoch": 0.7291666666666666, "grad_norm": 4.063799357973714, "learning_rate": 7.276248845991498e-06, "loss": 0.813, "step": 35 }, { "epoch": 0.8333333333333334, "grad_norm": 4.337216872639982, "learning_rate": 5.157388080190487e-06, "loss": 0.7984, "step": 40 }, { "epoch": 0.8333333333333334, "eval_loss": 0.8509224057197571, "eval_runtime": 1.9642, "eval_samples_per_second": 173.098, "eval_steps_per_second": 1.527, "step": 40 }, { "epoch": 0.9375, "grad_norm": 4.071080361855319, "learning_rate": 3.6192313334626905e-06, "loss": 0.8755, "step": 45 }, { "epoch": 1.0416666666666667, "grad_norm": 3.7602687878599754, "learning_rate": 2.514391432582838e-06, "loss": 0.7267, "step": 50 }, { "epoch": 1.1458333333333333, "grad_norm": 3.168806435107565, "learning_rate": 1.7297262757656213e-06, "loss": 0.4663, "step": 55 }, { "epoch": 1.25, "grad_norm": 4.185451904109845, "learning_rate": 1.1791620375982074e-06, "loss": 0.439, "step": 60 }, { "epoch": 1.25, "eval_loss": 0.8703265190124512, "eval_runtime": 1.9569, "eval_samples_per_second": 173.748, "eval_steps_per_second": 1.533, "step": 60 }, { "epoch": 1.3541666666666667, "grad_norm": 3.824836183235807, "learning_rate": 7.978466092394693e-07, "loss": 0.4085, "step": 65 }, { "epoch": 1.4583333333333333, "grad_norm": 4.129081897105463, "learning_rate": 5.374210410959207e-07, "loss": 0.445, "step": 70 }, { "epoch": 1.5625, "grad_norm": 4.891809804239562, "learning_rate": 3.6222476698215175e-07, "loss": 0.4392, "step": 75 }, { "epoch": 1.6666666666666665, "grad_norm": 4.139500340821145, "learning_rate": 2.462755297384099e-07, "loss": 0.4595, "step": 80 }, { "epoch": 1.6666666666666665, "eval_loss": 0.8540446162223816, "eval_runtime": 1.9629, "eval_samples_per_second": 173.216, "eval_steps_per_second": 1.528, "step": 80 }, { "epoch": 1.7708333333333335, "grad_norm": 3.493841656862903, "learning_rate": 1.7088740175034947e-07, "loss": 0.431, "step": 85 }, { "epoch": 1.875, "grad_norm": 3.4456890479451556, "learning_rate": 1.228102956599465e-07, "loss": 0.4306, "step": 90 }, { "epoch": 1.9791666666666665, "grad_norm": 4.374839662523855, "learning_rate": 9.279207916081227e-08, "loss": 0.4889, "step": 95 }, { "epoch": 2.0833333333333335, "grad_norm": 3.5459041596116894, "learning_rate": 7.448002404850094e-08, "loss": 0.3986, "step": 100 }, { "epoch": 2.0833333333333335, "eval_loss": 0.8511225581169128, "eval_runtime": 1.9574, "eval_samples_per_second": 173.696, "eval_steps_per_second": 1.533, "step": 100 }, { "epoch": 2.1875, "grad_norm": 3.3821947572330133, "learning_rate": 6.35920070839697e-08, "loss": 0.3708, "step": 105 }, { "epoch": 2.2916666666666665, "grad_norm": 3.2392814873445337, "learning_rate": 5.7299804687499997e-08, "loss": 0.4046, "step": 110 }, { "epoch": 2.3958333333333335, "grad_norm": 3.5500967091243507, "learning_rate": 5.37771434967624e-08, "loss": 0.3535, "step": 115 }, { "epoch": 2.5, "grad_norm": 4.217037134589023, "learning_rate": 5.187403540619925e-08, "loss": 0.3895, "step": 120 }, { "epoch": 2.5, "eval_loss": 0.8556583523750305, "eval_runtime": 1.9695, "eval_samples_per_second": 172.631, "eval_steps_per_second": 1.523, "step": 120 }, { "epoch": 2.6041666666666665, "grad_norm": 3.4755080047765485, "learning_rate": 5.088648238966908e-08, "loss": 0.3734, "step": 125 }, { "epoch": 2.7083333333333335, "grad_norm": 3.807858910773455, "learning_rate": 5.039701925276604e-08, "loss": 0.3789, "step": 130 }, { "epoch": 2.8125, "grad_norm": 3.51077143827924, "learning_rate": 5.0166900048082497e-08, "loss": 0.3885, "step": 135 }, { "epoch": 2.9166666666666665, "grad_norm": 3.3305744521508327, "learning_rate": 5.0065147322870076e-08, "loss": 0.3761, "step": 140 }, { "epoch": 2.9166666666666665, "eval_loss": 0.86014723777771, "eval_runtime": 1.9679, "eval_samples_per_second": 172.774, "eval_steps_per_second": 1.524, "step": 140 }, { "epoch": 3.0208333333333335, "grad_norm": 3.2369875818095495, "learning_rate": 5.002328628528332e-08, "loss": 0.3827, "step": 145 }, { "epoch": 3.125, "grad_norm": 3.2150419045419945, "learning_rate": 5.0007484528133236e-08, "loss": 0.3756, "step": 150 }, { "epoch": 3.2291666666666665, "grad_norm": 3.2615638974480397, "learning_rate": 5.0002110817570477e-08, "loss": 0.3767, "step": 155 }, { "epoch": 3.3333333333333335, "grad_norm": 4.688258261760996, "learning_rate": 5.0000504842356326e-08, "loss": 0.3652, "step": 160 }, { "epoch": 3.3333333333333335, "eval_loss": 0.8633076548576355, "eval_runtime": 1.9631, "eval_samples_per_second": 173.195, "eval_steps_per_second": 1.528, "step": 160 }, { "epoch": 3.4375, "grad_norm": 3.329470519573592, "learning_rate": 5.000009745562451e-08, "loss": 0.363, "step": 165 }, { "epoch": 3.5416666666666665, "grad_norm": 3.609002470579086, "learning_rate": 5.0000014077810156e-08, "loss": 0.3413, "step": 170 }, { "epoch": 3.6458333333333335, "grad_norm": 3.346966183480162, "learning_rate": 5.0000001343508807e-08, "loss": 0.3642, "step": 175 }, { "epoch": 3.75, "grad_norm": 3.5169423852432766, "learning_rate": 5.000000006747581e-08, "loss": 0.3712, "step": 180 }, { "epoch": 3.75, "eval_loss": 0.866676390171051, "eval_runtime": 1.9612, "eval_samples_per_second": 173.366, "eval_steps_per_second": 1.53, "step": 180 }, { "epoch": 3.8541666666666665, "grad_norm": 3.708850323060228, "learning_rate": 5.0000000001094325e-08, "loss": 0.3705, "step": 185 }, { "epoch": 3.9583333333333335, "grad_norm": 3.14278261830864, "learning_rate": 5.000000000000139e-08, "loss": 0.3576, "step": 190 }, { "epoch": 4.0, "step": 192, "total_flos": 5343577374720.0, "train_loss": 0.5120982651909193, "train_runtime": 1104.1807, "train_samples_per_second": 11.063, "train_steps_per_second": 0.174 } ], "logging_steps": 5, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 20, "total_flos": 5343577374720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }