{ "best_metric": 0.8423399925231934, "best_model_checkpoint": "./output/training_results/C018_random_sample_llama3-8b-base_instruct_20240504_182259/checkpoint-20", "epoch": 4.0, "eval_steps": 20, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020833333333333332, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.9378, "step": 1 }, { "epoch": 0.10416666666666667, "grad_norm": 12.797572312512326, "learning_rate": 1.5e-06, "loss": 0.914, "step": 5 }, { "epoch": 0.20833333333333334, "grad_norm": 6.055131806849878, "learning_rate": 3.75e-06, "loss": 0.9191, "step": 10 }, { "epoch": 0.3125, "grad_norm": 5.364360008317059, "learning_rate": 7.5e-06, "loss": 0.8384, "step": 15 }, { "epoch": 0.4166666666666667, "grad_norm": 5.029346041755506, "learning_rate": 1.125e-05, "loss": 0.8108, "step": 20 }, { "epoch": 0.4166666666666667, "eval_loss": 0.8423399925231934, "eval_runtime": 1.9929, "eval_samples_per_second": 170.602, "eval_steps_per_second": 1.505, "step": 20 }, { "epoch": 0.5208333333333334, "grad_norm": 4.414919525176414, "learning_rate": 1.5e-05, "loss": 0.8145, "step": 25 }, { "epoch": 0.625, "grad_norm": 4.112077512908992, "learning_rate": 1.0857107196807194e-05, "loss": 0.8903, "step": 30 }, { "epoch": 0.7291666666666666, "grad_norm": 3.941277162112511, "learning_rate": 7.785589881369409e-06, "loss": 0.8187, "step": 35 }, { "epoch": 0.8333333333333334, "grad_norm": 4.254470659708268, "learning_rate": 5.529292099652595e-06, "loss": 0.7995, "step": 40 }, { "epoch": 0.8333333333333334, "eval_loss": 0.8555145263671875, "eval_runtime": 1.9676, "eval_samples_per_second": 172.799, "eval_steps_per_second": 1.525, "step": 40 }, { "epoch": 0.9375, "grad_norm": 3.9948611107854326, "learning_rate": 3.888024511896068e-06, "loss": 0.882, "step": 45 }, { "epoch": 1.0416666666666667, "grad_norm": 3.9168522584614895, "learning_rate": 2.706555900111454e-06, "loss": 0.736, "step": 50 }, { "epoch": 1.1458333333333333, "grad_norm": 2.943062356850262, "learning_rate": 1.865515934042282e-06, "loss": 0.4778, "step": 55 }, { "epoch": 1.25, "grad_norm": 4.767880244063883, "learning_rate": 1.2739241815556468e-06, "loss": 0.4526, "step": 60 }, { "epoch": 1.25, "eval_loss": 0.8815763592720032, "eval_runtime": 1.9656, "eval_samples_per_second": 172.976, "eval_steps_per_second": 1.526, "step": 60 }, { "epoch": 1.3541666666666667, "grad_norm": 3.995356444110687, "learning_rate": 8.630954296648578e-07, "loss": 0.4192, "step": 65 }, { "epoch": 1.4583333333333333, "grad_norm": 4.06684246492274, "learning_rate": 5.817031181133133e-07, "loss": 0.4476, "step": 70 }, { "epoch": 1.5625, "grad_norm": 4.659083580115326, "learning_rate": 3.918112984729563e-07, "loss": 0.445, "step": 75 }, { "epoch": 1.6666666666666665, "grad_norm": 3.841871360565151, "learning_rate": 2.6571123033559406e-07, "loss": 0.4663, "step": 80 }, { "epoch": 1.6666666666666665, "eval_loss": 0.8521442413330078, "eval_runtime": 1.9598, "eval_samples_per_second": 173.485, "eval_steps_per_second": 1.531, "step": 80 }, { "epoch": 1.7708333333333335, "grad_norm": 3.5797328107351154, "learning_rate": 1.8342171723792628e-07, "loss": 0.4356, "step": 85 }, { "epoch": 1.875, "grad_norm": 3.4219289180238364, "learning_rate": 1.3073276582043678e-07, "loss": 0.4398, "step": 90 }, { "epoch": 1.9791666666666665, "grad_norm": 3.7181063684193703, "learning_rate": 9.769031226024856e-08, "loss": 0.4823, "step": 95 }, { "epoch": 2.0833333333333335, "grad_norm": 3.4287058824726855, 
"learning_rate": 7.74357826465857e-08, "loss": 0.3927, "step": 100 }, { "epoch": 2.0833333333333335, "eval_loss": 0.8507040143013, "eval_runtime": 1.96, "eval_samples_per_second": 173.471, "eval_steps_per_second": 1.531, "step": 100 }, { "epoch": 2.1875, "grad_norm": 3.4015335314434845, "learning_rate": 6.532831361687478e-08, "loss": 0.3733, "step": 105 }, { "epoch": 2.2916666666666665, "grad_norm": 3.3103683305418437, "learning_rate": 5.828972369827512e-08, "loss": 0.4061, "step": 110 }, { "epoch": 2.3958333333333335, "grad_norm": 3.482262897795111, "learning_rate": 5.4322954384342975e-08, "loss": 0.3621, "step": 115 }, { "epoch": 2.5, "grad_norm": 4.326841270087755, "learning_rate": 5.2163845524645534e-08, "loss": 0.4017, "step": 120 }, { "epoch": 2.5, "eval_loss": 0.8561407923698425, "eval_runtime": 1.9661, "eval_samples_per_second": 172.93, "eval_steps_per_second": 1.526, "step": 120 }, { "epoch": 2.6041666666666665, "grad_norm": 3.5184015061674216, "learning_rate": 5.1033917145757624e-08, "loss": 0.3944, "step": 125 }, { "epoch": 2.7083333333333335, "grad_norm": 3.67879442141003, "learning_rate": 5.046843690876512e-08, "loss": 0.3947, "step": 130 }, { "epoch": 2.8125, "grad_norm": 3.475723886508797, "learning_rate": 5.019958911899713e-08, "loss": 0.381, "step": 135 }, { "epoch": 2.9166666666666665, "grad_norm": 3.2733636154827503, "learning_rate": 5.0079150140309806e-08, "loss": 0.368, "step": 140 }, { "epoch": 2.9166666666666665, "eval_loss": 0.8607857823371887, "eval_runtime": 1.9683, "eval_samples_per_second": 172.741, "eval_steps_per_second": 1.524, "step": 140 }, { "epoch": 3.0208333333333335, "grad_norm": 3.1617566806185375, "learning_rate": 5.0028831355203246e-08, "loss": 0.3777, "step": 145 }, { "epoch": 3.125, "grad_norm": 3.228899698066603, "learning_rate": 5.00094821039914e-08, "loss": 0.3817, "step": 150 }, { "epoch": 3.2291666666666665, "grad_norm": 3.322344354923687, "learning_rate": 5.000275150604354e-08, "loss": 0.3773, "step": 155 }, { "epoch": 3.3333333333333335, "grad_norm": 4.645153711557618, "learning_rate": 5.000068241292119e-08, "loss": 0.3677, "step": 160 }, { "epoch": 3.3333333333333335, "eval_loss": 0.8646895885467529, "eval_runtime": 1.964, "eval_samples_per_second": 173.116, "eval_steps_per_second": 1.527, "step": 160 }, { "epoch": 3.4375, "grad_norm": 3.243053987237776, "learning_rate": 5.000013819045227e-08, "loss": 0.3656, "step": 165 }, { "epoch": 3.5416666666666665, "grad_norm": 3.582352877774696, "learning_rate": 5.000002132208559e-08, "loss": 0.3503, "step": 170 }, { "epoch": 3.6458333333333335, "grad_norm": 3.2701278989275857, "learning_rate": 5.00000022411853e-08, "loss": 0.3707, "step": 175 }, { "epoch": 3.75, "grad_norm": 3.338127751209054, "learning_rate": 5.000000013145176e-08, "loss": 0.3635, "step": 180 }, { "epoch": 3.75, "eval_loss": 0.8675721287727356, "eval_runtime": 1.9612, "eval_samples_per_second": 173.361, "eval_steps_per_second": 1.53, "step": 180 }, { "epoch": 3.8541666666666665, "grad_norm": 3.6011084207595765, "learning_rate": 5.000000000284985e-08, "loss": 0.3833, "step": 185 }, { "epoch": 3.9583333333333335, "grad_norm": 3.0931269944491455, "learning_rate": 5.000000000000758e-08, "loss": 0.3722, "step": 190 }, { "epoch": 4.0, "step": 192, "total_flos": 5305238814720.0, "train_loss": 0.5165732034171621, "train_runtime": 1007.2477, "train_samples_per_second": 12.128, "train_steps_per_second": 0.191 } ], "logging_steps": 5, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 20, "total_flos": 
5305238814720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }