{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.971223021582734, "eval_steps": 25, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.64, "grad_norm": 494.05772193159083, "learning_rate": 9.999874838141888e-05, "loss": 1.7591, "step": 25 }, { "epoch": 0.64, "eval_loss": 4.927945613861084, "eval_runtime": 213.4346, "eval_samples_per_second": 46.857, "eval_steps_per_second": 0.736, "step": 25 }, { "epoch": 1.28, "grad_norm": 41.85279295369776, "learning_rate": 9.915628588978522e-05, "loss": 2.0299, "step": 50 }, { "epoch": 1.28, "eval_loss": 0.818238377571106, "eval_runtime": 211.9322, "eval_samples_per_second": 47.19, "eval_steps_per_second": 0.741, "step": 50 }, { "epoch": 1.92, "grad_norm": 10.844358526437997, "learning_rate": 9.67797005288181e-05, "loss": 0.6558, "step": 75 }, { "epoch": 1.92, "eval_loss": 0.5749660134315491, "eval_runtime": 211.6596, "eval_samples_per_second": 47.25, "eval_steps_per_second": 0.742, "step": 75 }, { "epoch": 2.56, "grad_norm": 6.085279119700608, "learning_rate": 9.294316336102132e-05, "loss": 0.4785, "step": 100 }, { "epoch": 2.56, "eval_loss": 0.38230499625205994, "eval_runtime": 213.3859, "eval_samples_per_second": 46.868, "eval_steps_per_second": 0.736, "step": 100 }, { "epoch": 3.2, "grad_norm": 3.033775912026443, "learning_rate": 8.776640921382584e-05, "loss": 0.3837, "step": 125 }, { "epoch": 3.2, "eval_loss": 0.29413464665412903, "eval_runtime": 213.545, "eval_samples_per_second": 46.833, "eval_steps_per_second": 0.735, "step": 125 }, { "epoch": 3.84, "grad_norm": 1.1383110535612262, "learning_rate": 8.141099986478212e-05, "loss": 0.3073, "step": 150 }, { "epoch": 3.84, "eval_loss": 0.23183606564998627, "eval_runtime": 212.4796, "eval_samples_per_second": 47.068, "eval_steps_per_second": 0.739, "step": 150 }, { "epoch": 4.48, "grad_norm": 1.081127753088496, "learning_rate": 7.407528184577019e-05, "loss": 0.2119, "step": 175 }, { "epoch": 4.48, "eval_loss": 0.18712490797042847, "eval_runtime": 213.5483, "eval_samples_per_second": 46.833, "eval_steps_per_second": 0.735, "step": 175 }, { "epoch": 5.12, "grad_norm": 0.36145826033813083, "learning_rate": 6.598819622856227e-05, "loss": 0.1632, "step": 200 }, { "epoch": 5.12, "eval_loss": 0.15954019129276276, "eval_runtime": 211.2408, "eval_samples_per_second": 47.344, "eval_steps_per_second": 0.743, "step": 200 }, { "epoch": 5.76, "grad_norm": 0.3381871309692255, "learning_rate": 5.7402133582686576e-05, "loss": 0.1297, "step": 225 }, { "epoch": 5.76, "eval_loss": 0.14867956936359406, "eval_runtime": 214.0779, "eval_samples_per_second": 46.717, "eval_steps_per_second": 0.733, "step": 225 }, { "epoch": 6.39, "grad_norm": 0.3059190090918364, "learning_rate": 4.85850570958441e-05, "loss": 0.1035, "step": 250 }, { "epoch": 6.39, "eval_loss": 0.14759863913059235, "eval_runtime": 212.6859, "eval_samples_per_second": 47.022, "eval_steps_per_second": 0.738, "step": 250 }, { "epoch": 7.03, "grad_norm": 0.28211565488590556, "learning_rate": 3.9812139687108815e-05, "loss": 0.0856, "step": 275 }, { "epoch": 7.03, "eval_loss": 0.14268410205841064, "eval_runtime": 210.2857, "eval_samples_per_second": 47.559, "eval_steps_per_second": 0.747, "step": 275 }, { "epoch": 7.67, "grad_norm": 0.3745125634810898, "learning_rate": 3.135717611098458e-05, "loss": 0.0574, "step": 300 }, { "epoch": 7.67, "eval_loss": 0.14823941886425018, "eval_runtime": 214.2104, "eval_samples_per_second": 46.688, "eval_steps_per_second": 0.733, "step": 300 }, { "epoch": 8.31, "grad_norm": 0.3453636602728437, "learning_rate": 2.3484038072721758e-05, "loss": 0.0448, "step": 325 }, { "epoch": 8.31, "eval_loss": 0.1552451252937317, "eval_runtime": 210.5548, "eval_samples_per_second": 47.498, "eval_steps_per_second": 0.746, "step": 325 }, { "epoch": 8.95, "grad_norm": 0.28973521029613436, "learning_rate": 1.6438439032954855e-05, "loss": 0.0318, "step": 350 }, { "epoch": 8.95, "eval_loss": 0.15622578561306, "eval_runtime": 210.6816, "eval_samples_per_second": 47.47, "eval_steps_per_second": 0.745, "step": 350 }, { "epoch": 9.59, "grad_norm": 0.20130206793775604, "learning_rate": 1.0440265714600572e-05, "loss": 0.0196, "step": 375 }, { "epoch": 9.59, "eval_loss": 0.1708817183971405, "eval_runtime": 213.4033, "eval_samples_per_second": 46.864, "eval_steps_per_second": 0.736, "step": 375 }, { "epoch": 10.23, "grad_norm": 0.12286721385196743, "learning_rate": 5.676715638695063e-06, "loss": 0.0146, "step": 400 }, { "epoch": 10.23, "eval_loss": 0.1793455183506012, "eval_runtime": 212.2341, "eval_samples_per_second": 47.122, "eval_steps_per_second": 0.74, "step": 400 }, { "epoch": 10.87, "grad_norm": 0.07993244910369399, "learning_rate": 2.2964548604209213e-06, "loss": 0.0084, "step": 425 }, { "epoch": 10.87, "eval_loss": 0.1853875368833542, "eval_runtime": 213.6886, "eval_samples_per_second": 46.802, "eval_steps_per_second": 0.735, "step": 425 }, { "epoch": 11.51, "grad_norm": 0.05627841974218994, "learning_rate": 4.049782370561583e-07, "loss": 0.0058, "step": 450 }, { "epoch": 11.51, "eval_loss": 0.1918596774339676, "eval_runtime": 211.3759, "eval_samples_per_second": 47.314, "eval_steps_per_second": 0.743, "step": 450 }, { "epoch": 11.97, "step": 468, "total_flos": 3135460343808000.0, "train_loss": 0.346917372992915, "train_runtime": 79535.2013, "train_samples_per_second": 12.071, "train_steps_per_second": 0.006 } ], "logging_steps": 25, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 2000, "total_flos": 3135460343808000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }