{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.971223021582734, "eval_steps": 25, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.64, "grad_norm": 0.9966348377433891, "learning_rate": 9.999874838141888e-05, "loss": 0.1831, "step": 25 }, { "epoch": 0.64, "eval_loss": 0.12574630975723267, "eval_runtime": 110.3252, "eval_samples_per_second": 90.65, "eval_steps_per_second": 1.423, "step": 25 }, { "epoch": 1.28, "grad_norm": 0.13412292959827216, "learning_rate": 9.915628588978522e-05, "loss": 0.1239, "step": 50 }, { "epoch": 1.28, "eval_loss": 0.10444429516792297, "eval_runtime": 110.3167, "eval_samples_per_second": 90.657, "eval_steps_per_second": 1.423, "step": 50 }, { "epoch": 1.92, "grad_norm": 0.1395974887205158, "learning_rate": 9.67797005288181e-05, "loss": 0.108, "step": 75 }, { "epoch": 1.92, "eval_loss": 0.0995471403002739, "eval_runtime": 110.0262, "eval_samples_per_second": 90.897, "eval_steps_per_second": 1.427, "step": 75 }, { "epoch": 2.56, "grad_norm": 0.1280464988861011, "learning_rate": 9.294316336102132e-05, "loss": 0.0976, "step": 100 }, { "epoch": 2.56, "eval_loss": 0.09775934368371964, "eval_runtime": 109.8969, "eval_samples_per_second": 91.003, "eval_steps_per_second": 1.429, "step": 100 }, { "epoch": 3.2, "grad_norm": 0.09561572064129414, "learning_rate": 8.776640921382584e-05, "loss": 0.094, "step": 125 }, { "epoch": 3.2, "eval_loss": 0.08864283561706543, "eval_runtime": 110.0582, "eval_samples_per_second": 90.87, "eval_steps_per_second": 1.427, "step": 125 }, { "epoch": 3.84, "grad_norm": 0.11127416265410695, "learning_rate": 8.141099986478212e-05, "loss": 0.0828, "step": 150 }, { "epoch": 3.84, "eval_loss": 0.08932521939277649, "eval_runtime": 109.3425, "eval_samples_per_second": 91.465, "eval_steps_per_second": 1.436, "step": 150 }, { "epoch": 4.48, "grad_norm": 0.22132698379917157, "learning_rate": 7.407528184577019e-05, "loss": 0.078, "step": 175 }, { "epoch": 4.48, "eval_loss": 0.09065766632556915, "eval_runtime": 110.1178, "eval_samples_per_second": 90.821, "eval_steps_per_second": 1.426, "step": 175 }, { "epoch": 5.12, "grad_norm": 0.08955720132434862, "learning_rate": 6.598819622856227e-05, "loss": 0.0767, "step": 200 }, { "epoch": 5.12, "eval_loss": 0.08660481870174408, "eval_runtime": 109.7782, "eval_samples_per_second": 91.102, "eval_steps_per_second": 1.43, "step": 200 }, { "epoch": 5.76, "grad_norm": 0.09375419122940627, "learning_rate": 5.7402133582686576e-05, "loss": 0.0697, "step": 225 }, { "epoch": 5.76, "eval_loss": 0.08401340246200562, "eval_runtime": 109.3041, "eval_samples_per_second": 91.497, "eval_steps_per_second": 1.436, "step": 225 }, { "epoch": 6.39, "grad_norm": 0.0990995516325273, "learning_rate": 4.85850570958441e-05, "loss": 0.0646, "step": 250 }, { "epoch": 6.39, "eval_loss": 0.08192210644483566, "eval_runtime": 109.5799, "eval_samples_per_second": 91.267, "eval_steps_per_second": 1.433, "step": 250 }, { "epoch": 7.03, "grad_norm": 0.07754859448629634, "learning_rate": 3.9812139687108815e-05, "loss": 0.0594, "step": 275 }, { "epoch": 7.03, "eval_loss": 0.07945634424686432, "eval_runtime": 109.4058, "eval_samples_per_second": 91.412, "eval_steps_per_second": 1.435, "step": 275 }, { "epoch": 7.67, "grad_norm": 0.06681798482966637, "learning_rate": 3.135717611098458e-05, "loss": 0.052, "step": 300 }, { "epoch": 7.67, "eval_loss": 0.07952920347452164, "eval_runtime": 110.1672, "eval_samples_per_second": 90.78, "eval_steps_per_second": 1.425, "step": 300 }, { "epoch": 8.31, "grad_norm": 0.10690045013292092, "learning_rate": 2.3484038072721758e-05, "loss": 0.0478, "step": 325 }, { "epoch": 8.31, "eval_loss": 0.08034859597682953, "eval_runtime": 109.4321, "eval_samples_per_second": 91.39, "eval_steps_per_second": 1.435, "step": 325 }, { "epoch": 8.95, "grad_norm": 0.08719559737170716, "learning_rate": 1.6438439032954855e-05, "loss": 0.0447, "step": 350 }, { "epoch": 8.95, "eval_loss": 0.07858795672655106, "eval_runtime": 110.1518, "eval_samples_per_second": 90.793, "eval_steps_per_second": 1.425, "step": 350 }, { "epoch": 9.59, "grad_norm": 0.04932389885679807, "learning_rate": 1.0440265714600572e-05, "loss": 0.0392, "step": 375 }, { "epoch": 9.59, "eval_loss": 0.07998502999544144, "eval_runtime": 109.5251, "eval_samples_per_second": 91.312, "eval_steps_per_second": 1.433, "step": 375 }, { "epoch": 10.23, "grad_norm": 0.03799019571227359, "learning_rate": 5.676715638695063e-06, "loss": 0.038, "step": 400 }, { "epoch": 10.23, "eval_loss": 0.08129393309354782, "eval_runtime": 109.5381, "eval_samples_per_second": 91.302, "eval_steps_per_second": 1.433, "step": 400 }, { "epoch": 10.87, "grad_norm": 0.034874323591440555, "learning_rate": 2.2964548604209213e-06, "loss": 0.0357, "step": 425 }, { "epoch": 10.87, "eval_loss": 0.08102953433990479, "eval_runtime": 110.2721, "eval_samples_per_second": 90.694, "eval_steps_per_second": 1.424, "step": 425 }, { "epoch": 11.51, "grad_norm": 0.038884021048955025, "learning_rate": 4.049782370561583e-07, "loss": 0.035, "step": 450 }, { "epoch": 11.51, "eval_loss": 0.08157423138618469, "eval_runtime": 109.9602, "eval_samples_per_second": 90.951, "eval_steps_per_second": 1.428, "step": 450 }, { "epoch": 11.97, "step": 468, "total_flos": 1.0672624631808e+16, "train_loss": 0.07237311357106918, "train_runtime": 39649.873, "train_samples_per_second": 24.213, "train_steps_per_second": 0.012 } ], "logging_steps": 25, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 2000, "total_flos": 1.0672624631808e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }