{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3512209246211755, "eval_steps": 1000, "global_step": 14000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 2.676008701324463, "learning_rate": 2.3334888992599506e-05, "loss": 1.6312, "step": 1000 }, { "epoch": 0.1, "eval_loss": 1.348188042640686, "eval_runtime": 27.3501, "eval_samples_per_second": 18.281, "eval_steps_per_second": 2.303, "step": 1000 }, { "epoch": 0.19, "grad_norm": 6.8814849853515625, "learning_rate": 2.166811120741383e-05, "loss": 1.5281, "step": 2000 }, { "epoch": 0.19, "eval_loss": 1.2927687168121338, "eval_runtime": 27.3678, "eval_samples_per_second": 18.27, "eval_steps_per_second": 2.302, "step": 2000 }, { "epoch": 0.29, "grad_norm": 8.751653671264648, "learning_rate": 2.0001333422228148e-05, "loss": 1.4801, "step": 3000 }, { "epoch": 0.29, "eval_loss": 1.2705720663070679, "eval_runtime": 27.3589, "eval_samples_per_second": 18.276, "eval_steps_per_second": 2.303, "step": 3000 }, { "epoch": 0.39, "grad_norm": 6.494958877563477, "learning_rate": 1.833455563704247e-05, "loss": 1.4316, "step": 4000 }, { "epoch": 0.39, "eval_loss": 1.2496768236160278, "eval_runtime": 27.3358, "eval_samples_per_second": 18.291, "eval_steps_per_second": 2.305, "step": 4000 }, { "epoch": 0.48, "grad_norm": 5.609992980957031, "learning_rate": 1.6667777851856793e-05, "loss": 1.4187, "step": 5000 }, { "epoch": 0.48, "eval_loss": 1.2094831466674805, "eval_runtime": 27.3558, "eval_samples_per_second": 18.278, "eval_steps_per_second": 2.303, "step": 5000 }, { "epoch": 0.58, "grad_norm": 7.250636100769043, "learning_rate": 1.5001000066671111e-05, "loss": 1.3534, "step": 6000 }, { "epoch": 0.58, "eval_loss": 1.1994829177856445, "eval_runtime": 27.3608, "eval_samples_per_second": 18.274, "eval_steps_per_second": 2.303, "step": 6000 }, { "epoch": 0.68, "grad_norm": 7.297804832458496, "learning_rate": 1.3334222281485434e-05, "loss": 1.3494, "step": 7000 }, { "epoch": 0.68, "eval_loss": 1.1822370290756226, "eval_runtime": 27.3595, "eval_samples_per_second": 18.275, "eval_steps_per_second": 2.303, "step": 7000 }, { "epoch": 0.77, "grad_norm": 4.8396100997924805, "learning_rate": 1.1667444496299753e-05, "loss": 1.3102, "step": 8000 }, { "epoch": 0.77, "eval_loss": 1.1708005666732788, "eval_runtime": 27.3464, "eval_samples_per_second": 18.284, "eval_steps_per_second": 2.304, "step": 8000 }, { "epoch": 0.87, "grad_norm": 10.750748634338379, "learning_rate": 1.0000666711114074e-05, "loss": 1.2833, "step": 9000 }, { "epoch": 0.87, "eval_loss": 1.1537472009658813, "eval_runtime": 27.3595, "eval_samples_per_second": 18.275, "eval_steps_per_second": 2.303, "step": 9000 }, { "epoch": 0.97, "grad_norm": 6.962567329406738, "learning_rate": 8.333888925928397e-06, "loss": 1.2839, "step": 10000 }, { "epoch": 0.97, "eval_loss": 1.1324294805526733, "eval_runtime": 27.3553, "eval_samples_per_second": 18.278, "eval_steps_per_second": 2.303, "step": 10000 }, { "epoch": 1.06, "grad_norm": 7.520838260650635, "learning_rate": 6.667111140742717e-06, "loss": 1.065, "step": 11000 }, { "epoch": 1.06, "eval_loss": 1.1342204809188843, "eval_runtime": 27.3703, "eval_samples_per_second": 18.268, "eval_steps_per_second": 2.302, "step": 11000 }, { "epoch": 1.16, "grad_norm": 8.578242301940918, "learning_rate": 5.000333355557037e-06, "loss": 0.9611, "step": 12000 }, { "epoch": 1.16, "eval_loss": 1.1292400360107422, "eval_runtime": 27.3593, "eval_samples_per_second": 18.275, "eval_steps_per_second": 2.303, "step": 12000 }, { "epoch": 1.25, "grad_norm": 7.761205673217773, "learning_rate": 3.3335555703713584e-06, "loss": 0.9462, "step": 13000 }, { "epoch": 1.25, "eval_loss": 1.1263900995254517, "eval_runtime": 27.3672, "eval_samples_per_second": 18.27, "eval_steps_per_second": 2.302, "step": 13000 }, { "epoch": 1.35, "grad_norm": 7.373520374298096, "learning_rate": 1.6667777851856792e-06, "loss": 0.9547, "step": 14000 }, { "epoch": 1.35, "eval_loss": 1.117197036743164, "eval_runtime": 27.3661, "eval_samples_per_second": 18.271, "eval_steps_per_second": 2.302, "step": 14000 } ], "logging_steps": 1000, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 5.29975150430208e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }