{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.090909090909092, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 220.0, "learning_rate": 4e-05, "loss": 24.9691, "step": 1 }, { "epoch": 0.91, "grad_norm": 21.75, "learning_rate": 0.0002, "loss": 20.4692, "step": 5 }, { "epoch": 0.91, "eval_loss": 7.439934253692627, "eval_runtime": 0.5593, "eval_samples_per_second": 3.576, "eval_steps_per_second": 1.788, "step": 5 }, { "epoch": 1.82, "grad_norm": 22.875, "learning_rate": 0.00019396926207859084, "loss": 12.9912, "step": 10 }, { "epoch": 2.0, "eval_loss": 6.607400894165039, "eval_runtime": 0.5606, "eval_samples_per_second": 3.567, "eval_steps_per_second": 1.784, "step": 11 }, { "epoch": 2.73, "grad_norm": 3.96875, "learning_rate": 0.0001766044443118978, "loss": 10.1734, "step": 15 }, { "epoch": 2.91, "eval_loss": 6.02254581451416, "eval_runtime": 0.5738, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.743, "step": 16 }, { "epoch": 3.64, "grad_norm": 13.875, "learning_rate": 0.00015000000000000001, "loss": 9.8269, "step": 20 }, { "epoch": 4.0, "eval_loss": 3.550281047821045, "eval_runtime": 0.5671, "eval_samples_per_second": 3.527, "eval_steps_per_second": 1.763, "step": 22 }, { "epoch": 4.55, "grad_norm": 22.5, "learning_rate": 0.00011736481776669306, "loss": 5.2353, "step": 25 }, { "epoch": 4.91, "eval_loss": 1.6504679918289185, "eval_runtime": 0.5777, "eval_samples_per_second": 3.462, "eval_steps_per_second": 1.731, "step": 27 }, { "epoch": 5.45, "grad_norm": 3.078125, "learning_rate": 8.263518223330697e-05, "loss": 1.6367, "step": 30 }, { "epoch": 6.0, "eval_loss": 1.4912230968475342, "eval_runtime": 0.5649, "eval_samples_per_second": 3.54, "eval_steps_per_second": 1.77, "step": 33 }, { "epoch": 6.36, "grad_norm": 1.109375, "learning_rate": 5.000000000000002e-05, "loss": 1.4714, "step": 35 }, { "epoch": 6.91, "eval_loss": 1.420060157775879, "eval_runtime": 0.5812, "eval_samples_per_second": 3.441, "eval_steps_per_second": 1.72, "step": 38 }, { "epoch": 7.27, "grad_norm": 1.1171875, "learning_rate": 2.339555568810221e-05, "loss": 1.3916, "step": 40 }, { "epoch": 8.0, "eval_loss": 1.3933132886886597, "eval_runtime": 0.5657, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.768, "step": 44 }, { "epoch": 8.18, "grad_norm": 0.95703125, "learning_rate": 6.030737921409169e-06, "loss": 1.2832, "step": 45 }, { "epoch": 8.91, "eval_loss": 1.3881534337997437, "eval_runtime": 0.5859, "eval_samples_per_second": 3.414, "eval_steps_per_second": 1.707, "step": 49 }, { "epoch": 9.09, "grad_norm": 0.83203125, "learning_rate": 0.0, "loss": 1.2863, "step": 50 }, { "epoch": 9.09, "eval_loss": 1.3843274116516113, "eval_runtime": 0.5652, "eval_samples_per_second": 3.539, "eval_steps_per_second": 1.769, "step": 50 }, { "epoch": 9.09, "step": 50, "total_flos": 7.66894376306606e+16, "train_loss": 6.66651237487793, "train_runtime": 191.2414, "train_samples_per_second": 4.602, "train_steps_per_second": 0.261 } ], "logging_steps": 5, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 7.66894376306606e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }