{ "best_metric": 1.0291192531585693, "best_model_checkpoint": "savedEpoch_nccl/checkpoint-91656", "epoch": 6.0, "eval_steps": 500, "global_step": 91656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.6372342109680176, "learning_rate": 9.882461854198269e-05, "loss": 1.1441, "step": 15276 }, { "epoch": 1.0, "eval_loss": 1.0554957389831543, "eval_runtime": 57.5344, "eval_samples_per_second": 12743.952, "eval_steps_per_second": 33.198, "step": 15276 }, { "epoch": 2.0, "grad_norm": 0.6399237513542175, "learning_rate": 9.541848844472206e-05, "loss": 1.0486, "step": 30552 }, { "epoch": 2.0, "eval_loss": 1.0447489023208618, "eval_runtime": 240.72, "eval_samples_per_second": 3045.924, "eval_steps_per_second": 7.935, "step": 30552 }, { "epoch": 3.0, "grad_norm": 0.7907009124755859, "learning_rate": 9.201235834746142e-05, "loss": 1.0407, "step": 45828 }, { "epoch": 3.0, "eval_loss": 1.0373780727386475, "eval_runtime": 56.906, "eval_samples_per_second": 12884.672, "eval_steps_per_second": 33.564, "step": 45828 }, { "epoch": 4.0, "grad_norm": 0.5956621766090393, "learning_rate": 8.860622825020078e-05, "loss": 1.0376, "step": 61104 }, { "epoch": 4.0, "eval_loss": 1.0327318906784058, "eval_runtime": 57.0015, "eval_samples_per_second": 12863.086, "eval_steps_per_second": 33.508, "step": 61104 }, { "epoch": 5.0, "grad_norm": 0.6499078273773193, "learning_rate": 8.519965200321228e-05, "loss": 1.037, "step": 76380 }, { "epoch": 5.0, "eval_loss": 1.0331288576126099, "eval_runtime": 57.0146, "eval_samples_per_second": 12860.131, "eval_steps_per_second": 33.5, "step": 76380 }, { "epoch": 6.0, "grad_norm": 0.5220733284950256, "learning_rate": 8.179329883108772e-05, "loss": 1.0344, "step": 91656 }, { "epoch": 6.0, "eval_loss": 1.0291192531585693, "eval_runtime": 56.9287, "eval_samples_per_second": 12879.541, "eval_steps_per_second": 33.551, "step": 91656 } ], "logging_steps": 500, "max_steps": 458280, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 8.139053392031908e+17, "train_batch_size": 96, "trial_name": null, "trial_params": null }