{ "best_metric": 1.1285432577133179, "best_model_checkpoint": "./ryan_model/checkpoint-100", "epoch": 4.0, "eval_steps": 100, "global_step": 152, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.26, "grad_norm": 1.8508069515228271, "learning_rate": 0.00018684210526315792, "loss": 1.7256, "step": 10 }, { "epoch": 0.53, "grad_norm": 2.4252703189849854, "learning_rate": 0.0001736842105263158, "loss": 1.5727, "step": 20 }, { "epoch": 0.79, "grad_norm": 2.249209403991699, "learning_rate": 0.0001605263157894737, "loss": 1.4208, "step": 30 }, { "epoch": 1.05, "grad_norm": 2.09879207611084, "learning_rate": 0.0001486842105263158, "loss": 1.2251, "step": 40 }, { "epoch": 1.32, "grad_norm": 1.9784414768218994, "learning_rate": 0.0001355263157894737, "loss": 0.9854, "step": 50 }, { "epoch": 1.58, "grad_norm": 1.5055615901947021, "learning_rate": 0.00012236842105263157, "loss": 0.9456, "step": 60 }, { "epoch": 1.84, "grad_norm": 1.551707148551941, "learning_rate": 0.00010921052631578947, "loss": 0.8805, "step": 70 }, { "epoch": 2.11, "grad_norm": 1.3849328756332397, "learning_rate": 9.605263157894737e-05, "loss": 0.7414, "step": 80 }, { "epoch": 2.37, "grad_norm": 1.9976824522018433, "learning_rate": 8.289473684210527e-05, "loss": 0.5372, "step": 90 }, { "epoch": 2.63, "grad_norm": 2.4758734703063965, "learning_rate": 6.973684210526315e-05, "loss": 0.4821, "step": 100 }, { "epoch": 2.63, "eval_accuracy": 0.5583333333333333, "eval_loss": 1.1285432577133179, "eval_runtime": 60.0048, "eval_samples_per_second": 9.999, "eval_steps_per_second": 1.25, "step": 100 }, { "epoch": 2.89, "grad_norm": 1.7585190534591675, "learning_rate": 5.6578947368421056e-05, "loss": 0.4428, "step": 110 }, { "epoch": 3.16, "grad_norm": 0.9120551943778992, "learning_rate": 4.342105263157895e-05, "loss": 0.3698, "step": 120 }, { "epoch": 3.42, "grad_norm": 0.7287651896476746, "learning_rate": 3.0263157894736844e-05, "loss": 0.2302, "step": 130 }, { "epoch": 3.68, "grad_norm": 1.8238482475280762, "learning_rate": 1.7105263157894737e-05, "loss": 0.2082, "step": 140 }, { "epoch": 3.95, "grad_norm": 0.6237545609474182, "learning_rate": 3.9473684210526315e-06, "loss": 0.2266, "step": 150 }, { "epoch": 4.0, "step": 152, "total_flos": 1.85987442622464e+17, "train_loss": 0.7916242197940224, "train_runtime": 235.103, "train_samples_per_second": 10.208, "train_steps_per_second": 0.647 } ], "logging_steps": 10, "max_steps": 152, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 1.85987442622464e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }