{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 129, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 8701.357710303886, "learning_rate": 1.5384615384615387e-06, "loss": 28.3212, "step": 1 }, { "epoch": 0.12, "grad_norm": 481.362737716594, "learning_rate": 7.692307692307694e-06, "loss": 22.1134, "step": 5 }, { "epoch": 0.23, "grad_norm": 259.4834207727077, "learning_rate": 1.5384615384615387e-05, "loss": 14.8237, "step": 10 }, { "epoch": 0.35, "grad_norm": 131.82252822714617, "learning_rate": 1.998533413851124e-05, "loss": 12.5705, "step": 15 }, { "epoch": 0.47, "grad_norm": 387.9499566391837, "learning_rate": 1.982083682742156e-05, "loss": 13.7286, "step": 20 }, { "epoch": 0.58, "grad_norm": 219.00627726463617, "learning_rate": 1.9476531711828027e-05, "loss": 11.3422, "step": 25 }, { "epoch": 0.7, "grad_norm": 221.35867761333137, "learning_rate": 1.8958722607586883e-05, "loss": 7.7604, "step": 30 }, { "epoch": 0.81, "grad_norm": 43.14746736074378, "learning_rate": 1.827688998156891e-05, "loss": 3.4082, "step": 35 }, { "epoch": 0.93, "grad_norm": 29.64836419071723, "learning_rate": 1.7443517375622706e-05, "loss": 2.9518, "step": 40 }, { "epoch": 1.05, "grad_norm": 21.427062393117673, "learning_rate": 1.647386284781828e-05, "loss": 2.6311, "step": 45 }, { "epoch": 1.16, "grad_norm": 19.158148236389312, "learning_rate": 1.5385679615609045e-05, "loss": 2.4838, "step": 50 }, { "epoch": 1.28, "grad_norm": 14.070587316942852, "learning_rate": 1.4198891015602648e-05, "loss": 2.3473, "step": 55 }, { "epoch": 1.4, "grad_norm": 34.76326434361083, "learning_rate": 1.2935225731039349e-05, "loss": 2.2796, "step": 60 }, { "epoch": 1.51, "grad_norm": 16.17405801850995, "learning_rate": 1.161781996552765e-05, "loss": 2.1971, "step": 65 }, { "epoch": 1.63, "grad_norm": 10.06558936803853, "learning_rate": 1.0270793846761347e-05, "loss": 2.1598, "step": 70 }, { "epoch": 1.74, "grad_norm": 21.625955834057603, "learning_rate": 8.918809815760585e-06, "loss": 2.148, "step": 75 }, { "epoch": 1.86, "grad_norm": 10.59659968673671, "learning_rate": 7.586621087002945e-06, "loss": 2.0663, "step": 80 }, { "epoch": 1.98, "grad_norm": 10.684749100687451, "learning_rate": 6.298618446600856e-06, "loss": 2.079, "step": 85 }, { "epoch": 2.09, "grad_norm": 18.49345747302597, "learning_rate": 5.078383686109927e-06, "loss": 2.0018, "step": 90 }, { "epoch": 2.21, "grad_norm": 14.92738780630304, "learning_rate": 3.948257848062351e-06, "loss": 1.9669, "step": 95 }, { "epoch": 2.33, "grad_norm": 9.352817552145108, "learning_rate": 2.9289321881345257e-06, "loss": 1.9691, "step": 100 }, { "epoch": 2.44, "grad_norm": 7.228395436940449, "learning_rate": 2.0390693429435626e-06, "loss": 1.9298, "step": 105 }, { "epoch": 2.56, "grad_norm": 7.290178837796146, "learning_rate": 1.2949616394382802e-06, "loss": 1.9519, "step": 110 }, { "epoch": 2.67, "grad_norm": 6.909179219835438, "learning_rate": 7.102328018320859e-07, "loss": 1.9449, "step": 115 }, { "epoch": 2.79, "grad_norm": 6.799236287899626, "learning_rate": 2.955885174678852e-07, "loss": 1.9203, "step": 120 }, { "epoch": 2.91, "grad_norm": 7.120567007446623, "learning_rate": 5.862042845640403e-08, "loss": 1.9294, "step": 125 }, { "epoch": 3.0, "step": 129, "total_flos": 8882126585856.0, "train_loss": 4.938086860863737, "train_runtime": 252.0914, "train_samples_per_second": 16.232, "train_steps_per_second": 0.512 } ], "logging_steps": 5, 
"max_steps": 129, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 8882126585856.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }