{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2206, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 0.7927429676055908, "learning_rate": 4.773345421577516e-05, "loss": 0.6604, "step": 100 }, { "epoch": 0.09, "grad_norm": 0.7291208505630493, "learning_rate": 4.546690843155032e-05, "loss": 0.5917, "step": 200 }, { "epoch": 0.14, "grad_norm": 0.7045320868492126, "learning_rate": 4.320036264732548e-05, "loss": 0.5638, "step": 300 }, { "epoch": 0.18, "grad_norm": 0.7321786880493164, "learning_rate": 4.095648232094288e-05, "loss": 0.5409, "step": 400 }, { "epoch": 0.23, "grad_norm": 0.7257340550422668, "learning_rate": 3.868993653671804e-05, "loss": 0.5217, "step": 500 }, { "epoch": 0.27, "grad_norm": 0.7644676566123962, "learning_rate": 3.64233907524932e-05, "loss": 0.5, "step": 600 }, { "epoch": 0.32, "grad_norm": 0.7164526581764221, "learning_rate": 3.415684496826836e-05, "loss": 0.4851, "step": 700 }, { "epoch": 0.36, "grad_norm": 0.7228084802627563, "learning_rate": 3.189029918404352e-05, "loss": 0.4703, "step": 800 }, { "epoch": 0.41, "grad_norm": 0.7583528161048889, "learning_rate": 2.9623753399818678e-05, "loss": 0.456, "step": 900 }, { "epoch": 0.45, "grad_norm": 0.7674237489700317, "learning_rate": 2.7357207615593838e-05, "loss": 0.447, "step": 1000 }, { "epoch": 0.5, "grad_norm": 0.7338166236877441, "learning_rate": 2.5090661831368994e-05, "loss": 0.4357, "step": 1100 }, { "epoch": 0.54, "grad_norm": 0.7415379285812378, "learning_rate": 2.2824116047144154e-05, "loss": 0.4275, "step": 1200 }, { "epoch": 0.59, "grad_norm": 0.7678395509719849, "learning_rate": 2.0557570262919314e-05, "loss": 0.4136, "step": 1300 }, { "epoch": 0.63, "grad_norm": 0.7428087592124939, "learning_rate": 1.829102447869447e-05, "loss": 0.4104, "step": 1400 }, { "epoch": 0.68, "grad_norm": 0.7453845143318176, "learning_rate": 1.602447869446963e-05, "loss": 0.3989, "step": 1500 }, { "epoch": 0.73, "grad_norm": 0.6997073888778687, "learning_rate": 1.3757932910244787e-05, "loss": 0.3961, "step": 1600 }, { "epoch": 0.77, "grad_norm": 0.7334680557250977, "learning_rate": 1.1491387126019947e-05, "loss": 0.3879, "step": 1700 }, { "epoch": 0.82, "grad_norm": 0.7411799430847168, "learning_rate": 9.224841341795105e-06, "loss": 0.3799, "step": 1800 }, { "epoch": 0.86, "grad_norm": 0.818085253238678, "learning_rate": 6.958295557570263e-06, "loss": 0.3782, "step": 1900 }, { "epoch": 0.91, "grad_norm": 0.7673262357711792, "learning_rate": 4.71441523118767e-06, "loss": 0.3711, "step": 2000 }, { "epoch": 0.95, "grad_norm": 0.7118935585021973, "learning_rate": 2.447869446962829e-06, "loss": 0.3629, "step": 2100 }, { "epoch": 1.0, "grad_norm": 0.8014984726905823, "learning_rate": 1.813236627379873e-07, "loss": 0.3695, "step": 2200 } ], "logging_steps": 100, "max_steps": 2206, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2206, "total_flos": 6.205756045755679e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }