{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.004164513142385448, "eval_steps": 2000, "global_step": 5600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.451894760131836, "learning_rate": 1.9999999959757473e-05, "loss": 1.835, "step": 200 }, { "epoch": 0.0, "grad_norm": 2.7373883724212646, "learning_rate": 1.9999999832252933e-05, "loss": 1.6278, "step": 400 }, { "epoch": 0.0, "grad_norm": 3.7490854263305664, "learning_rate": 1.9999999617416517e-05, "loss": 1.6314, "step": 600 }, { "epoch": 0.0, "grad_norm": 10.143038749694824, "learning_rate": 1.999999931524823e-05, "loss": 1.5416, "step": 800 }, { "epoch": 0.0, "grad_norm": 2.783194065093994, "learning_rate": 1.999999892574807e-05, "loss": 1.5775, "step": 1000 }, { "epoch": 0.0, "grad_norm": 2.1446919441223145, "learning_rate": 1.9999998448916044e-05, "loss": 1.6922, "step": 1200 }, { "epoch": 0.0, "grad_norm": 3.6168997287750244, "learning_rate": 1.9999997884752155e-05, "loss": 1.6211, "step": 1400 }, { "epoch": 0.0, "grad_norm": 4.068266868591309, "learning_rate": 1.9999997233256404e-05, "loss": 1.6001, "step": 1600 }, { "epoch": 0.0, "grad_norm": 3.046320676803589, "learning_rate": 1.9999996494428805e-05, "loss": 1.5682, "step": 1800 }, { "epoch": 0.0, "grad_norm": 4.574249267578125, "learning_rate": 1.9999995668269356e-05, "loss": 1.5658, "step": 2000 }, { "epoch": 0.0, "grad_norm": 4.401742935180664, "learning_rate": 1.999999475956276e-05, "loss": 1.6152, "step": 2200 }, { "epoch": 0.0, "grad_norm": 4.141517162322998, "learning_rate": 1.9999993759176304e-05, "loss": 1.564, "step": 2400 }, { "epoch": 0.0, "grad_norm": 1.8213422298431396, "learning_rate": 1.9999992671458023e-05, "loss": 1.5586, "step": 2600 }, { "epoch": 0.0, "grad_norm": 2.3063032627105713, "learning_rate": 1.999999149640793e-05, "loss": 1.6118, "step": 2800 }, { "epoch": 0.0, "grad_norm": 3.5887880325317383, "learning_rate": 1.9999990234026036e-05, "loss": 1.586, "step": 3000 }, { "epoch": 0.0, "grad_norm": 2.8140385150909424, "learning_rate": 1.9999988884312347e-05, "loss": 1.6221, "step": 3200 }, { "epoch": 0.0, "grad_norm": 2.5657193660736084, "learning_rate": 1.9999987447266877e-05, "loss": 1.5533, "step": 3400 }, { "epoch": 0.0, "grad_norm": 2.193918466567993, "learning_rate": 1.9999985922889644e-05, "loss": 1.5725, "step": 3600 }, { "epoch": 0.0, "grad_norm": 2.9052414894104004, "learning_rate": 1.9999984311180655e-05, "loss": 1.5804, "step": 3800 }, { "epoch": 0.0, "grad_norm": 5.269617557525635, "learning_rate": 1.999998261213993e-05, "loss": 1.6025, "step": 4000 }, { "epoch": 0.0, "grad_norm": 2.5482230186462402, "learning_rate": 1.9999980825767474e-05, "loss": 1.5963, "step": 4200 }, { "epoch": 0.0, "grad_norm": 3.360860824584961, "learning_rate": 1.999997896164907e-05, "loss": 1.5837, "step": 4400 }, { "epoch": 0.0, "grad_norm": 3.9968528747558594, "learning_rate": 1.9999977001049872e-05, "loss": 1.5586, "step": 4600 }, { "epoch": 0.0, "grad_norm": 2.3270204067230225, "learning_rate": 1.9999974953119e-05, "loss": 1.597, "step": 4800 }, { "epoch": 0.0, "grad_norm": 2.4163918495178223, "learning_rate": 1.999997281785647e-05, "loss": 1.5405, "step": 5000 }, { "epoch": 0.0, "grad_norm": 2.7667906284332275, "learning_rate": 1.9999970595262297e-05, "loss": 1.5714, "step": 5200 }, { "epoch": 0.0, "grad_norm": 3.6416239738464355, "learning_rate": 1.9999968297103373e-05, "loss": 1.5909, "step": 5400 }, { "epoch": 0.0, "grad_norm": 4.743027210235596, "learning_rate": 1.999996590028264e-05, "loss": 1.5651, "step": 5600 } ], "logging_steps": 200, "max_steps": 6723475, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "total_flos": 7.351449119465472e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }