{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991356957649092, "eval_steps": 20, "global_step": 289, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 0.3046875, "learning_rate": 2.7586206896551727e-05, "loss": 1.6691, "step": 8 }, { "epoch": 0.06, "grad_norm": 0.193359375, "learning_rate": 5.517241379310345e-05, "loss": 1.5039, "step": 16 }, { "epoch": 0.07, "eval_loss": 1.1767374277114868, "eval_runtime": 1099.3683, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 20 }, { "epoch": 0.08, "grad_norm": 0.1591796875, "learning_rate": 8.275862068965517e-05, "loss": 1.1888, "step": 24 }, { "epoch": 0.11, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.8397, "step": 32 }, { "epoch": 0.14, "grad_norm": 0.0595703125, "learning_rate": 0.0001, "loss": 0.6852, "step": 40 }, { "epoch": 0.14, "eval_loss": 0.6382727026939392, "eval_runtime": 1099.2109, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 40 }, { "epoch": 0.17, "grad_norm": 0.06005859375, "learning_rate": 0.0001, "loss": 0.6169, "step": 48 }, { "epoch": 0.19, "grad_norm": 0.064453125, "learning_rate": 0.0001, "loss": 0.6008, "step": 56 }, { "epoch": 0.21, "eval_loss": 0.5603911280632019, "eval_runtime": 1099.0172, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 60 }, { "epoch": 0.22, "grad_norm": 0.06201171875, "learning_rate": 0.0001, "loss": 0.5572, "step": 64 }, { "epoch": 0.25, "grad_norm": 0.06884765625, "learning_rate": 0.0001, "loss": 0.5507, "step": 72 }, { "epoch": 0.28, "grad_norm": 0.06396484375, "learning_rate": 0.0001, "loss": 0.5262, "step": 80 }, { "epoch": 0.28, "eval_loss": 0.5174164772033691, "eval_runtime": 1099.2276, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 80 }, { "epoch": 0.3, "grad_norm": 0.06787109375, "learning_rate": 0.0001, "loss": 0.5071, "step": 88 }, { "epoch": 0.33, "grad_norm": 0.07861328125, "learning_rate": 0.0001, "loss": 0.5059, "step": 96 }, { "epoch": 0.35, "eval_loss": 0.48593389987945557, "eval_runtime": 1099.1884, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 100 }, { "epoch": 0.36, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.4696, "step": 104 }, { "epoch": 0.39, "grad_norm": 0.078125, "learning_rate": 0.0001, "loss": 0.4786, "step": 112 }, { "epoch": 0.41, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.4769, "step": 120 }, { "epoch": 0.41, "eval_loss": 0.4559060037136078, "eval_runtime": 1098.9724, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 120 }, { "epoch": 0.44, "grad_norm": 0.1064453125, "learning_rate": 0.0001, "loss": 0.4588, "step": 128 }, { "epoch": 0.47, "grad_norm": 0.10595703125, "learning_rate": 0.0001, "loss": 0.4399, "step": 136 }, { "epoch": 0.48, "eval_loss": 0.42399317026138306, "eval_runtime": 1099.1888, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 140 }, { "epoch": 0.5, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.4681, "step": 144 }, { "epoch": 0.53, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.4085, "step": 152 }, { "epoch": 0.55, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.4356, "step": 160 }, { "epoch": 0.55, "eval_loss": 0.40195128321647644, "eval_runtime": 1099.4392, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 160 }, { "epoch": 0.58, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.4044, "step": 168 }, { "epoch": 0.61, "grad_norm": 0.11328125, "learning_rate": 0.0001, "loss": 0.4191, "step": 176 }, { "epoch": 0.62, "eval_loss": 0.3850320875644684, "eval_runtime": 1099.3747, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 180 }, { "epoch": 0.64, "grad_norm": 0.1083984375, "learning_rate": 0.0001, "loss": 0.3917, "step": 184 }, { "epoch": 0.66, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.3642, "step": 192 }, { "epoch": 0.69, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.3887, "step": 200 }, { "epoch": 0.69, "eval_loss": 0.37055206298828125, "eval_runtime": 1099.0492, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 200 }, { "epoch": 0.72, "grad_norm": 0.10546875, "learning_rate": 0.0001, "loss": 0.3842, "step": 208 }, { "epoch": 0.75, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.3826, "step": 216 }, { "epoch": 0.76, "eval_loss": 0.3577556312084198, "eval_runtime": 1099.1357, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 220 }, { "epoch": 0.77, "grad_norm": 0.1103515625, "learning_rate": 0.0001, "loss": 0.387, "step": 224 }, { "epoch": 0.8, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.3379, "step": 232 }, { "epoch": 0.83, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.3563, "step": 240 }, { "epoch": 0.83, "eval_loss": 0.3489949703216553, "eval_runtime": 1099.2538, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 240 }, { "epoch": 0.86, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.3625, "step": 248 }, { "epoch": 0.89, "grad_norm": 0.11328125, "learning_rate": 0.0001, "loss": 0.3224, "step": 256 }, { "epoch": 0.9, "eval_loss": 0.3398669362068176, "eval_runtime": 1099.0373, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 260 }, { "epoch": 0.91, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.35, "step": 264 }, { "epoch": 0.94, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.3504, "step": 272 }, { "epoch": 0.97, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.3285, "step": 280 }, { "epoch": 0.97, "eval_loss": 0.3326619565486908, "eval_runtime": 1099.1458, "eval_samples_per_second": 1.039, "eval_steps_per_second": 0.13, "step": 280 }, { "epoch": 1.0, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.3506, "step": 288 }, { "epoch": 1.0, "step": 289, "total_flos": 1.350622205624451e+18, "train_loss": 0.5342465417401601, "train_runtime": 29202.9895, "train_samples_per_second": 0.158, "train_steps_per_second": 0.01 } ], "logging_steps": 8, "max_steps": 289, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.350622205624451e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }