{ "best_metric": null, "best_model_checkpoint": null, "epoch": 35.85657370517928, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_loss": 1.3638533353805542, "eval_runtime": 2.749, "eval_samples_per_second": 365.581, "eval_steps_per_second": 22.917, "step": 251 }, { "epoch": 1.99, "grad_norm": 4.250436305999756, "learning_rate": 1.960159362549801e-05, "loss": 2.9252, "step": 500 }, { "epoch": 2.0, "eval_loss": 1.0567671060562134, "eval_runtime": 3.3198, "eval_samples_per_second": 302.733, "eval_steps_per_second": 18.977, "step": 502 }, { "epoch": 3.0, "eval_loss": 0.8845352530479431, "eval_runtime": 2.7201, "eval_samples_per_second": 369.466, "eval_steps_per_second": 23.161, "step": 753 }, { "epoch": 3.98, "grad_norm": 5.833621025085449, "learning_rate": 1.920318725099602e-05, "loss": 1.2224, "step": 1000 }, { "epoch": 4.0, "eval_loss": 0.7840549945831299, "eval_runtime": 4.2904, "eval_samples_per_second": 234.244, "eval_steps_per_second": 14.684, "step": 1004 }, { "epoch": 5.0, "eval_loss": 0.7343299388885498, "eval_runtime": 3.2339, "eval_samples_per_second": 310.774, "eval_steps_per_second": 19.481, "step": 1255 }, { "epoch": 5.98, "grad_norm": 3.5851247310638428, "learning_rate": 1.8804780876494026e-05, "loss": 0.983, "step": 1500 }, { "epoch": 6.0, "eval_loss": 0.7118874192237854, "eval_runtime": 3.2638, "eval_samples_per_second": 307.921, "eval_steps_per_second": 19.302, "step": 1506 }, { "epoch": 7.0, "eval_loss": 0.6747872829437256, "eval_runtime": 2.6898, "eval_samples_per_second": 373.639, "eval_steps_per_second": 23.422, "step": 1757 }, { "epoch": 7.97, "grad_norm": 4.352935314178467, "learning_rate": 1.8406374501992033e-05, "loss": 0.8195, "step": 2000 }, { "epoch": 8.0, "eval_loss": 0.651221513748169, "eval_runtime": 3.3304, "eval_samples_per_second": 301.764, "eval_steps_per_second": 18.917, "step": 2008 }, { "epoch": 9.0, "eval_loss": 0.6491857767105103, "eval_runtime": 2.7422, "eval_samples_per_second": 366.493, "eval_steps_per_second": 22.974, "step": 2259 }, { "epoch": 9.96, "grad_norm": 3.109755516052246, "learning_rate": 1.800796812749004e-05, "loss": 0.7231, "step": 2500 }, { "epoch": 10.0, "eval_loss": 0.6193013191223145, "eval_runtime": 4.5362, "eval_samples_per_second": 221.549, "eval_steps_per_second": 13.888, "step": 2510 }, { "epoch": 11.0, "eval_loss": 0.6184014081954956, "eval_runtime": 3.3594, "eval_samples_per_second": 299.158, "eval_steps_per_second": 18.753, "step": 2761 }, { "epoch": 11.95, "grad_norm": 5.237318992614746, "learning_rate": 1.760956175298805e-05, "loss": 0.6293, "step": 3000 }, { "epoch": 12.0, "eval_loss": 0.6006337404251099, "eval_runtime": 3.2346, "eval_samples_per_second": 310.7, "eval_steps_per_second": 19.477, "step": 3012 }, { "epoch": 13.0, "eval_loss": 0.5959585309028625, "eval_runtime": 2.6861, "eval_samples_per_second": 374.149, "eval_steps_per_second": 23.454, "step": 3263 }, { "epoch": 13.94, "grad_norm": 2.114781618118286, "learning_rate": 1.7211155378486056e-05, "loss": 0.5752, "step": 3500 }, { "epoch": 14.0, "eval_loss": 0.5836025476455688, "eval_runtime": 3.742, "eval_samples_per_second": 268.573, "eval_steps_per_second": 16.836, "step": 3514 }, { "epoch": 15.0, "eval_loss": 0.5830443501472473, "eval_runtime": 3.1989, "eval_samples_per_second": 314.17, "eval_steps_per_second": 19.694, "step": 3765 }, { "epoch": 15.94, "grad_norm": 3.505565881729126, "learning_rate": 1.6812749003984067e-05, "loss": 0.5129, "step": 4000 }, { "epoch": 16.0, "eval_loss": 0.5807380080223083, "eval_runtime": 3.1885, "eval_samples_per_second": 315.193, "eval_steps_per_second": 19.758, "step": 4016 }, { "epoch": 17.0, "eval_loss": 0.5819908976554871, "eval_runtime": 2.8978, "eval_samples_per_second": 346.815, "eval_steps_per_second": 21.741, "step": 4267 }, { "epoch": 17.93, "grad_norm": 3.355975866317749, "learning_rate": 1.6414342629482074e-05, "loss": 0.4638, "step": 4500 }, { "epoch": 18.0, "eval_loss": 0.577302098274231, "eval_runtime": 3.7339, "eval_samples_per_second": 269.154, "eval_steps_per_second": 16.872, "step": 4518 }, { "epoch": 19.0, "eval_loss": 0.5799027681350708, "eval_runtime": 3.0639, "eval_samples_per_second": 328.014, "eval_steps_per_second": 20.562, "step": 4769 }, { "epoch": 19.92, "grad_norm": 2.3375730514526367, "learning_rate": 1.601593625498008e-05, "loss": 0.4251, "step": 5000 }, { "epoch": 20.0, "eval_loss": 0.5866515040397644, "eval_runtime": 2.9346, "eval_samples_per_second": 342.461, "eval_steps_per_second": 21.468, "step": 5020 }, { "epoch": 21.0, "eval_loss": 0.5794395804405212, "eval_runtime": 2.7501, "eval_samples_per_second": 365.435, "eval_steps_per_second": 22.908, "step": 5271 }, { "epoch": 21.91, "grad_norm": 2.946040391921997, "learning_rate": 1.5617529880478087e-05, "loss": 0.3933, "step": 5500 }, { "epoch": 22.0, "eval_loss": 0.5789267420768738, "eval_runtime": 2.6945, "eval_samples_per_second": 372.989, "eval_steps_per_second": 23.381, "step": 5522 }, { "epoch": 23.0, "eval_loss": 0.5829676985740662, "eval_runtime": 2.7581, "eval_samples_per_second": 364.375, "eval_steps_per_second": 22.841, "step": 5773 }, { "epoch": 23.9, "grad_norm": 5.222957611083984, "learning_rate": 1.5219123505976096e-05, "loss": 0.3522, "step": 6000 }, { "epoch": 24.0, "eval_loss": 0.5862116813659668, "eval_runtime": 2.803, "eval_samples_per_second": 358.54, "eval_steps_per_second": 22.476, "step": 6024 }, { "epoch": 25.0, "eval_loss": 0.5760381817817688, "eval_runtime": 2.7409, "eval_samples_per_second": 366.667, "eval_steps_per_second": 22.985, "step": 6275 }, { "epoch": 25.9, "grad_norm": 2.372645616531372, "learning_rate": 1.4820717131474104e-05, "loss": 0.3406, "step": 6500 }, { "epoch": 26.0, "eval_loss": 0.5902481079101562, "eval_runtime": 2.9006, "eval_samples_per_second": 346.474, "eval_steps_per_second": 21.719, "step": 6526 }, { "epoch": 27.0, "eval_loss": 0.5866113305091858, "eval_runtime": 2.8052, "eval_samples_per_second": 358.264, "eval_steps_per_second": 22.458, "step": 6777 }, { "epoch": 27.89, "grad_norm": 3.3169634342193604, "learning_rate": 1.4422310756972113e-05, "loss": 0.3069, "step": 7000 }, { "epoch": 28.0, "eval_loss": 0.5929533839225769, "eval_runtime": 2.7082, "eval_samples_per_second": 371.096, "eval_steps_per_second": 23.263, "step": 7028 }, { "epoch": 29.0, "eval_loss": 0.5953153967857361, "eval_runtime": 2.7299, "eval_samples_per_second": 368.141, "eval_steps_per_second": 23.078, "step": 7279 }, { "epoch": 29.88, "grad_norm": 3.2775676250457764, "learning_rate": 1.4023904382470122e-05, "loss": 0.2786, "step": 7500 }, { "epoch": 30.0, "eval_loss": 0.6021310091018677, "eval_runtime": 2.7379, "eval_samples_per_second": 367.074, "eval_steps_per_second": 23.011, "step": 7530 }, { "epoch": 31.0, "eval_loss": 0.5964611172676086, "eval_runtime": 2.8422, "eval_samples_per_second": 353.605, "eval_steps_per_second": 22.166, "step": 7781 }, { "epoch": 31.87, "grad_norm": 0.9464514255523682, "learning_rate": 1.3625498007968127e-05, "loss": 0.2623, "step": 8000 }, { "epoch": 32.0, "eval_loss": 0.5960313677787781, "eval_runtime": 2.7403, "eval_samples_per_second": 366.743, "eval_steps_per_second": 22.99, "step": 8032 }, { "epoch": 33.0, "eval_loss": 0.605067789554596, "eval_runtime": 2.71, "eval_samples_per_second": 370.846, "eval_steps_per_second": 23.247, "step": 8283 }, { "epoch": 33.86, "grad_norm": 1.8620355129241943, "learning_rate": 1.3227091633466135e-05, "loss": 0.2405, "step": 8500 }, { "epoch": 34.0, "eval_loss": 0.6035953164100647, "eval_runtime": 3.5317, "eval_samples_per_second": 284.568, "eval_steps_per_second": 17.839, "step": 8534 }, { "epoch": 35.0, "eval_loss": 0.6083930134773254, "eval_runtime": 2.6851, "eval_samples_per_second": 374.284, "eval_steps_per_second": 23.463, "step": 8785 }, { "epoch": 35.86, "grad_norm": 2.5435802936553955, "learning_rate": 1.2828685258964144e-05, "loss": 0.2207, "step": 9000 } ], "logging_steps": 500, "max_steps": 25100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 2779162291851264.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }