|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 35.85657370517928, |
|
"eval_steps": 500, |
|
"global_step": 9000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3638533353805542, |
|
"eval_runtime": 2.749, |
|
"eval_samples_per_second": 365.581, |
|
"eval_steps_per_second": 22.917, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 4.250436305999756, |
|
"learning_rate": 1.960159362549801e-05, |
|
"loss": 2.9252, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.0567671060562134, |
|
"eval_runtime": 3.3198, |
|
"eval_samples_per_second": 302.733, |
|
"eval_steps_per_second": 18.977, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.8845352530479431, |
|
"eval_runtime": 2.7201, |
|
"eval_samples_per_second": 369.466, |
|
"eval_steps_per_second": 23.161, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 5.833621025085449, |
|
"learning_rate": 1.920318725099602e-05, |
|
"loss": 1.2224, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.7840549945831299, |
|
"eval_runtime": 4.2904, |
|
"eval_samples_per_second": 234.244, |
|
"eval_steps_per_second": 14.684, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.7343299388885498, |
|
"eval_runtime": 3.2339, |
|
"eval_samples_per_second": 310.774, |
|
"eval_steps_per_second": 19.481, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 3.5851247310638428, |
|
"learning_rate": 1.8804780876494026e-05, |
|
"loss": 0.983, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.7118874192237854, |
|
"eval_runtime": 3.2638, |
|
"eval_samples_per_second": 307.921, |
|
"eval_steps_per_second": 19.302, |
|
"step": 1506 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.6747872829437256, |
|
"eval_runtime": 2.6898, |
|
"eval_samples_per_second": 373.639, |
|
"eval_steps_per_second": 23.422, |
|
"step": 1757 |
|
}, |
|
{ |
|
"epoch": 7.97, |
|
"grad_norm": 4.352935314178467, |
|
"learning_rate": 1.8406374501992033e-05, |
|
"loss": 0.8195, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.651221513748169, |
|
"eval_runtime": 3.3304, |
|
"eval_samples_per_second": 301.764, |
|
"eval_steps_per_second": 18.917, |
|
"step": 2008 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.6491857767105103, |
|
"eval_runtime": 2.7422, |
|
"eval_samples_per_second": 366.493, |
|
"eval_steps_per_second": 22.974, |
|
"step": 2259 |
|
}, |
|
{ |
|
"epoch": 9.96, |
|
"grad_norm": 3.109755516052246, |
|
"learning_rate": 1.800796812749004e-05, |
|
"loss": 0.7231, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.6193013191223145, |
|
"eval_runtime": 4.5362, |
|
"eval_samples_per_second": 221.549, |
|
"eval_steps_per_second": 13.888, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 0.6184014081954956, |
|
"eval_runtime": 3.3594, |
|
"eval_samples_per_second": 299.158, |
|
"eval_steps_per_second": 18.753, |
|
"step": 2761 |
|
}, |
|
{ |
|
"epoch": 11.95, |
|
"grad_norm": 5.237318992614746, |
|
"learning_rate": 1.760956175298805e-05, |
|
"loss": 0.6293, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 0.6006337404251099, |
|
"eval_runtime": 3.2346, |
|
"eval_samples_per_second": 310.7, |
|
"eval_steps_per_second": 19.477, |
|
"step": 3012 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 0.5959585309028625, |
|
"eval_runtime": 2.6861, |
|
"eval_samples_per_second": 374.149, |
|
"eval_steps_per_second": 23.454, |
|
"step": 3263 |
|
}, |
|
{ |
|
"epoch": 13.94, |
|
"grad_norm": 2.114781618118286, |
|
"learning_rate": 1.7211155378486056e-05, |
|
"loss": 0.5752, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 0.5836025476455688, |
|
"eval_runtime": 3.742, |
|
"eval_samples_per_second": 268.573, |
|
"eval_steps_per_second": 16.836, |
|
"step": 3514 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 0.5830443501472473, |
|
"eval_runtime": 3.1989, |
|
"eval_samples_per_second": 314.17, |
|
"eval_steps_per_second": 19.694, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 15.94, |
|
"grad_norm": 3.505565881729126, |
|
"learning_rate": 1.6812749003984067e-05, |
|
"loss": 0.5129, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 0.5807380080223083, |
|
"eval_runtime": 3.1885, |
|
"eval_samples_per_second": 315.193, |
|
"eval_steps_per_second": 19.758, |
|
"step": 4016 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 0.5819908976554871, |
|
"eval_runtime": 2.8978, |
|
"eval_samples_per_second": 346.815, |
|
"eval_steps_per_second": 21.741, |
|
"step": 4267 |
|
}, |
|
{ |
|
"epoch": 17.93, |
|
"grad_norm": 3.355975866317749, |
|
"learning_rate": 1.6414342629482074e-05, |
|
"loss": 0.4638, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 0.577302098274231, |
|
"eval_runtime": 3.7339, |
|
"eval_samples_per_second": 269.154, |
|
"eval_steps_per_second": 16.872, |
|
"step": 4518 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 0.5799027681350708, |
|
"eval_runtime": 3.0639, |
|
"eval_samples_per_second": 328.014, |
|
"eval_steps_per_second": 20.562, |
|
"step": 4769 |
|
}, |
|
{ |
|
"epoch": 19.92, |
|
"grad_norm": 2.3375730514526367, |
|
"learning_rate": 1.601593625498008e-05, |
|
"loss": 0.4251, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 0.5866515040397644, |
|
"eval_runtime": 2.9346, |
|
"eval_samples_per_second": 342.461, |
|
"eval_steps_per_second": 21.468, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 0.5794395804405212, |
|
"eval_runtime": 2.7501, |
|
"eval_samples_per_second": 365.435, |
|
"eval_steps_per_second": 22.908, |
|
"step": 5271 |
|
}, |
|
{ |
|
"epoch": 21.91, |
|
"grad_norm": 2.946040391921997, |
|
"learning_rate": 1.5617529880478087e-05, |
|
"loss": 0.3933, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 0.5789267420768738, |
|
"eval_runtime": 2.6945, |
|
"eval_samples_per_second": 372.989, |
|
"eval_steps_per_second": 23.381, |
|
"step": 5522 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 0.5829676985740662, |
|
"eval_runtime": 2.7581, |
|
"eval_samples_per_second": 364.375, |
|
"eval_steps_per_second": 22.841, |
|
"step": 5773 |
|
}, |
|
{ |
|
"epoch": 23.9, |
|
"grad_norm": 5.222957611083984, |
|
"learning_rate": 1.5219123505976096e-05, |
|
"loss": 0.3522, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.5862116813659668, |
|
"eval_runtime": 2.803, |
|
"eval_samples_per_second": 358.54, |
|
"eval_steps_per_second": 22.476, |
|
"step": 6024 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 0.5760381817817688, |
|
"eval_runtime": 2.7409, |
|
"eval_samples_per_second": 366.667, |
|
"eval_steps_per_second": 22.985, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 25.9, |
|
"grad_norm": 2.372645616531372, |
|
"learning_rate": 1.4820717131474104e-05, |
|
"loss": 0.3406, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 0.5902481079101562, |
|
"eval_runtime": 2.9006, |
|
"eval_samples_per_second": 346.474, |
|
"eval_steps_per_second": 21.719, |
|
"step": 6526 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 0.5866113305091858, |
|
"eval_runtime": 2.8052, |
|
"eval_samples_per_second": 358.264, |
|
"eval_steps_per_second": 22.458, |
|
"step": 6777 |
|
}, |
|
{ |
|
"epoch": 27.89, |
|
"grad_norm": 3.3169634342193604, |
|
"learning_rate": 1.4422310756972113e-05, |
|
"loss": 0.3069, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 0.5929533839225769, |
|
"eval_runtime": 2.7082, |
|
"eval_samples_per_second": 371.096, |
|
"eval_steps_per_second": 23.263, |
|
"step": 7028 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 0.5953153967857361, |
|
"eval_runtime": 2.7299, |
|
"eval_samples_per_second": 368.141, |
|
"eval_steps_per_second": 23.078, |
|
"step": 7279 |
|
}, |
|
{ |
|
"epoch": 29.88, |
|
"grad_norm": 3.2775676250457764, |
|
"learning_rate": 1.4023904382470122e-05, |
|
"loss": 0.2786, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.6021310091018677, |
|
"eval_runtime": 2.7379, |
|
"eval_samples_per_second": 367.074, |
|
"eval_steps_per_second": 23.011, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 0.5964611172676086, |
|
"eval_runtime": 2.8422, |
|
"eval_samples_per_second": 353.605, |
|
"eval_steps_per_second": 22.166, |
|
"step": 7781 |
|
}, |
|
{ |
|
"epoch": 31.87, |
|
"grad_norm": 0.9464514255523682, |
|
"learning_rate": 1.3625498007968127e-05, |
|
"loss": 0.2623, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 0.5960313677787781, |
|
"eval_runtime": 2.7403, |
|
"eval_samples_per_second": 366.743, |
|
"eval_steps_per_second": 22.99, |
|
"step": 8032 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 0.605067789554596, |
|
"eval_runtime": 2.71, |
|
"eval_samples_per_second": 370.846, |
|
"eval_steps_per_second": 23.247, |
|
"step": 8283 |
|
}, |
|
{ |
|
"epoch": 33.86, |
|
"grad_norm": 1.8620355129241943, |
|
"learning_rate": 1.3227091633466135e-05, |
|
"loss": 0.2405, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 0.6035953164100647, |
|
"eval_runtime": 3.5317, |
|
"eval_samples_per_second": 284.568, |
|
"eval_steps_per_second": 17.839, |
|
"step": 8534 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 0.6083930134773254, |
|
"eval_runtime": 2.6851, |
|
"eval_samples_per_second": 374.284, |
|
"eval_steps_per_second": 23.463, |
|
"step": 8785 |
|
}, |
|
{ |
|
"epoch": 35.86, |
|
"grad_norm": 2.5435802936553955, |
|
"learning_rate": 1.2828685258964144e-05, |
|
"loss": 0.2207, |
|
"step": 9000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 25100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"total_flos": 2779162291851264.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|