tamilnlpSLIIT's picture
Training in progress, step 500
9bfc660 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 35.85657370517928,
"eval_steps": 500,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_loss": 1.3638533353805542,
"eval_runtime": 2.749,
"eval_samples_per_second": 365.581,
"eval_steps_per_second": 22.917,
"step": 251
},
{
"epoch": 1.99,
"grad_norm": 4.250436305999756,
"learning_rate": 1.960159362549801e-05,
"loss": 2.9252,
"step": 500
},
{
"epoch": 2.0,
"eval_loss": 1.0567671060562134,
"eval_runtime": 3.3198,
"eval_samples_per_second": 302.733,
"eval_steps_per_second": 18.977,
"step": 502
},
{
"epoch": 3.0,
"eval_loss": 0.8845352530479431,
"eval_runtime": 2.7201,
"eval_samples_per_second": 369.466,
"eval_steps_per_second": 23.161,
"step": 753
},
{
"epoch": 3.98,
"grad_norm": 5.833621025085449,
"learning_rate": 1.920318725099602e-05,
"loss": 1.2224,
"step": 1000
},
{
"epoch": 4.0,
"eval_loss": 0.7840549945831299,
"eval_runtime": 4.2904,
"eval_samples_per_second": 234.244,
"eval_steps_per_second": 14.684,
"step": 1004
},
{
"epoch": 5.0,
"eval_loss": 0.7343299388885498,
"eval_runtime": 3.2339,
"eval_samples_per_second": 310.774,
"eval_steps_per_second": 19.481,
"step": 1255
},
{
"epoch": 5.98,
"grad_norm": 3.5851247310638428,
"learning_rate": 1.8804780876494026e-05,
"loss": 0.983,
"step": 1500
},
{
"epoch": 6.0,
"eval_loss": 0.7118874192237854,
"eval_runtime": 3.2638,
"eval_samples_per_second": 307.921,
"eval_steps_per_second": 19.302,
"step": 1506
},
{
"epoch": 7.0,
"eval_loss": 0.6747872829437256,
"eval_runtime": 2.6898,
"eval_samples_per_second": 373.639,
"eval_steps_per_second": 23.422,
"step": 1757
},
{
"epoch": 7.97,
"grad_norm": 4.352935314178467,
"learning_rate": 1.8406374501992033e-05,
"loss": 0.8195,
"step": 2000
},
{
"epoch": 8.0,
"eval_loss": 0.651221513748169,
"eval_runtime": 3.3304,
"eval_samples_per_second": 301.764,
"eval_steps_per_second": 18.917,
"step": 2008
},
{
"epoch": 9.0,
"eval_loss": 0.6491857767105103,
"eval_runtime": 2.7422,
"eval_samples_per_second": 366.493,
"eval_steps_per_second": 22.974,
"step": 2259
},
{
"epoch": 9.96,
"grad_norm": 3.109755516052246,
"learning_rate": 1.800796812749004e-05,
"loss": 0.7231,
"step": 2500
},
{
"epoch": 10.0,
"eval_loss": 0.6193013191223145,
"eval_runtime": 4.5362,
"eval_samples_per_second": 221.549,
"eval_steps_per_second": 13.888,
"step": 2510
},
{
"epoch": 11.0,
"eval_loss": 0.6184014081954956,
"eval_runtime": 3.3594,
"eval_samples_per_second": 299.158,
"eval_steps_per_second": 18.753,
"step": 2761
},
{
"epoch": 11.95,
"grad_norm": 5.237318992614746,
"learning_rate": 1.760956175298805e-05,
"loss": 0.6293,
"step": 3000
},
{
"epoch": 12.0,
"eval_loss": 0.6006337404251099,
"eval_runtime": 3.2346,
"eval_samples_per_second": 310.7,
"eval_steps_per_second": 19.477,
"step": 3012
},
{
"epoch": 13.0,
"eval_loss": 0.5959585309028625,
"eval_runtime": 2.6861,
"eval_samples_per_second": 374.149,
"eval_steps_per_second": 23.454,
"step": 3263
},
{
"epoch": 13.94,
"grad_norm": 2.114781618118286,
"learning_rate": 1.7211155378486056e-05,
"loss": 0.5752,
"step": 3500
},
{
"epoch": 14.0,
"eval_loss": 0.5836025476455688,
"eval_runtime": 3.742,
"eval_samples_per_second": 268.573,
"eval_steps_per_second": 16.836,
"step": 3514
},
{
"epoch": 15.0,
"eval_loss": 0.5830443501472473,
"eval_runtime": 3.1989,
"eval_samples_per_second": 314.17,
"eval_steps_per_second": 19.694,
"step": 3765
},
{
"epoch": 15.94,
"grad_norm": 3.505565881729126,
"learning_rate": 1.6812749003984067e-05,
"loss": 0.5129,
"step": 4000
},
{
"epoch": 16.0,
"eval_loss": 0.5807380080223083,
"eval_runtime": 3.1885,
"eval_samples_per_second": 315.193,
"eval_steps_per_second": 19.758,
"step": 4016
},
{
"epoch": 17.0,
"eval_loss": 0.5819908976554871,
"eval_runtime": 2.8978,
"eval_samples_per_second": 346.815,
"eval_steps_per_second": 21.741,
"step": 4267
},
{
"epoch": 17.93,
"grad_norm": 3.355975866317749,
"learning_rate": 1.6414342629482074e-05,
"loss": 0.4638,
"step": 4500
},
{
"epoch": 18.0,
"eval_loss": 0.577302098274231,
"eval_runtime": 3.7339,
"eval_samples_per_second": 269.154,
"eval_steps_per_second": 16.872,
"step": 4518
},
{
"epoch": 19.0,
"eval_loss": 0.5799027681350708,
"eval_runtime": 3.0639,
"eval_samples_per_second": 328.014,
"eval_steps_per_second": 20.562,
"step": 4769
},
{
"epoch": 19.92,
"grad_norm": 2.3375730514526367,
"learning_rate": 1.601593625498008e-05,
"loss": 0.4251,
"step": 5000
},
{
"epoch": 20.0,
"eval_loss": 0.5866515040397644,
"eval_runtime": 2.9346,
"eval_samples_per_second": 342.461,
"eval_steps_per_second": 21.468,
"step": 5020
},
{
"epoch": 21.0,
"eval_loss": 0.5794395804405212,
"eval_runtime": 2.7501,
"eval_samples_per_second": 365.435,
"eval_steps_per_second": 22.908,
"step": 5271
},
{
"epoch": 21.91,
"grad_norm": 2.946040391921997,
"learning_rate": 1.5617529880478087e-05,
"loss": 0.3933,
"step": 5500
},
{
"epoch": 22.0,
"eval_loss": 0.5789267420768738,
"eval_runtime": 2.6945,
"eval_samples_per_second": 372.989,
"eval_steps_per_second": 23.381,
"step": 5522
},
{
"epoch": 23.0,
"eval_loss": 0.5829676985740662,
"eval_runtime": 2.7581,
"eval_samples_per_second": 364.375,
"eval_steps_per_second": 22.841,
"step": 5773
},
{
"epoch": 23.9,
"grad_norm": 5.222957611083984,
"learning_rate": 1.5219123505976096e-05,
"loss": 0.3522,
"step": 6000
},
{
"epoch": 24.0,
"eval_loss": 0.5862116813659668,
"eval_runtime": 2.803,
"eval_samples_per_second": 358.54,
"eval_steps_per_second": 22.476,
"step": 6024
},
{
"epoch": 25.0,
"eval_loss": 0.5760381817817688,
"eval_runtime": 2.7409,
"eval_samples_per_second": 366.667,
"eval_steps_per_second": 22.985,
"step": 6275
},
{
"epoch": 25.9,
"grad_norm": 2.372645616531372,
"learning_rate": 1.4820717131474104e-05,
"loss": 0.3406,
"step": 6500
},
{
"epoch": 26.0,
"eval_loss": 0.5902481079101562,
"eval_runtime": 2.9006,
"eval_samples_per_second": 346.474,
"eval_steps_per_second": 21.719,
"step": 6526
},
{
"epoch": 27.0,
"eval_loss": 0.5866113305091858,
"eval_runtime": 2.8052,
"eval_samples_per_second": 358.264,
"eval_steps_per_second": 22.458,
"step": 6777
},
{
"epoch": 27.89,
"grad_norm": 3.3169634342193604,
"learning_rate": 1.4422310756972113e-05,
"loss": 0.3069,
"step": 7000
},
{
"epoch": 28.0,
"eval_loss": 0.5929533839225769,
"eval_runtime": 2.7082,
"eval_samples_per_second": 371.096,
"eval_steps_per_second": 23.263,
"step": 7028
},
{
"epoch": 29.0,
"eval_loss": 0.5953153967857361,
"eval_runtime": 2.7299,
"eval_samples_per_second": 368.141,
"eval_steps_per_second": 23.078,
"step": 7279
},
{
"epoch": 29.88,
"grad_norm": 3.2775676250457764,
"learning_rate": 1.4023904382470122e-05,
"loss": 0.2786,
"step": 7500
},
{
"epoch": 30.0,
"eval_loss": 0.6021310091018677,
"eval_runtime": 2.7379,
"eval_samples_per_second": 367.074,
"eval_steps_per_second": 23.011,
"step": 7530
},
{
"epoch": 31.0,
"eval_loss": 0.5964611172676086,
"eval_runtime": 2.8422,
"eval_samples_per_second": 353.605,
"eval_steps_per_second": 22.166,
"step": 7781
},
{
"epoch": 31.87,
"grad_norm": 0.9464514255523682,
"learning_rate": 1.3625498007968127e-05,
"loss": 0.2623,
"step": 8000
},
{
"epoch": 32.0,
"eval_loss": 0.5960313677787781,
"eval_runtime": 2.7403,
"eval_samples_per_second": 366.743,
"eval_steps_per_second": 22.99,
"step": 8032
},
{
"epoch": 33.0,
"eval_loss": 0.605067789554596,
"eval_runtime": 2.71,
"eval_samples_per_second": 370.846,
"eval_steps_per_second": 23.247,
"step": 8283
},
{
"epoch": 33.86,
"grad_norm": 1.8620355129241943,
"learning_rate": 1.3227091633466135e-05,
"loss": 0.2405,
"step": 8500
},
{
"epoch": 34.0,
"eval_loss": 0.6035953164100647,
"eval_runtime": 3.5317,
"eval_samples_per_second": 284.568,
"eval_steps_per_second": 17.839,
"step": 8534
},
{
"epoch": 35.0,
"eval_loss": 0.6083930134773254,
"eval_runtime": 2.6851,
"eval_samples_per_second": 374.284,
"eval_steps_per_second": 23.463,
"step": 8785
},
{
"epoch": 35.86,
"grad_norm": 2.5435802936553955,
"learning_rate": 1.2828685258964144e-05,
"loss": 0.2207,
"step": 9000
}
],
"logging_steps": 500,
"max_steps": 25100,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"total_flos": 2779162291851264.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}