{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9973799126637557,
"eval_steps": 500,
"global_step": 429,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06986899563318777,
"grad_norm": 5.182594166300789,
"learning_rate": 5e-06,
"loss": 0.6932,
"step": 10
},
{
"epoch": 0.13973799126637554,
"grad_norm": 0.8105274865614129,
"learning_rate": 5e-06,
"loss": 0.6288,
"step": 20
},
{
"epoch": 0.2096069868995633,
"grad_norm": 0.8904752466777895,
"learning_rate": 5e-06,
"loss": 0.6036,
"step": 30
},
{
"epoch": 0.2794759825327511,
"grad_norm": 0.6207860876300546,
"learning_rate": 5e-06,
"loss": 0.5902,
"step": 40
},
{
"epoch": 0.34934497816593885,
"grad_norm": 0.7152500537832366,
"learning_rate": 5e-06,
"loss": 0.5769,
"step": 50
},
{
"epoch": 0.4192139737991266,
"grad_norm": 0.7013464450514418,
"learning_rate": 5e-06,
"loss": 0.5766,
"step": 60
},
{
"epoch": 0.4890829694323144,
"grad_norm": 0.6585965690127796,
"learning_rate": 5e-06,
"loss": 0.5608,
"step": 70
},
{
"epoch": 0.5589519650655022,
"grad_norm": 0.5056238657103318,
"learning_rate": 5e-06,
"loss": 0.5628,
"step": 80
},
{
"epoch": 0.62882096069869,
"grad_norm": 0.5015857792164096,
"learning_rate": 5e-06,
"loss": 0.5568,
"step": 90
},
{
"epoch": 0.6986899563318777,
"grad_norm": 0.5250734927951002,
"learning_rate": 5e-06,
"loss": 0.5527,
"step": 100
},
{
"epoch": 0.7685589519650655,
"grad_norm": 0.5330395330357691,
"learning_rate": 5e-06,
"loss": 0.5538,
"step": 110
},
{
"epoch": 0.8384279475982532,
"grad_norm": 0.5383302692811954,
"learning_rate": 5e-06,
"loss": 0.5421,
"step": 120
},
{
"epoch": 0.9082969432314411,
"grad_norm": 0.4580590442515656,
"learning_rate": 5e-06,
"loss": 0.5438,
"step": 130
},
{
"epoch": 0.9781659388646288,
"grad_norm": 0.5111086405959315,
"learning_rate": 5e-06,
"loss": 0.547,
"step": 140
},
{
"epoch": 0.9991266375545852,
"eval_loss": 0.5411319136619568,
"eval_runtime": 97.15,
"eval_samples_per_second": 39.702,
"eval_steps_per_second": 0.628,
"step": 143
},
{
"epoch": 1.0480349344978166,
"grad_norm": 0.5986185773031188,
"learning_rate": 5e-06,
"loss": 0.561,
"step": 150
},
{
"epoch": 1.1179039301310043,
"grad_norm": 0.6935147463024353,
"learning_rate": 5e-06,
"loss": 0.4992,
"step": 160
},
{
"epoch": 1.1877729257641922,
"grad_norm": 0.828701854577101,
"learning_rate": 5e-06,
"loss": 0.5037,
"step": 170
},
{
"epoch": 1.25764192139738,
"grad_norm": 0.47892479220169987,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 180
},
{
"epoch": 1.3275109170305677,
"grad_norm": 0.4854767069935743,
"learning_rate": 5e-06,
"loss": 0.4972,
"step": 190
},
{
"epoch": 1.3973799126637554,
"grad_norm": 0.4636030652817092,
"learning_rate": 5e-06,
"loss": 0.4939,
"step": 200
},
{
"epoch": 1.467248908296943,
"grad_norm": 0.46460367066805386,
"learning_rate": 5e-06,
"loss": 0.497,
"step": 210
},
{
"epoch": 1.537117903930131,
"grad_norm": 0.5541857158168692,
"learning_rate": 5e-06,
"loss": 0.4967,
"step": 220
},
{
"epoch": 1.6069868995633187,
"grad_norm": 0.43929258201041527,
"learning_rate": 5e-06,
"loss": 0.4972,
"step": 230
},
{
"epoch": 1.6768558951965065,
"grad_norm": 0.5673809631668943,
"learning_rate": 5e-06,
"loss": 0.5003,
"step": 240
},
{
"epoch": 1.7467248908296944,
"grad_norm": 0.5312435781280381,
"learning_rate": 5e-06,
"loss": 0.5029,
"step": 250
},
{
"epoch": 1.8165938864628821,
"grad_norm": 0.42426976715635795,
"learning_rate": 5e-06,
"loss": 0.4881,
"step": 260
},
{
"epoch": 1.8864628820960698,
"grad_norm": 0.4900651685024478,
"learning_rate": 5e-06,
"loss": 0.4914,
"step": 270
},
{
"epoch": 1.9563318777292578,
"grad_norm": 0.5082119447902608,
"learning_rate": 5e-06,
"loss": 0.4989,
"step": 280
},
{
"epoch": 1.9982532751091702,
"eval_loss": 0.5304298996925354,
"eval_runtime": 100.0792,
"eval_samples_per_second": 38.539,
"eval_steps_per_second": 0.61,
"step": 286
},
{
"epoch": 2.0262008733624453,
"grad_norm": 0.5750485199278518,
"learning_rate": 5e-06,
"loss": 0.5172,
"step": 290
},
{
"epoch": 2.096069868995633,
"grad_norm": 0.5082128041211101,
"learning_rate": 5e-06,
"loss": 0.4502,
"step": 300
},
{
"epoch": 2.165938864628821,
"grad_norm": 0.45659427403567093,
"learning_rate": 5e-06,
"loss": 0.4449,
"step": 310
},
{
"epoch": 2.2358078602620086,
"grad_norm": 0.5382190170872588,
"learning_rate": 5e-06,
"loss": 0.4475,
"step": 320
},
{
"epoch": 2.3056768558951966,
"grad_norm": 0.5824902306199307,
"learning_rate": 5e-06,
"loss": 0.4511,
"step": 330
},
{
"epoch": 2.3755458515283845,
"grad_norm": 0.49390187220978227,
"learning_rate": 5e-06,
"loss": 0.4527,
"step": 340
},
{
"epoch": 2.445414847161572,
"grad_norm": 0.5075624377832567,
"learning_rate": 5e-06,
"loss": 0.4544,
"step": 350
},
{
"epoch": 2.51528384279476,
"grad_norm": 0.49339275826514356,
"learning_rate": 5e-06,
"loss": 0.4483,
"step": 360
},
{
"epoch": 2.5851528384279474,
"grad_norm": 0.46662802106266,
"learning_rate": 5e-06,
"loss": 0.454,
"step": 370
},
{
"epoch": 2.6550218340611353,
"grad_norm": 0.55123614316667,
"learning_rate": 5e-06,
"loss": 0.4531,
"step": 380
},
{
"epoch": 2.7248908296943233,
"grad_norm": 0.5404203217788516,
"learning_rate": 5e-06,
"loss": 0.4479,
"step": 390
},
{
"epoch": 2.7947598253275108,
"grad_norm": 0.45881743373968936,
"learning_rate": 5e-06,
"loss": 0.4518,
"step": 400
},
{
"epoch": 2.8646288209606987,
"grad_norm": 0.44281752763961424,
"learning_rate": 5e-06,
"loss": 0.4532,
"step": 410
},
{
"epoch": 2.934497816593886,
"grad_norm": 0.48358299433650903,
"learning_rate": 5e-06,
"loss": 0.4588,
"step": 420
},
{
"epoch": 2.9973799126637557,
"eval_loss": 0.5314457416534424,
"eval_runtime": 97.6794,
"eval_samples_per_second": 39.486,
"eval_steps_per_second": 0.624,
"step": 429
},
{
"epoch": 2.9973799126637557,
"step": 429,
"total_flos": 718381598638080.0,
"train_loss": 0.5104286659569729,
"train_runtime": 14367.279,
"train_samples_per_second": 15.301,
"train_steps_per_second": 0.03
}
],
"logging_steps": 10,
"max_steps": 429,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 718381598638080.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}