multinli_xlmr_lr5e-06_checkpoint / trainer_state.json
HgThinker's picture
Upload 4 files
fce4797 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5347539113428944,
"eval_steps": 100,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"grad_norm": 6.05555534362793,
"learning_rate": 4.9915117562176395e-06,
"loss": 1.1007,
"step": 100
},
{
"epoch": 0.03,
"eval_loss": 1.03852117061615,
"eval_runtime": 454.1166,
"eval_samples_per_second": 21.651,
"eval_steps_per_second": 1.354,
"step": 100
},
{
"epoch": 0.05,
"grad_norm": 22.63409423828125,
"learning_rate": 4.983023512435278e-06,
"loss": 1.0036,
"step": 200
},
{
"epoch": 0.05,
"eval_loss": 0.84719318151474,
"eval_runtime": 452.9811,
"eval_samples_per_second": 21.705,
"eval_steps_per_second": 1.358,
"step": 200
},
{
"epoch": 0.08,
"grad_norm": 17.67691993713379,
"learning_rate": 4.974535268652917e-06,
"loss": 0.8356,
"step": 300
},
{
"epoch": 0.08,
"eval_loss": 0.5871103405952454,
"eval_runtime": 453.5224,
"eval_samples_per_second": 21.679,
"eval_steps_per_second": 1.356,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 26.09956932067871,
"learning_rate": 4.966047024870554e-06,
"loss": 0.6282,
"step": 400
},
{
"epoch": 0.1,
"eval_loss": 0.49549001455307007,
"eval_runtime": 453.1973,
"eval_samples_per_second": 21.695,
"eval_steps_per_second": 1.357,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 23.882291793823242,
"learning_rate": 4.957558781088193e-06,
"loss": 0.5428,
"step": 500
},
{
"epoch": 0.13,
"eval_loss": 0.4652431905269623,
"eval_runtime": 452.8789,
"eval_samples_per_second": 21.71,
"eval_steps_per_second": 1.358,
"step": 500
},
{
"epoch": 0.15,
"grad_norm": 18.765167236328125,
"learning_rate": 4.949070537305831e-06,
"loss": 0.5124,
"step": 600
},
{
"epoch": 0.15,
"eval_loss": 0.42608916759490967,
"eval_runtime": 453.0516,
"eval_samples_per_second": 21.702,
"eval_steps_per_second": 1.357,
"step": 600
},
{
"epoch": 0.18,
"grad_norm": 19.878719329833984,
"learning_rate": 4.94058229352347e-06,
"loss": 0.506,
"step": 700
},
{
"epoch": 0.18,
"eval_loss": 0.4183957576751709,
"eval_runtime": 452.8777,
"eval_samples_per_second": 21.71,
"eval_steps_per_second": 1.358,
"step": 700
},
{
"epoch": 0.2,
"grad_norm": 19.626863479614258,
"learning_rate": 4.9320940497411094e-06,
"loss": 0.4828,
"step": 800
},
{
"epoch": 0.2,
"eval_loss": 0.4115789830684662,
"eval_runtime": 451.8875,
"eval_samples_per_second": 21.758,
"eval_steps_per_second": 1.361,
"step": 800
},
{
"epoch": 0.23,
"grad_norm": 17.65311050415039,
"learning_rate": 4.923605805958748e-06,
"loss": 0.4698,
"step": 900
},
{
"epoch": 0.23,
"eval_loss": 0.40224820375442505,
"eval_runtime": 451.4433,
"eval_samples_per_second": 21.779,
"eval_steps_per_second": 1.362,
"step": 900
},
{
"epoch": 0.25,
"grad_norm": 20.876745223999023,
"learning_rate": 4.915117562176387e-06,
"loss": 0.4572,
"step": 1000
},
{
"epoch": 0.25,
"eval_loss": 0.41678380966186523,
"eval_runtime": 452.1237,
"eval_samples_per_second": 21.746,
"eval_steps_per_second": 1.36,
"step": 1000
},
{
"epoch": 0.28,
"grad_norm": 22.631921768188477,
"learning_rate": 4.906629318394024e-06,
"loss": 0.4347,
"step": 1100
},
{
"epoch": 0.28,
"eval_loss": 0.39895564317703247,
"eval_runtime": 452.3272,
"eval_samples_per_second": 21.736,
"eval_steps_per_second": 1.36,
"step": 1100
},
{
"epoch": 0.31,
"grad_norm": 19.680686950683594,
"learning_rate": 4.898141074611663e-06,
"loss": 0.4412,
"step": 1200
},
{
"epoch": 0.31,
"eval_loss": 0.3831236958503723,
"eval_runtime": 452.7887,
"eval_samples_per_second": 21.714,
"eval_steps_per_second": 1.358,
"step": 1200
},
{
"epoch": 0.33,
"grad_norm": 20.93834114074707,
"learning_rate": 4.889652830829302e-06,
"loss": 0.4426,
"step": 1300
},
{
"epoch": 0.33,
"eval_loss": 0.3780069649219513,
"eval_runtime": 452.3849,
"eval_samples_per_second": 21.734,
"eval_steps_per_second": 1.359,
"step": 1300
},
{
"epoch": 0.36,
"grad_norm": 20.318857192993164,
"learning_rate": 4.88116458704694e-06,
"loss": 0.4367,
"step": 1400
},
{
"epoch": 0.36,
"eval_loss": 0.36752593517303467,
"eval_runtime": 452.8709,
"eval_samples_per_second": 21.71,
"eval_steps_per_second": 1.358,
"step": 1400
},
{
"epoch": 0.38,
"grad_norm": 16.789215087890625,
"learning_rate": 4.8726763432645794e-06,
"loss": 0.4159,
"step": 1500
},
{
"epoch": 0.38,
"eval_loss": 0.37768688797950745,
"eval_runtime": 453.2249,
"eval_samples_per_second": 21.693,
"eval_steps_per_second": 1.357,
"step": 1500
},
{
"epoch": 0.41,
"grad_norm": 15.212777137756348,
"learning_rate": 4.864188099482218e-06,
"loss": 0.4145,
"step": 1600
},
{
"epoch": 0.41,
"eval_loss": 0.3734543025493622,
"eval_runtime": 453.5584,
"eval_samples_per_second": 21.677,
"eval_steps_per_second": 1.356,
"step": 1600
},
{
"epoch": 0.43,
"grad_norm": 16.271045684814453,
"learning_rate": 4.855699855699857e-06,
"loss": 0.4039,
"step": 1700
},
{
"epoch": 0.43,
"eval_loss": 0.35818716883659363,
"eval_runtime": 453.0212,
"eval_samples_per_second": 21.703,
"eval_steps_per_second": 1.358,
"step": 1700
},
{
"epoch": 0.46,
"grad_norm": 30.50907325744629,
"learning_rate": 4.847211611917494e-06,
"loss": 0.4138,
"step": 1800
},
{
"epoch": 0.46,
"eval_loss": 0.38744741678237915,
"eval_runtime": 452.8649,
"eval_samples_per_second": 21.711,
"eval_steps_per_second": 1.358,
"step": 1800
},
{
"epoch": 0.48,
"grad_norm": 12.229795455932617,
"learning_rate": 4.838723368135133e-06,
"loss": 0.4071,
"step": 1900
},
{
"epoch": 0.48,
"eval_loss": 0.3485246002674103,
"eval_runtime": 453.2995,
"eval_samples_per_second": 21.69,
"eval_steps_per_second": 1.357,
"step": 1900
},
{
"epoch": 0.51,
"grad_norm": 15.995959281921387,
"learning_rate": 4.830235124352772e-06,
"loss": 0.4141,
"step": 2000
},
{
"epoch": 0.51,
"eval_loss": 0.34898054599761963,
"eval_runtime": 453.0588,
"eval_samples_per_second": 21.701,
"eval_steps_per_second": 1.357,
"step": 2000
},
{
"epoch": 0.53,
"grad_norm": 21.010486602783203,
"learning_rate": 4.82174688057041e-06,
"loss": 0.3917,
"step": 2100
},
{
"epoch": 0.53,
"eval_loss": 0.36585670709609985,
"eval_runtime": 453.7967,
"eval_samples_per_second": 21.666,
"eval_steps_per_second": 1.355,
"step": 2100
}
],
"logging_steps": 100,
"max_steps": 58905,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 100,
"total_flos": 1.7850550446351974e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}