tryingpro's picture
Training in progress, step 90, checkpoint
70511ab verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.3530364372469634,
"eval_steps": 8,
"global_step": 90,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025910931174089068,
"eval_loss": 0.21314890682697296,
"eval_runtime": 12.1183,
"eval_samples_per_second": 10.728,
"eval_steps_per_second": 5.364,
"step": 1
},
{
"epoch": 0.07773279352226721,
"grad_norm": 6.452937126159668,
"learning_rate": 3e-05,
"loss": 6.5264,
"step": 3
},
{
"epoch": 0.15546558704453442,
"grad_norm": 6.785652160644531,
"learning_rate": 6e-05,
"loss": 6.3657,
"step": 6
},
{
"epoch": 0.20728744939271254,
"eval_loss": 0.10755843669176102,
"eval_runtime": 12.1878,
"eval_samples_per_second": 10.666,
"eval_steps_per_second": 5.333,
"step": 8
},
{
"epoch": 0.23319838056680162,
"grad_norm": 5.717119216918945,
"learning_rate": 9e-05,
"loss": 4.5674,
"step": 9
},
{
"epoch": 0.31093117408906884,
"grad_norm": 6.126352787017822,
"learning_rate": 0.00012,
"loss": 1.7668,
"step": 12
},
{
"epoch": 0.38866396761133604,
"grad_norm": 2.648998498916626,
"learning_rate": 0.00015000000000000001,
"loss": 0.4109,
"step": 15
},
{
"epoch": 0.4145748987854251,
"eval_loss": 0.0012996116420254111,
"eval_runtime": 12.1969,
"eval_samples_per_second": 10.658,
"eval_steps_per_second": 5.329,
"step": 16
},
{
"epoch": 0.46639676113360323,
"grad_norm": 0.33190277218818665,
"learning_rate": 0.00018,
"loss": 0.0473,
"step": 18
},
{
"epoch": 0.5441295546558704,
"grad_norm": 0.2983362078666687,
"learning_rate": 0.00019989930665413147,
"loss": 0.0116,
"step": 21
},
{
"epoch": 0.6218623481781377,
"grad_norm": 0.2238294929265976,
"learning_rate": 0.00019839295885986296,
"loss": 0.0077,
"step": 24
},
{
"epoch": 0.6218623481781377,
"eval_loss": 0.000773053674492985,
"eval_runtime": 12.1863,
"eval_samples_per_second": 10.668,
"eval_steps_per_second": 5.334,
"step": 24
},
{
"epoch": 0.6995951417004048,
"grad_norm": 0.016547193750739098,
"learning_rate": 0.00019510565162951537,
"loss": 0.0142,
"step": 27
},
{
"epoch": 0.7773279352226721,
"grad_norm": 0.019047992303967476,
"learning_rate": 0.0001900968867902419,
"loss": 0.0045,
"step": 30
},
{
"epoch": 0.8291497975708502,
"eval_loss": 0.0002317978214705363,
"eval_runtime": 12.2198,
"eval_samples_per_second": 10.638,
"eval_steps_per_second": 5.319,
"step": 32
},
{
"epoch": 0.8550607287449393,
"grad_norm": 1.0941699743270874,
"learning_rate": 0.00018345732537213027,
"loss": 0.0019,
"step": 33
},
{
"epoch": 0.9327935222672065,
"grad_norm": 1.774532675743103,
"learning_rate": 0.00017530714660036112,
"loss": 0.0047,
"step": 36
},
{
"epoch": 1.0210526315789474,
"grad_norm": 0.029302451759576797,
"learning_rate": 0.00016579387259397127,
"loss": 0.0021,
"step": 39
},
{
"epoch": 1.0469635627530365,
"eval_loss": 2.721450073295273e-05,
"eval_runtime": 44.3093,
"eval_samples_per_second": 2.934,
"eval_steps_per_second": 1.467,
"step": 40
},
{
"epoch": 1.0987854251012146,
"grad_norm": 0.009189918637275696,
"learning_rate": 0.00015508969814521025,
"loss": 0.0005,
"step": 42
},
{
"epoch": 1.1765182186234817,
"grad_norm": 0.030586188659071922,
"learning_rate": 0.00014338837391175582,
"loss": 0.0006,
"step": 45
},
{
"epoch": 1.2542510121457489,
"grad_norm": 0.019615933299064636,
"learning_rate": 0.00013090169943749476,
"loss": 0.0008,
"step": 48
},
{
"epoch": 1.2542510121457489,
"eval_loss": 2.218412555521354e-05,
"eval_runtime": 44.4013,
"eval_samples_per_second": 2.928,
"eval_steps_per_second": 1.464,
"step": 48
},
{
"epoch": 1.3319838056680162,
"grad_norm": 0.006924801040440798,
"learning_rate": 0.00011785568947986367,
"loss": 0.045,
"step": 51
},
{
"epoch": 1.4097165991902834,
"grad_norm": 0.02355915680527687,
"learning_rate": 0.00010448648303505151,
"loss": 0.0006,
"step": 54
},
{
"epoch": 1.4615384615384617,
"eval_loss": 2.361264887440484e-05,
"eval_runtime": 44.3855,
"eval_samples_per_second": 2.929,
"eval_steps_per_second": 1.464,
"step": 56
},
{
"epoch": 1.4874493927125507,
"grad_norm": 1.601819396018982,
"learning_rate": 9.103606910965666e-05,
"loss": 0.0077,
"step": 57
},
{
"epoch": 1.5651821862348179,
"grad_norm": 0.09957171976566315,
"learning_rate": 7.774790660436858e-05,
"loss": 0.0016,
"step": 60
},
{
"epoch": 1.642914979757085,
"grad_norm": 0.03547385334968567,
"learning_rate": 6.486251759186572e-05,
"loss": 0.001,
"step": 63
},
{
"epoch": 1.668825910931174,
"eval_loss": 3.899199145962484e-05,
"eval_runtime": 43.2962,
"eval_samples_per_second": 3.003,
"eval_steps_per_second": 1.501,
"step": 64
},
{
"epoch": 1.7206477732793521,
"grad_norm": 0.025507541373372078,
"learning_rate": 5.261313375270014e-05,
"loss": 0.0005,
"step": 66
},
{
"epoch": 1.7983805668016193,
"grad_norm": 0.009455524384975433,
"learning_rate": 4.12214747707527e-05,
"loss": 0.0004,
"step": 69
},
{
"epoch": 1.8761133603238866,
"grad_norm": 0.0038223874289542437,
"learning_rate": 3.089373510131354e-05,
"loss": 0.0009,
"step": 72
},
{
"epoch": 1.8761133603238866,
"eval_loss": 3.11742551275529e-05,
"eval_runtime": 44.3844,
"eval_samples_per_second": 2.929,
"eval_steps_per_second": 1.464,
"step": 72
},
{
"epoch": 1.953846153846154,
"grad_norm": 0.005872055422514677,
"learning_rate": 2.181685175319702e-05,
"loss": 0.001,
"step": 75
},
{
"epoch": 2.042105263157895,
"grad_norm": 0.03851957991719246,
"learning_rate": 1.415512063981339e-05,
"loss": 0.0008,
"step": 78
},
{
"epoch": 2.093927125506073,
"eval_loss": 3.268746513640508e-05,
"eval_runtime": 44.3546,
"eval_samples_per_second": 2.931,
"eval_steps_per_second": 1.465,
"step": 80
},
{
"epoch": 2.119838056680162,
"grad_norm": 0.009562190622091293,
"learning_rate": 8.047222744854943e-06,
"loss": 0.0003,
"step": 81
},
{
"epoch": 2.197570850202429,
"grad_norm": 0.00980624184012413,
"learning_rate": 3.6037139304146762e-06,
"loss": 0.0003,
"step": 84
},
{
"epoch": 2.2753036437246963,
"grad_norm": 0.010969814844429493,
"learning_rate": 9.0502382320653e-07,
"loss": 0.0003,
"step": 87
},
{
"epoch": 2.3012145748987853,
"eval_loss": 2.5345105314045213e-05,
"eval_runtime": 44.4252,
"eval_samples_per_second": 2.926,
"eval_steps_per_second": 1.463,
"step": 88
},
{
"epoch": 2.3530364372469634,
"grad_norm": 2.2677857875823975,
"learning_rate": 0.0,
"loss": 0.009,
"step": 90
}
],
"logging_steps": 3,
"max_steps": 90,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 8,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.7332406838951936e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}