trial_model / trainer_state.json
ioseff's picture
Upload 14 files
d50784b verified
{
"best_metric": 1.396972417831421,
"best_model_checkpoint": "./qwen_t/qwen_o5/checkpoint-320",
"epoch": 0.11695906432748537,
"eval_steps": 10,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003341687552213868,
"grad_norm": 0.7599132657051086,
"learning_rate": 0.0002,
"loss": 3.626,
"mean_token_accuracy": 0.36439715698361397,
"step": 10
},
{
"epoch": 0.003341687552213868,
"eval_loss": 3.443260669708252,
"eval_mean_token_accuracy": 0.4417985293127242,
"eval_runtime": 41.9693,
"eval_samples_per_second": 80.058,
"eval_steps_per_second": 10.007,
"step": 10
},
{
"epoch": 0.006683375104427736,
"grad_norm": 0.8953952193260193,
"learning_rate": 0.0002,
"loss": 2.9254,
"mean_token_accuracy": 0.49326967149972917,
"step": 20
},
{
"epoch": 0.006683375104427736,
"eval_loss": 2.5516397953033447,
"eval_mean_token_accuracy": 0.565807048479716,
"eval_runtime": 40.7965,
"eval_samples_per_second": 82.36,
"eval_steps_per_second": 10.295,
"step": 20
},
{
"epoch": 0.010025062656641603,
"grad_norm": 0.6876745223999023,
"learning_rate": 0.0002,
"loss": 2.231,
"mean_token_accuracy": 0.6093000993132591,
"step": 30
},
{
"epoch": 0.010025062656641603,
"eval_loss": 2.153578042984009,
"eval_mean_token_accuracy": 0.6297694771062761,
"eval_runtime": 35.4077,
"eval_samples_per_second": 94.895,
"eval_steps_per_second": 11.862,
"step": 30
},
{
"epoch": 0.013366750208855471,
"grad_norm": 0.9506922960281372,
"learning_rate": 0.0002,
"loss": 1.8838,
"mean_token_accuracy": 0.6746647953987122,
"step": 40
},
{
"epoch": 0.013366750208855471,
"eval_loss": 2.019120454788208,
"eval_mean_token_accuracy": 0.6349048288805145,
"eval_runtime": 35.8494,
"eval_samples_per_second": 93.725,
"eval_steps_per_second": 11.716,
"step": 40
},
{
"epoch": 0.01670843776106934,
"grad_norm": 2.049982786178589,
"learning_rate": 0.0002,
"loss": 1.5276,
"mean_token_accuracy": 0.7170954093337059,
"step": 50
},
{
"epoch": 0.01670843776106934,
"eval_loss": 1.813344120979309,
"eval_mean_token_accuracy": 0.6365837232697578,
"eval_runtime": 29.3868,
"eval_samples_per_second": 114.337,
"eval_steps_per_second": 14.292,
"step": 50
},
{
"epoch": 0.020050125313283207,
"grad_norm": 0.7795844674110413,
"learning_rate": 0.0002,
"loss": 2.2638,
"mean_token_accuracy": 0.5529197990894318,
"step": 60
},
{
"epoch": 0.020050125313283207,
"eval_loss": 1.6835161447525024,
"eval_mean_token_accuracy": 0.6496368288993836,
"eval_runtime": 67.7706,
"eval_samples_per_second": 49.579,
"eval_steps_per_second": 6.197,
"step": 60
},
{
"epoch": 0.023391812865497075,
"grad_norm": 0.6929437518119812,
"learning_rate": 0.0002,
"loss": 1.7755,
"mean_token_accuracy": 0.6449611410498619,
"step": 70
},
{
"epoch": 0.023391812865497075,
"eval_loss": 1.5908516645431519,
"eval_mean_token_accuracy": 0.6815834226352828,
"eval_runtime": 63.7232,
"eval_samples_per_second": 52.728,
"eval_steps_per_second": 6.591,
"step": 70
},
{
"epoch": 0.026733500417710943,
"grad_norm": 0.5863602161407471,
"learning_rate": 0.0002,
"loss": 1.4411,
"mean_token_accuracy": 0.6989624485373497,
"step": 80
},
{
"epoch": 0.026733500417710943,
"eval_loss": 1.5338634252548218,
"eval_mean_token_accuracy": 0.6856980549437659,
"eval_runtime": 50.2939,
"eval_samples_per_second": 66.807,
"eval_steps_per_second": 8.351,
"step": 80
},
{
"epoch": 0.03007518796992481,
"grad_norm": 1.1920981407165527,
"learning_rate": 0.0002,
"loss": 1.2398,
"mean_token_accuracy": 0.733481515944004,
"step": 90
},
{
"epoch": 0.03007518796992481,
"eval_loss": 1.5052729845046997,
"eval_mean_token_accuracy": 0.6931079140731267,
"eval_runtime": 64.5138,
"eval_samples_per_second": 52.082,
"eval_steps_per_second": 6.51,
"step": 90
},
{
"epoch": 0.03341687552213868,
"grad_norm": 0.6549275517463684,
"learning_rate": 0.0002,
"loss": 1.0111,
"mean_token_accuracy": 0.7853028282523156,
"step": 100
},
{
"epoch": 0.03341687552213868,
"eval_loss": 1.518865704536438,
"eval_mean_token_accuracy": 0.6946799146987143,
"eval_runtime": 34.4958,
"eval_samples_per_second": 97.403,
"eval_steps_per_second": 12.175,
"step": 100
},
{
"epoch": 0.036758563074352546,
"grad_norm": 0.5133540630340576,
"learning_rate": 0.0002,
"loss": 1.9947,
"mean_token_accuracy": 0.6003070399165154,
"step": 110
},
{
"epoch": 0.036758563074352546,
"eval_loss": 1.4736624956130981,
"eval_mean_token_accuracy": 0.6929171987232707,
"eval_runtime": 31.0194,
"eval_samples_per_second": 108.319,
"eval_steps_per_second": 13.54,
"step": 110
},
{
"epoch": 0.040100250626566414,
"grad_norm": 0.4258256256580353,
"learning_rate": 0.0002,
"loss": 1.5854,
"mean_token_accuracy": 0.6611790612339974,
"step": 120
},
{
"epoch": 0.040100250626566414,
"eval_loss": 1.458544373512268,
"eval_mean_token_accuracy": 0.6966695138386317,
"eval_runtime": 32.5414,
"eval_samples_per_second": 103.253,
"eval_steps_per_second": 12.907,
"step": 120
},
{
"epoch": 0.04344193817878028,
"grad_norm": 0.5019882321357727,
"learning_rate": 0.0002,
"loss": 1.3932,
"mean_token_accuracy": 0.717147932946682,
"step": 130
},
{
"epoch": 0.04344193817878028,
"eval_loss": 1.4401620626449585,
"eval_mean_token_accuracy": 0.7001797112680617,
"eval_runtime": 30.1368,
"eval_samples_per_second": 111.492,
"eval_steps_per_second": 13.936,
"step": 130
},
{
"epoch": 0.04678362573099415,
"grad_norm": 0.5241239070892334,
"learning_rate": 0.0002,
"loss": 1.1731,
"mean_token_accuracy": 0.7519763350486756,
"step": 140
},
{
"epoch": 0.04678362573099415,
"eval_loss": 1.4381680488586426,
"eval_mean_token_accuracy": 0.6980286110724722,
"eval_runtime": 29.4312,
"eval_samples_per_second": 114.164,
"eval_steps_per_second": 14.271,
"step": 140
},
{
"epoch": 0.05012531328320802,
"grad_norm": 0.5657021999359131,
"learning_rate": 0.0002,
"loss": 0.9886,
"mean_token_accuracy": 0.804797975718975,
"step": 150
},
{
"epoch": 0.05012531328320802,
"eval_loss": 1.4500703811645508,
"eval_mean_token_accuracy": 0.7004464421243894,
"eval_runtime": 29.379,
"eval_samples_per_second": 114.368,
"eval_steps_per_second": 14.296,
"step": 150
},
{
"epoch": 0.053467000835421885,
"grad_norm": 0.48124462366104126,
"learning_rate": 0.0002,
"loss": 1.8415,
"mean_token_accuracy": 0.6195126965641975,
"step": 160
},
{
"epoch": 0.053467000835421885,
"eval_loss": 1.4379223585128784,
"eval_mean_token_accuracy": 0.6959139862940424,
"eval_runtime": 29.496,
"eval_samples_per_second": 113.914,
"eval_steps_per_second": 14.239,
"step": 160
},
{
"epoch": 0.05680868838763575,
"grad_norm": 0.4167322516441345,
"learning_rate": 0.0002,
"loss": 1.5117,
"mean_token_accuracy": 0.6729505002498627,
"step": 170
},
{
"epoch": 0.05680868838763575,
"eval_loss": 1.4370402097702026,
"eval_mean_token_accuracy": 0.6991755010116668,
"eval_runtime": 30.4827,
"eval_samples_per_second": 110.227,
"eval_steps_per_second": 13.778,
"step": 170
},
{
"epoch": 0.06015037593984962,
"grad_norm": 0.44749510288238525,
"learning_rate": 0.0002,
"loss": 1.2954,
"mean_token_accuracy": 0.7225258648395538,
"step": 180
},
{
"epoch": 0.06015037593984962,
"eval_loss": 1.423570156097412,
"eval_mean_token_accuracy": 0.7020650133490562,
"eval_runtime": 30.6017,
"eval_samples_per_second": 109.798,
"eval_steps_per_second": 13.725,
"step": 180
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.3989886939525604,
"learning_rate": 0.0002,
"loss": 1.213,
"mean_token_accuracy": 0.7481454327702522,
"step": 190
},
{
"epoch": 0.06349206349206349,
"eval_loss": 1.4212963581085205,
"eval_mean_token_accuracy": 0.7001493394374847,
"eval_runtime": 41.8797,
"eval_samples_per_second": 80.23,
"eval_steps_per_second": 10.029,
"step": 190
},
{
"epoch": 0.06683375104427736,
"grad_norm": 0.5422595739364624,
"learning_rate": 0.0002,
"loss": 0.942,
"mean_token_accuracy": 0.7962346121668815,
"step": 200
},
{
"epoch": 0.06683375104427736,
"eval_loss": 1.421792984008789,
"eval_mean_token_accuracy": 0.7010365227858225,
"eval_runtime": 53.936,
"eval_samples_per_second": 62.296,
"eval_steps_per_second": 7.787,
"step": 200
},
{
"epoch": 0.07017543859649122,
"grad_norm": 0.39737701416015625,
"learning_rate": 0.0002,
"loss": 1.9107,
"mean_token_accuracy": 0.5946269743144512,
"step": 210
},
{
"epoch": 0.07017543859649122,
"eval_loss": 1.4200433492660522,
"eval_mean_token_accuracy": 0.6980209651447478,
"eval_runtime": 48.7912,
"eval_samples_per_second": 68.865,
"eval_steps_per_second": 8.608,
"step": 210
},
{
"epoch": 0.07351712614870509,
"grad_norm": 0.3731982707977295,
"learning_rate": 0.0002,
"loss": 1.4745,
"mean_token_accuracy": 0.6861546367406846,
"step": 220
},
{
"epoch": 0.07351712614870509,
"eval_loss": 1.426990032196045,
"eval_mean_token_accuracy": 0.6979587059645426,
"eval_runtime": 45.8174,
"eval_samples_per_second": 73.335,
"eval_steps_per_second": 9.167,
"step": 220
},
{
"epoch": 0.07685881370091896,
"grad_norm": 0.5165483951568604,
"learning_rate": 0.0002,
"loss": 1.3166,
"mean_token_accuracy": 0.717747439444065,
"step": 230
},
{
"epoch": 0.07685881370091896,
"eval_loss": 1.4164931774139404,
"eval_mean_token_accuracy": 0.7008462209077109,
"eval_runtime": 35.0477,
"eval_samples_per_second": 95.869,
"eval_steps_per_second": 11.984,
"step": 230
},
{
"epoch": 0.08020050125313283,
"grad_norm": 0.3445465862751007,
"learning_rate": 0.0002,
"loss": 1.138,
"mean_token_accuracy": 0.7411063179373741,
"step": 240
},
{
"epoch": 0.08020050125313283,
"eval_loss": 1.4141920804977417,
"eval_mean_token_accuracy": 0.7027672590953963,
"eval_runtime": 34.6315,
"eval_samples_per_second": 97.022,
"eval_steps_per_second": 12.128,
"step": 240
},
{
"epoch": 0.0835421888053467,
"grad_norm": 0.9735682606697083,
"learning_rate": 0.0002,
"loss": 0.8767,
"mean_token_accuracy": 0.79200878739357,
"step": 250
},
{
"epoch": 0.0835421888053467,
"eval_loss": 1.421015977859497,
"eval_mean_token_accuracy": 0.6934712292892592,
"eval_runtime": 34.4466,
"eval_samples_per_second": 97.542,
"eval_steps_per_second": 12.193,
"step": 250
},
{
"epoch": 0.08688387635756056,
"grad_norm": 0.4343126118183136,
"learning_rate": 0.0002,
"loss": 1.9246,
"mean_token_accuracy": 0.5945770829916001,
"step": 260
},
{
"epoch": 0.08688387635756056,
"eval_loss": 1.4099905490875244,
"eval_mean_token_accuracy": 0.7017570126624334,
"eval_runtime": 29.6092,
"eval_samples_per_second": 113.478,
"eval_steps_per_second": 14.185,
"step": 260
},
{
"epoch": 0.09022556390977443,
"grad_norm": 0.3334052562713623,
"learning_rate": 0.0002,
"loss": 1.4759,
"mean_token_accuracy": 0.6715509802103042,
"step": 270
},
{
"epoch": 0.09022556390977443,
"eval_loss": 1.4085925817489624,
"eval_mean_token_accuracy": 0.7048604423091525,
"eval_runtime": 38.8094,
"eval_samples_per_second": 86.577,
"eval_steps_per_second": 10.822,
"step": 270
},
{
"epoch": 0.0935672514619883,
"grad_norm": 0.5291116237640381,
"learning_rate": 0.0002,
"loss": 1.3162,
"mean_token_accuracy": 0.7213364154100418,
"step": 280
},
{
"epoch": 0.0935672514619883,
"eval_loss": 1.4125802516937256,
"eval_mean_token_accuracy": 0.7007201626896858,
"eval_runtime": 41.0882,
"eval_samples_per_second": 81.775,
"eval_steps_per_second": 10.222,
"step": 280
},
{
"epoch": 0.09690893901420217,
"grad_norm": 0.3959917724132538,
"learning_rate": 0.0002,
"loss": 1.192,
"mean_token_accuracy": 0.7394131779670715,
"step": 290
},
{
"epoch": 0.09690893901420217,
"eval_loss": 1.4038469791412354,
"eval_mean_token_accuracy": 0.7027802584426743,
"eval_runtime": 31.4074,
"eval_samples_per_second": 106.981,
"eval_steps_per_second": 13.373,
"step": 290
},
{
"epoch": 0.10025062656641603,
"grad_norm": 0.6445237398147583,
"learning_rate": 0.0002,
"loss": 0.8861,
"mean_token_accuracy": 0.7985727787017822,
"step": 300
},
{
"epoch": 0.10025062656641603,
"eval_loss": 1.4055042266845703,
"eval_mean_token_accuracy": 0.7020482325837726,
"eval_runtime": 36.2285,
"eval_samples_per_second": 92.745,
"eval_steps_per_second": 11.593,
"step": 300
},
{
"epoch": 0.1035923141186299,
"grad_norm": 0.3228004276752472,
"learning_rate": 0.0002,
"loss": 1.9139,
"mean_token_accuracy": 0.6049163021147251,
"step": 310
},
{
"epoch": 0.1035923141186299,
"eval_loss": 1.401644229888916,
"eval_mean_token_accuracy": 0.70244310824644,
"eval_runtime": 29.3696,
"eval_samples_per_second": 114.404,
"eval_steps_per_second": 14.301,
"step": 310
},
{
"epoch": 0.10693400167084377,
"grad_norm": 0.35528433322906494,
"learning_rate": 0.0002,
"loss": 1.4937,
"mean_token_accuracy": 0.6764601737260818,
"step": 320
},
{
"epoch": 0.10693400167084377,
"eval_loss": 1.396972417831421,
"eval_mean_token_accuracy": 0.699098062302385,
"eval_runtime": 40.4694,
"eval_samples_per_second": 83.026,
"eval_steps_per_second": 10.378,
"step": 320
},
{
"epoch": 0.11027568922305764,
"grad_norm": 0.4269411563873291,
"learning_rate": 0.0002,
"loss": 1.2599,
"mean_token_accuracy": 0.7228553861379623,
"step": 330
},
{
"epoch": 0.11027568922305764,
"eval_loss": 1.398388385772705,
"eval_mean_token_accuracy": 0.7035389555352075,
"eval_runtime": 33.2865,
"eval_samples_per_second": 100.942,
"eval_steps_per_second": 12.618,
"step": 330
},
{
"epoch": 0.1136173767752715,
"grad_norm": 0.372363805770874,
"learning_rate": 0.0002,
"loss": 1.1301,
"mean_token_accuracy": 0.7524892643094063,
"step": 340
},
{
"epoch": 0.1136173767752715,
"eval_loss": 1.398653507232666,
"eval_mean_token_accuracy": 0.6984730128731046,
"eval_runtime": 44.4473,
"eval_samples_per_second": 75.595,
"eval_steps_per_second": 9.449,
"step": 340
},
{
"epoch": 0.11695906432748537,
"grad_norm": 0.4013306796550751,
"learning_rate": 0.0002,
"loss": 0.939,
"mean_token_accuracy": 0.7970763191580772,
"step": 350
},
{
"epoch": 0.11695906432748537,
"eval_loss": 1.3984951972961426,
"eval_mean_token_accuracy": 0.7021824714683351,
"eval_runtime": 35.5812,
"eval_samples_per_second": 94.432,
"eval_steps_per_second": 11.804,
"step": 350
}
],
"logging_steps": 10,
"max_steps": 14960,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 10,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 934541258360832.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}