ryan_model3272024 / trainer_state.json
rshrott's picture
🍻 cheers
d6a8860 verified
{
"best_metric": 0.26356959342956543,
"best_model_checkpoint": "./ryan_model3272024/checkpoint-1000",
"epoch": 0.6496519721577726,
"eval_steps": 100,
"global_step": 1400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 0.38699468970298767,
"learning_rate": 0.0001994199535962877,
"loss": 0.4038,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.6787680387496948,
"learning_rate": 0.00019883990719257543,
"loss": 0.4003,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 0.5743306279182434,
"learning_rate": 0.00019825986078886312,
"loss": 0.3591,
"step": 75
},
{
"epoch": 0.05,
"grad_norm": 0.41705068945884705,
"learning_rate": 0.00019767981438515082,
"loss": 0.3524,
"step": 100
},
{
"epoch": 0.05,
"eval_loss": 0.339992493391037,
"eval_na_accuracy": 0.7586872577667236,
"eval_ordinal_accuracy": 0.38746026158332825,
"eval_ordinal_mae": 0.8904515504837036,
"eval_runtime": 335.205,
"eval_samples_per_second": 11.87,
"eval_steps_per_second": 1.486,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 0.36200761795043945,
"learning_rate": 0.0001970997679814385,
"loss": 0.3071,
"step": 125
},
{
"epoch": 0.07,
"grad_norm": 0.24589791893959045,
"learning_rate": 0.00019651972157772623,
"loss": 0.3475,
"step": 150
},
{
"epoch": 0.08,
"grad_norm": 0.6089735627174377,
"learning_rate": 0.00019593967517401393,
"loss": 0.3072,
"step": 175
},
{
"epoch": 0.09,
"grad_norm": 0.5671761631965637,
"learning_rate": 0.00019535962877030162,
"loss": 0.2683,
"step": 200
},
{
"epoch": 0.09,
"eval_loss": 0.36712726950645447,
"eval_na_accuracy": 0.623552143573761,
"eval_ordinal_accuracy": 0.48916497826576233,
"eval_ordinal_mae": 0.7306416630744934,
"eval_runtime": 155.9343,
"eval_samples_per_second": 25.517,
"eval_steps_per_second": 3.194,
"step": 200
},
{
"epoch": 0.1,
"grad_norm": 1.2764167785644531,
"learning_rate": 0.00019477958236658932,
"loss": 0.2953,
"step": 225
},
{
"epoch": 0.12,
"grad_norm": 1.9076497554779053,
"learning_rate": 0.00019419953596287704,
"loss": 0.3382,
"step": 250
},
{
"epoch": 0.13,
"grad_norm": 0.2747127115726471,
"learning_rate": 0.00019361948955916474,
"loss": 0.2752,
"step": 275
},
{
"epoch": 0.14,
"grad_norm": 0.9448749423027039,
"learning_rate": 0.00019303944315545243,
"loss": 0.3314,
"step": 300
},
{
"epoch": 0.14,
"eval_loss": 0.3450469672679901,
"eval_na_accuracy": 0.6969112157821655,
"eval_ordinal_accuracy": 0.4013291001319885,
"eval_ordinal_mae": 0.8077224493026733,
"eval_runtime": 156.2328,
"eval_samples_per_second": 25.468,
"eval_steps_per_second": 3.188,
"step": 300
},
{
"epoch": 0.15,
"grad_norm": 0.2589721083641052,
"learning_rate": 0.00019245939675174015,
"loss": 0.3486,
"step": 325
},
{
"epoch": 0.16,
"grad_norm": 0.44286003708839417,
"learning_rate": 0.00019187935034802785,
"loss": 0.3386,
"step": 350
},
{
"epoch": 0.17,
"grad_norm": 0.3215602934360504,
"learning_rate": 0.00019129930394431554,
"loss": 0.3056,
"step": 375
},
{
"epoch": 0.19,
"grad_norm": 0.9510051012039185,
"learning_rate": 0.00019071925754060324,
"loss": 0.2747,
"step": 400
},
{
"epoch": 0.19,
"eval_loss": 0.28132036328315735,
"eval_na_accuracy": 0.7895752787590027,
"eval_ordinal_accuracy": 0.5423288345336914,
"eval_ordinal_mae": 0.6105712056159973,
"eval_runtime": 155.1965,
"eval_samples_per_second": 25.638,
"eval_steps_per_second": 3.209,
"step": 400
},
{
"epoch": 0.2,
"grad_norm": 0.5417093634605408,
"learning_rate": 0.00019013921113689096,
"loss": 0.2522,
"step": 425
},
{
"epoch": 0.21,
"grad_norm": 1.405881643295288,
"learning_rate": 0.00018955916473317868,
"loss": 0.3589,
"step": 450
},
{
"epoch": 0.22,
"grad_norm": 0.8319898843765259,
"learning_rate": 0.00018897911832946638,
"loss": 0.2991,
"step": 475
},
{
"epoch": 0.23,
"grad_norm": 1.9455621242523193,
"learning_rate": 0.00018839907192575407,
"loss": 0.3247,
"step": 500
},
{
"epoch": 0.23,
"eval_loss": 0.3143959045410156,
"eval_na_accuracy": 0.7104247212409973,
"eval_ordinal_accuracy": 0.4524703919887543,
"eval_ordinal_mae": 0.7256373763084412,
"eval_runtime": 157.1141,
"eval_samples_per_second": 25.326,
"eval_steps_per_second": 3.17,
"step": 500
},
{
"epoch": 0.24,
"grad_norm": 0.6339251399040222,
"learning_rate": 0.00018781902552204177,
"loss": 0.303,
"step": 525
},
{
"epoch": 0.26,
"grad_norm": 0.3713740408420563,
"learning_rate": 0.0001872389791183295,
"loss": 0.3035,
"step": 550
},
{
"epoch": 0.27,
"grad_norm": 0.7050974369049072,
"learning_rate": 0.00018665893271461718,
"loss": 0.2609,
"step": 575
},
{
"epoch": 0.28,
"grad_norm": 0.791477620601654,
"learning_rate": 0.00018607888631090488,
"loss": 0.3612,
"step": 600
},
{
"epoch": 0.28,
"eval_loss": 0.3074879050254822,
"eval_na_accuracy": 0.7586872577667236,
"eval_ordinal_accuracy": 0.4984108507633209,
"eval_ordinal_mae": 0.6415887475013733,
"eval_runtime": 154.2538,
"eval_samples_per_second": 25.795,
"eval_steps_per_second": 3.228,
"step": 600
},
{
"epoch": 0.29,
"grad_norm": 0.39196524024009705,
"learning_rate": 0.0001854988399071926,
"loss": 0.31,
"step": 625
},
{
"epoch": 0.3,
"grad_norm": 1.0753191709518433,
"learning_rate": 0.0001849187935034803,
"loss": 0.2722,
"step": 650
},
{
"epoch": 0.31,
"grad_norm": 0.8922611474990845,
"learning_rate": 0.000184338747099768,
"loss": 0.3132,
"step": 675
},
{
"epoch": 0.32,
"grad_norm": 0.6866246461868286,
"learning_rate": 0.0001837587006960557,
"loss": 0.3031,
"step": 700
},
{
"epoch": 0.32,
"eval_loss": 0.2784635126590729,
"eval_na_accuracy": 0.7895752787590027,
"eval_ordinal_accuracy": 0.5556197762489319,
"eval_ordinal_mae": 0.5720168352127075,
"eval_runtime": 154.421,
"eval_samples_per_second": 25.767,
"eval_steps_per_second": 3.225,
"step": 700
},
{
"epoch": 0.34,
"grad_norm": 1.713051676750183,
"learning_rate": 0.0001831786542923434,
"loss": 0.337,
"step": 725
},
{
"epoch": 0.35,
"grad_norm": 1.0872548818588257,
"learning_rate": 0.0001825986078886311,
"loss": 0.2918,
"step": 750
},
{
"epoch": 0.36,
"grad_norm": 1.5099256038665771,
"learning_rate": 0.0001820185614849188,
"loss": 0.2509,
"step": 775
},
{
"epoch": 0.37,
"grad_norm": 0.5774210691452026,
"learning_rate": 0.0001814385150812065,
"loss": 0.2866,
"step": 800
},
{
"epoch": 0.37,
"eval_loss": 0.28780511021614075,
"eval_na_accuracy": 0.7335907220840454,
"eval_ordinal_accuracy": 0.5775787234306335,
"eval_ordinal_mae": 0.5347856879234314,
"eval_runtime": 154.6062,
"eval_samples_per_second": 25.736,
"eval_steps_per_second": 3.221,
"step": 800
},
{
"epoch": 0.38,
"grad_norm": 0.33059367537498474,
"learning_rate": 0.00018085846867749422,
"loss": 0.2626,
"step": 825
},
{
"epoch": 0.39,
"grad_norm": 1.45087730884552,
"learning_rate": 0.0001802784222737819,
"loss": 0.3485,
"step": 850
},
{
"epoch": 0.41,
"grad_norm": 1.195901870727539,
"learning_rate": 0.0001796983758700696,
"loss": 0.3007,
"step": 875
},
{
"epoch": 0.42,
"grad_norm": 0.26779890060424805,
"learning_rate": 0.00017911832946635733,
"loss": 0.2927,
"step": 900
},
{
"epoch": 0.42,
"eval_loss": 0.2688673734664917,
"eval_na_accuracy": 0.7972972989082336,
"eval_ordinal_accuracy": 0.5573533773422241,
"eval_ordinal_mae": 0.5855077505111694,
"eval_runtime": 154.5178,
"eval_samples_per_second": 25.751,
"eval_steps_per_second": 3.223,
"step": 900
},
{
"epoch": 0.43,
"grad_norm": 0.5635965466499329,
"learning_rate": 0.00017853828306264502,
"loss": 0.269,
"step": 925
},
{
"epoch": 0.44,
"grad_norm": 2.8135786056518555,
"learning_rate": 0.00017795823665893272,
"loss": 0.2677,
"step": 950
},
{
"epoch": 0.45,
"grad_norm": 0.49396631121635437,
"learning_rate": 0.0001773781902552204,
"loss": 0.3069,
"step": 975
},
{
"epoch": 0.46,
"grad_norm": 1.3267723321914673,
"learning_rate": 0.00017679814385150814,
"loss": 0.3003,
"step": 1000
},
{
"epoch": 0.46,
"eval_loss": 0.26356959342956543,
"eval_na_accuracy": 0.7915058135986328,
"eval_ordinal_accuracy": 0.581045925617218,
"eval_ordinal_mae": 0.5543876886367798,
"eval_runtime": 157.946,
"eval_samples_per_second": 25.192,
"eval_steps_per_second": 3.153,
"step": 1000
},
{
"epoch": 0.48,
"grad_norm": 0.9938157200813293,
"learning_rate": 0.00017621809744779583,
"loss": 0.2521,
"step": 1025
},
{
"epoch": 0.49,
"grad_norm": 0.45715010166168213,
"learning_rate": 0.00017563805104408353,
"loss": 0.2926,
"step": 1050
},
{
"epoch": 0.5,
"grad_norm": 2.9666409492492676,
"learning_rate": 0.00017505800464037122,
"loss": 0.2581,
"step": 1075
},
{
"epoch": 0.51,
"grad_norm": 2.5301055908203125,
"learning_rate": 0.00017447795823665894,
"loss": 0.2522,
"step": 1100
},
{
"epoch": 0.51,
"eval_loss": 0.3009192943572998,
"eval_na_accuracy": 0.8571428656578064,
"eval_ordinal_accuracy": 0.54435133934021,
"eval_ordinal_mae": 0.5650931596755981,
"eval_runtime": 159.1216,
"eval_samples_per_second": 25.006,
"eval_steps_per_second": 3.13,
"step": 1100
},
{
"epoch": 0.52,
"grad_norm": 0.8192782998085022,
"learning_rate": 0.00017389791183294664,
"loss": 0.3584,
"step": 1125
},
{
"epoch": 0.53,
"grad_norm": 2.0657265186309814,
"learning_rate": 0.00017331786542923433,
"loss": 0.2547,
"step": 1150
},
{
"epoch": 0.55,
"grad_norm": 0.5887840390205383,
"learning_rate": 0.00017273781902552203,
"loss": 0.2335,
"step": 1175
},
{
"epoch": 0.56,
"grad_norm": 0.8169906735420227,
"learning_rate": 0.00017215777262180975,
"loss": 0.262,
"step": 1200
},
{
"epoch": 0.56,
"eval_loss": 0.279022216796875,
"eval_na_accuracy": 0.8301158547401428,
"eval_ordinal_accuracy": 0.5801791548728943,
"eval_ordinal_mae": 0.5203233361244202,
"eval_runtime": 159.9167,
"eval_samples_per_second": 24.882,
"eval_steps_per_second": 3.114,
"step": 1200
},
{
"epoch": 0.57,
"grad_norm": 2.5461835861206055,
"learning_rate": 0.00017157772621809744,
"loss": 0.2387,
"step": 1225
},
{
"epoch": 0.58,
"grad_norm": 0.7304142117500305,
"learning_rate": 0.00017099767981438517,
"loss": 0.2366,
"step": 1250
},
{
"epoch": 0.59,
"grad_norm": 1.3845186233520508,
"learning_rate": 0.00017041763341067286,
"loss": 0.2309,
"step": 1275
},
{
"epoch": 0.6,
"grad_norm": 0.5202885270118713,
"learning_rate": 0.00016983758700696058,
"loss": 0.2139,
"step": 1300
},
{
"epoch": 0.6,
"eval_loss": 0.2653418481349945,
"eval_na_accuracy": 0.7509652376174927,
"eval_ordinal_accuracy": 0.5492632389068604,
"eval_ordinal_mae": 0.562603771686554,
"eval_runtime": 158.9921,
"eval_samples_per_second": 25.026,
"eval_steps_per_second": 3.132,
"step": 1300
},
{
"epoch": 0.61,
"grad_norm": 1.6506483554840088,
"learning_rate": 0.00016925754060324828,
"loss": 0.3071,
"step": 1325
},
{
"epoch": 0.63,
"grad_norm": 0.5789369940757751,
"learning_rate": 0.00016867749419953597,
"loss": 0.2689,
"step": 1350
},
{
"epoch": 0.64,
"grad_norm": 0.5665389895439148,
"learning_rate": 0.00016809744779582367,
"loss": 0.2598,
"step": 1375
},
{
"epoch": 0.65,
"grad_norm": 0.6937847137451172,
"learning_rate": 0.0001675174013921114,
"loss": 0.2655,
"step": 1400
},
{
"epoch": 0.65,
"eval_loss": 0.2760397493839264,
"eval_na_accuracy": 0.7123551964759827,
"eval_ordinal_accuracy": 0.5426177382469177,
"eval_ordinal_mae": 0.6106911897659302,
"eval_runtime": 160.1635,
"eval_samples_per_second": 24.843,
"eval_steps_per_second": 3.109,
"step": 1400
},
{
"epoch": 0.65,
"step": 1400,
"total_flos": 1.735882797809664e+18,
"train_loss": 0.29669314997536794,
"train_runtime": 4786.838,
"train_samples_per_second": 28.807,
"train_steps_per_second": 1.801
}
],
"logging_steps": 25,
"max_steps": 8620,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"total_flos": 1.735882797809664e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}