v23/checkpoint-358/trainer_state.json
{
"best_metric": 0.5765425562858582,
"best_model_checkpoint": "v23/checkpoint-358",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 358,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03910614525139665,
"grad_norm": Infinity,
"learning_rate": 1.6666666666666667e-06,
"loss": 2.8181,
"step": 7
},
{
"epoch": 0.0782122905027933,
"grad_norm": 48.7786750793457,
"learning_rate": 5.555555555555556e-06,
"loss": 2.4279,
"step": 14
},
{
"epoch": 0.11731843575418995,
"grad_norm": 16.421829223632812,
"learning_rate": 9.444444444444445e-06,
"loss": 1.9836,
"step": 21
},
{
"epoch": 0.1564245810055866,
"grad_norm": 37.26057052612305,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.637,
"step": 28
},
{
"epoch": 0.19553072625698323,
"grad_norm": 24.32502555847168,
"learning_rate": 1.7222222222222224e-05,
"loss": 1.4235,
"step": 35
},
{
"epoch": 0.2346368715083799,
"grad_norm": 4.514742374420166,
"learning_rate": 2.111111111111111e-05,
"loss": 0.8819,
"step": 42
},
{
"epoch": 0.2737430167597765,
"grad_norm": 17.023542404174805,
"learning_rate": 2.5e-05,
"loss": 0.847,
"step": 49
},
{
"epoch": 0.3128491620111732,
"grad_norm": 18.13689422607422,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.7611,
"step": 56
},
{
"epoch": 0.35195530726256985,
"grad_norm": 21.782562255859375,
"learning_rate": 3.277777777777778e-05,
"loss": 0.7189,
"step": 63
},
{
"epoch": 0.39106145251396646,
"grad_norm": 13.720254898071289,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.7178,
"step": 70
},
{
"epoch": 0.4301675977653631,
"grad_norm": 4.693215847015381,
"learning_rate": 4.055555555555556e-05,
"loss": 0.7092,
"step": 77
},
{
"epoch": 0.4692737430167598,
"grad_norm": 14.818086624145508,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.682,
"step": 84
},
{
"epoch": 0.5083798882681564,
"grad_norm": 26.538074493408203,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.6824,
"step": 91
},
{
"epoch": 0.547486033519553,
"grad_norm": 29.219240188598633,
"learning_rate": 4.975155279503106e-05,
"loss": 0.681,
"step": 98
},
{
"epoch": 0.5865921787709497,
"grad_norm": 8.444730758666992,
"learning_rate": 4.93167701863354e-05,
"loss": 0.7867,
"step": 105
},
{
"epoch": 0.6256983240223464,
"grad_norm": 8.663554191589355,
"learning_rate": 4.888198757763975e-05,
"loss": 0.7689,
"step": 112
},
{
"epoch": 0.664804469273743,
"grad_norm": 15.8271484375,
"learning_rate": 4.8447204968944106e-05,
"loss": 0.8417,
"step": 119
},
{
"epoch": 0.7039106145251397,
"grad_norm": 6.195271968841553,
"learning_rate": 4.801242236024845e-05,
"loss": 0.629,
"step": 126
},
{
"epoch": 0.7430167597765364,
"grad_norm": 4.593315601348877,
"learning_rate": 4.75776397515528e-05,
"loss": 0.6039,
"step": 133
},
{
"epoch": 0.7821229050279329,
"grad_norm": 22.80195426940918,
"learning_rate": 4.714285714285714e-05,
"loss": 0.5824,
"step": 140
},
{
"epoch": 0.8212290502793296,
"grad_norm": 13.558725357055664,
"learning_rate": 4.6770186335403726e-05,
"loss": 0.876,
"step": 147
},
{
"epoch": 0.8603351955307262,
"grad_norm": 4.1830668449401855,
"learning_rate": 4.633540372670807e-05,
"loss": 0.7458,
"step": 154
},
{
"epoch": 0.8994413407821229,
"grad_norm": 14.1422119140625,
"learning_rate": 4.590062111801243e-05,
"loss": 0.6289,
"step": 161
},
{
"epoch": 0.9385474860335196,
"grad_norm": 15.986943244934082,
"learning_rate": 4.546583850931677e-05,
"loss": 0.8139,
"step": 168
},
{
"epoch": 0.9776536312849162,
"grad_norm": 10.396794319152832,
"learning_rate": 4.5031055900621124e-05,
"loss": 0.7859,
"step": 175
},
{
"epoch": 1.0,
"eval_accuracy": 0.6,
"eval_f1_macro": 0.4025140193447718,
"eval_f1_micro": 0.6,
"eval_f1_weighted": 0.4689184747817452,
"eval_loss": 0.7313841581344604,
"eval_precision_macro": 0.5837690631808279,
"eval_precision_micro": 0.6,
"eval_precision_weighted": 0.5306172839506172,
"eval_recall_macro": 0.43363545726457,
"eval_recall_micro": 0.6,
"eval_recall_weighted": 0.6,
"eval_runtime": 0.8236,
"eval_samples_per_second": 382.454,
"eval_steps_per_second": 24.283,
"step": 179
},
{
"epoch": 1.0167597765363128,
"grad_norm": 15.96173095703125,
"learning_rate": 4.4596273291925465e-05,
"loss": 0.5898,
"step": 182
},
{
"epoch": 1.0558659217877095,
"grad_norm": 21.0263729095459,
"learning_rate": 4.416149068322982e-05,
"loss": 0.6724,
"step": 189
},
{
"epoch": 1.094972067039106,
"grad_norm": 8.833733558654785,
"learning_rate": 4.372670807453416e-05,
"loss": 0.5225,
"step": 196
},
{
"epoch": 1.1340782122905029,
"grad_norm": 13.83945083618164,
"learning_rate": 4.3291925465838515e-05,
"loss": 0.6694,
"step": 203
},
{
"epoch": 1.1731843575418994,
"grad_norm": 13.772929191589355,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.5541,
"step": 210
},
{
"epoch": 1.2122905027932962,
"grad_norm": 8.899124145507812,
"learning_rate": 4.2422360248447204e-05,
"loss": 0.713,
"step": 217
},
{
"epoch": 1.2513966480446927,
"grad_norm": 8.794002532958984,
"learning_rate": 4.198757763975156e-05,
"loss": 0.5879,
"step": 224
},
{
"epoch": 1.2905027932960893,
"grad_norm": 4.091240882873535,
"learning_rate": 4.15527950310559e-05,
"loss": 0.5424,
"step": 231
},
{
"epoch": 1.329608938547486,
"grad_norm": 16.86524772644043,
"learning_rate": 4.1118012422360255e-05,
"loss": 0.6323,
"step": 238
},
{
"epoch": 1.3687150837988826,
"grad_norm": 10.920906066894531,
"learning_rate": 4.0683229813664596e-05,
"loss": 0.5727,
"step": 245
},
{
"epoch": 1.4078212290502794,
"grad_norm": 17.164987564086914,
"learning_rate": 4.024844720496895e-05,
"loss": 0.5847,
"step": 252
},
{
"epoch": 1.446927374301676,
"grad_norm": 12.27508544921875,
"learning_rate": 3.981366459627329e-05,
"loss": 0.4845,
"step": 259
},
{
"epoch": 1.4860335195530725,
"grad_norm": 12.798267364501953,
"learning_rate": 3.9378881987577646e-05,
"loss": 0.4762,
"step": 266
},
{
"epoch": 1.5251396648044693,
"grad_norm": 3.783871650695801,
"learning_rate": 3.894409937888199e-05,
"loss": 0.4889,
"step": 273
},
{
"epoch": 1.564245810055866,
"grad_norm": 17.070810317993164,
"learning_rate": 3.8509316770186335e-05,
"loss": 0.5382,
"step": 280
},
{
"epoch": 1.6033519553072626,
"grad_norm": 27.77778434753418,
"learning_rate": 3.807453416149068e-05,
"loss": 0.5336,
"step": 287
},
{
"epoch": 1.6424581005586592,
"grad_norm": 52.91617202758789,
"learning_rate": 3.763975155279503e-05,
"loss": 0.6025,
"step": 294
},
{
"epoch": 1.6815642458100557,
"grad_norm": 17.698348999023438,
"learning_rate": 3.7204968944099385e-05,
"loss": 0.3589,
"step": 301
},
{
"epoch": 1.7206703910614525,
"grad_norm": 14.64693832397461,
"learning_rate": 3.6770186335403726e-05,
"loss": 0.3654,
"step": 308
},
{
"epoch": 1.7597765363128492,
"grad_norm": 13.599970817565918,
"learning_rate": 3.633540372670808e-05,
"loss": 0.6611,
"step": 315
},
{
"epoch": 1.7988826815642458,
"grad_norm": 6.364068984985352,
"learning_rate": 3.590062111801242e-05,
"loss": 0.5289,
"step": 322
},
{
"epoch": 1.8379888268156424,
"grad_norm": 10.628365516662598,
"learning_rate": 3.546583850931677e-05,
"loss": 0.6834,
"step": 329
},
{
"epoch": 1.8770949720670391,
"grad_norm": 7.663080215454102,
"learning_rate": 3.503105590062112e-05,
"loss": 0.4576,
"step": 336
},
{
"epoch": 1.916201117318436,
"grad_norm": 9.863435745239258,
"learning_rate": 3.4596273291925466e-05,
"loss": 0.4385,
"step": 343
},
{
"epoch": 1.9553072625698324,
"grad_norm": 7.04995059967041,
"learning_rate": 3.4161490683229814e-05,
"loss": 0.3589,
"step": 350
},
{
"epoch": 1.994413407821229,
"grad_norm": 12.553130149841309,
"learning_rate": 3.372670807453416e-05,
"loss": 0.6421,
"step": 357
},
{
"epoch": 2.0,
"eval_accuracy": 0.7492063492063492,
"eval_f1_macro": 0.7349736157447978,
"eval_f1_micro": 0.7492063492063492,
"eval_f1_weighted": 0.7506575340838828,
"eval_loss": 0.5765425562858582,
"eval_precision_macro": 0.7150911360799,
"eval_precision_micro": 0.7492063492063492,
"eval_precision_weighted": 0.7534924677486475,
"eval_recall_macro": 0.7595969666757304,
"eval_recall_micro": 0.7492063492063492,
"eval_recall_weighted": 0.7492063492063492,
"eval_runtime": 0.8337,
"eval_samples_per_second": 377.837,
"eval_steps_per_second": 23.99,
"step": 358
}
],
"logging_steps": 7,
"max_steps": 895,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 188389207093248.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
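
For reference, the state above can be inspected with the standard library alone. A minimal sketch, assuming the file sits at the path named in "best_model_checkpoint"; the summary fields and keys used here are exactly those present in the JSON:

import json

# Python's json parser accepts the non-standard "Infinity" literal
# (grad_norm at step 7) by default; stricter JSON parsers may reject it.
with open("v23/checkpoint-358/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
for e in eval_logs:
    print(f"epoch {e['epoch']:.0f}: eval_loss={e['eval_loss']:.4f}, "
          f"accuracy={e['eval_accuracy']:.4f}, f1_macro={e['eval_f1_macro']:.4f}")
print(f"last logged training loss (step {train_logs[-1]['step']}): {train_logs[-1]['loss']}")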