test-hasy-5 / trainer_state.json
khaingsmon's picture
cheers again
9924970 verified
{
"best_metric": 0.8066528066528067,
"best_model_checkpoint": "test-hasy-5/checkpoint-18935",
"epoch": 100.0,
"eval_steps": 500,
"global_step": 54100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.92,
"grad_norm": 4.6960930824279785,
"learning_rate": 1.9815157116451017e-05,
"loss": 3.9645,
"step": 500
},
{
"epoch": 1.0,
"eval_accuracy": 0.3970893970893971,
"eval_loss": 3.429507255554199,
"eval_runtime": 2.0488,
"eval_samples_per_second": 234.77,
"eval_steps_per_second": 29.773,
"step": 541
},
{
"epoch": 1.85,
"grad_norm": 3.364806652069092,
"learning_rate": 1.9630314232902035e-05,
"loss": 3.4258,
"step": 1000
},
{
"epoch": 2.0,
"eval_accuracy": 0.4781704781704782,
"eval_loss": 2.879011392593384,
"eval_runtime": 1.9708,
"eval_samples_per_second": 244.059,
"eval_steps_per_second": 30.951,
"step": 1082
},
{
"epoch": 2.77,
"grad_norm": 3.35432767868042,
"learning_rate": 1.944547134935305e-05,
"loss": 3.04,
"step": 1500
},
{
"epoch": 3.0,
"eval_accuracy": 0.5467775467775468,
"eval_loss": 2.4893012046813965,
"eval_runtime": 2.0175,
"eval_samples_per_second": 238.411,
"eval_steps_per_second": 30.235,
"step": 1623
},
{
"epoch": 3.7,
"grad_norm": 4.459615230560303,
"learning_rate": 1.9260628465804068e-05,
"loss": 2.793,
"step": 2000
},
{
"epoch": 4.0,
"eval_accuracy": 0.5738045738045738,
"eval_loss": 2.2005958557128906,
"eval_runtime": 1.962,
"eval_samples_per_second": 245.161,
"eval_steps_per_second": 31.091,
"step": 2164
},
{
"epoch": 4.62,
"grad_norm": 4.481846332550049,
"learning_rate": 1.9075785582255083e-05,
"loss": 2.5551,
"step": 2500
},
{
"epoch": 5.0,
"eval_accuracy": 0.6340956340956341,
"eval_loss": 1.9055824279785156,
"eval_runtime": 2.0221,
"eval_samples_per_second": 237.875,
"eval_steps_per_second": 30.167,
"step": 2705
},
{
"epoch": 5.55,
"grad_norm": 4.1784281730651855,
"learning_rate": 1.88909426987061e-05,
"loss": 2.3662,
"step": 3000
},
{
"epoch": 6.0,
"eval_accuracy": 0.6632016632016632,
"eval_loss": 1.7023240327835083,
"eval_runtime": 1.9666,
"eval_samples_per_second": 244.588,
"eval_steps_per_second": 31.018,
"step": 3246
},
{
"epoch": 6.47,
"grad_norm": 6.06294059753418,
"learning_rate": 1.8706099815157116e-05,
"loss": 2.1965,
"step": 3500
},
{
"epoch": 7.0,
"eval_accuracy": 0.6798336798336798,
"eval_loss": 1.5739575624465942,
"eval_runtime": 1.9941,
"eval_samples_per_second": 241.216,
"eval_steps_per_second": 30.591,
"step": 3787
},
{
"epoch": 7.39,
"grad_norm": 4.912960052490234,
"learning_rate": 1.8521256931608135e-05,
"loss": 2.1397,
"step": 4000
},
{
"epoch": 8.0,
"eval_accuracy": 0.6943866943866944,
"eval_loss": 1.4560521841049194,
"eval_runtime": 1.9835,
"eval_samples_per_second": 242.507,
"eval_steps_per_second": 30.754,
"step": 4328
},
{
"epoch": 8.32,
"grad_norm": 5.236889362335205,
"learning_rate": 1.833641404805915e-05,
"loss": 1.9955,
"step": 4500
},
{
"epoch": 9.0,
"eval_accuracy": 0.7234927234927235,
"eval_loss": 1.3202540874481201,
"eval_runtime": 2.0536,
"eval_samples_per_second": 234.218,
"eval_steps_per_second": 29.703,
"step": 4869
},
{
"epoch": 9.24,
"grad_norm": 5.675503253936768,
"learning_rate": 1.8151571164510168e-05,
"loss": 1.9282,
"step": 5000
},
{
"epoch": 10.0,
"eval_accuracy": 0.738045738045738,
"eval_loss": 1.2246184349060059,
"eval_runtime": 2.0017,
"eval_samples_per_second": 240.293,
"eval_steps_per_second": 30.474,
"step": 5410
},
{
"epoch": 10.17,
"grad_norm": 4.67825174331665,
"learning_rate": 1.7966728280961186e-05,
"loss": 1.8368,
"step": 5500
},
{
"epoch": 11.0,
"eval_accuracy": 0.738045738045738,
"eval_loss": 1.1823257207870483,
"eval_runtime": 1.9774,
"eval_samples_per_second": 243.246,
"eval_steps_per_second": 30.848,
"step": 5951
},
{
"epoch": 11.09,
"grad_norm": 4.809859275817871,
"learning_rate": 1.77818853974122e-05,
"loss": 1.812,
"step": 6000
},
{
"epoch": 12.0,
"eval_accuracy": 0.7214137214137214,
"eval_loss": 1.1297953128814697,
"eval_runtime": 2.0307,
"eval_samples_per_second": 236.864,
"eval_steps_per_second": 30.039,
"step": 6492
},
{
"epoch": 12.01,
"grad_norm": 5.255190849304199,
"learning_rate": 1.759704251386322e-05,
"loss": 1.7353,
"step": 6500
},
{
"epoch": 12.94,
"grad_norm": 8.597217559814453,
"learning_rate": 1.7412199630314234e-05,
"loss": 1.7195,
"step": 7000
},
{
"epoch": 13.0,
"eval_accuracy": 0.7484407484407485,
"eval_loss": 1.0423070192337036,
"eval_runtime": 2.0193,
"eval_samples_per_second": 238.201,
"eval_steps_per_second": 30.208,
"step": 7033
},
{
"epoch": 13.86,
"grad_norm": 6.453842639923096,
"learning_rate": 1.7227356746765253e-05,
"loss": 1.6314,
"step": 7500
},
{
"epoch": 14.0,
"eval_accuracy": 0.7422037422037422,
"eval_loss": 1.0077309608459473,
"eval_runtime": 2.0783,
"eval_samples_per_second": 231.439,
"eval_steps_per_second": 29.351,
"step": 7574
},
{
"epoch": 14.79,
"grad_norm": 8.70645523071289,
"learning_rate": 1.7042513863216268e-05,
"loss": 1.5979,
"step": 8000
},
{
"epoch": 15.0,
"eval_accuracy": 0.7463617463617463,
"eval_loss": 1.00509512424469,
"eval_runtime": 1.9889,
"eval_samples_per_second": 241.847,
"eval_steps_per_second": 30.671,
"step": 8115
},
{
"epoch": 15.71,
"grad_norm": 7.348147392272949,
"learning_rate": 1.6857670979667286e-05,
"loss": 1.5656,
"step": 8500
},
{
"epoch": 16.0,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.9325113296508789,
"eval_runtime": 1.9923,
"eval_samples_per_second": 241.43,
"eval_steps_per_second": 30.618,
"step": 8656
},
{
"epoch": 16.64,
"grad_norm": 6.420931816101074,
"learning_rate": 1.66728280961183e-05,
"loss": 1.5414,
"step": 9000
},
{
"epoch": 17.0,
"eval_accuracy": 0.7733887733887734,
"eval_loss": 0.8889437913894653,
"eval_runtime": 1.9727,
"eval_samples_per_second": 243.822,
"eval_steps_per_second": 30.921,
"step": 9197
},
{
"epoch": 17.56,
"grad_norm": 8.127350807189941,
"learning_rate": 1.6487985212569316e-05,
"loss": 1.5342,
"step": 9500
},
{
"epoch": 18.0,
"eval_accuracy": 0.7484407484407485,
"eval_loss": 0.9072721600532532,
"eval_runtime": 1.9933,
"eval_samples_per_second": 241.303,
"eval_steps_per_second": 30.602,
"step": 9738
},
{
"epoch": 18.48,
"grad_norm": 6.122061252593994,
"learning_rate": 1.6303142329020334e-05,
"loss": 1.4898,
"step": 10000
},
{
"epoch": 19.0,
"eval_accuracy": 0.7713097713097713,
"eval_loss": 0.8425627946853638,
"eval_runtime": 1.9868,
"eval_samples_per_second": 242.099,
"eval_steps_per_second": 30.703,
"step": 10279
},
{
"epoch": 19.41,
"grad_norm": 6.640945911407471,
"learning_rate": 1.611829944547135e-05,
"loss": 1.4731,
"step": 10500
},
{
"epoch": 20.0,
"eval_accuracy": 0.7442827442827443,
"eval_loss": 0.862506091594696,
"eval_runtime": 1.9786,
"eval_samples_per_second": 243.096,
"eval_steps_per_second": 30.829,
"step": 10820
},
{
"epoch": 20.33,
"grad_norm": 6.019400119781494,
"learning_rate": 1.5933456561922367e-05,
"loss": 1.451,
"step": 11000
},
{
"epoch": 21.0,
"eval_accuracy": 0.762993762993763,
"eval_loss": 0.8015209436416626,
"eval_runtime": 1.9644,
"eval_samples_per_second": 244.864,
"eval_steps_per_second": 31.053,
"step": 11361
},
{
"epoch": 21.26,
"grad_norm": 5.140503406524658,
"learning_rate": 1.5748613678373382e-05,
"loss": 1.4578,
"step": 11500
},
{
"epoch": 22.0,
"eval_accuracy": 0.7588357588357588,
"eval_loss": 0.8520306944847107,
"eval_runtime": 2.0001,
"eval_samples_per_second": 240.484,
"eval_steps_per_second": 30.498,
"step": 11902
},
{
"epoch": 22.18,
"grad_norm": 15.190984725952148,
"learning_rate": 1.55637707948244e-05,
"loss": 1.4126,
"step": 12000
},
{
"epoch": 23.0,
"eval_accuracy": 0.7713097713097713,
"eval_loss": 0.7928301692008972,
"eval_runtime": 1.9822,
"eval_samples_per_second": 242.66,
"eval_steps_per_second": 30.774,
"step": 12443
},
{
"epoch": 23.11,
"grad_norm": 11.220525741577148,
"learning_rate": 1.5378927911275416e-05,
"loss": 1.3626,
"step": 12500
},
{
"epoch": 24.0,
"eval_accuracy": 0.7837837837837838,
"eval_loss": 0.754388689994812,
"eval_runtime": 1.975,
"eval_samples_per_second": 243.545,
"eval_steps_per_second": 30.886,
"step": 12984
},
{
"epoch": 24.03,
"grad_norm": 3.5185582637786865,
"learning_rate": 1.5194085027726432e-05,
"loss": 1.3905,
"step": 13000
},
{
"epoch": 24.95,
"grad_norm": 8.19352912902832,
"learning_rate": 1.5009242144177449e-05,
"loss": 1.3694,
"step": 13500
},
{
"epoch": 25.0,
"eval_accuracy": 0.7775467775467776,
"eval_loss": 0.7698755860328674,
"eval_runtime": 2.0179,
"eval_samples_per_second": 238.368,
"eval_steps_per_second": 30.23,
"step": 13525
},
{
"epoch": 25.88,
"grad_norm": 6.003907680511475,
"learning_rate": 1.4824399260628467e-05,
"loss": 1.3612,
"step": 14000
},
{
"epoch": 26.0,
"eval_accuracy": 0.7775467775467776,
"eval_loss": 0.7602183818817139,
"eval_runtime": 1.9833,
"eval_samples_per_second": 242.521,
"eval_steps_per_second": 30.756,
"step": 14066
},
{
"epoch": 26.8,
"grad_norm": 6.613931655883789,
"learning_rate": 1.4639556377079484e-05,
"loss": 1.2963,
"step": 14500
},
{
"epoch": 27.0,
"eval_accuracy": 0.7713097713097713,
"eval_loss": 0.7532169818878174,
"eval_runtime": 2.0706,
"eval_samples_per_second": 232.305,
"eval_steps_per_second": 29.461,
"step": 14607
},
{
"epoch": 27.73,
"grad_norm": 7.66683292388916,
"learning_rate": 1.44547134935305e-05,
"loss": 1.3009,
"step": 15000
},
{
"epoch": 28.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.7012535929679871,
"eval_runtime": 1.9606,
"eval_samples_per_second": 245.335,
"eval_steps_per_second": 31.113,
"step": 15148
},
{
"epoch": 28.65,
"grad_norm": 7.342077255249023,
"learning_rate": 1.4269870609981517e-05,
"loss": 1.2598,
"step": 15500
},
{
"epoch": 29.0,
"eval_accuracy": 0.7796257796257796,
"eval_loss": 0.7084705233573914,
"eval_runtime": 1.9824,
"eval_samples_per_second": 242.632,
"eval_steps_per_second": 30.77,
"step": 15689
},
{
"epoch": 29.57,
"grad_norm": 5.679790019989014,
"learning_rate": 1.4085027726432534e-05,
"loss": 1.2565,
"step": 16000
},
{
"epoch": 30.0,
"eval_accuracy": 0.7775467775467776,
"eval_loss": 0.7023281455039978,
"eval_runtime": 1.9659,
"eval_samples_per_second": 244.668,
"eval_steps_per_second": 31.029,
"step": 16230
},
{
"epoch": 30.5,
"grad_norm": 5.493412971496582,
"learning_rate": 1.390018484288355e-05,
"loss": 1.2735,
"step": 16500
},
{
"epoch": 31.0,
"eval_accuracy": 0.7775467775467776,
"eval_loss": 0.7047860026359558,
"eval_runtime": 1.9718,
"eval_samples_per_second": 243.937,
"eval_steps_per_second": 30.936,
"step": 16771
},
{
"epoch": 31.42,
"grad_norm": 6.2688093185424805,
"learning_rate": 1.3715341959334567e-05,
"loss": 1.2743,
"step": 17000
},
{
"epoch": 32.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6794067621231079,
"eval_runtime": 1.9764,
"eval_samples_per_second": 243.372,
"eval_steps_per_second": 30.864,
"step": 17312
},
{
"epoch": 32.35,
"grad_norm": 10.169917106628418,
"learning_rate": 1.3530499075785584e-05,
"loss": 1.2441,
"step": 17500
},
{
"epoch": 33.0,
"eval_accuracy": 0.7858627858627859,
"eval_loss": 0.693196713924408,
"eval_runtime": 1.972,
"eval_samples_per_second": 243.92,
"eval_steps_per_second": 30.934,
"step": 17853
},
{
"epoch": 33.27,
"grad_norm": 8.05045223236084,
"learning_rate": 1.33456561922366e-05,
"loss": 1.2282,
"step": 18000
},
{
"epoch": 34.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.7038751840591431,
"eval_runtime": 1.992,
"eval_samples_per_second": 241.466,
"eval_steps_per_second": 30.623,
"step": 18394
},
{
"epoch": 34.2,
"grad_norm": 5.410665035247803,
"learning_rate": 1.3160813308687617e-05,
"loss": 1.2204,
"step": 18500
},
{
"epoch": 35.0,
"eval_accuracy": 0.8066528066528067,
"eval_loss": 0.6860660910606384,
"eval_runtime": 1.9773,
"eval_samples_per_second": 243.258,
"eval_steps_per_second": 30.85,
"step": 18935
},
{
"epoch": 35.12,
"grad_norm": 11.123208045959473,
"learning_rate": 1.2975970425138634e-05,
"loss": 1.1808,
"step": 19000
},
{
"epoch": 36.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.6589930057525635,
"eval_runtime": 1.9769,
"eval_samples_per_second": 243.305,
"eval_steps_per_second": 30.856,
"step": 19476
},
{
"epoch": 36.04,
"grad_norm": 6.165465354919434,
"learning_rate": 1.279112754158965e-05,
"loss": 1.1933,
"step": 19500
},
{
"epoch": 36.97,
"grad_norm": 6.407803535461426,
"learning_rate": 1.2606284658040667e-05,
"loss": 1.1928,
"step": 20000
},
{
"epoch": 37.0,
"eval_accuracy": 0.7817047817047817,
"eval_loss": 0.678415834903717,
"eval_runtime": 1.9844,
"eval_samples_per_second": 242.388,
"eval_steps_per_second": 30.739,
"step": 20017
},
{
"epoch": 37.89,
"grad_norm": 4.849668979644775,
"learning_rate": 1.2421441774491683e-05,
"loss": 1.1914,
"step": 20500
},
{
"epoch": 38.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.6559053659439087,
"eval_runtime": 1.9912,
"eval_samples_per_second": 241.56,
"eval_steps_per_second": 30.634,
"step": 20558
},
{
"epoch": 38.82,
"grad_norm": 9.1309232711792,
"learning_rate": 1.2236598890942698e-05,
"loss": 1.1856,
"step": 21000
},
{
"epoch": 39.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.6769025325775146,
"eval_runtime": 2.0066,
"eval_samples_per_second": 239.713,
"eval_steps_per_second": 30.4,
"step": 21099
},
{
"epoch": 39.74,
"grad_norm": 5.001546382904053,
"learning_rate": 1.2051756007393715e-05,
"loss": 1.1585,
"step": 21500
},
{
"epoch": 40.0,
"eval_accuracy": 0.8004158004158004,
"eval_loss": 0.64976966381073,
"eval_runtime": 1.9804,
"eval_samples_per_second": 242.874,
"eval_steps_per_second": 30.801,
"step": 21640
},
{
"epoch": 40.67,
"grad_norm": 14.044866561889648,
"learning_rate": 1.1866913123844732e-05,
"loss": 1.1713,
"step": 22000
},
{
"epoch": 41.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6447434425354004,
"eval_runtime": 1.9973,
"eval_samples_per_second": 240.829,
"eval_steps_per_second": 30.542,
"step": 22181
},
{
"epoch": 41.59,
"grad_norm": 10.289350509643555,
"learning_rate": 1.1682070240295748e-05,
"loss": 1.1183,
"step": 22500
},
{
"epoch": 42.0,
"eval_accuracy": 0.7713097713097713,
"eval_loss": 0.6748064756393433,
"eval_runtime": 1.9672,
"eval_samples_per_second": 244.509,
"eval_steps_per_second": 31.008,
"step": 22722
},
{
"epoch": 42.51,
"grad_norm": 12.7116117477417,
"learning_rate": 1.1497227356746765e-05,
"loss": 1.1564,
"step": 23000
},
{
"epoch": 43.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6545261740684509,
"eval_runtime": 1.9659,
"eval_samples_per_second": 244.669,
"eval_steps_per_second": 31.029,
"step": 23263
},
{
"epoch": 43.44,
"grad_norm": 3.0720624923706055,
"learning_rate": 1.1312384473197783e-05,
"loss": 1.1215,
"step": 23500
},
{
"epoch": 44.0,
"eval_accuracy": 0.7879417879417879,
"eval_loss": 0.6690270900726318,
"eval_runtime": 1.9635,
"eval_samples_per_second": 244.971,
"eval_steps_per_second": 31.067,
"step": 23804
},
{
"epoch": 44.36,
"grad_norm": 7.927094459533691,
"learning_rate": 1.11275415896488e-05,
"loss": 1.1008,
"step": 24000
},
{
"epoch": 45.0,
"eval_accuracy": 0.7879417879417879,
"eval_loss": 0.659792423248291,
"eval_runtime": 1.9747,
"eval_samples_per_second": 243.578,
"eval_steps_per_second": 30.89,
"step": 24345
},
{
"epoch": 45.29,
"grad_norm": 8.912357330322266,
"learning_rate": 1.0942698706099817e-05,
"loss": 1.1344,
"step": 24500
},
{
"epoch": 46.0,
"eval_accuracy": 0.8024948024948025,
"eval_loss": 0.6550182104110718,
"eval_runtime": 2.0112,
"eval_samples_per_second": 239.156,
"eval_steps_per_second": 30.33,
"step": 24886
},
{
"epoch": 46.21,
"grad_norm": 9.598004341125488,
"learning_rate": 1.0757855822550833e-05,
"loss": 1.126,
"step": 25000
},
{
"epoch": 47.0,
"eval_accuracy": 0.7858627858627859,
"eval_loss": 0.6521425247192383,
"eval_runtime": 1.9713,
"eval_samples_per_second": 244.004,
"eval_steps_per_second": 30.944,
"step": 25427
},
{
"epoch": 47.13,
"grad_norm": 4.670881271362305,
"learning_rate": 1.057301293900185e-05,
"loss": 1.125,
"step": 25500
},
{
"epoch": 48.0,
"eval_accuracy": 0.7817047817047817,
"eval_loss": 0.6812848448753357,
"eval_runtime": 2.016,
"eval_samples_per_second": 238.588,
"eval_steps_per_second": 30.258,
"step": 25968
},
{
"epoch": 48.06,
"grad_norm": 8.11451244354248,
"learning_rate": 1.0388170055452866e-05,
"loss": 1.0682,
"step": 26000
},
{
"epoch": 48.98,
"grad_norm": 8.960821151733398,
"learning_rate": 1.0203327171903883e-05,
"loss": 1.0855,
"step": 26500
},
{
"epoch": 49.0,
"eval_accuracy": 0.7858627858627859,
"eval_loss": 0.6419298052787781,
"eval_runtime": 1.974,
"eval_samples_per_second": 243.673,
"eval_steps_per_second": 30.902,
"step": 26509
},
{
"epoch": 49.91,
"grad_norm": 3.053118944168091,
"learning_rate": 1.00184842883549e-05,
"loss": 1.0452,
"step": 27000
},
{
"epoch": 50.0,
"eval_accuracy": 0.8004158004158004,
"eval_loss": 0.6550863981246948,
"eval_runtime": 2.0504,
"eval_samples_per_second": 234.587,
"eval_steps_per_second": 29.75,
"step": 27050
},
{
"epoch": 50.83,
"grad_norm": 5.4594340324401855,
"learning_rate": 9.833641404805916e-06,
"loss": 1.0626,
"step": 27500
},
{
"epoch": 51.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6675499081611633,
"eval_runtime": 1.9991,
"eval_samples_per_second": 240.605,
"eval_steps_per_second": 30.513,
"step": 27591
},
{
"epoch": 51.76,
"grad_norm": 8.158236503601074,
"learning_rate": 9.648798521256933e-06,
"loss": 1.0155,
"step": 28000
},
{
"epoch": 52.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6945971846580505,
"eval_runtime": 1.9873,
"eval_samples_per_second": 242.042,
"eval_steps_per_second": 30.696,
"step": 28132
},
{
"epoch": 52.68,
"grad_norm": 5.626604080200195,
"learning_rate": 9.46395563770795e-06,
"loss": 1.0319,
"step": 28500
},
{
"epoch": 53.0,
"eval_accuracy": 0.7796257796257796,
"eval_loss": 0.6942130923271179,
"eval_runtime": 1.966,
"eval_samples_per_second": 244.665,
"eval_steps_per_second": 31.028,
"step": 28673
},
{
"epoch": 53.6,
"grad_norm": 6.82182502746582,
"learning_rate": 9.279112754158966e-06,
"loss": 1.0488,
"step": 29000
},
{
"epoch": 54.0,
"eval_accuracy": 0.7983367983367984,
"eval_loss": 0.6496003866195679,
"eval_runtime": 2.0069,
"eval_samples_per_second": 239.673,
"eval_steps_per_second": 30.395,
"step": 29214
},
{
"epoch": 54.53,
"grad_norm": 7.865675926208496,
"learning_rate": 9.094269870609981e-06,
"loss": 1.0558,
"step": 29500
},
{
"epoch": 55.0,
"eval_accuracy": 0.8045738045738046,
"eval_loss": 0.6465332508087158,
"eval_runtime": 1.9938,
"eval_samples_per_second": 241.25,
"eval_steps_per_second": 30.595,
"step": 29755
},
{
"epoch": 55.45,
"grad_norm": 7.172035217285156,
"learning_rate": 8.909426987060998e-06,
"loss": 0.9913,
"step": 30000
},
{
"epoch": 56.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6654109954833984,
"eval_runtime": 1.9693,
"eval_samples_per_second": 244.248,
"eval_steps_per_second": 30.975,
"step": 30296
},
{
"epoch": 56.38,
"grad_norm": 6.30518102645874,
"learning_rate": 8.724584103512016e-06,
"loss": 1.0555,
"step": 30500
},
{
"epoch": 57.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.656141996383667,
"eval_runtime": 1.9741,
"eval_samples_per_second": 243.66,
"eval_steps_per_second": 30.901,
"step": 30837
},
{
"epoch": 57.3,
"grad_norm": 3.0917370319366455,
"learning_rate": 8.539741219963033e-06,
"loss": 0.9803,
"step": 31000
},
{
"epoch": 58.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.673220157623291,
"eval_runtime": 1.993,
"eval_samples_per_second": 241.346,
"eval_steps_per_second": 30.607,
"step": 31378
},
{
"epoch": 58.23,
"grad_norm": 8.285308837890625,
"learning_rate": 8.35489833641405e-06,
"loss": 1.0393,
"step": 31500
},
{
"epoch": 59.0,
"eval_accuracy": 0.7817047817047817,
"eval_loss": 0.6892696619033813,
"eval_runtime": 1.9798,
"eval_samples_per_second": 242.949,
"eval_steps_per_second": 30.811,
"step": 31919
},
{
"epoch": 59.15,
"grad_norm": 3.1396327018737793,
"learning_rate": 8.170055452865066e-06,
"loss": 0.9677,
"step": 32000
},
{
"epoch": 60.0,
"eval_accuracy": 0.8045738045738046,
"eval_loss": 0.6823599934577942,
"eval_runtime": 2.0127,
"eval_samples_per_second": 238.985,
"eval_steps_per_second": 30.308,
"step": 32460
},
{
"epoch": 60.07,
"grad_norm": 12.875879287719727,
"learning_rate": 7.985212569316083e-06,
"loss": 1.0366,
"step": 32500
},
{
"epoch": 61.0,
"grad_norm": 8.770364761352539,
"learning_rate": 7.8003696857671e-06,
"loss": 1.0082,
"step": 33000
},
{
"epoch": 61.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.6618274450302124,
"eval_runtime": 2.0008,
"eval_samples_per_second": 240.403,
"eval_steps_per_second": 30.488,
"step": 33001
},
{
"epoch": 61.92,
"grad_norm": 6.0600972175598145,
"learning_rate": 7.615526802218115e-06,
"loss": 1.0096,
"step": 33500
},
{
"epoch": 62.0,
"eval_accuracy": 0.7837837837837838,
"eval_loss": 0.6691136360168457,
"eval_runtime": 1.9817,
"eval_samples_per_second": 242.719,
"eval_steps_per_second": 30.781,
"step": 33542
},
{
"epoch": 62.85,
"grad_norm": 10.777630805969238,
"learning_rate": 7.430683918669132e-06,
"loss": 0.9685,
"step": 34000
},
{
"epoch": 63.0,
"eval_accuracy": 0.8024948024948025,
"eval_loss": 0.6792653203010559,
"eval_runtime": 2.0645,
"eval_samples_per_second": 232.985,
"eval_steps_per_second": 29.547,
"step": 34083
},
{
"epoch": 63.77,
"grad_norm": 3.9615447521209717,
"learning_rate": 7.245841035120148e-06,
"loss": 0.9847,
"step": 34500
},
{
"epoch": 64.0,
"eval_accuracy": 0.7837837837837838,
"eval_loss": 0.6894533634185791,
"eval_runtime": 2.0054,
"eval_samples_per_second": 239.847,
"eval_steps_per_second": 30.417,
"step": 34624
},
{
"epoch": 64.7,
"grad_norm": 9.38687801361084,
"learning_rate": 7.060998151571166e-06,
"loss": 0.9639,
"step": 35000
},
{
"epoch": 65.0,
"eval_accuracy": 0.7733887733887734,
"eval_loss": 0.7297117114067078,
"eval_runtime": 2.0147,
"eval_samples_per_second": 238.744,
"eval_steps_per_second": 30.277,
"step": 35165
},
{
"epoch": 65.62,
"grad_norm": 12.292973518371582,
"learning_rate": 6.876155268022182e-06,
"loss": 0.9776,
"step": 35500
},
{
"epoch": 66.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.6561179757118225,
"eval_runtime": 1.9845,
"eval_samples_per_second": 242.381,
"eval_steps_per_second": 30.739,
"step": 35706
},
{
"epoch": 66.54,
"grad_norm": 14.023015022277832,
"learning_rate": 6.691312384473199e-06,
"loss": 1.0074,
"step": 36000
},
{
"epoch": 67.0,
"eval_accuracy": 0.7775467775467776,
"eval_loss": 0.6998913884162903,
"eval_runtime": 1.9686,
"eval_samples_per_second": 244.338,
"eval_steps_per_second": 30.987,
"step": 36247
},
{
"epoch": 67.47,
"grad_norm": 13.870222091674805,
"learning_rate": 6.506469500924215e-06,
"loss": 0.9466,
"step": 36500
},
{
"epoch": 68.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.6880961656570435,
"eval_runtime": 1.9686,
"eval_samples_per_second": 244.34,
"eval_steps_per_second": 30.987,
"step": 36788
},
{
"epoch": 68.39,
"grad_norm": 6.1949639320373535,
"learning_rate": 6.321626617375231e-06,
"loss": 0.9425,
"step": 37000
},
{
"epoch": 69.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.6805587410926819,
"eval_runtime": 1.9709,
"eval_samples_per_second": 244.052,
"eval_steps_per_second": 30.95,
"step": 37329
},
{
"epoch": 69.32,
"grad_norm": 7.145143508911133,
"learning_rate": 6.136783733826248e-06,
"loss": 0.9594,
"step": 37500
},
{
"epoch": 70.0,
"eval_accuracy": 0.7900207900207901,
"eval_loss": 0.7202461361885071,
"eval_runtime": 2.0125,
"eval_samples_per_second": 239.001,
"eval_steps_per_second": 30.31,
"step": 37870
},
{
"epoch": 70.24,
"grad_norm": 9.215810775756836,
"learning_rate": 5.951940850277265e-06,
"loss": 0.9311,
"step": 38000
},
{
"epoch": 71.0,
"eval_accuracy": 0.7754677754677755,
"eval_loss": 0.7161967754364014,
"eval_runtime": 1.977,
"eval_samples_per_second": 243.297,
"eval_steps_per_second": 30.855,
"step": 38411
},
{
"epoch": 71.16,
"grad_norm": 6.461187362670898,
"learning_rate": 5.767097966728281e-06,
"loss": 0.9429,
"step": 38500
},
{
"epoch": 72.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.7284368276596069,
"eval_runtime": 2.0663,
"eval_samples_per_second": 232.783,
"eval_steps_per_second": 29.521,
"step": 38952
},
{
"epoch": 72.09,
"grad_norm": 11.850204467773438,
"learning_rate": 5.582255083179298e-06,
"loss": 0.9666,
"step": 39000
},
{
"epoch": 73.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.6871474981307983,
"eval_runtime": 1.97,
"eval_samples_per_second": 244.162,
"eval_steps_per_second": 30.964,
"step": 39493
},
{
"epoch": 73.01,
"grad_norm": 8.0579252243042,
"learning_rate": 5.3974121996303146e-06,
"loss": 0.932,
"step": 39500
},
{
"epoch": 73.94,
"grad_norm": 1.1972132921218872,
"learning_rate": 5.212569316081332e-06,
"loss": 0.945,
"step": 40000
},
{
"epoch": 74.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.6778899431228638,
"eval_runtime": 2.0376,
"eval_samples_per_second": 236.067,
"eval_steps_per_second": 29.938,
"step": 40034
},
{
"epoch": 74.86,
"grad_norm": 5.484439849853516,
"learning_rate": 5.027726432532349e-06,
"loss": 0.9387,
"step": 40500
},
{
"epoch": 75.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.735752522945404,
"eval_runtime": 1.9762,
"eval_samples_per_second": 243.395,
"eval_steps_per_second": 30.867,
"step": 40575
},
{
"epoch": 75.79,
"grad_norm": 2.0908420085906982,
"learning_rate": 4.8428835489833645e-06,
"loss": 0.9132,
"step": 41000
},
{
"epoch": 76.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.7043733596801758,
"eval_runtime": 1.9764,
"eval_samples_per_second": 243.367,
"eval_steps_per_second": 30.864,
"step": 41116
},
{
"epoch": 76.71,
"grad_norm": 10.380330085754395,
"learning_rate": 4.658040665434381e-06,
"loss": 0.9181,
"step": 41500
},
{
"epoch": 77.0,
"eval_accuracy": 0.7962577962577962,
"eval_loss": 0.7041053771972656,
"eval_runtime": 2.0006,
"eval_samples_per_second": 240.43,
"eval_steps_per_second": 30.491,
"step": 41657
},
{
"epoch": 77.63,
"grad_norm": 9.135781288146973,
"learning_rate": 4.473197781885398e-06,
"loss": 0.9218,
"step": 42000
},
{
"epoch": 78.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.6986111998558044,
"eval_runtime": 1.9688,
"eval_samples_per_second": 244.309,
"eval_steps_per_second": 30.983,
"step": 42198
},
{
"epoch": 78.56,
"grad_norm": 17.338001251220703,
"learning_rate": 4.288354898336414e-06,
"loss": 0.8621,
"step": 42500
},
{
"epoch": 79.0,
"eval_accuracy": 0.8004158004158004,
"eval_loss": 0.6909247040748596,
"eval_runtime": 1.997,
"eval_samples_per_second": 240.86,
"eval_steps_per_second": 30.546,
"step": 42739
},
{
"epoch": 79.48,
"grad_norm": 6.793923854827881,
"learning_rate": 4.103512014787431e-06,
"loss": 0.9236,
"step": 43000
},
{
"epoch": 80.0,
"eval_accuracy": 0.7983367983367984,
"eval_loss": 0.7135599851608276,
"eval_runtime": 1.9949,
"eval_samples_per_second": 241.121,
"eval_steps_per_second": 30.579,
"step": 43280
},
{
"epoch": 80.41,
"grad_norm": 3.9345781803131104,
"learning_rate": 3.918669131238448e-06,
"loss": 0.8667,
"step": 43500
},
{
"epoch": 81.0,
"eval_accuracy": 0.8024948024948025,
"eval_loss": 0.7008742094039917,
"eval_runtime": 1.992,
"eval_samples_per_second": 241.461,
"eval_steps_per_second": 30.622,
"step": 43821
},
{
"epoch": 81.33,
"grad_norm": 16.883420944213867,
"learning_rate": 3.7338262476894642e-06,
"loss": 0.8856,
"step": 44000
},
{
"epoch": 82.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.7127683162689209,
"eval_runtime": 2.0451,
"eval_samples_per_second": 235.201,
"eval_steps_per_second": 29.828,
"step": 44362
},
{
"epoch": 82.26,
"grad_norm": 7.969069480895996,
"learning_rate": 3.548983364140481e-06,
"loss": 0.917,
"step": 44500
},
{
"epoch": 83.0,
"eval_accuracy": 0.7983367983367984,
"eval_loss": 0.7134777307510376,
"eval_runtime": 2.0529,
"eval_samples_per_second": 234.298,
"eval_steps_per_second": 29.714,
"step": 44903
},
{
"epoch": 83.18,
"grad_norm": 1.545163631439209,
"learning_rate": 3.3641404805914975e-06,
"loss": 0.8835,
"step": 45000
},
{
"epoch": 84.0,
"eval_accuracy": 0.7900207900207901,
"eval_loss": 0.7295302748680115,
"eval_runtime": 1.9747,
"eval_samples_per_second": 243.583,
"eval_steps_per_second": 30.891,
"step": 45444
},
{
"epoch": 84.1,
"grad_norm": 5.072544097900391,
"learning_rate": 3.1792975970425146e-06,
"loss": 0.8879,
"step": 45500
},
{
"epoch": 85.0,
"eval_accuracy": 0.7900207900207901,
"eval_loss": 0.7449509501457214,
"eval_runtime": 2.0759,
"eval_samples_per_second": 231.71,
"eval_steps_per_second": 29.385,
"step": 45985
},
{
"epoch": 85.03,
"grad_norm": 5.645694732666016,
"learning_rate": 2.9944547134935308e-06,
"loss": 0.9114,
"step": 46000
},
{
"epoch": 85.95,
"grad_norm": 5.065194129943848,
"learning_rate": 2.8096118299445474e-06,
"loss": 0.8764,
"step": 46500
},
{
"epoch": 86.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.7362204194068909,
"eval_runtime": 1.9869,
"eval_samples_per_second": 242.081,
"eval_steps_per_second": 30.7,
"step": 46526
},
{
"epoch": 86.88,
"grad_norm": 5.654088020324707,
"learning_rate": 2.624768946395564e-06,
"loss": 0.8674,
"step": 47000
},
{
"epoch": 87.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.7232093811035156,
"eval_runtime": 2.0232,
"eval_samples_per_second": 237.746,
"eval_steps_per_second": 30.151,
"step": 47067
},
{
"epoch": 87.8,
"grad_norm": 12.72859001159668,
"learning_rate": 2.4399260628465807e-06,
"loss": 0.8583,
"step": 47500
},
{
"epoch": 88.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.7407870888710022,
"eval_runtime": 1.9934,
"eval_samples_per_second": 241.296,
"eval_steps_per_second": 30.601,
"step": 47608
},
{
"epoch": 88.72,
"grad_norm": 6.526777744293213,
"learning_rate": 2.2550831792975973e-06,
"loss": 0.881,
"step": 48000
},
{
"epoch": 89.0,
"eval_accuracy": 0.8004158004158004,
"eval_loss": 0.7377821803092957,
"eval_runtime": 1.9802,
"eval_samples_per_second": 242.901,
"eval_steps_per_second": 30.804,
"step": 48149
},
{
"epoch": 89.65,
"grad_norm": 8.497318267822266,
"learning_rate": 2.070240295748614e-06,
"loss": 0.8668,
"step": 48500
},
{
"epoch": 90.0,
"eval_accuracy": 0.7900207900207901,
"eval_loss": 0.7473007440567017,
"eval_runtime": 2.0137,
"eval_samples_per_second": 238.867,
"eval_steps_per_second": 30.293,
"step": 48690
},
{
"epoch": 90.57,
"grad_norm": 6.455136775970459,
"learning_rate": 1.8853974121996305e-06,
"loss": 0.8779,
"step": 49000
},
{
"epoch": 91.0,
"eval_accuracy": 0.7983367983367984,
"eval_loss": 0.7438368201255798,
"eval_runtime": 1.9731,
"eval_samples_per_second": 243.774,
"eval_steps_per_second": 30.915,
"step": 49231
},
{
"epoch": 91.5,
"grad_norm": 5.713993072509766,
"learning_rate": 1.700554528650647e-06,
"loss": 0.8717,
"step": 49500
},
{
"epoch": 92.0,
"eval_accuracy": 0.8004158004158004,
"eval_loss": 0.7389739751815796,
"eval_runtime": 1.9686,
"eval_samples_per_second": 244.34,
"eval_steps_per_second": 30.987,
"step": 49772
},
{
"epoch": 92.42,
"grad_norm": 5.342690467834473,
"learning_rate": 1.5157116451016638e-06,
"loss": 0.8781,
"step": 50000
},
{
"epoch": 93.0,
"eval_accuracy": 0.7983367983367984,
"eval_loss": 0.7473535537719727,
"eval_runtime": 1.98,
"eval_samples_per_second": 242.926,
"eval_steps_per_second": 30.808,
"step": 50313
},
{
"epoch": 93.35,
"grad_norm": 9.870634078979492,
"learning_rate": 1.3308687615526802e-06,
"loss": 0.8845,
"step": 50500
},
{
"epoch": 94.0,
"eval_accuracy": 0.7900207900207901,
"eval_loss": 0.7445840835571289,
"eval_runtime": 1.9776,
"eval_samples_per_second": 243.222,
"eval_steps_per_second": 30.845,
"step": 50854
},
{
"epoch": 94.27,
"grad_norm": 8.909347534179688,
"learning_rate": 1.1460258780036969e-06,
"loss": 0.8623,
"step": 51000
},
{
"epoch": 95.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.7315581440925598,
"eval_runtime": 1.9728,
"eval_samples_per_second": 243.814,
"eval_steps_per_second": 30.92,
"step": 51395
},
{
"epoch": 95.19,
"grad_norm": 10.748625755310059,
"learning_rate": 9.611829944547135e-07,
"loss": 0.8341,
"step": 51500
},
{
"epoch": 96.0,
"eval_accuracy": 0.7879417879417879,
"eval_loss": 0.7457364201545715,
"eval_runtime": 2.0017,
"eval_samples_per_second": 240.29,
"eval_steps_per_second": 30.473,
"step": 51936
},
{
"epoch": 96.12,
"grad_norm": 3.179774761199951,
"learning_rate": 7.763401109057302e-07,
"loss": 0.8766,
"step": 52000
},
{
"epoch": 97.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.743617832660675,
"eval_runtime": 1.999,
"eval_samples_per_second": 240.625,
"eval_steps_per_second": 30.516,
"step": 52477
},
{
"epoch": 97.04,
"grad_norm": 12.243720054626465,
"learning_rate": 5.914972273567468e-07,
"loss": 0.8101,
"step": 52500
},
{
"epoch": 97.97,
"grad_norm": 18.670886993408203,
"learning_rate": 4.066543438077634e-07,
"loss": 0.8681,
"step": 53000
},
{
"epoch": 98.0,
"eval_accuracy": 0.7900207900207901,
"eval_loss": 0.7483807802200317,
"eval_runtime": 2.0039,
"eval_samples_per_second": 240.035,
"eval_steps_per_second": 30.441,
"step": 53018
},
{
"epoch": 98.89,
"grad_norm": 8.483085632324219,
"learning_rate": 2.2181146025878005e-07,
"loss": 0.8635,
"step": 53500
},
{
"epoch": 99.0,
"eval_accuracy": 0.7941787941787942,
"eval_loss": 0.7391884922981262,
"eval_runtime": 1.9875,
"eval_samples_per_second": 242.013,
"eval_steps_per_second": 30.692,
"step": 53559
},
{
"epoch": 99.82,
"grad_norm": 10.068202018737793,
"learning_rate": 3.696857670979668e-08,
"loss": 0.8091,
"step": 54000
},
{
"epoch": 100.0,
"eval_accuracy": 0.7920997920997921,
"eval_loss": 0.7390549182891846,
"eval_runtime": 2.0448,
"eval_samples_per_second": 235.228,
"eval_steps_per_second": 29.831,
"step": 54100
},
{
"epoch": 100.0,
"step": 54100,
"total_flos": 3.355193271048192e+19,
"train_loss": 1.2517024893769495,
"train_runtime": 5380.7558,
"train_samples_per_second": 80.379,
"train_steps_per_second": 10.054
}
],
"logging_steps": 500,
"max_steps": 54100,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"total_flos": 3.355193271048192e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}