xlm-t-roberta-base-mnli-xnli / trainer_state.json
morit's picture
upload model
fd257ef
{
"best_metric": 0.587454617023468,
"best_model_checkpoint": "models/mnli_xnli_shuff_all/checkpoint-184017",
"epoch": 1.0,
"global_step": 184017,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.9945657194715707e-05,
"loss": 0.796,
"step": 500
},
{
"epoch": 0.01,
"learning_rate": 1.9891314389431413e-05,
"loss": 0.7802,
"step": 1000
},
{
"epoch": 0.01,
"learning_rate": 1.9836971584147118e-05,
"loss": 0.7726,
"step": 1500
},
{
"epoch": 0.01,
"learning_rate": 1.9782628778862823e-05,
"loss": 0.7543,
"step": 2000
},
{
"epoch": 0.01,
"learning_rate": 1.972828597357853e-05,
"loss": 0.7571,
"step": 2500
},
{
"epoch": 0.02,
"learning_rate": 1.9673943168294238e-05,
"loss": 0.7342,
"step": 3000
},
{
"epoch": 0.02,
"learning_rate": 1.961960036300994e-05,
"loss": 0.7401,
"step": 3500
},
{
"epoch": 0.02,
"learning_rate": 1.956525755772565e-05,
"loss": 0.7427,
"step": 4000
},
{
"epoch": 0.02,
"learning_rate": 1.951091475244135e-05,
"loss": 0.7412,
"step": 4500
},
{
"epoch": 0.03,
"learning_rate": 1.9456571947157056e-05,
"loss": 0.7342,
"step": 5000
},
{
"epoch": 0.03,
"learning_rate": 1.9402229141872765e-05,
"loss": 0.7286,
"step": 5500
},
{
"epoch": 0.03,
"learning_rate": 1.9347886336588467e-05,
"loss": 0.731,
"step": 6000
},
{
"epoch": 0.04,
"learning_rate": 1.9293543531304176e-05,
"loss": 0.7345,
"step": 6500
},
{
"epoch": 0.04,
"learning_rate": 1.923920072601988e-05,
"loss": 0.7233,
"step": 7000
},
{
"epoch": 0.04,
"learning_rate": 1.9184857920735587e-05,
"loss": 0.7177,
"step": 7500
},
{
"epoch": 0.04,
"learning_rate": 1.9130515115451292e-05,
"loss": 0.7239,
"step": 8000
},
{
"epoch": 0.05,
"learning_rate": 1.9076172310166994e-05,
"loss": 0.714,
"step": 8500
},
{
"epoch": 0.05,
"learning_rate": 1.9021829504882703e-05,
"loss": 0.7209,
"step": 9000
},
{
"epoch": 0.05,
"learning_rate": 1.896748669959841e-05,
"loss": 0.7191,
"step": 9500
},
{
"epoch": 0.05,
"learning_rate": 1.8913143894314114e-05,
"loss": 0.7079,
"step": 10000
},
{
"epoch": 0.06,
"learning_rate": 1.885880108902982e-05,
"loss": 0.7131,
"step": 10500
},
{
"epoch": 0.06,
"learning_rate": 1.8804458283745525e-05,
"loss": 0.7133,
"step": 11000
},
{
"epoch": 0.06,
"learning_rate": 1.875011547846123e-05,
"loss": 0.702,
"step": 11500
},
{
"epoch": 0.07,
"learning_rate": 1.8695772673176936e-05,
"loss": 0.6985,
"step": 12000
},
{
"epoch": 0.07,
"learning_rate": 1.864142986789264e-05,
"loss": 0.7004,
"step": 12500
},
{
"epoch": 0.07,
"learning_rate": 1.8587087062608347e-05,
"loss": 0.7062,
"step": 13000
},
{
"epoch": 0.07,
"learning_rate": 1.8532744257324052e-05,
"loss": 0.6984,
"step": 13500
},
{
"epoch": 0.08,
"learning_rate": 1.8478401452039758e-05,
"loss": 0.693,
"step": 14000
},
{
"epoch": 0.08,
"learning_rate": 1.8424058646755467e-05,
"loss": 0.697,
"step": 14500
},
{
"epoch": 0.08,
"learning_rate": 1.836971584147117e-05,
"loss": 0.703,
"step": 15000
},
{
"epoch": 0.08,
"learning_rate": 1.8315373036186878e-05,
"loss": 0.6923,
"step": 15500
},
{
"epoch": 0.09,
"learning_rate": 1.826103023090258e-05,
"loss": 0.6837,
"step": 16000
},
{
"epoch": 0.09,
"learning_rate": 1.8206687425618285e-05,
"loss": 0.7035,
"step": 16500
},
{
"epoch": 0.09,
"learning_rate": 1.8152344620333994e-05,
"loss": 0.687,
"step": 17000
},
{
"epoch": 0.1,
"learning_rate": 1.8098001815049696e-05,
"loss": 0.6872,
"step": 17500
},
{
"epoch": 0.1,
"learning_rate": 1.8043659009765405e-05,
"loss": 0.6864,
"step": 18000
},
{
"epoch": 0.1,
"learning_rate": 1.798931620448111e-05,
"loss": 0.6825,
"step": 18500
},
{
"epoch": 0.1,
"learning_rate": 1.7934973399196816e-05,
"loss": 0.6864,
"step": 19000
},
{
"epoch": 0.11,
"learning_rate": 1.788063059391252e-05,
"loss": 0.6834,
"step": 19500
},
{
"epoch": 0.11,
"learning_rate": 1.7826287788628227e-05,
"loss": 0.6794,
"step": 20000
},
{
"epoch": 0.11,
"learning_rate": 1.7771944983343932e-05,
"loss": 0.676,
"step": 20500
},
{
"epoch": 0.11,
"learning_rate": 1.7717602178059638e-05,
"loss": 0.6906,
"step": 21000
},
{
"epoch": 0.12,
"learning_rate": 1.7663259372775343e-05,
"loss": 0.6764,
"step": 21500
},
{
"epoch": 0.12,
"learning_rate": 1.760891656749105e-05,
"loss": 0.6764,
"step": 22000
},
{
"epoch": 0.12,
"learning_rate": 1.7554573762206754e-05,
"loss": 0.6833,
"step": 22500
},
{
"epoch": 0.12,
"learning_rate": 1.750023095692246e-05,
"loss": 0.6712,
"step": 23000
},
{
"epoch": 0.13,
"learning_rate": 1.7445888151638165e-05,
"loss": 0.6811,
"step": 23500
},
{
"epoch": 0.13,
"learning_rate": 1.739154534635387e-05,
"loss": 0.6707,
"step": 24000
},
{
"epoch": 0.13,
"learning_rate": 1.7337202541069576e-05,
"loss": 0.6749,
"step": 24500
},
{
"epoch": 0.14,
"learning_rate": 1.728285973578528e-05,
"loss": 0.6683,
"step": 25000
},
{
"epoch": 0.14,
"learning_rate": 1.7228516930500987e-05,
"loss": 0.6689,
"step": 25500
},
{
"epoch": 0.14,
"learning_rate": 1.7174174125216695e-05,
"loss": 0.6736,
"step": 26000
},
{
"epoch": 0.14,
"learning_rate": 1.7119831319932398e-05,
"loss": 0.6724,
"step": 26500
},
{
"epoch": 0.15,
"learning_rate": 1.7065488514648106e-05,
"loss": 0.6737,
"step": 27000
},
{
"epoch": 0.15,
"learning_rate": 1.701114570936381e-05,
"loss": 0.6631,
"step": 27500
},
{
"epoch": 0.15,
"learning_rate": 1.6956802904079517e-05,
"loss": 0.6861,
"step": 28000
},
{
"epoch": 0.15,
"learning_rate": 1.6902460098795223e-05,
"loss": 0.6749,
"step": 28500
},
{
"epoch": 0.16,
"learning_rate": 1.6848117293510925e-05,
"loss": 0.6619,
"step": 29000
},
{
"epoch": 0.16,
"learning_rate": 1.6793774488226634e-05,
"loss": 0.662,
"step": 29500
},
{
"epoch": 0.16,
"learning_rate": 1.673943168294234e-05,
"loss": 0.6522,
"step": 30000
},
{
"epoch": 0.17,
"learning_rate": 1.6685088877658045e-05,
"loss": 0.671,
"step": 30500
},
{
"epoch": 0.17,
"learning_rate": 1.663074607237375e-05,
"loss": 0.6617,
"step": 31000
},
{
"epoch": 0.17,
"learning_rate": 1.6576403267089455e-05,
"loss": 0.658,
"step": 31500
},
{
"epoch": 0.17,
"learning_rate": 1.652206046180516e-05,
"loss": 0.6588,
"step": 32000
},
{
"epoch": 0.18,
"learning_rate": 1.6467717656520866e-05,
"loss": 0.6615,
"step": 32500
},
{
"epoch": 0.18,
"learning_rate": 1.6413374851236572e-05,
"loss": 0.6628,
"step": 33000
},
{
"epoch": 0.18,
"learning_rate": 1.6359032045952277e-05,
"loss": 0.6513,
"step": 33500
},
{
"epoch": 0.18,
"learning_rate": 1.6304689240667983e-05,
"loss": 0.6586,
"step": 34000
},
{
"epoch": 0.19,
"learning_rate": 1.6250346435383688e-05,
"loss": 0.6491,
"step": 34500
},
{
"epoch": 0.19,
"learning_rate": 1.6196003630099394e-05,
"loss": 0.6708,
"step": 35000
},
{
"epoch": 0.19,
"learning_rate": 1.61416608248151e-05,
"loss": 0.6565,
"step": 35500
},
{
"epoch": 0.2,
"learning_rate": 1.6087318019530808e-05,
"loss": 0.6525,
"step": 36000
},
{
"epoch": 0.2,
"learning_rate": 1.603297521424651e-05,
"loss": 0.6503,
"step": 36500
},
{
"epoch": 0.2,
"learning_rate": 1.5978632408962215e-05,
"loss": 0.6465,
"step": 37000
},
{
"epoch": 0.2,
"learning_rate": 1.5924289603677924e-05,
"loss": 0.6477,
"step": 37500
},
{
"epoch": 0.21,
"learning_rate": 1.5869946798393626e-05,
"loss": 0.6473,
"step": 38000
},
{
"epoch": 0.21,
"learning_rate": 1.5815603993109335e-05,
"loss": 0.6494,
"step": 38500
},
{
"epoch": 0.21,
"learning_rate": 1.5761261187825037e-05,
"loss": 0.6371,
"step": 39000
},
{
"epoch": 0.21,
"learning_rate": 1.5706918382540746e-05,
"loss": 0.6434,
"step": 39500
},
{
"epoch": 0.22,
"learning_rate": 1.565257557725645e-05,
"loss": 0.645,
"step": 40000
},
{
"epoch": 0.22,
"learning_rate": 1.5598232771972157e-05,
"loss": 0.65,
"step": 40500
},
{
"epoch": 0.22,
"learning_rate": 1.5543889966687863e-05,
"loss": 0.6432,
"step": 41000
},
{
"epoch": 0.23,
"learning_rate": 1.5489547161403568e-05,
"loss": 0.6437,
"step": 41500
},
{
"epoch": 0.23,
"learning_rate": 1.5435204356119273e-05,
"loss": 0.6297,
"step": 42000
},
{
"epoch": 0.23,
"learning_rate": 1.538086155083498e-05,
"loss": 0.6376,
"step": 42500
},
{
"epoch": 0.23,
"learning_rate": 1.5326518745550684e-05,
"loss": 0.6439,
"step": 43000
},
{
"epoch": 0.24,
"learning_rate": 1.527217594026639e-05,
"loss": 0.6435,
"step": 43500
},
{
"epoch": 0.24,
"learning_rate": 1.5217833134982097e-05,
"loss": 0.6474,
"step": 44000
},
{
"epoch": 0.24,
"learning_rate": 1.51634903296978e-05,
"loss": 0.6356,
"step": 44500
},
{
"epoch": 0.24,
"learning_rate": 1.5109147524413506e-05,
"loss": 0.6386,
"step": 45000
},
{
"epoch": 0.25,
"learning_rate": 1.5054804719129212e-05,
"loss": 0.6349,
"step": 45500
},
{
"epoch": 0.25,
"learning_rate": 1.5000461913844917e-05,
"loss": 0.6333,
"step": 46000
},
{
"epoch": 0.25,
"learning_rate": 1.4946119108560624e-05,
"loss": 0.6439,
"step": 46500
},
{
"epoch": 0.26,
"learning_rate": 1.4891776303276328e-05,
"loss": 0.6334,
"step": 47000
},
{
"epoch": 0.26,
"learning_rate": 1.4837433497992035e-05,
"loss": 0.6384,
"step": 47500
},
{
"epoch": 0.26,
"learning_rate": 1.478309069270774e-05,
"loss": 0.6481,
"step": 48000
},
{
"epoch": 0.26,
"learning_rate": 1.4728747887423446e-05,
"loss": 0.6308,
"step": 48500
},
{
"epoch": 0.27,
"learning_rate": 1.4674405082139151e-05,
"loss": 0.6403,
"step": 49000
},
{
"epoch": 0.27,
"learning_rate": 1.4620062276854857e-05,
"loss": 0.6347,
"step": 49500
},
{
"epoch": 0.27,
"learning_rate": 1.4565719471570562e-05,
"loss": 0.6394,
"step": 50000
},
{
"epoch": 0.27,
"learning_rate": 1.4511376666286268e-05,
"loss": 0.6266,
"step": 50500
},
{
"epoch": 0.28,
"learning_rate": 1.4457033861001975e-05,
"loss": 0.6298,
"step": 51000
},
{
"epoch": 0.28,
"learning_rate": 1.4402691055717679e-05,
"loss": 0.6187,
"step": 51500
},
{
"epoch": 0.28,
"learning_rate": 1.4348348250433386e-05,
"loss": 0.6278,
"step": 52000
},
{
"epoch": 0.29,
"learning_rate": 1.4294005445149091e-05,
"loss": 0.6331,
"step": 52500
},
{
"epoch": 0.29,
"learning_rate": 1.4239662639864795e-05,
"loss": 0.6359,
"step": 53000
},
{
"epoch": 0.29,
"learning_rate": 1.4185319834580502e-05,
"loss": 0.6297,
"step": 53500
},
{
"epoch": 0.29,
"learning_rate": 1.4130977029296206e-05,
"loss": 0.6307,
"step": 54000
},
{
"epoch": 0.3,
"learning_rate": 1.4076634224011913e-05,
"loss": 0.6297,
"step": 54500
},
{
"epoch": 0.3,
"learning_rate": 1.4022291418727619e-05,
"loss": 0.6318,
"step": 55000
},
{
"epoch": 0.3,
"learning_rate": 1.3967948613443326e-05,
"loss": 0.6346,
"step": 55500
},
{
"epoch": 0.3,
"learning_rate": 1.391360580815903e-05,
"loss": 0.6179,
"step": 56000
},
{
"epoch": 0.31,
"learning_rate": 1.3859263002874737e-05,
"loss": 0.6215,
"step": 56500
},
{
"epoch": 0.31,
"learning_rate": 1.380492019759044e-05,
"loss": 0.6269,
"step": 57000
},
{
"epoch": 0.31,
"learning_rate": 1.3750577392306146e-05,
"loss": 0.6204,
"step": 57500
},
{
"epoch": 0.32,
"learning_rate": 1.3696234587021853e-05,
"loss": 0.6297,
"step": 58000
},
{
"epoch": 0.32,
"learning_rate": 1.3641891781737557e-05,
"loss": 0.6206,
"step": 58500
},
{
"epoch": 0.32,
"learning_rate": 1.3587548976453264e-05,
"loss": 0.6251,
"step": 59000
},
{
"epoch": 0.32,
"learning_rate": 1.353320617116897e-05,
"loss": 0.6268,
"step": 59500
},
{
"epoch": 0.33,
"learning_rate": 1.3478863365884675e-05,
"loss": 0.6182,
"step": 60000
},
{
"epoch": 0.33,
"learning_rate": 1.342452056060038e-05,
"loss": 0.6201,
"step": 60500
},
{
"epoch": 0.33,
"learning_rate": 1.3370177755316086e-05,
"loss": 0.6138,
"step": 61000
},
{
"epoch": 0.33,
"learning_rate": 1.3315834950031791e-05,
"loss": 0.6241,
"step": 61500
},
{
"epoch": 0.34,
"learning_rate": 1.3261492144747497e-05,
"loss": 0.6134,
"step": 62000
},
{
"epoch": 0.34,
"learning_rate": 1.3207149339463204e-05,
"loss": 0.6235,
"step": 62500
},
{
"epoch": 0.34,
"learning_rate": 1.3152806534178908e-05,
"loss": 0.6065,
"step": 63000
},
{
"epoch": 0.35,
"learning_rate": 1.3098463728894615e-05,
"loss": 0.6088,
"step": 63500
},
{
"epoch": 0.35,
"learning_rate": 1.304412092361032e-05,
"loss": 0.612,
"step": 64000
},
{
"epoch": 0.35,
"learning_rate": 1.2989778118326026e-05,
"loss": 0.6185,
"step": 64500
},
{
"epoch": 0.35,
"learning_rate": 1.2935435313041731e-05,
"loss": 0.6032,
"step": 65000
},
{
"epoch": 0.36,
"learning_rate": 1.2881092507757435e-05,
"loss": 0.6124,
"step": 65500
},
{
"epoch": 0.36,
"learning_rate": 1.2826749702473142e-05,
"loss": 0.6094,
"step": 66000
},
{
"epoch": 0.36,
"learning_rate": 1.2772406897188848e-05,
"loss": 0.6005,
"step": 66500
},
{
"epoch": 0.36,
"learning_rate": 1.2718064091904555e-05,
"loss": 0.6132,
"step": 67000
},
{
"epoch": 0.37,
"learning_rate": 1.2663721286620258e-05,
"loss": 0.6124,
"step": 67500
},
{
"epoch": 0.37,
"learning_rate": 1.2609378481335966e-05,
"loss": 0.6142,
"step": 68000
},
{
"epoch": 0.37,
"learning_rate": 1.255503567605167e-05,
"loss": 0.6104,
"step": 68500
},
{
"epoch": 0.37,
"learning_rate": 1.2500692870767375e-05,
"loss": 0.6183,
"step": 69000
},
{
"epoch": 0.38,
"learning_rate": 1.2446350065483082e-05,
"loss": 0.607,
"step": 69500
},
{
"epoch": 0.38,
"learning_rate": 1.2392007260198786e-05,
"loss": 0.5969,
"step": 70000
},
{
"epoch": 0.38,
"learning_rate": 1.2337664454914493e-05,
"loss": 0.6052,
"step": 70500
},
{
"epoch": 0.39,
"learning_rate": 1.2283321649630198e-05,
"loss": 0.613,
"step": 71000
},
{
"epoch": 0.39,
"learning_rate": 1.2228978844345904e-05,
"loss": 0.5975,
"step": 71500
},
{
"epoch": 0.39,
"learning_rate": 1.217463603906161e-05,
"loss": 0.5998,
"step": 72000
},
{
"epoch": 0.39,
"learning_rate": 1.2120293233777316e-05,
"loss": 0.5949,
"step": 72500
},
{
"epoch": 0.4,
"learning_rate": 1.206595042849302e-05,
"loss": 0.6029,
"step": 73000
},
{
"epoch": 0.4,
"learning_rate": 1.2011607623208726e-05,
"loss": 0.6074,
"step": 73500
},
{
"epoch": 0.4,
"learning_rate": 1.1957264817924433e-05,
"loss": 0.5985,
"step": 74000
},
{
"epoch": 0.4,
"learning_rate": 1.1902922012640136e-05,
"loss": 0.6105,
"step": 74500
},
{
"epoch": 0.41,
"learning_rate": 1.1848579207355844e-05,
"loss": 0.6064,
"step": 75000
},
{
"epoch": 0.41,
"learning_rate": 1.1794236402071549e-05,
"loss": 0.5912,
"step": 75500
},
{
"epoch": 0.41,
"learning_rate": 1.1739893596787255e-05,
"loss": 0.6117,
"step": 76000
},
{
"epoch": 0.42,
"learning_rate": 1.168555079150296e-05,
"loss": 0.5947,
"step": 76500
},
{
"epoch": 0.42,
"learning_rate": 1.1631207986218664e-05,
"loss": 0.5985,
"step": 77000
},
{
"epoch": 0.42,
"learning_rate": 1.1576865180934371e-05,
"loss": 0.6108,
"step": 77500
},
{
"epoch": 0.42,
"learning_rate": 1.1522522375650076e-05,
"loss": 0.5972,
"step": 78000
},
{
"epoch": 0.43,
"learning_rate": 1.1468179570365784e-05,
"loss": 0.6002,
"step": 78500
},
{
"epoch": 0.43,
"learning_rate": 1.1413836765081487e-05,
"loss": 0.5921,
"step": 79000
},
{
"epoch": 0.43,
"learning_rate": 1.1359493959797194e-05,
"loss": 0.6026,
"step": 79500
},
{
"epoch": 0.43,
"learning_rate": 1.1305151154512898e-05,
"loss": 0.603,
"step": 80000
},
{
"epoch": 0.44,
"learning_rate": 1.1250808349228605e-05,
"loss": 0.5963,
"step": 80500
},
{
"epoch": 0.44,
"learning_rate": 1.119646554394431e-05,
"loss": 0.5942,
"step": 81000
},
{
"epoch": 0.44,
"learning_rate": 1.1142122738660015e-05,
"loss": 0.6006,
"step": 81500
},
{
"epoch": 0.45,
"learning_rate": 1.1087779933375722e-05,
"loss": 0.6026,
"step": 82000
},
{
"epoch": 0.45,
"learning_rate": 1.1033437128091427e-05,
"loss": 0.5944,
"step": 82500
},
{
"epoch": 0.45,
"learning_rate": 1.0979094322807133e-05,
"loss": 0.6031,
"step": 83000
},
{
"epoch": 0.45,
"learning_rate": 1.0924751517522838e-05,
"loss": 0.5906,
"step": 83500
},
{
"epoch": 0.46,
"learning_rate": 1.0870408712238545e-05,
"loss": 0.595,
"step": 84000
},
{
"epoch": 0.46,
"learning_rate": 1.0816065906954249e-05,
"loss": 0.5921,
"step": 84500
},
{
"epoch": 0.46,
"learning_rate": 1.0761723101669956e-05,
"loss": 0.5944,
"step": 85000
},
{
"epoch": 0.46,
"learning_rate": 1.0707380296385662e-05,
"loss": 0.5752,
"step": 85500
},
{
"epoch": 0.47,
"learning_rate": 1.0653037491101365e-05,
"loss": 0.5942,
"step": 86000
},
{
"epoch": 0.47,
"learning_rate": 1.0598694685817072e-05,
"loss": 0.5946,
"step": 86500
},
{
"epoch": 0.47,
"learning_rate": 1.0544351880532778e-05,
"loss": 0.5911,
"step": 87000
},
{
"epoch": 0.48,
"learning_rate": 1.0490009075248483e-05,
"loss": 0.5982,
"step": 87500
},
{
"epoch": 0.48,
"learning_rate": 1.0435666269964189e-05,
"loss": 0.5941,
"step": 88000
},
{
"epoch": 0.48,
"learning_rate": 1.0381323464679896e-05,
"loss": 0.598,
"step": 88500
},
{
"epoch": 0.48,
"learning_rate": 1.03269806593956e-05,
"loss": 0.5845,
"step": 89000
},
{
"epoch": 0.49,
"learning_rate": 1.0272637854111305e-05,
"loss": 0.5958,
"step": 89500
},
{
"epoch": 0.49,
"learning_rate": 1.0218295048827012e-05,
"loss": 0.584,
"step": 90000
},
{
"epoch": 0.49,
"learning_rate": 1.0163952243542716e-05,
"loss": 0.5774,
"step": 90500
},
{
"epoch": 0.49,
"learning_rate": 1.0109609438258423e-05,
"loss": 0.5809,
"step": 91000
},
{
"epoch": 0.5,
"learning_rate": 1.0055266632974127e-05,
"loss": 0.5882,
"step": 91500
},
{
"epoch": 0.5,
"learning_rate": 1.0000923827689834e-05,
"loss": 0.5948,
"step": 92000
},
{
"epoch": 0.5,
"learning_rate": 9.94658102240554e-06,
"loss": 0.5809,
"step": 92500
},
{
"epoch": 0.51,
"learning_rate": 9.892238217121245e-06,
"loss": 0.5816,
"step": 93000
},
{
"epoch": 0.51,
"learning_rate": 9.83789541183695e-06,
"loss": 0.592,
"step": 93500
},
{
"epoch": 0.51,
"learning_rate": 9.783552606552656e-06,
"loss": 0.5822,
"step": 94000
},
{
"epoch": 0.51,
"learning_rate": 9.729209801268361e-06,
"loss": 0.5809,
"step": 94500
},
{
"epoch": 0.52,
"learning_rate": 9.674866995984069e-06,
"loss": 0.5736,
"step": 95000
},
{
"epoch": 0.52,
"learning_rate": 9.620524190699772e-06,
"loss": 0.5837,
"step": 95500
},
{
"epoch": 0.52,
"learning_rate": 9.566181385415478e-06,
"loss": 0.5812,
"step": 96000
},
{
"epoch": 0.52,
"learning_rate": 9.511838580131183e-06,
"loss": 0.5769,
"step": 96500
},
{
"epoch": 0.53,
"learning_rate": 9.45749577484689e-06,
"loss": 0.5784,
"step": 97000
},
{
"epoch": 0.53,
"learning_rate": 9.403152969562596e-06,
"loss": 0.5872,
"step": 97500
},
{
"epoch": 0.53,
"learning_rate": 9.348810164278301e-06,
"loss": 0.5807,
"step": 98000
},
{
"epoch": 0.54,
"learning_rate": 9.294467358994007e-06,
"loss": 0.5857,
"step": 98500
},
{
"epoch": 0.54,
"learning_rate": 9.240124553709712e-06,
"loss": 0.5706,
"step": 99000
},
{
"epoch": 0.54,
"learning_rate": 9.185781748425418e-06,
"loss": 0.5816,
"step": 99500
},
{
"epoch": 0.54,
"learning_rate": 9.131438943141123e-06,
"loss": 0.5888,
"step": 100000
},
{
"epoch": 0.55,
"learning_rate": 9.077096137856829e-06,
"loss": 0.5799,
"step": 100500
},
{
"epoch": 0.55,
"learning_rate": 9.022753332572534e-06,
"loss": 0.5825,
"step": 101000
},
{
"epoch": 0.55,
"learning_rate": 8.968410527288241e-06,
"loss": 0.5783,
"step": 101500
},
{
"epoch": 0.55,
"learning_rate": 8.914067722003947e-06,
"loss": 0.5749,
"step": 102000
},
{
"epoch": 0.56,
"learning_rate": 8.859724916719652e-06,
"loss": 0.5817,
"step": 102500
},
{
"epoch": 0.56,
"learning_rate": 8.805382111435358e-06,
"loss": 0.582,
"step": 103000
},
{
"epoch": 0.56,
"learning_rate": 8.751039306151063e-06,
"loss": 0.5687,
"step": 103500
},
{
"epoch": 0.57,
"learning_rate": 8.696696500866769e-06,
"loss": 0.5702,
"step": 104000
},
{
"epoch": 0.57,
"learning_rate": 8.642353695582474e-06,
"loss": 0.5734,
"step": 104500
},
{
"epoch": 0.57,
"learning_rate": 8.58801089029818e-06,
"loss": 0.5769,
"step": 105000
},
{
"epoch": 0.57,
"learning_rate": 8.533668085013885e-06,
"loss": 0.5648,
"step": 105500
},
{
"epoch": 0.58,
"learning_rate": 8.47932527972959e-06,
"loss": 0.5686,
"step": 106000
},
{
"epoch": 0.58,
"learning_rate": 8.424982474445297e-06,
"loss": 0.572,
"step": 106500
},
{
"epoch": 0.58,
"learning_rate": 8.370639669161003e-06,
"loss": 0.5753,
"step": 107000
},
{
"epoch": 0.58,
"learning_rate": 8.316296863876707e-06,
"loss": 0.5745,
"step": 107500
},
{
"epoch": 0.59,
"learning_rate": 8.261954058592412e-06,
"loss": 0.5713,
"step": 108000
},
{
"epoch": 0.59,
"learning_rate": 8.20761125330812e-06,
"loss": 0.5732,
"step": 108500
},
{
"epoch": 0.59,
"learning_rate": 8.153268448023825e-06,
"loss": 0.5623,
"step": 109000
},
{
"epoch": 0.6,
"learning_rate": 8.09892564273953e-06,
"loss": 0.5679,
"step": 109500
},
{
"epoch": 0.6,
"learning_rate": 8.044582837455236e-06,
"loss": 0.5711,
"step": 110000
},
{
"epoch": 0.6,
"learning_rate": 7.990240032170941e-06,
"loss": 0.5773,
"step": 110500
},
{
"epoch": 0.6,
"learning_rate": 7.935897226886648e-06,
"loss": 0.5788,
"step": 111000
},
{
"epoch": 0.61,
"learning_rate": 7.881554421602352e-06,
"loss": 0.5626,
"step": 111500
},
{
"epoch": 0.61,
"learning_rate": 7.827211616318057e-06,
"loss": 0.5688,
"step": 112000
},
{
"epoch": 0.61,
"learning_rate": 7.772868811033763e-06,
"loss": 0.5576,
"step": 112500
},
{
"epoch": 0.61,
"learning_rate": 7.71852600574947e-06,
"loss": 0.5659,
"step": 113000
},
{
"epoch": 0.62,
"learning_rate": 7.664183200465176e-06,
"loss": 0.5523,
"step": 113500
},
{
"epoch": 0.62,
"learning_rate": 7.609840395180881e-06,
"loss": 0.5633,
"step": 114000
},
{
"epoch": 0.62,
"learning_rate": 7.5554975898965864e-06,
"loss": 0.5677,
"step": 114500
},
{
"epoch": 0.62,
"learning_rate": 7.501154784612292e-06,
"loss": 0.5604,
"step": 115000
},
{
"epoch": 0.63,
"learning_rate": 7.4468119793279965e-06,
"loss": 0.562,
"step": 115500
},
{
"epoch": 0.63,
"learning_rate": 7.392469174043703e-06,
"loss": 0.5699,
"step": 116000
},
{
"epoch": 0.63,
"learning_rate": 7.338126368759408e-06,
"loss": 0.5636,
"step": 116500
},
{
"epoch": 0.64,
"learning_rate": 7.2837835634751146e-06,
"loss": 0.5534,
"step": 117000
},
{
"epoch": 0.64,
"learning_rate": 7.22944075819082e-06,
"loss": 0.563,
"step": 117500
},
{
"epoch": 0.64,
"learning_rate": 7.1750979529065255e-06,
"loss": 0.5629,
"step": 118000
},
{
"epoch": 0.64,
"learning_rate": 7.120755147622232e-06,
"loss": 0.5682,
"step": 118500
},
{
"epoch": 0.65,
"learning_rate": 7.066412342337937e-06,
"loss": 0.5704,
"step": 119000
},
{
"epoch": 0.65,
"learning_rate": 7.012069537053643e-06,
"loss": 0.5582,
"step": 119500
},
{
"epoch": 0.65,
"learning_rate": 6.957726731769347e-06,
"loss": 0.5696,
"step": 120000
},
{
"epoch": 0.65,
"learning_rate": 6.903383926485054e-06,
"loss": 0.5649,
"step": 120500
},
{
"epoch": 0.66,
"learning_rate": 6.849041121200759e-06,
"loss": 0.5649,
"step": 121000
},
{
"epoch": 0.66,
"learning_rate": 6.7946983159164645e-06,
"loss": 0.5575,
"step": 121500
},
{
"epoch": 0.66,
"learning_rate": 6.740355510632171e-06,
"loss": 0.5649,
"step": 122000
},
{
"epoch": 0.67,
"learning_rate": 6.686012705347876e-06,
"loss": 0.5571,
"step": 122500
},
{
"epoch": 0.67,
"learning_rate": 6.631669900063582e-06,
"loss": 0.5639,
"step": 123000
},
{
"epoch": 0.67,
"learning_rate": 6.577327094779288e-06,
"loss": 0.5666,
"step": 123500
},
{
"epoch": 0.67,
"learning_rate": 6.522984289494993e-06,
"loss": 0.5539,
"step": 124000
},
{
"epoch": 0.68,
"learning_rate": 6.468641484210698e-06,
"loss": 0.5486,
"step": 124500
},
{
"epoch": 0.68,
"learning_rate": 6.4142986789264035e-06,
"loss": 0.5669,
"step": 125000
},
{
"epoch": 0.68,
"learning_rate": 6.35995587364211e-06,
"loss": 0.5627,
"step": 125500
},
{
"epoch": 0.68,
"learning_rate": 6.305613068357815e-06,
"loss": 0.554,
"step": 126000
},
{
"epoch": 0.69,
"learning_rate": 6.251270263073521e-06,
"loss": 0.5619,
"step": 126500
},
{
"epoch": 0.69,
"learning_rate": 6.196927457789227e-06,
"loss": 0.5525,
"step": 127000
},
{
"epoch": 0.69,
"learning_rate": 6.1425846525049325e-06,
"loss": 0.5609,
"step": 127500
},
{
"epoch": 0.7,
"learning_rate": 6.088241847220637e-06,
"loss": 0.5538,
"step": 128000
},
{
"epoch": 0.7,
"learning_rate": 6.0338990419363434e-06,
"loss": 0.555,
"step": 128500
},
{
"epoch": 0.7,
"learning_rate": 5.979556236652049e-06,
"loss": 0.5472,
"step": 129000
},
{
"epoch": 0.7,
"learning_rate": 5.925213431367754e-06,
"loss": 0.5593,
"step": 129500
},
{
"epoch": 0.71,
"learning_rate": 5.870870626083461e-06,
"loss": 0.5555,
"step": 130000
},
{
"epoch": 0.71,
"learning_rate": 5.816527820799166e-06,
"loss": 0.5561,
"step": 130500
},
{
"epoch": 0.71,
"learning_rate": 5.7621850155148716e-06,
"loss": 0.5585,
"step": 131000
},
{
"epoch": 0.71,
"learning_rate": 5.707842210230578e-06,
"loss": 0.5643,
"step": 131500
},
{
"epoch": 0.72,
"learning_rate": 5.6534994049462825e-06,
"loss": 0.548,
"step": 132000
},
{
"epoch": 0.72,
"learning_rate": 5.599156599661988e-06,
"loss": 0.5565,
"step": 132500
},
{
"epoch": 0.72,
"learning_rate": 5.544813794377693e-06,
"loss": 0.5441,
"step": 133000
},
{
"epoch": 0.73,
"learning_rate": 5.4904709890934e-06,
"loss": 0.5526,
"step": 133500
},
{
"epoch": 0.73,
"learning_rate": 5.436128183809105e-06,
"loss": 0.5505,
"step": 134000
},
{
"epoch": 0.73,
"learning_rate": 5.381785378524811e-06,
"loss": 0.5547,
"step": 134500
},
{
"epoch": 0.73,
"learning_rate": 5.327442573240517e-06,
"loss": 0.5512,
"step": 135000
},
{
"epoch": 0.74,
"learning_rate": 5.273099767956222e-06,
"loss": 0.5522,
"step": 135500
},
{
"epoch": 0.74,
"learning_rate": 5.218756962671927e-06,
"loss": 0.55,
"step": 136000
},
{
"epoch": 0.74,
"learning_rate": 5.164414157387632e-06,
"loss": 0.5566,
"step": 136500
},
{
"epoch": 0.74,
"learning_rate": 5.110071352103339e-06,
"loss": 0.5552,
"step": 137000
},
{
"epoch": 0.75,
"learning_rate": 5.055728546819044e-06,
"loss": 0.5499,
"step": 137500
},
{
"epoch": 0.75,
"learning_rate": 5.00138574153475e-06,
"loss": 0.5597,
"step": 138000
},
{
"epoch": 0.75,
"learning_rate": 4.947042936250456e-06,
"loss": 0.5447,
"step": 138500
},
{
"epoch": 0.76,
"learning_rate": 4.8927001309661605e-06,
"loss": 0.5513,
"step": 139000
},
{
"epoch": 0.76,
"learning_rate": 4.838357325681867e-06,
"loss": 0.5435,
"step": 139500
},
{
"epoch": 0.76,
"learning_rate": 4.784014520397572e-06,
"loss": 0.5485,
"step": 140000
},
{
"epoch": 0.76,
"learning_rate": 4.729671715113278e-06,
"loss": 0.5427,
"step": 140500
},
{
"epoch": 0.77,
"learning_rate": 4.675328909828983e-06,
"loss": 0.5565,
"step": 141000
},
{
"epoch": 0.77,
"learning_rate": 4.6209861045446895e-06,
"loss": 0.5499,
"step": 141500
},
{
"epoch": 0.77,
"learning_rate": 4.566643299260395e-06,
"loss": 0.5421,
"step": 142000
},
{
"epoch": 0.77,
"learning_rate": 4.5123004939761e-06,
"loss": 0.54,
"step": 142500
},
{
"epoch": 0.78,
"learning_rate": 4.457957688691806e-06,
"loss": 0.5435,
"step": 143000
},
{
"epoch": 0.78,
"learning_rate": 4.403614883407511e-06,
"loss": 0.5484,
"step": 143500
},
{
"epoch": 0.78,
"learning_rate": 4.349272078123218e-06,
"loss": 0.5478,
"step": 144000
},
{
"epoch": 0.79,
"learning_rate": 4.294929272838923e-06,
"loss": 0.5519,
"step": 144500
},
{
"epoch": 0.79,
"learning_rate": 4.2405864675546285e-06,
"loss": 0.5531,
"step": 145000
},
{
"epoch": 0.79,
"learning_rate": 4.186243662270334e-06,
"loss": 0.5411,
"step": 145500
},
{
"epoch": 0.79,
"learning_rate": 4.1319008569860394e-06,
"loss": 0.5424,
"step": 146000
},
{
"epoch": 0.8,
"learning_rate": 4.077558051701746e-06,
"loss": 0.5506,
"step": 146500
},
{
"epoch": 0.8,
"learning_rate": 4.02321524641745e-06,
"loss": 0.5465,
"step": 147000
},
{
"epoch": 0.8,
"learning_rate": 3.968872441133157e-06,
"loss": 0.5448,
"step": 147500
},
{
"epoch": 0.8,
"learning_rate": 3.914529635848862e-06,
"loss": 0.5347,
"step": 148000
},
{
"epoch": 0.81,
"learning_rate": 3.8601868305645676e-06,
"loss": 0.5429,
"step": 148500
},
{
"epoch": 0.81,
"learning_rate": 3.805844025280273e-06,
"loss": 0.5401,
"step": 149000
},
{
"epoch": 0.81,
"learning_rate": 3.751501219995979e-06,
"loss": 0.5428,
"step": 149500
},
{
"epoch": 0.82,
"learning_rate": 3.6971584147116848e-06,
"loss": 0.5414,
"step": 150000
},
{
"epoch": 0.82,
"learning_rate": 3.6428156094273902e-06,
"loss": 0.541,
"step": 150500
},
{
"epoch": 0.82,
"learning_rate": 3.5884728041430957e-06,
"loss": 0.5396,
"step": 151000
},
{
"epoch": 0.82,
"learning_rate": 3.534129998858801e-06,
"loss": 0.5398,
"step": 151500
},
{
"epoch": 0.83,
"learning_rate": 3.479787193574507e-06,
"loss": 0.5393,
"step": 152000
},
{
"epoch": 0.83,
"learning_rate": 3.425444388290213e-06,
"loss": 0.5378,
"step": 152500
},
{
"epoch": 0.83,
"learning_rate": 3.371101583005918e-06,
"loss": 0.5344,
"step": 153000
},
{
"epoch": 0.83,
"learning_rate": 3.316758777721624e-06,
"loss": 0.5418,
"step": 153500
},
{
"epoch": 0.84,
"learning_rate": 3.2624159724373293e-06,
"loss": 0.5266,
"step": 154000
},
{
"epoch": 0.84,
"learning_rate": 3.208073167153035e-06,
"loss": 0.5376,
"step": 154500
},
{
"epoch": 0.84,
"learning_rate": 3.15373036186874e-06,
"loss": 0.5471,
"step": 155000
},
{
"epoch": 0.85,
"learning_rate": 3.099387556584446e-06,
"loss": 0.5451,
"step": 155500
},
{
"epoch": 0.85,
"learning_rate": 3.045044751300152e-06,
"loss": 0.5451,
"step": 156000
},
{
"epoch": 0.85,
"learning_rate": 2.990701946015858e-06,
"loss": 0.5423,
"step": 156500
},
{
"epoch": 0.85,
"learning_rate": 2.9363591407315633e-06,
"loss": 0.5332,
"step": 157000
},
{
"epoch": 0.86,
"learning_rate": 2.8820163354472687e-06,
"loss": 0.5367,
"step": 157500
},
{
"epoch": 0.86,
"learning_rate": 2.827673530162974e-06,
"loss": 0.5366,
"step": 158000
},
{
"epoch": 0.86,
"learning_rate": 2.77333072487868e-06,
"loss": 0.5362,
"step": 158500
},
{
"epoch": 0.86,
"learning_rate": 2.718987919594386e-06,
"loss": 0.5386,
"step": 159000
},
{
"epoch": 0.87,
"learning_rate": 2.664645114310091e-06,
"loss": 0.5381,
"step": 159500
},
{
"epoch": 0.87,
"learning_rate": 2.610302309025797e-06,
"loss": 0.5415,
"step": 160000
},
{
"epoch": 0.87,
"learning_rate": 2.5559595037415023e-06,
"loss": 0.5308,
"step": 160500
},
{
"epoch": 0.87,
"learning_rate": 2.501616698457208e-06,
"loss": 0.5298,
"step": 161000
},
{
"epoch": 0.88,
"learning_rate": 2.4472738931729136e-06,
"loss": 0.5209,
"step": 161500
},
{
"epoch": 0.88,
"learning_rate": 2.392931087888619e-06,
"loss": 0.5374,
"step": 162000
},
{
"epoch": 0.88,
"learning_rate": 2.338588282604325e-06,
"loss": 0.5437,
"step": 162500
},
{
"epoch": 0.89,
"learning_rate": 2.2842454773200304e-06,
"loss": 0.5353,
"step": 163000
},
{
"epoch": 0.89,
"learning_rate": 2.2299026720357363e-06,
"loss": 0.5377,
"step": 163500
},
{
"epoch": 0.89,
"learning_rate": 2.1755598667514418e-06,
"loss": 0.5412,
"step": 164000
},
{
"epoch": 0.89,
"learning_rate": 2.121217061467147e-06,
"loss": 0.5236,
"step": 164500
},
{
"epoch": 0.9,
"learning_rate": 2.0668742561828527e-06,
"loss": 0.5369,
"step": 165000
},
{
"epoch": 0.9,
"learning_rate": 2.0125314508985585e-06,
"loss": 0.529,
"step": 165500
},
{
"epoch": 0.9,
"learning_rate": 1.958188645614264e-06,
"loss": 0.5338,
"step": 166000
},
{
"epoch": 0.9,
"learning_rate": 1.9038458403299699e-06,
"loss": 0.53,
"step": 166500
},
{
"epoch": 0.91,
"learning_rate": 1.8495030350456753e-06,
"loss": 0.5254,
"step": 167000
},
{
"epoch": 0.91,
"learning_rate": 1.795160229761381e-06,
"loss": 0.5382,
"step": 167500
},
{
"epoch": 0.91,
"learning_rate": 1.7408174244770865e-06,
"loss": 0.541,
"step": 168000
},
{
"epoch": 0.92,
"learning_rate": 1.6864746191927921e-06,
"loss": 0.5296,
"step": 168500
},
{
"epoch": 0.92,
"learning_rate": 1.6321318139084976e-06,
"loss": 0.5304,
"step": 169000
},
{
"epoch": 0.92,
"learning_rate": 1.5777890086242035e-06,
"loss": 0.5253,
"step": 169500
},
{
"epoch": 0.92,
"learning_rate": 1.523446203339909e-06,
"loss": 0.5189,
"step": 170000
},
{
"epoch": 0.93,
"learning_rate": 1.4691033980556146e-06,
"loss": 0.5359,
"step": 170500
},
{
"epoch": 0.93,
"learning_rate": 1.41476059277132e-06,
"loss": 0.5368,
"step": 171000
},
{
"epoch": 0.93,
"learning_rate": 1.360417787487026e-06,
"loss": 0.5265,
"step": 171500
},
{
"epoch": 0.93,
"learning_rate": 1.3060749822027314e-06,
"loss": 0.5289,
"step": 172000
},
{
"epoch": 0.94,
"learning_rate": 1.251732176918437e-06,
"loss": 0.5294,
"step": 172500
},
{
"epoch": 0.94,
"learning_rate": 1.1973893716341425e-06,
"loss": 0.5293,
"step": 173000
},
{
"epoch": 0.94,
"learning_rate": 1.1430465663498482e-06,
"loss": 0.5342,
"step": 173500
},
{
"epoch": 0.95,
"learning_rate": 1.0887037610655538e-06,
"loss": 0.5182,
"step": 174000
},
{
"epoch": 0.95,
"learning_rate": 1.0343609557812593e-06,
"loss": 0.5311,
"step": 174500
},
{
"epoch": 0.95,
"learning_rate": 9.80018150496965e-07,
"loss": 0.5386,
"step": 175000
},
{
"epoch": 0.95,
"learning_rate": 9.256753452126706e-07,
"loss": 0.5291,
"step": 175500
},
{
"epoch": 0.96,
"learning_rate": 8.713325399283762e-07,
"loss": 0.5233,
"step": 176000
},
{
"epoch": 0.96,
"learning_rate": 8.169897346440818e-07,
"loss": 0.5167,
"step": 176500
},
{
"epoch": 0.96,
"learning_rate": 7.626469293597875e-07,
"loss": 0.5339,
"step": 177000
},
{
"epoch": 0.96,
"learning_rate": 7.083041240754932e-07,
"loss": 0.5263,
"step": 177500
},
{
"epoch": 0.97,
"learning_rate": 6.539613187911987e-07,
"loss": 0.5255,
"step": 178000
},
{
"epoch": 0.97,
"learning_rate": 5.996185135069043e-07,
"loss": 0.5292,
"step": 178500
},
{
"epoch": 0.97,
"learning_rate": 5.452757082226099e-07,
"loss": 0.5247,
"step": 179000
},
{
"epoch": 0.98,
"learning_rate": 4.909329029383155e-07,
"loss": 0.5293,
"step": 179500
},
{
"epoch": 0.98,
"learning_rate": 4.3659009765402114e-07,
"loss": 0.5278,
"step": 180000
},
{
"epoch": 0.98,
"learning_rate": 3.8224729236972675e-07,
"loss": 0.5151,
"step": 180500
},
{
"epoch": 0.98,
"learning_rate": 3.279044870854323e-07,
"loss": 0.515,
"step": 181000
},
{
"epoch": 0.99,
"learning_rate": 2.73561681801138e-07,
"loss": 0.5305,
"step": 181500
},
{
"epoch": 0.99,
"learning_rate": 2.192188765168436e-07,
"loss": 0.5231,
"step": 182000
},
{
"epoch": 0.99,
"learning_rate": 1.6487607123254918e-07,
"loss": 0.526,
"step": 182500
},
{
"epoch": 0.99,
"learning_rate": 1.1053326594825478e-07,
"loss": 0.5243,
"step": 183000
},
{
"epoch": 1.0,
"learning_rate": 5.6190460663960404e-08,
"loss": 0.5192,
"step": 183500
},
{
"epoch": 1.0,
"learning_rate": 1.8476553796660092e-09,
"loss": 0.5343,
"step": 184000
},
{
"epoch": 1.0,
"eval_accuracy": 0.7746184738955824,
"eval_loss": 0.587454617023468,
"eval_runtime": 72.5338,
"eval_samples_per_second": 514.932,
"eval_steps_per_second": 64.37,
"step": 184017
}
],
"max_steps": 184017,
"num_train_epochs": 1,
"total_flos": 3.8733781342346496e+17,
"trial_name": null,
"trial_params": null
}