skript-1m-gpt-neo350m / trainer_state.json
johnpaulbin's picture
add tokenizer
b5f60de
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.613999632555576,
"global_step": 72000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 9.999999632555577e-05,
"loss": 1.5591,
"step": 200
},
{
"epoch": 0.04,
"learning_rate": 9.999999265111153e-05,
"loss": 1.3368,
"step": 400
},
{
"epoch": 0.06,
"learning_rate": 9.999998897666729e-05,
"loss": 1.2706,
"step": 600
},
{
"epoch": 0.07,
"learning_rate": 9.999998530222305e-05,
"loss": 1.2055,
"step": 800
},
{
"epoch": 0.09,
"learning_rate": 9.999998162777881e-05,
"loss": 1.1738,
"step": 1000
},
{
"epoch": 0.11,
"learning_rate": 9.999997795333457e-05,
"loss": 1.1357,
"step": 1200
},
{
"epoch": 0.13,
"learning_rate": 9.999997427889031e-05,
"loss": 1.1226,
"step": 1400
},
{
"epoch": 0.15,
"learning_rate": 9.999997060444607e-05,
"loss": 1.0901,
"step": 1600
},
{
"epoch": 0.17,
"learning_rate": 9.999996693000185e-05,
"loss": 1.0793,
"step": 1800
},
{
"epoch": 0.18,
"learning_rate": 9.999996325555761e-05,
"loss": 1.0587,
"step": 2000
},
{
"epoch": 0.2,
"learning_rate": 9.999995958111336e-05,
"loss": 1.0522,
"step": 2200
},
{
"epoch": 0.22,
"learning_rate": 9.999995590666912e-05,
"loss": 1.0277,
"step": 2400
},
{
"epoch": 0.24,
"learning_rate": 9.999995223222488e-05,
"loss": 1.0177,
"step": 2600
},
{
"epoch": 0.26,
"learning_rate": 9.999994855778064e-05,
"loss": 1.0164,
"step": 2800
},
{
"epoch": 0.28,
"learning_rate": 9.999994488333641e-05,
"loss": 0.9814,
"step": 3000
},
{
"epoch": 0.29,
"learning_rate": 9.999994120889216e-05,
"loss": 0.9802,
"step": 3200
},
{
"epoch": 0.31,
"learning_rate": 9.999993753444792e-05,
"loss": 0.9808,
"step": 3400
},
{
"epoch": 0.33,
"learning_rate": 9.999993386000368e-05,
"loss": 0.9686,
"step": 3600
},
{
"epoch": 0.35,
"learning_rate": 9.999993018555944e-05,
"loss": 0.9531,
"step": 3800
},
{
"epoch": 0.37,
"learning_rate": 9.99999265111152e-05,
"loss": 0.9332,
"step": 4000
},
{
"epoch": 0.39,
"learning_rate": 9.999992283667096e-05,
"loss": 0.9486,
"step": 4200
},
{
"epoch": 0.4,
"learning_rate": 9.999991916222672e-05,
"loss": 0.9145,
"step": 4400
},
{
"epoch": 0.42,
"learning_rate": 9.999991548778248e-05,
"loss": 0.9164,
"step": 4600
},
{
"epoch": 0.44,
"learning_rate": 9.999991181333824e-05,
"loss": 0.8984,
"step": 4800
},
{
"epoch": 0.46,
"learning_rate": 9.999990813889399e-05,
"loss": 0.9016,
"step": 5000
},
{
"epoch": 0.48,
"learning_rate": 9.999990446444975e-05,
"loss": 0.9019,
"step": 5200
},
{
"epoch": 0.5,
"learning_rate": 9.999990079000552e-05,
"loss": 0.8835,
"step": 5400
},
{
"epoch": 0.51,
"learning_rate": 9.999989711556128e-05,
"loss": 0.8829,
"step": 5600
},
{
"epoch": 0.53,
"learning_rate": 9.999989344111703e-05,
"loss": 0.8872,
"step": 5800
},
{
"epoch": 0.55,
"learning_rate": 9.999988976667279e-05,
"loss": 0.8895,
"step": 6000
},
{
"epoch": 0.57,
"learning_rate": 9.999988609222855e-05,
"loss": 0.8533,
"step": 6200
},
{
"epoch": 0.59,
"learning_rate": 9.999988241778431e-05,
"loss": 0.8515,
"step": 6400
},
{
"epoch": 0.61,
"learning_rate": 9.999987874334007e-05,
"loss": 0.8682,
"step": 6600
},
{
"epoch": 0.62,
"learning_rate": 9.999987506889583e-05,
"loss": 0.8345,
"step": 6800
},
{
"epoch": 0.64,
"learning_rate": 9.99998713944516e-05,
"loss": 0.8614,
"step": 7000
},
{
"epoch": 0.66,
"learning_rate": 9.999986772000735e-05,
"loss": 0.8481,
"step": 7200
},
{
"epoch": 0.68,
"learning_rate": 9.999986404556311e-05,
"loss": 0.8479,
"step": 7400
},
{
"epoch": 0.7,
"learning_rate": 9.999986037111887e-05,
"loss": 0.8467,
"step": 7600
},
{
"epoch": 0.72,
"learning_rate": 9.999985669667464e-05,
"loss": 0.8441,
"step": 7800
},
{
"epoch": 0.73,
"learning_rate": 9.99998530222304e-05,
"loss": 0.8087,
"step": 8000
},
{
"epoch": 0.75,
"learning_rate": 9.999984934778616e-05,
"loss": 0.8202,
"step": 8200
},
{
"epoch": 0.77,
"learning_rate": 9.99998456733419e-05,
"loss": 0.8231,
"step": 8400
},
{
"epoch": 0.79,
"learning_rate": 9.999984199889766e-05,
"loss": 0.8188,
"step": 8600
},
{
"epoch": 0.81,
"learning_rate": 9.999983832445344e-05,
"loss": 0.8103,
"step": 8800
},
{
"epoch": 0.83,
"learning_rate": 9.99998346500092e-05,
"loss": 0.8158,
"step": 9000
},
{
"epoch": 0.85,
"learning_rate": 9.999983097556495e-05,
"loss": 0.808,
"step": 9200
},
{
"epoch": 0.86,
"learning_rate": 9.99998273011207e-05,
"loss": 0.8146,
"step": 9400
},
{
"epoch": 0.88,
"learning_rate": 9.999982362667647e-05,
"loss": 0.797,
"step": 9600
},
{
"epoch": 0.9,
"learning_rate": 9.999981995223223e-05,
"loss": 0.7784,
"step": 9800
},
{
"epoch": 0.92,
"learning_rate": 9.9999816277788e-05,
"loss": 0.7864,
"step": 10000
},
{
"epoch": 0.94,
"learning_rate": 9.999981260334375e-05,
"loss": 0.7987,
"step": 10200
},
{
"epoch": 0.96,
"learning_rate": 9.999980892889951e-05,
"loss": 0.7757,
"step": 10400
},
{
"epoch": 0.97,
"learning_rate": 9.999980525445527e-05,
"loss": 0.7812,
"step": 10600
},
{
"epoch": 0.99,
"learning_rate": 9.999980158001103e-05,
"loss": 0.7801,
"step": 10800
},
{
"epoch": 1.0,
"eval_loss": 0.6798496246337891,
"eval_runtime": 1461.6898,
"eval_samples_per_second": 119.156,
"eval_steps_per_second": 7.448,
"step": 10886
},
{
"epoch": 1.01,
"learning_rate": 9.999979790556679e-05,
"loss": 0.7013,
"step": 11000
},
{
"epoch": 1.03,
"learning_rate": 9.999979423112255e-05,
"loss": 0.6525,
"step": 11200
},
{
"epoch": 1.05,
"learning_rate": 9.999979055667831e-05,
"loss": 0.6677,
"step": 11400
},
{
"epoch": 1.07,
"learning_rate": 9.999978688223407e-05,
"loss": 0.6573,
"step": 11600
},
{
"epoch": 1.08,
"learning_rate": 9.999978320778982e-05,
"loss": 0.652,
"step": 11800
},
{
"epoch": 1.1,
"learning_rate": 9.999977953334558e-05,
"loss": 0.6554,
"step": 12000
},
{
"epoch": 1.12,
"learning_rate": 9.999977585890135e-05,
"loss": 0.6593,
"step": 12200
},
{
"epoch": 1.14,
"learning_rate": 9.999977218445711e-05,
"loss": 0.6634,
"step": 12400
},
{
"epoch": 1.16,
"learning_rate": 9.999976851001287e-05,
"loss": 0.653,
"step": 12600
},
{
"epoch": 1.18,
"learning_rate": 9.999976483556862e-05,
"loss": 0.6617,
"step": 12800
},
{
"epoch": 1.19,
"learning_rate": 9.999976116112438e-05,
"loss": 0.6567,
"step": 13000
},
{
"epoch": 1.21,
"learning_rate": 9.999975748668014e-05,
"loss": 0.6683,
"step": 13200
},
{
"epoch": 1.23,
"learning_rate": 9.999975381223591e-05,
"loss": 0.6681,
"step": 13400
},
{
"epoch": 1.25,
"learning_rate": 9.999975013779166e-05,
"loss": 0.6501,
"step": 13600
},
{
"epoch": 1.27,
"learning_rate": 9.999974646334742e-05,
"loss": 0.664,
"step": 13800
},
{
"epoch": 1.29,
"learning_rate": 9.999974278890318e-05,
"loss": 0.6702,
"step": 14000
},
{
"epoch": 1.3,
"learning_rate": 9.999973911445894e-05,
"loss": 0.6535,
"step": 14200
},
{
"epoch": 1.32,
"learning_rate": 9.99997354400147e-05,
"loss": 0.6614,
"step": 14400
},
{
"epoch": 1.34,
"learning_rate": 9.999973176557046e-05,
"loss": 0.6584,
"step": 14600
},
{
"epoch": 1.36,
"learning_rate": 9.999972809112622e-05,
"loss": 0.649,
"step": 14800
},
{
"epoch": 1.38,
"learning_rate": 9.999972441668198e-05,
"loss": 0.6489,
"step": 15000
},
{
"epoch": 1.4,
"learning_rate": 9.999972074223775e-05,
"loss": 0.6568,
"step": 15200
},
{
"epoch": 1.41,
"learning_rate": 9.999971706779349e-05,
"loss": 0.645,
"step": 15400
},
{
"epoch": 1.43,
"learning_rate": 9.999971339334925e-05,
"loss": 0.6554,
"step": 15600
},
{
"epoch": 1.45,
"learning_rate": 9.999970971890503e-05,
"loss": 0.6395,
"step": 15800
},
{
"epoch": 1.47,
"learning_rate": 9.999970604446079e-05,
"loss": 0.6497,
"step": 16000
},
{
"epoch": 1.49,
"learning_rate": 9.999970237001653e-05,
"loss": 0.6485,
"step": 16200
},
{
"epoch": 1.51,
"learning_rate": 9.99996986955723e-05,
"loss": 0.6383,
"step": 16400
},
{
"epoch": 1.52,
"learning_rate": 9.999969502112805e-05,
"loss": 0.64,
"step": 16600
},
{
"epoch": 1.54,
"learning_rate": 9.999969134668382e-05,
"loss": 0.6318,
"step": 16800
},
{
"epoch": 1.56,
"learning_rate": 9.999968767223959e-05,
"loss": 0.6381,
"step": 17000
},
{
"epoch": 1.58,
"learning_rate": 9.999968399779534e-05,
"loss": 0.6484,
"step": 17200
},
{
"epoch": 1.6,
"learning_rate": 9.99996803233511e-05,
"loss": 0.6491,
"step": 17400
},
{
"epoch": 1.62,
"learning_rate": 9.999967664890686e-05,
"loss": 0.6428,
"step": 17600
},
{
"epoch": 1.64,
"learning_rate": 9.999967297446262e-05,
"loss": 0.6356,
"step": 17800
},
{
"epoch": 1.65,
"learning_rate": 9.999966930001838e-05,
"loss": 0.637,
"step": 18000
},
{
"epoch": 1.67,
"learning_rate": 9.999966562557414e-05,
"loss": 0.6364,
"step": 18200
},
{
"epoch": 1.69,
"learning_rate": 9.99996619511299e-05,
"loss": 0.6473,
"step": 18400
},
{
"epoch": 1.71,
"learning_rate": 9.999965827668566e-05,
"loss": 0.6342,
"step": 18600
},
{
"epoch": 1.73,
"learning_rate": 9.99996546022414e-05,
"loss": 0.6395,
"step": 18800
},
{
"epoch": 1.75,
"learning_rate": 9.999965092779717e-05,
"loss": 0.6441,
"step": 19000
},
{
"epoch": 1.76,
"learning_rate": 9.999964725335294e-05,
"loss": 0.6461,
"step": 19200
},
{
"epoch": 1.78,
"learning_rate": 9.99996435789087e-05,
"loss": 0.6157,
"step": 19400
},
{
"epoch": 1.8,
"learning_rate": 9.999963990446446e-05,
"loss": 0.6233,
"step": 19600
},
{
"epoch": 1.82,
"learning_rate": 9.999963623002021e-05,
"loss": 0.6305,
"step": 19800
},
{
"epoch": 1.84,
"learning_rate": 9.999963255557597e-05,
"loss": 0.6277,
"step": 20000
},
{
"epoch": 1.86,
"learning_rate": 9.999962888113173e-05,
"loss": 0.6339,
"step": 20200
},
{
"epoch": 1.87,
"learning_rate": 9.99996252066875e-05,
"loss": 0.6271,
"step": 20400
},
{
"epoch": 1.89,
"learning_rate": 9.999962153224325e-05,
"loss": 0.6145,
"step": 20600
},
{
"epoch": 1.91,
"learning_rate": 9.999961785779901e-05,
"loss": 0.6299,
"step": 20800
},
{
"epoch": 1.93,
"learning_rate": 9.999961418335477e-05,
"loss": 0.6216,
"step": 21000
},
{
"epoch": 1.95,
"learning_rate": 9.999961050891053e-05,
"loss": 0.6167,
"step": 21200
},
{
"epoch": 1.97,
"learning_rate": 9.999960683446629e-05,
"loss": 0.6254,
"step": 21400
},
{
"epoch": 1.98,
"learning_rate": 9.999960316002205e-05,
"loss": 0.6376,
"step": 21600
},
{
"epoch": 2.0,
"eval_loss": 0.5107570290565491,
"eval_runtime": 1433.9446,
"eval_samples_per_second": 121.461,
"eval_steps_per_second": 7.592,
"step": 21772
},
{
"epoch": 2.0,
"learning_rate": 9.999959948557781e-05,
"loss": 0.6037,
"step": 21800
},
{
"epoch": 2.02,
"learning_rate": 9.999959581113357e-05,
"loss": 0.4903,
"step": 22000
},
{
"epoch": 2.04,
"learning_rate": 9.999959213668933e-05,
"loss": 0.4901,
"step": 22200
},
{
"epoch": 2.06,
"learning_rate": 9.999958846224508e-05,
"loss": 0.4826,
"step": 22400
},
{
"epoch": 2.08,
"learning_rate": 9.999958478780085e-05,
"loss": 0.4881,
"step": 22600
},
{
"epoch": 2.09,
"learning_rate": 9.999958111335662e-05,
"loss": 0.4921,
"step": 22800
},
{
"epoch": 2.11,
"learning_rate": 9.999957743891238e-05,
"loss": 0.502,
"step": 23000
},
{
"epoch": 2.13,
"learning_rate": 9.999957376446812e-05,
"loss": 0.4976,
"step": 23200
},
{
"epoch": 2.15,
"learning_rate": 9.999957009002388e-05,
"loss": 0.491,
"step": 23400
},
{
"epoch": 2.17,
"learning_rate": 9.999956641557964e-05,
"loss": 0.5033,
"step": 23600
},
{
"epoch": 2.19,
"learning_rate": 9.99995627411354e-05,
"loss": 0.4924,
"step": 23800
},
{
"epoch": 2.2,
"learning_rate": 9.999955906669118e-05,
"loss": 0.5026,
"step": 24000
},
{
"epoch": 2.22,
"learning_rate": 9.999955539224692e-05,
"loss": 0.4966,
"step": 24200
},
{
"epoch": 2.24,
"learning_rate": 9.999955171780269e-05,
"loss": 0.4963,
"step": 24400
},
{
"epoch": 2.26,
"learning_rate": 9.999954804335845e-05,
"loss": 0.5072,
"step": 24600
},
{
"epoch": 2.28,
"learning_rate": 9.99995443689142e-05,
"loss": 0.4907,
"step": 24800
},
{
"epoch": 2.3,
"learning_rate": 9.999954069446997e-05,
"loss": 0.4938,
"step": 25000
},
{
"epoch": 2.31,
"learning_rate": 9.999953702002573e-05,
"loss": 0.5035,
"step": 25200
},
{
"epoch": 2.33,
"learning_rate": 9.999953334558149e-05,
"loss": 0.5006,
"step": 25400
},
{
"epoch": 2.35,
"learning_rate": 9.999952967113725e-05,
"loss": 0.4992,
"step": 25600
},
{
"epoch": 2.37,
"learning_rate": 9.9999525996693e-05,
"loss": 0.5109,
"step": 25800
},
{
"epoch": 2.39,
"learning_rate": 9.999952232224876e-05,
"loss": 0.4994,
"step": 26000
},
{
"epoch": 2.41,
"learning_rate": 9.999951864780453e-05,
"loss": 0.4925,
"step": 26200
},
{
"epoch": 2.43,
"learning_rate": 9.999951497336029e-05,
"loss": 0.5073,
"step": 26400
},
{
"epoch": 2.44,
"learning_rate": 9.999951129891605e-05,
"loss": 0.5061,
"step": 26600
},
{
"epoch": 2.46,
"learning_rate": 9.99995076244718e-05,
"loss": 0.4954,
"step": 26800
},
{
"epoch": 2.48,
"learning_rate": 9.999950395002756e-05,
"loss": 0.5123,
"step": 27000
},
{
"epoch": 2.5,
"learning_rate": 9.999950027558332e-05,
"loss": 0.5049,
"step": 27200
},
{
"epoch": 2.52,
"learning_rate": 9.999949660113909e-05,
"loss": 0.4972,
"step": 27400
},
{
"epoch": 2.54,
"learning_rate": 9.999949292669484e-05,
"loss": 0.505,
"step": 27600
},
{
"epoch": 2.55,
"learning_rate": 9.99994892522506e-05,
"loss": 0.52,
"step": 27800
},
{
"epoch": 2.57,
"learning_rate": 9.999948557780636e-05,
"loss": 0.5077,
"step": 28000
},
{
"epoch": 2.59,
"learning_rate": 9.999948190336212e-05,
"loss": 0.5159,
"step": 28200
},
{
"epoch": 2.61,
"learning_rate": 9.999947822891788e-05,
"loss": 0.5054,
"step": 28400
},
{
"epoch": 2.63,
"learning_rate": 9.999947455447364e-05,
"loss": 0.4999,
"step": 28600
},
{
"epoch": 2.65,
"learning_rate": 9.99994708800294e-05,
"loss": 0.5035,
"step": 28800
},
{
"epoch": 2.66,
"learning_rate": 9.999946720558516e-05,
"loss": 0.5041,
"step": 29000
},
{
"epoch": 2.68,
"learning_rate": 9.999946353114092e-05,
"loss": 0.4998,
"step": 29200
},
{
"epoch": 2.7,
"learning_rate": 9.999945985669667e-05,
"loss": 0.5098,
"step": 29400
},
{
"epoch": 2.72,
"learning_rate": 9.999945618225244e-05,
"loss": 0.5102,
"step": 29600
},
{
"epoch": 2.74,
"learning_rate": 9.99994525078082e-05,
"loss": 0.5031,
"step": 29800
},
{
"epoch": 2.76,
"learning_rate": 9.999944883336396e-05,
"loss": 0.5015,
"step": 30000
},
{
"epoch": 2.77,
"learning_rate": 9.999944515891971e-05,
"loss": 0.505,
"step": 30200
},
{
"epoch": 2.79,
"learning_rate": 9.999944148447547e-05,
"loss": 0.5146,
"step": 30400
},
{
"epoch": 2.81,
"learning_rate": 9.999943781003123e-05,
"loss": 0.5101,
"step": 30600
},
{
"epoch": 2.83,
"learning_rate": 9.9999434135587e-05,
"loss": 0.5155,
"step": 30800
},
{
"epoch": 2.85,
"learning_rate": 9.999943046114277e-05,
"loss": 0.5076,
"step": 31000
},
{
"epoch": 2.87,
"learning_rate": 9.999942678669851e-05,
"loss": 0.5157,
"step": 31200
},
{
"epoch": 2.88,
"learning_rate": 9.999942311225427e-05,
"loss": 0.5046,
"step": 31400
},
{
"epoch": 2.9,
"learning_rate": 9.999941943781003e-05,
"loss": 0.5078,
"step": 31600
},
{
"epoch": 2.92,
"learning_rate": 9.99994157633658e-05,
"loss": 0.5096,
"step": 31800
},
{
"epoch": 2.94,
"learning_rate": 9.999941208892156e-05,
"loss": 0.5057,
"step": 32000
},
{
"epoch": 2.96,
"learning_rate": 9.999940841447732e-05,
"loss": 0.5171,
"step": 32200
},
{
"epoch": 2.98,
"learning_rate": 9.999940474003308e-05,
"loss": 0.5212,
"step": 32400
},
{
"epoch": 2.99,
"learning_rate": 9.999940106558884e-05,
"loss": 0.5132,
"step": 32600
},
{
"epoch": 3.0,
"eval_loss": 0.39297839999198914,
"eval_runtime": 1401.043,
"eval_samples_per_second": 124.314,
"eval_steps_per_second": 7.77,
"step": 32658
},
{
"epoch": 3.01,
"learning_rate": 9.999939739114458e-05,
"loss": 0.4123,
"step": 32800
},
{
"epoch": 3.03,
"learning_rate": 9.999939371670036e-05,
"loss": 0.3688,
"step": 33000
},
{
"epoch": 3.05,
"learning_rate": 9.999939004225612e-05,
"loss": 0.3752,
"step": 33200
},
{
"epoch": 3.07,
"learning_rate": 9.999938636781188e-05,
"loss": 0.3733,
"step": 33400
},
{
"epoch": 3.09,
"learning_rate": 9.999938269336764e-05,
"loss": 0.371,
"step": 33600
},
{
"epoch": 3.1,
"learning_rate": 9.999937901892339e-05,
"loss": 0.3838,
"step": 33800
},
{
"epoch": 3.12,
"learning_rate": 9.999937534447915e-05,
"loss": 0.3865,
"step": 34000
},
{
"epoch": 3.14,
"learning_rate": 9.999937167003491e-05,
"loss": 0.3859,
"step": 34200
},
{
"epoch": 3.16,
"learning_rate": 9.999936799559068e-05,
"loss": 0.3882,
"step": 34400
},
{
"epoch": 3.18,
"learning_rate": 9.999936432114643e-05,
"loss": 0.3942,
"step": 34600
},
{
"epoch": 3.2,
"learning_rate": 9.999936064670219e-05,
"loss": 0.3843,
"step": 34800
},
{
"epoch": 3.22,
"learning_rate": 9.999935697225795e-05,
"loss": 0.3859,
"step": 35000
},
{
"epoch": 3.23,
"learning_rate": 9.999935329781371e-05,
"loss": 0.3947,
"step": 35200
},
{
"epoch": 3.25,
"learning_rate": 9.999934962336947e-05,
"loss": 0.3934,
"step": 35400
},
{
"epoch": 3.27,
"learning_rate": 9.999934594892523e-05,
"loss": 0.3932,
"step": 35600
},
{
"epoch": 3.29,
"learning_rate": 9.999934227448099e-05,
"loss": 0.3977,
"step": 35800
},
{
"epoch": 3.31,
"learning_rate": 9.999933860003675e-05,
"loss": 0.4046,
"step": 36000
},
{
"epoch": 3.33,
"learning_rate": 9.999933492559251e-05,
"loss": 0.3961,
"step": 36200
},
{
"epoch": 3.34,
"learning_rate": 9.999933125114826e-05,
"loss": 0.398,
"step": 36400
},
{
"epoch": 3.36,
"learning_rate": 9.999932757670403e-05,
"loss": 0.393,
"step": 36600
},
{
"epoch": 3.38,
"learning_rate": 9.999932390225979e-05,
"loss": 0.4043,
"step": 36800
},
{
"epoch": 3.4,
"learning_rate": 9.999932022781555e-05,
"loss": 0.4042,
"step": 37000
},
{
"epoch": 3.42,
"learning_rate": 9.99993165533713e-05,
"loss": 0.3996,
"step": 37200
},
{
"epoch": 3.44,
"learning_rate": 9.999931287892706e-05,
"loss": 0.3995,
"step": 37400
},
{
"epoch": 3.45,
"learning_rate": 9.999930920448282e-05,
"loss": 0.398,
"step": 37600
},
{
"epoch": 3.47,
"learning_rate": 9.99993055300386e-05,
"loss": 0.3978,
"step": 37800
},
{
"epoch": 3.49,
"learning_rate": 9.999930185559434e-05,
"loss": 0.4039,
"step": 38000
},
{
"epoch": 3.51,
"learning_rate": 9.99992981811501e-05,
"loss": 0.3983,
"step": 38200
},
{
"epoch": 3.53,
"learning_rate": 9.999929450670586e-05,
"loss": 0.4099,
"step": 38400
},
{
"epoch": 3.55,
"learning_rate": 9.999929083226162e-05,
"loss": 0.3965,
"step": 38600
},
{
"epoch": 3.56,
"learning_rate": 9.999928715781738e-05,
"loss": 0.4027,
"step": 38800
},
{
"epoch": 3.58,
"learning_rate": 9.999928348337314e-05,
"loss": 0.406,
"step": 39000
},
{
"epoch": 3.6,
"learning_rate": 9.99992798089289e-05,
"loss": 0.415,
"step": 39200
},
{
"epoch": 3.62,
"learning_rate": 9.999927613448467e-05,
"loss": 0.4055,
"step": 39400
},
{
"epoch": 3.64,
"learning_rate": 9.999927246004043e-05,
"loss": 0.4058,
"step": 39600
},
{
"epoch": 3.66,
"learning_rate": 9.999926878559617e-05,
"loss": 0.4051,
"step": 39800
},
{
"epoch": 3.67,
"learning_rate": 9.999926511115195e-05,
"loss": 0.4088,
"step": 40000
},
{
"epoch": 3.69,
"learning_rate": 9.999926143670771e-05,
"loss": 0.4073,
"step": 40200
},
{
"epoch": 3.71,
"learning_rate": 9.999925776226347e-05,
"loss": 0.4075,
"step": 40400
},
{
"epoch": 3.73,
"learning_rate": 9.999925408781923e-05,
"loss": 0.4071,
"step": 40600
},
{
"epoch": 3.75,
"learning_rate": 9.999925041337498e-05,
"loss": 0.4136,
"step": 40800
},
{
"epoch": 3.77,
"learning_rate": 9.999924673893074e-05,
"loss": 0.4143,
"step": 41000
},
{
"epoch": 3.78,
"learning_rate": 9.999924306448651e-05,
"loss": 0.415,
"step": 41200
},
{
"epoch": 3.8,
"learning_rate": 9.999923939004227e-05,
"loss": 0.4157,
"step": 41400
},
{
"epoch": 3.82,
"learning_rate": 9.999923571559802e-05,
"loss": 0.406,
"step": 41600
},
{
"epoch": 3.84,
"learning_rate": 9.999923204115378e-05,
"loss": 0.416,
"step": 41800
},
{
"epoch": 3.86,
"learning_rate": 9.999922836670954e-05,
"loss": 0.4142,
"step": 42000
},
{
"epoch": 3.88,
"learning_rate": 9.99992246922653e-05,
"loss": 0.4109,
"step": 42200
},
{
"epoch": 3.89,
"learning_rate": 9.999922101782106e-05,
"loss": 0.4161,
"step": 42400
},
{
"epoch": 3.91,
"learning_rate": 9.999921734337682e-05,
"loss": 0.408,
"step": 42600
},
{
"epoch": 3.93,
"learning_rate": 9.999921366893258e-05,
"loss": 0.4162,
"step": 42800
},
{
"epoch": 3.95,
"learning_rate": 9.999920999448834e-05,
"loss": 0.4165,
"step": 43000
},
{
"epoch": 3.97,
"learning_rate": 9.99992063200441e-05,
"loss": 0.4167,
"step": 43200
},
{
"epoch": 3.99,
"learning_rate": 9.999920264559986e-05,
"loss": 0.4179,
"step": 43400
},
{
"epoch": 4.0,
"eval_loss": 0.3084418773651123,
"eval_runtime": 1411.8768,
"eval_samples_per_second": 123.36,
"eval_steps_per_second": 7.71,
"step": 43544
},
{
"epoch": 4.01,
"learning_rate": 9.999919897115562e-05,
"loss": 0.3814,
"step": 43600
},
{
"epoch": 4.02,
"learning_rate": 9.999919529671138e-05,
"loss": 0.2863,
"step": 43800
},
{
"epoch": 4.04,
"learning_rate": 9.999919162226714e-05,
"loss": 0.2924,
"step": 44000
},
{
"epoch": 4.06,
"learning_rate": 9.999918794782289e-05,
"loss": 0.2942,
"step": 44200
},
{
"epoch": 4.08,
"learning_rate": 9.999918427337865e-05,
"loss": 0.3034,
"step": 44400
},
{
"epoch": 4.1,
"learning_rate": 9.999918059893441e-05,
"loss": 0.3002,
"step": 44600
},
{
"epoch": 4.12,
"learning_rate": 9.999917692449018e-05,
"loss": 0.3029,
"step": 44800
},
{
"epoch": 4.13,
"learning_rate": 9.999917325004593e-05,
"loss": 0.2977,
"step": 45000
},
{
"epoch": 4.15,
"learning_rate": 9.999916957560169e-05,
"loss": 0.3062,
"step": 45200
},
{
"epoch": 4.17,
"learning_rate": 9.999916590115745e-05,
"loss": 0.3075,
"step": 45400
},
{
"epoch": 4.19,
"learning_rate": 9.999916222671321e-05,
"loss": 0.3137,
"step": 45600
},
{
"epoch": 4.21,
"learning_rate": 9.999915855226897e-05,
"loss": 0.3037,
"step": 45800
},
{
"epoch": 4.23,
"learning_rate": 9.999915487782473e-05,
"loss": 0.31,
"step": 46000
},
{
"epoch": 4.24,
"learning_rate": 9.99991512033805e-05,
"loss": 0.3141,
"step": 46200
},
{
"epoch": 4.26,
"learning_rate": 9.999914752893625e-05,
"loss": 0.3166,
"step": 46400
},
{
"epoch": 4.28,
"learning_rate": 9.999914385449201e-05,
"loss": 0.3125,
"step": 46600
},
{
"epoch": 4.3,
"learning_rate": 9.999914018004776e-05,
"loss": 0.3109,
"step": 46800
},
{
"epoch": 4.32,
"learning_rate": 9.999913650560354e-05,
"loss": 0.3169,
"step": 47000
},
{
"epoch": 4.34,
"learning_rate": 9.99991328311593e-05,
"loss": 0.3211,
"step": 47200
},
{
"epoch": 4.35,
"learning_rate": 9.999912915671506e-05,
"loss": 0.3278,
"step": 47400
},
{
"epoch": 4.37,
"learning_rate": 9.999912548227082e-05,
"loss": 0.3154,
"step": 47600
},
{
"epoch": 4.39,
"learning_rate": 9.999912180782656e-05,
"loss": 0.3167,
"step": 47800
},
{
"epoch": 4.41,
"learning_rate": 9.999911813338232e-05,
"loss": 0.3172,
"step": 48000
},
{
"epoch": 4.43,
"learning_rate": 9.99991144589381e-05,
"loss": 0.3243,
"step": 48200
},
{
"epoch": 4.45,
"learning_rate": 9.999911078449386e-05,
"loss": 0.3286,
"step": 48400
},
{
"epoch": 4.46,
"learning_rate": 9.99991071100496e-05,
"loss": 0.3268,
"step": 48600
},
{
"epoch": 4.48,
"learning_rate": 9.999910343560537e-05,
"loss": 0.3204,
"step": 48800
},
{
"epoch": 4.5,
"learning_rate": 9.999909976116113e-05,
"loss": 0.3311,
"step": 49000
},
{
"epoch": 4.52,
"learning_rate": 9.999909608671689e-05,
"loss": 0.3241,
"step": 49200
},
{
"epoch": 4.54,
"learning_rate": 9.999909241227265e-05,
"loss": 0.3305,
"step": 49400
},
{
"epoch": 4.56,
"learning_rate": 9.999908873782841e-05,
"loss": 0.3238,
"step": 49600
},
{
"epoch": 4.57,
"learning_rate": 9.999908506338417e-05,
"loss": 0.3283,
"step": 49800
},
{
"epoch": 4.59,
"learning_rate": 9.999908138893993e-05,
"loss": 0.3275,
"step": 50000
},
{
"epoch": 4.61,
"learning_rate": 9.999907771449569e-05,
"loss": 0.3302,
"step": 50200
},
{
"epoch": 4.63,
"learning_rate": 9.999907404005145e-05,
"loss": 0.3342,
"step": 50400
},
{
"epoch": 4.65,
"learning_rate": 9.999907036560721e-05,
"loss": 0.3363,
"step": 50600
},
{
"epoch": 4.67,
"learning_rate": 9.999906669116297e-05,
"loss": 0.3319,
"step": 50800
},
{
"epoch": 4.68,
"learning_rate": 9.999906301671873e-05,
"loss": 0.3288,
"step": 51000
},
{
"epoch": 4.7,
"learning_rate": 9.999905934227448e-05,
"loss": 0.3349,
"step": 51200
},
{
"epoch": 4.72,
"learning_rate": 9.999905566783024e-05,
"loss": 0.3289,
"step": 51400
},
{
"epoch": 4.74,
"learning_rate": 9.999905199338601e-05,
"loss": 0.3393,
"step": 51600
},
{
"epoch": 4.76,
"learning_rate": 9.999904831894177e-05,
"loss": 0.3388,
"step": 51800
},
{
"epoch": 4.78,
"learning_rate": 9.999904464449752e-05,
"loss": 0.3322,
"step": 52000
},
{
"epoch": 4.8,
"learning_rate": 9.999904097005328e-05,
"loss": 0.332,
"step": 52200
},
{
"epoch": 4.81,
"learning_rate": 9.999903729560904e-05,
"loss": 0.3378,
"step": 52400
},
{
"epoch": 4.83,
"learning_rate": 9.99990336211648e-05,
"loss": 0.3369,
"step": 52600
},
{
"epoch": 4.85,
"learning_rate": 9.999902994672056e-05,
"loss": 0.337,
"step": 52800
},
{
"epoch": 4.87,
"learning_rate": 9.999902627227632e-05,
"loss": 0.3434,
"step": 53000
},
{
"epoch": 4.89,
"learning_rate": 9.999902259783208e-05,
"loss": 0.3326,
"step": 53200
},
{
"epoch": 4.91,
"learning_rate": 9.999901892338784e-05,
"loss": 0.3382,
"step": 53400
},
{
"epoch": 4.92,
"learning_rate": 9.99990152489436e-05,
"loss": 0.3443,
"step": 53600
},
{
"epoch": 4.94,
"learning_rate": 9.999901157449936e-05,
"loss": 0.3405,
"step": 53800
},
{
"epoch": 4.96,
"learning_rate": 9.999900790005512e-05,
"loss": 0.3454,
"step": 54000
},
{
"epoch": 4.98,
"learning_rate": 9.999900422561088e-05,
"loss": 0.3499,
"step": 54200
},
{
"epoch": 5.0,
"learning_rate": 9.999900055116665e-05,
"loss": 0.3442,
"step": 54400
},
{
"epoch": 5.0,
"eval_loss": 0.24815598130226135,
"eval_runtime": 1424.5159,
"eval_samples_per_second": 122.265,
"eval_steps_per_second": 7.642,
"step": 54430
},
{
"epoch": 5.02,
"learning_rate": 9.999899687672239e-05,
"loss": 0.2559,
"step": 54600
},
{
"epoch": 5.03,
"learning_rate": 9.999899320227815e-05,
"loss": 0.2383,
"step": 54800
},
{
"epoch": 5.05,
"learning_rate": 9.999898952783391e-05,
"loss": 0.2416,
"step": 55000
},
{
"epoch": 5.07,
"learning_rate": 9.999898585338969e-05,
"loss": 0.237,
"step": 55200
},
{
"epoch": 5.09,
"learning_rate": 9.999898217894545e-05,
"loss": 0.2445,
"step": 55400
},
{
"epoch": 5.11,
"learning_rate": 9.99989785045012e-05,
"loss": 0.2412,
"step": 55600
},
{
"epoch": 5.13,
"learning_rate": 9.999897483005695e-05,
"loss": 0.2431,
"step": 55800
},
{
"epoch": 5.14,
"learning_rate": 9.999897115561272e-05,
"loss": 0.2449,
"step": 56000
},
{
"epoch": 5.16,
"learning_rate": 9.999896748116848e-05,
"loss": 0.2461,
"step": 56200
},
{
"epoch": 5.18,
"learning_rate": 9.999896380672424e-05,
"loss": 0.2527,
"step": 56400
},
{
"epoch": 5.2,
"learning_rate": 9.999896013228e-05,
"loss": 0.2522,
"step": 56600
},
{
"epoch": 5.22,
"learning_rate": 9.999895645783576e-05,
"loss": 0.2485,
"step": 56800
},
{
"epoch": 5.24,
"learning_rate": 9.999895278339152e-05,
"loss": 0.2517,
"step": 57000
},
{
"epoch": 5.25,
"learning_rate": 9.999894910894728e-05,
"loss": 0.2572,
"step": 57200
},
{
"epoch": 5.27,
"learning_rate": 9.999894543450304e-05,
"loss": 0.2574,
"step": 57400
},
{
"epoch": 5.29,
"learning_rate": 9.99989417600588e-05,
"loss": 0.2551,
"step": 57600
},
{
"epoch": 5.31,
"learning_rate": 9.999893808561456e-05,
"loss": 0.2584,
"step": 57800
},
{
"epoch": 5.33,
"learning_rate": 9.999893441117032e-05,
"loss": 0.2607,
"step": 58000
},
{
"epoch": 5.35,
"learning_rate": 9.999893073672607e-05,
"loss": 0.2631,
"step": 58200
},
{
"epoch": 5.36,
"learning_rate": 9.999892706228183e-05,
"loss": 0.2609,
"step": 58400
},
{
"epoch": 5.38,
"learning_rate": 9.99989233878376e-05,
"loss": 0.265,
"step": 58600
},
{
"epoch": 5.4,
"learning_rate": 9.999891971339336e-05,
"loss": 0.2625,
"step": 58800
},
{
"epoch": 5.42,
"learning_rate": 9.999891603894911e-05,
"loss": 0.2648,
"step": 59000
},
{
"epoch": 5.44,
"learning_rate": 9.999891236450487e-05,
"loss": 0.2677,
"step": 59200
},
{
"epoch": 5.46,
"learning_rate": 9.999890869006063e-05,
"loss": 0.2667,
"step": 59400
},
{
"epoch": 5.47,
"learning_rate": 9.999890501561639e-05,
"loss": 0.2623,
"step": 59600
},
{
"epoch": 5.49,
"learning_rate": 9.999890134117216e-05,
"loss": 0.2713,
"step": 59800
},
{
"epoch": 5.51,
"learning_rate": 9.999889766672791e-05,
"loss": 0.2659,
"step": 60000
},
{
"epoch": 5.53,
"learning_rate": 9.999889399228367e-05,
"loss": 0.2688,
"step": 60200
},
{
"epoch": 5.55,
"learning_rate": 9.999889031783943e-05,
"loss": 0.2716,
"step": 60400
},
{
"epoch": 5.57,
"learning_rate": 9.999888664339519e-05,
"loss": 0.2723,
"step": 60600
},
{
"epoch": 5.59,
"learning_rate": 9.999888296895095e-05,
"loss": 0.2724,
"step": 60800
},
{
"epoch": 5.6,
"learning_rate": 9.999887929450671e-05,
"loss": 0.2697,
"step": 61000
},
{
"epoch": 5.62,
"learning_rate": 9.999887562006247e-05,
"loss": 0.2749,
"step": 61200
},
{
"epoch": 5.64,
"learning_rate": 9.999887194561823e-05,
"loss": 0.273,
"step": 61400
},
{
"epoch": 5.66,
"learning_rate": 9.999886827117398e-05,
"loss": 0.2816,
"step": 61600
},
{
"epoch": 5.68,
"learning_rate": 9.999886459672974e-05,
"loss": 0.2742,
"step": 61800
},
{
"epoch": 5.7,
"learning_rate": 9.999886092228552e-05,
"loss": 0.2788,
"step": 62000
},
{
"epoch": 5.71,
"learning_rate": 9.999885724784128e-05,
"loss": 0.2782,
"step": 62200
},
{
"epoch": 5.73,
"learning_rate": 9.999885357339704e-05,
"loss": 0.2804,
"step": 62400
},
{
"epoch": 5.75,
"learning_rate": 9.999884989895278e-05,
"loss": 0.2777,
"step": 62600
},
{
"epoch": 5.77,
"learning_rate": 9.999884622450854e-05,
"loss": 0.2822,
"step": 62800
},
{
"epoch": 5.79,
"learning_rate": 9.99988425500643e-05,
"loss": 0.2811,
"step": 63000
},
{
"epoch": 5.81,
"learning_rate": 9.999883887562006e-05,
"loss": 0.2795,
"step": 63200
},
{
"epoch": 5.82,
"learning_rate": 9.999883520117583e-05,
"loss": 0.2846,
"step": 63400
},
{
"epoch": 5.84,
"learning_rate": 9.999883152673159e-05,
"loss": 0.2887,
"step": 63600
},
{
"epoch": 5.86,
"learning_rate": 9.999882785228735e-05,
"loss": 0.2811,
"step": 63800
},
{
"epoch": 5.88,
"learning_rate": 9.99988241778431e-05,
"loss": 0.283,
"step": 64000
},
{
"epoch": 5.9,
"learning_rate": 9.999882050339887e-05,
"loss": 0.2822,
"step": 64200
},
{
"epoch": 5.92,
"learning_rate": 9.999881682895463e-05,
"loss": 0.2853,
"step": 64400
},
{
"epoch": 5.93,
"learning_rate": 9.999881315451039e-05,
"loss": 0.2869,
"step": 64600
},
{
"epoch": 5.95,
"learning_rate": 9.999880948006615e-05,
"loss": 0.283,
"step": 64800
},
{
"epoch": 5.97,
"learning_rate": 9.999880580562191e-05,
"loss": 0.2893,
"step": 65000
},
{
"epoch": 5.99,
"learning_rate": 9.999880213117766e-05,
"loss": 0.2863,
"step": 65200
},
{
"epoch": 6.0,
"eval_loss": 0.20682939887046814,
"eval_runtime": 1457.2635,
"eval_samples_per_second": 119.518,
"eval_steps_per_second": 7.47,
"step": 65316
},
{
"epoch": 6.01,
"learning_rate": 9.999879845673342e-05,
"loss": 0.2505,
"step": 65400
},
{
"epoch": 6.03,
"learning_rate": 9.999879478228919e-05,
"loss": 0.1957,
"step": 65600
},
{
"epoch": 6.04,
"learning_rate": 9.999879110784495e-05,
"loss": 0.1986,
"step": 65800
},
{
"epoch": 6.06,
"learning_rate": 9.99987874334007e-05,
"loss": 0.1984,
"step": 66000
},
{
"epoch": 6.08,
"learning_rate": 9.999878375895646e-05,
"loss": 0.1983,
"step": 66200
},
{
"epoch": 6.1,
"learning_rate": 9.999878008451222e-05,
"loss": 0.203,
"step": 66400
},
{
"epoch": 6.12,
"learning_rate": 9.999877641006798e-05,
"loss": 0.203,
"step": 66600
},
{
"epoch": 6.14,
"learning_rate": 9.999877273562375e-05,
"loss": 0.2047,
"step": 66800
},
{
"epoch": 6.15,
"learning_rate": 9.99987690611795e-05,
"loss": 0.208,
"step": 67000
},
{
"epoch": 6.17,
"learning_rate": 9.999876538673526e-05,
"loss": 0.2103,
"step": 67200
},
{
"epoch": 6.19,
"learning_rate": 9.999876171229102e-05,
"loss": 0.2141,
"step": 67400
},
{
"epoch": 6.21,
"learning_rate": 9.999875803784678e-05,
"loss": 0.2101,
"step": 67600
},
{
"epoch": 6.23,
"learning_rate": 9.999875436340254e-05,
"loss": 0.2157,
"step": 67800
},
{
"epoch": 6.25,
"learning_rate": 9.99987506889583e-05,
"loss": 0.2173,
"step": 68000
},
{
"epoch": 6.26,
"learning_rate": 9.999874701451406e-05,
"loss": 0.2143,
"step": 68200
},
{
"epoch": 6.28,
"learning_rate": 9.999874334006982e-05,
"loss": 0.2156,
"step": 68400
},
{
"epoch": 6.3,
"learning_rate": 9.999873966562557e-05,
"loss": 0.2181,
"step": 68600
},
{
"epoch": 6.32,
"learning_rate": 9.999873599118133e-05,
"loss": 0.2215,
"step": 68800
},
{
"epoch": 6.34,
"learning_rate": 9.99987323167371e-05,
"loss": 0.2214,
"step": 69000
},
{
"epoch": 6.36,
"learning_rate": 9.999872864229286e-05,
"loss": 0.221,
"step": 69200
},
{
"epoch": 6.38,
"learning_rate": 9.999872496784863e-05,
"loss": 0.2254,
"step": 69400
},
{
"epoch": 6.39,
"learning_rate": 9.999872129340437e-05,
"loss": 0.2235,
"step": 69600
},
{
"epoch": 6.41,
"learning_rate": 9.999871761896013e-05,
"loss": 0.2231,
"step": 69800
},
{
"epoch": 6.43,
"learning_rate": 9.999871394451589e-05,
"loss": 0.2267,
"step": 70000
},
{
"epoch": 6.45,
"learning_rate": 9.999871027007167e-05,
"loss": 0.2239,
"step": 70200
},
{
"epoch": 6.47,
"learning_rate": 9.999870659562741e-05,
"loss": 0.227,
"step": 70400
},
{
"epoch": 6.49,
"learning_rate": 9.999870292118317e-05,
"loss": 0.2277,
"step": 70600
},
{
"epoch": 6.5,
"learning_rate": 9.999869924673893e-05,
"loss": 0.2245,
"step": 70800
},
{
"epoch": 6.52,
"learning_rate": 9.99986955722947e-05,
"loss": 0.2284,
"step": 71000
},
{
"epoch": 6.54,
"learning_rate": 9.999869189785046e-05,
"loss": 0.2308,
"step": 71200
},
{
"epoch": 6.56,
"learning_rate": 9.999868822340622e-05,
"loss": 0.2348,
"step": 71400
},
{
"epoch": 6.58,
"learning_rate": 9.999868454896198e-05,
"loss": 0.2289,
"step": 71600
},
{
"epoch": 6.6,
"learning_rate": 9.999868087451774e-05,
"loss": 0.2311,
"step": 71800
},
{
"epoch": 6.61,
"learning_rate": 9.99986772000735e-05,
"loss": 0.2308,
"step": 72000
}
],
"max_steps": 5443000000,
"num_train_epochs": 500000,
"total_flos": 2.6739082122913382e+17,
"trial_name": null,
"trial_params": null
}