gokuls's picture
End of training
383e7cd
{
"best_metric": NaN,
"best_model_checkpoint": "add_bert_12_layer_model_complete_training_new/checkpoint-10000",
"epoch": 1.5293301581753038,
"global_step": 140001,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 2.5e-05,
"loss": 8.2896,
"step": 500
},
{
"epoch": 0.01,
"learning_rate": 5e-05,
"loss": 6.8372,
"step": 1000
},
{
"epoch": 0.02,
"learning_rate": 7.5e-05,
"loss": 6.683,
"step": 1500
},
{
"epoch": 0.02,
"learning_rate": 0.0001,
"loss": 6.5955,
"step": 2000
},
{
"epoch": 0.03,
"learning_rate": 0.000125,
"loss": 6.5373,
"step": 2500
},
{
"epoch": 0.03,
"learning_rate": 0.00015,
"loss": 6.4804,
"step": 3000
},
{
"epoch": 0.04,
"learning_rate": 0.000175,
"loss": 6.4463,
"step": 3500
},
{
"epoch": 0.04,
"learning_rate": 0.0002,
"loss": 6.408,
"step": 4000
},
{
"epoch": 0.05,
"learning_rate": 0.00022500000000000002,
"loss": 6.3827,
"step": 4500
},
{
"epoch": 0.05,
"learning_rate": 0.00025,
"loss": 6.3731,
"step": 5000
},
{
"epoch": 0.06,
"learning_rate": 0.000275,
"loss": 6.3479,
"step": 5500
},
{
"epoch": 0.07,
"learning_rate": 0.0003,
"loss": 6.3401,
"step": 6000
},
{
"epoch": 0.07,
"learning_rate": 0.0003239,
"loss": 7.0795,
"step": 6500
},
{
"epoch": 0.08,
"learning_rate": 0.00034155000000000003,
"loss": 10.0665,
"step": 7000
},
{
"epoch": 0.08,
"learning_rate": 0.00036655,
"loss": 0.0,
"step": 7500
},
{
"epoch": 0.09,
"learning_rate": 0.00039155,
"loss": 0.0,
"step": 8000
},
{
"epoch": 0.09,
"learning_rate": 0.00041654999999999996,
"loss": 0.0,
"step": 8500
},
{
"epoch": 0.1,
"learning_rate": 0.00044155,
"loss": 0.0,
"step": 9000
},
{
"epoch": 0.1,
"learning_rate": 0.00046655000000000004,
"loss": 0.0,
"step": 9500
},
{
"epoch": 0.11,
"learning_rate": 0.00049155,
"loss": 0.0,
"step": 10000
},
{
"epoch": 0.11,
"eval_accuracy": 3.0592783198016594e-05,
"eval_loss": NaN,
"eval_runtime": 1245.7969,
"eval_samples_per_second": 247.517,
"eval_steps_per_second": 3.868,
"step": 10000
},
{
"epoch": 0.11,
"learning_rate": 0.0004996303493254713,
"loss": 0.0,
"step": 10500
},
{
"epoch": 0.12,
"learning_rate": 0.0004990719646207451,
"loss": 0.0,
"step": 11000
},
{
"epoch": 0.13,
"learning_rate": 0.000498513579916019,
"loss": 0.0,
"step": 11500
},
{
"epoch": 0.13,
"learning_rate": 0.0004979551952112928,
"loss": 0.0,
"step": 12000
},
{
"epoch": 0.14,
"learning_rate": 0.0004973968105065666,
"loss": 0.0,
"step": 12500
},
{
"epoch": 0.14,
"learning_rate": 0.0004968384258018405,
"loss": 0.0,
"step": 13000
},
{
"epoch": 0.15,
"learning_rate": 0.0004962800410971143,
"loss": 0.0,
"step": 13500
},
{
"epoch": 0.15,
"learning_rate": 0.0004957216563923881,
"loss": 0.0,
"step": 14000
},
{
"epoch": 0.16,
"learning_rate": 0.0004951632716876619,
"loss": 0.0,
"step": 14500
},
{
"epoch": 0.16,
"learning_rate": 0.0004946048869829357,
"loss": 0.0,
"step": 15000
},
{
"epoch": 0.17,
"learning_rate": 0.0004940465022782096,
"loss": 0.0,
"step": 15500
},
{
"epoch": 0.17,
"learning_rate": 0.0004934881175734834,
"loss": 0.0,
"step": 16000
},
{
"epoch": 0.18,
"learning_rate": 0.0004929297328687572,
"loss": 0.0,
"step": 16500
},
{
"epoch": 0.19,
"learning_rate": 0.0004923713481640312,
"loss": 0.0,
"step": 17000
},
{
"epoch": 0.19,
"learning_rate": 0.000491812963459305,
"loss": 0.0,
"step": 17500
},
{
"epoch": 0.2,
"learning_rate": 0.0004912545787545788,
"loss": 0.0,
"step": 18000
},
{
"epoch": 0.2,
"learning_rate": 0.0004906961940498526,
"loss": 0.0,
"step": 18500
},
{
"epoch": 0.21,
"learning_rate": 0.0004901378093451264,
"loss": 0.0,
"step": 19000
},
{
"epoch": 0.21,
"learning_rate": 0.0004895794246404003,
"loss": 0.0,
"step": 19500
},
{
"epoch": 0.22,
"learning_rate": 0.0004890210399356741,
"loss": 0.0,
"step": 20000
},
{
"epoch": 0.22,
"eval_accuracy": 3.5159707526351004e-05,
"eval_loss": NaN,
"eval_runtime": 1244.1268,
"eval_samples_per_second": 247.849,
"eval_steps_per_second": 3.873,
"step": 20000
},
{
"epoch": 0.22,
"learning_rate": 0.0004884626552309479,
"loss": 0.0,
"step": 20500
},
{
"epoch": 0.23,
"learning_rate": 0.00048790427052622175,
"loss": 0.0,
"step": 21000
},
{
"epoch": 0.23,
"learning_rate": 0.00048734588582149556,
"loss": 0.0,
"step": 21500
},
{
"epoch": 0.24,
"learning_rate": 0.0004867875011167694,
"loss": 0.0,
"step": 22000
},
{
"epoch": 0.25,
"learning_rate": 0.0004862291164120433,
"loss": 0.0,
"step": 22500
},
{
"epoch": 0.25,
"learning_rate": 0.0004856707317073171,
"loss": 0.0,
"step": 23000
},
{
"epoch": 0.26,
"learning_rate": 0.00048511234700259094,
"loss": 0.0,
"step": 23500
},
{
"epoch": 0.26,
"learning_rate": 0.00048455396229786475,
"loss": 0.0,
"step": 24000
},
{
"epoch": 0.27,
"learning_rate": 0.00048399557759313855,
"loss": 0.0,
"step": 24500
},
{
"epoch": 0.27,
"learning_rate": 0.0004834371928884124,
"loss": 0.0,
"step": 25000
},
{
"epoch": 0.28,
"learning_rate": 0.0004828788081836862,
"loss": 0.0,
"step": 25500
},
{
"epoch": 0.28,
"learning_rate": 0.0004823204234789601,
"loss": 0.0,
"step": 26000
},
{
"epoch": 0.29,
"learning_rate": 0.00048176203877423393,
"loss": 0.0,
"step": 26500
},
{
"epoch": 0.29,
"learning_rate": 0.00048120365406950774,
"loss": 0.0,
"step": 27000
},
{
"epoch": 0.3,
"learning_rate": 0.0004806452693647816,
"loss": 0.0,
"step": 27500
},
{
"epoch": 0.31,
"learning_rate": 0.0004800868846600554,
"loss": 0.0,
"step": 28000
},
{
"epoch": 0.31,
"learning_rate": 0.0004795284999553292,
"loss": 0.0,
"step": 28500
},
{
"epoch": 0.32,
"learning_rate": 0.00047897011525060306,
"loss": 0.0,
"step": 29000
},
{
"epoch": 0.32,
"learning_rate": 0.00047841173054587687,
"loss": 0.0,
"step": 29500
},
{
"epoch": 0.33,
"learning_rate": 0.0004778533458411508,
"loss": 0.0,
"step": 30000
},
{
"epoch": 0.33,
"eval_accuracy": 3.357992413338889e-05,
"eval_loss": NaN,
"eval_runtime": 1245.2057,
"eval_samples_per_second": 247.635,
"eval_steps_per_second": 3.87,
"step": 30000
},
{
"epoch": 0.33,
"learning_rate": 0.0004772949611364246,
"loss": 0.0,
"step": 30500
},
{
"epoch": 0.34,
"learning_rate": 0.0004767365764316984,
"loss": 0.0,
"step": 31000
},
{
"epoch": 0.34,
"learning_rate": 0.00047617819172697225,
"loss": 0.0,
"step": 31500
},
{
"epoch": 0.35,
"learning_rate": 0.00047561980702224606,
"loss": 0.0,
"step": 32000
},
{
"epoch": 0.36,
"learning_rate": 0.00047506142231751986,
"loss": 0.0,
"step": 32500
},
{
"epoch": 0.36,
"learning_rate": 0.0004745030376127937,
"loss": 0.0,
"step": 33000
},
{
"epoch": 0.37,
"learning_rate": 0.0004739446529080675,
"loss": 0.0,
"step": 33500
},
{
"epoch": 0.37,
"learning_rate": 0.0004733862682033414,
"loss": 0.0,
"step": 34000
},
{
"epoch": 0.38,
"learning_rate": 0.00047282788349861524,
"loss": 0.0,
"step": 34500
},
{
"epoch": 0.38,
"learning_rate": 0.00047226949879388905,
"loss": 0.0,
"step": 35000
},
{
"epoch": 0.39,
"learning_rate": 0.0004717111140891629,
"loss": 0.0,
"step": 35500
},
{
"epoch": 0.39,
"learning_rate": 0.0004711527293844367,
"loss": 0.0,
"step": 36000
},
{
"epoch": 0.4,
"learning_rate": 0.0004705943446797105,
"loss": 0.0,
"step": 36500
},
{
"epoch": 0.4,
"learning_rate": 0.0004700359599749844,
"loss": 0.0,
"step": 37000
},
{
"epoch": 0.41,
"learning_rate": 0.0004694775752702582,
"loss": 0.0,
"step": 37500
},
{
"epoch": 0.42,
"learning_rate": 0.00046891919056553204,
"loss": 0.0,
"step": 38000
},
{
"epoch": 0.42,
"learning_rate": 0.0004683608058608059,
"loss": 0.0,
"step": 38500
},
{
"epoch": 0.43,
"learning_rate": 0.0004678024211560797,
"loss": 0.0,
"step": 39000
},
{
"epoch": 0.43,
"learning_rate": 0.00046724403645135356,
"loss": 0.0,
"step": 39500
},
{
"epoch": 0.44,
"learning_rate": 0.00046668565174662736,
"loss": 0.0,
"step": 40000
},
{
"epoch": 0.44,
"eval_accuracy": 3.1386399970587474e-05,
"eval_loss": NaN,
"eval_runtime": 1244.2525,
"eval_samples_per_second": 247.824,
"eval_steps_per_second": 3.873,
"step": 40000
},
{
"epoch": 0.44,
"learning_rate": 0.00046612726704190117,
"loss": 0.0,
"step": 40500
},
{
"epoch": 0.45,
"learning_rate": 0.00046556888233717503,
"loss": 0.0,
"step": 41000
},
{
"epoch": 0.45,
"learning_rate": 0.00046501049763244883,
"loss": 0.0,
"step": 41500
},
{
"epoch": 0.46,
"learning_rate": 0.0004644521129277227,
"loss": 0.0,
"step": 42000
},
{
"epoch": 0.46,
"learning_rate": 0.00046389372822299655,
"loss": 0.0,
"step": 42500
},
{
"epoch": 0.47,
"learning_rate": 0.00046333534351827036,
"loss": 0.0,
"step": 43000
},
{
"epoch": 0.48,
"learning_rate": 0.0004627769588135442,
"loss": 0.0,
"step": 43500
},
{
"epoch": 0.48,
"learning_rate": 0.000462218574108818,
"loss": 0.0,
"step": 44000
},
{
"epoch": 0.49,
"learning_rate": 0.0004616601894040919,
"loss": 0.0,
"step": 44500
},
{
"epoch": 0.49,
"learning_rate": 0.0004611018046993657,
"loss": 0.0,
"step": 45000
},
{
"epoch": 0.5,
"learning_rate": 0.0004605434199946395,
"loss": 0.0,
"step": 45500
},
{
"epoch": 0.5,
"learning_rate": 0.00045998503528991335,
"loss": 0.0,
"step": 46000
},
{
"epoch": 0.51,
"learning_rate": 0.00045942665058518715,
"loss": 0.0,
"step": 46500
},
{
"epoch": 0.51,
"learning_rate": 0.000458868265880461,
"loss": 0.0,
"step": 47000
},
{
"epoch": 0.52,
"learning_rate": 0.00045830988117573487,
"loss": 0.0,
"step": 47500
},
{
"epoch": 0.52,
"learning_rate": 0.0004577514964710087,
"loss": 0.0,
"step": 48000
},
{
"epoch": 0.53,
"learning_rate": 0.00045719311176628253,
"loss": 0.0,
"step": 48500
},
{
"epoch": 0.54,
"learning_rate": 0.00045663472706155634,
"loss": 0.0,
"step": 49000
},
{
"epoch": 0.54,
"learning_rate": 0.00045607634235683014,
"loss": 0.0,
"step": 49500
},
{
"epoch": 0.55,
"learning_rate": 0.000455517957652104,
"loss": 0.0,
"step": 50000
},
{
"epoch": 0.55,
"eval_accuracy": 2.9965971727327976e-05,
"eval_loss": NaN,
"eval_runtime": 1242.0864,
"eval_samples_per_second": 248.256,
"eval_steps_per_second": 3.88,
"step": 50000
},
{
"epoch": 0.55,
"learning_rate": 0.0004549595729473778,
"loss": 0.0,
"step": 50500
},
{
"epoch": 0.56,
"learning_rate": 0.00045440118824265167,
"loss": 0.0,
"step": 51000
},
{
"epoch": 0.56,
"learning_rate": 0.0004538428035379255,
"loss": 0.0,
"step": 51500
},
{
"epoch": 0.57,
"learning_rate": 0.00045328441883319933,
"loss": 0.0,
"step": 52000
},
{
"epoch": 0.57,
"learning_rate": 0.0004527260341284732,
"loss": 0.0,
"step": 52500
},
{
"epoch": 0.58,
"learning_rate": 0.000452167649423747,
"loss": 0.0,
"step": 53000
},
{
"epoch": 0.58,
"learning_rate": 0.0004516092647190208,
"loss": 0.0,
"step": 53500
},
{
"epoch": 0.59,
"learning_rate": 0.00045105088001429466,
"loss": 0.0,
"step": 54000
},
{
"epoch": 0.6,
"learning_rate": 0.00045049249530956846,
"loss": 0.0,
"step": 54500
},
{
"epoch": 0.6,
"learning_rate": 0.0004499341106048423,
"loss": 0.0,
"step": 55000
},
{
"epoch": 0.61,
"learning_rate": 0.0004493757259001162,
"loss": 0.0,
"step": 55500
},
{
"epoch": 0.61,
"learning_rate": 0.00044881734119539,
"loss": 0.0,
"step": 56000
},
{
"epoch": 0.62,
"learning_rate": 0.00044825895649066384,
"loss": 0.0,
"step": 56500
},
{
"epoch": 0.62,
"learning_rate": 0.00044770057178593765,
"loss": 0.0,
"step": 57000
},
{
"epoch": 0.63,
"learning_rate": 0.00044714218708121145,
"loss": 0.0,
"step": 57500
},
{
"epoch": 0.63,
"learning_rate": 0.0004465838023764853,
"loss": 0.0,
"step": 58000
},
{
"epoch": 0.64,
"learning_rate": 0.0004460254176717591,
"loss": 0.0,
"step": 58500
},
{
"epoch": 0.64,
"learning_rate": 0.00044546703296703303,
"loss": 0.0,
"step": 59000
},
{
"epoch": 0.65,
"learning_rate": 0.00044490864826230683,
"loss": 0.0,
"step": 59500
},
{
"epoch": 0.66,
"learning_rate": 0.00044435026355758064,
"loss": 0.0,
"step": 60000
},
{
"epoch": 0.66,
"eval_accuracy": 3.177880317382685e-05,
"eval_loss": NaN,
"eval_runtime": 1238.4379,
"eval_samples_per_second": 248.988,
"eval_steps_per_second": 3.891,
"step": 60000
},
{
"epoch": 0.66,
"learning_rate": 0.0004437918788528545,
"loss": 0.0,
"step": 60500
},
{
"epoch": 0.67,
"learning_rate": 0.0004432334941481283,
"loss": 0.0,
"step": 61000
},
{
"epoch": 0.67,
"learning_rate": 0.0004426751094434021,
"loss": 0.0,
"step": 61500
},
{
"epoch": 0.68,
"learning_rate": 0.00044211672473867597,
"loss": 0.0,
"step": 62000
},
{
"epoch": 0.68,
"learning_rate": 0.00044155834003394977,
"loss": 0.0,
"step": 62500
},
{
"epoch": 0.69,
"learning_rate": 0.00044099995532922363,
"loss": 0.0,
"step": 63000
},
{
"epoch": 0.69,
"learning_rate": 0.0004404415706244975,
"loss": 0.0,
"step": 63500
},
{
"epoch": 0.7,
"learning_rate": 0.0004398831859197713,
"loss": 0.0,
"step": 64000
},
{
"epoch": 0.7,
"learning_rate": 0.00043932480121504515,
"loss": 0.0,
"step": 64500
},
{
"epoch": 0.71,
"learning_rate": 0.00043876641651031896,
"loss": 0.0,
"step": 65000
},
{
"epoch": 0.72,
"learning_rate": 0.00043820803180559276,
"loss": 0.0,
"step": 65500
},
{
"epoch": 0.72,
"learning_rate": 0.0004376496471008666,
"loss": 0.0,
"step": 66000
},
{
"epoch": 0.73,
"learning_rate": 0.0004370912623961404,
"loss": 0.0,
"step": 66500
},
{
"epoch": 0.73,
"learning_rate": 0.0004365328776914143,
"loss": 0.0,
"step": 67000
},
{
"epoch": 0.74,
"learning_rate": 0.00043597449298668814,
"loss": 0.0,
"step": 67500
},
{
"epoch": 0.74,
"learning_rate": 0.00043541610828196195,
"loss": 0.0,
"step": 68000
},
{
"epoch": 0.75,
"learning_rate": 0.0004348577235772358,
"loss": 0.0,
"step": 68500
},
{
"epoch": 0.75,
"learning_rate": 0.0004342993388725096,
"loss": 0.0,
"step": 69000
},
{
"epoch": 0.76,
"learning_rate": 0.0004337409541677834,
"loss": 0.0,
"step": 69500
},
{
"epoch": 0.76,
"learning_rate": 0.0004331825694630573,
"loss": 0.0,
"step": 70000
},
{
"epoch": 0.76,
"eval_accuracy": 3.253472148144988e-05,
"eval_loss": NaN,
"eval_runtime": 1240.7983,
"eval_samples_per_second": 248.514,
"eval_steps_per_second": 3.884,
"step": 70000
},
{
"epoch": 0.77,
"learning_rate": 0.0004326241847583311,
"loss": 0.0,
"step": 70500
},
{
"epoch": 0.78,
"learning_rate": 0.00043206580005360494,
"loss": 0.0,
"step": 71000
},
{
"epoch": 0.78,
"learning_rate": 0.0004315074153488788,
"loss": 0.0,
"step": 71500
},
{
"epoch": 0.79,
"learning_rate": 0.0004309490306441526,
"loss": 0.0,
"step": 72000
},
{
"epoch": 0.79,
"learning_rate": 0.00043039064593942646,
"loss": 0.0,
"step": 72500
},
{
"epoch": 0.8,
"learning_rate": 0.00042983226123470027,
"loss": 0.0,
"step": 73000
},
{
"epoch": 0.8,
"learning_rate": 0.00042927387652997407,
"loss": 0.0,
"step": 73500
},
{
"epoch": 0.81,
"learning_rate": 0.00042871549182524793,
"loss": 0.0,
"step": 74000
},
{
"epoch": 0.81,
"learning_rate": 0.00042815710712052174,
"loss": 0.0,
"step": 74500
},
{
"epoch": 0.82,
"learning_rate": 0.0004275987224157956,
"loss": 0.0,
"step": 75000
},
{
"epoch": 0.82,
"learning_rate": 0.0004270403377110694,
"loss": 0.0,
"step": 75500
},
{
"epoch": 0.83,
"learning_rate": 0.00042648195300634326,
"loss": 0.0,
"step": 76000
},
{
"epoch": 0.84,
"learning_rate": 0.0004259235683016171,
"loss": 0.0,
"step": 76500
},
{
"epoch": 0.84,
"learning_rate": 0.0004253651835968909,
"loss": 0.0,
"step": 77000
},
{
"epoch": 0.85,
"learning_rate": 0.0004248067988921648,
"loss": 0.0,
"step": 77500
},
{
"epoch": 0.85,
"learning_rate": 0.0004242484141874386,
"loss": 0.0,
"step": 78000
},
{
"epoch": 0.86,
"learning_rate": 0.0004236900294827124,
"loss": 0.0,
"step": 78500
},
{
"epoch": 0.86,
"learning_rate": 0.00042313164477798625,
"loss": 0.0,
"step": 79000
},
{
"epoch": 0.87,
"learning_rate": 0.00042257326007326005,
"loss": 0.0,
"step": 79500
},
{
"epoch": 0.87,
"learning_rate": 0.0004220148753685339,
"loss": 0.0,
"step": 80000
},
{
"epoch": 0.87,
"eval_accuracy": 2.7615362733125802e-05,
"eval_loss": NaN,
"eval_runtime": 1242.4281,
"eval_samples_per_second": 248.188,
"eval_steps_per_second": 3.879,
"step": 80000
},
{
"epoch": 0.88,
"learning_rate": 0.00042145649066380777,
"loss": 0.0,
"step": 80500
},
{
"epoch": 0.88,
"learning_rate": 0.0004208981059590816,
"loss": 0.0,
"step": 81000
},
{
"epoch": 0.89,
"learning_rate": 0.00042033972125435544,
"loss": 0.0,
"step": 81500
},
{
"epoch": 0.9,
"learning_rate": 0.00041978133654962924,
"loss": 0.0,
"step": 82000
},
{
"epoch": 0.9,
"learning_rate": 0.00041922295184490305,
"loss": 0.0,
"step": 82500
},
{
"epoch": 0.91,
"learning_rate": 0.0004186645671401769,
"loss": 0.0,
"step": 83000
},
{
"epoch": 0.91,
"learning_rate": 0.0004181061824354507,
"loss": 0.0,
"step": 83500
},
{
"epoch": 0.92,
"learning_rate": 0.00041754779773072457,
"loss": 0.0,
"step": 84000
},
{
"epoch": 0.92,
"learning_rate": 0.00041698941302599843,
"loss": 0.0,
"step": 84500
},
{
"epoch": 0.93,
"learning_rate": 0.00041643102832127223,
"loss": 0.0,
"step": 85000
},
{
"epoch": 0.93,
"learning_rate": 0.0004158726436165461,
"loss": 0.0,
"step": 85500
},
{
"epoch": 0.94,
"learning_rate": 0.0004153142589118199,
"loss": 0.0,
"step": 86000
},
{
"epoch": 0.94,
"learning_rate": 0.0004147558742070937,
"loss": 0.0,
"step": 86500
},
{
"epoch": 0.95,
"learning_rate": 0.00041419748950236756,
"loss": 0.0,
"step": 87000
},
{
"epoch": 0.96,
"learning_rate": 0.00041363910479764136,
"loss": 0.0,
"step": 87500
},
{
"epoch": 0.96,
"learning_rate": 0.0004130807200929152,
"loss": 0.0,
"step": 88000
},
{
"epoch": 0.97,
"learning_rate": 0.0004125223353881891,
"loss": 0.0,
"step": 88500
},
{
"epoch": 0.97,
"learning_rate": 0.0004119639506834629,
"loss": 0.0,
"step": 89000
},
{
"epoch": 0.98,
"learning_rate": 0.00041140556597873675,
"loss": 0.0,
"step": 89500
},
{
"epoch": 0.98,
"learning_rate": 0.00041084718127401055,
"loss": 0.0,
"step": 90000
},
{
"epoch": 0.98,
"eval_accuracy": 2.9214303322906894e-05,
"eval_loss": NaN,
"eval_runtime": 1238.8828,
"eval_samples_per_second": 248.898,
"eval_steps_per_second": 3.89,
"step": 90000
},
{
"epoch": 0.99,
"learning_rate": 0.00041028879656928436,
"loss": 0.0,
"step": 90500
},
{
"epoch": 0.99,
"learning_rate": 0.0004097304118645582,
"loss": 0.0,
"step": 91000
},
{
"epoch": 1.0,
"learning_rate": 0.000409172027159832,
"loss": 0.0,
"step": 91500
},
{
"epoch": 1.0,
"learning_rate": 0.0004086136424551059,
"loss": 0.0,
"step": 92000
},
{
"epoch": 1.01,
"learning_rate": 0.00040805525775037974,
"loss": 0.0,
"step": 92500
},
{
"epoch": 1.02,
"learning_rate": 0.00040749687304565354,
"loss": 0.0,
"step": 93000
},
{
"epoch": 1.02,
"learning_rate": 0.0004069384883409274,
"loss": 0.0,
"step": 93500
},
{
"epoch": 1.03,
"learning_rate": 0.0004063801036362012,
"loss": 0.0,
"step": 94000
},
{
"epoch": 1.03,
"learning_rate": 0.000405821718931475,
"loss": 0.0,
"step": 94500
},
{
"epoch": 1.04,
"learning_rate": 0.00040526333422674887,
"loss": 0.0,
"step": 95000
},
{
"epoch": 1.04,
"learning_rate": 0.0004047049495220227,
"loss": 0.0,
"step": 95500
},
{
"epoch": 1.05,
"learning_rate": 0.00040414656481729653,
"loss": 0.0,
"step": 96000
},
{
"epoch": 1.05,
"learning_rate": 0.0004035881801125704,
"loss": 0.0,
"step": 96500
},
{
"epoch": 1.06,
"learning_rate": 0.0004030297954078442,
"loss": 0.0,
"step": 97000
},
{
"epoch": 1.07,
"learning_rate": 0.00040247141070311806,
"loss": 0.0,
"step": 97500
},
{
"epoch": 1.07,
"learning_rate": 0.00040191302599839186,
"loss": 0.0,
"step": 98000
},
{
"epoch": 1.08,
"learning_rate": 0.00040135464129366567,
"loss": 0.0,
"step": 98500
},
{
"epoch": 1.08,
"learning_rate": 0.0004007962565889395,
"loss": 0.0,
"step": 99000
},
{
"epoch": 1.09,
"learning_rate": 0.00040023787188421333,
"loss": 0.0,
"step": 99500
},
{
"epoch": 1.09,
"learning_rate": 0.0003996794871794872,
"loss": 0.0,
"step": 100000
},
{
"epoch": 1.09,
"eval_accuracy": 3.071726548590269e-05,
"eval_loss": NaN,
"eval_runtime": 1240.4665,
"eval_samples_per_second": 248.581,
"eval_steps_per_second": 3.885,
"step": 100000
},
{
"epoch": 1.1,
"learning_rate": 0.00039912110247476105,
"loss": 0.0,
"step": 100500
},
{
"epoch": 1.1,
"learning_rate": 0.00039856271777003485,
"loss": 0.0,
"step": 101000
},
{
"epoch": 1.11,
"learning_rate": 0.0003980043330653087,
"loss": 0.0,
"step": 101500
},
{
"epoch": 1.11,
"learning_rate": 0.0003974459483605825,
"loss": 0.0,
"step": 102000
},
{
"epoch": 1.12,
"learning_rate": 0.0003968875636558563,
"loss": 0.0,
"step": 102500
},
{
"epoch": 1.13,
"learning_rate": 0.0003963291789511302,
"loss": 0.0,
"step": 103000
},
{
"epoch": 1.13,
"learning_rate": 0.000395770794246404,
"loss": 0.0,
"step": 103500
},
{
"epoch": 1.14,
"learning_rate": 0.00039521240954167784,
"loss": 0.0,
"step": 104000
},
{
"epoch": 1.14,
"learning_rate": 0.0003946540248369517,
"loss": 0.0,
"step": 104500
},
{
"epoch": 1.15,
"learning_rate": 0.0003940956401322255,
"loss": 0.0,
"step": 105000
},
{
"epoch": 1.15,
"learning_rate": 0.00039353725542749937,
"loss": 0.0,
"step": 105500
},
{
"epoch": 1.16,
"learning_rate": 0.00039297887072277317,
"loss": 0.0,
"step": 106000
},
{
"epoch": 1.16,
"learning_rate": 0.00039242048601804703,
"loss": 0.0,
"step": 106500
},
{
"epoch": 1.17,
"learning_rate": 0.00039186210131332083,
"loss": 0.0,
"step": 107000
},
{
"epoch": 1.17,
"learning_rate": 0.00039130371660859464,
"loss": 0.0,
"step": 107500
},
{
"epoch": 1.18,
"learning_rate": 0.0003907453319038685,
"loss": 0.0,
"step": 108000
},
{
"epoch": 1.19,
"learning_rate": 0.0003901869471991423,
"loss": 0.0,
"step": 108500
},
{
"epoch": 1.19,
"learning_rate": 0.00038962856249441616,
"loss": 0.0,
"step": 109000
},
{
"epoch": 1.2,
"learning_rate": 0.00038907017778969,
"loss": 0.0,
"step": 109500
},
{
"epoch": 1.2,
"learning_rate": 0.0003885117930849638,
"loss": 0.0,
"step": 110000
},
{
"epoch": 1.2,
"eval_accuracy": 3.188648588911819e-05,
"eval_loss": NaN,
"eval_runtime": 1241.2976,
"eval_samples_per_second": 248.414,
"eval_steps_per_second": 3.882,
"step": 110000
},
{
"epoch": 1.21,
"learning_rate": 0.0003879534083802377,
"loss": 0.0,
"step": 110500
},
{
"epoch": 1.21,
"learning_rate": 0.0003873950236755115,
"loss": 0.0,
"step": 111000
},
{
"epoch": 1.22,
"learning_rate": 0.0003868366389707853,
"loss": 0.0,
"step": 111500
},
{
"epoch": 1.22,
"learning_rate": 0.00038627825426605915,
"loss": 0.0,
"step": 112000
},
{
"epoch": 1.23,
"learning_rate": 0.00038571986956133296,
"loss": 0.0,
"step": 112500
},
{
"epoch": 1.23,
"learning_rate": 0.0003851614848566068,
"loss": 0.0,
"step": 113000
},
{
"epoch": 1.24,
"learning_rate": 0.0003846031001518807,
"loss": 0.0,
"step": 113500
},
{
"epoch": 1.25,
"learning_rate": 0.0003840447154471545,
"loss": 0.0,
"step": 114000
},
{
"epoch": 1.25,
"learning_rate": 0.00038348633074242834,
"loss": 0.0,
"step": 114500
},
{
"epoch": 1.26,
"learning_rate": 0.00038292794603770214,
"loss": 0.0,
"step": 115000
},
{
"epoch": 1.26,
"learning_rate": 0.00038236956133297595,
"loss": 0.0,
"step": 115500
},
{
"epoch": 1.27,
"learning_rate": 0.0003818111766282498,
"loss": 0.0,
"step": 116000
},
{
"epoch": 1.27,
"learning_rate": 0.0003812527919235236,
"loss": 0.0,
"step": 116500
},
{
"epoch": 1.28,
"learning_rate": 0.00038069440721879747,
"loss": 0.0,
"step": 117000
},
{
"epoch": 1.28,
"learning_rate": 0.00038013602251407133,
"loss": 0.0,
"step": 117500
},
{
"epoch": 1.29,
"learning_rate": 0.00037957763780934514,
"loss": 0.0,
"step": 118000
},
{
"epoch": 1.29,
"learning_rate": 0.000379019253104619,
"loss": 0.0,
"step": 118500
},
{
"epoch": 1.3,
"learning_rate": 0.0003784608683998928,
"loss": 0.0,
"step": 119000
},
{
"epoch": 1.31,
"learning_rate": 0.0003779024836951666,
"loss": 0.0,
"step": 119500
},
{
"epoch": 1.31,
"learning_rate": 0.00037734409899044046,
"loss": 0.0,
"step": 120000
},
{
"epoch": 1.31,
"eval_accuracy": 2.9570698381633475e-05,
"eval_loss": NaN,
"eval_runtime": 1238.6519,
"eval_samples_per_second": 248.945,
"eval_steps_per_second": 3.891,
"step": 120000
},
{
"epoch": 1.32,
"learning_rate": 0.00037678571428571427,
"loss": 0.0,
"step": 120500
},
{
"epoch": 1.32,
"learning_rate": 0.00037622732958098807,
"loss": 0.0,
"step": 121000
},
{
"epoch": 1.33,
"learning_rate": 0.000375668944876262,
"loss": 0.0,
"step": 121500
},
{
"epoch": 1.33,
"learning_rate": 0.0003751105601715358,
"loss": 0.0,
"step": 122000
},
{
"epoch": 1.34,
"learning_rate": 0.00037455217546680965,
"loss": 0.0,
"step": 122500
},
{
"epoch": 1.34,
"learning_rate": 0.00037399379076208345,
"loss": 0.0,
"step": 123000
},
{
"epoch": 1.35,
"learning_rate": 0.00037343540605735726,
"loss": 0.0,
"step": 123500
},
{
"epoch": 1.35,
"learning_rate": 0.0003728770213526311,
"loss": 0.0,
"step": 124000
},
{
"epoch": 1.36,
"learning_rate": 0.0003723186366479049,
"loss": 0.0,
"step": 124500
},
{
"epoch": 1.37,
"learning_rate": 0.0003717602519431788,
"loss": 0.0,
"step": 125000
},
{
"epoch": 1.37,
"learning_rate": 0.00037120186723845264,
"loss": 0.0,
"step": 125500
},
{
"epoch": 1.38,
"learning_rate": 0.00037064348253372644,
"loss": 0.0,
"step": 126000
},
{
"epoch": 1.38,
"learning_rate": 0.0003700850978290003,
"loss": 0.0,
"step": 126500
},
{
"epoch": 1.39,
"learning_rate": 0.0003695267131242741,
"loss": 0.0,
"step": 127000
},
{
"epoch": 1.39,
"learning_rate": 0.0003689683284195479,
"loss": 0.0,
"step": 127500
},
{
"epoch": 1.4,
"learning_rate": 0.00036840994371482177,
"loss": 0.0,
"step": 128000
},
{
"epoch": 1.4,
"learning_rate": 0.0003678515590100956,
"loss": 0.0,
"step": 128500
},
{
"epoch": 1.41,
"learning_rate": 0.00036729317430536944,
"loss": 0.0,
"step": 129000
},
{
"epoch": 1.41,
"learning_rate": 0.0003667347896006433,
"loss": 0.0,
"step": 129500
},
{
"epoch": 1.42,
"learning_rate": 0.0003661764048959171,
"loss": 0.0,
"step": 130000
},
{
"epoch": 1.42,
"eval_accuracy": 3.2269763129715425e-05,
"eval_loss": NaN,
"eval_runtime": 1246.6351,
"eval_samples_per_second": 247.351,
"eval_steps_per_second": 3.866,
"step": 130000
},
{
"epoch": 1.43,
"learning_rate": 0.00036561802019119096,
"loss": 0.0,
"step": 130500
},
{
"epoch": 1.43,
"learning_rate": 0.00036505963548646476,
"loss": 0.0,
"step": 131000
},
{
"epoch": 1.44,
"learning_rate": 0.00036450125078173857,
"loss": 0.0,
"step": 131500
},
{
"epoch": 1.44,
"learning_rate": 0.0003639428660770124,
"loss": 0.0,
"step": 132000
},
{
"epoch": 1.45,
"learning_rate": 0.00036338448137228623,
"loss": 0.0,
"step": 132500
},
{
"epoch": 1.45,
"learning_rate": 0.0003628260966675601,
"loss": 0.0,
"step": 133000
},
{
"epoch": 1.46,
"learning_rate": 0.00036226771196283395,
"loss": 0.0,
"step": 133500
},
{
"epoch": 1.46,
"learning_rate": 0.00036170932725810775,
"loss": 0.0,
"step": 134000
},
{
"epoch": 1.47,
"learning_rate": 0.0003611509425533816,
"loss": 0.0,
"step": 134500
},
{
"epoch": 1.47,
"learning_rate": 0.0003605925578486554,
"loss": 0.0,
"step": 135000
},
{
"epoch": 1.48,
"learning_rate": 0.0003600341731439292,
"loss": 0.0,
"step": 135500
},
{
"epoch": 1.49,
"learning_rate": 0.0003594757884392031,
"loss": 0.0,
"step": 136000
},
{
"epoch": 1.49,
"learning_rate": 0.0003589174037344769,
"loss": 0.0,
"step": 136500
},
{
"epoch": 1.5,
"learning_rate": 0.00035835901902975075,
"loss": 0.0,
"step": 137000
},
{
"epoch": 1.5,
"learning_rate": 0.00035780063432502455,
"loss": 0.0,
"step": 137500
},
{
"epoch": 1.51,
"learning_rate": 0.0003572422496202984,
"loss": 0.0,
"step": 138000
},
{
"epoch": 1.51,
"learning_rate": 0.00035668386491557227,
"loss": 0.0,
"step": 138500
},
{
"epoch": 1.52,
"learning_rate": 0.0003561254802108461,
"loss": 0.0,
"step": 139000
},
{
"epoch": 1.52,
"learning_rate": 0.00035556709550611993,
"loss": 0.0,
"step": 139500
},
{
"epoch": 1.53,
"learning_rate": 0.00035500871080139374,
"loss": 0.0,
"step": 140000
},
{
"epoch": 1.53,
"eval_accuracy": 3.290421786718595e-05,
"eval_loss": NaN,
"eval_runtime": 1247.689,
"eval_samples_per_second": 247.142,
"eval_steps_per_second": 3.862,
"step": 140000
},
{
"epoch": 1.53,
"step": 140001,
"total_flos": 2.560247267189588e+18,
"train_loss": 0.3459514281929236,
"train_runtime": 108836.6872,
"train_samples_per_second": 269.154,
"train_steps_per_second": 4.206
}
],
"max_steps": 457720,
"num_train_epochs": 5,
"total_flos": 2.560247267189588e+18,
"trial_name": null,
"trial_params": null
}