{ "best_metric": NaN, "best_model_checkpoint": "add_bert_12_layer_model_complete_training_new/checkpoint-10000", "epoch": 1.5293301581753038, "global_step": 140001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.5e-05, "loss": 8.2896, "step": 500 }, { "epoch": 0.01, "learning_rate": 5e-05, "loss": 6.8372, "step": 1000 }, { "epoch": 0.02, "learning_rate": 7.5e-05, "loss": 6.683, "step": 1500 }, { "epoch": 0.02, "learning_rate": 0.0001, "loss": 6.5955, "step": 2000 }, { "epoch": 0.03, "learning_rate": 0.000125, "loss": 6.5373, "step": 2500 }, { "epoch": 0.03, "learning_rate": 0.00015, "loss": 6.4804, "step": 3000 }, { "epoch": 0.04, "learning_rate": 0.000175, "loss": 6.4463, "step": 3500 }, { "epoch": 0.04, "learning_rate": 0.0002, "loss": 6.408, "step": 4000 }, { "epoch": 0.05, "learning_rate": 0.00022500000000000002, "loss": 6.3827, "step": 4500 }, { "epoch": 0.05, "learning_rate": 0.00025, "loss": 6.3731, "step": 5000 }, { "epoch": 0.06, "learning_rate": 0.000275, "loss": 6.3479, "step": 5500 }, { "epoch": 0.07, "learning_rate": 0.0003, "loss": 6.3401, "step": 6000 }, { "epoch": 0.07, "learning_rate": 0.0003239, "loss": 7.0795, "step": 6500 }, { "epoch": 0.08, "learning_rate": 0.00034155000000000003, "loss": 10.0665, "step": 7000 }, { "epoch": 0.08, "learning_rate": 0.00036655, "loss": 0.0, "step": 7500 }, { "epoch": 0.09, "learning_rate": 0.00039155, "loss": 0.0, "step": 8000 }, { "epoch": 0.09, "learning_rate": 0.00041654999999999996, "loss": 0.0, "step": 8500 }, { "epoch": 0.1, "learning_rate": 0.00044155, "loss": 0.0, "step": 9000 }, { "epoch": 0.1, "learning_rate": 0.00046655000000000004, "loss": 0.0, "step": 9500 }, { "epoch": 0.11, "learning_rate": 0.00049155, "loss": 0.0, "step": 10000 }, { "epoch": 0.11, "eval_accuracy": 3.0592783198016594e-05, "eval_loss": NaN, "eval_runtime": 1245.7969, "eval_samples_per_second": 247.517, "eval_steps_per_second": 3.868, "step": 10000 }, { "epoch": 0.11, "learning_rate": 0.0004996303493254713, "loss": 0.0, "step": 10500 }, { "epoch": 0.12, "learning_rate": 0.0004990719646207451, "loss": 0.0, "step": 11000 }, { "epoch": 0.13, "learning_rate": 0.000498513579916019, "loss": 0.0, "step": 11500 }, { "epoch": 0.13, "learning_rate": 0.0004979551952112928, "loss": 0.0, "step": 12000 }, { "epoch": 0.14, "learning_rate": 0.0004973968105065666, "loss": 0.0, "step": 12500 }, { "epoch": 0.14, "learning_rate": 0.0004968384258018405, "loss": 0.0, "step": 13000 }, { "epoch": 0.15, "learning_rate": 0.0004962800410971143, "loss": 0.0, "step": 13500 }, { "epoch": 0.15, "learning_rate": 0.0004957216563923881, "loss": 0.0, "step": 14000 }, { "epoch": 0.16, "learning_rate": 0.0004951632716876619, "loss": 0.0, "step": 14500 }, { "epoch": 0.16, "learning_rate": 0.0004946048869829357, "loss": 0.0, "step": 15000 }, { "epoch": 0.17, "learning_rate": 0.0004940465022782096, "loss": 0.0, "step": 15500 }, { "epoch": 0.17, "learning_rate": 0.0004934881175734834, "loss": 0.0, "step": 16000 }, { "epoch": 0.18, "learning_rate": 0.0004929297328687572, "loss": 0.0, "step": 16500 }, { "epoch": 0.19, "learning_rate": 0.0004923713481640312, "loss": 0.0, "step": 17000 }, { "epoch": 0.19, "learning_rate": 0.000491812963459305, "loss": 0.0, "step": 17500 }, { "epoch": 0.2, "learning_rate": 0.0004912545787545788, "loss": 0.0, "step": 18000 }, { "epoch": 0.2, "learning_rate": 0.0004906961940498526, "loss": 0.0, "step": 18500 }, { "epoch": 0.21, "learning_rate": 0.0004901378093451264, "loss": 0.0, "step": 19000 }, { "epoch": 0.21, "learning_rate": 0.0004895794246404003, "loss": 0.0, "step": 19500 }, { "epoch": 0.22, "learning_rate": 0.0004890210399356741, "loss": 0.0, "step": 20000 }, { "epoch": 0.22, "eval_accuracy": 3.5159707526351004e-05, "eval_loss": NaN, "eval_runtime": 1244.1268, "eval_samples_per_second": 247.849, "eval_steps_per_second": 3.873, "step": 20000 }, { "epoch": 0.22, "learning_rate": 0.0004884626552309479, "loss": 0.0, "step": 20500 }, { "epoch": 0.23, "learning_rate": 0.00048790427052622175, "loss": 0.0, "step": 21000 }, { "epoch": 0.23, "learning_rate": 0.00048734588582149556, "loss": 0.0, "step": 21500 }, { "epoch": 0.24, "learning_rate": 0.0004867875011167694, "loss": 0.0, "step": 22000 }, { "epoch": 0.25, "learning_rate": 0.0004862291164120433, "loss": 0.0, "step": 22500 }, { "epoch": 0.25, "learning_rate": 0.0004856707317073171, "loss": 0.0, "step": 23000 }, { "epoch": 0.26, "learning_rate": 0.00048511234700259094, "loss": 0.0, "step": 23500 }, { "epoch": 0.26, "learning_rate": 0.00048455396229786475, "loss": 0.0, "step": 24000 }, { "epoch": 0.27, "learning_rate": 0.00048399557759313855, "loss": 0.0, "step": 24500 }, { "epoch": 0.27, "learning_rate": 0.0004834371928884124, "loss": 0.0, "step": 25000 }, { "epoch": 0.28, "learning_rate": 0.0004828788081836862, "loss": 0.0, "step": 25500 }, { "epoch": 0.28, "learning_rate": 0.0004823204234789601, "loss": 0.0, "step": 26000 }, { "epoch": 0.29, "learning_rate": 0.00048176203877423393, "loss": 0.0, "step": 26500 }, { "epoch": 0.29, "learning_rate": 0.00048120365406950774, "loss": 0.0, "step": 27000 }, { "epoch": 0.3, "learning_rate": 0.0004806452693647816, "loss": 0.0, "step": 27500 }, { "epoch": 0.31, "learning_rate": 0.0004800868846600554, "loss": 0.0, "step": 28000 }, { "epoch": 0.31, "learning_rate": 0.0004795284999553292, "loss": 0.0, "step": 28500 }, { "epoch": 0.32, "learning_rate": 0.00047897011525060306, "loss": 0.0, "step": 29000 }, { "epoch": 0.32, "learning_rate": 0.00047841173054587687, "loss": 0.0, "step": 29500 }, { "epoch": 0.33, "learning_rate": 0.0004778533458411508, "loss": 0.0, "step": 30000 }, { "epoch": 0.33, "eval_accuracy": 3.357992413338889e-05, "eval_loss": NaN, "eval_runtime": 1245.2057, "eval_samples_per_second": 247.635, "eval_steps_per_second": 3.87, "step": 30000 }, { "epoch": 0.33, "learning_rate": 0.0004772949611364246, "loss": 0.0, "step": 30500 }, { "epoch": 0.34, "learning_rate": 0.0004767365764316984, "loss": 0.0, "step": 31000 }, { "epoch": 0.34, "learning_rate": 0.00047617819172697225, "loss": 0.0, "step": 31500 }, { "epoch": 0.35, "learning_rate": 0.00047561980702224606, "loss": 0.0, "step": 32000 }, { "epoch": 0.36, "learning_rate": 0.00047506142231751986, "loss": 0.0, "step": 32500 }, { "epoch": 0.36, "learning_rate": 0.0004745030376127937, "loss": 0.0, "step": 33000 }, { "epoch": 0.37, "learning_rate": 0.0004739446529080675, "loss": 0.0, "step": 33500 }, { "epoch": 0.37, "learning_rate": 0.0004733862682033414, "loss": 0.0, "step": 34000 }, { "epoch": 0.38, "learning_rate": 0.00047282788349861524, "loss": 0.0, "step": 34500 }, { "epoch": 0.38, "learning_rate": 0.00047226949879388905, "loss": 0.0, "step": 35000 }, { "epoch": 0.39, "learning_rate": 0.0004717111140891629, "loss": 0.0, "step": 35500 }, { "epoch": 0.39, "learning_rate": 0.0004711527293844367, "loss": 0.0, "step": 36000 }, { "epoch": 0.4, "learning_rate": 0.0004705943446797105, "loss": 0.0, "step": 36500 }, { "epoch": 0.4, "learning_rate": 0.0004700359599749844, "loss": 0.0, "step": 37000 }, { "epoch": 0.41, "learning_rate": 0.0004694775752702582, "loss": 0.0, "step": 37500 }, { "epoch": 0.42, "learning_rate": 0.00046891919056553204, "loss": 0.0, "step": 38000 }, { "epoch": 0.42, "learning_rate": 0.0004683608058608059, "loss": 0.0, "step": 38500 }, { "epoch": 0.43, "learning_rate": 0.0004678024211560797, "loss": 0.0, "step": 39000 }, { "epoch": 0.43, "learning_rate": 0.00046724403645135356, "loss": 0.0, "step": 39500 }, { "epoch": 0.44, "learning_rate": 0.00046668565174662736, "loss": 0.0, "step": 40000 }, { "epoch": 0.44, "eval_accuracy": 3.1386399970587474e-05, "eval_loss": NaN, "eval_runtime": 1244.2525, "eval_samples_per_second": 247.824, "eval_steps_per_second": 3.873, "step": 40000 }, { "epoch": 0.44, "learning_rate": 0.00046612726704190117, "loss": 0.0, "step": 40500 }, { "epoch": 0.45, "learning_rate": 0.00046556888233717503, "loss": 0.0, "step": 41000 }, { "epoch": 0.45, "learning_rate": 0.00046501049763244883, "loss": 0.0, "step": 41500 }, { "epoch": 0.46, "learning_rate": 0.0004644521129277227, "loss": 0.0, "step": 42000 }, { "epoch": 0.46, "learning_rate": 0.00046389372822299655, "loss": 0.0, "step": 42500 }, { "epoch": 0.47, "learning_rate": 0.00046333534351827036, "loss": 0.0, "step": 43000 }, { "epoch": 0.48, "learning_rate": 0.0004627769588135442, "loss": 0.0, "step": 43500 }, { "epoch": 0.48, "learning_rate": 0.000462218574108818, "loss": 0.0, "step": 44000 }, { "epoch": 0.49, "learning_rate": 0.0004616601894040919, "loss": 0.0, "step": 44500 }, { "epoch": 0.49, "learning_rate": 0.0004611018046993657, "loss": 0.0, "step": 45000 }, { "epoch": 0.5, "learning_rate": 0.0004605434199946395, "loss": 0.0, "step": 45500 }, { "epoch": 0.5, "learning_rate": 0.00045998503528991335, "loss": 0.0, "step": 46000 }, { "epoch": 0.51, "learning_rate": 0.00045942665058518715, "loss": 0.0, "step": 46500 }, { "epoch": 0.51, "learning_rate": 0.000458868265880461, "loss": 0.0, "step": 47000 }, { "epoch": 0.52, "learning_rate": 0.00045830988117573487, "loss": 0.0, "step": 47500 }, { "epoch": 0.52, "learning_rate": 0.0004577514964710087, "loss": 0.0, "step": 48000 }, { "epoch": 0.53, "learning_rate": 0.00045719311176628253, "loss": 0.0, "step": 48500 }, { "epoch": 0.54, "learning_rate": 0.00045663472706155634, "loss": 0.0, "step": 49000 }, { "epoch": 0.54, "learning_rate": 0.00045607634235683014, "loss": 0.0, "step": 49500 }, { "epoch": 0.55, "learning_rate": 0.000455517957652104, "loss": 0.0, "step": 50000 }, { "epoch": 0.55, "eval_accuracy": 2.9965971727327976e-05, "eval_loss": NaN, "eval_runtime": 1242.0864, "eval_samples_per_second": 248.256, "eval_steps_per_second": 3.88, "step": 50000 }, { "epoch": 0.55, "learning_rate": 0.0004549595729473778, "loss": 0.0, "step": 50500 }, { "epoch": 0.56, "learning_rate": 0.00045440118824265167, "loss": 0.0, "step": 51000 }, { "epoch": 0.56, "learning_rate": 0.0004538428035379255, "loss": 0.0, "step": 51500 }, { "epoch": 0.57, "learning_rate": 0.00045328441883319933, "loss": 0.0, "step": 52000 }, { "epoch": 0.57, "learning_rate": 0.0004527260341284732, "loss": 0.0, "step": 52500 }, { "epoch": 0.58, "learning_rate": 0.000452167649423747, "loss": 0.0, "step": 53000 }, { "epoch": 0.58, "learning_rate": 0.0004516092647190208, "loss": 0.0, "step": 53500 }, { "epoch": 0.59, "learning_rate": 0.00045105088001429466, "loss": 0.0, "step": 54000 }, { "epoch": 0.6, "learning_rate": 0.00045049249530956846, "loss": 0.0, "step": 54500 }, { "epoch": 0.6, "learning_rate": 0.0004499341106048423, "loss": 0.0, "step": 55000 }, { "epoch": 0.61, "learning_rate": 0.0004493757259001162, "loss": 0.0, "step": 55500 }, { "epoch": 0.61, "learning_rate": 0.00044881734119539, "loss": 0.0, "step": 56000 }, { "epoch": 0.62, "learning_rate": 0.00044825895649066384, "loss": 0.0, "step": 56500 }, { "epoch": 0.62, "learning_rate": 0.00044770057178593765, "loss": 0.0, "step": 57000 }, { "epoch": 0.63, "learning_rate": 0.00044714218708121145, "loss": 0.0, "step": 57500 }, { "epoch": 0.63, "learning_rate": 0.0004465838023764853, "loss": 0.0, "step": 58000 }, { "epoch": 0.64, "learning_rate": 0.0004460254176717591, "loss": 0.0, "step": 58500 }, { "epoch": 0.64, "learning_rate": 0.00044546703296703303, "loss": 0.0, "step": 59000 }, { "epoch": 0.65, "learning_rate": 0.00044490864826230683, "loss": 0.0, "step": 59500 }, { "epoch": 0.66, "learning_rate": 0.00044435026355758064, "loss": 0.0, "step": 60000 }, { "epoch": 0.66, "eval_accuracy": 3.177880317382685e-05, "eval_loss": NaN, "eval_runtime": 1238.4379, "eval_samples_per_second": 248.988, "eval_steps_per_second": 3.891, "step": 60000 }, { "epoch": 0.66, "learning_rate": 0.0004437918788528545, "loss": 0.0, "step": 60500 }, { "epoch": 0.67, "learning_rate": 0.0004432334941481283, "loss": 0.0, "step": 61000 }, { "epoch": 0.67, "learning_rate": 0.0004426751094434021, "loss": 0.0, "step": 61500 }, { "epoch": 0.68, "learning_rate": 0.00044211672473867597, "loss": 0.0, "step": 62000 }, { "epoch": 0.68, "learning_rate": 0.00044155834003394977, "loss": 0.0, "step": 62500 }, { "epoch": 0.69, "learning_rate": 0.00044099995532922363, "loss": 0.0, "step": 63000 }, { "epoch": 0.69, "learning_rate": 0.0004404415706244975, "loss": 0.0, "step": 63500 }, { "epoch": 0.7, "learning_rate": 0.0004398831859197713, "loss": 0.0, "step": 64000 }, { "epoch": 0.7, "learning_rate": 0.00043932480121504515, "loss": 0.0, "step": 64500 }, { "epoch": 0.71, "learning_rate": 0.00043876641651031896, "loss": 0.0, "step": 65000 }, { "epoch": 0.72, "learning_rate": 0.00043820803180559276, "loss": 0.0, "step": 65500 }, { "epoch": 0.72, "learning_rate": 0.0004376496471008666, "loss": 0.0, "step": 66000 }, { "epoch": 0.73, "learning_rate": 0.0004370912623961404, "loss": 0.0, "step": 66500 }, { "epoch": 0.73, "learning_rate": 0.0004365328776914143, "loss": 0.0, "step": 67000 }, { "epoch": 0.74, "learning_rate": 0.00043597449298668814, "loss": 0.0, "step": 67500 }, { "epoch": 0.74, "learning_rate": 0.00043541610828196195, "loss": 0.0, "step": 68000 }, { "epoch": 0.75, "learning_rate": 0.0004348577235772358, "loss": 0.0, "step": 68500 }, { "epoch": 0.75, "learning_rate": 0.0004342993388725096, "loss": 0.0, "step": 69000 }, { "epoch": 0.76, "learning_rate": 0.0004337409541677834, "loss": 0.0, "step": 69500 }, { "epoch": 0.76, "learning_rate": 0.0004331825694630573, "loss": 0.0, "step": 70000 }, { "epoch": 0.76, "eval_accuracy": 3.253472148144988e-05, "eval_loss": NaN, "eval_runtime": 1240.7983, "eval_samples_per_second": 248.514, "eval_steps_per_second": 3.884, "step": 70000 }, { "epoch": 0.77, "learning_rate": 0.0004326241847583311, "loss": 0.0, "step": 70500 }, { "epoch": 0.78, "learning_rate": 0.00043206580005360494, "loss": 0.0, "step": 71000 }, { "epoch": 0.78, "learning_rate": 0.0004315074153488788, "loss": 0.0, "step": 71500 }, { "epoch": 0.79, "learning_rate": 0.0004309490306441526, "loss": 0.0, "step": 72000 }, { "epoch": 0.79, "learning_rate": 0.00043039064593942646, "loss": 0.0, "step": 72500 }, { "epoch": 0.8, "learning_rate": 0.00042983226123470027, "loss": 0.0, "step": 73000 }, { "epoch": 0.8, "learning_rate": 0.00042927387652997407, "loss": 0.0, "step": 73500 }, { "epoch": 0.81, "learning_rate": 0.00042871549182524793, "loss": 0.0, "step": 74000 }, { "epoch": 0.81, "learning_rate": 0.00042815710712052174, "loss": 0.0, "step": 74500 }, { "epoch": 0.82, "learning_rate": 0.0004275987224157956, "loss": 0.0, "step": 75000 }, { "epoch": 0.82, "learning_rate": 0.0004270403377110694, "loss": 0.0, "step": 75500 }, { "epoch": 0.83, "learning_rate": 0.00042648195300634326, "loss": 0.0, "step": 76000 }, { "epoch": 0.84, "learning_rate": 0.0004259235683016171, "loss": 0.0, "step": 76500 }, { "epoch": 0.84, "learning_rate": 0.0004253651835968909, "loss": 0.0, "step": 77000 }, { "epoch": 0.85, "learning_rate": 0.0004248067988921648, "loss": 0.0, "step": 77500 }, { "epoch": 0.85, "learning_rate": 0.0004242484141874386, "loss": 0.0, "step": 78000 }, { "epoch": 0.86, "learning_rate": 0.0004236900294827124, "loss": 0.0, "step": 78500 }, { "epoch": 0.86, "learning_rate": 0.00042313164477798625, "loss": 0.0, "step": 79000 }, { "epoch": 0.87, "learning_rate": 0.00042257326007326005, "loss": 0.0, "step": 79500 }, { "epoch": 0.87, "learning_rate": 0.0004220148753685339, "loss": 0.0, "step": 80000 }, { "epoch": 0.87, "eval_accuracy": 2.7615362733125802e-05, "eval_loss": NaN, "eval_runtime": 1242.4281, "eval_samples_per_second": 248.188, "eval_steps_per_second": 3.879, "step": 80000 }, { "epoch": 0.88, "learning_rate": 0.00042145649066380777, "loss": 0.0, "step": 80500 }, { "epoch": 0.88, "learning_rate": 0.0004208981059590816, "loss": 0.0, "step": 81000 }, { "epoch": 0.89, "learning_rate": 0.00042033972125435544, "loss": 0.0, "step": 81500 }, { "epoch": 0.9, "learning_rate": 0.00041978133654962924, "loss": 0.0, "step": 82000 }, { "epoch": 0.9, "learning_rate": 0.00041922295184490305, "loss": 0.0, "step": 82500 }, { "epoch": 0.91, "learning_rate": 0.0004186645671401769, "loss": 0.0, "step": 83000 }, { "epoch": 0.91, "learning_rate": 0.0004181061824354507, "loss": 0.0, "step": 83500 }, { "epoch": 0.92, "learning_rate": 0.00041754779773072457, "loss": 0.0, "step": 84000 }, { "epoch": 0.92, "learning_rate": 0.00041698941302599843, "loss": 0.0, "step": 84500 }, { "epoch": 0.93, "learning_rate": 0.00041643102832127223, "loss": 0.0, "step": 85000 }, { "epoch": 0.93, "learning_rate": 0.0004158726436165461, "loss": 0.0, "step": 85500 }, { "epoch": 0.94, "learning_rate": 0.0004153142589118199, "loss": 0.0, "step": 86000 }, { "epoch": 0.94, "learning_rate": 0.0004147558742070937, "loss": 0.0, "step": 86500 }, { "epoch": 0.95, "learning_rate": 0.00041419748950236756, "loss": 0.0, "step": 87000 }, { "epoch": 0.96, "learning_rate": 0.00041363910479764136, "loss": 0.0, "step": 87500 }, { "epoch": 0.96, "learning_rate": 0.0004130807200929152, "loss": 0.0, "step": 88000 }, { "epoch": 0.97, "learning_rate": 0.0004125223353881891, "loss": 0.0, "step": 88500 }, { "epoch": 0.97, "learning_rate": 0.0004119639506834629, "loss": 0.0, "step": 89000 }, { "epoch": 0.98, "learning_rate": 0.00041140556597873675, "loss": 0.0, "step": 89500 }, { "epoch": 0.98, "learning_rate": 0.00041084718127401055, "loss": 0.0, "step": 90000 }, { "epoch": 0.98, "eval_accuracy": 2.9214303322906894e-05, "eval_loss": NaN, "eval_runtime": 1238.8828, "eval_samples_per_second": 248.898, "eval_steps_per_second": 3.89, "step": 90000 }, { "epoch": 0.99, "learning_rate": 0.00041028879656928436, "loss": 0.0, "step": 90500 }, { "epoch": 0.99, "learning_rate": 0.0004097304118645582, "loss": 0.0, "step": 91000 }, { "epoch": 1.0, "learning_rate": 0.000409172027159832, "loss": 0.0, "step": 91500 }, { "epoch": 1.0, "learning_rate": 0.0004086136424551059, "loss": 0.0, "step": 92000 }, { "epoch": 1.01, "learning_rate": 0.00040805525775037974, "loss": 0.0, "step": 92500 }, { "epoch": 1.02, "learning_rate": 0.00040749687304565354, "loss": 0.0, "step": 93000 }, { "epoch": 1.02, "learning_rate": 0.0004069384883409274, "loss": 0.0, "step": 93500 }, { "epoch": 1.03, "learning_rate": 0.0004063801036362012, "loss": 0.0, "step": 94000 }, { "epoch": 1.03, "learning_rate": 0.000405821718931475, "loss": 0.0, "step": 94500 }, { "epoch": 1.04, "learning_rate": 0.00040526333422674887, "loss": 0.0, "step": 95000 }, { "epoch": 1.04, "learning_rate": 0.0004047049495220227, "loss": 0.0, "step": 95500 }, { "epoch": 1.05, "learning_rate": 0.00040414656481729653, "loss": 0.0, "step": 96000 }, { "epoch": 1.05, "learning_rate": 0.0004035881801125704, "loss": 0.0, "step": 96500 }, { "epoch": 1.06, "learning_rate": 0.0004030297954078442, "loss": 0.0, "step": 97000 }, { "epoch": 1.07, "learning_rate": 0.00040247141070311806, "loss": 0.0, "step": 97500 }, { "epoch": 1.07, "learning_rate": 0.00040191302599839186, "loss": 0.0, "step": 98000 }, { "epoch": 1.08, "learning_rate": 0.00040135464129366567, "loss": 0.0, "step": 98500 }, { "epoch": 1.08, "learning_rate": 0.0004007962565889395, "loss": 0.0, "step": 99000 }, { "epoch": 1.09, "learning_rate": 0.00040023787188421333, "loss": 0.0, "step": 99500 }, { "epoch": 1.09, "learning_rate": 0.0003996794871794872, "loss": 0.0, "step": 100000 }, { "epoch": 1.09, "eval_accuracy": 3.071726548590269e-05, "eval_loss": NaN, "eval_runtime": 1240.4665, "eval_samples_per_second": 248.581, "eval_steps_per_second": 3.885, "step": 100000 }, { "epoch": 1.1, "learning_rate": 0.00039912110247476105, "loss": 0.0, "step": 100500 }, { "epoch": 1.1, "learning_rate": 0.00039856271777003485, "loss": 0.0, "step": 101000 }, { "epoch": 1.11, "learning_rate": 0.0003980043330653087, "loss": 0.0, "step": 101500 }, { "epoch": 1.11, "learning_rate": 0.0003974459483605825, "loss": 0.0, "step": 102000 }, { "epoch": 1.12, "learning_rate": 0.0003968875636558563, "loss": 0.0, "step": 102500 }, { "epoch": 1.13, "learning_rate": 0.0003963291789511302, "loss": 0.0, "step": 103000 }, { "epoch": 1.13, "learning_rate": 0.000395770794246404, "loss": 0.0, "step": 103500 }, { "epoch": 1.14, "learning_rate": 0.00039521240954167784, "loss": 0.0, "step": 104000 }, { "epoch": 1.14, "learning_rate": 0.0003946540248369517, "loss": 0.0, "step": 104500 }, { "epoch": 1.15, "learning_rate": 0.0003940956401322255, "loss": 0.0, "step": 105000 }, { "epoch": 1.15, "learning_rate": 0.00039353725542749937, "loss": 0.0, "step": 105500 }, { "epoch": 1.16, "learning_rate": 0.00039297887072277317, "loss": 0.0, "step": 106000 }, { "epoch": 1.16, "learning_rate": 0.00039242048601804703, "loss": 0.0, "step": 106500 }, { "epoch": 1.17, "learning_rate": 0.00039186210131332083, "loss": 0.0, "step": 107000 }, { "epoch": 1.17, "learning_rate": 0.00039130371660859464, "loss": 0.0, "step": 107500 }, { "epoch": 1.18, "learning_rate": 0.0003907453319038685, "loss": 0.0, "step": 108000 }, { "epoch": 1.19, "learning_rate": 0.0003901869471991423, "loss": 0.0, "step": 108500 }, { "epoch": 1.19, "learning_rate": 0.00038962856249441616, "loss": 0.0, "step": 109000 }, { "epoch": 1.2, "learning_rate": 0.00038907017778969, "loss": 0.0, "step": 109500 }, { "epoch": 1.2, "learning_rate": 0.0003885117930849638, "loss": 0.0, "step": 110000 }, { "epoch": 1.2, "eval_accuracy": 3.188648588911819e-05, "eval_loss": NaN, "eval_runtime": 1241.2976, "eval_samples_per_second": 248.414, "eval_steps_per_second": 3.882, "step": 110000 }, { "epoch": 1.21, "learning_rate": 0.0003879534083802377, "loss": 0.0, "step": 110500 }, { "epoch": 1.21, "learning_rate": 0.0003873950236755115, "loss": 0.0, "step": 111000 }, { "epoch": 1.22, "learning_rate": 0.0003868366389707853, "loss": 0.0, "step": 111500 }, { "epoch": 1.22, "learning_rate": 0.00038627825426605915, "loss": 0.0, "step": 112000 }, { "epoch": 1.23, "learning_rate": 0.00038571986956133296, "loss": 0.0, "step": 112500 }, { "epoch": 1.23, "learning_rate": 0.0003851614848566068, "loss": 0.0, "step": 113000 }, { "epoch": 1.24, "learning_rate": 0.0003846031001518807, "loss": 0.0, "step": 113500 }, { "epoch": 1.25, "learning_rate": 0.0003840447154471545, "loss": 0.0, "step": 114000 }, { "epoch": 1.25, "learning_rate": 0.00038348633074242834, "loss": 0.0, "step": 114500 }, { "epoch": 1.26, "learning_rate": 0.00038292794603770214, "loss": 0.0, "step": 115000 }, { "epoch": 1.26, "learning_rate": 0.00038236956133297595, "loss": 0.0, "step": 115500 }, { "epoch": 1.27, "learning_rate": 0.0003818111766282498, "loss": 0.0, "step": 116000 }, { "epoch": 1.27, "learning_rate": 0.0003812527919235236, "loss": 0.0, "step": 116500 }, { "epoch": 1.28, "learning_rate": 0.00038069440721879747, "loss": 0.0, "step": 117000 }, { "epoch": 1.28, "learning_rate": 0.00038013602251407133, "loss": 0.0, "step": 117500 }, { "epoch": 1.29, "learning_rate": 0.00037957763780934514, "loss": 0.0, "step": 118000 }, { "epoch": 1.29, "learning_rate": 0.000379019253104619, "loss": 0.0, "step": 118500 }, { "epoch": 1.3, "learning_rate": 0.0003784608683998928, "loss": 0.0, "step": 119000 }, { "epoch": 1.31, "learning_rate": 0.0003779024836951666, "loss": 0.0, "step": 119500 }, { "epoch": 1.31, "learning_rate": 0.00037734409899044046, "loss": 0.0, "step": 120000 }, { "epoch": 1.31, "eval_accuracy": 2.9570698381633475e-05, "eval_loss": NaN, "eval_runtime": 1238.6519, "eval_samples_per_second": 248.945, "eval_steps_per_second": 3.891, "step": 120000 }, { "epoch": 1.32, "learning_rate": 0.00037678571428571427, "loss": 0.0, "step": 120500 }, { "epoch": 1.32, "learning_rate": 0.00037622732958098807, "loss": 0.0, "step": 121000 }, { "epoch": 1.33, "learning_rate": 0.000375668944876262, "loss": 0.0, "step": 121500 }, { "epoch": 1.33, "learning_rate": 0.0003751105601715358, "loss": 0.0, "step": 122000 }, { "epoch": 1.34, "learning_rate": 0.00037455217546680965, "loss": 0.0, "step": 122500 }, { "epoch": 1.34, "learning_rate": 0.00037399379076208345, "loss": 0.0, "step": 123000 }, { "epoch": 1.35, "learning_rate": 0.00037343540605735726, "loss": 0.0, "step": 123500 }, { "epoch": 1.35, "learning_rate": 0.0003728770213526311, "loss": 0.0, "step": 124000 }, { "epoch": 1.36, "learning_rate": 0.0003723186366479049, "loss": 0.0, "step": 124500 }, { "epoch": 1.37, "learning_rate": 0.0003717602519431788, "loss": 0.0, "step": 125000 }, { "epoch": 1.37, "learning_rate": 0.00037120186723845264, "loss": 0.0, "step": 125500 }, { "epoch": 1.38, "learning_rate": 0.00037064348253372644, "loss": 0.0, "step": 126000 }, { "epoch": 1.38, "learning_rate": 0.0003700850978290003, "loss": 0.0, "step": 126500 }, { "epoch": 1.39, "learning_rate": 0.0003695267131242741, "loss": 0.0, "step": 127000 }, { "epoch": 1.39, "learning_rate": 0.0003689683284195479, "loss": 0.0, "step": 127500 }, { "epoch": 1.4, "learning_rate": 0.00036840994371482177, "loss": 0.0, "step": 128000 }, { "epoch": 1.4, "learning_rate": 0.0003678515590100956, "loss": 0.0, "step": 128500 }, { "epoch": 1.41, "learning_rate": 0.00036729317430536944, "loss": 0.0, "step": 129000 }, { "epoch": 1.41, "learning_rate": 0.0003667347896006433, "loss": 0.0, "step": 129500 }, { "epoch": 1.42, "learning_rate": 0.0003661764048959171, "loss": 0.0, "step": 130000 }, { "epoch": 1.42, "eval_accuracy": 3.2269763129715425e-05, "eval_loss": NaN, "eval_runtime": 1246.6351, "eval_samples_per_second": 247.351, "eval_steps_per_second": 3.866, "step": 130000 }, { "epoch": 1.43, "learning_rate": 0.00036561802019119096, "loss": 0.0, "step": 130500 }, { "epoch": 1.43, "learning_rate": 0.00036505963548646476, "loss": 0.0, "step": 131000 }, { "epoch": 1.44, "learning_rate": 0.00036450125078173857, "loss": 0.0, "step": 131500 }, { "epoch": 1.44, "learning_rate": 0.0003639428660770124, "loss": 0.0, "step": 132000 }, { "epoch": 1.45, "learning_rate": 0.00036338448137228623, "loss": 0.0, "step": 132500 }, { "epoch": 1.45, "learning_rate": 0.0003628260966675601, "loss": 0.0, "step": 133000 }, { "epoch": 1.46, "learning_rate": 0.00036226771196283395, "loss": 0.0, "step": 133500 }, { "epoch": 1.46, "learning_rate": 0.00036170932725810775, "loss": 0.0, "step": 134000 }, { "epoch": 1.47, "learning_rate": 0.0003611509425533816, "loss": 0.0, "step": 134500 }, { "epoch": 1.47, "learning_rate": 0.0003605925578486554, "loss": 0.0, "step": 135000 }, { "epoch": 1.48, "learning_rate": 0.0003600341731439292, "loss": 0.0, "step": 135500 }, { "epoch": 1.49, "learning_rate": 0.0003594757884392031, "loss": 0.0, "step": 136000 }, { "epoch": 1.49, "learning_rate": 0.0003589174037344769, "loss": 0.0, "step": 136500 }, { "epoch": 1.5, "learning_rate": 0.00035835901902975075, "loss": 0.0, "step": 137000 }, { "epoch": 1.5, "learning_rate": 0.00035780063432502455, "loss": 0.0, "step": 137500 }, { "epoch": 1.51, "learning_rate": 0.0003572422496202984, "loss": 0.0, "step": 138000 }, { "epoch": 1.51, "learning_rate": 0.00035668386491557227, "loss": 0.0, "step": 138500 }, { "epoch": 1.52, "learning_rate": 0.0003561254802108461, "loss": 0.0, "step": 139000 }, { "epoch": 1.52, "learning_rate": 0.00035556709550611993, "loss": 0.0, "step": 139500 }, { "epoch": 1.53, "learning_rate": 0.00035500871080139374, "loss": 0.0, "step": 140000 }, { "epoch": 1.53, "eval_accuracy": 3.290421786718595e-05, "eval_loss": NaN, "eval_runtime": 1247.689, "eval_samples_per_second": 247.142, "eval_steps_per_second": 3.862, "step": 140000 }, { "epoch": 1.53, "step": 140001, "total_flos": 2.560247267189588e+18, "train_loss": 0.3459514281929236, "train_runtime": 108836.6872, "train_samples_per_second": 269.154, "train_steps_per_second": 4.206 } ], "max_steps": 457720, "num_train_epochs": 5, "total_flos": 2.560247267189588e+18, "trial_name": null, "trial_params": null }