{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 1920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 5.208333333333333e-08, "loss": 2.3378, "step": 1 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-07, "loss": 2.6783, "step": 5 }, { "epoch": 0.1, "learning_rate": 5.208333333333334e-07, "loss": 2.6965, "step": 10 }, { "epoch": 0.16, "learning_rate": 7.8125e-07, "loss": 2.6688, "step": 15 }, { "epoch": 0.21, "learning_rate": 1.0416666666666667e-06, "loss": 2.7272, "step": 20 }, { "epoch": 0.26, "learning_rate": 1.3020833333333335e-06, "loss": 2.6971, "step": 25 }, { "epoch": 0.31, "learning_rate": 1.5625e-06, "loss": 2.638, "step": 30 }, { "epoch": 0.36, "learning_rate": 1.8229166666666666e-06, "loss": 2.6253, "step": 35 }, { "epoch": 0.42, "learning_rate": 2.0833333333333334e-06, "loss": 2.6373, "step": 40 }, { "epoch": 0.47, "learning_rate": 2.3437500000000002e-06, "loss": 2.6102, "step": 45 }, { "epoch": 0.52, "learning_rate": 2.604166666666667e-06, "loss": 2.592, "step": 50 }, { "epoch": 0.57, "learning_rate": 2.8645833333333334e-06, "loss": 2.5777, "step": 55 }, { "epoch": 0.62, "learning_rate": 3.125e-06, "loss": 2.6194, "step": 60 }, { "epoch": 0.68, "learning_rate": 3.385416666666667e-06, "loss": 2.608, "step": 65 }, { "epoch": 0.73, "learning_rate": 3.6458333333333333e-06, "loss": 2.6459, "step": 70 }, { "epoch": 0.78, "learning_rate": 3.90625e-06, "loss": 2.6886, "step": 75 }, { "epoch": 0.83, "learning_rate": 4.166666666666667e-06, "loss": 2.4494, "step": 80 }, { "epoch": 0.89, "learning_rate": 4.427083333333334e-06, "loss": 2.6428, "step": 85 }, { "epoch": 0.94, "learning_rate": 4.6875000000000004e-06, "loss": 2.6275, "step": 90 }, { "epoch": 0.99, "learning_rate": 4.947916666666667e-06, "loss": 2.6816, "step": 95 }, { "epoch": 1.0, "eval_loss": 2.6328012943267822, "eval_runtime": 173.5828, "eval_samples_per_second": 4.413, "eval_steps_per_second": 1.106, "step": 96 }, { "epoch": 1.04, "learning_rate": 5.208333333333334e-06, "loss": 2.716, "step": 100 }, { "epoch": 1.09, "learning_rate": 5.468750000000001e-06, "loss": 2.6176, "step": 105 }, { "epoch": 1.15, "learning_rate": 5.729166666666667e-06, "loss": 2.6579, "step": 110 }, { "epoch": 1.2, "learning_rate": 5.989583333333334e-06, "loss": 2.7026, "step": 115 }, { "epoch": 1.25, "learning_rate": 6.25e-06, "loss": 2.6952, "step": 120 }, { "epoch": 1.3, "learning_rate": 6.510416666666667e-06, "loss": 2.5857, "step": 125 }, { "epoch": 1.35, "learning_rate": 6.770833333333334e-06, "loss": 2.718, "step": 130 }, { "epoch": 1.41, "learning_rate": 7.031250000000001e-06, "loss": 2.6229, "step": 135 }, { "epoch": 1.46, "learning_rate": 7.291666666666667e-06, "loss": 2.6168, "step": 140 }, { "epoch": 1.51, "learning_rate": 7.552083333333334e-06, "loss": 2.5181, "step": 145 }, { "epoch": 1.56, "learning_rate": 7.8125e-06, "loss": 2.6514, "step": 150 }, { "epoch": 1.61, "learning_rate": 8.072916666666667e-06, "loss": 2.6047, "step": 155 }, { "epoch": 1.67, "learning_rate": 8.333333333333334e-06, "loss": 2.5415, "step": 160 }, { "epoch": 1.72, "learning_rate": 8.59375e-06, "loss": 2.6892, "step": 165 }, { "epoch": 1.77, "learning_rate": 8.854166666666667e-06, "loss": 2.7038, "step": 170 }, { "epoch": 1.82, "learning_rate": 9.114583333333334e-06, "loss": 2.6528, "step": 175 }, { "epoch": 1.88, "learning_rate": 9.375000000000001e-06, "loss": 2.5846, "step": 180 }, { "epoch": 1.93, "learning_rate": 9.635416666666668e-06, "loss": 2.4526, "step": 185 }, { "epoch": 1.98, "learning_rate": 9.895833333333334e-06, "loss": 2.6582, "step": 190 }, { "epoch": 2.0, "eval_loss": 2.6169474124908447, "eval_runtime": 173.6466, "eval_samples_per_second": 4.411, "eval_steps_per_second": 1.106, "step": 192 }, { "epoch": 2.03, "learning_rate": 9.99992563069711e-06, "loss": 2.5407, "step": 195 }, { "epoch": 2.08, "learning_rate": 9.999471159635538e-06, "loss": 2.6746, "step": 200 }, { "epoch": 2.14, "learning_rate": 9.998603571300204e-06, "loss": 2.653, "step": 205 }, { "epoch": 2.19, "learning_rate": 9.997322937381829e-06, "loss": 2.5313, "step": 210 }, { "epoch": 2.24, "learning_rate": 9.995629363702008e-06, "loss": 2.7327, "step": 215 }, { "epoch": 2.29, "learning_rate": 9.993522990204453e-06, "loss": 2.5849, "step": 220 }, { "epoch": 2.34, "learning_rate": 9.991003990943424e-06, "loss": 2.6192, "step": 225 }, { "epoch": 2.4, "learning_rate": 9.988072574069363e-06, "loss": 2.6161, "step": 230 }, { "epoch": 2.45, "learning_rate": 9.984728981811676e-06, "loss": 2.536, "step": 235 }, { "epoch": 2.5, "learning_rate": 9.980973490458728e-06, "loss": 2.5393, "step": 240 }, { "epoch": 2.55, "learning_rate": 9.976806410335015e-06, "loss": 2.6175, "step": 245 }, { "epoch": 2.6, "learning_rate": 9.972228085775512e-06, "loss": 2.5245, "step": 250 }, { "epoch": 2.66, "learning_rate": 9.967238895097223e-06, "loss": 2.6436, "step": 255 }, { "epoch": 2.71, "learning_rate": 9.961839250567925e-06, "loss": 2.6141, "step": 260 }, { "epoch": 2.76, "learning_rate": 9.956029598372092e-06, "loss": 2.6642, "step": 265 }, { "epoch": 2.81, "learning_rate": 9.94981041857404e-06, "loss": 2.6052, "step": 270 }, { "epoch": 2.86, "learning_rate": 9.943182225078242e-06, "loss": 2.6589, "step": 275 }, { "epoch": 2.92, "learning_rate": 9.936145565586871e-06, "loss": 2.5745, "step": 280 }, { "epoch": 2.97, "learning_rate": 9.928701021554545e-06, "loss": 2.6676, "step": 285 }, { "epoch": 3.0, "eval_loss": 2.598314046859741, "eval_runtime": 173.726, "eval_samples_per_second": 4.409, "eval_steps_per_second": 1.105, "step": 288 }, { "epoch": 3.02, "learning_rate": 9.920849208140277e-06, "loss": 2.5292, "step": 290 }, { "epoch": 3.07, "learning_rate": 9.912590774156638e-06, "loss": 2.6445, "step": 295 }, { "epoch": 3.12, "learning_rate": 9.903926402016153e-06, "loss": 2.6219, "step": 300 }, { "epoch": 3.18, "learning_rate": 9.894856807674908e-06, "loss": 2.4258, "step": 305 }, { "epoch": 3.23, "learning_rate": 9.885382740573385e-06, "loss": 2.6602, "step": 310 }, { "epoch": 3.28, "learning_rate": 9.875504983574545e-06, "loss": 2.539, "step": 315 }, { "epoch": 3.33, "learning_rate": 9.86522435289912e-06, "loss": 2.5752, "step": 320 }, { "epoch": 3.39, "learning_rate": 9.85454169805819e-06, "loss": 2.5685, "step": 325 }, { "epoch": 3.44, "learning_rate": 9.843457901782967e-06, "loss": 2.6553, "step": 330 }, { "epoch": 3.49, "learning_rate": 9.83197387995186e-06, "loss": 2.4846, "step": 335 }, { "epoch": 3.54, "learning_rate": 9.820090581514799e-06, "loss": 2.6538, "step": 340 }, { "epoch": 3.59, "learning_rate": 9.807808988414811e-06, "loss": 2.5469, "step": 345 }, { "epoch": 3.65, "learning_rate": 9.795130115506887e-06, "loss": 2.6193, "step": 350 }, { "epoch": 3.7, "learning_rate": 9.78205501047412e-06, "loss": 2.5287, "step": 355 }, { "epoch": 3.75, "learning_rate": 9.768584753741134e-06, "loss": 2.5179, "step": 360 }, { "epoch": 3.8, "learning_rate": 9.754720458384808e-06, "loss": 2.6582, "step": 365 }, { "epoch": 3.85, "learning_rate": 9.740463270042289e-06, "loss": 2.6538, "step": 370 }, { "epoch": 3.91, "learning_rate": 9.72581436681634e-06, "loss": 2.641, "step": 375 }, { "epoch": 3.96, "learning_rate": 9.710774959177983e-06, "loss": 2.6413, "step": 380 }, { "epoch": 4.0, "eval_loss": 2.589087724685669, "eval_runtime": 173.3132, "eval_samples_per_second": 4.42, "eval_steps_per_second": 1.108, "step": 384 }, { "epoch": 4.01, "learning_rate": 9.695346289866478e-06, "loss": 2.6235, "step": 385 }, { "epoch": 4.06, "learning_rate": 9.67952963378663e-06, "loss": 2.5417, "step": 390 }, { "epoch": 4.11, "learning_rate": 9.66332629790344e-06, "loss": 2.5655, "step": 395 }, { "epoch": 4.17, "learning_rate": 9.646737621134112e-06, "loss": 2.6339, "step": 400 }, { "epoch": 4.22, "learning_rate": 9.629764974237416e-06, "loss": 2.6498, "step": 405 }, { "epoch": 4.27, "learning_rate": 9.612409759700412e-06, "loss": 2.5508, "step": 410 }, { "epoch": 4.32, "learning_rate": 9.594673411622563e-06, "loss": 2.6466, "step": 415 }, { "epoch": 4.38, "learning_rate": 9.576557395597237e-06, "loss": 2.4891, "step": 420 }, { "epoch": 4.43, "learning_rate": 9.558063208590594e-06, "loss": 2.5521, "step": 425 }, { "epoch": 4.48, "learning_rate": 9.539192378817894e-06, "loss": 2.517, "step": 430 }, { "epoch": 4.53, "learning_rate": 9.519946465617217e-06, "loss": 2.562, "step": 435 }, { "epoch": 4.58, "learning_rate": 9.500327059320606e-06, "loss": 2.7011, "step": 440 }, { "epoch": 4.64, "learning_rate": 9.480335781122661e-06, "loss": 2.4814, "step": 445 }, { "epoch": 4.69, "learning_rate": 9.459974282946572e-06, "loss": 2.6825, "step": 450 }, { "epoch": 4.74, "learning_rate": 9.439244247307618e-06, "loss": 2.5627, "step": 455 }, { "epoch": 4.79, "learning_rate": 9.41814738717414e-06, "loss": 2.5377, "step": 460 }, { "epoch": 4.84, "learning_rate": 9.396685445825987e-06, "loss": 2.6597, "step": 465 }, { "epoch": 4.9, "learning_rate": 9.374860196710474e-06, "loss": 2.6323, "step": 470 }, { "epoch": 4.95, "learning_rate": 9.352673443295834e-06, "loss": 2.609, "step": 475 }, { "epoch": 5.0, "learning_rate": 9.330127018922195e-06, "loss": 2.581, "step": 480 }, { "epoch": 5.0, "eval_loss": 2.5825417041778564, "eval_runtime": 173.2578, "eval_samples_per_second": 4.421, "eval_steps_per_second": 1.108, "step": 480 }, { "epoch": 5.05, "learning_rate": 9.307222786650079e-06, "loss": 2.6336, "step": 485 }, { "epoch": 5.1, "learning_rate": 9.283962639106464e-06, "loss": 2.6305, "step": 490 }, { "epoch": 5.16, "learning_rate": 9.260348498328393e-06, "loss": 2.5438, "step": 495 }, { "epoch": 5.21, "learning_rate": 9.23638231560414e-06, "loss": 2.5235, "step": 500 }, { "epoch": 5.26, "learning_rate": 9.212066071311978e-06, "loss": 2.5714, "step": 505 }, { "epoch": 5.31, "learning_rate": 9.18740177475654e-06, "loss": 2.6968, "step": 510 }, { "epoch": 5.36, "learning_rate": 9.162391464002776e-06, "loss": 2.5705, "step": 515 }, { "epoch": 5.42, "learning_rate": 9.137037205707552e-06, "loss": 2.5459, "step": 520 }, { "epoch": 5.47, "learning_rate": 9.111341094948876e-06, "loss": 2.5294, "step": 525 }, { "epoch": 5.52, "learning_rate": 9.08530525505277e-06, "loss": 2.5656, "step": 530 }, { "epoch": 5.57, "learning_rate": 9.058931837417823e-06, "loss": 2.6372, "step": 535 }, { "epoch": 5.62, "learning_rate": 9.032223021337415e-06, "loss": 2.539, "step": 540 }, { "epoch": 5.68, "learning_rate": 9.00518101381963e-06, "loss": 2.6005, "step": 545 }, { "epoch": 5.73, "learning_rate": 8.9778080494049e-06, "loss": 2.5895, "step": 550 }, { "epoch": 5.78, "learning_rate": 8.950106389981346e-06, "loss": 2.5937, "step": 555 }, { "epoch": 5.83, "learning_rate": 8.92207832459788e-06, "loss": 2.5635, "step": 560 }, { "epoch": 5.89, "learning_rate": 8.893726169275054e-06, "loss": 2.5176, "step": 565 }, { "epoch": 5.94, "learning_rate": 8.865052266813686e-06, "loss": 2.6101, "step": 570 }, { "epoch": 5.99, "learning_rate": 8.836058986601263e-06, "loss": 2.5884, "step": 575 }, { "epoch": 6.0, "eval_loss": 2.577636957168579, "eval_runtime": 173.3959, "eval_samples_per_second": 4.418, "eval_steps_per_second": 1.107, "step": 576 }, { "epoch": 6.04, "learning_rate": 8.806748724416156e-06, "loss": 2.5728, "step": 580 }, { "epoch": 6.09, "learning_rate": 8.777123902229658e-06, "loss": 2.652, "step": 585 }, { "epoch": 6.15, "learning_rate": 8.747186968005837e-06, "loss": 2.5489, "step": 590 }, { "epoch": 6.2, "learning_rate": 8.71694039549927e-06, "loss": 2.6273, "step": 595 }, { "epoch": 6.25, "learning_rate": 8.68638668405062e-06, "loss": 2.4337, "step": 600 }, { "epoch": 6.3, "learning_rate": 8.655528358380121e-06, "loss": 2.5657, "step": 605 }, { "epoch": 6.35, "learning_rate": 8.624367968378941e-06, "loss": 2.6715, "step": 610 }, { "epoch": 6.41, "learning_rate": 8.59290808889849e-06, "loss": 2.5736, "step": 615 }, { "epoch": 6.46, "learning_rate": 8.561151319537656e-06, "loss": 2.5889, "step": 620 }, { "epoch": 6.51, "learning_rate": 8.52910028442798e-06, "loss": 2.5689, "step": 625 }, { "epoch": 6.56, "learning_rate": 8.496757632016836e-06, "loss": 2.6729, "step": 630 }, { "epoch": 6.61, "learning_rate": 8.46412603484857e-06, "loss": 2.4771, "step": 635 }, { "epoch": 6.67, "learning_rate": 8.43120818934367e-06, "loss": 2.5163, "step": 640 }, { "epoch": 6.72, "learning_rate": 8.398006815575949e-06, "loss": 2.5749, "step": 645 }, { "epoch": 6.77, "learning_rate": 8.364524657047789e-06, "loss": 2.5716, "step": 650 }, { "epoch": 6.82, "learning_rate": 8.330764480463427e-06, "loss": 2.5333, "step": 655 }, { "epoch": 6.88, "learning_rate": 8.296729075500345e-06, "loss": 2.5299, "step": 660 }, { "epoch": 6.93, "learning_rate": 8.262421254578749e-06, "loss": 2.5526, "step": 665 }, { "epoch": 6.98, "learning_rate": 8.227843852629174e-06, "loss": 2.704, "step": 670 }, { "epoch": 7.0, "eval_loss": 2.57405686378479, "eval_runtime": 173.1315, "eval_samples_per_second": 4.424, "eval_steps_per_second": 1.109, "step": 672 }, { "epoch": 7.03, "learning_rate": 8.192999726858227e-06, "loss": 2.5605, "step": 675 }, { "epoch": 7.08, "learning_rate": 8.157891756512488e-06, "loss": 2.6373, "step": 680 }, { "epoch": 7.14, "learning_rate": 8.122522842640596e-06, "loss": 2.6236, "step": 685 }, { "epoch": 7.19, "learning_rate": 8.086895907853526e-06, "loss": 2.5717, "step": 690 }, { "epoch": 7.24, "learning_rate": 8.051013896083084e-06, "loss": 2.5228, "step": 695 }, { "epoch": 7.29, "learning_rate": 8.014879772338649e-06, "loss": 2.5166, "step": 700 }, { "epoch": 7.34, "learning_rate": 7.978496522462167e-06, "loss": 2.5228, "step": 705 }, { "epoch": 7.4, "learning_rate": 7.941867152881423e-06, "loss": 2.7549, "step": 710 }, { "epoch": 7.45, "learning_rate": 7.904994690361612e-06, "loss": 2.5288, "step": 715 }, { "epoch": 7.5, "learning_rate": 7.86788218175523e-06, "loss": 2.582, "step": 720 }, { "epoch": 7.55, "learning_rate": 7.830532693750314e-06, "loss": 2.5402, "step": 725 }, { "epoch": 7.6, "learning_rate": 7.792949312617023e-06, "loss": 2.5406, "step": 730 }, { "epoch": 7.66, "learning_rate": 7.755135143952621e-06, "loss": 2.4985, "step": 735 }, { "epoch": 7.71, "learning_rate": 7.71709331242485e-06, "loss": 2.6041, "step": 740 }, { "epoch": 7.76, "learning_rate": 7.678826961513739e-06, "loss": 2.544, "step": 745 }, { "epoch": 7.81, "learning_rate": 7.64033925325184e-06, "loss": 2.5934, "step": 750 }, { "epoch": 7.86, "learning_rate": 7.601633367962955e-06, "loss": 2.5599, "step": 755 }, { "epoch": 7.92, "learning_rate": 7.562712503999327e-06, "loss": 2.6012, "step": 760 }, { "epoch": 7.97, "learning_rate": 7.523579877477361e-06, "loss": 2.608, "step": 765 }, { "epoch": 8.0, "eval_loss": 2.571471691131592, "eval_runtime": 173.1738, "eval_samples_per_second": 4.423, "eval_steps_per_second": 1.109, "step": 768 }, { "epoch": 8.02, "learning_rate": 7.484238722011869e-06, "loss": 2.6066, "step": 770 }, { "epoch": 8.07, "learning_rate": 7.444692288448864e-06, "loss": 2.5357, "step": 775 }, { "epoch": 8.12, "learning_rate": 7.404943844596939e-06, "loss": 2.487, "step": 780 }, { "epoch": 8.18, "learning_rate": 7.364996674957243e-06, "loss": 2.5873, "step": 785 }, { "epoch": 8.23, "learning_rate": 7.324854080452071e-06, "loss": 2.6906, "step": 790 }, { "epoch": 8.28, "learning_rate": 7.284519378152104e-06, "loss": 2.575, "step": 795 }, { "epoch": 8.33, "learning_rate": 7.243995901002312e-06, "loss": 2.5426, "step": 800 }, { "epoch": 8.39, "learning_rate": 7.203286997546543e-06, "loss": 2.5829, "step": 805 }, { "epoch": 8.44, "learning_rate": 7.162396031650831e-06, "loss": 2.5422, "step": 810 }, { "epoch": 8.49, "learning_rate": 7.121326382225429e-06, "loss": 2.5977, "step": 815 }, { "epoch": 8.54, "learning_rate": 7.080081442945597e-06, "loss": 2.5636, "step": 820 }, { "epoch": 8.59, "learning_rate": 7.038664621971184e-06, "loss": 2.6223, "step": 825 }, { "epoch": 8.65, "learning_rate": 6.997079341665003e-06, "loss": 2.5567, "step": 830 }, { "epoch": 8.7, "learning_rate": 6.955329038310028e-06, "loss": 2.5051, "step": 835 }, { "epoch": 8.75, "learning_rate": 6.913417161825449e-06, "loss": 2.5383, "step": 840 }, { "epoch": 8.8, "learning_rate": 6.871347175481602e-06, "loss": 2.5363, "step": 845 }, { "epoch": 8.85, "learning_rate": 6.829122555613786e-06, "loss": 2.6013, "step": 850 }, { "epoch": 8.91, "learning_rate": 6.786746791335001e-06, "loss": 2.6019, "step": 855 }, { "epoch": 8.96, "learning_rate": 6.7442233842476545e-06, "loss": 2.5454, "step": 860 }, { "epoch": 9.0, "eval_loss": 2.5697662830352783, "eval_runtime": 173.2483, "eval_samples_per_second": 4.421, "eval_steps_per_second": 1.108, "step": 864 }, { "epoch": 9.01, "learning_rate": 6.701555848154193e-06, "loss": 2.6289, "step": 865 }, { "epoch": 9.06, "learning_rate": 6.6587477087667615e-06, "loss": 2.6078, "step": 870 }, { "epoch": 9.11, "learning_rate": 6.615802503415865e-06, "loss": 2.5328, "step": 875 }, { "epoch": 9.17, "learning_rate": 6.572723780758069e-06, "loss": 2.537, "step": 880 }, { "epoch": 9.22, "learning_rate": 6.529515100482768e-06, "loss": 2.5444, "step": 885 }, { "epoch": 9.27, "learning_rate": 6.486180033018039e-06, "loss": 2.4951, "step": 890 }, { "epoch": 9.32, "learning_rate": 6.442722159235608e-06, "loss": 2.5551, "step": 895 }, { "epoch": 9.38, "learning_rate": 6.399145070154962e-06, "loss": 2.6229, "step": 900 }, { "epoch": 9.43, "learning_rate": 6.355452366646602e-06, "loss": 2.4483, "step": 905 }, { "epoch": 9.48, "learning_rate": 6.311647659134509e-06, "loss": 2.659, "step": 910 }, { "epoch": 9.53, "learning_rate": 6.267734567297799e-06, "loss": 2.565, "step": 915 }, { "epoch": 9.58, "learning_rate": 6.2237167197716195e-06, "loss": 2.5539, "step": 920 }, { "epoch": 9.64, "learning_rate": 6.179597753847317e-06, "loss": 2.5881, "step": 925 }, { "epoch": 9.69, "learning_rate": 6.135381315171867e-06, "loss": 2.5665, "step": 930 }, { "epoch": 9.74, "learning_rate": 6.091071057446635e-06, "loss": 2.5365, "step": 935 }, { "epoch": 9.79, "learning_rate": 6.046670642125461e-06, "loss": 2.6475, "step": 940 }, { "epoch": 9.84, "learning_rate": 6.002183738112103e-06, "loss": 2.6788, "step": 945 }, { "epoch": 9.9, "learning_rate": 5.957614021457072e-06, "loss": 2.5368, "step": 950 }, { "epoch": 9.95, "learning_rate": 5.912965175053867e-06, "loss": 2.547, "step": 955 }, { "epoch": 10.0, "learning_rate": 5.8682408883346535e-06, "loss": 2.5938, "step": 960 }, { "epoch": 10.0, "eval_loss": 2.5687718391418457, "eval_runtime": 173.1474, "eval_samples_per_second": 4.424, "eval_steps_per_second": 1.109, "step": 960 }, { "epoch": 10.05, "learning_rate": 5.823444856965393e-06, "loss": 2.6, "step": 965 }, { "epoch": 10.1, "learning_rate": 5.77858078254047e-06, "loss": 2.6841, "step": 970 }, { "epoch": 10.16, "learning_rate": 5.733652372276809e-06, "loss": 2.6076, "step": 975 }, { "epoch": 10.21, "learning_rate": 5.688663338707554e-06, "loss": 2.6151, "step": 980 }, { "epoch": 10.26, "learning_rate": 5.643617399375281e-06, "loss": 2.5886, "step": 985 }, { "epoch": 10.31, "learning_rate": 5.598518276524813e-06, "loss": 2.6533, "step": 990 }, { "epoch": 10.36, "learning_rate": 5.553369696795647e-06, "loss": 2.3891, "step": 995 }, { "epoch": 10.42, "learning_rate": 5.50817539091401e-06, "loss": 2.697, "step": 1000 }, { "epoch": 10.47, "learning_rate": 5.462939093384579e-06, "loss": 2.5268, "step": 1005 }, { "epoch": 10.52, "learning_rate": 5.417664542181894e-06, "loss": 2.5242, "step": 1010 }, { "epoch": 10.57, "learning_rate": 5.372355478441483e-06, "loss": 2.5093, "step": 1015 }, { "epoch": 10.62, "learning_rate": 5.327015646150716e-06, "loss": 2.5013, "step": 1020 }, { "epoch": 10.68, "learning_rate": 5.2816487918394385e-06, "loss": 2.4911, "step": 1025 }, { "epoch": 10.73, "learning_rate": 5.236258664270385e-06, "loss": 2.6194, "step": 1030 }, { "epoch": 10.78, "learning_rate": 5.1908490141294085e-06, "loss": 2.4813, "step": 1035 }, { "epoch": 10.83, "learning_rate": 5.145423593715558e-06, "loss": 2.5679, "step": 1040 }, { "epoch": 10.89, "learning_rate": 5.09998615663101e-06, "loss": 2.5244, "step": 1045 }, { "epoch": 10.94, "learning_rate": 5.054540457470912e-06, "loss": 2.6256, "step": 1050 }, { "epoch": 10.99, "learning_rate": 5.009090251513119e-06, "loss": 2.6129, "step": 1055 }, { "epoch": 11.0, "eval_loss": 2.568225145339966, "eval_runtime": 173.1937, "eval_samples_per_second": 4.423, "eval_steps_per_second": 1.109, "step": 1056 }, { "epoch": 11.04, "learning_rate": 4.963639294407893e-06, "loss": 2.4814, "step": 1060 }, { "epoch": 11.09, "learning_rate": 4.918191341867566e-06, "loss": 2.6482, "step": 1065 }, { "epoch": 11.15, "learning_rate": 4.8727501493562e-06, "loss": 2.6263, "step": 1070 }, { "epoch": 11.2, "learning_rate": 4.827319471779255e-06, "loss": 2.6835, "step": 1075 }, { "epoch": 11.25, "learning_rate": 4.781903063173321e-06, "loss": 2.4925, "step": 1080 }, { "epoch": 11.3, "learning_rate": 4.736504676395912e-06, "loss": 2.51, "step": 1085 }, { "epoch": 11.35, "learning_rate": 4.691128062815361e-06, "loss": 2.5933, "step": 1090 }, { "epoch": 11.41, "learning_rate": 4.64577697200083e-06, "loss": 2.6073, "step": 1095 }, { "epoch": 11.46, "learning_rate": 4.600455151412482e-06, "loss": 2.6271, "step": 1100 }, { "epoch": 11.51, "learning_rate": 4.555166346091811e-06, "loss": 2.5913, "step": 1105 }, { "epoch": 11.56, "learning_rate": 4.509914298352197e-06, "loss": 2.4734, "step": 1110 }, { "epoch": 11.61, "learning_rate": 4.464702747469654e-06, "loss": 2.4006, "step": 1115 }, { "epoch": 11.67, "learning_rate": 4.4195354293738484e-06, "loss": 2.5788, "step": 1120 }, { "epoch": 11.72, "learning_rate": 4.374416076339405e-06, "loss": 2.6018, "step": 1125 }, { "epoch": 11.77, "learning_rate": 4.3293484166774795e-06, "loss": 2.3919, "step": 1130 }, { "epoch": 11.82, "learning_rate": 4.2843361744276965e-06, "loss": 2.5624, "step": 1135 }, { "epoch": 11.88, "learning_rate": 4.239383069050417e-06, "loss": 2.6089, "step": 1140 }, { "epoch": 11.93, "learning_rate": 4.194492815119393e-06, "loss": 2.6184, "step": 1145 }, { "epoch": 11.98, "learning_rate": 4.149669122014823e-06, "loss": 2.6334, "step": 1150 }, { "epoch": 12.0, "eval_loss": 2.5679450035095215, "eval_runtime": 173.1818, "eval_samples_per_second": 4.423, "eval_steps_per_second": 1.109, "step": 1152 }, { "epoch": 12.03, "learning_rate": 4.104915693616838e-06, "loss": 2.6666, "step": 1155 }, { "epoch": 12.08, "learning_rate": 4.060236227999441e-06, "loss": 2.5513, "step": 1160 }, { "epoch": 12.14, "learning_rate": 4.015634417124932e-06, "loss": 2.4863, "step": 1165 }, { "epoch": 12.19, "learning_rate": 3.971113946538826e-06, "loss": 2.5857, "step": 1170 }, { "epoch": 12.24, "learning_rate": 3.926678495065313e-06, "loss": 2.6008, "step": 1175 }, { "epoch": 12.29, "learning_rate": 3.882331734503263e-06, "loss": 2.5744, "step": 1180 }, { "epoch": 12.34, "learning_rate": 3.838077329322828e-06, "loss": 2.6885, "step": 1185 }, { "epoch": 12.4, "learning_rate": 3.7939189363626282e-06, "loss": 2.6531, "step": 1190 }, { "epoch": 12.45, "learning_rate": 3.7498602045275846e-06, "loss": 2.6082, "step": 1195 }, { "epoch": 12.5, "learning_rate": 3.705904774487396e-06, "loss": 2.5587, "step": 1200 }, { "epoch": 12.55, "learning_rate": 3.6620562783757163e-06, "loss": 2.4835, "step": 1205 }, { "epoch": 12.6, "learning_rate": 3.618318339490009e-06, "loss": 2.5595, "step": 1210 }, { "epoch": 12.66, "learning_rate": 3.5746945719921476e-06, "loss": 2.5684, "step": 1215 }, { "epoch": 12.71, "learning_rate": 3.531188580609778e-06, "loss": 2.4877, "step": 1220 }, { "epoch": 12.76, "learning_rate": 3.4878039603384505e-06, "loss": 2.7007, "step": 1225 }, { "epoch": 12.81, "learning_rate": 3.444544296144546e-06, "loss": 2.5391, "step": 1230 }, { "epoch": 12.86, "learning_rate": 3.401413162669057e-06, "loss": 2.5575, "step": 1235 }, { "epoch": 12.92, "learning_rate": 3.3584141239321953e-06, "loss": 2.5095, "step": 1240 }, { "epoch": 12.97, "learning_rate": 3.3155507330389004e-06, "loss": 2.5013, "step": 1245 }, { "epoch": 13.0, "eval_loss": 2.567814350128174, "eval_runtime": 173.1218, "eval_samples_per_second": 4.425, "eval_steps_per_second": 1.109, "step": 1248 }, { "epoch": 13.02, "learning_rate": 3.272826531885229e-06, "loss": 2.4931, "step": 1250 }, { "epoch": 13.07, "learning_rate": 3.2302450508656835e-06, "loss": 2.6623, "step": 1255 }, { "epoch": 13.12, "learning_rate": 3.1878098085814926e-06, "loss": 2.6147, "step": 1260 }, { "epoch": 13.18, "learning_rate": 3.1455243115498523e-06, "loss": 2.4961, "step": 1265 }, { "epoch": 13.23, "learning_rate": 3.1033920539141837e-06, "loss": 2.5426, "step": 1270 }, { "epoch": 13.28, "learning_rate": 3.061416517155397e-06, "loss": 2.6229, "step": 1275 }, { "epoch": 13.33, "learning_rate": 3.019601169804216e-06, "loss": 2.5512, "step": 1280 }, { "epoch": 13.39, "learning_rate": 2.97794946715456e-06, "loss": 2.5486, "step": 1285 }, { "epoch": 13.44, "learning_rate": 2.936464850978027e-06, "loss": 2.5661, "step": 1290 }, { "epoch": 13.49, "learning_rate": 2.8951507492394937e-06, "loss": 2.5613, "step": 1295 }, { "epoch": 13.54, "learning_rate": 2.854010575813856e-06, "loss": 2.5139, "step": 1300 }, { "epoch": 13.59, "learning_rate": 2.8130477302039292e-06, "loss": 2.5344, "step": 1305 }, { "epoch": 13.65, "learning_rate": 2.7722655972595438e-06, "loss": 2.6115, "step": 1310 }, { "epoch": 13.7, "learning_rate": 2.731667546897845e-06, "loss": 2.551, "step": 1315 }, { "epoch": 13.75, "learning_rate": 2.6912569338248317e-06, "loss": 2.6828, "step": 1320 }, { "epoch": 13.8, "learning_rate": 2.6510370972581455e-06, "loss": 2.502, "step": 1325 }, { "epoch": 13.85, "learning_rate": 2.61101136065115e-06, "loss": 2.6054, "step": 1330 }, { "epoch": 13.91, "learning_rate": 2.5711830314182996e-06, "loss": 2.6214, "step": 1335 }, { "epoch": 13.96, "learning_rate": 2.5315554006618487e-06, "loss": 2.519, "step": 1340 }, { "epoch": 14.0, "eval_loss": 2.567744255065918, "eval_runtime": 174.5513, "eval_samples_per_second": 4.388, "eval_steps_per_second": 1.1, "step": 1344 }, { "epoch": 14.01, "learning_rate": 2.4921317428998924e-06, "loss": 2.5582, "step": 1345 }, { "epoch": 14.06, "learning_rate": 2.4529153157957913e-06, "loss": 2.4842, "step": 1350 }, { "epoch": 14.11, "learning_rate": 2.4139093598889806e-06, "loss": 2.5287, "step": 1355 }, { "epoch": 14.17, "learning_rate": 2.3751170983272e-06, "loss": 2.6123, "step": 1360 }, { "epoch": 14.22, "learning_rate": 2.3365417366001552e-06, "loss": 2.6674, "step": 1365 }, { "epoch": 14.27, "learning_rate": 2.2981864622746438e-06, "loss": 2.6728, "step": 1370 }, { "epoch": 14.32, "learning_rate": 2.260054444731155e-06, "loss": 2.5313, "step": 1375 }, { "epoch": 14.38, "learning_rate": 2.2221488349019903e-06, "loss": 2.664, "step": 1380 }, { "epoch": 14.43, "learning_rate": 2.184472765010871e-06, "loss": 2.4754, "step": 1385 }, { "epoch": 14.48, "learning_rate": 2.147029348314136e-06, "loss": 2.6064, "step": 1390 }, { "epoch": 14.53, "learning_rate": 2.109821678843484e-06, "loss": 2.5408, "step": 1395 }, { "epoch": 14.58, "learning_rate": 2.0728528311502977e-06, "loss": 2.6351, "step": 1400 }, { "epoch": 14.64, "learning_rate": 2.036125860051594e-06, "loss": 2.4797, "step": 1405 }, { "epoch": 14.69, "learning_rate": 1.999643800377596e-06, "loss": 2.6001, "step": 1410 }, { "epoch": 14.74, "learning_rate": 1.963409666720958e-06, "loss": 2.5746, "step": 1415 }, { "epoch": 14.79, "learning_rate": 1.927426453187663e-06, "loss": 2.5829, "step": 1420 }, { "epoch": 14.84, "learning_rate": 1.8916971331496143e-06, "loss": 2.5435, "step": 1425 }, { "epoch": 14.9, "learning_rate": 1.8562246589989369e-06, "loss": 2.5677, "step": 1430 }, { "epoch": 14.95, "learning_rate": 1.8210119619040206e-06, "loss": 2.4879, "step": 1435 }, { "epoch": 15.0, "learning_rate": 1.7860619515673034e-06, "loss": 2.5076, "step": 1440 }, { "epoch": 15.0, "eval_loss": 2.5677287578582764, "eval_runtime": 174.6013, "eval_samples_per_second": 4.387, "eval_steps_per_second": 1.1, "step": 1440 }, { "epoch": 15.05, "learning_rate": 1.75137751598484e-06, "loss": 2.522, "step": 1445 }, { "epoch": 15.1, "learning_rate": 1.7169615212076612e-06, "loss": 2.5531, "step": 1450 }, { "epoch": 15.16, "learning_rate": 1.6828168111049454e-06, "loss": 2.6733, "step": 1455 }, { "epoch": 15.21, "learning_rate": 1.6489462071290213e-06, "loss": 2.5552, "step": 1460 }, { "epoch": 15.26, "learning_rate": 1.615352508082229e-06, "loss": 2.5467, "step": 1465 }, { "epoch": 15.31, "learning_rate": 1.5820384898856433e-06, "loss": 2.4187, "step": 1470 }, { "epoch": 15.36, "learning_rate": 1.549006905349702e-06, "loss": 2.5954, "step": 1475 }, { "epoch": 15.42, "learning_rate": 1.5162604839467265e-06, "loss": 2.6387, "step": 1480 }, { "epoch": 15.47, "learning_rate": 1.4838019315853796e-06, "loss": 2.6038, "step": 1485 }, { "epoch": 15.52, "learning_rate": 1.4516339303870763e-06, "loss": 2.5639, "step": 1490 }, { "epoch": 15.57, "learning_rate": 1.419759138464355e-06, "loss": 2.5704, "step": 1495 }, { "epoch": 15.62, "learning_rate": 1.3881801897012225e-06, "loss": 2.4915, "step": 1500 }, { "epoch": 15.68, "learning_rate": 1.3568996935355194e-06, "loss": 2.5739, "step": 1505 }, { "epoch": 15.73, "learning_rate": 1.325920234743291e-06, "loss": 2.6971, "step": 1510 }, { "epoch": 15.78, "learning_rate": 1.2952443732252058e-06, "loss": 2.5222, "step": 1515 }, { "epoch": 15.83, "learning_rate": 1.264874643795021e-06, "loss": 2.4783, "step": 1520 }, { "epoch": 15.89, "learning_rate": 1.234813555970129e-06, "loss": 2.5954, "step": 1525 }, { "epoch": 15.94, "learning_rate": 1.2050635937641909e-06, "loss": 2.6067, "step": 1530 }, { "epoch": 15.99, "learning_rate": 1.1756272154818715e-06, "loss": 2.5443, "step": 1535 }, { "epoch": 16.0, "eval_loss": 2.567720413208008, "eval_runtime": 174.877, "eval_samples_per_second": 4.38, "eval_steps_per_second": 1.098, "step": 1536 }, { "epoch": 16.04, "learning_rate": 1.1465068535157098e-06, "loss": 2.5967, "step": 1540 }, { "epoch": 16.09, "learning_rate": 1.1177049141451223e-06, "loss": 2.6367, "step": 1545 }, { "epoch": 16.15, "learning_rate": 1.089223777337568e-06, "loss": 2.5624, "step": 1550 }, { "epoch": 16.2, "learning_rate": 1.0610657965518861e-06, "loss": 2.5973, "step": 1555 }, { "epoch": 16.25, "learning_rate": 1.0332332985438248e-06, "loss": 2.4671, "step": 1560 }, { "epoch": 16.3, "learning_rate": 1.0057285831737739e-06, "loss": 2.5794, "step": 1565 }, { "epoch": 16.35, "learning_rate": 9.785539232167296e-07, "loss": 2.6019, "step": 1570 }, { "epoch": 16.41, "learning_rate": 9.517115641744795e-07, "loss": 2.3944, "step": 1575 }, { "epoch": 16.46, "learning_rate": 9.252037240900618e-07, "loss": 2.6003, "step": 1580 }, { "epoch": 16.51, "learning_rate": 8.990325933644717e-07, "loss": 2.5742, "step": 1585 }, { "epoch": 16.56, "learning_rate": 8.732003345756812e-07, "loss": 2.5666, "step": 1590 }, { "epoch": 16.61, "learning_rate": 8.47709082299924e-07, "loss": 2.5089, "step": 1595 }, { "epoch": 16.67, "learning_rate": 8.225609429353187e-07, "loss": 2.6752, "step": 1600 }, { "epoch": 16.72, "learning_rate": 7.977579945278091e-07, "loss": 2.6726, "step": 1605 }, { "epoch": 16.77, "learning_rate": 7.733022865994599e-07, "loss": 2.5055, "step": 1610 }, { "epoch": 16.82, "learning_rate": 7.491958399790827e-07, "loss": 2.564, "step": 1615 }, { "epoch": 16.88, "learning_rate": 7.254406466352682e-07, "loss": 2.5455, "step": 1620 }, { "epoch": 16.93, "learning_rate": 7.020386695117732e-07, "loss": 2.5649, "step": 1625 }, { "epoch": 16.98, "learning_rate": 6.789918423653285e-07, "loss": 2.5972, "step": 1630 }, { "epoch": 17.0, "eval_loss": 2.5676934719085693, "eval_runtime": 164.8301, "eval_samples_per_second": 4.647, "eval_steps_per_second": 0.582, "step": 1632 }, { "epoch": 17.03, "learning_rate": 6.563020696058387e-07, "loss": 2.5949, "step": 1635 }, { "epoch": 17.08, "learning_rate": 6.339712261390213e-07, "loss": 2.5258, "step": 1640 }, { "epoch": 17.14, "learning_rate": 6.120011572114803e-07, "loss": 2.6606, "step": 1645 }, { "epoch": 17.19, "learning_rate": 5.903936782582253e-07, "loss": 2.5174, "step": 1650 }, { "epoch": 17.24, "learning_rate": 5.691505747526633e-07, "loss": 2.6218, "step": 1655 }, { "epoch": 17.29, "learning_rate": 5.482736020590551e-07, "loss": 2.5367, "step": 1660 }, { "epoch": 17.34, "learning_rate": 5.277644852874742e-07, "loss": 2.6172, "step": 1665 }, { "epoch": 17.4, "learning_rate": 5.076249191512461e-07, "loss": 2.5443, "step": 1670 }, { "epoch": 17.45, "learning_rate": 4.878565678269204e-07, "loss": 2.406, "step": 1675 }, { "epoch": 17.5, "learning_rate": 4.6846106481675035e-07, "loss": 2.6111, "step": 1680 }, { "epoch": 17.55, "learning_rate": 4.494400128137144e-07, "loss": 2.5944, "step": 1685 }, { "epoch": 17.6, "learning_rate": 4.3079498356908446e-07, "loss": 2.589, "step": 1690 }, { "epoch": 17.66, "learning_rate": 4.1252751776254373e-07, "loss": 2.6432, "step": 1695 }, { "epoch": 17.71, "learning_rate": 3.946391248748821e-07, "loss": 2.6341, "step": 1700 }, { "epoch": 17.76, "learning_rate": 3.7713128306326286e-07, "loss": 2.4919, "step": 1705 }, { "epoch": 17.81, "learning_rate": 3.600054390390778e-07, "loss": 2.4727, "step": 1710 }, { "epoch": 17.86, "learning_rate": 3.4326300794840174e-07, "loss": 2.5534, "step": 1715 }, { "epoch": 17.92, "learning_rate": 3.269053732550581e-07, "loss": 2.6066, "step": 1720 }, { "epoch": 17.97, "learning_rate": 3.1093388662630173e-07, "loss": 2.5361, "step": 1725 }, { "epoch": 18.0, "eval_loss": 2.567695140838623, "eval_runtime": 164.9524, "eval_samples_per_second": 4.644, "eval_steps_per_second": 0.582, "step": 1728 }, { "epoch": 18.02, "learning_rate": 2.9534986782112306e-07, "loss": 2.608, "step": 1730 }, { "epoch": 18.07, "learning_rate": 2.80154604581197e-07, "loss": 2.5554, "step": 1735 }, { "epoch": 18.12, "learning_rate": 2.653493525244721e-07, "loss": 2.6435, "step": 1740 }, { "epoch": 18.18, "learning_rate": 2.5093533504141786e-07, "loss": 2.5905, "step": 1745 }, { "epoch": 18.23, "learning_rate": 2.3691374319393168e-07, "loss": 2.5416, "step": 1750 }, { "epoch": 18.28, "learning_rate": 2.232857356169199e-07, "loss": 2.5946, "step": 1755 }, { "epoch": 18.33, "learning_rate": 2.1005243842255552e-07, "loss": 2.5916, "step": 1760 }, { "epoch": 18.39, "learning_rate": 1.972149451072297e-07, "loss": 2.5382, "step": 1765 }, { "epoch": 18.44, "learning_rate": 1.8477431646118648e-07, "loss": 2.6638, "step": 1770 }, { "epoch": 18.49, "learning_rate": 1.7273158048087434e-07, "loss": 2.5931, "step": 1775 }, { "epoch": 18.54, "learning_rate": 1.6108773228399543e-07, "loss": 2.5053, "step": 1780 }, { "epoch": 18.59, "learning_rate": 1.4984373402728014e-07, "loss": 2.5557, "step": 1785 }, { "epoch": 18.65, "learning_rate": 1.3900051482698074e-07, "loss": 2.5686, "step": 1790 }, { "epoch": 18.7, "learning_rate": 1.2855897068209555e-07, "loss": 2.594, "step": 1795 }, { "epoch": 18.75, "learning_rate": 1.185199644003332e-07, "loss": 2.4774, "step": 1800 }, { "epoch": 18.8, "learning_rate": 1.0888432552681405e-07, "loss": 2.4623, "step": 1805 }, { "epoch": 18.85, "learning_rate": 9.965285027552452e-08, "loss": 2.538, "step": 1810 }, { "epoch": 18.91, "learning_rate": 9.082630146352356e-08, "loss": 2.5888, "step": 1815 }, { "epoch": 18.96, "learning_rate": 8.240540844791145e-08, "loss": 2.6119, "step": 1820 }, { "epoch": 19.0, "eval_loss": 2.567678689956665, "eval_runtime": 164.8569, "eval_samples_per_second": 4.646, "eval_steps_per_second": 0.582, "step": 1824 }, { "epoch": 19.01, "learning_rate": 7.439086706555743e-08, "loss": 2.6545, "step": 1825 }, { "epoch": 19.06, "learning_rate": 6.678333957560513e-08, "loss": 2.6277, "step": 1830 }, { "epoch": 19.11, "learning_rate": 5.958345460474635e-08, "loss": 2.533, "step": 1835 }, { "epoch": 19.17, "learning_rate": 5.279180709527765e-08, "loss": 2.5901, "step": 1840 }, { "epoch": 19.22, "learning_rate": 4.640895825593683e-08, "loss": 2.6012, "step": 1845 }, { "epoch": 19.27, "learning_rate": 4.0435435515532304e-08, "loss": 2.6356, "step": 1850 }, { "epoch": 19.32, "learning_rate": 3.487173247935627e-08, "loss": 2.5524, "step": 1855 }, { "epoch": 19.38, "learning_rate": 2.971830888840177e-08, "loss": 2.5988, "step": 1860 }, { "epoch": 19.43, "learning_rate": 2.4975590581369778e-08, "loss": 2.6605, "step": 1865 }, { "epoch": 19.48, "learning_rate": 2.0643969459482326e-08, "loss": 2.5516, "step": 1870 }, { "epoch": 19.53, "learning_rate": 1.6723803454098408e-08, "loss": 2.6022, "step": 1875 }, { "epoch": 19.58, "learning_rate": 1.3215416497138756e-08, "loss": 2.5515, "step": 1880 }, { "epoch": 19.64, "learning_rate": 1.0119098494316693e-08, "loss": 2.4395, "step": 1885 }, { "epoch": 19.69, "learning_rate": 7.43510530118452e-09, "loss": 2.5337, "step": 1890 }, { "epoch": 19.74, "learning_rate": 5.163658701989316e-09, "loss": 2.4588, "step": 1895 }, { "epoch": 19.79, "learning_rate": 3.304946391349817e-09, "loss": 2.5766, "step": 1900 }, { "epoch": 19.84, "learning_rate": 1.8591219587416053e-09, "loss": 2.4665, "step": 1905 }, { "epoch": 19.9, "learning_rate": 8.26304875812256e-10, "loss": 2.6277, "step": 1910 }, { "epoch": 19.95, "learning_rate": 2.0658048650257223e-10, "loss": 2.5522, "step": 1915 }, { "epoch": 20.0, "learning_rate": 0.0, "loss": 2.6321, "step": 1920 }, { "epoch": 20.0, "eval_loss": 2.5677125453948975, "eval_runtime": 164.791, "eval_samples_per_second": 4.648, "eval_steps_per_second": 0.583, "step": 1920 }, { "epoch": 20.0, "step": 1920, "total_flos": 1.0984887148766822e+18, "train_loss": 0.16053936282793682, "train_runtime": 1034.0653, "train_samples_per_second": 14.815, "train_steps_per_second": 1.857 } ], "logging_steps": 5, "max_steps": 1920, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 50, "total_flos": 1.0984887148766822e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }