{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.27020630048712, "global_step": 1000000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.9999999999999997e-06, "loss": 0.8784, "step": 1000 }, { "epoch": 0.03, "learning_rate": 5.999999999999999e-06, "loss": 0.77, "step": 2000 }, { "epoch": 0.05, "learning_rate": 8.999999999999999e-06, "loss": 0.7664, "step": 3000 }, { "epoch": 0.06, "learning_rate": 1.1999999999999999e-05, "loss": 0.7655, "step": 4000 }, { "epoch": 0.08, "learning_rate": 1.4999999999999999e-05, "loss": 0.765, "step": 5000 }, { "epoch": 0.08, "eval_runtime": 1.3797, "eval_samples_per_second": 724.791, "eval_steps_per_second": 11.597, "step": 5000 }, { "epoch": 0.09, "learning_rate": 1.7999999999999997e-05, "loss": 0.7647, "step": 6000 }, { "epoch": 0.11, "learning_rate": 2.1e-05, "loss": 0.7644, "step": 7000 }, { "epoch": 0.12, "learning_rate": 2.3999999999999997e-05, "loss": 0.7638, "step": 8000 }, { "epoch": 0.14, "learning_rate": 2.6999999999999996e-05, "loss": 0.7633, "step": 9000 }, { "epoch": 0.15, "learning_rate": 2.9999999999999997e-05, "loss": 0.76, "step": 10000 }, { "epoch": 0.15, "eval_runtime": 1.1376, "eval_samples_per_second": 879.066, "eval_steps_per_second": 14.065, "step": 10000 }, { "epoch": 0.17, "learning_rate": 3.2999999999999996e-05, "loss": 0.7148, "step": 11000 }, { "epoch": 0.18, "learning_rate": 3.5999999999999994e-05, "loss": 0.6963, "step": 12000 }, { "epoch": 0.2, "learning_rate": 3.9e-05, "loss": 0.6755, "step": 13000 }, { "epoch": 0.21, "learning_rate": 4.2e-05, "loss": 0.6516, "step": 14000 }, { "epoch": 0.23, "learning_rate": 4.4999999999999996e-05, "loss": 0.6412, "step": 15000 }, { "epoch": 0.23, "eval_runtime": 1.1689, "eval_samples_per_second": 855.472, "eval_steps_per_second": 13.688, "step": 15000 }, { "epoch": 0.24, "learning_rate": 4.7999999999999994e-05, "loss": 0.6348, "step": 16000 }, { "epoch": 0.26, "learning_rate": 5.1e-05, "loss": 0.6295, "step": 17000 }, { "epoch": 0.27, "learning_rate": 5.399999999999999e-05, "loss": 0.6224, "step": 18000 }, { "epoch": 0.29, "learning_rate": 5.6999999999999996e-05, "loss": 0.6169, "step": 19000 }, { "epoch": 0.31, "learning_rate": 5.9999999999999995e-05, "loss": 0.6113, "step": 20000 }, { "epoch": 0.31, "eval_runtime": 1.0179, "eval_samples_per_second": 982.441, "eval_steps_per_second": 15.719, "step": 20000 }, { "epoch": 0.32, "learning_rate": 6.299999999999999e-05, "loss": 0.6074, "step": 21000 }, { "epoch": 0.34, "learning_rate": 6.599999999999999e-05, "loss": 0.6039, "step": 22000 }, { "epoch": 0.35, "learning_rate": 6.9e-05, "loss": 0.6005, "step": 23000 }, { "epoch": 0.37, "learning_rate": 7.199999999999999e-05, "loss": 0.5968, "step": 24000 }, { "epoch": 0.38, "learning_rate": 7.5e-05, "loss": 0.5932, "step": 25000 }, { "epoch": 0.38, "eval_runtime": 1.1249, "eval_samples_per_second": 888.989, "eval_steps_per_second": 14.224, "step": 25000 }, { "epoch": 0.4, "learning_rate": 7.8e-05, "loss": 0.5912, "step": 26000 }, { "epoch": 0.41, "learning_rate": 8.1e-05, "loss": 0.58, "step": 27000 }, { "epoch": 0.43, "learning_rate": 8.4e-05, "loss": 0.5698, "step": 28000 }, { "epoch": 0.44, "learning_rate": 8.699999999999999e-05, "loss": 0.5639, "step": 29000 }, { "epoch": 0.46, "learning_rate": 8.999999999999999e-05, "loss": 0.5601, "step": 30000 }, { "epoch": 0.46, "eval_runtime": 1.0096, "eval_samples_per_second": 990.512, "eval_steps_per_second": 15.848, "step": 30000 }, { "epoch": 0.47, "learning_rate": 9.3e-05, "loss": 0.5536, "step": 31000 }, { "epoch": 0.49, "learning_rate": 9.599999999999999e-05, "loss": 0.5496, "step": 32000 }, { "epoch": 0.5, "learning_rate": 9.9e-05, "loss": 0.5458, "step": 33000 }, { "epoch": 0.52, "learning_rate": 0.000102, "loss": 0.5426, "step": 34000 }, { "epoch": 0.53, "learning_rate": 0.00010499999999999999, "loss": 0.5394, "step": 35000 }, { "epoch": 0.53, "eval_runtime": 1.3102, "eval_samples_per_second": 763.27, "eval_steps_per_second": 12.212, "step": 35000 }, { "epoch": 0.55, "learning_rate": 0.00010799999999999998, "loss": 0.5345, "step": 36000 }, { "epoch": 0.56, "learning_rate": 0.00011099999999999999, "loss": 0.5302, "step": 37000 }, { "epoch": 0.58, "learning_rate": 0.00011399999999999999, "loss": 0.527, "step": 38000 }, { "epoch": 0.6, "learning_rate": 0.000117, "loss": 0.5232, "step": 39000 }, { "epoch": 0.61, "learning_rate": 0.00011999999999999999, "loss": 0.5202, "step": 40000 }, { "epoch": 0.61, "eval_runtime": 1.0146, "eval_samples_per_second": 985.598, "eval_steps_per_second": 15.77, "step": 40000 }, { "epoch": 0.63, "learning_rate": 0.00012299999999999998, "loss": 0.5163, "step": 41000 }, { "epoch": 0.64, "learning_rate": 0.00012599999999999997, "loss": 0.5126, "step": 42000 }, { "epoch": 0.66, "learning_rate": 0.000129, "loss": 0.5094, "step": 43000 }, { "epoch": 0.67, "learning_rate": 0.00013199999999999998, "loss": 0.5061, "step": 44000 }, { "epoch": 0.69, "learning_rate": 0.000135, "loss": 0.5036, "step": 45000 }, { "epoch": 0.69, "eval_runtime": 1.0362, "eval_samples_per_second": 965.067, "eval_steps_per_second": 15.441, "step": 45000 }, { "epoch": 0.7, "learning_rate": 0.000138, "loss": 0.4995, "step": 46000 }, { "epoch": 0.72, "learning_rate": 0.00014099999999999998, "loss": 0.4967, "step": 47000 }, { "epoch": 0.73, "learning_rate": 0.00014399999999999998, "loss": 0.4934, "step": 48000 }, { "epoch": 0.75, "learning_rate": 0.000147, "loss": 0.4898, "step": 49000 }, { "epoch": 0.76, "learning_rate": 0.00015, "loss": 0.4863, "step": 50000 }, { "epoch": 0.76, "eval_runtime": 1.0374, "eval_samples_per_second": 963.96, "eval_steps_per_second": 15.423, "step": 50000 }, { "epoch": 0.78, "learning_rate": 0.0001499996172456075, "loss": 0.4824, "step": 51000 }, { "epoch": 0.79, "learning_rate": 0.00014999846898661572, "loss": 0.4778, "step": 52000 }, { "epoch": 0.81, "learning_rate": 0.00014999655523558183, "loss": 0.474, "step": 53000 }, { "epoch": 0.82, "learning_rate": 0.00014999387601343436, "loss": 0.4694, "step": 54000 }, { "epoch": 0.84, "learning_rate": 0.00014999043134947282, "loss": 0.4651, "step": 55000 }, { "epoch": 0.84, "eval_runtime": 1.0465, "eval_samples_per_second": 955.566, "eval_steps_per_second": 15.289, "step": 55000 }, { "epoch": 0.86, "learning_rate": 0.00014998622128136748, "loss": 0.4608, "step": 56000 }, { "epoch": 0.87, "learning_rate": 0.000149981245855159, "loss": 0.4566, "step": 57000 }, { "epoch": 0.89, "learning_rate": 0.00014997550512525784, "loss": 0.4523, "step": 58000 }, { "epoch": 0.9, "learning_rate": 0.0001499689991544437, "loss": 0.4483, "step": 59000 }, { "epoch": 0.92, "learning_rate": 0.00014996172801386482, "loss": 0.4447, "step": 60000 }, { "epoch": 0.92, "eval_runtime": 1.2288, "eval_samples_per_second": 813.826, "eval_steps_per_second": 13.021, "step": 60000 }, { "epoch": 0.93, "learning_rate": 0.00014995369178303722, "loss": 0.4408, "step": 61000 }, { "epoch": 0.95, "learning_rate": 0.0001499448905498439, "loss": 0.4381, "step": 62000 }, { "epoch": 0.96, "learning_rate": 0.00014993532441053364, "loss": 0.434, "step": 63000 }, { "epoch": 0.98, "learning_rate": 0.0001499249934697203, "loss": 0.4316, "step": 64000 }, { "epoch": 0.99, "learning_rate": 0.0001499138978403813, "loss": 0.4275, "step": 65000 }, { "epoch": 0.99, "eval_runtime": 1.0345, "eval_samples_per_second": 966.655, "eval_steps_per_second": 15.466, "step": 65000 }, { "epoch": 1.01, "learning_rate": 0.00014990203764385677, "loss": 0.425, "step": 66000 }, { "epoch": 1.02, "learning_rate": 0.00014988941300984784, "loss": 0.422, "step": 67000 }, { "epoch": 1.04, "learning_rate": 0.0001498760240764155, "loss": 0.4191, "step": 68000 }, { "epoch": 1.05, "learning_rate": 0.000149861870989979, "loss": 0.4164, "step": 69000 }, { "epoch": 1.07, "learning_rate": 0.0001498469539053142, "loss": 0.4138, "step": 70000 }, { "epoch": 1.07, "eval_runtime": 1.1341, "eval_samples_per_second": 881.784, "eval_steps_per_second": 14.109, "step": 70000 }, { "epoch": 1.08, "learning_rate": 0.00014983127298555198, "loss": 0.4114, "step": 71000 }, { "epoch": 1.1, "learning_rate": 0.00014981482840217632, "loss": 0.4086, "step": 72000 }, { "epoch": 1.11, "learning_rate": 0.00014979762033502262, "loss": 0.4066, "step": 73000 }, { "epoch": 1.13, "learning_rate": 0.00014977964897227547, "loss": 0.4042, "step": 74000 }, { "epoch": 1.15, "learning_rate": 0.00014976091451046687, "loss": 0.402, "step": 75000 }, { "epoch": 1.15, "eval_runtime": 1.0331, "eval_samples_per_second": 967.957, "eval_steps_per_second": 15.487, "step": 75000 }, { "epoch": 1.16, "learning_rate": 0.00014974141715447386, "loss": 0.3999, "step": 76000 }, { "epoch": 1.18, "learning_rate": 0.00014972115711751644, "loss": 0.398, "step": 77000 }, { "epoch": 1.19, "learning_rate": 0.00014970013462115505, "loss": 0.3971, "step": 78000 }, { "epoch": 1.21, "learning_rate": 0.00014967834989528843, "loss": 0.3942, "step": 79000 }, { "epoch": 1.22, "learning_rate": 0.00014965580317815078, "loss": 0.3926, "step": 80000 }, { "epoch": 1.22, "eval_runtime": 1.084, "eval_samples_per_second": 922.521, "eval_steps_per_second": 14.76, "step": 80000 }, { "epoch": 1.24, "learning_rate": 0.00014963249471630944, "loss": 0.3906, "step": 81000 }, { "epoch": 1.25, "learning_rate": 0.000149608424764662, "loss": 0.391, "step": 82000 }, { "epoch": 1.27, "learning_rate": 0.0001495835935864336, "loss": 0.3875, "step": 83000 }, { "epoch": 1.28, "learning_rate": 0.00014955800145317397, "loss": 0.3861, "step": 84000 }, { "epoch": 1.3, "learning_rate": 0.00014953164864475466, "loss": 0.3844, "step": 85000 }, { "epoch": 1.3, "eval_runtime": 1.0992, "eval_samples_per_second": 909.734, "eval_steps_per_second": 14.556, "step": 85000 }, { "epoch": 1.31, "learning_rate": 0.0001495045354493657, "loss": 0.3829, "step": 86000 }, { "epoch": 1.33, "learning_rate": 0.00014947666216351272, "loss": 0.3815, "step": 87000 }, { "epoch": 1.34, "learning_rate": 0.00014944802909201344, "loss": 0.38, "step": 88000 }, { "epoch": 1.36, "learning_rate": 0.00014941863654799456, "loss": 0.3789, "step": 89000 }, { "epoch": 1.37, "learning_rate": 0.00014938848485288825, "loss": 0.3785, "step": 90000 }, { "epoch": 1.37, "eval_runtime": 0.9266, "eval_samples_per_second": 1079.167, "eval_steps_per_second": 17.267, "step": 90000 }, { "epoch": 1.39, "learning_rate": 0.0001493575743364286, "loss": 0.3766, "step": 91000 }, { "epoch": 1.4, "learning_rate": 0.00014932590533664808, "loss": 0.3745, "step": 92000 }, { "epoch": 1.42, "learning_rate": 0.0001492934781998738, "loss": 0.3741, "step": 93000 }, { "epoch": 1.44, "learning_rate": 0.0001492602932807237, "loss": 0.3729, "step": 94000 }, { "epoch": 1.45, "learning_rate": 0.00014922635094210277, "loss": 0.3709, "step": 95000 }, { "epoch": 1.45, "eval_runtime": 0.9895, "eval_samples_per_second": 1010.579, "eval_steps_per_second": 16.169, "step": 95000 }, { "epoch": 1.47, "learning_rate": 0.000149191651555199, "loss": 0.3699, "step": 96000 }, { "epoch": 1.48, "learning_rate": 0.0001491561954994793, "loss": 0.3688, "step": 97000 }, { "epoch": 1.5, "learning_rate": 0.00014911998316268537, "loss": 0.3678, "step": 98000 }, { "epoch": 1.51, "learning_rate": 0.00014908301494082963, "loss": 0.3666, "step": 99000 }, { "epoch": 1.53, "learning_rate": 0.00014904529123819054, "loss": 0.3654, "step": 100000 }, { "epoch": 1.53, "eval_runtime": 1.0046, "eval_samples_per_second": 995.424, "eval_steps_per_second": 15.927, "step": 100000 }, { "epoch": 1.54, "learning_rate": 0.00014900681246730852, "loss": 0.3643, "step": 101000 }, { "epoch": 1.56, "learning_rate": 0.00014896757904898125, "loss": 0.3646, "step": 102000 }, { "epoch": 1.57, "learning_rate": 0.00014892759141225904, "loss": 0.3628, "step": 103000 }, { "epoch": 1.59, "learning_rate": 0.00014888684999444035, "loss": 0.3616, "step": 104000 }, { "epoch": 1.6, "learning_rate": 0.00014884535524106675, "loss": 0.3604, "step": 105000 }, { "epoch": 1.6, "eval_runtime": 1.0499, "eval_samples_per_second": 952.499, "eval_steps_per_second": 15.24, "step": 105000 }, { "epoch": 1.62, "learning_rate": 0.00014880310760591824, "loss": 0.3594, "step": 106000 }, { "epoch": 1.63, "learning_rate": 0.0001487601075510082, "loss": 0.3597, "step": 107000 }, { "epoch": 1.65, "learning_rate": 0.0001487163555465783, "loss": 0.3583, "step": 108000 }, { "epoch": 1.66, "learning_rate": 0.0001486718520710935, "loss": 0.3583, "step": 109000 }, { "epoch": 1.68, "learning_rate": 0.00014862659761123663, "loss": 0.3558, "step": 110000 }, { "epoch": 1.68, "eval_runtime": 1.0153, "eval_samples_per_second": 984.91, "eval_steps_per_second": 15.759, "step": 110000 }, { "epoch": 1.69, "learning_rate": 0.00014858059266190327, "loss": 0.3552, "step": 111000 }, { "epoch": 1.71, "learning_rate": 0.00014853383772619612, "loss": 0.3544, "step": 112000 }, { "epoch": 1.73, "learning_rate": 0.00014848633331541967, "loss": 0.3537, "step": 113000 }, { "epoch": 1.74, "learning_rate": 0.0001484380799490746, "loss": 0.3524, "step": 114000 }, { "epoch": 1.76, "learning_rate": 0.00014838907815485194, "loss": 0.3519, "step": 115000 }, { "epoch": 1.76, "eval_runtime": 1.003, "eval_samples_per_second": 997.001, "eval_steps_per_second": 15.952, "step": 115000 }, { "epoch": 1.77, "learning_rate": 0.00014833932846862748, "loss": 0.3511, "step": 116000 }, { "epoch": 1.79, "learning_rate": 0.00014828883143445582, "loss": 0.3502, "step": 117000 }, { "epoch": 1.8, "learning_rate": 0.0001482375876045644, "loss": 0.3493, "step": 118000 }, { "epoch": 1.82, "learning_rate": 0.0001481855975393476, "loss": 0.3489, "step": 119000 }, { "epoch": 1.83, "learning_rate": 0.0001481328618073604, "loss": 0.3482, "step": 120000 }, { "epoch": 1.83, "eval_runtime": 1.0596, "eval_samples_per_second": 943.744, "eval_steps_per_second": 15.1, "step": 120000 }, { "epoch": 1.85, "learning_rate": 0.0001480793809853123, "loss": 0.3478, "step": 121000 }, { "epoch": 1.86, "learning_rate": 0.00014802515565806107, "loss": 0.3468, "step": 122000 }, { "epoch": 1.88, "learning_rate": 0.00014797018641860612, "loss": 0.346, "step": 123000 }, { "epoch": 1.89, "learning_rate": 0.0001479144738680823, "loss": 0.3474, "step": 124000 }, { "epoch": 1.91, "learning_rate": 0.00014785801861575312, "loss": 0.3447, "step": 125000 }, { "epoch": 1.91, "eval_runtime": 0.9375, "eval_samples_per_second": 1066.699, "eval_steps_per_second": 17.067, "step": 125000 }, { "epoch": 1.92, "learning_rate": 0.00014780082127900416, "loss": 0.3439, "step": 126000 }, { "epoch": 1.94, "learning_rate": 0.00014774288248333635, "loss": 0.3436, "step": 127000 }, { "epoch": 1.95, "learning_rate": 0.00014768420286235908, "loss": 0.3429, "step": 128000 }, { "epoch": 1.97, "learning_rate": 0.00014762478305778328, "loss": 0.3422, "step": 129000 }, { "epoch": 1.99, "learning_rate": 0.0001475646237194144, "loss": 0.3414, "step": 130000 }, { "epoch": 1.99, "eval_runtime": 1.0085, "eval_samples_per_second": 991.553, "eval_steps_per_second": 15.865, "step": 130000 }, { "epoch": 2.0, "learning_rate": 0.00014750372550514533, "loss": 0.3409, "step": 131000 }, { "epoch": 2.02, "learning_rate": 0.0001474420890809492, "loss": 0.3401, "step": 132000 }, { "epoch": 2.03, "learning_rate": 0.00014737971512087202, "loss": 0.3396, "step": 133000 }, { "epoch": 2.05, "learning_rate": 0.00014731660430702552, "loss": 0.339, "step": 134000 }, { "epoch": 2.06, "learning_rate": 0.00014725275732957937, "loss": 0.3402, "step": 135000 }, { "epoch": 2.06, "eval_runtime": 1.1005, "eval_samples_per_second": 908.652, "eval_steps_per_second": 14.538, "step": 135000 }, { "epoch": 2.08, "learning_rate": 0.00014718817488675387, "loss": 0.3379, "step": 136000 }, { "epoch": 2.09, "learning_rate": 0.00014712285768481235, "loss": 0.3371, "step": 137000 }, { "epoch": 2.11, "learning_rate": 0.00014705680643805323, "loss": 0.3368, "step": 138000 }, { "epoch": 2.12, "learning_rate": 0.00014699002186880232, "loss": 0.3363, "step": 139000 }, { "epoch": 2.14, "learning_rate": 0.00014692250470740503, "loss": 0.3361, "step": 140000 }, { "epoch": 2.14, "eval_runtime": 1.0104, "eval_samples_per_second": 989.716, "eval_steps_per_second": 15.835, "step": 140000 }, { "epoch": 2.15, "learning_rate": 0.00014685425569221819, "loss": 0.3353, "step": 141000 }, { "epoch": 2.17, "learning_rate": 0.00014678527556960207, "loss": 0.3346, "step": 142000 }, { "epoch": 2.18, "learning_rate": 0.0001467155650939123, "loss": 0.3342, "step": 143000 }, { "epoch": 2.2, "learning_rate": 0.00014664512502749141, "loss": 0.3338, "step": 144000 }, { "epoch": 2.21, "learning_rate": 0.00014657395614066075, "loss": 0.3334, "step": 145000 }, { "epoch": 2.21, "eval_runtime": 1.0369, "eval_samples_per_second": 964.439, "eval_steps_per_second": 15.431, "step": 145000 }, { "epoch": 2.23, "learning_rate": 0.0001465020592117118, "loss": 0.3327, "step": 146000 }, { "epoch": 2.24, "learning_rate": 0.0001464294350268979, "loss": 0.3324, "step": 147000 }, { "epoch": 2.26, "learning_rate": 0.00014635608438042546, "loss": 0.3319, "step": 148000 }, { "epoch": 2.28, "learning_rate": 0.00014628200807444543, "loss": 0.3313, "step": 149000 }, { "epoch": 2.29, "learning_rate": 0.0001462072069190444, "loss": 0.3307, "step": 150000 }, { "epoch": 2.29, "eval_runtime": 1.0431, "eval_samples_per_second": 958.687, "eval_steps_per_second": 15.339, "step": 150000 }, { "epoch": 2.31, "learning_rate": 0.00014613168173223585, "loss": 0.3308, "step": 151000 }, { "epoch": 2.32, "learning_rate": 0.00014605543333995113, "loss": 0.3302, "step": 152000 }, { "epoch": 2.34, "learning_rate": 0.00014597846257603038, "loss": 0.3294, "step": 153000 }, { "epoch": 2.35, "learning_rate": 0.0001459007702822136, "loss": 0.329, "step": 154000 }, { "epoch": 2.37, "learning_rate": 0.00014582235730813128, "loss": 0.3283, "step": 155000 }, { "epoch": 2.37, "eval_runtime": 1.0629, "eval_samples_per_second": 940.817, "eval_steps_per_second": 15.053, "step": 155000 }, { "epoch": 2.38, "learning_rate": 0.00014574322451129507, "loss": 0.3281, "step": 156000 }, { "epoch": 2.4, "learning_rate": 0.00014566337275708863, "loss": 0.328, "step": 157000 }, { "epoch": 2.41, "learning_rate": 0.0001455828029187579, "loss": 0.3272, "step": 158000 }, { "epoch": 2.43, "learning_rate": 0.00014550151587740178, "loss": 0.3269, "step": 159000 }, { "epoch": 2.44, "learning_rate": 0.00014541951252196225, "loss": 0.3265, "step": 160000 }, { "epoch": 2.44, "eval_runtime": 1.0199, "eval_samples_per_second": 980.452, "eval_steps_per_second": 15.687, "step": 160000 }, { "epoch": 2.46, "learning_rate": 0.00014533679374921493, "loss": 0.3259, "step": 161000 }, { "epoch": 2.47, "learning_rate": 0.00014525336046375905, "loss": 0.3254, "step": 162000 }, { "epoch": 2.49, "learning_rate": 0.00014516921357800766, "loss": 0.3251, "step": 163000 }, { "epoch": 2.5, "learning_rate": 0.00014508435401217759, "loss": 0.3244, "step": 164000 }, { "epoch": 2.52, "learning_rate": 0.00014499878269427948, "loss": 0.3243, "step": 165000 }, { "epoch": 2.52, "eval_runtime": 1.0655, "eval_samples_per_second": 938.486, "eval_steps_per_second": 15.016, "step": 165000 }, { "epoch": 2.53, "learning_rate": 0.00014491250056010758, "loss": 0.3236, "step": 166000 }, { "epoch": 2.55, "learning_rate": 0.00014482550855322943, "loss": 0.3233, "step": 167000 }, { "epoch": 2.57, "learning_rate": 0.0001447378076249757, "loss": 0.3231, "step": 168000 }, { "epoch": 2.58, "learning_rate": 0.00014464939873442973, "loss": 0.3228, "step": 169000 }, { "epoch": 2.6, "learning_rate": 0.00014456028284841693, "loss": 0.3221, "step": 170000 }, { "epoch": 2.6, "eval_runtime": 1.1756, "eval_samples_per_second": 850.656, "eval_steps_per_second": 13.611, "step": 170000 }, { "epoch": 2.61, "learning_rate": 0.00014447046094149437, "loss": 0.3221, "step": 171000 }, { "epoch": 2.63, "learning_rate": 0.00014437993399594003, "loss": 0.3216, "step": 172000 }, { "epoch": 2.64, "learning_rate": 0.0001442887030017421, "loss": 0.3217, "step": 173000 }, { "epoch": 2.66, "learning_rate": 0.00014419676895658807, "loss": 0.3208, "step": 174000 }, { "epoch": 2.67, "learning_rate": 0.000144104132865854, "loss": 0.3207, "step": 175000 }, { "epoch": 2.67, "eval_runtime": 1.0679, "eval_samples_per_second": 936.423, "eval_steps_per_second": 14.983, "step": 175000 }, { "epoch": 2.69, "learning_rate": 0.0001440107957425933, "loss": 0.3203, "step": 176000 }, { "epoch": 2.7, "learning_rate": 0.0001439167586075258, "loss": 0.3201, "step": 177000 }, { "epoch": 2.72, "learning_rate": 0.0001438220224890265, "loss": 0.3191, "step": 178000 }, { "epoch": 2.73, "learning_rate": 0.00014372658842311449, "loss": 0.3195, "step": 179000 }, { "epoch": 2.75, "learning_rate": 0.00014363045745344137, "loss": 0.3191, "step": 180000 }, { "epoch": 2.75, "eval_runtime": 1.0169, "eval_samples_per_second": 983.42, "eval_steps_per_second": 15.735, "step": 180000 }, { "epoch": 2.76, "learning_rate": 0.00014353363063128005, "loss": 0.3183, "step": 181000 }, { "epoch": 2.78, "learning_rate": 0.0001434361090155131, "loss": 0.3177, "step": 182000 }, { "epoch": 2.79, "learning_rate": 0.00014333789367262136, "loss": 0.3178, "step": 183000 }, { "epoch": 2.81, "learning_rate": 0.00014323898567667202, "loss": 0.3177, "step": 184000 }, { "epoch": 2.82, "learning_rate": 0.00014313938610930712, "loss": 0.3171, "step": 185000 }, { "epoch": 2.82, "eval_runtime": 1.0441, "eval_samples_per_second": 957.721, "eval_steps_per_second": 15.324, "step": 185000 }, { "epoch": 2.84, "learning_rate": 0.00014303909605973154, "loss": 0.3167, "step": 186000 }, { "epoch": 2.86, "learning_rate": 0.0001429381166247012, "loss": 0.3168, "step": 187000 }, { "epoch": 2.87, "learning_rate": 0.00014283644890851103, "loss": 0.3164, "step": 188000 }, { "epoch": 2.89, "learning_rate": 0.00014273409402298291, "loss": 0.3161, "step": 189000 }, { "epoch": 2.9, "learning_rate": 0.00014263105308745343, "loss": 0.3155, "step": 190000 }, { "epoch": 2.9, "eval_runtime": 1.0119, "eval_samples_per_second": 988.212, "eval_steps_per_second": 15.811, "step": 190000 }, { "epoch": 2.92, "learning_rate": 0.00014252732722876176, "loss": 0.3149, "step": 191000 }, { "epoch": 2.93, "learning_rate": 0.0001424229175812373, "loss": 0.3149, "step": 192000 }, { "epoch": 2.95, "learning_rate": 0.00014231782528668717, "loss": 0.3146, "step": 193000 }, { "epoch": 2.96, "learning_rate": 0.00014221205149438394, "loss": 0.3145, "step": 194000 }, { "epoch": 2.98, "learning_rate": 0.0001421055973610528, "loss": 0.3138, "step": 195000 }, { "epoch": 2.98, "eval_runtime": 1.0908, "eval_samples_per_second": 916.734, "eval_steps_per_second": 14.668, "step": 195000 }, { "epoch": 2.99, "learning_rate": 0.00014199846405085913, "loss": 0.3137, "step": 196000 }, { "epoch": 3.01, "learning_rate": 0.00014189065273539564, "loss": 0.3135, "step": 197000 }, { "epoch": 3.02, "learning_rate": 0.00014178216459366958, "loss": 0.3137, "step": 198000 }, { "epoch": 3.04, "learning_rate": 0.00014167300081208988, "loss": 0.3131, "step": 199000 }, { "epoch": 3.05, "learning_rate": 0.00014156316258445421, "loss": 0.3125, "step": 200000 }, { "epoch": 3.05, "eval_runtime": 1.1346, "eval_samples_per_second": 881.333, "eval_steps_per_second": 14.101, "step": 200000 }, { "epoch": 3.07, "learning_rate": 0.00014145265111193583, "loss": 0.3121, "step": 201000 }, { "epoch": 3.08, "learning_rate": 0.00014134146760307043, "loss": 0.3122, "step": 202000 }, { "epoch": 3.1, "learning_rate": 0.00014122961327374313, "loss": 0.3131, "step": 203000 }, { "epoch": 3.12, "learning_rate": 0.0001411170893471749, "loss": 0.3116, "step": 204000 }, { "epoch": 3.13, "learning_rate": 0.00014100389705390938, "loss": 0.311, "step": 205000 }, { "epoch": 3.13, "eval_runtime": 1.1239, "eval_samples_per_second": 889.731, "eval_steps_per_second": 14.236, "step": 205000 }, { "epoch": 3.15, "learning_rate": 0.0001408900376317994, "loss": 0.311, "step": 206000 }, { "epoch": 3.16, "learning_rate": 0.0001407755123259933, "loss": 0.3108, "step": 207000 }, { "epoch": 3.18, "learning_rate": 0.00014066032238892152, "loss": 0.3104, "step": 208000 }, { "epoch": 3.19, "learning_rate": 0.00014054446908028272, "loss": 0.3102, "step": 209000 }, { "epoch": 3.21, "learning_rate": 0.00014042795366703018, "loss": 0.3097, "step": 210000 }, { "epoch": 3.21, "eval_runtime": 1.0233, "eval_samples_per_second": 977.233, "eval_steps_per_second": 15.636, "step": 210000 }, { "epoch": 3.22, "learning_rate": 0.0001403107774233577, "loss": 0.3098, "step": 211000 }, { "epoch": 3.24, "learning_rate": 0.00014019294163068597, "loss": 0.3093, "step": 212000 }, { "epoch": 3.25, "learning_rate": 0.00014007444757764835, "loss": 0.3093, "step": 213000 }, { "epoch": 3.27, "learning_rate": 0.0001399552965600768, "loss": 0.3088, "step": 214000 }, { "epoch": 3.28, "learning_rate": 0.0001398354898809877, "loss": 0.3089, "step": 215000 }, { "epoch": 3.28, "eval_runtime": 1.0098, "eval_samples_per_second": 990.287, "eval_steps_per_second": 15.845, "step": 215000 }, { "epoch": 3.3, "learning_rate": 0.0001397150288505678, "loss": 0.3315, "step": 216000 }, { "epoch": 3.31, "learning_rate": 0.00013959391478615959, "loss": 0.628, "step": 217000 }, { "epoch": 3.33, "learning_rate": 0.00013947214901224706, "loss": 0.3112, "step": 218000 }, { "epoch": 3.34, "learning_rate": 0.0001393497328604412, "loss": 0.3094, "step": 219000 }, { "epoch": 3.36, "learning_rate": 0.00013922666766946545, "loss": 0.3082, "step": 220000 }, { "epoch": 3.36, "eval_runtime": 1.0751, "eval_samples_per_second": 930.139, "eval_steps_per_second": 14.882, "step": 220000 }, { "epoch": 3.37, "learning_rate": 0.00013910295478514106, "loss": 0.3079, "step": 221000 }, { "epoch": 3.39, "learning_rate": 0.0001389785955603722, "loss": 0.3077, "step": 222000 }, { "epoch": 3.41, "learning_rate": 0.00013885359135513154, "loss": 0.3073, "step": 223000 }, { "epoch": 3.42, "learning_rate": 0.000138727943536445, "loss": 0.3064, "step": 224000 }, { "epoch": 3.44, "learning_rate": 0.00013860165347837698, "loss": 0.3066, "step": 225000 }, { "epoch": 3.44, "eval_runtime": 1.0901, "eval_samples_per_second": 917.309, "eval_steps_per_second": 14.677, "step": 225000 }, { "epoch": 3.45, "learning_rate": 0.00013847472256201535, "loss": 0.306, "step": 226000 }, { "epoch": 3.47, "learning_rate": 0.00013834715217545625, "loss": 0.3058, "step": 227000 }, { "epoch": 3.48, "learning_rate": 0.000138218943713789, "loss": 0.3056, "step": 228000 }, { "epoch": 3.5, "learning_rate": 0.0001380900985790808, "loss": 0.3054, "step": 229000 }, { "epoch": 3.51, "learning_rate": 0.00013796061818036138, "loss": 0.3051, "step": 230000 }, { "epoch": 3.51, "eval_runtime": 1.0217, "eval_samples_per_second": 978.715, "eval_steps_per_second": 15.659, "step": 230000 }, { "epoch": 3.53, "learning_rate": 0.00013783050393360768, "loss": 0.3048, "step": 231000 }, { "epoch": 3.54, "learning_rate": 0.0001376997572617282, "loss": 0.305, "step": 232000 }, { "epoch": 3.56, "learning_rate": 0.00013756837959454766, "loss": 0.3042, "step": 233000 }, { "epoch": 3.57, "learning_rate": 0.0001374363723687911, "loss": 0.3042, "step": 234000 }, { "epoch": 3.59, "learning_rate": 0.00013730373702806846, "loss": 0.304, "step": 235000 }, { "epoch": 3.59, "eval_runtime": 1.0392, "eval_samples_per_second": 962.319, "eval_steps_per_second": 15.397, "step": 235000 }, { "epoch": 3.6, "learning_rate": 0.00013717047502285855, "loss": 0.3036, "step": 236000 }, { "epoch": 3.62, "learning_rate": 0.0001370365878104933, "loss": 0.3036, "step": 237000 }, { "epoch": 3.63, "learning_rate": 0.00013690207685514185, "loss": 0.3031, "step": 238000 }, { "epoch": 3.65, "learning_rate": 0.0001367669436277944, "loss": 0.3032, "step": 239000 }, { "epoch": 3.66, "learning_rate": 0.0001366311896062463, "loss": 0.3036, "step": 240000 }, { "epoch": 3.66, "eval_runtime": 1.0097, "eval_samples_per_second": 990.396, "eval_steps_per_second": 15.846, "step": 240000 }, { "epoch": 3.68, "learning_rate": 0.00013649481627508181, "loss": 0.3031, "step": 241000 }, { "epoch": 3.7, "learning_rate": 0.0001363578251256578, "loss": 0.3023, "step": 242000 }, { "epoch": 3.71, "learning_rate": 0.00013622021765608754, "loss": 0.3022, "step": 243000 }, { "epoch": 3.73, "learning_rate": 0.00013608199537122425, "loss": 0.3017, "step": 244000 }, { "epoch": 3.74, "learning_rate": 0.0001359431597826447, "loss": 0.3019, "step": 245000 }, { "epoch": 3.74, "eval_runtime": 1.0744, "eval_samples_per_second": 930.717, "eval_steps_per_second": 14.891, "step": 245000 }, { "epoch": 3.76, "learning_rate": 0.0001358037124086327, "loss": 0.3015, "step": 246000 }, { "epoch": 3.77, "learning_rate": 0.00013566365477416233, "loss": 0.3018, "step": 247000 }, { "epoch": 3.79, "learning_rate": 0.00013552298841088144, "loss": 0.3013, "step": 248000 }, { "epoch": 3.8, "learning_rate": 0.00013538171485709486, "loss": 0.3006, "step": 249000 }, { "epoch": 3.82, "learning_rate": 0.00013523983565774753, "loss": 0.3008, "step": 250000 }, { "epoch": 3.82, "eval_runtime": 1.0168, "eval_samples_per_second": 983.434, "eval_steps_per_second": 15.735, "step": 250000 }, { "epoch": 3.83, "learning_rate": 0.00013509735236440766, "loss": 0.3003, "step": 251000 }, { "epoch": 3.85, "learning_rate": 0.00013495426653524972, "loss": 0.3, "step": 252000 }, { "epoch": 3.86, "learning_rate": 0.00013481057973503742, "loss": 0.3, "step": 253000 }, { "epoch": 3.88, "learning_rate": 0.00013466629353510651, "loss": 0.2997, "step": 254000 }, { "epoch": 3.89, "learning_rate": 0.00013452140951334787, "loss": 0.2995, "step": 255000 }, { "epoch": 3.89, "eval_runtime": 0.8192, "eval_samples_per_second": 1220.744, "eval_steps_per_second": 19.532, "step": 255000 }, { "epoch": 3.91, "learning_rate": 0.00013437592925418985, "loss": 0.2996, "step": 256000 }, { "epoch": 3.92, "learning_rate": 0.00013422985434858133, "loss": 0.299, "step": 257000 }, { "epoch": 3.94, "learning_rate": 0.00013408318639397405, "loss": 0.2987, "step": 258000 }, { "epoch": 3.95, "learning_rate": 0.00013393592699430525, "loss": 0.2986, "step": 259000 }, { "epoch": 3.97, "learning_rate": 0.00013378807775998012, "loss": 0.2984, "step": 260000 }, { "epoch": 3.97, "eval_runtime": 1.0461, "eval_samples_per_second": 955.963, "eval_steps_per_second": 15.295, "step": 260000 }, { "epoch": 3.99, "learning_rate": 0.00013363964030785422, "loss": 0.2983, "step": 261000 }, { "epoch": 4.0, "learning_rate": 0.00013349061626121578, "loss": 0.2982, "step": 262000 }, { "epoch": 4.02, "learning_rate": 0.00013334100724976783, "loss": 0.2977, "step": 263000 }, { "epoch": 4.03, "learning_rate": 0.0001331908149096106, "loss": 0.2976, "step": 264000 }, { "epoch": 4.05, "learning_rate": 0.00013304004088322342, "loss": 0.2978, "step": 265000 }, { "epoch": 4.05, "eval_runtime": 1.0225, "eval_samples_per_second": 978.001, "eval_steps_per_second": 15.648, "step": 265000 }, { "epoch": 4.06, "learning_rate": 0.00013288868681944692, "loss": 0.2971, "step": 266000 }, { "epoch": 4.08, "learning_rate": 0.00013273675437346487, "loss": 0.2972, "step": 267000 }, { "epoch": 4.09, "learning_rate": 0.00013258424520678618, "loss": 0.2969, "step": 268000 }, { "epoch": 4.11, "learning_rate": 0.00013243116098722663, "loss": 0.2968, "step": 269000 }, { "epoch": 4.12, "learning_rate": 0.00013227750338889077, "loss": 0.2966, "step": 270000 }, { "epoch": 4.12, "eval_runtime": 1.1084, "eval_samples_per_second": 902.192, "eval_steps_per_second": 14.435, "step": 270000 }, { "epoch": 4.14, "learning_rate": 0.00013212327409215343, "loss": 0.296, "step": 271000 }, { "epoch": 4.15, "learning_rate": 0.0001319684747836415, "loss": 0.2958, "step": 272000 }, { "epoch": 4.17, "learning_rate": 0.0001318131071562154, "loss": 0.2961, "step": 273000 }, { "epoch": 4.18, "learning_rate": 0.00013165717290895067, "loss": 0.2957, "step": 274000 }, { "epoch": 4.2, "learning_rate": 0.0001315006737471192, "loss": 0.2955, "step": 275000 }, { "epoch": 4.2, "eval_runtime": 1.0552, "eval_samples_per_second": 947.654, "eval_steps_per_second": 15.162, "step": 275000 }, { "epoch": 4.21, "learning_rate": 0.0001313436113821708, "loss": 0.2952, "step": 276000 }, { "epoch": 4.23, "learning_rate": 0.00013118598753171425, "loss": 0.2951, "step": 277000 }, { "epoch": 4.25, "learning_rate": 0.0001310278039194988, "loss": 0.2951, "step": 278000 }, { "epoch": 4.26, "learning_rate": 0.00013086906227539506, "loss": 0.2952, "step": 279000 }, { "epoch": 4.28, "learning_rate": 0.00013070976433537623, "loss": 0.2946, "step": 280000 }, { "epoch": 4.28, "eval_runtime": 1.0293, "eval_samples_per_second": 971.532, "eval_steps_per_second": 15.545, "step": 280000 }, { "epoch": 4.29, "learning_rate": 0.00013054991184149905, "loss": 0.2946, "step": 281000 }, { "epoch": 4.31, "learning_rate": 0.00013038950654188476, "loss": 0.2942, "step": 282000 }, { "epoch": 4.32, "learning_rate": 0.00013022855019070005, "loss": 0.2941, "step": 283000 }, { "epoch": 4.34, "learning_rate": 0.0001300670445481378, "loss": 0.2937, "step": 284000 }, { "epoch": 4.35, "learning_rate": 0.0001299049913803978, "loss": 0.2937, "step": 285000 }, { "epoch": 4.35, "eval_runtime": 1.0469, "eval_samples_per_second": 955.197, "eval_steps_per_second": 15.283, "step": 285000 }, { "epoch": 4.37, "learning_rate": 0.00012974239245966754, "loss": 0.2934, "step": 286000 }, { "epoch": 4.38, "learning_rate": 0.0001295792495641028, "loss": 0.2962, "step": 287000 }, { "epoch": 4.4, "learning_rate": 0.00012941556447780813, "loss": 0.2931, "step": 288000 }, { "epoch": 4.41, "learning_rate": 0.0001292513389908174, "loss": 0.2931, "step": 289000 }, { "epoch": 4.43, "learning_rate": 0.0001290865748990742, "loss": 0.2932, "step": 290000 }, { "epoch": 4.43, "eval_runtime": 1.0143, "eval_samples_per_second": 985.898, "eval_steps_per_second": 15.774, "step": 290000 }, { "epoch": 4.44, "learning_rate": 0.00012892127400441228, "loss": 0.2923, "step": 291000 }, { "epoch": 4.46, "learning_rate": 0.00012875543811453576, "loss": 0.2919, "step": 292000 }, { "epoch": 4.47, "learning_rate": 0.0001285890690429993, "loss": 0.2931, "step": 293000 }, { "epoch": 4.49, "learning_rate": 0.00012842216860918846, "loss": 0.292, "step": 294000 }, { "epoch": 4.5, "learning_rate": 0.0001282547386382996, "loss": 0.2914, "step": 295000 }, { "epoch": 4.5, "eval_runtime": 1.0329, "eval_samples_per_second": 968.135, "eval_steps_per_second": 15.49, "step": 295000 }, { "epoch": 4.52, "learning_rate": 0.0001280867809613201, "loss": 0.2919, "step": 296000 }, { "epoch": 4.54, "learning_rate": 0.0001279182974150082, "loss": 0.2915, "step": 297000 }, { "epoch": 4.55, "learning_rate": 0.00012774928984187297, "loss": 0.2914, "step": 298000 }, { "epoch": 4.57, "learning_rate": 0.00012757976009015413, "loss": 0.2908, "step": 299000 }, { "epoch": 4.58, "learning_rate": 0.0001274097100138019, "loss": 0.2909, "step": 300000 }, { "epoch": 4.58, "eval_runtime": 1.0054, "eval_samples_per_second": 994.612, "eval_steps_per_second": 15.914, "step": 300000 }, { "epoch": 4.6, "learning_rate": 0.00012723914147245663, "loss": 0.2906, "step": 301000 }, { "epoch": 4.61, "learning_rate": 0.00012706805633142863, "loss": 0.2906, "step": 302000 }, { "epoch": 4.63, "learning_rate": 0.00012689645646167755, "loss": 0.2902, "step": 303000 }, { "epoch": 4.64, "learning_rate": 0.00012672434373979207, "loss": 0.291, "step": 304000 }, { "epoch": 4.66, "learning_rate": 0.00012655172004796936, "loss": 0.2899, "step": 305000 }, { "epoch": 4.66, "eval_runtime": 1.0975, "eval_samples_per_second": 911.158, "eval_steps_per_second": 14.579, "step": 305000 }, { "epoch": 4.67, "learning_rate": 0.00012637858727399448, "loss": 0.2898, "step": 306000 }, { "epoch": 4.69, "learning_rate": 0.00012620494731121966, "loss": 0.2896, "step": 307000 }, { "epoch": 4.7, "learning_rate": 0.00012603080205854372, "loss": 0.2894, "step": 308000 }, { "epoch": 4.72, "learning_rate": 0.00012585615342039126, "loss": 0.2894, "step": 309000 }, { "epoch": 4.73, "learning_rate": 0.0001256810033066918, "loss": 0.2894, "step": 310000 }, { "epoch": 4.73, "eval_runtime": 1.0481, "eval_samples_per_second": 954.11, "eval_steps_per_second": 15.266, "step": 310000 }, { "epoch": 4.75, "learning_rate": 0.0001255053536328589, "loss": 0.2887, "step": 311000 }, { "epoch": 4.76, "learning_rate": 0.0001253292063197693, "loss": 0.2887, "step": 312000 }, { "epoch": 4.78, "learning_rate": 0.0001251525632937418, "loss": 0.2886, "step": 313000 }, { "epoch": 4.79, "learning_rate": 0.00012497542648651615, "loss": 0.2887, "step": 314000 }, { "epoch": 4.81, "learning_rate": 0.00012479779783523216, "loss": 0.2883, "step": 315000 }, { "epoch": 4.81, "eval_runtime": 1.0333, "eval_samples_per_second": 967.804, "eval_steps_per_second": 15.485, "step": 315000 }, { "epoch": 4.83, "learning_rate": 0.00012461967928240828, "loss": 0.2883, "step": 316000 }, { "epoch": 4.84, "learning_rate": 0.00012444107277592047, "loss": 0.2877, "step": 317000 }, { "epoch": 4.86, "learning_rate": 0.0001242619802689809, "loss": 0.2879, "step": 318000 }, { "epoch": 4.87, "learning_rate": 0.00012408240372011647, "loss": 0.2876, "step": 319000 }, { "epoch": 4.89, "learning_rate": 0.0001239023450931476, "loss": 0.2874, "step": 320000 }, { "epoch": 4.89, "eval_runtime": 1.04, "eval_samples_per_second": 961.537, "eval_steps_per_second": 15.385, "step": 320000 }, { "epoch": 4.9, "learning_rate": 0.00012372180635716656, "loss": 0.2874, "step": 321000 }, { "epoch": 4.92, "learning_rate": 0.00012354078948651604, "loss": 0.2873, "step": 322000 }, { "epoch": 4.93, "learning_rate": 0.00012335929646076758, "loss": 0.2868, "step": 323000 }, { "epoch": 4.95, "learning_rate": 0.00012317732926469976, "loss": 0.2871, "step": 324000 }, { "epoch": 4.96, "learning_rate": 0.00012299488988827675, "loss": 0.2869, "step": 325000 }, { "epoch": 4.96, "eval_runtime": 1.3977, "eval_samples_per_second": 715.452, "eval_steps_per_second": 11.447, "step": 325000 }, { "epoch": 4.98, "learning_rate": 0.0001228119803266263, "loss": 0.2867, "step": 326000 }, { "epoch": 4.99, "learning_rate": 0.0001226286025800181, "loss": 0.2866, "step": 327000 }, { "epoch": 5.01, "learning_rate": 0.00012244475865384177, "loss": 0.2862, "step": 328000 }, { "epoch": 5.02, "learning_rate": 0.00012226045055858505, "loss": 0.2858, "step": 329000 }, { "epoch": 5.04, "learning_rate": 0.00012207568030981174, "loss": 0.2859, "step": 330000 }, { "epoch": 5.04, "eval_runtime": 1.1314, "eval_samples_per_second": 883.862, "eval_steps_per_second": 14.142, "step": 330000 }, { "epoch": 5.05, "learning_rate": 0.00012189044992813972, "loss": 0.2858, "step": 331000 }, { "epoch": 5.07, "learning_rate": 0.0001217047614392187, "loss": 0.2857, "step": 332000 }, { "epoch": 5.08, "learning_rate": 0.00012151861687370828, "loss": 0.2857, "step": 333000 }, { "epoch": 5.1, "learning_rate": 0.00012133201826725558, "loss": 0.2852, "step": 334000 }, { "epoch": 5.12, "learning_rate": 0.0001211449676604731, "loss": 0.2853, "step": 335000 }, { "epoch": 5.12, "eval_runtime": 1.3419, "eval_samples_per_second": 745.216, "eval_steps_per_second": 11.923, "step": 335000 }, { "epoch": 5.13, "learning_rate": 0.00012095746709891632, "loss": 0.2852, "step": 336000 }, { "epoch": 5.15, "learning_rate": 0.00012076951863306127, "loss": 0.285, "step": 337000 }, { "epoch": 5.16, "learning_rate": 0.0001205811243182823, "loss": 0.2848, "step": 338000 }, { "epoch": 5.18, "learning_rate": 0.00012039228621482949, "loss": 0.2858, "step": 339000 }, { "epoch": 5.19, "learning_rate": 0.00012020300638780604, "loss": 0.2845, "step": 340000 }, { "epoch": 5.19, "eval_runtime": 1.2559, "eval_samples_per_second": 796.26, "eval_steps_per_second": 12.74, "step": 340000 }, { "epoch": 5.21, "learning_rate": 0.00012001328690714582, "loss": 0.284, "step": 341000 }, { "epoch": 5.22, "learning_rate": 0.00011982312984759068, "loss": 0.2845, "step": 342000 }, { "epoch": 5.24, "learning_rate": 0.00011963253728866778, "loss": 0.2841, "step": 343000 }, { "epoch": 5.25, "learning_rate": 0.00011944151131466675, "loss": 0.284, "step": 344000 }, { "epoch": 5.27, "learning_rate": 0.00011925005401461709, "loss": 0.2836, "step": 345000 }, { "epoch": 5.27, "eval_runtime": 1.1037, "eval_samples_per_second": 906.031, "eval_steps_per_second": 14.496, "step": 345000 }, { "epoch": 5.28, "learning_rate": 0.00011905816748226513, "loss": 0.2834, "step": 346000 }, { "epoch": 5.3, "learning_rate": 0.00011886585381605125, "loss": 0.2835, "step": 347000 }, { "epoch": 5.31, "learning_rate": 0.00011867311511908693, "loss": 0.2832, "step": 348000 }, { "epoch": 5.33, "learning_rate": 0.00011847995349913162, "loss": 0.2828, "step": 349000 }, { "epoch": 5.34, "learning_rate": 0.00011828637106856989, "loss": 0.2828, "step": 350000 }, { "epoch": 5.34, "eval_runtime": 1.0295, "eval_samples_per_second": 971.32, "eval_steps_per_second": 15.541, "step": 350000 }, { "epoch": 5.36, "learning_rate": 0.00011809236994438816, "loss": 0.2831, "step": 351000 }, { "epoch": 5.38, "learning_rate": 0.00011789795224815164, "loss": 0.2827, "step": 352000 }, { "epoch": 5.39, "learning_rate": 0.00011770312010598116, "loss": 0.282, "step": 353000 }, { "epoch": 5.41, "learning_rate": 0.00011750787564852973, "loss": 0.2822, "step": 354000 }, { "epoch": 5.42, "learning_rate": 0.00011731222101095955, "loss": 0.2825, "step": 355000 }, { "epoch": 5.42, "eval_runtime": 1.0697, "eval_samples_per_second": 934.885, "eval_steps_per_second": 14.958, "step": 355000 }, { "epoch": 5.44, "learning_rate": 0.00011711615833291833, "loss": 0.2822, "step": 356000 }, { "epoch": 5.45, "learning_rate": 0.0001169196897585161, "loss": 0.2824, "step": 357000 }, { "epoch": 5.47, "learning_rate": 0.00011672281743630175, "loss": 0.2818, "step": 358000 }, { "epoch": 5.48, "learning_rate": 0.0001165255435192394, "loss": 0.2815, "step": 359000 }, { "epoch": 5.5, "learning_rate": 0.00011632787016468506, "loss": 0.2819, "step": 360000 }, { "epoch": 5.5, "eval_runtime": 1.1008, "eval_samples_per_second": 908.433, "eval_steps_per_second": 14.535, "step": 360000 }, { "epoch": 5.51, "learning_rate": 0.0001161297995343628, "loss": 0.2815, "step": 361000 }, { "epoch": 5.53, "learning_rate": 0.00011593133379434138, "loss": 0.2815, "step": 362000 }, { "epoch": 5.54, "learning_rate": 0.00011573247511501028, "loss": 0.2811, "step": 363000 }, { "epoch": 5.56, "learning_rate": 0.00011553322567105619, "loss": 0.2807, "step": 364000 }, { "epoch": 5.57, "learning_rate": 0.00011533358764143905, "loss": 0.2808, "step": 365000 }, { "epoch": 5.57, "eval_runtime": 1.1301, "eval_samples_per_second": 884.842, "eval_steps_per_second": 14.157, "step": 365000 }, { "epoch": 5.59, "learning_rate": 0.00011513356320936841, "loss": 0.2808, "step": 366000 }, { "epoch": 5.6, "learning_rate": 0.00011493315456227943, "loss": 0.2817, "step": 367000 }, { "epoch": 5.62, "learning_rate": 0.00011473236389180894, "loss": 0.2803, "step": 368000 }, { "epoch": 5.63, "learning_rate": 0.00011453119339377154, "loss": 0.2803, "step": 369000 }, { "epoch": 5.65, "learning_rate": 0.00011432964526813558, "loss": 0.2817, "step": 370000 }, { "epoch": 5.65, "eval_runtime": 1.2187, "eval_samples_per_second": 820.56, "eval_steps_per_second": 13.129, "step": 370000 }, { "epoch": 5.67, "learning_rate": 0.00011412772171899904, "loss": 0.2819, "step": 371000 }, { "epoch": 5.68, "learning_rate": 0.00011392542495456556, "loss": 0.28, "step": 372000 }, { "epoch": 5.7, "learning_rate": 0.00011372275718712006, "loss": 0.2797, "step": 373000 }, { "epoch": 5.71, "learning_rate": 0.00011351972063300484, "loss": 0.2797, "step": 374000 }, { "epoch": 5.73, "learning_rate": 0.00011331631751259515, "loss": 0.2801, "step": 375000 }, { "epoch": 5.73, "eval_runtime": 1.0146, "eval_samples_per_second": 985.631, "eval_steps_per_second": 15.77, "step": 375000 }, { "epoch": 5.74, "learning_rate": 0.00011311255005027487, "loss": 0.2789, "step": 376000 }, { "epoch": 5.76, "learning_rate": 0.00011290842047441232, "loss": 0.2791, "step": 377000 }, { "epoch": 5.77, "learning_rate": 0.00011270393101733585, "loss": 0.279, "step": 378000 }, { "epoch": 5.79, "learning_rate": 0.00011249908391530946, "loss": 0.279, "step": 379000 }, { "epoch": 5.8, "learning_rate": 0.00011229388140850814, "loss": 0.279, "step": 380000 }, { "epoch": 5.8, "eval_runtime": 1.2375, "eval_samples_per_second": 808.112, "eval_steps_per_second": 12.93, "step": 380000 }, { "epoch": 5.82, "learning_rate": 0.00011208832574099368, "loss": 0.2788, "step": 381000 }, { "epoch": 5.83, "learning_rate": 0.00011188241916068993, "loss": 0.2785, "step": 382000 }, { "epoch": 5.85, "learning_rate": 0.00011167616391935826, "loss": 0.2783, "step": 383000 }, { "epoch": 5.86, "learning_rate": 0.00011146956227257293, "loss": 0.2785, "step": 384000 }, { "epoch": 5.88, "learning_rate": 0.00011126261647969645, "loss": 0.2781, "step": 385000 }, { "epoch": 5.88, "eval_runtime": 1.0191, "eval_samples_per_second": 981.273, "eval_steps_per_second": 15.7, "step": 385000 }, { "epoch": 5.89, "learning_rate": 0.00011105532880385487, "loss": 0.2782, "step": 386000 }, { "epoch": 5.91, "learning_rate": 0.00011084770151191299, "loss": 0.2782, "step": 387000 }, { "epoch": 5.92, "learning_rate": 0.00011063973687444962, "loss": 0.2779, "step": 388000 }, { "epoch": 5.94, "learning_rate": 0.00011043143716573272, "loss": 0.2774, "step": 389000 }, { "epoch": 5.96, "learning_rate": 0.00011022280466369448, "loss": 0.2776, "step": 390000 }, { "epoch": 5.96, "eval_runtime": 1.0236, "eval_samples_per_second": 976.954, "eval_steps_per_second": 15.631, "step": 390000 }, { "epoch": 5.97, "learning_rate": 0.00011001384164990662, "loss": 0.2775, "step": 391000 }, { "epoch": 5.99, "learning_rate": 0.00010980455040955506, "loss": 0.2769, "step": 392000 }, { "epoch": 6.0, "learning_rate": 0.00010959493323141538, "loss": 0.2773, "step": 393000 }, { "epoch": 6.02, "learning_rate": 0.00010938499240782739, "loss": 0.277, "step": 394000 }, { "epoch": 6.03, "learning_rate": 0.00010917473023467032, "loss": 0.277, "step": 395000 }, { "epoch": 6.03, "eval_runtime": 1.0769, "eval_samples_per_second": 928.59, "eval_steps_per_second": 14.857, "step": 395000 }, { "epoch": 6.05, "learning_rate": 0.00010896414901133761, "loss": 0.2766, "step": 396000 }, { "epoch": 6.06, "learning_rate": 0.00010875325104071177, "loss": 0.2768, "step": 397000 }, { "epoch": 6.08, "learning_rate": 0.00010854203862913927, "loss": 0.2765, "step": 398000 }, { "epoch": 6.09, "learning_rate": 0.00010833051408640509, "loss": 0.2763, "step": 399000 }, { "epoch": 6.11, "learning_rate": 0.00010811867972570786, "loss": 0.2767, "step": 400000 }, { "epoch": 6.11, "eval_runtime": 1.1081, "eval_samples_per_second": 902.417, "eval_steps_per_second": 14.439, "step": 400000 }, { "epoch": 6.12, "learning_rate": 0.00010790653786363416, "loss": 0.2759, "step": 401000 }, { "epoch": 6.14, "learning_rate": 0.00010769409082013337, "loss": 0.2759, "step": 402000 }, { "epoch": 6.15, "learning_rate": 0.00010748134091849238, "loss": 0.2757, "step": 403000 }, { "epoch": 6.17, "learning_rate": 0.00010726829048531, "loss": 0.2762, "step": 404000 }, { "epoch": 6.18, "learning_rate": 0.00010705494185047165, "loss": 0.276, "step": 405000 }, { "epoch": 6.18, "eval_runtime": 1.1676, "eval_samples_per_second": 856.476, "eval_steps_per_second": 13.704, "step": 405000 }, { "epoch": 6.2, "learning_rate": 0.0001068412973471238, "loss": 0.2754, "step": 406000 }, { "epoch": 6.21, "learning_rate": 0.00010662735931164853, "loss": 0.2755, "step": 407000 }, { "epoch": 6.23, "learning_rate": 0.0001064131300836379, "loss": 0.2752, "step": 408000 }, { "epoch": 6.25, "learning_rate": 0.0001061986120058684, "loss": 0.2748, "step": 409000 }, { "epoch": 6.26, "learning_rate": 0.00010598380742427543, "loss": 0.2749, "step": 410000 }, { "epoch": 6.26, "eval_runtime": 1.0797, "eval_samples_per_second": 926.22, "eval_steps_per_second": 14.82, "step": 410000 }, { "epoch": 6.28, "learning_rate": 0.00010576871868792746, "loss": 0.275, "step": 411000 }, { "epoch": 6.29, "learning_rate": 0.0001055533481490004, "loss": 0.2746, "step": 412000 }, { "epoch": 6.31, "learning_rate": 0.000105337698162752, "loss": 0.2741, "step": 413000 }, { "epoch": 6.32, "learning_rate": 0.00010512177108749594, "loss": 0.2746, "step": 414000 }, { "epoch": 6.34, "learning_rate": 0.00010490556928457616, "loss": 0.2743, "step": 415000 }, { "epoch": 6.34, "eval_runtime": 1.0107, "eval_samples_per_second": 989.389, "eval_steps_per_second": 15.83, "step": 415000 }, { "epoch": 6.35, "learning_rate": 0.00010468909511834088, "loss": 0.2741, "step": 416000 }, { "epoch": 6.37, "learning_rate": 0.00010447235095611692, "loss": 0.2738, "step": 417000 }, { "epoch": 6.38, "learning_rate": 0.00010425533916818376, "loss": 0.2738, "step": 418000 }, { "epoch": 6.4, "learning_rate": 0.00010403806212774747, "loss": 0.2742, "step": 419000 }, { "epoch": 6.41, "learning_rate": 0.000103820522210915, "loss": 0.2737, "step": 420000 }, { "epoch": 6.41, "eval_runtime": 1.055, "eval_samples_per_second": 947.861, "eval_steps_per_second": 15.166, "step": 420000 }, { "epoch": 6.43, "learning_rate": 0.00010360272179666802, "loss": 0.2742, "step": 421000 }, { "epoch": 6.44, "learning_rate": 0.00010338466326683697, "loss": 0.2733, "step": 422000 }, { "epoch": 6.46, "learning_rate": 0.00010316634900607497, "loss": 0.2737, "step": 423000 }, { "epoch": 6.47, "learning_rate": 0.00010294778140183182, "loss": 0.2732, "step": 424000 }, { "epoch": 6.49, "learning_rate": 0.00010272896284432785, "loss": 0.2733, "step": 425000 }, { "epoch": 6.49, "eval_runtime": 1.0035, "eval_samples_per_second": 996.544, "eval_steps_per_second": 15.945, "step": 425000 }, { "epoch": 6.51, "learning_rate": 0.00010250989572652766, "loss": 0.2728, "step": 426000 }, { "epoch": 6.52, "learning_rate": 0.00010229058244411427, "loss": 0.2729, "step": 427000 }, { "epoch": 6.54, "learning_rate": 0.00010207102539546251, "loss": 0.2728, "step": 428000 }, { "epoch": 6.55, "learning_rate": 0.00010185122698161311, "loss": 0.2726, "step": 429000 }, { "epoch": 6.57, "learning_rate": 0.00010163118960624632, "loss": 0.2725, "step": 430000 }, { "epoch": 6.57, "eval_runtime": 1.0983, "eval_samples_per_second": 910.508, "eval_steps_per_second": 14.568, "step": 430000 }, { "epoch": 6.58, "learning_rate": 0.00010141091567565561, "loss": 0.2727, "step": 431000 }, { "epoch": 6.6, "learning_rate": 0.00010119040759872142, "loss": 0.2725, "step": 432000 }, { "epoch": 6.61, "learning_rate": 0.00010096966778688472, "loss": 0.2721, "step": 433000 }, { "epoch": 6.63, "learning_rate": 0.00010074869865412074, "loss": 0.272, "step": 434000 }, { "epoch": 6.64, "learning_rate": 0.00010052750261691254, "loss": 0.2721, "step": 435000 }, { "epoch": 6.64, "eval_runtime": 0.9895, "eval_samples_per_second": 1010.612, "eval_steps_per_second": 16.17, "step": 435000 }, { "epoch": 6.66, "learning_rate": 0.0001003060820942245, "loss": 0.2716, "step": 436000 }, { "epoch": 6.67, "learning_rate": 0.00010008443950747599, "loss": 0.2716, "step": 437000 }, { "epoch": 6.69, "learning_rate": 9.986257728051483e-05, "loss": 0.2717, "step": 438000 }, { "epoch": 6.7, "learning_rate": 9.964049783959082e-05, "loss": 0.2716, "step": 439000 }, { "epoch": 6.72, "learning_rate": 9.94182036133291e-05, "loss": 0.2715, "step": 440000 }, { "epoch": 6.72, "eval_runtime": 1.0245, "eval_samples_per_second": 976.131, "eval_steps_per_second": 15.618, "step": 440000 }, { "epoch": 6.73, "learning_rate": 9.919569703270376e-05, "loss": 0.2716, "step": 441000 }, { "epoch": 6.75, "learning_rate": 9.89729805310111e-05, "loss": 0.2711, "step": 442000 }, { "epoch": 6.76, "learning_rate": 9.875005654384307e-05, "loss": 0.2712, "step": 443000 }, { "epoch": 6.78, "learning_rate": 9.852692750906071e-05, "loss": 0.2717, "step": 444000 }, { "epoch": 6.8, "learning_rate": 9.830359586676737e-05, "loss": 0.2722, "step": 445000 }, { "epoch": 6.8, "eval_runtime": 1.1145, "eval_samples_per_second": 897.295, "eval_steps_per_second": 14.357, "step": 445000 }, { "epoch": 6.81, "learning_rate": 9.808006405928215e-05, "loss": 0.2703, "step": 446000 }, { "epoch": 6.83, "learning_rate": 9.785633453111306e-05, "loss": 0.2705, "step": 447000 }, { "epoch": 6.84, "learning_rate": 9.763240972893037e-05, "loss": 0.27, "step": 448000 }, { "epoch": 6.86, "learning_rate": 9.740829210153984e-05, "loss": 0.2703, "step": 449000 }, { "epoch": 6.87, "learning_rate": 9.718398409985593e-05, "loss": 0.27, "step": 450000 }, { "epoch": 6.87, "eval_runtime": 0.9938, "eval_samples_per_second": 1006.215, "eval_steps_per_second": 16.099, "step": 450000 }, { "epoch": 6.89, "learning_rate": 9.695948817687504e-05, "loss": 0.2699, "step": 451000 }, { "epoch": 6.9, "learning_rate": 9.673480678764858e-05, "loss": 0.2698, "step": 452000 }, { "epoch": 6.92, "learning_rate": 9.650994238925626e-05, "loss": 0.2699, "step": 453000 }, { "epoch": 6.93, "learning_rate": 9.628489744077911e-05, "loss": 0.2696, "step": 454000 }, { "epoch": 6.95, "learning_rate": 9.60596744032726e-05, "loss": 0.2699, "step": 455000 }, { "epoch": 6.95, "eval_runtime": 1.0008, "eval_samples_per_second": 999.165, "eval_steps_per_second": 15.987, "step": 455000 }, { "epoch": 6.96, "learning_rate": 9.583427573973982e-05, "loss": 0.2696, "step": 456000 }, { "epoch": 6.98, "learning_rate": 9.560870391510441e-05, "loss": 0.2695, "step": 457000 }, { "epoch": 6.99, "learning_rate": 9.538296139618371e-05, "loss": 0.2691, "step": 458000 }, { "epoch": 7.01, "learning_rate": 9.515705065166178e-05, "loss": 0.2693, "step": 459000 }, { "epoch": 7.02, "learning_rate": 9.493097415206228e-05, "loss": 0.2688, "step": 460000 }, { "epoch": 7.02, "eval_runtime": 1.0225, "eval_samples_per_second": 978.034, "eval_steps_per_second": 15.649, "step": 460000 }, { "epoch": 7.04, "learning_rate": 9.47047343697216e-05, "loss": 0.269, "step": 461000 }, { "epoch": 7.05, "learning_rate": 9.447833377876176e-05, "loss": 0.269, "step": 462000 }, { "epoch": 7.07, "learning_rate": 9.425177485506336e-05, "loss": 0.2688, "step": 463000 }, { "epoch": 7.09, "learning_rate": 9.402506007623848e-05, "loss": 0.269, "step": 464000 }, { "epoch": 7.1, "learning_rate": 9.379819192160362e-05, "loss": 0.2692, "step": 465000 }, { "epoch": 7.1, "eval_runtime": 1.1401, "eval_samples_per_second": 877.142, "eval_steps_per_second": 14.034, "step": 465000 }, { "epoch": 7.12, "learning_rate": 9.357117287215258e-05, "loss": 0.2682, "step": 466000 }, { "epoch": 7.13, "learning_rate": 9.334400541052928e-05, "loss": 0.2683, "step": 467000 }, { "epoch": 7.15, "learning_rate": 9.311669202100073e-05, "loss": 0.2693, "step": 468000 }, { "epoch": 7.16, "learning_rate": 9.288923518942968e-05, "loss": 0.2683, "step": 469000 }, { "epoch": 7.18, "learning_rate": 9.26616374032477e-05, "loss": 0.2677, "step": 470000 }, { "epoch": 7.18, "eval_runtime": 0.8954, "eval_samples_per_second": 1116.774, "eval_steps_per_second": 17.868, "step": 470000 }, { "epoch": 7.19, "learning_rate": 9.243390115142761e-05, "loss": 0.2678, "step": 471000 }, { "epoch": 7.21, "learning_rate": 9.220602892445661e-05, "loss": 0.2678, "step": 472000 }, { "epoch": 7.22, "learning_rate": 9.197802321430889e-05, "loss": 0.2679, "step": 473000 }, { "epoch": 7.24, "learning_rate": 9.174988651441833e-05, "loss": 0.2673, "step": 474000 }, { "epoch": 7.25, "learning_rate": 9.152162131965137e-05, "loss": 0.2675, "step": 475000 }, { "epoch": 7.25, "eval_runtime": 1.0353, "eval_samples_per_second": 965.922, "eval_steps_per_second": 15.455, "step": 475000 }, { "epoch": 7.27, "learning_rate": 9.129323012627956e-05, "loss": 0.2693, "step": 476000 }, { "epoch": 7.28, "learning_rate": 9.106471543195244e-05, "loss": 0.2675, "step": 477000 }, { "epoch": 7.3, "learning_rate": 9.08360797356701e-05, "loss": 0.2679, "step": 478000 }, { "epoch": 7.31, "learning_rate": 9.060732553775582e-05, "loss": 0.2672, "step": 479000 }, { "epoch": 7.33, "learning_rate": 9.037845533982892e-05, "loss": 0.267, "step": 480000 }, { "epoch": 7.33, "eval_runtime": 1.0347, "eval_samples_per_second": 966.468, "eval_steps_per_second": 15.463, "step": 480000 }, { "epoch": 7.34, "learning_rate": 9.014947164477721e-05, "loss": 0.2663, "step": 481000 }, { "epoch": 7.36, "learning_rate": 8.992037695672967e-05, "loss": 0.267, "step": 482000 }, { "epoch": 7.38, "learning_rate": 8.969117378102912e-05, "loss": 0.2665, "step": 483000 }, { "epoch": 7.39, "learning_rate": 8.946186462420478e-05, "loss": 0.2662, "step": 484000 }, { "epoch": 7.41, "learning_rate": 8.923245199394482e-05, "loss": 0.2662, "step": 485000 }, { "epoch": 7.41, "eval_runtime": 1.0079, "eval_samples_per_second": 992.191, "eval_steps_per_second": 15.875, "step": 485000 }, { "epoch": 7.42, "learning_rate": 8.900293839906903e-05, "loss": 0.2664, "step": 486000 }, { "epoch": 7.44, "learning_rate": 8.87733263495013e-05, "loss": 0.2658, "step": 487000 }, { "epoch": 7.45, "learning_rate": 8.85436183562422e-05, "loss": 0.2659, "step": 488000 }, { "epoch": 7.47, "learning_rate": 8.83138169313416e-05, "loss": 0.2663, "step": 489000 }, { "epoch": 7.48, "learning_rate": 8.808392458787103e-05, "loss": 0.2656, "step": 490000 }, { "epoch": 7.48, "eval_runtime": 1.075, "eval_samples_per_second": 930.213, "eval_steps_per_second": 14.883, "step": 490000 }, { "epoch": 7.5, "learning_rate": 8.78539438398963e-05, "loss": 0.2655, "step": 491000 }, { "epoch": 7.51, "learning_rate": 8.762387720245008e-05, "loss": 0.2656, "step": 492000 }, { "epoch": 7.53, "learning_rate": 8.73937271915042e-05, "loss": 0.2655, "step": 493000 }, { "epoch": 7.54, "learning_rate": 8.716349632394235e-05, "loss": 0.2652, "step": 494000 }, { "epoch": 7.56, "learning_rate": 8.69331871175324e-05, "loss": 0.2651, "step": 495000 }, { "epoch": 7.56, "eval_runtime": 1.1978, "eval_samples_per_second": 834.871, "eval_steps_per_second": 13.358, "step": 495000 }, { "epoch": 7.57, "learning_rate": 8.67028020908989e-05, "loss": 0.2647, "step": 496000 }, { "epoch": 7.59, "learning_rate": 8.647234376349565e-05, "loss": 0.2653, "step": 497000 }, { "epoch": 7.6, "learning_rate": 8.624181465557794e-05, "loss": 0.2649, "step": 498000 }, { "epoch": 7.62, "learning_rate": 8.601121728817519e-05, "loss": 0.2647, "step": 499000 }, { "epoch": 7.64, "learning_rate": 8.578055418306327e-05, "loss": 0.2654, "step": 500000 }, { "epoch": 7.64, "eval_runtime": 1.1022, "eval_samples_per_second": 907.298, "eval_steps_per_second": 14.517, "step": 500000 }, { "epoch": 7.65, "learning_rate": 8.55498278627369e-05, "loss": 0.2646, "step": 501000 }, { "epoch": 7.67, "learning_rate": 8.531904085038221e-05, "loss": 0.2646, "step": 502000 }, { "epoch": 7.68, "learning_rate": 8.508819566984897e-05, "loss": 0.2641, "step": 503000 }, { "epoch": 7.7, "learning_rate": 8.485729484562307e-05, "loss": 0.2641, "step": 504000 }, { "epoch": 7.71, "learning_rate": 8.462634090279895e-05, "loss": 0.264, "step": 505000 }, { "epoch": 7.71, "eval_runtime": 1.0129, "eval_samples_per_second": 987.309, "eval_steps_per_second": 15.797, "step": 505000 }, { "epoch": 7.73, "learning_rate": 8.439533636705194e-05, "loss": 0.2635, "step": 506000 }, { "epoch": 7.74, "learning_rate": 8.416428376461061e-05, "loss": 0.2644, "step": 507000 }, { "epoch": 7.76, "learning_rate": 8.393318562222916e-05, "loss": 0.2642, "step": 508000 }, { "epoch": 7.77, "learning_rate": 8.370204446715997e-05, "loss": 0.2638, "step": 509000 }, { "epoch": 7.79, "learning_rate": 8.347086282712556e-05, "loss": 0.2637, "step": 510000 }, { "epoch": 7.79, "eval_runtime": 1.1071, "eval_samples_per_second": 903.278, "eval_steps_per_second": 14.452, "step": 510000 }, { "epoch": 7.8, "learning_rate": 8.323964323029136e-05, "loss": 0.2633, "step": 511000 }, { "epoch": 7.82, "learning_rate": 8.300838820523784e-05, "loss": 0.2634, "step": 512000 }, { "epoch": 7.83, "learning_rate": 8.277710028093289e-05, "loss": 0.263, "step": 513000 }, { "epoch": 7.85, "learning_rate": 8.254578198670421e-05, "loss": 0.2632, "step": 514000 }, { "epoch": 7.86, "learning_rate": 8.231443585221157e-05, "loss": 0.2629, "step": 515000 }, { "epoch": 7.86, "eval_runtime": 1.0457, "eval_samples_per_second": 956.256, "eval_steps_per_second": 15.3, "step": 515000 }, { "epoch": 7.88, "learning_rate": 8.208306440741926e-05, "loss": 0.2626, "step": 516000 }, { "epoch": 7.89, "learning_rate": 8.185167018256834e-05, "loss": 0.2629, "step": 517000 }, { "epoch": 7.91, "learning_rate": 8.162025570814896e-05, "loss": 0.2625, "step": 518000 }, { "epoch": 7.93, "learning_rate": 8.138882351487275e-05, "loss": 0.2623, "step": 519000 }, { "epoch": 7.94, "learning_rate": 8.115737613364511e-05, "loss": 0.2626, "step": 520000 }, { "epoch": 7.94, "eval_runtime": 1.0504, "eval_samples_per_second": 952.036, "eval_steps_per_second": 15.233, "step": 520000 }, { "epoch": 7.96, "learning_rate": 8.092591609553747e-05, "loss": 0.2623, "step": 521000 }, { "epoch": 7.97, "learning_rate": 8.069444593175975e-05, "loss": 0.2622, "step": 522000 }, { "epoch": 7.99, "learning_rate": 8.046296817363259e-05, "loss": 0.262, "step": 523000 }, { "epoch": 8.0, "learning_rate": 8.023148535255965e-05, "loss": 0.2619, "step": 524000 }, { "epoch": 8.02, "learning_rate": 7.999999999999999e-05, "loss": 0.262, "step": 525000 }, { "epoch": 8.02, "eval_runtime": 1.1375, "eval_samples_per_second": 879.131, "eval_steps_per_second": 14.066, "step": 525000 }, { "epoch": 8.03, "learning_rate": 7.976851464744033e-05, "loss": 0.2616, "step": 526000 }, { "epoch": 8.05, "learning_rate": 7.953703182636741e-05, "loss": 0.2616, "step": 527000 }, { "epoch": 8.06, "learning_rate": 7.930555406824026e-05, "loss": 0.2617, "step": 528000 }, { "epoch": 8.08, "learning_rate": 7.907408390446254e-05, "loss": 0.2614, "step": 529000 }, { "epoch": 8.09, "learning_rate": 7.884262386635489e-05, "loss": 0.2607, "step": 530000 }, { "epoch": 8.09, "eval_runtime": 1.0134, "eval_samples_per_second": 986.75, "eval_steps_per_second": 15.788, "step": 530000 }, { "epoch": 8.11, "learning_rate": 7.861117648512725e-05, "loss": 0.2613, "step": 531000 }, { "epoch": 8.12, "learning_rate": 7.837974429185103e-05, "loss": 0.2614, "step": 532000 }, { "epoch": 8.14, "learning_rate": 7.814832981743164e-05, "loss": 0.2614, "step": 533000 }, { "epoch": 8.15, "learning_rate": 7.791693559258072e-05, "loss": 0.2608, "step": 534000 }, { "epoch": 8.17, "learning_rate": 7.768556414778842e-05, "loss": 0.2606, "step": 535000 }, { "epoch": 8.17, "eval_runtime": 1.097, "eval_samples_per_second": 911.552, "eval_steps_per_second": 14.585, "step": 535000 }, { "epoch": 8.18, "learning_rate": 7.74542180132958e-05, "loss": 0.2606, "step": 536000 }, { "epoch": 8.2, "learning_rate": 7.72228997190671e-05, "loss": 0.2608, "step": 537000 }, { "epoch": 8.22, "learning_rate": 7.699161179476217e-05, "loss": 0.2604, "step": 538000 }, { "epoch": 8.23, "learning_rate": 7.676035676970863e-05, "loss": 0.2606, "step": 539000 }, { "epoch": 8.25, "learning_rate": 7.652913717287443e-05, "loss": 0.2604, "step": 540000 }, { "epoch": 8.25, "eval_runtime": 1.1778, "eval_samples_per_second": 849.063, "eval_steps_per_second": 13.585, "step": 540000 }, { "epoch": 8.26, "learning_rate": 7.629795553284005e-05, "loss": 0.2602, "step": 541000 }, { "epoch": 8.28, "learning_rate": 7.606681437777081e-05, "loss": 0.2605, "step": 542000 }, { "epoch": 8.29, "learning_rate": 7.583571623538939e-05, "loss": 0.26, "step": 543000 }, { "epoch": 8.31, "learning_rate": 7.560466363294806e-05, "loss": 0.2596, "step": 544000 }, { "epoch": 8.32, "learning_rate": 7.537365909720104e-05, "loss": 0.2595, "step": 545000 }, { "epoch": 8.32, "eval_runtime": 1.1629, "eval_samples_per_second": 859.911, "eval_steps_per_second": 13.759, "step": 545000 }, { "epoch": 8.34, "learning_rate": 7.514270515437691e-05, "loss": 0.2595, "step": 546000 }, { "epoch": 8.35, "learning_rate": 7.491180433015101e-05, "loss": 0.2594, "step": 547000 }, { "epoch": 8.37, "learning_rate": 7.468095914961777e-05, "loss": 0.2596, "step": 548000 }, { "epoch": 8.38, "learning_rate": 7.445017213726307e-05, "loss": 0.2596, "step": 549000 }, { "epoch": 8.4, "learning_rate": 7.421944581693674e-05, "loss": 0.2594, "step": 550000 }, { "epoch": 8.4, "eval_runtime": 0.9899, "eval_samples_per_second": 1010.184, "eval_steps_per_second": 16.163, "step": 550000 }, { "epoch": 8.41, "learning_rate": 7.39887827118248e-05, "loss": 0.259, "step": 551000 }, { "epoch": 8.43, "learning_rate": 7.375818534442207e-05, "loss": 0.2588, "step": 552000 }, { "epoch": 8.44, "learning_rate": 7.352765623650435e-05, "loss": 0.259, "step": 553000 }, { "epoch": 8.46, "learning_rate": 7.329719790910108e-05, "loss": 0.2587, "step": 554000 }, { "epoch": 8.47, "learning_rate": 7.30668128824676e-05, "loss": 0.2587, "step": 555000 }, { "epoch": 8.47, "eval_runtime": 1.1635, "eval_samples_per_second": 859.466, "eval_steps_per_second": 13.751, "step": 555000 }, { "epoch": 8.49, "learning_rate": 7.283650367605764e-05, "loss": 0.2584, "step": 556000 }, { "epoch": 8.51, "learning_rate": 7.260627280849581e-05, "loss": 0.2585, "step": 557000 }, { "epoch": 8.52, "learning_rate": 7.23761227975499e-05, "loss": 0.2584, "step": 558000 }, { "epoch": 8.54, "learning_rate": 7.21460561601037e-05, "loss": 0.2584, "step": 559000 }, { "epoch": 8.55, "learning_rate": 7.191607541212897e-05, "loss": 0.2585, "step": 560000 }, { "epoch": 8.55, "eval_runtime": 1.1711, "eval_samples_per_second": 853.863, "eval_steps_per_second": 13.662, "step": 560000 }, { "epoch": 8.57, "learning_rate": 7.168618306865838e-05, "loss": 0.2583, "step": 561000 }, { "epoch": 8.58, "learning_rate": 7.145638164375779e-05, "loss": 0.2588, "step": 562000 }, { "epoch": 8.6, "learning_rate": 7.122667365049869e-05, "loss": 0.2578, "step": 563000 }, { "epoch": 8.61, "learning_rate": 7.099706160093098e-05, "loss": 0.2578, "step": 564000 }, { "epoch": 8.63, "learning_rate": 7.076754800605516e-05, "loss": 0.2579, "step": 565000 }, { "epoch": 8.63, "eval_runtime": 1.0129, "eval_samples_per_second": 987.305, "eval_steps_per_second": 15.797, "step": 565000 }, { "epoch": 8.64, "learning_rate": 7.053813537579523e-05, "loss": 0.2581, "step": 566000 }, { "epoch": 8.66, "learning_rate": 7.030882621897088e-05, "loss": 0.2575, "step": 567000 }, { "epoch": 8.67, "learning_rate": 7.00796230432703e-05, "loss": 0.2574, "step": 568000 }, { "epoch": 8.69, "learning_rate": 6.985052835522279e-05, "loss": 0.2572, "step": 569000 }, { "epoch": 8.7, "learning_rate": 6.962154466017105e-05, "loss": 0.2572, "step": 570000 }, { "epoch": 8.7, "eval_runtime": 1.048, "eval_samples_per_second": 954.187, "eval_steps_per_second": 15.267, "step": 570000 }, { "epoch": 8.72, "learning_rate": 6.939267446224418e-05, "loss": 0.2569, "step": 571000 }, { "epoch": 8.73, "learning_rate": 6.91639202643299e-05, "loss": 0.2569, "step": 572000 }, { "epoch": 8.75, "learning_rate": 6.893528456804756e-05, "loss": 0.2569, "step": 573000 }, { "epoch": 8.77, "learning_rate": 6.870676987372044e-05, "loss": 0.2568, "step": 574000 }, { "epoch": 8.78, "learning_rate": 6.847837868034861e-05, "loss": 0.257, "step": 575000 }, { "epoch": 8.78, "eval_runtime": 1.0002, "eval_samples_per_second": 999.79, "eval_steps_per_second": 15.997, "step": 575000 }, { "epoch": 8.8, "learning_rate": 6.825011348558167e-05, "loss": 0.2573, "step": 576000 }, { "epoch": 8.81, "learning_rate": 6.802197678569109e-05, "loss": 0.2566, "step": 577000 }, { "epoch": 8.83, "learning_rate": 6.779397107554339e-05, "loss": 0.2562, "step": 578000 }, { "epoch": 8.84, "learning_rate": 6.756609884857239e-05, "loss": 0.2566, "step": 579000 }, { "epoch": 8.86, "learning_rate": 6.733836259675233e-05, "loss": 0.2564, "step": 580000 }, { "epoch": 8.86, "eval_runtime": 1.0727, "eval_samples_per_second": 932.263, "eval_steps_per_second": 14.916, "step": 580000 }, { "epoch": 8.87, "learning_rate": 6.71107648105703e-05, "loss": 0.2564, "step": 581000 }, { "epoch": 8.89, "learning_rate": 6.688330797899925e-05, "loss": 0.2562, "step": 582000 }, { "epoch": 8.9, "learning_rate": 6.665599458947072e-05, "loss": 0.2562, "step": 583000 }, { "epoch": 8.92, "learning_rate": 6.642882712784742e-05, "loss": 0.2561, "step": 584000 }, { "epoch": 8.93, "learning_rate": 6.620180807839639e-05, "loss": 0.2561, "step": 585000 }, { "epoch": 8.93, "eval_runtime": 0.9936, "eval_samples_per_second": 1006.405, "eval_steps_per_second": 16.102, "step": 585000 }, { "epoch": 8.95, "learning_rate": 6.597493992376152e-05, "loss": 0.2557, "step": 586000 }, { "epoch": 8.96, "learning_rate": 6.574822514493664e-05, "loss": 0.2554, "step": 587000 }, { "epoch": 8.98, "learning_rate": 6.552166622123824e-05, "loss": 0.2554, "step": 588000 }, { "epoch": 8.99, "learning_rate": 6.52952656302784e-05, "loss": 0.2556, "step": 589000 }, { "epoch": 9.01, "learning_rate": 6.506902584793773e-05, "loss": 0.2553, "step": 590000 }, { "epoch": 9.01, "eval_runtime": 0.9015, "eval_samples_per_second": 1109.201, "eval_steps_per_second": 17.747, "step": 590000 }, { "epoch": 9.02, "learning_rate": 6.484294934833822e-05, "loss": 0.2552, "step": 591000 }, { "epoch": 9.04, "learning_rate": 6.461703860381628e-05, "loss": 0.2551, "step": 592000 }, { "epoch": 9.06, "learning_rate": 6.439129608489559e-05, "loss": 0.2555, "step": 593000 }, { "epoch": 9.07, "learning_rate": 6.41657242602602e-05, "loss": 0.2549, "step": 594000 }, { "epoch": 9.09, "learning_rate": 6.39403255967274e-05, "loss": 0.255, "step": 595000 }, { "epoch": 9.09, "eval_runtime": 1.1107, "eval_samples_per_second": 900.319, "eval_steps_per_second": 14.405, "step": 595000 }, { "epoch": 9.1, "learning_rate": 6.371510255922088e-05, "loss": 0.2545, "step": 596000 }, { "epoch": 9.12, "learning_rate": 6.349005761074372e-05, "loss": 0.2547, "step": 597000 }, { "epoch": 9.13, "learning_rate": 6.326519321235139e-05, "loss": 0.2546, "step": 598000 }, { "epoch": 9.15, "learning_rate": 6.304051182312496e-05, "loss": 0.2549, "step": 599000 }, { "epoch": 9.16, "learning_rate": 6.281601590014407e-05, "loss": 0.2546, "step": 600000 }, { "epoch": 9.16, "eval_runtime": 1.0772, "eval_samples_per_second": 928.316, "eval_steps_per_second": 14.853, "step": 600000 }, { "epoch": 9.18, "learning_rate": 6.259170789846017e-05, "loss": 0.2546, "step": 601000 }, { "epoch": 9.19, "learning_rate": 6.236759027106965e-05, "loss": 0.2542, "step": 602000 }, { "epoch": 9.21, "learning_rate": 6.214366546888694e-05, "loss": 0.2541, "step": 603000 }, { "epoch": 9.22, "learning_rate": 6.191993594071785e-05, "loss": 0.2541, "step": 604000 }, { "epoch": 9.24, "learning_rate": 6.169640413323262e-05, "loss": 0.254, "step": 605000 }, { "epoch": 9.24, "eval_runtime": 1.0913, "eval_samples_per_second": 916.334, "eval_steps_per_second": 14.661, "step": 605000 }, { "epoch": 9.25, "learning_rate": 6.147307249093929e-05, "loss": 0.2537, "step": 606000 }, { "epoch": 9.27, "learning_rate": 6.124994345615693e-05, "loss": 0.2532, "step": 607000 }, { "epoch": 9.28, "learning_rate": 6.102701946898891e-05, "loss": 0.2536, "step": 608000 }, { "epoch": 9.3, "learning_rate": 6.0804302967296225e-05, "loss": 0.2545, "step": 609000 }, { "epoch": 9.31, "learning_rate": 6.058179638667089e-05, "loss": 0.2536, "step": 610000 }, { "epoch": 9.31, "eval_runtime": 1.0284, "eval_samples_per_second": 972.365, "eval_steps_per_second": 15.558, "step": 610000 }, { "epoch": 9.33, "learning_rate": 6.035950216040917e-05, "loss": 0.2533, "step": 611000 }, { "epoch": 9.35, "learning_rate": 6.0137422719485145e-05, "loss": 0.2531, "step": 612000 }, { "epoch": 9.36, "learning_rate": 5.991556049252401e-05, "loss": 0.2532, "step": 613000 }, { "epoch": 9.38, "learning_rate": 5.969391790577551e-05, "loss": 0.2532, "step": 614000 }, { "epoch": 9.39, "learning_rate": 5.947249738308747e-05, "loss": 0.2529, "step": 615000 }, { "epoch": 9.39, "eval_runtime": 1.014, "eval_samples_per_second": 986.174, "eval_steps_per_second": 15.779, "step": 615000 }, { "epoch": 9.41, "learning_rate": 5.925130134587924e-05, "loss": 0.2527, "step": 616000 }, { "epoch": 9.42, "learning_rate": 5.903033221311528e-05, "loss": 0.2525, "step": 617000 }, { "epoch": 9.44, "learning_rate": 5.880959240127858e-05, "loss": 0.2524, "step": 618000 }, { "epoch": 9.45, "learning_rate": 5.858908432434438e-05, "loss": 0.2525, "step": 619000 }, { "epoch": 9.47, "learning_rate": 5.8368810393753684e-05, "loss": 0.2524, "step": 620000 }, { "epoch": 9.47, "eval_runtime": 1.0588, "eval_samples_per_second": 944.48, "eval_steps_per_second": 15.112, "step": 620000 }, { "epoch": 9.48, "learning_rate": 5.814877301838688e-05, "loss": 0.2523, "step": 621000 }, { "epoch": 9.5, "learning_rate": 5.7928974604537494e-05, "loss": 0.2522, "step": 622000 }, { "epoch": 9.51, "learning_rate": 5.770941755588573e-05, "loss": 0.2537, "step": 623000 }, { "epoch": 9.53, "learning_rate": 5.749010427347233e-05, "loss": 0.254, "step": 624000 }, { "epoch": 9.54, "learning_rate": 5.7271037155672156e-05, "loss": 0.2522, "step": 625000 }, { "epoch": 9.54, "eval_runtime": 1.0707, "eval_samples_per_second": 934.001, "eval_steps_per_second": 14.944, "step": 625000 }, { "epoch": 9.56, "learning_rate": 5.7052218598168154e-05, "loss": 0.2524, "step": 626000 }, { "epoch": 9.57, "learning_rate": 5.6833650993925016e-05, "loss": 0.2522, "step": 627000 }, { "epoch": 9.59, "learning_rate": 5.661533673316303e-05, "loss": 0.2522, "step": 628000 }, { "epoch": 9.6, "learning_rate": 5.639727820333198e-05, "loss": 0.2518, "step": 629000 }, { "epoch": 9.62, "learning_rate": 5.617947778908498e-05, "loss": 0.2517, "step": 630000 }, { "epoch": 9.62, "eval_runtime": 1.1949, "eval_samples_per_second": 836.899, "eval_steps_per_second": 13.39, "step": 630000 }, { "epoch": 9.64, "learning_rate": 5.596193787225254e-05, "loss": 0.2514, "step": 631000 }, { "epoch": 9.65, "learning_rate": 5.574466083181624e-05, "loss": 0.2512, "step": 632000 }, { "epoch": 9.67, "learning_rate": 5.552764904388305e-05, "loss": 0.2511, "step": 633000 }, { "epoch": 9.68, "learning_rate": 5.5310904881659116e-05, "loss": 0.2511, "step": 634000 }, { "epoch": 9.7, "learning_rate": 5.5094430715423835e-05, "loss": 0.2509, "step": 635000 }, { "epoch": 9.7, "eval_runtime": 1.0102, "eval_samples_per_second": 989.889, "eval_steps_per_second": 15.838, "step": 635000 }, { "epoch": 9.71, "learning_rate": 5.487822891250406e-05, "loss": 0.2511, "step": 636000 }, { "epoch": 9.73, "learning_rate": 5.4662301837247985e-05, "loss": 0.2508, "step": 637000 }, { "epoch": 9.74, "learning_rate": 5.4446651850999604e-05, "loss": 0.2506, "step": 638000 }, { "epoch": 9.76, "learning_rate": 5.4231281312072544e-05, "loss": 0.2505, "step": 639000 }, { "epoch": 9.77, "learning_rate": 5.401619257572453e-05, "loss": 0.2502, "step": 640000 }, { "epoch": 9.77, "eval_runtime": 1.0069, "eval_samples_per_second": 993.184, "eval_steps_per_second": 15.891, "step": 640000 }, { "epoch": 9.79, "learning_rate": 5.3801387994131576e-05, "loss": 0.2501, "step": 641000 }, { "epoch": 9.8, "learning_rate": 5.358686991636209e-05, "loss": 0.2503, "step": 642000 }, { "epoch": 9.82, "learning_rate": 5.3372640688351476e-05, "loss": 0.2505, "step": 643000 }, { "epoch": 9.83, "learning_rate": 5.315870265287618e-05, "loss": 0.2502, "step": 644000 }, { "epoch": 9.85, "learning_rate": 5.294505814952835e-05, "loss": 0.2501, "step": 645000 }, { "epoch": 9.85, "eval_runtime": 1.0688, "eval_samples_per_second": 935.652, "eval_steps_per_second": 14.97, "step": 645000 }, { "epoch": 9.86, "learning_rate": 5.2731709514689995e-05, "loss": 0.2502, "step": 646000 }, { "epoch": 9.88, "learning_rate": 5.25186590815076e-05, "loss": 0.2501, "step": 647000 }, { "epoch": 9.9, "learning_rate": 5.2305909179866635e-05, "loss": 0.2495, "step": 648000 }, { "epoch": 9.91, "learning_rate": 5.209346213636584e-05, "loss": 0.2498, "step": 649000 }, { "epoch": 9.93, "learning_rate": 5.188132027429215e-05, "loss": 0.2495, "step": 650000 }, { "epoch": 9.93, "eval_runtime": 1.0361, "eval_samples_per_second": 965.164, "eval_steps_per_second": 15.443, "step": 650000 }, { "epoch": 9.94, "learning_rate": 5.166948591359489e-05, "loss": 0.2493, "step": 651000 }, { "epoch": 9.96, "learning_rate": 5.145796137086076e-05, "loss": 0.2493, "step": 652000 }, { "epoch": 9.97, "learning_rate": 5.124674895928823e-05, "loss": 0.2493, "step": 653000 }, { "epoch": 9.99, "learning_rate": 5.103585098866237e-05, "loss": 0.2491, "step": 654000 }, { "epoch": 10.0, "learning_rate": 5.082526976532968e-05, "loss": 0.249, "step": 655000 }, { "epoch": 10.0, "eval_runtime": 1.0267, "eval_samples_per_second": 974.027, "eval_steps_per_second": 15.584, "step": 655000 }, { "epoch": 10.02, "learning_rate": 5.061500759217261e-05, "loss": 0.2494, "step": 656000 }, { "epoch": 10.03, "learning_rate": 5.04050667685846e-05, "loss": 0.2487, "step": 657000 }, { "epoch": 10.05, "learning_rate": 5.01954495904449e-05, "loss": 0.2485, "step": 658000 }, { "epoch": 10.06, "learning_rate": 4.998615835009339e-05, "loss": 0.2488, "step": 659000 }, { "epoch": 10.08, "learning_rate": 4.97771953363055e-05, "loss": 0.2489, "step": 660000 }, { "epoch": 10.08, "eval_runtime": 1.0445, "eval_samples_per_second": 957.361, "eval_steps_per_second": 15.318, "step": 660000 }, { "epoch": 10.09, "learning_rate": 4.956856283426728e-05, "loss": 0.2487, "step": 661000 }, { "epoch": 10.11, "learning_rate": 4.936026312555037e-05, "loss": 0.248, "step": 662000 }, { "epoch": 10.12, "learning_rate": 4.915229848808698e-05, "loss": 0.2478, "step": 663000 }, { "epoch": 10.14, "learning_rate": 4.8944671196145136e-05, "loss": 0.2484, "step": 664000 }, { "epoch": 10.15, "learning_rate": 4.8737383520303546e-05, "loss": 0.2485, "step": 665000 }, { "epoch": 10.15, "eval_runtime": 1.1085, "eval_samples_per_second": 902.106, "eval_steps_per_second": 14.434, "step": 665000 }, { "epoch": 10.17, "learning_rate": 4.853043772742709e-05, "loss": 0.248, "step": 666000 }, { "epoch": 10.19, "learning_rate": 4.832383608064172e-05, "loss": 0.2476, "step": 667000 }, { "epoch": 10.2, "learning_rate": 4.811758083931005e-05, "loss": 0.2478, "step": 668000 }, { "epoch": 10.22, "learning_rate": 4.791167425900632e-05, "loss": 0.2481, "step": 669000 }, { "epoch": 10.23, "learning_rate": 4.770611859149185e-05, "loss": 0.2508, "step": 670000 }, { "epoch": 10.23, "eval_runtime": 1.1412, "eval_samples_per_second": 876.243, "eval_steps_per_second": 14.02, "step": 670000 }, { "epoch": 10.25, "learning_rate": 4.7500916084690564e-05, "loss": 0.2542, "step": 671000 }, { "epoch": 10.26, "learning_rate": 4.729606898266411e-05, "loss": 0.2507, "step": 672000 }, { "epoch": 10.28, "learning_rate": 4.709157952558768e-05, "loss": 0.2478, "step": 673000 }, { "epoch": 10.29, "learning_rate": 4.688744994972514e-05, "loss": 0.2482, "step": 674000 }, { "epoch": 10.31, "learning_rate": 4.668368248740485e-05, "loss": 0.247, "step": 675000 }, { "epoch": 10.31, "eval_runtime": 0.9224, "eval_samples_per_second": 1084.145, "eval_steps_per_second": 17.346, "step": 675000 }, { "epoch": 10.32, "learning_rate": 4.6480279366995116e-05, "loss": 0.2472, "step": 676000 }, { "epoch": 10.34, "learning_rate": 4.6277242812879914e-05, "loss": 0.2473, "step": 677000 }, { "epoch": 10.35, "learning_rate": 4.607457504543447e-05, "loss": 0.2471, "step": 678000 }, { "epoch": 10.37, "learning_rate": 4.5872278281000955e-05, "loss": 0.2469, "step": 679000 }, { "epoch": 10.38, "learning_rate": 4.567035473186444e-05, "loss": 0.2469, "step": 680000 }, { "epoch": 10.38, "eval_runtime": 0.7393, "eval_samples_per_second": 1352.617, "eval_steps_per_second": 21.642, "step": 680000 }, { "epoch": 10.4, "learning_rate": 4.546880660622845e-05, "loss": 0.2463, "step": 681000 }, { "epoch": 10.41, "learning_rate": 4.5267636108191036e-05, "loss": 0.2466, "step": 682000 }, { "epoch": 10.43, "learning_rate": 4.5066845437720555e-05, "loss": 0.2462, "step": 683000 }, { "epoch": 10.44, "learning_rate": 4.4866436790631564e-05, "loss": 0.2463, "step": 684000 }, { "epoch": 10.46, "learning_rate": 4.4666412358560955e-05, "loss": 0.2461, "step": 685000 }, { "epoch": 10.46, "eval_runtime": 0.7931, "eval_samples_per_second": 1260.883, "eval_steps_per_second": 20.174, "step": 685000 }, { "epoch": 10.48, "learning_rate": 4.4466774328943796e-05, "loss": 0.2462, "step": 686000 }, { "epoch": 10.49, "learning_rate": 4.426752488498972e-05, "loss": 0.2462, "step": 687000 }, { "epoch": 10.51, "learning_rate": 4.406866620565862e-05, "loss": 0.2459, "step": 688000 }, { "epoch": 10.52, "learning_rate": 4.3870200465637164e-05, "loss": 0.2471, "step": 689000 }, { "epoch": 10.54, "learning_rate": 4.3672129835314955e-05, "loss": 0.2481, "step": 690000 }, { "epoch": 10.54, "eval_runtime": 0.8338, "eval_samples_per_second": 1199.26, "eval_steps_per_second": 19.188, "step": 690000 }, { "epoch": 10.55, "learning_rate": 4.347445648076057e-05, "loss": 0.2463, "step": 691000 }, { "epoch": 10.57, "learning_rate": 4.327718256369826e-05, "loss": 0.2458, "step": 692000 }, { "epoch": 10.58, "learning_rate": 4.3080310241483885e-05, "loss": 0.2451, "step": 693000 }, { "epoch": 10.6, "learning_rate": 4.2883841667081675e-05, "loss": 0.2454, "step": 694000 }, { "epoch": 10.61, "learning_rate": 4.268777898904044e-05, "loss": 0.2455, "step": 695000 }, { "epoch": 10.61, "eval_runtime": 0.794, "eval_samples_per_second": 1259.505, "eval_steps_per_second": 20.152, "step": 695000 }, { "epoch": 10.63, "learning_rate": 4.2492124351470214e-05, "loss": 0.2453, "step": 696000 }, { "epoch": 10.64, "learning_rate": 4.2296879894018835e-05, "loss": 0.2449, "step": 697000 }, { "epoch": 10.66, "learning_rate": 4.210204775184834e-05, "loss": 0.245, "step": 698000 }, { "epoch": 10.67, "learning_rate": 4.190763005561186e-05, "loss": 0.2447, "step": 699000 }, { "epoch": 10.69, "learning_rate": 4.171362893143013e-05, "loss": 0.2444, "step": 700000 }, { "epoch": 10.69, "eval_runtime": 0.7798, "eval_samples_per_second": 1282.444, "eval_steps_per_second": 20.519, "step": 700000 }, { "epoch": 10.7, "learning_rate": 4.1520046500868384e-05, "loss": 0.2442, "step": 701000 }, { "epoch": 10.72, "learning_rate": 4.1326884880913074e-05, "loss": 0.2454, "step": 702000 }, { "epoch": 10.73, "learning_rate": 4.1134146183948724e-05, "loss": 0.2445, "step": 703000 }, { "epoch": 10.75, "learning_rate": 4.0941832517734885e-05, "loss": 0.2448, "step": 704000 }, { "epoch": 10.77, "learning_rate": 4.0749945985382915e-05, "loss": 0.2445, "step": 705000 }, { "epoch": 10.77, "eval_runtime": 0.7458, "eval_samples_per_second": 1340.853, "eval_steps_per_second": 21.454, "step": 705000 }, { "epoch": 10.78, "learning_rate": 4.0558488685333235e-05, "loss": 0.253, "step": 706000 }, { "epoch": 10.8, "learning_rate": 4.036746271133223e-05, "loss": 0.2533, "step": 707000 }, { "epoch": 10.81, "learning_rate": 4.0176870152409324e-05, "loss": 0.2547, "step": 708000 }, { "epoch": 10.83, "learning_rate": 3.998671309285417e-05, "loss": 0.2529, "step": 709000 }, { "epoch": 10.84, "learning_rate": 3.979699361219395e-05, "loss": 0.2457, "step": 710000 }, { "epoch": 10.84, "eval_runtime": 0.7472, "eval_samples_per_second": 1338.326, "eval_steps_per_second": 21.413, "step": 710000 }, { "epoch": 10.86, "learning_rate": 3.960771378517049e-05, "loss": 0.2438, "step": 711000 }, { "epoch": 10.87, "learning_rate": 3.941887568171766e-05, "loss": 0.2464, "step": 712000 }, { "epoch": 10.89, "learning_rate": 3.923048136693873e-05, "loss": 0.2445, "step": 713000 }, { "epoch": 10.9, "learning_rate": 3.904253290108369e-05, "loss": 0.2435, "step": 714000 }, { "epoch": 10.92, "learning_rate": 3.885503233952689e-05, "loss": 0.2446, "step": 715000 }, { "epoch": 10.92, "eval_runtime": 0.8432, "eval_samples_per_second": 1186.017, "eval_steps_per_second": 18.976, "step": 715000 }, { "epoch": 10.93, "learning_rate": 3.86679817327444e-05, "loss": 0.2432, "step": 716000 }, { "epoch": 10.95, "learning_rate": 3.848138312629171e-05, "loss": 0.2433, "step": 717000 }, { "epoch": 10.96, "learning_rate": 3.8295238560781317e-05, "loss": 0.2436, "step": 718000 }, { "epoch": 10.98, "learning_rate": 3.810955007186029e-05, "loss": 0.2433, "step": 719000 }, { "epoch": 10.99, "learning_rate": 3.792431969018824e-05, "loss": 0.243, "step": 720000 }, { "epoch": 10.99, "eval_runtime": 0.7755, "eval_samples_per_second": 1289.466, "eval_steps_per_second": 20.631, "step": 720000 }, { "epoch": 11.01, "learning_rate": 3.7739549441414945e-05, "loss": 0.2427, "step": 721000 }, { "epoch": 11.03, "learning_rate": 3.755524134615825e-05, "loss": 0.2429, "step": 722000 }, { "epoch": 11.04, "learning_rate": 3.7371397419981925e-05, "loss": 0.2428, "step": 723000 }, { "epoch": 11.06, "learning_rate": 3.7188019673373706e-05, "loss": 0.2431, "step": 724000 }, { "epoch": 11.07, "learning_rate": 3.700511011172325e-05, "loss": 0.2436, "step": 725000 }, { "epoch": 11.07, "eval_runtime": 0.7297, "eval_samples_per_second": 1370.472, "eval_steps_per_second": 21.928, "step": 725000 }, { "epoch": 11.09, "learning_rate": 3.682267073530023e-05, "loss": 0.243, "step": 726000 }, { "epoch": 11.1, "learning_rate": 3.664070353923245e-05, "loss": 0.2424, "step": 727000 }, { "epoch": 11.12, "learning_rate": 3.645921051348396e-05, "loss": 0.2423, "step": 728000 }, { "epoch": 11.13, "learning_rate": 3.627819364283345e-05, "loss": 0.2456, "step": 729000 }, { "epoch": 11.15, "learning_rate": 3.6097654906852405e-05, "loss": 0.2431, "step": 730000 }, { "epoch": 11.15, "eval_runtime": 0.7906, "eval_samples_per_second": 1264.795, "eval_steps_per_second": 20.237, "step": 730000 }, { "epoch": 11.16, "learning_rate": 3.591759627988353e-05, "loss": 0.242, "step": 731000 }, { "epoch": 11.18, "learning_rate": 3.573801973101913e-05, "loss": 0.2418, "step": 732000 }, { "epoch": 11.19, "learning_rate": 3.5558927224079534e-05, "loss": 0.2418, "step": 733000 }, { "epoch": 11.21, "learning_rate": 3.5380320717591716e-05, "loss": 0.2419, "step": 734000 }, { "epoch": 11.22, "learning_rate": 3.5202202164767836e-05, "loss": 0.2418, "step": 735000 }, { "epoch": 11.22, "eval_runtime": 0.8971, "eval_samples_per_second": 1114.723, "eval_steps_per_second": 17.836, "step": 735000 }, { "epoch": 11.24, "learning_rate": 3.5024573513483864e-05, "loss": 0.2415, "step": 736000 }, { "epoch": 11.25, "learning_rate": 3.484743670625822e-05, "loss": 0.2414, "step": 737000 }, { "epoch": 11.27, "learning_rate": 3.467079368023068e-05, "loss": 0.2413, "step": 738000 }, { "epoch": 11.28, "learning_rate": 3.449464636714107e-05, "loss": 0.2415, "step": 739000 }, { "epoch": 11.3, "learning_rate": 3.431899669330819e-05, "loss": 0.2414, "step": 740000 }, { "epoch": 11.3, "eval_runtime": 0.7754, "eval_samples_per_second": 1289.598, "eval_steps_per_second": 20.634, "step": 740000 }, { "epoch": 11.32, "learning_rate": 3.4143846579608744e-05, "loss": 0.2411, "step": 741000 }, { "epoch": 11.33, "learning_rate": 3.396919794145629e-05, "loss": 0.2412, "step": 742000 }, { "epoch": 11.35, "learning_rate": 3.3795052688780345e-05, "loss": 0.241, "step": 743000 }, { "epoch": 11.36, "learning_rate": 3.362141272600552e-05, "loss": 0.2413, "step": 744000 }, { "epoch": 11.38, "learning_rate": 3.3448279952030615e-05, "loss": 0.241, "step": 745000 }, { "epoch": 11.38, "eval_runtime": 0.937, "eval_samples_per_second": 1067.221, "eval_steps_per_second": 17.076, "step": 745000 }, { "epoch": 11.39, "learning_rate": 3.327565626020793e-05, "loss": 0.2408, "step": 746000 }, { "epoch": 11.41, "learning_rate": 3.3103543538322455e-05, "loss": 0.2408, "step": 747000 }, { "epoch": 11.42, "learning_rate": 3.293194366857137e-05, "loss": 0.2407, "step": 748000 }, { "epoch": 11.44, "learning_rate": 3.276085852754336e-05, "loss": 0.2409, "step": 749000 }, { "epoch": 11.45, "learning_rate": 3.259028998619814e-05, "loss": 0.2405, "step": 750000 }, { "epoch": 11.45, "eval_runtime": 0.7243, "eval_samples_per_second": 1380.717, "eval_steps_per_second": 22.091, "step": 750000 }, { "epoch": 11.47, "learning_rate": 3.2420239909845894e-05, "loss": 0.2403, "step": 751000 }, { "epoch": 11.48, "learning_rate": 3.2250710158127045e-05, "loss": 0.2402, "step": 752000 }, { "epoch": 11.5, "learning_rate": 3.2081702584991786e-05, "loss": 0.2398, "step": 753000 }, { "epoch": 11.51, "learning_rate": 3.191321903867988e-05, "loss": 0.2401, "step": 754000 }, { "epoch": 11.53, "learning_rate": 3.174526136170039e-05, "loss": 0.2403, "step": 755000 }, { "epoch": 11.53, "eval_runtime": 0.695, "eval_samples_per_second": 1438.835, "eval_steps_per_second": 23.021, "step": 755000 }, { "epoch": 11.54, "learning_rate": 3.157783139081155e-05, "loss": 0.24, "step": 756000 }, { "epoch": 11.56, "learning_rate": 3.141093095700072e-05, "loss": 0.2401, "step": 757000 }, { "epoch": 11.57, "learning_rate": 3.1244561885464244e-05, "loss": 0.252, "step": 758000 }, { "epoch": 11.59, "learning_rate": 3.107872599558769e-05, "loss": 0.24, "step": 759000 }, { "epoch": 11.61, "learning_rate": 3.0913425100925795e-05, "loss": 0.2396, "step": 760000 }, { "epoch": 11.61, "eval_runtime": 0.7192, "eval_samples_per_second": 1390.499, "eval_steps_per_second": 22.248, "step": 760000 }, { "epoch": 11.62, "learning_rate": 3.0748661009182616e-05, "loss": 0.2396, "step": 761000 }, { "epoch": 11.64, "learning_rate": 3.0584435522191896e-05, "loss": 0.2395, "step": 762000 }, { "epoch": 11.65, "learning_rate": 3.0420750435897183e-05, "loss": 0.2393, "step": 763000 }, { "epoch": 11.67, "learning_rate": 3.025760754033246e-05, "loss": 0.239, "step": 764000 }, { "epoch": 11.68, "learning_rate": 3.0095008619602206e-05, "loss": 0.2392, "step": 765000 }, { "epoch": 11.68, "eval_runtime": 0.7905, "eval_samples_per_second": 1264.968, "eval_steps_per_second": 20.239, "step": 765000 }, { "epoch": 11.7, "learning_rate": 2.993295545186223e-05, "loss": 0.2393, "step": 766000 }, { "epoch": 11.71, "learning_rate": 2.977144980929996e-05, "loss": 0.2392, "step": 767000 }, { "epoch": 11.73, "learning_rate": 2.961049345811523e-05, "loss": 0.2388, "step": 768000 }, { "epoch": 11.74, "learning_rate": 2.945008815850097e-05, "loss": 0.2392, "step": 769000 }, { "epoch": 11.76, "learning_rate": 2.929023566462377e-05, "loss": 0.2391, "step": 770000 }, { "epoch": 11.76, "eval_runtime": 0.8418, "eval_samples_per_second": 1187.898, "eval_steps_per_second": 19.006, "step": 770000 }, { "epoch": 11.77, "learning_rate": 2.9130937724604947e-05, "loss": 0.2401, "step": 771000 }, { "epoch": 11.79, "learning_rate": 2.8972196080501208e-05, "loss": 0.2392, "step": 772000 }, { "epoch": 11.8, "learning_rate": 2.8814012468285748e-05, "loss": 0.2395, "step": 773000 }, { "epoch": 11.82, "learning_rate": 2.865638861782922e-05, "loss": 0.2387, "step": 774000 }, { "epoch": 11.83, "learning_rate": 2.849932625288079e-05, "loss": 0.2383, "step": 775000 }, { "epoch": 11.83, "eval_runtime": 0.7561, "eval_samples_per_second": 1322.637, "eval_steps_per_second": 21.162, "step": 775000 }, { "epoch": 11.85, "learning_rate": 2.8342827091049336e-05, "loss": 0.2383, "step": 776000 }, { "epoch": 11.86, "learning_rate": 2.8186892843784587e-05, "loss": 0.2384, "step": 777000 }, { "epoch": 11.88, "learning_rate": 2.803152521635851e-05, "loss": 0.2382, "step": 778000 }, { "epoch": 11.9, "learning_rate": 2.7876725907846578e-05, "loss": 0.2378, "step": 779000 }, { "epoch": 11.91, "learning_rate": 2.7722496611109243e-05, "loss": 0.2378, "step": 780000 }, { "epoch": 11.91, "eval_runtime": 0.7835, "eval_samples_per_second": 1276.363, "eval_steps_per_second": 20.422, "step": 780000 }, { "epoch": 11.93, "learning_rate": 2.7568839012773365e-05, "loss": 0.238, "step": 781000 }, { "epoch": 11.94, "learning_rate": 2.7415754793213826e-05, "loss": 0.2375, "step": 782000 }, { "epoch": 11.96, "learning_rate": 2.7263245626535116e-05, "loss": 0.2377, "step": 783000 }, { "epoch": 11.97, "learning_rate": 2.7111313180553077e-05, "loss": 0.2378, "step": 784000 }, { "epoch": 11.99, "learning_rate": 2.6959959116776587e-05, "loss": 0.2376, "step": 785000 }, { "epoch": 11.99, "eval_runtime": 0.7664, "eval_samples_per_second": 1304.853, "eval_steps_per_second": 20.878, "step": 785000 }, { "epoch": 12.0, "learning_rate": 2.6809185090389406e-05, "loss": 0.2371, "step": 786000 }, { "epoch": 12.02, "learning_rate": 2.6658992750232167e-05, "loss": 0.2373, "step": 787000 }, { "epoch": 12.03, "learning_rate": 2.6509383738784218e-05, "loss": 0.2374, "step": 788000 }, { "epoch": 12.05, "learning_rate": 2.6360359692145757e-05, "loss": 0.237, "step": 789000 }, { "epoch": 12.06, "learning_rate": 2.6211922240019883e-05, "loss": 0.2368, "step": 790000 }, { "epoch": 12.06, "eval_runtime": 0.7543, "eval_samples_per_second": 1325.719, "eval_steps_per_second": 21.212, "step": 790000 }, { "epoch": 12.08, "learning_rate": 2.6064073005694758e-05, "loss": 0.2381, "step": 791000 }, { "epoch": 12.09, "learning_rate": 2.591681360602595e-05, "loss": 0.2373, "step": 792000 }, { "epoch": 12.11, "learning_rate": 2.577014565141866e-05, "loss": 0.2377, "step": 793000 }, { "epoch": 12.12, "learning_rate": 2.562407074581014e-05, "loss": 0.2382, "step": 794000 }, { "epoch": 12.14, "learning_rate": 2.5478590486652137e-05, "loss": 0.2374, "step": 795000 }, { "epoch": 12.14, "eval_runtime": 0.8227, "eval_samples_per_second": 1215.581, "eval_steps_per_second": 19.449, "step": 795000 }, { "epoch": 12.16, "learning_rate": 2.533370646489347e-05, "loss": 0.237, "step": 796000 }, { "epoch": 12.17, "learning_rate": 2.5189420264962586e-05, "loss": 0.2367, "step": 797000 }, { "epoch": 12.19, "learning_rate": 2.504573346475026e-05, "loss": 0.2371, "step": 798000 }, { "epoch": 12.2, "learning_rate": 2.4902647635592324e-05, "loss": 0.2372, "step": 799000 }, { "epoch": 12.22, "learning_rate": 2.476016434225246e-05, "loss": 0.2372, "step": 800000 }, { "epoch": 12.22, "eval_runtime": 0.741, "eval_samples_per_second": 1349.61, "eval_steps_per_second": 21.594, "step": 800000 }, { "epoch": 12.23, "learning_rate": 2.461828514290513e-05, "loss": 0.2364, "step": 801000 }, { "epoch": 12.25, "learning_rate": 2.447701158911855e-05, "loss": 0.2373, "step": 802000 }, { "epoch": 12.26, "learning_rate": 2.4336345225837658e-05, "loss": 0.2369, "step": 803000 }, { "epoch": 12.28, "learning_rate": 2.4196287591367296e-05, "loss": 0.2363, "step": 804000 }, { "epoch": 12.29, "learning_rate": 2.405684021735527e-05, "loss": 0.2366, "step": 805000 }, { "epoch": 12.29, "eval_runtime": 0.7797, "eval_samples_per_second": 1282.575, "eval_steps_per_second": 20.521, "step": 805000 }, { "epoch": 12.31, "learning_rate": 2.3918004628775736e-05, "loss": 0.2366, "step": 806000 }, { "epoch": 12.32, "learning_rate": 2.3779782343912463e-05, "loss": 0.2367, "step": 807000 }, { "epoch": 12.34, "learning_rate": 2.364217487434221e-05, "loss": 0.24, "step": 808000 }, { "epoch": 12.35, "learning_rate": 2.3505183724918196e-05, "loss": 0.2369, "step": 809000 }, { "epoch": 12.37, "learning_rate": 2.3368810393753687e-05, "loss": 0.2365, "step": 810000 }, { "epoch": 12.37, "eval_runtime": 0.7457, "eval_samples_per_second": 1341.078, "eval_steps_per_second": 21.457, "step": 810000 }, { "epoch": 12.38, "learning_rate": 2.32330563722056e-05, "loss": 0.2357, "step": 811000 }, { "epoch": 12.4, "learning_rate": 2.309792314485815e-05, "loss": 0.2356, "step": 812000 }, { "epoch": 12.41, "learning_rate": 2.2963412189506695e-05, "loss": 0.2358, "step": 813000 }, { "epoch": 12.43, "learning_rate": 2.282952497714145e-05, "loss": 0.2356, "step": 814000 }, { "epoch": 12.45, "learning_rate": 2.2696262971931538e-05, "loss": 0.2357, "step": 815000 }, { "epoch": 12.45, "eval_runtime": 0.7163, "eval_samples_per_second": 1396.105, "eval_steps_per_second": 22.338, "step": 815000 }, { "epoch": 12.46, "learning_rate": 2.2563627631208887e-05, "loss": 0.2355, "step": 816000 }, { "epoch": 12.48, "learning_rate": 2.2431620405452336e-05, "loss": 0.2351, "step": 817000 }, { "epoch": 12.49, "learning_rate": 2.230024273827179e-05, "loss": 0.2357, "step": 818000 }, { "epoch": 12.51, "learning_rate": 2.216949606639231e-05, "loss": 0.2353, "step": 819000 }, { "epoch": 12.52, "learning_rate": 2.2039381819638596e-05, "loss": 0.2351, "step": 820000 }, { "epoch": 12.52, "eval_runtime": 0.6211, "eval_samples_per_second": 1609.99, "eval_steps_per_second": 25.76, "step": 820000 }, { "epoch": 12.54, "learning_rate": 2.1909901420919184e-05, "loss": 0.2351, "step": 821000 }, { "epoch": 12.55, "learning_rate": 2.1781056286210997e-05, "loss": 0.235, "step": 822000 }, { "epoch": 12.57, "learning_rate": 2.1652847824543744e-05, "loss": 0.2347, "step": 823000 }, { "epoch": 12.58, "learning_rate": 2.1525277437984636e-05, "loss": 0.2348, "step": 824000 }, { "epoch": 12.6, "learning_rate": 2.1398346521623e-05, "loss": 0.2345, "step": 825000 }, { "epoch": 12.6, "eval_runtime": 0.7605, "eval_samples_per_second": 1314.934, "eval_steps_per_second": 21.039, "step": 825000 }, { "epoch": 12.61, "learning_rate": 2.1272056463554978e-05, "loss": 0.2343, "step": 826000 }, { "epoch": 12.63, "learning_rate": 2.114640864486845e-05, "loss": 0.2346, "step": 827000 }, { "epoch": 12.64, "learning_rate": 2.1021404439627775e-05, "loss": 0.2344, "step": 828000 }, { "epoch": 12.66, "learning_rate": 2.089704521485896e-05, "loss": 0.2344, "step": 829000 }, { "epoch": 12.67, "learning_rate": 2.0773332330534513e-05, "loss": 0.2343, "step": 830000 }, { "epoch": 12.67, "eval_runtime": 0.7327, "eval_samples_per_second": 1364.889, "eval_steps_per_second": 21.838, "step": 830000 }, { "epoch": 12.69, "learning_rate": 2.0650267139558772e-05, "loss": 0.2339, "step": 831000 }, { "epoch": 12.7, "learning_rate": 2.052785098775293e-05, "loss": 0.2339, "step": 832000 }, { "epoch": 12.72, "learning_rate": 2.04060852138404e-05, "loss": 0.234, "step": 833000 }, { "epoch": 12.74, "learning_rate": 2.028497114943219e-05, "loss": 0.234, "step": 834000 }, { "epoch": 12.75, "learning_rate": 2.0164510119012263e-05, "loss": 0.2338, "step": 835000 }, { "epoch": 12.75, "eval_runtime": 0.7099, "eval_samples_per_second": 1408.578, "eval_steps_per_second": 22.537, "step": 835000 }, { "epoch": 12.77, "learning_rate": 2.0044703439923217e-05, "loss": 0.2336, "step": 836000 }, { "epoch": 12.78, "learning_rate": 1.9925552422351654e-05, "loss": 0.2338, "step": 837000 }, { "epoch": 12.8, "learning_rate": 1.9807058369314016e-05, "loss": 0.2335, "step": 838000 }, { "epoch": 12.81, "learning_rate": 1.968922257664231e-05, "loss": 0.2337, "step": 839000 }, { "epoch": 12.83, "learning_rate": 1.9572046332969825e-05, "loss": 0.2335, "step": 840000 }, { "epoch": 12.83, "eval_runtime": 0.7491, "eval_samples_per_second": 1334.897, "eval_steps_per_second": 21.358, "step": 840000 }, { "epoch": 12.84, "learning_rate": 1.945553091971727e-05, "loss": 0.2334, "step": 841000 }, { "epoch": 12.86, "learning_rate": 1.933967761107847e-05, "loss": 0.234, "step": 842000 }, { "epoch": 12.87, "learning_rate": 1.9224487674006694e-05, "loss": 0.234, "step": 843000 }, { "epoch": 12.89, "learning_rate": 1.9109962368200602e-05, "loss": 0.2379, "step": 844000 }, { "epoch": 12.9, "learning_rate": 1.8996102946090586e-05, "loss": 0.2335, "step": 845000 }, { "epoch": 12.9, "eval_runtime": 0.7039, "eval_samples_per_second": 1420.612, "eval_steps_per_second": 22.73, "step": 845000 }, { "epoch": 12.92, "learning_rate": 1.888291065282509e-05, "loss": 0.2338, "step": 846000 }, { "epoch": 12.93, "learning_rate": 1.8770386726256865e-05, "loss": 0.2329, "step": 847000 }, { "epoch": 12.95, "learning_rate": 1.8658532396929565e-05, "loss": 0.2334, "step": 848000 }, { "epoch": 12.96, "learning_rate": 1.8547348888064178e-05, "loss": 0.2341, "step": 849000 }, { "epoch": 12.98, "learning_rate": 1.8436837415545772e-05, "loss": 0.2356, "step": 850000 }, { "epoch": 12.98, "eval_runtime": 0.8308, "eval_samples_per_second": 1203.685, "eval_steps_per_second": 19.259, "step": 850000 }, { "epoch": 12.99, "learning_rate": 1.8326999187910095e-05, "loss": 0.2342, "step": 851000 }, { "epoch": 13.01, "learning_rate": 1.8217835406330415e-05, "loss": 0.2344, "step": 852000 }, { "epoch": 13.03, "learning_rate": 1.810934726460436e-05, "loss": 0.2328, "step": 853000 }, { "epoch": 13.04, "learning_rate": 1.800153594914084e-05, "loss": 0.2326, "step": 854000 }, { "epoch": 13.06, "learning_rate": 1.7894402638947176e-05, "loss": 0.2325, "step": 855000 }, { "epoch": 13.06, "eval_runtime": 0.7234, "eval_samples_per_second": 1382.419, "eval_steps_per_second": 22.119, "step": 855000 }, { "epoch": 13.07, "learning_rate": 1.778794850561604e-05, "loss": 0.2327, "step": 856000 }, { "epoch": 13.09, "learning_rate": 1.7682174713312805e-05, "loss": 0.2326, "step": 857000 }, { "epoch": 13.1, "learning_rate": 1.75770824187627e-05, "loss": 0.2325, "step": 858000 }, { "epoch": 13.12, "learning_rate": 1.747267277123821e-05, "loss": 0.2327, "step": 859000 }, { "epoch": 13.13, "learning_rate": 1.7368946912546556e-05, "loss": 0.2329, "step": 860000 }, { "epoch": 13.13, "eval_runtime": 0.7568, "eval_samples_per_second": 1321.327, "eval_steps_per_second": 21.141, "step": 860000 }, { "epoch": 13.15, "learning_rate": 1.726590597701708e-05, "loss": 0.2322, "step": 861000 }, { "epoch": 13.16, "learning_rate": 1.7163551091488952e-05, "loss": 0.2375, "step": 862000 }, { "epoch": 13.18, "learning_rate": 1.7061883375298788e-05, "loss": 0.2328, "step": 863000 }, { "epoch": 13.19, "learning_rate": 1.6960903940268456e-05, "loss": 0.2323, "step": 864000 }, { "epoch": 13.21, "learning_rate": 1.6860613890692876e-05, "loss": 0.2334, "step": 865000 }, { "epoch": 13.21, "eval_runtime": 0.7389, "eval_samples_per_second": 1353.416, "eval_steps_per_second": 21.655, "step": 865000 }, { "epoch": 13.22, "learning_rate": 1.6761014323327962e-05, "loss": 0.233, "step": 866000 }, { "epoch": 13.24, "learning_rate": 1.6662106327378645e-05, "loss": 0.2334, "step": 867000 }, { "epoch": 13.25, "learning_rate": 1.6563890984486884e-05, "loss": 0.2333, "step": 868000 }, { "epoch": 13.27, "learning_rate": 1.6466369368719955e-05, "loss": 0.2324, "step": 869000 }, { "epoch": 13.29, "learning_rate": 1.6369542546558626e-05, "loss": 0.2324, "step": 870000 }, { "epoch": 13.29, "eval_runtime": 0.8823, "eval_samples_per_second": 1133.455, "eval_steps_per_second": 18.135, "step": 870000 }, { "epoch": 13.3, "learning_rate": 1.6273411576885517e-05, "loss": 0.2323, "step": 871000 }, { "epoch": 13.32, "learning_rate": 1.617797751097349e-05, "loss": 0.2322, "step": 872000 }, { "epoch": 13.33, "learning_rate": 1.608324139247421e-05, "loss": 0.2335, "step": 873000 }, { "epoch": 13.35, "learning_rate": 1.5989204257406693e-05, "loss": 0.2329, "step": 874000 }, { "epoch": 13.36, "learning_rate": 1.5895867134145974e-05, "loss": 0.2325, "step": 875000 }, { "epoch": 13.36, "eval_runtime": 0.8114, "eval_samples_per_second": 1232.442, "eval_steps_per_second": 19.719, "step": 875000 }, { "epoch": 13.38, "learning_rate": 1.5803231043411912e-05, "loss": 0.2322, "step": 876000 }, { "epoch": 13.39, "learning_rate": 1.5711296998257902e-05, "loss": 0.232, "step": 877000 }, { "epoch": 13.41, "learning_rate": 1.562006600405996e-05, "loss": 0.2322, "step": 878000 }, { "epoch": 13.42, "learning_rate": 1.5529539058505624e-05, "loss": 0.2317, "step": 879000 }, { "epoch": 13.44, "learning_rate": 1.543971715158307e-05, "loss": 0.2318, "step": 880000 }, { "epoch": 13.44, "eval_runtime": 0.8591, "eval_samples_per_second": 1164.009, "eval_steps_per_second": 18.624, "step": 880000 }, { "epoch": 13.45, "learning_rate": 1.535060126557028e-05, "loss": 0.2322, "step": 881000 }, { "epoch": 13.47, "learning_rate": 1.5262192375024284e-05, "loss": 0.232, "step": 882000 }, { "epoch": 13.48, "learning_rate": 1.5174491446770566e-05, "loss": 0.2314, "step": 883000 }, { "epoch": 13.5, "learning_rate": 1.508749943989242e-05, "loss": 0.2312, "step": 884000 }, { "epoch": 13.51, "learning_rate": 1.500121730572051e-05, "loss": 0.2314, "step": 885000 }, { "epoch": 13.51, "eval_runtime": 0.7508, "eval_samples_per_second": 1331.906, "eval_steps_per_second": 21.31, "step": 885000 }, { "epoch": 13.53, "learning_rate": 1.4915645987822406e-05, "loss": 0.2314, "step": 886000 }, { "epoch": 13.54, "learning_rate": 1.4830786421992347e-05, "loss": 0.2316, "step": 887000 }, { "epoch": 13.56, "learning_rate": 1.4746639536240942e-05, "loss": 0.2312, "step": 888000 }, { "epoch": 13.58, "learning_rate": 1.4663206250785055e-05, "loss": 0.2315, "step": 889000 }, { "epoch": 13.59, "learning_rate": 1.4580487478037748e-05, "loss": 0.2311, "step": 890000 }, { "epoch": 13.59, "eval_runtime": 0.7331, "eval_samples_per_second": 1364.004, "eval_steps_per_second": 21.824, "step": 890000 }, { "epoch": 13.61, "learning_rate": 1.4498484122598232e-05, "loss": 0.2308, "step": 891000 }, { "epoch": 13.62, "learning_rate": 1.4417197081242083e-05, "loss": 0.2305, "step": 892000 }, { "epoch": 13.64, "learning_rate": 1.433662724291136e-05, "loss": 0.2313, "step": 893000 }, { "epoch": 13.65, "learning_rate": 1.4256775488704904e-05, "loss": 0.2311, "step": 894000 }, { "epoch": 13.67, "learning_rate": 1.4177642691868717e-05, "loss": 0.231, "step": 895000 }, { "epoch": 13.67, "eval_runtime": 0.7236, "eval_samples_per_second": 1382.053, "eval_steps_per_second": 22.113, "step": 895000 }, { "epoch": 13.68, "learning_rate": 1.4099229717786368e-05, "loss": 0.231, "step": 896000 }, { "epoch": 13.7, "learning_rate": 1.4021537423969588e-05, "loss": 0.2317, "step": 897000 }, { "epoch": 13.71, "learning_rate": 1.3944566660048863e-05, "loss": 0.2308, "step": 898000 }, { "epoch": 13.73, "learning_rate": 1.3868318267764128e-05, "loss": 0.2309, "step": 899000 }, { "epoch": 13.74, "learning_rate": 1.3792793080955574e-05, "loss": 0.2308, "step": 900000 }, { "epoch": 13.74, "eval_runtime": 0.7542, "eval_samples_per_second": 1325.982, "eval_steps_per_second": 21.216, "step": 900000 }, { "epoch": 13.76, "learning_rate": 1.3717991925554562e-05, "loss": 0.2308, "step": 901000 }, { "epoch": 13.77, "learning_rate": 1.3643915619574529e-05, "loss": 0.2305, "step": 902000 }, { "epoch": 13.79, "learning_rate": 1.35705649731021e-05, "loss": 0.2304, "step": 903000 }, { "epoch": 13.8, "learning_rate": 1.3497940788288195e-05, "loss": 0.2301, "step": 904000 }, { "epoch": 13.82, "learning_rate": 1.3426043859339253e-05, "loss": 0.2304, "step": 905000 }, { "epoch": 13.82, "eval_runtime": 0.9802, "eval_samples_per_second": 1020.243, "eval_steps_per_second": 16.324, "step": 905000 }, { "epoch": 13.83, "learning_rate": 1.3354874972508582e-05, "loss": 0.2302, "step": 906000 }, { "epoch": 13.85, "learning_rate": 1.3284434906087695e-05, "loss": 0.2303, "step": 907000 }, { "epoch": 13.87, "learning_rate": 1.3214724430397915e-05, "loss": 0.2304, "step": 908000 }, { "epoch": 13.88, "learning_rate": 1.314574430778182e-05, "loss": 0.2302, "step": 909000 }, { "epoch": 13.9, "learning_rate": 1.3077495292594966e-05, "loss": 0.2305, "step": 910000 }, { "epoch": 13.9, "eval_runtime": 0.7262, "eval_samples_per_second": 1377.03, "eval_steps_per_second": 22.032, "step": 910000 }, { "epoch": 13.91, "learning_rate": 1.3009978131197669e-05, "loss": 0.2314, "step": 911000 }, { "epoch": 13.93, "learning_rate": 1.2943193561946762e-05, "loss": 0.2304, "step": 912000 }, { "epoch": 13.94, "learning_rate": 1.2877142315187628e-05, "loss": 0.2299, "step": 913000 }, { "epoch": 13.96, "learning_rate": 1.28118251132461e-05, "loss": 0.23, "step": 914000 }, { "epoch": 13.97, "learning_rate": 1.274724267042063e-05, "loss": 0.2299, "step": 915000 }, { "epoch": 13.97, "eval_runtime": 0.795, "eval_samples_per_second": 1257.794, "eval_steps_per_second": 20.125, "step": 915000 }, { "epoch": 13.99, "learning_rate": 1.2683395692974472e-05, "loss": 0.23, "step": 916000 }, { "epoch": 14.0, "learning_rate": 1.2620284879127947e-05, "loss": 0.23, "step": 917000 }, { "epoch": 14.02, "learning_rate": 1.2557910919050803e-05, "loss": 0.2295, "step": 918000 }, { "epoch": 14.03, "learning_rate": 1.2496274494854666e-05, "loss": 0.2296, "step": 919000 }, { "epoch": 14.05, "learning_rate": 1.24353762805856e-05, "loss": 0.2297, "step": 920000 }, { "epoch": 14.05, "eval_runtime": 0.7692, "eval_samples_per_second": 1300.053, "eval_steps_per_second": 20.801, "step": 920000 }, { "epoch": 14.06, "learning_rate": 1.2375216942216713e-05, "loss": 0.2306, "step": 921000 }, { "epoch": 14.08, "learning_rate": 1.2315797137640906e-05, "loss": 0.2298, "step": 922000 }, { "epoch": 14.09, "learning_rate": 1.225711751666363e-05, "loss": 0.2295, "step": 923000 }, { "epoch": 14.11, "learning_rate": 1.2199178720995825e-05, "loss": 0.2299, "step": 924000 }, { "epoch": 14.12, "learning_rate": 1.2141981384246874e-05, "loss": 0.23, "step": 925000 }, { "epoch": 14.12, "eval_runtime": 0.827, "eval_samples_per_second": 1209.23, "eval_steps_per_second": 19.348, "step": 925000 }, { "epoch": 14.14, "learning_rate": 1.2085526131917685e-05, "loss": 0.2294, "step": 926000 }, { "epoch": 14.16, "learning_rate": 1.2029813581393866e-05, "loss": 0.2289, "step": 927000 }, { "epoch": 14.17, "learning_rate": 1.197484434193893e-05, "loss": 0.2295, "step": 928000 }, { "epoch": 14.19, "learning_rate": 1.192061901468768e-05, "loss": 0.2293, "step": 929000 }, { "epoch": 14.2, "learning_rate": 1.1867138192639601e-05, "loss": 0.2293, "step": 930000 }, { "epoch": 14.2, "eval_runtime": 0.9644, "eval_samples_per_second": 1036.936, "eval_steps_per_second": 16.591, "step": 930000 }, { "epoch": 14.22, "learning_rate": 1.1814402460652382e-05, "loss": 0.2291, "step": 931000 }, { "epoch": 14.23, "learning_rate": 1.176241239543558e-05, "loss": 0.229, "step": 932000 }, { "epoch": 14.25, "learning_rate": 1.171116856554418e-05, "loss": 0.2291, "step": 933000 }, { "epoch": 14.26, "learning_rate": 1.1660671531372517e-05, "loss": 0.2301, "step": 934000 }, { "epoch": 14.28, "learning_rate": 1.1610921845148052e-05, "loss": 0.2295, "step": 935000 }, { "epoch": 14.28, "eval_runtime": 0.8534, "eval_samples_per_second": 1171.832, "eval_steps_per_second": 18.749, "step": 935000 }, { "epoch": 14.29, "learning_rate": 1.156192005092539e-05, "loss": 0.2301, "step": 936000 }, { "epoch": 14.31, "learning_rate": 1.1513666684580308e-05, "loss": 0.2291, "step": 937000 }, { "epoch": 14.32, "learning_rate": 1.1466162273803876e-05, "loss": 0.2292, "step": 938000 }, { "epoch": 14.34, "learning_rate": 1.1419407338096732e-05, "loss": 0.2287, "step": 939000 }, { "epoch": 14.35, "learning_rate": 1.1373402388763346e-05, "loss": 0.2286, "step": 940000 }, { "epoch": 14.35, "eval_runtime": 0.7875, "eval_samples_per_second": 1269.803, "eval_steps_per_second": 20.317, "step": 940000 }, { "epoch": 14.37, "learning_rate": 1.1328147928906494e-05, "loss": 0.2287, "step": 941000 }, { "epoch": 14.38, "learning_rate": 1.1283644453421678e-05, "loss": 0.2289, "step": 942000 }, { "epoch": 14.4, "learning_rate": 1.1239892448991798e-05, "loss": 0.2284, "step": 943000 }, { "epoch": 14.42, "learning_rate": 1.1196892394081743e-05, "loss": 0.2287, "step": 944000 }, { "epoch": 14.43, "learning_rate": 1.1154644758933235e-05, "loss": 0.2285, "step": 945000 }, { "epoch": 14.43, "eval_runtime": 0.7294, "eval_samples_per_second": 1370.909, "eval_steps_per_second": 21.935, "step": 945000 }, { "epoch": 14.45, "learning_rate": 1.1113150005559644e-05, "loss": 0.2283, "step": 946000 }, { "epoch": 14.46, "learning_rate": 1.1072408587740942e-05, "loss": 0.2282, "step": 947000 }, { "epoch": 14.48, "learning_rate": 1.1032420951018755e-05, "loss": 0.228, "step": 948000 }, { "epoch": 14.49, "learning_rate": 1.0993187532691458e-05, "loss": 0.2281, "step": 949000 }, { "epoch": 14.51, "learning_rate": 1.0954708761809438e-05, "loss": 0.2281, "step": 950000 }, { "epoch": 14.51, "eval_runtime": 0.7692, "eval_samples_per_second": 1300.007, "eval_steps_per_second": 20.8, "step": 950000 }, { "epoch": 14.52, "learning_rate": 1.091698505917036e-05, "loss": 0.2281, "step": 951000 }, { "epoch": 14.54, "learning_rate": 1.0880016837314599e-05, "loss": 0.2283, "step": 952000 }, { "epoch": 14.55, "learning_rate": 1.084380450052071e-05, "loss": 0.2281, "step": 953000 }, { "epoch": 14.57, "learning_rate": 1.0808348444801e-05, "loss": 0.2278, "step": 954000 }, { "epoch": 14.58, "learning_rate": 1.0773649057897206e-05, "loss": 0.2283, "step": 955000 }, { "epoch": 14.58, "eval_runtime": 0.7689, "eval_samples_per_second": 1300.511, "eval_steps_per_second": 20.808, "step": 955000 }, { "epoch": 14.6, "learning_rate": 1.073970671927628e-05, "loss": 0.2277, "step": 956000 }, { "epoch": 14.61, "learning_rate": 1.0706521800126198e-05, "loss": 0.2279, "step": 957000 }, { "epoch": 14.63, "learning_rate": 1.0674094663351906e-05, "loss": 0.2278, "step": 958000 }, { "epoch": 14.64, "learning_rate": 1.0642425663571383e-05, "loss": 0.2279, "step": 959000 }, { "epoch": 14.66, "learning_rate": 1.0611515147111736e-05, "loss": 0.2279, "step": 960000 }, { "epoch": 14.66, "eval_runtime": 0.8148, "eval_samples_per_second": 1227.238, "eval_steps_per_second": 19.636, "step": 960000 }, { "epoch": 14.67, "learning_rate": 1.0581363452005424e-05, "loss": 0.2279, "step": 961000 }, { "epoch": 14.69, "learning_rate": 1.0551970907986557e-05, "loss": 0.2277, "step": 962000 }, { "epoch": 14.71, "learning_rate": 1.0523337836487271e-05, "loss": 0.2276, "step": 963000 }, { "epoch": 14.72, "learning_rate": 1.0495464550634267e-05, "loss": 0.2278, "step": 964000 }, { "epoch": 14.74, "learning_rate": 1.046835135524533e-05, "loss": 0.2277, "step": 965000 }, { "epoch": 14.74, "eval_runtime": 0.7884, "eval_samples_per_second": 1268.404, "eval_steps_per_second": 20.294, "step": 965000 }, { "epoch": 14.75, "learning_rate": 1.044199854682601e-05, "loss": 0.2278, "step": 966000 }, { "epoch": 14.77, "learning_rate": 1.0416406413566414e-05, "loss": 0.2279, "step": 967000 }, { "epoch": 14.78, "learning_rate": 1.0391575235337991e-05, "loss": 0.2278, "step": 968000 }, { "epoch": 14.8, "learning_rate": 1.0367505283690547e-05, "loss": 0.2276, "step": 969000 }, { "epoch": 14.81, "learning_rate": 1.0344196821849202e-05, "loss": 0.2279, "step": 970000 }, { "epoch": 14.81, "eval_runtime": 0.7534, "eval_samples_per_second": 1327.252, "eval_steps_per_second": 21.236, "step": 970000 }, { "epoch": 14.83, "learning_rate": 1.032165010471157e-05, "loss": 0.2277, "step": 971000 }, { "epoch": 14.84, "learning_rate": 1.0299865378844936e-05, "loss": 0.2275, "step": 972000 }, { "epoch": 14.86, "learning_rate": 1.0278842882483569e-05, "loss": 0.2275, "step": 973000 }, { "epoch": 14.87, "learning_rate": 1.025858284552612e-05, "loss": 0.2276, "step": 974000 }, { "epoch": 14.89, "learning_rate": 1.023908548953311e-05, "loss": 0.2275, "step": 975000 }, { "epoch": 14.89, "eval_runtime": 0.7861, "eval_samples_per_second": 1272.066, "eval_steps_per_second": 20.353, "step": 975000 }, { "epoch": 14.9, "learning_rate": 1.02203510277245e-05, "loss": 0.2276, "step": 976000 }, { "epoch": 14.92, "learning_rate": 1.0202379664977364e-05, "loss": 0.2272, "step": 977000 }, { "epoch": 14.93, "learning_rate": 1.018517159782365e-05, "loss": 0.2274, "step": 978000 }, { "epoch": 14.95, "learning_rate": 1.0168727014448004e-05, "loss": 0.2272, "step": 979000 }, { "epoch": 14.96, "learning_rate": 1.0153046094685783e-05, "loss": 0.227, "step": 980000 }, { "epoch": 14.96, "eval_runtime": 0.7489, "eval_samples_per_second": 1335.226, "eval_steps_per_second": 21.364, "step": 980000 }, { "epoch": 14.98, "learning_rate": 1.0138129010020992e-05, "loss": 0.2272, "step": 981000 }, { "epoch": 15.0, "learning_rate": 1.0123975923584488e-05, "loss": 0.2273, "step": 982000 }, { "epoch": 15.01, "learning_rate": 1.0110586990152152e-05, "loss": 0.227, "step": 983000 }, { "epoch": 15.03, "learning_rate": 1.0097962356143219e-05, "loss": 0.2273, "step": 984000 }, { "epoch": 15.04, "learning_rate": 1.0086102159618668e-05, "loss": 0.227, "step": 985000 }, { "epoch": 15.04, "eval_runtime": 0.7868, "eval_samples_per_second": 1271.022, "eval_steps_per_second": 20.336, "step": 985000 }, { "epoch": 15.06, "learning_rate": 1.0075006530279694e-05, "loss": 0.2271, "step": 986000 }, { "epoch": 15.07, "learning_rate": 1.0064675589466339e-05, "loss": 0.2268, "step": 987000 }, { "epoch": 15.09, "learning_rate": 1.0055109450156098e-05, "loss": 0.2272, "step": 988000 }, { "epoch": 15.1, "learning_rate": 1.0046308216962759e-05, "loss": 0.2269, "step": 989000 }, { "epoch": 15.12, "learning_rate": 1.0038271986135177e-05, "loss": 0.2272, "step": 990000 }, { "epoch": 15.12, "eval_runtime": 0.7713, "eval_samples_per_second": 1296.438, "eval_steps_per_second": 20.743, "step": 990000 }, { "epoch": 15.13, "learning_rate": 1.0031000845556304e-05, "loss": 0.2272, "step": 991000 }, { "epoch": 15.15, "learning_rate": 1.0024494874742152e-05, "loss": 0.2272, "step": 992000 }, { "epoch": 15.16, "learning_rate": 1.0018754144840986e-05, "loss": 0.2272, "step": 993000 }, { "epoch": 15.18, "learning_rate": 1.0013778718632507e-05, "loss": 0.227, "step": 994000 }, { "epoch": 15.19, "learning_rate": 1.000956865052717e-05, "loss": 0.2269, "step": 995000 }, { "epoch": 15.19, "eval_runtime": 0.7194, "eval_samples_per_second": 1390.055, "eval_steps_per_second": 22.241, "step": 995000 }, { "epoch": 15.21, "learning_rate": 1.0006123986565623e-05, "loss": 0.2267, "step": 996000 }, { "epoch": 15.22, "learning_rate": 1.0003444764418138e-05, "loss": 0.2265, "step": 997000 }, { "epoch": 15.24, "learning_rate": 1.000153101338428e-05, "loss": 0.2268, "step": 998000 }, { "epoch": 15.25, "learning_rate": 1.00003827543925e-05, "loss": 0.2269, "step": 999000 }, { "epoch": 15.27, "learning_rate": 1e-05, "loss": 0.2268, "step": 1000000 }, { "epoch": 15.27, "eval_runtime": 0.8245, "eval_samples_per_second": 1212.903, "eval_steps_per_second": 19.406, "step": 1000000 } ], "max_steps": 1000000, "num_train_epochs": 16, "total_flos": 7.010016247012483e+22, "trial_name": null, "trial_params": null }