{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.511376783648284, "global_step": 24000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ep_loss": 0.0, "epoch": 0.01, "learning_rate": 5e-07, "loss": 10.4763, "mlm_loss": 10.4763, "step": 10 }, { "ep_loss": 0.0, "epoch": 0.02, "learning_rate": 1e-06, "loss": 10.3052, "mlm_loss": 10.3052, "step": 20 }, { "ep_loss": 0.0, "epoch": 0.02, "learning_rate": 1.5e-06, "loss": 9.9641, "mlm_loss": 9.9641, "step": 30 }, { "ep_loss": 0.0, "epoch": 0.03, "learning_rate": 2e-06, "loss": 9.5354, "mlm_loss": 9.5354, "step": 40 }, { "ep_loss": 0.0, "epoch": 0.04, "learning_rate": 2.5e-06, "loss": 9.1346, "mlm_loss": 9.1346, "step": 50 }, { "ep_loss": 0.0, "epoch": 0.05, "learning_rate": 3e-06, "loss": 8.8162, "mlm_loss": 8.8162, "step": 60 }, { "ep_loss": 0.0, "epoch": 0.05, "learning_rate": 3.5e-06, "loss": 8.558, "mlm_loss": 8.558, "step": 70 }, { "ep_loss": 0.0, "epoch": 0.06, "learning_rate": 4e-06, "loss": 8.3459, "mlm_loss": 8.3459, "step": 80 }, { "ep_loss": 0.0, "epoch": 0.07, "learning_rate": 4.5e-06, "loss": 8.175, "mlm_loss": 8.175, "step": 90 }, { "ep_loss": 0.0, "epoch": 0.08, "learning_rate": 5e-06, "loss": 8.0255, "mlm_loss": 8.0255, "step": 100 }, { "epoch": 0.08, "eval_ep_loss": -0.14438027143478394, "eval_loss": 7.826408386230469, "eval_mlm_loss": 7.826408386230469, "eval_runtime": 63.5165, "eval_samples_per_second": 1100.014, "eval_steps_per_second": 0.551, "step": 100 }, { "ep_loss": 0.0, "epoch": 0.08, "learning_rate": 5.5e-06, "loss": 7.8979, "mlm_loss": 7.8979, "step": 110 }, { "ep_loss": 0.0, "epoch": 0.09, "learning_rate": 6e-06, "loss": 7.7543, "mlm_loss": 7.7543, "step": 120 }, { "ep_loss": 0.0, "epoch": 0.1, "learning_rate": 6.5e-06, "loss": 7.5942, "mlm_loss": 7.5942, "step": 130 }, { "ep_loss": 0.0, "epoch": 0.11, "learning_rate": 7e-06, "loss": 7.4304, "mlm_loss": 7.4304, "step": 140 }, { "ep_loss": 0.0, "epoch": 0.12, "learning_rate": 7.5e-06, "loss": 7.1862, "mlm_loss": 7.1862, "step": 150 }, { "ep_loss": 0.0, "epoch": 0.12, "learning_rate": 8e-06, "loss": 6.924, "mlm_loss": 6.924, "step": 160 }, { "ep_loss": 0.0, "epoch": 0.13, "learning_rate": 8.500000000000002e-06, "loss": 6.6521, "mlm_loss": 6.6521, "step": 170 }, { "ep_loss": 0.0, "epoch": 0.14, "learning_rate": 9e-06, "loss": 6.3639, "mlm_loss": 6.3639, "step": 180 }, { "ep_loss": 0.0, "epoch": 0.15, "learning_rate": 9.5e-06, "loss": 6.0915, "mlm_loss": 6.0915, "step": 190 }, { "ep_loss": 0.0, "epoch": 0.15, "learning_rate": 1e-05, "loss": 5.8272, "mlm_loss": 5.8272, "step": 200 }, { "epoch": 0.15, "eval_ep_loss": -0.4913683831691742, "eval_loss": 5.531888008117676, "eval_mlm_loss": 5.531888008117676, "eval_runtime": 60.9938, "eval_samples_per_second": 1145.51, "eval_steps_per_second": 0.574, "step": 200 }, { "ep_loss": 0.0, "epoch": 0.16, "learning_rate": 1.0500000000000001e-05, "loss": 5.5749, "mlm_loss": 5.5749, "step": 210 }, { "ep_loss": 0.0, "epoch": 0.17, "learning_rate": 1.1e-05, "loss": 5.3413, "mlm_loss": 5.3413, "step": 220 }, { "ep_loss": 0.0, "epoch": 0.18, "learning_rate": 1.15e-05, "loss": 5.142, "mlm_loss": 5.142, "step": 230 }, { "ep_loss": 0.0, "epoch": 0.19, "learning_rate": 1.2e-05, "loss": 4.925, "mlm_loss": 4.925, "step": 240 }, { "ep_loss": 0.0, "epoch": 0.19, "learning_rate": 1.25e-05, "loss": 4.7334, "mlm_loss": 4.7334, "step": 250 }, { "ep_loss": 0.0, "epoch": 0.2, "learning_rate": 1.3e-05, "loss": 4.5702, "mlm_loss": 4.5702, "step": 260 }, { "ep_loss": 0.0, "epoch": 0.21, "learning_rate": 1.35e-05, "loss": 4.4388, "mlm_loss": 4.4388, "step": 270 }, { "ep_loss": 0.0, "epoch": 0.22, "learning_rate": 1.4e-05, "loss": 4.288, "mlm_loss": 4.288, "step": 280 }, { "ep_loss": 0.0, "epoch": 0.22, "learning_rate": 1.4500000000000002e-05, "loss": 4.1403, "mlm_loss": 4.1403, "step": 290 }, { "ep_loss": 0.0, "epoch": 0.23, "learning_rate": 1.5e-05, "loss": 4.0232, "mlm_loss": 4.0232, "step": 300 }, { "epoch": 0.23, "eval_ep_loss": -1.1010401248931885, "eval_loss": 3.8623688220977783, "eval_mlm_loss": 3.8623688220977783, "eval_runtime": 61.8594, "eval_samples_per_second": 1129.481, "eval_steps_per_second": 0.566, "step": 300 }, { "ep_loss": 0.0, "epoch": 0.24, "learning_rate": 1.55e-05, "loss": 3.9135, "mlm_loss": 3.9135, "step": 310 }, { "ep_loss": 0.0, "epoch": 0.25, "learning_rate": 1.6e-05, "loss": 3.8252, "mlm_loss": 3.8252, "step": 320 }, { "ep_loss": 0.0, "epoch": 0.25, "learning_rate": 1.65e-05, "loss": 3.7462, "mlm_loss": 3.7462, "step": 330 }, { "ep_loss": 0.0, "epoch": 0.26, "learning_rate": 1.7000000000000003e-05, "loss": 3.6491, "mlm_loss": 3.6491, "step": 340 }, { "ep_loss": 0.0, "epoch": 0.27, "learning_rate": 1.7500000000000002e-05, "loss": 3.5815, "mlm_loss": 3.5815, "step": 350 }, { "ep_loss": 0.0, "epoch": 0.28, "learning_rate": 1.8e-05, "loss": 3.5211, "mlm_loss": 3.5211, "step": 360 }, { "ep_loss": 0.0, "epoch": 0.29, "learning_rate": 1.85e-05, "loss": 3.452, "mlm_loss": 3.452, "step": 370 }, { "ep_loss": 0.0, "epoch": 0.29, "learning_rate": 1.9e-05, "loss": 3.3747, "mlm_loss": 3.3747, "step": 380 }, { "ep_loss": 0.0, "epoch": 0.3, "learning_rate": 1.95e-05, "loss": 3.3163, "mlm_loss": 3.3163, "step": 390 }, { "ep_loss": 0.0, "epoch": 0.31, "learning_rate": 2e-05, "loss": 3.2679, "mlm_loss": 3.2679, "step": 400 }, { "epoch": 0.31, "eval_ep_loss": -1.4546582698822021, "eval_loss": 3.1360955238342285, "eval_mlm_loss": 3.1360955238342285, "eval_runtime": 59.2888, "eval_samples_per_second": 1178.451, "eval_steps_per_second": 0.59, "step": 400 }, { "ep_loss": 0.0, "epoch": 0.32, "learning_rate": 2.05e-05, "loss": 3.1831, "mlm_loss": 3.1831, "step": 410 }, { "ep_loss": 0.0, "epoch": 0.32, "learning_rate": 2.1000000000000002e-05, "loss": 3.1246, "mlm_loss": 3.1246, "step": 420 }, { "ep_loss": 0.0, "epoch": 0.33, "learning_rate": 2.1499999999999997e-05, "loss": 3.0658, "mlm_loss": 3.0658, "step": 430 }, { "ep_loss": 0.0, "epoch": 0.34, "learning_rate": 2.2e-05, "loss": 3.0032, "mlm_loss": 3.0032, "step": 440 }, { "ep_loss": 0.0, "epoch": 0.35, "learning_rate": 2.2499999999999998e-05, "loss": 2.931, "mlm_loss": 2.931, "step": 450 }, { "ep_loss": 0.0, "epoch": 0.35, "learning_rate": 2.3e-05, "loss": 2.904, "mlm_loss": 2.904, "step": 460 }, { "ep_loss": 0.0, "epoch": 0.36, "learning_rate": 2.3500000000000002e-05, "loss": 2.8526, "mlm_loss": 2.8526, "step": 470 }, { "ep_loss": 0.0, "epoch": 0.37, "learning_rate": 2.4e-05, "loss": 2.8336, "mlm_loss": 2.8336, "step": 480 }, { "ep_loss": 0.0, "epoch": 0.38, "learning_rate": 2.4500000000000003e-05, "loss": 2.7953, "mlm_loss": 2.7953, "step": 490 }, { "ep_loss": 0.0, "epoch": 0.39, "learning_rate": 2.5e-05, "loss": 2.7569, "mlm_loss": 2.7569, "step": 500 }, { "epoch": 0.39, "eval_ep_loss": -1.5984127521514893, "eval_loss": 2.6636624336242676, "eval_mlm_loss": 2.6636624336242676, "eval_runtime": 64.3177, "eval_samples_per_second": 1086.31, "eval_steps_per_second": 0.544, "step": 500 }, { "ep_loss": 0.0, "epoch": 0.39, "learning_rate": 2.55e-05, "loss": 2.7157, "mlm_loss": 2.7157, "step": 510 }, { "ep_loss": 0.0, "epoch": 0.4, "learning_rate": 2.6e-05, "loss": 2.6857, "mlm_loss": 2.6857, "step": 520 }, { "ep_loss": 0.0, "epoch": 0.41, "learning_rate": 2.65e-05, "loss": 2.6796, "mlm_loss": 2.6796, "step": 530 }, { "ep_loss": 0.0, "epoch": 0.42, "learning_rate": 2.7e-05, "loss": 2.6449, "mlm_loss": 2.6449, "step": 540 }, { "ep_loss": 0.0, "epoch": 0.42, "learning_rate": 2.75e-05, "loss": 2.5932, "mlm_loss": 2.5932, "step": 550 }, { "ep_loss": 0.0, "epoch": 0.43, "learning_rate": 2.8e-05, "loss": 2.5536, "mlm_loss": 2.5536, "step": 560 }, { "ep_loss": 0.0, "epoch": 0.44, "learning_rate": 2.85e-05, "loss": 2.5271, "mlm_loss": 2.5271, "step": 570 }, { "ep_loss": 0.0, "epoch": 0.45, "learning_rate": 2.9000000000000004e-05, "loss": 2.5057, "mlm_loss": 2.5057, "step": 580 }, { "ep_loss": 0.0, "epoch": 0.46, "learning_rate": 2.95e-05, "loss": 2.476, "mlm_loss": 2.476, "step": 590 }, { "ep_loss": 0.0, "epoch": 0.46, "learning_rate": 3e-05, "loss": 2.4265, "mlm_loss": 2.4265, "step": 600 }, { "epoch": 0.46, "eval_ep_loss": -1.7573626041412354, "eval_loss": 2.359508752822876, "eval_mlm_loss": 2.359508752822876, "eval_runtime": 59.8647, "eval_samples_per_second": 1167.114, "eval_steps_per_second": 0.585, "step": 600 }, { "ep_loss": 0.0, "epoch": 0.47, "learning_rate": 3.05e-05, "loss": 2.4061, "mlm_loss": 2.4061, "step": 610 }, { "ep_loss": 0.0, "epoch": 0.48, "learning_rate": 3.1e-05, "loss": 2.4071, "mlm_loss": 2.4071, "step": 620 }, { "ep_loss": 0.0, "epoch": 0.49, "learning_rate": 3.15e-05, "loss": 2.3728, "mlm_loss": 2.3728, "step": 630 }, { "ep_loss": 0.0, "epoch": 0.49, "learning_rate": 3.2e-05, "loss": 2.3465, "mlm_loss": 2.3465, "step": 640 }, { "ep_loss": 0.0, "epoch": 0.5, "learning_rate": 3.2500000000000004e-05, "loss": 2.3065, "mlm_loss": 2.3065, "step": 650 }, { "ep_loss": 0.0, "epoch": 0.51, "learning_rate": 3.3e-05, "loss": 2.312, "mlm_loss": 2.312, "step": 660 }, { "ep_loss": 0.0, "epoch": 0.52, "learning_rate": 3.35e-05, "loss": 2.29, "mlm_loss": 2.29, "step": 670 }, { "ep_loss": 0.0, "epoch": 0.52, "learning_rate": 3.4000000000000007e-05, "loss": 2.2939, "mlm_loss": 2.2939, "step": 680 }, { "ep_loss": 0.0, "epoch": 0.53, "learning_rate": 3.4500000000000005e-05, "loss": 2.2496, "mlm_loss": 2.2496, "step": 690 }, { "ep_loss": 0.0, "epoch": 0.54, "learning_rate": 3.5000000000000004e-05, "loss": 2.2482, "mlm_loss": 2.2482, "step": 700 }, { "epoch": 0.54, "eval_ep_loss": -1.7850463390350342, "eval_loss": 2.1652746200561523, "eval_mlm_loss": 2.1652746200561523, "eval_runtime": 63.4238, "eval_samples_per_second": 1101.621, "eval_steps_per_second": 0.552, "step": 700 }, { "ep_loss": 0.0, "epoch": 0.55, "learning_rate": 3.5499999999999996e-05, "loss": 2.2379, "mlm_loss": 2.2379, "step": 710 }, { "ep_loss": 0.0, "epoch": 0.56, "learning_rate": 3.6e-05, "loss": 2.2088, "mlm_loss": 2.2088, "step": 720 }, { "ep_loss": 0.0, "epoch": 0.56, "learning_rate": 3.65e-05, "loss": 2.1767, "mlm_loss": 2.1767, "step": 730 }, { "ep_loss": 0.0, "epoch": 0.57, "learning_rate": 3.7e-05, "loss": 2.1589, "mlm_loss": 2.1589, "step": 740 }, { "ep_loss": 0.0, "epoch": 0.58, "learning_rate": 3.75e-05, "loss": 2.1297, "mlm_loss": 2.1297, "step": 750 }, { "ep_loss": 0.0, "epoch": 0.59, "learning_rate": 3.8e-05, "loss": 2.1128, "mlm_loss": 2.1128, "step": 760 }, { "ep_loss": 0.0, "epoch": 0.59, "learning_rate": 3.85e-05, "loss": 2.0861, "mlm_loss": 2.0861, "step": 770 }, { "ep_loss": 0.0, "epoch": 0.6, "learning_rate": 3.9e-05, "loss": 2.0557, "mlm_loss": 2.0557, "step": 780 }, { "ep_loss": 0.0, "epoch": 0.61, "learning_rate": 3.95e-05, "loss": 2.0596, "mlm_loss": 2.0596, "step": 790 }, { "ep_loss": 0.0, "epoch": 0.62, "learning_rate": 4e-05, "loss": 2.0349, "mlm_loss": 2.0349, "step": 800 }, { "epoch": 0.62, "eval_ep_loss": -1.7692768573760986, "eval_loss": 1.916082501411438, "eval_mlm_loss": 1.916082501411438, "eval_runtime": 62.7772, "eval_samples_per_second": 1112.968, "eval_steps_per_second": 0.558, "step": 800 }, { "ep_loss": 0.0, "epoch": 0.62, "learning_rate": 4.05e-05, "loss": 1.9934, "mlm_loss": 1.9934, "step": 810 }, { "ep_loss": 0.0, "epoch": 0.63, "learning_rate": 4.1e-05, "loss": 1.9554, "mlm_loss": 1.9554, "step": 820 }, { "ep_loss": 0.0, "epoch": 0.64, "learning_rate": 4.1500000000000006e-05, "loss": 1.9348, "mlm_loss": 1.9348, "step": 830 }, { "ep_loss": 0.0, "epoch": 0.65, "learning_rate": 4.2000000000000004e-05, "loss": 1.8978, "mlm_loss": 1.8978, "step": 840 }, { "ep_loss": 0.0, "epoch": 0.66, "learning_rate": 4.25e-05, "loss": 1.8752, "mlm_loss": 1.8752, "step": 850 }, { "ep_loss": 0.0, "epoch": 0.66, "learning_rate": 4.2999999999999995e-05, "loss": 1.8466, "mlm_loss": 1.8466, "step": 860 }, { "ep_loss": 0.0, "epoch": 0.67, "learning_rate": 4.35e-05, "loss": 1.8168, "mlm_loss": 1.8168, "step": 870 }, { "ep_loss": 0.0, "epoch": 0.68, "learning_rate": 4.4e-05, "loss": 1.8161, "mlm_loss": 1.8161, "step": 880 }, { "ep_loss": 0.0, "epoch": 0.69, "learning_rate": 4.45e-05, "loss": 1.7637, "mlm_loss": 1.7637, "step": 890 }, { "ep_loss": 0.0, "epoch": 0.69, "learning_rate": 4.4999999999999996e-05, "loss": 1.7292, "mlm_loss": 1.7292, "step": 900 }, { "epoch": 0.69, "eval_ep_loss": -1.673869252204895, "eval_loss": 1.6480634212493896, "eval_mlm_loss": 1.6480634212493896, "eval_runtime": 61.0031, "eval_samples_per_second": 1145.335, "eval_steps_per_second": 0.574, "step": 900 }, { "ep_loss": 0.0, "epoch": 0.7, "learning_rate": 4.55e-05, "loss": 1.7031, "mlm_loss": 1.7031, "step": 910 }, { "ep_loss": 0.0, "epoch": 0.71, "learning_rate": 4.6e-05, "loss": 1.6951, "mlm_loss": 1.6951, "step": 920 }, { "ep_loss": 0.0, "epoch": 0.72, "learning_rate": 4.65e-05, "loss": 1.6706, "mlm_loss": 1.6706, "step": 930 }, { "ep_loss": 0.0, "epoch": 0.73, "learning_rate": 4.7000000000000004e-05, "loss": 1.6674, "mlm_loss": 1.6674, "step": 940 }, { "ep_loss": 0.0, "epoch": 0.73, "learning_rate": 4.75e-05, "loss": 1.6446, "mlm_loss": 1.6446, "step": 950 }, { "ep_loss": 0.0, "epoch": 0.74, "learning_rate": 4.8e-05, "loss": 1.6344, "mlm_loss": 1.6344, "step": 960 }, { "ep_loss": 0.0, "epoch": 0.75, "learning_rate": 4.85e-05, "loss": 1.6182, "mlm_loss": 1.6182, "step": 970 }, { "ep_loss": 0.0, "epoch": 0.76, "learning_rate": 4.9000000000000005e-05, "loss": 1.602, "mlm_loss": 1.602, "step": 980 }, { "ep_loss": 0.0, "epoch": 0.76, "learning_rate": 4.9500000000000004e-05, "loss": 1.5965, "mlm_loss": 1.5965, "step": 990 }, { "ep_loss": 0.0, "epoch": 0.77, "learning_rate": 5e-05, "loss": 1.6022, "mlm_loss": 1.6022, "step": 1000 }, { "epoch": 0.77, "eval_ep_loss": -1.7339978218078613, "eval_loss": 1.521234393119812, "eval_mlm_loss": 1.521234393119812, "eval_runtime": 61.977, "eval_samples_per_second": 1127.338, "eval_steps_per_second": 0.565, "step": 1000 }, { "ep_loss": 0.0, "epoch": 0.78, "learning_rate": 5.05e-05, "loss": 1.5856, "mlm_loss": 1.5856, "step": 1010 }, { "ep_loss": 0.0, "epoch": 0.79, "learning_rate": 5.1e-05, "loss": 1.5641, "mlm_loss": 1.5641, "step": 1020 }, { "ep_loss": 0.0, "epoch": 0.79, "learning_rate": 5.15e-05, "loss": 1.5455, "mlm_loss": 1.5455, "step": 1030 }, { "ep_loss": 0.0, "epoch": 0.8, "learning_rate": 5.2e-05, "loss": 1.5587, "mlm_loss": 1.5587, "step": 1040 }, { "ep_loss": 0.0, "epoch": 0.81, "learning_rate": 5.25e-05, "loss": 1.5495, "mlm_loss": 1.5495, "step": 1050 }, { "ep_loss": 0.0, "epoch": 0.82, "learning_rate": 5.3e-05, "loss": 1.5388, "mlm_loss": 1.5388, "step": 1060 }, { "ep_loss": 0.0, "epoch": 0.83, "learning_rate": 5.35e-05, "loss": 1.5085, "mlm_loss": 1.5085, "step": 1070 }, { "ep_loss": 0.0, "epoch": 0.83, "learning_rate": 5.4e-05, "loss": 1.5258, "mlm_loss": 1.5258, "step": 1080 }, { "ep_loss": 0.0, "epoch": 0.84, "learning_rate": 5.45e-05, "loss": 1.5027, "mlm_loss": 1.5027, "step": 1090 }, { "ep_loss": 0.0, "epoch": 0.85, "learning_rate": 5.5e-05, "loss": 1.4991, "mlm_loss": 1.4991, "step": 1100 }, { "epoch": 0.85, "eval_ep_loss": -1.8915724754333496, "eval_loss": 1.4428980350494385, "eval_mlm_loss": 1.4428980350494385, "eval_runtime": 60.2275, "eval_samples_per_second": 1160.085, "eval_steps_per_second": 0.581, "step": 1100 }, { "ep_loss": 0.0, "epoch": 0.86, "learning_rate": 5.55e-05, "loss": 1.5068, "mlm_loss": 1.5068, "step": 1110 }, { "ep_loss": 0.0, "epoch": 0.86, "learning_rate": 5.6e-05, "loss": 1.4812, "mlm_loss": 1.4812, "step": 1120 }, { "ep_loss": 0.0, "epoch": 0.87, "learning_rate": 5.6500000000000005e-05, "loss": 1.4765, "mlm_loss": 1.4765, "step": 1130 }, { "ep_loss": 0.0, "epoch": 0.88, "learning_rate": 5.7e-05, "loss": 1.4679, "mlm_loss": 1.4679, "step": 1140 }, { "ep_loss": 0.0, "epoch": 0.89, "learning_rate": 5.75e-05, "loss": 1.4631, "mlm_loss": 1.4631, "step": 1150 }, { "ep_loss": 0.0, "epoch": 0.89, "learning_rate": 5.800000000000001e-05, "loss": 1.4562, "mlm_loss": 1.4562, "step": 1160 }, { "ep_loss": 0.0, "epoch": 0.9, "learning_rate": 5.8500000000000006e-05, "loss": 1.4542, "mlm_loss": 1.4542, "step": 1170 }, { "ep_loss": 0.0, "epoch": 0.91, "learning_rate": 5.9e-05, "loss": 1.4266, "mlm_loss": 1.4266, "step": 1180 }, { "ep_loss": 0.0, "epoch": 0.92, "learning_rate": 5.9499999999999996e-05, "loss": 1.4181, "mlm_loss": 1.4181, "step": 1190 }, { "ep_loss": 0.0, "epoch": 0.93, "learning_rate": 6e-05, "loss": 1.4133, "mlm_loss": 1.4133, "step": 1200 }, { "epoch": 0.93, "eval_ep_loss": -1.9453246593475342, "eval_loss": 1.3670274019241333, "eval_mlm_loss": 1.3670274019241333, "eval_runtime": 61.6667, "eval_samples_per_second": 1133.011, "eval_steps_per_second": 0.568, "step": 1200 }, { "ep_loss": 0.0, "epoch": 0.93, "learning_rate": 6.05e-05, "loss": 1.414, "mlm_loss": 1.414, "step": 1210 }, { "ep_loss": 0.0, "epoch": 0.94, "learning_rate": 6.1e-05, "loss": 1.4032, "mlm_loss": 1.4032, "step": 1220 }, { "ep_loss": 0.0, "epoch": 0.95, "learning_rate": 6.15e-05, "loss": 1.3958, "mlm_loss": 1.3958, "step": 1230 }, { "ep_loss": 0.0, "epoch": 0.96, "learning_rate": 6.2e-05, "loss": 1.3952, "mlm_loss": 1.3952, "step": 1240 }, { "ep_loss": 0.0, "epoch": 0.96, "learning_rate": 6.25e-05, "loss": 1.3829, "mlm_loss": 1.3829, "step": 1250 }, { "ep_loss": 0.0, "epoch": 0.97, "learning_rate": 6.3e-05, "loss": 1.3915, "mlm_loss": 1.3915, "step": 1260 }, { "ep_loss": 0.0, "epoch": 0.98, "learning_rate": 6.35e-05, "loss": 1.3881, "mlm_loss": 1.3881, "step": 1270 }, { "ep_loss": 0.0, "epoch": 0.99, "learning_rate": 6.4e-05, "loss": 1.3915, "mlm_loss": 1.3915, "step": 1280 }, { "ep_loss": 0.0, "epoch": 0.99, "learning_rate": 6.450000000000001e-05, "loss": 1.3728, "mlm_loss": 1.3728, "step": 1290 }, { "ep_loss": 0.0, "epoch": 1.0, "learning_rate": 6.500000000000001e-05, "loss": 1.3379, "mlm_loss": 1.3379, "step": 1300 }, { "epoch": 1.0, "eval_ep_loss": -2.041435480117798, "eval_loss": 1.2956568002700806, "eval_mlm_loss": 1.2956568002700806, "eval_runtime": 58.5906, "eval_samples_per_second": 1192.495, "eval_steps_per_second": 0.597, "step": 1300 }, { "ep_loss": 0.0, "epoch": 1.01, "learning_rate": 6.55e-05, "loss": 1.3488, "mlm_loss": 1.3488, "step": 1310 }, { "ep_loss": 0.0, "epoch": 1.02, "learning_rate": 6.6e-05, "loss": 1.3408, "mlm_loss": 1.3408, "step": 1320 }, { "ep_loss": 0.0, "epoch": 1.03, "learning_rate": 6.65e-05, "loss": 1.3318, "mlm_loss": 1.3318, "step": 1330 }, { "ep_loss": 0.0, "epoch": 1.03, "learning_rate": 6.7e-05, "loss": 1.3216, "mlm_loss": 1.3216, "step": 1340 }, { "ep_loss": 0.0, "epoch": 1.04, "learning_rate": 6.75e-05, "loss": 1.3142, "mlm_loss": 1.3142, "step": 1350 }, { "ep_loss": 0.0, "epoch": 1.05, "learning_rate": 6.800000000000001e-05, "loss": 1.3075, "mlm_loss": 1.3075, "step": 1360 }, { "ep_loss": 0.0, "epoch": 1.06, "learning_rate": 6.850000000000001e-05, "loss": 1.312, "mlm_loss": 1.312, "step": 1370 }, { "ep_loss": 0.0, "epoch": 1.06, "learning_rate": 6.900000000000001e-05, "loss": 1.2987, "mlm_loss": 1.2987, "step": 1380 }, { "ep_loss": 0.0, "epoch": 1.07, "learning_rate": 6.950000000000001e-05, "loss": 1.2822, "mlm_loss": 1.2822, "step": 1390 }, { "ep_loss": 0.0, "epoch": 1.08, "learning_rate": 7.000000000000001e-05, "loss": 1.2817, "mlm_loss": 1.2817, "step": 1400 }, { "epoch": 1.08, "eval_ep_loss": -2.044208288192749, "eval_loss": 1.2174968719482422, "eval_mlm_loss": 1.2174968719482422, "eval_runtime": 62.7874, "eval_samples_per_second": 1112.788, "eval_steps_per_second": 0.557, "step": 1400 }, { "ep_loss": 0.0, "epoch": 1.09, "learning_rate": 7.049999999999999e-05, "loss": 1.2673, "mlm_loss": 1.2673, "step": 1410 }, { "ep_loss": 0.0, "epoch": 1.1, "learning_rate": 7.099999999999999e-05, "loss": 1.28, "mlm_loss": 1.28, "step": 1420 }, { "ep_loss": 0.0, "epoch": 1.1, "learning_rate": 7.149999999999999e-05, "loss": 1.2606, "mlm_loss": 1.2606, "step": 1430 }, { "ep_loss": 0.0, "epoch": 1.11, "learning_rate": 7.2e-05, "loss": 1.2439, "mlm_loss": 1.2439, "step": 1440 }, { "ep_loss": 0.0, "epoch": 1.12, "learning_rate": 7.25e-05, "loss": 1.2428, "mlm_loss": 1.2428, "step": 1450 }, { "ep_loss": 0.0, "epoch": 1.13, "learning_rate": 7.3e-05, "loss": 1.2342, "mlm_loss": 1.2342, "step": 1460 }, { "ep_loss": 0.0, "epoch": 1.13, "learning_rate": 7.35e-05, "loss": 1.2391, "mlm_loss": 1.2391, "step": 1470 }, { "ep_loss": 0.0, "epoch": 1.14, "learning_rate": 7.4e-05, "loss": 1.2337, "mlm_loss": 1.2337, "step": 1480 }, { "ep_loss": 0.0, "epoch": 1.15, "learning_rate": 7.45e-05, "loss": 1.2161, "mlm_loss": 1.2161, "step": 1490 }, { "ep_loss": 0.0, "epoch": 1.16, "learning_rate": 7.5e-05, "loss": 1.2083, "mlm_loss": 1.2083, "step": 1500 }, { "epoch": 1.16, "eval_ep_loss": -2.247138738632202, "eval_loss": 1.145897388458252, "eval_mlm_loss": 1.145897388458252, "eval_runtime": 60.0275, "eval_samples_per_second": 1163.951, "eval_steps_per_second": 0.583, "step": 1500 }, { "ep_loss": 0.0, "epoch": 1.16, "learning_rate": 7.55e-05, "loss": 1.2048, "mlm_loss": 1.2048, "step": 1510 }, { "ep_loss": 0.0, "epoch": 1.17, "learning_rate": 7.6e-05, "loss": 1.1934, "mlm_loss": 1.1934, "step": 1520 }, { "ep_loss": 0.0, "epoch": 1.18, "learning_rate": 7.65e-05, "loss": 1.1741, "mlm_loss": 1.1741, "step": 1530 }, { "ep_loss": 0.0, "epoch": 1.19, "learning_rate": 7.7e-05, "loss": 1.1832, "mlm_loss": 1.1832, "step": 1540 }, { "ep_loss": 0.0, "epoch": 1.2, "learning_rate": 7.75e-05, "loss": 1.1759, "mlm_loss": 1.1759, "step": 1550 }, { "ep_loss": 0.0, "epoch": 1.2, "learning_rate": 7.8e-05, "loss": 1.1776, "mlm_loss": 1.1776, "step": 1560 }, { "ep_loss": 0.0, "epoch": 1.21, "learning_rate": 7.85e-05, "loss": 1.164, "mlm_loss": 1.164, "step": 1570 }, { "ep_loss": 0.0, "epoch": 1.22, "learning_rate": 7.9e-05, "loss": 1.1447, "mlm_loss": 1.1447, "step": 1580 }, { "ep_loss": 0.0, "epoch": 1.23, "learning_rate": 7.950000000000001e-05, "loss": 1.1669, "mlm_loss": 1.1669, "step": 1590 }, { "ep_loss": 0.0, "epoch": 1.23, "learning_rate": 8e-05, "loss": 1.1331, "mlm_loss": 1.1331, "step": 1600 }, { "epoch": 1.23, "eval_ep_loss": -2.236558198928833, "eval_loss": 1.0778173208236694, "eval_mlm_loss": 1.0778173208236694, "eval_runtime": 61.8774, "eval_samples_per_second": 1129.152, "eval_steps_per_second": 0.566, "step": 1600 }, { "ep_loss": 0.0, "epoch": 1.24, "learning_rate": 8.05e-05, "loss": 1.1332, "mlm_loss": 1.1332, "step": 1610 }, { "ep_loss": 0.0, "epoch": 1.25, "learning_rate": 8.1e-05, "loss": 1.1288, "mlm_loss": 1.1288, "step": 1620 }, { "ep_loss": 0.0, "epoch": 1.26, "learning_rate": 8.15e-05, "loss": 1.1321, "mlm_loss": 1.1321, "step": 1630 }, { "ep_loss": 0.0, "epoch": 1.26, "learning_rate": 8.2e-05, "loss": 1.11, "mlm_loss": 1.11, "step": 1640 }, { "ep_loss": 0.0, "epoch": 1.27, "learning_rate": 8.25e-05, "loss": 1.1083, "mlm_loss": 1.1083, "step": 1650 }, { "ep_loss": 0.0, "epoch": 1.28, "learning_rate": 8.300000000000001e-05, "loss": 1.0999, "mlm_loss": 1.0999, "step": 1660 }, { "ep_loss": 0.0, "epoch": 1.29, "learning_rate": 8.350000000000001e-05, "loss": 1.1041, "mlm_loss": 1.1041, "step": 1670 }, { "ep_loss": 0.0, "epoch": 1.3, "learning_rate": 8.400000000000001e-05, "loss": 1.0823, "mlm_loss": 1.0823, "step": 1680 }, { "ep_loss": 0.0, "epoch": 1.3, "learning_rate": 8.450000000000001e-05, "loss": 1.0809, "mlm_loss": 1.0809, "step": 1690 }, { "ep_loss": 0.0, "epoch": 1.31, "learning_rate": 8.5e-05, "loss": 1.0859, "mlm_loss": 1.0859, "step": 1700 }, { "epoch": 1.31, "eval_ep_loss": -2.2500903606414795, "eval_loss": 1.0151546001434326, "eval_mlm_loss": 1.0151546001434326, "eval_runtime": 60.0928, "eval_samples_per_second": 1162.685, "eval_steps_per_second": 0.582, "step": 1700 }, { "ep_loss": 0.0, "epoch": 1.32, "learning_rate": 8.55e-05, "loss": 1.0757, "mlm_loss": 1.0757, "step": 1710 }, { "ep_loss": 0.0, "epoch": 1.33, "learning_rate": 8.599999999999999e-05, "loss": 1.0638, "mlm_loss": 1.0638, "step": 1720 }, { "ep_loss": 0.0, "epoch": 1.33, "learning_rate": 8.65e-05, "loss": 1.0554, "mlm_loss": 1.0554, "step": 1730 }, { "ep_loss": 0.0, "epoch": 1.34, "learning_rate": 8.7e-05, "loss": 1.0624, "mlm_loss": 1.0624, "step": 1740 }, { "ep_loss": 0.0, "epoch": 1.35, "learning_rate": 8.75e-05, "loss": 1.0483, "mlm_loss": 1.0483, "step": 1750 }, { "ep_loss": 0.0, "epoch": 1.36, "learning_rate": 8.8e-05, "loss": 1.0432, "mlm_loss": 1.0432, "step": 1760 }, { "ep_loss": 0.0, "epoch": 1.37, "learning_rate": 8.85e-05, "loss": 1.0464, "mlm_loss": 1.0464, "step": 1770 }, { "ep_loss": 0.0, "epoch": 1.37, "learning_rate": 8.9e-05, "loss": 1.0379, "mlm_loss": 1.0379, "step": 1780 }, { "ep_loss": 0.0, "epoch": 1.38, "learning_rate": 8.95e-05, "loss": 1.0416, "mlm_loss": 1.0416, "step": 1790 }, { "ep_loss": 0.0, "epoch": 1.39, "learning_rate": 8.999999999999999e-05, "loss": 1.0178, "mlm_loss": 1.0178, "step": 1800 }, { "epoch": 1.39, "eval_ep_loss": -2.3243699073791504, "eval_loss": 0.9662984013557434, "eval_mlm_loss": 0.9662984013557434, "eval_runtime": 61.639, "eval_samples_per_second": 1133.519, "eval_steps_per_second": 0.568, "step": 1800 }, { "ep_loss": 0.0, "epoch": 1.4, "learning_rate": 9.05e-05, "loss": 1.0189, "mlm_loss": 1.0189, "step": 1810 }, { "ep_loss": 0.0, "epoch": 1.4, "learning_rate": 9.1e-05, "loss": 1.0181, "mlm_loss": 1.0181, "step": 1820 }, { "ep_loss": 0.0, "epoch": 1.41, "learning_rate": 9.15e-05, "loss": 1.006, "mlm_loss": 1.006, "step": 1830 }, { "ep_loss": 0.0, "epoch": 1.42, "learning_rate": 9.2e-05, "loss": 1.0088, "mlm_loss": 1.0088, "step": 1840 }, { "ep_loss": 0.0, "epoch": 1.43, "learning_rate": 9.25e-05, "loss": 1.008, "mlm_loss": 1.008, "step": 1850 }, { "ep_loss": 0.0, "epoch": 1.43, "learning_rate": 9.3e-05, "loss": 0.9891, "mlm_loss": 0.9891, "step": 1860 }, { "ep_loss": 0.0, "epoch": 1.44, "learning_rate": 9.35e-05, "loss": 0.9998, "mlm_loss": 0.9998, "step": 1870 }, { "ep_loss": 0.0, "epoch": 1.45, "learning_rate": 9.400000000000001e-05, "loss": 0.9927, "mlm_loss": 0.9927, "step": 1880 }, { "ep_loss": 0.0, "epoch": 1.46, "learning_rate": 9.45e-05, "loss": 0.9825, "mlm_loss": 0.9825, "step": 1890 }, { "ep_loss": 0.0, "epoch": 1.47, "learning_rate": 9.5e-05, "loss": 0.979, "mlm_loss": 0.979, "step": 1900 }, { "epoch": 1.47, "eval_ep_loss": -2.297435760498047, "eval_loss": 0.9245826005935669, "eval_mlm_loss": 0.9245826005935669, "eval_runtime": 60.9274, "eval_samples_per_second": 1146.759, "eval_steps_per_second": 0.574, "step": 1900 }, { "ep_loss": 0.0, "epoch": 1.47, "learning_rate": 9.55e-05, "loss": 0.9782, "mlm_loss": 0.9782, "step": 1910 }, { "ep_loss": 0.0, "epoch": 1.48, "learning_rate": 9.6e-05, "loss": 0.9802, "mlm_loss": 0.9802, "step": 1920 }, { "ep_loss": 0.0, "epoch": 1.49, "learning_rate": 9.65e-05, "loss": 0.9648, "mlm_loss": 0.9648, "step": 1930 }, { "ep_loss": 0.0, "epoch": 1.5, "learning_rate": 9.7e-05, "loss": 0.9576, "mlm_loss": 0.9576, "step": 1940 }, { "ep_loss": 0.0, "epoch": 1.5, "learning_rate": 9.750000000000001e-05, "loss": 0.9629, "mlm_loss": 0.9629, "step": 1950 }, { "ep_loss": 0.0, "epoch": 1.51, "learning_rate": 9.800000000000001e-05, "loss": 0.9585, "mlm_loss": 0.9585, "step": 1960 }, { "ep_loss": 0.0, "epoch": 1.52, "learning_rate": 9.850000000000001e-05, "loss": 0.9404, "mlm_loss": 0.9404, "step": 1970 }, { "ep_loss": 0.0, "epoch": 1.53, "learning_rate": 9.900000000000001e-05, "loss": 0.9538, "mlm_loss": 0.9538, "step": 1980 }, { "ep_loss": 0.0, "epoch": 1.53, "learning_rate": 9.95e-05, "loss": 0.9591, "mlm_loss": 0.9591, "step": 1990 }, { "ep_loss": 0.0, "epoch": 1.54, "learning_rate": 0.0001, "loss": 0.9467, "mlm_loss": 0.9467, "step": 2000 }, { "epoch": 1.54, "eval_ep_loss": -2.388266086578369, "eval_loss": 0.8887887597084045, "eval_mlm_loss": 0.8887887597084045, "eval_runtime": 59.4206, "eval_samples_per_second": 1175.838, "eval_steps_per_second": 0.589, "step": 2000 }, { "ep_loss": 0.0, "epoch": 1.55, "learning_rate": 0.0001005, "loss": 0.947, "mlm_loss": 0.947, "step": 2010 }, { "ep_loss": 0.0, "epoch": 1.56, "learning_rate": 0.000101, "loss": 0.9407, "mlm_loss": 0.9407, "step": 2020 }, { "ep_loss": 0.0, "epoch": 1.57, "learning_rate": 0.00010150000000000001, "loss": 0.9352, "mlm_loss": 0.9352, "step": 2030 }, { "ep_loss": 0.0, "epoch": 1.57, "learning_rate": 0.000102, "loss": 0.9234, "mlm_loss": 0.9234, "step": 2040 }, { "ep_loss": 0.0, "epoch": 1.58, "learning_rate": 0.0001025, "loss": 0.9252, "mlm_loss": 0.9252, "step": 2050 }, { "ep_loss": 0.0, "epoch": 1.59, "learning_rate": 0.000103, "loss": 0.9271, "mlm_loss": 0.9271, "step": 2060 }, { "ep_loss": 0.0, "epoch": 1.6, "learning_rate": 0.0001035, "loss": 0.9254, "mlm_loss": 0.9254, "step": 2070 }, { "ep_loss": 0.0, "epoch": 1.6, "learning_rate": 0.000104, "loss": 0.9232, "mlm_loss": 0.9232, "step": 2080 }, { "ep_loss": 0.0, "epoch": 1.61, "learning_rate": 0.00010449999999999999, "loss": 0.919, "mlm_loss": 0.919, "step": 2090 }, { "ep_loss": 0.0, "epoch": 1.62, "learning_rate": 0.000105, "loss": 0.9127, "mlm_loss": 0.9127, "step": 2100 }, { "epoch": 1.62, "eval_ep_loss": -2.262110948562622, "eval_loss": 0.8549456000328064, "eval_mlm_loss": 0.8549456000328064, "eval_runtime": 59.0907, "eval_samples_per_second": 1182.403, "eval_steps_per_second": 0.592, "step": 2100 }, { "ep_loss": 0.0, "epoch": 1.63, "learning_rate": 0.0001055, "loss": 0.9023, "mlm_loss": 0.9023, "step": 2110 }, { "ep_loss": 0.0, "epoch": 1.64, "learning_rate": 0.000106, "loss": 0.9031, "mlm_loss": 0.9031, "step": 2120 }, { "ep_loss": 0.0, "epoch": 1.64, "learning_rate": 0.0001065, "loss": 0.9039, "mlm_loss": 0.9039, "step": 2130 }, { "ep_loss": 0.0, "epoch": 1.65, "learning_rate": 0.000107, "loss": 0.9022, "mlm_loss": 0.9022, "step": 2140 }, { "ep_loss": 0.0, "epoch": 1.66, "learning_rate": 0.0001075, "loss": 0.8978, "mlm_loss": 0.8978, "step": 2150 }, { "ep_loss": 0.0, "epoch": 1.67, "learning_rate": 0.000108, "loss": 0.8893, "mlm_loss": 0.8893, "step": 2160 }, { "ep_loss": 0.0, "epoch": 1.67, "learning_rate": 0.00010850000000000001, "loss": 0.8796, "mlm_loss": 0.8796, "step": 2170 }, { "ep_loss": 0.0, "epoch": 1.68, "learning_rate": 0.000109, "loss": 0.8925, "mlm_loss": 0.8925, "step": 2180 }, { "ep_loss": 0.0, "epoch": 1.69, "learning_rate": 0.0001095, "loss": 0.8873, "mlm_loss": 0.8873, "step": 2190 }, { "ep_loss": 0.0, "epoch": 1.7, "learning_rate": 0.00011, "loss": 0.8858, "mlm_loss": 0.8858, "step": 2200 }, { "epoch": 1.7, "eval_ep_loss": -2.3254947662353516, "eval_loss": 0.8307343125343323, "eval_mlm_loss": 0.8307343125343323, "eval_runtime": 62.4113, "eval_samples_per_second": 1119.493, "eval_steps_per_second": 0.561, "step": 2200 }, { "ep_loss": 0.0, "epoch": 1.7, "learning_rate": 0.0001105, "loss": 0.888, "mlm_loss": 0.888, "step": 2210 }, { "ep_loss": 0.0, "epoch": 1.71, "learning_rate": 0.000111, "loss": 0.8706, "mlm_loss": 0.8706, "step": 2220 }, { "ep_loss": 0.0, "epoch": 1.72, "learning_rate": 0.0001115, "loss": 0.8854, "mlm_loss": 0.8854, "step": 2230 }, { "ep_loss": 0.0, "epoch": 1.73, "learning_rate": 0.000112, "loss": 0.864, "mlm_loss": 0.864, "step": 2240 }, { "ep_loss": 0.0, "epoch": 1.74, "learning_rate": 0.00011250000000000001, "loss": 0.8675, "mlm_loss": 0.8675, "step": 2250 }, { "ep_loss": 0.0, "epoch": 1.74, "learning_rate": 0.00011300000000000001, "loss": 0.8654, "mlm_loss": 0.8654, "step": 2260 }, { "ep_loss": 0.0, "epoch": 1.75, "learning_rate": 0.00011350000000000001, "loss": 0.8593, "mlm_loss": 0.8593, "step": 2270 }, { "ep_loss": 0.0, "epoch": 1.76, "learning_rate": 0.000114, "loss": 0.8586, "mlm_loss": 0.8586, "step": 2280 }, { "ep_loss": 0.0, "epoch": 1.77, "learning_rate": 0.0001145, "loss": 0.8494, "mlm_loss": 0.8494, "step": 2290 }, { "ep_loss": 0.0, "epoch": 1.77, "learning_rate": 0.000115, "loss": 0.8517, "mlm_loss": 0.8517, "step": 2300 }, { "epoch": 1.77, "eval_ep_loss": -2.2645294666290283, "eval_loss": 0.803054928779602, "eval_mlm_loss": 0.803054928779602, "eval_runtime": 61.7973, "eval_samples_per_second": 1130.616, "eval_steps_per_second": 0.566, "step": 2300 }, { "ep_loss": 0.0, "epoch": 1.78, "learning_rate": 0.0001155, "loss": 0.8396, "mlm_loss": 0.8396, "step": 2310 }, { "ep_loss": 0.0, "epoch": 1.79, "learning_rate": 0.00011600000000000001, "loss": 0.8478, "mlm_loss": 0.8478, "step": 2320 }, { "ep_loss": 0.0, "epoch": 1.8, "learning_rate": 0.00011650000000000001, "loss": 0.8495, "mlm_loss": 0.8495, "step": 2330 }, { "ep_loss": 0.0, "epoch": 1.8, "learning_rate": 0.00011700000000000001, "loss": 0.8441, "mlm_loss": 0.8441, "step": 2340 }, { "ep_loss": 0.0, "epoch": 1.81, "learning_rate": 0.0001175, "loss": 0.8441, "mlm_loss": 0.8441, "step": 2350 }, { "ep_loss": 0.0, "epoch": 1.82, "learning_rate": 0.000118, "loss": 0.8406, "mlm_loss": 0.8406, "step": 2360 }, { "ep_loss": 0.0, "epoch": 1.83, "learning_rate": 0.0001185, "loss": 0.8322, "mlm_loss": 0.8322, "step": 2370 }, { "ep_loss": 0.0, "epoch": 1.84, "learning_rate": 0.00011899999999999999, "loss": 0.8282, "mlm_loss": 0.8282, "step": 2380 }, { "ep_loss": 0.0, "epoch": 1.84, "learning_rate": 0.00011949999999999999, "loss": 0.8437, "mlm_loss": 0.8437, "step": 2390 }, { "ep_loss": 0.0, "epoch": 1.85, "learning_rate": 0.00012, "loss": 0.8415, "mlm_loss": 0.8415, "step": 2400 }, { "epoch": 1.85, "eval_ep_loss": -2.278646945953369, "eval_loss": 0.7815272212028503, "eval_mlm_loss": 0.7815272212028503, "eval_runtime": 60.3734, "eval_samples_per_second": 1157.281, "eval_steps_per_second": 0.58, "step": 2400 }, { "ep_loss": 0.0, "epoch": 1.86, "learning_rate": 0.0001205, "loss": 0.8232, "mlm_loss": 0.8232, "step": 2410 }, { "ep_loss": 0.0, "epoch": 1.87, "learning_rate": 0.000121, "loss": 0.8204, "mlm_loss": 0.8204, "step": 2420 }, { "ep_loss": 0.0, "epoch": 1.87, "learning_rate": 0.0001215, "loss": 0.8246, "mlm_loss": 0.8246, "step": 2430 }, { "ep_loss": 0.0, "epoch": 1.88, "learning_rate": 0.000122, "loss": 0.8092, "mlm_loss": 0.8092, "step": 2440 }, { "ep_loss": 0.0, "epoch": 1.89, "learning_rate": 0.0001225, "loss": 0.812, "mlm_loss": 0.812, "step": 2450 }, { "ep_loss": 0.0, "epoch": 1.9, "learning_rate": 0.000123, "loss": 0.8148, "mlm_loss": 0.8148, "step": 2460 }, { "ep_loss": 0.0, "epoch": 1.91, "learning_rate": 0.0001235, "loss": 0.8065, "mlm_loss": 0.8065, "step": 2470 }, { "ep_loss": 0.0, "epoch": 1.91, "learning_rate": 0.000124, "loss": 0.8029, "mlm_loss": 0.8029, "step": 2480 }, { "ep_loss": 0.0, "epoch": 1.92, "learning_rate": 0.0001245, "loss": 0.8148, "mlm_loss": 0.8148, "step": 2490 }, { "ep_loss": 0.0, "epoch": 1.93, "learning_rate": 0.000125, "loss": 0.806, "mlm_loss": 0.806, "step": 2500 }, { "epoch": 1.93, "eval_ep_loss": -2.1858584880828857, "eval_loss": 0.7560575604438782, "eval_mlm_loss": 0.7560575604438782, "eval_runtime": 60.8551, "eval_samples_per_second": 1148.121, "eval_steps_per_second": 0.575, "step": 2500 }, { "ep_loss": 0.0, "epoch": 1.94, "learning_rate": 0.00012550000000000001, "loss": 0.8048, "mlm_loss": 0.8048, "step": 2510 }, { "ep_loss": 0.0, "epoch": 1.94, "learning_rate": 0.000126, "loss": 0.8027, "mlm_loss": 0.8027, "step": 2520 }, { "ep_loss": 0.0, "epoch": 1.95, "learning_rate": 0.0001265, "loss": 0.797, "mlm_loss": 0.797, "step": 2530 }, { "ep_loss": 0.0, "epoch": 1.96, "learning_rate": 0.000127, "loss": 0.8041, "mlm_loss": 0.8041, "step": 2540 }, { "ep_loss": 0.0, "epoch": 1.97, "learning_rate": 0.0001275, "loss": 0.8, "mlm_loss": 0.8, "step": 2550 }, { "ep_loss": 0.0, "epoch": 1.97, "learning_rate": 0.000128, "loss": 0.7955, "mlm_loss": 0.7955, "step": 2560 }, { "ep_loss": 0.0, "epoch": 1.98, "learning_rate": 0.0001285, "loss": 0.7943, "mlm_loss": 0.7943, "step": 2570 }, { "ep_loss": 0.0, "epoch": 1.99, "learning_rate": 0.00012900000000000002, "loss": 0.7934, "mlm_loss": 0.7934, "step": 2580 }, { "ep_loss": 0.0, "epoch": 2.0, "learning_rate": 0.0001295, "loss": 0.7884, "mlm_loss": 0.7884, "step": 2590 }, { "ep_loss": 0.0, "epoch": 2.01, "learning_rate": 0.00013000000000000002, "loss": 0.7892, "mlm_loss": 0.7892, "step": 2600 }, { "epoch": 2.01, "eval_ep_loss": -2.1713080406188965, "eval_loss": 0.7403181195259094, "eval_mlm_loss": 0.7403181195259094, "eval_runtime": 64.561, "eval_samples_per_second": 1082.217, "eval_steps_per_second": 0.542, "step": 2600 }, { "ep_loss": 0.0, "epoch": 2.01, "learning_rate": 0.0001305, "loss": 0.7818, "mlm_loss": 0.7818, "step": 2610 }, { "ep_loss": 0.0, "epoch": 2.02, "learning_rate": 0.000131, "loss": 0.7771, "mlm_loss": 0.7771, "step": 2620 }, { "ep_loss": 0.0, "epoch": 2.03, "learning_rate": 0.0001315, "loss": 0.767, "mlm_loss": 0.767, "step": 2630 }, { "ep_loss": 0.0, "epoch": 2.04, "learning_rate": 0.000132, "loss": 0.7826, "mlm_loss": 0.7826, "step": 2640 }, { "ep_loss": 0.0, "epoch": 2.04, "learning_rate": 0.00013250000000000002, "loss": 0.771, "mlm_loss": 0.771, "step": 2650 }, { "ep_loss": 0.0, "epoch": 2.05, "learning_rate": 0.000133, "loss": 0.7774, "mlm_loss": 0.7774, "step": 2660 }, { "ep_loss": 0.0, "epoch": 2.06, "learning_rate": 0.00013350000000000002, "loss": 0.7674, "mlm_loss": 0.7674, "step": 2670 }, { "ep_loss": 0.0, "epoch": 2.07, "learning_rate": 0.000134, "loss": 0.7737, "mlm_loss": 0.7737, "step": 2680 }, { "ep_loss": 0.0, "epoch": 2.07, "learning_rate": 0.00013450000000000002, "loss": 0.7639, "mlm_loss": 0.7639, "step": 2690 }, { "ep_loss": 0.0, "epoch": 2.08, "learning_rate": 0.000135, "loss": 0.7655, "mlm_loss": 0.7655, "step": 2700 }, { "epoch": 2.08, "eval_ep_loss": -2.082960605621338, "eval_loss": 0.7183709144592285, "eval_mlm_loss": 0.7183709144592285, "eval_runtime": 61.749, "eval_samples_per_second": 1131.5, "eval_steps_per_second": 0.567, "step": 2700 }, { "ep_loss": 0.0, "epoch": 2.09, "learning_rate": 0.00013550000000000001, "loss": 0.7611, "mlm_loss": 0.7611, "step": 2710 }, { "ep_loss": 0.0, "epoch": 2.1, "learning_rate": 0.00013600000000000003, "loss": 0.7586, "mlm_loss": 0.7586, "step": 2720 }, { "ep_loss": 0.0, "epoch": 2.11, "learning_rate": 0.0001365, "loss": 0.762, "mlm_loss": 0.762, "step": 2730 }, { "ep_loss": 0.0, "epoch": 2.11, "learning_rate": 0.00013700000000000002, "loss": 0.7479, "mlm_loss": 0.7479, "step": 2740 }, { "ep_loss": 0.0, "epoch": 2.12, "learning_rate": 0.0001375, "loss": 0.7514, "mlm_loss": 0.7514, "step": 2750 }, { "ep_loss": 0.0, "epoch": 2.13, "learning_rate": 0.00013800000000000002, "loss": 0.7476, "mlm_loss": 0.7476, "step": 2760 }, { "ep_loss": 0.0, "epoch": 2.14, "learning_rate": 0.0001385, "loss": 0.7629, "mlm_loss": 0.7629, "step": 2770 }, { "ep_loss": 0.0, "epoch": 2.14, "learning_rate": 0.00013900000000000002, "loss": 0.7432, "mlm_loss": 0.7432, "step": 2780 }, { "ep_loss": 0.0, "epoch": 2.15, "learning_rate": 0.0001395, "loss": 0.7449, "mlm_loss": 0.7449, "step": 2790 }, { "ep_loss": 0.0, "epoch": 2.16, "learning_rate": 0.00014000000000000001, "loss": 0.7379, "mlm_loss": 0.7379, "step": 2800 }, { "epoch": 2.16, "eval_ep_loss": -2.261542558670044, "eval_loss": 0.7018134593963623, "eval_mlm_loss": 0.7018134593963623, "eval_runtime": 60.9379, "eval_samples_per_second": 1146.561, "eval_steps_per_second": 0.574, "step": 2800 }, { "ep_loss": 0.0, "epoch": 2.17, "learning_rate": 0.00014050000000000003, "loss": 0.7465, "mlm_loss": 0.7465, "step": 2810 }, { "ep_loss": 0.0, "epoch": 2.18, "learning_rate": 0.00014099999999999998, "loss": 0.737, "mlm_loss": 0.737, "step": 2820 }, { "ep_loss": 0.0, "epoch": 2.18, "learning_rate": 0.0001415, "loss": 0.7377, "mlm_loss": 0.7377, "step": 2830 }, { "ep_loss": 0.0, "epoch": 2.19, "learning_rate": 0.00014199999999999998, "loss": 0.7401, "mlm_loss": 0.7401, "step": 2840 }, { "ep_loss": 0.0, "epoch": 2.2, "learning_rate": 0.0001425, "loss": 0.7266, "mlm_loss": 0.7266, "step": 2850 }, { "ep_loss": 0.0, "epoch": 2.21, "learning_rate": 0.00014299999999999998, "loss": 0.7443, "mlm_loss": 0.7443, "step": 2860 }, { "ep_loss": 0.0, "epoch": 2.21, "learning_rate": 0.0001435, "loss": 0.7301, "mlm_loss": 0.7301, "step": 2870 }, { "ep_loss": 0.0, "epoch": 2.22, "learning_rate": 0.000144, "loss": 0.7427, "mlm_loss": 0.7427, "step": 2880 }, { "ep_loss": 0.0, "epoch": 2.23, "learning_rate": 0.0001445, "loss": 0.7387, "mlm_loss": 0.7387, "step": 2890 }, { "ep_loss": 0.0, "epoch": 2.24, "learning_rate": 0.000145, "loss": 0.721, "mlm_loss": 0.721, "step": 2900 }, { "epoch": 2.24, "eval_ep_loss": -2.377054214477539, "eval_loss": 0.6826241612434387, "eval_mlm_loss": 0.6826241612434387, "eval_runtime": 60.9503, "eval_samples_per_second": 1146.327, "eval_steps_per_second": 0.574, "step": 2900 }, { "ep_loss": 0.0, "epoch": 2.24, "learning_rate": 0.00014549999999999999, "loss": 0.7352, "mlm_loss": 0.7352, "step": 2910 }, { "ep_loss": 0.0, "epoch": 2.25, "learning_rate": 0.000146, "loss": 0.7219, "mlm_loss": 0.7219, "step": 2920 }, { "ep_loss": 0.0, "epoch": 2.26, "learning_rate": 0.00014649999999999998, "loss": 0.729, "mlm_loss": 0.729, "step": 2930 }, { "ep_loss": 0.0, "epoch": 2.27, "learning_rate": 0.000147, "loss": 0.7148, "mlm_loss": 0.7148, "step": 2940 }, { "ep_loss": 0.0, "epoch": 2.28, "learning_rate": 0.0001475, "loss": 0.721, "mlm_loss": 0.721, "step": 2950 }, { "ep_loss": 0.0, "epoch": 2.28, "learning_rate": 0.000148, "loss": 0.7113, "mlm_loss": 0.7113, "step": 2960 }, { "ep_loss": 0.0, "epoch": 2.29, "learning_rate": 0.0001485, "loss": 0.711, "mlm_loss": 0.711, "step": 2970 }, { "ep_loss": 0.0, "epoch": 2.3, "learning_rate": 0.000149, "loss": 0.7109, "mlm_loss": 0.7109, "step": 2980 }, { "ep_loss": 0.0, "epoch": 2.31, "learning_rate": 0.0001495, "loss": 0.7121, "mlm_loss": 0.7121, "step": 2990 }, { "ep_loss": 0.0, "epoch": 2.31, "learning_rate": 0.00015, "loss": 0.7106, "mlm_loss": 0.7106, "step": 3000 }, { "epoch": 2.31, "eval_ep_loss": -2.0994629859924316, "eval_loss": 0.6672462821006775, "eval_mlm_loss": 0.6672462821006775, "eval_runtime": 63.0197, "eval_samples_per_second": 1108.686, "eval_steps_per_second": 0.555, "step": 3000 }, { "ep_loss": 0.0, "epoch": 2.32, "learning_rate": 0.0001505, "loss": 0.7067, "mlm_loss": 0.7067, "step": 3010 }, { "ep_loss": 0.0, "epoch": 2.33, "learning_rate": 0.000151, "loss": 0.7082, "mlm_loss": 0.7082, "step": 3020 }, { "ep_loss": 0.0, "epoch": 2.34, "learning_rate": 0.0001515, "loss": 0.7097, "mlm_loss": 0.7097, "step": 3030 }, { "ep_loss": 0.0, "epoch": 2.34, "learning_rate": 0.000152, "loss": 0.6972, "mlm_loss": 0.6972, "step": 3040 }, { "ep_loss": 0.0, "epoch": 2.35, "learning_rate": 0.0001525, "loss": 0.7051, "mlm_loss": 0.7051, "step": 3050 }, { "ep_loss": 0.0, "epoch": 2.36, "learning_rate": 0.000153, "loss": 0.7029, "mlm_loss": 0.7029, "step": 3060 }, { "ep_loss": 0.0, "epoch": 2.37, "learning_rate": 0.0001535, "loss": 0.7037, "mlm_loss": 0.7037, "step": 3070 }, { "ep_loss": 0.0, "epoch": 2.38, "learning_rate": 0.000154, "loss": 0.6966, "mlm_loss": 0.6966, "step": 3080 }, { "ep_loss": 0.0, "epoch": 2.38, "learning_rate": 0.00015450000000000001, "loss": 0.6948, "mlm_loss": 0.6948, "step": 3090 }, { "ep_loss": 0.0, "epoch": 2.39, "learning_rate": 0.000155, "loss": 0.6974, "mlm_loss": 0.6974, "step": 3100 }, { "epoch": 2.39, "eval_ep_loss": -2.283140182495117, "eval_loss": 0.6495695114135742, "eval_mlm_loss": 0.6495695114135742, "eval_runtime": 59.8554, "eval_samples_per_second": 1167.296, "eval_steps_per_second": 0.585, "step": 3100 }, { "ep_loss": 0.0, "epoch": 2.4, "learning_rate": 0.0001555, "loss": 0.6877, "mlm_loss": 0.6877, "step": 3110 }, { "ep_loss": 0.0, "epoch": 2.41, "learning_rate": 0.000156, "loss": 0.6862, "mlm_loss": 0.6862, "step": 3120 }, { "ep_loss": 0.0, "epoch": 2.41, "learning_rate": 0.0001565, "loss": 0.6893, "mlm_loss": 0.6893, "step": 3130 }, { "ep_loss": 0.0, "epoch": 2.42, "learning_rate": 0.000157, "loss": 0.6943, "mlm_loss": 0.6943, "step": 3140 }, { "ep_loss": 0.0, "epoch": 2.43, "learning_rate": 0.0001575, "loss": 0.6804, "mlm_loss": 0.6804, "step": 3150 }, { "ep_loss": 0.0, "epoch": 2.44, "learning_rate": 0.000158, "loss": 0.6813, "mlm_loss": 0.6813, "step": 3160 }, { "ep_loss": 0.0, "epoch": 2.45, "learning_rate": 0.0001585, "loss": 0.6828, "mlm_loss": 0.6828, "step": 3170 }, { "ep_loss": 0.0, "epoch": 2.45, "learning_rate": 0.00015900000000000002, "loss": 0.6736, "mlm_loss": 0.6736, "step": 3180 }, { "ep_loss": 0.0, "epoch": 2.46, "learning_rate": 0.0001595, "loss": 0.6763, "mlm_loss": 0.6763, "step": 3190 }, { "ep_loss": 0.0, "epoch": 2.47, "learning_rate": 0.00016, "loss": 0.6796, "mlm_loss": 0.6796, "step": 3200 }, { "epoch": 2.47, "eval_ep_loss": -2.57122540473938, "eval_loss": 0.6341940760612488, "eval_mlm_loss": 0.6341940760612488, "eval_runtime": 63.1531, "eval_samples_per_second": 1106.343, "eval_steps_per_second": 0.554, "step": 3200 }, { "ep_loss": 0.0, "epoch": 2.48, "learning_rate": 0.0001605, "loss": 0.6688, "mlm_loss": 0.6688, "step": 3210 }, { "ep_loss": 0.0, "epoch": 2.48, "learning_rate": 0.000161, "loss": 0.6725, "mlm_loss": 0.6725, "step": 3220 }, { "ep_loss": 0.0, "epoch": 2.49, "learning_rate": 0.0001615, "loss": 0.6718, "mlm_loss": 0.6718, "step": 3230 }, { "ep_loss": 0.0, "epoch": 2.5, "learning_rate": 0.000162, "loss": 0.6675, "mlm_loss": 0.6675, "step": 3240 }, { "ep_loss": 0.0, "epoch": 2.51, "learning_rate": 0.00016250000000000002, "loss": 0.6665, "mlm_loss": 0.6665, "step": 3250 }, { "ep_loss": 0.0, "epoch": 2.51, "learning_rate": 0.000163, "loss": 0.6615, "mlm_loss": 0.6615, "step": 3260 }, { "ep_loss": 0.0, "epoch": 2.52, "learning_rate": 0.00016350000000000002, "loss": 0.6784, "mlm_loss": 0.6784, "step": 3270 }, { "ep_loss": 0.0, "epoch": 2.53, "learning_rate": 0.000164, "loss": 0.6686, "mlm_loss": 0.6686, "step": 3280 }, { "ep_loss": 0.0, "epoch": 2.54, "learning_rate": 0.00016450000000000001, "loss": 0.6697, "mlm_loss": 0.6697, "step": 3290 }, { "ep_loss": 0.0, "epoch": 2.55, "learning_rate": 0.000165, "loss": 0.6621, "mlm_loss": 0.6621, "step": 3300 }, { "epoch": 2.55, "eval_ep_loss": -2.320197820663452, "eval_loss": 0.6192212104797363, "eval_mlm_loss": 0.6192212104797363, "eval_runtime": 60.3086, "eval_samples_per_second": 1158.524, "eval_steps_per_second": 0.58, "step": 3300 }, { "ep_loss": 0.0, "epoch": 2.55, "learning_rate": 0.0001655, "loss": 0.6645, "mlm_loss": 0.6645, "step": 3310 }, { "ep_loss": 0.0, "epoch": 2.56, "learning_rate": 0.00016600000000000002, "loss": 0.6594, "mlm_loss": 0.6594, "step": 3320 }, { "ep_loss": 0.0, "epoch": 2.57, "learning_rate": 0.0001665, "loss": 0.6585, "mlm_loss": 0.6585, "step": 3330 }, { "ep_loss": 0.0, "epoch": 2.58, "learning_rate": 0.00016700000000000002, "loss": 0.6499, "mlm_loss": 0.6499, "step": 3340 }, { "ep_loss": 0.0, "epoch": 2.58, "learning_rate": 0.0001675, "loss": 0.641, "mlm_loss": 0.641, "step": 3350 }, { "ep_loss": 0.0, "epoch": 2.59, "learning_rate": 0.00016800000000000002, "loss": 0.653, "mlm_loss": 0.653, "step": 3360 }, { "ep_loss": 0.0, "epoch": 2.6, "learning_rate": 0.0001685, "loss": 0.6526, "mlm_loss": 0.6526, "step": 3370 }, { "ep_loss": 0.0, "epoch": 2.61, "learning_rate": 0.00016900000000000002, "loss": 0.6588, "mlm_loss": 0.6588, "step": 3380 }, { "ep_loss": 0.0, "epoch": 2.61, "learning_rate": 0.00016950000000000003, "loss": 0.6457, "mlm_loss": 0.6457, "step": 3390 }, { "ep_loss": 0.0, "epoch": 2.62, "learning_rate": 0.00017, "loss": 0.6464, "mlm_loss": 0.6464, "step": 3400 }, { "epoch": 2.62, "eval_ep_loss": -2.441110610961914, "eval_loss": 0.6078351736068726, "eval_mlm_loss": 0.6078351736068726, "eval_runtime": 60.4806, "eval_samples_per_second": 1155.23, "eval_steps_per_second": 0.579, "step": 3400 }, { "ep_loss": 0.0, "epoch": 2.63, "learning_rate": 0.00017050000000000002, "loss": 0.6517, "mlm_loss": 0.6517, "step": 3410 }, { "ep_loss": 0.0, "epoch": 2.64, "learning_rate": 0.000171, "loss": 0.6473, "mlm_loss": 0.6473, "step": 3420 }, { "ep_loss": 0.0, "epoch": 2.65, "learning_rate": 0.00017150000000000002, "loss": 0.6475, "mlm_loss": 0.6475, "step": 3430 }, { "ep_loss": 0.0, "epoch": 2.65, "learning_rate": 0.00017199999999999998, "loss": 0.649, "mlm_loss": 0.649, "step": 3440 }, { "ep_loss": 0.0, "epoch": 2.66, "learning_rate": 0.0001725, "loss": 0.652, "mlm_loss": 0.652, "step": 3450 }, { "ep_loss": 0.0, "epoch": 2.67, "learning_rate": 0.000173, "loss": 0.6484, "mlm_loss": 0.6484, "step": 3460 }, { "ep_loss": 0.0, "epoch": 2.68, "learning_rate": 0.0001735, "loss": 0.6405, "mlm_loss": 0.6405, "step": 3470 }, { "ep_loss": 0.0, "epoch": 2.68, "learning_rate": 0.000174, "loss": 0.6421, "mlm_loss": 0.6421, "step": 3480 }, { "ep_loss": 0.0, "epoch": 2.69, "learning_rate": 0.00017449999999999999, "loss": 0.6313, "mlm_loss": 0.6313, "step": 3490 }, { "ep_loss": 0.0, "epoch": 2.7, "learning_rate": 0.000175, "loss": 0.6228, "mlm_loss": 0.6228, "step": 3500 }, { "epoch": 2.7, "eval_ep_loss": -2.3252084255218506, "eval_loss": 0.5944607257843018, "eval_mlm_loss": 0.5944607257843018, "eval_runtime": 59.8998, "eval_samples_per_second": 1166.431, "eval_steps_per_second": 0.584, "step": 3500 }, { "ep_loss": 0.0, "epoch": 2.71, "learning_rate": 0.00017549999999999998, "loss": 0.6276, "mlm_loss": 0.6276, "step": 3510 }, { "ep_loss": 0.0, "epoch": 2.72, "learning_rate": 0.000176, "loss": 0.632, "mlm_loss": 0.632, "step": 3520 }, { "ep_loss": 0.0, "epoch": 2.72, "learning_rate": 0.00017649999999999998, "loss": 0.629, "mlm_loss": 0.629, "step": 3530 }, { "ep_loss": 0.0, "epoch": 2.73, "learning_rate": 0.000177, "loss": 0.6327, "mlm_loss": 0.6327, "step": 3540 }, { "ep_loss": 0.0, "epoch": 2.74, "learning_rate": 0.0001775, "loss": 0.6307, "mlm_loss": 0.6307, "step": 3550 }, { "ep_loss": 0.0, "epoch": 2.75, "learning_rate": 0.000178, "loss": 0.6156, "mlm_loss": 0.6156, "step": 3560 }, { "ep_loss": 0.0, "epoch": 2.75, "learning_rate": 0.0001785, "loss": 0.6246, "mlm_loss": 0.6246, "step": 3570 }, { "ep_loss": 0.0, "epoch": 2.76, "learning_rate": 0.000179, "loss": 0.6223, "mlm_loss": 0.6223, "step": 3580 }, { "ep_loss": 0.0, "epoch": 2.77, "learning_rate": 0.0001795, "loss": 0.6359, "mlm_loss": 0.6359, "step": 3590 }, { "ep_loss": 0.0, "epoch": 2.78, "learning_rate": 0.00017999999999999998, "loss": 0.6192, "mlm_loss": 0.6192, "step": 3600 }, { "epoch": 2.78, "eval_ep_loss": -2.543795585632324, "eval_loss": 0.5830370783805847, "eval_mlm_loss": 0.5830370783805847, "eval_runtime": 62.3115, "eval_samples_per_second": 1121.285, "eval_steps_per_second": 0.562, "step": 3600 }, { "ep_loss": 0.0, "epoch": 2.78, "learning_rate": 0.0001805, "loss": 0.619, "mlm_loss": 0.619, "step": 3610 }, { "ep_loss": 0.0, "epoch": 2.79, "learning_rate": 0.000181, "loss": 0.6107, "mlm_loss": 0.6107, "step": 3620 }, { "ep_loss": 0.0, "epoch": 2.8, "learning_rate": 0.0001815, "loss": 0.6287, "mlm_loss": 0.6287, "step": 3630 }, { "ep_loss": 0.0, "epoch": 2.81, "learning_rate": 0.000182, "loss": 0.616, "mlm_loss": 0.616, "step": 3640 }, { "ep_loss": 0.0, "epoch": 2.82, "learning_rate": 0.0001825, "loss": 0.6199, "mlm_loss": 0.6199, "step": 3650 }, { "ep_loss": 0.0, "epoch": 2.82, "learning_rate": 0.000183, "loss": 0.6179, "mlm_loss": 0.6179, "step": 3660 }, { "ep_loss": 0.0, "epoch": 2.83, "learning_rate": 0.0001835, "loss": 0.6159, "mlm_loss": 0.6159, "step": 3670 }, { "ep_loss": 0.0, "epoch": 2.84, "learning_rate": 0.000184, "loss": 0.6067, "mlm_loss": 0.6067, "step": 3680 }, { "ep_loss": 0.0, "epoch": 2.85, "learning_rate": 0.0001845, "loss": 0.6079, "mlm_loss": 0.6079, "step": 3690 }, { "ep_loss": 0.0, "epoch": 2.85, "learning_rate": 0.000185, "loss": 0.6063, "mlm_loss": 0.6063, "step": 3700 }, { "epoch": 2.85, "eval_ep_loss": -2.2521820068359375, "eval_loss": 0.5688546299934387, "eval_mlm_loss": 0.5688546299934387, "eval_runtime": 60.0349, "eval_samples_per_second": 1163.807, "eval_steps_per_second": 0.583, "step": 3700 }, { "ep_loss": 0.0, "epoch": 2.86, "learning_rate": 0.0001855, "loss": 0.6156, "mlm_loss": 0.6156, "step": 3710 }, { "ep_loss": 0.0, "epoch": 2.87, "learning_rate": 0.000186, "loss": 0.6093, "mlm_loss": 0.6093, "step": 3720 }, { "ep_loss": 0.0, "epoch": 2.88, "learning_rate": 0.0001865, "loss": 0.6073, "mlm_loss": 0.6073, "step": 3730 }, { "ep_loss": 0.0, "epoch": 2.88, "learning_rate": 0.000187, "loss": 0.6039, "mlm_loss": 0.6039, "step": 3740 }, { "ep_loss": 0.0, "epoch": 2.89, "learning_rate": 0.0001875, "loss": 0.5977, "mlm_loss": 0.5977, "step": 3750 }, { "ep_loss": 0.0, "epoch": 2.9, "learning_rate": 0.00018800000000000002, "loss": 0.6, "mlm_loss": 0.6, "step": 3760 }, { "ep_loss": 0.0, "epoch": 2.91, "learning_rate": 0.0001885, "loss": 0.6022, "mlm_loss": 0.6022, "step": 3770 }, { "ep_loss": 0.0, "epoch": 2.92, "learning_rate": 0.000189, "loss": 0.6015, "mlm_loss": 0.6015, "step": 3780 }, { "ep_loss": 0.0, "epoch": 2.92, "learning_rate": 0.0001895, "loss": 0.6016, "mlm_loss": 0.6016, "step": 3790 }, { "ep_loss": 0.0, "epoch": 2.93, "learning_rate": 0.00019, "loss": 0.5959, "mlm_loss": 0.5959, "step": 3800 }, { "epoch": 2.93, "eval_ep_loss": -2.366133213043213, "eval_loss": 0.5573724508285522, "eval_mlm_loss": 0.5573724508285522, "eval_runtime": 61.8753, "eval_samples_per_second": 1129.191, "eval_steps_per_second": 0.566, "step": 3800 }, { "ep_loss": 0.0, "epoch": 2.94, "learning_rate": 0.0001905, "loss": 0.6003, "mlm_loss": 0.6003, "step": 3810 }, { "ep_loss": 0.0, "epoch": 2.95, "learning_rate": 0.000191, "loss": 0.5984, "mlm_loss": 0.5984, "step": 3820 }, { "ep_loss": 0.0, "epoch": 2.95, "learning_rate": 0.00019150000000000002, "loss": 0.6005, "mlm_loss": 0.6005, "step": 3830 }, { "ep_loss": 0.0, "epoch": 2.96, "learning_rate": 0.000192, "loss": 0.5895, "mlm_loss": 0.5895, "step": 3840 }, { "ep_loss": 0.0, "epoch": 2.97, "learning_rate": 0.00019250000000000002, "loss": 0.5912, "mlm_loss": 0.5912, "step": 3850 }, { "ep_loss": 0.0, "epoch": 2.98, "learning_rate": 0.000193, "loss": 0.5828, "mlm_loss": 0.5828, "step": 3860 }, { "ep_loss": 0.0, "epoch": 2.98, "learning_rate": 0.00019350000000000001, "loss": 0.5865, "mlm_loss": 0.5865, "step": 3870 }, { "ep_loss": 0.0, "epoch": 2.99, "learning_rate": 0.000194, "loss": 0.5854, "mlm_loss": 0.5854, "step": 3880 }, { "ep_loss": 0.0, "epoch": 3.0, "learning_rate": 0.0001945, "loss": 0.5883, "mlm_loss": 0.5883, "step": 3890 }, { "ep_loss": 0.0, "epoch": 3.01, "learning_rate": 0.00019500000000000002, "loss": 0.5743, "mlm_loss": 0.5743, "step": 3900 }, { "epoch": 3.01, "eval_ep_loss": -2.332629680633545, "eval_loss": 0.551331102848053, "eval_mlm_loss": 0.551331102848053, "eval_runtime": 61.0219, "eval_samples_per_second": 1144.983, "eval_steps_per_second": 0.574, "step": 3900 }, { "ep_loss": 0.0, "epoch": 3.02, "learning_rate": 0.0001955, "loss": 0.5813, "mlm_loss": 0.5813, "step": 3910 }, { "ep_loss": 0.0, "epoch": 3.02, "learning_rate": 0.00019600000000000002, "loss": 0.5813, "mlm_loss": 0.5813, "step": 3920 }, { "ep_loss": 0.0, "epoch": 3.03, "learning_rate": 0.0001965, "loss": 0.5899, "mlm_loss": 0.5899, "step": 3930 }, { "ep_loss": 0.0, "epoch": 3.04, "learning_rate": 0.00019700000000000002, "loss": 0.5849, "mlm_loss": 0.5849, "step": 3940 }, { "ep_loss": 0.0, "epoch": 3.05, "learning_rate": 0.0001975, "loss": 0.5823, "mlm_loss": 0.5823, "step": 3950 }, { "ep_loss": 0.0, "epoch": 3.05, "learning_rate": 0.00019800000000000002, "loss": 0.5794, "mlm_loss": 0.5794, "step": 3960 }, { "ep_loss": 0.0, "epoch": 3.06, "learning_rate": 0.00019850000000000003, "loss": 0.5837, "mlm_loss": 0.5837, "step": 3970 }, { "ep_loss": 0.0, "epoch": 3.07, "learning_rate": 0.000199, "loss": 0.5783, "mlm_loss": 0.5783, "step": 3980 }, { "ep_loss": 0.0, "epoch": 3.08, "learning_rate": 0.00019950000000000002, "loss": 0.5733, "mlm_loss": 0.5733, "step": 3990 }, { "ep_loss": 0.0, "epoch": 3.09, "learning_rate": 0.0002, "loss": 0.5701, "mlm_loss": 0.5701, "step": 4000 }, { "epoch": 3.09, "eval_ep_loss": -2.3982014656066895, "eval_loss": 0.5421211123466492, "eval_mlm_loss": 0.5421211123466492, "eval_runtime": 62.0302, "eval_samples_per_second": 1126.37, "eval_steps_per_second": 0.564, "step": 4000 }, { "ep_loss": 0.0, "epoch": 3.09, "learning_rate": 0.00020050000000000002, "loss": 0.5737, "mlm_loss": 0.5737, "step": 4010 }, { "ep_loss": 0.0, "epoch": 3.1, "learning_rate": 0.000201, "loss": 0.5645, "mlm_loss": 0.5645, "step": 4020 }, { "ep_loss": 0.0, "epoch": 3.11, "learning_rate": 0.00020150000000000002, "loss": 0.5758, "mlm_loss": 0.5758, "step": 4030 }, { "ep_loss": 0.0, "epoch": 3.12, "learning_rate": 0.000202, "loss": 0.5849, "mlm_loss": 0.5849, "step": 4040 }, { "ep_loss": 0.0, "epoch": 3.12, "learning_rate": 0.00020250000000000002, "loss": 0.5667, "mlm_loss": 0.5667, "step": 4050 }, { "ep_loss": 0.0, "epoch": 3.13, "learning_rate": 0.00020300000000000003, "loss": 0.5726, "mlm_loss": 0.5726, "step": 4060 }, { "ep_loss": 0.0, "epoch": 3.14, "learning_rate": 0.00020349999999999999, "loss": 0.5645, "mlm_loss": 0.5645, "step": 4070 }, { "ep_loss": 0.0, "epoch": 3.15, "learning_rate": 0.000204, "loss": 0.5681, "mlm_loss": 0.5681, "step": 4080 }, { "ep_loss": 0.0, "epoch": 3.15, "learning_rate": 0.00020449999999999998, "loss": 0.564, "mlm_loss": 0.564, "step": 4090 }, { "ep_loss": 0.0, "epoch": 3.16, "learning_rate": 0.000205, "loss": 0.5655, "mlm_loss": 0.5655, "step": 4100 }, { "epoch": 3.16, "eval_ep_loss": -2.218219041824341, "eval_loss": 0.5314173698425293, "eval_mlm_loss": 0.5314173698425293, "eval_runtime": 60.8804, "eval_samples_per_second": 1147.644, "eval_steps_per_second": 0.575, "step": 4100 }, { "ep_loss": 0.0, "epoch": 3.17, "learning_rate": 0.00020549999999999998, "loss": 0.5654, "mlm_loss": 0.5654, "step": 4110 }, { "ep_loss": 0.0, "epoch": 3.18, "learning_rate": 0.000206, "loss": 0.5647, "mlm_loss": 0.5647, "step": 4120 }, { "ep_loss": 0.0, "epoch": 3.19, "learning_rate": 0.0002065, "loss": 0.568, "mlm_loss": 0.568, "step": 4130 }, { "ep_loss": 0.0, "epoch": 3.19, "learning_rate": 0.000207, "loss": 0.5538, "mlm_loss": 0.5538, "step": 4140 }, { "ep_loss": 0.0, "epoch": 3.2, "learning_rate": 0.0002075, "loss": 0.5585, "mlm_loss": 0.5585, "step": 4150 }, { "ep_loss": 0.0, "epoch": 3.21, "learning_rate": 0.000208, "loss": 0.5578, "mlm_loss": 0.5578, "step": 4160 }, { "ep_loss": 0.0, "epoch": 3.22, "learning_rate": 0.0002085, "loss": 0.5508, "mlm_loss": 0.5508, "step": 4170 }, { "ep_loss": 0.0, "epoch": 3.22, "learning_rate": 0.00020899999999999998, "loss": 0.5644, "mlm_loss": 0.5644, "step": 4180 }, { "ep_loss": 0.0, "epoch": 3.23, "learning_rate": 0.0002095, "loss": 0.5642, "mlm_loss": 0.5642, "step": 4190 }, { "ep_loss": 0.0, "epoch": 3.24, "learning_rate": 0.00021, "loss": 0.561, "mlm_loss": 0.561, "step": 4200 }, { "epoch": 3.24, "eval_ep_loss": -2.287240743637085, "eval_loss": 0.5224212408065796, "eval_mlm_loss": 0.5224212408065796, "eval_runtime": 61.3268, "eval_samples_per_second": 1139.291, "eval_steps_per_second": 0.571, "step": 4200 }, { "ep_loss": 0.0, "epoch": 3.25, "learning_rate": 0.0002105, "loss": 0.5564, "mlm_loss": 0.5564, "step": 4210 }, { "ep_loss": 0.0, "epoch": 3.25, "learning_rate": 0.000211, "loss": 0.5511, "mlm_loss": 0.5511, "step": 4220 }, { "ep_loss": 0.0, "epoch": 3.26, "learning_rate": 0.0002115, "loss": 0.5578, "mlm_loss": 0.5578, "step": 4230 }, { "ep_loss": 0.0, "epoch": 3.27, "learning_rate": 0.000212, "loss": 0.5531, "mlm_loss": 0.5531, "step": 4240 }, { "ep_loss": 0.0, "epoch": 3.28, "learning_rate": 0.0002125, "loss": 0.5569, "mlm_loss": 0.5569, "step": 4250 }, { "ep_loss": 0.0, "epoch": 3.29, "learning_rate": 0.000213, "loss": 0.5499, "mlm_loss": 0.5499, "step": 4260 }, { "ep_loss": 0.0, "epoch": 3.29, "learning_rate": 0.0002135, "loss": 0.5514, "mlm_loss": 0.5514, "step": 4270 }, { "ep_loss": 0.0, "epoch": 3.3, "learning_rate": 0.000214, "loss": 0.554, "mlm_loss": 0.554, "step": 4280 }, { "ep_loss": 0.0, "epoch": 3.31, "learning_rate": 0.0002145, "loss": 0.5444, "mlm_loss": 0.5444, "step": 4290 }, { "ep_loss": 0.0, "epoch": 3.32, "learning_rate": 0.000215, "loss": 0.5555, "mlm_loss": 0.5555, "step": 4300 }, { "epoch": 3.32, "eval_ep_loss": -2.407238245010376, "eval_loss": 0.5153276920318604, "eval_mlm_loss": 0.5153276920318604, "eval_runtime": 62.01, "eval_samples_per_second": 1126.738, "eval_steps_per_second": 0.564, "step": 4300 }, { "ep_loss": 0.0, "epoch": 3.32, "learning_rate": 0.0002155, "loss": 0.5374, "mlm_loss": 0.5374, "step": 4310 }, { "ep_loss": 0.0, "epoch": 3.33, "learning_rate": 0.000216, "loss": 0.5526, "mlm_loss": 0.5526, "step": 4320 }, { "ep_loss": 0.0, "epoch": 3.34, "learning_rate": 0.0002165, "loss": 0.5451, "mlm_loss": 0.5451, "step": 4330 }, { "ep_loss": 0.0, "epoch": 3.35, "learning_rate": 0.00021700000000000002, "loss": 0.5406, "mlm_loss": 0.5406, "step": 4340 }, { "ep_loss": 0.0, "epoch": 3.36, "learning_rate": 0.0002175, "loss": 0.5339, "mlm_loss": 0.5339, "step": 4350 }, { "ep_loss": 0.0, "epoch": 3.36, "learning_rate": 0.000218, "loss": 0.5434, "mlm_loss": 0.5434, "step": 4360 }, { "ep_loss": 0.0, "epoch": 3.37, "learning_rate": 0.0002185, "loss": 0.546, "mlm_loss": 0.546, "step": 4370 }, { "ep_loss": 0.0, "epoch": 3.38, "learning_rate": 0.000219, "loss": 0.5359, "mlm_loss": 0.5359, "step": 4380 }, { "ep_loss": 0.0, "epoch": 3.39, "learning_rate": 0.0002195, "loss": 0.5405, "mlm_loss": 0.5405, "step": 4390 }, { "ep_loss": 0.0, "epoch": 3.39, "learning_rate": 0.00022, "loss": 0.5506, "mlm_loss": 0.5506, "step": 4400 }, { "epoch": 3.39, "eval_ep_loss": -2.1430230140686035, "eval_loss": 0.5089389681816101, "eval_mlm_loss": 0.5089389681816101, "eval_runtime": 60.2829, "eval_samples_per_second": 1159.018, "eval_steps_per_second": 0.581, "step": 4400 }, { "ep_loss": 0.0, "epoch": 3.4, "learning_rate": 0.0002205, "loss": 0.5425, "mlm_loss": 0.5425, "step": 4410 }, { "ep_loss": 0.0, "epoch": 3.41, "learning_rate": 0.000221, "loss": 0.5443, "mlm_loss": 0.5443, "step": 4420 }, { "ep_loss": 0.0, "epoch": 3.42, "learning_rate": 0.00022150000000000002, "loss": 0.5433, "mlm_loss": 0.5433, "step": 4430 }, { "ep_loss": 0.0, "epoch": 3.42, "learning_rate": 0.000222, "loss": 0.5379, "mlm_loss": 0.5379, "step": 4440 }, { "ep_loss": 0.0, "epoch": 3.43, "learning_rate": 0.00022250000000000001, "loss": 0.5349, "mlm_loss": 0.5349, "step": 4450 }, { "ep_loss": 0.0, "epoch": 3.44, "learning_rate": 0.000223, "loss": 0.5385, "mlm_loss": 0.5385, "step": 4460 }, { "ep_loss": 0.0, "epoch": 3.45, "learning_rate": 0.0002235, "loss": 0.5337, "mlm_loss": 0.5337, "step": 4470 }, { "ep_loss": 0.0, "epoch": 3.46, "learning_rate": 0.000224, "loss": 0.5372, "mlm_loss": 0.5372, "step": 4480 }, { "ep_loss": 0.0, "epoch": 3.46, "learning_rate": 0.0002245, "loss": 0.5342, "mlm_loss": 0.5342, "step": 4490 }, { "ep_loss": 0.0, "epoch": 3.47, "learning_rate": 0.00022500000000000002, "loss": 0.5329, "mlm_loss": 0.5329, "step": 4500 }, { "epoch": 3.47, "eval_ep_loss": -2.252265691757202, "eval_loss": 0.501177966594696, "eval_mlm_loss": 0.501177966594696, "eval_runtime": 61.1269, "eval_samples_per_second": 1143.016, "eval_steps_per_second": 0.573, "step": 4500 }, { "ep_loss": 0.0, "epoch": 3.48, "learning_rate": 0.0002255, "loss": 0.5321, "mlm_loss": 0.5321, "step": 4510 }, { "ep_loss": 0.0, "epoch": 3.49, "learning_rate": 0.00022600000000000002, "loss": 0.5413, "mlm_loss": 0.5413, "step": 4520 }, { "ep_loss": 0.0, "epoch": 3.49, "learning_rate": 0.0002265, "loss": 0.5364, "mlm_loss": 0.5364, "step": 4530 }, { "ep_loss": 0.0, "epoch": 3.5, "learning_rate": 0.00022700000000000002, "loss": 0.5326, "mlm_loss": 0.5326, "step": 4540 }, { "ep_loss": 0.0, "epoch": 3.51, "learning_rate": 0.0002275, "loss": 0.5327, "mlm_loss": 0.5327, "step": 4550 }, { "ep_loss": 0.0, "epoch": 3.52, "learning_rate": 0.000228, "loss": 0.5334, "mlm_loss": 0.5334, "step": 4560 }, { "ep_loss": 0.0, "epoch": 3.52, "learning_rate": 0.00022850000000000002, "loss": 0.5344, "mlm_loss": 0.5344, "step": 4570 }, { "ep_loss": 0.0, "epoch": 3.53, "learning_rate": 0.000229, "loss": 0.5259, "mlm_loss": 0.5259, "step": 4580 }, { "ep_loss": 0.0, "epoch": 3.54, "learning_rate": 0.00022950000000000002, "loss": 0.519, "mlm_loss": 0.519, "step": 4590 }, { "ep_loss": 0.0, "epoch": 3.55, "learning_rate": 0.00023, "loss": 0.5243, "mlm_loss": 0.5243, "step": 4600 }, { "epoch": 3.55, "eval_ep_loss": -2.4623284339904785, "eval_loss": 0.49688172340393066, "eval_mlm_loss": 0.49688172340393066, "eval_runtime": 60.8838, "eval_samples_per_second": 1147.579, "eval_steps_per_second": 0.575, "step": 4600 }, { "ep_loss": 0.0, "epoch": 3.56, "learning_rate": 0.00023050000000000002, "loss": 0.5237, "mlm_loss": 0.5237, "step": 4610 }, { "ep_loss": 0.0, "epoch": 3.56, "learning_rate": 0.000231, "loss": 0.5249, "mlm_loss": 0.5249, "step": 4620 }, { "ep_loss": 0.0, "epoch": 3.57, "learning_rate": 0.00023150000000000002, "loss": 0.5208, "mlm_loss": 0.5208, "step": 4630 }, { "ep_loss": 0.0, "epoch": 3.58, "learning_rate": 0.00023200000000000003, "loss": 0.5213, "mlm_loss": 0.5213, "step": 4640 }, { "ep_loss": 0.0, "epoch": 3.59, "learning_rate": 0.0002325, "loss": 0.5316, "mlm_loss": 0.5316, "step": 4650 }, { "ep_loss": 0.0, "epoch": 3.59, "learning_rate": 0.00023300000000000003, "loss": 0.5231, "mlm_loss": 0.5231, "step": 4660 }, { "ep_loss": 0.0, "epoch": 3.6, "learning_rate": 0.0002335, "loss": 0.524, "mlm_loss": 0.524, "step": 4670 }, { "ep_loss": 0.0, "epoch": 3.61, "learning_rate": 0.00023400000000000002, "loss": 0.5274, "mlm_loss": 0.5274, "step": 4680 }, { "ep_loss": 0.0, "epoch": 3.62, "learning_rate": 0.00023449999999999998, "loss": 0.5186, "mlm_loss": 0.5186, "step": 4690 }, { "ep_loss": 0.0, "epoch": 3.63, "learning_rate": 0.000235, "loss": 0.5201, "mlm_loss": 0.5201, "step": 4700 }, { "epoch": 3.63, "eval_ep_loss": -2.275129556655884, "eval_loss": 0.48909062147140503, "eval_mlm_loss": 0.48909062147140503, "eval_runtime": 63.7661, "eval_samples_per_second": 1095.708, "eval_steps_per_second": 0.549, "step": 4700 }, { "ep_loss": 0.0, "epoch": 3.63, "learning_rate": 0.0002355, "loss": 0.5167, "mlm_loss": 0.5167, "step": 4710 }, { "ep_loss": 0.0, "epoch": 3.64, "learning_rate": 0.000236, "loss": 0.5152, "mlm_loss": 0.5152, "step": 4720 }, { "ep_loss": 0.0, "epoch": 3.65, "learning_rate": 0.0002365, "loss": 0.5201, "mlm_loss": 0.5201, "step": 4730 }, { "ep_loss": 0.0, "epoch": 3.66, "learning_rate": 0.000237, "loss": 0.5234, "mlm_loss": 0.5234, "step": 4740 }, { "ep_loss": 0.0, "epoch": 3.66, "learning_rate": 0.0002375, "loss": 0.5156, "mlm_loss": 0.5156, "step": 4750 }, { "ep_loss": 0.0, "epoch": 3.67, "learning_rate": 0.00023799999999999998, "loss": 0.5189, "mlm_loss": 0.5189, "step": 4760 }, { "ep_loss": 0.0, "epoch": 3.68, "learning_rate": 0.0002385, "loss": 0.5189, "mlm_loss": 0.5189, "step": 4770 }, { "ep_loss": 0.0, "epoch": 3.69, "learning_rate": 0.00023899999999999998, "loss": 0.51, "mlm_loss": 0.51, "step": 4780 }, { "ep_loss": 0.0, "epoch": 3.69, "learning_rate": 0.0002395, "loss": 0.5158, "mlm_loss": 0.5158, "step": 4790 }, { "ep_loss": 0.0, "epoch": 3.7, "learning_rate": 0.00024, "loss": 0.5121, "mlm_loss": 0.5121, "step": 4800 }, { "epoch": 3.7, "eval_ep_loss": -2.0928449630737305, "eval_loss": 0.48291251063346863, "eval_mlm_loss": 0.48291251063346863, "eval_runtime": 60.133, "eval_samples_per_second": 1161.908, "eval_steps_per_second": 0.582, "step": 4800 }, { "ep_loss": 0.0, "epoch": 3.71, "learning_rate": 0.0002405, "loss": 0.5108, "mlm_loss": 0.5108, "step": 4810 }, { "ep_loss": 0.0, "epoch": 3.72, "learning_rate": 0.000241, "loss": 0.5176, "mlm_loss": 0.5176, "step": 4820 }, { "ep_loss": 0.0, "epoch": 3.73, "learning_rate": 0.0002415, "loss": 0.5102, "mlm_loss": 0.5102, "step": 4830 }, { "ep_loss": 0.0, "epoch": 3.73, "learning_rate": 0.000242, "loss": 0.514, "mlm_loss": 0.514, "step": 4840 }, { "ep_loss": 0.0, "epoch": 3.74, "learning_rate": 0.00024249999999999999, "loss": 0.5086, "mlm_loss": 0.5086, "step": 4850 }, { "ep_loss": 0.0, "epoch": 3.75, "learning_rate": 0.000243, "loss": 0.515, "mlm_loss": 0.515, "step": 4860 }, { "ep_loss": 0.0, "epoch": 3.76, "learning_rate": 0.0002435, "loss": 0.5091, "mlm_loss": 0.5091, "step": 4870 }, { "ep_loss": 0.0, "epoch": 3.76, "learning_rate": 0.000244, "loss": 0.514, "mlm_loss": 0.514, "step": 4880 }, { "ep_loss": 0.0, "epoch": 3.77, "learning_rate": 0.0002445, "loss": 0.5051, "mlm_loss": 0.5051, "step": 4890 }, { "ep_loss": 0.0, "epoch": 3.78, "learning_rate": 0.000245, "loss": 0.526, "mlm_loss": 0.526, "step": 4900 }, { "epoch": 3.78, "eval_ep_loss": -2.452838182449341, "eval_loss": 0.4754766523838043, "eval_mlm_loss": 0.4754766523838043, "eval_runtime": 60.9895, "eval_samples_per_second": 1145.591, "eval_steps_per_second": 0.574, "step": 4900 }, { "ep_loss": 0.0, "epoch": 3.79, "learning_rate": 0.0002455, "loss": 0.5104, "mlm_loss": 0.5104, "step": 4910 }, { "ep_loss": 0.0, "epoch": 3.79, "learning_rate": 0.000246, "loss": 0.4975, "mlm_loss": 0.4975, "step": 4920 }, { "ep_loss": 0.0, "epoch": 3.8, "learning_rate": 0.00024650000000000003, "loss": 0.511, "mlm_loss": 0.511, "step": 4930 }, { "ep_loss": 0.0, "epoch": 3.81, "learning_rate": 0.000247, "loss": 0.5081, "mlm_loss": 0.5081, "step": 4940 }, { "ep_loss": 0.0, "epoch": 3.82, "learning_rate": 0.0002475, "loss": 0.5067, "mlm_loss": 0.5067, "step": 4950 }, { "ep_loss": 0.0, "epoch": 3.83, "learning_rate": 0.000248, "loss": 0.5062, "mlm_loss": 0.5062, "step": 4960 }, { "ep_loss": 0.0, "epoch": 3.83, "learning_rate": 0.0002485, "loss": 0.5054, "mlm_loss": 0.5054, "step": 4970 }, { "ep_loss": 0.0, "epoch": 3.84, "learning_rate": 0.000249, "loss": 0.5032, "mlm_loss": 0.5032, "step": 4980 }, { "ep_loss": 0.0, "epoch": 3.85, "learning_rate": 0.0002495, "loss": 0.5022, "mlm_loss": 0.5022, "step": 4990 }, { "ep_loss": 0.0, "epoch": 3.86, "learning_rate": 0.00025, "loss": 0.5077, "mlm_loss": 0.5077, "step": 5000 }, { "epoch": 3.86, "eval_ep_loss": -2.5484063625335693, "eval_loss": 0.47381114959716797, "eval_mlm_loss": 0.47381114959716797, "eval_runtime": 59.7879, "eval_samples_per_second": 1168.615, "eval_steps_per_second": 0.585, "step": 5000 }, { "ep_loss": 0.0, "epoch": 3.86, "learning_rate": 0.0002505, "loss": 0.5129, "mlm_loss": 0.5129, "step": 5010 }, { "ep_loss": 0.0, "epoch": 3.87, "learning_rate": 0.00025100000000000003, "loss": 0.5031, "mlm_loss": 0.5031, "step": 5020 }, { "ep_loss": 0.0, "epoch": 3.88, "learning_rate": 0.0002515, "loss": 0.4941, "mlm_loss": 0.4941, "step": 5030 }, { "ep_loss": 0.0, "epoch": 3.89, "learning_rate": 0.000252, "loss": 0.4976, "mlm_loss": 0.4976, "step": 5040 }, { "ep_loss": 0.0, "epoch": 3.9, "learning_rate": 0.0002525, "loss": 0.5053, "mlm_loss": 0.5053, "step": 5050 }, { "ep_loss": 0.0, "epoch": 3.9, "learning_rate": 0.000253, "loss": 0.495, "mlm_loss": 0.495, "step": 5060 }, { "ep_loss": 0.0, "epoch": 3.91, "learning_rate": 0.0002535, "loss": 0.4972, "mlm_loss": 0.4972, "step": 5070 }, { "ep_loss": 0.0, "epoch": 3.92, "learning_rate": 0.000254, "loss": 0.4934, "mlm_loss": 0.4934, "step": 5080 }, { "ep_loss": 0.0, "epoch": 3.93, "learning_rate": 0.0002545, "loss": 0.4907, "mlm_loss": 0.4907, "step": 5090 }, { "ep_loss": 0.0, "epoch": 3.93, "learning_rate": 0.000255, "loss": 0.4936, "mlm_loss": 0.4936, "step": 5100 }, { "epoch": 3.93, "eval_ep_loss": -2.5289254188537598, "eval_loss": 0.46655842661857605, "eval_mlm_loss": 0.46655842661857605, "eval_runtime": 62.2403, "eval_samples_per_second": 1122.569, "eval_steps_per_second": 0.562, "step": 5100 }, { "ep_loss": 0.0, "epoch": 3.94, "learning_rate": 0.00025550000000000003, "loss": 0.4996, "mlm_loss": 0.4996, "step": 5110 }, { "ep_loss": 0.0, "epoch": 3.95, "learning_rate": 0.000256, "loss": 0.4911, "mlm_loss": 0.4911, "step": 5120 }, { "ep_loss": 0.0, "epoch": 3.96, "learning_rate": 0.0002565, "loss": 0.4994, "mlm_loss": 0.4994, "step": 5130 }, { "ep_loss": 0.0, "epoch": 3.96, "learning_rate": 0.000257, "loss": 0.4899, "mlm_loss": 0.4899, "step": 5140 }, { "ep_loss": 0.0, "epoch": 3.97, "learning_rate": 0.0002575, "loss": 0.5057, "mlm_loss": 0.5057, "step": 5150 }, { "ep_loss": 0.0, "epoch": 3.98, "learning_rate": 0.00025800000000000004, "loss": 0.4845, "mlm_loss": 0.4845, "step": 5160 }, { "ep_loss": 0.0, "epoch": 3.99, "learning_rate": 0.0002585, "loss": 0.4925, "mlm_loss": 0.4925, "step": 5170 }, { "ep_loss": 0.0, "epoch": 4.0, "learning_rate": 0.000259, "loss": 0.4976, "mlm_loss": 0.4976, "step": 5180 }, { "ep_loss": 0.0, "epoch": 4.0, "learning_rate": 0.0002595, "loss": 0.495, "mlm_loss": 0.495, "step": 5190 }, { "ep_loss": 0.0, "epoch": 4.01, "learning_rate": 0.00026000000000000003, "loss": 0.4902, "mlm_loss": 0.4902, "step": 5200 }, { "epoch": 4.01, "eval_ep_loss": -2.2716147899627686, "eval_loss": 0.46275386214256287, "eval_mlm_loss": 0.46275386214256287, "eval_runtime": 59.0432, "eval_samples_per_second": 1183.353, "eval_steps_per_second": 0.593, "step": 5200 }, { "ep_loss": 0.0, "epoch": 4.02, "learning_rate": 0.0002605, "loss": 0.4882, "mlm_loss": 0.4882, "step": 5210 }, { "ep_loss": 0.0, "epoch": 4.03, "learning_rate": 0.000261, "loss": 0.4889, "mlm_loss": 0.4889, "step": 5220 }, { "ep_loss": 0.0, "epoch": 4.03, "learning_rate": 0.0002615, "loss": 0.4967, "mlm_loss": 0.4967, "step": 5230 }, { "ep_loss": 0.0, "epoch": 4.04, "learning_rate": 0.000262, "loss": 0.4841, "mlm_loss": 0.4841, "step": 5240 }, { "ep_loss": 0.0, "epoch": 4.05, "learning_rate": 0.00026250000000000004, "loss": 0.4883, "mlm_loss": 0.4883, "step": 5250 }, { "ep_loss": 0.0, "epoch": 4.06, "learning_rate": 0.000263, "loss": 0.4877, "mlm_loss": 0.4877, "step": 5260 }, { "ep_loss": 0.0, "epoch": 4.06, "learning_rate": 0.0002635, "loss": 0.4785, "mlm_loss": 0.4785, "step": 5270 }, { "ep_loss": 0.0, "epoch": 4.07, "learning_rate": 0.000264, "loss": 0.4896, "mlm_loss": 0.4896, "step": 5280 }, { "ep_loss": 0.0, "epoch": 4.08, "learning_rate": 0.00026450000000000003, "loss": 0.4866, "mlm_loss": 0.4866, "step": 5290 }, { "ep_loss": 0.0, "epoch": 4.09, "learning_rate": 0.00026500000000000004, "loss": 0.4871, "mlm_loss": 0.4871, "step": 5300 }, { "epoch": 4.09, "eval_ep_loss": -2.3992135524749756, "eval_loss": 0.4574643075466156, "eval_mlm_loss": 0.4574643075466156, "eval_runtime": 62.4777, "eval_samples_per_second": 1118.303, "eval_steps_per_second": 0.56, "step": 5300 }, { "ep_loss": 0.0, "epoch": 4.1, "learning_rate": 0.0002655, "loss": 0.4836, "mlm_loss": 0.4836, "step": 5310 }, { "ep_loss": 0.0, "epoch": 4.1, "learning_rate": 0.000266, "loss": 0.487, "mlm_loss": 0.487, "step": 5320 }, { "ep_loss": 0.0, "epoch": 4.11, "learning_rate": 0.0002665, "loss": 0.4922, "mlm_loss": 0.4922, "step": 5330 }, { "ep_loss": 0.0, "epoch": 4.12, "learning_rate": 0.00026700000000000004, "loss": 0.4886, "mlm_loss": 0.4886, "step": 5340 }, { "ep_loss": 0.0, "epoch": 4.13, "learning_rate": 0.0002675, "loss": 0.4774, "mlm_loss": 0.4774, "step": 5350 }, { "ep_loss": 0.0, "epoch": 4.13, "learning_rate": 0.000268, "loss": 0.4833, "mlm_loss": 0.4833, "step": 5360 }, { "ep_loss": 0.0, "epoch": 4.14, "learning_rate": 0.0002685, "loss": 0.4878, "mlm_loss": 0.4878, "step": 5370 }, { "ep_loss": 0.0, "epoch": 4.15, "learning_rate": 0.00026900000000000003, "loss": 0.4778, "mlm_loss": 0.4778, "step": 5380 }, { "ep_loss": 0.0, "epoch": 4.16, "learning_rate": 0.00026950000000000005, "loss": 0.4833, "mlm_loss": 0.4833, "step": 5390 }, { "ep_loss": 0.0, "epoch": 4.17, "learning_rate": 0.00027, "loss": 0.4849, "mlm_loss": 0.4849, "step": 5400 }, { "epoch": 4.17, "eval_ep_loss": -2.320817708969116, "eval_loss": 0.4534033536911011, "eval_mlm_loss": 0.4534033536911011, "eval_runtime": 61.6917, "eval_samples_per_second": 1132.551, "eval_steps_per_second": 0.567, "step": 5400 }, { "ep_loss": 0.0, "epoch": 4.17, "learning_rate": 0.0002705, "loss": 0.4889, "mlm_loss": 0.4889, "step": 5410 }, { "ep_loss": 0.0, "epoch": 4.18, "learning_rate": 0.00027100000000000003, "loss": 0.4793, "mlm_loss": 0.4793, "step": 5420 }, { "ep_loss": 0.0, "epoch": 4.19, "learning_rate": 0.00027150000000000004, "loss": 0.476, "mlm_loss": 0.476, "step": 5430 }, { "ep_loss": 0.0, "epoch": 4.2, "learning_rate": 0.00027200000000000005, "loss": 0.4808, "mlm_loss": 0.4808, "step": 5440 }, { "ep_loss": 0.0, "epoch": 4.2, "learning_rate": 0.0002725, "loss": 0.475, "mlm_loss": 0.475, "step": 5450 }, { "ep_loss": 0.0, "epoch": 4.21, "learning_rate": 0.000273, "loss": 0.4763, "mlm_loss": 0.4763, "step": 5460 }, { "ep_loss": 0.0, "epoch": 4.22, "learning_rate": 0.00027350000000000003, "loss": 0.4796, "mlm_loss": 0.4796, "step": 5470 }, { "ep_loss": 0.0, "epoch": 4.23, "learning_rate": 0.00027400000000000005, "loss": 0.4828, "mlm_loss": 0.4828, "step": 5480 }, { "ep_loss": 0.0, "epoch": 4.23, "learning_rate": 0.0002745, "loss": 0.4799, "mlm_loss": 0.4799, "step": 5490 }, { "ep_loss": 0.0, "epoch": 4.24, "learning_rate": 0.000275, "loss": 0.4769, "mlm_loss": 0.4769, "step": 5500 }, { "epoch": 4.24, "eval_ep_loss": -2.215350389480591, "eval_loss": 0.4512145221233368, "eval_mlm_loss": 0.4512145221233368, "eval_runtime": 60.8844, "eval_samples_per_second": 1147.569, "eval_steps_per_second": 0.575, "step": 5500 }, { "ep_loss": 0.0, "epoch": 4.25, "learning_rate": 0.00027550000000000003, "loss": 0.4799, "mlm_loss": 0.4799, "step": 5510 }, { "ep_loss": 0.0, "epoch": 4.26, "learning_rate": 0.00027600000000000004, "loss": 0.4732, "mlm_loss": 0.4732, "step": 5520 }, { "ep_loss": 0.0, "epoch": 4.27, "learning_rate": 0.00027650000000000005, "loss": 0.4756, "mlm_loss": 0.4756, "step": 5530 }, { "ep_loss": 0.0, "epoch": 4.27, "learning_rate": 0.000277, "loss": 0.4749, "mlm_loss": 0.4749, "step": 5540 }, { "ep_loss": 0.0, "epoch": 4.28, "learning_rate": 0.0002775, "loss": 0.4656, "mlm_loss": 0.4656, "step": 5550 }, { "ep_loss": 0.0, "epoch": 4.29, "learning_rate": 0.00027800000000000004, "loss": 0.473, "mlm_loss": 0.473, "step": 5560 }, { "ep_loss": 0.0, "epoch": 4.3, "learning_rate": 0.00027850000000000005, "loss": 0.4819, "mlm_loss": 0.4819, "step": 5570 }, { "ep_loss": 0.0, "epoch": 4.3, "learning_rate": 0.000279, "loss": 0.4825, "mlm_loss": 0.4825, "step": 5580 }, { "ep_loss": 0.0, "epoch": 4.31, "learning_rate": 0.0002795, "loss": 0.4799, "mlm_loss": 0.4799, "step": 5590 }, { "ep_loss": 0.0, "epoch": 4.32, "learning_rate": 0.00028000000000000003, "loss": 0.4684, "mlm_loss": 0.4684, "step": 5600 }, { "epoch": 4.32, "eval_ep_loss": -2.7064967155456543, "eval_loss": 0.44732606410980225, "eval_mlm_loss": 0.44732606410980225, "eval_runtime": 60.0339, "eval_samples_per_second": 1163.826, "eval_steps_per_second": 0.583, "step": 5600 }, { "ep_loss": 0.0, "epoch": 4.33, "learning_rate": 0.00028050000000000004, "loss": 0.47, "mlm_loss": 0.47, "step": 5610 }, { "ep_loss": 0.0, "epoch": 4.33, "learning_rate": 0.00028100000000000005, "loss": 0.4698, "mlm_loss": 0.4698, "step": 5620 }, { "ep_loss": 0.0, "epoch": 4.34, "learning_rate": 0.00028149999999999996, "loss": 0.4758, "mlm_loss": 0.4758, "step": 5630 }, { "ep_loss": 0.0, "epoch": 4.35, "learning_rate": 0.00028199999999999997, "loss": 0.475, "mlm_loss": 0.475, "step": 5640 }, { "ep_loss": 0.0, "epoch": 4.36, "learning_rate": 0.0002825, "loss": 0.4805, "mlm_loss": 0.4805, "step": 5650 }, { "ep_loss": 0.0, "epoch": 4.37, "learning_rate": 0.000283, "loss": 0.4731, "mlm_loss": 0.4731, "step": 5660 }, { "ep_loss": 0.0, "epoch": 4.37, "learning_rate": 0.0002835, "loss": 0.4738, "mlm_loss": 0.4738, "step": 5670 }, { "ep_loss": 0.0, "epoch": 4.38, "learning_rate": 0.00028399999999999996, "loss": 0.4672, "mlm_loss": 0.4672, "step": 5680 }, { "ep_loss": 0.0, "epoch": 4.39, "learning_rate": 0.0002845, "loss": 0.4614, "mlm_loss": 0.4614, "step": 5690 }, { "ep_loss": 0.0, "epoch": 4.4, "learning_rate": 0.000285, "loss": 0.4717, "mlm_loss": 0.4717, "step": 5700 }, { "epoch": 4.4, "eval_ep_loss": -2.254852294921875, "eval_loss": 0.44317254424095154, "eval_mlm_loss": 0.44317254424095154, "eval_runtime": 59.6572, "eval_samples_per_second": 1171.174, "eval_steps_per_second": 0.587, "step": 5700 }, { "ep_loss": 0.0, "epoch": 4.4, "learning_rate": 0.0002855, "loss": 0.4665, "mlm_loss": 0.4665, "step": 5710 }, { "ep_loss": 0.0, "epoch": 4.41, "learning_rate": 0.00028599999999999996, "loss": 0.4725, "mlm_loss": 0.4725, "step": 5720 }, { "ep_loss": 0.0, "epoch": 4.42, "learning_rate": 0.00028649999999999997, "loss": 0.4688, "mlm_loss": 0.4688, "step": 5730 }, { "ep_loss": 0.0, "epoch": 4.43, "learning_rate": 0.000287, "loss": 0.4742, "mlm_loss": 0.4742, "step": 5740 }, { "ep_loss": 0.0, "epoch": 4.44, "learning_rate": 0.0002875, "loss": 0.4656, "mlm_loss": 0.4656, "step": 5750 }, { "ep_loss": 0.0, "epoch": 4.44, "learning_rate": 0.000288, "loss": 0.4758, "mlm_loss": 0.4758, "step": 5760 }, { "ep_loss": 0.0, "epoch": 4.45, "learning_rate": 0.00028849999999999997, "loss": 0.4705, "mlm_loss": 0.4705, "step": 5770 }, { "ep_loss": 0.0, "epoch": 4.46, "learning_rate": 0.000289, "loss": 0.4655, "mlm_loss": 0.4655, "step": 5780 }, { "ep_loss": 0.0, "epoch": 4.47, "learning_rate": 0.0002895, "loss": 0.4636, "mlm_loss": 0.4636, "step": 5790 }, { "ep_loss": 0.0, "epoch": 4.47, "learning_rate": 0.00029, "loss": 0.4646, "mlm_loss": 0.4646, "step": 5800 }, { "epoch": 4.47, "eval_ep_loss": -2.2536895275115967, "eval_loss": 0.43992048501968384, "eval_mlm_loss": 0.43992048501968384, "eval_runtime": 62.7548, "eval_samples_per_second": 1113.364, "eval_steps_per_second": 0.558, "step": 5800 }, { "ep_loss": 0.0, "epoch": 4.48, "learning_rate": 0.00029049999999999996, "loss": 0.4691, "mlm_loss": 0.4691, "step": 5810 }, { "ep_loss": 0.0, "epoch": 4.49, "learning_rate": 0.00029099999999999997, "loss": 0.4634, "mlm_loss": 0.4634, "step": 5820 }, { "ep_loss": 0.0, "epoch": 4.5, "learning_rate": 0.0002915, "loss": 0.46, "mlm_loss": 0.46, "step": 5830 }, { "ep_loss": 0.0, "epoch": 4.5, "learning_rate": 0.000292, "loss": 0.4648, "mlm_loss": 0.4648, "step": 5840 }, { "ep_loss": 0.0, "epoch": 4.51, "learning_rate": 0.0002925, "loss": 0.4624, "mlm_loss": 0.4624, "step": 5850 }, { "ep_loss": 0.0, "epoch": 4.52, "learning_rate": 0.00029299999999999997, "loss": 0.4649, "mlm_loss": 0.4649, "step": 5860 }, { "ep_loss": 0.0, "epoch": 4.53, "learning_rate": 0.0002935, "loss": 0.4626, "mlm_loss": 0.4626, "step": 5870 }, { "ep_loss": 0.0, "epoch": 4.54, "learning_rate": 0.000294, "loss": 0.4648, "mlm_loss": 0.4648, "step": 5880 }, { "ep_loss": 0.0, "epoch": 4.54, "learning_rate": 0.0002945, "loss": 0.4694, "mlm_loss": 0.4694, "step": 5890 }, { "ep_loss": 0.0, "epoch": 4.55, "learning_rate": 0.000295, "loss": 0.4699, "mlm_loss": 0.4699, "step": 5900 }, { "epoch": 4.55, "eval_ep_loss": -2.540950298309326, "eval_loss": 0.4340103566646576, "eval_mlm_loss": 0.4340103566646576, "eval_runtime": 60.4098, "eval_samples_per_second": 1156.584, "eval_steps_per_second": 0.579, "step": 5900 }, { "ep_loss": 0.0, "epoch": 4.56, "learning_rate": 0.00029549999999999997, "loss": 0.4615, "mlm_loss": 0.4615, "step": 5910 }, { "ep_loss": 0.0, "epoch": 4.57, "learning_rate": 0.000296, "loss": 0.4628, "mlm_loss": 0.4628, "step": 5920 }, { "ep_loss": 0.0, "epoch": 4.57, "learning_rate": 0.0002965, "loss": 0.4614, "mlm_loss": 0.4614, "step": 5930 }, { "ep_loss": 0.0, "epoch": 4.58, "learning_rate": 0.000297, "loss": 0.4584, "mlm_loss": 0.4584, "step": 5940 }, { "ep_loss": 0.0, "epoch": 4.59, "learning_rate": 0.00029749999999999997, "loss": 0.4624, "mlm_loss": 0.4624, "step": 5950 }, { "ep_loss": 0.0, "epoch": 4.6, "learning_rate": 0.000298, "loss": 0.4559, "mlm_loss": 0.4559, "step": 5960 }, { "ep_loss": 0.0, "epoch": 4.6, "learning_rate": 0.0002985, "loss": 0.4583, "mlm_loss": 0.4583, "step": 5970 }, { "ep_loss": 0.0, "epoch": 4.61, "learning_rate": 0.000299, "loss": 0.4641, "mlm_loss": 0.4641, "step": 5980 }, { "ep_loss": 0.0, "epoch": 4.62, "learning_rate": 0.0002995, "loss": 0.4639, "mlm_loss": 0.4639, "step": 5990 }, { "ep_loss": 0.0, "epoch": 4.63, "learning_rate": 0.0003, "loss": 0.4632, "mlm_loss": 0.4632, "step": 6000 }, { "epoch": 4.63, "eval_ep_loss": -2.32681941986084, "eval_loss": 0.4312271177768707, "eval_mlm_loss": 0.4312271177768707, "eval_runtime": 60.5376, "eval_samples_per_second": 1154.142, "eval_steps_per_second": 0.578, "step": 6000 }, { "ep_loss": 0.0, "epoch": 4.64, "learning_rate": 0.0003005, "loss": 0.4618, "mlm_loss": 0.4618, "step": 6010 }, { "ep_loss": 0.0, "epoch": 4.64, "learning_rate": 0.000301, "loss": 0.4553, "mlm_loss": 0.4553, "step": 6020 }, { "ep_loss": 0.0, "epoch": 4.65, "learning_rate": 0.0003015, "loss": 0.4649, "mlm_loss": 0.4649, "step": 6030 }, { "ep_loss": 0.0, "epoch": 4.66, "learning_rate": 0.000302, "loss": 0.4569, "mlm_loss": 0.4569, "step": 6040 }, { "ep_loss": 0.0, "epoch": 4.67, "learning_rate": 0.0003025, "loss": 0.4658, "mlm_loss": 0.4658, "step": 6050 }, { "ep_loss": 0.0, "epoch": 4.67, "learning_rate": 0.000303, "loss": 0.4571, "mlm_loss": 0.4571, "step": 6060 }, { "ep_loss": 0.0, "epoch": 4.68, "learning_rate": 0.0003035, "loss": 0.4574, "mlm_loss": 0.4574, "step": 6070 }, { "ep_loss": 0.0, "epoch": 4.69, "learning_rate": 0.000304, "loss": 0.4539, "mlm_loss": 0.4539, "step": 6080 }, { "ep_loss": 0.0, "epoch": 4.7, "learning_rate": 0.0003045, "loss": 0.4544, "mlm_loss": 0.4544, "step": 6090 }, { "ep_loss": 0.0, "epoch": 4.7, "learning_rate": 0.000305, "loss": 0.4532, "mlm_loss": 0.4532, "step": 6100 }, { "epoch": 4.7, "eval_ep_loss": -2.5871927738189697, "eval_loss": 0.4311540722846985, "eval_mlm_loss": 0.4311540722846985, "eval_runtime": 62.3642, "eval_samples_per_second": 1120.337, "eval_steps_per_second": 0.561, "step": 6100 }, { "ep_loss": 0.0, "epoch": 4.71, "learning_rate": 0.0003055, "loss": 0.4553, "mlm_loss": 0.4553, "step": 6110 }, { "ep_loss": 0.0, "epoch": 4.72, "learning_rate": 0.000306, "loss": 0.4557, "mlm_loss": 0.4557, "step": 6120 }, { "ep_loss": 0.0, "epoch": 4.73, "learning_rate": 0.0003065, "loss": 0.4568, "mlm_loss": 0.4568, "step": 6130 }, { "ep_loss": 0.0, "epoch": 4.74, "learning_rate": 0.000307, "loss": 0.4633, "mlm_loss": 0.4633, "step": 6140 }, { "ep_loss": 0.0, "epoch": 4.74, "learning_rate": 0.0003075, "loss": 0.4517, "mlm_loss": 0.4517, "step": 6150 }, { "ep_loss": 0.0, "epoch": 4.75, "learning_rate": 0.000308, "loss": 0.4539, "mlm_loss": 0.4539, "step": 6160 }, { "ep_loss": 0.0, "epoch": 4.76, "learning_rate": 0.0003085, "loss": 0.4606, "mlm_loss": 0.4606, "step": 6170 }, { "ep_loss": 0.0, "epoch": 4.77, "learning_rate": 0.00030900000000000003, "loss": 0.4502, "mlm_loss": 0.4502, "step": 6180 }, { "ep_loss": 0.0, "epoch": 4.77, "learning_rate": 0.0003095, "loss": 0.453, "mlm_loss": 0.453, "step": 6190 }, { "ep_loss": 0.0, "epoch": 4.78, "learning_rate": 0.00031, "loss": 0.4499, "mlm_loss": 0.4499, "step": 6200 }, { "epoch": 4.78, "eval_ep_loss": -2.451572895050049, "eval_loss": 0.42733505368232727, "eval_mlm_loss": 0.42733505368232727, "eval_runtime": 59.9548, "eval_samples_per_second": 1165.361, "eval_steps_per_second": 0.584, "step": 6200 }, { "ep_loss": 0.0, "epoch": 4.79, "learning_rate": 0.0003105, "loss": 0.4582, "mlm_loss": 0.4582, "step": 6210 }, { "ep_loss": 0.0, "epoch": 4.8, "learning_rate": 0.000311, "loss": 0.4587, "mlm_loss": 0.4587, "step": 6220 }, { "ep_loss": 0.0, "epoch": 4.81, "learning_rate": 0.0003115, "loss": 0.4557, "mlm_loss": 0.4557, "step": 6230 }, { "ep_loss": 0.0, "epoch": 4.81, "learning_rate": 0.000312, "loss": 0.4503, "mlm_loss": 0.4503, "step": 6240 }, { "ep_loss": 0.0, "epoch": 4.82, "learning_rate": 0.0003125, "loss": 0.4513, "mlm_loss": 0.4513, "step": 6250 }, { "ep_loss": 0.0, "epoch": 4.83, "learning_rate": 0.000313, "loss": 0.4532, "mlm_loss": 0.4532, "step": 6260 }, { "ep_loss": 0.0, "epoch": 4.84, "learning_rate": 0.00031350000000000003, "loss": 0.4535, "mlm_loss": 0.4535, "step": 6270 }, { "ep_loss": 0.0, "epoch": 4.84, "learning_rate": 0.000314, "loss": 0.453, "mlm_loss": 0.453, "step": 6280 }, { "ep_loss": 0.0, "epoch": 4.85, "learning_rate": 0.0003145, "loss": 0.4541, "mlm_loss": 0.4541, "step": 6290 }, { "ep_loss": 0.0, "epoch": 4.86, "learning_rate": 0.000315, "loss": 0.4532, "mlm_loss": 0.4532, "step": 6300 }, { "epoch": 4.86, "eval_ep_loss": -2.6052770614624023, "eval_loss": 0.4238816797733307, "eval_mlm_loss": 0.4238816797733307, "eval_runtime": 60.9957, "eval_samples_per_second": 1145.474, "eval_steps_per_second": 0.574, "step": 6300 }, { "ep_loss": 0.0, "epoch": 4.87, "learning_rate": 0.0003155, "loss": 0.4505, "mlm_loss": 0.4505, "step": 6310 }, { "ep_loss": 0.0, "epoch": 4.87, "learning_rate": 0.000316, "loss": 0.4476, "mlm_loss": 0.4476, "step": 6320 }, { "ep_loss": 0.0, "epoch": 4.88, "learning_rate": 0.0003165, "loss": 0.4495, "mlm_loss": 0.4495, "step": 6330 }, { "ep_loss": 0.0, "epoch": 4.89, "learning_rate": 0.000317, "loss": 0.4483, "mlm_loss": 0.4483, "step": 6340 }, { "ep_loss": 0.0, "epoch": 4.9, "learning_rate": 0.0003175, "loss": 0.4498, "mlm_loss": 0.4498, "step": 6350 }, { "ep_loss": 0.0, "epoch": 4.91, "learning_rate": 0.00031800000000000003, "loss": 0.4505, "mlm_loss": 0.4505, "step": 6360 }, { "ep_loss": 0.0, "epoch": 4.91, "learning_rate": 0.0003185, "loss": 0.4362, "mlm_loss": 0.4362, "step": 6370 }, { "ep_loss": 0.0, "epoch": 4.92, "learning_rate": 0.000319, "loss": 0.449, "mlm_loss": 0.449, "step": 6380 }, { "ep_loss": 0.0, "epoch": 4.93, "learning_rate": 0.0003195, "loss": 0.4558, "mlm_loss": 0.4558, "step": 6390 }, { "ep_loss": 0.0, "epoch": 4.94, "learning_rate": 0.00032, "loss": 0.4484, "mlm_loss": 0.4484, "step": 6400 }, { "epoch": 4.94, "eval_ep_loss": -2.464764356613159, "eval_loss": 0.42165419459342957, "eval_mlm_loss": 0.42165419459342957, "eval_runtime": 60.6893, "eval_samples_per_second": 1151.257, "eval_steps_per_second": 0.577, "step": 6400 }, { "ep_loss": 0.0, "epoch": 4.94, "learning_rate": 0.00032050000000000004, "loss": 0.4442, "mlm_loss": 0.4442, "step": 6410 }, { "ep_loss": 0.0, "epoch": 4.95, "learning_rate": 0.000321, "loss": 0.4494, "mlm_loss": 0.4494, "step": 6420 }, { "ep_loss": 0.0, "epoch": 4.96, "learning_rate": 0.0003215, "loss": 0.4492, "mlm_loss": 0.4492, "step": 6430 }, { "ep_loss": 0.0, "epoch": 4.97, "learning_rate": 0.000322, "loss": 0.4481, "mlm_loss": 0.4481, "step": 6440 }, { "ep_loss": 0.0, "epoch": 4.97, "learning_rate": 0.00032250000000000003, "loss": 0.4499, "mlm_loss": 0.4499, "step": 6450 }, { "ep_loss": 0.0, "epoch": 4.98, "learning_rate": 0.000323, "loss": 0.454, "mlm_loss": 0.454, "step": 6460 }, { "ep_loss": 0.0, "epoch": 4.99, "learning_rate": 0.0003235, "loss": 0.4466, "mlm_loss": 0.4466, "step": 6470 }, { "ep_loss": 0.0, "epoch": 5.0, "learning_rate": 0.000324, "loss": 0.4473, "mlm_loss": 0.4473, "step": 6480 }, { "ep_loss": 0.0, "epoch": 5.01, "learning_rate": 0.00032450000000000003, "loss": 0.445, "mlm_loss": 0.445, "step": 6490 }, { "ep_loss": 0.0, "epoch": 5.01, "learning_rate": 0.00032500000000000004, "loss": 0.439, "mlm_loss": 0.439, "step": 6500 }, { "epoch": 5.01, "eval_ep_loss": -2.375345230102539, "eval_loss": 0.4176154136657715, "eval_mlm_loss": 0.4176154136657715, "eval_runtime": 61.6206, "eval_samples_per_second": 1133.858, "eval_steps_per_second": 0.568, "step": 6500 }, { "ep_loss": 0.0, "epoch": 5.02, "learning_rate": 0.0003255, "loss": 0.4483, "mlm_loss": 0.4483, "step": 6510 }, { "ep_loss": 0.0, "epoch": 5.03, "learning_rate": 0.000326, "loss": 0.4435, "mlm_loss": 0.4435, "step": 6520 }, { "ep_loss": 0.0, "epoch": 5.04, "learning_rate": 0.0003265, "loss": 0.4455, "mlm_loss": 0.4455, "step": 6530 }, { "ep_loss": 0.0, "epoch": 5.04, "learning_rate": 0.00032700000000000003, "loss": 0.4464, "mlm_loss": 0.4464, "step": 6540 }, { "ep_loss": 0.0, "epoch": 5.05, "learning_rate": 0.00032750000000000005, "loss": 0.4434, "mlm_loss": 0.4434, "step": 6550 }, { "ep_loss": 0.0, "epoch": 5.06, "learning_rate": 0.000328, "loss": 0.4363, "mlm_loss": 0.4363, "step": 6560 }, { "ep_loss": 0.0, "epoch": 5.07, "learning_rate": 0.0003285, "loss": 0.4354, "mlm_loss": 0.4354, "step": 6570 }, { "ep_loss": 0.0, "epoch": 5.08, "learning_rate": 0.00032900000000000003, "loss": 0.4485, "mlm_loss": 0.4485, "step": 6580 }, { "ep_loss": 0.0, "epoch": 5.08, "learning_rate": 0.00032950000000000004, "loss": 0.445, "mlm_loss": 0.445, "step": 6590 }, { "ep_loss": 0.0, "epoch": 5.09, "learning_rate": 0.00033, "loss": 0.4461, "mlm_loss": 0.4461, "step": 6600 }, { "epoch": 5.09, "eval_ep_loss": -2.5544066429138184, "eval_loss": 0.41740256547927856, "eval_mlm_loss": 0.41740256547927856, "eval_runtime": 59.4616, "eval_samples_per_second": 1175.028, "eval_steps_per_second": 0.589, "step": 6600 }, { "ep_loss": 0.0, "epoch": 5.1, "learning_rate": 0.0003305, "loss": 0.4421, "mlm_loss": 0.4421, "step": 6610 }, { "ep_loss": 0.0, "epoch": 5.11, "learning_rate": 0.000331, "loss": 0.4406, "mlm_loss": 0.4406, "step": 6620 }, { "ep_loss": 0.0, "epoch": 5.11, "learning_rate": 0.00033150000000000003, "loss": 0.4488, "mlm_loss": 0.4488, "step": 6630 }, { "ep_loss": 0.0, "epoch": 5.12, "learning_rate": 0.00033200000000000005, "loss": 0.436, "mlm_loss": 0.436, "step": 6640 }, { "ep_loss": 0.0, "epoch": 5.13, "learning_rate": 0.0003325, "loss": 0.4398, "mlm_loss": 0.4398, "step": 6650 }, { "ep_loss": 0.0, "epoch": 5.14, "learning_rate": 0.000333, "loss": 0.4381, "mlm_loss": 0.4381, "step": 6660 }, { "ep_loss": 0.0, "epoch": 5.14, "learning_rate": 0.00033350000000000003, "loss": 0.4479, "mlm_loss": 0.4479, "step": 6670 }, { "ep_loss": 0.0, "epoch": 5.15, "learning_rate": 0.00033400000000000004, "loss": 0.4409, "mlm_loss": 0.4409, "step": 6680 }, { "ep_loss": 0.0, "epoch": 5.16, "learning_rate": 0.00033450000000000005, "loss": 0.4395, "mlm_loss": 0.4395, "step": 6690 }, { "ep_loss": 0.0, "epoch": 5.17, "learning_rate": 0.000335, "loss": 0.4415, "mlm_loss": 0.4415, "step": 6700 }, { "epoch": 5.17, "eval_ep_loss": -2.5074503421783447, "eval_loss": 0.41386091709136963, "eval_mlm_loss": 0.41386091709136963, "eval_runtime": 61.5941, "eval_samples_per_second": 1134.346, "eval_steps_per_second": 0.568, "step": 6700 }, { "ep_loss": 0.0, "epoch": 5.18, "learning_rate": 0.0003355, "loss": 0.443, "mlm_loss": 0.443, "step": 6710 }, { "ep_loss": 0.0, "epoch": 5.18, "learning_rate": 0.00033600000000000004, "loss": 0.4355, "mlm_loss": 0.4355, "step": 6720 }, { "ep_loss": 0.0, "epoch": 5.19, "learning_rate": 0.00033650000000000005, "loss": 0.4301, "mlm_loss": 0.4301, "step": 6730 }, { "ep_loss": 0.0, "epoch": 5.2, "learning_rate": 0.000337, "loss": 0.4414, "mlm_loss": 0.4414, "step": 6740 }, { "ep_loss": 0.0, "epoch": 5.21, "learning_rate": 0.0003375, "loss": 0.4368, "mlm_loss": 0.4368, "step": 6750 }, { "ep_loss": 0.0, "epoch": 5.21, "learning_rate": 0.00033800000000000003, "loss": 0.4378, "mlm_loss": 0.4378, "step": 6760 }, { "ep_loss": 0.0, "epoch": 5.22, "learning_rate": 0.00033850000000000004, "loss": 0.4402, "mlm_loss": 0.4402, "step": 6770 }, { "ep_loss": 0.0, "epoch": 5.23, "learning_rate": 0.00033900000000000005, "loss": 0.439, "mlm_loss": 0.439, "step": 6780 }, { "ep_loss": 0.0, "epoch": 5.24, "learning_rate": 0.0003395, "loss": 0.4454, "mlm_loss": 0.4454, "step": 6790 }, { "ep_loss": 0.0, "epoch": 5.24, "learning_rate": 0.00034, "loss": 0.4379, "mlm_loss": 0.4379, "step": 6800 }, { "epoch": 5.24, "eval_ep_loss": -2.6114962100982666, "eval_loss": 0.4116796851158142, "eval_mlm_loss": 0.4116796851158142, "eval_runtime": 61.6861, "eval_samples_per_second": 1132.654, "eval_steps_per_second": 0.567, "step": 6800 }, { "ep_loss": 0.0, "epoch": 5.25, "learning_rate": 0.00034050000000000004, "loss": 0.4407, "mlm_loss": 0.4407, "step": 6810 }, { "ep_loss": 0.0, "epoch": 5.26, "learning_rate": 0.00034100000000000005, "loss": 0.4312, "mlm_loss": 0.4312, "step": 6820 }, { "ep_loss": 0.0, "epoch": 5.27, "learning_rate": 0.0003415, "loss": 0.4327, "mlm_loss": 0.4327, "step": 6830 }, { "ep_loss": 0.0, "epoch": 5.28, "learning_rate": 0.000342, "loss": 0.4405, "mlm_loss": 0.4405, "step": 6840 }, { "ep_loss": 0.0, "epoch": 5.28, "learning_rate": 0.00034250000000000003, "loss": 0.4395, "mlm_loss": 0.4395, "step": 6850 }, { "ep_loss": 0.0, "epoch": 5.29, "learning_rate": 0.00034300000000000004, "loss": 0.4398, "mlm_loss": 0.4398, "step": 6860 }, { "ep_loss": 0.0, "epoch": 5.3, "learning_rate": 0.00034350000000000006, "loss": 0.4372, "mlm_loss": 0.4372, "step": 6870 }, { "ep_loss": 0.0, "epoch": 5.31, "learning_rate": 0.00034399999999999996, "loss": 0.4438, "mlm_loss": 0.4438, "step": 6880 }, { "ep_loss": 0.0, "epoch": 5.31, "learning_rate": 0.00034449999999999997, "loss": 0.4356, "mlm_loss": 0.4356, "step": 6890 }, { "ep_loss": 0.0, "epoch": 5.32, "learning_rate": 0.000345, "loss": 0.4357, "mlm_loss": 0.4357, "step": 6900 }, { "epoch": 5.32, "eval_ep_loss": -2.2833127975463867, "eval_loss": 0.40905898809432983, "eval_mlm_loss": 0.40905898809432983, "eval_runtime": 61.29, "eval_samples_per_second": 1139.974, "eval_steps_per_second": 0.571, "step": 6900 }, { "ep_loss": 0.0, "epoch": 5.33, "learning_rate": 0.0003455, "loss": 0.4357, "mlm_loss": 0.4357, "step": 6910 }, { "ep_loss": 0.0, "epoch": 5.34, "learning_rate": 0.000346, "loss": 0.4243, "mlm_loss": 0.4243, "step": 6920 }, { "ep_loss": 0.0, "epoch": 5.35, "learning_rate": 0.00034649999999999997, "loss": 0.4404, "mlm_loss": 0.4404, "step": 6930 }, { "ep_loss": 0.0, "epoch": 5.35, "learning_rate": 0.000347, "loss": 0.4319, "mlm_loss": 0.4319, "step": 6940 }, { "ep_loss": 0.0, "epoch": 5.36, "learning_rate": 0.0003475, "loss": 0.4362, "mlm_loss": 0.4362, "step": 6950 }, { "ep_loss": 0.0, "epoch": 5.37, "learning_rate": 0.000348, "loss": 0.4355, "mlm_loss": 0.4355, "step": 6960 }, { "ep_loss": 0.0, "epoch": 5.38, "learning_rate": 0.00034849999999999996, "loss": 0.43, "mlm_loss": 0.43, "step": 6970 }, { "ep_loss": 0.0, "epoch": 5.38, "learning_rate": 0.00034899999999999997, "loss": 0.4342, "mlm_loss": 0.4342, "step": 6980 }, { "ep_loss": 0.0, "epoch": 5.39, "learning_rate": 0.0003495, "loss": 0.4345, "mlm_loss": 0.4345, "step": 6990 }, { "ep_loss": 0.0, "epoch": 5.4, "learning_rate": 0.00035, "loss": 0.4324, "mlm_loss": 0.4324, "step": 7000 }, { "epoch": 5.4, "eval_ep_loss": -2.64093279838562, "eval_loss": 0.4105168282985687, "eval_mlm_loss": 0.4105168282985687, "eval_runtime": 60.7916, "eval_samples_per_second": 1149.32, "eval_steps_per_second": 0.576, "step": 7000 }, { "ep_loss": 0.0, "epoch": 5.41, "learning_rate": 0.0003505, "loss": 0.4319, "mlm_loss": 0.4319, "step": 7010 }, { "ep_loss": 0.0, "epoch": 5.41, "learning_rate": 0.00035099999999999997, "loss": 0.4372, "mlm_loss": 0.4372, "step": 7020 }, { "ep_loss": 0.0, "epoch": 5.42, "learning_rate": 0.0003515, "loss": 0.4342, "mlm_loss": 0.4342, "step": 7030 }, { "ep_loss": 0.0, "epoch": 5.43, "learning_rate": 0.000352, "loss": 0.4308, "mlm_loss": 0.4308, "step": 7040 }, { "ep_loss": 0.0, "epoch": 5.44, "learning_rate": 0.0003525, "loss": 0.4373, "mlm_loss": 0.4373, "step": 7050 }, { "ep_loss": 0.0, "epoch": 5.45, "learning_rate": 0.00035299999999999996, "loss": 0.4271, "mlm_loss": 0.4271, "step": 7060 }, { "ep_loss": 0.0, "epoch": 5.45, "learning_rate": 0.0003535, "loss": 0.4355, "mlm_loss": 0.4355, "step": 7070 }, { "ep_loss": 0.0, "epoch": 5.46, "learning_rate": 0.000354, "loss": 0.4359, "mlm_loss": 0.4359, "step": 7080 }, { "ep_loss": 0.0, "epoch": 5.47, "learning_rate": 0.0003545, "loss": 0.436, "mlm_loss": 0.436, "step": 7090 }, { "ep_loss": 0.0, "epoch": 5.48, "learning_rate": 0.000355, "loss": 0.4362, "mlm_loss": 0.4362, "step": 7100 }, { "epoch": 5.48, "eval_ep_loss": -2.3400020599365234, "eval_loss": 0.4073105752468109, "eval_mlm_loss": 0.4073105752468109, "eval_runtime": 63.2925, "eval_samples_per_second": 1103.907, "eval_steps_per_second": 0.553, "step": 7100 }, { "ep_loss": 0.0, "epoch": 5.48, "learning_rate": 0.00035549999999999997, "loss": 0.4321, "mlm_loss": 0.4321, "step": 7110 }, { "ep_loss": 0.0, "epoch": 5.49, "learning_rate": 0.000356, "loss": 0.4343, "mlm_loss": 0.4343, "step": 7120 }, { "ep_loss": 0.0, "epoch": 5.5, "learning_rate": 0.0003565, "loss": 0.4307, "mlm_loss": 0.4307, "step": 7130 }, { "ep_loss": 0.0, "epoch": 5.51, "learning_rate": 0.000357, "loss": 0.4284, "mlm_loss": 0.4284, "step": 7140 }, { "ep_loss": 0.0, "epoch": 5.51, "learning_rate": 0.0003575, "loss": 0.4262, "mlm_loss": 0.4262, "step": 7150 }, { "ep_loss": 0.0, "epoch": 5.52, "learning_rate": 0.000358, "loss": 0.4248, "mlm_loss": 0.4248, "step": 7160 }, { "ep_loss": 0.0, "epoch": 5.53, "learning_rate": 0.0003585, "loss": 0.4419, "mlm_loss": 0.4419, "step": 7170 }, { "ep_loss": 0.0, "epoch": 5.54, "learning_rate": 0.000359, "loss": 0.4267, "mlm_loss": 0.4267, "step": 7180 }, { "ep_loss": 0.0, "epoch": 5.55, "learning_rate": 0.0003595, "loss": 0.4337, "mlm_loss": 0.4337, "step": 7190 }, { "ep_loss": 0.0, "epoch": 5.55, "learning_rate": 0.00035999999999999997, "loss": 0.4264, "mlm_loss": 0.4264, "step": 7200 }, { "epoch": 5.55, "eval_ep_loss": -2.597167491912842, "eval_loss": 0.40466931462287903, "eval_mlm_loss": 0.40466931462287903, "eval_runtime": 61.8389, "eval_samples_per_second": 1129.855, "eval_steps_per_second": 0.566, "step": 7200 }, { "ep_loss": 0.0, "epoch": 5.56, "learning_rate": 0.0003605, "loss": 0.4232, "mlm_loss": 0.4232, "step": 7210 }, { "ep_loss": 0.0, "epoch": 5.57, "learning_rate": 0.000361, "loss": 0.4253, "mlm_loss": 0.4253, "step": 7220 }, { "ep_loss": 0.0, "epoch": 5.58, "learning_rate": 0.0003615, "loss": 0.437, "mlm_loss": 0.437, "step": 7230 }, { "ep_loss": 0.0, "epoch": 5.58, "learning_rate": 0.000362, "loss": 0.4319, "mlm_loss": 0.4319, "step": 7240 }, { "ep_loss": 0.0, "epoch": 5.59, "learning_rate": 0.0003625, "loss": 0.4257, "mlm_loss": 0.4257, "step": 7250 }, { "ep_loss": 0.0, "epoch": 5.6, "learning_rate": 0.000363, "loss": 0.4318, "mlm_loss": 0.4318, "step": 7260 }, { "ep_loss": 0.0, "epoch": 5.61, "learning_rate": 0.0003635, "loss": 0.4308, "mlm_loss": 0.4308, "step": 7270 }, { "ep_loss": 0.0, "epoch": 5.62, "learning_rate": 0.000364, "loss": 0.4304, "mlm_loss": 0.4304, "step": 7280 }, { "ep_loss": 0.0, "epoch": 5.62, "learning_rate": 0.0003645, "loss": 0.424, "mlm_loss": 0.424, "step": 7290 }, { "ep_loss": 0.0, "epoch": 5.63, "learning_rate": 0.000365, "loss": 0.4315, "mlm_loss": 0.4315, "step": 7300 }, { "epoch": 5.63, "eval_ep_loss": -2.466102361679077, "eval_loss": 0.40299102663993835, "eval_mlm_loss": 0.40299102663993835, "eval_runtime": 59.5652, "eval_samples_per_second": 1172.984, "eval_steps_per_second": 0.588, "step": 7300 }, { "ep_loss": 0.0, "epoch": 5.64, "learning_rate": 0.0003655, "loss": 0.4305, "mlm_loss": 0.4305, "step": 7310 }, { "ep_loss": 0.0, "epoch": 5.65, "learning_rate": 0.000366, "loss": 0.4215, "mlm_loss": 0.4215, "step": 7320 }, { "ep_loss": 0.0, "epoch": 5.65, "learning_rate": 0.0003665, "loss": 0.4314, "mlm_loss": 0.4314, "step": 7330 }, { "ep_loss": 0.0, "epoch": 5.66, "learning_rate": 0.000367, "loss": 0.4334, "mlm_loss": 0.4334, "step": 7340 }, { "ep_loss": 0.0, "epoch": 5.67, "learning_rate": 0.0003675, "loss": 0.4294, "mlm_loss": 0.4294, "step": 7350 }, { "ep_loss": 0.0, "epoch": 5.68, "learning_rate": 0.000368, "loss": 0.4212, "mlm_loss": 0.4212, "step": 7360 }, { "ep_loss": 0.0, "epoch": 5.68, "learning_rate": 0.0003685, "loss": 0.4282, "mlm_loss": 0.4282, "step": 7370 }, { "ep_loss": 0.0, "epoch": 5.69, "learning_rate": 0.000369, "loss": 0.4249, "mlm_loss": 0.4249, "step": 7380 }, { "ep_loss": 0.0, "epoch": 5.7, "learning_rate": 0.0003695, "loss": 0.4236, "mlm_loss": 0.4236, "step": 7390 }, { "ep_loss": 0.0, "epoch": 5.71, "learning_rate": 0.00037, "loss": 0.4287, "mlm_loss": 0.4287, "step": 7400 }, { "epoch": 5.71, "eval_ep_loss": -2.350245475769043, "eval_loss": 0.401913046836853, "eval_mlm_loss": 0.401913046836853, "eval_runtime": 60.065, "eval_samples_per_second": 1163.223, "eval_steps_per_second": 0.583, "step": 7400 }, { "ep_loss": 0.0, "epoch": 5.72, "learning_rate": 0.0003705, "loss": 0.4203, "mlm_loss": 0.4203, "step": 7410 }, { "ep_loss": 0.0, "epoch": 5.72, "learning_rate": 0.000371, "loss": 0.4252, "mlm_loss": 0.4252, "step": 7420 }, { "ep_loss": 0.0, "epoch": 5.73, "learning_rate": 0.00037150000000000003, "loss": 0.4264, "mlm_loss": 0.4264, "step": 7430 }, { "ep_loss": 0.0, "epoch": 5.74, "learning_rate": 0.000372, "loss": 0.4225, "mlm_loss": 0.4225, "step": 7440 }, { "ep_loss": 0.0, "epoch": 5.75, "learning_rate": 0.0003725, "loss": 0.4262, "mlm_loss": 0.4262, "step": 7450 }, { "ep_loss": 0.0, "epoch": 5.75, "learning_rate": 0.000373, "loss": 0.4231, "mlm_loss": 0.4231, "step": 7460 }, { "ep_loss": 0.0, "epoch": 5.76, "learning_rate": 0.0003735, "loss": 0.4292, "mlm_loss": 0.4292, "step": 7470 }, { "ep_loss": 0.0, "epoch": 5.77, "learning_rate": 0.000374, "loss": 0.4239, "mlm_loss": 0.4239, "step": 7480 }, { "ep_loss": 0.0, "epoch": 5.78, "learning_rate": 0.0003745, "loss": 0.4209, "mlm_loss": 0.4209, "step": 7490 }, { "ep_loss": 0.0, "epoch": 5.78, "learning_rate": 0.000375, "loss": 0.4198, "mlm_loss": 0.4198, "step": 7500 }, { "epoch": 5.78, "eval_ep_loss": -2.3534927368164062, "eval_loss": 0.3982246518135071, "eval_mlm_loss": 0.3982246518135071, "eval_runtime": 63.0218, "eval_samples_per_second": 1108.649, "eval_steps_per_second": 0.555, "step": 7500 }, { "ep_loss": 0.0, "epoch": 5.79, "learning_rate": 0.0003755, "loss": 0.4212, "mlm_loss": 0.4212, "step": 7510 }, { "ep_loss": 0.0, "epoch": 5.8, "learning_rate": 0.00037600000000000003, "loss": 0.4141, "mlm_loss": 0.4141, "step": 7520 }, { "ep_loss": 0.0, "epoch": 5.81, "learning_rate": 0.0003765, "loss": 0.4266, "mlm_loss": 0.4266, "step": 7530 }, { "ep_loss": 0.0, "epoch": 5.82, "learning_rate": 0.000377, "loss": 0.4255, "mlm_loss": 0.4255, "step": 7540 }, { "ep_loss": 0.0, "epoch": 5.82, "learning_rate": 0.0003775, "loss": 0.4194, "mlm_loss": 0.4194, "step": 7550 }, { "ep_loss": 0.0, "epoch": 5.83, "learning_rate": 0.000378, "loss": 0.428, "mlm_loss": 0.428, "step": 7560 }, { "ep_loss": 0.0, "epoch": 5.84, "learning_rate": 0.0003785, "loss": 0.4221, "mlm_loss": 0.4221, "step": 7570 }, { "ep_loss": 0.0, "epoch": 5.85, "learning_rate": 0.000379, "loss": 0.4253, "mlm_loss": 0.4253, "step": 7580 }, { "ep_loss": 0.0, "epoch": 5.85, "learning_rate": 0.0003795, "loss": 0.4174, "mlm_loss": 0.4174, "step": 7590 }, { "ep_loss": 0.0, "epoch": 5.86, "learning_rate": 0.00038, "loss": 0.4204, "mlm_loss": 0.4204, "step": 7600 }, { "epoch": 5.86, "eval_ep_loss": -2.3331265449523926, "eval_loss": 0.3971656858921051, "eval_mlm_loss": 0.3971656858921051, "eval_runtime": 61.175, "eval_samples_per_second": 1142.117, "eval_steps_per_second": 0.572, "step": 7600 }, { "ep_loss": 0.0, "epoch": 5.87, "learning_rate": 0.00038050000000000003, "loss": 0.4127, "mlm_loss": 0.4127, "step": 7610 }, { "ep_loss": 0.0, "epoch": 5.88, "learning_rate": 0.000381, "loss": 0.4226, "mlm_loss": 0.4226, "step": 7620 }, { "ep_loss": 0.0, "epoch": 5.89, "learning_rate": 0.0003815, "loss": 0.418, "mlm_loss": 0.418, "step": 7630 }, { "ep_loss": 0.0, "epoch": 5.89, "learning_rate": 0.000382, "loss": 0.4291, "mlm_loss": 0.4291, "step": 7640 }, { "ep_loss": 0.0, "epoch": 5.9, "learning_rate": 0.00038250000000000003, "loss": 0.4155, "mlm_loss": 0.4155, "step": 7650 }, { "ep_loss": 0.0, "epoch": 5.91, "learning_rate": 0.00038300000000000004, "loss": 0.4213, "mlm_loss": 0.4213, "step": 7660 }, { "ep_loss": 0.0, "epoch": 5.92, "learning_rate": 0.0003835, "loss": 0.4273, "mlm_loss": 0.4273, "step": 7670 }, { "ep_loss": 0.0, "epoch": 5.92, "learning_rate": 0.000384, "loss": 0.4252, "mlm_loss": 0.4252, "step": 7680 }, { "ep_loss": 0.0, "epoch": 5.93, "learning_rate": 0.0003845, "loss": 0.4201, "mlm_loss": 0.4201, "step": 7690 }, { "ep_loss": 0.0, "epoch": 5.94, "learning_rate": 0.00038500000000000003, "loss": 0.4198, "mlm_loss": 0.4198, "step": 7700 }, { "epoch": 5.94, "eval_ep_loss": -2.514362335205078, "eval_loss": 0.39368629455566406, "eval_mlm_loss": 0.39368629455566406, "eval_runtime": 60.7647, "eval_samples_per_second": 1149.828, "eval_steps_per_second": 0.576, "step": 7700 }, { "ep_loss": 0.0, "epoch": 5.95, "learning_rate": 0.0003855, "loss": 0.4167, "mlm_loss": 0.4167, "step": 7710 }, { "ep_loss": 0.0, "epoch": 5.95, "learning_rate": 0.000386, "loss": 0.4235, "mlm_loss": 0.4235, "step": 7720 }, { "ep_loss": 0.0, "epoch": 5.96, "learning_rate": 0.0003865, "loss": 0.4233, "mlm_loss": 0.4233, "step": 7730 }, { "ep_loss": 0.0, "epoch": 5.97, "learning_rate": 0.00038700000000000003, "loss": 0.4227, "mlm_loss": 0.4227, "step": 7740 }, { "ep_loss": 0.0, "epoch": 5.98, "learning_rate": 0.00038750000000000004, "loss": 0.4194, "mlm_loss": 0.4194, "step": 7750 }, { "ep_loss": 0.0, "epoch": 5.99, "learning_rate": 0.000388, "loss": 0.4194, "mlm_loss": 0.4194, "step": 7760 }, { "ep_loss": 0.0, "epoch": 5.99, "learning_rate": 0.0003885, "loss": 0.4171, "mlm_loss": 0.4171, "step": 7770 }, { "ep_loss": 0.0, "epoch": 6.0, "learning_rate": 0.000389, "loss": 0.4197, "mlm_loss": 0.4197, "step": 7780 }, { "ep_loss": 0.0, "epoch": 6.01, "learning_rate": 0.00038950000000000003, "loss": 0.4138, "mlm_loss": 0.4138, "step": 7790 }, { "ep_loss": 0.0, "epoch": 6.02, "learning_rate": 0.00039000000000000005, "loss": 0.4149, "mlm_loss": 0.4149, "step": 7800 }, { "epoch": 6.02, "eval_ep_loss": -2.336341619491577, "eval_loss": 0.3958059251308441, "eval_mlm_loss": 0.3958059251308441, "eval_runtime": 59.0112, "eval_samples_per_second": 1183.995, "eval_steps_per_second": 0.593, "step": 7800 }, { "ep_loss": 0.0, "epoch": 6.02, "learning_rate": 0.0003905, "loss": 0.4197, "mlm_loss": 0.4197, "step": 7810 }, { "ep_loss": 0.0, "epoch": 6.03, "learning_rate": 0.000391, "loss": 0.4135, "mlm_loss": 0.4135, "step": 7820 }, { "ep_loss": 0.0, "epoch": 6.04, "learning_rate": 0.00039150000000000003, "loss": 0.4207, "mlm_loss": 0.4207, "step": 7830 }, { "ep_loss": 0.0, "epoch": 6.05, "learning_rate": 0.00039200000000000004, "loss": 0.419, "mlm_loss": 0.419, "step": 7840 }, { "ep_loss": 0.0, "epoch": 6.05, "learning_rate": 0.0003925, "loss": 0.4169, "mlm_loss": 0.4169, "step": 7850 }, { "ep_loss": 0.0, "epoch": 6.06, "learning_rate": 0.000393, "loss": 0.4183, "mlm_loss": 0.4183, "step": 7860 }, { "ep_loss": 0.0, "epoch": 6.07, "learning_rate": 0.0003935, "loss": 0.4127, "mlm_loss": 0.4127, "step": 7870 }, { "ep_loss": 0.0, "epoch": 6.08, "learning_rate": 0.00039400000000000004, "loss": 0.421, "mlm_loss": 0.421, "step": 7880 }, { "ep_loss": 0.0, "epoch": 6.09, "learning_rate": 0.00039450000000000005, "loss": 0.4135, "mlm_loss": 0.4135, "step": 7890 }, { "ep_loss": 0.0, "epoch": 6.09, "learning_rate": 0.000395, "loss": 0.4174, "mlm_loss": 0.4174, "step": 7900 }, { "epoch": 6.09, "eval_ep_loss": -2.3622541427612305, "eval_loss": 0.3937676250934601, "eval_mlm_loss": 0.3937676250934601, "eval_runtime": 60.7463, "eval_samples_per_second": 1150.176, "eval_steps_per_second": 0.576, "step": 7900 }, { "ep_loss": 0.0, "epoch": 6.1, "learning_rate": 0.0003955, "loss": 0.4256, "mlm_loss": 0.4256, "step": 7910 }, { "ep_loss": 0.0, "epoch": 6.11, "learning_rate": 0.00039600000000000003, "loss": 0.4217, "mlm_loss": 0.4217, "step": 7920 }, { "ep_loss": 0.0, "epoch": 6.12, "learning_rate": 0.00039650000000000004, "loss": 0.4239, "mlm_loss": 0.4239, "step": 7930 }, { "ep_loss": 0.0, "epoch": 6.12, "learning_rate": 0.00039700000000000005, "loss": 0.4183, "mlm_loss": 0.4183, "step": 7940 }, { "ep_loss": 0.0, "epoch": 6.13, "learning_rate": 0.0003975, "loss": 0.4172, "mlm_loss": 0.4172, "step": 7950 }, { "ep_loss": 0.0, "epoch": 6.14, "learning_rate": 0.000398, "loss": 0.42, "mlm_loss": 0.42, "step": 7960 }, { "ep_loss": 0.0, "epoch": 6.15, "learning_rate": 0.00039850000000000004, "loss": 0.4149, "mlm_loss": 0.4149, "step": 7970 }, { "ep_loss": 0.0, "epoch": 6.16, "learning_rate": 0.00039900000000000005, "loss": 0.4128, "mlm_loss": 0.4128, "step": 7980 }, { "ep_loss": 0.0, "epoch": 6.16, "learning_rate": 0.0003995, "loss": 0.4153, "mlm_loss": 0.4153, "step": 7990 }, { "ep_loss": 0.0, "epoch": 6.17, "learning_rate": 0.0004, "loss": 0.418, "mlm_loss": 0.418, "step": 8000 }, { "epoch": 6.17, "eval_ep_loss": -2.42641282081604, "eval_loss": 0.3915087580680847, "eval_mlm_loss": 0.3915087580680847, "eval_runtime": 60.1518, "eval_samples_per_second": 1161.545, "eval_steps_per_second": 0.582, "step": 8000 }, { "ep_loss": 0.0, "epoch": 6.18, "learning_rate": 0.00040050000000000003, "loss": 0.4161, "mlm_loss": 0.4161, "step": 8010 }, { "ep_loss": 0.0, "epoch": 6.19, "learning_rate": 0.00040100000000000004, "loss": 0.4158, "mlm_loss": 0.4158, "step": 8020 }, { "ep_loss": 0.0, "epoch": 6.19, "learning_rate": 0.00040150000000000006, "loss": 0.4224, "mlm_loss": 0.4224, "step": 8030 }, { "ep_loss": 0.0, "epoch": 6.2, "learning_rate": 0.000402, "loss": 0.4203, "mlm_loss": 0.4203, "step": 8040 }, { "ep_loss": 0.0, "epoch": 6.21, "learning_rate": 0.0004025, "loss": 0.4134, "mlm_loss": 0.4134, "step": 8050 }, { "ep_loss": 0.0, "epoch": 6.22, "learning_rate": 0.00040300000000000004, "loss": 0.4153, "mlm_loss": 0.4153, "step": 8060 }, { "ep_loss": 0.0, "epoch": 6.22, "learning_rate": 0.00040350000000000005, "loss": 0.4126, "mlm_loss": 0.4126, "step": 8070 }, { "ep_loss": 0.0, "epoch": 6.23, "learning_rate": 0.000404, "loss": 0.4142, "mlm_loss": 0.4142, "step": 8080 }, { "ep_loss": 0.0, "epoch": 6.24, "learning_rate": 0.0004045, "loss": 0.4141, "mlm_loss": 0.4141, "step": 8090 }, { "ep_loss": 0.0, "epoch": 6.25, "learning_rate": 0.00040500000000000003, "loss": 0.4129, "mlm_loss": 0.4129, "step": 8100 }, { "epoch": 6.25, "eval_ep_loss": -2.3123908042907715, "eval_loss": 0.39225736260414124, "eval_mlm_loss": 0.39225736260414124, "eval_runtime": 61.9154, "eval_samples_per_second": 1128.459, "eval_steps_per_second": 0.565, "step": 8100 }, { "ep_loss": 0.0, "epoch": 6.26, "learning_rate": 0.00040550000000000004, "loss": 0.4155, "mlm_loss": 0.4155, "step": 8110 }, { "ep_loss": 0.0, "epoch": 6.26, "learning_rate": 0.00040600000000000006, "loss": 0.4174, "mlm_loss": 0.4174, "step": 8120 }, { "ep_loss": 0.0, "epoch": 6.27, "learning_rate": 0.00040649999999999996, "loss": 0.4099, "mlm_loss": 0.4099, "step": 8130 }, { "ep_loss": 0.0, "epoch": 6.28, "learning_rate": 0.00040699999999999997, "loss": 0.4123, "mlm_loss": 0.4123, "step": 8140 }, { "ep_loss": 0.0, "epoch": 6.29, "learning_rate": 0.0004075, "loss": 0.4216, "mlm_loss": 0.4216, "step": 8150 }, { "ep_loss": 0.0, "epoch": 6.29, "learning_rate": 0.000408, "loss": 0.4095, "mlm_loss": 0.4095, "step": 8160 }, { "ep_loss": 0.0, "epoch": 6.3, "learning_rate": 0.0004085, "loss": 0.413, "mlm_loss": 0.413, "step": 8170 }, { "ep_loss": 0.0, "epoch": 6.31, "learning_rate": 0.00040899999999999997, "loss": 0.4165, "mlm_loss": 0.4165, "step": 8180 }, { "ep_loss": 0.0, "epoch": 6.32, "learning_rate": 0.0004095, "loss": 0.4191, "mlm_loss": 0.4191, "step": 8190 }, { "ep_loss": 0.0, "epoch": 6.32, "learning_rate": 0.00041, "loss": 0.4161, "mlm_loss": 0.4161, "step": 8200 }, { "epoch": 6.32, "eval_ep_loss": -2.4482126235961914, "eval_loss": 0.388824462890625, "eval_mlm_loss": 0.388824462890625, "eval_runtime": 59.1336, "eval_samples_per_second": 1181.545, "eval_steps_per_second": 0.592, "step": 8200 }, { "ep_loss": 0.0, "epoch": 6.33, "learning_rate": 0.0004105, "loss": 0.4218, "mlm_loss": 0.4218, "step": 8210 }, { "ep_loss": 0.0, "epoch": 6.34, "learning_rate": 0.00041099999999999996, "loss": 0.4085, "mlm_loss": 0.4085, "step": 8220 }, { "ep_loss": 0.0, "epoch": 6.35, "learning_rate": 0.0004115, "loss": 0.4114, "mlm_loss": 0.4114, "step": 8230 }, { "ep_loss": 0.0, "epoch": 6.36, "learning_rate": 0.000412, "loss": 0.4129, "mlm_loss": 0.4129, "step": 8240 }, { "ep_loss": 0.0, "epoch": 6.36, "learning_rate": 0.0004125, "loss": 0.422, "mlm_loss": 0.422, "step": 8250 }, { "ep_loss": 0.0, "epoch": 6.37, "learning_rate": 0.000413, "loss": 0.4184, "mlm_loss": 0.4184, "step": 8260 }, { "ep_loss": 0.0, "epoch": 6.38, "learning_rate": 0.00041349999999999997, "loss": 0.4192, "mlm_loss": 0.4192, "step": 8270 }, { "ep_loss": 0.0, "epoch": 6.39, "learning_rate": 0.000414, "loss": 0.4168, "mlm_loss": 0.4168, "step": 8280 }, { "ep_loss": 0.0, "epoch": 6.39, "learning_rate": 0.0004145, "loss": 0.4171, "mlm_loss": 0.4171, "step": 8290 }, { "ep_loss": 0.0, "epoch": 6.4, "learning_rate": 0.000415, "loss": 0.4131, "mlm_loss": 0.4131, "step": 8300 }, { "epoch": 6.4, "eval_ep_loss": -2.358128547668457, "eval_loss": 0.38723644614219666, "eval_mlm_loss": 0.38723644614219666, "eval_runtime": 60.0999, "eval_samples_per_second": 1162.548, "eval_steps_per_second": 0.582, "step": 8300 }, { "ep_loss": 0.0, "epoch": 6.41, "learning_rate": 0.00041549999999999996, "loss": 0.4149, "mlm_loss": 0.4149, "step": 8310 }, { "ep_loss": 0.0, "epoch": 6.42, "learning_rate": 0.000416, "loss": 0.4068, "mlm_loss": 0.4068, "step": 8320 }, { "ep_loss": 0.0, "epoch": 6.42, "learning_rate": 0.0004165, "loss": 0.4102, "mlm_loss": 0.4102, "step": 8330 }, { "ep_loss": 0.0, "epoch": 6.43, "learning_rate": 0.000417, "loss": 0.4129, "mlm_loss": 0.4129, "step": 8340 }, { "ep_loss": 0.0, "epoch": 6.44, "learning_rate": 0.0004175, "loss": 0.413, "mlm_loss": 0.413, "step": 8350 }, { "ep_loss": 0.0, "epoch": 6.45, "learning_rate": 0.00041799999999999997, "loss": 0.4143, "mlm_loss": 0.4143, "step": 8360 }, { "ep_loss": 0.0, "epoch": 6.46, "learning_rate": 0.0004185, "loss": 0.4134, "mlm_loss": 0.4134, "step": 8370 }, { "ep_loss": 0.0, "epoch": 6.46, "learning_rate": 0.000419, "loss": 0.406, "mlm_loss": 0.406, "step": 8380 }, { "ep_loss": 0.0, "epoch": 6.47, "learning_rate": 0.0004195, "loss": 0.4144, "mlm_loss": 0.4144, "step": 8390 }, { "ep_loss": 0.0, "epoch": 6.48, "learning_rate": 0.00042, "loss": 0.4131, "mlm_loss": 0.4131, "step": 8400 }, { "epoch": 6.48, "eval_ep_loss": -2.0901758670806885, "eval_loss": 0.3867802917957306, "eval_mlm_loss": 0.3867802917957306, "eval_runtime": 63.0955, "eval_samples_per_second": 1107.354, "eval_steps_per_second": 0.555, "step": 8400 }, { "ep_loss": 0.0, "epoch": 6.49, "learning_rate": 0.0004205, "loss": 0.4011, "mlm_loss": 0.4011, "step": 8410 }, { "ep_loss": 0.0, "epoch": 6.49, "learning_rate": 0.000421, "loss": 0.4117, "mlm_loss": 0.4117, "step": 8420 }, { "ep_loss": 0.0, "epoch": 6.5, "learning_rate": 0.0004215, "loss": 0.414, "mlm_loss": 0.414, "step": 8430 }, { "ep_loss": 0.0, "epoch": 6.51, "learning_rate": 0.000422, "loss": 0.4186, "mlm_loss": 0.4186, "step": 8440 }, { "ep_loss": 0.0, "epoch": 6.52, "learning_rate": 0.00042249999999999997, "loss": 0.4149, "mlm_loss": 0.4149, "step": 8450 }, { "ep_loss": 0.0, "epoch": 6.53, "learning_rate": 0.000423, "loss": 0.4154, "mlm_loss": 0.4154, "step": 8460 }, { "ep_loss": 0.0, "epoch": 6.53, "learning_rate": 0.0004235, "loss": 0.409, "mlm_loss": 0.409, "step": 8470 }, { "ep_loss": 0.0, "epoch": 6.54, "learning_rate": 0.000424, "loss": 0.4098, "mlm_loss": 0.4098, "step": 8480 }, { "ep_loss": 0.0, "epoch": 6.55, "learning_rate": 0.0004245, "loss": 0.4179, "mlm_loss": 0.4179, "step": 8490 }, { "ep_loss": 0.0, "epoch": 6.56, "learning_rate": 0.000425, "loss": 0.4147, "mlm_loss": 0.4147, "step": 8500 }, { "epoch": 6.56, "eval_ep_loss": -2.5853233337402344, "eval_loss": 0.3858720660209656, "eval_mlm_loss": 0.3858720660209656, "eval_runtime": 60.8689, "eval_samples_per_second": 1147.86, "eval_steps_per_second": 0.575, "step": 8500 }, { "ep_loss": 0.0, "epoch": 6.56, "learning_rate": 0.0004255, "loss": 0.4094, "mlm_loss": 0.4094, "step": 8510 }, { "ep_loss": 0.0, "epoch": 6.57, "learning_rate": 0.000426, "loss": 0.4136, "mlm_loss": 0.4136, "step": 8520 }, { "ep_loss": 0.0, "epoch": 6.58, "learning_rate": 0.0004265, "loss": 0.4077, "mlm_loss": 0.4077, "step": 8530 }, { "ep_loss": 0.0, "epoch": 6.59, "learning_rate": 0.000427, "loss": 0.4077, "mlm_loss": 0.4077, "step": 8540 }, { "ep_loss": 0.0, "epoch": 6.59, "learning_rate": 0.0004275, "loss": 0.4115, "mlm_loss": 0.4115, "step": 8550 }, { "ep_loss": 0.0, "epoch": 6.6, "learning_rate": 0.000428, "loss": 0.4068, "mlm_loss": 0.4068, "step": 8560 }, { "ep_loss": 0.0, "epoch": 6.61, "learning_rate": 0.0004285, "loss": 0.4096, "mlm_loss": 0.4096, "step": 8570 }, { "ep_loss": 0.0, "epoch": 6.62, "learning_rate": 0.000429, "loss": 0.4146, "mlm_loss": 0.4146, "step": 8580 }, { "ep_loss": 0.0, "epoch": 6.63, "learning_rate": 0.0004295, "loss": 0.4095, "mlm_loss": 0.4095, "step": 8590 }, { "ep_loss": 0.0, "epoch": 6.63, "learning_rate": 0.00043, "loss": 0.4105, "mlm_loss": 0.4105, "step": 8600 }, { "epoch": 6.63, "eval_ep_loss": -2.559201955795288, "eval_loss": 0.3874005973339081, "eval_mlm_loss": 0.3874005973339081, "eval_runtime": 62.3828, "eval_samples_per_second": 1120.003, "eval_steps_per_second": 0.561, "step": 8600 }, { "ep_loss": 0.0, "epoch": 6.64, "learning_rate": 0.0004305, "loss": 0.4121, "mlm_loss": 0.4121, "step": 8610 }, { "ep_loss": 0.0, "epoch": 6.65, "learning_rate": 0.000431, "loss": 0.4115, "mlm_loss": 0.4115, "step": 8620 }, { "ep_loss": 0.0, "epoch": 6.66, "learning_rate": 0.0004315, "loss": 0.4124, "mlm_loss": 0.4124, "step": 8630 }, { "ep_loss": 0.0, "epoch": 6.66, "learning_rate": 0.000432, "loss": 0.4158, "mlm_loss": 0.4158, "step": 8640 }, { "ep_loss": 0.0, "epoch": 6.67, "learning_rate": 0.0004325, "loss": 0.4099, "mlm_loss": 0.4099, "step": 8650 }, { "ep_loss": 0.0, "epoch": 6.68, "learning_rate": 0.000433, "loss": 0.4157, "mlm_loss": 0.4157, "step": 8660 }, { "ep_loss": 0.0, "epoch": 6.69, "learning_rate": 0.0004335, "loss": 0.4032, "mlm_loss": 0.4032, "step": 8670 }, { "ep_loss": 0.0, "epoch": 6.69, "learning_rate": 0.00043400000000000003, "loss": 0.4094, "mlm_loss": 0.4094, "step": 8680 }, { "ep_loss": 0.0, "epoch": 6.7, "learning_rate": 0.0004345, "loss": 0.4057, "mlm_loss": 0.4057, "step": 8690 }, { "ep_loss": 0.0, "epoch": 6.71, "learning_rate": 0.000435, "loss": 0.4144, "mlm_loss": 0.4144, "step": 8700 }, { "epoch": 6.71, "eval_ep_loss": -2.2817635536193848, "eval_loss": 0.3839435577392578, "eval_mlm_loss": 0.3839435577392578, "eval_runtime": 61.6764, "eval_samples_per_second": 1132.831, "eval_steps_per_second": 0.567, "step": 8700 }, { "ep_loss": 0.0, "epoch": 6.72, "learning_rate": 0.0004355, "loss": 0.4114, "mlm_loss": 0.4114, "step": 8710 }, { "ep_loss": 0.0, "epoch": 6.73, "learning_rate": 0.000436, "loss": 0.4091, "mlm_loss": 0.4091, "step": 8720 }, { "ep_loss": 0.0, "epoch": 6.73, "learning_rate": 0.0004365, "loss": 0.4058, "mlm_loss": 0.4058, "step": 8730 }, { "ep_loss": 0.0, "epoch": 6.74, "learning_rate": 0.000437, "loss": 0.4009, "mlm_loss": 0.4009, "step": 8740 }, { "ep_loss": 0.0, "epoch": 6.75, "learning_rate": 0.0004375, "loss": 0.405, "mlm_loss": 0.405, "step": 8750 }, { "ep_loss": 0.0, "epoch": 6.76, "learning_rate": 0.000438, "loss": 0.4084, "mlm_loss": 0.4084, "step": 8760 }, { "ep_loss": 0.0, "epoch": 6.76, "learning_rate": 0.00043850000000000003, "loss": 0.409, "mlm_loss": 0.409, "step": 8770 }, { "ep_loss": 0.0, "epoch": 6.77, "learning_rate": 0.000439, "loss": 0.407, "mlm_loss": 0.407, "step": 8780 }, { "ep_loss": 0.0, "epoch": 6.78, "learning_rate": 0.0004395, "loss": 0.4003, "mlm_loss": 0.4003, "step": 8790 }, { "ep_loss": 0.0, "epoch": 6.79, "learning_rate": 0.00044, "loss": 0.4021, "mlm_loss": 0.4021, "step": 8800 }, { "epoch": 6.79, "eval_ep_loss": -2.1474719047546387, "eval_loss": 0.38189372420310974, "eval_mlm_loss": 0.38189372420310974, "eval_runtime": 59.6801, "eval_samples_per_second": 1170.726, "eval_steps_per_second": 0.586, "step": 8800 }, { "ep_loss": 0.0, "epoch": 6.8, "learning_rate": 0.00044050000000000003, "loss": 0.4166, "mlm_loss": 0.4166, "step": 8810 }, { "ep_loss": 0.0, "epoch": 6.8, "learning_rate": 0.000441, "loss": 0.4055, "mlm_loss": 0.4055, "step": 8820 }, { "ep_loss": 0.0, "epoch": 6.81, "learning_rate": 0.0004415, "loss": 0.4095, "mlm_loss": 0.4095, "step": 8830 }, { "ep_loss": 0.0, "epoch": 6.82, "learning_rate": 0.000442, "loss": 0.4063, "mlm_loss": 0.4063, "step": 8840 }, { "ep_loss": 0.0, "epoch": 6.83, "learning_rate": 0.0004425, "loss": 0.4124, "mlm_loss": 0.4124, "step": 8850 }, { "ep_loss": 0.0, "epoch": 6.83, "learning_rate": 0.00044300000000000003, "loss": 0.4052, "mlm_loss": 0.4052, "step": 8860 }, { "ep_loss": 0.0, "epoch": 6.84, "learning_rate": 0.0004435, "loss": 0.4106, "mlm_loss": 0.4106, "step": 8870 }, { "ep_loss": 0.0, "epoch": 6.85, "learning_rate": 0.000444, "loss": 0.4042, "mlm_loss": 0.4042, "step": 8880 }, { "ep_loss": 0.0, "epoch": 6.86, "learning_rate": 0.0004445, "loss": 0.4126, "mlm_loss": 0.4126, "step": 8890 }, { "ep_loss": 0.0, "epoch": 6.86, "learning_rate": 0.00044500000000000003, "loss": 0.4019, "mlm_loss": 0.4019, "step": 8900 }, { "epoch": 6.86, "eval_ep_loss": -2.1015372276306152, "eval_loss": 0.3846256732940674, "eval_mlm_loss": 0.3846256732940674, "eval_runtime": 59.4967, "eval_samples_per_second": 1174.335, "eval_steps_per_second": 0.588, "step": 8900 }, { "ep_loss": 0.0, "epoch": 6.87, "learning_rate": 0.00044550000000000004, "loss": 0.4092, "mlm_loss": 0.4092, "step": 8910 }, { "ep_loss": 0.0, "epoch": 6.88, "learning_rate": 0.000446, "loss": 0.4053, "mlm_loss": 0.4053, "step": 8920 }, { "ep_loss": 0.0, "epoch": 6.89, "learning_rate": 0.0004465, "loss": 0.4071, "mlm_loss": 0.4071, "step": 8930 }, { "ep_loss": 0.0, "epoch": 6.9, "learning_rate": 0.000447, "loss": 0.4087, "mlm_loss": 0.4087, "step": 8940 }, { "ep_loss": 0.0, "epoch": 6.9, "learning_rate": 0.00044750000000000004, "loss": 0.4075, "mlm_loss": 0.4075, "step": 8950 }, { "ep_loss": 0.0, "epoch": 6.91, "learning_rate": 0.000448, "loss": 0.4134, "mlm_loss": 0.4134, "step": 8960 }, { "ep_loss": 0.0, "epoch": 6.92, "learning_rate": 0.0004485, "loss": 0.407, "mlm_loss": 0.407, "step": 8970 }, { "ep_loss": 0.0, "epoch": 6.93, "learning_rate": 0.000449, "loss": 0.4081, "mlm_loss": 0.4081, "step": 8980 }, { "ep_loss": 0.0, "epoch": 6.93, "learning_rate": 0.00044950000000000003, "loss": 0.4047, "mlm_loss": 0.4047, "step": 8990 }, { "ep_loss": 0.0, "epoch": 6.94, "learning_rate": 0.00045000000000000004, "loss": 0.4046, "mlm_loss": 0.4046, "step": 9000 }, { "epoch": 6.94, "eval_ep_loss": -2.1676366329193115, "eval_loss": 0.3808509409427643, "eval_mlm_loss": 0.3808509409427643, "eval_runtime": 60.7859, "eval_samples_per_second": 1149.428, "eval_steps_per_second": 0.576, "step": 9000 }, { "ep_loss": 0.0, "epoch": 6.95, "learning_rate": 0.0004505, "loss": 0.4108, "mlm_loss": 0.4108, "step": 9010 }, { "ep_loss": 0.0, "epoch": 6.96, "learning_rate": 0.000451, "loss": 0.4095, "mlm_loss": 0.4095, "step": 9020 }, { "ep_loss": 0.0, "epoch": 6.96, "learning_rate": 0.0004515, "loss": 0.4069, "mlm_loss": 0.4069, "step": 9030 }, { "ep_loss": 0.0, "epoch": 6.97, "learning_rate": 0.00045200000000000004, "loss": 0.407, "mlm_loss": 0.407, "step": 9040 }, { "ep_loss": 0.0, "epoch": 6.98, "learning_rate": 0.00045250000000000005, "loss": 0.4056, "mlm_loss": 0.4056, "step": 9050 }, { "ep_loss": 0.0, "epoch": 6.99, "learning_rate": 0.000453, "loss": 0.4107, "mlm_loss": 0.4107, "step": 9060 }, { "ep_loss": 0.0, "epoch": 7.0, "learning_rate": 0.0004535, "loss": 0.4085, "mlm_loss": 0.4085, "step": 9070 }, { "ep_loss": 0.0, "epoch": 7.0, "learning_rate": 0.00045400000000000003, "loss": 0.4146, "mlm_loss": 0.4146, "step": 9080 }, { "ep_loss": 0.0, "epoch": 7.01, "learning_rate": 0.00045450000000000004, "loss": 0.4041, "mlm_loss": 0.4041, "step": 9090 }, { "ep_loss": 0.0, "epoch": 7.02, "learning_rate": 0.000455, "loss": 0.414, "mlm_loss": 0.414, "step": 9100 }, { "epoch": 7.02, "eval_ep_loss": -2.345536231994629, "eval_loss": 0.3819408118724823, "eval_mlm_loss": 0.3819408118724823, "eval_runtime": 61.289, "eval_samples_per_second": 1139.992, "eval_steps_per_second": 0.571, "step": 9100 }, { "ep_loss": 0.0, "epoch": 7.03, "learning_rate": 0.0004555, "loss": 0.4014, "mlm_loss": 0.4014, "step": 9110 }, { "ep_loss": 0.0, "epoch": 7.03, "learning_rate": 0.000456, "loss": 0.4039, "mlm_loss": 0.4039, "step": 9120 }, { "ep_loss": 0.0, "epoch": 7.04, "learning_rate": 0.00045650000000000004, "loss": 0.4066, "mlm_loss": 0.4066, "step": 9130 }, { "ep_loss": 0.0, "epoch": 7.05, "learning_rate": 0.00045700000000000005, "loss": 0.4107, "mlm_loss": 0.4107, "step": 9140 }, { "ep_loss": 0.0, "epoch": 7.06, "learning_rate": 0.0004575, "loss": 0.4035, "mlm_loss": 0.4035, "step": 9150 }, { "ep_loss": 0.0, "epoch": 7.07, "learning_rate": 0.000458, "loss": 0.4037, "mlm_loss": 0.4037, "step": 9160 }, { "ep_loss": 0.0, "epoch": 7.07, "learning_rate": 0.00045850000000000003, "loss": 0.4035, "mlm_loss": 0.4035, "step": 9170 }, { "ep_loss": 0.0, "epoch": 7.08, "learning_rate": 0.00045900000000000004, "loss": 0.4054, "mlm_loss": 0.4054, "step": 9180 }, { "ep_loss": 0.0, "epoch": 7.09, "learning_rate": 0.00045950000000000006, "loss": 0.4078, "mlm_loss": 0.4078, "step": 9190 }, { "ep_loss": 0.0, "epoch": 7.1, "learning_rate": 0.00046, "loss": 0.4066, "mlm_loss": 0.4066, "step": 9200 }, { "epoch": 7.1, "eval_ep_loss": -2.1998836994171143, "eval_loss": 0.37953051924705505, "eval_mlm_loss": 0.37953051924705505, "eval_runtime": 59.882, "eval_samples_per_second": 1166.779, "eval_steps_per_second": 0.584, "step": 9200 }, { "ep_loss": 0.0, "epoch": 7.1, "learning_rate": 0.0004605, "loss": 0.3965, "mlm_loss": 0.3965, "step": 9210 }, { "ep_loss": 0.0, "epoch": 7.11, "learning_rate": 0.00046100000000000004, "loss": 0.3973, "mlm_loss": 0.3973, "step": 9220 }, { "ep_loss": 0.0, "epoch": 7.12, "learning_rate": 0.00046150000000000005, "loss": 0.3996, "mlm_loss": 0.3996, "step": 9230 }, { "ep_loss": 0.0, "epoch": 7.13, "learning_rate": 0.000462, "loss": 0.4075, "mlm_loss": 0.4075, "step": 9240 }, { "ep_loss": 0.0, "epoch": 7.13, "learning_rate": 0.0004625, "loss": 0.4042, "mlm_loss": 0.4042, "step": 9250 }, { "ep_loss": 0.0, "epoch": 7.14, "learning_rate": 0.00046300000000000003, "loss": 0.3976, "mlm_loss": 0.3976, "step": 9260 }, { "ep_loss": 0.0, "epoch": 7.15, "learning_rate": 0.00046350000000000004, "loss": 0.4026, "mlm_loss": 0.4026, "step": 9270 }, { "ep_loss": 0.0, "epoch": 7.16, "learning_rate": 0.00046400000000000006, "loss": 0.4047, "mlm_loss": 0.4047, "step": 9280 }, { "ep_loss": 0.0, "epoch": 7.17, "learning_rate": 0.0004645, "loss": 0.403, "mlm_loss": 0.403, "step": 9290 }, { "ep_loss": 0.0, "epoch": 7.17, "learning_rate": 0.000465, "loss": 0.4064, "mlm_loss": 0.4064, "step": 9300 }, { "epoch": 7.17, "eval_ep_loss": -2.131833553314209, "eval_loss": 0.38073843717575073, "eval_mlm_loss": 0.38073843717575073, "eval_runtime": 61.1101, "eval_samples_per_second": 1143.331, "eval_steps_per_second": 0.573, "step": 9300 }, { "ep_loss": 0.0, "epoch": 7.18, "learning_rate": 0.00046550000000000004, "loss": 0.4023, "mlm_loss": 0.4023, "step": 9310 }, { "ep_loss": 0.0, "epoch": 7.19, "learning_rate": 0.00046600000000000005, "loss": 0.4037, "mlm_loss": 0.4037, "step": 9320 }, { "ep_loss": 0.0, "epoch": 7.2, "learning_rate": 0.0004665, "loss": 0.4024, "mlm_loss": 0.4024, "step": 9330 }, { "ep_loss": 0.0, "epoch": 7.2, "learning_rate": 0.000467, "loss": 0.4031, "mlm_loss": 0.4031, "step": 9340 }, { "ep_loss": 0.0, "epoch": 7.21, "learning_rate": 0.00046750000000000003, "loss": 0.4069, "mlm_loss": 0.4069, "step": 9350 }, { "ep_loss": 0.0, "epoch": 7.22, "learning_rate": 0.00046800000000000005, "loss": 0.4134, "mlm_loss": 0.4134, "step": 9360 }, { "ep_loss": 0.0, "epoch": 7.23, "learning_rate": 0.00046850000000000006, "loss": 0.3998, "mlm_loss": 0.3998, "step": 9370 }, { "ep_loss": 0.0, "epoch": 7.23, "learning_rate": 0.00046899999999999996, "loss": 0.403, "mlm_loss": 0.403, "step": 9380 }, { "ep_loss": 0.0, "epoch": 7.24, "learning_rate": 0.0004695, "loss": 0.4067, "mlm_loss": 0.4067, "step": 9390 }, { "ep_loss": 0.0, "epoch": 7.25, "learning_rate": 0.00047, "loss": 0.4022, "mlm_loss": 0.4022, "step": 9400 }, { "epoch": 7.25, "eval_ep_loss": -2.311575412750244, "eval_loss": 0.3808383643627167, "eval_mlm_loss": 0.3808383643627167, "eval_runtime": 61.1652, "eval_samples_per_second": 1142.301, "eval_steps_per_second": 0.572, "step": 9400 }, { "ep_loss": 0.0, "epoch": 7.26, "learning_rate": 0.0004705, "loss": 0.405, "mlm_loss": 0.405, "step": 9410 }, { "ep_loss": 0.0, "epoch": 7.27, "learning_rate": 0.000471, "loss": 0.3949, "mlm_loss": 0.3949, "step": 9420 }, { "ep_loss": 0.0, "epoch": 7.27, "learning_rate": 0.00047149999999999997, "loss": 0.405, "mlm_loss": 0.405, "step": 9430 }, { "ep_loss": 0.0, "epoch": 7.28, "learning_rate": 0.000472, "loss": 0.4092, "mlm_loss": 0.4092, "step": 9440 }, { "ep_loss": 0.0, "epoch": 7.29, "learning_rate": 0.0004725, "loss": 0.4102, "mlm_loss": 0.4102, "step": 9450 }, { "ep_loss": 0.0, "epoch": 7.3, "learning_rate": 0.000473, "loss": 0.4066, "mlm_loss": 0.4066, "step": 9460 }, { "ep_loss": 0.0, "epoch": 7.3, "learning_rate": 0.00047349999999999996, "loss": 0.3987, "mlm_loss": 0.3987, "step": 9470 }, { "ep_loss": 0.0, "epoch": 7.31, "learning_rate": 0.000474, "loss": 0.399, "mlm_loss": 0.399, "step": 9480 }, { "ep_loss": 0.0, "epoch": 7.32, "learning_rate": 0.0004745, "loss": 0.4033, "mlm_loss": 0.4033, "step": 9490 }, { "ep_loss": 0.0, "epoch": 7.33, "learning_rate": 0.000475, "loss": 0.3982, "mlm_loss": 0.3982, "step": 9500 }, { "epoch": 7.33, "eval_ep_loss": -2.2588560581207275, "eval_loss": 0.37661808729171753, "eval_mlm_loss": 0.37661808729171753, "eval_runtime": 61.5272, "eval_samples_per_second": 1135.579, "eval_steps_per_second": 0.569, "step": 9500 }, { "ep_loss": 0.0, "epoch": 7.34, "learning_rate": 0.0004755, "loss": 0.4048, "mlm_loss": 0.4048, "step": 9510 }, { "ep_loss": 0.0, "epoch": 7.34, "learning_rate": 0.00047599999999999997, "loss": 0.4034, "mlm_loss": 0.4034, "step": 9520 }, { "ep_loss": 0.0, "epoch": 7.35, "learning_rate": 0.0004765, "loss": 0.4001, "mlm_loss": 0.4001, "step": 9530 }, { "ep_loss": 0.0, "epoch": 7.36, "learning_rate": 0.000477, "loss": 0.406, "mlm_loss": 0.406, "step": 9540 }, { "ep_loss": 0.0, "epoch": 7.37, "learning_rate": 0.0004775, "loss": 0.4007, "mlm_loss": 0.4007, "step": 9550 }, { "ep_loss": 0.0, "epoch": 7.37, "learning_rate": 0.00047799999999999996, "loss": 0.416, "mlm_loss": 0.416, "step": 9560 }, { "ep_loss": 0.0, "epoch": 7.38, "learning_rate": 0.0004785, "loss": 0.398, "mlm_loss": 0.398, "step": 9570 }, { "ep_loss": 0.0, "epoch": 7.39, "learning_rate": 0.000479, "loss": 0.4045, "mlm_loss": 0.4045, "step": 9580 }, { "ep_loss": 0.0, "epoch": 7.4, "learning_rate": 0.0004795, "loss": 0.4011, "mlm_loss": 0.4011, "step": 9590 }, { "ep_loss": 0.0, "epoch": 7.4, "learning_rate": 0.00048, "loss": 0.3991, "mlm_loss": 0.3991, "step": 9600 }, { "epoch": 7.4, "eval_ep_loss": -2.4242005348205566, "eval_loss": 0.37998148798942566, "eval_mlm_loss": 0.37998148798942566, "eval_runtime": 59.2852, "eval_samples_per_second": 1178.524, "eval_steps_per_second": 0.59, "step": 9600 }, { "ep_loss": 0.0, "epoch": 7.41, "learning_rate": 0.00048049999999999997, "loss": 0.3983, "mlm_loss": 0.3983, "step": 9610 }, { "ep_loss": 0.0, "epoch": 7.42, "learning_rate": 0.000481, "loss": 0.4056, "mlm_loss": 0.4056, "step": 9620 }, { "ep_loss": 0.0, "epoch": 7.43, "learning_rate": 0.0004815, "loss": 0.4034, "mlm_loss": 0.4034, "step": 9630 }, { "ep_loss": 0.0, "epoch": 7.44, "learning_rate": 0.000482, "loss": 0.4071, "mlm_loss": 0.4071, "step": 9640 }, { "ep_loss": 0.0, "epoch": 7.44, "learning_rate": 0.0004825, "loss": 0.3994, "mlm_loss": 0.3994, "step": 9650 }, { "ep_loss": 0.0, "epoch": 7.45, "learning_rate": 0.000483, "loss": 0.398, "mlm_loss": 0.398, "step": 9660 }, { "ep_loss": 0.0, "epoch": 7.46, "learning_rate": 0.0004835, "loss": 0.4038, "mlm_loss": 0.4038, "step": 9670 }, { "ep_loss": 0.0, "epoch": 7.47, "learning_rate": 0.000484, "loss": 0.401, "mlm_loss": 0.401, "step": 9680 }, { "ep_loss": 0.0, "epoch": 7.47, "learning_rate": 0.0004845, "loss": 0.4049, "mlm_loss": 0.4049, "step": 9690 }, { "ep_loss": 0.0, "epoch": 7.48, "learning_rate": 0.00048499999999999997, "loss": 0.4003, "mlm_loss": 0.4003, "step": 9700 }, { "epoch": 7.48, "eval_ep_loss": -2.468691825866699, "eval_loss": 0.37805142998695374, "eval_mlm_loss": 0.37805142998695374, "eval_runtime": 60.3023, "eval_samples_per_second": 1158.646, "eval_steps_per_second": 0.58, "step": 9700 }, { "ep_loss": 0.0, "epoch": 7.49, "learning_rate": 0.0004855, "loss": 0.3978, "mlm_loss": 0.3978, "step": 9710 }, { "ep_loss": 0.0, "epoch": 7.5, "learning_rate": 0.000486, "loss": 0.3994, "mlm_loss": 0.3994, "step": 9720 }, { "ep_loss": 0.0, "epoch": 7.5, "learning_rate": 0.0004865, "loss": 0.4029, "mlm_loss": 0.4029, "step": 9730 }, { "ep_loss": 0.0, "epoch": 7.51, "learning_rate": 0.000487, "loss": 0.3991, "mlm_loss": 0.3991, "step": 9740 }, { "ep_loss": 0.0, "epoch": 7.52, "learning_rate": 0.0004875, "loss": 0.4061, "mlm_loss": 0.4061, "step": 9750 }, { "ep_loss": 0.0, "epoch": 7.53, "learning_rate": 0.000488, "loss": 0.4047, "mlm_loss": 0.4047, "step": 9760 }, { "ep_loss": 0.0, "epoch": 7.54, "learning_rate": 0.0004885, "loss": 0.4071, "mlm_loss": 0.4071, "step": 9770 }, { "ep_loss": 0.0, "epoch": 7.54, "learning_rate": 0.000489, "loss": 0.4014, "mlm_loss": 0.4014, "step": 9780 }, { "ep_loss": 0.0, "epoch": 7.55, "learning_rate": 0.0004895, "loss": 0.4023, "mlm_loss": 0.4023, "step": 9790 }, { "ep_loss": 0.0, "epoch": 7.56, "learning_rate": 0.00049, "loss": 0.4052, "mlm_loss": 0.4052, "step": 9800 }, { "epoch": 7.56, "eval_ep_loss": -2.403273344039917, "eval_loss": 0.3772997260093689, "eval_mlm_loss": 0.3772997260093689, "eval_runtime": 59.2376, "eval_samples_per_second": 1179.471, "eval_steps_per_second": 0.591, "step": 9800 }, { "ep_loss": 0.0, "epoch": 7.57, "learning_rate": 0.0004905, "loss": 0.394, "mlm_loss": 0.394, "step": 9810 }, { "ep_loss": 0.0, "epoch": 7.57, "learning_rate": 0.000491, "loss": 0.4071, "mlm_loss": 0.4071, "step": 9820 }, { "ep_loss": 0.0, "epoch": 7.58, "learning_rate": 0.0004915, "loss": 0.3992, "mlm_loss": 0.3992, "step": 9830 }, { "ep_loss": 0.0, "epoch": 7.59, "learning_rate": 0.000492, "loss": 0.4024, "mlm_loss": 0.4024, "step": 9840 }, { "ep_loss": 0.0, "epoch": 7.6, "learning_rate": 0.0004925, "loss": 0.4042, "mlm_loss": 0.4042, "step": 9850 }, { "ep_loss": 0.0, "epoch": 7.61, "learning_rate": 0.0004930000000000001, "loss": 0.4067, "mlm_loss": 0.4067, "step": 9860 }, { "ep_loss": 0.0, "epoch": 7.61, "learning_rate": 0.0004935, "loss": 0.3995, "mlm_loss": 0.3995, "step": 9870 }, { "ep_loss": 0.0, "epoch": 7.62, "learning_rate": 0.000494, "loss": 0.3998, "mlm_loss": 0.3998, "step": 9880 }, { "ep_loss": 0.0, "epoch": 7.63, "learning_rate": 0.0004945, "loss": 0.4011, "mlm_loss": 0.4011, "step": 9890 }, { "ep_loss": 0.0, "epoch": 7.64, "learning_rate": 0.000495, "loss": 0.4003, "mlm_loss": 0.4003, "step": 9900 }, { "epoch": 7.64, "eval_ep_loss": -2.388214111328125, "eval_loss": 0.3742392659187317, "eval_mlm_loss": 0.3742392659187317, "eval_runtime": 60.995, "eval_samples_per_second": 1145.488, "eval_steps_per_second": 0.574, "step": 9900 }, { "ep_loss": 0.0, "epoch": 7.64, "learning_rate": 0.0004955, "loss": 0.3885, "mlm_loss": 0.3885, "step": 9910 }, { "ep_loss": 0.0, "epoch": 7.65, "learning_rate": 0.000496, "loss": 0.3988, "mlm_loss": 0.3988, "step": 9920 }, { "ep_loss": 0.0, "epoch": 7.66, "learning_rate": 0.0004965, "loss": 0.4044, "mlm_loss": 0.4044, "step": 9930 }, { "ep_loss": 0.0, "epoch": 7.67, "learning_rate": 0.000497, "loss": 0.4002, "mlm_loss": 0.4002, "step": 9940 }, { "ep_loss": 0.0, "epoch": 7.67, "learning_rate": 0.0004975, "loss": 0.4078, "mlm_loss": 0.4078, "step": 9950 }, { "ep_loss": 0.0, "epoch": 7.68, "learning_rate": 0.000498, "loss": 0.3993, "mlm_loss": 0.3993, "step": 9960 }, { "ep_loss": 0.0, "epoch": 7.69, "learning_rate": 0.0004985, "loss": 0.3985, "mlm_loss": 0.3985, "step": 9970 }, { "ep_loss": 0.0, "epoch": 7.7, "learning_rate": 0.0004989500000000001, "loss": 0.4052, "mlm_loss": 0.4052, "step": 9980 }, { "ep_loss": 0.0, "epoch": 7.71, "learning_rate": 0.00049945, "loss": 0.4045, "mlm_loss": 0.4045, "step": 9990 }, { "ep_loss": 0.0, "epoch": 7.71, "learning_rate": 0.00049995, "loss": 0.3959, "mlm_loss": 0.3959, "step": 10000 }, { "epoch": 7.71, "eval_ep_loss": -2.4986319541931152, "eval_loss": 0.3774937689304352, "eval_mlm_loss": 0.3774937689304352, "eval_runtime": 60.247, "eval_samples_per_second": 1159.709, "eval_steps_per_second": 0.581, "step": 10000 }, { "ep_loss": 0.0, "epoch": 7.72, "learning_rate": 0.0004997173366834171, "loss": 0.4023, "mlm_loss": 0.4023, "step": 10010 }, { "ep_loss": 0.0, "epoch": 7.73, "learning_rate": 0.0004994032663316583, "loss": 0.407, "mlm_loss": 0.407, "step": 10020 }, { "ep_loss": 0.0, "epoch": 7.74, "learning_rate": 0.0004990891959798995, "loss": 0.4024, "mlm_loss": 0.4024, "step": 10030 }, { "ep_loss": 0.0, "epoch": 7.74, "learning_rate": 0.0004987751256281407, "loss": 0.3991, "mlm_loss": 0.3991, "step": 10040 }, { "ep_loss": 0.0, "epoch": 7.75, "learning_rate": 0.000498461055276382, "loss": 0.4058, "mlm_loss": 0.4058, "step": 10050 }, { "ep_loss": 0.0, "epoch": 7.76, "learning_rate": 0.0004981469849246231, "loss": 0.4046, "mlm_loss": 0.4046, "step": 10060 }, { "ep_loss": 0.0, "epoch": 7.77, "learning_rate": 0.0004978329145728643, "loss": 0.397, "mlm_loss": 0.397, "step": 10070 }, { "ep_loss": 0.0, "epoch": 7.77, "learning_rate": 0.0004975188442211055, "loss": 0.3967, "mlm_loss": 0.3967, "step": 10080 }, { "ep_loss": 0.0, "epoch": 7.78, "learning_rate": 0.0004972047738693468, "loss": 0.3986, "mlm_loss": 0.3986, "step": 10090 }, { "ep_loss": 0.0, "epoch": 7.79, "learning_rate": 0.000496890703517588, "loss": 0.4018, "mlm_loss": 0.4018, "step": 10100 }, { "epoch": 7.79, "eval_ep_loss": -2.651991605758667, "eval_loss": 0.374184250831604, "eval_mlm_loss": 0.374184250831604, "eval_runtime": 61.0227, "eval_samples_per_second": 1144.966, "eval_steps_per_second": 0.574, "step": 10100 }, { "ep_loss": 0.0, "epoch": 7.8, "learning_rate": 0.0004965766331658292, "loss": 0.4029, "mlm_loss": 0.4029, "step": 10110 }, { "ep_loss": 0.0, "epoch": 7.81, "learning_rate": 0.0004962625628140703, "loss": 0.4006, "mlm_loss": 0.4006, "step": 10120 }, { "ep_loss": 0.0, "epoch": 7.81, "learning_rate": 0.0004959484924623116, "loss": 0.4003, "mlm_loss": 0.4003, "step": 10130 }, { "ep_loss": 0.0, "epoch": 7.82, "learning_rate": 0.0004956344221105528, "loss": 0.3982, "mlm_loss": 0.3982, "step": 10140 }, { "ep_loss": 0.0, "epoch": 7.83, "learning_rate": 0.000495320351758794, "loss": 0.4044, "mlm_loss": 0.4044, "step": 10150 }, { "ep_loss": 0.0, "epoch": 7.84, "learning_rate": 0.0004950062814070351, "loss": 0.4023, "mlm_loss": 0.4023, "step": 10160 }, { "ep_loss": 0.0, "epoch": 7.84, "learning_rate": 0.0004946922110552764, "loss": 0.4024, "mlm_loss": 0.4024, "step": 10170 }, { "ep_loss": 0.0, "epoch": 7.85, "learning_rate": 0.0004943781407035176, "loss": 0.4036, "mlm_loss": 0.4036, "step": 10180 }, { "ep_loss": 0.0, "epoch": 7.86, "learning_rate": 0.0004940640703517588, "loss": 0.3962, "mlm_loss": 0.3962, "step": 10190 }, { "ep_loss": 0.0, "epoch": 7.87, "learning_rate": 0.00049375, "loss": 0.4061, "mlm_loss": 0.4061, "step": 10200 }, { "epoch": 7.87, "eval_ep_loss": -2.436077356338501, "eval_loss": 0.3732386827468872, "eval_mlm_loss": 0.3732386827468872, "eval_runtime": 59.8162, "eval_samples_per_second": 1168.062, "eval_steps_per_second": 0.585, "step": 10200 }, { "ep_loss": 0.0, "epoch": 7.88, "learning_rate": 0.0004934359296482412, "loss": 0.3959, "mlm_loss": 0.3959, "step": 10210 }, { "ep_loss": 0.0, "epoch": 7.88, "learning_rate": 0.0004931218592964824, "loss": 0.3997, "mlm_loss": 0.3997, "step": 10220 }, { "ep_loss": 0.0, "epoch": 7.89, "learning_rate": 0.0004928077889447236, "loss": 0.4013, "mlm_loss": 0.4013, "step": 10230 }, { "ep_loss": 0.0, "epoch": 7.9, "learning_rate": 0.0004924937185929649, "loss": 0.3956, "mlm_loss": 0.3956, "step": 10240 }, { "ep_loss": 0.0, "epoch": 7.91, "learning_rate": 0.0004921796482412061, "loss": 0.3967, "mlm_loss": 0.3967, "step": 10250 }, { "ep_loss": 0.0, "epoch": 7.91, "learning_rate": 0.0004918655778894472, "loss": 0.3967, "mlm_loss": 0.3967, "step": 10260 }, { "ep_loss": 0.0, "epoch": 7.92, "learning_rate": 0.0004915515075376884, "loss": 0.3954, "mlm_loss": 0.3954, "step": 10270 }, { "ep_loss": 0.0, "epoch": 7.93, "learning_rate": 0.0004912374371859297, "loss": 0.3976, "mlm_loss": 0.3976, "step": 10280 }, { "ep_loss": 0.0, "epoch": 7.94, "learning_rate": 0.0004909233668341709, "loss": 0.3945, "mlm_loss": 0.3945, "step": 10290 }, { "ep_loss": 0.0, "epoch": 7.94, "learning_rate": 0.0004906092964824121, "loss": 0.3979, "mlm_loss": 0.3979, "step": 10300 }, { "epoch": 7.94, "eval_ep_loss": -2.4026572704315186, "eval_loss": 0.37325599789619446, "eval_mlm_loss": 0.37325599789619446, "eval_runtime": 61.5586, "eval_samples_per_second": 1135.0, "eval_steps_per_second": 0.569, "step": 10300 }, { "ep_loss": 0.0, "epoch": 7.95, "learning_rate": 0.0004902952261306532, "loss": 0.3971, "mlm_loss": 0.3971, "step": 10310 }, { "ep_loss": 0.0, "epoch": 7.96, "learning_rate": 0.0004899811557788945, "loss": 0.4018, "mlm_loss": 0.4018, "step": 10320 }, { "ep_loss": 0.0, "epoch": 7.97, "learning_rate": 0.0004896670854271357, "loss": 0.3993, "mlm_loss": 0.3993, "step": 10330 }, { "ep_loss": 0.0, "epoch": 7.98, "learning_rate": 0.0004893530150753769, "loss": 0.3995, "mlm_loss": 0.3995, "step": 10340 }, { "ep_loss": 0.0, "epoch": 7.98, "learning_rate": 0.0004890389447236181, "loss": 0.3948, "mlm_loss": 0.3948, "step": 10350 }, { "ep_loss": 0.0, "epoch": 7.99, "learning_rate": 0.0004887248743718593, "loss": 0.4028, "mlm_loss": 0.4028, "step": 10360 }, { "ep_loss": 0.0, "epoch": 8.0, "learning_rate": 0.0004884108040201005, "loss": 0.3972, "mlm_loss": 0.3972, "step": 10370 }, { "ep_loss": 0.0, "epoch": 8.01, "learning_rate": 0.0004880967336683417, "loss": 0.3899, "mlm_loss": 0.3899, "step": 10380 }, { "ep_loss": 0.0, "epoch": 8.01, "learning_rate": 0.00048778266331658295, "loss": 0.3993, "mlm_loss": 0.3993, "step": 10390 }, { "ep_loss": 0.0, "epoch": 8.02, "learning_rate": 0.0004874685929648242, "loss": 0.3945, "mlm_loss": 0.3945, "step": 10400 }, { "epoch": 8.02, "eval_ep_loss": -2.2991716861724854, "eval_loss": 0.3698738217353821, "eval_mlm_loss": 0.3698738217353821, "eval_runtime": 62.0944, "eval_samples_per_second": 1125.206, "eval_steps_per_second": 0.564, "step": 10400 }, { "ep_loss": 0.0, "epoch": 8.03, "learning_rate": 0.0004871545226130653, "loss": 0.3855, "mlm_loss": 0.3855, "step": 10410 }, { "ep_loss": 0.0, "epoch": 8.04, "learning_rate": 0.0004868404522613065, "loss": 0.3925, "mlm_loss": 0.3925, "step": 10420 }, { "ep_loss": 0.0, "epoch": 8.04, "learning_rate": 0.00048652638190954775, "loss": 0.3901, "mlm_loss": 0.3901, "step": 10430 }, { "ep_loss": 0.0, "epoch": 8.05, "learning_rate": 0.0004862123115577889, "loss": 0.39, "mlm_loss": 0.39, "step": 10440 }, { "ep_loss": 0.0, "epoch": 8.06, "learning_rate": 0.00048589824120603015, "loss": 0.3904, "mlm_loss": 0.3904, "step": 10450 }, { "ep_loss": 0.0, "epoch": 8.07, "learning_rate": 0.0004855841708542714, "loss": 0.3908, "mlm_loss": 0.3908, "step": 10460 }, { "ep_loss": 0.0, "epoch": 8.08, "learning_rate": 0.00048527010050251256, "loss": 0.3931, "mlm_loss": 0.3931, "step": 10470 }, { "ep_loss": 0.0, "epoch": 8.08, "learning_rate": 0.0004849560301507538, "loss": 0.3899, "mlm_loss": 0.3899, "step": 10480 }, { "ep_loss": 0.0, "epoch": 8.09, "learning_rate": 0.000484641959798995, "loss": 0.3908, "mlm_loss": 0.3908, "step": 10490 }, { "ep_loss": 0.0, "epoch": 8.1, "learning_rate": 0.00048432788944723624, "loss": 0.3909, "mlm_loss": 0.3909, "step": 10500 }, { "epoch": 8.1, "eval_ep_loss": -2.246208429336548, "eval_loss": 0.36925286054611206, "eval_mlm_loss": 0.36925286054611206, "eval_runtime": 61.3519, "eval_samples_per_second": 1138.823, "eval_steps_per_second": 0.57, "step": 10500 }, { "ep_loss": 0.0, "epoch": 8.11, "learning_rate": 0.00048401381909547736, "loss": 0.3922, "mlm_loss": 0.3922, "step": 10510 }, { "ep_loss": 0.0, "epoch": 8.11, "learning_rate": 0.0004836997487437186, "loss": 0.3878, "mlm_loss": 0.3878, "step": 10520 }, { "ep_loss": 0.0, "epoch": 8.12, "learning_rate": 0.0004833856783919598, "loss": 0.3901, "mlm_loss": 0.3901, "step": 10530 }, { "ep_loss": 0.0, "epoch": 8.13, "learning_rate": 0.000483071608040201, "loss": 0.3872, "mlm_loss": 0.3872, "step": 10540 }, { "ep_loss": 0.0, "epoch": 8.14, "learning_rate": 0.0004827575376884422, "loss": 0.3931, "mlm_loss": 0.3931, "step": 10550 }, { "ep_loss": 0.0, "epoch": 8.15, "learning_rate": 0.00048244346733668345, "loss": 0.3928, "mlm_loss": 0.3928, "step": 10560 }, { "ep_loss": 0.0, "epoch": 8.15, "learning_rate": 0.0004821293969849246, "loss": 0.3975, "mlm_loss": 0.3975, "step": 10570 }, { "ep_loss": 0.0, "epoch": 8.16, "learning_rate": 0.00048181532663316585, "loss": 0.4002, "mlm_loss": 0.4002, "step": 10580 }, { "ep_loss": 0.0, "epoch": 8.17, "learning_rate": 0.0004815012562814071, "loss": 0.3961, "mlm_loss": 0.3961, "step": 10590 }, { "ep_loss": 0.0, "epoch": 8.18, "learning_rate": 0.00048118718592964825, "loss": 0.392, "mlm_loss": 0.392, "step": 10600 }, { "epoch": 8.18, "eval_ep_loss": -2.7679216861724854, "eval_loss": 0.36933663487434387, "eval_mlm_loss": 0.36933663487434387, "eval_runtime": 61.879, "eval_samples_per_second": 1129.123, "eval_steps_per_second": 0.566, "step": 10600 }, { "ep_loss": 0.0, "epoch": 8.18, "learning_rate": 0.0004808731155778894, "loss": 0.3957, "mlm_loss": 0.3957, "step": 10610 }, { "ep_loss": 0.0, "epoch": 8.19, "learning_rate": 0.00048055904522613065, "loss": 0.4001, "mlm_loss": 0.4001, "step": 10620 }, { "ep_loss": 0.0, "epoch": 8.2, "learning_rate": 0.0004802449748743719, "loss": 0.398, "mlm_loss": 0.398, "step": 10630 }, { "ep_loss": 0.0, "epoch": 8.21, "learning_rate": 0.00047993090452261305, "loss": 0.3897, "mlm_loss": 0.3897, "step": 10640 }, { "ep_loss": 0.0, "epoch": 8.21, "learning_rate": 0.0004796168341708543, "loss": 0.3949, "mlm_loss": 0.3949, "step": 10650 }, { "ep_loss": 0.0, "epoch": 8.22, "learning_rate": 0.0004793027638190955, "loss": 0.3844, "mlm_loss": 0.3844, "step": 10660 }, { "ep_loss": 0.0, "epoch": 8.23, "learning_rate": 0.0004789886934673367, "loss": 0.3931, "mlm_loss": 0.3931, "step": 10670 }, { "ep_loss": 0.0, "epoch": 8.24, "learning_rate": 0.0004786746231155779, "loss": 0.3892, "mlm_loss": 0.3892, "step": 10680 }, { "ep_loss": 0.0, "epoch": 8.25, "learning_rate": 0.00047836055276381914, "loss": 0.3946, "mlm_loss": 0.3946, "step": 10690 }, { "ep_loss": 0.0, "epoch": 8.25, "learning_rate": 0.0004780464824120603, "loss": 0.393, "mlm_loss": 0.393, "step": 10700 }, { "epoch": 8.25, "eval_ep_loss": -2.653468608856201, "eval_loss": 0.36575964093208313, "eval_mlm_loss": 0.36575964093208313, "eval_runtime": 61.0152, "eval_samples_per_second": 1145.108, "eval_steps_per_second": 0.574, "step": 10700 }, { "ep_loss": 0.0, "epoch": 8.26, "learning_rate": 0.0004777324120603015, "loss": 0.3865, "mlm_loss": 0.3865, "step": 10710 }, { "ep_loss": 0.0, "epoch": 8.27, "learning_rate": 0.0004774183417085427, "loss": 0.3936, "mlm_loss": 0.3936, "step": 10720 }, { "ep_loss": 0.0, "epoch": 8.28, "learning_rate": 0.0004771042713567839, "loss": 0.3874, "mlm_loss": 0.3874, "step": 10730 }, { "ep_loss": 0.0, "epoch": 8.28, "learning_rate": 0.0004767902010050251, "loss": 0.3839, "mlm_loss": 0.3839, "step": 10740 }, { "ep_loss": 0.0, "epoch": 8.29, "learning_rate": 0.00047647613065326635, "loss": 0.3819, "mlm_loss": 0.3819, "step": 10750 }, { "ep_loss": 0.0, "epoch": 8.3, "learning_rate": 0.0004761620603015076, "loss": 0.3841, "mlm_loss": 0.3841, "step": 10760 }, { "ep_loss": 0.0, "epoch": 8.31, "learning_rate": 0.00047584798994974875, "loss": 0.3896, "mlm_loss": 0.3896, "step": 10770 }, { "ep_loss": 0.0, "epoch": 8.31, "learning_rate": 0.00047553391959799, "loss": 0.388, "mlm_loss": 0.388, "step": 10780 }, { "ep_loss": 0.0, "epoch": 8.32, "learning_rate": 0.0004752198492462312, "loss": 0.3864, "mlm_loss": 0.3864, "step": 10790 }, { "ep_loss": 0.0, "epoch": 8.33, "learning_rate": 0.0004749057788944724, "loss": 0.3887, "mlm_loss": 0.3887, "step": 10800 }, { "epoch": 8.33, "eval_ep_loss": -2.6650192737579346, "eval_loss": 0.362870454788208, "eval_mlm_loss": 0.362870454788208, "eval_runtime": 59.7372, "eval_samples_per_second": 1169.606, "eval_steps_per_second": 0.586, "step": 10800 }, { "ep_loss": 0.0, "epoch": 8.34, "learning_rate": 0.00047459170854271355, "loss": 0.3898, "mlm_loss": 0.3898, "step": 10810 }, { "ep_loss": 0.0, "epoch": 8.35, "learning_rate": 0.0004742776381909548, "loss": 0.3881, "mlm_loss": 0.3881, "step": 10820 }, { "ep_loss": 0.0, "epoch": 8.35, "learning_rate": 0.00047396356783919595, "loss": 0.3868, "mlm_loss": 0.3868, "step": 10830 }, { "ep_loss": 0.0, "epoch": 8.36, "learning_rate": 0.0004736494974874372, "loss": 0.3924, "mlm_loss": 0.3924, "step": 10840 }, { "ep_loss": 0.0, "epoch": 8.37, "learning_rate": 0.0004733354271356784, "loss": 0.3849, "mlm_loss": 0.3849, "step": 10850 }, { "ep_loss": 0.0, "epoch": 8.38, "learning_rate": 0.0004730213567839196, "loss": 0.387, "mlm_loss": 0.387, "step": 10860 }, { "ep_loss": 0.0, "epoch": 8.38, "learning_rate": 0.0004727072864321608, "loss": 0.3829, "mlm_loss": 0.3829, "step": 10870 }, { "ep_loss": 0.0, "epoch": 8.39, "learning_rate": 0.00047239321608040204, "loss": 0.3881, "mlm_loss": 0.3881, "step": 10880 }, { "ep_loss": 0.0, "epoch": 8.4, "learning_rate": 0.00047207914572864327, "loss": 0.3957, "mlm_loss": 0.3957, "step": 10890 }, { "ep_loss": 0.0, "epoch": 8.41, "learning_rate": 0.00047176507537688444, "loss": 0.3856, "mlm_loss": 0.3856, "step": 10900 }, { "epoch": 8.41, "eval_ep_loss": -2.556627035140991, "eval_loss": 0.3644386827945709, "eval_mlm_loss": 0.3644386827945709, "eval_runtime": 60.0106, "eval_samples_per_second": 1164.279, "eval_steps_per_second": 0.583, "step": 10900 }, { "ep_loss": 0.0, "epoch": 8.41, "learning_rate": 0.0004714510050251256, "loss": 0.3918, "mlm_loss": 0.3918, "step": 10910 }, { "ep_loss": 0.0, "epoch": 8.42, "learning_rate": 0.00047113693467336684, "loss": 0.3888, "mlm_loss": 0.3888, "step": 10920 }, { "ep_loss": 0.0, "epoch": 8.43, "learning_rate": 0.000470822864321608, "loss": 0.3874, "mlm_loss": 0.3874, "step": 10930 }, { "ep_loss": 0.0, "epoch": 8.44, "learning_rate": 0.00047050879396984925, "loss": 0.3826, "mlm_loss": 0.3826, "step": 10940 }, { "ep_loss": 0.0, "epoch": 8.45, "learning_rate": 0.0004701947236180905, "loss": 0.3817, "mlm_loss": 0.3817, "step": 10950 }, { "ep_loss": 0.0, "epoch": 8.45, "learning_rate": 0.00046988065326633165, "loss": 0.3877, "mlm_loss": 0.3877, "step": 10960 }, { "ep_loss": 0.0, "epoch": 8.46, "learning_rate": 0.0004695665829145729, "loss": 0.3841, "mlm_loss": 0.3841, "step": 10970 }, { "ep_loss": 0.0, "epoch": 8.47, "learning_rate": 0.0004692525125628141, "loss": 0.3854, "mlm_loss": 0.3854, "step": 10980 }, { "ep_loss": 0.0, "epoch": 8.48, "learning_rate": 0.0004689384422110553, "loss": 0.3818, "mlm_loss": 0.3818, "step": 10990 }, { "ep_loss": 0.0, "epoch": 8.48, "learning_rate": 0.0004686243718592965, "loss": 0.3844, "mlm_loss": 0.3844, "step": 11000 }, { "epoch": 8.48, "eval_ep_loss": -2.669065475463867, "eval_loss": 0.36087533831596375, "eval_mlm_loss": 0.36087533831596375, "eval_runtime": 59.8288, "eval_samples_per_second": 1167.815, "eval_steps_per_second": 0.585, "step": 11000 }, { "ep_loss": 0.0, "epoch": 8.49, "learning_rate": 0.0004683103015075377, "loss": 0.3867, "mlm_loss": 0.3867, "step": 11010 }, { "ep_loss": 0.0, "epoch": 8.5, "learning_rate": 0.0004679962311557789, "loss": 0.383, "mlm_loss": 0.383, "step": 11020 }, { "ep_loss": 0.0, "epoch": 8.51, "learning_rate": 0.0004676821608040201, "loss": 0.3897, "mlm_loss": 0.3897, "step": 11030 }, { "ep_loss": 0.0, "epoch": 8.52, "learning_rate": 0.0004673680904522613, "loss": 0.3908, "mlm_loss": 0.3908, "step": 11040 }, { "ep_loss": 0.0, "epoch": 8.52, "learning_rate": 0.00046705402010050254, "loss": 0.3854, "mlm_loss": 0.3854, "step": 11050 }, { "ep_loss": 0.0, "epoch": 8.53, "learning_rate": 0.0004667399497487437, "loss": 0.3886, "mlm_loss": 0.3886, "step": 11060 }, { "ep_loss": 0.0, "epoch": 8.54, "learning_rate": 0.00046642587939698494, "loss": 0.3814, "mlm_loss": 0.3814, "step": 11070 }, { "ep_loss": 0.0, "epoch": 8.55, "learning_rate": 0.00046611180904522617, "loss": 0.3888, "mlm_loss": 0.3888, "step": 11080 }, { "ep_loss": 0.0, "epoch": 8.55, "learning_rate": 0.00046579773869346734, "loss": 0.3884, "mlm_loss": 0.3884, "step": 11090 }, { "ep_loss": 0.0, "epoch": 8.56, "learning_rate": 0.00046548366834170857, "loss": 0.39, "mlm_loss": 0.39, "step": 11100 }, { "epoch": 8.56, "eval_ep_loss": -2.3106160163879395, "eval_loss": 0.3573386073112488, "eval_mlm_loss": 0.3573386073112488, "eval_runtime": 61.2527, "eval_samples_per_second": 1140.668, "eval_steps_per_second": 0.571, "step": 11100 }, { "ep_loss": 0.0, "epoch": 8.57, "learning_rate": 0.00046516959798994974, "loss": 0.3767, "mlm_loss": 0.3767, "step": 11110 }, { "ep_loss": 0.0, "epoch": 8.58, "learning_rate": 0.0004648555276381909, "loss": 0.3712, "mlm_loss": 0.3712, "step": 11120 }, { "ep_loss": 0.0, "epoch": 8.58, "learning_rate": 0.00046454145728643215, "loss": 0.3787, "mlm_loss": 0.3787, "step": 11130 }, { "ep_loss": 0.0, "epoch": 8.59, "learning_rate": 0.0004642273869346734, "loss": 0.3815, "mlm_loss": 0.3815, "step": 11140 }, { "ep_loss": 0.0, "epoch": 8.6, "learning_rate": 0.0004639133165829146, "loss": 0.3837, "mlm_loss": 0.3837, "step": 11150 }, { "ep_loss": 0.0, "epoch": 8.61, "learning_rate": 0.0004635992462311558, "loss": 0.3851, "mlm_loss": 0.3851, "step": 11160 }, { "ep_loss": 0.0, "epoch": 8.62, "learning_rate": 0.000463285175879397, "loss": 0.3818, "mlm_loss": 0.3818, "step": 11170 }, { "ep_loss": 0.0, "epoch": 8.62, "learning_rate": 0.00046297110552763823, "loss": 0.3841, "mlm_loss": 0.3841, "step": 11180 }, { "ep_loss": 0.0, "epoch": 8.63, "learning_rate": 0.0004626570351758794, "loss": 0.3803, "mlm_loss": 0.3803, "step": 11190 }, { "ep_loss": 0.0, "epoch": 8.64, "learning_rate": 0.00046234296482412063, "loss": 0.3799, "mlm_loss": 0.3799, "step": 11200 }, { "epoch": 8.64, "eval_ep_loss": -2.7373206615448, "eval_loss": 0.35678714513778687, "eval_mlm_loss": 0.35678714513778687, "eval_runtime": 63.4192, "eval_samples_per_second": 1101.701, "eval_steps_per_second": 0.552, "step": 11200 }, { "ep_loss": 0.0, "epoch": 8.65, "learning_rate": 0.0004620288944723618, "loss": 0.3788, "mlm_loss": 0.3788, "step": 11210 }, { "ep_loss": 0.0, "epoch": 8.65, "learning_rate": 0.000461714824120603, "loss": 0.385, "mlm_loss": 0.385, "step": 11220 }, { "ep_loss": 0.0, "epoch": 8.66, "learning_rate": 0.0004614007537688442, "loss": 0.3886, "mlm_loss": 0.3886, "step": 11230 }, { "ep_loss": 0.0, "epoch": 8.67, "learning_rate": 0.00046108668341708544, "loss": 0.3816, "mlm_loss": 0.3816, "step": 11240 }, { "ep_loss": 0.0, "epoch": 8.68, "learning_rate": 0.00046077261306532667, "loss": 0.3913, "mlm_loss": 0.3913, "step": 11250 }, { "ep_loss": 0.0, "epoch": 8.68, "learning_rate": 0.00046045854271356784, "loss": 0.3816, "mlm_loss": 0.3816, "step": 11260 }, { "ep_loss": 0.0, "epoch": 8.69, "learning_rate": 0.00046014447236180907, "loss": 0.3809, "mlm_loss": 0.3809, "step": 11270 }, { "ep_loss": 0.0, "epoch": 8.7, "learning_rate": 0.0004598304020100503, "loss": 0.3901, "mlm_loss": 0.3901, "step": 11280 }, { "ep_loss": 0.0, "epoch": 8.71, "learning_rate": 0.00045951633165829147, "loss": 0.3791, "mlm_loss": 0.3791, "step": 11290 }, { "ep_loss": 0.0, "epoch": 8.72, "learning_rate": 0.0004592022613065327, "loss": 0.3778, "mlm_loss": 0.3778, "step": 11300 }, { "epoch": 8.72, "eval_ep_loss": -2.907904624938965, "eval_loss": 0.3548813462257385, "eval_mlm_loss": 0.3548813462257385, "eval_runtime": 63.2595, "eval_samples_per_second": 1104.482, "eval_steps_per_second": 0.553, "step": 11300 }, { "ep_loss": 0.0, "epoch": 8.72, "learning_rate": 0.00045888819095477387, "loss": 0.3802, "mlm_loss": 0.3802, "step": 11310 }, { "ep_loss": 0.0, "epoch": 8.73, "learning_rate": 0.00045857412060301505, "loss": 0.3811, "mlm_loss": 0.3811, "step": 11320 }, { "ep_loss": 0.0, "epoch": 8.74, "learning_rate": 0.0004582600502512563, "loss": 0.385, "mlm_loss": 0.385, "step": 11330 }, { "ep_loss": 0.0, "epoch": 8.75, "learning_rate": 0.0004579459798994975, "loss": 0.3828, "mlm_loss": 0.3828, "step": 11340 }, { "ep_loss": 0.0, "epoch": 8.75, "learning_rate": 0.0004576319095477387, "loss": 0.3789, "mlm_loss": 0.3789, "step": 11350 }, { "ep_loss": 0.0, "epoch": 8.76, "learning_rate": 0.0004573178391959799, "loss": 0.3802, "mlm_loss": 0.3802, "step": 11360 }, { "ep_loss": 0.0, "epoch": 8.77, "learning_rate": 0.00045700376884422113, "loss": 0.38, "mlm_loss": 0.38, "step": 11370 }, { "ep_loss": 0.0, "epoch": 8.78, "learning_rate": 0.00045668969849246236, "loss": 0.3842, "mlm_loss": 0.3842, "step": 11380 }, { "ep_loss": 0.0, "epoch": 8.79, "learning_rate": 0.00045637562814070353, "loss": 0.3843, "mlm_loss": 0.3843, "step": 11390 }, { "ep_loss": 0.0, "epoch": 8.79, "learning_rate": 0.00045606155778894476, "loss": 0.3811, "mlm_loss": 0.3811, "step": 11400 }, { "epoch": 8.79, "eval_ep_loss": -2.4214723110198975, "eval_loss": 0.35386431217193604, "eval_mlm_loss": 0.35386431217193604, "eval_runtime": 59.4503, "eval_samples_per_second": 1175.25, "eval_steps_per_second": 0.589, "step": 11400 }, { "ep_loss": 0.0, "epoch": 8.8, "learning_rate": 0.00045574748743718594, "loss": 0.3777, "mlm_loss": 0.3777, "step": 11410 }, { "ep_loss": 0.0, "epoch": 8.81, "learning_rate": 0.0004554334170854271, "loss": 0.3857, "mlm_loss": 0.3857, "step": 11420 }, { "ep_loss": 0.0, "epoch": 8.82, "learning_rate": 0.00045511934673366834, "loss": 0.3818, "mlm_loss": 0.3818, "step": 11430 }, { "ep_loss": 0.0, "epoch": 8.82, "learning_rate": 0.00045480527638190957, "loss": 0.3794, "mlm_loss": 0.3794, "step": 11440 }, { "ep_loss": 0.0, "epoch": 8.83, "learning_rate": 0.00045449120603015074, "loss": 0.3783, "mlm_loss": 0.3783, "step": 11450 }, { "ep_loss": 0.0, "epoch": 8.84, "learning_rate": 0.00045417713567839197, "loss": 0.3826, "mlm_loss": 0.3826, "step": 11460 }, { "ep_loss": 0.0, "epoch": 8.85, "learning_rate": 0.0004538630653266332, "loss": 0.3786, "mlm_loss": 0.3786, "step": 11470 }, { "ep_loss": 0.0, "epoch": 8.85, "learning_rate": 0.00045354899497487437, "loss": 0.3731, "mlm_loss": 0.3731, "step": 11480 }, { "ep_loss": 0.0, "epoch": 8.86, "learning_rate": 0.0004532349246231156, "loss": 0.3766, "mlm_loss": 0.3766, "step": 11490 }, { "ep_loss": 0.0, "epoch": 8.87, "learning_rate": 0.0004529208542713568, "loss": 0.386, "mlm_loss": 0.386, "step": 11500 }, { "epoch": 8.87, "eval_ep_loss": -2.3580639362335205, "eval_loss": 0.3554854094982147, "eval_mlm_loss": 0.3554854094982147, "eval_runtime": 59.7228, "eval_samples_per_second": 1169.889, "eval_steps_per_second": 0.586, "step": 11500 }, { "ep_loss": 0.0, "epoch": 8.88, "learning_rate": 0.000452606783919598, "loss": 0.3765, "mlm_loss": 0.3765, "step": 11510 }, { "ep_loss": 0.0, "epoch": 8.89, "learning_rate": 0.0004522927135678392, "loss": 0.375, "mlm_loss": 0.375, "step": 11520 }, { "ep_loss": 0.0, "epoch": 8.89, "learning_rate": 0.0004519786432160804, "loss": 0.3831, "mlm_loss": 0.3831, "step": 11530 }, { "ep_loss": 0.0, "epoch": 8.9, "learning_rate": 0.00045166457286432163, "loss": 0.3797, "mlm_loss": 0.3797, "step": 11540 }, { "ep_loss": 0.0, "epoch": 8.91, "learning_rate": 0.0004513505025125628, "loss": 0.3729, "mlm_loss": 0.3729, "step": 11550 }, { "ep_loss": 0.0, "epoch": 8.92, "learning_rate": 0.00045103643216080403, "loss": 0.3736, "mlm_loss": 0.3736, "step": 11560 }, { "ep_loss": 0.0, "epoch": 8.92, "learning_rate": 0.00045072236180904526, "loss": 0.3756, "mlm_loss": 0.3756, "step": 11570 }, { "ep_loss": 0.0, "epoch": 8.93, "learning_rate": 0.00045040829145728643, "loss": 0.3791, "mlm_loss": 0.3791, "step": 11580 }, { "ep_loss": 0.0, "epoch": 8.94, "learning_rate": 0.00045009422110552766, "loss": 0.3701, "mlm_loss": 0.3701, "step": 11590 }, { "ep_loss": 0.0, "epoch": 8.95, "learning_rate": 0.0004497801507537689, "loss": 0.374, "mlm_loss": 0.374, "step": 11600 }, { "epoch": 8.95, "eval_ep_loss": -2.2494945526123047, "eval_loss": 0.35051652789115906, "eval_mlm_loss": 0.35051652789115906, "eval_runtime": 60.6153, "eval_samples_per_second": 1152.663, "eval_steps_per_second": 0.577, "step": 11600 }, { "ep_loss": 0.0, "epoch": 8.95, "learning_rate": 0.00044946608040201, "loss": 0.3771, "mlm_loss": 0.3771, "step": 11610 }, { "ep_loss": 0.0, "epoch": 8.96, "learning_rate": 0.00044915201005025124, "loss": 0.3787, "mlm_loss": 0.3787, "step": 11620 }, { "ep_loss": 0.0, "epoch": 8.97, "learning_rate": 0.00044883793969849247, "loss": 0.3783, "mlm_loss": 0.3783, "step": 11630 }, { "ep_loss": 0.0, "epoch": 8.98, "learning_rate": 0.0004485238693467337, "loss": 0.3683, "mlm_loss": 0.3683, "step": 11640 }, { "ep_loss": 0.0, "epoch": 8.99, "learning_rate": 0.00044820979899497487, "loss": 0.3755, "mlm_loss": 0.3755, "step": 11650 }, { "ep_loss": 0.0, "epoch": 8.99, "learning_rate": 0.0004478957286432161, "loss": 0.3751, "mlm_loss": 0.3751, "step": 11660 }, { "ep_loss": 0.0, "epoch": 9.0, "learning_rate": 0.0004475816582914573, "loss": 0.3806, "mlm_loss": 0.3806, "step": 11670 }, { "ep_loss": 0.0, "epoch": 9.01, "learning_rate": 0.0004472675879396985, "loss": 0.3723, "mlm_loss": 0.3723, "step": 11680 }, { "ep_loss": 0.0, "epoch": 9.02, "learning_rate": 0.0004469535175879397, "loss": 0.3727, "mlm_loss": 0.3727, "step": 11690 }, { "ep_loss": 0.0, "epoch": 9.02, "learning_rate": 0.00044663944723618095, "loss": 0.3717, "mlm_loss": 0.3717, "step": 11700 }, { "epoch": 9.02, "eval_ep_loss": -2.543911933898926, "eval_loss": 0.34973713755607605, "eval_mlm_loss": 0.34973713755607605, "eval_runtime": 62.1216, "eval_samples_per_second": 1124.713, "eval_steps_per_second": 0.563, "step": 11700 }, { "ep_loss": 0.0, "epoch": 9.03, "learning_rate": 0.0004463253768844221, "loss": 0.3694, "mlm_loss": 0.3694, "step": 11710 }, { "ep_loss": 0.0, "epoch": 9.04, "learning_rate": 0.0004460113065326633, "loss": 0.3673, "mlm_loss": 0.3673, "step": 11720 }, { "ep_loss": 0.0, "epoch": 9.05, "learning_rate": 0.00044569723618090453, "loss": 0.3679, "mlm_loss": 0.3679, "step": 11730 }, { "ep_loss": 0.0, "epoch": 9.06, "learning_rate": 0.0004453831658291457, "loss": 0.3757, "mlm_loss": 0.3757, "step": 11740 }, { "ep_loss": 0.0, "epoch": 9.06, "learning_rate": 0.00044506909547738693, "loss": 0.3702, "mlm_loss": 0.3702, "step": 11750 }, { "ep_loss": 0.0, "epoch": 9.07, "learning_rate": 0.00044475502512562816, "loss": 0.3697, "mlm_loss": 0.3697, "step": 11760 }, { "ep_loss": 0.0, "epoch": 9.08, "learning_rate": 0.0004444409547738694, "loss": 0.3721, "mlm_loss": 0.3721, "step": 11770 }, { "ep_loss": 0.0, "epoch": 9.09, "learning_rate": 0.00044412688442211056, "loss": 0.3713, "mlm_loss": 0.3713, "step": 11780 }, { "ep_loss": 0.0, "epoch": 9.09, "learning_rate": 0.0004438128140703518, "loss": 0.3701, "mlm_loss": 0.3701, "step": 11790 }, { "ep_loss": 0.0, "epoch": 9.1, "learning_rate": 0.000443498743718593, "loss": 0.368, "mlm_loss": 0.368, "step": 11800 }, { "epoch": 9.1, "eval_ep_loss": -2.5209760665893555, "eval_loss": 0.349372535943985, "eval_mlm_loss": 0.349372535943985, "eval_runtime": 58.6125, "eval_samples_per_second": 1192.049, "eval_steps_per_second": 0.597, "step": 11800 }, { "ep_loss": 0.0, "epoch": 9.11, "learning_rate": 0.00044318467336683414, "loss": 0.3664, "mlm_loss": 0.3664, "step": 11810 }, { "ep_loss": 0.0, "epoch": 9.12, "learning_rate": 0.00044287060301507537, "loss": 0.3766, "mlm_loss": 0.3766, "step": 11820 }, { "ep_loss": 0.0, "epoch": 9.12, "learning_rate": 0.0004425565326633166, "loss": 0.3677, "mlm_loss": 0.3677, "step": 11830 }, { "ep_loss": 0.0, "epoch": 9.13, "learning_rate": 0.00044224246231155777, "loss": 0.3701, "mlm_loss": 0.3701, "step": 11840 }, { "ep_loss": 0.0, "epoch": 9.14, "learning_rate": 0.000441928391959799, "loss": 0.3789, "mlm_loss": 0.3789, "step": 11850 }, { "ep_loss": 0.0, "epoch": 9.15, "learning_rate": 0.0004416143216080402, "loss": 0.3716, "mlm_loss": 0.3716, "step": 11860 }, { "ep_loss": 0.0, "epoch": 9.16, "learning_rate": 0.00044130025125628145, "loss": 0.3672, "mlm_loss": 0.3672, "step": 11870 }, { "ep_loss": 0.0, "epoch": 9.16, "learning_rate": 0.0004409861809045226, "loss": 0.369, "mlm_loss": 0.369, "step": 11880 }, { "ep_loss": 0.0, "epoch": 9.17, "learning_rate": 0.00044067211055276385, "loss": 0.377, "mlm_loss": 0.377, "step": 11890 }, { "ep_loss": 0.0, "epoch": 9.18, "learning_rate": 0.0004403580402010051, "loss": 0.3692, "mlm_loss": 0.3692, "step": 11900 }, { "epoch": 9.18, "eval_ep_loss": -2.5095667839050293, "eval_loss": 0.3459388017654419, "eval_mlm_loss": 0.3459388017654419, "eval_runtime": 59.1505, "eval_samples_per_second": 1181.208, "eval_steps_per_second": 0.592, "step": 11900 }, { "ep_loss": 0.0, "epoch": 9.19, "learning_rate": 0.0004400439698492462, "loss": 0.3717, "mlm_loss": 0.3717, "step": 11910 }, { "ep_loss": 0.0, "epoch": 9.19, "learning_rate": 0.00043972989949748743, "loss": 0.3706, "mlm_loss": 0.3706, "step": 11920 }, { "ep_loss": 0.0, "epoch": 9.2, "learning_rate": 0.00043941582914572866, "loss": 0.3766, "mlm_loss": 0.3766, "step": 11930 }, { "ep_loss": 0.0, "epoch": 9.21, "learning_rate": 0.00043910175879396983, "loss": 0.3675, "mlm_loss": 0.3675, "step": 11940 }, { "ep_loss": 0.0, "epoch": 9.22, "learning_rate": 0.00043878768844221106, "loss": 0.3653, "mlm_loss": 0.3653, "step": 11950 }, { "ep_loss": 0.0, "epoch": 9.22, "learning_rate": 0.0004384736180904523, "loss": 0.3683, "mlm_loss": 0.3683, "step": 11960 }, { "ep_loss": 0.0, "epoch": 9.23, "learning_rate": 0.00043815954773869346, "loss": 0.3695, "mlm_loss": 0.3695, "step": 11970 }, { "ep_loss": 0.0, "epoch": 9.24, "learning_rate": 0.0004378454773869347, "loss": 0.3674, "mlm_loss": 0.3674, "step": 11980 }, { "ep_loss": 0.0, "epoch": 9.25, "learning_rate": 0.0004375314070351759, "loss": 0.3621, "mlm_loss": 0.3621, "step": 11990 }, { "ep_loss": 0.0, "epoch": 9.26, "learning_rate": 0.00043721733668341715, "loss": 0.3714, "mlm_loss": 0.3714, "step": 12000 }, { "epoch": 9.26, "eval_ep_loss": -2.6775925159454346, "eval_loss": 0.3442252576351166, "eval_mlm_loss": 0.3442252576351166, "eval_runtime": 60.7149, "eval_samples_per_second": 1150.773, "eval_steps_per_second": 0.576, "step": 12000 }, { "ep_loss": 0.0, "epoch": 9.26, "learning_rate": 0.00043690326633165827, "loss": 0.3706, "mlm_loss": 0.3706, "step": 12010 }, { "ep_loss": 0.0, "epoch": 9.27, "learning_rate": 0.0004365891959798995, "loss": 0.3642, "mlm_loss": 0.3642, "step": 12020 }, { "ep_loss": 0.0, "epoch": 9.28, "learning_rate": 0.0004362751256281407, "loss": 0.3663, "mlm_loss": 0.3663, "step": 12030 }, { "ep_loss": 0.0, "epoch": 9.29, "learning_rate": 0.0004359610552763819, "loss": 0.3743, "mlm_loss": 0.3743, "step": 12040 }, { "ep_loss": 0.0, "epoch": 9.29, "learning_rate": 0.0004356469849246231, "loss": 0.3701, "mlm_loss": 0.3701, "step": 12050 }, { "ep_loss": 0.0, "epoch": 9.3, "learning_rate": 0.00043533291457286435, "loss": 0.3713, "mlm_loss": 0.3713, "step": 12060 }, { "ep_loss": 0.0, "epoch": 9.31, "learning_rate": 0.0004350188442211055, "loss": 0.3684, "mlm_loss": 0.3684, "step": 12070 }, { "ep_loss": 0.0, "epoch": 9.32, "learning_rate": 0.00043470477386934675, "loss": 0.3603, "mlm_loss": 0.3603, "step": 12080 }, { "ep_loss": 0.0, "epoch": 9.33, "learning_rate": 0.000434390703517588, "loss": 0.3599, "mlm_loss": 0.3599, "step": 12090 }, { "ep_loss": 0.0, "epoch": 9.33, "learning_rate": 0.00043407663316582916, "loss": 0.3653, "mlm_loss": 0.3653, "step": 12100 }, { "epoch": 9.33, "eval_ep_loss": -2.689305543899536, "eval_loss": 0.34572362899780273, "eval_mlm_loss": 0.34572362899780273, "eval_runtime": 60.5622, "eval_samples_per_second": 1153.673, "eval_steps_per_second": 0.578, "step": 12100 }, { "ep_loss": 0.0, "epoch": 9.34, "learning_rate": 0.00043376256281407033, "loss": 0.3746, "mlm_loss": 0.3746, "step": 12110 }, { "ep_loss": 0.0, "epoch": 9.35, "learning_rate": 0.00043344849246231156, "loss": 0.3707, "mlm_loss": 0.3707, "step": 12120 }, { "ep_loss": 0.0, "epoch": 9.36, "learning_rate": 0.0004331344221105528, "loss": 0.375, "mlm_loss": 0.375, "step": 12130 }, { "ep_loss": 0.0, "epoch": 9.36, "learning_rate": 0.00043282035175879396, "loss": 0.3675, "mlm_loss": 0.3675, "step": 12140 }, { "ep_loss": 0.0, "epoch": 9.37, "learning_rate": 0.0004325062814070352, "loss": 0.3683, "mlm_loss": 0.3683, "step": 12150 }, { "ep_loss": 0.0, "epoch": 9.38, "learning_rate": 0.0004321922110552764, "loss": 0.3646, "mlm_loss": 0.3646, "step": 12160 }, { "ep_loss": 0.0, "epoch": 9.39, "learning_rate": 0.0004318781407035176, "loss": 0.37, "mlm_loss": 0.37, "step": 12170 }, { "ep_loss": 0.0, "epoch": 9.39, "learning_rate": 0.0004315640703517588, "loss": 0.3701, "mlm_loss": 0.3701, "step": 12180 }, { "ep_loss": 0.0, "epoch": 9.4, "learning_rate": 0.00043125000000000005, "loss": 0.364, "mlm_loss": 0.364, "step": 12190 }, { "ep_loss": 0.0, "epoch": 9.41, "learning_rate": 0.00043093592964824117, "loss": 0.3639, "mlm_loss": 0.3639, "step": 12200 }, { "epoch": 9.41, "eval_ep_loss": -2.2502214908599854, "eval_loss": 0.3424089848995209, "eval_mlm_loss": 0.3424089848995209, "eval_runtime": 60.2107, "eval_samples_per_second": 1160.408, "eval_steps_per_second": 0.581, "step": 12200 }, { "ep_loss": 0.0, "epoch": 9.42, "learning_rate": 0.0004306218592964824, "loss": 0.3668, "mlm_loss": 0.3668, "step": 12210 }, { "ep_loss": 0.0, "epoch": 9.43, "learning_rate": 0.0004303077889447236, "loss": 0.3643, "mlm_loss": 0.3643, "step": 12220 }, { "ep_loss": 0.0, "epoch": 9.43, "learning_rate": 0.0004299937185929648, "loss": 0.3682, "mlm_loss": 0.3682, "step": 12230 }, { "ep_loss": 0.0, "epoch": 9.44, "learning_rate": 0.00042971105527638193, "loss": 0.3682, "mlm_loss": 0.3682, "step": 12240 }, { "ep_loss": 0.0, "epoch": 9.45, "learning_rate": 0.00042939698492462316, "loss": 0.3663, "mlm_loss": 0.3663, "step": 12250 }, { "ep_loss": 0.0, "epoch": 9.46, "learning_rate": 0.0004291143216080402, "loss": 0.3633, "mlm_loss": 0.3633, "step": 12260 }, { "ep_loss": 0.0, "epoch": 9.46, "learning_rate": 0.0004288002512562814, "loss": 0.3657, "mlm_loss": 0.3657, "step": 12270 }, { "ep_loss": 0.0, "epoch": 9.47, "learning_rate": 0.00042848618090452265, "loss": 0.3608, "mlm_loss": 0.3608, "step": 12280 }, { "ep_loss": 0.0, "epoch": 9.48, "learning_rate": 0.0004281721105527638, "loss": 0.3674, "mlm_loss": 0.3674, "step": 12290 }, { "ep_loss": 0.0, "epoch": 9.49, "learning_rate": 0.00042785804020100505, "loss": 0.3701, "mlm_loss": 0.3701, "step": 12300 }, { "epoch": 9.49, "eval_ep_loss": -2.4860949516296387, "eval_loss": 0.34168559312820435, "eval_mlm_loss": 0.34168559312820435, "eval_runtime": 60.0999, "eval_samples_per_second": 1162.548, "eval_steps_per_second": 0.582, "step": 12300 }, { "ep_loss": 0.0, "epoch": 9.49, "learning_rate": 0.0004275439698492463, "loss": 0.3626, "mlm_loss": 0.3626, "step": 12310 }, { "ep_loss": 0.0, "epoch": 9.5, "learning_rate": 0.00042722989949748745, "loss": 0.3667, "mlm_loss": 0.3667, "step": 12320 }, { "ep_loss": 0.0, "epoch": 9.51, "learning_rate": 0.0004269158291457286, "loss": 0.3564, "mlm_loss": 0.3564, "step": 12330 }, { "ep_loss": 0.0, "epoch": 9.52, "learning_rate": 0.00042660175879396985, "loss": 0.3653, "mlm_loss": 0.3653, "step": 12340 }, { "ep_loss": 0.0, "epoch": 9.53, "learning_rate": 0.00042628768844221103, "loss": 0.3696, "mlm_loss": 0.3696, "step": 12350 }, { "ep_loss": 0.0, "epoch": 9.53, "learning_rate": 0.00042597361809045226, "loss": 0.3653, "mlm_loss": 0.3653, "step": 12360 }, { "ep_loss": 0.0, "epoch": 9.54, "learning_rate": 0.0004256595477386935, "loss": 0.3645, "mlm_loss": 0.3645, "step": 12370 }, { "ep_loss": 0.0, "epoch": 9.55, "learning_rate": 0.0004253454773869347, "loss": 0.368, "mlm_loss": 0.368, "step": 12380 }, { "ep_loss": 0.0, "epoch": 9.56, "learning_rate": 0.0004250314070351759, "loss": 0.3637, "mlm_loss": 0.3637, "step": 12390 }, { "ep_loss": 0.0, "epoch": 9.56, "learning_rate": 0.0004247173366834171, "loss": 0.3694, "mlm_loss": 0.3694, "step": 12400 }, { "epoch": 9.56, "eval_ep_loss": -2.249842405319214, "eval_loss": 0.3413134813308716, "eval_mlm_loss": 0.3413134813308716, "eval_runtime": 60.5795, "eval_samples_per_second": 1153.344, "eval_steps_per_second": 0.578, "step": 12400 }, { "ep_loss": 0.0, "epoch": 9.57, "learning_rate": 0.00042440326633165834, "loss": 0.3612, "mlm_loss": 0.3612, "step": 12410 }, { "ep_loss": 0.0, "epoch": 9.58, "learning_rate": 0.0004240891959798995, "loss": 0.3594, "mlm_loss": 0.3594, "step": 12420 }, { "ep_loss": 0.0, "epoch": 9.59, "learning_rate": 0.0004237751256281407, "loss": 0.3705, "mlm_loss": 0.3705, "step": 12430 }, { "ep_loss": 0.0, "epoch": 9.6, "learning_rate": 0.0004234610552763819, "loss": 0.3608, "mlm_loss": 0.3608, "step": 12440 }, { "ep_loss": 0.0, "epoch": 9.6, "learning_rate": 0.0004231469849246231, "loss": 0.3611, "mlm_loss": 0.3611, "step": 12450 }, { "ep_loss": 0.0, "epoch": 9.61, "learning_rate": 0.0004228329145728643, "loss": 0.3615, "mlm_loss": 0.3615, "step": 12460 }, { "ep_loss": 0.0, "epoch": 9.62, "learning_rate": 0.00042251884422110555, "loss": 0.3641, "mlm_loss": 0.3641, "step": 12470 }, { "ep_loss": 0.0, "epoch": 9.63, "learning_rate": 0.0004222047738693467, "loss": 0.3664, "mlm_loss": 0.3664, "step": 12480 }, { "ep_loss": 0.0, "epoch": 9.63, "learning_rate": 0.00042189070351758795, "loss": 0.3612, "mlm_loss": 0.3612, "step": 12490 }, { "ep_loss": 0.0, "epoch": 9.64, "learning_rate": 0.0004215766331658292, "loss": 0.37, "mlm_loss": 0.37, "step": 12500 }, { "epoch": 9.64, "eval_ep_loss": -2.4926164150238037, "eval_loss": 0.3400452435016632, "eval_mlm_loss": 0.3400452435016632, "eval_runtime": 62.1559, "eval_samples_per_second": 1124.093, "eval_steps_per_second": 0.563, "step": 12500 }, { "ep_loss": 0.0, "epoch": 9.65, "learning_rate": 0.0004212625628140704, "loss": 0.3693, "mlm_loss": 0.3693, "step": 12510 }, { "ep_loss": 0.0, "epoch": 9.66, "learning_rate": 0.0004209484924623116, "loss": 0.3623, "mlm_loss": 0.3623, "step": 12520 }, { "ep_loss": 0.0, "epoch": 9.66, "learning_rate": 0.00042063442211055275, "loss": 0.3637, "mlm_loss": 0.3637, "step": 12530 }, { "ep_loss": 0.0, "epoch": 9.67, "learning_rate": 0.000420320351758794, "loss": 0.363, "mlm_loss": 0.363, "step": 12540 }, { "ep_loss": 0.0, "epoch": 9.68, "learning_rate": 0.00042000628140703516, "loss": 0.355, "mlm_loss": 0.355, "step": 12550 }, { "ep_loss": 0.0, "epoch": 9.69, "learning_rate": 0.0004196922110552764, "loss": 0.3651, "mlm_loss": 0.3651, "step": 12560 }, { "ep_loss": 0.0, "epoch": 9.7, "learning_rate": 0.0004193781407035176, "loss": 0.3593, "mlm_loss": 0.3593, "step": 12570 }, { "ep_loss": 0.0, "epoch": 9.7, "learning_rate": 0.0004190640703517588, "loss": 0.3591, "mlm_loss": 0.3591, "step": 12580 }, { "ep_loss": 0.0, "epoch": 9.71, "learning_rate": 0.00041875, "loss": 0.3585, "mlm_loss": 0.3585, "step": 12590 }, { "ep_loss": 0.0, "epoch": 9.72, "learning_rate": 0.00041843592964824124, "loss": 0.3618, "mlm_loss": 0.3618, "step": 12600 }, { "epoch": 9.72, "eval_ep_loss": -2.574556827545166, "eval_loss": 0.33879169821739197, "eval_mlm_loss": 0.33879169821739197, "eval_runtime": 60.5312, "eval_samples_per_second": 1154.265, "eval_steps_per_second": 0.578, "step": 12600 }, { "ep_loss": 0.0, "epoch": 9.73, "learning_rate": 0.0004181218592964824, "loss": 0.3587, "mlm_loss": 0.3587, "step": 12610 }, { "ep_loss": 0.0, "epoch": 9.73, "learning_rate": 0.0004178077889447236, "loss": 0.3668, "mlm_loss": 0.3668, "step": 12620 }, { "ep_loss": 0.0, "epoch": 9.74, "learning_rate": 0.0004174937185929648, "loss": 0.3616, "mlm_loss": 0.3616, "step": 12630 }, { "ep_loss": 0.0, "epoch": 9.75, "learning_rate": 0.00041717964824120605, "loss": 0.3626, "mlm_loss": 0.3626, "step": 12640 }, { "ep_loss": 0.0, "epoch": 9.76, "learning_rate": 0.0004168655778894472, "loss": 0.3514, "mlm_loss": 0.3514, "step": 12650 }, { "ep_loss": 0.0, "epoch": 9.76, "learning_rate": 0.00041655150753768845, "loss": 0.3605, "mlm_loss": 0.3605, "step": 12660 }, { "ep_loss": 0.0, "epoch": 9.77, "learning_rate": 0.0004162374371859297, "loss": 0.3591, "mlm_loss": 0.3591, "step": 12670 }, { "ep_loss": 0.0, "epoch": 9.78, "learning_rate": 0.00041592336683417085, "loss": 0.3642, "mlm_loss": 0.3642, "step": 12680 }, { "ep_loss": 0.0, "epoch": 9.79, "learning_rate": 0.0004156092964824121, "loss": 0.3683, "mlm_loss": 0.3683, "step": 12690 }, { "ep_loss": 0.0, "epoch": 9.8, "learning_rate": 0.0004152952261306533, "loss": 0.3614, "mlm_loss": 0.3614, "step": 12700 }, { "epoch": 9.8, "eval_ep_loss": -2.610844135284424, "eval_loss": 0.33695146441459656, "eval_mlm_loss": 0.33695146441459656, "eval_runtime": 59.2376, "eval_samples_per_second": 1179.471, "eval_steps_per_second": 0.591, "step": 12700 }, { "ep_loss": 0.0, "epoch": 9.8, "learning_rate": 0.0004149811557788945, "loss": 0.3551, "mlm_loss": 0.3551, "step": 12710 }, { "ep_loss": 0.0, "epoch": 9.81, "learning_rate": 0.00041466708542713565, "loss": 0.357, "mlm_loss": 0.357, "step": 12720 }, { "ep_loss": 0.0, "epoch": 9.82, "learning_rate": 0.0004143530150753769, "loss": 0.3631, "mlm_loss": 0.3631, "step": 12730 }, { "ep_loss": 0.0, "epoch": 9.83, "learning_rate": 0.00041403894472361806, "loss": 0.3603, "mlm_loss": 0.3603, "step": 12740 }, { "ep_loss": 0.0, "epoch": 9.83, "learning_rate": 0.0004137248743718593, "loss": 0.3624, "mlm_loss": 0.3624, "step": 12750 }, { "ep_loss": 0.0, "epoch": 9.84, "learning_rate": 0.0004134108040201005, "loss": 0.3619, "mlm_loss": 0.3619, "step": 12760 }, { "ep_loss": 0.0, "epoch": 9.85, "learning_rate": 0.00041309673366834174, "loss": 0.3623, "mlm_loss": 0.3623, "step": 12770 }, { "ep_loss": 0.0, "epoch": 9.86, "learning_rate": 0.0004127826633165829, "loss": 0.3615, "mlm_loss": 0.3615, "step": 12780 }, { "ep_loss": 0.0, "epoch": 9.87, "learning_rate": 0.00041246859296482414, "loss": 0.3569, "mlm_loss": 0.3569, "step": 12790 }, { "ep_loss": 0.0, "epoch": 9.87, "learning_rate": 0.00041215452261306537, "loss": 0.3561, "mlm_loss": 0.3561, "step": 12800 }, { "epoch": 9.87, "eval_ep_loss": -2.4833860397338867, "eval_loss": 0.3353259861469269, "eval_mlm_loss": 0.3353259861469269, "eval_runtime": 59.1455, "eval_samples_per_second": 1181.307, "eval_steps_per_second": 0.592, "step": 12800 }, { "ep_loss": 0.0, "epoch": 9.88, "learning_rate": 0.00041184045226130654, "loss": 0.362, "mlm_loss": 0.362, "step": 12810 }, { "ep_loss": 0.0, "epoch": 9.89, "learning_rate": 0.0004115263819095477, "loss": 0.3589, "mlm_loss": 0.3589, "step": 12820 }, { "ep_loss": 0.0, "epoch": 9.9, "learning_rate": 0.00041121231155778895, "loss": 0.3614, "mlm_loss": 0.3614, "step": 12830 }, { "ep_loss": 0.0, "epoch": 9.9, "learning_rate": 0.0004108982412060301, "loss": 0.3534, "mlm_loss": 0.3534, "step": 12840 }, { "ep_loss": 0.0, "epoch": 9.91, "learning_rate": 0.00041058417085427135, "loss": 0.36, "mlm_loss": 0.36, "step": 12850 }, { "ep_loss": 0.0, "epoch": 9.92, "learning_rate": 0.0004102701005025126, "loss": 0.3579, "mlm_loss": 0.3579, "step": 12860 }, { "ep_loss": 0.0, "epoch": 9.93, "learning_rate": 0.0004099560301507538, "loss": 0.358, "mlm_loss": 0.358, "step": 12870 }, { "ep_loss": 0.0, "epoch": 9.93, "learning_rate": 0.000409641959798995, "loss": 0.3568, "mlm_loss": 0.3568, "step": 12880 }, { "ep_loss": 0.0, "epoch": 9.94, "learning_rate": 0.0004093278894472362, "loss": 0.3596, "mlm_loss": 0.3596, "step": 12890 }, { "ep_loss": 0.0, "epoch": 9.95, "learning_rate": 0.00040901381909547743, "loss": 0.356, "mlm_loss": 0.356, "step": 12900 }, { "epoch": 9.95, "eval_ep_loss": -2.3914308547973633, "eval_loss": 0.33489990234375, "eval_mlm_loss": 0.33489990234375, "eval_runtime": 60.1749, "eval_samples_per_second": 1161.099, "eval_steps_per_second": 0.582, "step": 12900 }, { "ep_loss": 0.0, "epoch": 9.96, "learning_rate": 0.0004086997487437186, "loss": 0.3613, "mlm_loss": 0.3613, "step": 12910 }, { "ep_loss": 0.0, "epoch": 9.97, "learning_rate": 0.0004083856783919598, "loss": 0.3561, "mlm_loss": 0.3561, "step": 12920 }, { "ep_loss": 0.0, "epoch": 9.97, "learning_rate": 0.000408071608040201, "loss": 0.354, "mlm_loss": 0.354, "step": 12930 }, { "ep_loss": 0.0, "epoch": 9.98, "learning_rate": 0.0004077575376884422, "loss": 0.36, "mlm_loss": 0.36, "step": 12940 }, { "ep_loss": 0.0, "epoch": 9.99, "learning_rate": 0.0004074434673366834, "loss": 0.3583, "mlm_loss": 0.3583, "step": 12950 }, { "ep_loss": 0.0, "epoch": 10.0, "learning_rate": 0.00040712939698492464, "loss": 0.3608, "mlm_loss": 0.3608, "step": 12960 }, { "ep_loss": 0.0, "epoch": 10.0, "learning_rate": 0.0004068153266331658, "loss": 0.3596, "mlm_loss": 0.3596, "step": 12970 }, { "ep_loss": 0.0, "epoch": 10.01, "learning_rate": 0.00040650125628140704, "loss": 0.3559, "mlm_loss": 0.3559, "step": 12980 }, { "ep_loss": 0.0, "epoch": 10.02, "learning_rate": 0.00040618718592964827, "loss": 0.3582, "mlm_loss": 0.3582, "step": 12990 }, { "ep_loss": 0.0, "epoch": 10.03, "learning_rate": 0.0004058731155778895, "loss": 0.3516, "mlm_loss": 0.3516, "step": 13000 }, { "epoch": 10.03, "eval_ep_loss": -2.4109866619110107, "eval_loss": 0.3338833749294281, "eval_mlm_loss": 0.3338833749294281, "eval_runtime": 59.7838, "eval_samples_per_second": 1168.695, "eval_steps_per_second": 0.585, "step": 13000 }, { "ep_loss": 0.0, "epoch": 10.03, "learning_rate": 0.00040555904522613067, "loss": 0.3513, "mlm_loss": 0.3513, "step": 13010 }, { "ep_loss": 0.0, "epoch": 10.04, "learning_rate": 0.00040524497487437185, "loss": 0.3533, "mlm_loss": 0.3533, "step": 13020 }, { "ep_loss": 0.0, "epoch": 10.05, "learning_rate": 0.0004049309045226131, "loss": 0.3529, "mlm_loss": 0.3529, "step": 13030 }, { "ep_loss": 0.0, "epoch": 10.06, "learning_rate": 0.00040461683417085425, "loss": 0.3555, "mlm_loss": 0.3555, "step": 13040 }, { "ep_loss": 0.0, "epoch": 10.07, "learning_rate": 0.0004043027638190955, "loss": 0.3552, "mlm_loss": 0.3552, "step": 13050 }, { "ep_loss": 0.0, "epoch": 10.07, "learning_rate": 0.0004039886934673367, "loss": 0.3498, "mlm_loss": 0.3498, "step": 13060 }, { "ep_loss": 0.0, "epoch": 10.08, "learning_rate": 0.0004036746231155779, "loss": 0.3553, "mlm_loss": 0.3553, "step": 13070 }, { "ep_loss": 0.0, "epoch": 10.09, "learning_rate": 0.0004033605527638191, "loss": 0.3486, "mlm_loss": 0.3486, "step": 13080 }, { "ep_loss": 0.0, "epoch": 10.1, "learning_rate": 0.00040304648241206033, "loss": 0.3455, "mlm_loss": 0.3455, "step": 13090 }, { "ep_loss": 0.0, "epoch": 10.1, "learning_rate": 0.0004027324120603015, "loss": 0.358, "mlm_loss": 0.358, "step": 13100 }, { "epoch": 10.1, "eval_ep_loss": -2.4457526206970215, "eval_loss": 0.33316919207572937, "eval_mlm_loss": 0.33316919207572937, "eval_runtime": 61.6405, "eval_samples_per_second": 1133.492, "eval_steps_per_second": 0.568, "step": 13100 }, { "ep_loss": 0.0, "epoch": 10.11, "learning_rate": 0.00040241834170854274, "loss": 0.3521, "mlm_loss": 0.3521, "step": 13110 }, { "ep_loss": 0.0, "epoch": 10.12, "learning_rate": 0.0004021042713567839, "loss": 0.3618, "mlm_loss": 0.3618, "step": 13120 }, { "ep_loss": 0.0, "epoch": 10.13, "learning_rate": 0.00040179020100502514, "loss": 0.3608, "mlm_loss": 0.3608, "step": 13130 }, { "ep_loss": 0.0, "epoch": 10.13, "learning_rate": 0.0004014761306532663, "loss": 0.3554, "mlm_loss": 0.3554, "step": 13140 }, { "ep_loss": 0.0, "epoch": 10.14, "learning_rate": 0.00040116206030150754, "loss": 0.3554, "mlm_loss": 0.3554, "step": 13150 }, { "ep_loss": 0.0, "epoch": 10.15, "learning_rate": 0.00040084798994974877, "loss": 0.3482, "mlm_loss": 0.3482, "step": 13160 }, { "ep_loss": 0.0, "epoch": 10.16, "learning_rate": 0.00040053391959798994, "loss": 0.3577, "mlm_loss": 0.3577, "step": 13170 }, { "ep_loss": 0.0, "epoch": 10.17, "learning_rate": 0.00040021984924623117, "loss": 0.3448, "mlm_loss": 0.3448, "step": 13180 }, { "ep_loss": 0.0, "epoch": 10.17, "learning_rate": 0.0003999057788944724, "loss": 0.3488, "mlm_loss": 0.3488, "step": 13190 }, { "ep_loss": 0.0, "epoch": 10.18, "learning_rate": 0.00039959170854271357, "loss": 0.3552, "mlm_loss": 0.3552, "step": 13200 }, { "epoch": 10.18, "eval_ep_loss": -2.5752882957458496, "eval_loss": 0.33142197132110596, "eval_mlm_loss": 0.33142197132110596, "eval_runtime": 58.9677, "eval_samples_per_second": 1184.87, "eval_steps_per_second": 0.594, "step": 13200 }, { "ep_loss": 0.0, "epoch": 10.19, "learning_rate": 0.0003992776381909548, "loss": 0.3443, "mlm_loss": 0.3443, "step": 13210 }, { "ep_loss": 0.0, "epoch": 10.2, "learning_rate": 0.000398963567839196, "loss": 0.3487, "mlm_loss": 0.3487, "step": 13220 }, { "ep_loss": 0.0, "epoch": 10.2, "learning_rate": 0.00039864949748743715, "loss": 0.3517, "mlm_loss": 0.3517, "step": 13230 }, { "ep_loss": 0.0, "epoch": 10.21, "learning_rate": 0.0003983354271356784, "loss": 0.3544, "mlm_loss": 0.3544, "step": 13240 }, { "ep_loss": 0.0, "epoch": 10.22, "learning_rate": 0.0003980213567839196, "loss": 0.355, "mlm_loss": 0.355, "step": 13250 }, { "ep_loss": 0.0, "epoch": 10.23, "learning_rate": 0.00039770728643216083, "loss": 0.3467, "mlm_loss": 0.3467, "step": 13260 }, { "ep_loss": 0.0, "epoch": 10.24, "learning_rate": 0.000397393216080402, "loss": 0.355, "mlm_loss": 0.355, "step": 13270 }, { "ep_loss": 0.0, "epoch": 10.24, "learning_rate": 0.00039707914572864323, "loss": 0.3498, "mlm_loss": 0.3498, "step": 13280 }, { "ep_loss": 0.0, "epoch": 10.25, "learning_rate": 0.00039676507537688446, "loss": 0.3507, "mlm_loss": 0.3507, "step": 13290 }, { "ep_loss": 0.0, "epoch": 10.26, "learning_rate": 0.00039645100502512564, "loss": 0.3515, "mlm_loss": 0.3515, "step": 13300 }, { "epoch": 10.26, "eval_ep_loss": -2.6309657096862793, "eval_loss": 0.3306918740272522, "eval_mlm_loss": 0.3306918740272522, "eval_runtime": 60.495, "eval_samples_per_second": 1154.954, "eval_steps_per_second": 0.579, "step": 13300 }, { "ep_loss": 0.0, "epoch": 10.27, "learning_rate": 0.00039613693467336686, "loss": 0.3476, "mlm_loss": 0.3476, "step": 13310 }, { "ep_loss": 0.0, "epoch": 10.27, "learning_rate": 0.00039582286432160804, "loss": 0.3522, "mlm_loss": 0.3522, "step": 13320 }, { "ep_loss": 0.0, "epoch": 10.28, "learning_rate": 0.0003955087939698492, "loss": 0.3492, "mlm_loss": 0.3492, "step": 13330 }, { "ep_loss": 0.0, "epoch": 10.29, "learning_rate": 0.00039519472361809044, "loss": 0.3475, "mlm_loss": 0.3475, "step": 13340 }, { "ep_loss": 0.0, "epoch": 10.3, "learning_rate": 0.00039488065326633167, "loss": 0.3513, "mlm_loss": 0.3513, "step": 13350 }, { "ep_loss": 0.0, "epoch": 10.3, "learning_rate": 0.00039456658291457284, "loss": 0.3504, "mlm_loss": 0.3504, "step": 13360 }, { "ep_loss": 0.0, "epoch": 10.31, "learning_rate": 0.00039425251256281407, "loss": 0.3467, "mlm_loss": 0.3467, "step": 13370 }, { "ep_loss": 0.0, "epoch": 10.32, "learning_rate": 0.0003939384422110553, "loss": 0.3485, "mlm_loss": 0.3485, "step": 13380 }, { "ep_loss": 0.0, "epoch": 10.33, "learning_rate": 0.0003936243718592965, "loss": 0.354, "mlm_loss": 0.354, "step": 13390 }, { "ep_loss": 0.0, "epoch": 10.34, "learning_rate": 0.0003933103015075377, "loss": 0.3577, "mlm_loss": 0.3577, "step": 13400 }, { "epoch": 10.34, "eval_ep_loss": -2.679708957672119, "eval_loss": 0.3291432559490204, "eval_mlm_loss": 0.3291432559490204, "eval_runtime": 59.1321, "eval_samples_per_second": 1181.575, "eval_steps_per_second": 0.592, "step": 13400 }, { "ep_loss": 0.0, "epoch": 10.34, "learning_rate": 0.00039299623115577893, "loss": 0.3521, "mlm_loss": 0.3521, "step": 13410 }, { "ep_loss": 0.0, "epoch": 10.35, "learning_rate": 0.0003926821608040201, "loss": 0.3495, "mlm_loss": 0.3495, "step": 13420 }, { "ep_loss": 0.0, "epoch": 10.36, "learning_rate": 0.0003923680904522613, "loss": 0.3464, "mlm_loss": 0.3464, "step": 13430 }, { "ep_loss": 0.0, "epoch": 10.37, "learning_rate": 0.0003920540201005025, "loss": 0.3493, "mlm_loss": 0.3493, "step": 13440 }, { "ep_loss": 0.0, "epoch": 10.37, "learning_rate": 0.00039173994974874373, "loss": 0.351, "mlm_loss": 0.351, "step": 13450 }, { "ep_loss": 0.0, "epoch": 10.38, "learning_rate": 0.0003914258793969849, "loss": 0.3526, "mlm_loss": 0.3526, "step": 13460 }, { "ep_loss": 0.0, "epoch": 10.39, "learning_rate": 0.00039111180904522613, "loss": 0.3517, "mlm_loss": 0.3517, "step": 13470 }, { "ep_loss": 0.0, "epoch": 10.4, "learning_rate": 0.00039079773869346736, "loss": 0.3541, "mlm_loss": 0.3541, "step": 13480 }, { "ep_loss": 0.0, "epoch": 10.4, "learning_rate": 0.0003904836683417086, "loss": 0.35, "mlm_loss": 0.35, "step": 13490 }, { "ep_loss": 0.0, "epoch": 10.41, "learning_rate": 0.00039016959798994976, "loss": 0.3543, "mlm_loss": 0.3543, "step": 13500 }, { "epoch": 10.41, "eval_ep_loss": -2.5294482707977295, "eval_loss": 0.3268160820007324, "eval_mlm_loss": 0.3268160820007324, "eval_runtime": 59.814, "eval_samples_per_second": 1168.105, "eval_steps_per_second": 0.585, "step": 13500 }, { "ep_loss": 0.0, "epoch": 10.42, "learning_rate": 0.000389855527638191, "loss": 0.3521, "mlm_loss": 0.3521, "step": 13510 }, { "ep_loss": 0.0, "epoch": 10.43, "learning_rate": 0.00038954145728643217, "loss": 0.3447, "mlm_loss": 0.3447, "step": 13520 }, { "ep_loss": 0.0, "epoch": 10.44, "learning_rate": 0.00038922738693467334, "loss": 0.3547, "mlm_loss": 0.3547, "step": 13530 }, { "ep_loss": 0.0, "epoch": 10.44, "learning_rate": 0.00038891331658291457, "loss": 0.3457, "mlm_loss": 0.3457, "step": 13540 }, { "ep_loss": 0.0, "epoch": 10.45, "learning_rate": 0.0003885992462311558, "loss": 0.3451, "mlm_loss": 0.3451, "step": 13550 }, { "ep_loss": 0.0, "epoch": 10.46, "learning_rate": 0.00038828517587939697, "loss": 0.3447, "mlm_loss": 0.3447, "step": 13560 }, { "ep_loss": 0.0, "epoch": 10.47, "learning_rate": 0.0003879711055276382, "loss": 0.3483, "mlm_loss": 0.3483, "step": 13570 }, { "ep_loss": 0.0, "epoch": 10.47, "learning_rate": 0.0003876570351758794, "loss": 0.3505, "mlm_loss": 0.3505, "step": 13580 }, { "ep_loss": 0.0, "epoch": 10.48, "learning_rate": 0.0003873429648241206, "loss": 0.3502, "mlm_loss": 0.3502, "step": 13590 }, { "ep_loss": 0.0, "epoch": 10.49, "learning_rate": 0.00038702889447236183, "loss": 0.3503, "mlm_loss": 0.3503, "step": 13600 }, { "epoch": 10.49, "eval_ep_loss": -2.7712888717651367, "eval_loss": 0.32662275433540344, "eval_mlm_loss": 0.32662275433540344, "eval_runtime": 64.1344, "eval_samples_per_second": 1089.415, "eval_steps_per_second": 0.546, "step": 13600 }, { "ep_loss": 0.0, "epoch": 10.5, "learning_rate": 0.00038671482412060306, "loss": 0.3536, "mlm_loss": 0.3536, "step": 13610 }, { "ep_loss": 0.0, "epoch": 10.51, "learning_rate": 0.00038640075376884423, "loss": 0.3567, "mlm_loss": 0.3567, "step": 13620 }, { "ep_loss": 0.0, "epoch": 10.51, "learning_rate": 0.0003860866834170854, "loss": 0.3498, "mlm_loss": 0.3498, "step": 13630 }, { "ep_loss": 0.0, "epoch": 10.52, "learning_rate": 0.00038577261306532663, "loss": 0.3519, "mlm_loss": 0.3519, "step": 13640 }, { "ep_loss": 0.0, "epoch": 10.53, "learning_rate": 0.00038545854271356786, "loss": 0.3476, "mlm_loss": 0.3476, "step": 13650 }, { "ep_loss": 0.0, "epoch": 10.54, "learning_rate": 0.00038514447236180903, "loss": 0.346, "mlm_loss": 0.346, "step": 13660 }, { "ep_loss": 0.0, "epoch": 10.54, "learning_rate": 0.00038483040201005026, "loss": 0.3508, "mlm_loss": 0.3508, "step": 13670 }, { "ep_loss": 0.0, "epoch": 10.55, "learning_rate": 0.0003845163316582915, "loss": 0.3532, "mlm_loss": 0.3532, "step": 13680 }, { "ep_loss": 0.0, "epoch": 10.56, "learning_rate": 0.00038420226130653266, "loss": 0.3449, "mlm_loss": 0.3449, "step": 13690 }, { "ep_loss": 0.0, "epoch": 10.57, "learning_rate": 0.0003838881909547739, "loss": 0.3521, "mlm_loss": 0.3521, "step": 13700 }, { "epoch": 10.57, "eval_ep_loss": -2.5600945949554443, "eval_loss": 0.3279613256454468, "eval_mlm_loss": 0.3279613256454468, "eval_runtime": 60.8337, "eval_samples_per_second": 1148.524, "eval_steps_per_second": 0.575, "step": 13700 }, { "ep_loss": 0.0, "epoch": 10.57, "learning_rate": 0.0003835741206030151, "loss": 0.3462, "mlm_loss": 0.3462, "step": 13710 }, { "ep_loss": 0.0, "epoch": 10.58, "learning_rate": 0.00038326005025125624, "loss": 0.347, "mlm_loss": 0.347, "step": 13720 }, { "ep_loss": 0.0, "epoch": 10.59, "learning_rate": 0.00038294597989949747, "loss": 0.3462, "mlm_loss": 0.3462, "step": 13730 }, { "ep_loss": 0.0, "epoch": 10.6, "learning_rate": 0.0003826319095477387, "loss": 0.3408, "mlm_loss": 0.3408, "step": 13740 }, { "ep_loss": 0.0, "epoch": 10.61, "learning_rate": 0.0003823178391959799, "loss": 0.3471, "mlm_loss": 0.3471, "step": 13750 }, { "ep_loss": 0.0, "epoch": 10.61, "learning_rate": 0.0003820037688442211, "loss": 0.3467, "mlm_loss": 0.3467, "step": 13760 }, { "ep_loss": 0.0, "epoch": 10.62, "learning_rate": 0.0003816896984924623, "loss": 0.3482, "mlm_loss": 0.3482, "step": 13770 }, { "ep_loss": 0.0, "epoch": 10.63, "learning_rate": 0.00038137562814070355, "loss": 0.3454, "mlm_loss": 0.3454, "step": 13780 }, { "ep_loss": 0.0, "epoch": 10.64, "learning_rate": 0.00038106155778894473, "loss": 0.346, "mlm_loss": 0.346, "step": 13790 }, { "ep_loss": 0.0, "epoch": 10.64, "learning_rate": 0.00038074748743718596, "loss": 0.3461, "mlm_loss": 0.3461, "step": 13800 }, { "epoch": 10.64, "eval_ep_loss": -2.976186990737915, "eval_loss": 0.32481658458709717, "eval_mlm_loss": 0.32481658458709717, "eval_runtime": 61.249, "eval_samples_per_second": 1140.737, "eval_steps_per_second": 0.571, "step": 13800 }, { "ep_loss": 0.0, "epoch": 10.65, "learning_rate": 0.0003804334170854272, "loss": 0.3478, "mlm_loss": 0.3478, "step": 13810 }, { "ep_loss": 0.0, "epoch": 10.66, "learning_rate": 0.0003801193467336683, "loss": 0.3474, "mlm_loss": 0.3474, "step": 13820 }, { "ep_loss": 0.0, "epoch": 10.67, "learning_rate": 0.00037980527638190953, "loss": 0.3453, "mlm_loss": 0.3453, "step": 13830 }, { "ep_loss": 0.0, "epoch": 10.67, "learning_rate": 0.00037949120603015076, "loss": 0.3503, "mlm_loss": 0.3503, "step": 13840 }, { "ep_loss": 0.0, "epoch": 10.68, "learning_rate": 0.00037917713567839193, "loss": 0.3491, "mlm_loss": 0.3491, "step": 13850 }, { "ep_loss": 0.0, "epoch": 10.69, "learning_rate": 0.00037886306532663316, "loss": 0.3401, "mlm_loss": 0.3401, "step": 13860 }, { "ep_loss": 0.0, "epoch": 10.7, "learning_rate": 0.0003785489949748744, "loss": 0.3489, "mlm_loss": 0.3489, "step": 13870 }, { "ep_loss": 0.0, "epoch": 10.71, "learning_rate": 0.0003782349246231156, "loss": 0.3413, "mlm_loss": 0.3413, "step": 13880 }, { "ep_loss": 0.0, "epoch": 10.71, "learning_rate": 0.0003779208542713568, "loss": 0.3412, "mlm_loss": 0.3412, "step": 13890 }, { "ep_loss": 0.0, "epoch": 10.72, "learning_rate": 0.000377606783919598, "loss": 0.3491, "mlm_loss": 0.3491, "step": 13900 }, { "epoch": 10.72, "eval_ep_loss": -2.670948028564453, "eval_loss": 0.3239390552043915, "eval_mlm_loss": 0.3239390552043915, "eval_runtime": 60.785, "eval_samples_per_second": 1149.445, "eval_steps_per_second": 0.576, "step": 13900 }, { "ep_loss": 0.0, "epoch": 10.73, "learning_rate": 0.00037729271356783925, "loss": 0.3422, "mlm_loss": 0.3422, "step": 13910 }, { "ep_loss": 0.0, "epoch": 10.74, "learning_rate": 0.00037697864321608037, "loss": 0.3479, "mlm_loss": 0.3479, "step": 13920 }, { "ep_loss": 0.0, "epoch": 10.74, "learning_rate": 0.0003766645728643216, "loss": 0.3471, "mlm_loss": 0.3471, "step": 13930 }, { "ep_loss": 0.0, "epoch": 10.75, "learning_rate": 0.0003763505025125628, "loss": 0.3511, "mlm_loss": 0.3511, "step": 13940 }, { "ep_loss": 0.0, "epoch": 10.76, "learning_rate": 0.000376036432160804, "loss": 0.3424, "mlm_loss": 0.3424, "step": 13950 }, { "ep_loss": 0.0, "epoch": 10.77, "learning_rate": 0.0003757223618090452, "loss": 0.3419, "mlm_loss": 0.3419, "step": 13960 }, { "ep_loss": 0.0, "epoch": 10.78, "learning_rate": 0.00037540829145728645, "loss": 0.3452, "mlm_loss": 0.3452, "step": 13970 }, { "ep_loss": 0.0, "epoch": 10.78, "learning_rate": 0.00037509422110552763, "loss": 0.3401, "mlm_loss": 0.3401, "step": 13980 }, { "ep_loss": 0.0, "epoch": 10.79, "learning_rate": 0.00037478015075376886, "loss": 0.3441, "mlm_loss": 0.3441, "step": 13990 }, { "ep_loss": 0.0, "epoch": 10.8, "learning_rate": 0.0003744660804020101, "loss": 0.3429, "mlm_loss": 0.3429, "step": 14000 }, { "epoch": 10.8, "eval_ep_loss": -2.6834776401519775, "eval_loss": 0.32337966561317444, "eval_mlm_loss": 0.32337966561317444, "eval_runtime": 59.4747, "eval_samples_per_second": 1174.769, "eval_steps_per_second": 0.588, "step": 14000 }, { "ep_loss": 0.0, "epoch": 10.81, "learning_rate": 0.0003741520100502513, "loss": 0.3444, "mlm_loss": 0.3444, "step": 14010 }, { "ep_loss": 0.0, "epoch": 10.81, "learning_rate": 0.00037383793969849243, "loss": 0.3438, "mlm_loss": 0.3438, "step": 14020 }, { "ep_loss": 0.0, "epoch": 10.82, "learning_rate": 0.00037352386934673366, "loss": 0.3463, "mlm_loss": 0.3463, "step": 14030 }, { "ep_loss": 0.0, "epoch": 10.83, "learning_rate": 0.0003732097989949749, "loss": 0.3467, "mlm_loss": 0.3467, "step": 14040 }, { "ep_loss": 0.0, "epoch": 10.84, "learning_rate": 0.00037289572864321606, "loss": 0.3398, "mlm_loss": 0.3398, "step": 14050 }, { "ep_loss": 0.0, "epoch": 10.84, "learning_rate": 0.0003725816582914573, "loss": 0.3447, "mlm_loss": 0.3447, "step": 14060 }, { "ep_loss": 0.0, "epoch": 10.85, "learning_rate": 0.0003722675879396985, "loss": 0.3398, "mlm_loss": 0.3398, "step": 14070 }, { "ep_loss": 0.0, "epoch": 10.86, "learning_rate": 0.0003719535175879397, "loss": 0.3466, "mlm_loss": 0.3466, "step": 14080 }, { "ep_loss": 0.0, "epoch": 10.87, "learning_rate": 0.0003716394472361809, "loss": 0.3428, "mlm_loss": 0.3428, "step": 14090 }, { "ep_loss": 0.0, "epoch": 10.88, "learning_rate": 0.00037132537688442215, "loss": 0.3445, "mlm_loss": 0.3445, "step": 14100 }, { "epoch": 10.88, "eval_ep_loss": -2.466320037841797, "eval_loss": 0.3204542398452759, "eval_mlm_loss": 0.3204542398452759, "eval_runtime": 59.2997, "eval_samples_per_second": 1178.236, "eval_steps_per_second": 0.59, "step": 14100 }, { "ep_loss": 0.0, "epoch": 10.88, "learning_rate": 0.0003710113065326634, "loss": 0.3426, "mlm_loss": 0.3426, "step": 14110 }, { "ep_loss": 0.0, "epoch": 10.89, "learning_rate": 0.0003706972361809045, "loss": 0.3403, "mlm_loss": 0.3403, "step": 14120 }, { "ep_loss": 0.0, "epoch": 10.9, "learning_rate": 0.0003703831658291457, "loss": 0.3383, "mlm_loss": 0.3383, "step": 14130 }, { "ep_loss": 0.0, "epoch": 10.91, "learning_rate": 0.00037006909547738695, "loss": 0.3497, "mlm_loss": 0.3497, "step": 14140 }, { "ep_loss": 0.0, "epoch": 10.91, "learning_rate": 0.0003697550251256281, "loss": 0.349, "mlm_loss": 0.349, "step": 14150 }, { "ep_loss": 0.0, "epoch": 10.92, "learning_rate": 0.00036944095477386935, "loss": 0.3449, "mlm_loss": 0.3449, "step": 14160 }, { "ep_loss": 0.0, "epoch": 10.93, "learning_rate": 0.0003691268844221106, "loss": 0.3487, "mlm_loss": 0.3487, "step": 14170 }, { "ep_loss": 0.0, "epoch": 10.94, "learning_rate": 0.00036881281407035176, "loss": 0.344, "mlm_loss": 0.344, "step": 14180 }, { "ep_loss": 0.0, "epoch": 10.94, "learning_rate": 0.000368498743718593, "loss": 0.3358, "mlm_loss": 0.3358, "step": 14190 }, { "ep_loss": 0.0, "epoch": 10.95, "learning_rate": 0.0003681846733668342, "loss": 0.3406, "mlm_loss": 0.3406, "step": 14200 }, { "epoch": 10.95, "eval_ep_loss": -2.3429927825927734, "eval_loss": 0.32197174429893494, "eval_mlm_loss": 0.32197174429893494, "eval_runtime": 63.3682, "eval_samples_per_second": 1102.588, "eval_steps_per_second": 0.552, "step": 14200 }, { "ep_loss": 0.0, "epoch": 10.96, "learning_rate": 0.0003678706030150754, "loss": 0.3403, "mlm_loss": 0.3403, "step": 14210 }, { "ep_loss": 0.0, "epoch": 10.97, "learning_rate": 0.00036755653266331656, "loss": 0.3388, "mlm_loss": 0.3388, "step": 14220 }, { "ep_loss": 0.0, "epoch": 10.98, "learning_rate": 0.0003672424623115578, "loss": 0.3426, "mlm_loss": 0.3426, "step": 14230 }, { "ep_loss": 0.0, "epoch": 10.98, "learning_rate": 0.000366928391959799, "loss": 0.3417, "mlm_loss": 0.3417, "step": 14240 }, { "ep_loss": 0.0, "epoch": 10.99, "learning_rate": 0.0003666143216080402, "loss": 0.3383, "mlm_loss": 0.3383, "step": 14250 }, { "ep_loss": 0.0, "epoch": 11.0, "learning_rate": 0.0003663002512562814, "loss": 0.3468, "mlm_loss": 0.3468, "step": 14260 }, { "ep_loss": 0.0, "epoch": 11.01, "learning_rate": 0.00036598618090452265, "loss": 0.3337, "mlm_loss": 0.3337, "step": 14270 }, { "ep_loss": 0.0, "epoch": 11.01, "learning_rate": 0.0003656721105527638, "loss": 0.3397, "mlm_loss": 0.3397, "step": 14280 }, { "ep_loss": 0.0, "epoch": 11.02, "learning_rate": 0.00036535804020100505, "loss": 0.3403, "mlm_loss": 0.3403, "step": 14290 }, { "ep_loss": 0.0, "epoch": 11.03, "learning_rate": 0.0003650439698492463, "loss": 0.346, "mlm_loss": 0.346, "step": 14300 }, { "epoch": 11.03, "eval_ep_loss": -2.5903124809265137, "eval_loss": 0.31952106952667236, "eval_mlm_loss": 0.31952106952667236, "eval_runtime": 61.3925, "eval_samples_per_second": 1138.07, "eval_steps_per_second": 0.57, "step": 14300 }, { "ep_loss": 0.0, "epoch": 11.04, "learning_rate": 0.00036472989949748745, "loss": 0.3393, "mlm_loss": 0.3393, "step": 14310 }, { "ep_loss": 0.0, "epoch": 11.05, "learning_rate": 0.0003644158291457286, "loss": 0.34, "mlm_loss": 0.34, "step": 14320 }, { "ep_loss": 0.0, "epoch": 11.05, "learning_rate": 0.00036410175879396985, "loss": 0.3428, "mlm_loss": 0.3428, "step": 14330 }, { "ep_loss": 0.0, "epoch": 11.06, "learning_rate": 0.000363787688442211, "loss": 0.3406, "mlm_loss": 0.3406, "step": 14340 }, { "ep_loss": 0.0, "epoch": 11.07, "learning_rate": 0.00036347361809045225, "loss": 0.3384, "mlm_loss": 0.3384, "step": 14350 }, { "ep_loss": 0.0, "epoch": 11.08, "learning_rate": 0.0003631595477386935, "loss": 0.3385, "mlm_loss": 0.3385, "step": 14360 }, { "ep_loss": 0.0, "epoch": 11.08, "learning_rate": 0.0003628454773869347, "loss": 0.3342, "mlm_loss": 0.3342, "step": 14370 }, { "ep_loss": 0.0, "epoch": 11.09, "learning_rate": 0.0003625314070351759, "loss": 0.3353, "mlm_loss": 0.3353, "step": 14380 }, { "ep_loss": 0.0, "epoch": 11.1, "learning_rate": 0.0003622173366834171, "loss": 0.3426, "mlm_loss": 0.3426, "step": 14390 }, { "ep_loss": 0.0, "epoch": 11.11, "learning_rate": 0.00036190326633165834, "loss": 0.335, "mlm_loss": 0.335, "step": 14400 }, { "epoch": 11.11, "eval_ep_loss": -2.3407018184661865, "eval_loss": 0.31839361786842346, "eval_mlm_loss": 0.31839361786842346, "eval_runtime": 59.9564, "eval_samples_per_second": 1165.329, "eval_steps_per_second": 0.584, "step": 14400 }, { "ep_loss": 0.0, "epoch": 11.11, "learning_rate": 0.0003615891959798995, "loss": 0.339, "mlm_loss": 0.339, "step": 14410 }, { "ep_loss": 0.0, "epoch": 11.12, "learning_rate": 0.0003612751256281407, "loss": 0.3419, "mlm_loss": 0.3419, "step": 14420 }, { "ep_loss": 0.0, "epoch": 11.13, "learning_rate": 0.0003609610552763819, "loss": 0.3368, "mlm_loss": 0.3368, "step": 14430 }, { "ep_loss": 0.0, "epoch": 11.14, "learning_rate": 0.0003606469849246231, "loss": 0.338, "mlm_loss": 0.338, "step": 14440 }, { "ep_loss": 0.0, "epoch": 11.15, "learning_rate": 0.0003603329145728643, "loss": 0.3413, "mlm_loss": 0.3413, "step": 14450 }, { "ep_loss": 0.0, "epoch": 11.15, "learning_rate": 0.00036001884422110555, "loss": 0.3361, "mlm_loss": 0.3361, "step": 14460 }, { "ep_loss": 0.0, "epoch": 11.16, "learning_rate": 0.0003597047738693467, "loss": 0.3358, "mlm_loss": 0.3358, "step": 14470 }, { "ep_loss": 0.0, "epoch": 11.17, "learning_rate": 0.00035939070351758795, "loss": 0.3352, "mlm_loss": 0.3352, "step": 14480 }, { "ep_loss": 0.0, "epoch": 11.18, "learning_rate": 0.0003590766331658292, "loss": 0.3401, "mlm_loss": 0.3401, "step": 14490 }, { "ep_loss": 0.0, "epoch": 11.18, "learning_rate": 0.0003587625628140704, "loss": 0.3385, "mlm_loss": 0.3385, "step": 14500 }, { "epoch": 11.18, "eval_ep_loss": -2.600480794906616, "eval_loss": 0.3174394965171814, "eval_mlm_loss": 0.3174394965171814, "eval_runtime": 59.992, "eval_samples_per_second": 1164.639, "eval_steps_per_second": 0.583, "step": 14500 }, { "ep_loss": 0.0, "epoch": 11.19, "learning_rate": 0.0003584484924623116, "loss": 0.3374, "mlm_loss": 0.3374, "step": 14510 }, { "ep_loss": 0.0, "epoch": 11.2, "learning_rate": 0.00035813442211055275, "loss": 0.3406, "mlm_loss": 0.3406, "step": 14520 }, { "ep_loss": 0.0, "epoch": 11.21, "learning_rate": 0.000357820351758794, "loss": 0.3361, "mlm_loss": 0.3361, "step": 14530 }, { "ep_loss": 0.0, "epoch": 11.21, "learning_rate": 0.00035750628140703515, "loss": 0.339, "mlm_loss": 0.339, "step": 14540 }, { "ep_loss": 0.0, "epoch": 11.22, "learning_rate": 0.0003571922110552764, "loss": 0.3367, "mlm_loss": 0.3367, "step": 14550 }, { "ep_loss": 0.0, "epoch": 11.23, "learning_rate": 0.0003568781407035176, "loss": 0.3401, "mlm_loss": 0.3401, "step": 14560 }, { "ep_loss": 0.0, "epoch": 11.24, "learning_rate": 0.0003565640703517588, "loss": 0.3381, "mlm_loss": 0.3381, "step": 14570 }, { "ep_loss": 0.0, "epoch": 11.25, "learning_rate": 0.00035625, "loss": 0.3348, "mlm_loss": 0.3348, "step": 14580 }, { "ep_loss": 0.0, "epoch": 11.25, "learning_rate": 0.00035593592964824124, "loss": 0.3347, "mlm_loss": 0.3347, "step": 14590 }, { "ep_loss": 0.0, "epoch": 11.26, "learning_rate": 0.0003556218592964824, "loss": 0.3368, "mlm_loss": 0.3368, "step": 14600 }, { "epoch": 11.26, "eval_ep_loss": -2.644643545150757, "eval_loss": 0.3158123791217804, "eval_mlm_loss": 0.3158123791217804, "eval_runtime": 63.9607, "eval_samples_per_second": 1092.373, "eval_steps_per_second": 0.547, "step": 14600 }, { "ep_loss": 0.0, "epoch": 11.27, "learning_rate": 0.0003553077889447236, "loss": 0.3406, "mlm_loss": 0.3406, "step": 14610 }, { "ep_loss": 0.0, "epoch": 11.28, "learning_rate": 0.0003549937185929648, "loss": 0.3404, "mlm_loss": 0.3404, "step": 14620 }, { "ep_loss": 0.0, "epoch": 11.28, "learning_rate": 0.00035467964824120604, "loss": 0.3373, "mlm_loss": 0.3373, "step": 14630 }, { "ep_loss": 0.0, "epoch": 11.29, "learning_rate": 0.0003543655778894472, "loss": 0.3403, "mlm_loss": 0.3403, "step": 14640 }, { "ep_loss": 0.0, "epoch": 11.3, "learning_rate": 0.00035405150753768845, "loss": 0.3351, "mlm_loss": 0.3351, "step": 14650 }, { "ep_loss": 0.0, "epoch": 11.31, "learning_rate": 0.0003537374371859297, "loss": 0.333, "mlm_loss": 0.333, "step": 14660 }, { "ep_loss": 0.0, "epoch": 11.32, "learning_rate": 0.00035342336683417085, "loss": 0.3399, "mlm_loss": 0.3399, "step": 14670 }, { "ep_loss": 0.0, "epoch": 11.32, "learning_rate": 0.0003531092964824121, "loss": 0.3349, "mlm_loss": 0.3349, "step": 14680 }, { "ep_loss": 0.0, "epoch": 11.33, "learning_rate": 0.0003527952261306533, "loss": 0.339, "mlm_loss": 0.339, "step": 14690 }, { "ep_loss": 0.0, "epoch": 11.34, "learning_rate": 0.0003524811557788945, "loss": 0.336, "mlm_loss": 0.336, "step": 14700 }, { "epoch": 11.34, "eval_ep_loss": -2.6687519550323486, "eval_loss": 0.31672507524490356, "eval_mlm_loss": 0.31672507524490356, "eval_runtime": 61.4289, "eval_samples_per_second": 1137.396, "eval_steps_per_second": 0.57, "step": 14700 }, { "ep_loss": 0.0, "epoch": 11.35, "learning_rate": 0.00035216708542713565, "loss": 0.3387, "mlm_loss": 0.3387, "step": 14710 }, { "ep_loss": 0.0, "epoch": 11.35, "learning_rate": 0.0003518530150753769, "loss": 0.3345, "mlm_loss": 0.3345, "step": 14720 }, { "ep_loss": 0.0, "epoch": 11.36, "learning_rate": 0.00035153894472361805, "loss": 0.337, "mlm_loss": 0.337, "step": 14730 }, { "ep_loss": 0.0, "epoch": 11.37, "learning_rate": 0.0003512248743718593, "loss": 0.3307, "mlm_loss": 0.3307, "step": 14740 }, { "ep_loss": 0.0, "epoch": 11.38, "learning_rate": 0.0003509108040201005, "loss": 0.329, "mlm_loss": 0.329, "step": 14750 }, { "ep_loss": 0.0, "epoch": 11.38, "learning_rate": 0.00035059673366834174, "loss": 0.3345, "mlm_loss": 0.3345, "step": 14760 }, { "ep_loss": 0.0, "epoch": 11.39, "learning_rate": 0.0003502826633165829, "loss": 0.3323, "mlm_loss": 0.3323, "step": 14770 }, { "ep_loss": 0.0, "epoch": 11.4, "learning_rate": 0.00034996859296482414, "loss": 0.3311, "mlm_loss": 0.3311, "step": 14780 }, { "ep_loss": 0.0, "epoch": 11.41, "learning_rate": 0.00034965452261306537, "loss": 0.3304, "mlm_loss": 0.3304, "step": 14790 }, { "ep_loss": 0.0, "epoch": 11.42, "learning_rate": 0.00034934045226130654, "loss": 0.3298, "mlm_loss": 0.3298, "step": 14800 }, { "epoch": 11.42, "eval_ep_loss": -2.789717197418213, "eval_loss": 0.31507280468940735, "eval_mlm_loss": 0.31507280468940735, "eval_runtime": 60.3112, "eval_samples_per_second": 1158.476, "eval_steps_per_second": 0.58, "step": 14800 }, { "ep_loss": 0.0, "epoch": 11.42, "learning_rate": 0.0003490263819095477, "loss": 0.3325, "mlm_loss": 0.3325, "step": 14810 }, { "ep_loss": 0.0, "epoch": 11.43, "learning_rate": 0.00034871231155778894, "loss": 0.3363, "mlm_loss": 0.3363, "step": 14820 }, { "ep_loss": 0.0, "epoch": 11.44, "learning_rate": 0.0003483982412060301, "loss": 0.339, "mlm_loss": 0.339, "step": 14830 }, { "ep_loss": 0.0, "epoch": 11.45, "learning_rate": 0.00034808417085427135, "loss": 0.3372, "mlm_loss": 0.3372, "step": 14840 }, { "ep_loss": 0.0, "epoch": 11.45, "learning_rate": 0.0003477701005025126, "loss": 0.3316, "mlm_loss": 0.3316, "step": 14850 }, { "ep_loss": 0.0, "epoch": 11.46, "learning_rate": 0.00034748743718592966, "loss": 0.3393, "mlm_loss": 0.3393, "step": 14860 }, { "ep_loss": 0.0, "epoch": 11.47, "learning_rate": 0.00034717336683417083, "loss": 0.3348, "mlm_loss": 0.3348, "step": 14870 }, { "ep_loss": 0.0, "epoch": 11.48, "learning_rate": 0.00034685929648241206, "loss": 0.3322, "mlm_loss": 0.3322, "step": 14880 }, { "ep_loss": 0.0, "epoch": 11.48, "learning_rate": 0.00034654522613065323, "loss": 0.3354, "mlm_loss": 0.3354, "step": 14890 }, { "ep_loss": 0.0, "epoch": 11.49, "learning_rate": 0.00034623115577889446, "loss": 0.3337, "mlm_loss": 0.3337, "step": 14900 }, { "epoch": 11.49, "eval_ep_loss": -2.740074396133423, "eval_loss": 0.31319552659988403, "eval_mlm_loss": 0.31319552659988403, "eval_runtime": 59.6932, "eval_samples_per_second": 1170.468, "eval_steps_per_second": 0.586, "step": 14900 }, { "ep_loss": 0.0, "epoch": 11.5, "learning_rate": 0.0003459170854271357, "loss": 0.3321, "mlm_loss": 0.3321, "step": 14910 }, { "ep_loss": 0.0, "epoch": 11.51, "learning_rate": 0.0003456030150753769, "loss": 0.3382, "mlm_loss": 0.3382, "step": 14920 }, { "ep_loss": 0.0, "epoch": 11.52, "learning_rate": 0.0003452889447236181, "loss": 0.3352, "mlm_loss": 0.3352, "step": 14930 }, { "ep_loss": 0.0, "epoch": 11.52, "learning_rate": 0.0003449748743718593, "loss": 0.3359, "mlm_loss": 0.3359, "step": 14940 }, { "ep_loss": 0.0, "epoch": 11.53, "learning_rate": 0.00034466080402010055, "loss": 0.3323, "mlm_loss": 0.3323, "step": 14950 }, { "ep_loss": 0.0, "epoch": 11.54, "learning_rate": 0.0003443467336683417, "loss": 0.3371, "mlm_loss": 0.3371, "step": 14960 }, { "ep_loss": 0.0, "epoch": 11.55, "learning_rate": 0.0003440326633165829, "loss": 0.3341, "mlm_loss": 0.3341, "step": 14970 }, { "ep_loss": 0.0, "epoch": 11.55, "learning_rate": 0.0003437185929648241, "loss": 0.329, "mlm_loss": 0.329, "step": 14980 }, { "ep_loss": 0.0, "epoch": 11.56, "learning_rate": 0.0003434045226130653, "loss": 0.3323, "mlm_loss": 0.3323, "step": 14990 }, { "ep_loss": 0.0, "epoch": 11.57, "learning_rate": 0.0003430904522613065, "loss": 0.3349, "mlm_loss": 0.3349, "step": 15000 }, { "epoch": 11.57, "eval_ep_loss": -2.9178011417388916, "eval_loss": 0.3126050531864166, "eval_mlm_loss": 0.3126050531864166, "eval_runtime": 60.091, "eval_samples_per_second": 1162.72, "eval_steps_per_second": 0.582, "step": 15000 }, { "ep_loss": 0.0, "epoch": 11.58, "learning_rate": 0.00034277638190954775, "loss": 0.3315, "mlm_loss": 0.3315, "step": 15010 }, { "ep_loss": 0.0, "epoch": 11.59, "learning_rate": 0.00034246231155778893, "loss": 0.3303, "mlm_loss": 0.3303, "step": 15020 }, { "ep_loss": 0.0, "epoch": 11.59, "learning_rate": 0.00034214824120603016, "loss": 0.3337, "mlm_loss": 0.3337, "step": 15030 }, { "ep_loss": 0.0, "epoch": 11.6, "learning_rate": 0.0003418341708542714, "loss": 0.3299, "mlm_loss": 0.3299, "step": 15040 }, { "ep_loss": 0.0, "epoch": 11.61, "learning_rate": 0.0003415201005025126, "loss": 0.3366, "mlm_loss": 0.3366, "step": 15050 }, { "ep_loss": 0.0, "epoch": 11.62, "learning_rate": 0.0003412060301507538, "loss": 0.3185, "mlm_loss": 0.3185, "step": 15060 }, { "ep_loss": 0.0, "epoch": 11.62, "learning_rate": 0.00034089195979899496, "loss": 0.3277, "mlm_loss": 0.3277, "step": 15070 }, { "ep_loss": 0.0, "epoch": 11.63, "learning_rate": 0.0003405778894472362, "loss": 0.3351, "mlm_loss": 0.3351, "step": 15080 }, { "ep_loss": 0.0, "epoch": 11.64, "learning_rate": 0.00034026381909547736, "loss": 0.3306, "mlm_loss": 0.3306, "step": 15090 }, { "ep_loss": 0.0, "epoch": 11.65, "learning_rate": 0.0003399497487437186, "loss": 0.3279, "mlm_loss": 0.3279, "step": 15100 }, { "epoch": 11.65, "eval_ep_loss": -2.8001365661621094, "eval_loss": 0.31101852655410767, "eval_mlm_loss": 0.31101852655410767, "eval_runtime": 60.3645, "eval_samples_per_second": 1157.452, "eval_steps_per_second": 0.58, "step": 15100 }, { "ep_loss": 0.0, "epoch": 11.65, "learning_rate": 0.0003396356783919598, "loss": 0.3278, "mlm_loss": 0.3278, "step": 15110 }, { "ep_loss": 0.0, "epoch": 11.66, "learning_rate": 0.000339321608040201, "loss": 0.3356, "mlm_loss": 0.3356, "step": 15120 }, { "ep_loss": 0.0, "epoch": 11.67, "learning_rate": 0.0003390075376884422, "loss": 0.3322, "mlm_loss": 0.3322, "step": 15130 }, { "ep_loss": 0.0, "epoch": 11.68, "learning_rate": 0.00033869346733668345, "loss": 0.3279, "mlm_loss": 0.3279, "step": 15140 }, { "ep_loss": 0.0, "epoch": 11.69, "learning_rate": 0.0003383793969849246, "loss": 0.3347, "mlm_loss": 0.3347, "step": 15150 }, { "ep_loss": 0.0, "epoch": 11.69, "learning_rate": 0.00033806532663316585, "loss": 0.3343, "mlm_loss": 0.3343, "step": 15160 }, { "ep_loss": 0.0, "epoch": 11.7, "learning_rate": 0.000337751256281407, "loss": 0.3298, "mlm_loss": 0.3298, "step": 15170 }, { "ep_loss": 0.0, "epoch": 11.71, "learning_rate": 0.00033743718592964825, "loss": 0.3307, "mlm_loss": 0.3307, "step": 15180 }, { "ep_loss": 0.0, "epoch": 11.72, "learning_rate": 0.0003371231155778894, "loss": 0.3335, "mlm_loss": 0.3335, "step": 15190 }, { "ep_loss": 0.0, "epoch": 11.72, "learning_rate": 0.00033680904522613065, "loss": 0.332, "mlm_loss": 0.332, "step": 15200 }, { "epoch": 11.72, "eval_ep_loss": -2.703523635864258, "eval_loss": 0.3101367652416229, "eval_mlm_loss": 0.3101367652416229, "eval_runtime": 63.4743, "eval_samples_per_second": 1100.745, "eval_steps_per_second": 0.551, "step": 15200 }, { "ep_loss": 0.0, "epoch": 11.73, "learning_rate": 0.0003364949748743719, "loss": 0.3289, "mlm_loss": 0.3289, "step": 15210 }, { "ep_loss": 0.0, "epoch": 11.74, "learning_rate": 0.00033618090452261306, "loss": 0.3295, "mlm_loss": 0.3295, "step": 15220 }, { "ep_loss": 0.0, "epoch": 11.75, "learning_rate": 0.0003358668341708543, "loss": 0.3329, "mlm_loss": 0.3329, "step": 15230 }, { "ep_loss": 0.0, "epoch": 11.75, "learning_rate": 0.0003355527638190955, "loss": 0.3338, "mlm_loss": 0.3338, "step": 15240 }, { "ep_loss": 0.0, "epoch": 11.76, "learning_rate": 0.0003352386934673367, "loss": 0.3349, "mlm_loss": 0.3349, "step": 15250 }, { "ep_loss": 0.0, "epoch": 11.77, "learning_rate": 0.0003349246231155779, "loss": 0.3304, "mlm_loss": 0.3304, "step": 15260 }, { "ep_loss": 0.0, "epoch": 11.78, "learning_rate": 0.0003346105527638191, "loss": 0.329, "mlm_loss": 0.329, "step": 15270 }, { "ep_loss": 0.0, "epoch": 11.79, "learning_rate": 0.00033429648241206026, "loss": 0.3308, "mlm_loss": 0.3308, "step": 15280 }, { "ep_loss": 0.0, "epoch": 11.79, "learning_rate": 0.0003339824120603015, "loss": 0.3344, "mlm_loss": 0.3344, "step": 15290 }, { "ep_loss": 0.0, "epoch": 11.8, "learning_rate": 0.0003336683417085427, "loss": 0.3359, "mlm_loss": 0.3359, "step": 15300 }, { "epoch": 11.8, "eval_ep_loss": -2.812905788421631, "eval_loss": 0.3099025785923004, "eval_mlm_loss": 0.3099025785923004, "eval_runtime": 61.244, "eval_samples_per_second": 1140.831, "eval_steps_per_second": 0.571, "step": 15300 }, { "ep_loss": 0.0, "epoch": 11.81, "learning_rate": 0.00033335427135678395, "loss": 0.3334, "mlm_loss": 0.3334, "step": 15310 }, { "ep_loss": 0.0, "epoch": 11.82, "learning_rate": 0.0003330402010050251, "loss": 0.3209, "mlm_loss": 0.3209, "step": 15320 }, { "ep_loss": 0.0, "epoch": 11.82, "learning_rate": 0.00033272613065326635, "loss": 0.3275, "mlm_loss": 0.3275, "step": 15330 }, { "ep_loss": 0.0, "epoch": 11.83, "learning_rate": 0.0003324120603015076, "loss": 0.3285, "mlm_loss": 0.3285, "step": 15340 }, { "ep_loss": 0.0, "epoch": 11.84, "learning_rate": 0.00033209798994974875, "loss": 0.3274, "mlm_loss": 0.3274, "step": 15350 }, { "ep_loss": 0.0, "epoch": 11.85, "learning_rate": 0.00033178391959799, "loss": 0.3274, "mlm_loss": 0.3274, "step": 15360 }, { "ep_loss": 0.0, "epoch": 11.85, "learning_rate": 0.00033146984924623115, "loss": 0.3357, "mlm_loss": 0.3357, "step": 15370 }, { "ep_loss": 0.0, "epoch": 11.86, "learning_rate": 0.0003311557788944723, "loss": 0.3324, "mlm_loss": 0.3324, "step": 15380 }, { "ep_loss": 0.0, "epoch": 11.87, "learning_rate": 0.00033084170854271355, "loss": 0.3337, "mlm_loss": 0.3337, "step": 15390 }, { "ep_loss": 0.0, "epoch": 11.88, "learning_rate": 0.0003305276381909548, "loss": 0.3322, "mlm_loss": 0.3322, "step": 15400 }, { "epoch": 11.88, "eval_ep_loss": -2.6010334491729736, "eval_loss": 0.30834510922431946, "eval_mlm_loss": 0.30834510922431946, "eval_runtime": 62.7261, "eval_samples_per_second": 1113.875, "eval_steps_per_second": 0.558, "step": 15400 }, { "ep_loss": 0.0, "epoch": 11.89, "learning_rate": 0.00033021356783919596, "loss": 0.3327, "mlm_loss": 0.3327, "step": 15410 }, { "ep_loss": 0.0, "epoch": 11.89, "learning_rate": 0.0003298994974874372, "loss": 0.3269, "mlm_loss": 0.3269, "step": 15420 }, { "ep_loss": 0.0, "epoch": 11.9, "learning_rate": 0.0003295854271356784, "loss": 0.3238, "mlm_loss": 0.3238, "step": 15430 }, { "ep_loss": 0.0, "epoch": 11.91, "learning_rate": 0.00032927135678391964, "loss": 0.3273, "mlm_loss": 0.3273, "step": 15440 }, { "ep_loss": 0.0, "epoch": 11.92, "learning_rate": 0.0003289572864321608, "loss": 0.3373, "mlm_loss": 0.3373, "step": 15450 }, { "ep_loss": 0.0, "epoch": 11.92, "learning_rate": 0.00032864321608040204, "loss": 0.3262, "mlm_loss": 0.3262, "step": 15460 }, { "ep_loss": 0.0, "epoch": 11.93, "learning_rate": 0.0003283291457286432, "loss": 0.3314, "mlm_loss": 0.3314, "step": 15470 }, { "ep_loss": 0.0, "epoch": 11.94, "learning_rate": 0.0003280150753768844, "loss": 0.3296, "mlm_loss": 0.3296, "step": 15480 }, { "ep_loss": 0.0, "epoch": 11.95, "learning_rate": 0.0003277010050251256, "loss": 0.3249, "mlm_loss": 0.3249, "step": 15490 }, { "ep_loss": 0.0, "epoch": 11.96, "learning_rate": 0.00032738693467336685, "loss": 0.324, "mlm_loss": 0.324, "step": 15500 }, { "epoch": 11.96, "eval_ep_loss": -2.6185078620910645, "eval_loss": 0.30834418535232544, "eval_mlm_loss": 0.30834418535232544, "eval_runtime": 60.4091, "eval_samples_per_second": 1156.597, "eval_steps_per_second": 0.579, "step": 15500 }, { "ep_loss": 0.0, "epoch": 11.96, "learning_rate": 0.000327072864321608, "loss": 0.3247, "mlm_loss": 0.3247, "step": 15510 }, { "ep_loss": 0.0, "epoch": 11.97, "learning_rate": 0.00032675879396984925, "loss": 0.3281, "mlm_loss": 0.3281, "step": 15520 }, { "ep_loss": 0.0, "epoch": 11.98, "learning_rate": 0.0003264447236180905, "loss": 0.3287, "mlm_loss": 0.3287, "step": 15530 }, { "ep_loss": 0.0, "epoch": 11.99, "learning_rate": 0.0003261306532663317, "loss": 0.3272, "mlm_loss": 0.3272, "step": 15540 }, { "ep_loss": 0.0, "epoch": 11.99, "learning_rate": 0.0003258165829145729, "loss": 0.3355, "mlm_loss": 0.3355, "step": 15550 }, { "ep_loss": 0.0, "epoch": 12.0, "learning_rate": 0.0003255025125628141, "loss": 0.3299, "mlm_loss": 0.3299, "step": 15560 }, { "ep_loss": 0.0, "epoch": 12.01, "learning_rate": 0.0003251884422110553, "loss": 0.3293, "mlm_loss": 0.3293, "step": 15570 }, { "ep_loss": 0.0, "epoch": 12.02, "learning_rate": 0.00032487437185929645, "loss": 0.3268, "mlm_loss": 0.3268, "step": 15580 }, { "ep_loss": 0.0, "epoch": 12.02, "learning_rate": 0.0003245603015075377, "loss": 0.3202, "mlm_loss": 0.3202, "step": 15590 }, { "ep_loss": 0.0, "epoch": 12.03, "learning_rate": 0.0003242462311557789, "loss": 0.3211, "mlm_loss": 0.3211, "step": 15600 }, { "epoch": 12.03, "eval_ep_loss": -2.552333354949951, "eval_loss": 0.30786365270614624, "eval_mlm_loss": 0.30786365270614624, "eval_runtime": 61.2728, "eval_samples_per_second": 1140.295, "eval_steps_per_second": 0.571, "step": 15600 }, { "ep_loss": 0.0, "epoch": 12.04, "learning_rate": 0.0003239321608040201, "loss": 0.3184, "mlm_loss": 0.3184, "step": 15610 }, { "ep_loss": 0.0, "epoch": 12.05, "learning_rate": 0.0003236180904522613, "loss": 0.3271, "mlm_loss": 0.3271, "step": 15620 }, { "ep_loss": 0.0, "epoch": 12.06, "learning_rate": 0.00032330402010050254, "loss": 0.3285, "mlm_loss": 0.3285, "step": 15630 }, { "ep_loss": 0.0, "epoch": 12.06, "learning_rate": 0.0003229899497487437, "loss": 0.3214, "mlm_loss": 0.3214, "step": 15640 }, { "ep_loss": 0.0, "epoch": 12.07, "learning_rate": 0.00032267587939698494, "loss": 0.3269, "mlm_loss": 0.3269, "step": 15650 }, { "ep_loss": 0.0, "epoch": 12.08, "learning_rate": 0.00032236180904522617, "loss": 0.3222, "mlm_loss": 0.3222, "step": 15660 }, { "ep_loss": 0.0, "epoch": 12.09, "learning_rate": 0.00032204773869346734, "loss": 0.321, "mlm_loss": 0.321, "step": 15670 }, { "ep_loss": 0.0, "epoch": 12.09, "learning_rate": 0.0003217336683417085, "loss": 0.3198, "mlm_loss": 0.3198, "step": 15680 }, { "ep_loss": 0.0, "epoch": 12.1, "learning_rate": 0.00032141959798994975, "loss": 0.3213, "mlm_loss": 0.3213, "step": 15690 }, { "ep_loss": 0.0, "epoch": 12.11, "learning_rate": 0.000321105527638191, "loss": 0.327, "mlm_loss": 0.327, "step": 15700 }, { "epoch": 12.11, "eval_ep_loss": -2.4123196601867676, "eval_loss": 0.3056265413761139, "eval_mlm_loss": 0.3056265413761139, "eval_runtime": 62.3312, "eval_samples_per_second": 1120.931, "eval_steps_per_second": 0.562, "step": 15700 }, { "ep_loss": 0.0, "epoch": 12.12, "learning_rate": 0.00032079145728643215, "loss": 0.3254, "mlm_loss": 0.3254, "step": 15710 }, { "ep_loss": 0.0, "epoch": 12.12, "learning_rate": 0.0003204773869346734, "loss": 0.3218, "mlm_loss": 0.3218, "step": 15720 }, { "ep_loss": 0.0, "epoch": 12.13, "learning_rate": 0.0003201633165829146, "loss": 0.3266, "mlm_loss": 0.3266, "step": 15730 }, { "ep_loss": 0.0, "epoch": 12.14, "learning_rate": 0.0003198492462311558, "loss": 0.3181, "mlm_loss": 0.3181, "step": 15740 }, { "ep_loss": 0.0, "epoch": 12.15, "learning_rate": 0.000319535175879397, "loss": 0.3226, "mlm_loss": 0.3226, "step": 15750 }, { "ep_loss": 0.0, "epoch": 12.16, "learning_rate": 0.00031922110552763823, "loss": 0.3271, "mlm_loss": 0.3271, "step": 15760 }, { "ep_loss": 0.0, "epoch": 12.16, "learning_rate": 0.00031890703517587935, "loss": 0.3183, "mlm_loss": 0.3183, "step": 15770 }, { "ep_loss": 0.0, "epoch": 12.17, "learning_rate": 0.0003185929648241206, "loss": 0.3108, "mlm_loss": 0.3108, "step": 15780 }, { "ep_loss": 0.0, "epoch": 12.18, "learning_rate": 0.0003182788944723618, "loss": 0.3314, "mlm_loss": 0.3314, "step": 15790 }, { "ep_loss": 0.0, "epoch": 12.19, "learning_rate": 0.00031796482412060304, "loss": 0.3226, "mlm_loss": 0.3226, "step": 15800 }, { "epoch": 12.19, "eval_ep_loss": -2.378340244293213, "eval_loss": 0.3046373128890991, "eval_mlm_loss": 0.3046373128890991, "eval_runtime": 61.1512, "eval_samples_per_second": 1142.562, "eval_steps_per_second": 0.572, "step": 15800 }, { "ep_loss": 0.0, "epoch": 12.19, "learning_rate": 0.0003176507537688442, "loss": 0.3179, "mlm_loss": 0.3179, "step": 15810 }, { "ep_loss": 0.0, "epoch": 12.2, "learning_rate": 0.00031733668341708544, "loss": 0.3173, "mlm_loss": 0.3173, "step": 15820 }, { "ep_loss": 0.0, "epoch": 12.21, "learning_rate": 0.00031702261306532667, "loss": 0.3267, "mlm_loss": 0.3267, "step": 15830 }, { "ep_loss": 0.0, "epoch": 12.22, "learning_rate": 0.00031670854271356784, "loss": 0.3207, "mlm_loss": 0.3207, "step": 15840 }, { "ep_loss": 0.0, "epoch": 12.23, "learning_rate": 0.00031639447236180907, "loss": 0.327, "mlm_loss": 0.327, "step": 15850 }, { "ep_loss": 0.0, "epoch": 12.23, "learning_rate": 0.0003160804020100503, "loss": 0.3271, "mlm_loss": 0.3271, "step": 15860 }, { "ep_loss": 0.0, "epoch": 12.24, "learning_rate": 0.0003157663316582914, "loss": 0.3287, "mlm_loss": 0.3287, "step": 15870 }, { "ep_loss": 0.0, "epoch": 12.25, "learning_rate": 0.00031545226130653265, "loss": 0.3212, "mlm_loss": 0.3212, "step": 15880 }, { "ep_loss": 0.0, "epoch": 12.26, "learning_rate": 0.0003151381909547739, "loss": 0.3246, "mlm_loss": 0.3246, "step": 15890 }, { "ep_loss": 0.0, "epoch": 12.26, "learning_rate": 0.00031482412060301505, "loss": 0.3205, "mlm_loss": 0.3205, "step": 15900 }, { "epoch": 12.26, "eval_ep_loss": -2.3220415115356445, "eval_loss": 0.305078387260437, "eval_mlm_loss": 0.305078387260437, "eval_runtime": 61.9296, "eval_samples_per_second": 1128.2, "eval_steps_per_second": 0.565, "step": 15900 }, { "ep_loss": 0.0, "epoch": 12.27, "learning_rate": 0.0003145100502512563, "loss": 0.323, "mlm_loss": 0.323, "step": 15910 }, { "ep_loss": 0.0, "epoch": 12.28, "learning_rate": 0.0003141959798994975, "loss": 0.3259, "mlm_loss": 0.3259, "step": 15920 }, { "ep_loss": 0.0, "epoch": 12.29, "learning_rate": 0.00031388190954773873, "loss": 0.3277, "mlm_loss": 0.3277, "step": 15930 }, { "ep_loss": 0.0, "epoch": 12.29, "learning_rate": 0.0003135678391959799, "loss": 0.3207, "mlm_loss": 0.3207, "step": 15940 }, { "ep_loss": 0.0, "epoch": 12.3, "learning_rate": 0.00031325376884422113, "loss": 0.3269, "mlm_loss": 0.3269, "step": 15950 }, { "ep_loss": 0.0, "epoch": 12.31, "learning_rate": 0.00031293969849246236, "loss": 0.3261, "mlm_loss": 0.3261, "step": 15960 }, { "ep_loss": 0.0, "epoch": 12.32, "learning_rate": 0.0003126256281407035, "loss": 0.3187, "mlm_loss": 0.3187, "step": 15970 }, { "ep_loss": 0.0, "epoch": 12.33, "learning_rate": 0.0003123115577889447, "loss": 0.3191, "mlm_loss": 0.3191, "step": 15980 }, { "ep_loss": 0.0, "epoch": 12.33, "learning_rate": 0.00031199748743718594, "loss": 0.3214, "mlm_loss": 0.3214, "step": 15990 }, { "ep_loss": 0.0, "epoch": 12.34, "learning_rate": 0.0003116834170854271, "loss": 0.322, "mlm_loss": 0.322, "step": 16000 }, { "epoch": 12.34, "eval_ep_loss": -2.4821033477783203, "eval_loss": 0.30568239092826843, "eval_mlm_loss": 0.30568239092826843, "eval_runtime": 59.8027, "eval_samples_per_second": 1168.326, "eval_steps_per_second": 0.585, "step": 16000 }, { "ep_loss": 0.0, "epoch": 12.35, "learning_rate": 0.00031136934673366834, "loss": 0.3212, "mlm_loss": 0.3212, "step": 16010 }, { "ep_loss": 0.0, "epoch": 12.36, "learning_rate": 0.00031105527638190957, "loss": 0.3184, "mlm_loss": 0.3184, "step": 16020 }, { "ep_loss": 0.0, "epoch": 12.36, "learning_rate": 0.00031074120603015074, "loss": 0.3178, "mlm_loss": 0.3178, "step": 16030 }, { "ep_loss": 0.0, "epoch": 12.37, "learning_rate": 0.00031042713567839197, "loss": 0.325, "mlm_loss": 0.325, "step": 16040 }, { "ep_loss": 0.0, "epoch": 12.38, "learning_rate": 0.0003101130653266332, "loss": 0.3274, "mlm_loss": 0.3274, "step": 16050 }, { "ep_loss": 0.0, "epoch": 12.39, "learning_rate": 0.00030979899497487443, "loss": 0.3178, "mlm_loss": 0.3178, "step": 16060 }, { "ep_loss": 0.0, "epoch": 12.39, "learning_rate": 0.00030948492462311555, "loss": 0.3215, "mlm_loss": 0.3215, "step": 16070 }, { "ep_loss": 0.0, "epoch": 12.4, "learning_rate": 0.0003091708542713568, "loss": 0.3302, "mlm_loss": 0.3302, "step": 16080 }, { "ep_loss": 0.0, "epoch": 12.41, "learning_rate": 0.000308856783919598, "loss": 0.3251, "mlm_loss": 0.3251, "step": 16090 }, { "ep_loss": 0.0, "epoch": 12.42, "learning_rate": 0.0003085427135678392, "loss": 0.3253, "mlm_loss": 0.3253, "step": 16100 }, { "epoch": 12.42, "eval_ep_loss": -2.364391565322876, "eval_loss": 0.3045331537723541, "eval_mlm_loss": 0.3045331537723541, "eval_runtime": 64.7442, "eval_samples_per_second": 1079.155, "eval_steps_per_second": 0.541, "step": 16100 }, { "ep_loss": 0.0, "epoch": 12.43, "learning_rate": 0.0003082286432160804, "loss": 0.3256, "mlm_loss": 0.3256, "step": 16110 }, { "ep_loss": 0.0, "epoch": 12.43, "learning_rate": 0.00030791457286432163, "loss": 0.3187, "mlm_loss": 0.3187, "step": 16120 }, { "ep_loss": 0.0, "epoch": 12.44, "learning_rate": 0.0003076005025125628, "loss": 0.32, "mlm_loss": 0.32, "step": 16130 }, { "ep_loss": 0.0, "epoch": 12.45, "learning_rate": 0.00030728643216080403, "loss": 0.3257, "mlm_loss": 0.3257, "step": 16140 }, { "ep_loss": 0.0, "epoch": 12.46, "learning_rate": 0.00030697236180904526, "loss": 0.3199, "mlm_loss": 0.3199, "step": 16150 }, { "ep_loss": 0.0, "epoch": 12.46, "learning_rate": 0.0003066582914572865, "loss": 0.3205, "mlm_loss": 0.3205, "step": 16160 }, { "ep_loss": 0.0, "epoch": 12.47, "learning_rate": 0.0003063442211055276, "loss": 0.323, "mlm_loss": 0.323, "step": 16170 }, { "ep_loss": 0.0, "epoch": 12.48, "learning_rate": 0.00030603015075376884, "loss": 0.3194, "mlm_loss": 0.3194, "step": 16180 }, { "ep_loss": 0.0, "epoch": 12.49, "learning_rate": 0.00030571608040201007, "loss": 0.3241, "mlm_loss": 0.3241, "step": 16190 }, { "ep_loss": 0.0, "epoch": 12.5, "learning_rate": 0.00030540201005025124, "loss": 0.3215, "mlm_loss": 0.3215, "step": 16200 }, { "epoch": 12.5, "eval_ep_loss": -2.4813191890716553, "eval_loss": 0.30195391178131104, "eval_mlm_loss": 0.30195391178131104, "eval_runtime": 61.4192, "eval_samples_per_second": 1137.576, "eval_steps_per_second": 0.57, "step": 16200 }, { "ep_loss": 0.0, "epoch": 12.5, "learning_rate": 0.00030508793969849247, "loss": 0.3197, "mlm_loss": 0.3197, "step": 16210 }, { "ep_loss": 0.0, "epoch": 12.51, "learning_rate": 0.0003047738693467337, "loss": 0.3183, "mlm_loss": 0.3183, "step": 16220 }, { "ep_loss": 0.0, "epoch": 12.52, "learning_rate": 0.00030445979899497487, "loss": 0.3231, "mlm_loss": 0.3231, "step": 16230 }, { "ep_loss": 0.0, "epoch": 12.53, "learning_rate": 0.0003041457286432161, "loss": 0.3137, "mlm_loss": 0.3137, "step": 16240 }, { "ep_loss": 0.0, "epoch": 12.53, "learning_rate": 0.00030383165829145733, "loss": 0.3178, "mlm_loss": 0.3178, "step": 16250 }, { "ep_loss": 0.0, "epoch": 12.54, "learning_rate": 0.0003035175879396985, "loss": 0.3178, "mlm_loss": 0.3178, "step": 16260 }, { "ep_loss": 0.0, "epoch": 12.55, "learning_rate": 0.0003032035175879397, "loss": 0.3167, "mlm_loss": 0.3167, "step": 16270 }, { "ep_loss": 0.0, "epoch": 12.56, "learning_rate": 0.0003028894472361809, "loss": 0.3163, "mlm_loss": 0.3163, "step": 16280 }, { "ep_loss": 0.0, "epoch": 12.56, "learning_rate": 0.00030257537688442213, "loss": 0.3234, "mlm_loss": 0.3234, "step": 16290 }, { "ep_loss": 0.0, "epoch": 12.57, "learning_rate": 0.0003022613065326633, "loss": 0.3245, "mlm_loss": 0.3245, "step": 16300 }, { "epoch": 12.57, "eval_ep_loss": -2.5413894653320312, "eval_loss": 0.30073004961013794, "eval_mlm_loss": 0.30073004961013794, "eval_runtime": 60.8169, "eval_samples_per_second": 1148.842, "eval_steps_per_second": 0.575, "step": 16300 }, { "ep_loss": 0.0, "epoch": 12.58, "learning_rate": 0.00030194723618090453, "loss": 0.3177, "mlm_loss": 0.3177, "step": 16310 }, { "ep_loss": 0.0, "epoch": 12.59, "learning_rate": 0.00030163316582914576, "loss": 0.3175, "mlm_loss": 0.3175, "step": 16320 }, { "ep_loss": 0.0, "epoch": 12.6, "learning_rate": 0.00030131909547738694, "loss": 0.3225, "mlm_loss": 0.3225, "step": 16330 }, { "ep_loss": 0.0, "epoch": 12.6, "learning_rate": 0.00030100502512562816, "loss": 0.3195, "mlm_loss": 0.3195, "step": 16340 }, { "ep_loss": 0.0, "epoch": 12.61, "learning_rate": 0.0003006909547738694, "loss": 0.315, "mlm_loss": 0.315, "step": 16350 }, { "ep_loss": 0.0, "epoch": 12.62, "learning_rate": 0.00030037688442211057, "loss": 0.3238, "mlm_loss": 0.3238, "step": 16360 }, { "ep_loss": 0.0, "epoch": 12.63, "learning_rate": 0.00030006281407035174, "loss": 0.3134, "mlm_loss": 0.3134, "step": 16370 }, { "ep_loss": 0.0, "epoch": 12.63, "learning_rate": 0.00029974874371859297, "loss": 0.3246, "mlm_loss": 0.3246, "step": 16380 }, { "ep_loss": 0.0, "epoch": 12.64, "learning_rate": 0.00029943467336683414, "loss": 0.3153, "mlm_loss": 0.3153, "step": 16390 }, { "ep_loss": 0.0, "epoch": 12.65, "learning_rate": 0.00029912060301507537, "loss": 0.3205, "mlm_loss": 0.3205, "step": 16400 }, { "epoch": 12.65, "eval_ep_loss": -2.454632043838501, "eval_loss": 0.3005962073802948, "eval_mlm_loss": 0.3005962073802948, "eval_runtime": 61.3758, "eval_samples_per_second": 1138.381, "eval_steps_per_second": 0.57, "step": 16400 }, { "ep_loss": 0.0, "epoch": 12.66, "learning_rate": 0.0002988065326633166, "loss": 0.3132, "mlm_loss": 0.3132, "step": 16410 }, { "ep_loss": 0.0, "epoch": 12.66, "learning_rate": 0.0002984924623115578, "loss": 0.3186, "mlm_loss": 0.3186, "step": 16420 }, { "ep_loss": 0.0, "epoch": 12.67, "learning_rate": 0.000298178391959799, "loss": 0.3228, "mlm_loss": 0.3228, "step": 16430 }, { "ep_loss": 0.0, "epoch": 12.68, "learning_rate": 0.00029786432160804023, "loss": 0.3134, "mlm_loss": 0.3134, "step": 16440 }, { "ep_loss": 0.0, "epoch": 12.69, "learning_rate": 0.00029755025125628146, "loss": 0.3155, "mlm_loss": 0.3155, "step": 16450 }, { "ep_loss": 0.0, "epoch": 12.7, "learning_rate": 0.00029723618090452263, "loss": 0.3166, "mlm_loss": 0.3166, "step": 16460 }, { "ep_loss": 0.0, "epoch": 12.7, "learning_rate": 0.0002969221105527638, "loss": 0.3285, "mlm_loss": 0.3285, "step": 16470 }, { "ep_loss": 0.0, "epoch": 12.71, "learning_rate": 0.00029660804020100503, "loss": 0.3172, "mlm_loss": 0.3172, "step": 16480 }, { "ep_loss": 0.0, "epoch": 12.72, "learning_rate": 0.0002962939698492462, "loss": 0.3185, "mlm_loss": 0.3185, "step": 16490 }, { "ep_loss": 0.0, "epoch": 12.73, "learning_rate": 0.00029597989949748743, "loss": 0.3154, "mlm_loss": 0.3154, "step": 16500 }, { "epoch": 12.73, "eval_ep_loss": -2.530627489089966, "eval_loss": 0.2981483042240143, "eval_mlm_loss": 0.2981483042240143, "eval_runtime": 62.8785, "eval_samples_per_second": 1111.175, "eval_steps_per_second": 0.557, "step": 16500 }, { "ep_loss": 0.0, "epoch": 12.73, "learning_rate": 0.00029566582914572866, "loss": 0.3206, "mlm_loss": 0.3206, "step": 16510 }, { "ep_loss": 0.0, "epoch": 12.74, "learning_rate": 0.00029535175879396984, "loss": 0.3171, "mlm_loss": 0.3171, "step": 16520 }, { "ep_loss": 0.0, "epoch": 12.75, "learning_rate": 0.00029503768844221106, "loss": 0.3235, "mlm_loss": 0.3235, "step": 16530 }, { "ep_loss": 0.0, "epoch": 12.76, "learning_rate": 0.0002947236180904523, "loss": 0.3188, "mlm_loss": 0.3188, "step": 16540 }, { "ep_loss": 0.0, "epoch": 12.77, "learning_rate": 0.0002944095477386935, "loss": 0.3197, "mlm_loss": 0.3197, "step": 16550 }, { "ep_loss": 0.0, "epoch": 12.77, "learning_rate": 0.0002940954773869347, "loss": 0.3218, "mlm_loss": 0.3218, "step": 16560 }, { "ep_loss": 0.0, "epoch": 12.78, "learning_rate": 0.00029378140703517587, "loss": 0.3105, "mlm_loss": 0.3105, "step": 16570 }, { "ep_loss": 0.0, "epoch": 12.79, "learning_rate": 0.0002934673366834171, "loss": 0.3168, "mlm_loss": 0.3168, "step": 16580 }, { "ep_loss": 0.0, "epoch": 12.8, "learning_rate": 0.00029315326633165827, "loss": 0.324, "mlm_loss": 0.324, "step": 16590 }, { "ep_loss": 0.0, "epoch": 12.8, "learning_rate": 0.0002928391959798995, "loss": 0.314, "mlm_loss": 0.314, "step": 16600 }, { "epoch": 12.8, "eval_ep_loss": -2.1559700965881348, "eval_loss": 0.2988715171813965, "eval_mlm_loss": 0.2988715171813965, "eval_runtime": 62.228, "eval_samples_per_second": 1122.791, "eval_steps_per_second": 0.562, "step": 16600 }, { "ep_loss": 0.0, "epoch": 12.81, "learning_rate": 0.0002925251256281407, "loss": 0.3144, "mlm_loss": 0.3144, "step": 16610 }, { "ep_loss": 0.0, "epoch": 12.82, "learning_rate": 0.0002922110552763819, "loss": 0.3139, "mlm_loss": 0.3139, "step": 16620 }, { "ep_loss": 0.0, "epoch": 12.83, "learning_rate": 0.00029189698492462313, "loss": 0.3193, "mlm_loss": 0.3193, "step": 16630 }, { "ep_loss": 0.0, "epoch": 12.83, "learning_rate": 0.00029158291457286436, "loss": 0.3153, "mlm_loss": 0.3153, "step": 16640 }, { "ep_loss": 0.0, "epoch": 12.84, "learning_rate": 0.00029126884422110553, "loss": 0.3103, "mlm_loss": 0.3103, "step": 16650 }, { "ep_loss": 0.0, "epoch": 12.85, "learning_rate": 0.00029095477386934676, "loss": 0.3129, "mlm_loss": 0.3129, "step": 16660 }, { "ep_loss": 0.0, "epoch": 12.86, "learning_rate": 0.00029064070351758793, "loss": 0.3176, "mlm_loss": 0.3176, "step": 16670 }, { "ep_loss": 0.0, "epoch": 12.87, "learning_rate": 0.00029032663316582916, "loss": 0.3121, "mlm_loss": 0.3121, "step": 16680 }, { "ep_loss": 0.0, "epoch": 12.87, "learning_rate": 0.00029001256281407033, "loss": 0.3153, "mlm_loss": 0.3153, "step": 16690 }, { "ep_loss": 0.0, "epoch": 12.88, "learning_rate": 0.00028969849246231156, "loss": 0.3182, "mlm_loss": 0.3182, "step": 16700 }, { "epoch": 12.88, "eval_ep_loss": -2.1432950496673584, "eval_loss": 0.2975234389305115, "eval_mlm_loss": 0.2975234389305115, "eval_runtime": 60.288, "eval_samples_per_second": 1158.921, "eval_steps_per_second": 0.581, "step": 16700 }, { "ep_loss": 0.0, "epoch": 12.89, "learning_rate": 0.0002893844221105528, "loss": 0.316, "mlm_loss": 0.316, "step": 16710 }, { "ep_loss": 0.0, "epoch": 12.9, "learning_rate": 0.00028907035175879396, "loss": 0.3171, "mlm_loss": 0.3171, "step": 16720 }, { "ep_loss": 0.0, "epoch": 12.9, "learning_rate": 0.0002887562814070352, "loss": 0.3202, "mlm_loss": 0.3202, "step": 16730 }, { "ep_loss": 0.0, "epoch": 12.91, "learning_rate": 0.0002884422110552764, "loss": 0.3123, "mlm_loss": 0.3123, "step": 16740 }, { "ep_loss": 0.0, "epoch": 12.92, "learning_rate": 0.0002881281407035176, "loss": 0.3117, "mlm_loss": 0.3117, "step": 16750 }, { "ep_loss": 0.0, "epoch": 12.93, "learning_rate": 0.0002878140703517588, "loss": 0.3155, "mlm_loss": 0.3155, "step": 16760 }, { "ep_loss": 0.0, "epoch": 12.93, "learning_rate": 0.0002875, "loss": 0.3198, "mlm_loss": 0.3198, "step": 16770 }, { "ep_loss": 0.0, "epoch": 12.94, "learning_rate": 0.00028718592964824117, "loss": 0.3106, "mlm_loss": 0.3106, "step": 16780 }, { "ep_loss": 0.0, "epoch": 12.95, "learning_rate": 0.0002868718592964824, "loss": 0.3163, "mlm_loss": 0.3163, "step": 16790 }, { "ep_loss": 0.0, "epoch": 12.96, "learning_rate": 0.0002865577889447236, "loss": 0.3154, "mlm_loss": 0.3154, "step": 16800 }, { "epoch": 12.96, "eval_ep_loss": -2.0711417198181152, "eval_loss": 0.2960880398750305, "eval_mlm_loss": 0.2960880398750305, "eval_runtime": 62.0434, "eval_samples_per_second": 1126.132, "eval_steps_per_second": 0.564, "step": 16800 }, { "ep_loss": 0.0, "epoch": 12.97, "learning_rate": 0.00028624371859296485, "loss": 0.3144, "mlm_loss": 0.3144, "step": 16810 }, { "ep_loss": 0.0, "epoch": 12.97, "learning_rate": 0.00028592964824120603, "loss": 0.3133, "mlm_loss": 0.3133, "step": 16820 }, { "ep_loss": 0.0, "epoch": 12.98, "learning_rate": 0.00028561557788944726, "loss": 0.3093, "mlm_loss": 0.3093, "step": 16830 }, { "ep_loss": 0.0, "epoch": 12.99, "learning_rate": 0.0002853015075376885, "loss": 0.3172, "mlm_loss": 0.3172, "step": 16840 }, { "ep_loss": 0.0, "epoch": 13.0, "learning_rate": 0.00028498743718592966, "loss": 0.3161, "mlm_loss": 0.3161, "step": 16850 }, { "ep_loss": 0.0, "epoch": 13.0, "learning_rate": 0.00028467336683417083, "loss": 0.3186, "mlm_loss": 0.3186, "step": 16860 }, { "ep_loss": 0.0, "epoch": 13.01, "learning_rate": 0.00028435929648241206, "loss": 0.3094, "mlm_loss": 0.3094, "step": 16870 }, { "ep_loss": 0.0, "epoch": 13.02, "learning_rate": 0.00028404522613065323, "loss": 0.3164, "mlm_loss": 0.3164, "step": 16880 }, { "ep_loss": 0.0, "epoch": 13.03, "learning_rate": 0.00028373115577889446, "loss": 0.3129, "mlm_loss": 0.3129, "step": 16890 }, { "ep_loss": 0.0, "epoch": 13.04, "learning_rate": 0.0002834170854271357, "loss": 0.3138, "mlm_loss": 0.3138, "step": 16900 }, { "epoch": 13.04, "eval_ep_loss": -2.157414436340332, "eval_loss": 0.2959713041782379, "eval_mlm_loss": 0.2959713041782379, "eval_runtime": 59.8502, "eval_samples_per_second": 1167.398, "eval_steps_per_second": 0.585, "step": 16900 }, { "ep_loss": 0.0, "epoch": 13.04, "learning_rate": 0.0002831030150753769, "loss": 0.314, "mlm_loss": 0.314, "step": 16910 }, { "ep_loss": 0.0, "epoch": 13.05, "learning_rate": 0.0002827889447236181, "loss": 0.3165, "mlm_loss": 0.3165, "step": 16920 }, { "ep_loss": 0.0, "epoch": 13.06, "learning_rate": 0.0002824748743718593, "loss": 0.3062, "mlm_loss": 0.3062, "step": 16930 }, { "ep_loss": 0.0, "epoch": 13.07, "learning_rate": 0.00028216080402010055, "loss": 0.3115, "mlm_loss": 0.3115, "step": 16940 }, { "ep_loss": 0.0, "epoch": 13.07, "learning_rate": 0.0002818467336683417, "loss": 0.3098, "mlm_loss": 0.3098, "step": 16950 }, { "ep_loss": 0.0, "epoch": 13.08, "learning_rate": 0.0002815326633165829, "loss": 0.3064, "mlm_loss": 0.3064, "step": 16960 }, { "ep_loss": 0.0, "epoch": 13.09, "learning_rate": 0.0002812185929648241, "loss": 0.3046, "mlm_loss": 0.3046, "step": 16970 }, { "ep_loss": 0.0, "epoch": 13.1, "learning_rate": 0.0002809045226130653, "loss": 0.3117, "mlm_loss": 0.3117, "step": 16980 }, { "ep_loss": 0.0, "epoch": 13.1, "learning_rate": 0.0002805904522613065, "loss": 0.3128, "mlm_loss": 0.3128, "step": 16990 }, { "ep_loss": 0.0, "epoch": 13.11, "learning_rate": 0.00028027638190954775, "loss": 0.3065, "mlm_loss": 0.3065, "step": 17000 }, { "epoch": 13.11, "eval_ep_loss": -2.252915620803833, "eval_loss": 0.2950764298439026, "eval_mlm_loss": 0.2950764298439026, "eval_runtime": 60.871, "eval_samples_per_second": 1147.821, "eval_steps_per_second": 0.575, "step": 17000 }, { "ep_loss": 0.0, "epoch": 13.12, "learning_rate": 0.00027996231155778893, "loss": 0.3113, "mlm_loss": 0.3113, "step": 17010 }, { "ep_loss": 0.0, "epoch": 13.13, "learning_rate": 0.00027964824120603016, "loss": 0.3136, "mlm_loss": 0.3136, "step": 17020 }, { "ep_loss": 0.0, "epoch": 13.14, "learning_rate": 0.0002793341708542714, "loss": 0.309, "mlm_loss": 0.309, "step": 17030 }, { "ep_loss": 0.0, "epoch": 13.14, "learning_rate": 0.0002790201005025126, "loss": 0.3167, "mlm_loss": 0.3167, "step": 17040 }, { "ep_loss": 0.0, "epoch": 13.15, "learning_rate": 0.0002787060301507538, "loss": 0.308, "mlm_loss": 0.308, "step": 17050 }, { "ep_loss": 0.0, "epoch": 13.16, "learning_rate": 0.00027839195979899496, "loss": 0.3183, "mlm_loss": 0.3183, "step": 17060 }, { "ep_loss": 0.0, "epoch": 13.17, "learning_rate": 0.0002780778894472362, "loss": 0.3097, "mlm_loss": 0.3097, "step": 17070 }, { "ep_loss": 0.0, "epoch": 13.17, "learning_rate": 0.00027776381909547736, "loss": 0.312, "mlm_loss": 0.312, "step": 17080 }, { "ep_loss": 0.0, "epoch": 13.18, "learning_rate": 0.0002774497487437186, "loss": 0.3171, "mlm_loss": 0.3171, "step": 17090 }, { "ep_loss": 0.0, "epoch": 13.19, "learning_rate": 0.0002771356783919598, "loss": 0.3072, "mlm_loss": 0.3072, "step": 17100 }, { "epoch": 13.19, "eval_ep_loss": -2.6266889572143555, "eval_loss": 0.2936928868293762, "eval_mlm_loss": 0.2936928868293762, "eval_runtime": 60.0035, "eval_samples_per_second": 1164.416, "eval_steps_per_second": 0.583, "step": 17100 }, { "ep_loss": 0.0, "epoch": 13.2, "learning_rate": 0.000276821608040201, "loss": 0.311, "mlm_loss": 0.311, "step": 17110 }, { "ep_loss": 0.0, "epoch": 13.2, "learning_rate": 0.0002765075376884422, "loss": 0.3083, "mlm_loss": 0.3083, "step": 17120 }, { "ep_loss": 0.0, "epoch": 13.21, "learning_rate": 0.00027619346733668345, "loss": 0.3102, "mlm_loss": 0.3102, "step": 17130 }, { "ep_loss": 0.0, "epoch": 13.22, "learning_rate": 0.0002758793969849246, "loss": 0.3073, "mlm_loss": 0.3073, "step": 17140 }, { "ep_loss": 0.0, "epoch": 13.23, "learning_rate": 0.00027556532663316585, "loss": 0.3107, "mlm_loss": 0.3107, "step": 17150 }, { "ep_loss": 0.0, "epoch": 13.24, "learning_rate": 0.000275251256281407, "loss": 0.3076, "mlm_loss": 0.3076, "step": 17160 }, { "ep_loss": 0.0, "epoch": 13.24, "learning_rate": 0.00027493718592964825, "loss": 0.3042, "mlm_loss": 0.3042, "step": 17170 }, { "ep_loss": 0.0, "epoch": 13.25, "learning_rate": 0.0002746231155778894, "loss": 0.3121, "mlm_loss": 0.3121, "step": 17180 }, { "ep_loss": 0.0, "epoch": 13.26, "learning_rate": 0.00027430904522613065, "loss": 0.3078, "mlm_loss": 0.3078, "step": 17190 }, { "ep_loss": 0.0, "epoch": 13.27, "learning_rate": 0.0002739949748743719, "loss": 0.3111, "mlm_loss": 0.3111, "step": 17200 }, { "epoch": 13.27, "eval_ep_loss": -2.6017704010009766, "eval_loss": 0.29203662276268005, "eval_mlm_loss": 0.29203662276268005, "eval_runtime": 61.3385, "eval_samples_per_second": 1139.073, "eval_steps_per_second": 0.571, "step": 17200 }, { "ep_loss": 0.0, "epoch": 13.27, "learning_rate": 0.00027368090452261306, "loss": 0.3096, "mlm_loss": 0.3096, "step": 17210 }, { "ep_loss": 0.0, "epoch": 13.28, "learning_rate": 0.0002733668341708543, "loss": 0.3122, "mlm_loss": 0.3122, "step": 17220 }, { "ep_loss": 0.0, "epoch": 13.29, "learning_rate": 0.0002730527638190955, "loss": 0.3051, "mlm_loss": 0.3051, "step": 17230 }, { "ep_loss": 0.0, "epoch": 13.3, "learning_rate": 0.0002727386934673367, "loss": 0.3064, "mlm_loss": 0.3064, "step": 17240 }, { "ep_loss": 0.0, "epoch": 13.31, "learning_rate": 0.0002724246231155779, "loss": 0.3129, "mlm_loss": 0.3129, "step": 17250 }, { "ep_loss": 0.0, "epoch": 13.31, "learning_rate": 0.0002721105527638191, "loss": 0.3055, "mlm_loss": 0.3055, "step": 17260 }, { "ep_loss": 0.0, "epoch": 13.32, "learning_rate": 0.00027179648241206026, "loss": 0.3152, "mlm_loss": 0.3152, "step": 17270 }, { "ep_loss": 0.0, "epoch": 13.33, "learning_rate": 0.0002714824120603015, "loss": 0.3112, "mlm_loss": 0.3112, "step": 17280 }, { "ep_loss": 0.0, "epoch": 13.34, "learning_rate": 0.0002711683417085427, "loss": 0.3106, "mlm_loss": 0.3106, "step": 17290 }, { "ep_loss": 0.0, "epoch": 13.34, "learning_rate": 0.00027085427135678395, "loss": 0.3122, "mlm_loss": 0.3122, "step": 17300 }, { "epoch": 13.34, "eval_ep_loss": -2.469452381134033, "eval_loss": 0.2916301488876343, "eval_mlm_loss": 0.2916301488876343, "eval_runtime": 62.1214, "eval_samples_per_second": 1124.718, "eval_steps_per_second": 0.563, "step": 17300 }, { "ep_loss": 0.0, "epoch": 13.35, "learning_rate": 0.0002705402010050251, "loss": 0.3103, "mlm_loss": 0.3103, "step": 17310 }, { "ep_loss": 0.0, "epoch": 13.36, "learning_rate": 0.00027022613065326635, "loss": 0.3098, "mlm_loss": 0.3098, "step": 17320 }, { "ep_loss": 0.0, "epoch": 13.37, "learning_rate": 0.0002699120603015076, "loss": 0.3068, "mlm_loss": 0.3068, "step": 17330 }, { "ep_loss": 0.0, "epoch": 13.37, "learning_rate": 0.00026959798994974875, "loss": 0.3121, "mlm_loss": 0.3121, "step": 17340 }, { "ep_loss": 0.0, "epoch": 13.38, "learning_rate": 0.00026928391959799, "loss": 0.3136, "mlm_loss": 0.3136, "step": 17350 }, { "ep_loss": 0.0, "epoch": 13.39, "learning_rate": 0.00026896984924623115, "loss": 0.3065, "mlm_loss": 0.3065, "step": 17360 }, { "ep_loss": 0.0, "epoch": 13.4, "learning_rate": 0.0002686557788944723, "loss": 0.3091, "mlm_loss": 0.3091, "step": 17370 }, { "ep_loss": 0.0, "epoch": 13.41, "learning_rate": 0.00026834170854271355, "loss": 0.3121, "mlm_loss": 0.3121, "step": 17380 }, { "ep_loss": 0.0, "epoch": 13.41, "learning_rate": 0.0002680276381909548, "loss": 0.3048, "mlm_loss": 0.3048, "step": 17390 }, { "ep_loss": 0.0, "epoch": 13.42, "learning_rate": 0.00026771356783919596, "loss": 0.3055, "mlm_loss": 0.3055, "step": 17400 }, { "epoch": 13.42, "eval_ep_loss": -2.2017784118652344, "eval_loss": 0.29138830304145813, "eval_mlm_loss": 0.29138830304145813, "eval_runtime": 62.7279, "eval_samples_per_second": 1113.843, "eval_steps_per_second": 0.558, "step": 17400 }, { "ep_loss": 0.0, "epoch": 13.43, "learning_rate": 0.0002673994974874372, "loss": 0.3153, "mlm_loss": 0.3153, "step": 17410 }, { "ep_loss": 0.0, "epoch": 13.44, "learning_rate": 0.0002670854271356784, "loss": 0.3087, "mlm_loss": 0.3087, "step": 17420 }, { "ep_loss": 0.0, "epoch": 13.44, "learning_rate": 0.00026677135678391964, "loss": 0.3082, "mlm_loss": 0.3082, "step": 17430 }, { "ep_loss": 0.0, "epoch": 13.45, "learning_rate": 0.0002664572864321608, "loss": 0.3119, "mlm_loss": 0.3119, "step": 17440 }, { "ep_loss": 0.0, "epoch": 13.46, "learning_rate": 0.00026614321608040204, "loss": 0.3083, "mlm_loss": 0.3083, "step": 17450 }, { "ep_loss": 0.0, "epoch": 13.47, "learning_rate": 0.0002658291457286432, "loss": 0.3077, "mlm_loss": 0.3077, "step": 17460 }, { "ep_loss": 0.0, "epoch": 13.47, "learning_rate": 0.0002655150753768844, "loss": 0.3129, "mlm_loss": 0.3129, "step": 17470 }, { "ep_loss": 0.0, "epoch": 13.48, "learning_rate": 0.0002652010050251256, "loss": 0.3078, "mlm_loss": 0.3078, "step": 17480 }, { "ep_loss": 0.0, "epoch": 13.49, "learning_rate": 0.00026488693467336685, "loss": 0.3105, "mlm_loss": 0.3105, "step": 17490 }, { "ep_loss": 0.0, "epoch": 13.5, "learning_rate": 0.00026460427135678393, "loss": 0.3065, "mlm_loss": 0.3065, "step": 17500 }, { "epoch": 13.5, "eval_ep_loss": -2.5960159301757812, "eval_loss": 0.2920810580253601, "eval_mlm_loss": 0.2920810580253601, "eval_runtime": 61.7515, "eval_samples_per_second": 1131.455, "eval_steps_per_second": 0.567, "step": 17500 }, { "ep_loss": 0.0, "epoch": 13.51, "learning_rate": 0.00026429020100502516, "loss": 0.3095, "mlm_loss": 0.3095, "step": 17510 }, { "ep_loss": 0.0, "epoch": 13.51, "learning_rate": 0.00026397613065326633, "loss": 0.3073, "mlm_loss": 0.3073, "step": 17520 }, { "ep_loss": 0.0, "epoch": 13.52, "learning_rate": 0.0002636620603015075, "loss": 0.3028, "mlm_loss": 0.3028, "step": 17530 }, { "ep_loss": 0.0, "epoch": 13.53, "learning_rate": 0.00026334798994974873, "loss": 0.3029, "mlm_loss": 0.3029, "step": 17540 }, { "ep_loss": 0.0, "epoch": 13.54, "learning_rate": 0.00026303391959798996, "loss": 0.311, "mlm_loss": 0.311, "step": 17550 }, { "ep_loss": 0.0, "epoch": 13.54, "learning_rate": 0.00026271984924623113, "loss": 0.3038, "mlm_loss": 0.3038, "step": 17560 }, { "ep_loss": 0.0, "epoch": 13.55, "learning_rate": 0.00026240577889447236, "loss": 0.306, "mlm_loss": 0.306, "step": 17570 }, { "ep_loss": 0.0, "epoch": 13.56, "learning_rate": 0.0002620917085427136, "loss": 0.3034, "mlm_loss": 0.3034, "step": 17580 }, { "ep_loss": 0.0, "epoch": 13.57, "learning_rate": 0.0002617776381909548, "loss": 0.3067, "mlm_loss": 0.3067, "step": 17590 }, { "ep_loss": 0.0, "epoch": 13.58, "learning_rate": 0.000261463567839196, "loss": 0.3045, "mlm_loss": 0.3045, "step": 17600 }, { "epoch": 13.58, "eval_ep_loss": -2.6744961738586426, "eval_loss": 0.29085445404052734, "eval_mlm_loss": 0.29085445404052734, "eval_runtime": 62.86, "eval_samples_per_second": 1111.501, "eval_steps_per_second": 0.557, "step": 17600 }, { "ep_loss": 0.0, "epoch": 13.58, "learning_rate": 0.0002611494974874372, "loss": 0.3031, "mlm_loss": 0.3031, "step": 17610 }, { "ep_loss": 0.0, "epoch": 13.59, "learning_rate": 0.0002608354271356784, "loss": 0.3073, "mlm_loss": 0.3073, "step": 17620 }, { "ep_loss": 0.0, "epoch": 13.6, "learning_rate": 0.00026052135678391957, "loss": 0.3032, "mlm_loss": 0.3032, "step": 17630 }, { "ep_loss": 0.0, "epoch": 13.61, "learning_rate": 0.0002602072864321608, "loss": 0.3095, "mlm_loss": 0.3095, "step": 17640 }, { "ep_loss": 0.0, "epoch": 13.61, "learning_rate": 0.000259893216080402, "loss": 0.3087, "mlm_loss": 0.3087, "step": 17650 }, { "ep_loss": 0.0, "epoch": 13.62, "learning_rate": 0.0002595791457286432, "loss": 0.3051, "mlm_loss": 0.3051, "step": 17660 }, { "ep_loss": 0.0, "epoch": 13.63, "learning_rate": 0.0002592650753768844, "loss": 0.3022, "mlm_loss": 0.3022, "step": 17670 }, { "ep_loss": 0.0, "epoch": 13.64, "learning_rate": 0.00025895100502512566, "loss": 0.3001, "mlm_loss": 0.3001, "step": 17680 }, { "ep_loss": 0.0, "epoch": 13.64, "learning_rate": 0.00025863693467336683, "loss": 0.3067, "mlm_loss": 0.3067, "step": 17690 }, { "ep_loss": 0.0, "epoch": 13.65, "learning_rate": 0.00025832286432160806, "loss": 0.3094, "mlm_loss": 0.3094, "step": 17700 }, { "epoch": 13.65, "eval_ep_loss": -2.679147958755493, "eval_loss": 0.2893053889274597, "eval_mlm_loss": 0.2893053889274597, "eval_runtime": 60.2876, "eval_samples_per_second": 1158.928, "eval_steps_per_second": 0.581, "step": 17700 }, { "ep_loss": 0.0, "epoch": 13.66, "learning_rate": 0.0002580087939698493, "loss": 0.3084, "mlm_loss": 0.3084, "step": 17710 }, { "ep_loss": 0.0, "epoch": 13.67, "learning_rate": 0.0002576947236180904, "loss": 0.3066, "mlm_loss": 0.3066, "step": 17720 }, { "ep_loss": 0.0, "epoch": 13.68, "learning_rate": 0.00025738065326633163, "loss": 0.3087, "mlm_loss": 0.3087, "step": 17730 }, { "ep_loss": 0.0, "epoch": 13.68, "learning_rate": 0.00025706658291457286, "loss": 0.3045, "mlm_loss": 0.3045, "step": 17740 }, { "ep_loss": 0.0, "epoch": 13.69, "learning_rate": 0.0002567525125628141, "loss": 0.3043, "mlm_loss": 0.3043, "step": 17750 }, { "ep_loss": 0.0, "epoch": 13.7, "learning_rate": 0.00025643844221105526, "loss": 0.3035, "mlm_loss": 0.3035, "step": 17760 }, { "ep_loss": 0.0, "epoch": 13.71, "learning_rate": 0.0002561243718592965, "loss": 0.3071, "mlm_loss": 0.3071, "step": 17770 }, { "ep_loss": 0.0, "epoch": 13.71, "learning_rate": 0.0002558103015075377, "loss": 0.3083, "mlm_loss": 0.3083, "step": 17780 }, { "ep_loss": 0.0, "epoch": 13.72, "learning_rate": 0.0002554962311557789, "loss": 0.3082, "mlm_loss": 0.3082, "step": 17790 }, { "ep_loss": 0.0, "epoch": 13.73, "learning_rate": 0.0002551821608040201, "loss": 0.3055, "mlm_loss": 0.3055, "step": 17800 }, { "epoch": 13.73, "eval_ep_loss": -2.5126798152923584, "eval_loss": 0.28990474343299866, "eval_mlm_loss": 0.28990474343299866, "eval_runtime": 60.9715, "eval_samples_per_second": 1145.928, "eval_steps_per_second": 0.574, "step": 17800 }, { "ep_loss": 0.0, "epoch": 13.74, "learning_rate": 0.00025486809045226135, "loss": 0.3067, "mlm_loss": 0.3067, "step": 17810 }, { "ep_loss": 0.0, "epoch": 13.74, "learning_rate": 0.00025455402010050247, "loss": 0.3096, "mlm_loss": 0.3096, "step": 17820 }, { "ep_loss": 0.0, "epoch": 13.75, "learning_rate": 0.0002542399497487437, "loss": 0.3064, "mlm_loss": 0.3064, "step": 17830 }, { "ep_loss": 0.0, "epoch": 13.76, "learning_rate": 0.0002539258793969849, "loss": 0.3009, "mlm_loss": 0.3009, "step": 17840 }, { "ep_loss": 0.0, "epoch": 13.77, "learning_rate": 0.00025361180904522615, "loss": 0.3049, "mlm_loss": 0.3049, "step": 17850 }, { "ep_loss": 0.0, "epoch": 13.78, "learning_rate": 0.00025329773869346733, "loss": 0.3102, "mlm_loss": 0.3102, "step": 17860 }, { "ep_loss": 0.0, "epoch": 13.78, "learning_rate": 0.00025298366834170856, "loss": 0.3072, "mlm_loss": 0.3072, "step": 17870 }, { "ep_loss": 0.0, "epoch": 13.79, "learning_rate": 0.0002526695979899498, "loss": 0.2998, "mlm_loss": 0.2998, "step": 17880 }, { "ep_loss": 0.0, "epoch": 13.8, "learning_rate": 0.00025235552763819096, "loss": 0.3057, "mlm_loss": 0.3057, "step": 17890 }, { "ep_loss": 0.0, "epoch": 13.81, "learning_rate": 0.0002520414572864322, "loss": 0.3036, "mlm_loss": 0.3036, "step": 17900 }, { "epoch": 13.81, "eval_ep_loss": -2.5039594173431396, "eval_loss": 0.28744441270828247, "eval_mlm_loss": 0.28744441270828247, "eval_runtime": 60.8163, "eval_samples_per_second": 1148.853, "eval_steps_per_second": 0.576, "step": 17900 }, { "ep_loss": 0.0, "epoch": 13.81, "learning_rate": 0.0002517273869346734, "loss": 0.303, "mlm_loss": 0.303, "step": 17910 }, { "ep_loss": 0.0, "epoch": 13.82, "learning_rate": 0.00025141331658291453, "loss": 0.3072, "mlm_loss": 0.3072, "step": 17920 }, { "ep_loss": 0.0, "epoch": 13.83, "learning_rate": 0.00025109924623115576, "loss": 0.3035, "mlm_loss": 0.3035, "step": 17930 }, { "ep_loss": 0.0, "epoch": 13.84, "learning_rate": 0.000250785175879397, "loss": 0.3014, "mlm_loss": 0.3014, "step": 17940 }, { "ep_loss": 0.0, "epoch": 13.84, "learning_rate": 0.00025047110552763816, "loss": 0.2989, "mlm_loss": 0.2989, "step": 17950 }, { "ep_loss": 0.0, "epoch": 13.85, "learning_rate": 0.0002501570351758794, "loss": 0.3079, "mlm_loss": 0.3079, "step": 17960 }, { "ep_loss": 0.0, "epoch": 13.86, "learning_rate": 0.0002498429648241206, "loss": 0.3044, "mlm_loss": 0.3044, "step": 17970 }, { "ep_loss": 0.0, "epoch": 13.87, "learning_rate": 0.00024952889447236185, "loss": 0.3006, "mlm_loss": 0.3006, "step": 17980 }, { "ep_loss": 0.0, "epoch": 13.88, "learning_rate": 0.000249214824120603, "loss": 0.303, "mlm_loss": 0.303, "step": 17990 }, { "ep_loss": 0.0, "epoch": 13.88, "learning_rate": 0.0002489007537688442, "loss": 0.3096, "mlm_loss": 0.3096, "step": 18000 }, { "epoch": 13.88, "eval_ep_loss": -2.4838016033172607, "eval_loss": 0.2871261239051819, "eval_mlm_loss": 0.2871261239051819, "eval_runtime": 63.3763, "eval_samples_per_second": 1102.447, "eval_steps_per_second": 0.552, "step": 18000 }, { "ep_loss": 0.0, "epoch": 13.89, "learning_rate": 0.0002485866834170854, "loss": 0.3017, "mlm_loss": 0.3017, "step": 18010 }, { "ep_loss": 0.0, "epoch": 13.9, "learning_rate": 0.00024827261306532665, "loss": 0.3039, "mlm_loss": 0.3039, "step": 18020 }, { "ep_loss": 0.0, "epoch": 13.91, "learning_rate": 0.0002479585427135679, "loss": 0.3055, "mlm_loss": 0.3055, "step": 18030 }, { "ep_loss": 0.0, "epoch": 13.91, "learning_rate": 0.00024764447236180905, "loss": 0.3092, "mlm_loss": 0.3092, "step": 18040 }, { "ep_loss": 0.0, "epoch": 13.92, "learning_rate": 0.00024733040201005023, "loss": 0.3021, "mlm_loss": 0.3021, "step": 18050 }, { "ep_loss": 0.0, "epoch": 13.93, "learning_rate": 0.00024701633165829146, "loss": 0.2973, "mlm_loss": 0.2973, "step": 18060 }, { "ep_loss": 0.0, "epoch": 13.94, "learning_rate": 0.0002467022613065327, "loss": 0.314, "mlm_loss": 0.314, "step": 18070 }, { "ep_loss": 0.0, "epoch": 13.95, "learning_rate": 0.00024638819095477386, "loss": 0.3077, "mlm_loss": 0.3077, "step": 18080 }, { "ep_loss": 0.0, "epoch": 13.95, "learning_rate": 0.0002460741206030151, "loss": 0.3013, "mlm_loss": 0.3013, "step": 18090 }, { "ep_loss": 0.0, "epoch": 13.96, "learning_rate": 0.00024576005025125626, "loss": 0.3059, "mlm_loss": 0.3059, "step": 18100 }, { "epoch": 13.96, "eval_ep_loss": -2.5012893676757812, "eval_loss": 0.2863268256187439, "eval_mlm_loss": 0.2863268256187439, "eval_runtime": 60.402, "eval_samples_per_second": 1156.733, "eval_steps_per_second": 0.579, "step": 18100 }, { "ep_loss": 0.0, "epoch": 13.97, "learning_rate": 0.0002454459798994975, "loss": 0.3019, "mlm_loss": 0.3019, "step": 18110 }, { "ep_loss": 0.0, "epoch": 13.98, "learning_rate": 0.0002451319095477387, "loss": 0.3001, "mlm_loss": 0.3001, "step": 18120 }, { "ep_loss": 0.0, "epoch": 13.98, "learning_rate": 0.0002448178391959799, "loss": 0.3056, "mlm_loss": 0.3056, "step": 18130 }, { "ep_loss": 0.0, "epoch": 13.99, "learning_rate": 0.0002445037688442211, "loss": 0.2982, "mlm_loss": 0.2982, "step": 18140 }, { "ep_loss": 0.0, "epoch": 14.0, "learning_rate": 0.0002441896984924623, "loss": 0.3026, "mlm_loss": 0.3026, "step": 18150 }, { "ep_loss": 0.0, "epoch": 14.01, "learning_rate": 0.00024387562814070352, "loss": 0.3017, "mlm_loss": 0.3017, "step": 18160 }, { "ep_loss": 0.0, "epoch": 14.01, "learning_rate": 0.00024356155778894472, "loss": 0.2965, "mlm_loss": 0.2965, "step": 18170 }, { "ep_loss": 0.0, "epoch": 14.02, "learning_rate": 0.00024324748743718595, "loss": 0.3046, "mlm_loss": 0.3046, "step": 18180 }, { "ep_loss": 0.0, "epoch": 14.03, "learning_rate": 0.00024293341708542712, "loss": 0.2986, "mlm_loss": 0.2986, "step": 18190 }, { "ep_loss": 0.0, "epoch": 14.04, "learning_rate": 0.00024261934673366835, "loss": 0.2951, "mlm_loss": 0.2951, "step": 18200 }, { "epoch": 14.04, "eval_ep_loss": -2.689553737640381, "eval_loss": 0.2829967737197876, "eval_mlm_loss": 0.2829967737197876, "eval_runtime": 60.3157, "eval_samples_per_second": 1158.389, "eval_steps_per_second": 0.58, "step": 18200 }, { "ep_loss": 0.0, "epoch": 14.05, "learning_rate": 0.00024230527638190955, "loss": 0.2984, "mlm_loss": 0.2984, "step": 18210 }, { "ep_loss": 0.0, "epoch": 14.05, "learning_rate": 0.00024199120603015075, "loss": 0.2985, "mlm_loss": 0.2985, "step": 18220 }, { "ep_loss": 0.0, "epoch": 14.06, "learning_rate": 0.00024167713567839198, "loss": 0.2964, "mlm_loss": 0.2964, "step": 18230 }, { "ep_loss": 0.0, "epoch": 14.07, "learning_rate": 0.00024136306532663315, "loss": 0.2988, "mlm_loss": 0.2988, "step": 18240 }, { "ep_loss": 0.0, "epoch": 14.08, "learning_rate": 0.00024104899497487438, "loss": 0.2969, "mlm_loss": 0.2969, "step": 18250 }, { "ep_loss": 0.0, "epoch": 14.08, "learning_rate": 0.00024073492462311558, "loss": 0.3067, "mlm_loss": 0.3067, "step": 18260 }, { "ep_loss": 0.0, "epoch": 14.09, "learning_rate": 0.00024042085427135678, "loss": 0.2994, "mlm_loss": 0.2994, "step": 18270 }, { "ep_loss": 0.0, "epoch": 14.1, "learning_rate": 0.000240106783919598, "loss": 0.2913, "mlm_loss": 0.2913, "step": 18280 }, { "ep_loss": 0.0, "epoch": 14.11, "learning_rate": 0.00023979271356783919, "loss": 0.305, "mlm_loss": 0.305, "step": 18290 }, { "ep_loss": 0.0, "epoch": 14.11, "learning_rate": 0.0002394786432160804, "loss": 0.3012, "mlm_loss": 0.3012, "step": 18300 }, { "epoch": 14.11, "eval_ep_loss": -2.6690220832824707, "eval_loss": 0.28195154666900635, "eval_mlm_loss": 0.28195154666900635, "eval_runtime": 60.2432, "eval_samples_per_second": 1159.782, "eval_steps_per_second": 0.581, "step": 18300 }, { "ep_loss": 0.0, "epoch": 14.12, "learning_rate": 0.00023916457286432162, "loss": 0.3015, "mlm_loss": 0.3015, "step": 18310 }, { "ep_loss": 0.0, "epoch": 14.13, "learning_rate": 0.00023885050251256282, "loss": 0.2992, "mlm_loss": 0.2992, "step": 18320 }, { "ep_loss": 0.0, "epoch": 14.14, "learning_rate": 0.00023853643216080404, "loss": 0.2981, "mlm_loss": 0.2981, "step": 18330 }, { "ep_loss": 0.0, "epoch": 14.15, "learning_rate": 0.00023822236180904522, "loss": 0.3015, "mlm_loss": 0.3015, "step": 18340 }, { "ep_loss": 0.0, "epoch": 14.15, "learning_rate": 0.00023790829145728642, "loss": 0.3008, "mlm_loss": 0.3008, "step": 18350 }, { "ep_loss": 0.0, "epoch": 14.16, "learning_rate": 0.00023759422110552765, "loss": 0.2992, "mlm_loss": 0.2992, "step": 18360 }, { "ep_loss": 0.0, "epoch": 14.17, "learning_rate": 0.00023728015075376885, "loss": 0.2953, "mlm_loss": 0.2953, "step": 18370 }, { "ep_loss": 0.0, "epoch": 14.18, "learning_rate": 0.00023696608040201008, "loss": 0.3035, "mlm_loss": 0.3035, "step": 18380 }, { "ep_loss": 0.0, "epoch": 14.18, "learning_rate": 0.00023665201005025125, "loss": 0.3016, "mlm_loss": 0.3016, "step": 18390 }, { "ep_loss": 0.0, "epoch": 14.19, "learning_rate": 0.00023633793969849245, "loss": 0.2952, "mlm_loss": 0.2952, "step": 18400 }, { "epoch": 14.19, "eval_ep_loss": -2.827725887298584, "eval_loss": 0.28363582491874695, "eval_mlm_loss": 0.28363582491874695, "eval_runtime": 62.7901, "eval_samples_per_second": 1112.739, "eval_steps_per_second": 0.557, "step": 18400 }, { "ep_loss": 0.0, "epoch": 14.2, "learning_rate": 0.00023602386934673368, "loss": 0.2976, "mlm_loss": 0.2976, "step": 18410 }, { "ep_loss": 0.0, "epoch": 14.21, "learning_rate": 0.00023570979899497488, "loss": 0.2976, "mlm_loss": 0.2976, "step": 18420 }, { "ep_loss": 0.0, "epoch": 14.22, "learning_rate": 0.0002353957286432161, "loss": 0.2977, "mlm_loss": 0.2977, "step": 18430 }, { "ep_loss": 0.0, "epoch": 14.22, "learning_rate": 0.00023508165829145728, "loss": 0.2985, "mlm_loss": 0.2985, "step": 18440 }, { "ep_loss": 0.0, "epoch": 14.23, "learning_rate": 0.00023476758793969848, "loss": 0.2974, "mlm_loss": 0.2974, "step": 18450 }, { "ep_loss": 0.0, "epoch": 14.24, "learning_rate": 0.0002344535175879397, "loss": 0.2969, "mlm_loss": 0.2969, "step": 18460 }, { "ep_loss": 0.0, "epoch": 14.25, "learning_rate": 0.0002341394472361809, "loss": 0.3005, "mlm_loss": 0.3005, "step": 18470 }, { "ep_loss": 0.0, "epoch": 14.25, "learning_rate": 0.0002338253768844221, "loss": 0.3049, "mlm_loss": 0.3049, "step": 18480 }, { "ep_loss": 0.0, "epoch": 14.26, "learning_rate": 0.00023351130653266331, "loss": 0.2962, "mlm_loss": 0.2962, "step": 18490 }, { "ep_loss": 0.0, "epoch": 14.27, "learning_rate": 0.00023319723618090452, "loss": 0.2979, "mlm_loss": 0.2979, "step": 18500 }, { "epoch": 14.27, "eval_ep_loss": -2.5578811168670654, "eval_loss": 0.28398579359054565, "eval_mlm_loss": 0.28398579359054565, "eval_runtime": 60.3298, "eval_samples_per_second": 1158.118, "eval_steps_per_second": 0.58, "step": 18500 }, { "ep_loss": 0.0, "epoch": 14.28, "learning_rate": 0.00023288316582914574, "loss": 0.3, "mlm_loss": 0.3, "step": 18510 }, { "ep_loss": 0.0, "epoch": 14.28, "learning_rate": 0.00023256909547738694, "loss": 0.2999, "mlm_loss": 0.2999, "step": 18520 }, { "ep_loss": 0.0, "epoch": 14.29, "learning_rate": 0.00023225502512562815, "loss": 0.297, "mlm_loss": 0.297, "step": 18530 }, { "ep_loss": 0.0, "epoch": 14.3, "learning_rate": 0.00023194095477386935, "loss": 0.2958, "mlm_loss": 0.2958, "step": 18540 }, { "ep_loss": 0.0, "epoch": 14.31, "learning_rate": 0.00023162688442211055, "loss": 0.2977, "mlm_loss": 0.2977, "step": 18550 }, { "ep_loss": 0.0, "epoch": 14.32, "learning_rate": 0.00023131281407035178, "loss": 0.2997, "mlm_loss": 0.2997, "step": 18560 }, { "ep_loss": 0.0, "epoch": 14.32, "learning_rate": 0.00023099874371859298, "loss": 0.2964, "mlm_loss": 0.2964, "step": 18570 }, { "ep_loss": 0.0, "epoch": 14.33, "learning_rate": 0.00023068467336683418, "loss": 0.2953, "mlm_loss": 0.2953, "step": 18580 }, { "ep_loss": 0.0, "epoch": 14.34, "learning_rate": 0.00023037060301507538, "loss": 0.2948, "mlm_loss": 0.2948, "step": 18590 }, { "ep_loss": 0.0, "epoch": 14.35, "learning_rate": 0.00023005653266331658, "loss": 0.2986, "mlm_loss": 0.2986, "step": 18600 }, { "epoch": 14.35, "eval_ep_loss": -2.733670234680176, "eval_loss": 0.2843940854072571, "eval_mlm_loss": 0.2843940854072571, "eval_runtime": 60.4662, "eval_samples_per_second": 1155.505, "eval_steps_per_second": 0.579, "step": 18600 }, { "ep_loss": 0.0, "epoch": 14.35, "learning_rate": 0.00022974246231155778, "loss": 0.298, "mlm_loss": 0.298, "step": 18610 }, { "ep_loss": 0.0, "epoch": 14.36, "learning_rate": 0.000229428391959799, "loss": 0.3007, "mlm_loss": 0.3007, "step": 18620 }, { "ep_loss": 0.0, "epoch": 14.37, "learning_rate": 0.0002291143216080402, "loss": 0.3013, "mlm_loss": 0.3013, "step": 18630 }, { "ep_loss": 0.0, "epoch": 14.38, "learning_rate": 0.0002288002512562814, "loss": 0.2972, "mlm_loss": 0.2972, "step": 18640 }, { "ep_loss": 0.0, "epoch": 14.38, "learning_rate": 0.0002284861809045226, "loss": 0.2939, "mlm_loss": 0.2939, "step": 18650 }, { "ep_loss": 0.0, "epoch": 14.39, "learning_rate": 0.0002281721105527638, "loss": 0.2998, "mlm_loss": 0.2998, "step": 18660 }, { "ep_loss": 0.0, "epoch": 14.4, "learning_rate": 0.00022785804020100504, "loss": 0.291, "mlm_loss": 0.291, "step": 18670 }, { "ep_loss": 0.0, "epoch": 14.41, "learning_rate": 0.00022754396984924624, "loss": 0.2981, "mlm_loss": 0.2981, "step": 18680 }, { "ep_loss": 0.0, "epoch": 14.42, "learning_rate": 0.00022722989949748744, "loss": 0.2941, "mlm_loss": 0.2941, "step": 18690 }, { "ep_loss": 0.0, "epoch": 14.42, "learning_rate": 0.00022691582914572864, "loss": 0.2967, "mlm_loss": 0.2967, "step": 18700 }, { "epoch": 14.42, "eval_ep_loss": -2.5219922065734863, "eval_loss": 0.2811538577079773, "eval_mlm_loss": 0.2811538577079773, "eval_runtime": 61.1826, "eval_samples_per_second": 1141.975, "eval_steps_per_second": 0.572, "step": 18700 }, { "ep_loss": 0.0, "epoch": 14.43, "learning_rate": 0.00022660175879396984, "loss": 0.2982, "mlm_loss": 0.2982, "step": 18710 }, { "ep_loss": 0.0, "epoch": 14.44, "learning_rate": 0.00022628768844221107, "loss": 0.2967, "mlm_loss": 0.2967, "step": 18720 }, { "ep_loss": 0.0, "epoch": 14.45, "learning_rate": 0.00022597361809045227, "loss": 0.2931, "mlm_loss": 0.2931, "step": 18730 }, { "ep_loss": 0.0, "epoch": 14.45, "learning_rate": 0.00022565954773869345, "loss": 0.3003, "mlm_loss": 0.3003, "step": 18740 }, { "ep_loss": 0.0, "epoch": 14.46, "learning_rate": 0.00022534547738693468, "loss": 0.3006, "mlm_loss": 0.3006, "step": 18750 }, { "ep_loss": 0.0, "epoch": 14.47, "learning_rate": 0.00022503140703517588, "loss": 0.294, "mlm_loss": 0.294, "step": 18760 }, { "ep_loss": 0.0, "epoch": 14.48, "learning_rate": 0.0002247173366834171, "loss": 0.2954, "mlm_loss": 0.2954, "step": 18770 }, { "ep_loss": 0.0, "epoch": 14.49, "learning_rate": 0.0002244032663316583, "loss": 0.2945, "mlm_loss": 0.2945, "step": 18780 }, { "ep_loss": 0.0, "epoch": 14.49, "learning_rate": 0.00022408919597989948, "loss": 0.2994, "mlm_loss": 0.2994, "step": 18790 }, { "ep_loss": 0.0, "epoch": 14.5, "learning_rate": 0.0002237751256281407, "loss": 0.2968, "mlm_loss": 0.2968, "step": 18800 }, { "epoch": 14.5, "eval_ep_loss": -2.5969533920288086, "eval_loss": 0.2806580662727356, "eval_mlm_loss": 0.2806580662727356, "eval_runtime": 60.5377, "eval_samples_per_second": 1154.141, "eval_steps_per_second": 0.578, "step": 18800 }, { "ep_loss": 0.0, "epoch": 14.51, "learning_rate": 0.0002234610552763819, "loss": 0.2957, "mlm_loss": 0.2957, "step": 18810 }, { "ep_loss": 0.0, "epoch": 14.52, "learning_rate": 0.00022314698492462314, "loss": 0.2943, "mlm_loss": 0.2943, "step": 18820 }, { "ep_loss": 0.0, "epoch": 14.52, "learning_rate": 0.00022283291457286434, "loss": 0.3005, "mlm_loss": 0.3005, "step": 18830 }, { "ep_loss": 0.0, "epoch": 14.53, "learning_rate": 0.0002225188442211055, "loss": 0.2917, "mlm_loss": 0.2917, "step": 18840 }, { "ep_loss": 0.0, "epoch": 14.54, "learning_rate": 0.00022220477386934674, "loss": 0.2969, "mlm_loss": 0.2969, "step": 18850 }, { "ep_loss": 0.0, "epoch": 14.55, "learning_rate": 0.00022189070351758794, "loss": 0.2955, "mlm_loss": 0.2955, "step": 18860 }, { "ep_loss": 0.0, "epoch": 14.55, "learning_rate": 0.00022157663316582917, "loss": 0.2996, "mlm_loss": 0.2996, "step": 18870 }, { "ep_loss": 0.0, "epoch": 14.56, "learning_rate": 0.00022126256281407037, "loss": 0.3058, "mlm_loss": 0.3058, "step": 18880 }, { "ep_loss": 0.0, "epoch": 14.57, "learning_rate": 0.00022094849246231154, "loss": 0.2927, "mlm_loss": 0.2927, "step": 18890 }, { "ep_loss": 0.0, "epoch": 14.58, "learning_rate": 0.00022063442211055277, "loss": 0.2969, "mlm_loss": 0.2969, "step": 18900 }, { "epoch": 14.58, "eval_ep_loss": -2.575608015060425, "eval_loss": 0.28037095069885254, "eval_mlm_loss": 0.28037095069885254, "eval_runtime": 61.9019, "eval_samples_per_second": 1128.706, "eval_steps_per_second": 0.565, "step": 18900 }, { "ep_loss": 0.0, "epoch": 14.59, "learning_rate": 0.00022032035175879397, "loss": 0.2972, "mlm_loss": 0.2972, "step": 18910 }, { "ep_loss": 0.0, "epoch": 14.59, "learning_rate": 0.00022000628140703517, "loss": 0.2985, "mlm_loss": 0.2985, "step": 18920 }, { "ep_loss": 0.0, "epoch": 14.6, "learning_rate": 0.0002196922110552764, "loss": 0.2922, "mlm_loss": 0.2922, "step": 18930 }, { "ep_loss": 0.0, "epoch": 14.61, "learning_rate": 0.00021937814070351758, "loss": 0.2956, "mlm_loss": 0.2956, "step": 18940 }, { "ep_loss": 0.0, "epoch": 14.62, "learning_rate": 0.0002190640703517588, "loss": 0.293, "mlm_loss": 0.293, "step": 18950 }, { "ep_loss": 0.0, "epoch": 14.62, "learning_rate": 0.00021875, "loss": 0.2943, "mlm_loss": 0.2943, "step": 18960 }, { "ep_loss": 0.0, "epoch": 14.63, "learning_rate": 0.0002184359296482412, "loss": 0.293, "mlm_loss": 0.293, "step": 18970 }, { "ep_loss": 0.0, "epoch": 14.64, "learning_rate": 0.00021812185929648243, "loss": 0.3001, "mlm_loss": 0.3001, "step": 18980 }, { "ep_loss": 0.0, "epoch": 14.65, "learning_rate": 0.0002178077889447236, "loss": 0.2924, "mlm_loss": 0.2924, "step": 18990 }, { "ep_loss": 0.0, "epoch": 14.65, "learning_rate": 0.00021749371859296484, "loss": 0.2992, "mlm_loss": 0.2992, "step": 19000 }, { "epoch": 14.65, "eval_ep_loss": -2.6522469520568848, "eval_loss": 0.2777820825576782, "eval_mlm_loss": 0.2777820825576782, "eval_runtime": 60.7802, "eval_samples_per_second": 1149.536, "eval_steps_per_second": 0.576, "step": 19000 }, { "ep_loss": 0.0, "epoch": 14.66, "learning_rate": 0.00021717964824120604, "loss": 0.2939, "mlm_loss": 0.2939, "step": 19010 }, { "ep_loss": 0.0, "epoch": 14.67, "learning_rate": 0.00021686557788944724, "loss": 0.2979, "mlm_loss": 0.2979, "step": 19020 }, { "ep_loss": 0.0, "epoch": 14.68, "learning_rate": 0.00021655150753768847, "loss": 0.2941, "mlm_loss": 0.2941, "step": 19030 }, { "ep_loss": 0.0, "epoch": 14.69, "learning_rate": 0.00021623743718592964, "loss": 0.3014, "mlm_loss": 0.3014, "step": 19040 }, { "ep_loss": 0.0, "epoch": 14.69, "learning_rate": 0.00021592336683417084, "loss": 0.3024, "mlm_loss": 0.3024, "step": 19050 }, { "ep_loss": 0.0, "epoch": 14.7, "learning_rate": 0.00021560929648241207, "loss": 0.2957, "mlm_loss": 0.2957, "step": 19060 }, { "ep_loss": 0.0, "epoch": 14.71, "learning_rate": 0.00021529522613065327, "loss": 0.3011, "mlm_loss": 0.3011, "step": 19070 }, { "ep_loss": 0.0, "epoch": 14.72, "learning_rate": 0.0002149811557788945, "loss": 0.2979, "mlm_loss": 0.2979, "step": 19080 }, { "ep_loss": 0.0, "epoch": 14.72, "learning_rate": 0.00021466708542713567, "loss": 0.2951, "mlm_loss": 0.2951, "step": 19090 }, { "ep_loss": 0.0, "epoch": 14.73, "learning_rate": 0.00021435301507537687, "loss": 0.2898, "mlm_loss": 0.2898, "step": 19100 }, { "epoch": 14.73, "eval_ep_loss": -2.657452344894409, "eval_loss": 0.27940675616264343, "eval_mlm_loss": 0.27940675616264343, "eval_runtime": 62.1141, "eval_samples_per_second": 1124.849, "eval_steps_per_second": 0.563, "step": 19100 }, { "ep_loss": 0.0, "epoch": 14.74, "learning_rate": 0.0002140389447236181, "loss": 0.2884, "mlm_loss": 0.2884, "step": 19110 }, { "ep_loss": 0.0, "epoch": 14.75, "learning_rate": 0.0002137248743718593, "loss": 0.2946, "mlm_loss": 0.2946, "step": 19120 }, { "ep_loss": 0.0, "epoch": 14.76, "learning_rate": 0.00021341080402010053, "loss": 0.2966, "mlm_loss": 0.2966, "step": 19130 }, { "ep_loss": 0.0, "epoch": 14.76, "learning_rate": 0.0002130967336683417, "loss": 0.2902, "mlm_loss": 0.2902, "step": 19140 }, { "ep_loss": 0.0, "epoch": 14.77, "learning_rate": 0.0002127826633165829, "loss": 0.2988, "mlm_loss": 0.2988, "step": 19150 }, { "ep_loss": 0.0, "epoch": 14.78, "learning_rate": 0.00021246859296482413, "loss": 0.2917, "mlm_loss": 0.2917, "step": 19160 }, { "ep_loss": 0.0, "epoch": 14.79, "learning_rate": 0.00021215452261306533, "loss": 0.2938, "mlm_loss": 0.2938, "step": 19170 }, { "ep_loss": 0.0, "epoch": 14.79, "learning_rate": 0.00021184045226130656, "loss": 0.2886, "mlm_loss": 0.2886, "step": 19180 }, { "ep_loss": 0.0, "epoch": 14.8, "learning_rate": 0.00021152638190954774, "loss": 0.2889, "mlm_loss": 0.2889, "step": 19190 }, { "ep_loss": 0.0, "epoch": 14.81, "learning_rate": 0.00021121231155778894, "loss": 0.3002, "mlm_loss": 0.3002, "step": 19200 }, { "epoch": 14.81, "eval_ep_loss": -2.6479761600494385, "eval_loss": 0.27770867943763733, "eval_mlm_loss": 0.27770867943763733, "eval_runtime": 62.8162, "eval_samples_per_second": 1112.277, "eval_steps_per_second": 0.557, "step": 19200 }, { "ep_loss": 0.0, "epoch": 14.82, "learning_rate": 0.00021089824120603016, "loss": 0.295, "mlm_loss": 0.295, "step": 19210 }, { "ep_loss": 0.0, "epoch": 14.82, "learning_rate": 0.00021058417085427137, "loss": 0.2878, "mlm_loss": 0.2878, "step": 19220 }, { "ep_loss": 0.0, "epoch": 14.83, "learning_rate": 0.00021027010050251257, "loss": 0.2935, "mlm_loss": 0.2935, "step": 19230 }, { "ep_loss": 0.0, "epoch": 14.84, "learning_rate": 0.00020995603015075377, "loss": 0.2907, "mlm_loss": 0.2907, "step": 19240 }, { "ep_loss": 0.0, "epoch": 14.85, "learning_rate": 0.00020964195979899497, "loss": 0.2993, "mlm_loss": 0.2993, "step": 19250 }, { "ep_loss": 0.0, "epoch": 14.86, "learning_rate": 0.0002093278894472362, "loss": 0.3011, "mlm_loss": 0.3011, "step": 19260 }, { "ep_loss": 0.0, "epoch": 14.86, "learning_rate": 0.0002090138190954774, "loss": 0.2914, "mlm_loss": 0.2914, "step": 19270 }, { "ep_loss": 0.0, "epoch": 14.87, "learning_rate": 0.0002086997487437186, "loss": 0.2883, "mlm_loss": 0.2883, "step": 19280 }, { "ep_loss": 0.0, "epoch": 14.88, "learning_rate": 0.0002083856783919598, "loss": 0.2938, "mlm_loss": 0.2938, "step": 19290 }, { "ep_loss": 0.0, "epoch": 14.89, "learning_rate": 0.000208071608040201, "loss": 0.2974, "mlm_loss": 0.2974, "step": 19300 }, { "epoch": 14.89, "eval_ep_loss": -2.7994658946990967, "eval_loss": 0.2777378559112549, "eval_mlm_loss": 0.2777378559112549, "eval_runtime": 61.2539, "eval_samples_per_second": 1140.645, "eval_steps_per_second": 0.571, "step": 19300 }, { "ep_loss": 0.0, "epoch": 14.89, "learning_rate": 0.00020775753768844223, "loss": 0.2969, "mlm_loss": 0.2969, "step": 19310 }, { "ep_loss": 0.0, "epoch": 14.9, "learning_rate": 0.00020744346733668343, "loss": 0.2933, "mlm_loss": 0.2933, "step": 19320 }, { "ep_loss": 0.0, "epoch": 14.91, "learning_rate": 0.00020712939698492463, "loss": 0.2938, "mlm_loss": 0.2938, "step": 19330 }, { "ep_loss": 0.0, "epoch": 14.92, "learning_rate": 0.00020681532663316583, "loss": 0.2897, "mlm_loss": 0.2897, "step": 19340 }, { "ep_loss": 0.0, "epoch": 14.92, "learning_rate": 0.00020650125628140703, "loss": 0.2967, "mlm_loss": 0.2967, "step": 19350 }, { "ep_loss": 0.0, "epoch": 14.93, "learning_rate": 0.00020618718592964823, "loss": 0.287, "mlm_loss": 0.287, "step": 19360 }, { "ep_loss": 0.0, "epoch": 14.94, "learning_rate": 0.00020587311557788946, "loss": 0.2926, "mlm_loss": 0.2926, "step": 19370 }, { "ep_loss": 0.0, "epoch": 14.95, "learning_rate": 0.00020555904522613066, "loss": 0.2938, "mlm_loss": 0.2938, "step": 19380 }, { "ep_loss": 0.0, "epoch": 14.96, "learning_rate": 0.00020524497487437186, "loss": 0.2929, "mlm_loss": 0.2929, "step": 19390 }, { "ep_loss": 0.0, "epoch": 14.96, "learning_rate": 0.00020493090452261306, "loss": 0.2843, "mlm_loss": 0.2843, "step": 19400 }, { "epoch": 14.96, "eval_ep_loss": -2.575699806213379, "eval_loss": 0.2762228548526764, "eval_mlm_loss": 0.2762228548526764, "eval_runtime": 60.3213, "eval_samples_per_second": 1158.281, "eval_steps_per_second": 0.58, "step": 19400 }, { "ep_loss": 0.0, "epoch": 14.97, "learning_rate": 0.00020461683417085427, "loss": 0.2943, "mlm_loss": 0.2943, "step": 19410 }, { "ep_loss": 0.0, "epoch": 14.98, "learning_rate": 0.0002043027638190955, "loss": 0.2979, "mlm_loss": 0.2979, "step": 19420 }, { "ep_loss": 0.0, "epoch": 14.99, "learning_rate": 0.0002039886934673367, "loss": 0.2904, "mlm_loss": 0.2904, "step": 19430 }, { "ep_loss": 0.0, "epoch": 14.99, "learning_rate": 0.0002036746231155779, "loss": 0.288, "mlm_loss": 0.288, "step": 19440 }, { "ep_loss": 0.0, "epoch": 15.0, "learning_rate": 0.0002033605527638191, "loss": 0.2885, "mlm_loss": 0.2885, "step": 19450 }, { "ep_loss": 0.0, "epoch": 15.01, "learning_rate": 0.0002030464824120603, "loss": 0.2832, "mlm_loss": 0.2832, "step": 19460 }, { "ep_loss": 0.0, "epoch": 15.02, "learning_rate": 0.00020273241206030153, "loss": 0.2894, "mlm_loss": 0.2894, "step": 19470 }, { "ep_loss": 0.0, "epoch": 15.03, "learning_rate": 0.00020241834170854273, "loss": 0.2915, "mlm_loss": 0.2915, "step": 19480 }, { "ep_loss": 0.0, "epoch": 15.03, "learning_rate": 0.0002021042713567839, "loss": 0.2855, "mlm_loss": 0.2855, "step": 19490 }, { "ep_loss": 0.0, "epoch": 15.04, "learning_rate": 0.00020179020100502513, "loss": 0.2892, "mlm_loss": 0.2892, "step": 19500 }, { "epoch": 15.04, "eval_ep_loss": -2.560924768447876, "eval_loss": 0.27621495723724365, "eval_mlm_loss": 0.27621495723724365, "eval_runtime": 60.2208, "eval_samples_per_second": 1160.214, "eval_steps_per_second": 0.581, "step": 19500 }, { "ep_loss": 0.0, "epoch": 15.05, "learning_rate": 0.00020147613065326633, "loss": 0.2912, "mlm_loss": 0.2912, "step": 19510 }, { "ep_loss": 0.0, "epoch": 15.06, "learning_rate": 0.00020116206030150756, "loss": 0.2862, "mlm_loss": 0.2862, "step": 19520 }, { "ep_loss": 0.0, "epoch": 15.06, "learning_rate": 0.00020084798994974876, "loss": 0.2873, "mlm_loss": 0.2873, "step": 19530 }, { "ep_loss": 0.0, "epoch": 15.07, "learning_rate": 0.00020053391959798993, "loss": 0.2935, "mlm_loss": 0.2935, "step": 19540 }, { "ep_loss": 0.0, "epoch": 15.08, "learning_rate": 0.00020021984924623116, "loss": 0.2937, "mlm_loss": 0.2937, "step": 19550 }, { "ep_loss": 0.0, "epoch": 15.09, "learning_rate": 0.00019990577889447236, "loss": 0.2923, "mlm_loss": 0.2923, "step": 19560 }, { "ep_loss": 0.0, "epoch": 15.09, "learning_rate": 0.0001995917085427136, "loss": 0.2855, "mlm_loss": 0.2855, "step": 19570 }, { "ep_loss": 0.0, "epoch": 15.1, "learning_rate": 0.0001992776381909548, "loss": 0.2922, "mlm_loss": 0.2922, "step": 19580 }, { "ep_loss": 0.0, "epoch": 15.11, "learning_rate": 0.00019896356783919596, "loss": 0.2916, "mlm_loss": 0.2916, "step": 19590 }, { "ep_loss": 0.0, "epoch": 15.12, "learning_rate": 0.0001986494974874372, "loss": 0.2892, "mlm_loss": 0.2892, "step": 19600 }, { "epoch": 15.12, "eval_ep_loss": -2.657994508743286, "eval_loss": 0.27407097816467285, "eval_mlm_loss": 0.27407097816467285, "eval_runtime": 60.784, "eval_samples_per_second": 1149.463, "eval_steps_per_second": 0.576, "step": 19600 }, { "ep_loss": 0.0, "epoch": 15.13, "learning_rate": 0.0001983354271356784, "loss": 0.2876, "mlm_loss": 0.2876, "step": 19610 }, { "ep_loss": 0.0, "epoch": 15.13, "learning_rate": 0.00019802135678391962, "loss": 0.2897, "mlm_loss": 0.2897, "step": 19620 }, { "ep_loss": 0.0, "epoch": 15.14, "learning_rate": 0.00019770728643216082, "loss": 0.2864, "mlm_loss": 0.2864, "step": 19630 }, { "ep_loss": 0.0, "epoch": 15.15, "learning_rate": 0.000197393216080402, "loss": 0.2854, "mlm_loss": 0.2854, "step": 19640 }, { "ep_loss": 0.0, "epoch": 15.16, "learning_rate": 0.00019707914572864322, "loss": 0.288, "mlm_loss": 0.288, "step": 19650 }, { "ep_loss": 0.0, "epoch": 15.16, "learning_rate": 0.00019676507537688443, "loss": 0.2915, "mlm_loss": 0.2915, "step": 19660 }, { "ep_loss": 0.0, "epoch": 15.17, "learning_rate": 0.00019645100502512563, "loss": 0.2908, "mlm_loss": 0.2908, "step": 19670 }, { "ep_loss": 0.0, "epoch": 15.18, "learning_rate": 0.00019613693467336685, "loss": 0.2935, "mlm_loss": 0.2935, "step": 19680 }, { "ep_loss": 0.0, "epoch": 15.19, "learning_rate": 0.00019582286432160803, "loss": 0.289, "mlm_loss": 0.289, "step": 19690 }, { "ep_loss": 0.0, "epoch": 15.19, "learning_rate": 0.00019550879396984926, "loss": 0.2923, "mlm_loss": 0.2923, "step": 19700 }, { "epoch": 15.19, "eval_ep_loss": -2.5624516010284424, "eval_loss": 0.2729531526565552, "eval_mlm_loss": 0.2729531526565552, "eval_runtime": 59.8741, "eval_samples_per_second": 1166.933, "eval_steps_per_second": 0.585, "step": 19700 }, { "ep_loss": 0.0, "epoch": 15.2, "learning_rate": 0.00019519472361809046, "loss": 0.2805, "mlm_loss": 0.2805, "step": 19710 }, { "ep_loss": 0.0, "epoch": 15.21, "learning_rate": 0.00019488065326633166, "loss": 0.2838, "mlm_loss": 0.2838, "step": 19720 }, { "ep_loss": 0.0, "epoch": 15.22, "learning_rate": 0.0001945665829145729, "loss": 0.2901, "mlm_loss": 0.2901, "step": 19730 }, { "ep_loss": 0.0, "epoch": 15.23, "learning_rate": 0.00019425251256281406, "loss": 0.2901, "mlm_loss": 0.2901, "step": 19740 }, { "ep_loss": 0.0, "epoch": 15.23, "learning_rate": 0.0001939384422110553, "loss": 0.2861, "mlm_loss": 0.2861, "step": 19750 }, { "ep_loss": 0.0, "epoch": 15.24, "learning_rate": 0.0001936243718592965, "loss": 0.2912, "mlm_loss": 0.2912, "step": 19760 }, { "ep_loss": 0.0, "epoch": 15.25, "learning_rate": 0.0001933103015075377, "loss": 0.2871, "mlm_loss": 0.2871, "step": 19770 }, { "ep_loss": 0.0, "epoch": 15.26, "learning_rate": 0.00019299623115577892, "loss": 0.2883, "mlm_loss": 0.2883, "step": 19780 }, { "ep_loss": 0.0, "epoch": 15.26, "learning_rate": 0.0001926821608040201, "loss": 0.2872, "mlm_loss": 0.2872, "step": 19790 }, { "ep_loss": 0.0, "epoch": 15.27, "learning_rate": 0.0001923680904522613, "loss": 0.2819, "mlm_loss": 0.2819, "step": 19800 }, { "epoch": 15.27, "eval_ep_loss": -2.8246588706970215, "eval_loss": 0.2718046009540558, "eval_mlm_loss": 0.2718046009540558, "eval_runtime": 61.1372, "eval_samples_per_second": 1142.823, "eval_steps_per_second": 0.572, "step": 19800 }, { "ep_loss": 0.0, "epoch": 15.28, "learning_rate": 0.00019205402010050252, "loss": 0.2892, "mlm_loss": 0.2892, "step": 19810 }, { "ep_loss": 0.0, "epoch": 15.29, "learning_rate": 0.00019173994974874372, "loss": 0.286, "mlm_loss": 0.286, "step": 19820 }, { "ep_loss": 0.0, "epoch": 15.3, "learning_rate": 0.00019142587939698495, "loss": 0.2888, "mlm_loss": 0.2888, "step": 19830 }, { "ep_loss": 0.0, "epoch": 15.3, "learning_rate": 0.00019111180904522612, "loss": 0.287, "mlm_loss": 0.287, "step": 19840 }, { "ep_loss": 0.0, "epoch": 15.31, "learning_rate": 0.00019079773869346733, "loss": 0.293, "mlm_loss": 0.293, "step": 19850 }, { "ep_loss": 0.0, "epoch": 15.32, "learning_rate": 0.00019048366834170855, "loss": 0.2863, "mlm_loss": 0.2863, "step": 19860 }, { "ep_loss": 0.0, "epoch": 15.33, "learning_rate": 0.00019016959798994975, "loss": 0.29, "mlm_loss": 0.29, "step": 19870 }, { "ep_loss": 0.0, "epoch": 15.33, "learning_rate": 0.00018985552763819098, "loss": 0.2864, "mlm_loss": 0.2864, "step": 19880 }, { "ep_loss": 0.0, "epoch": 15.34, "learning_rate": 0.00018954145728643216, "loss": 0.2857, "mlm_loss": 0.2857, "step": 19890 }, { "ep_loss": 0.0, "epoch": 15.35, "learning_rate": 0.00018922738693467336, "loss": 0.2831, "mlm_loss": 0.2831, "step": 19900 }, { "epoch": 15.35, "eval_ep_loss": -2.7547028064727783, "eval_loss": 0.27178439497947693, "eval_mlm_loss": 0.27178439497947693, "eval_runtime": 60.3063, "eval_samples_per_second": 1158.569, "eval_steps_per_second": 0.58, "step": 19900 }, { "ep_loss": 0.0, "epoch": 15.36, "learning_rate": 0.00018891331658291459, "loss": 0.2856, "mlm_loss": 0.2856, "step": 19910 }, { "ep_loss": 0.0, "epoch": 15.36, "learning_rate": 0.0001885992462311558, "loss": 0.2896, "mlm_loss": 0.2896, "step": 19920 }, { "ep_loss": 0.0, "epoch": 15.37, "learning_rate": 0.00018828517587939701, "loss": 0.2862, "mlm_loss": 0.2862, "step": 19930 }, { "ep_loss": 0.0, "epoch": 15.38, "learning_rate": 0.0001879711055276382, "loss": 0.2833, "mlm_loss": 0.2833, "step": 19940 }, { "ep_loss": 0.0, "epoch": 15.39, "learning_rate": 0.0001876570351758794, "loss": 0.2873, "mlm_loss": 0.2873, "step": 19950 }, { "ep_loss": 0.0, "epoch": 15.4, "learning_rate": 0.00018734296482412062, "loss": 0.2858, "mlm_loss": 0.2858, "step": 19960 }, { "ep_loss": 0.0, "epoch": 15.4, "learning_rate": 0.00018702889447236182, "loss": 0.2909, "mlm_loss": 0.2909, "step": 19970 }, { "ep_loss": 0.0, "epoch": 15.41, "learning_rate": 0.000186714824120603, "loss": 0.2826, "mlm_loss": 0.2826, "step": 19980 }, { "ep_loss": 0.0, "epoch": 15.42, "learning_rate": 0.00018640075376884422, "loss": 0.2901, "mlm_loss": 0.2901, "step": 19990 }, { "ep_loss": 0.0, "epoch": 15.43, "learning_rate": 0.00018608668341708542, "loss": 0.2892, "mlm_loss": 0.2892, "step": 20000 }, { "epoch": 15.43, "eval_ep_loss": -2.696652889251709, "eval_loss": 0.27144280076026917, "eval_mlm_loss": 0.27144280076026917, "eval_runtime": 59.4275, "eval_samples_per_second": 1175.701, "eval_steps_per_second": 0.589, "step": 20000 }, { "ep_loss": 0.0, "epoch": 15.43, "learning_rate": 0.00018577261306532665, "loss": 0.2808, "mlm_loss": 0.2808, "step": 20010 }, { "ep_loss": 0.0, "epoch": 15.44, "learning_rate": 0.00018545854271356785, "loss": 0.2835, "mlm_loss": 0.2835, "step": 20020 }, { "ep_loss": 0.0, "epoch": 15.45, "learning_rate": 0.00018514447236180902, "loss": 0.2862, "mlm_loss": 0.2862, "step": 20030 }, { "ep_loss": 0.0, "epoch": 15.46, "learning_rate": 0.00018483040201005025, "loss": 0.285, "mlm_loss": 0.285, "step": 20040 }, { "ep_loss": 0.0, "epoch": 15.46, "learning_rate": 0.00018451633165829145, "loss": 0.2883, "mlm_loss": 0.2883, "step": 20050 }, { "ep_loss": 0.0, "epoch": 15.47, "learning_rate": 0.00018420226130653268, "loss": 0.2886, "mlm_loss": 0.2886, "step": 20060 }, { "ep_loss": 0.0, "epoch": 15.48, "learning_rate": 0.00018388819095477388, "loss": 0.289, "mlm_loss": 0.289, "step": 20070 }, { "ep_loss": 0.0, "epoch": 15.49, "learning_rate": 0.00018357412060301506, "loss": 0.2897, "mlm_loss": 0.2897, "step": 20080 }, { "ep_loss": 0.0, "epoch": 15.5, "learning_rate": 0.00018326005025125628, "loss": 0.2824, "mlm_loss": 0.2824, "step": 20090 }, { "ep_loss": 0.0, "epoch": 15.5, "learning_rate": 0.00018294597989949749, "loss": 0.2841, "mlm_loss": 0.2841, "step": 20100 }, { "epoch": 15.5, "eval_ep_loss": -2.557220935821533, "eval_loss": 0.2708563208580017, "eval_mlm_loss": 0.2708563208580017, "eval_runtime": 61.8437, "eval_samples_per_second": 1129.768, "eval_steps_per_second": 0.566, "step": 20100 }, { "ep_loss": 0.0, "epoch": 15.51, "learning_rate": 0.00018263190954773871, "loss": 0.2878, "mlm_loss": 0.2878, "step": 20110 }, { "ep_loss": 0.0, "epoch": 15.52, "learning_rate": 0.00018231783919597992, "loss": 0.2849, "mlm_loss": 0.2849, "step": 20120 }, { "ep_loss": 0.0, "epoch": 15.53, "learning_rate": 0.0001820037688442211, "loss": 0.2864, "mlm_loss": 0.2864, "step": 20130 }, { "ep_loss": 0.0, "epoch": 15.53, "learning_rate": 0.00018168969849246232, "loss": 0.2887, "mlm_loss": 0.2887, "step": 20140 }, { "ep_loss": 0.0, "epoch": 15.54, "learning_rate": 0.00018137562814070352, "loss": 0.2927, "mlm_loss": 0.2927, "step": 20150 }, { "ep_loss": 0.0, "epoch": 15.55, "learning_rate": 0.00018106155778894472, "loss": 0.2858, "mlm_loss": 0.2858, "step": 20160 }, { "ep_loss": 0.0, "epoch": 15.56, "learning_rate": 0.00018074748743718595, "loss": 0.2874, "mlm_loss": 0.2874, "step": 20170 }, { "ep_loss": 0.0, "epoch": 15.56, "learning_rate": 0.00018043341708542712, "loss": 0.2895, "mlm_loss": 0.2895, "step": 20180 }, { "ep_loss": 0.0, "epoch": 15.57, "learning_rate": 0.00018011934673366835, "loss": 0.2818, "mlm_loss": 0.2818, "step": 20190 }, { "ep_loss": 0.0, "epoch": 15.58, "learning_rate": 0.00017980527638190955, "loss": 0.2851, "mlm_loss": 0.2851, "step": 20200 }, { "epoch": 15.58, "eval_ep_loss": -2.7472620010375977, "eval_loss": 0.2708013951778412, "eval_mlm_loss": 0.2708013951778412, "eval_runtime": 61.0361, "eval_samples_per_second": 1144.716, "eval_steps_per_second": 0.573, "step": 20200 }, { "ep_loss": 0.0, "epoch": 15.59, "learning_rate": 0.00017949120603015075, "loss": 0.2828, "mlm_loss": 0.2828, "step": 20210 }, { "ep_loss": 0.0, "epoch": 15.6, "learning_rate": 0.00017917713567839198, "loss": 0.2764, "mlm_loss": 0.2764, "step": 20220 }, { "ep_loss": 0.0, "epoch": 15.6, "learning_rate": 0.00017886306532663315, "loss": 0.2875, "mlm_loss": 0.2875, "step": 20230 }, { "ep_loss": 0.0, "epoch": 15.61, "learning_rate": 0.00017854899497487438, "loss": 0.2809, "mlm_loss": 0.2809, "step": 20240 }, { "ep_loss": 0.0, "epoch": 15.62, "learning_rate": 0.00017823492462311558, "loss": 0.286, "mlm_loss": 0.286, "step": 20250 }, { "ep_loss": 0.0, "epoch": 15.63, "learning_rate": 0.00017792085427135678, "loss": 0.282, "mlm_loss": 0.282, "step": 20260 }, { "ep_loss": 0.0, "epoch": 15.63, "learning_rate": 0.000177606783919598, "loss": 0.2828, "mlm_loss": 0.2828, "step": 20270 }, { "ep_loss": 0.0, "epoch": 15.64, "learning_rate": 0.00017729271356783918, "loss": 0.2854, "mlm_loss": 0.2854, "step": 20280 }, { "ep_loss": 0.0, "epoch": 15.65, "learning_rate": 0.00017697864321608039, "loss": 0.2853, "mlm_loss": 0.2853, "step": 20290 }, { "ep_loss": 0.0, "epoch": 15.66, "learning_rate": 0.00017666457286432161, "loss": 0.2854, "mlm_loss": 0.2854, "step": 20300 }, { "epoch": 15.66, "eval_ep_loss": -2.763145923614502, "eval_loss": 0.269301176071167, "eval_mlm_loss": 0.269301176071167, "eval_runtime": 60.4116, "eval_samples_per_second": 1156.55, "eval_steps_per_second": 0.579, "step": 20300 }, { "ep_loss": 0.0, "epoch": 15.67, "learning_rate": 0.00017635050251256282, "loss": 0.2852, "mlm_loss": 0.2852, "step": 20310 }, { "ep_loss": 0.0, "epoch": 15.67, "learning_rate": 0.00017603643216080404, "loss": 0.2825, "mlm_loss": 0.2825, "step": 20320 }, { "ep_loss": 0.0, "epoch": 15.68, "learning_rate": 0.00017572236180904522, "loss": 0.2851, "mlm_loss": 0.2851, "step": 20330 }, { "ep_loss": 0.0, "epoch": 15.69, "learning_rate": 0.00017540829145728642, "loss": 0.285, "mlm_loss": 0.285, "step": 20340 }, { "ep_loss": 0.0, "epoch": 15.7, "learning_rate": 0.00017509422110552765, "loss": 0.2858, "mlm_loss": 0.2858, "step": 20350 }, { "ep_loss": 0.0, "epoch": 15.7, "learning_rate": 0.00017478015075376885, "loss": 0.2871, "mlm_loss": 0.2871, "step": 20360 }, { "ep_loss": 0.0, "epoch": 15.71, "learning_rate": 0.00017446608040201008, "loss": 0.2839, "mlm_loss": 0.2839, "step": 20370 }, { "ep_loss": 0.0, "epoch": 15.72, "learning_rate": 0.00017415201005025125, "loss": 0.2826, "mlm_loss": 0.2826, "step": 20380 }, { "ep_loss": 0.0, "epoch": 15.73, "learning_rate": 0.00017383793969849245, "loss": 0.2826, "mlm_loss": 0.2826, "step": 20390 }, { "ep_loss": 0.0, "epoch": 15.73, "learning_rate": 0.00017352386934673368, "loss": 0.2845, "mlm_loss": 0.2845, "step": 20400 }, { "epoch": 15.73, "eval_ep_loss": -2.86255145072937, "eval_loss": 0.2697438895702362, "eval_mlm_loss": 0.2697438895702362, "eval_runtime": 61.1581, "eval_samples_per_second": 1142.432, "eval_steps_per_second": 0.572, "step": 20400 }, { "ep_loss": 0.0, "epoch": 15.74, "learning_rate": 0.00017320979899497488, "loss": 0.279, "mlm_loss": 0.279, "step": 20410 }, { "ep_loss": 0.0, "epoch": 15.75, "learning_rate": 0.0001728957286432161, "loss": 0.2805, "mlm_loss": 0.2805, "step": 20420 }, { "ep_loss": 0.0, "epoch": 15.76, "learning_rate": 0.00017258165829145728, "loss": 0.288, "mlm_loss": 0.288, "step": 20430 }, { "ep_loss": 0.0, "epoch": 15.77, "learning_rate": 0.00017226758793969848, "loss": 0.2844, "mlm_loss": 0.2844, "step": 20440 }, { "ep_loss": 0.0, "epoch": 15.77, "learning_rate": 0.0001719535175879397, "loss": 0.2837, "mlm_loss": 0.2837, "step": 20450 }, { "ep_loss": 0.0, "epoch": 15.78, "learning_rate": 0.0001716394472361809, "loss": 0.2811, "mlm_loss": 0.2811, "step": 20460 }, { "ep_loss": 0.0, "epoch": 15.79, "learning_rate": 0.0001713253768844221, "loss": 0.2853, "mlm_loss": 0.2853, "step": 20470 }, { "ep_loss": 0.0, "epoch": 15.8, "learning_rate": 0.0001710113065326633, "loss": 0.2834, "mlm_loss": 0.2834, "step": 20480 }, { "ep_loss": 0.0, "epoch": 15.8, "learning_rate": 0.00017069723618090451, "loss": 0.2816, "mlm_loss": 0.2816, "step": 20490 }, { "ep_loss": 0.0, "epoch": 15.81, "learning_rate": 0.00017038316582914574, "loss": 0.2812, "mlm_loss": 0.2812, "step": 20500 }, { "epoch": 15.81, "eval_ep_loss": -2.8093948364257812, "eval_loss": 0.26703834533691406, "eval_mlm_loss": 0.26703834533691406, "eval_runtime": 59.5903, "eval_samples_per_second": 1172.49, "eval_steps_per_second": 0.587, "step": 20500 }, { "ep_loss": 0.0, "epoch": 15.82, "learning_rate": 0.00017006909547738694, "loss": 0.2839, "mlm_loss": 0.2839, "step": 20510 }, { "ep_loss": 0.0, "epoch": 15.83, "learning_rate": 0.00016975502512562814, "loss": 0.2801, "mlm_loss": 0.2801, "step": 20520 }, { "ep_loss": 0.0, "epoch": 15.83, "learning_rate": 0.00016944095477386935, "loss": 0.2852, "mlm_loss": 0.2852, "step": 20530 }, { "ep_loss": 0.0, "epoch": 15.84, "learning_rate": 0.00016912688442211055, "loss": 0.278, "mlm_loss": 0.278, "step": 20540 }, { "ep_loss": 0.0, "epoch": 15.85, "learning_rate": 0.00016881281407035177, "loss": 0.2848, "mlm_loss": 0.2848, "step": 20550 }, { "ep_loss": 0.0, "epoch": 15.86, "learning_rate": 0.00016849874371859298, "loss": 0.2819, "mlm_loss": 0.2819, "step": 20560 }, { "ep_loss": 0.0, "epoch": 15.87, "learning_rate": 0.00016818467336683418, "loss": 0.2835, "mlm_loss": 0.2835, "step": 20570 }, { "ep_loss": 0.0, "epoch": 15.87, "learning_rate": 0.00016787060301507538, "loss": 0.2783, "mlm_loss": 0.2783, "step": 20580 }, { "ep_loss": 0.0, "epoch": 15.88, "learning_rate": 0.00016755653266331658, "loss": 0.2824, "mlm_loss": 0.2824, "step": 20590 }, { "ep_loss": 0.0, "epoch": 15.89, "learning_rate": 0.00016724246231155778, "loss": 0.2822, "mlm_loss": 0.2822, "step": 20600 }, { "epoch": 15.89, "eval_ep_loss": -2.924039125442505, "eval_loss": 0.26756471395492554, "eval_mlm_loss": 0.26756471395492554, "eval_runtime": 62.1271, "eval_samples_per_second": 1124.613, "eval_steps_per_second": 0.563, "step": 20600 }, { "ep_loss": 0.0, "epoch": 15.9, "learning_rate": 0.000166928391959799, "loss": 0.2826, "mlm_loss": 0.2826, "step": 20610 }, { "ep_loss": 0.0, "epoch": 15.9, "learning_rate": 0.0001666143216080402, "loss": 0.2799, "mlm_loss": 0.2799, "step": 20620 }, { "ep_loss": 0.0, "epoch": 15.91, "learning_rate": 0.0001663002512562814, "loss": 0.2751, "mlm_loss": 0.2751, "step": 20630 }, { "ep_loss": 0.0, "epoch": 15.92, "learning_rate": 0.0001659861809045226, "loss": 0.2853, "mlm_loss": 0.2853, "step": 20640 }, { "ep_loss": 0.0, "epoch": 15.93, "learning_rate": 0.0001656721105527638, "loss": 0.2866, "mlm_loss": 0.2866, "step": 20650 }, { "ep_loss": 0.0, "epoch": 15.94, "learning_rate": 0.00016535804020100504, "loss": 0.2841, "mlm_loss": 0.2841, "step": 20660 }, { "ep_loss": 0.0, "epoch": 15.94, "learning_rate": 0.00016504396984924624, "loss": 0.2795, "mlm_loss": 0.2795, "step": 20670 }, { "ep_loss": 0.0, "epoch": 15.95, "learning_rate": 0.00016472989949748744, "loss": 0.2789, "mlm_loss": 0.2789, "step": 20680 }, { "ep_loss": 0.0, "epoch": 15.96, "learning_rate": 0.00016441582914572864, "loss": 0.2851, "mlm_loss": 0.2851, "step": 20690 }, { "ep_loss": 0.0, "epoch": 15.97, "learning_rate": 0.00016410175879396984, "loss": 0.2815, "mlm_loss": 0.2815, "step": 20700 }, { "epoch": 15.97, "eval_ep_loss": -3.128182888031006, "eval_loss": 0.2656788229942322, "eval_mlm_loss": 0.2656788229942322, "eval_runtime": 59.6531, "eval_samples_per_second": 1171.255, "eval_steps_per_second": 0.587, "step": 20700 }, { "ep_loss": 0.0, "epoch": 15.97, "learning_rate": 0.00016378768844221107, "loss": 0.2828, "mlm_loss": 0.2828, "step": 20710 }, { "ep_loss": 0.0, "epoch": 15.98, "learning_rate": 0.00016347361809045227, "loss": 0.2833, "mlm_loss": 0.2833, "step": 20720 }, { "ep_loss": 0.0, "epoch": 15.99, "learning_rate": 0.00016315954773869345, "loss": 0.2745, "mlm_loss": 0.2745, "step": 20730 }, { "ep_loss": 0.0, "epoch": 16.0, "learning_rate": 0.00016284547738693467, "loss": 0.278, "mlm_loss": 0.278, "step": 20740 }, { "ep_loss": 0.0, "epoch": 16.0, "learning_rate": 0.00016253140703517588, "loss": 0.2823, "mlm_loss": 0.2823, "step": 20750 }, { "ep_loss": 0.0, "epoch": 16.01, "learning_rate": 0.0001622173366834171, "loss": 0.2772, "mlm_loss": 0.2772, "step": 20760 }, { "ep_loss": 0.0, "epoch": 16.02, "learning_rate": 0.0001619032663316583, "loss": 0.2781, "mlm_loss": 0.2781, "step": 20770 }, { "ep_loss": 0.0, "epoch": 16.03, "learning_rate": 0.00016158919597989948, "loss": 0.2778, "mlm_loss": 0.2778, "step": 20780 }, { "ep_loss": 0.0, "epoch": 16.04, "learning_rate": 0.0001612751256281407, "loss": 0.2737, "mlm_loss": 0.2737, "step": 20790 }, { "ep_loss": 0.0, "epoch": 16.04, "learning_rate": 0.0001609610552763819, "loss": 0.2741, "mlm_loss": 0.2741, "step": 20800 }, { "epoch": 16.04, "eval_ep_loss": -2.925839900970459, "eval_loss": 0.2652662694454193, "eval_mlm_loss": 0.2652662694454193, "eval_runtime": 62.984, "eval_samples_per_second": 1109.313, "eval_steps_per_second": 0.556, "step": 20800 }, { "ep_loss": 0.0, "epoch": 16.05, "learning_rate": 0.00016064698492462314, "loss": 0.2835, "mlm_loss": 0.2835, "step": 20810 }, { "ep_loss": 0.0, "epoch": 16.06, "learning_rate": 0.00016033291457286434, "loss": 0.2782, "mlm_loss": 0.2782, "step": 20820 }, { "ep_loss": 0.0, "epoch": 16.07, "learning_rate": 0.0001600188442211055, "loss": 0.2757, "mlm_loss": 0.2757, "step": 20830 }, { "ep_loss": 0.0, "epoch": 16.07, "learning_rate": 0.00015970477386934674, "loss": 0.2784, "mlm_loss": 0.2784, "step": 20840 }, { "ep_loss": 0.0, "epoch": 16.08, "learning_rate": 0.00015939070351758794, "loss": 0.2811, "mlm_loss": 0.2811, "step": 20850 }, { "ep_loss": 0.0, "epoch": 16.09, "learning_rate": 0.00015907663316582917, "loss": 0.2779, "mlm_loss": 0.2779, "step": 20860 }, { "ep_loss": 0.0, "epoch": 16.1, "learning_rate": 0.00015876256281407037, "loss": 0.2773, "mlm_loss": 0.2773, "step": 20870 }, { "ep_loss": 0.0, "epoch": 16.1, "learning_rate": 0.00015844849246231154, "loss": 0.2768, "mlm_loss": 0.2768, "step": 20880 }, { "ep_loss": 0.0, "epoch": 16.11, "learning_rate": 0.00015813442211055277, "loss": 0.2836, "mlm_loss": 0.2836, "step": 20890 }, { "ep_loss": 0.0, "epoch": 16.12, "learning_rate": 0.00015782035175879397, "loss": 0.2797, "mlm_loss": 0.2797, "step": 20900 }, { "epoch": 16.12, "eval_ep_loss": -2.8426320552825928, "eval_loss": 0.2645823061466217, "eval_mlm_loss": 0.2645823061466217, "eval_runtime": 61.2517, "eval_samples_per_second": 1140.686, "eval_steps_per_second": 0.571, "step": 20900 }, { "ep_loss": 0.0, "epoch": 16.13, "learning_rate": 0.00015750628140703517, "loss": 0.2784, "mlm_loss": 0.2784, "step": 20910 }, { "ep_loss": 0.0, "epoch": 16.14, "learning_rate": 0.0001571922110552764, "loss": 0.2758, "mlm_loss": 0.2758, "step": 20920 }, { "ep_loss": 0.0, "epoch": 16.14, "learning_rate": 0.00015687814070351757, "loss": 0.2757, "mlm_loss": 0.2757, "step": 20930 }, { "ep_loss": 0.0, "epoch": 16.15, "learning_rate": 0.0001565640703517588, "loss": 0.276, "mlm_loss": 0.276, "step": 20940 }, { "ep_loss": 0.0, "epoch": 16.16, "learning_rate": 0.00015625, "loss": 0.2808, "mlm_loss": 0.2808, "step": 20950 }, { "ep_loss": 0.0, "epoch": 16.17, "learning_rate": 0.0001559359296482412, "loss": 0.2814, "mlm_loss": 0.2814, "step": 20960 }, { "ep_loss": 0.0, "epoch": 16.17, "learning_rate": 0.00015562185929648243, "loss": 0.2843, "mlm_loss": 0.2843, "step": 20970 }, { "ep_loss": 0.0, "epoch": 16.18, "learning_rate": 0.0001553077889447236, "loss": 0.2759, "mlm_loss": 0.2759, "step": 20980 }, { "ep_loss": 0.0, "epoch": 16.19, "learning_rate": 0.00015499371859296483, "loss": 0.2804, "mlm_loss": 0.2804, "step": 20990 }, { "ep_loss": 0.0, "epoch": 16.2, "learning_rate": 0.00015467964824120604, "loss": 0.2731, "mlm_loss": 0.2731, "step": 21000 }, { "epoch": 16.2, "eval_ep_loss": -2.842376708984375, "eval_loss": 0.26594194769859314, "eval_mlm_loss": 0.26594194769859314, "eval_runtime": 62.1251, "eval_samples_per_second": 1124.65, "eval_steps_per_second": 0.563, "step": 21000 }, { "ep_loss": 0.0, "epoch": 16.21, "learning_rate": 0.00015436557788944724, "loss": 0.2805, "mlm_loss": 0.2805, "step": 21010 }, { "ep_loss": 0.0, "epoch": 16.21, "learning_rate": 0.00015405150753768846, "loss": 0.2768, "mlm_loss": 0.2768, "step": 21020 }, { "ep_loss": 0.0, "epoch": 16.22, "learning_rate": 0.00015373743718592964, "loss": 0.2762, "mlm_loss": 0.2762, "step": 21030 }, { "ep_loss": 0.0, "epoch": 16.23, "learning_rate": 0.00015342336683417084, "loss": 0.2754, "mlm_loss": 0.2754, "step": 21040 }, { "ep_loss": 0.0, "epoch": 16.24, "learning_rate": 0.00015310929648241207, "loss": 0.2745, "mlm_loss": 0.2745, "step": 21050 }, { "ep_loss": 0.0, "epoch": 16.24, "learning_rate": 0.00015279522613065327, "loss": 0.2806, "mlm_loss": 0.2806, "step": 21060 }, { "ep_loss": 0.0, "epoch": 16.25, "learning_rate": 0.0001524811557788945, "loss": 0.2783, "mlm_loss": 0.2783, "step": 21070 }, { "ep_loss": 0.0, "epoch": 16.26, "learning_rate": 0.00015216708542713567, "loss": 0.2763, "mlm_loss": 0.2763, "step": 21080 }, { "ep_loss": 0.0, "epoch": 16.27, "learning_rate": 0.00015185301507537687, "loss": 0.2771, "mlm_loss": 0.2771, "step": 21090 }, { "ep_loss": 0.0, "epoch": 16.27, "learning_rate": 0.0001515389447236181, "loss": 0.2754, "mlm_loss": 0.2754, "step": 21100 }, { "epoch": 16.27, "eval_ep_loss": -3.014099597930908, "eval_loss": 0.264241486787796, "eval_mlm_loss": 0.264241486787796, "eval_runtime": 61.1999, "eval_samples_per_second": 1141.652, "eval_steps_per_second": 0.572, "step": 21100 }, { "ep_loss": 0.0, "epoch": 16.28, "learning_rate": 0.0001512248743718593, "loss": 0.279, "mlm_loss": 0.279, "step": 21110 }, { "ep_loss": 0.0, "epoch": 16.29, "learning_rate": 0.00015091080402010053, "loss": 0.2757, "mlm_loss": 0.2757, "step": 21120 }, { "ep_loss": 0.0, "epoch": 16.3, "learning_rate": 0.0001505967336683417, "loss": 0.2769, "mlm_loss": 0.2769, "step": 21130 }, { "ep_loss": 0.0, "epoch": 16.31, "learning_rate": 0.0001502826633165829, "loss": 0.2766, "mlm_loss": 0.2766, "step": 21140 }, { "ep_loss": 0.0, "epoch": 16.31, "learning_rate": 0.00014996859296482413, "loss": 0.2768, "mlm_loss": 0.2768, "step": 21150 }, { "ep_loss": 0.0, "epoch": 16.32, "learning_rate": 0.00014965452261306533, "loss": 0.2702, "mlm_loss": 0.2702, "step": 21160 }, { "ep_loss": 0.0, "epoch": 16.33, "learning_rate": 0.00014934045226130656, "loss": 0.2763, "mlm_loss": 0.2763, "step": 21170 }, { "ep_loss": 0.0, "epoch": 16.34, "learning_rate": 0.00014902638190954773, "loss": 0.2793, "mlm_loss": 0.2793, "step": 21180 }, { "ep_loss": 0.0, "epoch": 16.34, "learning_rate": 0.00014871231155778894, "loss": 0.279, "mlm_loss": 0.279, "step": 21190 }, { "ep_loss": 0.0, "epoch": 16.35, "learning_rate": 0.00014839824120603016, "loss": 0.2785, "mlm_loss": 0.2785, "step": 21200 }, { "epoch": 16.35, "eval_ep_loss": -2.9213201999664307, "eval_loss": 0.2617505192756653, "eval_mlm_loss": 0.2617505192756653, "eval_runtime": 61.3699, "eval_samples_per_second": 1138.49, "eval_steps_per_second": 0.57, "step": 21200 }, { "ep_loss": 0.0, "epoch": 16.36, "learning_rate": 0.00014808417085427136, "loss": 0.2739, "mlm_loss": 0.2739, "step": 21210 }, { "ep_loss": 0.0, "epoch": 16.37, "learning_rate": 0.00014777010050251257, "loss": 0.2729, "mlm_loss": 0.2729, "step": 21220 }, { "ep_loss": 0.0, "epoch": 16.37, "learning_rate": 0.00014745603015075377, "loss": 0.28, "mlm_loss": 0.28, "step": 21230 }, { "ep_loss": 0.0, "epoch": 16.38, "learning_rate": 0.00014714195979899497, "loss": 0.2793, "mlm_loss": 0.2793, "step": 21240 }, { "ep_loss": 0.0, "epoch": 16.39, "learning_rate": 0.0001468278894472362, "loss": 0.2787, "mlm_loss": 0.2787, "step": 21250 }, { "ep_loss": 0.0, "epoch": 16.4, "learning_rate": 0.00014654522613065328, "loss": 0.2782, "mlm_loss": 0.2782, "step": 21260 }, { "ep_loss": 0.0, "epoch": 16.41, "learning_rate": 0.00014623115577889448, "loss": 0.274, "mlm_loss": 0.274, "step": 21270 }, { "ep_loss": 0.0, "epoch": 16.41, "learning_rate": 0.00014591708542713568, "loss": 0.2757, "mlm_loss": 0.2757, "step": 21280 }, { "ep_loss": 0.0, "epoch": 16.42, "learning_rate": 0.00014560301507537688, "loss": 0.2815, "mlm_loss": 0.2815, "step": 21290 }, { "ep_loss": 0.0, "epoch": 16.43, "learning_rate": 0.00014528894472361808, "loss": 0.2757, "mlm_loss": 0.2757, "step": 21300 }, { "epoch": 16.43, "eval_ep_loss": -2.992938995361328, "eval_loss": 0.26220372319221497, "eval_mlm_loss": 0.26220372319221497, "eval_runtime": 59.293, "eval_samples_per_second": 1178.368, "eval_steps_per_second": 0.59, "step": 21300 }, { "ep_loss": 0.0, "epoch": 16.44, "learning_rate": 0.0001449748743718593, "loss": 0.2745, "mlm_loss": 0.2745, "step": 21310 }, { "ep_loss": 0.0, "epoch": 16.44, "learning_rate": 0.0001446608040201005, "loss": 0.2787, "mlm_loss": 0.2787, "step": 21320 }, { "ep_loss": 0.0, "epoch": 16.45, "learning_rate": 0.0001443467336683417, "loss": 0.2723, "mlm_loss": 0.2723, "step": 21330 }, { "ep_loss": 0.0, "epoch": 16.46, "learning_rate": 0.00014403266331658291, "loss": 0.2764, "mlm_loss": 0.2764, "step": 21340 }, { "ep_loss": 0.0, "epoch": 16.47, "learning_rate": 0.00014371859296482411, "loss": 0.2803, "mlm_loss": 0.2803, "step": 21350 }, { "ep_loss": 0.0, "epoch": 16.48, "learning_rate": 0.00014340452261306534, "loss": 0.2713, "mlm_loss": 0.2713, "step": 21360 }, { "ep_loss": 0.0, "epoch": 16.48, "learning_rate": 0.00014309045226130654, "loss": 0.2793, "mlm_loss": 0.2793, "step": 21370 }, { "ep_loss": 0.0, "epoch": 16.49, "learning_rate": 0.00014277638190954775, "loss": 0.2788, "mlm_loss": 0.2788, "step": 21380 }, { "ep_loss": 0.0, "epoch": 16.5, "learning_rate": 0.00014246231155778895, "loss": 0.2747, "mlm_loss": 0.2747, "step": 21390 }, { "ep_loss": 0.0, "epoch": 16.51, "learning_rate": 0.00014214824120603015, "loss": 0.2703, "mlm_loss": 0.2703, "step": 21400 }, { "epoch": 16.51, "eval_ep_loss": -2.9763119220733643, "eval_loss": 0.26245957612991333, "eval_mlm_loss": 0.26245957612991333, "eval_runtime": 61.7441, "eval_samples_per_second": 1131.59, "eval_steps_per_second": 0.567, "step": 21400 }, { "ep_loss": 0.0, "epoch": 16.51, "learning_rate": 0.00014183417085427135, "loss": 0.2736, "mlm_loss": 0.2736, "step": 21410 }, { "ep_loss": 0.0, "epoch": 16.52, "learning_rate": 0.00014152010050251258, "loss": 0.2762, "mlm_loss": 0.2762, "step": 21420 }, { "ep_loss": 0.0, "epoch": 16.53, "learning_rate": 0.00014120603015075378, "loss": 0.276, "mlm_loss": 0.276, "step": 21430 }, { "ep_loss": 0.0, "epoch": 16.54, "learning_rate": 0.00014089195979899498, "loss": 0.2779, "mlm_loss": 0.2779, "step": 21440 }, { "ep_loss": 0.0, "epoch": 16.54, "learning_rate": 0.00014057788944723618, "loss": 0.2775, "mlm_loss": 0.2775, "step": 21450 }, { "ep_loss": 0.0, "epoch": 16.55, "learning_rate": 0.00014026381909547738, "loss": 0.2751, "mlm_loss": 0.2751, "step": 21460 }, { "ep_loss": 0.0, "epoch": 16.56, "learning_rate": 0.0001399497487437186, "loss": 0.2755, "mlm_loss": 0.2755, "step": 21470 }, { "ep_loss": 0.0, "epoch": 16.57, "learning_rate": 0.0001396356783919598, "loss": 0.2761, "mlm_loss": 0.2761, "step": 21480 }, { "ep_loss": 0.0, "epoch": 16.58, "learning_rate": 0.000139321608040201, "loss": 0.2729, "mlm_loss": 0.2729, "step": 21490 }, { "ep_loss": 0.0, "epoch": 16.58, "learning_rate": 0.0001390075376884422, "loss": 0.2768, "mlm_loss": 0.2768, "step": 21500 }, { "epoch": 16.58, "eval_ep_loss": -2.822406053543091, "eval_loss": 0.2601699233055115, "eval_mlm_loss": 0.2601699233055115, "eval_runtime": 60.755, "eval_samples_per_second": 1150.012, "eval_steps_per_second": 0.576, "step": 21500 }, { "ep_loss": 0.0, "epoch": 16.59, "learning_rate": 0.0001386934673366834, "loss": 0.2766, "mlm_loss": 0.2766, "step": 21510 }, { "ep_loss": 0.0, "epoch": 16.6, "learning_rate": 0.00013837939698492464, "loss": 0.2762, "mlm_loss": 0.2762, "step": 21520 }, { "ep_loss": 0.0, "epoch": 16.61, "learning_rate": 0.00013806532663316584, "loss": 0.2754, "mlm_loss": 0.2754, "step": 21530 }, { "ep_loss": 0.0, "epoch": 16.61, "learning_rate": 0.00013775125628140701, "loss": 0.2788, "mlm_loss": 0.2788, "step": 21540 }, { "ep_loss": 0.0, "epoch": 16.62, "learning_rate": 0.00013743718592964824, "loss": 0.2758, "mlm_loss": 0.2758, "step": 21550 }, { "ep_loss": 0.0, "epoch": 16.63, "learning_rate": 0.00013712311557788944, "loss": 0.2759, "mlm_loss": 0.2759, "step": 21560 }, { "ep_loss": 0.0, "epoch": 16.64, "learning_rate": 0.00013680904522613067, "loss": 0.2832, "mlm_loss": 0.2832, "step": 21570 }, { "ep_loss": 0.0, "epoch": 16.64, "learning_rate": 0.00013649497487437187, "loss": 0.275, "mlm_loss": 0.275, "step": 21580 }, { "ep_loss": 0.0, "epoch": 16.65, "learning_rate": 0.00013618090452261305, "loss": 0.2729, "mlm_loss": 0.2729, "step": 21590 }, { "ep_loss": 0.0, "epoch": 16.66, "learning_rate": 0.00013586683417085428, "loss": 0.2725, "mlm_loss": 0.2725, "step": 21600 }, { "epoch": 16.66, "eval_ep_loss": -2.640625, "eval_loss": 0.2593928575515747, "eval_mlm_loss": 0.2593928575515747, "eval_runtime": 61.1714, "eval_samples_per_second": 1142.184, "eval_steps_per_second": 0.572, "step": 21600 }, { "ep_loss": 0.0, "epoch": 16.67, "learning_rate": 0.00013555276381909548, "loss": 0.2768, "mlm_loss": 0.2768, "step": 21610 }, { "ep_loss": 0.0, "epoch": 16.68, "learning_rate": 0.0001352386934673367, "loss": 0.2749, "mlm_loss": 0.2749, "step": 21620 }, { "ep_loss": 0.0, "epoch": 16.68, "learning_rate": 0.0001349246231155779, "loss": 0.2701, "mlm_loss": 0.2701, "step": 21630 }, { "ep_loss": 0.0, "epoch": 16.69, "learning_rate": 0.00013461055276381908, "loss": 0.2835, "mlm_loss": 0.2835, "step": 21640 }, { "ep_loss": 0.0, "epoch": 16.7, "learning_rate": 0.0001342964824120603, "loss": 0.2731, "mlm_loss": 0.2731, "step": 21650 }, { "ep_loss": 0.0, "epoch": 16.71, "learning_rate": 0.0001339824120603015, "loss": 0.2796, "mlm_loss": 0.2796, "step": 21660 }, { "ep_loss": 0.0, "epoch": 16.71, "learning_rate": 0.00013366834170854274, "loss": 0.2703, "mlm_loss": 0.2703, "step": 21670 }, { "ep_loss": 0.0, "epoch": 16.72, "learning_rate": 0.00013335427135678394, "loss": 0.2744, "mlm_loss": 0.2744, "step": 21680 }, { "ep_loss": 0.0, "epoch": 16.73, "learning_rate": 0.0001330402010050251, "loss": 0.2734, "mlm_loss": 0.2734, "step": 21690 }, { "ep_loss": 0.0, "epoch": 16.74, "learning_rate": 0.00013272613065326634, "loss": 0.2692, "mlm_loss": 0.2692, "step": 21700 }, { "epoch": 16.74, "eval_ep_loss": -2.509608030319214, "eval_loss": 0.2609871029853821, "eval_mlm_loss": 0.2609871029853821, "eval_runtime": 61.2639, "eval_samples_per_second": 1140.459, "eval_steps_per_second": 0.571, "step": 21700 }, { "ep_loss": 0.0, "epoch": 16.75, "learning_rate": 0.00013241206030150754, "loss": 0.2776, "mlm_loss": 0.2776, "step": 21710 }, { "ep_loss": 0.0, "epoch": 16.75, "learning_rate": 0.00013209798994974874, "loss": 0.2747, "mlm_loss": 0.2747, "step": 21720 }, { "ep_loss": 0.0, "epoch": 16.76, "learning_rate": 0.00013178391959798997, "loss": 0.2696, "mlm_loss": 0.2696, "step": 21730 }, { "ep_loss": 0.0, "epoch": 16.77, "learning_rate": 0.00013146984924623114, "loss": 0.2744, "mlm_loss": 0.2744, "step": 21740 }, { "ep_loss": 0.0, "epoch": 16.78, "learning_rate": 0.00013115577889447237, "loss": 0.2752, "mlm_loss": 0.2752, "step": 21750 }, { "ep_loss": 0.0, "epoch": 16.78, "learning_rate": 0.00013084170854271357, "loss": 0.2723, "mlm_loss": 0.2723, "step": 21760 }, { "ep_loss": 0.0, "epoch": 16.79, "learning_rate": 0.00013052763819095477, "loss": 0.2749, "mlm_loss": 0.2749, "step": 21770 }, { "ep_loss": 0.0, "epoch": 16.8, "learning_rate": 0.000130213567839196, "loss": 0.2723, "mlm_loss": 0.2723, "step": 21780 }, { "ep_loss": 0.0, "epoch": 16.81, "learning_rate": 0.00012989949748743718, "loss": 0.2793, "mlm_loss": 0.2793, "step": 21790 }, { "ep_loss": 0.0, "epoch": 16.81, "learning_rate": 0.0001295854271356784, "loss": 0.276, "mlm_loss": 0.276, "step": 21800 }, { "epoch": 16.81, "eval_ep_loss": -2.4998526573181152, "eval_loss": 0.25980517268180847, "eval_mlm_loss": 0.25980517268180847, "eval_runtime": 60.7864, "eval_samples_per_second": 1149.418, "eval_steps_per_second": 0.576, "step": 21800 }, { "ep_loss": 0.0, "epoch": 16.82, "learning_rate": 0.0001292713567839196, "loss": 0.2705, "mlm_loss": 0.2705, "step": 21810 }, { "ep_loss": 0.0, "epoch": 16.83, "learning_rate": 0.0001289572864321608, "loss": 0.2808, "mlm_loss": 0.2808, "step": 21820 }, { "ep_loss": 0.0, "epoch": 16.84, "learning_rate": 0.00012864321608040203, "loss": 0.2724, "mlm_loss": 0.2724, "step": 21830 }, { "ep_loss": 0.0, "epoch": 16.85, "learning_rate": 0.0001283291457286432, "loss": 0.2729, "mlm_loss": 0.2729, "step": 21840 }, { "ep_loss": 0.0, "epoch": 16.85, "learning_rate": 0.0001280150753768844, "loss": 0.2689, "mlm_loss": 0.2689, "step": 21850 }, { "ep_loss": 0.0, "epoch": 16.86, "learning_rate": 0.00012770100502512564, "loss": 0.2725, "mlm_loss": 0.2725, "step": 21860 }, { "ep_loss": 0.0, "epoch": 16.87, "learning_rate": 0.00012738693467336684, "loss": 0.2732, "mlm_loss": 0.2732, "step": 21870 }, { "ep_loss": 0.0, "epoch": 16.88, "learning_rate": 0.00012707286432160807, "loss": 0.2737, "mlm_loss": 0.2737, "step": 21880 }, { "ep_loss": 0.0, "epoch": 16.88, "learning_rate": 0.00012675879396984924, "loss": 0.2689, "mlm_loss": 0.2689, "step": 21890 }, { "ep_loss": 0.0, "epoch": 16.89, "learning_rate": 0.00012644472361809044, "loss": 0.2765, "mlm_loss": 0.2765, "step": 21900 }, { "epoch": 16.89, "eval_ep_loss": -2.660741090774536, "eval_loss": 0.259184867143631, "eval_mlm_loss": 0.259184867143631, "eval_runtime": 62.5826, "eval_samples_per_second": 1116.428, "eval_steps_per_second": 0.559, "step": 21900 }, { "ep_loss": 0.0, "epoch": 16.9, "learning_rate": 0.00012613065326633167, "loss": 0.2774, "mlm_loss": 0.2774, "step": 21910 }, { "ep_loss": 0.0, "epoch": 16.91, "learning_rate": 0.00012581658291457287, "loss": 0.2762, "mlm_loss": 0.2762, "step": 21920 }, { "ep_loss": 0.0, "epoch": 16.91, "learning_rate": 0.0001255025125628141, "loss": 0.2656, "mlm_loss": 0.2656, "step": 21930 }, { "ep_loss": 0.0, "epoch": 16.92, "learning_rate": 0.00012518844221105527, "loss": 0.272, "mlm_loss": 0.272, "step": 21940 }, { "ep_loss": 0.0, "epoch": 16.93, "learning_rate": 0.00012487437185929647, "loss": 0.2719, "mlm_loss": 0.2719, "step": 21950 }, { "ep_loss": 0.0, "epoch": 16.94, "learning_rate": 0.0001245603015075377, "loss": 0.2729, "mlm_loss": 0.2729, "step": 21960 }, { "ep_loss": 0.0, "epoch": 16.95, "learning_rate": 0.0001242462311557789, "loss": 0.2728, "mlm_loss": 0.2728, "step": 21970 }, { "ep_loss": 0.0, "epoch": 16.95, "learning_rate": 0.0001239321608040201, "loss": 0.273, "mlm_loss": 0.273, "step": 21980 }, { "ep_loss": 0.0, "epoch": 16.96, "learning_rate": 0.0001236180904522613, "loss": 0.2747, "mlm_loss": 0.2747, "step": 21990 }, { "ep_loss": 0.0, "epoch": 16.97, "learning_rate": 0.0001233040201005025, "loss": 0.27, "mlm_loss": 0.27, "step": 22000 }, { "epoch": 16.97, "eval_ep_loss": -2.5629868507385254, "eval_loss": 0.25795498490333557, "eval_mlm_loss": 0.25795498490333557, "eval_runtime": 60.0008, "eval_samples_per_second": 1164.468, "eval_steps_per_second": 0.583, "step": 22000 }, { "ep_loss": 0.0, "epoch": 16.98, "learning_rate": 0.00012298994974874373, "loss": 0.264, "mlm_loss": 0.264, "step": 22010 }, { "ep_loss": 0.0, "epoch": 16.98, "learning_rate": 0.00012267587939698493, "loss": 0.2758, "mlm_loss": 0.2758, "step": 22020 }, { "ep_loss": 0.0, "epoch": 16.99, "learning_rate": 0.00012236180904522613, "loss": 0.2698, "mlm_loss": 0.2698, "step": 22030 }, { "ep_loss": 0.0, "epoch": 17.0, "learning_rate": 0.00012204773869346735, "loss": 0.2673, "mlm_loss": 0.2673, "step": 22040 }, { "ep_loss": 0.0, "epoch": 17.01, "learning_rate": 0.00012173366834170854, "loss": 0.2708, "mlm_loss": 0.2708, "step": 22050 }, { "ep_loss": 0.0, "epoch": 17.02, "learning_rate": 0.00012141959798994975, "loss": 0.2709, "mlm_loss": 0.2709, "step": 22060 }, { "ep_loss": 0.0, "epoch": 17.02, "learning_rate": 0.00012110552763819095, "loss": 0.2686, "mlm_loss": 0.2686, "step": 22070 }, { "ep_loss": 0.0, "epoch": 17.03, "learning_rate": 0.00012079145728643217, "loss": 0.2693, "mlm_loss": 0.2693, "step": 22080 }, { "ep_loss": 0.0, "epoch": 17.04, "learning_rate": 0.00012047738693467338, "loss": 0.2731, "mlm_loss": 0.2731, "step": 22090 }, { "ep_loss": 0.0, "epoch": 17.05, "learning_rate": 0.00012016331658291457, "loss": 0.2714, "mlm_loss": 0.2714, "step": 22100 }, { "epoch": 17.05, "eval_ep_loss": -2.490448236465454, "eval_loss": 0.2565537393093109, "eval_mlm_loss": 0.2565537393093109, "eval_runtime": 61.9734, "eval_samples_per_second": 1127.403, "eval_steps_per_second": 0.565, "step": 22100 }, { "ep_loss": 0.0, "epoch": 17.05, "learning_rate": 0.00011984924623115578, "loss": 0.2665, "mlm_loss": 0.2665, "step": 22110 }, { "ep_loss": 0.0, "epoch": 17.06, "learning_rate": 0.00011953517587939698, "loss": 0.2699, "mlm_loss": 0.2699, "step": 22120 }, { "ep_loss": 0.0, "epoch": 17.07, "learning_rate": 0.0001192211055276382, "loss": 0.2708, "mlm_loss": 0.2708, "step": 22130 }, { "ep_loss": 0.0, "epoch": 17.08, "learning_rate": 0.0001189070351758794, "loss": 0.2682, "mlm_loss": 0.2682, "step": 22140 }, { "ep_loss": 0.0, "epoch": 17.08, "learning_rate": 0.0001185929648241206, "loss": 0.266, "mlm_loss": 0.266, "step": 22150 }, { "ep_loss": 0.0, "epoch": 17.09, "learning_rate": 0.00011827889447236181, "loss": 0.2756, "mlm_loss": 0.2756, "step": 22160 }, { "ep_loss": 0.0, "epoch": 17.1, "learning_rate": 0.00011796482412060302, "loss": 0.2648, "mlm_loss": 0.2648, "step": 22170 }, { "ep_loss": 0.0, "epoch": 17.11, "learning_rate": 0.00011765075376884423, "loss": 0.2653, "mlm_loss": 0.2653, "step": 22180 }, { "ep_loss": 0.0, "epoch": 17.12, "learning_rate": 0.00011733668341708543, "loss": 0.2675, "mlm_loss": 0.2675, "step": 22190 }, { "ep_loss": 0.0, "epoch": 17.12, "learning_rate": 0.00011702261306532663, "loss": 0.2714, "mlm_loss": 0.2714, "step": 22200 }, { "epoch": 17.12, "eval_ep_loss": -2.5780112743377686, "eval_loss": 0.2559981644153595, "eval_mlm_loss": 0.2559981644153595, "eval_runtime": 61.1202, "eval_samples_per_second": 1143.142, "eval_steps_per_second": 0.573, "step": 22200 }, { "ep_loss": 0.0, "epoch": 17.13, "learning_rate": 0.00011670854271356785, "loss": 0.2699, "mlm_loss": 0.2699, "step": 22210 }, { "ep_loss": 0.0, "epoch": 17.14, "learning_rate": 0.00011639447236180905, "loss": 0.2647, "mlm_loss": 0.2647, "step": 22220 }, { "ep_loss": 0.0, "epoch": 17.15, "learning_rate": 0.00011608040201005025, "loss": 0.2674, "mlm_loss": 0.2674, "step": 22230 }, { "ep_loss": 0.0, "epoch": 17.15, "learning_rate": 0.00011576633165829146, "loss": 0.2671, "mlm_loss": 0.2671, "step": 22240 }, { "ep_loss": 0.0, "epoch": 17.16, "learning_rate": 0.00011545226130653266, "loss": 0.2666, "mlm_loss": 0.2666, "step": 22250 }, { "ep_loss": 0.0, "epoch": 17.17, "learning_rate": 0.00011513819095477388, "loss": 0.2697, "mlm_loss": 0.2697, "step": 22260 }, { "ep_loss": 0.0, "epoch": 17.18, "learning_rate": 0.00011482412060301507, "loss": 0.2653, "mlm_loss": 0.2653, "step": 22270 }, { "ep_loss": 0.0, "epoch": 17.18, "learning_rate": 0.00011451005025125628, "loss": 0.2708, "mlm_loss": 0.2708, "step": 22280 }, { "ep_loss": 0.0, "epoch": 17.19, "learning_rate": 0.0001141959798994975, "loss": 0.2702, "mlm_loss": 0.2702, "step": 22290 }, { "ep_loss": 0.0, "epoch": 17.2, "learning_rate": 0.0001138819095477387, "loss": 0.2716, "mlm_loss": 0.2716, "step": 22300 }, { "epoch": 17.2, "eval_ep_loss": -2.6083366870880127, "eval_loss": 0.25620949268341064, "eval_mlm_loss": 0.25620949268341064, "eval_runtime": 59.9592, "eval_samples_per_second": 1165.276, "eval_steps_per_second": 0.584, "step": 22300 }, { "ep_loss": 0.0, "epoch": 17.21, "learning_rate": 0.00011356783919597991, "loss": 0.2642, "mlm_loss": 0.2642, "step": 22310 }, { "ep_loss": 0.0, "epoch": 17.22, "learning_rate": 0.0001132537688442211, "loss": 0.271, "mlm_loss": 0.271, "step": 22320 }, { "ep_loss": 0.0, "epoch": 17.22, "learning_rate": 0.00011293969849246231, "loss": 0.2711, "mlm_loss": 0.2711, "step": 22330 }, { "ep_loss": 0.0, "epoch": 17.23, "learning_rate": 0.00011262562814070353, "loss": 0.2619, "mlm_loss": 0.2619, "step": 22340 }, { "ep_loss": 0.0, "epoch": 17.24, "learning_rate": 0.00011231155778894473, "loss": 0.2704, "mlm_loss": 0.2704, "step": 22350 }, { "ep_loss": 0.0, "epoch": 17.25, "learning_rate": 0.00011199748743718593, "loss": 0.271, "mlm_loss": 0.271, "step": 22360 }, { "ep_loss": 0.0, "epoch": 17.25, "learning_rate": 0.00011168341708542713, "loss": 0.2641, "mlm_loss": 0.2641, "step": 22370 }, { "ep_loss": 0.0, "epoch": 17.26, "learning_rate": 0.00011136934673366834, "loss": 0.2709, "mlm_loss": 0.2709, "step": 22380 }, { "ep_loss": 0.0, "epoch": 17.27, "learning_rate": 0.00011105527638190956, "loss": 0.2724, "mlm_loss": 0.2724, "step": 22390 }, { "ep_loss": 0.0, "epoch": 17.28, "learning_rate": 0.00011074120603015076, "loss": 0.2711, "mlm_loss": 0.2711, "step": 22400 }, { "epoch": 17.28, "eval_ep_loss": -2.4989588260650635, "eval_loss": 0.25615644454956055, "eval_mlm_loss": 0.25615644454956055, "eval_runtime": 60.8575, "eval_samples_per_second": 1148.076, "eval_steps_per_second": 0.575, "step": 22400 }, { "ep_loss": 0.0, "epoch": 17.28, "learning_rate": 0.00011042713567839196, "loss": 0.2696, "mlm_loss": 0.2696, "step": 22410 }, { "ep_loss": 0.0, "epoch": 17.29, "learning_rate": 0.00011011306532663316, "loss": 0.267, "mlm_loss": 0.267, "step": 22420 }, { "ep_loss": 0.0, "epoch": 17.3, "learning_rate": 0.00010979899497487438, "loss": 0.2699, "mlm_loss": 0.2699, "step": 22430 }, { "ep_loss": 0.0, "epoch": 17.31, "learning_rate": 0.00010948492462311559, "loss": 0.2641, "mlm_loss": 0.2641, "step": 22440 }, { "ep_loss": 0.0, "epoch": 17.32, "learning_rate": 0.00010917085427135678, "loss": 0.2669, "mlm_loss": 0.2669, "step": 22450 }, { "ep_loss": 0.0, "epoch": 17.32, "learning_rate": 0.000108856783919598, "loss": 0.2676, "mlm_loss": 0.2676, "step": 22460 }, { "ep_loss": 0.0, "epoch": 17.33, "learning_rate": 0.0001085427135678392, "loss": 0.2687, "mlm_loss": 0.2687, "step": 22470 }, { "ep_loss": 0.0, "epoch": 17.34, "learning_rate": 0.00010822864321608041, "loss": 0.2638, "mlm_loss": 0.2638, "step": 22480 }, { "ep_loss": 0.0, "epoch": 17.35, "learning_rate": 0.00010791457286432162, "loss": 0.2629, "mlm_loss": 0.2629, "step": 22490 }, { "ep_loss": 0.0, "epoch": 17.35, "learning_rate": 0.00010760050251256281, "loss": 0.2691, "mlm_loss": 0.2691, "step": 22500 }, { "epoch": 17.35, "eval_ep_loss": -2.5942306518554688, "eval_loss": 0.2564600706100464, "eval_mlm_loss": 0.2564600706100464, "eval_runtime": 60.7031, "eval_samples_per_second": 1150.996, "eval_steps_per_second": 0.577, "step": 22500 }, { "ep_loss": 0.0, "epoch": 17.36, "learning_rate": 0.00010728643216080403, "loss": 0.2728, "mlm_loss": 0.2728, "step": 22510 }, { "ep_loss": 0.0, "epoch": 17.37, "learning_rate": 0.00010697236180904523, "loss": 0.2725, "mlm_loss": 0.2725, "step": 22520 }, { "ep_loss": 0.0, "epoch": 17.38, "learning_rate": 0.00010665829145728644, "loss": 0.2657, "mlm_loss": 0.2657, "step": 22530 }, { "ep_loss": 0.0, "epoch": 17.39, "learning_rate": 0.00010634422110552764, "loss": 0.2654, "mlm_loss": 0.2654, "step": 22540 }, { "ep_loss": 0.0, "epoch": 17.39, "learning_rate": 0.00010603015075376884, "loss": 0.2692, "mlm_loss": 0.2692, "step": 22550 }, { "ep_loss": 0.0, "epoch": 17.4, "learning_rate": 0.00010571608040201006, "loss": 0.2702, "mlm_loss": 0.2702, "step": 22560 }, { "ep_loss": 0.0, "epoch": 17.41, "learning_rate": 0.00010540201005025126, "loss": 0.2718, "mlm_loss": 0.2718, "step": 22570 }, { "ep_loss": 0.0, "epoch": 17.42, "learning_rate": 0.00010508793969849246, "loss": 0.2681, "mlm_loss": 0.2681, "step": 22580 }, { "ep_loss": 0.0, "epoch": 17.42, "learning_rate": 0.00010477386934673367, "loss": 0.2638, "mlm_loss": 0.2638, "step": 22590 }, { "ep_loss": 0.0, "epoch": 17.43, "learning_rate": 0.00010445979899497487, "loss": 0.261, "mlm_loss": 0.261, "step": 22600 }, { "epoch": 17.43, "eval_ep_loss": -2.5921716690063477, "eval_loss": 0.25487470626831055, "eval_mlm_loss": 0.25487470626831055, "eval_runtime": 61.3275, "eval_samples_per_second": 1139.277, "eval_steps_per_second": 0.571, "step": 22600 }, { "ep_loss": 0.0, "epoch": 17.44, "learning_rate": 0.00010414572864321609, "loss": 0.271, "mlm_loss": 0.271, "step": 22610 }, { "ep_loss": 0.0, "epoch": 17.45, "learning_rate": 0.00010383165829145729, "loss": 0.2708, "mlm_loss": 0.2708, "step": 22620 }, { "ep_loss": 0.0, "epoch": 17.45, "learning_rate": 0.00010351758793969849, "loss": 0.2573, "mlm_loss": 0.2573, "step": 22630 }, { "ep_loss": 0.0, "epoch": 17.46, "learning_rate": 0.0001032035175879397, "loss": 0.2669, "mlm_loss": 0.2669, "step": 22640 }, { "ep_loss": 0.0, "epoch": 17.47, "learning_rate": 0.00010288944723618091, "loss": 0.2667, "mlm_loss": 0.2667, "step": 22650 }, { "ep_loss": 0.0, "epoch": 17.48, "learning_rate": 0.00010257537688442212, "loss": 0.2699, "mlm_loss": 0.2699, "step": 22660 }, { "ep_loss": 0.0, "epoch": 17.49, "learning_rate": 0.00010226130653266331, "loss": 0.2671, "mlm_loss": 0.2671, "step": 22670 }, { "ep_loss": 0.0, "epoch": 17.49, "learning_rate": 0.00010194723618090452, "loss": 0.266, "mlm_loss": 0.266, "step": 22680 }, { "ep_loss": 0.0, "epoch": 17.5, "learning_rate": 0.00010163316582914572, "loss": 0.2624, "mlm_loss": 0.2624, "step": 22690 }, { "ep_loss": 0.0, "epoch": 17.51, "learning_rate": 0.00010131909547738694, "loss": 0.2669, "mlm_loss": 0.2669, "step": 22700 }, { "epoch": 17.51, "eval_ep_loss": -2.6028358936309814, "eval_loss": 0.2549237608909607, "eval_mlm_loss": 0.2549237608909607, "eval_runtime": 62.0888, "eval_samples_per_second": 1125.307, "eval_steps_per_second": 0.564, "step": 22700 }, { "ep_loss": 0.0, "epoch": 17.52, "learning_rate": 0.00010100502512562815, "loss": 0.2631, "mlm_loss": 0.2631, "step": 22710 }, { "ep_loss": 0.0, "epoch": 17.52, "learning_rate": 0.00010069095477386934, "loss": 0.2639, "mlm_loss": 0.2639, "step": 22720 }, { "ep_loss": 0.0, "epoch": 17.53, "learning_rate": 0.00010037688442211056, "loss": 0.2634, "mlm_loss": 0.2634, "step": 22730 }, { "ep_loss": 0.0, "epoch": 17.54, "learning_rate": 0.00010006281407035176, "loss": 0.2655, "mlm_loss": 0.2655, "step": 22740 }, { "ep_loss": 0.0, "epoch": 17.55, "learning_rate": 9.974874371859297e-05, "loss": 0.2636, "mlm_loss": 0.2636, "step": 22750 }, { "ep_loss": 0.0, "epoch": 17.55, "learning_rate": 9.943467336683417e-05, "loss": 0.2679, "mlm_loss": 0.2679, "step": 22760 }, { "ep_loss": 0.0, "epoch": 17.56, "learning_rate": 9.912060301507537e-05, "loss": 0.2624, "mlm_loss": 0.2624, "step": 22770 }, { "ep_loss": 0.0, "epoch": 17.57, "learning_rate": 9.880653266331659e-05, "loss": 0.2657, "mlm_loss": 0.2657, "step": 22780 }, { "ep_loss": 0.0, "epoch": 17.58, "learning_rate": 9.849246231155779e-05, "loss": 0.2611, "mlm_loss": 0.2611, "step": 22790 }, { "ep_loss": 0.0, "epoch": 17.59, "learning_rate": 9.817839195979899e-05, "loss": 0.2659, "mlm_loss": 0.2659, "step": 22800 }, { "epoch": 17.59, "eval_ep_loss": -2.7309961318969727, "eval_loss": 0.2529388666152954, "eval_mlm_loss": 0.2529388666152954, "eval_runtime": 59.9188, "eval_samples_per_second": 1166.061, "eval_steps_per_second": 0.584, "step": 22800 }, { "ep_loss": 0.0, "epoch": 17.59, "learning_rate": 9.78643216080402e-05, "loss": 0.2633, "mlm_loss": 0.2633, "step": 22810 }, { "ep_loss": 0.0, "epoch": 17.6, "learning_rate": 9.75502512562814e-05, "loss": 0.2594, "mlm_loss": 0.2594, "step": 22820 }, { "ep_loss": 0.0, "epoch": 17.61, "learning_rate": 9.723618090452262e-05, "loss": 0.2642, "mlm_loss": 0.2642, "step": 22830 }, { "ep_loss": 0.0, "epoch": 17.62, "learning_rate": 9.692211055276382e-05, "loss": 0.2612, "mlm_loss": 0.2612, "step": 22840 }, { "ep_loss": 0.0, "epoch": 17.62, "learning_rate": 9.660804020100502e-05, "loss": 0.2754, "mlm_loss": 0.2754, "step": 22850 }, { "ep_loss": 0.0, "epoch": 17.63, "learning_rate": 9.629396984924624e-05, "loss": 0.2599, "mlm_loss": 0.2599, "step": 22860 }, { "ep_loss": 0.0, "epoch": 17.64, "learning_rate": 9.597989949748744e-05, "loss": 0.266, "mlm_loss": 0.266, "step": 22870 }, { "ep_loss": 0.0, "epoch": 17.65, "learning_rate": 9.566582914572865e-05, "loss": 0.2661, "mlm_loss": 0.2661, "step": 22880 }, { "ep_loss": 0.0, "epoch": 17.66, "learning_rate": 9.535175879396984e-05, "loss": 0.2681, "mlm_loss": 0.2681, "step": 22890 }, { "ep_loss": 0.0, "epoch": 17.66, "learning_rate": 9.503768844221105e-05, "loss": 0.2664, "mlm_loss": 0.2664, "step": 22900 }, { "epoch": 17.66, "eval_ep_loss": -2.7242908477783203, "eval_loss": 0.2529243230819702, "eval_mlm_loss": 0.2529243230819702, "eval_runtime": 60.8076, "eval_samples_per_second": 1149.017, "eval_steps_per_second": 0.576, "step": 22900 }, { "ep_loss": 0.0, "epoch": 17.67, "learning_rate": 9.472361809045227e-05, "loss": 0.2613, "mlm_loss": 0.2613, "step": 22910 }, { "ep_loss": 0.0, "epoch": 17.68, "learning_rate": 9.440954773869347e-05, "loss": 0.2628, "mlm_loss": 0.2628, "step": 22920 }, { "ep_loss": 0.0, "epoch": 17.69, "learning_rate": 9.409547738693468e-05, "loss": 0.2671, "mlm_loss": 0.2671, "step": 22930 }, { "ep_loss": 0.0, "epoch": 17.69, "learning_rate": 9.378140703517587e-05, "loss": 0.2709, "mlm_loss": 0.2709, "step": 22940 }, { "ep_loss": 0.0, "epoch": 17.7, "learning_rate": 9.346733668341709e-05, "loss": 0.2661, "mlm_loss": 0.2661, "step": 22950 }, { "ep_loss": 0.0, "epoch": 17.71, "learning_rate": 9.31532663316583e-05, "loss": 0.2568, "mlm_loss": 0.2568, "step": 22960 }, { "ep_loss": 0.0, "epoch": 17.72, "learning_rate": 9.28391959798995e-05, "loss": 0.2658, "mlm_loss": 0.2658, "step": 22970 }, { "ep_loss": 0.0, "epoch": 17.72, "learning_rate": 9.25251256281407e-05, "loss": 0.2663, "mlm_loss": 0.2663, "step": 22980 }, { "ep_loss": 0.0, "epoch": 17.73, "learning_rate": 9.22110552763819e-05, "loss": 0.2705, "mlm_loss": 0.2705, "step": 22990 }, { "ep_loss": 0.0, "epoch": 17.74, "learning_rate": 9.189698492462312e-05, "loss": 0.2629, "mlm_loss": 0.2629, "step": 23000 }, { "epoch": 17.74, "eval_ep_loss": -2.727203369140625, "eval_loss": 0.25174808502197266, "eval_mlm_loss": 0.25174808502197266, "eval_runtime": 59.8058, "eval_samples_per_second": 1168.265, "eval_steps_per_second": 0.585, "step": 23000 }, { "ep_loss": 0.0, "epoch": 17.75, "learning_rate": 9.158291457286433e-05, "loss": 0.2614, "mlm_loss": 0.2614, "step": 23010 }, { "ep_loss": 0.0, "epoch": 17.76, "learning_rate": 9.126884422110553e-05, "loss": 0.2647, "mlm_loss": 0.2647, "step": 23020 }, { "ep_loss": 0.0, "epoch": 17.76, "learning_rate": 9.095477386934673e-05, "loss": 0.2642, "mlm_loss": 0.2642, "step": 23030 }, { "ep_loss": 0.0, "epoch": 17.77, "learning_rate": 9.064070351758794e-05, "loss": 0.2566, "mlm_loss": 0.2566, "step": 23040 }, { "ep_loss": 0.0, "epoch": 17.78, "learning_rate": 9.032663316582915e-05, "loss": 0.2607, "mlm_loss": 0.2607, "step": 23050 }, { "ep_loss": 0.0, "epoch": 17.79, "learning_rate": 9.001256281407036e-05, "loss": 0.2619, "mlm_loss": 0.2619, "step": 23060 }, { "ep_loss": 0.0, "epoch": 17.79, "learning_rate": 8.969849246231155e-05, "loss": 0.2628, "mlm_loss": 0.2628, "step": 23070 }, { "ep_loss": 0.0, "epoch": 17.8, "learning_rate": 8.938442211055277e-05, "loss": 0.2601, "mlm_loss": 0.2601, "step": 23080 }, { "ep_loss": 0.0, "epoch": 17.81, "learning_rate": 8.907035175879397e-05, "loss": 0.2662, "mlm_loss": 0.2662, "step": 23090 }, { "ep_loss": 0.0, "epoch": 17.82, "learning_rate": 8.875628140703518e-05, "loss": 0.2649, "mlm_loss": 0.2649, "step": 23100 }, { "epoch": 17.82, "eval_ep_loss": -2.70764422416687, "eval_loss": 0.2541521489620209, "eval_mlm_loss": 0.2541521489620209, "eval_runtime": 61.2146, "eval_samples_per_second": 1141.379, "eval_steps_per_second": 0.572, "step": 23100 }, { "ep_loss": 0.0, "epoch": 17.82, "learning_rate": 8.844221105527638e-05, "loss": 0.2686, "mlm_loss": 0.2686, "step": 23110 }, { "ep_loss": 0.0, "epoch": 17.83, "learning_rate": 8.812814070351758e-05, "loss": 0.2653, "mlm_loss": 0.2653, "step": 23120 }, { "ep_loss": 0.0, "epoch": 17.84, "learning_rate": 8.78140703517588e-05, "loss": 0.2673, "mlm_loss": 0.2673, "step": 23130 }, { "ep_loss": 0.0, "epoch": 17.85, "learning_rate": 8.75e-05, "loss": 0.2599, "mlm_loss": 0.2599, "step": 23140 }, { "ep_loss": 0.0, "epoch": 17.86, "learning_rate": 8.718592964824121e-05, "loss": 0.2596, "mlm_loss": 0.2596, "step": 23150 }, { "ep_loss": 0.0, "epoch": 17.86, "learning_rate": 8.687185929648241e-05, "loss": 0.2661, "mlm_loss": 0.2661, "step": 23160 }, { "ep_loss": 0.0, "epoch": 17.87, "learning_rate": 8.655778894472362e-05, "loss": 0.2628, "mlm_loss": 0.2628, "step": 23170 }, { "ep_loss": 0.0, "epoch": 17.88, "learning_rate": 8.624371859296483e-05, "loss": 0.267, "mlm_loss": 0.267, "step": 23180 }, { "ep_loss": 0.0, "epoch": 17.89, "learning_rate": 8.592964824120603e-05, "loss": 0.262, "mlm_loss": 0.262, "step": 23190 }, { "ep_loss": 0.0, "epoch": 17.89, "learning_rate": 8.561557788944723e-05, "loss": 0.263, "mlm_loss": 0.263, "step": 23200 }, { "epoch": 17.89, "eval_ep_loss": -2.7034008502960205, "eval_loss": 0.2510799467563629, "eval_mlm_loss": 0.2510799467563629, "eval_runtime": 61.803, "eval_samples_per_second": 1130.511, "eval_steps_per_second": 0.566, "step": 23200 }, { "ep_loss": 0.0, "epoch": 17.9, "learning_rate": 8.530150753768845e-05, "loss": 0.2631, "mlm_loss": 0.2631, "step": 23210 }, { "ep_loss": 0.0, "epoch": 17.91, "learning_rate": 8.498743718592965e-05, "loss": 0.2609, "mlm_loss": 0.2609, "step": 23220 }, { "ep_loss": 0.0, "epoch": 17.92, "learning_rate": 8.467336683417086e-05, "loss": 0.2605, "mlm_loss": 0.2605, "step": 23230 }, { "ep_loss": 0.0, "epoch": 17.93, "learning_rate": 8.435929648241206e-05, "loss": 0.2648, "mlm_loss": 0.2648, "step": 23240 }, { "ep_loss": 0.0, "epoch": 17.93, "learning_rate": 8.404522613065326e-05, "loss": 0.2553, "mlm_loss": 0.2553, "step": 23250 }, { "ep_loss": 0.0, "epoch": 17.94, "learning_rate": 8.373115577889448e-05, "loss": 0.2594, "mlm_loss": 0.2594, "step": 23260 }, { "ep_loss": 0.0, "epoch": 17.95, "learning_rate": 8.341708542713568e-05, "loss": 0.2585, "mlm_loss": 0.2585, "step": 23270 }, { "ep_loss": 0.0, "epoch": 17.96, "learning_rate": 8.31030150753769e-05, "loss": 0.2671, "mlm_loss": 0.2671, "step": 23280 }, { "ep_loss": 0.0, "epoch": 17.96, "learning_rate": 8.278894472361808e-05, "loss": 0.261, "mlm_loss": 0.261, "step": 23290 }, { "ep_loss": 0.0, "epoch": 17.97, "learning_rate": 8.24748743718593e-05, "loss": 0.2635, "mlm_loss": 0.2635, "step": 23300 }, { "epoch": 17.97, "eval_ep_loss": -2.886038064956665, "eval_loss": 0.24997316300868988, "eval_mlm_loss": 0.24997316300868988, "eval_runtime": 61.3957, "eval_samples_per_second": 1138.011, "eval_steps_per_second": 0.57, "step": 23300 }, { "ep_loss": 0.0, "epoch": 17.98, "learning_rate": 8.216080402010051e-05, "loss": 0.2657, "mlm_loss": 0.2657, "step": 23310 }, { "ep_loss": 0.0, "epoch": 17.99, "learning_rate": 8.184673366834171e-05, "loss": 0.2707, "mlm_loss": 0.2707, "step": 23320 }, { "ep_loss": 0.0, "epoch": 17.99, "learning_rate": 8.153266331658293e-05, "loss": 0.2651, "mlm_loss": 0.2651, "step": 23330 }, { "ep_loss": 0.0, "epoch": 18.0, "learning_rate": 8.121859296482411e-05, "loss": 0.2605, "mlm_loss": 0.2605, "step": 23340 }, { "ep_loss": 0.0, "epoch": 18.01, "learning_rate": 8.090452261306533e-05, "loss": 0.265, "mlm_loss": 0.265, "step": 23350 }, { "ep_loss": 0.0, "epoch": 18.02, "learning_rate": 8.059045226130654e-05, "loss": 0.264, "mlm_loss": 0.264, "step": 23360 }, { "ep_loss": 0.0, "epoch": 18.03, "learning_rate": 8.027638190954774e-05, "loss": 0.2656, "mlm_loss": 0.2656, "step": 23370 }, { "ep_loss": 0.0, "epoch": 18.03, "learning_rate": 7.996231155778894e-05, "loss": 0.2578, "mlm_loss": 0.2578, "step": 23380 }, { "ep_loss": 0.0, "epoch": 18.04, "learning_rate": 7.964824120603015e-05, "loss": 0.264, "mlm_loss": 0.264, "step": 23390 }, { "ep_loss": 0.0, "epoch": 18.05, "learning_rate": 7.933417085427136e-05, "loss": 0.2619, "mlm_loss": 0.2619, "step": 23400 }, { "epoch": 18.05, "eval_ep_loss": -2.918891668319702, "eval_loss": 0.24930217862129211, "eval_mlm_loss": 0.24930217862129211, "eval_runtime": 61.8569, "eval_samples_per_second": 1129.526, "eval_steps_per_second": 0.566, "step": 23400 }, { "ep_loss": 0.0, "epoch": 18.06, "learning_rate": 7.902010050251257e-05, "loss": 0.2669, "mlm_loss": 0.2669, "step": 23410 }, { "ep_loss": 0.0, "epoch": 18.06, "learning_rate": 7.870603015075376e-05, "loss": 0.2615, "mlm_loss": 0.2615, "step": 23420 }, { "ep_loss": 0.0, "epoch": 18.07, "learning_rate": 7.839195979899498e-05, "loss": 0.264, "mlm_loss": 0.264, "step": 23430 }, { "ep_loss": 0.0, "epoch": 18.08, "learning_rate": 7.807788944723618e-05, "loss": 0.2578, "mlm_loss": 0.2578, "step": 23440 }, { "ep_loss": 0.0, "epoch": 18.09, "learning_rate": 7.776381909547739e-05, "loss": 0.2566, "mlm_loss": 0.2566, "step": 23450 }, { "ep_loss": 0.0, "epoch": 18.09, "learning_rate": 7.744974874371861e-05, "loss": 0.2607, "mlm_loss": 0.2607, "step": 23460 }, { "ep_loss": 0.0, "epoch": 18.1, "learning_rate": 7.71356783919598e-05, "loss": 0.2637, "mlm_loss": 0.2637, "step": 23470 }, { "ep_loss": 0.0, "epoch": 18.11, "learning_rate": 7.682160804020101e-05, "loss": 0.2658, "mlm_loss": 0.2658, "step": 23480 }, { "ep_loss": 0.0, "epoch": 18.12, "learning_rate": 7.650753768844221e-05, "loss": 0.2622, "mlm_loss": 0.2622, "step": 23490 }, { "ep_loss": 0.0, "epoch": 18.13, "learning_rate": 7.619346733668342e-05, "loss": 0.263, "mlm_loss": 0.263, "step": 23500 }, { "epoch": 18.13, "eval_ep_loss": -2.890554189682007, "eval_loss": 0.24877700209617615, "eval_mlm_loss": 0.24877700209617615, "eval_runtime": 63.4371, "eval_samples_per_second": 1101.39, "eval_steps_per_second": 0.552, "step": 23500 }, { "ep_loss": 0.0, "epoch": 18.13, "learning_rate": 7.587939698492463e-05, "loss": 0.2602, "mlm_loss": 0.2602, "step": 23510 }, { "ep_loss": 0.0, "epoch": 18.14, "learning_rate": 7.556532663316583e-05, "loss": 0.2559, "mlm_loss": 0.2559, "step": 23520 }, { "ep_loss": 0.0, "epoch": 18.15, "learning_rate": 7.525125628140704e-05, "loss": 0.2594, "mlm_loss": 0.2594, "step": 23530 }, { "ep_loss": 0.0, "epoch": 18.16, "learning_rate": 7.493718592964824e-05, "loss": 0.264, "mlm_loss": 0.264, "step": 23540 }, { "ep_loss": 0.0, "epoch": 18.16, "learning_rate": 7.462311557788946e-05, "loss": 0.2593, "mlm_loss": 0.2593, "step": 23550 }, { "ep_loss": 0.0, "epoch": 18.17, "learning_rate": 7.430904522613066e-05, "loss": 0.2592, "mlm_loss": 0.2592, "step": 23560 }, { "ep_loss": 0.0, "epoch": 18.18, "learning_rate": 7.399497487437186e-05, "loss": 0.2568, "mlm_loss": 0.2568, "step": 23570 }, { "ep_loss": 0.0, "epoch": 18.19, "learning_rate": 7.368090452261307e-05, "loss": 0.2656, "mlm_loss": 0.2656, "step": 23580 }, { "ep_loss": 0.0, "epoch": 18.2, "learning_rate": 7.336683417085427e-05, "loss": 0.2653, "mlm_loss": 0.2653, "step": 23590 }, { "ep_loss": 0.0, "epoch": 18.2, "learning_rate": 7.305276381909547e-05, "loss": 0.2628, "mlm_loss": 0.2628, "step": 23600 }, { "epoch": 18.2, "eval_ep_loss": -2.838624954223633, "eval_loss": 0.24834777414798737, "eval_mlm_loss": 0.24834777414798737, "eval_runtime": 60.4243, "eval_samples_per_second": 1156.306, "eval_steps_per_second": 0.579, "step": 23600 }, { "ep_loss": 0.0, "epoch": 18.21, "learning_rate": 7.273869346733669e-05, "loss": 0.2593, "mlm_loss": 0.2593, "step": 23610 }, { "ep_loss": 0.0, "epoch": 18.22, "learning_rate": 7.242462311557789e-05, "loss": 0.2574, "mlm_loss": 0.2574, "step": 23620 }, { "ep_loss": 0.0, "epoch": 18.23, "learning_rate": 7.21105527638191e-05, "loss": 0.2586, "mlm_loss": 0.2586, "step": 23630 }, { "ep_loss": 0.0, "epoch": 18.23, "learning_rate": 7.179648241206029e-05, "loss": 0.2541, "mlm_loss": 0.2541, "step": 23640 }, { "ep_loss": 0.0, "epoch": 18.24, "learning_rate": 7.148241206030151e-05, "loss": 0.2589, "mlm_loss": 0.2589, "step": 23650 }, { "ep_loss": 0.0, "epoch": 18.25, "learning_rate": 7.116834170854271e-05, "loss": 0.2613, "mlm_loss": 0.2613, "step": 23660 }, { "ep_loss": 0.0, "epoch": 18.26, "learning_rate": 7.085427135678392e-05, "loss": 0.2609, "mlm_loss": 0.2609, "step": 23670 }, { "ep_loss": 0.0, "epoch": 18.26, "learning_rate": 7.054020100502514e-05, "loss": 0.2599, "mlm_loss": 0.2599, "step": 23680 }, { "ep_loss": 0.0, "epoch": 18.27, "learning_rate": 7.022613065326632e-05, "loss": 0.2592, "mlm_loss": 0.2592, "step": 23690 }, { "ep_loss": 0.0, "epoch": 18.28, "learning_rate": 6.991206030150754e-05, "loss": 0.2581, "mlm_loss": 0.2581, "step": 23700 }, { "epoch": 18.28, "eval_ep_loss": -2.8471715450286865, "eval_loss": 0.2473229169845581, "eval_mlm_loss": 0.2473229169845581, "eval_runtime": 59.2193, "eval_samples_per_second": 1179.835, "eval_steps_per_second": 0.591, "step": 23700 }, { "ep_loss": 0.0, "epoch": 18.29, "learning_rate": 6.959798994974874e-05, "loss": 0.2597, "mlm_loss": 0.2597, "step": 23710 }, { "ep_loss": 0.0, "epoch": 18.3, "learning_rate": 6.928391959798995e-05, "loss": 0.2639, "mlm_loss": 0.2639, "step": 23720 }, { "ep_loss": 0.0, "epoch": 18.3, "learning_rate": 6.896984924623116e-05, "loss": 0.2662, "mlm_loss": 0.2662, "step": 23730 }, { "ep_loss": 0.0, "epoch": 18.31, "learning_rate": 6.865577889447236e-05, "loss": 0.2628, "mlm_loss": 0.2628, "step": 23740 }, { "ep_loss": 0.0, "epoch": 18.32, "learning_rate": 6.834170854271357e-05, "loss": 0.2624, "mlm_loss": 0.2624, "step": 23750 }, { "ep_loss": 0.0, "epoch": 18.33, "learning_rate": 6.802763819095477e-05, "loss": 0.2588, "mlm_loss": 0.2588, "step": 23760 }, { "ep_loss": 0.0, "epoch": 18.33, "learning_rate": 6.771356783919599e-05, "loss": 0.2562, "mlm_loss": 0.2562, "step": 23770 }, { "ep_loss": 0.0, "epoch": 18.34, "learning_rate": 6.739949748743719e-05, "loss": 0.2569, "mlm_loss": 0.2569, "step": 23780 }, { "ep_loss": 0.0, "epoch": 18.35, "learning_rate": 6.708542713567839e-05, "loss": 0.2638, "mlm_loss": 0.2638, "step": 23790 }, { "ep_loss": 0.0, "epoch": 18.36, "learning_rate": 6.67713567839196e-05, "loss": 0.2615, "mlm_loss": 0.2615, "step": 23800 }, { "epoch": 18.36, "eval_ep_loss": -2.8369219303131104, "eval_loss": 0.24757327139377594, "eval_mlm_loss": 0.24757327139377594, "eval_runtime": 60.3589, "eval_samples_per_second": 1157.56, "eval_steps_per_second": 0.58, "step": 23800 }, { "ep_loss": 0.0, "epoch": 18.36, "learning_rate": 6.64572864321608e-05, "loss": 0.2526, "mlm_loss": 0.2526, "step": 23810 }, { "ep_loss": 0.0, "epoch": 18.37, "learning_rate": 6.6143216080402e-05, "loss": 0.261, "mlm_loss": 0.261, "step": 23820 }, { "ep_loss": 0.0, "epoch": 18.38, "learning_rate": 6.582914572864322e-05, "loss": 0.2656, "mlm_loss": 0.2656, "step": 23830 }, { "ep_loss": 0.0, "epoch": 18.39, "learning_rate": 6.551507537688442e-05, "loss": 0.2613, "mlm_loss": 0.2613, "step": 23840 }, { "ep_loss": 0.0, "epoch": 18.4, "learning_rate": 6.520100502512563e-05, "loss": 0.2593, "mlm_loss": 0.2593, "step": 23850 }, { "ep_loss": 0.0, "epoch": 18.4, "learning_rate": 6.488693467336684e-05, "loss": 0.257, "mlm_loss": 0.257, "step": 23860 }, { "ep_loss": 0.0, "epoch": 18.41, "learning_rate": 6.457286432160804e-05, "loss": 0.2612, "mlm_loss": 0.2612, "step": 23870 }, { "ep_loss": 0.0, "epoch": 18.42, "learning_rate": 6.425879396984925e-05, "loss": 0.2613, "mlm_loss": 0.2613, "step": 23880 }, { "ep_loss": 0.0, "epoch": 18.43, "learning_rate": 6.394472361809045e-05, "loss": 0.2565, "mlm_loss": 0.2565, "step": 23890 }, { "ep_loss": 0.0, "epoch": 18.43, "learning_rate": 6.363065326633167e-05, "loss": 0.2593, "mlm_loss": 0.2593, "step": 23900 }, { "epoch": 18.43, "eval_ep_loss": -2.8520493507385254, "eval_loss": 0.24821537733078003, "eval_mlm_loss": 0.24821537733078003, "eval_runtime": 61.6761, "eval_samples_per_second": 1132.838, "eval_steps_per_second": 0.567, "step": 23900 }, { "ep_loss": 0.0, "epoch": 18.44, "learning_rate": 6.331658291457285e-05, "loss": 0.2584, "mlm_loss": 0.2584, "step": 23910 }, { "ep_loss": 0.0, "epoch": 18.45, "learning_rate": 6.300251256281407e-05, "loss": 0.2539, "mlm_loss": 0.2539, "step": 23920 }, { "ep_loss": 0.0, "epoch": 18.46, "learning_rate": 6.268844221105528e-05, "loss": 0.2523, "mlm_loss": 0.2523, "step": 23930 }, { "ep_loss": 0.0, "epoch": 18.47, "learning_rate": 6.237437185929648e-05, "loss": 0.2623, "mlm_loss": 0.2623, "step": 23940 }, { "ep_loss": 0.0, "epoch": 18.47, "learning_rate": 6.206030150753769e-05, "loss": 0.2562, "mlm_loss": 0.2562, "step": 23950 }, { "ep_loss": 0.0, "epoch": 18.48, "learning_rate": 6.17462311557789e-05, "loss": 0.2622, "mlm_loss": 0.2622, "step": 23960 }, { "ep_loss": 0.0, "epoch": 18.49, "learning_rate": 6.14321608040201e-05, "loss": 0.2625, "mlm_loss": 0.2625, "step": 23970 }, { "ep_loss": 0.0, "epoch": 18.5, "learning_rate": 6.11180904522613e-05, "loss": 0.2579, "mlm_loss": 0.2579, "step": 23980 }, { "ep_loss": 0.0, "epoch": 18.5, "learning_rate": 6.080402010050251e-05, "loss": 0.255, "mlm_loss": 0.255, "step": 23990 }, { "ep_loss": 0.0, "epoch": 18.51, "learning_rate": 6.0489949748743724e-05, "loss": 0.261, "mlm_loss": 0.261, "step": 24000 }, { "epoch": 18.51, "eval_ep_loss": -2.703091859817505, "eval_loss": 0.24630548059940338, "eval_mlm_loss": 0.24630548059940338, "eval_runtime": 60.8068, "eval_samples_per_second": 1149.033, "eval_steps_per_second": 0.576, "step": 24000 } ], "max_steps": 25920, "num_train_epochs": 20, "total_flos": 6.513109813218509e+18, "trial_name": null, "trial_params": null }