diff --git "a/checkpoint-18000/trainer_state.json" "b/checkpoint-18000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-18000/trainer_state.json" @@ -0,0 +1,16216 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.883532587736212, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ep_loss": 0.0, + "epoch": 0.01, + "learning_rate": 5e-07, + "loss": 10.4763, + "mlm_loss": 10.4763, + "step": 10 + }, + { + "ep_loss": 0.0, + "epoch": 0.02, + "learning_rate": 1e-06, + "loss": 10.3052, + "mlm_loss": 10.3052, + "step": 20 + }, + { + "ep_loss": 0.0, + "epoch": 0.02, + "learning_rate": 1.5e-06, + "loss": 9.9641, + "mlm_loss": 9.9641, + "step": 30 + }, + { + "ep_loss": 0.0, + "epoch": 0.03, + "learning_rate": 2e-06, + "loss": 9.5354, + "mlm_loss": 9.5354, + "step": 40 + }, + { + "ep_loss": 0.0, + "epoch": 0.04, + "learning_rate": 2.5e-06, + "loss": 9.1346, + "mlm_loss": 9.1346, + "step": 50 + }, + { + "ep_loss": 0.0, + "epoch": 0.05, + "learning_rate": 3e-06, + "loss": 8.8162, + "mlm_loss": 8.8162, + "step": 60 + }, + { + "ep_loss": 0.0, + "epoch": 0.05, + "learning_rate": 3.5e-06, + "loss": 8.558, + "mlm_loss": 8.558, + "step": 70 + }, + { + "ep_loss": 0.0, + "epoch": 0.06, + "learning_rate": 4e-06, + "loss": 8.3459, + "mlm_loss": 8.3459, + "step": 80 + }, + { + "ep_loss": 0.0, + "epoch": 0.07, + "learning_rate": 4.5e-06, + "loss": 8.175, + "mlm_loss": 8.175, + "step": 90 + }, + { + "ep_loss": 0.0, + "epoch": 0.08, + "learning_rate": 5e-06, + "loss": 8.0255, + "mlm_loss": 8.0255, + "step": 100 + }, + { + "epoch": 0.08, + "eval_ep_loss": -0.14438027143478394, + "eval_loss": 7.826408386230469, + "eval_mlm_loss": 7.826408386230469, + "eval_runtime": 63.5165, + "eval_samples_per_second": 1100.014, + "eval_steps_per_second": 0.551, + "step": 100 + }, + { + "ep_loss": 0.0, + "epoch": 0.08, + "learning_rate": 5.5e-06, + "loss": 7.8979, + "mlm_loss": 7.8979, + "step": 110 + }, + { + "ep_loss": 0.0, + "epoch": 0.09, + "learning_rate": 6e-06, + "loss": 7.7543, + "mlm_loss": 7.7543, + "step": 120 + }, + { + "ep_loss": 0.0, + "epoch": 0.1, + "learning_rate": 6.5e-06, + "loss": 7.5942, + "mlm_loss": 7.5942, + "step": 130 + }, + { + "ep_loss": 0.0, + "epoch": 0.11, + "learning_rate": 7e-06, + "loss": 7.4304, + "mlm_loss": 7.4304, + "step": 140 + }, + { + "ep_loss": 0.0, + "epoch": 0.12, + "learning_rate": 7.5e-06, + "loss": 7.1862, + "mlm_loss": 7.1862, + "step": 150 + }, + { + "ep_loss": 0.0, + "epoch": 0.12, + "learning_rate": 8e-06, + "loss": 6.924, + "mlm_loss": 6.924, + "step": 160 + }, + { + "ep_loss": 0.0, + "epoch": 0.13, + "learning_rate": 8.500000000000002e-06, + "loss": 6.6521, + "mlm_loss": 6.6521, + "step": 170 + }, + { + "ep_loss": 0.0, + "epoch": 0.14, + "learning_rate": 9e-06, + "loss": 6.3639, + "mlm_loss": 6.3639, + "step": 180 + }, + { + "ep_loss": 0.0, + "epoch": 0.15, + "learning_rate": 9.5e-06, + "loss": 6.0915, + "mlm_loss": 6.0915, + "step": 190 + }, + { + "ep_loss": 0.0, + "epoch": 0.15, + "learning_rate": 1e-05, + "loss": 5.8272, + "mlm_loss": 5.8272, + "step": 200 + }, + { + "epoch": 0.15, + "eval_ep_loss": -0.4913683831691742, + "eval_loss": 5.531888008117676, + "eval_mlm_loss": 5.531888008117676, + "eval_runtime": 60.9938, + "eval_samples_per_second": 1145.51, + "eval_steps_per_second": 0.574, + "step": 200 + }, + { + "ep_loss": 0.0, + "epoch": 0.16, + "learning_rate": 1.0500000000000001e-05, + "loss": 5.5749, + "mlm_loss": 5.5749, + "step": 210 + }, + { + "ep_loss": 0.0, + "epoch": 0.17, + "learning_rate": 1.1e-05, + "loss": 5.3413, + "mlm_loss": 5.3413, + "step": 220 + }, + { + "ep_loss": 0.0, + "epoch": 0.18, + "learning_rate": 1.15e-05, + "loss": 5.142, + "mlm_loss": 5.142, + "step": 230 + }, + { + "ep_loss": 0.0, + "epoch": 0.19, + "learning_rate": 1.2e-05, + "loss": 4.925, + "mlm_loss": 4.925, + "step": 240 + }, + { + "ep_loss": 0.0, + "epoch": 0.19, + "learning_rate": 1.25e-05, + "loss": 4.7334, + "mlm_loss": 4.7334, + "step": 250 + }, + { + "ep_loss": 0.0, + "epoch": 0.2, + "learning_rate": 1.3e-05, + "loss": 4.5702, + "mlm_loss": 4.5702, + "step": 260 + }, + { + "ep_loss": 0.0, + "epoch": 0.21, + "learning_rate": 1.35e-05, + "loss": 4.4388, + "mlm_loss": 4.4388, + "step": 270 + }, + { + "ep_loss": 0.0, + "epoch": 0.22, + "learning_rate": 1.4e-05, + "loss": 4.288, + "mlm_loss": 4.288, + "step": 280 + }, + { + "ep_loss": 0.0, + "epoch": 0.22, + "learning_rate": 1.4500000000000002e-05, + "loss": 4.1403, + "mlm_loss": 4.1403, + "step": 290 + }, + { + "ep_loss": 0.0, + "epoch": 0.23, + "learning_rate": 1.5e-05, + "loss": 4.0232, + "mlm_loss": 4.0232, + "step": 300 + }, + { + "epoch": 0.23, + "eval_ep_loss": -1.1010401248931885, + "eval_loss": 3.8623688220977783, + "eval_mlm_loss": 3.8623688220977783, + "eval_runtime": 61.8594, + "eval_samples_per_second": 1129.481, + "eval_steps_per_second": 0.566, + "step": 300 + }, + { + "ep_loss": 0.0, + "epoch": 0.24, + "learning_rate": 1.55e-05, + "loss": 3.9135, + "mlm_loss": 3.9135, + "step": 310 + }, + { + "ep_loss": 0.0, + "epoch": 0.25, + "learning_rate": 1.6e-05, + "loss": 3.8252, + "mlm_loss": 3.8252, + "step": 320 + }, + { + "ep_loss": 0.0, + "epoch": 0.25, + "learning_rate": 1.65e-05, + "loss": 3.7462, + "mlm_loss": 3.7462, + "step": 330 + }, + { + "ep_loss": 0.0, + "epoch": 0.26, + "learning_rate": 1.7000000000000003e-05, + "loss": 3.6491, + "mlm_loss": 3.6491, + "step": 340 + }, + { + "ep_loss": 0.0, + "epoch": 0.27, + "learning_rate": 1.7500000000000002e-05, + "loss": 3.5815, + "mlm_loss": 3.5815, + "step": 350 + }, + { + "ep_loss": 0.0, + "epoch": 0.28, + "learning_rate": 1.8e-05, + "loss": 3.5211, + "mlm_loss": 3.5211, + "step": 360 + }, + { + "ep_loss": 0.0, + "epoch": 0.29, + "learning_rate": 1.85e-05, + "loss": 3.452, + "mlm_loss": 3.452, + "step": 370 + }, + { + "ep_loss": 0.0, + "epoch": 0.29, + "learning_rate": 1.9e-05, + "loss": 3.3747, + "mlm_loss": 3.3747, + "step": 380 + }, + { + "ep_loss": 0.0, + "epoch": 0.3, + "learning_rate": 1.95e-05, + "loss": 3.3163, + "mlm_loss": 3.3163, + "step": 390 + }, + { + "ep_loss": 0.0, + "epoch": 0.31, + "learning_rate": 2e-05, + "loss": 3.2679, + "mlm_loss": 3.2679, + "step": 400 + }, + { + "epoch": 0.31, + "eval_ep_loss": -1.4546582698822021, + "eval_loss": 3.1360955238342285, + "eval_mlm_loss": 3.1360955238342285, + "eval_runtime": 59.2888, + "eval_samples_per_second": 1178.451, + "eval_steps_per_second": 0.59, + "step": 400 + }, + { + "ep_loss": 0.0, + "epoch": 0.32, + "learning_rate": 2.05e-05, + "loss": 3.1831, + "mlm_loss": 3.1831, + "step": 410 + }, + { + "ep_loss": 0.0, + "epoch": 0.32, + "learning_rate": 2.1000000000000002e-05, + "loss": 3.1246, + "mlm_loss": 3.1246, + "step": 420 + }, + { + "ep_loss": 0.0, + "epoch": 0.33, + "learning_rate": 2.1499999999999997e-05, + "loss": 3.0658, + "mlm_loss": 3.0658, + "step": 430 + }, + { + "ep_loss": 0.0, + "epoch": 0.34, + "learning_rate": 2.2e-05, + "loss": 3.0032, + "mlm_loss": 3.0032, + "step": 440 + }, + { + "ep_loss": 0.0, + "epoch": 0.35, + "learning_rate": 2.2499999999999998e-05, + "loss": 2.931, + "mlm_loss": 2.931, + "step": 450 + }, + { + "ep_loss": 0.0, + "epoch": 0.35, + "learning_rate": 2.3e-05, + "loss": 2.904, + "mlm_loss": 2.904, + "step": 460 + }, + { + "ep_loss": 0.0, + "epoch": 0.36, + "learning_rate": 2.3500000000000002e-05, + "loss": 2.8526, + "mlm_loss": 2.8526, + "step": 470 + }, + { + "ep_loss": 0.0, + "epoch": 0.37, + "learning_rate": 2.4e-05, + "loss": 2.8336, + "mlm_loss": 2.8336, + "step": 480 + }, + { + "ep_loss": 0.0, + "epoch": 0.38, + "learning_rate": 2.4500000000000003e-05, + "loss": 2.7953, + "mlm_loss": 2.7953, + "step": 490 + }, + { + "ep_loss": 0.0, + "epoch": 0.39, + "learning_rate": 2.5e-05, + "loss": 2.7569, + "mlm_loss": 2.7569, + "step": 500 + }, + { + "epoch": 0.39, + "eval_ep_loss": -1.5984127521514893, + "eval_loss": 2.6636624336242676, + "eval_mlm_loss": 2.6636624336242676, + "eval_runtime": 64.3177, + "eval_samples_per_second": 1086.31, + "eval_steps_per_second": 0.544, + "step": 500 + }, + { + "ep_loss": 0.0, + "epoch": 0.39, + "learning_rate": 2.55e-05, + "loss": 2.7157, + "mlm_loss": 2.7157, + "step": 510 + }, + { + "ep_loss": 0.0, + "epoch": 0.4, + "learning_rate": 2.6e-05, + "loss": 2.6857, + "mlm_loss": 2.6857, + "step": 520 + }, + { + "ep_loss": 0.0, + "epoch": 0.41, + "learning_rate": 2.65e-05, + "loss": 2.6796, + "mlm_loss": 2.6796, + "step": 530 + }, + { + "ep_loss": 0.0, + "epoch": 0.42, + "learning_rate": 2.7e-05, + "loss": 2.6449, + "mlm_loss": 2.6449, + "step": 540 + }, + { + "ep_loss": 0.0, + "epoch": 0.42, + "learning_rate": 2.75e-05, + "loss": 2.5932, + "mlm_loss": 2.5932, + "step": 550 + }, + { + "ep_loss": 0.0, + "epoch": 0.43, + "learning_rate": 2.8e-05, + "loss": 2.5536, + "mlm_loss": 2.5536, + "step": 560 + }, + { + "ep_loss": 0.0, + "epoch": 0.44, + "learning_rate": 2.85e-05, + "loss": 2.5271, + "mlm_loss": 2.5271, + "step": 570 + }, + { + "ep_loss": 0.0, + "epoch": 0.45, + "learning_rate": 2.9000000000000004e-05, + "loss": 2.5057, + "mlm_loss": 2.5057, + "step": 580 + }, + { + "ep_loss": 0.0, + "epoch": 0.46, + "learning_rate": 2.95e-05, + "loss": 2.476, + "mlm_loss": 2.476, + "step": 590 + }, + { + "ep_loss": 0.0, + "epoch": 0.46, + "learning_rate": 3e-05, + "loss": 2.4265, + "mlm_loss": 2.4265, + "step": 600 + }, + { + "epoch": 0.46, + "eval_ep_loss": -1.7573626041412354, + "eval_loss": 2.359508752822876, + "eval_mlm_loss": 2.359508752822876, + "eval_runtime": 59.8647, + "eval_samples_per_second": 1167.114, + "eval_steps_per_second": 0.585, + "step": 600 + }, + { + "ep_loss": 0.0, + "epoch": 0.47, + "learning_rate": 3.05e-05, + "loss": 2.4061, + "mlm_loss": 2.4061, + "step": 610 + }, + { + "ep_loss": 0.0, + "epoch": 0.48, + "learning_rate": 3.1e-05, + "loss": 2.4071, + "mlm_loss": 2.4071, + "step": 620 + }, + { + "ep_loss": 0.0, + "epoch": 0.49, + "learning_rate": 3.15e-05, + "loss": 2.3728, + "mlm_loss": 2.3728, + "step": 630 + }, + { + "ep_loss": 0.0, + "epoch": 0.49, + "learning_rate": 3.2e-05, + "loss": 2.3465, + "mlm_loss": 2.3465, + "step": 640 + }, + { + "ep_loss": 0.0, + "epoch": 0.5, + "learning_rate": 3.2500000000000004e-05, + "loss": 2.3065, + "mlm_loss": 2.3065, + "step": 650 + }, + { + "ep_loss": 0.0, + "epoch": 0.51, + "learning_rate": 3.3e-05, + "loss": 2.312, + "mlm_loss": 2.312, + "step": 660 + }, + { + "ep_loss": 0.0, + "epoch": 0.52, + "learning_rate": 3.35e-05, + "loss": 2.29, + "mlm_loss": 2.29, + "step": 670 + }, + { + "ep_loss": 0.0, + "epoch": 0.52, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.2939, + "mlm_loss": 2.2939, + "step": 680 + }, + { + "ep_loss": 0.0, + "epoch": 0.53, + "learning_rate": 3.4500000000000005e-05, + "loss": 2.2496, + "mlm_loss": 2.2496, + "step": 690 + }, + { + "ep_loss": 0.0, + "epoch": 0.54, + "learning_rate": 3.5000000000000004e-05, + "loss": 2.2482, + "mlm_loss": 2.2482, + "step": 700 + }, + { + "epoch": 0.54, + "eval_ep_loss": -1.7850463390350342, + "eval_loss": 2.1652746200561523, + "eval_mlm_loss": 2.1652746200561523, + "eval_runtime": 63.4238, + "eval_samples_per_second": 1101.621, + "eval_steps_per_second": 0.552, + "step": 700 + }, + { + "ep_loss": 0.0, + "epoch": 0.55, + "learning_rate": 3.5499999999999996e-05, + "loss": 2.2379, + "mlm_loss": 2.2379, + "step": 710 + }, + { + "ep_loss": 0.0, + "epoch": 0.56, + "learning_rate": 3.6e-05, + "loss": 2.2088, + "mlm_loss": 2.2088, + "step": 720 + }, + { + "ep_loss": 0.0, + "epoch": 0.56, + "learning_rate": 3.65e-05, + "loss": 2.1767, + "mlm_loss": 2.1767, + "step": 730 + }, + { + "ep_loss": 0.0, + "epoch": 0.57, + "learning_rate": 3.7e-05, + "loss": 2.1589, + "mlm_loss": 2.1589, + "step": 740 + }, + { + "ep_loss": 0.0, + "epoch": 0.58, + "learning_rate": 3.75e-05, + "loss": 2.1297, + "mlm_loss": 2.1297, + "step": 750 + }, + { + "ep_loss": 0.0, + "epoch": 0.59, + "learning_rate": 3.8e-05, + "loss": 2.1128, + "mlm_loss": 2.1128, + "step": 760 + }, + { + "ep_loss": 0.0, + "epoch": 0.59, + "learning_rate": 3.85e-05, + "loss": 2.0861, + "mlm_loss": 2.0861, + "step": 770 + }, + { + "ep_loss": 0.0, + "epoch": 0.6, + "learning_rate": 3.9e-05, + "loss": 2.0557, + "mlm_loss": 2.0557, + "step": 780 + }, + { + "ep_loss": 0.0, + "epoch": 0.61, + "learning_rate": 3.95e-05, + "loss": 2.0596, + "mlm_loss": 2.0596, + "step": 790 + }, + { + "ep_loss": 0.0, + "epoch": 0.62, + "learning_rate": 4e-05, + "loss": 2.0349, + "mlm_loss": 2.0349, + "step": 800 + }, + { + "epoch": 0.62, + "eval_ep_loss": -1.7692768573760986, + "eval_loss": 1.916082501411438, + "eval_mlm_loss": 1.916082501411438, + "eval_runtime": 62.7772, + "eval_samples_per_second": 1112.968, + "eval_steps_per_second": 0.558, + "step": 800 + }, + { + "ep_loss": 0.0, + "epoch": 0.62, + "learning_rate": 4.05e-05, + "loss": 1.9934, + "mlm_loss": 1.9934, + "step": 810 + }, + { + "ep_loss": 0.0, + "epoch": 0.63, + "learning_rate": 4.1e-05, + "loss": 1.9554, + "mlm_loss": 1.9554, + "step": 820 + }, + { + "ep_loss": 0.0, + "epoch": 0.64, + "learning_rate": 4.1500000000000006e-05, + "loss": 1.9348, + "mlm_loss": 1.9348, + "step": 830 + }, + { + "ep_loss": 0.0, + "epoch": 0.65, + "learning_rate": 4.2000000000000004e-05, + "loss": 1.8978, + "mlm_loss": 1.8978, + "step": 840 + }, + { + "ep_loss": 0.0, + "epoch": 0.66, + "learning_rate": 4.25e-05, + "loss": 1.8752, + "mlm_loss": 1.8752, + "step": 850 + }, + { + "ep_loss": 0.0, + "epoch": 0.66, + "learning_rate": 4.2999999999999995e-05, + "loss": 1.8466, + "mlm_loss": 1.8466, + "step": 860 + }, + { + "ep_loss": 0.0, + "epoch": 0.67, + "learning_rate": 4.35e-05, + "loss": 1.8168, + "mlm_loss": 1.8168, + "step": 870 + }, + { + "ep_loss": 0.0, + "epoch": 0.68, + "learning_rate": 4.4e-05, + "loss": 1.8161, + "mlm_loss": 1.8161, + "step": 880 + }, + { + "ep_loss": 0.0, + "epoch": 0.69, + "learning_rate": 4.45e-05, + "loss": 1.7637, + "mlm_loss": 1.7637, + "step": 890 + }, + { + "ep_loss": 0.0, + "epoch": 0.69, + "learning_rate": 4.4999999999999996e-05, + "loss": 1.7292, + "mlm_loss": 1.7292, + "step": 900 + }, + { + "epoch": 0.69, + "eval_ep_loss": -1.673869252204895, + "eval_loss": 1.6480634212493896, + "eval_mlm_loss": 1.6480634212493896, + "eval_runtime": 61.0031, + "eval_samples_per_second": 1145.335, + "eval_steps_per_second": 0.574, + "step": 900 + }, + { + "ep_loss": 0.0, + "epoch": 0.7, + "learning_rate": 4.55e-05, + "loss": 1.7031, + "mlm_loss": 1.7031, + "step": 910 + }, + { + "ep_loss": 0.0, + "epoch": 0.71, + "learning_rate": 4.6e-05, + "loss": 1.6951, + "mlm_loss": 1.6951, + "step": 920 + }, + { + "ep_loss": 0.0, + "epoch": 0.72, + "learning_rate": 4.65e-05, + "loss": 1.6706, + "mlm_loss": 1.6706, + "step": 930 + }, + { + "ep_loss": 0.0, + "epoch": 0.73, + "learning_rate": 4.7000000000000004e-05, + "loss": 1.6674, + "mlm_loss": 1.6674, + "step": 940 + }, + { + "ep_loss": 0.0, + "epoch": 0.73, + "learning_rate": 4.75e-05, + "loss": 1.6446, + "mlm_loss": 1.6446, + "step": 950 + }, + { + "ep_loss": 0.0, + "epoch": 0.74, + "learning_rate": 4.8e-05, + "loss": 1.6344, + "mlm_loss": 1.6344, + "step": 960 + }, + { + "ep_loss": 0.0, + "epoch": 0.75, + "learning_rate": 4.85e-05, + "loss": 1.6182, + "mlm_loss": 1.6182, + "step": 970 + }, + { + "ep_loss": 0.0, + "epoch": 0.76, + "learning_rate": 4.9000000000000005e-05, + "loss": 1.602, + "mlm_loss": 1.602, + "step": 980 + }, + { + "ep_loss": 0.0, + "epoch": 0.76, + "learning_rate": 4.9500000000000004e-05, + "loss": 1.5965, + "mlm_loss": 1.5965, + "step": 990 + }, + { + "ep_loss": 0.0, + "epoch": 0.77, + "learning_rate": 5e-05, + "loss": 1.6022, + "mlm_loss": 1.6022, + "step": 1000 + }, + { + "epoch": 0.77, + "eval_ep_loss": -1.7339978218078613, + "eval_loss": 1.521234393119812, + "eval_mlm_loss": 1.521234393119812, + "eval_runtime": 61.977, + "eval_samples_per_second": 1127.338, + "eval_steps_per_second": 0.565, + "step": 1000 + }, + { + "ep_loss": 0.0, + "epoch": 0.78, + "learning_rate": 5.05e-05, + "loss": 1.5856, + "mlm_loss": 1.5856, + "step": 1010 + }, + { + "ep_loss": 0.0, + "epoch": 0.79, + "learning_rate": 5.1e-05, + "loss": 1.5641, + "mlm_loss": 1.5641, + "step": 1020 + }, + { + "ep_loss": 0.0, + "epoch": 0.79, + "learning_rate": 5.15e-05, + "loss": 1.5455, + "mlm_loss": 1.5455, + "step": 1030 + }, + { + "ep_loss": 0.0, + "epoch": 0.8, + "learning_rate": 5.2e-05, + "loss": 1.5587, + "mlm_loss": 1.5587, + "step": 1040 + }, + { + "ep_loss": 0.0, + "epoch": 0.81, + "learning_rate": 5.25e-05, + "loss": 1.5495, + "mlm_loss": 1.5495, + "step": 1050 + }, + { + "ep_loss": 0.0, + "epoch": 0.82, + "learning_rate": 5.3e-05, + "loss": 1.5388, + "mlm_loss": 1.5388, + "step": 1060 + }, + { + "ep_loss": 0.0, + "epoch": 0.83, + "learning_rate": 5.35e-05, + "loss": 1.5085, + "mlm_loss": 1.5085, + "step": 1070 + }, + { + "ep_loss": 0.0, + "epoch": 0.83, + "learning_rate": 5.4e-05, + "loss": 1.5258, + "mlm_loss": 1.5258, + "step": 1080 + }, + { + "ep_loss": 0.0, + "epoch": 0.84, + "learning_rate": 5.45e-05, + "loss": 1.5027, + "mlm_loss": 1.5027, + "step": 1090 + }, + { + "ep_loss": 0.0, + "epoch": 0.85, + "learning_rate": 5.5e-05, + "loss": 1.4991, + "mlm_loss": 1.4991, + "step": 1100 + }, + { + "epoch": 0.85, + "eval_ep_loss": -1.8915724754333496, + "eval_loss": 1.4428980350494385, + "eval_mlm_loss": 1.4428980350494385, + "eval_runtime": 60.2275, + "eval_samples_per_second": 1160.085, + "eval_steps_per_second": 0.581, + "step": 1100 + }, + { + "ep_loss": 0.0, + "epoch": 0.86, + "learning_rate": 5.55e-05, + "loss": 1.5068, + "mlm_loss": 1.5068, + "step": 1110 + }, + { + "ep_loss": 0.0, + "epoch": 0.86, + "learning_rate": 5.6e-05, + "loss": 1.4812, + "mlm_loss": 1.4812, + "step": 1120 + }, + { + "ep_loss": 0.0, + "epoch": 0.87, + "learning_rate": 5.6500000000000005e-05, + "loss": 1.4765, + "mlm_loss": 1.4765, + "step": 1130 + }, + { + "ep_loss": 0.0, + "epoch": 0.88, + "learning_rate": 5.7e-05, + "loss": 1.4679, + "mlm_loss": 1.4679, + "step": 1140 + }, + { + "ep_loss": 0.0, + "epoch": 0.89, + "learning_rate": 5.75e-05, + "loss": 1.4631, + "mlm_loss": 1.4631, + "step": 1150 + }, + { + "ep_loss": 0.0, + "epoch": 0.89, + "learning_rate": 5.800000000000001e-05, + "loss": 1.4562, + "mlm_loss": 1.4562, + "step": 1160 + }, + { + "ep_loss": 0.0, + "epoch": 0.9, + "learning_rate": 5.8500000000000006e-05, + "loss": 1.4542, + "mlm_loss": 1.4542, + "step": 1170 + }, + { + "ep_loss": 0.0, + "epoch": 0.91, + "learning_rate": 5.9e-05, + "loss": 1.4266, + "mlm_loss": 1.4266, + "step": 1180 + }, + { + "ep_loss": 0.0, + "epoch": 0.92, + "learning_rate": 5.9499999999999996e-05, + "loss": 1.4181, + "mlm_loss": 1.4181, + "step": 1190 + }, + { + "ep_loss": 0.0, + "epoch": 0.93, + "learning_rate": 6e-05, + "loss": 1.4133, + "mlm_loss": 1.4133, + "step": 1200 + }, + { + "epoch": 0.93, + "eval_ep_loss": -1.9453246593475342, + "eval_loss": 1.3670274019241333, + "eval_mlm_loss": 1.3670274019241333, + "eval_runtime": 61.6667, + "eval_samples_per_second": 1133.011, + "eval_steps_per_second": 0.568, + "step": 1200 + }, + { + "ep_loss": 0.0, + "epoch": 0.93, + "learning_rate": 6.05e-05, + "loss": 1.414, + "mlm_loss": 1.414, + "step": 1210 + }, + { + "ep_loss": 0.0, + "epoch": 0.94, + "learning_rate": 6.1e-05, + "loss": 1.4032, + "mlm_loss": 1.4032, + "step": 1220 + }, + { + "ep_loss": 0.0, + "epoch": 0.95, + "learning_rate": 6.15e-05, + "loss": 1.3958, + "mlm_loss": 1.3958, + "step": 1230 + }, + { + "ep_loss": 0.0, + "epoch": 0.96, + "learning_rate": 6.2e-05, + "loss": 1.3952, + "mlm_loss": 1.3952, + "step": 1240 + }, + { + "ep_loss": 0.0, + "epoch": 0.96, + "learning_rate": 6.25e-05, + "loss": 1.3829, + "mlm_loss": 1.3829, + "step": 1250 + }, + { + "ep_loss": 0.0, + "epoch": 0.97, + "learning_rate": 6.3e-05, + "loss": 1.3915, + "mlm_loss": 1.3915, + "step": 1260 + }, + { + "ep_loss": 0.0, + "epoch": 0.98, + "learning_rate": 6.35e-05, + "loss": 1.3881, + "mlm_loss": 1.3881, + "step": 1270 + }, + { + "ep_loss": 0.0, + "epoch": 0.99, + "learning_rate": 6.4e-05, + "loss": 1.3915, + "mlm_loss": 1.3915, + "step": 1280 + }, + { + "ep_loss": 0.0, + "epoch": 0.99, + "learning_rate": 6.450000000000001e-05, + "loss": 1.3728, + "mlm_loss": 1.3728, + "step": 1290 + }, + { + "ep_loss": 0.0, + "epoch": 1.0, + "learning_rate": 6.500000000000001e-05, + "loss": 1.3379, + "mlm_loss": 1.3379, + "step": 1300 + }, + { + "epoch": 1.0, + "eval_ep_loss": -2.041435480117798, + "eval_loss": 1.2956568002700806, + "eval_mlm_loss": 1.2956568002700806, + "eval_runtime": 58.5906, + "eval_samples_per_second": 1192.495, + "eval_steps_per_second": 0.597, + "step": 1300 + }, + { + "ep_loss": 0.0, + "epoch": 1.01, + "learning_rate": 6.55e-05, + "loss": 1.3488, + "mlm_loss": 1.3488, + "step": 1310 + }, + { + "ep_loss": 0.0, + "epoch": 1.02, + "learning_rate": 6.6e-05, + "loss": 1.3408, + "mlm_loss": 1.3408, + "step": 1320 + }, + { + "ep_loss": 0.0, + "epoch": 1.03, + "learning_rate": 6.65e-05, + "loss": 1.3318, + "mlm_loss": 1.3318, + "step": 1330 + }, + { + "ep_loss": 0.0, + "epoch": 1.03, + "learning_rate": 6.7e-05, + "loss": 1.3216, + "mlm_loss": 1.3216, + "step": 1340 + }, + { + "ep_loss": 0.0, + "epoch": 1.04, + "learning_rate": 6.75e-05, + "loss": 1.3142, + "mlm_loss": 1.3142, + "step": 1350 + }, + { + "ep_loss": 0.0, + "epoch": 1.05, + "learning_rate": 6.800000000000001e-05, + "loss": 1.3075, + "mlm_loss": 1.3075, + "step": 1360 + }, + { + "ep_loss": 0.0, + "epoch": 1.06, + "learning_rate": 6.850000000000001e-05, + "loss": 1.312, + "mlm_loss": 1.312, + "step": 1370 + }, + { + "ep_loss": 0.0, + "epoch": 1.06, + "learning_rate": 6.900000000000001e-05, + "loss": 1.2987, + "mlm_loss": 1.2987, + "step": 1380 + }, + { + "ep_loss": 0.0, + "epoch": 1.07, + "learning_rate": 6.950000000000001e-05, + "loss": 1.2822, + "mlm_loss": 1.2822, + "step": 1390 + }, + { + "ep_loss": 0.0, + "epoch": 1.08, + "learning_rate": 7.000000000000001e-05, + "loss": 1.2817, + "mlm_loss": 1.2817, + "step": 1400 + }, + { + "epoch": 1.08, + "eval_ep_loss": -2.044208288192749, + "eval_loss": 1.2174968719482422, + "eval_mlm_loss": 1.2174968719482422, + "eval_runtime": 62.7874, + "eval_samples_per_second": 1112.788, + "eval_steps_per_second": 0.557, + "step": 1400 + }, + { + "ep_loss": 0.0, + "epoch": 1.09, + "learning_rate": 7.049999999999999e-05, + "loss": 1.2673, + "mlm_loss": 1.2673, + "step": 1410 + }, + { + "ep_loss": 0.0, + "epoch": 1.1, + "learning_rate": 7.099999999999999e-05, + "loss": 1.28, + "mlm_loss": 1.28, + "step": 1420 + }, + { + "ep_loss": 0.0, + "epoch": 1.1, + "learning_rate": 7.149999999999999e-05, + "loss": 1.2606, + "mlm_loss": 1.2606, + "step": 1430 + }, + { + "ep_loss": 0.0, + "epoch": 1.11, + "learning_rate": 7.2e-05, + "loss": 1.2439, + "mlm_loss": 1.2439, + "step": 1440 + }, + { + "ep_loss": 0.0, + "epoch": 1.12, + "learning_rate": 7.25e-05, + "loss": 1.2428, + "mlm_loss": 1.2428, + "step": 1450 + }, + { + "ep_loss": 0.0, + "epoch": 1.13, + "learning_rate": 7.3e-05, + "loss": 1.2342, + "mlm_loss": 1.2342, + "step": 1460 + }, + { + "ep_loss": 0.0, + "epoch": 1.13, + "learning_rate": 7.35e-05, + "loss": 1.2391, + "mlm_loss": 1.2391, + "step": 1470 + }, + { + "ep_loss": 0.0, + "epoch": 1.14, + "learning_rate": 7.4e-05, + "loss": 1.2337, + "mlm_loss": 1.2337, + "step": 1480 + }, + { + "ep_loss": 0.0, + "epoch": 1.15, + "learning_rate": 7.45e-05, + "loss": 1.2161, + "mlm_loss": 1.2161, + "step": 1490 + }, + { + "ep_loss": 0.0, + "epoch": 1.16, + "learning_rate": 7.5e-05, + "loss": 1.2083, + "mlm_loss": 1.2083, + "step": 1500 + }, + { + "epoch": 1.16, + "eval_ep_loss": -2.247138738632202, + "eval_loss": 1.145897388458252, + "eval_mlm_loss": 1.145897388458252, + "eval_runtime": 60.0275, + "eval_samples_per_second": 1163.951, + "eval_steps_per_second": 0.583, + "step": 1500 + }, + { + "ep_loss": 0.0, + "epoch": 1.16, + "learning_rate": 7.55e-05, + "loss": 1.2048, + "mlm_loss": 1.2048, + "step": 1510 + }, + { + "ep_loss": 0.0, + "epoch": 1.17, + "learning_rate": 7.6e-05, + "loss": 1.1934, + "mlm_loss": 1.1934, + "step": 1520 + }, + { + "ep_loss": 0.0, + "epoch": 1.18, + "learning_rate": 7.65e-05, + "loss": 1.1741, + "mlm_loss": 1.1741, + "step": 1530 + }, + { + "ep_loss": 0.0, + "epoch": 1.19, + "learning_rate": 7.7e-05, + "loss": 1.1832, + "mlm_loss": 1.1832, + "step": 1540 + }, + { + "ep_loss": 0.0, + "epoch": 1.2, + "learning_rate": 7.75e-05, + "loss": 1.1759, + "mlm_loss": 1.1759, + "step": 1550 + }, + { + "ep_loss": 0.0, + "epoch": 1.2, + "learning_rate": 7.8e-05, + "loss": 1.1776, + "mlm_loss": 1.1776, + "step": 1560 + }, + { + "ep_loss": 0.0, + "epoch": 1.21, + "learning_rate": 7.85e-05, + "loss": 1.164, + "mlm_loss": 1.164, + "step": 1570 + }, + { + "ep_loss": 0.0, + "epoch": 1.22, + "learning_rate": 7.9e-05, + "loss": 1.1447, + "mlm_loss": 1.1447, + "step": 1580 + }, + { + "ep_loss": 0.0, + "epoch": 1.23, + "learning_rate": 7.950000000000001e-05, + "loss": 1.1669, + "mlm_loss": 1.1669, + "step": 1590 + }, + { + "ep_loss": 0.0, + "epoch": 1.23, + "learning_rate": 8e-05, + "loss": 1.1331, + "mlm_loss": 1.1331, + "step": 1600 + }, + { + "epoch": 1.23, + "eval_ep_loss": -2.236558198928833, + "eval_loss": 1.0778173208236694, + "eval_mlm_loss": 1.0778173208236694, + "eval_runtime": 61.8774, + "eval_samples_per_second": 1129.152, + "eval_steps_per_second": 0.566, + "step": 1600 + }, + { + "ep_loss": 0.0, + "epoch": 1.24, + "learning_rate": 8.05e-05, + "loss": 1.1332, + "mlm_loss": 1.1332, + "step": 1610 + }, + { + "ep_loss": 0.0, + "epoch": 1.25, + "learning_rate": 8.1e-05, + "loss": 1.1288, + "mlm_loss": 1.1288, + "step": 1620 + }, + { + "ep_loss": 0.0, + "epoch": 1.26, + "learning_rate": 8.15e-05, + "loss": 1.1321, + "mlm_loss": 1.1321, + "step": 1630 + }, + { + "ep_loss": 0.0, + "epoch": 1.26, + "learning_rate": 8.2e-05, + "loss": 1.11, + "mlm_loss": 1.11, + "step": 1640 + }, + { + "ep_loss": 0.0, + "epoch": 1.27, + "learning_rate": 8.25e-05, + "loss": 1.1083, + "mlm_loss": 1.1083, + "step": 1650 + }, + { + "ep_loss": 0.0, + "epoch": 1.28, + "learning_rate": 8.300000000000001e-05, + "loss": 1.0999, + "mlm_loss": 1.0999, + "step": 1660 + }, + { + "ep_loss": 0.0, + "epoch": 1.29, + "learning_rate": 8.350000000000001e-05, + "loss": 1.1041, + "mlm_loss": 1.1041, + "step": 1670 + }, + { + "ep_loss": 0.0, + "epoch": 1.3, + "learning_rate": 8.400000000000001e-05, + "loss": 1.0823, + "mlm_loss": 1.0823, + "step": 1680 + }, + { + "ep_loss": 0.0, + "epoch": 1.3, + "learning_rate": 8.450000000000001e-05, + "loss": 1.0809, + "mlm_loss": 1.0809, + "step": 1690 + }, + { + "ep_loss": 0.0, + "epoch": 1.31, + "learning_rate": 8.5e-05, + "loss": 1.0859, + "mlm_loss": 1.0859, + "step": 1700 + }, + { + "epoch": 1.31, + "eval_ep_loss": -2.2500903606414795, + "eval_loss": 1.0151546001434326, + "eval_mlm_loss": 1.0151546001434326, + "eval_runtime": 60.0928, + "eval_samples_per_second": 1162.685, + "eval_steps_per_second": 0.582, + "step": 1700 + }, + { + "ep_loss": 0.0, + "epoch": 1.32, + "learning_rate": 8.55e-05, + "loss": 1.0757, + "mlm_loss": 1.0757, + "step": 1710 + }, + { + "ep_loss": 0.0, + "epoch": 1.33, + "learning_rate": 8.599999999999999e-05, + "loss": 1.0638, + "mlm_loss": 1.0638, + "step": 1720 + }, + { + "ep_loss": 0.0, + "epoch": 1.33, + "learning_rate": 8.65e-05, + "loss": 1.0554, + "mlm_loss": 1.0554, + "step": 1730 + }, + { + "ep_loss": 0.0, + "epoch": 1.34, + "learning_rate": 8.7e-05, + "loss": 1.0624, + "mlm_loss": 1.0624, + "step": 1740 + }, + { + "ep_loss": 0.0, + "epoch": 1.35, + "learning_rate": 8.75e-05, + "loss": 1.0483, + "mlm_loss": 1.0483, + "step": 1750 + }, + { + "ep_loss": 0.0, + "epoch": 1.36, + "learning_rate": 8.8e-05, + "loss": 1.0432, + "mlm_loss": 1.0432, + "step": 1760 + }, + { + "ep_loss": 0.0, + "epoch": 1.37, + "learning_rate": 8.85e-05, + "loss": 1.0464, + "mlm_loss": 1.0464, + "step": 1770 + }, + { + "ep_loss": 0.0, + "epoch": 1.37, + "learning_rate": 8.9e-05, + "loss": 1.0379, + "mlm_loss": 1.0379, + "step": 1780 + }, + { + "ep_loss": 0.0, + "epoch": 1.38, + "learning_rate": 8.95e-05, + "loss": 1.0416, + "mlm_loss": 1.0416, + "step": 1790 + }, + { + "ep_loss": 0.0, + "epoch": 1.39, + "learning_rate": 8.999999999999999e-05, + "loss": 1.0178, + "mlm_loss": 1.0178, + "step": 1800 + }, + { + "epoch": 1.39, + "eval_ep_loss": -2.3243699073791504, + "eval_loss": 0.9662984013557434, + "eval_mlm_loss": 0.9662984013557434, + "eval_runtime": 61.639, + "eval_samples_per_second": 1133.519, + "eval_steps_per_second": 0.568, + "step": 1800 + }, + { + "ep_loss": 0.0, + "epoch": 1.4, + "learning_rate": 9.05e-05, + "loss": 1.0189, + "mlm_loss": 1.0189, + "step": 1810 + }, + { + "ep_loss": 0.0, + "epoch": 1.4, + "learning_rate": 9.1e-05, + "loss": 1.0181, + "mlm_loss": 1.0181, + "step": 1820 + }, + { + "ep_loss": 0.0, + "epoch": 1.41, + "learning_rate": 9.15e-05, + "loss": 1.006, + "mlm_loss": 1.006, + "step": 1830 + }, + { + "ep_loss": 0.0, + "epoch": 1.42, + "learning_rate": 9.2e-05, + "loss": 1.0088, + "mlm_loss": 1.0088, + "step": 1840 + }, + { + "ep_loss": 0.0, + "epoch": 1.43, + "learning_rate": 9.25e-05, + "loss": 1.008, + "mlm_loss": 1.008, + "step": 1850 + }, + { + "ep_loss": 0.0, + "epoch": 1.43, + "learning_rate": 9.3e-05, + "loss": 0.9891, + "mlm_loss": 0.9891, + "step": 1860 + }, + { + "ep_loss": 0.0, + "epoch": 1.44, + "learning_rate": 9.35e-05, + "loss": 0.9998, + "mlm_loss": 0.9998, + "step": 1870 + }, + { + "ep_loss": 0.0, + "epoch": 1.45, + "learning_rate": 9.400000000000001e-05, + "loss": 0.9927, + "mlm_loss": 0.9927, + "step": 1880 + }, + { + "ep_loss": 0.0, + "epoch": 1.46, + "learning_rate": 9.45e-05, + "loss": 0.9825, + "mlm_loss": 0.9825, + "step": 1890 + }, + { + "ep_loss": 0.0, + "epoch": 1.47, + "learning_rate": 9.5e-05, + "loss": 0.979, + "mlm_loss": 0.979, + "step": 1900 + }, + { + "epoch": 1.47, + "eval_ep_loss": -2.297435760498047, + "eval_loss": 0.9245826005935669, + "eval_mlm_loss": 0.9245826005935669, + "eval_runtime": 60.9274, + "eval_samples_per_second": 1146.759, + "eval_steps_per_second": 0.574, + "step": 1900 + }, + { + "ep_loss": 0.0, + "epoch": 1.47, + "learning_rate": 9.55e-05, + "loss": 0.9782, + "mlm_loss": 0.9782, + "step": 1910 + }, + { + "ep_loss": 0.0, + "epoch": 1.48, + "learning_rate": 9.6e-05, + "loss": 0.9802, + "mlm_loss": 0.9802, + "step": 1920 + }, + { + "ep_loss": 0.0, + "epoch": 1.49, + "learning_rate": 9.65e-05, + "loss": 0.9648, + "mlm_loss": 0.9648, + "step": 1930 + }, + { + "ep_loss": 0.0, + "epoch": 1.5, + "learning_rate": 9.7e-05, + "loss": 0.9576, + "mlm_loss": 0.9576, + "step": 1940 + }, + { + "ep_loss": 0.0, + "epoch": 1.5, + "learning_rate": 9.750000000000001e-05, + "loss": 0.9629, + "mlm_loss": 0.9629, + "step": 1950 + }, + { + "ep_loss": 0.0, + "epoch": 1.51, + "learning_rate": 9.800000000000001e-05, + "loss": 0.9585, + "mlm_loss": 0.9585, + "step": 1960 + }, + { + "ep_loss": 0.0, + "epoch": 1.52, + "learning_rate": 9.850000000000001e-05, + "loss": 0.9404, + "mlm_loss": 0.9404, + "step": 1970 + }, + { + "ep_loss": 0.0, + "epoch": 1.53, + "learning_rate": 9.900000000000001e-05, + "loss": 0.9538, + "mlm_loss": 0.9538, + "step": 1980 + }, + { + "ep_loss": 0.0, + "epoch": 1.53, + "learning_rate": 9.95e-05, + "loss": 0.9591, + "mlm_loss": 0.9591, + "step": 1990 + }, + { + "ep_loss": 0.0, + "epoch": 1.54, + "learning_rate": 0.0001, + "loss": 0.9467, + "mlm_loss": 0.9467, + "step": 2000 + }, + { + "epoch": 1.54, + "eval_ep_loss": -2.388266086578369, + "eval_loss": 0.8887887597084045, + "eval_mlm_loss": 0.8887887597084045, + "eval_runtime": 59.4206, + "eval_samples_per_second": 1175.838, + "eval_steps_per_second": 0.589, + "step": 2000 + }, + { + "ep_loss": 0.0, + "epoch": 1.55, + "learning_rate": 0.0001005, + "loss": 0.947, + "mlm_loss": 0.947, + "step": 2010 + }, + { + "ep_loss": 0.0, + "epoch": 1.56, + "learning_rate": 0.000101, + "loss": 0.9407, + "mlm_loss": 0.9407, + "step": 2020 + }, + { + "ep_loss": 0.0, + "epoch": 1.57, + "learning_rate": 0.00010150000000000001, + "loss": 0.9352, + "mlm_loss": 0.9352, + "step": 2030 + }, + { + "ep_loss": 0.0, + "epoch": 1.57, + "learning_rate": 0.000102, + "loss": 0.9234, + "mlm_loss": 0.9234, + "step": 2040 + }, + { + "ep_loss": 0.0, + "epoch": 1.58, + "learning_rate": 0.0001025, + "loss": 0.9252, + "mlm_loss": 0.9252, + "step": 2050 + }, + { + "ep_loss": 0.0, + "epoch": 1.59, + "learning_rate": 0.000103, + "loss": 0.9271, + "mlm_loss": 0.9271, + "step": 2060 + }, + { + "ep_loss": 0.0, + "epoch": 1.6, + "learning_rate": 0.0001035, + "loss": 0.9254, + "mlm_loss": 0.9254, + "step": 2070 + }, + { + "ep_loss": 0.0, + "epoch": 1.6, + "learning_rate": 0.000104, + "loss": 0.9232, + "mlm_loss": 0.9232, + "step": 2080 + }, + { + "ep_loss": 0.0, + "epoch": 1.61, + "learning_rate": 0.00010449999999999999, + "loss": 0.919, + "mlm_loss": 0.919, + "step": 2090 + }, + { + "ep_loss": 0.0, + "epoch": 1.62, + "learning_rate": 0.000105, + "loss": 0.9127, + "mlm_loss": 0.9127, + "step": 2100 + }, + { + "epoch": 1.62, + "eval_ep_loss": -2.262110948562622, + "eval_loss": 0.8549456000328064, + "eval_mlm_loss": 0.8549456000328064, + "eval_runtime": 59.0907, + "eval_samples_per_second": 1182.403, + "eval_steps_per_second": 0.592, + "step": 2100 + }, + { + "ep_loss": 0.0, + "epoch": 1.63, + "learning_rate": 0.0001055, + "loss": 0.9023, + "mlm_loss": 0.9023, + "step": 2110 + }, + { + "ep_loss": 0.0, + "epoch": 1.64, + "learning_rate": 0.000106, + "loss": 0.9031, + "mlm_loss": 0.9031, + "step": 2120 + }, + { + "ep_loss": 0.0, + "epoch": 1.64, + "learning_rate": 0.0001065, + "loss": 0.9039, + "mlm_loss": 0.9039, + "step": 2130 + }, + { + "ep_loss": 0.0, + "epoch": 1.65, + "learning_rate": 0.000107, + "loss": 0.9022, + "mlm_loss": 0.9022, + "step": 2140 + }, + { + "ep_loss": 0.0, + "epoch": 1.66, + "learning_rate": 0.0001075, + "loss": 0.8978, + "mlm_loss": 0.8978, + "step": 2150 + }, + { + "ep_loss": 0.0, + "epoch": 1.67, + "learning_rate": 0.000108, + "loss": 0.8893, + "mlm_loss": 0.8893, + "step": 2160 + }, + { + "ep_loss": 0.0, + "epoch": 1.67, + "learning_rate": 0.00010850000000000001, + "loss": 0.8796, + "mlm_loss": 0.8796, + "step": 2170 + }, + { + "ep_loss": 0.0, + "epoch": 1.68, + "learning_rate": 0.000109, + "loss": 0.8925, + "mlm_loss": 0.8925, + "step": 2180 + }, + { + "ep_loss": 0.0, + "epoch": 1.69, + "learning_rate": 0.0001095, + "loss": 0.8873, + "mlm_loss": 0.8873, + "step": 2190 + }, + { + "ep_loss": 0.0, + "epoch": 1.7, + "learning_rate": 0.00011, + "loss": 0.8858, + "mlm_loss": 0.8858, + "step": 2200 + }, + { + "epoch": 1.7, + "eval_ep_loss": -2.3254947662353516, + "eval_loss": 0.8307343125343323, + "eval_mlm_loss": 0.8307343125343323, + "eval_runtime": 62.4113, + "eval_samples_per_second": 1119.493, + "eval_steps_per_second": 0.561, + "step": 2200 + }, + { + "ep_loss": 0.0, + "epoch": 1.7, + "learning_rate": 0.0001105, + "loss": 0.888, + "mlm_loss": 0.888, + "step": 2210 + }, + { + "ep_loss": 0.0, + "epoch": 1.71, + "learning_rate": 0.000111, + "loss": 0.8706, + "mlm_loss": 0.8706, + "step": 2220 + }, + { + "ep_loss": 0.0, + "epoch": 1.72, + "learning_rate": 0.0001115, + "loss": 0.8854, + "mlm_loss": 0.8854, + "step": 2230 + }, + { + "ep_loss": 0.0, + "epoch": 1.73, + "learning_rate": 0.000112, + "loss": 0.864, + "mlm_loss": 0.864, + "step": 2240 + }, + { + "ep_loss": 0.0, + "epoch": 1.74, + "learning_rate": 0.00011250000000000001, + "loss": 0.8675, + "mlm_loss": 0.8675, + "step": 2250 + }, + { + "ep_loss": 0.0, + "epoch": 1.74, + "learning_rate": 0.00011300000000000001, + "loss": 0.8654, + "mlm_loss": 0.8654, + "step": 2260 + }, + { + "ep_loss": 0.0, + "epoch": 1.75, + "learning_rate": 0.00011350000000000001, + "loss": 0.8593, + "mlm_loss": 0.8593, + "step": 2270 + }, + { + "ep_loss": 0.0, + "epoch": 1.76, + "learning_rate": 0.000114, + "loss": 0.8586, + "mlm_loss": 0.8586, + "step": 2280 + }, + { + "ep_loss": 0.0, + "epoch": 1.77, + "learning_rate": 0.0001145, + "loss": 0.8494, + "mlm_loss": 0.8494, + "step": 2290 + }, + { + "ep_loss": 0.0, + "epoch": 1.77, + "learning_rate": 0.000115, + "loss": 0.8517, + "mlm_loss": 0.8517, + "step": 2300 + }, + { + "epoch": 1.77, + "eval_ep_loss": -2.2645294666290283, + "eval_loss": 0.803054928779602, + "eval_mlm_loss": 0.803054928779602, + "eval_runtime": 61.7973, + "eval_samples_per_second": 1130.616, + "eval_steps_per_second": 0.566, + "step": 2300 + }, + { + "ep_loss": 0.0, + "epoch": 1.78, + "learning_rate": 0.0001155, + "loss": 0.8396, + "mlm_loss": 0.8396, + "step": 2310 + }, + { + "ep_loss": 0.0, + "epoch": 1.79, + "learning_rate": 0.00011600000000000001, + "loss": 0.8478, + "mlm_loss": 0.8478, + "step": 2320 + }, + { + "ep_loss": 0.0, + "epoch": 1.8, + "learning_rate": 0.00011650000000000001, + "loss": 0.8495, + "mlm_loss": 0.8495, + "step": 2330 + }, + { + "ep_loss": 0.0, + "epoch": 1.8, + "learning_rate": 0.00011700000000000001, + "loss": 0.8441, + "mlm_loss": 0.8441, + "step": 2340 + }, + { + "ep_loss": 0.0, + "epoch": 1.81, + "learning_rate": 0.0001175, + "loss": 0.8441, + "mlm_loss": 0.8441, + "step": 2350 + }, + { + "ep_loss": 0.0, + "epoch": 1.82, + "learning_rate": 0.000118, + "loss": 0.8406, + "mlm_loss": 0.8406, + "step": 2360 + }, + { + "ep_loss": 0.0, + "epoch": 1.83, + "learning_rate": 0.0001185, + "loss": 0.8322, + "mlm_loss": 0.8322, + "step": 2370 + }, + { + "ep_loss": 0.0, + "epoch": 1.84, + "learning_rate": 0.00011899999999999999, + "loss": 0.8282, + "mlm_loss": 0.8282, + "step": 2380 + }, + { + "ep_loss": 0.0, + "epoch": 1.84, + "learning_rate": 0.00011949999999999999, + "loss": 0.8437, + "mlm_loss": 0.8437, + "step": 2390 + }, + { + "ep_loss": 0.0, + "epoch": 1.85, + "learning_rate": 0.00012, + "loss": 0.8415, + "mlm_loss": 0.8415, + "step": 2400 + }, + { + "epoch": 1.85, + "eval_ep_loss": -2.278646945953369, + "eval_loss": 0.7815272212028503, + "eval_mlm_loss": 0.7815272212028503, + "eval_runtime": 60.3734, + "eval_samples_per_second": 1157.281, + "eval_steps_per_second": 0.58, + "step": 2400 + }, + { + "ep_loss": 0.0, + "epoch": 1.86, + "learning_rate": 0.0001205, + "loss": 0.8232, + "mlm_loss": 0.8232, + "step": 2410 + }, + { + "ep_loss": 0.0, + "epoch": 1.87, + "learning_rate": 0.000121, + "loss": 0.8204, + "mlm_loss": 0.8204, + "step": 2420 + }, + { + "ep_loss": 0.0, + "epoch": 1.87, + "learning_rate": 0.0001215, + "loss": 0.8246, + "mlm_loss": 0.8246, + "step": 2430 + }, + { + "ep_loss": 0.0, + "epoch": 1.88, + "learning_rate": 0.000122, + "loss": 0.8092, + "mlm_loss": 0.8092, + "step": 2440 + }, + { + "ep_loss": 0.0, + "epoch": 1.89, + "learning_rate": 0.0001225, + "loss": 0.812, + "mlm_loss": 0.812, + "step": 2450 + }, + { + "ep_loss": 0.0, + "epoch": 1.9, + "learning_rate": 0.000123, + "loss": 0.8148, + "mlm_loss": 0.8148, + "step": 2460 + }, + { + "ep_loss": 0.0, + "epoch": 1.91, + "learning_rate": 0.0001235, + "loss": 0.8065, + "mlm_loss": 0.8065, + "step": 2470 + }, + { + "ep_loss": 0.0, + "epoch": 1.91, + "learning_rate": 0.000124, + "loss": 0.8029, + "mlm_loss": 0.8029, + "step": 2480 + }, + { + "ep_loss": 0.0, + "epoch": 1.92, + "learning_rate": 0.0001245, + "loss": 0.8148, + "mlm_loss": 0.8148, + "step": 2490 + }, + { + "ep_loss": 0.0, + "epoch": 1.93, + "learning_rate": 0.000125, + "loss": 0.806, + "mlm_loss": 0.806, + "step": 2500 + }, + { + "epoch": 1.93, + "eval_ep_loss": -2.1858584880828857, + "eval_loss": 0.7560575604438782, + "eval_mlm_loss": 0.7560575604438782, + "eval_runtime": 60.8551, + "eval_samples_per_second": 1148.121, + "eval_steps_per_second": 0.575, + "step": 2500 + }, + { + "ep_loss": 0.0, + "epoch": 1.94, + "learning_rate": 0.00012550000000000001, + "loss": 0.8048, + "mlm_loss": 0.8048, + "step": 2510 + }, + { + "ep_loss": 0.0, + "epoch": 1.94, + "learning_rate": 0.000126, + "loss": 0.8027, + "mlm_loss": 0.8027, + "step": 2520 + }, + { + "ep_loss": 0.0, + "epoch": 1.95, + "learning_rate": 0.0001265, + "loss": 0.797, + "mlm_loss": 0.797, + "step": 2530 + }, + { + "ep_loss": 0.0, + "epoch": 1.96, + "learning_rate": 0.000127, + "loss": 0.8041, + "mlm_loss": 0.8041, + "step": 2540 + }, + { + "ep_loss": 0.0, + "epoch": 1.97, + "learning_rate": 0.0001275, + "loss": 0.8, + "mlm_loss": 0.8, + "step": 2550 + }, + { + "ep_loss": 0.0, + "epoch": 1.97, + "learning_rate": 0.000128, + "loss": 0.7955, + "mlm_loss": 0.7955, + "step": 2560 + }, + { + "ep_loss": 0.0, + "epoch": 1.98, + "learning_rate": 0.0001285, + "loss": 0.7943, + "mlm_loss": 0.7943, + "step": 2570 + }, + { + "ep_loss": 0.0, + "epoch": 1.99, + "learning_rate": 0.00012900000000000002, + "loss": 0.7934, + "mlm_loss": 0.7934, + "step": 2580 + }, + { + "ep_loss": 0.0, + "epoch": 2.0, + "learning_rate": 0.0001295, + "loss": 0.7884, + "mlm_loss": 0.7884, + "step": 2590 + }, + { + "ep_loss": 0.0, + "epoch": 2.01, + "learning_rate": 0.00013000000000000002, + "loss": 0.7892, + "mlm_loss": 0.7892, + "step": 2600 + }, + { + "epoch": 2.01, + "eval_ep_loss": -2.1713080406188965, + "eval_loss": 0.7403181195259094, + "eval_mlm_loss": 0.7403181195259094, + "eval_runtime": 64.561, + "eval_samples_per_second": 1082.217, + "eval_steps_per_second": 0.542, + "step": 2600 + }, + { + "ep_loss": 0.0, + "epoch": 2.01, + "learning_rate": 0.0001305, + "loss": 0.7818, + "mlm_loss": 0.7818, + "step": 2610 + }, + { + "ep_loss": 0.0, + "epoch": 2.02, + "learning_rate": 0.000131, + "loss": 0.7771, + "mlm_loss": 0.7771, + "step": 2620 + }, + { + "ep_loss": 0.0, + "epoch": 2.03, + "learning_rate": 0.0001315, + "loss": 0.767, + "mlm_loss": 0.767, + "step": 2630 + }, + { + "ep_loss": 0.0, + "epoch": 2.04, + "learning_rate": 0.000132, + "loss": 0.7826, + "mlm_loss": 0.7826, + "step": 2640 + }, + { + "ep_loss": 0.0, + "epoch": 2.04, + "learning_rate": 0.00013250000000000002, + "loss": 0.771, + "mlm_loss": 0.771, + "step": 2650 + }, + { + "ep_loss": 0.0, + "epoch": 2.05, + "learning_rate": 0.000133, + "loss": 0.7774, + "mlm_loss": 0.7774, + "step": 2660 + }, + { + "ep_loss": 0.0, + "epoch": 2.06, + "learning_rate": 0.00013350000000000002, + "loss": 0.7674, + "mlm_loss": 0.7674, + "step": 2670 + }, + { + "ep_loss": 0.0, + "epoch": 2.07, + "learning_rate": 0.000134, + "loss": 0.7737, + "mlm_loss": 0.7737, + "step": 2680 + }, + { + "ep_loss": 0.0, + "epoch": 2.07, + "learning_rate": 0.00013450000000000002, + "loss": 0.7639, + "mlm_loss": 0.7639, + "step": 2690 + }, + { + "ep_loss": 0.0, + "epoch": 2.08, + "learning_rate": 0.000135, + "loss": 0.7655, + "mlm_loss": 0.7655, + "step": 2700 + }, + { + "epoch": 2.08, + "eval_ep_loss": -2.082960605621338, + "eval_loss": 0.7183709144592285, + "eval_mlm_loss": 0.7183709144592285, + "eval_runtime": 61.749, + "eval_samples_per_second": 1131.5, + "eval_steps_per_second": 0.567, + "step": 2700 + }, + { + "ep_loss": 0.0, + "epoch": 2.09, + "learning_rate": 0.00013550000000000001, + "loss": 0.7611, + "mlm_loss": 0.7611, + "step": 2710 + }, + { + "ep_loss": 0.0, + "epoch": 2.1, + "learning_rate": 0.00013600000000000003, + "loss": 0.7586, + "mlm_loss": 0.7586, + "step": 2720 + }, + { + "ep_loss": 0.0, + "epoch": 2.11, + "learning_rate": 0.0001365, + "loss": 0.762, + "mlm_loss": 0.762, + "step": 2730 + }, + { + "ep_loss": 0.0, + "epoch": 2.11, + "learning_rate": 0.00013700000000000002, + "loss": 0.7479, + "mlm_loss": 0.7479, + "step": 2740 + }, + { + "ep_loss": 0.0, + "epoch": 2.12, + "learning_rate": 0.0001375, + "loss": 0.7514, + "mlm_loss": 0.7514, + "step": 2750 + }, + { + "ep_loss": 0.0, + "epoch": 2.13, + "learning_rate": 0.00013800000000000002, + "loss": 0.7476, + "mlm_loss": 0.7476, + "step": 2760 + }, + { + "ep_loss": 0.0, + "epoch": 2.14, + "learning_rate": 0.0001385, + "loss": 0.7629, + "mlm_loss": 0.7629, + "step": 2770 + }, + { + "ep_loss": 0.0, + "epoch": 2.14, + "learning_rate": 0.00013900000000000002, + "loss": 0.7432, + "mlm_loss": 0.7432, + "step": 2780 + }, + { + "ep_loss": 0.0, + "epoch": 2.15, + "learning_rate": 0.0001395, + "loss": 0.7449, + "mlm_loss": 0.7449, + "step": 2790 + }, + { + "ep_loss": 0.0, + "epoch": 2.16, + "learning_rate": 0.00014000000000000001, + "loss": 0.7379, + "mlm_loss": 0.7379, + "step": 2800 + }, + { + "epoch": 2.16, + "eval_ep_loss": -2.261542558670044, + "eval_loss": 0.7018134593963623, + "eval_mlm_loss": 0.7018134593963623, + "eval_runtime": 60.9379, + "eval_samples_per_second": 1146.561, + "eval_steps_per_second": 0.574, + "step": 2800 + }, + { + "ep_loss": 0.0, + "epoch": 2.17, + "learning_rate": 0.00014050000000000003, + "loss": 0.7465, + "mlm_loss": 0.7465, + "step": 2810 + }, + { + "ep_loss": 0.0, + "epoch": 2.18, + "learning_rate": 0.00014099999999999998, + "loss": 0.737, + "mlm_loss": 0.737, + "step": 2820 + }, + { + "ep_loss": 0.0, + "epoch": 2.18, + "learning_rate": 0.0001415, + "loss": 0.7377, + "mlm_loss": 0.7377, + "step": 2830 + }, + { + "ep_loss": 0.0, + "epoch": 2.19, + "learning_rate": 0.00014199999999999998, + "loss": 0.7401, + "mlm_loss": 0.7401, + "step": 2840 + }, + { + "ep_loss": 0.0, + "epoch": 2.2, + "learning_rate": 0.0001425, + "loss": 0.7266, + "mlm_loss": 0.7266, + "step": 2850 + }, + { + "ep_loss": 0.0, + "epoch": 2.21, + "learning_rate": 0.00014299999999999998, + "loss": 0.7443, + "mlm_loss": 0.7443, + "step": 2860 + }, + { + "ep_loss": 0.0, + "epoch": 2.21, + "learning_rate": 0.0001435, + "loss": 0.7301, + "mlm_loss": 0.7301, + "step": 2870 + }, + { + "ep_loss": 0.0, + "epoch": 2.22, + "learning_rate": 0.000144, + "loss": 0.7427, + "mlm_loss": 0.7427, + "step": 2880 + }, + { + "ep_loss": 0.0, + "epoch": 2.23, + "learning_rate": 0.0001445, + "loss": 0.7387, + "mlm_loss": 0.7387, + "step": 2890 + }, + { + "ep_loss": 0.0, + "epoch": 2.24, + "learning_rate": 0.000145, + "loss": 0.721, + "mlm_loss": 0.721, + "step": 2900 + }, + { + "epoch": 2.24, + "eval_ep_loss": -2.377054214477539, + "eval_loss": 0.6826241612434387, + "eval_mlm_loss": 0.6826241612434387, + "eval_runtime": 60.9503, + "eval_samples_per_second": 1146.327, + "eval_steps_per_second": 0.574, + "step": 2900 + }, + { + "ep_loss": 0.0, + "epoch": 2.24, + "learning_rate": 0.00014549999999999999, + "loss": 0.7352, + "mlm_loss": 0.7352, + "step": 2910 + }, + { + "ep_loss": 0.0, + "epoch": 2.25, + "learning_rate": 0.000146, + "loss": 0.7219, + "mlm_loss": 0.7219, + "step": 2920 + }, + { + "ep_loss": 0.0, + "epoch": 2.26, + "learning_rate": 0.00014649999999999998, + "loss": 0.729, + "mlm_loss": 0.729, + "step": 2930 + }, + { + "ep_loss": 0.0, + "epoch": 2.27, + "learning_rate": 0.000147, + "loss": 0.7148, + "mlm_loss": 0.7148, + "step": 2940 + }, + { + "ep_loss": 0.0, + "epoch": 2.28, + "learning_rate": 0.0001475, + "loss": 0.721, + "mlm_loss": 0.721, + "step": 2950 + }, + { + "ep_loss": 0.0, + "epoch": 2.28, + "learning_rate": 0.000148, + "loss": 0.7113, + "mlm_loss": 0.7113, + "step": 2960 + }, + { + "ep_loss": 0.0, + "epoch": 2.29, + "learning_rate": 0.0001485, + "loss": 0.711, + "mlm_loss": 0.711, + "step": 2970 + }, + { + "ep_loss": 0.0, + "epoch": 2.3, + "learning_rate": 0.000149, + "loss": 0.7109, + "mlm_loss": 0.7109, + "step": 2980 + }, + { + "ep_loss": 0.0, + "epoch": 2.31, + "learning_rate": 0.0001495, + "loss": 0.7121, + "mlm_loss": 0.7121, + "step": 2990 + }, + { + "ep_loss": 0.0, + "epoch": 2.31, + "learning_rate": 0.00015, + "loss": 0.7106, + "mlm_loss": 0.7106, + "step": 3000 + }, + { + "epoch": 2.31, + "eval_ep_loss": -2.0994629859924316, + "eval_loss": 0.6672462821006775, + "eval_mlm_loss": 0.6672462821006775, + "eval_runtime": 63.0197, + "eval_samples_per_second": 1108.686, + "eval_steps_per_second": 0.555, + "step": 3000 + }, + { + "ep_loss": 0.0, + "epoch": 2.32, + "learning_rate": 0.0001505, + "loss": 0.7067, + "mlm_loss": 0.7067, + "step": 3010 + }, + { + "ep_loss": 0.0, + "epoch": 2.33, + "learning_rate": 0.000151, + "loss": 0.7082, + "mlm_loss": 0.7082, + "step": 3020 + }, + { + "ep_loss": 0.0, + "epoch": 2.34, + "learning_rate": 0.0001515, + "loss": 0.7097, + "mlm_loss": 0.7097, + "step": 3030 + }, + { + "ep_loss": 0.0, + "epoch": 2.34, + "learning_rate": 0.000152, + "loss": 0.6972, + "mlm_loss": 0.6972, + "step": 3040 + }, + { + "ep_loss": 0.0, + "epoch": 2.35, + "learning_rate": 0.0001525, + "loss": 0.7051, + "mlm_loss": 0.7051, + "step": 3050 + }, + { + "ep_loss": 0.0, + "epoch": 2.36, + "learning_rate": 0.000153, + "loss": 0.7029, + "mlm_loss": 0.7029, + "step": 3060 + }, + { + "ep_loss": 0.0, + "epoch": 2.37, + "learning_rate": 0.0001535, + "loss": 0.7037, + "mlm_loss": 0.7037, + "step": 3070 + }, + { + "ep_loss": 0.0, + "epoch": 2.38, + "learning_rate": 0.000154, + "loss": 0.6966, + "mlm_loss": 0.6966, + "step": 3080 + }, + { + "ep_loss": 0.0, + "epoch": 2.38, + "learning_rate": 0.00015450000000000001, + "loss": 0.6948, + "mlm_loss": 0.6948, + "step": 3090 + }, + { + "ep_loss": 0.0, + "epoch": 2.39, + "learning_rate": 0.000155, + "loss": 0.6974, + "mlm_loss": 0.6974, + "step": 3100 + }, + { + "epoch": 2.39, + "eval_ep_loss": -2.283140182495117, + "eval_loss": 0.6495695114135742, + "eval_mlm_loss": 0.6495695114135742, + "eval_runtime": 59.8554, + "eval_samples_per_second": 1167.296, + "eval_steps_per_second": 0.585, + "step": 3100 + }, + { + "ep_loss": 0.0, + "epoch": 2.4, + "learning_rate": 0.0001555, + "loss": 0.6877, + "mlm_loss": 0.6877, + "step": 3110 + }, + { + "ep_loss": 0.0, + "epoch": 2.41, + "learning_rate": 0.000156, + "loss": 0.6862, + "mlm_loss": 0.6862, + "step": 3120 + }, + { + "ep_loss": 0.0, + "epoch": 2.41, + "learning_rate": 0.0001565, + "loss": 0.6893, + "mlm_loss": 0.6893, + "step": 3130 + }, + { + "ep_loss": 0.0, + "epoch": 2.42, + "learning_rate": 0.000157, + "loss": 0.6943, + "mlm_loss": 0.6943, + "step": 3140 + }, + { + "ep_loss": 0.0, + "epoch": 2.43, + "learning_rate": 0.0001575, + "loss": 0.6804, + "mlm_loss": 0.6804, + "step": 3150 + }, + { + "ep_loss": 0.0, + "epoch": 2.44, + "learning_rate": 0.000158, + "loss": 0.6813, + "mlm_loss": 0.6813, + "step": 3160 + }, + { + "ep_loss": 0.0, + "epoch": 2.45, + "learning_rate": 0.0001585, + "loss": 0.6828, + "mlm_loss": 0.6828, + "step": 3170 + }, + { + "ep_loss": 0.0, + "epoch": 2.45, + "learning_rate": 0.00015900000000000002, + "loss": 0.6736, + "mlm_loss": 0.6736, + "step": 3180 + }, + { + "ep_loss": 0.0, + "epoch": 2.46, + "learning_rate": 0.0001595, + "loss": 0.6763, + "mlm_loss": 0.6763, + "step": 3190 + }, + { + "ep_loss": 0.0, + "epoch": 2.47, + "learning_rate": 0.00016, + "loss": 0.6796, + "mlm_loss": 0.6796, + "step": 3200 + }, + { + "epoch": 2.47, + "eval_ep_loss": -2.57122540473938, + "eval_loss": 0.6341940760612488, + "eval_mlm_loss": 0.6341940760612488, + "eval_runtime": 63.1531, + "eval_samples_per_second": 1106.343, + "eval_steps_per_second": 0.554, + "step": 3200 + }, + { + "ep_loss": 0.0, + "epoch": 2.48, + "learning_rate": 0.0001605, + "loss": 0.6688, + "mlm_loss": 0.6688, + "step": 3210 + }, + { + "ep_loss": 0.0, + "epoch": 2.48, + "learning_rate": 0.000161, + "loss": 0.6725, + "mlm_loss": 0.6725, + "step": 3220 + }, + { + "ep_loss": 0.0, + "epoch": 2.49, + "learning_rate": 0.0001615, + "loss": 0.6718, + "mlm_loss": 0.6718, + "step": 3230 + }, + { + "ep_loss": 0.0, + "epoch": 2.5, + "learning_rate": 0.000162, + "loss": 0.6675, + "mlm_loss": 0.6675, + "step": 3240 + }, + { + "ep_loss": 0.0, + "epoch": 2.51, + "learning_rate": 0.00016250000000000002, + "loss": 0.6665, + "mlm_loss": 0.6665, + "step": 3250 + }, + { + "ep_loss": 0.0, + "epoch": 2.51, + "learning_rate": 0.000163, + "loss": 0.6615, + "mlm_loss": 0.6615, + "step": 3260 + }, + { + "ep_loss": 0.0, + "epoch": 2.52, + "learning_rate": 0.00016350000000000002, + "loss": 0.6784, + "mlm_loss": 0.6784, + "step": 3270 + }, + { + "ep_loss": 0.0, + "epoch": 2.53, + "learning_rate": 0.000164, + "loss": 0.6686, + "mlm_loss": 0.6686, + "step": 3280 + }, + { + "ep_loss": 0.0, + "epoch": 2.54, + "learning_rate": 0.00016450000000000001, + "loss": 0.6697, + "mlm_loss": 0.6697, + "step": 3290 + }, + { + "ep_loss": 0.0, + "epoch": 2.55, + "learning_rate": 0.000165, + "loss": 0.6621, + "mlm_loss": 0.6621, + "step": 3300 + }, + { + "epoch": 2.55, + "eval_ep_loss": -2.320197820663452, + "eval_loss": 0.6192212104797363, + "eval_mlm_loss": 0.6192212104797363, + "eval_runtime": 60.3086, + "eval_samples_per_second": 1158.524, + "eval_steps_per_second": 0.58, + "step": 3300 + }, + { + "ep_loss": 0.0, + "epoch": 2.55, + "learning_rate": 0.0001655, + "loss": 0.6645, + "mlm_loss": 0.6645, + "step": 3310 + }, + { + "ep_loss": 0.0, + "epoch": 2.56, + "learning_rate": 0.00016600000000000002, + "loss": 0.6594, + "mlm_loss": 0.6594, + "step": 3320 + }, + { + "ep_loss": 0.0, + "epoch": 2.57, + "learning_rate": 0.0001665, + "loss": 0.6585, + "mlm_loss": 0.6585, + "step": 3330 + }, + { + "ep_loss": 0.0, + "epoch": 2.58, + "learning_rate": 0.00016700000000000002, + "loss": 0.6499, + "mlm_loss": 0.6499, + "step": 3340 + }, + { + "ep_loss": 0.0, + "epoch": 2.58, + "learning_rate": 0.0001675, + "loss": 0.641, + "mlm_loss": 0.641, + "step": 3350 + }, + { + "ep_loss": 0.0, + "epoch": 2.59, + "learning_rate": 0.00016800000000000002, + "loss": 0.653, + "mlm_loss": 0.653, + "step": 3360 + }, + { + "ep_loss": 0.0, + "epoch": 2.6, + "learning_rate": 0.0001685, + "loss": 0.6526, + "mlm_loss": 0.6526, + "step": 3370 + }, + { + "ep_loss": 0.0, + "epoch": 2.61, + "learning_rate": 0.00016900000000000002, + "loss": 0.6588, + "mlm_loss": 0.6588, + "step": 3380 + }, + { + "ep_loss": 0.0, + "epoch": 2.61, + "learning_rate": 0.00016950000000000003, + "loss": 0.6457, + "mlm_loss": 0.6457, + "step": 3390 + }, + { + "ep_loss": 0.0, + "epoch": 2.62, + "learning_rate": 0.00017, + "loss": 0.6464, + "mlm_loss": 0.6464, + "step": 3400 + }, + { + "epoch": 2.62, + "eval_ep_loss": -2.441110610961914, + "eval_loss": 0.6078351736068726, + "eval_mlm_loss": 0.6078351736068726, + "eval_runtime": 60.4806, + "eval_samples_per_second": 1155.23, + "eval_steps_per_second": 0.579, + "step": 3400 + }, + { + "ep_loss": 0.0, + "epoch": 2.63, + "learning_rate": 0.00017050000000000002, + "loss": 0.6517, + "mlm_loss": 0.6517, + "step": 3410 + }, + { + "ep_loss": 0.0, + "epoch": 2.64, + "learning_rate": 0.000171, + "loss": 0.6473, + "mlm_loss": 0.6473, + "step": 3420 + }, + { + "ep_loss": 0.0, + "epoch": 2.65, + "learning_rate": 0.00017150000000000002, + "loss": 0.6475, + "mlm_loss": 0.6475, + "step": 3430 + }, + { + "ep_loss": 0.0, + "epoch": 2.65, + "learning_rate": 0.00017199999999999998, + "loss": 0.649, + "mlm_loss": 0.649, + "step": 3440 + }, + { + "ep_loss": 0.0, + "epoch": 2.66, + "learning_rate": 0.0001725, + "loss": 0.652, + "mlm_loss": 0.652, + "step": 3450 + }, + { + "ep_loss": 0.0, + "epoch": 2.67, + "learning_rate": 0.000173, + "loss": 0.6484, + "mlm_loss": 0.6484, + "step": 3460 + }, + { + "ep_loss": 0.0, + "epoch": 2.68, + "learning_rate": 0.0001735, + "loss": 0.6405, + "mlm_loss": 0.6405, + "step": 3470 + }, + { + "ep_loss": 0.0, + "epoch": 2.68, + "learning_rate": 0.000174, + "loss": 0.6421, + "mlm_loss": 0.6421, + "step": 3480 + }, + { + "ep_loss": 0.0, + "epoch": 2.69, + "learning_rate": 0.00017449999999999999, + "loss": 0.6313, + "mlm_loss": 0.6313, + "step": 3490 + }, + { + "ep_loss": 0.0, + "epoch": 2.7, + "learning_rate": 0.000175, + "loss": 0.6228, + "mlm_loss": 0.6228, + "step": 3500 + }, + { + "epoch": 2.7, + "eval_ep_loss": -2.3252084255218506, + "eval_loss": 0.5944607257843018, + "eval_mlm_loss": 0.5944607257843018, + "eval_runtime": 59.8998, + "eval_samples_per_second": 1166.431, + "eval_steps_per_second": 0.584, + "step": 3500 + }, + { + "ep_loss": 0.0, + "epoch": 2.71, + "learning_rate": 0.00017549999999999998, + "loss": 0.6276, + "mlm_loss": 0.6276, + "step": 3510 + }, + { + "ep_loss": 0.0, + "epoch": 2.72, + "learning_rate": 0.000176, + "loss": 0.632, + "mlm_loss": 0.632, + "step": 3520 + }, + { + "ep_loss": 0.0, + "epoch": 2.72, + "learning_rate": 0.00017649999999999998, + "loss": 0.629, + "mlm_loss": 0.629, + "step": 3530 + }, + { + "ep_loss": 0.0, + "epoch": 2.73, + "learning_rate": 0.000177, + "loss": 0.6327, + "mlm_loss": 0.6327, + "step": 3540 + }, + { + "ep_loss": 0.0, + "epoch": 2.74, + "learning_rate": 0.0001775, + "loss": 0.6307, + "mlm_loss": 0.6307, + "step": 3550 + }, + { + "ep_loss": 0.0, + "epoch": 2.75, + "learning_rate": 0.000178, + "loss": 0.6156, + "mlm_loss": 0.6156, + "step": 3560 + }, + { + "ep_loss": 0.0, + "epoch": 2.75, + "learning_rate": 0.0001785, + "loss": 0.6246, + "mlm_loss": 0.6246, + "step": 3570 + }, + { + "ep_loss": 0.0, + "epoch": 2.76, + "learning_rate": 0.000179, + "loss": 0.6223, + "mlm_loss": 0.6223, + "step": 3580 + }, + { + "ep_loss": 0.0, + "epoch": 2.77, + "learning_rate": 0.0001795, + "loss": 0.6359, + "mlm_loss": 0.6359, + "step": 3590 + }, + { + "ep_loss": 0.0, + "epoch": 2.78, + "learning_rate": 0.00017999999999999998, + "loss": 0.6192, + "mlm_loss": 0.6192, + "step": 3600 + }, + { + "epoch": 2.78, + "eval_ep_loss": -2.543795585632324, + "eval_loss": 0.5830370783805847, + "eval_mlm_loss": 0.5830370783805847, + "eval_runtime": 62.3115, + "eval_samples_per_second": 1121.285, + "eval_steps_per_second": 0.562, + "step": 3600 + }, + { + "ep_loss": 0.0, + "epoch": 2.78, + "learning_rate": 0.0001805, + "loss": 0.619, + "mlm_loss": 0.619, + "step": 3610 + }, + { + "ep_loss": 0.0, + "epoch": 2.79, + "learning_rate": 0.000181, + "loss": 0.6107, + "mlm_loss": 0.6107, + "step": 3620 + }, + { + "ep_loss": 0.0, + "epoch": 2.8, + "learning_rate": 0.0001815, + "loss": 0.6287, + "mlm_loss": 0.6287, + "step": 3630 + }, + { + "ep_loss": 0.0, + "epoch": 2.81, + "learning_rate": 0.000182, + "loss": 0.616, + "mlm_loss": 0.616, + "step": 3640 + }, + { + "ep_loss": 0.0, + "epoch": 2.82, + "learning_rate": 0.0001825, + "loss": 0.6199, + "mlm_loss": 0.6199, + "step": 3650 + }, + { + "ep_loss": 0.0, + "epoch": 2.82, + "learning_rate": 0.000183, + "loss": 0.6179, + "mlm_loss": 0.6179, + "step": 3660 + }, + { + "ep_loss": 0.0, + "epoch": 2.83, + "learning_rate": 0.0001835, + "loss": 0.6159, + "mlm_loss": 0.6159, + "step": 3670 + }, + { + "ep_loss": 0.0, + "epoch": 2.84, + "learning_rate": 0.000184, + "loss": 0.6067, + "mlm_loss": 0.6067, + "step": 3680 + }, + { + "ep_loss": 0.0, + "epoch": 2.85, + "learning_rate": 0.0001845, + "loss": 0.6079, + "mlm_loss": 0.6079, + "step": 3690 + }, + { + "ep_loss": 0.0, + "epoch": 2.85, + "learning_rate": 0.000185, + "loss": 0.6063, + "mlm_loss": 0.6063, + "step": 3700 + }, + { + "epoch": 2.85, + "eval_ep_loss": -2.2521820068359375, + "eval_loss": 0.5688546299934387, + "eval_mlm_loss": 0.5688546299934387, + "eval_runtime": 60.0349, + "eval_samples_per_second": 1163.807, + "eval_steps_per_second": 0.583, + "step": 3700 + }, + { + "ep_loss": 0.0, + "epoch": 2.86, + "learning_rate": 0.0001855, + "loss": 0.6156, + "mlm_loss": 0.6156, + "step": 3710 + }, + { + "ep_loss": 0.0, + "epoch": 2.87, + "learning_rate": 0.000186, + "loss": 0.6093, + "mlm_loss": 0.6093, + "step": 3720 + }, + { + "ep_loss": 0.0, + "epoch": 2.88, + "learning_rate": 0.0001865, + "loss": 0.6073, + "mlm_loss": 0.6073, + "step": 3730 + }, + { + "ep_loss": 0.0, + "epoch": 2.88, + "learning_rate": 0.000187, + "loss": 0.6039, + "mlm_loss": 0.6039, + "step": 3740 + }, + { + "ep_loss": 0.0, + "epoch": 2.89, + "learning_rate": 0.0001875, + "loss": 0.5977, + "mlm_loss": 0.5977, + "step": 3750 + }, + { + "ep_loss": 0.0, + "epoch": 2.9, + "learning_rate": 0.00018800000000000002, + "loss": 0.6, + "mlm_loss": 0.6, + "step": 3760 + }, + { + "ep_loss": 0.0, + "epoch": 2.91, + "learning_rate": 0.0001885, + "loss": 0.6022, + "mlm_loss": 0.6022, + "step": 3770 + }, + { + "ep_loss": 0.0, + "epoch": 2.92, + "learning_rate": 0.000189, + "loss": 0.6015, + "mlm_loss": 0.6015, + "step": 3780 + }, + { + "ep_loss": 0.0, + "epoch": 2.92, + "learning_rate": 0.0001895, + "loss": 0.6016, + "mlm_loss": 0.6016, + "step": 3790 + }, + { + "ep_loss": 0.0, + "epoch": 2.93, + "learning_rate": 0.00019, + "loss": 0.5959, + "mlm_loss": 0.5959, + "step": 3800 + }, + { + "epoch": 2.93, + "eval_ep_loss": -2.366133213043213, + "eval_loss": 0.5573724508285522, + "eval_mlm_loss": 0.5573724508285522, + "eval_runtime": 61.8753, + "eval_samples_per_second": 1129.191, + "eval_steps_per_second": 0.566, + "step": 3800 + }, + { + "ep_loss": 0.0, + "epoch": 2.94, + "learning_rate": 0.0001905, + "loss": 0.6003, + "mlm_loss": 0.6003, + "step": 3810 + }, + { + "ep_loss": 0.0, + "epoch": 2.95, + "learning_rate": 0.000191, + "loss": 0.5984, + "mlm_loss": 0.5984, + "step": 3820 + }, + { + "ep_loss": 0.0, + "epoch": 2.95, + "learning_rate": 0.00019150000000000002, + "loss": 0.6005, + "mlm_loss": 0.6005, + "step": 3830 + }, + { + "ep_loss": 0.0, + "epoch": 2.96, + "learning_rate": 0.000192, + "loss": 0.5895, + "mlm_loss": 0.5895, + "step": 3840 + }, + { + "ep_loss": 0.0, + "epoch": 2.97, + "learning_rate": 0.00019250000000000002, + "loss": 0.5912, + "mlm_loss": 0.5912, + "step": 3850 + }, + { + "ep_loss": 0.0, + "epoch": 2.98, + "learning_rate": 0.000193, + "loss": 0.5828, + "mlm_loss": 0.5828, + "step": 3860 + }, + { + "ep_loss": 0.0, + "epoch": 2.98, + "learning_rate": 0.00019350000000000001, + "loss": 0.5865, + "mlm_loss": 0.5865, + "step": 3870 + }, + { + "ep_loss": 0.0, + "epoch": 2.99, + "learning_rate": 0.000194, + "loss": 0.5854, + "mlm_loss": 0.5854, + "step": 3880 + }, + { + "ep_loss": 0.0, + "epoch": 3.0, + "learning_rate": 0.0001945, + "loss": 0.5883, + "mlm_loss": 0.5883, + "step": 3890 + }, + { + "ep_loss": 0.0, + "epoch": 3.01, + "learning_rate": 0.00019500000000000002, + "loss": 0.5743, + "mlm_loss": 0.5743, + "step": 3900 + }, + { + "epoch": 3.01, + "eval_ep_loss": -2.332629680633545, + "eval_loss": 0.551331102848053, + "eval_mlm_loss": 0.551331102848053, + "eval_runtime": 61.0219, + "eval_samples_per_second": 1144.983, + "eval_steps_per_second": 0.574, + "step": 3900 + }, + { + "ep_loss": 0.0, + "epoch": 3.02, + "learning_rate": 0.0001955, + "loss": 0.5813, + "mlm_loss": 0.5813, + "step": 3910 + }, + { + "ep_loss": 0.0, + "epoch": 3.02, + "learning_rate": 0.00019600000000000002, + "loss": 0.5813, + "mlm_loss": 0.5813, + "step": 3920 + }, + { + "ep_loss": 0.0, + "epoch": 3.03, + "learning_rate": 0.0001965, + "loss": 0.5899, + "mlm_loss": 0.5899, + "step": 3930 + }, + { + "ep_loss": 0.0, + "epoch": 3.04, + "learning_rate": 0.00019700000000000002, + "loss": 0.5849, + "mlm_loss": 0.5849, + "step": 3940 + }, + { + "ep_loss": 0.0, + "epoch": 3.05, + "learning_rate": 0.0001975, + "loss": 0.5823, + "mlm_loss": 0.5823, + "step": 3950 + }, + { + "ep_loss": 0.0, + "epoch": 3.05, + "learning_rate": 0.00019800000000000002, + "loss": 0.5794, + "mlm_loss": 0.5794, + "step": 3960 + }, + { + "ep_loss": 0.0, + "epoch": 3.06, + "learning_rate": 0.00019850000000000003, + "loss": 0.5837, + "mlm_loss": 0.5837, + "step": 3970 + }, + { + "ep_loss": 0.0, + "epoch": 3.07, + "learning_rate": 0.000199, + "loss": 0.5783, + "mlm_loss": 0.5783, + "step": 3980 + }, + { + "ep_loss": 0.0, + "epoch": 3.08, + "learning_rate": 0.00019950000000000002, + "loss": 0.5733, + "mlm_loss": 0.5733, + "step": 3990 + }, + { + "ep_loss": 0.0, + "epoch": 3.09, + "learning_rate": 0.0002, + "loss": 0.5701, + "mlm_loss": 0.5701, + "step": 4000 + }, + { + "epoch": 3.09, + "eval_ep_loss": -2.3982014656066895, + "eval_loss": 0.5421211123466492, + "eval_mlm_loss": 0.5421211123466492, + "eval_runtime": 62.0302, + "eval_samples_per_second": 1126.37, + "eval_steps_per_second": 0.564, + "step": 4000 + }, + { + "ep_loss": 0.0, + "epoch": 3.09, + "learning_rate": 0.00020050000000000002, + "loss": 0.5737, + "mlm_loss": 0.5737, + "step": 4010 + }, + { + "ep_loss": 0.0, + "epoch": 3.1, + "learning_rate": 0.000201, + "loss": 0.5645, + "mlm_loss": 0.5645, + "step": 4020 + }, + { + "ep_loss": 0.0, + "epoch": 3.11, + "learning_rate": 0.00020150000000000002, + "loss": 0.5758, + "mlm_loss": 0.5758, + "step": 4030 + }, + { + "ep_loss": 0.0, + "epoch": 3.12, + "learning_rate": 0.000202, + "loss": 0.5849, + "mlm_loss": 0.5849, + "step": 4040 + }, + { + "ep_loss": 0.0, + "epoch": 3.12, + "learning_rate": 0.00020250000000000002, + "loss": 0.5667, + "mlm_loss": 0.5667, + "step": 4050 + }, + { + "ep_loss": 0.0, + "epoch": 3.13, + "learning_rate": 0.00020300000000000003, + "loss": 0.5726, + "mlm_loss": 0.5726, + "step": 4060 + }, + { + "ep_loss": 0.0, + "epoch": 3.14, + "learning_rate": 0.00020349999999999999, + "loss": 0.5645, + "mlm_loss": 0.5645, + "step": 4070 + }, + { + "ep_loss": 0.0, + "epoch": 3.15, + "learning_rate": 0.000204, + "loss": 0.5681, + "mlm_loss": 0.5681, + "step": 4080 + }, + { + "ep_loss": 0.0, + "epoch": 3.15, + "learning_rate": 0.00020449999999999998, + "loss": 0.564, + "mlm_loss": 0.564, + "step": 4090 + }, + { + "ep_loss": 0.0, + "epoch": 3.16, + "learning_rate": 0.000205, + "loss": 0.5655, + "mlm_loss": 0.5655, + "step": 4100 + }, + { + "epoch": 3.16, + "eval_ep_loss": -2.218219041824341, + "eval_loss": 0.5314173698425293, + "eval_mlm_loss": 0.5314173698425293, + "eval_runtime": 60.8804, + "eval_samples_per_second": 1147.644, + "eval_steps_per_second": 0.575, + "step": 4100 + }, + { + "ep_loss": 0.0, + "epoch": 3.17, + "learning_rate": 0.00020549999999999998, + "loss": 0.5654, + "mlm_loss": 0.5654, + "step": 4110 + }, + { + "ep_loss": 0.0, + "epoch": 3.18, + "learning_rate": 0.000206, + "loss": 0.5647, + "mlm_loss": 0.5647, + "step": 4120 + }, + { + "ep_loss": 0.0, + "epoch": 3.19, + "learning_rate": 0.0002065, + "loss": 0.568, + "mlm_loss": 0.568, + "step": 4130 + }, + { + "ep_loss": 0.0, + "epoch": 3.19, + "learning_rate": 0.000207, + "loss": 0.5538, + "mlm_loss": 0.5538, + "step": 4140 + }, + { + "ep_loss": 0.0, + "epoch": 3.2, + "learning_rate": 0.0002075, + "loss": 0.5585, + "mlm_loss": 0.5585, + "step": 4150 + }, + { + "ep_loss": 0.0, + "epoch": 3.21, + "learning_rate": 0.000208, + "loss": 0.5578, + "mlm_loss": 0.5578, + "step": 4160 + }, + { + "ep_loss": 0.0, + "epoch": 3.22, + "learning_rate": 0.0002085, + "loss": 0.5508, + "mlm_loss": 0.5508, + "step": 4170 + }, + { + "ep_loss": 0.0, + "epoch": 3.22, + "learning_rate": 0.00020899999999999998, + "loss": 0.5644, + "mlm_loss": 0.5644, + "step": 4180 + }, + { + "ep_loss": 0.0, + "epoch": 3.23, + "learning_rate": 0.0002095, + "loss": 0.5642, + "mlm_loss": 0.5642, + "step": 4190 + }, + { + "ep_loss": 0.0, + "epoch": 3.24, + "learning_rate": 0.00021, + "loss": 0.561, + "mlm_loss": 0.561, + "step": 4200 + }, + { + "epoch": 3.24, + "eval_ep_loss": -2.287240743637085, + "eval_loss": 0.5224212408065796, + "eval_mlm_loss": 0.5224212408065796, + "eval_runtime": 61.3268, + "eval_samples_per_second": 1139.291, + "eval_steps_per_second": 0.571, + "step": 4200 + }, + { + "ep_loss": 0.0, + "epoch": 3.25, + "learning_rate": 0.0002105, + "loss": 0.5564, + "mlm_loss": 0.5564, + "step": 4210 + }, + { + "ep_loss": 0.0, + "epoch": 3.25, + "learning_rate": 0.000211, + "loss": 0.5511, + "mlm_loss": 0.5511, + "step": 4220 + }, + { + "ep_loss": 0.0, + "epoch": 3.26, + "learning_rate": 0.0002115, + "loss": 0.5578, + "mlm_loss": 0.5578, + "step": 4230 + }, + { + "ep_loss": 0.0, + "epoch": 3.27, + "learning_rate": 0.000212, + "loss": 0.5531, + "mlm_loss": 0.5531, + "step": 4240 + }, + { + "ep_loss": 0.0, + "epoch": 3.28, + "learning_rate": 0.0002125, + "loss": 0.5569, + "mlm_loss": 0.5569, + "step": 4250 + }, + { + "ep_loss": 0.0, + "epoch": 3.29, + "learning_rate": 0.000213, + "loss": 0.5499, + "mlm_loss": 0.5499, + "step": 4260 + }, + { + "ep_loss": 0.0, + "epoch": 3.29, + "learning_rate": 0.0002135, + "loss": 0.5514, + "mlm_loss": 0.5514, + "step": 4270 + }, + { + "ep_loss": 0.0, + "epoch": 3.3, + "learning_rate": 0.000214, + "loss": 0.554, + "mlm_loss": 0.554, + "step": 4280 + }, + { + "ep_loss": 0.0, + "epoch": 3.31, + "learning_rate": 0.0002145, + "loss": 0.5444, + "mlm_loss": 0.5444, + "step": 4290 + }, + { + "ep_loss": 0.0, + "epoch": 3.32, + "learning_rate": 0.000215, + "loss": 0.5555, + "mlm_loss": 0.5555, + "step": 4300 + }, + { + "epoch": 3.32, + "eval_ep_loss": -2.407238245010376, + "eval_loss": 0.5153276920318604, + "eval_mlm_loss": 0.5153276920318604, + "eval_runtime": 62.01, + "eval_samples_per_second": 1126.738, + "eval_steps_per_second": 0.564, + "step": 4300 + }, + { + "ep_loss": 0.0, + "epoch": 3.32, + "learning_rate": 0.0002155, + "loss": 0.5374, + "mlm_loss": 0.5374, + "step": 4310 + }, + { + "ep_loss": 0.0, + "epoch": 3.33, + "learning_rate": 0.000216, + "loss": 0.5526, + "mlm_loss": 0.5526, + "step": 4320 + }, + { + "ep_loss": 0.0, + "epoch": 3.34, + "learning_rate": 0.0002165, + "loss": 0.5451, + "mlm_loss": 0.5451, + "step": 4330 + }, + { + "ep_loss": 0.0, + "epoch": 3.35, + "learning_rate": 0.00021700000000000002, + "loss": 0.5406, + "mlm_loss": 0.5406, + "step": 4340 + }, + { + "ep_loss": 0.0, + "epoch": 3.36, + "learning_rate": 0.0002175, + "loss": 0.5339, + "mlm_loss": 0.5339, + "step": 4350 + }, + { + "ep_loss": 0.0, + "epoch": 3.36, + "learning_rate": 0.000218, + "loss": 0.5434, + "mlm_loss": 0.5434, + "step": 4360 + }, + { + "ep_loss": 0.0, + "epoch": 3.37, + "learning_rate": 0.0002185, + "loss": 0.546, + "mlm_loss": 0.546, + "step": 4370 + }, + { + "ep_loss": 0.0, + "epoch": 3.38, + "learning_rate": 0.000219, + "loss": 0.5359, + "mlm_loss": 0.5359, + "step": 4380 + }, + { + "ep_loss": 0.0, + "epoch": 3.39, + "learning_rate": 0.0002195, + "loss": 0.5405, + "mlm_loss": 0.5405, + "step": 4390 + }, + { + "ep_loss": 0.0, + "epoch": 3.39, + "learning_rate": 0.00022, + "loss": 0.5506, + "mlm_loss": 0.5506, + "step": 4400 + }, + { + "epoch": 3.39, + "eval_ep_loss": -2.1430230140686035, + "eval_loss": 0.5089389681816101, + "eval_mlm_loss": 0.5089389681816101, + "eval_runtime": 60.2829, + "eval_samples_per_second": 1159.018, + "eval_steps_per_second": 0.581, + "step": 4400 + }, + { + "ep_loss": 0.0, + "epoch": 3.4, + "learning_rate": 0.0002205, + "loss": 0.5425, + "mlm_loss": 0.5425, + "step": 4410 + }, + { + "ep_loss": 0.0, + "epoch": 3.41, + "learning_rate": 0.000221, + "loss": 0.5443, + "mlm_loss": 0.5443, + "step": 4420 + }, + { + "ep_loss": 0.0, + "epoch": 3.42, + "learning_rate": 0.00022150000000000002, + "loss": 0.5433, + "mlm_loss": 0.5433, + "step": 4430 + }, + { + "ep_loss": 0.0, + "epoch": 3.42, + "learning_rate": 0.000222, + "loss": 0.5379, + "mlm_loss": 0.5379, + "step": 4440 + }, + { + "ep_loss": 0.0, + "epoch": 3.43, + "learning_rate": 0.00022250000000000001, + "loss": 0.5349, + "mlm_loss": 0.5349, + "step": 4450 + }, + { + "ep_loss": 0.0, + "epoch": 3.44, + "learning_rate": 0.000223, + "loss": 0.5385, + "mlm_loss": 0.5385, + "step": 4460 + }, + { + "ep_loss": 0.0, + "epoch": 3.45, + "learning_rate": 0.0002235, + "loss": 0.5337, + "mlm_loss": 0.5337, + "step": 4470 + }, + { + "ep_loss": 0.0, + "epoch": 3.46, + "learning_rate": 0.000224, + "loss": 0.5372, + "mlm_loss": 0.5372, + "step": 4480 + }, + { + "ep_loss": 0.0, + "epoch": 3.46, + "learning_rate": 0.0002245, + "loss": 0.5342, + "mlm_loss": 0.5342, + "step": 4490 + }, + { + "ep_loss": 0.0, + "epoch": 3.47, + "learning_rate": 0.00022500000000000002, + "loss": 0.5329, + "mlm_loss": 0.5329, + "step": 4500 + }, + { + "epoch": 3.47, + "eval_ep_loss": -2.252265691757202, + "eval_loss": 0.501177966594696, + "eval_mlm_loss": 0.501177966594696, + "eval_runtime": 61.1269, + "eval_samples_per_second": 1143.016, + "eval_steps_per_second": 0.573, + "step": 4500 + }, + { + "ep_loss": 0.0, + "epoch": 3.48, + "learning_rate": 0.0002255, + "loss": 0.5321, + "mlm_loss": 0.5321, + "step": 4510 + }, + { + "ep_loss": 0.0, + "epoch": 3.49, + "learning_rate": 0.00022600000000000002, + "loss": 0.5413, + "mlm_loss": 0.5413, + "step": 4520 + }, + { + "ep_loss": 0.0, + "epoch": 3.49, + "learning_rate": 0.0002265, + "loss": 0.5364, + "mlm_loss": 0.5364, + "step": 4530 + }, + { + "ep_loss": 0.0, + "epoch": 3.5, + "learning_rate": 0.00022700000000000002, + "loss": 0.5326, + "mlm_loss": 0.5326, + "step": 4540 + }, + { + "ep_loss": 0.0, + "epoch": 3.51, + "learning_rate": 0.0002275, + "loss": 0.5327, + "mlm_loss": 0.5327, + "step": 4550 + }, + { + "ep_loss": 0.0, + "epoch": 3.52, + "learning_rate": 0.000228, + "loss": 0.5334, + "mlm_loss": 0.5334, + "step": 4560 + }, + { + "ep_loss": 0.0, + "epoch": 3.52, + "learning_rate": 0.00022850000000000002, + "loss": 0.5344, + "mlm_loss": 0.5344, + "step": 4570 + }, + { + "ep_loss": 0.0, + "epoch": 3.53, + "learning_rate": 0.000229, + "loss": 0.5259, + "mlm_loss": 0.5259, + "step": 4580 + }, + { + "ep_loss": 0.0, + "epoch": 3.54, + "learning_rate": 0.00022950000000000002, + "loss": 0.519, + "mlm_loss": 0.519, + "step": 4590 + }, + { + "ep_loss": 0.0, + "epoch": 3.55, + "learning_rate": 0.00023, + "loss": 0.5243, + "mlm_loss": 0.5243, + "step": 4600 + }, + { + "epoch": 3.55, + "eval_ep_loss": -2.4623284339904785, + "eval_loss": 0.49688172340393066, + "eval_mlm_loss": 0.49688172340393066, + "eval_runtime": 60.8838, + "eval_samples_per_second": 1147.579, + "eval_steps_per_second": 0.575, + "step": 4600 + }, + { + "ep_loss": 0.0, + "epoch": 3.56, + "learning_rate": 0.00023050000000000002, + "loss": 0.5237, + "mlm_loss": 0.5237, + "step": 4610 + }, + { + "ep_loss": 0.0, + "epoch": 3.56, + "learning_rate": 0.000231, + "loss": 0.5249, + "mlm_loss": 0.5249, + "step": 4620 + }, + { + "ep_loss": 0.0, + "epoch": 3.57, + "learning_rate": 0.00023150000000000002, + "loss": 0.5208, + "mlm_loss": 0.5208, + "step": 4630 + }, + { + "ep_loss": 0.0, + "epoch": 3.58, + "learning_rate": 0.00023200000000000003, + "loss": 0.5213, + "mlm_loss": 0.5213, + "step": 4640 + }, + { + "ep_loss": 0.0, + "epoch": 3.59, + "learning_rate": 0.0002325, + "loss": 0.5316, + "mlm_loss": 0.5316, + "step": 4650 + }, + { + "ep_loss": 0.0, + "epoch": 3.59, + "learning_rate": 0.00023300000000000003, + "loss": 0.5231, + "mlm_loss": 0.5231, + "step": 4660 + }, + { + "ep_loss": 0.0, + "epoch": 3.6, + "learning_rate": 0.0002335, + "loss": 0.524, + "mlm_loss": 0.524, + "step": 4670 + }, + { + "ep_loss": 0.0, + "epoch": 3.61, + "learning_rate": 0.00023400000000000002, + "loss": 0.5274, + "mlm_loss": 0.5274, + "step": 4680 + }, + { + "ep_loss": 0.0, + "epoch": 3.62, + "learning_rate": 0.00023449999999999998, + "loss": 0.5186, + "mlm_loss": 0.5186, + "step": 4690 + }, + { + "ep_loss": 0.0, + "epoch": 3.63, + "learning_rate": 0.000235, + "loss": 0.5201, + "mlm_loss": 0.5201, + "step": 4700 + }, + { + "epoch": 3.63, + "eval_ep_loss": -2.275129556655884, + "eval_loss": 0.48909062147140503, + "eval_mlm_loss": 0.48909062147140503, + "eval_runtime": 63.7661, + "eval_samples_per_second": 1095.708, + "eval_steps_per_second": 0.549, + "step": 4700 + }, + { + "ep_loss": 0.0, + "epoch": 3.63, + "learning_rate": 0.0002355, + "loss": 0.5167, + "mlm_loss": 0.5167, + "step": 4710 + }, + { + "ep_loss": 0.0, + "epoch": 3.64, + "learning_rate": 0.000236, + "loss": 0.5152, + "mlm_loss": 0.5152, + "step": 4720 + }, + { + "ep_loss": 0.0, + "epoch": 3.65, + "learning_rate": 0.0002365, + "loss": 0.5201, + "mlm_loss": 0.5201, + "step": 4730 + }, + { + "ep_loss": 0.0, + "epoch": 3.66, + "learning_rate": 0.000237, + "loss": 0.5234, + "mlm_loss": 0.5234, + "step": 4740 + }, + { + "ep_loss": 0.0, + "epoch": 3.66, + "learning_rate": 0.0002375, + "loss": 0.5156, + "mlm_loss": 0.5156, + "step": 4750 + }, + { + "ep_loss": 0.0, + "epoch": 3.67, + "learning_rate": 0.00023799999999999998, + "loss": 0.5189, + "mlm_loss": 0.5189, + "step": 4760 + }, + { + "ep_loss": 0.0, + "epoch": 3.68, + "learning_rate": 0.0002385, + "loss": 0.5189, + "mlm_loss": 0.5189, + "step": 4770 + }, + { + "ep_loss": 0.0, + "epoch": 3.69, + "learning_rate": 0.00023899999999999998, + "loss": 0.51, + "mlm_loss": 0.51, + "step": 4780 + }, + { + "ep_loss": 0.0, + "epoch": 3.69, + "learning_rate": 0.0002395, + "loss": 0.5158, + "mlm_loss": 0.5158, + "step": 4790 + }, + { + "ep_loss": 0.0, + "epoch": 3.7, + "learning_rate": 0.00024, + "loss": 0.5121, + "mlm_loss": 0.5121, + "step": 4800 + }, + { + "epoch": 3.7, + "eval_ep_loss": -2.0928449630737305, + "eval_loss": 0.48291251063346863, + "eval_mlm_loss": 0.48291251063346863, + "eval_runtime": 60.133, + "eval_samples_per_second": 1161.908, + "eval_steps_per_second": 0.582, + "step": 4800 + }, + { + "ep_loss": 0.0, + "epoch": 3.71, + "learning_rate": 0.0002405, + "loss": 0.5108, + "mlm_loss": 0.5108, + "step": 4810 + }, + { + "ep_loss": 0.0, + "epoch": 3.72, + "learning_rate": 0.000241, + "loss": 0.5176, + "mlm_loss": 0.5176, + "step": 4820 + }, + { + "ep_loss": 0.0, + "epoch": 3.73, + "learning_rate": 0.0002415, + "loss": 0.5102, + "mlm_loss": 0.5102, + "step": 4830 + }, + { + "ep_loss": 0.0, + "epoch": 3.73, + "learning_rate": 0.000242, + "loss": 0.514, + "mlm_loss": 0.514, + "step": 4840 + }, + { + "ep_loss": 0.0, + "epoch": 3.74, + "learning_rate": 0.00024249999999999999, + "loss": 0.5086, + "mlm_loss": 0.5086, + "step": 4850 + }, + { + "ep_loss": 0.0, + "epoch": 3.75, + "learning_rate": 0.000243, + "loss": 0.515, + "mlm_loss": 0.515, + "step": 4860 + }, + { + "ep_loss": 0.0, + "epoch": 3.76, + "learning_rate": 0.0002435, + "loss": 0.5091, + "mlm_loss": 0.5091, + "step": 4870 + }, + { + "ep_loss": 0.0, + "epoch": 3.76, + "learning_rate": 0.000244, + "loss": 0.514, + "mlm_loss": 0.514, + "step": 4880 + }, + { + "ep_loss": 0.0, + "epoch": 3.77, + "learning_rate": 0.0002445, + "loss": 0.5051, + "mlm_loss": 0.5051, + "step": 4890 + }, + { + "ep_loss": 0.0, + "epoch": 3.78, + "learning_rate": 0.000245, + "loss": 0.526, + "mlm_loss": 0.526, + "step": 4900 + }, + { + "epoch": 3.78, + "eval_ep_loss": -2.452838182449341, + "eval_loss": 0.4754766523838043, + "eval_mlm_loss": 0.4754766523838043, + "eval_runtime": 60.9895, + "eval_samples_per_second": 1145.591, + "eval_steps_per_second": 0.574, + "step": 4900 + }, + { + "ep_loss": 0.0, + "epoch": 3.79, + "learning_rate": 0.0002455, + "loss": 0.5104, + "mlm_loss": 0.5104, + "step": 4910 + }, + { + "ep_loss": 0.0, + "epoch": 3.79, + "learning_rate": 0.000246, + "loss": 0.4975, + "mlm_loss": 0.4975, + "step": 4920 + }, + { + "ep_loss": 0.0, + "epoch": 3.8, + "learning_rate": 0.00024650000000000003, + "loss": 0.511, + "mlm_loss": 0.511, + "step": 4930 + }, + { + "ep_loss": 0.0, + "epoch": 3.81, + "learning_rate": 0.000247, + "loss": 0.5081, + "mlm_loss": 0.5081, + "step": 4940 + }, + { + "ep_loss": 0.0, + "epoch": 3.82, + "learning_rate": 0.0002475, + "loss": 0.5067, + "mlm_loss": 0.5067, + "step": 4950 + }, + { + "ep_loss": 0.0, + "epoch": 3.83, + "learning_rate": 0.000248, + "loss": 0.5062, + "mlm_loss": 0.5062, + "step": 4960 + }, + { + "ep_loss": 0.0, + "epoch": 3.83, + "learning_rate": 0.0002485, + "loss": 0.5054, + "mlm_loss": 0.5054, + "step": 4970 + }, + { + "ep_loss": 0.0, + "epoch": 3.84, + "learning_rate": 0.000249, + "loss": 0.5032, + "mlm_loss": 0.5032, + "step": 4980 + }, + { + "ep_loss": 0.0, + "epoch": 3.85, + "learning_rate": 0.0002495, + "loss": 0.5022, + "mlm_loss": 0.5022, + "step": 4990 + }, + { + "ep_loss": 0.0, + "epoch": 3.86, + "learning_rate": 0.00025, + "loss": 0.5077, + "mlm_loss": 0.5077, + "step": 5000 + }, + { + "epoch": 3.86, + "eval_ep_loss": -2.5484063625335693, + "eval_loss": 0.47381114959716797, + "eval_mlm_loss": 0.47381114959716797, + "eval_runtime": 59.7879, + "eval_samples_per_second": 1168.615, + "eval_steps_per_second": 0.585, + "step": 5000 + }, + { + "ep_loss": 0.0, + "epoch": 3.86, + "learning_rate": 0.0002505, + "loss": 0.5129, + "mlm_loss": 0.5129, + "step": 5010 + }, + { + "ep_loss": 0.0, + "epoch": 3.87, + "learning_rate": 0.00025100000000000003, + "loss": 0.5031, + "mlm_loss": 0.5031, + "step": 5020 + }, + { + "ep_loss": 0.0, + "epoch": 3.88, + "learning_rate": 0.0002515, + "loss": 0.4941, + "mlm_loss": 0.4941, + "step": 5030 + }, + { + "ep_loss": 0.0, + "epoch": 3.89, + "learning_rate": 0.000252, + "loss": 0.4976, + "mlm_loss": 0.4976, + "step": 5040 + }, + { + "ep_loss": 0.0, + "epoch": 3.9, + "learning_rate": 0.0002525, + "loss": 0.5053, + "mlm_loss": 0.5053, + "step": 5050 + }, + { + "ep_loss": 0.0, + "epoch": 3.9, + "learning_rate": 0.000253, + "loss": 0.495, + "mlm_loss": 0.495, + "step": 5060 + }, + { + "ep_loss": 0.0, + "epoch": 3.91, + "learning_rate": 0.0002535, + "loss": 0.4972, + "mlm_loss": 0.4972, + "step": 5070 + }, + { + "ep_loss": 0.0, + "epoch": 3.92, + "learning_rate": 0.000254, + "loss": 0.4934, + "mlm_loss": 0.4934, + "step": 5080 + }, + { + "ep_loss": 0.0, + "epoch": 3.93, + "learning_rate": 0.0002545, + "loss": 0.4907, + "mlm_loss": 0.4907, + "step": 5090 + }, + { + "ep_loss": 0.0, + "epoch": 3.93, + "learning_rate": 0.000255, + "loss": 0.4936, + "mlm_loss": 0.4936, + "step": 5100 + }, + { + "epoch": 3.93, + "eval_ep_loss": -2.5289254188537598, + "eval_loss": 0.46655842661857605, + "eval_mlm_loss": 0.46655842661857605, + "eval_runtime": 62.2403, + "eval_samples_per_second": 1122.569, + "eval_steps_per_second": 0.562, + "step": 5100 + }, + { + "ep_loss": 0.0, + "epoch": 3.94, + "learning_rate": 0.00025550000000000003, + "loss": 0.4996, + "mlm_loss": 0.4996, + "step": 5110 + }, + { + "ep_loss": 0.0, + "epoch": 3.95, + "learning_rate": 0.000256, + "loss": 0.4911, + "mlm_loss": 0.4911, + "step": 5120 + }, + { + "ep_loss": 0.0, + "epoch": 3.96, + "learning_rate": 0.0002565, + "loss": 0.4994, + "mlm_loss": 0.4994, + "step": 5130 + }, + { + "ep_loss": 0.0, + "epoch": 3.96, + "learning_rate": 0.000257, + "loss": 0.4899, + "mlm_loss": 0.4899, + "step": 5140 + }, + { + "ep_loss": 0.0, + "epoch": 3.97, + "learning_rate": 0.0002575, + "loss": 0.5057, + "mlm_loss": 0.5057, + "step": 5150 + }, + { + "ep_loss": 0.0, + "epoch": 3.98, + "learning_rate": 0.00025800000000000004, + "loss": 0.4845, + "mlm_loss": 0.4845, + "step": 5160 + }, + { + "ep_loss": 0.0, + "epoch": 3.99, + "learning_rate": 0.0002585, + "loss": 0.4925, + "mlm_loss": 0.4925, + "step": 5170 + }, + { + "ep_loss": 0.0, + "epoch": 4.0, + "learning_rate": 0.000259, + "loss": 0.4976, + "mlm_loss": 0.4976, + "step": 5180 + }, + { + "ep_loss": 0.0, + "epoch": 4.0, + "learning_rate": 0.0002595, + "loss": 0.495, + "mlm_loss": 0.495, + "step": 5190 + }, + { + "ep_loss": 0.0, + "epoch": 4.01, + "learning_rate": 0.00026000000000000003, + "loss": 0.4902, + "mlm_loss": 0.4902, + "step": 5200 + }, + { + "epoch": 4.01, + "eval_ep_loss": -2.2716147899627686, + "eval_loss": 0.46275386214256287, + "eval_mlm_loss": 0.46275386214256287, + "eval_runtime": 59.0432, + "eval_samples_per_second": 1183.353, + "eval_steps_per_second": 0.593, + "step": 5200 + }, + { + "ep_loss": 0.0, + "epoch": 4.02, + "learning_rate": 0.0002605, + "loss": 0.4882, + "mlm_loss": 0.4882, + "step": 5210 + }, + { + "ep_loss": 0.0, + "epoch": 4.03, + "learning_rate": 0.000261, + "loss": 0.4889, + "mlm_loss": 0.4889, + "step": 5220 + }, + { + "ep_loss": 0.0, + "epoch": 4.03, + "learning_rate": 0.0002615, + "loss": 0.4967, + "mlm_loss": 0.4967, + "step": 5230 + }, + { + "ep_loss": 0.0, + "epoch": 4.04, + "learning_rate": 0.000262, + "loss": 0.4841, + "mlm_loss": 0.4841, + "step": 5240 + }, + { + "ep_loss": 0.0, + "epoch": 4.05, + "learning_rate": 0.00026250000000000004, + "loss": 0.4883, + "mlm_loss": 0.4883, + "step": 5250 + }, + { + "ep_loss": 0.0, + "epoch": 4.06, + "learning_rate": 0.000263, + "loss": 0.4877, + "mlm_loss": 0.4877, + "step": 5260 + }, + { + "ep_loss": 0.0, + "epoch": 4.06, + "learning_rate": 0.0002635, + "loss": 0.4785, + "mlm_loss": 0.4785, + "step": 5270 + }, + { + "ep_loss": 0.0, + "epoch": 4.07, + "learning_rate": 0.000264, + "loss": 0.4896, + "mlm_loss": 0.4896, + "step": 5280 + }, + { + "ep_loss": 0.0, + "epoch": 4.08, + "learning_rate": 0.00026450000000000003, + "loss": 0.4866, + "mlm_loss": 0.4866, + "step": 5290 + }, + { + "ep_loss": 0.0, + "epoch": 4.09, + "learning_rate": 0.00026500000000000004, + "loss": 0.4871, + "mlm_loss": 0.4871, + "step": 5300 + }, + { + "epoch": 4.09, + "eval_ep_loss": -2.3992135524749756, + "eval_loss": 0.4574643075466156, + "eval_mlm_loss": 0.4574643075466156, + "eval_runtime": 62.4777, + "eval_samples_per_second": 1118.303, + "eval_steps_per_second": 0.56, + "step": 5300 + }, + { + "ep_loss": 0.0, + "epoch": 4.1, + "learning_rate": 0.0002655, + "loss": 0.4836, + "mlm_loss": 0.4836, + "step": 5310 + }, + { + "ep_loss": 0.0, + "epoch": 4.1, + "learning_rate": 0.000266, + "loss": 0.487, + "mlm_loss": 0.487, + "step": 5320 + }, + { + "ep_loss": 0.0, + "epoch": 4.11, + "learning_rate": 0.0002665, + "loss": 0.4922, + "mlm_loss": 0.4922, + "step": 5330 + }, + { + "ep_loss": 0.0, + "epoch": 4.12, + "learning_rate": 0.00026700000000000004, + "loss": 0.4886, + "mlm_loss": 0.4886, + "step": 5340 + }, + { + "ep_loss": 0.0, + "epoch": 4.13, + "learning_rate": 0.0002675, + "loss": 0.4774, + "mlm_loss": 0.4774, + "step": 5350 + }, + { + "ep_loss": 0.0, + "epoch": 4.13, + "learning_rate": 0.000268, + "loss": 0.4833, + "mlm_loss": 0.4833, + "step": 5360 + }, + { + "ep_loss": 0.0, + "epoch": 4.14, + "learning_rate": 0.0002685, + "loss": 0.4878, + "mlm_loss": 0.4878, + "step": 5370 + }, + { + "ep_loss": 0.0, + "epoch": 4.15, + "learning_rate": 0.00026900000000000003, + "loss": 0.4778, + "mlm_loss": 0.4778, + "step": 5380 + }, + { + "ep_loss": 0.0, + "epoch": 4.16, + "learning_rate": 0.00026950000000000005, + "loss": 0.4833, + "mlm_loss": 0.4833, + "step": 5390 + }, + { + "ep_loss": 0.0, + "epoch": 4.17, + "learning_rate": 0.00027, + "loss": 0.4849, + "mlm_loss": 0.4849, + "step": 5400 + }, + { + "epoch": 4.17, + "eval_ep_loss": -2.320817708969116, + "eval_loss": 0.4534033536911011, + "eval_mlm_loss": 0.4534033536911011, + "eval_runtime": 61.6917, + "eval_samples_per_second": 1132.551, + "eval_steps_per_second": 0.567, + "step": 5400 + }, + { + "ep_loss": 0.0, + "epoch": 4.17, + "learning_rate": 0.0002705, + "loss": 0.4889, + "mlm_loss": 0.4889, + "step": 5410 + }, + { + "ep_loss": 0.0, + "epoch": 4.18, + "learning_rate": 0.00027100000000000003, + "loss": 0.4793, + "mlm_loss": 0.4793, + "step": 5420 + }, + { + "ep_loss": 0.0, + "epoch": 4.19, + "learning_rate": 0.00027150000000000004, + "loss": 0.476, + "mlm_loss": 0.476, + "step": 5430 + }, + { + "ep_loss": 0.0, + "epoch": 4.2, + "learning_rate": 0.00027200000000000005, + "loss": 0.4808, + "mlm_loss": 0.4808, + "step": 5440 + }, + { + "ep_loss": 0.0, + "epoch": 4.2, + "learning_rate": 0.0002725, + "loss": 0.475, + "mlm_loss": 0.475, + "step": 5450 + }, + { + "ep_loss": 0.0, + "epoch": 4.21, + "learning_rate": 0.000273, + "loss": 0.4763, + "mlm_loss": 0.4763, + "step": 5460 + }, + { + "ep_loss": 0.0, + "epoch": 4.22, + "learning_rate": 0.00027350000000000003, + "loss": 0.4796, + "mlm_loss": 0.4796, + "step": 5470 + }, + { + "ep_loss": 0.0, + "epoch": 4.23, + "learning_rate": 0.00027400000000000005, + "loss": 0.4828, + "mlm_loss": 0.4828, + "step": 5480 + }, + { + "ep_loss": 0.0, + "epoch": 4.23, + "learning_rate": 0.0002745, + "loss": 0.4799, + "mlm_loss": 0.4799, + "step": 5490 + }, + { + "ep_loss": 0.0, + "epoch": 4.24, + "learning_rate": 0.000275, + "loss": 0.4769, + "mlm_loss": 0.4769, + "step": 5500 + }, + { + "epoch": 4.24, + "eval_ep_loss": -2.215350389480591, + "eval_loss": 0.4512145221233368, + "eval_mlm_loss": 0.4512145221233368, + "eval_runtime": 60.8844, + "eval_samples_per_second": 1147.569, + "eval_steps_per_second": 0.575, + "step": 5500 + }, + { + "ep_loss": 0.0, + "epoch": 4.25, + "learning_rate": 0.00027550000000000003, + "loss": 0.4799, + "mlm_loss": 0.4799, + "step": 5510 + }, + { + "ep_loss": 0.0, + "epoch": 4.26, + "learning_rate": 0.00027600000000000004, + "loss": 0.4732, + "mlm_loss": 0.4732, + "step": 5520 + }, + { + "ep_loss": 0.0, + "epoch": 4.27, + "learning_rate": 0.00027650000000000005, + "loss": 0.4756, + "mlm_loss": 0.4756, + "step": 5530 + }, + { + "ep_loss": 0.0, + "epoch": 4.27, + "learning_rate": 0.000277, + "loss": 0.4749, + "mlm_loss": 0.4749, + "step": 5540 + }, + { + "ep_loss": 0.0, + "epoch": 4.28, + "learning_rate": 0.0002775, + "loss": 0.4656, + "mlm_loss": 0.4656, + "step": 5550 + }, + { + "ep_loss": 0.0, + "epoch": 4.29, + "learning_rate": 0.00027800000000000004, + "loss": 0.473, + "mlm_loss": 0.473, + "step": 5560 + }, + { + "ep_loss": 0.0, + "epoch": 4.3, + "learning_rate": 0.00027850000000000005, + "loss": 0.4819, + "mlm_loss": 0.4819, + "step": 5570 + }, + { + "ep_loss": 0.0, + "epoch": 4.3, + "learning_rate": 0.000279, + "loss": 0.4825, + "mlm_loss": 0.4825, + "step": 5580 + }, + { + "ep_loss": 0.0, + "epoch": 4.31, + "learning_rate": 0.0002795, + "loss": 0.4799, + "mlm_loss": 0.4799, + "step": 5590 + }, + { + "ep_loss": 0.0, + "epoch": 4.32, + "learning_rate": 0.00028000000000000003, + "loss": 0.4684, + "mlm_loss": 0.4684, + "step": 5600 + }, + { + "epoch": 4.32, + "eval_ep_loss": -2.7064967155456543, + "eval_loss": 0.44732606410980225, + "eval_mlm_loss": 0.44732606410980225, + "eval_runtime": 60.0339, + "eval_samples_per_second": 1163.826, + "eval_steps_per_second": 0.583, + "step": 5600 + }, + { + "ep_loss": 0.0, + "epoch": 4.33, + "learning_rate": 0.00028050000000000004, + "loss": 0.47, + "mlm_loss": 0.47, + "step": 5610 + }, + { + "ep_loss": 0.0, + "epoch": 4.33, + "learning_rate": 0.00028100000000000005, + "loss": 0.4698, + "mlm_loss": 0.4698, + "step": 5620 + }, + { + "ep_loss": 0.0, + "epoch": 4.34, + "learning_rate": 0.00028149999999999996, + "loss": 0.4758, + "mlm_loss": 0.4758, + "step": 5630 + }, + { + "ep_loss": 0.0, + "epoch": 4.35, + "learning_rate": 0.00028199999999999997, + "loss": 0.475, + "mlm_loss": 0.475, + "step": 5640 + }, + { + "ep_loss": 0.0, + "epoch": 4.36, + "learning_rate": 0.0002825, + "loss": 0.4805, + "mlm_loss": 0.4805, + "step": 5650 + }, + { + "ep_loss": 0.0, + "epoch": 4.37, + "learning_rate": 0.000283, + "loss": 0.4731, + "mlm_loss": 0.4731, + "step": 5660 + }, + { + "ep_loss": 0.0, + "epoch": 4.37, + "learning_rate": 0.0002835, + "loss": 0.4738, + "mlm_loss": 0.4738, + "step": 5670 + }, + { + "ep_loss": 0.0, + "epoch": 4.38, + "learning_rate": 0.00028399999999999996, + "loss": 0.4672, + "mlm_loss": 0.4672, + "step": 5680 + }, + { + "ep_loss": 0.0, + "epoch": 4.39, + "learning_rate": 0.0002845, + "loss": 0.4614, + "mlm_loss": 0.4614, + "step": 5690 + }, + { + "ep_loss": 0.0, + "epoch": 4.4, + "learning_rate": 0.000285, + "loss": 0.4717, + "mlm_loss": 0.4717, + "step": 5700 + }, + { + "epoch": 4.4, + "eval_ep_loss": -2.254852294921875, + "eval_loss": 0.44317254424095154, + "eval_mlm_loss": 0.44317254424095154, + "eval_runtime": 59.6572, + "eval_samples_per_second": 1171.174, + "eval_steps_per_second": 0.587, + "step": 5700 + }, + { + "ep_loss": 0.0, + "epoch": 4.4, + "learning_rate": 0.0002855, + "loss": 0.4665, + "mlm_loss": 0.4665, + "step": 5710 + }, + { + "ep_loss": 0.0, + "epoch": 4.41, + "learning_rate": 0.00028599999999999996, + "loss": 0.4725, + "mlm_loss": 0.4725, + "step": 5720 + }, + { + "ep_loss": 0.0, + "epoch": 4.42, + "learning_rate": 0.00028649999999999997, + "loss": 0.4688, + "mlm_loss": 0.4688, + "step": 5730 + }, + { + "ep_loss": 0.0, + "epoch": 4.43, + "learning_rate": 0.000287, + "loss": 0.4742, + "mlm_loss": 0.4742, + "step": 5740 + }, + { + "ep_loss": 0.0, + "epoch": 4.44, + "learning_rate": 0.0002875, + "loss": 0.4656, + "mlm_loss": 0.4656, + "step": 5750 + }, + { + "ep_loss": 0.0, + "epoch": 4.44, + "learning_rate": 0.000288, + "loss": 0.4758, + "mlm_loss": 0.4758, + "step": 5760 + }, + { + "ep_loss": 0.0, + "epoch": 4.45, + "learning_rate": 0.00028849999999999997, + "loss": 0.4705, + "mlm_loss": 0.4705, + "step": 5770 + }, + { + "ep_loss": 0.0, + "epoch": 4.46, + "learning_rate": 0.000289, + "loss": 0.4655, + "mlm_loss": 0.4655, + "step": 5780 + }, + { + "ep_loss": 0.0, + "epoch": 4.47, + "learning_rate": 0.0002895, + "loss": 0.4636, + "mlm_loss": 0.4636, + "step": 5790 + }, + { + "ep_loss": 0.0, + "epoch": 4.47, + "learning_rate": 0.00029, + "loss": 0.4646, + "mlm_loss": 0.4646, + "step": 5800 + }, + { + "epoch": 4.47, + "eval_ep_loss": -2.2536895275115967, + "eval_loss": 0.43992048501968384, + "eval_mlm_loss": 0.43992048501968384, + "eval_runtime": 62.7548, + "eval_samples_per_second": 1113.364, + "eval_steps_per_second": 0.558, + "step": 5800 + }, + { + "ep_loss": 0.0, + "epoch": 4.48, + "learning_rate": 0.00029049999999999996, + "loss": 0.4691, + "mlm_loss": 0.4691, + "step": 5810 + }, + { + "ep_loss": 0.0, + "epoch": 4.49, + "learning_rate": 0.00029099999999999997, + "loss": 0.4634, + "mlm_loss": 0.4634, + "step": 5820 + }, + { + "ep_loss": 0.0, + "epoch": 4.5, + "learning_rate": 0.0002915, + "loss": 0.46, + "mlm_loss": 0.46, + "step": 5830 + }, + { + "ep_loss": 0.0, + "epoch": 4.5, + "learning_rate": 0.000292, + "loss": 0.4648, + "mlm_loss": 0.4648, + "step": 5840 + }, + { + "ep_loss": 0.0, + "epoch": 4.51, + "learning_rate": 0.0002925, + "loss": 0.4624, + "mlm_loss": 0.4624, + "step": 5850 + }, + { + "ep_loss": 0.0, + "epoch": 4.52, + "learning_rate": 0.00029299999999999997, + "loss": 0.4649, + "mlm_loss": 0.4649, + "step": 5860 + }, + { + "ep_loss": 0.0, + "epoch": 4.53, + "learning_rate": 0.0002935, + "loss": 0.4626, + "mlm_loss": 0.4626, + "step": 5870 + }, + { + "ep_loss": 0.0, + "epoch": 4.54, + "learning_rate": 0.000294, + "loss": 0.4648, + "mlm_loss": 0.4648, + "step": 5880 + }, + { + "ep_loss": 0.0, + "epoch": 4.54, + "learning_rate": 0.0002945, + "loss": 0.4694, + "mlm_loss": 0.4694, + "step": 5890 + }, + { + "ep_loss": 0.0, + "epoch": 4.55, + "learning_rate": 0.000295, + "loss": 0.4699, + "mlm_loss": 0.4699, + "step": 5900 + }, + { + "epoch": 4.55, + "eval_ep_loss": -2.540950298309326, + "eval_loss": 0.4340103566646576, + "eval_mlm_loss": 0.4340103566646576, + "eval_runtime": 60.4098, + "eval_samples_per_second": 1156.584, + "eval_steps_per_second": 0.579, + "step": 5900 + }, + { + "ep_loss": 0.0, + "epoch": 4.56, + "learning_rate": 0.00029549999999999997, + "loss": 0.4615, + "mlm_loss": 0.4615, + "step": 5910 + }, + { + "ep_loss": 0.0, + "epoch": 4.57, + "learning_rate": 0.000296, + "loss": 0.4628, + "mlm_loss": 0.4628, + "step": 5920 + }, + { + "ep_loss": 0.0, + "epoch": 4.57, + "learning_rate": 0.0002965, + "loss": 0.4614, + "mlm_loss": 0.4614, + "step": 5930 + }, + { + "ep_loss": 0.0, + "epoch": 4.58, + "learning_rate": 0.000297, + "loss": 0.4584, + "mlm_loss": 0.4584, + "step": 5940 + }, + { + "ep_loss": 0.0, + "epoch": 4.59, + "learning_rate": 0.00029749999999999997, + "loss": 0.4624, + "mlm_loss": 0.4624, + "step": 5950 + }, + { + "ep_loss": 0.0, + "epoch": 4.6, + "learning_rate": 0.000298, + "loss": 0.4559, + "mlm_loss": 0.4559, + "step": 5960 + }, + { + "ep_loss": 0.0, + "epoch": 4.6, + "learning_rate": 0.0002985, + "loss": 0.4583, + "mlm_loss": 0.4583, + "step": 5970 + }, + { + "ep_loss": 0.0, + "epoch": 4.61, + "learning_rate": 0.000299, + "loss": 0.4641, + "mlm_loss": 0.4641, + "step": 5980 + }, + { + "ep_loss": 0.0, + "epoch": 4.62, + "learning_rate": 0.0002995, + "loss": 0.4639, + "mlm_loss": 0.4639, + "step": 5990 + }, + { + "ep_loss": 0.0, + "epoch": 4.63, + "learning_rate": 0.0003, + "loss": 0.4632, + "mlm_loss": 0.4632, + "step": 6000 + }, + { + "epoch": 4.63, + "eval_ep_loss": -2.32681941986084, + "eval_loss": 0.4312271177768707, + "eval_mlm_loss": 0.4312271177768707, + "eval_runtime": 60.5376, + "eval_samples_per_second": 1154.142, + "eval_steps_per_second": 0.578, + "step": 6000 + }, + { + "ep_loss": 0.0, + "epoch": 4.64, + "learning_rate": 0.0003005, + "loss": 0.4618, + "mlm_loss": 0.4618, + "step": 6010 + }, + { + "ep_loss": 0.0, + "epoch": 4.64, + "learning_rate": 0.000301, + "loss": 0.4553, + "mlm_loss": 0.4553, + "step": 6020 + }, + { + "ep_loss": 0.0, + "epoch": 4.65, + "learning_rate": 0.0003015, + "loss": 0.4649, + "mlm_loss": 0.4649, + "step": 6030 + }, + { + "ep_loss": 0.0, + "epoch": 4.66, + "learning_rate": 0.000302, + "loss": 0.4569, + "mlm_loss": 0.4569, + "step": 6040 + }, + { + "ep_loss": 0.0, + "epoch": 4.67, + "learning_rate": 0.0003025, + "loss": 0.4658, + "mlm_loss": 0.4658, + "step": 6050 + }, + { + "ep_loss": 0.0, + "epoch": 4.67, + "learning_rate": 0.000303, + "loss": 0.4571, + "mlm_loss": 0.4571, + "step": 6060 + }, + { + "ep_loss": 0.0, + "epoch": 4.68, + "learning_rate": 0.0003035, + "loss": 0.4574, + "mlm_loss": 0.4574, + "step": 6070 + }, + { + "ep_loss": 0.0, + "epoch": 4.69, + "learning_rate": 0.000304, + "loss": 0.4539, + "mlm_loss": 0.4539, + "step": 6080 + }, + { + "ep_loss": 0.0, + "epoch": 4.7, + "learning_rate": 0.0003045, + "loss": 0.4544, + "mlm_loss": 0.4544, + "step": 6090 + }, + { + "ep_loss": 0.0, + "epoch": 4.7, + "learning_rate": 0.000305, + "loss": 0.4532, + "mlm_loss": 0.4532, + "step": 6100 + }, + { + "epoch": 4.7, + "eval_ep_loss": -2.5871927738189697, + "eval_loss": 0.4311540722846985, + "eval_mlm_loss": 0.4311540722846985, + "eval_runtime": 62.3642, + "eval_samples_per_second": 1120.337, + "eval_steps_per_second": 0.561, + "step": 6100 + }, + { + "ep_loss": 0.0, + "epoch": 4.71, + "learning_rate": 0.0003055, + "loss": 0.4553, + "mlm_loss": 0.4553, + "step": 6110 + }, + { + "ep_loss": 0.0, + "epoch": 4.72, + "learning_rate": 0.000306, + "loss": 0.4557, + "mlm_loss": 0.4557, + "step": 6120 + }, + { + "ep_loss": 0.0, + "epoch": 4.73, + "learning_rate": 0.0003065, + "loss": 0.4568, + "mlm_loss": 0.4568, + "step": 6130 + }, + { + "ep_loss": 0.0, + "epoch": 4.74, + "learning_rate": 0.000307, + "loss": 0.4633, + "mlm_loss": 0.4633, + "step": 6140 + }, + { + "ep_loss": 0.0, + "epoch": 4.74, + "learning_rate": 0.0003075, + "loss": 0.4517, + "mlm_loss": 0.4517, + "step": 6150 + }, + { + "ep_loss": 0.0, + "epoch": 4.75, + "learning_rate": 0.000308, + "loss": 0.4539, + "mlm_loss": 0.4539, + "step": 6160 + }, + { + "ep_loss": 0.0, + "epoch": 4.76, + "learning_rate": 0.0003085, + "loss": 0.4606, + "mlm_loss": 0.4606, + "step": 6170 + }, + { + "ep_loss": 0.0, + "epoch": 4.77, + "learning_rate": 0.00030900000000000003, + "loss": 0.4502, + "mlm_loss": 0.4502, + "step": 6180 + }, + { + "ep_loss": 0.0, + "epoch": 4.77, + "learning_rate": 0.0003095, + "loss": 0.453, + "mlm_loss": 0.453, + "step": 6190 + }, + { + "ep_loss": 0.0, + "epoch": 4.78, + "learning_rate": 0.00031, + "loss": 0.4499, + "mlm_loss": 0.4499, + "step": 6200 + }, + { + "epoch": 4.78, + "eval_ep_loss": -2.451572895050049, + "eval_loss": 0.42733505368232727, + "eval_mlm_loss": 0.42733505368232727, + "eval_runtime": 59.9548, + "eval_samples_per_second": 1165.361, + "eval_steps_per_second": 0.584, + "step": 6200 + }, + { + "ep_loss": 0.0, + "epoch": 4.79, + "learning_rate": 0.0003105, + "loss": 0.4582, + "mlm_loss": 0.4582, + "step": 6210 + }, + { + "ep_loss": 0.0, + "epoch": 4.8, + "learning_rate": 0.000311, + "loss": 0.4587, + "mlm_loss": 0.4587, + "step": 6220 + }, + { + "ep_loss": 0.0, + "epoch": 4.81, + "learning_rate": 0.0003115, + "loss": 0.4557, + "mlm_loss": 0.4557, + "step": 6230 + }, + { + "ep_loss": 0.0, + "epoch": 4.81, + "learning_rate": 0.000312, + "loss": 0.4503, + "mlm_loss": 0.4503, + "step": 6240 + }, + { + "ep_loss": 0.0, + "epoch": 4.82, + "learning_rate": 0.0003125, + "loss": 0.4513, + "mlm_loss": 0.4513, + "step": 6250 + }, + { + "ep_loss": 0.0, + "epoch": 4.83, + "learning_rate": 0.000313, + "loss": 0.4532, + "mlm_loss": 0.4532, + "step": 6260 + }, + { + "ep_loss": 0.0, + "epoch": 4.84, + "learning_rate": 0.00031350000000000003, + "loss": 0.4535, + "mlm_loss": 0.4535, + "step": 6270 + }, + { + "ep_loss": 0.0, + "epoch": 4.84, + "learning_rate": 0.000314, + "loss": 0.453, + "mlm_loss": 0.453, + "step": 6280 + }, + { + "ep_loss": 0.0, + "epoch": 4.85, + "learning_rate": 0.0003145, + "loss": 0.4541, + "mlm_loss": 0.4541, + "step": 6290 + }, + { + "ep_loss": 0.0, + "epoch": 4.86, + "learning_rate": 0.000315, + "loss": 0.4532, + "mlm_loss": 0.4532, + "step": 6300 + }, + { + "epoch": 4.86, + "eval_ep_loss": -2.6052770614624023, + "eval_loss": 0.4238816797733307, + "eval_mlm_loss": 0.4238816797733307, + "eval_runtime": 60.9957, + "eval_samples_per_second": 1145.474, + "eval_steps_per_second": 0.574, + "step": 6300 + }, + { + "ep_loss": 0.0, + "epoch": 4.87, + "learning_rate": 0.0003155, + "loss": 0.4505, + "mlm_loss": 0.4505, + "step": 6310 + }, + { + "ep_loss": 0.0, + "epoch": 4.87, + "learning_rate": 0.000316, + "loss": 0.4476, + "mlm_loss": 0.4476, + "step": 6320 + }, + { + "ep_loss": 0.0, + "epoch": 4.88, + "learning_rate": 0.0003165, + "loss": 0.4495, + "mlm_loss": 0.4495, + "step": 6330 + }, + { + "ep_loss": 0.0, + "epoch": 4.89, + "learning_rate": 0.000317, + "loss": 0.4483, + "mlm_loss": 0.4483, + "step": 6340 + }, + { + "ep_loss": 0.0, + "epoch": 4.9, + "learning_rate": 0.0003175, + "loss": 0.4498, + "mlm_loss": 0.4498, + "step": 6350 + }, + { + "ep_loss": 0.0, + "epoch": 4.91, + "learning_rate": 0.00031800000000000003, + "loss": 0.4505, + "mlm_loss": 0.4505, + "step": 6360 + }, + { + "ep_loss": 0.0, + "epoch": 4.91, + "learning_rate": 0.0003185, + "loss": 0.4362, + "mlm_loss": 0.4362, + "step": 6370 + }, + { + "ep_loss": 0.0, + "epoch": 4.92, + "learning_rate": 0.000319, + "loss": 0.449, + "mlm_loss": 0.449, + "step": 6380 + }, + { + "ep_loss": 0.0, + "epoch": 4.93, + "learning_rate": 0.0003195, + "loss": 0.4558, + "mlm_loss": 0.4558, + "step": 6390 + }, + { + "ep_loss": 0.0, + "epoch": 4.94, + "learning_rate": 0.00032, + "loss": 0.4484, + "mlm_loss": 0.4484, + "step": 6400 + }, + { + "epoch": 4.94, + "eval_ep_loss": -2.464764356613159, + "eval_loss": 0.42165419459342957, + "eval_mlm_loss": 0.42165419459342957, + "eval_runtime": 60.6893, + "eval_samples_per_second": 1151.257, + "eval_steps_per_second": 0.577, + "step": 6400 + }, + { + "ep_loss": 0.0, + "epoch": 4.94, + "learning_rate": 0.00032050000000000004, + "loss": 0.4442, + "mlm_loss": 0.4442, + "step": 6410 + }, + { + "ep_loss": 0.0, + "epoch": 4.95, + "learning_rate": 0.000321, + "loss": 0.4494, + "mlm_loss": 0.4494, + "step": 6420 + }, + { + "ep_loss": 0.0, + "epoch": 4.96, + "learning_rate": 0.0003215, + "loss": 0.4492, + "mlm_loss": 0.4492, + "step": 6430 + }, + { + "ep_loss": 0.0, + "epoch": 4.97, + "learning_rate": 0.000322, + "loss": 0.4481, + "mlm_loss": 0.4481, + "step": 6440 + }, + { + "ep_loss": 0.0, + "epoch": 4.97, + "learning_rate": 0.00032250000000000003, + "loss": 0.4499, + "mlm_loss": 0.4499, + "step": 6450 + }, + { + "ep_loss": 0.0, + "epoch": 4.98, + "learning_rate": 0.000323, + "loss": 0.454, + "mlm_loss": 0.454, + "step": 6460 + }, + { + "ep_loss": 0.0, + "epoch": 4.99, + "learning_rate": 0.0003235, + "loss": 0.4466, + "mlm_loss": 0.4466, + "step": 6470 + }, + { + "ep_loss": 0.0, + "epoch": 5.0, + "learning_rate": 0.000324, + "loss": 0.4473, + "mlm_loss": 0.4473, + "step": 6480 + }, + { + "ep_loss": 0.0, + "epoch": 5.01, + "learning_rate": 0.00032450000000000003, + "loss": 0.445, + "mlm_loss": 0.445, + "step": 6490 + }, + { + "ep_loss": 0.0, + "epoch": 5.01, + "learning_rate": 0.00032500000000000004, + "loss": 0.439, + "mlm_loss": 0.439, + "step": 6500 + }, + { + "epoch": 5.01, + "eval_ep_loss": -2.375345230102539, + "eval_loss": 0.4176154136657715, + "eval_mlm_loss": 0.4176154136657715, + "eval_runtime": 61.6206, + "eval_samples_per_second": 1133.858, + "eval_steps_per_second": 0.568, + "step": 6500 + }, + { + "ep_loss": 0.0, + "epoch": 5.02, + "learning_rate": 0.0003255, + "loss": 0.4483, + "mlm_loss": 0.4483, + "step": 6510 + }, + { + "ep_loss": 0.0, + "epoch": 5.03, + "learning_rate": 0.000326, + "loss": 0.4435, + "mlm_loss": 0.4435, + "step": 6520 + }, + { + "ep_loss": 0.0, + "epoch": 5.04, + "learning_rate": 0.0003265, + "loss": 0.4455, + "mlm_loss": 0.4455, + "step": 6530 + }, + { + "ep_loss": 0.0, + "epoch": 5.04, + "learning_rate": 0.00032700000000000003, + "loss": 0.4464, + "mlm_loss": 0.4464, + "step": 6540 + }, + { + "ep_loss": 0.0, + "epoch": 5.05, + "learning_rate": 0.00032750000000000005, + "loss": 0.4434, + "mlm_loss": 0.4434, + "step": 6550 + }, + { + "ep_loss": 0.0, + "epoch": 5.06, + "learning_rate": 0.000328, + "loss": 0.4363, + "mlm_loss": 0.4363, + "step": 6560 + }, + { + "ep_loss": 0.0, + "epoch": 5.07, + "learning_rate": 0.0003285, + "loss": 0.4354, + "mlm_loss": 0.4354, + "step": 6570 + }, + { + "ep_loss": 0.0, + "epoch": 5.08, + "learning_rate": 0.00032900000000000003, + "loss": 0.4485, + "mlm_loss": 0.4485, + "step": 6580 + }, + { + "ep_loss": 0.0, + "epoch": 5.08, + "learning_rate": 0.00032950000000000004, + "loss": 0.445, + "mlm_loss": 0.445, + "step": 6590 + }, + { + "ep_loss": 0.0, + "epoch": 5.09, + "learning_rate": 0.00033, + "loss": 0.4461, + "mlm_loss": 0.4461, + "step": 6600 + }, + { + "epoch": 5.09, + "eval_ep_loss": -2.5544066429138184, + "eval_loss": 0.41740256547927856, + "eval_mlm_loss": 0.41740256547927856, + "eval_runtime": 59.4616, + "eval_samples_per_second": 1175.028, + "eval_steps_per_second": 0.589, + "step": 6600 + }, + { + "ep_loss": 0.0, + "epoch": 5.1, + "learning_rate": 0.0003305, + "loss": 0.4421, + "mlm_loss": 0.4421, + "step": 6610 + }, + { + "ep_loss": 0.0, + "epoch": 5.11, + "learning_rate": 0.000331, + "loss": 0.4406, + "mlm_loss": 0.4406, + "step": 6620 + }, + { + "ep_loss": 0.0, + "epoch": 5.11, + "learning_rate": 0.00033150000000000003, + "loss": 0.4488, + "mlm_loss": 0.4488, + "step": 6630 + }, + { + "ep_loss": 0.0, + "epoch": 5.12, + "learning_rate": 0.00033200000000000005, + "loss": 0.436, + "mlm_loss": 0.436, + "step": 6640 + }, + { + "ep_loss": 0.0, + "epoch": 5.13, + "learning_rate": 0.0003325, + "loss": 0.4398, + "mlm_loss": 0.4398, + "step": 6650 + }, + { + "ep_loss": 0.0, + "epoch": 5.14, + "learning_rate": 0.000333, + "loss": 0.4381, + "mlm_loss": 0.4381, + "step": 6660 + }, + { + "ep_loss": 0.0, + "epoch": 5.14, + "learning_rate": 0.00033350000000000003, + "loss": 0.4479, + "mlm_loss": 0.4479, + "step": 6670 + }, + { + "ep_loss": 0.0, + "epoch": 5.15, + "learning_rate": 0.00033400000000000004, + "loss": 0.4409, + "mlm_loss": 0.4409, + "step": 6680 + }, + { + "ep_loss": 0.0, + "epoch": 5.16, + "learning_rate": 0.00033450000000000005, + "loss": 0.4395, + "mlm_loss": 0.4395, + "step": 6690 + }, + { + "ep_loss": 0.0, + "epoch": 5.17, + "learning_rate": 0.000335, + "loss": 0.4415, + "mlm_loss": 0.4415, + "step": 6700 + }, + { + "epoch": 5.17, + "eval_ep_loss": -2.5074503421783447, + "eval_loss": 0.41386091709136963, + "eval_mlm_loss": 0.41386091709136963, + "eval_runtime": 61.5941, + "eval_samples_per_second": 1134.346, + "eval_steps_per_second": 0.568, + "step": 6700 + }, + { + "ep_loss": 0.0, + "epoch": 5.18, + "learning_rate": 0.0003355, + "loss": 0.443, + "mlm_loss": 0.443, + "step": 6710 + }, + { + "ep_loss": 0.0, + "epoch": 5.18, + "learning_rate": 0.00033600000000000004, + "loss": 0.4355, + "mlm_loss": 0.4355, + "step": 6720 + }, + { + "ep_loss": 0.0, + "epoch": 5.19, + "learning_rate": 0.00033650000000000005, + "loss": 0.4301, + "mlm_loss": 0.4301, + "step": 6730 + }, + { + "ep_loss": 0.0, + "epoch": 5.2, + "learning_rate": 0.000337, + "loss": 0.4414, + "mlm_loss": 0.4414, + "step": 6740 + }, + { + "ep_loss": 0.0, + "epoch": 5.21, + "learning_rate": 0.0003375, + "loss": 0.4368, + "mlm_loss": 0.4368, + "step": 6750 + }, + { + "ep_loss": 0.0, + "epoch": 5.21, + "learning_rate": 0.00033800000000000003, + "loss": 0.4378, + "mlm_loss": 0.4378, + "step": 6760 + }, + { + "ep_loss": 0.0, + "epoch": 5.22, + "learning_rate": 0.00033850000000000004, + "loss": 0.4402, + "mlm_loss": 0.4402, + "step": 6770 + }, + { + "ep_loss": 0.0, + "epoch": 5.23, + "learning_rate": 0.00033900000000000005, + "loss": 0.439, + "mlm_loss": 0.439, + "step": 6780 + }, + { + "ep_loss": 0.0, + "epoch": 5.24, + "learning_rate": 0.0003395, + "loss": 0.4454, + "mlm_loss": 0.4454, + "step": 6790 + }, + { + "ep_loss": 0.0, + "epoch": 5.24, + "learning_rate": 0.00034, + "loss": 0.4379, + "mlm_loss": 0.4379, + "step": 6800 + }, + { + "epoch": 5.24, + "eval_ep_loss": -2.6114962100982666, + "eval_loss": 0.4116796851158142, + "eval_mlm_loss": 0.4116796851158142, + "eval_runtime": 61.6861, + "eval_samples_per_second": 1132.654, + "eval_steps_per_second": 0.567, + "step": 6800 + }, + { + "ep_loss": 0.0, + "epoch": 5.25, + "learning_rate": 0.00034050000000000004, + "loss": 0.4407, + "mlm_loss": 0.4407, + "step": 6810 + }, + { + "ep_loss": 0.0, + "epoch": 5.26, + "learning_rate": 0.00034100000000000005, + "loss": 0.4312, + "mlm_loss": 0.4312, + "step": 6820 + }, + { + "ep_loss": 0.0, + "epoch": 5.27, + "learning_rate": 0.0003415, + "loss": 0.4327, + "mlm_loss": 0.4327, + "step": 6830 + }, + { + "ep_loss": 0.0, + "epoch": 5.28, + "learning_rate": 0.000342, + "loss": 0.4405, + "mlm_loss": 0.4405, + "step": 6840 + }, + { + "ep_loss": 0.0, + "epoch": 5.28, + "learning_rate": 0.00034250000000000003, + "loss": 0.4395, + "mlm_loss": 0.4395, + "step": 6850 + }, + { + "ep_loss": 0.0, + "epoch": 5.29, + "learning_rate": 0.00034300000000000004, + "loss": 0.4398, + "mlm_loss": 0.4398, + "step": 6860 + }, + { + "ep_loss": 0.0, + "epoch": 5.3, + "learning_rate": 0.00034350000000000006, + "loss": 0.4372, + "mlm_loss": 0.4372, + "step": 6870 + }, + { + "ep_loss": 0.0, + "epoch": 5.31, + "learning_rate": 0.00034399999999999996, + "loss": 0.4438, + "mlm_loss": 0.4438, + "step": 6880 + }, + { + "ep_loss": 0.0, + "epoch": 5.31, + "learning_rate": 0.00034449999999999997, + "loss": 0.4356, + "mlm_loss": 0.4356, + "step": 6890 + }, + { + "ep_loss": 0.0, + "epoch": 5.32, + "learning_rate": 0.000345, + "loss": 0.4357, + "mlm_loss": 0.4357, + "step": 6900 + }, + { + "epoch": 5.32, + "eval_ep_loss": -2.2833127975463867, + "eval_loss": 0.40905898809432983, + "eval_mlm_loss": 0.40905898809432983, + "eval_runtime": 61.29, + "eval_samples_per_second": 1139.974, + "eval_steps_per_second": 0.571, + "step": 6900 + }, + { + "ep_loss": 0.0, + "epoch": 5.33, + "learning_rate": 0.0003455, + "loss": 0.4357, + "mlm_loss": 0.4357, + "step": 6910 + }, + { + "ep_loss": 0.0, + "epoch": 5.34, + "learning_rate": 0.000346, + "loss": 0.4243, + "mlm_loss": 0.4243, + "step": 6920 + }, + { + "ep_loss": 0.0, + "epoch": 5.35, + "learning_rate": 0.00034649999999999997, + "loss": 0.4404, + "mlm_loss": 0.4404, + "step": 6930 + }, + { + "ep_loss": 0.0, + "epoch": 5.35, + "learning_rate": 0.000347, + "loss": 0.4319, + "mlm_loss": 0.4319, + "step": 6940 + }, + { + "ep_loss": 0.0, + "epoch": 5.36, + "learning_rate": 0.0003475, + "loss": 0.4362, + "mlm_loss": 0.4362, + "step": 6950 + }, + { + "ep_loss": 0.0, + "epoch": 5.37, + "learning_rate": 0.000348, + "loss": 0.4355, + "mlm_loss": 0.4355, + "step": 6960 + }, + { + "ep_loss": 0.0, + "epoch": 5.38, + "learning_rate": 0.00034849999999999996, + "loss": 0.43, + "mlm_loss": 0.43, + "step": 6970 + }, + { + "ep_loss": 0.0, + "epoch": 5.38, + "learning_rate": 0.00034899999999999997, + "loss": 0.4342, + "mlm_loss": 0.4342, + "step": 6980 + }, + { + "ep_loss": 0.0, + "epoch": 5.39, + "learning_rate": 0.0003495, + "loss": 0.4345, + "mlm_loss": 0.4345, + "step": 6990 + }, + { + "ep_loss": 0.0, + "epoch": 5.4, + "learning_rate": 0.00035, + "loss": 0.4324, + "mlm_loss": 0.4324, + "step": 7000 + }, + { + "epoch": 5.4, + "eval_ep_loss": -2.64093279838562, + "eval_loss": 0.4105168282985687, + "eval_mlm_loss": 0.4105168282985687, + "eval_runtime": 60.7916, + "eval_samples_per_second": 1149.32, + "eval_steps_per_second": 0.576, + "step": 7000 + }, + { + "ep_loss": 0.0, + "epoch": 5.41, + "learning_rate": 0.0003505, + "loss": 0.4319, + "mlm_loss": 0.4319, + "step": 7010 + }, + { + "ep_loss": 0.0, + "epoch": 5.41, + "learning_rate": 0.00035099999999999997, + "loss": 0.4372, + "mlm_loss": 0.4372, + "step": 7020 + }, + { + "ep_loss": 0.0, + "epoch": 5.42, + "learning_rate": 0.0003515, + "loss": 0.4342, + "mlm_loss": 0.4342, + "step": 7030 + }, + { + "ep_loss": 0.0, + "epoch": 5.43, + "learning_rate": 0.000352, + "loss": 0.4308, + "mlm_loss": 0.4308, + "step": 7040 + }, + { + "ep_loss": 0.0, + "epoch": 5.44, + "learning_rate": 0.0003525, + "loss": 0.4373, + "mlm_loss": 0.4373, + "step": 7050 + }, + { + "ep_loss": 0.0, + "epoch": 5.45, + "learning_rate": 0.00035299999999999996, + "loss": 0.4271, + "mlm_loss": 0.4271, + "step": 7060 + }, + { + "ep_loss": 0.0, + "epoch": 5.45, + "learning_rate": 0.0003535, + "loss": 0.4355, + "mlm_loss": 0.4355, + "step": 7070 + }, + { + "ep_loss": 0.0, + "epoch": 5.46, + "learning_rate": 0.000354, + "loss": 0.4359, + "mlm_loss": 0.4359, + "step": 7080 + }, + { + "ep_loss": 0.0, + "epoch": 5.47, + "learning_rate": 0.0003545, + "loss": 0.436, + "mlm_loss": 0.436, + "step": 7090 + }, + { + "ep_loss": 0.0, + "epoch": 5.48, + "learning_rate": 0.000355, + "loss": 0.4362, + "mlm_loss": 0.4362, + "step": 7100 + }, + { + "epoch": 5.48, + "eval_ep_loss": -2.3400020599365234, + "eval_loss": 0.4073105752468109, + "eval_mlm_loss": 0.4073105752468109, + "eval_runtime": 63.2925, + "eval_samples_per_second": 1103.907, + "eval_steps_per_second": 0.553, + "step": 7100 + }, + { + "ep_loss": 0.0, + "epoch": 5.48, + "learning_rate": 0.00035549999999999997, + "loss": 0.4321, + "mlm_loss": 0.4321, + "step": 7110 + }, + { + "ep_loss": 0.0, + "epoch": 5.49, + "learning_rate": 0.000356, + "loss": 0.4343, + "mlm_loss": 0.4343, + "step": 7120 + }, + { + "ep_loss": 0.0, + "epoch": 5.5, + "learning_rate": 0.0003565, + "loss": 0.4307, + "mlm_loss": 0.4307, + "step": 7130 + }, + { + "ep_loss": 0.0, + "epoch": 5.51, + "learning_rate": 0.000357, + "loss": 0.4284, + "mlm_loss": 0.4284, + "step": 7140 + }, + { + "ep_loss": 0.0, + "epoch": 5.51, + "learning_rate": 0.0003575, + "loss": 0.4262, + "mlm_loss": 0.4262, + "step": 7150 + }, + { + "ep_loss": 0.0, + "epoch": 5.52, + "learning_rate": 0.000358, + "loss": 0.4248, + "mlm_loss": 0.4248, + "step": 7160 + }, + { + "ep_loss": 0.0, + "epoch": 5.53, + "learning_rate": 0.0003585, + "loss": 0.4419, + "mlm_loss": 0.4419, + "step": 7170 + }, + { + "ep_loss": 0.0, + "epoch": 5.54, + "learning_rate": 0.000359, + "loss": 0.4267, + "mlm_loss": 0.4267, + "step": 7180 + }, + { + "ep_loss": 0.0, + "epoch": 5.55, + "learning_rate": 0.0003595, + "loss": 0.4337, + "mlm_loss": 0.4337, + "step": 7190 + }, + { + "ep_loss": 0.0, + "epoch": 5.55, + "learning_rate": 0.00035999999999999997, + "loss": 0.4264, + "mlm_loss": 0.4264, + "step": 7200 + }, + { + "epoch": 5.55, + "eval_ep_loss": -2.597167491912842, + "eval_loss": 0.40466931462287903, + "eval_mlm_loss": 0.40466931462287903, + "eval_runtime": 61.8389, + "eval_samples_per_second": 1129.855, + "eval_steps_per_second": 0.566, + "step": 7200 + }, + { + "ep_loss": 0.0, + "epoch": 5.56, + "learning_rate": 0.0003605, + "loss": 0.4232, + "mlm_loss": 0.4232, + "step": 7210 + }, + { + "ep_loss": 0.0, + "epoch": 5.57, + "learning_rate": 0.000361, + "loss": 0.4253, + "mlm_loss": 0.4253, + "step": 7220 + }, + { + "ep_loss": 0.0, + "epoch": 5.58, + "learning_rate": 0.0003615, + "loss": 0.437, + "mlm_loss": 0.437, + "step": 7230 + }, + { + "ep_loss": 0.0, + "epoch": 5.58, + "learning_rate": 0.000362, + "loss": 0.4319, + "mlm_loss": 0.4319, + "step": 7240 + }, + { + "ep_loss": 0.0, + "epoch": 5.59, + "learning_rate": 0.0003625, + "loss": 0.4257, + "mlm_loss": 0.4257, + "step": 7250 + }, + { + "ep_loss": 0.0, + "epoch": 5.6, + "learning_rate": 0.000363, + "loss": 0.4318, + "mlm_loss": 0.4318, + "step": 7260 + }, + { + "ep_loss": 0.0, + "epoch": 5.61, + "learning_rate": 0.0003635, + "loss": 0.4308, + "mlm_loss": 0.4308, + "step": 7270 + }, + { + "ep_loss": 0.0, + "epoch": 5.62, + "learning_rate": 0.000364, + "loss": 0.4304, + "mlm_loss": 0.4304, + "step": 7280 + }, + { + "ep_loss": 0.0, + "epoch": 5.62, + "learning_rate": 0.0003645, + "loss": 0.424, + "mlm_loss": 0.424, + "step": 7290 + }, + { + "ep_loss": 0.0, + "epoch": 5.63, + "learning_rate": 0.000365, + "loss": 0.4315, + "mlm_loss": 0.4315, + "step": 7300 + }, + { + "epoch": 5.63, + "eval_ep_loss": -2.466102361679077, + "eval_loss": 0.40299102663993835, + "eval_mlm_loss": 0.40299102663993835, + "eval_runtime": 59.5652, + "eval_samples_per_second": 1172.984, + "eval_steps_per_second": 0.588, + "step": 7300 + }, + { + "ep_loss": 0.0, + "epoch": 5.64, + "learning_rate": 0.0003655, + "loss": 0.4305, + "mlm_loss": 0.4305, + "step": 7310 + }, + { + "ep_loss": 0.0, + "epoch": 5.65, + "learning_rate": 0.000366, + "loss": 0.4215, + "mlm_loss": 0.4215, + "step": 7320 + }, + { + "ep_loss": 0.0, + "epoch": 5.65, + "learning_rate": 0.0003665, + "loss": 0.4314, + "mlm_loss": 0.4314, + "step": 7330 + }, + { + "ep_loss": 0.0, + "epoch": 5.66, + "learning_rate": 0.000367, + "loss": 0.4334, + "mlm_loss": 0.4334, + "step": 7340 + }, + { + "ep_loss": 0.0, + "epoch": 5.67, + "learning_rate": 0.0003675, + "loss": 0.4294, + "mlm_loss": 0.4294, + "step": 7350 + }, + { + "ep_loss": 0.0, + "epoch": 5.68, + "learning_rate": 0.000368, + "loss": 0.4212, + "mlm_loss": 0.4212, + "step": 7360 + }, + { + "ep_loss": 0.0, + "epoch": 5.68, + "learning_rate": 0.0003685, + "loss": 0.4282, + "mlm_loss": 0.4282, + "step": 7370 + }, + { + "ep_loss": 0.0, + "epoch": 5.69, + "learning_rate": 0.000369, + "loss": 0.4249, + "mlm_loss": 0.4249, + "step": 7380 + }, + { + "ep_loss": 0.0, + "epoch": 5.7, + "learning_rate": 0.0003695, + "loss": 0.4236, + "mlm_loss": 0.4236, + "step": 7390 + }, + { + "ep_loss": 0.0, + "epoch": 5.71, + "learning_rate": 0.00037, + "loss": 0.4287, + "mlm_loss": 0.4287, + "step": 7400 + }, + { + "epoch": 5.71, + "eval_ep_loss": -2.350245475769043, + "eval_loss": 0.401913046836853, + "eval_mlm_loss": 0.401913046836853, + "eval_runtime": 60.065, + "eval_samples_per_second": 1163.223, + "eval_steps_per_second": 0.583, + "step": 7400 + }, + { + "ep_loss": 0.0, + "epoch": 5.72, + "learning_rate": 0.0003705, + "loss": 0.4203, + "mlm_loss": 0.4203, + "step": 7410 + }, + { + "ep_loss": 0.0, + "epoch": 5.72, + "learning_rate": 0.000371, + "loss": 0.4252, + "mlm_loss": 0.4252, + "step": 7420 + }, + { + "ep_loss": 0.0, + "epoch": 5.73, + "learning_rate": 0.00037150000000000003, + "loss": 0.4264, + "mlm_loss": 0.4264, + "step": 7430 + }, + { + "ep_loss": 0.0, + "epoch": 5.74, + "learning_rate": 0.000372, + "loss": 0.4225, + "mlm_loss": 0.4225, + "step": 7440 + }, + { + "ep_loss": 0.0, + "epoch": 5.75, + "learning_rate": 0.0003725, + "loss": 0.4262, + "mlm_loss": 0.4262, + "step": 7450 + }, + { + "ep_loss": 0.0, + "epoch": 5.75, + "learning_rate": 0.000373, + "loss": 0.4231, + "mlm_loss": 0.4231, + "step": 7460 + }, + { + "ep_loss": 0.0, + "epoch": 5.76, + "learning_rate": 0.0003735, + "loss": 0.4292, + "mlm_loss": 0.4292, + "step": 7470 + }, + { + "ep_loss": 0.0, + "epoch": 5.77, + "learning_rate": 0.000374, + "loss": 0.4239, + "mlm_loss": 0.4239, + "step": 7480 + }, + { + "ep_loss": 0.0, + "epoch": 5.78, + "learning_rate": 0.0003745, + "loss": 0.4209, + "mlm_loss": 0.4209, + "step": 7490 + }, + { + "ep_loss": 0.0, + "epoch": 5.78, + "learning_rate": 0.000375, + "loss": 0.4198, + "mlm_loss": 0.4198, + "step": 7500 + }, + { + "epoch": 5.78, + "eval_ep_loss": -2.3534927368164062, + "eval_loss": 0.3982246518135071, + "eval_mlm_loss": 0.3982246518135071, + "eval_runtime": 63.0218, + "eval_samples_per_second": 1108.649, + "eval_steps_per_second": 0.555, + "step": 7500 + }, + { + "ep_loss": 0.0, + "epoch": 5.79, + "learning_rate": 0.0003755, + "loss": 0.4212, + "mlm_loss": 0.4212, + "step": 7510 + }, + { + "ep_loss": 0.0, + "epoch": 5.8, + "learning_rate": 0.00037600000000000003, + "loss": 0.4141, + "mlm_loss": 0.4141, + "step": 7520 + }, + { + "ep_loss": 0.0, + "epoch": 5.81, + "learning_rate": 0.0003765, + "loss": 0.4266, + "mlm_loss": 0.4266, + "step": 7530 + }, + { + "ep_loss": 0.0, + "epoch": 5.82, + "learning_rate": 0.000377, + "loss": 0.4255, + "mlm_loss": 0.4255, + "step": 7540 + }, + { + "ep_loss": 0.0, + "epoch": 5.82, + "learning_rate": 0.0003775, + "loss": 0.4194, + "mlm_loss": 0.4194, + "step": 7550 + }, + { + "ep_loss": 0.0, + "epoch": 5.83, + "learning_rate": 0.000378, + "loss": 0.428, + "mlm_loss": 0.428, + "step": 7560 + }, + { + "ep_loss": 0.0, + "epoch": 5.84, + "learning_rate": 0.0003785, + "loss": 0.4221, + "mlm_loss": 0.4221, + "step": 7570 + }, + { + "ep_loss": 0.0, + "epoch": 5.85, + "learning_rate": 0.000379, + "loss": 0.4253, + "mlm_loss": 0.4253, + "step": 7580 + }, + { + "ep_loss": 0.0, + "epoch": 5.85, + "learning_rate": 0.0003795, + "loss": 0.4174, + "mlm_loss": 0.4174, + "step": 7590 + }, + { + "ep_loss": 0.0, + "epoch": 5.86, + "learning_rate": 0.00038, + "loss": 0.4204, + "mlm_loss": 0.4204, + "step": 7600 + }, + { + "epoch": 5.86, + "eval_ep_loss": -2.3331265449523926, + "eval_loss": 0.3971656858921051, + "eval_mlm_loss": 0.3971656858921051, + "eval_runtime": 61.175, + "eval_samples_per_second": 1142.117, + "eval_steps_per_second": 0.572, + "step": 7600 + }, + { + "ep_loss": 0.0, + "epoch": 5.87, + "learning_rate": 0.00038050000000000003, + "loss": 0.4127, + "mlm_loss": 0.4127, + "step": 7610 + }, + { + "ep_loss": 0.0, + "epoch": 5.88, + "learning_rate": 0.000381, + "loss": 0.4226, + "mlm_loss": 0.4226, + "step": 7620 + }, + { + "ep_loss": 0.0, + "epoch": 5.89, + "learning_rate": 0.0003815, + "loss": 0.418, + "mlm_loss": 0.418, + "step": 7630 + }, + { + "ep_loss": 0.0, + "epoch": 5.89, + "learning_rate": 0.000382, + "loss": 0.4291, + "mlm_loss": 0.4291, + "step": 7640 + }, + { + "ep_loss": 0.0, + "epoch": 5.9, + "learning_rate": 0.00038250000000000003, + "loss": 0.4155, + "mlm_loss": 0.4155, + "step": 7650 + }, + { + "ep_loss": 0.0, + "epoch": 5.91, + "learning_rate": 0.00038300000000000004, + "loss": 0.4213, + "mlm_loss": 0.4213, + "step": 7660 + }, + { + "ep_loss": 0.0, + "epoch": 5.92, + "learning_rate": 0.0003835, + "loss": 0.4273, + "mlm_loss": 0.4273, + "step": 7670 + }, + { + "ep_loss": 0.0, + "epoch": 5.92, + "learning_rate": 0.000384, + "loss": 0.4252, + "mlm_loss": 0.4252, + "step": 7680 + }, + { + "ep_loss": 0.0, + "epoch": 5.93, + "learning_rate": 0.0003845, + "loss": 0.4201, + "mlm_loss": 0.4201, + "step": 7690 + }, + { + "ep_loss": 0.0, + "epoch": 5.94, + "learning_rate": 0.00038500000000000003, + "loss": 0.4198, + "mlm_loss": 0.4198, + "step": 7700 + }, + { + "epoch": 5.94, + "eval_ep_loss": -2.514362335205078, + "eval_loss": 0.39368629455566406, + "eval_mlm_loss": 0.39368629455566406, + "eval_runtime": 60.7647, + "eval_samples_per_second": 1149.828, + "eval_steps_per_second": 0.576, + "step": 7700 + }, + { + "ep_loss": 0.0, + "epoch": 5.95, + "learning_rate": 0.0003855, + "loss": 0.4167, + "mlm_loss": 0.4167, + "step": 7710 + }, + { + "ep_loss": 0.0, + "epoch": 5.95, + "learning_rate": 0.000386, + "loss": 0.4235, + "mlm_loss": 0.4235, + "step": 7720 + }, + { + "ep_loss": 0.0, + "epoch": 5.96, + "learning_rate": 0.0003865, + "loss": 0.4233, + "mlm_loss": 0.4233, + "step": 7730 + }, + { + "ep_loss": 0.0, + "epoch": 5.97, + "learning_rate": 0.00038700000000000003, + "loss": 0.4227, + "mlm_loss": 0.4227, + "step": 7740 + }, + { + "ep_loss": 0.0, + "epoch": 5.98, + "learning_rate": 0.00038750000000000004, + "loss": 0.4194, + "mlm_loss": 0.4194, + "step": 7750 + }, + { + "ep_loss": 0.0, + "epoch": 5.99, + "learning_rate": 0.000388, + "loss": 0.4194, + "mlm_loss": 0.4194, + "step": 7760 + }, + { + "ep_loss": 0.0, + "epoch": 5.99, + "learning_rate": 0.0003885, + "loss": 0.4171, + "mlm_loss": 0.4171, + "step": 7770 + }, + { + "ep_loss": 0.0, + "epoch": 6.0, + "learning_rate": 0.000389, + "loss": 0.4197, + "mlm_loss": 0.4197, + "step": 7780 + }, + { + "ep_loss": 0.0, + "epoch": 6.01, + "learning_rate": 0.00038950000000000003, + "loss": 0.4138, + "mlm_loss": 0.4138, + "step": 7790 + }, + { + "ep_loss": 0.0, + "epoch": 6.02, + "learning_rate": 0.00039000000000000005, + "loss": 0.4149, + "mlm_loss": 0.4149, + "step": 7800 + }, + { + "epoch": 6.02, + "eval_ep_loss": -2.336341619491577, + "eval_loss": 0.3958059251308441, + "eval_mlm_loss": 0.3958059251308441, + "eval_runtime": 59.0112, + "eval_samples_per_second": 1183.995, + "eval_steps_per_second": 0.593, + "step": 7800 + }, + { + "ep_loss": 0.0, + "epoch": 6.02, + "learning_rate": 0.0003905, + "loss": 0.4197, + "mlm_loss": 0.4197, + "step": 7810 + }, + { + "ep_loss": 0.0, + "epoch": 6.03, + "learning_rate": 0.000391, + "loss": 0.4135, + "mlm_loss": 0.4135, + "step": 7820 + }, + { + "ep_loss": 0.0, + "epoch": 6.04, + "learning_rate": 0.00039150000000000003, + "loss": 0.4207, + "mlm_loss": 0.4207, + "step": 7830 + }, + { + "ep_loss": 0.0, + "epoch": 6.05, + "learning_rate": 0.00039200000000000004, + "loss": 0.419, + "mlm_loss": 0.419, + "step": 7840 + }, + { + "ep_loss": 0.0, + "epoch": 6.05, + "learning_rate": 0.0003925, + "loss": 0.4169, + "mlm_loss": 0.4169, + "step": 7850 + }, + { + "ep_loss": 0.0, + "epoch": 6.06, + "learning_rate": 0.000393, + "loss": 0.4183, + "mlm_loss": 0.4183, + "step": 7860 + }, + { + "ep_loss": 0.0, + "epoch": 6.07, + "learning_rate": 0.0003935, + "loss": 0.4127, + "mlm_loss": 0.4127, + "step": 7870 + }, + { + "ep_loss": 0.0, + "epoch": 6.08, + "learning_rate": 0.00039400000000000004, + "loss": 0.421, + "mlm_loss": 0.421, + "step": 7880 + }, + { + "ep_loss": 0.0, + "epoch": 6.09, + "learning_rate": 0.00039450000000000005, + "loss": 0.4135, + "mlm_loss": 0.4135, + "step": 7890 + }, + { + "ep_loss": 0.0, + "epoch": 6.09, + "learning_rate": 0.000395, + "loss": 0.4174, + "mlm_loss": 0.4174, + "step": 7900 + }, + { + "epoch": 6.09, + "eval_ep_loss": -2.3622541427612305, + "eval_loss": 0.3937676250934601, + "eval_mlm_loss": 0.3937676250934601, + "eval_runtime": 60.7463, + "eval_samples_per_second": 1150.176, + "eval_steps_per_second": 0.576, + "step": 7900 + }, + { + "ep_loss": 0.0, + "epoch": 6.1, + "learning_rate": 0.0003955, + "loss": 0.4256, + "mlm_loss": 0.4256, + "step": 7910 + }, + { + "ep_loss": 0.0, + "epoch": 6.11, + "learning_rate": 0.00039600000000000003, + "loss": 0.4217, + "mlm_loss": 0.4217, + "step": 7920 + }, + { + "ep_loss": 0.0, + "epoch": 6.12, + "learning_rate": 0.00039650000000000004, + "loss": 0.4239, + "mlm_loss": 0.4239, + "step": 7930 + }, + { + "ep_loss": 0.0, + "epoch": 6.12, + "learning_rate": 0.00039700000000000005, + "loss": 0.4183, + "mlm_loss": 0.4183, + "step": 7940 + }, + { + "ep_loss": 0.0, + "epoch": 6.13, + "learning_rate": 0.0003975, + "loss": 0.4172, + "mlm_loss": 0.4172, + "step": 7950 + }, + { + "ep_loss": 0.0, + "epoch": 6.14, + "learning_rate": 0.000398, + "loss": 0.42, + "mlm_loss": 0.42, + "step": 7960 + }, + { + "ep_loss": 0.0, + "epoch": 6.15, + "learning_rate": 0.00039850000000000004, + "loss": 0.4149, + "mlm_loss": 0.4149, + "step": 7970 + }, + { + "ep_loss": 0.0, + "epoch": 6.16, + "learning_rate": 0.00039900000000000005, + "loss": 0.4128, + "mlm_loss": 0.4128, + "step": 7980 + }, + { + "ep_loss": 0.0, + "epoch": 6.16, + "learning_rate": 0.0003995, + "loss": 0.4153, + "mlm_loss": 0.4153, + "step": 7990 + }, + { + "ep_loss": 0.0, + "epoch": 6.17, + "learning_rate": 0.0004, + "loss": 0.418, + "mlm_loss": 0.418, + "step": 8000 + }, + { + "epoch": 6.17, + "eval_ep_loss": -2.42641282081604, + "eval_loss": 0.3915087580680847, + "eval_mlm_loss": 0.3915087580680847, + "eval_runtime": 60.1518, + "eval_samples_per_second": 1161.545, + "eval_steps_per_second": 0.582, + "step": 8000 + }, + { + "ep_loss": 0.0, + "epoch": 6.18, + "learning_rate": 0.00040050000000000003, + "loss": 0.4161, + "mlm_loss": 0.4161, + "step": 8010 + }, + { + "ep_loss": 0.0, + "epoch": 6.19, + "learning_rate": 0.00040100000000000004, + "loss": 0.4158, + "mlm_loss": 0.4158, + "step": 8020 + }, + { + "ep_loss": 0.0, + "epoch": 6.19, + "learning_rate": 0.00040150000000000006, + "loss": 0.4224, + "mlm_loss": 0.4224, + "step": 8030 + }, + { + "ep_loss": 0.0, + "epoch": 6.2, + "learning_rate": 0.000402, + "loss": 0.4203, + "mlm_loss": 0.4203, + "step": 8040 + }, + { + "ep_loss": 0.0, + "epoch": 6.21, + "learning_rate": 0.0004025, + "loss": 0.4134, + "mlm_loss": 0.4134, + "step": 8050 + }, + { + "ep_loss": 0.0, + "epoch": 6.22, + "learning_rate": 0.00040300000000000004, + "loss": 0.4153, + "mlm_loss": 0.4153, + "step": 8060 + }, + { + "ep_loss": 0.0, + "epoch": 6.22, + "learning_rate": 0.00040350000000000005, + "loss": 0.4126, + "mlm_loss": 0.4126, + "step": 8070 + }, + { + "ep_loss": 0.0, + "epoch": 6.23, + "learning_rate": 0.000404, + "loss": 0.4142, + "mlm_loss": 0.4142, + "step": 8080 + }, + { + "ep_loss": 0.0, + "epoch": 6.24, + "learning_rate": 0.0004045, + "loss": 0.4141, + "mlm_loss": 0.4141, + "step": 8090 + }, + { + "ep_loss": 0.0, + "epoch": 6.25, + "learning_rate": 0.00040500000000000003, + "loss": 0.4129, + "mlm_loss": 0.4129, + "step": 8100 + }, + { + "epoch": 6.25, + "eval_ep_loss": -2.3123908042907715, + "eval_loss": 0.39225736260414124, + "eval_mlm_loss": 0.39225736260414124, + "eval_runtime": 61.9154, + "eval_samples_per_second": 1128.459, + "eval_steps_per_second": 0.565, + "step": 8100 + }, + { + "ep_loss": 0.0, + "epoch": 6.26, + "learning_rate": 0.00040550000000000004, + "loss": 0.4155, + "mlm_loss": 0.4155, + "step": 8110 + }, + { + "ep_loss": 0.0, + "epoch": 6.26, + "learning_rate": 0.00040600000000000006, + "loss": 0.4174, + "mlm_loss": 0.4174, + "step": 8120 + }, + { + "ep_loss": 0.0, + "epoch": 6.27, + "learning_rate": 0.00040649999999999996, + "loss": 0.4099, + "mlm_loss": 0.4099, + "step": 8130 + }, + { + "ep_loss": 0.0, + "epoch": 6.28, + "learning_rate": 0.00040699999999999997, + "loss": 0.4123, + "mlm_loss": 0.4123, + "step": 8140 + }, + { + "ep_loss": 0.0, + "epoch": 6.29, + "learning_rate": 0.0004075, + "loss": 0.4216, + "mlm_loss": 0.4216, + "step": 8150 + }, + { + "ep_loss": 0.0, + "epoch": 6.29, + "learning_rate": 0.000408, + "loss": 0.4095, + "mlm_loss": 0.4095, + "step": 8160 + }, + { + "ep_loss": 0.0, + "epoch": 6.3, + "learning_rate": 0.0004085, + "loss": 0.413, + "mlm_loss": 0.413, + "step": 8170 + }, + { + "ep_loss": 0.0, + "epoch": 6.31, + "learning_rate": 0.00040899999999999997, + "loss": 0.4165, + "mlm_loss": 0.4165, + "step": 8180 + }, + { + "ep_loss": 0.0, + "epoch": 6.32, + "learning_rate": 0.0004095, + "loss": 0.4191, + "mlm_loss": 0.4191, + "step": 8190 + }, + { + "ep_loss": 0.0, + "epoch": 6.32, + "learning_rate": 0.00041, + "loss": 0.4161, + "mlm_loss": 0.4161, + "step": 8200 + }, + { + "epoch": 6.32, + "eval_ep_loss": -2.4482126235961914, + "eval_loss": 0.388824462890625, + "eval_mlm_loss": 0.388824462890625, + "eval_runtime": 59.1336, + "eval_samples_per_second": 1181.545, + "eval_steps_per_second": 0.592, + "step": 8200 + }, + { + "ep_loss": 0.0, + "epoch": 6.33, + "learning_rate": 0.0004105, + "loss": 0.4218, + "mlm_loss": 0.4218, + "step": 8210 + }, + { + "ep_loss": 0.0, + "epoch": 6.34, + "learning_rate": 0.00041099999999999996, + "loss": 0.4085, + "mlm_loss": 0.4085, + "step": 8220 + }, + { + "ep_loss": 0.0, + "epoch": 6.35, + "learning_rate": 0.0004115, + "loss": 0.4114, + "mlm_loss": 0.4114, + "step": 8230 + }, + { + "ep_loss": 0.0, + "epoch": 6.36, + "learning_rate": 0.000412, + "loss": 0.4129, + "mlm_loss": 0.4129, + "step": 8240 + }, + { + "ep_loss": 0.0, + "epoch": 6.36, + "learning_rate": 0.0004125, + "loss": 0.422, + "mlm_loss": 0.422, + "step": 8250 + }, + { + "ep_loss": 0.0, + "epoch": 6.37, + "learning_rate": 0.000413, + "loss": 0.4184, + "mlm_loss": 0.4184, + "step": 8260 + }, + { + "ep_loss": 0.0, + "epoch": 6.38, + "learning_rate": 0.00041349999999999997, + "loss": 0.4192, + "mlm_loss": 0.4192, + "step": 8270 + }, + { + "ep_loss": 0.0, + "epoch": 6.39, + "learning_rate": 0.000414, + "loss": 0.4168, + "mlm_loss": 0.4168, + "step": 8280 + }, + { + "ep_loss": 0.0, + "epoch": 6.39, + "learning_rate": 0.0004145, + "loss": 0.4171, + "mlm_loss": 0.4171, + "step": 8290 + }, + { + "ep_loss": 0.0, + "epoch": 6.4, + "learning_rate": 0.000415, + "loss": 0.4131, + "mlm_loss": 0.4131, + "step": 8300 + }, + { + "epoch": 6.4, + "eval_ep_loss": -2.358128547668457, + "eval_loss": 0.38723644614219666, + "eval_mlm_loss": 0.38723644614219666, + "eval_runtime": 60.0999, + "eval_samples_per_second": 1162.548, + "eval_steps_per_second": 0.582, + "step": 8300 + }, + { + "ep_loss": 0.0, + "epoch": 6.41, + "learning_rate": 0.00041549999999999996, + "loss": 0.4149, + "mlm_loss": 0.4149, + "step": 8310 + }, + { + "ep_loss": 0.0, + "epoch": 6.42, + "learning_rate": 0.000416, + "loss": 0.4068, + "mlm_loss": 0.4068, + "step": 8320 + }, + { + "ep_loss": 0.0, + "epoch": 6.42, + "learning_rate": 0.0004165, + "loss": 0.4102, + "mlm_loss": 0.4102, + "step": 8330 + }, + { + "ep_loss": 0.0, + "epoch": 6.43, + "learning_rate": 0.000417, + "loss": 0.4129, + "mlm_loss": 0.4129, + "step": 8340 + }, + { + "ep_loss": 0.0, + "epoch": 6.44, + "learning_rate": 0.0004175, + "loss": 0.413, + "mlm_loss": 0.413, + "step": 8350 + }, + { + "ep_loss": 0.0, + "epoch": 6.45, + "learning_rate": 0.00041799999999999997, + "loss": 0.4143, + "mlm_loss": 0.4143, + "step": 8360 + }, + { + "ep_loss": 0.0, + "epoch": 6.46, + "learning_rate": 0.0004185, + "loss": 0.4134, + "mlm_loss": 0.4134, + "step": 8370 + }, + { + "ep_loss": 0.0, + "epoch": 6.46, + "learning_rate": 0.000419, + "loss": 0.406, + "mlm_loss": 0.406, + "step": 8380 + }, + { + "ep_loss": 0.0, + "epoch": 6.47, + "learning_rate": 0.0004195, + "loss": 0.4144, + "mlm_loss": 0.4144, + "step": 8390 + }, + { + "ep_loss": 0.0, + "epoch": 6.48, + "learning_rate": 0.00042, + "loss": 0.4131, + "mlm_loss": 0.4131, + "step": 8400 + }, + { + "epoch": 6.48, + "eval_ep_loss": -2.0901758670806885, + "eval_loss": 0.3867802917957306, + "eval_mlm_loss": 0.3867802917957306, + "eval_runtime": 63.0955, + "eval_samples_per_second": 1107.354, + "eval_steps_per_second": 0.555, + "step": 8400 + }, + { + "ep_loss": 0.0, + "epoch": 6.49, + "learning_rate": 0.0004205, + "loss": 0.4011, + "mlm_loss": 0.4011, + "step": 8410 + }, + { + "ep_loss": 0.0, + "epoch": 6.49, + "learning_rate": 0.000421, + "loss": 0.4117, + "mlm_loss": 0.4117, + "step": 8420 + }, + { + "ep_loss": 0.0, + "epoch": 6.5, + "learning_rate": 0.0004215, + "loss": 0.414, + "mlm_loss": 0.414, + "step": 8430 + }, + { + "ep_loss": 0.0, + "epoch": 6.51, + "learning_rate": 0.000422, + "loss": 0.4186, + "mlm_loss": 0.4186, + "step": 8440 + }, + { + "ep_loss": 0.0, + "epoch": 6.52, + "learning_rate": 0.00042249999999999997, + "loss": 0.4149, + "mlm_loss": 0.4149, + "step": 8450 + }, + { + "ep_loss": 0.0, + "epoch": 6.53, + "learning_rate": 0.000423, + "loss": 0.4154, + "mlm_loss": 0.4154, + "step": 8460 + }, + { + "ep_loss": 0.0, + "epoch": 6.53, + "learning_rate": 0.0004235, + "loss": 0.409, + "mlm_loss": 0.409, + "step": 8470 + }, + { + "ep_loss": 0.0, + "epoch": 6.54, + "learning_rate": 0.000424, + "loss": 0.4098, + "mlm_loss": 0.4098, + "step": 8480 + }, + { + "ep_loss": 0.0, + "epoch": 6.55, + "learning_rate": 0.0004245, + "loss": 0.4179, + "mlm_loss": 0.4179, + "step": 8490 + }, + { + "ep_loss": 0.0, + "epoch": 6.56, + "learning_rate": 0.000425, + "loss": 0.4147, + "mlm_loss": 0.4147, + "step": 8500 + }, + { + "epoch": 6.56, + "eval_ep_loss": -2.5853233337402344, + "eval_loss": 0.3858720660209656, + "eval_mlm_loss": 0.3858720660209656, + "eval_runtime": 60.8689, + "eval_samples_per_second": 1147.86, + "eval_steps_per_second": 0.575, + "step": 8500 + }, + { + "ep_loss": 0.0, + "epoch": 6.56, + "learning_rate": 0.0004255, + "loss": 0.4094, + "mlm_loss": 0.4094, + "step": 8510 + }, + { + "ep_loss": 0.0, + "epoch": 6.57, + "learning_rate": 0.000426, + "loss": 0.4136, + "mlm_loss": 0.4136, + "step": 8520 + }, + { + "ep_loss": 0.0, + "epoch": 6.58, + "learning_rate": 0.0004265, + "loss": 0.4077, + "mlm_loss": 0.4077, + "step": 8530 + }, + { + "ep_loss": 0.0, + "epoch": 6.59, + "learning_rate": 0.000427, + "loss": 0.4077, + "mlm_loss": 0.4077, + "step": 8540 + }, + { + "ep_loss": 0.0, + "epoch": 6.59, + "learning_rate": 0.0004275, + "loss": 0.4115, + "mlm_loss": 0.4115, + "step": 8550 + }, + { + "ep_loss": 0.0, + "epoch": 6.6, + "learning_rate": 0.000428, + "loss": 0.4068, + "mlm_loss": 0.4068, + "step": 8560 + }, + { + "ep_loss": 0.0, + "epoch": 6.61, + "learning_rate": 0.0004285, + "loss": 0.4096, + "mlm_loss": 0.4096, + "step": 8570 + }, + { + "ep_loss": 0.0, + "epoch": 6.62, + "learning_rate": 0.000429, + "loss": 0.4146, + "mlm_loss": 0.4146, + "step": 8580 + }, + { + "ep_loss": 0.0, + "epoch": 6.63, + "learning_rate": 0.0004295, + "loss": 0.4095, + "mlm_loss": 0.4095, + "step": 8590 + }, + { + "ep_loss": 0.0, + "epoch": 6.63, + "learning_rate": 0.00043, + "loss": 0.4105, + "mlm_loss": 0.4105, + "step": 8600 + }, + { + "epoch": 6.63, + "eval_ep_loss": -2.559201955795288, + "eval_loss": 0.3874005973339081, + "eval_mlm_loss": 0.3874005973339081, + "eval_runtime": 62.3828, + "eval_samples_per_second": 1120.003, + "eval_steps_per_second": 0.561, + "step": 8600 + }, + { + "ep_loss": 0.0, + "epoch": 6.64, + "learning_rate": 0.0004305, + "loss": 0.4121, + "mlm_loss": 0.4121, + "step": 8610 + }, + { + "ep_loss": 0.0, + "epoch": 6.65, + "learning_rate": 0.000431, + "loss": 0.4115, + "mlm_loss": 0.4115, + "step": 8620 + }, + { + "ep_loss": 0.0, + "epoch": 6.66, + "learning_rate": 0.0004315, + "loss": 0.4124, + "mlm_loss": 0.4124, + "step": 8630 + }, + { + "ep_loss": 0.0, + "epoch": 6.66, + "learning_rate": 0.000432, + "loss": 0.4158, + "mlm_loss": 0.4158, + "step": 8640 + }, + { + "ep_loss": 0.0, + "epoch": 6.67, + "learning_rate": 0.0004325, + "loss": 0.4099, + "mlm_loss": 0.4099, + "step": 8650 + }, + { + "ep_loss": 0.0, + "epoch": 6.68, + "learning_rate": 0.000433, + "loss": 0.4157, + "mlm_loss": 0.4157, + "step": 8660 + }, + { + "ep_loss": 0.0, + "epoch": 6.69, + "learning_rate": 0.0004335, + "loss": 0.4032, + "mlm_loss": 0.4032, + "step": 8670 + }, + { + "ep_loss": 0.0, + "epoch": 6.69, + "learning_rate": 0.00043400000000000003, + "loss": 0.4094, + "mlm_loss": 0.4094, + "step": 8680 + }, + { + "ep_loss": 0.0, + "epoch": 6.7, + "learning_rate": 0.0004345, + "loss": 0.4057, + "mlm_loss": 0.4057, + "step": 8690 + }, + { + "ep_loss": 0.0, + "epoch": 6.71, + "learning_rate": 0.000435, + "loss": 0.4144, + "mlm_loss": 0.4144, + "step": 8700 + }, + { + "epoch": 6.71, + "eval_ep_loss": -2.2817635536193848, + "eval_loss": 0.3839435577392578, + "eval_mlm_loss": 0.3839435577392578, + "eval_runtime": 61.6764, + "eval_samples_per_second": 1132.831, + "eval_steps_per_second": 0.567, + "step": 8700 + }, + { + "ep_loss": 0.0, + "epoch": 6.72, + "learning_rate": 0.0004355, + "loss": 0.4114, + "mlm_loss": 0.4114, + "step": 8710 + }, + { + "ep_loss": 0.0, + "epoch": 6.73, + "learning_rate": 0.000436, + "loss": 0.4091, + "mlm_loss": 0.4091, + "step": 8720 + }, + { + "ep_loss": 0.0, + "epoch": 6.73, + "learning_rate": 0.0004365, + "loss": 0.4058, + "mlm_loss": 0.4058, + "step": 8730 + }, + { + "ep_loss": 0.0, + "epoch": 6.74, + "learning_rate": 0.000437, + "loss": 0.4009, + "mlm_loss": 0.4009, + "step": 8740 + }, + { + "ep_loss": 0.0, + "epoch": 6.75, + "learning_rate": 0.0004375, + "loss": 0.405, + "mlm_loss": 0.405, + "step": 8750 + }, + { + "ep_loss": 0.0, + "epoch": 6.76, + "learning_rate": 0.000438, + "loss": 0.4084, + "mlm_loss": 0.4084, + "step": 8760 + }, + { + "ep_loss": 0.0, + "epoch": 6.76, + "learning_rate": 0.00043850000000000003, + "loss": 0.409, + "mlm_loss": 0.409, + "step": 8770 + }, + { + "ep_loss": 0.0, + "epoch": 6.77, + "learning_rate": 0.000439, + "loss": 0.407, + "mlm_loss": 0.407, + "step": 8780 + }, + { + "ep_loss": 0.0, + "epoch": 6.78, + "learning_rate": 0.0004395, + "loss": 0.4003, + "mlm_loss": 0.4003, + "step": 8790 + }, + { + "ep_loss": 0.0, + "epoch": 6.79, + "learning_rate": 0.00044, + "loss": 0.4021, + "mlm_loss": 0.4021, + "step": 8800 + }, + { + "epoch": 6.79, + "eval_ep_loss": -2.1474719047546387, + "eval_loss": 0.38189372420310974, + "eval_mlm_loss": 0.38189372420310974, + "eval_runtime": 59.6801, + "eval_samples_per_second": 1170.726, + "eval_steps_per_second": 0.586, + "step": 8800 + }, + { + "ep_loss": 0.0, + "epoch": 6.8, + "learning_rate": 0.00044050000000000003, + "loss": 0.4166, + "mlm_loss": 0.4166, + "step": 8810 + }, + { + "ep_loss": 0.0, + "epoch": 6.8, + "learning_rate": 0.000441, + "loss": 0.4055, + "mlm_loss": 0.4055, + "step": 8820 + }, + { + "ep_loss": 0.0, + "epoch": 6.81, + "learning_rate": 0.0004415, + "loss": 0.4095, + "mlm_loss": 0.4095, + "step": 8830 + }, + { + "ep_loss": 0.0, + "epoch": 6.82, + "learning_rate": 0.000442, + "loss": 0.4063, + "mlm_loss": 0.4063, + "step": 8840 + }, + { + "ep_loss": 0.0, + "epoch": 6.83, + "learning_rate": 0.0004425, + "loss": 0.4124, + "mlm_loss": 0.4124, + "step": 8850 + }, + { + "ep_loss": 0.0, + "epoch": 6.83, + "learning_rate": 0.00044300000000000003, + "loss": 0.4052, + "mlm_loss": 0.4052, + "step": 8860 + }, + { + "ep_loss": 0.0, + "epoch": 6.84, + "learning_rate": 0.0004435, + "loss": 0.4106, + "mlm_loss": 0.4106, + "step": 8870 + }, + { + "ep_loss": 0.0, + "epoch": 6.85, + "learning_rate": 0.000444, + "loss": 0.4042, + "mlm_loss": 0.4042, + "step": 8880 + }, + { + "ep_loss": 0.0, + "epoch": 6.86, + "learning_rate": 0.0004445, + "loss": 0.4126, + "mlm_loss": 0.4126, + "step": 8890 + }, + { + "ep_loss": 0.0, + "epoch": 6.86, + "learning_rate": 0.00044500000000000003, + "loss": 0.4019, + "mlm_loss": 0.4019, + "step": 8900 + }, + { + "epoch": 6.86, + "eval_ep_loss": -2.1015372276306152, + "eval_loss": 0.3846256732940674, + "eval_mlm_loss": 0.3846256732940674, + "eval_runtime": 59.4967, + "eval_samples_per_second": 1174.335, + "eval_steps_per_second": 0.588, + "step": 8900 + }, + { + "ep_loss": 0.0, + "epoch": 6.87, + "learning_rate": 0.00044550000000000004, + "loss": 0.4092, + "mlm_loss": 0.4092, + "step": 8910 + }, + { + "ep_loss": 0.0, + "epoch": 6.88, + "learning_rate": 0.000446, + "loss": 0.4053, + "mlm_loss": 0.4053, + "step": 8920 + }, + { + "ep_loss": 0.0, + "epoch": 6.89, + "learning_rate": 0.0004465, + "loss": 0.4071, + "mlm_loss": 0.4071, + "step": 8930 + }, + { + "ep_loss": 0.0, + "epoch": 6.9, + "learning_rate": 0.000447, + "loss": 0.4087, + "mlm_loss": 0.4087, + "step": 8940 + }, + { + "ep_loss": 0.0, + "epoch": 6.9, + "learning_rate": 0.00044750000000000004, + "loss": 0.4075, + "mlm_loss": 0.4075, + "step": 8950 + }, + { + "ep_loss": 0.0, + "epoch": 6.91, + "learning_rate": 0.000448, + "loss": 0.4134, + "mlm_loss": 0.4134, + "step": 8960 + }, + { + "ep_loss": 0.0, + "epoch": 6.92, + "learning_rate": 0.0004485, + "loss": 0.407, + "mlm_loss": 0.407, + "step": 8970 + }, + { + "ep_loss": 0.0, + "epoch": 6.93, + "learning_rate": 0.000449, + "loss": 0.4081, + "mlm_loss": 0.4081, + "step": 8980 + }, + { + "ep_loss": 0.0, + "epoch": 6.93, + "learning_rate": 0.00044950000000000003, + "loss": 0.4047, + "mlm_loss": 0.4047, + "step": 8990 + }, + { + "ep_loss": 0.0, + "epoch": 6.94, + "learning_rate": 0.00045000000000000004, + "loss": 0.4046, + "mlm_loss": 0.4046, + "step": 9000 + }, + { + "epoch": 6.94, + "eval_ep_loss": -2.1676366329193115, + "eval_loss": 0.3808509409427643, + "eval_mlm_loss": 0.3808509409427643, + "eval_runtime": 60.7859, + "eval_samples_per_second": 1149.428, + "eval_steps_per_second": 0.576, + "step": 9000 + }, + { + "ep_loss": 0.0, + "epoch": 6.95, + "learning_rate": 0.0004505, + "loss": 0.4108, + "mlm_loss": 0.4108, + "step": 9010 + }, + { + "ep_loss": 0.0, + "epoch": 6.96, + "learning_rate": 0.000451, + "loss": 0.4095, + "mlm_loss": 0.4095, + "step": 9020 + }, + { + "ep_loss": 0.0, + "epoch": 6.96, + "learning_rate": 0.0004515, + "loss": 0.4069, + "mlm_loss": 0.4069, + "step": 9030 + }, + { + "ep_loss": 0.0, + "epoch": 6.97, + "learning_rate": 0.00045200000000000004, + "loss": 0.407, + "mlm_loss": 0.407, + "step": 9040 + }, + { + "ep_loss": 0.0, + "epoch": 6.98, + "learning_rate": 0.00045250000000000005, + "loss": 0.4056, + "mlm_loss": 0.4056, + "step": 9050 + }, + { + "ep_loss": 0.0, + "epoch": 6.99, + "learning_rate": 0.000453, + "loss": 0.4107, + "mlm_loss": 0.4107, + "step": 9060 + }, + { + "ep_loss": 0.0, + "epoch": 7.0, + "learning_rate": 0.0004535, + "loss": 0.4085, + "mlm_loss": 0.4085, + "step": 9070 + }, + { + "ep_loss": 0.0, + "epoch": 7.0, + "learning_rate": 0.00045400000000000003, + "loss": 0.4146, + "mlm_loss": 0.4146, + "step": 9080 + }, + { + "ep_loss": 0.0, + "epoch": 7.01, + "learning_rate": 0.00045450000000000004, + "loss": 0.4041, + "mlm_loss": 0.4041, + "step": 9090 + }, + { + "ep_loss": 0.0, + "epoch": 7.02, + "learning_rate": 0.000455, + "loss": 0.414, + "mlm_loss": 0.414, + "step": 9100 + }, + { + "epoch": 7.02, + "eval_ep_loss": -2.345536231994629, + "eval_loss": 0.3819408118724823, + "eval_mlm_loss": 0.3819408118724823, + "eval_runtime": 61.289, + "eval_samples_per_second": 1139.992, + "eval_steps_per_second": 0.571, + "step": 9100 + }, + { + "ep_loss": 0.0, + "epoch": 7.03, + "learning_rate": 0.0004555, + "loss": 0.4014, + "mlm_loss": 0.4014, + "step": 9110 + }, + { + "ep_loss": 0.0, + "epoch": 7.03, + "learning_rate": 0.000456, + "loss": 0.4039, + "mlm_loss": 0.4039, + "step": 9120 + }, + { + "ep_loss": 0.0, + "epoch": 7.04, + "learning_rate": 0.00045650000000000004, + "loss": 0.4066, + "mlm_loss": 0.4066, + "step": 9130 + }, + { + "ep_loss": 0.0, + "epoch": 7.05, + "learning_rate": 0.00045700000000000005, + "loss": 0.4107, + "mlm_loss": 0.4107, + "step": 9140 + }, + { + "ep_loss": 0.0, + "epoch": 7.06, + "learning_rate": 0.0004575, + "loss": 0.4035, + "mlm_loss": 0.4035, + "step": 9150 + }, + { + "ep_loss": 0.0, + "epoch": 7.07, + "learning_rate": 0.000458, + "loss": 0.4037, + "mlm_loss": 0.4037, + "step": 9160 + }, + { + "ep_loss": 0.0, + "epoch": 7.07, + "learning_rate": 0.00045850000000000003, + "loss": 0.4035, + "mlm_loss": 0.4035, + "step": 9170 + }, + { + "ep_loss": 0.0, + "epoch": 7.08, + "learning_rate": 0.00045900000000000004, + "loss": 0.4054, + "mlm_loss": 0.4054, + "step": 9180 + }, + { + "ep_loss": 0.0, + "epoch": 7.09, + "learning_rate": 0.00045950000000000006, + "loss": 0.4078, + "mlm_loss": 0.4078, + "step": 9190 + }, + { + "ep_loss": 0.0, + "epoch": 7.1, + "learning_rate": 0.00046, + "loss": 0.4066, + "mlm_loss": 0.4066, + "step": 9200 + }, + { + "epoch": 7.1, + "eval_ep_loss": -2.1998836994171143, + "eval_loss": 0.37953051924705505, + "eval_mlm_loss": 0.37953051924705505, + "eval_runtime": 59.882, + "eval_samples_per_second": 1166.779, + "eval_steps_per_second": 0.584, + "step": 9200 + }, + { + "ep_loss": 0.0, + "epoch": 7.1, + "learning_rate": 0.0004605, + "loss": 0.3965, + "mlm_loss": 0.3965, + "step": 9210 + }, + { + "ep_loss": 0.0, + "epoch": 7.11, + "learning_rate": 0.00046100000000000004, + "loss": 0.3973, + "mlm_loss": 0.3973, + "step": 9220 + }, + { + "ep_loss": 0.0, + "epoch": 7.12, + "learning_rate": 0.00046150000000000005, + "loss": 0.3996, + "mlm_loss": 0.3996, + "step": 9230 + }, + { + "ep_loss": 0.0, + "epoch": 7.13, + "learning_rate": 0.000462, + "loss": 0.4075, + "mlm_loss": 0.4075, + "step": 9240 + }, + { + "ep_loss": 0.0, + "epoch": 7.13, + "learning_rate": 0.0004625, + "loss": 0.4042, + "mlm_loss": 0.4042, + "step": 9250 + }, + { + "ep_loss": 0.0, + "epoch": 7.14, + "learning_rate": 0.00046300000000000003, + "loss": 0.3976, + "mlm_loss": 0.3976, + "step": 9260 + }, + { + "ep_loss": 0.0, + "epoch": 7.15, + "learning_rate": 0.00046350000000000004, + "loss": 0.4026, + "mlm_loss": 0.4026, + "step": 9270 + }, + { + "ep_loss": 0.0, + "epoch": 7.16, + "learning_rate": 0.00046400000000000006, + "loss": 0.4047, + "mlm_loss": 0.4047, + "step": 9280 + }, + { + "ep_loss": 0.0, + "epoch": 7.17, + "learning_rate": 0.0004645, + "loss": 0.403, + "mlm_loss": 0.403, + "step": 9290 + }, + { + "ep_loss": 0.0, + "epoch": 7.17, + "learning_rate": 0.000465, + "loss": 0.4064, + "mlm_loss": 0.4064, + "step": 9300 + }, + { + "epoch": 7.17, + "eval_ep_loss": -2.131833553314209, + "eval_loss": 0.38073843717575073, + "eval_mlm_loss": 0.38073843717575073, + "eval_runtime": 61.1101, + "eval_samples_per_second": 1143.331, + "eval_steps_per_second": 0.573, + "step": 9300 + }, + { + "ep_loss": 0.0, + "epoch": 7.18, + "learning_rate": 0.00046550000000000004, + "loss": 0.4023, + "mlm_loss": 0.4023, + "step": 9310 + }, + { + "ep_loss": 0.0, + "epoch": 7.19, + "learning_rate": 0.00046600000000000005, + "loss": 0.4037, + "mlm_loss": 0.4037, + "step": 9320 + }, + { + "ep_loss": 0.0, + "epoch": 7.2, + "learning_rate": 0.0004665, + "loss": 0.4024, + "mlm_loss": 0.4024, + "step": 9330 + }, + { + "ep_loss": 0.0, + "epoch": 7.2, + "learning_rate": 0.000467, + "loss": 0.4031, + "mlm_loss": 0.4031, + "step": 9340 + }, + { + "ep_loss": 0.0, + "epoch": 7.21, + "learning_rate": 0.00046750000000000003, + "loss": 0.4069, + "mlm_loss": 0.4069, + "step": 9350 + }, + { + "ep_loss": 0.0, + "epoch": 7.22, + "learning_rate": 0.00046800000000000005, + "loss": 0.4134, + "mlm_loss": 0.4134, + "step": 9360 + }, + { + "ep_loss": 0.0, + "epoch": 7.23, + "learning_rate": 0.00046850000000000006, + "loss": 0.3998, + "mlm_loss": 0.3998, + "step": 9370 + }, + { + "ep_loss": 0.0, + "epoch": 7.23, + "learning_rate": 0.00046899999999999996, + "loss": 0.403, + "mlm_loss": 0.403, + "step": 9380 + }, + { + "ep_loss": 0.0, + "epoch": 7.24, + "learning_rate": 0.0004695, + "loss": 0.4067, + "mlm_loss": 0.4067, + "step": 9390 + }, + { + "ep_loss": 0.0, + "epoch": 7.25, + "learning_rate": 0.00047, + "loss": 0.4022, + "mlm_loss": 0.4022, + "step": 9400 + }, + { + "epoch": 7.25, + "eval_ep_loss": -2.311575412750244, + "eval_loss": 0.3808383643627167, + "eval_mlm_loss": 0.3808383643627167, + "eval_runtime": 61.1652, + "eval_samples_per_second": 1142.301, + "eval_steps_per_second": 0.572, + "step": 9400 + }, + { + "ep_loss": 0.0, + "epoch": 7.26, + "learning_rate": 0.0004705, + "loss": 0.405, + "mlm_loss": 0.405, + "step": 9410 + }, + { + "ep_loss": 0.0, + "epoch": 7.27, + "learning_rate": 0.000471, + "loss": 0.3949, + "mlm_loss": 0.3949, + "step": 9420 + }, + { + "ep_loss": 0.0, + "epoch": 7.27, + "learning_rate": 0.00047149999999999997, + "loss": 0.405, + "mlm_loss": 0.405, + "step": 9430 + }, + { + "ep_loss": 0.0, + "epoch": 7.28, + "learning_rate": 0.000472, + "loss": 0.4092, + "mlm_loss": 0.4092, + "step": 9440 + }, + { + "ep_loss": 0.0, + "epoch": 7.29, + "learning_rate": 0.0004725, + "loss": 0.4102, + "mlm_loss": 0.4102, + "step": 9450 + }, + { + "ep_loss": 0.0, + "epoch": 7.3, + "learning_rate": 0.000473, + "loss": 0.4066, + "mlm_loss": 0.4066, + "step": 9460 + }, + { + "ep_loss": 0.0, + "epoch": 7.3, + "learning_rate": 0.00047349999999999996, + "loss": 0.3987, + "mlm_loss": 0.3987, + "step": 9470 + }, + { + "ep_loss": 0.0, + "epoch": 7.31, + "learning_rate": 0.000474, + "loss": 0.399, + "mlm_loss": 0.399, + "step": 9480 + }, + { + "ep_loss": 0.0, + "epoch": 7.32, + "learning_rate": 0.0004745, + "loss": 0.4033, + "mlm_loss": 0.4033, + "step": 9490 + }, + { + "ep_loss": 0.0, + "epoch": 7.33, + "learning_rate": 0.000475, + "loss": 0.3982, + "mlm_loss": 0.3982, + "step": 9500 + }, + { + "epoch": 7.33, + "eval_ep_loss": -2.2588560581207275, + "eval_loss": 0.37661808729171753, + "eval_mlm_loss": 0.37661808729171753, + "eval_runtime": 61.5272, + "eval_samples_per_second": 1135.579, + "eval_steps_per_second": 0.569, + "step": 9500 + }, + { + "ep_loss": 0.0, + "epoch": 7.34, + "learning_rate": 0.0004755, + "loss": 0.4048, + "mlm_loss": 0.4048, + "step": 9510 + }, + { + "ep_loss": 0.0, + "epoch": 7.34, + "learning_rate": 0.00047599999999999997, + "loss": 0.4034, + "mlm_loss": 0.4034, + "step": 9520 + }, + { + "ep_loss": 0.0, + "epoch": 7.35, + "learning_rate": 0.0004765, + "loss": 0.4001, + "mlm_loss": 0.4001, + "step": 9530 + }, + { + "ep_loss": 0.0, + "epoch": 7.36, + "learning_rate": 0.000477, + "loss": 0.406, + "mlm_loss": 0.406, + "step": 9540 + }, + { + "ep_loss": 0.0, + "epoch": 7.37, + "learning_rate": 0.0004775, + "loss": 0.4007, + "mlm_loss": 0.4007, + "step": 9550 + }, + { + "ep_loss": 0.0, + "epoch": 7.37, + "learning_rate": 0.00047799999999999996, + "loss": 0.416, + "mlm_loss": 0.416, + "step": 9560 + }, + { + "ep_loss": 0.0, + "epoch": 7.38, + "learning_rate": 0.0004785, + "loss": 0.398, + "mlm_loss": 0.398, + "step": 9570 + }, + { + "ep_loss": 0.0, + "epoch": 7.39, + "learning_rate": 0.000479, + "loss": 0.4045, + "mlm_loss": 0.4045, + "step": 9580 + }, + { + "ep_loss": 0.0, + "epoch": 7.4, + "learning_rate": 0.0004795, + "loss": 0.4011, + "mlm_loss": 0.4011, + "step": 9590 + }, + { + "ep_loss": 0.0, + "epoch": 7.4, + "learning_rate": 0.00048, + "loss": 0.3991, + "mlm_loss": 0.3991, + "step": 9600 + }, + { + "epoch": 7.4, + "eval_ep_loss": -2.4242005348205566, + "eval_loss": 0.37998148798942566, + "eval_mlm_loss": 0.37998148798942566, + "eval_runtime": 59.2852, + "eval_samples_per_second": 1178.524, + "eval_steps_per_second": 0.59, + "step": 9600 + }, + { + "ep_loss": 0.0, + "epoch": 7.41, + "learning_rate": 0.00048049999999999997, + "loss": 0.3983, + "mlm_loss": 0.3983, + "step": 9610 + }, + { + "ep_loss": 0.0, + "epoch": 7.42, + "learning_rate": 0.000481, + "loss": 0.4056, + "mlm_loss": 0.4056, + "step": 9620 + }, + { + "ep_loss": 0.0, + "epoch": 7.43, + "learning_rate": 0.0004815, + "loss": 0.4034, + "mlm_loss": 0.4034, + "step": 9630 + }, + { + "ep_loss": 0.0, + "epoch": 7.44, + "learning_rate": 0.000482, + "loss": 0.4071, + "mlm_loss": 0.4071, + "step": 9640 + }, + { + "ep_loss": 0.0, + "epoch": 7.44, + "learning_rate": 0.0004825, + "loss": 0.3994, + "mlm_loss": 0.3994, + "step": 9650 + }, + { + "ep_loss": 0.0, + "epoch": 7.45, + "learning_rate": 0.000483, + "loss": 0.398, + "mlm_loss": 0.398, + "step": 9660 + }, + { + "ep_loss": 0.0, + "epoch": 7.46, + "learning_rate": 0.0004835, + "loss": 0.4038, + "mlm_loss": 0.4038, + "step": 9670 + }, + { + "ep_loss": 0.0, + "epoch": 7.47, + "learning_rate": 0.000484, + "loss": 0.401, + "mlm_loss": 0.401, + "step": 9680 + }, + { + "ep_loss": 0.0, + "epoch": 7.47, + "learning_rate": 0.0004845, + "loss": 0.4049, + "mlm_loss": 0.4049, + "step": 9690 + }, + { + "ep_loss": 0.0, + "epoch": 7.48, + "learning_rate": 0.00048499999999999997, + "loss": 0.4003, + "mlm_loss": 0.4003, + "step": 9700 + }, + { + "epoch": 7.48, + "eval_ep_loss": -2.468691825866699, + "eval_loss": 0.37805142998695374, + "eval_mlm_loss": 0.37805142998695374, + "eval_runtime": 60.3023, + "eval_samples_per_second": 1158.646, + "eval_steps_per_second": 0.58, + "step": 9700 + }, + { + "ep_loss": 0.0, + "epoch": 7.49, + "learning_rate": 0.0004855, + "loss": 0.3978, + "mlm_loss": 0.3978, + "step": 9710 + }, + { + "ep_loss": 0.0, + "epoch": 7.5, + "learning_rate": 0.000486, + "loss": 0.3994, + "mlm_loss": 0.3994, + "step": 9720 + }, + { + "ep_loss": 0.0, + "epoch": 7.5, + "learning_rate": 0.0004865, + "loss": 0.4029, + "mlm_loss": 0.4029, + "step": 9730 + }, + { + "ep_loss": 0.0, + "epoch": 7.51, + "learning_rate": 0.000487, + "loss": 0.3991, + "mlm_loss": 0.3991, + "step": 9740 + }, + { + "ep_loss": 0.0, + "epoch": 7.52, + "learning_rate": 0.0004875, + "loss": 0.4061, + "mlm_loss": 0.4061, + "step": 9750 + }, + { + "ep_loss": 0.0, + "epoch": 7.53, + "learning_rate": 0.000488, + "loss": 0.4047, + "mlm_loss": 0.4047, + "step": 9760 + }, + { + "ep_loss": 0.0, + "epoch": 7.54, + "learning_rate": 0.0004885, + "loss": 0.4071, + "mlm_loss": 0.4071, + "step": 9770 + }, + { + "ep_loss": 0.0, + "epoch": 7.54, + "learning_rate": 0.000489, + "loss": 0.4014, + "mlm_loss": 0.4014, + "step": 9780 + }, + { + "ep_loss": 0.0, + "epoch": 7.55, + "learning_rate": 0.0004895, + "loss": 0.4023, + "mlm_loss": 0.4023, + "step": 9790 + }, + { + "ep_loss": 0.0, + "epoch": 7.56, + "learning_rate": 0.00049, + "loss": 0.4052, + "mlm_loss": 0.4052, + "step": 9800 + }, + { + "epoch": 7.56, + "eval_ep_loss": -2.403273344039917, + "eval_loss": 0.3772997260093689, + "eval_mlm_loss": 0.3772997260093689, + "eval_runtime": 59.2376, + "eval_samples_per_second": 1179.471, + "eval_steps_per_second": 0.591, + "step": 9800 + }, + { + "ep_loss": 0.0, + "epoch": 7.57, + "learning_rate": 0.0004905, + "loss": 0.394, + "mlm_loss": 0.394, + "step": 9810 + }, + { + "ep_loss": 0.0, + "epoch": 7.57, + "learning_rate": 0.000491, + "loss": 0.4071, + "mlm_loss": 0.4071, + "step": 9820 + }, + { + "ep_loss": 0.0, + "epoch": 7.58, + "learning_rate": 0.0004915, + "loss": 0.3992, + "mlm_loss": 0.3992, + "step": 9830 + }, + { + "ep_loss": 0.0, + "epoch": 7.59, + "learning_rate": 0.000492, + "loss": 0.4024, + "mlm_loss": 0.4024, + "step": 9840 + }, + { + "ep_loss": 0.0, + "epoch": 7.6, + "learning_rate": 0.0004925, + "loss": 0.4042, + "mlm_loss": 0.4042, + "step": 9850 + }, + { + "ep_loss": 0.0, + "epoch": 7.61, + "learning_rate": 0.0004930000000000001, + "loss": 0.4067, + "mlm_loss": 0.4067, + "step": 9860 + }, + { + "ep_loss": 0.0, + "epoch": 7.61, + "learning_rate": 0.0004935, + "loss": 0.3995, + "mlm_loss": 0.3995, + "step": 9870 + }, + { + "ep_loss": 0.0, + "epoch": 7.62, + "learning_rate": 0.000494, + "loss": 0.3998, + "mlm_loss": 0.3998, + "step": 9880 + }, + { + "ep_loss": 0.0, + "epoch": 7.63, + "learning_rate": 0.0004945, + "loss": 0.4011, + "mlm_loss": 0.4011, + "step": 9890 + }, + { + "ep_loss": 0.0, + "epoch": 7.64, + "learning_rate": 0.000495, + "loss": 0.4003, + "mlm_loss": 0.4003, + "step": 9900 + }, + { + "epoch": 7.64, + "eval_ep_loss": -2.388214111328125, + "eval_loss": 0.3742392659187317, + "eval_mlm_loss": 0.3742392659187317, + "eval_runtime": 60.995, + "eval_samples_per_second": 1145.488, + "eval_steps_per_second": 0.574, + "step": 9900 + }, + { + "ep_loss": 0.0, + "epoch": 7.64, + "learning_rate": 0.0004955, + "loss": 0.3885, + "mlm_loss": 0.3885, + "step": 9910 + }, + { + "ep_loss": 0.0, + "epoch": 7.65, + "learning_rate": 0.000496, + "loss": 0.3988, + "mlm_loss": 0.3988, + "step": 9920 + }, + { + "ep_loss": 0.0, + "epoch": 7.66, + "learning_rate": 0.0004965, + "loss": 0.4044, + "mlm_loss": 0.4044, + "step": 9930 + }, + { + "ep_loss": 0.0, + "epoch": 7.67, + "learning_rate": 0.000497, + "loss": 0.4002, + "mlm_loss": 0.4002, + "step": 9940 + }, + { + "ep_loss": 0.0, + "epoch": 7.67, + "learning_rate": 0.0004975, + "loss": 0.4078, + "mlm_loss": 0.4078, + "step": 9950 + }, + { + "ep_loss": 0.0, + "epoch": 7.68, + "learning_rate": 0.000498, + "loss": 0.3993, + "mlm_loss": 0.3993, + "step": 9960 + }, + { + "ep_loss": 0.0, + "epoch": 7.69, + "learning_rate": 0.0004985, + "loss": 0.3985, + "mlm_loss": 0.3985, + "step": 9970 + }, + { + "ep_loss": 0.0, + "epoch": 7.7, + "learning_rate": 0.0004989500000000001, + "loss": 0.4052, + "mlm_loss": 0.4052, + "step": 9980 + }, + { + "ep_loss": 0.0, + "epoch": 7.71, + "learning_rate": 0.00049945, + "loss": 0.4045, + "mlm_loss": 0.4045, + "step": 9990 + }, + { + "ep_loss": 0.0, + "epoch": 7.71, + "learning_rate": 0.00049995, + "loss": 0.3959, + "mlm_loss": 0.3959, + "step": 10000 + }, + { + "epoch": 7.71, + "eval_ep_loss": -2.4986319541931152, + "eval_loss": 0.3774937689304352, + "eval_mlm_loss": 0.3774937689304352, + "eval_runtime": 60.247, + "eval_samples_per_second": 1159.709, + "eval_steps_per_second": 0.581, + "step": 10000 + }, + { + "ep_loss": 0.0, + "epoch": 7.72, + "learning_rate": 0.0004997173366834171, + "loss": 0.4023, + "mlm_loss": 0.4023, + "step": 10010 + }, + { + "ep_loss": 0.0, + "epoch": 7.73, + "learning_rate": 0.0004994032663316583, + "loss": 0.407, + "mlm_loss": 0.407, + "step": 10020 + }, + { + "ep_loss": 0.0, + "epoch": 7.74, + "learning_rate": 0.0004990891959798995, + "loss": 0.4024, + "mlm_loss": 0.4024, + "step": 10030 + }, + { + "ep_loss": 0.0, + "epoch": 7.74, + "learning_rate": 0.0004987751256281407, + "loss": 0.3991, + "mlm_loss": 0.3991, + "step": 10040 + }, + { + "ep_loss": 0.0, + "epoch": 7.75, + "learning_rate": 0.000498461055276382, + "loss": 0.4058, + "mlm_loss": 0.4058, + "step": 10050 + }, + { + "ep_loss": 0.0, + "epoch": 7.76, + "learning_rate": 0.0004981469849246231, + "loss": 0.4046, + "mlm_loss": 0.4046, + "step": 10060 + }, + { + "ep_loss": 0.0, + "epoch": 7.77, + "learning_rate": 0.0004978329145728643, + "loss": 0.397, + "mlm_loss": 0.397, + "step": 10070 + }, + { + "ep_loss": 0.0, + "epoch": 7.77, + "learning_rate": 0.0004975188442211055, + "loss": 0.3967, + "mlm_loss": 0.3967, + "step": 10080 + }, + { + "ep_loss": 0.0, + "epoch": 7.78, + "learning_rate": 0.0004972047738693468, + "loss": 0.3986, + "mlm_loss": 0.3986, + "step": 10090 + }, + { + "ep_loss": 0.0, + "epoch": 7.79, + "learning_rate": 0.000496890703517588, + "loss": 0.4018, + "mlm_loss": 0.4018, + "step": 10100 + }, + { + "epoch": 7.79, + "eval_ep_loss": -2.651991605758667, + "eval_loss": 0.374184250831604, + "eval_mlm_loss": 0.374184250831604, + "eval_runtime": 61.0227, + "eval_samples_per_second": 1144.966, + "eval_steps_per_second": 0.574, + "step": 10100 + }, + { + "ep_loss": 0.0, + "epoch": 7.8, + "learning_rate": 0.0004965766331658292, + "loss": 0.4029, + "mlm_loss": 0.4029, + "step": 10110 + }, + { + "ep_loss": 0.0, + "epoch": 7.81, + "learning_rate": 0.0004962625628140703, + "loss": 0.4006, + "mlm_loss": 0.4006, + "step": 10120 + }, + { + "ep_loss": 0.0, + "epoch": 7.81, + "learning_rate": 0.0004959484924623116, + "loss": 0.4003, + "mlm_loss": 0.4003, + "step": 10130 + }, + { + "ep_loss": 0.0, + "epoch": 7.82, + "learning_rate": 0.0004956344221105528, + "loss": 0.3982, + "mlm_loss": 0.3982, + "step": 10140 + }, + { + "ep_loss": 0.0, + "epoch": 7.83, + "learning_rate": 0.000495320351758794, + "loss": 0.4044, + "mlm_loss": 0.4044, + "step": 10150 + }, + { + "ep_loss": 0.0, + "epoch": 7.84, + "learning_rate": 0.0004950062814070351, + "loss": 0.4023, + "mlm_loss": 0.4023, + "step": 10160 + }, + { + "ep_loss": 0.0, + "epoch": 7.84, + "learning_rate": 0.0004946922110552764, + "loss": 0.4024, + "mlm_loss": 0.4024, + "step": 10170 + }, + { + "ep_loss": 0.0, + "epoch": 7.85, + "learning_rate": 0.0004943781407035176, + "loss": 0.4036, + "mlm_loss": 0.4036, + "step": 10180 + }, + { + "ep_loss": 0.0, + "epoch": 7.86, + "learning_rate": 0.0004940640703517588, + "loss": 0.3962, + "mlm_loss": 0.3962, + "step": 10190 + }, + { + "ep_loss": 0.0, + "epoch": 7.87, + "learning_rate": 0.00049375, + "loss": 0.4061, + "mlm_loss": 0.4061, + "step": 10200 + }, + { + "epoch": 7.87, + "eval_ep_loss": -2.436077356338501, + "eval_loss": 0.3732386827468872, + "eval_mlm_loss": 0.3732386827468872, + "eval_runtime": 59.8162, + "eval_samples_per_second": 1168.062, + "eval_steps_per_second": 0.585, + "step": 10200 + }, + { + "ep_loss": 0.0, + "epoch": 7.88, + "learning_rate": 0.0004934359296482412, + "loss": 0.3959, + "mlm_loss": 0.3959, + "step": 10210 + }, + { + "ep_loss": 0.0, + "epoch": 7.88, + "learning_rate": 0.0004931218592964824, + "loss": 0.3997, + "mlm_loss": 0.3997, + "step": 10220 + }, + { + "ep_loss": 0.0, + "epoch": 7.89, + "learning_rate": 0.0004928077889447236, + "loss": 0.4013, + "mlm_loss": 0.4013, + "step": 10230 + }, + { + "ep_loss": 0.0, + "epoch": 7.9, + "learning_rate": 0.0004924937185929649, + "loss": 0.3956, + "mlm_loss": 0.3956, + "step": 10240 + }, + { + "ep_loss": 0.0, + "epoch": 7.91, + "learning_rate": 0.0004921796482412061, + "loss": 0.3967, + "mlm_loss": 0.3967, + "step": 10250 + }, + { + "ep_loss": 0.0, + "epoch": 7.91, + "learning_rate": 0.0004918655778894472, + "loss": 0.3967, + "mlm_loss": 0.3967, + "step": 10260 + }, + { + "ep_loss": 0.0, + "epoch": 7.92, + "learning_rate": 0.0004915515075376884, + "loss": 0.3954, + "mlm_loss": 0.3954, + "step": 10270 + }, + { + "ep_loss": 0.0, + "epoch": 7.93, + "learning_rate": 0.0004912374371859297, + "loss": 0.3976, + "mlm_loss": 0.3976, + "step": 10280 + }, + { + "ep_loss": 0.0, + "epoch": 7.94, + "learning_rate": 0.0004909233668341709, + "loss": 0.3945, + "mlm_loss": 0.3945, + "step": 10290 + }, + { + "ep_loss": 0.0, + "epoch": 7.94, + "learning_rate": 0.0004906092964824121, + "loss": 0.3979, + "mlm_loss": 0.3979, + "step": 10300 + }, + { + "epoch": 7.94, + "eval_ep_loss": -2.4026572704315186, + "eval_loss": 0.37325599789619446, + "eval_mlm_loss": 0.37325599789619446, + "eval_runtime": 61.5586, + "eval_samples_per_second": 1135.0, + "eval_steps_per_second": 0.569, + "step": 10300 + }, + { + "ep_loss": 0.0, + "epoch": 7.95, + "learning_rate": 0.0004902952261306532, + "loss": 0.3971, + "mlm_loss": 0.3971, + "step": 10310 + }, + { + "ep_loss": 0.0, + "epoch": 7.96, + "learning_rate": 0.0004899811557788945, + "loss": 0.4018, + "mlm_loss": 0.4018, + "step": 10320 + }, + { + "ep_loss": 0.0, + "epoch": 7.97, + "learning_rate": 0.0004896670854271357, + "loss": 0.3993, + "mlm_loss": 0.3993, + "step": 10330 + }, + { + "ep_loss": 0.0, + "epoch": 7.98, + "learning_rate": 0.0004893530150753769, + "loss": 0.3995, + "mlm_loss": 0.3995, + "step": 10340 + }, + { + "ep_loss": 0.0, + "epoch": 7.98, + "learning_rate": 0.0004890389447236181, + "loss": 0.3948, + "mlm_loss": 0.3948, + "step": 10350 + }, + { + "ep_loss": 0.0, + "epoch": 7.99, + "learning_rate": 0.0004887248743718593, + "loss": 0.4028, + "mlm_loss": 0.4028, + "step": 10360 + }, + { + "ep_loss": 0.0, + "epoch": 8.0, + "learning_rate": 0.0004884108040201005, + "loss": 0.3972, + "mlm_loss": 0.3972, + "step": 10370 + }, + { + "ep_loss": 0.0, + "epoch": 8.01, + "learning_rate": 0.0004880967336683417, + "loss": 0.3899, + "mlm_loss": 0.3899, + "step": 10380 + }, + { + "ep_loss": 0.0, + "epoch": 8.01, + "learning_rate": 0.00048778266331658295, + "loss": 0.3993, + "mlm_loss": 0.3993, + "step": 10390 + }, + { + "ep_loss": 0.0, + "epoch": 8.02, + "learning_rate": 0.0004874685929648242, + "loss": 0.3945, + "mlm_loss": 0.3945, + "step": 10400 + }, + { + "epoch": 8.02, + "eval_ep_loss": -2.2991716861724854, + "eval_loss": 0.3698738217353821, + "eval_mlm_loss": 0.3698738217353821, + "eval_runtime": 62.0944, + "eval_samples_per_second": 1125.206, + "eval_steps_per_second": 0.564, + "step": 10400 + }, + { + "ep_loss": 0.0, + "epoch": 8.03, + "learning_rate": 0.0004871545226130653, + "loss": 0.3855, + "mlm_loss": 0.3855, + "step": 10410 + }, + { + "ep_loss": 0.0, + "epoch": 8.04, + "learning_rate": 0.0004868404522613065, + "loss": 0.3925, + "mlm_loss": 0.3925, + "step": 10420 + }, + { + "ep_loss": 0.0, + "epoch": 8.04, + "learning_rate": 0.00048652638190954775, + "loss": 0.3901, + "mlm_loss": 0.3901, + "step": 10430 + }, + { + "ep_loss": 0.0, + "epoch": 8.05, + "learning_rate": 0.0004862123115577889, + "loss": 0.39, + "mlm_loss": 0.39, + "step": 10440 + }, + { + "ep_loss": 0.0, + "epoch": 8.06, + "learning_rate": 0.00048589824120603015, + "loss": 0.3904, + "mlm_loss": 0.3904, + "step": 10450 + }, + { + "ep_loss": 0.0, + "epoch": 8.07, + "learning_rate": 0.0004855841708542714, + "loss": 0.3908, + "mlm_loss": 0.3908, + "step": 10460 + }, + { + "ep_loss": 0.0, + "epoch": 8.08, + "learning_rate": 0.00048527010050251256, + "loss": 0.3931, + "mlm_loss": 0.3931, + "step": 10470 + }, + { + "ep_loss": 0.0, + "epoch": 8.08, + "learning_rate": 0.0004849560301507538, + "loss": 0.3899, + "mlm_loss": 0.3899, + "step": 10480 + }, + { + "ep_loss": 0.0, + "epoch": 8.09, + "learning_rate": 0.000484641959798995, + "loss": 0.3908, + "mlm_loss": 0.3908, + "step": 10490 + }, + { + "ep_loss": 0.0, + "epoch": 8.1, + "learning_rate": 0.00048432788944723624, + "loss": 0.3909, + "mlm_loss": 0.3909, + "step": 10500 + }, + { + "epoch": 8.1, + "eval_ep_loss": -2.246208429336548, + "eval_loss": 0.36925286054611206, + "eval_mlm_loss": 0.36925286054611206, + "eval_runtime": 61.3519, + "eval_samples_per_second": 1138.823, + "eval_steps_per_second": 0.57, + "step": 10500 + }, + { + "ep_loss": 0.0, + "epoch": 8.11, + "learning_rate": 0.00048401381909547736, + "loss": 0.3922, + "mlm_loss": 0.3922, + "step": 10510 + }, + { + "ep_loss": 0.0, + "epoch": 8.11, + "learning_rate": 0.0004836997487437186, + "loss": 0.3878, + "mlm_loss": 0.3878, + "step": 10520 + }, + { + "ep_loss": 0.0, + "epoch": 8.12, + "learning_rate": 0.0004833856783919598, + "loss": 0.3901, + "mlm_loss": 0.3901, + "step": 10530 + }, + { + "ep_loss": 0.0, + "epoch": 8.13, + "learning_rate": 0.000483071608040201, + "loss": 0.3872, + "mlm_loss": 0.3872, + "step": 10540 + }, + { + "ep_loss": 0.0, + "epoch": 8.14, + "learning_rate": 0.0004827575376884422, + "loss": 0.3931, + "mlm_loss": 0.3931, + "step": 10550 + }, + { + "ep_loss": 0.0, + "epoch": 8.15, + "learning_rate": 0.00048244346733668345, + "loss": 0.3928, + "mlm_loss": 0.3928, + "step": 10560 + }, + { + "ep_loss": 0.0, + "epoch": 8.15, + "learning_rate": 0.0004821293969849246, + "loss": 0.3975, + "mlm_loss": 0.3975, + "step": 10570 + }, + { + "ep_loss": 0.0, + "epoch": 8.16, + "learning_rate": 0.00048181532663316585, + "loss": 0.4002, + "mlm_loss": 0.4002, + "step": 10580 + }, + { + "ep_loss": 0.0, + "epoch": 8.17, + "learning_rate": 0.0004815012562814071, + "loss": 0.3961, + "mlm_loss": 0.3961, + "step": 10590 + }, + { + "ep_loss": 0.0, + "epoch": 8.18, + "learning_rate": 0.00048118718592964825, + "loss": 0.392, + "mlm_loss": 0.392, + "step": 10600 + }, + { + "epoch": 8.18, + "eval_ep_loss": -2.7679216861724854, + "eval_loss": 0.36933663487434387, + "eval_mlm_loss": 0.36933663487434387, + "eval_runtime": 61.879, + "eval_samples_per_second": 1129.123, + "eval_steps_per_second": 0.566, + "step": 10600 + }, + { + "ep_loss": 0.0, + "epoch": 8.18, + "learning_rate": 0.0004808731155778894, + "loss": 0.3957, + "mlm_loss": 0.3957, + "step": 10610 + }, + { + "ep_loss": 0.0, + "epoch": 8.19, + "learning_rate": 0.00048055904522613065, + "loss": 0.4001, + "mlm_loss": 0.4001, + "step": 10620 + }, + { + "ep_loss": 0.0, + "epoch": 8.2, + "learning_rate": 0.0004802449748743719, + "loss": 0.398, + "mlm_loss": 0.398, + "step": 10630 + }, + { + "ep_loss": 0.0, + "epoch": 8.21, + "learning_rate": 0.00047993090452261305, + "loss": 0.3897, + "mlm_loss": 0.3897, + "step": 10640 + }, + { + "ep_loss": 0.0, + "epoch": 8.21, + "learning_rate": 0.0004796168341708543, + "loss": 0.3949, + "mlm_loss": 0.3949, + "step": 10650 + }, + { + "ep_loss": 0.0, + "epoch": 8.22, + "learning_rate": 0.0004793027638190955, + "loss": 0.3844, + "mlm_loss": 0.3844, + "step": 10660 + }, + { + "ep_loss": 0.0, + "epoch": 8.23, + "learning_rate": 0.0004789886934673367, + "loss": 0.3931, + "mlm_loss": 0.3931, + "step": 10670 + }, + { + "ep_loss": 0.0, + "epoch": 8.24, + "learning_rate": 0.0004786746231155779, + "loss": 0.3892, + "mlm_loss": 0.3892, + "step": 10680 + }, + { + "ep_loss": 0.0, + "epoch": 8.25, + "learning_rate": 0.00047836055276381914, + "loss": 0.3946, + "mlm_loss": 0.3946, + "step": 10690 + }, + { + "ep_loss": 0.0, + "epoch": 8.25, + "learning_rate": 0.0004780464824120603, + "loss": 0.393, + "mlm_loss": 0.393, + "step": 10700 + }, + { + "epoch": 8.25, + "eval_ep_loss": -2.653468608856201, + "eval_loss": 0.36575964093208313, + "eval_mlm_loss": 0.36575964093208313, + "eval_runtime": 61.0152, + "eval_samples_per_second": 1145.108, + "eval_steps_per_second": 0.574, + "step": 10700 + }, + { + "ep_loss": 0.0, + "epoch": 8.26, + "learning_rate": 0.0004777324120603015, + "loss": 0.3865, + "mlm_loss": 0.3865, + "step": 10710 + }, + { + "ep_loss": 0.0, + "epoch": 8.27, + "learning_rate": 0.0004774183417085427, + "loss": 0.3936, + "mlm_loss": 0.3936, + "step": 10720 + }, + { + "ep_loss": 0.0, + "epoch": 8.28, + "learning_rate": 0.0004771042713567839, + "loss": 0.3874, + "mlm_loss": 0.3874, + "step": 10730 + }, + { + "ep_loss": 0.0, + "epoch": 8.28, + "learning_rate": 0.0004767902010050251, + "loss": 0.3839, + "mlm_loss": 0.3839, + "step": 10740 + }, + { + "ep_loss": 0.0, + "epoch": 8.29, + "learning_rate": 0.00047647613065326635, + "loss": 0.3819, + "mlm_loss": 0.3819, + "step": 10750 + }, + { + "ep_loss": 0.0, + "epoch": 8.3, + "learning_rate": 0.0004761620603015076, + "loss": 0.3841, + "mlm_loss": 0.3841, + "step": 10760 + }, + { + "ep_loss": 0.0, + "epoch": 8.31, + "learning_rate": 0.00047584798994974875, + "loss": 0.3896, + "mlm_loss": 0.3896, + "step": 10770 + }, + { + "ep_loss": 0.0, + "epoch": 8.31, + "learning_rate": 0.00047553391959799, + "loss": 0.388, + "mlm_loss": 0.388, + "step": 10780 + }, + { + "ep_loss": 0.0, + "epoch": 8.32, + "learning_rate": 0.0004752198492462312, + "loss": 0.3864, + "mlm_loss": 0.3864, + "step": 10790 + }, + { + "ep_loss": 0.0, + "epoch": 8.33, + "learning_rate": 0.0004749057788944724, + "loss": 0.3887, + "mlm_loss": 0.3887, + "step": 10800 + }, + { + "epoch": 8.33, + "eval_ep_loss": -2.6650192737579346, + "eval_loss": 0.362870454788208, + "eval_mlm_loss": 0.362870454788208, + "eval_runtime": 59.7372, + "eval_samples_per_second": 1169.606, + "eval_steps_per_second": 0.586, + "step": 10800 + }, + { + "ep_loss": 0.0, + "epoch": 8.34, + "learning_rate": 0.00047459170854271355, + "loss": 0.3898, + "mlm_loss": 0.3898, + "step": 10810 + }, + { + "ep_loss": 0.0, + "epoch": 8.35, + "learning_rate": 0.0004742776381909548, + "loss": 0.3881, + "mlm_loss": 0.3881, + "step": 10820 + }, + { + "ep_loss": 0.0, + "epoch": 8.35, + "learning_rate": 0.00047396356783919595, + "loss": 0.3868, + "mlm_loss": 0.3868, + "step": 10830 + }, + { + "ep_loss": 0.0, + "epoch": 8.36, + "learning_rate": 0.0004736494974874372, + "loss": 0.3924, + "mlm_loss": 0.3924, + "step": 10840 + }, + { + "ep_loss": 0.0, + "epoch": 8.37, + "learning_rate": 0.0004733354271356784, + "loss": 0.3849, + "mlm_loss": 0.3849, + "step": 10850 + }, + { + "ep_loss": 0.0, + "epoch": 8.38, + "learning_rate": 0.0004730213567839196, + "loss": 0.387, + "mlm_loss": 0.387, + "step": 10860 + }, + { + "ep_loss": 0.0, + "epoch": 8.38, + "learning_rate": 0.0004727072864321608, + "loss": 0.3829, + "mlm_loss": 0.3829, + "step": 10870 + }, + { + "ep_loss": 0.0, + "epoch": 8.39, + "learning_rate": 0.00047239321608040204, + "loss": 0.3881, + "mlm_loss": 0.3881, + "step": 10880 + }, + { + "ep_loss": 0.0, + "epoch": 8.4, + "learning_rate": 0.00047207914572864327, + "loss": 0.3957, + "mlm_loss": 0.3957, + "step": 10890 + }, + { + "ep_loss": 0.0, + "epoch": 8.41, + "learning_rate": 0.00047176507537688444, + "loss": 0.3856, + "mlm_loss": 0.3856, + "step": 10900 + }, + { + "epoch": 8.41, + "eval_ep_loss": -2.556627035140991, + "eval_loss": 0.3644386827945709, + "eval_mlm_loss": 0.3644386827945709, + "eval_runtime": 60.0106, + "eval_samples_per_second": 1164.279, + "eval_steps_per_second": 0.583, + "step": 10900 + }, + { + "ep_loss": 0.0, + "epoch": 8.41, + "learning_rate": 0.0004714510050251256, + "loss": 0.3918, + "mlm_loss": 0.3918, + "step": 10910 + }, + { + "ep_loss": 0.0, + "epoch": 8.42, + "learning_rate": 0.00047113693467336684, + "loss": 0.3888, + "mlm_loss": 0.3888, + "step": 10920 + }, + { + "ep_loss": 0.0, + "epoch": 8.43, + "learning_rate": 0.000470822864321608, + "loss": 0.3874, + "mlm_loss": 0.3874, + "step": 10930 + }, + { + "ep_loss": 0.0, + "epoch": 8.44, + "learning_rate": 0.00047050879396984925, + "loss": 0.3826, + "mlm_loss": 0.3826, + "step": 10940 + }, + { + "ep_loss": 0.0, + "epoch": 8.45, + "learning_rate": 0.0004701947236180905, + "loss": 0.3817, + "mlm_loss": 0.3817, + "step": 10950 + }, + { + "ep_loss": 0.0, + "epoch": 8.45, + "learning_rate": 0.00046988065326633165, + "loss": 0.3877, + "mlm_loss": 0.3877, + "step": 10960 + }, + { + "ep_loss": 0.0, + "epoch": 8.46, + "learning_rate": 0.0004695665829145729, + "loss": 0.3841, + "mlm_loss": 0.3841, + "step": 10970 + }, + { + "ep_loss": 0.0, + "epoch": 8.47, + "learning_rate": 0.0004692525125628141, + "loss": 0.3854, + "mlm_loss": 0.3854, + "step": 10980 + }, + { + "ep_loss": 0.0, + "epoch": 8.48, + "learning_rate": 0.0004689384422110553, + "loss": 0.3818, + "mlm_loss": 0.3818, + "step": 10990 + }, + { + "ep_loss": 0.0, + "epoch": 8.48, + "learning_rate": 0.0004686243718592965, + "loss": 0.3844, + "mlm_loss": 0.3844, + "step": 11000 + }, + { + "epoch": 8.48, + "eval_ep_loss": -2.669065475463867, + "eval_loss": 0.36087533831596375, + "eval_mlm_loss": 0.36087533831596375, + "eval_runtime": 59.8288, + "eval_samples_per_second": 1167.815, + "eval_steps_per_second": 0.585, + "step": 11000 + }, + { + "ep_loss": 0.0, + "epoch": 8.49, + "learning_rate": 0.0004683103015075377, + "loss": 0.3867, + "mlm_loss": 0.3867, + "step": 11010 + }, + { + "ep_loss": 0.0, + "epoch": 8.5, + "learning_rate": 0.0004679962311557789, + "loss": 0.383, + "mlm_loss": 0.383, + "step": 11020 + }, + { + "ep_loss": 0.0, + "epoch": 8.51, + "learning_rate": 0.0004676821608040201, + "loss": 0.3897, + "mlm_loss": 0.3897, + "step": 11030 + }, + { + "ep_loss": 0.0, + "epoch": 8.52, + "learning_rate": 0.0004673680904522613, + "loss": 0.3908, + "mlm_loss": 0.3908, + "step": 11040 + }, + { + "ep_loss": 0.0, + "epoch": 8.52, + "learning_rate": 0.00046705402010050254, + "loss": 0.3854, + "mlm_loss": 0.3854, + "step": 11050 + }, + { + "ep_loss": 0.0, + "epoch": 8.53, + "learning_rate": 0.0004667399497487437, + "loss": 0.3886, + "mlm_loss": 0.3886, + "step": 11060 + }, + { + "ep_loss": 0.0, + "epoch": 8.54, + "learning_rate": 0.00046642587939698494, + "loss": 0.3814, + "mlm_loss": 0.3814, + "step": 11070 + }, + { + "ep_loss": 0.0, + "epoch": 8.55, + "learning_rate": 0.00046611180904522617, + "loss": 0.3888, + "mlm_loss": 0.3888, + "step": 11080 + }, + { + "ep_loss": 0.0, + "epoch": 8.55, + "learning_rate": 0.00046579773869346734, + "loss": 0.3884, + "mlm_loss": 0.3884, + "step": 11090 + }, + { + "ep_loss": 0.0, + "epoch": 8.56, + "learning_rate": 0.00046548366834170857, + "loss": 0.39, + "mlm_loss": 0.39, + "step": 11100 + }, + { + "epoch": 8.56, + "eval_ep_loss": -2.3106160163879395, + "eval_loss": 0.3573386073112488, + "eval_mlm_loss": 0.3573386073112488, + "eval_runtime": 61.2527, + "eval_samples_per_second": 1140.668, + "eval_steps_per_second": 0.571, + "step": 11100 + }, + { + "ep_loss": 0.0, + "epoch": 8.57, + "learning_rate": 0.00046516959798994974, + "loss": 0.3767, + "mlm_loss": 0.3767, + "step": 11110 + }, + { + "ep_loss": 0.0, + "epoch": 8.58, + "learning_rate": 0.0004648555276381909, + "loss": 0.3712, + "mlm_loss": 0.3712, + "step": 11120 + }, + { + "ep_loss": 0.0, + "epoch": 8.58, + "learning_rate": 0.00046454145728643215, + "loss": 0.3787, + "mlm_loss": 0.3787, + "step": 11130 + }, + { + "ep_loss": 0.0, + "epoch": 8.59, + "learning_rate": 0.0004642273869346734, + "loss": 0.3815, + "mlm_loss": 0.3815, + "step": 11140 + }, + { + "ep_loss": 0.0, + "epoch": 8.6, + "learning_rate": 0.0004639133165829146, + "loss": 0.3837, + "mlm_loss": 0.3837, + "step": 11150 + }, + { + "ep_loss": 0.0, + "epoch": 8.61, + "learning_rate": 0.0004635992462311558, + "loss": 0.3851, + "mlm_loss": 0.3851, + "step": 11160 + }, + { + "ep_loss": 0.0, + "epoch": 8.62, + "learning_rate": 0.000463285175879397, + "loss": 0.3818, + "mlm_loss": 0.3818, + "step": 11170 + }, + { + "ep_loss": 0.0, + "epoch": 8.62, + "learning_rate": 0.00046297110552763823, + "loss": 0.3841, + "mlm_loss": 0.3841, + "step": 11180 + }, + { + "ep_loss": 0.0, + "epoch": 8.63, + "learning_rate": 0.0004626570351758794, + "loss": 0.3803, + "mlm_loss": 0.3803, + "step": 11190 + }, + { + "ep_loss": 0.0, + "epoch": 8.64, + "learning_rate": 0.00046234296482412063, + "loss": 0.3799, + "mlm_loss": 0.3799, + "step": 11200 + }, + { + "epoch": 8.64, + "eval_ep_loss": -2.7373206615448, + "eval_loss": 0.35678714513778687, + "eval_mlm_loss": 0.35678714513778687, + "eval_runtime": 63.4192, + "eval_samples_per_second": 1101.701, + "eval_steps_per_second": 0.552, + "step": 11200 + }, + { + "ep_loss": 0.0, + "epoch": 8.65, + "learning_rate": 0.0004620288944723618, + "loss": 0.3788, + "mlm_loss": 0.3788, + "step": 11210 + }, + { + "ep_loss": 0.0, + "epoch": 8.65, + "learning_rate": 0.000461714824120603, + "loss": 0.385, + "mlm_loss": 0.385, + "step": 11220 + }, + { + "ep_loss": 0.0, + "epoch": 8.66, + "learning_rate": 0.0004614007537688442, + "loss": 0.3886, + "mlm_loss": 0.3886, + "step": 11230 + }, + { + "ep_loss": 0.0, + "epoch": 8.67, + "learning_rate": 0.00046108668341708544, + "loss": 0.3816, + "mlm_loss": 0.3816, + "step": 11240 + }, + { + "ep_loss": 0.0, + "epoch": 8.68, + "learning_rate": 0.00046077261306532667, + "loss": 0.3913, + "mlm_loss": 0.3913, + "step": 11250 + }, + { + "ep_loss": 0.0, + "epoch": 8.68, + "learning_rate": 0.00046045854271356784, + "loss": 0.3816, + "mlm_loss": 0.3816, + "step": 11260 + }, + { + "ep_loss": 0.0, + "epoch": 8.69, + "learning_rate": 0.00046014447236180907, + "loss": 0.3809, + "mlm_loss": 0.3809, + "step": 11270 + }, + { + "ep_loss": 0.0, + "epoch": 8.7, + "learning_rate": 0.0004598304020100503, + "loss": 0.3901, + "mlm_loss": 0.3901, + "step": 11280 + }, + { + "ep_loss": 0.0, + "epoch": 8.71, + "learning_rate": 0.00045951633165829147, + "loss": 0.3791, + "mlm_loss": 0.3791, + "step": 11290 + }, + { + "ep_loss": 0.0, + "epoch": 8.72, + "learning_rate": 0.0004592022613065327, + "loss": 0.3778, + "mlm_loss": 0.3778, + "step": 11300 + }, + { + "epoch": 8.72, + "eval_ep_loss": -2.907904624938965, + "eval_loss": 0.3548813462257385, + "eval_mlm_loss": 0.3548813462257385, + "eval_runtime": 63.2595, + "eval_samples_per_second": 1104.482, + "eval_steps_per_second": 0.553, + "step": 11300 + }, + { + "ep_loss": 0.0, + "epoch": 8.72, + "learning_rate": 0.00045888819095477387, + "loss": 0.3802, + "mlm_loss": 0.3802, + "step": 11310 + }, + { + "ep_loss": 0.0, + "epoch": 8.73, + "learning_rate": 0.00045857412060301505, + "loss": 0.3811, + "mlm_loss": 0.3811, + "step": 11320 + }, + { + "ep_loss": 0.0, + "epoch": 8.74, + "learning_rate": 0.0004582600502512563, + "loss": 0.385, + "mlm_loss": 0.385, + "step": 11330 + }, + { + "ep_loss": 0.0, + "epoch": 8.75, + "learning_rate": 0.0004579459798994975, + "loss": 0.3828, + "mlm_loss": 0.3828, + "step": 11340 + }, + { + "ep_loss": 0.0, + "epoch": 8.75, + "learning_rate": 0.0004576319095477387, + "loss": 0.3789, + "mlm_loss": 0.3789, + "step": 11350 + }, + { + "ep_loss": 0.0, + "epoch": 8.76, + "learning_rate": 0.0004573178391959799, + "loss": 0.3802, + "mlm_loss": 0.3802, + "step": 11360 + }, + { + "ep_loss": 0.0, + "epoch": 8.77, + "learning_rate": 0.00045700376884422113, + "loss": 0.38, + "mlm_loss": 0.38, + "step": 11370 + }, + { + "ep_loss": 0.0, + "epoch": 8.78, + "learning_rate": 0.00045668969849246236, + "loss": 0.3842, + "mlm_loss": 0.3842, + "step": 11380 + }, + { + "ep_loss": 0.0, + "epoch": 8.79, + "learning_rate": 0.00045637562814070353, + "loss": 0.3843, + "mlm_loss": 0.3843, + "step": 11390 + }, + { + "ep_loss": 0.0, + "epoch": 8.79, + "learning_rate": 0.00045606155778894476, + "loss": 0.3811, + "mlm_loss": 0.3811, + "step": 11400 + }, + { + "epoch": 8.79, + "eval_ep_loss": -2.4214723110198975, + "eval_loss": 0.35386431217193604, + "eval_mlm_loss": 0.35386431217193604, + "eval_runtime": 59.4503, + "eval_samples_per_second": 1175.25, + "eval_steps_per_second": 0.589, + "step": 11400 + }, + { + "ep_loss": 0.0, + "epoch": 8.8, + "learning_rate": 0.00045574748743718594, + "loss": 0.3777, + "mlm_loss": 0.3777, + "step": 11410 + }, + { + "ep_loss": 0.0, + "epoch": 8.81, + "learning_rate": 0.0004554334170854271, + "loss": 0.3857, + "mlm_loss": 0.3857, + "step": 11420 + }, + { + "ep_loss": 0.0, + "epoch": 8.82, + "learning_rate": 0.00045511934673366834, + "loss": 0.3818, + "mlm_loss": 0.3818, + "step": 11430 + }, + { + "ep_loss": 0.0, + "epoch": 8.82, + "learning_rate": 0.00045480527638190957, + "loss": 0.3794, + "mlm_loss": 0.3794, + "step": 11440 + }, + { + "ep_loss": 0.0, + "epoch": 8.83, + "learning_rate": 0.00045449120603015074, + "loss": 0.3783, + "mlm_loss": 0.3783, + "step": 11450 + }, + { + "ep_loss": 0.0, + "epoch": 8.84, + "learning_rate": 0.00045417713567839197, + "loss": 0.3826, + "mlm_loss": 0.3826, + "step": 11460 + }, + { + "ep_loss": 0.0, + "epoch": 8.85, + "learning_rate": 0.0004538630653266332, + "loss": 0.3786, + "mlm_loss": 0.3786, + "step": 11470 + }, + { + "ep_loss": 0.0, + "epoch": 8.85, + "learning_rate": 0.00045354899497487437, + "loss": 0.3731, + "mlm_loss": 0.3731, + "step": 11480 + }, + { + "ep_loss": 0.0, + "epoch": 8.86, + "learning_rate": 0.0004532349246231156, + "loss": 0.3766, + "mlm_loss": 0.3766, + "step": 11490 + }, + { + "ep_loss": 0.0, + "epoch": 8.87, + "learning_rate": 0.0004529208542713568, + "loss": 0.386, + "mlm_loss": 0.386, + "step": 11500 + }, + { + "epoch": 8.87, + "eval_ep_loss": -2.3580639362335205, + "eval_loss": 0.3554854094982147, + "eval_mlm_loss": 0.3554854094982147, + "eval_runtime": 59.7228, + "eval_samples_per_second": 1169.889, + "eval_steps_per_second": 0.586, + "step": 11500 + }, + { + "ep_loss": 0.0, + "epoch": 8.88, + "learning_rate": 0.000452606783919598, + "loss": 0.3765, + "mlm_loss": 0.3765, + "step": 11510 + }, + { + "ep_loss": 0.0, + "epoch": 8.89, + "learning_rate": 0.0004522927135678392, + "loss": 0.375, + "mlm_loss": 0.375, + "step": 11520 + }, + { + "ep_loss": 0.0, + "epoch": 8.89, + "learning_rate": 0.0004519786432160804, + "loss": 0.3831, + "mlm_loss": 0.3831, + "step": 11530 + }, + { + "ep_loss": 0.0, + "epoch": 8.9, + "learning_rate": 0.00045166457286432163, + "loss": 0.3797, + "mlm_loss": 0.3797, + "step": 11540 + }, + { + "ep_loss": 0.0, + "epoch": 8.91, + "learning_rate": 0.0004513505025125628, + "loss": 0.3729, + "mlm_loss": 0.3729, + "step": 11550 + }, + { + "ep_loss": 0.0, + "epoch": 8.92, + "learning_rate": 0.00045103643216080403, + "loss": 0.3736, + "mlm_loss": 0.3736, + "step": 11560 + }, + { + "ep_loss": 0.0, + "epoch": 8.92, + "learning_rate": 0.00045072236180904526, + "loss": 0.3756, + "mlm_loss": 0.3756, + "step": 11570 + }, + { + "ep_loss": 0.0, + "epoch": 8.93, + "learning_rate": 0.00045040829145728643, + "loss": 0.3791, + "mlm_loss": 0.3791, + "step": 11580 + }, + { + "ep_loss": 0.0, + "epoch": 8.94, + "learning_rate": 0.00045009422110552766, + "loss": 0.3701, + "mlm_loss": 0.3701, + "step": 11590 + }, + { + "ep_loss": 0.0, + "epoch": 8.95, + "learning_rate": 0.0004497801507537689, + "loss": 0.374, + "mlm_loss": 0.374, + "step": 11600 + }, + { + "epoch": 8.95, + "eval_ep_loss": -2.2494945526123047, + "eval_loss": 0.35051652789115906, + "eval_mlm_loss": 0.35051652789115906, + "eval_runtime": 60.6153, + "eval_samples_per_second": 1152.663, + "eval_steps_per_second": 0.577, + "step": 11600 + }, + { + "ep_loss": 0.0, + "epoch": 8.95, + "learning_rate": 0.00044946608040201, + "loss": 0.3771, + "mlm_loss": 0.3771, + "step": 11610 + }, + { + "ep_loss": 0.0, + "epoch": 8.96, + "learning_rate": 0.00044915201005025124, + "loss": 0.3787, + "mlm_loss": 0.3787, + "step": 11620 + }, + { + "ep_loss": 0.0, + "epoch": 8.97, + "learning_rate": 0.00044883793969849247, + "loss": 0.3783, + "mlm_loss": 0.3783, + "step": 11630 + }, + { + "ep_loss": 0.0, + "epoch": 8.98, + "learning_rate": 0.0004485238693467337, + "loss": 0.3683, + "mlm_loss": 0.3683, + "step": 11640 + }, + { + "ep_loss": 0.0, + "epoch": 8.99, + "learning_rate": 0.00044820979899497487, + "loss": 0.3755, + "mlm_loss": 0.3755, + "step": 11650 + }, + { + "ep_loss": 0.0, + "epoch": 8.99, + "learning_rate": 0.0004478957286432161, + "loss": 0.3751, + "mlm_loss": 0.3751, + "step": 11660 + }, + { + "ep_loss": 0.0, + "epoch": 9.0, + "learning_rate": 0.0004475816582914573, + "loss": 0.3806, + "mlm_loss": 0.3806, + "step": 11670 + }, + { + "ep_loss": 0.0, + "epoch": 9.01, + "learning_rate": 0.0004472675879396985, + "loss": 0.3723, + "mlm_loss": 0.3723, + "step": 11680 + }, + { + "ep_loss": 0.0, + "epoch": 9.02, + "learning_rate": 0.0004469535175879397, + "loss": 0.3727, + "mlm_loss": 0.3727, + "step": 11690 + }, + { + "ep_loss": 0.0, + "epoch": 9.02, + "learning_rate": 0.00044663944723618095, + "loss": 0.3717, + "mlm_loss": 0.3717, + "step": 11700 + }, + { + "epoch": 9.02, + "eval_ep_loss": -2.543911933898926, + "eval_loss": 0.34973713755607605, + "eval_mlm_loss": 0.34973713755607605, + "eval_runtime": 62.1216, + "eval_samples_per_second": 1124.713, + "eval_steps_per_second": 0.563, + "step": 11700 + }, + { + "ep_loss": 0.0, + "epoch": 9.03, + "learning_rate": 0.0004463253768844221, + "loss": 0.3694, + "mlm_loss": 0.3694, + "step": 11710 + }, + { + "ep_loss": 0.0, + "epoch": 9.04, + "learning_rate": 0.0004460113065326633, + "loss": 0.3673, + "mlm_loss": 0.3673, + "step": 11720 + }, + { + "ep_loss": 0.0, + "epoch": 9.05, + "learning_rate": 0.00044569723618090453, + "loss": 0.3679, + "mlm_loss": 0.3679, + "step": 11730 + }, + { + "ep_loss": 0.0, + "epoch": 9.06, + "learning_rate": 0.0004453831658291457, + "loss": 0.3757, + "mlm_loss": 0.3757, + "step": 11740 + }, + { + "ep_loss": 0.0, + "epoch": 9.06, + "learning_rate": 0.00044506909547738693, + "loss": 0.3702, + "mlm_loss": 0.3702, + "step": 11750 + }, + { + "ep_loss": 0.0, + "epoch": 9.07, + "learning_rate": 0.00044475502512562816, + "loss": 0.3697, + "mlm_loss": 0.3697, + "step": 11760 + }, + { + "ep_loss": 0.0, + "epoch": 9.08, + "learning_rate": 0.0004444409547738694, + "loss": 0.3721, + "mlm_loss": 0.3721, + "step": 11770 + }, + { + "ep_loss": 0.0, + "epoch": 9.09, + "learning_rate": 0.00044412688442211056, + "loss": 0.3713, + "mlm_loss": 0.3713, + "step": 11780 + }, + { + "ep_loss": 0.0, + "epoch": 9.09, + "learning_rate": 0.0004438128140703518, + "loss": 0.3701, + "mlm_loss": 0.3701, + "step": 11790 + }, + { + "ep_loss": 0.0, + "epoch": 9.1, + "learning_rate": 0.000443498743718593, + "loss": 0.368, + "mlm_loss": 0.368, + "step": 11800 + }, + { + "epoch": 9.1, + "eval_ep_loss": -2.5209760665893555, + "eval_loss": 0.349372535943985, + "eval_mlm_loss": 0.349372535943985, + "eval_runtime": 58.6125, + "eval_samples_per_second": 1192.049, + "eval_steps_per_second": 0.597, + "step": 11800 + }, + { + "ep_loss": 0.0, + "epoch": 9.11, + "learning_rate": 0.00044318467336683414, + "loss": 0.3664, + "mlm_loss": 0.3664, + "step": 11810 + }, + { + "ep_loss": 0.0, + "epoch": 9.12, + "learning_rate": 0.00044287060301507537, + "loss": 0.3766, + "mlm_loss": 0.3766, + "step": 11820 + }, + { + "ep_loss": 0.0, + "epoch": 9.12, + "learning_rate": 0.0004425565326633166, + "loss": 0.3677, + "mlm_loss": 0.3677, + "step": 11830 + }, + { + "ep_loss": 0.0, + "epoch": 9.13, + "learning_rate": 0.00044224246231155777, + "loss": 0.3701, + "mlm_loss": 0.3701, + "step": 11840 + }, + { + "ep_loss": 0.0, + "epoch": 9.14, + "learning_rate": 0.000441928391959799, + "loss": 0.3789, + "mlm_loss": 0.3789, + "step": 11850 + }, + { + "ep_loss": 0.0, + "epoch": 9.15, + "learning_rate": 0.0004416143216080402, + "loss": 0.3716, + "mlm_loss": 0.3716, + "step": 11860 + }, + { + "ep_loss": 0.0, + "epoch": 9.16, + "learning_rate": 0.00044130025125628145, + "loss": 0.3672, + "mlm_loss": 0.3672, + "step": 11870 + }, + { + "ep_loss": 0.0, + "epoch": 9.16, + "learning_rate": 0.0004409861809045226, + "loss": 0.369, + "mlm_loss": 0.369, + "step": 11880 + }, + { + "ep_loss": 0.0, + "epoch": 9.17, + "learning_rate": 0.00044067211055276385, + "loss": 0.377, + "mlm_loss": 0.377, + "step": 11890 + }, + { + "ep_loss": 0.0, + "epoch": 9.18, + "learning_rate": 0.0004403580402010051, + "loss": 0.3692, + "mlm_loss": 0.3692, + "step": 11900 + }, + { + "epoch": 9.18, + "eval_ep_loss": -2.5095667839050293, + "eval_loss": 0.3459388017654419, + "eval_mlm_loss": 0.3459388017654419, + "eval_runtime": 59.1505, + "eval_samples_per_second": 1181.208, + "eval_steps_per_second": 0.592, + "step": 11900 + }, + { + "ep_loss": 0.0, + "epoch": 9.19, + "learning_rate": 0.0004400439698492462, + "loss": 0.3717, + "mlm_loss": 0.3717, + "step": 11910 + }, + { + "ep_loss": 0.0, + "epoch": 9.19, + "learning_rate": 0.00043972989949748743, + "loss": 0.3706, + "mlm_loss": 0.3706, + "step": 11920 + }, + { + "ep_loss": 0.0, + "epoch": 9.2, + "learning_rate": 0.00043941582914572866, + "loss": 0.3766, + "mlm_loss": 0.3766, + "step": 11930 + }, + { + "ep_loss": 0.0, + "epoch": 9.21, + "learning_rate": 0.00043910175879396983, + "loss": 0.3675, + "mlm_loss": 0.3675, + "step": 11940 + }, + { + "ep_loss": 0.0, + "epoch": 9.22, + "learning_rate": 0.00043878768844221106, + "loss": 0.3653, + "mlm_loss": 0.3653, + "step": 11950 + }, + { + "ep_loss": 0.0, + "epoch": 9.22, + "learning_rate": 0.0004384736180904523, + "loss": 0.3683, + "mlm_loss": 0.3683, + "step": 11960 + }, + { + "ep_loss": 0.0, + "epoch": 9.23, + "learning_rate": 0.00043815954773869346, + "loss": 0.3695, + "mlm_loss": 0.3695, + "step": 11970 + }, + { + "ep_loss": 0.0, + "epoch": 9.24, + "learning_rate": 0.0004378454773869347, + "loss": 0.3674, + "mlm_loss": 0.3674, + "step": 11980 + }, + { + "ep_loss": 0.0, + "epoch": 9.25, + "learning_rate": 0.0004375314070351759, + "loss": 0.3621, + "mlm_loss": 0.3621, + "step": 11990 + }, + { + "ep_loss": 0.0, + "epoch": 9.26, + "learning_rate": 0.00043721733668341715, + "loss": 0.3714, + "mlm_loss": 0.3714, + "step": 12000 + }, + { + "epoch": 9.26, + "eval_ep_loss": -2.6775925159454346, + "eval_loss": 0.3442252576351166, + "eval_mlm_loss": 0.3442252576351166, + "eval_runtime": 60.7149, + "eval_samples_per_second": 1150.773, + "eval_steps_per_second": 0.576, + "step": 12000 + }, + { + "ep_loss": 0.0, + "epoch": 9.26, + "learning_rate": 0.00043690326633165827, + "loss": 0.3706, + "mlm_loss": 0.3706, + "step": 12010 + }, + { + "ep_loss": 0.0, + "epoch": 9.27, + "learning_rate": 0.0004365891959798995, + "loss": 0.3642, + "mlm_loss": 0.3642, + "step": 12020 + }, + { + "ep_loss": 0.0, + "epoch": 9.28, + "learning_rate": 0.0004362751256281407, + "loss": 0.3663, + "mlm_loss": 0.3663, + "step": 12030 + }, + { + "ep_loss": 0.0, + "epoch": 9.29, + "learning_rate": 0.0004359610552763819, + "loss": 0.3743, + "mlm_loss": 0.3743, + "step": 12040 + }, + { + "ep_loss": 0.0, + "epoch": 9.29, + "learning_rate": 0.0004356469849246231, + "loss": 0.3701, + "mlm_loss": 0.3701, + "step": 12050 + }, + { + "ep_loss": 0.0, + "epoch": 9.3, + "learning_rate": 0.00043533291457286435, + "loss": 0.3713, + "mlm_loss": 0.3713, + "step": 12060 + }, + { + "ep_loss": 0.0, + "epoch": 9.31, + "learning_rate": 0.0004350188442211055, + "loss": 0.3684, + "mlm_loss": 0.3684, + "step": 12070 + }, + { + "ep_loss": 0.0, + "epoch": 9.32, + "learning_rate": 0.00043470477386934675, + "loss": 0.3603, + "mlm_loss": 0.3603, + "step": 12080 + }, + { + "ep_loss": 0.0, + "epoch": 9.33, + "learning_rate": 0.000434390703517588, + "loss": 0.3599, + "mlm_loss": 0.3599, + "step": 12090 + }, + { + "ep_loss": 0.0, + "epoch": 9.33, + "learning_rate": 0.00043407663316582916, + "loss": 0.3653, + "mlm_loss": 0.3653, + "step": 12100 + }, + { + "epoch": 9.33, + "eval_ep_loss": -2.689305543899536, + "eval_loss": 0.34572362899780273, + "eval_mlm_loss": 0.34572362899780273, + "eval_runtime": 60.5622, + "eval_samples_per_second": 1153.673, + "eval_steps_per_second": 0.578, + "step": 12100 + }, + { + "ep_loss": 0.0, + "epoch": 9.34, + "learning_rate": 0.00043376256281407033, + "loss": 0.3746, + "mlm_loss": 0.3746, + "step": 12110 + }, + { + "ep_loss": 0.0, + "epoch": 9.35, + "learning_rate": 0.00043344849246231156, + "loss": 0.3707, + "mlm_loss": 0.3707, + "step": 12120 + }, + { + "ep_loss": 0.0, + "epoch": 9.36, + "learning_rate": 0.0004331344221105528, + "loss": 0.375, + "mlm_loss": 0.375, + "step": 12130 + }, + { + "ep_loss": 0.0, + "epoch": 9.36, + "learning_rate": 0.00043282035175879396, + "loss": 0.3675, + "mlm_loss": 0.3675, + "step": 12140 + }, + { + "ep_loss": 0.0, + "epoch": 9.37, + "learning_rate": 0.0004325062814070352, + "loss": 0.3683, + "mlm_loss": 0.3683, + "step": 12150 + }, + { + "ep_loss": 0.0, + "epoch": 9.38, + "learning_rate": 0.0004321922110552764, + "loss": 0.3646, + "mlm_loss": 0.3646, + "step": 12160 + }, + { + "ep_loss": 0.0, + "epoch": 9.39, + "learning_rate": 0.0004318781407035176, + "loss": 0.37, + "mlm_loss": 0.37, + "step": 12170 + }, + { + "ep_loss": 0.0, + "epoch": 9.39, + "learning_rate": 0.0004315640703517588, + "loss": 0.3701, + "mlm_loss": 0.3701, + "step": 12180 + }, + { + "ep_loss": 0.0, + "epoch": 9.4, + "learning_rate": 0.00043125000000000005, + "loss": 0.364, + "mlm_loss": 0.364, + "step": 12190 + }, + { + "ep_loss": 0.0, + "epoch": 9.41, + "learning_rate": 0.00043093592964824117, + "loss": 0.3639, + "mlm_loss": 0.3639, + "step": 12200 + }, + { + "epoch": 9.41, + "eval_ep_loss": -2.2502214908599854, + "eval_loss": 0.3424089848995209, + "eval_mlm_loss": 0.3424089848995209, + "eval_runtime": 60.2107, + "eval_samples_per_second": 1160.408, + "eval_steps_per_second": 0.581, + "step": 12200 + }, + { + "ep_loss": 0.0, + "epoch": 9.42, + "learning_rate": 0.0004306218592964824, + "loss": 0.3668, + "mlm_loss": 0.3668, + "step": 12210 + }, + { + "ep_loss": 0.0, + "epoch": 9.43, + "learning_rate": 0.0004303077889447236, + "loss": 0.3643, + "mlm_loss": 0.3643, + "step": 12220 + }, + { + "ep_loss": 0.0, + "epoch": 9.43, + "learning_rate": 0.0004299937185929648, + "loss": 0.3682, + "mlm_loss": 0.3682, + "step": 12230 + }, + { + "ep_loss": 0.0, + "epoch": 9.44, + "learning_rate": 0.00042971105527638193, + "loss": 0.3682, + "mlm_loss": 0.3682, + "step": 12240 + }, + { + "ep_loss": 0.0, + "epoch": 9.45, + "learning_rate": 0.00042939698492462316, + "loss": 0.3663, + "mlm_loss": 0.3663, + "step": 12250 + }, + { + "ep_loss": 0.0, + "epoch": 9.46, + "learning_rate": 0.0004291143216080402, + "loss": 0.3633, + "mlm_loss": 0.3633, + "step": 12260 + }, + { + "ep_loss": 0.0, + "epoch": 9.46, + "learning_rate": 0.0004288002512562814, + "loss": 0.3657, + "mlm_loss": 0.3657, + "step": 12270 + }, + { + "ep_loss": 0.0, + "epoch": 9.47, + "learning_rate": 0.00042848618090452265, + "loss": 0.3608, + "mlm_loss": 0.3608, + "step": 12280 + }, + { + "ep_loss": 0.0, + "epoch": 9.48, + "learning_rate": 0.0004281721105527638, + "loss": 0.3674, + "mlm_loss": 0.3674, + "step": 12290 + }, + { + "ep_loss": 0.0, + "epoch": 9.49, + "learning_rate": 0.00042785804020100505, + "loss": 0.3701, + "mlm_loss": 0.3701, + "step": 12300 + }, + { + "epoch": 9.49, + "eval_ep_loss": -2.4860949516296387, + "eval_loss": 0.34168559312820435, + "eval_mlm_loss": 0.34168559312820435, + "eval_runtime": 60.0999, + "eval_samples_per_second": 1162.548, + "eval_steps_per_second": 0.582, + "step": 12300 + }, + { + "ep_loss": 0.0, + "epoch": 9.49, + "learning_rate": 0.0004275439698492463, + "loss": 0.3626, + "mlm_loss": 0.3626, + "step": 12310 + }, + { + "ep_loss": 0.0, + "epoch": 9.5, + "learning_rate": 0.00042722989949748745, + "loss": 0.3667, + "mlm_loss": 0.3667, + "step": 12320 + }, + { + "ep_loss": 0.0, + "epoch": 9.51, + "learning_rate": 0.0004269158291457286, + "loss": 0.3564, + "mlm_loss": 0.3564, + "step": 12330 + }, + { + "ep_loss": 0.0, + "epoch": 9.52, + "learning_rate": 0.00042660175879396985, + "loss": 0.3653, + "mlm_loss": 0.3653, + "step": 12340 + }, + { + "ep_loss": 0.0, + "epoch": 9.53, + "learning_rate": 0.00042628768844221103, + "loss": 0.3696, + "mlm_loss": 0.3696, + "step": 12350 + }, + { + "ep_loss": 0.0, + "epoch": 9.53, + "learning_rate": 0.00042597361809045226, + "loss": 0.3653, + "mlm_loss": 0.3653, + "step": 12360 + }, + { + "ep_loss": 0.0, + "epoch": 9.54, + "learning_rate": 0.0004256595477386935, + "loss": 0.3645, + "mlm_loss": 0.3645, + "step": 12370 + }, + { + "ep_loss": 0.0, + "epoch": 9.55, + "learning_rate": 0.0004253454773869347, + "loss": 0.368, + "mlm_loss": 0.368, + "step": 12380 + }, + { + "ep_loss": 0.0, + "epoch": 9.56, + "learning_rate": 0.0004250314070351759, + "loss": 0.3637, + "mlm_loss": 0.3637, + "step": 12390 + }, + { + "ep_loss": 0.0, + "epoch": 9.56, + "learning_rate": 0.0004247173366834171, + "loss": 0.3694, + "mlm_loss": 0.3694, + "step": 12400 + }, + { + "epoch": 9.56, + "eval_ep_loss": -2.249842405319214, + "eval_loss": 0.3413134813308716, + "eval_mlm_loss": 0.3413134813308716, + "eval_runtime": 60.5795, + "eval_samples_per_second": 1153.344, + "eval_steps_per_second": 0.578, + "step": 12400 + }, + { + "ep_loss": 0.0, + "epoch": 9.57, + "learning_rate": 0.00042440326633165834, + "loss": 0.3612, + "mlm_loss": 0.3612, + "step": 12410 + }, + { + "ep_loss": 0.0, + "epoch": 9.58, + "learning_rate": 0.0004240891959798995, + "loss": 0.3594, + "mlm_loss": 0.3594, + "step": 12420 + }, + { + "ep_loss": 0.0, + "epoch": 9.59, + "learning_rate": 0.0004237751256281407, + "loss": 0.3705, + "mlm_loss": 0.3705, + "step": 12430 + }, + { + "ep_loss": 0.0, + "epoch": 9.6, + "learning_rate": 0.0004234610552763819, + "loss": 0.3608, + "mlm_loss": 0.3608, + "step": 12440 + }, + { + "ep_loss": 0.0, + "epoch": 9.6, + "learning_rate": 0.0004231469849246231, + "loss": 0.3611, + "mlm_loss": 0.3611, + "step": 12450 + }, + { + "ep_loss": 0.0, + "epoch": 9.61, + "learning_rate": 0.0004228329145728643, + "loss": 0.3615, + "mlm_loss": 0.3615, + "step": 12460 + }, + { + "ep_loss": 0.0, + "epoch": 9.62, + "learning_rate": 0.00042251884422110555, + "loss": 0.3641, + "mlm_loss": 0.3641, + "step": 12470 + }, + { + "ep_loss": 0.0, + "epoch": 9.63, + "learning_rate": 0.0004222047738693467, + "loss": 0.3664, + "mlm_loss": 0.3664, + "step": 12480 + }, + { + "ep_loss": 0.0, + "epoch": 9.63, + "learning_rate": 0.00042189070351758795, + "loss": 0.3612, + "mlm_loss": 0.3612, + "step": 12490 + }, + { + "ep_loss": 0.0, + "epoch": 9.64, + "learning_rate": 0.0004215766331658292, + "loss": 0.37, + "mlm_loss": 0.37, + "step": 12500 + }, + { + "epoch": 9.64, + "eval_ep_loss": -2.4926164150238037, + "eval_loss": 0.3400452435016632, + "eval_mlm_loss": 0.3400452435016632, + "eval_runtime": 62.1559, + "eval_samples_per_second": 1124.093, + "eval_steps_per_second": 0.563, + "step": 12500 + }, + { + "ep_loss": 0.0, + "epoch": 9.65, + "learning_rate": 0.0004212625628140704, + "loss": 0.3693, + "mlm_loss": 0.3693, + "step": 12510 + }, + { + "ep_loss": 0.0, + "epoch": 9.66, + "learning_rate": 0.0004209484924623116, + "loss": 0.3623, + "mlm_loss": 0.3623, + "step": 12520 + }, + { + "ep_loss": 0.0, + "epoch": 9.66, + "learning_rate": 0.00042063442211055275, + "loss": 0.3637, + "mlm_loss": 0.3637, + "step": 12530 + }, + { + "ep_loss": 0.0, + "epoch": 9.67, + "learning_rate": 0.000420320351758794, + "loss": 0.363, + "mlm_loss": 0.363, + "step": 12540 + }, + { + "ep_loss": 0.0, + "epoch": 9.68, + "learning_rate": 0.00042000628140703516, + "loss": 0.355, + "mlm_loss": 0.355, + "step": 12550 + }, + { + "ep_loss": 0.0, + "epoch": 9.69, + "learning_rate": 0.0004196922110552764, + "loss": 0.3651, + "mlm_loss": 0.3651, + "step": 12560 + }, + { + "ep_loss": 0.0, + "epoch": 9.7, + "learning_rate": 0.0004193781407035176, + "loss": 0.3593, + "mlm_loss": 0.3593, + "step": 12570 + }, + { + "ep_loss": 0.0, + "epoch": 9.7, + "learning_rate": 0.0004190640703517588, + "loss": 0.3591, + "mlm_loss": 0.3591, + "step": 12580 + }, + { + "ep_loss": 0.0, + "epoch": 9.71, + "learning_rate": 0.00041875, + "loss": 0.3585, + "mlm_loss": 0.3585, + "step": 12590 + }, + { + "ep_loss": 0.0, + "epoch": 9.72, + "learning_rate": 0.00041843592964824124, + "loss": 0.3618, + "mlm_loss": 0.3618, + "step": 12600 + }, + { + "epoch": 9.72, + "eval_ep_loss": -2.574556827545166, + "eval_loss": 0.33879169821739197, + "eval_mlm_loss": 0.33879169821739197, + "eval_runtime": 60.5312, + "eval_samples_per_second": 1154.265, + "eval_steps_per_second": 0.578, + "step": 12600 + }, + { + "ep_loss": 0.0, + "epoch": 9.73, + "learning_rate": 0.0004181218592964824, + "loss": 0.3587, + "mlm_loss": 0.3587, + "step": 12610 + }, + { + "ep_loss": 0.0, + "epoch": 9.73, + "learning_rate": 0.0004178077889447236, + "loss": 0.3668, + "mlm_loss": 0.3668, + "step": 12620 + }, + { + "ep_loss": 0.0, + "epoch": 9.74, + "learning_rate": 0.0004174937185929648, + "loss": 0.3616, + "mlm_loss": 0.3616, + "step": 12630 + }, + { + "ep_loss": 0.0, + "epoch": 9.75, + "learning_rate": 0.00041717964824120605, + "loss": 0.3626, + "mlm_loss": 0.3626, + "step": 12640 + }, + { + "ep_loss": 0.0, + "epoch": 9.76, + "learning_rate": 0.0004168655778894472, + "loss": 0.3514, + "mlm_loss": 0.3514, + "step": 12650 + }, + { + "ep_loss": 0.0, + "epoch": 9.76, + "learning_rate": 0.00041655150753768845, + "loss": 0.3605, + "mlm_loss": 0.3605, + "step": 12660 + }, + { + "ep_loss": 0.0, + "epoch": 9.77, + "learning_rate": 0.0004162374371859297, + "loss": 0.3591, + "mlm_loss": 0.3591, + "step": 12670 + }, + { + "ep_loss": 0.0, + "epoch": 9.78, + "learning_rate": 0.00041592336683417085, + "loss": 0.3642, + "mlm_loss": 0.3642, + "step": 12680 + }, + { + "ep_loss": 0.0, + "epoch": 9.79, + "learning_rate": 0.0004156092964824121, + "loss": 0.3683, + "mlm_loss": 0.3683, + "step": 12690 + }, + { + "ep_loss": 0.0, + "epoch": 9.8, + "learning_rate": 0.0004152952261306533, + "loss": 0.3614, + "mlm_loss": 0.3614, + "step": 12700 + }, + { + "epoch": 9.8, + "eval_ep_loss": -2.610844135284424, + "eval_loss": 0.33695146441459656, + "eval_mlm_loss": 0.33695146441459656, + "eval_runtime": 59.2376, + "eval_samples_per_second": 1179.471, + "eval_steps_per_second": 0.591, + "step": 12700 + }, + { + "ep_loss": 0.0, + "epoch": 9.8, + "learning_rate": 0.0004149811557788945, + "loss": 0.3551, + "mlm_loss": 0.3551, + "step": 12710 + }, + { + "ep_loss": 0.0, + "epoch": 9.81, + "learning_rate": 0.00041466708542713565, + "loss": 0.357, + "mlm_loss": 0.357, + "step": 12720 + }, + { + "ep_loss": 0.0, + "epoch": 9.82, + "learning_rate": 0.0004143530150753769, + "loss": 0.3631, + "mlm_loss": 0.3631, + "step": 12730 + }, + { + "ep_loss": 0.0, + "epoch": 9.83, + "learning_rate": 0.00041403894472361806, + "loss": 0.3603, + "mlm_loss": 0.3603, + "step": 12740 + }, + { + "ep_loss": 0.0, + "epoch": 9.83, + "learning_rate": 0.0004137248743718593, + "loss": 0.3624, + "mlm_loss": 0.3624, + "step": 12750 + }, + { + "ep_loss": 0.0, + "epoch": 9.84, + "learning_rate": 0.0004134108040201005, + "loss": 0.3619, + "mlm_loss": 0.3619, + "step": 12760 + }, + { + "ep_loss": 0.0, + "epoch": 9.85, + "learning_rate": 0.00041309673366834174, + "loss": 0.3623, + "mlm_loss": 0.3623, + "step": 12770 + }, + { + "ep_loss": 0.0, + "epoch": 9.86, + "learning_rate": 0.0004127826633165829, + "loss": 0.3615, + "mlm_loss": 0.3615, + "step": 12780 + }, + { + "ep_loss": 0.0, + "epoch": 9.87, + "learning_rate": 0.00041246859296482414, + "loss": 0.3569, + "mlm_loss": 0.3569, + "step": 12790 + }, + { + "ep_loss": 0.0, + "epoch": 9.87, + "learning_rate": 0.00041215452261306537, + "loss": 0.3561, + "mlm_loss": 0.3561, + "step": 12800 + }, + { + "epoch": 9.87, + "eval_ep_loss": -2.4833860397338867, + "eval_loss": 0.3353259861469269, + "eval_mlm_loss": 0.3353259861469269, + "eval_runtime": 59.1455, + "eval_samples_per_second": 1181.307, + "eval_steps_per_second": 0.592, + "step": 12800 + }, + { + "ep_loss": 0.0, + "epoch": 9.88, + "learning_rate": 0.00041184045226130654, + "loss": 0.362, + "mlm_loss": 0.362, + "step": 12810 + }, + { + "ep_loss": 0.0, + "epoch": 9.89, + "learning_rate": 0.0004115263819095477, + "loss": 0.3589, + "mlm_loss": 0.3589, + "step": 12820 + }, + { + "ep_loss": 0.0, + "epoch": 9.9, + "learning_rate": 0.00041121231155778895, + "loss": 0.3614, + "mlm_loss": 0.3614, + "step": 12830 + }, + { + "ep_loss": 0.0, + "epoch": 9.9, + "learning_rate": 0.0004108982412060301, + "loss": 0.3534, + "mlm_loss": 0.3534, + "step": 12840 + }, + { + "ep_loss": 0.0, + "epoch": 9.91, + "learning_rate": 0.00041058417085427135, + "loss": 0.36, + "mlm_loss": 0.36, + "step": 12850 + }, + { + "ep_loss": 0.0, + "epoch": 9.92, + "learning_rate": 0.0004102701005025126, + "loss": 0.3579, + "mlm_loss": 0.3579, + "step": 12860 + }, + { + "ep_loss": 0.0, + "epoch": 9.93, + "learning_rate": 0.0004099560301507538, + "loss": 0.358, + "mlm_loss": 0.358, + "step": 12870 + }, + { + "ep_loss": 0.0, + "epoch": 9.93, + "learning_rate": 0.000409641959798995, + "loss": 0.3568, + "mlm_loss": 0.3568, + "step": 12880 + }, + { + "ep_loss": 0.0, + "epoch": 9.94, + "learning_rate": 0.0004093278894472362, + "loss": 0.3596, + "mlm_loss": 0.3596, + "step": 12890 + }, + { + "ep_loss": 0.0, + "epoch": 9.95, + "learning_rate": 0.00040901381909547743, + "loss": 0.356, + "mlm_loss": 0.356, + "step": 12900 + }, + { + "epoch": 9.95, + "eval_ep_loss": -2.3914308547973633, + "eval_loss": 0.33489990234375, + "eval_mlm_loss": 0.33489990234375, + "eval_runtime": 60.1749, + "eval_samples_per_second": 1161.099, + "eval_steps_per_second": 0.582, + "step": 12900 + }, + { + "ep_loss": 0.0, + "epoch": 9.96, + "learning_rate": 0.0004086997487437186, + "loss": 0.3613, + "mlm_loss": 0.3613, + "step": 12910 + }, + { + "ep_loss": 0.0, + "epoch": 9.97, + "learning_rate": 0.0004083856783919598, + "loss": 0.3561, + "mlm_loss": 0.3561, + "step": 12920 + }, + { + "ep_loss": 0.0, + "epoch": 9.97, + "learning_rate": 0.000408071608040201, + "loss": 0.354, + "mlm_loss": 0.354, + "step": 12930 + }, + { + "ep_loss": 0.0, + "epoch": 9.98, + "learning_rate": 0.0004077575376884422, + "loss": 0.36, + "mlm_loss": 0.36, + "step": 12940 + }, + { + "ep_loss": 0.0, + "epoch": 9.99, + "learning_rate": 0.0004074434673366834, + "loss": 0.3583, + "mlm_loss": 0.3583, + "step": 12950 + }, + { + "ep_loss": 0.0, + "epoch": 10.0, + "learning_rate": 0.00040712939698492464, + "loss": 0.3608, + "mlm_loss": 0.3608, + "step": 12960 + }, + { + "ep_loss": 0.0, + "epoch": 10.0, + "learning_rate": 0.0004068153266331658, + "loss": 0.3596, + "mlm_loss": 0.3596, + "step": 12970 + }, + { + "ep_loss": 0.0, + "epoch": 10.01, + "learning_rate": 0.00040650125628140704, + "loss": 0.3559, + "mlm_loss": 0.3559, + "step": 12980 + }, + { + "ep_loss": 0.0, + "epoch": 10.02, + "learning_rate": 0.00040618718592964827, + "loss": 0.3582, + "mlm_loss": 0.3582, + "step": 12990 + }, + { + "ep_loss": 0.0, + "epoch": 10.03, + "learning_rate": 0.0004058731155778895, + "loss": 0.3516, + "mlm_loss": 0.3516, + "step": 13000 + }, + { + "epoch": 10.03, + "eval_ep_loss": -2.4109866619110107, + "eval_loss": 0.3338833749294281, + "eval_mlm_loss": 0.3338833749294281, + "eval_runtime": 59.7838, + "eval_samples_per_second": 1168.695, + "eval_steps_per_second": 0.585, + "step": 13000 + }, + { + "ep_loss": 0.0, + "epoch": 10.03, + "learning_rate": 0.00040555904522613067, + "loss": 0.3513, + "mlm_loss": 0.3513, + "step": 13010 + }, + { + "ep_loss": 0.0, + "epoch": 10.04, + "learning_rate": 0.00040524497487437185, + "loss": 0.3533, + "mlm_loss": 0.3533, + "step": 13020 + }, + { + "ep_loss": 0.0, + "epoch": 10.05, + "learning_rate": 0.0004049309045226131, + "loss": 0.3529, + "mlm_loss": 0.3529, + "step": 13030 + }, + { + "ep_loss": 0.0, + "epoch": 10.06, + "learning_rate": 0.00040461683417085425, + "loss": 0.3555, + "mlm_loss": 0.3555, + "step": 13040 + }, + { + "ep_loss": 0.0, + "epoch": 10.07, + "learning_rate": 0.0004043027638190955, + "loss": 0.3552, + "mlm_loss": 0.3552, + "step": 13050 + }, + { + "ep_loss": 0.0, + "epoch": 10.07, + "learning_rate": 0.0004039886934673367, + "loss": 0.3498, + "mlm_loss": 0.3498, + "step": 13060 + }, + { + "ep_loss": 0.0, + "epoch": 10.08, + "learning_rate": 0.0004036746231155779, + "loss": 0.3553, + "mlm_loss": 0.3553, + "step": 13070 + }, + { + "ep_loss": 0.0, + "epoch": 10.09, + "learning_rate": 0.0004033605527638191, + "loss": 0.3486, + "mlm_loss": 0.3486, + "step": 13080 + }, + { + "ep_loss": 0.0, + "epoch": 10.1, + "learning_rate": 0.00040304648241206033, + "loss": 0.3455, + "mlm_loss": 0.3455, + "step": 13090 + }, + { + "ep_loss": 0.0, + "epoch": 10.1, + "learning_rate": 0.0004027324120603015, + "loss": 0.358, + "mlm_loss": 0.358, + "step": 13100 + }, + { + "epoch": 10.1, + "eval_ep_loss": -2.4457526206970215, + "eval_loss": 0.33316919207572937, + "eval_mlm_loss": 0.33316919207572937, + "eval_runtime": 61.6405, + "eval_samples_per_second": 1133.492, + "eval_steps_per_second": 0.568, + "step": 13100 + }, + { + "ep_loss": 0.0, + "epoch": 10.11, + "learning_rate": 0.00040241834170854274, + "loss": 0.3521, + "mlm_loss": 0.3521, + "step": 13110 + }, + { + "ep_loss": 0.0, + "epoch": 10.12, + "learning_rate": 0.0004021042713567839, + "loss": 0.3618, + "mlm_loss": 0.3618, + "step": 13120 + }, + { + "ep_loss": 0.0, + "epoch": 10.13, + "learning_rate": 0.00040179020100502514, + "loss": 0.3608, + "mlm_loss": 0.3608, + "step": 13130 + }, + { + "ep_loss": 0.0, + "epoch": 10.13, + "learning_rate": 0.0004014761306532663, + "loss": 0.3554, + "mlm_loss": 0.3554, + "step": 13140 + }, + { + "ep_loss": 0.0, + "epoch": 10.14, + "learning_rate": 0.00040116206030150754, + "loss": 0.3554, + "mlm_loss": 0.3554, + "step": 13150 + }, + { + "ep_loss": 0.0, + "epoch": 10.15, + "learning_rate": 0.00040084798994974877, + "loss": 0.3482, + "mlm_loss": 0.3482, + "step": 13160 + }, + { + "ep_loss": 0.0, + "epoch": 10.16, + "learning_rate": 0.00040053391959798994, + "loss": 0.3577, + "mlm_loss": 0.3577, + "step": 13170 + }, + { + "ep_loss": 0.0, + "epoch": 10.17, + "learning_rate": 0.00040021984924623117, + "loss": 0.3448, + "mlm_loss": 0.3448, + "step": 13180 + }, + { + "ep_loss": 0.0, + "epoch": 10.17, + "learning_rate": 0.0003999057788944724, + "loss": 0.3488, + "mlm_loss": 0.3488, + "step": 13190 + }, + { + "ep_loss": 0.0, + "epoch": 10.18, + "learning_rate": 0.00039959170854271357, + "loss": 0.3552, + "mlm_loss": 0.3552, + "step": 13200 + }, + { + "epoch": 10.18, + "eval_ep_loss": -2.5752882957458496, + "eval_loss": 0.33142197132110596, + "eval_mlm_loss": 0.33142197132110596, + "eval_runtime": 58.9677, + "eval_samples_per_second": 1184.87, + "eval_steps_per_second": 0.594, + "step": 13200 + }, + { + "ep_loss": 0.0, + "epoch": 10.19, + "learning_rate": 0.0003992776381909548, + "loss": 0.3443, + "mlm_loss": 0.3443, + "step": 13210 + }, + { + "ep_loss": 0.0, + "epoch": 10.2, + "learning_rate": 0.000398963567839196, + "loss": 0.3487, + "mlm_loss": 0.3487, + "step": 13220 + }, + { + "ep_loss": 0.0, + "epoch": 10.2, + "learning_rate": 0.00039864949748743715, + "loss": 0.3517, + "mlm_loss": 0.3517, + "step": 13230 + }, + { + "ep_loss": 0.0, + "epoch": 10.21, + "learning_rate": 0.0003983354271356784, + "loss": 0.3544, + "mlm_loss": 0.3544, + "step": 13240 + }, + { + "ep_loss": 0.0, + "epoch": 10.22, + "learning_rate": 0.0003980213567839196, + "loss": 0.355, + "mlm_loss": 0.355, + "step": 13250 + }, + { + "ep_loss": 0.0, + "epoch": 10.23, + "learning_rate": 0.00039770728643216083, + "loss": 0.3467, + "mlm_loss": 0.3467, + "step": 13260 + }, + { + "ep_loss": 0.0, + "epoch": 10.24, + "learning_rate": 0.000397393216080402, + "loss": 0.355, + "mlm_loss": 0.355, + "step": 13270 + }, + { + "ep_loss": 0.0, + "epoch": 10.24, + "learning_rate": 0.00039707914572864323, + "loss": 0.3498, + "mlm_loss": 0.3498, + "step": 13280 + }, + { + "ep_loss": 0.0, + "epoch": 10.25, + "learning_rate": 0.00039676507537688446, + "loss": 0.3507, + "mlm_loss": 0.3507, + "step": 13290 + }, + { + "ep_loss": 0.0, + "epoch": 10.26, + "learning_rate": 0.00039645100502512564, + "loss": 0.3515, + "mlm_loss": 0.3515, + "step": 13300 + }, + { + "epoch": 10.26, + "eval_ep_loss": -2.6309657096862793, + "eval_loss": 0.3306918740272522, + "eval_mlm_loss": 0.3306918740272522, + "eval_runtime": 60.495, + "eval_samples_per_second": 1154.954, + "eval_steps_per_second": 0.579, + "step": 13300 + }, + { + "ep_loss": 0.0, + "epoch": 10.27, + "learning_rate": 0.00039613693467336686, + "loss": 0.3476, + "mlm_loss": 0.3476, + "step": 13310 + }, + { + "ep_loss": 0.0, + "epoch": 10.27, + "learning_rate": 0.00039582286432160804, + "loss": 0.3522, + "mlm_loss": 0.3522, + "step": 13320 + }, + { + "ep_loss": 0.0, + "epoch": 10.28, + "learning_rate": 0.0003955087939698492, + "loss": 0.3492, + "mlm_loss": 0.3492, + "step": 13330 + }, + { + "ep_loss": 0.0, + "epoch": 10.29, + "learning_rate": 0.00039519472361809044, + "loss": 0.3475, + "mlm_loss": 0.3475, + "step": 13340 + }, + { + "ep_loss": 0.0, + "epoch": 10.3, + "learning_rate": 0.00039488065326633167, + "loss": 0.3513, + "mlm_loss": 0.3513, + "step": 13350 + }, + { + "ep_loss": 0.0, + "epoch": 10.3, + "learning_rate": 0.00039456658291457284, + "loss": 0.3504, + "mlm_loss": 0.3504, + "step": 13360 + }, + { + "ep_loss": 0.0, + "epoch": 10.31, + "learning_rate": 0.00039425251256281407, + "loss": 0.3467, + "mlm_loss": 0.3467, + "step": 13370 + }, + { + "ep_loss": 0.0, + "epoch": 10.32, + "learning_rate": 0.0003939384422110553, + "loss": 0.3485, + "mlm_loss": 0.3485, + "step": 13380 + }, + { + "ep_loss": 0.0, + "epoch": 10.33, + "learning_rate": 0.0003936243718592965, + "loss": 0.354, + "mlm_loss": 0.354, + "step": 13390 + }, + { + "ep_loss": 0.0, + "epoch": 10.34, + "learning_rate": 0.0003933103015075377, + "loss": 0.3577, + "mlm_loss": 0.3577, + "step": 13400 + }, + { + "epoch": 10.34, + "eval_ep_loss": -2.679708957672119, + "eval_loss": 0.3291432559490204, + "eval_mlm_loss": 0.3291432559490204, + "eval_runtime": 59.1321, + "eval_samples_per_second": 1181.575, + "eval_steps_per_second": 0.592, + "step": 13400 + }, + { + "ep_loss": 0.0, + "epoch": 10.34, + "learning_rate": 0.00039299623115577893, + "loss": 0.3521, + "mlm_loss": 0.3521, + "step": 13410 + }, + { + "ep_loss": 0.0, + "epoch": 10.35, + "learning_rate": 0.0003926821608040201, + "loss": 0.3495, + "mlm_loss": 0.3495, + "step": 13420 + }, + { + "ep_loss": 0.0, + "epoch": 10.36, + "learning_rate": 0.0003923680904522613, + "loss": 0.3464, + "mlm_loss": 0.3464, + "step": 13430 + }, + { + "ep_loss": 0.0, + "epoch": 10.37, + "learning_rate": 0.0003920540201005025, + "loss": 0.3493, + "mlm_loss": 0.3493, + "step": 13440 + }, + { + "ep_loss": 0.0, + "epoch": 10.37, + "learning_rate": 0.00039173994974874373, + "loss": 0.351, + "mlm_loss": 0.351, + "step": 13450 + }, + { + "ep_loss": 0.0, + "epoch": 10.38, + "learning_rate": 0.0003914258793969849, + "loss": 0.3526, + "mlm_loss": 0.3526, + "step": 13460 + }, + { + "ep_loss": 0.0, + "epoch": 10.39, + "learning_rate": 0.00039111180904522613, + "loss": 0.3517, + "mlm_loss": 0.3517, + "step": 13470 + }, + { + "ep_loss": 0.0, + "epoch": 10.4, + "learning_rate": 0.00039079773869346736, + "loss": 0.3541, + "mlm_loss": 0.3541, + "step": 13480 + }, + { + "ep_loss": 0.0, + "epoch": 10.4, + "learning_rate": 0.0003904836683417086, + "loss": 0.35, + "mlm_loss": 0.35, + "step": 13490 + }, + { + "ep_loss": 0.0, + "epoch": 10.41, + "learning_rate": 0.00039016959798994976, + "loss": 0.3543, + "mlm_loss": 0.3543, + "step": 13500 + }, + { + "epoch": 10.41, + "eval_ep_loss": -2.5294482707977295, + "eval_loss": 0.3268160820007324, + "eval_mlm_loss": 0.3268160820007324, + "eval_runtime": 59.814, + "eval_samples_per_second": 1168.105, + "eval_steps_per_second": 0.585, + "step": 13500 + }, + { + "ep_loss": 0.0, + "epoch": 10.42, + "learning_rate": 0.000389855527638191, + "loss": 0.3521, + "mlm_loss": 0.3521, + "step": 13510 + }, + { + "ep_loss": 0.0, + "epoch": 10.43, + "learning_rate": 0.00038954145728643217, + "loss": 0.3447, + "mlm_loss": 0.3447, + "step": 13520 + }, + { + "ep_loss": 0.0, + "epoch": 10.44, + "learning_rate": 0.00038922738693467334, + "loss": 0.3547, + "mlm_loss": 0.3547, + "step": 13530 + }, + { + "ep_loss": 0.0, + "epoch": 10.44, + "learning_rate": 0.00038891331658291457, + "loss": 0.3457, + "mlm_loss": 0.3457, + "step": 13540 + }, + { + "ep_loss": 0.0, + "epoch": 10.45, + "learning_rate": 0.0003885992462311558, + "loss": 0.3451, + "mlm_loss": 0.3451, + "step": 13550 + }, + { + "ep_loss": 0.0, + "epoch": 10.46, + "learning_rate": 0.00038828517587939697, + "loss": 0.3447, + "mlm_loss": 0.3447, + "step": 13560 + }, + { + "ep_loss": 0.0, + "epoch": 10.47, + "learning_rate": 0.0003879711055276382, + "loss": 0.3483, + "mlm_loss": 0.3483, + "step": 13570 + }, + { + "ep_loss": 0.0, + "epoch": 10.47, + "learning_rate": 0.0003876570351758794, + "loss": 0.3505, + "mlm_loss": 0.3505, + "step": 13580 + }, + { + "ep_loss": 0.0, + "epoch": 10.48, + "learning_rate": 0.0003873429648241206, + "loss": 0.3502, + "mlm_loss": 0.3502, + "step": 13590 + }, + { + "ep_loss": 0.0, + "epoch": 10.49, + "learning_rate": 0.00038702889447236183, + "loss": 0.3503, + "mlm_loss": 0.3503, + "step": 13600 + }, + { + "epoch": 10.49, + "eval_ep_loss": -2.7712888717651367, + "eval_loss": 0.32662275433540344, + "eval_mlm_loss": 0.32662275433540344, + "eval_runtime": 64.1344, + "eval_samples_per_second": 1089.415, + "eval_steps_per_second": 0.546, + "step": 13600 + }, + { + "ep_loss": 0.0, + "epoch": 10.5, + "learning_rate": 0.00038671482412060306, + "loss": 0.3536, + "mlm_loss": 0.3536, + "step": 13610 + }, + { + "ep_loss": 0.0, + "epoch": 10.51, + "learning_rate": 0.00038640075376884423, + "loss": 0.3567, + "mlm_loss": 0.3567, + "step": 13620 + }, + { + "ep_loss": 0.0, + "epoch": 10.51, + "learning_rate": 0.0003860866834170854, + "loss": 0.3498, + "mlm_loss": 0.3498, + "step": 13630 + }, + { + "ep_loss": 0.0, + "epoch": 10.52, + "learning_rate": 0.00038577261306532663, + "loss": 0.3519, + "mlm_loss": 0.3519, + "step": 13640 + }, + { + "ep_loss": 0.0, + "epoch": 10.53, + "learning_rate": 0.00038545854271356786, + "loss": 0.3476, + "mlm_loss": 0.3476, + "step": 13650 + }, + { + "ep_loss": 0.0, + "epoch": 10.54, + "learning_rate": 0.00038514447236180903, + "loss": 0.346, + "mlm_loss": 0.346, + "step": 13660 + }, + { + "ep_loss": 0.0, + "epoch": 10.54, + "learning_rate": 0.00038483040201005026, + "loss": 0.3508, + "mlm_loss": 0.3508, + "step": 13670 + }, + { + "ep_loss": 0.0, + "epoch": 10.55, + "learning_rate": 0.0003845163316582915, + "loss": 0.3532, + "mlm_loss": 0.3532, + "step": 13680 + }, + { + "ep_loss": 0.0, + "epoch": 10.56, + "learning_rate": 0.00038420226130653266, + "loss": 0.3449, + "mlm_loss": 0.3449, + "step": 13690 + }, + { + "ep_loss": 0.0, + "epoch": 10.57, + "learning_rate": 0.0003838881909547739, + "loss": 0.3521, + "mlm_loss": 0.3521, + "step": 13700 + }, + { + "epoch": 10.57, + "eval_ep_loss": -2.5600945949554443, + "eval_loss": 0.3279613256454468, + "eval_mlm_loss": 0.3279613256454468, + "eval_runtime": 60.8337, + "eval_samples_per_second": 1148.524, + "eval_steps_per_second": 0.575, + "step": 13700 + }, + { + "ep_loss": 0.0, + "epoch": 10.57, + "learning_rate": 0.0003835741206030151, + "loss": 0.3462, + "mlm_loss": 0.3462, + "step": 13710 + }, + { + "ep_loss": 0.0, + "epoch": 10.58, + "learning_rate": 0.00038326005025125624, + "loss": 0.347, + "mlm_loss": 0.347, + "step": 13720 + }, + { + "ep_loss": 0.0, + "epoch": 10.59, + "learning_rate": 0.00038294597989949747, + "loss": 0.3462, + "mlm_loss": 0.3462, + "step": 13730 + }, + { + "ep_loss": 0.0, + "epoch": 10.6, + "learning_rate": 0.0003826319095477387, + "loss": 0.3408, + "mlm_loss": 0.3408, + "step": 13740 + }, + { + "ep_loss": 0.0, + "epoch": 10.61, + "learning_rate": 0.0003823178391959799, + "loss": 0.3471, + "mlm_loss": 0.3471, + "step": 13750 + }, + { + "ep_loss": 0.0, + "epoch": 10.61, + "learning_rate": 0.0003820037688442211, + "loss": 0.3467, + "mlm_loss": 0.3467, + "step": 13760 + }, + { + "ep_loss": 0.0, + "epoch": 10.62, + "learning_rate": 0.0003816896984924623, + "loss": 0.3482, + "mlm_loss": 0.3482, + "step": 13770 + }, + { + "ep_loss": 0.0, + "epoch": 10.63, + "learning_rate": 0.00038137562814070355, + "loss": 0.3454, + "mlm_loss": 0.3454, + "step": 13780 + }, + { + "ep_loss": 0.0, + "epoch": 10.64, + "learning_rate": 0.00038106155778894473, + "loss": 0.346, + "mlm_loss": 0.346, + "step": 13790 + }, + { + "ep_loss": 0.0, + "epoch": 10.64, + "learning_rate": 0.00038074748743718596, + "loss": 0.3461, + "mlm_loss": 0.3461, + "step": 13800 + }, + { + "epoch": 10.64, + "eval_ep_loss": -2.976186990737915, + "eval_loss": 0.32481658458709717, + "eval_mlm_loss": 0.32481658458709717, + "eval_runtime": 61.249, + "eval_samples_per_second": 1140.737, + "eval_steps_per_second": 0.571, + "step": 13800 + }, + { + "ep_loss": 0.0, + "epoch": 10.65, + "learning_rate": 0.0003804334170854272, + "loss": 0.3478, + "mlm_loss": 0.3478, + "step": 13810 + }, + { + "ep_loss": 0.0, + "epoch": 10.66, + "learning_rate": 0.0003801193467336683, + "loss": 0.3474, + "mlm_loss": 0.3474, + "step": 13820 + }, + { + "ep_loss": 0.0, + "epoch": 10.67, + "learning_rate": 0.00037980527638190953, + "loss": 0.3453, + "mlm_loss": 0.3453, + "step": 13830 + }, + { + "ep_loss": 0.0, + "epoch": 10.67, + "learning_rate": 0.00037949120603015076, + "loss": 0.3503, + "mlm_loss": 0.3503, + "step": 13840 + }, + { + "ep_loss": 0.0, + "epoch": 10.68, + "learning_rate": 0.00037917713567839193, + "loss": 0.3491, + "mlm_loss": 0.3491, + "step": 13850 + }, + { + "ep_loss": 0.0, + "epoch": 10.69, + "learning_rate": 0.00037886306532663316, + "loss": 0.3401, + "mlm_loss": 0.3401, + "step": 13860 + }, + { + "ep_loss": 0.0, + "epoch": 10.7, + "learning_rate": 0.0003785489949748744, + "loss": 0.3489, + "mlm_loss": 0.3489, + "step": 13870 + }, + { + "ep_loss": 0.0, + "epoch": 10.71, + "learning_rate": 0.0003782349246231156, + "loss": 0.3413, + "mlm_loss": 0.3413, + "step": 13880 + }, + { + "ep_loss": 0.0, + "epoch": 10.71, + "learning_rate": 0.0003779208542713568, + "loss": 0.3412, + "mlm_loss": 0.3412, + "step": 13890 + }, + { + "ep_loss": 0.0, + "epoch": 10.72, + "learning_rate": 0.000377606783919598, + "loss": 0.3491, + "mlm_loss": 0.3491, + "step": 13900 + }, + { + "epoch": 10.72, + "eval_ep_loss": -2.670948028564453, + "eval_loss": 0.3239390552043915, + "eval_mlm_loss": 0.3239390552043915, + "eval_runtime": 60.785, + "eval_samples_per_second": 1149.445, + "eval_steps_per_second": 0.576, + "step": 13900 + }, + { + "ep_loss": 0.0, + "epoch": 10.73, + "learning_rate": 0.00037729271356783925, + "loss": 0.3422, + "mlm_loss": 0.3422, + "step": 13910 + }, + { + "ep_loss": 0.0, + "epoch": 10.74, + "learning_rate": 0.00037697864321608037, + "loss": 0.3479, + "mlm_loss": 0.3479, + "step": 13920 + }, + { + "ep_loss": 0.0, + "epoch": 10.74, + "learning_rate": 0.0003766645728643216, + "loss": 0.3471, + "mlm_loss": 0.3471, + "step": 13930 + }, + { + "ep_loss": 0.0, + "epoch": 10.75, + "learning_rate": 0.0003763505025125628, + "loss": 0.3511, + "mlm_loss": 0.3511, + "step": 13940 + }, + { + "ep_loss": 0.0, + "epoch": 10.76, + "learning_rate": 0.000376036432160804, + "loss": 0.3424, + "mlm_loss": 0.3424, + "step": 13950 + }, + { + "ep_loss": 0.0, + "epoch": 10.77, + "learning_rate": 0.0003757223618090452, + "loss": 0.3419, + "mlm_loss": 0.3419, + "step": 13960 + }, + { + "ep_loss": 0.0, + "epoch": 10.78, + "learning_rate": 0.00037540829145728645, + "loss": 0.3452, + "mlm_loss": 0.3452, + "step": 13970 + }, + { + "ep_loss": 0.0, + "epoch": 10.78, + "learning_rate": 0.00037509422110552763, + "loss": 0.3401, + "mlm_loss": 0.3401, + "step": 13980 + }, + { + "ep_loss": 0.0, + "epoch": 10.79, + "learning_rate": 0.00037478015075376886, + "loss": 0.3441, + "mlm_loss": 0.3441, + "step": 13990 + }, + { + "ep_loss": 0.0, + "epoch": 10.8, + "learning_rate": 0.0003744660804020101, + "loss": 0.3429, + "mlm_loss": 0.3429, + "step": 14000 + }, + { + "epoch": 10.8, + "eval_ep_loss": -2.6834776401519775, + "eval_loss": 0.32337966561317444, + "eval_mlm_loss": 0.32337966561317444, + "eval_runtime": 59.4747, + "eval_samples_per_second": 1174.769, + "eval_steps_per_second": 0.588, + "step": 14000 + }, + { + "ep_loss": 0.0, + "epoch": 10.81, + "learning_rate": 0.0003741520100502513, + "loss": 0.3444, + "mlm_loss": 0.3444, + "step": 14010 + }, + { + "ep_loss": 0.0, + "epoch": 10.81, + "learning_rate": 0.00037383793969849243, + "loss": 0.3438, + "mlm_loss": 0.3438, + "step": 14020 + }, + { + "ep_loss": 0.0, + "epoch": 10.82, + "learning_rate": 0.00037352386934673366, + "loss": 0.3463, + "mlm_loss": 0.3463, + "step": 14030 + }, + { + "ep_loss": 0.0, + "epoch": 10.83, + "learning_rate": 0.0003732097989949749, + "loss": 0.3467, + "mlm_loss": 0.3467, + "step": 14040 + }, + { + "ep_loss": 0.0, + "epoch": 10.84, + "learning_rate": 0.00037289572864321606, + "loss": 0.3398, + "mlm_loss": 0.3398, + "step": 14050 + }, + { + "ep_loss": 0.0, + "epoch": 10.84, + "learning_rate": 0.0003725816582914573, + "loss": 0.3447, + "mlm_loss": 0.3447, + "step": 14060 + }, + { + "ep_loss": 0.0, + "epoch": 10.85, + "learning_rate": 0.0003722675879396985, + "loss": 0.3398, + "mlm_loss": 0.3398, + "step": 14070 + }, + { + "ep_loss": 0.0, + "epoch": 10.86, + "learning_rate": 0.0003719535175879397, + "loss": 0.3466, + "mlm_loss": 0.3466, + "step": 14080 + }, + { + "ep_loss": 0.0, + "epoch": 10.87, + "learning_rate": 0.0003716394472361809, + "loss": 0.3428, + "mlm_loss": 0.3428, + "step": 14090 + }, + { + "ep_loss": 0.0, + "epoch": 10.88, + "learning_rate": 0.00037132537688442215, + "loss": 0.3445, + "mlm_loss": 0.3445, + "step": 14100 + }, + { + "epoch": 10.88, + "eval_ep_loss": -2.466320037841797, + "eval_loss": 0.3204542398452759, + "eval_mlm_loss": 0.3204542398452759, + "eval_runtime": 59.2997, + "eval_samples_per_second": 1178.236, + "eval_steps_per_second": 0.59, + "step": 14100 + }, + { + "ep_loss": 0.0, + "epoch": 10.88, + "learning_rate": 0.0003710113065326634, + "loss": 0.3426, + "mlm_loss": 0.3426, + "step": 14110 + }, + { + "ep_loss": 0.0, + "epoch": 10.89, + "learning_rate": 0.0003706972361809045, + "loss": 0.3403, + "mlm_loss": 0.3403, + "step": 14120 + }, + { + "ep_loss": 0.0, + "epoch": 10.9, + "learning_rate": 0.0003703831658291457, + "loss": 0.3383, + "mlm_loss": 0.3383, + "step": 14130 + }, + { + "ep_loss": 0.0, + "epoch": 10.91, + "learning_rate": 0.00037006909547738695, + "loss": 0.3497, + "mlm_loss": 0.3497, + "step": 14140 + }, + { + "ep_loss": 0.0, + "epoch": 10.91, + "learning_rate": 0.0003697550251256281, + "loss": 0.349, + "mlm_loss": 0.349, + "step": 14150 + }, + { + "ep_loss": 0.0, + "epoch": 10.92, + "learning_rate": 0.00036944095477386935, + "loss": 0.3449, + "mlm_loss": 0.3449, + "step": 14160 + }, + { + "ep_loss": 0.0, + "epoch": 10.93, + "learning_rate": 0.0003691268844221106, + "loss": 0.3487, + "mlm_loss": 0.3487, + "step": 14170 + }, + { + "ep_loss": 0.0, + "epoch": 10.94, + "learning_rate": 0.00036881281407035176, + "loss": 0.344, + "mlm_loss": 0.344, + "step": 14180 + }, + { + "ep_loss": 0.0, + "epoch": 10.94, + "learning_rate": 0.000368498743718593, + "loss": 0.3358, + "mlm_loss": 0.3358, + "step": 14190 + }, + { + "ep_loss": 0.0, + "epoch": 10.95, + "learning_rate": 0.0003681846733668342, + "loss": 0.3406, + "mlm_loss": 0.3406, + "step": 14200 + }, + { + "epoch": 10.95, + "eval_ep_loss": -2.3429927825927734, + "eval_loss": 0.32197174429893494, + "eval_mlm_loss": 0.32197174429893494, + "eval_runtime": 63.3682, + "eval_samples_per_second": 1102.588, + "eval_steps_per_second": 0.552, + "step": 14200 + }, + { + "ep_loss": 0.0, + "epoch": 10.96, + "learning_rate": 0.0003678706030150754, + "loss": 0.3403, + "mlm_loss": 0.3403, + "step": 14210 + }, + { + "ep_loss": 0.0, + "epoch": 10.97, + "learning_rate": 0.00036755653266331656, + "loss": 0.3388, + "mlm_loss": 0.3388, + "step": 14220 + }, + { + "ep_loss": 0.0, + "epoch": 10.98, + "learning_rate": 0.0003672424623115578, + "loss": 0.3426, + "mlm_loss": 0.3426, + "step": 14230 + }, + { + "ep_loss": 0.0, + "epoch": 10.98, + "learning_rate": 0.000366928391959799, + "loss": 0.3417, + "mlm_loss": 0.3417, + "step": 14240 + }, + { + "ep_loss": 0.0, + "epoch": 10.99, + "learning_rate": 0.0003666143216080402, + "loss": 0.3383, + "mlm_loss": 0.3383, + "step": 14250 + }, + { + "ep_loss": 0.0, + "epoch": 11.0, + "learning_rate": 0.0003663002512562814, + "loss": 0.3468, + "mlm_loss": 0.3468, + "step": 14260 + }, + { + "ep_loss": 0.0, + "epoch": 11.01, + "learning_rate": 0.00036598618090452265, + "loss": 0.3337, + "mlm_loss": 0.3337, + "step": 14270 + }, + { + "ep_loss": 0.0, + "epoch": 11.01, + "learning_rate": 0.0003656721105527638, + "loss": 0.3397, + "mlm_loss": 0.3397, + "step": 14280 + }, + { + "ep_loss": 0.0, + "epoch": 11.02, + "learning_rate": 0.00036535804020100505, + "loss": 0.3403, + "mlm_loss": 0.3403, + "step": 14290 + }, + { + "ep_loss": 0.0, + "epoch": 11.03, + "learning_rate": 0.0003650439698492463, + "loss": 0.346, + "mlm_loss": 0.346, + "step": 14300 + }, + { + "epoch": 11.03, + "eval_ep_loss": -2.5903124809265137, + "eval_loss": 0.31952106952667236, + "eval_mlm_loss": 0.31952106952667236, + "eval_runtime": 61.3925, + "eval_samples_per_second": 1138.07, + "eval_steps_per_second": 0.57, + "step": 14300 + }, + { + "ep_loss": 0.0, + "epoch": 11.04, + "learning_rate": 0.00036472989949748745, + "loss": 0.3393, + "mlm_loss": 0.3393, + "step": 14310 + }, + { + "ep_loss": 0.0, + "epoch": 11.05, + "learning_rate": 0.0003644158291457286, + "loss": 0.34, + "mlm_loss": 0.34, + "step": 14320 + }, + { + "ep_loss": 0.0, + "epoch": 11.05, + "learning_rate": 0.00036410175879396985, + "loss": 0.3428, + "mlm_loss": 0.3428, + "step": 14330 + }, + { + "ep_loss": 0.0, + "epoch": 11.06, + "learning_rate": 0.000363787688442211, + "loss": 0.3406, + "mlm_loss": 0.3406, + "step": 14340 + }, + { + "ep_loss": 0.0, + "epoch": 11.07, + "learning_rate": 0.00036347361809045225, + "loss": 0.3384, + "mlm_loss": 0.3384, + "step": 14350 + }, + { + "ep_loss": 0.0, + "epoch": 11.08, + "learning_rate": 0.0003631595477386935, + "loss": 0.3385, + "mlm_loss": 0.3385, + "step": 14360 + }, + { + "ep_loss": 0.0, + "epoch": 11.08, + "learning_rate": 0.0003628454773869347, + "loss": 0.3342, + "mlm_loss": 0.3342, + "step": 14370 + }, + { + "ep_loss": 0.0, + "epoch": 11.09, + "learning_rate": 0.0003625314070351759, + "loss": 0.3353, + "mlm_loss": 0.3353, + "step": 14380 + }, + { + "ep_loss": 0.0, + "epoch": 11.1, + "learning_rate": 0.0003622173366834171, + "loss": 0.3426, + "mlm_loss": 0.3426, + "step": 14390 + }, + { + "ep_loss": 0.0, + "epoch": 11.11, + "learning_rate": 0.00036190326633165834, + "loss": 0.335, + "mlm_loss": 0.335, + "step": 14400 + }, + { + "epoch": 11.11, + "eval_ep_loss": -2.3407018184661865, + "eval_loss": 0.31839361786842346, + "eval_mlm_loss": 0.31839361786842346, + "eval_runtime": 59.9564, + "eval_samples_per_second": 1165.329, + "eval_steps_per_second": 0.584, + "step": 14400 + }, + { + "ep_loss": 0.0, + "epoch": 11.11, + "learning_rate": 0.0003615891959798995, + "loss": 0.339, + "mlm_loss": 0.339, + "step": 14410 + }, + { + "ep_loss": 0.0, + "epoch": 11.12, + "learning_rate": 0.0003612751256281407, + "loss": 0.3419, + "mlm_loss": 0.3419, + "step": 14420 + }, + { + "ep_loss": 0.0, + "epoch": 11.13, + "learning_rate": 0.0003609610552763819, + "loss": 0.3368, + "mlm_loss": 0.3368, + "step": 14430 + }, + { + "ep_loss": 0.0, + "epoch": 11.14, + "learning_rate": 0.0003606469849246231, + "loss": 0.338, + "mlm_loss": 0.338, + "step": 14440 + }, + { + "ep_loss": 0.0, + "epoch": 11.15, + "learning_rate": 0.0003603329145728643, + "loss": 0.3413, + "mlm_loss": 0.3413, + "step": 14450 + }, + { + "ep_loss": 0.0, + "epoch": 11.15, + "learning_rate": 0.00036001884422110555, + "loss": 0.3361, + "mlm_loss": 0.3361, + "step": 14460 + }, + { + "ep_loss": 0.0, + "epoch": 11.16, + "learning_rate": 0.0003597047738693467, + "loss": 0.3358, + "mlm_loss": 0.3358, + "step": 14470 + }, + { + "ep_loss": 0.0, + "epoch": 11.17, + "learning_rate": 0.00035939070351758795, + "loss": 0.3352, + "mlm_loss": 0.3352, + "step": 14480 + }, + { + "ep_loss": 0.0, + "epoch": 11.18, + "learning_rate": 0.0003590766331658292, + "loss": 0.3401, + "mlm_loss": 0.3401, + "step": 14490 + }, + { + "ep_loss": 0.0, + "epoch": 11.18, + "learning_rate": 0.0003587625628140704, + "loss": 0.3385, + "mlm_loss": 0.3385, + "step": 14500 + }, + { + "epoch": 11.18, + "eval_ep_loss": -2.600480794906616, + "eval_loss": 0.3174394965171814, + "eval_mlm_loss": 0.3174394965171814, + "eval_runtime": 59.992, + "eval_samples_per_second": 1164.639, + "eval_steps_per_second": 0.583, + "step": 14500 + }, + { + "ep_loss": 0.0, + "epoch": 11.19, + "learning_rate": 0.0003584484924623116, + "loss": 0.3374, + "mlm_loss": 0.3374, + "step": 14510 + }, + { + "ep_loss": 0.0, + "epoch": 11.2, + "learning_rate": 0.00035813442211055275, + "loss": 0.3406, + "mlm_loss": 0.3406, + "step": 14520 + }, + { + "ep_loss": 0.0, + "epoch": 11.21, + "learning_rate": 0.000357820351758794, + "loss": 0.3361, + "mlm_loss": 0.3361, + "step": 14530 + }, + { + "ep_loss": 0.0, + "epoch": 11.21, + "learning_rate": 0.00035750628140703515, + "loss": 0.339, + "mlm_loss": 0.339, + "step": 14540 + }, + { + "ep_loss": 0.0, + "epoch": 11.22, + "learning_rate": 0.0003571922110552764, + "loss": 0.3367, + "mlm_loss": 0.3367, + "step": 14550 + }, + { + "ep_loss": 0.0, + "epoch": 11.23, + "learning_rate": 0.0003568781407035176, + "loss": 0.3401, + "mlm_loss": 0.3401, + "step": 14560 + }, + { + "ep_loss": 0.0, + "epoch": 11.24, + "learning_rate": 0.0003565640703517588, + "loss": 0.3381, + "mlm_loss": 0.3381, + "step": 14570 + }, + { + "ep_loss": 0.0, + "epoch": 11.25, + "learning_rate": 0.00035625, + "loss": 0.3348, + "mlm_loss": 0.3348, + "step": 14580 + }, + { + "ep_loss": 0.0, + "epoch": 11.25, + "learning_rate": 0.00035593592964824124, + "loss": 0.3347, + "mlm_loss": 0.3347, + "step": 14590 + }, + { + "ep_loss": 0.0, + "epoch": 11.26, + "learning_rate": 0.0003556218592964824, + "loss": 0.3368, + "mlm_loss": 0.3368, + "step": 14600 + }, + { + "epoch": 11.26, + "eval_ep_loss": -2.644643545150757, + "eval_loss": 0.3158123791217804, + "eval_mlm_loss": 0.3158123791217804, + "eval_runtime": 63.9607, + "eval_samples_per_second": 1092.373, + "eval_steps_per_second": 0.547, + "step": 14600 + }, + { + "ep_loss": 0.0, + "epoch": 11.27, + "learning_rate": 0.0003553077889447236, + "loss": 0.3406, + "mlm_loss": 0.3406, + "step": 14610 + }, + { + "ep_loss": 0.0, + "epoch": 11.28, + "learning_rate": 0.0003549937185929648, + "loss": 0.3404, + "mlm_loss": 0.3404, + "step": 14620 + }, + { + "ep_loss": 0.0, + "epoch": 11.28, + "learning_rate": 0.00035467964824120604, + "loss": 0.3373, + "mlm_loss": 0.3373, + "step": 14630 + }, + { + "ep_loss": 0.0, + "epoch": 11.29, + "learning_rate": 0.0003543655778894472, + "loss": 0.3403, + "mlm_loss": 0.3403, + "step": 14640 + }, + { + "ep_loss": 0.0, + "epoch": 11.3, + "learning_rate": 0.00035405150753768845, + "loss": 0.3351, + "mlm_loss": 0.3351, + "step": 14650 + }, + { + "ep_loss": 0.0, + "epoch": 11.31, + "learning_rate": 0.0003537374371859297, + "loss": 0.333, + "mlm_loss": 0.333, + "step": 14660 + }, + { + "ep_loss": 0.0, + "epoch": 11.32, + "learning_rate": 0.00035342336683417085, + "loss": 0.3399, + "mlm_loss": 0.3399, + "step": 14670 + }, + { + "ep_loss": 0.0, + "epoch": 11.32, + "learning_rate": 0.0003531092964824121, + "loss": 0.3349, + "mlm_loss": 0.3349, + "step": 14680 + }, + { + "ep_loss": 0.0, + "epoch": 11.33, + "learning_rate": 0.0003527952261306533, + "loss": 0.339, + "mlm_loss": 0.339, + "step": 14690 + }, + { + "ep_loss": 0.0, + "epoch": 11.34, + "learning_rate": 0.0003524811557788945, + "loss": 0.336, + "mlm_loss": 0.336, + "step": 14700 + }, + { + "epoch": 11.34, + "eval_ep_loss": -2.6687519550323486, + "eval_loss": 0.31672507524490356, + "eval_mlm_loss": 0.31672507524490356, + "eval_runtime": 61.4289, + "eval_samples_per_second": 1137.396, + "eval_steps_per_second": 0.57, + "step": 14700 + }, + { + "ep_loss": 0.0, + "epoch": 11.35, + "learning_rate": 0.00035216708542713565, + "loss": 0.3387, + "mlm_loss": 0.3387, + "step": 14710 + }, + { + "ep_loss": 0.0, + "epoch": 11.35, + "learning_rate": 0.0003518530150753769, + "loss": 0.3345, + "mlm_loss": 0.3345, + "step": 14720 + }, + { + "ep_loss": 0.0, + "epoch": 11.36, + "learning_rate": 0.00035153894472361805, + "loss": 0.337, + "mlm_loss": 0.337, + "step": 14730 + }, + { + "ep_loss": 0.0, + "epoch": 11.37, + "learning_rate": 0.0003512248743718593, + "loss": 0.3307, + "mlm_loss": 0.3307, + "step": 14740 + }, + { + "ep_loss": 0.0, + "epoch": 11.38, + "learning_rate": 0.0003509108040201005, + "loss": 0.329, + "mlm_loss": 0.329, + "step": 14750 + }, + { + "ep_loss": 0.0, + "epoch": 11.38, + "learning_rate": 0.00035059673366834174, + "loss": 0.3345, + "mlm_loss": 0.3345, + "step": 14760 + }, + { + "ep_loss": 0.0, + "epoch": 11.39, + "learning_rate": 0.0003502826633165829, + "loss": 0.3323, + "mlm_loss": 0.3323, + "step": 14770 + }, + { + "ep_loss": 0.0, + "epoch": 11.4, + "learning_rate": 0.00034996859296482414, + "loss": 0.3311, + "mlm_loss": 0.3311, + "step": 14780 + }, + { + "ep_loss": 0.0, + "epoch": 11.41, + "learning_rate": 0.00034965452261306537, + "loss": 0.3304, + "mlm_loss": 0.3304, + "step": 14790 + }, + { + "ep_loss": 0.0, + "epoch": 11.42, + "learning_rate": 0.00034934045226130654, + "loss": 0.3298, + "mlm_loss": 0.3298, + "step": 14800 + }, + { + "epoch": 11.42, + "eval_ep_loss": -2.789717197418213, + "eval_loss": 0.31507280468940735, + "eval_mlm_loss": 0.31507280468940735, + "eval_runtime": 60.3112, + "eval_samples_per_second": 1158.476, + "eval_steps_per_second": 0.58, + "step": 14800 + }, + { + "ep_loss": 0.0, + "epoch": 11.42, + "learning_rate": 0.0003490263819095477, + "loss": 0.3325, + "mlm_loss": 0.3325, + "step": 14810 + }, + { + "ep_loss": 0.0, + "epoch": 11.43, + "learning_rate": 0.00034871231155778894, + "loss": 0.3363, + "mlm_loss": 0.3363, + "step": 14820 + }, + { + "ep_loss": 0.0, + "epoch": 11.44, + "learning_rate": 0.0003483982412060301, + "loss": 0.339, + "mlm_loss": 0.339, + "step": 14830 + }, + { + "ep_loss": 0.0, + "epoch": 11.45, + "learning_rate": 0.00034808417085427135, + "loss": 0.3372, + "mlm_loss": 0.3372, + "step": 14840 + }, + { + "ep_loss": 0.0, + "epoch": 11.45, + "learning_rate": 0.0003477701005025126, + "loss": 0.3316, + "mlm_loss": 0.3316, + "step": 14850 + }, + { + "ep_loss": 0.0, + "epoch": 11.46, + "learning_rate": 0.00034748743718592966, + "loss": 0.3393, + "mlm_loss": 0.3393, + "step": 14860 + }, + { + "ep_loss": 0.0, + "epoch": 11.47, + "learning_rate": 0.00034717336683417083, + "loss": 0.3348, + "mlm_loss": 0.3348, + "step": 14870 + }, + { + "ep_loss": 0.0, + "epoch": 11.48, + "learning_rate": 0.00034685929648241206, + "loss": 0.3322, + "mlm_loss": 0.3322, + "step": 14880 + }, + { + "ep_loss": 0.0, + "epoch": 11.48, + "learning_rate": 0.00034654522613065323, + "loss": 0.3354, + "mlm_loss": 0.3354, + "step": 14890 + }, + { + "ep_loss": 0.0, + "epoch": 11.49, + "learning_rate": 0.00034623115577889446, + "loss": 0.3337, + "mlm_loss": 0.3337, + "step": 14900 + }, + { + "epoch": 11.49, + "eval_ep_loss": -2.740074396133423, + "eval_loss": 0.31319552659988403, + "eval_mlm_loss": 0.31319552659988403, + "eval_runtime": 59.6932, + "eval_samples_per_second": 1170.468, + "eval_steps_per_second": 0.586, + "step": 14900 + }, + { + "ep_loss": 0.0, + "epoch": 11.5, + "learning_rate": 0.0003459170854271357, + "loss": 0.3321, + "mlm_loss": 0.3321, + "step": 14910 + }, + { + "ep_loss": 0.0, + "epoch": 11.51, + "learning_rate": 0.0003456030150753769, + "loss": 0.3382, + "mlm_loss": 0.3382, + "step": 14920 + }, + { + "ep_loss": 0.0, + "epoch": 11.52, + "learning_rate": 0.0003452889447236181, + "loss": 0.3352, + "mlm_loss": 0.3352, + "step": 14930 + }, + { + "ep_loss": 0.0, + "epoch": 11.52, + "learning_rate": 0.0003449748743718593, + "loss": 0.3359, + "mlm_loss": 0.3359, + "step": 14940 + }, + { + "ep_loss": 0.0, + "epoch": 11.53, + "learning_rate": 0.00034466080402010055, + "loss": 0.3323, + "mlm_loss": 0.3323, + "step": 14950 + }, + { + "ep_loss": 0.0, + "epoch": 11.54, + "learning_rate": 0.0003443467336683417, + "loss": 0.3371, + "mlm_loss": 0.3371, + "step": 14960 + }, + { + "ep_loss": 0.0, + "epoch": 11.55, + "learning_rate": 0.0003440326633165829, + "loss": 0.3341, + "mlm_loss": 0.3341, + "step": 14970 + }, + { + "ep_loss": 0.0, + "epoch": 11.55, + "learning_rate": 0.0003437185929648241, + "loss": 0.329, + "mlm_loss": 0.329, + "step": 14980 + }, + { + "ep_loss": 0.0, + "epoch": 11.56, + "learning_rate": 0.0003434045226130653, + "loss": 0.3323, + "mlm_loss": 0.3323, + "step": 14990 + }, + { + "ep_loss": 0.0, + "epoch": 11.57, + "learning_rate": 0.0003430904522613065, + "loss": 0.3349, + "mlm_loss": 0.3349, + "step": 15000 + }, + { + "epoch": 11.57, + "eval_ep_loss": -2.9178011417388916, + "eval_loss": 0.3126050531864166, + "eval_mlm_loss": 0.3126050531864166, + "eval_runtime": 60.091, + "eval_samples_per_second": 1162.72, + "eval_steps_per_second": 0.582, + "step": 15000 + }, + { + "ep_loss": 0.0, + "epoch": 11.58, + "learning_rate": 0.00034277638190954775, + "loss": 0.3315, + "mlm_loss": 0.3315, + "step": 15010 + }, + { + "ep_loss": 0.0, + "epoch": 11.59, + "learning_rate": 0.00034246231155778893, + "loss": 0.3303, + "mlm_loss": 0.3303, + "step": 15020 + }, + { + "ep_loss": 0.0, + "epoch": 11.59, + "learning_rate": 0.00034214824120603016, + "loss": 0.3337, + "mlm_loss": 0.3337, + "step": 15030 + }, + { + "ep_loss": 0.0, + "epoch": 11.6, + "learning_rate": 0.0003418341708542714, + "loss": 0.3299, + "mlm_loss": 0.3299, + "step": 15040 + }, + { + "ep_loss": 0.0, + "epoch": 11.61, + "learning_rate": 0.0003415201005025126, + "loss": 0.3366, + "mlm_loss": 0.3366, + "step": 15050 + }, + { + "ep_loss": 0.0, + "epoch": 11.62, + "learning_rate": 0.0003412060301507538, + "loss": 0.3185, + "mlm_loss": 0.3185, + "step": 15060 + }, + { + "ep_loss": 0.0, + "epoch": 11.62, + "learning_rate": 0.00034089195979899496, + "loss": 0.3277, + "mlm_loss": 0.3277, + "step": 15070 + }, + { + "ep_loss": 0.0, + "epoch": 11.63, + "learning_rate": 0.0003405778894472362, + "loss": 0.3351, + "mlm_loss": 0.3351, + "step": 15080 + }, + { + "ep_loss": 0.0, + "epoch": 11.64, + "learning_rate": 0.00034026381909547736, + "loss": 0.3306, + "mlm_loss": 0.3306, + "step": 15090 + }, + { + "ep_loss": 0.0, + "epoch": 11.65, + "learning_rate": 0.0003399497487437186, + "loss": 0.3279, + "mlm_loss": 0.3279, + "step": 15100 + }, + { + "epoch": 11.65, + "eval_ep_loss": -2.8001365661621094, + "eval_loss": 0.31101852655410767, + "eval_mlm_loss": 0.31101852655410767, + "eval_runtime": 60.3645, + "eval_samples_per_second": 1157.452, + "eval_steps_per_second": 0.58, + "step": 15100 + }, + { + "ep_loss": 0.0, + "epoch": 11.65, + "learning_rate": 0.0003396356783919598, + "loss": 0.3278, + "mlm_loss": 0.3278, + "step": 15110 + }, + { + "ep_loss": 0.0, + "epoch": 11.66, + "learning_rate": 0.000339321608040201, + "loss": 0.3356, + "mlm_loss": 0.3356, + "step": 15120 + }, + { + "ep_loss": 0.0, + "epoch": 11.67, + "learning_rate": 0.0003390075376884422, + "loss": 0.3322, + "mlm_loss": 0.3322, + "step": 15130 + }, + { + "ep_loss": 0.0, + "epoch": 11.68, + "learning_rate": 0.00033869346733668345, + "loss": 0.3279, + "mlm_loss": 0.3279, + "step": 15140 + }, + { + "ep_loss": 0.0, + "epoch": 11.69, + "learning_rate": 0.0003383793969849246, + "loss": 0.3347, + "mlm_loss": 0.3347, + "step": 15150 + }, + { + "ep_loss": 0.0, + "epoch": 11.69, + "learning_rate": 0.00033806532663316585, + "loss": 0.3343, + "mlm_loss": 0.3343, + "step": 15160 + }, + { + "ep_loss": 0.0, + "epoch": 11.7, + "learning_rate": 0.000337751256281407, + "loss": 0.3298, + "mlm_loss": 0.3298, + "step": 15170 + }, + { + "ep_loss": 0.0, + "epoch": 11.71, + "learning_rate": 0.00033743718592964825, + "loss": 0.3307, + "mlm_loss": 0.3307, + "step": 15180 + }, + { + "ep_loss": 0.0, + "epoch": 11.72, + "learning_rate": 0.0003371231155778894, + "loss": 0.3335, + "mlm_loss": 0.3335, + "step": 15190 + }, + { + "ep_loss": 0.0, + "epoch": 11.72, + "learning_rate": 0.00033680904522613065, + "loss": 0.332, + "mlm_loss": 0.332, + "step": 15200 + }, + { + "epoch": 11.72, + "eval_ep_loss": -2.703523635864258, + "eval_loss": 0.3101367652416229, + "eval_mlm_loss": 0.3101367652416229, + "eval_runtime": 63.4743, + "eval_samples_per_second": 1100.745, + "eval_steps_per_second": 0.551, + "step": 15200 + }, + { + "ep_loss": 0.0, + "epoch": 11.73, + "learning_rate": 0.0003364949748743719, + "loss": 0.3289, + "mlm_loss": 0.3289, + "step": 15210 + }, + { + "ep_loss": 0.0, + "epoch": 11.74, + "learning_rate": 0.00033618090452261306, + "loss": 0.3295, + "mlm_loss": 0.3295, + "step": 15220 + }, + { + "ep_loss": 0.0, + "epoch": 11.75, + "learning_rate": 0.0003358668341708543, + "loss": 0.3329, + "mlm_loss": 0.3329, + "step": 15230 + }, + { + "ep_loss": 0.0, + "epoch": 11.75, + "learning_rate": 0.0003355527638190955, + "loss": 0.3338, + "mlm_loss": 0.3338, + "step": 15240 + }, + { + "ep_loss": 0.0, + "epoch": 11.76, + "learning_rate": 0.0003352386934673367, + "loss": 0.3349, + "mlm_loss": 0.3349, + "step": 15250 + }, + { + "ep_loss": 0.0, + "epoch": 11.77, + "learning_rate": 0.0003349246231155779, + "loss": 0.3304, + "mlm_loss": 0.3304, + "step": 15260 + }, + { + "ep_loss": 0.0, + "epoch": 11.78, + "learning_rate": 0.0003346105527638191, + "loss": 0.329, + "mlm_loss": 0.329, + "step": 15270 + }, + { + "ep_loss": 0.0, + "epoch": 11.79, + "learning_rate": 0.00033429648241206026, + "loss": 0.3308, + "mlm_loss": 0.3308, + "step": 15280 + }, + { + "ep_loss": 0.0, + "epoch": 11.79, + "learning_rate": 0.0003339824120603015, + "loss": 0.3344, + "mlm_loss": 0.3344, + "step": 15290 + }, + { + "ep_loss": 0.0, + "epoch": 11.8, + "learning_rate": 0.0003336683417085427, + "loss": 0.3359, + "mlm_loss": 0.3359, + "step": 15300 + }, + { + "epoch": 11.8, + "eval_ep_loss": -2.812905788421631, + "eval_loss": 0.3099025785923004, + "eval_mlm_loss": 0.3099025785923004, + "eval_runtime": 61.244, + "eval_samples_per_second": 1140.831, + "eval_steps_per_second": 0.571, + "step": 15300 + }, + { + "ep_loss": 0.0, + "epoch": 11.81, + "learning_rate": 0.00033335427135678395, + "loss": 0.3334, + "mlm_loss": 0.3334, + "step": 15310 + }, + { + "ep_loss": 0.0, + "epoch": 11.82, + "learning_rate": 0.0003330402010050251, + "loss": 0.3209, + "mlm_loss": 0.3209, + "step": 15320 + }, + { + "ep_loss": 0.0, + "epoch": 11.82, + "learning_rate": 0.00033272613065326635, + "loss": 0.3275, + "mlm_loss": 0.3275, + "step": 15330 + }, + { + "ep_loss": 0.0, + "epoch": 11.83, + "learning_rate": 0.0003324120603015076, + "loss": 0.3285, + "mlm_loss": 0.3285, + "step": 15340 + }, + { + "ep_loss": 0.0, + "epoch": 11.84, + "learning_rate": 0.00033209798994974875, + "loss": 0.3274, + "mlm_loss": 0.3274, + "step": 15350 + }, + { + "ep_loss": 0.0, + "epoch": 11.85, + "learning_rate": 0.00033178391959799, + "loss": 0.3274, + "mlm_loss": 0.3274, + "step": 15360 + }, + { + "ep_loss": 0.0, + "epoch": 11.85, + "learning_rate": 0.00033146984924623115, + "loss": 0.3357, + "mlm_loss": 0.3357, + "step": 15370 + }, + { + "ep_loss": 0.0, + "epoch": 11.86, + "learning_rate": 0.0003311557788944723, + "loss": 0.3324, + "mlm_loss": 0.3324, + "step": 15380 + }, + { + "ep_loss": 0.0, + "epoch": 11.87, + "learning_rate": 0.00033084170854271355, + "loss": 0.3337, + "mlm_loss": 0.3337, + "step": 15390 + }, + { + "ep_loss": 0.0, + "epoch": 11.88, + "learning_rate": 0.0003305276381909548, + "loss": 0.3322, + "mlm_loss": 0.3322, + "step": 15400 + }, + { + "epoch": 11.88, + "eval_ep_loss": -2.6010334491729736, + "eval_loss": 0.30834510922431946, + "eval_mlm_loss": 0.30834510922431946, + "eval_runtime": 62.7261, + "eval_samples_per_second": 1113.875, + "eval_steps_per_second": 0.558, + "step": 15400 + }, + { + "ep_loss": 0.0, + "epoch": 11.89, + "learning_rate": 0.00033021356783919596, + "loss": 0.3327, + "mlm_loss": 0.3327, + "step": 15410 + }, + { + "ep_loss": 0.0, + "epoch": 11.89, + "learning_rate": 0.0003298994974874372, + "loss": 0.3269, + "mlm_loss": 0.3269, + "step": 15420 + }, + { + "ep_loss": 0.0, + "epoch": 11.9, + "learning_rate": 0.0003295854271356784, + "loss": 0.3238, + "mlm_loss": 0.3238, + "step": 15430 + }, + { + "ep_loss": 0.0, + "epoch": 11.91, + "learning_rate": 0.00032927135678391964, + "loss": 0.3273, + "mlm_loss": 0.3273, + "step": 15440 + }, + { + "ep_loss": 0.0, + "epoch": 11.92, + "learning_rate": 0.0003289572864321608, + "loss": 0.3373, + "mlm_loss": 0.3373, + "step": 15450 + }, + { + "ep_loss": 0.0, + "epoch": 11.92, + "learning_rate": 0.00032864321608040204, + "loss": 0.3262, + "mlm_loss": 0.3262, + "step": 15460 + }, + { + "ep_loss": 0.0, + "epoch": 11.93, + "learning_rate": 0.0003283291457286432, + "loss": 0.3314, + "mlm_loss": 0.3314, + "step": 15470 + }, + { + "ep_loss": 0.0, + "epoch": 11.94, + "learning_rate": 0.0003280150753768844, + "loss": 0.3296, + "mlm_loss": 0.3296, + "step": 15480 + }, + { + "ep_loss": 0.0, + "epoch": 11.95, + "learning_rate": 0.0003277010050251256, + "loss": 0.3249, + "mlm_loss": 0.3249, + "step": 15490 + }, + { + "ep_loss": 0.0, + "epoch": 11.96, + "learning_rate": 0.00032738693467336685, + "loss": 0.324, + "mlm_loss": 0.324, + "step": 15500 + }, + { + "epoch": 11.96, + "eval_ep_loss": -2.6185078620910645, + "eval_loss": 0.30834418535232544, + "eval_mlm_loss": 0.30834418535232544, + "eval_runtime": 60.4091, + "eval_samples_per_second": 1156.597, + "eval_steps_per_second": 0.579, + "step": 15500 + }, + { + "ep_loss": 0.0, + "epoch": 11.96, + "learning_rate": 0.000327072864321608, + "loss": 0.3247, + "mlm_loss": 0.3247, + "step": 15510 + }, + { + "ep_loss": 0.0, + "epoch": 11.97, + "learning_rate": 0.00032675879396984925, + "loss": 0.3281, + "mlm_loss": 0.3281, + "step": 15520 + }, + { + "ep_loss": 0.0, + "epoch": 11.98, + "learning_rate": 0.0003264447236180905, + "loss": 0.3287, + "mlm_loss": 0.3287, + "step": 15530 + }, + { + "ep_loss": 0.0, + "epoch": 11.99, + "learning_rate": 0.0003261306532663317, + "loss": 0.3272, + "mlm_loss": 0.3272, + "step": 15540 + }, + { + "ep_loss": 0.0, + "epoch": 11.99, + "learning_rate": 0.0003258165829145729, + "loss": 0.3355, + "mlm_loss": 0.3355, + "step": 15550 + }, + { + "ep_loss": 0.0, + "epoch": 12.0, + "learning_rate": 0.0003255025125628141, + "loss": 0.3299, + "mlm_loss": 0.3299, + "step": 15560 + }, + { + "ep_loss": 0.0, + "epoch": 12.01, + "learning_rate": 0.0003251884422110553, + "loss": 0.3293, + "mlm_loss": 0.3293, + "step": 15570 + }, + { + "ep_loss": 0.0, + "epoch": 12.02, + "learning_rate": 0.00032487437185929645, + "loss": 0.3268, + "mlm_loss": 0.3268, + "step": 15580 + }, + { + "ep_loss": 0.0, + "epoch": 12.02, + "learning_rate": 0.0003245603015075377, + "loss": 0.3202, + "mlm_loss": 0.3202, + "step": 15590 + }, + { + "ep_loss": 0.0, + "epoch": 12.03, + "learning_rate": 0.0003242462311557789, + "loss": 0.3211, + "mlm_loss": 0.3211, + "step": 15600 + }, + { + "epoch": 12.03, + "eval_ep_loss": -2.552333354949951, + "eval_loss": 0.30786365270614624, + "eval_mlm_loss": 0.30786365270614624, + "eval_runtime": 61.2728, + "eval_samples_per_second": 1140.295, + "eval_steps_per_second": 0.571, + "step": 15600 + }, + { + "ep_loss": 0.0, + "epoch": 12.04, + "learning_rate": 0.0003239321608040201, + "loss": 0.3184, + "mlm_loss": 0.3184, + "step": 15610 + }, + { + "ep_loss": 0.0, + "epoch": 12.05, + "learning_rate": 0.0003236180904522613, + "loss": 0.3271, + "mlm_loss": 0.3271, + "step": 15620 + }, + { + "ep_loss": 0.0, + "epoch": 12.06, + "learning_rate": 0.00032330402010050254, + "loss": 0.3285, + "mlm_loss": 0.3285, + "step": 15630 + }, + { + "ep_loss": 0.0, + "epoch": 12.06, + "learning_rate": 0.0003229899497487437, + "loss": 0.3214, + "mlm_loss": 0.3214, + "step": 15640 + }, + { + "ep_loss": 0.0, + "epoch": 12.07, + "learning_rate": 0.00032267587939698494, + "loss": 0.3269, + "mlm_loss": 0.3269, + "step": 15650 + }, + { + "ep_loss": 0.0, + "epoch": 12.08, + "learning_rate": 0.00032236180904522617, + "loss": 0.3222, + "mlm_loss": 0.3222, + "step": 15660 + }, + { + "ep_loss": 0.0, + "epoch": 12.09, + "learning_rate": 0.00032204773869346734, + "loss": 0.321, + "mlm_loss": 0.321, + "step": 15670 + }, + { + "ep_loss": 0.0, + "epoch": 12.09, + "learning_rate": 0.0003217336683417085, + "loss": 0.3198, + "mlm_loss": 0.3198, + "step": 15680 + }, + { + "ep_loss": 0.0, + "epoch": 12.1, + "learning_rate": 0.00032141959798994975, + "loss": 0.3213, + "mlm_loss": 0.3213, + "step": 15690 + }, + { + "ep_loss": 0.0, + "epoch": 12.11, + "learning_rate": 0.000321105527638191, + "loss": 0.327, + "mlm_loss": 0.327, + "step": 15700 + }, + { + "epoch": 12.11, + "eval_ep_loss": -2.4123196601867676, + "eval_loss": 0.3056265413761139, + "eval_mlm_loss": 0.3056265413761139, + "eval_runtime": 62.3312, + "eval_samples_per_second": 1120.931, + "eval_steps_per_second": 0.562, + "step": 15700 + }, + { + "ep_loss": 0.0, + "epoch": 12.12, + "learning_rate": 0.00032079145728643215, + "loss": 0.3254, + "mlm_loss": 0.3254, + "step": 15710 + }, + { + "ep_loss": 0.0, + "epoch": 12.12, + "learning_rate": 0.0003204773869346734, + "loss": 0.3218, + "mlm_loss": 0.3218, + "step": 15720 + }, + { + "ep_loss": 0.0, + "epoch": 12.13, + "learning_rate": 0.0003201633165829146, + "loss": 0.3266, + "mlm_loss": 0.3266, + "step": 15730 + }, + { + "ep_loss": 0.0, + "epoch": 12.14, + "learning_rate": 0.0003198492462311558, + "loss": 0.3181, + "mlm_loss": 0.3181, + "step": 15740 + }, + { + "ep_loss": 0.0, + "epoch": 12.15, + "learning_rate": 0.000319535175879397, + "loss": 0.3226, + "mlm_loss": 0.3226, + "step": 15750 + }, + { + "ep_loss": 0.0, + "epoch": 12.16, + "learning_rate": 0.00031922110552763823, + "loss": 0.3271, + "mlm_loss": 0.3271, + "step": 15760 + }, + { + "ep_loss": 0.0, + "epoch": 12.16, + "learning_rate": 0.00031890703517587935, + "loss": 0.3183, + "mlm_loss": 0.3183, + "step": 15770 + }, + { + "ep_loss": 0.0, + "epoch": 12.17, + "learning_rate": 0.0003185929648241206, + "loss": 0.3108, + "mlm_loss": 0.3108, + "step": 15780 + }, + { + "ep_loss": 0.0, + "epoch": 12.18, + "learning_rate": 0.0003182788944723618, + "loss": 0.3314, + "mlm_loss": 0.3314, + "step": 15790 + }, + { + "ep_loss": 0.0, + "epoch": 12.19, + "learning_rate": 0.00031796482412060304, + "loss": 0.3226, + "mlm_loss": 0.3226, + "step": 15800 + }, + { + "epoch": 12.19, + "eval_ep_loss": -2.378340244293213, + "eval_loss": 0.3046373128890991, + "eval_mlm_loss": 0.3046373128890991, + "eval_runtime": 61.1512, + "eval_samples_per_second": 1142.562, + "eval_steps_per_second": 0.572, + "step": 15800 + }, + { + "ep_loss": 0.0, + "epoch": 12.19, + "learning_rate": 0.0003176507537688442, + "loss": 0.3179, + "mlm_loss": 0.3179, + "step": 15810 + }, + { + "ep_loss": 0.0, + "epoch": 12.2, + "learning_rate": 0.00031733668341708544, + "loss": 0.3173, + "mlm_loss": 0.3173, + "step": 15820 + }, + { + "ep_loss": 0.0, + "epoch": 12.21, + "learning_rate": 0.00031702261306532667, + "loss": 0.3267, + "mlm_loss": 0.3267, + "step": 15830 + }, + { + "ep_loss": 0.0, + "epoch": 12.22, + "learning_rate": 0.00031670854271356784, + "loss": 0.3207, + "mlm_loss": 0.3207, + "step": 15840 + }, + { + "ep_loss": 0.0, + "epoch": 12.23, + "learning_rate": 0.00031639447236180907, + "loss": 0.327, + "mlm_loss": 0.327, + "step": 15850 + }, + { + "ep_loss": 0.0, + "epoch": 12.23, + "learning_rate": 0.0003160804020100503, + "loss": 0.3271, + "mlm_loss": 0.3271, + "step": 15860 + }, + { + "ep_loss": 0.0, + "epoch": 12.24, + "learning_rate": 0.0003157663316582914, + "loss": 0.3287, + "mlm_loss": 0.3287, + "step": 15870 + }, + { + "ep_loss": 0.0, + "epoch": 12.25, + "learning_rate": 0.00031545226130653265, + "loss": 0.3212, + "mlm_loss": 0.3212, + "step": 15880 + }, + { + "ep_loss": 0.0, + "epoch": 12.26, + "learning_rate": 0.0003151381909547739, + "loss": 0.3246, + "mlm_loss": 0.3246, + "step": 15890 + }, + { + "ep_loss": 0.0, + "epoch": 12.26, + "learning_rate": 0.00031482412060301505, + "loss": 0.3205, + "mlm_loss": 0.3205, + "step": 15900 + }, + { + "epoch": 12.26, + "eval_ep_loss": -2.3220415115356445, + "eval_loss": 0.305078387260437, + "eval_mlm_loss": 0.305078387260437, + "eval_runtime": 61.9296, + "eval_samples_per_second": 1128.2, + "eval_steps_per_second": 0.565, + "step": 15900 + }, + { + "ep_loss": 0.0, + "epoch": 12.27, + "learning_rate": 0.0003145100502512563, + "loss": 0.323, + "mlm_loss": 0.323, + "step": 15910 + }, + { + "ep_loss": 0.0, + "epoch": 12.28, + "learning_rate": 0.0003141959798994975, + "loss": 0.3259, + "mlm_loss": 0.3259, + "step": 15920 + }, + { + "ep_loss": 0.0, + "epoch": 12.29, + "learning_rate": 0.00031388190954773873, + "loss": 0.3277, + "mlm_loss": 0.3277, + "step": 15930 + }, + { + "ep_loss": 0.0, + "epoch": 12.29, + "learning_rate": 0.0003135678391959799, + "loss": 0.3207, + "mlm_loss": 0.3207, + "step": 15940 + }, + { + "ep_loss": 0.0, + "epoch": 12.3, + "learning_rate": 0.00031325376884422113, + "loss": 0.3269, + "mlm_loss": 0.3269, + "step": 15950 + }, + { + "ep_loss": 0.0, + "epoch": 12.31, + "learning_rate": 0.00031293969849246236, + "loss": 0.3261, + "mlm_loss": 0.3261, + "step": 15960 + }, + { + "ep_loss": 0.0, + "epoch": 12.32, + "learning_rate": 0.0003126256281407035, + "loss": 0.3187, + "mlm_loss": 0.3187, + "step": 15970 + }, + { + "ep_loss": 0.0, + "epoch": 12.33, + "learning_rate": 0.0003123115577889447, + "loss": 0.3191, + "mlm_loss": 0.3191, + "step": 15980 + }, + { + "ep_loss": 0.0, + "epoch": 12.33, + "learning_rate": 0.00031199748743718594, + "loss": 0.3214, + "mlm_loss": 0.3214, + "step": 15990 + }, + { + "ep_loss": 0.0, + "epoch": 12.34, + "learning_rate": 0.0003116834170854271, + "loss": 0.322, + "mlm_loss": 0.322, + "step": 16000 + }, + { + "epoch": 12.34, + "eval_ep_loss": -2.4821033477783203, + "eval_loss": 0.30568239092826843, + "eval_mlm_loss": 0.30568239092826843, + "eval_runtime": 59.8027, + "eval_samples_per_second": 1168.326, + "eval_steps_per_second": 0.585, + "step": 16000 + }, + { + "ep_loss": 0.0, + "epoch": 12.35, + "learning_rate": 0.00031136934673366834, + "loss": 0.3212, + "mlm_loss": 0.3212, + "step": 16010 + }, + { + "ep_loss": 0.0, + "epoch": 12.36, + "learning_rate": 0.00031105527638190957, + "loss": 0.3184, + "mlm_loss": 0.3184, + "step": 16020 + }, + { + "ep_loss": 0.0, + "epoch": 12.36, + "learning_rate": 0.00031074120603015074, + "loss": 0.3178, + "mlm_loss": 0.3178, + "step": 16030 + }, + { + "ep_loss": 0.0, + "epoch": 12.37, + "learning_rate": 0.00031042713567839197, + "loss": 0.325, + "mlm_loss": 0.325, + "step": 16040 + }, + { + "ep_loss": 0.0, + "epoch": 12.38, + "learning_rate": 0.0003101130653266332, + "loss": 0.3274, + "mlm_loss": 0.3274, + "step": 16050 + }, + { + "ep_loss": 0.0, + "epoch": 12.39, + "learning_rate": 0.00030979899497487443, + "loss": 0.3178, + "mlm_loss": 0.3178, + "step": 16060 + }, + { + "ep_loss": 0.0, + "epoch": 12.39, + "learning_rate": 0.00030948492462311555, + "loss": 0.3215, + "mlm_loss": 0.3215, + "step": 16070 + }, + { + "ep_loss": 0.0, + "epoch": 12.4, + "learning_rate": 0.0003091708542713568, + "loss": 0.3302, + "mlm_loss": 0.3302, + "step": 16080 + }, + { + "ep_loss": 0.0, + "epoch": 12.41, + "learning_rate": 0.000308856783919598, + "loss": 0.3251, + "mlm_loss": 0.3251, + "step": 16090 + }, + { + "ep_loss": 0.0, + "epoch": 12.42, + "learning_rate": 0.0003085427135678392, + "loss": 0.3253, + "mlm_loss": 0.3253, + "step": 16100 + }, + { + "epoch": 12.42, + "eval_ep_loss": -2.364391565322876, + "eval_loss": 0.3045331537723541, + "eval_mlm_loss": 0.3045331537723541, + "eval_runtime": 64.7442, + "eval_samples_per_second": 1079.155, + "eval_steps_per_second": 0.541, + "step": 16100 + }, + { + "ep_loss": 0.0, + "epoch": 12.43, + "learning_rate": 0.0003082286432160804, + "loss": 0.3256, + "mlm_loss": 0.3256, + "step": 16110 + }, + { + "ep_loss": 0.0, + "epoch": 12.43, + "learning_rate": 0.00030791457286432163, + "loss": 0.3187, + "mlm_loss": 0.3187, + "step": 16120 + }, + { + "ep_loss": 0.0, + "epoch": 12.44, + "learning_rate": 0.0003076005025125628, + "loss": 0.32, + "mlm_loss": 0.32, + "step": 16130 + }, + { + "ep_loss": 0.0, + "epoch": 12.45, + "learning_rate": 0.00030728643216080403, + "loss": 0.3257, + "mlm_loss": 0.3257, + "step": 16140 + }, + { + "ep_loss": 0.0, + "epoch": 12.46, + "learning_rate": 0.00030697236180904526, + "loss": 0.3199, + "mlm_loss": 0.3199, + "step": 16150 + }, + { + "ep_loss": 0.0, + "epoch": 12.46, + "learning_rate": 0.0003066582914572865, + "loss": 0.3205, + "mlm_loss": 0.3205, + "step": 16160 + }, + { + "ep_loss": 0.0, + "epoch": 12.47, + "learning_rate": 0.0003063442211055276, + "loss": 0.323, + "mlm_loss": 0.323, + "step": 16170 + }, + { + "ep_loss": 0.0, + "epoch": 12.48, + "learning_rate": 0.00030603015075376884, + "loss": 0.3194, + "mlm_loss": 0.3194, + "step": 16180 + }, + { + "ep_loss": 0.0, + "epoch": 12.49, + "learning_rate": 0.00030571608040201007, + "loss": 0.3241, + "mlm_loss": 0.3241, + "step": 16190 + }, + { + "ep_loss": 0.0, + "epoch": 12.5, + "learning_rate": 0.00030540201005025124, + "loss": 0.3215, + "mlm_loss": 0.3215, + "step": 16200 + }, + { + "epoch": 12.5, + "eval_ep_loss": -2.4813191890716553, + "eval_loss": 0.30195391178131104, + "eval_mlm_loss": 0.30195391178131104, + "eval_runtime": 61.4192, + "eval_samples_per_second": 1137.576, + "eval_steps_per_second": 0.57, + "step": 16200 + }, + { + "ep_loss": 0.0, + "epoch": 12.5, + "learning_rate": 0.00030508793969849247, + "loss": 0.3197, + "mlm_loss": 0.3197, + "step": 16210 + }, + { + "ep_loss": 0.0, + "epoch": 12.51, + "learning_rate": 0.0003047738693467337, + "loss": 0.3183, + "mlm_loss": 0.3183, + "step": 16220 + }, + { + "ep_loss": 0.0, + "epoch": 12.52, + "learning_rate": 0.00030445979899497487, + "loss": 0.3231, + "mlm_loss": 0.3231, + "step": 16230 + }, + { + "ep_loss": 0.0, + "epoch": 12.53, + "learning_rate": 0.0003041457286432161, + "loss": 0.3137, + "mlm_loss": 0.3137, + "step": 16240 + }, + { + "ep_loss": 0.0, + "epoch": 12.53, + "learning_rate": 0.00030383165829145733, + "loss": 0.3178, + "mlm_loss": 0.3178, + "step": 16250 + }, + { + "ep_loss": 0.0, + "epoch": 12.54, + "learning_rate": 0.0003035175879396985, + "loss": 0.3178, + "mlm_loss": 0.3178, + "step": 16260 + }, + { + "ep_loss": 0.0, + "epoch": 12.55, + "learning_rate": 0.0003032035175879397, + "loss": 0.3167, + "mlm_loss": 0.3167, + "step": 16270 + }, + { + "ep_loss": 0.0, + "epoch": 12.56, + "learning_rate": 0.0003028894472361809, + "loss": 0.3163, + "mlm_loss": 0.3163, + "step": 16280 + }, + { + "ep_loss": 0.0, + "epoch": 12.56, + "learning_rate": 0.00030257537688442213, + "loss": 0.3234, + "mlm_loss": 0.3234, + "step": 16290 + }, + { + "ep_loss": 0.0, + "epoch": 12.57, + "learning_rate": 0.0003022613065326633, + "loss": 0.3245, + "mlm_loss": 0.3245, + "step": 16300 + }, + { + "epoch": 12.57, + "eval_ep_loss": -2.5413894653320312, + "eval_loss": 0.30073004961013794, + "eval_mlm_loss": 0.30073004961013794, + "eval_runtime": 60.8169, + "eval_samples_per_second": 1148.842, + "eval_steps_per_second": 0.575, + "step": 16300 + }, + { + "ep_loss": 0.0, + "epoch": 12.58, + "learning_rate": 0.00030194723618090453, + "loss": 0.3177, + "mlm_loss": 0.3177, + "step": 16310 + }, + { + "ep_loss": 0.0, + "epoch": 12.59, + "learning_rate": 0.00030163316582914576, + "loss": 0.3175, + "mlm_loss": 0.3175, + "step": 16320 + }, + { + "ep_loss": 0.0, + "epoch": 12.6, + "learning_rate": 0.00030131909547738694, + "loss": 0.3225, + "mlm_loss": 0.3225, + "step": 16330 + }, + { + "ep_loss": 0.0, + "epoch": 12.6, + "learning_rate": 0.00030100502512562816, + "loss": 0.3195, + "mlm_loss": 0.3195, + "step": 16340 + }, + { + "ep_loss": 0.0, + "epoch": 12.61, + "learning_rate": 0.0003006909547738694, + "loss": 0.315, + "mlm_loss": 0.315, + "step": 16350 + }, + { + "ep_loss": 0.0, + "epoch": 12.62, + "learning_rate": 0.00030037688442211057, + "loss": 0.3238, + "mlm_loss": 0.3238, + "step": 16360 + }, + { + "ep_loss": 0.0, + "epoch": 12.63, + "learning_rate": 0.00030006281407035174, + "loss": 0.3134, + "mlm_loss": 0.3134, + "step": 16370 + }, + { + "ep_loss": 0.0, + "epoch": 12.63, + "learning_rate": 0.00029974874371859297, + "loss": 0.3246, + "mlm_loss": 0.3246, + "step": 16380 + }, + { + "ep_loss": 0.0, + "epoch": 12.64, + "learning_rate": 0.00029943467336683414, + "loss": 0.3153, + "mlm_loss": 0.3153, + "step": 16390 + }, + { + "ep_loss": 0.0, + "epoch": 12.65, + "learning_rate": 0.00029912060301507537, + "loss": 0.3205, + "mlm_loss": 0.3205, + "step": 16400 + }, + { + "epoch": 12.65, + "eval_ep_loss": -2.454632043838501, + "eval_loss": 0.3005962073802948, + "eval_mlm_loss": 0.3005962073802948, + "eval_runtime": 61.3758, + "eval_samples_per_second": 1138.381, + "eval_steps_per_second": 0.57, + "step": 16400 + }, + { + "ep_loss": 0.0, + "epoch": 12.66, + "learning_rate": 0.0002988065326633166, + "loss": 0.3132, + "mlm_loss": 0.3132, + "step": 16410 + }, + { + "ep_loss": 0.0, + "epoch": 12.66, + "learning_rate": 0.0002984924623115578, + "loss": 0.3186, + "mlm_loss": 0.3186, + "step": 16420 + }, + { + "ep_loss": 0.0, + "epoch": 12.67, + "learning_rate": 0.000298178391959799, + "loss": 0.3228, + "mlm_loss": 0.3228, + "step": 16430 + }, + { + "ep_loss": 0.0, + "epoch": 12.68, + "learning_rate": 0.00029786432160804023, + "loss": 0.3134, + "mlm_loss": 0.3134, + "step": 16440 + }, + { + "ep_loss": 0.0, + "epoch": 12.69, + "learning_rate": 0.00029755025125628146, + "loss": 0.3155, + "mlm_loss": 0.3155, + "step": 16450 + }, + { + "ep_loss": 0.0, + "epoch": 12.7, + "learning_rate": 0.00029723618090452263, + "loss": 0.3166, + "mlm_loss": 0.3166, + "step": 16460 + }, + { + "ep_loss": 0.0, + "epoch": 12.7, + "learning_rate": 0.0002969221105527638, + "loss": 0.3285, + "mlm_loss": 0.3285, + "step": 16470 + }, + { + "ep_loss": 0.0, + "epoch": 12.71, + "learning_rate": 0.00029660804020100503, + "loss": 0.3172, + "mlm_loss": 0.3172, + "step": 16480 + }, + { + "ep_loss": 0.0, + "epoch": 12.72, + "learning_rate": 0.0002962939698492462, + "loss": 0.3185, + "mlm_loss": 0.3185, + "step": 16490 + }, + { + "ep_loss": 0.0, + "epoch": 12.73, + "learning_rate": 0.00029597989949748743, + "loss": 0.3154, + "mlm_loss": 0.3154, + "step": 16500 + }, + { + "epoch": 12.73, + "eval_ep_loss": -2.530627489089966, + "eval_loss": 0.2981483042240143, + "eval_mlm_loss": 0.2981483042240143, + "eval_runtime": 62.8785, + "eval_samples_per_second": 1111.175, + "eval_steps_per_second": 0.557, + "step": 16500 + }, + { + "ep_loss": 0.0, + "epoch": 12.73, + "learning_rate": 0.00029566582914572866, + "loss": 0.3206, + "mlm_loss": 0.3206, + "step": 16510 + }, + { + "ep_loss": 0.0, + "epoch": 12.74, + "learning_rate": 0.00029535175879396984, + "loss": 0.3171, + "mlm_loss": 0.3171, + "step": 16520 + }, + { + "ep_loss": 0.0, + "epoch": 12.75, + "learning_rate": 0.00029503768844221106, + "loss": 0.3235, + "mlm_loss": 0.3235, + "step": 16530 + }, + { + "ep_loss": 0.0, + "epoch": 12.76, + "learning_rate": 0.0002947236180904523, + "loss": 0.3188, + "mlm_loss": 0.3188, + "step": 16540 + }, + { + "ep_loss": 0.0, + "epoch": 12.77, + "learning_rate": 0.0002944095477386935, + "loss": 0.3197, + "mlm_loss": 0.3197, + "step": 16550 + }, + { + "ep_loss": 0.0, + "epoch": 12.77, + "learning_rate": 0.0002940954773869347, + "loss": 0.3218, + "mlm_loss": 0.3218, + "step": 16560 + }, + { + "ep_loss": 0.0, + "epoch": 12.78, + "learning_rate": 0.00029378140703517587, + "loss": 0.3105, + "mlm_loss": 0.3105, + "step": 16570 + }, + { + "ep_loss": 0.0, + "epoch": 12.79, + "learning_rate": 0.0002934673366834171, + "loss": 0.3168, + "mlm_loss": 0.3168, + "step": 16580 + }, + { + "ep_loss": 0.0, + "epoch": 12.8, + "learning_rate": 0.00029315326633165827, + "loss": 0.324, + "mlm_loss": 0.324, + "step": 16590 + }, + { + "ep_loss": 0.0, + "epoch": 12.8, + "learning_rate": 0.0002928391959798995, + "loss": 0.314, + "mlm_loss": 0.314, + "step": 16600 + }, + { + "epoch": 12.8, + "eval_ep_loss": -2.1559700965881348, + "eval_loss": 0.2988715171813965, + "eval_mlm_loss": 0.2988715171813965, + "eval_runtime": 62.228, + "eval_samples_per_second": 1122.791, + "eval_steps_per_second": 0.562, + "step": 16600 + }, + { + "ep_loss": 0.0, + "epoch": 12.81, + "learning_rate": 0.0002925251256281407, + "loss": 0.3144, + "mlm_loss": 0.3144, + "step": 16610 + }, + { + "ep_loss": 0.0, + "epoch": 12.82, + "learning_rate": 0.0002922110552763819, + "loss": 0.3139, + "mlm_loss": 0.3139, + "step": 16620 + }, + { + "ep_loss": 0.0, + "epoch": 12.83, + "learning_rate": 0.00029189698492462313, + "loss": 0.3193, + "mlm_loss": 0.3193, + "step": 16630 + }, + { + "ep_loss": 0.0, + "epoch": 12.83, + "learning_rate": 0.00029158291457286436, + "loss": 0.3153, + "mlm_loss": 0.3153, + "step": 16640 + }, + { + "ep_loss": 0.0, + "epoch": 12.84, + "learning_rate": 0.00029126884422110553, + "loss": 0.3103, + "mlm_loss": 0.3103, + "step": 16650 + }, + { + "ep_loss": 0.0, + "epoch": 12.85, + "learning_rate": 0.00029095477386934676, + "loss": 0.3129, + "mlm_loss": 0.3129, + "step": 16660 + }, + { + "ep_loss": 0.0, + "epoch": 12.86, + "learning_rate": 0.00029064070351758793, + "loss": 0.3176, + "mlm_loss": 0.3176, + "step": 16670 + }, + { + "ep_loss": 0.0, + "epoch": 12.87, + "learning_rate": 0.00029032663316582916, + "loss": 0.3121, + "mlm_loss": 0.3121, + "step": 16680 + }, + { + "ep_loss": 0.0, + "epoch": 12.87, + "learning_rate": 0.00029001256281407033, + "loss": 0.3153, + "mlm_loss": 0.3153, + "step": 16690 + }, + { + "ep_loss": 0.0, + "epoch": 12.88, + "learning_rate": 0.00028969849246231156, + "loss": 0.3182, + "mlm_loss": 0.3182, + "step": 16700 + }, + { + "epoch": 12.88, + "eval_ep_loss": -2.1432950496673584, + "eval_loss": 0.2975234389305115, + "eval_mlm_loss": 0.2975234389305115, + "eval_runtime": 60.288, + "eval_samples_per_second": 1158.921, + "eval_steps_per_second": 0.581, + "step": 16700 + }, + { + "ep_loss": 0.0, + "epoch": 12.89, + "learning_rate": 0.0002893844221105528, + "loss": 0.316, + "mlm_loss": 0.316, + "step": 16710 + }, + { + "ep_loss": 0.0, + "epoch": 12.9, + "learning_rate": 0.00028907035175879396, + "loss": 0.3171, + "mlm_loss": 0.3171, + "step": 16720 + }, + { + "ep_loss": 0.0, + "epoch": 12.9, + "learning_rate": 0.0002887562814070352, + "loss": 0.3202, + "mlm_loss": 0.3202, + "step": 16730 + }, + { + "ep_loss": 0.0, + "epoch": 12.91, + "learning_rate": 0.0002884422110552764, + "loss": 0.3123, + "mlm_loss": 0.3123, + "step": 16740 + }, + { + "ep_loss": 0.0, + "epoch": 12.92, + "learning_rate": 0.0002881281407035176, + "loss": 0.3117, + "mlm_loss": 0.3117, + "step": 16750 + }, + { + "ep_loss": 0.0, + "epoch": 12.93, + "learning_rate": 0.0002878140703517588, + "loss": 0.3155, + "mlm_loss": 0.3155, + "step": 16760 + }, + { + "ep_loss": 0.0, + "epoch": 12.93, + "learning_rate": 0.0002875, + "loss": 0.3198, + "mlm_loss": 0.3198, + "step": 16770 + }, + { + "ep_loss": 0.0, + "epoch": 12.94, + "learning_rate": 0.00028718592964824117, + "loss": 0.3106, + "mlm_loss": 0.3106, + "step": 16780 + }, + { + "ep_loss": 0.0, + "epoch": 12.95, + "learning_rate": 0.0002868718592964824, + "loss": 0.3163, + "mlm_loss": 0.3163, + "step": 16790 + }, + { + "ep_loss": 0.0, + "epoch": 12.96, + "learning_rate": 0.0002865577889447236, + "loss": 0.3154, + "mlm_loss": 0.3154, + "step": 16800 + }, + { + "epoch": 12.96, + "eval_ep_loss": -2.0711417198181152, + "eval_loss": 0.2960880398750305, + "eval_mlm_loss": 0.2960880398750305, + "eval_runtime": 62.0434, + "eval_samples_per_second": 1126.132, + "eval_steps_per_second": 0.564, + "step": 16800 + }, + { + "ep_loss": 0.0, + "epoch": 12.97, + "learning_rate": 0.00028624371859296485, + "loss": 0.3144, + "mlm_loss": 0.3144, + "step": 16810 + }, + { + "ep_loss": 0.0, + "epoch": 12.97, + "learning_rate": 0.00028592964824120603, + "loss": 0.3133, + "mlm_loss": 0.3133, + "step": 16820 + }, + { + "ep_loss": 0.0, + "epoch": 12.98, + "learning_rate": 0.00028561557788944726, + "loss": 0.3093, + "mlm_loss": 0.3093, + "step": 16830 + }, + { + "ep_loss": 0.0, + "epoch": 12.99, + "learning_rate": 0.0002853015075376885, + "loss": 0.3172, + "mlm_loss": 0.3172, + "step": 16840 + }, + { + "ep_loss": 0.0, + "epoch": 13.0, + "learning_rate": 0.00028498743718592966, + "loss": 0.3161, + "mlm_loss": 0.3161, + "step": 16850 + }, + { + "ep_loss": 0.0, + "epoch": 13.0, + "learning_rate": 0.00028467336683417083, + "loss": 0.3186, + "mlm_loss": 0.3186, + "step": 16860 + }, + { + "ep_loss": 0.0, + "epoch": 13.01, + "learning_rate": 0.00028435929648241206, + "loss": 0.3094, + "mlm_loss": 0.3094, + "step": 16870 + }, + { + "ep_loss": 0.0, + "epoch": 13.02, + "learning_rate": 0.00028404522613065323, + "loss": 0.3164, + "mlm_loss": 0.3164, + "step": 16880 + }, + { + "ep_loss": 0.0, + "epoch": 13.03, + "learning_rate": 0.00028373115577889446, + "loss": 0.3129, + "mlm_loss": 0.3129, + "step": 16890 + }, + { + "ep_loss": 0.0, + "epoch": 13.04, + "learning_rate": 0.0002834170854271357, + "loss": 0.3138, + "mlm_loss": 0.3138, + "step": 16900 + }, + { + "epoch": 13.04, + "eval_ep_loss": -2.157414436340332, + "eval_loss": 0.2959713041782379, + "eval_mlm_loss": 0.2959713041782379, + "eval_runtime": 59.8502, + "eval_samples_per_second": 1167.398, + "eval_steps_per_second": 0.585, + "step": 16900 + }, + { + "ep_loss": 0.0, + "epoch": 13.04, + "learning_rate": 0.0002831030150753769, + "loss": 0.314, + "mlm_loss": 0.314, + "step": 16910 + }, + { + "ep_loss": 0.0, + "epoch": 13.05, + "learning_rate": 0.0002827889447236181, + "loss": 0.3165, + "mlm_loss": 0.3165, + "step": 16920 + }, + { + "ep_loss": 0.0, + "epoch": 13.06, + "learning_rate": 0.0002824748743718593, + "loss": 0.3062, + "mlm_loss": 0.3062, + "step": 16930 + }, + { + "ep_loss": 0.0, + "epoch": 13.07, + "learning_rate": 0.00028216080402010055, + "loss": 0.3115, + "mlm_loss": 0.3115, + "step": 16940 + }, + { + "ep_loss": 0.0, + "epoch": 13.07, + "learning_rate": 0.0002818467336683417, + "loss": 0.3098, + "mlm_loss": 0.3098, + "step": 16950 + }, + { + "ep_loss": 0.0, + "epoch": 13.08, + "learning_rate": 0.0002815326633165829, + "loss": 0.3064, + "mlm_loss": 0.3064, + "step": 16960 + }, + { + "ep_loss": 0.0, + "epoch": 13.09, + "learning_rate": 0.0002812185929648241, + "loss": 0.3046, + "mlm_loss": 0.3046, + "step": 16970 + }, + { + "ep_loss": 0.0, + "epoch": 13.1, + "learning_rate": 0.0002809045226130653, + "loss": 0.3117, + "mlm_loss": 0.3117, + "step": 16980 + }, + { + "ep_loss": 0.0, + "epoch": 13.1, + "learning_rate": 0.0002805904522613065, + "loss": 0.3128, + "mlm_loss": 0.3128, + "step": 16990 + }, + { + "ep_loss": 0.0, + "epoch": 13.11, + "learning_rate": 0.00028027638190954775, + "loss": 0.3065, + "mlm_loss": 0.3065, + "step": 17000 + }, + { + "epoch": 13.11, + "eval_ep_loss": -2.252915620803833, + "eval_loss": 0.2950764298439026, + "eval_mlm_loss": 0.2950764298439026, + "eval_runtime": 60.871, + "eval_samples_per_second": 1147.821, + "eval_steps_per_second": 0.575, + "step": 17000 + }, + { + "ep_loss": 0.0, + "epoch": 13.12, + "learning_rate": 0.00027996231155778893, + "loss": 0.3113, + "mlm_loss": 0.3113, + "step": 17010 + }, + { + "ep_loss": 0.0, + "epoch": 13.13, + "learning_rate": 0.00027964824120603016, + "loss": 0.3136, + "mlm_loss": 0.3136, + "step": 17020 + }, + { + "ep_loss": 0.0, + "epoch": 13.14, + "learning_rate": 0.0002793341708542714, + "loss": 0.309, + "mlm_loss": 0.309, + "step": 17030 + }, + { + "ep_loss": 0.0, + "epoch": 13.14, + "learning_rate": 0.0002790201005025126, + "loss": 0.3167, + "mlm_loss": 0.3167, + "step": 17040 + }, + { + "ep_loss": 0.0, + "epoch": 13.15, + "learning_rate": 0.0002787060301507538, + "loss": 0.308, + "mlm_loss": 0.308, + "step": 17050 + }, + { + "ep_loss": 0.0, + "epoch": 13.16, + "learning_rate": 0.00027839195979899496, + "loss": 0.3183, + "mlm_loss": 0.3183, + "step": 17060 + }, + { + "ep_loss": 0.0, + "epoch": 13.17, + "learning_rate": 0.0002780778894472362, + "loss": 0.3097, + "mlm_loss": 0.3097, + "step": 17070 + }, + { + "ep_loss": 0.0, + "epoch": 13.17, + "learning_rate": 0.00027776381909547736, + "loss": 0.312, + "mlm_loss": 0.312, + "step": 17080 + }, + { + "ep_loss": 0.0, + "epoch": 13.18, + "learning_rate": 0.0002774497487437186, + "loss": 0.3171, + "mlm_loss": 0.3171, + "step": 17090 + }, + { + "ep_loss": 0.0, + "epoch": 13.19, + "learning_rate": 0.0002771356783919598, + "loss": 0.3072, + "mlm_loss": 0.3072, + "step": 17100 + }, + { + "epoch": 13.19, + "eval_ep_loss": -2.6266889572143555, + "eval_loss": 0.2936928868293762, + "eval_mlm_loss": 0.2936928868293762, + "eval_runtime": 60.0035, + "eval_samples_per_second": 1164.416, + "eval_steps_per_second": 0.583, + "step": 17100 + }, + { + "ep_loss": 0.0, + "epoch": 13.2, + "learning_rate": 0.000276821608040201, + "loss": 0.311, + "mlm_loss": 0.311, + "step": 17110 + }, + { + "ep_loss": 0.0, + "epoch": 13.2, + "learning_rate": 0.0002765075376884422, + "loss": 0.3083, + "mlm_loss": 0.3083, + "step": 17120 + }, + { + "ep_loss": 0.0, + "epoch": 13.21, + "learning_rate": 0.00027619346733668345, + "loss": 0.3102, + "mlm_loss": 0.3102, + "step": 17130 + }, + { + "ep_loss": 0.0, + "epoch": 13.22, + "learning_rate": 0.0002758793969849246, + "loss": 0.3073, + "mlm_loss": 0.3073, + "step": 17140 + }, + { + "ep_loss": 0.0, + "epoch": 13.23, + "learning_rate": 0.00027556532663316585, + "loss": 0.3107, + "mlm_loss": 0.3107, + "step": 17150 + }, + { + "ep_loss": 0.0, + "epoch": 13.24, + "learning_rate": 0.000275251256281407, + "loss": 0.3076, + "mlm_loss": 0.3076, + "step": 17160 + }, + { + "ep_loss": 0.0, + "epoch": 13.24, + "learning_rate": 0.00027493718592964825, + "loss": 0.3042, + "mlm_loss": 0.3042, + "step": 17170 + }, + { + "ep_loss": 0.0, + "epoch": 13.25, + "learning_rate": 0.0002746231155778894, + "loss": 0.3121, + "mlm_loss": 0.3121, + "step": 17180 + }, + { + "ep_loss": 0.0, + "epoch": 13.26, + "learning_rate": 0.00027430904522613065, + "loss": 0.3078, + "mlm_loss": 0.3078, + "step": 17190 + }, + { + "ep_loss": 0.0, + "epoch": 13.27, + "learning_rate": 0.0002739949748743719, + "loss": 0.3111, + "mlm_loss": 0.3111, + "step": 17200 + }, + { + "epoch": 13.27, + "eval_ep_loss": -2.6017704010009766, + "eval_loss": 0.29203662276268005, + "eval_mlm_loss": 0.29203662276268005, + "eval_runtime": 61.3385, + "eval_samples_per_second": 1139.073, + "eval_steps_per_second": 0.571, + "step": 17200 + }, + { + "ep_loss": 0.0, + "epoch": 13.27, + "learning_rate": 0.00027368090452261306, + "loss": 0.3096, + "mlm_loss": 0.3096, + "step": 17210 + }, + { + "ep_loss": 0.0, + "epoch": 13.28, + "learning_rate": 0.0002733668341708543, + "loss": 0.3122, + "mlm_loss": 0.3122, + "step": 17220 + }, + { + "ep_loss": 0.0, + "epoch": 13.29, + "learning_rate": 0.0002730527638190955, + "loss": 0.3051, + "mlm_loss": 0.3051, + "step": 17230 + }, + { + "ep_loss": 0.0, + "epoch": 13.3, + "learning_rate": 0.0002727386934673367, + "loss": 0.3064, + "mlm_loss": 0.3064, + "step": 17240 + }, + { + "ep_loss": 0.0, + "epoch": 13.31, + "learning_rate": 0.0002724246231155779, + "loss": 0.3129, + "mlm_loss": 0.3129, + "step": 17250 + }, + { + "ep_loss": 0.0, + "epoch": 13.31, + "learning_rate": 0.0002721105527638191, + "loss": 0.3055, + "mlm_loss": 0.3055, + "step": 17260 + }, + { + "ep_loss": 0.0, + "epoch": 13.32, + "learning_rate": 0.00027179648241206026, + "loss": 0.3152, + "mlm_loss": 0.3152, + "step": 17270 + }, + { + "ep_loss": 0.0, + "epoch": 13.33, + "learning_rate": 0.0002714824120603015, + "loss": 0.3112, + "mlm_loss": 0.3112, + "step": 17280 + }, + { + "ep_loss": 0.0, + "epoch": 13.34, + "learning_rate": 0.0002711683417085427, + "loss": 0.3106, + "mlm_loss": 0.3106, + "step": 17290 + }, + { + "ep_loss": 0.0, + "epoch": 13.34, + "learning_rate": 0.00027085427135678395, + "loss": 0.3122, + "mlm_loss": 0.3122, + "step": 17300 + }, + { + "epoch": 13.34, + "eval_ep_loss": -2.469452381134033, + "eval_loss": 0.2916301488876343, + "eval_mlm_loss": 0.2916301488876343, + "eval_runtime": 62.1214, + "eval_samples_per_second": 1124.718, + "eval_steps_per_second": 0.563, + "step": 17300 + }, + { + "ep_loss": 0.0, + "epoch": 13.35, + "learning_rate": 0.0002705402010050251, + "loss": 0.3103, + "mlm_loss": 0.3103, + "step": 17310 + }, + { + "ep_loss": 0.0, + "epoch": 13.36, + "learning_rate": 0.00027022613065326635, + "loss": 0.3098, + "mlm_loss": 0.3098, + "step": 17320 + }, + { + "ep_loss": 0.0, + "epoch": 13.37, + "learning_rate": 0.0002699120603015076, + "loss": 0.3068, + "mlm_loss": 0.3068, + "step": 17330 + }, + { + "ep_loss": 0.0, + "epoch": 13.37, + "learning_rate": 0.00026959798994974875, + "loss": 0.3121, + "mlm_loss": 0.3121, + "step": 17340 + }, + { + "ep_loss": 0.0, + "epoch": 13.38, + "learning_rate": 0.00026928391959799, + "loss": 0.3136, + "mlm_loss": 0.3136, + "step": 17350 + }, + { + "ep_loss": 0.0, + "epoch": 13.39, + "learning_rate": 0.00026896984924623115, + "loss": 0.3065, + "mlm_loss": 0.3065, + "step": 17360 + }, + { + "ep_loss": 0.0, + "epoch": 13.4, + "learning_rate": 0.0002686557788944723, + "loss": 0.3091, + "mlm_loss": 0.3091, + "step": 17370 + }, + { + "ep_loss": 0.0, + "epoch": 13.41, + "learning_rate": 0.00026834170854271355, + "loss": 0.3121, + "mlm_loss": 0.3121, + "step": 17380 + }, + { + "ep_loss": 0.0, + "epoch": 13.41, + "learning_rate": 0.0002680276381909548, + "loss": 0.3048, + "mlm_loss": 0.3048, + "step": 17390 + }, + { + "ep_loss": 0.0, + "epoch": 13.42, + "learning_rate": 0.00026771356783919596, + "loss": 0.3055, + "mlm_loss": 0.3055, + "step": 17400 + }, + { + "epoch": 13.42, + "eval_ep_loss": -2.2017784118652344, + "eval_loss": 0.29138830304145813, + "eval_mlm_loss": 0.29138830304145813, + "eval_runtime": 62.7279, + "eval_samples_per_second": 1113.843, + "eval_steps_per_second": 0.558, + "step": 17400 + }, + { + "ep_loss": 0.0, + "epoch": 13.43, + "learning_rate": 0.0002673994974874372, + "loss": 0.3153, + "mlm_loss": 0.3153, + "step": 17410 + }, + { + "ep_loss": 0.0, + "epoch": 13.44, + "learning_rate": 0.0002670854271356784, + "loss": 0.3087, + "mlm_loss": 0.3087, + "step": 17420 + }, + { + "ep_loss": 0.0, + "epoch": 13.44, + "learning_rate": 0.00026677135678391964, + "loss": 0.3082, + "mlm_loss": 0.3082, + "step": 17430 + }, + { + "ep_loss": 0.0, + "epoch": 13.45, + "learning_rate": 0.0002664572864321608, + "loss": 0.3119, + "mlm_loss": 0.3119, + "step": 17440 + }, + { + "ep_loss": 0.0, + "epoch": 13.46, + "learning_rate": 0.00026614321608040204, + "loss": 0.3083, + "mlm_loss": 0.3083, + "step": 17450 + }, + { + "ep_loss": 0.0, + "epoch": 13.47, + "learning_rate": 0.0002658291457286432, + "loss": 0.3077, + "mlm_loss": 0.3077, + "step": 17460 + }, + { + "ep_loss": 0.0, + "epoch": 13.47, + "learning_rate": 0.0002655150753768844, + "loss": 0.3129, + "mlm_loss": 0.3129, + "step": 17470 + }, + { + "ep_loss": 0.0, + "epoch": 13.48, + "learning_rate": 0.0002652010050251256, + "loss": 0.3078, + "mlm_loss": 0.3078, + "step": 17480 + }, + { + "ep_loss": 0.0, + "epoch": 13.49, + "learning_rate": 0.00026488693467336685, + "loss": 0.3105, + "mlm_loss": 0.3105, + "step": 17490 + }, + { + "ep_loss": 0.0, + "epoch": 13.5, + "learning_rate": 0.00026460427135678393, + "loss": 0.3065, + "mlm_loss": 0.3065, + "step": 17500 + }, + { + "epoch": 13.5, + "eval_ep_loss": -2.5960159301757812, + "eval_loss": 0.2920810580253601, + "eval_mlm_loss": 0.2920810580253601, + "eval_runtime": 61.7515, + "eval_samples_per_second": 1131.455, + "eval_steps_per_second": 0.567, + "step": 17500 + }, + { + "ep_loss": 0.0, + "epoch": 13.51, + "learning_rate": 0.00026429020100502516, + "loss": 0.3095, + "mlm_loss": 0.3095, + "step": 17510 + }, + { + "ep_loss": 0.0, + "epoch": 13.51, + "learning_rate": 0.00026397613065326633, + "loss": 0.3073, + "mlm_loss": 0.3073, + "step": 17520 + }, + { + "ep_loss": 0.0, + "epoch": 13.52, + "learning_rate": 0.0002636620603015075, + "loss": 0.3028, + "mlm_loss": 0.3028, + "step": 17530 + }, + { + "ep_loss": 0.0, + "epoch": 13.53, + "learning_rate": 0.00026334798994974873, + "loss": 0.3029, + "mlm_loss": 0.3029, + "step": 17540 + }, + { + "ep_loss": 0.0, + "epoch": 13.54, + "learning_rate": 0.00026303391959798996, + "loss": 0.311, + "mlm_loss": 0.311, + "step": 17550 + }, + { + "ep_loss": 0.0, + "epoch": 13.54, + "learning_rate": 0.00026271984924623113, + "loss": 0.3038, + "mlm_loss": 0.3038, + "step": 17560 + }, + { + "ep_loss": 0.0, + "epoch": 13.55, + "learning_rate": 0.00026240577889447236, + "loss": 0.306, + "mlm_loss": 0.306, + "step": 17570 + }, + { + "ep_loss": 0.0, + "epoch": 13.56, + "learning_rate": 0.0002620917085427136, + "loss": 0.3034, + "mlm_loss": 0.3034, + "step": 17580 + }, + { + "ep_loss": 0.0, + "epoch": 13.57, + "learning_rate": 0.0002617776381909548, + "loss": 0.3067, + "mlm_loss": 0.3067, + "step": 17590 + }, + { + "ep_loss": 0.0, + "epoch": 13.58, + "learning_rate": 0.000261463567839196, + "loss": 0.3045, + "mlm_loss": 0.3045, + "step": 17600 + }, + { + "epoch": 13.58, + "eval_ep_loss": -2.6744961738586426, + "eval_loss": 0.29085445404052734, + "eval_mlm_loss": 0.29085445404052734, + "eval_runtime": 62.86, + "eval_samples_per_second": 1111.501, + "eval_steps_per_second": 0.557, + "step": 17600 + }, + { + "ep_loss": 0.0, + "epoch": 13.58, + "learning_rate": 0.0002611494974874372, + "loss": 0.3031, + "mlm_loss": 0.3031, + "step": 17610 + }, + { + "ep_loss": 0.0, + "epoch": 13.59, + "learning_rate": 0.0002608354271356784, + "loss": 0.3073, + "mlm_loss": 0.3073, + "step": 17620 + }, + { + "ep_loss": 0.0, + "epoch": 13.6, + "learning_rate": 0.00026052135678391957, + "loss": 0.3032, + "mlm_loss": 0.3032, + "step": 17630 + }, + { + "ep_loss": 0.0, + "epoch": 13.61, + "learning_rate": 0.0002602072864321608, + "loss": 0.3095, + "mlm_loss": 0.3095, + "step": 17640 + }, + { + "ep_loss": 0.0, + "epoch": 13.61, + "learning_rate": 0.000259893216080402, + "loss": 0.3087, + "mlm_loss": 0.3087, + "step": 17650 + }, + { + "ep_loss": 0.0, + "epoch": 13.62, + "learning_rate": 0.0002595791457286432, + "loss": 0.3051, + "mlm_loss": 0.3051, + "step": 17660 + }, + { + "ep_loss": 0.0, + "epoch": 13.63, + "learning_rate": 0.0002592650753768844, + "loss": 0.3022, + "mlm_loss": 0.3022, + "step": 17670 + }, + { + "ep_loss": 0.0, + "epoch": 13.64, + "learning_rate": 0.00025895100502512566, + "loss": 0.3001, + "mlm_loss": 0.3001, + "step": 17680 + }, + { + "ep_loss": 0.0, + "epoch": 13.64, + "learning_rate": 0.00025863693467336683, + "loss": 0.3067, + "mlm_loss": 0.3067, + "step": 17690 + }, + { + "ep_loss": 0.0, + "epoch": 13.65, + "learning_rate": 0.00025832286432160806, + "loss": 0.3094, + "mlm_loss": 0.3094, + "step": 17700 + }, + { + "epoch": 13.65, + "eval_ep_loss": -2.679147958755493, + "eval_loss": 0.2893053889274597, + "eval_mlm_loss": 0.2893053889274597, + "eval_runtime": 60.2876, + "eval_samples_per_second": 1158.928, + "eval_steps_per_second": 0.581, + "step": 17700 + }, + { + "ep_loss": 0.0, + "epoch": 13.66, + "learning_rate": 0.0002580087939698493, + "loss": 0.3084, + "mlm_loss": 0.3084, + "step": 17710 + }, + { + "ep_loss": 0.0, + "epoch": 13.67, + "learning_rate": 0.0002576947236180904, + "loss": 0.3066, + "mlm_loss": 0.3066, + "step": 17720 + }, + { + "ep_loss": 0.0, + "epoch": 13.68, + "learning_rate": 0.00025738065326633163, + "loss": 0.3087, + "mlm_loss": 0.3087, + "step": 17730 + }, + { + "ep_loss": 0.0, + "epoch": 13.68, + "learning_rate": 0.00025706658291457286, + "loss": 0.3045, + "mlm_loss": 0.3045, + "step": 17740 + }, + { + "ep_loss": 0.0, + "epoch": 13.69, + "learning_rate": 0.0002567525125628141, + "loss": 0.3043, + "mlm_loss": 0.3043, + "step": 17750 + }, + { + "ep_loss": 0.0, + "epoch": 13.7, + "learning_rate": 0.00025643844221105526, + "loss": 0.3035, + "mlm_loss": 0.3035, + "step": 17760 + }, + { + "ep_loss": 0.0, + "epoch": 13.71, + "learning_rate": 0.0002561243718592965, + "loss": 0.3071, + "mlm_loss": 0.3071, + "step": 17770 + }, + { + "ep_loss": 0.0, + "epoch": 13.71, + "learning_rate": 0.0002558103015075377, + "loss": 0.3083, + "mlm_loss": 0.3083, + "step": 17780 + }, + { + "ep_loss": 0.0, + "epoch": 13.72, + "learning_rate": 0.0002554962311557789, + "loss": 0.3082, + "mlm_loss": 0.3082, + "step": 17790 + }, + { + "ep_loss": 0.0, + "epoch": 13.73, + "learning_rate": 0.0002551821608040201, + "loss": 0.3055, + "mlm_loss": 0.3055, + "step": 17800 + }, + { + "epoch": 13.73, + "eval_ep_loss": -2.5126798152923584, + "eval_loss": 0.28990474343299866, + "eval_mlm_loss": 0.28990474343299866, + "eval_runtime": 60.9715, + "eval_samples_per_second": 1145.928, + "eval_steps_per_second": 0.574, + "step": 17800 + }, + { + "ep_loss": 0.0, + "epoch": 13.74, + "learning_rate": 0.00025486809045226135, + "loss": 0.3067, + "mlm_loss": 0.3067, + "step": 17810 + }, + { + "ep_loss": 0.0, + "epoch": 13.74, + "learning_rate": 0.00025455402010050247, + "loss": 0.3096, + "mlm_loss": 0.3096, + "step": 17820 + }, + { + "ep_loss": 0.0, + "epoch": 13.75, + "learning_rate": 0.0002542399497487437, + "loss": 0.3064, + "mlm_loss": 0.3064, + "step": 17830 + }, + { + "ep_loss": 0.0, + "epoch": 13.76, + "learning_rate": 0.0002539258793969849, + "loss": 0.3009, + "mlm_loss": 0.3009, + "step": 17840 + }, + { + "ep_loss": 0.0, + "epoch": 13.77, + "learning_rate": 0.00025361180904522615, + "loss": 0.3049, + "mlm_loss": 0.3049, + "step": 17850 + }, + { + "ep_loss": 0.0, + "epoch": 13.78, + "learning_rate": 0.00025329773869346733, + "loss": 0.3102, + "mlm_loss": 0.3102, + "step": 17860 + }, + { + "ep_loss": 0.0, + "epoch": 13.78, + "learning_rate": 0.00025298366834170856, + "loss": 0.3072, + "mlm_loss": 0.3072, + "step": 17870 + }, + { + "ep_loss": 0.0, + "epoch": 13.79, + "learning_rate": 0.0002526695979899498, + "loss": 0.2998, + "mlm_loss": 0.2998, + "step": 17880 + }, + { + "ep_loss": 0.0, + "epoch": 13.8, + "learning_rate": 0.00025235552763819096, + "loss": 0.3057, + "mlm_loss": 0.3057, + "step": 17890 + }, + { + "ep_loss": 0.0, + "epoch": 13.81, + "learning_rate": 0.0002520414572864322, + "loss": 0.3036, + "mlm_loss": 0.3036, + "step": 17900 + }, + { + "epoch": 13.81, + "eval_ep_loss": -2.5039594173431396, + "eval_loss": 0.28744441270828247, + "eval_mlm_loss": 0.28744441270828247, + "eval_runtime": 60.8163, + "eval_samples_per_second": 1148.853, + "eval_steps_per_second": 0.576, + "step": 17900 + }, + { + "ep_loss": 0.0, + "epoch": 13.81, + "learning_rate": 0.0002517273869346734, + "loss": 0.303, + "mlm_loss": 0.303, + "step": 17910 + }, + { + "ep_loss": 0.0, + "epoch": 13.82, + "learning_rate": 0.00025141331658291453, + "loss": 0.3072, + "mlm_loss": 0.3072, + "step": 17920 + }, + { + "ep_loss": 0.0, + "epoch": 13.83, + "learning_rate": 0.00025109924623115576, + "loss": 0.3035, + "mlm_loss": 0.3035, + "step": 17930 + }, + { + "ep_loss": 0.0, + "epoch": 13.84, + "learning_rate": 0.000250785175879397, + "loss": 0.3014, + "mlm_loss": 0.3014, + "step": 17940 + }, + { + "ep_loss": 0.0, + "epoch": 13.84, + "learning_rate": 0.00025047110552763816, + "loss": 0.2989, + "mlm_loss": 0.2989, + "step": 17950 + }, + { + "ep_loss": 0.0, + "epoch": 13.85, + "learning_rate": 0.0002501570351758794, + "loss": 0.3079, + "mlm_loss": 0.3079, + "step": 17960 + }, + { + "ep_loss": 0.0, + "epoch": 13.86, + "learning_rate": 0.0002498429648241206, + "loss": 0.3044, + "mlm_loss": 0.3044, + "step": 17970 + }, + { + "ep_loss": 0.0, + "epoch": 13.87, + "learning_rate": 0.00024952889447236185, + "loss": 0.3006, + "mlm_loss": 0.3006, + "step": 17980 + }, + { + "ep_loss": 0.0, + "epoch": 13.88, + "learning_rate": 0.000249214824120603, + "loss": 0.303, + "mlm_loss": 0.303, + "step": 17990 + }, + { + "ep_loss": 0.0, + "epoch": 13.88, + "learning_rate": 0.0002489007537688442, + "loss": 0.3096, + "mlm_loss": 0.3096, + "step": 18000 + }, + { + "epoch": 13.88, + "eval_ep_loss": -2.4838016033172607, + "eval_loss": 0.2871261239051819, + "eval_mlm_loss": 0.2871261239051819, + "eval_runtime": 63.3763, + "eval_samples_per_second": 1102.447, + "eval_steps_per_second": 0.552, + "step": 18000 + } + ], + "max_steps": 25920, + "num_train_epochs": 20, + "total_flos": 4.884832359913882e+18, + "trial_name": null, + "trial_params": null +}