diff --git "a/checkpoints/pre_train/from_scratch/depth/c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-03-11_18-50/checkpoint-512000/trainer_state.json" "b/checkpoints/pre_train/from_scratch/depth/c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-03-11_18-50/checkpoint-512000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/pre_train/from_scratch/depth/c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-03-11_18-50/checkpoint-512000/trainer_state.json" @@ -0,0 +1,39963 @@ +{ + "best_metric": 0.44411131739616394, + "best_model_checkpoint": "checkpoints/pre_train/from_scratch/depth/c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-03-11_18-50/checkpoint-508000", + "epoch": 0.206, + "eval_steps": 1000, + "global_step": 512000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1e-08, + "loss": 124.8125, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "loss": 74.1198, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 15.6661, + "step": 200 + }, + { + "epoch": 0.0, + "learning_rate": 3e-06, + "loss": 9.3284, + "step": 300 + }, + { + "epoch": 0.0, + "learning_rate": 4.000000000000001e-06, + "loss": 7.8171, + "step": 400 + }, + { + "epoch": 0.0, + "learning_rate": 5e-06, + "loss": 7.2308, + "step": 500 + }, + { + "epoch": 0.0, + "learning_rate": 6e-06, + "loss": 6.8223, + "step": 600 + }, + { + "epoch": 0.0, + "learning_rate": 7.000000000000001e-06, + "loss": 6.523, + "step": 700 + }, + { + "epoch": 0.0, + "learning_rate": 8.000000000000001e-06, + "loss": 6.2725, + "step": 800 + }, + { + "epoch": 0.0, + "learning_rate": 9e-06, + "loss": 6.03, + "step": 900 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 5.8229, + "step": 1000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 5.820368220489233, + "eval_average_loss_on_sentence_tokens": 2.507097470924067, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 5.671249866485596, + "eval_non_padding_tokens_in_labels": 133.5083, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39165, + "eval_padding_tokens_in_labels": 7569834.0, + "eval_reconstruction_accuracy": 0.7797755030017626, + "eval_runtime": 205.3345, + "eval_samples_per_second": 24.351, + "eval_sentence_accuracy": 0.06660595402587614, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 1000 + }, + { + "epoch": 0.0, + "learning_rate": 1.1000000000000001e-05, + "loss": 5.6046, + "step": 1100 + }, + { + "epoch": 0.0, + "learning_rate": 1.2e-05, + "loss": 5.3755, + "step": 1200 + }, + { + "epoch": 0.0, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.2232, + "step": 1300 + }, + { + "epoch": 0.0, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.0532, + "step": 1400 + }, + { + "epoch": 0.0, + "learning_rate": 1.5e-05, + "loss": 4.8414, + "step": 1500 + }, + { + "epoch": 0.0, + "learning_rate": 1.6000000000000003e-05, + "loss": 4.6234, + "step": 1600 + }, + { + "epoch": 0.0, + "learning_rate": 1.7000000000000003e-05, + "loss": 4.3834, + "step": 1700 + }, + { + "epoch": 0.0, + "learning_rate": 1.8e-05, + "loss": 4.1968, + "step": 1800 + }, + { + "epoch": 0.0, + "learning_rate": 1.9e-05, + "loss": 4.0, + "step": 1900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 3.8405, + "step": 2000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 3.828068634841912, + "eval_average_loss_on_sentence_tokens": 2.3631353529582353, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 3.7621874809265137, + "eval_non_padding_tokens_in_labels": 133.54125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3952, + "eval_padding_tokens_in_labels": 7569175.0, + "eval_reconstruction_accuracy": 0.7800270569744887, + "eval_runtime": 200.8901, + "eval_samples_per_second": 24.889, + "eval_sentence_accuracy": 0.2343657473038204, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2496, + "step": 2000 + }, + { + "epoch": 0.0, + "learning_rate": 2.1e-05, + "loss": 3.7361, + "step": 2100 + }, + { + "epoch": 0.0, + "learning_rate": 2.2000000000000003e-05, + "loss": 3.5411, + "step": 2200 + }, + { + "epoch": 0.0, + "learning_rate": 2.3000000000000003e-05, + "loss": 3.3841, + "step": 2300 + }, + { + "epoch": 0.0, + "learning_rate": 2.4e-05, + "loss": 3.232, + "step": 2400 + }, + { + "epoch": 0.0, + "learning_rate": 2.5e-05, + "loss": 3.0805, + "step": 2500 + }, + { + "epoch": 0.0, + "learning_rate": 2.6000000000000002e-05, + "loss": 2.9513, + "step": 2600 + }, + { + "epoch": 0.0, + "learning_rate": 2.7000000000000002e-05, + "loss": 2.7914, + "step": 2700 + }, + { + "epoch": 0.0, + "learning_rate": 2.8000000000000003e-05, + "loss": 2.6536, + "step": 2800 + }, + { + "epoch": 0.0, + "learning_rate": 2.9e-05, + "loss": 2.5597, + "step": 2900 + }, + { + "epoch": 0.0, + "learning_rate": 3e-05, + "loss": 2.4678, + "step": 3000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 2.4407313646669357, + "eval_average_loss_on_sentence_tokens": 1.6782675526872783, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 2.406015634536743, + "eval_non_padding_tokens_in_labels": 133.5311, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3945, + "eval_padding_tokens_in_labels": 7569378.0, + "eval_reconstruction_accuracy": 0.7963452890130649, + "eval_runtime": 208.8377, + "eval_samples_per_second": 23.942, + "eval_sentence_accuracy": 0.2648357169774077, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 3000 + }, + { + "epoch": 0.0, + "learning_rate": 3.1e-05, + "loss": 2.3658, + "step": 3100 + }, + { + "epoch": 0.0, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.2707, + "step": 3200 + }, + { + "epoch": 0.0, + "learning_rate": 3.3e-05, + "loss": 2.1971, + "step": 3300 + }, + { + "epoch": 0.0, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.1137, + "step": 3400 + }, + { + "epoch": 0.0, + "learning_rate": 3.5e-05, + "loss": 2.0497, + "step": 3500 + }, + { + "epoch": 0.0, + "learning_rate": 3.6e-05, + "loss": 2.0092, + "step": 3600 + }, + { + "epoch": 0.0, + "learning_rate": 3.7e-05, + "loss": 1.9505, + "step": 3700 + }, + { + "epoch": 0.0, + "learning_rate": 3.8e-05, + "loss": 1.8945, + "step": 3800 + }, + { + "epoch": 0.0, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.8491, + "step": 3900 + }, + { + "epoch": 0.0, + "learning_rate": 4e-05, + "loss": 1.8099, + "step": 4000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 1.7662658782015033, + "eval_average_loss_on_sentence_tokens": 1.7336265273792055, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 1.7646093368530273, + "eval_non_padding_tokens_in_labels": 133.50705, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38365, + "eval_padding_tokens_in_labels": 7569859.0, + "eval_reconstruction_accuracy": 0.7985967343324379, + "eval_runtime": 208.818, + "eval_samples_per_second": 23.944, + "eval_sentence_accuracy": 0.28397814344931543, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.249775, + "step": 4000 + }, + { + "epoch": 0.0, + "learning_rate": 4.1e-05, + "loss": 1.7726, + "step": 4100 + }, + { + "epoch": 0.0, + "learning_rate": 4.2e-05, + "loss": 1.7364, + "step": 4200 + }, + { + "epoch": 0.0, + "learning_rate": 4.3e-05, + "loss": 1.6976, + "step": 4300 + }, + { + "epoch": 0.0, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.686, + "step": 4400 + }, + { + "epoch": 0.0, + "learning_rate": 4.5e-05, + "loss": 1.6321, + "step": 4500 + }, + { + "epoch": 0.0, + "learning_rate": 4.600000000000001e-05, + "loss": 1.6231, + "step": 4600 + }, + { + "epoch": 0.0, + "learning_rate": 4.7e-05, + "loss": 1.5854, + "step": 4700 + }, + { + "epoch": 0.0, + "learning_rate": 4.8e-05, + "loss": 1.5527, + "step": 4800 + }, + { + "epoch": 0.0, + "learning_rate": 4.9e-05, + "loss": 1.5177, + "step": 4900 + }, + { + "epoch": 0.01, + "learning_rate": 5e-05, + "loss": 1.5022, + "step": 5000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.487370181486139, + "eval_average_loss_on_sentence_tokens": 1.2769010541784074, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 1.4777734279632568, + "eval_non_padding_tokens_in_labels": 133.50815, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3919, + "eval_padding_tokens_in_labels": 7569837.0, + "eval_reconstruction_accuracy": 0.8047751275026571, + "eval_runtime": 206.7324, + "eval_samples_per_second": 24.186, + "eval_sentence_accuracy": 0.29621637626285285, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.2496, + "step": 5000 + }, + { + "epoch": 0.01, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.4723, + "step": 5100 + }, + { + "epoch": 0.01, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.461, + "step": 5200 + }, + { + "epoch": 0.01, + "learning_rate": 5.300000000000001e-05, + "loss": 1.4535, + "step": 5300 + }, + { + "epoch": 0.01, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.4337, + "step": 5400 + }, + { + "epoch": 0.01, + "learning_rate": 5.500000000000001e-05, + "loss": 1.4288, + "step": 5500 + }, + { + "epoch": 0.01, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.4061, + "step": 5600 + }, + { + "epoch": 0.01, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.4013, + "step": 5700 + }, + { + "epoch": 0.01, + "learning_rate": 5.8e-05, + "loss": 1.3969, + "step": 5800 + }, + { + "epoch": 0.01, + "learning_rate": 5.9e-05, + "loss": 1.3982, + "step": 5900 + }, + { + "epoch": 0.01, + "learning_rate": 6e-05, + "loss": 1.38, + "step": 6000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.3544726248481205, + "eval_average_loss_on_sentence_tokens": 1.4007674228356988, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 1.356601595878601, + "eval_non_padding_tokens_in_labels": 133.53385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38385, + "eval_padding_tokens_in_labels": 7569323.0, + "eval_reconstruction_accuracy": 0.8134934096649994, + "eval_runtime": 193.0403, + "eval_samples_per_second": 25.901, + "eval_sentence_accuracy": 0.313209934143234, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 6000 + }, + { + "epoch": 0.01, + "learning_rate": 6.1e-05, + "loss": 1.3731, + "step": 6100 + }, + { + "epoch": 0.01, + "learning_rate": 6.2e-05, + "loss": 1.3649, + "step": 6200 + }, + { + "epoch": 0.01, + "learning_rate": 6.3e-05, + "loss": 1.3568, + "step": 6300 + }, + { + "epoch": 0.01, + "learning_rate": 6.400000000000001e-05, + "loss": 1.3423, + "step": 6400 + }, + { + "epoch": 0.01, + "learning_rate": 6.500000000000001e-05, + "loss": 1.3416, + "step": 6500 + }, + { + "epoch": 0.01, + "learning_rate": 6.6e-05, + "loss": 1.3365, + "step": 6600 + }, + { + "epoch": 0.01, + "learning_rate": 6.7e-05, + "loss": 1.3263, + "step": 6700 + }, + { + "epoch": 0.01, + "learning_rate": 6.800000000000001e-05, + "loss": 1.3177, + "step": 6800 + }, + { + "epoch": 0.01, + "learning_rate": 6.9e-05, + "loss": 1.3107, + "step": 6900 + }, + { + "epoch": 0.01, + "learning_rate": 7e-05, + "loss": 1.3001, + "step": 7000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.3048031180215929, + "eval_average_loss_on_sentence_tokens": 1.1782161531036692, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 1.299375057220459, + "eval_non_padding_tokens_in_labels": 133.5463, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3933, + "eval_padding_tokens_in_labels": 7569074.0, + "eval_reconstruction_accuracy": 0.8217441177880007, + "eval_runtime": 217.3638, + "eval_samples_per_second": 23.003, + "eval_sentence_accuracy": 0.3175525328835215, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 7000 + }, + { + "epoch": 0.01, + "learning_rate": 7.1e-05, + "loss": 1.2929, + "step": 7100 + }, + { + "epoch": 0.01, + "learning_rate": 7.2e-05, + "loss": 1.2904, + "step": 7200 + }, + { + "epoch": 0.01, + "learning_rate": 7.3e-05, + "loss": 1.3023, + "step": 7300 + }, + { + "epoch": 0.01, + "learning_rate": 7.4e-05, + "loss": 1.2898, + "step": 7400 + }, + { + "epoch": 0.01, + "learning_rate": 7.500000000000001e-05, + "loss": 1.2918, + "step": 7500 + }, + { + "epoch": 0.01, + "learning_rate": 7.6e-05, + "loss": 1.2835, + "step": 7600 + }, + { + "epoch": 0.01, + "learning_rate": 7.7e-05, + "loss": 1.2796, + "step": 7700 + }, + { + "epoch": 0.01, + "learning_rate": 7.800000000000001e-05, + "loss": 1.2743, + "step": 7800 + }, + { + "epoch": 0.01, + "learning_rate": 7.900000000000001e-05, + "loss": 1.2716, + "step": 7900 + }, + { + "epoch": 0.01, + "learning_rate": 8e-05, + "loss": 1.2675, + "step": 8000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.266054213592805, + "eval_average_loss_on_sentence_tokens": 1.065934506650565, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 1.2569531202316284, + "eval_non_padding_tokens_in_labels": 133.52385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3843, + "eval_padding_tokens_in_labels": 7569523.0, + "eval_reconstruction_accuracy": 0.8259082934656221, + "eval_runtime": 212.8703, + "eval_samples_per_second": 23.488, + "eval_sentence_accuracy": 0.33825165539146196, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 8000 + }, + { + "epoch": 0.01, + "learning_rate": 8.1e-05, + "loss": 1.2727, + "step": 8100 + }, + { + "epoch": 0.01, + "learning_rate": 8.2e-05, + "loss": 1.2628, + "step": 8200 + }, + { + "epoch": 0.01, + "learning_rate": 8.3e-05, + "loss": 1.2557, + "step": 8300 + }, + { + "epoch": 0.01, + "learning_rate": 8.4e-05, + "loss": 1.2461, + "step": 8400 + }, + { + "epoch": 0.01, + "learning_rate": 8.5e-05, + "loss": 1.2591, + "step": 8500 + }, + { + "epoch": 0.01, + "learning_rate": 8.6e-05, + "loss": 1.2399, + "step": 8600 + }, + { + "epoch": 0.01, + "learning_rate": 8.7e-05, + "loss": 1.2362, + "step": 8700 + }, + { + "epoch": 0.01, + "learning_rate": 8.800000000000001e-05, + "loss": 1.2394, + "step": 8800 + }, + { + "epoch": 0.01, + "learning_rate": 8.900000000000001e-05, + "loss": 1.2245, + "step": 8900 + }, + { + "epoch": 0.01, + "learning_rate": 9e-05, + "loss": 1.2245, + "step": 9000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.2339075511732376, + "eval_average_loss_on_sentence_tokens": 0.8487429284993918, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 1.2164844274520874, + "eval_non_padding_tokens_in_labels": 133.5015, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.382, + "eval_padding_tokens_in_labels": 7569970.0, + "eval_reconstruction_accuracy": 0.83143046485086, + "eval_runtime": 192.7894, + "eval_samples_per_second": 25.935, + "eval_sentence_accuracy": 0.5858695067023166, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 9000 + }, + { + "epoch": 0.01, + "learning_rate": 9.1e-05, + "loss": 1.233, + "step": 9100 + }, + { + "epoch": 0.01, + "learning_rate": 9.200000000000001e-05, + "loss": 1.2268, + "step": 9200 + }, + { + "epoch": 0.01, + "learning_rate": 9.300000000000001e-05, + "loss": 1.2247, + "step": 9300 + }, + { + "epoch": 0.01, + "learning_rate": 9.4e-05, + "loss": 1.2144, + "step": 9400 + }, + { + "epoch": 0.01, + "learning_rate": 9.5e-05, + "loss": 1.2234, + "step": 9500 + }, + { + "epoch": 0.01, + "learning_rate": 9.6e-05, + "loss": 1.2291, + "step": 9600 + }, + { + "epoch": 0.01, + "learning_rate": 9.7e-05, + "loss": 1.219, + "step": 9700 + }, + { + "epoch": 0.01, + "learning_rate": 9.8e-05, + "loss": 1.2177, + "step": 9800 + }, + { + "epoch": 0.01, + "learning_rate": 9.900000000000001e-05, + "loss": 1.2177, + "step": 9900 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 1.2118, + "step": 10000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.2210297507950179, + "eval_average_loss_on_sentence_tokens": 0.924345426302736, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 1.2075976133346558, + "eval_non_padding_tokens_in_labels": 133.52655, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37465, + "eval_padding_tokens_in_labels": 7569469.0, + "eval_reconstruction_accuracy": 0.8290608470270034, + "eval_runtime": 194.042, + "eval_samples_per_second": 25.768, + "eval_sentence_accuracy": 0.5679248838085668, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2496, + "step": 10000 + }, + { + "epoch": 0.01, + "learning_rate": 9.9989898989899e-05, + "loss": 1.2113, + "step": 10100 + }, + { + "epoch": 0.01, + "learning_rate": 9.997979797979799e-05, + "loss": 1.2066, + "step": 10200 + }, + { + "epoch": 0.01, + "learning_rate": 9.996969696969698e-05, + "loss": 1.2102, + "step": 10300 + }, + { + "epoch": 0.01, + "learning_rate": 9.995959595959596e-05, + "loss": 1.2096, + "step": 10400 + }, + { + "epoch": 0.01, + "learning_rate": 9.994949494949496e-05, + "loss": 1.1896, + "step": 10500 + }, + { + "epoch": 0.01, + "learning_rate": 9.993939393939394e-05, + "loss": 1.1998, + "step": 10600 + }, + { + "epoch": 0.01, + "learning_rate": 9.992929292929294e-05, + "loss": 1.2005, + "step": 10700 + }, + { + "epoch": 0.01, + "learning_rate": 9.991919191919193e-05, + "loss": 1.1882, + "step": 10800 + }, + { + "epoch": 0.01, + "learning_rate": 9.990909090909092e-05, + "loss": 1.2088, + "step": 10900 + }, + { + "epoch": 0.01, + "learning_rate": 9.98989898989899e-05, + "loss": 1.1867, + "step": 11000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.2141552391859884, + "eval_average_loss_on_sentence_tokens": 0.618944926157735, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 1.1871484518051147, + "eval_non_padding_tokens_in_labels": 133.54435, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38005, + "eval_padding_tokens_in_labels": 7569113.0, + "eval_reconstruction_accuracy": 0.834335501864303, + "eval_runtime": 193.1553, + "eval_samples_per_second": 25.886, + "eval_sentence_accuracy": 0.6614208552407271, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 11000 + }, + { + "epoch": 0.01, + "learning_rate": 9.98888888888889e-05, + "loss": 1.1932, + "step": 11100 + }, + { + "epoch": 0.01, + "learning_rate": 9.987878787878788e-05, + "loss": 1.1864, + "step": 11200 + }, + { + "epoch": 0.01, + "learning_rate": 9.986868686868687e-05, + "loss": 1.1965, + "step": 11300 + }, + { + "epoch": 0.01, + "learning_rate": 9.985858585858587e-05, + "loss": 1.1903, + "step": 11400 + }, + { + "epoch": 0.01, + "learning_rate": 9.984848484848486e-05, + "loss": 1.1857, + "step": 11500 + }, + { + "epoch": 0.01, + "learning_rate": 9.983838383838384e-05, + "loss": 1.1734, + "step": 11600 + }, + { + "epoch": 0.01, + "learning_rate": 9.982828282828284e-05, + "loss": 1.1795, + "step": 11700 + }, + { + "epoch": 0.01, + "learning_rate": 9.981818181818182e-05, + "loss": 1.1785, + "step": 11800 + }, + { + "epoch": 0.01, + "learning_rate": 9.980808080808081e-05, + "loss": 1.1864, + "step": 11900 + }, + { + "epoch": 0.01, + "learning_rate": 9.97979797979798e-05, + "loss": 1.1597, + "step": 12000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.176720646013477, + "eval_average_loss_on_sentence_tokens": 0.7701543854124735, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 1.1584765911102295, + "eval_non_padding_tokens_in_labels": 133.5443, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37965, + "eval_padding_tokens_in_labels": 7569114.0, + "eval_reconstruction_accuracy": 0.8382670955387038, + "eval_runtime": 195.7225, + "eval_samples_per_second": 25.546, + "eval_sentence_accuracy": 0.6647720135661349, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.248775, + "step": 12000 + }, + { + "epoch": 0.01, + "learning_rate": 9.97878787878788e-05, + "loss": 1.1667, + "step": 12100 + }, + { + "epoch": 0.01, + "learning_rate": 9.977777777777779e-05, + "loss": 1.1658, + "step": 12200 + }, + { + "epoch": 0.01, + "learning_rate": 9.976767676767678e-05, + "loss": 1.1659, + "step": 12300 + }, + { + "epoch": 0.01, + "learning_rate": 9.975757575757576e-05, + "loss": 1.1665, + "step": 12400 + }, + { + "epoch": 0.01, + "learning_rate": 9.974747474747475e-05, + "loss": 1.1737, + "step": 12500 + }, + { + "epoch": 0.01, + "learning_rate": 9.973737373737374e-05, + "loss": 1.1568, + "step": 12600 + }, + { + "epoch": 0.01, + "learning_rate": 9.972727272727273e-05, + "loss": 1.1566, + "step": 12700 + }, + { + "epoch": 0.01, + "learning_rate": 9.971717171717173e-05, + "loss": 1.1671, + "step": 12800 + }, + { + "epoch": 0.01, + "learning_rate": 9.970707070707072e-05, + "loss": 1.1555, + "step": 12900 + }, + { + "epoch": 0.01, + "learning_rate": 9.96969696969697e-05, + "loss": 1.1572, + "step": 13000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.1568535021997417, + "eval_average_loss_on_sentence_tokens": 0.9724654459536084, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 1.1488085985183716, + "eval_non_padding_tokens_in_labels": 133.46615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3731, + "eval_padding_tokens_in_labels": 7570677.0, + "eval_reconstruction_accuracy": 0.8400927653601447, + "eval_runtime": 194.4983, + "eval_samples_per_second": 25.707, + "eval_sentence_accuracy": 0.6032847632207009, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 13000 + }, + { + "epoch": 0.01, + "learning_rate": 9.968686868686869e-05, + "loss": 1.1566, + "step": 13100 + }, + { + "epoch": 0.01, + "learning_rate": 9.967676767676768e-05, + "loss": 1.1457, + "step": 13200 + }, + { + "epoch": 0.01, + "learning_rate": 9.966666666666667e-05, + "loss": 1.1495, + "step": 13300 + }, + { + "epoch": 0.01, + "learning_rate": 9.965656565656566e-05, + "loss": 1.1553, + "step": 13400 + }, + { + "epoch": 0.01, + "learning_rate": 9.964646464646466e-05, + "loss": 1.145, + "step": 13500 + }, + { + "epoch": 0.01, + "learning_rate": 9.963636363636363e-05, + "loss": 1.1439, + "step": 13600 + }, + { + "epoch": 0.01, + "learning_rate": 9.962626262626264e-05, + "loss": 1.1491, + "step": 13700 + }, + { + "epoch": 0.01, + "learning_rate": 9.961616161616162e-05, + "loss": 1.1421, + "step": 13800 + }, + { + "epoch": 0.01, + "learning_rate": 9.960606060606061e-05, + "loss": 1.142, + "step": 13900 + }, + { + "epoch": 0.01, + "learning_rate": 9.95959595959596e-05, + "loss": 1.1348, + "step": 14000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.143838641284504, + "eval_average_loss_on_sentence_tokens": 0.7180619895209776, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 1.1246875524520874, + "eval_non_padding_tokens_in_labels": 133.4928, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3707, + "eval_padding_tokens_in_labels": 7570144.0, + "eval_reconstruction_accuracy": 0.8438415602536103, + "eval_runtime": 194.2283, + "eval_samples_per_second": 25.743, + "eval_sentence_accuracy": 0.6671451899438333, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2484, + "step": 14000 + }, + { + "epoch": 0.01, + "learning_rate": 9.95858585858586e-05, + "loss": 1.1347, + "step": 14100 + }, + { + "epoch": 0.01, + "learning_rate": 9.957575757575757e-05, + "loss": 1.1258, + "step": 14200 + }, + { + "epoch": 0.01, + "learning_rate": 9.956565656565658e-05, + "loss": 1.1318, + "step": 14300 + }, + { + "epoch": 0.01, + "learning_rate": 9.955555555555556e-05, + "loss": 1.1352, + "step": 14400 + }, + { + "epoch": 0.01, + "learning_rate": 9.954545454545455e-05, + "loss": 1.1215, + "step": 14500 + }, + { + "epoch": 0.01, + "learning_rate": 9.953535353535354e-05, + "loss": 1.1324, + "step": 14600 + }, + { + "epoch": 0.01, + "learning_rate": 9.952525252525253e-05, + "loss": 1.1281, + "step": 14700 + }, + { + "epoch": 0.01, + "learning_rate": 9.951515151515151e-05, + "loss": 1.1158, + "step": 14800 + }, + { + "epoch": 0.01, + "learning_rate": 9.950505050505052e-05, + "loss": 1.1272, + "step": 14900 + }, + { + "epoch": 0.01, + "learning_rate": 9.94949494949495e-05, + "loss": 1.119, + "step": 15000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 1.1287376605386688, + "eval_average_loss_on_sentence_tokens": 0.7417159426359005, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 1.1114648580551147, + "eval_non_padding_tokens_in_labels": 133.54825, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38605, + "eval_padding_tokens_in_labels": 7569035.0, + "eval_reconstruction_accuracy": 0.8459627793017432, + "eval_runtime": 193.0367, + "eval_samples_per_second": 25.902, + "eval_sentence_accuracy": 0.6208032013207242, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 15000 + }, + { + "epoch": 0.02, + "learning_rate": 9.948484848484849e-05, + "loss": 1.1233, + "step": 15100 + }, + { + "epoch": 0.02, + "learning_rate": 9.947474747474748e-05, + "loss": 1.1168, + "step": 15200 + }, + { + "epoch": 0.02, + "learning_rate": 9.946464646464647e-05, + "loss": 1.1182, + "step": 15300 + }, + { + "epoch": 0.02, + "learning_rate": 9.945454545454545e-05, + "loss": 1.1046, + "step": 15400 + }, + { + "epoch": 0.02, + "learning_rate": 9.944444444444446e-05, + "loss": 1.1104, + "step": 15500 + }, + { + "epoch": 0.02, + "learning_rate": 9.943434343434343e-05, + "loss": 1.1018, + "step": 15600 + }, + { + "epoch": 0.02, + "learning_rate": 9.942424242424243e-05, + "loss": 1.1078, + "step": 15700 + }, + { + "epoch": 0.02, + "learning_rate": 9.941414141414142e-05, + "loss": 1.1019, + "step": 15800 + }, + { + "epoch": 0.02, + "learning_rate": 9.940404040404041e-05, + "loss": 1.1044, + "step": 15900 + }, + { + "epoch": 0.02, + "learning_rate": 9.939393939393939e-05, + "loss": 1.0972, + "step": 16000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 1.1119238947339354, + "eval_average_loss_on_sentence_tokens": 0.7784522395069102, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 1.096777319908142, + "eval_non_padding_tokens_in_labels": 133.5264, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3892, + "eval_padding_tokens_in_labels": 7569472.0, + "eval_reconstruction_accuracy": 0.8471935977113347, + "eval_runtime": 193.2497, + "eval_samples_per_second": 25.873, + "eval_sentence_accuracy": 0.6547813447700397, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2496, + "step": 16000 + }, + { + "epoch": 0.02, + "learning_rate": 9.93838383838384e-05, + "loss": 1.1058, + "step": 16100 + }, + { + "epoch": 0.02, + "learning_rate": 9.937373737373737e-05, + "loss": 1.1041, + "step": 16200 + }, + { + "epoch": 0.02, + "learning_rate": 9.936363636363636e-05, + "loss": 1.1108, + "step": 16300 + }, + { + "epoch": 0.02, + "learning_rate": 9.935353535353536e-05, + "loss": 1.0893, + "step": 16400 + }, + { + "epoch": 0.02, + "learning_rate": 9.934343434343435e-05, + "loss": 1.0968, + "step": 16500 + }, + { + "epoch": 0.02, + "learning_rate": 9.933333333333334e-05, + "loss": 1.0928, + "step": 16600 + }, + { + "epoch": 0.02, + "learning_rate": 9.932323232323233e-05, + "loss": 1.1021, + "step": 16700 + }, + { + "epoch": 0.02, + "learning_rate": 9.931313131313131e-05, + "loss": 1.0968, + "step": 16800 + }, + { + "epoch": 0.02, + "learning_rate": 9.93030303030303e-05, + "loss": 1.0929, + "step": 16900 + }, + { + "epoch": 0.02, + "learning_rate": 9.92929292929293e-05, + "loss": 1.0882, + "step": 17000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 1.0947489332864637, + "eval_average_loss_on_sentence_tokens": 0.727377758263971, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 1.0779101848602295, + "eval_non_padding_tokens_in_labels": 133.52035, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3739, + "eval_padding_tokens_in_labels": 7569593.0, + "eval_reconstruction_accuracy": 0.8489618103673505, + "eval_runtime": 192.0019, + "eval_samples_per_second": 26.041, + "eval_sentence_accuracy": 0.6570603118775459, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 17000 + }, + { + "epoch": 0.02, + "learning_rate": 9.928282828282829e-05, + "loss": 1.0819, + "step": 17100 + }, + { + "epoch": 0.02, + "learning_rate": 9.927272727272728e-05, + "loss": 1.0774, + "step": 17200 + }, + { + "epoch": 0.02, + "learning_rate": 9.926262626262627e-05, + "loss": 1.089, + "step": 17300 + }, + { + "epoch": 0.02, + "learning_rate": 9.925252525252525e-05, + "loss": 1.0792, + "step": 17400 + }, + { + "epoch": 0.02, + "learning_rate": 9.924242424242425e-05, + "loss": 1.0802, + "step": 17500 + }, + { + "epoch": 0.02, + "learning_rate": 9.923232323232323e-05, + "loss": 1.0842, + "step": 17600 + }, + { + "epoch": 0.02, + "learning_rate": 9.922222222222222e-05, + "loss": 1.0787, + "step": 17700 + }, + { + "epoch": 0.02, + "learning_rate": 9.921212121212122e-05, + "loss": 1.0727, + "step": 17800 + }, + { + "epoch": 0.02, + "learning_rate": 9.920202020202021e-05, + "loss": 1.0632, + "step": 17900 + }, + { + "epoch": 0.02, + "learning_rate": 9.919191919191919e-05, + "loss": 1.0728, + "step": 18000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 1.0736240077975021, + "eval_average_loss_on_sentence_tokens": 0.7019451766630741, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 1.0568164587020874, + "eval_non_padding_tokens_in_labels": 133.5544, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37155, + "eval_padding_tokens_in_labels": 7568912.0, + "eval_reconstruction_accuracy": 0.8511325345564704, + "eval_runtime": 194.3619, + "eval_samples_per_second": 25.725, + "eval_sentence_accuracy": 0.679477632027563, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 18000 + }, + { + "epoch": 0.02, + "learning_rate": 9.918181818181819e-05, + "loss": 1.0629, + "step": 18100 + }, + { + "epoch": 0.02, + "learning_rate": 9.917171717171717e-05, + "loss": 1.065, + "step": 18200 + }, + { + "epoch": 0.02, + "learning_rate": 9.916161616161616e-05, + "loss": 1.0595, + "step": 18300 + }, + { + "epoch": 0.02, + "learning_rate": 9.915151515151515e-05, + "loss": 1.0615, + "step": 18400 + }, + { + "epoch": 0.02, + "learning_rate": 9.914141414141415e-05, + "loss": 1.0507, + "step": 18500 + }, + { + "epoch": 0.02, + "learning_rate": 9.913131313131314e-05, + "loss": 1.0493, + "step": 18600 + }, + { + "epoch": 0.02, + "learning_rate": 9.912121212121213e-05, + "loss": 1.0469, + "step": 18700 + }, + { + "epoch": 0.02, + "learning_rate": 9.911111111111112e-05, + "loss": 1.0438, + "step": 18800 + }, + { + "epoch": 0.02, + "learning_rate": 9.91010101010101e-05, + "loss": 1.035, + "step": 18900 + }, + { + "epoch": 0.02, + "learning_rate": 9.909090909090911e-05, + "loss": 1.0326, + "step": 19000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 1.0343809412669787, + "eval_average_loss_on_sentence_tokens": 0.7200114539914212, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 1.0202734470367432, + "eval_non_padding_tokens_in_labels": 133.5372, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39125, + "eval_padding_tokens_in_labels": 7569256.0, + "eval_reconstruction_accuracy": 0.8555954746896639, + "eval_runtime": 193.4157, + "eval_samples_per_second": 25.851, + "eval_sentence_accuracy": 0.6557593267177491, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2499, + "step": 19000 + }, + { + "epoch": 0.02, + "learning_rate": 9.908080808080809e-05, + "loss": 1.0231, + "step": 19100 + }, + { + "epoch": 0.02, + "learning_rate": 9.907070707070708e-05, + "loss": 1.021, + "step": 19200 + }, + { + "epoch": 0.02, + "learning_rate": 9.906060606060607e-05, + "loss": 1.0194, + "step": 19300 + }, + { + "epoch": 0.02, + "learning_rate": 9.905050505050506e-05, + "loss": 1.0132, + "step": 19400 + }, + { + "epoch": 0.02, + "learning_rate": 9.904040404040404e-05, + "loss": 1.0127, + "step": 19500 + }, + { + "epoch": 0.02, + "learning_rate": 9.903030303030305e-05, + "loss": 1.0192, + "step": 19600 + }, + { + "epoch": 0.02, + "learning_rate": 9.902020202020202e-05, + "loss": 1.0147, + "step": 19700 + }, + { + "epoch": 0.02, + "learning_rate": 9.901010101010102e-05, + "loss": 1.0094, + "step": 19800 + }, + { + "epoch": 0.02, + "learning_rate": 9.900000000000001e-05, + "loss": 1.0007, + "step": 19900 + }, + { + "epoch": 0.02, + "learning_rate": 9.8989898989899e-05, + "loss": 1.008, + "step": 20000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 1.0054360549404526, + "eval_average_loss_on_sentence_tokens": 0.6986283480065618, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.9914648532867432, + "eval_non_padding_tokens_in_labels": 133.51725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36155, + "eval_padding_tokens_in_labels": 7569655.0, + "eval_reconstruction_accuracy": 0.8592765467666694, + "eval_runtime": 192.9838, + "eval_samples_per_second": 25.909, + "eval_sentence_accuracy": 0.648446892888546, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.25, + "step": 20000 + }, + { + "epoch": 0.02, + "learning_rate": 9.897979797979798e-05, + "loss": 0.9952, + "step": 20100 + }, + { + "epoch": 0.02, + "learning_rate": 9.896969696969698e-05, + "loss": 1.0, + "step": 20200 + }, + { + "epoch": 0.02, + "learning_rate": 9.895959595959596e-05, + "loss": 0.9907, + "step": 20300 + }, + { + "epoch": 0.02, + "learning_rate": 9.894949494949495e-05, + "loss": 0.9835, + "step": 20400 + }, + { + "epoch": 0.02, + "learning_rate": 9.893939393939395e-05, + "loss": 0.9892, + "step": 20500 + }, + { + "epoch": 0.02, + "learning_rate": 9.892929292929294e-05, + "loss": 0.9863, + "step": 20600 + }, + { + "epoch": 0.02, + "learning_rate": 9.891919191919192e-05, + "loss": 0.9801, + "step": 20700 + }, + { + "epoch": 0.02, + "learning_rate": 9.890909090909092e-05, + "loss": 0.9876, + "step": 20800 + }, + { + "epoch": 0.02, + "learning_rate": 9.88989898989899e-05, + "loss": 0.9707, + "step": 20900 + }, + { + "epoch": 0.02, + "learning_rate": 9.888888888888889e-05, + "loss": 0.9756, + "step": 21000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.9760498131322295, + "eval_average_loss_on_sentence_tokens": 0.6163153640389035, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.9599023461341858, + "eval_non_padding_tokens_in_labels": 133.5387, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38515, + "eval_padding_tokens_in_labels": 7569226.0, + "eval_reconstruction_accuracy": 0.8636140190442723, + "eval_runtime": 210.4553, + "eval_samples_per_second": 23.758, + "eval_sentence_accuracy": 0.6767051877904786, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.248775, + "step": 21000 + }, + { + "epoch": 0.02, + "learning_rate": 9.887878787878788e-05, + "loss": 0.9567, + "step": 21100 + }, + { + "epoch": 0.02, + "learning_rate": 9.886868686868688e-05, + "loss": 0.964, + "step": 21200 + }, + { + "epoch": 0.02, + "learning_rate": 9.885858585858587e-05, + "loss": 0.9624, + "step": 21300 + }, + { + "epoch": 0.02, + "learning_rate": 9.884848484848486e-05, + "loss": 0.9562, + "step": 21400 + }, + { + "epoch": 0.02, + "learning_rate": 9.883838383838384e-05, + "loss": 0.9523, + "step": 21500 + }, + { + "epoch": 0.02, + "learning_rate": 9.882828282828283e-05, + "loss": 0.9547, + "step": 21600 + }, + { + "epoch": 0.02, + "learning_rate": 9.881818181818182e-05, + "loss": 0.9522, + "step": 21700 + }, + { + "epoch": 0.02, + "learning_rate": 9.880808080808081e-05, + "loss": 0.9494, + "step": 21800 + }, + { + "epoch": 0.02, + "learning_rate": 9.87979797979798e-05, + "loss": 0.9406, + "step": 21900 + }, + { + "epoch": 0.02, + "learning_rate": 9.87878787878788e-05, + "loss": 0.9491, + "step": 22000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.9524786987742968, + "eval_average_loss_on_sentence_tokens": 0.6110075454588738, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.9371874928474426, + "eval_non_padding_tokens_in_labels": 133.52525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.376, + "eval_padding_tokens_in_labels": 7569495.0, + "eval_reconstruction_accuracy": 0.8661452280388837, + "eval_runtime": 211.0883, + "eval_samples_per_second": 23.687, + "eval_sentence_accuracy": 0.6551626680065319, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 22000 + }, + { + "epoch": 0.02, + "learning_rate": 9.877777777777778e-05, + "loss": 0.945, + "step": 22100 + }, + { + "epoch": 0.02, + "learning_rate": 9.876767676767677e-05, + "loss": 0.9402, + "step": 22200 + }, + { + "epoch": 0.02, + "learning_rate": 9.875757575757576e-05, + "loss": 0.9358, + "step": 22300 + }, + { + "epoch": 0.02, + "learning_rate": 9.874747474747475e-05, + "loss": 0.9317, + "step": 22400 + }, + { + "epoch": 0.02, + "learning_rate": 9.873737373737374e-05, + "loss": 0.9336, + "step": 22500 + }, + { + "epoch": 0.02, + "learning_rate": 9.872727272727274e-05, + "loss": 0.9338, + "step": 22600 + }, + { + "epoch": 0.02, + "learning_rate": 9.871717171717172e-05, + "loss": 0.9299, + "step": 22700 + }, + { + "epoch": 0.02, + "learning_rate": 9.870707070707072e-05, + "loss": 0.9352, + "step": 22800 + }, + { + "epoch": 0.02, + "learning_rate": 9.86969696969697e-05, + "loss": 0.9271, + "step": 22900 + }, + { + "epoch": 0.02, + "learning_rate": 9.868686868686869e-05, + "loss": 0.9242, + "step": 23000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.9319681832258687, + "eval_average_loss_on_sentence_tokens": 0.614858388736674, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.9175976514816284, + "eval_non_padding_tokens_in_labels": 133.5656, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.385, + "eval_padding_tokens_in_labels": 7568688.0, + "eval_reconstruction_accuracy": 0.8681012059730197, + "eval_runtime": 191.5569, + "eval_samples_per_second": 26.102, + "eval_sentence_accuracy": 0.6564232777648178, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 23000 + }, + { + "epoch": 0.02, + "learning_rate": 9.867676767676768e-05, + "loss": 0.9284, + "step": 23100 + }, + { + "epoch": 0.02, + "learning_rate": 9.866666666666668e-05, + "loss": 0.9188, + "step": 23200 + }, + { + "epoch": 0.02, + "learning_rate": 9.865656565656565e-05, + "loss": 0.9217, + "step": 23300 + }, + { + "epoch": 0.02, + "learning_rate": 9.864646464646466e-05, + "loss": 0.9197, + "step": 23400 + }, + { + "epoch": 0.02, + "learning_rate": 9.863636363636364e-05, + "loss": 0.9258, + "step": 23500 + }, + { + "epoch": 0.02, + "learning_rate": 9.862626262626263e-05, + "loss": 0.9158, + "step": 23600 + }, + { + "epoch": 0.02, + "learning_rate": 9.861616161616162e-05, + "loss": 0.9097, + "step": 23700 + }, + { + "epoch": 0.02, + "learning_rate": 9.860606060606061e-05, + "loss": 0.9179, + "step": 23800 + }, + { + "epoch": 0.02, + "learning_rate": 9.859595959595959e-05, + "loss": 0.9085, + "step": 23900 + }, + { + "epoch": 0.02, + "learning_rate": 9.85858585858586e-05, + "loss": 0.905, + "step": 24000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.9135189319212177, + "eval_average_loss_on_sentence_tokens": 0.5638230320452057, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.8976367115974426, + "eval_non_padding_tokens_in_labels": 133.4909, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37935, + "eval_padding_tokens_in_labels": 7570182.0, + "eval_reconstruction_accuracy": 0.8701218043297757, + "eval_runtime": 203.3521, + "eval_samples_per_second": 24.588, + "eval_sentence_accuracy": 0.6768936063308629, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 24000 + }, + { + "epoch": 0.02, + "learning_rate": 9.857575757575758e-05, + "loss": 0.912, + "step": 24100 + }, + { + "epoch": 0.02, + "learning_rate": 9.856565656565657e-05, + "loss": 0.9014, + "step": 24200 + }, + { + "epoch": 0.02, + "learning_rate": 9.855555555555556e-05, + "loss": 0.9007, + "step": 24300 + }, + { + "epoch": 0.02, + "learning_rate": 9.854545454545455e-05, + "loss": 0.9044, + "step": 24400 + }, + { + "epoch": 0.02, + "learning_rate": 9.853535353535353e-05, + "loss": 0.8988, + "step": 24500 + }, + { + "epoch": 0.02, + "learning_rate": 9.852525252525254e-05, + "loss": 0.8973, + "step": 24600 + }, + { + "epoch": 0.02, + "learning_rate": 9.851515151515151e-05, + "loss": 0.8955, + "step": 24700 + }, + { + "epoch": 0.02, + "learning_rate": 9.85050505050505e-05, + "loss": 0.8895, + "step": 24800 + }, + { + "epoch": 0.02, + "learning_rate": 9.84949494949495e-05, + "loss": 0.888, + "step": 24900 + }, + { + "epoch": 0.03, + "learning_rate": 9.848484848484849e-05, + "loss": 0.8844, + "step": 25000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.8956421073000624, + "eval_average_loss_on_sentence_tokens": 0.6005375855397281, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.8823632597923279, + "eval_non_padding_tokens_in_labels": 133.57675, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3766, + "eval_padding_tokens_in_labels": 7568465.0, + "eval_reconstruction_accuracy": 0.8719171785817115, + "eval_runtime": 202.4885, + "eval_samples_per_second": 24.693, + "eval_sentence_accuracy": 0.6304843253719024, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 25000 + }, + { + "epoch": 0.03, + "learning_rate": 9.847474747474747e-05, + "loss": 0.8802, + "step": 25100 + }, + { + "epoch": 0.03, + "learning_rate": 9.846464646464647e-05, + "loss": 0.8816, + "step": 25200 + }, + { + "epoch": 0.03, + "learning_rate": 9.845454545454545e-05, + "loss": 0.887, + "step": 25300 + }, + { + "epoch": 0.03, + "learning_rate": 9.844444444444444e-05, + "loss": 0.8819, + "step": 25400 + }, + { + "epoch": 0.03, + "learning_rate": 9.843434343434344e-05, + "loss": 0.8761, + "step": 25500 + }, + { + "epoch": 0.03, + "learning_rate": 9.842424242424243e-05, + "loss": 0.8787, + "step": 25600 + }, + { + "epoch": 0.03, + "learning_rate": 9.841414141414142e-05, + "loss": 0.8834, + "step": 25700 + }, + { + "epoch": 0.03, + "learning_rate": 9.840404040404041e-05, + "loss": 0.8703, + "step": 25800 + }, + { + "epoch": 0.03, + "learning_rate": 9.839393939393939e-05, + "loss": 0.8721, + "step": 25900 + }, + { + "epoch": 0.03, + "learning_rate": 9.838383838383838e-05, + "loss": 0.8663, + "step": 26000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.8743593179012566, + "eval_average_loss_on_sentence_tokens": 0.6068769126055585, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.8623046875, + "eval_non_padding_tokens_in_labels": 133.5339, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3887, + "eval_padding_tokens_in_labels": 7569322.0, + "eval_reconstruction_accuracy": 0.8738750206157202, + "eval_runtime": 203.7307, + "eval_samples_per_second": 24.542, + "eval_sentence_accuracy": 0.6381197624131929, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2496, + "step": 26000 + }, + { + "epoch": 0.03, + "learning_rate": 9.837373737373737e-05, + "loss": 0.8661, + "step": 26100 + }, + { + "epoch": 0.03, + "learning_rate": 9.836363636363637e-05, + "loss": 0.8637, + "step": 26200 + }, + { + "epoch": 0.03, + "learning_rate": 9.835353535353536e-05, + "loss": 0.8637, + "step": 26300 + }, + { + "epoch": 0.03, + "learning_rate": 9.834343434343435e-05, + "loss": 0.8578, + "step": 26400 + }, + { + "epoch": 0.03, + "learning_rate": 9.833333333333333e-05, + "loss": 0.8577, + "step": 26500 + }, + { + "epoch": 0.03, + "learning_rate": 9.832323232323233e-05, + "loss": 0.8595, + "step": 26600 + }, + { + "epoch": 0.03, + "learning_rate": 9.831313131313131e-05, + "loss": 0.8533, + "step": 26700 + }, + { + "epoch": 0.03, + "learning_rate": 9.83030303030303e-05, + "loss": 0.8496, + "step": 26800 + }, + { + "epoch": 0.03, + "learning_rate": 9.82929292929293e-05, + "loss": 0.8468, + "step": 26900 + }, + { + "epoch": 0.03, + "learning_rate": 9.828282828282829e-05, + "loss": 0.8453, + "step": 27000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.8525555690787086, + "eval_average_loss_on_sentence_tokens": 0.650767625728792, + "eval_average_shuffling_prob": 0.57, + "eval_loss": 0.8433398604393005, + "eval_non_padding_tokens_in_labels": 133.5122, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3798, + "eval_padding_tokens_in_labels": 7569756.0, + "eval_reconstruction_accuracy": 0.8759959938614073, + "eval_runtime": 202.5013, + "eval_samples_per_second": 24.691, + "eval_sentence_accuracy": 0.6040518958494088, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24509999999999996, + "step": 27000 + }, + { + "epoch": 0.03, + "learning_rate": 9.827272727272728e-05, + "loss": 0.8477, + "step": 27100 + }, + { + "epoch": 0.03, + "learning_rate": 9.826262626262627e-05, + "loss": 0.8419, + "step": 27200 + }, + { + "epoch": 0.03, + "learning_rate": 9.825252525252527e-05, + "loss": 0.8419, + "step": 27300 + }, + { + "epoch": 0.03, + "learning_rate": 9.824242424242424e-05, + "loss": 0.8427, + "step": 27400 + }, + { + "epoch": 0.03, + "learning_rate": 9.823232323232325e-05, + "loss": 0.8423, + "step": 27500 + }, + { + "epoch": 0.03, + "learning_rate": 9.822222222222223e-05, + "loss": 0.8354, + "step": 27600 + }, + { + "epoch": 0.03, + "learning_rate": 9.821212121212122e-05, + "loss": 0.8379, + "step": 27700 + }, + { + "epoch": 0.03, + "learning_rate": 9.820202020202021e-05, + "loss": 0.8346, + "step": 27800 + }, + { + "epoch": 0.03, + "learning_rate": 9.81919191919192e-05, + "loss": 0.8303, + "step": 27900 + }, + { + "epoch": 0.03, + "learning_rate": 9.818181818181818e-05, + "loss": 0.8251, + "step": 28000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.8315735811345891, + "eval_average_loss_on_sentence_tokens": 0.5717686975453723, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.8198046684265137, + "eval_non_padding_tokens_in_labels": 133.52385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3881, + "eval_padding_tokens_in_labels": 7569523.0, + "eval_reconstruction_accuracy": 0.8783705684253781, + "eval_runtime": 204.9001, + "eval_samples_per_second": 24.402, + "eval_sentence_accuracy": 0.6659384140542286, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.24960000000000004, + "step": 28000 + }, + { + "epoch": 0.03, + "learning_rate": 9.817171717171719e-05, + "loss": 0.8285, + "step": 28100 + }, + { + "epoch": 0.03, + "learning_rate": 9.816161616161617e-05, + "loss": 0.8248, + "step": 28200 + }, + { + "epoch": 0.03, + "learning_rate": 9.815151515151516e-05, + "loss": 0.8199, + "step": 28300 + }, + { + "epoch": 0.03, + "learning_rate": 9.814141414141415e-05, + "loss": 0.8164, + "step": 28400 + }, + { + "epoch": 0.03, + "learning_rate": 9.813131313131314e-05, + "loss": 0.8194, + "step": 28500 + }, + { + "epoch": 0.03, + "learning_rate": 9.812121212121212e-05, + "loss": 0.8176, + "step": 28600 + }, + { + "epoch": 0.03, + "learning_rate": 9.811111111111113e-05, + "loss": 0.8156, + "step": 28700 + }, + { + "epoch": 0.03, + "learning_rate": 9.81010101010101e-05, + "loss": 0.8153, + "step": 28800 + }, + { + "epoch": 0.03, + "learning_rate": 9.80909090909091e-05, + "loss": 0.8143, + "step": 28900 + }, + { + "epoch": 0.03, + "learning_rate": 9.808080808080809e-05, + "loss": 0.8107, + "step": 29000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.8177147990802544, + "eval_average_loss_on_sentence_tokens": 0.5418528548951881, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.8052539229393005, + "eval_non_padding_tokens_in_labels": 133.5861, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40495, + "eval_padding_tokens_in_labels": 7568278.0, + "eval_reconstruction_accuracy": 0.8797695116248516, + "eval_runtime": 204.8661, + "eval_samples_per_second": 24.406, + "eval_sentence_accuracy": 0.6553824896369803, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.25, + "step": 29000 + }, + { + "epoch": 0.03, + "learning_rate": 9.807070707070708e-05, + "loss": 0.8133, + "step": 29100 + }, + { + "epoch": 0.03, + "learning_rate": 9.806060606060606e-05, + "loss": 0.8097, + "step": 29200 + }, + { + "epoch": 0.03, + "learning_rate": 9.805050505050506e-05, + "loss": 0.8049, + "step": 29300 + }, + { + "epoch": 0.03, + "learning_rate": 9.804040404040404e-05, + "loss": 0.7995, + "step": 29400 + }, + { + "epoch": 0.03, + "learning_rate": 9.803030303030303e-05, + "loss": 0.8094, + "step": 29500 + }, + { + "epoch": 0.03, + "learning_rate": 9.802020202020203e-05, + "loss": 0.8042, + "step": 29600 + }, + { + "epoch": 0.03, + "learning_rate": 9.801010101010102e-05, + "loss": 0.8008, + "step": 29700 + }, + { + "epoch": 0.03, + "learning_rate": 9.8e-05, + "loss": 0.7971, + "step": 29800 + }, + { + "epoch": 0.03, + "learning_rate": 9.7989898989899e-05, + "loss": 0.7957, + "step": 29900 + }, + { + "epoch": 0.03, + "learning_rate": 9.797979797979798e-05, + "loss": 0.7946, + "step": 30000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.8004327288663701, + "eval_average_loss_on_sentence_tokens": 0.5457024131833981, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.7889062762260437, + "eval_non_padding_tokens_in_labels": 133.53105, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3804, + "eval_padding_tokens_in_labels": 7569379.0, + "eval_reconstruction_accuracy": 0.8814366250249709, + "eval_runtime": 208.745, + "eval_samples_per_second": 23.953, + "eval_sentence_accuracy": 0.6743723868142911, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 30000 + }, + { + "epoch": 0.03, + "learning_rate": 9.796969696969697e-05, + "loss": 0.798, + "step": 30100 + }, + { + "epoch": 0.03, + "learning_rate": 9.795959595959596e-05, + "loss": 0.7912, + "step": 30200 + }, + { + "epoch": 0.03, + "learning_rate": 9.794949494949496e-05, + "loss": 0.7857, + "step": 30300 + }, + { + "epoch": 0.03, + "learning_rate": 9.793939393939394e-05, + "loss": 0.7921, + "step": 30400 + }, + { + "epoch": 0.03, + "learning_rate": 9.792929292929294e-05, + "loss": 0.7905, + "step": 30500 + }, + { + "epoch": 0.03, + "learning_rate": 9.791919191919192e-05, + "loss": 0.7959, + "step": 30600 + }, + { + "epoch": 0.03, + "learning_rate": 9.790909090909091e-05, + "loss": 0.7857, + "step": 30700 + }, + { + "epoch": 0.03, + "learning_rate": 9.78989898989899e-05, + "loss": 0.7862, + "step": 30800 + }, + { + "epoch": 0.03, + "learning_rate": 9.78888888888889e-05, + "loss": 0.7906, + "step": 30900 + }, + { + "epoch": 0.03, + "learning_rate": 9.787878787878789e-05, + "loss": 0.784, + "step": 31000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.7885547745781842, + "eval_average_loss_on_sentence_tokens": 0.5723165765280285, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.7787695527076721, + "eval_non_padding_tokens_in_labels": 133.55145, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3702, + "eval_padding_tokens_in_labels": 7568971.0, + "eval_reconstruction_accuracy": 0.8823514847778449, + "eval_runtime": 208.8354, + "eval_samples_per_second": 23.942, + "eval_sentence_accuracy": 0.6661447772175068, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.2496, + "step": 31000 + }, + { + "epoch": 0.03, + "learning_rate": 9.786868686868688e-05, + "loss": 0.7808, + "step": 31100 + }, + { + "epoch": 0.03, + "learning_rate": 9.785858585858586e-05, + "loss": 0.7808, + "step": 31200 + }, + { + "epoch": 0.03, + "learning_rate": 9.784848484848485e-05, + "loss": 0.7759, + "step": 31300 + }, + { + "epoch": 0.03, + "learning_rate": 9.783838383838384e-05, + "loss": 0.7779, + "step": 31400 + }, + { + "epoch": 0.03, + "learning_rate": 9.782828282828283e-05, + "loss": 0.7851, + "step": 31500 + }, + { + "epoch": 0.03, + "learning_rate": 9.781818181818183e-05, + "loss": 0.7795, + "step": 31600 + }, + { + "epoch": 0.03, + "learning_rate": 9.780808080808082e-05, + "loss": 0.7798, + "step": 31700 + }, + { + "epoch": 0.03, + "learning_rate": 9.77979797979798e-05, + "loss": 0.7758, + "step": 31800 + }, + { + "epoch": 0.03, + "learning_rate": 9.77878787878788e-05, + "loss": 0.774, + "step": 31900 + }, + { + "epoch": 0.03, + "learning_rate": 9.777777777777778e-05, + "loss": 0.7719, + "step": 32000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.7806003005675682, + "eval_average_loss_on_sentence_tokens": 0.5134654735470691, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.7685546875, + "eval_non_padding_tokens_in_labels": 133.5149, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3711, + "eval_padding_tokens_in_labels": 7569702.0, + "eval_reconstruction_accuracy": 0.8834029506841602, + "eval_runtime": 204.2196, + "eval_samples_per_second": 24.483, + "eval_sentence_accuracy": 0.6655660631291833, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 32000 + }, + { + "epoch": 0.03, + "learning_rate": 9.776767676767677e-05, + "loss": 0.7724, + "step": 32100 + }, + { + "epoch": 0.03, + "learning_rate": 9.775757575757576e-05, + "loss": 0.7749, + "step": 32200 + }, + { + "epoch": 0.03, + "learning_rate": 9.774747474747476e-05, + "loss": 0.7643, + "step": 32300 + }, + { + "epoch": 0.03, + "learning_rate": 9.773737373737373e-05, + "loss": 0.765, + "step": 32400 + }, + { + "epoch": 0.03, + "learning_rate": 9.772727272727274e-05, + "loss": 0.7724, + "step": 32500 + }, + { + "epoch": 0.03, + "learning_rate": 9.771717171717172e-05, + "loss": 0.7673, + "step": 32600 + }, + { + "epoch": 0.03, + "learning_rate": 9.770707070707071e-05, + "loss": 0.7667, + "step": 32700 + }, + { + "epoch": 0.03, + "learning_rate": 9.76969696969697e-05, + "loss": 0.7553, + "step": 32800 + }, + { + "epoch": 0.03, + "learning_rate": 9.76868686868687e-05, + "loss": 0.7703, + "step": 32900 + }, + { + "epoch": 0.03, + "learning_rate": 9.767676767676767e-05, + "loss": 0.7632, + "step": 33000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.7673551137595598, + "eval_average_loss_on_sentence_tokens": 0.5635621791583886, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.7581445574760437, + "eval_non_padding_tokens_in_labels": 133.51405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3702, + "eval_padding_tokens_in_labels": 7569719.0, + "eval_reconstruction_accuracy": 0.8845214484483891, + "eval_runtime": 205.1103, + "eval_samples_per_second": 24.377, + "eval_sentence_accuracy": 0.6431397706677194, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.2496, + "step": 33000 + }, + { + "epoch": 0.03, + "learning_rate": 9.766666666666668e-05, + "loss": 0.7604, + "step": 33100 + }, + { + "epoch": 0.03, + "learning_rate": 9.765656565656566e-05, + "loss": 0.7618, + "step": 33200 + }, + { + "epoch": 0.03, + "learning_rate": 9.764646464646465e-05, + "loss": 0.7625, + "step": 33300 + }, + { + "epoch": 0.03, + "learning_rate": 9.763636363636364e-05, + "loss": 0.7601, + "step": 33400 + }, + { + "epoch": 0.03, + "learning_rate": 9.762626262626263e-05, + "loss": 0.7563, + "step": 33500 + }, + { + "epoch": 0.03, + "learning_rate": 9.761616161616161e-05, + "loss": 0.7543, + "step": 33600 + }, + { + "epoch": 0.03, + "learning_rate": 9.760606060606062e-05, + "loss": 0.7551, + "step": 33700 + }, + { + "epoch": 0.03, + "learning_rate": 9.75959595959596e-05, + "loss": 0.7599, + "step": 33800 + }, + { + "epoch": 0.03, + "learning_rate": 9.758585858585859e-05, + "loss": 0.7625, + "step": 33900 + }, + { + "epoch": 0.03, + "learning_rate": 9.757575757575758e-05, + "loss": 0.7611, + "step": 34000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.7591298788485251, + "eval_average_loss_on_sentence_tokens": 0.5201102561647037, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.7483984231948853, + "eval_non_padding_tokens_in_labels": 133.5323, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38085, + "eval_padding_tokens_in_labels": 7569354.0, + "eval_reconstruction_accuracy": 0.8855346137987974, + "eval_runtime": 203.7605, + "eval_samples_per_second": 24.539, + "eval_sentence_accuracy": 0.6635831822994239, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 34000 + }, + { + "epoch": 0.03, + "learning_rate": 9.756565656565657e-05, + "loss": 0.7549, + "step": 34100 + }, + { + "epoch": 0.03, + "learning_rate": 9.755555555555555e-05, + "loss": 0.7509, + "step": 34200 + }, + { + "epoch": 0.03, + "learning_rate": 9.754545454545455e-05, + "loss": 0.7534, + "step": 34300 + }, + { + "epoch": 0.03, + "learning_rate": 9.753535353535353e-05, + "loss": 0.7508, + "step": 34400 + }, + { + "epoch": 0.03, + "learning_rate": 9.752525252525253e-05, + "loss": 0.7467, + "step": 34500 + }, + { + "epoch": 0.03, + "learning_rate": 9.751515151515152e-05, + "loss": 0.7498, + "step": 34600 + }, + { + "epoch": 0.03, + "learning_rate": 9.750505050505051e-05, + "loss": 0.7519, + "step": 34700 + }, + { + "epoch": 0.03, + "learning_rate": 9.74949494949495e-05, + "loss": 0.7447, + "step": 34800 + }, + { + "epoch": 0.03, + "learning_rate": 9.748484848484849e-05, + "loss": 0.7481, + "step": 34900 + }, + { + "epoch": 0.04, + "learning_rate": 9.747474747474747e-05, + "loss": 0.7466, + "step": 35000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7516491725525567, + "eval_average_loss_on_sentence_tokens": 0.571127758263971, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.7435156106948853, + "eval_non_padding_tokens_in_labels": 133.5492, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3798, + "eval_padding_tokens_in_labels": 7569016.0, + "eval_reconstruction_accuracy": 0.8860375992461342, + "eval_runtime": 203.4887, + "eval_samples_per_second": 24.571, + "eval_sentence_accuracy": 0.628245733665907, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 35000 + }, + { + "epoch": 0.04, + "learning_rate": 9.746464646464646e-05, + "loss": 0.7496, + "step": 35100 + }, + { + "epoch": 0.04, + "learning_rate": 9.745454545454546e-05, + "loss": 0.7456, + "step": 35200 + }, + { + "epoch": 0.04, + "learning_rate": 9.744444444444445e-05, + "loss": 0.7442, + "step": 35300 + }, + { + "epoch": 0.04, + "learning_rate": 9.743434343434344e-05, + "loss": 0.7444, + "step": 35400 + }, + { + "epoch": 0.04, + "learning_rate": 9.742424242424243e-05, + "loss": 0.744, + "step": 35500 + }, + { + "epoch": 0.04, + "learning_rate": 9.741414141414141e-05, + "loss": 0.7384, + "step": 35600 + }, + { + "epoch": 0.04, + "learning_rate": 9.740404040404042e-05, + "loss": 0.7403, + "step": 35700 + }, + { + "epoch": 0.04, + "learning_rate": 9.739393939393941e-05, + "loss": 0.7408, + "step": 35800 + }, + { + "epoch": 0.04, + "learning_rate": 9.738383838383839e-05, + "loss": 0.7347, + "step": 35900 + }, + { + "epoch": 0.04, + "learning_rate": 9.737373737373738e-05, + "loss": 0.7348, + "step": 36000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7414321005281114, + "eval_average_loss_on_sentence_tokens": 0.5743047935792722, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.7340429425239563, + "eval_non_padding_tokens_in_labels": 133.5598, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3852, + "eval_padding_tokens_in_labels": 7568804.0, + "eval_reconstruction_accuracy": 0.8869180909131024, + "eval_runtime": 209.43, + "eval_samples_per_second": 23.874, + "eval_sentence_accuracy": 0.6331984495845819, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 36000 + }, + { + "epoch": 0.04, + "learning_rate": 9.736363636363637e-05, + "loss": 0.7441, + "step": 36100 + }, + { + "epoch": 0.04, + "learning_rate": 9.735353535353536e-05, + "loss": 0.7386, + "step": 36200 + }, + { + "epoch": 0.04, + "learning_rate": 9.734343434343435e-05, + "loss": 0.738, + "step": 36300 + }, + { + "epoch": 0.04, + "learning_rate": 9.733333333333335e-05, + "loss": 0.7352, + "step": 36400 + }, + { + "epoch": 0.04, + "learning_rate": 9.732323232323232e-05, + "loss": 0.733, + "step": 36500 + }, + { + "epoch": 0.04, + "learning_rate": 9.731313131313132e-05, + "loss": 0.7388, + "step": 36600 + }, + { + "epoch": 0.04, + "learning_rate": 9.730303030303031e-05, + "loss": 0.7296, + "step": 36700 + }, + { + "epoch": 0.04, + "learning_rate": 9.72929292929293e-05, + "loss": 0.7272, + "step": 36800 + }, + { + "epoch": 0.04, + "learning_rate": 9.728282828282829e-05, + "loss": 0.7311, + "step": 36900 + }, + { + "epoch": 0.04, + "learning_rate": 9.727272727272728e-05, + "loss": 0.7297, + "step": 37000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7330537991627066, + "eval_average_loss_on_sentence_tokens": 0.5201192849073509, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.7235742211341858, + "eval_non_padding_tokens_in_labels": 133.5405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3904, + "eval_padding_tokens_in_labels": 7569190.0, + "eval_reconstruction_accuracy": 0.887653264101695, + "eval_runtime": 201.9756, + "eval_samples_per_second": 24.755, + "eval_sentence_accuracy": 0.6838964954151489, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 37000 + }, + { + "epoch": 0.04, + "learning_rate": 9.726262626262626e-05, + "loss": 0.733, + "step": 37100 + }, + { + "epoch": 0.04, + "learning_rate": 9.725252525252527e-05, + "loss": 0.7296, + "step": 37200 + }, + { + "epoch": 0.04, + "learning_rate": 9.724242424242425e-05, + "loss": 0.7315, + "step": 37300 + }, + { + "epoch": 0.04, + "learning_rate": 9.723232323232324e-05, + "loss": 0.7305, + "step": 37400 + }, + { + "epoch": 0.04, + "learning_rate": 9.722222222222223e-05, + "loss": 0.7251, + "step": 37500 + }, + { + "epoch": 0.04, + "learning_rate": 9.721212121212122e-05, + "loss": 0.7277, + "step": 37600 + }, + { + "epoch": 0.04, + "learning_rate": 9.72020202020202e-05, + "loss": 0.7255, + "step": 37700 + }, + { + "epoch": 0.04, + "learning_rate": 9.71919191919192e-05, + "loss": 0.7223, + "step": 37800 + }, + { + "epoch": 0.04, + "learning_rate": 9.718181818181818e-05, + "loss": 0.7243, + "step": 37900 + }, + { + "epoch": 0.04, + "learning_rate": 9.717171717171718e-05, + "loss": 0.7304, + "step": 38000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7284353406422213, + "eval_average_loss_on_sentence_tokens": 0.5293014823639913, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.7194921970367432, + "eval_non_padding_tokens_in_labels": 133.5347, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36275, + "eval_padding_tokens_in_labels": 7569306.0, + "eval_reconstruction_accuracy": 0.8879148338867446, + "eval_runtime": 203.8855, + "eval_samples_per_second": 24.524, + "eval_sentence_accuracy": 0.6850898128375832, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.248775, + "step": 38000 + }, + { + "epoch": 0.04, + "learning_rate": 9.716161616161617e-05, + "loss": 0.7218, + "step": 38100 + }, + { + "epoch": 0.04, + "learning_rate": 9.715151515151516e-05, + "loss": 0.7272, + "step": 38200 + }, + { + "epoch": 0.04, + "learning_rate": 9.714141414141414e-05, + "loss": 0.7212, + "step": 38300 + }, + { + "epoch": 0.04, + "learning_rate": 9.713131313131314e-05, + "loss": 0.7205, + "step": 38400 + }, + { + "epoch": 0.04, + "learning_rate": 9.712121212121212e-05, + "loss": 0.722, + "step": 38500 + }, + { + "epoch": 0.04, + "learning_rate": 9.711111111111111e-05, + "loss": 0.7187, + "step": 38600 + }, + { + "epoch": 0.04, + "learning_rate": 9.710101010101011e-05, + "loss": 0.7235, + "step": 38700 + }, + { + "epoch": 0.04, + "learning_rate": 9.70909090909091e-05, + "loss": 0.7176, + "step": 38800 + }, + { + "epoch": 0.04, + "learning_rate": 9.708080808080808e-05, + "loss": 0.7219, + "step": 38900 + }, + { + "epoch": 0.04, + "learning_rate": 9.707070707070708e-05, + "loss": 0.7164, + "step": 39000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7208739333669888, + "eval_average_loss_on_sentence_tokens": 0.5159909515090918, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.7117578387260437, + "eval_non_padding_tokens_in_labels": 133.53465, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3794, + "eval_padding_tokens_in_labels": 7569307.0, + "eval_reconstruction_accuracy": 0.8887596596627976, + "eval_runtime": 205.54, + "eval_samples_per_second": 24.326, + "eval_sentence_accuracy": 0.6782618838265114, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.2496, + "step": 39000 + }, + { + "epoch": 0.04, + "learning_rate": 9.706060606060606e-05, + "loss": 0.7167, + "step": 39100 + }, + { + "epoch": 0.04, + "learning_rate": 9.705050505050505e-05, + "loss": 0.715, + "step": 39200 + }, + { + "epoch": 0.04, + "learning_rate": 9.704040404040405e-05, + "loss": 0.7171, + "step": 39300 + }, + { + "epoch": 0.04, + "learning_rate": 9.703030303030304e-05, + "loss": 0.7117, + "step": 39400 + }, + { + "epoch": 0.04, + "learning_rate": 9.702020202020202e-05, + "loss": 0.7153, + "step": 39500 + }, + { + "epoch": 0.04, + "learning_rate": 9.701010101010102e-05, + "loss": 0.7163, + "step": 39600 + }, + { + "epoch": 0.04, + "learning_rate": 9.7e-05, + "loss": 0.7147, + "step": 39700 + }, + { + "epoch": 0.04, + "learning_rate": 9.698989898989899e-05, + "loss": 0.7126, + "step": 39800 + }, + { + "epoch": 0.04, + "learning_rate": 9.697979797979798e-05, + "loss": 0.7164, + "step": 39900 + }, + { + "epoch": 0.04, + "learning_rate": 9.696969696969698e-05, + "loss": 0.7103, + "step": 40000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7140786936035022, + "eval_average_loss_on_sentence_tokens": 0.582324143087784, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.7082226276397705, + "eval_non_padding_tokens_in_labels": 133.5027, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3884, + "eval_padding_tokens_in_labels": 7569946.0, + "eval_reconstruction_accuracy": 0.8890573084870999, + "eval_runtime": 203.3, + "eval_samples_per_second": 24.594, + "eval_sentence_accuracy": 0.6473029231790693, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 40000 + }, + { + "epoch": 0.04, + "learning_rate": 9.695959595959597e-05, + "loss": 0.7168, + "step": 40100 + }, + { + "epoch": 0.04, + "learning_rate": 9.694949494949496e-05, + "loss": 0.7091, + "step": 40200 + }, + { + "epoch": 0.04, + "learning_rate": 9.693939393939394e-05, + "loss": 0.7144, + "step": 40300 + }, + { + "epoch": 0.04, + "learning_rate": 9.692929292929293e-05, + "loss": 0.7082, + "step": 40400 + }, + { + "epoch": 0.04, + "learning_rate": 9.691919191919192e-05, + "loss": 0.7033, + "step": 40500 + }, + { + "epoch": 0.04, + "learning_rate": 9.690909090909091e-05, + "loss": 0.7082, + "step": 40600 + }, + { + "epoch": 0.04, + "learning_rate": 9.68989898989899e-05, + "loss": 0.7102, + "step": 40700 + }, + { + "epoch": 0.04, + "learning_rate": 9.68888888888889e-05, + "loss": 0.7089, + "step": 40800 + }, + { + "epoch": 0.04, + "learning_rate": 9.687878787878788e-05, + "loss": 0.7112, + "step": 40900 + }, + { + "epoch": 0.04, + "learning_rate": 9.686868686868688e-05, + "loss": 0.7068, + "step": 41000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7092113928344924, + "eval_average_loss_on_sentence_tokens": 0.5396684740383408, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.70166015625, + "eval_non_padding_tokens_in_labels": 133.53665, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38535, + "eval_padding_tokens_in_labels": 7569267.0, + "eval_reconstruction_accuracy": 0.8896830343357756, + "eval_runtime": 203.764, + "eval_samples_per_second": 24.538, + "eval_sentence_accuracy": 0.660855599619574, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2499, + "step": 41000 + }, + { + "epoch": 0.04, + "learning_rate": 9.685858585858586e-05, + "loss": 0.7044, + "step": 41100 + }, + { + "epoch": 0.04, + "learning_rate": 9.684848484848485e-05, + "loss": 0.7047, + "step": 41200 + }, + { + "epoch": 0.04, + "learning_rate": 9.683838383838384e-05, + "loss": 0.7085, + "step": 41300 + }, + { + "epoch": 0.04, + "learning_rate": 9.682828282828284e-05, + "loss": 0.7039, + "step": 41400 + }, + { + "epoch": 0.04, + "learning_rate": 9.681818181818181e-05, + "loss": 0.7082, + "step": 41500 + }, + { + "epoch": 0.04, + "learning_rate": 9.680808080808082e-05, + "loss": 0.7049, + "step": 41600 + }, + { + "epoch": 0.04, + "learning_rate": 9.67979797979798e-05, + "loss": 0.6998, + "step": 41700 + }, + { + "epoch": 0.04, + "learning_rate": 9.678787878787879e-05, + "loss": 0.704, + "step": 41800 + }, + { + "epoch": 0.04, + "learning_rate": 9.677777777777778e-05, + "loss": 0.7037, + "step": 41900 + }, + { + "epoch": 0.04, + "learning_rate": 9.676767676767677e-05, + "loss": 0.7027, + "step": 42000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.7033071933193386, + "eval_average_loss_on_sentence_tokens": 0.5232528334696775, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.6952343583106995, + "eval_non_padding_tokens_in_labels": 133.54165, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39725, + "eval_padding_tokens_in_labels": 7569167.0, + "eval_reconstruction_accuracy": 0.8901630917295262, + "eval_runtime": 218.772, + "eval_samples_per_second": 22.855, + "eval_sentence_accuracy": 0.6761578767922192, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 42000 + }, + { + "epoch": 0.04, + "learning_rate": 9.675757575757575e-05, + "loss": 0.7046, + "step": 42100 + }, + { + "epoch": 0.04, + "learning_rate": 9.674747474747476e-05, + "loss": 0.7005, + "step": 42200 + }, + { + "epoch": 0.04, + "learning_rate": 9.673737373737374e-05, + "loss": 0.6989, + "step": 42300 + }, + { + "epoch": 0.04, + "learning_rate": 9.672727272727273e-05, + "loss": 0.6982, + "step": 42400 + }, + { + "epoch": 0.04, + "learning_rate": 9.671717171717172e-05, + "loss": 0.699, + "step": 42500 + }, + { + "epoch": 0.04, + "learning_rate": 9.670707070707071e-05, + "loss": 0.6929, + "step": 42600 + }, + { + "epoch": 0.04, + "learning_rate": 9.669696969696969e-05, + "loss": 0.6956, + "step": 42700 + }, + { + "epoch": 0.04, + "learning_rate": 9.66868686868687e-05, + "loss": 0.6974, + "step": 42800 + }, + { + "epoch": 0.04, + "learning_rate": 9.667676767676768e-05, + "loss": 0.6943, + "step": 42900 + }, + { + "epoch": 0.04, + "learning_rate": 9.666666666666667e-05, + "loss": 0.6936, + "step": 43000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.6969919202250064, + "eval_average_loss_on_sentence_tokens": 0.4954935076915419, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.687792956829071, + "eval_non_padding_tokens_in_labels": 133.48765, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3791, + "eval_padding_tokens_in_labels": 7570247.0, + "eval_reconstruction_accuracy": 0.8911127843238733, + "eval_runtime": 238.7816, + "eval_samples_per_second": 20.94, + "eval_sentence_accuracy": 0.6902130026737489, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.248775, + "step": 43000 + }, + { + "epoch": 0.04, + "learning_rate": 9.665656565656566e-05, + "loss": 0.6934, + "step": 43100 + }, + { + "epoch": 0.04, + "learning_rate": 9.664646464646465e-05, + "loss": 0.6897, + "step": 43200 + }, + { + "epoch": 0.04, + "learning_rate": 9.663636363636363e-05, + "loss": 0.6903, + "step": 43300 + }, + { + "epoch": 0.04, + "learning_rate": 9.662626262626264e-05, + "loss": 0.687, + "step": 43400 + }, + { + "epoch": 0.04, + "learning_rate": 9.661616161616161e-05, + "loss": 0.6949, + "step": 43500 + }, + { + "epoch": 0.04, + "learning_rate": 9.66060606060606e-05, + "loss": 0.6879, + "step": 43600 + }, + { + "epoch": 0.04, + "learning_rate": 9.65959595959596e-05, + "loss": 0.694, + "step": 43700 + }, + { + "epoch": 0.04, + "learning_rate": 9.658585858585859e-05, + "loss": 0.6956, + "step": 43800 + }, + { + "epoch": 0.04, + "learning_rate": 9.657575757575758e-05, + "loss": 0.6914, + "step": 43900 + }, + { + "epoch": 0.04, + "learning_rate": 9.656565656565657e-05, + "loss": 0.6887, + "step": 44000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.6931096416109109, + "eval_average_loss_on_sentence_tokens": 0.5196218586738691, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.6854687333106995, + "eval_non_padding_tokens_in_labels": 133.5434, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38705, + "eval_padding_tokens_in_labels": 7569132.0, + "eval_reconstruction_accuracy": 0.8911759960833142, + "eval_runtime": 240.4421, + "eval_samples_per_second": 20.795, + "eval_sentence_accuracy": 0.6753727995406177, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 44000 + }, + { + "epoch": 0.04, + "learning_rate": 9.655555555555555e-05, + "loss": 0.6898, + "step": 44100 + }, + { + "epoch": 0.04, + "learning_rate": 9.654545454545454e-05, + "loss": 0.686, + "step": 44200 + }, + { + "epoch": 0.04, + "learning_rate": 9.653535353535355e-05, + "loss": 0.6918, + "step": 44300 + }, + { + "epoch": 0.04, + "learning_rate": 9.652525252525253e-05, + "loss": 0.6875, + "step": 44400 + }, + { + "epoch": 0.04, + "learning_rate": 9.651515151515152e-05, + "loss": 0.6874, + "step": 44500 + }, + { + "epoch": 0.04, + "learning_rate": 9.650505050505051e-05, + "loss": 0.6909, + "step": 44600 + }, + { + "epoch": 0.04, + "learning_rate": 9.64949494949495e-05, + "loss": 0.6843, + "step": 44700 + }, + { + "epoch": 0.04, + "learning_rate": 9.64848484848485e-05, + "loss": 0.6867, + "step": 44800 + }, + { + "epoch": 0.04, + "learning_rate": 9.647474747474749e-05, + "loss": 0.6849, + "step": 44900 + }, + { + "epoch": 0.04, + "learning_rate": 9.646464646464647e-05, + "loss": 0.6881, + "step": 45000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.687252096599764, + "eval_average_loss_on_sentence_tokens": 0.5557861999151636, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.6812695264816284, + "eval_non_padding_tokens_in_labels": 133.546, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38975, + "eval_padding_tokens_in_labels": 7569080.0, + "eval_reconstruction_accuracy": 0.8916787199240944, + "eval_runtime": 264.1124, + "eval_samples_per_second": 18.931, + "eval_sentence_accuracy": 0.6427584474312272, + "eval_steps_per_second": 0.049, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 45000 + }, + { + "epoch": 0.05, + "learning_rate": 9.645454545454546e-05, + "loss": 0.6839, + "step": 45100 + }, + { + "epoch": 0.05, + "learning_rate": 9.644444444444445e-05, + "loss": 0.684, + "step": 45200 + }, + { + "epoch": 0.05, + "learning_rate": 9.643434343434344e-05, + "loss": 0.6846, + "step": 45300 + }, + { + "epoch": 0.05, + "learning_rate": 9.642424242424243e-05, + "loss": 0.6803, + "step": 45400 + }, + { + "epoch": 0.05, + "learning_rate": 9.641414141414143e-05, + "loss": 0.6836, + "step": 45500 + }, + { + "epoch": 0.05, + "learning_rate": 9.64040404040404e-05, + "loss": 0.6812, + "step": 45600 + }, + { + "epoch": 0.05, + "learning_rate": 9.63939393939394e-05, + "loss": 0.6827, + "step": 45700 + }, + { + "epoch": 0.05, + "learning_rate": 9.638383838383839e-05, + "loss": 0.6838, + "step": 45800 + }, + { + "epoch": 0.05, + "learning_rate": 9.637373737373738e-05, + "loss": 0.687, + "step": 45900 + }, + { + "epoch": 0.05, + "learning_rate": 9.636363636363637e-05, + "loss": 0.6775, + "step": 46000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.6833730743732579, + "eval_average_loss_on_sentence_tokens": 0.5192337241865881, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.6759570240974426, + "eval_non_padding_tokens_in_labels": 133.537, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36575, + "eval_padding_tokens_in_labels": 7569260.0, + "eval_reconstruction_accuracy": 0.8920748148278302, + "eval_runtime": 238.564, + "eval_samples_per_second": 20.959, + "eval_sentence_accuracy": 0.6717434995603567, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 46000 + }, + { + "epoch": 0.05, + "learning_rate": 9.635353535353536e-05, + "loss": 0.6783, + "step": 46100 + }, + { + "epoch": 0.05, + "learning_rate": 9.634343434343434e-05, + "loss": 0.6818, + "step": 46200 + }, + { + "epoch": 0.05, + "learning_rate": 9.633333333333335e-05, + "loss": 0.6786, + "step": 46300 + }, + { + "epoch": 0.05, + "learning_rate": 9.632323232323233e-05, + "loss": 0.6744, + "step": 46400 + }, + { + "epoch": 0.05, + "learning_rate": 9.631313131313132e-05, + "loss": 0.6793, + "step": 46500 + }, + { + "epoch": 0.05, + "learning_rate": 9.630303030303031e-05, + "loss": 0.6771, + "step": 46600 + }, + { + "epoch": 0.05, + "learning_rate": 9.62929292929293e-05, + "loss": 0.677, + "step": 46700 + }, + { + "epoch": 0.05, + "learning_rate": 9.628282828282828e-05, + "loss": 0.678, + "step": 46800 + }, + { + "epoch": 0.05, + "learning_rate": 9.627272727272729e-05, + "loss": 0.675, + "step": 46900 + }, + { + "epoch": 0.05, + "learning_rate": 9.626262626262627e-05, + "loss": 0.6765, + "step": 47000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.6769700450165967, + "eval_average_loss_on_sentence_tokens": 0.5605648056771111, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.6716406345367432, + "eval_non_padding_tokens_in_labels": 133.49725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3745, + "eval_padding_tokens_in_labels": 7570055.0, + "eval_reconstruction_accuracy": 0.8925044198851454, + "eval_runtime": 214.0798, + "eval_samples_per_second": 23.356, + "eval_sentence_accuracy": 0.6544269384678881, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 47000 + }, + { + "epoch": 0.05, + "learning_rate": 9.625252525252526e-05, + "loss": 0.6766, + "step": 47100 + }, + { + "epoch": 0.05, + "learning_rate": 9.624242424242425e-05, + "loss": 0.6768, + "step": 47200 + }, + { + "epoch": 0.05, + "learning_rate": 9.623232323232324e-05, + "loss": 0.6745, + "step": 47300 + }, + { + "epoch": 0.05, + "learning_rate": 9.622222222222222e-05, + "loss": 0.6769, + "step": 47400 + }, + { + "epoch": 0.05, + "learning_rate": 9.621212121212123e-05, + "loss": 0.6797, + "step": 47500 + }, + { + "epoch": 0.05, + "learning_rate": 9.62020202020202e-05, + "loss": 0.674, + "step": 47600 + }, + { + "epoch": 0.05, + "learning_rate": 9.61919191919192e-05, + "loss": 0.6704, + "step": 47700 + }, + { + "epoch": 0.05, + "learning_rate": 9.618181818181819e-05, + "loss": 0.6665, + "step": 47800 + }, + { + "epoch": 0.05, + "learning_rate": 9.617171717171718e-05, + "loss": 0.6709, + "step": 47900 + }, + { + "epoch": 0.05, + "learning_rate": 9.616161616161616e-05, + "loss": 0.6688, + "step": 48000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.6725928838142679, + "eval_average_loss_on_sentence_tokens": 0.5364871779680824, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.6664257645606995, + "eval_non_padding_tokens_in_labels": 133.51045, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36745, + "eval_padding_tokens_in_labels": 7569791.0, + "eval_reconstruction_accuracy": 0.892844239296409, + "eval_runtime": 219.5926, + "eval_samples_per_second": 22.769, + "eval_sentence_accuracy": 0.6750946578857645, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.25, + "step": 48000 + }, + { + "epoch": 0.0, + "learning_rate": 9.615151515151516e-05, + "loss": 0.6707, + "step": 48100 + }, + { + "epoch": 0.0, + "learning_rate": 9.614141414141414e-05, + "loss": 0.67, + "step": 48200 + }, + { + "epoch": 0.0, + "learning_rate": 9.613131313131313e-05, + "loss": 0.6662, + "step": 48300 + }, + { + "epoch": 0.0, + "learning_rate": 9.612121212121213e-05, + "loss": 0.6698, + "step": 48400 + }, + { + "epoch": 0.0, + "learning_rate": 9.611111111111112e-05, + "loss": 0.6699, + "step": 48500 + }, + { + "epoch": 0.0, + "learning_rate": 9.61010101010101e-05, + "loss": 0.6689, + "step": 48600 + }, + { + "epoch": 0.0, + "learning_rate": 9.60909090909091e-05, + "loss": 0.6696, + "step": 48700 + }, + { + "epoch": 0.0, + "learning_rate": 9.608080808080808e-05, + "loss": 0.6713, + "step": 48800 + }, + { + "epoch": 0.0, + "learning_rate": 9.607070707070707e-05, + "loss": 0.6691, + "step": 48900 + }, + { + "epoch": 0.0, + "learning_rate": 9.606060606060606e-05, + "loss": 0.6639, + "step": 49000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.6675239446158762, + "eval_average_loss_on_sentence_tokens": 0.5613191960131236, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.6626562476158142, + "eval_non_padding_tokens_in_labels": 133.503, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37415, + "eval_padding_tokens_in_labels": 7569940.0, + "eval_reconstruction_accuracy": 0.8932897204714925, + "eval_runtime": 179.5348, + "eval_samples_per_second": 27.85, + "eval_sentence_accuracy": 0.6674323039101333, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 49000 + }, + { + "epoch": 0.0, + "learning_rate": 9.605050505050506e-05, + "loss": 0.6687, + "step": 49100 + }, + { + "epoch": 0.0, + "learning_rate": 9.604040404040405e-05, + "loss": 0.6655, + "step": 49200 + }, + { + "epoch": 0.0, + "learning_rate": 9.603030303030304e-05, + "loss": 0.6633, + "step": 49300 + }, + { + "epoch": 0.0, + "learning_rate": 9.602020202020202e-05, + "loss": 0.6622, + "step": 49400 + }, + { + "epoch": 0.0, + "learning_rate": 9.601010101010101e-05, + "loss": 0.6631, + "step": 49500 + }, + { + "epoch": 0.0, + "learning_rate": 9.6e-05, + "loss": 0.6666, + "step": 49600 + }, + { + "epoch": 0.0, + "learning_rate": 9.5989898989899e-05, + "loss": 0.662, + "step": 49700 + }, + { + "epoch": 0.0, + "learning_rate": 9.597979797979799e-05, + "loss": 0.6619, + "step": 49800 + }, + { + "epoch": 0.0, + "learning_rate": 9.596969696969698e-05, + "loss": 0.6712, + "step": 49900 + }, + { + "epoch": 0.0, + "learning_rate": 9.595959595959596e-05, + "loss": 0.6604, + "step": 50000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.6659570246480043, + "eval_average_loss_on_sentence_tokens": 0.46263787938303186, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.65673828125, + "eval_non_padding_tokens_in_labels": 133.5366, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36705, + "eval_padding_tokens_in_labels": 7569268.0, + "eval_reconstruction_accuracy": 0.8935328802930887, + "eval_runtime": 185.5081, + "eval_samples_per_second": 26.953, + "eval_sentence_accuracy": 0.7075519945448346, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "step": 50000 + }, + { + "epoch": 0.0, + "learning_rate": 9.594949494949496e-05, + "loss": 0.6599, + "step": 50100 + }, + { + "epoch": 0.0, + "learning_rate": 9.593939393939394e-05, + "loss": 0.662, + "step": 50200 + }, + { + "epoch": 0.0, + "learning_rate": 9.592929292929293e-05, + "loss": 0.6603, + "step": 50300 + }, + { + "epoch": 0.0, + "learning_rate": 9.591919191919192e-05, + "loss": 0.6604, + "step": 50400 + }, + { + "epoch": 0.0, + "learning_rate": 9.590909090909092e-05, + "loss": 0.6632, + "step": 50500 + }, + { + "epoch": 0.0, + "learning_rate": 9.58989898989899e-05, + "loss": 0.663, + "step": 50600 + }, + { + "epoch": 0.0, + "learning_rate": 9.58888888888889e-05, + "loss": 0.6636, + "step": 50700 + }, + { + "epoch": 0.0, + "learning_rate": 9.587878787878788e-05, + "loss": 0.6626, + "step": 50800 + }, + { + "epoch": 0.0, + "learning_rate": 9.586868686868687e-05, + "loss": 0.6628, + "step": 50900 + }, + { + "epoch": 0.0, + "learning_rate": 9.585858585858586e-05, + "loss": 0.6625, + "step": 51000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.6592942080830513, + "eval_average_loss_on_sentence_tokens": 0.49809892553228324, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.6520312428474426, + "eval_non_padding_tokens_in_labels": 133.53045, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39205, + "eval_padding_tokens_in_labels": 7569391.0, + "eval_reconstruction_accuracy": 0.8942247935907379, + "eval_runtime": 179.7347, + "eval_samples_per_second": 27.819, + "eval_sentence_accuracy": 0.7060356739103127, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 51000 + }, + { + "epoch": 0.0, + "learning_rate": 9.584848484848486e-05, + "loss": 0.6611, + "step": 51100 + }, + { + "epoch": 0.0, + "learning_rate": 9.583838383838383e-05, + "loss": 0.6606, + "step": 51200 + }, + { + "epoch": 0.0, + "learning_rate": 9.582828282828284e-05, + "loss": 0.6552, + "step": 51300 + }, + { + "epoch": 0.0, + "learning_rate": 9.581818181818182e-05, + "loss": 0.6596, + "step": 51400 + }, + { + "epoch": 0.0, + "learning_rate": 9.580808080808081e-05, + "loss": 0.6549, + "step": 51500 + }, + { + "epoch": 0.0, + "learning_rate": 9.57979797979798e-05, + "loss": 0.6598, + "step": 51600 + }, + { + "epoch": 0.0, + "learning_rate": 9.57878787878788e-05, + "loss": 0.6547, + "step": 51700 + }, + { + "epoch": 0.0, + "learning_rate": 9.577777777777777e-05, + "loss": 0.6555, + "step": 51800 + }, + { + "epoch": 0.0, + "learning_rate": 9.576767676767678e-05, + "loss": 0.6518, + "step": 51900 + }, + { + "epoch": 0.0, + "learning_rate": 9.575757575757576e-05, + "loss": 0.6524, + "step": 52000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.6564306158485541, + "eval_average_loss_on_sentence_tokens": 0.5215322662238728, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.6502734422683716, + "eval_non_padding_tokens_in_labels": 133.53945, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3867, + "eval_padding_tokens_in_labels": 7569211.0, + "eval_reconstruction_accuracy": 0.8942836437317373, + "eval_runtime": 187.4864, + "eval_samples_per_second": 26.669, + "eval_sentence_accuracy": 0.682667288746927, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.25, + "step": 52000 + }, + { + "epoch": 0.0, + "learning_rate": 9.574747474747475e-05, + "loss": 0.6583, + "step": 52100 + }, + { + "epoch": 0.0, + "learning_rate": 9.573737373737374e-05, + "loss": 0.6515, + "step": 52200 + }, + { + "epoch": 0.0, + "learning_rate": 9.572727272727273e-05, + "loss": 0.6572, + "step": 52300 + }, + { + "epoch": 0.0, + "learning_rate": 9.571717171717171e-05, + "loss": 0.655, + "step": 52400 + }, + { + "epoch": 0.0, + "learning_rate": 9.570707070707072e-05, + "loss": 0.6518, + "step": 52500 + }, + { + "epoch": 0.0, + "learning_rate": 9.56969696969697e-05, + "loss": 0.6547, + "step": 52600 + }, + { + "epoch": 0.0, + "learning_rate": 9.568686868686869e-05, + "loss": 0.6515, + "step": 52700 + }, + { + "epoch": 0.0, + "learning_rate": 9.567676767676769e-05, + "loss": 0.6525, + "step": 52800 + }, + { + "epoch": 0.0, + "learning_rate": 9.566666666666667e-05, + "loss": 0.6509, + "step": 52900 + }, + { + "epoch": 0.01, + "learning_rate": 9.565656565656566e-05, + "loss": 0.6506, + "step": 53000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6534389729678094, + "eval_average_loss_on_sentence_tokens": 0.5322629099522579, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.6479687690734863, + "eval_non_padding_tokens_in_labels": 133.54535, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38095, + "eval_padding_tokens_in_labels": 7569093.0, + "eval_reconstruction_accuracy": 0.8947718050774054, + "eval_runtime": 180.4639, + "eval_samples_per_second": 27.706, + "eval_sentence_accuracy": 0.6625693111059271, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.248775, + "step": 53000 + }, + { + "epoch": 0.01, + "learning_rate": 9.564646464646465e-05, + "loss": 0.6503, + "step": 53100 + }, + { + "epoch": 0.01, + "learning_rate": 9.563636363636365e-05, + "loss": 0.6556, + "step": 53200 + }, + { + "epoch": 0.01, + "learning_rate": 9.562626262626262e-05, + "loss": 0.6524, + "step": 53300 + }, + { + "epoch": 0.01, + "learning_rate": 9.561616161616163e-05, + "loss": 0.65, + "step": 53400 + }, + { + "epoch": 0.01, + "learning_rate": 9.560606060606061e-05, + "loss": 0.649, + "step": 53500 + }, + { + "epoch": 0.01, + "learning_rate": 9.55959595959596e-05, + "loss": 0.6499, + "step": 53600 + }, + { + "epoch": 0.01, + "learning_rate": 9.558585858585859e-05, + "loss": 0.644, + "step": 53700 + }, + { + "epoch": 0.01, + "learning_rate": 9.557575757575758e-05, + "loss": 0.6499, + "step": 53800 + }, + { + "epoch": 0.01, + "learning_rate": 9.556565656565656e-05, + "loss": 0.6491, + "step": 53900 + }, + { + "epoch": 0.01, + "learning_rate": 9.555555555555557e-05, + "loss": 0.6483, + "step": 54000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6492407957310855, + "eval_average_loss_on_sentence_tokens": 0.4905874174360373, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.6422070264816284, + "eval_non_padding_tokens_in_labels": 133.5083, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37175, + "eval_padding_tokens_in_labels": 7569834.0, + "eval_reconstruction_accuracy": 0.8948583715564273, + "eval_runtime": 178.6848, + "eval_samples_per_second": 27.982, + "eval_sentence_accuracy": 0.7195210580149658, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 54000 + }, + { + "epoch": 0.01, + "learning_rate": 9.554545454545455e-05, + "loss": 0.6432, + "step": 54100 + }, + { + "epoch": 0.01, + "learning_rate": 9.553535353535354e-05, + "loss": 0.6471, + "step": 54200 + }, + { + "epoch": 0.01, + "learning_rate": 9.552525252525253e-05, + "loss": 0.6452, + "step": 54300 + }, + { + "epoch": 0.01, + "learning_rate": 9.551515151515152e-05, + "loss": 0.645, + "step": 54400 + }, + { + "epoch": 0.01, + "learning_rate": 9.550505050505051e-05, + "loss": 0.6494, + "step": 54500 + }, + { + "epoch": 0.01, + "learning_rate": 9.54949494949495e-05, + "loss": 0.645, + "step": 54600 + }, + { + "epoch": 0.01, + "learning_rate": 9.548484848484849e-05, + "loss": 0.6471, + "step": 54700 + }, + { + "epoch": 0.01, + "learning_rate": 9.547474747474748e-05, + "loss": 0.6472, + "step": 54800 + }, + { + "epoch": 0.01, + "learning_rate": 9.546464646464647e-05, + "loss": 0.6406, + "step": 54900 + }, + { + "epoch": 0.01, + "learning_rate": 9.545454545454546e-05, + "loss": 0.6452, + "step": 55000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6472917091538516, + "eval_average_loss_on_sentence_tokens": 0.505953965450823, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.6408398151397705, + "eval_non_padding_tokens_in_labels": 133.56435, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3837, + "eval_padding_tokens_in_labels": 7568713.0, + "eval_reconstruction_accuracy": 0.8953583725282072, + "eval_runtime": 180.2709, + "eval_samples_per_second": 27.736, + "eval_sentence_accuracy": 0.6841925816928958, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 55000 + }, + { + "epoch": 0.01, + "learning_rate": 9.544444444444445e-05, + "loss": 0.6464, + "step": 55100 + }, + { + "epoch": 0.01, + "learning_rate": 9.543434343434344e-05, + "loss": 0.6448, + "step": 55200 + }, + { + "epoch": 0.01, + "learning_rate": 9.542424242424242e-05, + "loss": 0.6428, + "step": 55300 + }, + { + "epoch": 0.01, + "learning_rate": 9.541414141414143e-05, + "loss": 0.6446, + "step": 55400 + }, + { + "epoch": 0.01, + "learning_rate": 9.540404040404041e-05, + "loss": 0.6435, + "step": 55500 + }, + { + "epoch": 0.01, + "learning_rate": 9.53939393939394e-05, + "loss": 0.6392, + "step": 55600 + }, + { + "epoch": 0.01, + "learning_rate": 9.538383838383839e-05, + "loss": 0.6405, + "step": 55700 + }, + { + "epoch": 0.01, + "learning_rate": 9.537373737373738e-05, + "loss": 0.6461, + "step": 55800 + }, + { + "epoch": 0.01, + "learning_rate": 9.536363636363636e-05, + "loss": 0.6471, + "step": 55900 + }, + { + "epoch": 0.01, + "learning_rate": 9.535353535353537e-05, + "loss": 0.6415, + "step": 56000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6439023794050945, + "eval_average_loss_on_sentence_tokens": 0.44112799903044153, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.6346484422683716, + "eval_non_padding_tokens_in_labels": 133.51005, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36545, + "eval_padding_tokens_in_labels": 7569799.0, + "eval_reconstruction_accuracy": 0.895817900686098, + "eval_runtime": 185.7413, + "eval_samples_per_second": 26.919, + "eval_sentence_accuracy": 0.7292021820661438, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2464, + "step": 56000 + }, + { + "epoch": 0.01, + "learning_rate": 9.534343434343435e-05, + "loss": 0.6432, + "step": 56100 + }, + { + "epoch": 0.01, + "learning_rate": 9.533333333333334e-05, + "loss": 0.6411, + "step": 56200 + }, + { + "epoch": 0.01, + "learning_rate": 9.532323232323233e-05, + "loss": 0.6412, + "step": 56300 + }, + { + "epoch": 0.01, + "learning_rate": 9.531313131313132e-05, + "loss": 0.6387, + "step": 56400 + }, + { + "epoch": 0.01, + "learning_rate": 9.53030303030303e-05, + "loss": 0.6376, + "step": 56500 + }, + { + "epoch": 0.01, + "learning_rate": 9.52929292929293e-05, + "loss": 0.6362, + "step": 56600 + }, + { + "epoch": 0.01, + "learning_rate": 9.528282828282828e-05, + "loss": 0.6422, + "step": 56700 + }, + { + "epoch": 0.01, + "learning_rate": 9.527272727272728e-05, + "loss": 0.6451, + "step": 56800 + }, + { + "epoch": 0.01, + "learning_rate": 9.526262626262627e-05, + "loss": 0.6404, + "step": 56900 + }, + { + "epoch": 0.01, + "learning_rate": 9.525252525252526e-05, + "loss": 0.6373, + "step": 57000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6396743783983692, + "eval_average_loss_on_sentence_tokens": 0.5001186924595188, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.6334179639816284, + "eval_non_padding_tokens_in_labels": 133.5231, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37345, + "eval_padding_tokens_in_labels": 7569538.0, + "eval_reconstruction_accuracy": 0.8960700602058055, + "eval_runtime": 185.3067, + "eval_samples_per_second": 26.982, + "eval_sentence_accuracy": 0.7014911981624706, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2496, + "step": 57000 + }, + { + "epoch": 0.01, + "learning_rate": 9.524242424242424e-05, + "loss": 0.6426, + "step": 57100 + }, + { + "epoch": 0.01, + "learning_rate": 9.523232323232324e-05, + "loss": 0.6368, + "step": 57200 + }, + { + "epoch": 0.01, + "learning_rate": 9.522222222222222e-05, + "loss": 0.6346, + "step": 57300 + }, + { + "epoch": 0.01, + "learning_rate": 9.521212121212121e-05, + "loss": 0.6367, + "step": 57400 + }, + { + "epoch": 0.01, + "learning_rate": 9.52020202020202e-05, + "loss": 0.6343, + "step": 57500 + }, + { + "epoch": 0.01, + "learning_rate": 9.51919191919192e-05, + "loss": 0.6386, + "step": 57600 + }, + { + "epoch": 0.01, + "learning_rate": 9.518181818181818e-05, + "loss": 0.6359, + "step": 57700 + }, + { + "epoch": 0.01, + "learning_rate": 9.517171717171718e-05, + "loss": 0.6335, + "step": 57800 + }, + { + "epoch": 0.01, + "learning_rate": 9.516161616161616e-05, + "loss": 0.6328, + "step": 57900 + }, + { + "epoch": 0.01, + "learning_rate": 9.515151515151515e-05, + "loss": 0.6384, + "step": 58000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6370066648222876, + "eval_average_loss_on_sentence_tokens": 0.4976548263667961, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.6307617425918579, + "eval_non_padding_tokens_in_labels": 133.50725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3654, + "eval_padding_tokens_in_labels": 7569855.0, + "eval_reconstruction_accuracy": 0.8963164494824748, + "eval_runtime": 179.2769, + "eval_samples_per_second": 27.89, + "eval_sentence_accuracy": 0.6865343549805301, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 58000 + }, + { + "epoch": 0.01, + "learning_rate": 9.514141414141414e-05, + "loss": 0.6317, + "step": 58100 + }, + { + "epoch": 0.01, + "learning_rate": 9.513131313131314e-05, + "loss": 0.6358, + "step": 58200 + }, + { + "epoch": 0.01, + "learning_rate": 9.512121212121213e-05, + "loss": 0.6296, + "step": 58300 + }, + { + "epoch": 0.01, + "learning_rate": 9.511111111111112e-05, + "loss": 0.63, + "step": 58400 + }, + { + "epoch": 0.01, + "learning_rate": 9.51010101010101e-05, + "loss": 0.6361, + "step": 58500 + }, + { + "epoch": 0.01, + "learning_rate": 9.509090909090909e-05, + "loss": 0.6351, + "step": 58600 + }, + { + "epoch": 0.01, + "learning_rate": 9.508080808080808e-05, + "loss": 0.6323, + "step": 58700 + }, + { + "epoch": 0.01, + "learning_rate": 9.507070707070707e-05, + "loss": 0.634, + "step": 58800 + }, + { + "epoch": 0.01, + "learning_rate": 9.506060606060607e-05, + "loss": 0.6295, + "step": 58900 + }, + { + "epoch": 0.01, + "learning_rate": 9.505050505050506e-05, + "loss": 0.6341, + "step": 59000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6346608265773294, + "eval_average_loss_on_sentence_tokens": 0.46142623564598995, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.6268359422683716, + "eval_non_padding_tokens_in_labels": 133.5453, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37175, + "eval_padding_tokens_in_labels": 7569094.0, + "eval_reconstruction_accuracy": 0.8964981649524182, + "eval_runtime": 183.1012, + "eval_samples_per_second": 27.307, + "eval_sentence_accuracy": 0.7135140955012831, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.248775, + "step": 59000 + }, + { + "epoch": 0.01, + "learning_rate": 9.504040404040404e-05, + "loss": 0.6294, + "step": 59100 + }, + { + "epoch": 0.01, + "learning_rate": 9.503030303030304e-05, + "loss": 0.6277, + "step": 59200 + }, + { + "epoch": 0.01, + "learning_rate": 9.502020202020202e-05, + "loss": 0.6339, + "step": 59300 + }, + { + "epoch": 0.01, + "learning_rate": 9.501010101010101e-05, + "loss": 0.6325, + "step": 59400 + }, + { + "epoch": 0.01, + "learning_rate": 9.5e-05, + "loss": 0.6292, + "step": 59500 + }, + { + "epoch": 0.01, + "learning_rate": 9.4989898989899e-05, + "loss": 0.629, + "step": 59600 + }, + { + "epoch": 0.01, + "learning_rate": 9.497979797979798e-05, + "loss": 0.6311, + "step": 59700 + }, + { + "epoch": 0.01, + "learning_rate": 9.496969696969698e-05, + "loss": 0.6224, + "step": 59800 + }, + { + "epoch": 0.01, + "learning_rate": 9.495959595959596e-05, + "loss": 0.6252, + "step": 59900 + }, + { + "epoch": 0.01, + "learning_rate": 9.494949494949495e-05, + "loss": 0.6277, + "step": 60000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6303119586430661, + "eval_average_loss_on_sentence_tokens": 0.5120977373903295, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.6249804496765137, + "eval_non_padding_tokens_in_labels": 133.5021, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3733, + "eval_padding_tokens_in_labels": 7569958.0, + "eval_reconstruction_accuracy": 0.8970161946003623, + "eval_runtime": 184.2832, + "eval_samples_per_second": 27.132, + "eval_sentence_accuracy": 0.6788271394476645, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 60000 + }, + { + "epoch": 0.01, + "learning_rate": 9.493939393939394e-05, + "loss": 0.629, + "step": 60100 + }, + { + "epoch": 0.01, + "learning_rate": 9.492929292929294e-05, + "loss": 0.6272, + "step": 60200 + }, + { + "epoch": 0.01, + "learning_rate": 9.491919191919191e-05, + "loss": 0.6297, + "step": 60300 + }, + { + "epoch": 0.01, + "learning_rate": 9.490909090909092e-05, + "loss": 0.6289, + "step": 60400 + }, + { + "epoch": 0.01, + "learning_rate": 9.48989898989899e-05, + "loss": 0.6294, + "step": 60500 + }, + { + "epoch": 0.01, + "learning_rate": 9.488888888888889e-05, + "loss": 0.6271, + "step": 60600 + }, + { + "epoch": 0.01, + "learning_rate": 9.487878787878788e-05, + "loss": 0.6297, + "step": 60700 + }, + { + "epoch": 0.01, + "learning_rate": 9.486868686868687e-05, + "loss": 0.6243, + "step": 60800 + }, + { + "epoch": 0.01, + "learning_rate": 9.485858585858585e-05, + "loss": 0.6271, + "step": 60900 + }, + { + "epoch": 0.01, + "learning_rate": 9.484848484848486e-05, + "loss": 0.6267, + "step": 61000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6288353472144947, + "eval_average_loss_on_sentence_tokens": 0.43980462883008054, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.6202929615974426, + "eval_non_padding_tokens_in_labels": 133.53795, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36535, + "eval_padding_tokens_in_labels": 7569241.0, + "eval_reconstruction_accuracy": 0.8970406214465344, + "eval_runtime": 184.1535, + "eval_samples_per_second": 27.151, + "eval_sentence_accuracy": 0.734504818131247, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.245775, + "step": 61000 + }, + { + "epoch": 0.01, + "learning_rate": 9.483838383838384e-05, + "loss": 0.6296, + "step": 61100 + }, + { + "epoch": 0.01, + "learning_rate": 9.482828282828283e-05, + "loss": 0.6274, + "step": 61200 + }, + { + "epoch": 0.01, + "learning_rate": 9.481818181818183e-05, + "loss": 0.625, + "step": 61300 + }, + { + "epoch": 0.01, + "learning_rate": 9.480808080808081e-05, + "loss": 0.629, + "step": 61400 + }, + { + "epoch": 0.01, + "learning_rate": 9.47979797979798e-05, + "loss": 0.6275, + "step": 61500 + }, + { + "epoch": 0.01, + "learning_rate": 9.47878787878788e-05, + "loss": 0.6204, + "step": 61600 + }, + { + "epoch": 0.01, + "learning_rate": 9.477777777777779e-05, + "loss": 0.6249, + "step": 61700 + }, + { + "epoch": 0.01, + "learning_rate": 9.476767676767677e-05, + "loss": 0.6268, + "step": 61800 + }, + { + "epoch": 0.01, + "learning_rate": 9.475757575757577e-05, + "loss": 0.6198, + "step": 61900 + }, + { + "epoch": 0.01, + "learning_rate": 9.474747474747475e-05, + "loss": 0.6167, + "step": 62000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6251375608949865, + "eval_average_loss_on_sentence_tokens": 0.5097754230456255, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.6199511885643005, + "eval_non_padding_tokens_in_labels": 133.53405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3768, + "eval_padding_tokens_in_labels": 7569319.0, + "eval_reconstruction_accuracy": 0.8972929661285823, + "eval_runtime": 186.9276, + "eval_samples_per_second": 26.748, + "eval_sentence_accuracy": 0.6917831571769519, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 62000 + }, + { + "epoch": 0.01, + "learning_rate": 9.473737373737374e-05, + "loss": 0.6196, + "step": 62100 + }, + { + "epoch": 0.01, + "learning_rate": 9.472727272727273e-05, + "loss": 0.6238, + "step": 62200 + }, + { + "epoch": 0.01, + "learning_rate": 9.471717171717173e-05, + "loss": 0.6256, + "step": 62300 + }, + { + "epoch": 0.01, + "learning_rate": 9.47070707070707e-05, + "loss": 0.6201, + "step": 62400 + }, + { + "epoch": 0.01, + "learning_rate": 9.469696969696971e-05, + "loss": 0.6239, + "step": 62500 + }, + { + "epoch": 0.01, + "learning_rate": 9.468686868686869e-05, + "loss": 0.6285, + "step": 62600 + }, + { + "epoch": 0.01, + "learning_rate": 9.467676767676768e-05, + "loss": 0.6267, + "step": 62700 + }, + { + "epoch": 0.01, + "learning_rate": 9.466666666666667e-05, + "loss": 0.618, + "step": 62800 + }, + { + "epoch": 0.01, + "learning_rate": 9.465656565656566e-05, + "loss": 0.6207, + "step": 62900 + }, + { + "epoch": 0.01, + "learning_rate": 9.464646464646464e-05, + "loss": 0.6248, + "step": 63000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6237066170559634, + "eval_average_loss_on_sentence_tokens": 0.4709436801236187, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.616894543170929, + "eval_non_padding_tokens_in_labels": 133.50025, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3527, + "eval_padding_tokens_in_labels": 7569995.0, + "eval_reconstruction_accuracy": 0.8976529802773751, + "eval_runtime": 184.4409, + "eval_samples_per_second": 27.109, + "eval_sentence_accuracy": 0.7030254634198863, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 63000 + }, + { + "epoch": 0.02, + "learning_rate": 9.463636363636365e-05, + "loss": 0.626, + "step": 63100 + }, + { + "epoch": 0.02, + "learning_rate": 9.462626262626263e-05, + "loss": 0.6204, + "step": 63200 + }, + { + "epoch": 0.02, + "learning_rate": 9.461616161616162e-05, + "loss": 0.6184, + "step": 63300 + }, + { + "epoch": 0.02, + "learning_rate": 9.460606060606061e-05, + "loss": 0.6215, + "step": 63400 + }, + { + "epoch": 0.02, + "learning_rate": 9.45959595959596e-05, + "loss": 0.6206, + "step": 63500 + }, + { + "epoch": 0.02, + "learning_rate": 9.45858585858586e-05, + "loss": 0.6225, + "step": 63600 + }, + { + "epoch": 0.02, + "learning_rate": 9.457575757575759e-05, + "loss": 0.624, + "step": 63700 + }, + { + "epoch": 0.02, + "learning_rate": 9.456565656565657e-05, + "loss": 0.6145, + "step": 63800 + }, + { + "epoch": 0.02, + "learning_rate": 9.455555555555556e-05, + "loss": 0.6177, + "step": 63900 + }, + { + "epoch": 0.02, + "learning_rate": 9.454545454545455e-05, + "loss": 0.6198, + "step": 64000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.6203108429710446, + "eval_average_loss_on_sentence_tokens": 0.531143616388135, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.6163867115974426, + "eval_non_padding_tokens_in_labels": 133.55505, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3923, + "eval_padding_tokens_in_labels": 7568899.0, + "eval_reconstruction_accuracy": 0.8978317568724427, + "eval_runtime": 179.4559, + "eval_samples_per_second": 27.862, + "eval_sentence_accuracy": 0.6762520860624114, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.248775, + "step": 64000 + }, + { + "epoch": 0.02, + "learning_rate": 9.453535353535354e-05, + "loss": 0.6196, + "step": 64100 + }, + { + "epoch": 0.02, + "learning_rate": 9.452525252525253e-05, + "loss": 0.6191, + "step": 64200 + }, + { + "epoch": 0.02, + "learning_rate": 9.451515151515153e-05, + "loss": 0.617, + "step": 64300 + }, + { + "epoch": 0.02, + "learning_rate": 9.45050505050505e-05, + "loss": 0.6211, + "step": 64400 + }, + { + "epoch": 0.02, + "learning_rate": 9.449494949494951e-05, + "loss": 0.6147, + "step": 64500 + }, + { + "epoch": 0.02, + "learning_rate": 9.448484848484849e-05, + "loss": 0.6161, + "step": 64600 + }, + { + "epoch": 0.02, + "learning_rate": 9.447474747474748e-05, + "loss": 0.6163, + "step": 64700 + }, + { + "epoch": 0.02, + "learning_rate": 9.446464646464647e-05, + "loss": 0.6155, + "step": 64800 + }, + { + "epoch": 0.02, + "learning_rate": 9.445454545454546e-05, + "loss": 0.6211, + "step": 64900 + }, + { + "epoch": 0.02, + "learning_rate": 9.444444444444444e-05, + "loss": 0.6211, + "step": 65000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.618572512108926, + "eval_average_loss_on_sentence_tokens": 0.47185761206191323, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.6119335889816284, + "eval_non_padding_tokens_in_labels": 133.52775, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38395, + "eval_padding_tokens_in_labels": 7569445.0, + "eval_reconstruction_accuracy": 0.8981364639057187, + "eval_runtime": 179.604, + "eval_samples_per_second": 27.839, + "eval_sentence_accuracy": 0.706879071186319, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 65000 + }, + { + "epoch": 0.02, + "learning_rate": 9.443434343434345e-05, + "loss": 0.6122, + "step": 65100 + }, + { + "epoch": 0.02, + "learning_rate": 9.442424242424243e-05, + "loss": 0.6162, + "step": 65200 + }, + { + "epoch": 0.02, + "learning_rate": 9.441414141414142e-05, + "loss": 0.6147, + "step": 65300 + }, + { + "epoch": 0.02, + "learning_rate": 9.440404040404041e-05, + "loss": 0.6136, + "step": 65400 + }, + { + "epoch": 0.02, + "learning_rate": 9.43939393939394e-05, + "loss": 0.6189, + "step": 65500 + }, + { + "epoch": 0.02, + "learning_rate": 9.438383838383838e-05, + "loss": 0.6172, + "step": 65600 + }, + { + "epoch": 0.02, + "learning_rate": 9.437373737373739e-05, + "loss": 0.6088, + "step": 65700 + }, + { + "epoch": 0.02, + "learning_rate": 9.436363636363636e-05, + "loss": 0.6138, + "step": 65800 + }, + { + "epoch": 0.02, + "learning_rate": 9.435353535353536e-05, + "loss": 0.6194, + "step": 65900 + }, + { + "epoch": 0.02, + "learning_rate": 9.434343434343435e-05, + "loss": 0.6108, + "step": 66000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.6157918497103501, + "eval_average_loss_on_sentence_tokens": 0.4912983548345907, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.6101757884025574, + "eval_non_padding_tokens_in_labels": 133.53535, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37875, + "eval_padding_tokens_in_labels": 7569293.0, + "eval_reconstruction_accuracy": 0.8983623368728729, + "eval_runtime": 179.2072, + "eval_samples_per_second": 27.901, + "eval_sentence_accuracy": 0.69701401475048, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 66000 + }, + { + "epoch": 0.02, + "learning_rate": 9.433333333333334e-05, + "loss": 0.6105, + "step": 66100 + }, + { + "epoch": 0.02, + "learning_rate": 9.432323232323232e-05, + "loss": 0.6156, + "step": 66200 + }, + { + "epoch": 0.02, + "learning_rate": 9.431313131313132e-05, + "loss": 0.6121, + "step": 66300 + }, + { + "epoch": 0.02, + "learning_rate": 9.43030303030303e-05, + "loss": 0.6104, + "step": 66400 + }, + { + "epoch": 0.02, + "learning_rate": 9.42929292929293e-05, + "loss": 0.6129, + "step": 66500 + }, + { + "epoch": 0.02, + "learning_rate": 9.428282828282829e-05, + "loss": 0.614, + "step": 66600 + }, + { + "epoch": 0.02, + "learning_rate": 9.427272727272728e-05, + "loss": 0.6097, + "step": 66700 + }, + { + "epoch": 0.02, + "learning_rate": 9.426262626262626e-05, + "loss": 0.605, + "step": 66800 + }, + { + "epoch": 0.02, + "learning_rate": 9.425252525252526e-05, + "loss": 0.6093, + "step": 66900 + }, + { + "epoch": 0.02, + "learning_rate": 9.424242424242424e-05, + "loss": 0.6119, + "step": 67000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.6127508946551832, + "eval_average_loss_on_sentence_tokens": 0.45769831176715015, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.6056835651397705, + "eval_non_padding_tokens_in_labels": 133.5173, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37695, + "eval_padding_tokens_in_labels": 7569654.0, + "eval_reconstruction_accuracy": 0.8986821765481295, + "eval_runtime": 189.463, + "eval_samples_per_second": 26.39, + "eval_sentence_accuracy": 0.722598560841244, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.248775, + "step": 67000 + }, + { + "epoch": 0.02, + "learning_rate": 9.423232323232323e-05, + "loss": 0.6104, + "step": 67100 + }, + { + "epoch": 0.02, + "learning_rate": 9.422222222222223e-05, + "loss": 0.6145, + "step": 67200 + }, + { + "epoch": 0.02, + "learning_rate": 9.421212121212122e-05, + "loss": 0.611, + "step": 67300 + }, + { + "epoch": 0.02, + "learning_rate": 9.420202020202021e-05, + "loss": 0.6091, + "step": 67400 + }, + { + "epoch": 0.02, + "learning_rate": 9.41919191919192e-05, + "loss": 0.6089, + "step": 67500 + }, + { + "epoch": 0.02, + "learning_rate": 9.418181818181818e-05, + "loss": 0.6095, + "step": 67600 + }, + { + "epoch": 0.02, + "learning_rate": 9.417171717171717e-05, + "loss": 0.6106, + "step": 67700 + }, + { + "epoch": 0.02, + "learning_rate": 9.416161616161616e-05, + "loss": 0.609, + "step": 67800 + }, + { + "epoch": 0.02, + "learning_rate": 9.415151515151516e-05, + "loss": 0.6143, + "step": 67900 + }, + { + "epoch": 0.02, + "learning_rate": 9.414141414141415e-05, + "loss": 0.608, + "step": 68000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.6114645137747093, + "eval_average_loss_on_sentence_tokens": 0.4672389536884341, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.6049609184265137, + "eval_non_padding_tokens_in_labels": 133.59085, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3817, + "eval_padding_tokens_in_labels": 7568183.0, + "eval_reconstruction_accuracy": 0.8988409488928525, + "eval_runtime": 189.3174, + "eval_samples_per_second": 26.411, + "eval_sentence_accuracy": 0.7090099951549518, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2499, + "step": 68000 + }, + { + "epoch": 0.02, + "learning_rate": 9.413131313131314e-05, + "loss": 0.6138, + "step": 68100 + }, + { + "epoch": 0.02, + "learning_rate": 9.412121212121212e-05, + "loss": 0.6072, + "step": 68200 + }, + { + "epoch": 0.02, + "learning_rate": 9.411111111111111e-05, + "loss": 0.6098, + "step": 68300 + }, + { + "epoch": 0.02, + "learning_rate": 9.41010101010101e-05, + "loss": 0.6125, + "step": 68400 + }, + { + "epoch": 0.02, + "learning_rate": 9.40909090909091e-05, + "loss": 0.6045, + "step": 68500 + }, + { + "epoch": 0.02, + "learning_rate": 9.408080808080809e-05, + "loss": 0.6094, + "step": 68600 + }, + { + "epoch": 0.02, + "learning_rate": 9.407070707070708e-05, + "loss": 0.6064, + "step": 68700 + }, + { + "epoch": 0.02, + "learning_rate": 9.406060606060606e-05, + "loss": 0.6082, + "step": 68800 + }, + { + "epoch": 0.02, + "learning_rate": 9.405050505050506e-05, + "loss": 0.6079, + "step": 68900 + }, + { + "epoch": 0.02, + "learning_rate": 9.404040404040404e-05, + "loss": 0.6008, + "step": 69000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.608151876590835, + "eval_average_loss_on_sentence_tokens": 0.4720116755506789, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.6020703315734863, + "eval_non_padding_tokens_in_labels": 133.5251, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3814, + "eval_padding_tokens_in_labels": 7569498.0, + "eval_reconstruction_accuracy": 0.8990887072113638, + "eval_runtime": 186.7704, + "eval_samples_per_second": 26.771, + "eval_sentence_accuracy": 0.7075519945448346, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 69000 + }, + { + "epoch": 0.02, + "learning_rate": 9.403030303030303e-05, + "loss": 0.6092, + "step": 69100 + }, + { + "epoch": 0.02, + "learning_rate": 9.402020202020202e-05, + "loss": 0.6037, + "step": 69200 + }, + { + "epoch": 0.02, + "learning_rate": 9.401010101010102e-05, + "loss": 0.6072, + "step": 69300 + }, + { + "epoch": 0.02, + "learning_rate": 9.4e-05, + "loss": 0.608, + "step": 69400 + }, + { + "epoch": 0.02, + "learning_rate": 9.3989898989899e-05, + "loss": 0.6029, + "step": 69500 + }, + { + "epoch": 0.02, + "learning_rate": 9.397979797979798e-05, + "loss": 0.6047, + "step": 69600 + }, + { + "epoch": 0.02, + "learning_rate": 9.396969696969697e-05, + "loss": 0.6087, + "step": 69700 + }, + { + "epoch": 0.02, + "learning_rate": 9.395959595959598e-05, + "loss": 0.6134, + "step": 69800 + }, + { + "epoch": 0.02, + "learning_rate": 9.394949494949495e-05, + "loss": 0.6049, + "step": 69900 + }, + { + "epoch": 0.02, + "learning_rate": 9.393939393939395e-05, + "loss": 0.6049, + "step": 70000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.6062459658502415, + "eval_average_loss_on_sentence_tokens": 0.5082559228551765, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.6018261909484863, + "eval_non_padding_tokens_in_labels": 133.5463, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37215, + "eval_padding_tokens_in_labels": 7569074.0, + "eval_reconstruction_accuracy": 0.8990093565484808, + "eval_runtime": 180.9154, + "eval_samples_per_second": 27.637, + "eval_sentence_accuracy": 0.694829256913166, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 70000 + }, + { + "epoch": 0.0, + "learning_rate": 9.392929292929294e-05, + "loss": 0.6053, + "step": 70100 + }, + { + "epoch": 0.0, + "learning_rate": 9.391919191919193e-05, + "loss": 0.6025, + "step": 70200 + }, + { + "epoch": 0.0, + "learning_rate": 9.390909090909091e-05, + "loss": 0.6003, + "step": 70300 + }, + { + "epoch": 0.0, + "learning_rate": 9.389898989898991e-05, + "loss": 0.6014, + "step": 70400 + }, + { + "epoch": 0.0, + "learning_rate": 9.388888888888889e-05, + "loss": 0.6062, + "step": 70500 + }, + { + "epoch": 0.0, + "learning_rate": 9.387878787878788e-05, + "loss": 0.6035, + "step": 70600 + }, + { + "epoch": 0.0, + "learning_rate": 9.386868686868688e-05, + "loss": 0.6035, + "step": 70700 + }, + { + "epoch": 0.0, + "learning_rate": 9.385858585858587e-05, + "loss": 0.6055, + "step": 70800 + }, + { + "epoch": 0.0, + "learning_rate": 9.384848484848485e-05, + "loss": 0.6036, + "step": 70900 + }, + { + "epoch": 0.0, + "learning_rate": 9.383838383838385e-05, + "loss": 0.5997, + "step": 71000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.6038137851090025, + "eval_average_loss_on_sentence_tokens": 0.43362825873360084, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.5961718559265137, + "eval_non_padding_tokens_in_labels": 133.54035, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38795, + "eval_padding_tokens_in_labels": 378.45965, + "eval_reconstruction_accuracy": 0.8994745341649631, + "eval_runtime": 184.4678, + "eval_samples_per_second": 27.105, + "eval_sentence_accuracy": 0.7377034471620579, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2464, + "step": 71000 + }, + { + "epoch": 0.0, + "learning_rate": 9.382828282828283e-05, + "loss": 0.6025, + "step": 71100 + }, + { + "epoch": 0.0, + "learning_rate": 9.381818181818182e-05, + "loss": 0.6016, + "step": 71200 + }, + { + "epoch": 0.0, + "learning_rate": 9.380808080808082e-05, + "loss": 0.5985, + "step": 71300 + }, + { + "epoch": 0.0, + "learning_rate": 9.379797979797981e-05, + "loss": 0.5995, + "step": 71400 + }, + { + "epoch": 0.0, + "learning_rate": 9.378787878787879e-05, + "loss": 0.6, + "step": 71500 + }, + { + "epoch": 0.0, + "learning_rate": 9.377777777777779e-05, + "loss": 0.6025, + "step": 71600 + }, + { + "epoch": 0.0, + "learning_rate": 9.376767676767677e-05, + "loss": 0.5976, + "step": 71700 + }, + { + "epoch": 0.0, + "learning_rate": 9.375757575757576e-05, + "loss": 0.5982, + "step": 71800 + }, + { + "epoch": 0.0, + "learning_rate": 9.374747474747475e-05, + "loss": 0.6059, + "step": 71900 + }, + { + "epoch": 0.0, + "learning_rate": 9.373737373737375e-05, + "loss": 0.597, + "step": 72000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.602120851273778, + "eval_average_loss_on_sentence_tokens": 0.464147674520523, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.595751941204071, + "eval_non_padding_tokens_in_labels": 133.5576, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39165, + "eval_padding_tokens_in_labels": 378.4424, + "eval_reconstruction_accuracy": 0.8995155764199337, + "eval_runtime": 180.3958, + "eval_samples_per_second": 27.717, + "eval_sentence_accuracy": 0.7148734006854846, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 72000 + }, + { + "epoch": 0.0, + "learning_rate": 9.372727272727272e-05, + "loss": 0.5986, + "step": 72100 + }, + { + "epoch": 0.0, + "learning_rate": 9.371717171717173e-05, + "loss": 0.6001, + "step": 72200 + }, + { + "epoch": 0.0, + "learning_rate": 9.370707070707071e-05, + "loss": 0.6003, + "step": 72300 + }, + { + "epoch": 0.0, + "learning_rate": 9.36969696969697e-05, + "loss": 0.5981, + "step": 72400 + }, + { + "epoch": 0.0, + "learning_rate": 9.368686868686869e-05, + "loss": 0.6, + "step": 72500 + }, + { + "epoch": 0.0, + "learning_rate": 9.367676767676768e-05, + "loss": 0.601, + "step": 72600 + }, + { + "epoch": 0.0, + "learning_rate": 9.366666666666668e-05, + "loss": 0.601, + "step": 72700 + }, + { + "epoch": 0.0, + "learning_rate": 9.365656565656567e-05, + "loss": 0.6012, + "step": 72800 + }, + { + "epoch": 0.0, + "learning_rate": 9.364646464646465e-05, + "loss": 0.6012, + "step": 72900 + }, + { + "epoch": 0.0, + "learning_rate": 9.363636363636364e-05, + "loss": 0.5993, + "step": 73000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.6012248219202796, + "eval_average_loss_on_sentence_tokens": 0.48388680540225854, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5960254073143005, + "eval_non_padding_tokens_in_labels": 133.53585, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36235, + "eval_padding_tokens_in_labels": 378.46415, + "eval_reconstruction_accuracy": 0.8997075971723512, + "eval_runtime": 184.9401, + "eval_samples_per_second": 27.036, + "eval_sentence_accuracy": 0.6887953774651425, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2496, + "step": 73000 + }, + { + "epoch": 0.0, + "learning_rate": 9.362626262626263e-05, + "loss": 0.6003, + "step": 73100 + }, + { + "epoch": 0.0, + "learning_rate": 9.361616161616162e-05, + "loss": 0.5975, + "step": 73200 + }, + { + "epoch": 0.0, + "learning_rate": 9.360606060606061e-05, + "loss": 0.5955, + "step": 73300 + }, + { + "epoch": 0.0, + "learning_rate": 9.35959595959596e-05, + "loss": 0.5974, + "step": 73400 + }, + { + "epoch": 0.0, + "learning_rate": 9.358585858585858e-05, + "loss": 0.5943, + "step": 73500 + }, + { + "epoch": 0.0, + "learning_rate": 9.357575757575759e-05, + "loss": 0.6001, + "step": 73600 + }, + { + "epoch": 0.0, + "learning_rate": 9.356565656565657e-05, + "loss": 0.5951, + "step": 73700 + }, + { + "epoch": 0.0, + "learning_rate": 9.355555555555556e-05, + "loss": 0.5951, + "step": 73800 + }, + { + "epoch": 0.0, + "learning_rate": 9.354545454545455e-05, + "loss": 0.5919, + "step": 73900 + }, + { + "epoch": 0.0, + "learning_rate": 9.353535353535354e-05, + "loss": 0.5922, + "step": 74000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.5973322314793412, + "eval_average_loss_on_sentence_tokens": 0.50388036422286, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5930566191673279, + "eval_non_padding_tokens_in_labels": 133.5179, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3844, + "eval_padding_tokens_in_labels": 378.4821, + "eval_reconstruction_accuracy": 0.9000992151243065, + "eval_runtime": 178.3969, + "eval_samples_per_second": 28.027, + "eval_sentence_accuracy": 0.6889344482925691, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2496, + "step": 74000 + }, + { + "epoch": 0.0, + "learning_rate": 9.352525252525252e-05, + "loss": 0.5987, + "step": 74100 + }, + { + "epoch": 0.0, + "learning_rate": 9.351515151515153e-05, + "loss": 0.5936, + "step": 74200 + }, + { + "epoch": 0.0, + "learning_rate": 9.35050505050505e-05, + "loss": 0.5986, + "step": 74300 + }, + { + "epoch": 0.0, + "learning_rate": 9.34949494949495e-05, + "loss": 0.5975, + "step": 74400 + }, + { + "epoch": 0.0, + "learning_rate": 9.348484848484849e-05, + "loss": 0.5932, + "step": 74500 + }, + { + "epoch": 0.0, + "learning_rate": 9.347474747474748e-05, + "loss": 0.5976, + "step": 74600 + }, + { + "epoch": 0.0, + "learning_rate": 9.346464646464646e-05, + "loss": 0.5945, + "step": 74700 + }, + { + "epoch": 0.0, + "learning_rate": 9.345454545454547e-05, + "loss": 0.5961, + "step": 74800 + }, + { + "epoch": 0.0, + "learning_rate": 9.344444444444444e-05, + "loss": 0.5957, + "step": 74900 + }, + { + "epoch": 0.01, + "learning_rate": 9.343434343434344e-05, + "loss": 0.5933, + "step": 75000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5947471073805392, + "eval_average_loss_on_sentence_tokens": 0.4744672230265806, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5892285108566284, + "eval_non_padding_tokens_in_labels": 133.5168, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37715, + "eval_padding_tokens_in_labels": 378.4832, + "eval_reconstruction_accuracy": 0.9003210461644378, + "eval_runtime": 180.2909, + "eval_samples_per_second": 27.733, + "eval_sentence_accuracy": 0.7160039119277909, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 75000 + }, + { + "epoch": 0.01, + "learning_rate": 9.342424242424243e-05, + "loss": 0.5933, + "step": 75100 + }, + { + "epoch": 0.01, + "learning_rate": 9.341414141414142e-05, + "loss": 0.5982, + "step": 75200 + }, + { + "epoch": 0.01, + "learning_rate": 9.34040404040404e-05, + "loss": 0.5953, + "step": 75300 + }, + { + "epoch": 0.01, + "learning_rate": 9.33939393939394e-05, + "loss": 0.5942, + "step": 75400 + }, + { + "epoch": 0.01, + "learning_rate": 9.338383838383838e-05, + "loss": 0.5904, + "step": 75500 + }, + { + "epoch": 0.01, + "learning_rate": 9.337373737373738e-05, + "loss": 0.5932, + "step": 75600 + }, + { + "epoch": 0.01, + "learning_rate": 9.336363636363637e-05, + "loss": 0.5879, + "step": 75700 + }, + { + "epoch": 0.01, + "learning_rate": 9.335353535353536e-05, + "loss": 0.5941, + "step": 75800 + }, + { + "epoch": 0.01, + "learning_rate": 9.334343434343434e-05, + "loss": 0.5933, + "step": 75900 + }, + { + "epoch": 0.01, + "learning_rate": 9.333333333333334e-05, + "loss": 0.594, + "step": 76000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5936100258216003, + "eval_average_loss_on_sentence_tokens": 0.4509413292689789, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.5870800614356995, + "eval_non_padding_tokens_in_labels": 133.53315, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38935, + "eval_padding_tokens_in_labels": 378.46685, + "eval_reconstruction_accuracy": 0.9005676659148171, + "eval_runtime": 181.886, + "eval_samples_per_second": 27.49, + "eval_sentence_accuracy": 0.7284978556175642, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.248775, + "step": 76000 + }, + { + "epoch": 0.01, + "learning_rate": 9.332323232323232e-05, + "loss": 0.5856, + "step": 76100 + }, + { + "epoch": 0.01, + "learning_rate": 9.331313131313131e-05, + "loss": 0.5924, + "step": 76200 + }, + { + "epoch": 0.01, + "learning_rate": 9.33030303030303e-05, + "loss": 0.5886, + "step": 76300 + }, + { + "epoch": 0.01, + "learning_rate": 9.32929292929293e-05, + "loss": 0.5912, + "step": 76400 + }, + { + "epoch": 0.01, + "learning_rate": 9.328282828282829e-05, + "loss": 0.594, + "step": 76500 + }, + { + "epoch": 0.01, + "learning_rate": 9.327272727272728e-05, + "loss": 0.5891, + "step": 76600 + }, + { + "epoch": 0.01, + "learning_rate": 9.326262626262626e-05, + "loss": 0.592, + "step": 76700 + }, + { + "epoch": 0.01, + "learning_rate": 9.325252525252525e-05, + "loss": 0.5912, + "step": 76800 + }, + { + "epoch": 0.01, + "learning_rate": 9.324242424242424e-05, + "loss": 0.5872, + "step": 76900 + }, + { + "epoch": 0.01, + "learning_rate": 9.323232323232324e-05, + "loss": 0.5914, + "step": 77000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5915252725691735, + "eval_average_loss_on_sentence_tokens": 0.5080231706617669, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.5877344012260437, + "eval_non_padding_tokens_in_labels": 133.4826, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.35725, + "eval_padding_tokens_in_labels": 378.5174, + "eval_reconstruction_accuracy": 0.9006746349787095, + "eval_runtime": 181.1301, + "eval_samples_per_second": 27.604, + "eval_sentence_accuracy": 0.6966102607353707, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 77000 + }, + { + "epoch": 0.01, + "learning_rate": 9.322222222222223e-05, + "loss": 0.5919, + "step": 77100 + }, + { + "epoch": 0.01, + "learning_rate": 9.321212121212122e-05, + "loss": 0.591, + "step": 77200 + }, + { + "epoch": 0.01, + "learning_rate": 9.32020202020202e-05, + "loss": 0.5892, + "step": 77300 + }, + { + "epoch": 0.01, + "learning_rate": 9.319191919191919e-05, + "loss": 0.5905, + "step": 77400 + }, + { + "epoch": 0.01, + "learning_rate": 9.318181818181818e-05, + "loss": 0.5887, + "step": 77500 + }, + { + "epoch": 0.01, + "learning_rate": 9.317171717171717e-05, + "loss": 0.5872, + "step": 77600 + }, + { + "epoch": 0.01, + "learning_rate": 9.316161616161617e-05, + "loss": 0.5868, + "step": 77700 + }, + { + "epoch": 0.01, + "learning_rate": 9.315151515151516e-05, + "loss": 0.5914, + "step": 77800 + }, + { + "epoch": 0.01, + "learning_rate": 9.314141414141414e-05, + "loss": 0.5922, + "step": 77900 + }, + { + "epoch": 0.01, + "learning_rate": 9.313131313131314e-05, + "loss": 0.59, + "step": 78000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5896440123492018, + "eval_average_loss_on_sentence_tokens": 0.42883301573801147, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.5822949409484863, + "eval_non_padding_tokens_in_labels": 133.5309, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.371, + "eval_padding_tokens_in_labels": 378.4691, + "eval_reconstruction_accuracy": 0.9009019199267778, + "eval_runtime": 179.5215, + "eval_samples_per_second": 27.852, + "eval_sentence_accuracy": 0.7457381520627344, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.245775, + "step": 78000 + }, + { + "epoch": 0.01, + "learning_rate": 9.312121212121212e-05, + "loss": 0.5913, + "step": 78100 + }, + { + "epoch": 0.01, + "learning_rate": 9.311111111111111e-05, + "loss": 0.5892, + "step": 78200 + }, + { + "epoch": 0.01, + "learning_rate": 9.31010101010101e-05, + "loss": 0.5883, + "step": 78300 + }, + { + "epoch": 0.01, + "learning_rate": 9.30909090909091e-05, + "loss": 0.5882, + "step": 78400 + }, + { + "epoch": 0.01, + "learning_rate": 9.308080808080809e-05, + "loss": 0.5878, + "step": 78500 + }, + { + "epoch": 0.01, + "learning_rate": 9.307070707070708e-05, + "loss": 0.585, + "step": 78600 + }, + { + "epoch": 0.01, + "learning_rate": 9.306060606060607e-05, + "loss": 0.5903, + "step": 78700 + }, + { + "epoch": 0.01, + "learning_rate": 9.305050505050505e-05, + "loss": 0.5915, + "step": 78800 + }, + { + "epoch": 0.01, + "learning_rate": 9.304040404040406e-05, + "loss": 0.5892, + "step": 78900 + }, + { + "epoch": 0.01, + "learning_rate": 9.303030303030303e-05, + "loss": 0.5858, + "step": 79000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5888599096476327, + "eval_average_loss_on_sentence_tokens": 0.4948459405690962, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.5846093893051147, + "eval_non_padding_tokens_in_labels": 133.5449, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3784, + "eval_padding_tokens_in_labels": 378.4551, + "eval_reconstruction_accuracy": 0.9011723973721767, + "eval_runtime": 177.9066, + "eval_samples_per_second": 28.105, + "eval_sentence_accuracy": 0.68366321531753, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.247975, + "step": 79000 + }, + { + "epoch": 0.01, + "learning_rate": 9.302020202020203e-05, + "loss": 0.5913, + "step": 79100 + }, + { + "epoch": 0.01, + "learning_rate": 9.301010101010102e-05, + "loss": 0.5847, + "step": 79200 + }, + { + "epoch": 0.01, + "learning_rate": 9.300000000000001e-05, + "loss": 0.5841, + "step": 79300 + }, + { + "epoch": 0.01, + "learning_rate": 9.298989898989899e-05, + "loss": 0.5864, + "step": 79400 + }, + { + "epoch": 0.01, + "learning_rate": 9.2979797979798e-05, + "loss": 0.5833, + "step": 79500 + }, + { + "epoch": 0.01, + "learning_rate": 9.296969696969697e-05, + "loss": 0.5878, + "step": 79600 + }, + { + "epoch": 0.01, + "learning_rate": 9.295959595959597e-05, + "loss": 0.5843, + "step": 79700 + }, + { + "epoch": 0.01, + "learning_rate": 9.294949494949496e-05, + "loss": 0.5832, + "step": 79800 + }, + { + "epoch": 0.01, + "learning_rate": 9.293939393939395e-05, + "loss": 0.5823, + "step": 79900 + }, + { + "epoch": 0.01, + "learning_rate": 9.292929292929293e-05, + "loss": 0.5881, + "step": 80000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5851331302546706, + "eval_average_loss_on_sentence_tokens": 0.500051129059485, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.581250011920929, + "eval_non_padding_tokens_in_labels": 133.524, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38495, + "eval_padding_tokens_in_labels": 378.476, + "eval_reconstruction_accuracy": 0.9012046941841834, + "eval_runtime": 180.7684, + "eval_samples_per_second": 27.66, + "eval_sentence_accuracy": 0.7132135230678127, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 80000 + }, + { + "epoch": 0.01, + "learning_rate": 9.291919191919193e-05, + "loss": 0.5838, + "step": 80100 + }, + { + "epoch": 0.01, + "learning_rate": 9.290909090909091e-05, + "loss": 0.587, + "step": 80200 + }, + { + "epoch": 0.01, + "learning_rate": 9.28989898989899e-05, + "loss": 0.5809, + "step": 80300 + }, + { + "epoch": 0.01, + "learning_rate": 9.28888888888889e-05, + "loss": 0.5805, + "step": 80400 + }, + { + "epoch": 0.01, + "learning_rate": 9.287878787878789e-05, + "loss": 0.5875, + "step": 80500 + }, + { + "epoch": 0.01, + "learning_rate": 9.286868686868687e-05, + "loss": 0.5854, + "step": 80600 + }, + { + "epoch": 0.01, + "learning_rate": 9.285858585858587e-05, + "loss": 0.5825, + "step": 80700 + }, + { + "epoch": 0.01, + "learning_rate": 9.284848484848485e-05, + "loss": 0.5851, + "step": 80800 + }, + { + "epoch": 0.01, + "learning_rate": 9.283838383838384e-05, + "loss": 0.5826, + "step": 80900 + }, + { + "epoch": 0.01, + "learning_rate": 9.282828282828283e-05, + "loss": 0.5846, + "step": 81000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5848812347960647, + "eval_average_loss_on_sentence_tokens": 0.438015416087745, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.5782519578933716, + "eval_non_padding_tokens_in_labels": 133.55745, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37935, + "eval_padding_tokens_in_labels": 378.44255, + "eval_reconstruction_accuracy": 0.9013579699306444, + "eval_runtime": 178.2756, + "eval_samples_per_second": 28.046, + "eval_sentence_accuracy": 0.7363037665763454, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 81000 + }, + { + "epoch": 0.01, + "learning_rate": 9.281818181818183e-05, + "loss": 0.5809, + "step": 81100 + }, + { + "epoch": 0.01, + "learning_rate": 9.28080808080808e-05, + "loss": 0.5801, + "step": 81200 + }, + { + "epoch": 0.01, + "learning_rate": 9.279797979797981e-05, + "loss": 0.5869, + "step": 81300 + }, + { + "epoch": 0.01, + "learning_rate": 9.278787878787879e-05, + "loss": 0.5831, + "step": 81400 + }, + { + "epoch": 0.01, + "learning_rate": 9.277777777777778e-05, + "loss": 0.5825, + "step": 81500 + }, + { + "epoch": 0.01, + "learning_rate": 9.276767676767677e-05, + "loss": 0.5811, + "step": 81600 + }, + { + "epoch": 0.01, + "learning_rate": 9.275757575757576e-05, + "loss": 0.5845, + "step": 81700 + }, + { + "epoch": 0.01, + "learning_rate": 9.274747474747476e-05, + "loss": 0.5775, + "step": 81800 + }, + { + "epoch": 0.01, + "learning_rate": 9.273737373737375e-05, + "loss": 0.5791, + "step": 81900 + }, + { + "epoch": 0.01, + "learning_rate": 9.272727272727273e-05, + "loss": 0.5805, + "step": 82000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.583454402398492, + "eval_average_loss_on_sentence_tokens": 0.4770347336744101, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5787011981010437, + "eval_non_padding_tokens_in_labels": 133.55385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.44615, + "eval_reconstruction_accuracy": 0.9013869428206179, + "eval_runtime": 179.5354, + "eval_samples_per_second": 27.85, + "eval_sentence_accuracy": 0.7114908392700128, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 82000 + }, + { + "epoch": 0.01, + "learning_rate": 9.271717171717172e-05, + "loss": 0.5834, + "step": 82100 + }, + { + "epoch": 0.01, + "learning_rate": 9.270707070707071e-05, + "loss": 0.5828, + "step": 82200 + }, + { + "epoch": 0.01, + "learning_rate": 9.26969696969697e-05, + "loss": 0.5824, + "step": 82300 + }, + { + "epoch": 0.01, + "learning_rate": 9.26868686868687e-05, + "loss": 0.583, + "step": 82400 + }, + { + "epoch": 0.01, + "learning_rate": 9.267676767676769e-05, + "loss": 0.583, + "step": 82500 + }, + { + "epoch": 0.01, + "learning_rate": 9.266666666666666e-05, + "loss": 0.5788, + "step": 82600 + }, + { + "epoch": 0.01, + "learning_rate": 9.265656565656567e-05, + "loss": 0.5831, + "step": 82700 + }, + { + "epoch": 0.01, + "learning_rate": 9.264646464646465e-05, + "loss": 0.5777, + "step": 82800 + }, + { + "epoch": 0.01, + "learning_rate": 9.263636363636364e-05, + "loss": 0.5787, + "step": 82900 + }, + { + "epoch": 0.01, + "learning_rate": 9.262626262626263e-05, + "loss": 0.5796, + "step": 83000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5832240432855068, + "eval_average_loss_on_sentence_tokens": 0.4226341921392182, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.5760058760643005, + "eval_non_padding_tokens_in_labels": 133.517, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3708, + "eval_padding_tokens_in_labels": 378.483, + "eval_reconstruction_accuracy": 0.901684963356537, + "eval_runtime": 181.8085, + "eval_samples_per_second": 27.501, + "eval_sentence_accuracy": 0.7324008110969548, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 83000 + }, + { + "epoch": 0.01, + "learning_rate": 9.261616161616162e-05, + "loss": 0.583, + "step": 83100 + }, + { + "epoch": 0.01, + "learning_rate": 9.26060606060606e-05, + "loss": 0.5803, + "step": 83200 + }, + { + "epoch": 0.01, + "learning_rate": 9.259595959595961e-05, + "loss": 0.5808, + "step": 83300 + }, + { + "epoch": 0.01, + "learning_rate": 9.258585858585859e-05, + "loss": 0.5839, + "step": 83400 + }, + { + "epoch": 0.01, + "learning_rate": 9.257575757575758e-05, + "loss": 0.5822, + "step": 83500 + }, + { + "epoch": 0.01, + "learning_rate": 9.256565656565657e-05, + "loss": 0.5756, + "step": 83600 + }, + { + "epoch": 0.01, + "learning_rate": 9.255555555555556e-05, + "loss": 0.579, + "step": 83700 + }, + { + "epoch": 0.01, + "learning_rate": 9.254545454545454e-05, + "loss": 0.5821, + "step": 83800 + }, + { + "epoch": 0.01, + "learning_rate": 9.253535353535355e-05, + "loss": 0.5746, + "step": 83900 + }, + { + "epoch": 0.01, + "learning_rate": 9.252525252525253e-05, + "loss": 0.5738, + "step": 84000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5803341636764632, + "eval_average_loss_on_sentence_tokens": 0.5017403493223912, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.5769628882408142, + "eval_non_padding_tokens_in_labels": 133.5055, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38325, + "eval_padding_tokens_in_labels": 378.4945, + "eval_reconstruction_accuracy": 0.9018863806852636, + "eval_runtime": 182.366, + "eval_samples_per_second": 27.417, + "eval_sentence_accuracy": 0.683097959696377, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24750000000000005, + "step": 84000 + }, + { + "epoch": 0.01, + "learning_rate": 9.251515151515152e-05, + "loss": 0.574, + "step": 84100 + }, + { + "epoch": 0.01, + "learning_rate": 9.250505050505051e-05, + "loss": 0.5798, + "step": 84200 + }, + { + "epoch": 0.01, + "learning_rate": 9.24949494949495e-05, + "loss": 0.5788, + "step": 84300 + }, + { + "epoch": 0.01, + "learning_rate": 9.248484848484848e-05, + "loss": 0.5751, + "step": 84400 + }, + { + "epoch": 0.01, + "learning_rate": 9.247474747474749e-05, + "loss": 0.5807, + "step": 84500 + }, + { + "epoch": 0.01, + "learning_rate": 9.246464646464646e-05, + "loss": 0.5821, + "step": 84600 + }, + { + "epoch": 0.01, + "learning_rate": 9.245454545454546e-05, + "loss": 0.5825, + "step": 84700 + }, + { + "epoch": 0.01, + "learning_rate": 9.244444444444445e-05, + "loss": 0.5751, + "step": 84800 + }, + { + "epoch": 0.01, + "learning_rate": 9.243434343434344e-05, + "loss": 0.5774, + "step": 84900 + }, + { + "epoch": 0.01, + "learning_rate": 9.242424242424242e-05, + "loss": 0.5796, + "step": 85000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5787096949052694, + "eval_average_loss_on_sentence_tokens": 0.4766660092930447, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5741503834724426, + "eval_non_padding_tokens_in_labels": 133.58975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39115, + "eval_padding_tokens_in_labels": 378.41025, + "eval_reconstruction_accuracy": 0.9020456315922358, + "eval_runtime": 191.2351, + "eval_samples_per_second": 26.146, + "eval_sentence_accuracy": 0.7003247976743768, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24959999999999996, + "step": 85000 + }, + { + "epoch": 0.02, + "learning_rate": 9.241414141414142e-05, + "loss": 0.582, + "step": 85100 + }, + { + "epoch": 0.02, + "learning_rate": 9.24040404040404e-05, + "loss": 0.5763, + "step": 85200 + }, + { + "epoch": 0.02, + "learning_rate": 9.23939393939394e-05, + "loss": 0.5742, + "step": 85300 + }, + { + "epoch": 0.02, + "learning_rate": 9.238383838383839e-05, + "loss": 0.5768, + "step": 85400 + }, + { + "epoch": 0.02, + "learning_rate": 9.237373737373738e-05, + "loss": 0.5759, + "step": 85500 + }, + { + "epoch": 0.02, + "learning_rate": 9.236363636363636e-05, + "loss": 0.5797, + "step": 85600 + }, + { + "epoch": 0.02, + "learning_rate": 9.235353535353536e-05, + "loss": 0.5785, + "step": 85700 + }, + { + "epoch": 0.02, + "learning_rate": 9.234343434343434e-05, + "loss": 0.5713, + "step": 85800 + }, + { + "epoch": 0.02, + "learning_rate": 9.233333333333333e-05, + "loss": 0.5748, + "step": 85900 + }, + { + "epoch": 0.02, + "learning_rate": 9.232323232323232e-05, + "loss": 0.577, + "step": 86000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.5764268346390206, + "eval_average_loss_on_sentence_tokens": 0.48346793261135856, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5723632574081421, + "eval_non_padding_tokens_in_labels": 133.5541, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37795, + "eval_padding_tokens_in_labels": 378.4459, + "eval_reconstruction_accuracy": 0.9021446313334611, + "eval_runtime": 180.1385, + "eval_samples_per_second": 27.756, + "eval_sentence_accuracy": 0.7035682882624221, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 86000 + }, + { + "epoch": 0.02, + "learning_rate": 9.231313131313132e-05, + "loss": 0.5773, + "step": 86100 + }, + { + "epoch": 0.02, + "learning_rate": 9.230303030303031e-05, + "loss": 0.5777, + "step": 86200 + }, + { + "epoch": 0.02, + "learning_rate": 9.22929292929293e-05, + "loss": 0.5755, + "step": 86300 + }, + { + "epoch": 0.02, + "learning_rate": 9.228282828282828e-05, + "loss": 0.5756, + "step": 86400 + }, + { + "epoch": 0.02, + "learning_rate": 9.227272727272727e-05, + "loss": 0.5726, + "step": 86500 + }, + { + "epoch": 0.02, + "learning_rate": 9.226262626262626e-05, + "loss": 0.5731, + "step": 86600 + }, + { + "epoch": 0.02, + "learning_rate": 9.225252525252525e-05, + "loss": 0.5735, + "step": 86700 + }, + { + "epoch": 0.02, + "learning_rate": 9.224242424242425e-05, + "loss": 0.5732, + "step": 86800 + }, + { + "epoch": 0.02, + "learning_rate": 9.223232323232324e-05, + "loss": 0.5766, + "step": 86900 + }, + { + "epoch": 0.02, + "learning_rate": 9.222222222222223e-05, + "loss": 0.5773, + "step": 87000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.5757072637400064, + "eval_average_loss_on_sentence_tokens": 0.44190984756506646, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.5698339939117432, + "eval_non_padding_tokens_in_labels": 133.52125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38755, + "eval_padding_tokens_in_labels": 378.47875, + "eval_reconstruction_accuracy": 0.9024507049677796, + "eval_runtime": 181.5152, + "eval_samples_per_second": 27.546, + "eval_sentence_accuracy": 0.7239354352468282, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 87000 + }, + { + "epoch": 0.02, + "learning_rate": 9.221212121212122e-05, + "loss": 0.5699, + "step": 87100 + }, + { + "epoch": 0.02, + "learning_rate": 9.220202020202021e-05, + "loss": 0.5733, + "step": 87200 + }, + { + "epoch": 0.02, + "learning_rate": 9.219191919191919e-05, + "loss": 0.5736, + "step": 87300 + }, + { + "epoch": 0.02, + "learning_rate": 9.218181818181819e-05, + "loss": 0.571, + "step": 87400 + }, + { + "epoch": 0.02, + "learning_rate": 9.217171717171718e-05, + "loss": 0.577, + "step": 87500 + }, + { + "epoch": 0.02, + "learning_rate": 9.216161616161617e-05, + "loss": 0.5766, + "step": 87600 + }, + { + "epoch": 0.02, + "learning_rate": 9.215151515151516e-05, + "loss": 0.5675, + "step": 87700 + }, + { + "epoch": 0.02, + "learning_rate": 9.214141414141415e-05, + "loss": 0.5721, + "step": 87800 + }, + { + "epoch": 0.02, + "learning_rate": 9.213131313131313e-05, + "loss": 0.5783, + "step": 87900 + }, + { + "epoch": 0.02, + "learning_rate": 9.212121212121214e-05, + "loss": 0.5716, + "step": 88000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.572844582344282, + "eval_average_loss_on_sentence_tokens": 0.48144907870304243, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5687402486801147, + "eval_non_padding_tokens_in_labels": 133.4872, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3731, + "eval_padding_tokens_in_labels": 378.5128, + "eval_reconstruction_accuracy": 0.9027161344265159, + "eval_runtime": 179.8839, + "eval_samples_per_second": 27.796, + "eval_sentence_accuracy": 0.7018859798661331, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 88000 + }, + { + "epoch": 0.02, + "learning_rate": 9.211111111111112e-05, + "loss": 0.5689, + "step": 88100 + }, + { + "epoch": 0.02, + "learning_rate": 9.210101010101011e-05, + "loss": 0.5755, + "step": 88200 + }, + { + "epoch": 0.02, + "learning_rate": 9.20909090909091e-05, + "loss": 0.5728, + "step": 88300 + }, + { + "epoch": 0.02, + "learning_rate": 9.208080808080809e-05, + "loss": 0.57, + "step": 88400 + }, + { + "epoch": 0.02, + "learning_rate": 9.207070707070707e-05, + "loss": 0.573, + "step": 88500 + }, + { + "epoch": 0.02, + "learning_rate": 9.206060606060608e-05, + "loss": 0.5715, + "step": 88600 + }, + { + "epoch": 0.02, + "learning_rate": 9.205050505050505e-05, + "loss": 0.5703, + "step": 88700 + }, + { + "epoch": 0.02, + "learning_rate": 9.204040404040405e-05, + "loss": 0.5645, + "step": 88800 + }, + { + "epoch": 0.02, + "learning_rate": 9.203030303030304e-05, + "loss": 0.5687, + "step": 88900 + }, + { + "epoch": 0.02, + "learning_rate": 9.202020202020203e-05, + "loss": 0.572, + "step": 89000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.5719850906414258, + "eval_average_loss_on_sentence_tokens": 0.4783671664329338, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.5677441358566284, + "eval_non_padding_tokens_in_labels": 133.51605, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3724, + "eval_padding_tokens_in_labels": 378.48395, + "eval_reconstruction_accuracy": 0.9026068719044638, + "eval_runtime": 181.4926, + "eval_samples_per_second": 27.549, + "eval_sentence_accuracy": 0.7147926498824627, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.25, + "step": 89000 + }, + { + "epoch": 0.02, + "learning_rate": 9.201010101010101e-05, + "loss": 0.5712, + "step": 89100 + }, + { + "epoch": 0.02, + "learning_rate": 9.200000000000001e-05, + "loss": 0.5743, + "step": 89200 + }, + { + "epoch": 0.02, + "learning_rate": 9.198989898989899e-05, + "loss": 0.5714, + "step": 89300 + }, + { + "epoch": 0.02, + "learning_rate": 9.197979797979798e-05, + "loss": 0.5716, + "step": 89400 + }, + { + "epoch": 0.02, + "learning_rate": 9.196969696969698e-05, + "loss": 0.5699, + "step": 89500 + }, + { + "epoch": 0.02, + "learning_rate": 9.195959595959597e-05, + "loss": 0.5706, + "step": 89600 + }, + { + "epoch": 0.02, + "learning_rate": 9.194949494949495e-05, + "loss": 0.5707, + "step": 89700 + }, + { + "epoch": 0.02, + "learning_rate": 9.193939393939395e-05, + "loss": 0.5693, + "step": 89800 + }, + { + "epoch": 0.02, + "learning_rate": 9.192929292929293e-05, + "loss": 0.5736, + "step": 89900 + }, + { + "epoch": 0.02, + "learning_rate": 9.191919191919192e-05, + "loss": 0.5696, + "step": 90000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.5705760060940125, + "eval_average_loss_on_sentence_tokens": 0.4958657150710072, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.5672460794448853, + "eval_non_padding_tokens_in_labels": 133.53465, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39145, + "eval_padding_tokens_in_labels": 378.46535, + "eval_reconstruction_accuracy": 0.9029235798408726, + "eval_runtime": 182.4107, + "eval_samples_per_second": 27.411, + "eval_sentence_accuracy": 0.689414466954977, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.247975, + "step": 90000 + }, + { + "epoch": 0.02, + "learning_rate": 9.190909090909091e-05, + "loss": 0.5743, + "step": 90100 + }, + { + "epoch": 0.02, + "learning_rate": 9.18989898989899e-05, + "loss": 0.5682, + "step": 90200 + }, + { + "epoch": 0.02, + "learning_rate": 9.188888888888888e-05, + "loss": 0.5704, + "step": 90300 + }, + { + "epoch": 0.02, + "learning_rate": 9.187878787878789e-05, + "loss": 0.571, + "step": 90400 + }, + { + "epoch": 0.02, + "learning_rate": 9.186868686868687e-05, + "loss": 0.5664, + "step": 90500 + }, + { + "epoch": 0.02, + "learning_rate": 9.185858585858586e-05, + "loss": 0.5695, + "step": 90600 + }, + { + "epoch": 0.02, + "learning_rate": 9.184848484848485e-05, + "loss": 0.5693, + "step": 90700 + }, + { + "epoch": 0.02, + "learning_rate": 9.183838383838384e-05, + "loss": 0.5692, + "step": 90800 + }, + { + "epoch": 0.02, + "learning_rate": 9.182828282828284e-05, + "loss": 0.5681, + "step": 90900 + }, + { + "epoch": 0.02, + "learning_rate": 9.181818181818183e-05, + "loss": 0.5615, + "step": 91000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.570467577755927, + "eval_average_loss_on_sentence_tokens": 0.4492230276627148, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5650488138198853, + "eval_non_padding_tokens_in_labels": 133.52025, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3863, + "eval_padding_tokens_in_labels": 378.47975, + "eval_reconstruction_accuracy": 0.9030690217410837, + "eval_runtime": 177.176, + "eval_samples_per_second": 28.221, + "eval_sentence_accuracy": 0.7160173703949612, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 91000 + }, + { + "epoch": 0.02, + "learning_rate": 9.180808080808081e-05, + "loss": 0.5717, + "step": 91100 + }, + { + "epoch": 0.02, + "learning_rate": 9.17979797979798e-05, + "loss": 0.5658, + "step": 91200 + }, + { + "epoch": 0.02, + "learning_rate": 9.178787878787879e-05, + "loss": 0.5673, + "step": 91300 + }, + { + "epoch": 0.02, + "learning_rate": 9.177777777777778e-05, + "loss": 0.5686, + "step": 91400 + }, + { + "epoch": 0.02, + "learning_rate": 9.176767676767677e-05, + "loss": 0.5656, + "step": 91500 + }, + { + "epoch": 0.02, + "learning_rate": 9.175757575757577e-05, + "loss": 0.5663, + "step": 91600 + }, + { + "epoch": 0.02, + "learning_rate": 9.174747474747475e-05, + "loss": 0.5711, + "step": 91700 + }, + { + "epoch": 0.02, + "learning_rate": 9.173737373737374e-05, + "loss": 0.575, + "step": 91800 + }, + { + "epoch": 0.02, + "learning_rate": 9.172727272727273e-05, + "loss": 0.5674, + "step": 91900 + }, + { + "epoch": 0.02, + "learning_rate": 9.171717171717172e-05, + "loss": 0.5678, + "step": 92000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.5693873733434068, + "eval_average_loss_on_sentence_tokens": 0.45824774626352077, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.5643163919448853, + "eval_non_padding_tokens_in_labels": 133.5017, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3822, + "eval_padding_tokens_in_labels": 378.4983, + "eval_reconstruction_accuracy": 0.9031162923360128, + "eval_runtime": 184.5556, + "eval_samples_per_second": 27.092, + "eval_sentence_accuracy": 0.7096156261776159, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 92000 + }, + { + "epoch": 0.02, + "learning_rate": 9.170707070707071e-05, + "loss": 0.5668, + "step": 92100 + }, + { + "epoch": 0.02, + "learning_rate": 9.16969696969697e-05, + "loss": 0.5673, + "step": 92200 + }, + { + "epoch": 0.02, + "learning_rate": 9.168686868686868e-05, + "loss": 0.5674, + "step": 92300 + }, + { + "epoch": 0.02, + "learning_rate": 9.167676767676769e-05, + "loss": 0.5686, + "step": 92400 + }, + { + "epoch": 0.02, + "learning_rate": 9.166666666666667e-05, + "loss": 0.5699, + "step": 92500 + }, + { + "epoch": 0.02, + "learning_rate": 9.165656565656566e-05, + "loss": 0.5669, + "step": 92600 + }, + { + "epoch": 0.02, + "learning_rate": 9.164646464646465e-05, + "loss": 0.5718, + "step": 92700 + }, + { + "epoch": 0.02, + "learning_rate": 9.163636363636364e-05, + "loss": 0.5663, + "step": 92800 + }, + { + "epoch": 0.02, + "learning_rate": 9.162626262626262e-05, + "loss": 0.5686, + "step": 92900 + }, + { + "epoch": 0.02, + "learning_rate": 9.161616161616163e-05, + "loss": 0.5667, + "step": 93000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.568526969742746, + "eval_average_loss_on_sentence_tokens": 0.4471147317158155, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.5631152391433716, + "eval_non_padding_tokens_in_labels": 133.4984, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.35565, + "eval_padding_tokens_in_labels": 378.5016, + "eval_reconstruction_accuracy": 0.9032550417481354, + "eval_runtime": 180.5891, + "eval_samples_per_second": 27.687, + "eval_sentence_accuracy": 0.7096335708005096, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 93000 + }, + { + "epoch": 0.02, + "learning_rate": 9.16060606060606e-05, + "loss": 0.5673, + "step": 93100 + }, + { + "epoch": 0.02, + "learning_rate": 9.15959595959596e-05, + "loss": 0.5637, + "step": 93200 + }, + { + "epoch": 0.02, + "learning_rate": 9.158585858585859e-05, + "loss": 0.5663, + "step": 93300 + }, + { + "epoch": 0.02, + "learning_rate": 9.157575757575758e-05, + "loss": 0.5659, + "step": 93400 + }, + { + "epoch": 0.02, + "learning_rate": 9.156565656565656e-05, + "loss": 0.5676, + "step": 93500 + }, + { + "epoch": 0.02, + "learning_rate": 9.155555555555557e-05, + "loss": 0.5641, + "step": 93600 + }, + { + "epoch": 0.02, + "learning_rate": 9.154545454545454e-05, + "loss": 0.5691, + "step": 93700 + }, + { + "epoch": 0.02, + "learning_rate": 9.153535353535354e-05, + "loss": 0.5645, + "step": 93800 + }, + { + "epoch": 0.02, + "learning_rate": 9.152525252525253e-05, + "loss": 0.5633, + "step": 93900 + }, + { + "epoch": 0.02, + "learning_rate": 9.151515151515152e-05, + "loss": 0.5587, + "step": 94000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.5670212881289921, + "eval_average_loss_on_sentence_tokens": 0.46316262855306384, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5623534917831421, + "eval_non_padding_tokens_in_labels": 133.54145, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3824, + "eval_padding_tokens_in_labels": 378.45855, + "eval_reconstruction_accuracy": 0.9033090583595714, + "eval_runtime": 181.3058, + "eval_samples_per_second": 27.578, + "eval_sentence_accuracy": 0.7093240260555924, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 94000 + }, + { + "epoch": 0.02, + "learning_rate": 9.15050505050505e-05, + "loss": 0.5671, + "step": 94100 + }, + { + "epoch": 0.02, + "learning_rate": 9.14949494949495e-05, + "loss": 0.5627, + "step": 94200 + }, + { + "epoch": 0.02, + "learning_rate": 9.148484848484848e-05, + "loss": 0.5651, + "step": 94300 + }, + { + "epoch": 0.02, + "learning_rate": 9.147474747474747e-05, + "loss": 0.5626, + "step": 94400 + }, + { + "epoch": 0.02, + "learning_rate": 9.146464646464647e-05, + "loss": 0.5628, + "step": 94500 + }, + { + "epoch": 0.02, + "learning_rate": 9.145454545454546e-05, + "loss": 0.5606, + "step": 94600 + }, + { + "epoch": 0.02, + "learning_rate": 9.144444444444444e-05, + "loss": 0.5694, + "step": 94700 + }, + { + "epoch": 0.02, + "learning_rate": 9.143434343434344e-05, + "loss": 0.5666, + "step": 94800 + }, + { + "epoch": 0.02, + "learning_rate": 9.142424242424242e-05, + "loss": 0.5675, + "step": 94900 + }, + { + "epoch": 0.03, + "learning_rate": 9.141414141414141e-05, + "loss": 0.5672, + "step": 95000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5646710075225866, + "eval_average_loss_on_sentence_tokens": 0.466276969902568, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.560302734375, + "eval_non_padding_tokens_in_labels": 133.53485, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3715, + "eval_padding_tokens_in_labels": 378.46515, + "eval_reconstruction_accuracy": 0.9033678538704473, + "eval_runtime": 180.914, + "eval_samples_per_second": 27.637, + "eval_sentence_accuracy": 0.7245814416710032, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 95000 + }, + { + "epoch": 0.03, + "learning_rate": 9.14040404040404e-05, + "loss": 0.5615, + "step": 95100 + }, + { + "epoch": 0.03, + "learning_rate": 9.13939393939394e-05, + "loss": 0.5605, + "step": 95200 + }, + { + "epoch": 0.03, + "learning_rate": 9.138383838383839e-05, + "loss": 0.5578, + "step": 95300 + }, + { + "epoch": 0.03, + "learning_rate": 9.137373737373738e-05, + "loss": 0.5643, + "step": 95400 + }, + { + "epoch": 0.03, + "learning_rate": 9.136363636363637e-05, + "loss": 0.5684, + "step": 95500 + }, + { + "epoch": 0.03, + "learning_rate": 9.135353535353535e-05, + "loss": 0.5662, + "step": 95600 + }, + { + "epoch": 0.03, + "learning_rate": 9.134343434343436e-05, + "loss": 0.5643, + "step": 95700 + }, + { + "epoch": 0.03, + "learning_rate": 9.133333333333334e-05, + "loss": 0.5654, + "step": 95800 + }, + { + "epoch": 0.03, + "learning_rate": 9.132323232323233e-05, + "loss": 0.5637, + "step": 95900 + }, + { + "epoch": 0.03, + "learning_rate": 9.131313131313132e-05, + "loss": 0.5624, + "step": 96000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5635683790732732, + "eval_average_loss_on_sentence_tokens": 0.4584220990616059, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5587792992591858, + "eval_non_padding_tokens_in_labels": 133.5039, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37155, + "eval_padding_tokens_in_labels": 378.4961, + "eval_reconstruction_accuracy": 0.9038446505791111, + "eval_runtime": 181.5547, + "eval_samples_per_second": 27.54, + "eval_sentence_accuracy": 0.7076865792165378, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 96000 + }, + { + "epoch": 0.03, + "learning_rate": 9.130303030303031e-05, + "loss": 0.5604, + "step": 96100 + }, + { + "epoch": 0.03, + "learning_rate": 9.12929292929293e-05, + "loss": 0.5606, + "step": 96200 + }, + { + "epoch": 0.03, + "learning_rate": 9.12828282828283e-05, + "loss": 0.561, + "step": 96300 + }, + { + "epoch": 0.03, + "learning_rate": 9.127272727272727e-05, + "loss": 0.5613, + "step": 96400 + }, + { + "epoch": 0.03, + "learning_rate": 9.126262626262627e-05, + "loss": 0.5598, + "step": 96500 + }, + { + "epoch": 0.03, + "learning_rate": 9.125252525252526e-05, + "loss": 0.5638, + "step": 96600 + }, + { + "epoch": 0.03, + "learning_rate": 9.124242424242425e-05, + "loss": 0.5642, + "step": 96700 + }, + { + "epoch": 0.03, + "learning_rate": 9.123232323232324e-05, + "loss": 0.5638, + "step": 96800 + }, + { + "epoch": 0.03, + "learning_rate": 9.122222222222223e-05, + "loss": 0.5619, + "step": 96900 + }, + { + "epoch": 0.03, + "learning_rate": 9.121212121212121e-05, + "loss": 0.5605, + "step": 97000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5617034687797143, + "eval_average_loss_on_sentence_tokens": 0.43023531135162507, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.5558300614356995, + "eval_non_padding_tokens_in_labels": 133.5152, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38375, + "eval_padding_tokens_in_labels": 378.4848, + "eval_reconstruction_accuracy": 0.9039132019538306, + "eval_runtime": 184.9294, + "eval_samples_per_second": 27.037, + "eval_sentence_accuracy": 0.7331858883485564, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 97000 + }, + { + "epoch": 0.03, + "learning_rate": 9.120202020202022e-05, + "loss": 0.563, + "step": 97100 + }, + { + "epoch": 0.03, + "learning_rate": 9.11919191919192e-05, + "loss": 0.5643, + "step": 97200 + }, + { + "epoch": 0.03, + "learning_rate": 9.118181818181819e-05, + "loss": 0.5601, + "step": 97300 + }, + { + "epoch": 0.03, + "learning_rate": 9.117171717171718e-05, + "loss": 0.5633, + "step": 97400 + }, + { + "epoch": 0.03, + "learning_rate": 9.116161616161617e-05, + "loss": 0.5648, + "step": 97500 + }, + { + "epoch": 0.03, + "learning_rate": 9.115151515151515e-05, + "loss": 0.5608, + "step": 97600 + }, + { + "epoch": 0.03, + "learning_rate": 9.114141414141416e-05, + "loss": 0.5653, + "step": 97700 + }, + { + "epoch": 0.03, + "learning_rate": 9.113131313131313e-05, + "loss": 0.5594, + "step": 97800 + }, + { + "epoch": 0.03, + "learning_rate": 9.112121212121213e-05, + "loss": 0.5596, + "step": 97900 + }, + { + "epoch": 0.03, + "learning_rate": 9.111111111111112e-05, + "loss": 0.563, + "step": 98000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5613030407413605, + "eval_average_loss_on_sentence_tokens": 0.4586979660373193, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.556640625, + "eval_non_padding_tokens_in_labels": 133.56865, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38695, + "eval_padding_tokens_in_labels": 378.43135, + "eval_reconstruction_accuracy": 0.9038570011786385, + "eval_runtime": 187.3759, + "eval_samples_per_second": 26.684, + "eval_sentence_accuracy": 0.7199203258743517, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 98000 + }, + { + "epoch": 0.03, + "learning_rate": 9.110101010101011e-05, + "loss": 0.5575, + "step": 98100 + }, + { + "epoch": 0.03, + "learning_rate": 9.109090909090909e-05, + "loss": 0.5598, + "step": 98200 + }, + { + "epoch": 0.03, + "learning_rate": 9.10808080808081e-05, + "loss": 0.5617, + "step": 98300 + }, + { + "epoch": 0.03, + "learning_rate": 9.107070707070707e-05, + "loss": 0.5587, + "step": 98400 + }, + { + "epoch": 0.03, + "learning_rate": 9.106060606060606e-05, + "loss": 0.5583, + "step": 98500 + }, + { + "epoch": 0.03, + "learning_rate": 9.105050505050506e-05, + "loss": 0.5617, + "step": 98600 + }, + { + "epoch": 0.03, + "learning_rate": 9.104040404040405e-05, + "loss": 0.5621, + "step": 98700 + }, + { + "epoch": 0.03, + "learning_rate": 9.103030303030303e-05, + "loss": 0.5613, + "step": 98800 + }, + { + "epoch": 0.03, + "learning_rate": 9.102020202020203e-05, + "loss": 0.562, + "step": 98900 + }, + { + "epoch": 0.03, + "learning_rate": 9.101010101010101e-05, + "loss": 0.5557, + "step": 99000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5598487379040745, + "eval_average_loss_on_sentence_tokens": 0.46145041373959567, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5553320050239563, + "eval_non_padding_tokens_in_labels": 133.4799, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3862, + "eval_padding_tokens_in_labels": 378.5201, + "eval_reconstruction_accuracy": 0.9042086435509609, + "eval_runtime": 179.4802, + "eval_samples_per_second": 27.858, + "eval_sentence_accuracy": 0.7054569598219893, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24959999999999996, + "step": 99000 + }, + { + "epoch": 0.03, + "learning_rate": 9.1e-05, + "loss": 0.5593, + "step": 99100 + }, + { + "epoch": 0.03, + "learning_rate": 9.0989898989899e-05, + "loss": 0.5602, + "step": 99200 + }, + { + "epoch": 0.03, + "learning_rate": 9.097979797979799e-05, + "loss": 0.5626, + "step": 99300 + }, + { + "epoch": 0.03, + "learning_rate": 9.096969696969697e-05, + "loss": 0.5617, + "step": 99400 + }, + { + "epoch": 0.03, + "learning_rate": 9.095959595959597e-05, + "loss": 0.5534, + "step": 99500 + }, + { + "epoch": 0.03, + "learning_rate": 9.094949494949495e-05, + "loss": 0.5567, + "step": 99600 + }, + { + "epoch": 0.03, + "learning_rate": 9.093939393939394e-05, + "loss": 0.5567, + "step": 99700 + }, + { + "epoch": 0.03, + "learning_rate": 9.092929292929293e-05, + "loss": 0.56, + "step": 99800 + }, + { + "epoch": 0.03, + "learning_rate": 9.091919191919193e-05, + "loss": 0.5547, + "step": 99900 + }, + { + "epoch": 0.03, + "learning_rate": 9.090909090909092e-05, + "loss": 0.5602, + "step": 100000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5588219110468046, + "eval_average_loss_on_sentence_tokens": 0.4177164693463704, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.5523828268051147, + "eval_non_padding_tokens_in_labels": 133.4723, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.355, + "eval_padding_tokens_in_labels": 378.5277, + "eval_reconstruction_accuracy": 0.90423542352588, + "eval_runtime": 192.9919, + "eval_samples_per_second": 25.908, + "eval_sentence_accuracy": 0.7448812963195578, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 100000 + }, + { + "epoch": 0.03, + "learning_rate": 9.089898989898991e-05, + "loss": 0.562, + "step": 100100 + }, + { + "epoch": 0.03, + "learning_rate": 9.088888888888889e-05, + "loss": 0.5617, + "step": 100200 + }, + { + "epoch": 0.03, + "learning_rate": 9.087878787878788e-05, + "loss": 0.5559, + "step": 100300 + }, + { + "epoch": 0.03, + "learning_rate": 9.086868686868687e-05, + "loss": 0.56, + "step": 100400 + }, + { + "epoch": 0.03, + "learning_rate": 9.085858585858586e-05, + "loss": 0.5604, + "step": 100500 + }, + { + "epoch": 0.03, + "learning_rate": 9.084848484848486e-05, + "loss": 0.5569, + "step": 100600 + }, + { + "epoch": 0.03, + "learning_rate": 9.083838383838385e-05, + "loss": 0.5549, + "step": 100700 + }, + { + "epoch": 0.03, + "learning_rate": 9.082828282828283e-05, + "loss": 0.5595, + "step": 100800 + }, + { + "epoch": 0.03, + "learning_rate": 9.081818181818182e-05, + "loss": 0.557, + "step": 100900 + }, + { + "epoch": 0.03, + "learning_rate": 9.080808080808081e-05, + "loss": 0.5573, + "step": 101000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5582294225061043, + "eval_average_loss_on_sentence_tokens": 0.4353347925404596, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.5526660084724426, + "eval_non_padding_tokens_in_labels": 133.55725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38715, + "eval_padding_tokens_in_labels": 378.44275, + "eval_reconstruction_accuracy": 0.9043066657668282, + "eval_runtime": 182.65, + "eval_samples_per_second": 27.375, + "eval_sentence_accuracy": 0.7338812424856892, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2493749999999999, + "step": 101000 + }, + { + "epoch": 0.03, + "learning_rate": 9.07979797979798e-05, + "loss": 0.5539, + "step": 101100 + }, + { + "epoch": 0.03, + "learning_rate": 9.07878787878788e-05, + "loss": 0.5555, + "step": 101200 + }, + { + "epoch": 0.03, + "learning_rate": 9.077777777777779e-05, + "loss": 0.55, + "step": 101300 + }, + { + "epoch": 0.03, + "learning_rate": 9.076767676767676e-05, + "loss": 0.5549, + "step": 101400 + }, + { + "epoch": 0.03, + "learning_rate": 9.075757575757577e-05, + "loss": 0.5556, + "step": 101500 + }, + { + "epoch": 0.03, + "learning_rate": 9.074747474747475e-05, + "loss": 0.554, + "step": 101600 + }, + { + "epoch": 0.03, + "learning_rate": 9.073737373737374e-05, + "loss": 0.5551, + "step": 101700 + }, + { + "epoch": 0.03, + "learning_rate": 9.072727272727273e-05, + "loss": 0.5559, + "step": 101800 + }, + { + "epoch": 0.03, + "learning_rate": 9.071717171717172e-05, + "loss": 0.5564, + "step": 101900 + }, + { + "epoch": 0.03, + "learning_rate": 9.07070707070707e-05, + "loss": 0.5534, + "step": 102000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5564429037559601, + "eval_average_loss_on_sentence_tokens": 0.41410500610302425, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.5500586032867432, + "eval_non_padding_tokens_in_labels": 133.54905, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39115, + "eval_padding_tokens_in_labels": 378.45095, + "eval_reconstruction_accuracy": 0.9045576819690904, + "eval_runtime": 184.3064, + "eval_samples_per_second": 27.129, + "eval_sentence_accuracy": 0.7434322680208876, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 102000 + }, + { + "epoch": 0.03, + "learning_rate": 9.069696969696971e-05, + "loss": 0.5582, + "step": 102100 + }, + { + "epoch": 0.03, + "learning_rate": 9.068686868686869e-05, + "loss": 0.5577, + "step": 102200 + }, + { + "epoch": 0.03, + "learning_rate": 9.067676767676768e-05, + "loss": 0.556, + "step": 102300 + }, + { + "epoch": 0.03, + "learning_rate": 9.066666666666667e-05, + "loss": 0.5555, + "step": 102400 + }, + { + "epoch": 0.03, + "learning_rate": 9.065656565656566e-05, + "loss": 0.5607, + "step": 102500 + }, + { + "epoch": 0.03, + "learning_rate": 9.064646464646464e-05, + "loss": 0.5593, + "step": 102600 + }, + { + "epoch": 0.03, + "learning_rate": 9.063636363636365e-05, + "loss": 0.5497, + "step": 102700 + }, + { + "epoch": 0.03, + "learning_rate": 9.062626262626262e-05, + "loss": 0.5587, + "step": 102800 + }, + { + "epoch": 0.03, + "learning_rate": 9.061616161616162e-05, + "loss": 0.5552, + "step": 102900 + }, + { + "epoch": 0.03, + "learning_rate": 9.060606060606061e-05, + "loss": 0.5536, + "step": 103000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.555188765938541, + "eval_average_loss_on_sentence_tokens": 0.4837960783740851, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.5520215034484863, + "eval_non_padding_tokens_in_labels": 133.5332, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.378, + "eval_padding_tokens_in_labels": 378.4668, + "eval_reconstruction_accuracy": 0.904576955319435, + "eval_runtime": 180.9722, + "eval_samples_per_second": 27.629, + "eval_sentence_accuracy": 0.6977452581334004, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 103000 + }, + { + "epoch": 0.03, + "learning_rate": 9.05959595959596e-05, + "loss": 0.5552, + "step": 103100 + }, + { + "epoch": 0.03, + "learning_rate": 9.058585858585858e-05, + "loss": 0.555, + "step": 103200 + }, + { + "epoch": 0.03, + "learning_rate": 9.057575757575758e-05, + "loss": 0.5593, + "step": 103300 + }, + { + "epoch": 0.03, + "learning_rate": 9.056565656565656e-05, + "loss": 0.556, + "step": 103400 + }, + { + "epoch": 0.03, + "learning_rate": 9.055555555555556e-05, + "loss": 0.5539, + "step": 103500 + }, + { + "epoch": 0.03, + "learning_rate": 9.054545454545455e-05, + "loss": 0.5553, + "step": 103600 + }, + { + "epoch": 0.03, + "learning_rate": 9.053535353535354e-05, + "loss": 0.5569, + "step": 103700 + }, + { + "epoch": 0.03, + "learning_rate": 9.052525252525252e-05, + "loss": 0.5539, + "step": 103800 + }, + { + "epoch": 0.03, + "learning_rate": 9.051515151515152e-05, + "loss": 0.555, + "step": 103900 + }, + { + "epoch": 0.03, + "learning_rate": 9.050505050505052e-05, + "loss": 0.5524, + "step": 104000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.5539533457570915, + "eval_average_loss_on_sentence_tokens": 0.48604416766219544, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.5509277582168579, + "eval_non_padding_tokens_in_labels": 133.5279, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38145, + "eval_padding_tokens_in_labels": 378.4721, + "eval_reconstruction_accuracy": 0.90464267483945, + "eval_runtime": 188.9249, + "eval_samples_per_second": 26.466, + "eval_sentence_accuracy": 0.6961033251386222, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 104000 + }, + { + "epoch": 0.03, + "learning_rate": 9.04949494949495e-05, + "loss": 0.554, + "step": 104100 + }, + { + "epoch": 0.03, + "learning_rate": 9.04848484848485e-05, + "loss": 0.55, + "step": 104200 + }, + { + "epoch": 0.03, + "learning_rate": 9.047474747474748e-05, + "loss": 0.5524, + "step": 104300 + }, + { + "epoch": 0.03, + "learning_rate": 9.046464646464647e-05, + "loss": 0.5532, + "step": 104400 + }, + { + "epoch": 0.03, + "learning_rate": 9.045454545454546e-05, + "loss": 0.5526, + "step": 104500 + }, + { + "epoch": 0.03, + "learning_rate": 9.044444444444445e-05, + "loss": 0.5531, + "step": 104600 + }, + { + "epoch": 0.03, + "learning_rate": 9.043434343434343e-05, + "loss": 0.5522, + "step": 104700 + }, + { + "epoch": 0.03, + "learning_rate": 9.042424242424244e-05, + "loss": 0.5526, + "step": 104800 + }, + { + "epoch": 0.03, + "learning_rate": 9.041414141414142e-05, + "loss": 0.5476, + "step": 104900 + }, + { + "epoch": 0.04, + "learning_rate": 9.040404040404041e-05, + "loss": 0.5549, + "step": 105000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5540709751255688, + "eval_average_loss_on_sentence_tokens": 0.42696034630334195, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5484277606010437, + "eval_non_padding_tokens_in_labels": 133.5383, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3904, + "eval_padding_tokens_in_labels": 378.4617, + "eval_reconstruction_accuracy": 0.9047819095132233, + "eval_runtime": 185.7994, + "eval_samples_per_second": 26.911, + "eval_sentence_accuracy": 0.7220153605971971, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 105000 + }, + { + "epoch": 0.04, + "learning_rate": 9.03939393939394e-05, + "loss": 0.5544, + "step": 105100 + }, + { + "epoch": 0.04, + "learning_rate": 9.038383838383839e-05, + "loss": 0.5542, + "step": 105200 + }, + { + "epoch": 0.04, + "learning_rate": 9.037373737373738e-05, + "loss": 0.5521, + "step": 105300 + }, + { + "epoch": 0.04, + "learning_rate": 9.036363636363638e-05, + "loss": 0.5545, + "step": 105400 + }, + { + "epoch": 0.04, + "learning_rate": 9.035353535353535e-05, + "loss": 0.5543, + "step": 105500 + }, + { + "epoch": 0.04, + "learning_rate": 9.034343434343435e-05, + "loss": 0.5523, + "step": 105600 + }, + { + "epoch": 0.04, + "learning_rate": 9.033333333333334e-05, + "loss": 0.5509, + "step": 105700 + }, + { + "epoch": 0.04, + "learning_rate": 9.032323232323233e-05, + "loss": 0.5498, + "step": 105800 + }, + { + "epoch": 0.04, + "learning_rate": 9.031313131313132e-05, + "loss": 0.5567, + "step": 105900 + }, + { + "epoch": 0.04, + "learning_rate": 9.030303030303031e-05, + "loss": 0.5525, + "step": 106000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5522998798566789, + "eval_average_loss_on_sentence_tokens": 0.45047443844602286, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5478320121765137, + "eval_non_padding_tokens_in_labels": 133.53755, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3767, + "eval_padding_tokens_in_labels": 378.46245, + "eval_reconstruction_accuracy": 0.9048879330578387, + "eval_runtime": 178.2425, + "eval_samples_per_second": 28.052, + "eval_sentence_accuracy": 0.7136486801729862, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2499, + "step": 106000 + }, + { + "epoch": 0.04, + "learning_rate": 9.029292929292929e-05, + "loss": 0.5513, + "step": 106100 + }, + { + "epoch": 0.04, + "learning_rate": 9.02828282828283e-05, + "loss": 0.5487, + "step": 106200 + }, + { + "epoch": 0.04, + "learning_rate": 9.027272727272728e-05, + "loss": 0.5516, + "step": 106300 + }, + { + "epoch": 0.04, + "learning_rate": 9.026262626262627e-05, + "loss": 0.552, + "step": 106400 + }, + { + "epoch": 0.04, + "learning_rate": 9.025252525252526e-05, + "loss": 0.5512, + "step": 106500 + }, + { + "epoch": 0.04, + "learning_rate": 9.024242424242425e-05, + "loss": 0.5523, + "step": 106600 + }, + { + "epoch": 0.04, + "learning_rate": 9.023232323232323e-05, + "loss": 0.5512, + "step": 106700 + }, + { + "epoch": 0.04, + "learning_rate": 9.022222222222224e-05, + "loss": 0.5493, + "step": 106800 + }, + { + "epoch": 0.04, + "learning_rate": 9.021212121212121e-05, + "loss": 0.5515, + "step": 106900 + }, + { + "epoch": 0.04, + "learning_rate": 9.02020202020202e-05, + "loss": 0.5478, + "step": 107000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5512893115370434, + "eval_average_loss_on_sentence_tokens": 0.46181873233477466, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5472558736801147, + "eval_non_padding_tokens_in_labels": 133.5104, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36775, + "eval_padding_tokens_in_labels": 378.4896, + "eval_reconstruction_accuracy": 0.9048921781635261, + "eval_runtime": 188.3038, + "eval_samples_per_second": 26.553, + "eval_sentence_accuracy": 0.7166095429504549, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2499, + "step": 107000 + }, + { + "epoch": 0.04, + "learning_rate": 9.01919191919192e-05, + "loss": 0.5492, + "step": 107100 + }, + { + "epoch": 0.04, + "learning_rate": 9.018181818181819e-05, + "loss": 0.55, + "step": 107200 + }, + { + "epoch": 0.04, + "learning_rate": 9.017171717171717e-05, + "loss": 0.5474, + "step": 107300 + }, + { + "epoch": 0.04, + "learning_rate": 9.016161616161617e-05, + "loss": 0.5521, + "step": 107400 + }, + { + "epoch": 0.04, + "learning_rate": 9.015151515151515e-05, + "loss": 0.5484, + "step": 107500 + }, + { + "epoch": 0.04, + "learning_rate": 9.014141414141415e-05, + "loss": 0.5529, + "step": 107600 + }, + { + "epoch": 0.04, + "learning_rate": 9.013131313131314e-05, + "loss": 0.5484, + "step": 107700 + }, + { + "epoch": 0.04, + "learning_rate": 9.012121212121213e-05, + "loss": 0.5484, + "step": 107800 + }, + { + "epoch": 0.04, + "learning_rate": 9.011111111111111e-05, + "loss": 0.551, + "step": 107900 + }, + { + "epoch": 0.04, + "learning_rate": 9.010101010101011e-05, + "loss": 0.5498, + "step": 108000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5493153935330194, + "eval_average_loss_on_sentence_tokens": 0.38740917828838306, + "eval_average_shuffling_prob": 0.42, + "eval_loss": 0.5418945550918579, + "eval_non_padding_tokens_in_labels": 133.53925, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40215, + "eval_padding_tokens_in_labels": 378.46075, + "eval_reconstruction_accuracy": 0.9052853177896493, + "eval_runtime": 183.6359, + "eval_samples_per_second": 27.228, + "eval_sentence_accuracy": 0.7699230175677858, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24360000000000004, + "step": 108000 + }, + { + "epoch": 0.04, + "learning_rate": 9.009090909090909e-05, + "loss": 0.5461, + "step": 108100 + }, + { + "epoch": 0.04, + "learning_rate": 9.008080808080808e-05, + "loss": 0.5493, + "step": 108200 + }, + { + "epoch": 0.04, + "learning_rate": 9.007070707070708e-05, + "loss": 0.5494, + "step": 108300 + }, + { + "epoch": 0.04, + "learning_rate": 9.006060606060607e-05, + "loss": 0.5535, + "step": 108400 + }, + { + "epoch": 0.04, + "learning_rate": 9.005050505050505e-05, + "loss": 0.5455, + "step": 108500 + }, + { + "epoch": 0.04, + "learning_rate": 9.004040404040405e-05, + "loss": 0.5541, + "step": 108600 + }, + { + "epoch": 0.04, + "learning_rate": 9.003030303030303e-05, + "loss": 0.5479, + "step": 108700 + }, + { + "epoch": 0.04, + "learning_rate": 9.002020202020202e-05, + "loss": 0.5433, + "step": 108800 + }, + { + "epoch": 0.04, + "learning_rate": 9.001010101010101e-05, + "loss": 0.5455, + "step": 108900 + }, + { + "epoch": 0.04, + "learning_rate": 9e-05, + "loss": 0.5502, + "step": 109000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5492709316992652, + "eval_average_loss_on_sentence_tokens": 0.42889635219860367, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5437890887260437, + "eval_non_padding_tokens_in_labels": 133.50065, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3706, + "eval_padding_tokens_in_labels": 378.49935, + "eval_reconstruction_accuracy": 0.9052961587806366, + "eval_runtime": 177.161, + "eval_samples_per_second": 28.223, + "eval_sentence_accuracy": 0.7316875123369282, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 109000 + }, + { + "epoch": 0.04, + "learning_rate": 8.998989898989898e-05, + "loss": 0.5478, + "step": 109100 + }, + { + "epoch": 0.04, + "learning_rate": 8.997979797979799e-05, + "loss": 0.552, + "step": 109200 + }, + { + "epoch": 0.04, + "learning_rate": 8.996969696969697e-05, + "loss": 0.5465, + "step": 109300 + }, + { + "epoch": 0.04, + "learning_rate": 8.995959595959596e-05, + "loss": 0.5472, + "step": 109400 + }, + { + "epoch": 0.04, + "learning_rate": 8.994949494949495e-05, + "loss": 0.5521, + "step": 109500 + }, + { + "epoch": 0.04, + "learning_rate": 8.993939393939394e-05, + "loss": 0.5512, + "step": 109600 + }, + { + "epoch": 0.04, + "learning_rate": 8.992929292929294e-05, + "loss": 0.5483, + "step": 109700 + }, + { + "epoch": 0.04, + "learning_rate": 8.991919191919193e-05, + "loss": 0.5504, + "step": 109800 + }, + { + "epoch": 0.04, + "learning_rate": 8.99090909090909e-05, + "loss": 0.5529, + "step": 109900 + }, + { + "epoch": 0.04, + "learning_rate": 8.98989898989899e-05, + "loss": 0.5471, + "step": 110000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5472408679662453, + "eval_average_loss_on_sentence_tokens": 0.4682184200958305, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.543749988079071, + "eval_non_padding_tokens_in_labels": 133.50335, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3911, + "eval_padding_tokens_in_labels": 378.49665, + "eval_reconstruction_accuracy": 0.9054345096931254, + "eval_runtime": 179.2891, + "eval_samples_per_second": 27.888, + "eval_sentence_accuracy": 0.7106564143054533, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 110000 + }, + { + "epoch": 0.04, + "learning_rate": 8.988888888888889e-05, + "loss": 0.5474, + "step": 110100 + }, + { + "epoch": 0.04, + "learning_rate": 8.987878787878788e-05, + "loss": 0.5458, + "step": 110200 + }, + { + "epoch": 0.04, + "learning_rate": 8.986868686868687e-05, + "loss": 0.5498, + "step": 110300 + }, + { + "epoch": 0.04, + "learning_rate": 8.985858585858587e-05, + "loss": 0.5485, + "step": 110400 + }, + { + "epoch": 0.04, + "learning_rate": 8.984848484848484e-05, + "loss": 0.547, + "step": 110500 + }, + { + "epoch": 0.04, + "learning_rate": 8.983838383838385e-05, + "loss": 0.5413, + "step": 110600 + }, + { + "epoch": 0.04, + "learning_rate": 8.982828282828283e-05, + "loss": 0.5433, + "step": 110700 + }, + { + "epoch": 0.04, + "learning_rate": 8.981818181818182e-05, + "loss": 0.5472, + "step": 110800 + }, + { + "epoch": 0.04, + "learning_rate": 8.980808080808081e-05, + "loss": 0.5456, + "step": 110900 + }, + { + "epoch": 0.04, + "learning_rate": 8.97979797979798e-05, + "loss": 0.545, + "step": 111000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5472719964653469, + "eval_average_loss_on_sentence_tokens": 0.4636631319984591, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.5434863567352295, + "eval_non_padding_tokens_in_labels": 133.5197, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37915, + "eval_padding_tokens_in_labels": 378.4803, + "eval_reconstruction_accuracy": 0.9053972001384097, + "eval_runtime": 186.6518, + "eval_samples_per_second": 26.788, + "eval_sentence_accuracy": 0.7063003570979955, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2493749999999999, + "step": 111000 + }, + { + "epoch": 0.04, + "learning_rate": 8.978787878787878e-05, + "loss": 0.5478, + "step": 111100 + }, + { + "epoch": 0.04, + "learning_rate": 8.977777777777779e-05, + "loss": 0.5467, + "step": 111200 + }, + { + "epoch": 0.04, + "learning_rate": 8.976767676767677e-05, + "loss": 0.5495, + "step": 111300 + }, + { + "epoch": 0.04, + "learning_rate": 8.975757575757576e-05, + "loss": 0.5434, + "step": 111400 + }, + { + "epoch": 0.04, + "learning_rate": 8.974747474747475e-05, + "loss": 0.549, + "step": 111500 + }, + { + "epoch": 0.04, + "learning_rate": 8.973737373737374e-05, + "loss": 0.5477, + "step": 111600 + }, + { + "epoch": 0.04, + "learning_rate": 8.972727272727272e-05, + "loss": 0.5464, + "step": 111700 + }, + { + "epoch": 0.04, + "learning_rate": 8.971717171717173e-05, + "loss": 0.5461, + "step": 111800 + }, + { + "epoch": 0.04, + "learning_rate": 8.97070707070707e-05, + "loss": 0.5469, + "step": 111900 + }, + { + "epoch": 0.04, + "learning_rate": 8.96969696969697e-05, + "loss": 0.5488, + "step": 112000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5470406641862775, + "eval_average_loss_on_sentence_tokens": 0.4569940360251566, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.5428515672683716, + "eval_non_padding_tokens_in_labels": 133.5456, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38195, + "eval_padding_tokens_in_labels": 378.4544, + "eval_reconstruction_accuracy": 0.9055830608538419, + "eval_runtime": 188.59, + "eval_samples_per_second": 26.513, + "eval_sentence_accuracy": 0.7063003570979955, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2493749999999999, + "step": 112000 + }, + { + "epoch": 0.04, + "learning_rate": 8.968686868686869e-05, + "loss": 0.5439, + "step": 112100 + }, + { + "epoch": 0.04, + "learning_rate": 8.967676767676768e-05, + "loss": 0.5451, + "step": 112200 + }, + { + "epoch": 0.04, + "learning_rate": 8.966666666666666e-05, + "loss": 0.5419, + "step": 112300 + }, + { + "epoch": 0.04, + "learning_rate": 8.965656565656567e-05, + "loss": 0.5435, + "step": 112400 + }, + { + "epoch": 0.04, + "learning_rate": 8.964646464646466e-05, + "loss": 0.5456, + "step": 112500 + }, + { + "epoch": 0.04, + "learning_rate": 8.963636363636364e-05, + "loss": 0.5463, + "step": 112600 + }, + { + "epoch": 0.04, + "learning_rate": 8.962626262626264e-05, + "loss": 0.5452, + "step": 112700 + }, + { + "epoch": 0.04, + "learning_rate": 8.961616161616162e-05, + "loss": 0.5429, + "step": 112800 + }, + { + "epoch": 0.04, + "learning_rate": 8.960606060606061e-05, + "loss": 0.5417, + "step": 112900 + }, + { + "epoch": 0.04, + "learning_rate": 8.95959595959596e-05, + "loss": 0.5409, + "step": 113000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5452109241745642, + "eval_average_loss_on_sentence_tokens": 0.4416894042193107, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5406054854393005, + "eval_non_padding_tokens_in_labels": 133.48725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37425, + "eval_padding_tokens_in_labels": 378.51275, + "eval_reconstruction_accuracy": 0.9057525241276282, + "eval_runtime": 178.6648, + "eval_samples_per_second": 27.985, + "eval_sentence_accuracy": 0.7238232813537423, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 113000 + }, + { + "epoch": 0.04, + "learning_rate": 8.95858585858586e-05, + "loss": 0.5437, + "step": 113100 + }, + { + "epoch": 0.04, + "learning_rate": 8.957575757575757e-05, + "loss": 0.5437, + "step": 113200 + }, + { + "epoch": 0.04, + "learning_rate": 8.956565656565658e-05, + "loss": 0.5457, + "step": 113300 + }, + { + "epoch": 0.04, + "learning_rate": 8.955555555555556e-05, + "loss": 0.5429, + "step": 113400 + }, + { + "epoch": 0.04, + "learning_rate": 8.954545454545455e-05, + "loss": 0.5467, + "step": 113500 + }, + { + "epoch": 0.04, + "learning_rate": 8.953535353535354e-05, + "loss": 0.5463, + "step": 113600 + }, + { + "epoch": 0.04, + "learning_rate": 8.952525252525253e-05, + "loss": 0.5454, + "step": 113700 + }, + { + "epoch": 0.04, + "learning_rate": 8.951515151515151e-05, + "loss": 0.541, + "step": 113800 + }, + { + "epoch": 0.04, + "learning_rate": 8.950505050505052e-05, + "loss": 0.5445, + "step": 113900 + }, + { + "epoch": 0.04, + "learning_rate": 8.94949494949495e-05, + "loss": 0.544, + "step": 114000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5461744271791635, + "eval_average_loss_on_sentence_tokens": 0.39531639554738934, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.5394042730331421, + "eval_non_padding_tokens_in_labels": 133.5446, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.4554, + "eval_reconstruction_accuracy": 0.9056108390751078, + "eval_runtime": 179.737, + "eval_samples_per_second": 27.818, + "eval_sentence_accuracy": 0.7512022897338813, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 114000 + }, + { + "epoch": 0.04, + "learning_rate": 8.948484848484849e-05, + "loss": 0.5471, + "step": 114100 + }, + { + "epoch": 0.04, + "learning_rate": 8.947474747474748e-05, + "loss": 0.5387, + "step": 114200 + }, + { + "epoch": 0.04, + "learning_rate": 8.946464646464647e-05, + "loss": 0.5422, + "step": 114300 + }, + { + "epoch": 0.04, + "learning_rate": 8.945454545454546e-05, + "loss": 0.5397, + "step": 114400 + }, + { + "epoch": 0.04, + "learning_rate": 8.944444444444446e-05, + "loss": 0.5447, + "step": 114500 + }, + { + "epoch": 0.04, + "learning_rate": 8.943434343434343e-05, + "loss": 0.5405, + "step": 114600 + }, + { + "epoch": 0.04, + "learning_rate": 8.942424242424243e-05, + "loss": 0.542, + "step": 114700 + }, + { + "epoch": 0.04, + "learning_rate": 8.941414141414142e-05, + "loss": 0.5381, + "step": 114800 + }, + { + "epoch": 0.04, + "learning_rate": 8.940404040404041e-05, + "loss": 0.5418, + "step": 114900 + }, + { + "epoch": 0.04, + "learning_rate": 8.93939393939394e-05, + "loss": 0.5455, + "step": 115000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.5429804417624639, + "eval_average_loss_on_sentence_tokens": 0.4303811914856319, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.5379003882408142, + "eval_non_padding_tokens_in_labels": 133.5207, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3759, + "eval_padding_tokens_in_labels": 378.4793, + "eval_reconstruction_accuracy": 0.9058125575602682, + "eval_runtime": 183.44, + "eval_samples_per_second": 27.257, + "eval_sentence_accuracy": 0.7340696610260735, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 115000 + }, + { + "epoch": 0.05, + "learning_rate": 8.93838383838384e-05, + "loss": 0.5416, + "step": 115100 + }, + { + "epoch": 0.05, + "learning_rate": 8.937373737373737e-05, + "loss": 0.5454, + "step": 115200 + }, + { + "epoch": 0.05, + "learning_rate": 8.936363636363636e-05, + "loss": 0.5429, + "step": 115300 + }, + { + "epoch": 0.05, + "learning_rate": 8.935353535353536e-05, + "loss": 0.5409, + "step": 115400 + }, + { + "epoch": 0.05, + "learning_rate": 8.934343434343435e-05, + "loss": 0.544, + "step": 115500 + }, + { + "epoch": 0.05, + "learning_rate": 8.933333333333334e-05, + "loss": 0.5465, + "step": 115600 + }, + { + "epoch": 0.05, + "learning_rate": 8.932323232323233e-05, + "loss": 0.5434, + "step": 115700 + }, + { + "epoch": 0.05, + "learning_rate": 8.931313131313131e-05, + "loss": 0.5445, + "step": 115800 + }, + { + "epoch": 0.05, + "learning_rate": 8.930303030303032e-05, + "loss": 0.5404, + "step": 115900 + }, + { + "epoch": 0.05, + "learning_rate": 8.92929292929293e-05, + "loss": 0.5408, + "step": 116000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.5426293650049312, + "eval_average_loss_on_sentence_tokens": 0.44629240600909825, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5382714867591858, + "eval_non_padding_tokens_in_labels": 133.5395, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36925, + "eval_padding_tokens_in_labels": 378.4605, + "eval_reconstruction_accuracy": 0.906018439067606, + "eval_runtime": 184.3453, + "eval_samples_per_second": 27.123, + "eval_sentence_accuracy": 0.7213155203043408, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2499, + "step": 116000 + }, + { + "epoch": 0.05, + "learning_rate": 8.928282828282829e-05, + "loss": 0.5458, + "step": 116100 + }, + { + "epoch": 0.05, + "learning_rate": 8.927272727272728e-05, + "loss": 0.5437, + "step": 116200 + }, + { + "epoch": 0.05, + "learning_rate": 8.926262626262627e-05, + "loss": 0.5387, + "step": 116300 + }, + { + "epoch": 0.05, + "learning_rate": 8.925252525252525e-05, + "loss": 0.5438, + "step": 116400 + }, + { + "epoch": 0.05, + "learning_rate": 8.924242424242426e-05, + "loss": 0.5397, + "step": 116500 + }, + { + "epoch": 0.05, + "learning_rate": 8.923232323232323e-05, + "loss": 0.5448, + "step": 116600 + }, + { + "epoch": 0.05, + "learning_rate": 8.922222222222223e-05, + "loss": 0.5434, + "step": 116700 + }, + { + "epoch": 0.05, + "learning_rate": 8.921212121212122e-05, + "loss": 0.5407, + "step": 116800 + }, + { + "epoch": 0.05, + "learning_rate": 8.920202020202021e-05, + "loss": 0.5426, + "step": 116900 + }, + { + "epoch": 0.05, + "learning_rate": 8.919191919191919e-05, + "loss": 0.547, + "step": 117000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.5412122376827058, + "eval_average_loss_on_sentence_tokens": 0.44539663300264465, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5369628667831421, + "eval_non_padding_tokens_in_labels": 133.5572, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3915, + "eval_padding_tokens_in_labels": 378.4428, + "eval_reconstruction_accuracy": 0.9059814528439205, + "eval_runtime": 184.0951, + "eval_samples_per_second": 27.16, + "eval_sentence_accuracy": 0.7291707789760798, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 117000 + }, + { + "epoch": 0.05, + "learning_rate": 8.91818181818182e-05, + "loss": 0.5386, + "step": 117100 + }, + { + "epoch": 0.05, + "learning_rate": 8.917171717171717e-05, + "loss": 0.5405, + "step": 117200 + }, + { + "epoch": 0.05, + "learning_rate": 8.916161616161616e-05, + "loss": 0.538, + "step": 117300 + }, + { + "epoch": 0.05, + "learning_rate": 8.915151515151516e-05, + "loss": 0.5385, + "step": 117400 + }, + { + "epoch": 0.05, + "learning_rate": 8.914141414141415e-05, + "loss": 0.5455, + "step": 117500 + }, + { + "epoch": 0.05, + "learning_rate": 8.913131313131313e-05, + "loss": 0.5446, + "step": 117600 + }, + { + "epoch": 0.05, + "learning_rate": 8.912121212121213e-05, + "loss": 0.5407, + "step": 117700 + }, + { + "epoch": 0.05, + "learning_rate": 8.911111111111111e-05, + "loss": 0.5387, + "step": 117800 + }, + { + "epoch": 0.05, + "learning_rate": 8.91010101010101e-05, + "loss": 0.5417, + "step": 117900 + }, + { + "epoch": 0.05, + "learning_rate": 8.90909090909091e-05, + "loss": 0.5413, + "step": 118000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.5409410916417582, + "eval_average_loss_on_sentence_tokens": 0.39807229243224995, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.5344336032867432, + "eval_non_padding_tokens_in_labels": 133.48685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3755, + "eval_padding_tokens_in_labels": 378.51315, + "eval_reconstruction_accuracy": 0.9062190857945979, + "eval_runtime": 178.2074, + "eval_samples_per_second": 28.057, + "eval_sentence_accuracy": 0.7533376998582375, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 118000 + }, + { + "epoch": 0.05, + "learning_rate": 8.908080808080809e-05, + "loss": 0.5371, + "step": 118100 + }, + { + "epoch": 0.05, + "learning_rate": 8.907070707070706e-05, + "loss": 0.5443, + "step": 118200 + }, + { + "epoch": 0.05, + "learning_rate": 8.906060606060607e-05, + "loss": 0.5408, + "step": 118300 + }, + { + "epoch": 0.05, + "learning_rate": 8.905050505050505e-05, + "loss": 0.5374, + "step": 118400 + }, + { + "epoch": 0.05, + "learning_rate": 8.904040404040404e-05, + "loss": 0.5389, + "step": 118500 + }, + { + "epoch": 0.05, + "learning_rate": 8.903030303030303e-05, + "loss": 0.5388, + "step": 118600 + }, + { + "epoch": 0.05, + "learning_rate": 8.902020202020202e-05, + "loss": 0.5406, + "step": 118700 + }, + { + "epoch": 0.05, + "learning_rate": 8.901010101010102e-05, + "loss": 0.5438, + "step": 118800 + }, + { + "epoch": 0.05, + "learning_rate": 8.900000000000001e-05, + "loss": 0.5409, + "step": 118900 + }, + { + "epoch": 0.05, + "learning_rate": 8.898989898989899e-05, + "loss": 0.5412, + "step": 119000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.540772897836586, + "eval_average_loss_on_sentence_tokens": 0.39399738754853203, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.5341601371765137, + "eval_non_padding_tokens_in_labels": 133.5443, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3781, + "eval_padding_tokens_in_labels": 378.4557, + "eval_reconstruction_accuracy": 0.9061826338442874, + "eval_runtime": 183.2616, + "eval_samples_per_second": 27.283, + "eval_sentence_accuracy": 0.7484657347425844, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 119000 + }, + { + "epoch": 0.05, + "learning_rate": 8.897979797979798e-05, + "loss": 0.5439, + "step": 119100 + }, + { + "epoch": 0.05, + "learning_rate": 8.896969696969697e-05, + "loss": 0.5372, + "step": 119200 + }, + { + "epoch": 0.05, + "learning_rate": 8.895959595959596e-05, + "loss": 0.5404, + "step": 119300 + }, + { + "epoch": 0.05, + "learning_rate": 8.894949494949495e-05, + "loss": 0.5423, + "step": 119400 + }, + { + "epoch": 0.05, + "learning_rate": 8.893939393939395e-05, + "loss": 0.5374, + "step": 119500 + }, + { + "epoch": 0.05, + "learning_rate": 8.892929292929293e-05, + "loss": 0.5365, + "step": 119600 + }, + { + "epoch": 0.05, + "learning_rate": 8.891919191919193e-05, + "loss": 0.5401, + "step": 119700 + }, + { + "epoch": 0.05, + "learning_rate": 8.890909090909091e-05, + "loss": 0.5335, + "step": 119800 + }, + { + "epoch": 0.05, + "learning_rate": 8.88989898989899e-05, + "loss": 0.5396, + "step": 119900 + }, + { + "epoch": 0.05, + "learning_rate": 8.888888888888889e-05, + "loss": 0.5391, + "step": 120000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.538706937744235, + "eval_average_loss_on_sentence_tokens": 0.38176590979427183, + "eval_average_shuffling_prob": 0.405, + "eval_loss": 0.5316992402076721, + "eval_non_padding_tokens_in_labels": 133.5274, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3758, + "eval_padding_tokens_in_labels": 378.4726, + "eval_reconstruction_accuracy": 0.9063748985437218, + "eval_runtime": 182.075, + "eval_samples_per_second": 27.461, + "eval_sentence_accuracy": 0.7758133400326592, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24097500000000005, + "step": 120000 + }, + { + "epoch": 0.05, + "learning_rate": 8.887878787878789e-05, + "loss": 0.5346, + "step": 120100 + }, + { + "epoch": 0.05, + "learning_rate": 8.886868686868686e-05, + "loss": 0.5358, + "step": 120200 + }, + { + "epoch": 0.05, + "learning_rate": 8.885858585858587e-05, + "loss": 0.5369, + "step": 120300 + }, + { + "epoch": 0.05, + "learning_rate": 8.884848484848485e-05, + "loss": 0.5384, + "step": 120400 + }, + { + "epoch": 0.05, + "learning_rate": 8.883838383838384e-05, + "loss": 0.541, + "step": 120500 + }, + { + "epoch": 0.05, + "learning_rate": 8.882828282828283e-05, + "loss": 0.5419, + "step": 120600 + }, + { + "epoch": 0.05, + "learning_rate": 8.881818181818182e-05, + "loss": 0.5335, + "step": 120700 + }, + { + "epoch": 0.05, + "learning_rate": 8.88080808080808e-05, + "loss": 0.5378, + "step": 120800 + }, + { + "epoch": 0.05, + "learning_rate": 8.879797979797981e-05, + "loss": 0.5342, + "step": 120900 + }, + { + "epoch": 0.05, + "learning_rate": 8.87878787878788e-05, + "loss": 0.5387, + "step": 121000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.5378723034321233, + "eval_average_loss_on_sentence_tokens": 0.42625251993221747, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5326952934265137, + "eval_non_padding_tokens_in_labels": 133.52685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3738, + "eval_padding_tokens_in_labels": 378.47315, + "eval_reconstruction_accuracy": 0.9063174222316022, + "eval_runtime": 180.1655, + "eval_samples_per_second": 27.752, + "eval_sentence_accuracy": 0.7363082527320688, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 121000 + }, + { + "epoch": 0.05, + "learning_rate": 8.877777777777778e-05, + "loss": 0.5404, + "step": 121100 + }, + { + "epoch": 0.05, + "learning_rate": 8.876767676767678e-05, + "loss": 0.5351, + "step": 121200 + }, + { + "epoch": 0.05, + "learning_rate": 8.875757575757576e-05, + "loss": 0.5391, + "step": 121300 + }, + { + "epoch": 0.05, + "learning_rate": 8.874747474747475e-05, + "loss": 0.5394, + "step": 121400 + }, + { + "epoch": 0.05, + "learning_rate": 8.873737373737375e-05, + "loss": 0.538, + "step": 121500 + }, + { + "epoch": 0.05, + "learning_rate": 8.872727272727274e-05, + "loss": 0.5348, + "step": 121600 + }, + { + "epoch": 0.05, + "learning_rate": 8.871717171717172e-05, + "loss": 0.5406, + "step": 121700 + }, + { + "epoch": 0.05, + "learning_rate": 8.870707070707072e-05, + "loss": 0.5376, + "step": 121800 + }, + { + "epoch": 0.05, + "learning_rate": 8.86969696969697e-05, + "loss": 0.5339, + "step": 121900 + }, + { + "epoch": 0.05, + "learning_rate": 8.868686868686869e-05, + "loss": 0.5388, + "step": 122000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.538006312063614, + "eval_average_loss_on_sentence_tokens": 0.4220419472001835, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.5328418016433716, + "eval_non_padding_tokens_in_labels": 133.5335, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36575, + "eval_padding_tokens_in_labels": 378.4665, + "eval_reconstruction_accuracy": 0.9063331305371689, + "eval_runtime": 181.5082, + "eval_samples_per_second": 27.547, + "eval_sentence_accuracy": 0.7410949808889766, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 122000 + }, + { + "epoch": 0.05, + "learning_rate": 8.867676767676768e-05, + "loss": 0.5365, + "step": 122100 + }, + { + "epoch": 0.05, + "learning_rate": 8.866666666666668e-05, + "loss": 0.541, + "step": 122200 + }, + { + "epoch": 0.05, + "learning_rate": 8.865656565656565e-05, + "loss": 0.5348, + "step": 122300 + }, + { + "epoch": 0.05, + "learning_rate": 8.864646464646466e-05, + "loss": 0.5358, + "step": 122400 + }, + { + "epoch": 0.05, + "learning_rate": 8.863636363636364e-05, + "loss": 0.5347, + "step": 122500 + }, + { + "epoch": 0.05, + "learning_rate": 8.862626262626263e-05, + "loss": 0.5389, + "step": 122600 + }, + { + "epoch": 0.05, + "learning_rate": 8.861616161616162e-05, + "loss": 0.5346, + "step": 122700 + }, + { + "epoch": 0.05, + "learning_rate": 8.860606060606061e-05, + "loss": 0.5334, + "step": 122800 + }, + { + "epoch": 0.05, + "learning_rate": 8.859595959595959e-05, + "loss": 0.5346, + "step": 122900 + }, + { + "epoch": 0.05, + "learning_rate": 8.85858585858586e-05, + "loss": 0.5397, + "step": 123000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.5368524539529062, + "eval_average_loss_on_sentence_tokens": 0.4534192288006475, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5331445336341858, + "eval_non_padding_tokens_in_labels": 133.50405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3802, + "eval_padding_tokens_in_labels": 378.49595, + "eval_reconstruction_accuracy": 0.9065661486756847, + "eval_runtime": 182.4505, + "eval_samples_per_second": 27.405, + "eval_sentence_accuracy": 0.7236976689934861, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 123000 + }, + { + "epoch": 0.05, + "learning_rate": 8.857575757575758e-05, + "loss": 0.533, + "step": 123100 + }, + { + "epoch": 0.05, + "learning_rate": 8.856565656565657e-05, + "loss": 0.5359, + "step": 123200 + }, + { + "epoch": 0.05, + "learning_rate": 8.855555555555556e-05, + "loss": 0.5357, + "step": 123300 + }, + { + "epoch": 0.05, + "learning_rate": 8.854545454545455e-05, + "loss": 0.5364, + "step": 123400 + }, + { + "epoch": 0.05, + "learning_rate": 8.853535353535354e-05, + "loss": 0.538, + "step": 123500 + }, + { + "epoch": 0.05, + "learning_rate": 8.852525252525254e-05, + "loss": 0.537, + "step": 123600 + }, + { + "epoch": 0.05, + "learning_rate": 8.851515151515152e-05, + "loss": 0.5304, + "step": 123700 + }, + { + "epoch": 0.05, + "learning_rate": 8.850505050505051e-05, + "loss": 0.5346, + "step": 123800 + }, + { + "epoch": 0.05, + "learning_rate": 8.84949494949495e-05, + "loss": 0.5304, + "step": 123900 + }, + { + "epoch": 0.05, + "learning_rate": 8.848484848484849e-05, + "loss": 0.537, + "step": 124000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.5368687987523968, + "eval_average_loss_on_sentence_tokens": 0.3994587285690789, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.53076171875, + "eval_non_padding_tokens_in_labels": 133.55685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38685, + "eval_padding_tokens_in_labels": 378.44315, + "eval_reconstruction_accuracy": 0.9064622159198117, + "eval_runtime": 185.8628, + "eval_samples_per_second": 26.902, + "eval_sentence_accuracy": 0.7565453012004952, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 124000 + }, + { + "epoch": 0.05, + "learning_rate": 8.847474747474748e-05, + "loss": 0.538, + "step": 124100 + }, + { + "epoch": 0.05, + "learning_rate": 8.846464646464648e-05, + "loss": 0.5363, + "step": 124200 + }, + { + "epoch": 0.05, + "learning_rate": 8.845454545454545e-05, + "loss": 0.5335, + "step": 124300 + }, + { + "epoch": 0.05, + "learning_rate": 8.844444444444445e-05, + "loss": 0.5339, + "step": 124400 + }, + { + "epoch": 0.05, + "learning_rate": 8.843434343434344e-05, + "loss": 0.5324, + "step": 124500 + }, + { + "epoch": 0.05, + "learning_rate": 8.842424242424243e-05, + "loss": 0.5338, + "step": 124600 + }, + { + "epoch": 0.05, + "learning_rate": 8.841414141414142e-05, + "loss": 0.5366, + "step": 124700 + }, + { + "epoch": 0.05, + "learning_rate": 8.840404040404041e-05, + "loss": 0.5376, + "step": 124800 + }, + { + "epoch": 0.05, + "learning_rate": 8.839393939393939e-05, + "loss": 0.54, + "step": 124900 + }, + { + "epoch": 0.06, + "learning_rate": 8.83838383838384e-05, + "loss": 0.5346, + "step": 125000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5356507177752847, + "eval_average_loss_on_sentence_tokens": 0.4419085963909918, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5313867330551147, + "eval_non_padding_tokens_in_labels": 133.5447, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3695, + "eval_padding_tokens_in_labels": 378.4553, + "eval_reconstruction_accuracy": 0.9067444730334789, + "eval_runtime": 179.5945, + "eval_samples_per_second": 27.84, + "eval_sentence_accuracy": 0.7166633768191362, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 125000 + }, + { + "epoch": 0.06, + "learning_rate": 8.837373737373738e-05, + "loss": 0.5339, + "step": 125100 + }, + { + "epoch": 0.06, + "learning_rate": 8.836363636363637e-05, + "loss": 0.5348, + "step": 125200 + }, + { + "epoch": 0.06, + "learning_rate": 8.835353535353536e-05, + "loss": 0.5354, + "step": 125300 + }, + { + "epoch": 0.06, + "learning_rate": 8.834343434343435e-05, + "loss": 0.5372, + "step": 125400 + }, + { + "epoch": 0.06, + "learning_rate": 8.833333333333333e-05, + "loss": 0.5323, + "step": 125500 + }, + { + "epoch": 0.06, + "learning_rate": 8.832323232323234e-05, + "loss": 0.5331, + "step": 125600 + }, + { + "epoch": 0.06, + "learning_rate": 8.831313131313131e-05, + "loss": 0.5362, + "step": 125700 + }, + { + "epoch": 0.06, + "learning_rate": 8.83030303030303e-05, + "loss": 0.532, + "step": 125800 + }, + { + "epoch": 0.06, + "learning_rate": 8.82929292929293e-05, + "loss": 0.5328, + "step": 125900 + }, + { + "epoch": 0.06, + "learning_rate": 8.828282828282829e-05, + "loss": 0.5305, + "step": 126000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.534645616945462, + "eval_average_loss_on_sentence_tokens": 0.4399571029896162, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.5303027629852295, + "eval_non_padding_tokens_in_labels": 133.546, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38885, + "eval_padding_tokens_in_labels": 378.454, + "eval_reconstruction_accuracy": 0.9067975168926355, + "eval_runtime": 177.1582, + "eval_samples_per_second": 28.223, + "eval_sentence_accuracy": 0.722446031546647, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 126000 + }, + { + "epoch": 0.06, + "learning_rate": 8.827272727272727e-05, + "loss": 0.5326, + "step": 126100 + }, + { + "epoch": 0.06, + "learning_rate": 8.826262626262627e-05, + "loss": 0.5375, + "step": 126200 + }, + { + "epoch": 0.06, + "learning_rate": 8.825252525252525e-05, + "loss": 0.5365, + "step": 126300 + }, + { + "epoch": 0.06, + "learning_rate": 8.824242424242424e-05, + "loss": 0.533, + "step": 126400 + }, + { + "epoch": 0.06, + "learning_rate": 8.823232323232324e-05, + "loss": 0.5326, + "step": 126500 + }, + { + "epoch": 0.06, + "learning_rate": 8.822222222222223e-05, + "loss": 0.5314, + "step": 126600 + }, + { + "epoch": 0.06, + "learning_rate": 8.82121212121212e-05, + "loss": 0.5294, + "step": 126700 + }, + { + "epoch": 0.06, + "learning_rate": 8.820202020202021e-05, + "loss": 0.5325, + "step": 126800 + }, + { + "epoch": 0.06, + "learning_rate": 8.819191919191919e-05, + "loss": 0.5314, + "step": 126900 + }, + { + "epoch": 0.06, + "learning_rate": 8.818181818181818e-05, + "loss": 0.5311, + "step": 127000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5333439597900604, + "eval_average_loss_on_sentence_tokens": 0.41682451749317195, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5281933546066284, + "eval_non_padding_tokens_in_labels": 133.5102, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37555, + "eval_padding_tokens_in_labels": 378.4898, + "eval_reconstruction_accuracy": 0.9070008598690918, + "eval_runtime": 179.6148, + "eval_samples_per_second": 27.837, + "eval_sentence_accuracy": 0.7369811760905844, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 127000 + }, + { + "epoch": 0.06, + "learning_rate": 8.817171717171717e-05, + "loss": 0.5324, + "step": 127100 + }, + { + "epoch": 0.06, + "learning_rate": 8.816161616161617e-05, + "loss": 0.5302, + "step": 127200 + }, + { + "epoch": 0.06, + "learning_rate": 8.815151515151515e-05, + "loss": 0.533, + "step": 127300 + }, + { + "epoch": 0.06, + "learning_rate": 8.814141414141415e-05, + "loss": 0.5326, + "step": 127400 + }, + { + "epoch": 0.06, + "learning_rate": 8.813131313131313e-05, + "loss": 0.5329, + "step": 127500 + }, + { + "epoch": 0.06, + "learning_rate": 8.812121212121212e-05, + "loss": 0.5307, + "step": 127600 + }, + { + "epoch": 0.06, + "learning_rate": 8.811111111111111e-05, + "loss": 0.5346, + "step": 127700 + }, + { + "epoch": 0.06, + "learning_rate": 8.81010101010101e-05, + "loss": 0.5296, + "step": 127800 + }, + { + "epoch": 0.06, + "learning_rate": 8.80909090909091e-05, + "loss": 0.5315, + "step": 127900 + }, + { + "epoch": 0.06, + "learning_rate": 8.808080808080809e-05, + "loss": 0.5308, + "step": 128000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5322497603937367, + "eval_average_loss_on_sentence_tokens": 0.4628997805508304, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.5291894674301147, + "eval_non_padding_tokens_in_labels": 133.5205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3754, + "eval_padding_tokens_in_labels": 378.4795, + "eval_reconstruction_accuracy": 0.907111624167624, + "eval_runtime": 183.0388, + "eval_samples_per_second": 27.317, + "eval_sentence_accuracy": 0.7089875643763346, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 128000 + }, + { + "epoch": 0.06, + "learning_rate": 8.807070707070707e-05, + "loss": 0.535, + "step": 128100 + }, + { + "epoch": 0.06, + "learning_rate": 8.806060606060606e-05, + "loss": 0.5352, + "step": 128200 + }, + { + "epoch": 0.06, + "learning_rate": 8.805050505050505e-05, + "loss": 0.5331, + "step": 128300 + }, + { + "epoch": 0.06, + "learning_rate": 8.804040404040404e-05, + "loss": 0.5329, + "step": 128400 + }, + { + "epoch": 0.06, + "learning_rate": 8.803030303030304e-05, + "loss": 0.533, + "step": 128500 + }, + { + "epoch": 0.06, + "learning_rate": 8.802020202020203e-05, + "loss": 0.5344, + "step": 128600 + }, + { + "epoch": 0.06, + "learning_rate": 8.8010101010101e-05, + "loss": 0.5294, + "step": 128700 + }, + { + "epoch": 0.06, + "learning_rate": 8.800000000000001e-05, + "loss": 0.53, + "step": 128800 + }, + { + "epoch": 0.06, + "learning_rate": 8.798989898989899e-05, + "loss": 0.5305, + "step": 128900 + }, + { + "epoch": 0.06, + "learning_rate": 8.797979797979798e-05, + "loss": 0.5346, + "step": 129000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5311658427129934, + "eval_average_loss_on_sentence_tokens": 0.4358887921314271, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.5269238352775574, + "eval_non_padding_tokens_in_labels": 133.56185, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40155, + "eval_padding_tokens_in_labels": 378.43815, + "eval_reconstruction_accuracy": 0.9069961700611454, + "eval_runtime": 184.3637, + "eval_samples_per_second": 27.12, + "eval_sentence_accuracy": 0.7436834927414, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 129000 + }, + { + "epoch": 0.06, + "learning_rate": 8.796969696969697e-05, + "loss": 0.5323, + "step": 129100 + }, + { + "epoch": 0.06, + "learning_rate": 8.795959595959597e-05, + "loss": 0.5332, + "step": 129200 + }, + { + "epoch": 0.06, + "learning_rate": 8.794949494949494e-05, + "loss": 0.5282, + "step": 129300 + }, + { + "epoch": 0.06, + "learning_rate": 8.793939393939395e-05, + "loss": 0.5297, + "step": 129400 + }, + { + "epoch": 0.06, + "learning_rate": 8.792929292929294e-05, + "loss": 0.535, + "step": 129500 + }, + { + "epoch": 0.06, + "learning_rate": 8.791919191919192e-05, + "loss": 0.529, + "step": 129600 + }, + { + "epoch": 0.06, + "learning_rate": 8.790909090909091e-05, + "loss": 0.5322, + "step": 129700 + }, + { + "epoch": 0.06, + "learning_rate": 8.78989898989899e-05, + "loss": 0.5308, + "step": 129800 + }, + { + "epoch": 0.06, + "learning_rate": 8.78888888888889e-05, + "loss": 0.5262, + "step": 129900 + }, + { + "epoch": 0.06, + "learning_rate": 8.787878787878789e-05, + "loss": 0.5327, + "step": 130000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5311903093349502, + "eval_average_loss_on_sentence_tokens": 0.40945821322062215, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.525634765625, + "eval_non_padding_tokens_in_labels": 133.5374, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3768, + "eval_padding_tokens_in_labels": 378.4626, + "eval_reconstruction_accuracy": 0.9070637388747735, + "eval_runtime": 189.6324, + "eval_samples_per_second": 26.367, + "eval_sentence_accuracy": 0.7510093850377735, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 130000 + }, + { + "epoch": 0.06, + "learning_rate": 8.786868686868688e-05, + "loss": 0.5321, + "step": 130100 + }, + { + "epoch": 0.06, + "learning_rate": 8.785858585858586e-05, + "loss": 0.5298, + "step": 130200 + }, + { + "epoch": 0.06, + "learning_rate": 8.784848484848486e-05, + "loss": 0.5295, + "step": 130300 + }, + { + "epoch": 0.06, + "learning_rate": 8.783838383838384e-05, + "loss": 0.5297, + "step": 130400 + }, + { + "epoch": 0.06, + "learning_rate": 8.782828282828283e-05, + "loss": 0.528, + "step": 130500 + }, + { + "epoch": 0.06, + "learning_rate": 8.781818181818183e-05, + "loss": 0.526, + "step": 130600 + }, + { + "epoch": 0.06, + "learning_rate": 8.780808080808082e-05, + "loss": 0.5318, + "step": 130700 + }, + { + "epoch": 0.06, + "learning_rate": 8.77979797979798e-05, + "loss": 0.5314, + "step": 130800 + }, + { + "epoch": 0.06, + "learning_rate": 8.77878787878788e-05, + "loss": 0.5282, + "step": 130900 + }, + { + "epoch": 0.06, + "learning_rate": 8.777777777777778e-05, + "loss": 0.5317, + "step": 131000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5299392317359742, + "eval_average_loss_on_sentence_tokens": 0.41120096344461615, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.5244531035423279, + "eval_non_padding_tokens_in_labels": 133.4979, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.383, + "eval_padding_tokens_in_labels": 378.5021, + "eval_reconstruction_accuracy": 0.9073133379190407, + "eval_runtime": 181.9493, + "eval_samples_per_second": 27.48, + "eval_sentence_accuracy": 0.7478331867855798, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 131000 + }, + { + "epoch": 0.06, + "learning_rate": 8.776767676767677e-05, + "loss": 0.5324, + "step": 131100 + }, + { + "epoch": 0.06, + "learning_rate": 8.775757575757576e-05, + "loss": 0.5355, + "step": 131200 + }, + { + "epoch": 0.06, + "learning_rate": 8.774747474747476e-05, + "loss": 0.5328, + "step": 131300 + }, + { + "epoch": 0.06, + "learning_rate": 8.773737373737373e-05, + "loss": 0.5292, + "step": 131400 + }, + { + "epoch": 0.06, + "learning_rate": 8.772727272727274e-05, + "loss": 0.5325, + "step": 131500 + }, + { + "epoch": 0.06, + "learning_rate": 8.771717171717172e-05, + "loss": 0.5273, + "step": 131600 + }, + { + "epoch": 0.06, + "learning_rate": 8.770707070707071e-05, + "loss": 0.5314, + "step": 131700 + }, + { + "epoch": 0.06, + "learning_rate": 8.76969696969697e-05, + "loss": 0.532, + "step": 131800 + }, + { + "epoch": 0.06, + "learning_rate": 8.76868686868687e-05, + "loss": 0.529, + "step": 131900 + }, + { + "epoch": 0.06, + "learning_rate": 8.767676767676767e-05, + "loss": 0.5276, + "step": 132000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5296104926831863, + "eval_average_loss_on_sentence_tokens": 0.4378864437115044, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.5255273580551147, + "eval_non_padding_tokens_in_labels": 133.50575, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37145, + "eval_padding_tokens_in_labels": 378.49425, + "eval_reconstruction_accuracy": 0.9073743106219871, + "eval_runtime": 186.1079, + "eval_samples_per_second": 26.866, + "eval_sentence_accuracy": 0.7244064815977892, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 132000 + }, + { + "epoch": 0.06, + "learning_rate": 8.766666666666668e-05, + "loss": 0.5318, + "step": 132100 + }, + { + "epoch": 0.06, + "learning_rate": 8.765656565656566e-05, + "loss": 0.5286, + "step": 132200 + }, + { + "epoch": 0.06, + "learning_rate": 8.764646464646465e-05, + "loss": 0.5301, + "step": 132300 + }, + { + "epoch": 0.06, + "learning_rate": 8.763636363636364e-05, + "loss": 0.5287, + "step": 132400 + }, + { + "epoch": 0.06, + "learning_rate": 8.762626262626263e-05, + "loss": 0.5277, + "step": 132500 + }, + { + "epoch": 0.06, + "learning_rate": 8.761616161616161e-05, + "loss": 0.5254, + "step": 132600 + }, + { + "epoch": 0.06, + "learning_rate": 8.760606060606062e-05, + "loss": 0.5284, + "step": 132700 + }, + { + "epoch": 0.06, + "learning_rate": 8.75959595959596e-05, + "loss": 0.5313, + "step": 132800 + }, + { + "epoch": 0.06, + "learning_rate": 8.758585858585859e-05, + "loss": 0.5299, + "step": 132900 + }, + { + "epoch": 0.06, + "learning_rate": 8.757575757575758e-05, + "loss": 0.5337, + "step": 133000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5292218411645392, + "eval_average_loss_on_sentence_tokens": 0.40809947199101426, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.5238671898841858, + "eval_non_padding_tokens_in_labels": 133.54255, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3902, + "eval_padding_tokens_in_labels": 378.45745, + "eval_reconstruction_accuracy": 0.9073830478977843, + "eval_runtime": 181.3391, + "eval_samples_per_second": 27.573, + "eval_sentence_accuracy": 0.7448184901394297, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.248775, + "step": 133000 + }, + { + "epoch": 0.06, + "learning_rate": 8.756565656565657e-05, + "loss": 0.5308, + "step": 133100 + }, + { + "epoch": 0.06, + "learning_rate": 8.755555555555556e-05, + "loss": 0.5308, + "step": 133200 + }, + { + "epoch": 0.06, + "learning_rate": 8.754545454545456e-05, + "loss": 0.5285, + "step": 133300 + }, + { + "epoch": 0.06, + "learning_rate": 8.753535353535353e-05, + "loss": 0.5291, + "step": 133400 + }, + { + "epoch": 0.06, + "learning_rate": 8.752525252525253e-05, + "loss": 0.5263, + "step": 133500 + }, + { + "epoch": 0.06, + "learning_rate": 8.751515151515152e-05, + "loss": 0.5275, + "step": 133600 + }, + { + "epoch": 0.06, + "learning_rate": 8.750505050505051e-05, + "loss": 0.5286, + "step": 133700 + }, + { + "epoch": 0.06, + "learning_rate": 8.74949494949495e-05, + "loss": 0.5283, + "step": 133800 + }, + { + "epoch": 0.06, + "learning_rate": 8.74848484848485e-05, + "loss": 0.5283, + "step": 133900 + }, + { + "epoch": 0.06, + "learning_rate": 8.747474747474747e-05, + "loss": 0.527, + "step": 134000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.5274941273259552, + "eval_average_loss_on_sentence_tokens": 0.45092452295775926, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.5240820050239563, + "eval_non_padding_tokens_in_labels": 133.4817, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3576, + "eval_padding_tokens_in_labels": 378.5183, + "eval_reconstruction_accuracy": 0.9075426045804155, + "eval_runtime": 184.6872, + "eval_samples_per_second": 27.073, + "eval_sentence_accuracy": 0.738156548890125, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 134000 + }, + { + "epoch": 0.06, + "learning_rate": 8.746464646464648e-05, + "loss": 0.5302, + "step": 134100 + }, + { + "epoch": 0.06, + "learning_rate": 8.745454545454546e-05, + "loss": 0.526, + "step": 134200 + }, + { + "epoch": 0.06, + "learning_rate": 8.744444444444445e-05, + "loss": 0.5308, + "step": 134300 + }, + { + "epoch": 0.06, + "learning_rate": 8.743434343434344e-05, + "loss": 0.5276, + "step": 134400 + }, + { + "epoch": 0.06, + "learning_rate": 8.742424242424243e-05, + "loss": 0.526, + "step": 134500 + }, + { + "epoch": 0.06, + "learning_rate": 8.741414141414141e-05, + "loss": 0.5253, + "step": 134600 + }, + { + "epoch": 0.06, + "learning_rate": 8.740404040404042e-05, + "loss": 0.5244, + "step": 134700 + }, + { + "epoch": 0.06, + "learning_rate": 8.73939393939394e-05, + "loss": 0.5286, + "step": 134800 + }, + { + "epoch": 0.06, + "learning_rate": 8.738383838383839e-05, + "loss": 0.5314, + "step": 134900 + }, + { + "epoch": 0.07, + "learning_rate": 8.737373737373738e-05, + "loss": 0.5264, + "step": 135000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.527710662481836, + "eval_average_loss_on_sentence_tokens": 0.4174457761445335, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.5226855278015137, + "eval_non_padding_tokens_in_labels": 133.54305, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39125, + "eval_padding_tokens_in_labels": 378.45695, + "eval_reconstruction_accuracy": 0.9075911673053005, + "eval_runtime": 180.1741, + "eval_samples_per_second": 27.751, + "eval_sentence_accuracy": 0.7421940890412188, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 135000 + }, + { + "epoch": 0.07, + "learning_rate": 8.736363636363637e-05, + "loss": 0.5265, + "step": 135100 + }, + { + "epoch": 0.07, + "learning_rate": 8.735353535353535e-05, + "loss": 0.526, + "step": 135200 + }, + { + "epoch": 0.07, + "learning_rate": 8.734343434343435e-05, + "loss": 0.5278, + "step": 135300 + }, + { + "epoch": 0.07, + "learning_rate": 8.733333333333333e-05, + "loss": 0.5272, + "step": 135400 + }, + { + "epoch": 0.07, + "learning_rate": 8.732323232323232e-05, + "loss": 0.5266, + "step": 135500 + }, + { + "epoch": 0.07, + "learning_rate": 8.731313131313132e-05, + "loss": 0.5251, + "step": 135600 + }, + { + "epoch": 0.07, + "learning_rate": 8.730303030303031e-05, + "loss": 0.5269, + "step": 135700 + }, + { + "epoch": 0.07, + "learning_rate": 8.729292929292929e-05, + "loss": 0.5272, + "step": 135800 + }, + { + "epoch": 0.07, + "learning_rate": 8.728282828282829e-05, + "loss": 0.5259, + "step": 135900 + }, + { + "epoch": 0.07, + "learning_rate": 8.727272727272727e-05, + "loss": 0.528, + "step": 136000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5278464673134858, + "eval_average_loss_on_sentence_tokens": 0.47127693202919063, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.5254004001617432, + "eval_non_padding_tokens_in_labels": 133.5425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38635, + "eval_padding_tokens_in_labels": 378.4575, + "eval_reconstruction_accuracy": 0.9076049008819919, + "eval_runtime": 179.5229, + "eval_samples_per_second": 27.852, + "eval_sentence_accuracy": 0.6972921564053332, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 136000 + }, + { + "epoch": 0.07, + "learning_rate": 8.726262626262626e-05, + "loss": 0.527, + "step": 136100 + }, + { + "epoch": 0.07, + "learning_rate": 8.725252525252526e-05, + "loss": 0.5252, + "step": 136200 + }, + { + "epoch": 0.07, + "learning_rate": 8.724242424242425e-05, + "loss": 0.5261, + "step": 136300 + }, + { + "epoch": 0.07, + "learning_rate": 8.723232323232323e-05, + "loss": 0.5276, + "step": 136400 + }, + { + "epoch": 0.07, + "learning_rate": 8.722222222222223e-05, + "loss": 0.5277, + "step": 136500 + }, + { + "epoch": 0.07, + "learning_rate": 8.721212121212121e-05, + "loss": 0.5258, + "step": 136600 + }, + { + "epoch": 0.07, + "learning_rate": 8.72020202020202e-05, + "loss": 0.5273, + "step": 136700 + }, + { + "epoch": 0.07, + "learning_rate": 8.71919191919192e-05, + "loss": 0.5272, + "step": 136800 + }, + { + "epoch": 0.07, + "learning_rate": 8.718181818181819e-05, + "loss": 0.5207, + "step": 136900 + }, + { + "epoch": 0.07, + "learning_rate": 8.717171717171718e-05, + "loss": 0.5256, + "step": 137000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5269098435862145, + "eval_average_loss_on_sentence_tokens": 0.4533911281072401, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.5236132740974426, + "eval_non_padding_tokens_in_labels": 133.53475, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39085, + "eval_padding_tokens_in_labels": 378.46525, + "eval_reconstruction_accuracy": 0.9077954531772741, + "eval_runtime": 179.7982, + "eval_samples_per_second": 27.809, + "eval_sentence_accuracy": 0.7073994652502378, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 137000 + }, + { + "epoch": 0.07, + "learning_rate": 8.716161616161617e-05, + "loss": 0.5271, + "step": 137100 + }, + { + "epoch": 0.07, + "learning_rate": 8.715151515151515e-05, + "loss": 0.523, + "step": 137200 + }, + { + "epoch": 0.07, + "learning_rate": 8.714141414141414e-05, + "loss": 0.5258, + "step": 137300 + }, + { + "epoch": 0.07, + "learning_rate": 8.713131313131313e-05, + "loss": 0.528, + "step": 137400 + }, + { + "epoch": 0.07, + "learning_rate": 8.712121212121212e-05, + "loss": 0.5272, + "step": 137500 + }, + { + "epoch": 0.07, + "learning_rate": 8.711111111111112e-05, + "loss": 0.5243, + "step": 137600 + }, + { + "epoch": 0.07, + "learning_rate": 8.710101010101011e-05, + "loss": 0.5305, + "step": 137700 + }, + { + "epoch": 0.07, + "learning_rate": 8.709090909090909e-05, + "loss": 0.5233, + "step": 137800 + }, + { + "epoch": 0.07, + "learning_rate": 8.708080808080809e-05, + "loss": 0.5258, + "step": 137900 + }, + { + "epoch": 0.07, + "learning_rate": 8.707070707070707e-05, + "loss": 0.528, + "step": 138000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5258892584913565, + "eval_average_loss_on_sentence_tokens": 0.416223615782161, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.5208300948143005, + "eval_non_padding_tokens_in_labels": 133.52795, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37115, + "eval_padding_tokens_in_labels": 378.47205, + "eval_reconstruction_accuracy": 0.90764159203245, + "eval_runtime": 182.2315, + "eval_samples_per_second": 27.438, + "eval_sentence_accuracy": 0.7452042995316454, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24960000000000004, + "step": 138000 + }, + { + "epoch": 0.07, + "learning_rate": 8.706060606060606e-05, + "loss": 0.5272, + "step": 138100 + }, + { + "epoch": 0.07, + "learning_rate": 8.705050505050505e-05, + "loss": 0.5252, + "step": 138200 + }, + { + "epoch": 0.07, + "learning_rate": 8.704040404040405e-05, + "loss": 0.5276, + "step": 138300 + }, + { + "epoch": 0.07, + "learning_rate": 8.703030303030304e-05, + "loss": 0.525, + "step": 138400 + }, + { + "epoch": 0.07, + "learning_rate": 8.702020202020203e-05, + "loss": 0.5277, + "step": 138500 + }, + { + "epoch": 0.07, + "learning_rate": 8.701010101010102e-05, + "loss": 0.5249, + "step": 138600 + }, + { + "epoch": 0.07, + "learning_rate": 8.7e-05, + "loss": 0.5244, + "step": 138700 + }, + { + "epoch": 0.07, + "learning_rate": 8.698989898989899e-05, + "loss": 0.5247, + "step": 138800 + }, + { + "epoch": 0.07, + "learning_rate": 8.697979797979798e-05, + "loss": 0.5224, + "step": 138900 + }, + { + "epoch": 0.07, + "learning_rate": 8.696969696969698e-05, + "loss": 0.5222, + "step": 139000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5252539540181145, + "eval_average_loss_on_sentence_tokens": 0.47253899469989136, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.5229394435882568, + "eval_non_padding_tokens_in_labels": 133.53885, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3854, + "eval_padding_tokens_in_labels": 378.46115, + "eval_reconstruction_accuracy": 0.9078570139998364, + "eval_runtime": 179.0442, + "eval_samples_per_second": 27.926, + "eval_sentence_accuracy": 0.7010336102786799, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.246975, + "step": 139000 + }, + { + "epoch": 0.07, + "learning_rate": 8.695959595959597e-05, + "loss": 0.5258, + "step": 139100 + }, + { + "epoch": 0.07, + "learning_rate": 8.694949494949496e-05, + "loss": 0.5215, + "step": 139200 + }, + { + "epoch": 0.07, + "learning_rate": 8.693939393939394e-05, + "loss": 0.5286, + "step": 139300 + }, + { + "epoch": 0.07, + "learning_rate": 8.692929292929294e-05, + "loss": 0.5249, + "step": 139400 + }, + { + "epoch": 0.07, + "learning_rate": 8.691919191919192e-05, + "loss": 0.5201, + "step": 139500 + }, + { + "epoch": 0.07, + "learning_rate": 8.690909090909091e-05, + "loss": 0.5276, + "step": 139600 + }, + { + "epoch": 0.07, + "learning_rate": 8.68989898989899e-05, + "loss": 0.522, + "step": 139700 + }, + { + "epoch": 0.07, + "learning_rate": 8.68888888888889e-05, + "loss": 0.526, + "step": 139800 + }, + { + "epoch": 0.07, + "learning_rate": 8.687878787878788e-05, + "loss": 0.5231, + "step": 139900 + }, + { + "epoch": 0.07, + "learning_rate": 8.686868686868688e-05, + "loss": 0.5242, + "step": 140000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5243534969670632, + "eval_average_loss_on_sentence_tokens": 0.4540439366345933, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5212206840515137, + "eval_non_padding_tokens_in_labels": 133.5254, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3813, + "eval_padding_tokens_in_labels": 378.4746, + "eval_reconstruction_accuracy": 0.9078524795477927, + "eval_runtime": 180.8657, + "eval_samples_per_second": 27.645, + "eval_sentence_accuracy": 0.7205214707412924, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 140000 + }, + { + "epoch": 0.07, + "learning_rate": 8.685858585858586e-05, + "loss": 0.5237, + "step": 140100 + }, + { + "epoch": 0.07, + "learning_rate": 8.684848484848485e-05, + "loss": 0.5271, + "step": 140200 + }, + { + "epoch": 0.07, + "learning_rate": 8.683838383838385e-05, + "loss": 0.5241, + "step": 140300 + }, + { + "epoch": 0.07, + "learning_rate": 8.682828282828284e-05, + "loss": 0.5237, + "step": 140400 + }, + { + "epoch": 0.07, + "learning_rate": 8.681818181818182e-05, + "loss": 0.5241, + "step": 140500 + }, + { + "epoch": 0.07, + "learning_rate": 8.680808080808082e-05, + "loss": 0.5232, + "step": 140600 + }, + { + "epoch": 0.07, + "learning_rate": 8.67979797979798e-05, + "loss": 0.5238, + "step": 140700 + }, + { + "epoch": 0.07, + "learning_rate": 8.678787878787879e-05, + "loss": 0.524, + "step": 140800 + }, + { + "epoch": 0.07, + "learning_rate": 8.677777777777778e-05, + "loss": 0.5243, + "step": 140900 + }, + { + "epoch": 0.07, + "learning_rate": 8.676767676767678e-05, + "loss": 0.5239, + "step": 141000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5235514967084085, + "eval_average_loss_on_sentence_tokens": 0.4204572168802725, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.518847644329071, + "eval_non_padding_tokens_in_labels": 133.5023, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38065, + "eval_padding_tokens_in_labels": 378.4977, + "eval_reconstruction_accuracy": 0.9079790872135156, + "eval_runtime": 181.8454, + "eval_samples_per_second": 27.496, + "eval_sentence_accuracy": 0.7473935435246828, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24909999999999996, + "step": 141000 + }, + { + "epoch": 0.07, + "learning_rate": 8.675757575757575e-05, + "loss": 0.5221, + "step": 141100 + }, + { + "epoch": 0.07, + "learning_rate": 8.674747474747476e-05, + "loss": 0.523, + "step": 141200 + }, + { + "epoch": 0.07, + "learning_rate": 8.673737373737374e-05, + "loss": 0.5217, + "step": 141300 + }, + { + "epoch": 0.07, + "learning_rate": 8.672727272727273e-05, + "loss": 0.5241, + "step": 141400 + }, + { + "epoch": 0.07, + "learning_rate": 8.671717171717172e-05, + "loss": 0.5211, + "step": 141500 + }, + { + "epoch": 0.07, + "learning_rate": 8.670707070707071e-05, + "loss": 0.5252, + "step": 141600 + }, + { + "epoch": 0.07, + "learning_rate": 8.669696969696969e-05, + "loss": 0.52, + "step": 141700 + }, + { + "epoch": 0.07, + "learning_rate": 8.66868686868687e-05, + "loss": 0.5197, + "step": 141800 + }, + { + "epoch": 0.07, + "learning_rate": 8.667676767676768e-05, + "loss": 0.5183, + "step": 141900 + }, + { + "epoch": 0.07, + "learning_rate": 8.666666666666667e-05, + "loss": 0.5232, + "step": 142000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5226269075223314, + "eval_average_loss_on_sentence_tokens": 0.447465804398073, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.519287109375, + "eval_non_padding_tokens_in_labels": 133.534, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38645, + "eval_padding_tokens_in_labels": 378.466, + "eval_reconstruction_accuracy": 0.9080146899676458, + "eval_runtime": 178.6711, + "eval_samples_per_second": 27.984, + "eval_sentence_accuracy": 0.7245814416710032, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 142000 + }, + { + "epoch": 0.07, + "learning_rate": 8.665656565656566e-05, + "loss": 0.5227, + "step": 142100 + }, + { + "epoch": 0.07, + "learning_rate": 8.664646464646465e-05, + "loss": 0.5272, + "step": 142200 + }, + { + "epoch": 0.07, + "learning_rate": 8.663636363636364e-05, + "loss": 0.5277, + "step": 142300 + }, + { + "epoch": 0.07, + "learning_rate": 8.662626262626264e-05, + "loss": 0.5207, + "step": 142400 + }, + { + "epoch": 0.07, + "learning_rate": 8.661616161616161e-05, + "loss": 0.5207, + "step": 142500 + }, + { + "epoch": 0.07, + "learning_rate": 8.66060606060606e-05, + "loss": 0.5261, + "step": 142600 + }, + { + "epoch": 0.07, + "learning_rate": 8.65959595959596e-05, + "loss": 0.5201, + "step": 142700 + }, + { + "epoch": 0.07, + "learning_rate": 8.658585858585859e-05, + "loss": 0.5225, + "step": 142800 + }, + { + "epoch": 0.07, + "learning_rate": 8.657575757575758e-05, + "loss": 0.5224, + "step": 142900 + }, + { + "epoch": 0.07, + "learning_rate": 8.656565656565657e-05, + "loss": 0.5183, + "step": 143000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.522966305414608, + "eval_average_loss_on_sentence_tokens": 0.3783006986556033, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.516406238079071, + "eval_non_padding_tokens_in_labels": 133.5272, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37745, + "eval_padding_tokens_in_labels": 378.4728, + "eval_reconstruction_accuracy": 0.9080998666817616, + "eval_runtime": 180.4509, + "eval_samples_per_second": 27.708, + "eval_sentence_accuracy": 0.7635392179733342, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 143000 + }, + { + "epoch": 0.07, + "learning_rate": 8.655555555555555e-05, + "loss": 0.5263, + "step": 143100 + }, + { + "epoch": 0.07, + "learning_rate": 8.654545454545456e-05, + "loss": 0.5201, + "step": 143200 + }, + { + "epoch": 0.07, + "learning_rate": 8.653535353535354e-05, + "loss": 0.5228, + "step": 143300 + }, + { + "epoch": 0.07, + "learning_rate": 8.652525252525253e-05, + "loss": 0.5197, + "step": 143400 + }, + { + "epoch": 0.07, + "learning_rate": 8.651515151515152e-05, + "loss": 0.519, + "step": 143500 + }, + { + "epoch": 0.07, + "learning_rate": 8.650505050505051e-05, + "loss": 0.5185, + "step": 143600 + }, + { + "epoch": 0.07, + "learning_rate": 8.649494949494949e-05, + "loss": 0.52, + "step": 143700 + }, + { + "epoch": 0.07, + "learning_rate": 8.64848484848485e-05, + "loss": 0.5218, + "step": 143800 + }, + { + "epoch": 0.07, + "learning_rate": 8.647474747474748e-05, + "loss": 0.5205, + "step": 143900 + }, + { + "epoch": 0.07, + "learning_rate": 8.646464646464647e-05, + "loss": 0.5208, + "step": 144000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5221004595351628, + "eval_average_loss_on_sentence_tokens": 0.5098099148714685, + "eval_average_shuffling_prob": 0.61, + "eval_loss": 0.5216211080551147, + "eval_non_padding_tokens_in_labels": 133.5147, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3757, + "eval_padding_tokens_in_labels": 378.4853, + "eval_reconstruction_accuracy": 0.9081429144902481, + "eval_runtime": 176.8092, + "eval_samples_per_second": 28.279, + "eval_sentence_accuracy": 0.6729727062285786, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.23789999999999994, + "step": 144000 + }, + { + "epoch": 0.07, + "learning_rate": 8.645454545454546e-05, + "loss": 0.5239, + "step": 144100 + }, + { + "epoch": 0.07, + "learning_rate": 8.644444444444445e-05, + "loss": 0.5211, + "step": 144200 + }, + { + "epoch": 0.07, + "learning_rate": 8.643434343434343e-05, + "loss": 0.5188, + "step": 144300 + }, + { + "epoch": 0.07, + "learning_rate": 8.642424242424243e-05, + "loss": 0.5197, + "step": 144400 + }, + { + "epoch": 0.07, + "learning_rate": 8.641414141414141e-05, + "loss": 0.5171, + "step": 144500 + }, + { + "epoch": 0.07, + "learning_rate": 8.64040404040404e-05, + "loss": 0.5275, + "step": 144600 + }, + { + "epoch": 0.07, + "learning_rate": 8.63939393939394e-05, + "loss": 0.5247, + "step": 144700 + }, + { + "epoch": 0.07, + "learning_rate": 8.638383838383839e-05, + "loss": 0.5246, + "step": 144800 + }, + { + "epoch": 0.07, + "learning_rate": 8.637373737373737e-05, + "loss": 0.5189, + "step": 144900 + }, + { + "epoch": 0.07, + "learning_rate": 8.636363636363637e-05, + "loss": 0.5197, + "step": 145000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.5209538504116651, + "eval_average_loss_on_sentence_tokens": 0.4416128120766297, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.517382800579071, + "eval_non_padding_tokens_in_labels": 133.53125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3802, + "eval_padding_tokens_in_labels": 378.46875, + "eval_reconstruction_accuracy": 0.908297538746756, + "eval_runtime": 183.6542, + "eval_samples_per_second": 27.225, + "eval_sentence_accuracy": 0.7287939418953111, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 145000 + }, + { + "epoch": 0.08, + "learning_rate": 8.635353535353535e-05, + "loss": 0.5189, + "step": 145100 + }, + { + "epoch": 0.08, + "learning_rate": 8.634343434343434e-05, + "loss": 0.519, + "step": 145200 + }, + { + "epoch": 0.08, + "learning_rate": 8.633333333333334e-05, + "loss": 0.5239, + "step": 145300 + }, + { + "epoch": 0.08, + "learning_rate": 8.632323232323233e-05, + "loss": 0.5227, + "step": 145400 + }, + { + "epoch": 0.08, + "learning_rate": 8.63131313131313e-05, + "loss": 0.5179, + "step": 145500 + }, + { + "epoch": 0.08, + "learning_rate": 8.630303030303031e-05, + "loss": 0.5225, + "step": 145600 + }, + { + "epoch": 0.08, + "learning_rate": 8.629292929292929e-05, + "loss": 0.5209, + "step": 145700 + }, + { + "epoch": 0.08, + "learning_rate": 8.628282828282828e-05, + "loss": 0.52, + "step": 145800 + }, + { + "epoch": 0.08, + "learning_rate": 8.627272727272727e-05, + "loss": 0.5251, + "step": 145900 + }, + { + "epoch": 0.08, + "learning_rate": 8.626262626262627e-05, + "loss": 0.5182, + "step": 146000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5208763040275032, + "eval_average_loss_on_sentence_tokens": 0.42236945046811497, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5165331959724426, + "eval_non_padding_tokens_in_labels": 133.5353, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38845, + "eval_padding_tokens_in_labels": 378.4647, + "eval_reconstruction_accuracy": 0.9082731710336435, + "eval_runtime": 181.6481, + "eval_samples_per_second": 27.526, + "eval_sentence_accuracy": 0.7420281012794516, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 146000 + }, + { + "epoch": 0.08, + "learning_rate": 8.625252525252526e-05, + "loss": 0.5188, + "step": 146100 + }, + { + "epoch": 0.08, + "learning_rate": 8.624242424242425e-05, + "loss": 0.5188, + "step": 146200 + }, + { + "epoch": 0.08, + "learning_rate": 8.623232323232323e-05, + "loss": 0.5221, + "step": 146300 + }, + { + "epoch": 0.08, + "learning_rate": 8.622222222222222e-05, + "loss": 0.5246, + "step": 146400 + }, + { + "epoch": 0.08, + "learning_rate": 8.621212121212121e-05, + "loss": 0.5197, + "step": 146500 + }, + { + "epoch": 0.08, + "learning_rate": 8.62020202020202e-05, + "loss": 0.52, + "step": 146600 + }, + { + "epoch": 0.08, + "learning_rate": 8.61919191919192e-05, + "loss": 0.5136, + "step": 146700 + }, + { + "epoch": 0.08, + "learning_rate": 8.618181818181819e-05, + "loss": 0.5191, + "step": 146800 + }, + { + "epoch": 0.08, + "learning_rate": 8.617171717171718e-05, + "loss": 0.5234, + "step": 146900 + }, + { + "epoch": 0.08, + "learning_rate": 8.616161616161616e-05, + "loss": 0.5205, + "step": 147000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5200103605375002, + "eval_average_loss_on_sentence_tokens": 0.4376017170707215, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.5162500143051147, + "eval_non_padding_tokens_in_labels": 133.5275, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38105, + "eval_padding_tokens_in_labels": 378.4725, + "eval_reconstruction_accuracy": 0.9083568217415755, + "eval_runtime": 185.6202, + "eval_samples_per_second": 26.937, + "eval_sentence_accuracy": 0.7303237209970033, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.25, + "step": 147000 + }, + { + "epoch": 0.08, + "learning_rate": 8.615151515151516e-05, + "loss": 0.5194, + "step": 147100 + }, + { + "epoch": 0.08, + "learning_rate": 8.614141414141414e-05, + "loss": 0.5171, + "step": 147200 + }, + { + "epoch": 0.08, + "learning_rate": 8.613131313131313e-05, + "loss": 0.5231, + "step": 147300 + }, + { + "epoch": 0.08, + "learning_rate": 8.612121212121213e-05, + "loss": 0.5171, + "step": 147400 + }, + { + "epoch": 0.08, + "learning_rate": 8.611111111111112e-05, + "loss": 0.5229, + "step": 147500 + }, + { + "epoch": 0.08, + "learning_rate": 8.610101010101011e-05, + "loss": 0.5188, + "step": 147600 + }, + { + "epoch": 0.08, + "learning_rate": 8.60909090909091e-05, + "loss": 0.5208, + "step": 147700 + }, + { + "epoch": 0.08, + "learning_rate": 8.608080808080808e-05, + "loss": 0.519, + "step": 147800 + }, + { + "epoch": 0.08, + "learning_rate": 8.607070707070707e-05, + "loss": 0.5172, + "step": 147900 + }, + { + "epoch": 0.08, + "learning_rate": 8.606060606060606e-05, + "loss": 0.5165, + "step": 148000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5207644910731134, + "eval_average_loss_on_sentence_tokens": 0.4449057331636606, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5173535346984863, + "eval_non_padding_tokens_in_labels": 133.5037, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3704, + "eval_padding_tokens_in_labels": 378.4963, + "eval_reconstruction_accuracy": 0.9083149913598797, + "eval_runtime": 182.0487, + "eval_samples_per_second": 27.465, + "eval_sentence_accuracy": 0.718785328476322, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 148000 + }, + { + "epoch": 0.08, + "learning_rate": 8.605050505050506e-05, + "loss": 0.5171, + "step": 148100 + }, + { + "epoch": 0.08, + "learning_rate": 8.604040404040405e-05, + "loss": 0.5174, + "step": 148200 + }, + { + "epoch": 0.08, + "learning_rate": 8.603030303030304e-05, + "loss": 0.5161, + "step": 148300 + }, + { + "epoch": 0.08, + "learning_rate": 8.602020202020202e-05, + "loss": 0.5166, + "step": 148400 + }, + { + "epoch": 0.08, + "learning_rate": 8.601010101010102e-05, + "loss": 0.5193, + "step": 148500 + }, + { + "epoch": 0.08, + "learning_rate": 8.6e-05, + "loss": 0.5194, + "step": 148600 + }, + { + "epoch": 0.08, + "learning_rate": 8.5989898989899e-05, + "loss": 0.5172, + "step": 148700 + }, + { + "epoch": 0.08, + "learning_rate": 8.597979797979799e-05, + "loss": 0.518, + "step": 148800 + }, + { + "epoch": 0.08, + "learning_rate": 8.596969696969698e-05, + "loss": 0.5172, + "step": 148900 + }, + { + "epoch": 0.08, + "learning_rate": 8.595959595959596e-05, + "loss": 0.5168, + "step": 149000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5195994685326729, + "eval_average_loss_on_sentence_tokens": 0.4361741274514896, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5158300995826721, + "eval_non_padding_tokens_in_labels": 133.5767, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39385, + "eval_padding_tokens_in_labels": 378.4233, + "eval_reconstruction_accuracy": 0.9084525244401378, + "eval_runtime": 181.7939, + "eval_samples_per_second": 27.504, + "eval_sentence_accuracy": 0.7286548710678845, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 149000 + }, + { + "epoch": 0.08, + "learning_rate": 8.594949494949496e-05, + "loss": 0.5203, + "step": 149100 + }, + { + "epoch": 0.08, + "learning_rate": 8.593939393939394e-05, + "loss": 0.5187, + "step": 149200 + }, + { + "epoch": 0.08, + "learning_rate": 8.592929292929293e-05, + "loss": 0.5207, + "step": 149300 + }, + { + "epoch": 0.08, + "learning_rate": 8.591919191919193e-05, + "loss": 0.5189, + "step": 149400 + }, + { + "epoch": 0.08, + "learning_rate": 8.590909090909092e-05, + "loss": 0.5179, + "step": 149500 + }, + { + "epoch": 0.08, + "learning_rate": 8.58989898989899e-05, + "loss": 0.5209, + "step": 149600 + }, + { + "epoch": 0.08, + "learning_rate": 8.58888888888889e-05, + "loss": 0.5226, + "step": 149700 + }, + { + "epoch": 0.08, + "learning_rate": 8.587878787878788e-05, + "loss": 0.5154, + "step": 149800 + }, + { + "epoch": 0.08, + "learning_rate": 8.586868686868687e-05, + "loss": 0.5194, + "step": 149900 + }, + { + "epoch": 0.08, + "learning_rate": 8.585858585858586e-05, + "loss": 0.5149, + "step": 150000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.518759097421262, + "eval_average_loss_on_sentence_tokens": 0.42902363379906766, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.5147558450698853, + "eval_non_padding_tokens_in_labels": 133.52555, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3808, + "eval_padding_tokens_in_labels": 378.47445, + "eval_reconstruction_accuracy": 0.9087046432870697, + "eval_runtime": 185.414, + "eval_samples_per_second": 26.967, + "eval_sentence_accuracy": 0.7210598094281049, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 150000 + }, + { + "epoch": 0.08, + "learning_rate": 8.584848484848486e-05, + "loss": 0.5211, + "step": 150100 + }, + { + "epoch": 0.08, + "learning_rate": 8.583838383838383e-05, + "loss": 0.5227, + "step": 150200 + }, + { + "epoch": 0.08, + "learning_rate": 8.582828282828284e-05, + "loss": 0.5203, + "step": 150300 + }, + { + "epoch": 0.08, + "learning_rate": 8.581818181818182e-05, + "loss": 0.515, + "step": 150400 + }, + { + "epoch": 0.08, + "learning_rate": 8.580808080808081e-05, + "loss": 0.5192, + "step": 150500 + }, + { + "epoch": 0.08, + "learning_rate": 8.57979797979798e-05, + "loss": 0.5194, + "step": 150600 + }, + { + "epoch": 0.08, + "learning_rate": 8.57878787878788e-05, + "loss": 0.5179, + "step": 150700 + }, + { + "epoch": 0.08, + "learning_rate": 8.577777777777777e-05, + "loss": 0.5158, + "step": 150800 + }, + { + "epoch": 0.08, + "learning_rate": 8.576767676767678e-05, + "loss": 0.5121, + "step": 150900 + }, + { + "epoch": 0.08, + "learning_rate": 8.575757575757576e-05, + "loss": 0.5177, + "step": 151000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5175936848569074, + "eval_average_loss_on_sentence_tokens": 0.4477953704124519, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.514453113079071, + "eval_non_padding_tokens_in_labels": 133.542, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37205, + "eval_padding_tokens_in_labels": 378.458, + "eval_reconstruction_accuracy": 0.9085627441928816, + "eval_runtime": 182.6737, + "eval_samples_per_second": 27.371, + "eval_sentence_accuracy": 0.7286144956663736, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2499, + "step": 151000 + }, + { + "epoch": 0.08, + "learning_rate": 8.574747474747475e-05, + "loss": 0.5161, + "step": 151100 + }, + { + "epoch": 0.08, + "learning_rate": 8.573737373737374e-05, + "loss": 0.5155, + "step": 151200 + }, + { + "epoch": 0.08, + "learning_rate": 8.572727272727273e-05, + "loss": 0.5214, + "step": 151300 + }, + { + "epoch": 0.08, + "learning_rate": 8.571717171717172e-05, + "loss": 0.5149, + "step": 151400 + }, + { + "epoch": 0.08, + "learning_rate": 8.570707070707072e-05, + "loss": 0.5202, + "step": 151500 + }, + { + "epoch": 0.08, + "learning_rate": 8.56969696969697e-05, + "loss": 0.5188, + "step": 151600 + }, + { + "epoch": 0.08, + "learning_rate": 8.568686868686869e-05, + "loss": 0.5186, + "step": 151700 + }, + { + "epoch": 0.08, + "learning_rate": 8.567676767676768e-05, + "loss": 0.5176, + "step": 151800 + }, + { + "epoch": 0.08, + "learning_rate": 8.566666666666667e-05, + "loss": 0.5178, + "step": 151900 + }, + { + "epoch": 0.08, + "learning_rate": 8.565656565656566e-05, + "loss": 0.5165, + "step": 152000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5178556415923546, + "eval_average_loss_on_sentence_tokens": 0.41711546618881284, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5133105516433716, + "eval_non_padding_tokens_in_labels": 133.52205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3791, + "eval_padding_tokens_in_labels": 378.47795, + "eval_reconstruction_accuracy": 0.9086763529602976, + "eval_runtime": 177.8671, + "eval_samples_per_second": 28.111, + "eval_sentence_accuracy": 0.7415525687727672, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 152000 + }, + { + "epoch": 0.08, + "learning_rate": 8.564646464646465e-05, + "loss": 0.5171, + "step": 152100 + }, + { + "epoch": 0.08, + "learning_rate": 8.563636363636363e-05, + "loss": 0.5153, + "step": 152200 + }, + { + "epoch": 0.08, + "learning_rate": 8.562626262626264e-05, + "loss": 0.5197, + "step": 152300 + }, + { + "epoch": 0.08, + "learning_rate": 8.561616161616162e-05, + "loss": 0.5125, + "step": 152400 + }, + { + "epoch": 0.08, + "learning_rate": 8.560606060606061e-05, + "loss": 0.5205, + "step": 152500 + }, + { + "epoch": 0.08, + "learning_rate": 8.55959595959596e-05, + "loss": 0.5198, + "step": 152600 + }, + { + "epoch": 0.08, + "learning_rate": 8.558585858585859e-05, + "loss": 0.5211, + "step": 152700 + }, + { + "epoch": 0.08, + "learning_rate": 8.557575757575757e-05, + "loss": 0.5172, + "step": 152800 + }, + { + "epoch": 0.08, + "learning_rate": 8.556565656565658e-05, + "loss": 0.5158, + "step": 152900 + }, + { + "epoch": 0.08, + "learning_rate": 8.555555555555556e-05, + "loss": 0.5205, + "step": 153000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5173423871916399, + "eval_average_loss_on_sentence_tokens": 0.38153586284210483, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.5111621022224426, + "eval_non_padding_tokens_in_labels": 133.47715, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36465, + "eval_padding_tokens_in_labels": 378.52285, + "eval_reconstruction_accuracy": 0.9087478075248134, + "eval_runtime": 182.2606, + "eval_samples_per_second": 27.433, + "eval_sentence_accuracy": 0.7586403359233406, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.247975, + "step": 153000 + }, + { + "epoch": 0.08, + "learning_rate": 8.554545454545455e-05, + "loss": 0.5178, + "step": 153100 + }, + { + "epoch": 0.08, + "learning_rate": 8.553535353535354e-05, + "loss": 0.5168, + "step": 153200 + }, + { + "epoch": 0.08, + "learning_rate": 8.552525252525253e-05, + "loss": 0.5159, + "step": 153300 + }, + { + "epoch": 0.08, + "learning_rate": 8.551515151515151e-05, + "loss": 0.5146, + "step": 153400 + }, + { + "epoch": 0.08, + "learning_rate": 8.550505050505052e-05, + "loss": 0.5173, + "step": 153500 + }, + { + "epoch": 0.08, + "learning_rate": 8.54949494949495e-05, + "loss": 0.5178, + "step": 153600 + }, + { + "epoch": 0.08, + "learning_rate": 8.548484848484849e-05, + "loss": 0.5189, + "step": 153700 + }, + { + "epoch": 0.08, + "learning_rate": 8.547474747474748e-05, + "loss": 0.5138, + "step": 153800 + }, + { + "epoch": 0.08, + "learning_rate": 8.546464646464647e-05, + "loss": 0.5169, + "step": 153900 + }, + { + "epoch": 0.08, + "learning_rate": 8.545454545454545e-05, + "loss": 0.5152, + "step": 154000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.5159843828896216, + "eval_average_loss_on_sentence_tokens": 0.4178657986629616, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5115039348602295, + "eval_non_padding_tokens_in_labels": 133.50955, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38655, + "eval_padding_tokens_in_labels": 378.49045, + "eval_reconstruction_accuracy": 0.908868034477433, + "eval_runtime": 180.0026, + "eval_samples_per_second": 27.777, + "eval_sentence_accuracy": 0.7409738546844438, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 154000 + }, + { + "epoch": 0.08, + "learning_rate": 8.544444444444445e-05, + "loss": 0.5135, + "step": 154100 + }, + { + "epoch": 0.08, + "learning_rate": 8.543434343434343e-05, + "loss": 0.5191, + "step": 154200 + }, + { + "epoch": 0.08, + "learning_rate": 8.542424242424242e-05, + "loss": 0.516, + "step": 154300 + }, + { + "epoch": 0.08, + "learning_rate": 8.541414141414142e-05, + "loss": 0.5153, + "step": 154400 + }, + { + "epoch": 0.08, + "learning_rate": 8.540404040404041e-05, + "loss": 0.5167, + "step": 154500 + }, + { + "epoch": 0.08, + "learning_rate": 8.539393939393939e-05, + "loss": 0.5142, + "step": 154600 + }, + { + "epoch": 0.08, + "learning_rate": 8.538383838383839e-05, + "loss": 0.5183, + "step": 154700 + }, + { + "epoch": 0.08, + "learning_rate": 8.537373737373737e-05, + "loss": 0.5144, + "step": 154800 + }, + { + "epoch": 0.08, + "learning_rate": 8.536363636363636e-05, + "loss": 0.5186, + "step": 154900 + }, + { + "epoch": 0.09, + "learning_rate": 8.535353535353535e-05, + "loss": 0.5129, + "step": 155000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5151406181187492, + "eval_average_loss_on_sentence_tokens": 0.4026679224396515, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.510058581829071, + "eval_non_padding_tokens_in_labels": 133.5266, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3974, + "eval_padding_tokens_in_labels": 378.4734, + "eval_reconstruction_accuracy": 0.9090085370012595, + "eval_runtime": 186.3532, + "eval_samples_per_second": 26.831, + "eval_sentence_accuracy": 0.7513996805857125, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "step": 155000 + }, + { + "epoch": 0.09, + "learning_rate": 8.534343434343435e-05, + "loss": 0.517, + "step": 155100 + }, + { + "epoch": 0.09, + "learning_rate": 8.533333333333334e-05, + "loss": 0.5118, + "step": 155200 + }, + { + "epoch": 0.09, + "learning_rate": 8.532323232323233e-05, + "loss": 0.5177, + "step": 155300 + }, + { + "epoch": 0.09, + "learning_rate": 8.531313131313132e-05, + "loss": 0.5163, + "step": 155400 + }, + { + "epoch": 0.09, + "learning_rate": 8.53030303030303e-05, + "loss": 0.519, + "step": 155500 + }, + { + "epoch": 0.09, + "learning_rate": 8.52929292929293e-05, + "loss": 0.5122, + "step": 155600 + }, + { + "epoch": 0.09, + "learning_rate": 8.528282828282828e-05, + "loss": 0.5123, + "step": 155700 + }, + { + "epoch": 0.09, + "learning_rate": 8.527272727272728e-05, + "loss": 0.5153, + "step": 155800 + }, + { + "epoch": 0.09, + "learning_rate": 8.526262626262627e-05, + "loss": 0.517, + "step": 155900 + }, + { + "epoch": 0.09, + "learning_rate": 8.525252525252526e-05, + "loss": 0.512, + "step": 156000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5145164259612109, + "eval_average_loss_on_sentence_tokens": 0.4623663407933066, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.5121093988418579, + "eval_non_padding_tokens_in_labels": 133.5008, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37905, + "eval_padding_tokens_in_labels": 378.4992, + "eval_reconstruction_accuracy": 0.9089612433995017, + "eval_runtime": 180.6924, + "eval_samples_per_second": 27.671, + "eval_sentence_accuracy": 0.7070046835465753, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.246975, + "step": 156000 + }, + { + "epoch": 0.09, + "learning_rate": 8.524242424242424e-05, + "loss": 0.5151, + "step": 156100 + }, + { + "epoch": 0.09, + "learning_rate": 8.523232323232324e-05, + "loss": 0.5171, + "step": 156200 + }, + { + "epoch": 0.09, + "learning_rate": 8.522222222222222e-05, + "loss": 0.5126, + "step": 156300 + }, + { + "epoch": 0.09, + "learning_rate": 8.521212121212122e-05, + "loss": 0.5138, + "step": 156400 + }, + { + "epoch": 0.09, + "learning_rate": 8.520202020202021e-05, + "loss": 0.517, + "step": 156500 + }, + { + "epoch": 0.09, + "learning_rate": 8.51919191919192e-05, + "loss": 0.5091, + "step": 156600 + }, + { + "epoch": 0.09, + "learning_rate": 8.518181818181819e-05, + "loss": 0.5106, + "step": 156700 + }, + { + "epoch": 0.09, + "learning_rate": 8.517171717171718e-05, + "loss": 0.5121, + "step": 156800 + }, + { + "epoch": 0.09, + "learning_rate": 8.516161616161616e-05, + "loss": 0.5176, + "step": 156900 + }, + { + "epoch": 0.09, + "learning_rate": 8.515151515151515e-05, + "loss": 0.5155, + "step": 157000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5154250489876065, + "eval_average_loss_on_sentence_tokens": 0.41393325709963513, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5108007788658142, + "eval_non_padding_tokens_in_labels": 133.53945, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3766, + "eval_padding_tokens_in_labels": 378.46055, + "eval_reconstruction_accuracy": 0.9090659324477461, + "eval_runtime": 183.0625, + "eval_samples_per_second": 27.313, + "eval_sentence_accuracy": 0.7349758644822079, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 157000 + }, + { + "epoch": 0.09, + "learning_rate": 8.514141414141415e-05, + "loss": 0.5084, + "step": 157100 + }, + { + "epoch": 0.09, + "learning_rate": 8.513131313131314e-05, + "loss": 0.5122, + "step": 157200 + }, + { + "epoch": 0.09, + "learning_rate": 8.512121212121213e-05, + "loss": 0.5116, + "step": 157300 + }, + { + "epoch": 0.09, + "learning_rate": 8.511111111111112e-05, + "loss": 0.5134, + "step": 157400 + }, + { + "epoch": 0.09, + "learning_rate": 8.51010101010101e-05, + "loss": 0.5122, + "step": 157500 + }, + { + "epoch": 0.09, + "learning_rate": 8.50909090909091e-05, + "loss": 0.5157, + "step": 157600 + }, + { + "epoch": 0.09, + "learning_rate": 8.508080808080808e-05, + "loss": 0.5177, + "step": 157700 + }, + { + "epoch": 0.09, + "learning_rate": 8.507070707070708e-05, + "loss": 0.514, + "step": 157800 + }, + { + "epoch": 0.09, + "learning_rate": 8.506060606060607e-05, + "loss": 0.5103, + "step": 157900 + }, + { + "epoch": 0.09, + "learning_rate": 8.505050505050506e-05, + "loss": 0.5165, + "step": 158000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5147348053121167, + "eval_average_loss_on_sentence_tokens": 0.43051861774075567, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.510986328125, + "eval_non_padding_tokens_in_labels": 133.5101, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37865, + "eval_padding_tokens_in_labels": 378.4899, + "eval_reconstruction_accuracy": 0.9090309997729781, + "eval_runtime": 180.1764, + "eval_samples_per_second": 27.751, + "eval_sentence_accuracy": 0.7314856353293736, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.249975, + "step": 158000 + }, + { + "epoch": 0.09, + "learning_rate": 8.504040404040404e-05, + "loss": 0.5149, + "step": 158100 + }, + { + "epoch": 0.09, + "learning_rate": 8.503030303030304e-05, + "loss": 0.5149, + "step": 158200 + }, + { + "epoch": 0.09, + "learning_rate": 8.502020202020202e-05, + "loss": 0.5102, + "step": 158300 + }, + { + "epoch": 0.09, + "learning_rate": 8.501010101010101e-05, + "loss": 0.512, + "step": 158400 + }, + { + "epoch": 0.09, + "learning_rate": 8.5e-05, + "loss": 0.5146, + "step": 158500 + }, + { + "epoch": 0.09, + "learning_rate": 8.4989898989899e-05, + "loss": 0.5158, + "step": 158600 + }, + { + "epoch": 0.09, + "learning_rate": 8.497979797979798e-05, + "loss": 0.5172, + "step": 158700 + }, + { + "epoch": 0.09, + "learning_rate": 8.496969696969698e-05, + "loss": 0.5113, + "step": 158800 + }, + { + "epoch": 0.09, + "learning_rate": 8.495959595959596e-05, + "loss": 0.5144, + "step": 158900 + }, + { + "epoch": 0.09, + "learning_rate": 8.494949494949495e-05, + "loss": 0.5156, + "step": 159000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5128883060522418, + "eval_average_loss_on_sentence_tokens": 0.4448943711504417, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5098339915275574, + "eval_non_padding_tokens_in_labels": 133.50975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37875, + "eval_padding_tokens_in_labels": 378.49025, + "eval_reconstruction_accuracy": 0.909018565732613, + "eval_runtime": 177.9926, + "eval_samples_per_second": 28.091, + "eval_sentence_accuracy": 0.7357878586681501, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 159000 + }, + { + "epoch": 0.09, + "learning_rate": 8.493939393939394e-05, + "loss": 0.5176, + "step": 159100 + }, + { + "epoch": 0.09, + "learning_rate": 8.492929292929294e-05, + "loss": 0.5148, + "step": 159200 + }, + { + "epoch": 0.09, + "learning_rate": 8.491919191919191e-05, + "loss": 0.5105, + "step": 159300 + }, + { + "epoch": 0.09, + "learning_rate": 8.490909090909092e-05, + "loss": 0.5117, + "step": 159400 + }, + { + "epoch": 0.09, + "learning_rate": 8.48989898989899e-05, + "loss": 0.5114, + "step": 159500 + }, + { + "epoch": 0.09, + "learning_rate": 8.488888888888889e-05, + "loss": 0.5117, + "step": 159600 + }, + { + "epoch": 0.09, + "learning_rate": 8.487878787878788e-05, + "loss": 0.5108, + "step": 159700 + }, + { + "epoch": 0.09, + "learning_rate": 8.486868686868687e-05, + "loss": 0.5138, + "step": 159800 + }, + { + "epoch": 0.09, + "learning_rate": 8.485858585858585e-05, + "loss": 0.5137, + "step": 159900 + }, + { + "epoch": 0.09, + "learning_rate": 8.484848484848486e-05, + "loss": 0.5133, + "step": 160000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5136506102035546, + "eval_average_loss_on_sentence_tokens": 0.4375489986820065, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.510205090045929, + "eval_non_padding_tokens_in_labels": 133.5723, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38895, + "eval_padding_tokens_in_labels": 378.4277, + "eval_reconstruction_accuracy": 0.9090562667115787, + "eval_runtime": 174.8213, + "eval_samples_per_second": 28.601, + "eval_sentence_accuracy": 0.7286010371992032, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 160000 + }, + { + "epoch": 0.09, + "learning_rate": 8.483838383838384e-05, + "loss": 0.515, + "step": 160100 + }, + { + "epoch": 0.09, + "learning_rate": 8.482828282828283e-05, + "loss": 0.5117, + "step": 160200 + }, + { + "epoch": 0.09, + "learning_rate": 8.481818181818182e-05, + "loss": 0.5113, + "step": 160300 + }, + { + "epoch": 0.09, + "learning_rate": 8.480808080808081e-05, + "loss": 0.5115, + "step": 160400 + }, + { + "epoch": 0.09, + "learning_rate": 8.47979797979798e-05, + "loss": 0.5118, + "step": 160500 + }, + { + "epoch": 0.09, + "learning_rate": 8.47878787878788e-05, + "loss": 0.5131, + "step": 160600 + }, + { + "epoch": 0.09, + "learning_rate": 8.477777777777778e-05, + "loss": 0.5122, + "step": 160700 + }, + { + "epoch": 0.09, + "learning_rate": 8.476767676767677e-05, + "loss": 0.5132, + "step": 160800 + }, + { + "epoch": 0.09, + "learning_rate": 8.475757575757576e-05, + "loss": 0.5112, + "step": 160900 + }, + { + "epoch": 0.09, + "learning_rate": 8.474747474747475e-05, + "loss": 0.5115, + "step": 161000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5121494832929583, + "eval_average_loss_on_sentence_tokens": 0.4137332045097454, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.5076953172683716, + "eval_non_padding_tokens_in_labels": 133.56735, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40355, + "eval_padding_tokens_in_labels": 378.43265, + "eval_reconstruction_accuracy": 0.9094175379158626, + "eval_runtime": 176.276, + "eval_samples_per_second": 28.365, + "eval_sentence_accuracy": 0.7397805372620094, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 161000 + }, + { + "epoch": 0.09, + "learning_rate": 8.473737373737374e-05, + "loss": 0.5152, + "step": 161100 + }, + { + "epoch": 0.09, + "learning_rate": 8.472727272727274e-05, + "loss": 0.5083, + "step": 161200 + }, + { + "epoch": 0.09, + "learning_rate": 8.471717171717171e-05, + "loss": 0.5151, + "step": 161300 + }, + { + "epoch": 0.09, + "learning_rate": 8.470707070707072e-05, + "loss": 0.5133, + "step": 161400 + }, + { + "epoch": 0.09, + "learning_rate": 8.46969696969697e-05, + "loss": 0.513, + "step": 161500 + }, + { + "epoch": 0.09, + "learning_rate": 8.468686868686869e-05, + "loss": 0.5133, + "step": 161600 + }, + { + "epoch": 0.09, + "learning_rate": 8.467676767676768e-05, + "loss": 0.5128, + "step": 161700 + }, + { + "epoch": 0.09, + "learning_rate": 8.466666666666667e-05, + "loss": 0.5128, + "step": 161800 + }, + { + "epoch": 0.09, + "learning_rate": 8.465656565656565e-05, + "loss": 0.5157, + "step": 161900 + }, + { + "epoch": 0.09, + "learning_rate": 8.464646464646466e-05, + "loss": 0.5095, + "step": 162000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5123739898348093, + "eval_average_loss_on_sentence_tokens": 0.4134122276363117, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.5079492330551147, + "eval_non_padding_tokens_in_labels": 133.51585, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3709, + "eval_padding_tokens_in_labels": 378.48415, + "eval_reconstruction_accuracy": 0.909345975492987, + "eval_runtime": 178.8483, + "eval_samples_per_second": 27.957, + "eval_sentence_accuracy": 0.7451235487286235, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 162000 + }, + { + "epoch": 0.09, + "learning_rate": 8.463636363636364e-05, + "loss": 0.5152, + "step": 162100 + }, + { + "epoch": 0.09, + "learning_rate": 8.462626262626263e-05, + "loss": 0.5084, + "step": 162200 + }, + { + "epoch": 0.09, + "learning_rate": 8.461616161616162e-05, + "loss": 0.5101, + "step": 162300 + }, + { + "epoch": 0.09, + "learning_rate": 8.460606060606061e-05, + "loss": 0.5112, + "step": 162400 + }, + { + "epoch": 0.09, + "learning_rate": 8.459595959595959e-05, + "loss": 0.5112, + "step": 162500 + }, + { + "epoch": 0.09, + "learning_rate": 8.45858585858586e-05, + "loss": 0.5103, + "step": 162600 + }, + { + "epoch": 0.09, + "learning_rate": 8.457575757575757e-05, + "loss": 0.5084, + "step": 162700 + }, + { + "epoch": 0.09, + "learning_rate": 8.456565656565657e-05, + "loss": 0.5094, + "step": 162800 + }, + { + "epoch": 0.09, + "learning_rate": 8.455555555555556e-05, + "loss": 0.5146, + "step": 162900 + }, + { + "epoch": 0.09, + "learning_rate": 8.454545454545455e-05, + "loss": 0.5102, + "step": 163000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5114578844925949, + "eval_average_loss_on_sentence_tokens": 0.42641381994130706, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5075488090515137, + "eval_non_padding_tokens_in_labels": 133.51745, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37665, + "eval_padding_tokens_in_labels": 378.48255, + "eval_reconstruction_accuracy": 0.9093623114528884, + "eval_runtime": 182.9923, + "eval_samples_per_second": 27.324, + "eval_sentence_accuracy": 0.7393992140255172, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 163000 + }, + { + "epoch": 0.09, + "learning_rate": 8.453535353535353e-05, + "loss": 0.5104, + "step": 163100 + }, + { + "epoch": 0.09, + "learning_rate": 8.452525252525253e-05, + "loss": 0.5102, + "step": 163200 + }, + { + "epoch": 0.09, + "learning_rate": 8.451515151515151e-05, + "loss": 0.5131, + "step": 163300 + }, + { + "epoch": 0.09, + "learning_rate": 8.45050505050505e-05, + "loss": 0.5108, + "step": 163400 + }, + { + "epoch": 0.09, + "learning_rate": 8.44949494949495e-05, + "loss": 0.5126, + "step": 163500 + }, + { + "epoch": 0.09, + "learning_rate": 8.448484848484849e-05, + "loss": 0.51, + "step": 163600 + }, + { + "epoch": 0.09, + "learning_rate": 8.447474747474748e-05, + "loss": 0.5089, + "step": 163700 + }, + { + "epoch": 0.09, + "learning_rate": 8.446464646464647e-05, + "loss": 0.5071, + "step": 163800 + }, + { + "epoch": 0.09, + "learning_rate": 8.445454545454546e-05, + "loss": 0.5101, + "step": 163900 + }, + { + "epoch": 0.09, + "learning_rate": 8.444444444444444e-05, + "loss": 0.5108, + "step": 164000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.5107861714371392, + "eval_average_loss_on_sentence_tokens": 0.432817328855618, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5072851777076721, + "eval_non_padding_tokens_in_labels": 133.5762, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3908, + "eval_padding_tokens_in_labels": 378.4238, + "eval_reconstruction_accuracy": 0.9093161163668703, + "eval_runtime": 174.6575, + "eval_samples_per_second": 28.627, + "eval_sentence_accuracy": 0.7389505984531735, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.249975, + "step": 164000 + }, + { + "epoch": 0.09, + "learning_rate": 8.443434343434345e-05, + "loss": 0.5132, + "step": 164100 + }, + { + "epoch": 0.09, + "learning_rate": 8.442424242424243e-05, + "loss": 0.5111, + "step": 164200 + }, + { + "epoch": 0.09, + "learning_rate": 8.441414141414142e-05, + "loss": 0.5138, + "step": 164300 + }, + { + "epoch": 0.09, + "learning_rate": 8.440404040404041e-05, + "loss": 0.5103, + "step": 164400 + }, + { + "epoch": 0.09, + "learning_rate": 8.43939393939394e-05, + "loss": 0.51, + "step": 164500 + }, + { + "epoch": 0.09, + "learning_rate": 8.438383838383838e-05, + "loss": 0.5079, + "step": 164600 + }, + { + "epoch": 0.09, + "learning_rate": 8.437373737373739e-05, + "loss": 0.5117, + "step": 164700 + }, + { + "epoch": 0.09, + "learning_rate": 8.436363636363637e-05, + "loss": 0.5124, + "step": 164800 + }, + { + "epoch": 0.09, + "learning_rate": 8.435353535353536e-05, + "loss": 0.5131, + "step": 164900 + }, + { + "epoch": 0.1, + "learning_rate": 8.434343434343435e-05, + "loss": 0.5145, + "step": 165000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5113176658005286, + "eval_average_loss_on_sentence_tokens": 0.41975436138993133, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.5072558522224426, + "eval_non_padding_tokens_in_labels": 133.53375, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3745, + "eval_padding_tokens_in_labels": 378.46625, + "eval_reconstruction_accuracy": 0.9093977433595629, + "eval_runtime": 174.3059, + "eval_samples_per_second": 28.685, + "eval_sentence_accuracy": 0.7344734150411829, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.25, + "step": 165000 + }, + { + "epoch": 0.1, + "learning_rate": 8.433333333333334e-05, + "loss": 0.5104, + "step": 165100 + }, + { + "epoch": 0.1, + "learning_rate": 8.432323232323232e-05, + "loss": 0.5118, + "step": 165200 + }, + { + "epoch": 0.1, + "learning_rate": 8.431313131313133e-05, + "loss": 0.5079, + "step": 165300 + }, + { + "epoch": 0.1, + "learning_rate": 8.43030303030303e-05, + "loss": 0.5098, + "step": 165400 + }, + { + "epoch": 0.1, + "learning_rate": 8.42929292929293e-05, + "loss": 0.5091, + "step": 165500 + }, + { + "epoch": 0.1, + "learning_rate": 8.428282828282829e-05, + "loss": 0.5068, + "step": 165600 + }, + { + "epoch": 0.1, + "learning_rate": 8.427272727272728e-05, + "loss": 0.5105, + "step": 165700 + }, + { + "epoch": 0.1, + "learning_rate": 8.426262626262627e-05, + "loss": 0.5122, + "step": 165800 + }, + { + "epoch": 0.1, + "learning_rate": 8.425252525252526e-05, + "loss": 0.5108, + "step": 165900 + }, + { + "epoch": 0.1, + "learning_rate": 8.424242424242424e-05, + "loss": 0.5073, + "step": 166000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5105598380767212, + "eval_average_loss_on_sentence_tokens": 0.43069428934394655, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.5070019364356995, + "eval_non_padding_tokens_in_labels": 133.53515, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37775, + "eval_padding_tokens_in_labels": 378.46485, + "eval_reconstruction_accuracy": 0.9092666713788313, + "eval_runtime": 182.5788, + "eval_samples_per_second": 27.385, + "eval_sentence_accuracy": 0.7359044987169595, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.25, + "step": 166000 + }, + { + "epoch": 0.1, + "learning_rate": 8.423232323232323e-05, + "loss": 0.5113, + "step": 166100 + }, + { + "epoch": 0.1, + "learning_rate": 8.422222222222223e-05, + "loss": 0.5122, + "step": 166200 + }, + { + "epoch": 0.1, + "learning_rate": 8.421212121212122e-05, + "loss": 0.5093, + "step": 166300 + }, + { + "epoch": 0.1, + "learning_rate": 8.420202020202021e-05, + "loss": 0.513, + "step": 166400 + }, + { + "epoch": 0.1, + "learning_rate": 8.41919191919192e-05, + "loss": 0.5111, + "step": 166500 + }, + { + "epoch": 0.1, + "learning_rate": 8.418181818181818e-05, + "loss": 0.5159, + "step": 166600 + }, + { + "epoch": 0.1, + "learning_rate": 8.417171717171719e-05, + "loss": 0.5095, + "step": 166700 + }, + { + "epoch": 0.1, + "learning_rate": 8.416161616161616e-05, + "loss": 0.5124, + "step": 166800 + }, + { + "epoch": 0.1, + "learning_rate": 8.415151515151516e-05, + "loss": 0.5093, + "step": 166900 + }, + { + "epoch": 0.1, + "learning_rate": 8.414141414141415e-05, + "loss": 0.5159, + "step": 167000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5100826737725296, + "eval_average_loss_on_sentence_tokens": 0.46549671069717313, + "eval_average_shuffling_prob": 0.565, + "eval_loss": 0.5079492330551147, + "eval_non_padding_tokens_in_labels": 133.5381, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37545, + "eval_padding_tokens_in_labels": 378.4619, + "eval_reconstruction_accuracy": 0.9094889655436802, + "eval_runtime": 189.2387, + "eval_samples_per_second": 26.422, + "eval_sentence_accuracy": 0.70074649631238, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.245775, + "step": 167000 + }, + { + "epoch": 0.1, + "learning_rate": 8.413131313131314e-05, + "loss": 0.51, + "step": 167100 + }, + { + "epoch": 0.1, + "learning_rate": 8.412121212121212e-05, + "loss": 0.5073, + "step": 167200 + }, + { + "epoch": 0.1, + "learning_rate": 8.411111111111112e-05, + "loss": 0.5121, + "step": 167300 + }, + { + "epoch": 0.1, + "learning_rate": 8.41010101010101e-05, + "loss": 0.513, + "step": 167400 + }, + { + "epoch": 0.1, + "learning_rate": 8.40909090909091e-05, + "loss": 0.5057, + "step": 167500 + }, + { + "epoch": 0.1, + "learning_rate": 8.408080808080809e-05, + "loss": 0.5119, + "step": 167600 + }, + { + "epoch": 0.1, + "learning_rate": 8.407070707070708e-05, + "loss": 0.5068, + "step": 167700 + }, + { + "epoch": 0.1, + "learning_rate": 8.406060606060606e-05, + "loss": 0.5103, + "step": 167800 + }, + { + "epoch": 0.1, + "learning_rate": 8.405050505050506e-05, + "loss": 0.5063, + "step": 167900 + }, + { + "epoch": 0.1, + "learning_rate": 8.404040404040404e-05, + "loss": 0.5113, + "step": 168000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5088805081799699, + "eval_average_loss_on_sentence_tokens": 0.4401378131046214, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5058789253234863, + "eval_non_padding_tokens_in_labels": 133.53495, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3771, + "eval_padding_tokens_in_labels": 378.46505, + "eval_reconstruction_accuracy": 0.9096096005855856, + "eval_runtime": 180.9351, + "eval_samples_per_second": 27.634, + "eval_sentence_accuracy": 0.7280806431352845, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 168000 + }, + { + "epoch": 0.1, + "learning_rate": 8.403030303030303e-05, + "loss": 0.5074, + "step": 168100 + }, + { + "epoch": 0.1, + "learning_rate": 8.402020202020202e-05, + "loss": 0.507, + "step": 168200 + }, + { + "epoch": 0.1, + "learning_rate": 8.401010101010102e-05, + "loss": 0.5093, + "step": 168300 + }, + { + "epoch": 0.1, + "learning_rate": 8.4e-05, + "loss": 0.5086, + "step": 168400 + }, + { + "epoch": 0.1, + "learning_rate": 8.3989898989899e-05, + "loss": 0.5095, + "step": 168500 + }, + { + "epoch": 0.1, + "learning_rate": 8.397979797979798e-05, + "loss": 0.5094, + "step": 168600 + }, + { + "epoch": 0.1, + "learning_rate": 8.396969696969697e-05, + "loss": 0.5112, + "step": 168700 + }, + { + "epoch": 0.1, + "learning_rate": 8.395959595959596e-05, + "loss": 0.5111, + "step": 168800 + }, + { + "epoch": 0.1, + "learning_rate": 8.394949494949496e-05, + "loss": 0.5041, + "step": 168900 + }, + { + "epoch": 0.1, + "learning_rate": 8.393939393939393e-05, + "loss": 0.51, + "step": 169000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5083702071986677, + "eval_average_loss_on_sentence_tokens": 0.38364726981643316, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.5027831792831421, + "eval_non_padding_tokens_in_labels": 133.53345, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37175, + "eval_padding_tokens_in_labels": 378.46655, + "eval_reconstruction_accuracy": 0.9097355458929617, + "eval_runtime": 179.9634, + "eval_samples_per_second": 27.783, + "eval_sentence_accuracy": 0.7685278231377968, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 169000 + }, + { + "epoch": 0.1, + "learning_rate": 8.392929292929294e-05, + "loss": 0.5089, + "step": 169100 + }, + { + "epoch": 0.1, + "learning_rate": 8.391919191919192e-05, + "loss": 0.5108, + "step": 169200 + }, + { + "epoch": 0.1, + "learning_rate": 8.390909090909091e-05, + "loss": 0.5068, + "step": 169300 + }, + { + "epoch": 0.1, + "learning_rate": 8.38989898989899e-05, + "loss": 0.507, + "step": 169400 + }, + { + "epoch": 0.1, + "learning_rate": 8.38888888888889e-05, + "loss": 0.5098, + "step": 169500 + }, + { + "epoch": 0.1, + "learning_rate": 8.387878787878789e-05, + "loss": 0.5086, + "step": 169600 + }, + { + "epoch": 0.1, + "learning_rate": 8.386868686868688e-05, + "loss": 0.5079, + "step": 169700 + }, + { + "epoch": 0.1, + "learning_rate": 8.385858585858586e-05, + "loss": 0.51, + "step": 169800 + }, + { + "epoch": 0.1, + "learning_rate": 8.384848484848485e-05, + "loss": 0.5076, + "step": 169900 + }, + { + "epoch": 0.1, + "learning_rate": 8.383838383838384e-05, + "loss": 0.5075, + "step": 170000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5073949396251413, + "eval_average_loss_on_sentence_tokens": 0.41900298063480107, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.5034863352775574, + "eval_non_padding_tokens_in_labels": 133.53525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.46475, + "eval_reconstruction_accuracy": 0.9098036220674467, + "eval_runtime": 179.597, + "eval_samples_per_second": 27.84, + "eval_sentence_accuracy": 0.7443474437884687, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 170000 + }, + { + "epoch": 0.1, + "learning_rate": 8.382828282828283e-05, + "loss": 0.5117, + "step": 170100 + }, + { + "epoch": 0.1, + "learning_rate": 8.381818181818182e-05, + "loss": 0.5091, + "step": 170200 + }, + { + "epoch": 0.1, + "learning_rate": 8.380808080808082e-05, + "loss": 0.5051, + "step": 170300 + }, + { + "epoch": 0.1, + "learning_rate": 8.37979797979798e-05, + "loss": 0.5047, + "step": 170400 + }, + { + "epoch": 0.1, + "learning_rate": 8.378787878787879e-05, + "loss": 0.5099, + "step": 170500 + }, + { + "epoch": 0.1, + "learning_rate": 8.377777777777778e-05, + "loss": 0.506, + "step": 170600 + }, + { + "epoch": 0.1, + "learning_rate": 8.376767676767677e-05, + "loss": 0.5091, + "step": 170700 + }, + { + "epoch": 0.1, + "learning_rate": 8.375757575757576e-05, + "loss": 0.5109, + "step": 170800 + }, + { + "epoch": 0.1, + "learning_rate": 8.374747474747475e-05, + "loss": 0.5117, + "step": 170900 + }, + { + "epoch": 0.1, + "learning_rate": 8.373737373737373e-05, + "loss": 0.5073, + "step": 171000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5073984802425082, + "eval_average_loss_on_sentence_tokens": 0.4004996039526821, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.5025488138198853, + "eval_non_padding_tokens_in_labels": 133.54075, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3786, + "eval_padding_tokens_in_labels": 378.45925, + "eval_reconstruction_accuracy": 0.9098203499197685, + "eval_runtime": 181.7162, + "eval_samples_per_second": 27.515, + "eval_sentence_accuracy": 0.758101997236528, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 171000 + }, + { + "epoch": 0.1, + "learning_rate": 8.372727272727274e-05, + "loss": 0.5131, + "step": 171100 + }, + { + "epoch": 0.1, + "learning_rate": 8.371717171717172e-05, + "loss": 0.5053, + "step": 171200 + }, + { + "epoch": 0.1, + "learning_rate": 8.370707070707071e-05, + "loss": 0.5044, + "step": 171300 + }, + { + "epoch": 0.1, + "learning_rate": 8.36969696969697e-05, + "loss": 0.506, + "step": 171400 + }, + { + "epoch": 0.1, + "learning_rate": 8.368686868686869e-05, + "loss": 0.5085, + "step": 171500 + }, + { + "epoch": 0.1, + "learning_rate": 8.367676767676767e-05, + "loss": 0.5089, + "step": 171600 + }, + { + "epoch": 0.1, + "learning_rate": 8.366666666666668e-05, + "loss": 0.5082, + "step": 171700 + }, + { + "epoch": 0.1, + "learning_rate": 8.365656565656565e-05, + "loss": 0.5072, + "step": 171800 + }, + { + "epoch": 0.1, + "learning_rate": 8.364646464646465e-05, + "loss": 0.5076, + "step": 171900 + }, + { + "epoch": 0.1, + "learning_rate": 8.363636363636364e-05, + "loss": 0.5087, + "step": 172000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5077508494239177, + "eval_average_loss_on_sentence_tokens": 0.41331784853246073, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.50341796875, + "eval_non_padding_tokens_in_labels": 133.5114, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3765, + "eval_padding_tokens_in_labels": 378.4886, + "eval_reconstruction_accuracy": 0.9097790712219943, + "eval_runtime": 186.8455, + "eval_samples_per_second": 26.76, + "eval_sentence_accuracy": 0.7438763974375079, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 172000 + }, + { + "epoch": 0.1, + "learning_rate": 8.362626262626263e-05, + "loss": 0.5058, + "step": 172100 + }, + { + "epoch": 0.1, + "learning_rate": 8.361616161616162e-05, + "loss": 0.5084, + "step": 172200 + }, + { + "epoch": 0.1, + "learning_rate": 8.360606060606061e-05, + "loss": 0.504, + "step": 172300 + }, + { + "epoch": 0.1, + "learning_rate": 8.359595959595961e-05, + "loss": 0.5064, + "step": 172400 + }, + { + "epoch": 0.1, + "learning_rate": 8.358585858585859e-05, + "loss": 0.5099, + "step": 172500 + }, + { + "epoch": 0.1, + "learning_rate": 8.357575757575759e-05, + "loss": 0.5073, + "step": 172600 + }, + { + "epoch": 0.1, + "learning_rate": 8.356565656565657e-05, + "loss": 0.5081, + "step": 172700 + }, + { + "epoch": 0.1, + "learning_rate": 8.355555555555556e-05, + "loss": 0.5083, + "step": 172800 + }, + { + "epoch": 0.1, + "learning_rate": 8.354545454545455e-05, + "loss": 0.5074, + "step": 172900 + }, + { + "epoch": 0.1, + "learning_rate": 8.353535353535355e-05, + "loss": 0.5037, + "step": 173000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5080681064945874, + "eval_average_loss_on_sentence_tokens": 0.4310596998047898, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.5045605301856995, + "eval_non_padding_tokens_in_labels": 133.52145, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3559, + "eval_padding_tokens_in_labels": 378.47855, + "eval_reconstruction_accuracy": 0.9098689394648061, + "eval_runtime": 179.4063, + "eval_samples_per_second": 27.87, + "eval_sentence_accuracy": 0.7226972562671595, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 173000 + }, + { + "epoch": 0.1, + "learning_rate": 8.352525252525252e-05, + "loss": 0.5057, + "step": 173100 + }, + { + "epoch": 0.1, + "learning_rate": 8.351515151515153e-05, + "loss": 0.509, + "step": 173200 + }, + { + "epoch": 0.1, + "learning_rate": 8.350505050505051e-05, + "loss": 0.5055, + "step": 173300 + }, + { + "epoch": 0.1, + "learning_rate": 8.34949494949495e-05, + "loss": 0.5044, + "step": 173400 + }, + { + "epoch": 0.1, + "learning_rate": 8.348484848484849e-05, + "loss": 0.5049, + "step": 173500 + }, + { + "epoch": 0.1, + "learning_rate": 8.347474747474748e-05, + "loss": 0.5045, + "step": 173600 + }, + { + "epoch": 0.1, + "learning_rate": 8.346464646464646e-05, + "loss": 0.5089, + "step": 173700 + }, + { + "epoch": 0.1, + "learning_rate": 8.345454545454547e-05, + "loss": 0.5095, + "step": 173800 + }, + { + "epoch": 0.1, + "learning_rate": 8.344444444444445e-05, + "loss": 0.5101, + "step": 173900 + }, + { + "epoch": 0.1, + "learning_rate": 8.343434343434344e-05, + "loss": 0.507, + "step": 174000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5071107783282831, + "eval_average_loss_on_sentence_tokens": 0.4087777097860479, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.5025683641433716, + "eval_non_padding_tokens_in_labels": 133.5434, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3856, + "eval_padding_tokens_in_labels": 378.4566, + "eval_reconstruction_accuracy": 0.9099052087232055, + "eval_runtime": 185.2139, + "eval_samples_per_second": 26.996, + "eval_sentence_accuracy": 0.7401618604985016, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 174000 + }, + { + "epoch": 0.1, + "learning_rate": 8.342424242424243e-05, + "loss": 0.5056, + "step": 174100 + }, + { + "epoch": 0.1, + "learning_rate": 8.341414141414142e-05, + "loss": 0.5046, + "step": 174200 + }, + { + "epoch": 0.1, + "learning_rate": 8.34040404040404e-05, + "loss": 0.5084, + "step": 174300 + }, + { + "epoch": 0.1, + "learning_rate": 8.33939393939394e-05, + "loss": 0.5024, + "step": 174400 + }, + { + "epoch": 0.1, + "learning_rate": 8.338383838383838e-05, + "loss": 0.5068, + "step": 174500 + }, + { + "epoch": 0.1, + "learning_rate": 8.337373737373738e-05, + "loss": 0.5117, + "step": 174600 + }, + { + "epoch": 0.1, + "learning_rate": 8.336363636363637e-05, + "loss": 0.5079, + "step": 174700 + }, + { + "epoch": 0.1, + "learning_rate": 8.335353535353536e-05, + "loss": 0.5047, + "step": 174800 + }, + { + "epoch": 0.1, + "learning_rate": 8.334343434343435e-05, + "loss": 0.5069, + "step": 174900 + }, + { + "epoch": 0.1, + "learning_rate": 8.333333333333334e-05, + "loss": 0.5084, + "step": 175000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.5070875105765994, + "eval_average_loss_on_sentence_tokens": 0.4306217550631295, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.5036523342132568, + "eval_non_padding_tokens_in_labels": 133.5311, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37165, + "eval_padding_tokens_in_labels": 378.4689, + "eval_reconstruction_accuracy": 0.9098939274940426, + "eval_runtime": 185.5367, + "eval_samples_per_second": 26.949, + "eval_sentence_accuracy": 0.7284529940603298, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 175000 + }, + { + "epoch": 0.11, + "learning_rate": 8.332323232323232e-05, + "loss": 0.5042, + "step": 175100 + }, + { + "epoch": 0.11, + "learning_rate": 8.331313131313131e-05, + "loss": 0.5053, + "step": 175200 + }, + { + "epoch": 0.11, + "learning_rate": 8.33030303030303e-05, + "loss": 0.505, + "step": 175300 + }, + { + "epoch": 0.11, + "learning_rate": 8.32929292929293e-05, + "loss": 0.5069, + "step": 175400 + }, + { + "epoch": 0.11, + "learning_rate": 8.328282828282829e-05, + "loss": 0.5062, + "step": 175500 + }, + { + "epoch": 0.11, + "learning_rate": 8.327272727272728e-05, + "loss": 0.5043, + "step": 175600 + }, + { + "epoch": 0.11, + "learning_rate": 8.326262626262626e-05, + "loss": 0.5078, + "step": 175700 + }, + { + "epoch": 0.11, + "learning_rate": 8.325252525252527e-05, + "loss": 0.5081, + "step": 175800 + }, + { + "epoch": 0.11, + "learning_rate": 8.324242424242424e-05, + "loss": 0.5077, + "step": 175900 + }, + { + "epoch": 0.11, + "learning_rate": 8.323232323232324e-05, + "loss": 0.5055, + "step": 176000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.5059516339736924, + "eval_average_loss_on_sentence_tokens": 0.4446675366722503, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.5032617449760437, + "eval_non_padding_tokens_in_labels": 133.5839, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3996, + "eval_padding_tokens_in_labels": 378.4161, + "eval_reconstruction_accuracy": 0.9099369738817633, + "eval_runtime": 181.4925, + "eval_samples_per_second": 27.549, + "eval_sentence_accuracy": 0.7215218834676189, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 176000 + }, + { + "epoch": 0.11, + "learning_rate": 8.322222222222223e-05, + "loss": 0.5072, + "step": 176100 + }, + { + "epoch": 0.11, + "learning_rate": 8.321212121212122e-05, + "loss": 0.5075, + "step": 176200 + }, + { + "epoch": 0.11, + "learning_rate": 8.32020202020202e-05, + "loss": 0.5064, + "step": 176300 + }, + { + "epoch": 0.11, + "learning_rate": 8.31919191919192e-05, + "loss": 0.506, + "step": 176400 + }, + { + "epoch": 0.11, + "learning_rate": 8.318181818181818e-05, + "loss": 0.5082, + "step": 176500 + }, + { + "epoch": 0.11, + "learning_rate": 8.317171717171718e-05, + "loss": 0.5061, + "step": 176600 + }, + { + "epoch": 0.11, + "learning_rate": 8.316161616161617e-05, + "loss": 0.5076, + "step": 176700 + }, + { + "epoch": 0.11, + "learning_rate": 8.315151515151516e-05, + "loss": 0.5009, + "step": 176800 + }, + { + "epoch": 0.11, + "learning_rate": 8.314141414141414e-05, + "loss": 0.5061, + "step": 176900 + }, + { + "epoch": 0.11, + "learning_rate": 8.313131313131314e-05, + "loss": 0.5061, + "step": 177000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.504453393346021, + "eval_average_loss_on_sentence_tokens": 0.4093321151632018, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.5001757740974426, + "eval_non_padding_tokens_in_labels": 133.53645, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39105, + "eval_padding_tokens_in_labels": 378.46355, + "eval_reconstruction_accuracy": 0.910233325606196, + "eval_runtime": 186.5308, + "eval_samples_per_second": 26.805, + "eval_sentence_accuracy": 0.7448543793852173, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 177000 + }, + { + "epoch": 0.11, + "learning_rate": 8.312121212121212e-05, + "loss": 0.5058, + "step": 177100 + }, + { + "epoch": 0.11, + "learning_rate": 8.311111111111111e-05, + "loss": 0.5035, + "step": 177200 + }, + { + "epoch": 0.11, + "learning_rate": 8.31010101010101e-05, + "loss": 0.5058, + "step": 177300 + }, + { + "epoch": 0.11, + "learning_rate": 8.30909090909091e-05, + "loss": 0.505, + "step": 177400 + }, + { + "epoch": 0.11, + "learning_rate": 8.308080808080808e-05, + "loss": 0.5049, + "step": 177500 + }, + { + "epoch": 0.11, + "learning_rate": 8.307070707070708e-05, + "loss": 0.5023, + "step": 177600 + }, + { + "epoch": 0.11, + "learning_rate": 8.306060606060606e-05, + "loss": 0.5078, + "step": 177700 + }, + { + "epoch": 0.11, + "learning_rate": 8.305050505050505e-05, + "loss": 0.5087, + "step": 177800 + }, + { + "epoch": 0.11, + "learning_rate": 8.304040404040404e-05, + "loss": 0.5073, + "step": 177900 + }, + { + "epoch": 0.11, + "learning_rate": 8.303030303030304e-05, + "loss": 0.5064, + "step": 178000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.504032933578713, + "eval_average_loss_on_sentence_tokens": 0.4753683050689728, + "eval_average_shuffling_prob": 0.565, + "eval_loss": 0.5028320550918579, + "eval_non_padding_tokens_in_labels": 133.5414, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38, + "eval_padding_tokens_in_labels": 378.4586, + "eval_reconstruction_accuracy": 0.9100058941826494, + "eval_runtime": 186.1983, + "eval_samples_per_second": 26.853, + "eval_sentence_accuracy": 0.7044430886284925, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.245775, + "step": 178000 + }, + { + "epoch": 0.11, + "learning_rate": 8.302020202020201e-05, + "loss": 0.5106, + "step": 178100 + }, + { + "epoch": 0.11, + "learning_rate": 8.301010101010102e-05, + "loss": 0.5103, + "step": 178200 + }, + { + "epoch": 0.11, + "learning_rate": 8.3e-05, + "loss": 0.5074, + "step": 178300 + }, + { + "epoch": 0.11, + "learning_rate": 8.298989898989899e-05, + "loss": 0.5024, + "step": 178400 + }, + { + "epoch": 0.11, + "learning_rate": 8.297979797979798e-05, + "loss": 0.5039, + "step": 178500 + }, + { + "epoch": 0.11, + "learning_rate": 8.296969696969697e-05, + "loss": 0.5054, + "step": 178600 + }, + { + "epoch": 0.11, + "learning_rate": 8.295959595959597e-05, + "loss": 0.504, + "step": 178700 + }, + { + "epoch": 0.11, + "learning_rate": 8.294949494949496e-05, + "loss": 0.5033, + "step": 178800 + }, + { + "epoch": 0.11, + "learning_rate": 8.293939393939394e-05, + "loss": 0.5071, + "step": 178900 + }, + { + "epoch": 0.11, + "learning_rate": 8.292929292929293e-05, + "loss": 0.5051, + "step": 179000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.5040831089741361, + "eval_average_loss_on_sentence_tokens": 0.42016180453658136, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.5003320574760437, + "eval_non_padding_tokens_in_labels": 133.55525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3955, + "eval_padding_tokens_in_labels": 378.44475, + "eval_reconstruction_accuracy": 0.9101197068795766, + "eval_runtime": 186.747, + "eval_samples_per_second": 26.774, + "eval_sentence_accuracy": 0.736514615895347, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 179000 + }, + { + "epoch": 0.11, + "learning_rate": 8.291919191919192e-05, + "loss": 0.5028, + "step": 179100 + }, + { + "epoch": 0.11, + "learning_rate": 8.290909090909091e-05, + "loss": 0.5018, + "step": 179200 + }, + { + "epoch": 0.11, + "learning_rate": 8.28989898989899e-05, + "loss": 0.5049, + "step": 179300 + }, + { + "epoch": 0.11, + "learning_rate": 8.28888888888889e-05, + "loss": 0.5067, + "step": 179400 + }, + { + "epoch": 0.11, + "learning_rate": 8.287878787878787e-05, + "loss": 0.5055, + "step": 179500 + }, + { + "epoch": 0.11, + "learning_rate": 8.286868686868687e-05, + "loss": 0.5054, + "step": 179600 + }, + { + "epoch": 0.11, + "learning_rate": 8.285858585858586e-05, + "loss": 0.5022, + "step": 179700 + }, + { + "epoch": 0.11, + "learning_rate": 8.284848484848485e-05, + "loss": 0.5099, + "step": 179800 + }, + { + "epoch": 0.11, + "learning_rate": 8.283838383838384e-05, + "loss": 0.5035, + "step": 179900 + }, + { + "epoch": 0.11, + "learning_rate": 8.282828282828283e-05, + "loss": 0.5075, + "step": 180000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.5034969104538917, + "eval_average_loss_on_sentence_tokens": 0.42972655692043993, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.5003125071525574, + "eval_non_padding_tokens_in_labels": 133.49285, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3731, + "eval_padding_tokens_in_labels": 378.50715, + "eval_reconstruction_accuracy": 0.9102164536911856, + "eval_runtime": 185.6358, + "eval_samples_per_second": 26.934, + "eval_sentence_accuracy": 0.7317503185170564, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 180000 + }, + { + "epoch": 0.11, + "learning_rate": 8.281818181818181e-05, + "loss": 0.5025, + "step": 180100 + }, + { + "epoch": 0.11, + "learning_rate": 8.280808080808082e-05, + "loss": 0.5023, + "step": 180200 + }, + { + "epoch": 0.11, + "learning_rate": 8.27979797979798e-05, + "loss": 0.5045, + "step": 180300 + }, + { + "epoch": 0.11, + "learning_rate": 8.278787878787879e-05, + "loss": 0.5015, + "step": 180400 + }, + { + "epoch": 0.11, + "learning_rate": 8.277777777777778e-05, + "loss": 0.5042, + "step": 180500 + }, + { + "epoch": 0.11, + "learning_rate": 8.276767676767677e-05, + "loss": 0.5039, + "step": 180600 + }, + { + "epoch": 0.11, + "learning_rate": 8.275757575757577e-05, + "loss": 0.5036, + "step": 180700 + }, + { + "epoch": 0.11, + "learning_rate": 8.274747474747476e-05, + "loss": 0.5071, + "step": 180800 + }, + { + "epoch": 0.11, + "learning_rate": 8.273737373737375e-05, + "loss": 0.503, + "step": 180900 + }, + { + "epoch": 0.11, + "learning_rate": 8.272727272727273e-05, + "loss": 0.5013, + "step": 181000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.503086528973495, + "eval_average_loss_on_sentence_tokens": 0.39767543354196155, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4983789026737213, + "eval_non_padding_tokens_in_labels": 133.5648, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3899, + "eval_padding_tokens_in_labels": 378.4352, + "eval_reconstruction_accuracy": 0.9102811460043612, + "eval_runtime": 183.7669, + "eval_samples_per_second": 27.208, + "eval_sentence_accuracy": 0.7569580275270515, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.248775, + "step": 181000 + }, + { + "epoch": 0.11, + "learning_rate": 8.271717171717173e-05, + "loss": 0.5017, + "step": 181100 + }, + { + "epoch": 0.11, + "learning_rate": 8.270707070707071e-05, + "loss": 0.5019, + "step": 181200 + }, + { + "epoch": 0.11, + "learning_rate": 8.26969696969697e-05, + "loss": 0.5034, + "step": 181300 + }, + { + "epoch": 0.11, + "learning_rate": 8.26868686868687e-05, + "loss": 0.5013, + "step": 181400 + }, + { + "epoch": 0.11, + "learning_rate": 8.267676767676769e-05, + "loss": 0.4999, + "step": 181500 + }, + { + "epoch": 0.11, + "learning_rate": 8.266666666666667e-05, + "loss": 0.5019, + "step": 181600 + }, + { + "epoch": 0.11, + "learning_rate": 8.265656565656567e-05, + "loss": 0.5028, + "step": 181700 + }, + { + "epoch": 0.11, + "learning_rate": 8.264646464646465e-05, + "loss": 0.5042, + "step": 181800 + }, + { + "epoch": 0.11, + "learning_rate": 8.263636363636364e-05, + "loss": 0.5015, + "step": 181900 + }, + { + "epoch": 0.11, + "learning_rate": 8.262626262626263e-05, + "loss": 0.5012, + "step": 182000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.5026936910916477, + "eval_average_loss_on_sentence_tokens": 0.3986591945198305, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4981054663658142, + "eval_non_padding_tokens_in_labels": 133.5373, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3973, + "eval_padding_tokens_in_labels": 378.4627, + "eval_reconstruction_accuracy": 0.9103327524994923, + "eval_runtime": 187.0366, + "eval_samples_per_second": 26.733, + "eval_sentence_accuracy": 0.75531160837655, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "step": 182000 + }, + { + "epoch": 0.11, + "learning_rate": 8.261616161616163e-05, + "loss": 0.5014, + "step": 182100 + }, + { + "epoch": 0.11, + "learning_rate": 8.26060606060606e-05, + "loss": 0.5042, + "step": 182200 + }, + { + "epoch": 0.11, + "learning_rate": 8.259595959595961e-05, + "loss": 0.5056, + "step": 182300 + }, + { + "epoch": 0.11, + "learning_rate": 8.258585858585859e-05, + "loss": 0.502, + "step": 182400 + }, + { + "epoch": 0.11, + "learning_rate": 8.257575757575758e-05, + "loss": 0.5039, + "step": 182500 + }, + { + "epoch": 0.11, + "learning_rate": 8.256565656565657e-05, + "loss": 0.5005, + "step": 182600 + }, + { + "epoch": 0.11, + "learning_rate": 8.255555555555556e-05, + "loss": 0.5033, + "step": 182700 + }, + { + "epoch": 0.11, + "learning_rate": 8.254545454545454e-05, + "loss": 0.5014, + "step": 182800 + }, + { + "epoch": 0.11, + "learning_rate": 8.253535353535355e-05, + "loss": 0.5004, + "step": 182900 + }, + { + "epoch": 0.11, + "learning_rate": 8.252525252525253e-05, + "loss": 0.5063, + "step": 183000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.502266636979402, + "eval_average_loss_on_sentence_tokens": 0.4846684510437903, + "eval_average_shuffling_prob": 0.57, + "eval_loss": 0.5014746189117432, + "eval_non_padding_tokens_in_labels": 133.5491, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3792, + "eval_padding_tokens_in_labels": 378.4509, + "eval_reconstruction_accuracy": 0.9101633791534266, + "eval_runtime": 189.3485, + "eval_samples_per_second": 26.406, + "eval_sentence_accuracy": 0.7019667306691549, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24509999999999996, + "step": 183000 + }, + { + "epoch": 0.11, + "learning_rate": 8.251515151515152e-05, + "loss": 0.5056, + "step": 183100 + }, + { + "epoch": 0.11, + "learning_rate": 8.250505050505051e-05, + "loss": 0.5026, + "step": 183200 + }, + { + "epoch": 0.11, + "learning_rate": 8.24949494949495e-05, + "loss": 0.5029, + "step": 183300 + }, + { + "epoch": 0.11, + "learning_rate": 8.248484848484848e-05, + "loss": 0.508, + "step": 183400 + }, + { + "epoch": 0.11, + "learning_rate": 8.247474747474749e-05, + "loss": 0.4984, + "step": 183500 + }, + { + "epoch": 0.11, + "learning_rate": 8.246464646464646e-05, + "loss": 0.5035, + "step": 183600 + }, + { + "epoch": 0.11, + "learning_rate": 8.245454545454546e-05, + "loss": 0.5037, + "step": 183700 + }, + { + "epoch": 0.11, + "learning_rate": 8.244444444444445e-05, + "loss": 0.5022, + "step": 183800 + }, + { + "epoch": 0.11, + "learning_rate": 8.243434343434344e-05, + "loss": 0.4988, + "step": 183900 + }, + { + "epoch": 0.11, + "learning_rate": 8.242424242424243e-05, + "loss": 0.4977, + "step": 184000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.5018079258068241, + "eval_average_loss_on_sentence_tokens": 0.3637360517761532, + "eval_average_shuffling_prob": 0.405, + "eval_loss": 0.4956640601158142, + "eval_non_padding_tokens_in_labels": 133.5545, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39955, + "eval_padding_tokens_in_labels": 378.4455, + "eval_reconstruction_accuracy": 0.9104091672863841, + "eval_runtime": 185.9078, + "eval_samples_per_second": 26.895, + "eval_sentence_accuracy": 0.7865756276131857, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24097500000000005, + "step": 184000 + }, + { + "epoch": 0.11, + "learning_rate": 8.241414141414142e-05, + "loss": 0.5017, + "step": 184100 + }, + { + "epoch": 0.11, + "learning_rate": 8.24040404040404e-05, + "loss": 0.5039, + "step": 184200 + }, + { + "epoch": 0.11, + "learning_rate": 8.23939393939394e-05, + "loss": 0.4999, + "step": 184300 + }, + { + "epoch": 0.11, + "learning_rate": 8.238383838383839e-05, + "loss": 0.5001, + "step": 184400 + }, + { + "epoch": 0.11, + "learning_rate": 8.237373737373738e-05, + "loss": 0.5049, + "step": 184500 + }, + { + "epoch": 0.11, + "learning_rate": 8.236363636363637e-05, + "loss": 0.5014, + "step": 184600 + }, + { + "epoch": 0.11, + "learning_rate": 8.235353535353536e-05, + "loss": 0.501, + "step": 184700 + }, + { + "epoch": 0.11, + "learning_rate": 8.234343434343434e-05, + "loss": 0.5008, + "step": 184800 + }, + { + "epoch": 0.11, + "learning_rate": 8.233333333333333e-05, + "loss": 0.5017, + "step": 184900 + }, + { + "epoch": 0.12, + "learning_rate": 8.232323232323233e-05, + "loss": 0.4976, + "step": 185000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.501423222536653, + "eval_average_loss_on_sentence_tokens": 0.39447516696748947, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4965234398841858, + "eval_non_padding_tokens_in_labels": 133.5236, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37595, + "eval_padding_tokens_in_labels": 378.4764, + "eval_reconstruction_accuracy": 0.910524381635037, + "eval_runtime": 187.5848, + "eval_samples_per_second": 26.655, + "eval_sentence_accuracy": 0.7602060042708203, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 185000 + }, + { + "epoch": 0.12, + "learning_rate": 8.231313131313132e-05, + "loss": 0.5013, + "step": 185100 + }, + { + "epoch": 0.12, + "learning_rate": 8.230303030303031e-05, + "loss": 0.4993, + "step": 185200 + }, + { + "epoch": 0.12, + "learning_rate": 8.22929292929293e-05, + "loss": 0.5008, + "step": 185300 + }, + { + "epoch": 0.12, + "learning_rate": 8.228282828282828e-05, + "loss": 0.5006, + "step": 185400 + }, + { + "epoch": 0.12, + "learning_rate": 8.227272727272729e-05, + "loss": 0.5056, + "step": 185500 + }, + { + "epoch": 0.12, + "learning_rate": 8.226262626262626e-05, + "loss": 0.5003, + "step": 185600 + }, + { + "epoch": 0.12, + "learning_rate": 8.225252525252526e-05, + "loss": 0.5038, + "step": 185700 + }, + { + "epoch": 0.12, + "learning_rate": 8.224242424242425e-05, + "loss": 0.5037, + "step": 185800 + }, + { + "epoch": 0.12, + "learning_rate": 8.223232323232324e-05, + "loss": 0.503, + "step": 185900 + }, + { + "epoch": 0.12, + "learning_rate": 8.222222222222222e-05, + "loss": 0.4982, + "step": 186000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.5012581967653236, + "eval_average_loss_on_sentence_tokens": 0.437005481900854, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4983203113079071, + "eval_non_padding_tokens_in_labels": 133.5182, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38565, + "eval_padding_tokens_in_labels": 378.4818, + "eval_reconstruction_accuracy": 0.9103556950167836, + "eval_runtime": 188.4309, + "eval_samples_per_second": 26.535, + "eval_sentence_accuracy": 0.7394485617384751, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 186000 + }, + { + "epoch": 0.12, + "learning_rate": 8.221212121212122e-05, + "loss": 0.4995, + "step": 186100 + }, + { + "epoch": 0.12, + "learning_rate": 8.22020202020202e-05, + "loss": 0.4997, + "step": 186200 + }, + { + "epoch": 0.12, + "learning_rate": 8.21919191919192e-05, + "loss": 0.4985, + "step": 186300 + }, + { + "epoch": 0.12, + "learning_rate": 8.218181818181819e-05, + "loss": 0.4958, + "step": 186400 + }, + { + "epoch": 0.12, + "learning_rate": 8.217171717171718e-05, + "loss": 0.5041, + "step": 186500 + }, + { + "epoch": 0.12, + "learning_rate": 8.216161616161616e-05, + "loss": 0.4997, + "step": 186600 + }, + { + "epoch": 0.12, + "learning_rate": 8.215151515151516e-05, + "loss": 0.502, + "step": 186700 + }, + { + "epoch": 0.12, + "learning_rate": 8.214141414141414e-05, + "loss": 0.4951, + "step": 186800 + }, + { + "epoch": 0.12, + "learning_rate": 8.213131313131313e-05, + "loss": 0.5019, + "step": 186900 + }, + { + "epoch": 0.12, + "learning_rate": 8.212121212121212e-05, + "loss": 0.5021, + "step": 187000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.50068862162456, + "eval_average_loss_on_sentence_tokens": 0.41639979461808485, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4968554675579071, + "eval_non_padding_tokens_in_labels": 133.52375, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3769, + "eval_padding_tokens_in_labels": 378.47625, + "eval_reconstruction_accuracy": 0.9105296665226067, + "eval_runtime": 186.7718, + "eval_samples_per_second": 26.771, + "eval_sentence_accuracy": 0.7401528881870547, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.25, + "step": 187000 + }, + { + "epoch": 0.12, + "learning_rate": 8.211111111111112e-05, + "loss": 0.5059, + "step": 187100 + }, + { + "epoch": 0.12, + "learning_rate": 8.21010101010101e-05, + "loss": 0.4999, + "step": 187200 + }, + { + "epoch": 0.12, + "learning_rate": 8.20909090909091e-05, + "loss": 0.4993, + "step": 187300 + }, + { + "epoch": 0.12, + "learning_rate": 8.208080808080808e-05, + "loss": 0.5016, + "step": 187400 + }, + { + "epoch": 0.12, + "learning_rate": 8.207070707070707e-05, + "loss": 0.4989, + "step": 187500 + }, + { + "epoch": 0.12, + "learning_rate": 8.206060606060606e-05, + "loss": 0.5017, + "step": 187600 + }, + { + "epoch": 0.12, + "learning_rate": 8.205050505050505e-05, + "loss": 0.5004, + "step": 187700 + }, + { + "epoch": 0.12, + "learning_rate": 8.204040404040403e-05, + "loss": 0.503, + "step": 187800 + }, + { + "epoch": 0.12, + "learning_rate": 8.203030303030304e-05, + "loss": 0.5027, + "step": 187900 + }, + { + "epoch": 0.12, + "learning_rate": 8.202020202020202e-05, + "loss": 0.4989, + "step": 188000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4989475296940222, + "eval_average_loss_on_sentence_tokens": 0.4620889521193942, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.497314453125, + "eval_non_padding_tokens_in_labels": 133.5408, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39085, + "eval_padding_tokens_in_labels": 378.4592, + "eval_reconstruction_accuracy": 0.9106133589227755, + "eval_runtime": 187.213, + "eval_samples_per_second": 26.708, + "eval_sentence_accuracy": 0.7147971360381862, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 188000 + }, + { + "epoch": 0.12, + "learning_rate": 8.201010101010101e-05, + "loss": 0.5043, + "step": 188100 + }, + { + "epoch": 0.12, + "learning_rate": 8.2e-05, + "loss": 0.5035, + "step": 188200 + }, + { + "epoch": 0.12, + "learning_rate": 8.198989898989899e-05, + "loss": 0.4988, + "step": 188300 + }, + { + "epoch": 0.12, + "learning_rate": 8.197979797979798e-05, + "loss": 0.5, + "step": 188400 + }, + { + "epoch": 0.12, + "learning_rate": 8.196969696969698e-05, + "loss": 0.498, + "step": 188500 + }, + { + "epoch": 0.12, + "learning_rate": 8.195959595959596e-05, + "loss": 0.5017, + "step": 188600 + }, + { + "epoch": 0.12, + "learning_rate": 8.194949494949495e-05, + "loss": 0.4979, + "step": 188700 + }, + { + "epoch": 0.12, + "learning_rate": 8.193939393939394e-05, + "loss": 0.5021, + "step": 188800 + }, + { + "epoch": 0.12, + "learning_rate": 8.192929292929293e-05, + "loss": 0.5027, + "step": 188900 + }, + { + "epoch": 0.12, + "learning_rate": 8.191919191919192e-05, + "loss": 0.4993, + "step": 189000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.5005919720288102, + "eval_average_loss_on_sentence_tokens": 0.37410578250726084, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.49489256739616394, + "eval_non_padding_tokens_in_labels": 133.5643, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.4154, + "eval_padding_tokens_in_labels": 378.4357, + "eval_reconstruction_accuracy": 0.9106647164465315, + "eval_runtime": 189.6427, + "eval_samples_per_second": 26.365, + "eval_sentence_accuracy": 0.7709727780070702, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 189000 + }, + { + "epoch": 0.12, + "learning_rate": 8.190909090909092e-05, + "loss": 0.4977, + "step": 189100 + }, + { + "epoch": 0.12, + "learning_rate": 8.18989898989899e-05, + "loss": 0.4982, + "step": 189200 + }, + { + "epoch": 0.12, + "learning_rate": 8.18888888888889e-05, + "loss": 0.4999, + "step": 189300 + }, + { + "epoch": 0.12, + "learning_rate": 8.187878787878789e-05, + "loss": 0.4974, + "step": 189400 + }, + { + "epoch": 0.12, + "learning_rate": 8.186868686868687e-05, + "loss": 0.4986, + "step": 189500 + }, + { + "epoch": 0.12, + "learning_rate": 8.185858585858586e-05, + "loss": 0.4975, + "step": 189600 + }, + { + "epoch": 0.12, + "learning_rate": 8.184848484848485e-05, + "loss": 0.5008, + "step": 189700 + }, + { + "epoch": 0.12, + "learning_rate": 8.183838383838385e-05, + "loss": 0.4988, + "step": 189800 + }, + { + "epoch": 0.12, + "learning_rate": 8.182828282828284e-05, + "loss": 0.5012, + "step": 189900 + }, + { + "epoch": 0.12, + "learning_rate": 8.181818181818183e-05, + "loss": 0.4981, + "step": 190000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4995229848872939, + "eval_average_loss_on_sentence_tokens": 0.4341172310881995, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.49666014313697815, + "eval_non_padding_tokens_in_labels": 133.5466, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3818, + "eval_padding_tokens_in_labels": 378.4534, + "eval_reconstruction_accuracy": 0.9106209016888521, + "eval_runtime": 190.4707, + "eval_samples_per_second": 26.251, + "eval_sentence_accuracy": 0.7324052972526782, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.2499, + "step": 190000 + }, + { + "epoch": 0.12, + "learning_rate": 8.180808080808081e-05, + "loss": 0.4975, + "step": 190100 + }, + { + "epoch": 0.12, + "learning_rate": 8.179797979797981e-05, + "loss": 0.5014, + "step": 190200 + }, + { + "epoch": 0.12, + "learning_rate": 8.178787878787879e-05, + "loss": 0.5023, + "step": 190300 + }, + { + "epoch": 0.12, + "learning_rate": 8.177777777777778e-05, + "loss": 0.496, + "step": 190400 + }, + { + "epoch": 0.12, + "learning_rate": 8.176767676767678e-05, + "loss": 0.4946, + "step": 190500 + }, + { + "epoch": 0.12, + "learning_rate": 8.175757575757577e-05, + "loss": 0.5, + "step": 190600 + }, + { + "epoch": 0.12, + "learning_rate": 8.174747474747475e-05, + "loss": 0.5003, + "step": 190700 + }, + { + "epoch": 0.12, + "learning_rate": 8.173737373737375e-05, + "loss": 0.4993, + "step": 190800 + }, + { + "epoch": 0.12, + "learning_rate": 8.172727272727273e-05, + "loss": 0.5013, + "step": 190900 + }, + { + "epoch": 0.12, + "learning_rate": 8.171717171717172e-05, + "loss": 0.4998, + "step": 191000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.5004610593072819, + "eval_average_loss_on_sentence_tokens": 0.4277986329333905, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4971874952316284, + "eval_non_padding_tokens_in_labels": 133.52905, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37965, + "eval_padding_tokens_in_labels": 378.47095, + "eval_reconstruction_accuracy": 0.910654788301309, + "eval_runtime": 184.9418, + "eval_samples_per_second": 27.036, + "eval_sentence_accuracy": 0.7270622857860642, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 191000 + }, + { + "epoch": 0.12, + "learning_rate": 8.170707070707071e-05, + "loss": 0.5004, + "step": 191100 + }, + { + "epoch": 0.12, + "learning_rate": 8.16969696969697e-05, + "loss": 0.4958, + "step": 191200 + }, + { + "epoch": 0.12, + "learning_rate": 8.168686868686868e-05, + "loss": 0.494, + "step": 191300 + }, + { + "epoch": 0.12, + "learning_rate": 8.167676767676769e-05, + "loss": 0.4982, + "step": 191400 + }, + { + "epoch": 0.12, + "learning_rate": 8.166666666666667e-05, + "loss": 0.4982, + "step": 191500 + }, + { + "epoch": 0.12, + "learning_rate": 8.165656565656566e-05, + "loss": 0.5014, + "step": 191600 + }, + { + "epoch": 0.12, + "learning_rate": 8.164646464646465e-05, + "loss": 0.4991, + "step": 191700 + }, + { + "epoch": 0.12, + "learning_rate": 8.163636363636364e-05, + "loss": 0.4975, + "step": 191800 + }, + { + "epoch": 0.12, + "learning_rate": 8.162626262626262e-05, + "loss": 0.4998, + "step": 191900 + }, + { + "epoch": 0.12, + "learning_rate": 8.161616161616163e-05, + "loss": 0.4975, + "step": 192000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4981846026898154, + "eval_average_loss_on_sentence_tokens": 0.44512861122653474, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4958789050579071, + "eval_non_padding_tokens_in_labels": 133.5252, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38585, + "eval_padding_tokens_in_labels": 378.4748, + "eval_reconstruction_accuracy": 0.9108166297813477, + "eval_runtime": 184.1473, + "eval_samples_per_second": 27.152, + "eval_sentence_accuracy": 0.7213828126401923, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.248775, + "step": 192000 + }, + { + "epoch": 0.12, + "learning_rate": 8.160606060606061e-05, + "loss": 0.4966, + "step": 192100 + }, + { + "epoch": 0.12, + "learning_rate": 8.15959595959596e-05, + "loss": 0.4966, + "step": 192200 + }, + { + "epoch": 0.12, + "learning_rate": 8.158585858585859e-05, + "loss": 0.4985, + "step": 192300 + }, + { + "epoch": 0.12, + "learning_rate": 8.157575757575758e-05, + "loss": 0.5011, + "step": 192400 + }, + { + "epoch": 0.12, + "learning_rate": 8.156565656565656e-05, + "loss": 0.4984, + "step": 192500 + }, + { + "epoch": 0.12, + "learning_rate": 8.155555555555557e-05, + "loss": 0.5003, + "step": 192600 + }, + { + "epoch": 0.12, + "learning_rate": 8.154545454545455e-05, + "loss": 0.5027, + "step": 192700 + }, + { + "epoch": 0.12, + "learning_rate": 8.153535353535354e-05, + "loss": 0.4974, + "step": 192800 + }, + { + "epoch": 0.12, + "learning_rate": 8.152525252525253e-05, + "loss": 0.5019, + "step": 192900 + }, + { + "epoch": 0.12, + "learning_rate": 8.151515151515152e-05, + "loss": 0.4995, + "step": 193000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4983076393916748, + "eval_average_loss_on_sentence_tokens": 0.41685961799829463, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4946874976158142, + "eval_non_padding_tokens_in_labels": 133.51165, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36365, + "eval_padding_tokens_in_labels": 378.48835, + "eval_reconstruction_accuracy": 0.910834637415297, + "eval_runtime": 187.8294, + "eval_samples_per_second": 26.62, + "eval_sentence_accuracy": 0.7374522224415454, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 193000 + }, + { + "epoch": 0.12, + "learning_rate": 8.150505050505051e-05, + "loss": 0.4976, + "step": 193100 + }, + { + "epoch": 0.12, + "learning_rate": 8.14949494949495e-05, + "loss": 0.4975, + "step": 193200 + }, + { + "epoch": 0.12, + "learning_rate": 8.148484848484848e-05, + "loss": 0.5005, + "step": 193300 + }, + { + "epoch": 0.12, + "learning_rate": 8.147474747474748e-05, + "loss": 0.4974, + "step": 193400 + }, + { + "epoch": 0.12, + "learning_rate": 8.146464646464647e-05, + "loss": 0.4965, + "step": 193500 + }, + { + "epoch": 0.12, + "learning_rate": 8.145454545454546e-05, + "loss": 0.4982, + "step": 193600 + }, + { + "epoch": 0.12, + "learning_rate": 8.144444444444445e-05, + "loss": 0.4949, + "step": 193700 + }, + { + "epoch": 0.12, + "learning_rate": 8.143434343434344e-05, + "loss": 0.4991, + "step": 193800 + }, + { + "epoch": 0.12, + "learning_rate": 8.142424242424242e-05, + "loss": 0.5012, + "step": 193900 + }, + { + "epoch": 0.12, + "learning_rate": 8.141414141414141e-05, + "loss": 0.4974, + "step": 194000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.49795689646267405, + "eval_average_loss_on_sentence_tokens": 0.38546620639692164, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.4929882884025574, + "eval_non_padding_tokens_in_labels": 133.53065, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3816, + "eval_padding_tokens_in_labels": 378.46935, + "eval_reconstruction_accuracy": 0.910939697115363, + "eval_runtime": 194.3431, + "eval_samples_per_second": 25.728, + "eval_sentence_accuracy": 0.7636154826206327, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 194000 + }, + { + "epoch": 0.12, + "learning_rate": 8.14040404040404e-05, + "loss": 0.4942, + "step": 194100 + }, + { + "epoch": 0.12, + "learning_rate": 8.13939393939394e-05, + "loss": 0.4995, + "step": 194200 + }, + { + "epoch": 0.12, + "learning_rate": 8.138383838383839e-05, + "loss": 0.5005, + "step": 194300 + }, + { + "epoch": 0.12, + "learning_rate": 8.137373737373738e-05, + "loss": 0.4971, + "step": 194400 + }, + { + "epoch": 0.12, + "learning_rate": 8.136363636363636e-05, + "loss": 0.4968, + "step": 194500 + }, + { + "epoch": 0.12, + "learning_rate": 8.135353535353537e-05, + "loss": 0.4981, + "step": 194600 + }, + { + "epoch": 0.12, + "learning_rate": 8.134343434343434e-05, + "loss": 0.4966, + "step": 194700 + }, + { + "epoch": 0.12, + "learning_rate": 8.133333333333334e-05, + "loss": 0.5003, + "step": 194800 + }, + { + "epoch": 0.12, + "learning_rate": 8.132323232323233e-05, + "loss": 0.4983, + "step": 194900 + }, + { + "epoch": 0.12, + "learning_rate": 8.131313131313132e-05, + "loss": 0.4952, + "step": 195000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.49730708506634974, + "eval_average_loss_on_sentence_tokens": 0.41823699157696087, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.49378907680511475, + "eval_non_padding_tokens_in_labels": 133.53725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3981, + "eval_padding_tokens_in_labels": 378.46275, + "eval_reconstruction_accuracy": 0.9110027317469973, + "eval_runtime": 187.9989, + "eval_samples_per_second": 26.596, + "eval_sentence_accuracy": 0.7374656809087157, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 195000 + }, + { + "epoch": 0.13, + "learning_rate": 8.13030303030303e-05, + "loss": 0.4965, + "step": 195100 + }, + { + "epoch": 0.13, + "learning_rate": 8.12929292929293e-05, + "loss": 0.4959, + "step": 195200 + }, + { + "epoch": 0.13, + "learning_rate": 8.128282828282828e-05, + "loss": 0.4984, + "step": 195300 + }, + { + "epoch": 0.13, + "learning_rate": 8.127272727272727e-05, + "loss": 0.4993, + "step": 195400 + }, + { + "epoch": 0.13, + "learning_rate": 8.126262626262627e-05, + "loss": 0.4951, + "step": 195500 + }, + { + "epoch": 0.13, + "learning_rate": 8.125252525252526e-05, + "loss": 0.4938, + "step": 195600 + }, + { + "epoch": 0.13, + "learning_rate": 8.124242424242424e-05, + "loss": 0.4986, + "step": 195700 + }, + { + "epoch": 0.13, + "learning_rate": 8.123232323232324e-05, + "loss": 0.498, + "step": 195800 + }, + { + "epoch": 0.13, + "learning_rate": 8.122222222222222e-05, + "loss": 0.4954, + "step": 195900 + }, + { + "epoch": 0.13, + "learning_rate": 8.121212121212121e-05, + "loss": 0.5045, + "step": 196000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4973203138527618, + "eval_average_loss_on_sentence_tokens": 0.4151894020551177, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4936816394329071, + "eval_non_padding_tokens_in_labels": 133.54255, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3832, + "eval_padding_tokens_in_labels": 378.45745, + "eval_reconstruction_accuracy": 0.9109670747854888, + "eval_runtime": 186.6047, + "eval_samples_per_second": 26.795, + "eval_sentence_accuracy": 0.7427369138837547, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.25, + "step": 196000 + }, + { + "epoch": 0.13, + "learning_rate": 8.12020202020202e-05, + "loss": 0.4997, + "step": 196100 + }, + { + "epoch": 0.13, + "learning_rate": 8.11919191919192e-05, + "loss": 0.4965, + "step": 196200 + }, + { + "epoch": 0.13, + "learning_rate": 8.118181818181818e-05, + "loss": 0.5, + "step": 196300 + }, + { + "epoch": 0.13, + "learning_rate": 8.117171717171718e-05, + "loss": 0.4938, + "step": 196400 + }, + { + "epoch": 0.13, + "learning_rate": 8.116161616161616e-05, + "loss": 0.4921, + "step": 196500 + }, + { + "epoch": 0.13, + "learning_rate": 8.115151515151515e-05, + "loss": 0.4982, + "step": 196600 + }, + { + "epoch": 0.13, + "learning_rate": 8.114141414141414e-05, + "loss": 0.501, + "step": 196700 + }, + { + "epoch": 0.13, + "learning_rate": 8.113131313131314e-05, + "loss": 0.4925, + "step": 196800 + }, + { + "epoch": 0.13, + "learning_rate": 8.112121212121211e-05, + "loss": 0.4956, + "step": 196900 + }, + { + "epoch": 0.13, + "learning_rate": 8.111111111111112e-05, + "loss": 0.4958, + "step": 197000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.49675085190610346, + "eval_average_loss_on_sentence_tokens": 0.4353668496491843, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.49403321743011475, + "eval_non_padding_tokens_in_labels": 133.54055, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38515, + "eval_padding_tokens_in_labels": 378.45945, + "eval_reconstruction_accuracy": 0.9109477186171336, + "eval_runtime": 189.92, + "eval_samples_per_second": 26.327, + "eval_sentence_accuracy": 0.7328808297593626, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 197000 + }, + { + "epoch": 0.13, + "learning_rate": 8.11010101010101e-05, + "loss": 0.4954, + "step": 197100 + }, + { + "epoch": 0.13, + "learning_rate": 8.109090909090909e-05, + "loss": 0.4991, + "step": 197200 + }, + { + "epoch": 0.13, + "learning_rate": 8.108080808080808e-05, + "loss": 0.4972, + "step": 197300 + }, + { + "epoch": 0.13, + "learning_rate": 8.107070707070707e-05, + "loss": 0.4934, + "step": 197400 + }, + { + "epoch": 0.13, + "learning_rate": 8.106060606060607e-05, + "loss": 0.5012, + "step": 197500 + }, + { + "epoch": 0.13, + "learning_rate": 8.105050505050506e-05, + "loss": 0.4986, + "step": 197600 + }, + { + "epoch": 0.13, + "learning_rate": 8.104040404040404e-05, + "loss": 0.4993, + "step": 197700 + }, + { + "epoch": 0.13, + "learning_rate": 8.103030303030303e-05, + "loss": 0.4991, + "step": 197800 + }, + { + "epoch": 0.13, + "learning_rate": 8.102020202020203e-05, + "loss": 0.496, + "step": 197900 + }, + { + "epoch": 0.13, + "learning_rate": 8.101010101010101e-05, + "loss": 0.4966, + "step": 198000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4970454461076708, + "eval_average_loss_on_sentence_tokens": 0.3802676457475772, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.49170899391174316, + "eval_non_padding_tokens_in_labels": 133.5429, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36955, + "eval_padding_tokens_in_labels": 378.4571, + "eval_reconstruction_accuracy": 0.9109116490244651, + "eval_runtime": 186.3391, + "eval_samples_per_second": 26.833, + "eval_sentence_accuracy": 0.7740278500547311, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.245775, + "step": 198000 + }, + { + "epoch": 0.13, + "learning_rate": 8.1e-05, + "loss": 0.4955, + "step": 198100 + }, + { + "epoch": 0.13, + "learning_rate": 8.0989898989899e-05, + "loss": 0.4953, + "step": 198200 + }, + { + "epoch": 0.13, + "learning_rate": 8.097979797979799e-05, + "loss": 0.4963, + "step": 198300 + }, + { + "epoch": 0.13, + "learning_rate": 8.096969696969698e-05, + "loss": 0.4932, + "step": 198400 + }, + { + "epoch": 0.13, + "learning_rate": 8.095959595959597e-05, + "loss": 0.5001, + "step": 198500 + }, + { + "epoch": 0.13, + "learning_rate": 8.094949494949495e-05, + "loss": 0.4979, + "step": 198600 + }, + { + "epoch": 0.13, + "learning_rate": 8.093939393939394e-05, + "loss": 0.4937, + "step": 198700 + }, + { + "epoch": 0.13, + "learning_rate": 8.092929292929293e-05, + "loss": 0.4961, + "step": 198800 + }, + { + "epoch": 0.13, + "learning_rate": 8.091919191919193e-05, + "loss": 0.4978, + "step": 198900 + }, + { + "epoch": 0.13, + "learning_rate": 8.090909090909092e-05, + "loss": 0.497, + "step": 199000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.49575370333872976, + "eval_average_loss_on_sentence_tokens": 0.3934390595715764, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.4911132752895355, + "eval_non_padding_tokens_in_labels": 133.5433, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3904, + "eval_padding_tokens_in_labels": 378.4567, + "eval_reconstruction_accuracy": 0.9111142192789803, + "eval_runtime": 190.5598, + "eval_samples_per_second": 26.238, + "eval_sentence_accuracy": 0.7632745347856514, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 199000 + }, + { + "epoch": 0.13, + "learning_rate": 8.089898989898991e-05, + "loss": 0.5004, + "step": 199100 + }, + { + "epoch": 0.13, + "learning_rate": 8.088888888888889e-05, + "loss": 0.5001, + "step": 199200 + }, + { + "epoch": 0.13, + "learning_rate": 8.08787878787879e-05, + "loss": 0.4976, + "step": 199300 + }, + { + "epoch": 0.13, + "learning_rate": 8.086868686868687e-05, + "loss": 0.4991, + "step": 199400 + }, + { + "epoch": 0.13, + "learning_rate": 8.085858585858586e-05, + "loss": 0.4982, + "step": 199500 + }, + { + "epoch": 0.13, + "learning_rate": 8.084848484848486e-05, + "loss": 0.5001, + "step": 199600 + }, + { + "epoch": 0.13, + "learning_rate": 8.083838383838385e-05, + "loss": 0.4987, + "step": 199700 + }, + { + "epoch": 0.13, + "learning_rate": 8.082828282828283e-05, + "loss": 0.4998, + "step": 199800 + }, + { + "epoch": 0.13, + "learning_rate": 8.081818181818183e-05, + "loss": 0.4998, + "step": 199900 + }, + { + "epoch": 0.13, + "learning_rate": 8.080808080808081e-05, + "loss": 0.4974, + "step": 200000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.49569443349238285, + "eval_average_loss_on_sentence_tokens": 0.4037770713491146, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.4915820360183716, + "eval_non_padding_tokens_in_labels": 133.50975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3928, + "eval_padding_tokens_in_labels": 378.49025, + "eval_reconstruction_accuracy": 0.911194567986377, + "eval_runtime": 181.2212, + "eval_samples_per_second": 27.591, + "eval_sentence_accuracy": 0.7456977766612235, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 200000 + }, + { + "epoch": 0.13, + "learning_rate": 8.07979797979798e-05, + "loss": 0.4982, + "step": 200100 + }, + { + "epoch": 0.13, + "learning_rate": 8.07878787878788e-05, + "loss": 0.4946, + "step": 200200 + }, + { + "epoch": 0.13, + "learning_rate": 8.077777777777779e-05, + "loss": 0.4884, + "step": 200300 + }, + { + "epoch": 0.13, + "learning_rate": 8.076767676767677e-05, + "loss": 0.4977, + "step": 200400 + }, + { + "epoch": 0.13, + "learning_rate": 8.075757575757577e-05, + "loss": 0.4958, + "step": 200500 + }, + { + "epoch": 0.13, + "learning_rate": 8.074747474747475e-05, + "loss": 0.498, + "step": 200600 + }, + { + "epoch": 0.13, + "learning_rate": 8.073737373737374e-05, + "loss": 0.4968, + "step": 200700 + }, + { + "epoch": 0.13, + "learning_rate": 8.072727272727273e-05, + "loss": 0.4994, + "step": 200800 + }, + { + "epoch": 0.13, + "learning_rate": 8.071717171717172e-05, + "loss": 0.4957, + "step": 200900 + }, + { + "epoch": 0.13, + "learning_rate": 8.07070707070707e-05, + "loss": 0.4976, + "step": 201000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4953270076965789, + "eval_average_loss_on_sentence_tokens": 0.43413643830102194, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.49256837368011475, + "eval_non_padding_tokens_in_labels": 133.48745, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36665, + "eval_padding_tokens_in_labels": 378.51255, + "eval_reconstruction_accuracy": 0.9111443691728716, + "eval_runtime": 185.1428, + "eval_samples_per_second": 27.006, + "eval_sentence_accuracy": 0.7284081325030954, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 201000 + }, + { + "epoch": 0.13, + "learning_rate": 8.069696969696971e-05, + "loss": 0.5001, + "step": 201100 + }, + { + "epoch": 0.13, + "learning_rate": 8.068686868686869e-05, + "loss": 0.4975, + "step": 201200 + }, + { + "epoch": 0.13, + "learning_rate": 8.067676767676768e-05, + "loss": 0.496, + "step": 201300 + }, + { + "epoch": 0.13, + "learning_rate": 8.066666666666667e-05, + "loss": 0.4968, + "step": 201400 + }, + { + "epoch": 0.13, + "learning_rate": 8.065656565656566e-05, + "loss": 0.4978, + "step": 201500 + }, + { + "epoch": 0.13, + "learning_rate": 8.064646464646464e-05, + "loss": 0.4947, + "step": 201600 + }, + { + "epoch": 0.13, + "learning_rate": 8.063636363636365e-05, + "loss": 0.4946, + "step": 201700 + }, + { + "epoch": 0.13, + "learning_rate": 8.062626262626263e-05, + "loss": 0.4957, + "step": 201800 + }, + { + "epoch": 0.13, + "learning_rate": 8.061616161616162e-05, + "loss": 0.4961, + "step": 201900 + }, + { + "epoch": 0.13, + "learning_rate": 8.060606060606061e-05, + "loss": 0.4927, + "step": 202000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4954187000413706, + "eval_average_loss_on_sentence_tokens": 0.37076707521219915, + "eval_average_shuffling_prob": 0.42, + "eval_loss": 0.4898144602775574, + "eval_non_padding_tokens_in_labels": 133.54685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3835, + "eval_padding_tokens_in_labels": 378.45315, + "eval_reconstruction_accuracy": 0.91119052798533, + "eval_runtime": 193.7643, + "eval_samples_per_second": 25.805, + "eval_sentence_accuracy": 0.782250973495792, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24360000000000004, + "step": 202000 + }, + { + "epoch": 0.13, + "learning_rate": 8.05959595959596e-05, + "loss": 0.4975, + "step": 202100 + }, + { + "epoch": 0.13, + "learning_rate": 8.058585858585858e-05, + "loss": 0.4984, + "step": 202200 + }, + { + "epoch": 0.13, + "learning_rate": 8.057575757575759e-05, + "loss": 0.4947, + "step": 202300 + }, + { + "epoch": 0.13, + "learning_rate": 8.056565656565656e-05, + "loss": 0.4977, + "step": 202400 + }, + { + "epoch": 0.13, + "learning_rate": 8.055555555555556e-05, + "loss": 0.495, + "step": 202500 + }, + { + "epoch": 0.13, + "learning_rate": 8.054545454545455e-05, + "loss": 0.4947, + "step": 202600 + }, + { + "epoch": 0.13, + "learning_rate": 8.053535353535354e-05, + "loss": 0.4956, + "step": 202700 + }, + { + "epoch": 0.13, + "learning_rate": 8.052525252525253e-05, + "loss": 0.497, + "step": 202800 + }, + { + "epoch": 0.13, + "learning_rate": 8.051515151515152e-05, + "loss": 0.4937, + "step": 202900 + }, + { + "epoch": 0.13, + "learning_rate": 8.05050505050505e-05, + "loss": 0.492, + "step": 203000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.49410605380317446, + "eval_average_loss_on_sentence_tokens": 0.45996787390762356, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.49266600608825684, + "eval_non_padding_tokens_in_labels": 133.5405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3812, + "eval_padding_tokens_in_labels": 378.4595, + "eval_reconstruction_accuracy": 0.9112230957901286, + "eval_runtime": 196.7321, + "eval_samples_per_second": 25.415, + "eval_sentence_accuracy": 0.708633158074183, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.2464, + "step": 203000 + }, + { + "epoch": 0.13, + "learning_rate": 8.04949494949495e-05, + "loss": 0.4962, + "step": 203100 + }, + { + "epoch": 0.13, + "learning_rate": 8.048484848484849e-05, + "loss": 0.4988, + "step": 203200 + }, + { + "epoch": 0.13, + "learning_rate": 8.047474747474748e-05, + "loss": 0.4965, + "step": 203300 + }, + { + "epoch": 0.13, + "learning_rate": 8.046464646464647e-05, + "loss": 0.4942, + "step": 203400 + }, + { + "epoch": 0.13, + "learning_rate": 8.045454545454546e-05, + "loss": 0.4961, + "step": 203500 + }, + { + "epoch": 0.13, + "learning_rate": 8.044444444444444e-05, + "loss": 0.4934, + "step": 203600 + }, + { + "epoch": 0.13, + "learning_rate": 8.043434343434345e-05, + "loss": 0.4941, + "step": 203700 + }, + { + "epoch": 0.13, + "learning_rate": 8.042424242424242e-05, + "loss": 0.4955, + "step": 203800 + }, + { + "epoch": 0.13, + "learning_rate": 8.041414141414142e-05, + "loss": 0.4978, + "step": 203900 + }, + { + "epoch": 0.13, + "learning_rate": 8.040404040404041e-05, + "loss": 0.4953, + "step": 204000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.49487208386717163, + "eval_average_loss_on_sentence_tokens": 0.418085328989798, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.49143555760383606, + "eval_non_padding_tokens_in_labels": 133.53855, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3802, + "eval_padding_tokens_in_labels": 378.46145, + "eval_reconstruction_accuracy": 0.9111932878748807, + "eval_runtime": 182.319, + "eval_samples_per_second": 27.424, + "eval_sentence_accuracy": 0.7383898289877439, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.249975, + "step": 204000 + }, + { + "epoch": 0.13, + "learning_rate": 8.03939393939394e-05, + "loss": 0.4946, + "step": 204100 + }, + { + "epoch": 0.13, + "learning_rate": 8.038383838383838e-05, + "loss": 0.4975, + "step": 204200 + }, + { + "epoch": 0.13, + "learning_rate": 8.037373737373738e-05, + "loss": 0.4961, + "step": 204300 + }, + { + "epoch": 0.13, + "learning_rate": 8.036363636363636e-05, + "loss": 0.4965, + "step": 204400 + }, + { + "epoch": 0.13, + "learning_rate": 8.035353535353535e-05, + "loss": 0.4995, + "step": 204500 + }, + { + "epoch": 0.13, + "learning_rate": 8.034343434343435e-05, + "loss": 0.4938, + "step": 204600 + }, + { + "epoch": 0.13, + "learning_rate": 8.033333333333334e-05, + "loss": 0.4964, + "step": 204700 + }, + { + "epoch": 0.13, + "learning_rate": 8.032323232323232e-05, + "loss": 0.4945, + "step": 204800 + }, + { + "epoch": 0.13, + "learning_rate": 8.031313131313132e-05, + "loss": 0.491, + "step": 204900 + }, + { + "epoch": 0.14, + "learning_rate": 8.03030303030303e-05, + "loss": 0.496, + "step": 205000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.49380948884086917, + "eval_average_loss_on_sentence_tokens": 0.4128438226357274, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.49015626311302185, + "eval_non_padding_tokens_in_labels": 133.53555, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3724, + "eval_padding_tokens_in_labels": 378.46445, + "eval_reconstruction_accuracy": 0.9111428982744881, + "eval_runtime": 189.387, + "eval_samples_per_second": 26.401, + "eval_sentence_accuracy": 0.7526782349668921, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 205000 + }, + { + "epoch": 0.14, + "learning_rate": 8.02929292929293e-05, + "loss": 0.4929, + "step": 205100 + }, + { + "epoch": 0.14, + "learning_rate": 8.028282828282829e-05, + "loss": 0.4954, + "step": 205200 + }, + { + "epoch": 0.14, + "learning_rate": 8.027272727272728e-05, + "loss": 0.4928, + "step": 205300 + }, + { + "epoch": 0.14, + "learning_rate": 8.026262626262626e-05, + "loss": 0.4868, + "step": 205400 + }, + { + "epoch": 0.14, + "learning_rate": 8.025252525252526e-05, + "loss": 0.4958, + "step": 205500 + }, + { + "epoch": 0.14, + "learning_rate": 8.024242424242424e-05, + "loss": 0.4961, + "step": 205600 + }, + { + "epoch": 0.14, + "learning_rate": 8.023232323232323e-05, + "loss": 0.4964, + "step": 205700 + }, + { + "epoch": 0.14, + "learning_rate": 8.022222222222222e-05, + "loss": 0.4953, + "step": 205800 + }, + { + "epoch": 0.14, + "learning_rate": 8.021212121212122e-05, + "loss": 0.4941, + "step": 205900 + }, + { + "epoch": 0.14, + "learning_rate": 8.02020202020202e-05, + "loss": 0.4919, + "step": 206000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.49479755070565, + "eval_average_loss_on_sentence_tokens": 0.39239656104322757, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.49012696743011475, + "eval_non_padding_tokens_in_labels": 133.5357, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3844, + "eval_padding_tokens_in_labels": 378.4643, + "eval_reconstruction_accuracy": 0.9113352863447206, + "eval_runtime": 183.473, + "eval_samples_per_second": 27.252, + "eval_sentence_accuracy": 0.754122777109839, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 206000 + }, + { + "epoch": 0.14, + "learning_rate": 8.01919191919192e-05, + "loss": 0.4938, + "step": 206100 + }, + { + "epoch": 0.14, + "learning_rate": 8.018181818181818e-05, + "loss": 0.4928, + "step": 206200 + }, + { + "epoch": 0.14, + "learning_rate": 8.017171717171717e-05, + "loss": 0.4929, + "step": 206300 + }, + { + "epoch": 0.14, + "learning_rate": 8.016161616161618e-05, + "loss": 0.4964, + "step": 206400 + }, + { + "epoch": 0.14, + "learning_rate": 8.015151515151515e-05, + "loss": 0.4927, + "step": 206500 + }, + { + "epoch": 0.14, + "learning_rate": 8.014141414141415e-05, + "loss": 0.4926, + "step": 206600 + }, + { + "epoch": 0.14, + "learning_rate": 8.013131313131314e-05, + "loss": 0.4943, + "step": 206700 + }, + { + "epoch": 0.14, + "learning_rate": 8.012121212121213e-05, + "loss": 0.4935, + "step": 206800 + }, + { + "epoch": 0.14, + "learning_rate": 8.011111111111111e-05, + "loss": 0.4904, + "step": 206900 + }, + { + "epoch": 0.14, + "learning_rate": 8.010101010101011e-05, + "loss": 0.4952, + "step": 207000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.49324536753317205, + "eval_average_loss_on_sentence_tokens": 0.43504800315322917, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4906054735183716, + "eval_non_padding_tokens_in_labels": 133.5427, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3921, + "eval_padding_tokens_in_labels": 378.4573, + "eval_reconstruction_accuracy": 0.9114456201512079, + "eval_runtime": 195.3601, + "eval_samples_per_second": 25.594, + "eval_sentence_accuracy": 0.7255953128645002, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 207000 + }, + { + "epoch": 0.14, + "learning_rate": 8.009090909090909e-05, + "loss": 0.4984, + "step": 207100 + }, + { + "epoch": 0.14, + "learning_rate": 8.008080808080808e-05, + "loss": 0.4949, + "step": 207200 + }, + { + "epoch": 0.14, + "learning_rate": 8.007070707070708e-05, + "loss": 0.4915, + "step": 207300 + }, + { + "epoch": 0.14, + "learning_rate": 8.006060606060607e-05, + "loss": 0.4948, + "step": 207400 + }, + { + "epoch": 0.14, + "learning_rate": 8.005050505050506e-05, + "loss": 0.4967, + "step": 207500 + }, + { + "epoch": 0.14, + "learning_rate": 8.004040404040405e-05, + "loss": 0.4919, + "step": 207600 + }, + { + "epoch": 0.14, + "learning_rate": 8.003030303030303e-05, + "loss": 0.4916, + "step": 207700 + }, + { + "epoch": 0.14, + "learning_rate": 8.002020202020202e-05, + "loss": 0.4964, + "step": 207800 + }, + { + "epoch": 0.14, + "learning_rate": 8.001010101010101e-05, + "loss": 0.495, + "step": 207900 + }, + { + "epoch": 0.14, + "learning_rate": 8e-05, + "loss": 0.4915, + "step": 208000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.493747828556695, + "eval_average_loss_on_sentence_tokens": 0.3991245636445876, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.4893554747104645, + "eval_non_padding_tokens_in_labels": 133.54225, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3615, + "eval_padding_tokens_in_labels": 378.45775, + "eval_reconstruction_accuracy": 0.9113427345554264, + "eval_runtime": 180.4137, + "eval_samples_per_second": 27.714, + "eval_sentence_accuracy": 0.7560338794480234, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 208000 + }, + { + "epoch": 0.14, + "learning_rate": 7.9989898989899e-05, + "loss": 0.4946, + "step": 208100 + }, + { + "epoch": 0.14, + "learning_rate": 7.997979797979799e-05, + "loss": 0.4962, + "step": 208200 + }, + { + "epoch": 0.14, + "learning_rate": 7.996969696969697e-05, + "loss": 0.4907, + "step": 208300 + }, + { + "epoch": 0.14, + "learning_rate": 7.995959595959596e-05, + "loss": 0.4938, + "step": 208400 + }, + { + "epoch": 0.14, + "learning_rate": 7.994949494949495e-05, + "loss": 0.4936, + "step": 208500 + }, + { + "epoch": 0.14, + "learning_rate": 7.993939393939394e-05, + "loss": 0.4975, + "step": 208600 + }, + { + "epoch": 0.14, + "learning_rate": 7.992929292929294e-05, + "loss": 0.495, + "step": 208700 + }, + { + "epoch": 0.14, + "learning_rate": 7.991919191919193e-05, + "loss": 0.4903, + "step": 208800 + }, + { + "epoch": 0.14, + "learning_rate": 7.990909090909091e-05, + "loss": 0.4923, + "step": 208900 + }, + { + "epoch": 0.14, + "learning_rate": 7.989898989898991e-05, + "loss": 0.4951, + "step": 209000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4923818258171794, + "eval_average_loss_on_sentence_tokens": 0.411363041210563, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.48876953125, + "eval_non_padding_tokens_in_labels": 133.53775, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37425, + "eval_padding_tokens_in_labels": 378.46225, + "eval_reconstruction_accuracy": 0.9114662493341075, + "eval_runtime": 182.4312, + "eval_samples_per_second": 27.408, + "eval_sentence_accuracy": 0.7515611821917563, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 209000 + }, + { + "epoch": 0.14, + "learning_rate": 7.988888888888889e-05, + "loss": 0.488, + "step": 209100 + }, + { + "epoch": 0.14, + "learning_rate": 7.987878787878788e-05, + "loss": 0.4919, + "step": 209200 + }, + { + "epoch": 0.14, + "learning_rate": 7.986868686868688e-05, + "loss": 0.4938, + "step": 209300 + }, + { + "epoch": 0.14, + "learning_rate": 7.985858585858587e-05, + "loss": 0.4946, + "step": 209400 + }, + { + "epoch": 0.14, + "learning_rate": 7.984848484848485e-05, + "loss": 0.4884, + "step": 209500 + }, + { + "epoch": 0.14, + "learning_rate": 7.983838383838385e-05, + "loss": 0.4918, + "step": 209600 + }, + { + "epoch": 0.14, + "learning_rate": 7.982828282828283e-05, + "loss": 0.4911, + "step": 209700 + }, + { + "epoch": 0.14, + "learning_rate": 7.981818181818182e-05, + "loss": 0.4946, + "step": 209800 + }, + { + "epoch": 0.14, + "learning_rate": 7.980808080808081e-05, + "loss": 0.4928, + "step": 209900 + }, + { + "epoch": 0.14, + "learning_rate": 7.97979797979798e-05, + "loss": 0.492, + "step": 210000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.49238063824945477, + "eval_average_loss_on_sentence_tokens": 0.4237426647383707, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.4891894459724426, + "eval_non_padding_tokens_in_labels": 133.5465, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3885, + "eval_padding_tokens_in_labels": 378.4535, + "eval_reconstruction_accuracy": 0.9115374820187481, + "eval_runtime": 179.7313, + "eval_samples_per_second": 27.819, + "eval_sentence_accuracy": 0.733818436305561, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 210000 + }, + { + "epoch": 0.14, + "learning_rate": 7.978787878787878e-05, + "loss": 0.4912, + "step": 210100 + }, + { + "epoch": 0.14, + "learning_rate": 7.977777777777779e-05, + "loss": 0.4927, + "step": 210200 + }, + { + "epoch": 0.14, + "learning_rate": 7.976767676767677e-05, + "loss": 0.4935, + "step": 210300 + }, + { + "epoch": 0.14, + "learning_rate": 7.975757575757576e-05, + "loss": 0.4947, + "step": 210400 + }, + { + "epoch": 0.14, + "learning_rate": 7.974747474747475e-05, + "loss": 0.4951, + "step": 210500 + }, + { + "epoch": 0.14, + "learning_rate": 7.973737373737374e-05, + "loss": 0.4922, + "step": 210600 + }, + { + "epoch": 0.14, + "learning_rate": 7.972727272727272e-05, + "loss": 0.4917, + "step": 210700 + }, + { + "epoch": 0.14, + "learning_rate": 7.971717171717173e-05, + "loss": 0.4961, + "step": 210800 + }, + { + "epoch": 0.14, + "learning_rate": 7.97070707070707e-05, + "loss": 0.4942, + "step": 210900 + }, + { + "epoch": 0.14, + "learning_rate": 7.96969696969697e-05, + "loss": 0.4915, + "step": 211000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4928760595549721, + "eval_average_loss_on_sentence_tokens": 0.43058695889764664, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.48994141817092896, + "eval_non_padding_tokens_in_labels": 133.534, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37765, + "eval_padding_tokens_in_labels": 378.466, + "eval_reconstruction_accuracy": 0.9114283352494835, + "eval_runtime": 181.8821, + "eval_samples_per_second": 27.49, + "eval_sentence_accuracy": 0.7303910133328548, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 211000 + }, + { + "epoch": 0.14, + "learning_rate": 7.968686868686869e-05, + "loss": 0.4927, + "step": 211100 + }, + { + "epoch": 0.14, + "learning_rate": 7.967676767676768e-05, + "loss": 0.4907, + "step": 211200 + }, + { + "epoch": 0.14, + "learning_rate": 7.966666666666666e-05, + "loss": 0.4909, + "step": 211300 + }, + { + "epoch": 0.14, + "learning_rate": 7.965656565656567e-05, + "loss": 0.488, + "step": 211400 + }, + { + "epoch": 0.14, + "learning_rate": 7.964646464646464e-05, + "loss": 0.4924, + "step": 211500 + }, + { + "epoch": 0.14, + "learning_rate": 7.963636363636364e-05, + "loss": 0.4903, + "step": 211600 + }, + { + "epoch": 0.14, + "learning_rate": 7.962626262626263e-05, + "loss": 0.4959, + "step": 211700 + }, + { + "epoch": 0.14, + "learning_rate": 7.961616161616162e-05, + "loss": 0.4921, + "step": 211800 + }, + { + "epoch": 0.14, + "learning_rate": 7.960606060606061e-05, + "loss": 0.4934, + "step": 211900 + }, + { + "epoch": 0.14, + "learning_rate": 7.95959595959596e-05, + "loss": 0.4944, + "step": 212000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4918791010657235, + "eval_average_loss_on_sentence_tokens": 0.4177123438534755, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.48844727873802185, + "eval_non_padding_tokens_in_labels": 133.5256, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38915, + "eval_padding_tokens_in_labels": 378.4744, + "eval_reconstruction_accuracy": 0.9116667483639984, + "eval_runtime": 186.122, + "eval_samples_per_second": 26.864, + "eval_sentence_accuracy": 0.735756455578086, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 212000 + }, + { + "epoch": 0.14, + "learning_rate": 7.958585858585858e-05, + "loss": 0.4969, + "step": 212100 + }, + { + "epoch": 0.14, + "learning_rate": 7.957575757575757e-05, + "loss": 0.4909, + "step": 212200 + }, + { + "epoch": 0.14, + "learning_rate": 7.956565656565657e-05, + "loss": 0.4894, + "step": 212300 + }, + { + "epoch": 0.14, + "learning_rate": 7.955555555555556e-05, + "loss": 0.4956, + "step": 212400 + }, + { + "epoch": 0.14, + "learning_rate": 7.954545454545455e-05, + "loss": 0.4911, + "step": 212500 + }, + { + "epoch": 0.14, + "learning_rate": 7.953535353535354e-05, + "loss": 0.4943, + "step": 212600 + }, + { + "epoch": 0.14, + "learning_rate": 7.952525252525252e-05, + "loss": 0.4911, + "step": 212700 + }, + { + "epoch": 0.14, + "learning_rate": 7.951515151515153e-05, + "loss": 0.4896, + "step": 212800 + }, + { + "epoch": 0.14, + "learning_rate": 7.95050505050505e-05, + "loss": 0.4915, + "step": 212900 + }, + { + "epoch": 0.14, + "learning_rate": 7.94949494949495e-05, + "loss": 0.4945, + "step": 213000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4908698829841157, + "eval_average_loss_on_sentence_tokens": 0.41228333046577764, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.48726561665534973, + "eval_non_padding_tokens_in_labels": 133.5487, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37365, + "eval_padding_tokens_in_labels": 378.4513, + "eval_reconstruction_accuracy": 0.9116843043281805, + "eval_runtime": 184.6259, + "eval_samples_per_second": 27.082, + "eval_sentence_accuracy": 0.7466443555188688, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 213000 + }, + { + "epoch": 0.14, + "learning_rate": 7.948484848484849e-05, + "loss": 0.491, + "step": 213100 + }, + { + "epoch": 0.14, + "learning_rate": 7.947474747474748e-05, + "loss": 0.4923, + "step": 213200 + }, + { + "epoch": 0.14, + "learning_rate": 7.946464646464646e-05, + "loss": 0.4934, + "step": 213300 + }, + { + "epoch": 0.14, + "learning_rate": 7.945454545454547e-05, + "loss": 0.4913, + "step": 213400 + }, + { + "epoch": 0.14, + "learning_rate": 7.944444444444444e-05, + "loss": 0.4935, + "step": 213500 + }, + { + "epoch": 0.14, + "learning_rate": 7.943434343434344e-05, + "loss": 0.4932, + "step": 213600 + }, + { + "epoch": 0.14, + "learning_rate": 7.942424242424243e-05, + "loss": 0.4953, + "step": 213700 + }, + { + "epoch": 0.14, + "learning_rate": 7.941414141414142e-05, + "loss": 0.4929, + "step": 213800 + }, + { + "epoch": 0.14, + "learning_rate": 7.94040404040404e-05, + "loss": 0.4899, + "step": 213900 + }, + { + "epoch": 0.14, + "learning_rate": 7.93939393939394e-05, + "loss": 0.488, + "step": 214000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.491413380543862, + "eval_average_loss_on_sentence_tokens": 0.4265113777036181, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.48863279819488525, + "eval_non_padding_tokens_in_labels": 133.52205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3583, + "eval_padding_tokens_in_labels": 378.47795, + "eval_reconstruction_accuracy": 0.9115269015412157, + "eval_runtime": 184.4964, + "eval_samples_per_second": 27.101, + "eval_sentence_accuracy": 0.7350296983508892, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 214000 + }, + { + "epoch": 0.14, + "learning_rate": 7.938383838383838e-05, + "loss": 0.4934, + "step": 214100 + }, + { + "epoch": 0.14, + "learning_rate": 7.937373737373737e-05, + "loss": 0.4892, + "step": 214200 + }, + { + "epoch": 0.14, + "learning_rate": 7.936363636363637e-05, + "loss": 0.4886, + "step": 214300 + }, + { + "epoch": 0.14, + "learning_rate": 7.935353535353536e-05, + "loss": 0.4906, + "step": 214400 + }, + { + "epoch": 0.14, + "learning_rate": 7.934343434343434e-05, + "loss": 0.491, + "step": 214500 + }, + { + "epoch": 0.14, + "learning_rate": 7.933333333333334e-05, + "loss": 0.4913, + "step": 214600 + }, + { + "epoch": 0.14, + "learning_rate": 7.932323232323232e-05, + "loss": 0.4921, + "step": 214700 + }, + { + "epoch": 0.14, + "learning_rate": 7.931313131313131e-05, + "loss": 0.4939, + "step": 214800 + }, + { + "epoch": 0.14, + "learning_rate": 7.930303030303032e-05, + "loss": 0.4902, + "step": 214900 + }, + { + "epoch": 0.14, + "learning_rate": 7.92929292929293e-05, + "loss": 0.4926, + "step": 215000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.49142011319913353, + "eval_average_loss_on_sentence_tokens": 0.42872210084706514, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.48863279819488525, + "eval_non_padding_tokens_in_labels": 133.54105, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3804, + "eval_padding_tokens_in_labels": 378.45895, + "eval_reconstruction_accuracy": 0.9116686889726652, + "eval_runtime": 183.9193, + "eval_samples_per_second": 27.186, + "eval_sentence_accuracy": 0.7305928903404095, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 215000 + }, + { + "epoch": 0.15, + "learning_rate": 7.928282828282829e-05, + "loss": 0.4931, + "step": 215100 + }, + { + "epoch": 0.15, + "learning_rate": 7.927272727272728e-05, + "loss": 0.4899, + "step": 215200 + }, + { + "epoch": 0.15, + "learning_rate": 7.926262626262627e-05, + "loss": 0.4913, + "step": 215300 + }, + { + "epoch": 0.15, + "learning_rate": 7.925252525252525e-05, + "loss": 0.4885, + "step": 215400 + }, + { + "epoch": 0.15, + "learning_rate": 7.924242424242426e-05, + "loss": 0.4899, + "step": 215500 + }, + { + "epoch": 0.15, + "learning_rate": 7.923232323232323e-05, + "loss": 0.4917, + "step": 215600 + }, + { + "epoch": 0.15, + "learning_rate": 7.922222222222223e-05, + "loss": 0.4863, + "step": 215700 + }, + { + "epoch": 0.15, + "learning_rate": 7.921212121212122e-05, + "loss": 0.4923, + "step": 215800 + }, + { + "epoch": 0.15, + "learning_rate": 7.920202020202021e-05, + "loss": 0.4901, + "step": 215900 + }, + { + "epoch": 0.15, + "learning_rate": 7.919191919191919e-05, + "loss": 0.4876, + "step": 216000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.49056105998736893, + "eval_average_loss_on_sentence_tokens": 0.39907881125207223, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4863574206829071, + "eval_non_padding_tokens_in_labels": 133.53015, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39405, + "eval_padding_tokens_in_labels": 378.46985, + "eval_reconstruction_accuracy": 0.9116313835197916, + "eval_runtime": 181.624, + "eval_samples_per_second": 27.529, + "eval_sentence_accuracy": 0.7526737488111688, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "step": 216000 + }, + { + "epoch": 0.15, + "learning_rate": 7.91818181818182e-05, + "loss": 0.4895, + "step": 216100 + }, + { + "epoch": 0.15, + "learning_rate": 7.917171717171717e-05, + "loss": 0.4895, + "step": 216200 + }, + { + "epoch": 0.15, + "learning_rate": 7.916161616161616e-05, + "loss": 0.491, + "step": 216300 + }, + { + "epoch": 0.15, + "learning_rate": 7.915151515151516e-05, + "loss": 0.489, + "step": 216400 + }, + { + "epoch": 0.15, + "learning_rate": 7.914141414141415e-05, + "loss": 0.4954, + "step": 216500 + }, + { + "epoch": 0.15, + "learning_rate": 7.913131313131314e-05, + "loss": 0.4934, + "step": 216600 + }, + { + "epoch": 0.15, + "learning_rate": 7.912121212121213e-05, + "loss": 0.4894, + "step": 216700 + }, + { + "epoch": 0.15, + "learning_rate": 7.911111111111111e-05, + "loss": 0.4898, + "step": 216800 + }, + { + "epoch": 0.15, + "learning_rate": 7.91010101010101e-05, + "loss": 0.4917, + "step": 216900 + }, + { + "epoch": 0.15, + "learning_rate": 7.90909090909091e-05, + "loss": 0.4906, + "step": 217000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4903790914915998, + "eval_average_loss_on_sentence_tokens": 0.39259607258486884, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4859863221645355, + "eval_non_padding_tokens_in_labels": 133.52445, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38115, + "eval_padding_tokens_in_labels": 378.47555, + "eval_reconstruction_accuracy": 0.9115983616418045, + "eval_runtime": 182.6506, + "eval_samples_per_second": 27.375, + "eval_sentence_accuracy": 0.7647504800186624, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 217000 + }, + { + "epoch": 0.15, + "learning_rate": 7.908080808080809e-05, + "loss": 0.4893, + "step": 217100 + }, + { + "epoch": 0.15, + "learning_rate": 7.907070707070708e-05, + "loss": 0.4904, + "step": 217200 + }, + { + "epoch": 0.15, + "learning_rate": 7.906060606060607e-05, + "loss": 0.4923, + "step": 217300 + }, + { + "epoch": 0.15, + "learning_rate": 7.905050505050505e-05, + "loss": 0.4942, + "step": 217400 + }, + { + "epoch": 0.15, + "learning_rate": 7.904040404040404e-05, + "loss": 0.4873, + "step": 217500 + }, + { + "epoch": 0.15, + "learning_rate": 7.903030303030303e-05, + "loss": 0.4912, + "step": 217600 + }, + { + "epoch": 0.15, + "learning_rate": 7.902020202020203e-05, + "loss": 0.4938, + "step": 217700 + }, + { + "epoch": 0.15, + "learning_rate": 7.901010101010102e-05, + "loss": 0.4881, + "step": 217800 + }, + { + "epoch": 0.15, + "learning_rate": 7.900000000000001e-05, + "loss": 0.4911, + "step": 217900 + }, + { + "epoch": 0.15, + "learning_rate": 7.898989898989899e-05, + "loss": 0.4879, + "step": 218000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.49027530909539807, + "eval_average_loss_on_sentence_tokens": 0.4416524100453182, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.48808592557907104, + "eval_non_padding_tokens_in_labels": 133.52225, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37655, + "eval_padding_tokens_in_labels": 378.47775, + "eval_reconstruction_accuracy": 0.9117977773187639, + "eval_runtime": 179.7537, + "eval_samples_per_second": 27.816, + "eval_sentence_accuracy": 0.7212616864356596, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 218000 + }, + { + "epoch": 0.15, + "learning_rate": 7.8979797979798e-05, + "loss": 0.4905, + "step": 218100 + }, + { + "epoch": 0.15, + "learning_rate": 7.896969696969697e-05, + "loss": 0.4859, + "step": 218200 + }, + { + "epoch": 0.15, + "learning_rate": 7.895959595959596e-05, + "loss": 0.494, + "step": 218300 + }, + { + "epoch": 0.15, + "learning_rate": 7.894949494949496e-05, + "loss": 0.4913, + "step": 218400 + }, + { + "epoch": 0.15, + "learning_rate": 7.893939393939395e-05, + "loss": 0.4844, + "step": 218500 + }, + { + "epoch": 0.15, + "learning_rate": 7.892929292929293e-05, + "loss": 0.4899, + "step": 218600 + }, + { + "epoch": 0.15, + "learning_rate": 7.891919191919193e-05, + "loss": 0.4882, + "step": 218700 + }, + { + "epoch": 0.15, + "learning_rate": 7.890909090909091e-05, + "loss": 0.4892, + "step": 218800 + }, + { + "epoch": 0.15, + "learning_rate": 7.88989898989899e-05, + "loss": 0.4865, + "step": 218900 + }, + { + "epoch": 0.15, + "learning_rate": 7.88888888888889e-05, + "loss": 0.4931, + "step": 219000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4902837870566134, + "eval_average_loss_on_sentence_tokens": 0.42429331659330055, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4872363209724426, + "eval_non_padding_tokens_in_labels": 133.53645, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3841, + "eval_padding_tokens_in_labels": 378.46355, + "eval_reconstruction_accuracy": 0.9118543861558228, + "eval_runtime": 179.1594, + "eval_samples_per_second": 27.908, + "eval_sentence_accuracy": 0.7322437956466344, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 219000 + }, + { + "epoch": 0.15, + "learning_rate": 7.887878787878789e-05, + "loss": 0.4895, + "step": 219100 + }, + { + "epoch": 0.15, + "learning_rate": 7.886868686868686e-05, + "loss": 0.4908, + "step": 219200 + }, + { + "epoch": 0.15, + "learning_rate": 7.885858585858587e-05, + "loss": 0.4883, + "step": 219300 + }, + { + "epoch": 0.15, + "learning_rate": 7.884848484848485e-05, + "loss": 0.4932, + "step": 219400 + }, + { + "epoch": 0.15, + "learning_rate": 7.883838383838384e-05, + "loss": 0.4942, + "step": 219500 + }, + { + "epoch": 0.15, + "learning_rate": 7.882828282828283e-05, + "loss": 0.4868, + "step": 219600 + }, + { + "epoch": 0.15, + "learning_rate": 7.881818181818182e-05, + "loss": 0.4894, + "step": 219700 + }, + { + "epoch": 0.15, + "learning_rate": 7.88080808080808e-05, + "loss": 0.4884, + "step": 219800 + }, + { + "epoch": 0.15, + "learning_rate": 7.879797979797981e-05, + "loss": 0.4882, + "step": 219900 + }, + { + "epoch": 0.15, + "learning_rate": 7.878787878787879e-05, + "loss": 0.4911, + "step": 220000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.49014262135364284, + "eval_average_loss_on_sentence_tokens": 0.3533857311509611, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.48393553495407104, + "eval_non_padding_tokens_in_labels": 133.5509, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39685, + "eval_padding_tokens_in_labels": 378.4491, + "eval_reconstruction_accuracy": 0.911943466961429, + "eval_runtime": 181.6467, + "eval_samples_per_second": 27.526, + "eval_sentence_accuracy": 0.7753333213702513, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 220000 + }, + { + "epoch": 0.15, + "learning_rate": 7.877777777777778e-05, + "loss": 0.4876, + "step": 220100 + }, + { + "epoch": 0.15, + "learning_rate": 7.876767676767677e-05, + "loss": 0.4895, + "step": 220200 + }, + { + "epoch": 0.15, + "learning_rate": 7.875757575757576e-05, + "loss": 0.4882, + "step": 220300 + }, + { + "epoch": 0.15, + "learning_rate": 7.874747474747474e-05, + "loss": 0.4874, + "step": 220400 + }, + { + "epoch": 0.15, + "learning_rate": 7.873737373737375e-05, + "loss": 0.4932, + "step": 220500 + }, + { + "epoch": 0.15, + "learning_rate": 7.872727272727273e-05, + "loss": 0.4885, + "step": 220600 + }, + { + "epoch": 0.15, + "learning_rate": 7.871717171717172e-05, + "loss": 0.487, + "step": 220700 + }, + { + "epoch": 0.15, + "learning_rate": 7.870707070707071e-05, + "loss": 0.4851, + "step": 220800 + }, + { + "epoch": 0.15, + "learning_rate": 7.86969696969697e-05, + "loss": 0.4897, + "step": 220900 + }, + { + "epoch": 0.15, + "learning_rate": 7.868686868686869e-05, + "loss": 0.4921, + "step": 221000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.48849342408322133, + "eval_average_loss_on_sentence_tokens": 0.38415450254941935, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.4838281273841858, + "eval_non_padding_tokens_in_labels": 133.54895, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3942, + "eval_padding_tokens_in_labels": 378.45105, + "eval_reconstruction_accuracy": 0.911995063421378, + "eval_runtime": 183.1247, + "eval_samples_per_second": 27.304, + "eval_sentence_accuracy": 0.7652574156154108, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 221000 + }, + { + "epoch": 0.15, + "learning_rate": 7.867676767676768e-05, + "loss": 0.4904, + "step": 221100 + }, + { + "epoch": 0.15, + "learning_rate": 7.866666666666666e-05, + "loss": 0.4911, + "step": 221200 + }, + { + "epoch": 0.15, + "learning_rate": 7.865656565656566e-05, + "loss": 0.4905, + "step": 221300 + }, + { + "epoch": 0.15, + "learning_rate": 7.864646464646465e-05, + "loss": 0.4907, + "step": 221400 + }, + { + "epoch": 0.15, + "learning_rate": 7.863636363636364e-05, + "loss": 0.4939, + "step": 221500 + }, + { + "epoch": 0.15, + "learning_rate": 7.862626262626263e-05, + "loss": 0.4913, + "step": 221600 + }, + { + "epoch": 0.15, + "learning_rate": 7.861616161616162e-05, + "loss": 0.4929, + "step": 221700 + }, + { + "epoch": 0.15, + "learning_rate": 7.86060606060606e-05, + "loss": 0.4916, + "step": 221800 + }, + { + "epoch": 0.15, + "learning_rate": 7.859595959595961e-05, + "loss": 0.4909, + "step": 221900 + }, + { + "epoch": 0.15, + "learning_rate": 7.858585858585859e-05, + "loss": 0.4892, + "step": 222000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4889283674244807, + "eval_average_loss_on_sentence_tokens": 0.3565330426281094, + "eval_average_shuffling_prob": 0.43, + "eval_loss": 0.48287108540534973, + "eval_non_padding_tokens_in_labels": 133.5352, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3773, + "eval_padding_tokens_in_labels": 378.4648, + "eval_reconstruction_accuracy": 0.9120281356301421, + "eval_runtime": 179.366, + "eval_samples_per_second": 27.876, + "eval_sentence_accuracy": 0.7827534229368169, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2451, + "step": 222000 + }, + { + "epoch": 0.15, + "learning_rate": 7.857575757575758e-05, + "loss": 0.4899, + "step": 222100 + }, + { + "epoch": 0.15, + "learning_rate": 7.856565656565657e-05, + "loss": 0.4909, + "step": 222200 + }, + { + "epoch": 0.15, + "learning_rate": 7.855555555555556e-05, + "loss": 0.4926, + "step": 222300 + }, + { + "epoch": 0.15, + "learning_rate": 7.854545454545454e-05, + "loss": 0.4884, + "step": 222400 + }, + { + "epoch": 0.15, + "learning_rate": 7.853535353535355e-05, + "loss": 0.4913, + "step": 222500 + }, + { + "epoch": 0.15, + "learning_rate": 7.852525252525252e-05, + "loss": 0.4854, + "step": 222600 + }, + { + "epoch": 0.15, + "learning_rate": 7.851515151515152e-05, + "loss": 0.4898, + "step": 222700 + }, + { + "epoch": 0.15, + "learning_rate": 7.850505050505051e-05, + "loss": 0.487, + "step": 222800 + }, + { + "epoch": 0.15, + "learning_rate": 7.84949494949495e-05, + "loss": 0.4892, + "step": 222900 + }, + { + "epoch": 0.15, + "learning_rate": 7.848484848484848e-05, + "loss": 0.4911, + "step": 223000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.48904641073159955, + "eval_average_loss_on_sentence_tokens": 0.46070478162210593, + "eval_average_shuffling_prob": 0.57, + "eval_loss": 0.48777344822883606, + "eval_non_padding_tokens_in_labels": 133.54825, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36005, + "eval_padding_tokens_in_labels": 378.45175, + "eval_reconstruction_accuracy": 0.9118642402770271, + "eval_runtime": 190.7398, + "eval_samples_per_second": 26.214, + "eval_sentence_accuracy": 0.7053537782403503, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24509999999999996, + "step": 223000 + }, + { + "epoch": 0.15, + "learning_rate": 7.847474747474748e-05, + "loss": 0.4893, + "step": 223100 + }, + { + "epoch": 0.15, + "learning_rate": 7.846464646464646e-05, + "loss": 0.49, + "step": 223200 + }, + { + "epoch": 0.15, + "learning_rate": 7.845454545454545e-05, + "loss": 0.491, + "step": 223300 + }, + { + "epoch": 0.15, + "learning_rate": 7.844444444444446e-05, + "loss": 0.4873, + "step": 223400 + }, + { + "epoch": 0.15, + "learning_rate": 7.843434343434344e-05, + "loss": 0.4873, + "step": 223500 + }, + { + "epoch": 0.15, + "learning_rate": 7.842424242424243e-05, + "loss": 0.4873, + "step": 223600 + }, + { + "epoch": 0.15, + "learning_rate": 7.841414141414142e-05, + "loss": 0.4884, + "step": 223700 + }, + { + "epoch": 0.15, + "learning_rate": 7.840404040404041e-05, + "loss": 0.488, + "step": 223800 + }, + { + "epoch": 0.15, + "learning_rate": 7.839393939393939e-05, + "loss": 0.487, + "step": 223900 + }, + { + "epoch": 0.15, + "learning_rate": 7.83838383838384e-05, + "loss": 0.4876, + "step": 224000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.48820759661921104, + "eval_average_loss_on_sentence_tokens": 0.4300523694125947, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.48548829555511475, + "eval_non_padding_tokens_in_labels": 133.5162, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3905, + "eval_padding_tokens_in_labels": 378.4838, + "eval_reconstruction_accuracy": 0.9120636173470962, + "eval_runtime": 181.4269, + "eval_samples_per_second": 27.559, + "eval_sentence_accuracy": 0.7266361009923377, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 224000 + }, + { + "epoch": 0.15, + "learning_rate": 7.837373737373738e-05, + "loss": 0.4901, + "step": 224100 + }, + { + "epoch": 0.15, + "learning_rate": 7.836363636363637e-05, + "loss": 0.4898, + "step": 224200 + }, + { + "epoch": 0.15, + "learning_rate": 7.835353535353536e-05, + "loss": 0.4865, + "step": 224300 + }, + { + "epoch": 0.15, + "learning_rate": 7.834343434343435e-05, + "loss": 0.4862, + "step": 224400 + }, + { + "epoch": 0.15, + "learning_rate": 7.833333333333333e-05, + "loss": 0.4879, + "step": 224500 + }, + { + "epoch": 0.15, + "learning_rate": 7.832323232323234e-05, + "loss": 0.487, + "step": 224600 + }, + { + "epoch": 0.15, + "learning_rate": 7.831313131313131e-05, + "loss": 0.4899, + "step": 224700 + }, + { + "epoch": 0.15, + "learning_rate": 7.830303030303031e-05, + "loss": 0.4853, + "step": 224800 + }, + { + "epoch": 0.15, + "learning_rate": 7.82929292929293e-05, + "loss": 0.49, + "step": 224900 + }, + { + "epoch": 0.15, + "learning_rate": 7.828282828282829e-05, + "loss": 0.4936, + "step": 225000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.48813357925835077, + "eval_average_loss_on_sentence_tokens": 0.40872238760263685, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.48463866114616394, + "eval_non_padding_tokens_in_labels": 133.5075, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3676, + "eval_padding_tokens_in_labels": 378.4925, + "eval_reconstruction_accuracy": 0.911991956603516, + "eval_runtime": 183.2437, + "eval_samples_per_second": 27.286, + "eval_sentence_accuracy": 0.7455542196780734, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.25, + "step": 225000 + }, + { + "epoch": 0.16, + "learning_rate": 7.827272727272727e-05, + "loss": 0.4884, + "step": 225100 + }, + { + "epoch": 0.16, + "learning_rate": 7.826262626262627e-05, + "loss": 0.4899, + "step": 225200 + }, + { + "epoch": 0.16, + "learning_rate": 7.825252525252525e-05, + "loss": 0.4876, + "step": 225300 + }, + { + "epoch": 0.16, + "learning_rate": 7.824242424242425e-05, + "loss": 0.4912, + "step": 225400 + }, + { + "epoch": 0.16, + "learning_rate": 7.823232323232324e-05, + "loss": 0.4862, + "step": 225500 + }, + { + "epoch": 0.16, + "learning_rate": 7.822222222222223e-05, + "loss": 0.4895, + "step": 225600 + }, + { + "epoch": 0.16, + "learning_rate": 7.821212121212121e-05, + "loss": 0.491, + "step": 225700 + }, + { + "epoch": 0.16, + "learning_rate": 7.820202020202021e-05, + "loss": 0.4892, + "step": 225800 + }, + { + "epoch": 0.16, + "learning_rate": 7.819191919191919e-05, + "loss": 0.4903, + "step": 225900 + }, + { + "epoch": 0.16, + "learning_rate": 7.818181818181818e-05, + "loss": 0.485, + "step": 226000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4871991412092434, + "eval_average_loss_on_sentence_tokens": 0.45765147727813776, + "eval_average_shuffling_prob": 0.58, + "eval_loss": 0.48587891459465027, + "eval_non_padding_tokens_in_labels": 133.4725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3675, + "eval_padding_tokens_in_labels": 378.5275, + "eval_reconstruction_accuracy": 0.9121519649279144, + "eval_runtime": 183.1737, + "eval_samples_per_second": 27.296, + "eval_sentence_accuracy": 0.7040976546377878, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24360000000000004, + "step": 226000 + }, + { + "epoch": 0.16, + "learning_rate": 7.817171717171718e-05, + "loss": 0.4896, + "step": 226100 + }, + { + "epoch": 0.16, + "learning_rate": 7.816161616161617e-05, + "loss": 0.4911, + "step": 226200 + }, + { + "epoch": 0.16, + "learning_rate": 7.815151515151516e-05, + "loss": 0.4891, + "step": 226300 + }, + { + "epoch": 0.16, + "learning_rate": 7.814141414141415e-05, + "loss": 0.4905, + "step": 226400 + }, + { + "epoch": 0.16, + "learning_rate": 7.813131313131313e-05, + "loss": 0.488, + "step": 226500 + }, + { + "epoch": 0.16, + "learning_rate": 7.812121212121212e-05, + "loss": 0.4899, + "step": 226600 + }, + { + "epoch": 0.16, + "learning_rate": 7.811111111111111e-05, + "loss": 0.4877, + "step": 226700 + }, + { + "epoch": 0.16, + "learning_rate": 7.81010101010101e-05, + "loss": 0.4893, + "step": 226800 + }, + { + "epoch": 0.16, + "learning_rate": 7.80909090909091e-05, + "loss": 0.4896, + "step": 226900 + }, + { + "epoch": 0.16, + "learning_rate": 7.808080808080809e-05, + "loss": 0.4946, + "step": 227000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.48733561619387533, + "eval_average_loss_on_sentence_tokens": 0.39726325622313696, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4833105504512787, + "eval_non_padding_tokens_in_labels": 133.55225, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38785, + "eval_padding_tokens_in_labels": 378.44775, + "eval_reconstruction_accuracy": 0.9120904309691347, + "eval_runtime": 182.0228, + "eval_samples_per_second": 27.469, + "eval_sentence_accuracy": 0.7534319091284296, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 227000 + }, + { + "epoch": 0.16, + "learning_rate": 7.807070707070707e-05, + "loss": 0.4854, + "step": 227100 + }, + { + "epoch": 0.16, + "learning_rate": 7.806060606060607e-05, + "loss": 0.4861, + "step": 227200 + }, + { + "epoch": 0.16, + "learning_rate": 7.805050505050505e-05, + "loss": 0.4902, + "step": 227300 + }, + { + "epoch": 0.16, + "learning_rate": 7.804040404040404e-05, + "loss": 0.4882, + "step": 227400 + }, + { + "epoch": 0.16, + "learning_rate": 7.803030303030304e-05, + "loss": 0.4884, + "step": 227500 + }, + { + "epoch": 0.16, + "learning_rate": 7.802020202020203e-05, + "loss": 0.4837, + "step": 227600 + }, + { + "epoch": 0.16, + "learning_rate": 7.8010101010101e-05, + "loss": 0.4868, + "step": 227700 + }, + { + "epoch": 0.16, + "learning_rate": 7.800000000000001e-05, + "loss": 0.4908, + "step": 227800 + }, + { + "epoch": 0.16, + "learning_rate": 7.798989898989899e-05, + "loss": 0.4874, + "step": 227900 + }, + { + "epoch": 0.16, + "learning_rate": 7.797979797979798e-05, + "loss": 0.486, + "step": 228000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.48657816689784367, + "eval_average_loss_on_sentence_tokens": 0.4029945803196946, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4828417897224426, + "eval_non_padding_tokens_in_labels": 133.5291, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3911, + "eval_padding_tokens_in_labels": 378.4709, + "eval_reconstruction_accuracy": 0.912012587030008, + "eval_runtime": 177.8747, + "eval_samples_per_second": 28.11, + "eval_sentence_accuracy": 0.7614980171191702, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.248775, + "step": 228000 + }, + { + "epoch": 0.16, + "learning_rate": 7.796969696969697e-05, + "loss": 0.4872, + "step": 228100 + }, + { + "epoch": 0.16, + "learning_rate": 7.795959595959597e-05, + "loss": 0.492, + "step": 228200 + }, + { + "epoch": 0.16, + "learning_rate": 7.794949494949494e-05, + "loss": 0.4849, + "step": 228300 + }, + { + "epoch": 0.16, + "learning_rate": 7.793939393939395e-05, + "loss": 0.4854, + "step": 228400 + }, + { + "epoch": 0.16, + "learning_rate": 7.792929292929293e-05, + "loss": 0.4886, + "step": 228500 + }, + { + "epoch": 0.16, + "learning_rate": 7.791919191919192e-05, + "loss": 0.4903, + "step": 228600 + }, + { + "epoch": 0.16, + "learning_rate": 7.790909090909091e-05, + "loss": 0.4888, + "step": 228700 + }, + { + "epoch": 0.16, + "learning_rate": 7.78989898989899e-05, + "loss": 0.483, + "step": 228800 + }, + { + "epoch": 0.16, + "learning_rate": 7.788888888888888e-05, + "loss": 0.487, + "step": 228900 + }, + { + "epoch": 0.16, + "learning_rate": 7.787878787878789e-05, + "loss": 0.4855, + "step": 229000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.48644458924459016, + "eval_average_loss_on_sentence_tokens": 0.37638427194383484, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.48146483302116394, + "eval_non_padding_tokens_in_labels": 133.5192, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3705, + "eval_padding_tokens_in_labels": 378.4808, + "eval_reconstruction_accuracy": 0.9122583179014795, + "eval_runtime": 183.6595, + "eval_samples_per_second": 27.224, + "eval_sentence_accuracy": 0.7685592262278608, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 229000 + }, + { + "epoch": 0.16, + "learning_rate": 7.786868686868687e-05, + "loss": 0.4832, + "step": 229100 + }, + { + "epoch": 0.16, + "learning_rate": 7.785858585858586e-05, + "loss": 0.4863, + "step": 229200 + }, + { + "epoch": 0.16, + "learning_rate": 7.784848484848485e-05, + "loss": 0.4867, + "step": 229300 + }, + { + "epoch": 0.16, + "learning_rate": 7.783838383838384e-05, + "loss": 0.4859, + "step": 229400 + }, + { + "epoch": 0.16, + "learning_rate": 7.782828282828282e-05, + "loss": 0.4856, + "step": 229500 + }, + { + "epoch": 0.16, + "learning_rate": 7.781818181818183e-05, + "loss": 0.486, + "step": 229600 + }, + { + "epoch": 0.16, + "learning_rate": 7.78080808080808e-05, + "loss": 0.4884, + "step": 229700 + }, + { + "epoch": 0.16, + "learning_rate": 7.77979797979798e-05, + "loss": 0.4898, + "step": 229800 + }, + { + "epoch": 0.16, + "learning_rate": 7.778787878787879e-05, + "loss": 0.4856, + "step": 229900 + }, + { + "epoch": 0.16, + "learning_rate": 7.777777777777778e-05, + "loss": 0.4825, + "step": 230000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4864177170395644, + "eval_average_loss_on_sentence_tokens": 0.40747391876918015, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4828808605670929, + "eval_non_padding_tokens_in_labels": 133.52335, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38175, + "eval_padding_tokens_in_labels": 378.47665, + "eval_reconstruction_accuracy": 0.9122086672194802, + "eval_runtime": 179.3824, + "eval_samples_per_second": 27.873, + "eval_sentence_accuracy": 0.74698530335385, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 230000 + }, + { + "epoch": 0.16, + "learning_rate": 7.776767676767677e-05, + "loss": 0.4835, + "step": 230100 + }, + { + "epoch": 0.16, + "learning_rate": 7.775757575757577e-05, + "loss": 0.4869, + "step": 230200 + }, + { + "epoch": 0.16, + "learning_rate": 7.774747474747474e-05, + "loss": 0.4818, + "step": 230300 + }, + { + "epoch": 0.16, + "learning_rate": 7.773737373737374e-05, + "loss": 0.4868, + "step": 230400 + }, + { + "epoch": 0.16, + "learning_rate": 7.772727272727273e-05, + "loss": 0.486, + "step": 230500 + }, + { + "epoch": 0.16, + "learning_rate": 7.771717171717172e-05, + "loss": 0.4858, + "step": 230600 + }, + { + "epoch": 0.16, + "learning_rate": 7.770707070707071e-05, + "loss": 0.4817, + "step": 230700 + }, + { + "epoch": 0.16, + "learning_rate": 7.76969696969697e-05, + "loss": 0.4917, + "step": 230800 + }, + { + "epoch": 0.16, + "learning_rate": 7.768686868686868e-05, + "loss": 0.4868, + "step": 230900 + }, + { + "epoch": 0.16, + "learning_rate": 7.767676767676769e-05, + "loss": 0.4863, + "step": 231000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4858497445552299, + "eval_average_loss_on_sentence_tokens": 0.4450572943042769, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.4839746057987213, + "eval_non_padding_tokens_in_labels": 133.55255, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38615, + "eval_padding_tokens_in_labels": 378.44745, + "eval_reconstruction_accuracy": 0.9121249955324224, + "eval_runtime": 180.2536, + "eval_samples_per_second": 27.739, + "eval_sentence_accuracy": 0.7212437418127658, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 231000 + }, + { + "epoch": 0.16, + "learning_rate": 7.766666666666667e-05, + "loss": 0.486, + "step": 231100 + }, + { + "epoch": 0.16, + "learning_rate": 7.765656565656566e-05, + "loss": 0.4835, + "step": 231200 + }, + { + "epoch": 0.16, + "learning_rate": 7.764646464646465e-05, + "loss": 0.4858, + "step": 231300 + }, + { + "epoch": 0.16, + "learning_rate": 7.763636363636364e-05, + "loss": 0.4846, + "step": 231400 + }, + { + "epoch": 0.16, + "learning_rate": 7.762626262626262e-05, + "loss": 0.4824, + "step": 231500 + }, + { + "epoch": 0.16, + "learning_rate": 7.761616161616163e-05, + "loss": 0.4879, + "step": 231600 + }, + { + "epoch": 0.16, + "learning_rate": 7.76060606060606e-05, + "loss": 0.488, + "step": 231700 + }, + { + "epoch": 0.16, + "learning_rate": 7.75959595959596e-05, + "loss": 0.4853, + "step": 231800 + }, + { + "epoch": 0.16, + "learning_rate": 7.758585858585859e-05, + "loss": 0.4888, + "step": 231900 + }, + { + "epoch": 0.16, + "learning_rate": 7.757575757575758e-05, + "loss": 0.4853, + "step": 232000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.486700418348762, + "eval_average_loss_on_sentence_tokens": 0.42631501100492136, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4839843809604645, + "eval_non_padding_tokens_in_labels": 133.52525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38405, + "eval_padding_tokens_in_labels": 378.47475, + "eval_reconstruction_accuracy": 0.9122353929648707, + "eval_runtime": 176.4912, + "eval_samples_per_second": 28.33, + "eval_sentence_accuracy": 0.7276365137186642, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.248775, + "step": 232000 + }, + { + "epoch": 0.16, + "learning_rate": 7.756565656565657e-05, + "loss": 0.487, + "step": 232100 + }, + { + "epoch": 0.16, + "learning_rate": 7.755555555555556e-05, + "loss": 0.4871, + "step": 232200 + }, + { + "epoch": 0.16, + "learning_rate": 7.754545454545456e-05, + "loss": 0.4861, + "step": 232300 + }, + { + "epoch": 0.16, + "learning_rate": 7.753535353535353e-05, + "loss": 0.488, + "step": 232400 + }, + { + "epoch": 0.16, + "learning_rate": 7.752525252525254e-05, + "loss": 0.4899, + "step": 232500 + }, + { + "epoch": 0.16, + "learning_rate": 7.751515151515152e-05, + "loss": 0.489, + "step": 232600 + }, + { + "epoch": 0.16, + "learning_rate": 7.750505050505051e-05, + "loss": 0.4854, + "step": 232700 + }, + { + "epoch": 0.16, + "learning_rate": 7.74949494949495e-05, + "loss": 0.4797, + "step": 232800 + }, + { + "epoch": 0.16, + "learning_rate": 7.74848484848485e-05, + "loss": 0.4823, + "step": 232900 + }, + { + "epoch": 0.16, + "learning_rate": 7.747474747474747e-05, + "loss": 0.4867, + "step": 233000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.48615359109365697, + "eval_average_loss_on_sentence_tokens": 0.45149911619768607, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.48462891578674316, + "eval_non_padding_tokens_in_labels": 133.55515, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37755, + "eval_padding_tokens_in_labels": 378.44485, + "eval_reconstruction_accuracy": 0.9121217206689789, + "eval_runtime": 187.1197, + "eval_samples_per_second": 26.721, + "eval_sentence_accuracy": 0.7145997451863549, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2464, + "step": 233000 + }, + { + "epoch": 0.16, + "learning_rate": 7.746464646464648e-05, + "loss": 0.4906, + "step": 233100 + }, + { + "epoch": 0.16, + "learning_rate": 7.745454545454546e-05, + "loss": 0.4857, + "step": 233200 + }, + { + "epoch": 0.16, + "learning_rate": 7.744444444444445e-05, + "loss": 0.484, + "step": 233300 + }, + { + "epoch": 0.16, + "learning_rate": 7.743434343434344e-05, + "loss": 0.4857, + "step": 233400 + }, + { + "epoch": 0.16, + "learning_rate": 7.742424242424243e-05, + "loss": 0.4904, + "step": 233500 + }, + { + "epoch": 0.16, + "learning_rate": 7.741414141414141e-05, + "loss": 0.4864, + "step": 233600 + }, + { + "epoch": 0.16, + "learning_rate": 7.740404040404042e-05, + "loss": 0.4833, + "step": 233700 + }, + { + "epoch": 0.16, + "learning_rate": 7.73939393939394e-05, + "loss": 0.4846, + "step": 233800 + }, + { + "epoch": 0.16, + "learning_rate": 7.738383838383839e-05, + "loss": 0.4875, + "step": 233900 + }, + { + "epoch": 0.16, + "learning_rate": 7.737373737373738e-05, + "loss": 0.4882, + "step": 234000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4860179508035847, + "eval_average_loss_on_sentence_tokens": 0.4448253198677245, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.4840722680091858, + "eval_non_padding_tokens_in_labels": 133.52405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3741, + "eval_padding_tokens_in_labels": 378.47595, + "eval_reconstruction_accuracy": 0.9122213304855441, + "eval_runtime": 179.4953, + "eval_samples_per_second": 27.856, + "eval_sentence_accuracy": 0.7167486137778815, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 234000 + }, + { + "epoch": 0.16, + "learning_rate": 7.736363636363637e-05, + "loss": 0.4844, + "step": 234100 + }, + { + "epoch": 0.16, + "learning_rate": 7.735353535353535e-05, + "loss": 0.4827, + "step": 234200 + }, + { + "epoch": 0.16, + "learning_rate": 7.734343434343436e-05, + "loss": 0.486, + "step": 234300 + }, + { + "epoch": 0.16, + "learning_rate": 7.733333333333333e-05, + "loss": 0.4903, + "step": 234400 + }, + { + "epoch": 0.16, + "learning_rate": 7.732323232323233e-05, + "loss": 0.4845, + "step": 234500 + }, + { + "epoch": 0.16, + "learning_rate": 7.731313131313132e-05, + "loss": 0.4885, + "step": 234600 + }, + { + "epoch": 0.16, + "learning_rate": 7.730303030303031e-05, + "loss": 0.4849, + "step": 234700 + }, + { + "epoch": 0.16, + "learning_rate": 7.729292929292929e-05, + "loss": 0.4869, + "step": 234800 + }, + { + "epoch": 0.16, + "learning_rate": 7.72828282828283e-05, + "loss": 0.4823, + "step": 234900 + }, + { + "epoch": 0.17, + "learning_rate": 7.727272727272727e-05, + "loss": 0.486, + "step": 235000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4852291545981469, + "eval_average_loss_on_sentence_tokens": 0.48415178378197055, + "eval_average_shuffling_prob": 0.61, + "eval_loss": 0.48518553376197815, + "eval_non_padding_tokens_in_labels": 133.5386, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38975, + "eval_padding_tokens_in_labels": 378.4614, + "eval_reconstruction_accuracy": 0.9122713734375784, + "eval_runtime": 180.0468, + "eval_samples_per_second": 27.771, + "eval_sentence_accuracy": 0.6890645468085488, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2379, + "step": 235000 + }, + { + "epoch": 0.17, + "learning_rate": 7.726262626262626e-05, + "loss": 0.4886, + "step": 235100 + }, + { + "epoch": 0.17, + "learning_rate": 7.725252525252526e-05, + "loss": 0.4884, + "step": 235200 + }, + { + "epoch": 0.17, + "learning_rate": 7.724242424242425e-05, + "loss": 0.4829, + "step": 235300 + }, + { + "epoch": 0.17, + "learning_rate": 7.723232323232324e-05, + "loss": 0.4866, + "step": 235400 + }, + { + "epoch": 0.17, + "learning_rate": 7.722222222222223e-05, + "loss": 0.4875, + "step": 235500 + }, + { + "epoch": 0.17, + "learning_rate": 7.721212121212121e-05, + "loss": 0.4839, + "step": 235600 + }, + { + "epoch": 0.17, + "learning_rate": 7.72020202020202e-05, + "loss": 0.4853, + "step": 235700 + }, + { + "epoch": 0.17, + "learning_rate": 7.71919191919192e-05, + "loss": 0.4867, + "step": 235800 + }, + { + "epoch": 0.17, + "learning_rate": 7.718181818181819e-05, + "loss": 0.4852, + "step": 235900 + }, + { + "epoch": 0.17, + "learning_rate": 7.717171717171718e-05, + "loss": 0.4894, + "step": 236000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4847536195598446, + "eval_average_loss_on_sentence_tokens": 0.40005273191492124, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4808007776737213, + "eval_non_padding_tokens_in_labels": 133.5284, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3796, + "eval_padding_tokens_in_labels": 378.4716, + "eval_reconstruction_accuracy": 0.9124361725800988, + "eval_runtime": 178.6714, + "eval_samples_per_second": 27.984, + "eval_sentence_accuracy": 0.7512112620453281, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2496, + "step": 236000 + }, + { + "epoch": 0.17, + "learning_rate": 7.716161616161617e-05, + "loss": 0.4806, + "step": 236100 + }, + { + "epoch": 0.17, + "learning_rate": 7.715151515151515e-05, + "loss": 0.4883, + "step": 236200 + }, + { + "epoch": 0.17, + "learning_rate": 7.714141414141415e-05, + "loss": 0.4896, + "step": 236300 + }, + { + "epoch": 0.17, + "learning_rate": 7.713131313131313e-05, + "loss": 0.4865, + "step": 236400 + }, + { + "epoch": 0.17, + "learning_rate": 7.712121212121212e-05, + "loss": 0.4846, + "step": 236500 + }, + { + "epoch": 0.17, + "learning_rate": 7.711111111111112e-05, + "loss": 0.4883, + "step": 236600 + }, + { + "epoch": 0.17, + "learning_rate": 7.710101010101011e-05, + "loss": 0.488, + "step": 236700 + }, + { + "epoch": 0.17, + "learning_rate": 7.709090909090909e-05, + "loss": 0.4874, + "step": 236800 + }, + { + "epoch": 0.17, + "learning_rate": 7.708080808080809e-05, + "loss": 0.4847, + "step": 236900 + }, + { + "epoch": 0.17, + "learning_rate": 7.707070707070707e-05, + "loss": 0.4903, + "step": 237000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.48469509129342475, + "eval_average_loss_on_sentence_tokens": 0.3792491224197409, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.47997069358825684, + "eval_non_padding_tokens_in_labels": 133.50205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37315, + "eval_padding_tokens_in_labels": 378.49795, + "eval_reconstruction_accuracy": 0.9125010158094898, + "eval_runtime": 184.9108, + "eval_samples_per_second": 27.04, + "eval_sentence_accuracy": 0.7602284350494375, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "step": 237000 + }, + { + "epoch": 0.17, + "learning_rate": 7.706060606060606e-05, + "loss": 0.4866, + "step": 237100 + }, + { + "epoch": 0.17, + "learning_rate": 7.705050505050506e-05, + "loss": 0.4857, + "step": 237200 + }, + { + "epoch": 0.17, + "learning_rate": 7.704040404040405e-05, + "loss": 0.4895, + "step": 237300 + }, + { + "epoch": 0.17, + "learning_rate": 7.703030303030303e-05, + "loss": 0.4841, + "step": 237400 + }, + { + "epoch": 0.17, + "learning_rate": 7.702020202020203e-05, + "loss": 0.4872, + "step": 237500 + }, + { + "epoch": 0.17, + "learning_rate": 7.701010101010101e-05, + "loss": 0.4855, + "step": 237600 + }, + { + "epoch": 0.17, + "learning_rate": 7.7e-05, + "loss": 0.4853, + "step": 237700 + }, + { + "epoch": 0.17, + "learning_rate": 7.6989898989899e-05, + "loss": 0.4842, + "step": 237800 + }, + { + "epoch": 0.17, + "learning_rate": 7.697979797979799e-05, + "loss": 0.4855, + "step": 237900 + }, + { + "epoch": 0.17, + "learning_rate": 7.696969696969696e-05, + "loss": 0.4829, + "step": 238000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4837084133216466, + "eval_average_loss_on_sentence_tokens": 0.3751765846221103, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.4788281321525574, + "eval_non_padding_tokens_in_labels": 133.5075, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37475, + "eval_padding_tokens_in_labels": 378.4925, + "eval_reconstruction_accuracy": 0.912566085113673, + "eval_runtime": 190.2244, + "eval_samples_per_second": 26.285, + "eval_sentence_accuracy": 0.7755621153121467, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 238000 + }, + { + "epoch": 0.17, + "learning_rate": 7.695959595959597e-05, + "loss": 0.4838, + "step": 238100 + }, + { + "epoch": 0.17, + "learning_rate": 7.694949494949495e-05, + "loss": 0.4846, + "step": 238200 + }, + { + "epoch": 0.17, + "learning_rate": 7.693939393939394e-05, + "loss": 0.4829, + "step": 238300 + }, + { + "epoch": 0.17, + "learning_rate": 7.692929292929293e-05, + "loss": 0.4847, + "step": 238400 + }, + { + "epoch": 0.17, + "learning_rate": 7.691919191919192e-05, + "loss": 0.4861, + "step": 238500 + }, + { + "epoch": 0.17, + "learning_rate": 7.69090909090909e-05, + "loss": 0.4836, + "step": 238600 + }, + { + "epoch": 0.17, + "learning_rate": 7.689898989898991e-05, + "loss": 0.4893, + "step": 238700 + }, + { + "epoch": 0.17, + "learning_rate": 7.688888888888889e-05, + "loss": 0.4845, + "step": 238800 + }, + { + "epoch": 0.17, + "learning_rate": 7.687878787878788e-05, + "loss": 0.4788, + "step": 238900 + }, + { + "epoch": 0.17, + "learning_rate": 7.686868686868687e-05, + "loss": 0.4834, + "step": 239000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.483762997467037, + "eval_average_loss_on_sentence_tokens": 0.46538681027169276, + "eval_average_shuffling_prob": 0.565, + "eval_loss": 0.48286134004592896, + "eval_non_padding_tokens_in_labels": 133.52605, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38915, + "eval_padding_tokens_in_labels": 378.47395, + "eval_reconstruction_accuracy": 0.9122960393904543, + "eval_runtime": 181.1966, + "eval_samples_per_second": 27.594, + "eval_sentence_accuracy": 0.7116254239417159, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.245775, + "step": 239000 + }, + { + "epoch": 0.17, + "learning_rate": 7.685858585858586e-05, + "loss": 0.4842, + "step": 239100 + }, + { + "epoch": 0.17, + "learning_rate": 7.684848484848485e-05, + "loss": 0.4838, + "step": 239200 + }, + { + "epoch": 0.17, + "learning_rate": 7.683838383838385e-05, + "loss": 0.4841, + "step": 239300 + }, + { + "epoch": 0.17, + "learning_rate": 7.682828282828282e-05, + "loss": 0.4859, + "step": 239400 + }, + { + "epoch": 0.17, + "learning_rate": 7.681818181818182e-05, + "loss": 0.4821, + "step": 239500 + }, + { + "epoch": 0.17, + "learning_rate": 7.680808080808081e-05, + "loss": 0.4866, + "step": 239600 + }, + { + "epoch": 0.17, + "learning_rate": 7.67979797979798e-05, + "loss": 0.4837, + "step": 239700 + }, + { + "epoch": 0.17, + "learning_rate": 7.678787878787879e-05, + "loss": 0.4814, + "step": 239800 + }, + { + "epoch": 0.17, + "learning_rate": 7.677777777777778e-05, + "loss": 0.4851, + "step": 239900 + }, + { + "epoch": 0.17, + "learning_rate": 7.676767676767676e-05, + "loss": 0.4872, + "step": 240000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4836128154798042, + "eval_average_loss_on_sentence_tokens": 0.4551014492518385, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.4823046922683716, + "eval_non_padding_tokens_in_labels": 133.54415, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3863, + "eval_padding_tokens_in_labels": 378.45585, + "eval_reconstruction_accuracy": 0.9125078394504508, + "eval_runtime": 179.4111, + "eval_samples_per_second": 27.869, + "eval_sentence_accuracy": 0.7102975218475783, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 240000 + }, + { + "epoch": 0.17, + "learning_rate": 7.675757575757575e-05, + "loss": 0.4816, + "step": 240100 + }, + { + "epoch": 0.17, + "learning_rate": 7.674747474747475e-05, + "loss": 0.4855, + "step": 240200 + }, + { + "epoch": 0.17, + "learning_rate": 7.673737373737374e-05, + "loss": 0.4841, + "step": 240300 + }, + { + "epoch": 0.17, + "learning_rate": 7.672727272727273e-05, + "loss": 0.4837, + "step": 240400 + }, + { + "epoch": 0.17, + "learning_rate": 7.671717171717172e-05, + "loss": 0.4856, + "step": 240500 + }, + { + "epoch": 0.17, + "learning_rate": 7.670707070707071e-05, + "loss": 0.4803, + "step": 240600 + }, + { + "epoch": 0.17, + "learning_rate": 7.66969696969697e-05, + "loss": 0.4821, + "step": 240700 + }, + { + "epoch": 0.17, + "learning_rate": 7.66868686868687e-05, + "loss": 0.486, + "step": 240800 + }, + { + "epoch": 0.17, + "learning_rate": 7.667676767676768e-05, + "loss": 0.4855, + "step": 240900 + }, + { + "epoch": 0.17, + "learning_rate": 7.666666666666667e-05, + "loss": 0.4816, + "step": 241000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4827386786964679, + "eval_average_loss_on_sentence_tokens": 0.38919710604112834, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.47859373688697815, + "eval_non_padding_tokens_in_labels": 133.5114, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3864, + "eval_padding_tokens_in_labels": 378.4886, + "eval_reconstruction_accuracy": 0.9125127013471764, + "eval_runtime": 186.4071, + "eval_samples_per_second": 26.823, + "eval_sentence_accuracy": 0.7654548064672421, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 241000 + }, + { + "epoch": 0.17, + "learning_rate": 7.665656565656566e-05, + "loss": 0.4822, + "step": 241100 + }, + { + "epoch": 0.17, + "learning_rate": 7.664646464646465e-05, + "loss": 0.4838, + "step": 241200 + }, + { + "epoch": 0.17, + "learning_rate": 7.663636363636364e-05, + "loss": 0.483, + "step": 241300 + }, + { + "epoch": 0.17, + "learning_rate": 7.662626262626264e-05, + "loss": 0.4866, + "step": 241400 + }, + { + "epoch": 0.17, + "learning_rate": 7.661616161616162e-05, + "loss": 0.4847, + "step": 241500 + }, + { + "epoch": 0.17, + "learning_rate": 7.660606060606062e-05, + "loss": 0.4826, + "step": 241600 + }, + { + "epoch": 0.17, + "learning_rate": 7.65959595959596e-05, + "loss": 0.4841, + "step": 241700 + }, + { + "epoch": 0.17, + "learning_rate": 7.658585858585859e-05, + "loss": 0.4819, + "step": 241800 + }, + { + "epoch": 0.17, + "learning_rate": 7.657575757575758e-05, + "loss": 0.4848, + "step": 241900 + }, + { + "epoch": 0.17, + "learning_rate": 7.656565656565658e-05, + "loss": 0.4835, + "step": 242000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4837122275169246, + "eval_average_loss_on_sentence_tokens": 0.36139929820631683, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.47831055521965027, + "eval_non_padding_tokens_in_labels": 133.5275, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39535, + "eval_padding_tokens_in_labels": 378.4725, + "eval_reconstruction_accuracy": 0.9126062755936468, + "eval_runtime": 182.7452, + "eval_samples_per_second": 27.361, + "eval_sentence_accuracy": 0.7737990561128358, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 242000 + }, + { + "epoch": 0.17, + "learning_rate": 7.655555555555555e-05, + "loss": 0.4841, + "step": 242100 + }, + { + "epoch": 0.17, + "learning_rate": 7.654545454545456e-05, + "loss": 0.4866, + "step": 242200 + }, + { + "epoch": 0.17, + "learning_rate": 7.653535353535354e-05, + "loss": 0.482, + "step": 242300 + }, + { + "epoch": 0.17, + "learning_rate": 7.652525252525253e-05, + "loss": 0.4853, + "step": 242400 + }, + { + "epoch": 0.17, + "learning_rate": 7.651515151515152e-05, + "loss": 0.4818, + "step": 242500 + }, + { + "epoch": 0.17, + "learning_rate": 7.650505050505051e-05, + "loss": 0.4882, + "step": 242600 + }, + { + "epoch": 0.17, + "learning_rate": 7.649494949494949e-05, + "loss": 0.4804, + "step": 242700 + }, + { + "epoch": 0.17, + "learning_rate": 7.64848484848485e-05, + "loss": 0.4851, + "step": 242800 + }, + { + "epoch": 0.17, + "learning_rate": 7.647474747474748e-05, + "loss": 0.4853, + "step": 242900 + }, + { + "epoch": 0.17, + "learning_rate": 7.646464646464647e-05, + "loss": 0.4837, + "step": 243000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4823541353979427, + "eval_average_loss_on_sentence_tokens": 0.37436024426164227, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.47734373807907104, + "eval_non_padding_tokens_in_labels": 133.521, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3736, + "eval_padding_tokens_in_labels": 378.479, + "eval_reconstruction_accuracy": 0.9125293898658362, + "eval_runtime": 181.5937, + "eval_samples_per_second": 27.534, + "eval_sentence_accuracy": 0.7761991494248749, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 243000 + }, + { + "epoch": 0.17, + "learning_rate": 7.645454545454546e-05, + "loss": 0.4863, + "step": 243100 + }, + { + "epoch": 0.17, + "learning_rate": 7.644444444444445e-05, + "loss": 0.4854, + "step": 243200 + }, + { + "epoch": 0.17, + "learning_rate": 7.643434343434343e-05, + "loss": 0.4831, + "step": 243300 + }, + { + "epoch": 0.17, + "learning_rate": 7.642424242424244e-05, + "loss": 0.4827, + "step": 243400 + }, + { + "epoch": 0.17, + "learning_rate": 7.641414141414141e-05, + "loss": 0.4868, + "step": 243500 + }, + { + "epoch": 0.17, + "learning_rate": 7.64040404040404e-05, + "loss": 0.4833, + "step": 243600 + }, + { + "epoch": 0.17, + "learning_rate": 7.63939393939394e-05, + "loss": 0.4867, + "step": 243700 + }, + { + "epoch": 0.17, + "learning_rate": 7.638383838383839e-05, + "loss": 0.4826, + "step": 243800 + }, + { + "epoch": 0.17, + "learning_rate": 7.637373737373737e-05, + "loss": 0.4857, + "step": 243900 + }, + { + "epoch": 0.17, + "learning_rate": 7.636363636363637e-05, + "loss": 0.4793, + "step": 244000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.48227487520553614, + "eval_average_loss_on_sentence_tokens": 0.39066980555808045, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.47802734375, + "eval_non_padding_tokens_in_labels": 133.5328, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3901, + "eval_padding_tokens_in_labels": 378.4672, + "eval_reconstruction_accuracy": 0.9126456140210759, + "eval_runtime": 183.5898, + "eval_samples_per_second": 27.235, + "eval_sentence_accuracy": 0.7694205681267609, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 244000 + }, + { + "epoch": 0.17, + "learning_rate": 7.635353535353535e-05, + "loss": 0.4845, + "step": 244100 + }, + { + "epoch": 0.17, + "learning_rate": 7.634343434343434e-05, + "loss": 0.4846, + "step": 244200 + }, + { + "epoch": 0.17, + "learning_rate": 7.633333333333334e-05, + "loss": 0.4801, + "step": 244300 + }, + { + "epoch": 0.17, + "learning_rate": 7.632323232323233e-05, + "loss": 0.4819, + "step": 244400 + }, + { + "epoch": 0.17, + "learning_rate": 7.631313131313132e-05, + "loss": 0.4817, + "step": 244500 + }, + { + "epoch": 0.17, + "learning_rate": 7.630303030303031e-05, + "loss": 0.482, + "step": 244600 + }, + { + "epoch": 0.17, + "learning_rate": 7.629292929292929e-05, + "loss": 0.4829, + "step": 244700 + }, + { + "epoch": 0.17, + "learning_rate": 7.628282828282828e-05, + "loss": 0.4848, + "step": 244800 + }, + { + "epoch": 0.17, + "learning_rate": 7.627272727272727e-05, + "loss": 0.4806, + "step": 244900 + }, + { + "epoch": 0.17, + "learning_rate": 7.626262626262627e-05, + "loss": 0.4807, + "step": 245000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4821973628389471, + "eval_average_loss_on_sentence_tokens": 0.4122621619530543, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.47902342677116394, + "eval_non_padding_tokens_in_labels": 133.55425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3911, + "eval_padding_tokens_in_labels": 378.44575, + "eval_reconstruction_accuracy": 0.9126504670926644, + "eval_runtime": 179.8715, + "eval_samples_per_second": 27.798, + "eval_sentence_accuracy": 0.7424901753189657, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 245000 + }, + { + "epoch": 0.18, + "learning_rate": 7.625252525252526e-05, + "loss": 0.4813, + "step": 245100 + }, + { + "epoch": 0.18, + "learning_rate": 7.624242424242425e-05, + "loss": 0.485, + "step": 245200 + }, + { + "epoch": 0.18, + "learning_rate": 7.623232323232323e-05, + "loss": 0.4884, + "step": 245300 + }, + { + "epoch": 0.18, + "learning_rate": 7.622222222222223e-05, + "loss": 0.4831, + "step": 245400 + }, + { + "epoch": 0.18, + "learning_rate": 7.621212121212121e-05, + "loss": 0.4806, + "step": 245500 + }, + { + "epoch": 0.18, + "learning_rate": 7.62020202020202e-05, + "loss": 0.4836, + "step": 245600 + }, + { + "epoch": 0.18, + "learning_rate": 7.61919191919192e-05, + "loss": 0.485, + "step": 245700 + }, + { + "epoch": 0.18, + "learning_rate": 7.618181818181819e-05, + "loss": 0.479, + "step": 245800 + }, + { + "epoch": 0.18, + "learning_rate": 7.617171717171717e-05, + "loss": 0.481, + "step": 245900 + }, + { + "epoch": 0.18, + "learning_rate": 7.616161616161617e-05, + "loss": 0.4788, + "step": 246000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4811623944386545, + "eval_average_loss_on_sentence_tokens": 0.4470618104340073, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.4796484410762787, + "eval_non_padding_tokens_in_labels": 133.53455, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39055, + "eval_padding_tokens_in_labels": 378.46545, + "eval_reconstruction_accuracy": 0.9127646294449001, + "eval_runtime": 186.3216, + "eval_samples_per_second": 26.835, + "eval_sentence_accuracy": 0.7230740933479283, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.247975, + "step": 246000 + }, + { + "epoch": 0.18, + "learning_rate": 7.615151515151515e-05, + "loss": 0.4835, + "step": 246100 + }, + { + "epoch": 0.18, + "learning_rate": 7.614141414141414e-05, + "loss": 0.4817, + "step": 246200 + }, + { + "epoch": 0.18, + "learning_rate": 7.613131313131314e-05, + "loss": 0.4841, + "step": 246300 + }, + { + "epoch": 0.18, + "learning_rate": 7.612121212121213e-05, + "loss": 0.4804, + "step": 246400 + }, + { + "epoch": 0.18, + "learning_rate": 7.61111111111111e-05, + "loss": 0.4844, + "step": 246500 + }, + { + "epoch": 0.18, + "learning_rate": 7.610101010101011e-05, + "loss": 0.482, + "step": 246600 + }, + { + "epoch": 0.18, + "learning_rate": 7.609090909090909e-05, + "loss": 0.4839, + "step": 246700 + }, + { + "epoch": 0.18, + "learning_rate": 7.608080808080808e-05, + "loss": 0.4848, + "step": 246800 + }, + { + "epoch": 0.18, + "learning_rate": 7.607070707070707e-05, + "loss": 0.4858, + "step": 246900 + }, + { + "epoch": 0.18, + "learning_rate": 7.606060606060607e-05, + "loss": 0.4821, + "step": 247000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.48155233786402046, + "eval_average_loss_on_sentence_tokens": 0.3966132543835729, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.4776855409145355, + "eval_non_padding_tokens_in_labels": 133.53415, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38495, + "eval_padding_tokens_in_labels": 378.46585, + "eval_reconstruction_accuracy": 0.9126456702503736, + "eval_runtime": 178.1466, + "eval_samples_per_second": 28.067, + "eval_sentence_accuracy": 0.7575187969924813, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 247000 + }, + { + "epoch": 0.18, + "learning_rate": 7.605050505050504e-05, + "loss": 0.4821, + "step": 247100 + }, + { + "epoch": 0.18, + "learning_rate": 7.604040404040405e-05, + "loss": 0.4802, + "step": 247200 + }, + { + "epoch": 0.18, + "learning_rate": 7.603030303030303e-05, + "loss": 0.4835, + "step": 247300 + }, + { + "epoch": 0.18, + "learning_rate": 7.602020202020202e-05, + "loss": 0.4833, + "step": 247400 + }, + { + "epoch": 0.18, + "learning_rate": 7.601010101010101e-05, + "loss": 0.4813, + "step": 247500 + }, + { + "epoch": 0.18, + "learning_rate": 7.6e-05, + "loss": 0.4807, + "step": 247600 + }, + { + "epoch": 0.18, + "learning_rate": 7.598989898989898e-05, + "loss": 0.48, + "step": 247700 + }, + { + "epoch": 0.18, + "learning_rate": 7.597979797979799e-05, + "loss": 0.4797, + "step": 247800 + }, + { + "epoch": 0.18, + "learning_rate": 7.596969696969697e-05, + "loss": 0.4839, + "step": 247900 + }, + { + "epoch": 0.18, + "learning_rate": 7.595959595959596e-05, + "loss": 0.4788, + "step": 248000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.48191175396692115, + "eval_average_loss_on_sentence_tokens": 0.4348940072954946, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.4797656238079071, + "eval_non_padding_tokens_in_labels": 133.55555, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3901, + "eval_padding_tokens_in_labels": 378.44445, + "eval_reconstruction_accuracy": 0.9127286195770238, + "eval_runtime": 177.5079, + "eval_samples_per_second": 28.168, + "eval_sentence_accuracy": 0.7236169181904643, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 248000 + }, + { + "epoch": 0.18, + "learning_rate": 7.594949494949495e-05, + "loss": 0.4818, + "step": 248100 + }, + { + "epoch": 0.18, + "learning_rate": 7.593939393939394e-05, + "loss": 0.483, + "step": 248200 + }, + { + "epoch": 0.18, + "learning_rate": 7.592929292929293e-05, + "loss": 0.4771, + "step": 248300 + }, + { + "epoch": 0.18, + "learning_rate": 7.591919191919193e-05, + "loss": 0.4798, + "step": 248400 + }, + { + "epoch": 0.18, + "learning_rate": 7.59090909090909e-05, + "loss": 0.4824, + "step": 248500 + }, + { + "epoch": 0.18, + "learning_rate": 7.58989898989899e-05, + "loss": 0.4835, + "step": 248600 + }, + { + "epoch": 0.18, + "learning_rate": 7.588888888888889e-05, + "loss": 0.4827, + "step": 248700 + }, + { + "epoch": 0.18, + "learning_rate": 7.587878787878788e-05, + "loss": 0.484, + "step": 248800 + }, + { + "epoch": 0.18, + "learning_rate": 7.586868686868687e-05, + "loss": 0.4818, + "step": 248900 + }, + { + "epoch": 0.18, + "learning_rate": 7.585858585858586e-05, + "loss": 0.4816, + "step": 249000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.48123144975709536, + "eval_average_loss_on_sentence_tokens": 0.3656519036241576, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.4759863317012787, + "eval_non_padding_tokens_in_labels": 133.54405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3779, + "eval_padding_tokens_in_labels": 378.45595, + "eval_reconstruction_accuracy": 0.9128151325521946, + "eval_runtime": 182.0406, + "eval_samples_per_second": 27.466, + "eval_sentence_accuracy": 0.7808423205986326, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.245775, + "step": 249000 + }, + { + "epoch": 0.18, + "learning_rate": 7.584848484848486e-05, + "loss": 0.4773, + "step": 249100 + }, + { + "epoch": 0.18, + "learning_rate": 7.583838383838384e-05, + "loss": 0.4822, + "step": 249200 + }, + { + "epoch": 0.18, + "learning_rate": 7.582828282828284e-05, + "loss": 0.4805, + "step": 249300 + }, + { + "epoch": 0.18, + "learning_rate": 7.581818181818182e-05, + "loss": 0.4836, + "step": 249400 + }, + { + "epoch": 0.18, + "learning_rate": 7.580808080808081e-05, + "loss": 0.4846, + "step": 249500 + }, + { + "epoch": 0.18, + "learning_rate": 7.57979797979798e-05, + "loss": 0.4792, + "step": 249600 + }, + { + "epoch": 0.18, + "learning_rate": 7.57878787878788e-05, + "loss": 0.48, + "step": 249700 + }, + { + "epoch": 0.18, + "learning_rate": 7.577777777777779e-05, + "loss": 0.4784, + "step": 249800 + }, + { + "epoch": 0.18, + "learning_rate": 7.576767676767678e-05, + "loss": 0.4801, + "step": 249900 + }, + { + "epoch": 0.18, + "learning_rate": 7.575757575757576e-05, + "loss": 0.4834, + "step": 250000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.48110276768373067, + "eval_average_loss_on_sentence_tokens": 0.36085632247341287, + "eval_average_shuffling_prob": 0.43, + "eval_loss": 0.4757031202316284, + "eval_non_padding_tokens_in_labels": 133.5107, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3782, + "eval_padding_tokens_in_labels": 378.4893, + "eval_reconstruction_accuracy": 0.9128212352076264, + "eval_runtime": 182.9947, + "eval_samples_per_second": 27.323, + "eval_sentence_accuracy": 0.7811114899420388, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24509999999999996, + "step": 250000 + }, + { + "epoch": 0.18, + "learning_rate": 7.574747474747475e-05, + "loss": 0.483, + "step": 250100 + }, + { + "epoch": 0.18, + "learning_rate": 7.573737373737374e-05, + "loss": 0.4829, + "step": 250200 + }, + { + "epoch": 0.18, + "learning_rate": 7.572727272727273e-05, + "loss": 0.4841, + "step": 250300 + }, + { + "epoch": 0.18, + "learning_rate": 7.571717171717173e-05, + "loss": 0.4853, + "step": 250400 + }, + { + "epoch": 0.18, + "learning_rate": 7.570707070707072e-05, + "loss": 0.4808, + "step": 250500 + }, + { + "epoch": 0.18, + "learning_rate": 7.56969696969697e-05, + "loss": 0.4808, + "step": 250600 + }, + { + "epoch": 0.18, + "learning_rate": 7.56868686868687e-05, + "loss": 0.4848, + "step": 250700 + }, + { + "epoch": 0.18, + "learning_rate": 7.567676767676768e-05, + "loss": 0.4797, + "step": 250800 + }, + { + "epoch": 0.18, + "learning_rate": 7.566666666666667e-05, + "loss": 0.4817, + "step": 250900 + }, + { + "epoch": 0.18, + "learning_rate": 7.565656565656566e-05, + "loss": 0.4819, + "step": 251000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.48151109591979063, + "eval_average_loss_on_sentence_tokens": 0.387205169283176, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.47728514671325684, + "eval_non_padding_tokens_in_labels": 133.5418, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3822, + "eval_padding_tokens_in_labels": 378.4582, + "eval_reconstruction_accuracy": 0.9128757448457676, + "eval_runtime": 191.3675, + "eval_samples_per_second": 26.128, + "eval_sentence_accuracy": 0.7523552317548047, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 251000 + }, + { + "epoch": 0.18, + "learning_rate": 7.564646464646466e-05, + "loss": 0.4818, + "step": 251100 + }, + { + "epoch": 0.18, + "learning_rate": 7.563636363636363e-05, + "loss": 0.4837, + "step": 251200 + }, + { + "epoch": 0.18, + "learning_rate": 7.562626262626264e-05, + "loss": 0.4807, + "step": 251300 + }, + { + "epoch": 0.18, + "learning_rate": 7.561616161616162e-05, + "loss": 0.4797, + "step": 251400 + }, + { + "epoch": 0.18, + "learning_rate": 7.560606060606061e-05, + "loss": 0.482, + "step": 251500 + }, + { + "epoch": 0.18, + "learning_rate": 7.55959595959596e-05, + "loss": 0.48, + "step": 251600 + }, + { + "epoch": 0.18, + "learning_rate": 7.55858585858586e-05, + "loss": 0.4785, + "step": 251700 + }, + { + "epoch": 0.18, + "learning_rate": 7.557575757575757e-05, + "loss": 0.48, + "step": 251800 + }, + { + "epoch": 0.18, + "learning_rate": 7.556565656565658e-05, + "loss": 0.4837, + "step": 251900 + }, + { + "epoch": 0.18, + "learning_rate": 7.555555555555556e-05, + "loss": 0.4796, + "step": 252000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4802777692263714, + "eval_average_loss_on_sentence_tokens": 0.35580009132894436, + "eval_average_shuffling_prob": 0.415, + "eval_loss": 0.47461915016174316, + "eval_non_padding_tokens_in_labels": 133.50995, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38275, + "eval_padding_tokens_in_labels": 378.49005, + "eval_reconstruction_accuracy": 0.9129636153966799, + "eval_runtime": 181.299, + "eval_samples_per_second": 27.579, + "eval_sentence_accuracy": 0.7869210616038904, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.242775, + "step": 252000 + }, + { + "epoch": 0.18, + "learning_rate": 7.554545454545455e-05, + "loss": 0.4812, + "step": 252100 + }, + { + "epoch": 0.18, + "learning_rate": 7.553535353535354e-05, + "loss": 0.4815, + "step": 252200 + }, + { + "epoch": 0.18, + "learning_rate": 7.552525252525253e-05, + "loss": 0.4858, + "step": 252300 + }, + { + "epoch": 0.18, + "learning_rate": 7.551515151515151e-05, + "loss": 0.4808, + "step": 252400 + }, + { + "epoch": 0.18, + "learning_rate": 7.550505050505052e-05, + "loss": 0.4821, + "step": 252500 + }, + { + "epoch": 0.18, + "learning_rate": 7.54949494949495e-05, + "loss": 0.4799, + "step": 252600 + }, + { + "epoch": 0.18, + "learning_rate": 7.548484848484849e-05, + "loss": 0.4798, + "step": 252700 + }, + { + "epoch": 0.18, + "learning_rate": 7.547474747474748e-05, + "loss": 0.4833, + "step": 252800 + }, + { + "epoch": 0.18, + "learning_rate": 7.546464646464647e-05, + "loss": 0.4827, + "step": 252900 + }, + { + "epoch": 0.18, + "learning_rate": 7.545454545454545e-05, + "loss": 0.4814, + "step": 253000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4810182866455235, + "eval_average_loss_on_sentence_tokens": 0.33261096629052994, + "eval_average_shuffling_prob": 0.39, + "eval_loss": 0.4743359386920929, + "eval_non_padding_tokens_in_labels": 133.5162, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38745, + "eval_padding_tokens_in_labels": 378.4838, + "eval_reconstruction_accuracy": 0.9127721813573659, + "eval_runtime": 184.2412, + "eval_samples_per_second": 27.138, + "eval_sentence_accuracy": 0.8045875428427871, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2379, + "step": 253000 + }, + { + "epoch": 0.18, + "learning_rate": 7.544444444444445e-05, + "loss": 0.4818, + "step": 253100 + }, + { + "epoch": 0.18, + "learning_rate": 7.543434343434343e-05, + "loss": 0.4789, + "step": 253200 + }, + { + "epoch": 0.18, + "learning_rate": 7.542424242424243e-05, + "loss": 0.4807, + "step": 253300 + }, + { + "epoch": 0.18, + "learning_rate": 7.541414141414142e-05, + "loss": 0.481, + "step": 253400 + }, + { + "epoch": 0.18, + "learning_rate": 7.540404040404041e-05, + "loss": 0.4815, + "step": 253500 + }, + { + "epoch": 0.18, + "learning_rate": 7.53939393939394e-05, + "loss": 0.4819, + "step": 253600 + }, + { + "epoch": 0.18, + "learning_rate": 7.538383838383839e-05, + "loss": 0.4788, + "step": 253700 + }, + { + "epoch": 0.18, + "learning_rate": 7.537373737373737e-05, + "loss": 0.4848, + "step": 253800 + }, + { + "epoch": 0.18, + "learning_rate": 7.536363636363636e-05, + "loss": 0.4811, + "step": 253900 + }, + { + "epoch": 0.18, + "learning_rate": 7.535353535353536e-05, + "loss": 0.4837, + "step": 254000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.47936545453857415, + "eval_average_loss_on_sentence_tokens": 0.39922644979288674, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.47578126192092896, + "eval_non_padding_tokens_in_labels": 133.55125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37245, + "eval_padding_tokens_in_labels": 378.44875, + "eval_reconstruction_accuracy": 0.9129646703062752, + "eval_runtime": 181.2893, + "eval_samples_per_second": 27.58, + "eval_sentence_accuracy": 0.7629694761964577, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24909999999999996, + "step": 254000 + }, + { + "epoch": 0.18, + "learning_rate": 7.534343434343435e-05, + "loss": 0.483, + "step": 254100 + }, + { + "epoch": 0.18, + "learning_rate": 7.533333333333334e-05, + "loss": 0.4795, + "step": 254200 + }, + { + "epoch": 0.18, + "learning_rate": 7.532323232323233e-05, + "loss": 0.4819, + "step": 254300 + }, + { + "epoch": 0.18, + "learning_rate": 7.531313131313131e-05, + "loss": 0.4841, + "step": 254400 + }, + { + "epoch": 0.18, + "learning_rate": 7.530303030303032e-05, + "loss": 0.4803, + "step": 254500 + }, + { + "epoch": 0.18, + "learning_rate": 7.52929292929293e-05, + "loss": 0.4807, + "step": 254600 + }, + { + "epoch": 0.18, + "learning_rate": 7.528282828282829e-05, + "loss": 0.4803, + "step": 254700 + }, + { + "epoch": 0.18, + "learning_rate": 7.527272727272728e-05, + "loss": 0.4813, + "step": 254800 + }, + { + "epoch": 0.18, + "learning_rate": 7.526262626262627e-05, + "loss": 0.4781, + "step": 254900 + }, + { + "epoch": 0.18, + "learning_rate": 7.525252525252525e-05, + "loss": 0.475, + "step": 255000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.48034412244919666, + "eval_average_loss_on_sentence_tokens": 0.3878262588569598, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.4761718809604645, + "eval_non_padding_tokens_in_labels": 133.51575, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37555, + "eval_padding_tokens_in_labels": 378.48425, + "eval_reconstruction_accuracy": 0.9128988627553319, + "eval_runtime": 180.8359, + "eval_samples_per_second": 27.649, + "eval_sentence_accuracy": 0.7615922263893624, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 255000 + }, + { + "epoch": 0.19, + "learning_rate": 7.524242424242425e-05, + "loss": 0.4822, + "step": 255100 + }, + { + "epoch": 0.19, + "learning_rate": 7.523232323232323e-05, + "loss": 0.4789, + "step": 255200 + }, + { + "epoch": 0.19, + "learning_rate": 7.522222222222222e-05, + "loss": 0.4815, + "step": 255300 + }, + { + "epoch": 0.19, + "learning_rate": 7.521212121212122e-05, + "loss": 0.4805, + "step": 255400 + }, + { + "epoch": 0.19, + "learning_rate": 7.520202020202021e-05, + "loss": 0.4809, + "step": 255500 + }, + { + "epoch": 0.19, + "learning_rate": 7.519191919191919e-05, + "loss": 0.4797, + "step": 255600 + }, + { + "epoch": 0.19, + "learning_rate": 7.518181818181819e-05, + "loss": 0.4823, + "step": 255700 + }, + { + "epoch": 0.19, + "learning_rate": 7.517171717171717e-05, + "loss": 0.4772, + "step": 255800 + }, + { + "epoch": 0.19, + "learning_rate": 7.516161616161616e-05, + "loss": 0.4812, + "step": 255900 + }, + { + "epoch": 0.19, + "learning_rate": 7.515151515151515e-05, + "loss": 0.4818, + "step": 256000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.4797645035444745, + "eval_average_loss_on_sentence_tokens": 0.4039595736864431, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.47635743021965027, + "eval_non_padding_tokens_in_labels": 133.5482, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38255, + "eval_padding_tokens_in_labels": 378.4518, + "eval_reconstruction_accuracy": 0.9129806132401815, + "eval_runtime": 177.6759, + "eval_samples_per_second": 28.141, + "eval_sentence_accuracy": 0.7480395499488578, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 256000 + }, + { + "epoch": 0.19, + "learning_rate": 7.514141414141415e-05, + "loss": 0.4772, + "step": 256100 + }, + { + "epoch": 0.19, + "learning_rate": 7.513131313131312e-05, + "loss": 0.4827, + "step": 256200 + }, + { + "epoch": 0.19, + "learning_rate": 7.512121212121213e-05, + "loss": 0.4782, + "step": 256300 + }, + { + "epoch": 0.19, + "learning_rate": 7.511111111111111e-05, + "loss": 0.4777, + "step": 256400 + }, + { + "epoch": 0.19, + "learning_rate": 7.51010101010101e-05, + "loss": 0.4774, + "step": 256500 + }, + { + "epoch": 0.19, + "learning_rate": 7.509090909090909e-05, + "loss": 0.4808, + "step": 256600 + }, + { + "epoch": 0.19, + "learning_rate": 7.508080808080808e-05, + "loss": 0.4787, + "step": 256700 + }, + { + "epoch": 0.19, + "learning_rate": 7.507070707070706e-05, + "loss": 0.4825, + "step": 256800 + }, + { + "epoch": 0.19, + "learning_rate": 7.506060606060607e-05, + "loss": 0.4799, + "step": 256900 + }, + { + "epoch": 0.19, + "learning_rate": 7.505050505050505e-05, + "loss": 0.4803, + "step": 257000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.48031092958239135, + "eval_average_loss_on_sentence_tokens": 0.37283986486778947, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.47550782561302185, + "eval_non_padding_tokens_in_labels": 133.5634, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38185, + "eval_padding_tokens_in_labels": 378.4366, + "eval_reconstruction_accuracy": 0.9129689846258411, + "eval_runtime": 180.1543, + "eval_samples_per_second": 27.754, + "eval_sentence_accuracy": 0.7671864625764889, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.247975, + "step": 257000 + }, + { + "epoch": 0.19, + "learning_rate": 7.504040404040404e-05, + "loss": 0.4822, + "step": 257100 + }, + { + "epoch": 0.19, + "learning_rate": 7.503030303030303e-05, + "loss": 0.4836, + "step": 257200 + }, + { + "epoch": 0.19, + "learning_rate": 7.502020202020202e-05, + "loss": 0.4775, + "step": 257300 + }, + { + "epoch": 0.19, + "learning_rate": 7.5010101010101e-05, + "loss": 0.4871, + "step": 257400 + }, + { + "epoch": 0.19, + "learning_rate": 7.500000000000001e-05, + "loss": 0.4793, + "step": 257500 + }, + { + "epoch": 0.19, + "learning_rate": 7.4989898989899e-05, + "loss": 0.4801, + "step": 257600 + }, + { + "epoch": 0.19, + "learning_rate": 7.497979797979798e-05, + "loss": 0.4792, + "step": 257700 + }, + { + "epoch": 0.19, + "learning_rate": 7.496969696969698e-05, + "loss": 0.4799, + "step": 257800 + }, + { + "epoch": 0.19, + "learning_rate": 7.495959595959596e-05, + "loss": 0.4803, + "step": 257900 + }, + { + "epoch": 0.19, + "learning_rate": 7.494949494949495e-05, + "loss": 0.4791, + "step": 258000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.4790597115735894, + "eval_average_loss_on_sentence_tokens": 0.4228429353165998, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.47652342915534973, + "eval_non_padding_tokens_in_labels": 133.5589, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3839, + "eval_padding_tokens_in_labels": 378.4411, + "eval_reconstruction_accuracy": 0.91304966362925, + "eval_runtime": 244.1524, + "eval_samples_per_second": 20.479, + "eval_sentence_accuracy": 0.740780949988336, + "eval_steps_per_second": 0.053, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 258000 + }, + { + "epoch": 0.19, + "learning_rate": 7.493939393939395e-05, + "loss": 0.48, + "step": 258100 + }, + { + "epoch": 0.19, + "learning_rate": 7.492929292929294e-05, + "loss": 0.4791, + "step": 258200 + }, + { + "epoch": 0.19, + "learning_rate": 7.491919191919192e-05, + "loss": 0.4807, + "step": 258300 + }, + { + "epoch": 0.19, + "learning_rate": 7.490909090909092e-05, + "loss": 0.4783, + "step": 258400 + }, + { + "epoch": 0.19, + "learning_rate": 7.48989898989899e-05, + "loss": 0.4781, + "step": 258500 + }, + { + "epoch": 0.19, + "learning_rate": 7.488888888888889e-05, + "loss": 0.4808, + "step": 258600 + }, + { + "epoch": 0.19, + "learning_rate": 7.487878787878788e-05, + "loss": 0.4816, + "step": 258700 + }, + { + "epoch": 0.19, + "learning_rate": 7.486868686868688e-05, + "loss": 0.4766, + "step": 258800 + }, + { + "epoch": 0.19, + "learning_rate": 7.485858585858587e-05, + "loss": 0.4795, + "step": 258900 + }, + { + "epoch": 0.19, + "learning_rate": 7.484848484848486e-05, + "loss": 0.4802, + "step": 259000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.47870044338514206, + "eval_average_loss_on_sentence_tokens": 0.3682073759484576, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4737499952316284, + "eval_non_padding_tokens_in_labels": 133.51935, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.376, + "eval_padding_tokens_in_labels": 378.48065, + "eval_reconstruction_accuracy": 0.9132630211541529, + "eval_runtime": 178.1224, + "eval_samples_per_second": 28.071, + "eval_sentence_accuracy": 0.7674735765427889, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 259000 + }, + { + "epoch": 0.19, + "learning_rate": 7.483838383838384e-05, + "loss": 0.4755, + "step": 259100 + }, + { + "epoch": 0.19, + "learning_rate": 7.482828282828283e-05, + "loss": 0.4811, + "step": 259200 + }, + { + "epoch": 0.19, + "learning_rate": 7.481818181818182e-05, + "loss": 0.4792, + "step": 259300 + }, + { + "epoch": 0.19, + "learning_rate": 7.480808080808081e-05, + "loss": 0.4783, + "step": 259400 + }, + { + "epoch": 0.19, + "learning_rate": 7.47979797979798e-05, + "loss": 0.4809, + "step": 259500 + }, + { + "epoch": 0.19, + "learning_rate": 7.47878787878788e-05, + "loss": 0.479, + "step": 259600 + }, + { + "epoch": 0.19, + "learning_rate": 7.477777777777778e-05, + "loss": 0.4788, + "step": 259700 + }, + { + "epoch": 0.19, + "learning_rate": 7.476767676767678e-05, + "loss": 0.4773, + "step": 259800 + }, + { + "epoch": 0.19, + "learning_rate": 7.475757575757576e-05, + "loss": 0.4785, + "step": 259900 + }, + { + "epoch": 0.19, + "learning_rate": 7.474747474747475e-05, + "loss": 0.4825, + "step": 260000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.47932753309069515, + "eval_average_loss_on_sentence_tokens": 0.4038685085031143, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.47593748569488525, + "eval_non_padding_tokens_in_labels": 133.5562, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37715, + "eval_padding_tokens_in_labels": 378.4438, + "eval_reconstruction_accuracy": 0.9130088013193992, + "eval_runtime": 186.8075, + "eval_samples_per_second": 26.766, + "eval_sentence_accuracy": 0.7495738152062734, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 260000 + }, + { + "epoch": 0.0, + "learning_rate": 7.473737373737374e-05, + "loss": 0.4788, + "step": 260100 + }, + { + "epoch": 0.0, + "learning_rate": 7.472727272727274e-05, + "loss": 0.4807, + "step": 260200 + }, + { + "epoch": 0.0, + "learning_rate": 7.471717171717171e-05, + "loss": 0.4769, + "step": 260300 + }, + { + "epoch": 0.0, + "learning_rate": 7.470707070707072e-05, + "loss": 0.4773, + "step": 260400 + }, + { + "epoch": 0.0, + "learning_rate": 7.46969696969697e-05, + "loss": 0.4802, + "step": 260500 + }, + { + "epoch": 0.0, + "learning_rate": 7.468686868686869e-05, + "loss": 0.4815, + "step": 260600 + }, + { + "epoch": 0.0, + "learning_rate": 7.467676767676768e-05, + "loss": 0.4794, + "step": 260700 + }, + { + "epoch": 0.0, + "learning_rate": 7.466666666666667e-05, + "loss": 0.48, + "step": 260800 + }, + { + "epoch": 0.0, + "learning_rate": 7.465656565656565e-05, + "loss": 0.48, + "step": 260900 + }, + { + "epoch": 0.0, + "learning_rate": 7.464646464646466e-05, + "loss": 0.476, + "step": 261000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.4791197285770289, + "eval_average_loss_on_sentence_tokens": 0.4069141705405721, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.47586914896965027, + "eval_non_padding_tokens_in_labels": 133.5345, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38245, + "eval_padding_tokens_in_labels": 378.4655, + "eval_reconstruction_accuracy": 0.9130238457610392, + "eval_runtime": 151.0836, + "eval_samples_per_second": 33.094, + "eval_sentence_accuracy": 0.7434905880452922, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.2499, + "step": 261000 + }, + { + "epoch": 0.0, + "learning_rate": 7.463636363636364e-05, + "loss": 0.4804, + "step": 261100 + }, + { + "epoch": 0.0, + "learning_rate": 7.462626262626263e-05, + "loss": 0.4784, + "step": 261200 + }, + { + "epoch": 0.0, + "learning_rate": 7.461616161616162e-05, + "loss": 0.477, + "step": 261300 + }, + { + "epoch": 0.0, + "learning_rate": 7.460606060606061e-05, + "loss": 0.4771, + "step": 261400 + }, + { + "epoch": 0.0, + "learning_rate": 7.459595959595959e-05, + "loss": 0.4775, + "step": 261500 + }, + { + "epoch": 0.0, + "learning_rate": 7.45858585858586e-05, + "loss": 0.4807, + "step": 261600 + }, + { + "epoch": 0.0, + "learning_rate": 7.457575757575758e-05, + "loss": 0.4772, + "step": 261700 + }, + { + "epoch": 0.0, + "learning_rate": 7.456565656565657e-05, + "loss": 0.4756, + "step": 261800 + }, + { + "epoch": 0.0, + "learning_rate": 7.455555555555556e-05, + "loss": 0.484, + "step": 261900 + }, + { + "epoch": 0.0, + "learning_rate": 7.454545454545455e-05, + "loss": 0.4761, + "step": 262000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.4791238471457653, + "eval_average_loss_on_sentence_tokens": 0.3913848008184978, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.47511717677116394, + "eval_non_padding_tokens_in_labels": 133.53785, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39015, + "eval_padding_tokens_in_labels": 378.46215, + "eval_reconstruction_accuracy": 0.9130468031663344, + "eval_runtime": 150.9956, + "eval_samples_per_second": 33.114, + "eval_sentence_accuracy": 0.7533780752597484, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.2496, + "step": 262000 + }, + { + "epoch": 0.0, + "learning_rate": 7.453535353535353e-05, + "loss": 0.4772, + "step": 262100 + }, + { + "epoch": 0.0, + "learning_rate": 7.452525252525254e-05, + "loss": 0.4791, + "step": 262200 + }, + { + "epoch": 0.0, + "learning_rate": 7.451515151515151e-05, + "loss": 0.4789, + "step": 262300 + }, + { + "epoch": 0.0, + "learning_rate": 7.45050505050505e-05, + "loss": 0.478, + "step": 262400 + }, + { + "epoch": 0.0, + "learning_rate": 7.44949494949495e-05, + "loss": 0.4804, + "step": 262500 + }, + { + "epoch": 0.0, + "learning_rate": 7.448484848484849e-05, + "loss": 0.4797, + "step": 262600 + }, + { + "epoch": 0.0, + "learning_rate": 7.447474747474748e-05, + "loss": 0.482, + "step": 262700 + }, + { + "epoch": 0.0, + "learning_rate": 7.446464646464647e-05, + "loss": 0.4809, + "step": 262800 + }, + { + "epoch": 0.0, + "learning_rate": 7.445454545454545e-05, + "loss": 0.4815, + "step": 262900 + }, + { + "epoch": 0.0, + "learning_rate": 7.444444444444444e-05, + "loss": 0.4806, + "step": 263000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.4785978450413963, + "eval_average_loss_on_sentence_tokens": 0.39632589013257846, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.474853515625, + "eval_non_padding_tokens_in_labels": 133.51615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38255, + "eval_padding_tokens_in_labels": 378.48385, + "eval_reconstruction_accuracy": 0.9130233508189599, + "eval_runtime": 148.1371, + "eval_samples_per_second": 33.753, + "eval_sentence_accuracy": 0.7595599978466453, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24909999999999996, + "step": 263000 + }, + { + "epoch": 0.0, + "learning_rate": 7.443434343434344e-05, + "loss": 0.4789, + "step": 263100 + }, + { + "epoch": 0.0, + "learning_rate": 7.442424242424243e-05, + "loss": 0.4797, + "step": 263200 + }, + { + "epoch": 0.0, + "learning_rate": 7.441414141414142e-05, + "loss": 0.4772, + "step": 263300 + }, + { + "epoch": 0.0, + "learning_rate": 7.440404040404041e-05, + "loss": 0.4806, + "step": 263400 + }, + { + "epoch": 0.0, + "learning_rate": 7.439393939393939e-05, + "loss": 0.4759, + "step": 263500 + }, + { + "epoch": 0.0, + "learning_rate": 7.438383838383838e-05, + "loss": 0.4809, + "step": 263600 + }, + { + "epoch": 0.0, + "learning_rate": 7.437373737373737e-05, + "loss": 0.4774, + "step": 263700 + }, + { + "epoch": 0.0, + "learning_rate": 7.436363636363637e-05, + "loss": 0.4774, + "step": 263800 + }, + { + "epoch": 0.0, + "learning_rate": 7.435353535353536e-05, + "loss": 0.4738, + "step": 263900 + }, + { + "epoch": 0.0, + "learning_rate": 7.434343434343435e-05, + "loss": 0.4759, + "step": 264000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.47793219273927284, + "eval_average_loss_on_sentence_tokens": 0.4080352563270182, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.47480469942092896, + "eval_non_padding_tokens_in_labels": 133.5246, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37835, + "eval_padding_tokens_in_labels": 378.4754, + "eval_reconstruction_accuracy": 0.913107254770963, + "eval_runtime": 148.7331, + "eval_samples_per_second": 33.617, + "eval_sentence_accuracy": 0.7477344913596641, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 264000 + }, + { + "epoch": 0.0, + "learning_rate": 7.433333333333333e-05, + "loss": 0.4799, + "step": 264100 + }, + { + "epoch": 0.0, + "learning_rate": 7.432323232323233e-05, + "loss": 0.4764, + "step": 264200 + }, + { + "epoch": 0.0, + "learning_rate": 7.431313131313131e-05, + "loss": 0.4813, + "step": 264300 + }, + { + "epoch": 0.0, + "learning_rate": 7.43030303030303e-05, + "loss": 0.4788, + "step": 264400 + }, + { + "epoch": 0.0, + "learning_rate": 7.42929292929293e-05, + "loss": 0.4769, + "step": 264500 + }, + { + "epoch": 0.0, + "learning_rate": 7.428282828282829e-05, + "loss": 0.4806, + "step": 264600 + }, + { + "epoch": 0.0, + "learning_rate": 7.427272727272727e-05, + "loss": 0.4767, + "step": 264700 + }, + { + "epoch": 0.0, + "learning_rate": 7.426262626262627e-05, + "loss": 0.4774, + "step": 264800 + }, + { + "epoch": 0.0, + "learning_rate": 7.425252525252525e-05, + "loss": 0.4772, + "step": 264900 + }, + { + "epoch": 0.01, + "learning_rate": 7.424242424242424e-05, + "loss": 0.4776, + "step": 265000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.47778010057175163, + "eval_average_loss_on_sentence_tokens": 0.3993969814377167, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.47422850131988525, + "eval_non_padding_tokens_in_labels": 133.5534, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38315, + "eval_padding_tokens_in_labels": 378.4466, + "eval_reconstruction_accuracy": 0.9131475527508934, + "eval_runtime": 148.4781, + "eval_samples_per_second": 33.675, + "eval_sentence_accuracy": 0.7508927449889641, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 265000 + }, + { + "epoch": 0.01, + "learning_rate": 7.423232323232323e-05, + "loss": 0.4785, + "step": 265100 + }, + { + "epoch": 0.01, + "learning_rate": 7.422222222222223e-05, + "loss": 0.4817, + "step": 265200 + }, + { + "epoch": 0.01, + "learning_rate": 7.42121212121212e-05, + "loss": 0.4789, + "step": 265300 + }, + { + "epoch": 0.01, + "learning_rate": 7.420202020202021e-05, + "loss": 0.4785, + "step": 265400 + }, + { + "epoch": 0.01, + "learning_rate": 7.419191919191919e-05, + "loss": 0.4741, + "step": 265500 + }, + { + "epoch": 0.01, + "learning_rate": 7.418181818181818e-05, + "loss": 0.4787, + "step": 265600 + }, + { + "epoch": 0.01, + "learning_rate": 7.417171717171717e-05, + "loss": 0.4737, + "step": 265700 + }, + { + "epoch": 0.01, + "learning_rate": 7.416161616161617e-05, + "loss": 0.4768, + "step": 265800 + }, + { + "epoch": 0.01, + "learning_rate": 7.415151515151514e-05, + "loss": 0.4777, + "step": 265900 + }, + { + "epoch": 0.01, + "learning_rate": 7.414141414141415e-05, + "loss": 0.4776, + "step": 266000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4775587842072621, + "eval_average_loss_on_sentence_tokens": 0.3956433577670722, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.47383788228034973, + "eval_non_padding_tokens_in_labels": 133.50755, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3743, + "eval_padding_tokens_in_labels": 378.49245, + "eval_reconstruction_accuracy": 0.9132901597286268, + "eval_runtime": 148.3378, + "eval_samples_per_second": 33.707, + "eval_sentence_accuracy": 0.7560428517594703, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.249775, + "step": 266000 + }, + { + "epoch": 0.01, + "learning_rate": 7.413131313131314e-05, + "loss": 0.4729, + "step": 266100 + }, + { + "epoch": 0.01, + "learning_rate": 7.412121212121212e-05, + "loss": 0.4771, + "step": 266200 + }, + { + "epoch": 0.01, + "learning_rate": 7.411111111111113e-05, + "loss": 0.4757, + "step": 266300 + }, + { + "epoch": 0.01, + "learning_rate": 7.41010101010101e-05, + "loss": 0.4768, + "step": 266400 + }, + { + "epoch": 0.01, + "learning_rate": 7.40909090909091e-05, + "loss": 0.4798, + "step": 266500 + }, + { + "epoch": 0.01, + "learning_rate": 7.408080808080809e-05, + "loss": 0.4759, + "step": 266600 + }, + { + "epoch": 0.01, + "learning_rate": 7.407070707070708e-05, + "loss": 0.4774, + "step": 266700 + }, + { + "epoch": 0.01, + "learning_rate": 7.406060606060606e-05, + "loss": 0.4761, + "step": 266800 + }, + { + "epoch": 0.01, + "learning_rate": 7.405050505050506e-05, + "loss": 0.4751, + "step": 266900 + }, + { + "epoch": 0.01, + "learning_rate": 7.404040404040404e-05, + "loss": 0.4785, + "step": 267000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.47770858024416096, + "eval_average_loss_on_sentence_tokens": 0.42749929798989755, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4754980504512787, + "eval_non_padding_tokens_in_labels": 133.5482, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3746, + "eval_padding_tokens_in_labels": 378.4518, + "eval_reconstruction_accuracy": 0.9132221157549606, + "eval_runtime": 146.7201, + "eval_samples_per_second": 34.078, + "eval_sentence_accuracy": 0.7311491736501158, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.248775, + "step": 267000 + }, + { + "epoch": 0.01, + "learning_rate": 7.403030303030303e-05, + "loss": 0.4794, + "step": 267100 + }, + { + "epoch": 0.01, + "learning_rate": 7.402020202020203e-05, + "loss": 0.4793, + "step": 267200 + }, + { + "epoch": 0.01, + "learning_rate": 7.401010101010102e-05, + "loss": 0.4766, + "step": 267300 + }, + { + "epoch": 0.01, + "learning_rate": 7.4e-05, + "loss": 0.4783, + "step": 267400 + }, + { + "epoch": 0.01, + "learning_rate": 7.3989898989899e-05, + "loss": 0.4769, + "step": 267500 + }, + { + "epoch": 0.01, + "learning_rate": 7.397979797979798e-05, + "loss": 0.4753, + "step": 267600 + }, + { + "epoch": 0.01, + "learning_rate": 7.396969696969697e-05, + "loss": 0.4754, + "step": 267700 + }, + { + "epoch": 0.01, + "learning_rate": 7.395959595959596e-05, + "loss": 0.4795, + "step": 267800 + }, + { + "epoch": 0.01, + "learning_rate": 7.394949494949496e-05, + "loss": 0.4809, + "step": 267900 + }, + { + "epoch": 0.01, + "learning_rate": 7.393939393939395e-05, + "loss": 0.4763, + "step": 268000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.47667596559763736, + "eval_average_loss_on_sentence_tokens": 0.42630040270221137, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4744336009025574, + "eval_non_padding_tokens_in_labels": 133.55045, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38195, + "eval_padding_tokens_in_labels": 378.44955, + "eval_reconstruction_accuracy": 0.9132565197103296, + "eval_runtime": 148.3338, + "eval_samples_per_second": 33.708, + "eval_sentence_accuracy": 0.7364787266495595, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 268000 + }, + { + "epoch": 0.01, + "learning_rate": 7.392929292929294e-05, + "loss": 0.4795, + "step": 268100 + }, + { + "epoch": 0.01, + "learning_rate": 7.391919191919192e-05, + "loss": 0.4771, + "step": 268200 + }, + { + "epoch": 0.01, + "learning_rate": 7.390909090909091e-05, + "loss": 0.4763, + "step": 268300 + }, + { + "epoch": 0.01, + "learning_rate": 7.38989898989899e-05, + "loss": 0.476, + "step": 268400 + }, + { + "epoch": 0.01, + "learning_rate": 7.38888888888889e-05, + "loss": 0.4772, + "step": 268500 + }, + { + "epoch": 0.01, + "learning_rate": 7.387878787878789e-05, + "loss": 0.4754, + "step": 268600 + }, + { + "epoch": 0.01, + "learning_rate": 7.386868686868688e-05, + "loss": 0.48, + "step": 268700 + }, + { + "epoch": 0.01, + "learning_rate": 7.385858585858586e-05, + "loss": 0.4808, + "step": 268800 + }, + { + "epoch": 0.01, + "learning_rate": 7.384848484848486e-05, + "loss": 0.4768, + "step": 268900 + }, + { + "epoch": 0.01, + "learning_rate": 7.383838383838384e-05, + "loss": 0.4768, + "step": 269000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.47612698008141546, + "eval_average_loss_on_sentence_tokens": 0.40671469281444644, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.47294920682907104, + "eval_non_padding_tokens_in_labels": 133.5502, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37955, + "eval_padding_tokens_in_labels": 378.4498, + "eval_reconstruction_accuracy": 0.9133392767649097, + "eval_runtime": 147.2126, + "eval_samples_per_second": 33.964, + "eval_sentence_accuracy": 0.7464379923555906, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 269000 + }, + { + "epoch": 0.01, + "learning_rate": 7.382828282828283e-05, + "loss": 0.4807, + "step": 269100 + }, + { + "epoch": 0.01, + "learning_rate": 7.381818181818182e-05, + "loss": 0.4761, + "step": 269200 + }, + { + "epoch": 0.01, + "learning_rate": 7.380808080808082e-05, + "loss": 0.4751, + "step": 269300 + }, + { + "epoch": 0.01, + "learning_rate": 7.37979797979798e-05, + "loss": 0.4757, + "step": 269400 + }, + { + "epoch": 0.01, + "learning_rate": 7.37878787878788e-05, + "loss": 0.4755, + "step": 269500 + }, + { + "epoch": 0.01, + "learning_rate": 7.377777777777778e-05, + "loss": 0.4784, + "step": 269600 + }, + { + "epoch": 0.01, + "learning_rate": 7.376767676767677e-05, + "loss": 0.4756, + "step": 269700 + }, + { + "epoch": 0.01, + "learning_rate": 7.375757575757576e-05, + "loss": 0.4752, + "step": 269800 + }, + { + "epoch": 0.01, + "learning_rate": 7.374747474747476e-05, + "loss": 0.4758, + "step": 269900 + }, + { + "epoch": 0.01, + "learning_rate": 7.373737373737373e-05, + "loss": 0.4796, + "step": 270000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4758860254846748, + "eval_average_loss_on_sentence_tokens": 0.38250728115680444, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.47163087129592896, + "eval_non_padding_tokens_in_labels": 133.5016, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3713, + "eval_padding_tokens_in_labels": 378.4984, + "eval_reconstruction_accuracy": 0.9133644788343239, + "eval_runtime": 149.1484, + "eval_samples_per_second": 33.524, + "eval_sentence_accuracy": 0.7692366357420999, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 270000 + }, + { + "epoch": 0.01, + "learning_rate": 7.372727272727274e-05, + "loss": 0.4753, + "step": 270100 + }, + { + "epoch": 0.01, + "learning_rate": 7.371717171717172e-05, + "loss": 0.4787, + "step": 270200 + }, + { + "epoch": 0.01, + "learning_rate": 7.370707070707071e-05, + "loss": 0.474, + "step": 270300 + }, + { + "epoch": 0.01, + "learning_rate": 7.36969696969697e-05, + "loss": 0.4747, + "step": 270400 + }, + { + "epoch": 0.01, + "learning_rate": 7.36868686868687e-05, + "loss": 0.4796, + "step": 270500 + }, + { + "epoch": 0.01, + "learning_rate": 7.367676767676767e-05, + "loss": 0.4784, + "step": 270600 + }, + { + "epoch": 0.01, + "learning_rate": 7.366666666666668e-05, + "loss": 0.4759, + "step": 270700 + }, + { + "epoch": 0.01, + "learning_rate": 7.365656565656566e-05, + "loss": 0.4789, + "step": 270800 + }, + { + "epoch": 0.01, + "learning_rate": 7.364646464646465e-05, + "loss": 0.4739, + "step": 270900 + }, + { + "epoch": 0.01, + "learning_rate": 7.363636363636364e-05, + "loss": 0.4771, + "step": 271000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4757755676204635, + "eval_average_loss_on_sentence_tokens": 0.4109868436002649, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.47297850251197815, + "eval_non_padding_tokens_in_labels": 133.51135, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3948, + "eval_padding_tokens_in_labels": 378.48865, + "eval_reconstruction_accuracy": 0.9134419499069648, + "eval_runtime": 145.3094, + "eval_samples_per_second": 34.409, + "eval_sentence_accuracy": 0.7447242808692376, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.25, + "step": 271000 + }, + { + "epoch": 0.01, + "learning_rate": 7.362626262626263e-05, + "loss": 0.4734, + "step": 271100 + }, + { + "epoch": 0.01, + "learning_rate": 7.361616161616161e-05, + "loss": 0.4734, + "step": 271200 + }, + { + "epoch": 0.01, + "learning_rate": 7.360606060606062e-05, + "loss": 0.4785, + "step": 271300 + }, + { + "epoch": 0.01, + "learning_rate": 7.35959595959596e-05, + "loss": 0.4771, + "step": 271400 + }, + { + "epoch": 0.01, + "learning_rate": 7.358585858585859e-05, + "loss": 0.4753, + "step": 271500 + }, + { + "epoch": 0.01, + "learning_rate": 7.357575757575758e-05, + "loss": 0.4733, + "step": 271600 + }, + { + "epoch": 0.01, + "learning_rate": 7.356565656565657e-05, + "loss": 0.4782, + "step": 271700 + }, + { + "epoch": 0.01, + "learning_rate": 7.355555555555556e-05, + "loss": 0.4728, + "step": 271800 + }, + { + "epoch": 0.01, + "learning_rate": 7.354545454545455e-05, + "loss": 0.4725, + "step": 271900 + }, + { + "epoch": 0.01, + "learning_rate": 7.353535353535353e-05, + "loss": 0.4754, + "step": 272000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4765018702712024, + "eval_average_loss_on_sentence_tokens": 0.40603774000900306, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.47331055998802185, + "eval_non_padding_tokens_in_labels": 133.5109, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37835, + "eval_padding_tokens_in_labels": 378.4891, + "eval_reconstruction_accuracy": 0.9133845289282799, + "eval_runtime": 149.8023, + "eval_samples_per_second": 33.377, + "eval_sentence_accuracy": 0.7429971109157141, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.2499, + "step": 272000 + }, + { + "epoch": 0.01, + "learning_rate": 7.352525252525252e-05, + "loss": 0.4777, + "step": 272100 + }, + { + "epoch": 0.01, + "learning_rate": 7.351515151515152e-05, + "loss": 0.4761, + "step": 272200 + }, + { + "epoch": 0.01, + "learning_rate": 7.350505050505051e-05, + "loss": 0.4774, + "step": 272300 + }, + { + "epoch": 0.01, + "learning_rate": 7.34949494949495e-05, + "loss": 0.4783, + "step": 272400 + }, + { + "epoch": 0.01, + "learning_rate": 7.348484848484849e-05, + "loss": 0.4755, + "step": 272500 + }, + { + "epoch": 0.01, + "learning_rate": 7.347474747474747e-05, + "loss": 0.4761, + "step": 272600 + }, + { + "epoch": 0.01, + "learning_rate": 7.346464646464646e-05, + "loss": 0.4771, + "step": 272700 + }, + { + "epoch": 0.01, + "learning_rate": 7.345454545454545e-05, + "loss": 0.4742, + "step": 272800 + }, + { + "epoch": 0.01, + "learning_rate": 7.344444444444445e-05, + "loss": 0.4745, + "step": 272900 + }, + { + "epoch": 0.01, + "learning_rate": 7.343434343434344e-05, + "loss": 0.4775, + "step": 273000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.47556038933517486, + "eval_average_loss_on_sentence_tokens": 0.3675144960351984, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.47066405415534973, + "eval_non_padding_tokens_in_labels": 133.5139, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3668, + "eval_padding_tokens_in_labels": 378.4861, + "eval_reconstruction_accuracy": 0.9134689396031366, + "eval_runtime": 146.5806, + "eval_samples_per_second": 34.111, + "eval_sentence_accuracy": 0.7734087605648967, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 273000 + }, + { + "epoch": 0.01, + "learning_rate": 7.342424242424243e-05, + "loss": 0.4784, + "step": 273100 + }, + { + "epoch": 0.01, + "learning_rate": 7.341414141414141e-05, + "loss": 0.4753, + "step": 273200 + }, + { + "epoch": 0.01, + "learning_rate": 7.340404040404041e-05, + "loss": 0.4761, + "step": 273300 + }, + { + "epoch": 0.01, + "learning_rate": 7.339393939393939e-05, + "loss": 0.4787, + "step": 273400 + }, + { + "epoch": 0.01, + "learning_rate": 7.338383838383839e-05, + "loss": 0.4777, + "step": 273500 + }, + { + "epoch": 0.01, + "learning_rate": 7.337373737373738e-05, + "loss": 0.4728, + "step": 273600 + }, + { + "epoch": 0.01, + "learning_rate": 7.336363636363637e-05, + "loss": 0.4765, + "step": 273700 + }, + { + "epoch": 0.01, + "learning_rate": 7.335353535353535e-05, + "loss": 0.4783, + "step": 273800 + }, + { + "epoch": 0.01, + "learning_rate": 7.334343434343435e-05, + "loss": 0.4722, + "step": 273900 + }, + { + "epoch": 0.01, + "learning_rate": 7.333333333333333e-05, + "loss": 0.4729, + "step": 274000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4765153068660365, + "eval_average_loss_on_sentence_tokens": 0.43528873580830446, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.4747265577316284, + "eval_non_padding_tokens_in_labels": 133.54485, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3796, + "eval_padding_tokens_in_labels": 378.45515, + "eval_reconstruction_accuracy": 0.9133469300512458, + "eval_runtime": 146.0063, + "eval_samples_per_second": 34.245, + "eval_sentence_accuracy": 0.7198844366285643, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.246975, + "step": 274000 + }, + { + "epoch": 0.01, + "learning_rate": 7.332323232323232e-05, + "loss": 0.4718, + "step": 274100 + }, + { + "epoch": 0.01, + "learning_rate": 7.331313131313132e-05, + "loss": 0.4766, + "step": 274200 + }, + { + "epoch": 0.01, + "learning_rate": 7.330303030303031e-05, + "loss": 0.4761, + "step": 274300 + }, + { + "epoch": 0.01, + "learning_rate": 7.329292929292929e-05, + "loss": 0.4743, + "step": 274400 + }, + { + "epoch": 0.01, + "learning_rate": 7.328282828282829e-05, + "loss": 0.478, + "step": 274500 + }, + { + "epoch": 0.01, + "learning_rate": 7.327272727272728e-05, + "loss": 0.4796, + "step": 274600 + }, + { + "epoch": 0.01, + "learning_rate": 7.326262626262626e-05, + "loss": 0.4798, + "step": 274700 + }, + { + "epoch": 0.01, + "learning_rate": 7.325252525252527e-05, + "loss": 0.4744, + "step": 274800 + }, + { + "epoch": 0.01, + "learning_rate": 7.324242424242425e-05, + "loss": 0.4768, + "step": 274900 + }, + { + "epoch": 0.01, + "learning_rate": 7.323232323232324e-05, + "loss": 0.4786, + "step": 275000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.47609075452377575, + "eval_average_loss_on_sentence_tokens": 0.4169782428267823, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.47337889671325684, + "eval_non_padding_tokens_in_labels": 133.5353, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3717, + "eval_padding_tokens_in_labels": 378.4647, + "eval_reconstruction_accuracy": 0.9133942521471835, + "eval_runtime": 148.0152, + "eval_samples_per_second": 33.78, + "eval_sentence_accuracy": 0.7388160137814704, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 275000 + }, + { + "epoch": 0.02, + "learning_rate": 7.322222222222223e-05, + "loss": 0.4801, + "step": 275100 + }, + { + "epoch": 0.02, + "learning_rate": 7.321212121212122e-05, + "loss": 0.4763, + "step": 275200 + }, + { + "epoch": 0.02, + "learning_rate": 7.32020202020202e-05, + "loss": 0.4753, + "step": 275300 + }, + { + "epoch": 0.02, + "learning_rate": 7.31919191919192e-05, + "loss": 0.4767, + "step": 275400 + }, + { + "epoch": 0.02, + "learning_rate": 7.318181818181818e-05, + "loss": 0.4758, + "step": 275500 + }, + { + "epoch": 0.02, + "learning_rate": 7.317171717171718e-05, + "loss": 0.4787, + "step": 275600 + }, + { + "epoch": 0.02, + "learning_rate": 7.316161616161617e-05, + "loss": 0.4772, + "step": 275700 + }, + { + "epoch": 0.02, + "learning_rate": 7.315151515151516e-05, + "loss": 0.4692, + "step": 275800 + }, + { + "epoch": 0.02, + "learning_rate": 7.314141414141414e-05, + "loss": 0.475, + "step": 275900 + }, + { + "epoch": 0.02, + "learning_rate": 7.313131313131314e-05, + "loss": 0.4766, + "step": 276000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.47448624235103765, + "eval_average_loss_on_sentence_tokens": 0.3989126079932304, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4710351526737213, + "eval_non_padding_tokens_in_labels": 133.53425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36285, + "eval_padding_tokens_in_labels": 378.46575, + "eval_reconstruction_accuracy": 0.9134890504069832, + "eval_runtime": 149.9786, + "eval_samples_per_second": 33.338, + "eval_sentence_accuracy": 0.7579808710319953, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.2496, + "step": 276000 + }, + { + "epoch": 0.02, + "learning_rate": 7.312121212121212e-05, + "loss": 0.4776, + "step": 276100 + }, + { + "epoch": 0.02, + "learning_rate": 7.311111111111111e-05, + "loss": 0.4758, + "step": 276200 + }, + { + "epoch": 0.02, + "learning_rate": 7.31010101010101e-05, + "loss": 0.4758, + "step": 276300 + }, + { + "epoch": 0.02, + "learning_rate": 7.30909090909091e-05, + "loss": 0.4768, + "step": 276400 + }, + { + "epoch": 0.02, + "learning_rate": 7.308080808080808e-05, + "loss": 0.4736, + "step": 276500 + }, + { + "epoch": 0.02, + "learning_rate": 7.307070707070708e-05, + "loss": 0.4746, + "step": 276600 + }, + { + "epoch": 0.02, + "learning_rate": 7.306060606060606e-05, + "loss": 0.4745, + "step": 276700 + }, + { + "epoch": 0.02, + "learning_rate": 7.305050505050505e-05, + "loss": 0.4734, + "step": 276800 + }, + { + "epoch": 0.02, + "learning_rate": 7.304040404040404e-05, + "loss": 0.4782, + "step": 276900 + }, + { + "epoch": 0.02, + "learning_rate": 7.303030303030304e-05, + "loss": 0.4773, + "step": 277000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4750508897823913, + "eval_average_loss_on_sentence_tokens": 0.3744779222556951, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.47047850489616394, + "eval_non_padding_tokens_in_labels": 133.55205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37205, + "eval_padding_tokens_in_labels": 378.44795, + "eval_reconstruction_accuracy": 0.9134781540475414, + "eval_runtime": 151.1853, + "eval_samples_per_second": 33.072, + "eval_sentence_accuracy": 0.7716142982755217, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 277000 + }, + { + "epoch": 0.02, + "learning_rate": 7.302020202020203e-05, + "loss": 0.4715, + "step": 277100 + }, + { + "epoch": 0.02, + "learning_rate": 7.301010101010102e-05, + "loss": 0.4756, + "step": 277200 + }, + { + "epoch": 0.02, + "learning_rate": 7.3e-05, + "loss": 0.4749, + "step": 277300 + }, + { + "epoch": 0.02, + "learning_rate": 7.298989898989899e-05, + "loss": 0.4739, + "step": 277400 + }, + { + "epoch": 0.02, + "learning_rate": 7.297979797979798e-05, + "loss": 0.4773, + "step": 277500 + }, + { + "epoch": 0.02, + "learning_rate": 7.296969696969697e-05, + "loss": 0.4782, + "step": 277600 + }, + { + "epoch": 0.02, + "learning_rate": 7.295959595959597e-05, + "loss": 0.4719, + "step": 277700 + }, + { + "epoch": 0.02, + "learning_rate": 7.294949494949496e-05, + "loss": 0.4757, + "step": 277800 + }, + { + "epoch": 0.02, + "learning_rate": 7.293939393939394e-05, + "loss": 0.4785, + "step": 277900 + }, + { + "epoch": 0.02, + "learning_rate": 7.292929292929293e-05, + "loss": 0.4741, + "step": 278000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.47456715042833664, + "eval_average_loss_on_sentence_tokens": 0.4143826653010609, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4717773497104645, + "eval_non_padding_tokens_in_labels": 133.50525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37475, + "eval_padding_tokens_in_labels": 378.49475, + "eval_reconstruction_accuracy": 0.9135974960043951, + "eval_runtime": 148.2718, + "eval_samples_per_second": 33.722, + "eval_sentence_accuracy": 0.7361826403718126, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 278000 + }, + { + "epoch": 0.02, + "learning_rate": 7.291919191919192e-05, + "loss": 0.4713, + "step": 278100 + }, + { + "epoch": 0.02, + "learning_rate": 7.290909090909091e-05, + "loss": 0.4769, + "step": 278200 + }, + { + "epoch": 0.02, + "learning_rate": 7.28989898989899e-05, + "loss": 0.476, + "step": 278300 + }, + { + "epoch": 0.02, + "learning_rate": 7.28888888888889e-05, + "loss": 0.473, + "step": 278400 + }, + { + "epoch": 0.02, + "learning_rate": 7.287878787878788e-05, + "loss": 0.4754, + "step": 278500 + }, + { + "epoch": 0.02, + "learning_rate": 7.286868686868688e-05, + "loss": 0.4748, + "step": 278600 + }, + { + "epoch": 0.02, + "learning_rate": 7.285858585858586e-05, + "loss": 0.4747, + "step": 278700 + }, + { + "epoch": 0.02, + "learning_rate": 7.284848484848485e-05, + "loss": 0.4679, + "step": 278800 + }, + { + "epoch": 0.02, + "learning_rate": 7.283838383838384e-05, + "loss": 0.4733, + "step": 278900 + }, + { + "epoch": 0.02, + "learning_rate": 7.282828282828284e-05, + "loss": 0.4735, + "step": 279000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4746034851638501, + "eval_average_loss_on_sentence_tokens": 0.43787291750529145, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.4729296863079071, + "eval_non_padding_tokens_in_labels": 133.542, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3859, + "eval_padding_tokens_in_labels": 378.458, + "eval_reconstruction_accuracy": 0.913557821313909, + "eval_runtime": 145.1315, + "eval_samples_per_second": 34.452, + "eval_sentence_accuracy": 0.7259945807238861, + "eval_steps_per_second": 0.09, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 279000 + }, + { + "epoch": 0.02, + "learning_rate": 7.281818181818181e-05, + "loss": 0.4737, + "step": 279100 + }, + { + "epoch": 0.02, + "learning_rate": 7.280808080808082e-05, + "loss": 0.4779, + "step": 279200 + }, + { + "epoch": 0.02, + "learning_rate": 7.27979797979798e-05, + "loss": 0.4743, + "step": 279300 + }, + { + "epoch": 0.02, + "learning_rate": 7.278787878787879e-05, + "loss": 0.4745, + "step": 279400 + }, + { + "epoch": 0.02, + "learning_rate": 7.277777777777778e-05, + "loss": 0.4735, + "step": 279500 + }, + { + "epoch": 0.02, + "learning_rate": 7.276767676767677e-05, + "loss": 0.4761, + "step": 279600 + }, + { + "epoch": 0.02, + "learning_rate": 7.275757575757575e-05, + "loss": 0.4743, + "step": 279700 + }, + { + "epoch": 0.02, + "learning_rate": 7.274747474747476e-05, + "loss": 0.4735, + "step": 279800 + }, + { + "epoch": 0.02, + "learning_rate": 7.273737373737374e-05, + "loss": 0.4776, + "step": 279900 + }, + { + "epoch": 0.02, + "learning_rate": 7.272727272727273e-05, + "loss": 0.4731, + "step": 280000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.47388517690577736, + "eval_average_loss_on_sentence_tokens": 0.38617531775763636, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4698828160762787, + "eval_non_padding_tokens_in_labels": 133.51305, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38525, + "eval_padding_tokens_in_labels": 378.48695, + "eval_reconstruction_accuracy": 0.9136141410936649, + "eval_runtime": 146.6826, + "eval_samples_per_second": 34.087, + "eval_sentence_accuracy": 0.7683169738187952, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 280000 + }, + { + "epoch": 0.02, + "learning_rate": 7.271717171717172e-05, + "loss": 0.4785, + "step": 280100 + }, + { + "epoch": 0.02, + "learning_rate": 7.270707070707071e-05, + "loss": 0.4748, + "step": 280200 + }, + { + "epoch": 0.02, + "learning_rate": 7.269696969696969e-05, + "loss": 0.4758, + "step": 280300 + }, + { + "epoch": 0.02, + "learning_rate": 7.26868686868687e-05, + "loss": 0.4776, + "step": 280400 + }, + { + "epoch": 0.02, + "learning_rate": 7.267676767676767e-05, + "loss": 0.4722, + "step": 280500 + }, + { + "epoch": 0.02, + "learning_rate": 7.266666666666667e-05, + "loss": 0.4753, + "step": 280600 + }, + { + "epoch": 0.02, + "learning_rate": 7.265656565656566e-05, + "loss": 0.4748, + "step": 280700 + }, + { + "epoch": 0.02, + "learning_rate": 7.264646464646465e-05, + "loss": 0.4757, + "step": 280800 + }, + { + "epoch": 0.02, + "learning_rate": 7.263636363636363e-05, + "loss": 0.4748, + "step": 280900 + }, + { + "epoch": 0.02, + "learning_rate": 7.262626262626263e-05, + "loss": 0.4701, + "step": 281000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.47442266333547567, + "eval_average_loss_on_sentence_tokens": 0.41848130867668254, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.4718261659145355, + "eval_non_padding_tokens_in_labels": 133.54025, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36695, + "eval_padding_tokens_in_labels": 378.45975, + "eval_reconstruction_accuracy": 0.9136522000087455, + "eval_runtime": 149.9539, + "eval_samples_per_second": 33.344, + "eval_sentence_accuracy": 0.7398926911550954, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.2496, + "step": 281000 + }, + { + "epoch": 0.02, + "learning_rate": 7.261616161616161e-05, + "loss": 0.4771, + "step": 281100 + }, + { + "epoch": 0.02, + "learning_rate": 7.26060606060606e-05, + "loss": 0.4723, + "step": 281200 + }, + { + "epoch": 0.02, + "learning_rate": 7.25959595959596e-05, + "loss": 0.4745, + "step": 281300 + }, + { + "epoch": 0.02, + "learning_rate": 7.258585858585859e-05, + "loss": 0.4753, + "step": 281400 + }, + { + "epoch": 0.02, + "learning_rate": 7.257575757575758e-05, + "loss": 0.4721, + "step": 281500 + }, + { + "epoch": 0.02, + "learning_rate": 7.256565656565657e-05, + "loss": 0.4736, + "step": 281600 + }, + { + "epoch": 0.02, + "learning_rate": 7.255555555555555e-05, + "loss": 0.4763, + "step": 281700 + }, + { + "epoch": 0.02, + "learning_rate": 7.254545454545454e-05, + "loss": 0.4791, + "step": 281800 + }, + { + "epoch": 0.02, + "learning_rate": 7.253535353535354e-05, + "loss": 0.4745, + "step": 281900 + }, + { + "epoch": 0.02, + "learning_rate": 7.252525252525253e-05, + "loss": 0.4734, + "step": 282000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.47401853379737885, + "eval_average_loss_on_sentence_tokens": 0.4074157560824644, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.47098633646965027, + "eval_non_padding_tokens_in_labels": 133.53425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38025, + "eval_padding_tokens_in_labels": 378.46575, + "eval_reconstruction_accuracy": 0.9135399300783985, + "eval_runtime": 147.1265, + "eval_samples_per_second": 33.984, + "eval_sentence_accuracy": 0.750820966497389, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.2499, + "step": 282000 + }, + { + "epoch": 0.02, + "learning_rate": 7.251515151515152e-05, + "loss": 0.4729, + "step": 282100 + }, + { + "epoch": 0.02, + "learning_rate": 7.250505050505051e-05, + "loss": 0.4729, + "step": 282200 + }, + { + "epoch": 0.02, + "learning_rate": 7.249494949494949e-05, + "loss": 0.4759, + "step": 282300 + }, + { + "epoch": 0.02, + "learning_rate": 7.24848484848485e-05, + "loss": 0.4748, + "step": 282400 + }, + { + "epoch": 0.02, + "learning_rate": 7.247474747474747e-05, + "loss": 0.4784, + "step": 282500 + }, + { + "epoch": 0.02, + "learning_rate": 7.246464646464647e-05, + "loss": 0.4757, + "step": 282600 + }, + { + "epoch": 0.02, + "learning_rate": 7.245454545454546e-05, + "loss": 0.4767, + "step": 282700 + }, + { + "epoch": 0.02, + "learning_rate": 7.244444444444445e-05, + "loss": 0.473, + "step": 282800 + }, + { + "epoch": 0.02, + "learning_rate": 7.243434343434343e-05, + "loss": 0.4768, + "step": 282900 + }, + { + "epoch": 0.02, + "learning_rate": 7.242424242424243e-05, + "loss": 0.4741, + "step": 283000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.47409192353059637, + "eval_average_loss_on_sentence_tokens": 0.372683400477421, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4694238305091858, + "eval_non_padding_tokens_in_labels": 133.51405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3734, + "eval_padding_tokens_in_labels": 378.48595, + "eval_reconstruction_accuracy": 0.9136330641171949, + "eval_runtime": 148.0502, + "eval_samples_per_second": 33.772, + "eval_sentence_accuracy": 0.7698377806090405, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 283000 + }, + { + "epoch": 0.02, + "learning_rate": 7.241414141414141e-05, + "loss": 0.4749, + "step": 283100 + }, + { + "epoch": 0.02, + "learning_rate": 7.24040404040404e-05, + "loss": 0.4729, + "step": 283200 + }, + { + "epoch": 0.02, + "learning_rate": 7.239393939393941e-05, + "loss": 0.4746, + "step": 283300 + }, + { + "epoch": 0.02, + "learning_rate": 7.238383838383839e-05, + "loss": 0.4739, + "step": 283400 + }, + { + "epoch": 0.02, + "learning_rate": 7.237373737373738e-05, + "loss": 0.4754, + "step": 283500 + }, + { + "epoch": 0.02, + "learning_rate": 7.236363636363637e-05, + "loss": 0.473, + "step": 283600 + }, + { + "epoch": 0.02, + "learning_rate": 7.235353535353536e-05, + "loss": 0.4768, + "step": 283700 + }, + { + "epoch": 0.02, + "learning_rate": 7.234343434343434e-05, + "loss": 0.472, + "step": 283800 + }, + { + "epoch": 0.02, + "learning_rate": 7.233333333333335e-05, + "loss": 0.4712, + "step": 283900 + }, + { + "epoch": 0.02, + "learning_rate": 7.232323232323233e-05, + "loss": 0.4692, + "step": 284000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4734677617721194, + "eval_average_loss_on_sentence_tokens": 0.3957510939995585, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4699999988079071, + "eval_non_padding_tokens_in_labels": 133.52385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36955, + "eval_padding_tokens_in_labels": 378.47615, + "eval_reconstruction_accuracy": 0.9136547590036262, + "eval_runtime": 145.2075, + "eval_samples_per_second": 34.433, + "eval_sentence_accuracy": 0.7604123674340983, + "eval_steps_per_second": 0.09, + "eval_variance_shuffling_prob": 0.2496, + "step": 284000 + }, + { + "epoch": 0.02, + "learning_rate": 7.231313131313132e-05, + "loss": 0.4766, + "step": 284100 + }, + { + "epoch": 0.02, + "learning_rate": 7.230303030303031e-05, + "loss": 0.4706, + "step": 284200 + }, + { + "epoch": 0.02, + "learning_rate": 7.22929292929293e-05, + "loss": 0.4756, + "step": 284300 + }, + { + "epoch": 0.02, + "learning_rate": 7.228282828282828e-05, + "loss": 0.4707, + "step": 284400 + }, + { + "epoch": 0.02, + "learning_rate": 7.227272727272729e-05, + "loss": 0.4723, + "step": 284500 + }, + { + "epoch": 0.02, + "learning_rate": 7.226262626262626e-05, + "loss": 0.4712, + "step": 284600 + }, + { + "epoch": 0.02, + "learning_rate": 7.225252525252526e-05, + "loss": 0.4777, + "step": 284700 + }, + { + "epoch": 0.02, + "learning_rate": 7.224242424242425e-05, + "loss": 0.476, + "step": 284800 + }, + { + "epoch": 0.02, + "learning_rate": 7.223232323232324e-05, + "loss": 0.4775, + "step": 284900 + }, + { + "epoch": 0.03, + "learning_rate": 7.222222222222222e-05, + "loss": 0.4753, + "step": 285000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4735775234748386, + "eval_average_loss_on_sentence_tokens": 0.39273069215220335, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4699121117591858, + "eval_non_padding_tokens_in_labels": 133.5173, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3833, + "eval_padding_tokens_in_labels": 378.4827, + "eval_reconstruction_accuracy": 0.9136377758307969, + "eval_runtime": 149.8885, + "eval_samples_per_second": 33.358, + "eval_sentence_accuracy": 0.7543650295189046, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.2499, + "step": 285000 + }, + { + "epoch": 0.03, + "learning_rate": 7.221212121212122e-05, + "loss": 0.4724, + "step": 285100 + }, + { + "epoch": 0.03, + "learning_rate": 7.22020202020202e-05, + "loss": 0.4719, + "step": 285200 + }, + { + "epoch": 0.03, + "learning_rate": 7.21919191919192e-05, + "loss": 0.4704, + "step": 285300 + }, + { + "epoch": 0.03, + "learning_rate": 7.218181818181819e-05, + "loss": 0.4742, + "step": 285400 + }, + { + "epoch": 0.03, + "learning_rate": 7.217171717171718e-05, + "loss": 0.4768, + "step": 285500 + }, + { + "epoch": 0.03, + "learning_rate": 7.216161616161616e-05, + "loss": 0.4767, + "step": 285600 + }, + { + "epoch": 0.03, + "learning_rate": 7.215151515151516e-05, + "loss": 0.4741, + "step": 285700 + }, + { + "epoch": 0.03, + "learning_rate": 7.214141414141414e-05, + "loss": 0.4743, + "step": 285800 + }, + { + "epoch": 0.03, + "learning_rate": 7.213131313131313e-05, + "loss": 0.4746, + "step": 285900 + }, + { + "epoch": 0.03, + "learning_rate": 7.212121212121213e-05, + "loss": 0.4723, + "step": 286000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4745365829511455, + "eval_average_loss_on_sentence_tokens": 0.4114421695169088, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.47172850370407104, + "eval_non_padding_tokens_in_labels": 133.55635, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37945, + "eval_padding_tokens_in_labels": 378.44365, + "eval_reconstruction_accuracy": 0.9136593363280846, + "eval_runtime": 153.9201, + "eval_samples_per_second": 32.484, + "eval_sentence_accuracy": 0.7322393094909111, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 286000 + }, + { + "epoch": 0.03, + "learning_rate": 7.211111111111112e-05, + "loss": 0.4726, + "step": 286100 + }, + { + "epoch": 0.03, + "learning_rate": 7.210101010101011e-05, + "loss": 0.4724, + "step": 286200 + }, + { + "epoch": 0.03, + "learning_rate": 7.20909090909091e-05, + "loss": 0.4715, + "step": 286300 + }, + { + "epoch": 0.03, + "learning_rate": 7.208080808080808e-05, + "loss": 0.4731, + "step": 286400 + }, + { + "epoch": 0.03, + "learning_rate": 7.207070707070707e-05, + "loss": 0.4727, + "step": 286500 + }, + { + "epoch": 0.03, + "learning_rate": 7.206060606060606e-05, + "loss": 0.4749, + "step": 286600 + }, + { + "epoch": 0.03, + "learning_rate": 7.205050505050506e-05, + "loss": 0.476, + "step": 286700 + }, + { + "epoch": 0.03, + "learning_rate": 7.204040404040405e-05, + "loss": 0.4747, + "step": 286800 + }, + { + "epoch": 0.03, + "learning_rate": 7.203030303030304e-05, + "loss": 0.4737, + "step": 286900 + }, + { + "epoch": 0.03, + "learning_rate": 7.202020202020202e-05, + "loss": 0.4723, + "step": 287000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4729571948221328, + "eval_average_loss_on_sentence_tokens": 0.41664597157116084, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.47041991353034973, + "eval_non_padding_tokens_in_labels": 133.52295, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37585, + "eval_padding_tokens_in_labels": 378.47705, + "eval_reconstruction_accuracy": 0.9137245897318657, + "eval_runtime": 151.011, + "eval_samples_per_second": 33.11, + "eval_sentence_accuracy": 0.7382731889389345, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 287000 + }, + { + "epoch": 0.03, + "learning_rate": 7.201010101010101e-05, + "loss": 0.4749, + "step": 287100 + }, + { + "epoch": 0.03, + "learning_rate": 7.2e-05, + "loss": 0.4762, + "step": 287200 + }, + { + "epoch": 0.03, + "learning_rate": 7.1989898989899e-05, + "loss": 0.4723, + "step": 287300 + }, + { + "epoch": 0.03, + "learning_rate": 7.197979797979799e-05, + "loss": 0.4749, + "step": 287400 + }, + { + "epoch": 0.03, + "learning_rate": 7.196969696969698e-05, + "loss": 0.4772, + "step": 287500 + }, + { + "epoch": 0.03, + "learning_rate": 7.195959595959596e-05, + "loss": 0.4735, + "step": 287600 + }, + { + "epoch": 0.03, + "learning_rate": 7.194949494949496e-05, + "loss": 0.477, + "step": 287700 + }, + { + "epoch": 0.03, + "learning_rate": 7.193939393939394e-05, + "loss": 0.4704, + "step": 287800 + }, + { + "epoch": 0.03, + "learning_rate": 7.192929292929293e-05, + "loss": 0.4715, + "step": 287900 + }, + { + "epoch": 0.03, + "learning_rate": 7.191919191919192e-05, + "loss": 0.4739, + "step": 288000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.47334077356144394, + "eval_average_loss_on_sentence_tokens": 0.3790724025355685, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.46907225251197815, + "eval_non_padding_tokens_in_labels": 133.5426, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37945, + "eval_padding_tokens_in_labels": 378.4574, + "eval_reconstruction_accuracy": 0.913596940130399, + "eval_runtime": 151.2706, + "eval_samples_per_second": 33.053, + "eval_sentence_accuracy": 0.7706722055735998, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 288000 + }, + { + "epoch": 0.03, + "learning_rate": 7.190909090909092e-05, + "loss": 0.4712, + "step": 288100 + }, + { + "epoch": 0.03, + "learning_rate": 7.18989898989899e-05, + "loss": 0.473, + "step": 288200 + }, + { + "epoch": 0.03, + "learning_rate": 7.18888888888889e-05, + "loss": 0.4738, + "step": 288300 + }, + { + "epoch": 0.03, + "learning_rate": 7.187878787878788e-05, + "loss": 0.473, + "step": 288400 + }, + { + "epoch": 0.03, + "learning_rate": 7.186868686868687e-05, + "loss": 0.4717, + "step": 288500 + }, + { + "epoch": 0.03, + "learning_rate": 7.185858585858586e-05, + "loss": 0.4746, + "step": 288600 + }, + { + "epoch": 0.03, + "learning_rate": 7.184848484848485e-05, + "loss": 0.4754, + "step": 288700 + }, + { + "epoch": 0.03, + "learning_rate": 7.183838383838383e-05, + "loss": 0.475, + "step": 288800 + }, + { + "epoch": 0.03, + "learning_rate": 7.182828282828284e-05, + "loss": 0.4752, + "step": 288900 + }, + { + "epoch": 0.03, + "learning_rate": 7.181818181818182e-05, + "loss": 0.47, + "step": 289000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.47340225767692384, + "eval_average_loss_on_sentence_tokens": 0.41941938489306724, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.47089844942092896, + "eval_non_padding_tokens_in_labels": 133.5338, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3843, + "eval_padding_tokens_in_labels": 378.4662, + "eval_reconstruction_accuracy": 0.9137043132818363, + "eval_runtime": 145.9431, + "eval_samples_per_second": 34.26, + "eval_sentence_accuracy": 0.7367837852387532, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 289000 + }, + { + "epoch": 0.03, + "learning_rate": 7.180808080808081e-05, + "loss": 0.4726, + "step": 289100 + }, + { + "epoch": 0.03, + "learning_rate": 7.17979797979798e-05, + "loss": 0.4718, + "step": 289200 + }, + { + "epoch": 0.03, + "learning_rate": 7.178787878787879e-05, + "loss": 0.4756, + "step": 289300 + }, + { + "epoch": 0.03, + "learning_rate": 7.177777777777777e-05, + "loss": 0.4766, + "step": 289400 + }, + { + "epoch": 0.03, + "learning_rate": 7.176767676767678e-05, + "loss": 0.4686, + "step": 289500 + }, + { + "epoch": 0.03, + "learning_rate": 7.175757575757576e-05, + "loss": 0.4712, + "step": 289600 + }, + { + "epoch": 0.03, + "learning_rate": 7.174747474747475e-05, + "loss": 0.4715, + "step": 289700 + }, + { + "epoch": 0.03, + "learning_rate": 7.173737373737374e-05, + "loss": 0.4752, + "step": 289800 + }, + { + "epoch": 0.03, + "learning_rate": 7.172727272727273e-05, + "loss": 0.4691, + "step": 289900 + }, + { + "epoch": 0.03, + "learning_rate": 7.171717171717171e-05, + "loss": 0.4749, + "step": 290000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.47238256541956425, + "eval_average_loss_on_sentence_tokens": 0.42775129121164507, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4703320264816284, + "eval_non_padding_tokens_in_labels": 133.5149, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.382, + "eval_padding_tokens_in_labels": 378.4851, + "eval_reconstruction_accuracy": 0.9137731349178261, + "eval_runtime": 147.7242, + "eval_samples_per_second": 33.847, + "eval_sentence_accuracy": 0.7327148419975954, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.248775, + "step": 290000 + }, + { + "epoch": 0.03, + "learning_rate": 7.170707070707072e-05, + "loss": 0.4747, + "step": 290100 + }, + { + "epoch": 0.03, + "learning_rate": 7.16969696969697e-05, + "loss": 0.4768, + "step": 290200 + }, + { + "epoch": 0.03, + "learning_rate": 7.168686868686869e-05, + "loss": 0.4718, + "step": 290300 + }, + { + "epoch": 0.03, + "learning_rate": 7.167676767676768e-05, + "loss": 0.4742, + "step": 290400 + }, + { + "epoch": 0.03, + "learning_rate": 7.166666666666667e-05, + "loss": 0.4754, + "step": 290500 + }, + { + "epoch": 0.03, + "learning_rate": 7.165656565656566e-05, + "loss": 0.4724, + "step": 290600 + }, + { + "epoch": 0.03, + "learning_rate": 7.164646464646465e-05, + "loss": 0.4714, + "step": 290700 + }, + { + "epoch": 0.03, + "learning_rate": 7.163636363636363e-05, + "loss": 0.4751, + "step": 290800 + }, + { + "epoch": 0.03, + "learning_rate": 7.162626262626262e-05, + "loss": 0.4715, + "step": 290900 + }, + { + "epoch": 0.03, + "learning_rate": 7.161616161616162e-05, + "loss": 0.471, + "step": 291000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.47195702366919884, + "eval_average_loss_on_sentence_tokens": 0.4475235274830868, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.47078123688697815, + "eval_non_padding_tokens_in_labels": 133.51025, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37945, + "eval_padding_tokens_in_labels": 378.48975, + "eval_reconstruction_accuracy": 0.9137988783256809, + "eval_runtime": 152.3523, + "eval_samples_per_second": 32.819, + "eval_sentence_accuracy": 0.723379151937122, + "eval_steps_per_second": 0.085, + "eval_variance_shuffling_prob": 0.24750000000000005, + "step": 291000 + }, + { + "epoch": 0.03, + "learning_rate": 7.160606060606061e-05, + "loss": 0.4706, + "step": 291100 + }, + { + "epoch": 0.03, + "learning_rate": 7.15959595959596e-05, + "loss": 0.4716, + "step": 291200 + }, + { + "epoch": 0.03, + "learning_rate": 7.158585858585859e-05, + "loss": 0.4672, + "step": 291300 + }, + { + "epoch": 0.03, + "learning_rate": 7.157575757575757e-05, + "loss": 0.4716, + "step": 291400 + }, + { + "epoch": 0.03, + "learning_rate": 7.156565656565658e-05, + "loss": 0.4707, + "step": 291500 + }, + { + "epoch": 0.03, + "learning_rate": 7.155555555555555e-05, + "loss": 0.4716, + "step": 291600 + }, + { + "epoch": 0.03, + "learning_rate": 7.154545454545455e-05, + "loss": 0.4721, + "step": 291700 + }, + { + "epoch": 0.03, + "learning_rate": 7.153535353535354e-05, + "loss": 0.4703, + "step": 291800 + }, + { + "epoch": 0.03, + "learning_rate": 7.152525252525253e-05, + "loss": 0.4719, + "step": 291900 + }, + { + "epoch": 0.03, + "learning_rate": 7.151515151515152e-05, + "loss": 0.4701, + "step": 292000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.473090815703586, + "eval_average_loss_on_sentence_tokens": 0.41946946567157073, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.4706738293170929, + "eval_non_padding_tokens_in_labels": 133.5914, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3871, + "eval_padding_tokens_in_labels": 378.4086, + "eval_reconstruction_accuracy": 0.9137531336671586, + "eval_runtime": 148.0347, + "eval_samples_per_second": 33.776, + "eval_sentence_accuracy": 0.7331365406355985, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24909999999999996, + "step": 292000 + }, + { + "epoch": 0.03, + "learning_rate": 7.150505050505051e-05, + "loss": 0.4748, + "step": 292100 + }, + { + "epoch": 0.03, + "learning_rate": 7.14949494949495e-05, + "loss": 0.4727, + "step": 292200 + }, + { + "epoch": 0.03, + "learning_rate": 7.148484848484848e-05, + "loss": 0.4717, + "step": 292300 + }, + { + "epoch": 0.03, + "learning_rate": 7.147474747474749e-05, + "loss": 0.4728, + "step": 292400 + }, + { + "epoch": 0.03, + "learning_rate": 7.146464646464647e-05, + "loss": 0.4771, + "step": 292500 + }, + { + "epoch": 0.03, + "learning_rate": 7.145454545454546e-05, + "loss": 0.4763, + "step": 292600 + }, + { + "epoch": 0.03, + "learning_rate": 7.144444444444445e-05, + "loss": 0.4679, + "step": 292700 + }, + { + "epoch": 0.03, + "learning_rate": 7.143434343434344e-05, + "loss": 0.4755, + "step": 292800 + }, + { + "epoch": 0.03, + "learning_rate": 7.142424242424242e-05, + "loss": 0.4731, + "step": 292900 + }, + { + "epoch": 0.03, + "learning_rate": 7.141414141414143e-05, + "loss": 0.4715, + "step": 293000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.47231336404398794, + "eval_average_loss_on_sentence_tokens": 0.3992375750974969, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.46894532442092896, + "eval_non_padding_tokens_in_labels": 133.5268, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3849, + "eval_padding_tokens_in_labels": 378.4732, + "eval_reconstruction_accuracy": 0.9138998997170213, + "eval_runtime": 149.6055, + "eval_samples_per_second": 33.421, + "eval_sentence_accuracy": 0.7483535808494984, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.25, + "step": 293000 + }, + { + "epoch": 0.03, + "learning_rate": 7.14040404040404e-05, + "loss": 0.4738, + "step": 293100 + }, + { + "epoch": 0.03, + "learning_rate": 7.13939393939394e-05, + "loss": 0.4719, + "step": 293200 + }, + { + "epoch": 0.03, + "learning_rate": 7.138383838383839e-05, + "loss": 0.4759, + "step": 293300 + }, + { + "epoch": 0.03, + "learning_rate": 7.137373737373738e-05, + "loss": 0.4745, + "step": 293400 + }, + { + "epoch": 0.03, + "learning_rate": 7.136363636363636e-05, + "loss": 0.4717, + "step": 293500 + }, + { + "epoch": 0.03, + "learning_rate": 7.135353535353537e-05, + "loss": 0.4727, + "step": 293600 + }, + { + "epoch": 0.03, + "learning_rate": 7.134343434343435e-05, + "loss": 0.4739, + "step": 293700 + }, + { + "epoch": 0.03, + "learning_rate": 7.133333333333334e-05, + "loss": 0.4716, + "step": 293800 + }, + { + "epoch": 0.03, + "learning_rate": 7.132323232323233e-05, + "loss": 0.4735, + "step": 293900 + }, + { + "epoch": 0.03, + "learning_rate": 7.131313131313132e-05, + "loss": 0.4709, + "step": 294000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.47147546483020447, + "eval_average_loss_on_sentence_tokens": 0.3910212502110088, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.46782225370407104, + "eval_non_padding_tokens_in_labels": 133.52595, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.383, + "eval_padding_tokens_in_labels": 378.47405, + "eval_reconstruction_accuracy": 0.9139088675628476, + "eval_runtime": 147.2994, + "eval_samples_per_second": 33.944, + "eval_sentence_accuracy": 0.7580257325892297, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.2496, + "step": 294000 + }, + { + "epoch": 0.03, + "learning_rate": 7.13030303030303e-05, + "loss": 0.4723, + "step": 294100 + }, + { + "epoch": 0.03, + "learning_rate": 7.12929292929293e-05, + "loss": 0.4704, + "step": 294200 + }, + { + "epoch": 0.03, + "learning_rate": 7.128282828282828e-05, + "loss": 0.4709, + "step": 294300 + }, + { + "epoch": 0.03, + "learning_rate": 7.127272727272728e-05, + "loss": 0.4707, + "step": 294400 + }, + { + "epoch": 0.03, + "learning_rate": 7.126262626262627e-05, + "loss": 0.4716, + "step": 294500 + }, + { + "epoch": 0.03, + "learning_rate": 7.125252525252526e-05, + "loss": 0.47, + "step": 294600 + }, + { + "epoch": 0.03, + "learning_rate": 7.124242424242424e-05, + "loss": 0.4726, + "step": 294700 + }, + { + "epoch": 0.03, + "learning_rate": 7.123232323232324e-05, + "loss": 0.471, + "step": 294800 + }, + { + "epoch": 0.03, + "learning_rate": 7.122222222222222e-05, + "loss": 0.4679, + "step": 294900 + }, + { + "epoch": 0.04, + "learning_rate": 7.121212121212121e-05, + "loss": 0.4724, + "step": 295000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4711932961056125, + "eval_average_loss_on_sentence_tokens": 0.3759222505551155, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4669824242591858, + "eval_non_padding_tokens_in_labels": 133.5708, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.41135, + "eval_padding_tokens_in_labels": 378.4292, + "eval_reconstruction_accuracy": 0.9140424248093597, + "eval_runtime": 148.5557, + "eval_samples_per_second": 33.657, + "eval_sentence_accuracy": 0.7712643781290937, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 295000 + }, + { + "epoch": 0.04, + "learning_rate": 7.12020202020202e-05, + "loss": 0.4739, + "step": 295100 + }, + { + "epoch": 0.04, + "learning_rate": 7.11919191919192e-05, + "loss": 0.4735, + "step": 295200 + }, + { + "epoch": 0.04, + "learning_rate": 7.118181818181818e-05, + "loss": 0.4697, + "step": 295300 + }, + { + "epoch": 0.04, + "learning_rate": 7.117171717171718e-05, + "loss": 0.4743, + "step": 295400 + }, + { + "epoch": 0.04, + "learning_rate": 7.116161616161616e-05, + "loss": 0.4733, + "step": 295500 + }, + { + "epoch": 0.04, + "learning_rate": 7.115151515151515e-05, + "loss": 0.4716, + "step": 295600 + }, + { + "epoch": 0.04, + "learning_rate": 7.114141414141414e-05, + "loss": 0.4719, + "step": 295700 + }, + { + "epoch": 0.04, + "learning_rate": 7.113131313131314e-05, + "loss": 0.469, + "step": 295800 + }, + { + "epoch": 0.04, + "learning_rate": 7.112121212121213e-05, + "loss": 0.4757, + "step": 295900 + }, + { + "epoch": 0.04, + "learning_rate": 7.111111111111112e-05, + "loss": 0.4727, + "step": 296000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4705545413755935, + "eval_average_loss_on_sentence_tokens": 0.40177160838494935, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.4674316346645355, + "eval_non_padding_tokens_in_labels": 133.4744, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37345, + "eval_padding_tokens_in_labels": 378.5256, + "eval_reconstruction_accuracy": 0.9140116556225275, + "eval_runtime": 149.8642, + "eval_samples_per_second": 33.364, + "eval_sentence_accuracy": 0.7528666535072766, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 296000 + }, + { + "epoch": 0.04, + "learning_rate": 7.11010101010101e-05, + "loss": 0.4711, + "step": 296100 + }, + { + "epoch": 0.04, + "learning_rate": 7.109090909090909e-05, + "loss": 0.469, + "step": 296200 + }, + { + "epoch": 0.04, + "learning_rate": 7.108080808080808e-05, + "loss": 0.4696, + "step": 296300 + }, + { + "epoch": 0.04, + "learning_rate": 7.107070707070707e-05, + "loss": 0.4733, + "step": 296400 + }, + { + "epoch": 0.04, + "learning_rate": 7.106060606060607e-05, + "loss": 0.4714, + "step": 296500 + }, + { + "epoch": 0.04, + "learning_rate": 7.105050505050506e-05, + "loss": 0.4722, + "step": 296600 + }, + { + "epoch": 0.04, + "learning_rate": 7.104040404040404e-05, + "loss": 0.4726, + "step": 296700 + }, + { + "epoch": 0.04, + "learning_rate": 7.103030303030304e-05, + "loss": 0.4698, + "step": 296800 + }, + { + "epoch": 0.04, + "learning_rate": 7.102020202020202e-05, + "loss": 0.473, + "step": 296900 + }, + { + "epoch": 0.04, + "learning_rate": 7.101010101010101e-05, + "loss": 0.4695, + "step": 297000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.47086696621808277, + "eval_average_loss_on_sentence_tokens": 0.38911564446421076, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.46724608540534973, + "eval_non_padding_tokens_in_labels": 133.5226, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3855, + "eval_padding_tokens_in_labels": 378.4774, + "eval_reconstruction_accuracy": 0.9141148447881915, + "eval_runtime": 147.1703, + "eval_samples_per_second": 33.974, + "eval_sentence_accuracy": 0.7541766109785203, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.2499, + "step": 297000 + }, + { + "epoch": 0.04, + "learning_rate": 7.1e-05, + "loss": 0.4694, + "step": 297100 + }, + { + "epoch": 0.04, + "learning_rate": 7.0989898989899e-05, + "loss": 0.4727, + "step": 297200 + }, + { + "epoch": 0.04, + "learning_rate": 7.097979797979798e-05, + "loss": 0.4695, + "step": 297300 + }, + { + "epoch": 0.04, + "learning_rate": 7.096969696969698e-05, + "loss": 0.4734, + "step": 297400 + }, + { + "epoch": 0.04, + "learning_rate": 7.095959595959596e-05, + "loss": 0.4711, + "step": 297500 + }, + { + "epoch": 0.04, + "learning_rate": 7.094949494949495e-05, + "loss": 0.4743, + "step": 297600 + }, + { + "epoch": 0.04, + "learning_rate": 7.093939393939394e-05, + "loss": 0.4705, + "step": 297700 + }, + { + "epoch": 0.04, + "learning_rate": 7.092929292929293e-05, + "loss": 0.4691, + "step": 297800 + }, + { + "epoch": 0.04, + "learning_rate": 7.091919191919191e-05, + "loss": 0.4725, + "step": 297900 + }, + { + "epoch": 0.04, + "learning_rate": 7.090909090909092e-05, + "loss": 0.4699, + "step": 298000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.47126993596415767, + "eval_average_loss_on_sentence_tokens": 0.39022246010310213, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.46756836771965027, + "eval_non_padding_tokens_in_labels": 133.5218, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3891, + "eval_padding_tokens_in_labels": 378.4782, + "eval_reconstruction_accuracy": 0.9140128771582438, + "eval_runtime": 150.1241, + "eval_samples_per_second": 33.306, + "eval_sentence_accuracy": 0.7556301254329141, + "eval_steps_per_second": 0.087, + "eval_variance_shuffling_prob": 0.2499, + "step": 298000 + }, + { + "epoch": 0.04, + "learning_rate": 7.08989898989899e-05, + "loss": 0.4675, + "step": 298100 + }, + { + "epoch": 0.04, + "learning_rate": 7.088888888888889e-05, + "loss": 0.4714, + "step": 298200 + }, + { + "epoch": 0.04, + "learning_rate": 7.087878787878788e-05, + "loss": 0.4712, + "step": 298300 + }, + { + "epoch": 0.04, + "learning_rate": 7.086868686868687e-05, + "loss": 0.4734, + "step": 298400 + }, + { + "epoch": 0.04, + "learning_rate": 7.085858585858585e-05, + "loss": 0.469, + "step": 298500 + }, + { + "epoch": 0.04, + "learning_rate": 7.084848484848486e-05, + "loss": 0.4748, + "step": 298600 + }, + { + "epoch": 0.04, + "learning_rate": 7.083838383838384e-05, + "loss": 0.4701, + "step": 298700 + }, + { + "epoch": 0.04, + "learning_rate": 7.082828282828283e-05, + "loss": 0.4663, + "step": 298800 + }, + { + "epoch": 0.04, + "learning_rate": 7.081818181818182e-05, + "loss": 0.468, + "step": 298900 + }, + { + "epoch": 0.04, + "learning_rate": 7.080808080808081e-05, + "loss": 0.4725, + "step": 299000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4696771839886498, + "eval_average_loss_on_sentence_tokens": 0.39267557286188554, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.46623048186302185, + "eval_non_padding_tokens_in_labels": 133.5107, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.371, + "eval_padding_tokens_in_labels": 378.4893, + "eval_reconstruction_accuracy": 0.9141521109808501, + "eval_runtime": 146.5918, + "eval_samples_per_second": 34.108, + "eval_sentence_accuracy": 0.7555673192527859, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 299000 + }, + { + "epoch": 0.04, + "learning_rate": 7.079797979797979e-05, + "loss": 0.4706, + "step": 299100 + }, + { + "epoch": 0.04, + "learning_rate": 7.07878787878788e-05, + "loss": 0.4721, + "step": 299200 + }, + { + "epoch": 0.04, + "learning_rate": 7.077777777777777e-05, + "loss": 0.4697, + "step": 299300 + }, + { + "epoch": 0.04, + "learning_rate": 7.076767676767677e-05, + "loss": 0.4696, + "step": 299400 + }, + { + "epoch": 0.04, + "learning_rate": 7.075757575757576e-05, + "loss": 0.4756, + "step": 299500 + }, + { + "epoch": 0.04, + "learning_rate": 7.074747474747475e-05, + "loss": 0.4747, + "step": 299600 + }, + { + "epoch": 0.04, + "learning_rate": 7.073737373737374e-05, + "loss": 0.4704, + "step": 299700 + }, + { + "epoch": 0.04, + "learning_rate": 7.072727272727273e-05, + "loss": 0.4725, + "step": 299800 + }, + { + "epoch": 0.04, + "learning_rate": 7.071717171717171e-05, + "loss": 0.4778, + "step": 299900 + }, + { + "epoch": 0.04, + "learning_rate": 7.07070707070707e-05, + "loss": 0.4714, + "step": 300000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.47104320257014654, + "eval_average_loss_on_sentence_tokens": 0.39304801694995956, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4674609303474426, + "eval_non_padding_tokens_in_labels": 133.53365, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3854, + "eval_padding_tokens_in_labels": 378.46635, + "eval_reconstruction_accuracy": 0.9141330114693977, + "eval_runtime": 147.5872, + "eval_samples_per_second": 33.878, + "eval_sentence_accuracy": 0.7481382453747735, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 300000 + }, + { + "epoch": 0.04, + "learning_rate": 7.06969696969697e-05, + "loss": 0.471, + "step": 300100 + }, + { + "epoch": 0.04, + "learning_rate": 7.068686868686869e-05, + "loss": 0.4684, + "step": 300200 + }, + { + "epoch": 0.04, + "learning_rate": 7.067676767676768e-05, + "loss": 0.4732, + "step": 300300 + }, + { + "epoch": 0.04, + "learning_rate": 7.066666666666667e-05, + "loss": 0.4716, + "step": 300400 + }, + { + "epoch": 0.04, + "learning_rate": 7.065656565656566e-05, + "loss": 0.4696, + "step": 300500 + }, + { + "epoch": 0.04, + "learning_rate": 7.064646464646466e-05, + "loss": 0.4665, + "step": 300600 + }, + { + "epoch": 0.04, + "learning_rate": 7.063636363636365e-05, + "loss": 0.4657, + "step": 300700 + }, + { + "epoch": 0.04, + "learning_rate": 7.062626262626263e-05, + "loss": 0.4713, + "step": 300800 + }, + { + "epoch": 0.04, + "learning_rate": 7.061616161616162e-05, + "loss": 0.4689, + "step": 300900 + }, + { + "epoch": 0.04, + "learning_rate": 7.060606060606061e-05, + "loss": 0.4685, + "step": 301000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4703904819656515, + "eval_average_loss_on_sentence_tokens": 0.39480699862140906, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4670117199420929, + "eval_non_padding_tokens_in_labels": 133.5365, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3708, + "eval_padding_tokens_in_labels": 378.4635, + "eval_reconstruction_accuracy": 0.9140049871019611, + "eval_runtime": 147.7402, + "eval_samples_per_second": 33.843, + "eval_sentence_accuracy": 0.7556570423672546, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.2499, + "step": 301000 + }, + { + "epoch": 0.04, + "learning_rate": 7.05959595959596e-05, + "loss": 0.4729, + "step": 301100 + }, + { + "epoch": 0.04, + "learning_rate": 7.05858585858586e-05, + "loss": 0.4722, + "step": 301200 + }, + { + "epoch": 0.04, + "learning_rate": 7.057575757575759e-05, + "loss": 0.475, + "step": 301300 + }, + { + "epoch": 0.04, + "learning_rate": 7.056565656565656e-05, + "loss": 0.4681, + "step": 301400 + }, + { + "epoch": 0.04, + "learning_rate": 7.055555555555556e-05, + "loss": 0.4724, + "step": 301500 + }, + { + "epoch": 0.04, + "learning_rate": 7.054545454545455e-05, + "loss": 0.4717, + "step": 301600 + }, + { + "epoch": 0.04, + "learning_rate": 7.053535353535354e-05, + "loss": 0.4708, + "step": 301700 + }, + { + "epoch": 0.04, + "learning_rate": 7.052525252525253e-05, + "loss": 0.471, + "step": 301800 + }, + { + "epoch": 0.04, + "learning_rate": 7.051515151515152e-05, + "loss": 0.47, + "step": 301900 + }, + { + "epoch": 0.04, + "learning_rate": 7.05050505050505e-05, + "loss": 0.472, + "step": 302000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.47049179221278115, + "eval_average_loss_on_sentence_tokens": 0.39298025065683256, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4669433534145355, + "eval_non_padding_tokens_in_labels": 133.50045, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36925, + "eval_padding_tokens_in_labels": 378.49955, + "eval_reconstruction_accuracy": 0.9141422758387767, + "eval_runtime": 146.0682, + "eval_samples_per_second": 34.231, + "eval_sentence_accuracy": 0.7487438763974376, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.25, + "step": 302000 + }, + { + "epoch": 0.04, + "learning_rate": 7.049494949494951e-05, + "loss": 0.4688, + "step": 302100 + }, + { + "epoch": 0.04, + "learning_rate": 7.048484848484849e-05, + "loss": 0.4696, + "step": 302200 + }, + { + "epoch": 0.04, + "learning_rate": 7.047474747474748e-05, + "loss": 0.4662, + "step": 302300 + }, + { + "epoch": 0.04, + "learning_rate": 7.046464646464647e-05, + "loss": 0.4694, + "step": 302400 + }, + { + "epoch": 0.04, + "learning_rate": 7.045454545454546e-05, + "loss": 0.4719, + "step": 302500 + }, + { + "epoch": 0.04, + "learning_rate": 7.044444444444444e-05, + "loss": 0.4708, + "step": 302600 + }, + { + "epoch": 0.04, + "learning_rate": 7.043434343434345e-05, + "loss": 0.4705, + "step": 302700 + }, + { + "epoch": 0.04, + "learning_rate": 7.042424242424243e-05, + "loss": 0.4676, + "step": 302800 + }, + { + "epoch": 0.04, + "learning_rate": 7.041414141414142e-05, + "loss": 0.4661, + "step": 302900 + }, + { + "epoch": 0.04, + "learning_rate": 7.040404040404041e-05, + "loss": 0.4663, + "step": 303000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4703502242961042, + "eval_average_loss_on_sentence_tokens": 0.3907429823336926, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.46674805879592896, + "eval_non_padding_tokens_in_labels": 133.56035, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38215, + "eval_padding_tokens_in_labels": 378.43965, + "eval_reconstruction_accuracy": 0.9140559678974026, + "eval_runtime": 146.3658, + "eval_samples_per_second": 34.161, + "eval_sentence_accuracy": 0.7543201679616703, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 303000 + }, + { + "epoch": 0.04, + "learning_rate": 7.03939393939394e-05, + "loss": 0.4685, + "step": 303100 + }, + { + "epoch": 0.04, + "learning_rate": 7.038383838383838e-05, + "loss": 0.4699, + "step": 303200 + }, + { + "epoch": 0.04, + "learning_rate": 7.037373737373739e-05, + "loss": 0.472, + "step": 303300 + }, + { + "epoch": 0.04, + "learning_rate": 7.036363636363636e-05, + "loss": 0.4687, + "step": 303400 + }, + { + "epoch": 0.04, + "learning_rate": 7.035353535353536e-05, + "loss": 0.4723, + "step": 303500 + }, + { + "epoch": 0.04, + "learning_rate": 7.034343434343435e-05, + "loss": 0.4713, + "step": 303600 + }, + { + "epoch": 0.04, + "learning_rate": 7.033333333333334e-05, + "loss": 0.4724, + "step": 303700 + }, + { + "epoch": 0.04, + "learning_rate": 7.032323232323232e-05, + "loss": 0.4684, + "step": 303800 + }, + { + "epoch": 0.04, + "learning_rate": 7.031313131313132e-05, + "loss": 0.4709, + "step": 303900 + }, + { + "epoch": 0.04, + "learning_rate": 7.03030303030303e-05, + "loss": 0.4703, + "step": 304000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4703073392534355, + "eval_average_loss_on_sentence_tokens": 0.36349947842948843, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.46552735567092896, + "eval_non_padding_tokens_in_labels": 133.52925, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38945, + "eval_padding_tokens_in_labels": 378.47075, + "eval_reconstruction_accuracy": 0.9140969893054514, + "eval_runtime": 150.9911, + "eval_samples_per_second": 33.115, + "eval_sentence_accuracy": 0.7797476986021139, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.245775, + "step": 304000 + }, + { + "epoch": 0.04, + "learning_rate": 7.02929292929293e-05, + "loss": 0.472, + "step": 304100 + }, + { + "epoch": 0.04, + "learning_rate": 7.028282828282829e-05, + "loss": 0.4666, + "step": 304200 + }, + { + "epoch": 0.04, + "learning_rate": 7.027272727272728e-05, + "loss": 0.4693, + "step": 304300 + }, + { + "epoch": 0.04, + "learning_rate": 7.026262626262626e-05, + "loss": 0.4669, + "step": 304400 + }, + { + "epoch": 0.04, + "learning_rate": 7.025252525252526e-05, + "loss": 0.4705, + "step": 304500 + }, + { + "epoch": 0.04, + "learning_rate": 7.024242424242424e-05, + "loss": 0.4697, + "step": 304600 + }, + { + "epoch": 0.04, + "learning_rate": 7.023232323232323e-05, + "loss": 0.469, + "step": 304700 + }, + { + "epoch": 0.04, + "learning_rate": 7.022222222222222e-05, + "loss": 0.4639, + "step": 304800 + }, + { + "epoch": 0.04, + "learning_rate": 7.021212121212122e-05, + "loss": 0.4685, + "step": 304900 + }, + { + "epoch": 0.04, + "learning_rate": 7.020202020202021e-05, + "loss": 0.4727, + "step": 305000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4696876207494901, + "eval_average_loss_on_sentence_tokens": 0.42845418051750184, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.46776366233825684, + "eval_non_padding_tokens_in_labels": 133.52135, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3884, + "eval_padding_tokens_in_labels": 378.47865, + "eval_reconstruction_accuracy": 0.9141769409118754, + "eval_runtime": 147.8117, + "eval_samples_per_second": 33.827, + "eval_sentence_accuracy": 0.7275647352270892, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 305000 + }, + { + "epoch": 0.05, + "learning_rate": 7.01919191919192e-05, + "loss": 0.4674, + "step": 305100 + }, + { + "epoch": 0.05, + "learning_rate": 7.018181818181818e-05, + "loss": 0.4725, + "step": 305200 + }, + { + "epoch": 0.05, + "learning_rate": 7.017171717171717e-05, + "loss": 0.4705, + "step": 305300 + }, + { + "epoch": 0.05, + "learning_rate": 7.016161616161616e-05, + "loss": 0.4704, + "step": 305400 + }, + { + "epoch": 0.05, + "learning_rate": 7.015151515151515e-05, + "loss": 0.4731, + "step": 305500 + }, + { + "epoch": 0.05, + "learning_rate": 7.014141414141415e-05, + "loss": 0.4732, + "step": 305600 + }, + { + "epoch": 0.05, + "learning_rate": 7.013131313131314e-05, + "loss": 0.471, + "step": 305700 + }, + { + "epoch": 0.05, + "learning_rate": 7.012121212121212e-05, + "loss": 0.471, + "step": 305800 + }, + { + "epoch": 0.05, + "learning_rate": 7.011111111111112e-05, + "loss": 0.4692, + "step": 305900 + }, + { + "epoch": 0.05, + "learning_rate": 7.01010101010101e-05, + "loss": 0.4713, + "step": 306000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4706721554589619, + "eval_average_loss_on_sentence_tokens": 0.40790175267169626, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.46779295802116394, + "eval_non_padding_tokens_in_labels": 133.53445, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3895, + "eval_padding_tokens_in_labels": 378.46555, + "eval_reconstruction_accuracy": 0.9140954997165721, + "eval_runtime": 147.2792, + "eval_samples_per_second": 33.949, + "eval_sentence_accuracy": 0.7432617941033969, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 306000 + }, + { + "epoch": 0.0, + "learning_rate": 7.009090909090909e-05, + "loss": 0.4699, + "step": 306100 + }, + { + "epoch": 0.0, + "learning_rate": 7.008080808080809e-05, + "loss": 0.4694, + "step": 306200 + }, + { + "epoch": 0.0, + "learning_rate": 7.007070707070708e-05, + "loss": 0.4667, + "step": 306300 + }, + { + "epoch": 0.0, + "learning_rate": 7.006060606060606e-05, + "loss": 0.4684, + "step": 306400 + }, + { + "epoch": 0.0, + "learning_rate": 7.005050505050506e-05, + "loss": 0.4702, + "step": 306500 + }, + { + "epoch": 0.0, + "learning_rate": 7.004040404040404e-05, + "loss": 0.4708, + "step": 306600 + }, + { + "epoch": 0.0, + "learning_rate": 7.003030303030303e-05, + "loss": 0.4703, + "step": 306700 + }, + { + "epoch": 0.0, + "learning_rate": 7.002020202020202e-05, + "loss": 0.4705, + "step": 306800 + }, + { + "epoch": 0.0, + "learning_rate": 7.001010101010102e-05, + "loss": 0.4704, + "step": 306900 + }, + { + "epoch": 0.0, + "learning_rate": 7e-05, + "loss": 0.4664, + "step": 307000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.47046635229729444, + "eval_average_loss_on_sentence_tokens": 0.426595273997654, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.46854493021965027, + "eval_non_padding_tokens_in_labels": 133.529, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3652, + "eval_padding_tokens_in_labels": 378.471, + "eval_reconstruction_accuracy": 0.9140963375255675, + "eval_runtime": 223.4705, + "eval_samples_per_second": 22.374, + "eval_sentence_accuracy": 0.723531681231719, + "eval_steps_per_second": 0.058, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 307000 + }, + { + "epoch": 0.0, + "learning_rate": 6.9989898989899e-05, + "loss": 0.4698, + "step": 307100 + }, + { + "epoch": 0.0, + "learning_rate": 6.997979797979798e-05, + "loss": 0.4685, + "step": 307200 + }, + { + "epoch": 0.0, + "learning_rate": 6.996969696969697e-05, + "loss": 0.466, + "step": 307300 + }, + { + "epoch": 0.0, + "learning_rate": 6.995959595959596e-05, + "loss": 0.4668, + "step": 307400 + }, + { + "epoch": 0.0, + "learning_rate": 6.994949494949495e-05, + "loss": 0.466, + "step": 307500 + }, + { + "epoch": 0.0, + "learning_rate": 6.993939393939393e-05, + "loss": 0.4696, + "step": 307600 + }, + { + "epoch": 0.0, + "learning_rate": 6.992929292929294e-05, + "loss": 0.4678, + "step": 307700 + }, + { + "epoch": 0.0, + "learning_rate": 6.991919191919192e-05, + "loss": 0.4646, + "step": 307800 + }, + { + "epoch": 0.0, + "learning_rate": 6.990909090909091e-05, + "loss": 0.4741, + "step": 307900 + }, + { + "epoch": 0.0, + "learning_rate": 6.98989898989899e-05, + "loss": 0.4663, + "step": 308000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.46921682901781636, + "eval_average_loss_on_sentence_tokens": 0.4257099161699844, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.46727538108825684, + "eval_non_padding_tokens_in_labels": 133.52285, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36045, + "eval_padding_tokens_in_labels": 378.47715, + "eval_reconstruction_accuracy": 0.9142420896774932, + "eval_runtime": 170.8617, + "eval_samples_per_second": 29.263, + "eval_sentence_accuracy": 0.7288432896082688, + "eval_steps_per_second": 0.076, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 308000 + }, + { + "epoch": 0.0, + "learning_rate": 6.988888888888889e-05, + "loss": 0.4673, + "step": 308100 + }, + { + "epoch": 0.0, + "learning_rate": 6.987878787878787e-05, + "loss": 0.4695, + "step": 308200 + }, + { + "epoch": 0.0, + "learning_rate": 6.986868686868688e-05, + "loss": 0.4691, + "step": 308300 + }, + { + "epoch": 0.0, + "learning_rate": 6.985858585858585e-05, + "loss": 0.4682, + "step": 308400 + }, + { + "epoch": 0.0, + "learning_rate": 6.984848484848485e-05, + "loss": 0.4689, + "step": 308500 + }, + { + "epoch": 0.0, + "learning_rate": 6.983838383838384e-05, + "loss": 0.47, + "step": 308600 + }, + { + "epoch": 0.0, + "learning_rate": 6.982828282828283e-05, + "loss": 0.4729, + "step": 308700 + }, + { + "epoch": 0.0, + "learning_rate": 6.981818181818182e-05, + "loss": 0.4716, + "step": 308800 + }, + { + "epoch": 0.0, + "learning_rate": 6.980808080808081e-05, + "loss": 0.4719, + "step": 308900 + }, + { + "epoch": 0.0, + "learning_rate": 6.97979797979798e-05, + "loss": 0.4695, + "step": 309000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.4687179429485481, + "eval_average_loss_on_sentence_tokens": 0.4306777197413357, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.4669824242591858, + "eval_non_padding_tokens_in_labels": 133.5333, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40045, + "eval_padding_tokens_in_labels": 378.4667, + "eval_reconstruction_accuracy": 0.9142857970051458, + "eval_runtime": 168.3979, + "eval_samples_per_second": 29.692, + "eval_sentence_accuracy": 0.7261650546413767, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.24750000000000005, + "step": 309000 + }, + { + "epoch": 0.0, + "learning_rate": 6.978787878787878e-05, + "loss": 0.4687, + "step": 309100 + }, + { + "epoch": 0.0, + "learning_rate": 6.977777777777779e-05, + "loss": 0.4701, + "step": 309200 + }, + { + "epoch": 0.0, + "learning_rate": 6.976767676767677e-05, + "loss": 0.4686, + "step": 309300 + }, + { + "epoch": 0.0, + "learning_rate": 6.975757575757576e-05, + "loss": 0.47, + "step": 309400 + }, + { + "epoch": 0.0, + "learning_rate": 6.974747474747475e-05, + "loss": 0.4676, + "step": 309500 + }, + { + "epoch": 0.0, + "learning_rate": 6.973737373737374e-05, + "loss": 0.4709, + "step": 309600 + }, + { + "epoch": 0.0, + "learning_rate": 6.972727272727274e-05, + "loss": 0.4669, + "step": 309700 + }, + { + "epoch": 0.0, + "learning_rate": 6.971717171717173e-05, + "loss": 0.4684, + "step": 309800 + }, + { + "epoch": 0.0, + "learning_rate": 6.970707070707071e-05, + "loss": 0.4651, + "step": 309900 + }, + { + "epoch": 0.0, + "learning_rate": 6.96969696969697e-05, + "loss": 0.4662, + "step": 310000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.46939390415538185, + "eval_average_loss_on_sentence_tokens": 0.4226936398155242, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.4672265648841858, + "eval_non_padding_tokens_in_labels": 133.53845, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38185, + "eval_padding_tokens_in_labels": 378.46155, + "eval_reconstruction_accuracy": 0.9142070254678426, + "eval_runtime": 167.9215, + "eval_samples_per_second": 29.776, + "eval_sentence_accuracy": 0.732136127909272, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 310000 + }, + { + "epoch": 0.0, + "learning_rate": 6.968686868686869e-05, + "loss": 0.469, + "step": 310100 + }, + { + "epoch": 0.0, + "learning_rate": 6.967676767676768e-05, + "loss": 0.4667, + "step": 310200 + }, + { + "epoch": 0.0, + "learning_rate": 6.966666666666668e-05, + "loss": 0.4717, + "step": 310300 + }, + { + "epoch": 0.0, + "learning_rate": 6.965656565656567e-05, + "loss": 0.4685, + "step": 310400 + }, + { + "epoch": 0.0, + "learning_rate": 6.964646464646465e-05, + "loss": 0.4666, + "step": 310500 + }, + { + "epoch": 0.0, + "learning_rate": 6.963636363636364e-05, + "loss": 0.4708, + "step": 310600 + }, + { + "epoch": 0.0, + "learning_rate": 6.962626262626263e-05, + "loss": 0.4677, + "step": 310700 + }, + { + "epoch": 0.0, + "learning_rate": 6.961616161616162e-05, + "loss": 0.4688, + "step": 310800 + }, + { + "epoch": 0.0, + "learning_rate": 6.960606060606061e-05, + "loss": 0.4685, + "step": 310900 + }, + { + "epoch": 0.01, + "learning_rate": 6.95959595959596e-05, + "loss": 0.4668, + "step": 311000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4689700840172494, + "eval_average_loss_on_sentence_tokens": 0.43214389286379, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.46726563572883606, + "eval_non_padding_tokens_in_labels": 133.5267, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36995, + "eval_padding_tokens_in_labels": 378.4733, + "eval_reconstruction_accuracy": 0.9142223256941376, + "eval_runtime": 178.5242, + "eval_samples_per_second": 28.007, + "eval_sentence_accuracy": 0.7222531268505392, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2464, + "step": 311000 + }, + { + "epoch": 0.01, + "learning_rate": 6.958585858585858e-05, + "loss": 0.4685, + "step": 311100 + }, + { + "epoch": 0.01, + "learning_rate": 6.957575757575759e-05, + "loss": 0.4725, + "step": 311200 + }, + { + "epoch": 0.01, + "learning_rate": 6.956565656565657e-05, + "loss": 0.4699, + "step": 311300 + }, + { + "epoch": 0.01, + "learning_rate": 6.955555555555556e-05, + "loss": 0.4704, + "step": 311400 + }, + { + "epoch": 0.01, + "learning_rate": 6.954545454545455e-05, + "loss": 0.466, + "step": 311500 + }, + { + "epoch": 0.01, + "learning_rate": 6.953535353535354e-05, + "loss": 0.4684, + "step": 311600 + }, + { + "epoch": 0.01, + "learning_rate": 6.952525252525252e-05, + "loss": 0.4647, + "step": 311700 + }, + { + "epoch": 0.01, + "learning_rate": 6.951515151515153e-05, + "loss": 0.4693, + "step": 311800 + }, + { + "epoch": 0.01, + "learning_rate": 6.95050505050505e-05, + "loss": 0.469, + "step": 311900 + }, + { + "epoch": 0.01, + "learning_rate": 6.94949494949495e-05, + "loss": 0.4698, + "step": 312000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.46882261578843154, + "eval_average_loss_on_sentence_tokens": 0.39954526247332633, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4657324254512787, + "eval_non_padding_tokens_in_labels": 133.529, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38435, + "eval_padding_tokens_in_labels": 378.471, + "eval_reconstruction_accuracy": 0.9142570277516047, + "eval_runtime": 170.3139, + "eval_samples_per_second": 29.358, + "eval_sentence_accuracy": 0.7527724442370843, + "eval_steps_per_second": 0.076, + "eval_variance_shuffling_prob": 0.2499, + "step": 312000 + }, + { + "epoch": 0.01, + "learning_rate": 6.948484848484849e-05, + "loss": 0.4642, + "step": 312100 + }, + { + "epoch": 0.01, + "learning_rate": 6.947474747474748e-05, + "loss": 0.4686, + "step": 312200 + }, + { + "epoch": 0.01, + "learning_rate": 6.946464646464646e-05, + "loss": 0.4675, + "step": 312300 + }, + { + "epoch": 0.01, + "learning_rate": 6.945454545454547e-05, + "loss": 0.4671, + "step": 312400 + }, + { + "epoch": 0.01, + "learning_rate": 6.944444444444444e-05, + "loss": 0.4709, + "step": 312500 + }, + { + "epoch": 0.01, + "learning_rate": 6.943434343434344e-05, + "loss": 0.4666, + "step": 312600 + }, + { + "epoch": 0.01, + "learning_rate": 6.942424242424243e-05, + "loss": 0.4693, + "step": 312700 + }, + { + "epoch": 0.01, + "learning_rate": 6.941414141414142e-05, + "loss": 0.4685, + "step": 312800 + }, + { + "epoch": 0.01, + "learning_rate": 6.94040404040404e-05, + "loss": 0.4647, + "step": 312900 + }, + { + "epoch": 0.01, + "learning_rate": 6.93939393939394e-05, + "loss": 0.4693, + "step": 313000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4687873594436031, + "eval_average_loss_on_sentence_tokens": 0.4149375102799167, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.46632811427116394, + "eval_non_padding_tokens_in_labels": 133.51655, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37985, + "eval_padding_tokens_in_labels": 378.48345, + "eval_reconstruction_accuracy": 0.914257566834544, + "eval_runtime": 176.6073, + "eval_samples_per_second": 28.311, + "eval_sentence_accuracy": 0.737263803901161, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 313000 + }, + { + "epoch": 0.01, + "learning_rate": 6.938383838383838e-05, + "loss": 0.4698, + "step": 313100 + }, + { + "epoch": 0.01, + "learning_rate": 6.937373737373737e-05, + "loss": 0.4699, + "step": 313200 + }, + { + "epoch": 0.01, + "learning_rate": 6.936363636363637e-05, + "loss": 0.4677, + "step": 313300 + }, + { + "epoch": 0.01, + "learning_rate": 6.935353535353536e-05, + "loss": 0.4689, + "step": 313400 + }, + { + "epoch": 0.01, + "learning_rate": 6.934343434343434e-05, + "loss": 0.4676, + "step": 313500 + }, + { + "epoch": 0.01, + "learning_rate": 6.933333333333334e-05, + "loss": 0.4648, + "step": 313600 + }, + { + "epoch": 0.01, + "learning_rate": 6.932323232323232e-05, + "loss": 0.4658, + "step": 313700 + }, + { + "epoch": 0.01, + "learning_rate": 6.931313131313131e-05, + "loss": 0.4696, + "step": 313800 + }, + { + "epoch": 0.01, + "learning_rate": 6.93030303030303e-05, + "loss": 0.471, + "step": 313900 + }, + { + "epoch": 0.01, + "learning_rate": 6.92929292929293e-05, + "loss": 0.4668, + "step": 314000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.46819016760964993, + "eval_average_loss_on_sentence_tokens": 0.4632683696701337, + "eval_average_shuffling_prob": 0.585, + "eval_loss": 0.4679882824420929, + "eval_non_padding_tokens_in_labels": 133.53135, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.393, + "eval_padding_tokens_in_labels": 378.46865, + "eval_reconstruction_accuracy": 0.9142356946512392, + "eval_runtime": 173.9776, + "eval_samples_per_second": 28.739, + "eval_sentence_accuracy": 0.706233064762144, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.242775, + "step": 314000 + }, + { + "epoch": 0.01, + "learning_rate": 6.928282828282829e-05, + "loss": 0.4697, + "step": 314100 + }, + { + "epoch": 0.01, + "learning_rate": 6.927272727272728e-05, + "loss": 0.4689, + "step": 314200 + }, + { + "epoch": 0.01, + "learning_rate": 6.926262626262626e-05, + "loss": 0.4681, + "step": 314300 + }, + { + "epoch": 0.01, + "learning_rate": 6.925252525252525e-05, + "loss": 0.4679, + "step": 314400 + }, + { + "epoch": 0.01, + "learning_rate": 6.924242424242424e-05, + "loss": 0.4674, + "step": 314500 + }, + { + "epoch": 0.01, + "learning_rate": 6.923232323232324e-05, + "loss": 0.466, + "step": 314600 + }, + { + "epoch": 0.01, + "learning_rate": 6.922222222222223e-05, + "loss": 0.4708, + "step": 314700 + }, + { + "epoch": 0.01, + "learning_rate": 6.921212121212122e-05, + "loss": 0.4706, + "step": 314800 + }, + { + "epoch": 0.01, + "learning_rate": 6.92020202020202e-05, + "loss": 0.4685, + "step": 314900 + }, + { + "epoch": 0.01, + "learning_rate": 6.91919191919192e-05, + "loss": 0.4687, + "step": 315000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4678086174923099, + "eval_average_loss_on_sentence_tokens": 0.4048597765254314, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.4649414122104645, + "eval_non_padding_tokens_in_labels": 133.48405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3568, + "eval_padding_tokens_in_labels": 378.51595, + "eval_reconstruction_accuracy": 0.914352040888655, + "eval_runtime": 181.2435, + "eval_samples_per_second": 27.587, + "eval_sentence_accuracy": 0.7501256123602562, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 315000 + }, + { + "epoch": 0.01, + "learning_rate": 6.918181818181818e-05, + "loss": 0.4708, + "step": 315100 + }, + { + "epoch": 0.01, + "learning_rate": 6.917171717171717e-05, + "loss": 0.4663, + "step": 315200 + }, + { + "epoch": 0.01, + "learning_rate": 6.916161616161617e-05, + "loss": 0.4646, + "step": 315300 + }, + { + "epoch": 0.01, + "learning_rate": 6.915151515151516e-05, + "loss": 0.4674, + "step": 315400 + }, + { + "epoch": 0.01, + "learning_rate": 6.914141414141414e-05, + "loss": 0.4663, + "step": 315500 + }, + { + "epoch": 0.01, + "learning_rate": 6.913131313131314e-05, + "loss": 0.4708, + "step": 315600 + }, + { + "epoch": 0.01, + "learning_rate": 6.912121212121212e-05, + "loss": 0.4669, + "step": 315700 + }, + { + "epoch": 0.01, + "learning_rate": 6.911111111111111e-05, + "loss": 0.4657, + "step": 315800 + }, + { + "epoch": 0.01, + "learning_rate": 6.91010101010101e-05, + "loss": 0.4658, + "step": 315900 + }, + { + "epoch": 0.01, + "learning_rate": 6.90909090909091e-05, + "loss": 0.4692, + "step": 316000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.46827167977617207, + "eval_average_loss_on_sentence_tokens": 0.4255654901031454, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.46632811427116394, + "eval_non_padding_tokens_in_labels": 133.5558, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39985, + "eval_padding_tokens_in_labels": 378.4442, + "eval_reconstruction_accuracy": 0.9143004525756018, + "eval_runtime": 178.7372, + "eval_samples_per_second": 27.974, + "eval_sentence_accuracy": 0.7274301505553861, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 316000 + }, + { + "epoch": 0.01, + "learning_rate": 6.908080808080807e-05, + "loss": 0.4651, + "step": 316100 + }, + { + "epoch": 0.01, + "learning_rate": 6.907070707070708e-05, + "loss": 0.4703, + "step": 316200 + }, + { + "epoch": 0.01, + "learning_rate": 6.906060606060606e-05, + "loss": 0.4635, + "step": 316300 + }, + { + "epoch": 0.01, + "learning_rate": 6.905050505050505e-05, + "loss": 0.4637, + "step": 316400 + }, + { + "epoch": 0.01, + "learning_rate": 6.904040404040404e-05, + "loss": 0.4702, + "step": 316500 + }, + { + "epoch": 0.01, + "learning_rate": 6.903030303030303e-05, + "loss": 0.4684, + "step": 316600 + }, + { + "epoch": 0.01, + "learning_rate": 6.902020202020201e-05, + "loss": 0.4672, + "step": 316700 + }, + { + "epoch": 0.01, + "learning_rate": 6.901010101010102e-05, + "loss": 0.4688, + "step": 316800 + }, + { + "epoch": 0.01, + "learning_rate": 6.9e-05, + "loss": 0.4652, + "step": 316900 + }, + { + "epoch": 0.01, + "learning_rate": 6.898989898989899e-05, + "loss": 0.4691, + "step": 317000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.46655174282970374, + "eval_average_loss_on_sentence_tokens": 0.41194243625369537, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.46416014432907104, + "eval_non_padding_tokens_in_labels": 133.53575, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39725, + "eval_padding_tokens_in_labels": 378.46425, + "eval_reconstruction_accuracy": 0.9144811717452784, + "eval_runtime": 174.521, + "eval_samples_per_second": 28.65, + "eval_sentence_accuracy": 0.7457022628169468, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 317000 + }, + { + "epoch": 0.01, + "learning_rate": 6.897979797979798e-05, + "loss": 0.4655, + "step": 317100 + }, + { + "epoch": 0.01, + "learning_rate": 6.896969696969697e-05, + "loss": 0.4646, + "step": 317200 + }, + { + "epoch": 0.01, + "learning_rate": 6.895959595959596e-05, + "loss": 0.4703, + "step": 317300 + }, + { + "epoch": 0.01, + "learning_rate": 6.894949494949496e-05, + "loss": 0.4683, + "step": 317400 + }, + { + "epoch": 0.01, + "learning_rate": 6.893939393939395e-05, + "loss": 0.4639, + "step": 317500 + }, + { + "epoch": 0.01, + "learning_rate": 6.892929292929293e-05, + "loss": 0.4668, + "step": 317600 + }, + { + "epoch": 0.01, + "learning_rate": 6.891919191919193e-05, + "loss": 0.4683, + "step": 317700 + }, + { + "epoch": 0.01, + "learning_rate": 6.890909090909091e-05, + "loss": 0.4628, + "step": 317800 + }, + { + "epoch": 0.01, + "learning_rate": 6.88989898989899e-05, + "loss": 0.4644, + "step": 317900 + }, + { + "epoch": 0.01, + "learning_rate": 6.88888888888889e-05, + "loss": 0.4664, + "step": 318000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4685301933023049, + "eval_average_loss_on_sentence_tokens": 0.3755975539749733, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.4642382860183716, + "eval_non_padding_tokens_in_labels": 133.53655, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3917, + "eval_padding_tokens_in_labels": 378.46345, + "eval_reconstruction_accuracy": 0.9142274341268695, + "eval_runtime": 187.5381, + "eval_samples_per_second": 26.661, + "eval_sentence_accuracy": 0.7653875141313905, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 318000 + }, + { + "epoch": 0.01, + "learning_rate": 6.887878787878789e-05, + "loss": 0.469, + "step": 318100 + }, + { + "epoch": 0.01, + "learning_rate": 6.886868686868687e-05, + "loss": 0.4675, + "step": 318200 + }, + { + "epoch": 0.01, + "learning_rate": 6.885858585858587e-05, + "loss": 0.4679, + "step": 318300 + }, + { + "epoch": 0.01, + "learning_rate": 6.884848484848485e-05, + "loss": 0.4683, + "step": 318400 + }, + { + "epoch": 0.01, + "learning_rate": 6.883838383838384e-05, + "loss": 0.4674, + "step": 318500 + }, + { + "epoch": 0.01, + "learning_rate": 6.882828282828283e-05, + "loss": 0.466, + "step": 318600 + }, + { + "epoch": 0.01, + "learning_rate": 6.881818181818183e-05, + "loss": 0.4693, + "step": 318700 + }, + { + "epoch": 0.01, + "learning_rate": 6.88080808080808e-05, + "loss": 0.4643, + "step": 318800 + }, + { + "epoch": 0.01, + "learning_rate": 6.879797979797981e-05, + "loss": 0.4661, + "step": 318900 + }, + { + "epoch": 0.01, + "learning_rate": 6.878787878787879e-05, + "loss": 0.466, + "step": 319000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.46714691558649096, + "eval_average_loss_on_sentence_tokens": 0.3946218924893846, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4638378918170929, + "eval_non_padding_tokens_in_labels": 133.50615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3855, + "eval_padding_tokens_in_labels": 378.49385, + "eval_reconstruction_accuracy": 0.9143716311739536, + "eval_runtime": 212.4083, + "eval_samples_per_second": 23.54, + "eval_sentence_accuracy": 0.7525795395409766, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.2499, + "step": 319000 + }, + { + "epoch": 0.01, + "learning_rate": 6.877777777777778e-05, + "loss": 0.4692, + "step": 319100 + }, + { + "epoch": 0.01, + "learning_rate": 6.876767676767677e-05, + "loss": 0.4678, + "step": 319200 + }, + { + "epoch": 0.01, + "learning_rate": 6.875757575757576e-05, + "loss": 0.467, + "step": 319300 + }, + { + "epoch": 0.01, + "learning_rate": 6.874747474747476e-05, + "loss": 0.4714, + "step": 319400 + }, + { + "epoch": 0.01, + "learning_rate": 6.873737373737375e-05, + "loss": 0.4698, + "step": 319500 + }, + { + "epoch": 0.01, + "learning_rate": 6.872727272727273e-05, + "loss": 0.4629, + "step": 319600 + }, + { + "epoch": 0.01, + "learning_rate": 6.871717171717172e-05, + "loss": 0.4654, + "step": 319700 + }, + { + "epoch": 0.01, + "learning_rate": 6.870707070707071e-05, + "loss": 0.4682, + "step": 319800 + }, + { + "epoch": 0.01, + "learning_rate": 6.86969696969697e-05, + "loss": 0.4636, + "step": 319900 + }, + { + "epoch": 0.01, + "learning_rate": 6.86868686868687e-05, + "loss": 0.4631, + "step": 320000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.467319749055994, + "eval_average_loss_on_sentence_tokens": 0.35087824304320164, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.4620312452316284, + "eval_non_padding_tokens_in_labels": 133.52615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3793, + "eval_padding_tokens_in_labels": 378.47385, + "eval_reconstruction_accuracy": 0.914399422241275, + "eval_runtime": 200.209, + "eval_samples_per_second": 24.974, + "eval_sentence_accuracy": 0.7817036624975326, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.245775, + "step": 320000 + }, + { + "epoch": 0.01, + "learning_rate": 6.867676767676769e-05, + "loss": 0.4633, + "step": 320100 + }, + { + "epoch": 0.01, + "learning_rate": 6.866666666666666e-05, + "loss": 0.468, + "step": 320200 + }, + { + "epoch": 0.01, + "learning_rate": 6.865656565656567e-05, + "loss": 0.4679, + "step": 320300 + }, + { + "epoch": 0.01, + "learning_rate": 6.864646464646465e-05, + "loss": 0.4645, + "step": 320400 + }, + { + "epoch": 0.01, + "learning_rate": 6.863636363636364e-05, + "loss": 0.4693, + "step": 320500 + }, + { + "epoch": 0.01, + "learning_rate": 6.862626262626263e-05, + "loss": 0.4698, + "step": 320600 + }, + { + "epoch": 0.01, + "learning_rate": 6.861616161616162e-05, + "loss": 0.4711, + "step": 320700 + }, + { + "epoch": 0.01, + "learning_rate": 6.86060606060606e-05, + "loss": 0.4643, + "step": 320800 + }, + { + "epoch": 0.01, + "learning_rate": 6.859595959595961e-05, + "loss": 0.4667, + "step": 320900 + }, + { + "epoch": 0.01, + "learning_rate": 6.858585858585859e-05, + "loss": 0.4704, + "step": 321000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.466713659083787, + "eval_average_loss_on_sentence_tokens": 0.39042755120480627, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.46333009004592896, + "eval_non_padding_tokens_in_labels": 133.5139, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37465, + "eval_padding_tokens_in_labels": 378.4861, + "eval_reconstruction_accuracy": 0.9144975222185683, + "eval_runtime": 218.363, + "eval_samples_per_second": 22.898, + "eval_sentence_accuracy": 0.7601925458036499, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.2496, + "step": 321000 + }, + { + "epoch": 0.02, + "learning_rate": 6.857575757575758e-05, + "loss": 0.4717, + "step": 321100 + }, + { + "epoch": 0.02, + "learning_rate": 6.856565656565657e-05, + "loss": 0.4675, + "step": 321200 + }, + { + "epoch": 0.02, + "learning_rate": 6.855555555555556e-05, + "loss": 0.4652, + "step": 321300 + }, + { + "epoch": 0.02, + "learning_rate": 6.854545454545454e-05, + "loss": 0.4689, + "step": 321400 + }, + { + "epoch": 0.02, + "learning_rate": 6.853535353535355e-05, + "loss": 0.4676, + "step": 321500 + }, + { + "epoch": 0.02, + "learning_rate": 6.852525252525252e-05, + "loss": 0.4684, + "step": 321600 + }, + { + "epoch": 0.02, + "learning_rate": 6.851515151515152e-05, + "loss": 0.4695, + "step": 321700 + }, + { + "epoch": 0.02, + "learning_rate": 6.850505050505051e-05, + "loss": 0.4622, + "step": 321800 + }, + { + "epoch": 0.02, + "learning_rate": 6.84949494949495e-05, + "loss": 0.4651, + "step": 321900 + }, + { + "epoch": 0.02, + "learning_rate": 6.848484848484848e-05, + "loss": 0.4684, + "step": 322000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4670196937223013, + "eval_average_loss_on_sentence_tokens": 0.3714943116892392, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.46269530057907104, + "eval_non_padding_tokens_in_labels": 133.5022, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37655, + "eval_padding_tokens_in_labels": 378.4978, + "eval_reconstruction_accuracy": 0.9143700164425573, + "eval_runtime": 189.0917, + "eval_samples_per_second": 26.442, + "eval_sentence_accuracy": 0.7737900838013889, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24797499999999992, + "step": 322000 + }, + { + "epoch": 0.02, + "learning_rate": 6.847474747474748e-05, + "loss": 0.4686, + "step": 322100 + }, + { + "epoch": 0.02, + "learning_rate": 6.846464646464646e-05, + "loss": 0.4675, + "step": 322200 + }, + { + "epoch": 0.02, + "learning_rate": 6.845454545454546e-05, + "loss": 0.4677, + "step": 322300 + }, + { + "epoch": 0.02, + "learning_rate": 6.844444444444445e-05, + "loss": 0.4682, + "step": 322400 + }, + { + "epoch": 0.02, + "learning_rate": 6.843434343434344e-05, + "loss": 0.4646, + "step": 322500 + }, + { + "epoch": 0.02, + "learning_rate": 6.842424242424242e-05, + "loss": 0.4668, + "step": 322600 + }, + { + "epoch": 0.02, + "learning_rate": 6.841414141414142e-05, + "loss": 0.4661, + "step": 322700 + }, + { + "epoch": 0.02, + "learning_rate": 6.84040404040404e-05, + "loss": 0.4662, + "step": 322800 + }, + { + "epoch": 0.02, + "learning_rate": 6.83939393939394e-05, + "loss": 0.4669, + "step": 322900 + }, + { + "epoch": 0.02, + "learning_rate": 6.838383838383839e-05, + "loss": 0.469, + "step": 323000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.46586915211108, + "eval_average_loss_on_sentence_tokens": 0.4226055842130778, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.46394529938697815, + "eval_non_padding_tokens_in_labels": 133.4853, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3805, + "eval_padding_tokens_in_labels": 378.5147, + "eval_reconstruction_accuracy": 0.9144880038352267, + "eval_runtime": 178.0607, + "eval_samples_per_second": 28.08, + "eval_sentence_accuracy": 0.742288298311411, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2499, + "step": 323000 + }, + { + "epoch": 0.02, + "learning_rate": 6.837373737373738e-05, + "loss": 0.4634, + "step": 323100 + }, + { + "epoch": 0.02, + "learning_rate": 6.836363636363637e-05, + "loss": 0.467, + "step": 323200 + }, + { + "epoch": 0.02, + "learning_rate": 6.835353535353536e-05, + "loss": 0.4652, + "step": 323300 + }, + { + "epoch": 0.02, + "learning_rate": 6.834343434343434e-05, + "loss": 0.4635, + "step": 323400 + }, + { + "epoch": 0.02, + "learning_rate": 6.833333333333333e-05, + "loss": 0.4695, + "step": 323500 + }, + { + "epoch": 0.02, + "learning_rate": 6.832323232323232e-05, + "loss": 0.4676, + "step": 323600 + }, + { + "epoch": 0.02, + "learning_rate": 6.831313131313132e-05, + "loss": 0.4622, + "step": 323700 + }, + { + "epoch": 0.02, + "learning_rate": 6.830303030303031e-05, + "loss": 0.4651, + "step": 323800 + }, + { + "epoch": 0.02, + "learning_rate": 6.82929292929293e-05, + "loss": 0.4702, + "step": 323900 + }, + { + "epoch": 0.02, + "learning_rate": 6.828282828282828e-05, + "loss": 0.4643, + "step": 324000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4664631212154839, + "eval_average_loss_on_sentence_tokens": 0.4213774046889405, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4644531309604645, + "eval_non_padding_tokens_in_labels": 133.53795, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3756, + "eval_padding_tokens_in_labels": 378.46205, + "eval_reconstruction_accuracy": 0.9144530468879706, + "eval_runtime": 204.9671, + "eval_samples_per_second": 24.394, + "eval_sentence_accuracy": 0.7364383512480486, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 324000 + }, + { + "epoch": 0.02, + "learning_rate": 6.827272727272728e-05, + "loss": 0.4638, + "step": 324100 + }, + { + "epoch": 0.02, + "learning_rate": 6.826262626262626e-05, + "loss": 0.4688, + "step": 324200 + }, + { + "epoch": 0.02, + "learning_rate": 6.825252525252525e-05, + "loss": 0.4672, + "step": 324300 + }, + { + "epoch": 0.02, + "learning_rate": 6.824242424242425e-05, + "loss": 0.4641, + "step": 324400 + }, + { + "epoch": 0.02, + "learning_rate": 6.823232323232324e-05, + "loss": 0.4664, + "step": 324500 + }, + { + "epoch": 0.02, + "learning_rate": 6.822222222222222e-05, + "loss": 0.4665, + "step": 324600 + }, + { + "epoch": 0.02, + "learning_rate": 6.821212121212122e-05, + "loss": 0.4642, + "step": 324700 + }, + { + "epoch": 0.02, + "learning_rate": 6.82020202020202e-05, + "loss": 0.4611, + "step": 324800 + }, + { + "epoch": 0.02, + "learning_rate": 6.819191919191919e-05, + "loss": 0.4638, + "step": 324900 + }, + { + "epoch": 0.02, + "learning_rate": 6.818181818181818e-05, + "loss": 0.4668, + "step": 325000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.46657716518714615, + "eval_average_loss_on_sentence_tokens": 0.4008487829660698, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.4635546803474426, + "eval_non_padding_tokens_in_labels": 133.53205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37645, + "eval_padding_tokens_in_labels": 378.46795, + "eval_reconstruction_accuracy": 0.9144185930425841, + "eval_runtime": 187.6306, + "eval_samples_per_second": 26.648, + "eval_sentence_accuracy": 0.7469673587309563, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2499, + "step": 325000 + }, + { + "epoch": 0.02, + "learning_rate": 6.817171717171718e-05, + "loss": 0.4666, + "step": 325100 + }, + { + "epoch": 0.02, + "learning_rate": 6.816161616161615e-05, + "loss": 0.4688, + "step": 325200 + }, + { + "epoch": 0.02, + "learning_rate": 6.815151515151516e-05, + "loss": 0.4665, + "step": 325300 + }, + { + "epoch": 0.02, + "learning_rate": 6.814141414141414e-05, + "loss": 0.4645, + "step": 325400 + }, + { + "epoch": 0.02, + "learning_rate": 6.813131313131313e-05, + "loss": 0.4646, + "step": 325500 + }, + { + "epoch": 0.02, + "learning_rate": 6.812121212121212e-05, + "loss": 0.467, + "step": 325600 + }, + { + "epoch": 0.02, + "learning_rate": 6.811111111111111e-05, + "loss": 0.4655, + "step": 325700 + }, + { + "epoch": 0.02, + "learning_rate": 6.81010101010101e-05, + "loss": 0.4652, + "step": 325800 + }, + { + "epoch": 0.02, + "learning_rate": 6.80909090909091e-05, + "loss": 0.4695, + "step": 325900 + }, + { + "epoch": 0.02, + "learning_rate": 6.808080808080809e-05, + "loss": 0.4646, + "step": 326000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.46612270053726707, + "eval_average_loss_on_sentence_tokens": 0.43633698297429374, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.4647558629512787, + "eval_non_padding_tokens_in_labels": 133.5365, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37425, + "eval_padding_tokens_in_labels": 378.4635, + "eval_reconstruction_accuracy": 0.914568059736807, + "eval_runtime": 189.3162, + "eval_samples_per_second": 26.411, + "eval_sentence_accuracy": 0.7241597430330001, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 326000 + }, + { + "epoch": 0.02, + "learning_rate": 6.807070707070707e-05, + "loss": 0.4698, + "step": 326100 + }, + { + "epoch": 0.02, + "learning_rate": 6.806060606060607e-05, + "loss": 0.4654, + "step": 326200 + }, + { + "epoch": 0.02, + "learning_rate": 6.805050505050505e-05, + "loss": 0.4666, + "step": 326300 + }, + { + "epoch": 0.02, + "learning_rate": 6.804040404040405e-05, + "loss": 0.4674, + "step": 326400 + }, + { + "epoch": 0.02, + "learning_rate": 6.803030303030304e-05, + "loss": 0.4643, + "step": 326500 + }, + { + "epoch": 0.02, + "learning_rate": 6.802020202020203e-05, + "loss": 0.4666, + "step": 326600 + }, + { + "epoch": 0.02, + "learning_rate": 6.801010101010101e-05, + "loss": 0.4664, + "step": 326700 + }, + { + "epoch": 0.02, + "learning_rate": 6.800000000000001e-05, + "loss": 0.4667, + "step": 326800 + }, + { + "epoch": 0.02, + "learning_rate": 6.798989898989899e-05, + "loss": 0.4656, + "step": 326900 + }, + { + "epoch": 0.02, + "learning_rate": 6.797979797979798e-05, + "loss": 0.4619, + "step": 327000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.46582191455780353, + "eval_average_loss_on_sentence_tokens": 0.39547654582895087, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.46259766817092896, + "eval_non_padding_tokens_in_labels": 133.54615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38585, + "eval_padding_tokens_in_labels": 378.45385, + "eval_reconstruction_accuracy": 0.9146394862459619, + "eval_runtime": 189.8736, + "eval_samples_per_second": 26.333, + "eval_sentence_accuracy": 0.7527724442370843, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.249975, + "step": 327000 + }, + { + "epoch": 0.02, + "learning_rate": 6.796969696969698e-05, + "loss": 0.4684, + "step": 327100 + }, + { + "epoch": 0.02, + "learning_rate": 6.795959595959597e-05, + "loss": 0.4638, + "step": 327200 + }, + { + "epoch": 0.02, + "learning_rate": 6.794949494949495e-05, + "loss": 0.4676, + "step": 327300 + }, + { + "epoch": 0.02, + "learning_rate": 6.793939393939395e-05, + "loss": 0.4663, + "step": 327400 + }, + { + "epoch": 0.02, + "learning_rate": 6.792929292929293e-05, + "loss": 0.4634, + "step": 327500 + }, + { + "epoch": 0.02, + "learning_rate": 6.791919191919192e-05, + "loss": 0.4651, + "step": 327600 + }, + { + "epoch": 0.02, + "learning_rate": 6.790909090909091e-05, + "loss": 0.4672, + "step": 327700 + }, + { + "epoch": 0.02, + "learning_rate": 6.78989898989899e-05, + "loss": 0.4726, + "step": 327800 + }, + { + "epoch": 0.02, + "learning_rate": 6.788888888888888e-05, + "loss": 0.4658, + "step": 327900 + }, + { + "epoch": 0.02, + "learning_rate": 6.787878787878789e-05, + "loss": 0.4655, + "step": 328000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4658758826031563, + "eval_average_loss_on_sentence_tokens": 0.3791439223509196, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.46196287870407104, + "eval_non_padding_tokens_in_labels": 133.54505, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3859, + "eval_padding_tokens_in_labels": 378.45495, + "eval_reconstruction_accuracy": 0.9145787299457192, + "eval_runtime": 203.4853, + "eval_samples_per_second": 24.572, + "eval_sentence_accuracy": 0.7650555386078561, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 328000 + }, + { + "epoch": 0.02, + "learning_rate": 6.786868686868687e-05, + "loss": 0.4635, + "step": 328100 + }, + { + "epoch": 0.02, + "learning_rate": 6.785858585858586e-05, + "loss": 0.465, + "step": 328200 + }, + { + "epoch": 0.02, + "learning_rate": 6.784848484848485e-05, + "loss": 0.4671, + "step": 328300 + }, + { + "epoch": 0.02, + "learning_rate": 6.783838383838384e-05, + "loss": 0.4675, + "step": 328400 + }, + { + "epoch": 0.02, + "learning_rate": 6.782828282828284e-05, + "loss": 0.4693, + "step": 328500 + }, + { + "epoch": 0.02, + "learning_rate": 6.781818181818183e-05, + "loss": 0.4671, + "step": 328600 + }, + { + "epoch": 0.02, + "learning_rate": 6.78080808080808e-05, + "loss": 0.4687, + "step": 328700 + }, + { + "epoch": 0.02, + "learning_rate": 6.77979797979798e-05, + "loss": 0.4643, + "step": 328800 + }, + { + "epoch": 0.02, + "learning_rate": 6.778787878787879e-05, + "loss": 0.4685, + "step": 328900 + }, + { + "epoch": 0.02, + "learning_rate": 6.777777777777778e-05, + "loss": 0.4669, + "step": 329000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.46577658178399556, + "eval_average_loss_on_sentence_tokens": 0.3727941462907896, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4615527391433716, + "eval_non_padding_tokens_in_labels": 133.5251, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37175, + "eval_padding_tokens_in_labels": 378.4749, + "eval_reconstruction_accuracy": 0.9145539685812023, + "eval_runtime": 185.0409, + "eval_samples_per_second": 27.021, + "eval_sentence_accuracy": 0.7697076820930608, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 329000 + }, + { + "epoch": 0.02, + "learning_rate": 6.776767676767677e-05, + "loss": 0.4668, + "step": 329100 + }, + { + "epoch": 0.02, + "learning_rate": 6.775757575757577e-05, + "loss": 0.4643, + "step": 329200 + }, + { + "epoch": 0.02, + "learning_rate": 6.774747474747474e-05, + "loss": 0.4667, + "step": 329300 + }, + { + "epoch": 0.02, + "learning_rate": 6.773737373737375e-05, + "loss": 0.4674, + "step": 329400 + }, + { + "epoch": 0.02, + "learning_rate": 6.772727272727273e-05, + "loss": 0.4658, + "step": 329500 + }, + { + "epoch": 0.02, + "learning_rate": 6.771717171717172e-05, + "loss": 0.4647, + "step": 329600 + }, + { + "epoch": 0.02, + "learning_rate": 6.770707070707071e-05, + "loss": 0.4684, + "step": 329700 + }, + { + "epoch": 0.02, + "learning_rate": 6.76969696969697e-05, + "loss": 0.4646, + "step": 329800 + }, + { + "epoch": 0.02, + "learning_rate": 6.768686868686868e-05, + "loss": 0.4635, + "step": 329900 + }, + { + "epoch": 0.02, + "learning_rate": 6.767676767676769e-05, + "loss": 0.4604, + "step": 330000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.4654444666042782, + "eval_average_loss_on_sentence_tokens": 0.41131137110282945, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.4631054699420929, + "eval_non_padding_tokens_in_labels": 133.5468, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3905, + "eval_padding_tokens_in_labels": 378.4532, + "eval_reconstruction_accuracy": 0.9147261228551622, + "eval_runtime": 199.4751, + "eval_samples_per_second": 25.066, + "eval_sentence_accuracy": 0.7380623396199328, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2496, + "step": 330000 + }, + { + "epoch": 0.02, + "learning_rate": 6.766666666666667e-05, + "loss": 0.4676, + "step": 330100 + }, + { + "epoch": 0.02, + "learning_rate": 6.765656565656566e-05, + "loss": 0.4631, + "step": 330200 + }, + { + "epoch": 0.02, + "learning_rate": 6.764646464646465e-05, + "loss": 0.4649, + "step": 330300 + }, + { + "epoch": 0.02, + "learning_rate": 6.763636363636364e-05, + "loss": 0.4624, + "step": 330400 + }, + { + "epoch": 0.02, + "learning_rate": 6.762626262626262e-05, + "loss": 0.4642, + "step": 330500 + }, + { + "epoch": 0.02, + "learning_rate": 6.761616161616163e-05, + "loss": 0.464, + "step": 330600 + }, + { + "epoch": 0.02, + "learning_rate": 6.76060606060606e-05, + "loss": 0.4685, + "step": 330700 + }, + { + "epoch": 0.02, + "learning_rate": 6.75959595959596e-05, + "loss": 0.4673, + "step": 330800 + }, + { + "epoch": 0.02, + "learning_rate": 6.758585858585859e-05, + "loss": 0.4684, + "step": 330900 + }, + { + "epoch": 0.03, + "learning_rate": 6.757575757575758e-05, + "loss": 0.4669, + "step": 331000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4658720240405577, + "eval_average_loss_on_sentence_tokens": 0.39061543020910433, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.46244141459465027, + "eval_non_padding_tokens_in_labels": 133.5541, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39, + "eval_padding_tokens_in_labels": 378.4459, + "eval_reconstruction_accuracy": 0.9145385650969612, + "eval_runtime": 180.4302, + "eval_samples_per_second": 27.712, + "eval_sentence_accuracy": 0.7535620076444094, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 331000 + }, + { + "epoch": 0.03, + "learning_rate": 6.756565656565656e-05, + "loss": 0.4627, + "step": 331100 + }, + { + "epoch": 0.03, + "learning_rate": 6.755555555555557e-05, + "loss": 0.4619, + "step": 331200 + }, + { + "epoch": 0.03, + "learning_rate": 6.754545454545454e-05, + "loss": 0.4609, + "step": 331300 + }, + { + "epoch": 0.03, + "learning_rate": 6.753535353535354e-05, + "loss": 0.4648, + "step": 331400 + }, + { + "epoch": 0.03, + "learning_rate": 6.752525252525253e-05, + "loss": 0.4698, + "step": 331500 + }, + { + "epoch": 0.03, + "learning_rate": 6.751515151515152e-05, + "loss": 0.4668, + "step": 331600 + }, + { + "epoch": 0.03, + "learning_rate": 6.75050505050505e-05, + "loss": 0.4669, + "step": 331700 + }, + { + "epoch": 0.03, + "learning_rate": 6.74949494949495e-05, + "loss": 0.4654, + "step": 331800 + }, + { + "epoch": 0.03, + "learning_rate": 6.748484848484848e-05, + "loss": 0.4658, + "step": 331900 + }, + { + "epoch": 0.03, + "learning_rate": 6.747474747474747e-05, + "loss": 0.4656, + "step": 332000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.46426198265464325, + "eval_average_loss_on_sentence_tokens": 0.3590110774218402, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.45947265625, + "eval_non_padding_tokens_in_labels": 133.5296, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3857, + "eval_padding_tokens_in_labels": 378.4704, + "eval_reconstruction_accuracy": 0.9148633100204709, + "eval_runtime": 196.924, + "eval_samples_per_second": 25.391, + "eval_sentence_accuracy": 0.7821029303569186, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 332000 + }, + { + "epoch": 0.03, + "learning_rate": 6.746464646464647e-05, + "loss": 0.4643, + "step": 332100 + }, + { + "epoch": 0.03, + "learning_rate": 6.745454545454546e-05, + "loss": 0.4639, + "step": 332200 + }, + { + "epoch": 0.03, + "learning_rate": 6.744444444444445e-05, + "loss": 0.4631, + "step": 332300 + }, + { + "epoch": 0.03, + "learning_rate": 6.743434343434344e-05, + "loss": 0.4634, + "step": 332400 + }, + { + "epoch": 0.03, + "learning_rate": 6.742424242424242e-05, + "loss": 0.4631, + "step": 332500 + }, + { + "epoch": 0.03, + "learning_rate": 6.741414141414141e-05, + "loss": 0.4667, + "step": 332600 + }, + { + "epoch": 0.03, + "learning_rate": 6.74040404040404e-05, + "loss": 0.4663, + "step": 332700 + }, + { + "epoch": 0.03, + "learning_rate": 6.73939393939394e-05, + "loss": 0.4651, + "step": 332800 + }, + { + "epoch": 0.03, + "learning_rate": 6.738383838383839e-05, + "loss": 0.466, + "step": 332900 + }, + { + "epoch": 0.03, + "learning_rate": 6.737373737373738e-05, + "loss": 0.4662, + "step": 333000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4657312693397126, + "eval_average_loss_on_sentence_tokens": 0.3981056683460804, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.46268555521965027, + "eval_non_padding_tokens_in_labels": 133.54845, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37755, + "eval_padding_tokens_in_labels": 378.45155, + "eval_reconstruction_accuracy": 0.9145994321650477, + "eval_runtime": 187.8863, + "eval_samples_per_second": 26.612, + "eval_sentence_accuracy": 0.7475550451307266, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2499, + "step": 333000 + }, + { + "epoch": 0.03, + "learning_rate": 6.736363636363636e-05, + "loss": 0.4662, + "step": 333100 + }, + { + "epoch": 0.03, + "learning_rate": 6.735353535353536e-05, + "loss": 0.4679, + "step": 333200 + }, + { + "epoch": 0.03, + "learning_rate": 6.734343434343434e-05, + "loss": 0.4633, + "step": 333300 + }, + { + "epoch": 0.03, + "learning_rate": 6.733333333333333e-05, + "loss": 0.4665, + "step": 333400 + }, + { + "epoch": 0.03, + "learning_rate": 6.732323232323233e-05, + "loss": 0.4664, + "step": 333500 + }, + { + "epoch": 0.03, + "learning_rate": 6.731313131313132e-05, + "loss": 0.4652, + "step": 333600 + }, + { + "epoch": 0.03, + "learning_rate": 6.73030303030303e-05, + "loss": 0.4693, + "step": 333700 + }, + { + "epoch": 0.03, + "learning_rate": 6.72929292929293e-05, + "loss": 0.4632, + "step": 333800 + }, + { + "epoch": 0.03, + "learning_rate": 6.728282828282828e-05, + "loss": 0.4628, + "step": 333900 + }, + { + "epoch": 0.03, + "learning_rate": 6.727272727272727e-05, + "loss": 0.4666, + "step": 334000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.46544620904363904, + "eval_average_loss_on_sentence_tokens": 0.4059272985352742, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.46268555521965027, + "eval_non_padding_tokens_in_labels": 133.5154, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36755, + "eval_padding_tokens_in_labels": 378.4846, + "eval_reconstruction_accuracy": 0.9144235820441948, + "eval_runtime": 195.658, + "eval_samples_per_second": 25.555, + "eval_sentence_accuracy": 0.7525032748936781, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.25, + "step": 334000 + }, + { + "epoch": 0.03, + "learning_rate": 6.726262626262626e-05, + "loss": 0.4617, + "step": 334100 + }, + { + "epoch": 0.03, + "learning_rate": 6.725252525252526e-05, + "loss": 0.464, + "step": 334200 + }, + { + "epoch": 0.03, + "learning_rate": 6.724242424242424e-05, + "loss": 0.4652, + "step": 334300 + }, + { + "epoch": 0.03, + "learning_rate": 6.723232323232324e-05, + "loss": 0.464, + "step": 334400 + }, + { + "epoch": 0.03, + "learning_rate": 6.722222222222223e-05, + "loss": 0.4637, + "step": 334500 + }, + { + "epoch": 0.03, + "learning_rate": 6.721212121212121e-05, + "loss": 0.4661, + "step": 334600 + }, + { + "epoch": 0.03, + "learning_rate": 6.720202020202022e-05, + "loss": 0.4667, + "step": 334700 + }, + { + "epoch": 0.03, + "learning_rate": 6.71919191919192e-05, + "loss": 0.4661, + "step": 334800 + }, + { + "epoch": 0.03, + "learning_rate": 6.718181818181819e-05, + "loss": 0.468, + "step": 334900 + }, + { + "epoch": 0.03, + "learning_rate": 6.717171717171718e-05, + "loss": 0.4622, + "step": 335000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4646571075456345, + "eval_average_loss_on_sentence_tokens": 0.3783188575874442, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.46077147126197815, + "eval_non_padding_tokens_in_labels": 133.51425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3761, + "eval_padding_tokens_in_labels": 378.48575, + "eval_reconstruction_accuracy": 0.9147082061364775, + "eval_runtime": 212.9064, + "eval_samples_per_second": 23.485, + "eval_sentence_accuracy": 0.7671999210436593, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.248775, + "step": 335000 + }, + { + "epoch": 0.03, + "learning_rate": 6.716161616161617e-05, + "loss": 0.4648, + "step": 335100 + }, + { + "epoch": 0.03, + "learning_rate": 6.715151515151515e-05, + "loss": 0.465, + "step": 335200 + }, + { + "epoch": 0.03, + "learning_rate": 6.714141414141416e-05, + "loss": 0.467, + "step": 335300 + }, + { + "epoch": 0.03, + "learning_rate": 6.713131313131313e-05, + "loss": 0.4678, + "step": 335400 + }, + { + "epoch": 0.03, + "learning_rate": 6.712121212121213e-05, + "loss": 0.4614, + "step": 335500 + }, + { + "epoch": 0.03, + "learning_rate": 6.711111111111112e-05, + "loss": 0.4638, + "step": 335600 + }, + { + "epoch": 0.03, + "learning_rate": 6.710101010101011e-05, + "loss": 0.463, + "step": 335700 + }, + { + "epoch": 0.03, + "learning_rate": 6.709090909090909e-05, + "loss": 0.4674, + "step": 335800 + }, + { + "epoch": 0.03, + "learning_rate": 6.70808080808081e-05, + "loss": 0.4628, + "step": 335900 + }, + { + "epoch": 0.03, + "learning_rate": 6.707070707070707e-05, + "loss": 0.467, + "step": 336000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4642023489170626, + "eval_average_loss_on_sentence_tokens": 0.422404449526691, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.46241211891174316, + "eval_non_padding_tokens_in_labels": 133.51495, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38995, + "eval_padding_tokens_in_labels": 378.48505, + "eval_reconstruction_accuracy": 0.9147222858660325, + "eval_runtime": 206.6515, + "eval_samples_per_second": 24.195, + "eval_sentence_accuracy": 0.7424049383602204, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 336000 + }, + { + "epoch": 0.03, + "learning_rate": 6.706060606060606e-05, + "loss": 0.4668, + "step": 336100 + }, + { + "epoch": 0.03, + "learning_rate": 6.705050505050506e-05, + "loss": 0.4668, + "step": 336200 + }, + { + "epoch": 0.03, + "learning_rate": 6.704040404040405e-05, + "loss": 0.4644, + "step": 336300 + }, + { + "epoch": 0.03, + "learning_rate": 6.703030303030303e-05, + "loss": 0.4644, + "step": 336400 + }, + { + "epoch": 0.03, + "learning_rate": 6.702020202020203e-05, + "loss": 0.4681, + "step": 336500 + }, + { + "epoch": 0.03, + "learning_rate": 6.701010101010101e-05, + "loss": 0.4644, + "step": 336600 + }, + { + "epoch": 0.03, + "learning_rate": 6.7e-05, + "loss": 0.4629, + "step": 336700 + }, + { + "epoch": 0.03, + "learning_rate": 6.6989898989899e-05, + "loss": 0.4657, + "step": 336800 + }, + { + "epoch": 0.03, + "learning_rate": 6.697979797979799e-05, + "loss": 0.4642, + "step": 336900 + }, + { + "epoch": 0.03, + "learning_rate": 6.696969696969696e-05, + "loss": 0.4628, + "step": 337000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4641048028949797, + "eval_average_loss_on_sentence_tokens": 0.3905288286738258, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.4606640636920929, + "eval_non_padding_tokens_in_labels": 133.5539, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38475, + "eval_padding_tokens_in_labels": 378.4461, + "eval_reconstruction_accuracy": 0.9148472600981008, + "eval_runtime": 212.5189, + "eval_samples_per_second": 23.527, + "eval_sentence_accuracy": 0.7536337861359843, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 337000 + }, + { + "epoch": 0.03, + "learning_rate": 6.695959595959597e-05, + "loss": 0.4614, + "step": 337100 + }, + { + "epoch": 0.03, + "learning_rate": 6.694949494949495e-05, + "loss": 0.4636, + "step": 337200 + }, + { + "epoch": 0.03, + "learning_rate": 6.693939393939394e-05, + "loss": 0.4596, + "step": 337300 + }, + { + "epoch": 0.03, + "learning_rate": 6.692929292929293e-05, + "loss": 0.4641, + "step": 337400 + }, + { + "epoch": 0.03, + "learning_rate": 6.691919191919192e-05, + "loss": 0.4625, + "step": 337500 + }, + { + "epoch": 0.03, + "learning_rate": 6.690909090909092e-05, + "loss": 0.4631, + "step": 337600 + }, + { + "epoch": 0.03, + "learning_rate": 6.689898989898991e-05, + "loss": 0.4633, + "step": 337700 + }, + { + "epoch": 0.03, + "learning_rate": 6.688888888888889e-05, + "loss": 0.4632, + "step": 337800 + }, + { + "epoch": 0.03, + "learning_rate": 6.687878787878788e-05, + "loss": 0.4643, + "step": 337900 + }, + { + "epoch": 0.03, + "learning_rate": 6.686868686868687e-05, + "loss": 0.4624, + "step": 338000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.46447002299778817, + "eval_average_loss_on_sentence_tokens": 0.41146191777797975, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.46210938692092896, + "eval_non_padding_tokens_in_labels": 133.5144, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37905, + "eval_padding_tokens_in_labels": 378.4856, + "eval_reconstruction_accuracy": 0.9147002681543275, + "eval_runtime": 249.0182, + "eval_samples_per_second": 20.079, + "eval_sentence_accuracy": 0.7396549249017532, + "eval_steps_per_second": 0.052, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 338000 + }, + { + "epoch": 0.03, + "learning_rate": 6.685858585858586e-05, + "loss": 0.4663, + "step": 338100 + }, + { + "epoch": 0.03, + "learning_rate": 6.684848484848485e-05, + "loss": 0.4647, + "step": 338200 + }, + { + "epoch": 0.03, + "learning_rate": 6.683838383838385e-05, + "loss": 0.464, + "step": 338300 + }, + { + "epoch": 0.03, + "learning_rate": 6.682828282828283e-05, + "loss": 0.4643, + "step": 338400 + }, + { + "epoch": 0.03, + "learning_rate": 6.681818181818183e-05, + "loss": 0.4686, + "step": 338500 + }, + { + "epoch": 0.03, + "learning_rate": 6.680808080808081e-05, + "loss": 0.4693, + "step": 338600 + }, + { + "epoch": 0.03, + "learning_rate": 6.67979797979798e-05, + "loss": 0.4605, + "step": 338700 + }, + { + "epoch": 0.03, + "learning_rate": 6.678787878787879e-05, + "loss": 0.4672, + "step": 338800 + }, + { + "epoch": 0.03, + "learning_rate": 6.677777777777779e-05, + "loss": 0.465, + "step": 338900 + }, + { + "epoch": 0.03, + "learning_rate": 6.676767676767676e-05, + "loss": 0.463, + "step": 339000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4634113638079084, + "eval_average_loss_on_sentence_tokens": 0.3832509857899088, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4597753882408142, + "eval_non_padding_tokens_in_labels": 133.4913, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38105, + "eval_padding_tokens_in_labels": 378.5087, + "eval_reconstruction_accuracy": 0.9148679894016145, + "eval_runtime": 227.4012, + "eval_samples_per_second": 21.988, + "eval_sentence_accuracy": 0.7647773969530031, + "eval_steps_per_second": 0.057, + "eval_variance_shuffling_prob": 0.248775, + "step": 339000 + }, + { + "epoch": 0.03, + "learning_rate": 6.675757575757577e-05, + "loss": 0.4647, + "step": 339100 + }, + { + "epoch": 0.03, + "learning_rate": 6.674747474747475e-05, + "loss": 0.4636, + "step": 339200 + }, + { + "epoch": 0.03, + "learning_rate": 6.673737373737374e-05, + "loss": 0.4691, + "step": 339300 + }, + { + "epoch": 0.03, + "learning_rate": 6.672727272727273e-05, + "loss": 0.4667, + "step": 339400 + }, + { + "epoch": 0.03, + "learning_rate": 6.671717171717172e-05, + "loss": 0.4618, + "step": 339500 + }, + { + "epoch": 0.03, + "learning_rate": 6.67070707070707e-05, + "loss": 0.4646, + "step": 339600 + }, + { + "epoch": 0.03, + "learning_rate": 6.669696969696971e-05, + "loss": 0.4653, + "step": 339700 + }, + { + "epoch": 0.03, + "learning_rate": 6.668686868686869e-05, + "loss": 0.4644, + "step": 339800 + }, + { + "epoch": 0.03, + "learning_rate": 6.667676767676768e-05, + "loss": 0.4661, + "step": 339900 + }, + { + "epoch": 0.03, + "learning_rate": 6.666666666666667e-05, + "loss": 0.4621, + "step": 340000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.4644415461737679, + "eval_average_loss_on_sentence_tokens": 0.37569862855089964, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4604785144329071, + "eval_non_padding_tokens_in_labels": 133.5423, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3844, + "eval_padding_tokens_in_labels": 378.4577, + "eval_reconstruction_accuracy": 0.9147195725191571, + "eval_runtime": 220.9373, + "eval_samples_per_second": 22.631, + "eval_sentence_accuracy": 0.7664776499721858, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.248775, + "step": 340000 + }, + { + "epoch": 0.03, + "learning_rate": 6.665656565656566e-05, + "loss": 0.4639, + "step": 340100 + }, + { + "epoch": 0.03, + "learning_rate": 6.664646464646464e-05, + "loss": 0.4633, + "step": 340200 + }, + { + "epoch": 0.03, + "learning_rate": 6.663636363636365e-05, + "loss": 0.4631, + "step": 340300 + }, + { + "epoch": 0.03, + "learning_rate": 6.662626262626262e-05, + "loss": 0.4616, + "step": 340400 + }, + { + "epoch": 0.03, + "learning_rate": 6.661616161616162e-05, + "loss": 0.4646, + "step": 340500 + }, + { + "epoch": 0.03, + "learning_rate": 6.660606060606061e-05, + "loss": 0.4628, + "step": 340600 + }, + { + "epoch": 0.03, + "learning_rate": 6.65959595959596e-05, + "loss": 0.4637, + "step": 340700 + }, + { + "epoch": 0.03, + "learning_rate": 6.658585858585858e-05, + "loss": 0.4635, + "step": 340800 + }, + { + "epoch": 0.03, + "learning_rate": 6.657575757575758e-05, + "loss": 0.4592, + "step": 340900 + }, + { + "epoch": 0.04, + "learning_rate": 6.656565656565656e-05, + "loss": 0.4643, + "step": 341000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.46323580474509013, + "eval_average_loss_on_sentence_tokens": 0.395376519534006, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.460205078125, + "eval_non_padding_tokens_in_labels": 133.52065, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3903, + "eval_padding_tokens_in_labels": 378.47935, + "eval_reconstruction_accuracy": 0.9148912613649717, + "eval_runtime": 239.1441, + "eval_samples_per_second": 20.908, + "eval_sentence_accuracy": 0.751938019272525, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.25, + "step": 341000 + }, + { + "epoch": 0.04, + "learning_rate": 6.655555555555555e-05, + "loss": 0.465, + "step": 341100 + }, + { + "epoch": 0.04, + "learning_rate": 6.654545454545455e-05, + "loss": 0.4663, + "step": 341200 + }, + { + "epoch": 0.04, + "learning_rate": 6.653535353535354e-05, + "loss": 0.4628, + "step": 341300 + }, + { + "epoch": 0.04, + "learning_rate": 6.652525252525253e-05, + "loss": 0.4672, + "step": 341400 + }, + { + "epoch": 0.04, + "learning_rate": 6.651515151515152e-05, + "loss": 0.4641, + "step": 341500 + }, + { + "epoch": 0.04, + "learning_rate": 6.65050505050505e-05, + "loss": 0.4638, + "step": 341600 + }, + { + "epoch": 0.04, + "learning_rate": 6.649494949494949e-05, + "loss": 0.4644, + "step": 341700 + }, + { + "epoch": 0.04, + "learning_rate": 6.648484848484848e-05, + "loss": 0.4618, + "step": 341800 + }, + { + "epoch": 0.04, + "learning_rate": 6.647474747474748e-05, + "loss": 0.4671, + "step": 341900 + }, + { + "epoch": 0.04, + "learning_rate": 6.646464646464647e-05, + "loss": 0.4639, + "step": 342000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4638255238520706, + "eval_average_loss_on_sentence_tokens": 0.4186439274908779, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.4617968797683716, + "eval_non_padding_tokens_in_labels": 133.51745, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37945, + "eval_padding_tokens_in_labels": 378.48255, + "eval_reconstruction_accuracy": 0.9148725839184269, + "eval_runtime": 244.3743, + "eval_samples_per_second": 20.46, + "eval_sentence_accuracy": 0.7310639366913705, + "eval_steps_per_second": 0.053, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 342000 + }, + { + "epoch": 0.04, + "learning_rate": 6.645454545454546e-05, + "loss": 0.4641, + "step": 342100 + }, + { + "epoch": 0.04, + "learning_rate": 6.644444444444444e-05, + "loss": 0.4627, + "step": 342200 + }, + { + "epoch": 0.04, + "learning_rate": 6.643434343434343e-05, + "loss": 0.4638, + "step": 342300 + }, + { + "epoch": 0.04, + "learning_rate": 6.642424242424242e-05, + "loss": 0.4651, + "step": 342400 + }, + { + "epoch": 0.04, + "learning_rate": 6.641414141414142e-05, + "loss": 0.4655, + "step": 342500 + }, + { + "epoch": 0.04, + "learning_rate": 6.640404040404041e-05, + "loss": 0.4656, + "step": 342600 + }, + { + "epoch": 0.04, + "learning_rate": 6.63939393939394e-05, + "loss": 0.4643, + "step": 342700 + }, + { + "epoch": 0.04, + "learning_rate": 6.638383838383838e-05, + "loss": 0.4608, + "step": 342800 + }, + { + "epoch": 0.04, + "learning_rate": 6.637373737373738e-05, + "loss": 0.4651, + "step": 342900 + }, + { + "epoch": 0.04, + "learning_rate": 6.636363636363638e-05, + "loss": 0.4603, + "step": 343000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.46320935581841677, + "eval_average_loss_on_sentence_tokens": 0.3576523700077478, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.45848631858825684, + "eval_non_padding_tokens_in_labels": 133.53205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38135, + "eval_padding_tokens_in_labels": 378.46795, + "eval_reconstruction_accuracy": 0.9148901079537644, + "eval_runtime": 238.0305, + "eval_samples_per_second": 21.006, + "eval_sentence_accuracy": 0.7765984172842608, + "eval_steps_per_second": 0.055, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 343000 + }, + { + "epoch": 0.04, + "learning_rate": 6.635353535353535e-05, + "loss": 0.4613, + "step": 343100 + }, + { + "epoch": 0.04, + "learning_rate": 6.634343434343435e-05, + "loss": 0.4636, + "step": 343200 + }, + { + "epoch": 0.04, + "learning_rate": 6.633333333333334e-05, + "loss": 0.4609, + "step": 343300 + }, + { + "epoch": 0.04, + "learning_rate": 6.632323232323233e-05, + "loss": 0.4655, + "step": 343400 + }, + { + "epoch": 0.04, + "learning_rate": 6.631313131313132e-05, + "loss": 0.4629, + "step": 343500 + }, + { + "epoch": 0.04, + "learning_rate": 6.630303030303031e-05, + "loss": 0.4664, + "step": 343600 + }, + { + "epoch": 0.04, + "learning_rate": 6.629292929292929e-05, + "loss": 0.4628, + "step": 343700 + }, + { + "epoch": 0.04, + "learning_rate": 6.62828282828283e-05, + "loss": 0.4624, + "step": 343800 + }, + { + "epoch": 0.04, + "learning_rate": 6.627272727272728e-05, + "loss": 0.4649, + "step": 343900 + }, + { + "epoch": 0.04, + "learning_rate": 6.626262626262627e-05, + "loss": 0.4632, + "step": 344000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4629776054633334, + "eval_average_loss_on_sentence_tokens": 0.40440340232780597, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4603515565395355, + "eval_non_padding_tokens_in_labels": 133.5111, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38725, + "eval_padding_tokens_in_labels": 378.4889, + "eval_reconstruction_accuracy": 0.914920703885298, + "eval_runtime": 270.4285, + "eval_samples_per_second": 18.489, + "eval_sentence_accuracy": 0.750502449441025, + "eval_steps_per_second": 0.048, + "eval_variance_shuffling_prob": 0.25, + "step": 344000 + }, + { + "epoch": 0.04, + "learning_rate": 6.625252525252526e-05, + "loss": 0.4601, + "step": 344100 + }, + { + "epoch": 0.04, + "learning_rate": 6.624242424242425e-05, + "loss": 0.4642, + "step": 344200 + }, + { + "epoch": 0.04, + "learning_rate": 6.623232323232323e-05, + "loss": 0.4627, + "step": 344300 + }, + { + "epoch": 0.04, + "learning_rate": 6.622222222222224e-05, + "loss": 0.466, + "step": 344400 + }, + { + "epoch": 0.04, + "learning_rate": 6.621212121212121e-05, + "loss": 0.4604, + "step": 344500 + }, + { + "epoch": 0.04, + "learning_rate": 6.62020202020202e-05, + "loss": 0.4673, + "step": 344600 + }, + { + "epoch": 0.04, + "learning_rate": 6.61919191919192e-05, + "loss": 0.4639, + "step": 344700 + }, + { + "epoch": 0.04, + "learning_rate": 6.618181818181819e-05, + "loss": 0.4595, + "step": 344800 + }, + { + "epoch": 0.04, + "learning_rate": 6.617171717171717e-05, + "loss": 0.4591, + "step": 344900 + }, + { + "epoch": 0.04, + "learning_rate": 6.616161616161617e-05, + "loss": 0.463, + "step": 345000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.46331224707769164, + "eval_average_loss_on_sentence_tokens": 0.416171979489943, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4611132740974426, + "eval_non_padding_tokens_in_labels": 133.51975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38165, + "eval_padding_tokens_in_labels": 378.48025, + "eval_reconstruction_accuracy": 0.9148039702426478, + "eval_runtime": 250.4285, + "eval_samples_per_second": 19.966, + "eval_sentence_accuracy": 0.7395248263857735, + "eval_steps_per_second": 0.052, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 345000 + }, + { + "epoch": 0.04, + "learning_rate": 6.615151515151515e-05, + "loss": 0.4628, + "step": 345100 + }, + { + "epoch": 0.04, + "learning_rate": 6.614141414141414e-05, + "loss": 0.4653, + "step": 345200 + }, + { + "epoch": 0.04, + "learning_rate": 6.613131313131314e-05, + "loss": 0.4623, + "step": 345300 + }, + { + "epoch": 0.04, + "learning_rate": 6.612121212121213e-05, + "loss": 0.4615, + "step": 345400 + }, + { + "epoch": 0.04, + "learning_rate": 6.611111111111111e-05, + "loss": 0.4668, + "step": 345500 + }, + { + "epoch": 0.04, + "learning_rate": 6.610101010101011e-05, + "loss": 0.4651, + "step": 345600 + }, + { + "epoch": 0.04, + "learning_rate": 6.609090909090909e-05, + "loss": 0.4635, + "step": 345700 + }, + { + "epoch": 0.04, + "learning_rate": 6.608080808080808e-05, + "loss": 0.4644, + "step": 345800 + }, + { + "epoch": 0.04, + "learning_rate": 6.607070707070707e-05, + "loss": 0.4712, + "step": 345900 + }, + { + "epoch": 0.04, + "learning_rate": 6.606060606060607e-05, + "loss": 0.4639, + "step": 346000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4632701055836803, + "eval_average_loss_on_sentence_tokens": 0.40625213037747854, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.46061524748802185, + "eval_non_padding_tokens_in_labels": 133.52325, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3755, + "eval_padding_tokens_in_labels": 378.47675, + "eval_reconstruction_accuracy": 0.9149516163941994, + "eval_runtime": 274.2554, + "eval_samples_per_second": 18.231, + "eval_sentence_accuracy": 0.7419832397222172, + "eval_steps_per_second": 0.047, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 346000 + }, + { + "epoch": 0.04, + "learning_rate": 6.605050505050505e-05, + "loss": 0.463, + "step": 346100 + }, + { + "epoch": 0.04, + "learning_rate": 6.604040404040405e-05, + "loss": 0.4605, + "step": 346200 + }, + { + "epoch": 0.04, + "learning_rate": 6.603030303030303e-05, + "loss": 0.4668, + "step": 346300 + }, + { + "epoch": 0.04, + "learning_rate": 6.602020202020202e-05, + "loss": 0.4634, + "step": 346400 + }, + { + "epoch": 0.04, + "learning_rate": 6.601010101010101e-05, + "loss": 0.4612, + "step": 346500 + }, + { + "epoch": 0.04, + "learning_rate": 6.6e-05, + "loss": 0.4587, + "step": 346600 + }, + { + "epoch": 0.04, + "learning_rate": 6.5989898989899e-05, + "loss": 0.4585, + "step": 346700 + }, + { + "epoch": 0.04, + "learning_rate": 6.597979797979799e-05, + "loss": 0.4625, + "step": 346800 + }, + { + "epoch": 0.04, + "learning_rate": 6.596969696969697e-05, + "loss": 0.4621, + "step": 346900 + }, + { + "epoch": 0.04, + "learning_rate": 6.595959595959596e-05, + "loss": 0.4625, + "step": 347000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.46334197867577653, + "eval_average_loss_on_sentence_tokens": 0.3791339467738375, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.4595605432987213, + "eval_non_padding_tokens_in_labels": 133.5512, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3844, + "eval_padding_tokens_in_labels": 378.4488, + "eval_reconstruction_accuracy": 0.9149487399863248, + "eval_runtime": 185.0072, + "eval_samples_per_second": 27.026, + "eval_sentence_accuracy": 0.7587524898164265, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 347000 + }, + { + "epoch": 0.04, + "learning_rate": 6.594949494949495e-05, + "loss": 0.4655, + "step": 347100 + }, + { + "epoch": 0.04, + "learning_rate": 6.593939393939394e-05, + "loss": 0.4626, + "step": 347200 + }, + { + "epoch": 0.04, + "learning_rate": 6.592929292929294e-05, + "loss": 0.4662, + "step": 347300 + }, + { + "epoch": 0.04, + "learning_rate": 6.591919191919193e-05, + "loss": 0.4604, + "step": 347400 + }, + { + "epoch": 0.04, + "learning_rate": 6.59090909090909e-05, + "loss": 0.4652, + "step": 347500 + }, + { + "epoch": 0.04, + "learning_rate": 6.589898989898991e-05, + "loss": 0.465, + "step": 347600 + }, + { + "epoch": 0.04, + "learning_rate": 6.588888888888889e-05, + "loss": 0.464, + "step": 347700 + }, + { + "epoch": 0.04, + "learning_rate": 6.587878787878788e-05, + "loss": 0.4633, + "step": 347800 + }, + { + "epoch": 0.04, + "learning_rate": 6.586868686868687e-05, + "loss": 0.4616, + "step": 347900 + }, + { + "epoch": 0.04, + "learning_rate": 6.585858585858587e-05, + "loss": 0.4641, + "step": 348000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.46336877596013376, + "eval_average_loss_on_sentence_tokens": 0.4097543356901395, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4609082043170929, + "eval_non_padding_tokens_in_labels": 133.54395, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3853, + "eval_padding_tokens_in_labels": 378.45605, + "eval_reconstruction_accuracy": 0.9147692092665711, + "eval_runtime": 190.4586, + "eval_samples_per_second": 26.252, + "eval_sentence_accuracy": 0.7437866743230391, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 348000 + }, + { + "epoch": 0.04, + "learning_rate": 6.584848484848484e-05, + "loss": 0.4614, + "step": 348100 + }, + { + "epoch": 0.04, + "learning_rate": 6.583838383838385e-05, + "loss": 0.4617, + "step": 348200 + }, + { + "epoch": 0.04, + "learning_rate": 6.582828282828283e-05, + "loss": 0.4591, + "step": 348300 + }, + { + "epoch": 0.04, + "learning_rate": 6.581818181818182e-05, + "loss": 0.4633, + "step": 348400 + }, + { + "epoch": 0.04, + "learning_rate": 6.580808080808081e-05, + "loss": 0.4636, + "step": 348500 + }, + { + "epoch": 0.04, + "learning_rate": 6.57979797979798e-05, + "loss": 0.4622, + "step": 348600 + }, + { + "epoch": 0.04, + "learning_rate": 6.578787878787878e-05, + "loss": 0.4616, + "step": 348700 + }, + { + "epoch": 0.04, + "learning_rate": 6.577777777777779e-05, + "loss": 0.4603, + "step": 348800 + }, + { + "epoch": 0.04, + "learning_rate": 6.576767676767677e-05, + "loss": 0.4593, + "step": 348900 + }, + { + "epoch": 0.04, + "learning_rate": 6.575757575757576e-05, + "loss": 0.459, + "step": 349000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4623074503072163, + "eval_average_loss_on_sentence_tokens": 0.42294472001835237, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.4605175852775574, + "eval_non_padding_tokens_in_labels": 133.51975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3833, + "eval_padding_tokens_in_labels": 378.48025, + "eval_reconstruction_accuracy": 0.9149967943323076, + "eval_runtime": 192.4391, + "eval_samples_per_second": 25.982, + "eval_sentence_accuracy": 0.7302474563497048, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 349000 + }, + { + "epoch": 0.04, + "learning_rate": 6.574747474747475e-05, + "loss": 0.4613, + "step": 349100 + }, + { + "epoch": 0.04, + "learning_rate": 6.573737373737374e-05, + "loss": 0.4627, + "step": 349200 + }, + { + "epoch": 0.04, + "learning_rate": 6.572727272727272e-05, + "loss": 0.4646, + "step": 349300 + }, + { + "epoch": 0.04, + "learning_rate": 6.571717171717173e-05, + "loss": 0.4615, + "step": 349400 + }, + { + "epoch": 0.04, + "learning_rate": 6.57070707070707e-05, + "loss": 0.4635, + "step": 349500 + }, + { + "epoch": 0.04, + "learning_rate": 6.56969696969697e-05, + "loss": 0.4641, + "step": 349600 + }, + { + "epoch": 0.04, + "learning_rate": 6.568686868686869e-05, + "loss": 0.4651, + "step": 349700 + }, + { + "epoch": 0.04, + "learning_rate": 6.567676767676768e-05, + "loss": 0.4607, + "step": 349800 + }, + { + "epoch": 0.04, + "learning_rate": 6.566666666666666e-05, + "loss": 0.463, + "step": 349900 + }, + { + "epoch": 0.04, + "learning_rate": 6.565656565656566e-05, + "loss": 0.4623, + "step": 350000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.46288413889905394, + "eval_average_loss_on_sentence_tokens": 0.3779186171456026, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.45894530415534973, + "eval_non_padding_tokens_in_labels": 133.52365, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37045, + "eval_padding_tokens_in_labels": 378.47635, + "eval_reconstruction_accuracy": 0.9149244719505752, + "eval_runtime": 189.503, + "eval_samples_per_second": 26.385, + "eval_sentence_accuracy": 0.7676530227717264, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24877499999999997, + "step": 350000 + }, + { + "epoch": 0.04, + "learning_rate": 6.564646464646464e-05, + "loss": 0.4647, + "step": 350100 + }, + { + "epoch": 0.04, + "learning_rate": 6.563636363636364e-05, + "loss": 0.4574, + "step": 350200 + }, + { + "epoch": 0.04, + "learning_rate": 6.562626262626263e-05, + "loss": 0.4612, + "step": 350300 + }, + { + "epoch": 0.04, + "learning_rate": 6.561616161616162e-05, + "loss": 0.46, + "step": 350400 + }, + { + "epoch": 0.04, + "learning_rate": 6.560606060606061e-05, + "loss": 0.4644, + "step": 350500 + }, + { + "epoch": 0.04, + "learning_rate": 6.55959595959596e-05, + "loss": 0.4617, + "step": 350600 + }, + { + "epoch": 0.04, + "learning_rate": 6.558585858585858e-05, + "loss": 0.4626, + "step": 350700 + }, + { + "epoch": 0.04, + "learning_rate": 6.557575757575757e-05, + "loss": 0.4556, + "step": 350800 + }, + { + "epoch": 0.04, + "learning_rate": 6.556565656565657e-05, + "loss": 0.4598, + "step": 350900 + }, + { + "epoch": 0.04, + "learning_rate": 6.555555555555556e-05, + "loss": 0.4645, + "step": 351000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.4628131343984554, + "eval_average_loss_on_sentence_tokens": 0.40710603977570303, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4603222608566284, + "eval_non_padding_tokens_in_labels": 133.52875, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39095, + "eval_padding_tokens_in_labels": 378.47125, + "eval_reconstruction_accuracy": 0.914914189194788, + "eval_runtime": 180.0548, + "eval_samples_per_second": 27.769, + "eval_sentence_accuracy": 0.7379322411039532, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 351000 + }, + { + "epoch": 0.05, + "learning_rate": 6.554545454545455e-05, + "loss": 0.4604, + "step": 351100 + }, + { + "epoch": 0.05, + "learning_rate": 6.553535353535354e-05, + "loss": 0.4646, + "step": 351200 + }, + { + "epoch": 0.05, + "learning_rate": 6.552525252525252e-05, + "loss": 0.4623, + "step": 351300 + }, + { + "epoch": 0.05, + "learning_rate": 6.551515151515151e-05, + "loss": 0.4615, + "step": 351400 + }, + { + "epoch": 0.05, + "learning_rate": 6.550505050505052e-05, + "loss": 0.4647, + "step": 351500 + }, + { + "epoch": 0.05, + "learning_rate": 6.54949494949495e-05, + "loss": 0.4665, + "step": 351600 + }, + { + "epoch": 0.05, + "learning_rate": 6.548484848484849e-05, + "loss": 0.4635, + "step": 351700 + }, + { + "epoch": 0.05, + "learning_rate": 6.547474747474748e-05, + "loss": 0.4647, + "step": 351800 + }, + { + "epoch": 0.05, + "learning_rate": 6.546464646464647e-05, + "loss": 0.4612, + "step": 351900 + }, + { + "epoch": 0.05, + "learning_rate": 6.545454545454546e-05, + "loss": 0.463, + "step": 352000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4631935039334242, + "eval_average_loss_on_sentence_tokens": 0.36293178355473027, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.4586718678474426, + "eval_non_padding_tokens_in_labels": 133.5157, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37555, + "eval_padding_tokens_in_labels": 378.4843, + "eval_reconstruction_accuracy": 0.9149124197685372, + "eval_runtime": 179.4911, + "eval_samples_per_second": 27.857, + "eval_sentence_accuracy": 0.7735074559908124, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24750000000000014, + "step": 352000 + }, + { + "epoch": 0.05, + "learning_rate": 6.544444444444446e-05, + "loss": 0.4669, + "step": 352100 + }, + { + "epoch": 0.05, + "learning_rate": 6.543434343434343e-05, + "loss": 0.4636, + "step": 352200 + }, + { + "epoch": 0.05, + "learning_rate": 6.542424242424243e-05, + "loss": 0.4615, + "step": 352300 + }, + { + "epoch": 0.05, + "learning_rate": 6.541414141414142e-05, + "loss": 0.4649, + "step": 352400 + }, + { + "epoch": 0.05, + "learning_rate": 6.540404040404041e-05, + "loss": 0.4614, + "step": 352500 + }, + { + "epoch": 0.05, + "learning_rate": 6.53939393939394e-05, + "loss": 0.4658, + "step": 352600 + }, + { + "epoch": 0.05, + "learning_rate": 6.53838383838384e-05, + "loss": 0.4645, + "step": 352700 + }, + { + "epoch": 0.05, + "learning_rate": 6.537373737373737e-05, + "loss": 0.4628, + "step": 352800 + }, + { + "epoch": 0.05, + "learning_rate": 6.536363636363638e-05, + "loss": 0.4649, + "step": 352900 + }, + { + "epoch": 0.05, + "learning_rate": 6.535353535353536e-05, + "loss": 0.4672, + "step": 353000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4625007146544197, + "eval_average_loss_on_sentence_tokens": 0.39818560822479904, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.45949217677116394, + "eval_non_padding_tokens_in_labels": 133.5489, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.371, + "eval_padding_tokens_in_labels": 378.4511, + "eval_reconstruction_accuracy": 0.9149555673071461, + "eval_runtime": 179.7237, + "eval_samples_per_second": 27.82, + "eval_sentence_accuracy": 0.7566260520035172, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 353000 + }, + { + "epoch": 0.05, + "learning_rate": 6.534343434343435e-05, + "loss": 0.4602, + "step": 353100 + }, + { + "epoch": 0.05, + "learning_rate": 6.533333333333334e-05, + "loss": 0.4627, + "step": 353200 + }, + { + "epoch": 0.05, + "learning_rate": 6.532323232323233e-05, + "loss": 0.4616, + "step": 353300 + }, + { + "epoch": 0.05, + "learning_rate": 6.531313131313131e-05, + "loss": 0.4609, + "step": 353400 + }, + { + "epoch": 0.05, + "learning_rate": 6.530303030303032e-05, + "loss": 0.4668, + "step": 353500 + }, + { + "epoch": 0.05, + "learning_rate": 6.52929292929293e-05, + "loss": 0.4671, + "step": 353600 + }, + { + "epoch": 0.05, + "learning_rate": 6.528282828282829e-05, + "loss": 0.4627, + "step": 353700 + }, + { + "epoch": 0.05, + "learning_rate": 6.527272727272728e-05, + "loss": 0.4615, + "step": 353800 + }, + { + "epoch": 0.05, + "learning_rate": 6.526262626262627e-05, + "loss": 0.4639, + "step": 353900 + }, + { + "epoch": 0.05, + "learning_rate": 6.525252525252525e-05, + "loss": 0.4629, + "step": 354000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.46174227350191854, + "eval_average_loss_on_sentence_tokens": 0.3894270853622643, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.4583789110183716, + "eval_non_padding_tokens_in_labels": 133.55385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39655, + "eval_padding_tokens_in_labels": 378.44615, + "eval_reconstruction_accuracy": 0.915035141000776, + "eval_runtime": 180.6039, + "eval_samples_per_second": 27.685, + "eval_sentence_accuracy": 0.7575546862382687, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 354000 + }, + { + "epoch": 0.05, + "learning_rate": 6.524242424242425e-05, + "loss": 0.4581, + "step": 354100 + }, + { + "epoch": 0.05, + "learning_rate": 6.523232323232323e-05, + "loss": 0.4666, + "step": 354200 + }, + { + "epoch": 0.05, + "learning_rate": 6.522222222222222e-05, + "loss": 0.463, + "step": 354300 + }, + { + "epoch": 0.05, + "learning_rate": 6.521212121212122e-05, + "loss": 0.4608, + "step": 354400 + }, + { + "epoch": 0.05, + "learning_rate": 6.520202020202021e-05, + "loss": 0.4627, + "step": 354500 + }, + { + "epoch": 0.05, + "learning_rate": 6.519191919191919e-05, + "loss": 0.463, + "step": 354600 + }, + { + "epoch": 0.05, + "learning_rate": 6.518181818181819e-05, + "loss": 0.462, + "step": 354700 + }, + { + "epoch": 0.05, + "learning_rate": 6.517171717171717e-05, + "loss": 0.4654, + "step": 354800 + }, + { + "epoch": 0.05, + "learning_rate": 6.516161616161616e-05, + "loss": 0.4626, + "step": 354900 + }, + { + "epoch": 0.05, + "learning_rate": 6.515151515151516e-05, + "loss": 0.4633, + "step": 355000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4613663548104188, + "eval_average_loss_on_sentence_tokens": 0.4380340146212879, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.46031248569488525, + "eval_non_padding_tokens_in_labels": 133.50415, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3877, + "eval_padding_tokens_in_labels": 378.49585, + "eval_reconstruction_accuracy": 0.9151645068757136, + "eval_runtime": 177.1546, + "eval_samples_per_second": 28.224, + "eval_sentence_accuracy": 0.7198350889156064, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2464, + "step": 355000 + }, + { + "epoch": 0.05, + "learning_rate": 6.514141414141415e-05, + "loss": 0.4665, + "step": 355100 + }, + { + "epoch": 0.05, + "learning_rate": 6.513131313131313e-05, + "loss": 0.4594, + "step": 355200 + }, + { + "epoch": 0.05, + "learning_rate": 6.512121212121213e-05, + "loss": 0.4625, + "step": 355300 + }, + { + "epoch": 0.05, + "learning_rate": 6.511111111111111e-05, + "loss": 0.4648, + "step": 355400 + }, + { + "epoch": 0.05, + "learning_rate": 6.51010101010101e-05, + "loss": 0.4614, + "step": 355500 + }, + { + "epoch": 0.05, + "learning_rate": 6.50909090909091e-05, + "loss": 0.4589, + "step": 355600 + }, + { + "epoch": 0.05, + "learning_rate": 6.508080808080809e-05, + "loss": 0.4626, + "step": 355700 + }, + { + "epoch": 0.05, + "learning_rate": 6.507070707070708e-05, + "loss": 0.4578, + "step": 355800 + }, + { + "epoch": 0.05, + "learning_rate": 6.506060606060607e-05, + "loss": 0.4608, + "step": 355900 + }, + { + "epoch": 0.05, + "learning_rate": 6.505050505050505e-05, + "loss": 0.4616, + "step": 356000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4618177615568043, + "eval_average_loss_on_sentence_tokens": 0.36817356043292515, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.45762693881988525, + "eval_non_padding_tokens_in_labels": 133.5149, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36775, + "eval_padding_tokens_in_labels": 378.4851, + "eval_reconstruction_accuracy": 0.9151150043283051, + "eval_runtime": 191.6277, + "eval_samples_per_second": 26.092, + "eval_sentence_accuracy": 0.7685233369820733, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 356000 + }, + { + "epoch": 0.05, + "learning_rate": 6.504040404040404e-05, + "loss": 0.4583, + "step": 356100 + }, + { + "epoch": 0.05, + "learning_rate": 6.503030303030303e-05, + "loss": 0.4602, + "step": 356200 + }, + { + "epoch": 0.05, + "learning_rate": 6.502020202020202e-05, + "loss": 0.4611, + "step": 356300 + }, + { + "epoch": 0.05, + "learning_rate": 6.501010101010102e-05, + "loss": 0.4622, + "step": 356400 + }, + { + "epoch": 0.05, + "learning_rate": 6.500000000000001e-05, + "loss": 0.4656, + "step": 356500 + }, + { + "epoch": 0.05, + "learning_rate": 6.498989898989899e-05, + "loss": 0.4652, + "step": 356600 + }, + { + "epoch": 0.05, + "learning_rate": 6.497979797979798e-05, + "loss": 0.4585, + "step": 356700 + }, + { + "epoch": 0.05, + "learning_rate": 6.496969696969697e-05, + "loss": 0.4637, + "step": 356800 + }, + { + "epoch": 0.05, + "learning_rate": 6.495959595959596e-05, + "loss": 0.4591, + "step": 356900 + }, + { + "epoch": 0.05, + "learning_rate": 6.494949494949495e-05, + "loss": 0.4624, + "step": 357000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.46185705937064625, + "eval_average_loss_on_sentence_tokens": 0.3757270335839469, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.45801758766174316, + "eval_non_padding_tokens_in_labels": 133.5422, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3833, + "eval_padding_tokens_in_labels": 378.4578, + "eval_reconstruction_accuracy": 0.9150480561499014, + "eval_runtime": 182.6979, + "eval_samples_per_second": 27.368, + "eval_sentence_accuracy": 0.7643736429378937, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 357000 + }, + { + "epoch": 0.05, + "learning_rate": 6.493939393939395e-05, + "loss": 0.4627, + "step": 357100 + }, + { + "epoch": 0.05, + "learning_rate": 6.492929292929292e-05, + "loss": 0.4583, + "step": 357200 + }, + { + "epoch": 0.05, + "learning_rate": 6.491919191919193e-05, + "loss": 0.4641, + "step": 357300 + }, + { + "epoch": 0.05, + "learning_rate": 6.490909090909091e-05, + "loss": 0.4636, + "step": 357400 + }, + { + "epoch": 0.05, + "learning_rate": 6.48989898989899e-05, + "loss": 0.463, + "step": 357500 + }, + { + "epoch": 0.05, + "learning_rate": 6.488888888888889e-05, + "loss": 0.4603, + "step": 357600 + }, + { + "epoch": 0.05, + "learning_rate": 6.487878787878788e-05, + "loss": 0.4643, + "step": 357700 + }, + { + "epoch": 0.05, + "learning_rate": 6.486868686868686e-05, + "loss": 0.4619, + "step": 357800 + }, + { + "epoch": 0.05, + "learning_rate": 6.485858585858587e-05, + "loss": 0.4602, + "step": 357900 + }, + { + "epoch": 0.05, + "learning_rate": 6.484848484848485e-05, + "loss": 0.4632, + "step": 358000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4611977971053743, + "eval_average_loss_on_sentence_tokens": 0.40121794694913715, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4585253894329071, + "eval_non_padding_tokens_in_labels": 133.5459, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37685, + "eval_padding_tokens_in_labels": 378.4541, + "eval_reconstruction_accuracy": 0.9151190861382196, + "eval_runtime": 195.4841, + "eval_samples_per_second": 25.578, + "eval_sentence_accuracy": 0.7497577475909344, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.249975, + "step": 358000 + }, + { + "epoch": 0.05, + "learning_rate": 6.483838383838384e-05, + "loss": 0.462, + "step": 358100 + }, + { + "epoch": 0.05, + "learning_rate": 6.482828282828283e-05, + "loss": 0.4663, + "step": 358200 + }, + { + "epoch": 0.05, + "learning_rate": 6.481818181818182e-05, + "loss": 0.4607, + "step": 358300 + }, + { + "epoch": 0.05, + "learning_rate": 6.48080808080808e-05, + "loss": 0.4612, + "step": 358400 + }, + { + "epoch": 0.05, + "learning_rate": 6.479797979797981e-05, + "loss": 0.4594, + "step": 358500 + }, + { + "epoch": 0.05, + "learning_rate": 6.478787878787879e-05, + "loss": 0.463, + "step": 358600 + }, + { + "epoch": 0.05, + "learning_rate": 6.477777777777778e-05, + "loss": 0.4615, + "step": 358700 + }, + { + "epoch": 0.05, + "learning_rate": 6.476767676767677e-05, + "loss": 0.4601, + "step": 358800 + }, + { + "epoch": 0.05, + "learning_rate": 6.475757575757576e-05, + "loss": 0.4608, + "step": 358900 + }, + { + "epoch": 0.05, + "learning_rate": 6.474747474747474e-05, + "loss": 0.4627, + "step": 359000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.4609626142636271, + "eval_average_loss_on_sentence_tokens": 0.4040579092056113, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.45835936069488525, + "eval_non_padding_tokens_in_labels": 133.53135, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3908, + "eval_padding_tokens_in_labels": 378.46865, + "eval_reconstruction_accuracy": 0.9151019147406165, + "eval_runtime": 193.1973, + "eval_samples_per_second": 25.88, + "eval_sentence_accuracy": 0.7484029285624563, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2499, + "step": 359000 + }, + { + "epoch": 0.05, + "learning_rate": 6.473737373737375e-05, + "loss": 0.4613, + "step": 359100 + }, + { + "epoch": 0.05, + "learning_rate": 6.472727272727272e-05, + "loss": 0.4622, + "step": 359200 + }, + { + "epoch": 0.05, + "learning_rate": 6.471717171717172e-05, + "loss": 0.463, + "step": 359300 + }, + { + "epoch": 0.05, + "learning_rate": 6.470707070707071e-05, + "loss": 0.4629, + "step": 359400 + }, + { + "epoch": 0.05, + "learning_rate": 6.46969696969697e-05, + "loss": 0.4622, + "step": 359500 + }, + { + "epoch": 0.05, + "learning_rate": 6.468686868686868e-05, + "loss": 0.462, + "step": 359600 + }, + { + "epoch": 0.05, + "learning_rate": 6.467676767676768e-05, + "loss": 0.458, + "step": 359700 + }, + { + "epoch": 0.05, + "learning_rate": 6.466666666666666e-05, + "loss": 0.4604, + "step": 359800 + }, + { + "epoch": 0.05, + "learning_rate": 6.465656565656565e-05, + "loss": 0.4578, + "step": 359900 + }, + { + "epoch": 0.05, + "learning_rate": 6.464646464646466e-05, + "loss": 0.4615, + "step": 360000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.461644002480397, + "eval_average_loss_on_sentence_tokens": 0.4239041676405535, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.45992186665534973, + "eval_non_padding_tokens_in_labels": 133.52645, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37705, + "eval_padding_tokens_in_labels": 378.47355, + "eval_reconstruction_accuracy": 0.915108731733493, + "eval_runtime": 194.4233, + "eval_samples_per_second": 25.717, + "eval_sentence_accuracy": 0.7262323469772283, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.246975, + "step": 360000 + }, + { + "epoch": 0.05, + "learning_rate": 6.463636363636364e-05, + "loss": 0.4629, + "step": 360100 + }, + { + "epoch": 0.05, + "learning_rate": 6.462626262626263e-05, + "loss": 0.4628, + "step": 360200 + }, + { + "epoch": 0.05, + "learning_rate": 6.461616161616162e-05, + "loss": 0.4601, + "step": 360300 + }, + { + "epoch": 0.05, + "learning_rate": 6.460606060606061e-05, + "loss": 0.4592, + "step": 360400 + }, + { + "epoch": 0.05, + "learning_rate": 6.459595959595959e-05, + "loss": 0.4597, + "step": 360500 + }, + { + "epoch": 0.05, + "learning_rate": 6.45858585858586e-05, + "loss": 0.46, + "step": 360600 + }, + { + "epoch": 0.05, + "learning_rate": 6.457575757575758e-05, + "loss": 0.4638, + "step": 360700 + }, + { + "epoch": 0.05, + "learning_rate": 6.456565656565657e-05, + "loss": 0.4654, + "step": 360800 + }, + { + "epoch": 0.05, + "learning_rate": 6.455555555555556e-05, + "loss": 0.466, + "step": 360900 + }, + { + "epoch": 0.06, + "learning_rate": 6.454545454545455e-05, + "loss": 0.4593, + "step": 361000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.461386322103023, + "eval_average_loss_on_sentence_tokens": 0.39835830406262307, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.45848631858825684, + "eval_non_padding_tokens_in_labels": 133.53075, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38625, + "eval_padding_tokens_in_labels": 378.46925, + "eval_reconstruction_accuracy": 0.9150221841166876, + "eval_runtime": 206.674, + "eval_samples_per_second": 24.193, + "eval_sentence_accuracy": 0.7508344249645593, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 361000 + }, + { + "epoch": 0.06, + "learning_rate": 6.453535353535354e-05, + "loss": 0.4619, + "step": 361100 + }, + { + "epoch": 0.06, + "learning_rate": 6.452525252525254e-05, + "loss": 0.4632, + "step": 361200 + }, + { + "epoch": 0.06, + "learning_rate": 6.451515151515151e-05, + "loss": 0.4619, + "step": 361300 + }, + { + "epoch": 0.06, + "learning_rate": 6.45050505050505e-05, + "loss": 0.4639, + "step": 361400 + }, + { + "epoch": 0.06, + "learning_rate": 6.44949494949495e-05, + "loss": 0.4589, + "step": 361500 + }, + { + "epoch": 0.06, + "learning_rate": 6.448484848484849e-05, + "loss": 0.4608, + "step": 361600 + }, + { + "epoch": 0.06, + "learning_rate": 6.447474747474748e-05, + "loss": 0.4634, + "step": 361700 + }, + { + "epoch": 0.06, + "learning_rate": 6.446464646464647e-05, + "loss": 0.461, + "step": 361800 + }, + { + "epoch": 0.06, + "learning_rate": 6.445454545454545e-05, + "loss": 0.4628, + "step": 361900 + }, + { + "epoch": 0.06, + "learning_rate": 6.444444444444446e-05, + "loss": 0.4578, + "step": 362000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.460513150521487, + "eval_average_loss_on_sentence_tokens": 0.39060586041820866, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.45741212368011475, + "eval_non_padding_tokens_in_labels": 133.56075, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38675, + "eval_padding_tokens_in_labels": 378.43925, + "eval_reconstruction_accuracy": 0.9151914521769141, + "eval_runtime": 215.7818, + "eval_samples_per_second": 23.172, + "eval_sentence_accuracy": 0.759290828503239, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 362000 + }, + { + "epoch": 0.06, + "learning_rate": 6.443434343434344e-05, + "loss": 0.4612, + "step": 362100 + }, + { + "epoch": 0.06, + "learning_rate": 6.442424242424243e-05, + "loss": 0.4645, + "step": 362200 + }, + { + "epoch": 0.06, + "learning_rate": 6.441414141414142e-05, + "loss": 0.4655, + "step": 362300 + }, + { + "epoch": 0.06, + "learning_rate": 6.440404040404041e-05, + "loss": 0.4623, + "step": 362400 + }, + { + "epoch": 0.06, + "learning_rate": 6.439393939393939e-05, + "loss": 0.4605, + "step": 362500 + }, + { + "epoch": 0.06, + "learning_rate": 6.43838383838384e-05, + "loss": 0.4584, + "step": 362600 + }, + { + "epoch": 0.06, + "learning_rate": 6.437373737373738e-05, + "loss": 0.4612, + "step": 362700 + }, + { + "epoch": 0.06, + "learning_rate": 6.436363636363637e-05, + "loss": 0.4603, + "step": 362800 + }, + { + "epoch": 0.06, + "learning_rate": 6.435353535353536e-05, + "loss": 0.4588, + "step": 362900 + }, + { + "epoch": 0.06, + "learning_rate": 6.434343434343435e-05, + "loss": 0.4612, + "step": 363000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.4605611703982773, + "eval_average_loss_on_sentence_tokens": 0.409532641170309, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.45821288228034973, + "eval_non_padding_tokens_in_labels": 133.51585, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3845, + "eval_padding_tokens_in_labels": 378.48415, + "eval_reconstruction_accuracy": 0.9150788050575164, + "eval_runtime": 216.9363, + "eval_samples_per_second": 23.048, + "eval_sentence_accuracy": 0.7426785938593501, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 363000 + }, + { + "epoch": 0.06, + "learning_rate": 6.433333333333333e-05, + "loss": 0.4612, + "step": 363100 + }, + { + "epoch": 0.06, + "learning_rate": 6.432323232323234e-05, + "loss": 0.4602, + "step": 363200 + }, + { + "epoch": 0.06, + "learning_rate": 6.431313131313131e-05, + "loss": 0.46, + "step": 363300 + }, + { + "epoch": 0.06, + "learning_rate": 6.43030303030303e-05, + "loss": 0.4608, + "step": 363400 + }, + { + "epoch": 0.06, + "learning_rate": 6.42929292929293e-05, + "loss": 0.4619, + "step": 363500 + }, + { + "epoch": 0.06, + "learning_rate": 6.428282828282829e-05, + "loss": 0.4596, + "step": 363600 + }, + { + "epoch": 0.06, + "learning_rate": 6.427272727272727e-05, + "loss": 0.4626, + "step": 363700 + }, + { + "epoch": 0.06, + "learning_rate": 6.426262626262627e-05, + "loss": 0.4583, + "step": 363800 + }, + { + "epoch": 0.06, + "learning_rate": 6.425252525252525e-05, + "loss": 0.4602, + "step": 363900 + }, + { + "epoch": 0.06, + "learning_rate": 6.424242424242424e-05, + "loss": 0.4594, + "step": 364000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.45965217044456197, + "eval_average_loss_on_sentence_tokens": 0.3874480999467608, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4563671946525574, + "eval_non_padding_tokens_in_labels": 133.53235, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3814, + "eval_padding_tokens_in_labels": 378.46765, + "eval_reconstruction_accuracy": 0.9154123215670552, + "eval_runtime": 224.9297, + "eval_samples_per_second": 22.229, + "eval_sentence_accuracy": 0.7587838929064906, + "eval_steps_per_second": 0.058, + "eval_variance_shuffling_prob": 0.2499, + "step": 364000 + }, + { + "epoch": 0.06, + "learning_rate": 6.423232323232324e-05, + "loss": 0.4627, + "step": 364100 + }, + { + "epoch": 0.06, + "learning_rate": 6.422222222222223e-05, + "loss": 0.4637, + "step": 364200 + }, + { + "epoch": 0.06, + "learning_rate": 6.42121212121212e-05, + "loss": 0.4622, + "step": 364300 + }, + { + "epoch": 0.06, + "learning_rate": 6.420202020202021e-05, + "loss": 0.462, + "step": 364400 + }, + { + "epoch": 0.06, + "learning_rate": 6.419191919191919e-05, + "loss": 0.4633, + "step": 364500 + }, + { + "epoch": 0.06, + "learning_rate": 6.418181818181818e-05, + "loss": 0.4627, + "step": 364600 + }, + { + "epoch": 0.06, + "learning_rate": 6.417171717171717e-05, + "loss": 0.4577, + "step": 364700 + }, + { + "epoch": 0.06, + "learning_rate": 6.416161616161617e-05, + "loss": 0.4599, + "step": 364800 + }, + { + "epoch": 0.06, + "learning_rate": 6.415151515151516e-05, + "loss": 0.4611, + "step": 364900 + }, + { + "epoch": 0.06, + "learning_rate": 6.414141414141415e-05, + "loss": 0.4636, + "step": 365000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.4604064889481816, + "eval_average_loss_on_sentence_tokens": 0.3853102830548017, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.45710936188697815, + "eval_non_padding_tokens_in_labels": 133.5034, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37415, + "eval_padding_tokens_in_labels": 378.4966, + "eval_reconstruction_accuracy": 0.9152952825317542, + "eval_runtime": 215.0681, + "eval_samples_per_second": 23.248, + "eval_sentence_accuracy": 0.7573662676978843, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 365000 + }, + { + "epoch": 0.06, + "learning_rate": 6.413131313131313e-05, + "loss": 0.4617, + "step": 365100 + }, + { + "epoch": 0.06, + "learning_rate": 6.412121212121212e-05, + "loss": 0.4615, + "step": 365200 + }, + { + "epoch": 0.06, + "learning_rate": 6.411111111111111e-05, + "loss": 0.4572, + "step": 365300 + }, + { + "epoch": 0.06, + "learning_rate": 6.41010101010101e-05, + "loss": 0.4591, + "step": 365400 + }, + { + "epoch": 0.06, + "learning_rate": 6.40909090909091e-05, + "loss": 0.4647, + "step": 365500 + }, + { + "epoch": 0.06, + "learning_rate": 6.408080808080809e-05, + "loss": 0.4597, + "step": 365600 + }, + { + "epoch": 0.06, + "learning_rate": 6.407070707070707e-05, + "loss": 0.4621, + "step": 365700 + }, + { + "epoch": 0.06, + "learning_rate": 6.406060606060606e-05, + "loss": 0.4598, + "step": 365800 + }, + { + "epoch": 0.06, + "learning_rate": 6.405050505050505e-05, + "loss": 0.4571, + "step": 365900 + }, + { + "epoch": 0.06, + "learning_rate": 6.404040404040404e-05, + "loss": 0.4616, + "step": 366000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.46040229515922765, + "eval_average_loss_on_sentence_tokens": 0.45314583035756795, + "eval_average_shuffling_prob": 0.565, + "eval_loss": 0.4600878953933716, + "eval_non_padding_tokens_in_labels": 133.55905, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3853, + "eval_padding_tokens_in_labels": 378.44095, + "eval_reconstruction_accuracy": 0.9150762244217253, + "eval_runtime": 210.5461, + "eval_samples_per_second": 23.748, + "eval_sentence_accuracy": 0.7155059486424893, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.245775, + "step": 366000 + }, + { + "epoch": 0.06, + "learning_rate": 6.403030303030303e-05, + "loss": 0.4633, + "step": 366100 + }, + { + "epoch": 0.06, + "learning_rate": 6.402020202020203e-05, + "loss": 0.46, + "step": 366200 + }, + { + "epoch": 0.06, + "learning_rate": 6.4010101010101e-05, + "loss": 0.4596, + "step": 366300 + }, + { + "epoch": 0.06, + "learning_rate": 6.400000000000001e-05, + "loss": 0.4597, + "step": 366400 + }, + { + "epoch": 0.06, + "learning_rate": 6.398989898989899e-05, + "loss": 0.4591, + "step": 366500 + }, + { + "epoch": 0.06, + "learning_rate": 6.397979797979798e-05, + "loss": 0.4572, + "step": 366600 + }, + { + "epoch": 0.06, + "learning_rate": 6.396969696969697e-05, + "loss": 0.4628, + "step": 366700 + }, + { + "epoch": 0.06, + "learning_rate": 6.395959595959597e-05, + "loss": 0.462, + "step": 366800 + }, + { + "epoch": 0.06, + "learning_rate": 6.394949494949494e-05, + "loss": 0.4601, + "step": 366900 + }, + { + "epoch": 0.06, + "learning_rate": 6.393939393939395e-05, + "loss": 0.4625, + "step": 367000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.459681496370585, + "eval_average_loss_on_sentence_tokens": 0.4305277479299494, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.45829102396965027, + "eval_non_padding_tokens_in_labels": 133.52055, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38685, + "eval_padding_tokens_in_labels": 378.47945, + "eval_reconstruction_accuracy": 0.9152938578437916, + "eval_runtime": 194.8921, + "eval_samples_per_second": 25.655, + "eval_sentence_accuracy": 0.7226658531770955, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2464, + "step": 367000 + }, + { + "epoch": 0.06, + "learning_rate": 6.392929292929293e-05, + "loss": 0.4629, + "step": 367100 + }, + { + "epoch": 0.06, + "learning_rate": 6.391919191919192e-05, + "loss": 0.465, + "step": 367200 + }, + { + "epoch": 0.06, + "learning_rate": 6.390909090909091e-05, + "loss": 0.4621, + "step": 367300 + }, + { + "epoch": 0.06, + "learning_rate": 6.38989898989899e-05, + "loss": 0.4593, + "step": 367400 + }, + { + "epoch": 0.06, + "learning_rate": 6.388888888888888e-05, + "loss": 0.4631, + "step": 367500 + }, + { + "epoch": 0.06, + "learning_rate": 6.387878787878789e-05, + "loss": 0.4587, + "step": 367600 + }, + { + "epoch": 0.06, + "learning_rate": 6.386868686868687e-05, + "loss": 0.4622, + "step": 367700 + }, + { + "epoch": 0.06, + "learning_rate": 6.385858585858586e-05, + "loss": 0.4624, + "step": 367800 + }, + { + "epoch": 0.06, + "learning_rate": 6.384848484848485e-05, + "loss": 0.4602, + "step": 367900 + }, + { + "epoch": 0.06, + "learning_rate": 6.383838383838384e-05, + "loss": 0.4591, + "step": 368000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.45888319281681267, + "eval_average_loss_on_sentence_tokens": 0.37868609408612625, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.4552636742591858, + "eval_non_padding_tokens_in_labels": 133.52675, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37285, + "eval_padding_tokens_in_labels": 378.47325, + "eval_reconstruction_accuracy": 0.9152786119439462, + "eval_runtime": 183.7319, + "eval_samples_per_second": 27.214, + "eval_sentence_accuracy": 0.775041721248228, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24750000000000014, + "step": 368000 + }, + { + "epoch": 0.06, + "learning_rate": 6.382828282828282e-05, + "loss": 0.4611, + "step": 368100 + }, + { + "epoch": 0.06, + "learning_rate": 6.381818181818183e-05, + "loss": 0.4608, + "step": 368200 + }, + { + "epoch": 0.06, + "learning_rate": 6.38080808080808e-05, + "loss": 0.461, + "step": 368300 + }, + { + "epoch": 0.06, + "learning_rate": 6.37979797979798e-05, + "loss": 0.4591, + "step": 368400 + }, + { + "epoch": 0.06, + "learning_rate": 6.37878787878788e-05, + "loss": 0.4599, + "step": 368500 + }, + { + "epoch": 0.06, + "learning_rate": 6.377777777777778e-05, + "loss": 0.4574, + "step": 368600 + }, + { + "epoch": 0.06, + "learning_rate": 6.376767676767677e-05, + "loss": 0.4614, + "step": 368700 + }, + { + "epoch": 0.06, + "learning_rate": 6.375757575757576e-05, + "loss": 0.4631, + "step": 368800 + }, + { + "epoch": 0.06, + "learning_rate": 6.374747474747476e-05, + "loss": 0.461, + "step": 368900 + }, + { + "epoch": 0.06, + "learning_rate": 6.373737373737373e-05, + "loss": 0.4651, + "step": 369000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.4595635691964046, + "eval_average_loss_on_sentence_tokens": 0.43922800665922185, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.45857420563697815, + "eval_non_padding_tokens_in_labels": 133.5101, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3635, + "eval_padding_tokens_in_labels": 378.4899, + "eval_reconstruction_accuracy": 0.9152627130040344, + "eval_runtime": 180.4876, + "eval_samples_per_second": 27.703, + "eval_sentence_accuracy": 0.7219929298185799, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2464, + "step": 369000 + }, + { + "epoch": 0.06, + "learning_rate": 6.372727272727274e-05, + "loss": 0.4623, + "step": 369100 + }, + { + "epoch": 0.06, + "learning_rate": 6.371717171717172e-05, + "loss": 0.4623, + "step": 369200 + }, + { + "epoch": 0.06, + "learning_rate": 6.370707070707071e-05, + "loss": 0.4592, + "step": 369300 + }, + { + "epoch": 0.06, + "learning_rate": 6.36969696969697e-05, + "loss": 0.4599, + "step": 369400 + }, + { + "epoch": 0.06, + "learning_rate": 6.36868686868687e-05, + "loss": 0.4597, + "step": 369500 + }, + { + "epoch": 0.06, + "learning_rate": 6.367676767676767e-05, + "loss": 0.4602, + "step": 369600 + }, + { + "epoch": 0.06, + "learning_rate": 6.366666666666668e-05, + "loss": 0.4618, + "step": 369700 + }, + { + "epoch": 0.06, + "learning_rate": 6.365656565656566e-05, + "loss": 0.4607, + "step": 369800 + }, + { + "epoch": 0.06, + "learning_rate": 6.364646464646465e-05, + "loss": 0.4616, + "step": 369900 + }, + { + "epoch": 0.06, + "learning_rate": 6.363636363636364e-05, + "loss": 0.4609, + "step": 370000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.459187110143946, + "eval_average_loss_on_sentence_tokens": 0.3925992512433289, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.4562109410762787, + "eval_non_padding_tokens_in_labels": 133.4956, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37395, + "eval_padding_tokens_in_labels": 378.5044, + "eval_reconstruction_accuracy": 0.9152912976350864, + "eval_runtime": 178.8457, + "eval_samples_per_second": 27.957, + "eval_sentence_accuracy": 0.7598471118129453, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 370000 + }, + { + "epoch": 0.06, + "learning_rate": 6.362626262626263e-05, + "loss": 0.4615, + "step": 370100 + }, + { + "epoch": 0.06, + "learning_rate": 6.361616161616162e-05, + "loss": 0.4615, + "step": 370200 + }, + { + "epoch": 0.06, + "learning_rate": 6.360606060606062e-05, + "loss": 0.4634, + "step": 370300 + }, + { + "epoch": 0.06, + "learning_rate": 6.35959595959596e-05, + "loss": 0.4601, + "step": 370400 + }, + { + "epoch": 0.06, + "learning_rate": 6.358585858585859e-05, + "loss": 0.4594, + "step": 370500 + }, + { + "epoch": 0.06, + "learning_rate": 6.357575757575758e-05, + "loss": 0.4587, + "step": 370600 + }, + { + "epoch": 0.06, + "learning_rate": 6.356565656565657e-05, + "loss": 0.4581, + "step": 370700 + }, + { + "epoch": 0.06, + "learning_rate": 6.355555555555556e-05, + "loss": 0.4623, + "step": 370800 + }, + { + "epoch": 0.06, + "learning_rate": 6.354545454545455e-05, + "loss": 0.4637, + "step": 370900 + }, + { + "epoch": 0.07, + "learning_rate": 6.353535353535353e-05, + "loss": 0.4594, + "step": 371000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.4590710615400712, + "eval_average_loss_on_sentence_tokens": 0.4128894397661806, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.45705077052116394, + "eval_non_padding_tokens_in_labels": 133.5727, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3905, + "eval_padding_tokens_in_labels": 378.4273, + "eval_reconstruction_accuracy": 0.915326804664358, + "eval_runtime": 182.2075, + "eval_samples_per_second": 27.441, + "eval_sentence_accuracy": 0.7391390169935579, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 371000 + }, + { + "epoch": 0.07, + "learning_rate": 6.352525252525254e-05, + "loss": 0.4597, + "step": 371100 + }, + { + "epoch": 0.07, + "learning_rate": 6.351515151515152e-05, + "loss": 0.4582, + "step": 371200 + }, + { + "epoch": 0.07, + "learning_rate": 6.350505050505051e-05, + "loss": 0.4618, + "step": 371300 + }, + { + "epoch": 0.07, + "learning_rate": 6.34949494949495e-05, + "loss": 0.4596, + "step": 371400 + }, + { + "epoch": 0.07, + "learning_rate": 6.34848484848485e-05, + "loss": 0.4589, + "step": 371500 + }, + { + "epoch": 0.07, + "learning_rate": 6.347474747474747e-05, + "loss": 0.4611, + "step": 371600 + }, + { + "epoch": 0.07, + "learning_rate": 6.346464646464648e-05, + "loss": 0.46, + "step": 371700 + }, + { + "epoch": 0.07, + "learning_rate": 6.345454545454546e-05, + "loss": 0.4603, + "step": 371800 + }, + { + "epoch": 0.07, + "learning_rate": 6.344444444444445e-05, + "loss": 0.4597, + "step": 371900 + }, + { + "epoch": 0.07, + "learning_rate": 6.343434343434344e-05, + "loss": 0.4624, + "step": 372000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.45819040712019793, + "eval_average_loss_on_sentence_tokens": 0.39431579444278525, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.45528319478034973, + "eval_non_padding_tokens_in_labels": 133.5437, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38905, + "eval_padding_tokens_in_labels": 378.4563, + "eval_reconstruction_accuracy": 0.915527054533961, + "eval_runtime": 188.5568, + "eval_samples_per_second": 26.517, + "eval_sentence_accuracy": 0.7624042205753047, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2496, + "step": 372000 + }, + { + "epoch": 0.07, + "learning_rate": 6.342424242424243e-05, + "loss": 0.4592, + "step": 372100 + }, + { + "epoch": 0.07, + "learning_rate": 6.341414141414141e-05, + "loss": 0.4581, + "step": 372200 + }, + { + "epoch": 0.07, + "learning_rate": 6.340404040404042e-05, + "loss": 0.4592, + "step": 372300 + }, + { + "epoch": 0.07, + "learning_rate": 6.33939393939394e-05, + "loss": 0.4597, + "step": 372400 + }, + { + "epoch": 0.07, + "learning_rate": 6.338383838383839e-05, + "loss": 0.4606, + "step": 372500 + }, + { + "epoch": 0.07, + "learning_rate": 6.337373737373738e-05, + "loss": 0.4584, + "step": 372600 + }, + { + "epoch": 0.07, + "learning_rate": 6.336363636363637e-05, + "loss": 0.4599, + "step": 372700 + }, + { + "epoch": 0.07, + "learning_rate": 6.335353535353535e-05, + "loss": 0.4615, + "step": 372800 + }, + { + "epoch": 0.07, + "learning_rate": 6.334343434343435e-05, + "loss": 0.4564, + "step": 372900 + }, + { + "epoch": 0.07, + "learning_rate": 6.333333333333333e-05, + "loss": 0.4596, + "step": 373000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.45911966023947903, + "eval_average_loss_on_sentence_tokens": 0.37836007870087823, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.4555078148841858, + "eval_non_padding_tokens_in_labels": 133.54655, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3904, + "eval_padding_tokens_in_labels": 378.45345, + "eval_reconstruction_accuracy": 0.9152518615786362, + "eval_runtime": 172.5931, + "eval_samples_per_second": 28.97, + "eval_sentence_accuracy": 0.7727806987636154, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 373000 + }, + { + "epoch": 0.07, + "learning_rate": 6.332323232323232e-05, + "loss": 0.4619, + "step": 373100 + }, + { + "epoch": 0.07, + "learning_rate": 6.331313131313132e-05, + "loss": 0.4578, + "step": 373200 + }, + { + "epoch": 0.07, + "learning_rate": 6.330303030303031e-05, + "loss": 0.4594, + "step": 373300 + }, + { + "epoch": 0.07, + "learning_rate": 6.329292929292929e-05, + "loss": 0.4605, + "step": 373400 + }, + { + "epoch": 0.07, + "learning_rate": 6.328282828282829e-05, + "loss": 0.4584, + "step": 373500 + }, + { + "epoch": 0.07, + "learning_rate": 6.327272727272727e-05, + "loss": 0.4577, + "step": 373600 + }, + { + "epoch": 0.07, + "learning_rate": 6.326262626262626e-05, + "loss": 0.4651, + "step": 373700 + }, + { + "epoch": 0.07, + "learning_rate": 6.325252525252525e-05, + "loss": 0.4593, + "step": 373800 + }, + { + "epoch": 0.07, + "learning_rate": 6.324242424242425e-05, + "loss": 0.4604, + "step": 373900 + }, + { + "epoch": 0.07, + "learning_rate": 6.323232323232323e-05, + "loss": 0.462, + "step": 374000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.45950599770799955, + "eval_average_loss_on_sentence_tokens": 0.37824754066518634, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.45584961771965027, + "eval_non_padding_tokens_in_labels": 133.50445, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3556, + "eval_padding_tokens_in_labels": 378.49555, + "eval_reconstruction_accuracy": 0.9152858896937827, + "eval_runtime": 185.226, + "eval_samples_per_second": 26.994, + "eval_sentence_accuracy": 0.7636917472679312, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2493749999999999, + "step": 374000 + }, + { + "epoch": 0.07, + "learning_rate": 6.322222222222223e-05, + "loss": 0.4625, + "step": 374100 + }, + { + "epoch": 0.07, + "learning_rate": 6.321212121212121e-05, + "loss": 0.4601, + "step": 374200 + }, + { + "epoch": 0.07, + "learning_rate": 6.32020202020202e-05, + "loss": 0.461, + "step": 374300 + }, + { + "epoch": 0.07, + "learning_rate": 6.319191919191919e-05, + "loss": 0.4609, + "step": 374400 + }, + { + "epoch": 0.07, + "learning_rate": 6.318181818181818e-05, + "loss": 0.4622, + "step": 374500 + }, + { + "epoch": 0.07, + "learning_rate": 6.317171717171718e-05, + "loss": 0.4594, + "step": 374600 + }, + { + "epoch": 0.07, + "learning_rate": 6.316161616161617e-05, + "loss": 0.459, + "step": 374700 + }, + { + "epoch": 0.07, + "learning_rate": 6.315151515151515e-05, + "loss": 0.4597, + "step": 374800 + }, + { + "epoch": 0.07, + "learning_rate": 6.314141414141414e-05, + "loss": 0.4572, + "step": 374900 + }, + { + "epoch": 0.07, + "learning_rate": 6.313131313131313e-05, + "loss": 0.4568, + "step": 375000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.4587124483008773, + "eval_average_loss_on_sentence_tokens": 0.41786806430250223, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.4569628834724426, + "eval_non_padding_tokens_in_labels": 133.523, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3772, + "eval_padding_tokens_in_labels": 378.477, + "eval_reconstruction_accuracy": 0.9153025788574917, + "eval_runtime": 173.8238, + "eval_samples_per_second": 28.765, + "eval_sentence_accuracy": 0.74440127765715, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.2499, + "step": 375000 + }, + { + "epoch": 0.07, + "learning_rate": 6.312121212121212e-05, + "loss": 0.4613, + "step": 375100 + }, + { + "epoch": 0.07, + "learning_rate": 6.311111111111112e-05, + "loss": 0.4563, + "step": 375200 + }, + { + "epoch": 0.07, + "learning_rate": 6.310101010101011e-05, + "loss": 0.4606, + "step": 375300 + }, + { + "epoch": 0.07, + "learning_rate": 6.309090909090909e-05, + "loss": 0.4591, + "step": 375400 + }, + { + "epoch": 0.07, + "learning_rate": 6.308080808080809e-05, + "loss": 0.4563, + "step": 375500 + }, + { + "epoch": 0.07, + "learning_rate": 6.307070707070707e-05, + "loss": 0.4613, + "step": 375600 + }, + { + "epoch": 0.07, + "learning_rate": 6.306060606060606e-05, + "loss": 0.4574, + "step": 375700 + }, + { + "epoch": 0.07, + "learning_rate": 6.305050505050505e-05, + "loss": 0.4618, + "step": 375800 + }, + { + "epoch": 0.07, + "learning_rate": 6.304040404040405e-05, + "loss": 0.4592, + "step": 375900 + }, + { + "epoch": 0.07, + "learning_rate": 6.303030303030302e-05, + "loss": 0.4606, + "step": 376000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.4594655531914959, + "eval_average_loss_on_sentence_tokens": 0.36966857819012866, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.4553906321525574, + "eval_non_padding_tokens_in_labels": 133.5249, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3884, + "eval_padding_tokens_in_labels": 378.4751, + "eval_reconstruction_accuracy": 0.9152931709827541, + "eval_runtime": 187.473, + "eval_samples_per_second": 26.671, + "eval_sentence_accuracy": 0.7719642184219498, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 376000 + }, + { + "epoch": 0.07, + "learning_rate": 6.302020202020203e-05, + "loss": 0.4592, + "step": 376100 + }, + { + "epoch": 0.07, + "learning_rate": 6.301010101010101e-05, + "loss": 0.4611, + "step": 376200 + }, + { + "epoch": 0.07, + "learning_rate": 6.3e-05, + "loss": 0.4608, + "step": 376300 + }, + { + "epoch": 0.07, + "learning_rate": 6.298989898989899e-05, + "loss": 0.4605, + "step": 376400 + }, + { + "epoch": 0.07, + "learning_rate": 6.297979797979798e-05, + "loss": 0.4598, + "step": 376500 + }, + { + "epoch": 0.07, + "learning_rate": 6.296969696969696e-05, + "loss": 0.4581, + "step": 376600 + }, + { + "epoch": 0.07, + "learning_rate": 6.295959595959597e-05, + "loss": 0.4604, + "step": 376700 + }, + { + "epoch": 0.07, + "learning_rate": 6.294949494949495e-05, + "loss": 0.4599, + "step": 376800 + }, + { + "epoch": 0.07, + "learning_rate": 6.293939393939394e-05, + "loss": 0.4608, + "step": 376900 + }, + { + "epoch": 0.07, + "learning_rate": 6.292929292929294e-05, + "loss": 0.4607, + "step": 377000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.45827891418482963, + "eval_average_loss_on_sentence_tokens": 0.3813954270060987, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4548632800579071, + "eval_non_padding_tokens_in_labels": 133.5406, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.4594, + "eval_reconstruction_accuracy": 0.9154887956664197, + "eval_runtime": 169.2397, + "eval_samples_per_second": 29.544, + "eval_sentence_accuracy": 0.7679087336479624, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.248775, + "step": 377000 + }, + { + "epoch": 0.07, + "learning_rate": 6.291919191919192e-05, + "loss": 0.4591, + "step": 377100 + }, + { + "epoch": 0.07, + "learning_rate": 6.290909090909091e-05, + "loss": 0.4584, + "step": 377200 + }, + { + "epoch": 0.07, + "learning_rate": 6.28989898989899e-05, + "loss": 0.4582, + "step": 377300 + }, + { + "epoch": 0.07, + "learning_rate": 6.28888888888889e-05, + "loss": 0.4605, + "step": 377400 + }, + { + "epoch": 0.07, + "learning_rate": 6.287878787878788e-05, + "loss": 0.4582, + "step": 377500 + }, + { + "epoch": 0.07, + "learning_rate": 6.286868686868688e-05, + "loss": 0.4596, + "step": 377600 + }, + { + "epoch": 0.07, + "learning_rate": 6.285858585858586e-05, + "loss": 0.4567, + "step": 377700 + }, + { + "epoch": 0.07, + "learning_rate": 6.284848484848485e-05, + "loss": 0.4582, + "step": 377800 + }, + { + "epoch": 0.07, + "learning_rate": 6.283838383838384e-05, + "loss": 0.4559, + "step": 377900 + }, + { + "epoch": 0.07, + "learning_rate": 6.282828282828284e-05, + "loss": 0.4603, + "step": 378000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.45863205604890106, + "eval_average_loss_on_sentence_tokens": 0.42933159169902135, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.457275390625, + "eval_non_padding_tokens_in_labels": 133.54925, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3927, + "eval_padding_tokens_in_labels": 378.45075, + "eval_reconstruction_accuracy": 0.9154073929129787, + "eval_runtime": 177.3524, + "eval_samples_per_second": 28.192, + "eval_sentence_accuracy": 0.7275243598255783, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 378000 + }, + { + "epoch": 0.07, + "learning_rate": 6.281818181818181e-05, + "loss": 0.4608, + "step": 378100 + }, + { + "epoch": 0.07, + "learning_rate": 6.280808080808082e-05, + "loss": 0.4631, + "step": 378200 + }, + { + "epoch": 0.07, + "learning_rate": 6.27979797979798e-05, + "loss": 0.462, + "step": 378300 + }, + { + "epoch": 0.07, + "learning_rate": 6.278787878787879e-05, + "loss": 0.458, + "step": 378400 + }, + { + "epoch": 0.07, + "learning_rate": 6.277777777777778e-05, + "loss": 0.4563, + "step": 378500 + }, + { + "epoch": 0.07, + "learning_rate": 6.276767676767677e-05, + "loss": 0.4634, + "step": 378600 + }, + { + "epoch": 0.07, + "learning_rate": 6.275757575757575e-05, + "loss": 0.4585, + "step": 378700 + }, + { + "epoch": 0.07, + "learning_rate": 6.274747474747476e-05, + "loss": 0.4582, + "step": 378800 + }, + { + "epoch": 0.07, + "learning_rate": 6.273737373737374e-05, + "loss": 0.4616, + "step": 378900 + }, + { + "epoch": 0.07, + "learning_rate": 6.272727272727273e-05, + "loss": 0.4543, + "step": 379000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.45848817876813397, + "eval_average_loss_on_sentence_tokens": 0.4012269756917843, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4559277296066284, + "eval_non_padding_tokens_in_labels": 133.52945, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.396, + "eval_padding_tokens_in_labels": 378.47055, + "eval_reconstruction_accuracy": 0.9153299755906416, + "eval_runtime": 191.1163, + "eval_samples_per_second": 26.162, + "eval_sentence_accuracy": 0.7472903619430438, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 379000 + }, + { + "epoch": 0.07, + "learning_rate": 6.271717171717172e-05, + "loss": 0.4628, + "step": 379100 + }, + { + "epoch": 0.07, + "learning_rate": 6.270707070707071e-05, + "loss": 0.4547, + "step": 379200 + }, + { + "epoch": 0.07, + "learning_rate": 6.26969696969697e-05, + "loss": 0.4593, + "step": 379300 + }, + { + "epoch": 0.07, + "learning_rate": 6.26868686868687e-05, + "loss": 0.4569, + "step": 379400 + }, + { + "epoch": 0.07, + "learning_rate": 6.267676767676768e-05, + "loss": 0.4569, + "step": 379500 + }, + { + "epoch": 0.07, + "learning_rate": 6.266666666666667e-05, + "loss": 0.4563, + "step": 379600 + }, + { + "epoch": 0.07, + "learning_rate": 6.265656565656566e-05, + "loss": 0.4587, + "step": 379700 + }, + { + "epoch": 0.07, + "learning_rate": 6.264646464646465e-05, + "loss": 0.4591, + "step": 379800 + }, + { + "epoch": 0.07, + "learning_rate": 6.263636363636364e-05, + "loss": 0.4587, + "step": 379900 + }, + { + "epoch": 0.07, + "learning_rate": 6.262626262626264e-05, + "loss": 0.4576, + "step": 380000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.4577227811478777, + "eval_average_loss_on_sentence_tokens": 0.3805179820090636, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.45433592796325684, + "eval_non_padding_tokens_in_labels": 133.5108, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39335, + "eval_padding_tokens_in_labels": 378.4892, + "eval_reconstruction_accuracy": 0.9155907077079494, + "eval_runtime": 178.3451, + "eval_samples_per_second": 28.036, + "eval_sentence_accuracy": 0.7658720189495217, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 380000 + }, + { + "epoch": 0.07, + "learning_rate": 6.261616161616161e-05, + "loss": 0.4625, + "step": 380100 + }, + { + "epoch": 0.07, + "learning_rate": 6.26060606060606e-05, + "loss": 0.4608, + "step": 380200 + }, + { + "epoch": 0.07, + "learning_rate": 6.25959595959596e-05, + "loss": 0.4558, + "step": 380300 + }, + { + "epoch": 0.07, + "learning_rate": 6.258585858585859e-05, + "loss": 0.4566, + "step": 380400 + }, + { + "epoch": 0.07, + "learning_rate": 6.257575757575758e-05, + "loss": 0.4559, + "step": 380500 + }, + { + "epoch": 0.07, + "learning_rate": 6.256565656565657e-05, + "loss": 0.4647, + "step": 380600 + }, + { + "epoch": 0.07, + "learning_rate": 6.255555555555555e-05, + "loss": 0.4603, + "step": 380700 + }, + { + "epoch": 0.07, + "learning_rate": 6.254545454545456e-05, + "loss": 0.4612, + "step": 380800 + }, + { + "epoch": 0.07, + "learning_rate": 6.253535353535354e-05, + "loss": 0.457, + "step": 380900 + }, + { + "epoch": 0.07, + "learning_rate": 6.252525252525253e-05, + "loss": 0.4581, + "step": 381000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.4584356090498463, + "eval_average_loss_on_sentence_tokens": 0.38673753451887827, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4551660120487213, + "eval_non_padding_tokens_in_labels": 133.57075, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3807, + "eval_padding_tokens_in_labels": 378.42925, + "eval_reconstruction_accuracy": 0.9154943019871913, + "eval_runtime": 192.6455, + "eval_samples_per_second": 25.954, + "eval_sentence_accuracy": 0.7592953146589625, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2499, + "step": 381000 + }, + { + "epoch": 0.08, + "learning_rate": 6.251515151515152e-05, + "loss": 0.4575, + "step": 381100 + }, + { + "epoch": 0.08, + "learning_rate": 6.250505050505051e-05, + "loss": 0.4585, + "step": 381200 + }, + { + "epoch": 0.08, + "learning_rate": 6.249494949494949e-05, + "loss": 0.4608, + "step": 381300 + }, + { + "epoch": 0.08, + "learning_rate": 6.24848484848485e-05, + "loss": 0.46, + "step": 381400 + }, + { + "epoch": 0.08, + "learning_rate": 6.247474747474747e-05, + "loss": 0.456, + "step": 381500 + }, + { + "epoch": 0.08, + "learning_rate": 6.246464646464647e-05, + "loss": 0.4592, + "step": 381600 + }, + { + "epoch": 0.08, + "learning_rate": 6.245454545454546e-05, + "loss": 0.4593, + "step": 381700 + }, + { + "epoch": 0.08, + "learning_rate": 6.244444444444445e-05, + "loss": 0.4588, + "step": 381800 + }, + { + "epoch": 0.08, + "learning_rate": 6.243434343434343e-05, + "loss": 0.4624, + "step": 381900 + }, + { + "epoch": 0.08, + "learning_rate": 6.242424242424243e-05, + "loss": 0.4572, + "step": 382000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.45808529726894753, + "eval_average_loss_on_sentence_tokens": 0.4252352477784559, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4566992223262787, + "eval_non_padding_tokens_in_labels": 133.5018, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3673, + "eval_padding_tokens_in_labels": 378.4982, + "eval_reconstruction_accuracy": 0.9154255810884175, + "eval_runtime": 184.6578, + "eval_samples_per_second": 27.077, + "eval_sentence_accuracy": 0.7363396558221329, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "step": 382000 + }, + { + "epoch": 0.08, + "learning_rate": 6.241414141414141e-05, + "loss": 0.4581, + "step": 382100 + }, + { + "epoch": 0.08, + "learning_rate": 6.24040404040404e-05, + "loss": 0.4566, + "step": 382200 + }, + { + "epoch": 0.08, + "learning_rate": 6.23939393939394e-05, + "loss": 0.4598, + "step": 382300 + }, + { + "epoch": 0.08, + "learning_rate": 6.238383838383839e-05, + "loss": 0.4619, + "step": 382400 + }, + { + "epoch": 0.08, + "learning_rate": 6.237373737373737e-05, + "loss": 0.4581, + "step": 382500 + }, + { + "epoch": 0.08, + "learning_rate": 6.236363636363637e-05, + "loss": 0.4589, + "step": 382600 + }, + { + "epoch": 0.08, + "learning_rate": 6.235353535353535e-05, + "loss": 0.4524, + "step": 382700 + }, + { + "epoch": 0.08, + "learning_rate": 6.234343434343434e-05, + "loss": 0.4583, + "step": 382800 + }, + { + "epoch": 0.08, + "learning_rate": 6.233333333333334e-05, + "loss": 0.4624, + "step": 382900 + }, + { + "epoch": 0.08, + "learning_rate": 6.232323232323233e-05, + "loss": 0.4571, + "step": 383000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.4579418128069426, + "eval_average_loss_on_sentence_tokens": 0.37753832785792507, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4542871117591858, + "eval_non_padding_tokens_in_labels": 133.51245, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38565, + "eval_padding_tokens_in_labels": 378.48755, + "eval_reconstruction_accuracy": 0.9156052990245428, + "eval_runtime": 181.3633, + "eval_samples_per_second": 27.569, + "eval_sentence_accuracy": 0.7690482172017155, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.248775, + "step": 383000 + }, + { + "epoch": 0.08, + "learning_rate": 6.23131313131313e-05, + "loss": 0.4582, + "step": 383100 + }, + { + "epoch": 0.08, + "learning_rate": 6.230303030303031e-05, + "loss": 0.4566, + "step": 383200 + }, + { + "epoch": 0.08, + "learning_rate": 6.229292929292929e-05, + "loss": 0.4633, + "step": 383300 + }, + { + "epoch": 0.08, + "learning_rate": 6.228282828282828e-05, + "loss": 0.4568, + "step": 383400 + }, + { + "epoch": 0.08, + "learning_rate": 6.227272727272727e-05, + "loss": 0.4621, + "step": 383500 + }, + { + "epoch": 0.08, + "learning_rate": 6.226262626262627e-05, + "loss": 0.4576, + "step": 383600 + }, + { + "epoch": 0.08, + "learning_rate": 6.225252525252526e-05, + "loss": 0.4614, + "step": 383700 + }, + { + "epoch": 0.08, + "learning_rate": 6.224242424242425e-05, + "loss": 0.4569, + "step": 383800 + }, + { + "epoch": 0.08, + "learning_rate": 6.223232323232323e-05, + "loss": 0.4571, + "step": 383900 + }, + { + "epoch": 0.08, + "learning_rate": 6.222222222222222e-05, + "loss": 0.4563, + "step": 384000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.4577885408119193, + "eval_average_loss_on_sentence_tokens": 0.4310150971398025, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.4566015601158142, + "eval_non_padding_tokens_in_labels": 133.4757, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36015, + "eval_padding_tokens_in_labels": 378.5243, + "eval_reconstruction_accuracy": 0.9154922645349641, + "eval_runtime": 186.506, + "eval_samples_per_second": 26.809, + "eval_sentence_accuracy": 0.724096936852872, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2464, + "step": 384000 + }, + { + "epoch": 0.08, + "learning_rate": 6.221212121212121e-05, + "loss": 0.4559, + "step": 384100 + }, + { + "epoch": 0.08, + "learning_rate": 6.22020202020202e-05, + "loss": 0.4566, + "step": 384200 + }, + { + "epoch": 0.08, + "learning_rate": 6.21919191919192e-05, + "loss": 0.4567, + "step": 384300 + }, + { + "epoch": 0.08, + "learning_rate": 6.218181818181819e-05, + "loss": 0.4566, + "step": 384400 + }, + { + "epoch": 0.08, + "learning_rate": 6.217171717171717e-05, + "loss": 0.4588, + "step": 384500 + }, + { + "epoch": 0.08, + "learning_rate": 6.216161616161617e-05, + "loss": 0.4585, + "step": 384600 + }, + { + "epoch": 0.08, + "learning_rate": 6.215151515151515e-05, + "loss": 0.4572, + "step": 384700 + }, + { + "epoch": 0.08, + "learning_rate": 6.214141414141414e-05, + "loss": 0.4569, + "step": 384800 + }, + { + "epoch": 0.08, + "learning_rate": 6.213131313131313e-05, + "loss": 0.4561, + "step": 384900 + }, + { + "epoch": 0.08, + "learning_rate": 6.212121212121213e-05, + "loss": 0.4574, + "step": 385000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.4581764295515955, + "eval_average_loss_on_sentence_tokens": 0.3813585004631373, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4547168016433716, + "eval_non_padding_tokens_in_labels": 133.5331, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3774, + "eval_padding_tokens_in_labels": 378.4669, + "eval_reconstruction_accuracy": 0.9154950753480681, + "eval_runtime": 180.2189, + "eval_samples_per_second": 27.744, + "eval_sentence_accuracy": 0.7600400165090531, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 385000 + }, + { + "epoch": 0.08, + "learning_rate": 6.21111111111111e-05, + "loss": 0.4589, + "step": 385100 + }, + { + "epoch": 0.08, + "learning_rate": 6.210101010101011e-05, + "loss": 0.4601, + "step": 385200 + }, + { + "epoch": 0.08, + "learning_rate": 6.209090909090909e-05, + "loss": 0.459, + "step": 385300 + }, + { + "epoch": 0.08, + "learning_rate": 6.208080808080808e-05, + "loss": 0.4579, + "step": 385400 + }, + { + "epoch": 0.08, + "learning_rate": 6.207070707070707e-05, + "loss": 0.4576, + "step": 385500 + }, + { + "epoch": 0.08, + "learning_rate": 6.206060606060606e-05, + "loss": 0.4604, + "step": 385600 + }, + { + "epoch": 0.08, + "learning_rate": 6.205050505050506e-05, + "loss": 0.4631, + "step": 385700 + }, + { + "epoch": 0.08, + "learning_rate": 6.204040404040405e-05, + "loss": 0.4559, + "step": 385800 + }, + { + "epoch": 0.08, + "learning_rate": 6.203030303030304e-05, + "loss": 0.4592, + "step": 385900 + }, + { + "epoch": 0.08, + "learning_rate": 6.202020202020202e-05, + "loss": 0.4549, + "step": 386000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.4585560090350433, + "eval_average_loss_on_sentence_tokens": 0.397351480903161, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4557812511920929, + "eval_non_padding_tokens_in_labels": 133.52545, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37, + "eval_padding_tokens_in_labels": 378.47455, + "eval_reconstruction_accuracy": 0.9153649581002338, + "eval_runtime": 187.2129, + "eval_samples_per_second": 26.708, + "eval_sentence_accuracy": 0.7546611157966515, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.25, + "step": 386000 + }, + { + "epoch": 0.08, + "learning_rate": 6.201010101010102e-05, + "loss": 0.462, + "step": 386100 + }, + { + "epoch": 0.08, + "learning_rate": 6.2e-05, + "loss": 0.4633, + "step": 386200 + }, + { + "epoch": 0.08, + "learning_rate": 6.1989898989899e-05, + "loss": 0.4597, + "step": 386300 + }, + { + "epoch": 0.08, + "learning_rate": 6.197979797979799e-05, + "loss": 0.4569, + "step": 386400 + }, + { + "epoch": 0.08, + "learning_rate": 6.196969696969698e-05, + "loss": 0.4596, + "step": 386500 + }, + { + "epoch": 0.08, + "learning_rate": 6.195959595959596e-05, + "loss": 0.4602, + "step": 386600 + }, + { + "epoch": 0.08, + "learning_rate": 6.194949494949496e-05, + "loss": 0.4571, + "step": 386700 + }, + { + "epoch": 0.08, + "learning_rate": 6.193939393939394e-05, + "loss": 0.4574, + "step": 386800 + }, + { + "epoch": 0.08, + "learning_rate": 6.192929292929293e-05, + "loss": 0.4522, + "step": 386900 + }, + { + "epoch": 0.08, + "learning_rate": 6.191919191919192e-05, + "loss": 0.4592, + "step": 387000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.45707962289249054, + "eval_average_loss_on_sentence_tokens": 0.41235995642397405, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.4550488293170929, + "eval_non_padding_tokens_in_labels": 133.51545, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3798, + "eval_padding_tokens_in_labels": 378.48455, + "eval_reconstruction_accuracy": 0.9155049575031934, + "eval_runtime": 189.3206, + "eval_samples_per_second": 26.41, + "eval_sentence_accuracy": 0.7419159473863657, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2496, + "step": 387000 + }, + { + "epoch": 0.08, + "learning_rate": 6.190909090909092e-05, + "loss": 0.4578, + "step": 387100 + }, + { + "epoch": 0.08, + "learning_rate": 6.18989898989899e-05, + "loss": 0.4574, + "step": 387200 + }, + { + "epoch": 0.08, + "learning_rate": 6.18888888888889e-05, + "loss": 0.461, + "step": 387300 + }, + { + "epoch": 0.08, + "learning_rate": 6.187878787878788e-05, + "loss": 0.4553, + "step": 387400 + }, + { + "epoch": 0.08, + "learning_rate": 6.186868686868687e-05, + "loss": 0.461, + "step": 387500 + }, + { + "epoch": 0.08, + "learning_rate": 6.185858585858586e-05, + "loss": 0.4599, + "step": 387600 + }, + { + "epoch": 0.08, + "learning_rate": 6.184848484848486e-05, + "loss": 0.4595, + "step": 387700 + }, + { + "epoch": 0.08, + "learning_rate": 6.183838383838383e-05, + "loss": 0.4592, + "step": 387800 + }, + { + "epoch": 0.08, + "learning_rate": 6.182828282828284e-05, + "loss": 0.4599, + "step": 387900 + }, + { + "epoch": 0.08, + "learning_rate": 6.181818181818182e-05, + "loss": 0.4588, + "step": 388000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.45723222994664386, + "eval_average_loss_on_sentence_tokens": 0.42001382784061153, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.45555663108825684, + "eval_non_padding_tokens_in_labels": 133.53635, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3855, + "eval_padding_tokens_in_labels": 378.46365, + "eval_reconstruction_accuracy": 0.9154853099251338, + "eval_runtime": 198.8623, + "eval_samples_per_second": 25.143, + "eval_sentence_accuracy": 0.741889030452025, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 388000 + }, + { + "epoch": 0.08, + "learning_rate": 6.180808080808081e-05, + "loss": 0.4583, + "step": 388100 + }, + { + "epoch": 0.08, + "learning_rate": 6.17979797979798e-05, + "loss": 0.4562, + "step": 388200 + }, + { + "epoch": 0.08, + "learning_rate": 6.17878787878788e-05, + "loss": 0.4602, + "step": 388300 + }, + { + "epoch": 0.08, + "learning_rate": 6.177777777777779e-05, + "loss": 0.4553, + "step": 388400 + }, + { + "epoch": 0.08, + "learning_rate": 6.176767676767678e-05, + "loss": 0.4616, + "step": 388500 + }, + { + "epoch": 0.08, + "learning_rate": 6.175757575757576e-05, + "loss": 0.4592, + "step": 388600 + }, + { + "epoch": 0.08, + "learning_rate": 6.174747474747475e-05, + "loss": 0.4605, + "step": 388700 + }, + { + "epoch": 0.08, + "learning_rate": 6.173737373737374e-05, + "loss": 0.4602, + "step": 388800 + }, + { + "epoch": 0.08, + "learning_rate": 6.172727272727273e-05, + "loss": 0.4579, + "step": 388900 + }, + { + "epoch": 0.08, + "learning_rate": 6.171717171717172e-05, + "loss": 0.4615, + "step": 389000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.45730543618267516, + "eval_average_loss_on_sentence_tokens": 0.39917231115251933, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.45462891459465027, + "eval_non_padding_tokens_in_labels": 133.54405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3714, + "eval_padding_tokens_in_labels": 378.45595, + "eval_reconstruction_accuracy": 0.9154660982261125, + "eval_runtime": 197.1914, + "eval_samples_per_second": 25.356, + "eval_sentence_accuracy": 0.7509869542591563, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.25, + "step": 389000 + }, + { + "epoch": 0.08, + "learning_rate": 6.170707070707072e-05, + "loss": 0.4592, + "step": 389100 + }, + { + "epoch": 0.08, + "learning_rate": 6.16969696969697e-05, + "loss": 0.4561, + "step": 389200 + }, + { + "epoch": 0.08, + "learning_rate": 6.168686868686869e-05, + "loss": 0.4569, + "step": 389300 + }, + { + "epoch": 0.08, + "learning_rate": 6.167676767676768e-05, + "loss": 0.4553, + "step": 389400 + }, + { + "epoch": 0.08, + "learning_rate": 6.166666666666667e-05, + "loss": 0.4582, + "step": 389500 + }, + { + "epoch": 0.08, + "learning_rate": 6.165656565656566e-05, + "loss": 0.4598, + "step": 389600 + }, + { + "epoch": 0.08, + "learning_rate": 6.164646464646465e-05, + "loss": 0.4605, + "step": 389700 + }, + { + "epoch": 0.08, + "learning_rate": 6.163636363636363e-05, + "loss": 0.4571, + "step": 389800 + }, + { + "epoch": 0.08, + "learning_rate": 6.162626262626264e-05, + "loss": 0.4568, + "step": 389900 + }, + { + "epoch": 0.08, + "learning_rate": 6.161616161616162e-05, + "loss": 0.4571, + "step": 390000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.45797027988585204, + "eval_average_loss_on_sentence_tokens": 0.3978860027571819, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.45518553256988525, + "eval_non_padding_tokens_in_labels": 133.5261, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3782, + "eval_padding_tokens_in_labels": 378.4739, + "eval_reconstruction_accuracy": 0.9154783777880097, + "eval_runtime": 192.9148, + "eval_samples_per_second": 25.918, + "eval_sentence_accuracy": 0.7499147630412547, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.25, + "step": 390000 + }, + { + "epoch": 0.08, + "learning_rate": 6.160606060606061e-05, + "loss": 0.4569, + "step": 390100 + }, + { + "epoch": 0.08, + "learning_rate": 6.15959595959596e-05, + "loss": 0.4622, + "step": 390200 + }, + { + "epoch": 0.08, + "learning_rate": 6.158585858585859e-05, + "loss": 0.4583, + "step": 390300 + }, + { + "epoch": 0.08, + "learning_rate": 6.157575757575757e-05, + "loss": 0.4574, + "step": 390400 + }, + { + "epoch": 0.08, + "learning_rate": 6.156565656565658e-05, + "loss": 0.4587, + "step": 390500 + }, + { + "epoch": 0.08, + "learning_rate": 6.155555555555555e-05, + "loss": 0.4562, + "step": 390600 + }, + { + "epoch": 0.08, + "learning_rate": 6.154545454545455e-05, + "loss": 0.4595, + "step": 390700 + }, + { + "epoch": 0.08, + "learning_rate": 6.153535353535354e-05, + "loss": 0.4561, + "step": 390800 + }, + { + "epoch": 0.08, + "learning_rate": 6.152525252525253e-05, + "loss": 0.4604, + "step": 390900 + }, + { + "epoch": 0.09, + "learning_rate": 6.151515151515151e-05, + "loss": 0.4553, + "step": 391000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4570821841787831, + "eval_average_loss_on_sentence_tokens": 0.4000180710115005, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4545019567012787, + "eval_non_padding_tokens_in_labels": 133.5053, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37525, + "eval_padding_tokens_in_labels": 378.4947, + "eval_reconstruction_accuracy": 0.915623868132974, + "eval_runtime": 188.0426, + "eval_samples_per_second": 26.59, + "eval_sentence_accuracy": 0.745437579629264, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2497749999999999, + "step": 391000 + }, + { + "epoch": 0.09, + "learning_rate": 6.150505050505051e-05, + "loss": 0.4576, + "step": 391100 + }, + { + "epoch": 0.09, + "learning_rate": 6.14949494949495e-05, + "loss": 0.4548, + "step": 391200 + }, + { + "epoch": 0.09, + "learning_rate": 6.148484848484849e-05, + "loss": 0.4598, + "step": 391300 + }, + { + "epoch": 0.09, + "learning_rate": 6.147474747474748e-05, + "loss": 0.4578, + "step": 391400 + }, + { + "epoch": 0.09, + "learning_rate": 6.146464646464647e-05, + "loss": 0.4621, + "step": 391500 + }, + { + "epoch": 0.09, + "learning_rate": 6.145454545454545e-05, + "loss": 0.4553, + "step": 391600 + }, + { + "epoch": 0.09, + "learning_rate": 6.144444444444445e-05, + "loss": 0.4568, + "step": 391700 + }, + { + "epoch": 0.09, + "learning_rate": 6.143434343434343e-05, + "loss": 0.4581, + "step": 391800 + }, + { + "epoch": 0.09, + "learning_rate": 6.142424242424242e-05, + "loss": 0.4588, + "step": 391900 + }, + { + "epoch": 0.09, + "learning_rate": 6.141414141414142e-05, + "loss": 0.4559, + "step": 392000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4580343463435289, + "eval_average_loss_on_sentence_tokens": 0.42305042731990666, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.45643556118011475, + "eval_non_padding_tokens_in_labels": 133.52875, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38965, + "eval_padding_tokens_in_labels": 378.47125, + "eval_reconstruction_accuracy": 0.9154839487739275, + "eval_runtime": 197.1417, + "eval_samples_per_second": 25.362, + "eval_sentence_accuracy": 0.7286548710678845, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 392000 + }, + { + "epoch": 0.09, + "learning_rate": 6.140404040404041e-05, + "loss": 0.4578, + "step": 392100 + }, + { + "epoch": 0.09, + "learning_rate": 6.139393939393939e-05, + "loss": 0.4595, + "step": 392200 + }, + { + "epoch": 0.09, + "learning_rate": 6.138383838383839e-05, + "loss": 0.4559, + "step": 392300 + }, + { + "epoch": 0.09, + "learning_rate": 6.137373737373737e-05, + "loss": 0.4578, + "step": 392400 + }, + { + "epoch": 0.09, + "learning_rate": 6.136363636363636e-05, + "loss": 0.4594, + "step": 392500 + }, + { + "epoch": 0.09, + "learning_rate": 6.135353535353535e-05, + "loss": 0.4534, + "step": 392600 + }, + { + "epoch": 0.09, + "learning_rate": 6.134343434343435e-05, + "loss": 0.4567, + "step": 392700 + }, + { + "epoch": 0.09, + "learning_rate": 6.133333333333334e-05, + "loss": 0.4562, + "step": 392800 + }, + { + "epoch": 0.09, + "learning_rate": 6.132323232323233e-05, + "loss": 0.4599, + "step": 392900 + }, + { + "epoch": 0.09, + "learning_rate": 6.131313131313131e-05, + "loss": 0.4583, + "step": 393000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4567707143842645, + "eval_average_loss_on_sentence_tokens": 0.36687443595720093, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4527050852775574, + "eval_non_padding_tokens_in_labels": 133.48715, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3574, + "eval_padding_tokens_in_labels": 378.51285, + "eval_reconstruction_accuracy": 0.9157015630111813, + "eval_runtime": 189.1964, + "eval_samples_per_second": 26.428, + "eval_sentence_accuracy": 0.7730588404184686, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.248775, + "step": 393000 + }, + { + "epoch": 0.09, + "learning_rate": 6.13030303030303e-05, + "loss": 0.4508, + "step": 393100 + }, + { + "epoch": 0.09, + "learning_rate": 6.129292929292929e-05, + "loss": 0.456, + "step": 393200 + }, + { + "epoch": 0.09, + "learning_rate": 6.128282828282828e-05, + "loss": 0.4556, + "step": 393300 + }, + { + "epoch": 0.09, + "learning_rate": 6.127272727272728e-05, + "loss": 0.4576, + "step": 393400 + }, + { + "epoch": 0.09, + "learning_rate": 6.126262626262627e-05, + "loss": 0.4555, + "step": 393500 + }, + { + "epoch": 0.09, + "learning_rate": 6.125252525252525e-05, + "loss": 0.459, + "step": 393600 + }, + { + "epoch": 0.09, + "learning_rate": 6.124242424242425e-05, + "loss": 0.4606, + "step": 393700 + }, + { + "epoch": 0.09, + "learning_rate": 6.123232323232323e-05, + "loss": 0.4574, + "step": 393800 + }, + { + "epoch": 0.09, + "learning_rate": 6.122222222222222e-05, + "loss": 0.4555, + "step": 393900 + }, + { + "epoch": 0.09, + "learning_rate": 6.121212121212121e-05, + "loss": 0.459, + "step": 394000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.45624421374660395, + "eval_average_loss_on_sentence_tokens": 0.4071993029675414, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.45406249165534973, + "eval_non_padding_tokens_in_labels": 133.5354, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38665, + "eval_padding_tokens_in_labels": 378.4646, + "eval_reconstruction_accuracy": 0.91572055784763, + "eval_runtime": 195.7624, + "eval_samples_per_second": 25.541, + "eval_sentence_accuracy": 0.7432438494805031, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.2496, + "step": 394000 + }, + { + "epoch": 0.09, + "learning_rate": 6.12020202020202e-05, + "loss": 0.4586, + "step": 394100 + }, + { + "epoch": 0.09, + "learning_rate": 6.11919191919192e-05, + "loss": 0.4581, + "step": 394200 + }, + { + "epoch": 0.09, + "learning_rate": 6.118181818181819e-05, + "loss": 0.4536, + "step": 394300 + }, + { + "epoch": 0.09, + "learning_rate": 6.117171717171718e-05, + "loss": 0.4558, + "step": 394400 + }, + { + "epoch": 0.09, + "learning_rate": 6.116161616161616e-05, + "loss": 0.4569, + "step": 394500 + }, + { + "epoch": 0.09, + "learning_rate": 6.115151515151515e-05, + "loss": 0.4591, + "step": 394600 + }, + { + "epoch": 0.09, + "learning_rate": 6.114141414141414e-05, + "loss": 0.4606, + "step": 394700 + }, + { + "epoch": 0.09, + "learning_rate": 6.113131313131314e-05, + "loss": 0.4543, + "step": 394800 + }, + { + "epoch": 0.09, + "learning_rate": 6.112121212121213e-05, + "loss": 0.4578, + "step": 394900 + }, + { + "epoch": 0.09, + "learning_rate": 6.111111111111112e-05, + "loss": 0.4576, + "step": 395000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4569539319395878, + "eval_average_loss_on_sentence_tokens": 0.4251897659100648, + "eval_average_shuffling_prob": 0.565, + "eval_loss": 0.4555371105670929, + "eval_non_padding_tokens_in_labels": 133.5101, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3694, + "eval_padding_tokens_in_labels": 378.4899, + "eval_reconstruction_accuracy": 0.9156822721533191, + "eval_runtime": 188.6715, + "eval_samples_per_second": 26.501, + "eval_sentence_accuracy": 0.7203151075780142, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.245775, + "step": 395000 + }, + { + "epoch": 0.09, + "learning_rate": 6.11010101010101e-05, + "loss": 0.459, + "step": 395100 + }, + { + "epoch": 0.09, + "learning_rate": 6.10909090909091e-05, + "loss": 0.4582, + "step": 395200 + }, + { + "epoch": 0.09, + "learning_rate": 6.108080808080808e-05, + "loss": 0.4543, + "step": 395300 + }, + { + "epoch": 0.09, + "learning_rate": 6.107070707070708e-05, + "loss": 0.4567, + "step": 395400 + }, + { + "epoch": 0.09, + "learning_rate": 6.106060606060607e-05, + "loss": 0.4564, + "step": 395500 + }, + { + "epoch": 0.09, + "learning_rate": 6.105050505050506e-05, + "loss": 0.457, + "step": 395600 + }, + { + "epoch": 0.09, + "learning_rate": 6.104040404040404e-05, + "loss": 0.4548, + "step": 395700 + }, + { + "epoch": 0.09, + "learning_rate": 6.1030303030303036e-05, + "loss": 0.4596, + "step": 395800 + }, + { + "epoch": 0.09, + "learning_rate": 6.102020202020202e-05, + "loss": 0.4579, + "step": 395900 + }, + { + "epoch": 0.09, + "learning_rate": 6.101010101010102e-05, + "loss": 0.458, + "step": 396000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4562622053408025, + "eval_average_loss_on_sentence_tokens": 0.38738915950318786, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4532128870487213, + "eval_non_padding_tokens_in_labels": 133.4975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3692, + "eval_padding_tokens_in_labels": 378.5025, + "eval_reconstruction_accuracy": 0.9156828038144528, + "eval_runtime": 183.9989, + "eval_samples_per_second": 27.174, + "eval_sentence_accuracy": 0.7617357833725125, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 396000 + }, + { + "epoch": 0.09, + "learning_rate": 6.1e-05, + "loss": 0.4601, + "step": 396100 + }, + { + "epoch": 0.09, + "learning_rate": 6.0989898989899e-05, + "loss": 0.4571, + "step": 396200 + }, + { + "epoch": 0.09, + "learning_rate": 6.097979797979798e-05, + "loss": 0.4558, + "step": 396300 + }, + { + "epoch": 0.09, + "learning_rate": 6.0969696969696975e-05, + "loss": 0.4562, + "step": 396400 + }, + { + "epoch": 0.09, + "learning_rate": 6.095959595959596e-05, + "loss": 0.4563, + "step": 396500 + }, + { + "epoch": 0.09, + "learning_rate": 6.094949494949496e-05, + "loss": 0.4575, + "step": 396600 + }, + { + "epoch": 0.09, + "learning_rate": 6.0939393939393944e-05, + "loss": 0.4565, + "step": 396700 + }, + { + "epoch": 0.09, + "learning_rate": 6.0929292929292936e-05, + "loss": 0.4565, + "step": 396800 + }, + { + "epoch": 0.09, + "learning_rate": 6.091919191919192e-05, + "loss": 0.4561, + "step": 396900 + }, + { + "epoch": 0.09, + "learning_rate": 6.090909090909091e-05, + "loss": 0.4572, + "step": 397000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.45654679444990764, + "eval_average_loss_on_sentence_tokens": 0.38599136135314005, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.4532812535762787, + "eval_non_padding_tokens_in_labels": 133.4938, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38075, + "eval_padding_tokens_in_labels": 378.5062, + "eval_reconstruction_accuracy": 0.9156208166540261, + "eval_runtime": 181.9879, + "eval_samples_per_second": 27.474, + "eval_sentence_accuracy": 0.7609462199651874, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 397000 + }, + { + "epoch": 0.09, + "learning_rate": 6.08989898989899e-05, + "loss": 0.459, + "step": 397100 + }, + { + "epoch": 0.09, + "learning_rate": 6.08888888888889e-05, + "loss": 0.4543, + "step": 397200 + }, + { + "epoch": 0.09, + "learning_rate": 6.087878787878788e-05, + "loss": 0.4594, + "step": 397300 + }, + { + "epoch": 0.09, + "learning_rate": 6.0868686868686874e-05, + "loss": 0.4577, + "step": 397400 + }, + { + "epoch": 0.09, + "learning_rate": 6.085858585858586e-05, + "loss": 0.4575, + "step": 397500 + }, + { + "epoch": 0.09, + "learning_rate": 6.084848484848485e-05, + "loss": 0.4568, + "step": 397600 + }, + { + "epoch": 0.09, + "learning_rate": 6.0838383838383837e-05, + "loss": 0.4586, + "step": 397700 + }, + { + "epoch": 0.09, + "learning_rate": 6.0828282828282835e-05, + "loss": 0.4578, + "step": 397800 + }, + { + "epoch": 0.09, + "learning_rate": 6.081818181818182e-05, + "loss": 0.4592, + "step": 397900 + }, + { + "epoch": 0.09, + "learning_rate": 6.080808080808081e-05, + "loss": 0.4552, + "step": 398000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4567202181332466, + "eval_average_loss_on_sentence_tokens": 0.4388281720035666, + "eval_average_shuffling_prob": 0.565, + "eval_loss": 0.4558691382408142, + "eval_non_padding_tokens_in_labels": 133.52465, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36595, + "eval_padding_tokens_in_labels": 378.47535, + "eval_reconstruction_accuracy": 0.9156978148821544, + "eval_runtime": 178.071, + "eval_samples_per_second": 28.079, + "eval_sentence_accuracy": 0.7178746388644642, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.245775, + "step": 398000 + }, + { + "epoch": 0.09, + "learning_rate": 6.07979797979798e-05, + "loss": 0.4575, + "step": 398100 + }, + { + "epoch": 0.09, + "learning_rate": 6.0787878787878796e-05, + "loss": 0.4556, + "step": 398200 + }, + { + "epoch": 0.09, + "learning_rate": 6.0777777777777775e-05, + "loss": 0.4563, + "step": 398300 + }, + { + "epoch": 0.09, + "learning_rate": 6.0767676767676774e-05, + "loss": 0.4571, + "step": 398400 + }, + { + "epoch": 0.09, + "learning_rate": 6.075757575757576e-05, + "loss": 0.4546, + "step": 398500 + }, + { + "epoch": 0.09, + "learning_rate": 6.074747474747475e-05, + "loss": 0.4553, + "step": 398600 + }, + { + "epoch": 0.09, + "learning_rate": 6.0737373737373736e-05, + "loss": 0.4539, + "step": 398700 + }, + { + "epoch": 0.09, + "learning_rate": 6.0727272727272735e-05, + "loss": 0.4557, + "step": 398800 + }, + { + "epoch": 0.09, + "learning_rate": 6.071717171717172e-05, + "loss": 0.459, + "step": 398900 + }, + { + "epoch": 0.09, + "learning_rate": 6.070707070707071e-05, + "loss": 0.4566, + "step": 399000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.45628652322661667, + "eval_average_loss_on_sentence_tokens": 0.3723490326598365, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.45259764790534973, + "eval_non_padding_tokens_in_labels": 133.54095, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3864, + "eval_padding_tokens_in_labels": 378.45905, + "eval_reconstruction_accuracy": 0.9158295038001156, + "eval_runtime": 193.4153, + "eval_samples_per_second": 25.851, + "eval_sentence_accuracy": 0.7633507994329499, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 399000 + }, + { + "epoch": 0.09, + "learning_rate": 6.06969696969697e-05, + "loss": 0.4563, + "step": 399100 + }, + { + "epoch": 0.09, + "learning_rate": 6.068686868686869e-05, + "loss": 0.4545, + "step": 399200 + }, + { + "epoch": 0.09, + "learning_rate": 6.0676767676767674e-05, + "loss": 0.4595, + "step": 399300 + }, + { + "epoch": 0.09, + "learning_rate": 6.066666666666667e-05, + "loss": 0.4533, + "step": 399400 + }, + { + "epoch": 0.09, + "learning_rate": 6.065656565656566e-05, + "loss": 0.46, + "step": 399500 + }, + { + "epoch": 0.09, + "learning_rate": 6.064646464646465e-05, + "loss": 0.4568, + "step": 399600 + }, + { + "epoch": 0.09, + "learning_rate": 6.0636363636363635e-05, + "loss": 0.4561, + "step": 399700 + }, + { + "epoch": 0.09, + "learning_rate": 6.0626262626262634e-05, + "loss": 0.4533, + "step": 399800 + }, + { + "epoch": 0.09, + "learning_rate": 6.061616161616161e-05, + "loss": 0.4564, + "step": 399900 + }, + { + "epoch": 0.09, + "learning_rate": 6.060606060606061e-05, + "loss": 0.4549, + "step": 400000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.4553141816035145, + "eval_average_loss_on_sentence_tokens": 0.3733324554825501, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.4515722692012787, + "eval_non_padding_tokens_in_labels": 133.55365, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39035, + "eval_padding_tokens_in_labels": 378.44635, + "eval_reconstruction_accuracy": 0.915894086809011, + "eval_runtime": 185.7406, + "eval_samples_per_second": 26.919, + "eval_sentence_accuracy": 0.7674556319198952, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 400000 + }, + { + "epoch": 0.09, + "learning_rate": 6.05959595959596e-05, + "loss": 0.4584, + "step": 400100 + }, + { + "epoch": 0.09, + "learning_rate": 6.058585858585859e-05, + "loss": 0.4557, + "step": 400200 + }, + { + "epoch": 0.09, + "learning_rate": 6.0575757575757574e-05, + "loss": 0.4593, + "step": 400300 + }, + { + "epoch": 0.09, + "learning_rate": 6.056565656565657e-05, + "loss": 0.4564, + "step": 400400 + }, + { + "epoch": 0.09, + "learning_rate": 6.055555555555555e-05, + "loss": 0.4566, + "step": 400500 + }, + { + "epoch": 0.09, + "learning_rate": 6.054545454545455e-05, + "loss": 0.4547, + "step": 400600 + }, + { + "epoch": 0.09, + "learning_rate": 6.0535353535353535e-05, + "loss": 0.4574, + "step": 400700 + }, + { + "epoch": 0.09, + "learning_rate": 6.052525252525253e-05, + "loss": 0.4563, + "step": 400800 + }, + { + "epoch": 0.09, + "learning_rate": 6.051515151515151e-05, + "loss": 0.4578, + "step": 400900 + }, + { + "epoch": 0.1, + "learning_rate": 6.050505050505051e-05, + "loss": 0.4591, + "step": 401000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4561638007300442, + "eval_average_loss_on_sentence_tokens": 0.40011427615319023, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4536816477775574, + "eval_non_padding_tokens_in_labels": 133.53325, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3724, + "eval_padding_tokens_in_labels": 378.46675, + "eval_reconstruction_accuracy": 0.9157627381934555, + "eval_runtime": 201.7913, + "eval_samples_per_second": 24.778, + "eval_sentence_accuracy": 0.740996285463061, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 401000 + }, + { + "epoch": 0.1, + "learning_rate": 6.0494949494949496e-05, + "loss": 0.4577, + "step": 401100 + }, + { + "epoch": 0.1, + "learning_rate": 6.048484848484849e-05, + "loss": 0.4566, + "step": 401200 + }, + { + "epoch": 0.1, + "learning_rate": 6.047474747474747e-05, + "loss": 0.4533, + "step": 401300 + }, + { + "epoch": 0.1, + "learning_rate": 6.0464646464646465e-05, + "loss": 0.4564, + "step": 401400 + }, + { + "epoch": 0.1, + "learning_rate": 6.045454545454545e-05, + "loss": 0.4557, + "step": 401500 + }, + { + "epoch": 0.1, + "learning_rate": 6.044444444444445e-05, + "loss": 0.4532, + "step": 401600 + }, + { + "epoch": 0.1, + "learning_rate": 6.0434343434343434e-05, + "loss": 0.4573, + "step": 401700 + }, + { + "epoch": 0.1, + "learning_rate": 6.0424242424242426e-05, + "loss": 0.4582, + "step": 401800 + }, + { + "epoch": 0.1, + "learning_rate": 6.041414141414141e-05, + "loss": 0.4566, + "step": 401900 + }, + { + "epoch": 0.1, + "learning_rate": 6.040404040404041e-05, + "loss": 0.4532, + "step": 402000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.45530114400818616, + "eval_average_loss_on_sentence_tokens": 0.40906591942493065, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.45317381620407104, + "eval_non_padding_tokens_in_labels": 133.5336, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3919, + "eval_padding_tokens_in_labels": 378.4664, + "eval_reconstruction_accuracy": 0.9158548744422147, + "eval_runtime": 204.7816, + "eval_samples_per_second": 24.416, + "eval_sentence_accuracy": 0.7460566691190985, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 402000 + }, + { + "epoch": 0.1, + "learning_rate": 6.039393939393939e-05, + "loss": 0.4569, + "step": 402100 + }, + { + "epoch": 0.1, + "learning_rate": 6.038383838383839e-05, + "loss": 0.4594, + "step": 402200 + }, + { + "epoch": 0.1, + "learning_rate": 6.037373737373737e-05, + "loss": 0.4573, + "step": 402300 + }, + { + "epoch": 0.1, + "learning_rate": 6.0363636363636365e-05, + "loss": 0.458, + "step": 402400 + }, + { + "epoch": 0.1, + "learning_rate": 6.035353535353535e-05, + "loss": 0.4573, + "step": 402500 + }, + { + "epoch": 0.1, + "learning_rate": 6.034343434343435e-05, + "loss": 0.4594, + "step": 402600 + }, + { + "epoch": 0.1, + "learning_rate": 6.033333333333334e-05, + "loss": 0.4564, + "step": 402700 + }, + { + "epoch": 0.1, + "learning_rate": 6.0323232323232326e-05, + "loss": 0.4566, + "step": 402800 + }, + { + "epoch": 0.1, + "learning_rate": 6.0313131313131325e-05, + "loss": 0.458, + "step": 402900 + }, + { + "epoch": 0.1, + "learning_rate": 6.03030303030303e-05, + "loss": 0.4595, + "step": 403000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.45587067246228585, + "eval_average_loss_on_sentence_tokens": 0.382120330212567, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.45259764790534973, + "eval_non_padding_tokens_in_labels": 133.5404, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37785, + "eval_padding_tokens_in_labels": 378.4596, + "eval_reconstruction_accuracy": 0.9157449520588603, + "eval_runtime": 203.7444, + "eval_samples_per_second": 24.541, + "eval_sentence_accuracy": 0.766580831553825, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 403000 + }, + { + "epoch": 0.1, + "learning_rate": 6.02929292929293e-05, + "loss": 0.4538, + "step": 403100 + }, + { + "epoch": 0.1, + "learning_rate": 6.028282828282829e-05, + "loss": 0.4543, + "step": 403200 + }, + { + "epoch": 0.1, + "learning_rate": 6.027272727272728e-05, + "loss": 0.4584, + "step": 403300 + }, + { + "epoch": 0.1, + "learning_rate": 6.0262626262626264e-05, + "loss": 0.459, + "step": 403400 + }, + { + "epoch": 0.1, + "learning_rate": 6.025252525252526e-05, + "loss": 0.4547, + "step": 403500 + }, + { + "epoch": 0.1, + "learning_rate": 6.024242424242424e-05, + "loss": 0.4592, + "step": 403600 + }, + { + "epoch": 0.1, + "learning_rate": 6.023232323232324e-05, + "loss": 0.4541, + "step": 403700 + }, + { + "epoch": 0.1, + "learning_rate": 6.0222222222222225e-05, + "loss": 0.4582, + "step": 403800 + }, + { + "epoch": 0.1, + "learning_rate": 6.021212121212122e-05, + "loss": 0.4537, + "step": 403900 + }, + { + "epoch": 0.1, + "learning_rate": 6.02020202020202e-05, + "loss": 0.4583, + "step": 404000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4552759221824426, + "eval_average_loss_on_sentence_tokens": 0.4054601710037094, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4530371129512787, + "eval_non_padding_tokens_in_labels": 133.49745, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36195, + "eval_padding_tokens_in_labels": 378.50255, + "eval_reconstruction_accuracy": 0.9158907829699032, + "eval_runtime": 215.0997, + "eval_samples_per_second": 23.245, + "eval_sentence_accuracy": 0.7443205268541282, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 404000 + }, + { + "epoch": 0.1, + "learning_rate": 6.01919191919192e-05, + "loss": 0.4551, + "step": 404100 + }, + { + "epoch": 0.1, + "learning_rate": 6.0181818181818187e-05, + "loss": 0.4551, + "step": 404200 + }, + { + "epoch": 0.1, + "learning_rate": 6.017171717171718e-05, + "loss": 0.4535, + "step": 404300 + }, + { + "epoch": 0.1, + "learning_rate": 6.0161616161616164e-05, + "loss": 0.4552, + "step": 404400 + }, + { + "epoch": 0.1, + "learning_rate": 6.0151515151515156e-05, + "loss": 0.4574, + "step": 404500 + }, + { + "epoch": 0.1, + "learning_rate": 6.014141414141414e-05, + "loss": 0.4571, + "step": 404600 + }, + { + "epoch": 0.1, + "learning_rate": 6.013131313131314e-05, + "loss": 0.4572, + "step": 404700 + }, + { + "epoch": 0.1, + "learning_rate": 6.0121212121212125e-05, + "loss": 0.4597, + "step": 404800 + }, + { + "epoch": 0.1, + "learning_rate": 6.011111111111112e-05, + "loss": 0.453, + "step": 404900 + }, + { + "epoch": 0.1, + "learning_rate": 6.01010101010101e-05, + "loss": 0.4561, + "step": 405000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.455496276441134, + "eval_average_loss_on_sentence_tokens": 0.40772002809122504, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.4533007740974426, + "eval_non_padding_tokens_in_labels": 133.50335, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3697, + "eval_padding_tokens_in_labels": 378.49665, + "eval_reconstruction_accuracy": 0.9158480615041064, + "eval_runtime": 187.5907, + "eval_samples_per_second": 26.654, + "eval_sentence_accuracy": 0.7418621135176844, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2496, + "step": 405000 + }, + { + "epoch": 0.1, + "learning_rate": 6.00909090909091e-05, + "loss": 0.4555, + "step": 405100 + }, + { + "epoch": 0.1, + "learning_rate": 6.008080808080808e-05, + "loss": 0.4593, + "step": 405200 + }, + { + "epoch": 0.1, + "learning_rate": 6.007070707070708e-05, + "loss": 0.4549, + "step": 405300 + }, + { + "epoch": 0.1, + "learning_rate": 6.006060606060606e-05, + "loss": 0.4558, + "step": 405400 + }, + { + "epoch": 0.1, + "learning_rate": 6.0050505050505055e-05, + "loss": 0.4564, + "step": 405500 + }, + { + "epoch": 0.1, + "learning_rate": 6.004040404040404e-05, + "loss": 0.4561, + "step": 405600 + }, + { + "epoch": 0.1, + "learning_rate": 6.003030303030304e-05, + "loss": 0.456, + "step": 405700 + }, + { + "epoch": 0.1, + "learning_rate": 6.0020202020202024e-05, + "loss": 0.4572, + "step": 405800 + }, + { + "epoch": 0.1, + "learning_rate": 6.0010101010101016e-05, + "loss": 0.4576, + "step": 405900 + }, + { + "epoch": 0.1, + "learning_rate": 6e-05, + "loss": 0.4548, + "step": 406000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4550928319946064, + "eval_average_loss_on_sentence_tokens": 0.40422296273692504, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4527831971645355, + "eval_non_padding_tokens_in_labels": 133.55705, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3839, + "eval_padding_tokens_in_labels": 378.44295, + "eval_reconstruction_accuracy": 0.9158010208248183, + "eval_runtime": 210.9626, + "eval_samples_per_second": 23.701, + "eval_sentence_accuracy": 0.7464693954456547, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 406000 + }, + { + "epoch": 0.1, + "learning_rate": 5.9989898989898994e-05, + "loss": 0.4584, + "step": 406100 + }, + { + "epoch": 0.1, + "learning_rate": 5.997979797979798e-05, + "loss": 0.4582, + "step": 406200 + }, + { + "epoch": 0.1, + "learning_rate": 5.996969696969698e-05, + "loss": 0.4524, + "step": 406300 + }, + { + "epoch": 0.1, + "learning_rate": 5.995959595959596e-05, + "loss": 0.454, + "step": 406400 + }, + { + "epoch": 0.1, + "learning_rate": 5.9949494949494955e-05, + "loss": 0.4575, + "step": 406500 + }, + { + "epoch": 0.1, + "learning_rate": 5.993939393939394e-05, + "loss": 0.4532, + "step": 406600 + }, + { + "epoch": 0.1, + "learning_rate": 5.992929292929293e-05, + "loss": 0.4569, + "step": 406700 + }, + { + "epoch": 0.1, + "learning_rate": 5.991919191919192e-05, + "loss": 0.4575, + "step": 406800 + }, + { + "epoch": 0.1, + "learning_rate": 5.9909090909090916e-05, + "loss": 0.4606, + "step": 406900 + }, + { + "epoch": 0.1, + "learning_rate": 5.98989898989899e-05, + "loss": 0.4559, + "step": 407000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4549470212221938, + "eval_average_loss_on_sentence_tokens": 0.4093721189180766, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4529101550579071, + "eval_non_padding_tokens_in_labels": 133.5407, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3776, + "eval_padding_tokens_in_labels": 378.4593, + "eval_reconstruction_accuracy": 0.9157033214071999, + "eval_runtime": 211.828, + "eval_samples_per_second": 23.604, + "eval_sentence_accuracy": 0.7458592782672672, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 407000 + }, + { + "epoch": 0.1, + "learning_rate": 5.988888888888889e-05, + "loss": 0.4611, + "step": 407100 + }, + { + "epoch": 0.1, + "learning_rate": 5.987878787878788e-05, + "loss": 0.4534, + "step": 407200 + }, + { + "epoch": 0.1, + "learning_rate": 5.986868686868688e-05, + "loss": 0.4535, + "step": 407300 + }, + { + "epoch": 0.1, + "learning_rate": 5.9858585858585855e-05, + "loss": 0.4543, + "step": 407400 + }, + { + "epoch": 0.1, + "learning_rate": 5.9848484848484854e-05, + "loss": 0.4564, + "step": 407500 + }, + { + "epoch": 0.1, + "learning_rate": 5.983838383838384e-05, + "loss": 0.4577, + "step": 407600 + }, + { + "epoch": 0.1, + "learning_rate": 5.982828282828283e-05, + "loss": 0.4538, + "step": 407700 + }, + { + "epoch": 0.1, + "learning_rate": 5.9818181818181817e-05, + "loss": 0.4557, + "step": 407800 + }, + { + "epoch": 0.1, + "learning_rate": 5.9808080808080815e-05, + "loss": 0.4567, + "step": 407900 + }, + { + "epoch": 0.1, + "learning_rate": 5.97979797979798e-05, + "loss": 0.4582, + "step": 408000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4548835834047901, + "eval_average_loss_on_sentence_tokens": 0.3973962864612415, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.45222654938697815, + "eval_non_padding_tokens_in_labels": 133.5312, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36305, + "eval_padding_tokens_in_labels": 378.4688, + "eval_reconstruction_accuracy": 0.915812204304242, + "eval_runtime": 212.9637, + "eval_samples_per_second": 23.478, + "eval_sentence_accuracy": 0.7568772767240296, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.249975, + "step": 408000 + }, + { + "epoch": 0.1, + "learning_rate": 5.978787878787879e-05, + "loss": 0.4554, + "step": 408100 + }, + { + "epoch": 0.1, + "learning_rate": 5.977777777777778e-05, + "loss": 0.4569, + "step": 408200 + }, + { + "epoch": 0.1, + "learning_rate": 5.976767676767677e-05, + "loss": 0.452, + "step": 408300 + }, + { + "epoch": 0.1, + "learning_rate": 5.9757575757575755e-05, + "loss": 0.4544, + "step": 408400 + }, + { + "epoch": 0.1, + "learning_rate": 5.9747474747474754e-05, + "loss": 0.4591, + "step": 408500 + }, + { + "epoch": 0.1, + "learning_rate": 5.973737373737374e-05, + "loss": 0.4554, + "step": 408600 + }, + { + "epoch": 0.1, + "learning_rate": 5.972727272727273e-05, + "loss": 0.4573, + "step": 408700 + }, + { + "epoch": 0.1, + "learning_rate": 5.9717171717171716e-05, + "loss": 0.4557, + "step": 408800 + }, + { + "epoch": 0.1, + "learning_rate": 5.9707070707070715e-05, + "loss": 0.4558, + "step": 408900 + }, + { + "epoch": 0.1, + "learning_rate": 5.969696969696969e-05, + "loss": 0.453, + "step": 409000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4553308205550892, + "eval_average_loss_on_sentence_tokens": 0.4296903066877892, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.45423829555511475, + "eval_non_padding_tokens_in_labels": 133.5431, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3843, + "eval_padding_tokens_in_labels": 378.4569, + "eval_reconstruction_accuracy": 0.9159240780025336, + "eval_runtime": 210.7258, + "eval_samples_per_second": 23.728, + "eval_sentence_accuracy": 0.7206964308145064, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 409000 + }, + { + "epoch": 0.1, + "learning_rate": 5.968686868686869e-05, + "loss": 0.4535, + "step": 409100 + }, + { + "epoch": 0.1, + "learning_rate": 5.967676767676768e-05, + "loss": 0.4584, + "step": 409200 + }, + { + "epoch": 0.1, + "learning_rate": 5.966666666666667e-05, + "loss": 0.4537, + "step": 409300 + }, + { + "epoch": 0.1, + "learning_rate": 5.9656565656565654e-05, + "loss": 0.4531, + "step": 409400 + }, + { + "epoch": 0.1, + "learning_rate": 5.964646464646465e-05, + "loss": 0.4542, + "step": 409500 + }, + { + "epoch": 0.1, + "learning_rate": 5.963636363636363e-05, + "loss": 0.4548, + "step": 409600 + }, + { + "epoch": 0.1, + "learning_rate": 5.962626262626263e-05, + "loss": 0.4564, + "step": 409700 + }, + { + "epoch": 0.1, + "learning_rate": 5.9616161616161615e-05, + "loss": 0.4585, + "step": 409800 + }, + { + "epoch": 0.1, + "learning_rate": 5.960606060606061e-05, + "loss": 0.4591, + "step": 409900 + }, + { + "epoch": 0.1, + "learning_rate": 5.959595959595959e-05, + "loss": 0.4559, + "step": 410000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4547226335131991, + "eval_average_loss_on_sentence_tokens": 0.350848417758502, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.4500781297683716, + "eval_non_padding_tokens_in_labels": 133.5134, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38, + "eval_padding_tokens_in_labels": 378.4866, + "eval_reconstruction_accuracy": 0.9159882703146774, + "eval_runtime": 188.5634, + "eval_samples_per_second": 26.516, + "eval_sentence_accuracy": 0.7821029303569186, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 410000 + }, + { + "epoch": 0.1, + "learning_rate": 5.958585858585859e-05, + "loss": 0.455, + "step": 410100 + }, + { + "epoch": 0.1, + "learning_rate": 5.957575757575758e-05, + "loss": 0.4552, + "step": 410200 + }, + { + "epoch": 0.1, + "learning_rate": 5.956565656565657e-05, + "loss": 0.4578, + "step": 410300 + }, + { + "epoch": 0.1, + "learning_rate": 5.9555555555555554e-05, + "loss": 0.452, + "step": 410400 + }, + { + "epoch": 0.1, + "learning_rate": 5.9545454545454546e-05, + "loss": 0.4552, + "step": 410500 + }, + { + "epoch": 0.1, + "learning_rate": 5.953535353535353e-05, + "loss": 0.4594, + "step": 410600 + }, + { + "epoch": 0.1, + "learning_rate": 5.952525252525253e-05, + "loss": 0.4563, + "step": 410700 + }, + { + "epoch": 0.1, + "learning_rate": 5.9515151515151515e-05, + "loss": 0.4537, + "step": 410800 + }, + { + "epoch": 0.1, + "learning_rate": 5.950505050505051e-05, + "loss": 0.4557, + "step": 410900 + }, + { + "epoch": 0.1, + "learning_rate": 5.949494949494949e-05, + "loss": 0.4582, + "step": 411000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.4543051759273227, + "eval_average_loss_on_sentence_tokens": 0.3757707570455303, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.45072266459465027, + "eval_non_padding_tokens_in_labels": 133.51765, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.35835, + "eval_padding_tokens_in_labels": 378.48235, + "eval_reconstruction_accuracy": 0.915868036694566, + "eval_runtime": 196.2592, + "eval_samples_per_second": 25.477, + "eval_sentence_accuracy": 0.7676933981732373, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.2496, + "step": 411000 + }, + { + "epoch": 0.11, + "learning_rate": 5.948484848484849e-05, + "loss": 0.4553, + "step": 411100 + }, + { + "epoch": 0.11, + "learning_rate": 5.947474747474748e-05, + "loss": 0.4562, + "step": 411200 + }, + { + "epoch": 0.11, + "learning_rate": 5.946464646464647e-05, + "loss": 0.4554, + "step": 411300 + }, + { + "epoch": 0.11, + "learning_rate": 5.945454545454546e-05, + "loss": 0.4567, + "step": 411400 + }, + { + "epoch": 0.11, + "learning_rate": 5.9444444444444445e-05, + "loss": 0.4554, + "step": 411500 + }, + { + "epoch": 0.11, + "learning_rate": 5.9434343434343444e-05, + "loss": 0.4554, + "step": 411600 + }, + { + "epoch": 0.11, + "learning_rate": 5.942424242424243e-05, + "loss": 0.4562, + "step": 411700 + }, + { + "epoch": 0.11, + "learning_rate": 5.941414141414142e-05, + "loss": 0.456, + "step": 411800 + }, + { + "epoch": 0.11, + "learning_rate": 5.9404040404040406e-05, + "loss": 0.4579, + "step": 411900 + }, + { + "epoch": 0.11, + "learning_rate": 5.93939393939394e-05, + "loss": 0.4549, + "step": 412000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.4547742270552795, + "eval_average_loss_on_sentence_tokens": 0.38308975341185025, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4515136778354645, + "eval_non_padding_tokens_in_labels": 133.5272, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3833, + "eval_padding_tokens_in_labels": 378.4728, + "eval_reconstruction_accuracy": 0.9158756492698746, + "eval_runtime": 183.3025, + "eval_samples_per_second": 27.277, + "eval_sentence_accuracy": 0.7617133525938953, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2496, + "step": 412000 + }, + { + "epoch": 0.11, + "learning_rate": 5.9383838383838384e-05, + "loss": 0.4569, + "step": 412100 + }, + { + "epoch": 0.11, + "learning_rate": 5.937373737373738e-05, + "loss": 0.4557, + "step": 412200 + }, + { + "epoch": 0.11, + "learning_rate": 5.936363636363637e-05, + "loss": 0.4542, + "step": 412300 + }, + { + "epoch": 0.11, + "learning_rate": 5.935353535353536e-05, + "loss": 0.4563, + "step": 412400 + }, + { + "epoch": 0.11, + "learning_rate": 5.9343434343434345e-05, + "loss": 0.4567, + "step": 412500 + }, + { + "epoch": 0.11, + "learning_rate": 5.9333333333333343e-05, + "loss": 0.4542, + "step": 412600 + }, + { + "epoch": 0.11, + "learning_rate": 5.932323232323232e-05, + "loss": 0.458, + "step": 412700 + }, + { + "epoch": 0.11, + "learning_rate": 5.931313131313132e-05, + "loss": 0.4512, + "step": 412800 + }, + { + "epoch": 0.11, + "learning_rate": 5.9303030303030306e-05, + "loss": 0.4555, + "step": 412900 + }, + { + "epoch": 0.11, + "learning_rate": 5.92929292929293e-05, + "loss": 0.4573, + "step": 413000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.4549605404984693, + "eval_average_loss_on_sentence_tokens": 0.42584700426995276, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.45361328125, + "eval_non_padding_tokens_in_labels": 133.49885, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37635, + "eval_padding_tokens_in_labels": 378.50115, + "eval_reconstruction_accuracy": 0.9158659915374909, + "eval_runtime": 196.0791, + "eval_samples_per_second": 25.5, + "eval_sentence_accuracy": 0.7273493997523642, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.246975, + "step": 413000 + }, + { + "epoch": 0.11, + "learning_rate": 5.928282828282828e-05, + "loss": 0.4562, + "step": 413100 + }, + { + "epoch": 0.11, + "learning_rate": 5.927272727272728e-05, + "loss": 0.4544, + "step": 413200 + }, + { + "epoch": 0.11, + "learning_rate": 5.926262626262627e-05, + "loss": 0.4569, + "step": 413300 + }, + { + "epoch": 0.11, + "learning_rate": 5.925252525252526e-05, + "loss": 0.4573, + "step": 413400 + }, + { + "epoch": 0.11, + "learning_rate": 5.9242424242424244e-05, + "loss": 0.4551, + "step": 413500 + }, + { + "epoch": 0.11, + "learning_rate": 5.9232323232323236e-05, + "loss": 0.4534, + "step": 413600 + }, + { + "epoch": 0.11, + "learning_rate": 5.922222222222222e-05, + "loss": 0.4581, + "step": 413700 + }, + { + "epoch": 0.11, + "learning_rate": 5.921212121212122e-05, + "loss": 0.4587, + "step": 413800 + }, + { + "epoch": 0.11, + "learning_rate": 5.9202020202020205e-05, + "loss": 0.4592, + "step": 413900 + }, + { + "epoch": 0.11, + "learning_rate": 5.91919191919192e-05, + "loss": 0.4569, + "step": 414000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.45436516540669597, + "eval_average_loss_on_sentence_tokens": 0.3983396378980492, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.45179688930511475, + "eval_non_padding_tokens_in_labels": 133.5423, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38705, + "eval_padding_tokens_in_labels": 378.4577, + "eval_reconstruction_accuracy": 0.915869342934962, + "eval_runtime": 188.4851, + "eval_samples_per_second": 26.527, + "eval_sentence_accuracy": 0.7549213128286109, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.25, + "step": 414000 + }, + { + "epoch": 0.11, + "learning_rate": 5.918181818181818e-05, + "loss": 0.4608, + "step": 414100 + }, + { + "epoch": 0.11, + "learning_rate": 5.917171717171718e-05, + "loss": 0.4593, + "step": 414200 + }, + { + "epoch": 0.11, + "learning_rate": 5.916161616161616e-05, + "loss": 0.4581, + "step": 414300 + }, + { + "epoch": 0.11, + "learning_rate": 5.915151515151516e-05, + "loss": 0.4539, + "step": 414400 + }, + { + "epoch": 0.11, + "learning_rate": 5.9141414141414144e-05, + "loss": 0.4544, + "step": 414500 + }, + { + "epoch": 0.11, + "learning_rate": 5.9131313131313136e-05, + "loss": 0.4543, + "step": 414600 + }, + { + "epoch": 0.11, + "learning_rate": 5.912121212121212e-05, + "loss": 0.4551, + "step": 414700 + }, + { + "epoch": 0.11, + "learning_rate": 5.911111111111112e-05, + "loss": 0.4556, + "step": 414800 + }, + { + "epoch": 0.11, + "learning_rate": 5.91010101010101e-05, + "loss": 0.4573, + "step": 414900 + }, + { + "epoch": 0.11, + "learning_rate": 5.90909090909091e-05, + "loss": 0.456, + "step": 415000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.45410174917348295, + "eval_average_loss_on_sentence_tokens": 0.4149160036120381, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.45225584506988525, + "eval_non_padding_tokens_in_labels": 133.53425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38025, + "eval_padding_tokens_in_labels": 378.46575, + "eval_reconstruction_accuracy": 0.9159499005304477, + "eval_runtime": 206.8227, + "eval_samples_per_second": 24.175, + "eval_sentence_accuracy": 0.7390492938790891, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 415000 + }, + { + "epoch": 0.11, + "learning_rate": 5.908080808080808e-05, + "loss": 0.4555, + "step": 415100 + }, + { + "epoch": 0.11, + "learning_rate": 5.9070707070707074e-05, + "loss": 0.4533, + "step": 415200 + }, + { + "epoch": 0.11, + "learning_rate": 5.906060606060606e-05, + "loss": 0.4554, + "step": 415300 + }, + { + "epoch": 0.11, + "learning_rate": 5.905050505050506e-05, + "loss": 0.4564, + "step": 415400 + }, + { + "epoch": 0.11, + "learning_rate": 5.904040404040404e-05, + "loss": 0.4559, + "step": 415500 + }, + { + "epoch": 0.11, + "learning_rate": 5.9030303030303035e-05, + "loss": 0.4554, + "step": 415600 + }, + { + "epoch": 0.11, + "learning_rate": 5.902020202020202e-05, + "loss": 0.4546, + "step": 415700 + }, + { + "epoch": 0.11, + "learning_rate": 5.901010101010101e-05, + "loss": 0.46, + "step": 415800 + }, + { + "epoch": 0.11, + "learning_rate": 5.9e-05, + "loss": 0.4558, + "step": 415900 + }, + { + "epoch": 0.11, + "learning_rate": 5.8989898989898996e-05, + "loss": 0.4576, + "step": 416000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.455056218846855, + "eval_average_loss_on_sentence_tokens": 0.3894886972315643, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.45212891697883606, + "eval_non_padding_tokens_in_labels": 133.5347, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37, + "eval_padding_tokens_in_labels": 378.4653, + "eval_reconstruction_accuracy": 0.9158122735897448, + "eval_runtime": 195.1895, + "eval_samples_per_second": 25.616, + "eval_sentence_accuracy": 0.7554775961383171, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2499, + "step": 416000 + }, + { + "epoch": 0.11, + "learning_rate": 5.897979797979798e-05, + "loss": 0.4548, + "step": 416100 + }, + { + "epoch": 0.11, + "learning_rate": 5.8969696969696973e-05, + "loss": 0.4537, + "step": 416200 + }, + { + "epoch": 0.11, + "learning_rate": 5.895959595959596e-05, + "loss": 0.4553, + "step": 416300 + }, + { + "epoch": 0.11, + "learning_rate": 5.894949494949496e-05, + "loss": 0.4543, + "step": 416400 + }, + { + "epoch": 0.11, + "learning_rate": 5.8939393939393936e-05, + "loss": 0.456, + "step": 416500 + }, + { + "epoch": 0.11, + "learning_rate": 5.8929292929292935e-05, + "loss": 0.457, + "step": 416600 + }, + { + "epoch": 0.11, + "learning_rate": 5.891919191919192e-05, + "loss": 0.4562, + "step": 416700 + }, + { + "epoch": 0.11, + "learning_rate": 5.890909090909091e-05, + "loss": 0.4587, + "step": 416800 + }, + { + "epoch": 0.11, + "learning_rate": 5.88989898989899e-05, + "loss": 0.455, + "step": 416900 + }, + { + "epoch": 0.11, + "learning_rate": 5.8888888888888896e-05, + "loss": 0.4529, + "step": 417000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.45382608811268343, + "eval_average_loss_on_sentence_tokens": 0.4026899701557786, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.45155274868011475, + "eval_non_padding_tokens_in_labels": 133.5332, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3946, + "eval_padding_tokens_in_labels": 378.4668, + "eval_reconstruction_accuracy": 0.915958927536365, + "eval_runtime": 180.4064, + "eval_samples_per_second": 27.715, + "eval_sentence_accuracy": 0.7494706336246344, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2499, + "step": 417000 + }, + { + "epoch": 0.11, + "learning_rate": 5.887878787878788e-05, + "loss": 0.453, + "step": 417100 + }, + { + "epoch": 0.11, + "learning_rate": 5.886868686868687e-05, + "loss": 0.4519, + "step": 417200 + }, + { + "epoch": 0.11, + "learning_rate": 5.885858585858586e-05, + "loss": 0.4557, + "step": 417300 + }, + { + "epoch": 0.11, + "learning_rate": 5.884848484848485e-05, + "loss": 0.4539, + "step": 417400 + }, + { + "epoch": 0.11, + "learning_rate": 5.8838383838383835e-05, + "loss": 0.4528, + "step": 417500 + }, + { + "epoch": 0.11, + "learning_rate": 5.8828282828282834e-05, + "loss": 0.4545, + "step": 417600 + }, + { + "epoch": 0.11, + "learning_rate": 5.881818181818182e-05, + "loss": 0.4536, + "step": 417700 + }, + { + "epoch": 0.11, + "learning_rate": 5.880808080808081e-05, + "loss": 0.4556, + "step": 417800 + }, + { + "epoch": 0.11, + "learning_rate": 5.8797979797979796e-05, + "loss": 0.4524, + "step": 417900 + }, + { + "epoch": 0.11, + "learning_rate": 5.878787878787879e-05, + "loss": 0.4534, + "step": 418000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.4543180956508995, + "eval_average_loss_on_sentence_tokens": 0.3916806865794064, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4514453113079071, + "eval_non_padding_tokens_in_labels": 133.5305, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3797, + "eval_padding_tokens_in_labels": 378.4695, + "eval_reconstruction_accuracy": 0.9160118486707783, + "eval_runtime": 195.0934, + "eval_samples_per_second": 25.629, + "eval_sentence_accuracy": 0.7529608627774688, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.25, + "step": 418000 + }, + { + "epoch": 0.11, + "learning_rate": 5.8777777777777774e-05, + "loss": 0.4541, + "step": 418100 + }, + { + "epoch": 0.11, + "learning_rate": 5.876767676767677e-05, + "loss": 0.4556, + "step": 418200 + }, + { + "epoch": 0.11, + "learning_rate": 5.875757575757576e-05, + "loss": 0.4566, + "step": 418300 + }, + { + "epoch": 0.11, + "learning_rate": 5.874747474747475e-05, + "loss": 0.4535, + "step": 418400 + }, + { + "epoch": 0.11, + "learning_rate": 5.8737373737373735e-05, + "loss": 0.4557, + "step": 418500 + }, + { + "epoch": 0.11, + "learning_rate": 5.8727272727272734e-05, + "loss": 0.4545, + "step": 418600 + }, + { + "epoch": 0.11, + "learning_rate": 5.871717171717171e-05, + "loss": 0.4543, + "step": 418700 + }, + { + "epoch": 0.11, + "learning_rate": 5.870707070707071e-05, + "loss": 0.4552, + "step": 418800 + }, + { + "epoch": 0.11, + "learning_rate": 5.8696969696969696e-05, + "loss": 0.452, + "step": 418900 + }, + { + "epoch": 0.11, + "learning_rate": 5.868686868686869e-05, + "loss": 0.4576, + "step": 419000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.45419395247403765, + "eval_average_loss_on_sentence_tokens": 0.38956704778105294, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4513476490974426, + "eval_non_padding_tokens_in_labels": 133.5161, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3796, + "eval_padding_tokens_in_labels": 378.4839, + "eval_reconstruction_accuracy": 0.9161122995180937, + "eval_runtime": 183.4367, + "eval_samples_per_second": 27.257, + "eval_sentence_accuracy": 0.7549661743858452, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.25, + "step": 419000 + }, + { + "epoch": 0.11, + "learning_rate": 5.867676767676767e-05, + "loss": 0.4581, + "step": 419100 + }, + { + "epoch": 0.11, + "learning_rate": 5.866666666666667e-05, + "loss": 0.4544, + "step": 419200 + }, + { + "epoch": 0.11, + "learning_rate": 5.865656565656566e-05, + "loss": 0.4547, + "step": 419300 + }, + { + "epoch": 0.11, + "learning_rate": 5.864646464646465e-05, + "loss": 0.4601, + "step": 419400 + }, + { + "epoch": 0.11, + "learning_rate": 5.8636363636363634e-05, + "loss": 0.4538, + "step": 419500 + }, + { + "epoch": 0.11, + "learning_rate": 5.8626262626262626e-05, + "loss": 0.4569, + "step": 419600 + }, + { + "epoch": 0.11, + "learning_rate": 5.8616161616161625e-05, + "loss": 0.4559, + "step": 419700 + }, + { + "epoch": 0.11, + "learning_rate": 5.860606060606061e-05, + "loss": 0.4536, + "step": 419800 + }, + { + "epoch": 0.11, + "learning_rate": 5.85959595959596e-05, + "loss": 0.4507, + "step": 419900 + }, + { + "epoch": 0.11, + "learning_rate": 5.858585858585859e-05, + "loss": 0.4504, + "step": 420000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.4540484540330615, + "eval_average_loss_on_sentence_tokens": 0.3911749417291036, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.45118165016174316, + "eval_non_padding_tokens_in_labels": 133.50175, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3758, + "eval_padding_tokens_in_labels": 378.49825, + "eval_reconstruction_accuracy": 0.916017671011341, + "eval_runtime": 195.8832, + "eval_samples_per_second": 25.525, + "eval_sentence_accuracy": 0.7523058840418468, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.25, + "step": 420000 + }, + { + "epoch": 0.11, + "learning_rate": 5.8575757575757586e-05, + "loss": 0.4533, + "step": 420100 + }, + { + "epoch": 0.11, + "learning_rate": 5.856565656565657e-05, + "loss": 0.455, + "step": 420200 + }, + { + "epoch": 0.11, + "learning_rate": 5.855555555555556e-05, + "loss": 0.4523, + "step": 420300 + }, + { + "epoch": 0.11, + "learning_rate": 5.854545454545455e-05, + "loss": 0.4547, + "step": 420400 + }, + { + "epoch": 0.11, + "learning_rate": 5.853535353535354e-05, + "loss": 0.4574, + "step": 420500 + }, + { + "epoch": 0.11, + "learning_rate": 5.8525252525252526e-05, + "loss": 0.4538, + "step": 420600 + }, + { + "epoch": 0.11, + "learning_rate": 5.8515151515151525e-05, + "loss": 0.4547, + "step": 420700 + }, + { + "epoch": 0.11, + "learning_rate": 5.850505050505051e-05, + "loss": 0.4533, + "step": 420800 + }, + { + "epoch": 0.11, + "learning_rate": 5.84949494949495e-05, + "loss": 0.4563, + "step": 420900 + }, + { + "epoch": 0.12, + "learning_rate": 5.848484848484849e-05, + "loss": 0.4506, + "step": 421000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4540985467640468, + "eval_average_loss_on_sentence_tokens": 0.3325605135413556, + "eval_average_shuffling_prob": 0.41, + "eval_loss": 0.4486035108566284, + "eval_non_padding_tokens_in_labels": 133.5706, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3759, + "eval_padding_tokens_in_labels": 378.4294, + "eval_reconstruction_accuracy": 0.9158847350930172, + "eval_runtime": 222.6512, + "eval_samples_per_second": 22.457, + "eval_sentence_accuracy": 0.7996482853912825, + "eval_steps_per_second": 0.058, + "eval_variance_shuffling_prob": 0.2419, + "step": 421000 + }, + { + "epoch": 0.12, + "learning_rate": 5.847474747474748e-05, + "loss": 0.4545, + "step": 421100 + }, + { + "epoch": 0.12, + "learning_rate": 5.8464646464646464e-05, + "loss": 0.4531, + "step": 421200 + }, + { + "epoch": 0.12, + "learning_rate": 5.845454545454546e-05, + "loss": 0.4535, + "step": 421300 + }, + { + "epoch": 0.12, + "learning_rate": 5.844444444444445e-05, + "loss": 0.4548, + "step": 421400 + }, + { + "epoch": 0.12, + "learning_rate": 5.843434343434344e-05, + "loss": 0.4577, + "step": 421500 + }, + { + "epoch": 0.12, + "learning_rate": 5.8424242424242425e-05, + "loss": 0.4536, + "step": 421600 + }, + { + "epoch": 0.12, + "learning_rate": 5.8414141414141424e-05, + "loss": 0.4572, + "step": 421700 + }, + { + "epoch": 0.12, + "learning_rate": 5.84040404040404e-05, + "loss": 0.457, + "step": 421800 + }, + { + "epoch": 0.12, + "learning_rate": 5.83939393939394e-05, + "loss": 0.456, + "step": 421900 + }, + { + "epoch": 0.12, + "learning_rate": 5.8383838383838386e-05, + "loss": 0.4516, + "step": 422000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.45367415163107666, + "eval_average_loss_on_sentence_tokens": 0.4109063626732978, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.45169922709465027, + "eval_non_padding_tokens_in_labels": 133.5432, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37345, + "eval_padding_tokens_in_labels": 378.4568, + "eval_reconstruction_accuracy": 0.9159877787191016, + "eval_runtime": 213.5284, + "eval_samples_per_second": 23.416, + "eval_sentence_accuracy": 0.7419563227878766, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 422000 + }, + { + "epoch": 0.12, + "learning_rate": 5.837373737373738e-05, + "loss": 0.4528, + "step": 422100 + }, + { + "epoch": 0.12, + "learning_rate": 5.8363636363636364e-05, + "loss": 0.4513, + "step": 422200 + }, + { + "epoch": 0.12, + "learning_rate": 5.835353535353536e-05, + "loss": 0.4523, + "step": 422300 + }, + { + "epoch": 0.12, + "learning_rate": 5.834343434343435e-05, + "loss": 0.4487, + "step": 422400 + }, + { + "epoch": 0.12, + "learning_rate": 5.833333333333334e-05, + "loss": 0.4567, + "step": 422500 + }, + { + "epoch": 0.12, + "learning_rate": 5.8323232323232325e-05, + "loss": 0.4521, + "step": 422600 + }, + { + "epoch": 0.12, + "learning_rate": 5.831313131313132e-05, + "loss": 0.4546, + "step": 422700 + }, + { + "epoch": 0.12, + "learning_rate": 5.83030303030303e-05, + "loss": 0.4498, + "step": 422800 + }, + { + "epoch": 0.12, + "learning_rate": 5.82929292929293e-05, + "loss": 0.4568, + "step": 422900 + }, + { + "epoch": 0.12, + "learning_rate": 5.8282828282828286e-05, + "loss": 0.4555, + "step": 423000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.45358071521028714, + "eval_average_loss_on_sentence_tokens": 0.42407371863543303, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4522753953933716, + "eval_non_padding_tokens_in_labels": 133.511, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3695, + "eval_padding_tokens_in_labels": 378.489, + "eval_reconstruction_accuracy": 0.9159187616293917, + "eval_runtime": 191.5892, + "eval_samples_per_second": 26.098, + "eval_sentence_accuracy": 0.7358641233154485, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.248775, + "step": 423000 + }, + { + "epoch": 0.12, + "learning_rate": 5.827272727272728e-05, + "loss": 0.4586, + "step": 423100 + }, + { + "epoch": 0.12, + "learning_rate": 5.826262626262626e-05, + "loss": 0.4535, + "step": 423200 + }, + { + "epoch": 0.12, + "learning_rate": 5.825252525252526e-05, + "loss": 0.4543, + "step": 423300 + }, + { + "epoch": 0.12, + "learning_rate": 5.824242424242424e-05, + "loss": 0.456, + "step": 423400 + }, + { + "epoch": 0.12, + "learning_rate": 5.823232323232324e-05, + "loss": 0.4531, + "step": 423500 + }, + { + "epoch": 0.12, + "learning_rate": 5.8222222222222224e-05, + "loss": 0.4551, + "step": 423600 + }, + { + "epoch": 0.12, + "learning_rate": 5.8212121212121216e-05, + "loss": 0.4546, + "step": 423700 + }, + { + "epoch": 0.12, + "learning_rate": 5.82020202020202e-05, + "loss": 0.456, + "step": 423800 + }, + { + "epoch": 0.12, + "learning_rate": 5.81919191919192e-05, + "loss": 0.4546, + "step": 423900 + }, + { + "epoch": 0.12, + "learning_rate": 5.818181818181818e-05, + "loss": 0.4516, + "step": 424000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4533080173946252, + "eval_average_loss_on_sentence_tokens": 0.3817647262512282, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.45002928376197815, + "eval_non_padding_tokens_in_labels": 133.56065, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37775, + "eval_padding_tokens_in_labels": 378.43935, + "eval_reconstruction_accuracy": 0.9160422819540364, + "eval_runtime": 194.538, + "eval_samples_per_second": 25.702, + "eval_sentence_accuracy": 0.761874854199939, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 424000 + }, + { + "epoch": 0.12, + "learning_rate": 5.817171717171718e-05, + "loss": 0.4574, + "step": 424100 + }, + { + "epoch": 0.12, + "learning_rate": 5.816161616161616e-05, + "loss": 0.457, + "step": 424200 + }, + { + "epoch": 0.12, + "learning_rate": 5.8151515151515155e-05, + "loss": 0.4534, + "step": 424300 + }, + { + "epoch": 0.12, + "learning_rate": 5.814141414141414e-05, + "loss": 0.4541, + "step": 424400 + }, + { + "epoch": 0.12, + "learning_rate": 5.813131313131314e-05, + "loss": 0.4533, + "step": 424500 + }, + { + "epoch": 0.12, + "learning_rate": 5.8121212121212124e-05, + "loss": 0.4545, + "step": 424600 + }, + { + "epoch": 0.12, + "learning_rate": 5.8111111111111116e-05, + "loss": 0.451, + "step": 424700 + }, + { + "epoch": 0.12, + "learning_rate": 5.81010101010101e-05, + "loss": 0.4581, + "step": 424800 + }, + { + "epoch": 0.12, + "learning_rate": 5.809090909090909e-05, + "loss": 0.4573, + "step": 424900 + }, + { + "epoch": 0.12, + "learning_rate": 5.808080808080808e-05, + "loss": 0.4537, + "step": 425000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4538369270219254, + "eval_average_loss_on_sentence_tokens": 0.38655070379556167, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.4507519602775574, + "eval_non_padding_tokens_in_labels": 133.5591, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3704, + "eval_padding_tokens_in_labels": 378.4409, + "eval_reconstruction_accuracy": 0.9159895454277719, + "eval_runtime": 189.1391, + "eval_samples_per_second": 26.436, + "eval_sentence_accuracy": 0.7595824286252625, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2499, + "step": 425000 + }, + { + "epoch": 0.12, + "learning_rate": 5.807070707070708e-05, + "loss": 0.4526, + "step": 425100 + }, + { + "epoch": 0.12, + "learning_rate": 5.806060606060606e-05, + "loss": 0.4527, + "step": 425200 + }, + { + "epoch": 0.12, + "learning_rate": 5.8050505050505054e-05, + "loss": 0.4533, + "step": 425300 + }, + { + "epoch": 0.12, + "learning_rate": 5.804040404040404e-05, + "loss": 0.4523, + "step": 425400 + }, + { + "epoch": 0.12, + "learning_rate": 5.803030303030304e-05, + "loss": 0.4529, + "step": 425500 + }, + { + "epoch": 0.12, + "learning_rate": 5.8020202020202016e-05, + "loss": 0.4537, + "step": 425600 + }, + { + "epoch": 0.12, + "learning_rate": 5.8010101010101015e-05, + "loss": 0.4546, + "step": 425700 + }, + { + "epoch": 0.12, + "learning_rate": 5.8e-05, + "loss": 0.4534, + "step": 425800 + }, + { + "epoch": 0.12, + "learning_rate": 5.798989898989899e-05, + "loss": 0.4532, + "step": 425900 + }, + { + "epoch": 0.12, + "learning_rate": 5.797979797979798e-05, + "loss": 0.4534, + "step": 426000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.453601628479199, + "eval_average_loss_on_sentence_tokens": 0.40741443727735865, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4515331983566284, + "eval_non_padding_tokens_in_labels": 133.5408, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39005, + "eval_padding_tokens_in_labels": 378.4592, + "eval_reconstruction_accuracy": 0.9159300578717939, + "eval_runtime": 208.9364, + "eval_samples_per_second": 23.931, + "eval_sentence_accuracy": 0.7476582267123656, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 426000 + }, + { + "epoch": 0.12, + "learning_rate": 5.7969696969696976e-05, + "loss": 0.4524, + "step": 426100 + }, + { + "epoch": 0.12, + "learning_rate": 5.795959595959596e-05, + "loss": 0.4558, + "step": 426200 + }, + { + "epoch": 0.12, + "learning_rate": 5.7949494949494953e-05, + "loss": 0.4574, + "step": 426300 + }, + { + "epoch": 0.12, + "learning_rate": 5.793939393939394e-05, + "loss": 0.4502, + "step": 426400 + }, + { + "epoch": 0.12, + "learning_rate": 5.792929292929293e-05, + "loss": 0.4479, + "step": 426500 + }, + { + "epoch": 0.12, + "learning_rate": 5.7919191919191916e-05, + "loss": 0.4546, + "step": 426600 + }, + { + "epoch": 0.12, + "learning_rate": 5.7909090909090915e-05, + "loss": 0.4551, + "step": 426700 + }, + { + "epoch": 0.12, + "learning_rate": 5.78989898989899e-05, + "loss": 0.4538, + "step": 426800 + }, + { + "epoch": 0.12, + "learning_rate": 5.788888888888889e-05, + "loss": 0.4566, + "step": 426900 + }, + { + "epoch": 0.12, + "learning_rate": 5.787878787878788e-05, + "loss": 0.4546, + "step": 427000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.45312140435683795, + "eval_average_loss_on_sentence_tokens": 0.4096554929382383, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4511425793170929, + "eval_non_padding_tokens_in_labels": 133.53005, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3772, + "eval_padding_tokens_in_labels": 378.46995, + "eval_reconstruction_accuracy": 0.9160533856943303, + "eval_runtime": 195.1906, + "eval_samples_per_second": 25.616, + "eval_sentence_accuracy": 0.7447826008936422, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 427000 + }, + { + "epoch": 0.12, + "learning_rate": 5.786868686868687e-05, + "loss": 0.4546, + "step": 427100 + }, + { + "epoch": 0.12, + "learning_rate": 5.7858585858585854e-05, + "loss": 0.4507, + "step": 427200 + }, + { + "epoch": 0.12, + "learning_rate": 5.784848484848485e-05, + "loss": 0.4505, + "step": 427300 + }, + { + "epoch": 0.12, + "learning_rate": 5.783838383838384e-05, + "loss": 0.4531, + "step": 427400 + }, + { + "epoch": 0.12, + "learning_rate": 5.782828282828283e-05, + "loss": 0.4529, + "step": 427500 + }, + { + "epoch": 0.12, + "learning_rate": 5.7818181818181815e-05, + "loss": 0.4561, + "step": 427600 + }, + { + "epoch": 0.12, + "learning_rate": 5.7808080808080814e-05, + "loss": 0.454, + "step": 427700 + }, + { + "epoch": 0.12, + "learning_rate": 5.779797979797979e-05, + "loss": 0.4535, + "step": 427800 + }, + { + "epoch": 0.12, + "learning_rate": 5.778787878787879e-05, + "loss": 0.4563, + "step": 427900 + }, + { + "epoch": 0.12, + "learning_rate": 5.7777777777777776e-05, + "loss": 0.4529, + "step": 428000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4535422251359503, + "eval_average_loss_on_sentence_tokens": 0.35933320402280194, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.44917967915534973, + "eval_non_padding_tokens_in_labels": 133.5597, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37485, + "eval_padding_tokens_in_labels": 378.4403, + "eval_reconstruction_accuracy": 0.9160439021994228, + "eval_runtime": 198.4314, + "eval_samples_per_second": 25.198, + "eval_sentence_accuracy": 0.7744854379385218, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.248775, + "step": 428000 + }, + { + "epoch": 0.12, + "learning_rate": 5.776767676767677e-05, + "loss": 0.4508, + "step": 428100 + }, + { + "epoch": 0.12, + "learning_rate": 5.775757575757577e-05, + "loss": 0.4516, + "step": 428200 + }, + { + "epoch": 0.12, + "learning_rate": 5.774747474747475e-05, + "loss": 0.4545, + "step": 428300 + }, + { + "epoch": 0.12, + "learning_rate": 5.7737373737373744e-05, + "loss": 0.4568, + "step": 428400 + }, + { + "epoch": 0.12, + "learning_rate": 5.772727272727273e-05, + "loss": 0.4525, + "step": 428500 + }, + { + "epoch": 0.12, + "learning_rate": 5.771717171717173e-05, + "loss": 0.4538, + "step": 428600 + }, + { + "epoch": 0.12, + "learning_rate": 5.770707070707071e-05, + "loss": 0.4589, + "step": 428700 + }, + { + "epoch": 0.12, + "learning_rate": 5.7696969696969706e-05, + "loss": 0.4535, + "step": 428800 + }, + { + "epoch": 0.12, + "learning_rate": 5.768686868686869e-05, + "loss": 0.4587, + "step": 428900 + }, + { + "epoch": 0.12, + "learning_rate": 5.767676767676768e-05, + "loss": 0.4543, + "step": 429000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.45224910207126523, + "eval_average_loss_on_sentence_tokens": 0.3782856507511914, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4488476514816284, + "eval_non_padding_tokens_in_labels": 133.5223, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37495, + "eval_padding_tokens_in_labels": 378.4777, + "eval_reconstruction_accuracy": 0.9162197778501018, + "eval_runtime": 196.8279, + "eval_samples_per_second": 25.403, + "eval_sentence_accuracy": 0.7690392448902686, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.248775, + "step": 429000 + }, + { + "epoch": 0.12, + "learning_rate": 5.766666666666667e-05, + "loss": 0.4538, + "step": 429100 + }, + { + "epoch": 0.12, + "learning_rate": 5.765656565656567e-05, + "loss": 0.453, + "step": 429200 + }, + { + "epoch": 0.12, + "learning_rate": 5.7646464646464645e-05, + "loss": 0.4559, + "step": 429300 + }, + { + "epoch": 0.12, + "learning_rate": 5.7636363636363644e-05, + "loss": 0.4529, + "step": 429400 + }, + { + "epoch": 0.12, + "learning_rate": 5.762626262626263e-05, + "loss": 0.4528, + "step": 429500 + }, + { + "epoch": 0.12, + "learning_rate": 5.761616161616162e-05, + "loss": 0.4527, + "step": 429600 + }, + { + "epoch": 0.12, + "learning_rate": 5.7606060606060606e-05, + "loss": 0.4499, + "step": 429700 + }, + { + "epoch": 0.12, + "learning_rate": 5.7595959595959605e-05, + "loss": 0.4542, + "step": 429800 + }, + { + "epoch": 0.12, + "learning_rate": 5.758585858585859e-05, + "loss": 0.4559, + "step": 429900 + }, + { + "epoch": 0.12, + "learning_rate": 5.757575757575758e-05, + "loss": 0.4524, + "step": 430000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.4528948628569921, + "eval_average_loss_on_sentence_tokens": 0.361270630169716, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.44883787631988525, + "eval_non_padding_tokens_in_labels": 133.54915, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39105, + "eval_padding_tokens_in_labels": 378.45085, + "eval_reconstruction_accuracy": 0.9161057986050378, + "eval_runtime": 210.8706, + "eval_samples_per_second": 23.711, + "eval_sentence_accuracy": 0.7795189046602186, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.24697499999999992, + "step": 430000 + }, + { + "epoch": 0.12, + "learning_rate": 5.756565656565657e-05, + "loss": 0.4511, + "step": 430100 + }, + { + "epoch": 0.12, + "learning_rate": 5.755555555555556e-05, + "loss": 0.4526, + "step": 430200 + }, + { + "epoch": 0.12, + "learning_rate": 5.7545454545454545e-05, + "loss": 0.4552, + "step": 430300 + }, + { + "epoch": 0.12, + "learning_rate": 5.753535353535354e-05, + "loss": 0.4532, + "step": 430400 + }, + { + "epoch": 0.12, + "learning_rate": 5.752525252525253e-05, + "loss": 0.4518, + "step": 430500 + }, + { + "epoch": 0.12, + "learning_rate": 5.751515151515152e-05, + "loss": 0.4534, + "step": 430600 + }, + { + "epoch": 0.12, + "learning_rate": 5.7505050505050506e-05, + "loss": 0.4522, + "step": 430700 + }, + { + "epoch": 0.12, + "learning_rate": 5.7494949494949505e-05, + "loss": 0.455, + "step": 430800 + }, + { + "epoch": 0.12, + "learning_rate": 5.748484848484848e-05, + "loss": 0.4554, + "step": 430900 + }, + { + "epoch": 0.12, + "learning_rate": 5.747474747474748e-05, + "loss": 0.4501, + "step": 431000 + }, + { + "epoch": 0.12, + "eval_average_loss_on_non_sentence_tokens": 0.45225705462208726, + "eval_average_loss_on_sentence_tokens": 0.3659595571844715, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.44844725728034973, + "eval_non_padding_tokens_in_labels": 133.54935, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37295, + "eval_padding_tokens_in_labels": 378.45065, + "eval_reconstruction_accuracy": 0.9161865393022571, + "eval_runtime": 175.7433, + "eval_samples_per_second": 28.451, + "eval_sentence_accuracy": 0.7787248550971702, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.24797499999999992, + "step": 431000 + }, + { + "epoch": 0.13, + "learning_rate": 5.746464646464647e-05, + "loss": 0.4526, + "step": 431100 + }, + { + "epoch": 0.13, + "learning_rate": 5.745454545454546e-05, + "loss": 0.4527, + "step": 431200 + }, + { + "epoch": 0.13, + "learning_rate": 5.7444444444444444e-05, + "loss": 0.4544, + "step": 431300 + }, + { + "epoch": 0.13, + "learning_rate": 5.743434343434344e-05, + "loss": 0.4558, + "step": 431400 + }, + { + "epoch": 0.13, + "learning_rate": 5.742424242424243e-05, + "loss": 0.4503, + "step": 431500 + }, + { + "epoch": 0.13, + "learning_rate": 5.741414141414142e-05, + "loss": 0.4487, + "step": 431600 + }, + { + "epoch": 0.13, + "learning_rate": 5.7404040404040405e-05, + "loss": 0.4548, + "step": 431700 + }, + { + "epoch": 0.13, + "learning_rate": 5.73939393939394e-05, + "loss": 0.4545, + "step": 431800 + }, + { + "epoch": 0.13, + "learning_rate": 5.738383838383838e-05, + "loss": 0.4528, + "step": 431900 + }, + { + "epoch": 0.13, + "learning_rate": 5.737373737373738e-05, + "loss": 0.4589, + "step": 432000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4529478580966532, + "eval_average_loss_on_sentence_tokens": 0.3990669081906048, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.45040038228034973, + "eval_non_padding_tokens_in_labels": 133.52515, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38855, + "eval_padding_tokens_in_labels": 378.47485, + "eval_reconstruction_accuracy": 0.9162109412047764, + "eval_runtime": 196.2169, + "eval_samples_per_second": 25.482, + "eval_sentence_accuracy": 0.7432573079476735, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 432000 + }, + { + "epoch": 0.13, + "learning_rate": 5.7363636363636366e-05, + "loss": 0.4557, + "step": 432100 + }, + { + "epoch": 0.13, + "learning_rate": 5.735353535353536e-05, + "loss": 0.4529, + "step": 432200 + }, + { + "epoch": 0.13, + "learning_rate": 5.7343434343434344e-05, + "loss": 0.4557, + "step": 432300 + }, + { + "epoch": 0.13, + "learning_rate": 5.7333333333333336e-05, + "loss": 0.45, + "step": 432400 + }, + { + "epoch": 0.13, + "learning_rate": 5.732323232323232e-05, + "loss": 0.4502, + "step": 432500 + }, + { + "epoch": 0.13, + "learning_rate": 5.731313131313132e-05, + "loss": 0.4552, + "step": 432600 + }, + { + "epoch": 0.13, + "learning_rate": 5.7303030303030305e-05, + "loss": 0.4574, + "step": 432700 + }, + { + "epoch": 0.13, + "learning_rate": 5.72929292929293e-05, + "loss": 0.4492, + "step": 432800 + }, + { + "epoch": 0.13, + "learning_rate": 5.728282828282828e-05, + "loss": 0.4526, + "step": 432900 + }, + { + "epoch": 0.13, + "learning_rate": 5.727272727272728e-05, + "loss": 0.4525, + "step": 433000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.45328687270869567, + "eval_average_loss_on_sentence_tokens": 0.38217981170438853, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.44999998807907104, + "eval_non_padding_tokens_in_labels": 133.54105, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38445, + "eval_padding_tokens_in_labels": 378.45895, + "eval_reconstruction_accuracy": 0.9161372212343634, + "eval_runtime": 172.6663, + "eval_samples_per_second": 28.958, + "eval_sentence_accuracy": 0.7592011053887703, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 433000 + }, + { + "epoch": 0.13, + "learning_rate": 5.726262626262626e-05, + "loss": 0.451, + "step": 433100 + }, + { + "epoch": 0.13, + "learning_rate": 5.725252525252526e-05, + "loss": 0.4546, + "step": 433200 + }, + { + "epoch": 0.13, + "learning_rate": 5.724242424242424e-05, + "loss": 0.4525, + "step": 433300 + }, + { + "epoch": 0.13, + "learning_rate": 5.7232323232323235e-05, + "loss": 0.4504, + "step": 433400 + }, + { + "epoch": 0.13, + "learning_rate": 5.722222222222222e-05, + "loss": 0.4575, + "step": 433500 + }, + { + "epoch": 0.13, + "learning_rate": 5.721212121212122e-05, + "loss": 0.4562, + "step": 433600 + }, + { + "epoch": 0.13, + "learning_rate": 5.7202020202020204e-05, + "loss": 0.455, + "step": 433700 + }, + { + "epoch": 0.13, + "learning_rate": 5.7191919191919196e-05, + "loss": 0.4563, + "step": 433800 + }, + { + "epoch": 0.13, + "learning_rate": 5.718181818181818e-05, + "loss": 0.4535, + "step": 433900 + }, + { + "epoch": 0.13, + "learning_rate": 5.717171717171717e-05, + "loss": 0.4522, + "step": 434000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4523972094161137, + "eval_average_loss_on_sentence_tokens": 0.4252698748663611, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.45116209983825684, + "eval_non_padding_tokens_in_labels": 133.53455, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.378, + "eval_padding_tokens_in_labels": 378.46545, + "eval_reconstruction_accuracy": 0.9161250163478463, + "eval_runtime": 190.4532, + "eval_samples_per_second": 26.253, + "eval_sentence_accuracy": 0.730915893552497, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 434000 + }, + { + "epoch": 0.13, + "learning_rate": 5.716161616161616e-05, + "loss": 0.4524, + "step": 434100 + }, + { + "epoch": 0.13, + "learning_rate": 5.715151515151516e-05, + "loss": 0.4509, + "step": 434200 + }, + { + "epoch": 0.13, + "learning_rate": 5.714141414141414e-05, + "loss": 0.4512, + "step": 434300 + }, + { + "epoch": 0.13, + "learning_rate": 5.7131313131313134e-05, + "loss": 0.4518, + "step": 434400 + }, + { + "epoch": 0.13, + "learning_rate": 5.712121212121212e-05, + "loss": 0.4563, + "step": 434500 + }, + { + "epoch": 0.13, + "learning_rate": 5.711111111111112e-05, + "loss": 0.4555, + "step": 434600 + }, + { + "epoch": 0.13, + "learning_rate": 5.71010101010101e-05, + "loss": 0.4502, + "step": 434700 + }, + { + "epoch": 0.13, + "learning_rate": 5.7090909090909096e-05, + "loss": 0.4525, + "step": 434800 + }, + { + "epoch": 0.13, + "learning_rate": 5.708080808080808e-05, + "loss": 0.4538, + "step": 434900 + }, + { + "epoch": 0.13, + "learning_rate": 5.707070707070707e-05, + "loss": 0.4534, + "step": 435000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.45245132777846925, + "eval_average_loss_on_sentence_tokens": 0.41563505673431933, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4508300721645355, + "eval_non_padding_tokens_in_labels": 133.5149, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39555, + "eval_padding_tokens_in_labels": 378.4851, + "eval_reconstruction_accuracy": 0.9161709173371749, + "eval_runtime": 188.5338, + "eval_samples_per_second": 26.52, + "eval_sentence_accuracy": 0.7357385109551923, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.248775, + "step": 435000 + }, + { + "epoch": 0.13, + "learning_rate": 5.706060606060606e-05, + "loss": 0.4557, + "step": 435100 + }, + { + "epoch": 0.13, + "learning_rate": 5.705050505050506e-05, + "loss": 0.4559, + "step": 435200 + }, + { + "epoch": 0.13, + "learning_rate": 5.7040404040404035e-05, + "loss": 0.4541, + "step": 435300 + }, + { + "epoch": 0.13, + "learning_rate": 5.7030303030303034e-05, + "loss": 0.4554, + "step": 435400 + }, + { + "epoch": 0.13, + "learning_rate": 5.702020202020202e-05, + "loss": 0.4539, + "step": 435500 + }, + { + "epoch": 0.13, + "learning_rate": 5.701010101010101e-05, + "loss": 0.4577, + "step": 435600 + }, + { + "epoch": 0.13, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.454, + "step": 435700 + }, + { + "epoch": 0.13, + "learning_rate": 5.6989898989898995e-05, + "loss": 0.4543, + "step": 435800 + }, + { + "epoch": 0.13, + "learning_rate": 5.697979797979798e-05, + "loss": 0.4542, + "step": 435900 + }, + { + "epoch": 0.13, + "learning_rate": 5.696969696969697e-05, + "loss": 0.4535, + "step": 436000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4524826898491434, + "eval_average_loss_on_sentence_tokens": 0.3948249884756723, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.44994139671325684, + "eval_non_padding_tokens_in_labels": 133.51305, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3765, + "eval_padding_tokens_in_labels": 378.48695, + "eval_reconstruction_accuracy": 0.9161991953803355, + "eval_runtime": 213.8612, + "eval_samples_per_second": 23.38, + "eval_sentence_accuracy": 0.747743463671111, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.2499, + "step": 436000 + }, + { + "epoch": 0.13, + "learning_rate": 5.695959595959596e-05, + "loss": 0.4536, + "step": 436100 + }, + { + "epoch": 0.13, + "learning_rate": 5.694949494949495e-05, + "loss": 0.4506, + "step": 436200 + }, + { + "epoch": 0.13, + "learning_rate": 5.6939393939393935e-05, + "loss": 0.4446, + "step": 436300 + }, + { + "epoch": 0.13, + "learning_rate": 5.6929292929292933e-05, + "loss": 0.4564, + "step": 436400 + }, + { + "epoch": 0.13, + "learning_rate": 5.691919191919192e-05, + "loss": 0.4526, + "step": 436500 + }, + { + "epoch": 0.13, + "learning_rate": 5.690909090909091e-05, + "loss": 0.4544, + "step": 436600 + }, + { + "epoch": 0.13, + "learning_rate": 5.6898989898989896e-05, + "loss": 0.4545, + "step": 436700 + }, + { + "epoch": 0.13, + "learning_rate": 5.6888888888888895e-05, + "loss": 0.4553, + "step": 436800 + }, + { + "epoch": 0.13, + "learning_rate": 5.6878787878787887e-05, + "loss": 0.4528, + "step": 436900 + }, + { + "epoch": 0.13, + "learning_rate": 5.686868686868687e-05, + "loss": 0.4558, + "step": 437000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.45190403891381287, + "eval_average_loss_on_sentence_tokens": 0.4277567693251613, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.4508691430091858, + "eval_non_padding_tokens_in_labels": 133.5126, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36785, + "eval_padding_tokens_in_labels": 378.4874, + "eval_reconstruction_accuracy": 0.9162466581934523, + "eval_runtime": 180.6399, + "eval_samples_per_second": 27.679, + "eval_sentence_accuracy": 0.7293457390492939, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 437000 + }, + { + "epoch": 0.13, + "learning_rate": 5.6858585858585864e-05, + "loss": 0.4565, + "step": 437100 + }, + { + "epoch": 0.13, + "learning_rate": 5.684848484848485e-05, + "loss": 0.4553, + "step": 437200 + }, + { + "epoch": 0.13, + "learning_rate": 5.683838383838385e-05, + "loss": 0.4527, + "step": 437300 + }, + { + "epoch": 0.13, + "learning_rate": 5.682828282828283e-05, + "loss": 0.4547, + "step": 437400 + }, + { + "epoch": 0.13, + "learning_rate": 5.6818181818181825e-05, + "loss": 0.4577, + "step": 437500 + }, + { + "epoch": 0.13, + "learning_rate": 5.680808080808081e-05, + "loss": 0.4514, + "step": 437600 + }, + { + "epoch": 0.13, + "learning_rate": 5.679797979797981e-05, + "loss": 0.4513, + "step": 437700 + }, + { + "epoch": 0.13, + "learning_rate": 5.678787878787879e-05, + "loss": 0.4526, + "step": 437800 + }, + { + "epoch": 0.13, + "learning_rate": 5.6777777777777786e-05, + "loss": 0.4552, + "step": 437900 + }, + { + "epoch": 0.13, + "learning_rate": 5.676767676767677e-05, + "loss": 0.4521, + "step": 438000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.4525348216251786, + "eval_average_loss_on_sentence_tokens": 0.4264076993329957, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.4513574242591858, + "eval_non_padding_tokens_in_labels": 133.534, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3919, + "eval_padding_tokens_in_labels": 378.466, + "eval_reconstruction_accuracy": 0.916168619968131, + "eval_runtime": 195.1081, + "eval_samples_per_second": 25.627, + "eval_sentence_accuracy": 0.7271789258348735, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2464, + "step": 438000 + }, + { + "epoch": 0.13, + "learning_rate": 5.675757575757576e-05, + "loss": 0.4556, + "step": 438100 + }, + { + "epoch": 0.13, + "learning_rate": 5.674747474747475e-05, + "loss": 0.4558, + "step": 438200 + }, + { + "epoch": 0.13, + "learning_rate": 5.673737373737375e-05, + "loss": 0.4526, + "step": 438300 + }, + { + "epoch": 0.13, + "learning_rate": 5.6727272727272726e-05, + "loss": 0.4559, + "step": 438400 + }, + { + "epoch": 0.13, + "learning_rate": 5.6717171717171724e-05, + "loss": 0.4527, + "step": 438500 + }, + { + "epoch": 0.13, + "learning_rate": 5.670707070707071e-05, + "loss": 0.4514, + "step": 438600 + }, + { + "epoch": 0.13, + "learning_rate": 5.66969696969697e-05, + "loss": 0.4545, + "step": 438700 + }, + { + "epoch": 0.13, + "learning_rate": 5.668686868686869e-05, + "loss": 0.4558, + "step": 438800 + }, + { + "epoch": 0.13, + "learning_rate": 5.6676767676767686e-05, + "loss": 0.4519, + "step": 438900 + }, + { + "epoch": 0.13, + "learning_rate": 5.666666666666667e-05, + "loss": 0.4499, + "step": 439000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.45205957907877703, + "eval_average_loss_on_sentence_tokens": 0.37225438303186126, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.4484277367591858, + "eval_non_padding_tokens_in_labels": 133.51255, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39095, + "eval_padding_tokens_in_labels": 378.48745, + "eval_reconstruction_accuracy": 0.916317954455082, + "eval_runtime": 176.6468, + "eval_samples_per_second": 28.305, + "eval_sentence_accuracy": 0.7675498411900874, + "eval_steps_per_second": 0.074, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 439000 + }, + { + "epoch": 0.13, + "learning_rate": 5.665656565656566e-05, + "loss": 0.4546, + "step": 439100 + }, + { + "epoch": 0.13, + "learning_rate": 5.664646464646465e-05, + "loss": 0.4564, + "step": 439200 + }, + { + "epoch": 0.13, + "learning_rate": 5.663636363636364e-05, + "loss": 0.4557, + "step": 439300 + }, + { + "epoch": 0.13, + "learning_rate": 5.6626262626262625e-05, + "loss": 0.4522, + "step": 439400 + }, + { + "epoch": 0.13, + "learning_rate": 5.6616161616161624e-05, + "loss": 0.4546, + "step": 439500 + }, + { + "epoch": 0.13, + "learning_rate": 5.660606060606061e-05, + "loss": 0.4518, + "step": 439600 + }, + { + "epoch": 0.13, + "learning_rate": 5.65959595959596e-05, + "loss": 0.4514, + "step": 439700 + }, + { + "epoch": 0.13, + "learning_rate": 5.6585858585858586e-05, + "loss": 0.4533, + "step": 439800 + }, + { + "epoch": 0.13, + "learning_rate": 5.6575757575757585e-05, + "loss": 0.4559, + "step": 439900 + }, + { + "epoch": 0.13, + "learning_rate": 5.6565656565656563e-05, + "loss": 0.4532, + "step": 440000 + }, + { + "epoch": 0.13, + "eval_average_loss_on_non_sentence_tokens": 0.45245178801895825, + "eval_average_loss_on_sentence_tokens": 0.38791607286621393, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4495800733566284, + "eval_non_padding_tokens_in_labels": 133.53785, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37655, + "eval_padding_tokens_in_labels": 378.46215, + "eval_reconstruction_accuracy": 0.9161320820174, + "eval_runtime": 187.0779, + "eval_samples_per_second": 26.727, + "eval_sentence_accuracy": 0.7550917867461016, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 440000 + }, + { + "epoch": 0.13, + "learning_rate": 5.655555555555556e-05, + "loss": 0.4547, + "step": 440100 + }, + { + "epoch": 0.13, + "learning_rate": 5.654545454545455e-05, + "loss": 0.4542, + "step": 440200 + }, + { + "epoch": 0.13, + "learning_rate": 5.653535353535354e-05, + "loss": 0.4534, + "step": 440300 + }, + { + "epoch": 0.13, + "learning_rate": 5.6525252525252525e-05, + "loss": 0.4529, + "step": 440400 + }, + { + "epoch": 0.13, + "learning_rate": 5.651515151515152e-05, + "loss": 0.4573, + "step": 440500 + }, + { + "epoch": 0.13, + "learning_rate": 5.650505050505051e-05, + "loss": 0.4534, + "step": 440600 + }, + { + "epoch": 0.13, + "learning_rate": 5.64949494949495e-05, + "loss": 0.4553, + "step": 440700 + }, + { + "epoch": 0.13, + "learning_rate": 5.6484848484848486e-05, + "loss": 0.4518, + "step": 440800 + }, + { + "epoch": 0.13, + "learning_rate": 5.647474747474748e-05, + "loss": 0.4508, + "step": 440900 + }, + { + "epoch": 0.14, + "learning_rate": 5.646464646464646e-05, + "loss": 0.4532, + "step": 441000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4517082936332444, + "eval_average_loss_on_sentence_tokens": 0.3623721705881844, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.4476269483566284, + "eval_non_padding_tokens_in_labels": 133.5267, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38485, + "eval_padding_tokens_in_labels": 378.4733, + "eval_reconstruction_accuracy": 0.9162859499508518, + "eval_runtime": 173.1963, + "eval_samples_per_second": 28.869, + "eval_sentence_accuracy": 0.7763920541209827, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 441000 + }, + { + "epoch": 0.14, + "learning_rate": 5.645454545454546e-05, + "loss": 0.4512, + "step": 441100 + }, + { + "epoch": 0.14, + "learning_rate": 5.644444444444445e-05, + "loss": 0.4539, + "step": 441200 + }, + { + "epoch": 0.14, + "learning_rate": 5.643434343434344e-05, + "loss": 0.4518, + "step": 441300 + }, + { + "epoch": 0.14, + "learning_rate": 5.6424242424242424e-05, + "loss": 0.4471, + "step": 441400 + }, + { + "epoch": 0.14, + "learning_rate": 5.6414141414141416e-05, + "loss": 0.4537, + "step": 441500 + }, + { + "epoch": 0.14, + "learning_rate": 5.64040404040404e-05, + "loss": 0.4538, + "step": 441600 + }, + { + "epoch": 0.14, + "learning_rate": 5.63939393939394e-05, + "loss": 0.4548, + "step": 441700 + }, + { + "epoch": 0.14, + "learning_rate": 5.6383838383838385e-05, + "loss": 0.4537, + "step": 441800 + }, + { + "epoch": 0.14, + "learning_rate": 5.637373737373738e-05, + "loss": 0.4549, + "step": 441900 + }, + { + "epoch": 0.14, + "learning_rate": 5.636363636363636e-05, + "loss": 0.4519, + "step": 442000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.45140917290183263, + "eval_average_loss_on_sentence_tokens": 0.4123959699480161, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.44969725608825684, + "eval_non_padding_tokens_in_labels": 133.52675, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3816, + "eval_padding_tokens_in_labels": 378.47325, + "eval_reconstruction_accuracy": 0.9161752090098957, + "eval_runtime": 195.7259, + "eval_samples_per_second": 25.546, + "eval_sentence_accuracy": 0.7494526890017407, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.249775, + "step": 442000 + }, + { + "epoch": 0.14, + "learning_rate": 5.635353535353536e-05, + "loss": 0.4523, + "step": 442100 + }, + { + "epoch": 0.14, + "learning_rate": 5.634343434343434e-05, + "loss": 0.4512, + "step": 442200 + }, + { + "epoch": 0.14, + "learning_rate": 5.633333333333334e-05, + "loss": 0.4502, + "step": 442300 + }, + { + "epoch": 0.14, + "learning_rate": 5.6323232323232324e-05, + "loss": 0.4552, + "step": 442400 + }, + { + "epoch": 0.14, + "learning_rate": 5.6313131313131316e-05, + "loss": 0.4527, + "step": 442500 + }, + { + "epoch": 0.14, + "learning_rate": 5.63030303030303e-05, + "loss": 0.4522, + "step": 442600 + }, + { + "epoch": 0.14, + "learning_rate": 5.62929292929293e-05, + "loss": 0.4527, + "step": 442700 + }, + { + "epoch": 0.14, + "learning_rate": 5.6282828282828285e-05, + "loss": 0.453, + "step": 442800 + }, + { + "epoch": 0.14, + "learning_rate": 5.627272727272728e-05, + "loss": 0.4489, + "step": 442900 + }, + { + "epoch": 0.14, + "learning_rate": 5.626262626262626e-05, + "loss": 0.4548, + "step": 443000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4520179739105222, + "eval_average_loss_on_sentence_tokens": 0.3888907712859202, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.44911134243011475, + "eval_non_padding_tokens_in_labels": 133.48935, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3714, + "eval_padding_tokens_in_labels": 378.51065, + "eval_reconstruction_accuracy": 0.9162026315550399, + "eval_runtime": 178.3176, + "eval_samples_per_second": 28.04, + "eval_sentence_accuracy": 0.7574201015665656, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2499, + "step": 443000 + }, + { + "epoch": 0.14, + "learning_rate": 5.6252525252525254e-05, + "loss": 0.4554, + "step": 443100 + }, + { + "epoch": 0.14, + "learning_rate": 5.624242424242424e-05, + "loss": 0.4555, + "step": 443200 + }, + { + "epoch": 0.14, + "learning_rate": 5.623232323232324e-05, + "loss": 0.4518, + "step": 443300 + }, + { + "epoch": 0.14, + "learning_rate": 5.622222222222222e-05, + "loss": 0.4544, + "step": 443400 + }, + { + "epoch": 0.14, + "learning_rate": 5.6212121212121215e-05, + "loss": 0.4542, + "step": 443500 + }, + { + "epoch": 0.14, + "learning_rate": 5.62020202020202e-05, + "loss": 0.4505, + "step": 443600 + }, + { + "epoch": 0.14, + "learning_rate": 5.61919191919192e-05, + "loss": 0.4511, + "step": 443700 + }, + { + "epoch": 0.14, + "learning_rate": 5.618181818181818e-05, + "loss": 0.4548, + "step": 443800 + }, + { + "epoch": 0.14, + "learning_rate": 5.6171717171717176e-05, + "loss": 0.4538, + "step": 443900 + }, + { + "epoch": 0.14, + "learning_rate": 5.616161616161616e-05, + "loss": 0.45, + "step": 444000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4518679049955418, + "eval_average_loss_on_sentence_tokens": 0.42463603684322154, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.45068359375, + "eval_non_padding_tokens_in_labels": 133.5036, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37895, + "eval_padding_tokens_in_labels": 378.4964, + "eval_reconstruction_accuracy": 0.9162344523650455, + "eval_runtime": 168.3991, + "eval_samples_per_second": 29.691, + "eval_sentence_accuracy": 0.7275288459813017, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 444000 + }, + { + "epoch": 0.14, + "learning_rate": 5.615151515151515e-05, + "loss": 0.4526, + "step": 444100 + }, + { + "epoch": 0.14, + "learning_rate": 5.614141414141414e-05, + "loss": 0.4535, + "step": 444200 + }, + { + "epoch": 0.14, + "learning_rate": 5.613131313131314e-05, + "loss": 0.4524, + "step": 444300 + }, + { + "epoch": 0.14, + "learning_rate": 5.6121212121212116e-05, + "loss": 0.4551, + "step": 444400 + }, + { + "epoch": 0.14, + "learning_rate": 5.6111111111111114e-05, + "loss": 0.4537, + "step": 444500 + }, + { + "epoch": 0.14, + "learning_rate": 5.61010101010101e-05, + "loss": 0.4551, + "step": 444600 + }, + { + "epoch": 0.14, + "learning_rate": 5.609090909090909e-05, + "loss": 0.4543, + "step": 444700 + }, + { + "epoch": 0.14, + "learning_rate": 5.608080808080808e-05, + "loss": 0.4501, + "step": 444800 + }, + { + "epoch": 0.14, + "learning_rate": 5.6070707070707076e-05, + "loss": 0.4515, + "step": 444900 + }, + { + "epoch": 0.14, + "learning_rate": 5.606060606060606e-05, + "loss": 0.4547, + "step": 445000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4518116692791267, + "eval_average_loss_on_sentence_tokens": 0.3962921760635926, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4493945240974426, + "eval_non_padding_tokens_in_labels": 133.5379, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3754, + "eval_padding_tokens_in_labels": 378.4621, + "eval_reconstruction_accuracy": 0.9162919575986407, + "eval_runtime": 169.6613, + "eval_samples_per_second": 29.47, + "eval_sentence_accuracy": 0.7463258384625047, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 445000 + }, + { + "epoch": 0.14, + "learning_rate": 5.605050505050505e-05, + "loss": 0.4502, + "step": 445100 + }, + { + "epoch": 0.14, + "learning_rate": 5.604040404040404e-05, + "loss": 0.45, + "step": 445200 + }, + { + "epoch": 0.14, + "learning_rate": 5.603030303030303e-05, + "loss": 0.4532, + "step": 445300 + }, + { + "epoch": 0.14, + "learning_rate": 5.602020202020203e-05, + "loss": 0.4536, + "step": 445400 + }, + { + "epoch": 0.14, + "learning_rate": 5.6010101010101014e-05, + "loss": 0.449, + "step": 445500 + }, + { + "epoch": 0.14, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.4513, + "step": 445600 + }, + { + "epoch": 0.14, + "learning_rate": 5.598989898989899e-05, + "loss": 0.4495, + "step": 445700 + }, + { + "epoch": 0.14, + "learning_rate": 5.597979797979799e-05, + "loss": 0.4533, + "step": 445800 + }, + { + "epoch": 0.14, + "learning_rate": 5.5969696969696975e-05, + "loss": 0.4524, + "step": 445900 + }, + { + "epoch": 0.14, + "learning_rate": 5.595959595959597e-05, + "loss": 0.4515, + "step": 446000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.45135317186282914, + "eval_average_loss_on_sentence_tokens": 0.3746944430016491, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.4479687511920929, + "eval_non_padding_tokens_in_labels": 133.5278, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3804, + "eval_padding_tokens_in_labels": 378.4722, + "eval_reconstruction_accuracy": 0.9163084789994239, + "eval_runtime": 184.04, + "eval_samples_per_second": 27.168, + "eval_sentence_accuracy": 0.7694340265939311, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.248775, + "step": 446000 + }, + { + "epoch": 0.14, + "learning_rate": 5.594949494949495e-05, + "loss": 0.4517, + "step": 446100 + }, + { + "epoch": 0.14, + "learning_rate": 5.5939393939393944e-05, + "loss": 0.4529, + "step": 446200 + }, + { + "epoch": 0.14, + "learning_rate": 5.592929292929293e-05, + "loss": 0.454, + "step": 446300 + }, + { + "epoch": 0.14, + "learning_rate": 5.591919191919193e-05, + "loss": 0.4536, + "step": 446400 + }, + { + "epoch": 0.14, + "learning_rate": 5.5909090909090913e-05, + "loss": 0.4535, + "step": 446500 + }, + { + "epoch": 0.14, + "learning_rate": 5.5898989898989905e-05, + "loss": 0.4529, + "step": 446600 + }, + { + "epoch": 0.14, + "learning_rate": 5.588888888888889e-05, + "loss": 0.4516, + "step": 446700 + }, + { + "epoch": 0.14, + "learning_rate": 5.587878787878789e-05, + "loss": 0.4552, + "step": 446800 + }, + { + "epoch": 0.14, + "learning_rate": 5.586868686868687e-05, + "loss": 0.4527, + "step": 446900 + }, + { + "epoch": 0.14, + "learning_rate": 5.5858585858585867e-05, + "loss": 0.4532, + "step": 447000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.45069625287983145, + "eval_average_loss_on_sentence_tokens": 0.3662506411421745, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.4468945264816284, + "eval_non_padding_tokens_in_labels": 133.5522, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38205, + "eval_padding_tokens_in_labels": 378.4478, + "eval_reconstruction_accuracy": 0.9162400765126829, + "eval_runtime": 219.9629, + "eval_samples_per_second": 22.731, + "eval_sentence_accuracy": 0.7758402569669999, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 447000 + }, + { + "epoch": 0.14, + "learning_rate": 5.584848484848485e-05, + "loss": 0.4532, + "step": 447100 + }, + { + "epoch": 0.14, + "learning_rate": 5.5838383838383844e-05, + "loss": 0.45, + "step": 447200 + }, + { + "epoch": 0.14, + "learning_rate": 5.582828282828283e-05, + "loss": 0.4511, + "step": 447300 + }, + { + "epoch": 0.14, + "learning_rate": 5.581818181818183e-05, + "loss": 0.4488, + "step": 447400 + }, + { + "epoch": 0.14, + "learning_rate": 5.5808080808080806e-05, + "loss": 0.4518, + "step": 447500 + }, + { + "epoch": 0.14, + "learning_rate": 5.5797979797979805e-05, + "loss": 0.4505, + "step": 447600 + }, + { + "epoch": 0.14, + "learning_rate": 5.578787878787879e-05, + "loss": 0.4553, + "step": 447700 + }, + { + "epoch": 0.14, + "learning_rate": 5.577777777777778e-05, + "loss": 0.4525, + "step": 447800 + }, + { + "epoch": 0.14, + "learning_rate": 5.576767676767677e-05, + "loss": 0.4551, + "step": 447900 + }, + { + "epoch": 0.14, + "learning_rate": 5.5757575757575766e-05, + "loss": 0.4529, + "step": 448000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4508791010533585, + "eval_average_loss_on_sentence_tokens": 0.40871261491864797, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.448974609375, + "eval_non_padding_tokens_in_labels": 133.58565, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39715, + "eval_padding_tokens_in_labels": 378.41435, + "eval_reconstruction_accuracy": 0.9162782632030722, + "eval_runtime": 186.6716, + "eval_samples_per_second": 26.785, + "eval_sentence_accuracy": 0.7467654817234016, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 448000 + }, + { + "epoch": 0.14, + "learning_rate": 5.574747474747475e-05, + "loss": 0.4554, + "step": 448100 + }, + { + "epoch": 0.14, + "learning_rate": 5.573737373737374e-05, + "loss": 0.4518, + "step": 448200 + }, + { + "epoch": 0.14, + "learning_rate": 5.572727272727273e-05, + "loss": 0.4508, + "step": 448300 + }, + { + "epoch": 0.14, + "learning_rate": 5.571717171717172e-05, + "loss": 0.4547, + "step": 448400 + }, + { + "epoch": 0.14, + "learning_rate": 5.5707070707070706e-05, + "loss": 0.4516, + "step": 448500 + }, + { + "epoch": 0.14, + "learning_rate": 5.5696969696969704e-05, + "loss": 0.4543, + "step": 448600 + }, + { + "epoch": 0.14, + "learning_rate": 5.568686868686869e-05, + "loss": 0.452, + "step": 448700 + }, + { + "epoch": 0.14, + "learning_rate": 5.567676767676768e-05, + "loss": 0.45, + "step": 448800 + }, + { + "epoch": 0.14, + "learning_rate": 5.566666666666667e-05, + "loss": 0.4518, + "step": 448900 + }, + { + "epoch": 0.14, + "learning_rate": 5.5656565656565666e-05, + "loss": 0.4548, + "step": 449000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.4515498619485767, + "eval_average_loss_on_sentence_tokens": 0.3965301020308787, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4490332007408142, + "eval_non_padding_tokens_in_labels": 133.51085, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37935, + "eval_padding_tokens_in_labels": 378.48915, + "eval_reconstruction_accuracy": 0.9162197951136275, + "eval_runtime": 176.9158, + "eval_samples_per_second": 28.262, + "eval_sentence_accuracy": 0.7487752794875016, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 449000 + }, + { + "epoch": 0.14, + "learning_rate": 5.5646464646464644e-05, + "loss": 0.4535, + "step": 449100 + }, + { + "epoch": 0.14, + "learning_rate": 5.563636363636364e-05, + "loss": 0.4521, + "step": 449200 + }, + { + "epoch": 0.14, + "learning_rate": 5.562626262626263e-05, + "loss": 0.4548, + "step": 449300 + }, + { + "epoch": 0.14, + "learning_rate": 5.561616161616162e-05, + "loss": 0.4538, + "step": 449400 + }, + { + "epoch": 0.14, + "learning_rate": 5.5606060606060605e-05, + "loss": 0.453, + "step": 449500 + }, + { + "epoch": 0.14, + "learning_rate": 5.5595959595959604e-05, + "loss": 0.4533, + "step": 449600 + }, + { + "epoch": 0.14, + "learning_rate": 5.558585858585858e-05, + "loss": 0.4554, + "step": 449700 + }, + { + "epoch": 0.14, + "learning_rate": 5.557575757575758e-05, + "loss": 0.4526, + "step": 449800 + }, + { + "epoch": 0.14, + "learning_rate": 5.5565656565656566e-05, + "loss": 0.4504, + "step": 449900 + }, + { + "epoch": 0.14, + "learning_rate": 5.555555555555556e-05, + "loss": 0.4493, + "step": 450000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.45048799855341165, + "eval_average_loss_on_sentence_tokens": 0.40701277658386464, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.44847655296325684, + "eval_non_padding_tokens_in_labels": 133.5376, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.397, + "eval_padding_tokens_in_labels": 378.4624, + "eval_reconstruction_accuracy": 0.916351740892111, + "eval_runtime": 185.919, + "eval_samples_per_second": 26.893, + "eval_sentence_accuracy": 0.7483221777594344, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "step": 450000 + }, + { + "epoch": 0.14, + "learning_rate": 5.5545454545454543e-05, + "loss": 0.4524, + "step": 450100 + }, + { + "epoch": 0.14, + "learning_rate": 5.553535353535354e-05, + "loss": 0.4517, + "step": 450200 + }, + { + "epoch": 0.14, + "learning_rate": 5.552525252525253e-05, + "loss": 0.4497, + "step": 450300 + }, + { + "epoch": 0.14, + "learning_rate": 5.551515151515152e-05, + "loss": 0.4515, + "step": 450400 + }, + { + "epoch": 0.14, + "learning_rate": 5.5505050505050505e-05, + "loss": 0.4524, + "step": 450500 + }, + { + "epoch": 0.14, + "learning_rate": 5.5494949494949497e-05, + "loss": 0.4516, + "step": 450600 + }, + { + "epoch": 0.14, + "learning_rate": 5.548484848484848e-05, + "loss": 0.454, + "step": 450700 + }, + { + "epoch": 0.14, + "learning_rate": 5.547474747474748e-05, + "loss": 0.4539, + "step": 450800 + }, + { + "epoch": 0.14, + "learning_rate": 5.5464646464646466e-05, + "loss": 0.4506, + "step": 450900 + }, + { + "epoch": 0.14, + "learning_rate": 5.545454545454546e-05, + "loss": 0.4543, + "step": 451000 + }, + { + "epoch": 0.14, + "eval_average_loss_on_non_sentence_tokens": 0.450187205356626, + "eval_average_loss_on_sentence_tokens": 0.37605041135898337, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.44691407680511475, + "eval_non_padding_tokens_in_labels": 133.51525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38355, + "eval_padding_tokens_in_labels": 378.48475, + "eval_reconstruction_accuracy": 0.9163142869236999, + "eval_runtime": 225.5836, + "eval_samples_per_second": 22.165, + "eval_sentence_accuracy": 0.7711073626787733, + "eval_steps_per_second": 0.058, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 451000 + }, + { + "epoch": 0.15, + "learning_rate": 5.544444444444444e-05, + "loss": 0.4543, + "step": 451100 + }, + { + "epoch": 0.15, + "learning_rate": 5.543434343434344e-05, + "loss": 0.4504, + "step": 451200 + }, + { + "epoch": 0.15, + "learning_rate": 5.542424242424242e-05, + "loss": 0.4519, + "step": 451300 + }, + { + "epoch": 0.15, + "learning_rate": 5.541414141414142e-05, + "loss": 0.4487, + "step": 451400 + }, + { + "epoch": 0.15, + "learning_rate": 5.5404040404040404e-05, + "loss": 0.4498, + "step": 451500 + }, + { + "epoch": 0.15, + "learning_rate": 5.5393939393939396e-05, + "loss": 0.4532, + "step": 451600 + }, + { + "epoch": 0.15, + "learning_rate": 5.538383838383838e-05, + "loss": 0.4483, + "step": 451700 + }, + { + "epoch": 0.15, + "learning_rate": 5.537373737373738e-05, + "loss": 0.4519, + "step": 451800 + }, + { + "epoch": 0.15, + "learning_rate": 5.5363636363636365e-05, + "loss": 0.452, + "step": 451900 + }, + { + "epoch": 0.15, + "learning_rate": 5.535353535353536e-05, + "loss": 0.4478, + "step": 452000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.45080442465521553, + "eval_average_loss_on_sentence_tokens": 0.37894569579886855, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4475683569908142, + "eval_non_padding_tokens_in_labels": 133.5321, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3871, + "eval_padding_tokens_in_labels": 378.4679, + "eval_reconstruction_accuracy": 0.9163659920793823, + "eval_runtime": 187.0968, + "eval_samples_per_second": 26.724, + "eval_sentence_accuracy": 0.7665539146194843, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2496, + "step": 452000 + }, + { + "epoch": 0.15, + "learning_rate": 5.534343434343434e-05, + "loss": 0.4505, + "step": 452100 + }, + { + "epoch": 0.15, + "learning_rate": 5.5333333333333334e-05, + "loss": 0.4505, + "step": 452200 + }, + { + "epoch": 0.15, + "learning_rate": 5.532323232323232e-05, + "loss": 0.4508, + "step": 452300 + }, + { + "epoch": 0.15, + "learning_rate": 5.531313131313132e-05, + "loss": 0.4509, + "step": 452400 + }, + { + "epoch": 0.15, + "learning_rate": 5.5303030303030304e-05, + "loss": 0.4558, + "step": 452500 + }, + { + "epoch": 0.15, + "learning_rate": 5.5292929292929296e-05, + "loss": 0.4549, + "step": 452600 + }, + { + "epoch": 0.15, + "learning_rate": 5.528282828282828e-05, + "loss": 0.4488, + "step": 452700 + }, + { + "epoch": 0.15, + "learning_rate": 5.527272727272727e-05, + "loss": 0.4522, + "step": 452800 + }, + { + "epoch": 0.15, + "learning_rate": 5.526262626262626e-05, + "loss": 0.4536, + "step": 452900 + }, + { + "epoch": 0.15, + "learning_rate": 5.525252525252526e-05, + "loss": 0.451, + "step": 453000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.45178940517007116, + "eval_average_loss_on_sentence_tokens": 0.40888838796838545, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.4498242139816284, + "eval_non_padding_tokens_in_labels": 133.54395, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39715, + "eval_padding_tokens_in_labels": 378.45605, + "eval_reconstruction_accuracy": 0.916265308923914, + "eval_runtime": 213.413, + "eval_samples_per_second": 23.429, + "eval_sentence_accuracy": 0.7333115007088126, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 453000 + }, + { + "epoch": 0.15, + "learning_rate": 5.524242424242424e-05, + "loss": 0.451, + "step": 453100 + }, + { + "epoch": 0.15, + "learning_rate": 5.5232323232323234e-05, + "loss": 0.4515, + "step": 453200 + }, + { + "epoch": 0.15, + "learning_rate": 5.522222222222222e-05, + "loss": 0.4524, + "step": 453300 + }, + { + "epoch": 0.15, + "learning_rate": 5.521212121212122e-05, + "loss": 0.4549, + "step": 453400 + }, + { + "epoch": 0.15, + "learning_rate": 5.5202020202020196e-05, + "loss": 0.4481, + "step": 453500 + }, + { + "epoch": 0.15, + "learning_rate": 5.5191919191919195e-05, + "loss": 0.453, + "step": 453600 + }, + { + "epoch": 0.15, + "learning_rate": 5.518181818181818e-05, + "loss": 0.4535, + "step": 453700 + }, + { + "epoch": 0.15, + "learning_rate": 5.517171717171717e-05, + "loss": 0.4498, + "step": 453800 + }, + { + "epoch": 0.15, + "learning_rate": 5.516161616161617e-05, + "loss": 0.4526, + "step": 453900 + }, + { + "epoch": 0.15, + "learning_rate": 5.5151515151515156e-05, + "loss": 0.4492, + "step": 454000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.45132635430118073, + "eval_average_loss_on_sentence_tokens": 0.4154888722606727, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4496484398841858, + "eval_non_padding_tokens_in_labels": 133.5482, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39045, + "eval_padding_tokens_in_labels": 378.4518, + "eval_reconstruction_accuracy": 0.9162029048474957, + "eval_runtime": 185.812, + "eval_samples_per_second": 26.909, + "eval_sentence_accuracy": 0.7400541927611392, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 454000 + }, + { + "epoch": 0.15, + "learning_rate": 5.514141414141415e-05, + "loss": 0.4522, + "step": 454100 + }, + { + "epoch": 0.15, + "learning_rate": 5.513131313131313e-05, + "loss": 0.4483, + "step": 454200 + }, + { + "epoch": 0.15, + "learning_rate": 5.512121212121213e-05, + "loss": 0.4538, + "step": 454300 + }, + { + "epoch": 0.15, + "learning_rate": 5.511111111111111e-05, + "loss": 0.4527, + "step": 454400 + }, + { + "epoch": 0.15, + "learning_rate": 5.510101010101011e-05, + "loss": 0.4469, + "step": 454500 + }, + { + "epoch": 0.15, + "learning_rate": 5.5090909090909094e-05, + "loss": 0.4541, + "step": 454600 + }, + { + "epoch": 0.15, + "learning_rate": 5.5080808080808086e-05, + "loss": 0.4509, + "step": 454700 + }, + { + "epoch": 0.15, + "learning_rate": 5.507070707070707e-05, + "loss": 0.4519, + "step": 454800 + }, + { + "epoch": 0.15, + "learning_rate": 5.506060606060607e-05, + "loss": 0.449, + "step": 454900 + }, + { + "epoch": 0.15, + "learning_rate": 5.5050505050505056e-05, + "loss": 0.4539, + "step": 455000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.45050879700536184, + "eval_average_loss_on_sentence_tokens": 0.3957325969125623, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.44801756739616394, + "eval_non_padding_tokens_in_labels": 133.54645, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38225, + "eval_padding_tokens_in_labels": 378.45355, + "eval_reconstruction_accuracy": 0.9162025802482731, + "eval_runtime": 237.15, + "eval_samples_per_second": 21.084, + "eval_sentence_accuracy": 0.7608834137850593, + "eval_steps_per_second": 0.055, + "eval_variance_shuffling_prob": 0.2499, + "step": 455000 + }, + { + "epoch": 0.15, + "learning_rate": 5.504040404040405e-05, + "loss": 0.4521, + "step": 455100 + }, + { + "epoch": 0.15, + "learning_rate": 5.503030303030303e-05, + "loss": 0.4518, + "step": 455200 + }, + { + "epoch": 0.15, + "learning_rate": 5.5020202020202025e-05, + "loss": 0.4508, + "step": 455300 + }, + { + "epoch": 0.15, + "learning_rate": 5.501010101010101e-05, + "loss": 0.455, + "step": 455400 + }, + { + "epoch": 0.15, + "learning_rate": 5.500000000000001e-05, + "loss": 0.4577, + "step": 455500 + }, + { + "epoch": 0.15, + "learning_rate": 5.4989898989898994e-05, + "loss": 0.4494, + "step": 455600 + }, + { + "epoch": 0.15, + "learning_rate": 5.4979797979797986e-05, + "loss": 0.4509, + "step": 455700 + }, + { + "epoch": 0.15, + "learning_rate": 5.496969696969697e-05, + "loss": 0.4504, + "step": 455800 + }, + { + "epoch": 0.15, + "learning_rate": 5.495959595959596e-05, + "loss": 0.451, + "step": 455900 + }, + { + "epoch": 0.15, + "learning_rate": 5.494949494949495e-05, + "loss": 0.452, + "step": 456000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4510154857771239, + "eval_average_loss_on_sentence_tokens": 0.43393029891833634, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.45020508766174316, + "eval_non_padding_tokens_in_labels": 133.53495, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37645, + "eval_padding_tokens_in_labels": 378.46505, + "eval_reconstruction_accuracy": 0.9162739703489571, + "eval_runtime": 205.7256, + "eval_samples_per_second": 24.304, + "eval_sentence_accuracy": 0.7242898415489798, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.246975, + "step": 456000 + }, + { + "epoch": 0.15, + "learning_rate": 5.493939393939395e-05, + "loss": 0.4495, + "step": 456100 + }, + { + "epoch": 0.15, + "learning_rate": 5.492929292929293e-05, + "loss": 0.4512, + "step": 456200 + }, + { + "epoch": 0.15, + "learning_rate": 5.4919191919191924e-05, + "loss": 0.4498, + "step": 456300 + }, + { + "epoch": 0.15, + "learning_rate": 5.490909090909091e-05, + "loss": 0.4501, + "step": 456400 + }, + { + "epoch": 0.15, + "learning_rate": 5.489898989898991e-05, + "loss": 0.4557, + "step": 456500 + }, + { + "epoch": 0.15, + "learning_rate": 5.488888888888889e-05, + "loss": 0.4501, + "step": 456600 + }, + { + "epoch": 0.15, + "learning_rate": 5.4878787878787885e-05, + "loss": 0.4503, + "step": 456700 + }, + { + "epoch": 0.15, + "learning_rate": 5.486868686868687e-05, + "loss": 0.4468, + "step": 456800 + }, + { + "epoch": 0.15, + "learning_rate": 5.485858585858586e-05, + "loss": 0.4521, + "step": 456900 + }, + { + "epoch": 0.15, + "learning_rate": 5.484848484848485e-05, + "loss": 0.4549, + "step": 457000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4508512796323216, + "eval_average_loss_on_sentence_tokens": 0.3967523714144733, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.44844725728034973, + "eval_non_padding_tokens_in_labels": 133.55555, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3792, + "eval_padding_tokens_in_labels": 378.44445, + "eval_reconstruction_accuracy": 0.9162948760634206, + "eval_runtime": 225.0144, + "eval_samples_per_second": 22.221, + "eval_sentence_accuracy": 0.7547553250668437, + "eval_steps_per_second": 0.058, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 457000 + }, + { + "epoch": 0.15, + "learning_rate": 5.4838383838383847e-05, + "loss": 0.4511, + "step": 457100 + }, + { + "epoch": 0.15, + "learning_rate": 5.482828282828283e-05, + "loss": 0.4532, + "step": 457200 + }, + { + "epoch": 0.15, + "learning_rate": 5.4818181818181824e-05, + "loss": 0.4516, + "step": 457300 + }, + { + "epoch": 0.15, + "learning_rate": 5.480808080808081e-05, + "loss": 0.453, + "step": 457400 + }, + { + "epoch": 0.15, + "learning_rate": 5.47979797979798e-05, + "loss": 0.4546, + "step": 457500 + }, + { + "epoch": 0.15, + "learning_rate": 5.4787878787878786e-05, + "loss": 0.453, + "step": 457600 + }, + { + "epoch": 0.15, + "learning_rate": 5.4777777777777785e-05, + "loss": 0.4551, + "step": 457700 + }, + { + "epoch": 0.15, + "learning_rate": 5.476767676767677e-05, + "loss": 0.4522, + "step": 457800 + }, + { + "epoch": 0.15, + "learning_rate": 5.475757575757576e-05, + "loss": 0.4514, + "step": 457900 + }, + { + "epoch": 0.15, + "learning_rate": 5.474747474747475e-05, + "loss": 0.4504, + "step": 458000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4505359855717749, + "eval_average_loss_on_sentence_tokens": 0.4101980628849558, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4486425817012787, + "eval_non_padding_tokens_in_labels": 133.5283, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38075, + "eval_padding_tokens_in_labels": 378.4717, + "eval_reconstruction_accuracy": 0.9163870990929112, + "eval_runtime": 239.5064, + "eval_samples_per_second": 20.876, + "eval_sentence_accuracy": 0.738443662856425, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.248775, + "step": 458000 + }, + { + "epoch": 0.15, + "learning_rate": 5.4737373737373746e-05, + "loss": 0.4531, + "step": 458100 + }, + { + "epoch": 0.15, + "learning_rate": 5.4727272727272724e-05, + "loss": 0.4525, + "step": 458200 + }, + { + "epoch": 0.15, + "learning_rate": 5.471717171717172e-05, + "loss": 0.4546, + "step": 458300 + }, + { + "epoch": 0.15, + "learning_rate": 5.470707070707071e-05, + "loss": 0.4519, + "step": 458400 + }, + { + "epoch": 0.15, + "learning_rate": 5.46969696969697e-05, + "loss": 0.4521, + "step": 458500 + }, + { + "epoch": 0.15, + "learning_rate": 5.4686868686868686e-05, + "loss": 0.4466, + "step": 458600 + }, + { + "epoch": 0.15, + "learning_rate": 5.4676767676767684e-05, + "loss": 0.4523, + "step": 458700 + }, + { + "epoch": 0.15, + "learning_rate": 5.466666666666666e-05, + "loss": 0.4506, + "step": 458800 + }, + { + "epoch": 0.15, + "learning_rate": 5.465656565656566e-05, + "loss": 0.4489, + "step": 458900 + }, + { + "epoch": 0.15, + "learning_rate": 5.464646464646465e-05, + "loss": 0.4523, + "step": 459000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.4504066935658152, + "eval_average_loss_on_sentence_tokens": 0.36315053612470943, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.44645509123802185, + "eval_non_padding_tokens_in_labels": 133.581, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3837, + "eval_padding_tokens_in_labels": 378.419, + "eval_reconstruction_accuracy": 0.9163687881408057, + "eval_runtime": 232.3296, + "eval_samples_per_second": 21.521, + "eval_sentence_accuracy": 0.7803174403789904, + "eval_steps_per_second": 0.056, + "eval_variance_shuffling_prob": 0.2464, + "step": 459000 + }, + { + "epoch": 0.15, + "learning_rate": 5.463636363636364e-05, + "loss": 0.4512, + "step": 459100 + }, + { + "epoch": 0.15, + "learning_rate": 5.4626262626262624e-05, + "loss": 0.4524, + "step": 459200 + }, + { + "epoch": 0.15, + "learning_rate": 5.461616161616162e-05, + "loss": 0.4552, + "step": 459300 + }, + { + "epoch": 0.15, + "learning_rate": 5.460606060606061e-05, + "loss": 0.4507, + "step": 459400 + }, + { + "epoch": 0.15, + "learning_rate": 5.45959595959596e-05, + "loss": 0.4512, + "step": 459500 + }, + { + "epoch": 0.15, + "learning_rate": 5.4585858585858585e-05, + "loss": 0.4509, + "step": 459600 + }, + { + "epoch": 0.15, + "learning_rate": 5.457575757575758e-05, + "loss": 0.4499, + "step": 459700 + }, + { + "epoch": 0.15, + "learning_rate": 5.456565656565656e-05, + "loss": 0.4502, + "step": 459800 + }, + { + "epoch": 0.15, + "learning_rate": 5.455555555555556e-05, + "loss": 0.4495, + "step": 459900 + }, + { + "epoch": 0.15, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.4526, + "step": 460000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.44983237468295706, + "eval_average_loss_on_sentence_tokens": 0.41779732224400845, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.44832029938697815, + "eval_non_padding_tokens_in_labels": 133.5156, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37825, + "eval_padding_tokens_in_labels": 378.4844, + "eval_reconstruction_accuracy": 0.916437572398431, + "eval_runtime": 284.4611, + "eval_samples_per_second": 17.577, + "eval_sentence_accuracy": 0.7396100633445188, + "eval_steps_per_second": 0.046, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 460000 + }, + { + "epoch": 0.15, + "learning_rate": 5.453535353535354e-05, + "loss": 0.4522, + "step": 460100 + }, + { + "epoch": 0.15, + "learning_rate": 5.4525252525252523e-05, + "loss": 0.4529, + "step": 460200 + }, + { + "epoch": 0.15, + "learning_rate": 5.451515151515152e-05, + "loss": 0.4493, + "step": 460300 + }, + { + "epoch": 0.15, + "learning_rate": 5.45050505050505e-05, + "loss": 0.4497, + "step": 460400 + }, + { + "epoch": 0.15, + "learning_rate": 5.44949494949495e-05, + "loss": 0.4513, + "step": 460500 + }, + { + "epoch": 0.15, + "learning_rate": 5.4484848484848485e-05, + "loss": 0.4512, + "step": 460600 + }, + { + "epoch": 0.15, + "learning_rate": 5.4474747474747477e-05, + "loss": 0.4533, + "step": 460700 + }, + { + "epoch": 0.15, + "learning_rate": 5.446464646464646e-05, + "loss": 0.4485, + "step": 460800 + }, + { + "epoch": 0.15, + "learning_rate": 5.445454545454546e-05, + "loss": 0.4533, + "step": 460900 + }, + { + "epoch": 0.15, + "learning_rate": 5.4444444444444446e-05, + "loss": 0.4562, + "step": 461000 + }, + { + "epoch": 0.15, + "eval_average_loss_on_non_sentence_tokens": 0.450829053004552, + "eval_average_loss_on_sentence_tokens": 0.395018751379673, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.44837892055511475, + "eval_non_padding_tokens_in_labels": 133.5362, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3767, + "eval_padding_tokens_in_labels": 378.4638, + "eval_reconstruction_accuracy": 0.9164157757906619, + "eval_runtime": 246.7834, + "eval_samples_per_second": 20.261, + "eval_sentence_accuracy": 0.7507222710714734, + "eval_steps_per_second": 0.053, + "eval_variance_shuffling_prob": 0.25, + "step": 461000 + }, + { + "epoch": 0.16, + "learning_rate": 5.443434343434344e-05, + "loss": 0.4514, + "step": 461100 + }, + { + "epoch": 0.16, + "learning_rate": 5.442424242424242e-05, + "loss": 0.4518, + "step": 461200 + }, + { + "epoch": 0.16, + "learning_rate": 5.4414141414141415e-05, + "loss": 0.452, + "step": 461300 + }, + { + "epoch": 0.16, + "learning_rate": 5.44040404040404e-05, + "loss": 0.4532, + "step": 461400 + }, + { + "epoch": 0.16, + "learning_rate": 5.43939393939394e-05, + "loss": 0.4493, + "step": 461500 + }, + { + "epoch": 0.16, + "learning_rate": 5.4383838383838384e-05, + "loss": 0.4532, + "step": 461600 + }, + { + "epoch": 0.16, + "learning_rate": 5.4373737373737376e-05, + "loss": 0.4554, + "step": 461700 + }, + { + "epoch": 0.16, + "learning_rate": 5.436363636363636e-05, + "loss": 0.4525, + "step": 461800 + }, + { + "epoch": 0.16, + "learning_rate": 5.435353535353535e-05, + "loss": 0.4526, + "step": 461900 + }, + { + "epoch": 0.16, + "learning_rate": 5.434343434343434e-05, + "loss": 0.4489, + "step": 462000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.45008778831637586, + "eval_average_loss_on_sentence_tokens": 0.3943299631437933, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.44758787751197815, + "eval_non_padding_tokens_in_labels": 133.52685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37815, + "eval_padding_tokens_in_labels": 378.47315, + "eval_reconstruction_accuracy": 0.9163446280220293, + "eval_runtime": 254.0704, + "eval_samples_per_second": 19.68, + "eval_sentence_accuracy": 0.7581737757281031, + "eval_steps_per_second": 0.051, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 462000 + }, + { + "epoch": 0.16, + "learning_rate": 5.433333333333334e-05, + "loss": 0.453, + "step": 462100 + }, + { + "epoch": 0.16, + "learning_rate": 5.432323232323232e-05, + "loss": 0.4561, + "step": 462200 + }, + { + "epoch": 0.16, + "learning_rate": 5.4313131313131314e-05, + "loss": 0.4519, + "step": 462300 + }, + { + "epoch": 0.16, + "learning_rate": 5.430303030303031e-05, + "loss": 0.4526, + "step": 462400 + }, + { + "epoch": 0.16, + "learning_rate": 5.42929292929293e-05, + "loss": 0.4523, + "step": 462500 + }, + { + "epoch": 0.16, + "learning_rate": 5.428282828282829e-05, + "loss": 0.4513, + "step": 462600 + }, + { + "epoch": 0.16, + "learning_rate": 5.4272727272727275e-05, + "loss": 0.4517, + "step": 462700 + }, + { + "epoch": 0.16, + "learning_rate": 5.426262626262627e-05, + "loss": 0.4528, + "step": 462800 + }, + { + "epoch": 0.16, + "learning_rate": 5.425252525252525e-05, + "loss": 0.4522, + "step": 462900 + }, + { + "epoch": 0.16, + "learning_rate": 5.424242424242425e-05, + "loss": 0.4579, + "step": 463000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.450437945603515, + "eval_average_loss_on_sentence_tokens": 0.39649777439802975, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4480957090854645, + "eval_non_padding_tokens_in_labels": 133.53105, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3997, + "eval_padding_tokens_in_labels": 378.46895, + "eval_reconstruction_accuracy": 0.9164671983425959, + "eval_runtime": 275.2205, + "eval_samples_per_second": 18.167, + "eval_sentence_accuracy": 0.7511708866438171, + "eval_steps_per_second": 0.047, + "eval_variance_shuffling_prob": 0.25, + "step": 463000 + }, + { + "epoch": 0.16, + "learning_rate": 5.423232323232324e-05, + "loss": 0.4492, + "step": 463100 + }, + { + "epoch": 0.16, + "learning_rate": 5.422222222222223e-05, + "loss": 0.4504, + "step": 463200 + }, + { + "epoch": 0.16, + "learning_rate": 5.4212121212121214e-05, + "loss": 0.4516, + "step": 463300 + }, + { + "epoch": 0.16, + "learning_rate": 5.420202020202021e-05, + "loss": 0.4514, + "step": 463400 + }, + { + "epoch": 0.16, + "learning_rate": 5.419191919191919e-05, + "loss": 0.4513, + "step": 463500 + }, + { + "epoch": 0.16, + "learning_rate": 5.418181818181819e-05, + "loss": 0.4477, + "step": 463600 + }, + { + "epoch": 0.16, + "learning_rate": 5.4171717171717175e-05, + "loss": 0.4509, + "step": 463700 + }, + { + "epoch": 0.16, + "learning_rate": 5.416161616161617e-05, + "loss": 0.4508, + "step": 463800 + }, + { + "epoch": 0.16, + "learning_rate": 5.415151515151515e-05, + "loss": 0.4493, + "step": 463900 + }, + { + "epoch": 0.16, + "learning_rate": 5.414141414141415e-05, + "loss": 0.4508, + "step": 464000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.45070236926127577, + "eval_average_loss_on_sentence_tokens": 0.3913581203767427, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.4480566382408142, + "eval_non_padding_tokens_in_labels": 133.54105, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37575, + "eval_padding_tokens_in_labels": 378.45895, + "eval_reconstruction_accuracy": 0.9163453678460514, + "eval_runtime": 266.3411, + "eval_samples_per_second": 18.773, + "eval_sentence_accuracy": 0.752696179589786, + "eval_steps_per_second": 0.049, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 464000 + }, + { + "epoch": 0.16, + "learning_rate": 5.4131313131313136e-05, + "loss": 0.4512, + "step": 464100 + }, + { + "epoch": 0.16, + "learning_rate": 5.412121212121213e-05, + "loss": 0.4566, + "step": 464200 + }, + { + "epoch": 0.16, + "learning_rate": 5.411111111111111e-05, + "loss": 0.449, + "step": 464300 + }, + { + "epoch": 0.16, + "learning_rate": 5.4101010101010105e-05, + "loss": 0.4491, + "step": 464400 + }, + { + "epoch": 0.16, + "learning_rate": 5.409090909090909e-05, + "loss": 0.451, + "step": 464500 + }, + { + "epoch": 0.16, + "learning_rate": 5.408080808080809e-05, + "loss": 0.4544, + "step": 464600 + }, + { + "epoch": 0.16, + "learning_rate": 5.4070707070707074e-05, + "loss": 0.4531, + "step": 464700 + }, + { + "epoch": 0.16, + "learning_rate": 5.4060606060606066e-05, + "loss": 0.4466, + "step": 464800 + }, + { + "epoch": 0.16, + "learning_rate": 5.405050505050505e-05, + "loss": 0.4493, + "step": 464900 + }, + { + "epoch": 0.16, + "learning_rate": 5.4040404040404044e-05, + "loss": 0.4487, + "step": 465000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.45015544844816446, + "eval_average_loss_on_sentence_tokens": 0.3591114756874559, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.446044921875, + "eval_non_padding_tokens_in_labels": 133.51775, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3778, + "eval_padding_tokens_in_labels": 378.48225, + "eval_reconstruction_accuracy": 0.9165743206628626, + "eval_runtime": 279.0146, + "eval_samples_per_second": 17.92, + "eval_sentence_accuracy": 0.772385917059953, + "eval_steps_per_second": 0.047, + "eval_variance_shuffling_prob": 0.248775, + "step": 465000 + }, + { + "epoch": 0.16, + "learning_rate": 5.403030303030303e-05, + "loss": 0.4484, + "step": 465100 + }, + { + "epoch": 0.16, + "learning_rate": 5.402020202020203e-05, + "loss": 0.4496, + "step": 465200 + }, + { + "epoch": 0.16, + "learning_rate": 5.401010101010101e-05, + "loss": 0.45, + "step": 465300 + }, + { + "epoch": 0.16, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.4508, + "step": 465400 + }, + { + "epoch": 0.16, + "learning_rate": 5.398989898989899e-05, + "loss": 0.4504, + "step": 465500 + }, + { + "epoch": 0.16, + "learning_rate": 5.397979797979799e-05, + "loss": 0.4499, + "step": 465600 + }, + { + "epoch": 0.16, + "learning_rate": 5.396969696969697e-05, + "loss": 0.452, + "step": 465700 + }, + { + "epoch": 0.16, + "learning_rate": 5.3959595959595966e-05, + "loss": 0.4534, + "step": 465800 + }, + { + "epoch": 0.16, + "learning_rate": 5.394949494949495e-05, + "loss": 0.4496, + "step": 465900 + }, + { + "epoch": 0.16, + "learning_rate": 5.393939393939394e-05, + "loss": 0.4478, + "step": 466000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.450428837097132, + "eval_average_loss_on_sentence_tokens": 0.41480329649876857, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4488281309604645, + "eval_non_padding_tokens_in_labels": 133.51295, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36455, + "eval_padding_tokens_in_labels": 378.48705, + "eval_reconstruction_accuracy": 0.9164501853764039, + "eval_runtime": 320.7873, + "eval_samples_per_second": 15.587, + "eval_sentence_accuracy": 0.7338543255513486, + "eval_steps_per_second": 0.041, + "eval_variance_shuffling_prob": 0.248775, + "step": 466000 + }, + { + "epoch": 0.16, + "learning_rate": 5.392929292929293e-05, + "loss": 0.4486, + "step": 466100 + }, + { + "epoch": 0.16, + "learning_rate": 5.391919191919193e-05, + "loss": 0.4509, + "step": 466200 + }, + { + "epoch": 0.16, + "learning_rate": 5.390909090909091e-05, + "loss": 0.4461, + "step": 466300 + }, + { + "epoch": 0.16, + "learning_rate": 5.3898989898989904e-05, + "loss": 0.4507, + "step": 466400 + }, + { + "epoch": 0.16, + "learning_rate": 5.388888888888889e-05, + "loss": 0.4507, + "step": 466500 + }, + { + "epoch": 0.16, + "learning_rate": 5.387878787878788e-05, + "loss": 0.4499, + "step": 466600 + }, + { + "epoch": 0.16, + "learning_rate": 5.386868686868687e-05, + "loss": 0.4484, + "step": 466700 + }, + { + "epoch": 0.16, + "learning_rate": 5.3858585858585865e-05, + "loss": 0.4542, + "step": 466800 + }, + { + "epoch": 0.16, + "learning_rate": 5.384848484848485e-05, + "loss": 0.4505, + "step": 466900 + }, + { + "epoch": 0.16, + "learning_rate": 5.383838383838384e-05, + "loss": 0.4504, + "step": 467000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.44887335024020497, + "eval_average_loss_on_sentence_tokens": 0.40853261492946896, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4471191465854645, + "eval_non_padding_tokens_in_labels": 133.48365, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36675, + "eval_padding_tokens_in_labels": 378.51635, + "eval_reconstruction_accuracy": 0.9166754686850144, + "eval_runtime": 259.4863, + "eval_samples_per_second": 19.269, + "eval_sentence_accuracy": 0.7406598237838031, + "eval_steps_per_second": 0.05, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 467000 + }, + { + "epoch": 0.16, + "learning_rate": 5.382828282828283e-05, + "loss": 0.4499, + "step": 467100 + }, + { + "epoch": 0.16, + "learning_rate": 5.3818181818181827e-05, + "loss": 0.4478, + "step": 467200 + }, + { + "epoch": 0.16, + "learning_rate": 5.3808080808080805e-05, + "loss": 0.4508, + "step": 467300 + }, + { + "epoch": 0.16, + "learning_rate": 5.3797979797979804e-05, + "loss": 0.449, + "step": 467400 + }, + { + "epoch": 0.16, + "learning_rate": 5.378787878787879e-05, + "loss": 0.447, + "step": 467500 + }, + { + "epoch": 0.16, + "learning_rate": 5.377777777777778e-05, + "loss": 0.4509, + "step": 467600 + }, + { + "epoch": 0.16, + "learning_rate": 5.3767676767676766e-05, + "loss": 0.4519, + "step": 467700 + }, + { + "epoch": 0.16, + "learning_rate": 5.3757575757575765e-05, + "loss": 0.4511, + "step": 467800 + }, + { + "epoch": 0.16, + "learning_rate": 5.374747474747474e-05, + "loss": 0.4539, + "step": 467900 + }, + { + "epoch": 0.16, + "learning_rate": 5.373737373737374e-05, + "loss": 0.4493, + "step": 468000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4489507427674675, + "eval_average_loss_on_sentence_tokens": 0.40624127559699263, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4470800757408142, + "eval_non_padding_tokens_in_labels": 133.50605, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37685, + "eval_padding_tokens_in_labels": 378.49395, + "eval_reconstruction_accuracy": 0.9164651521078013, + "eval_runtime": 295.3904, + "eval_samples_per_second": 16.927, + "eval_sentence_accuracy": 0.7549661743858452, + "eval_steps_per_second": 0.044, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 468000 + }, + { + "epoch": 0.16, + "learning_rate": 5.372727272727273e-05, + "loss": 0.4494, + "step": 468100 + }, + { + "epoch": 0.16, + "learning_rate": 5.371717171717172e-05, + "loss": 0.4505, + "step": 468200 + }, + { + "epoch": 0.16, + "learning_rate": 5.3707070707070704e-05, + "loss": 0.4499, + "step": 468300 + }, + { + "epoch": 0.16, + "learning_rate": 5.36969696969697e-05, + "loss": 0.4519, + "step": 468400 + }, + { + "epoch": 0.16, + "learning_rate": 5.368686868686869e-05, + "loss": 0.4539, + "step": 468500 + }, + { + "epoch": 0.16, + "learning_rate": 5.367676767676768e-05, + "loss": 0.4523, + "step": 468600 + }, + { + "epoch": 0.16, + "learning_rate": 5.3666666666666666e-05, + "loss": 0.4495, + "step": 468700 + }, + { + "epoch": 0.16, + "learning_rate": 5.365656565656566e-05, + "loss": 0.4432, + "step": 468800 + }, + { + "epoch": 0.16, + "learning_rate": 5.364646464646464e-05, + "loss": 0.449, + "step": 468900 + }, + { + "epoch": 0.16, + "learning_rate": 5.363636363636364e-05, + "loss": 0.4516, + "step": 469000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4505709745257834, + "eval_average_loss_on_sentence_tokens": 0.36691859902048624, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.44682615995407104, + "eval_non_padding_tokens_in_labels": 133.53445, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3848, + "eval_padding_tokens_in_labels": 378.46555, + "eval_reconstruction_accuracy": 0.9164181804459215, + "eval_runtime": 290.9883, + "eval_samples_per_second": 17.183, + "eval_sentence_accuracy": 0.7714303658908608, + "eval_steps_per_second": 0.045, + "eval_variance_shuffling_prob": 0.248775, + "step": 469000 + }, + { + "epoch": 0.16, + "learning_rate": 5.362626262626263e-05, + "loss": 0.454, + "step": 469100 + }, + { + "epoch": 0.16, + "learning_rate": 5.361616161616162e-05, + "loss": 0.4505, + "step": 469200 + }, + { + "epoch": 0.16, + "learning_rate": 5.3606060606060604e-05, + "loss": 0.4494, + "step": 469300 + }, + { + "epoch": 0.16, + "learning_rate": 5.35959595959596e-05, + "loss": 0.4507, + "step": 469400 + }, + { + "epoch": 0.16, + "learning_rate": 5.358585858585858e-05, + "loss": 0.4552, + "step": 469500 + }, + { + "epoch": 0.16, + "learning_rate": 5.357575757575758e-05, + "loss": 0.4523, + "step": 469600 + }, + { + "epoch": 0.16, + "learning_rate": 5.3565656565656565e-05, + "loss": 0.45, + "step": 469700 + }, + { + "epoch": 0.16, + "learning_rate": 5.355555555555556e-05, + "loss": 0.4495, + "step": 469800 + }, + { + "epoch": 0.16, + "learning_rate": 5.354545454545454e-05, + "loss": 0.4529, + "step": 469900 + }, + { + "epoch": 0.16, + "learning_rate": 5.353535353535354e-05, + "loss": 0.4523, + "step": 470000 + }, + { + "epoch": 0.16, + "eval_average_loss_on_non_sentence_tokens": 0.4489074091991261, + "eval_average_loss_on_sentence_tokens": 0.37509569790895675, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.44554686546325684, + "eval_non_padding_tokens_in_labels": 133.50615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.368, + "eval_padding_tokens_in_labels": 378.49385, + "eval_reconstruction_accuracy": 0.9165202105453998, + "eval_runtime": 275.926, + "eval_samples_per_second": 18.121, + "eval_sentence_accuracy": 0.7710176395643046, + "eval_steps_per_second": 0.047, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 470000 + }, + { + "epoch": 0.16, + "learning_rate": 5.352525252525252e-05, + "loss": 0.449, + "step": 470100 + }, + { + "epoch": 0.16, + "learning_rate": 5.351515151515152e-05, + "loss": 0.4485, + "step": 470200 + }, + { + "epoch": 0.16, + "learning_rate": 5.3505050505050503e-05, + "loss": 0.4529, + "step": 470300 + }, + { + "epoch": 0.16, + "learning_rate": 5.3494949494949495e-05, + "loss": 0.454, + "step": 470400 + }, + { + "epoch": 0.16, + "learning_rate": 5.348484848484848e-05, + "loss": 0.4502, + "step": 470500 + }, + { + "epoch": 0.16, + "learning_rate": 5.347474747474748e-05, + "loss": 0.4537, + "step": 470600 + }, + { + "epoch": 0.16, + "learning_rate": 5.3464646464646465e-05, + "loss": 0.4503, + "step": 470700 + }, + { + "epoch": 0.16, + "learning_rate": 5.3454545454545457e-05, + "loss": 0.4519, + "step": 470800 + }, + { + "epoch": 0.16, + "learning_rate": 5.3444444444444455e-05, + "loss": 0.4463, + "step": 470900 + }, + { + "epoch": 0.17, + "learning_rate": 5.3434343434343434e-05, + "loss": 0.4512, + "step": 471000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.44954331865745584, + "eval_average_loss_on_sentence_tokens": 0.3167648128189479, + "eval_average_shuffling_prob": 0.39, + "eval_loss": 0.44359374046325684, + "eval_non_padding_tokens_in_labels": 133.5345, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3676, + "eval_padding_tokens_in_labels": 378.4655, + "eval_reconstruction_accuracy": 0.9164466497495072, + "eval_runtime": 286.2673, + "eval_samples_per_second": 17.466, + "eval_sentence_accuracy": 0.8114917365011575, + "eval_steps_per_second": 0.045, + "eval_variance_shuffling_prob": 0.2379, + "step": 471000 + }, + { + "epoch": 0.17, + "learning_rate": 5.342424242424243e-05, + "loss": 0.4529, + "step": 471100 + }, + { + "epoch": 0.17, + "learning_rate": 5.341414141414142e-05, + "loss": 0.4534, + "step": 471200 + }, + { + "epoch": 0.17, + "learning_rate": 5.340404040404041e-05, + "loss": 0.4479, + "step": 471300 + }, + { + "epoch": 0.17, + "learning_rate": 5.3393939393939395e-05, + "loss": 0.4539, + "step": 471400 + }, + { + "epoch": 0.17, + "learning_rate": 5.3383838383838394e-05, + "loss": 0.4524, + "step": 471500 + }, + { + "epoch": 0.17, + "learning_rate": 5.337373737373738e-05, + "loss": 0.4493, + "step": 471600 + }, + { + "epoch": 0.17, + "learning_rate": 5.336363636363637e-05, + "loss": 0.4516, + "step": 471700 + }, + { + "epoch": 0.17, + "learning_rate": 5.3353535353535356e-05, + "loss": 0.4526, + "step": 471800 + }, + { + "epoch": 0.17, + "learning_rate": 5.334343434343435e-05, + "loss": 0.4522, + "step": 471900 + }, + { + "epoch": 0.17, + "learning_rate": 5.333333333333333e-05, + "loss": 0.4555, + "step": 472000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4498594885212395, + "eval_average_loss_on_sentence_tokens": 0.46124772353949434, + "eval_average_shuffling_prob": 0.595, + "eval_loss": 0.45041993260383606, + "eval_non_padding_tokens_in_labels": 133.5338, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3828, + "eval_padding_tokens_in_labels": 378.4662, + "eval_reconstruction_accuracy": 0.9163343117375259, + "eval_runtime": 310.0197, + "eval_samples_per_second": 16.128, + "eval_sentence_accuracy": 0.7077538715523893, + "eval_steps_per_second": 0.042, + "eval_variance_shuffling_prob": 0.24097500000000005, + "step": 472000 + }, + { + "epoch": 0.17, + "learning_rate": 5.332323232323233e-05, + "loss": 0.446, + "step": 472100 + }, + { + "epoch": 0.17, + "learning_rate": 5.331313131313132e-05, + "loss": 0.4522, + "step": 472200 + }, + { + "epoch": 0.17, + "learning_rate": 5.330303030303031e-05, + "loss": 0.4531, + "step": 472300 + }, + { + "epoch": 0.17, + "learning_rate": 5.3292929292929294e-05, + "loss": 0.4524, + "step": 472400 + }, + { + "epoch": 0.17, + "learning_rate": 5.328282828282829e-05, + "loss": 0.4476, + "step": 472500 + }, + { + "epoch": 0.17, + "learning_rate": 5.327272727272727e-05, + "loss": 0.4539, + "step": 472600 + }, + { + "epoch": 0.17, + "learning_rate": 5.326262626262627e-05, + "loss": 0.4524, + "step": 472700 + }, + { + "epoch": 0.17, + "learning_rate": 5.3252525252525255e-05, + "loss": 0.4515, + "step": 472800 + }, + { + "epoch": 0.17, + "learning_rate": 5.324242424242425e-05, + "loss": 0.4514, + "step": 472900 + }, + { + "epoch": 0.17, + "learning_rate": 5.323232323232323e-05, + "loss": 0.456, + "step": 473000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4499793249744797, + "eval_average_loss_on_sentence_tokens": 0.39746753575246824, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.44764649868011475, + "eval_non_padding_tokens_in_labels": 133.5564, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3728, + "eval_padding_tokens_in_labels": 378.4436, + "eval_reconstruction_accuracy": 0.9165448944782465, + "eval_runtime": 296.263, + "eval_samples_per_second": 16.877, + "eval_sentence_accuracy": 0.741610888797172, + "eval_steps_per_second": 0.044, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 473000 + }, + { + "epoch": 0.17, + "learning_rate": 5.322222222222223e-05, + "loss": 0.4511, + "step": 473100 + }, + { + "epoch": 0.17, + "learning_rate": 5.321212121212121e-05, + "loss": 0.4498, + "step": 473200 + }, + { + "epoch": 0.17, + "learning_rate": 5.320202020202021e-05, + "loss": 0.4567, + "step": 473300 + }, + { + "epoch": 0.17, + "learning_rate": 5.3191919191919194e-05, + "loss": 0.4497, + "step": 473400 + }, + { + "epoch": 0.17, + "learning_rate": 5.3181818181818186e-05, + "loss": 0.4519, + "step": 473500 + }, + { + "epoch": 0.17, + "learning_rate": 5.317171717171717e-05, + "loss": 0.4532, + "step": 473600 + }, + { + "epoch": 0.17, + "learning_rate": 5.316161616161617e-05, + "loss": 0.451, + "step": 473700 + }, + { + "epoch": 0.17, + "learning_rate": 5.3151515151515155e-05, + "loss": 0.4519, + "step": 473800 + }, + { + "epoch": 0.17, + "learning_rate": 5.314141414141415e-05, + "loss": 0.4508, + "step": 473900 + }, + { + "epoch": 0.17, + "learning_rate": 5.313131313131313e-05, + "loss": 0.4501, + "step": 474000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.44951458336337685, + "eval_average_loss_on_sentence_tokens": 0.40672118539342866, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.4475976526737213, + "eval_non_padding_tokens_in_labels": 133.54565, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3778, + "eval_padding_tokens_in_labels": 378.45435, + "eval_reconstruction_accuracy": 0.9164790371217479, + "eval_runtime": 299.1311, + "eval_samples_per_second": 16.715, + "eval_sentence_accuracy": 0.7429791662928203, + "eval_steps_per_second": 0.043, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 474000 + }, + { + "epoch": 0.17, + "learning_rate": 5.3121212121212124e-05, + "loss": 0.4506, + "step": 474100 + }, + { + "epoch": 0.17, + "learning_rate": 5.311111111111111e-05, + "loss": 0.4488, + "step": 474200 + }, + { + "epoch": 0.17, + "learning_rate": 5.310101010101011e-05, + "loss": 0.4504, + "step": 474300 + }, + { + "epoch": 0.17, + "learning_rate": 5.309090909090909e-05, + "loss": 0.4498, + "step": 474400 + }, + { + "epoch": 0.17, + "learning_rate": 5.3080808080808085e-05, + "loss": 0.4513, + "step": 474500 + }, + { + "epoch": 0.17, + "learning_rate": 5.307070707070707e-05, + "loss": 0.4516, + "step": 474600 + }, + { + "epoch": 0.17, + "learning_rate": 5.306060606060607e-05, + "loss": 0.4552, + "step": 474700 + }, + { + "epoch": 0.17, + "learning_rate": 5.305050505050505e-05, + "loss": 0.451, + "step": 474800 + }, + { + "epoch": 0.17, + "learning_rate": 5.3040404040404046e-05, + "loss": 0.4473, + "step": 474900 + }, + { + "epoch": 0.17, + "learning_rate": 5.303030303030303e-05, + "loss": 0.4511, + "step": 475000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4486851129747561, + "eval_average_loss_on_sentence_tokens": 0.43350331040370854, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.44797852635383606, + "eval_non_padding_tokens_in_labels": 133.53905, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38825, + "eval_padding_tokens_in_labels": 378.46095, + "eval_reconstruction_accuracy": 0.9164460340640765, + "eval_runtime": 321.8424, + "eval_samples_per_second": 15.536, + "eval_sentence_accuracy": 0.7294444344752095, + "eval_steps_per_second": 0.04, + "eval_variance_shuffling_prob": 0.2483999999999999, + "step": 475000 + }, + { + "epoch": 0.17, + "learning_rate": 5.3020202020202024e-05, + "loss": 0.4499, + "step": 475100 + }, + { + "epoch": 0.17, + "learning_rate": 5.301010101010101e-05, + "loss": 0.4494, + "step": 475200 + }, + { + "epoch": 0.17, + "learning_rate": 5.300000000000001e-05, + "loss": 0.4506, + "step": 475300 + }, + { + "epoch": 0.17, + "learning_rate": 5.298989898989899e-05, + "loss": 0.4514, + "step": 475400 + }, + { + "epoch": 0.17, + "learning_rate": 5.2979797979797985e-05, + "loss": 0.4476, + "step": 475500 + }, + { + "epoch": 0.17, + "learning_rate": 5.296969696969697e-05, + "loss": 0.452, + "step": 475600 + }, + { + "epoch": 0.17, + "learning_rate": 5.295959595959596e-05, + "loss": 0.4502, + "step": 475700 + }, + { + "epoch": 0.17, + "learning_rate": 5.294949494949495e-05, + "loss": 0.4475, + "step": 475800 + }, + { + "epoch": 0.17, + "learning_rate": 5.2939393939393946e-05, + "loss": 0.4496, + "step": 475900 + }, + { + "epoch": 0.17, + "learning_rate": 5.292929292929293e-05, + "loss": 0.4522, + "step": 476000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4492960775854927, + "eval_average_loss_on_sentence_tokens": 0.4034784465314479, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.44722655415534973, + "eval_non_padding_tokens_in_labels": 133.5034, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37335, + "eval_padding_tokens_in_labels": 378.4966, + "eval_reconstruction_accuracy": 0.9165668662276113, + "eval_runtime": 300.0695, + "eval_samples_per_second": 16.663, + "eval_sentence_accuracy": 0.74066879609525, + "eval_steps_per_second": 0.043, + "eval_variance_shuffling_prob": 0.24937499999999996, + "step": 476000 + }, + { + "epoch": 0.17, + "learning_rate": 5.291919191919192e-05, + "loss": 0.4473, + "step": 476100 + }, + { + "epoch": 0.17, + "learning_rate": 5.290909090909091e-05, + "loss": 0.4506, + "step": 476200 + }, + { + "epoch": 0.17, + "learning_rate": 5.28989898989899e-05, + "loss": 0.4503, + "step": 476300 + }, + { + "epoch": 0.17, + "learning_rate": 5.2888888888888885e-05, + "loss": 0.45, + "step": 476400 + }, + { + "epoch": 0.17, + "learning_rate": 5.2878787878787884e-05, + "loss": 0.4518, + "step": 476500 + }, + { + "epoch": 0.17, + "learning_rate": 5.286868686868687e-05, + "loss": 0.4479, + "step": 476600 + }, + { + "epoch": 0.17, + "learning_rate": 5.285858585858586e-05, + "loss": 0.4484, + "step": 476700 + }, + { + "epoch": 0.17, + "learning_rate": 5.2848484848484847e-05, + "loss": 0.4528, + "step": 476800 + }, + { + "epoch": 0.17, + "learning_rate": 5.2838383838383845e-05, + "loss": 0.4498, + "step": 476900 + }, + { + "epoch": 0.17, + "learning_rate": 5.2828282828282824e-05, + "loss": 0.4488, + "step": 477000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4498867907778207, + "eval_average_loss_on_sentence_tokens": 0.3936336678634654, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.4473632872104645, + "eval_non_padding_tokens_in_labels": 133.5411, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37705, + "eval_padding_tokens_in_labels": 378.4589, + "eval_reconstruction_accuracy": 0.9164457507165583, + "eval_runtime": 313.288, + "eval_samples_per_second": 15.96, + "eval_sentence_accuracy": 0.7533242413910671, + "eval_steps_per_second": 0.041, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 477000 + }, + { + "epoch": 0.17, + "learning_rate": 5.281818181818182e-05, + "loss": 0.4508, + "step": 477100 + }, + { + "epoch": 0.17, + "learning_rate": 5.280808080808081e-05, + "loss": 0.4496, + "step": 477200 + }, + { + "epoch": 0.17, + "learning_rate": 5.27979797979798e-05, + "loss": 0.4503, + "step": 477300 + }, + { + "epoch": 0.17, + "learning_rate": 5.2787878787878785e-05, + "loss": 0.4536, + "step": 477400 + }, + { + "epoch": 0.17, + "learning_rate": 5.2777777777777784e-05, + "loss": 0.4505, + "step": 477500 + }, + { + "epoch": 0.17, + "learning_rate": 5.276767676767677e-05, + "loss": 0.4491, + "step": 477600 + }, + { + "epoch": 0.17, + "learning_rate": 5.275757575757576e-05, + "loss": 0.4512, + "step": 477700 + }, + { + "epoch": 0.17, + "learning_rate": 5.2747474747474746e-05, + "loss": 0.4498, + "step": 477800 + }, + { + "epoch": 0.17, + "learning_rate": 5.273737373737374e-05, + "loss": 0.4525, + "step": 477900 + }, + { + "epoch": 0.17, + "learning_rate": 5.272727272727272e-05, + "loss": 0.4519, + "step": 478000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4488237662104686, + "eval_average_loss_on_sentence_tokens": 0.40836171131396815, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.44700196385383606, + "eval_non_padding_tokens_in_labels": 133.5309, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39055, + "eval_padding_tokens_in_labels": 378.4691, + "eval_reconstruction_accuracy": 0.9165681971472259, + "eval_runtime": 336.7205, + "eval_samples_per_second": 14.849, + "eval_sentence_accuracy": 0.7437956466344859, + "eval_steps_per_second": 0.039, + "eval_variance_shuffling_prob": 0.2496, + "step": 478000 + }, + { + "epoch": 0.17, + "learning_rate": 5.271717171717172e-05, + "loss": 0.45, + "step": 478100 + }, + { + "epoch": 0.17, + "learning_rate": 5.270707070707071e-05, + "loss": 0.452, + "step": 478200 + }, + { + "epoch": 0.17, + "learning_rate": 5.26969696969697e-05, + "loss": 0.4477, + "step": 478300 + }, + { + "epoch": 0.17, + "learning_rate": 5.2686868686868684e-05, + "loss": 0.4535, + "step": 478400 + }, + { + "epoch": 0.17, + "learning_rate": 5.267676767676768e-05, + "loss": 0.4486, + "step": 478500 + }, + { + "epoch": 0.17, + "learning_rate": 5.266666666666666e-05, + "loss": 0.453, + "step": 478600 + }, + { + "epoch": 0.17, + "learning_rate": 5.265656565656566e-05, + "loss": 0.4477, + "step": 478700 + }, + { + "epoch": 0.17, + "learning_rate": 5.2646464646464646e-05, + "loss": 0.4532, + "step": 478800 + }, + { + "epoch": 0.17, + "learning_rate": 5.263636363636364e-05, + "loss": 0.4528, + "step": 478900 + }, + { + "epoch": 0.17, + "learning_rate": 5.262626262626262e-05, + "loss": 0.45, + "step": 479000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.4490691207967045, + "eval_average_loss_on_sentence_tokens": 0.3938376092376414, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.44649413228034973, + "eval_non_padding_tokens_in_labels": 133.5161, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3704, + "eval_padding_tokens_in_labels": 378.4839, + "eval_reconstruction_accuracy": 0.9165546114039296, + "eval_runtime": 280.389, + "eval_samples_per_second": 17.832, + "eval_sentence_accuracy": 0.7553026360651031, + "eval_steps_per_second": 0.046, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 479000 + }, + { + "epoch": 0.17, + "learning_rate": 5.261616161616162e-05, + "loss": 0.4525, + "step": 479100 + }, + { + "epoch": 0.17, + "learning_rate": 5.26060606060606e-05, + "loss": 0.4513, + "step": 479200 + }, + { + "epoch": 0.17, + "learning_rate": 5.25959595959596e-05, + "loss": 0.4492, + "step": 479300 + }, + { + "epoch": 0.17, + "learning_rate": 5.258585858585859e-05, + "loss": 0.4507, + "step": 479400 + }, + { + "epoch": 0.17, + "learning_rate": 5.2575757575757576e-05, + "loss": 0.4522, + "step": 479500 + }, + { + "epoch": 0.17, + "learning_rate": 5.2565656565656575e-05, + "loss": 0.4492, + "step": 479600 + }, + { + "epoch": 0.17, + "learning_rate": 5.255555555555556e-05, + "loss": 0.4531, + "step": 479700 + }, + { + "epoch": 0.17, + "learning_rate": 5.254545454545455e-05, + "loss": 0.4487, + "step": 479800 + }, + { + "epoch": 0.17, + "learning_rate": 5.253535353535354e-05, + "loss": 0.4522, + "step": 479900 + }, + { + "epoch": 0.17, + "learning_rate": 5.2525252525252536e-05, + "loss": 0.4469, + "step": 480000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.44918038051584747, + "eval_average_loss_on_sentence_tokens": 0.3955316651192687, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.44676756858825684, + "eval_non_padding_tokens_in_labels": 133.52385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3747, + "eval_padding_tokens_in_labels": 378.47615, + "eval_reconstruction_accuracy": 0.9165593949736748, + "eval_runtime": 299.6243, + "eval_samples_per_second": 16.688, + "eval_sentence_accuracy": 0.7511394835537532, + "eval_steps_per_second": 0.043, + "eval_variance_shuffling_prob": 0.2499, + "step": 480000 + }, + { + "epoch": 0.17, + "learning_rate": 5.2515151515151514e-05, + "loss": 0.4494, + "step": 480100 + }, + { + "epoch": 0.17, + "learning_rate": 5.250505050505051e-05, + "loss": 0.4512, + "step": 480200 + }, + { + "epoch": 0.17, + "learning_rate": 5.24949494949495e-05, + "loss": 0.4478, + "step": 480300 + }, + { + "epoch": 0.17, + "learning_rate": 5.248484848484849e-05, + "loss": 0.4486, + "step": 480400 + }, + { + "epoch": 0.17, + "learning_rate": 5.2474747474747475e-05, + "loss": 0.4482, + "step": 480500 + }, + { + "epoch": 0.17, + "learning_rate": 5.2464646464646474e-05, + "loss": 0.4505, + "step": 480600 + }, + { + "epoch": 0.17, + "learning_rate": 5.245454545454546e-05, + "loss": 0.4501, + "step": 480700 + }, + { + "epoch": 0.17, + "learning_rate": 5.244444444444445e-05, + "loss": 0.4525, + "step": 480800 + }, + { + "epoch": 0.17, + "learning_rate": 5.2434343434343436e-05, + "loss": 0.4479, + "step": 480900 + }, + { + "epoch": 0.17, + "learning_rate": 5.242424242424243e-05, + "loss": 0.4459, + "step": 481000 + }, + { + "epoch": 0.17, + "eval_average_loss_on_non_sentence_tokens": 0.45068648413470025, + "eval_average_loss_on_sentence_tokens": 0.39982860267797243, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.4483984410762787, + "eval_non_padding_tokens_in_labels": 133.5735, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38335, + "eval_padding_tokens_in_labels": 378.4265, + "eval_reconstruction_accuracy": 0.9163722238750243, + "eval_runtime": 267.2199, + "eval_samples_per_second": 18.711, + "eval_sentence_accuracy": 0.7407585192097188, + "eval_steps_per_second": 0.049, + "eval_variance_shuffling_prob": 0.248775, + "step": 481000 + }, + { + "epoch": 0.18, + "learning_rate": 5.2414141414141414e-05, + "loss": 0.4493, + "step": 481100 + }, + { + "epoch": 0.18, + "learning_rate": 5.240404040404041e-05, + "loss": 0.45, + "step": 481200 + }, + { + "epoch": 0.18, + "learning_rate": 5.23939393939394e-05, + "loss": 0.4546, + "step": 481300 + }, + { + "epoch": 0.18, + "learning_rate": 5.238383838383839e-05, + "loss": 0.4507, + "step": 481400 + }, + { + "epoch": 0.18, + "learning_rate": 5.2373737373737375e-05, + "loss": 0.4475, + "step": 481500 + }, + { + "epoch": 0.18, + "learning_rate": 5.2363636363636374e-05, + "loss": 0.4512, + "step": 481600 + }, + { + "epoch": 0.18, + "learning_rate": 5.235353535353535e-05, + "loss": 0.4515, + "step": 481700 + }, + { + "epoch": 0.18, + "learning_rate": 5.234343434343435e-05, + "loss": 0.4466, + "step": 481800 + }, + { + "epoch": 0.18, + "learning_rate": 5.2333333333333336e-05, + "loss": 0.45, + "step": 481900 + }, + { + "epoch": 0.18, + "learning_rate": 5.232323232323233e-05, + "loss": 0.4467, + "step": 482000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.449827123038118, + "eval_average_loss_on_sentence_tokens": 0.4248546879666541, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.44869139790534973, + "eval_non_padding_tokens_in_labels": 133.53495, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.385, + "eval_padding_tokens_in_labels": 378.46505, + "eval_reconstruction_accuracy": 0.916509760965562, + "eval_runtime": 301.8372, + "eval_samples_per_second": 16.565, + "eval_sentence_accuracy": 0.7252588511852424, + "eval_steps_per_second": 0.043, + "eval_variance_shuffling_prob": 0.246975, + "step": 482000 + }, + { + "epoch": 0.18, + "learning_rate": 5.231313131313131e-05, + "loss": 0.4505, + "step": 482100 + }, + { + "epoch": 0.18, + "learning_rate": 5.230303030303031e-05, + "loss": 0.4497, + "step": 482200 + }, + { + "epoch": 0.18, + "learning_rate": 5.229292929292929e-05, + "loss": 0.4541, + "step": 482300 + }, + { + "epoch": 0.18, + "learning_rate": 5.228282828282829e-05, + "loss": 0.448, + "step": 482400 + }, + { + "epoch": 0.18, + "learning_rate": 5.2272727272727274e-05, + "loss": 0.4505, + "step": 482500 + }, + { + "epoch": 0.18, + "learning_rate": 5.2262626262626266e-05, + "loss": 0.4487, + "step": 482600 + }, + { + "epoch": 0.18, + "learning_rate": 5.225252525252525e-05, + "loss": 0.4507, + "step": 482700 + }, + { + "epoch": 0.18, + "learning_rate": 5.224242424242425e-05, + "loss": 0.451, + "step": 482800 + }, + { + "epoch": 0.18, + "learning_rate": 5.2232323232323235e-05, + "loss": 0.4516, + "step": 482900 + }, + { + "epoch": 0.18, + "learning_rate": 5.222222222222223e-05, + "loss": 0.4492, + "step": 483000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.44936800261580345, + "eval_average_loss_on_sentence_tokens": 0.3907056500045448, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.4467187523841858, + "eval_non_padding_tokens_in_labels": 133.5131, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38775, + "eval_padding_tokens_in_labels": 378.4869, + "eval_reconstruction_accuracy": 0.9166458929236928, + "eval_runtime": 309.4025, + "eval_samples_per_second": 16.16, + "eval_sentence_accuracy": 0.7484253593410735, + "eval_steps_per_second": 0.042, + "eval_variance_shuffling_prob": 0.2499, + "step": 483000 + }, + { + "epoch": 0.18, + "learning_rate": 5.221212121212121e-05, + "loss": 0.4494, + "step": 483100 + }, + { + "epoch": 0.18, + "learning_rate": 5.2202020202020205e-05, + "loss": 0.4482, + "step": 483200 + }, + { + "epoch": 0.18, + "learning_rate": 5.219191919191919e-05, + "loss": 0.4509, + "step": 483300 + }, + { + "epoch": 0.18, + "learning_rate": 5.218181818181819e-05, + "loss": 0.4511, + "step": 483400 + }, + { + "epoch": 0.18, + "learning_rate": 5.2171717171717174e-05, + "loss": 0.4481, + "step": 483500 + }, + { + "epoch": 0.18, + "learning_rate": 5.2161616161616166e-05, + "loss": 0.4474, + "step": 483600 + }, + { + "epoch": 0.18, + "learning_rate": 5.215151515151515e-05, + "loss": 0.4462, + "step": 483700 + }, + { + "epoch": 0.18, + "learning_rate": 5.214141414141415e-05, + "loss": 0.4459, + "step": 483800 + }, + { + "epoch": 0.18, + "learning_rate": 5.213131313131313e-05, + "loss": 0.4514, + "step": 483900 + }, + { + "epoch": 0.18, + "learning_rate": 5.212121212121213e-05, + "loss": 0.4464, + "step": 484000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.44861494275327174, + "eval_average_loss_on_sentence_tokens": 0.394208058210299, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.4462011754512787, + "eval_non_padding_tokens_in_labels": 133.56905, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39355, + "eval_padding_tokens_in_labels": 378.43095, + "eval_reconstruction_accuracy": 0.9165853687756198, + "eval_runtime": 355.3659, + "eval_samples_per_second": 14.07, + "eval_sentence_accuracy": 0.7562940764799828, + "eval_steps_per_second": 0.037, + "eval_variance_shuffling_prob": 0.25, + "step": 484000 + }, + { + "epoch": 0.18, + "learning_rate": 5.211111111111111e-05, + "loss": 0.4489, + "step": 484100 + }, + { + "epoch": 0.18, + "learning_rate": 5.2101010101010104e-05, + "loss": 0.451, + "step": 484200 + }, + { + "epoch": 0.18, + "learning_rate": 5.209090909090909e-05, + "loss": 0.4452, + "step": 484300 + }, + { + "epoch": 0.18, + "learning_rate": 5.208080808080809e-05, + "loss": 0.4494, + "step": 484400 + }, + { + "epoch": 0.18, + "learning_rate": 5.207070707070707e-05, + "loss": 0.4494, + "step": 484500 + }, + { + "epoch": 0.18, + "learning_rate": 5.2060606060606065e-05, + "loss": 0.4509, + "step": 484600 + }, + { + "epoch": 0.18, + "learning_rate": 5.205050505050505e-05, + "loss": 0.4503, + "step": 484700 + }, + { + "epoch": 0.18, + "learning_rate": 5.204040404040404e-05, + "loss": 0.4521, + "step": 484800 + }, + { + "epoch": 0.18, + "learning_rate": 5.203030303030303e-05, + "loss": 0.4498, + "step": 484900 + }, + { + "epoch": 0.18, + "learning_rate": 5.2020202020202026e-05, + "loss": 0.4507, + "step": 485000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.44885568557101174, + "eval_average_loss_on_sentence_tokens": 0.3652889955114637, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.44505858421325684, + "eval_non_padding_tokens_in_labels": 133.5346, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3791, + "eval_padding_tokens_in_labels": 378.4654, + "eval_reconstruction_accuracy": 0.9166430060096072, + "eval_runtime": 294.2724, + "eval_samples_per_second": 16.991, + "eval_sentence_accuracy": 0.7738349453586233, + "eval_steps_per_second": 0.044, + "eval_variance_shuffling_prob": 0.2484, + "step": 485000 + }, + { + "epoch": 0.18, + "learning_rate": 5.201010101010101e-05, + "loss": 0.4452, + "step": 485100 + }, + { + "epoch": 0.18, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.4496, + "step": 485200 + }, + { + "epoch": 0.18, + "learning_rate": 5.198989898989899e-05, + "loss": 0.4489, + "step": 485300 + }, + { + "epoch": 0.18, + "learning_rate": 5.197979797979798e-05, + "loss": 0.4511, + "step": 485400 + }, + { + "epoch": 0.18, + "learning_rate": 5.1969696969696966e-05, + "loss": 0.4507, + "step": 485500 + }, + { + "epoch": 0.18, + "learning_rate": 5.1959595959595965e-05, + "loss": 0.4468, + "step": 485600 + }, + { + "epoch": 0.18, + "learning_rate": 5.194949494949495e-05, + "loss": 0.4496, + "step": 485700 + }, + { + "epoch": 0.18, + "learning_rate": 5.193939393939394e-05, + "loss": 0.4462, + "step": 485800 + }, + { + "epoch": 0.18, + "learning_rate": 5.192929292929293e-05, + "loss": 0.4476, + "step": 485900 + }, + { + "epoch": 0.18, + "learning_rate": 5.1919191919191926e-05, + "loss": 0.4502, + "step": 486000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4485982020518786, + "eval_average_loss_on_sentence_tokens": 0.4006907630619868, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4464160203933716, + "eval_non_padding_tokens_in_labels": 133.556, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37575, + "eval_padding_tokens_in_labels": 378.444, + "eval_reconstruction_accuracy": 0.9165727978362398, + "eval_runtime": 286.2144, + "eval_samples_per_second": 17.469, + "eval_sentence_accuracy": 0.7490444488309078, + "eval_steps_per_second": 0.045, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 486000 + }, + { + "epoch": 0.18, + "learning_rate": 5.1909090909090904e-05, + "loss": 0.45, + "step": 486100 + }, + { + "epoch": 0.18, + "learning_rate": 5.18989898989899e-05, + "loss": 0.45, + "step": 486200 + }, + { + "epoch": 0.18, + "learning_rate": 5.188888888888889e-05, + "loss": 0.4534, + "step": 486300 + }, + { + "epoch": 0.18, + "learning_rate": 5.187878787878788e-05, + "loss": 0.4535, + "step": 486400 + }, + { + "epoch": 0.18, + "learning_rate": 5.1868686868686865e-05, + "loss": 0.4483, + "step": 486500 + }, + { + "epoch": 0.18, + "learning_rate": 5.1858585858585864e-05, + "loss": 0.4496, + "step": 486600 + }, + { + "epoch": 0.18, + "learning_rate": 5.184848484848485e-05, + "loss": 0.4528, + "step": 486700 + }, + { + "epoch": 0.18, + "learning_rate": 5.183838383838384e-05, + "loss": 0.4475, + "step": 486800 + }, + { + "epoch": 0.18, + "learning_rate": 5.1828282828282827e-05, + "loss": 0.4488, + "step": 486900 + }, + { + "epoch": 0.18, + "learning_rate": 5.181818181818182e-05, + "loss": 0.4486, + "step": 487000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4484604575635672, + "eval_average_loss_on_sentence_tokens": 0.37268356955499865, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.4450390636920929, + "eval_non_padding_tokens_in_labels": 133.5131, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37855, + "eval_padding_tokens_in_labels": 378.4869, + "eval_reconstruction_accuracy": 0.9166586882768385, + "eval_runtime": 248.0333, + "eval_samples_per_second": 20.159, + "eval_sentence_accuracy": 0.7699813375921905, + "eval_steps_per_second": 0.052, + "eval_variance_shuffling_prob": 0.2490999999999999, + "step": 487000 + }, + { + "epoch": 0.18, + "learning_rate": 5.1808080808080804e-05, + "loss": 0.4498, + "step": 487100 + }, + { + "epoch": 0.18, + "learning_rate": 5.17979797979798e-05, + "loss": 0.4519, + "step": 487200 + }, + { + "epoch": 0.18, + "learning_rate": 5.178787878787879e-05, + "loss": 0.4491, + "step": 487300 + }, + { + "epoch": 0.18, + "learning_rate": 5.177777777777778e-05, + "loss": 0.4501, + "step": 487400 + }, + { + "epoch": 0.18, + "learning_rate": 5.1767676767676765e-05, + "loss": 0.4484, + "step": 487500 + }, + { + "epoch": 0.18, + "learning_rate": 5.175757575757576e-05, + "loss": 0.4478, + "step": 487600 + }, + { + "epoch": 0.18, + "learning_rate": 5.174747474747474e-05, + "loss": 0.4451, + "step": 487700 + }, + { + "epoch": 0.18, + "learning_rate": 5.173737373737374e-05, + "loss": 0.4487, + "step": 487800 + }, + { + "epoch": 0.18, + "learning_rate": 5.1727272727272726e-05, + "loss": 0.4507, + "step": 487900 + }, + { + "epoch": 0.18, + "learning_rate": 5.171717171717172e-05, + "loss": 0.4476, + "step": 488000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4483856817371281, + "eval_average_loss_on_sentence_tokens": 0.44273484469751073, + "eval_average_shuffling_prob": 0.585, + "eval_loss": 0.4480957090854645, + "eval_non_padding_tokens_in_labels": 133.5341, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38045, + "eval_padding_tokens_in_labels": 378.4659, + "eval_reconstruction_accuracy": 0.9165758260588468, + "eval_runtime": 272.9021, + "eval_samples_per_second": 18.322, + "eval_sentence_accuracy": 0.7146132036535252, + "eval_steps_per_second": 0.048, + "eval_variance_shuffling_prob": 0.242775, + "step": 488000 + }, + { + "epoch": 0.18, + "learning_rate": 5.170707070707072e-05, + "loss": 0.4495, + "step": 488100 + }, + { + "epoch": 0.18, + "learning_rate": 5.16969696969697e-05, + "loss": 0.4499, + "step": 488200 + }, + { + "epoch": 0.18, + "learning_rate": 5.1686868686868694e-05, + "loss": 0.4535, + "step": 488300 + }, + { + "epoch": 0.18, + "learning_rate": 5.167676767676768e-05, + "loss": 0.4487, + "step": 488400 + }, + { + "epoch": 0.18, + "learning_rate": 5.166666666666667e-05, + "loss": 0.4503, + "step": 488500 + }, + { + "epoch": 0.18, + "learning_rate": 5.1656565656565656e-05, + "loss": 0.4482, + "step": 488600 + }, + { + "epoch": 0.18, + "learning_rate": 5.1646464646464655e-05, + "loss": 0.4493, + "step": 488700 + }, + { + "epoch": 0.18, + "learning_rate": 5.163636363636364e-05, + "loss": 0.4528, + "step": 488800 + }, + { + "epoch": 0.18, + "learning_rate": 5.162626262626263e-05, + "loss": 0.4499, + "step": 488900 + }, + { + "epoch": 0.18, + "learning_rate": 5.161616161616162e-05, + "loss": 0.4493, + "step": 489000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.44889540635543673, + "eval_average_loss_on_sentence_tokens": 0.35353539862270755, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.44458985328674316, + "eval_non_padding_tokens_in_labels": 133.5733, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38535, + "eval_padding_tokens_in_labels": 378.4267, + "eval_reconstruction_accuracy": 0.9165386214645722, + "eval_runtime": 245.3959, + "eval_samples_per_second": 20.375, + "eval_sentence_accuracy": 0.7794112369228561, + "eval_steps_per_second": 0.053, + "eval_variance_shuffling_prob": 0.24750000000000008, + "step": 489000 + }, + { + "epoch": 0.18, + "learning_rate": 5.1606060606060616e-05, + "loss": 0.4506, + "step": 489100 + }, + { + "epoch": 0.18, + "learning_rate": 5.1595959595959595e-05, + "loss": 0.4468, + "step": 489200 + }, + { + "epoch": 0.18, + "learning_rate": 5.1585858585858593e-05, + "loss": 0.4485, + "step": 489300 + }, + { + "epoch": 0.18, + "learning_rate": 5.157575757575758e-05, + "loss": 0.45, + "step": 489400 + }, + { + "epoch": 0.18, + "learning_rate": 5.156565656565657e-05, + "loss": 0.4496, + "step": 489500 + }, + { + "epoch": 0.18, + "learning_rate": 5.1555555555555556e-05, + "loss": 0.45, + "step": 489600 + }, + { + "epoch": 0.18, + "learning_rate": 5.1545454545454555e-05, + "loss": 0.4477, + "step": 489700 + }, + { + "epoch": 0.18, + "learning_rate": 5.153535353535354e-05, + "loss": 0.4523, + "step": 489800 + }, + { + "epoch": 0.18, + "learning_rate": 5.152525252525253e-05, + "loss": 0.4486, + "step": 489900 + }, + { + "epoch": 0.18, + "learning_rate": 5.151515151515152e-05, + "loss": 0.4512, + "step": 490000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.44827673317477207, + "eval_average_loss_on_sentence_tokens": 0.4083652281275835, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.44639647006988525, + "eval_non_padding_tokens_in_labels": 133.516, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37995, + "eval_padding_tokens_in_labels": 378.484, + "eval_reconstruction_accuracy": 0.9165857930929692, + "eval_runtime": 242.645, + "eval_samples_per_second": 20.606, + "eval_sentence_accuracy": 0.7480485222603047, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 490000 + }, + { + "epoch": 0.18, + "learning_rate": 5.150505050505051e-05, + "loss": 0.4522, + "step": 490100 + }, + { + "epoch": 0.18, + "learning_rate": 5.1494949494949494e-05, + "loss": 0.4475, + "step": 490200 + }, + { + "epoch": 0.18, + "learning_rate": 5.148484848484849e-05, + "loss": 0.449, + "step": 490300 + }, + { + "epoch": 0.18, + "learning_rate": 5.147474747474748e-05, + "loss": 0.4518, + "step": 490400 + }, + { + "epoch": 0.18, + "learning_rate": 5.146464646464647e-05, + "loss": 0.4479, + "step": 490500 + }, + { + "epoch": 0.18, + "learning_rate": 5.1454545454545455e-05, + "loss": 0.449, + "step": 490600 + }, + { + "epoch": 0.18, + "learning_rate": 5.144444444444445e-05, + "loss": 0.4515, + "step": 490700 + }, + { + "epoch": 0.18, + "learning_rate": 5.143434343434343e-05, + "loss": 0.4502, + "step": 490800 + }, + { + "epoch": 0.18, + "learning_rate": 5.142424242424243e-05, + "loss": 0.4481, + "step": 490900 + }, + { + "epoch": 0.18, + "learning_rate": 5.1414141414141416e-05, + "loss": 0.446, + "step": 491000 + }, + { + "epoch": 0.18, + "eval_average_loss_on_non_sentence_tokens": 0.4485890010401464, + "eval_average_loss_on_sentence_tokens": 0.4011039210307618, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.44648438692092896, + "eval_non_padding_tokens_in_labels": 133.513, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37625, + "eval_padding_tokens_in_labels": 378.487, + "eval_reconstruction_accuracy": 0.9166277314082144, + "eval_runtime": 253.3385, + "eval_samples_per_second": 19.736, + "eval_sentence_accuracy": 0.7450607425484953, + "eval_steps_per_second": 0.051, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 491000 + }, + { + "epoch": 0.19, + "learning_rate": 5.140404040404041e-05, + "loss": 0.4504, + "step": 491100 + }, + { + "epoch": 0.19, + "learning_rate": 5.1393939393939394e-05, + "loss": 0.4477, + "step": 491200 + }, + { + "epoch": 0.19, + "learning_rate": 5.138383838383839e-05, + "loss": 0.4507, + "step": 491300 + }, + { + "epoch": 0.19, + "learning_rate": 5.137373737373737e-05, + "loss": 0.45, + "step": 491400 + }, + { + "epoch": 0.19, + "learning_rate": 5.136363636363637e-05, + "loss": 0.4499, + "step": 491500 + }, + { + "epoch": 0.19, + "learning_rate": 5.1353535353535355e-05, + "loss": 0.4476, + "step": 491600 + }, + { + "epoch": 0.19, + "learning_rate": 5.134343434343435e-05, + "loss": 0.4511, + "step": 491700 + }, + { + "epoch": 0.19, + "learning_rate": 5.133333333333333e-05, + "loss": 0.4469, + "step": 491800 + }, + { + "epoch": 0.19, + "learning_rate": 5.132323232323233e-05, + "loss": 0.451, + "step": 491900 + }, + { + "epoch": 0.19, + "learning_rate": 5.1313131313131316e-05, + "loss": 0.4511, + "step": 492000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.4477509351585878, + "eval_average_loss_on_sentence_tokens": 0.39500678068717454, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.44536131620407104, + "eval_non_padding_tokens_in_labels": 133.5425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39565, + "eval_padding_tokens_in_labels": 378.4575, + "eval_reconstruction_accuracy": 0.9168402111525576, + "eval_runtime": 235.2313, + "eval_samples_per_second": 21.256, + "eval_sentence_accuracy": 0.7492238950598453, + "eval_steps_per_second": 0.055, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 492000 + }, + { + "epoch": 0.19, + "learning_rate": 5.130303030303031e-05, + "loss": 0.4464, + "step": 492100 + }, + { + "epoch": 0.19, + "learning_rate": 5.129292929292929e-05, + "loss": 0.4508, + "step": 492200 + }, + { + "epoch": 0.19, + "learning_rate": 5.1282828282828285e-05, + "loss": 0.4467, + "step": 492300 + }, + { + "epoch": 0.19, + "learning_rate": 5.127272727272727e-05, + "loss": 0.4475, + "step": 492400 + }, + { + "epoch": 0.19, + "learning_rate": 5.126262626262627e-05, + "loss": 0.4476, + "step": 492500 + }, + { + "epoch": 0.19, + "learning_rate": 5.1252525252525254e-05, + "loss": 0.45, + "step": 492600 + }, + { + "epoch": 0.19, + "learning_rate": 5.1242424242424246e-05, + "loss": 0.4469, + "step": 492700 + }, + { + "epoch": 0.19, + "learning_rate": 5.123232323232323e-05, + "loss": 0.4503, + "step": 492800 + }, + { + "epoch": 0.19, + "learning_rate": 5.122222222222223e-05, + "loss": 0.4495, + "step": 492900 + }, + { + "epoch": 0.19, + "learning_rate": 5.121212121212121e-05, + "loss": 0.4493, + "step": 493000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.44749140219365, + "eval_average_loss_on_sentence_tokens": 0.35225297901165636, + "eval_average_shuffling_prob": 0.425, + "eval_loss": 0.44319334626197815, + "eval_non_padding_tokens_in_labels": 133.54575, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3854, + "eval_padding_tokens_in_labels": 378.45425, + "eval_reconstruction_accuracy": 0.9166242373200147, + "eval_runtime": 242.082, + "eval_samples_per_second": 20.654, + "eval_sentence_accuracy": 0.7942155508101997, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.244375, + "step": 493000 + }, + { + "epoch": 0.19, + "learning_rate": 5.120202020202021e-05, + "loss": 0.4519, + "step": 493100 + }, + { + "epoch": 0.19, + "learning_rate": 5.119191919191919e-05, + "loss": 0.4526, + "step": 493200 + }, + { + "epoch": 0.19, + "learning_rate": 5.1181818181818185e-05, + "loss": 0.4468, + "step": 493300 + }, + { + "epoch": 0.19, + "learning_rate": 5.117171717171717e-05, + "loss": 0.4552, + "step": 493400 + }, + { + "epoch": 0.19, + "learning_rate": 5.116161616161617e-05, + "loss": 0.4489, + "step": 493500 + }, + { + "epoch": 0.19, + "learning_rate": 5.115151515151515e-05, + "loss": 0.4534, + "step": 493600 + }, + { + "epoch": 0.19, + "learning_rate": 5.1141414141414146e-05, + "loss": 0.4492, + "step": 493700 + }, + { + "epoch": 0.19, + "learning_rate": 5.113131313131313e-05, + "loss": 0.45, + "step": 493800 + }, + { + "epoch": 0.19, + "learning_rate": 5.112121212121212e-05, + "loss": 0.4494, + "step": 493900 + }, + { + "epoch": 0.19, + "learning_rate": 5.111111111111111e-05, + "loss": 0.4494, + "step": 494000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.4478022731042992, + "eval_average_loss_on_sentence_tokens": 0.420166606339787, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.4465917944908142, + "eval_non_padding_tokens_in_labels": 133.52845, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37725, + "eval_padding_tokens_in_labels": 378.47155, + "eval_reconstruction_accuracy": 0.9163385787466914, + "eval_runtime": 263.7625, + "eval_samples_per_second": 18.956, + "eval_sentence_accuracy": 0.7553789007124015, + "eval_steps_per_second": 0.049, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 494000 + }, + { + "epoch": 0.19, + "learning_rate": 5.110101010101011e-05, + "loss": 0.449, + "step": 494100 + }, + { + "epoch": 0.19, + "learning_rate": 5.109090909090909e-05, + "loss": 0.4505, + "step": 494200 + }, + { + "epoch": 0.19, + "learning_rate": 5.1080808080808084e-05, + "loss": 0.449, + "step": 494300 + }, + { + "epoch": 0.19, + "learning_rate": 5.107070707070707e-05, + "loss": 0.4487, + "step": 494400 + }, + { + "epoch": 0.19, + "learning_rate": 5.106060606060606e-05, + "loss": 0.449, + "step": 494500 + }, + { + "epoch": 0.19, + "learning_rate": 5.1050505050505046e-05, + "loss": 0.4492, + "step": 494600 + }, + { + "epoch": 0.19, + "learning_rate": 5.1040404040404045e-05, + "loss": 0.4505, + "step": 494700 + }, + { + "epoch": 0.19, + "learning_rate": 5.103030303030303e-05, + "loss": 0.445, + "step": 494800 + }, + { + "epoch": 0.19, + "learning_rate": 5.102020202020202e-05, + "loss": 0.4486, + "step": 494900 + }, + { + "epoch": 0.19, + "learning_rate": 5.101010101010101e-05, + "loss": 0.4494, + "step": 495000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.44793023845921837, + "eval_average_loss_on_sentence_tokens": 0.3796614350006276, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.44487303495407104, + "eval_non_padding_tokens_in_labels": 133.5232, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38795, + "eval_padding_tokens_in_labels": 378.4768, + "eval_reconstruction_accuracy": 0.9167642792685808, + "eval_runtime": 228.6571, + "eval_samples_per_second": 21.867, + "eval_sentence_accuracy": 0.7630636854666499, + "eval_steps_per_second": 0.057, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 495000 + }, + { + "epoch": 0.19, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.4449, + "step": 495100 + }, + { + "epoch": 0.19, + "learning_rate": 5.0989898989898985e-05, + "loss": 0.4506, + "step": 495200 + }, + { + "epoch": 0.19, + "learning_rate": 5.0979797979797984e-05, + "loss": 0.4506, + "step": 495300 + }, + { + "epoch": 0.19, + "learning_rate": 5.096969696969697e-05, + "loss": 0.4495, + "step": 495400 + }, + { + "epoch": 0.19, + "learning_rate": 5.095959595959596e-05, + "loss": 0.4493, + "step": 495500 + }, + { + "epoch": 0.19, + "learning_rate": 5.0949494949494946e-05, + "loss": 0.4498, + "step": 495600 + }, + { + "epoch": 0.19, + "learning_rate": 5.0939393939393945e-05, + "loss": 0.4486, + "step": 495700 + }, + { + "epoch": 0.19, + "learning_rate": 5.092929292929293e-05, + "loss": 0.4472, + "step": 495800 + }, + { + "epoch": 0.19, + "learning_rate": 5.091919191919192e-05, + "loss": 0.4494, + "step": 495900 + }, + { + "epoch": 0.19, + "learning_rate": 5.090909090909091e-05, + "loss": 0.452, + "step": 496000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.44850401499528864, + "eval_average_loss_on_sentence_tokens": 0.3894171097851822, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4458300769329071, + "eval_non_padding_tokens_in_labels": 133.53425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37455, + "eval_padding_tokens_in_labels": 378.46575, + "eval_reconstruction_accuracy": 0.916500551233354, + "eval_runtime": 239.7034, + "eval_samples_per_second": 20.859, + "eval_sentence_accuracy": 0.7641089597502109, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.2496, + "step": 496000 + }, + { + "epoch": 0.19, + "learning_rate": 5.08989898989899e-05, + "loss": 0.4494, + "step": 496100 + }, + { + "epoch": 0.19, + "learning_rate": 5.0888888888888884e-05, + "loss": 0.4468, + "step": 496200 + }, + { + "epoch": 0.19, + "learning_rate": 5.087878787878788e-05, + "loss": 0.4511, + "step": 496300 + }, + { + "epoch": 0.19, + "learning_rate": 5.086868686868687e-05, + "loss": 0.4483, + "step": 496400 + }, + { + "epoch": 0.19, + "learning_rate": 5.085858585858586e-05, + "loss": 0.4458, + "step": 496500 + }, + { + "epoch": 0.19, + "learning_rate": 5.084848484848486e-05, + "loss": 0.4498, + "step": 496600 + }, + { + "epoch": 0.19, + "learning_rate": 5.083838383838384e-05, + "loss": 0.4484, + "step": 496700 + }, + { + "epoch": 0.19, + "learning_rate": 5.0828282828282836e-05, + "loss": 0.4548, + "step": 496800 + }, + { + "epoch": 0.19, + "learning_rate": 5.081818181818182e-05, + "loss": 0.4473, + "step": 496900 + }, + { + "epoch": 0.19, + "learning_rate": 5.080808080808081e-05, + "loss": 0.4469, + "step": 497000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.44821296435031227, + "eval_average_loss_on_sentence_tokens": 0.3614072786679825, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.4442773461341858, + "eval_non_padding_tokens_in_labels": 133.5619, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38375, + "eval_padding_tokens_in_labels": 378.4381, + "eval_reconstruction_accuracy": 0.9167218663540941, + "eval_runtime": 265.365, + "eval_samples_per_second": 18.842, + "eval_sentence_accuracy": 0.7754813645091249, + "eval_steps_per_second": 0.049, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 497000 + }, + { + "epoch": 0.19, + "learning_rate": 5.07979797979798e-05, + "loss": 0.447, + "step": 497100 + }, + { + "epoch": 0.19, + "learning_rate": 5.07878787878788e-05, + "loss": 0.4481, + "step": 497200 + }, + { + "epoch": 0.19, + "learning_rate": 5.077777777777778e-05, + "loss": 0.4497, + "step": 497300 + }, + { + "epoch": 0.19, + "learning_rate": 5.0767676767676774e-05, + "loss": 0.4485, + "step": 497400 + }, + { + "epoch": 0.19, + "learning_rate": 5.075757575757576e-05, + "loss": 0.4476, + "step": 497500 + }, + { + "epoch": 0.19, + "learning_rate": 5.074747474747475e-05, + "loss": 0.4525, + "step": 497600 + }, + { + "epoch": 0.19, + "learning_rate": 5.073737373737374e-05, + "loss": 0.4461, + "step": 497700 + }, + { + "epoch": 0.19, + "learning_rate": 5.0727272727272736e-05, + "loss": 0.4485, + "step": 497800 + }, + { + "epoch": 0.19, + "learning_rate": 5.071717171717172e-05, + "loss": 0.4496, + "step": 497900 + }, + { + "epoch": 0.19, + "learning_rate": 5.070707070707071e-05, + "loss": 0.4473, + "step": 498000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.4473119889591081, + "eval_average_loss_on_sentence_tokens": 0.386873946308536, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.44462889432907104, + "eval_non_padding_tokens_in_labels": 133.52655, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39065, + "eval_padding_tokens_in_labels": 378.47345, + "eval_reconstruction_accuracy": 0.9166979230238037, + "eval_runtime": 298.7622, + "eval_samples_per_second": 16.736, + "eval_sentence_accuracy": 0.7618255064869812, + "eval_steps_per_second": 0.044, + "eval_variance_shuffling_prob": 0.2499, + "step": 498000 + }, + { + "epoch": 0.19, + "learning_rate": 5.06969696969697e-05, + "loss": 0.4503, + "step": 498100 + }, + { + "epoch": 0.19, + "learning_rate": 5.06868686868687e-05, + "loss": 0.4485, + "step": 498200 + }, + { + "epoch": 0.19, + "learning_rate": 5.0676767676767675e-05, + "loss": 0.4495, + "step": 498300 + }, + { + "epoch": 0.19, + "learning_rate": 5.0666666666666674e-05, + "loss": 0.4505, + "step": 498400 + }, + { + "epoch": 0.19, + "learning_rate": 5.065656565656566e-05, + "loss": 0.4499, + "step": 498500 + }, + { + "epoch": 0.19, + "learning_rate": 5.064646464646465e-05, + "loss": 0.4445, + "step": 498600 + }, + { + "epoch": 0.19, + "learning_rate": 5.0636363636363636e-05, + "loss": 0.4521, + "step": 498700 + }, + { + "epoch": 0.19, + "learning_rate": 5.0626262626262635e-05, + "loss": 0.4479, + "step": 498800 + }, + { + "epoch": 0.19, + "learning_rate": 5.061616161616162e-05, + "loss": 0.4514, + "step": 498900 + }, + { + "epoch": 0.19, + "learning_rate": 5.060606060606061e-05, + "loss": 0.4499, + "step": 499000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.4483703984318253, + "eval_average_loss_on_sentence_tokens": 0.35495331700449717, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.44416990876197815, + "eval_non_padding_tokens_in_labels": 133.5205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37235, + "eval_padding_tokens_in_labels": 378.4795, + "eval_reconstruction_accuracy": 0.9165742204970928, + "eval_runtime": 277.4969, + "eval_samples_per_second": 18.018, + "eval_sentence_accuracy": 0.7865217937445045, + "eval_steps_per_second": 0.047, + "eval_variance_shuffling_prob": 0.245775, + "step": 499000 + }, + { + "epoch": 0.19, + "learning_rate": 5.05959595959596e-05, + "loss": 0.4471, + "step": 499100 + }, + { + "epoch": 0.19, + "learning_rate": 5.058585858585859e-05, + "loss": 0.4499, + "step": 499200 + }, + { + "epoch": 0.19, + "learning_rate": 5.0575757575757575e-05, + "loss": 0.4459, + "step": 499300 + }, + { + "epoch": 0.19, + "learning_rate": 5.0565656565656573e-05, + "loss": 0.4493, + "step": 499400 + }, + { + "epoch": 0.19, + "learning_rate": 5.055555555555556e-05, + "loss": 0.4508, + "step": 499500 + }, + { + "epoch": 0.19, + "learning_rate": 5.054545454545455e-05, + "loss": 0.4483, + "step": 499600 + }, + { + "epoch": 0.19, + "learning_rate": 5.0535353535353536e-05, + "loss": 0.4505, + "step": 499700 + }, + { + "epoch": 0.19, + "learning_rate": 5.052525252525253e-05, + "loss": 0.4487, + "step": 499800 + }, + { + "epoch": 0.19, + "learning_rate": 5.051515151515151e-05, + "loss": 0.4497, + "step": 499900 + }, + { + "epoch": 0.19, + "learning_rate": 5.050505050505051e-05, + "loss": 0.4501, + "step": 500000 + }, + { + "epoch": 0.19, + "eval_average_loss_on_non_sentence_tokens": 0.44784609836536377, + "eval_average_loss_on_sentence_tokens": 0.40311763498071707, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.4459179639816284, + "eval_non_padding_tokens_in_labels": 133.5414, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3899, + "eval_padding_tokens_in_labels": 378.4586, + "eval_reconstruction_accuracy": 0.9166865003617986, + "eval_runtime": 280.3886, + "eval_samples_per_second": 17.832, + "eval_sentence_accuracy": 0.7469314694851688, + "eval_steps_per_second": 0.046, + "eval_variance_shuffling_prob": 0.24997499999999995, + "step": 500000 + }, + { + "epoch": 0.19, + "learning_rate": 5.04949494949495e-05, + "loss": 0.4532, + "step": 500100 + }, + { + "epoch": 0.19, + "learning_rate": 5.048484848484849e-05, + "loss": 0.4475, + "step": 500200 + }, + { + "epoch": 0.19, + "learning_rate": 5.0474747474747474e-05, + "loss": 0.4473, + "step": 500300 + }, + { + "epoch": 0.19, + "learning_rate": 5.046464646464647e-05, + "loss": 0.4503, + "step": 500400 + }, + { + "epoch": 0.19, + "learning_rate": 5.045454545454545e-05, + "loss": 0.4494, + "step": 500500 + }, + { + "epoch": 0.19, + "learning_rate": 5.044444444444445e-05, + "loss": 0.4466, + "step": 500600 + }, + { + "epoch": 0.19, + "learning_rate": 5.0434343434343435e-05, + "loss": 0.4498, + "step": 500700 + }, + { + "epoch": 0.19, + "learning_rate": 5.042424242424243e-05, + "loss": 0.4465, + "step": 500800 + }, + { + "epoch": 0.19, + "learning_rate": 5.041414141414141e-05, + "loss": 0.4506, + "step": 500900 + }, + { + "epoch": 0.2, + "learning_rate": 5.040404040404041e-05, + "loss": 0.4462, + "step": 501000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.44783715364051657, + "eval_average_loss_on_sentence_tokens": 0.38485837250522653, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.4449804723262787, + "eval_non_padding_tokens_in_labels": 133.53375, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3882, + "eval_padding_tokens_in_labels": 378.46625, + "eval_reconstruction_accuracy": 0.9168029731315199, + "eval_runtime": 271.4767, + "eval_samples_per_second": 18.418, + "eval_sentence_accuracy": 0.7500538338686813, + "eval_steps_per_second": 0.048, + "eval_variance_shuffling_prob": 0.2499, + "step": 501000 + }, + { + "epoch": 0.2, + "learning_rate": 5.0393939393939396e-05, + "loss": 0.4491, + "step": 501100 + }, + { + "epoch": 0.2, + "learning_rate": 5.038383838383839e-05, + "loss": 0.4513, + "step": 501200 + }, + { + "epoch": 0.2, + "learning_rate": 5.0373737373737374e-05, + "loss": 0.4524, + "step": 501300 + }, + { + "epoch": 0.2, + "learning_rate": 5.0363636363636366e-05, + "loss": 0.45, + "step": 501400 + }, + { + "epoch": 0.2, + "learning_rate": 5.035353535353535e-05, + "loss": 0.4514, + "step": 501500 + }, + { + "epoch": 0.2, + "learning_rate": 5.034343434343435e-05, + "loss": 0.4487, + "step": 501600 + }, + { + "epoch": 0.2, + "learning_rate": 5.0333333333333335e-05, + "loss": 0.4454, + "step": 501700 + }, + { + "epoch": 0.2, + "learning_rate": 5.032323232323233e-05, + "loss": 0.4471, + "step": 501800 + }, + { + "epoch": 0.2, + "learning_rate": 5.031313131313131e-05, + "loss": 0.447, + "step": 501900 + }, + { + "epoch": 0.2, + "learning_rate": 5.030303030303031e-05, + "loss": 0.4522, + "step": 502000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.4479309661341563, + "eval_average_loss_on_sentence_tokens": 0.36718188662442164, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.4442187547683716, + "eval_non_padding_tokens_in_labels": 133.54275, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3842, + "eval_padding_tokens_in_labels": 378.45725, + "eval_reconstruction_accuracy": 0.9167108266061805, + "eval_runtime": 255.1305, + "eval_samples_per_second": 19.598, + "eval_sentence_accuracy": 0.7726461140919123, + "eval_steps_per_second": 0.051, + "eval_variance_shuffling_prob": 0.24797499999999995, + "step": 502000 + }, + { + "epoch": 0.2, + "learning_rate": 5.029292929292929e-05, + "loss": 0.4481, + "step": 502100 + }, + { + "epoch": 0.2, + "learning_rate": 5.028282828282829e-05, + "loss": 0.4501, + "step": 502200 + }, + { + "epoch": 0.2, + "learning_rate": 5.027272727272727e-05, + "loss": 0.4483, + "step": 502300 + }, + { + "epoch": 0.2, + "learning_rate": 5.0262626262626265e-05, + "loss": 0.4492, + "step": 502400 + }, + { + "epoch": 0.2, + "learning_rate": 5.025252525252525e-05, + "loss": 0.443, + "step": 502500 + }, + { + "epoch": 0.2, + "learning_rate": 5.024242424242425e-05, + "loss": 0.4464, + "step": 502600 + }, + { + "epoch": 0.2, + "learning_rate": 5.023232323232323e-05, + "loss": 0.4472, + "step": 502700 + }, + { + "epoch": 0.2, + "learning_rate": 5.0222222222222226e-05, + "loss": 0.4475, + "step": 502800 + }, + { + "epoch": 0.2, + "learning_rate": 5.021212121212121e-05, + "loss": 0.4491, + "step": 502900 + }, + { + "epoch": 0.2, + "learning_rate": 5.0202020202020203e-05, + "loss": 0.4477, + "step": 503000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.447982572644833, + "eval_average_loss_on_sentence_tokens": 0.37039270363973975, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.4444628953933716, + "eval_non_padding_tokens_in_labels": 133.5409, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37525, + "eval_padding_tokens_in_labels": 378.4591, + "eval_reconstruction_accuracy": 0.9167510926795958, + "eval_runtime": 264.9867, + "eval_samples_per_second": 18.869, + "eval_sentence_accuracy": 0.7662533421860139, + "eval_steps_per_second": 0.049, + "eval_variance_shuffling_prob": 0.2496, + "step": 503000 + }, + { + "epoch": 0.2, + "learning_rate": 5.019191919191919e-05, + "loss": 0.4473, + "step": 503100 + }, + { + "epoch": 0.2, + "learning_rate": 5.018181818181819e-05, + "loss": 0.4467, + "step": 503200 + }, + { + "epoch": 0.2, + "learning_rate": 5.017171717171717e-05, + "loss": 0.4503, + "step": 503300 + }, + { + "epoch": 0.2, + "learning_rate": 5.0161616161616165e-05, + "loss": 0.449, + "step": 503400 + }, + { + "epoch": 0.2, + "learning_rate": 5.015151515151515e-05, + "loss": 0.445, + "step": 503500 + }, + { + "epoch": 0.2, + "learning_rate": 5.014141414141414e-05, + "loss": 0.4517, + "step": 503600 + }, + { + "epoch": 0.2, + "learning_rate": 5.013131313131313e-05, + "loss": 0.4432, + "step": 503700 + }, + { + "epoch": 0.2, + "learning_rate": 5.0121212121212126e-05, + "loss": 0.444, + "step": 503800 + }, + { + "epoch": 0.2, + "learning_rate": 5.011111111111111e-05, + "loss": 0.4455, + "step": 503900 + }, + { + "epoch": 0.2, + "learning_rate": 5.01010101010101e-05, + "loss": 0.4476, + "step": 504000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.4476026649806157, + "eval_average_loss_on_sentence_tokens": 0.41742873312470513, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.44626954197883606, + "eval_non_padding_tokens_in_labels": 133.5177, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37395, + "eval_padding_tokens_in_labels": 378.4823, + "eval_reconstruction_accuracy": 0.9166256349434541, + "eval_runtime": 304.9035, + "eval_samples_per_second": 16.399, + "eval_sentence_accuracy": 0.7449261578767922, + "eval_steps_per_second": 0.043, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 504000 + }, + { + "epoch": 0.2, + "learning_rate": 5.009090909090909e-05, + "loss": 0.4486, + "step": 504100 + }, + { + "epoch": 0.2, + "learning_rate": 5.008080808080809e-05, + "loss": 0.4443, + "step": 504200 + }, + { + "epoch": 0.2, + "learning_rate": 5.0070707070707065e-05, + "loss": 0.4486, + "step": 504300 + }, + { + "epoch": 0.2, + "learning_rate": 5.0060606060606064e-05, + "loss": 0.4483, + "step": 504400 + }, + { + "epoch": 0.2, + "learning_rate": 5.005050505050505e-05, + "loss": 0.4512, + "step": 504500 + }, + { + "epoch": 0.2, + "learning_rate": 5.004040404040404e-05, + "loss": 0.4472, + "step": 504600 + }, + { + "epoch": 0.2, + "learning_rate": 5.0030303030303026e-05, + "loss": 0.4486, + "step": 504700 + }, + { + "epoch": 0.2, + "learning_rate": 5.0020202020202025e-05, + "loss": 0.4506, + "step": 504800 + }, + { + "epoch": 0.2, + "learning_rate": 5.0010101010101004e-05, + "loss": 0.4491, + "step": 504900 + }, + { + "epoch": 0.2, + "learning_rate": 5e-05, + "loss": 0.4514, + "step": 505000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.44775769604626464, + "eval_average_loss_on_sentence_tokens": 0.379599282083079, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.44462889432907104, + "eval_non_padding_tokens_in_labels": 133.51955, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3835, + "eval_padding_tokens_in_labels": 378.48045, + "eval_reconstruction_accuracy": 0.9166628230647265, + "eval_runtime": 188.6124, + "eval_samples_per_second": 26.509, + "eval_sentence_accuracy": 0.7637455811366124, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.2496, + "step": 505000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9989898989898994e-05, + "loss": 0.4462, + "step": 505100 + }, + { + "epoch": 0.2, + "learning_rate": 4.997979797979798e-05, + "loss": 0.4451, + "step": 505200 + }, + { + "epoch": 0.2, + "learning_rate": 4.996969696969697e-05, + "loss": 0.446, + "step": 505300 + }, + { + "epoch": 0.2, + "learning_rate": 4.9959595959595964e-05, + "loss": 0.4471, + "step": 505400 + }, + { + "epoch": 0.2, + "learning_rate": 4.994949494949495e-05, + "loss": 0.4509, + "step": 505500 + }, + { + "epoch": 0.2, + "learning_rate": 4.993939393939394e-05, + "loss": 0.449, + "step": 505600 + }, + { + "epoch": 0.2, + "learning_rate": 4.992929292929293e-05, + "loss": 0.4524, + "step": 505700 + }, + { + "epoch": 0.2, + "learning_rate": 4.991919191919192e-05, + "loss": 0.4504, + "step": 505800 + }, + { + "epoch": 0.2, + "learning_rate": 4.990909090909091e-05, + "loss": 0.4495, + "step": 505900 + }, + { + "epoch": 0.2, + "learning_rate": 4.98989898989899e-05, + "loss": 0.4519, + "step": 506000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.447795202894476, + "eval_average_loss_on_sentence_tokens": 0.39905598577908785, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.4456250071525574, + "eval_non_padding_tokens_in_labels": 133.5397, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36985, + "eval_padding_tokens_in_labels": 378.4603, + "eval_reconstruction_accuracy": 0.9165890211715879, + "eval_runtime": 294.3632, + "eval_samples_per_second": 16.986, + "eval_sentence_accuracy": 0.7502916001220234, + "eval_steps_per_second": 0.044, + "eval_variance_shuffling_prob": 0.2499, + "step": 506000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9888888888888894e-05, + "loss": 0.4486, + "step": 506100 + }, + { + "epoch": 0.2, + "learning_rate": 4.987878787878788e-05, + "loss": 0.4479, + "step": 506200 + }, + { + "epoch": 0.2, + "learning_rate": 4.986868686868687e-05, + "loss": 0.4486, + "step": 506300 + }, + { + "epoch": 0.2, + "learning_rate": 4.985858585858586e-05, + "loss": 0.4481, + "step": 506400 + }, + { + "epoch": 0.2, + "learning_rate": 4.984848484848485e-05, + "loss": 0.4491, + "step": 506500 + }, + { + "epoch": 0.2, + "learning_rate": 4.983838383838384e-05, + "loss": 0.4479, + "step": 506600 + }, + { + "epoch": 0.2, + "learning_rate": 4.982828282828283e-05, + "loss": 0.451, + "step": 506700 + }, + { + "epoch": 0.2, + "learning_rate": 4.981818181818182e-05, + "loss": 0.4516, + "step": 506800 + }, + { + "epoch": 0.2, + "learning_rate": 4.980808080808081e-05, + "loss": 0.4424, + "step": 506900 + }, + { + "epoch": 0.2, + "learning_rate": 4.97979797979798e-05, + "loss": 0.4486, + "step": 507000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.44803437524226536, + "eval_average_loss_on_sentence_tokens": 0.358323405097973, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.4439746141433716, + "eval_non_padding_tokens_in_labels": 133.5554, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3903, + "eval_padding_tokens_in_labels": 378.4446, + "eval_reconstruction_accuracy": 0.916636483618292, + "eval_runtime": 220.2595, + "eval_samples_per_second": 22.701, + "eval_sentence_accuracy": 0.7824393920361764, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.2464, + "step": 507000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9787878787878787e-05, + "loss": 0.4446, + "step": 507100 + }, + { + "epoch": 0.2, + "learning_rate": 4.977777777777778e-05, + "loss": 0.4492, + "step": 507200 + }, + { + "epoch": 0.2, + "learning_rate": 4.976767676767677e-05, + "loss": 0.4474, + "step": 507300 + }, + { + "epoch": 0.2, + "learning_rate": 4.9757575757575756e-05, + "loss": 0.4504, + "step": 507400 + }, + { + "epoch": 0.2, + "learning_rate": 4.974747474747475e-05, + "loss": 0.4453, + "step": 507500 + }, + { + "epoch": 0.2, + "learning_rate": 4.973737373737374e-05, + "loss": 0.4499, + "step": 507600 + }, + { + "epoch": 0.2, + "learning_rate": 4.9727272727272725e-05, + "loss": 0.4488, + "step": 507700 + }, + { + "epoch": 0.2, + "learning_rate": 4.971717171717172e-05, + "loss": 0.4472, + "step": 507800 + }, + { + "epoch": 0.2, + "learning_rate": 4.970707070707071e-05, + "loss": 0.4476, + "step": 507900 + }, + { + "epoch": 0.2, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.4495, + "step": 508000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.4468835614371826, + "eval_average_loss_on_sentence_tokens": 0.3845516657793475, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.44411131739616394, + "eval_non_padding_tokens_in_labels": 133.54275, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3925, + "eval_padding_tokens_in_labels": 378.45725, + "eval_reconstruction_accuracy": 0.916579486812406, + "eval_runtime": 259.1552, + "eval_samples_per_second": 19.293, + "eval_sentence_accuracy": 0.7736420406625155, + "eval_steps_per_second": 0.05, + "eval_variance_shuffling_prob": 0.24839999999999995, + "step": 508000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9686868686868686e-05, + "loss": 0.4504, + "step": 508100 + }, + { + "epoch": 0.2, + "learning_rate": 4.967676767676768e-05, + "loss": 0.4489, + "step": 508200 + }, + { + "epoch": 0.2, + "learning_rate": 4.966666666666667e-05, + "loss": 0.4463, + "step": 508300 + }, + { + "epoch": 0.2, + "learning_rate": 4.9656565656565655e-05, + "loss": 0.4517, + "step": 508400 + }, + { + "epoch": 0.2, + "learning_rate": 4.964646464646465e-05, + "loss": 0.4492, + "step": 508500 + }, + { + "epoch": 0.2, + "learning_rate": 4.963636363636364e-05, + "loss": 0.4439, + "step": 508600 + }, + { + "epoch": 0.2, + "learning_rate": 4.9626262626262624e-05, + "loss": 0.4501, + "step": 508700 + }, + { + "epoch": 0.2, + "learning_rate": 4.9616161616161616e-05, + "loss": 0.445, + "step": 508800 + }, + { + "epoch": 0.2, + "learning_rate": 4.960606060606061e-05, + "loss": 0.4486, + "step": 508900 + }, + { + "epoch": 0.2, + "learning_rate": 4.9595959595959594e-05, + "loss": 0.4461, + "step": 509000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.4473159914460935, + "eval_average_loss_on_sentence_tokens": 0.4066614333774829, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.4454882740974426, + "eval_non_padding_tokens_in_labels": 133.5574, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39835, + "eval_padding_tokens_in_labels": 378.4426, + "eval_reconstruction_accuracy": 0.9167118163614907, + "eval_runtime": 227.371, + "eval_samples_per_second": 21.99, + "eval_sentence_accuracy": 0.7431675848332048, + "eval_steps_per_second": 0.057, + "eval_variance_shuffling_prob": 0.24977499999999994, + "step": 509000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9585858585858586e-05, + "loss": 0.4457, + "step": 509100 + }, + { + "epoch": 0.2, + "learning_rate": 4.957575757575758e-05, + "loss": 0.4523, + "step": 509200 + }, + { + "epoch": 0.2, + "learning_rate": 4.956565656565657e-05, + "loss": 0.45, + "step": 509300 + }, + { + "epoch": 0.2, + "learning_rate": 4.955555555555556e-05, + "loss": 0.4447, + "step": 509400 + }, + { + "epoch": 0.2, + "learning_rate": 4.9545454545454553e-05, + "loss": 0.4515, + "step": 509500 + }, + { + "epoch": 0.2, + "learning_rate": 4.953535353535354e-05, + "loss": 0.4477, + "step": 509600 + }, + { + "epoch": 0.2, + "learning_rate": 4.952525252525253e-05, + "loss": 0.4526, + "step": 509700 + }, + { + "epoch": 0.2, + "learning_rate": 4.951515151515152e-05, + "loss": 0.4472, + "step": 509800 + }, + { + "epoch": 0.2, + "learning_rate": 4.950505050505051e-05, + "loss": 0.4489, + "step": 509900 + }, + { + "epoch": 0.2, + "learning_rate": 4.94949494949495e-05, + "loss": 0.4526, + "step": 510000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.44928097268765, + "eval_average_loss_on_sentence_tokens": 0.41792085032224835, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.4479394555091858, + "eval_non_padding_tokens_in_labels": 133.5224, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39, + "eval_padding_tokens_in_labels": 378.4776, + "eval_reconstruction_accuracy": 0.9166396032666964, + "eval_runtime": 215.7827, + "eval_samples_per_second": 23.171, + "eval_sentence_accuracy": 0.7214680495989377, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.2464, + "step": 510000 + }, + { + "epoch": 0.2, + "learning_rate": 4.948484848484849e-05, + "loss": 0.4488, + "step": 510100 + }, + { + "epoch": 0.2, + "learning_rate": 4.947474747474748e-05, + "loss": 0.4512, + "step": 510200 + }, + { + "epoch": 0.2, + "learning_rate": 4.946464646464647e-05, + "loss": 0.4514, + "step": 510300 + }, + { + "epoch": 0.2, + "learning_rate": 4.945454545454546e-05, + "loss": 0.4482, + "step": 510400 + }, + { + "epoch": 0.2, + "learning_rate": 4.9444444444444446e-05, + "loss": 0.4482, + "step": 510500 + }, + { + "epoch": 0.2, + "learning_rate": 4.943434343434344e-05, + "loss": 0.4496, + "step": 510600 + }, + { + "epoch": 0.2, + "learning_rate": 4.942424242424243e-05, + "loss": 0.4471, + "step": 510700 + }, + { + "epoch": 0.2, + "learning_rate": 4.9414141414141415e-05, + "loss": 0.4489, + "step": 510800 + }, + { + "epoch": 0.2, + "learning_rate": 4.940404040404041e-05, + "loss": 0.4537, + "step": 510900 + }, + { + "epoch": 0.2, + "learning_rate": 4.93939393939394e-05, + "loss": 0.448, + "step": 511000 + }, + { + "epoch": 0.2, + "eval_average_loss_on_non_sentence_tokens": 0.4467942866581471, + "eval_average_loss_on_sentence_tokens": 0.40977266369955806, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.445068359375, + "eval_non_padding_tokens_in_labels": 133.5221, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39015, + "eval_padding_tokens_in_labels": 378.4779, + "eval_reconstruction_accuracy": 0.9166987716319163, + "eval_runtime": 208.0243, + "eval_samples_per_second": 24.036, + "eval_sentence_accuracy": 0.7491386581011, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.2499, + "step": 511000 + }, + { + "epoch": 0.21, + "learning_rate": 4.9383838383838384e-05, + "loss": 0.4505, + "step": 511100 + }, + { + "epoch": 0.21, + "learning_rate": 4.9373737373737376e-05, + "loss": 0.4463, + "step": 511200 + }, + { + "epoch": 0.21, + "learning_rate": 4.936363636363637e-05, + "loss": 0.4459, + "step": 511300 + }, + { + "epoch": 0.21, + "learning_rate": 4.935353535353536e-05, + "loss": 0.4485, + "step": 511400 + }, + { + "epoch": 0.21, + "learning_rate": 4.9343434343434346e-05, + "loss": 0.4515, + "step": 511500 + }, + { + "epoch": 0.21, + "learning_rate": 4.933333333333334e-05, + "loss": 0.4502, + "step": 511600 + }, + { + "epoch": 0.21, + "learning_rate": 4.932323232323233e-05, + "loss": 0.4469, + "step": 511700 + }, + { + "epoch": 0.21, + "learning_rate": 4.9313131313131315e-05, + "loss": 0.4467, + "step": 511800 + }, + { + "epoch": 0.21, + "learning_rate": 4.930303030303031e-05, + "loss": 0.445, + "step": 511900 + }, + { + "epoch": 0.21, + "learning_rate": 4.92929292929293e-05, + "loss": 0.4479, + "step": 512000 + }, + { + "epoch": 0.21, + "eval_average_loss_on_non_sentence_tokens": 0.44764138414622234, + "eval_average_loss_on_sentence_tokens": 0.39038210315193067, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.44510743021965027, + "eval_non_padding_tokens_in_labels": 133.53805, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.46195, + "eval_reconstruction_accuracy": 0.916720269079955, + "eval_runtime": 221.4314, + "eval_samples_per_second": 22.58, + "eval_sentence_accuracy": 0.7553968453352953, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.2499, + "step": 512000 + } + ], + "logging_steps": 100, + "max_steps": 1000000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 2000, + "total_flos": 7.011915503815885e+19, + "train_batch_size": 25, + "trial_name": null, + "trial_params": null +}