diff --git "a/checkpoints/pre_train/from_pretrained/depth/allenai_c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-04-09_19-16/checkpoint-200000/trainer_state.json" "b/checkpoints/pre_train/from_pretrained/depth/allenai_c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-04-09_19-16/checkpoint-200000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/pre_train/from_pretrained/depth/allenai_c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-04-09_19-16/checkpoint-200000/trainer_state.json" @@ -0,0 +1,19829 @@ +{ + "best_metric": 0.3332868218421936, + "best_model_checkpoint": "checkpoints/pre_train/from_pretrained/depth/allenai_c4_en/lr_0_0001_linear_bsz_200_shuffle_p_0_5/2024-04-09_19-16/checkpoint-200000", + "epoch": 0.042, + "eval_steps": 1000, + "global_step": 200000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 117027.92446249742, + "learning_rate": 1e-08, + "loss": 48.4062, + "num_input_tokens_seen": 102400, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 98187.21383153716, + "learning_rate": 1.0000000000000002e-06, + "loss": 48.4508, + "num_input_tokens_seen": 10240000, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 55606.95898896109, + "learning_rate": 2.0000000000000003e-06, + "loss": 47.7316, + "num_input_tokens_seen": 20480000, + "step": 200 + }, + { + "epoch": 0.0, + "grad_norm": 50086.50245325581, + "learning_rate": 3e-06, + "loss": 46.0, + "num_input_tokens_seen": 30720000, + "step": 300 + }, + { + "epoch": 0.0, + "grad_norm": 49593.95317979804, + "learning_rate": 4.000000000000001e-06, + "loss": 43.4838, + "num_input_tokens_seen": 40960000, + "step": 400 + }, + { + "epoch": 0.0, + "grad_norm": 9189.89336173168, + "learning_rate": 5e-06, + "loss": 40.5959, + "num_input_tokens_seen": 51200000, + "step": 500 + }, + { + "epoch": 0.0, + "grad_norm": 6370.043445691716, + "learning_rate": 6e-06, + "loss": 37.7531, + "num_input_tokens_seen": 61440000, + "step": 600 + }, + { + "epoch": 0.0, + "grad_norm": 5259.458859996911, + "learning_rate": 7.000000000000001e-06, + "loss": 33.2895, + "num_input_tokens_seen": 71680000, + "step": 700 + }, + { + "epoch": 0.0, + "grad_norm": 4966.0240635744, + "learning_rate": 8.000000000000001e-06, + "loss": 24.192, + "num_input_tokens_seen": 81920000, + "step": 800 + }, + { + "epoch": 0.0, + "grad_norm": 608.0843318878558, + "learning_rate": 9e-06, + "loss": 11.0091, + "num_input_tokens_seen": 92160000, + "step": 900 + }, + { + "epoch": 0.0, + "grad_norm": 323.02566171281194, + "learning_rate": 1e-05, + "loss": 4.5075, + "num_input_tokens_seen": 102400000, + "step": 1000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 3.735890426314327, + "eval_average_loss_on_sentence_tokens": 6.916936108694429, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 3.879453182220459, + "eval_non_padding_tokens_in_labels": 133.5083, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39165, + "eval_padding_tokens_in_labels": 378.4917, + "eval_reconstruction_accuracy": 0.7531481443581489, + "eval_runtime": 172.2869, + "eval_samples_per_second": 29.021, + "eval_sentence_accuracy": 0.0019918531412062377, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 102400000, + "step": 1000 + }, + { + "epoch": 0.0, + "grad_norm": 72.39298574285789, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.8069, + "num_input_tokens_seen": 112640000, + "step": 1100 + }, + { + "epoch": 0.0, + "grad_norm": 159.50111125690472, + "learning_rate": 1.2e-05, + "loss": 1.8009, + "num_input_tokens_seen": 122880000, + "step": 1200 + }, + { + "epoch": 0.0, + "grad_norm": 68.30627491221296, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.6564, + "num_input_tokens_seen": 133120000, + "step": 1300 + }, + { + "epoch": 0.0, + "grad_norm": 35.400115354533206, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.6037, + "num_input_tokens_seen": 143360000, + "step": 1400 + }, + { + "epoch": 0.0, + "grad_norm": 10.18265947358931, + "learning_rate": 1.5e-05, + "loss": 1.5582, + "num_input_tokens_seen": 153600000, + "step": 1500 + }, + { + "epoch": 0.0, + "grad_norm": 9.040887356645733, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.5222, + "num_input_tokens_seen": 163840000, + "step": 1600 + }, + { + "epoch": 0.0, + "grad_norm": 5.339145796173222, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.4793, + "num_input_tokens_seen": 174080000, + "step": 1700 + }, + { + "epoch": 0.0, + "grad_norm": 2.342113304871639, + "learning_rate": 1.8e-05, + "loss": 1.4391, + "num_input_tokens_seen": 184320000, + "step": 1800 + }, + { + "epoch": 0.0, + "grad_norm": 1.773095253340904, + "learning_rate": 1.9e-05, + "loss": 1.4118, + "num_input_tokens_seen": 194560000, + "step": 1900 + }, + { + "epoch": 0.0, + "grad_norm": 0.5889313696050641, + "learning_rate": 2e-05, + "loss": 1.3452, + "num_input_tokens_seen": 204800000, + "step": 2000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 1.279003297678963, + "eval_average_loss_on_sentence_tokens": 2.3100904416252224, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 1.325351595878601, + "eval_non_padding_tokens_in_labels": 133.54125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3952, + "eval_padding_tokens_in_labels": 378.45875, + "eval_reconstruction_accuracy": 0.7939266267005077, + "eval_runtime": 158.3282, + "eval_samples_per_second": 31.58, + "eval_sentence_accuracy": 0.04539989592118722, + "eval_steps_per_second": 0.082, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 204800000, + "step": 2000 + }, + { + "epoch": 0.0, + "grad_norm": 0.4533874803817849, + "learning_rate": 2.1e-05, + "loss": 1.3164, + "num_input_tokens_seen": 215040000, + "step": 2100 + }, + { + "epoch": 0.0, + "grad_norm": 0.5023528114557592, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3003, + "num_input_tokens_seen": 225280000, + "step": 2200 + }, + { + "epoch": 0.0, + "grad_norm": 0.7179893117125062, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.2726, + "num_input_tokens_seen": 235520000, + "step": 2300 + }, + { + "epoch": 0.0, + "grad_norm": 0.23368749714892442, + "learning_rate": 2.4e-05, + "loss": 1.2449, + "num_input_tokens_seen": 245760000, + "step": 2400 + }, + { + "epoch": 0.0, + "grad_norm": 0.26528883095495337, + "learning_rate": 2.5e-05, + "loss": 1.2168, + "num_input_tokens_seen": 256000000, + "step": 2500 + }, + { + "epoch": 0.0, + "grad_norm": 0.4369897591811637, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.171, + "num_input_tokens_seen": 266240000, + "step": 2600 + }, + { + "epoch": 0.0, + "grad_norm": 0.3964361903785269, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.1204, + "num_input_tokens_seen": 276480000, + "step": 2700 + }, + { + "epoch": 0.0, + "grad_norm": 0.5518399607852702, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.0646, + "num_input_tokens_seen": 286720000, + "step": 2800 + }, + { + "epoch": 0.0, + "grad_norm": 0.5533210889041299, + "learning_rate": 2.9e-05, + "loss": 1.0273, + "num_input_tokens_seen": 296960000, + "step": 2900 + }, + { + "epoch": 0.0, + "grad_norm": 0.4948192241590541, + "learning_rate": 3e-05, + "loss": 0.9912, + "num_input_tokens_seen": 307200000, + "step": 3000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.9183126312714729, + "eval_average_loss_on_sentence_tokens": 2.071033007189449, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.9700586199760437, + "eval_non_padding_tokens_in_labels": 133.5311, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3945, + "eval_padding_tokens_in_labels": 378.4689, + "eval_reconstruction_accuracy": 0.8350565051675188, + "eval_runtime": 159.6707, + "eval_samples_per_second": 31.314, + "eval_sentence_accuracy": 0.047616056848565326, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 307200000, + "step": 3000 + }, + { + "epoch": 0.0, + "grad_norm": 1.5204243719766946, + "learning_rate": 3.1e-05, + "loss": 0.9669, + "num_input_tokens_seen": 317440000, + "step": 3100 + }, + { + "epoch": 0.0, + "grad_norm": 0.8025626302801746, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.9417, + "num_input_tokens_seen": 327680000, + "step": 3200 + }, + { + "epoch": 0.0, + "grad_norm": 0.6290059040323912, + "learning_rate": 3.3e-05, + "loss": 0.923, + "num_input_tokens_seen": 337920000, + "step": 3300 + }, + { + "epoch": 0.0, + "grad_norm": 0.84089911930044, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.9124, + "num_input_tokens_seen": 348160000, + "step": 3400 + }, + { + "epoch": 0.0, + "grad_norm": 0.6866744351545562, + "learning_rate": 3.5e-05, + "loss": 0.8843, + "num_input_tokens_seen": 358400000, + "step": 3500 + }, + { + "epoch": 0.0, + "grad_norm": 1.104960914671423, + "learning_rate": 3.6e-05, + "loss": 0.8719, + "num_input_tokens_seen": 368640000, + "step": 3600 + }, + { + "epoch": 0.0, + "grad_norm": 0.6965317376806954, + "learning_rate": 3.7e-05, + "loss": 0.8381, + "num_input_tokens_seen": 378880000, + "step": 3700 + }, + { + "epoch": 0.0, + "grad_norm": 0.4779527584566995, + "learning_rate": 3.8e-05, + "loss": 0.8079, + "num_input_tokens_seen": 389120000, + "step": 3800 + }, + { + "epoch": 0.0, + "grad_norm": 0.5748217588916886, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.7808, + "num_input_tokens_seen": 399360000, + "step": 3900 + }, + { + "epoch": 0.0, + "grad_norm": 0.7680719160790532, + "learning_rate": 4e-05, + "loss": 0.7668, + "num_input_tokens_seen": 409600000, + "step": 4000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.7154172288004359, + "eval_average_loss_on_sentence_tokens": 1.6378286597585627, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.757031261920929, + "eval_non_padding_tokens_in_labels": 133.50705, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38365, + "eval_padding_tokens_in_labels": 378.49295, + "eval_reconstruction_accuracy": 0.8574579187521558, + "eval_runtime": 159.6769, + "eval_samples_per_second": 31.313, + "eval_sentence_accuracy": 0.06032982216878712, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.249775, + "num_input_tokens_seen": 409600000, + "step": 4000 + }, + { + "epoch": 0.0, + "grad_norm": 0.7187184554931695, + "learning_rate": 4.1e-05, + "loss": 0.7605, + "num_input_tokens_seen": 419840000, + "step": 4100 + }, + { + "epoch": 0.0, + "grad_norm": 0.3479579302403603, + "learning_rate": 4.2e-05, + "loss": 0.742, + "num_input_tokens_seen": 430080000, + "step": 4200 + }, + { + "epoch": 0.0, + "grad_norm": 0.5810996991843084, + "learning_rate": 4.3e-05, + "loss": 0.733, + "num_input_tokens_seen": 440320000, + "step": 4300 + }, + { + "epoch": 0.0, + "grad_norm": 0.42764991205025665, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.7189, + "num_input_tokens_seen": 450560000, + "step": 4400 + }, + { + "epoch": 0.0, + "grad_norm": 0.36334677074604205, + "learning_rate": 4.5e-05, + "loss": 0.7045, + "num_input_tokens_seen": 460800000, + "step": 4500 + }, + { + "epoch": 0.0, + "grad_norm": 0.326550478120764, + "learning_rate": 4.600000000000001e-05, + "loss": 0.6995, + "num_input_tokens_seen": 471040000, + "step": 4600 + }, + { + "epoch": 0.0, + "grad_norm": 0.3517289297561798, + "learning_rate": 4.7e-05, + "loss": 0.6818, + "num_input_tokens_seen": 481280000, + "step": 4700 + }, + { + "epoch": 0.0, + "grad_norm": 0.4094795639507133, + "learning_rate": 4.8e-05, + "loss": 0.6712, + "num_input_tokens_seen": 491520000, + "step": 4800 + }, + { + "epoch": 0.0, + "grad_norm": 0.3860976693769952, + "learning_rate": 4.9e-05, + "loss": 0.6608, + "num_input_tokens_seen": 501760000, + "step": 4900 + }, + { + "epoch": 0.01, + "grad_norm": 0.3933630540632164, + "learning_rate": 5e-05, + "loss": 0.6505, + "num_input_tokens_seen": 512000000, + "step": 5000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.6011862853229497, + "eval_average_loss_on_sentence_tokens": 1.5440813649998053, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.6436914205551147, + "eval_non_padding_tokens_in_labels": 133.50815, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3919, + "eval_padding_tokens_in_labels": 378.49185, + "eval_reconstruction_accuracy": 0.8786358199055293, + "eval_runtime": 159.8392, + "eval_samples_per_second": 31.281, + "eval_sentence_accuracy": 0.07920756545301201, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 512000000, + "step": 5000 + }, + { + "epoch": 0.01, + "grad_norm": 0.35213323995132506, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.6432, + "num_input_tokens_seen": 522240000, + "step": 5100 + }, + { + "epoch": 0.01, + "grad_norm": 0.34561932081416386, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.6402, + "num_input_tokens_seen": 532480000, + "step": 5200 + }, + { + "epoch": 0.01, + "grad_norm": 0.2883404722609347, + "learning_rate": 5.300000000000001e-05, + "loss": 0.6321, + "num_input_tokens_seen": 542720000, + "step": 5300 + }, + { + "epoch": 0.01, + "grad_norm": 0.416875358314017, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.6242, + "num_input_tokens_seen": 552960000, + "step": 5400 + }, + { + "epoch": 0.01, + "grad_norm": 0.4017204252435866, + "learning_rate": 5.500000000000001e-05, + "loss": 0.6143, + "num_input_tokens_seen": 563200000, + "step": 5500 + }, + { + "epoch": 0.01, + "grad_norm": 0.4867774006513271, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.6114, + "num_input_tokens_seen": 573440000, + "step": 5600 + }, + { + "epoch": 0.01, + "grad_norm": 0.24656542353836802, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.6014, + "num_input_tokens_seen": 583680000, + "step": 5700 + }, + { + "epoch": 0.01, + "grad_norm": 0.24945153856235952, + "learning_rate": 5.8e-05, + "loss": 0.6003, + "num_input_tokens_seen": 593920000, + "step": 5800 + }, + { + "epoch": 0.01, + "grad_norm": 0.32042545559953334, + "learning_rate": 5.9e-05, + "loss": 0.5956, + "num_input_tokens_seen": 604160000, + "step": 5900 + }, + { + "epoch": 0.01, + "grad_norm": 0.20149177282537045, + "learning_rate": 6e-05, + "loss": 0.5882, + "num_input_tokens_seen": 614400000, + "step": 6000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.5351263822580051, + "eval_average_loss_on_sentence_tokens": 1.5069590978345084, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.5788671970367432, + "eval_non_padding_tokens_in_labels": 133.53385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38385, + "eval_padding_tokens_in_labels": 378.46615, + "eval_reconstruction_accuracy": 0.889190764406181, + "eval_runtime": 159.1864, + "eval_samples_per_second": 31.41, + "eval_sentence_accuracy": 0.08739928580400882, + "eval_steps_per_second": 0.082, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 614400000, + "step": 6000 + }, + { + "epoch": 0.01, + "grad_norm": 0.49233812721792247, + "learning_rate": 6.1e-05, + "loss": 0.5747, + "num_input_tokens_seen": 624640000, + "step": 6100 + }, + { + "epoch": 0.01, + "grad_norm": 0.8520129526042493, + "learning_rate": 6.2e-05, + "loss": 0.5693, + "num_input_tokens_seen": 634880000, + "step": 6200 + }, + { + "epoch": 0.01, + "grad_norm": 0.363399404364346, + "learning_rate": 6.3e-05, + "loss": 0.5569, + "num_input_tokens_seen": 645120000, + "step": 6300 + }, + { + "epoch": 0.01, + "grad_norm": 0.4110389203574224, + "learning_rate": 6.400000000000001e-05, + "loss": 0.5385, + "num_input_tokens_seen": 655360000, + "step": 6400 + }, + { + "epoch": 0.01, + "grad_norm": 0.48985495455641515, + "learning_rate": 6.500000000000001e-05, + "loss": 0.5313, + "num_input_tokens_seen": 665600000, + "step": 6500 + }, + { + "epoch": 0.01, + "grad_norm": 0.6040708463144516, + "learning_rate": 6.6e-05, + "loss": 0.5147, + "num_input_tokens_seen": 675840000, + "step": 6600 + }, + { + "epoch": 0.01, + "grad_norm": 0.6612252828597202, + "learning_rate": 6.7e-05, + "loss": 0.5058, + "num_input_tokens_seen": 686080000, + "step": 6700 + }, + { + "epoch": 0.01, + "grad_norm": 0.42919798624080896, + "learning_rate": 6.800000000000001e-05, + "loss": 0.4994, + "num_input_tokens_seen": 696320000, + "step": 6800 + }, + { + "epoch": 0.01, + "grad_norm": 0.21242321394189362, + "learning_rate": 6.9e-05, + "loss": 0.4902, + "num_input_tokens_seen": 706560000, + "step": 6900 + }, + { + "epoch": 0.01, + "grad_norm": 0.44660943216480237, + "learning_rate": 7e-05, + "loss": 0.4895, + "num_input_tokens_seen": 716800000, + "step": 7000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.4361637280436824, + "eval_average_loss_on_sentence_tokens": 1.4670178340323676, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.4828124940395355, + "eval_non_padding_tokens_in_labels": 133.5463, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3933, + "eval_padding_tokens_in_labels": 378.4537, + "eval_reconstruction_accuracy": 0.9147457630502843, + "eval_runtime": 160.0763, + "eval_samples_per_second": 31.235, + "eval_sentence_accuracy": 0.10527213020618371, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.24697499999999992, + "num_input_tokens_seen": 716800000, + "step": 7000 + }, + { + "epoch": 0.01, + "grad_norm": 0.5076886025918936, + "learning_rate": 7.1e-05, + "loss": 0.4845, + "num_input_tokens_seen": 727040000, + "step": 7100 + }, + { + "epoch": 0.01, + "grad_norm": 0.3608466110317592, + "learning_rate": 7.2e-05, + "loss": 0.4801, + "num_input_tokens_seen": 737280000, + "step": 7200 + }, + { + "epoch": 0.01, + "grad_norm": 0.19827263227182243, + "learning_rate": 7.3e-05, + "loss": 0.4757, + "num_input_tokens_seen": 747520000, + "step": 7300 + }, + { + "epoch": 0.01, + "grad_norm": 0.31662446315149845, + "learning_rate": 7.4e-05, + "loss": 0.474, + "num_input_tokens_seen": 757760000, + "step": 7400 + }, + { + "epoch": 0.01, + "grad_norm": 0.5578082637465724, + "learning_rate": 7.500000000000001e-05, + "loss": 0.4688, + "num_input_tokens_seen": 768000000, + "step": 7500 + }, + { + "epoch": 0.01, + "grad_norm": 0.6317062952444751, + "learning_rate": 7.6e-05, + "loss": 0.4668, + "num_input_tokens_seen": 778240000, + "step": 7600 + }, + { + "epoch": 0.01, + "grad_norm": 0.2758965748218114, + "learning_rate": 7.7e-05, + "loss": 0.4642, + "num_input_tokens_seen": 788480000, + "step": 7700 + }, + { + "epoch": 0.01, + "grad_norm": 0.3153156363678135, + "learning_rate": 7.800000000000001e-05, + "loss": 0.4646, + "num_input_tokens_seen": 798720000, + "step": 7800 + }, + { + "epoch": 0.01, + "grad_norm": 0.2456978118091552, + "learning_rate": 7.900000000000001e-05, + "loss": 0.4636, + "num_input_tokens_seen": 808960000, + "step": 7900 + }, + { + "epoch": 0.01, + "grad_norm": 0.5619706205956216, + "learning_rate": 8e-05, + "loss": 0.4603, + "num_input_tokens_seen": 819200000, + "step": 8000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.409034057849692, + "eval_average_loss_on_sentence_tokens": 1.4305329217038258, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.45506834983825684, + "eval_non_padding_tokens_in_labels": 133.52385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3843, + "eval_padding_tokens_in_labels": 378.47615, + "eval_reconstruction_accuracy": 0.9193214830199541, + "eval_runtime": 160.2918, + "eval_samples_per_second": 31.193, + "eval_sentence_accuracy": 0.1088251655391462, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 819200000, + "step": 8000 + }, + { + "epoch": 0.01, + "grad_norm": 0.5302097429832601, + "learning_rate": 8.1e-05, + "loss": 0.459, + "num_input_tokens_seen": 829440000, + "step": 8100 + }, + { + "epoch": 0.01, + "grad_norm": 0.23994803615503887, + "learning_rate": 8.2e-05, + "loss": 0.456, + "num_input_tokens_seen": 839680000, + "step": 8200 + }, + { + "epoch": 0.01, + "grad_norm": 0.5010205671529753, + "learning_rate": 8.3e-05, + "loss": 0.4542, + "num_input_tokens_seen": 849920000, + "step": 8300 + }, + { + "epoch": 0.01, + "grad_norm": 0.8509608251741911, + "learning_rate": 8.4e-05, + "loss": 0.4531, + "num_input_tokens_seen": 860160000, + "step": 8400 + }, + { + "epoch": 0.01, + "grad_norm": 0.4627523899902831, + "learning_rate": 8.5e-05, + "loss": 0.4514, + "num_input_tokens_seen": 870400000, + "step": 8500 + }, + { + "epoch": 0.01, + "grad_norm": 0.25954602216126105, + "learning_rate": 8.6e-05, + "loss": 0.4477, + "num_input_tokens_seen": 880640000, + "step": 8600 + }, + { + "epoch": 0.01, + "grad_norm": 0.35809980002546543, + "learning_rate": 8.7e-05, + "loss": 0.4497, + "num_input_tokens_seen": 890880000, + "step": 8700 + }, + { + "epoch": 0.01, + "grad_norm": 0.5127755165944277, + "learning_rate": 8.800000000000001e-05, + "loss": 0.4492, + "num_input_tokens_seen": 901120000, + "step": 8800 + }, + { + "epoch": 0.01, + "grad_norm": 1.183436392719888, + "learning_rate": 8.900000000000001e-05, + "loss": 0.4443, + "num_input_tokens_seen": 911360000, + "step": 8900 + }, + { + "epoch": 0.01, + "grad_norm": 0.4957735800668283, + "learning_rate": 9e-05, + "loss": 0.4412, + "num_input_tokens_seen": 921600000, + "step": 9000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.39479284220034744, + "eval_average_loss_on_sentence_tokens": 1.20901018469223, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.43144530057907104, + "eval_non_padding_tokens_in_labels": 133.5015, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.382, + "eval_padding_tokens_in_labels": 378.4985, + "eval_reconstruction_accuracy": 0.920657961678137, + "eval_runtime": 160.5519, + "eval_samples_per_second": 31.143, + "eval_sentence_accuracy": 0.2915283435318607, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 921600000, + "step": 9000 + }, + { + "epoch": 0.01, + "grad_norm": 0.25962383301178327, + "learning_rate": 9.1e-05, + "loss": 0.4351, + "num_input_tokens_seen": 931840000, + "step": 9100 + }, + { + "epoch": 0.01, + "grad_norm": 0.5270242782977608, + "learning_rate": 9.200000000000001e-05, + "loss": 0.4213, + "num_input_tokens_seen": 942080000, + "step": 9200 + }, + { + "epoch": 0.01, + "grad_norm": 0.35660468940981066, + "learning_rate": 9.300000000000001e-05, + "loss": 0.4109, + "num_input_tokens_seen": 952320000, + "step": 9300 + }, + { + "epoch": 0.01, + "grad_norm": 0.581907323150522, + "learning_rate": 9.4e-05, + "loss": 0.4076, + "num_input_tokens_seen": 962560000, + "step": 9400 + }, + { + "epoch": 0.01, + "grad_norm": 0.4161059500910824, + "learning_rate": 9.5e-05, + "loss": 0.4062, + "num_input_tokens_seen": 972800000, + "step": 9500 + }, + { + "epoch": 0.01, + "grad_norm": 0.21968142363703586, + "learning_rate": 9.6e-05, + "loss": 0.4096, + "num_input_tokens_seen": 983040000, + "step": 9600 + }, + { + "epoch": 0.01, + "grad_norm": 0.1782771876270999, + "learning_rate": 9.7e-05, + "loss": 0.4029, + "num_input_tokens_seen": 993280000, + "step": 9700 + }, + { + "epoch": 0.01, + "grad_norm": 0.20054026866622782, + "learning_rate": 9.8e-05, + "loss": 0.4011, + "num_input_tokens_seen": 1003520000, + "step": 9800 + }, + { + "epoch": 0.01, + "grad_norm": 0.5120583710342788, + "learning_rate": 9.900000000000001e-05, + "loss": 0.3995, + "num_input_tokens_seen": 1013760000, + "step": 9900 + }, + { + "epoch": 0.01, + "grad_norm": 0.31092720613740565, + "learning_rate": 0.0001, + "loss": 0.4017, + "num_input_tokens_seen": 1024000000, + "step": 10000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3854121313597723, + "eval_average_loss_on_sentence_tokens": 0.5901794197149325, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.3946484327316284, + "eval_non_padding_tokens_in_labels": 133.52655, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37465, + "eval_padding_tokens_in_labels": 378.47345, + "eval_reconstruction_accuracy": 0.9222960739914927, + "eval_runtime": 162.0886, + "eval_samples_per_second": 30.847, + "eval_sentence_accuracy": 0.6509636262493944, + "eval_steps_per_second": 0.08, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 1024000000, + "step": 10000 + }, + { + "epoch": 0.01, + "grad_norm": 0.24302657196773422, + "learning_rate": 9.9989898989899e-05, + "loss": 0.3991, + "num_input_tokens_seen": 1034240000, + "step": 10100 + }, + { + "epoch": 0.01, + "grad_norm": 0.27559064135196976, + "learning_rate": 9.997979797979799e-05, + "loss": 0.3987, + "num_input_tokens_seen": 1044480000, + "step": 10200 + }, + { + "epoch": 0.01, + "grad_norm": 0.28406574154044006, + "learning_rate": 9.996969696969698e-05, + "loss": 0.3934, + "num_input_tokens_seen": 1054720000, + "step": 10300 + }, + { + "epoch": 0.01, + "grad_norm": 0.1485544264944323, + "learning_rate": 9.995959595959596e-05, + "loss": 0.3955, + "num_input_tokens_seen": 1064960000, + "step": 10400 + }, + { + "epoch": 0.01, + "grad_norm": 0.16967436423488208, + "learning_rate": 9.994949494949496e-05, + "loss": 0.3966, + "num_input_tokens_seen": 1075200000, + "step": 10500 + }, + { + "epoch": 0.01, + "grad_norm": 0.24514603540038796, + "learning_rate": 9.993939393939394e-05, + "loss": 0.3965, + "num_input_tokens_seen": 1085440000, + "step": 10600 + }, + { + "epoch": 0.01, + "grad_norm": 0.27024939136630677, + "learning_rate": 9.992929292929294e-05, + "loss": 0.392, + "num_input_tokens_seen": 1095680000, + "step": 10700 + }, + { + "epoch": 0.01, + "grad_norm": 0.1418517401981391, + "learning_rate": 9.991919191919193e-05, + "loss": 0.3929, + "num_input_tokens_seen": 1105920000, + "step": 10800 + }, + { + "epoch": 0.01, + "grad_norm": 0.14761725092349237, + "learning_rate": 9.990909090909092e-05, + "loss": 0.3905, + "num_input_tokens_seen": 1116160000, + "step": 10900 + }, + { + "epoch": 0.01, + "grad_norm": 0.18238314762766905, + "learning_rate": 9.98989898989899e-05, + "loss": 0.391, + "num_input_tokens_seen": 1126400000, + "step": 11000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3800238027200128, + "eval_average_loss_on_sentence_tokens": 0.49753954927434607, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.3853222727775574, + "eval_non_padding_tokens_in_labels": 133.54435, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38005, + "eval_padding_tokens_in_labels": 378.45565, + "eval_reconstruction_accuracy": 0.9232105626133866, + "eval_runtime": 154.8497, + "eval_samples_per_second": 32.289, + "eval_sentence_accuracy": 0.6823532578462863, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 1126400000, + "step": 11000 + }, + { + "epoch": 0.01, + "grad_norm": 0.17781057663046584, + "learning_rate": 9.98888888888889e-05, + "loss": 0.3881, + "num_input_tokens_seen": 1136640000, + "step": 11100 + }, + { + "epoch": 0.01, + "grad_norm": 0.2376840878722443, + "learning_rate": 9.987878787878788e-05, + "loss": 0.3877, + "num_input_tokens_seen": 1146880000, + "step": 11200 + }, + { + "epoch": 0.01, + "grad_norm": 0.21930933440270067, + "learning_rate": 9.986868686868687e-05, + "loss": 0.4112, + "num_input_tokens_seen": 1157120000, + "step": 11300 + }, + { + "epoch": 0.01, + "grad_norm": 0.18168312006356727, + "learning_rate": 9.985858585858587e-05, + "loss": 0.3898, + "num_input_tokens_seen": 1167360000, + "step": 11400 + }, + { + "epoch": 0.01, + "grad_norm": 0.22021381606308368, + "learning_rate": 9.984848484848486e-05, + "loss": 0.389, + "num_input_tokens_seen": 1177600000, + "step": 11500 + }, + { + "epoch": 0.01, + "grad_norm": 0.17201460240652908, + "learning_rate": 9.983838383838384e-05, + "loss": 0.3882, + "num_input_tokens_seen": 1187840000, + "step": 11600 + }, + { + "epoch": 0.01, + "grad_norm": 0.16659289859635742, + "learning_rate": 9.982828282828284e-05, + "loss": 0.3876, + "num_input_tokens_seen": 1198080000, + "step": 11700 + }, + { + "epoch": 0.01, + "grad_norm": 0.17389107753651245, + "learning_rate": 9.981818181818182e-05, + "loss": 0.3826, + "num_input_tokens_seen": 1208320000, + "step": 11800 + }, + { + "epoch": 0.01, + "grad_norm": 0.17306721160945812, + "learning_rate": 9.980808080808081e-05, + "loss": 0.3852, + "num_input_tokens_seen": 1218560000, + "step": 11900 + }, + { + "epoch": 0.01, + "grad_norm": 0.15314627142025644, + "learning_rate": 9.97979797979798e-05, + "loss": 0.3851, + "num_input_tokens_seen": 1228800000, + "step": 12000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.37589579345651064, + "eval_average_loss_on_sentence_tokens": 0.46593042849939187, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.37996092438697815, + "eval_non_padding_tokens_in_labels": 133.5443, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37965, + "eval_padding_tokens_in_labels": 378.4557, + "eval_reconstruction_accuracy": 0.9237285910487355, + "eval_runtime": 154.7147, + "eval_samples_per_second": 32.318, + "eval_sentence_accuracy": 0.6995845819800097, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 1228800000, + "step": 12000 + }, + { + "epoch": 0.01, + "grad_norm": 0.2591230504491204, + "learning_rate": 9.97878787878788e-05, + "loss": 0.385, + "num_input_tokens_seen": 1239040000, + "step": 12100 + }, + { + "epoch": 0.01, + "grad_norm": 0.18060719909697132, + "learning_rate": 9.977777777777779e-05, + "loss": 0.3852, + "num_input_tokens_seen": 1249280000, + "step": 12200 + }, + { + "epoch": 0.01, + "grad_norm": 0.13639603743169598, + "learning_rate": 9.976767676767678e-05, + "loss": 0.3839, + "num_input_tokens_seen": 1259520000, + "step": 12300 + }, + { + "epoch": 0.01, + "grad_norm": 0.23837905971298684, + "learning_rate": 9.975757575757576e-05, + "loss": 0.3851, + "num_input_tokens_seen": 1269760000, + "step": 12400 + }, + { + "epoch": 0.01, + "grad_norm": 0.15068487388081905, + "learning_rate": 9.974747474747475e-05, + "loss": 0.384, + "num_input_tokens_seen": 1280000000, + "step": 12500 + }, + { + "epoch": 0.01, + "grad_norm": 0.22705913241215203, + "learning_rate": 9.973737373737374e-05, + "loss": 0.3793, + "num_input_tokens_seen": 1290240000, + "step": 12600 + }, + { + "epoch": 0.01, + "grad_norm": 0.18650120707548257, + "learning_rate": 9.972727272727273e-05, + "loss": 0.3832, + "num_input_tokens_seen": 1300480000, + "step": 12700 + }, + { + "epoch": 0.01, + "grad_norm": 0.22771822134343706, + "learning_rate": 9.971717171717173e-05, + "loss": 0.3791, + "num_input_tokens_seen": 1310720000, + "step": 12800 + }, + { + "epoch": 0.01, + "grad_norm": 0.3151917283377611, + "learning_rate": 9.970707070707072e-05, + "loss": 0.3925, + "num_input_tokens_seen": 1320960000, + "step": 12900 + }, + { + "epoch": 0.01, + "grad_norm": 0.1499099046123697, + "learning_rate": 9.96969696969697e-05, + "loss": 0.3834, + "num_input_tokens_seen": 1331200000, + "step": 13000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3719757643866865, + "eval_average_loss_on_sentence_tokens": 0.5306737159842966, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.37910157442092896, + "eval_non_padding_tokens_in_labels": 133.46615, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3731, + "eval_padding_tokens_in_labels": 378.53385, + "eval_reconstruction_accuracy": 0.9241808904556105, + "eval_runtime": 169.2783, + "eval_samples_per_second": 29.537, + "eval_sentence_accuracy": 0.6539065444039693, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.2483999999999999, + "num_input_tokens_seen": 1331200000, + "step": 13000 + }, + { + "epoch": 0.01, + "grad_norm": 0.20315309935395925, + "learning_rate": 9.968686868686869e-05, + "loss": 0.3822, + "num_input_tokens_seen": 1341440000, + "step": 13100 + }, + { + "epoch": 0.01, + "grad_norm": 0.15078756042478625, + "learning_rate": 9.967676767676768e-05, + "loss": 0.379, + "num_input_tokens_seen": 1351680000, + "step": 13200 + }, + { + "epoch": 0.01, + "grad_norm": 0.14908412582543276, + "learning_rate": 9.966666666666667e-05, + "loss": 0.3805, + "num_input_tokens_seen": 1361920000, + "step": 13300 + }, + { + "epoch": 0.01, + "grad_norm": 0.19913608575034622, + "learning_rate": 9.965656565656566e-05, + "loss": 0.3822, + "num_input_tokens_seen": 1372160000, + "step": 13400 + }, + { + "epoch": 0.01, + "grad_norm": 0.2315431822506722, + "learning_rate": 9.964646464646466e-05, + "loss": 0.3785, + "num_input_tokens_seen": 1382400000, + "step": 13500 + }, + { + "epoch": 0.01, + "grad_norm": 0.16156626978724312, + "learning_rate": 9.963636363636363e-05, + "loss": 0.3761, + "num_input_tokens_seen": 1392640000, + "step": 13600 + }, + { + "epoch": 0.01, + "grad_norm": 0.11581804472711785, + "learning_rate": 9.962626262626264e-05, + "loss": 0.3757, + "num_input_tokens_seen": 1402880000, + "step": 13700 + }, + { + "epoch": 0.01, + "grad_norm": 0.15559662467345098, + "learning_rate": 9.961616161616162e-05, + "loss": 0.3785, + "num_input_tokens_seen": 1413120000, + "step": 13800 + }, + { + "epoch": 0.01, + "grad_norm": 0.2567611230009723, + "learning_rate": 9.960606060606061e-05, + "loss": 0.3756, + "num_input_tokens_seen": 1423360000, + "step": 13900 + }, + { + "epoch": 0.01, + "grad_norm": 0.13475141070218874, + "learning_rate": 9.95959595959596e-05, + "loss": 0.3743, + "num_input_tokens_seen": 1433600000, + "step": 14000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3689672788563913, + "eval_average_loss_on_sentence_tokens": 0.44648890796985713, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.3724609315395355, + "eval_non_padding_tokens_in_labels": 133.4928, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3707, + "eval_padding_tokens_in_labels": 378.5072, + "eval_reconstruction_accuracy": 0.9244660977726237, + "eval_runtime": 157.1527, + "eval_samples_per_second": 31.816, + "eval_sentence_accuracy": 0.7164929029016455, + "eval_steps_per_second": 0.083, + "eval_variance_shuffling_prob": 0.2484, + "num_input_tokens_seen": 1433600000, + "step": 14000 + }, + { + "epoch": 0.01, + "grad_norm": 0.1473000355949265, + "learning_rate": 9.95858585858586e-05, + "loss": 0.375, + "num_input_tokens_seen": 1443840000, + "step": 14100 + }, + { + "epoch": 0.01, + "grad_norm": 0.23647135736851135, + "learning_rate": 9.957575757575757e-05, + "loss": 0.3787, + "num_input_tokens_seen": 1454080000, + "step": 14200 + }, + { + "epoch": 0.01, + "grad_norm": 0.11819577382708915, + "learning_rate": 9.956565656565658e-05, + "loss": 0.3766, + "num_input_tokens_seen": 1464320000, + "step": 14300 + }, + { + "epoch": 0.01, + "grad_norm": 0.17736090732544665, + "learning_rate": 9.955555555555556e-05, + "loss": 0.3729, + "num_input_tokens_seen": 1474560000, + "step": 14400 + }, + { + "epoch": 0.01, + "grad_norm": 0.1456958896154653, + "learning_rate": 9.954545454545455e-05, + "loss": 0.3777, + "num_input_tokens_seen": 1484800000, + "step": 14500 + }, + { + "epoch": 0.01, + "grad_norm": 0.14435721080607852, + "learning_rate": 9.953535353535354e-05, + "loss": 0.3801, + "num_input_tokens_seen": 1495040000, + "step": 14600 + }, + { + "epoch": 0.01, + "grad_norm": 0.16245784419041476, + "learning_rate": 9.952525252525253e-05, + "loss": 0.3799, + "num_input_tokens_seen": 1505280000, + "step": 14700 + }, + { + "epoch": 0.01, + "grad_norm": 0.15716643694271826, + "learning_rate": 9.951515151515151e-05, + "loss": 0.3742, + "num_input_tokens_seen": 1515520000, + "step": 14800 + }, + { + "epoch": 0.01, + "grad_norm": 0.12310718957919906, + "learning_rate": 9.950505050505052e-05, + "loss": 0.3753, + "num_input_tokens_seen": 1525760000, + "step": 14900 + }, + { + "epoch": 0.01, + "grad_norm": 0.14458194694097115, + "learning_rate": 9.94949494949495e-05, + "loss": 0.377, + "num_input_tokens_seen": 1536000000, + "step": 15000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.36717536831858294, + "eval_average_loss_on_sentence_tokens": 0.4820282370375661, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.3723437488079071, + "eval_non_padding_tokens_in_labels": 133.54825, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38605, + "eval_padding_tokens_in_labels": 378.45175, + "eval_reconstruction_accuracy": 0.9248219286504614, + "eval_runtime": 160.4988, + "eval_samples_per_second": 31.153, + "eval_sentence_accuracy": 0.6813887343657473, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 1536000000, + "step": 15000 + }, + { + "epoch": 0.02, + "grad_norm": 0.2644652087776747, + "learning_rate": 9.948484848484849e-05, + "loss": 0.3761, + "num_input_tokens_seen": 1546240000, + "step": 15100 + }, + { + "epoch": 0.02, + "grad_norm": 0.13911153713651134, + "learning_rate": 9.947474747474748e-05, + "loss": 0.3755, + "num_input_tokens_seen": 1556480000, + "step": 15200 + }, + { + "epoch": 0.02, + "grad_norm": 0.1586721496636001, + "learning_rate": 9.946464646464647e-05, + "loss": 0.3752, + "num_input_tokens_seen": 1566720000, + "step": 15300 + }, + { + "epoch": 0.02, + "grad_norm": 0.09363102565853981, + "learning_rate": 9.945454545454545e-05, + "loss": 0.3745, + "num_input_tokens_seen": 1576960000, + "step": 15400 + }, + { + "epoch": 0.02, + "grad_norm": 0.14030144808834033, + "learning_rate": 9.944444444444446e-05, + "loss": 0.3735, + "num_input_tokens_seen": 1587200000, + "step": 15500 + }, + { + "epoch": 0.02, + "grad_norm": 0.12466339895415234, + "learning_rate": 9.943434343434343e-05, + "loss": 0.3737, + "num_input_tokens_seen": 1597440000, + "step": 15600 + }, + { + "epoch": 0.02, + "grad_norm": 0.1544825728133745, + "learning_rate": 9.942424242424243e-05, + "loss": 0.3748, + "num_input_tokens_seen": 1607680000, + "step": 15700 + }, + { + "epoch": 0.02, + "grad_norm": 0.11560570766941806, + "learning_rate": 9.941414141414142e-05, + "loss": 0.369, + "num_input_tokens_seen": 1617920000, + "step": 15800 + }, + { + "epoch": 0.02, + "grad_norm": 0.16851998686948277, + "learning_rate": 9.940404040404041e-05, + "loss": 0.3718, + "num_input_tokens_seen": 1628160000, + "step": 15900 + }, + { + "epoch": 0.02, + "grad_norm": 0.1013062067671787, + "learning_rate": 9.939393939393939e-05, + "loss": 0.3739, + "num_input_tokens_seen": 1638400000, + "step": 16000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3650758731006156, + "eval_average_loss_on_sentence_tokens": 0.44875857155687715, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.36884766817092896, + "eval_non_padding_tokens_in_labels": 133.5264, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3892, + "eval_padding_tokens_in_labels": 378.4736, + "eval_reconstruction_accuracy": 0.9250055008312146, + "eval_runtime": 156.6637, + "eval_samples_per_second": 31.915, + "eval_sentence_accuracy": 0.7146760098336533, + "eval_steps_per_second": 0.083, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 1638400000, + "step": 16000 + }, + { + "epoch": 0.02, + "grad_norm": 0.7206209544244442, + "learning_rate": 9.93838383838384e-05, + "loss": 0.3741, + "num_input_tokens_seen": 1648640000, + "step": 16100 + }, + { + "epoch": 0.02, + "grad_norm": 0.12656907348574992, + "learning_rate": 9.937373737373737e-05, + "loss": 0.3724, + "num_input_tokens_seen": 1658880000, + "step": 16200 + }, + { + "epoch": 0.02, + "grad_norm": 0.12169840807974559, + "learning_rate": 9.936363636363636e-05, + "loss": 0.3716, + "num_input_tokens_seen": 1669120000, + "step": 16300 + }, + { + "epoch": 0.02, + "grad_norm": 0.14111603004919837, + "learning_rate": 9.935353535353536e-05, + "loss": 0.3739, + "num_input_tokens_seen": 1679360000, + "step": 16400 + }, + { + "epoch": 0.02, + "grad_norm": 0.11221000785299119, + "learning_rate": 9.934343434343435e-05, + "loss": 0.3693, + "num_input_tokens_seen": 1689600000, + "step": 16500 + }, + { + "epoch": 0.02, + "grad_norm": 0.08991521475715944, + "learning_rate": 9.933333333333334e-05, + "loss": 0.3698, + "num_input_tokens_seen": 1699840000, + "step": 16600 + }, + { + "epoch": 0.02, + "grad_norm": 0.14364552381185702, + "learning_rate": 9.932323232323233e-05, + "loss": 0.3692, + "num_input_tokens_seen": 1710080000, + "step": 16700 + }, + { + "epoch": 0.02, + "grad_norm": 0.12436604860875718, + "learning_rate": 9.931313131313131e-05, + "loss": 0.3703, + "num_input_tokens_seen": 1720320000, + "step": 16800 + }, + { + "epoch": 0.02, + "grad_norm": 0.2905394479490077, + "learning_rate": 9.93030303030303e-05, + "loss": 0.3751, + "num_input_tokens_seen": 1730560000, + "step": 16900 + }, + { + "epoch": 0.02, + "grad_norm": 0.11363640761171379, + "learning_rate": 9.92929292929293e-05, + "loss": 0.3722, + "num_input_tokens_seen": 1740800000, + "step": 17000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3628784865518768, + "eval_average_loss_on_sentence_tokens": 0.43488322014603975, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.36610352993011475, + "eval_non_padding_tokens_in_labels": 133.52035, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3739, + "eval_padding_tokens_in_labels": 378.47965, + "eval_reconstruction_accuracy": 0.9254036716381047, + "eval_runtime": 155.4863, + "eval_samples_per_second": 32.157, + "eval_sentence_accuracy": 0.7182066143879986, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 1740800000, + "step": 17000 + }, + { + "epoch": 0.02, + "grad_norm": 0.10632912830829595, + "learning_rate": 9.928282828282829e-05, + "loss": 0.3665, + "num_input_tokens_seen": 1751040000, + "step": 17100 + }, + { + "epoch": 0.02, + "grad_norm": 0.16322306333159942, + "learning_rate": 9.927272727272728e-05, + "loss": 0.3702, + "num_input_tokens_seen": 1761280000, + "step": 17200 + }, + { + "epoch": 0.02, + "grad_norm": 0.12100661511431879, + "learning_rate": 9.926262626262627e-05, + "loss": 0.3687, + "num_input_tokens_seen": 1771520000, + "step": 17300 + }, + { + "epoch": 0.02, + "grad_norm": 0.2946207264746856, + "learning_rate": 9.925252525252525e-05, + "loss": 0.3662, + "num_input_tokens_seen": 1781760000, + "step": 17400 + }, + { + "epoch": 0.02, + "grad_norm": 0.15342596690055987, + "learning_rate": 9.924242424242425e-05, + "loss": 0.3711, + "num_input_tokens_seen": 1792000000, + "step": 17500 + }, + { + "epoch": 0.02, + "grad_norm": 0.12659288243319242, + "learning_rate": 9.923232323232323e-05, + "loss": 0.3686, + "num_input_tokens_seen": 1802240000, + "step": 17600 + }, + { + "epoch": 0.02, + "grad_norm": 0.09637991471513421, + "learning_rate": 9.922222222222222e-05, + "loss": 0.367, + "num_input_tokens_seen": 1812480000, + "step": 17700 + }, + { + "epoch": 0.02, + "grad_norm": 0.11752431666397788, + "learning_rate": 9.921212121212122e-05, + "loss": 0.3674, + "num_input_tokens_seen": 1822720000, + "step": 17800 + }, + { + "epoch": 0.02, + "grad_norm": 0.16606818096549658, + "learning_rate": 9.920202020202021e-05, + "loss": 0.3709, + "num_input_tokens_seen": 1832960000, + "step": 17900 + }, + { + "epoch": 0.02, + "grad_norm": 0.20761453361286383, + "learning_rate": 9.919191919191919e-05, + "loss": 0.3674, + "num_input_tokens_seen": 1843200000, + "step": 18000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.36206278400693837, + "eval_average_loss_on_sentence_tokens": 0.3958933896889189, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.36356446146965027, + "eval_non_padding_tokens_in_labels": 133.5544, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37155, + "eval_padding_tokens_in_labels": 378.4456, + "eval_reconstruction_accuracy": 0.9256239212958267, + "eval_runtime": 156.4083, + "eval_samples_per_second": 31.968, + "eval_sentence_accuracy": 0.7412385378721266, + "eval_steps_per_second": 0.083, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 1843200000, + "step": 18000 + }, + { + "epoch": 0.02, + "grad_norm": 0.1083439440834149, + "learning_rate": 9.918181818181819e-05, + "loss": 0.3662, + "num_input_tokens_seen": 1853440000, + "step": 18100 + }, + { + "epoch": 0.02, + "grad_norm": 0.25975168520589126, + "learning_rate": 9.917171717171717e-05, + "loss": 0.3718, + "num_input_tokens_seen": 1863680000, + "step": 18200 + }, + { + "epoch": 0.02, + "grad_norm": 0.15694849825372573, + "learning_rate": 9.916161616161616e-05, + "loss": 0.367, + "num_input_tokens_seen": 1873920000, + "step": 18300 + }, + { + "epoch": 0.02, + "grad_norm": 0.19287524399939732, + "learning_rate": 9.915151515151515e-05, + "loss": 0.3657, + "num_input_tokens_seen": 1884160000, + "step": 18400 + }, + { + "epoch": 0.02, + "grad_norm": 0.14395940263956455, + "learning_rate": 9.914141414141415e-05, + "loss": 0.367, + "num_input_tokens_seen": 1894400000, + "step": 18500 + }, + { + "epoch": 0.02, + "grad_norm": 0.19642979999644156, + "learning_rate": 9.913131313131314e-05, + "loss": 0.3697, + "num_input_tokens_seen": 1904640000, + "step": 18600 + }, + { + "epoch": 0.02, + "grad_norm": 0.12190618543958091, + "learning_rate": 9.912121212121213e-05, + "loss": 0.3656, + "num_input_tokens_seen": 1914880000, + "step": 18700 + }, + { + "epoch": 0.02, + "grad_norm": 0.11996760009180743, + "learning_rate": 9.911111111111112e-05, + "loss": 0.3623, + "num_input_tokens_seen": 1925120000, + "step": 18800 + }, + { + "epoch": 0.02, + "grad_norm": 0.16029858370289582, + "learning_rate": 9.91010101010101e-05, + "loss": 0.3667, + "num_input_tokens_seen": 1935360000, + "step": 18900 + }, + { + "epoch": 0.02, + "grad_norm": 0.14008933937810145, + "learning_rate": 9.909090909090911e-05, + "loss": 0.3662, + "num_input_tokens_seen": 1945600000, + "step": 19000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.36037543821358486, + "eval_average_loss_on_sentence_tokens": 0.4375366560188371, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.3639160096645355, + "eval_non_padding_tokens_in_labels": 133.5372, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39125, + "eval_padding_tokens_in_labels": 378.4628, + "eval_reconstruction_accuracy": 0.9256722739034521, + "eval_runtime": 155.4586, + "eval_samples_per_second": 32.163, + "eval_sentence_accuracy": 0.7188346761892799, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 1945600000, + "step": 19000 + }, + { + "epoch": 0.02, + "grad_norm": 0.11805541190048278, + "learning_rate": 9.908080808080809e-05, + "loss": 0.3677, + "num_input_tokens_seen": 1955840000, + "step": 19100 + }, + { + "epoch": 0.02, + "grad_norm": 0.11050390808702572, + "learning_rate": 9.907070707070708e-05, + "loss": 0.3697, + "num_input_tokens_seen": 1966080000, + "step": 19200 + }, + { + "epoch": 0.02, + "grad_norm": 0.11662551316010913, + "learning_rate": 9.906060606060607e-05, + "loss": 0.3676, + "num_input_tokens_seen": 1976320000, + "step": 19300 + }, + { + "epoch": 0.02, + "grad_norm": 0.1223301601413596, + "learning_rate": 9.905050505050506e-05, + "loss": 0.3691, + "num_input_tokens_seen": 1986560000, + "step": 19400 + }, + { + "epoch": 0.02, + "grad_norm": 0.10207303274583229, + "learning_rate": 9.904040404040404e-05, + "loss": 0.3662, + "num_input_tokens_seen": 1996800000, + "step": 19500 + }, + { + "epoch": 0.02, + "grad_norm": 0.08686027577031438, + "learning_rate": 9.903030303030305e-05, + "loss": 0.3668, + "num_input_tokens_seen": 2007040000, + "step": 19600 + }, + { + "epoch": 0.02, + "grad_norm": 0.19016857340805707, + "learning_rate": 9.902020202020202e-05, + "loss": 0.3655, + "num_input_tokens_seen": 2017280000, + "step": 19700 + }, + { + "epoch": 0.02, + "grad_norm": 0.16012871319435779, + "learning_rate": 9.901010101010102e-05, + "loss": 0.3639, + "num_input_tokens_seen": 2027520000, + "step": 19800 + }, + { + "epoch": 0.02, + "grad_norm": 0.10128884632500922, + "learning_rate": 9.900000000000001e-05, + "loss": 0.3678, + "num_input_tokens_seen": 2037760000, + "step": 19900 + }, + { + "epoch": 0.02, + "grad_norm": 0.16727620505609542, + "learning_rate": 9.8989898989899e-05, + "loss": 0.3644, + "num_input_tokens_seen": 2048000000, + "step": 20000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.36006847868498015, + "eval_average_loss_on_sentence_tokens": 0.4308708401505413, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.36324217915534973, + "eval_non_padding_tokens_in_labels": 133.51725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36155, + "eval_padding_tokens_in_labels": 378.48275, + "eval_reconstruction_accuracy": 0.9257211964914828, + "eval_runtime": 156.4197, + "eval_samples_per_second": 31.965, + "eval_sentence_accuracy": 0.719543488793583, + "eval_steps_per_second": 0.083, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 2048000000, + "step": 20000 + }, + { + "epoch": 0.02, + "grad_norm": 0.11343092320689546, + "learning_rate": 9.897979797979798e-05, + "loss": 0.3686, + "num_input_tokens_seen": 2058240000, + "step": 20100 + }, + { + "epoch": 0.02, + "grad_norm": 0.10904926945300837, + "learning_rate": 9.896969696969698e-05, + "loss": 0.3666, + "num_input_tokens_seen": 2068480000, + "step": 20200 + }, + { + "epoch": 0.02, + "grad_norm": 0.08824138717195555, + "learning_rate": 9.895959595959596e-05, + "loss": 0.3659, + "num_input_tokens_seen": 2078720000, + "step": 20300 + }, + { + "epoch": 0.02, + "grad_norm": 0.10982444712618332, + "learning_rate": 9.894949494949495e-05, + "loss": 0.3664, + "num_input_tokens_seen": 2088960000, + "step": 20400 + }, + { + "epoch": 0.02, + "grad_norm": 0.09810357164314008, + "learning_rate": 9.893939393939395e-05, + "loss": 0.3644, + "num_input_tokens_seen": 2099200000, + "step": 20500 + }, + { + "epoch": 0.02, + "grad_norm": 0.13117198825374618, + "learning_rate": 9.892929292929294e-05, + "loss": 0.3663, + "num_input_tokens_seen": 2109440000, + "step": 20600 + }, + { + "epoch": 0.02, + "grad_norm": 0.37286179130355385, + "learning_rate": 9.891919191919192e-05, + "loss": 0.3679, + "num_input_tokens_seen": 2119680000, + "step": 20700 + }, + { + "epoch": 0.02, + "grad_norm": 0.10047485875458267, + "learning_rate": 9.890909090909092e-05, + "loss": 0.3665, + "num_input_tokens_seen": 2129920000, + "step": 20800 + }, + { + "epoch": 0.02, + "grad_norm": 0.12476892085669669, + "learning_rate": 9.88989898989899e-05, + "loss": 0.3638, + "num_input_tokens_seen": 2140160000, + "step": 20900 + }, + { + "epoch": 0.02, + "grad_norm": 0.12365876297594075, + "learning_rate": 9.888888888888889e-05, + "loss": 0.3623, + "num_input_tokens_seen": 2150400000, + "step": 21000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3587830255621528, + "eval_average_loss_on_sentence_tokens": 0.3976552794947042, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.3605078160762787, + "eval_non_padding_tokens_in_labels": 133.5387, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38515, + "eval_padding_tokens_in_labels": 378.4613, + "eval_reconstruction_accuracy": 0.9260081703881333, + "eval_runtime": 154.3969, + "eval_samples_per_second": 32.384, + "eval_sentence_accuracy": 0.742862526244011, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 2150400000, + "step": 21000 + }, + { + "epoch": 0.02, + "grad_norm": 0.09637865488859665, + "learning_rate": 9.887878787878788e-05, + "loss": 0.366, + "num_input_tokens_seen": 2160640000, + "step": 21100 + }, + { + "epoch": 0.02, + "grad_norm": 0.0837396685633993, + "learning_rate": 9.886868686868688e-05, + "loss": 0.3635, + "num_input_tokens_seen": 2170880000, + "step": 21200 + }, + { + "epoch": 0.02, + "grad_norm": 0.08748902918380133, + "learning_rate": 9.885858585858587e-05, + "loss": 0.3635, + "num_input_tokens_seen": 2181120000, + "step": 21300 + }, + { + "epoch": 0.02, + "grad_norm": 0.08861983243508997, + "learning_rate": 9.884848484848486e-05, + "loss": 0.3654, + "num_input_tokens_seen": 2191360000, + "step": 21400 + }, + { + "epoch": 0.02, + "grad_norm": 0.1095992454706993, + "learning_rate": 9.883838383838384e-05, + "loss": 0.3623, + "num_input_tokens_seen": 2201600000, + "step": 21500 + }, + { + "epoch": 0.02, + "grad_norm": 0.1678159484473445, + "learning_rate": 9.882828282828283e-05, + "loss": 0.3645, + "num_input_tokens_seen": 2211840000, + "step": 21600 + }, + { + "epoch": 0.02, + "grad_norm": 0.11648618534569517, + "learning_rate": 9.881818181818182e-05, + "loss": 0.3671, + "num_input_tokens_seen": 2222080000, + "step": 21700 + }, + { + "epoch": 0.02, + "grad_norm": 0.10011749768739321, + "learning_rate": 9.880808080808081e-05, + "loss": 0.3678, + "num_input_tokens_seen": 2232320000, + "step": 21800 + }, + { + "epoch": 0.02, + "grad_norm": 0.13526673032373565, + "learning_rate": 9.87979797979798e-05, + "loss": 0.3643, + "num_input_tokens_seen": 2242560000, + "step": 21900 + }, + { + "epoch": 0.02, + "grad_norm": 0.13394247434184048, + "learning_rate": 9.87878787878788e-05, + "loss": 0.3634, + "num_input_tokens_seen": 2252800000, + "step": 22000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3583377337473865, + "eval_average_loss_on_sentence_tokens": 0.40822070061419796, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.36058592796325684, + "eval_non_padding_tokens_in_labels": 133.52525, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.376, + "eval_padding_tokens_in_labels": 378.47475, + "eval_reconstruction_accuracy": 0.9261606204151949, + "eval_runtime": 155.145, + "eval_samples_per_second": 32.228, + "eval_sentence_accuracy": 0.7347022089830783, + "eval_steps_per_second": 0.084, + "eval_variance_shuffling_prob": 0.2497749999999999, + "num_input_tokens_seen": 2252800000, + "step": 22000 + }, + { + "epoch": 0.02, + "grad_norm": 0.10097576958217712, + "learning_rate": 9.877777777777778e-05, + "loss": 0.3616, + "num_input_tokens_seen": 2263040000, + "step": 22100 + }, + { + "epoch": 0.02, + "grad_norm": 0.08778364505416811, + "learning_rate": 9.876767676767677e-05, + "loss": 0.3608, + "num_input_tokens_seen": 2273280000, + "step": 22200 + }, + { + "epoch": 0.02, + "grad_norm": 0.11667769923972934, + "learning_rate": 9.875757575757576e-05, + "loss": 0.3631, + "num_input_tokens_seen": 2283520000, + "step": 22300 + }, + { + "epoch": 0.02, + "grad_norm": 0.17587799482940783, + "learning_rate": 9.874747474747475e-05, + "loss": 0.3641, + "num_input_tokens_seen": 2293760000, + "step": 22400 + }, + { + "epoch": 0.02, + "grad_norm": 0.18467841943373034, + "learning_rate": 9.873737373737374e-05, + "loss": 0.365, + "num_input_tokens_seen": 2304000000, + "step": 22500 + }, + { + "epoch": 0.02, + "grad_norm": 0.11327524004811648, + "learning_rate": 9.872727272727274e-05, + "loss": 0.3626, + "num_input_tokens_seen": 2314240000, + "step": 22600 + }, + { + "epoch": 0.02, + "grad_norm": 0.13776796365489793, + "learning_rate": 9.871717171717172e-05, + "loss": 0.3658, + "num_input_tokens_seen": 2324480000, + "step": 22700 + }, + { + "epoch": 0.02, + "grad_norm": 0.1409646640902976, + "learning_rate": 9.870707070707072e-05, + "loss": 0.3613, + "num_input_tokens_seen": 2334720000, + "step": 22800 + }, + { + "epoch": 0.02, + "grad_norm": 0.14753019525405045, + "learning_rate": 9.86969696969697e-05, + "loss": 0.3664, + "num_input_tokens_seen": 2344960000, + "step": 22900 + }, + { + "epoch": 0.02, + "grad_norm": 0.1397460316378644, + "learning_rate": 9.868686868686869e-05, + "loss": 0.3637, + "num_input_tokens_seen": 2355200000, + "step": 23000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3573655042544687, + "eval_average_loss_on_sentence_tokens": 0.42437406804439193, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.3604394495487213, + "eval_non_padding_tokens_in_labels": 133.5656, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.385, + "eval_padding_tokens_in_labels": 378.4344, + "eval_reconstruction_accuracy": 0.9260999844059516, + "eval_runtime": 157.9229, + "eval_samples_per_second": 31.661, + "eval_sentence_accuracy": 0.7266361009923377, + "eval_steps_per_second": 0.082, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 2355200000, + "step": 23000 + }, + { + "epoch": 0.02, + "grad_norm": 0.0987903451138666, + "learning_rate": 9.867676767676768e-05, + "loss": 0.3622, + "num_input_tokens_seen": 2365440000, + "step": 23100 + }, + { + "epoch": 0.02, + "grad_norm": 0.16954454367678937, + "learning_rate": 9.866666666666668e-05, + "loss": 0.362, + "num_input_tokens_seen": 2375680000, + "step": 23200 + }, + { + "epoch": 0.02, + "grad_norm": 0.09387896280569132, + "learning_rate": 9.865656565656565e-05, + "loss": 0.3634, + "num_input_tokens_seen": 2385920000, + "step": 23300 + }, + { + "epoch": 0.02, + "grad_norm": 0.07207999681262237, + "learning_rate": 9.864646464646466e-05, + "loss": 0.3626, + "num_input_tokens_seen": 2396160000, + "step": 23400 + }, + { + "epoch": 0.02, + "grad_norm": 0.09396686653666976, + "learning_rate": 9.863636363636364e-05, + "loss": 0.3612, + "num_input_tokens_seen": 2406400000, + "step": 23500 + }, + { + "epoch": 0.02, + "grad_norm": 0.12398359266910461, + "learning_rate": 9.862626262626263e-05, + "loss": 0.3609, + "num_input_tokens_seen": 2416640000, + "step": 23600 + }, + { + "epoch": 0.02, + "grad_norm": 0.14763177045615983, + "learning_rate": 9.861616161616162e-05, + "loss": 0.3643, + "num_input_tokens_seen": 2426880000, + "step": 23700 + }, + { + "epoch": 0.02, + "grad_norm": 0.10030012588072933, + "learning_rate": 9.860606060606061e-05, + "loss": 0.3605, + "num_input_tokens_seen": 2437120000, + "step": 23800 + }, + { + "epoch": 0.02, + "grad_norm": 0.10084957691663816, + "learning_rate": 9.859595959595959e-05, + "loss": 0.3589, + "num_input_tokens_seen": 2447360000, + "step": 23900 + }, + { + "epoch": 0.02, + "grad_norm": 0.09548032136785647, + "learning_rate": 9.85858585858586e-05, + "loss": 0.3566, + "num_input_tokens_seen": 2457600000, + "step": 24000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3553217848237189, + "eval_average_loss_on_sentence_tokens": 0.3849351337254851, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.3566015660762787, + "eval_non_padding_tokens_in_labels": 133.4909, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37935, + "eval_padding_tokens_in_labels": 378.5091, + "eval_reconstruction_accuracy": 0.9265581998113339, + "eval_runtime": 162.898, + "eval_samples_per_second": 30.694, + "eval_sentence_accuracy": 0.7520456870098875, + "eval_steps_per_second": 0.08, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 2457600000, + "step": 24000 + }, + { + "epoch": 0.02, + "grad_norm": 0.1188563096869887, + "learning_rate": 9.857575757575758e-05, + "loss": 0.3621, + "num_input_tokens_seen": 2467840000, + "step": 24100 + }, + { + "epoch": 0.02, + "grad_norm": 0.14877812617022562, + "learning_rate": 9.856565656565657e-05, + "loss": 0.3604, + "num_input_tokens_seen": 2478080000, + "step": 24200 + }, + { + "epoch": 0.02, + "grad_norm": 0.08250986132874133, + "learning_rate": 9.855555555555556e-05, + "loss": 0.3609, + "num_input_tokens_seen": 2488320000, + "step": 24300 + }, + { + "epoch": 0.02, + "grad_norm": 0.09244225265273415, + "learning_rate": 9.854545454545455e-05, + "loss": 0.3588, + "num_input_tokens_seen": 2498560000, + "step": 24400 + }, + { + "epoch": 0.02, + "grad_norm": 0.1454294300819924, + "learning_rate": 9.853535353535353e-05, + "loss": 0.361, + "num_input_tokens_seen": 2508800000, + "step": 24500 + }, + { + "epoch": 0.02, + "grad_norm": 0.20041363836805748, + "learning_rate": 9.852525252525254e-05, + "loss": 0.36, + "num_input_tokens_seen": 2519040000, + "step": 24600 + }, + { + "epoch": 0.02, + "grad_norm": 0.09722078986711696, + "learning_rate": 9.851515151515151e-05, + "loss": 0.3635, + "num_input_tokens_seen": 2529280000, + "step": 24700 + }, + { + "epoch": 0.02, + "grad_norm": 0.08990032032163263, + "learning_rate": 9.85050505050505e-05, + "loss": 0.3631, + "num_input_tokens_seen": 2539520000, + "step": 24800 + }, + { + "epoch": 0.02, + "grad_norm": 0.08959055680739043, + "learning_rate": 9.84949494949495e-05, + "loss": 0.3609, + "num_input_tokens_seen": 2549760000, + "step": 24900 + }, + { + "epoch": 0.03, + "grad_norm": 0.11585233560862784, + "learning_rate": 9.848484848484849e-05, + "loss": 0.3623, + "num_input_tokens_seen": 2560000000, + "step": 25000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3556082151245955, + "eval_average_loss_on_sentence_tokens": 0.43450918672873573, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.3591601550579071, + "eval_non_padding_tokens_in_labels": 133.57675, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3766, + "eval_padding_tokens_in_labels": 378.42325, + "eval_reconstruction_accuracy": 0.9264658314428117, + "eval_runtime": 157.6612, + "eval_samples_per_second": 31.714, + "eval_sentence_accuracy": 0.7161609273781111, + "eval_steps_per_second": 0.082, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 2560000000, + "step": 25000 + }, + { + "epoch": 0.03, + "grad_norm": 0.09943665461342704, + "learning_rate": 9.847474747474747e-05, + "loss": 0.3594, + "num_input_tokens_seen": 2570240000, + "step": 25100 + }, + { + "epoch": 0.03, + "grad_norm": 0.18746017986890912, + "learning_rate": 9.846464646464647e-05, + "loss": 0.3591, + "num_input_tokens_seen": 2580480000, + "step": 25200 + }, + { + "epoch": 0.03, + "grad_norm": 0.08602902567018297, + "learning_rate": 9.845454545454545e-05, + "loss": 0.3577, + "num_input_tokens_seen": 2590720000, + "step": 25300 + }, + { + "epoch": 0.03, + "grad_norm": 0.08012464962844001, + "learning_rate": 9.844444444444444e-05, + "loss": 0.3609, + "num_input_tokens_seen": 2600960000, + "step": 25400 + }, + { + "epoch": 0.03, + "grad_norm": 0.12572008831678216, + "learning_rate": 9.843434343434344e-05, + "loss": 0.364, + "num_input_tokens_seen": 2611200000, + "step": 25500 + }, + { + "epoch": 0.03, + "grad_norm": 0.14940579397721981, + "learning_rate": 9.842424242424243e-05, + "loss": 0.3639, + "num_input_tokens_seen": 2621440000, + "step": 25600 + }, + { + "epoch": 0.03, + "grad_norm": 0.08412110210655141, + "learning_rate": 9.841414141414142e-05, + "loss": 0.3604, + "num_input_tokens_seen": 2631680000, + "step": 25700 + }, + { + "epoch": 0.03, + "grad_norm": 0.09362195877472666, + "learning_rate": 9.840404040404041e-05, + "loss": 0.3596, + "num_input_tokens_seen": 2641920000, + "step": 25800 + }, + { + "epoch": 0.03, + "grad_norm": 0.08569747148738543, + "learning_rate": 9.839393939393939e-05, + "loss": 0.3608, + "num_input_tokens_seen": 2652160000, + "step": 25900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09877899875115037, + "learning_rate": 9.838383838383838e-05, + "loss": 0.3591, + "num_input_tokens_seen": 2662400000, + "step": 26000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3556134664693423, + "eval_average_loss_on_sentence_tokens": 0.42163200170538406, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3585839867591858, + "eval_non_padding_tokens_in_labels": 133.5339, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3887, + "eval_padding_tokens_in_labels": 378.4661, + "eval_reconstruction_accuracy": 0.9265349576723824, + "eval_runtime": 166.333, + "eval_samples_per_second": 30.06, + "eval_sentence_accuracy": 0.7202029536849283, + "eval_steps_per_second": 0.078, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 2662400000, + "step": 26000 + }, + { + "epoch": 0.03, + "grad_norm": 0.08541063254934746, + "learning_rate": 9.837373737373737e-05, + "loss": 0.3583, + "num_input_tokens_seen": 2672640000, + "step": 26100 + }, + { + "epoch": 0.03, + "grad_norm": 0.08526475341411524, + "learning_rate": 9.836363636363637e-05, + "loss": 0.3608, + "num_input_tokens_seen": 2682880000, + "step": 26200 + }, + { + "epoch": 0.03, + "grad_norm": 0.07892816724288247, + "learning_rate": 9.835353535353536e-05, + "loss": 0.3573, + "num_input_tokens_seen": 2693120000, + "step": 26300 + }, + { + "epoch": 0.03, + "grad_norm": 0.1057304341337342, + "learning_rate": 9.834343434343435e-05, + "loss": 0.3595, + "num_input_tokens_seen": 2703360000, + "step": 26400 + }, + { + "epoch": 0.03, + "grad_norm": 0.08824599002870707, + "learning_rate": 9.833333333333333e-05, + "loss": 0.3603, + "num_input_tokens_seen": 2713600000, + "step": 26500 + }, + { + "epoch": 0.03, + "grad_norm": 0.14259526724512908, + "learning_rate": 9.832323232323233e-05, + "loss": 0.3596, + "num_input_tokens_seen": 2723840000, + "step": 26600 + }, + { + "epoch": 0.03, + "grad_norm": 0.10958661741770682, + "learning_rate": 9.831313131313131e-05, + "loss": 0.3606, + "num_input_tokens_seen": 2734080000, + "step": 26700 + }, + { + "epoch": 0.03, + "grad_norm": 0.1002029330532675, + "learning_rate": 9.83030303030303e-05, + "loss": 0.361, + "num_input_tokens_seen": 2744320000, + "step": 26800 + }, + { + "epoch": 0.03, + "grad_norm": 0.11207220852080024, + "learning_rate": 9.82929292929293e-05, + "loss": 0.3592, + "num_input_tokens_seen": 2754560000, + "step": 26900 + }, + { + "epoch": 0.03, + "grad_norm": 0.15408787575922175, + "learning_rate": 9.828282828282829e-05, + "loss": 0.3591, + "num_input_tokens_seen": 2764800000, + "step": 27000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.35464164042436336, + "eval_average_loss_on_sentence_tokens": 0.45656515384165897, + "eval_average_shuffling_prob": 0.57, + "eval_loss": 0.3592187464237213, + "eval_non_padding_tokens_in_labels": 133.5122, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3798, + "eval_padding_tokens_in_labels": 378.4878, + "eval_reconstruction_accuracy": 0.9266176202210964, + "eval_runtime": 167.2988, + "eval_samples_per_second": 29.887, + "eval_sentence_accuracy": 0.6986873508353222, + "eval_steps_per_second": 0.078, + "eval_variance_shuffling_prob": 0.24509999999999996, + "num_input_tokens_seen": 2764800000, + "step": 27000 + }, + { + "epoch": 0.03, + "grad_norm": 0.19324864000629008, + "learning_rate": 9.827272727272728e-05, + "loss": 0.3613, + "num_input_tokens_seen": 2775040000, + "step": 27100 + }, + { + "epoch": 0.03, + "grad_norm": 0.10080048909742759, + "learning_rate": 9.826262626262627e-05, + "loss": 0.3611, + "num_input_tokens_seen": 2785280000, + "step": 27200 + }, + { + "epoch": 0.03, + "grad_norm": 0.10157552507215743, + "learning_rate": 9.825252525252527e-05, + "loss": 0.3582, + "num_input_tokens_seen": 2795520000, + "step": 27300 + }, + { + "epoch": 0.03, + "grad_norm": 0.11988636067218172, + "learning_rate": 9.824242424242424e-05, + "loss": 0.3612, + "num_input_tokens_seen": 2805760000, + "step": 27400 + }, + { + "epoch": 0.03, + "grad_norm": 0.10519465177125238, + "learning_rate": 9.823232323232325e-05, + "loss": 0.3603, + "num_input_tokens_seen": 2816000000, + "step": 27500 + }, + { + "epoch": 0.03, + "grad_norm": 0.13230602730567367, + "learning_rate": 9.822222222222223e-05, + "loss": 0.3581, + "num_input_tokens_seen": 2826240000, + "step": 27600 + }, + { + "epoch": 0.03, + "grad_norm": 0.08350293092452066, + "learning_rate": 9.821212121212122e-05, + "loss": 0.3624, + "num_input_tokens_seen": 2836480000, + "step": 27700 + }, + { + "epoch": 0.03, + "grad_norm": 0.10992582448277533, + "learning_rate": 9.820202020202021e-05, + "loss": 0.3573, + "num_input_tokens_seen": 2846720000, + "step": 27800 + }, + { + "epoch": 0.03, + "grad_norm": 0.097079855298227, + "learning_rate": 9.81919191919192e-05, + "loss": 0.3569, + "num_input_tokens_seen": 2856960000, + "step": 27900 + }, + { + "epoch": 0.03, + "grad_norm": 0.08641760728189113, + "learning_rate": 9.818181818181818e-05, + "loss": 0.3603, + "num_input_tokens_seen": 2867200000, + "step": 28000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3541137710581413, + "eval_average_loss_on_sentence_tokens": 0.39053072234269565, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.35579100251197815, + "eval_non_padding_tokens_in_labels": 133.52385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3881, + "eval_padding_tokens_in_labels": 378.47615, + "eval_reconstruction_accuracy": 0.9268094160225488, + "eval_runtime": 171.34, + "eval_samples_per_second": 29.182, + "eval_sentence_accuracy": 0.7443564160999157, + "eval_steps_per_second": 0.076, + "eval_variance_shuffling_prob": 0.24960000000000004, + "num_input_tokens_seen": 2867200000, + "step": 28000 + }, + { + "epoch": 0.03, + "grad_norm": 0.10167108746333979, + "learning_rate": 9.817171717171719e-05, + "loss": 0.3564, + "num_input_tokens_seen": 2877440000, + "step": 28100 + }, + { + "epoch": 0.03, + "grad_norm": 0.08710525143703719, + "learning_rate": 9.816161616161617e-05, + "loss": 0.3572, + "num_input_tokens_seen": 2887680000, + "step": 28200 + }, + { + "epoch": 0.03, + "grad_norm": 0.0717770177483454, + "learning_rate": 9.815151515151516e-05, + "loss": 0.3582, + "num_input_tokens_seen": 2897920000, + "step": 28300 + }, + { + "epoch": 0.03, + "grad_norm": 0.09463541202404317, + "learning_rate": 9.814141414141415e-05, + "loss": 0.3568, + "num_input_tokens_seen": 2908160000, + "step": 28400 + }, + { + "epoch": 0.03, + "grad_norm": 0.07740621416559264, + "learning_rate": 9.813131313131314e-05, + "loss": 0.3584, + "num_input_tokens_seen": 2918400000, + "step": 28500 + }, + { + "epoch": 0.03, + "grad_norm": 0.18716247298945485, + "learning_rate": 9.812121212121212e-05, + "loss": 0.3592, + "num_input_tokens_seen": 2928640000, + "step": 28600 + }, + { + "epoch": 0.03, + "grad_norm": 0.15144869547979808, + "learning_rate": 9.811111111111113e-05, + "loss": 0.3611, + "num_input_tokens_seen": 2938880000, + "step": 28700 + }, + { + "epoch": 0.03, + "grad_norm": 0.08190077384648202, + "learning_rate": 9.81010101010101e-05, + "loss": 0.3588, + "num_input_tokens_seen": 2949120000, + "step": 28800 + }, + { + "epoch": 0.03, + "grad_norm": 0.09443291148084239, + "learning_rate": 9.80909090909091e-05, + "loss": 0.3608, + "num_input_tokens_seen": 2959360000, + "step": 28900 + }, + { + "epoch": 0.03, + "grad_norm": 0.07502971721046336, + "learning_rate": 9.808080808080809e-05, + "loss": 0.356, + "num_input_tokens_seen": 2969600000, + "step": 29000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.35357406801707514, + "eval_average_loss_on_sentence_tokens": 0.3986508759030095, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.35554686188697815, + "eval_non_padding_tokens_in_labels": 133.5861, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40495, + "eval_padding_tokens_in_labels": 378.4139, + "eval_reconstruction_accuracy": 0.9268235948088003, + "eval_runtime": 261.4153, + "eval_samples_per_second": 19.127, + "eval_sentence_accuracy": 0.7389685430760673, + "eval_steps_per_second": 0.05, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 2969600000, + "step": 29000 + }, + { + "epoch": 0.03, + "grad_norm": 0.12099518917998703, + "learning_rate": 9.807070707070708e-05, + "loss": 0.358, + "num_input_tokens_seen": 2979840000, + "step": 29100 + }, + { + "epoch": 0.03, + "grad_norm": 0.0940573888242316, + "learning_rate": 9.806060606060606e-05, + "loss": 0.3589, + "num_input_tokens_seen": 2990080000, + "step": 29200 + }, + { + "epoch": 0.03, + "grad_norm": 0.12086193137495242, + "learning_rate": 9.805050505050506e-05, + "loss": 0.3593, + "num_input_tokens_seen": 3000320000, + "step": 29300 + }, + { + "epoch": 0.03, + "grad_norm": 0.0944554231200817, + "learning_rate": 9.804040404040404e-05, + "loss": 0.3602, + "num_input_tokens_seen": 3010560000, + "step": 29400 + }, + { + "epoch": 0.03, + "grad_norm": 0.07325507209615767, + "learning_rate": 9.803030303030303e-05, + "loss": 0.3554, + "num_input_tokens_seen": 3020800000, + "step": 29500 + }, + { + "epoch": 0.03, + "grad_norm": 0.11685171099918144, + "learning_rate": 9.802020202020203e-05, + "loss": 0.3562, + "num_input_tokens_seen": 3031040000, + "step": 29600 + }, + { + "epoch": 0.03, + "grad_norm": 0.09985223433551571, + "learning_rate": 9.801010101010102e-05, + "loss": 0.357, + "num_input_tokens_seen": 3041280000, + "step": 29700 + }, + { + "epoch": 0.03, + "grad_norm": 0.17135952464793094, + "learning_rate": 9.8e-05, + "loss": 0.3586, + "num_input_tokens_seen": 3051520000, + "step": 29800 + }, + { + "epoch": 0.03, + "grad_norm": 0.10098197662404099, + "learning_rate": 9.7989898989899e-05, + "loss": 0.3567, + "num_input_tokens_seen": 3061760000, + "step": 29900 + }, + { + "epoch": 0.03, + "grad_norm": 0.15149886956103967, + "learning_rate": 9.797979797979798e-05, + "loss": 0.3591, + "num_input_tokens_seen": 3072000000, + "step": 30000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.35290258691268767, + "eval_average_loss_on_sentence_tokens": 0.39201884173473056, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.3546484410762787, + "eval_non_padding_tokens_in_labels": 133.53105, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3804, + "eval_padding_tokens_in_labels": 378.46895, + "eval_reconstruction_accuracy": 0.9266994940320824, + "eval_runtime": 191.0999, + "eval_samples_per_second": 26.164, + "eval_sentence_accuracy": 0.7558723778419797, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 3072000000, + "step": 30000 + }, + { + "epoch": 0.03, + "grad_norm": 0.09305983056837887, + "learning_rate": 9.796969696969697e-05, + "loss": 0.3583, + "num_input_tokens_seen": 3082240000, + "step": 30100 + }, + { + "epoch": 0.03, + "grad_norm": 0.08551824234236988, + "learning_rate": 9.795959595959596e-05, + "loss": 0.3592, + "num_input_tokens_seen": 3092480000, + "step": 30200 + }, + { + "epoch": 0.03, + "grad_norm": 0.0770125852850996, + "learning_rate": 9.794949494949496e-05, + "loss": 0.3567, + "num_input_tokens_seen": 3102720000, + "step": 30300 + }, + { + "epoch": 0.03, + "grad_norm": 0.09387334765692501, + "learning_rate": 9.793939393939394e-05, + "loss": 0.3587, + "num_input_tokens_seen": 3112960000, + "step": 30400 + }, + { + "epoch": 0.03, + "grad_norm": 0.08424991391139344, + "learning_rate": 9.792929292929294e-05, + "loss": 0.3588, + "num_input_tokens_seen": 3123200000, + "step": 30500 + }, + { + "epoch": 0.03, + "grad_norm": 0.12246957334765836, + "learning_rate": 9.791919191919192e-05, + "loss": 0.3557, + "num_input_tokens_seen": 3133440000, + "step": 30600 + }, + { + "epoch": 0.03, + "grad_norm": 0.10614110629234996, + "learning_rate": 9.790909090909091e-05, + "loss": 0.3561, + "num_input_tokens_seen": 3143680000, + "step": 30700 + }, + { + "epoch": 0.03, + "grad_norm": 0.13356568839284386, + "learning_rate": 9.78989898989899e-05, + "loss": 0.3574, + "num_input_tokens_seen": 3153920000, + "step": 30800 + }, + { + "epoch": 0.03, + "grad_norm": 0.0895629230825024, + "learning_rate": 9.78888888888889e-05, + "loss": 0.3563, + "num_input_tokens_seen": 3164160000, + "step": 30900 + }, + { + "epoch": 0.03, + "grad_norm": 0.16700400204720783, + "learning_rate": 9.787878787878789e-05, + "loss": 0.3546, + "num_input_tokens_seen": 3174400000, + "step": 31000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3518938701253859, + "eval_average_loss_on_sentence_tokens": 0.38815311963009613, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.3535449206829071, + "eval_non_padding_tokens_in_labels": 133.55145, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3702, + "eval_padding_tokens_in_labels": 378.44855, + "eval_reconstruction_accuracy": 0.9271468005770392, + "eval_runtime": 197.7139, + "eval_samples_per_second": 25.289, + "eval_sentence_accuracy": 0.7477883252283454, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 3174400000, + "step": 31000 + }, + { + "epoch": 0.03, + "grad_norm": 0.07575415582861136, + "learning_rate": 9.786868686868688e-05, + "loss": 0.3548, + "num_input_tokens_seen": 3184640000, + "step": 31100 + }, + { + "epoch": 0.03, + "grad_norm": 0.10933940810732458, + "learning_rate": 9.785858585858586e-05, + "loss": 0.3562, + "num_input_tokens_seen": 3194880000, + "step": 31200 + }, + { + "epoch": 0.03, + "grad_norm": 0.11331540242392588, + "learning_rate": 9.784848484848485e-05, + "loss": 0.3523, + "num_input_tokens_seen": 3205120000, + "step": 31300 + }, + { + "epoch": 0.03, + "grad_norm": 0.11292417128466069, + "learning_rate": 9.783838383838384e-05, + "loss": 0.356, + "num_input_tokens_seen": 3215360000, + "step": 31400 + }, + { + "epoch": 0.03, + "grad_norm": 0.08401367596771823, + "learning_rate": 9.782828282828283e-05, + "loss": 0.3545, + "num_input_tokens_seen": 3225600000, + "step": 31500 + }, + { + "epoch": 0.03, + "grad_norm": 0.09355949128705393, + "learning_rate": 9.781818181818183e-05, + "loss": 0.3555, + "num_input_tokens_seen": 3235840000, + "step": 31600 + }, + { + "epoch": 0.03, + "grad_norm": 0.14463991996407455, + "learning_rate": 9.780808080808082e-05, + "loss": 0.3561, + "num_input_tokens_seen": 3246080000, + "step": 31700 + }, + { + "epoch": 0.03, + "grad_norm": 0.09184912325885994, + "learning_rate": 9.77979797979798e-05, + "loss": 0.3539, + "num_input_tokens_seen": 3256320000, + "step": 31800 + }, + { + "epoch": 0.03, + "grad_norm": 0.0846744714701543, + "learning_rate": 9.77878787878788e-05, + "loss": 0.3576, + "num_input_tokens_seen": 3266560000, + "step": 31900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09247798050269185, + "learning_rate": 9.777777777777778e-05, + "loss": 0.3542, + "num_input_tokens_seen": 3276800000, + "step": 32000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.35196479632527816, + "eval_average_loss_on_sentence_tokens": 0.3895615696675367, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.35371094942092896, + "eval_non_padding_tokens_in_labels": 133.5149, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3711, + "eval_padding_tokens_in_labels": 378.4851, + "eval_reconstruction_accuracy": 0.9270111993418964, + "eval_runtime": 197.3653, + "eval_samples_per_second": 25.334, + "eval_sentence_accuracy": 0.7485016239883718, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 3276800000, + "step": 32000 + }, + { + "epoch": 0.03, + "grad_norm": 0.07531861469615371, + "learning_rate": 9.776767676767677e-05, + "loss": 0.3572, + "num_input_tokens_seen": 3287040000, + "step": 32100 + }, + { + "epoch": 0.03, + "grad_norm": 0.08921473134346632, + "learning_rate": 9.775757575757576e-05, + "loss": 0.3577, + "num_input_tokens_seen": 3297280000, + "step": 32200 + }, + { + "epoch": 0.03, + "grad_norm": 0.08242856745419479, + "learning_rate": 9.774747474747476e-05, + "loss": 0.3564, + "num_input_tokens_seen": 3307520000, + "step": 32300 + }, + { + "epoch": 0.03, + "grad_norm": 0.0776079427423947, + "learning_rate": 9.773737373737373e-05, + "loss": 0.3565, + "num_input_tokens_seen": 3317760000, + "step": 32400 + }, + { + "epoch": 0.03, + "grad_norm": 0.09154570361797731, + "learning_rate": 9.772727272727274e-05, + "loss": 0.3597, + "num_input_tokens_seen": 3328000000, + "step": 32500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08798693704583865, + "learning_rate": 9.771717171717172e-05, + "loss": 0.3588, + "num_input_tokens_seen": 3338240000, + "step": 32600 + }, + { + "epoch": 0.03, + "grad_norm": 0.14094010124428097, + "learning_rate": 9.770707070707071e-05, + "loss": 0.3521, + "num_input_tokens_seen": 3348480000, + "step": 32700 + }, + { + "epoch": 0.03, + "grad_norm": 0.10830459527814777, + "learning_rate": 9.76969696969697e-05, + "loss": 0.3574, + "num_input_tokens_seen": 3358720000, + "step": 32800 + }, + { + "epoch": 0.03, + "grad_norm": 0.09838650064462762, + "learning_rate": 9.76868686868687e-05, + "loss": 0.3572, + "num_input_tokens_seen": 3368960000, + "step": 32900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09892064059533792, + "learning_rate": 9.767676767676767e-05, + "loss": 0.3565, + "num_input_tokens_seen": 3379200000, + "step": 33000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.35155470784653364, + "eval_average_loss_on_sentence_tokens": 0.40582679882311185, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3540332019329071, + "eval_non_padding_tokens_in_labels": 133.51405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3702, + "eval_padding_tokens_in_labels": 378.48595, + "eval_reconstruction_accuracy": 0.9271658648993643, + "eval_runtime": 181.0187, + "eval_samples_per_second": 27.621, + "eval_sentence_accuracy": 0.7331141098569813, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 3379200000, + "step": 33000 + }, + { + "epoch": 0.03, + "grad_norm": 0.09490672410526875, + "learning_rate": 9.766666666666668e-05, + "loss": 0.3565, + "num_input_tokens_seen": 3389440000, + "step": 33100 + }, + { + "epoch": 0.03, + "grad_norm": 0.12053277067911962, + "learning_rate": 9.765656565656566e-05, + "loss": 0.3572, + "num_input_tokens_seen": 3399680000, + "step": 33200 + }, + { + "epoch": 0.03, + "grad_norm": 0.10292702464218745, + "learning_rate": 9.764646464646465e-05, + "loss": 0.3603, + "num_input_tokens_seen": 3409920000, + "step": 33300 + }, + { + "epoch": 0.03, + "grad_norm": 0.08015467892235541, + "learning_rate": 9.763636363636364e-05, + "loss": 0.3574, + "num_input_tokens_seen": 3420160000, + "step": 33400 + }, + { + "epoch": 0.03, + "grad_norm": 0.18993096702935208, + "learning_rate": 9.762626262626263e-05, + "loss": 0.3557, + "num_input_tokens_seen": 3430400000, + "step": 33500 + }, + { + "epoch": 0.03, + "grad_norm": 0.10957549655774815, + "learning_rate": 9.761616161616161e-05, + "loss": 0.3558, + "num_input_tokens_seen": 3440640000, + "step": 33600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07449058267938093, + "learning_rate": 9.760606060606062e-05, + "loss": 0.3578, + "num_input_tokens_seen": 3450880000, + "step": 33700 + }, + { + "epoch": 0.03, + "grad_norm": 0.14417146350264, + "learning_rate": 9.75959595959596e-05, + "loss": 0.3548, + "num_input_tokens_seen": 3461120000, + "step": 33800 + }, + { + "epoch": 0.03, + "grad_norm": 0.08946034525171458, + "learning_rate": 9.758585858585859e-05, + "loss": 0.357, + "num_input_tokens_seen": 3471360000, + "step": 33900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09929782399145008, + "learning_rate": 9.757575757575758e-05, + "loss": 0.3545, + "num_input_tokens_seen": 3481600000, + "step": 34000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3509142458735924, + "eval_average_loss_on_sentence_tokens": 0.38850409086580706, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.3526367247104645, + "eval_non_padding_tokens_in_labels": 133.5323, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38085, + "eval_padding_tokens_in_labels": 378.4677, + "eval_reconstruction_accuracy": 0.9272810225354334, + "eval_runtime": 359.0149, + "eval_samples_per_second": 13.927, + "eval_sentence_accuracy": 0.7475191558849391, + "eval_steps_per_second": 0.036, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 3481600000, + "step": 34000 + }, + { + "epoch": 0.03, + "grad_norm": 0.08803042578438312, + "learning_rate": 9.756565656565657e-05, + "loss": 0.3551, + "num_input_tokens_seen": 3491840000, + "step": 34100 + }, + { + "epoch": 0.03, + "grad_norm": 0.09000227511778754, + "learning_rate": 9.755555555555555e-05, + "loss": 0.3528, + "num_input_tokens_seen": 3502080000, + "step": 34200 + }, + { + "epoch": 0.03, + "grad_norm": 0.07184009339288842, + "learning_rate": 9.754545454545455e-05, + "loss": 0.3543, + "num_input_tokens_seen": 3512320000, + "step": 34300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06955373164847028, + "learning_rate": 9.753535353535353e-05, + "loss": 0.353, + "num_input_tokens_seen": 3522560000, + "step": 34400 + }, + { + "epoch": 0.03, + "grad_norm": 0.10089382714915662, + "learning_rate": 9.752525252525253e-05, + "loss": 0.3566, + "num_input_tokens_seen": 3532800000, + "step": 34500 + }, + { + "epoch": 0.03, + "grad_norm": 0.07870663703311585, + "learning_rate": 9.751515151515152e-05, + "loss": 0.3539, + "num_input_tokens_seen": 3543040000, + "step": 34600 + }, + { + "epoch": 0.03, + "grad_norm": 0.11014152683790428, + "learning_rate": 9.750505050505051e-05, + "loss": 0.3548, + "num_input_tokens_seen": 3553280000, + "step": 34700 + }, + { + "epoch": 0.03, + "grad_norm": 0.06868101643127211, + "learning_rate": 9.74949494949495e-05, + "loss": 0.3536, + "num_input_tokens_seen": 3563520000, + "step": 34800 + }, + { + "epoch": 0.03, + "grad_norm": 0.07749836067227124, + "learning_rate": 9.748484848484849e-05, + "loss": 0.3508, + "num_input_tokens_seen": 3573760000, + "step": 34900 + }, + { + "epoch": 0.04, + "grad_norm": 0.08979481162416578, + "learning_rate": 9.747474747474747e-05, + "loss": 0.3564, + "num_input_tokens_seen": 3584000000, + "step": 35000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.35122013557891557, + "eval_average_loss_on_sentence_tokens": 0.41586361580380293, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.3541015684604645, + "eval_non_padding_tokens_in_labels": 133.5492, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3798, + "eval_padding_tokens_in_labels": 378.4508, + "eval_reconstruction_accuracy": 0.9271378208075225, + "eval_runtime": 270.4205, + "eval_samples_per_second": 18.49, + "eval_sentence_accuracy": 0.7214770219103845, + "eval_steps_per_second": 0.048, + "eval_variance_shuffling_prob": 0.2483999999999999, + "num_input_tokens_seen": 3584000000, + "step": 35000 + }, + { + "epoch": 0.04, + "grad_norm": 0.07645890978982182, + "learning_rate": 9.746464646464646e-05, + "loss": 0.356, + "num_input_tokens_seen": 3594240000, + "step": 35100 + }, + { + "epoch": 0.04, + "grad_norm": 0.0916052400708867, + "learning_rate": 9.745454545454546e-05, + "loss": 0.3552, + "num_input_tokens_seen": 3604480000, + "step": 35200 + }, + { + "epoch": 0.04, + "grad_norm": 0.08952154557991786, + "learning_rate": 9.744444444444445e-05, + "loss": 0.3541, + "num_input_tokens_seen": 3614720000, + "step": 35300 + }, + { + "epoch": 0.04, + "grad_norm": 0.08791688155998088, + "learning_rate": 9.743434343434344e-05, + "loss": 0.3574, + "num_input_tokens_seen": 3624960000, + "step": 35400 + }, + { + "epoch": 0.04, + "grad_norm": 0.13359938353098977, + "learning_rate": 9.742424242424243e-05, + "loss": 0.3549, + "num_input_tokens_seen": 3635200000, + "step": 35500 + }, + { + "epoch": 0.04, + "grad_norm": 0.09505341612739326, + "learning_rate": 9.741414141414141e-05, + "loss": 0.3537, + "num_input_tokens_seen": 3645440000, + "step": 35600 + }, + { + "epoch": 0.04, + "grad_norm": 0.10653641066048555, + "learning_rate": 9.740404040404042e-05, + "loss": 0.3546, + "num_input_tokens_seen": 3655680000, + "step": 35700 + }, + { + "epoch": 0.04, + "grad_norm": 0.08945351052929952, + "learning_rate": 9.739393939393941e-05, + "loss": 0.3538, + "num_input_tokens_seen": 3665920000, + "step": 35800 + }, + { + "epoch": 0.04, + "grad_norm": 0.08116504613386974, + "learning_rate": 9.738383838383839e-05, + "loss": 0.3574, + "num_input_tokens_seen": 3676160000, + "step": 35900 + }, + { + "epoch": 0.04, + "grad_norm": 0.06854538474040119, + "learning_rate": 9.737373737373738e-05, + "loss": 0.3543, + "num_input_tokens_seen": 3686400000, + "step": 36000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3498998822191885, + "eval_average_loss_on_sentence_tokens": 0.4090852957153307, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.3526074290275574, + "eval_non_padding_tokens_in_labels": 133.5598, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3852, + "eval_padding_tokens_in_labels": 378.4402, + "eval_reconstruction_accuracy": 0.927419787988829, + "eval_runtime": 249.7215, + "eval_samples_per_second": 20.022, + "eval_sentence_accuracy": 0.7282286862741579, + "eval_steps_per_second": 0.052, + "eval_variance_shuffling_prob": 0.2483999999999999, + "num_input_tokens_seen": 3686400000, + "step": 36000 + }, + { + "epoch": 0.04, + "grad_norm": 0.08296180038881723, + "learning_rate": 9.736363636363637e-05, + "loss": 0.3539, + "num_input_tokens_seen": 3696640000, + "step": 36100 + }, + { + "epoch": 0.04, + "grad_norm": 0.09485430068432767, + "learning_rate": 9.735353535353536e-05, + "loss": 0.3526, + "num_input_tokens_seen": 3706880000, + "step": 36200 + }, + { + "epoch": 0.04, + "grad_norm": 0.09633669313516958, + "learning_rate": 9.734343434343435e-05, + "loss": 0.353, + "num_input_tokens_seen": 3717120000, + "step": 36300 + }, + { + "epoch": 0.04, + "grad_norm": 0.10659760333963676, + "learning_rate": 9.733333333333335e-05, + "loss": 0.3559, + "num_input_tokens_seen": 3727360000, + "step": 36400 + }, + { + "epoch": 0.04, + "grad_norm": 0.0904989581784129, + "learning_rate": 9.732323232323232e-05, + "loss": 0.3538, + "num_input_tokens_seen": 3737600000, + "step": 36500 + }, + { + "epoch": 0.04, + "grad_norm": 0.0914969604994927, + "learning_rate": 9.731313131313132e-05, + "loss": 0.3562, + "num_input_tokens_seen": 3747840000, + "step": 36600 + }, + { + "epoch": 0.04, + "grad_norm": 0.10396423837018595, + "learning_rate": 9.730303030303031e-05, + "loss": 0.3551, + "num_input_tokens_seen": 3758080000, + "step": 36700 + }, + { + "epoch": 0.04, + "grad_norm": 0.09615253011440184, + "learning_rate": 9.72929292929293e-05, + "loss": 0.3515, + "num_input_tokens_seen": 3768320000, + "step": 36800 + }, + { + "epoch": 0.04, + "grad_norm": 0.08674715388855216, + "learning_rate": 9.728282828282829e-05, + "loss": 0.3553, + "num_input_tokens_seen": 3778560000, + "step": 36900 + }, + { + "epoch": 0.04, + "grad_norm": 0.08530404701816965, + "learning_rate": 9.727272727272728e-05, + "loss": 0.3518, + "num_input_tokens_seen": 3788800000, + "step": 37000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3484217828828089, + "eval_average_loss_on_sentence_tokens": 0.3696153863841962, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.3493945300579071, + "eval_non_padding_tokens_in_labels": 133.5405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3904, + "eval_padding_tokens_in_labels": 378.4595, + "eval_reconstruction_accuracy": 0.9276455169735485, + "eval_runtime": 330.4117, + "eval_samples_per_second": 15.133, + "eval_sentence_accuracy": 0.7642480305776375, + "eval_steps_per_second": 0.039, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 3788800000, + "step": 37000 + }, + { + "epoch": 0.04, + "grad_norm": 0.08174306623064383, + "learning_rate": 9.726262626262626e-05, + "loss": 0.3534, + "num_input_tokens_seen": 3799040000, + "step": 37100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07512782059048059, + "learning_rate": 9.725252525252527e-05, + "loss": 0.3548, + "num_input_tokens_seen": 3809280000, + "step": 37200 + }, + { + "epoch": 0.04, + "grad_norm": 0.09527099127637856, + "learning_rate": 9.724242424242425e-05, + "loss": 0.3512, + "num_input_tokens_seen": 3819520000, + "step": 37300 + }, + { + "epoch": 0.04, + "grad_norm": 0.10304517049370258, + "learning_rate": 9.723232323232324e-05, + "loss": 0.3549, + "num_input_tokens_seen": 3829760000, + "step": 37400 + }, + { + "epoch": 0.04, + "grad_norm": 0.08120969089956792, + "learning_rate": 9.722222222222223e-05, + "loss": 0.3531, + "num_input_tokens_seen": 3840000000, + "step": 37500 + }, + { + "epoch": 0.04, + "grad_norm": 0.07643149222238022, + "learning_rate": 9.721212121212122e-05, + "loss": 0.3554, + "num_input_tokens_seen": 3850240000, + "step": 37600 + }, + { + "epoch": 0.04, + "grad_norm": 0.09751085902533685, + "learning_rate": 9.72020202020202e-05, + "loss": 0.3525, + "num_input_tokens_seen": 3860480000, + "step": 37700 + }, + { + "epoch": 0.04, + "grad_norm": 0.14154694589107147, + "learning_rate": 9.71919191919192e-05, + "loss": 0.3536, + "num_input_tokens_seen": 3870720000, + "step": 37800 + }, + { + "epoch": 0.04, + "grad_norm": 0.09233886679616624, + "learning_rate": 9.718181818181818e-05, + "loss": 0.3535, + "num_input_tokens_seen": 3880960000, + "step": 37900 + }, + { + "epoch": 0.04, + "grad_norm": 0.11880033784495789, + "learning_rate": 9.717171717171718e-05, + "loss": 0.3525, + "num_input_tokens_seen": 3891200000, + "step": 38000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.34948382010047147, + "eval_average_loss_on_sentence_tokens": 0.36649516113498937, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.3502246141433716, + "eval_non_padding_tokens_in_labels": 133.5347, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36275, + "eval_padding_tokens_in_labels": 378.4653, + "eval_reconstruction_accuracy": 0.9273563243636103, + "eval_runtime": 586.2161, + "eval_samples_per_second": 8.529, + "eval_sentence_accuracy": 0.7661636190715452, + "eval_steps_per_second": 0.022, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 3891200000, + "step": 38000 + }, + { + "epoch": 0.04, + "grad_norm": 0.11335485632084323, + "learning_rate": 9.716161616161617e-05, + "loss": 0.3507, + "num_input_tokens_seen": 3901440000, + "step": 38100 + }, + { + "epoch": 0.04, + "grad_norm": 0.08629676111017986, + "learning_rate": 9.715151515151516e-05, + "loss": 0.353, + "num_input_tokens_seen": 3911680000, + "step": 38200 + }, + { + "epoch": 0.04, + "grad_norm": 0.11199524167103836, + "learning_rate": 9.714141414141414e-05, + "loss": 0.352, + "num_input_tokens_seen": 3921920000, + "step": 38300 + }, + { + "epoch": 0.04, + "grad_norm": 0.06529766434739856, + "learning_rate": 9.713131313131314e-05, + "loss": 0.3547, + "num_input_tokens_seen": 3932160000, + "step": 38400 + }, + { + "epoch": 0.04, + "grad_norm": 0.08193090238913554, + "learning_rate": 9.712121212121212e-05, + "loss": 0.3506, + "num_input_tokens_seen": 3942400000, + "step": 38500 + }, + { + "epoch": 0.04, + "grad_norm": 0.11013901124148924, + "learning_rate": 9.711111111111111e-05, + "loss": 0.3569, + "num_input_tokens_seen": 3952640000, + "step": 38600 + }, + { + "epoch": 0.04, + "grad_norm": 0.11756605160784518, + "learning_rate": 9.710101010101011e-05, + "loss": 0.3523, + "num_input_tokens_seen": 3962880000, + "step": 38700 + }, + { + "epoch": 0.04, + "grad_norm": 0.09220990135090996, + "learning_rate": 9.70909090909091e-05, + "loss": 0.3485, + "num_input_tokens_seen": 3973120000, + "step": 38800 + }, + { + "epoch": 0.04, + "grad_norm": 0.1543880808747756, + "learning_rate": 9.708080808080808e-05, + "loss": 0.3511, + "num_input_tokens_seen": 3983360000, + "step": 38900 + }, + { + "epoch": 0.04, + "grad_norm": 0.06907611844268396, + "learning_rate": 9.707070707070708e-05, + "loss": 0.3535, + "num_input_tokens_seen": 3993600000, + "step": 39000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3488292401244654, + "eval_average_loss_on_sentence_tokens": 0.3705700660187073, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.3497949242591858, + "eval_non_padding_tokens_in_labels": 133.53465, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3794, + "eval_padding_tokens_in_labels": 378.46535, + "eval_reconstruction_accuracy": 0.9275949833147875, + "eval_runtime": 744.0291, + "eval_samples_per_second": 6.72, + "eval_sentence_accuracy": 0.7626554452958171, + "eval_steps_per_second": 0.017, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 3993600000, + "step": 39000 + }, + { + "epoch": 0.04, + "grad_norm": 0.08295119963612214, + "learning_rate": 9.706060606060606e-05, + "loss": 0.3533, + "num_input_tokens_seen": 4003840000, + "step": 39100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07054661643286092, + "learning_rate": 9.705050505050505e-05, + "loss": 0.3559, + "num_input_tokens_seen": 4014080000, + "step": 39200 + }, + { + "epoch": 0.04, + "grad_norm": 0.07907961611594057, + "learning_rate": 9.704040404040405e-05, + "loss": 0.3534, + "num_input_tokens_seen": 4024320000, + "step": 39300 + }, + { + "epoch": 0.04, + "grad_norm": 0.08172738613360542, + "learning_rate": 9.703030303030304e-05, + "loss": 0.3526, + "num_input_tokens_seen": 4034560000, + "step": 39400 + }, + { + "epoch": 0.04, + "grad_norm": 0.08528553680718005, + "learning_rate": 9.702020202020202e-05, + "loss": 0.3566, + "num_input_tokens_seen": 4044800000, + "step": 39500 + }, + { + "epoch": 0.04, + "grad_norm": 0.1266194635551376, + "learning_rate": 9.701010101010102e-05, + "loss": 0.3556, + "num_input_tokens_seen": 4055040000, + "step": 39600 + }, + { + "epoch": 0.04, + "grad_norm": 0.09926869766572258, + "learning_rate": 9.7e-05, + "loss": 0.3528, + "num_input_tokens_seen": 4065280000, + "step": 39700 + }, + { + "epoch": 0.04, + "grad_norm": 0.08236671863255297, + "learning_rate": 9.698989898989899e-05, + "loss": 0.3545, + "num_input_tokens_seen": 4075520000, + "step": 39800 + }, + { + "epoch": 0.04, + "grad_norm": 0.06734181276309772, + "learning_rate": 9.697979797979798e-05, + "loss": 0.3592, + "num_input_tokens_seen": 4085760000, + "step": 39900 + }, + { + "epoch": 0.04, + "grad_norm": 0.08602425276676824, + "learning_rate": 9.696969696969698e-05, + "loss": 0.354, + "num_input_tokens_seen": 4096000000, + "step": 40000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3485691012142072, + "eval_average_loss_on_sentence_tokens": 0.3997253977245675, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.35086914896965027, + "eval_non_padding_tokens_in_labels": 133.5027, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3884, + "eval_padding_tokens_in_labels": 378.4973, + "eval_reconstruction_accuracy": 0.927575279916467, + "eval_runtime": 523.4516, + "eval_samples_per_second": 9.552, + "eval_sentence_accuracy": 0.7410501193317423, + "eval_steps_per_second": 0.025, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 4096000000, + "step": 40000 + }, + { + "epoch": 0.04, + "grad_norm": 0.09213212105692051, + "learning_rate": 9.695959595959597e-05, + "loss": 0.3518, + "num_input_tokens_seen": 4106240000, + "step": 40100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07481624218965519, + "learning_rate": 9.694949494949496e-05, + "loss": 0.3514, + "num_input_tokens_seen": 4116480000, + "step": 40200 + }, + { + "epoch": 0.04, + "grad_norm": 0.07322344540170503, + "learning_rate": 9.693939393939394e-05, + "loss": 0.3558, + "num_input_tokens_seen": 4126720000, + "step": 40300 + }, + { + "epoch": 0.04, + "grad_norm": 0.14710467451337966, + "learning_rate": 9.692929292929293e-05, + "loss": 0.3536, + "num_input_tokens_seen": 4136960000, + "step": 40400 + }, + { + "epoch": 0.04, + "grad_norm": 0.09670546457764569, + "learning_rate": 9.691919191919192e-05, + "loss": 0.3524, + "num_input_tokens_seen": 4147200000, + "step": 40500 + }, + { + "epoch": 0.04, + "grad_norm": 0.09777138598519103, + "learning_rate": 9.690909090909091e-05, + "loss": 0.3497, + "num_input_tokens_seen": 4157440000, + "step": 40600 + }, + { + "epoch": 0.04, + "grad_norm": 0.12320811250989316, + "learning_rate": 9.68989898989899e-05, + "loss": 0.3496, + "num_input_tokens_seen": 4167680000, + "step": 40700 + }, + { + "epoch": 0.04, + "grad_norm": 0.06759419491328698, + "learning_rate": 9.68888888888889e-05, + "loss": 0.3536, + "num_input_tokens_seen": 4177920000, + "step": 40800 + }, + { + "epoch": 0.04, + "grad_norm": 0.07220955232672013, + "learning_rate": 9.687878787878788e-05, + "loss": 0.3505, + "num_input_tokens_seen": 4188160000, + "step": 40900 + }, + { + "epoch": 0.04, + "grad_norm": 0.09817091980878821, + "learning_rate": 9.686868686868688e-05, + "loss": 0.3512, + "num_input_tokens_seen": 4198400000, + "step": 41000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3483534712782303, + "eval_average_loss_on_sentence_tokens": 0.3845320189648232, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3499511778354645, + "eval_non_padding_tokens_in_labels": 133.53665, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38535, + "eval_padding_tokens_in_labels": 378.46335, + "eval_reconstruction_accuracy": 0.9276118321947775, + "eval_runtime": 743.6183, + "eval_samples_per_second": 6.724, + "eval_sentence_accuracy": 0.7493853966658891, + "eval_steps_per_second": 0.017, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 4198400000, + "step": 41000 + }, + { + "epoch": 0.04, + "grad_norm": 0.09282196682997787, + "learning_rate": 9.685858585858586e-05, + "loss": 0.3539, + "num_input_tokens_seen": 4208640000, + "step": 41100 + }, + { + "epoch": 0.04, + "grad_norm": 0.0972639656787111, + "learning_rate": 9.684848484848485e-05, + "loss": 0.3546, + "num_input_tokens_seen": 4218880000, + "step": 41200 + }, + { + "epoch": 0.04, + "grad_norm": 0.07699383715261104, + "learning_rate": 9.683838383838384e-05, + "loss": 0.3546, + "num_input_tokens_seen": 4229120000, + "step": 41300 + }, + { + "epoch": 0.04, + "grad_norm": 0.08550064991420245, + "learning_rate": 9.682828282828284e-05, + "loss": 0.3487, + "num_input_tokens_seen": 4239360000, + "step": 41400 + }, + { + "epoch": 0.04, + "grad_norm": 0.08663281868860562, + "learning_rate": 9.681818181818181e-05, + "loss": 0.3546, + "num_input_tokens_seen": 4249600000, + "step": 41500 + }, + { + "epoch": 0.04, + "grad_norm": 0.1314091237260836, + "learning_rate": 9.680808080808082e-05, + "loss": 0.3532, + "num_input_tokens_seen": 4259840000, + "step": 41600 + }, + { + "epoch": 0.04, + "grad_norm": 0.08230853332927189, + "learning_rate": 9.67979797979798e-05, + "loss": 0.3526, + "num_input_tokens_seen": 4270080000, + "step": 41700 + }, + { + "epoch": 0.04, + "grad_norm": 0.09058968810747087, + "learning_rate": 9.678787878787879e-05, + "loss": 0.3529, + "num_input_tokens_seen": 4280320000, + "step": 41800 + }, + { + "epoch": 0.04, + "grad_norm": 0.08362187982921145, + "learning_rate": 9.677777777777778e-05, + "loss": 0.3508, + "num_input_tokens_seen": 4290560000, + "step": 41900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07496401431372365, + "learning_rate": 9.676767676767677e-05, + "loss": 0.353, + "num_input_tokens_seen": 4300800000, + "step": 42000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.34775059511786016, + "eval_average_loss_on_sentence_tokens": 0.3694599702748092, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.34871092438697815, + "eval_non_padding_tokens_in_labels": 133.54165, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39725, + "eval_padding_tokens_in_labels": 378.45835, + "eval_reconstruction_accuracy": 0.9277243161449561, + "eval_runtime": 595.3764, + "eval_samples_per_second": 8.398, + "eval_sentence_accuracy": 0.7617492418396827, + "eval_steps_per_second": 0.022, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 4300800000, + "step": 42000 + }, + { + "epoch": 0.04, + "grad_norm": 0.07745974541235941, + "learning_rate": 9.675757575757575e-05, + "loss": 0.3498, + "num_input_tokens_seen": 4311040000, + "step": 42100 + }, + { + "epoch": 0.04, + "grad_norm": 0.13083839604188546, + "learning_rate": 9.674747474747476e-05, + "loss": 0.3495, + "num_input_tokens_seen": 4321280000, + "step": 42200 + }, + { + "epoch": 0.04, + "grad_norm": 0.07095853561131785, + "learning_rate": 9.673737373737374e-05, + "loss": 0.3491, + "num_input_tokens_seen": 4331520000, + "step": 42300 + }, + { + "epoch": 0.04, + "grad_norm": 0.0826937695356135, + "learning_rate": 9.672727272727273e-05, + "loss": 0.3518, + "num_input_tokens_seen": 4341760000, + "step": 42400 + }, + { + "epoch": 0.04, + "grad_norm": 0.07144469426191587, + "learning_rate": 9.671717171717172e-05, + "loss": 0.3529, + "num_input_tokens_seen": 4352000000, + "step": 42500 + }, + { + "epoch": 0.04, + "grad_norm": 0.0952073338799254, + "learning_rate": 9.670707070707071e-05, + "loss": 0.3504, + "num_input_tokens_seen": 4362240000, + "step": 42600 + }, + { + "epoch": 0.04, + "grad_norm": 1.1635480678499523, + "learning_rate": 9.669696969696969e-05, + "loss": 0.353, + "num_input_tokens_seen": 4372480000, + "step": 42700 + }, + { + "epoch": 0.04, + "grad_norm": 0.10093179056979647, + "learning_rate": 9.66868686868687e-05, + "loss": 0.358, + "num_input_tokens_seen": 4382720000, + "step": 42800 + }, + { + "epoch": 0.04, + "grad_norm": 0.08252292123862574, + "learning_rate": 9.667676767676768e-05, + "loss": 0.3505, + "num_input_tokens_seen": 4392960000, + "step": 42900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07931318937389821, + "learning_rate": 9.666666666666667e-05, + "loss": 0.3493, + "num_input_tokens_seen": 4403200000, + "step": 43000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3464869087189977, + "eval_average_loss_on_sentence_tokens": 0.35568430700376136, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.34688475728034973, + "eval_non_padding_tokens_in_labels": 133.48765, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3791, + "eval_padding_tokens_in_labels": 378.51235, + "eval_reconstruction_accuracy": 0.9279186730743927, + "eval_runtime": 190.6024, + "eval_samples_per_second": 26.233, + "eval_sentence_accuracy": 0.7715470059396702, + "eval_steps_per_second": 0.068, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 4403200000, + "step": 43000 + }, + { + "epoch": 0.04, + "grad_norm": 0.08185624858381875, + "learning_rate": 9.665656565656566e-05, + "loss": 0.3497, + "num_input_tokens_seen": 4413440000, + "step": 43100 + }, + { + "epoch": 0.04, + "grad_norm": 0.12253214828166321, + "learning_rate": 9.664646464646465e-05, + "loss": 0.3523, + "num_input_tokens_seen": 4423680000, + "step": 43200 + }, + { + "epoch": 0.04, + "grad_norm": 0.07445605356064754, + "learning_rate": 9.663636363636363e-05, + "loss": 0.353, + "num_input_tokens_seen": 4433920000, + "step": 43300 + }, + { + "epoch": 0.04, + "grad_norm": 0.10641249535769638, + "learning_rate": 9.662626262626264e-05, + "loss": 0.3499, + "num_input_tokens_seen": 4444160000, + "step": 43400 + }, + { + "epoch": 0.04, + "grad_norm": 0.1227720688741477, + "learning_rate": 9.661616161616161e-05, + "loss": 0.353, + "num_input_tokens_seen": 4454400000, + "step": 43500 + }, + { + "epoch": 0.04, + "grad_norm": 0.08644264132168863, + "learning_rate": 9.66060606060606e-05, + "loss": 0.3526, + "num_input_tokens_seen": 4464640000, + "step": 43600 + }, + { + "epoch": 0.04, + "grad_norm": 0.0908823314277891, + "learning_rate": 9.65959595959596e-05, + "loss": 0.3528, + "num_input_tokens_seen": 4474880000, + "step": 43700 + }, + { + "epoch": 0.04, + "grad_norm": 0.08246266081349644, + "learning_rate": 9.658585858585859e-05, + "loss": 0.3497, + "num_input_tokens_seen": 4485120000, + "step": 43800 + }, + { + "epoch": 0.04, + "grad_norm": 0.10718406667229304, + "learning_rate": 9.657575757575758e-05, + "loss": 0.3512, + "num_input_tokens_seen": 4495360000, + "step": 43900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07291081009772682, + "learning_rate": 9.656565656565657e-05, + "loss": 0.3511, + "num_input_tokens_seen": 4505600000, + "step": 44000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.34731084429373066, + "eval_average_loss_on_sentence_tokens": 0.3676992978275831, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.34822264313697815, + "eval_non_padding_tokens_in_labels": 133.5434, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38705, + "eval_padding_tokens_in_labels": 378.4566, + "eval_reconstruction_accuracy": 0.9278454075182158, + "eval_runtime": 181.3876, + "eval_samples_per_second": 27.565, + "eval_sentence_accuracy": 0.7601431980906921, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 4505600000, + "step": 44000 + }, + { + "epoch": 0.04, + "grad_norm": 0.10460474289274352, + "learning_rate": 9.655555555555555e-05, + "loss": 0.3541, + "num_input_tokens_seen": 4515840000, + "step": 44100 + }, + { + "epoch": 0.04, + "grad_norm": 0.08971148322677827, + "learning_rate": 9.654545454545454e-05, + "loss": 0.3475, + "num_input_tokens_seen": 4526080000, + "step": 44200 + }, + { + "epoch": 0.04, + "grad_norm": 0.10700023003218113, + "learning_rate": 9.653535353535355e-05, + "loss": 0.3491, + "num_input_tokens_seen": 4536320000, + "step": 44300 + }, + { + "epoch": 0.04, + "grad_norm": 0.09683642542301804, + "learning_rate": 9.652525252525253e-05, + "loss": 0.3489, + "num_input_tokens_seen": 4546560000, + "step": 44400 + }, + { + "epoch": 0.04, + "grad_norm": 0.11728531014005934, + "learning_rate": 9.651515151515152e-05, + "loss": 0.3505, + "num_input_tokens_seen": 4556800000, + "step": 44500 + }, + { + "epoch": 0.04, + "grad_norm": 0.12345552139372837, + "learning_rate": 9.650505050505051e-05, + "loss": 0.3501, + "num_input_tokens_seen": 4567040000, + "step": 44600 + }, + { + "epoch": 0.04, + "grad_norm": 0.08610761937913017, + "learning_rate": 9.64949494949495e-05, + "loss": 0.351, + "num_input_tokens_seen": 4577280000, + "step": 44700 + }, + { + "epoch": 0.04, + "grad_norm": 0.08104733927865691, + "learning_rate": 9.64848484848485e-05, + "loss": 0.3464, + "num_input_tokens_seen": 4587520000, + "step": 44800 + }, + { + "epoch": 0.04, + "grad_norm": 0.10105244314431601, + "learning_rate": 9.647474747474749e-05, + "loss": 0.3498, + "num_input_tokens_seen": 4597760000, + "step": 44900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07044842615149681, + "learning_rate": 9.646464646464647e-05, + "loss": 0.3528, + "num_input_tokens_seen": 4608000000, + "step": 45000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3470284524732196, + "eval_average_loss_on_sentence_tokens": 0.39697075201378157, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.34932616353034973, + "eval_non_padding_tokens_in_labels": 133.546, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38975, + "eval_padding_tokens_in_labels": 378.454, + "eval_reconstruction_accuracy": 0.9278970945992355, + "eval_runtime": 182.1322, + "eval_samples_per_second": 27.453, + "eval_sentence_accuracy": 0.734177328763436, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24797499999999995, + "num_input_tokens_seen": 4608000000, + "step": 45000 + }, + { + "epoch": 0.05, + "grad_norm": 0.07193257452827664, + "learning_rate": 9.645454545454546e-05, + "loss": 0.3499, + "num_input_tokens_seen": 4618240000, + "step": 45100 + }, + { + "epoch": 0.05, + "grad_norm": 0.11712193244247271, + "learning_rate": 9.644444444444445e-05, + "loss": 0.3537, + "num_input_tokens_seen": 4628480000, + "step": 45200 + }, + { + "epoch": 0.05, + "grad_norm": 0.07342527394282601, + "learning_rate": 9.643434343434344e-05, + "loss": 0.351, + "num_input_tokens_seen": 4638720000, + "step": 45300 + }, + { + "epoch": 0.05, + "grad_norm": 0.09563817525617035, + "learning_rate": 9.642424242424243e-05, + "loss": 0.3514, + "num_input_tokens_seen": 4648960000, + "step": 45400 + }, + { + "epoch": 0.05, + "grad_norm": 0.07864484242289746, + "learning_rate": 9.641414141414143e-05, + "loss": 0.3511, + "num_input_tokens_seen": 4659200000, + "step": 45500 + }, + { + "epoch": 0.05, + "grad_norm": 0.09224938148311214, + "learning_rate": 9.64040404040404e-05, + "loss": 0.3537, + "num_input_tokens_seen": 4669440000, + "step": 45600 + }, + { + "epoch": 0.05, + "grad_norm": 0.09122577266819994, + "learning_rate": 9.63939393939394e-05, + "loss": 0.3513, + "num_input_tokens_seen": 4679680000, + "step": 45700 + }, + { + "epoch": 0.05, + "grad_norm": 0.07493065575000031, + "learning_rate": 9.638383838383839e-05, + "loss": 0.3521, + "num_input_tokens_seen": 4689920000, + "step": 45800 + }, + { + "epoch": 0.05, + "grad_norm": 0.08880189708464585, + "learning_rate": 9.637373737373738e-05, + "loss": 0.3504, + "num_input_tokens_seen": 4700160000, + "step": 45900 + }, + { + "epoch": 0.05, + "grad_norm": 0.09189857393953663, + "learning_rate": 9.636363636363637e-05, + "loss": 0.351, + "num_input_tokens_seen": 4710400000, + "step": 46000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3466902109401352, + "eval_average_loss_on_sentence_tokens": 0.3826554607458675, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.34828126430511475, + "eval_non_padding_tokens_in_labels": 133.537, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36575, + "eval_padding_tokens_in_labels": 378.463, + "eval_reconstruction_accuracy": 0.9279022552908046, + "eval_runtime": 183.6295, + "eval_samples_per_second": 27.229, + "eval_sentence_accuracy": 0.7523507455990812, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 4710400000, + "step": 46000 + }, + { + "epoch": 0.05, + "grad_norm": 0.090681442871565, + "learning_rate": 9.635353535353536e-05, + "loss": 0.3539, + "num_input_tokens_seen": 4720640000, + "step": 46100 + }, + { + "epoch": 0.05, + "grad_norm": 0.08256632984538824, + "learning_rate": 9.634343434343434e-05, + "loss": 0.3531, + "num_input_tokens_seen": 4730880000, + "step": 46200 + }, + { + "epoch": 0.05, + "grad_norm": 0.08020965746797032, + "learning_rate": 9.633333333333335e-05, + "loss": 0.3483, + "num_input_tokens_seen": 4741120000, + "step": 46300 + }, + { + "epoch": 0.05, + "grad_norm": 0.08282345325979987, + "learning_rate": 9.632323232323233e-05, + "loss": 0.3514, + "num_input_tokens_seen": 4751360000, + "step": 46400 + }, + { + "epoch": 0.05, + "grad_norm": 0.07605199825986989, + "learning_rate": 9.631313131313132e-05, + "loss": 0.3488, + "num_input_tokens_seen": 4761600000, + "step": 46500 + }, + { + "epoch": 0.05, + "grad_norm": 0.09182187764578326, + "learning_rate": 9.630303030303031e-05, + "loss": 0.3521, + "num_input_tokens_seen": 4771840000, + "step": 46600 + }, + { + "epoch": 0.05, + "grad_norm": 0.08372357261098205, + "learning_rate": 9.62929292929293e-05, + "loss": 0.3522, + "num_input_tokens_seen": 4782080000, + "step": 46700 + }, + { + "epoch": 0.05, + "grad_norm": 0.09380616055632435, + "learning_rate": 9.628282828282828e-05, + "loss": 0.3507, + "num_input_tokens_seen": 4792320000, + "step": 46800 + }, + { + "epoch": 0.05, + "grad_norm": 0.11655213851680513, + "learning_rate": 9.627272727272729e-05, + "loss": 0.3513, + "num_input_tokens_seen": 4802560000, + "step": 46900 + }, + { + "epoch": 0.05, + "grad_norm": 0.08278980658320012, + "learning_rate": 9.626262626262627e-05, + "loss": 0.3544, + "num_input_tokens_seen": 4812800000, + "step": 47000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.34651943415012315, + "eval_average_loss_on_sentence_tokens": 0.383747803344111, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.34816405177116394, + "eval_non_padding_tokens_in_labels": 133.49725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3745, + "eval_padding_tokens_in_labels": 378.50275, + "eval_reconstruction_accuracy": 0.928078243223317, + "eval_runtime": 180.7147, + "eval_samples_per_second": 27.668, + "eval_sentence_accuracy": 0.7443878191899798, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 4812800000, + "step": 47000 + }, + { + "epoch": 0.05, + "grad_norm": 0.08401530827240343, + "learning_rate": 9.625252525252526e-05, + "loss": 0.3478, + "num_input_tokens_seen": 4823040000, + "step": 47100 + }, + { + "epoch": 0.05, + "grad_norm": 0.07037915772262221, + "learning_rate": 9.624242424242425e-05, + "loss": 0.3499, + "num_input_tokens_seen": 4833280000, + "step": 47200 + }, + { + "epoch": 0.05, + "grad_norm": 0.06722015118379962, + "learning_rate": 9.623232323232324e-05, + "loss": 0.3476, + "num_input_tokens_seen": 4843520000, + "step": 47300 + }, + { + "epoch": 0.05, + "grad_norm": 0.08868224292378786, + "learning_rate": 9.622222222222222e-05, + "loss": 0.3492, + "num_input_tokens_seen": 4853760000, + "step": 47400 + }, + { + "epoch": 0.05, + "grad_norm": 0.10917809763315507, + "learning_rate": 9.621212121212123e-05, + "loss": 0.3539, + "num_input_tokens_seen": 4864000000, + "step": 47500 + }, + { + "epoch": 0.05, + "grad_norm": 0.08207998928176456, + "learning_rate": 9.62020202020202e-05, + "loss": 0.3528, + "num_input_tokens_seen": 4874240000, + "step": 47600 + }, + { + "epoch": 0.05, + "grad_norm": 0.08831065661866368, + "learning_rate": 9.61919191919192e-05, + "loss": 0.3498, + "num_input_tokens_seen": 4884480000, + "step": 47700 + }, + { + "epoch": 0.05, + "grad_norm": 0.08180413994047107, + "learning_rate": 9.618181818181819e-05, + "loss": 0.3507, + "num_input_tokens_seen": 4894720000, + "step": 47800 + }, + { + "epoch": 0.05, + "grad_norm": 0.09050707353521568, + "learning_rate": 9.617171717171718e-05, + "loss": 0.3509, + "num_input_tokens_seen": 4904960000, + "step": 47900 + }, + { + "epoch": 0.05, + "grad_norm": 0.07100564159170515, + "learning_rate": 9.616161616161616e-05, + "loss": 0.3505, + "num_input_tokens_seen": 4915200000, + "step": 48000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3457927761789185, + "eval_average_loss_on_sentence_tokens": 0.3734766448407803, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.3470507860183716, + "eval_non_padding_tokens_in_labels": 133.51045, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36745, + "eval_padding_tokens_in_labels": 378.48955, + "eval_reconstruction_accuracy": 0.9280340713903849, + "eval_runtime": 179.1395, + "eval_samples_per_second": 27.911, + "eval_sentence_accuracy": 0.7593895239291546, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 4915200000, + "step": 48000 + }, + { + "epoch": 0.05, + "grad_norm": 0.07888409957542222, + "learning_rate": 9.615151515151516e-05, + "loss": 0.3486, + "num_input_tokens_seen": 4925440000, + "step": 48100 + }, + { + "epoch": 0.05, + "grad_norm": 0.08363613158378863, + "learning_rate": 9.614141414141414e-05, + "loss": 0.3523, + "num_input_tokens_seen": 4935680000, + "step": 48200 + }, + { + "epoch": 0.05, + "grad_norm": 0.08435421837853559, + "learning_rate": 9.613131313131313e-05, + "loss": 0.3507, + "num_input_tokens_seen": 4945920000, + "step": 48300 + }, + { + "epoch": 0.05, + "grad_norm": 0.08371982659437595, + "learning_rate": 9.612121212121213e-05, + "loss": 0.349, + "num_input_tokens_seen": 4956160000, + "step": 48400 + }, + { + "epoch": 0.05, + "grad_norm": 0.07736334715004695, + "learning_rate": 9.611111111111112e-05, + "loss": 0.3512, + "num_input_tokens_seen": 4966400000, + "step": 48500 + }, + { + "epoch": 0.05, + "grad_norm": 0.0732824072080515, + "learning_rate": 9.61010101010101e-05, + "loss": 0.3489, + "num_input_tokens_seen": 4976640000, + "step": 48600 + }, + { + "epoch": 0.05, + "grad_norm": 0.07964140737058345, + "learning_rate": 9.60909090909091e-05, + "loss": 0.3504, + "num_input_tokens_seen": 4986880000, + "step": 48700 + }, + { + "epoch": 0.05, + "grad_norm": 0.07213072170490163, + "learning_rate": 9.608080808080808e-05, + "loss": 0.3534, + "num_input_tokens_seen": 4997120000, + "step": 48800 + }, + { + "epoch": 0.05, + "grad_norm": 0.0962992163231459, + "learning_rate": 9.607070707070707e-05, + "loss": 0.3505, + "num_input_tokens_seen": 5007360000, + "step": 48900 + }, + { + "epoch": 0.05, + "grad_norm": 0.0783564001642562, + "learning_rate": 9.606060606060606e-05, + "loss": 0.3498, + "num_input_tokens_seen": 5017600000, + "step": 49000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3464422826444812, + "eval_average_loss_on_sentence_tokens": 0.3808544802041267, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3480761647224426, + "eval_non_padding_tokens_in_labels": 133.5357, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3703, + "eval_padding_tokens_in_labels": 378.4643, + "eval_reconstruction_accuracy": 0.9280367094366386, + "eval_runtime": 185.9495, + "eval_samples_per_second": 26.889, + "eval_sentence_accuracy": 0.7522341055502718, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 5017600000, + "step": 49000 + }, + { + "epoch": 0.05, + "grad_norm": 0.1164685727344041, + "learning_rate": 9.605050505050506e-05, + "loss": 0.3523, + "num_input_tokens_seen": 5027840000, + "step": 49100 + }, + { + "epoch": 0.05, + "grad_norm": 0.12025511196585914, + "learning_rate": 9.604040404040405e-05, + "loss": 0.3476, + "num_input_tokens_seen": 5038080000, + "step": 49200 + }, + { + "epoch": 0.05, + "grad_norm": 0.08862530229919445, + "learning_rate": 9.603030303030304e-05, + "loss": 0.3492, + "num_input_tokens_seen": 5048320000, + "step": 49300 + }, + { + "epoch": 0.05, + "grad_norm": 0.10499169045730954, + "learning_rate": 9.602020202020202e-05, + "loss": 0.3508, + "num_input_tokens_seen": 5058560000, + "step": 49400 + }, + { + "epoch": 0.05, + "grad_norm": 0.08373904157851149, + "learning_rate": 9.601010101010101e-05, + "loss": 0.3483, + "num_input_tokens_seen": 5068800000, + "step": 49500 + }, + { + "epoch": 0.05, + "grad_norm": 0.07963393458643661, + "learning_rate": 9.6e-05, + "loss": 0.3469, + "num_input_tokens_seen": 5079040000, + "step": 49600 + }, + { + "epoch": 0.05, + "grad_norm": 0.07540016632777936, + "learning_rate": 9.5989898989899e-05, + "loss": 0.3503, + "num_input_tokens_seen": 5089280000, + "step": 49700 + }, + { + "epoch": 0.05, + "grad_norm": 0.0690679374179033, + "learning_rate": 9.597979797979799e-05, + "loss": 0.346, + "num_input_tokens_seen": 5099520000, + "step": 49800 + }, + { + "epoch": 0.05, + "grad_norm": 0.10217077328335454, + "learning_rate": 9.596969696969698e-05, + "loss": 0.3501, + "num_input_tokens_seen": 5109760000, + "step": 49900 + }, + { + "epoch": 0.05, + "grad_norm": 0.09139649856751507, + "learning_rate": 9.595959595959596e-05, + "loss": 0.3488, + "num_input_tokens_seen": 5120000000, + "step": 50000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3456298819560379, + "eval_average_loss_on_sentence_tokens": 0.3827424004363013, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.3472558557987213, + "eval_non_padding_tokens_in_labels": 133.503, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37415, + "eval_padding_tokens_in_labels": 378.497, + "eval_reconstruction_accuracy": 0.9280630376503365, + "eval_runtime": 185.2463, + "eval_samples_per_second": 26.991, + "eval_sentence_accuracy": 0.7509196619233047, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2497749999999999, + "num_input_tokens_seen": 5120000000, + "step": 50000 + }, + { + "epoch": 0.05, + "grad_norm": 0.07996692874725114, + "learning_rate": 9.594949494949496e-05, + "loss": 0.3473, + "num_input_tokens_seen": 5130240000, + "step": 50100 + }, + { + "epoch": 0.05, + "grad_norm": 0.09208897648885365, + "learning_rate": 9.593939393939394e-05, + "loss": 0.3475, + "num_input_tokens_seen": 5140480000, + "step": 50200 + }, + { + "epoch": 0.05, + "grad_norm": 0.07564759628927722, + "learning_rate": 9.592929292929293e-05, + "loss": 0.3474, + "num_input_tokens_seen": 5150720000, + "step": 50300 + }, + { + "epoch": 0.05, + "grad_norm": 0.08101586193300216, + "learning_rate": 9.591919191919192e-05, + "loss": 0.3502, + "num_input_tokens_seen": 5160960000, + "step": 50400 + }, + { + "epoch": 0.05, + "grad_norm": 0.07311103203936492, + "learning_rate": 9.590909090909092e-05, + "loss": 0.3498, + "num_input_tokens_seen": 5171200000, + "step": 50500 + }, + { + "epoch": 0.05, + "grad_norm": 0.07529175908295746, + "learning_rate": 9.58989898989899e-05, + "loss": 0.3516, + "num_input_tokens_seen": 5181440000, + "step": 50600 + }, + { + "epoch": 0.05, + "grad_norm": 0.14013489944659396, + "learning_rate": 9.58888888888889e-05, + "loss": 0.3488, + "num_input_tokens_seen": 5191680000, + "step": 50700 + }, + { + "epoch": 0.05, + "grad_norm": 0.06317549982363811, + "learning_rate": 9.587878787878788e-05, + "loss": 0.3495, + "num_input_tokens_seen": 5201920000, + "step": 50800 + }, + { + "epoch": 0.05, + "grad_norm": 0.09572467634283506, + "learning_rate": 9.586868686868687e-05, + "loss": 0.346, + "num_input_tokens_seen": 5212160000, + "step": 50900 + }, + { + "epoch": 0.05, + "grad_norm": 0.10188137257206885, + "learning_rate": 9.585858585858586e-05, + "loss": 0.3493, + "num_input_tokens_seen": 5222400000, + "step": 51000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3455192629173308, + "eval_average_loss_on_sentence_tokens": 0.34277344172693947, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.34541016817092896, + "eval_non_padding_tokens_in_labels": 133.5366, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36705, + "eval_padding_tokens_in_labels": 378.4634, + "eval_reconstruction_accuracy": 0.9282121329713617, + "eval_runtime": 185.3387, + "eval_samples_per_second": 26.978, + "eval_sentence_accuracy": 0.7779711809356327, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 5222400000, + "step": 51000 + }, + { + "epoch": 0.05, + "grad_norm": 0.09928819472304182, + "learning_rate": 9.584848484848486e-05, + "loss": 0.3522, + "num_input_tokens_seen": 5232640000, + "step": 51100 + }, + { + "epoch": 0.05, + "grad_norm": 0.11072744855066644, + "learning_rate": 9.583838383838383e-05, + "loss": 0.3476, + "num_input_tokens_seen": 5242880000, + "step": 51200 + }, + { + "epoch": 0.05, + "grad_norm": 0.13588869298399509, + "learning_rate": 9.582828282828284e-05, + "loss": 0.3506, + "num_input_tokens_seen": 5253120000, + "step": 51300 + }, + { + "epoch": 0.05, + "grad_norm": 0.0717246777135503, + "learning_rate": 9.581818181818182e-05, + "loss": 0.3519, + "num_input_tokens_seen": 5263360000, + "step": 51400 + }, + { + "epoch": 0.05, + "grad_norm": 0.06899891969741691, + "learning_rate": 9.580808080808081e-05, + "loss": 0.3501, + "num_input_tokens_seen": 5273600000, + "step": 51500 + }, + { + "epoch": 0.05, + "grad_norm": 0.06539497593449997, + "learning_rate": 9.57979797979798e-05, + "loss": 0.3497, + "num_input_tokens_seen": 5283840000, + "step": 51600 + }, + { + "epoch": 0.05, + "grad_norm": 0.09572430055253284, + "learning_rate": 9.57878787878788e-05, + "loss": 0.3518, + "num_input_tokens_seen": 5294080000, + "step": 51700 + }, + { + "epoch": 0.05, + "grad_norm": 0.1042029039612538, + "learning_rate": 9.577777777777777e-05, + "loss": 0.3502, + "num_input_tokens_seen": 5304320000, + "step": 51800 + }, + { + "epoch": 0.05, + "grad_norm": 0.06667453552102366, + "learning_rate": 9.576767676767678e-05, + "loss": 0.3479, + "num_input_tokens_seen": 5314560000, + "step": 51900 + }, + { + "epoch": 0.05, + "grad_norm": 0.07631056490125114, + "learning_rate": 9.575757575757576e-05, + "loss": 0.3512, + "num_input_tokens_seen": 5324800000, + "step": 52000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3446819479188432, + "eval_average_loss_on_sentence_tokens": 0.34953945296775785, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.3448828160762787, + "eval_non_padding_tokens_in_labels": 133.53045, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39205, + "eval_padding_tokens_in_labels": 378.46955, + "eval_reconstruction_accuracy": 0.9282531079068617, + "eval_runtime": 180.9174, + "eval_samples_per_second": 27.637, + "eval_sentence_accuracy": 0.7761453155561936, + "eval_steps_per_second": 0.072, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 5324800000, + "step": 52000 + }, + { + "epoch": 0.05, + "grad_norm": 0.07219131618585166, + "learning_rate": 9.574747474747475e-05, + "loss": 0.3496, + "num_input_tokens_seen": 5335040000, + "step": 52100 + }, + { + "epoch": 0.05, + "grad_norm": 0.08928582338343495, + "learning_rate": 9.573737373737374e-05, + "loss": 0.3524, + "num_input_tokens_seen": 5345280000, + "step": 52200 + }, + { + "epoch": 0.05, + "grad_norm": 0.14731765885195194, + "learning_rate": 9.572727272727273e-05, + "loss": 0.3479, + "num_input_tokens_seen": 5355520000, + "step": 52300 + }, + { + "epoch": 0.05, + "grad_norm": 0.11177042469617739, + "learning_rate": 9.571717171717171e-05, + "loss": 0.3487, + "num_input_tokens_seen": 5365760000, + "step": 52400 + }, + { + "epoch": 0.05, + "grad_norm": 0.08234725835644417, + "learning_rate": 9.570707070707072e-05, + "loss": 0.347, + "num_input_tokens_seen": 5376000000, + "step": 52500 + }, + { + "epoch": 0.05, + "grad_norm": 0.09531765668825862, + "learning_rate": 9.56969696969697e-05, + "loss": 0.3497, + "num_input_tokens_seen": 5386240000, + "step": 52600 + }, + { + "epoch": 0.05, + "grad_norm": 0.06545113098278944, + "learning_rate": 9.568686868686869e-05, + "loss": 0.3488, + "num_input_tokens_seen": 5396480000, + "step": 52700 + }, + { + "epoch": 0.05, + "grad_norm": 0.07500119357351417, + "learning_rate": 9.567676767676769e-05, + "loss": 0.3484, + "num_input_tokens_seen": 5406720000, + "step": 52800 + }, + { + "epoch": 0.05, + "grad_norm": 0.08390697250286014, + "learning_rate": 9.566666666666667e-05, + "loss": 0.3493, + "num_input_tokens_seen": 5416960000, + "step": 52900 + }, + { + "epoch": 0.05, + "grad_norm": 0.08533829573916071, + "learning_rate": 9.565656565656566e-05, + "loss": 0.35, + "num_input_tokens_seen": 5427200000, + "step": 53000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3444883832601124, + "eval_average_loss_on_sentence_tokens": 0.36958025206355805, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.3456152379512787, + "eval_non_padding_tokens_in_labels": 133.53945, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3867, + "eval_padding_tokens_in_labels": 378.46055, + "eval_reconstruction_accuracy": 0.928286652524204, + "eval_runtime": 4573.6147, + "eval_samples_per_second": 1.093, + "eval_sentence_accuracy": 0.7618658818884921, + "eval_steps_per_second": 0.003, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 5427200000, + "step": 53000 + }, + { + "epoch": 0.05, + "grad_norm": 0.0940188610712813, + "learning_rate": 9.564646464646465e-05, + "loss": 0.3495, + "num_input_tokens_seen": 5437440000, + "step": 53100 + }, + { + "epoch": 0.05, + "grad_norm": 0.0882807312798241, + "learning_rate": 9.563636363636365e-05, + "loss": 0.35, + "num_input_tokens_seen": 5447680000, + "step": 53200 + }, + { + "epoch": 0.05, + "grad_norm": 0.07222346738658669, + "learning_rate": 9.562626262626262e-05, + "loss": 0.3505, + "num_input_tokens_seen": 5457920000, + "step": 53300 + }, + { + "epoch": 0.05, + "grad_norm": 0.10458686140493954, + "learning_rate": 9.561616161616163e-05, + "loss": 0.3499, + "num_input_tokens_seen": 5468160000, + "step": 53400 + }, + { + "epoch": 0.05, + "grad_norm": 0.08910070643417206, + "learning_rate": 9.560606060606061e-05, + "loss": 0.351, + "num_input_tokens_seen": 5478400000, + "step": 53500 + }, + { + "epoch": 0.05, + "grad_norm": 0.11193259212290033, + "learning_rate": 9.55959595959596e-05, + "loss": 0.3499, + "num_input_tokens_seen": 5488640000, + "step": 53600 + }, + { + "epoch": 0.05, + "grad_norm": 0.09204755830656844, + "learning_rate": 9.558585858585859e-05, + "loss": 0.3468, + "num_input_tokens_seen": 5498880000, + "step": 53700 + }, + { + "epoch": 0.05, + "grad_norm": 0.09813084910280927, + "learning_rate": 9.557575757575758e-05, + "loss": 0.3483, + "num_input_tokens_seen": 5509120000, + "step": 53800 + }, + { + "epoch": 0.05, + "grad_norm": 0.07079657673683473, + "learning_rate": 9.556565656565656e-05, + "loss": 0.3452, + "num_input_tokens_seen": 5519360000, + "step": 53900 + }, + { + "epoch": 0.05, + "grad_norm": 0.08140832526541768, + "learning_rate": 9.555555555555557e-05, + "loss": 0.3499, + "num_input_tokens_seen": 5529600000, + "step": 54000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3450439910021566, + "eval_average_loss_on_sentence_tokens": 0.3887786390364147, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.3470117151737213, + "eval_non_padding_tokens_in_labels": 133.54535, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38095, + "eval_padding_tokens_in_labels": 378.45465, + "eval_reconstruction_accuracy": 0.9282456982460736, + "eval_runtime": 3925.7145, + "eval_samples_per_second": 1.274, + "eval_sentence_accuracy": 0.741642291887236, + "eval_steps_per_second": 0.003, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 5529600000, + "step": 54000 + }, + { + "epoch": 0.05, + "grad_norm": 0.11039766892895923, + "learning_rate": 9.554545454545455e-05, + "loss": 0.3511, + "num_input_tokens_seen": 5539840000, + "step": 54100 + }, + { + "epoch": 0.05, + "grad_norm": 0.07555638821211985, + "learning_rate": 9.553535353535354e-05, + "loss": 0.3493, + "num_input_tokens_seen": 5550080000, + "step": 54200 + }, + { + "epoch": 0.05, + "grad_norm": 0.09100400641780058, + "learning_rate": 9.552525252525253e-05, + "loss": 0.3476, + "num_input_tokens_seen": 5560320000, + "step": 54300 + }, + { + "epoch": 0.05, + "grad_norm": 0.06802961649617977, + "learning_rate": 9.551515151515152e-05, + "loss": 0.3467, + "num_input_tokens_seen": 5570560000, + "step": 54400 + }, + { + "epoch": 0.05, + "grad_norm": 0.09713023377935583, + "learning_rate": 9.550505050505051e-05, + "loss": 0.3466, + "num_input_tokens_seen": 5580800000, + "step": 54500 + }, + { + "epoch": 0.05, + "grad_norm": 0.07290740429712109, + "learning_rate": 9.54949494949495e-05, + "loss": 0.3478, + "num_input_tokens_seen": 5591040000, + "step": 54600 + }, + { + "epoch": 0.05, + "grad_norm": 0.07181759763014225, + "learning_rate": 9.548484848484849e-05, + "loss": 0.351, + "num_input_tokens_seen": 5601280000, + "step": 54700 + }, + { + "epoch": 0.05, + "grad_norm": 0.09168742381279946, + "learning_rate": 9.547474747474748e-05, + "loss": 0.3505, + "num_input_tokens_seen": 5611520000, + "step": 54800 + }, + { + "epoch": 0.05, + "grad_norm": 0.07322978709469263, + "learning_rate": 9.546464646464647e-05, + "loss": 0.3514, + "num_input_tokens_seen": 5621760000, + "step": 54900 + }, + { + "epoch": 0.06, + "grad_norm": 0.11762533451992002, + "learning_rate": 9.545454545454546e-05, + "loss": 0.349, + "num_input_tokens_seen": 5632000000, + "step": 55000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.34390068436898935, + "eval_average_loss_on_sentence_tokens": 0.3347425611276311, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.34342774748802185, + "eval_non_padding_tokens_in_labels": 133.5083, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37175, + "eval_padding_tokens_in_labels": 378.4917, + "eval_reconstruction_accuracy": 0.9283603510641102, + "eval_runtime": 3740.4943, + "eval_samples_per_second": 1.337, + "eval_sentence_accuracy": 0.7846690114307248, + "eval_steps_per_second": 0.003, + "eval_variance_shuffling_prob": 0.24797499999999995, + "num_input_tokens_seen": 5632000000, + "step": 55000 + }, + { + "epoch": 0.06, + "grad_norm": 0.10444810418706606, + "learning_rate": 9.544444444444445e-05, + "loss": 0.3486, + "num_input_tokens_seen": 5642240000, + "step": 55100 + }, + { + "epoch": 0.06, + "grad_norm": 0.0839093505297172, + "learning_rate": 9.543434343434344e-05, + "loss": 0.3487, + "num_input_tokens_seen": 5652480000, + "step": 55200 + }, + { + "epoch": 0.06, + "grad_norm": 0.07635367077528293, + "learning_rate": 9.542424242424242e-05, + "loss": 0.348, + "num_input_tokens_seen": 5662720000, + "step": 55300 + }, + { + "epoch": 0.06, + "grad_norm": 0.07916981201710084, + "learning_rate": 9.541414141414143e-05, + "loss": 0.3509, + "num_input_tokens_seen": 5672960000, + "step": 55400 + }, + { + "epoch": 0.06, + "grad_norm": 0.08395551307568218, + "learning_rate": 9.540404040404041e-05, + "loss": 0.3472, + "num_input_tokens_seen": 5683200000, + "step": 55500 + }, + { + "epoch": 0.06, + "grad_norm": 0.07751358061528046, + "learning_rate": 9.53939393939394e-05, + "loss": 0.347, + "num_input_tokens_seen": 5693440000, + "step": 55600 + }, + { + "epoch": 0.06, + "grad_norm": 0.12256700555270875, + "learning_rate": 9.538383838383839e-05, + "loss": 0.3496, + "num_input_tokens_seen": 5703680000, + "step": 55700 + }, + { + "epoch": 0.06, + "grad_norm": 0.10859497398949086, + "learning_rate": 9.537373737373738e-05, + "loss": 0.3482, + "num_input_tokens_seen": 5713920000, + "step": 55800 + }, + { + "epoch": 0.06, + "grad_norm": 0.06726279835405306, + "learning_rate": 9.536363636363636e-05, + "loss": 0.3481, + "num_input_tokens_seen": 5724160000, + "step": 55900 + }, + { + "epoch": 0.06, + "grad_norm": 0.09104657189358191, + "learning_rate": 9.535353535353537e-05, + "loss": 0.3456, + "num_input_tokens_seen": 5734400000, + "step": 56000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.34522334434983, + "eval_average_loss_on_sentence_tokens": 0.37014649287114826, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.3463769555091858, + "eval_non_padding_tokens_in_labels": 133.56435, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3837, + "eval_padding_tokens_in_labels": 378.43565, + "eval_reconstruction_accuracy": 0.9282281560958312, + "eval_runtime": 1548.2176, + "eval_samples_per_second": 3.23, + "eval_sentence_accuracy": 0.7598201948786046, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 5734400000, + "step": 56000 + }, + { + "epoch": 0.06, + "grad_norm": 0.08198156441563319, + "learning_rate": 9.534343434343435e-05, + "loss": 0.3488, + "num_input_tokens_seen": 5744640000, + "step": 56100 + }, + { + "epoch": 0.06, + "grad_norm": 0.08155477603928012, + "learning_rate": 9.533333333333334e-05, + "loss": 0.3502, + "num_input_tokens_seen": 5754880000, + "step": 56200 + }, + { + "epoch": 0.06, + "grad_norm": 0.08459072323749073, + "learning_rate": 9.532323232323233e-05, + "loss": 0.3511, + "num_input_tokens_seen": 5765120000, + "step": 56300 + }, + { + "epoch": 0.06, + "grad_norm": 0.08004903992023026, + "learning_rate": 9.531313131313132e-05, + "loss": 0.3489, + "num_input_tokens_seen": 5775360000, + "step": 56400 + }, + { + "epoch": 0.06, + "grad_norm": 0.1202572165308204, + "learning_rate": 9.53030303030303e-05, + "loss": 0.3467, + "num_input_tokens_seen": 5785600000, + "step": 56500 + }, + { + "epoch": 0.06, + "grad_norm": 0.07893588086913793, + "learning_rate": 9.52929292929293e-05, + "loss": 0.3461, + "num_input_tokens_seen": 5795840000, + "step": 56600 + }, + { + "epoch": 0.06, + "grad_norm": 0.07943078834329521, + "learning_rate": 9.528282828282828e-05, + "loss": 0.3464, + "num_input_tokens_seen": 5806080000, + "step": 56700 + }, + { + "epoch": 0.06, + "grad_norm": 0.07857819034638554, + "learning_rate": 9.527272727272728e-05, + "loss": 0.3462, + "num_input_tokens_seen": 5816320000, + "step": 56800 + }, + { + "epoch": 0.06, + "grad_norm": 0.08927383237353988, + "learning_rate": 9.526262626262627e-05, + "loss": 0.3458, + "num_input_tokens_seen": 5826560000, + "step": 56900 + }, + { + "epoch": 0.06, + "grad_norm": 0.07449314565897508, + "learning_rate": 9.525252525252526e-05, + "loss": 0.3475, + "num_input_tokens_seen": 5836800000, + "step": 57000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.3438626248096903, + "eval_average_loss_on_sentence_tokens": 0.3242385827998598, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.34299805760383606, + "eval_non_padding_tokens_in_labels": 133.51005, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36545, + "eval_padding_tokens_in_labels": 378.48995, + "eval_reconstruction_accuracy": 0.9283950792301294, + "eval_runtime": 171.8918, + "eval_samples_per_second": 29.088, + "eval_sentence_accuracy": 0.7924210885208247, + "eval_steps_per_second": 0.076, + "eval_variance_shuffling_prob": 0.2464, + "num_input_tokens_seen": 5836800000, + "step": 57000 + }, + { + "epoch": 0.06, + "grad_norm": 0.10323726592166695, + "learning_rate": 9.524242424242424e-05, + "loss": 0.3472, + "num_input_tokens_seen": 5847040000, + "step": 57100 + }, + { + "epoch": 0.06, + "grad_norm": 0.0666701336498705, + "learning_rate": 9.523232323232324e-05, + "loss": 0.3468, + "num_input_tokens_seen": 5857280000, + "step": 57200 + }, + { + "epoch": 0.06, + "grad_norm": 0.06283177293624168, + "learning_rate": 9.522222222222222e-05, + "loss": 0.3462, + "num_input_tokens_seen": 5867520000, + "step": 57300 + }, + { + "epoch": 0.06, + "grad_norm": 0.13862941656094258, + "learning_rate": 9.521212121212121e-05, + "loss": 0.3483, + "num_input_tokens_seen": 5877760000, + "step": 57400 + }, + { + "epoch": 0.06, + "grad_norm": 0.07672447343238448, + "learning_rate": 9.52020202020202e-05, + "loss": 0.3474, + "num_input_tokens_seen": 5888000000, + "step": 57500 + }, + { + "epoch": 0.06, + "grad_norm": 0.07215314404420321, + "learning_rate": 9.51919191919192e-05, + "loss": 0.3465, + "num_input_tokens_seen": 5898240000, + "step": 57600 + }, + { + "epoch": 0.06, + "grad_norm": 0.08245942082087758, + "learning_rate": 9.518181818181818e-05, + "loss": 0.3501, + "num_input_tokens_seen": 5908480000, + "step": 57700 + }, + { + "epoch": 0.06, + "grad_norm": 0.09363620527515841, + "learning_rate": 9.517171717171718e-05, + "loss": 0.346, + "num_input_tokens_seen": 5918720000, + "step": 57800 + }, + { + "epoch": 0.06, + "grad_norm": 0.07896465356687354, + "learning_rate": 9.516161616161616e-05, + "loss": 0.348, + "num_input_tokens_seen": 5928960000, + "step": 57900 + }, + { + "epoch": 0.06, + "grad_norm": 0.10398892397714347, + "learning_rate": 9.515151515151515e-05, + "loss": 0.3471, + "num_input_tokens_seen": 5939200000, + "step": 58000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.34381431869384804, + "eval_average_loss_on_sentence_tokens": 0.3564367360398731, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.3443066477775574, + "eval_non_padding_tokens_in_labels": 133.5231, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37345, + "eval_padding_tokens_in_labels": 378.4769, + "eval_reconstruction_accuracy": 0.928386079885854, + "eval_runtime": 168.7907, + "eval_samples_per_second": 29.622, + "eval_sentence_accuracy": 0.772242360076803, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 5939200000, + "step": 58000 + }, + { + "epoch": 0.06, + "grad_norm": 0.08222682521306926, + "learning_rate": 9.514141414141414e-05, + "loss": 0.3482, + "num_input_tokens_seen": 5949440000, + "step": 58100 + }, + { + "epoch": 0.06, + "grad_norm": 0.08312829894182854, + "learning_rate": 9.513131313131314e-05, + "loss": 0.35, + "num_input_tokens_seen": 5959680000, + "step": 58200 + }, + { + "epoch": 0.06, + "grad_norm": 0.0685017210647811, + "learning_rate": 9.512121212121213e-05, + "loss": 0.3482, + "num_input_tokens_seen": 5969920000, + "step": 58300 + }, + { + "epoch": 0.06, + "grad_norm": 0.07372400116500949, + "learning_rate": 9.511111111111112e-05, + "loss": 0.3474, + "num_input_tokens_seen": 5980160000, + "step": 58400 + }, + { + "epoch": 0.06, + "grad_norm": 0.0767836623671639, + "learning_rate": 9.51010101010101e-05, + "loss": 0.3519, + "num_input_tokens_seen": 5990400000, + "step": 58500 + }, + { + "epoch": 0.06, + "grad_norm": 0.0959313915797849, + "learning_rate": 9.509090909090909e-05, + "loss": 0.3508, + "num_input_tokens_seen": 6000640000, + "step": 58600 + }, + { + "epoch": 0.06, + "grad_norm": 0.0750751541638554, + "learning_rate": 9.508080808080808e-05, + "loss": 0.3463, + "num_input_tokens_seen": 6010880000, + "step": 58700 + }, + { + "epoch": 0.06, + "grad_norm": 0.0818196075552596, + "learning_rate": 9.507070707070707e-05, + "loss": 0.3455, + "num_input_tokens_seen": 6021120000, + "step": 58800 + }, + { + "epoch": 0.06, + "grad_norm": 0.14324553605663087, + "learning_rate": 9.506060606060607e-05, + "loss": 0.3473, + "num_input_tokens_seen": 6031360000, + "step": 58900 + }, + { + "epoch": 0.06, + "grad_norm": 0.07761921625360899, + "learning_rate": 9.505050505050506e-05, + "loss": 0.3479, + "num_input_tokens_seen": 6041600000, + "step": 59000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.3437961361110841, + "eval_average_loss_on_sentence_tokens": 0.3676124595836958, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.3448437452316284, + "eval_non_padding_tokens_in_labels": 133.50725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3654, + "eval_padding_tokens_in_labels": 378.49275, + "eval_reconstruction_accuracy": 0.9284703322370546, + "eval_runtime": 169.2756, + "eval_samples_per_second": 29.538, + "eval_sentence_accuracy": 0.7572316830261812, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 6041600000, + "step": 59000 + }, + { + "epoch": 0.06, + "grad_norm": 0.09118201922586999, + "learning_rate": 9.504040404040404e-05, + "loss": 0.3479, + "num_input_tokens_seen": 6051840000, + "step": 59100 + }, + { + "epoch": 0.06, + "grad_norm": 0.09059531274374034, + "learning_rate": 9.503030303030304e-05, + "loss": 0.3483, + "num_input_tokens_seen": 6062080000, + "step": 59200 + }, + { + "epoch": 0.06, + "grad_norm": 0.1076175316702987, + "learning_rate": 9.502020202020202e-05, + "loss": 0.3444, + "num_input_tokens_seen": 6072320000, + "step": 59300 + }, + { + "epoch": 0.06, + "grad_norm": 0.07099997025811552, + "learning_rate": 9.501010101010101e-05, + "loss": 0.3458, + "num_input_tokens_seen": 6082560000, + "step": 59400 + }, + { + "epoch": 0.06, + "grad_norm": 0.07860815878581054, + "learning_rate": 9.5e-05, + "loss": 0.3499, + "num_input_tokens_seen": 6092800000, + "step": 59500 + }, + { + "epoch": 0.06, + "grad_norm": 0.06780347268494993, + "learning_rate": 9.4989898989899e-05, + "loss": 0.3451, + "num_input_tokens_seen": 6103040000, + "step": 59600 + }, + { + "epoch": 0.06, + "grad_norm": 0.10405946196955676, + "learning_rate": 9.497979797979798e-05, + "loss": 0.3469, + "num_input_tokens_seen": 6113280000, + "step": 59700 + }, + { + "epoch": 0.06, + "grad_norm": 0.07260009168091268, + "learning_rate": 9.496969696969698e-05, + "loss": 0.3459, + "num_input_tokens_seen": 6123520000, + "step": 59800 + }, + { + "epoch": 0.06, + "grad_norm": 0.10000219445130448, + "learning_rate": 9.495959595959596e-05, + "loss": 0.3456, + "num_input_tokens_seen": 6133760000, + "step": 59900 + }, + { + "epoch": 0.06, + "grad_norm": 0.08117378630971718, + "learning_rate": 9.494949494949495e-05, + "loss": 0.3478, + "num_input_tokens_seen": 6144000000, + "step": 60000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.3433743023250581, + "eval_average_loss_on_sentence_tokens": 0.3481073320694446, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.34361326694488525, + "eval_non_padding_tokens_in_labels": 133.5453, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37175, + "eval_padding_tokens_in_labels": 378.4547, + "eval_reconstruction_accuracy": 0.9284057898092313, + "eval_runtime": 184.5151, + "eval_samples_per_second": 27.098, + "eval_sentence_accuracy": 0.7782582949019327, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 6144000000, + "step": 60000 + }, + { + "epoch": 0.06, + "grad_norm": 0.1001775562250358, + "learning_rate": 9.493939393939394e-05, + "loss": 0.3492, + "num_input_tokens_seen": 6154240000, + "step": 60100 + }, + { + "epoch": 0.06, + "grad_norm": 0.09535958585890839, + "learning_rate": 9.492929292929294e-05, + "loss": 0.3465, + "num_input_tokens_seen": 6164480000, + "step": 60200 + }, + { + "epoch": 0.06, + "grad_norm": 0.08033856049799686, + "learning_rate": 9.491919191919191e-05, + "loss": 0.3466, + "num_input_tokens_seen": 6174720000, + "step": 60300 + }, + { + "epoch": 0.06, + "grad_norm": 0.07493766700278214, + "learning_rate": 9.490909090909092e-05, + "loss": 0.3475, + "num_input_tokens_seen": 6184960000, + "step": 60400 + }, + { + "epoch": 0.06, + "grad_norm": 0.07053819336019312, + "learning_rate": 9.48989898989899e-05, + "loss": 0.3446, + "num_input_tokens_seen": 6195200000, + "step": 60500 + }, + { + "epoch": 0.06, + "grad_norm": 0.09050152316774326, + "learning_rate": 9.488888888888889e-05, + "loss": 0.3441, + "num_input_tokens_seen": 6205440000, + "step": 60600 + }, + { + "epoch": 0.06, + "grad_norm": 0.07255963335024096, + "learning_rate": 9.487878787878788e-05, + "loss": 0.3486, + "num_input_tokens_seen": 6215680000, + "step": 60700 + }, + { + "epoch": 0.06, + "grad_norm": 0.09298246076203519, + "learning_rate": 9.486868686868687e-05, + "loss": 0.3488, + "num_input_tokens_seen": 6225920000, + "step": 60800 + }, + { + "epoch": 0.06, + "grad_norm": 0.08851156237202178, + "learning_rate": 9.485858585858585e-05, + "loss": 0.3463, + "num_input_tokens_seen": 6236160000, + "step": 60900 + }, + { + "epoch": 0.06, + "grad_norm": 0.12931811037310556, + "learning_rate": 9.484848484848486e-05, + "loss": 0.3482, + "num_input_tokens_seen": 6246400000, + "step": 61000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.34361996311127757, + "eval_average_loss_on_sentence_tokens": 0.37999779793362853, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.3453125059604645, + "eval_non_padding_tokens_in_labels": 133.5021, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3733, + "eval_padding_tokens_in_labels": 378.4979, + "eval_reconstruction_accuracy": 0.9285155788269797, + "eval_runtime": 178.0702, + "eval_samples_per_second": 28.079, + "eval_sentence_accuracy": 0.7498878461069141, + "eval_steps_per_second": 0.073, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 6246400000, + "step": 61000 + }, + { + "epoch": 0.06, + "grad_norm": 0.06630139715639853, + "learning_rate": 9.483838383838384e-05, + "loss": 0.3474, + "num_input_tokens_seen": 6256640000, + "step": 61100 + }, + { + "epoch": 0.06, + "grad_norm": 0.1311766209291915, + "learning_rate": 9.482828282828283e-05, + "loss": 0.35, + "num_input_tokens_seen": 6266880000, + "step": 61200 + }, + { + "epoch": 0.06, + "grad_norm": 0.07040374352329777, + "learning_rate": 9.481818181818183e-05, + "loss": 0.3478, + "num_input_tokens_seen": 6277120000, + "step": 61300 + }, + { + "epoch": 0.06, + "grad_norm": 0.07795332583582965, + "learning_rate": 9.480808080808081e-05, + "loss": 0.3452, + "num_input_tokens_seen": 6287360000, + "step": 61400 + }, + { + "epoch": 0.06, + "grad_norm": 0.06480440948523268, + "learning_rate": 9.47979797979798e-05, + "loss": 0.349, + "num_input_tokens_seen": 6297600000, + "step": 61500 + }, + { + "epoch": 0.06, + "grad_norm": 0.09798502177382536, + "learning_rate": 9.47878787878788e-05, + "loss": 0.3453, + "num_input_tokens_seen": 6307840000, + "step": 61600 + }, + { + "epoch": 0.06, + "grad_norm": 0.10768305066343614, + "learning_rate": 9.477777777777779e-05, + "loss": 0.3494, + "num_input_tokens_seen": 6318080000, + "step": 61700 + }, + { + "epoch": 0.06, + "grad_norm": 0.07054734416290279, + "learning_rate": 9.476767676767677e-05, + "loss": 0.349, + "num_input_tokens_seen": 6328320000, + "step": 61800 + }, + { + "epoch": 0.06, + "grad_norm": 0.06490027946715898, + "learning_rate": 9.475757575757577e-05, + "loss": 0.3457, + "num_input_tokens_seen": 6338560000, + "step": 61900 + }, + { + "epoch": 0.06, + "grad_norm": 0.10698790890904888, + "learning_rate": 9.474747474747475e-05, + "loss": 0.3462, + "num_input_tokens_seen": 6348800000, + "step": 62000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.3427632162996734, + "eval_average_loss_on_sentence_tokens": 0.3276559449840499, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.34208983182907104, + "eval_non_padding_tokens_in_labels": 133.53795, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36535, + "eval_padding_tokens_in_labels": 378.46205, + "eval_reconstruction_accuracy": 0.9285551128473348, + "eval_runtime": 198.6356, + "eval_samples_per_second": 25.172, + "eval_sentence_accuracy": 0.7957273852889981, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.245775, + "num_input_tokens_seen": 6348800000, + "step": 62000 + }, + { + "epoch": 0.06, + "grad_norm": 0.11580237527934992, + "learning_rate": 9.473737373737374e-05, + "loss": 0.3496, + "num_input_tokens_seen": 6359040000, + "step": 62100 + }, + { + "epoch": 0.06, + "grad_norm": 0.06866398789916857, + "learning_rate": 9.472727272727273e-05, + "loss": 0.3472, + "num_input_tokens_seen": 6369280000, + "step": 62200 + }, + { + "epoch": 0.06, + "grad_norm": 0.07601249967738802, + "learning_rate": 9.471717171717173e-05, + "loss": 0.3467, + "num_input_tokens_seen": 6379520000, + "step": 62300 + }, + { + "epoch": 0.06, + "grad_norm": 0.08262113799576916, + "learning_rate": 9.47070707070707e-05, + "loss": 0.345, + "num_input_tokens_seen": 6389760000, + "step": 62400 + }, + { + "epoch": 0.06, + "grad_norm": 0.07329250196948593, + "learning_rate": 9.469696969696971e-05, + "loss": 0.3461, + "num_input_tokens_seen": 6400000000, + "step": 62500 + }, + { + "epoch": 0.06, + "grad_norm": 0.08911685926437626, + "learning_rate": 9.468686868686869e-05, + "loss": 0.3443, + "num_input_tokens_seen": 6410240000, + "step": 62600 + }, + { + "epoch": 0.06, + "grad_norm": 0.06986530135734499, + "learning_rate": 9.467676767676768e-05, + "loss": 0.3478, + "num_input_tokens_seen": 6420480000, + "step": 62700 + }, + { + "epoch": 0.06, + "grad_norm": 0.09529301417803751, + "learning_rate": 9.466666666666667e-05, + "loss": 0.3476, + "num_input_tokens_seen": 6430720000, + "step": 62800 + }, + { + "epoch": 0.06, + "grad_norm": 0.0774310314305089, + "learning_rate": 9.465656565656566e-05, + "loss": 0.3468, + "num_input_tokens_seen": 6440960000, + "step": 62900 + }, + { + "epoch": 0.06, + "grad_norm": 0.1039890135368906, + "learning_rate": 9.464646464646464e-05, + "loss": 0.3502, + "num_input_tokens_seen": 6451200000, + "step": 63000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.3434480308234589, + "eval_average_loss_on_sentence_tokens": 0.3753002479624123, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3449121117591858, + "eval_non_padding_tokens_in_labels": 133.53405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3768, + "eval_padding_tokens_in_labels": 378.46595, + "eval_reconstruction_accuracy": 0.9283442781146504, + "eval_runtime": 192.6756, + "eval_samples_per_second": 25.95, + "eval_sentence_accuracy": 0.7587614621278734, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 6451200000, + "step": 63000 + }, + { + "epoch": 0.06, + "grad_norm": 0.10313247902799433, + "learning_rate": 9.463636363636365e-05, + "loss": 0.3467, + "num_input_tokens_seen": 6461440000, + "step": 63100 + }, + { + "epoch": 0.06, + "grad_norm": 0.07617404824604572, + "learning_rate": 9.462626262626263e-05, + "loss": 0.3487, + "num_input_tokens_seen": 6471680000, + "step": 63200 + }, + { + "epoch": 0.06, + "grad_norm": 0.10919917824911078, + "learning_rate": 9.461616161616162e-05, + "loss": 0.3466, + "num_input_tokens_seen": 6481920000, + "step": 63300 + }, + { + "epoch": 0.06, + "grad_norm": 0.06622016829733311, + "learning_rate": 9.460606060606061e-05, + "loss": 0.3467, + "num_input_tokens_seen": 6492160000, + "step": 63400 + }, + { + "epoch": 0.06, + "grad_norm": 0.09288499173607838, + "learning_rate": 9.45959595959596e-05, + "loss": 0.3457, + "num_input_tokens_seen": 6502400000, + "step": 63500 + }, + { + "epoch": 0.06, + "grad_norm": 0.09391680049855854, + "learning_rate": 9.45858585858586e-05, + "loss": 0.346, + "num_input_tokens_seen": 6512640000, + "step": 63600 + }, + { + "epoch": 0.06, + "grad_norm": 0.0712252488039725, + "learning_rate": 9.457575757575759e-05, + "loss": 0.3477, + "num_input_tokens_seen": 6522880000, + "step": 63700 + }, + { + "epoch": 0.06, + "grad_norm": 0.07035938494538439, + "learning_rate": 9.456565656565657e-05, + "loss": 0.3461, + "num_input_tokens_seen": 6533120000, + "step": 63800 + }, + { + "epoch": 0.06, + "grad_norm": 0.09690147305446983, + "learning_rate": 9.455555555555556e-05, + "loss": 0.3488, + "num_input_tokens_seen": 6543360000, + "step": 63900 + }, + { + "epoch": 0.06, + "grad_norm": 0.12818471044937185, + "learning_rate": 9.454545454545455e-05, + "loss": 0.3459, + "num_input_tokens_seen": 6553600000, + "step": 64000 + }, + { + "epoch": 0.06, + "eval_average_loss_on_non_sentence_tokens": 0.3434318064200723, + "eval_average_loss_on_sentence_tokens": 0.35343790849142764, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.34395506978034973, + "eval_non_padding_tokens_in_labels": 133.50025, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3527, + "eval_padding_tokens_in_labels": 378.49975, + "eval_reconstruction_accuracy": 0.9285437460028924, + "eval_runtime": 186.2919, + "eval_samples_per_second": 26.84, + "eval_sentence_accuracy": 0.7673210472481921, + "eval_steps_per_second": 0.07, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 6553600000, + "step": 64000 + }, + { + "epoch": 0.06, + "grad_norm": 0.10194671152119475, + "learning_rate": 9.453535353535354e-05, + "loss": 0.3468, + "num_input_tokens_seen": 6563840000, + "step": 64100 + }, + { + "epoch": 0.06, + "grad_norm": 0.11682624367916837, + "learning_rate": 9.452525252525253e-05, + "loss": 0.3468, + "num_input_tokens_seen": 6574080000, + "step": 64200 + }, + { + "epoch": 0.06, + "grad_norm": 0.10353527467393822, + "learning_rate": 9.451515151515153e-05, + "loss": 0.3497, + "num_input_tokens_seen": 6584320000, + "step": 64300 + }, + { + "epoch": 0.06, + "grad_norm": 0.0626468953158547, + "learning_rate": 9.45050505050505e-05, + "loss": 0.3471, + "num_input_tokens_seen": 6594560000, + "step": 64400 + }, + { + "epoch": 0.06, + "grad_norm": 0.06945456176147077, + "learning_rate": 9.449494949494951e-05, + "loss": 0.3462, + "num_input_tokens_seen": 6604800000, + "step": 64500 + }, + { + "epoch": 0.06, + "grad_norm": 0.07626563653269804, + "learning_rate": 9.448484848484849e-05, + "loss": 0.3448, + "num_input_tokens_seen": 6615040000, + "step": 64600 + }, + { + "epoch": 0.06, + "grad_norm": 0.11889633554688211, + "learning_rate": 9.447474747474748e-05, + "loss": 0.3436, + "num_input_tokens_seen": 6625280000, + "step": 64700 + }, + { + "epoch": 0.06, + "grad_norm": 0.07977598311500411, + "learning_rate": 9.446464646464647e-05, + "loss": 0.3461, + "num_input_tokens_seen": 6635520000, + "step": 64800 + }, + { + "epoch": 0.06, + "grad_norm": 0.08447589619988588, + "learning_rate": 9.445454545454546e-05, + "loss": 0.3477, + "num_input_tokens_seen": 6645760000, + "step": 64900 + }, + { + "epoch": 0.07, + "grad_norm": 0.09589559556669632, + "learning_rate": 9.444444444444444e-05, + "loss": 0.3458, + "num_input_tokens_seen": 6656000000, + "step": 65000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.34322548137929654, + "eval_average_loss_on_sentence_tokens": 0.3841889943752624, + "eval_average_shuffling_prob": 0.535, + "eval_loss": 0.34513673186302185, + "eval_non_padding_tokens_in_labels": 133.55505, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3923, + "eval_padding_tokens_in_labels": 378.44495, + "eval_reconstruction_accuracy": 0.928482469412133, + "eval_runtime": 213.7161, + "eval_samples_per_second": 23.396, + "eval_sentence_accuracy": 0.7484881655212016, + "eval_steps_per_second": 0.061, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 6656000000, + "step": 65000 + }, + { + "epoch": 0.07, + "grad_norm": 0.08741537721517857, + "learning_rate": 9.443434343434345e-05, + "loss": 0.346, + "num_input_tokens_seen": 6666240000, + "step": 65100 + }, + { + "epoch": 0.07, + "grad_norm": 0.06537200748157193, + "learning_rate": 9.442424242424243e-05, + "loss": 0.3448, + "num_input_tokens_seen": 6676480000, + "step": 65200 + }, + { + "epoch": 0.07, + "grad_norm": 0.08507388789859073, + "learning_rate": 9.441414141414142e-05, + "loss": 0.3471, + "num_input_tokens_seen": 6686720000, + "step": 65300 + }, + { + "epoch": 0.07, + "grad_norm": 0.0885920483519942, + "learning_rate": 9.440404040404041e-05, + "loss": 0.3466, + "num_input_tokens_seen": 6696960000, + "step": 65400 + }, + { + "epoch": 0.07, + "grad_norm": 0.0678531979905641, + "learning_rate": 9.43939393939394e-05, + "loss": 0.3459, + "num_input_tokens_seen": 6707200000, + "step": 65500 + }, + { + "epoch": 0.07, + "grad_norm": 0.12977823729833665, + "learning_rate": 9.438383838383838e-05, + "loss": 0.3469, + "num_input_tokens_seen": 6717440000, + "step": 65600 + }, + { + "epoch": 0.07, + "grad_norm": 0.0675220191823822, + "learning_rate": 9.437373737373739e-05, + "loss": 0.3464, + "num_input_tokens_seen": 6727680000, + "step": 65700 + }, + { + "epoch": 0.07, + "grad_norm": 0.11131020661092701, + "learning_rate": 9.436363636363636e-05, + "loss": 0.3471, + "num_input_tokens_seen": 6737920000, + "step": 65800 + }, + { + "epoch": 0.07, + "grad_norm": 0.07820770237414024, + "learning_rate": 9.435353535353536e-05, + "loss": 0.3457, + "num_input_tokens_seen": 6748160000, + "step": 65900 + }, + { + "epoch": 0.07, + "grad_norm": 0.0937324820483306, + "learning_rate": 9.434343434343435e-05, + "loss": 0.3481, + "num_input_tokens_seen": 6758400000, + "step": 66000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.342352895393075, + "eval_average_loss_on_sentence_tokens": 0.35999443261352276, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.34322264790534973, + "eval_non_padding_tokens_in_labels": 133.52775, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38395, + "eval_padding_tokens_in_labels": 378.47225, + "eval_reconstruction_accuracy": 0.9285609888170263, + "eval_runtime": 200.5838, + "eval_samples_per_second": 24.927, + "eval_sentence_accuracy": 0.7700306853051483, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 6758400000, + "step": 66000 + }, + { + "epoch": 0.07, + "grad_norm": 0.07096015323487756, + "learning_rate": 9.433333333333334e-05, + "loss": 0.3469, + "num_input_tokens_seen": 6768640000, + "step": 66100 + }, + { + "epoch": 0.07, + "grad_norm": 0.10964663443256711, + "learning_rate": 9.432323232323232e-05, + "loss": 0.3451, + "num_input_tokens_seen": 6778880000, + "step": 66200 + }, + { + "epoch": 0.07, + "grad_norm": 0.07808475053618838, + "learning_rate": 9.431313131313132e-05, + "loss": 0.3452, + "num_input_tokens_seen": 6789120000, + "step": 66300 + }, + { + "epoch": 0.07, + "grad_norm": 0.07393486226694128, + "learning_rate": 9.43030303030303e-05, + "loss": 0.3468, + "num_input_tokens_seen": 6799360000, + "step": 66400 + }, + { + "epoch": 0.07, + "grad_norm": 0.12098883113662842, + "learning_rate": 9.42929292929293e-05, + "loss": 0.346, + "num_input_tokens_seen": 6809600000, + "step": 66500 + }, + { + "epoch": 0.07, + "grad_norm": 0.08668097036081968, + "learning_rate": 9.428282828282829e-05, + "loss": 0.3459, + "num_input_tokens_seen": 6819840000, + "step": 66600 + }, + { + "epoch": 0.07, + "grad_norm": 0.09609684192908974, + "learning_rate": 9.427272727272728e-05, + "loss": 0.3481, + "num_input_tokens_seen": 6830080000, + "step": 66700 + }, + { + "epoch": 0.07, + "grad_norm": 0.09381847935304885, + "learning_rate": 9.426262626262626e-05, + "loss": 0.3471, + "num_input_tokens_seen": 6840320000, + "step": 66800 + }, + { + "epoch": 0.07, + "grad_norm": 0.0687517654940024, + "learning_rate": 9.425252525252526e-05, + "loss": 0.3423, + "num_input_tokens_seen": 6850560000, + "step": 66900 + }, + { + "epoch": 0.07, + "grad_norm": 0.17189691414713082, + "learning_rate": 9.424242424242424e-05, + "loss": 0.3465, + "num_input_tokens_seen": 6860800000, + "step": 67000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.343313337069124, + "eval_average_loss_on_sentence_tokens": 0.36306373169633777, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.34425780177116394, + "eval_non_padding_tokens_in_labels": 133.53535, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37875, + "eval_padding_tokens_in_labels": 378.46465, + "eval_reconstruction_accuracy": 0.928456027867997, + "eval_runtime": 202.8559, + "eval_samples_per_second": 24.648, + "eval_sentence_accuracy": 0.7664821361279093, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 6860800000, + "step": 67000 + }, + { + "epoch": 0.07, + "grad_norm": 0.12432282461961498, + "learning_rate": 9.423232323232323e-05, + "loss": 0.3481, + "num_input_tokens_seen": 6871040000, + "step": 67100 + }, + { + "epoch": 0.07, + "grad_norm": 0.3337887308064392, + "learning_rate": 9.422222222222223e-05, + "loss": 0.3446, + "num_input_tokens_seen": 6881280000, + "step": 67200 + }, + { + "epoch": 0.07, + "grad_norm": 0.06852258374784082, + "learning_rate": 9.421212121212122e-05, + "loss": 0.344, + "num_input_tokens_seen": 6891520000, + "step": 67300 + }, + { + "epoch": 0.07, + "grad_norm": 0.06882840929124208, + "learning_rate": 9.420202020202021e-05, + "loss": 0.347, + "num_input_tokens_seen": 6901760000, + "step": 67400 + }, + { + "epoch": 0.07, + "grad_norm": 0.06985777602075442, + "learning_rate": 9.41919191919192e-05, + "loss": 0.3459, + "num_input_tokens_seen": 6912000000, + "step": 67500 + }, + { + "epoch": 0.07, + "grad_norm": 0.07091002750505872, + "learning_rate": 9.418181818181818e-05, + "loss": 0.345, + "num_input_tokens_seen": 6922240000, + "step": 67600 + }, + { + "epoch": 0.07, + "grad_norm": 0.06945496068079765, + "learning_rate": 9.417171717171717e-05, + "loss": 0.3498, + "num_input_tokens_seen": 6932480000, + "step": 67700 + }, + { + "epoch": 0.07, + "grad_norm": 0.11948420862717817, + "learning_rate": 9.416161616161616e-05, + "loss": 0.3461, + "num_input_tokens_seen": 6942720000, + "step": 67800 + }, + { + "epoch": 0.07, + "grad_norm": 0.1052529768089587, + "learning_rate": 9.415151515151516e-05, + "loss": 0.3466, + "num_input_tokens_seen": 6952960000, + "step": 67900 + }, + { + "epoch": 0.07, + "grad_norm": 0.10591262554891515, + "learning_rate": 9.414141414141415e-05, + "loss": 0.3483, + "num_input_tokens_seen": 6963200000, + "step": 68000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.34217597322835774, + "eval_average_loss_on_sentence_tokens": 0.338263500235897, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.3419726490974426, + "eval_non_padding_tokens_in_labels": 133.5173, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37695, + "eval_padding_tokens_in_labels": 378.4827, + "eval_reconstruction_accuracy": 0.9286854670634985, + "eval_runtime": 194.6147, + "eval_samples_per_second": 25.692, + "eval_sentence_accuracy": 0.7840364634737201, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 6963200000, + "step": 68000 + }, + { + "epoch": 0.07, + "grad_norm": 0.08738016001521397, + "learning_rate": 9.413131313131314e-05, + "loss": 0.3478, + "num_input_tokens_seen": 6973440000, + "step": 68100 + }, + { + "epoch": 0.07, + "grad_norm": 0.0652869129339433, + "learning_rate": 9.412121212121212e-05, + "loss": 0.3455, + "num_input_tokens_seen": 6983680000, + "step": 68200 + }, + { + "epoch": 0.07, + "grad_norm": 0.08187871329426454, + "learning_rate": 9.411111111111111e-05, + "loss": 0.3473, + "num_input_tokens_seen": 6993920000, + "step": 68300 + }, + { + "epoch": 0.07, + "grad_norm": 0.08706087667559993, + "learning_rate": 9.41010101010101e-05, + "loss": 0.3473, + "num_input_tokens_seen": 7004160000, + "step": 68400 + }, + { + "epoch": 0.07, + "grad_norm": 0.08859827152450787, + "learning_rate": 9.40909090909091e-05, + "loss": 0.3482, + "num_input_tokens_seen": 7014400000, + "step": 68500 + }, + { + "epoch": 0.07, + "grad_norm": 0.07064108596544619, + "learning_rate": 9.408080808080809e-05, + "loss": 0.3453, + "num_input_tokens_seen": 7024640000, + "step": 68600 + }, + { + "epoch": 0.07, + "grad_norm": 0.08294702997232328, + "learning_rate": 9.407070707070708e-05, + "loss": 0.3458, + "num_input_tokens_seen": 7034880000, + "step": 68700 + }, + { + "epoch": 0.07, + "grad_norm": 0.0726335525309844, + "learning_rate": 9.406060606060606e-05, + "loss": 0.3465, + "num_input_tokens_seen": 7045120000, + "step": 68800 + }, + { + "epoch": 0.07, + "grad_norm": 0.11049585277015807, + "learning_rate": 9.405050505050506e-05, + "loss": 0.3446, + "num_input_tokens_seen": 7055360000, + "step": 68900 + }, + { + "epoch": 0.07, + "grad_norm": 0.07247176442913233, + "learning_rate": 9.404040404040404e-05, + "loss": 0.3448, + "num_input_tokens_seen": 7065600000, + "step": 69000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.3427961886718506, + "eval_average_loss_on_sentence_tokens": 0.3526009744820004, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.3432910144329071, + "eval_non_padding_tokens_in_labels": 133.59085, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3817, + "eval_padding_tokens_in_labels": 378.40915, + "eval_reconstruction_accuracy": 0.9286256811414444, + "eval_runtime": 218.8196, + "eval_samples_per_second": 22.85, + "eval_sentence_accuracy": 0.7722244154539092, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 7065600000, + "step": 69000 + }, + { + "epoch": 0.07, + "grad_norm": 0.12459150637641384, + "learning_rate": 9.403030303030303e-05, + "loss": 0.3462, + "num_input_tokens_seen": 7075840000, + "step": 69100 + }, + { + "epoch": 0.07, + "grad_norm": 0.11287618845068963, + "learning_rate": 9.402020202020202e-05, + "loss": 0.344, + "num_input_tokens_seen": 7086080000, + "step": 69200 + }, + { + "epoch": 0.07, + "grad_norm": 0.09786789014548367, + "learning_rate": 9.401010101010102e-05, + "loss": 0.346, + "num_input_tokens_seen": 7096320000, + "step": 69300 + }, + { + "epoch": 0.07, + "grad_norm": 0.07027668835689135, + "learning_rate": 9.4e-05, + "loss": 0.3462, + "num_input_tokens_seen": 7106560000, + "step": 69400 + }, + { + "epoch": 0.07, + "grad_norm": 0.08940898355880508, + "learning_rate": 9.3989898989899e-05, + "loss": 0.3429, + "num_input_tokens_seen": 7116800000, + "step": 69500 + }, + { + "epoch": 0.07, + "grad_norm": 0.07568876033498681, + "learning_rate": 9.397979797979798e-05, + "loss": 0.3468, + "num_input_tokens_seen": 7127040000, + "step": 69600 + }, + { + "epoch": 0.07, + "grad_norm": 0.09991789703265155, + "learning_rate": 9.396969696969697e-05, + "loss": 0.3435, + "num_input_tokens_seen": 7137280000, + "step": 69700 + }, + { + "epoch": 0.07, + "grad_norm": 0.07565523046801613, + "learning_rate": 9.395959595959598e-05, + "loss": 0.3479, + "num_input_tokens_seen": 7147520000, + "step": 69800 + }, + { + "epoch": 0.07, + "grad_norm": 0.07363105064783534, + "learning_rate": 9.394949494949495e-05, + "loss": 0.3448, + "num_input_tokens_seen": 7157760000, + "step": 69900 + }, + { + "epoch": 0.07, + "grad_norm": 0.06953022827512158, + "learning_rate": 9.393939393939395e-05, + "loss": 0.3476, + "num_input_tokens_seen": 7168000000, + "step": 70000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.341889601330239, + "eval_average_loss_on_sentence_tokens": 0.35126597174429625, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.34228515625, + "eval_non_padding_tokens_in_labels": 133.5251, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3814, + "eval_padding_tokens_in_labels": 378.4749, + "eval_reconstruction_accuracy": 0.9287416380468159, + "eval_runtime": 210.7151, + "eval_samples_per_second": 23.729, + "eval_sentence_accuracy": 0.7729691173039999, + "eval_steps_per_second": 0.062, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 7168000000, + "step": 70000 + }, + { + "epoch": 0.07, + "grad_norm": 0.06868918083106794, + "learning_rate": 9.392929292929294e-05, + "loss": 0.3468, + "num_input_tokens_seen": 7178240000, + "step": 70100 + }, + { + "epoch": 0.07, + "grad_norm": 0.09163390312599551, + "learning_rate": 9.391919191919193e-05, + "loss": 0.3482, + "num_input_tokens_seen": 7188480000, + "step": 70200 + }, + { + "epoch": 0.07, + "grad_norm": 0.06911659132820105, + "learning_rate": 9.390909090909091e-05, + "loss": 0.3465, + "num_input_tokens_seen": 7198720000, + "step": 70300 + }, + { + "epoch": 0.07, + "grad_norm": 0.1735940922236739, + "learning_rate": 9.389898989898991e-05, + "loss": 0.3469, + "num_input_tokens_seen": 7208960000, + "step": 70400 + }, + { + "epoch": 0.07, + "grad_norm": 0.0693967445121511, + "learning_rate": 9.388888888888889e-05, + "loss": 0.347, + "num_input_tokens_seen": 7219200000, + "step": 70500 + }, + { + "epoch": 0.07, + "grad_norm": 0.09247946592631126, + "learning_rate": 9.387878787878788e-05, + "loss": 0.3434, + "num_input_tokens_seen": 7229440000, + "step": 70600 + }, + { + "epoch": 0.07, + "grad_norm": 0.08647064348514781, + "learning_rate": 9.386868686868688e-05, + "loss": 0.3451, + "num_input_tokens_seen": 7239680000, + "step": 70700 + }, + { + "epoch": 0.07, + "grad_norm": 0.08465774055263935, + "learning_rate": 9.385858585858587e-05, + "loss": 0.3459, + "num_input_tokens_seen": 7249920000, + "step": 70800 + }, + { + "epoch": 0.07, + "grad_norm": 0.08160853890462345, + "learning_rate": 9.384848484848485e-05, + "loss": 0.346, + "num_input_tokens_seen": 7260160000, + "step": 70900 + }, + { + "epoch": 0.07, + "grad_norm": 0.09773316930351925, + "learning_rate": 9.383838383838385e-05, + "loss": 0.3458, + "num_input_tokens_seen": 7270400000, + "step": 71000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.342347063854908, + "eval_average_loss_on_sentence_tokens": 0.3698205113014158, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.34355467557907104, + "eval_non_padding_tokens_in_labels": 133.5463, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37215, + "eval_padding_tokens_in_labels": 378.4537, + "eval_reconstruction_accuracy": 0.928544965412987, + "eval_runtime": 196.1973, + "eval_samples_per_second": 25.485, + "eval_sentence_accuracy": 0.763485384104653, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 7270400000, + "step": 71000 + }, + { + "epoch": 0.07, + "grad_norm": 0.09694912121618501, + "learning_rate": 9.382828282828283e-05, + "loss": 0.3433, + "num_input_tokens_seen": 7280640000, + "step": 71100 + }, + { + "epoch": 0.07, + "grad_norm": 0.06994791511656125, + "learning_rate": 9.381818181818182e-05, + "loss": 0.345, + "num_input_tokens_seen": 7290880000, + "step": 71200 + }, + { + "epoch": 0.07, + "grad_norm": 0.061892464252046836, + "learning_rate": 9.380808080808082e-05, + "loss": 0.3441, + "num_input_tokens_seen": 7301120000, + "step": 71300 + }, + { + "epoch": 0.07, + "grad_norm": 0.07648159296744907, + "learning_rate": 9.379797979797981e-05, + "loss": 0.3462, + "num_input_tokens_seen": 7311360000, + "step": 71400 + }, + { + "epoch": 0.07, + "grad_norm": 0.06653421358376777, + "learning_rate": 9.378787878787879e-05, + "loss": 0.3459, + "num_input_tokens_seen": 7321600000, + "step": 71500 + }, + { + "epoch": 0.07, + "grad_norm": 0.0838889686251674, + "learning_rate": 9.377777777777779e-05, + "loss": 0.3468, + "num_input_tokens_seen": 7331840000, + "step": 71600 + }, + { + "epoch": 0.07, + "grad_norm": 0.13921143105831135, + "learning_rate": 9.376767676767677e-05, + "loss": 0.3435, + "num_input_tokens_seen": 7342080000, + "step": 71700 + }, + { + "epoch": 0.07, + "grad_norm": 0.0768485412117201, + "learning_rate": 9.375757575757576e-05, + "loss": 0.3445, + "num_input_tokens_seen": 7352320000, + "step": 71800 + }, + { + "epoch": 0.07, + "grad_norm": 0.08858481415292785, + "learning_rate": 9.374747474747475e-05, + "loss": 0.3417, + "num_input_tokens_seen": 7362560000, + "step": 71900 + }, + { + "epoch": 0.07, + "grad_norm": 0.05898526341830271, + "learning_rate": 9.373737373737375e-05, + "loss": 0.3454, + "num_input_tokens_seen": 7372800000, + "step": 72000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.34125257199383807, + "eval_average_loss_on_sentence_tokens": 0.38231520902858035, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.3430761694908142, + "eval_non_padding_tokens_in_labels": 133.52255, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38615, + "eval_padding_tokens_in_labels": 378.47745, + "eval_reconstruction_accuracy": 0.9288344778343772, + "eval_runtime": 199.0704, + "eval_samples_per_second": 25.117, + "eval_sentence_accuracy": 0.7474204604590234, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2483999999999999, + "num_input_tokens_seen": 7372800000, + "step": 72000 + }, + { + "epoch": 0.07, + "grad_norm": 0.10334785379192729, + "learning_rate": 9.372727272727272e-05, + "loss": 0.3459, + "num_input_tokens_seen": 7383040000, + "step": 72100 + }, + { + "epoch": 0.07, + "grad_norm": 0.09123247079279741, + "learning_rate": 9.371717171717173e-05, + "loss": 0.3482, + "num_input_tokens_seen": 7393280000, + "step": 72200 + }, + { + "epoch": 0.07, + "grad_norm": 0.09592304943333249, + "learning_rate": 9.370707070707071e-05, + "loss": 0.3481, + "num_input_tokens_seen": 7403520000, + "step": 72300 + }, + { + "epoch": 0.07, + "grad_norm": 0.11338512820865851, + "learning_rate": 9.36969696969697e-05, + "loss": 0.3453, + "num_input_tokens_seen": 7413760000, + "step": 72400 + }, + { + "epoch": 0.07, + "grad_norm": 0.08622463169841162, + "learning_rate": 9.368686868686869e-05, + "loss": 0.3436, + "num_input_tokens_seen": 7424000000, + "step": 72500 + }, + { + "epoch": 0.07, + "grad_norm": 0.13717400842400881, + "learning_rate": 9.367676767676768e-05, + "loss": 0.3472, + "num_input_tokens_seen": 7434240000, + "step": 72600 + }, + { + "epoch": 0.07, + "grad_norm": 0.08037032483916194, + "learning_rate": 9.366666666666668e-05, + "loss": 0.3441, + "num_input_tokens_seen": 7444480000, + "step": 72700 + }, + { + "epoch": 0.07, + "grad_norm": 0.08938038588014732, + "learning_rate": 9.365656565656567e-05, + "loss": 0.3466, + "num_input_tokens_seen": 7454720000, + "step": 72800 + }, + { + "epoch": 0.07, + "grad_norm": 0.08271828816525985, + "learning_rate": 9.364646464646465e-05, + "loss": 0.348, + "num_input_tokens_seen": 7464960000, + "step": 72900 + }, + { + "epoch": 0.07, + "grad_norm": 0.06539398970316192, + "learning_rate": 9.363636363636364e-05, + "loss": 0.3417, + "num_input_tokens_seen": 7475200000, + "step": 73000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.3405605719593539, + "eval_average_loss_on_sentence_tokens": 0.32807738775413037, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.33992186188697815, + "eval_non_padding_tokens_in_labels": 133.54035, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38795, + "eval_padding_tokens_in_labels": 378.45965, + "eval_reconstruction_accuracy": 0.9288635426105132, + "eval_runtime": 200.7846, + "eval_samples_per_second": 24.902, + "eval_sentence_accuracy": 0.7939777845568575, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2464, + "num_input_tokens_seen": 7475200000, + "step": 73000 + }, + { + "epoch": 0.07, + "grad_norm": 0.09175574619050184, + "learning_rate": 9.362626262626263e-05, + "loss": 0.3479, + "num_input_tokens_seen": 7485440000, + "step": 73100 + }, + { + "epoch": 0.07, + "grad_norm": 0.07890896544646865, + "learning_rate": 9.361616161616162e-05, + "loss": 0.343, + "num_input_tokens_seen": 7495680000, + "step": 73200 + }, + { + "epoch": 0.07, + "grad_norm": 0.07548177482575796, + "learning_rate": 9.360606060606061e-05, + "loss": 0.3439, + "num_input_tokens_seen": 7505920000, + "step": 73300 + }, + { + "epoch": 0.07, + "grad_norm": 0.07783261362334166, + "learning_rate": 9.35959595959596e-05, + "loss": 0.3437, + "num_input_tokens_seen": 7516160000, + "step": 73400 + }, + { + "epoch": 0.07, + "grad_norm": 0.08754601471396727, + "learning_rate": 9.358585858585858e-05, + "loss": 0.343, + "num_input_tokens_seen": 7526400000, + "step": 73500 + }, + { + "epoch": 0.07, + "grad_norm": 0.07611358074768518, + "learning_rate": 9.357575757575759e-05, + "loss": 0.3428, + "num_input_tokens_seen": 7536640000, + "step": 73600 + }, + { + "epoch": 0.07, + "grad_norm": 0.08416987879754555, + "learning_rate": 9.356565656565657e-05, + "loss": 0.3451, + "num_input_tokens_seen": 7546880000, + "step": 73700 + }, + { + "epoch": 0.07, + "grad_norm": 0.0966604748333506, + "learning_rate": 9.355555555555556e-05, + "loss": 0.3441, + "num_input_tokens_seen": 7557120000, + "step": 73800 + }, + { + "epoch": 0.07, + "grad_norm": 0.07214552332843363, + "learning_rate": 9.354545454545455e-05, + "loss": 0.3433, + "num_input_tokens_seen": 7567360000, + "step": 73900 + }, + { + "epoch": 0.07, + "grad_norm": 0.08260994816349103, + "learning_rate": 9.353535353535354e-05, + "loss": 0.3449, + "num_input_tokens_seen": 7577600000, + "step": 74000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.341150192038058, + "eval_average_loss_on_sentence_tokens": 0.34754680608397936, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.34138670563697815, + "eval_non_padding_tokens_in_labels": 133.5576, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39165, + "eval_padding_tokens_in_labels": 378.4424, + "eval_reconstruction_accuracy": 0.9287313409367426, + "eval_runtime": 200.6052, + "eval_samples_per_second": 24.925, + "eval_sentence_accuracy": 0.7791061783336624, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 7577600000, + "step": 74000 + }, + { + "epoch": 0.07, + "grad_norm": 0.05737216624131951, + "learning_rate": 9.352525252525252e-05, + "loss": 0.3478, + "num_input_tokens_seen": 7587840000, + "step": 74100 + }, + { + "epoch": 0.07, + "grad_norm": 0.07894728474085799, + "learning_rate": 9.351515151515153e-05, + "loss": 0.3457, + "num_input_tokens_seen": 7598080000, + "step": 74200 + }, + { + "epoch": 0.07, + "grad_norm": 0.06522798763915504, + "learning_rate": 9.35050505050505e-05, + "loss": 0.342, + "num_input_tokens_seen": 7608320000, + "step": 74300 + }, + { + "epoch": 0.07, + "grad_norm": 0.06253899628035871, + "learning_rate": 9.34949494949495e-05, + "loss": 0.346, + "num_input_tokens_seen": 7618560000, + "step": 74400 + }, + { + "epoch": 0.07, + "grad_norm": 0.07204377427519774, + "learning_rate": 9.348484848484849e-05, + "loss": 0.3411, + "num_input_tokens_seen": 7628800000, + "step": 74500 + }, + { + "epoch": 0.07, + "grad_norm": 0.09382368088162311, + "learning_rate": 9.347474747474748e-05, + "loss": 0.3499, + "num_input_tokens_seen": 7639040000, + "step": 74600 + }, + { + "epoch": 0.07, + "grad_norm": 0.07565710465575991, + "learning_rate": 9.346464646464646e-05, + "loss": 0.3468, + "num_input_tokens_seen": 7649280000, + "step": 74700 + }, + { + "epoch": 0.07, + "grad_norm": 0.07224597051475391, + "learning_rate": 9.345454545454547e-05, + "loss": 0.347, + "num_input_tokens_seen": 7659520000, + "step": 74800 + }, + { + "epoch": 0.07, + "grad_norm": 0.09848548016918349, + "learning_rate": 9.344444444444444e-05, + "loss": 0.3425, + "num_input_tokens_seen": 7669760000, + "step": 74900 + }, + { + "epoch": 0.07, + "grad_norm": 0.06691499035242632, + "learning_rate": 9.343434343434344e-05, + "loss": 0.3444, + "num_input_tokens_seen": 7680000000, + "step": 75000 + }, + { + "epoch": 0.07, + "eval_average_loss_on_non_sentence_tokens": 0.3418586268907307, + "eval_average_loss_on_sentence_tokens": 0.3764441254063272, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3434472680091858, + "eval_non_padding_tokens_in_labels": 133.53585, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36235, + "eval_padding_tokens_in_labels": 378.46415, + "eval_reconstruction_accuracy": 0.9287027882844393, + "eval_runtime": 194.593, + "eval_samples_per_second": 25.695, + "eval_sentence_accuracy": 0.7569400829041578, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 7680000000, + "step": 75000 + }, + { + "epoch": 0.08, + "grad_norm": 0.11846624898308208, + "learning_rate": 9.342424242424243e-05, + "loss": 0.3438, + "num_input_tokens_seen": 7690240000, + "step": 75100 + }, + { + "epoch": 0.08, + "grad_norm": 0.1028887779617494, + "learning_rate": 9.341414141414142e-05, + "loss": 0.345, + "num_input_tokens_seen": 7700480000, + "step": 75200 + }, + { + "epoch": 0.08, + "grad_norm": 0.07173183191993555, + "learning_rate": 9.34040404040404e-05, + "loss": 0.3465, + "num_input_tokens_seen": 7710720000, + "step": 75300 + }, + { + "epoch": 0.08, + "grad_norm": 0.07186342773364511, + "learning_rate": 9.33939393939394e-05, + "loss": 0.346, + "num_input_tokens_seen": 7720960000, + "step": 75400 + }, + { + "epoch": 0.08, + "grad_norm": 0.07691171701147574, + "learning_rate": 9.338383838383838e-05, + "loss": 0.3426, + "num_input_tokens_seen": 7731200000, + "step": 75500 + }, + { + "epoch": 0.08, + "grad_norm": 0.06019795635545335, + "learning_rate": 9.337373737373738e-05, + "loss": 0.3461, + "num_input_tokens_seen": 7741440000, + "step": 75600 + }, + { + "epoch": 0.08, + "grad_norm": 0.06727864672726841, + "learning_rate": 9.336363636363637e-05, + "loss": 0.3447, + "num_input_tokens_seen": 7751680000, + "step": 75700 + }, + { + "epoch": 0.08, + "grad_norm": 0.07593050181996924, + "learning_rate": 9.335353535353536e-05, + "loss": 0.3459, + "num_input_tokens_seen": 7761920000, + "step": 75800 + }, + { + "epoch": 0.08, + "grad_norm": 0.08880096761244037, + "learning_rate": 9.334343434343434e-05, + "loss": 0.3475, + "num_input_tokens_seen": 7772160000, + "step": 75900 + }, + { + "epoch": 0.08, + "grad_norm": 0.07259680279755296, + "learning_rate": 9.333333333333334e-05, + "loss": 0.3431, + "num_input_tokens_seen": 7782400000, + "step": 76000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.34137130195367715, + "eval_average_loss_on_sentence_tokens": 0.37163535620452487, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3427734375, + "eval_non_padding_tokens_in_labels": 133.5179, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3844, + "eval_padding_tokens_in_labels": 378.4821, + "eval_reconstruction_accuracy": 0.9287730649207148, + "eval_runtime": 216.7517, + "eval_samples_per_second": 23.068, + "eval_sentence_accuracy": 0.7570926121987547, + "eval_steps_per_second": 0.06, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 7782400000, + "step": 76000 + }, + { + "epoch": 0.08, + "grad_norm": 0.06812677490750443, + "learning_rate": 9.332323232323232e-05, + "loss": 0.3435, + "num_input_tokens_seen": 7792640000, + "step": 76100 + }, + { + "epoch": 0.08, + "grad_norm": 0.06603955605377859, + "learning_rate": 9.331313131313131e-05, + "loss": 0.3431, + "num_input_tokens_seen": 7802880000, + "step": 76200 + }, + { + "epoch": 0.08, + "grad_norm": 0.08076545373428365, + "learning_rate": 9.33030303030303e-05, + "loss": 0.3458, + "num_input_tokens_seen": 7813120000, + "step": 76300 + }, + { + "epoch": 0.08, + "grad_norm": 0.07536046049758754, + "learning_rate": 9.32929292929293e-05, + "loss": 0.3461, + "num_input_tokens_seen": 7823360000, + "step": 76400 + }, + { + "epoch": 0.08, + "grad_norm": 0.07224741590331137, + "learning_rate": 9.328282828282829e-05, + "loss": 0.3444, + "num_input_tokens_seen": 7833600000, + "step": 76500 + }, + { + "epoch": 0.08, + "grad_norm": 0.12621573728019794, + "learning_rate": 9.327272727272728e-05, + "loss": 0.3445, + "num_input_tokens_seen": 7843840000, + "step": 76600 + }, + { + "epoch": 0.08, + "grad_norm": 0.06705272548678033, + "learning_rate": 9.326262626262626e-05, + "loss": 0.3398, + "num_input_tokens_seen": 7854080000, + "step": 76700 + }, + { + "epoch": 0.08, + "grad_norm": 0.08137249687533962, + "learning_rate": 9.325252525252525e-05, + "loss": 0.3452, + "num_input_tokens_seen": 7864320000, + "step": 76800 + }, + { + "epoch": 0.08, + "grad_norm": 0.10118280672974078, + "learning_rate": 9.324242424242424e-05, + "loss": 0.3464, + "num_input_tokens_seen": 7874560000, + "step": 76900 + }, + { + "epoch": 0.08, + "grad_norm": 0.08451054390411802, + "learning_rate": 9.323232323232324e-05, + "loss": 0.3442, + "num_input_tokens_seen": 7884800000, + "step": 77000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.3404160022991633, + "eval_average_loss_on_sentence_tokens": 0.34872788059497994, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.34077149629592896, + "eval_non_padding_tokens_in_labels": 133.5168, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37715, + "eval_padding_tokens_in_labels": 378.4832, + "eval_reconstruction_accuracy": 0.9289657158077549, + "eval_runtime": 196.5465, + "eval_samples_per_second": 25.439, + "eval_sentence_accuracy": 0.7765042080140686, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 7884800000, + "step": 77000 + }, + { + "epoch": 0.08, + "grad_norm": 0.08523871787936498, + "learning_rate": 9.322222222222223e-05, + "loss": 0.3451, + "num_input_tokens_seen": 7895040000, + "step": 77100 + }, + { + "epoch": 0.08, + "grad_norm": 0.06757750407523518, + "learning_rate": 9.321212121212122e-05, + "loss": 0.3429, + "num_input_tokens_seen": 7905280000, + "step": 77200 + }, + { + "epoch": 0.08, + "grad_norm": 0.0829322218316798, + "learning_rate": 9.32020202020202e-05, + "loss": 0.3494, + "num_input_tokens_seen": 7915520000, + "step": 77300 + }, + { + "epoch": 0.08, + "grad_norm": 0.0690358629711246, + "learning_rate": 9.319191919191919e-05, + "loss": 0.3435, + "num_input_tokens_seen": 7925760000, + "step": 77400 + }, + { + "epoch": 0.08, + "grad_norm": 0.05813801278815191, + "learning_rate": 9.318181818181818e-05, + "loss": 0.3477, + "num_input_tokens_seen": 7936000000, + "step": 77500 + }, + { + "epoch": 0.08, + "grad_norm": 0.11575103564692506, + "learning_rate": 9.317171717171717e-05, + "loss": 0.3437, + "num_input_tokens_seen": 7946240000, + "step": 77600 + }, + { + "epoch": 0.08, + "grad_norm": 0.08219761384172114, + "learning_rate": 9.316161616161617e-05, + "loss": 0.3461, + "num_input_tokens_seen": 7956480000, + "step": 77700 + }, + { + "epoch": 0.08, + "grad_norm": 0.07515319660392261, + "learning_rate": 9.315151515151516e-05, + "loss": 0.3442, + "num_input_tokens_seen": 7966720000, + "step": 77800 + }, + { + "epoch": 0.08, + "grad_norm": 0.09295065041473714, + "learning_rate": 9.314141414141414e-05, + "loss": 0.3433, + "num_input_tokens_seen": 7976960000, + "step": 77900 + }, + { + "epoch": 0.08, + "grad_norm": 0.07347154151709316, + "learning_rate": 9.313131313131314e-05, + "loss": 0.3442, + "num_input_tokens_seen": 7987200000, + "step": 78000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.34018828280067864, + "eval_average_loss_on_sentence_tokens": 0.34366982341267266, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.34037110209465027, + "eval_non_padding_tokens_in_labels": 133.53315, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38935, + "eval_padding_tokens_in_labels": 378.46685, + "eval_reconstruction_accuracy": 0.9289464691271946, + "eval_runtime": 1291.8345, + "eval_samples_per_second": 3.87, + "eval_sentence_accuracy": 0.785974482746245, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 7987200000, + "step": 78000 + }, + { + "epoch": 0.08, + "grad_norm": 0.07691991284179092, + "learning_rate": 9.312121212121212e-05, + "loss": 0.3428, + "num_input_tokens_seen": 7997440000, + "step": 78100 + }, + { + "epoch": 0.08, + "grad_norm": 0.0678176635268364, + "learning_rate": 9.311111111111111e-05, + "loss": 0.3437, + "num_input_tokens_seen": 8007680000, + "step": 78200 + }, + { + "epoch": 0.08, + "grad_norm": 0.06109140474544907, + "learning_rate": 9.31010101010101e-05, + "loss": 0.3423, + "num_input_tokens_seen": 8017920000, + "step": 78300 + }, + { + "epoch": 0.08, + "grad_norm": 0.09551448379417074, + "learning_rate": 9.30909090909091e-05, + "loss": 0.3427, + "num_input_tokens_seen": 8028160000, + "step": 78400 + }, + { + "epoch": 0.08, + "grad_norm": 0.08879177068816936, + "learning_rate": 9.308080808080809e-05, + "loss": 0.3442, + "num_input_tokens_seen": 8038400000, + "step": 78500 + }, + { + "epoch": 0.08, + "grad_norm": 0.08198448960114676, + "learning_rate": 9.307070707070708e-05, + "loss": 0.344, + "num_input_tokens_seen": 8048640000, + "step": 78600 + }, + { + "epoch": 0.08, + "grad_norm": 0.07358484539801484, + "learning_rate": 9.306060606060607e-05, + "loss": 0.343, + "num_input_tokens_seen": 8058880000, + "step": 78700 + }, + { + "epoch": 0.08, + "grad_norm": 0.07286765340179834, + "learning_rate": 9.305050505050505e-05, + "loss": 0.3428, + "num_input_tokens_seen": 8069120000, + "step": 78800 + }, + { + "epoch": 0.08, + "grad_norm": 0.10509045627802442, + "learning_rate": 9.304040404040406e-05, + "loss": 0.3444, + "num_input_tokens_seen": 8079360000, + "step": 78900 + }, + { + "epoch": 0.08, + "grad_norm": 0.07368895012235113, + "learning_rate": 9.303030303030303e-05, + "loss": 0.3437, + "num_input_tokens_seen": 8089600000, + "step": 79000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.341238799049002, + "eval_average_loss_on_sentence_tokens": 0.36442352120692717, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.3422558605670929, + "eval_non_padding_tokens_in_labels": 133.4826, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.35725, + "eval_padding_tokens_in_labels": 378.5174, + "eval_reconstruction_accuracy": 0.9288687185249878, + "eval_runtime": 1270.6251, + "eval_samples_per_second": 3.935, + "eval_sentence_accuracy": 0.7631130331796078, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 8089600000, + "step": 79000 + }, + { + "epoch": 0.08, + "grad_norm": 0.07482273516557056, + "learning_rate": 9.302020202020203e-05, + "loss": 0.3448, + "num_input_tokens_seen": 8099840000, + "step": 79100 + }, + { + "epoch": 0.08, + "grad_norm": 0.06861041159944625, + "learning_rate": 9.301010101010102e-05, + "loss": 0.3456, + "num_input_tokens_seen": 8110080000, + "step": 79200 + }, + { + "epoch": 0.08, + "grad_norm": 0.061293447426085314, + "learning_rate": 9.300000000000001e-05, + "loss": 0.3457, + "num_input_tokens_seen": 8120320000, + "step": 79300 + }, + { + "epoch": 0.08, + "grad_norm": 0.09818019859365797, + "learning_rate": 9.298989898989899e-05, + "loss": 0.345, + "num_input_tokens_seen": 8130560000, + "step": 79400 + }, + { + "epoch": 0.08, + "grad_norm": 0.06136832950526215, + "learning_rate": 9.2979797979798e-05, + "loss": 0.3444, + "num_input_tokens_seen": 8140800000, + "step": 79500 + }, + { + "epoch": 0.08, + "grad_norm": 0.14518589213458455, + "learning_rate": 9.296969696969697e-05, + "loss": 0.3462, + "num_input_tokens_seen": 8151040000, + "step": 79600 + }, + { + "epoch": 0.08, + "grad_norm": 0.0883135553978262, + "learning_rate": 9.295959595959597e-05, + "loss": 0.3475, + "num_input_tokens_seen": 8161280000, + "step": 79700 + }, + { + "epoch": 0.08, + "grad_norm": 0.06796560722106207, + "learning_rate": 9.294949494949496e-05, + "loss": 0.3438, + "num_input_tokens_seen": 8171520000, + "step": 79800 + }, + { + "epoch": 0.08, + "grad_norm": 0.07663541291243003, + "learning_rate": 9.293939393939395e-05, + "loss": 0.3447, + "num_input_tokens_seen": 8181760000, + "step": 79900 + }, + { + "epoch": 0.08, + "grad_norm": 0.07265584693048305, + "learning_rate": 9.292929292929293e-05, + "loss": 0.3426, + "num_input_tokens_seen": 8192000000, + "step": 80000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.3408686384642905, + "eval_average_loss_on_sentence_tokens": 0.30789256837767764, + "eval_average_shuffling_prob": 0.435, + "eval_loss": 0.33931639790534973, + "eval_non_padding_tokens_in_labels": 133.5309, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.371, + "eval_padding_tokens_in_labels": 378.4691, + "eval_reconstruction_accuracy": 0.9289904031614619, + "eval_runtime": 1289.4224, + "eval_samples_per_second": 3.878, + "eval_sentence_accuracy": 0.8001821379223716, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.245775, + "num_input_tokens_seen": 8192000000, + "step": 80000 + }, + { + "epoch": 0.08, + "grad_norm": 0.0656279808031633, + "learning_rate": 9.291919191919193e-05, + "loss": 0.3455, + "num_input_tokens_seen": 8202240000, + "step": 80100 + }, + { + "epoch": 0.08, + "grad_norm": 0.085849955790991, + "learning_rate": 9.290909090909091e-05, + "loss": 0.3472, + "num_input_tokens_seen": 8212480000, + "step": 80200 + }, + { + "epoch": 0.08, + "grad_norm": 0.07028302594833441, + "learning_rate": 9.28989898989899e-05, + "loss": 0.3464, + "num_input_tokens_seen": 8222720000, + "step": 80300 + }, + { + "epoch": 0.08, + "grad_norm": 0.09515892734425425, + "learning_rate": 9.28888888888889e-05, + "loss": 0.3416, + "num_input_tokens_seen": 8232960000, + "step": 80400 + }, + { + "epoch": 0.08, + "grad_norm": 0.06316765576093343, + "learning_rate": 9.287878787878789e-05, + "loss": 0.3437, + "num_input_tokens_seen": 8243200000, + "step": 80500 + }, + { + "epoch": 0.08, + "grad_norm": 0.17738214898421117, + "learning_rate": 9.286868686868687e-05, + "loss": 0.3457, + "num_input_tokens_seen": 8253440000, + "step": 80600 + }, + { + "epoch": 0.08, + "grad_norm": 0.09503868018871886, + "learning_rate": 9.285858585858587e-05, + "loss": 0.3454, + "num_input_tokens_seen": 8263680000, + "step": 80700 + }, + { + "epoch": 0.08, + "grad_norm": 0.10113641649589798, + "learning_rate": 9.284848484848485e-05, + "loss": 0.3423, + "num_input_tokens_seen": 8273920000, + "step": 80800 + }, + { + "epoch": 0.08, + "grad_norm": 0.07925646238366256, + "learning_rate": 9.283838383838384e-05, + "loss": 0.3401, + "num_input_tokens_seen": 8284160000, + "step": 80900 + }, + { + "epoch": 0.08, + "grad_norm": 0.07627620338141916, + "learning_rate": 9.282828282828283e-05, + "loss": 0.3454, + "num_input_tokens_seen": 8294400000, + "step": 81000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.3411574813063098, + "eval_average_loss_on_sentence_tokens": 0.38617278159397145, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.34321290254592896, + "eval_non_padding_tokens_in_labels": 133.5449, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3784, + "eval_padding_tokens_in_labels": 378.4551, + "eval_reconstruction_accuracy": 0.9288442099682978, + "eval_runtime": 390.4926, + "eval_samples_per_second": 12.804, + "eval_sentence_accuracy": 0.7470795126240422, + "eval_steps_per_second": 0.033, + "eval_variance_shuffling_prob": 0.247975, + "num_input_tokens_seen": 8294400000, + "step": 81000 + }, + { + "epoch": 0.08, + "grad_norm": 0.10697706854208926, + "learning_rate": 9.281818181818183e-05, + "loss": 0.3442, + "num_input_tokens_seen": 8304640000, + "step": 81100 + }, + { + "epoch": 0.08, + "grad_norm": 0.07180033529637163, + "learning_rate": 9.28080808080808e-05, + "loss": 0.3427, + "num_input_tokens_seen": 8314880000, + "step": 81200 + }, + { + "epoch": 0.08, + "grad_norm": 0.08454563868959443, + "learning_rate": 9.279797979797981e-05, + "loss": 0.346, + "num_input_tokens_seen": 8325120000, + "step": 81300 + }, + { + "epoch": 0.08, + "grad_norm": 0.06366271910482957, + "learning_rate": 9.278787878787879e-05, + "loss": 0.3438, + "num_input_tokens_seen": 8335360000, + "step": 81400 + }, + { + "epoch": 0.08, + "grad_norm": 0.07843829012801484, + "learning_rate": 9.277777777777778e-05, + "loss": 0.345, + "num_input_tokens_seen": 8345600000, + "step": 81500 + }, + { + "epoch": 0.08, + "grad_norm": 0.0851269571532537, + "learning_rate": 9.276767676767677e-05, + "loss": 0.3445, + "num_input_tokens_seen": 8355840000, + "step": 81600 + }, + { + "epoch": 0.08, + "grad_norm": 0.07515667256972068, + "learning_rate": 9.275757575757576e-05, + "loss": 0.3459, + "num_input_tokens_seen": 8366080000, + "step": 81700 + }, + { + "epoch": 0.08, + "grad_norm": 0.07894990357927767, + "learning_rate": 9.274747474747476e-05, + "loss": 0.3448, + "num_input_tokens_seen": 8376320000, + "step": 81800 + }, + { + "epoch": 0.08, + "grad_norm": 0.09932349825793645, + "learning_rate": 9.273737373737375e-05, + "loss": 0.3445, + "num_input_tokens_seen": 8386560000, + "step": 81900 + }, + { + "epoch": 0.08, + "grad_norm": 0.09871964480215538, + "learning_rate": 9.272727272727273e-05, + "loss": 0.3423, + "num_input_tokens_seen": 8396800000, + "step": 82000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.34010406247328046, + "eval_average_loss_on_sentence_tokens": 0.3496776231642233, + "eval_average_shuffling_prob": 0.49, + "eval_loss": 0.340576171875, + "eval_non_padding_tokens_in_labels": 133.524, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38495, + "eval_padding_tokens_in_labels": 378.476, + "eval_reconstruction_accuracy": 0.9289674691638422, + "eval_runtime": 1027.4407, + "eval_samples_per_second": 4.866, + "eval_sentence_accuracy": 0.773677929908303, + "eval_steps_per_second": 0.013, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 8396800000, + "step": 82000 + }, + { + "epoch": 0.08, + "grad_norm": 0.08147162039993062, + "learning_rate": 9.271717171717172e-05, + "loss": 0.3434, + "num_input_tokens_seen": 8407040000, + "step": 82100 + }, + { + "epoch": 0.08, + "grad_norm": 0.08167164751579112, + "learning_rate": 9.270707070707071e-05, + "loss": 0.3414, + "num_input_tokens_seen": 8417280000, + "step": 82200 + }, + { + "epoch": 0.08, + "grad_norm": 0.059350246285675326, + "learning_rate": 9.26969696969697e-05, + "loss": 0.3451, + "num_input_tokens_seen": 8427520000, + "step": 82300 + }, + { + "epoch": 0.08, + "grad_norm": 0.09758052035385542, + "learning_rate": 9.26868686868687e-05, + "loss": 0.341, + "num_input_tokens_seen": 8437760000, + "step": 82400 + }, + { + "epoch": 0.08, + "grad_norm": 0.10631501464894487, + "learning_rate": 9.267676767676769e-05, + "loss": 0.3457, + "num_input_tokens_seen": 8448000000, + "step": 82500 + }, + { + "epoch": 0.08, + "grad_norm": 0.07155643743286304, + "learning_rate": 9.266666666666666e-05, + "loss": 0.3451, + "num_input_tokens_seen": 8458240000, + "step": 82600 + }, + { + "epoch": 0.08, + "grad_norm": 0.10517632600294342, + "learning_rate": 9.265656565656567e-05, + "loss": 0.3456, + "num_input_tokens_seen": 8468480000, + "step": 82700 + }, + { + "epoch": 0.08, + "grad_norm": 0.08390653823513555, + "learning_rate": 9.264646464646465e-05, + "loss": 0.3447, + "num_input_tokens_seen": 8478720000, + "step": 82800 + }, + { + "epoch": 0.08, + "grad_norm": 0.0885248336206367, + "learning_rate": 9.263636363636364e-05, + "loss": 0.3434, + "num_input_tokens_seen": 8488960000, + "step": 82900 + }, + { + "epoch": 0.08, + "grad_norm": 0.1482903321934672, + "learning_rate": 9.262626262626263e-05, + "loss": 0.3464, + "num_input_tokens_seen": 8499200000, + "step": 83000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.3405094459362505, + "eval_average_loss_on_sentence_tokens": 0.328369655254877, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.3399121165275574, + "eval_non_padding_tokens_in_labels": 133.55745, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37935, + "eval_padding_tokens_in_labels": 378.44255, + "eval_reconstruction_accuracy": 0.9289912798130072, + "eval_runtime": 929.5414, + "eval_samples_per_second": 5.379, + "eval_sentence_accuracy": 0.7916808728264576, + "eval_steps_per_second": 0.014, + "eval_variance_shuffling_prob": 0.24797499999999995, + "num_input_tokens_seen": 8499200000, + "step": 83000 + }, + { + "epoch": 0.08, + "grad_norm": 0.06361566670577058, + "learning_rate": 9.261616161616162e-05, + "loss": 0.3455, + "num_input_tokens_seen": 8509440000, + "step": 83100 + }, + { + "epoch": 0.08, + "grad_norm": 0.07186764272580269, + "learning_rate": 9.26060606060606e-05, + "loss": 0.3449, + "num_input_tokens_seen": 8519680000, + "step": 83200 + }, + { + "epoch": 0.08, + "grad_norm": 0.0791163062848625, + "learning_rate": 9.259595959595961e-05, + "loss": 0.3444, + "num_input_tokens_seen": 8529920000, + "step": 83300 + }, + { + "epoch": 0.08, + "grad_norm": 0.1385142461376765, + "learning_rate": 9.258585858585859e-05, + "loss": 0.3416, + "num_input_tokens_seen": 8540160000, + "step": 83400 + }, + { + "epoch": 0.08, + "grad_norm": 0.07890019718173553, + "learning_rate": 9.257575757575758e-05, + "loss": 0.3434, + "num_input_tokens_seen": 8550400000, + "step": 83500 + }, + { + "epoch": 0.08, + "grad_norm": 0.10234505601588344, + "learning_rate": 9.256565656565657e-05, + "loss": 0.3447, + "num_input_tokens_seen": 8560640000, + "step": 83600 + }, + { + "epoch": 0.08, + "grad_norm": 0.07374841585316788, + "learning_rate": 9.255555555555556e-05, + "loss": 0.347, + "num_input_tokens_seen": 8570880000, + "step": 83700 + }, + { + "epoch": 0.08, + "grad_norm": 0.07513312300934936, + "learning_rate": 9.254545454545454e-05, + "loss": 0.343, + "num_input_tokens_seen": 8581120000, + "step": 83800 + }, + { + "epoch": 0.08, + "grad_norm": 0.08644521085368582, + "learning_rate": 9.253535353535355e-05, + "loss": 0.3426, + "num_input_tokens_seen": 8591360000, + "step": 83900 + }, + { + "epoch": 0.08, + "grad_norm": 0.10637691689009747, + "learning_rate": 9.252525252525253e-05, + "loss": 0.3441, + "num_input_tokens_seen": 8601600000, + "step": 84000 + }, + { + "epoch": 0.08, + "eval_average_loss_on_non_sentence_tokens": 0.34017264883364084, + "eval_average_loss_on_sentence_tokens": 0.35571673608315696, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.3409472703933716, + "eval_non_padding_tokens_in_labels": 133.55385, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.44615, + "eval_reconstruction_accuracy": 0.9289920217717826, + "eval_runtime": 665.1106, + "eval_samples_per_second": 7.518, + "eval_sentence_accuracy": 0.7710894180558796, + "eval_steps_per_second": 0.02, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 8601600000, + "step": 84000 + }, + { + "epoch": 0.08, + "grad_norm": 0.061016051163326854, + "learning_rate": 9.251515151515152e-05, + "loss": 0.3429, + "num_input_tokens_seen": 8611840000, + "step": 84100 + }, + { + "epoch": 0.08, + "grad_norm": 0.059152474457208996, + "learning_rate": 9.250505050505051e-05, + "loss": 0.3473, + "num_input_tokens_seen": 8622080000, + "step": 84200 + }, + { + "epoch": 0.08, + "grad_norm": 0.06333772257282903, + "learning_rate": 9.24949494949495e-05, + "loss": 0.3437, + "num_input_tokens_seen": 8632320000, + "step": 84300 + }, + { + "epoch": 0.08, + "grad_norm": 0.20835928258618486, + "learning_rate": 9.248484848484848e-05, + "loss": 0.346, + "num_input_tokens_seen": 8642560000, + "step": 84400 + }, + { + "epoch": 0.08, + "grad_norm": 0.07385557184218723, + "learning_rate": 9.247474747474749e-05, + "loss": 0.3454, + "num_input_tokens_seen": 8652800000, + "step": 84500 + }, + { + "epoch": 0.08, + "grad_norm": 0.07531436407138824, + "learning_rate": 9.246464646464646e-05, + "loss": 0.344, + "num_input_tokens_seen": 8663040000, + "step": 84600 + }, + { + "epoch": 0.08, + "grad_norm": 0.0718995354980065, + "learning_rate": 9.245454545454546e-05, + "loss": 0.3457, + "num_input_tokens_seen": 8673280000, + "step": 84700 + }, + { + "epoch": 0.08, + "grad_norm": 0.07509291631425932, + "learning_rate": 9.244444444444445e-05, + "loss": 0.3427, + "num_input_tokens_seen": 8683520000, + "step": 84800 + }, + { + "epoch": 0.08, + "grad_norm": 0.06572581382098575, + "learning_rate": 9.243434343434344e-05, + "loss": 0.3472, + "num_input_tokens_seen": 8693760000, + "step": 84900 + }, + { + "epoch": 0.09, + "grad_norm": 0.09349556634689257, + "learning_rate": 9.242424242424242e-05, + "loss": 0.342, + "num_input_tokens_seen": 8704000000, + "step": 85000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.34011396984034464, + "eval_average_loss_on_sentence_tokens": 0.33582114081105296, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.3399316370487213, + "eval_non_padding_tokens_in_labels": 133.517, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3708, + "eval_padding_tokens_in_labels": 378.483, + "eval_reconstruction_accuracy": 0.9289447567187336, + "eval_runtime": 1527.6877, + "eval_samples_per_second": 3.273, + "eval_sentence_accuracy": 0.7889532901466075, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 8704000000, + "step": 85000 + }, + { + "epoch": 0.09, + "grad_norm": 0.11214003517500175, + "learning_rate": 9.241414141414142e-05, + "loss": 0.3436, + "num_input_tokens_seen": 8714240000, + "step": 85100 + }, + { + "epoch": 0.09, + "grad_norm": 0.08083096617906413, + "learning_rate": 9.24040404040404e-05, + "loss": 0.3412, + "num_input_tokens_seen": 8724480000, + "step": 85200 + }, + { + "epoch": 0.09, + "grad_norm": 0.07267035416635911, + "learning_rate": 9.23939393939394e-05, + "loss": 0.3455, + "num_input_tokens_seen": 8734720000, + "step": 85300 + }, + { + "epoch": 0.09, + "grad_norm": 0.09576339672749544, + "learning_rate": 9.238383838383839e-05, + "loss": 0.3432, + "num_input_tokens_seen": 8744960000, + "step": 85400 + }, + { + "epoch": 0.09, + "grad_norm": 0.06513228365598901, + "learning_rate": 9.237373737373738e-05, + "loss": 0.3462, + "num_input_tokens_seen": 8755200000, + "step": 85500 + }, + { + "epoch": 0.09, + "grad_norm": 0.08116751740143478, + "learning_rate": 9.236363636363636e-05, + "loss": 0.342, + "num_input_tokens_seen": 8765440000, + "step": 85600 + }, + { + "epoch": 0.09, + "grad_norm": 0.08267364697683331, + "learning_rate": 9.235353535353536e-05, + "loss": 0.3425, + "num_input_tokens_seen": 8775680000, + "step": 85700 + }, + { + "epoch": 0.09, + "grad_norm": 0.09587553973610198, + "learning_rate": 9.234343434343434e-05, + "loss": 0.3451, + "num_input_tokens_seen": 8785920000, + "step": 85800 + }, + { + "epoch": 0.09, + "grad_norm": 0.11202949309733819, + "learning_rate": 9.233333333333333e-05, + "loss": 0.3453, + "num_input_tokens_seen": 8796160000, + "step": 85900 + }, + { + "epoch": 0.09, + "grad_norm": 0.09606385851590031, + "learning_rate": 9.232323232323232e-05, + "loss": 0.3413, + "num_input_tokens_seen": 8806400000, + "step": 86000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.3405377344519306, + "eval_average_loss_on_sentence_tokens": 0.38161282695545656, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.3423437476158142, + "eval_non_padding_tokens_in_labels": 133.5055, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38325, + "eval_padding_tokens_in_labels": 378.4945, + "eval_reconstruction_accuracy": 0.9289641179149659, + "eval_runtime": 1031.9835, + "eval_samples_per_second": 4.845, + "eval_sentence_accuracy": 0.7494347443788468, + "eval_steps_per_second": 0.013, + "eval_variance_shuffling_prob": 0.24750000000000005, + "num_input_tokens_seen": 8806400000, + "step": 86000 + }, + { + "epoch": 0.09, + "grad_norm": 0.06562712313277312, + "learning_rate": 9.231313131313132e-05, + "loss": 0.3429, + "num_input_tokens_seen": 8816640000, + "step": 86100 + }, + { + "epoch": 0.09, + "grad_norm": 0.06466471915057402, + "learning_rate": 9.230303030303031e-05, + "loss": 0.3451, + "num_input_tokens_seen": 8826880000, + "step": 86200 + }, + { + "epoch": 0.09, + "grad_norm": 0.09768677234500682, + "learning_rate": 9.22929292929293e-05, + "loss": 0.3408, + "num_input_tokens_seen": 8837120000, + "step": 86300 + }, + { + "epoch": 0.09, + "grad_norm": 0.07147468282312987, + "learning_rate": 9.228282828282828e-05, + "loss": 0.3431, + "num_input_tokens_seen": 8847360000, + "step": 86400 + }, + { + "epoch": 0.09, + "grad_norm": 0.10760857981235282, + "learning_rate": 9.227272727272727e-05, + "loss": 0.345, + "num_input_tokens_seen": 8857600000, + "step": 86500 + }, + { + "epoch": 0.09, + "grad_norm": 0.10284808686523512, + "learning_rate": 9.226262626262626e-05, + "loss": 0.342, + "num_input_tokens_seen": 8867840000, + "step": 86600 + }, + { + "epoch": 0.09, + "grad_norm": 0.0711455991858897, + "learning_rate": 9.225252525252525e-05, + "loss": 0.3409, + "num_input_tokens_seen": 8878080000, + "step": 86700 + }, + { + "epoch": 0.09, + "grad_norm": 0.11020498964494593, + "learning_rate": 9.224242424242425e-05, + "loss": 0.3432, + "num_input_tokens_seen": 8888320000, + "step": 86800 + }, + { + "epoch": 0.09, + "grad_norm": 0.06028975605785087, + "learning_rate": 9.223232323232324e-05, + "loss": 0.3449, + "num_input_tokens_seen": 8898560000, + "step": 86900 + }, + { + "epoch": 0.09, + "grad_norm": 0.08665053191012796, + "learning_rate": 9.222222222222223e-05, + "loss": 0.344, + "num_input_tokens_seen": 8908800000, + "step": 87000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.34020601475537343, + "eval_average_loss_on_sentence_tokens": 0.36437594277657304, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3412402272224426, + "eval_non_padding_tokens_in_labels": 133.58975, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39115, + "eval_padding_tokens_in_labels": 378.41025, + "eval_reconstruction_accuracy": 0.9290044449156616, + "eval_runtime": 874.1361, + "eval_samples_per_second": 5.72, + "eval_sentence_accuracy": 0.763628941087803, + "eval_steps_per_second": 0.015, + "eval_variance_shuffling_prob": 0.24959999999999996, + "num_input_tokens_seen": 8908800000, + "step": 87000 + }, + { + "epoch": 0.09, + "grad_norm": 0.0682904679549055, + "learning_rate": 9.221212121212122e-05, + "loss": 0.3404, + "num_input_tokens_seen": 8919040000, + "step": 87100 + }, + { + "epoch": 0.09, + "grad_norm": 0.08797518582965361, + "learning_rate": 9.220202020202021e-05, + "loss": 0.3418, + "num_input_tokens_seen": 8929280000, + "step": 87200 + }, + { + "epoch": 0.09, + "grad_norm": 0.10669468838484802, + "learning_rate": 9.219191919191919e-05, + "loss": 0.3417, + "num_input_tokens_seen": 8939520000, + "step": 87300 + }, + { + "epoch": 0.09, + "grad_norm": 0.09154011830188781, + "learning_rate": 9.218181818181819e-05, + "loss": 0.344, + "num_input_tokens_seen": 8949760000, + "step": 87400 + }, + { + "epoch": 0.09, + "grad_norm": 0.12970164937079615, + "learning_rate": 9.217171717171718e-05, + "loss": 0.3427, + "num_input_tokens_seen": 8960000000, + "step": 87500 + }, + { + "epoch": 0.09, + "grad_norm": 0.11602775778409376, + "learning_rate": 9.216161616161617e-05, + "loss": 0.3441, + "num_input_tokens_seen": 8970240000, + "step": 87600 + }, + { + "epoch": 0.09, + "grad_norm": 0.060430133469014186, + "learning_rate": 9.215151515151516e-05, + "loss": 0.3465, + "num_input_tokens_seen": 8980480000, + "step": 87700 + }, + { + "epoch": 0.09, + "grad_norm": 0.10925675071129455, + "learning_rate": 9.214141414141415e-05, + "loss": 0.3428, + "num_input_tokens_seen": 8990720000, + "step": 87800 + }, + { + "epoch": 0.09, + "grad_norm": 0.08330725765431289, + "learning_rate": 9.213131313131313e-05, + "loss": 0.3418, + "num_input_tokens_seen": 9000960000, + "step": 87900 + }, + { + "epoch": 0.09, + "grad_norm": 0.0956553101303389, + "learning_rate": 9.212121212121214e-05, + "loss": 0.3449, + "num_input_tokens_seen": 9011200000, + "step": 88000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.33898415767986073, + "eval_average_loss_on_sentence_tokens": 0.37109927883678956, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.34046876430511475, + "eval_non_padding_tokens_in_labels": 133.5541, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37795, + "eval_padding_tokens_in_labels": 378.4459, + "eval_reconstruction_accuracy": 0.9290763170084979, + "eval_runtime": 349.6627, + "eval_samples_per_second": 14.299, + "eval_sentence_accuracy": 0.7628662946148187, + "eval_steps_per_second": 0.037, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 9011200000, + "step": 88000 + }, + { + "epoch": 0.09, + "grad_norm": 0.07489099789647385, + "learning_rate": 9.211111111111112e-05, + "loss": 0.3447, + "num_input_tokens_seen": 9021440000, + "step": 88100 + }, + { + "epoch": 0.09, + "grad_norm": 0.06176633418945281, + "learning_rate": 9.210101010101011e-05, + "loss": 0.3437, + "num_input_tokens_seen": 9031680000, + "step": 88200 + }, + { + "epoch": 0.09, + "grad_norm": 0.06372396782298317, + "learning_rate": 9.20909090909091e-05, + "loss": 0.3418, + "num_input_tokens_seen": 9041920000, + "step": 88300 + }, + { + "epoch": 0.09, + "grad_norm": 0.10575555398608698, + "learning_rate": 9.208080808080809e-05, + "loss": 0.343, + "num_input_tokens_seen": 9052160000, + "step": 88400 + }, + { + "epoch": 0.09, + "grad_norm": 0.0998646886119458, + "learning_rate": 9.207070707070707e-05, + "loss": 0.3424, + "num_input_tokens_seen": 9062400000, + "step": 88500 + }, + { + "epoch": 0.09, + "grad_norm": 0.09162262855306295, + "learning_rate": 9.206060606060608e-05, + "loss": 0.3438, + "num_input_tokens_seen": 9072640000, + "step": 88600 + }, + { + "epoch": 0.09, + "grad_norm": 0.08474924444542963, + "learning_rate": 9.205050505050505e-05, + "loss": 0.3455, + "num_input_tokens_seen": 9082880000, + "step": 88700 + }, + { + "epoch": 0.09, + "grad_norm": 0.06473403821305589, + "learning_rate": 9.204040404040405e-05, + "loss": 0.3418, + "num_input_tokens_seen": 9093120000, + "step": 88800 + }, + { + "epoch": 0.09, + "grad_norm": 0.0973356133025493, + "learning_rate": 9.203030303030304e-05, + "loss": 0.3447, + "num_input_tokens_seen": 9103360000, + "step": 88900 + }, + { + "epoch": 0.09, + "grad_norm": 0.07217638201576311, + "learning_rate": 9.202020202020203e-05, + "loss": 0.3447, + "num_input_tokens_seen": 9113600000, + "step": 89000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.3398090456938247, + "eval_average_loss_on_sentence_tokens": 0.3340042331614964, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.339599609375, + "eval_non_padding_tokens_in_labels": 133.52125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38755, + "eval_padding_tokens_in_labels": 378.47875, + "eval_reconstruction_accuracy": 0.9292005253664207, + "eval_runtime": 170.6226, + "eval_samples_per_second": 29.304, + "eval_sentence_accuracy": 0.7822150842500045, + "eval_steps_per_second": 0.076, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 9113600000, + "step": 89000 + }, + { + "epoch": 0.09, + "grad_norm": 0.07061927483676173, + "learning_rate": 9.201010101010101e-05, + "loss": 0.3448, + "num_input_tokens_seen": 9123840000, + "step": 89100 + }, + { + "epoch": 0.09, + "grad_norm": 0.10355302971585338, + "learning_rate": 9.200000000000001e-05, + "loss": 0.3442, + "num_input_tokens_seen": 9134080000, + "step": 89200 + }, + { + "epoch": 0.09, + "grad_norm": 0.07072210050668465, + "learning_rate": 9.198989898989899e-05, + "loss": 0.3424, + "num_input_tokens_seen": 9144320000, + "step": 89300 + }, + { + "epoch": 0.09, + "grad_norm": 0.07278700210195294, + "learning_rate": 9.197979797979798e-05, + "loss": 0.3428, + "num_input_tokens_seen": 9154560000, + "step": 89400 + }, + { + "epoch": 0.09, + "grad_norm": 0.09892051114112389, + "learning_rate": 9.196969696969698e-05, + "loss": 0.3421, + "num_input_tokens_seen": 9164800000, + "step": 89500 + }, + { + "epoch": 0.09, + "grad_norm": 0.09274136465713577, + "learning_rate": 9.195959595959597e-05, + "loss": 0.342, + "num_input_tokens_seen": 9175040000, + "step": 89600 + }, + { + "epoch": 0.09, + "grad_norm": 0.05720042984416338, + "learning_rate": 9.194949494949495e-05, + "loss": 0.3414, + "num_input_tokens_seen": 9185280000, + "step": 89700 + }, + { + "epoch": 0.09, + "grad_norm": 0.06543856176291012, + "learning_rate": 9.193939393939395e-05, + "loss": 0.343, + "num_input_tokens_seen": 9195520000, + "step": 89800 + }, + { + "epoch": 0.09, + "grad_norm": 0.09840385140066671, + "learning_rate": 9.192929292929293e-05, + "loss": 0.344, + "num_input_tokens_seen": 9205760000, + "step": 89900 + }, + { + "epoch": 0.09, + "grad_norm": 0.06991607452819766, + "learning_rate": 9.191919191919192e-05, + "loss": 0.3439, + "num_input_tokens_seen": 9216000000, + "step": 90000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.3394724830060792, + "eval_average_loss_on_sentence_tokens": 0.3607540981699584, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3404003977775574, + "eval_non_padding_tokens_in_labels": 133.4872, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3731, + "eval_padding_tokens_in_labels": 378.5128, + "eval_reconstruction_accuracy": 0.9291192392647224, + "eval_runtime": 169.3894, + "eval_samples_per_second": 29.518, + "eval_sentence_accuracy": 0.7648940370018124, + "eval_steps_per_second": 0.077, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 9216000000, + "step": 90000 + }, + { + "epoch": 0.09, + "grad_norm": 0.12325044474318388, + "learning_rate": 9.190909090909091e-05, + "loss": 0.3457, + "num_input_tokens_seen": 9226240000, + "step": 90100 + }, + { + "epoch": 0.09, + "grad_norm": 0.06721112416749293, + "learning_rate": 9.18989898989899e-05, + "loss": 0.3424, + "num_input_tokens_seen": 9236480000, + "step": 90200 + }, + { + "epoch": 0.09, + "grad_norm": 0.06343014377014543, + "learning_rate": 9.188888888888888e-05, + "loss": 0.342, + "num_input_tokens_seen": 9246720000, + "step": 90300 + }, + { + "epoch": 0.09, + "grad_norm": 0.10825750064013476, + "learning_rate": 9.187878787878789e-05, + "loss": 0.3412, + "num_input_tokens_seen": 9256960000, + "step": 90400 + }, + { + "epoch": 0.09, + "grad_norm": 0.0683988167873696, + "learning_rate": 9.186868686868687e-05, + "loss": 0.3419, + "num_input_tokens_seen": 9267200000, + "step": 90500 + }, + { + "epoch": 0.09, + "grad_norm": 0.07216541811006472, + "learning_rate": 9.185858585858586e-05, + "loss": 0.343, + "num_input_tokens_seen": 9277440000, + "step": 90600 + }, + { + "epoch": 0.09, + "grad_norm": 0.09534694482099638, + "learning_rate": 9.184848484848485e-05, + "loss": 0.3426, + "num_input_tokens_seen": 9287680000, + "step": 90700 + }, + { + "epoch": 0.09, + "grad_norm": 0.1030655028650778, + "learning_rate": 9.183838383838384e-05, + "loss": 0.3425, + "num_input_tokens_seen": 9297920000, + "step": 90800 + }, + { + "epoch": 0.09, + "grad_norm": 0.0812375909127833, + "learning_rate": 9.182828282828284e-05, + "loss": 0.3422, + "num_input_tokens_seen": 9308160000, + "step": 90900 + }, + { + "epoch": 0.09, + "grad_norm": 0.10747446485155411, + "learning_rate": 9.181818181818183e-05, + "loss": 0.3427, + "num_input_tokens_seen": 9318400000, + "step": 91000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.3398029431500929, + "eval_average_loss_on_sentence_tokens": 0.3522256898906217, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.3403613269329071, + "eval_non_padding_tokens_in_labels": 133.51605, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3724, + "eval_padding_tokens_in_labels": 378.48395, + "eval_reconstruction_accuracy": 0.9291124639487429, + "eval_runtime": 207.229, + "eval_samples_per_second": 24.128, + "eval_sentence_accuracy": 0.7716815906113733, + "eval_steps_per_second": 0.063, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 9318400000, + "step": 91000 + }, + { + "epoch": 0.09, + "grad_norm": 0.10559360302031201, + "learning_rate": 9.180808080808081e-05, + "loss": 0.345, + "num_input_tokens_seen": 9328640000, + "step": 91100 + }, + { + "epoch": 0.09, + "grad_norm": 0.16541195676469786, + "learning_rate": 9.17979797979798e-05, + "loss": 0.3399, + "num_input_tokens_seen": 9338880000, + "step": 91200 + }, + { + "epoch": 0.09, + "grad_norm": 0.06520275360347193, + "learning_rate": 9.178787878787879e-05, + "loss": 0.3446, + "num_input_tokens_seen": 9349120000, + "step": 91300 + }, + { + "epoch": 0.09, + "grad_norm": 0.06761143347935858, + "learning_rate": 9.177777777777778e-05, + "loss": 0.3433, + "num_input_tokens_seen": 9359360000, + "step": 91400 + }, + { + "epoch": 0.09, + "grad_norm": 0.08664969221616324, + "learning_rate": 9.176767676767677e-05, + "loss": 0.3436, + "num_input_tokens_seen": 9369600000, + "step": 91500 + }, + { + "epoch": 0.09, + "grad_norm": 0.0954156933547266, + "learning_rate": 9.175757575757577e-05, + "loss": 0.3428, + "num_input_tokens_seen": 9379840000, + "step": 91600 + }, + { + "epoch": 0.09, + "grad_norm": 0.125747637152158, + "learning_rate": 9.174747474747475e-05, + "loss": 0.3446, + "num_input_tokens_seen": 9390080000, + "step": 91700 + }, + { + "epoch": 0.09, + "grad_norm": 0.08597708733124193, + "learning_rate": 9.173737373737374e-05, + "loss": 0.3422, + "num_input_tokens_seen": 9400320000, + "step": 91800 + }, + { + "epoch": 0.09, + "grad_norm": 0.07023084057707848, + "learning_rate": 9.172727272727273e-05, + "loss": 0.3462, + "num_input_tokens_seen": 9410560000, + "step": 91900 + }, + { + "epoch": 0.09, + "grad_norm": 0.06070007469670599, + "learning_rate": 9.171717171717172e-05, + "loss": 0.3426, + "num_input_tokens_seen": 9420800000, + "step": 92000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.3394812111849839, + "eval_average_loss_on_sentence_tokens": 0.38395972517995264, + "eval_average_shuffling_prob": 0.545, + "eval_loss": 0.34154295921325684, + "eval_non_padding_tokens_in_labels": 133.53465, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39145, + "eval_padding_tokens_in_labels": 378.46535, + "eval_reconstruction_accuracy": 0.9291135416483638, + "eval_runtime": 192.6318, + "eval_samples_per_second": 25.956, + "eval_sentence_accuracy": 0.7512292066682219, + "eval_steps_per_second": 0.067, + "eval_variance_shuffling_prob": 0.247975, + "num_input_tokens_seen": 9420800000, + "step": 92000 + }, + { + "epoch": 0.09, + "grad_norm": 0.08300365269279467, + "learning_rate": 9.170707070707071e-05, + "loss": 0.3442, + "num_input_tokens_seen": 9431040000, + "step": 92100 + }, + { + "epoch": 0.09, + "grad_norm": 0.06238361601470299, + "learning_rate": 9.16969696969697e-05, + "loss": 0.3419, + "num_input_tokens_seen": 9441280000, + "step": 92200 + }, + { + "epoch": 0.09, + "grad_norm": 0.06336933120232112, + "learning_rate": 9.168686868686868e-05, + "loss": 0.3433, + "num_input_tokens_seen": 9451520000, + "step": 92300 + }, + { + "epoch": 0.09, + "grad_norm": 0.0713563643836952, + "learning_rate": 9.167676767676769e-05, + "loss": 0.3437, + "num_input_tokens_seen": 9461760000, + "step": 92400 + }, + { + "epoch": 0.09, + "grad_norm": 0.08576764041782922, + "learning_rate": 9.166666666666667e-05, + "loss": 0.3415, + "num_input_tokens_seen": 9472000000, + "step": 92500 + }, + { + "epoch": 0.09, + "grad_norm": 0.06947579851570143, + "learning_rate": 9.165656565656566e-05, + "loss": 0.3414, + "num_input_tokens_seen": 9482240000, + "step": 92600 + }, + { + "epoch": 0.09, + "grad_norm": 0.06162582309789329, + "learning_rate": 9.164646464646465e-05, + "loss": 0.3407, + "num_input_tokens_seen": 9492480000, + "step": 92700 + }, + { + "epoch": 0.09, + "grad_norm": 0.06385515450359264, + "learning_rate": 9.163636363636364e-05, + "loss": 0.3426, + "num_input_tokens_seen": 9502720000, + "step": 92800 + }, + { + "epoch": 0.09, + "grad_norm": 0.06324595781471544, + "learning_rate": 9.162626262626262e-05, + "loss": 0.3438, + "num_input_tokens_seen": 9512960000, + "step": 92900 + }, + { + "epoch": 0.09, + "grad_norm": 0.09913526865886424, + "learning_rate": 9.161616161616163e-05, + "loss": 0.3417, + "num_input_tokens_seen": 9523200000, + "step": 93000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.33902476043354435, + "eval_average_loss_on_sentence_tokens": 0.35086911285400785, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.33955079317092896, + "eval_non_padding_tokens_in_labels": 133.52025, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3863, + "eval_padding_tokens_in_labels": 378.47975, + "eval_reconstruction_accuracy": 0.9291236398095484, + "eval_runtime": 204.0835, + "eval_samples_per_second": 24.5, + "eval_sentence_accuracy": 0.7745213271843092, + "eval_steps_per_second": 0.064, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 9523200000, + "step": 93000 + }, + { + "epoch": 0.09, + "grad_norm": 0.06772276567123629, + "learning_rate": 9.16060606060606e-05, + "loss": 0.3433, + "num_input_tokens_seen": 9533440000, + "step": 93100 + }, + { + "epoch": 0.09, + "grad_norm": 0.09056889692431093, + "learning_rate": 9.15959595959596e-05, + "loss": 0.3396, + "num_input_tokens_seen": 9543680000, + "step": 93200 + }, + { + "epoch": 0.09, + "grad_norm": 0.10522661185074027, + "learning_rate": 9.158585858585859e-05, + "loss": 0.3449, + "num_input_tokens_seen": 9553920000, + "step": 93300 + }, + { + "epoch": 0.09, + "grad_norm": 0.08799617972153644, + "learning_rate": 9.157575757575758e-05, + "loss": 0.342, + "num_input_tokens_seen": 9564160000, + "step": 93400 + }, + { + "epoch": 0.09, + "grad_norm": 0.0848877333826655, + "learning_rate": 9.156565656565656e-05, + "loss": 0.3445, + "num_input_tokens_seen": 9574400000, + "step": 93500 + }, + { + "epoch": 0.09, + "grad_norm": 0.0833692535331706, + "learning_rate": 9.155555555555557e-05, + "loss": 0.3425, + "num_input_tokens_seen": 9584640000, + "step": 93600 + }, + { + "epoch": 0.09, + "grad_norm": 0.10209865907129212, + "learning_rate": 9.154545454545454e-05, + "loss": 0.341, + "num_input_tokens_seen": 9594880000, + "step": 93700 + }, + { + "epoch": 0.09, + "grad_norm": 0.1005026949711253, + "learning_rate": 9.153535353535354e-05, + "loss": 0.3405, + "num_input_tokens_seen": 9605120000, + "step": 93800 + }, + { + "epoch": 0.09, + "grad_norm": 0.06132800419607231, + "learning_rate": 9.152525252525253e-05, + "loss": 0.3433, + "num_input_tokens_seen": 9615360000, + "step": 93900 + }, + { + "epoch": 0.09, + "grad_norm": 0.09906803082570952, + "learning_rate": 9.151515151515152e-05, + "loss": 0.342, + "num_input_tokens_seen": 9625600000, + "step": 94000 + }, + { + "epoch": 0.09, + "eval_average_loss_on_non_sentence_tokens": 0.33915797777371176, + "eval_average_loss_on_sentence_tokens": 0.3511748389299364, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.3397167921066284, + "eval_non_padding_tokens_in_labels": 133.5017, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3822, + "eval_padding_tokens_in_labels": 378.4983, + "eval_reconstruction_accuracy": 0.9292305363883578, + "eval_runtime": 219.0289, + "eval_samples_per_second": 22.828, + "eval_sentence_accuracy": 0.7708965133597717, + "eval_steps_per_second": 0.059, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 9625600000, + "step": 94000 + }, + { + "epoch": 0.09, + "grad_norm": 0.06505832866464673, + "learning_rate": 9.15050505050505e-05, + "loss": 0.3443, + "num_input_tokens_seen": 9635840000, + "step": 94100 + }, + { + "epoch": 0.09, + "grad_norm": 0.07445264652014388, + "learning_rate": 9.14949494949495e-05, + "loss": 0.3416, + "num_input_tokens_seen": 9646080000, + "step": 94200 + }, + { + "epoch": 0.09, + "grad_norm": 0.0752551379574513, + "learning_rate": 9.148484848484848e-05, + "loss": 0.3439, + "num_input_tokens_seen": 9656320000, + "step": 94300 + }, + { + "epoch": 0.09, + "grad_norm": 0.07543611519104658, + "learning_rate": 9.147474747474747e-05, + "loss": 0.3426, + "num_input_tokens_seen": 9666560000, + "step": 94400 + }, + { + "epoch": 0.09, + "grad_norm": 0.07914415290205117, + "learning_rate": 9.146464646464647e-05, + "loss": 0.3431, + "num_input_tokens_seen": 9676800000, + "step": 94500 + }, + { + "epoch": 0.09, + "grad_norm": 0.07791179831690094, + "learning_rate": 9.145454545454546e-05, + "loss": 0.3397, + "num_input_tokens_seen": 9687040000, + "step": 94600 + }, + { + "epoch": 0.09, + "grad_norm": 0.1299793773579509, + "learning_rate": 9.144444444444444e-05, + "loss": 0.3436, + "num_input_tokens_seen": 9697280000, + "step": 94700 + }, + { + "epoch": 0.09, + "grad_norm": 0.08351792223066329, + "learning_rate": 9.143434343434344e-05, + "loss": 0.3428, + "num_input_tokens_seen": 9707520000, + "step": 94800 + }, + { + "epoch": 0.09, + "grad_norm": 0.08287936117248126, + "learning_rate": 9.142424242424242e-05, + "loss": 0.3432, + "num_input_tokens_seen": 9717760000, + "step": 94900 + }, + { + "epoch": 0.1, + "grad_norm": 0.0874785126013654, + "learning_rate": 9.141414141414141e-05, + "loss": 0.3459, + "num_input_tokens_seen": 9728000000, + "step": 95000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.33944769585793116, + "eval_average_loss_on_sentence_tokens": 0.3591522233836725, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.34028321504592896, + "eval_non_padding_tokens_in_labels": 133.4984, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.35565, + "eval_padding_tokens_in_labels": 378.5016, + "eval_reconstruction_accuracy": 0.9290738438085122, + "eval_runtime": 187.9306, + "eval_samples_per_second": 26.606, + "eval_sentence_accuracy": 0.7680298598524952, + "eval_steps_per_second": 0.069, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 9728000000, + "step": 95000 + }, + { + "epoch": 0.1, + "grad_norm": 0.07878880275925587, + "learning_rate": 9.14040404040404e-05, + "loss": 0.3431, + "num_input_tokens_seen": 9738240000, + "step": 95100 + }, + { + "epoch": 0.1, + "grad_norm": 0.08944762277547465, + "learning_rate": 9.13939393939394e-05, + "loss": 0.3417, + "num_input_tokens_seen": 9748480000, + "step": 95200 + }, + { + "epoch": 0.1, + "grad_norm": 0.13923533901659566, + "learning_rate": 9.138383838383839e-05, + "loss": 0.3391, + "num_input_tokens_seen": 9758720000, + "step": 95300 + }, + { + "epoch": 0.1, + "grad_norm": 0.07953572334682035, + "learning_rate": 9.137373737373738e-05, + "loss": 0.3425, + "num_input_tokens_seen": 9768960000, + "step": 95400 + }, + { + "epoch": 0.1, + "grad_norm": 0.09691032320386689, + "learning_rate": 9.136363636363637e-05, + "loss": 0.3412, + "num_input_tokens_seen": 9779200000, + "step": 95500 + }, + { + "epoch": 0.1, + "grad_norm": 0.06747777784422795, + "learning_rate": 9.135353535353535e-05, + "loss": 0.3402, + "num_input_tokens_seen": 9789440000, + "step": 95600 + }, + { + "epoch": 0.1, + "grad_norm": 0.06328075726635154, + "learning_rate": 9.134343434343436e-05, + "loss": 0.3438, + "num_input_tokens_seen": 9799680000, + "step": 95700 + }, + { + "epoch": 0.1, + "grad_norm": 0.09473343337594746, + "learning_rate": 9.133333333333334e-05, + "loss": 0.343, + "num_input_tokens_seen": 9809920000, + "step": 95800 + }, + { + "epoch": 0.1, + "grad_norm": 0.08558492919131178, + "learning_rate": 9.132323232323233e-05, + "loss": 0.3429, + "num_input_tokens_seen": 9820160000, + "step": 95900 + }, + { + "epoch": 0.1, + "grad_norm": 0.07293266538814007, + "learning_rate": 9.131313131313132e-05, + "loss": 0.3397, + "num_input_tokens_seen": 9830400000, + "step": 96000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3396380842156681, + "eval_average_loss_on_sentence_tokens": 0.3550314646608926, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3403027355670929, + "eval_non_padding_tokens_in_labels": 133.54145, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3824, + "eval_padding_tokens_in_labels": 378.45855, + "eval_reconstruction_accuracy": 0.9291697623531117, + "eval_runtime": 199.7746, + "eval_samples_per_second": 25.028, + "eval_sentence_accuracy": 0.7688956879071186, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 9830400000, + "step": 96000 + }, + { + "epoch": 0.1, + "grad_norm": 0.16452847734290502, + "learning_rate": 9.130303030303031e-05, + "loss": 0.3428, + "num_input_tokens_seen": 9840640000, + "step": 96100 + }, + { + "epoch": 0.1, + "grad_norm": 0.06403946891393698, + "learning_rate": 9.12929292929293e-05, + "loss": 0.3438, + "num_input_tokens_seen": 9850880000, + "step": 96200 + }, + { + "epoch": 0.1, + "grad_norm": 0.10737564002249865, + "learning_rate": 9.12828282828283e-05, + "loss": 0.3419, + "num_input_tokens_seen": 9861120000, + "step": 96300 + }, + { + "epoch": 0.1, + "grad_norm": 0.06702555056441523, + "learning_rate": 9.127272727272727e-05, + "loss": 0.3447, + "num_input_tokens_seen": 9871360000, + "step": 96400 + }, + { + "epoch": 0.1, + "grad_norm": 0.0712435377765655, + "learning_rate": 9.126262626262627e-05, + "loss": 0.3439, + "num_input_tokens_seen": 9881600000, + "step": 96500 + }, + { + "epoch": 0.1, + "grad_norm": 0.09675458092812166, + "learning_rate": 9.125252525252526e-05, + "loss": 0.3453, + "num_input_tokens_seen": 9891840000, + "step": 96600 + }, + { + "epoch": 0.1, + "grad_norm": 0.08082898007511545, + "learning_rate": 9.124242424242425e-05, + "loss": 0.3418, + "num_input_tokens_seen": 9902080000, + "step": 96700 + }, + { + "epoch": 0.1, + "grad_norm": 0.08286820199404239, + "learning_rate": 9.123232323232324e-05, + "loss": 0.3435, + "num_input_tokens_seen": 9912320000, + "step": 96800 + }, + { + "epoch": 0.1, + "grad_norm": 0.07276179763700379, + "learning_rate": 9.122222222222223e-05, + "loss": 0.3421, + "num_input_tokens_seen": 9922560000, + "step": 96900 + }, + { + "epoch": 0.1, + "grad_norm": 0.07187842044777065, + "learning_rate": 9.121212121212121e-05, + "loss": 0.3454, + "num_input_tokens_seen": 9932800000, + "step": 97000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3392788618099046, + "eval_average_loss_on_sentence_tokens": 0.34315975017638173, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.33952149748802185, + "eval_non_padding_tokens_in_labels": 133.53485, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3715, + "eval_padding_tokens_in_labels": 378.46515, + "eval_reconstruction_accuracy": 0.9291917673840125, + "eval_runtime": 197.0705, + "eval_samples_per_second": 25.372, + "eval_sentence_accuracy": 0.7787427997200639, + "eval_steps_per_second": 0.066, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 9932800000, + "step": 97000 + }, + { + "epoch": 0.1, + "grad_norm": 0.06355399122747715, + "learning_rate": 9.120202020202022e-05, + "loss": 0.3401, + "num_input_tokens_seen": 9943040000, + "step": 97100 + }, + { + "epoch": 0.1, + "grad_norm": 0.06033280429857321, + "learning_rate": 9.11919191919192e-05, + "loss": 0.3403, + "num_input_tokens_seen": 9953280000, + "step": 97200 + }, + { + "epoch": 0.1, + "grad_norm": 0.07794157937747148, + "learning_rate": 9.118181818181819e-05, + "loss": 0.3423, + "num_input_tokens_seen": 9963520000, + "step": 97300 + }, + { + "epoch": 0.1, + "grad_norm": 0.08660291311082366, + "learning_rate": 9.117171717171718e-05, + "loss": 0.3449, + "num_input_tokens_seen": 9973760000, + "step": 97400 + }, + { + "epoch": 0.1, + "grad_norm": 0.07736900343579574, + "learning_rate": 9.116161616161617e-05, + "loss": 0.3404, + "num_input_tokens_seen": 9984000000, + "step": 97500 + }, + { + "epoch": 0.1, + "grad_norm": 0.10631789227941309, + "learning_rate": 9.115151515151515e-05, + "loss": 0.3445, + "num_input_tokens_seen": 9994240000, + "step": 97600 + }, + { + "epoch": 0.1, + "grad_norm": 0.10787327166252422, + "learning_rate": 9.114141414141416e-05, + "loss": 0.3398, + "num_input_tokens_seen": 10004480000, + "step": 97700 + }, + { + "epoch": 0.1, + "grad_norm": 0.06695669133259347, + "learning_rate": 9.113131313131313e-05, + "loss": 0.3427, + "num_input_tokens_seen": 10014720000, + "step": 97800 + }, + { + "epoch": 0.1, + "grad_norm": 0.07934106813244533, + "learning_rate": 9.112121212121213e-05, + "loss": 0.3384, + "num_input_tokens_seen": 10024960000, + "step": 97900 + }, + { + "epoch": 0.1, + "grad_norm": 0.0899581556536075, + "learning_rate": 9.111111111111112e-05, + "loss": 0.3436, + "num_input_tokens_seen": 10035200000, + "step": 98000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3390487760300781, + "eval_average_loss_on_sentence_tokens": 0.3555469483796687, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3397460877895355, + "eval_non_padding_tokens_in_labels": 133.5039, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37155, + "eval_padding_tokens_in_labels": 378.4961, + "eval_reconstruction_accuracy": 0.9293002796844652, + "eval_runtime": 200.2917, + "eval_samples_per_second": 24.964, + "eval_sentence_accuracy": 0.7658944497281389, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 10035200000, + "step": 98000 + }, + { + "epoch": 0.1, + "grad_norm": 0.06715591382663223, + "learning_rate": 9.110101010101011e-05, + "loss": 0.3415, + "num_input_tokens_seen": 10045440000, + "step": 98100 + }, + { + "epoch": 0.1, + "grad_norm": 0.0618517757911758, + "learning_rate": 9.109090909090909e-05, + "loss": 0.3402, + "num_input_tokens_seen": 10055680000, + "step": 98200 + }, + { + "epoch": 0.1, + "grad_norm": 0.09946163173884502, + "learning_rate": 9.10808080808081e-05, + "loss": 0.342, + "num_input_tokens_seen": 10065920000, + "step": 98300 + }, + { + "epoch": 0.1, + "grad_norm": 0.07022838561422128, + "learning_rate": 9.107070707070707e-05, + "loss": 0.3425, + "num_input_tokens_seen": 10076160000, + "step": 98400 + }, + { + "epoch": 0.1, + "grad_norm": 0.0650535445299592, + "learning_rate": 9.106060606060606e-05, + "loss": 0.3427, + "num_input_tokens_seen": 10086400000, + "step": 98500 + }, + { + "epoch": 0.1, + "grad_norm": 0.06829777735256078, + "learning_rate": 9.105050505050506e-05, + "loss": 0.3425, + "num_input_tokens_seen": 10096640000, + "step": 98600 + }, + { + "epoch": 0.1, + "grad_norm": 0.07495210845809419, + "learning_rate": 9.104040404040405e-05, + "loss": 0.343, + "num_input_tokens_seen": 10106880000, + "step": 98700 + }, + { + "epoch": 0.1, + "grad_norm": 0.08365289008803607, + "learning_rate": 9.103030303030303e-05, + "loss": 0.3454, + "num_input_tokens_seen": 10117120000, + "step": 98800 + }, + { + "epoch": 0.1, + "grad_norm": 0.06231042490225823, + "learning_rate": 9.102020202020203e-05, + "loss": 0.34, + "num_input_tokens_seen": 10127360000, + "step": 98900 + }, + { + "epoch": 0.1, + "grad_norm": 0.06694964588733918, + "learning_rate": 9.101010101010101e-05, + "loss": 0.3434, + "num_input_tokens_seen": 10137600000, + "step": 99000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3383218240541293, + "eval_average_loss_on_sentence_tokens": 0.3297027643237113, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.33792969584465027, + "eval_non_padding_tokens_in_labels": 133.5152, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38375, + "eval_padding_tokens_in_labels": 378.4848, + "eval_reconstruction_accuracy": 0.929413492420755, + "eval_runtime": 312.3715, + "eval_samples_per_second": 16.007, + "eval_sentence_accuracy": 0.7874818310693201, + "eval_steps_per_second": 0.042, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 10137600000, + "step": 99000 + }, + { + "epoch": 0.1, + "grad_norm": 0.07383460623181061, + "learning_rate": 9.1e-05, + "loss": 0.3404, + "num_input_tokens_seen": 10147840000, + "step": 99100 + }, + { + "epoch": 0.1, + "grad_norm": 0.06037527809710852, + "learning_rate": 9.0989898989899e-05, + "loss": 0.3435, + "num_input_tokens_seen": 10158080000, + "step": 99200 + }, + { + "epoch": 0.1, + "grad_norm": 0.0639462507685346, + "learning_rate": 9.097979797979799e-05, + "loss": 0.3405, + "num_input_tokens_seen": 10168320000, + "step": 99300 + }, + { + "epoch": 0.1, + "grad_norm": 0.06663204821035237, + "learning_rate": 9.096969696969697e-05, + "loss": 0.3433, + "num_input_tokens_seen": 10178560000, + "step": 99400 + }, + { + "epoch": 0.1, + "grad_norm": 0.10650133487305309, + "learning_rate": 9.095959595959597e-05, + "loss": 0.342, + "num_input_tokens_seen": 10188800000, + "step": 99500 + }, + { + "epoch": 0.1, + "grad_norm": 0.06757375711316102, + "learning_rate": 9.094949494949495e-05, + "loss": 0.3419, + "num_input_tokens_seen": 10199040000, + "step": 99600 + }, + { + "epoch": 0.1, + "grad_norm": 0.06517216012776189, + "learning_rate": 9.093939393939394e-05, + "loss": 0.3414, + "num_input_tokens_seen": 10209280000, + "step": 99700 + }, + { + "epoch": 0.1, + "grad_norm": 0.07075213536936033, + "learning_rate": 9.092929292929293e-05, + "loss": 0.3429, + "num_input_tokens_seen": 10219520000, + "step": 99800 + }, + { + "epoch": 0.1, + "grad_norm": 0.06970044817078326, + "learning_rate": 9.091919191919193e-05, + "loss": 0.3425, + "num_input_tokens_seen": 10229760000, + "step": 99900 + }, + { + "epoch": 0.1, + "grad_norm": 0.0903645044673645, + "learning_rate": 9.090909090909092e-05, + "loss": 0.3426, + "num_input_tokens_seen": 10240000000, + "step": 100000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3394053367921645, + "eval_average_loss_on_sentence_tokens": 0.34577318229430426, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.3396972715854645, + "eval_non_padding_tokens_in_labels": 133.56865, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38695, + "eval_padding_tokens_in_labels": 378.43135, + "eval_reconstruction_accuracy": 0.9292258822866984, + "eval_runtime": 801.8841, + "eval_samples_per_second": 6.235, + "eval_sentence_accuracy": 0.7754230444847201, + "eval_steps_per_second": 0.016, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 10240000000, + "step": 100000 + }, + { + "epoch": 0.1, + "grad_norm": 0.06043317910163659, + "learning_rate": 9.089898989898991e-05, + "loss": 0.3436, + "num_input_tokens_seen": 10250240000, + "step": 100100 + }, + { + "epoch": 0.1, + "grad_norm": 0.09848188665433825, + "learning_rate": 9.088888888888889e-05, + "loss": 0.3423, + "num_input_tokens_seen": 10260480000, + "step": 100200 + }, + { + "epoch": 0.1, + "grad_norm": 0.07824345935934307, + "learning_rate": 9.087878787878788e-05, + "loss": 0.341, + "num_input_tokens_seen": 10270720000, + "step": 100300 + }, + { + "epoch": 0.1, + "grad_norm": 0.06033577284227985, + "learning_rate": 9.086868686868687e-05, + "loss": 0.3406, + "num_input_tokens_seen": 10280960000, + "step": 100400 + }, + { + "epoch": 0.1, + "grad_norm": 0.081706256187178, + "learning_rate": 9.085858585858586e-05, + "loss": 0.3425, + "num_input_tokens_seen": 10291200000, + "step": 100500 + }, + { + "epoch": 0.1, + "grad_norm": 0.11023755851963682, + "learning_rate": 9.084848484848486e-05, + "loss": 0.3395, + "num_input_tokens_seen": 10301440000, + "step": 100600 + }, + { + "epoch": 0.1, + "grad_norm": 0.10385499511093071, + "learning_rate": 9.083838383838385e-05, + "loss": 0.3423, + "num_input_tokens_seen": 10311680000, + "step": 100700 + }, + { + "epoch": 0.1, + "grad_norm": 0.05989258781568622, + "learning_rate": 9.082828282828283e-05, + "loss": 0.3457, + "num_input_tokens_seen": 10321920000, + "step": 100800 + }, + { + "epoch": 0.1, + "grad_norm": 0.061864799010228476, + "learning_rate": 9.081818181818182e-05, + "loss": 0.3448, + "num_input_tokens_seen": 10332160000, + "step": 100900 + }, + { + "epoch": 0.1, + "grad_norm": 0.06783028446027616, + "learning_rate": 9.080808080808081e-05, + "loss": 0.3411, + "num_input_tokens_seen": 10342400000, + "step": 101000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.33883449454081455, + "eval_average_loss_on_sentence_tokens": 0.35126188006691683, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.33933594822883606, + "eval_non_padding_tokens_in_labels": 133.4799, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3862, + "eval_padding_tokens_in_labels": 378.5201, + "eval_reconstruction_accuracy": 0.929386288234439, + "eval_runtime": 570.7189, + "eval_samples_per_second": 8.761, + "eval_sentence_accuracy": 0.7663879268577171, + "eval_steps_per_second": 0.023, + "eval_variance_shuffling_prob": 0.24959999999999996, + "num_input_tokens_seen": 10342400000, + "step": 101000 + }, + { + "epoch": 0.1, + "grad_norm": 0.07924938664975544, + "learning_rate": 9.07979797979798e-05, + "loss": 0.346, + "num_input_tokens_seen": 10352640000, + "step": 101100 + }, + { + "epoch": 0.1, + "grad_norm": 0.07811861012076451, + "learning_rate": 9.07878787878788e-05, + "loss": 0.3405, + "num_input_tokens_seen": 10362880000, + "step": 101200 + }, + { + "epoch": 0.1, + "grad_norm": 0.12556437771538626, + "learning_rate": 9.077777777777779e-05, + "loss": 0.3399, + "num_input_tokens_seen": 10373120000, + "step": 101300 + }, + { + "epoch": 0.1, + "grad_norm": 0.11123979770500014, + "learning_rate": 9.076767676767676e-05, + "loss": 0.3411, + "num_input_tokens_seen": 10383360000, + "step": 101400 + }, + { + "epoch": 0.1, + "grad_norm": 0.09779336125466684, + "learning_rate": 9.075757575757577e-05, + "loss": 0.3431, + "num_input_tokens_seen": 10393600000, + "step": 101500 + }, + { + "epoch": 0.1, + "grad_norm": 0.06531009758490405, + "learning_rate": 9.074747474747475e-05, + "loss": 0.3439, + "num_input_tokens_seen": 10403840000, + "step": 101600 + }, + { + "epoch": 0.1, + "grad_norm": 0.08914353428814671, + "learning_rate": 9.073737373737374e-05, + "loss": 0.3404, + "num_input_tokens_seen": 10414080000, + "step": 101700 + }, + { + "epoch": 0.1, + "grad_norm": 0.06885771813151048, + "learning_rate": 9.072727272727273e-05, + "loss": 0.3421, + "num_input_tokens_seen": 10424320000, + "step": 101800 + }, + { + "epoch": 0.1, + "grad_norm": 0.0791212428300367, + "learning_rate": 9.071717171717172e-05, + "loss": 0.3418, + "num_input_tokens_seen": 10434560000, + "step": 101900 + }, + { + "epoch": 0.1, + "grad_norm": 0.08325757290171291, + "learning_rate": 9.07070707070707e-05, + "loss": 0.3431, + "num_input_tokens_seen": 10444800000, + "step": 102000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3385556747296517, + "eval_average_loss_on_sentence_tokens": 0.3144594738630412, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.33739256858825684, + "eval_non_padding_tokens_in_labels": 133.4723, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.355, + "eval_padding_tokens_in_labels": 378.5277, + "eval_reconstruction_accuracy": 0.9293650154238393, + "eval_runtime": 420.5008, + "eval_samples_per_second": 11.891, + "eval_sentence_accuracy": 0.7958081360920201, + "eval_steps_per_second": 0.031, + "eval_variance_shuffling_prob": 0.24750000000000008, + "num_input_tokens_seen": 10444800000, + "step": 102000 + }, + { + "epoch": 0.1, + "grad_norm": 0.07718182095010573, + "learning_rate": 9.069696969696971e-05, + "loss": 0.3413, + "num_input_tokens_seen": 10455040000, + "step": 102100 + }, + { + "epoch": 0.1, + "grad_norm": 0.09729116264469845, + "learning_rate": 9.068686868686869e-05, + "loss": 0.3432, + "num_input_tokens_seen": 10465280000, + "step": 102200 + }, + { + "epoch": 0.1, + "grad_norm": 0.12226753324780626, + "learning_rate": 9.067676767676768e-05, + "loss": 0.3391, + "num_input_tokens_seen": 10475520000, + "step": 102300 + }, + { + "epoch": 0.1, + "grad_norm": 0.13114274182253677, + "learning_rate": 9.066666666666667e-05, + "loss": 0.3406, + "num_input_tokens_seen": 10485760000, + "step": 102400 + }, + { + "epoch": 0.1, + "grad_norm": 0.09063311866247858, + "learning_rate": 9.065656565656566e-05, + "loss": 0.344, + "num_input_tokens_seen": 10496000000, + "step": 102500 + }, + { + "epoch": 0.1, + "grad_norm": 0.11025285842809815, + "learning_rate": 9.064646464646464e-05, + "loss": 0.3417, + "num_input_tokens_seen": 10506240000, + "step": 102600 + }, + { + "epoch": 0.1, + "grad_norm": 0.06895770415451717, + "learning_rate": 9.063636363636365e-05, + "loss": 0.3419, + "num_input_tokens_seen": 10516480000, + "step": 102700 + }, + { + "epoch": 0.1, + "grad_norm": 0.06736044408796647, + "learning_rate": 9.062626262626262e-05, + "loss": 0.3428, + "num_input_tokens_seen": 10526720000, + "step": 102800 + }, + { + "epoch": 0.1, + "grad_norm": 0.07537613683793364, + "learning_rate": 9.061616161616162e-05, + "loss": 0.3422, + "num_input_tokens_seen": 10536960000, + "step": 102900 + }, + { + "epoch": 0.1, + "grad_norm": 0.07411010061321405, + "learning_rate": 9.060606060606061e-05, + "loss": 0.3404, + "num_input_tokens_seen": 10547200000, + "step": 103000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.33876497910396713, + "eval_average_loss_on_sentence_tokens": 0.33645186780676356, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.33866211771965027, + "eval_non_padding_tokens_in_labels": 133.55725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38715, + "eval_padding_tokens_in_labels": 378.44275, + "eval_reconstruction_accuracy": 0.9292090978466572, + "eval_runtime": 561.5108, + "eval_samples_per_second": 8.905, + "eval_sentence_accuracy": 0.7863199167369498, + "eval_steps_per_second": 0.023, + "eval_variance_shuffling_prob": 0.2493749999999999, + "num_input_tokens_seen": 10547200000, + "step": 103000 + }, + { + "epoch": 0.1, + "grad_norm": 0.1005960107592556, + "learning_rate": 9.05959595959596e-05, + "loss": 0.3421, + "num_input_tokens_seen": 10557440000, + "step": 103100 + }, + { + "epoch": 0.1, + "grad_norm": 0.05722654895163887, + "learning_rate": 9.058585858585858e-05, + "loss": 0.3443, + "num_input_tokens_seen": 10567680000, + "step": 103200 + }, + { + "epoch": 0.1, + "grad_norm": 0.076016138502388, + "learning_rate": 9.057575757575758e-05, + "loss": 0.3406, + "num_input_tokens_seen": 10577920000, + "step": 103300 + }, + { + "epoch": 0.1, + "grad_norm": 0.06879949406937094, + "learning_rate": 9.056565656565656e-05, + "loss": 0.3397, + "num_input_tokens_seen": 10588160000, + "step": 103400 + }, + { + "epoch": 0.1, + "grad_norm": 0.06536770845759331, + "learning_rate": 9.055555555555556e-05, + "loss": 0.3413, + "num_input_tokens_seen": 10598400000, + "step": 103500 + }, + { + "epoch": 0.1, + "grad_norm": 0.069042955193525, + "learning_rate": 9.054545454545455e-05, + "loss": 0.3423, + "num_input_tokens_seen": 10608640000, + "step": 103600 + }, + { + "epoch": 0.1, + "grad_norm": 0.08636815903460973, + "learning_rate": 9.053535353535354e-05, + "loss": 0.3435, + "num_input_tokens_seen": 10618880000, + "step": 103700 + }, + { + "epoch": 0.1, + "grad_norm": 0.07437032981442936, + "learning_rate": 9.052525252525252e-05, + "loss": 0.344, + "num_input_tokens_seen": 10629120000, + "step": 103800 + }, + { + "epoch": 0.1, + "grad_norm": 0.08056558550792295, + "learning_rate": 9.051515151515152e-05, + "loss": 0.3441, + "num_input_tokens_seen": 10639360000, + "step": 103900 + }, + { + "epoch": 0.1, + "grad_norm": 0.06256905853180934, + "learning_rate": 9.050505050505052e-05, + "loss": 0.3429, + "num_input_tokens_seen": 10649600000, + "step": 104000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.3384332331333527, + "eval_average_loss_on_sentence_tokens": 0.32582473337142315, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.33790040016174316, + "eval_non_padding_tokens_in_labels": 133.54905, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39115, + "eval_padding_tokens_in_labels": 378.45095, + "eval_reconstruction_accuracy": 0.9293042902260796, + "eval_runtime": 764.1578, + "eval_samples_per_second": 6.543, + "eval_sentence_accuracy": 0.7940630215156028, + "eval_steps_per_second": 0.017, + "eval_variance_shuffling_prob": 0.24797499999999995, + "num_input_tokens_seen": 10649600000, + "step": 104000 + }, + { + "epoch": 0.1, + "grad_norm": 0.07919126298976419, + "learning_rate": 9.04949494949495e-05, + "loss": 0.341, + "num_input_tokens_seen": 10659840000, + "step": 104100 + }, + { + "epoch": 0.1, + "grad_norm": 0.07731389911015125, + "learning_rate": 9.04848484848485e-05, + "loss": 0.3413, + "num_input_tokens_seen": 10670080000, + "step": 104200 + }, + { + "epoch": 0.1, + "grad_norm": 0.10865658444760985, + "learning_rate": 9.047474747474748e-05, + "loss": 0.3432, + "num_input_tokens_seen": 10680320000, + "step": 104300 + }, + { + "epoch": 0.1, + "grad_norm": 0.06722476467796351, + "learning_rate": 9.046464646464647e-05, + "loss": 0.3405, + "num_input_tokens_seen": 10690560000, + "step": 104400 + }, + { + "epoch": 0.1, + "grad_norm": 0.07216737647581387, + "learning_rate": 9.045454545454546e-05, + "loss": 0.3415, + "num_input_tokens_seen": 10700800000, + "step": 104500 + }, + { + "epoch": 0.1, + "grad_norm": 0.10251855190955413, + "learning_rate": 9.044444444444445e-05, + "loss": 0.3455, + "num_input_tokens_seen": 10711040000, + "step": 104600 + }, + { + "epoch": 0.1, + "grad_norm": 0.09308902244522509, + "learning_rate": 9.043434343434343e-05, + "loss": 0.3424, + "num_input_tokens_seen": 10721280000, + "step": 104700 + }, + { + "epoch": 0.1, + "grad_norm": 0.07021402542900124, + "learning_rate": 9.042424242424244e-05, + "loss": 0.3411, + "num_input_tokens_seen": 10731520000, + "step": 104800 + }, + { + "epoch": 0.1, + "grad_norm": 0.06541618876917929, + "learning_rate": 9.041414141414142e-05, + "loss": 0.3415, + "num_input_tokens_seen": 10741760000, + "step": 104900 + }, + { + "epoch": 0.1, + "grad_norm": 0.06904270058759224, + "learning_rate": 9.040404040404041e-05, + "loss": 0.3435, + "num_input_tokens_seen": 10752000000, + "step": 105000 + }, + { + "epoch": 0.1, + "eval_average_loss_on_non_sentence_tokens": 0.33898531001703, + "eval_average_loss_on_sentence_tokens": 0.3671863164569564, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.3402343690395355, + "eval_non_padding_tokens_in_labels": 133.5332, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.378, + "eval_padding_tokens_in_labels": 378.4668, + "eval_reconstruction_accuracy": 0.9292269451313849, + "eval_runtime": 5969.5766, + "eval_samples_per_second": 0.838, + "eval_sentence_accuracy": 0.7582500403754016, + "eval_steps_per_second": 0.002, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 10752000000, + "step": 105000 + }, + { + "epoch": 0.11, + "grad_norm": 0.0853792558083362, + "learning_rate": 9.03939393939394e-05, + "loss": 0.3413, + "num_input_tokens_seen": 10762240000, + "step": 105100 + }, + { + "epoch": 0.11, + "grad_norm": 0.10124703275084021, + "learning_rate": 9.038383838383839e-05, + "loss": 0.3401, + "num_input_tokens_seen": 10772480000, + "step": 105200 + }, + { + "epoch": 0.11, + "grad_norm": 0.11639524718263276, + "learning_rate": 9.037373737373738e-05, + "loss": 0.3423, + "num_input_tokens_seen": 10782720000, + "step": 105300 + }, + { + "epoch": 0.11, + "grad_norm": 0.15081956616216763, + "learning_rate": 9.036363636363638e-05, + "loss": 0.3416, + "num_input_tokens_seen": 10792960000, + "step": 105400 + }, + { + "epoch": 0.11, + "grad_norm": 0.06321201031839657, + "learning_rate": 9.035353535353535e-05, + "loss": 0.3417, + "num_input_tokens_seen": 10803200000, + "step": 105500 + }, + { + "epoch": 0.11, + "grad_norm": 0.08055585878861952, + "learning_rate": 9.034343434343435e-05, + "loss": 0.3416, + "num_input_tokens_seen": 10813440000, + "step": 105600 + }, + { + "epoch": 0.11, + "grad_norm": 0.12794992781820977, + "learning_rate": 9.033333333333334e-05, + "loss": 0.3427, + "num_input_tokens_seen": 10823680000, + "step": 105700 + }, + { + "epoch": 0.11, + "grad_norm": 0.06404190753785807, + "learning_rate": 9.032323232323233e-05, + "loss": 0.3422, + "num_input_tokens_seen": 10833920000, + "step": 105800 + }, + { + "epoch": 0.11, + "grad_norm": 0.06024554761133125, + "learning_rate": 9.031313131313132e-05, + "loss": 0.3426, + "num_input_tokens_seen": 10844160000, + "step": 105900 + }, + { + "epoch": 0.11, + "grad_norm": 0.09828614698045404, + "learning_rate": 9.030303030303031e-05, + "loss": 0.3417, + "num_input_tokens_seen": 10854400000, + "step": 106000 + }, + { + "epoch": 0.11, + "eval_average_loss_on_non_sentence_tokens": 0.3387699037269536, + "eval_average_loss_on_sentence_tokens": 0.3704268573104275, + "eval_average_shuffling_prob": 0.54, + "eval_loss": 0.3401562571525574, + "eval_non_padding_tokens_in_labels": 133.5279, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38145, + "eval_padding_tokens_in_labels": 378.4721, + "eval_reconstruction_accuracy": 0.929278628876429, + "eval_runtime": 5153.8861, + "eval_samples_per_second": 0.97, + "eval_sentence_accuracy": 0.7557647101046171, + "eval_steps_per_second": 0.003, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 10854400000, + "step": 106000 + }, + { + "epoch": 0.0, + "grad_norm": 0.063909799063667, + "learning_rate": 9.029292929292929e-05, + "loss": 0.341, + "num_input_tokens_seen": 10864640000, + "step": 106100 + }, + { + "epoch": 0.0, + "grad_norm": 0.14548010690354884, + "learning_rate": 9.02828282828283e-05, + "loss": 0.3416, + "num_input_tokens_seen": 10874880000, + "step": 106200 + }, + { + "epoch": 0.0, + "grad_norm": 0.0638054634759126, + "learning_rate": 9.027272727272728e-05, + "loss": 0.3383, + "num_input_tokens_seen": 10885120000, + "step": 106300 + }, + { + "epoch": 0.0, + "grad_norm": 0.06008395574411723, + "learning_rate": 9.026262626262627e-05, + "loss": 0.34, + "num_input_tokens_seen": 10895360000, + "step": 106400 + }, + { + "epoch": 0.0, + "grad_norm": 0.14332590881883273, + "learning_rate": 9.025252525252526e-05, + "loss": 0.343, + "num_input_tokens_seen": 10905600000, + "step": 106500 + }, + { + "epoch": 0.0, + "grad_norm": 0.06262304071042607, + "learning_rate": 9.024242424242425e-05, + "loss": 0.342, + "num_input_tokens_seen": 10915840000, + "step": 106600 + }, + { + "epoch": 0.0, + "grad_norm": 0.12847645591691984, + "learning_rate": 9.023232323232323e-05, + "loss": 0.3435, + "num_input_tokens_seen": 10926080000, + "step": 106700 + }, + { + "epoch": 0.0, + "grad_norm": 0.11003928603263687, + "learning_rate": 9.022222222222224e-05, + "loss": 0.3433, + "num_input_tokens_seen": 10936320000, + "step": 106800 + }, + { + "epoch": 0.0, + "grad_norm": 0.06726141979478527, + "learning_rate": 9.021212121212121e-05, + "loss": 0.3418, + "num_input_tokens_seen": 10946560000, + "step": 106900 + }, + { + "epoch": 0.0, + "grad_norm": 0.15283580933797386, + "learning_rate": 9.02020202020202e-05, + "loss": 0.3392, + "num_input_tokens_seen": 10956800000, + "step": 107000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.33770590281410695, + "eval_average_loss_on_sentence_tokens": 0.3536209518770046, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3384082019329071, + "eval_non_padding_tokens_in_labels": 133.53755, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3767, + "eval_padding_tokens_in_labels": 378.46245, + "eval_reconstruction_accuracy": 0.929369877964153, + "eval_runtime": 287.3984, + "eval_samples_per_second": 17.397, + "eval_sentence_accuracy": 0.7703940639187468, + "eval_steps_per_second": 0.045, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 10956800000, + "step": 107000 + }, + { + "epoch": 0.0, + "grad_norm": 0.09040253053275077, + "learning_rate": 9.01919191919192e-05, + "loss": 0.3421, + "num_input_tokens_seen": 10967040000, + "step": 107100 + }, + { + "epoch": 0.0, + "grad_norm": 0.05738920329253304, + "learning_rate": 9.018181818181819e-05, + "loss": 0.3409, + "num_input_tokens_seen": 10977280000, + "step": 107200 + }, + { + "epoch": 0.0, + "grad_norm": 0.15031203315735725, + "learning_rate": 9.017171717171717e-05, + "loss": 0.3401, + "num_input_tokens_seen": 10987520000, + "step": 107300 + }, + { + "epoch": 0.0, + "grad_norm": 0.07537292120656336, + "learning_rate": 9.016161616161617e-05, + "loss": 0.3404, + "num_input_tokens_seen": 10997760000, + "step": 107400 + }, + { + "epoch": 0.0, + "grad_norm": 0.08183210615462119, + "learning_rate": 9.015151515151515e-05, + "loss": 0.3394, + "num_input_tokens_seen": 11008000000, + "step": 107500 + }, + { + "epoch": 0.0, + "grad_norm": 0.05605973298263772, + "learning_rate": 9.014141414141415e-05, + "loss": 0.3428, + "num_input_tokens_seen": 11018240000, + "step": 107600 + }, + { + "epoch": 0.0, + "grad_norm": 0.06293472519393142, + "learning_rate": 9.013131313131314e-05, + "loss": 0.3405, + "num_input_tokens_seen": 11028480000, + "step": 107700 + }, + { + "epoch": 0.0, + "grad_norm": 0.06738758761307022, + "learning_rate": 9.012121212121213e-05, + "loss": 0.3399, + "num_input_tokens_seen": 11038720000, + "step": 107800 + }, + { + "epoch": 0.0, + "grad_norm": 0.10669415155885137, + "learning_rate": 9.011111111111111e-05, + "loss": 0.3453, + "num_input_tokens_seen": 11048960000, + "step": 107900 + }, + { + "epoch": 0.0, + "grad_norm": 0.0710439698329203, + "learning_rate": 9.010101010101011e-05, + "loss": 0.3397, + "num_input_tokens_seen": 11059200000, + "step": 108000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.3382178786919567, + "eval_average_loss_on_sentence_tokens": 0.3505732609086148, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3388378918170929, + "eval_non_padding_tokens_in_labels": 133.5104, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36775, + "eval_padding_tokens_in_labels": 378.4896, + "eval_reconstruction_accuracy": 0.929342137196789, + "eval_runtime": 438.4799, + "eval_samples_per_second": 11.403, + "eval_sentence_accuracy": 0.7744091732912233, + "eval_steps_per_second": 0.03, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 11059200000, + "step": 108000 + }, + { + "epoch": 0.0, + "grad_norm": 0.06303804948698184, + "learning_rate": 9.009090909090909e-05, + "loss": 0.3398, + "num_input_tokens_seen": 11069440000, + "step": 108100 + }, + { + "epoch": 0.0, + "grad_norm": 0.0931404921167242, + "learning_rate": 9.008080808080808e-05, + "loss": 0.3413, + "num_input_tokens_seen": 11079680000, + "step": 108200 + }, + { + "epoch": 0.0, + "grad_norm": 0.08255479273677012, + "learning_rate": 9.007070707070708e-05, + "loss": 0.3404, + "num_input_tokens_seen": 11089920000, + "step": 108300 + }, + { + "epoch": 0.0, + "grad_norm": 0.11801160679689693, + "learning_rate": 9.006060606060607e-05, + "loss": 0.3411, + "num_input_tokens_seen": 11100160000, + "step": 108400 + }, + { + "epoch": 0.0, + "grad_norm": 0.07089739981330657, + "learning_rate": 9.005050505050505e-05, + "loss": 0.3404, + "num_input_tokens_seen": 11110400000, + "step": 108500 + }, + { + "epoch": 0.0, + "grad_norm": 0.08858686291703878, + "learning_rate": 9.004040404040405e-05, + "loss": 0.3429, + "num_input_tokens_seen": 11120640000, + "step": 108600 + }, + { + "epoch": 0.0, + "grad_norm": 0.08440314448253051, + "learning_rate": 9.003030303030303e-05, + "loss": 0.3437, + "num_input_tokens_seen": 11130880000, + "step": 108700 + }, + { + "epoch": 0.0, + "grad_norm": 0.05999816785563182, + "learning_rate": 9.002020202020202e-05, + "loss": 0.3437, + "num_input_tokens_seen": 11141120000, + "step": 108800 + }, + { + "epoch": 0.0, + "grad_norm": 0.07352950801409179, + "learning_rate": 9.001010101010101e-05, + "loss": 0.3437, + "num_input_tokens_seen": 11151360000, + "step": 108900 + }, + { + "epoch": 0.0, + "grad_norm": 0.08113408508735316, + "learning_rate": 9e-05, + "loss": 0.3426, + "num_input_tokens_seen": 11161600000, + "step": 109000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.336677059559612, + "eval_average_loss_on_sentence_tokens": 0.2989127229659832, + "eval_average_shuffling_prob": 0.42, + "eval_loss": 0.33497071266174316, + "eval_non_padding_tokens_in_labels": 133.53925, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40215, + "eval_padding_tokens_in_labels": 378.46075, + "eval_reconstruction_accuracy": 0.9295719247298978, + "eval_runtime": 319.107, + "eval_samples_per_second": 15.669, + "eval_sentence_accuracy": 0.8150851472356309, + "eval_steps_per_second": 0.041, + "eval_variance_shuffling_prob": 0.24360000000000004, + "num_input_tokens_seen": 11161600000, + "step": 109000 + }, + { + "epoch": 0.0, + "grad_norm": 0.06831371621775098, + "learning_rate": 8.998989898989898e-05, + "loss": 0.3402, + "num_input_tokens_seen": 11171840000, + "step": 109100 + }, + { + "epoch": 0.0, + "grad_norm": 0.09073202779756374, + "learning_rate": 8.997979797979799e-05, + "loss": 0.3416, + "num_input_tokens_seen": 11182080000, + "step": 109200 + }, + { + "epoch": 0.0, + "grad_norm": 0.07431778449085086, + "learning_rate": 8.996969696969697e-05, + "loss": 0.3402, + "num_input_tokens_seen": 11192320000, + "step": 109300 + }, + { + "epoch": 0.0, + "grad_norm": 0.12262530999625683, + "learning_rate": 8.995959595959596e-05, + "loss": 0.3436, + "num_input_tokens_seen": 11202560000, + "step": 109400 + }, + { + "epoch": 0.0, + "grad_norm": 0.07442749464150737, + "learning_rate": 8.994949494949495e-05, + "loss": 0.3397, + "num_input_tokens_seen": 11212800000, + "step": 109500 + }, + { + "epoch": 0.0, + "grad_norm": 0.08302669731064366, + "learning_rate": 8.993939393939394e-05, + "loss": 0.3418, + "num_input_tokens_seen": 11223040000, + "step": 109600 + }, + { + "epoch": 0.0, + "grad_norm": 0.10315797194695085, + "learning_rate": 8.992929292929294e-05, + "loss": 0.342, + "num_input_tokens_seen": 11233280000, + "step": 109700 + }, + { + "epoch": 0.0, + "grad_norm": 0.1169314507810068, + "learning_rate": 8.991919191919193e-05, + "loss": 0.3409, + "num_input_tokens_seen": 11243520000, + "step": 109800 + }, + { + "epoch": 0.0, + "grad_norm": 0.10079498927404842, + "learning_rate": 8.99090909090909e-05, + "loss": 0.3375, + "num_input_tokens_seen": 11253760000, + "step": 109900 + }, + { + "epoch": 0.0, + "grad_norm": 0.08332818001096717, + "learning_rate": 8.98989898989899e-05, + "loss": 0.3393, + "num_input_tokens_seen": 11264000000, + "step": 110000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.33803951407106064, + "eval_average_loss_on_sentence_tokens": 0.3318067995156536, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.33775389194488525, + "eval_non_padding_tokens_in_labels": 133.50065, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3706, + "eval_padding_tokens_in_labels": 378.49935, + "eval_reconstruction_accuracy": 0.9294623534825112, + "eval_runtime": 628.0427, + "eval_samples_per_second": 7.961, + "eval_sentence_accuracy": 0.7844805928903404, + "eval_steps_per_second": 0.021, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 11264000000, + "step": 110000 + }, + { + "epoch": 0.0, + "grad_norm": 0.0786517238805655, + "learning_rate": 8.988888888888889e-05, + "loss": 0.3427, + "num_input_tokens_seen": 11274240000, + "step": 110100 + }, + { + "epoch": 0.0, + "grad_norm": 0.16714360388372584, + "learning_rate": 8.987878787878788e-05, + "loss": 0.3393, + "num_input_tokens_seen": 11284480000, + "step": 110200 + }, + { + "epoch": 0.0, + "grad_norm": 0.07448359967535027, + "learning_rate": 8.986868686868687e-05, + "loss": 0.3434, + "num_input_tokens_seen": 11294720000, + "step": 110300 + }, + { + "epoch": 0.0, + "grad_norm": 0.07005474908742435, + "learning_rate": 8.985858585858587e-05, + "loss": 0.3423, + "num_input_tokens_seen": 11304960000, + "step": 110400 + }, + { + "epoch": 0.0, + "grad_norm": 0.08938709856137758, + "learning_rate": 8.984848484848484e-05, + "loss": 0.3391, + "num_input_tokens_seen": 11315200000, + "step": 110500 + }, + { + "epoch": 0.0, + "grad_norm": 0.07307407158455324, + "learning_rate": 8.983838383838385e-05, + "loss": 0.3425, + "num_input_tokens_seen": 11325440000, + "step": 110600 + }, + { + "epoch": 0.0, + "grad_norm": 0.06483787695301417, + "learning_rate": 8.982828282828283e-05, + "loss": 0.3407, + "num_input_tokens_seen": 11335680000, + "step": 110700 + }, + { + "epoch": 0.0, + "grad_norm": 0.06307012947483126, + "learning_rate": 8.981818181818182e-05, + "loss": 0.3409, + "num_input_tokens_seen": 11345920000, + "step": 110800 + }, + { + "epoch": 0.0, + "grad_norm": 0.07034997719451151, + "learning_rate": 8.980808080808081e-05, + "loss": 0.3415, + "num_input_tokens_seen": 11356160000, + "step": 110900 + }, + { + "epoch": 0.01, + "grad_norm": 0.14738134737295056, + "learning_rate": 8.97979797979798e-05, + "loss": 0.3418, + "num_input_tokens_seen": 11366400000, + "step": 111000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33706921529961154, + "eval_average_loss_on_sentence_tokens": 0.3673884656088091, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.3384082019329071, + "eval_non_padding_tokens_in_labels": 133.50335, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3911, + "eval_padding_tokens_in_labels": 378.49665, + "eval_reconstruction_accuracy": 0.9293769813914553, + "eval_runtime": 689.7015, + "eval_samples_per_second": 7.25, + "eval_sentence_accuracy": 0.7674915211656828, + "eval_steps_per_second": 0.019, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 11366400000, + "step": 111000 + }, + { + "epoch": 0.01, + "grad_norm": 0.07573604919384591, + "learning_rate": 8.978787878787878e-05, + "loss": 0.3409, + "num_input_tokens_seen": 11376640000, + "step": 111100 + }, + { + "epoch": 0.01, + "grad_norm": 0.1548835008757448, + "learning_rate": 8.977777777777779e-05, + "loss": 0.3443, + "num_input_tokens_seen": 11386880000, + "step": 111200 + }, + { + "epoch": 0.01, + "grad_norm": 0.1119601292192695, + "learning_rate": 8.976767676767677e-05, + "loss": 0.3427, + "num_input_tokens_seen": 11397120000, + "step": 111300 + }, + { + "epoch": 0.01, + "grad_norm": 0.07574429385151865, + "learning_rate": 8.975757575757576e-05, + "loss": 0.3426, + "num_input_tokens_seen": 11407360000, + "step": 111400 + }, + { + "epoch": 0.01, + "grad_norm": 0.10753881418412178, + "learning_rate": 8.974747474747475e-05, + "loss": 0.3395, + "num_input_tokens_seen": 11417600000, + "step": 111500 + }, + { + "epoch": 0.01, + "grad_norm": 0.06873658195598105, + "learning_rate": 8.973737373737374e-05, + "loss": 0.342, + "num_input_tokens_seen": 11427840000, + "step": 111600 + }, + { + "epoch": 0.01, + "grad_norm": 0.07533337100338248, + "learning_rate": 8.972727272727272e-05, + "loss": 0.3382, + "num_input_tokens_seen": 11438080000, + "step": 111700 + }, + { + "epoch": 0.01, + "grad_norm": 0.10582279126554642, + "learning_rate": 8.971717171717173e-05, + "loss": 0.3408, + "num_input_tokens_seen": 11448320000, + "step": 111800 + }, + { + "epoch": 0.01, + "grad_norm": 0.1086075576316126, + "learning_rate": 8.97070707070707e-05, + "loss": 0.3412, + "num_input_tokens_seen": 11458560000, + "step": 111900 + }, + { + "epoch": 0.01, + "grad_norm": 0.07149906450481573, + "learning_rate": 8.96969696969697e-05, + "loss": 0.3417, + "num_input_tokens_seen": 11468800000, + "step": 112000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33803457025736694, + "eval_average_loss_on_sentence_tokens": 0.36281741948119967, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.3391992151737213, + "eval_non_padding_tokens_in_labels": 133.5197, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37915, + "eval_padding_tokens_in_labels": 378.4803, + "eval_reconstruction_accuracy": 0.929420530121983, + "eval_runtime": 842.3724, + "eval_samples_per_second": 5.936, + "eval_sentence_accuracy": 0.763310424031439, + "eval_steps_per_second": 0.015, + "eval_variance_shuffling_prob": 0.2493749999999999, + "num_input_tokens_seen": 11468800000, + "step": 112000 + }, + { + "epoch": 0.01, + "grad_norm": 0.11484166994124977, + "learning_rate": 8.968686868686869e-05, + "loss": 0.339, + "num_input_tokens_seen": 11479040000, + "step": 112100 + }, + { + "epoch": 0.01, + "grad_norm": 0.07959132982984621, + "learning_rate": 8.967676767676768e-05, + "loss": 0.3401, + "num_input_tokens_seen": 11489280000, + "step": 112200 + }, + { + "epoch": 0.01, + "grad_norm": 0.08492074075825752, + "learning_rate": 8.966666666666666e-05, + "loss": 0.3391, + "num_input_tokens_seen": 11499520000, + "step": 112300 + }, + { + "epoch": 0.01, + "grad_norm": 0.1009522568357317, + "learning_rate": 8.965656565656567e-05, + "loss": 0.3405, + "num_input_tokens_seen": 11509760000, + "step": 112400 + }, + { + "epoch": 0.01, + "grad_norm": 0.0731212778269975, + "learning_rate": 8.964646464646466e-05, + "loss": 0.3428, + "num_input_tokens_seen": 11520000000, + "step": 112500 + }, + { + "epoch": 0.01, + "grad_norm": 0.06571330156961702, + "learning_rate": 8.963636363636364e-05, + "loss": 0.3388, + "num_input_tokens_seen": 11530240000, + "step": 112600 + }, + { + "epoch": 0.01, + "grad_norm": 0.059227480647391605, + "learning_rate": 8.962626262626264e-05, + "loss": 0.3406, + "num_input_tokens_seen": 11540480000, + "step": 112700 + }, + { + "epoch": 0.01, + "grad_norm": 0.057648836955746526, + "learning_rate": 8.961616161616162e-05, + "loss": 0.3393, + "num_input_tokens_seen": 11550720000, + "step": 112800 + }, + { + "epoch": 0.01, + "grad_norm": 0.06127264929046434, + "learning_rate": 8.960606060606061e-05, + "loss": 0.3379, + "num_input_tokens_seen": 11560960000, + "step": 112900 + }, + { + "epoch": 0.01, + "grad_norm": 0.07404114623356227, + "learning_rate": 8.95959595959596e-05, + "loss": 0.3407, + "num_input_tokens_seen": 11571200000, + "step": 113000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3378762644594762, + "eval_average_loss_on_sentence_tokens": 0.3627080939194834, + "eval_average_shuffling_prob": 0.525, + "eval_loss": 0.33903321623802185, + "eval_non_padding_tokens_in_labels": 133.5456, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38195, + "eval_padding_tokens_in_labels": 378.4544, + "eval_reconstruction_accuracy": 0.9294112725843835, + "eval_runtime": 927.7056, + "eval_samples_per_second": 5.39, + "eval_sentence_accuracy": 0.763916055054103, + "eval_steps_per_second": 0.014, + "eval_variance_shuffling_prob": 0.2493749999999999, + "num_input_tokens_seen": 11571200000, + "step": 113000 + }, + { + "epoch": 0.01, + "grad_norm": 0.09258344168224628, + "learning_rate": 8.95858585858586e-05, + "loss": 0.3409, + "num_input_tokens_seen": 11581440000, + "step": 113100 + }, + { + "epoch": 0.01, + "grad_norm": 0.1215550476985596, + "learning_rate": 8.957575757575757e-05, + "loss": 0.3424, + "num_input_tokens_seen": 11591680000, + "step": 113200 + }, + { + "epoch": 0.01, + "grad_norm": 0.08077237936415807, + "learning_rate": 8.956565656565658e-05, + "loss": 0.3399, + "num_input_tokens_seen": 11601920000, + "step": 113300 + }, + { + "epoch": 0.01, + "grad_norm": 0.0618948097258852, + "learning_rate": 8.955555555555556e-05, + "loss": 0.3421, + "num_input_tokens_seen": 11612160000, + "step": 113400 + }, + { + "epoch": 0.01, + "grad_norm": 0.06322264135417698, + "learning_rate": 8.954545454545455e-05, + "loss": 0.3387, + "num_input_tokens_seen": 11622400000, + "step": 113500 + }, + { + "epoch": 0.01, + "grad_norm": 0.10873867242767894, + "learning_rate": 8.953535353535354e-05, + "loss": 0.3396, + "num_input_tokens_seen": 11632640000, + "step": 113600 + }, + { + "epoch": 0.01, + "grad_norm": 0.07873702823716662, + "learning_rate": 8.952525252525253e-05, + "loss": 0.3399, + "num_input_tokens_seen": 11642880000, + "step": 113700 + }, + { + "epoch": 0.01, + "grad_norm": 0.21035231552060077, + "learning_rate": 8.951515151515151e-05, + "loss": 0.3427, + "num_input_tokens_seen": 11653120000, + "step": 113800 + }, + { + "epoch": 0.01, + "grad_norm": 0.06677321062242503, + "learning_rate": 8.950505050505052e-05, + "loss": 0.3421, + "num_input_tokens_seen": 11663360000, + "step": 113900 + }, + { + "epoch": 0.01, + "grad_norm": 0.060853971491185835, + "learning_rate": 8.94949494949495e-05, + "loss": 0.3416, + "num_input_tokens_seen": 11673600000, + "step": 114000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3371595528127877, + "eval_average_loss_on_sentence_tokens": 0.34684794082447096, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.33760741353034973, + "eval_non_padding_tokens_in_labels": 133.48725, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37425, + "eval_padding_tokens_in_labels": 378.51275, + "eval_reconstruction_accuracy": 0.9294558079369816, + "eval_runtime": 995.3394, + "eval_samples_per_second": 5.023, + "eval_sentence_accuracy": 0.7778500547310998, + "eval_steps_per_second": 0.013, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 11673600000, + "step": 114000 + }, + { + "epoch": 0.01, + "grad_norm": 0.10848959856995438, + "learning_rate": 8.948484848484849e-05, + "loss": 0.3423, + "num_input_tokens_seen": 11683840000, + "step": 114100 + }, + { + "epoch": 0.01, + "grad_norm": 0.10104824967258128, + "learning_rate": 8.947474747474748e-05, + "loss": 0.3403, + "num_input_tokens_seen": 11694080000, + "step": 114200 + }, + { + "epoch": 0.01, + "grad_norm": 0.06333983900515397, + "learning_rate": 8.946464646464647e-05, + "loss": 0.3408, + "num_input_tokens_seen": 11704320000, + "step": 114300 + }, + { + "epoch": 0.01, + "grad_norm": 0.0673393734911598, + "learning_rate": 8.945454545454546e-05, + "loss": 0.3413, + "num_input_tokens_seen": 11714560000, + "step": 114400 + }, + { + "epoch": 0.01, + "grad_norm": 0.06470467057226831, + "learning_rate": 8.944444444444446e-05, + "loss": 0.3399, + "num_input_tokens_seen": 11724800000, + "step": 114500 + }, + { + "epoch": 0.01, + "grad_norm": 0.05891359176278214, + "learning_rate": 8.943434343434343e-05, + "loss": 0.3402, + "num_input_tokens_seen": 11735040000, + "step": 114600 + }, + { + "epoch": 0.01, + "grad_norm": 0.07372948822733695, + "learning_rate": 8.942424242424243e-05, + "loss": 0.344, + "num_input_tokens_seen": 11745280000, + "step": 114700 + }, + { + "epoch": 0.01, + "grad_norm": 0.14367835887447922, + "learning_rate": 8.941414141414142e-05, + "loss": 0.3422, + "num_input_tokens_seen": 11755520000, + "step": 114800 + }, + { + "epoch": 0.01, + "grad_norm": 0.06702528655817255, + "learning_rate": 8.940404040404041e-05, + "loss": 0.3414, + "num_input_tokens_seen": 11765760000, + "step": 114900 + }, + { + "epoch": 0.01, + "grad_norm": 0.0737993016946481, + "learning_rate": 8.93939393939394e-05, + "loss": 0.3415, + "num_input_tokens_seen": 11776000000, + "step": 115000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3377356049456456, + "eval_average_loss_on_sentence_tokens": 0.3159026862504491, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.33674803376197815, + "eval_non_padding_tokens_in_labels": 133.5446, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38265, + "eval_padding_tokens_in_labels": 378.4554, + "eval_reconstruction_accuracy": 0.9293838617469636, + "eval_runtime": 986.1779, + "eval_samples_per_second": 5.07, + "eval_sentence_accuracy": 0.7997380085057513, + "eval_steps_per_second": 0.013, + "eval_variance_shuffling_prob": 0.24750000000000008, + "num_input_tokens_seen": 11776000000, + "step": 115000 + }, + { + "epoch": 0.01, + "grad_norm": 0.0886206009162089, + "learning_rate": 8.93838383838384e-05, + "loss": 0.3434, + "num_input_tokens_seen": 11786240000, + "step": 115100 + }, + { + "epoch": 0.01, + "grad_norm": 0.0567241240269402, + "learning_rate": 8.937373737373737e-05, + "loss": 0.3412, + "num_input_tokens_seen": 11796480000, + "step": 115200 + }, + { + "epoch": 0.01, + "grad_norm": 0.10841208019837266, + "learning_rate": 8.936363636363636e-05, + "loss": 0.3395, + "num_input_tokens_seen": 11806720000, + "step": 115300 + }, + { + "epoch": 0.01, + "grad_norm": 0.10718312608369451, + "learning_rate": 8.935353535353536e-05, + "loss": 0.3407, + "num_input_tokens_seen": 11816960000, + "step": 115400 + }, + { + "epoch": 0.01, + "grad_norm": 0.07895171282737952, + "learning_rate": 8.934343434343435e-05, + "loss": 0.3396, + "num_input_tokens_seen": 11827200000, + "step": 115500 + }, + { + "epoch": 0.01, + "grad_norm": 0.12466604448692362, + "learning_rate": 8.933333333333334e-05, + "loss": 0.3432, + "num_input_tokens_seen": 11837440000, + "step": 115600 + }, + { + "epoch": 0.01, + "grad_norm": 0.06045924467161078, + "learning_rate": 8.932323232323233e-05, + "loss": 0.3397, + "num_input_tokens_seen": 11847680000, + "step": 115700 + }, + { + "epoch": 0.01, + "grad_norm": 0.08890705602922427, + "learning_rate": 8.931313131313131e-05, + "loss": 0.3398, + "num_input_tokens_seen": 11857920000, + "step": 115800 + }, + { + "epoch": 0.01, + "grad_norm": 0.061796197744352487, + "learning_rate": 8.930303030303032e-05, + "loss": 0.3398, + "num_input_tokens_seen": 11868160000, + "step": 115900 + }, + { + "epoch": 0.01, + "grad_norm": 0.11175528150770513, + "learning_rate": 8.92929292929293e-05, + "loss": 0.3429, + "num_input_tokens_seen": 11878400000, + "step": 116000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33797440597648665, + "eval_average_loss_on_sentence_tokens": 0.34207410305021363, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.33815428614616394, + "eval_non_padding_tokens_in_labels": 133.5207, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3759, + "eval_padding_tokens_in_labels": 378.4793, + "eval_reconstruction_accuracy": 0.9293776627473688, + "eval_runtime": 804.0283, + "eval_samples_per_second": 6.219, + "eval_sentence_accuracy": 0.7842293681698279, + "eval_steps_per_second": 0.016, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 11878400000, + "step": 116000 + }, + { + "epoch": 0.01, + "grad_norm": 0.07628799720465715, + "learning_rate": 8.928282828282829e-05, + "loss": 0.3392, + "num_input_tokens_seen": 11888640000, + "step": 116100 + }, + { + "epoch": 0.01, + "grad_norm": 0.128539953929152, + "learning_rate": 8.927272727272728e-05, + "loss": 0.3416, + "num_input_tokens_seen": 11898880000, + "step": 116200 + }, + { + "epoch": 0.01, + "grad_norm": 0.08101616656472782, + "learning_rate": 8.926262626262627e-05, + "loss": 0.3393, + "num_input_tokens_seen": 11909120000, + "step": 116300 + }, + { + "epoch": 0.01, + "grad_norm": 0.0600318703860769, + "learning_rate": 8.925252525252525e-05, + "loss": 0.3384, + "num_input_tokens_seen": 11919360000, + "step": 116400 + }, + { + "epoch": 0.01, + "grad_norm": 0.05935466243156055, + "learning_rate": 8.924242424242426e-05, + "loss": 0.3418, + "num_input_tokens_seen": 11929600000, + "step": 116500 + }, + { + "epoch": 0.01, + "grad_norm": 0.07600442656110375, + "learning_rate": 8.923232323232323e-05, + "loss": 0.3413, + "num_input_tokens_seen": 11939840000, + "step": 116600 + }, + { + "epoch": 0.01, + "grad_norm": 0.07423660101290043, + "learning_rate": 8.922222222222223e-05, + "loss": 0.3392, + "num_input_tokens_seen": 11950080000, + "step": 116700 + }, + { + "epoch": 0.01, + "grad_norm": 0.16019722658570498, + "learning_rate": 8.921212121212122e-05, + "loss": 0.3406, + "num_input_tokens_seen": 11960320000, + "step": 116800 + }, + { + "epoch": 0.01, + "grad_norm": 0.1109211724146618, + "learning_rate": 8.920202020202021e-05, + "loss": 0.3388, + "num_input_tokens_seen": 11970560000, + "step": 116900 + }, + { + "epoch": 0.01, + "grad_norm": 0.06593373163159016, + "learning_rate": 8.919191919191919e-05, + "loss": 0.3416, + "num_input_tokens_seen": 11980800000, + "step": 117000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3371219346359078, + "eval_average_loss_on_sentence_tokens": 0.35255758917557234, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.3378222584724426, + "eval_non_padding_tokens_in_labels": 133.5395, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36925, + "eval_padding_tokens_in_labels": 378.4605, + "eval_reconstruction_accuracy": 0.9295201687347697, + "eval_runtime": 1109.9546, + "eval_samples_per_second": 4.505, + "eval_sentence_accuracy": 0.7748353580849499, + "eval_steps_per_second": 0.012, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 11980800000, + "step": 117000 + }, + { + "epoch": 0.01, + "grad_norm": 0.06516698330069705, + "learning_rate": 8.91818181818182e-05, + "loss": 0.339, + "num_input_tokens_seen": 11991040000, + "step": 117100 + }, + { + "epoch": 0.01, + "grad_norm": 0.065811104158645, + "learning_rate": 8.917171717171717e-05, + "loss": 0.3375, + "num_input_tokens_seen": 12001280000, + "step": 117200 + }, + { + "epoch": 0.01, + "grad_norm": 0.06517896279648053, + "learning_rate": 8.916161616161616e-05, + "loss": 0.343, + "num_input_tokens_seen": 12011520000, + "step": 117300 + }, + { + "epoch": 0.01, + "grad_norm": 0.11395791424144497, + "learning_rate": 8.915151515151516e-05, + "loss": 0.3415, + "num_input_tokens_seen": 12021760000, + "step": 117400 + }, + { + "epoch": 0.01, + "grad_norm": 0.07466661538083252, + "learning_rate": 8.914141414141415e-05, + "loss": 0.3401, + "num_input_tokens_seen": 12032000000, + "step": 117500 + }, + { + "epoch": 0.01, + "grad_norm": 0.07655155787322439, + "learning_rate": 8.913131313131313e-05, + "loss": 0.3395, + "num_input_tokens_seen": 12042240000, + "step": 117600 + }, + { + "epoch": 0.01, + "grad_norm": 0.05595132464125257, + "learning_rate": 8.912121212121213e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12052480000, + "step": 117700 + }, + { + "epoch": 0.01, + "grad_norm": 0.1570695988008706, + "learning_rate": 8.911111111111111e-05, + "loss": 0.3379, + "num_input_tokens_seen": 12062720000, + "step": 117800 + }, + { + "epoch": 0.01, + "grad_norm": 0.11909217122774927, + "learning_rate": 8.91010101010101e-05, + "loss": 0.3384, + "num_input_tokens_seen": 12072960000, + "step": 117900 + }, + { + "epoch": 0.01, + "grad_norm": 0.11156336396180667, + "learning_rate": 8.90909090909091e-05, + "loss": 0.3397, + "num_input_tokens_seen": 12083200000, + "step": 118000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33772991116722195, + "eval_average_loss_on_sentence_tokens": 0.3369161548350236, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.33768555521965027, + "eval_non_padding_tokens_in_labels": 133.5572, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3915, + "eval_padding_tokens_in_labels": 378.4428, + "eval_reconstruction_accuracy": 0.9294051991538073, + "eval_runtime": 1299.3038, + "eval_samples_per_second": 3.848, + "eval_sentence_accuracy": 0.7814793547113608, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 12083200000, + "step": 118000 + }, + { + "epoch": 0.01, + "grad_norm": 0.06323323009961251, + "learning_rate": 8.908080808080809e-05, + "loss": 0.3413, + "num_input_tokens_seen": 12093440000, + "step": 118100 + }, + { + "epoch": 0.01, + "grad_norm": 0.11212213232645667, + "learning_rate": 8.907070707070706e-05, + "loss": 0.3398, + "num_input_tokens_seen": 12103680000, + "step": 118200 + }, + { + "epoch": 0.01, + "grad_norm": 0.07937685575014869, + "learning_rate": 8.906060606060607e-05, + "loss": 0.3414, + "num_input_tokens_seen": 12113920000, + "step": 118300 + }, + { + "epoch": 0.01, + "grad_norm": 0.07751374882453868, + "learning_rate": 8.905050505050505e-05, + "loss": 0.3419, + "num_input_tokens_seen": 12124160000, + "step": 118400 + }, + { + "epoch": 0.01, + "grad_norm": 0.05731212039080017, + "learning_rate": 8.904040404040404e-05, + "loss": 0.3415, + "num_input_tokens_seen": 12134400000, + "step": 118500 + }, + { + "epoch": 0.01, + "grad_norm": 0.07019799067366743, + "learning_rate": 8.903030303030303e-05, + "loss": 0.3388, + "num_input_tokens_seen": 12144640000, + "step": 118600 + }, + { + "epoch": 0.01, + "grad_norm": 0.0723308980480847, + "learning_rate": 8.902020202020202e-05, + "loss": 0.3413, + "num_input_tokens_seen": 12154880000, + "step": 118700 + }, + { + "epoch": 0.01, + "grad_norm": 0.0773010268966247, + "learning_rate": 8.901010101010102e-05, + "loss": 0.3378, + "num_input_tokens_seen": 12165120000, + "step": 118800 + }, + { + "epoch": 0.01, + "grad_norm": 0.07887624952817884, + "learning_rate": 8.900000000000001e-05, + "loss": 0.3385, + "num_input_tokens_seen": 12175360000, + "step": 118900 + }, + { + "epoch": 0.01, + "grad_norm": 0.12148623158425347, + "learning_rate": 8.898989898989899e-05, + "loss": 0.3389, + "num_input_tokens_seen": 12185600000, + "step": 119000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33670362235648776, + "eval_average_loss_on_sentence_tokens": 0.32421332260975705, + "eval_average_shuffling_prob": 0.45, + "eval_loss": 0.33617186546325684, + "eval_non_padding_tokens_in_labels": 133.48685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3755, + "eval_padding_tokens_in_labels": 378.51315, + "eval_reconstruction_accuracy": 0.9294773004062343, + "eval_runtime": 1179.6356, + "eval_samples_per_second": 4.239, + "eval_sentence_accuracy": 0.7984504818131247, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.24750000000000008, + "num_input_tokens_seen": 12185600000, + "step": 119000 + }, + { + "epoch": 0.01, + "grad_norm": 0.09192703155776377, + "learning_rate": 8.897979797979798e-05, + "loss": 0.342, + "num_input_tokens_seen": 12195840000, + "step": 119100 + }, + { + "epoch": 0.01, + "grad_norm": 0.08887830339853593, + "learning_rate": 8.896969696969697e-05, + "loss": 0.3398, + "num_input_tokens_seen": 12206080000, + "step": 119200 + }, + { + "epoch": 0.01, + "grad_norm": 0.14033519831556548, + "learning_rate": 8.895959595959596e-05, + "loss": 0.3418, + "num_input_tokens_seen": 12216320000, + "step": 119300 + }, + { + "epoch": 0.01, + "grad_norm": 0.08789845935025163, + "learning_rate": 8.894949494949495e-05, + "loss": 0.342, + "num_input_tokens_seen": 12226560000, + "step": 119400 + }, + { + "epoch": 0.01, + "grad_norm": 0.07258552870236536, + "learning_rate": 8.893939393939395e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12236800000, + "step": 119500 + }, + { + "epoch": 0.01, + "grad_norm": 0.05727993531664272, + "learning_rate": 8.892929292929293e-05, + "loss": 0.3382, + "num_input_tokens_seen": 12247040000, + "step": 119600 + }, + { + "epoch": 0.01, + "grad_norm": 0.07292942981337314, + "learning_rate": 8.891919191919193e-05, + "loss": 0.3393, + "num_input_tokens_seen": 12257280000, + "step": 119700 + }, + { + "epoch": 0.01, + "grad_norm": 0.06976533370822229, + "learning_rate": 8.890909090909091e-05, + "loss": 0.3408, + "num_input_tokens_seen": 12267520000, + "step": 119800 + }, + { + "epoch": 0.01, + "grad_norm": 0.06315560719694033, + "learning_rate": 8.88989898989899e-05, + "loss": 0.3378, + "num_input_tokens_seen": 12277760000, + "step": 119900 + }, + { + "epoch": 0.01, + "grad_norm": 0.0586093365283434, + "learning_rate": 8.888888888888889e-05, + "loss": 0.3362, + "num_input_tokens_seen": 12288000000, + "step": 120000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.336514978374247, + "eval_average_loss_on_sentence_tokens": 0.3225603525795016, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.3359082043170929, + "eval_non_padding_tokens_in_labels": 133.5443, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3781, + "eval_padding_tokens_in_labels": 378.4557, + "eval_reconstruction_accuracy": 0.9295061677985021, + "eval_runtime": 1255.761, + "eval_samples_per_second": 3.982, + "eval_sentence_accuracy": 0.7976698907172466, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24797499999999995, + "num_input_tokens_seen": 12288000000, + "step": 120000 + }, + { + "epoch": 0.01, + "grad_norm": 0.07818123164671899, + "learning_rate": 8.887878787878789e-05, + "loss": 0.3377, + "num_input_tokens_seen": 12298240000, + "step": 120100 + }, + { + "epoch": 0.01, + "grad_norm": 0.0932476747134372, + "learning_rate": 8.886868686868686e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12308480000, + "step": 120200 + }, + { + "epoch": 0.01, + "grad_norm": 0.089493597861224, + "learning_rate": 8.885858585858587e-05, + "loss": 0.3405, + "num_input_tokens_seen": 12318720000, + "step": 120300 + }, + { + "epoch": 0.01, + "grad_norm": 0.06187421090774025, + "learning_rate": 8.884848484848485e-05, + "loss": 0.3378, + "num_input_tokens_seen": 12328960000, + "step": 120400 + }, + { + "epoch": 0.01, + "grad_norm": 0.08007348442587647, + "learning_rate": 8.883838383838384e-05, + "loss": 0.3419, + "num_input_tokens_seen": 12339200000, + "step": 120500 + }, + { + "epoch": 0.01, + "grad_norm": 0.07585638993574777, + "learning_rate": 8.882828282828283e-05, + "loss": 0.3438, + "num_input_tokens_seen": 12349440000, + "step": 120600 + }, + { + "epoch": 0.01, + "grad_norm": 0.08772452872403287, + "learning_rate": 8.881818181818182e-05, + "loss": 0.3431, + "num_input_tokens_seen": 12359680000, + "step": 120700 + }, + { + "epoch": 0.01, + "grad_norm": 0.11179932001493872, + "learning_rate": 8.88080808080808e-05, + "loss": 0.339, + "num_input_tokens_seen": 12369920000, + "step": 120800 + }, + { + "epoch": 0.01, + "grad_norm": 0.09293828422979349, + "learning_rate": 8.879797979797981e-05, + "loss": 0.3392, + "num_input_tokens_seen": 12380160000, + "step": 120900 + }, + { + "epoch": 0.01, + "grad_norm": 0.08960192986583577, + "learning_rate": 8.87878787878788e-05, + "loss": 0.3429, + "num_input_tokens_seen": 12390400000, + "step": 121000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33712448882492324, + "eval_average_loss_on_sentence_tokens": 0.28750952244917394, + "eval_average_shuffling_prob": 0.405, + "eval_loss": 0.33482420444488525, + "eval_non_padding_tokens_in_labels": 133.5274, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3758, + "eval_padding_tokens_in_labels": 378.4726, + "eval_reconstruction_accuracy": 0.9296222382178437, + "eval_runtime": 1278.978, + "eval_samples_per_second": 3.909, + "eval_sentence_accuracy": 0.8187772533960199, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24097500000000005, + "num_input_tokens_seen": 12390400000, + "step": 121000 + }, + { + "epoch": 0.02, + "grad_norm": 0.10993595474709408, + "learning_rate": 8.877777777777778e-05, + "loss": 0.343, + "num_input_tokens_seen": 12400640000, + "step": 121100 + }, + { + "epoch": 0.02, + "grad_norm": 0.07038574906798034, + "learning_rate": 8.876767676767678e-05, + "loss": 0.3403, + "num_input_tokens_seen": 12410880000, + "step": 121200 + }, + { + "epoch": 0.02, + "grad_norm": 0.06120862438070844, + "learning_rate": 8.875757575757576e-05, + "loss": 0.3389, + "num_input_tokens_seen": 12421120000, + "step": 121300 + }, + { + "epoch": 0.02, + "grad_norm": 0.06149748129764353, + "learning_rate": 8.874747474747475e-05, + "loss": 0.3404, + "num_input_tokens_seen": 12431360000, + "step": 121400 + }, + { + "epoch": 0.02, + "grad_norm": 0.10756448253029752, + "learning_rate": 8.873737373737375e-05, + "loss": 0.3399, + "num_input_tokens_seen": 12441600000, + "step": 121500 + }, + { + "epoch": 0.02, + "grad_norm": 0.14611684150065385, + "learning_rate": 8.872727272727274e-05, + "loss": 0.3412, + "num_input_tokens_seen": 12451840000, + "step": 121600 + }, + { + "epoch": 0.02, + "grad_norm": 0.07400589932697381, + "learning_rate": 8.871717171717172e-05, + "loss": 0.3428, + "num_input_tokens_seen": 12462080000, + "step": 121700 + }, + { + "epoch": 0.02, + "grad_norm": 0.06473387366260021, + "learning_rate": 8.870707070707072e-05, + "loss": 0.3375, + "num_input_tokens_seen": 12472320000, + "step": 121800 + }, + { + "epoch": 0.02, + "grad_norm": 0.11186764929706944, + "learning_rate": 8.86969696969697e-05, + "loss": 0.3395, + "num_input_tokens_seen": 12482560000, + "step": 121900 + }, + { + "epoch": 0.02, + "grad_norm": 0.09078204466493292, + "learning_rate": 8.868686868686869e-05, + "loss": 0.3424, + "num_input_tokens_seen": 12492800000, + "step": 122000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33667086850132577, + "eval_average_loss_on_sentence_tokens": 0.33607516296373247, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.33666014671325684, + "eval_non_padding_tokens_in_labels": 133.52685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3738, + "eval_padding_tokens_in_labels": 378.47315, + "eval_reconstruction_accuracy": 0.929530811401396, + "eval_runtime": 1420.7084, + "eval_samples_per_second": 3.519, + "eval_sentence_accuracy": 0.7862077628438638, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 12492800000, + "step": 122000 + }, + { + "epoch": 0.02, + "grad_norm": 0.09898018099097441, + "learning_rate": 8.867676767676768e-05, + "loss": 0.3401, + "num_input_tokens_seen": 12503040000, + "step": 122100 + }, + { + "epoch": 0.02, + "grad_norm": 0.06509830051060964, + "learning_rate": 8.866666666666668e-05, + "loss": 0.7166, + "num_input_tokens_seen": 12513280000, + "step": 122200 + }, + { + "epoch": 0.02, + "grad_norm": 0.06596074835424742, + "learning_rate": 8.865656565656565e-05, + "loss": 0.3518, + "num_input_tokens_seen": 12523520000, + "step": 122300 + }, + { + "epoch": 0.02, + "grad_norm": 0.16123073954475717, + "learning_rate": 8.864646464646466e-05, + "loss": 0.3488, + "num_input_tokens_seen": 12533760000, + "step": 122400 + }, + { + "epoch": 0.02, + "grad_norm": 0.06311566479893396, + "learning_rate": 8.863636363636364e-05, + "loss": 0.3444, + "num_input_tokens_seen": 12544000000, + "step": 122500 + }, + { + "epoch": 0.02, + "grad_norm": 0.08749753946438067, + "learning_rate": 8.862626262626263e-05, + "loss": 0.3431, + "num_input_tokens_seen": 12554240000, + "step": 122600 + }, + { + "epoch": 0.02, + "grad_norm": 0.08525149220722598, + "learning_rate": 8.861616161616162e-05, + "loss": 0.3418, + "num_input_tokens_seen": 12564480000, + "step": 122700 + }, + { + "epoch": 0.02, + "grad_norm": 0.07814150576275324, + "learning_rate": 8.860606060606061e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12574720000, + "step": 122800 + }, + { + "epoch": 0.02, + "grad_norm": 0.06407371475896176, + "learning_rate": 8.859595959595959e-05, + "loss": 0.345, + "num_input_tokens_seen": 12584960000, + "step": 122900 + }, + { + "epoch": 0.02, + "grad_norm": 0.06704945618797756, + "learning_rate": 8.85858585858586e-05, + "loss": 0.345, + "num_input_tokens_seen": 12595200000, + "step": 123000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33989533384307674, + "eval_average_loss_on_sentence_tokens": 0.33382758090835507, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.33961912989616394, + "eval_non_padding_tokens_in_labels": 133.5335, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36575, + "eval_padding_tokens_in_labels": 378.4665, + "eval_reconstruction_accuracy": 0.9290498616999376, + "eval_runtime": 1242.1035, + "eval_samples_per_second": 4.025, + "eval_sentence_accuracy": 0.7868133938665279, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 12595200000, + "step": 123000 + }, + { + "epoch": 0.02, + "grad_norm": 0.13216241748505614, + "learning_rate": 8.857575757575758e-05, + "loss": 0.3395, + "num_input_tokens_seen": 12605440000, + "step": 123100 + }, + { + "epoch": 0.02, + "grad_norm": 0.068127398618197, + "learning_rate": 8.856565656565657e-05, + "loss": 0.3428, + "num_input_tokens_seen": 12615680000, + "step": 123200 + }, + { + "epoch": 0.02, + "grad_norm": 0.059691698269341634, + "learning_rate": 8.855555555555556e-05, + "loss": 0.3413, + "num_input_tokens_seen": 12625920000, + "step": 123300 + }, + { + "epoch": 0.02, + "grad_norm": 0.05760965964877091, + "learning_rate": 8.854545454545455e-05, + "loss": 0.34, + "num_input_tokens_seen": 12636160000, + "step": 123400 + }, + { + "epoch": 0.02, + "grad_norm": 0.0805458129226341, + "learning_rate": 8.853535353535354e-05, + "loss": 0.3441, + "num_input_tokens_seen": 12646400000, + "step": 123500 + }, + { + "epoch": 0.02, + "grad_norm": 0.06957474821992347, + "learning_rate": 8.852525252525254e-05, + "loss": 0.344, + "num_input_tokens_seen": 12656640000, + "step": 123600 + }, + { + "epoch": 0.02, + "grad_norm": 0.130779047765728, + "learning_rate": 8.851515151515152e-05, + "loss": 0.3386, + "num_input_tokens_seen": 12666880000, + "step": 123700 + }, + { + "epoch": 0.02, + "grad_norm": 0.09070395891905292, + "learning_rate": 8.850505050505051e-05, + "loss": 0.3417, + "num_input_tokens_seen": 12677120000, + "step": 123800 + }, + { + "epoch": 0.02, + "grad_norm": 0.07656148312584304, + "learning_rate": 8.84949494949495e-05, + "loss": 0.3438, + "num_input_tokens_seen": 12687360000, + "step": 123900 + }, + { + "epoch": 0.02, + "grad_norm": 0.06678296447656376, + "learning_rate": 8.848484848484849e-05, + "loss": 0.3387, + "num_input_tokens_seen": 12697600000, + "step": 124000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3389473934516936, + "eval_average_loss_on_sentence_tokens": 0.34969405750477206, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.33952149748802185, + "eval_non_padding_tokens_in_labels": 133.50405, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3802, + "eval_padding_tokens_in_labels": 378.49595, + "eval_reconstruction_accuracy": 0.9293030962054372, + "eval_runtime": 1320.1585, + "eval_samples_per_second": 3.787, + "eval_sentence_accuracy": 0.7763023310065139, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 12697600000, + "step": 124000 + }, + { + "epoch": 0.02, + "grad_norm": 0.09339018656439618, + "learning_rate": 8.847474747474748e-05, + "loss": 0.3405, + "num_input_tokens_seen": 12707840000, + "step": 124100 + }, + { + "epoch": 0.02, + "grad_norm": 0.07846884757438072, + "learning_rate": 8.846464646464648e-05, + "loss": 0.3433, + "num_input_tokens_seen": 12718080000, + "step": 124200 + }, + { + "epoch": 0.02, + "grad_norm": 0.06785875832685692, + "learning_rate": 8.845454545454545e-05, + "loss": 0.3428, + "num_input_tokens_seen": 12728320000, + "step": 124300 + }, + { + "epoch": 0.02, + "grad_norm": 0.06154477793831758, + "learning_rate": 8.844444444444445e-05, + "loss": 0.3394, + "num_input_tokens_seen": 12738560000, + "step": 124400 + }, + { + "epoch": 0.02, + "grad_norm": 0.07250780847907222, + "learning_rate": 8.843434343434344e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12748800000, + "step": 124500 + }, + { + "epoch": 0.02, + "grad_norm": 0.10090940970730669, + "learning_rate": 8.842424242424243e-05, + "loss": 0.3409, + "num_input_tokens_seen": 12759040000, + "step": 124600 + }, + { + "epoch": 0.02, + "grad_norm": 0.10350183179116616, + "learning_rate": 8.841414141414142e-05, + "loss": 0.3408, + "num_input_tokens_seen": 12769280000, + "step": 124700 + }, + { + "epoch": 0.02, + "grad_norm": 0.06253452297505771, + "learning_rate": 8.840404040404041e-05, + "loss": 0.3365, + "num_input_tokens_seen": 12779520000, + "step": 124800 + }, + { + "epoch": 0.02, + "grad_norm": 0.08474897246340948, + "learning_rate": 8.839393939393939e-05, + "loss": 0.3401, + "num_input_tokens_seen": 12789760000, + "step": 124900 + }, + { + "epoch": 0.02, + "grad_norm": 0.12139144575224792, + "learning_rate": 8.83838383838384e-05, + "loss": 0.3409, + "num_input_tokens_seen": 12800000000, + "step": 125000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3381892428591942, + "eval_average_loss_on_sentence_tokens": 0.31827930831309814, + "eval_average_shuffling_prob": 0.445, + "eval_loss": 0.3372558653354645, + "eval_non_padding_tokens_in_labels": 133.55685, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38685, + "eval_padding_tokens_in_labels": 378.44315, + "eval_reconstruction_accuracy": 0.9293103286475906, + "eval_runtime": 1217.6235, + "eval_samples_per_second": 4.106, + "eval_sentence_accuracy": 0.8008954366823982, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.24697499999999992, + "num_input_tokens_seen": 12800000000, + "step": 125000 + }, + { + "epoch": 0.02, + "grad_norm": 0.10465464708117905, + "learning_rate": 8.837373737373738e-05, + "loss": 0.3399, + "num_input_tokens_seen": 12810240000, + "step": 125100 + }, + { + "epoch": 0.02, + "grad_norm": 0.07957051926025585, + "learning_rate": 8.836363636363637e-05, + "loss": 0.3424, + "num_input_tokens_seen": 12820480000, + "step": 125200 + }, + { + "epoch": 0.02, + "grad_norm": 0.11338324723324321, + "learning_rate": 8.835353535353536e-05, + "loss": 0.3416, + "num_input_tokens_seen": 12830720000, + "step": 125300 + }, + { + "epoch": 0.02, + "grad_norm": 0.06918522411982228, + "learning_rate": 8.834343434343435e-05, + "loss": 0.3403, + "num_input_tokens_seen": 12840960000, + "step": 125400 + }, + { + "epoch": 0.02, + "grad_norm": 0.12900753088341513, + "learning_rate": 8.833333333333333e-05, + "loss": 0.3409, + "num_input_tokens_seen": 12851200000, + "step": 125500 + }, + { + "epoch": 0.02, + "grad_norm": 0.10635687934317502, + "learning_rate": 8.832323232323234e-05, + "loss": 0.3406, + "num_input_tokens_seen": 12861440000, + "step": 125600 + }, + { + "epoch": 0.02, + "grad_norm": 0.07086338695326674, + "learning_rate": 8.831313131313131e-05, + "loss": 0.3412, + "num_input_tokens_seen": 12871680000, + "step": 125700 + }, + { + "epoch": 0.02, + "grad_norm": 0.12956762971478383, + "learning_rate": 8.83030303030303e-05, + "loss": 0.3399, + "num_input_tokens_seen": 12881920000, + "step": 125800 + }, + { + "epoch": 0.02, + "grad_norm": 0.07980076218173981, + "learning_rate": 8.82929292929293e-05, + "loss": 0.3435, + "num_input_tokens_seen": 12892160000, + "step": 125900 + }, + { + "epoch": 0.02, + "grad_norm": 0.12427781892859804, + "learning_rate": 8.828282828282829e-05, + "loss": 0.3388, + "num_input_tokens_seen": 12902400000, + "step": 126000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.337802120848237, + "eval_average_loss_on_sentence_tokens": 0.35801994466158515, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3386914134025574, + "eval_non_padding_tokens_in_labels": 133.5447, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3695, + "eval_padding_tokens_in_labels": 378.4553, + "eval_reconstruction_accuracy": 0.9293716738541915, + "eval_runtime": 1365.4364, + "eval_samples_per_second": 3.662, + "eval_sentence_accuracy": 0.7684156692447108, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 12902400000, + "step": 126000 + }, + { + "epoch": 0.02, + "grad_norm": 0.057232448084813876, + "learning_rate": 8.827272727272727e-05, + "loss": 0.3436, + "num_input_tokens_seen": 12912640000, + "step": 126100 + }, + { + "epoch": 0.02, + "grad_norm": 0.062430473488576464, + "learning_rate": 8.826262626262627e-05, + "loss": 0.3402, + "num_input_tokens_seen": 12922880000, + "step": 126200 + }, + { + "epoch": 0.02, + "grad_norm": 0.09167723900093827, + "learning_rate": 8.825252525252525e-05, + "loss": 0.3412, + "num_input_tokens_seen": 12933120000, + "step": 126300 + }, + { + "epoch": 0.02, + "grad_norm": 0.11073694199883198, + "learning_rate": 8.824242424242424e-05, + "loss": 0.3424, + "num_input_tokens_seen": 12943360000, + "step": 126400 + }, + { + "epoch": 0.02, + "grad_norm": 0.07003764233529604, + "learning_rate": 8.823232323232324e-05, + "loss": 0.3381, + "num_input_tokens_seen": 12953600000, + "step": 126500 + }, + { + "epoch": 0.02, + "grad_norm": 0.09629506610856232, + "learning_rate": 8.822222222222223e-05, + "loss": 0.3416, + "num_input_tokens_seen": 12963840000, + "step": 126600 + }, + { + "epoch": 0.02, + "grad_norm": 0.0813591378273354, + "learning_rate": 8.82121212121212e-05, + "loss": 0.3412, + "num_input_tokens_seen": 12974080000, + "step": 126700 + }, + { + "epoch": 0.02, + "grad_norm": 0.07515750745912969, + "learning_rate": 8.820202020202021e-05, + "loss": 0.3418, + "num_input_tokens_seen": 12984320000, + "step": 126800 + }, + { + "epoch": 0.02, + "grad_norm": 0.1866168891516927, + "learning_rate": 8.819191919191919e-05, + "loss": 0.3407, + "num_input_tokens_seen": 12994560000, + "step": 126900 + }, + { + "epoch": 0.02, + "grad_norm": 0.08646402617344381, + "learning_rate": 8.818181818181818e-05, + "loss": 0.3363, + "num_input_tokens_seen": 13004800000, + "step": 127000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33789545573324836, + "eval_average_loss_on_sentence_tokens": 0.35095872397016875, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.3384960889816284, + "eval_non_padding_tokens_in_labels": 133.546, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38885, + "eval_padding_tokens_in_labels": 378.454, + "eval_reconstruction_accuracy": 0.929383427162174, + "eval_runtime": 1176.3123, + "eval_samples_per_second": 4.251, + "eval_sentence_accuracy": 0.775432016796167, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 13004800000, + "step": 127000 + }, + { + "epoch": 0.02, + "grad_norm": 0.07562887921216335, + "learning_rate": 8.817171717171717e-05, + "loss": 0.3423, + "num_input_tokens_seen": 13015040000, + "step": 127100 + }, + { + "epoch": 0.02, + "grad_norm": 0.0884698221026495, + "learning_rate": 8.816161616161617e-05, + "loss": 0.3397, + "num_input_tokens_seen": 13025280000, + "step": 127200 + }, + { + "epoch": 0.02, + "grad_norm": 0.09387397516150893, + "learning_rate": 8.815151515151515e-05, + "loss": 0.3409, + "num_input_tokens_seen": 13035520000, + "step": 127300 + }, + { + "epoch": 0.02, + "grad_norm": 0.06444756165971814, + "learning_rate": 8.814141414141415e-05, + "loss": 0.3417, + "num_input_tokens_seen": 13045760000, + "step": 127400 + }, + { + "epoch": 0.02, + "grad_norm": 0.06301737268787702, + "learning_rate": 8.813131313131313e-05, + "loss": 0.339, + "num_input_tokens_seen": 13056000000, + "step": 127500 + }, + { + "epoch": 0.02, + "grad_norm": 0.09870613083045093, + "learning_rate": 8.812121212121212e-05, + "loss": 0.3404, + "num_input_tokens_seen": 13066240000, + "step": 127600 + }, + { + "epoch": 0.02, + "grad_norm": 0.06549426528720403, + "learning_rate": 8.811111111111111e-05, + "loss": 0.3425, + "num_input_tokens_seen": 13076480000, + "step": 127700 + }, + { + "epoch": 0.02, + "grad_norm": 0.07944432511824111, + "learning_rate": 8.81010101010101e-05, + "loss": 0.345, + "num_input_tokens_seen": 13086720000, + "step": 127800 + }, + { + "epoch": 0.02, + "grad_norm": 0.07796838977688461, + "learning_rate": 8.80909090909091e-05, + "loss": 0.3424, + "num_input_tokens_seen": 13096960000, + "step": 127900 + }, + { + "epoch": 0.02, + "grad_norm": 0.11001324908434182, + "learning_rate": 8.808080808080809e-05, + "loss": 0.3418, + "num_input_tokens_seen": 13107200000, + "step": 128000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3369085888464342, + "eval_average_loss_on_sentence_tokens": 0.3398847174970675, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.3370312452316284, + "eval_non_padding_tokens_in_labels": 133.5102, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37555, + "eval_padding_tokens_in_labels": 378.4898, + "eval_reconstruction_accuracy": 0.9295105015743301, + "eval_runtime": 1380.2784, + "eval_samples_per_second": 3.622, + "eval_sentence_accuracy": 0.7838704757119529, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 13107200000, + "step": 128000 + }, + { + "epoch": 0.02, + "grad_norm": 0.06451328587368206, + "learning_rate": 8.807070707070707e-05, + "loss": 0.3385, + "num_input_tokens_seen": 13117440000, + "step": 128100 + }, + { + "epoch": 0.02, + "grad_norm": 0.08468209058695414, + "learning_rate": 8.806060606060606e-05, + "loss": 0.3398, + "num_input_tokens_seen": 13127680000, + "step": 128200 + }, + { + "epoch": 0.02, + "grad_norm": 0.09252241456296469, + "learning_rate": 8.805050505050505e-05, + "loss": 0.3411, + "num_input_tokens_seen": 13137920000, + "step": 128300 + }, + { + "epoch": 0.02, + "grad_norm": 0.06793046402583348, + "learning_rate": 8.804040404040404e-05, + "loss": 0.3425, + "num_input_tokens_seen": 13148160000, + "step": 128400 + }, + { + "epoch": 0.02, + "grad_norm": 0.0607042881278563, + "learning_rate": 8.803030303030304e-05, + "loss": 0.3446, + "num_input_tokens_seen": 13158400000, + "step": 128500 + }, + { + "epoch": 0.02, + "grad_norm": 0.07479556453840637, + "learning_rate": 8.802020202020203e-05, + "loss": 0.341, + "num_input_tokens_seen": 13168640000, + "step": 128600 + }, + { + "epoch": 0.02, + "grad_norm": 0.07625266980408234, + "learning_rate": 8.8010101010101e-05, + "loss": 0.3426, + "num_input_tokens_seen": 13178880000, + "step": 128700 + }, + { + "epoch": 0.02, + "grad_norm": 0.10507747027749789, + "learning_rate": 8.800000000000001e-05, + "loss": 0.3393, + "num_input_tokens_seen": 13189120000, + "step": 128800 + }, + { + "epoch": 0.02, + "grad_norm": 0.10725071014989479, + "learning_rate": 8.798989898989899e-05, + "loss": 0.3417, + "num_input_tokens_seen": 13199360000, + "step": 128900 + }, + { + "epoch": 0.02, + "grad_norm": 0.06433559036190181, + "learning_rate": 8.797979797979798e-05, + "loss": 0.3401, + "num_input_tokens_seen": 13209600000, + "step": 129000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33744112374749413, + "eval_average_loss_on_sentence_tokens": 0.36672480230097, + "eval_average_shuffling_prob": 0.53, + "eval_loss": 0.3387402296066284, + "eval_non_padding_tokens_in_labels": 133.5205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3754, + "eval_padding_tokens_in_labels": 378.4795, + "eval_reconstruction_accuracy": 0.9294724776939053, + "eval_runtime": 1232.4501, + "eval_samples_per_second": 4.057, + "eval_sentence_accuracy": 0.7607936906705905, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 13209600000, + "step": 129000 + }, + { + "epoch": 0.02, + "grad_norm": 0.08071821389064575, + "learning_rate": 8.796969696969697e-05, + "loss": 0.3411, + "num_input_tokens_seen": 13219840000, + "step": 129100 + }, + { + "epoch": 0.02, + "grad_norm": 0.07309450041521094, + "learning_rate": 8.795959595959597e-05, + "loss": 0.3383, + "num_input_tokens_seen": 13230080000, + "step": 129200 + }, + { + "epoch": 0.02, + "grad_norm": 0.08170683607995557, + "learning_rate": 8.794949494949494e-05, + "loss": 0.3404, + "num_input_tokens_seen": 13240320000, + "step": 129300 + }, + { + "epoch": 0.02, + "grad_norm": 0.07543615222857206, + "learning_rate": 8.793939393939395e-05, + "loss": 0.3404, + "num_input_tokens_seen": 13250560000, + "step": 129400 + }, + { + "epoch": 0.02, + "grad_norm": 0.11157150393503572, + "learning_rate": 8.792929292929294e-05, + "loss": 0.3406, + "num_input_tokens_seen": 13260800000, + "step": 129500 + }, + { + "epoch": 0.02, + "grad_norm": 0.09018037024213676, + "learning_rate": 8.791919191919192e-05, + "loss": 0.34, + "num_input_tokens_seen": 13271040000, + "step": 129600 + }, + { + "epoch": 0.02, + "grad_norm": 0.1190160702177086, + "learning_rate": 8.790909090909091e-05, + "loss": 0.3419, + "num_input_tokens_seen": 13281280000, + "step": 129700 + }, + { + "epoch": 0.02, + "grad_norm": 0.05853013320337046, + "learning_rate": 8.78989898989899e-05, + "loss": 0.3386, + "num_input_tokens_seen": 13291520000, + "step": 129800 + }, + { + "epoch": 0.02, + "grad_norm": 0.06684316103652588, + "learning_rate": 8.78888888888889e-05, + "loss": 0.3386, + "num_input_tokens_seen": 13301760000, + "step": 129900 + }, + { + "epoch": 0.02, + "grad_norm": 0.08211968967827146, + "learning_rate": 8.787878787878789e-05, + "loss": 0.3361, + "num_input_tokens_seen": 13312000000, + "step": 130000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33692733412829173, + "eval_average_loss_on_sentence_tokens": 0.3307930779910229, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.3366113305091858, + "eval_non_padding_tokens_in_labels": 133.56185, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.40155, + "eval_padding_tokens_in_labels": 378.43815, + "eval_reconstruction_accuracy": 0.929521158308166, + "eval_runtime": 1347.2888, + "eval_samples_per_second": 3.711, + "eval_sentence_accuracy": 0.7894781703662498, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 13312000000, + "step": 130000 + }, + { + "epoch": 0.02, + "grad_norm": 0.1151962793144073, + "learning_rate": 8.786868686868688e-05, + "loss": 0.3417, + "num_input_tokens_seen": 13322240000, + "step": 130100 + }, + { + "epoch": 0.02, + "grad_norm": 0.09999249679758415, + "learning_rate": 8.785858585858586e-05, + "loss": 0.3399, + "num_input_tokens_seen": 13332480000, + "step": 130200 + }, + { + "epoch": 0.02, + "grad_norm": 0.10196728005198769, + "learning_rate": 8.784848484848486e-05, + "loss": 0.3401, + "num_input_tokens_seen": 13342720000, + "step": 130300 + }, + { + "epoch": 0.02, + "grad_norm": 0.08145243935998031, + "learning_rate": 8.783838383838384e-05, + "loss": 0.3398, + "num_input_tokens_seen": 13352960000, + "step": 130400 + }, + { + "epoch": 0.02, + "grad_norm": 0.07459893384493531, + "learning_rate": 8.782828282828283e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13363200000, + "step": 130500 + }, + { + "epoch": 0.02, + "grad_norm": 0.10507680331962413, + "learning_rate": 8.781818181818183e-05, + "loss": 0.3379, + "num_input_tokens_seen": 13373440000, + "step": 130600 + }, + { + "epoch": 0.02, + "grad_norm": 0.07845708185146334, + "learning_rate": 8.780808080808082e-05, + "loss": 0.3438, + "num_input_tokens_seen": 13383680000, + "step": 130700 + }, + { + "epoch": 0.02, + "grad_norm": 0.09059859715393667, + "learning_rate": 8.77979797979798e-05, + "loss": 0.3412, + "num_input_tokens_seen": 13393920000, + "step": 130800 + }, + { + "epoch": 0.02, + "grad_norm": 0.08646197961846365, + "learning_rate": 8.77878787878788e-05, + "loss": 0.3424, + "num_input_tokens_seen": 13404160000, + "step": 130900 + }, + { + "epoch": 0.03, + "grad_norm": 0.0954023373620258, + "learning_rate": 8.777777777777778e-05, + "loss": 0.3427, + "num_input_tokens_seen": 13414400000, + "step": 131000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33664499546509474, + "eval_average_loss_on_sentence_tokens": 0.32015826743365666, + "eval_average_shuffling_prob": 0.46, + "eval_loss": 0.33588868379592896, + "eval_non_padding_tokens_in_labels": 133.5374, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3768, + "eval_padding_tokens_in_labels": 378.4626, + "eval_reconstruction_accuracy": 0.9295504984201305, + "eval_runtime": 1166.2734, + "eval_samples_per_second": 4.287, + "eval_sentence_accuracy": 0.7955748559944013, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.24839999999999995, + "num_input_tokens_seen": 13414400000, + "step": 131000 + }, + { + "epoch": 0.03, + "grad_norm": 0.07029014636057686, + "learning_rate": 8.776767676767677e-05, + "loss": 0.3393, + "num_input_tokens_seen": 13424640000, + "step": 131100 + }, + { + "epoch": 0.03, + "grad_norm": 0.10606820233785931, + "learning_rate": 8.775757575757576e-05, + "loss": 0.3385, + "num_input_tokens_seen": 13434880000, + "step": 131200 + }, + { + "epoch": 0.03, + "grad_norm": 0.08256700380409304, + "learning_rate": 8.774747474747476e-05, + "loss": 0.3366, + "num_input_tokens_seen": 13445120000, + "step": 131300 + }, + { + "epoch": 0.03, + "grad_norm": 0.08375314551823226, + "learning_rate": 8.773737373737373e-05, + "loss": 0.3399, + "num_input_tokens_seen": 13455360000, + "step": 131400 + }, + { + "epoch": 0.03, + "grad_norm": 0.07288214403441488, + "learning_rate": 8.772727272727274e-05, + "loss": 0.343, + "num_input_tokens_seen": 13465600000, + "step": 131500 + }, + { + "epoch": 0.03, + "grad_norm": 0.057620567691380735, + "learning_rate": 8.771717171717172e-05, + "loss": 0.3426, + "num_input_tokens_seen": 13475840000, + "step": 131600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07036743567734156, + "learning_rate": 8.770707070707071e-05, + "loss": 0.3407, + "num_input_tokens_seen": 13486080000, + "step": 131700 + }, + { + "epoch": 0.03, + "grad_norm": 0.100359942890736, + "learning_rate": 8.76969696969697e-05, + "loss": 0.3401, + "num_input_tokens_seen": 13496320000, + "step": 131800 + }, + { + "epoch": 0.03, + "grad_norm": 0.07263558802810582, + "learning_rate": 8.76868686868687e-05, + "loss": 0.3419, + "num_input_tokens_seen": 13506560000, + "step": 131900 + }, + { + "epoch": 0.03, + "grad_norm": 0.11610625865800925, + "learning_rate": 8.767676767676767e-05, + "loss": 0.3388, + "num_input_tokens_seen": 13516800000, + "step": 132000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33677598360803285, + "eval_average_loss_on_sentence_tokens": 0.32032051427718117, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.3360644578933716, + "eval_non_padding_tokens_in_labels": 133.4979, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.383, + "eval_padding_tokens_in_labels": 378.5021, + "eval_reconstruction_accuracy": 0.929667463009016, + "eval_runtime": 1260.485, + "eval_samples_per_second": 3.967, + "eval_sentence_accuracy": 0.793726559836345, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 13516800000, + "step": 132000 + }, + { + "epoch": 0.03, + "grad_norm": 0.1061028584522693, + "learning_rate": 8.766666666666668e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13527040000, + "step": 132100 + }, + { + "epoch": 0.03, + "grad_norm": 0.08447214357959211, + "learning_rate": 8.765656565656566e-05, + "loss": 0.3401, + "num_input_tokens_seen": 13537280000, + "step": 132200 + }, + { + "epoch": 0.03, + "grad_norm": 0.07361871258037363, + "learning_rate": 8.764646464646465e-05, + "loss": 0.3391, + "num_input_tokens_seen": 13547520000, + "step": 132300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06524062597854284, + "learning_rate": 8.763636363636364e-05, + "loss": 0.3387, + "num_input_tokens_seen": 13557760000, + "step": 132400 + }, + { + "epoch": 0.03, + "grad_norm": 0.06562348478276682, + "learning_rate": 8.762626262626263e-05, + "loss": 0.3393, + "num_input_tokens_seen": 13568000000, + "step": 132500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06873905632580125, + "learning_rate": 8.761616161616161e-05, + "loss": 0.3413, + "num_input_tokens_seen": 13578240000, + "step": 132600 + }, + { + "epoch": 0.03, + "grad_norm": 0.08294645874995225, + "learning_rate": 8.760606060606062e-05, + "loss": 0.34, + "num_input_tokens_seen": 13588480000, + "step": 132700 + }, + { + "epoch": 0.03, + "grad_norm": 0.10673472122823945, + "learning_rate": 8.75959595959596e-05, + "loss": 0.3403, + "num_input_tokens_seen": 13598720000, + "step": 132800 + }, + { + "epoch": 0.03, + "grad_norm": 7.241942038383399, + "learning_rate": 8.758585858585859e-05, + "loss": 0.3407, + "num_input_tokens_seen": 13608960000, + "step": 132900 + }, + { + "epoch": 0.03, + "grad_norm": 0.08147738583335584, + "learning_rate": 8.757575757575758e-05, + "loss": 0.3917, + "num_input_tokens_seen": 13619200000, + "step": 133000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3390597577289854, + "eval_average_loss_on_sentence_tokens": 0.3594908857717296, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.3400195240974426, + "eval_non_padding_tokens_in_labels": 133.50575, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37145, + "eval_padding_tokens_in_labels": 378.49425, + "eval_reconstruction_accuracy": 0.9292768691451163, + "eval_runtime": 1224.7067, + "eval_samples_per_second": 4.083, + "eval_sentence_accuracy": 0.7677427458861952, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 13619200000, + "step": 133000 + }, + { + "epoch": 0.03, + "grad_norm": 0.0825509795808059, + "learning_rate": 8.756565656565657e-05, + "loss": 0.3437, + "num_input_tokens_seen": 13629440000, + "step": 133100 + }, + { + "epoch": 0.03, + "grad_norm": 0.08979349181886279, + "learning_rate": 8.755555555555556e-05, + "loss": 0.3431, + "num_input_tokens_seen": 13639680000, + "step": 133200 + }, + { + "epoch": 0.03, + "grad_norm": 0.061114686649264005, + "learning_rate": 8.754545454545456e-05, + "loss": 0.3387, + "num_input_tokens_seen": 13649920000, + "step": 133300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06234547765627765, + "learning_rate": 8.753535353535353e-05, + "loss": 0.3405, + "num_input_tokens_seen": 13660160000, + "step": 133400 + }, + { + "epoch": 0.03, + "grad_norm": 0.060728987489182984, + "learning_rate": 8.752525252525253e-05, + "loss": 0.3418, + "num_input_tokens_seen": 13670400000, + "step": 133500 + }, + { + "epoch": 0.03, + "grad_norm": 0.09797031920756641, + "learning_rate": 8.751515151515152e-05, + "loss": 0.3392, + "num_input_tokens_seen": 13680640000, + "step": 133600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07733905156632175, + "learning_rate": 8.750505050505051e-05, + "loss": 0.3415, + "num_input_tokens_seen": 13690880000, + "step": 133700 + }, + { + "epoch": 0.03, + "grad_norm": 0.061952258310834056, + "learning_rate": 8.74949494949495e-05, + "loss": 0.3381, + "num_input_tokens_seen": 13701120000, + "step": 133800 + }, + { + "epoch": 0.03, + "grad_norm": 0.08489324213117355, + "learning_rate": 8.74848484848485e-05, + "loss": 0.3375, + "num_input_tokens_seen": 13711360000, + "step": 133900 + }, + { + "epoch": 0.03, + "grad_norm": 0.12353131322198066, + "learning_rate": 8.747474747474747e-05, + "loss": 0.3409, + "num_input_tokens_seen": 13721600000, + "step": 134000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33663744504710014, + "eval_average_loss_on_sentence_tokens": 0.32486034868395425, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.3361132740974426, + "eval_non_padding_tokens_in_labels": 133.54255, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3902, + "eval_padding_tokens_in_labels": 378.45745, + "eval_reconstruction_accuracy": 0.9296750940586557, + "eval_runtime": 1471.5343, + "eval_samples_per_second": 3.398, + "eval_sentence_accuracy": 0.7917033036050748, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 13721600000, + "step": 134000 + }, + { + "epoch": 0.03, + "grad_norm": 0.06123874945217337, + "learning_rate": 8.746464646464648e-05, + "loss": 0.3378, + "num_input_tokens_seen": 13731840000, + "step": 134100 + }, + { + "epoch": 0.03, + "grad_norm": 0.06481649640431718, + "learning_rate": 8.745454545454546e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13742080000, + "step": 134200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06569143771795578, + "learning_rate": 8.744444444444445e-05, + "loss": 0.3413, + "num_input_tokens_seen": 13752320000, + "step": 134300 + }, + { + "epoch": 0.03, + "grad_norm": 0.07781635052276986, + "learning_rate": 8.743434343434344e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13762560000, + "step": 134400 + }, + { + "epoch": 0.03, + "grad_norm": 0.09895522704603117, + "learning_rate": 8.742424242424243e-05, + "loss": 0.3388, + "num_input_tokens_seen": 13772800000, + "step": 134500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08761339971116042, + "learning_rate": 8.741414141414141e-05, + "loss": 0.3407, + "num_input_tokens_seen": 13783040000, + "step": 134600 + }, + { + "epoch": 0.03, + "grad_norm": 0.10937438053568757, + "learning_rate": 8.740404040404042e-05, + "loss": 0.3409, + "num_input_tokens_seen": 13793280000, + "step": 134700 + }, + { + "epoch": 0.03, + "grad_norm": 0.06438064835129055, + "learning_rate": 8.73939393939394e-05, + "loss": 0.3407, + "num_input_tokens_seen": 13803520000, + "step": 134800 + }, + { + "epoch": 0.03, + "grad_norm": 0.06374653478979808, + "learning_rate": 8.738383838383839e-05, + "loss": 0.342, + "num_input_tokens_seen": 13813760000, + "step": 134900 + }, + { + "epoch": 0.03, + "grad_norm": 0.07925758750962557, + "learning_rate": 8.737373737373738e-05, + "loss": 0.3382, + "num_input_tokens_seen": 13824000000, + "step": 135000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3360619312445078, + "eval_average_loss_on_sentence_tokens": 0.3289877014322629, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.3357910215854645, + "eval_non_padding_tokens_in_labels": 133.4817, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3576, + "eval_padding_tokens_in_labels": 378.5183, + "eval_reconstruction_accuracy": 0.9296029828747068, + "eval_runtime": 1337.516, + "eval_samples_per_second": 3.738, + "eval_sentence_accuracy": 0.7901017460118076, + "eval_steps_per_second": 0.01, + "eval_variance_shuffling_prob": 0.2490999999999999, + "num_input_tokens_seen": 13824000000, + "step": 135000 + }, + { + "epoch": 0.03, + "grad_norm": 0.06010599611111365, + "learning_rate": 8.736363636363637e-05, + "loss": 0.338, + "num_input_tokens_seen": 13834240000, + "step": 135100 + }, + { + "epoch": 0.03, + "grad_norm": 0.0759407642505721, + "learning_rate": 8.735353535353535e-05, + "loss": 0.3387, + "num_input_tokens_seen": 13844480000, + "step": 135200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06879895598086301, + "learning_rate": 8.734343434343435e-05, + "loss": 0.3407, + "num_input_tokens_seen": 13854720000, + "step": 135300 + }, + { + "epoch": 0.03, + "grad_norm": 0.10714571741804844, + "learning_rate": 8.733333333333333e-05, + "loss": 0.3415, + "num_input_tokens_seen": 13864960000, + "step": 135400 + }, + { + "epoch": 0.03, + "grad_norm": 0.08006745360376591, + "learning_rate": 8.732323232323232e-05, + "loss": 0.3375, + "num_input_tokens_seen": 13875200000, + "step": 135500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06188859223888466, + "learning_rate": 8.731313131313132e-05, + "loss": 0.3372, + "num_input_tokens_seen": 13885440000, + "step": 135600 + }, + { + "epoch": 0.03, + "grad_norm": 0.06391938519757401, + "learning_rate": 8.730303030303031e-05, + "loss": 0.3382, + "num_input_tokens_seen": 13895680000, + "step": 135700 + }, + { + "epoch": 0.03, + "grad_norm": 0.0925061239651103, + "learning_rate": 8.729292929292929e-05, + "loss": 0.3413, + "num_input_tokens_seen": 13905920000, + "step": 135800 + }, + { + "epoch": 0.03, + "grad_norm": 0.058610268092412626, + "learning_rate": 8.728282828282829e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13916160000, + "step": 135900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09536518063749674, + "learning_rate": 8.727272727272727e-05, + "loss": 0.3408, + "num_input_tokens_seen": 13926400000, + "step": 136000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33662507097902783, + "eval_average_loss_on_sentence_tokens": 0.3308840079122896, + "eval_average_shuffling_prob": 0.475, + "eval_loss": 0.33637696504592896, + "eval_non_padding_tokens_in_labels": 133.54305, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39125, + "eval_padding_tokens_in_labels": 378.45695, + "eval_reconstruction_accuracy": 0.9296154000511546, + "eval_runtime": 1411.9003, + "eval_samples_per_second": 3.541, + "eval_sentence_accuracy": 0.7897607981768263, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.24937499999999996, + "num_input_tokens_seen": 13926400000, + "step": 136000 + }, + { + "epoch": 0.03, + "grad_norm": 0.07766804423033806, + "learning_rate": 8.726262626262626e-05, + "loss": 0.3413, + "num_input_tokens_seen": 13936640000, + "step": 136100 + }, + { + "epoch": 0.03, + "grad_norm": 0.09077710103002072, + "learning_rate": 8.725252525252526e-05, + "loss": 0.3415, + "num_input_tokens_seen": 13946880000, + "step": 136200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06346452305399815, + "learning_rate": 8.724242424242425e-05, + "loss": 0.339, + "num_input_tokens_seen": 13957120000, + "step": 136300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06115444732868756, + "learning_rate": 8.723232323232323e-05, + "loss": 0.3401, + "num_input_tokens_seen": 13967360000, + "step": 136400 + }, + { + "epoch": 0.03, + "grad_norm": 0.11081506855551367, + "learning_rate": 8.722222222222223e-05, + "loss": 0.341, + "num_input_tokens_seen": 13977600000, + "step": 136500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06782401633972554, + "learning_rate": 8.721212121212121e-05, + "loss": 0.3391, + "num_input_tokens_seen": 13987840000, + "step": 136600 + }, + { + "epoch": 0.03, + "grad_norm": 0.09502969983413058, + "learning_rate": 8.72020202020202e-05, + "loss": 0.3373, + "num_input_tokens_seen": 13998080000, + "step": 136700 + }, + { + "epoch": 0.03, + "grad_norm": 0.05960097364182768, + "learning_rate": 8.71919191919192e-05, + "loss": 0.3408, + "num_input_tokens_seen": 14008320000, + "step": 136800 + }, + { + "epoch": 0.03, + "grad_norm": 0.09754717815423249, + "learning_rate": 8.718181818181819e-05, + "loss": 0.3386, + "num_input_tokens_seen": 14018560000, + "step": 136900 + }, + { + "epoch": 0.03, + "grad_norm": 0.13681060906707143, + "learning_rate": 8.717171717171718e-05, + "loss": 0.3374, + "num_input_tokens_seen": 14028800000, + "step": 137000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33673071242838654, + "eval_average_loss_on_sentence_tokens": 0.3826634073920176, + "eval_average_shuffling_prob": 0.56, + "eval_loss": 0.33881837129592896, + "eval_non_padding_tokens_in_labels": 133.5425, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38635, + "eval_padding_tokens_in_labels": 378.4575, + "eval_reconstruction_accuracy": 0.9295607531702728, + "eval_runtime": 1220.7289, + "eval_samples_per_second": 4.096, + "eval_sentence_accuracy": 0.7504800186624078, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.2464, + "num_input_tokens_seen": 14028800000, + "step": 137000 + }, + { + "epoch": 0.03, + "grad_norm": 0.08704722308409567, + "learning_rate": 8.716161616161617e-05, + "loss": 0.3376, + "num_input_tokens_seen": 14039040000, + "step": 137100 + }, + { + "epoch": 0.03, + "grad_norm": 0.09039247526790758, + "learning_rate": 8.715151515151515e-05, + "loss": 0.3391, + "num_input_tokens_seen": 14049280000, + "step": 137200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06960166462453907, + "learning_rate": 8.714141414141414e-05, + "loss": 0.3347, + "num_input_tokens_seen": 14059520000, + "step": 137300 + }, + { + "epoch": 0.03, + "grad_norm": 0.05774067923346692, + "learning_rate": 8.713131313131313e-05, + "loss": 0.3378, + "num_input_tokens_seen": 14069760000, + "step": 137400 + }, + { + "epoch": 0.03, + "grad_norm": 0.0791541001605494, + "learning_rate": 8.712121212121212e-05, + "loss": 0.3376, + "num_input_tokens_seen": 14080000000, + "step": 137500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08254929998766823, + "learning_rate": 8.711111111111112e-05, + "loss": 0.3387, + "num_input_tokens_seen": 14090240000, + "step": 137600 + }, + { + "epoch": 0.03, + "grad_norm": 0.11893233477446417, + "learning_rate": 8.710101010101011e-05, + "loss": 0.3375, + "num_input_tokens_seen": 14100480000, + "step": 137700 + }, + { + "epoch": 0.03, + "grad_norm": 0.12139831588845078, + "learning_rate": 8.709090909090909e-05, + "loss": 0.3361, + "num_input_tokens_seen": 14110720000, + "step": 137800 + }, + { + "epoch": 0.03, + "grad_norm": 0.0602947395832044, + "learning_rate": 8.708080808080809e-05, + "loss": 0.3386, + "num_input_tokens_seen": 14120960000, + "step": 137900 + }, + { + "epoch": 0.03, + "grad_norm": 0.07093524086002691, + "learning_rate": 8.707070707070707e-05, + "loss": 0.3373, + "num_input_tokens_seen": 14131200000, + "step": 138000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3364096340737879, + "eval_average_loss_on_sentence_tokens": 0.36649167813688954, + "eval_average_shuffling_prob": 0.55, + "eval_loss": 0.33781251311302185, + "eval_non_padding_tokens_in_labels": 133.53475, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39085, + "eval_padding_tokens_in_labels": 378.46525, + "eval_reconstruction_accuracy": 0.9296770837191504, + "eval_runtime": 1155.4956, + "eval_samples_per_second": 4.327, + "eval_sentence_accuracy": 0.7607488291133562, + "eval_steps_per_second": 0.011, + "eval_variance_shuffling_prob": 0.24750000000000008, + "num_input_tokens_seen": 14131200000, + "step": 138000 + }, + { + "epoch": 0.03, + "grad_norm": 0.08127172407773285, + "learning_rate": 8.706060606060606e-05, + "loss": 0.3408, + "num_input_tokens_seen": 14141440000, + "step": 138100 + }, + { + "epoch": 0.03, + "grad_norm": 0.10689057830577556, + "learning_rate": 8.705050505050505e-05, + "loss": 0.3398, + "num_input_tokens_seen": 14151680000, + "step": 138200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06797894820623829, + "learning_rate": 8.704040404040405e-05, + "loss": 0.3389, + "num_input_tokens_seen": 14161920000, + "step": 138300 + }, + { + "epoch": 0.03, + "grad_norm": 0.07410644517174554, + "learning_rate": 8.703030303030304e-05, + "loss": 0.3394, + "num_input_tokens_seen": 14172160000, + "step": 138400 + }, + { + "epoch": 0.03, + "grad_norm": 0.05888979944866042, + "learning_rate": 8.702020202020203e-05, + "loss": 0.341, + "num_input_tokens_seen": 14182400000, + "step": 138500 + }, + { + "epoch": 0.03, + "grad_norm": 0.09720268060806131, + "learning_rate": 8.701010101010102e-05, + "loss": 0.3415, + "num_input_tokens_seen": 14192640000, + "step": 138600 + }, + { + "epoch": 0.03, + "grad_norm": 0.1014722872767488, + "learning_rate": 8.7e-05, + "loss": 0.3355, + "num_input_tokens_seen": 14202880000, + "step": 138700 + }, + { + "epoch": 0.03, + "grad_norm": 0.08589530038764662, + "learning_rate": 8.698989898989899e-05, + "loss": 0.3412, + "num_input_tokens_seen": 14213120000, + "step": 138800 + }, + { + "epoch": 0.03, + "grad_norm": 0.09106469346083766, + "learning_rate": 8.697979797979798e-05, + "loss": 0.3404, + "num_input_tokens_seen": 14223360000, + "step": 138900 + }, + { + "epoch": 0.03, + "grad_norm": 0.10960614312138936, + "learning_rate": 8.696969696969698e-05, + "loss": 0.3384, + "num_input_tokens_seen": 14233600000, + "step": 139000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3358540172756312, + "eval_average_loss_on_sentence_tokens": 0.3350537991325914, + "eval_average_shuffling_prob": 0.48, + "eval_loss": 0.33583009243011475, + "eval_non_padding_tokens_in_labels": 133.52795, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37115, + "eval_padding_tokens_in_labels": 378.47205, + "eval_reconstruction_accuracy": 0.9296140518318037, + "eval_runtime": 729.2259, + "eval_samples_per_second": 6.857, + "eval_sentence_accuracy": 0.7898684659141888, + "eval_steps_per_second": 0.018, + "eval_variance_shuffling_prob": 0.24960000000000004, + "num_input_tokens_seen": 14233600000, + "step": 139000 + }, + { + "epoch": 0.03, + "grad_norm": 0.07040521681095648, + "learning_rate": 8.695959595959597e-05, + "loss": 0.3388, + "num_input_tokens_seen": 14243840000, + "step": 139100 + }, + { + "epoch": 0.03, + "grad_norm": 0.12716050815858768, + "learning_rate": 8.694949494949496e-05, + "loss": 0.3394, + "num_input_tokens_seen": 14254080000, + "step": 139200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06509550533695556, + "learning_rate": 8.693939393939394e-05, + "loss": 0.343, + "num_input_tokens_seen": 14264320000, + "step": 139300 + }, + { + "epoch": 0.03, + "grad_norm": 0.0706501064494663, + "learning_rate": 8.692929292929294e-05, + "loss": 0.3398, + "num_input_tokens_seen": 14274560000, + "step": 139400 + }, + { + "epoch": 0.03, + "grad_norm": 0.08220689848773338, + "learning_rate": 8.691919191919192e-05, + "loss": 0.3382, + "num_input_tokens_seen": 14284800000, + "step": 139500 + }, + { + "epoch": 0.03, + "grad_norm": 0.07606501909291036, + "learning_rate": 8.690909090909091e-05, + "loss": 0.3388, + "num_input_tokens_seen": 14295040000, + "step": 139600 + }, + { + "epoch": 0.03, + "grad_norm": 0.059586131035865686, + "learning_rate": 8.68989898989899e-05, + "loss": 0.3414, + "num_input_tokens_seen": 14305280000, + "step": 139700 + }, + { + "epoch": 0.03, + "grad_norm": 0.12371511130704352, + "learning_rate": 8.68888888888889e-05, + "loss": 0.3383, + "num_input_tokens_seen": 14315520000, + "step": 139800 + }, + { + "epoch": 0.03, + "grad_norm": 0.05707851351477306, + "learning_rate": 8.687878787878788e-05, + "loss": 0.3415, + "num_input_tokens_seen": 14325760000, + "step": 139900 + }, + { + "epoch": 0.03, + "grad_norm": 0.08193942448234971, + "learning_rate": 8.686868686868688e-05, + "loss": 0.3384, + "num_input_tokens_seen": 14336000000, + "step": 140000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3368586357662047, + "eval_average_loss_on_sentence_tokens": 0.3712845202308761, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.3384374976158142, + "eval_non_padding_tokens_in_labels": 133.53885, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3854, + "eval_padding_tokens_in_labels": 378.46115, + "eval_reconstruction_accuracy": 0.9296334281324723, + "eval_runtime": 1569.911, + "eval_samples_per_second": 3.185, + "eval_sentence_accuracy": 0.7555718054085093, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.246975, + "num_input_tokens_seen": 14336000000, + "step": 140000 + }, + { + "epoch": 0.03, + "grad_norm": 0.08277375376462262, + "learning_rate": 8.685858585858586e-05, + "loss": 0.3393, + "num_input_tokens_seen": 14346240000, + "step": 140100 + }, + { + "epoch": 0.03, + "grad_norm": 0.07482283007429197, + "learning_rate": 8.684848484848485e-05, + "loss": 0.3385, + "num_input_tokens_seen": 14356480000, + "step": 140200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06965278206773397, + "learning_rate": 8.683838383838385e-05, + "loss": 0.3377, + "num_input_tokens_seen": 14366720000, + "step": 140300 + }, + { + "epoch": 0.03, + "grad_norm": 0.13245104768000912, + "learning_rate": 8.682828282828284e-05, + "loss": 0.3391, + "num_input_tokens_seen": 14376960000, + "step": 140400 + }, + { + "epoch": 0.03, + "grad_norm": 0.0912307366522594, + "learning_rate": 8.681818181818182e-05, + "loss": 0.338, + "num_input_tokens_seen": 14387200000, + "step": 140500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06298519282199234, + "learning_rate": 8.680808080808082e-05, + "loss": 0.3381, + "num_input_tokens_seen": 14397440000, + "step": 140600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07761969769627508, + "learning_rate": 8.67979797979798e-05, + "loss": 0.3395, + "num_input_tokens_seen": 14407680000, + "step": 140700 + }, + { + "epoch": 0.03, + "grad_norm": 0.1234624294264969, + "learning_rate": 8.678787878787879e-05, + "loss": 0.3371, + "num_input_tokens_seen": 14417920000, + "step": 140800 + }, + { + "epoch": 0.03, + "grad_norm": 0.0678215154519829, + "learning_rate": 8.677777777777778e-05, + "loss": 0.3356, + "num_input_tokens_seen": 14428160000, + "step": 140900 + }, + { + "epoch": 0.04, + "grad_norm": 0.1274912707382735, + "learning_rate": 8.676767676767678e-05, + "loss": 0.3409, + "num_input_tokens_seen": 14438400000, + "step": 141000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3360088844914854, + "eval_average_loss_on_sentence_tokens": 0.35406755339064117, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.33680665493011475, + "eval_non_padding_tokens_in_labels": 133.5254, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3813, + "eval_padding_tokens_in_labels": 378.4746, + "eval_reconstruction_accuracy": 0.9297476350614402, + "eval_runtime": 968.5551, + "eval_samples_per_second": 5.162, + "eval_sentence_accuracy": 0.7706273440163655, + "eval_steps_per_second": 0.013, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 14438400000, + "step": 141000 + }, + { + "epoch": 0.04, + "grad_norm": 0.12341067673079534, + "learning_rate": 8.675757575757575e-05, + "loss": 0.3398, + "num_input_tokens_seen": 14448640000, + "step": 141100 + }, + { + "epoch": 0.04, + "grad_norm": 0.0826216396074604, + "learning_rate": 8.674747474747476e-05, + "loss": 0.3404, + "num_input_tokens_seen": 14458880000, + "step": 141200 + }, + { + "epoch": 0.04, + "grad_norm": 0.06209614687080655, + "learning_rate": 8.673737373737374e-05, + "loss": 0.3379, + "num_input_tokens_seen": 14469120000, + "step": 141300 + }, + { + "epoch": 0.04, + "grad_norm": 0.07851837483140854, + "learning_rate": 8.672727272727273e-05, + "loss": 0.3407, + "num_input_tokens_seen": 14479360000, + "step": 141400 + }, + { + "epoch": 0.04, + "grad_norm": 0.13216550394891108, + "learning_rate": 8.671717171717172e-05, + "loss": 0.3392, + "num_input_tokens_seen": 14489600000, + "step": 141500 + }, + { + "epoch": 0.04, + "grad_norm": 0.10765689153784626, + "learning_rate": 8.670707070707071e-05, + "loss": 0.3388, + "num_input_tokens_seen": 14499840000, + "step": 141600 + }, + { + "epoch": 0.04, + "grad_norm": 0.05934725100438484, + "learning_rate": 8.669696969696969e-05, + "loss": 0.3396, + "num_input_tokens_seen": 14510080000, + "step": 141700 + }, + { + "epoch": 0.04, + "grad_norm": 0.12073171432881455, + "learning_rate": 8.66868686868687e-05, + "loss": 0.3366, + "num_input_tokens_seen": 14520320000, + "step": 141800 + }, + { + "epoch": 0.04, + "grad_norm": 0.09598864568182336, + "learning_rate": 8.667676767676768e-05, + "loss": 0.3419, + "num_input_tokens_seen": 14530560000, + "step": 141900 + }, + { + "epoch": 0.04, + "grad_norm": 0.05916204236670847, + "learning_rate": 8.666666666666667e-05, + "loss": 0.3387, + "num_input_tokens_seen": 14540800000, + "step": 142000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33588414162368274, + "eval_average_loss_on_sentence_tokens": 0.32868484967515466, + "eval_average_shuffling_prob": 0.47, + "eval_loss": 0.33558595180511475, + "eval_non_padding_tokens_in_labels": 133.5023, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38065, + "eval_padding_tokens_in_labels": 378.4977, + "eval_reconstruction_accuracy": 0.9296376322336207, + "eval_runtime": 1371.9991, + "eval_samples_per_second": 3.644, + "eval_sentence_accuracy": 0.7925197839467404, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.24909999999999996, + "num_input_tokens_seen": 14540800000, + "step": 142000 + }, + { + "epoch": 0.04, + "grad_norm": 0.1036032461058061, + "learning_rate": 8.665656565656566e-05, + "loss": 0.3386, + "num_input_tokens_seen": 14551040000, + "step": 142100 + }, + { + "epoch": 0.04, + "grad_norm": 0.066062140864855, + "learning_rate": 8.664646464646465e-05, + "loss": 0.3382, + "num_input_tokens_seen": 14561280000, + "step": 142200 + }, + { + "epoch": 0.04, + "grad_norm": 0.06991320721483257, + "learning_rate": 8.663636363636364e-05, + "loss": 0.3389, + "num_input_tokens_seen": 14571520000, + "step": 142300 + }, + { + "epoch": 0.04, + "grad_norm": 0.05987506053589008, + "learning_rate": 8.662626262626264e-05, + "loss": 0.3391, + "num_input_tokens_seen": 14581760000, + "step": 142400 + }, + { + "epoch": 0.04, + "grad_norm": 0.10430624273285174, + "learning_rate": 8.661616161616161e-05, + "loss": 0.3388, + "num_input_tokens_seen": 14592000000, + "step": 142500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06747378552212382, + "learning_rate": 8.66060606060606e-05, + "loss": 0.3411, + "num_input_tokens_seen": 14602240000, + "step": 142600 + }, + { + "epoch": 0.04, + "grad_norm": 0.11673405259074714, + "learning_rate": 8.65959595959596e-05, + "loss": 0.3392, + "num_input_tokens_seen": 14612480000, + "step": 142700 + }, + { + "epoch": 0.04, + "grad_norm": 0.0637437396958072, + "learning_rate": 8.658585858585859e-05, + "loss": 0.3366, + "num_input_tokens_seen": 14622720000, + "step": 142800 + }, + { + "epoch": 0.04, + "grad_norm": 0.12265256873152412, + "learning_rate": 8.657575757575758e-05, + "loss": 0.3403, + "num_input_tokens_seen": 14632960000, + "step": 142900 + }, + { + "epoch": 0.04, + "grad_norm": 0.1388714656895136, + "learning_rate": 8.656565656565657e-05, + "loss": 0.3368, + "num_input_tokens_seen": 14643200000, + "step": 143000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3361336731757098, + "eval_average_loss_on_sentence_tokens": 0.3507691203745785, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.3367675840854645, + "eval_non_padding_tokens_in_labels": 133.534, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38645, + "eval_padding_tokens_in_labels": 378.466, + "eval_reconstruction_accuracy": 0.9296688747275531, + "eval_runtime": 478.5696, + "eval_samples_per_second": 10.448, + "eval_sentence_accuracy": 0.7730588404184686, + "eval_steps_per_second": 0.027, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 14643200000, + "step": 143000 + }, + { + "epoch": 0.04, + "grad_norm": 0.13303122496442266, + "learning_rate": 8.655555555555555e-05, + "loss": 0.3373, + "num_input_tokens_seen": 14653440000, + "step": 143100 + }, + { + "epoch": 0.04, + "grad_norm": 0.05197085874711175, + "learning_rate": 8.654545454545456e-05, + "loss": 0.3382, + "num_input_tokens_seen": 14663680000, + "step": 143200 + }, + { + "epoch": 0.04, + "grad_norm": 0.08770486884734065, + "learning_rate": 8.653535353535354e-05, + "loss": 0.3371, + "num_input_tokens_seen": 14673920000, + "step": 143300 + }, + { + "epoch": 0.04, + "grad_norm": 0.0668017568951435, + "learning_rate": 8.652525252525253e-05, + "loss": 0.3398, + "num_input_tokens_seen": 14684160000, + "step": 143400 + }, + { + "epoch": 0.04, + "grad_norm": 0.09939508903453351, + "learning_rate": 8.651515151515152e-05, + "loss": 0.3374, + "num_input_tokens_seen": 14694400000, + "step": 143500 + }, + { + "epoch": 0.04, + "grad_norm": 0.0628187481864864, + "learning_rate": 8.650505050505051e-05, + "loss": 0.3394, + "num_input_tokens_seen": 14704640000, + "step": 143600 + }, + { + "epoch": 0.04, + "grad_norm": 0.061987025307998204, + "learning_rate": 8.649494949494949e-05, + "loss": 0.3382, + "num_input_tokens_seen": 14714880000, + "step": 143700 + }, + { + "epoch": 0.04, + "grad_norm": 0.06335168990330428, + "learning_rate": 8.64848484848485e-05, + "loss": 0.338, + "num_input_tokens_seen": 14725120000, + "step": 143800 + }, + { + "epoch": 0.04, + "grad_norm": 0.0661949132697696, + "learning_rate": 8.647474747474748e-05, + "loss": 0.3398, + "num_input_tokens_seen": 14735360000, + "step": 143900 + }, + { + "epoch": 0.04, + "grad_norm": 0.06449902861388347, + "learning_rate": 8.646464646464647e-05, + "loss": 0.3379, + "num_input_tokens_seen": 14745600000, + "step": 144000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33593365506599043, + "eval_average_loss_on_sentence_tokens": 0.30317384989373813, + "eval_average_shuffling_prob": 0.44, + "eval_loss": 0.3344433605670929, + "eval_non_padding_tokens_in_labels": 133.5272, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37745, + "eval_padding_tokens_in_labels": 378.4728, + "eval_reconstruction_accuracy": 0.9297342799986502, + "eval_runtime": 767.2255, + "eval_samples_per_second": 6.517, + "eval_sentence_accuracy": 0.8060275988300106, + "eval_steps_per_second": 0.017, + "eval_variance_shuffling_prob": 0.2464, + "num_input_tokens_seen": 14745600000, + "step": 144000 + }, + { + "epoch": 0.04, + "grad_norm": 0.0860902745009235, + "learning_rate": 8.645454545454546e-05, + "loss": 0.3351, + "num_input_tokens_seen": 14755840000, + "step": 144100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07000279931608305, + "learning_rate": 8.644444444444445e-05, + "loss": 0.3392, + "num_input_tokens_seen": 14766080000, + "step": 144200 + }, + { + "epoch": 0.04, + "grad_norm": 0.10599213339536473, + "learning_rate": 8.643434343434343e-05, + "loss": 0.3377, + "num_input_tokens_seen": 14776320000, + "step": 144300 + }, + { + "epoch": 0.04, + "grad_norm": 0.08187758295383828, + "learning_rate": 8.642424242424243e-05, + "loss": 0.3407, + "num_input_tokens_seen": 14786560000, + "step": 144400 + }, + { + "epoch": 0.04, + "grad_norm": 0.08014465539806151, + "learning_rate": 8.641414141414141e-05, + "loss": 0.3358, + "num_input_tokens_seen": 14796800000, + "step": 144500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06048304227023658, + "learning_rate": 8.64040404040404e-05, + "loss": 0.3414, + "num_input_tokens_seen": 14807040000, + "step": 144600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06438723541329726, + "learning_rate": 8.63939393939394e-05, + "loss": 0.3384, + "num_input_tokens_seen": 14817280000, + "step": 144700 + }, + { + "epoch": 0.04, + "grad_norm": 0.09519914473199349, + "learning_rate": 8.638383838383839e-05, + "loss": 0.3356, + "num_input_tokens_seen": 14827520000, + "step": 144800 + }, + { + "epoch": 0.04, + "grad_norm": 0.0629279971887291, + "learning_rate": 8.637373737373737e-05, + "loss": 0.3379, + "num_input_tokens_seen": 14837760000, + "step": 144900 + }, + { + "epoch": 0.04, + "grad_norm": 0.14868320032062737, + "learning_rate": 8.636363636363637e-05, + "loss": 0.339, + "num_input_tokens_seen": 14848000000, + "step": 145000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3366203991638344, + "eval_average_loss_on_sentence_tokens": 0.4084515253232222, + "eval_average_shuffling_prob": 0.61, + "eval_loss": 0.3399023413658142, + "eval_non_padding_tokens_in_labels": 133.5147, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3757, + "eval_padding_tokens_in_labels": 378.4853, + "eval_reconstruction_accuracy": 0.929516134490913, + "eval_runtime": 620.1317, + "eval_samples_per_second": 8.063, + "eval_sentence_accuracy": 0.732750731243383, + "eval_steps_per_second": 0.021, + "eval_variance_shuffling_prob": 0.23789999999999994, + "num_input_tokens_seen": 14848000000, + "step": 145000 + }, + { + "epoch": 0.04, + "grad_norm": 0.08057305279683057, + "learning_rate": 8.635353535353535e-05, + "loss": 0.3381, + "num_input_tokens_seen": 14858240000, + "step": 145100 + }, + { + "epoch": 0.04, + "grad_norm": 0.0742893291225497, + "learning_rate": 8.634343434343434e-05, + "loss": 0.3405, + "num_input_tokens_seen": 14868480000, + "step": 145200 + }, + { + "epoch": 0.04, + "grad_norm": 0.10152571727340325, + "learning_rate": 8.633333333333334e-05, + "loss": 0.3387, + "num_input_tokens_seen": 14878720000, + "step": 145300 + }, + { + "epoch": 0.04, + "grad_norm": 0.11299864071146085, + "learning_rate": 8.632323232323233e-05, + "loss": 0.3374, + "num_input_tokens_seen": 14888960000, + "step": 145400 + }, + { + "epoch": 0.04, + "grad_norm": 0.1416406035041898, + "learning_rate": 8.63131313131313e-05, + "loss": 0.3418, + "num_input_tokens_seen": 14899200000, + "step": 145500 + }, + { + "epoch": 0.04, + "grad_norm": 0.0890992692077802, + "learning_rate": 8.630303030303031e-05, + "loss": 0.3404, + "num_input_tokens_seen": 14909440000, + "step": 145600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06902998089875753, + "learning_rate": 8.629292929292929e-05, + "loss": 0.3386, + "num_input_tokens_seen": 14919680000, + "step": 145700 + }, + { + "epoch": 0.04, + "grad_norm": 0.06466232381883003, + "learning_rate": 8.628282828282828e-05, + "loss": 0.3395, + "num_input_tokens_seen": 14929920000, + "step": 145800 + }, + { + "epoch": 0.04, + "grad_norm": 0.14656545461439915, + "learning_rate": 8.627272727272727e-05, + "loss": 0.3444, + "num_input_tokens_seen": 14940160000, + "step": 145900 + }, + { + "epoch": 0.04, + "grad_norm": 0.09194202057465904, + "learning_rate": 8.626262626262627e-05, + "loss": 0.3395, + "num_input_tokens_seen": 14950400000, + "step": 146000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33598114218919445, + "eval_average_loss_on_sentence_tokens": 0.3431391565274225, + "eval_average_shuffling_prob": 0.505, + "eval_loss": 0.3362402319908142, + "eval_non_padding_tokens_in_labels": 133.53125, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3802, + "eval_padding_tokens_in_labels": 378.46875, + "eval_reconstruction_accuracy": 0.9296845400830318, + "eval_runtime": 1659.4771, + "eval_samples_per_second": 3.013, + "eval_sentence_accuracy": 0.7774866761175014, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 14950400000, + "step": 146000 + }, + { + "epoch": 0.04, + "grad_norm": 0.0637777455006437, + "learning_rate": 8.625252525252526e-05, + "loss": 0.3378, + "num_input_tokens_seen": 14960640000, + "step": 146100 + }, + { + "epoch": 0.04, + "grad_norm": 0.0544502912479091, + "learning_rate": 8.624242424242425e-05, + "loss": 0.3379, + "num_input_tokens_seen": 14970880000, + "step": 146200 + }, + { + "epoch": 0.04, + "grad_norm": 0.061875166694098684, + "learning_rate": 8.623232323232323e-05, + "loss": 0.3409, + "num_input_tokens_seen": 14981120000, + "step": 146300 + }, + { + "epoch": 0.04, + "grad_norm": 0.06707327171611512, + "learning_rate": 8.622222222222222e-05, + "loss": 0.3385, + "num_input_tokens_seen": 14991360000, + "step": 146400 + }, + { + "epoch": 0.04, + "grad_norm": 0.06524160203761245, + "learning_rate": 8.621212121212121e-05, + "loss": 0.3367, + "num_input_tokens_seen": 15001600000, + "step": 146500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06145283170886387, + "learning_rate": 8.62020202020202e-05, + "loss": 0.3359, + "num_input_tokens_seen": 15011840000, + "step": 146600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06741951378315597, + "learning_rate": 8.61919191919192e-05, + "loss": 0.3352, + "num_input_tokens_seen": 15022080000, + "step": 146700 + }, + { + "epoch": 0.04, + "grad_norm": 0.06932456131410535, + "learning_rate": 8.618181818181819e-05, + "loss": 0.3393, + "num_input_tokens_seen": 15032320000, + "step": 146800 + }, + { + "epoch": 0.04, + "grad_norm": 0.12906938944051796, + "learning_rate": 8.617171717171718e-05, + "loss": 0.3388, + "num_input_tokens_seen": 15042560000, + "step": 146900 + }, + { + "epoch": 0.04, + "grad_norm": 0.09140438522164726, + "learning_rate": 8.616161616161616e-05, + "loss": 0.3376, + "num_input_tokens_seen": 15052800000, + "step": 147000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.335738609789397, + "eval_average_loss_on_sentence_tokens": 0.3347906129752027, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.3356640636920929, + "eval_non_padding_tokens_in_labels": 133.5353, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38845, + "eval_padding_tokens_in_labels": 378.4647, + "eval_reconstruction_accuracy": 0.9297315321332321, + "eval_runtime": 1716.8883, + "eval_samples_per_second": 2.912, + "eval_sentence_accuracy": 0.7881816713621763, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 15052800000, + "step": 147000 + }, + { + "epoch": 0.04, + "grad_norm": 0.09500201966623409, + "learning_rate": 8.615151515151516e-05, + "loss": 0.3395, + "num_input_tokens_seen": 15063040000, + "step": 147100 + }, + { + "epoch": 0.04, + "grad_norm": 0.06412927136053245, + "learning_rate": 8.614141414141414e-05, + "loss": 0.339, + "num_input_tokens_seen": 15073280000, + "step": 147200 + }, + { + "epoch": 0.04, + "grad_norm": 0.10657434997855322, + "learning_rate": 8.613131313131313e-05, + "loss": 0.3414, + "num_input_tokens_seen": 15083520000, + "step": 147300 + }, + { + "epoch": 0.04, + "grad_norm": 0.06757440229454144, + "learning_rate": 8.612121212121213e-05, + "loss": 0.3371, + "num_input_tokens_seen": 15093760000, + "step": 147400 + }, + { + "epoch": 0.04, + "grad_norm": 0.07798130111639921, + "learning_rate": 8.611111111111112e-05, + "loss": 0.3418, + "num_input_tokens_seen": 15104000000, + "step": 147500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06066429573218123, + "learning_rate": 8.610101010101011e-05, + "loss": 0.3416, + "num_input_tokens_seen": 15114240000, + "step": 147600 + }, + { + "epoch": 0.04, + "grad_norm": 0.07053111779598173, + "learning_rate": 8.60909090909091e-05, + "loss": 0.3395, + "num_input_tokens_seen": 15124480000, + "step": 147700 + }, + { + "epoch": 0.04, + "grad_norm": 0.08129714414209982, + "learning_rate": 8.608080808080808e-05, + "loss": 0.3391, + "num_input_tokens_seen": 15134720000, + "step": 147800 + }, + { + "epoch": 0.04, + "grad_norm": 0.06525150455358464, + "learning_rate": 8.607070707070707e-05, + "loss": 0.3389, + "num_input_tokens_seen": 15144960000, + "step": 147900 + }, + { + "epoch": 0.04, + "grad_norm": 0.06227274740453145, + "learning_rate": 8.606060606060606e-05, + "loss": 0.3402, + "num_input_tokens_seen": 15155200000, + "step": 148000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33573424045140443, + "eval_average_loss_on_sentence_tokens": 0.3423723558972095, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.33601561188697815, + "eval_non_padding_tokens_in_labels": 133.5275, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38105, + "eval_padding_tokens_in_labels": 378.4725, + "eval_reconstruction_accuracy": 0.9297158738711163, + "eval_runtime": 1938.2327, + "eval_samples_per_second": 2.58, + "eval_sentence_accuracy": 0.7795772246846232, + "eval_steps_per_second": 0.007, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 15155200000, + "step": 148000 + }, + { + "epoch": 0.04, + "grad_norm": 0.06965609797980542, + "learning_rate": 8.605050505050506e-05, + "loss": 0.3362, + "num_input_tokens_seen": 15165440000, + "step": 148100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07342615230117693, + "learning_rate": 8.604040404040405e-05, + "loss": 0.3377, + "num_input_tokens_seen": 15175680000, + "step": 148200 + }, + { + "epoch": 0.04, + "grad_norm": 0.10644501174219322, + "learning_rate": 8.603030303030304e-05, + "loss": 0.3359, + "num_input_tokens_seen": 15185920000, + "step": 148300 + }, + { + "epoch": 0.04, + "grad_norm": 0.0821201390656984, + "learning_rate": 8.602020202020202e-05, + "loss": 0.3376, + "num_input_tokens_seen": 15196160000, + "step": 148400 + }, + { + "epoch": 0.04, + "grad_norm": 0.08872811995531027, + "learning_rate": 8.601010101010102e-05, + "loss": 0.339, + "num_input_tokens_seen": 15206400000, + "step": 148500 + }, + { + "epoch": 0.04, + "grad_norm": 0.09061584863208887, + "learning_rate": 8.6e-05, + "loss": 0.3369, + "num_input_tokens_seen": 15216640000, + "step": 148600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06415255500091109, + "learning_rate": 8.5989898989899e-05, + "loss": 0.3381, + "num_input_tokens_seen": 15226880000, + "step": 148700 + }, + { + "epoch": 0.04, + "grad_norm": 0.07142537129925773, + "learning_rate": 8.597979797979799e-05, + "loss": 0.3357, + "num_input_tokens_seen": 15237120000, + "step": 148800 + }, + { + "epoch": 0.04, + "grad_norm": 0.06301135926749792, + "learning_rate": 8.596969696969698e-05, + "loss": 0.3364, + "num_input_tokens_seen": 15247360000, + "step": 148900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07896060511412561, + "learning_rate": 8.595959595959596e-05, + "loss": 0.3369, + "num_input_tokens_seen": 15257600000, + "step": 149000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3359684520098983, + "eval_average_loss_on_sentence_tokens": 0.3582846525171729, + "eval_average_shuffling_prob": 0.52, + "eval_loss": 0.3370117247104645, + "eval_non_padding_tokens_in_labels": 133.5037, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3704, + "eval_padding_tokens_in_labels": 378.4963, + "eval_reconstruction_accuracy": 0.9296747423263463, + "eval_runtime": 1759.7544, + "eval_samples_per_second": 2.841, + "eval_sentence_accuracy": 0.7691738295619718, + "eval_steps_per_second": 0.007, + "eval_variance_shuffling_prob": 0.2496, + "num_input_tokens_seen": 15257600000, + "step": 149000 + }, + { + "epoch": 0.04, + "grad_norm": 0.06143050801022579, + "learning_rate": 8.594949494949496e-05, + "loss": 0.3374, + "num_input_tokens_seen": 15267840000, + "step": 149100 + }, + { + "epoch": 0.04, + "grad_norm": 0.08199117169652825, + "learning_rate": 8.593939393939394e-05, + "loss": 0.3374, + "num_input_tokens_seen": 15278080000, + "step": 149200 + }, + { + "epoch": 0.04, + "grad_norm": 0.07622622879839261, + "learning_rate": 8.592929292929293e-05, + "loss": 0.3397, + "num_input_tokens_seen": 15288320000, + "step": 149300 + }, + { + "epoch": 0.04, + "grad_norm": 0.05937871548829229, + "learning_rate": 8.591919191919193e-05, + "loss": 0.3357, + "num_input_tokens_seen": 15298560000, + "step": 149400 + }, + { + "epoch": 0.04, + "grad_norm": 0.11908722094901739, + "learning_rate": 8.590909090909092e-05, + "loss": 0.3398, + "num_input_tokens_seen": 15308800000, + "step": 149500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06485649792908575, + "learning_rate": 8.58989898989899e-05, + "loss": 0.3384, + "num_input_tokens_seen": 15319040000, + "step": 149600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06772540943908069, + "learning_rate": 8.58888888888889e-05, + "loss": 0.3414, + "num_input_tokens_seen": 15329280000, + "step": 149700 + }, + { + "epoch": 0.04, + "grad_norm": 0.06877853088732316, + "learning_rate": 8.587878787878788e-05, + "loss": 0.3381, + "num_input_tokens_seen": 15339520000, + "step": 149800 + }, + { + "epoch": 0.04, + "grad_norm": 0.06519559897022152, + "learning_rate": 8.586868686868687e-05, + "loss": 0.3384, + "num_input_tokens_seen": 15349760000, + "step": 149900 + }, + { + "epoch": 0.04, + "grad_norm": 0.09189514343390225, + "learning_rate": 8.585858585858586e-05, + "loss": 0.3386, + "num_input_tokens_seen": 15360000000, + "step": 150000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3363347777781528, + "eval_average_loss_on_sentence_tokens": 0.35238029442763585, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.33702147006988525, + "eval_non_padding_tokens_in_labels": 133.5767, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.39385, + "eval_padding_tokens_in_labels": 378.4233, + "eval_reconstruction_accuracy": 0.9296562901426699, + "eval_runtime": 1734.8708, + "eval_samples_per_second": 2.882, + "eval_sentence_accuracy": 0.7758851185242343, + "eval_steps_per_second": 0.007, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 15360000000, + "step": 150000 + }, + { + "epoch": 0.04, + "grad_norm": 0.06448103379958556, + "learning_rate": 8.584848484848486e-05, + "loss": 0.3407, + "num_input_tokens_seen": 15370240000, + "step": 150100 + }, + { + "epoch": 0.04, + "grad_norm": 0.06677261784899476, + "learning_rate": 8.583838383838383e-05, + "loss": 0.3344, + "num_input_tokens_seen": 15380480000, + "step": 150200 + }, + { + "epoch": 0.04, + "grad_norm": 0.0719618890179875, + "learning_rate": 8.582828282828284e-05, + "loss": 0.3372, + "num_input_tokens_seen": 15390720000, + "step": 150300 + }, + { + "epoch": 0.04, + "grad_norm": 0.13296126175638098, + "learning_rate": 8.581818181818182e-05, + "loss": 0.3352, + "num_input_tokens_seen": 15400960000, + "step": 150400 + }, + { + "epoch": 0.04, + "grad_norm": 0.06440342446315477, + "learning_rate": 8.580808080808081e-05, + "loss": 0.3393, + "num_input_tokens_seen": 15411200000, + "step": 150500 + }, + { + "epoch": 0.04, + "grad_norm": 0.09090022191491647, + "learning_rate": 8.57979797979798e-05, + "loss": 0.3372, + "num_input_tokens_seen": 15421440000, + "step": 150600 + }, + { + "epoch": 0.04, + "grad_norm": 0.07987065158558732, + "learning_rate": 8.57878787878788e-05, + "loss": 0.3384, + "num_input_tokens_seen": 15431680000, + "step": 150700 + }, + { + "epoch": 0.04, + "grad_norm": 0.09291244182056882, + "learning_rate": 8.577777777777777e-05, + "loss": 0.3331, + "num_input_tokens_seen": 15441920000, + "step": 150800 + }, + { + "epoch": 0.04, + "grad_norm": 0.0983587129478201, + "learning_rate": 8.576767676767678e-05, + "loss": 0.3371, + "num_input_tokens_seen": 15452160000, + "step": 150900 + }, + { + "epoch": 0.04, + "grad_norm": 0.08249187005870055, + "learning_rate": 8.575757575757576e-05, + "loss": 0.3406, + "num_input_tokens_seen": 15462400000, + "step": 151000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3351797587030463, + "eval_average_loss_on_sentence_tokens": 0.35008104226452497, + "eval_average_shuffling_prob": 0.515, + "eval_loss": 0.3358691334724426, + "eval_non_padding_tokens_in_labels": 133.52555, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3808, + "eval_padding_tokens_in_labels": 378.47445, + "eval_reconstruction_accuracy": 0.9298132889485851, + "eval_runtime": 1785.4853, + "eval_samples_per_second": 2.8, + "eval_sentence_accuracy": 0.7726864894934233, + "eval_steps_per_second": 0.007, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 15462400000, + "step": 151000 + }, + { + "epoch": 0.05, + "grad_norm": 0.10530289218441122, + "learning_rate": 8.574747474747475e-05, + "loss": 0.3382, + "num_input_tokens_seen": 15472640000, + "step": 151100 + }, + { + "epoch": 0.05, + "grad_norm": 0.09790812911273757, + "learning_rate": 8.573737373737374e-05, + "loss": 0.3409, + "num_input_tokens_seen": 15482880000, + "step": 151200 + }, + { + "epoch": 0.05, + "grad_norm": 0.13176730276706788, + "learning_rate": 8.572727272727273e-05, + "loss": 0.3378, + "num_input_tokens_seen": 15493120000, + "step": 151300 + }, + { + "epoch": 0.05, + "grad_norm": 0.06420486494999122, + "learning_rate": 8.571717171717172e-05, + "loss": 0.3379, + "num_input_tokens_seen": 15503360000, + "step": 151400 + }, + { + "epoch": 0.05, + "grad_norm": 0.08773818040919003, + "learning_rate": 8.570707070707072e-05, + "loss": 0.3399, + "num_input_tokens_seen": 15513600000, + "step": 151500 + }, + { + "epoch": 0.05, + "grad_norm": 0.06567485801428126, + "learning_rate": 8.56969696969697e-05, + "loss": 0.342, + "num_input_tokens_seen": 15523840000, + "step": 151600 + }, + { + "epoch": 0.05, + "grad_norm": 0.07348871068799086, + "learning_rate": 8.568686868686869e-05, + "loss": 0.3395, + "num_input_tokens_seen": 15534080000, + "step": 151700 + }, + { + "epoch": 0.05, + "grad_norm": 0.118533294513518, + "learning_rate": 8.567676767676768e-05, + "loss": 0.3398, + "num_input_tokens_seen": 15544320000, + "step": 151800 + }, + { + "epoch": 0.05, + "grad_norm": 0.08258617125533932, + "learning_rate": 8.566666666666667e-05, + "loss": 0.337, + "num_input_tokens_seen": 15554560000, + "step": 151900 + }, + { + "epoch": 0.05, + "grad_norm": 0.077408294104894, + "learning_rate": 8.565656565656566e-05, + "loss": 0.3388, + "num_input_tokens_seen": 15564800000, + "step": 152000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3357248069396781, + "eval_average_loss_on_sentence_tokens": 0.3464712697969554, + "eval_average_shuffling_prob": 0.51, + "eval_loss": 0.33619141578674316, + "eval_non_padding_tokens_in_labels": 133.542, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37205, + "eval_padding_tokens_in_labels": 378.458, + "eval_reconstruction_accuracy": 0.9297479242606727, + "eval_runtime": 1758.3649, + "eval_samples_per_second": 2.844, + "eval_sentence_accuracy": 0.7775808853876935, + "eval_steps_per_second": 0.007, + "eval_variance_shuffling_prob": 0.2499, + "num_input_tokens_seen": 15564800000, + "step": 152000 + }, + { + "epoch": 0.05, + "grad_norm": 0.07268202193852646, + "learning_rate": 8.564646464646465e-05, + "loss": 0.3413, + "num_input_tokens_seen": 15575040000, + "step": 152100 + }, + { + "epoch": 0.05, + "grad_norm": 0.07358125562613957, + "learning_rate": 8.563636363636363e-05, + "loss": 0.3385, + "num_input_tokens_seen": 15585280000, + "step": 152200 + }, + { + "epoch": 0.05, + "grad_norm": 0.0788084119962466, + "learning_rate": 8.562626262626264e-05, + "loss": 0.3366, + "num_input_tokens_seen": 15595520000, + "step": 152300 + }, + { + "epoch": 0.05, + "grad_norm": 0.0987225539735557, + "learning_rate": 8.561616161616162e-05, + "loss": 0.3399, + "num_input_tokens_seen": 15605760000, + "step": 152400 + }, + { + "epoch": 0.05, + "grad_norm": 0.12488364041602576, + "learning_rate": 8.560606060606061e-05, + "loss": 0.3376, + "num_input_tokens_seen": 15616000000, + "step": 152500 + }, + { + "epoch": 0.05, + "grad_norm": 0.09134205929083482, + "learning_rate": 8.55959595959596e-05, + "loss": 0.34, + "num_input_tokens_seen": 15626240000, + "step": 152600 + }, + { + "epoch": 0.05, + "grad_norm": 0.09043381601223671, + "learning_rate": 8.558585858585859e-05, + "loss": 0.3396, + "num_input_tokens_seen": 15636480000, + "step": 152700 + }, + { + "epoch": 0.05, + "grad_norm": 0.08460857503694218, + "learning_rate": 8.557575757575757e-05, + "loss": 0.339, + "num_input_tokens_seen": 15646720000, + "step": 152800 + }, + { + "epoch": 0.05, + "grad_norm": 0.07553074229801317, + "learning_rate": 8.556565656565658e-05, + "loss": 0.3396, + "num_input_tokens_seen": 15656960000, + "step": 152900 + }, + { + "epoch": 0.05, + "grad_norm": 0.06890428696266863, + "learning_rate": 8.555555555555556e-05, + "loss": 0.3425, + "num_input_tokens_seen": 15667200000, + "step": 153000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.33557223903525146, + "eval_average_loss_on_sentence_tokens": 0.33197347619171286, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.33543944358825684, + "eval_non_padding_tokens_in_labels": 133.52205, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3791, + "eval_padding_tokens_in_labels": 378.47795, + "eval_reconstruction_accuracy": 0.9297265506915101, + "eval_runtime": 1476.0249, + "eval_samples_per_second": 3.387, + "eval_sentence_accuracy": 0.7887873023848404, + "eval_steps_per_second": 0.009, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 15667200000, + "step": 153000 + }, + { + "epoch": 0.05, + "grad_norm": 0.1343467879620693, + "learning_rate": 8.554545454545455e-05, + "loss": 0.3368, + "num_input_tokens_seen": 15677440000, + "step": 153100 + }, + { + "epoch": 0.05, + "grad_norm": 0.09415484722626355, + "learning_rate": 8.553535353535354e-05, + "loss": 0.3381, + "num_input_tokens_seen": 15687680000, + "step": 153200 + }, + { + "epoch": 0.05, + "grad_norm": 0.13187949555992096, + "learning_rate": 8.552525252525253e-05, + "loss": 0.3368, + "num_input_tokens_seen": 15697920000, + "step": 153300 + }, + { + "epoch": 0.05, + "grad_norm": 0.12800694818935374, + "learning_rate": 8.551515151515151e-05, + "loss": 0.3364, + "num_input_tokens_seen": 15708160000, + "step": 153400 + }, + { + "epoch": 0.05, + "grad_norm": 0.10350202075164999, + "learning_rate": 8.550505050505052e-05, + "loss": 0.3419, + "num_input_tokens_seen": 15718400000, + "step": 153500 + }, + { + "epoch": 0.05, + "grad_norm": 0.107483586011903, + "learning_rate": 8.54949494949495e-05, + "loss": 0.3414, + "num_input_tokens_seen": 15728640000, + "step": 153600 + }, + { + "epoch": 0.05, + "grad_norm": 0.07040572278142414, + "learning_rate": 8.548484848484849e-05, + "loss": 0.3377, + "num_input_tokens_seen": 15738880000, + "step": 153700 + }, + { + "epoch": 0.05, + "grad_norm": 0.08107829181114014, + "learning_rate": 8.547474747474748e-05, + "loss": 0.3386, + "num_input_tokens_seen": 15749120000, + "step": 153800 + }, + { + "epoch": 0.05, + "grad_norm": 0.07812268283507114, + "learning_rate": 8.546464646464647e-05, + "loss": 0.3404, + "num_input_tokens_seen": 15759360000, + "step": 153900 + }, + { + "epoch": 0.05, + "grad_norm": 0.06387067917952936, + "learning_rate": 8.545454545454545e-05, + "loss": 0.3377, + "num_input_tokens_seen": 15769600000, + "step": 154000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.33486943835988975, + "eval_average_loss_on_sentence_tokens": 0.3120369303302991, + "eval_average_shuffling_prob": 0.455, + "eval_loss": 0.33375975489616394, + "eval_non_padding_tokens_in_labels": 133.47715, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.36465, + "eval_padding_tokens_in_labels": 378.52285, + "eval_reconstruction_accuracy": 0.9298207274350389, + "eval_runtime": 1673.6651, + "eval_samples_per_second": 2.987, + "eval_sentence_accuracy": 0.8039908841315699, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.247975, + "num_input_tokens_seen": 15769600000, + "step": 154000 + }, + { + "epoch": 0.05, + "grad_norm": 0.05757212088658233, + "learning_rate": 8.544444444444445e-05, + "loss": 0.3356, + "num_input_tokens_seen": 15779840000, + "step": 154100 + }, + { + "epoch": 0.05, + "grad_norm": 0.06253538024374221, + "learning_rate": 8.543434343434343e-05, + "loss": 0.3406, + "num_input_tokens_seen": 15790080000, + "step": 154200 + }, + { + "epoch": 0.05, + "grad_norm": 0.06012862278619877, + "learning_rate": 8.542424242424242e-05, + "loss": 0.3394, + "num_input_tokens_seen": 15800320000, + "step": 154300 + }, + { + "epoch": 0.05, + "grad_norm": 0.06410466357075253, + "learning_rate": 8.541414141414142e-05, + "loss": 0.3366, + "num_input_tokens_seen": 15810560000, + "step": 154400 + }, + { + "epoch": 0.05, + "grad_norm": 0.11077020785149333, + "learning_rate": 8.540404040404041e-05, + "loss": 0.3364, + "num_input_tokens_seen": 15820800000, + "step": 154500 + }, + { + "epoch": 0.05, + "grad_norm": 0.10211468023923184, + "learning_rate": 8.539393939393939e-05, + "loss": 0.3378, + "num_input_tokens_seen": 15831040000, + "step": 154600 + }, + { + "epoch": 0.05, + "grad_norm": 0.10999693264464051, + "learning_rate": 8.538383838383839e-05, + "loss": 0.3377, + "num_input_tokens_seen": 15841280000, + "step": 154700 + }, + { + "epoch": 0.05, + "grad_norm": 0.07954955251322178, + "learning_rate": 8.537373737373737e-05, + "loss": 0.3408, + "num_input_tokens_seen": 15851520000, + "step": 154800 + }, + { + "epoch": 0.05, + "grad_norm": 0.07225220626534273, + "learning_rate": 8.536363636363636e-05, + "loss": 0.3386, + "num_input_tokens_seen": 15861760000, + "step": 154900 + }, + { + "epoch": 0.05, + "grad_norm": 0.057597640932675484, + "learning_rate": 8.535353535353535e-05, + "loss": 0.3384, + "num_input_tokens_seen": 15872000000, + "step": 155000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.33524800508433616, + "eval_average_loss_on_sentence_tokens": 0.33608652497695135, + "eval_average_shuffling_prob": 0.485, + "eval_loss": 0.33524414896965027, + "eval_non_padding_tokens_in_labels": 133.50955, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.38655, + "eval_padding_tokens_in_labels": 378.49045, + "eval_reconstruction_accuracy": 0.9298407379960484, + "eval_runtime": 1641.448, + "eval_samples_per_second": 3.046, + "eval_sentence_accuracy": 0.7873293017747232, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.24977499999999994, + "num_input_tokens_seen": 15872000000, + "step": 155000 + }, + { + "epoch": 0.05, + "grad_norm": 0.09044932662983793, + "learning_rate": 8.534343434343435e-05, + "loss": 0.3402, + "num_input_tokens_seen": 15882240000, + "step": 155100 + }, + { + "epoch": 0.05, + "grad_norm": 0.07481005365531647, + "learning_rate": 8.533333333333334e-05, + "loss": 0.336, + "num_input_tokens_seen": 15892480000, + "step": 155200 + }, + { + "epoch": 0.05, + "grad_norm": 0.07052421649780685, + "learning_rate": 8.532323232323233e-05, + "loss": 0.3383, + "num_input_tokens_seen": 15902720000, + "step": 155300 + }, + { + "epoch": 0.05, + "grad_norm": 0.11364160762345256, + "learning_rate": 8.531313131313132e-05, + "loss": 0.3398, + "num_input_tokens_seen": 15912960000, + "step": 155400 + }, + { + "epoch": 0.05, + "grad_norm": 0.07398920890843244, + "learning_rate": 8.53030303030303e-05, + "loss": 0.3366, + "num_input_tokens_seen": 15923200000, + "step": 155500 + }, + { + "epoch": 0.05, + "grad_norm": 0.06113417794527773, + "learning_rate": 8.52929292929293e-05, + "loss": 0.3365, + "num_input_tokens_seen": 15933440000, + "step": 155600 + }, + { + "epoch": 0.05, + "grad_norm": 0.06509223252428605, + "learning_rate": 8.528282828282828e-05, + "loss": 0.3398, + "num_input_tokens_seen": 15943680000, + "step": 155700 + }, + { + "epoch": 0.05, + "grad_norm": 0.09669740711973776, + "learning_rate": 8.527272727272728e-05, + "loss": 0.3347, + "num_input_tokens_seen": 15953920000, + "step": 155800 + }, + { + "epoch": 0.05, + "grad_norm": 0.07186618969916427, + "learning_rate": 8.526262626262627e-05, + "loss": 0.3387, + "num_input_tokens_seen": 15964160000, + "step": 155900 + }, + { + "epoch": 0.05, + "grad_norm": 0.06063334285801256, + "learning_rate": 8.525252525252526e-05, + "loss": 0.3381, + "num_input_tokens_seen": 15974400000, + "step": 156000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.33523029567759527, + "eval_average_loss_on_sentence_tokens": 0.31907218070578663, + "eval_average_shuffling_prob": 0.465, + "eval_loss": 0.3345019519329071, + "eval_non_padding_tokens_in_labels": 133.5266, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3974, + "eval_padding_tokens_in_labels": 378.4734, + "eval_reconstruction_accuracy": 0.9299205561310429, + "eval_runtime": 1716.0371, + "eval_samples_per_second": 2.914, + "eval_sentence_accuracy": 0.7958350530263607, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.248775, + "num_input_tokens_seen": 15974400000, + "step": 156000 + }, + { + "epoch": 0.05, + "grad_norm": 0.10624695591212284, + "learning_rate": 8.524242424242424e-05, + "loss": 0.3342, + "num_input_tokens_seen": 15984640000, + "step": 156100 + }, + { + "epoch": 0.05, + "grad_norm": 0.09072312928765267, + "learning_rate": 8.523232323232324e-05, + "loss": 0.3366, + "num_input_tokens_seen": 15994880000, + "step": 156200 + }, + { + "epoch": 0.05, + "grad_norm": 0.06322031383871256, + "learning_rate": 8.522222222222222e-05, + "loss": 0.3378, + "num_input_tokens_seen": 16005120000, + "step": 156300 + }, + { + "epoch": 0.05, + "grad_norm": 0.12492857695584443, + "learning_rate": 8.521212121212122e-05, + "loss": 0.3383, + "num_input_tokens_seen": 16015360000, + "step": 156400 + }, + { + "epoch": 0.05, + "grad_norm": 0.08228999441244442, + "learning_rate": 8.520202020202021e-05, + "loss": 0.3397, + "num_input_tokens_seen": 16025600000, + "step": 156500 + }, + { + "epoch": 0.05, + "grad_norm": 0.08071931575718286, + "learning_rate": 8.51919191919192e-05, + "loss": 0.3413, + "num_input_tokens_seen": 16035840000, + "step": 156600 + }, + { + "epoch": 0.05, + "grad_norm": 0.08110932869607658, + "learning_rate": 8.518181818181819e-05, + "loss": 0.3368, + "num_input_tokens_seen": 16046080000, + "step": 156700 + }, + { + "epoch": 0.05, + "grad_norm": 0.0900034224191438, + "learning_rate": 8.517171717171718e-05, + "loss": 0.3388, + "num_input_tokens_seen": 16056320000, + "step": 156800 + }, + { + "epoch": 0.05, + "grad_norm": 0.08332372463480936, + "learning_rate": 8.516161616161616e-05, + "loss": 0.3353, + "num_input_tokens_seen": 16066560000, + "step": 156900 + }, + { + "epoch": 0.05, + "grad_norm": 0.06081013972346889, + "learning_rate": 8.515151515151515e-05, + "loss": 0.3381, + "num_input_tokens_seen": 16076800000, + "step": 157000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.33511226273355893, + "eval_average_loss_on_sentence_tokens": 0.36693506717655056, + "eval_average_shuffling_prob": 0.555, + "eval_loss": 0.33653318881988525, + "eval_non_padding_tokens_in_labels": 133.5008, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.37905, + "eval_padding_tokens_in_labels": 378.4992, + "eval_reconstruction_accuracy": 0.9299216066660091, + "eval_runtime": 1535.7836, + "eval_samples_per_second": 3.256, + "eval_sentence_accuracy": 0.760080391910564, + "eval_steps_per_second": 0.008, + "eval_variance_shuffling_prob": 0.246975, + "num_input_tokens_seen": 16076800000, + "step": 157000 + }, + { + "epoch": 0.05, + "grad_norm": 0.07018505407563493, + "learning_rate": 8.514141414141415e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16087040000, + "step": 157100 + }, + { + "epoch": 0.05, + "grad_norm": 0.07494917596650877, + "learning_rate": 8.513131313131314e-05, + "loss": 0.3346, + "num_input_tokens_seen": 16097280000, + "step": 157200 + }, + { + "epoch": 0.05, + "grad_norm": 0.08021604767409782, + "learning_rate": 8.512121212121213e-05, + "loss": 0.3383, + "num_input_tokens_seen": 16107520000, + "step": 157300 + }, + { + "epoch": 0.05, + "grad_norm": 0.1003469143325938, + "learning_rate": 8.511111111111112e-05, + "loss": 0.339, + "num_input_tokens_seen": 16117760000, + "step": 157400 + }, + { + "epoch": 0.05, + "grad_norm": 0.07298321876342977, + "learning_rate": 8.51010101010101e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16128000000, + "step": 157500 + }, + { + "epoch": 0.05, + "grad_norm": 0.06775279077588713, + "learning_rate": 8.50909090909091e-05, + "loss": 0.3383, + "num_input_tokens_seen": 16138240000, + "step": 157600 + }, + { + "epoch": 0.05, + "grad_norm": 0.05894832910986607, + "learning_rate": 8.508080808080808e-05, + "loss": 0.3387, + "num_input_tokens_seen": 16148480000, + "step": 157700 + }, + { + "epoch": 0.05, + "grad_norm": 0.06508850346338979, + "learning_rate": 8.507070707070708e-05, + "loss": 0.3388, + "num_input_tokens_seen": 16158720000, + "step": 157800 + }, + { + "epoch": 0.05, + "grad_norm": 0.09100341796875, + "learning_rate": 8.506060606060607e-05, + "loss": 0.3357, + "num_input_tokens_seen": 16168960000, + "step": 157900 + }, + { + "epoch": 0.05, + "grad_norm": 0.10968675843080532, + "learning_rate": 8.505050505050506e-05, + "loss": 0.3382, + "num_input_tokens_seen": 16179200000, + "step": 158000 + }, + { + "epoch": 0.05, + "eval_average_loss_on_non_sentence_tokens": 0.3352808286967399, + "eval_average_loss_on_sentence_tokens": 0.3331373386053075, + "eval_average_shuffling_prob": 0.495, + "eval_loss": 0.33525389432907104, + "eval_non_padding_tokens_in_labels": 133.53945, + "eval_num_sentence_tokens": 23.1033, + "eval_num_sentinel_tokens_in_labels": 52.3766, + "eval_padding_tokens_in_labels": 378.46055, + "eval_reconstruction_accuracy": 0.929832096240742, + "eval_runtime": 1792.4159, + "eval_samples_per_second": 2.79, + "eval_sentence_accuracy": 0.7846779837421717, + "eval_steps_per_second": 0.007, + "eval_variance_shuffling_prob": 0.24997499999999995, + "num_input_tokens_seen": 16179200000, + "step": 158000 + }, + { + "epoch": 0.0, + "grad_norm": 0.05752241611480713, + "learning_rate": 8.504040404040404e-05, + "loss": 0.3379, + "num_input_tokens_seen": 16189440000, + "step": 158100 + }, + { + "epoch": 0.0, + "grad_norm": 0.10449355840682983, + "learning_rate": 8.503030303030304e-05, + "loss": 0.3398, + "num_input_tokens_seen": 16199680000, + "step": 158200 + }, + { + "epoch": 0.0, + "grad_norm": 0.07070832699537277, + "learning_rate": 8.502020202020202e-05, + "loss": 0.3358, + "num_input_tokens_seen": 16209920000, + "step": 158300 + }, + { + "epoch": 0.0, + "grad_norm": 0.1121927946805954, + "learning_rate": 8.501010101010101e-05, + "loss": 0.3389, + "num_input_tokens_seen": 16220160000, + "step": 158400 + }, + { + "epoch": 0.0, + "grad_norm": 0.07741651684045792, + "learning_rate": 8.5e-05, + "loss": 0.3376, + "num_input_tokens_seen": 16230400000, + "step": 158500 + }, + { + "epoch": 0.0, + "grad_norm": 0.06180635839700699, + "learning_rate": 8.4989898989899e-05, + "loss": 0.3379, + "num_input_tokens_seen": 16240640000, + "step": 158600 + }, + { + "epoch": 0.0, + "grad_norm": 0.22125934064388275, + "learning_rate": 8.497979797979798e-05, + "loss": 0.3396, + "num_input_tokens_seen": 16250880000, + "step": 158700 + }, + { + "epoch": 0.0, + "grad_norm": 0.07692223787307739, + "learning_rate": 8.496969696969698e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16261120000, + "step": 158800 + }, + { + "epoch": 0.0, + "grad_norm": 0.12157636135816574, + "learning_rate": 8.495959595959596e-05, + "loss": 0.3394, + "num_input_tokens_seen": 16271360000, + "step": 158900 + }, + { + "epoch": 0.0, + "grad_norm": 0.06225129961967468, + "learning_rate": 8.494949494949495e-05, + "loss": 0.3355, + "num_input_tokens_seen": 16281600000, + "step": 159000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.33496691140277646, + "eval_average_loss_on_sentence_tokens": 0.2979432621118372, + "eval_average_shuffling_prob": 0.42410714285714285, + "eval_loss": 0.3333478569984436, + "eval_non_padding_tokens_in_labels": 133.68450892857143, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.434107142857144, + "eval_padding_tokens_in_labels": 378.3154910714286, + "eval_reconstruction_accuracy": 0.9298194882775016, + "eval_runtime": 163.9009, + "eval_samples_per_second": 34.167, + "eval_sentence_accuracy": 0.8167692418506751, + "eval_steps_per_second": 0.085, + "eval_variance_shuffling_prob": 0.24424027423469397, + "num_input_tokens_seen": 16281600000, + "step": 159000 + }, + { + "epoch": 0.0, + "grad_norm": 0.1734858751296997, + "learning_rate": 8.493939393939394e-05, + "loss": 0.3365, + "num_input_tokens_seen": 16291840000, + "step": 159100 + }, + { + "epoch": 0.0, + "grad_norm": 0.06730858981609344, + "learning_rate": 8.492929292929294e-05, + "loss": 0.338, + "num_input_tokens_seen": 16302080000, + "step": 159200 + }, + { + "epoch": 0.0, + "grad_norm": 0.14307166635990143, + "learning_rate": 8.491919191919191e-05, + "loss": 0.3364, + "num_input_tokens_seen": 16312320000, + "step": 159300 + }, + { + "epoch": 0.0, + "grad_norm": 0.08365613967180252, + "learning_rate": 8.490909090909092e-05, + "loss": 0.3371, + "num_input_tokens_seen": 16322560000, + "step": 159400 + }, + { + "epoch": 0.0, + "grad_norm": 0.10825633257627487, + "learning_rate": 8.48989898989899e-05, + "loss": 0.3365, + "num_input_tokens_seen": 16332800000, + "step": 159500 + }, + { + "epoch": 0.0, + "grad_norm": 0.06810589134693146, + "learning_rate": 8.488888888888889e-05, + "loss": 0.3385, + "num_input_tokens_seen": 16343040000, + "step": 159600 + }, + { + "epoch": 0.0, + "grad_norm": 0.4794802963733673, + "learning_rate": 8.487878787878788e-05, + "loss": 0.3383, + "num_input_tokens_seen": 16353280000, + "step": 159700 + }, + { + "epoch": 0.0, + "grad_norm": 0.10574060678482056, + "learning_rate": 8.486868686868687e-05, + "loss": 0.3368, + "num_input_tokens_seen": 16363520000, + "step": 159800 + }, + { + "epoch": 0.0, + "grad_norm": 0.06322072446346283, + "learning_rate": 8.485858585858585e-05, + "loss": 0.3415, + "num_input_tokens_seen": 16373760000, + "step": 159900 + }, + { + "epoch": 0.0, + "grad_norm": 0.08988539129495621, + "learning_rate": 8.484848484848486e-05, + "loss": 0.3363, + "num_input_tokens_seen": 16384000000, + "step": 160000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.3360671123888822, + "eval_average_loss_on_sentence_tokens": 0.3482575810592971, + "eval_average_shuffling_prob": 0.5133928571428571, + "eval_loss": 0.3366263210773468, + "eval_non_padding_tokens_in_labels": 133.65441964285714, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.417589285714286, + "eval_padding_tokens_in_labels": 378.34558035714286, + "eval_reconstruction_accuracy": 0.9297088133295242, + "eval_runtime": 158.0038, + "eval_samples_per_second": 35.442, + "eval_sentence_accuracy": 0.7734475666071757, + "eval_steps_per_second": 0.089, + "eval_variance_shuffling_prob": 0.24982063137755103, + "num_input_tokens_seen": 16384000000, + "step": 160000 + }, + { + "epoch": 0.0, + "grad_norm": 0.18210990726947784, + "learning_rate": 8.483838383838384e-05, + "loss": 0.3358, + "num_input_tokens_seen": 16394240000, + "step": 160100 + }, + { + "epoch": 0.0, + "grad_norm": 0.057297080755233765, + "learning_rate": 8.482828282828283e-05, + "loss": 0.3396, + "num_input_tokens_seen": 16404480000, + "step": 160200 + }, + { + "epoch": 0.0, + "grad_norm": 0.08449747413396835, + "learning_rate": 8.481818181818182e-05, + "loss": 0.3376, + "num_input_tokens_seen": 16414720000, + "step": 160300 + }, + { + "epoch": 0.0, + "grad_norm": 0.11084302514791489, + "learning_rate": 8.480808080808081e-05, + "loss": 0.3366, + "num_input_tokens_seen": 16424960000, + "step": 160400 + }, + { + "epoch": 0.0, + "grad_norm": 0.10482872277498245, + "learning_rate": 8.47979797979798e-05, + "loss": 0.3376, + "num_input_tokens_seen": 16435200000, + "step": 160500 + }, + { + "epoch": 0.0, + "grad_norm": 0.09571310132741928, + "learning_rate": 8.47878787878788e-05, + "loss": 0.3387, + "num_input_tokens_seen": 16445440000, + "step": 160600 + }, + { + "epoch": 0.0, + "grad_norm": 0.079682856798172, + "learning_rate": 8.477777777777778e-05, + "loss": 0.3394, + "num_input_tokens_seen": 16455680000, + "step": 160700 + }, + { + "epoch": 0.0, + "grad_norm": 0.11301018297672272, + "learning_rate": 8.476767676767677e-05, + "loss": 0.3388, + "num_input_tokens_seen": 16465920000, + "step": 160800 + }, + { + "epoch": 0.0, + "grad_norm": 0.056826554238796234, + "learning_rate": 8.475757575757576e-05, + "loss": 0.3397, + "num_input_tokens_seen": 16476160000, + "step": 160900 + }, + { + "epoch": 0.0, + "grad_norm": 0.08774473518133163, + "learning_rate": 8.474747474747475e-05, + "loss": 0.3402, + "num_input_tokens_seen": 16486400000, + "step": 161000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.33559368422263847, + "eval_average_loss_on_sentence_tokens": 0.3454802624593376, + "eval_average_shuffling_prob": 0.5089285714285714, + "eval_loss": 0.3360770046710968, + "eval_non_padding_tokens_in_labels": 133.69040178571427, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43790178571429, + "eval_padding_tokens_in_labels": 378.3095982142857, + "eval_reconstruction_accuracy": 0.9297436759574335, + "eval_runtime": 145.2699, + "eval_samples_per_second": 38.549, + "eval_sentence_accuracy": 0.7779694115199001, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.2499202806122449, + "num_input_tokens_seen": 16486400000, + "step": 161000 + }, + { + "epoch": 0.0, + "grad_norm": 0.08663614839315414, + "learning_rate": 8.473737373737374e-05, + "loss": 0.3381, + "num_input_tokens_seen": 16496640000, + "step": 161100 + }, + { + "epoch": 0.0, + "grad_norm": 0.07178923487663269, + "learning_rate": 8.472727272727274e-05, + "loss": 0.3375, + "num_input_tokens_seen": 16506880000, + "step": 161200 + }, + { + "epoch": 0.0, + "grad_norm": 0.06003859266638756, + "learning_rate": 8.471717171717171e-05, + "loss": 0.3374, + "num_input_tokens_seen": 16517120000, + "step": 161300 + }, + { + "epoch": 0.0, + "grad_norm": 0.08149775117635727, + "learning_rate": 8.470707070707072e-05, + "loss": 0.3389, + "num_input_tokens_seen": 16527360000, + "step": 161400 + }, + { + "epoch": 0.0, + "grad_norm": 0.07902319729328156, + "learning_rate": 8.46969696969697e-05, + "loss": 0.3354, + "num_input_tokens_seen": 16537600000, + "step": 161500 + }, + { + "epoch": 0.0, + "grad_norm": 0.058713752776384354, + "learning_rate": 8.468686868686869e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16547840000, + "step": 161600 + }, + { + "epoch": 0.0, + "grad_norm": 0.07324440032243729, + "learning_rate": 8.467676767676768e-05, + "loss": 0.3385, + "num_input_tokens_seen": 16558080000, + "step": 161700 + }, + { + "epoch": 0.0, + "grad_norm": 0.11213607341051102, + "learning_rate": 8.466666666666667e-05, + "loss": 0.3369, + "num_input_tokens_seen": 16568320000, + "step": 161800 + }, + { + "epoch": 0.0, + "grad_norm": 0.10989231616258621, + "learning_rate": 8.465656565656565e-05, + "loss": 0.3356, + "num_input_tokens_seen": 16578560000, + "step": 161900 + }, + { + "epoch": 0.0, + "grad_norm": 0.1024494618177414, + "learning_rate": 8.464646464646466e-05, + "loss": 0.335, + "num_input_tokens_seen": 16588800000, + "step": 162000 + }, + { + "epoch": 0.0, + "eval_average_loss_on_non_sentence_tokens": 0.33512028812274475, + "eval_average_loss_on_sentence_tokens": 0.35323863142755096, + "eval_average_shuffling_prob": 0.5267857142857143, + "eval_loss": 0.3360247015953064, + "eval_non_padding_tokens_in_labels": 133.695, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.431919642857146, + "eval_padding_tokens_in_labels": 378.305, + "eval_reconstruction_accuracy": 0.9298402092273244, + "eval_runtime": 183.0967, + "eval_samples_per_second": 30.585, + "eval_sentence_accuracy": 0.7721430343580181, + "eval_steps_per_second": 0.076, + "eval_variance_shuffling_prob": 0.24928252551020408, + "num_input_tokens_seen": 16588800000, + "step": 162000 + }, + { + "epoch": 0.0, + "grad_norm": 0.08310101926326752, + "learning_rate": 8.463636363636364e-05, + "loss": 0.3394, + "num_input_tokens_seen": 16599040000, + "step": 162100 + }, + { + "epoch": 0.0, + "grad_norm": 0.11079742014408112, + "learning_rate": 8.462626262626263e-05, + "loss": 0.337, + "num_input_tokens_seen": 16609280000, + "step": 162200 + }, + { + "epoch": 0.0, + "grad_norm": 0.10644812881946564, + "learning_rate": 8.461616161616162e-05, + "loss": 0.3386, + "num_input_tokens_seen": 16619520000, + "step": 162300 + }, + { + "epoch": 0.0, + "grad_norm": 0.06720196455717087, + "learning_rate": 8.460606060606061e-05, + "loss": 0.3389, + "num_input_tokens_seen": 16629760000, + "step": 162400 + }, + { + "epoch": 0.0, + "grad_norm": 0.1633644998073578, + "learning_rate": 8.459595959595959e-05, + "loss": 0.3366, + "num_input_tokens_seen": 16640000000, + "step": 162500 + }, + { + "epoch": 0.0, + "grad_norm": 0.07543274760246277, + "learning_rate": 8.45858585858586e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16650240000, + "step": 162600 + }, + { + "epoch": 0.0, + "grad_norm": 0.17613619565963745, + "learning_rate": 8.457575757575757e-05, + "loss": 0.3358, + "num_input_tokens_seen": 16660480000, + "step": 162700 + }, + { + "epoch": 0.0, + "grad_norm": 0.1030188798904419, + "learning_rate": 8.456565656565657e-05, + "loss": 0.3385, + "num_input_tokens_seen": 16670720000, + "step": 162800 + }, + { + "epoch": 0.0, + "grad_norm": 0.11985214799642563, + "learning_rate": 8.455555555555556e-05, + "loss": 0.3371, + "num_input_tokens_seen": 16680960000, + "step": 162900 + }, + { + "epoch": 0.01, + "grad_norm": 0.07139596343040466, + "learning_rate": 8.454545454545455e-05, + "loss": 0.3377, + "num_input_tokens_seen": 16691200000, + "step": 163000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33531798052456785, + "eval_average_loss_on_sentence_tokens": 0.35929313228182286, + "eval_average_shuffling_prob": 0.5401785714285714, + "eval_loss": 0.33636474609375, + "eval_non_padding_tokens_in_labels": 133.67558035714285, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.432276785714286, + "eval_padding_tokens_in_labels": 378.32441964285715, + "eval_reconstruction_accuracy": 0.9297775724800587, + "eval_runtime": 195.9667, + "eval_samples_per_second": 28.576, + "eval_sentence_accuracy": 0.7656123698468975, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.24838568239795922, + "num_input_tokens_seen": 16691200000, + "step": 163000 + }, + { + "epoch": 0.01, + "grad_norm": 0.11122092604637146, + "learning_rate": 8.453535353535353e-05, + "loss": 0.3379, + "num_input_tokens_seen": 16701440000, + "step": 163100 + }, + { + "epoch": 0.01, + "grad_norm": 0.06733439862728119, + "learning_rate": 8.452525252525253e-05, + "loss": 0.3411, + "num_input_tokens_seen": 16711680000, + "step": 163200 + }, + { + "epoch": 0.01, + "grad_norm": 0.06713900715112686, + "learning_rate": 8.451515151515151e-05, + "loss": 0.3366, + "num_input_tokens_seen": 16721920000, + "step": 163300 + }, + { + "epoch": 0.01, + "grad_norm": 0.05864796042442322, + "learning_rate": 8.45050505050505e-05, + "loss": 0.3385, + "num_input_tokens_seen": 16732160000, + "step": 163400 + }, + { + "epoch": 0.01, + "grad_norm": 0.10550223290920258, + "learning_rate": 8.44949494949495e-05, + "loss": 0.335, + "num_input_tokens_seen": 16742400000, + "step": 163500 + }, + { + "epoch": 0.01, + "grad_norm": 0.08160553127527237, + "learning_rate": 8.448484848484849e-05, + "loss": 0.3373, + "num_input_tokens_seen": 16752640000, + "step": 163600 + }, + { + "epoch": 0.01, + "grad_norm": 0.08414066582918167, + "learning_rate": 8.447474747474748e-05, + "loss": 0.3349, + "num_input_tokens_seen": 16762880000, + "step": 163700 + }, + { + "epoch": 0.01, + "grad_norm": 0.08111968636512756, + "learning_rate": 8.446464646464647e-05, + "loss": 0.3367, + "num_input_tokens_seen": 16773120000, + "step": 163800 + }, + { + "epoch": 0.01, + "grad_norm": 0.07223828881978989, + "learning_rate": 8.445454545454546e-05, + "loss": 0.3377, + "num_input_tokens_seen": 16783360000, + "step": 163900 + }, + { + "epoch": 0.01, + "grad_norm": 0.06872010231018066, + "learning_rate": 8.444444444444444e-05, + "loss": 0.3369, + "num_input_tokens_seen": 16793600000, + "step": 164000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33476428674165715, + "eval_average_loss_on_sentence_tokens": 0.30608747116229235, + "eval_average_shuffling_prob": 0.44642857142857145, + "eval_loss": 0.3334612250328064, + "eval_non_padding_tokens_in_labels": 133.70379464285713, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42892857142857, + "eval_padding_tokens_in_labels": 378.29620535714287, + "eval_reconstruction_accuracy": 0.9298813855356067, + "eval_runtime": 258.0274, + "eval_samples_per_second": 21.703, + "eval_sentence_accuracy": 0.8062409463060929, + "eval_steps_per_second": 0.054, + "eval_variance_shuffling_prob": 0.24713010204081634, + "num_input_tokens_seen": 16793600000, + "step": 164000 + }, + { + "epoch": 0.01, + "grad_norm": 0.10583063215017319, + "learning_rate": 8.443434343434345e-05, + "loss": 0.3355, + "num_input_tokens_seen": 16803840000, + "step": 164100 + }, + { + "epoch": 0.01, + "grad_norm": 0.06413723528385162, + "learning_rate": 8.442424242424243e-05, + "loss": 0.3375, + "num_input_tokens_seen": 16814080000, + "step": 164200 + }, + { + "epoch": 0.01, + "grad_norm": 0.056956853717565536, + "learning_rate": 8.441414141414142e-05, + "loss": 0.3356, + "num_input_tokens_seen": 16824320000, + "step": 164300 + }, + { + "epoch": 0.01, + "grad_norm": 0.06679971516132355, + "learning_rate": 8.440404040404041e-05, + "loss": 0.3364, + "num_input_tokens_seen": 16834560000, + "step": 164400 + }, + { + "epoch": 0.01, + "grad_norm": 0.14127741754055023, + "learning_rate": 8.43939393939394e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16844800000, + "step": 164500 + }, + { + "epoch": 0.01, + "grad_norm": 0.11149435490369797, + "learning_rate": 8.438383838383838e-05, + "loss": 0.3374, + "num_input_tokens_seen": 16855040000, + "step": 164600 + }, + { + "epoch": 0.01, + "grad_norm": 0.06579273194074631, + "learning_rate": 8.437373737373739e-05, + "loss": 0.3378, + "num_input_tokens_seen": 16865280000, + "step": 164700 + }, + { + "epoch": 0.01, + "grad_norm": 0.07448717951774597, + "learning_rate": 8.436363636363637e-05, + "loss": 0.3368, + "num_input_tokens_seen": 16875520000, + "step": 164800 + }, + { + "epoch": 0.01, + "grad_norm": 0.06106333062052727, + "learning_rate": 8.435353535353536e-05, + "loss": 0.3352, + "num_input_tokens_seen": 16885760000, + "step": 164900 + }, + { + "epoch": 0.01, + "grad_norm": 0.06262733042240143, + "learning_rate": 8.434343434343435e-05, + "loss": 0.3385, + "num_input_tokens_seen": 16896000000, + "step": 165000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33557849933623074, + "eval_average_loss_on_sentence_tokens": 0.3523027935171868, + "eval_average_shuffling_prob": 0.5267857142857143, + "eval_loss": 0.3364083468914032, + "eval_non_padding_tokens_in_labels": 133.6938392857143, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.433214285714286, + "eval_padding_tokens_in_labels": 378.30616071428574, + "eval_reconstruction_accuracy": 0.9297935503976137, + "eval_runtime": 214.1442, + "eval_samples_per_second": 26.151, + "eval_sentence_accuracy": 0.770606407414225, + "eval_steps_per_second": 0.065, + "eval_variance_shuffling_prob": 0.24928252551020408, + "num_input_tokens_seen": 16896000000, + "step": 165000 + }, + { + "epoch": 0.01, + "grad_norm": 0.12985961139202118, + "learning_rate": 8.433333333333334e-05, + "loss": 0.3376, + "num_input_tokens_seen": 16906240000, + "step": 165100 + }, + { + "epoch": 0.01, + "grad_norm": 0.08108349144458771, + "learning_rate": 8.432323232323232e-05, + "loss": 0.3377, + "num_input_tokens_seen": 16916480000, + "step": 165200 + }, + { + "epoch": 0.01, + "grad_norm": 0.08325790613889694, + "learning_rate": 8.431313131313133e-05, + "loss": 0.3381, + "num_input_tokens_seen": 16926720000, + "step": 165300 + }, + { + "epoch": 0.01, + "grad_norm": 0.15483540296554565, + "learning_rate": 8.43030303030303e-05, + "loss": 0.3365, + "num_input_tokens_seen": 16936960000, + "step": 165400 + }, + { + "epoch": 0.01, + "grad_norm": 0.07961862534284592, + "learning_rate": 8.42929292929293e-05, + "loss": 0.3355, + "num_input_tokens_seen": 16947200000, + "step": 165500 + }, + { + "epoch": 0.01, + "grad_norm": 0.11110538244247437, + "learning_rate": 8.428282828282829e-05, + "loss": 0.3354, + "num_input_tokens_seen": 16957440000, + "step": 165600 + }, + { + "epoch": 0.01, + "grad_norm": 0.17909066379070282, + "learning_rate": 8.427272727272728e-05, + "loss": 0.3366, + "num_input_tokens_seen": 16967680000, + "step": 165700 + }, + { + "epoch": 0.01, + "grad_norm": 0.0885913223028183, + "learning_rate": 8.426262626262627e-05, + "loss": 0.3379, + "num_input_tokens_seen": 16977920000, + "step": 165800 + }, + { + "epoch": 0.01, + "grad_norm": 0.11947090178728104, + "learning_rate": 8.425252525252526e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16988160000, + "step": 165900 + }, + { + "epoch": 0.01, + "grad_norm": 0.07127726823091507, + "learning_rate": 8.424242424242424e-05, + "loss": 0.3372, + "num_input_tokens_seen": 16998400000, + "step": 166000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3354911618124106, + "eval_average_loss_on_sentence_tokens": 0.32440176839097656, + "eval_average_shuffling_prob": 0.48660714285714285, + "eval_loss": 0.3349696695804596, + "eval_non_padding_tokens_in_labels": 133.70647321428572, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42959821428571, + "eval_padding_tokens_in_labels": 378.2935267857143, + "eval_reconstruction_accuracy": 0.9298481629614324, + "eval_runtime": 186.5735, + "eval_samples_per_second": 30.015, + "eval_sentence_accuracy": 0.7907386213575138, + "eval_steps_per_second": 0.075, + "eval_variance_shuffling_prob": 0.24982063137755106, + "num_input_tokens_seen": 16998400000, + "step": 166000 + }, + { + "epoch": 0.01, + "grad_norm": 0.0701250210404396, + "learning_rate": 8.423232323232323e-05, + "loss": 0.3374, + "num_input_tokens_seen": 17008640000, + "step": 166100 + }, + { + "epoch": 0.01, + "grad_norm": 0.14006154239177704, + "learning_rate": 8.422222222222223e-05, + "loss": 0.3385, + "num_input_tokens_seen": 17018880000, + "step": 166200 + }, + { + "epoch": 0.01, + "grad_norm": 0.131784126162529, + "learning_rate": 8.421212121212122e-05, + "loss": 0.337, + "num_input_tokens_seen": 17029120000, + "step": 166300 + }, + { + "epoch": 0.01, + "grad_norm": 0.07773324102163315, + "learning_rate": 8.420202020202021e-05, + "loss": 0.3361, + "num_input_tokens_seen": 17039360000, + "step": 166400 + }, + { + "epoch": 0.01, + "grad_norm": 0.06531457602977753, + "learning_rate": 8.41919191919192e-05, + "loss": 0.3365, + "num_input_tokens_seen": 17049600000, + "step": 166500 + }, + { + "epoch": 0.01, + "grad_norm": 0.06206774711608887, + "learning_rate": 8.418181818181818e-05, + "loss": 0.3344, + "num_input_tokens_seen": 17059840000, + "step": 166600 + }, + { + "epoch": 0.01, + "grad_norm": 0.06460825353860855, + "learning_rate": 8.417171717171719e-05, + "loss": 0.3405, + "num_input_tokens_seen": 17070080000, + "step": 166700 + }, + { + "epoch": 0.01, + "grad_norm": 0.12138912081718445, + "learning_rate": 8.416161616161616e-05, + "loss": 0.3387, + "num_input_tokens_seen": 17080320000, + "step": 166800 + }, + { + "epoch": 0.01, + "grad_norm": 0.059784505516290665, + "learning_rate": 8.415151515151516e-05, + "loss": 0.3375, + "num_input_tokens_seen": 17090560000, + "step": 166900 + }, + { + "epoch": 0.01, + "grad_norm": 0.08821374922990799, + "learning_rate": 8.414141414141415e-05, + "loss": 0.3372, + "num_input_tokens_seen": 17100800000, + "step": 167000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3351645679737958, + "eval_average_loss_on_sentence_tokens": 0.3581120833614873, + "eval_average_shuffling_prob": 0.5357142857142857, + "eval_loss": 0.3362339437007904, + "eval_non_padding_tokens_in_labels": 133.66035714285715, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.418482142857144, + "eval_padding_tokens_in_labels": 378.33964285714285, + "eval_reconstruction_accuracy": 0.9298211553435228, + "eval_runtime": 172.7333, + "eval_samples_per_second": 32.42, + "eval_sentence_accuracy": 0.7670089396473762, + "eval_steps_per_second": 0.081, + "eval_variance_shuffling_prob": 0.24872448979591832, + "num_input_tokens_seen": 17100800000, + "step": 167000 + }, + { + "epoch": 0.01, + "grad_norm": 0.18139801919460297, + "learning_rate": 8.413131313131314e-05, + "loss": 0.34, + "num_input_tokens_seen": 17111040000, + "step": 167100 + }, + { + "epoch": 0.01, + "grad_norm": 0.062222208827733994, + "learning_rate": 8.412121212121212e-05, + "loss": 0.3378, + "num_input_tokens_seen": 17121280000, + "step": 167200 + }, + { + "epoch": 0.01, + "grad_norm": 0.06151258945465088, + "learning_rate": 8.411111111111112e-05, + "loss": 0.3364, + "num_input_tokens_seen": 17131520000, + "step": 167300 + }, + { + "epoch": 0.01, + "grad_norm": 0.09990616887807846, + "learning_rate": 8.41010101010101e-05, + "loss": 0.3356, + "num_input_tokens_seen": 17141760000, + "step": 167400 + }, + { + "epoch": 0.01, + "grad_norm": 0.06910213083028793, + "learning_rate": 8.40909090909091e-05, + "loss": 0.3372, + "num_input_tokens_seen": 17152000000, + "step": 167500 + }, + { + "epoch": 0.01, + "grad_norm": 0.13889914751052856, + "learning_rate": 8.408080808080809e-05, + "loss": 0.3385, + "num_input_tokens_seen": 17162240000, + "step": 167600 + }, + { + "epoch": 0.01, + "grad_norm": 0.08748584240674973, + "learning_rate": 8.407070707070708e-05, + "loss": 0.3352, + "num_input_tokens_seen": 17172480000, + "step": 167700 + }, + { + "epoch": 0.01, + "grad_norm": 0.05987832322716713, + "learning_rate": 8.406060606060606e-05, + "loss": 0.3344, + "num_input_tokens_seen": 17182720000, + "step": 167800 + }, + { + "epoch": 0.01, + "grad_norm": 0.15579642355442047, + "learning_rate": 8.405050505050506e-05, + "loss": 0.3373, + "num_input_tokens_seen": 17192960000, + "step": 167900 + }, + { + "epoch": 0.01, + "grad_norm": 0.08386600762605667, + "learning_rate": 8.404040404040404e-05, + "loss": 0.337, + "num_input_tokens_seen": 17203200000, + "step": 168000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3346613183804387, + "eval_average_loss_on_sentence_tokens": 0.3336976758496868, + "eval_average_shuffling_prob": 0.49107142857142855, + "eval_loss": 0.3345685601234436, + "eval_non_padding_tokens_in_labels": 133.6550892857143, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.418348214285714, + "eval_padding_tokens_in_labels": 378.34491071428573, + "eval_reconstruction_accuracy": 0.9298096000618984, + "eval_runtime": 171.1441, + "eval_samples_per_second": 32.721, + "eval_sentence_accuracy": 0.786704975630057, + "eval_steps_per_second": 0.082, + "eval_variance_shuffling_prob": 0.24992028061224475, + "num_input_tokens_seen": 17203200000, + "step": 168000 + }, + { + "epoch": 0.01, + "grad_norm": 0.09895853698253632, + "learning_rate": 8.403030303030303e-05, + "loss": 0.3369, + "num_input_tokens_seen": 17213440000, + "step": 168100 + }, + { + "epoch": 0.01, + "grad_norm": 0.10319909453392029, + "learning_rate": 8.402020202020202e-05, + "loss": 0.3363, + "num_input_tokens_seen": 17223680000, + "step": 168200 + }, + { + "epoch": 0.01, + "grad_norm": 0.07180842757225037, + "learning_rate": 8.401010101010102e-05, + "loss": 0.3363, + "num_input_tokens_seen": 17233920000, + "step": 168300 + }, + { + "epoch": 0.01, + "grad_norm": 0.07396452128887177, + "learning_rate": 8.4e-05, + "loss": 0.3348, + "num_input_tokens_seen": 17244160000, + "step": 168400 + }, + { + "epoch": 0.01, + "grad_norm": 0.06004451587796211, + "learning_rate": 8.3989898989899e-05, + "loss": 0.3387, + "num_input_tokens_seen": 17254400000, + "step": 168500 + }, + { + "epoch": 0.01, + "grad_norm": 0.0697472020983696, + "learning_rate": 8.397979797979798e-05, + "loss": 0.3369, + "num_input_tokens_seen": 17264640000, + "step": 168600 + }, + { + "epoch": 0.01, + "grad_norm": 0.06644177436828613, + "learning_rate": 8.396969696969697e-05, + "loss": 0.3379, + "num_input_tokens_seen": 17274880000, + "step": 168700 + }, + { + "epoch": 0.01, + "grad_norm": 0.05797303840517998, + "learning_rate": 8.395959595959596e-05, + "loss": 0.337, + "num_input_tokens_seen": 17285120000, + "step": 168800 + }, + { + "epoch": 0.01, + "grad_norm": 0.08798923343420029, + "learning_rate": 8.394949494949496e-05, + "loss": 0.3361, + "num_input_tokens_seen": 17295360000, + "step": 168900 + }, + { + "epoch": 0.01, + "grad_norm": 0.07363209873437881, + "learning_rate": 8.393939393939393e-05, + "loss": 0.3349, + "num_input_tokens_seen": 17305600000, + "step": 169000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.3342708878124493, + "eval_average_loss_on_sentence_tokens": 0.2886284713843065, + "eval_average_shuffling_prob": 0.4107142857142857, + "eval_loss": 0.33221435546875, + "eval_non_padding_tokens_in_labels": 133.67107142857142, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.433169642857145, + "eval_padding_tokens_in_labels": 378.32892857142855, + "eval_reconstruction_accuracy": 0.9299432075065357, + "eval_runtime": 154.2226, + "eval_samples_per_second": 36.311, + "eval_sentence_accuracy": 0.8217592777853364, + "eval_steps_per_second": 0.091, + "eval_variance_shuffling_prob": 0.2420280612244898, + "num_input_tokens_seen": 17305600000, + "step": 169000 + }, + { + "epoch": 0.01, + "grad_norm": 0.05402859300374985, + "learning_rate": 8.392929292929294e-05, + "loss": 0.3355, + "num_input_tokens_seen": 17315840000, + "step": 169100 + }, + { + "epoch": 0.01, + "grad_norm": 0.08121149241924286, + "learning_rate": 8.391919191919192e-05, + "loss": 0.3355, + "num_input_tokens_seen": 17326080000, + "step": 169200 + }, + { + "epoch": 0.01, + "grad_norm": 0.057257458567619324, + "learning_rate": 8.390909090909091e-05, + "loss": 0.3391, + "num_input_tokens_seen": 17336320000, + "step": 169300 + }, + { + "epoch": 0.01, + "grad_norm": 0.08611832559108734, + "learning_rate": 8.38989898989899e-05, + "loss": 0.3377, + "num_input_tokens_seen": 17346560000, + "step": 169400 + }, + { + "epoch": 0.01, + "grad_norm": 0.08593618869781494, + "learning_rate": 8.38888888888889e-05, + "loss": 0.3363, + "num_input_tokens_seen": 17356800000, + "step": 169500 + }, + { + "epoch": 0.01, + "grad_norm": 0.30592864751815796, + "learning_rate": 8.387878787878789e-05, + "loss": 0.3359, + "num_input_tokens_seen": 17367040000, + "step": 169600 + }, + { + "epoch": 0.01, + "grad_norm": 0.05819205194711685, + "learning_rate": 8.386868686868688e-05, + "loss": 0.3377, + "num_input_tokens_seen": 17377280000, + "step": 169700 + }, + { + "epoch": 0.01, + "grad_norm": 0.06875032931566238, + "learning_rate": 8.385858585858586e-05, + "loss": 0.3359, + "num_input_tokens_seen": 17387520000, + "step": 169800 + }, + { + "epoch": 0.01, + "grad_norm": 0.07274442911148071, + "learning_rate": 8.384848484848485e-05, + "loss": 0.3342, + "num_input_tokens_seen": 17397760000, + "step": 169900 + }, + { + "epoch": 0.01, + "grad_norm": 0.174887552857399, + "learning_rate": 8.383838383838384e-05, + "loss": 0.3354, + "num_input_tokens_seen": 17408000000, + "step": 170000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33486019255526167, + "eval_average_loss_on_sentence_tokens": 0.31112505067714313, + "eval_average_shuffling_prob": 0.45535714285714285, + "eval_loss": 0.3338361382484436, + "eval_non_padding_tokens_in_labels": 133.69241071428573, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.432276785714286, + "eval_padding_tokens_in_labels": 378.3075892857143, + "eval_reconstruction_accuracy": 0.9299015777475537, + "eval_runtime": 171.665, + "eval_samples_per_second": 32.622, + "eval_sentence_accuracy": 0.8033677740518131, + "eval_steps_per_second": 0.082, + "eval_variance_shuffling_prob": 0.24800701530612246, + "num_input_tokens_seen": 17408000000, + "step": 170000 + }, + { + "epoch": 0.01, + "grad_norm": 0.06349579989910126, + "learning_rate": 8.382828282828283e-05, + "loss": 0.3376, + "num_input_tokens_seen": 17418240000, + "step": 170100 + }, + { + "epoch": 0.01, + "grad_norm": 0.07009556144475937, + "learning_rate": 8.381818181818182e-05, + "loss": 0.338, + "num_input_tokens_seen": 17428480000, + "step": 170200 + }, + { + "epoch": 0.01, + "grad_norm": 0.0947631374001503, + "learning_rate": 8.380808080808082e-05, + "loss": 0.3375, + "num_input_tokens_seen": 17438720000, + "step": 170300 + }, + { + "epoch": 0.01, + "grad_norm": 0.15596534311771393, + "learning_rate": 8.37979797979798e-05, + "loss": 0.3375, + "num_input_tokens_seen": 17448960000, + "step": 170400 + }, + { + "epoch": 0.01, + "grad_norm": 0.12233918905258179, + "learning_rate": 8.378787878787879e-05, + "loss": 0.3359, + "num_input_tokens_seen": 17459200000, + "step": 170500 + }, + { + "epoch": 0.01, + "grad_norm": 0.12725937366485596, + "learning_rate": 8.377777777777778e-05, + "loss": 0.3375, + "num_input_tokens_seen": 17469440000, + "step": 170600 + }, + { + "epoch": 0.01, + "grad_norm": 0.06242549046874046, + "learning_rate": 8.376767676767677e-05, + "loss": 0.3368, + "num_input_tokens_seen": 17479680000, + "step": 170700 + }, + { + "epoch": 0.01, + "grad_norm": 0.0646473839879036, + "learning_rate": 8.375757575757576e-05, + "loss": 0.3344, + "num_input_tokens_seen": 17489920000, + "step": 170800 + }, + { + "epoch": 0.01, + "grad_norm": 0.1745670884847641, + "learning_rate": 8.374747474747475e-05, + "loss": 0.3362, + "num_input_tokens_seen": 17500160000, + "step": 170900 + }, + { + "epoch": 0.01, + "grad_norm": 0.08850549906492233, + "learning_rate": 8.373737373737373e-05, + "loss": 0.3372, + "num_input_tokens_seen": 17510400000, + "step": 171000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33535992331162706, + "eval_average_loss_on_sentence_tokens": 0.3781250603299323, + "eval_average_shuffling_prob": 0.5758928571428571, + "eval_loss": 0.3372977077960968, + "eval_non_padding_tokens_in_labels": 133.6692857142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.432723214285716, + "eval_padding_tokens_in_labels": 378.3307142857143, + "eval_reconstruction_accuracy": 0.92984975189818, + "eval_runtime": 145.5983, + "eval_samples_per_second": 38.462, + "eval_sentence_accuracy": 0.7504902000016006, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.24424027423469385, + "num_input_tokens_seen": 17510400000, + "step": 171000 + }, + { + "epoch": 0.01, + "grad_norm": 0.1790263056755066, + "learning_rate": 8.372727272727274e-05, + "loss": 0.337, + "num_input_tokens_seen": 17520640000, + "step": 171100 + }, + { + "epoch": 0.01, + "grad_norm": 0.058594960719347, + "learning_rate": 8.371717171717172e-05, + "loss": 0.3369, + "num_input_tokens_seen": 17530880000, + "step": 171200 + }, + { + "epoch": 0.01, + "grad_norm": 0.10256171971559525, + "learning_rate": 8.370707070707071e-05, + "loss": 0.3365, + "num_input_tokens_seen": 17541120000, + "step": 171300 + }, + { + "epoch": 0.01, + "grad_norm": 0.09493178129196167, + "learning_rate": 8.36969696969697e-05, + "loss": 0.3387, + "num_input_tokens_seen": 17551360000, + "step": 171400 + }, + { + "epoch": 0.01, + "grad_norm": 0.1417931765317917, + "learning_rate": 8.368686868686869e-05, + "loss": 0.3386, + "num_input_tokens_seen": 17561600000, + "step": 171500 + }, + { + "epoch": 0.01, + "grad_norm": 0.060962386429309845, + "learning_rate": 8.367676767676767e-05, + "loss": 0.3349, + "num_input_tokens_seen": 17571840000, + "step": 171600 + }, + { + "epoch": 0.01, + "grad_norm": 0.10023783892393112, + "learning_rate": 8.366666666666668e-05, + "loss": 0.3351, + "num_input_tokens_seen": 17582080000, + "step": 171700 + }, + { + "epoch": 0.01, + "grad_norm": 0.07007241994142532, + "learning_rate": 8.365656565656565e-05, + "loss": 0.3377, + "num_input_tokens_seen": 17592320000, + "step": 171800 + }, + { + "epoch": 0.01, + "grad_norm": 0.05791827291250229, + "learning_rate": 8.364646464646465e-05, + "loss": 0.3327, + "num_input_tokens_seen": 17602560000, + "step": 171900 + }, + { + "epoch": 0.01, + "grad_norm": 0.12020015716552734, + "learning_rate": 8.363636363636364e-05, + "loss": 0.3344, + "num_input_tokens_seen": 17612800000, + "step": 172000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33447587297276343, + "eval_average_loss_on_sentence_tokens": 0.3191928217033312, + "eval_average_shuffling_prob": 0.4732142857142857, + "eval_loss": 0.333740234375, + "eval_non_padding_tokens_in_labels": 133.66758928571429, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43200892857143, + "eval_padding_tokens_in_labels": 378.3324107142857, + "eval_reconstruction_accuracy": 0.9299497254510447, + "eval_runtime": 178.6957, + "eval_samples_per_second": 31.338, + "eval_sentence_accuracy": 0.7981576483205147, + "eval_steps_per_second": 0.078, + "eval_variance_shuffling_prob": 0.24928252551020408, + "num_input_tokens_seen": 17612800000, + "step": 172000 + }, + { + "epoch": 0.01, + "grad_norm": 0.0740642100572586, + "learning_rate": 8.362626262626263e-05, + "loss": 0.3341, + "num_input_tokens_seen": 17623040000, + "step": 172100 + }, + { + "epoch": 0.01, + "grad_norm": 0.09095675498247147, + "learning_rate": 8.361616161616162e-05, + "loss": 0.3373, + "num_input_tokens_seen": 17633280000, + "step": 172200 + }, + { + "epoch": 0.01, + "grad_norm": 0.061410605907440186, + "learning_rate": 8.360606060606061e-05, + "loss": 0.3363, + "num_input_tokens_seen": 17643520000, + "step": 172300 + }, + { + "epoch": 0.01, + "grad_norm": 0.1085314229130745, + "learning_rate": 8.359595959595961e-05, + "loss": 0.3364, + "num_input_tokens_seen": 17653760000, + "step": 172400 + }, + { + "epoch": 0.01, + "grad_norm": 0.06799034029245377, + "learning_rate": 8.358585858585859e-05, + "loss": 0.3383, + "num_input_tokens_seen": 17664000000, + "step": 172500 + }, + { + "epoch": 0.01, + "grad_norm": 0.07749553769826889, + "learning_rate": 8.357575757575759e-05, + "loss": 0.3395, + "num_input_tokens_seen": 17674240000, + "step": 172600 + }, + { + "epoch": 0.01, + "grad_norm": 0.13318020105361938, + "learning_rate": 8.356565656565657e-05, + "loss": 0.3391, + "num_input_tokens_seen": 17684480000, + "step": 172700 + }, + { + "epoch": 0.01, + "grad_norm": 0.05708346888422966, + "learning_rate": 8.355555555555556e-05, + "loss": 0.3372, + "num_input_tokens_seen": 17694720000, + "step": 172800 + }, + { + "epoch": 0.01, + "grad_norm": 0.0807466059923172, + "learning_rate": 8.354545454545455e-05, + "loss": 0.3354, + "num_input_tokens_seen": 17704960000, + "step": 172900 + }, + { + "epoch": 0.01, + "grad_norm": 0.07279369980096817, + "learning_rate": 8.353535353535355e-05, + "loss": 0.3392, + "num_input_tokens_seen": 17715200000, + "step": 173000 + }, + { + "epoch": 0.01, + "eval_average_loss_on_non_sentence_tokens": 0.33411282527091735, + "eval_average_loss_on_sentence_tokens": 0.3426401705647847, + "eval_average_shuffling_prob": 0.5, + "eval_loss": 0.3345511257648468, + "eval_non_padding_tokens_in_labels": 133.6564732142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43151785714286, + "eval_padding_tokens_in_labels": 378.3435267857143, + "eval_reconstruction_accuracy": 0.9299101242942164, + "eval_runtime": 146.4606, + "eval_samples_per_second": 38.236, + "eval_sentence_accuracy": 0.7835236776604855, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.25, + "num_input_tokens_seen": 17715200000, + "step": 173000 + }, + { + "epoch": 0.02, + "grad_norm": 0.07056129723787308, + "learning_rate": 8.352525252525252e-05, + "loss": 0.3381, + "num_input_tokens_seen": 17725440000, + "step": 173100 + }, + { + "epoch": 0.02, + "grad_norm": 0.058231934905052185, + "learning_rate": 8.351515151515153e-05, + "loss": 0.3383, + "num_input_tokens_seen": 17735680000, + "step": 173200 + }, + { + "epoch": 0.02, + "grad_norm": 0.10691957920789719, + "learning_rate": 8.350505050505051e-05, + "loss": 0.3354, + "num_input_tokens_seen": 17745920000, + "step": 173300 + }, + { + "epoch": 0.02, + "grad_norm": 0.1006227657198906, + "learning_rate": 8.34949494949495e-05, + "loss": 0.3372, + "num_input_tokens_seen": 17756160000, + "step": 173400 + }, + { + "epoch": 0.02, + "grad_norm": 0.06299128383398056, + "learning_rate": 8.348484848484849e-05, + "loss": 0.3373, + "num_input_tokens_seen": 17766400000, + "step": 173500 + }, + { + "epoch": 0.02, + "grad_norm": 0.05527783930301666, + "learning_rate": 8.347474747474748e-05, + "loss": 0.3373, + "num_input_tokens_seen": 17776640000, + "step": 173600 + }, + { + "epoch": 0.02, + "grad_norm": 0.08428135514259338, + "learning_rate": 8.346464646464646e-05, + "loss": 0.3389, + "num_input_tokens_seen": 17786880000, + "step": 173700 + }, + { + "epoch": 0.02, + "grad_norm": 0.09534034878015518, + "learning_rate": 8.345454545454547e-05, + "loss": 0.332, + "num_input_tokens_seen": 17797120000, + "step": 173800 + }, + { + "epoch": 0.02, + "grad_norm": 0.1154632493853569, + "learning_rate": 8.344444444444445e-05, + "loss": 0.3363, + "num_input_tokens_seen": 17807360000, + "step": 173900 + }, + { + "epoch": 0.02, + "grad_norm": 0.11197901517152786, + "learning_rate": 8.343434343434344e-05, + "loss": 0.3367, + "num_input_tokens_seen": 17817600000, + "step": 174000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3344556205652596, + "eval_average_loss_on_sentence_tokens": 0.3291161001283821, + "eval_average_shuffling_prob": 0.48660714285714285, + "eval_loss": 0.3342023491859436, + "eval_non_padding_tokens_in_labels": 133.694375, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42825892857143, + "eval_padding_tokens_in_labels": 378.305625, + "eval_reconstruction_accuracy": 0.9298889558224698, + "eval_runtime": 151.2881, + "eval_samples_per_second": 37.015, + "eval_sentence_accuracy": 0.7886577723711274, + "eval_steps_per_second": 0.093, + "eval_variance_shuffling_prob": 0.24982063137755106, + "num_input_tokens_seen": 17817600000, + "step": 174000 + }, + { + "epoch": 0.02, + "grad_norm": 0.07572993636131287, + "learning_rate": 8.342424242424243e-05, + "loss": 0.3372, + "num_input_tokens_seen": 17827840000, + "step": 174100 + }, + { + "epoch": 0.02, + "grad_norm": 0.08452825993299484, + "learning_rate": 8.341414141414142e-05, + "loss": 0.3377, + "num_input_tokens_seen": 17838080000, + "step": 174200 + }, + { + "epoch": 0.02, + "grad_norm": 0.10798559337854385, + "learning_rate": 8.34040404040404e-05, + "loss": 0.3367, + "num_input_tokens_seen": 17848320000, + "step": 174300 + }, + { + "epoch": 0.02, + "grad_norm": 0.05695834010839462, + "learning_rate": 8.33939393939394e-05, + "loss": 0.3366, + "num_input_tokens_seen": 17858560000, + "step": 174400 + }, + { + "epoch": 0.02, + "grad_norm": 0.05930950492620468, + "learning_rate": 8.338383838383838e-05, + "loss": 0.3362, + "num_input_tokens_seen": 17868800000, + "step": 174500 + }, + { + "epoch": 0.02, + "grad_norm": 0.06369196623563766, + "learning_rate": 8.337373737373738e-05, + "loss": 0.3362, + "num_input_tokens_seen": 17879040000, + "step": 174600 + }, + { + "epoch": 0.02, + "grad_norm": 0.09154227375984192, + "learning_rate": 8.336363636363637e-05, + "loss": 0.335, + "num_input_tokens_seen": 17889280000, + "step": 174700 + }, + { + "epoch": 0.02, + "grad_norm": 0.07741615921258926, + "learning_rate": 8.335353535353536e-05, + "loss": 0.335, + "num_input_tokens_seen": 17899520000, + "step": 174800 + }, + { + "epoch": 0.02, + "grad_norm": 0.0907118022441864, + "learning_rate": 8.334343434343435e-05, + "loss": 0.3369, + "num_input_tokens_seen": 17909760000, + "step": 174900 + }, + { + "epoch": 0.02, + "grad_norm": 0.11347772926092148, + "learning_rate": 8.333333333333334e-05, + "loss": 0.3383, + "num_input_tokens_seen": 17920000000, + "step": 175000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33414522889783843, + "eval_average_loss_on_sentence_tokens": 0.3224868963386971, + "eval_average_shuffling_prob": 0.48214285714285715, + "eval_loss": 0.3335919976234436, + "eval_non_padding_tokens_in_labels": 133.66316964285716, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42959821428571, + "eval_padding_tokens_in_labels": 378.33683035714284, + "eval_reconstruction_accuracy": 0.930002949621902, + "eval_runtime": 147.0424, + "eval_samples_per_second": 38.084, + "eval_sentence_accuracy": 0.793435721774484, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24968112244897953, + "num_input_tokens_seen": 17920000000, + "step": 175000 + }, + { + "epoch": 0.02, + "grad_norm": 0.06346926093101501, + "learning_rate": 8.332323232323232e-05, + "loss": 0.3337, + "num_input_tokens_seen": 17930240000, + "step": 175100 + }, + { + "epoch": 0.02, + "grad_norm": 0.11800475418567657, + "learning_rate": 8.331313131313131e-05, + "loss": 0.3356, + "num_input_tokens_seen": 17940480000, + "step": 175200 + }, + { + "epoch": 0.02, + "grad_norm": 0.09325616806745529, + "learning_rate": 8.33030303030303e-05, + "loss": 0.3354, + "num_input_tokens_seen": 17950720000, + "step": 175300 + }, + { + "epoch": 0.02, + "grad_norm": 0.09561218321323395, + "learning_rate": 8.32929292929293e-05, + "loss": 0.3355, + "num_input_tokens_seen": 17960960000, + "step": 175400 + }, + { + "epoch": 0.02, + "grad_norm": 0.18935689330101013, + "learning_rate": 8.328282828282829e-05, + "loss": 0.3381, + "num_input_tokens_seen": 17971200000, + "step": 175500 + }, + { + "epoch": 0.02, + "grad_norm": 0.059444278478622437, + "learning_rate": 8.327272727272728e-05, + "loss": 0.337, + "num_input_tokens_seen": 17981440000, + "step": 175600 + }, + { + "epoch": 0.02, + "grad_norm": 0.08911482989788055, + "learning_rate": 8.326262626262626e-05, + "loss": 0.3342, + "num_input_tokens_seen": 17991680000, + "step": 175700 + }, + { + "epoch": 0.02, + "grad_norm": 0.07786991447210312, + "learning_rate": 8.325252525252527e-05, + "loss": 0.3366, + "num_input_tokens_seen": 18001920000, + "step": 175800 + }, + { + "epoch": 0.02, + "grad_norm": 0.0683041661977768, + "learning_rate": 8.324242424242424e-05, + "loss": 0.3392, + "num_input_tokens_seen": 18012160000, + "step": 175900 + }, + { + "epoch": 0.02, + "grad_norm": 0.10220018774271011, + "learning_rate": 8.323232323232324e-05, + "loss": 0.3357, + "num_input_tokens_seen": 18022400000, + "step": 176000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.334985200386547, + "eval_average_loss_on_sentence_tokens": 0.3235413428960298, + "eval_average_shuffling_prob": 0.46875, + "eval_loss": 0.3345249593257904, + "eval_non_padding_tokens_in_labels": 133.68790178571427, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.431607142857146, + "eval_padding_tokens_in_labels": 378.3120982142857, + "eval_reconstruction_accuracy": 0.9298755710832578, + "eval_runtime": 146.174, + "eval_samples_per_second": 38.311, + "eval_sentence_accuracy": 0.795720654026843, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.2490234375, + "num_input_tokens_seen": 18022400000, + "step": 176000 + }, + { + "epoch": 0.02, + "grad_norm": 0.06808876246213913, + "learning_rate": 8.322222222222223e-05, + "loss": 0.3357, + "num_input_tokens_seen": 18032640000, + "step": 176100 + }, + { + "epoch": 0.02, + "grad_norm": 0.06422210484743118, + "learning_rate": 8.321212121212122e-05, + "loss": 0.3381, + "num_input_tokens_seen": 18042880000, + "step": 176200 + }, + { + "epoch": 0.02, + "grad_norm": 0.06338095664978027, + "learning_rate": 8.32020202020202e-05, + "loss": 0.3369, + "num_input_tokens_seen": 18053120000, + "step": 176300 + }, + { + "epoch": 0.02, + "grad_norm": 0.06484988331794739, + "learning_rate": 8.31919191919192e-05, + "loss": 0.3349, + "num_input_tokens_seen": 18063360000, + "step": 176400 + }, + { + "epoch": 0.02, + "grad_norm": 0.06691086292266846, + "learning_rate": 8.318181818181818e-05, + "loss": 0.3372, + "num_input_tokens_seen": 18073600000, + "step": 176500 + }, + { + "epoch": 0.02, + "grad_norm": 0.09401126205921173, + "learning_rate": 8.317171717171718e-05, + "loss": 0.3378, + "num_input_tokens_seen": 18083840000, + "step": 176600 + }, + { + "epoch": 0.02, + "grad_norm": 0.05637403205037117, + "learning_rate": 8.316161616161617e-05, + "loss": 0.3348, + "num_input_tokens_seen": 18094080000, + "step": 176700 + }, + { + "epoch": 0.02, + "grad_norm": 0.0654127299785614, + "learning_rate": 8.315151515151516e-05, + "loss": 0.3333, + "num_input_tokens_seen": 18104320000, + "step": 176800 + }, + { + "epoch": 0.02, + "grad_norm": 0.09716945141553879, + "learning_rate": 8.314141414141414e-05, + "loss": 0.3357, + "num_input_tokens_seen": 18114560000, + "step": 176900 + }, + { + "epoch": 0.02, + "grad_norm": 0.09289145469665527, + "learning_rate": 8.313131313131314e-05, + "loss": 0.336, + "num_input_tokens_seen": 18124800000, + "step": 177000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3340538308342855, + "eval_average_loss_on_sentence_tokens": 0.2958373854937884, + "eval_average_shuffling_prob": 0.42857142857142855, + "eval_loss": 0.3323538601398468, + "eval_non_padding_tokens_in_labels": 133.69035714285715, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43616071428571, + "eval_padding_tokens_in_labels": 378.3096428571429, + "eval_reconstruction_accuracy": 0.9300093034731987, + "eval_runtime": 196.0505, + "eval_samples_per_second": 28.564, + "eval_sentence_accuracy": 0.8143122393936726, + "eval_steps_per_second": 0.071, + "eval_variance_shuffling_prob": 0.2448979591836734, + "num_input_tokens_seen": 18124800000, + "step": 177000 + }, + { + "epoch": 0.02, + "grad_norm": 0.07620575278997421, + "learning_rate": 8.312121212121212e-05, + "loss": 0.3356, + "num_input_tokens_seen": 18135040000, + "step": 177100 + }, + { + "epoch": 0.02, + "grad_norm": 0.08782439678907394, + "learning_rate": 8.311111111111111e-05, + "loss": 0.3378, + "num_input_tokens_seen": 18145280000, + "step": 177200 + }, + { + "epoch": 0.02, + "grad_norm": 0.07752939313650131, + "learning_rate": 8.31010101010101e-05, + "loss": 0.3368, + "num_input_tokens_seen": 18155520000, + "step": 177300 + }, + { + "epoch": 0.02, + "grad_norm": 0.09023579210042953, + "learning_rate": 8.30909090909091e-05, + "loss": 0.3379, + "num_input_tokens_seen": 18165760000, + "step": 177400 + }, + { + "epoch": 0.02, + "grad_norm": 0.0576581135392189, + "learning_rate": 8.308080808080808e-05, + "loss": 0.3362, + "num_input_tokens_seen": 18176000000, + "step": 177500 + }, + { + "epoch": 0.02, + "grad_norm": 0.09837550669908524, + "learning_rate": 8.307070707070708e-05, + "loss": 0.3383, + "num_input_tokens_seen": 18186240000, + "step": 177600 + }, + { + "epoch": 0.02, + "grad_norm": 0.15357324481010437, + "learning_rate": 8.306060606060606e-05, + "loss": 0.336, + "num_input_tokens_seen": 18196480000, + "step": 177700 + }, + { + "epoch": 0.02, + "grad_norm": 0.08912228047847748, + "learning_rate": 8.305050505050505e-05, + "loss": 0.3356, + "num_input_tokens_seen": 18206720000, + "step": 177800 + }, + { + "epoch": 0.02, + "grad_norm": 0.061685070395469666, + "learning_rate": 8.304040404040404e-05, + "loss": 0.3378, + "num_input_tokens_seen": 18216960000, + "step": 177900 + }, + { + "epoch": 0.02, + "grad_norm": 0.07350301742553711, + "learning_rate": 8.303030303030304e-05, + "loss": 0.336, + "num_input_tokens_seen": 18227200000, + "step": 178000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3348346746793404, + "eval_average_loss_on_sentence_tokens": 0.36414911870034844, + "eval_average_shuffling_prob": 0.5446428571428571, + "eval_loss": 0.3361293375492096, + "eval_non_padding_tokens_in_labels": 133.69598214285713, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.44299107142857, + "eval_padding_tokens_in_labels": 378.30401785714287, + "eval_reconstruction_accuracy": 0.9297814680623967, + "eval_runtime": 147.4169, + "eval_samples_per_second": 37.988, + "eval_sentence_accuracy": 0.7640077151477803, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24800701530612243, + "num_input_tokens_seen": 18227200000, + "step": 178000 + }, + { + "epoch": 0.02, + "grad_norm": 0.057471856474876404, + "learning_rate": 8.302020202020201e-05, + "loss": 0.3383, + "num_input_tokens_seen": 18237440000, + "step": 178100 + }, + { + "epoch": 0.02, + "grad_norm": 0.053672172129154205, + "learning_rate": 8.301010101010102e-05, + "loss": 0.3357, + "num_input_tokens_seen": 18247680000, + "step": 178200 + }, + { + "epoch": 0.02, + "grad_norm": 0.15875600278377533, + "learning_rate": 8.3e-05, + "loss": 0.336, + "num_input_tokens_seen": 18257920000, + "step": 178300 + }, + { + "epoch": 0.02, + "grad_norm": 0.08498847484588623, + "learning_rate": 8.298989898989899e-05, + "loss": 0.3384, + "num_input_tokens_seen": 18268160000, + "step": 178400 + }, + { + "epoch": 0.02, + "grad_norm": 0.10993433743715286, + "learning_rate": 8.297979797979798e-05, + "loss": 0.3354, + "num_input_tokens_seen": 18278400000, + "step": 178500 + }, + { + "epoch": 0.02, + "grad_norm": 0.06213699281215668, + "learning_rate": 8.296969696969697e-05, + "loss": 0.3388, + "num_input_tokens_seen": 18288640000, + "step": 178600 + }, + { + "epoch": 0.02, + "grad_norm": 0.12057744711637497, + "learning_rate": 8.295959595959597e-05, + "loss": 0.3377, + "num_input_tokens_seen": 18298880000, + "step": 178700 + }, + { + "epoch": 0.02, + "grad_norm": 0.07481426745653152, + "learning_rate": 8.294949494949496e-05, + "loss": 0.3375, + "num_input_tokens_seen": 18309120000, + "step": 178800 + }, + { + "epoch": 0.02, + "grad_norm": 0.08393383026123047, + "learning_rate": 8.293939393939394e-05, + "loss": 0.3359, + "num_input_tokens_seen": 18319360000, + "step": 178900 + }, + { + "epoch": 0.02, + "grad_norm": 0.056701477617025375, + "learning_rate": 8.292929292929293e-05, + "loss": 0.334, + "num_input_tokens_seen": 18329600000, + "step": 179000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33522524652541197, + "eval_average_loss_on_sentence_tokens": 0.3561139258376208, + "eval_average_shuffling_prob": 0.5401785714285714, + "eval_loss": 0.3361467719078064, + "eval_non_padding_tokens_in_labels": 133.67267857142858, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43919642857143, + "eval_padding_tokens_in_labels": 378.32732142857145, + "eval_reconstruction_accuracy": 0.9298365592601712, + "eval_runtime": 146.4768, + "eval_samples_per_second": 38.231, + "eval_sentence_accuracy": 0.7666647992380892, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.24838568239795922, + "num_input_tokens_seen": 18329600000, + "step": 179000 + }, + { + "epoch": 0.02, + "grad_norm": 0.09953875094652176, + "learning_rate": 8.291919191919192e-05, + "loss": 0.3382, + "num_input_tokens_seen": 18339840000, + "step": 179100 + }, + { + "epoch": 0.02, + "grad_norm": 0.08472523838281631, + "learning_rate": 8.290909090909091e-05, + "loss": 0.3359, + "num_input_tokens_seen": 18350080000, + "step": 179200 + }, + { + "epoch": 0.02, + "grad_norm": 0.05826283246278763, + "learning_rate": 8.28989898989899e-05, + "loss": 0.3355, + "num_input_tokens_seen": 18360320000, + "step": 179300 + }, + { + "epoch": 0.02, + "grad_norm": 0.07974172383546829, + "learning_rate": 8.28888888888889e-05, + "loss": 0.3364, + "num_input_tokens_seen": 18370560000, + "step": 179400 + }, + { + "epoch": 0.02, + "grad_norm": 0.09363342076539993, + "learning_rate": 8.287878787878787e-05, + "loss": 0.3344, + "num_input_tokens_seen": 18380800000, + "step": 179500 + }, + { + "epoch": 0.02, + "grad_norm": 0.0796174556016922, + "learning_rate": 8.286868686868687e-05, + "loss": 0.3369, + "num_input_tokens_seen": 18391040000, + "step": 179600 + }, + { + "epoch": 0.02, + "grad_norm": 0.23649919033050537, + "learning_rate": 8.285858585858586e-05, + "loss": 0.339, + "num_input_tokens_seen": 18401280000, + "step": 179700 + }, + { + "epoch": 0.02, + "grad_norm": 0.06922733038663864, + "learning_rate": 8.284848484848485e-05, + "loss": 0.3406, + "num_input_tokens_seen": 18411520000, + "step": 179800 + }, + { + "epoch": 0.02, + "grad_norm": 0.09411531686782837, + "learning_rate": 8.283838383838384e-05, + "loss": 0.3365, + "num_input_tokens_seen": 18421760000, + "step": 179900 + }, + { + "epoch": 0.02, + "grad_norm": 0.059323135763406754, + "learning_rate": 8.282828282828283e-05, + "loss": 0.3386, + "num_input_tokens_seen": 18432000000, + "step": 180000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.33464716315586546, + "eval_average_loss_on_sentence_tokens": 0.33120607980926087, + "eval_average_shuffling_prob": 0.48660714285714285, + "eval_loss": 0.33447265625, + "eval_non_padding_tokens_in_labels": 133.66513392857144, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.44464285714286, + "eval_padding_tokens_in_labels": 378.33486607142856, + "eval_reconstruction_accuracy": 0.9300187397673587, + "eval_runtime": 147.4502, + "eval_samples_per_second": 37.979, + "eval_sentence_accuracy": 0.7874932972652843, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24982063137755106, + "num_input_tokens_seen": 18432000000, + "step": 180000 + }, + { + "epoch": 0.02, + "grad_norm": 0.06853589415550232, + "learning_rate": 8.281818181818181e-05, + "loss": 0.3344, + "num_input_tokens_seen": 18442240000, + "step": 180100 + }, + { + "epoch": 0.02, + "grad_norm": 0.062015846371650696, + "learning_rate": 8.280808080808082e-05, + "loss": 0.3366, + "num_input_tokens_seen": 18452480000, + "step": 180200 + }, + { + "epoch": 0.02, + "grad_norm": 0.10780097544193268, + "learning_rate": 8.27979797979798e-05, + "loss": 0.336, + "num_input_tokens_seen": 18462720000, + "step": 180300 + }, + { + "epoch": 0.02, + "grad_norm": 0.08579158037900925, + "learning_rate": 8.278787878787879e-05, + "loss": 0.3368, + "num_input_tokens_seen": 18472960000, + "step": 180400 + }, + { + "epoch": 0.02, + "grad_norm": 0.07928329706192017, + "learning_rate": 8.277777777777778e-05, + "loss": 0.34, + "num_input_tokens_seen": 18483200000, + "step": 180500 + }, + { + "epoch": 0.02, + "grad_norm": 0.062336310744285583, + "learning_rate": 8.276767676767677e-05, + "loss": 0.3377, + "num_input_tokens_seen": 18493440000, + "step": 180600 + }, + { + "epoch": 0.02, + "grad_norm": 0.08393688499927521, + "learning_rate": 8.275757575757577e-05, + "loss": 0.3376, + "num_input_tokens_seen": 18503680000, + "step": 180700 + }, + { + "epoch": 0.02, + "grad_norm": 0.17525160312652588, + "learning_rate": 8.274747474747476e-05, + "loss": 0.3352, + "num_input_tokens_seen": 18513920000, + "step": 180800 + }, + { + "epoch": 0.02, + "grad_norm": 0.059083882719278336, + "learning_rate": 8.273737373737375e-05, + "loss": 0.3381, + "num_input_tokens_seen": 18524160000, + "step": 180900 + }, + { + "epoch": 0.02, + "grad_norm": 0.1351124793291092, + "learning_rate": 8.272727272727273e-05, + "loss": 0.3379, + "num_input_tokens_seen": 18534400000, + "step": 181000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3347389255844608, + "eval_average_loss_on_sentence_tokens": 0.3717210983426161, + "eval_average_shuffling_prob": 0.5580357142857143, + "eval_loss": 0.3363821804523468, + "eval_non_padding_tokens_in_labels": 133.66611607142858, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.432678571428575, + "eval_padding_tokens_in_labels": 378.3338839285714, + "eval_reconstruction_accuracy": 0.9298740087039197, + "eval_runtime": 146.325, + "eval_samples_per_second": 38.271, + "eval_sentence_accuracy": 0.7571409134927051, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.24663185586734704, + "num_input_tokens_seen": 18534400000, + "step": 181000 + }, + { + "epoch": 0.02, + "grad_norm": 0.09595413506031036, + "learning_rate": 8.271717171717173e-05, + "loss": 0.3362, + "num_input_tokens_seen": 18544640000, + "step": 181100 + }, + { + "epoch": 0.02, + "grad_norm": 0.07348284125328064, + "learning_rate": 8.270707070707071e-05, + "loss": 0.3349, + "num_input_tokens_seen": 18554880000, + "step": 181200 + }, + { + "epoch": 0.02, + "grad_norm": 0.07745679467916489, + "learning_rate": 8.26969696969697e-05, + "loss": 0.3369, + "num_input_tokens_seen": 18565120000, + "step": 181300 + }, + { + "epoch": 0.02, + "grad_norm": 0.11482449620962143, + "learning_rate": 8.26868686868687e-05, + "loss": 0.3353, + "num_input_tokens_seen": 18575360000, + "step": 181400 + }, + { + "epoch": 0.02, + "grad_norm": 0.11348002403974533, + "learning_rate": 8.267676767676769e-05, + "loss": 0.3365, + "num_input_tokens_seen": 18585600000, + "step": 181500 + }, + { + "epoch": 0.02, + "grad_norm": 0.08058430254459381, + "learning_rate": 8.266666666666667e-05, + "loss": 0.3353, + "num_input_tokens_seen": 18595840000, + "step": 181600 + }, + { + "epoch": 0.02, + "grad_norm": 0.09254302829504013, + "learning_rate": 8.265656565656567e-05, + "loss": 0.3384, + "num_input_tokens_seen": 18606080000, + "step": 181700 + }, + { + "epoch": 0.02, + "grad_norm": 0.0605696439743042, + "learning_rate": 8.264646464646465e-05, + "loss": 0.3362, + "num_input_tokens_seen": 18616320000, + "step": 181800 + }, + { + "epoch": 0.02, + "grad_norm": 0.08309322595596313, + "learning_rate": 8.263636363636364e-05, + "loss": 0.3349, + "num_input_tokens_seen": 18626560000, + "step": 181900 + }, + { + "epoch": 0.02, + "grad_norm": 0.06699433922767639, + "learning_rate": 8.262626262626263e-05, + "loss": 0.333, + "num_input_tokens_seen": 18636800000, + "step": 182000 + }, + { + "epoch": 0.02, + "eval_average_loss_on_non_sentence_tokens": 0.3344739243621897, + "eval_average_loss_on_sentence_tokens": 0.362433154434974, + "eval_average_shuffling_prob": 0.5401785714285714, + "eval_loss": 0.3356846272945404, + "eval_non_padding_tokens_in_labels": 133.66875, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42080357142857, + "eval_padding_tokens_in_labels": 378.33125, + "eval_reconstruction_accuracy": 0.9299350424534542, + "eval_runtime": 146.0853, + "eval_samples_per_second": 38.334, + "eval_sentence_accuracy": 0.7638756612697981, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.24838568239795922, + "num_input_tokens_seen": 18636800000, + "step": 182000 + }, + { + "epoch": 0.02, + "grad_norm": 0.07569040358066559, + "learning_rate": 8.261616161616163e-05, + "loss": 0.3381, + "num_input_tokens_seen": 18647040000, + "step": 182100 + }, + { + "epoch": 0.02, + "grad_norm": 0.08895408362150192, + "learning_rate": 8.26060606060606e-05, + "loss": 0.3354, + "num_input_tokens_seen": 18657280000, + "step": 182200 + }, + { + "epoch": 0.02, + "grad_norm": 0.05976281687617302, + "learning_rate": 8.259595959595961e-05, + "loss": 0.3367, + "num_input_tokens_seen": 18667520000, + "step": 182300 + }, + { + "epoch": 0.02, + "grad_norm": 0.14414764940738678, + "learning_rate": 8.258585858585859e-05, + "loss": 0.3341, + "num_input_tokens_seen": 18677760000, + "step": 182400 + }, + { + "epoch": 0.02, + "grad_norm": 0.06892646849155426, + "learning_rate": 8.257575757575758e-05, + "loss": 0.3359, + "num_input_tokens_seen": 18688000000, + "step": 182500 + }, + { + "epoch": 0.02, + "grad_norm": 0.05982761085033417, + "learning_rate": 8.256565656565657e-05, + "loss": 0.3339, + "num_input_tokens_seen": 18698240000, + "step": 182600 + }, + { + "epoch": 0.02, + "grad_norm": 0.10861676931381226, + "learning_rate": 8.255555555555556e-05, + "loss": 0.3388, + "num_input_tokens_seen": 18708480000, + "step": 182700 + }, + { + "epoch": 0.02, + "grad_norm": 0.06264171004295349, + "learning_rate": 8.254545454545454e-05, + "loss": 0.3381, + "num_input_tokens_seen": 18718720000, + "step": 182800 + }, + { + "epoch": 0.02, + "grad_norm": 0.08944454044103622, + "learning_rate": 8.253535353535355e-05, + "loss": 0.3387, + "num_input_tokens_seen": 18728960000, + "step": 182900 + }, + { + "epoch": 0.03, + "grad_norm": 0.0635397732257843, + "learning_rate": 8.252525252525253e-05, + "loss": 0.3384, + "num_input_tokens_seen": 18739200000, + "step": 183000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3343593922840379, + "eval_average_loss_on_sentence_tokens": 0.34456204088921494, + "eval_average_shuffling_prob": 0.5133928571428571, + "eval_loss": 0.3348127007484436, + "eval_non_padding_tokens_in_labels": 133.684375, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.433214285714286, + "eval_padding_tokens_in_labels": 378.315625, + "eval_reconstruction_accuracy": 0.9300172930186122, + "eval_runtime": 146.3417, + "eval_samples_per_second": 38.267, + "eval_sentence_accuracy": 0.7775732498859534, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.24982063137755103, + "num_input_tokens_seen": 18739200000, + "step": 183000 + }, + { + "epoch": 0.03, + "grad_norm": 0.054957661777734756, + "learning_rate": 8.251515151515152e-05, + "loss": 0.3355, + "num_input_tokens_seen": 18749440000, + "step": 183100 + }, + { + "epoch": 0.03, + "grad_norm": 0.06980279833078384, + "learning_rate": 8.250505050505051e-05, + "loss": 0.335, + "num_input_tokens_seen": 18759680000, + "step": 183200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06054418906569481, + "learning_rate": 8.24949494949495e-05, + "loss": 0.3321, + "num_input_tokens_seen": 18769920000, + "step": 183300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06768016517162323, + "learning_rate": 8.248484848484848e-05, + "loss": 0.3385, + "num_input_tokens_seen": 18780160000, + "step": 183400 + }, + { + "epoch": 0.03, + "grad_norm": 0.058612313121557236, + "learning_rate": 8.247474747474749e-05, + "loss": 0.338, + "num_input_tokens_seen": 18790400000, + "step": 183500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08806338906288147, + "learning_rate": 8.246464646464646e-05, + "loss": 0.3377, + "num_input_tokens_seen": 18800640000, + "step": 183600 + }, + { + "epoch": 0.03, + "grad_norm": 0.06356330960988998, + "learning_rate": 8.245454545454546e-05, + "loss": 0.3377, + "num_input_tokens_seen": 18810880000, + "step": 183700 + }, + { + "epoch": 0.03, + "grad_norm": 0.06946519762277603, + "learning_rate": 8.244444444444445e-05, + "loss": 0.3379, + "num_input_tokens_seen": 18821120000, + "step": 183800 + }, + { + "epoch": 0.03, + "grad_norm": 0.1551639288663864, + "learning_rate": 8.243434343434344e-05, + "loss": 0.3356, + "num_input_tokens_seen": 18831360000, + "step": 183900 + }, + { + "epoch": 0.03, + "grad_norm": 0.07011517137289047, + "learning_rate": 8.242424242424243e-05, + "loss": 0.3354, + "num_input_tokens_seen": 18841600000, + "step": 184000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33475149354642014, + "eval_average_loss_on_sentence_tokens": 0.3103564171742425, + "eval_average_shuffling_prob": 0.45982142857142855, + "eval_loss": 0.3336443305015564, + "eval_non_padding_tokens_in_labels": 133.69924107142856, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43950892857143, + "eval_padding_tokens_in_labels": 378.30075892857144, + "eval_reconstruction_accuracy": 0.9298940683518437, + "eval_runtime": 147.4229, + "eval_samples_per_second": 37.986, + "eval_sentence_accuracy": 0.8024754099672666, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24838568239795916, + "num_input_tokens_seen": 18841600000, + "step": 184000 + }, + { + "epoch": 0.03, + "grad_norm": 0.09382368624210358, + "learning_rate": 8.241414141414142e-05, + "loss": 0.3336, + "num_input_tokens_seen": 18851840000, + "step": 184100 + }, + { + "epoch": 0.03, + "grad_norm": 0.13729126751422882, + "learning_rate": 8.24040404040404e-05, + "loss": 0.3366, + "num_input_tokens_seen": 18862080000, + "step": 184200 + }, + { + "epoch": 0.03, + "grad_norm": 0.07409949600696564, + "learning_rate": 8.23939393939394e-05, + "loss": 0.3342, + "num_input_tokens_seen": 18872320000, + "step": 184300 + }, + { + "epoch": 0.03, + "grad_norm": 0.09255807101726532, + "learning_rate": 8.238383838383839e-05, + "loss": 0.3371, + "num_input_tokens_seen": 18882560000, + "step": 184400 + }, + { + "epoch": 0.03, + "grad_norm": 0.19869489967823029, + "learning_rate": 8.237373737373738e-05, + "loss": 0.3358, + "num_input_tokens_seen": 18892800000, + "step": 184500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08597985655069351, + "learning_rate": 8.236363636363637e-05, + "loss": 0.3369, + "num_input_tokens_seen": 18903040000, + "step": 184600 + }, + { + "epoch": 0.03, + "grad_norm": 0.06865321099758148, + "learning_rate": 8.235353535353536e-05, + "loss": 0.3369, + "num_input_tokens_seen": 18913280000, + "step": 184700 + }, + { + "epoch": 0.03, + "grad_norm": 0.15616963803768158, + "learning_rate": 8.234343434343434e-05, + "loss": 0.3384, + "num_input_tokens_seen": 18923520000, + "step": 184800 + }, + { + "epoch": 0.03, + "grad_norm": 0.07337319105863571, + "learning_rate": 8.233333333333333e-05, + "loss": 0.3374, + "num_input_tokens_seen": 18933760000, + "step": 184900 + }, + { + "epoch": 0.03, + "grad_norm": 0.13834546506404877, + "learning_rate": 8.232323232323233e-05, + "loss": 0.3351, + "num_input_tokens_seen": 18944000000, + "step": 185000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3340248133750442, + "eval_average_loss_on_sentence_tokens": 0.331300767638059, + "eval_average_shuffling_prob": 0.4955357142857143, + "eval_loss": 0.3338797390460968, + "eval_non_padding_tokens_in_labels": 133.65178571428572, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.4334375, + "eval_padding_tokens_in_labels": 378.3482142857143, + "eval_reconstruction_accuracy": 0.9300124233501934, + "eval_runtime": 145.3918, + "eval_samples_per_second": 38.517, + "eval_sentence_accuracy": 0.7869330686920264, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.2499800701530612, + "num_input_tokens_seen": 18944000000, + "step": 185000 + }, + { + "epoch": 0.03, + "grad_norm": 0.07316296547651291, + "learning_rate": 8.231313131313132e-05, + "loss": 0.3369, + "num_input_tokens_seen": 18954240000, + "step": 185100 + }, + { + "epoch": 0.03, + "grad_norm": 0.07054244726896286, + "learning_rate": 8.230303030303031e-05, + "loss": 0.338, + "num_input_tokens_seen": 18964480000, + "step": 185200 + }, + { + "epoch": 0.03, + "grad_norm": 0.16628777980804443, + "learning_rate": 8.22929292929293e-05, + "loss": 0.3365, + "num_input_tokens_seen": 18974720000, + "step": 185300 + }, + { + "epoch": 0.03, + "grad_norm": 0.16906015574932098, + "learning_rate": 8.228282828282828e-05, + "loss": 0.3368, + "num_input_tokens_seen": 18984960000, + "step": 185400 + }, + { + "epoch": 0.03, + "grad_norm": 0.06640388071537018, + "learning_rate": 8.227272727272729e-05, + "loss": 0.3384, + "num_input_tokens_seen": 18995200000, + "step": 185500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08352030813694, + "learning_rate": 8.226262626262626e-05, + "loss": 0.3355, + "num_input_tokens_seen": 19005440000, + "step": 185600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07318104803562164, + "learning_rate": 8.225252525252526e-05, + "loss": 0.3387, + "num_input_tokens_seen": 19015680000, + "step": 185700 + }, + { + "epoch": 0.03, + "grad_norm": 0.05995601788163185, + "learning_rate": 8.224242424242425e-05, + "loss": 0.3357, + "num_input_tokens_seen": 19025920000, + "step": 185800 + }, + { + "epoch": 0.03, + "grad_norm": 0.07225517928600311, + "learning_rate": 8.223232323232324e-05, + "loss": 0.3367, + "num_input_tokens_seen": 19036160000, + "step": 185900 + }, + { + "epoch": 0.03, + "grad_norm": 0.08080601692199707, + "learning_rate": 8.222222222222222e-05, + "loss": 0.3369, + "num_input_tokens_seen": 19046400000, + "step": 186000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3344601354244642, + "eval_average_loss_on_sentence_tokens": 0.3569894036506849, + "eval_average_shuffling_prob": 0.5446428571428571, + "eval_loss": 0.3354579508304596, + "eval_non_padding_tokens_in_labels": 133.6911607142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43870535714286, + "eval_padding_tokens_in_labels": 378.30883928571427, + "eval_reconstruction_accuracy": 0.9300220940636978, + "eval_runtime": 163.5163, + "eval_samples_per_second": 34.247, + "eval_sentence_accuracy": 0.7649000792323268, + "eval_steps_per_second": 0.086, + "eval_variance_shuffling_prob": 0.24800701530612243, + "num_input_tokens_seen": 19046400000, + "step": 186000 + }, + { + "epoch": 0.03, + "grad_norm": 0.09492505341768265, + "learning_rate": 8.221212121212122e-05, + "loss": 0.3339, + "num_input_tokens_seen": 19056640000, + "step": 186100 + }, + { + "epoch": 0.03, + "grad_norm": 0.07690074294805527, + "learning_rate": 8.22020202020202e-05, + "loss": 0.3363, + "num_input_tokens_seen": 19066880000, + "step": 186200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06707073748111725, + "learning_rate": 8.21919191919192e-05, + "loss": 0.3365, + "num_input_tokens_seen": 19077120000, + "step": 186300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06364389508962631, + "learning_rate": 8.218181818181819e-05, + "loss": 0.3365, + "num_input_tokens_seen": 19087360000, + "step": 186400 + }, + { + "epoch": 0.03, + "grad_norm": 0.10559181123971939, + "learning_rate": 8.217171717171718e-05, + "loss": 0.3354, + "num_input_tokens_seen": 19097600000, + "step": 186500 + }, + { + "epoch": 0.03, + "grad_norm": 0.061228033155202866, + "learning_rate": 8.216161616161616e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19107840000, + "step": 186600 + }, + { + "epoch": 0.03, + "grad_norm": 0.1073850765824318, + "learning_rate": 8.215151515151516e-05, + "loss": 0.3382, + "num_input_tokens_seen": 19118080000, + "step": 186700 + }, + { + "epoch": 0.03, + "grad_norm": 0.05349685251712799, + "learning_rate": 8.214141414141414e-05, + "loss": 0.3381, + "num_input_tokens_seen": 19128320000, + "step": 186800 + }, + { + "epoch": 0.03, + "grad_norm": 0.06679967790842056, + "learning_rate": 8.213131313131313e-05, + "loss": 0.3379, + "num_input_tokens_seen": 19138560000, + "step": 186900 + }, + { + "epoch": 0.03, + "grad_norm": 0.0826243907213211, + "learning_rate": 8.212121212121212e-05, + "loss": 0.3353, + "num_input_tokens_seen": 19148800000, + "step": 187000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3338628028224695, + "eval_average_loss_on_sentence_tokens": 0.33257303541608346, + "eval_average_shuffling_prob": 0.49107142857142855, + "eval_loss": 0.3337925374507904, + "eval_non_padding_tokens_in_labels": 133.69004464285715, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.433839285714285, + "eval_padding_tokens_in_labels": 378.3099553571429, + "eval_reconstruction_accuracy": 0.9299993688914846, + "eval_runtime": 147.1272, + "eval_samples_per_second": 38.062, + "eval_sentence_accuracy": 0.7883776580844984, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24992028061224475, + "num_input_tokens_seen": 19148800000, + "step": 187000 + }, + { + "epoch": 0.03, + "grad_norm": 0.21756187081336975, + "learning_rate": 8.211111111111112e-05, + "loss": 0.336, + "num_input_tokens_seen": 19159040000, + "step": 187100 + }, + { + "epoch": 0.03, + "grad_norm": 0.09243348240852356, + "learning_rate": 8.21010101010101e-05, + "loss": 0.3371, + "num_input_tokens_seen": 19169280000, + "step": 187200 + }, + { + "epoch": 0.03, + "grad_norm": 0.07076194882392883, + "learning_rate": 8.20909090909091e-05, + "loss": 0.3381, + "num_input_tokens_seen": 19179520000, + "step": 187300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06662077456712723, + "learning_rate": 8.208080808080808e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19189760000, + "step": 187400 + }, + { + "epoch": 0.03, + "grad_norm": 0.055254898965358734, + "learning_rate": 8.207070707070707e-05, + "loss": 0.3339, + "num_input_tokens_seen": 19200000000, + "step": 187500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06762069463729858, + "learning_rate": 8.206060606060606e-05, + "loss": 0.3345, + "num_input_tokens_seen": 19210240000, + "step": 187600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07808098942041397, + "learning_rate": 8.205050505050505e-05, + "loss": 0.3355, + "num_input_tokens_seen": 19220480000, + "step": 187700 + }, + { + "epoch": 0.03, + "grad_norm": 0.15497083961963654, + "learning_rate": 8.204040404040403e-05, + "loss": 0.3367, + "num_input_tokens_seen": 19230720000, + "step": 187800 + }, + { + "epoch": 0.03, + "grad_norm": 0.06498348712921143, + "learning_rate": 8.203030303030304e-05, + "loss": 0.337, + "num_input_tokens_seen": 19240960000, + "step": 187900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09199763834476471, + "learning_rate": 8.202020202020202e-05, + "loss": 0.3363, + "num_input_tokens_seen": 19251200000, + "step": 188000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3351118778282919, + "eval_average_loss_on_sentence_tokens": 0.3761297383128855, + "eval_average_shuffling_prob": 0.5625, + "eval_loss": 0.3370099663734436, + "eval_non_padding_tokens_in_labels": 133.68330357142858, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.448705357142856, + "eval_padding_tokens_in_labels": 378.31669642857145, + "eval_reconstruction_accuracy": 0.9297887038948582, + "eval_runtime": 147.0364, + "eval_samples_per_second": 38.086, + "eval_sentence_accuracy": 0.7543157608304188, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24609375, + "num_input_tokens_seen": 19251200000, + "step": 188000 + }, + { + "epoch": 0.03, + "grad_norm": 0.0763116255402565, + "learning_rate": 8.201010101010101e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19261440000, + "step": 188100 + }, + { + "epoch": 0.03, + "grad_norm": 0.12345396727323532, + "learning_rate": 8.2e-05, + "loss": 0.3388, + "num_input_tokens_seen": 19271680000, + "step": 188200 + }, + { + "epoch": 0.03, + "grad_norm": 0.06399549543857574, + "learning_rate": 8.198989898989899e-05, + "loss": 0.3371, + "num_input_tokens_seen": 19281920000, + "step": 188300 + }, + { + "epoch": 0.03, + "grad_norm": 0.11177554726600647, + "learning_rate": 8.197979797979798e-05, + "loss": 0.3377, + "num_input_tokens_seen": 19292160000, + "step": 188400 + }, + { + "epoch": 0.03, + "grad_norm": 0.05926945060491562, + "learning_rate": 8.196969696969698e-05, + "loss": 0.3384, + "num_input_tokens_seen": 19302400000, + "step": 188500 + }, + { + "epoch": 0.03, + "grad_norm": 0.11829191446304321, + "learning_rate": 8.195959595959596e-05, + "loss": 0.3351, + "num_input_tokens_seen": 19312640000, + "step": 188600 + }, + { + "epoch": 0.03, + "grad_norm": 0.07786115258932114, + "learning_rate": 8.194949494949495e-05, + "loss": 0.3349, + "num_input_tokens_seen": 19322880000, + "step": 188700 + }, + { + "epoch": 0.03, + "grad_norm": 0.0580323152244091, + "learning_rate": 8.193939393939394e-05, + "loss": 0.3388, + "num_input_tokens_seen": 19333120000, + "step": 188800 + }, + { + "epoch": 0.03, + "grad_norm": 0.05895525962114334, + "learning_rate": 8.192929292929293e-05, + "loss": 0.335, + "num_input_tokens_seen": 19343360000, + "step": 188900 + }, + { + "epoch": 0.03, + "grad_norm": 0.14815880358219147, + "learning_rate": 8.191919191919192e-05, + "loss": 0.3356, + "num_input_tokens_seen": 19353600000, + "step": 189000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33403756119357925, + "eval_average_loss_on_sentence_tokens": 0.33045451967721073, + "eval_average_shuffling_prob": 0.4955357142857143, + "eval_loss": 0.3338797390460968, + "eval_non_padding_tokens_in_labels": 133.66660714285715, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42174107142857, + "eval_padding_tokens_in_labels": 378.3333928571429, + "eval_reconstruction_accuracy": 0.929998079936333, + "eval_runtime": 145.7594, + "eval_samples_per_second": 38.419, + "eval_sentence_accuracy": 0.7866289446094006, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.2499800701530612, + "num_input_tokens_seen": 19353600000, + "step": 189000 + }, + { + "epoch": 0.03, + "grad_norm": 0.11637333780527115, + "learning_rate": 8.190909090909092e-05, + "loss": 0.3359, + "num_input_tokens_seen": 19363840000, + "step": 189100 + }, + { + "epoch": 0.03, + "grad_norm": 0.06524750590324402, + "learning_rate": 8.18989898989899e-05, + "loss": 0.3349, + "num_input_tokens_seen": 19374080000, + "step": 189200 + }, + { + "epoch": 0.03, + "grad_norm": 0.10903629660606384, + "learning_rate": 8.18888888888889e-05, + "loss": 0.3314, + "num_input_tokens_seen": 19384320000, + "step": 189300 + }, + { + "epoch": 0.03, + "grad_norm": 0.06391419470310211, + "learning_rate": 8.187878787878789e-05, + "loss": 0.3354, + "num_input_tokens_seen": 19394560000, + "step": 189400 + }, + { + "epoch": 0.03, + "grad_norm": 0.15620021522045135, + "learning_rate": 8.186868686868687e-05, + "loss": 0.3334, + "num_input_tokens_seen": 19404800000, + "step": 189500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08543257415294647, + "learning_rate": 8.185858585858586e-05, + "loss": 0.3362, + "num_input_tokens_seen": 19415040000, + "step": 189600 + }, + { + "epoch": 0.03, + "grad_norm": 0.08436110615730286, + "learning_rate": 8.184848484848485e-05, + "loss": 0.3339, + "num_input_tokens_seen": 19425280000, + "step": 189700 + }, + { + "epoch": 0.03, + "grad_norm": 0.09449459612369537, + "learning_rate": 8.183838383838385e-05, + "loss": 0.3355, + "num_input_tokens_seen": 19435520000, + "step": 189800 + }, + { + "epoch": 0.03, + "grad_norm": 0.14217884838581085, + "learning_rate": 8.182828282828284e-05, + "loss": 0.3354, + "num_input_tokens_seen": 19445760000, + "step": 189900 + }, + { + "epoch": 0.03, + "grad_norm": 0.09001050144433975, + "learning_rate": 8.181818181818183e-05, + "loss": 0.3348, + "num_input_tokens_seen": 19456000000, + "step": 190000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.33426271426353615, + "eval_average_loss_on_sentence_tokens": 0.3225164881705069, + "eval_average_shuffling_prob": 0.47767857142857145, + "eval_loss": 0.333740234375, + "eval_non_padding_tokens_in_labels": 133.63816964285715, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42700892857143, + "eval_padding_tokens_in_labels": 378.3618303571429, + "eval_reconstruction_accuracy": 0.930002771343153, + "eval_runtime": 150.267, + "eval_samples_per_second": 37.267, + "eval_sentence_accuracy": 0.79475225892164, + "eval_steps_per_second": 0.093, + "eval_variance_shuffling_prob": 0.24950175382653056, + "num_input_tokens_seen": 19456000000, + "step": 190000 + }, + { + "epoch": 0.03, + "grad_norm": 0.10401882231235504, + "learning_rate": 8.180808080808081e-05, + "loss": 0.3377, + "num_input_tokens_seen": 19466240000, + "step": 190100 + }, + { + "epoch": 0.03, + "grad_norm": 0.06941093504428864, + "learning_rate": 8.179797979797981e-05, + "loss": 0.3358, + "num_input_tokens_seen": 19476480000, + "step": 190200 + }, + { + "epoch": 0.03, + "grad_norm": 0.09762857109308243, + "learning_rate": 8.178787878787879e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19486720000, + "step": 190300 + }, + { + "epoch": 0.03, + "grad_norm": 0.07456941157579422, + "learning_rate": 8.177777777777778e-05, + "loss": 0.3369, + "num_input_tokens_seen": 19496960000, + "step": 190400 + }, + { + "epoch": 0.03, + "grad_norm": 0.06345424056053162, + "learning_rate": 8.176767676767678e-05, + "loss": 0.3393, + "num_input_tokens_seen": 19507200000, + "step": 190500 + }, + { + "epoch": 0.03, + "grad_norm": 0.08950772136449814, + "learning_rate": 8.175757575757577e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19517440000, + "step": 190600 + }, + { + "epoch": 0.03, + "grad_norm": 0.06380996108055115, + "learning_rate": 8.174747474747475e-05, + "loss": 0.3347, + "num_input_tokens_seen": 19527680000, + "step": 190700 + }, + { + "epoch": 0.03, + "grad_norm": 0.10365952551364899, + "learning_rate": 8.173737373737375e-05, + "loss": 0.3365, + "num_input_tokens_seen": 19537920000, + "step": 190800 + }, + { + "epoch": 0.03, + "grad_norm": 0.1045404002070427, + "learning_rate": 8.172727272727273e-05, + "loss": 0.337, + "num_input_tokens_seen": 19548160000, + "step": 190900 + }, + { + "epoch": 0.03, + "grad_norm": 0.05314315855503082, + "learning_rate": 8.171717171717172e-05, + "loss": 0.3359, + "num_input_tokens_seen": 19558400000, + "step": 191000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3339246186655942, + "eval_average_loss_on_sentence_tokens": 0.29762447874938464, + "eval_average_shuffling_prob": 0.42857142857142855, + "eval_loss": 0.3323276937007904, + "eval_non_padding_tokens_in_labels": 133.7292857142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.46138392857143, + "eval_padding_tokens_in_labels": 378.2707142857143, + "eval_reconstruction_accuracy": 0.9300470549055907, + "eval_runtime": 151.5096, + "eval_samples_per_second": 36.961, + "eval_sentence_accuracy": 0.8147964369462741, + "eval_steps_per_second": 0.092, + "eval_variance_shuffling_prob": 0.24489795918367346, + "num_input_tokens_seen": 19558400000, + "step": 191000 + }, + { + "epoch": 0.03, + "grad_norm": 0.05644207075238228, + "learning_rate": 8.170707070707071e-05, + "loss": 0.3361, + "num_input_tokens_seen": 19568640000, + "step": 191100 + }, + { + "epoch": 0.03, + "grad_norm": 0.05854468047618866, + "learning_rate": 8.16969696969697e-05, + "loss": 0.3365, + "num_input_tokens_seen": 19578880000, + "step": 191200 + }, + { + "epoch": 0.03, + "grad_norm": 0.14275354146957397, + "learning_rate": 8.168686868686868e-05, + "loss": 0.3382, + "num_input_tokens_seen": 19589120000, + "step": 191300 + }, + { + "epoch": 0.03, + "grad_norm": 0.09600412100553513, + "learning_rate": 8.167676767676769e-05, + "loss": 0.3376, + "num_input_tokens_seen": 19599360000, + "step": 191400 + }, + { + "epoch": 0.03, + "grad_norm": 0.0842081680893898, + "learning_rate": 8.166666666666667e-05, + "loss": 0.3357, + "num_input_tokens_seen": 19609600000, + "step": 191500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06798452138900757, + "learning_rate": 8.165656565656566e-05, + "loss": 0.3367, + "num_input_tokens_seen": 19619840000, + "step": 191600 + }, + { + "epoch": 0.03, + "grad_norm": 0.06542013585567474, + "learning_rate": 8.164646464646465e-05, + "loss": 0.3374, + "num_input_tokens_seen": 19630080000, + "step": 191700 + }, + { + "epoch": 0.03, + "grad_norm": 0.09224187582731247, + "learning_rate": 8.163636363636364e-05, + "loss": 0.3354, + "num_input_tokens_seen": 19640320000, + "step": 191800 + }, + { + "epoch": 0.03, + "grad_norm": 0.0546901635825634, + "learning_rate": 8.162626262626262e-05, + "loss": 0.3385, + "num_input_tokens_seen": 19650560000, + "step": 191900 + }, + { + "epoch": 0.03, + "grad_norm": 0.07320586591959, + "learning_rate": 8.161616161616163e-05, + "loss": 0.3353, + "num_input_tokens_seen": 19660800000, + "step": 192000 + }, + { + "epoch": 0.03, + "eval_average_loss_on_non_sentence_tokens": 0.3340580080063443, + "eval_average_loss_on_sentence_tokens": 0.35987619091286427, + "eval_average_shuffling_prob": 0.5401785714285714, + "eval_loss": 0.3352312445640564, + "eval_non_padding_tokens_in_labels": 133.66955357142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.42178571428571, + "eval_padding_tokens_in_labels": 378.3304464285714, + "eval_reconstruction_accuracy": 0.9300179357079613, + "eval_runtime": 152.9655, + "eval_samples_per_second": 36.61, + "eval_sentence_accuracy": 0.7672930555666712, + "eval_steps_per_second": 0.092, + "eval_variance_shuffling_prob": 0.24838568239795922, + "num_input_tokens_seen": 19660800000, + "step": 192000 + }, + { + "epoch": 0.03, + "grad_norm": 0.09024032205343246, + "learning_rate": 8.160606060606061e-05, + "loss": 0.3368, + "num_input_tokens_seen": 19671040000, + "step": 192100 + }, + { + "epoch": 0.03, + "grad_norm": 0.06177983805537224, + "learning_rate": 8.15959595959596e-05, + "loss": 0.3359, + "num_input_tokens_seen": 19681280000, + "step": 192200 + }, + { + "epoch": 0.03, + "grad_norm": 0.08752156794071198, + "learning_rate": 8.158585858585859e-05, + "loss": 0.3356, + "num_input_tokens_seen": 19691520000, + "step": 192300 + }, + { + "epoch": 0.03, + "grad_norm": 0.10124845802783966, + "learning_rate": 8.157575757575758e-05, + "loss": 0.3359, + "num_input_tokens_seen": 19701760000, + "step": 192400 + }, + { + "epoch": 0.03, + "grad_norm": 0.06586451828479767, + "learning_rate": 8.156565656565656e-05, + "loss": 0.3355, + "num_input_tokens_seen": 19712000000, + "step": 192500 + }, + { + "epoch": 0.03, + "grad_norm": 0.06414102017879486, + "learning_rate": 8.155555555555557e-05, + "loss": 0.3354, + "num_input_tokens_seen": 19722240000, + "step": 192600 + }, + { + "epoch": 0.03, + "grad_norm": 0.05442731827497482, + "learning_rate": 8.154545454545455e-05, + "loss": 0.3361, + "num_input_tokens_seen": 19732480000, + "step": 192700 + }, + { + "epoch": 0.03, + "grad_norm": 0.0678030252456665, + "learning_rate": 8.153535353535354e-05, + "loss": 0.3364, + "num_input_tokens_seen": 19742720000, + "step": 192800 + }, + { + "epoch": 0.03, + "grad_norm": 0.06350074708461761, + "learning_rate": 8.152525252525253e-05, + "loss": 0.3328, + "num_input_tokens_seen": 19752960000, + "step": 192900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07252136617898941, + "learning_rate": 8.151515151515152e-05, + "loss": 0.3359, + "num_input_tokens_seen": 19763200000, + "step": 193000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3341383253232144, + "eval_average_loss_on_sentence_tokens": 0.36146289829821326, + "eval_average_shuffling_prob": 0.5401785714285714, + "eval_loss": 0.3353969156742096, + "eval_non_padding_tokens_in_labels": 133.69892857142858, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43691964285714, + "eval_padding_tokens_in_labels": 378.30107142857145, + "eval_reconstruction_accuracy": 0.9299784470213809, + "eval_runtime": 152.4208, + "eval_samples_per_second": 36.74, + "eval_sentence_accuracy": 0.7673530800566631, + "eval_steps_per_second": 0.092, + "eval_variance_shuffling_prob": 0.24838568239795922, + "num_input_tokens_seen": 19763200000, + "step": 193000 + }, + { + "epoch": 0.04, + "grad_norm": 0.06677644699811935, + "learning_rate": 8.150505050505051e-05, + "loss": 0.3382, + "num_input_tokens_seen": 19773440000, + "step": 193100 + }, + { + "epoch": 0.04, + "grad_norm": 0.08005284518003464, + "learning_rate": 8.14949494949495e-05, + "loss": 0.3377, + "num_input_tokens_seen": 19783680000, + "step": 193200 + }, + { + "epoch": 0.04, + "grad_norm": 0.08820047229528427, + "learning_rate": 8.148484848484848e-05, + "loss": 0.3347, + "num_input_tokens_seen": 19793920000, + "step": 193300 + }, + { + "epoch": 0.04, + "grad_norm": 0.0696980208158493, + "learning_rate": 8.147474747474748e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19804160000, + "step": 193400 + }, + { + "epoch": 0.04, + "grad_norm": 0.07348765432834625, + "learning_rate": 8.146464646464647e-05, + "loss": 0.3373, + "num_input_tokens_seen": 19814400000, + "step": 193500 + }, + { + "epoch": 0.04, + "grad_norm": 0.0645444244146347, + "learning_rate": 8.145454545454546e-05, + "loss": 0.3364, + "num_input_tokens_seen": 19824640000, + "step": 193600 + }, + { + "epoch": 0.04, + "grad_norm": 0.061729494482278824, + "learning_rate": 8.144444444444445e-05, + "loss": 0.3342, + "num_input_tokens_seen": 19834880000, + "step": 193700 + }, + { + "epoch": 0.04, + "grad_norm": 0.17211440205574036, + "learning_rate": 8.143434343434344e-05, + "loss": 0.3347, + "num_input_tokens_seen": 19845120000, + "step": 193800 + }, + { + "epoch": 0.04, + "grad_norm": 0.09250013530254364, + "learning_rate": 8.142424242424242e-05, + "loss": 0.3376, + "num_input_tokens_seen": 19855360000, + "step": 193900 + }, + { + "epoch": 0.04, + "grad_norm": 0.058953311294317245, + "learning_rate": 8.141414141414141e-05, + "loss": 0.3362, + "num_input_tokens_seen": 19865600000, + "step": 194000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3347528505790562, + "eval_average_loss_on_sentence_tokens": 0.3352199207023369, + "eval_average_shuffling_prob": 0.5044642857142857, + "eval_loss": 0.33477783203125, + "eval_non_padding_tokens_in_labels": 133.6639732142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.41955357142857, + "eval_padding_tokens_in_labels": 378.3360267857143, + "eval_reconstruction_accuracy": 0.9299342526915143, + "eval_runtime": 159.9245, + "eval_samples_per_second": 35.017, + "eval_sentence_accuracy": 0.7817389494913924, + "eval_steps_per_second": 0.088, + "eval_variance_shuffling_prob": 0.2499800701530612, + "num_input_tokens_seen": 19865600000, + "step": 194000 + }, + { + "epoch": 0.04, + "grad_norm": 0.09009237587451935, + "learning_rate": 8.14040404040404e-05, + "loss": 0.334, + "num_input_tokens_seen": 19875840000, + "step": 194100 + }, + { + "epoch": 0.04, + "grad_norm": 0.0968250259757042, + "learning_rate": 8.13939393939394e-05, + "loss": 0.3349, + "num_input_tokens_seen": 19886080000, + "step": 194200 + }, + { + "epoch": 0.04, + "grad_norm": 0.06931343674659729, + "learning_rate": 8.138383838383839e-05, + "loss": 0.3347, + "num_input_tokens_seen": 19896320000, + "step": 194300 + }, + { + "epoch": 0.04, + "grad_norm": 0.08130525052547455, + "learning_rate": 8.137373737373738e-05, + "loss": 0.3383, + "num_input_tokens_seen": 19906560000, + "step": 194400 + }, + { + "epoch": 0.04, + "grad_norm": 0.06598328799009323, + "learning_rate": 8.136363636363636e-05, + "loss": 0.3362, + "num_input_tokens_seen": 19916800000, + "step": 194500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06157203018665314, + "learning_rate": 8.135353535353537e-05, + "loss": 0.3368, + "num_input_tokens_seen": 19927040000, + "step": 194600 + }, + { + "epoch": 0.04, + "grad_norm": 0.09670035541057587, + "learning_rate": 8.134343434343434e-05, + "loss": 0.3363, + "num_input_tokens_seen": 19937280000, + "step": 194700 + }, + { + "epoch": 0.04, + "grad_norm": 0.07736602425575256, + "learning_rate": 8.133333333333334e-05, + "loss": 0.3344, + "num_input_tokens_seen": 19947520000, + "step": 194800 + }, + { + "epoch": 0.04, + "grad_norm": 0.08579296618700027, + "learning_rate": 8.132323232323233e-05, + "loss": 0.3365, + "num_input_tokens_seen": 19957760000, + "step": 194900 + }, + { + "epoch": 0.04, + "grad_norm": 0.10500556975603104, + "learning_rate": 8.131313131313132e-05, + "loss": 0.3353, + "num_input_tokens_seen": 19968000000, + "step": 195000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3339430441597209, + "eval_average_loss_on_sentence_tokens": 0.33347557120379934, + "eval_average_shuffling_prob": 0.4955357142857143, + "eval_loss": 0.3338884711265564, + "eval_non_padding_tokens_in_labels": 133.70263392857143, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.424598214285716, + "eval_padding_tokens_in_labels": 378.2973660714286, + "eval_reconstruction_accuracy": 0.9300206982125729, + "eval_runtime": 146.5644, + "eval_samples_per_second": 38.208, + "eval_sentence_accuracy": 0.7879975029812163, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.2499800701530612, + "num_input_tokens_seen": 19968000000, + "step": 195000 + }, + { + "epoch": 0.04, + "grad_norm": 0.06182816997170448, + "learning_rate": 8.13030303030303e-05, + "loss": 0.3353, + "num_input_tokens_seen": 19978240000, + "step": 195100 + }, + { + "epoch": 0.04, + "grad_norm": 0.05904494971036911, + "learning_rate": 8.12929292929293e-05, + "loss": 0.3347, + "num_input_tokens_seen": 19988480000, + "step": 195200 + }, + { + "epoch": 0.04, + "grad_norm": 0.0599527470767498, + "learning_rate": 8.128282828282828e-05, + "loss": 0.3334, + "num_input_tokens_seen": 19998720000, + "step": 195300 + }, + { + "epoch": 0.04, + "grad_norm": 0.07718954235315323, + "learning_rate": 8.127272727272727e-05, + "loss": 0.3373, + "num_input_tokens_seen": 20008960000, + "step": 195400 + }, + { + "epoch": 0.04, + "grad_norm": 0.06861264258623123, + "learning_rate": 8.126262626262627e-05, + "loss": 0.3344, + "num_input_tokens_seen": 20019200000, + "step": 195500 + }, + { + "epoch": 0.04, + "grad_norm": 0.06000716984272003, + "learning_rate": 8.125252525252526e-05, + "loss": 0.3356, + "num_input_tokens_seen": 20029440000, + "step": 195600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06344378739595413, + "learning_rate": 8.124242424242424e-05, + "loss": 0.3343, + "num_input_tokens_seen": 20039680000, + "step": 195700 + }, + { + "epoch": 0.04, + "grad_norm": 0.11686629801988602, + "learning_rate": 8.123232323232324e-05, + "loss": 0.3364, + "num_input_tokens_seen": 20049920000, + "step": 195800 + }, + { + "epoch": 0.04, + "grad_norm": 0.11949914693832397, + "learning_rate": 8.122222222222222e-05, + "loss": 0.3364, + "num_input_tokens_seen": 20060160000, + "step": 195900 + }, + { + "epoch": 0.04, + "grad_norm": 0.07527315616607666, + "learning_rate": 8.121212121212121e-05, + "loss": 0.3342, + "num_input_tokens_seen": 20070400000, + "step": 196000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33439210737364616, + "eval_average_loss_on_sentence_tokens": 0.3281836105292624, + "eval_average_shuffling_prob": 0.48660714285714285, + "eval_loss": 0.33416748046875, + "eval_non_padding_tokens_in_labels": 133.66017857142856, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.4253125, + "eval_padding_tokens_in_labels": 378.33982142857144, + "eval_reconstruction_accuracy": 0.9300238563240117, + "eval_runtime": 146.5949, + "eval_samples_per_second": 38.201, + "eval_sentence_accuracy": 0.7886937870651226, + "eval_steps_per_second": 0.096, + "eval_variance_shuffling_prob": 0.24982063137755112, + "num_input_tokens_seen": 20070400000, + "step": 196000 + }, + { + "epoch": 0.04, + "grad_norm": 0.05852902680635452, + "learning_rate": 8.12020202020202e-05, + "loss": 0.3346, + "num_input_tokens_seen": 20080640000, + "step": 196100 + }, + { + "epoch": 0.04, + "grad_norm": 0.06819413602352142, + "learning_rate": 8.11919191919192e-05, + "loss": 0.3349, + "num_input_tokens_seen": 20090880000, + "step": 196200 + }, + { + "epoch": 0.04, + "grad_norm": 0.10724770277738571, + "learning_rate": 8.118181818181818e-05, + "loss": 0.3356, + "num_input_tokens_seen": 20101120000, + "step": 196300 + }, + { + "epoch": 0.04, + "grad_norm": 0.058819450438022614, + "learning_rate": 8.117171717171718e-05, + "loss": 0.338, + "num_input_tokens_seen": 20111360000, + "step": 196400 + }, + { + "epoch": 0.04, + "grad_norm": 0.07938557118177414, + "learning_rate": 8.116161616161616e-05, + "loss": 0.3355, + "num_input_tokens_seen": 20121600000, + "step": 196500 + }, + { + "epoch": 0.04, + "grad_norm": 0.11418991535902023, + "learning_rate": 8.115151515151515e-05, + "loss": 0.3383, + "num_input_tokens_seen": 20131840000, + "step": 196600 + }, + { + "epoch": 0.04, + "grad_norm": 0.08994439989328384, + "learning_rate": 8.114141414141414e-05, + "loss": 0.3347, + "num_input_tokens_seen": 20142080000, + "step": 196700 + }, + { + "epoch": 0.04, + "grad_norm": 0.07990837097167969, + "learning_rate": 8.113131313131314e-05, + "loss": 0.3329, + "num_input_tokens_seen": 20152320000, + "step": 196800 + }, + { + "epoch": 0.04, + "grad_norm": 0.0802498534321785, + "learning_rate": 8.112121212121211e-05, + "loss": 0.3335, + "num_input_tokens_seen": 20162560000, + "step": 196900 + }, + { + "epoch": 0.04, + "grad_norm": 0.06393201649188995, + "learning_rate": 8.111111111111112e-05, + "loss": 0.3373, + "num_input_tokens_seen": 20172800000, + "step": 197000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33447200076710226, + "eval_average_loss_on_sentence_tokens": 0.34971337249148143, + "eval_average_shuffling_prob": 0.5267857142857143, + "eval_loss": 0.3351789116859436, + "eval_non_padding_tokens_in_labels": 133.67285714285714, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.442410714285714, + "eval_padding_tokens_in_labels": 378.32714285714286, + "eval_reconstruction_accuracy": 0.9299531207115604, + "eval_runtime": 146.84, + "eval_samples_per_second": 38.137, + "eval_sentence_accuracy": 0.7747921151829946, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24928252551020413, + "num_input_tokens_seen": 20172800000, + "step": 197000 + }, + { + "epoch": 0.04, + "grad_norm": 0.08500557392835617, + "learning_rate": 8.11010101010101e-05, + "loss": 0.3354, + "num_input_tokens_seen": 20183040000, + "step": 197100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07332198321819305, + "learning_rate": 8.109090909090909e-05, + "loss": 0.3375, + "num_input_tokens_seen": 20193280000, + "step": 197200 + }, + { + "epoch": 0.04, + "grad_norm": 0.06770052760839462, + "learning_rate": 8.108080808080808e-05, + "loss": 0.3366, + "num_input_tokens_seen": 20203520000, + "step": 197300 + }, + { + "epoch": 0.04, + "grad_norm": 0.08552179485559464, + "learning_rate": 8.107070707070707e-05, + "loss": 0.3347, + "num_input_tokens_seen": 20213760000, + "step": 197400 + }, + { + "epoch": 0.04, + "grad_norm": 0.06699363142251968, + "learning_rate": 8.106060606060607e-05, + "loss": 0.34, + "num_input_tokens_seen": 20224000000, + "step": 197500 + }, + { + "epoch": 0.04, + "grad_norm": 0.07519645988941193, + "learning_rate": 8.105050505050506e-05, + "loss": 0.3363, + "num_input_tokens_seen": 20234240000, + "step": 197600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06271009147167206, + "learning_rate": 8.104040404040404e-05, + "loss": 0.3359, + "num_input_tokens_seen": 20244480000, + "step": 197700 + }, + { + "epoch": 0.04, + "grad_norm": 0.09471846371889114, + "learning_rate": 8.103030303030303e-05, + "loss": 0.3375, + "num_input_tokens_seen": 20254720000, + "step": 197800 + }, + { + "epoch": 0.04, + "grad_norm": 0.06145664304494858, + "learning_rate": 8.102020202020203e-05, + "loss": 0.3383, + "num_input_tokens_seen": 20264960000, + "step": 197900 + }, + { + "epoch": 0.04, + "grad_norm": 0.1603062003850937, + "learning_rate": 8.101010101010101e-05, + "loss": 0.3362, + "num_input_tokens_seen": 20275200000, + "step": 198000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33377095456480227, + "eval_average_loss_on_sentence_tokens": 0.34455781779395156, + "eval_average_shuffling_prob": 0.5133928571428571, + "eval_loss": 0.334228515625, + "eval_non_padding_tokens_in_labels": 133.68263392857142, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.425982142857144, + "eval_padding_tokens_in_labels": 378.3173660714286, + "eval_reconstruction_accuracy": 0.9299801237005038, + "eval_runtime": 147.8072, + "eval_samples_per_second": 37.887, + "eval_sentence_accuracy": 0.7798821919343092, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.24982063137755103, + "num_input_tokens_seen": 20275200000, + "step": 198000 + }, + { + "epoch": 0.04, + "grad_norm": 0.18286630511283875, + "learning_rate": 8.1e-05, + "loss": 0.3357, + "num_input_tokens_seen": 20285440000, + "step": 198100 + }, + { + "epoch": 0.04, + "grad_norm": 0.06316962093114853, + "learning_rate": 8.0989898989899e-05, + "loss": 0.3346, + "num_input_tokens_seen": 20295680000, + "step": 198200 + }, + { + "epoch": 0.04, + "grad_norm": 0.06096937879920006, + "learning_rate": 8.097979797979799e-05, + "loss": 0.3399, + "num_input_tokens_seen": 20305920000, + "step": 198300 + }, + { + "epoch": 0.04, + "grad_norm": 0.1006731316447258, + "learning_rate": 8.096969696969698e-05, + "loss": 0.3381, + "num_input_tokens_seen": 20316160000, + "step": 198400 + }, + { + "epoch": 0.04, + "grad_norm": 0.06835881620645523, + "learning_rate": 8.095959595959597e-05, + "loss": 0.3347, + "num_input_tokens_seen": 20326400000, + "step": 198500 + }, + { + "epoch": 0.04, + "grad_norm": 0.15004444122314453, + "learning_rate": 8.094949494949495e-05, + "loss": 0.3307, + "num_input_tokens_seen": 20336640000, + "step": 198600 + }, + { + "epoch": 0.04, + "grad_norm": 0.06967467814683914, + "learning_rate": 8.093939393939394e-05, + "loss": 0.333, + "num_input_tokens_seen": 20346880000, + "step": 198700 + }, + { + "epoch": 0.04, + "grad_norm": 0.11399396508932114, + "learning_rate": 8.092929292929293e-05, + "loss": 0.3356, + "num_input_tokens_seen": 20357120000, + "step": 198800 + }, + { + "epoch": 0.04, + "grad_norm": 0.09686141461133957, + "learning_rate": 8.091919191919193e-05, + "loss": 0.3343, + "num_input_tokens_seen": 20367360000, + "step": 198900 + }, + { + "epoch": 0.04, + "grad_norm": 0.10415460169315338, + "learning_rate": 8.090909090909092e-05, + "loss": 0.3336, + "num_input_tokens_seen": 20377600000, + "step": 199000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.3338782116114574, + "eval_average_loss_on_sentence_tokens": 0.3601944916358582, + "eval_average_shuffling_prob": 0.5491071428571429, + "eval_loss": 0.3349783718585968, + "eval_non_padding_tokens_in_labels": 133.69205357142857, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43763392857143, + "eval_padding_tokens_in_labels": 378.3079464285714, + "eval_reconstruction_accuracy": 0.9300992513008994, + "eval_runtime": 148.8916, + "eval_samples_per_second": 37.611, + "eval_sentence_accuracy": 0.7625071028979824, + "eval_steps_per_second": 0.094, + "eval_variance_shuffling_prob": 0.2475884885204082, + "num_input_tokens_seen": 20377600000, + "step": 199000 + }, + { + "epoch": 0.04, + "grad_norm": 0.07303807139396667, + "learning_rate": 8.089898989898991e-05, + "loss": 0.3371, + "num_input_tokens_seen": 20387840000, + "step": 199100 + }, + { + "epoch": 0.04, + "grad_norm": 0.07402047514915466, + "learning_rate": 8.088888888888889e-05, + "loss": 0.3382, + "num_input_tokens_seen": 20398080000, + "step": 199200 + }, + { + "epoch": 0.04, + "grad_norm": 0.09586652368307114, + "learning_rate": 8.08787878787879e-05, + "loss": 0.3385, + "num_input_tokens_seen": 20408320000, + "step": 199300 + }, + { + "epoch": 0.04, + "grad_norm": 0.05604598671197891, + "learning_rate": 8.086868686868687e-05, + "loss": 0.3326, + "num_input_tokens_seen": 20418560000, + "step": 199400 + }, + { + "epoch": 0.04, + "grad_norm": 0.0688890814781189, + "learning_rate": 8.085858585858586e-05, + "loss": 0.3383, + "num_input_tokens_seen": 20428800000, + "step": 199500 + }, + { + "epoch": 0.04, + "grad_norm": 0.07135557383298874, + "learning_rate": 8.084848484848486e-05, + "loss": 0.3377, + "num_input_tokens_seen": 20439040000, + "step": 199600 + }, + { + "epoch": 0.04, + "grad_norm": 0.19412030279636383, + "learning_rate": 8.083838383838385e-05, + "loss": 0.3354, + "num_input_tokens_seen": 20449280000, + "step": 199700 + }, + { + "epoch": 0.04, + "grad_norm": 0.08356797695159912, + "learning_rate": 8.082828282828283e-05, + "loss": 0.3376, + "num_input_tokens_seen": 20459520000, + "step": 199800 + }, + { + "epoch": 0.04, + "grad_norm": 0.11555063724517822, + "learning_rate": 8.081818181818183e-05, + "loss": 0.3377, + "num_input_tokens_seen": 20469760000, + "step": 199900 + }, + { + "epoch": 0.04, + "grad_norm": 0.05644586309790611, + "learning_rate": 8.080808080808081e-05, + "loss": 0.3361, + "num_input_tokens_seen": 20480000000, + "step": 200000 + }, + { + "epoch": 0.04, + "eval_average_loss_on_non_sentence_tokens": 0.33368289770431775, + "eval_average_loss_on_sentence_tokens": 0.3252670504454762, + "eval_average_shuffling_prob": 0.49107142857142855, + "eval_loss": 0.3332868218421936, + "eval_non_padding_tokens_in_labels": 133.67678571428573, + "eval_num_sentence_tokens": 23.124330357142856, + "eval_num_sentinel_tokens_in_labels": 52.43892857142857, + "eval_padding_tokens_in_labels": 378.3232142857143, + "eval_reconstruction_accuracy": 0.9300712054491276, + "eval_runtime": 147.1757, + "eval_samples_per_second": 38.05, + "eval_sentence_accuracy": 0.7912108140121169, + "eval_steps_per_second": 0.095, + "eval_variance_shuffling_prob": 0.2499202806122448, + "num_input_tokens_seen": 20480000000, + "step": 200000 + } + ], + "logging_steps": 100, + "max_steps": 1000000, + "num_input_tokens_seen": 20480000000, + "num_train_epochs": 9223372036854775807, + "save_steps": 2000, + "total_flos": 2.73902949367808e+19, + "train_batch_size": 25, + "trial_name": null, + "trial_params": null +}