diff --git a/checkpoint-10000/config.json b/checkpoint-10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-10000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-10000/optimizer.pt b/checkpoint-10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff5a1c286e76f49711bce30a66de3c4928b9bb85 --- /dev/null +++ b/checkpoint-10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5b775aa8a202fd4d36055e1eaf56d5fc6b253a6e317aa79dc35388b41dba4c +size 883771077 diff --git a/checkpoint-10000/pytorch_model.bin b/checkpoint-10000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf73e88e2dee30b54e009059ba5b24242525a21c --- /dev/null +++ b/checkpoint-10000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ae23bef57bae1b488c85a5990592cf30f4d350cca3e7354e9ff6526ba08e78f +size 441897977 diff --git a/checkpoint-10000/rng_state.pth b/checkpoint-10000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca83514124d1f86fda275dd99990e6172fb11249 --- /dev/null +++ b/checkpoint-10000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eef4cbb69ab20145d1568f2f44ce9e8cb53c9fd697b6bade8bebff7bfee5082a +size 14511 diff --git a/checkpoint-10000/scaler.pt b/checkpoint-10000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b345659e084dfa08f03a221d79b2e302a4748dfe --- /dev/null +++ b/checkpoint-10000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be2ad76dcbc923b00e6a142f6db62aad4a46c47bb83864ccb68ddc899d0ce78 +size 557 diff --git a/checkpoint-10000/scheduler.pt b/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..97c87a5d5cd107cc19991263a372bb494fa52769 --- /dev/null +++ b/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26b421957aeaf8aaf98c5369a533b691198a7ccccd412c8da8f9b0e06d8aaee6 +size 627 diff --git a/checkpoint-10000/trainer_state.json b/checkpoint-10000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..217391e10e34387bdd11a100cb9b61d652c6726d --- /dev/null +++ b/checkpoint-10000/trainer_state.json @@ -0,0 +1,616 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 78.12415130940835, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + 
"loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + "epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 
5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + "epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + }, + { + "epoch": 54.68, + "learning_rate": 7e-05, + "loss": 4.2114, + "step": 7000 + }, + { + "epoch": 55.47, + "learning_rate": 7.1e-05, + "loss": 4.1021, + "step": 7100 + }, + { + "epoch": 56.25, + "learning_rate": 7.2e-05, + "loss": 4.0074, + "step": 7200 + }, + { + "epoch": 57.03, + "learning_rate": 7.3e-05, + "loss": 3.9346, + "step": 7300 + }, + { + "epoch": 57.81, + "learning_rate": 7.4e-05, + "loss": 3.8289, + "step": 7400 + }, + { + "epoch": 58.59, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8105, + "step": 7500 + }, + { + "epoch": 59.37, + "learning_rate": 7.6e-05, + "loss": 3.755, + "step": 7600 + }, + { + "epoch": 60.16, + "learning_rate": 7.7e-05, + "loss": 3.7105, + "step": 7700 + }, + { + "epoch": 60.93, + "learning_rate": 7.800000000000001e-05, + "loss": 3.6394, + "step": 7800 + 
}, + { + "epoch": 61.71, + "learning_rate": 7.900000000000001e-05, + "loss": 3.6262, + "step": 7900 + }, + { + "epoch": 62.5, + "learning_rate": 8e-05, + "loss": 3.5924, + "step": 8000 + }, + { + "epoch": 63.28, + "learning_rate": 8.1e-05, + "loss": 3.558, + "step": 8100 + }, + { + "epoch": 64.06, + "learning_rate": 8.2e-05, + "loss": 3.5255, + "step": 8200 + }, + { + "epoch": 64.84, + "learning_rate": 8.3e-05, + "loss": 3.4602, + "step": 8300 + }, + { + "epoch": 65.62, + "learning_rate": 8.4e-05, + "loss": 3.4641, + "step": 8400 + }, + { + "epoch": 66.4, + "learning_rate": 8.5e-05, + "loss": 3.435, + "step": 8500 + }, + { + "epoch": 67.19, + "learning_rate": 8.6e-05, + "loss": 3.408, + "step": 8600 + }, + { + "epoch": 67.96, + "learning_rate": 8.7e-05, + "loss": 3.3594, + "step": 8700 + }, + { + "epoch": 68.74, + "learning_rate": 8.800000000000001e-05, + "loss": 3.3593, + "step": 8800 + }, + { + "epoch": 69.53, + "learning_rate": 8.900000000000001e-05, + "loss": 3.3372, + "step": 8900 + }, + { + "epoch": 70.31, + "learning_rate": 9e-05, + "loss": 3.3217, + "step": 9000 + }, + { + "epoch": 71.09, + "learning_rate": 9.1e-05, + "loss": 3.2985, + "step": 9100 + }, + { + "epoch": 71.87, + "learning_rate": 9.200000000000001e-05, + "loss": 3.2509, + "step": 9200 + }, + { + "epoch": 72.65, + "learning_rate": 9.300000000000001e-05, + "loss": 3.2584, + "step": 9300 + }, + { + "epoch": 73.43, + "learning_rate": 9.4e-05, + "loss": 3.2386, + "step": 9400 + }, + { + "epoch": 74.22, + "learning_rate": 9.5e-05, + "loss": 3.2232, + "step": 9500 + }, + { + "epoch": 74.99, + "learning_rate": 9.6e-05, + "loss": 3.1786, + "step": 9600 + }, + { + "epoch": 75.78, + "learning_rate": 9.7e-05, + "loss": 3.1855, + "step": 9700 + }, + { + "epoch": 76.56, + "learning_rate": 9.8e-05, + "loss": 3.1737, + "step": 9800 + }, + { + "epoch": 77.34, + "learning_rate": 9.900000000000001e-05, + "loss": 3.1565, + "step": 9900 + }, + { + "epoch": 78.12, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 10000 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 6.779220101431296e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10000/training_args.bin b/checkpoint-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-10700/config.json b/checkpoint-10700/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-10700/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-10700/optimizer.pt 
b/checkpoint-10700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb15568cf93302a7a37753bf4b485a2d91d585cf --- /dev/null +++ b/checkpoint-10700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62fa10a633b9cc8c455879067e98c4a60dee9731426311513762c8fcc5cc383 +size 883771077 diff --git a/checkpoint-10700/pytorch_model.bin b/checkpoint-10700/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b0f204344ff38d1c1dd0d422e33d42ddd1263d18 --- /dev/null +++ b/checkpoint-10700/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a60cffa7310860b0eb84d368758a8f86c4d12451fc09d6cc90a802ff691a75 +size 441897977 diff --git a/checkpoint-10700/rng_state.pth b/checkpoint-10700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d05a855bd4dda2cef31846e7695df09a5c35c7ab --- /dev/null +++ b/checkpoint-10700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7da8e8fc3d9bda7e0d5df98b0773b928133c5ee23feebab467973b5533189833 +size 14511 diff --git a/checkpoint-10700/scaler.pt b/checkpoint-10700/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8589cc5448294c525e65a80699ab6b5cd02d801e --- /dev/null +++ b/checkpoint-10700/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:680d3cfc1ccd4ccf0f6548860547f0c185be680cb7d873b614410fff2d70ee99 +size 557 diff --git a/checkpoint-10700/scheduler.pt b/checkpoint-10700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1db5286e14ed3f6f0702445fb3b8bbf322a87a54 --- /dev/null +++ b/checkpoint-10700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11681a532209a6e5b24701b99eb2918864d8c52d36263c42e8a41b1b76442615 +size 627 diff --git a/checkpoint-10700/trainer_state.json b/checkpoint-10700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6233ba5db7869b21a1b93516f81a4b736bbfeebf --- /dev/null +++ b/checkpoint-10700/trainer_state.json @@ -0,0 +1,658 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 83.58971871968963, + "global_step": 10700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 
1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + "epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 
5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + "epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + }, + { + "epoch": 54.68, + "learning_rate": 7e-05, + "loss": 4.2114, + "step": 7000 + }, + { + "epoch": 55.47, + "learning_rate": 7.1e-05, + "loss": 4.1021, + "step": 7100 + }, + { + "epoch": 56.25, + "learning_rate": 7.2e-05, + "loss": 4.0074, + "step": 7200 + }, + { + "epoch": 57.03, + "learning_rate": 7.3e-05, + "loss": 3.9346, + "step": 7300 + }, + { + "epoch": 57.81, + "learning_rate": 7.4e-05, + "loss": 3.8289, + "step": 7400 + }, + { + "epoch": 58.59, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8105, + "step": 7500 + }, + { + "epoch": 59.37, + "learning_rate": 7.6e-05, + "loss": 3.755, + "step": 7600 + }, + { + "epoch": 60.16, + "learning_rate": 7.7e-05, + "loss": 3.7105, + "step": 7700 + }, + { + "epoch": 60.93, + "learning_rate": 7.800000000000001e-05, + "loss": 3.6394, + "step": 7800 + }, + { + "epoch": 61.71, + "learning_rate": 7.900000000000001e-05, + "loss": 3.6262, + "step": 7900 + }, + { + "epoch": 62.5, + "learning_rate": 8e-05, + "loss": 3.5924, + "step": 8000 + }, + { + "epoch": 63.28, + "learning_rate": 8.1e-05, + "loss": 3.558, + "step": 8100 + }, + { + "epoch": 64.06, + "learning_rate": 8.2e-05, + "loss": 3.5255, + "step": 8200 + }, + { + "epoch": 64.84, + "learning_rate": 8.3e-05, + "loss": 3.4602, + "step": 8300 + }, + { + "epoch": 65.62, + "learning_rate": 8.4e-05, + "loss": 3.4641, + "step": 8400 + }, + { + "epoch": 66.4, + "learning_rate": 8.5e-05, + "loss": 3.435, + "step": 8500 + }, + { + "epoch": 67.19, + "learning_rate": 8.6e-05, + "loss": 3.408, + "step": 8600 + }, + { + "epoch": 67.96, + "learning_rate": 8.7e-05, + "loss": 3.3594, + "step": 8700 + }, + { + "epoch": 68.74, + "learning_rate": 8.800000000000001e-05, + "loss": 3.3593, + "step": 8800 + }, + { + "epoch": 69.53, + "learning_rate": 8.900000000000001e-05, + 
"loss": 3.3372, + "step": 8900 + }, + { + "epoch": 70.31, + "learning_rate": 9e-05, + "loss": 3.3217, + "step": 9000 + }, + { + "epoch": 71.09, + "learning_rate": 9.1e-05, + "loss": 3.2985, + "step": 9100 + }, + { + "epoch": 71.87, + "learning_rate": 9.200000000000001e-05, + "loss": 3.2509, + "step": 9200 + }, + { + "epoch": 72.65, + "learning_rate": 9.300000000000001e-05, + "loss": 3.2584, + "step": 9300 + }, + { + "epoch": 73.43, + "learning_rate": 9.4e-05, + "loss": 3.2386, + "step": 9400 + }, + { + "epoch": 74.22, + "learning_rate": 9.5e-05, + "loss": 3.2232, + "step": 9500 + }, + { + "epoch": 74.99, + "learning_rate": 9.6e-05, + "loss": 3.1786, + "step": 9600 + }, + { + "epoch": 75.78, + "learning_rate": 9.7e-05, + "loss": 3.1855, + "step": 9700 + }, + { + "epoch": 76.56, + "learning_rate": 9.8e-05, + "loss": 3.1737, + "step": 9800 + }, + { + "epoch": 77.34, + "learning_rate": 9.900000000000001e-05, + "loss": 3.1565, + "step": 9900 + }, + { + "epoch": 78.12, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 10000 + }, + { + "epoch": 78.9, + "learning_rate": 9.904580152671757e-05, + "loss": 3.1003, + "step": 10100 + }, + { + "epoch": 79.68, + "learning_rate": 9.809160305343512e-05, + "loss": 3.1137, + "step": 10200 + }, + { + "epoch": 80.47, + "learning_rate": 9.713740458015268e-05, + "loss": 3.0958, + "step": 10300 + }, + { + "epoch": 81.25, + "learning_rate": 9.618320610687024e-05, + "loss": 3.0853, + "step": 10400 + }, + { + "epoch": 82.03, + "learning_rate": 9.522900763358779e-05, + "loss": 3.0704, + "step": 10500 + }, + { + "epoch": 82.81, + "learning_rate": 9.427480916030534e-05, + "loss": 3.03, + "step": 10600 + }, + { + "epoch": 83.59, + "learning_rate": 9.33206106870229e-05, + "loss": 3.0428, + "step": 10700 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 7.253523355926528e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10700/training_args.bin b/checkpoint-10700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-10700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-13200/config.json b/checkpoint-13200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-13200/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-13200/optimizer.pt b/checkpoint-13200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..06f4e7d723a99ac821003b727f62c3b8ecdc8927 --- /dev/null +++ b/checkpoint-13200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e38f85e320af3c483e4c5629487359bc2beb6cc4a5fb1be4e62a40daf3d86ea6 +size 883771077 diff --git a/checkpoint-13200/pytorch_model.bin b/checkpoint-13200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ee2d2d406635bdf8bc3360d8d91893a8f1324109 --- /dev/null +++ b/checkpoint-13200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:232ff547d71f660c54653970cfb25dde6796c4b763e25513c7477f785bb05974 +size 441897977 diff --git a/checkpoint-13200/rng_state.pth b/checkpoint-13200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..022101de2cbd8f5976ca60d1ba76235b5cbe5c21 --- /dev/null +++ b/checkpoint-13200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65853d8eba688cfb8801838aa8cddba4a37b866067cfbe2eb1335e92205cf0d6 +size 14511 diff --git a/checkpoint-13200/scaler.pt b/checkpoint-13200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..691faa531235897ae3c245d8e4c2d342d795dcbd --- /dev/null +++ b/checkpoint-13200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0961d88247b5b0b3e4d659805dd764b2ea62614b66a9b6bd857b1aeb7de0599a +size 557 diff --git a/checkpoint-13200/scheduler.pt b/checkpoint-13200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..078f547227ea50c19d4df2ec54b42650c96e56ad --- /dev/null +++ b/checkpoint-13200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea0a608bab19c5f6d0c9e18d30677a93977ade21a6be290b4d4bd8aa2d347ca +size 627 diff --git a/checkpoint-13200/trainer_state.json b/checkpoint-13200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8949709845fbcf13e36c168548c6bf7e6ea5f34a --- /dev/null +++ b/checkpoint-13200/trainer_state.json @@ -0,0 +1,808 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 103.12415130940835, + "global_step": 13200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + 
"epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + 
"epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + }, + { + "epoch": 54.68, + "learning_rate": 7e-05, + "loss": 4.2114, + "step": 7000 + }, + { + "epoch": 55.47, + "learning_rate": 7.1e-05, + "loss": 4.1021, + "step": 7100 + }, + { + "epoch": 56.25, + "learning_rate": 7.2e-05, + "loss": 4.0074, + "step": 7200 + }, + { + "epoch": 57.03, + "learning_rate": 7.3e-05, + "loss": 3.9346, + "step": 7300 + }, + { + "epoch": 57.81, + "learning_rate": 7.4e-05, + "loss": 3.8289, + "step": 7400 + }, + { + "epoch": 58.59, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8105, + "step": 7500 + }, + { + "epoch": 59.37, + "learning_rate": 7.6e-05, + "loss": 3.755, + "step": 7600 + }, + { + "epoch": 60.16, + "learning_rate": 7.7e-05, + "loss": 3.7105, + "step": 7700 + }, + { + "epoch": 60.93, + "learning_rate": 7.800000000000001e-05, + "loss": 3.6394, + "step": 7800 + }, + { + "epoch": 61.71, + "learning_rate": 7.900000000000001e-05, + "loss": 3.6262, + "step": 7900 + }, + { + "epoch": 62.5, + "learning_rate": 8e-05, + "loss": 3.5924, + "step": 8000 + }, + { + "epoch": 63.28, + "learning_rate": 8.1e-05, + "loss": 3.558, + "step": 8100 + }, + { + "epoch": 64.06, + "learning_rate": 8.2e-05, + "loss": 3.5255, + "step": 8200 + }, + { + "epoch": 64.84, + "learning_rate": 8.3e-05, + "loss": 3.4602, + "step": 8300 + }, + { + "epoch": 65.62, + "learning_rate": 8.4e-05, + "loss": 3.4641, + "step": 8400 + }, + { + "epoch": 66.4, + "learning_rate": 8.5e-05, + "loss": 3.435, + "step": 8500 + }, + { + "epoch": 67.19, + "learning_rate": 8.6e-05, + "loss": 3.408, + "step": 8600 + }, + { + "epoch": 67.96, + "learning_rate": 8.7e-05, + "loss": 3.3594, + "step": 8700 + }, + { + "epoch": 68.74, + "learning_rate": 8.800000000000001e-05, + "loss": 3.3593, + "step": 8800 + }, + { + "epoch": 69.53, + "learning_rate": 8.900000000000001e-05, + "loss": 3.3372, + "step": 8900 + }, + { + "epoch": 70.31, + "learning_rate": 9e-05, + "loss": 3.3217, + "step": 9000 + }, + { + "epoch": 71.09, + "learning_rate": 9.1e-05, + "loss": 3.2985, + "step": 9100 + }, + { + "epoch": 71.87, + "learning_rate": 9.200000000000001e-05, 
+ "loss": 3.2509, + "step": 9200 + }, + { + "epoch": 72.65, + "learning_rate": 9.300000000000001e-05, + "loss": 3.2584, + "step": 9300 + }, + { + "epoch": 73.43, + "learning_rate": 9.4e-05, + "loss": 3.2386, + "step": 9400 + }, + { + "epoch": 74.22, + "learning_rate": 9.5e-05, + "loss": 3.2232, + "step": 9500 + }, + { + "epoch": 74.99, + "learning_rate": 9.6e-05, + "loss": 3.1786, + "step": 9600 + }, + { + "epoch": 75.78, + "learning_rate": 9.7e-05, + "loss": 3.1855, + "step": 9700 + }, + { + "epoch": 76.56, + "learning_rate": 9.8e-05, + "loss": 3.1737, + "step": 9800 + }, + { + "epoch": 77.34, + "learning_rate": 9.900000000000001e-05, + "loss": 3.1565, + "step": 9900 + }, + { + "epoch": 78.12, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 10000 + }, + { + "epoch": 78.9, + "learning_rate": 9.904580152671757e-05, + "loss": 3.1003, + "step": 10100 + }, + { + "epoch": 79.68, + "learning_rate": 9.809160305343512e-05, + "loss": 3.1137, + "step": 10200 + }, + { + "epoch": 80.47, + "learning_rate": 9.713740458015268e-05, + "loss": 3.0958, + "step": 10300 + }, + { + "epoch": 81.25, + "learning_rate": 9.618320610687024e-05, + "loss": 3.0853, + "step": 10400 + }, + { + "epoch": 82.03, + "learning_rate": 9.522900763358779e-05, + "loss": 3.0704, + "step": 10500 + }, + { + "epoch": 82.81, + "learning_rate": 9.427480916030534e-05, + "loss": 3.03, + "step": 10600 + }, + { + "epoch": 83.59, + "learning_rate": 9.33206106870229e-05, + "loss": 3.0428, + "step": 10700 + }, + { + "epoch": 84.37, + "learning_rate": 9.236641221374047e-05, + "loss": 3.0299, + "step": 10800 + }, + { + "epoch": 85.16, + "learning_rate": 9.141221374045802e-05, + "loss": 3.0239, + "step": 10900 + }, + { + "epoch": 85.93, + "learning_rate": 9.045801526717558e-05, + "loss": 2.9818, + "step": 11000 + }, + { + "epoch": 86.71, + "learning_rate": 8.950381679389314e-05, + "loss": 2.9967, + "step": 11100 + }, + { + "epoch": 87.5, + "learning_rate": 8.854961832061069e-05, + "loss": 2.9866, + "step": 11200 + }, + { + "epoch": 88.28, + "learning_rate": 8.759541984732825e-05, + "loss": 2.9758, + "step": 11300 + }, + { + "epoch": 89.06, + "learning_rate": 8.664122137404582e-05, + "loss": 2.968, + "step": 11400 + }, + { + "epoch": 89.84, + "learning_rate": 8.568702290076335e-05, + "loss": 2.9276, + "step": 11500 + }, + { + "epoch": 90.62, + "learning_rate": 8.473282442748092e-05, + "loss": 2.9464, + "step": 11600 + }, + { + "epoch": 91.4, + "learning_rate": 8.377862595419848e-05, + "loss": 2.9342, + "step": 11700 + }, + { + "epoch": 92.19, + "learning_rate": 8.282442748091603e-05, + "loss": 2.9313, + "step": 11800 + }, + { + "epoch": 92.96, + "learning_rate": 8.187022900763359e-05, + "loss": 2.8932, + "step": 11900 + }, + { + "epoch": 93.74, + "learning_rate": 8.091603053435115e-05, + "loss": 2.9087, + "step": 12000 + }, + { + "epoch": 94.53, + "learning_rate": 7.996183206106872e-05, + "loss": 2.9054, + "step": 12100 + }, + { + "epoch": 95.31, + "learning_rate": 7.900763358778626e-05, + "loss": 2.8916, + "step": 12200 + }, + { + "epoch": 96.09, + "learning_rate": 7.805343511450383e-05, + "loss": 2.8888, + "step": 12300 + }, + { + "epoch": 96.87, + "learning_rate": 7.709923664122138e-05, + "loss": 2.8505, + "step": 12400 + }, + { + "epoch": 97.65, + "learning_rate": 7.614503816793893e-05, + "loss": 2.8729, + "step": 12500 + }, + { + "epoch": 98.43, + "learning_rate": 7.519083969465649e-05, + "loss": 2.8576, + "step": 12600 + }, + { + "epoch": 99.22, + "learning_rate": 7.423664122137405e-05, + "loss": 2.8569, + "step": 12700 + }, + { + 
"epoch": 99.99, + "learning_rate": 7.32824427480916e-05, + "loss": 2.8276, + "step": 12800 + }, + { + "epoch": 100.78, + "learning_rate": 7.232824427480916e-05, + "loss": 2.8404, + "step": 12900 + }, + { + "epoch": 101.56, + "learning_rate": 7.137404580152673e-05, + "loss": 2.8337, + "step": 13000 + }, + { + "epoch": 102.34, + "learning_rate": 7.041984732824428e-05, + "loss": 2.8285, + "step": 13100 + }, + { + "epoch": 103.12, + "learning_rate": 6.946564885496184e-05, + "loss": 2.8209, + "step": 13200 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 8.948591590637568e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-13200/training_args.bin b/checkpoint-13200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-13200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-15600/config.json b/checkpoint-15600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-15600/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-15600/optimizer.pt b/checkpoint-15600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4292ee8880803fba637373aeb12f3357e64ce72 --- /dev/null +++ b/checkpoint-15600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3e2c52e3ad73c5690c7fd3988703d0851fa21fb94faec91a09af579aac43b5 +size 883771077 diff --git a/checkpoint-15600/pytorch_model.bin b/checkpoint-15600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a38b1bc7b86fd0e0eb750acabeda3a7bd303d8f5 --- /dev/null +++ b/checkpoint-15600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cebba5a25a952644988cd41329625765366452b9a418c608d86f9668c91ab9be +size 441897977 diff --git a/checkpoint-15600/rng_state.pth b/checkpoint-15600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba6d368c1235a28ed06140915791ef8d61cd75f6 --- /dev/null +++ b/checkpoint-15600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63eefde296117e60cfadb90b3252b400980366f430405f99c807b48bd3bd38d8 +size 14511 diff --git a/checkpoint-15600/scaler.pt b/checkpoint-15600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7089615b20910a65da14f137564de3134bac9a7a --- /dev/null +++ b/checkpoint-15600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a2e3ddad874ea5cbb036b763892d0dc9047925ff82b6fb038844dd5f04a3d3 +size 557 diff 
--git a/checkpoint-15600/scheduler.pt b/checkpoint-15600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a31c14da9dabde139466dcdb9a384eb33c73f4f --- /dev/null +++ b/checkpoint-15600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d702af4a167ac36439f4e82ab03ccb8e03403b3b51b549f5b1b11906a9342808 +size 627 diff --git a/checkpoint-15600/trainer_state.json b/checkpoint-15600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9667781d16514ea032d9d0457180720e3e53071c --- /dev/null +++ b/checkpoint-15600/trainer_state.json @@ -0,0 +1,952 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 121.86905916585839, + "global_step": 15600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + "epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + 
"learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + "epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 
6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + }, + { + "epoch": 54.68, + "learning_rate": 7e-05, + "loss": 4.2114, + "step": 7000 + }, + { + "epoch": 55.47, + "learning_rate": 7.1e-05, + "loss": 4.1021, + "step": 7100 + }, + { + "epoch": 56.25, + "learning_rate": 7.2e-05, + "loss": 4.0074, + "step": 7200 + }, + { + "epoch": 57.03, + "learning_rate": 7.3e-05, + "loss": 3.9346, + "step": 7300 + }, + { + "epoch": 57.81, + "learning_rate": 7.4e-05, + "loss": 3.8289, + "step": 7400 + }, + { + "epoch": 58.59, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8105, + "step": 7500 + }, + { + "epoch": 59.37, + "learning_rate": 7.6e-05, + "loss": 3.755, + "step": 7600 + }, + { + "epoch": 60.16, + "learning_rate": 7.7e-05, + "loss": 3.7105, + "step": 7700 + }, + { + "epoch": 60.93, + "learning_rate": 7.800000000000001e-05, + "loss": 3.6394, + "step": 7800 + }, + { + "epoch": 61.71, + "learning_rate": 7.900000000000001e-05, + "loss": 3.6262, + "step": 7900 + }, + { + "epoch": 62.5, + "learning_rate": 8e-05, + "loss": 3.5924, + "step": 8000 + }, + { + "epoch": 63.28, + "learning_rate": 8.1e-05, + "loss": 3.558, + "step": 8100 + }, + { + "epoch": 64.06, + "learning_rate": 8.2e-05, + "loss": 3.5255, + "step": 8200 + }, + { + "epoch": 64.84, + "learning_rate": 8.3e-05, + "loss": 3.4602, + "step": 8300 + }, + { + "epoch": 65.62, + "learning_rate": 8.4e-05, + "loss": 3.4641, + "step": 8400 + }, + { + "epoch": 66.4, + "learning_rate": 8.5e-05, + "loss": 3.435, + "step": 8500 + }, + { + "epoch": 67.19, + "learning_rate": 8.6e-05, + "loss": 3.408, + "step": 8600 + }, + { + "epoch": 67.96, + "learning_rate": 8.7e-05, + "loss": 3.3594, + "step": 8700 + }, + { + "epoch": 68.74, + "learning_rate": 8.800000000000001e-05, + "loss": 3.3593, + "step": 8800 + }, + { + "epoch": 69.53, + "learning_rate": 8.900000000000001e-05, + "loss": 3.3372, + "step": 8900 + }, + { + "epoch": 70.31, + "learning_rate": 9e-05, + "loss": 3.3217, + "step": 9000 + }, + { + "epoch": 71.09, + "learning_rate": 9.1e-05, + "loss": 3.2985, + "step": 9100 + }, + { + "epoch": 71.87, + "learning_rate": 9.200000000000001e-05, + "loss": 3.2509, + "step": 9200 + }, + { + "epoch": 72.65, + "learning_rate": 9.300000000000001e-05, + "loss": 3.2584, + "step": 9300 + }, + { + "epoch": 73.43, + "learning_rate": 9.4e-05, + "loss": 3.2386, + "step": 9400 + }, + { + "epoch": 74.22, + "learning_rate": 9.5e-05, + "loss": 3.2232, + "step": 9500 + }, + { + "epoch": 74.99, + "learning_rate": 9.6e-05, + "loss": 3.1786, + "step": 9600 + }, + { + "epoch": 75.78, + "learning_rate": 9.7e-05, + "loss": 3.1855, + "step": 9700 + }, + { + "epoch": 76.56, + "learning_rate": 9.8e-05, + "loss": 3.1737, + "step": 9800 + }, + { + "epoch": 77.34, + "learning_rate": 9.900000000000001e-05, + "loss": 3.1565, + "step": 9900 + }, + { + "epoch": 78.12, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 10000 + }, + { + "epoch": 78.9, + "learning_rate": 9.904580152671757e-05, + "loss": 3.1003, + "step": 10100 + }, + { + "epoch": 79.68, + "learning_rate": 9.809160305343512e-05, + "loss": 3.1137, + "step": 10200 + }, + { + "epoch": 80.47, + "learning_rate": 9.713740458015268e-05, + "loss": 3.0958, + "step": 10300 + }, + { + "epoch": 81.25, + "learning_rate": 9.618320610687024e-05, + "loss": 3.0853, + "step": 10400 + }, + { + "epoch": 82.03, + "learning_rate": 9.522900763358779e-05, + "loss": 
3.0704, + "step": 10500 + }, + { + "epoch": 82.81, + "learning_rate": 9.427480916030534e-05, + "loss": 3.03, + "step": 10600 + }, + { + "epoch": 83.59, + "learning_rate": 9.33206106870229e-05, + "loss": 3.0428, + "step": 10700 + }, + { + "epoch": 84.37, + "learning_rate": 9.236641221374047e-05, + "loss": 3.0299, + "step": 10800 + }, + { + "epoch": 85.16, + "learning_rate": 9.141221374045802e-05, + "loss": 3.0239, + "step": 10900 + }, + { + "epoch": 85.93, + "learning_rate": 9.045801526717558e-05, + "loss": 2.9818, + "step": 11000 + }, + { + "epoch": 86.71, + "learning_rate": 8.950381679389314e-05, + "loss": 2.9967, + "step": 11100 + }, + { + "epoch": 87.5, + "learning_rate": 8.854961832061069e-05, + "loss": 2.9866, + "step": 11200 + }, + { + "epoch": 88.28, + "learning_rate": 8.759541984732825e-05, + "loss": 2.9758, + "step": 11300 + }, + { + "epoch": 89.06, + "learning_rate": 8.664122137404582e-05, + "loss": 2.968, + "step": 11400 + }, + { + "epoch": 89.84, + "learning_rate": 8.568702290076335e-05, + "loss": 2.9276, + "step": 11500 + }, + { + "epoch": 90.62, + "learning_rate": 8.473282442748092e-05, + "loss": 2.9464, + "step": 11600 + }, + { + "epoch": 91.4, + "learning_rate": 8.377862595419848e-05, + "loss": 2.9342, + "step": 11700 + }, + { + "epoch": 92.19, + "learning_rate": 8.282442748091603e-05, + "loss": 2.9313, + "step": 11800 + }, + { + "epoch": 92.96, + "learning_rate": 8.187022900763359e-05, + "loss": 2.8932, + "step": 11900 + }, + { + "epoch": 93.74, + "learning_rate": 8.091603053435115e-05, + "loss": 2.9087, + "step": 12000 + }, + { + "epoch": 94.53, + "learning_rate": 7.996183206106872e-05, + "loss": 2.9054, + "step": 12100 + }, + { + "epoch": 95.31, + "learning_rate": 7.900763358778626e-05, + "loss": 2.8916, + "step": 12200 + }, + { + "epoch": 96.09, + "learning_rate": 7.805343511450383e-05, + "loss": 2.8888, + "step": 12300 + }, + { + "epoch": 96.87, + "learning_rate": 7.709923664122138e-05, + "loss": 2.8505, + "step": 12400 + }, + { + "epoch": 97.65, + "learning_rate": 7.614503816793893e-05, + "loss": 2.8729, + "step": 12500 + }, + { + "epoch": 98.43, + "learning_rate": 7.519083969465649e-05, + "loss": 2.8576, + "step": 12600 + }, + { + "epoch": 99.22, + "learning_rate": 7.423664122137405e-05, + "loss": 2.8569, + "step": 12700 + }, + { + "epoch": 99.99, + "learning_rate": 7.32824427480916e-05, + "loss": 2.8276, + "step": 12800 + }, + { + "epoch": 100.78, + "learning_rate": 7.232824427480916e-05, + "loss": 2.8404, + "step": 12900 + }, + { + "epoch": 101.56, + "learning_rate": 7.137404580152673e-05, + "loss": 2.8337, + "step": 13000 + }, + { + "epoch": 102.34, + "learning_rate": 7.041984732824428e-05, + "loss": 2.8285, + "step": 13100 + }, + { + "epoch": 103.12, + "learning_rate": 6.946564885496184e-05, + "loss": 2.8209, + "step": 13200 + }, + { + "epoch": 103.9, + "learning_rate": 6.851145038167939e-05, + "loss": 2.7906, + "step": 13300 + }, + { + "epoch": 104.68, + "learning_rate": 6.755725190839695e-05, + "loss": 2.8055, + "step": 13400 + }, + { + "epoch": 105.47, + "learning_rate": 6.66030534351145e-05, + "loss": 2.8034, + "step": 13500 + }, + { + "epoch": 106.25, + "learning_rate": 6.564885496183206e-05, + "loss": 2.7957, + "step": 13600 + }, + { + "epoch": 107.03, + "learning_rate": 6.469465648854963e-05, + "loss": 2.7888, + "step": 13700 + }, + { + "epoch": 107.81, + "learning_rate": 6.374045801526718e-05, + "loss": 2.7596, + "step": 13800 + }, + { + "epoch": 108.59, + "learning_rate": 6.278625954198474e-05, + "loss": 2.7781, + "step": 13900 + }, + { + "epoch": 
109.37, + "learning_rate": 6.18320610687023e-05, + "loss": 2.7737, + "step": 14000 + }, + { + "epoch": 110.16, + "learning_rate": 6.087786259541986e-05, + "loss": 2.7692, + "step": 14100 + }, + { + "epoch": 110.93, + "learning_rate": 5.992366412213741e-05, + "loss": 2.7379, + "step": 14200 + }, + { + "epoch": 111.71, + "learning_rate": 5.897900763358779e-05, + "loss": 2.754, + "step": 14300 + }, + { + "epoch": 112.5, + "learning_rate": 5.802480916030535e-05, + "loss": 2.7503, + "step": 14400 + }, + { + "epoch": 113.28, + "learning_rate": 5.707061068702291e-05, + "loss": 2.7488, + "step": 14500 + }, + { + "epoch": 114.06, + "learning_rate": 5.6116412213740466e-05, + "loss": 2.7461, + "step": 14600 + }, + { + "epoch": 114.84, + "learning_rate": 5.5162213740458016e-05, + "loss": 2.711, + "step": 14700 + }, + { + "epoch": 115.62, + "learning_rate": 5.420801526717557e-05, + "loss": 2.7314, + "step": 14800 + }, + { + "epoch": 116.4, + "learning_rate": 5.325381679389313e-05, + "loss": 2.7222, + "step": 14900 + }, + { + "epoch": 117.19, + "learning_rate": 5.229961832061069e-05, + "loss": 2.7267, + "step": 15000 + }, + { + "epoch": 117.96, + "learning_rate": 5.134541984732825e-05, + "loss": 2.6951, + "step": 15100 + }, + { + "epoch": 118.74, + "learning_rate": 5.03912213740458e-05, + "loss": 2.7121, + "step": 15200 + }, + { + "epoch": 119.53, + "learning_rate": 4.9437022900763366e-05, + "loss": 2.7082, + "step": 15300 + }, + { + "epoch": 120.31, + "learning_rate": 4.8482824427480915e-05, + "loss": 2.7073, + "step": 15400 + }, + { + "epoch": 121.09, + "learning_rate": 4.752862595419848e-05, + "loss": 2.7014, + "step": 15500 + }, + { + "epoch": 121.87, + "learning_rate": 4.6574427480916034e-05, + "loss": 2.6753, + "step": 15600 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 1.0575225393512448e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15600/training_args.bin b/checkpoint-15600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-15600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-18000/config.json b/checkpoint-18000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-18000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-18000/optimizer.pt b/checkpoint-18000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..320a06677e3d24b04c35bac4c204ef66da5a2972 --- /dev/null +++ b/checkpoint-18000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6dfa9c28077c99d33b65f5ddfbc21d4cb5871a1286d2790a94dd3f7535345c55 +size 883771077 diff --git a/checkpoint-18000/pytorch_model.bin b/checkpoint-18000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cdacf2f7306c031f104a051779e925549bd50cdf --- /dev/null +++ b/checkpoint-18000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc69590fd21b64297c3aa0fa3a6c1911177fc720e054a4f263184ae15e5ea8f +size 441897977 diff --git a/checkpoint-18000/rng_state.pth b/checkpoint-18000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..566229d7902f80fd1b14fffcab092a7d7d2a56b5 --- /dev/null +++ b/checkpoint-18000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd2296218597355b0acd06d0a718241ed040d575334015384fa29a54ab36abac +size 14511 diff --git a/checkpoint-18000/scaler.pt b/checkpoint-18000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfeaab220d5874a376d98b3e932b14c777fb3bb3 --- /dev/null +++ b/checkpoint-18000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d90cd2d5e2b92e6c934619c0740e3cb9ddb00f99c21eae5a2b88210b3ba9a6 +size 557 diff --git a/checkpoint-18000/scheduler.pt b/checkpoint-18000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdc9d3c1147297ad4007f9455abf18f685618fcc --- /dev/null +++ b/checkpoint-18000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8376a47017036f00282ed5a674372dba78d59d5b7cbf65e5098ea8802090de34 +size 627 diff --git a/checkpoint-18000/trainer_state.json b/checkpoint-18000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a80b6f042bc8bf0fcb2c02eba7563524dda852 --- /dev/null +++ b/checkpoint-18000/trainer_state.json @@ -0,0 +1,1096 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 140.6207565470417, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + 
"epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + 
"epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + }, + { + "epoch": 54.68, + "learning_rate": 7e-05, + "loss": 4.2114, + "step": 7000 + }, + { + "epoch": 55.47, + "learning_rate": 7.1e-05, + "loss": 4.1021, + "step": 7100 + }, + { + "epoch": 56.25, + "learning_rate": 7.2e-05, + "loss": 4.0074, + "step": 7200 + }, + { + "epoch": 57.03, + "learning_rate": 7.3e-05, + "loss": 3.9346, + "step": 7300 + }, + { + "epoch": 57.81, + "learning_rate": 7.4e-05, + "loss": 3.8289, + "step": 7400 + }, + { + "epoch": 58.59, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8105, + "step": 7500 + }, + { + "epoch": 59.37, + "learning_rate": 7.6e-05, + "loss": 3.755, + "step": 7600 + }, + { + "epoch": 60.16, + "learning_rate": 7.7e-05, + "loss": 3.7105, + "step": 7700 + }, + { + "epoch": 60.93, + "learning_rate": 7.800000000000001e-05, + "loss": 3.6394, + "step": 7800 + }, + { + "epoch": 61.71, + "learning_rate": 7.900000000000001e-05, + "loss": 3.6262, + "step": 7900 + }, + { + "epoch": 62.5, + "learning_rate": 8e-05, + "loss": 3.5924, + "step": 8000 + }, + { + "epoch": 63.28, + "learning_rate": 8.1e-05, + "loss": 3.558, + "step": 8100 + }, + { + "epoch": 64.06, + "learning_rate": 8.2e-05, + "loss": 3.5255, + "step": 8200 + }, + { + "epoch": 64.84, + "learning_rate": 8.3e-05, + "loss": 3.4602, + "step": 8300 + }, + { + "epoch": 65.62, + "learning_rate": 8.4e-05, + "loss": 3.4641, + "step": 8400 + }, + { + "epoch": 66.4, + "learning_rate": 8.5e-05, + "loss": 3.435, + "step": 8500 + }, + { + "epoch": 67.19, + "learning_rate": 8.6e-05, + "loss": 3.408, + "step": 8600 + }, + { + "epoch": 67.96, + "learning_rate": 8.7e-05, + "loss": 3.3594, + "step": 8700 + }, + { + "epoch": 68.74, + "learning_rate": 8.800000000000001e-05, + "loss": 3.3593, + "step": 8800 + }, + { + "epoch": 69.53, + "learning_rate": 8.900000000000001e-05, + "loss": 3.3372, + "step": 8900 + }, + { + "epoch": 70.31, + "learning_rate": 9e-05, + "loss": 3.3217, + "step": 9000 + }, + { + "epoch": 71.09, + "learning_rate": 9.1e-05, + "loss": 3.2985, + "step": 9100 + }, + { + "epoch": 71.87, + "learning_rate": 9.200000000000001e-05, 
+ "loss": 3.2509, + "step": 9200 + }, + { + "epoch": 72.65, + "learning_rate": 9.300000000000001e-05, + "loss": 3.2584, + "step": 9300 + }, + { + "epoch": 73.43, + "learning_rate": 9.4e-05, + "loss": 3.2386, + "step": 9400 + }, + { + "epoch": 74.22, + "learning_rate": 9.5e-05, + "loss": 3.2232, + "step": 9500 + }, + { + "epoch": 74.99, + "learning_rate": 9.6e-05, + "loss": 3.1786, + "step": 9600 + }, + { + "epoch": 75.78, + "learning_rate": 9.7e-05, + "loss": 3.1855, + "step": 9700 + }, + { + "epoch": 76.56, + "learning_rate": 9.8e-05, + "loss": 3.1737, + "step": 9800 + }, + { + "epoch": 77.34, + "learning_rate": 9.900000000000001e-05, + "loss": 3.1565, + "step": 9900 + }, + { + "epoch": 78.12, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 10000 + }, + { + "epoch": 78.9, + "learning_rate": 9.904580152671757e-05, + "loss": 3.1003, + "step": 10100 + }, + { + "epoch": 79.68, + "learning_rate": 9.809160305343512e-05, + "loss": 3.1137, + "step": 10200 + }, + { + "epoch": 80.47, + "learning_rate": 9.713740458015268e-05, + "loss": 3.0958, + "step": 10300 + }, + { + "epoch": 81.25, + "learning_rate": 9.618320610687024e-05, + "loss": 3.0853, + "step": 10400 + }, + { + "epoch": 82.03, + "learning_rate": 9.522900763358779e-05, + "loss": 3.0704, + "step": 10500 + }, + { + "epoch": 82.81, + "learning_rate": 9.427480916030534e-05, + "loss": 3.03, + "step": 10600 + }, + { + "epoch": 83.59, + "learning_rate": 9.33206106870229e-05, + "loss": 3.0428, + "step": 10700 + }, + { + "epoch": 84.37, + "learning_rate": 9.236641221374047e-05, + "loss": 3.0299, + "step": 10800 + }, + { + "epoch": 85.16, + "learning_rate": 9.141221374045802e-05, + "loss": 3.0239, + "step": 10900 + }, + { + "epoch": 85.93, + "learning_rate": 9.045801526717558e-05, + "loss": 2.9818, + "step": 11000 + }, + { + "epoch": 86.71, + "learning_rate": 8.950381679389314e-05, + "loss": 2.9967, + "step": 11100 + }, + { + "epoch": 87.5, + "learning_rate": 8.854961832061069e-05, + "loss": 2.9866, + "step": 11200 + }, + { + "epoch": 88.28, + "learning_rate": 8.759541984732825e-05, + "loss": 2.9758, + "step": 11300 + }, + { + "epoch": 89.06, + "learning_rate": 8.664122137404582e-05, + "loss": 2.968, + "step": 11400 + }, + { + "epoch": 89.84, + "learning_rate": 8.568702290076335e-05, + "loss": 2.9276, + "step": 11500 + }, + { + "epoch": 90.62, + "learning_rate": 8.473282442748092e-05, + "loss": 2.9464, + "step": 11600 + }, + { + "epoch": 91.4, + "learning_rate": 8.377862595419848e-05, + "loss": 2.9342, + "step": 11700 + }, + { + "epoch": 92.19, + "learning_rate": 8.282442748091603e-05, + "loss": 2.9313, + "step": 11800 + }, + { + "epoch": 92.96, + "learning_rate": 8.187022900763359e-05, + "loss": 2.8932, + "step": 11900 + }, + { + "epoch": 93.74, + "learning_rate": 8.091603053435115e-05, + "loss": 2.9087, + "step": 12000 + }, + { + "epoch": 94.53, + "learning_rate": 7.996183206106872e-05, + "loss": 2.9054, + "step": 12100 + }, + { + "epoch": 95.31, + "learning_rate": 7.900763358778626e-05, + "loss": 2.8916, + "step": 12200 + }, + { + "epoch": 96.09, + "learning_rate": 7.805343511450383e-05, + "loss": 2.8888, + "step": 12300 + }, + { + "epoch": 96.87, + "learning_rate": 7.709923664122138e-05, + "loss": 2.8505, + "step": 12400 + }, + { + "epoch": 97.65, + "learning_rate": 7.614503816793893e-05, + "loss": 2.8729, + "step": 12500 + }, + { + "epoch": 98.43, + "learning_rate": 7.519083969465649e-05, + "loss": 2.8576, + "step": 12600 + }, + { + "epoch": 99.22, + "learning_rate": 7.423664122137405e-05, + "loss": 2.8569, + "step": 12700 + }, + { + 
"epoch": 99.99, + "learning_rate": 7.32824427480916e-05, + "loss": 2.8276, + "step": 12800 + }, + { + "epoch": 100.78, + "learning_rate": 7.232824427480916e-05, + "loss": 2.8404, + "step": 12900 + }, + { + "epoch": 101.56, + "learning_rate": 7.137404580152673e-05, + "loss": 2.8337, + "step": 13000 + }, + { + "epoch": 102.34, + "learning_rate": 7.041984732824428e-05, + "loss": 2.8285, + "step": 13100 + }, + { + "epoch": 103.12, + "learning_rate": 6.946564885496184e-05, + "loss": 2.8209, + "step": 13200 + }, + { + "epoch": 103.9, + "learning_rate": 6.851145038167939e-05, + "loss": 2.7906, + "step": 13300 + }, + { + "epoch": 104.68, + "learning_rate": 6.755725190839695e-05, + "loss": 2.8055, + "step": 13400 + }, + { + "epoch": 105.47, + "learning_rate": 6.66030534351145e-05, + "loss": 2.8034, + "step": 13500 + }, + { + "epoch": 106.25, + "learning_rate": 6.564885496183206e-05, + "loss": 2.7957, + "step": 13600 + }, + { + "epoch": 107.03, + "learning_rate": 6.469465648854963e-05, + "loss": 2.7888, + "step": 13700 + }, + { + "epoch": 107.81, + "learning_rate": 6.374045801526718e-05, + "loss": 2.7596, + "step": 13800 + }, + { + "epoch": 108.59, + "learning_rate": 6.278625954198474e-05, + "loss": 2.7781, + "step": 13900 + }, + { + "epoch": 109.37, + "learning_rate": 6.18320610687023e-05, + "loss": 2.7737, + "step": 14000 + }, + { + "epoch": 110.16, + "learning_rate": 6.087786259541986e-05, + "loss": 2.7692, + "step": 14100 + }, + { + "epoch": 110.93, + "learning_rate": 5.992366412213741e-05, + "loss": 2.7379, + "step": 14200 + }, + { + "epoch": 111.71, + "learning_rate": 5.897900763358779e-05, + "loss": 2.754, + "step": 14300 + }, + { + "epoch": 112.5, + "learning_rate": 5.802480916030535e-05, + "loss": 2.7503, + "step": 14400 + }, + { + "epoch": 113.28, + "learning_rate": 5.707061068702291e-05, + "loss": 2.7488, + "step": 14500 + }, + { + "epoch": 114.06, + "learning_rate": 5.6116412213740466e-05, + "loss": 2.7461, + "step": 14600 + }, + { + "epoch": 114.84, + "learning_rate": 5.5162213740458016e-05, + "loss": 2.711, + "step": 14700 + }, + { + "epoch": 115.62, + "learning_rate": 5.420801526717557e-05, + "loss": 2.7314, + "step": 14800 + }, + { + "epoch": 116.4, + "learning_rate": 5.325381679389313e-05, + "loss": 2.7222, + "step": 14900 + }, + { + "epoch": 117.19, + "learning_rate": 5.229961832061069e-05, + "loss": 2.7267, + "step": 15000 + }, + { + "epoch": 117.96, + "learning_rate": 5.134541984732825e-05, + "loss": 2.6951, + "step": 15100 + }, + { + "epoch": 118.74, + "learning_rate": 5.03912213740458e-05, + "loss": 2.7121, + "step": 15200 + }, + { + "epoch": 119.53, + "learning_rate": 4.9437022900763366e-05, + "loss": 2.7082, + "step": 15300 + }, + { + "epoch": 120.31, + "learning_rate": 4.8482824427480915e-05, + "loss": 2.7073, + "step": 15400 + }, + { + "epoch": 121.09, + "learning_rate": 4.752862595419848e-05, + "loss": 2.7014, + "step": 15500 + }, + { + "epoch": 121.87, + "learning_rate": 4.6574427480916034e-05, + "loss": 2.6753, + "step": 15600 + }, + { + "epoch": 122.65, + "learning_rate": 4.562022900763359e-05, + "loss": 2.6895, + "step": 15700 + }, + { + "epoch": 123.43, + "learning_rate": 4.466603053435115e-05, + "loss": 2.693, + "step": 15800 + }, + { + "epoch": 124.22, + "learning_rate": 4.37118320610687e-05, + "loss": 2.6863, + "step": 15900 + }, + { + "epoch": 124.99, + "learning_rate": 4.275763358778626e-05, + "loss": 2.6588, + "step": 16000 + }, + { + "epoch": 125.78, + "learning_rate": 4.180343511450382e-05, + "loss": 2.6755, + "step": 16100 + }, + { + "epoch": 126.56, + 
"learning_rate": 4.084923664122138e-05, + "loss": 2.6724, + "step": 16200 + }, + { + "epoch": 127.34, + "learning_rate": 3.989503816793893e-05, + "loss": 2.669, + "step": 16300 + }, + { + "epoch": 128.12, + "learning_rate": 3.894083969465649e-05, + "loss": 2.6671, + "step": 16400 + }, + { + "epoch": 128.9, + "learning_rate": 3.7986641221374045e-05, + "loss": 2.6429, + "step": 16500 + }, + { + "epoch": 129.68, + "learning_rate": 3.703244274809161e-05, + "loss": 2.6615, + "step": 16600 + }, + { + "epoch": 130.47, + "learning_rate": 3.6078244274809164e-05, + "loss": 2.6553, + "step": 16700 + }, + { + "epoch": 131.25, + "learning_rate": 3.512404580152672e-05, + "loss": 2.6541, + "step": 16800 + }, + { + "epoch": 132.03, + "learning_rate": 3.4169847328244276e-05, + "loss": 2.6543, + "step": 16900 + }, + { + "epoch": 132.81, + "learning_rate": 3.321564885496183e-05, + "loss": 2.6242, + "step": 17000 + }, + { + "epoch": 133.59, + "learning_rate": 3.2261450381679395e-05, + "loss": 2.6412, + "step": 17100 + }, + { + "epoch": 134.37, + "learning_rate": 3.130725190839695e-05, + "loss": 2.6423, + "step": 17200 + }, + { + "epoch": 135.16, + "learning_rate": 3.0353053435114503e-05, + "loss": 2.6372, + "step": 17300 + }, + { + "epoch": 135.93, + "learning_rate": 2.9398854961832063e-05, + "loss": 2.6107, + "step": 17400 + }, + { + "epoch": 136.71, + "learning_rate": 2.844465648854962e-05, + "loss": 2.6298, + "step": 17500 + }, + { + "epoch": 137.5, + "learning_rate": 2.7490458015267178e-05, + "loss": 2.6265, + "step": 17600 + }, + { + "epoch": 138.28, + "learning_rate": 2.6545801526717557e-05, + "loss": 2.6274, + "step": 17700 + }, + { + "epoch": 139.06, + "learning_rate": 2.5591603053435116e-05, + "loss": 2.626, + "step": 17800 + }, + { + "epoch": 139.84, + "learning_rate": 2.4637404580152672e-05, + "loss": 2.6, + "step": 17900 + }, + { + "epoch": 140.62, + "learning_rate": 2.368320610687023e-05, + "loss": 2.6194, + "step": 18000 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 1.220238561509376e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18000/training_args.bin b/checkpoint-18000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-18000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-20000/config.json b/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-20000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-20000/optimizer.pt b/checkpoint-20000/optimizer.pt new file mode 100644 index 
0000000000000000000000000000000000000000..a5990e417e2c646d1d135af36bdf3ae3e2761f5f --- /dev/null +++ b/checkpoint-20000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72056f8aab17063872be49b83d736ffdf2934e4b3af70af70f5d2114e10d94f3 +size 883771077 diff --git a/checkpoint-20000/pytorch_model.bin b/checkpoint-20000/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..713beb5a2ab83f384d019b18c2c94cc9dba16cfe --- /dev/null +++ b/checkpoint-20000/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e03675839d8acc7188b3941d8a6a5da29db20e7d5e3d76c0c1e8891a1630e5b +size 441897977 diff --git a/checkpoint-20000/rng_state.pth b/checkpoint-20000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f7547d3e7b3727b0024e856d8ae7826048f8146e --- /dev/null +++ b/checkpoint-20000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4335a8157d5cdbb8e82c8726feef433b662b7e4743daba3cf3e93980c91f88ef +size 14511 diff --git a/checkpoint-20000/scaler.pt b/checkpoint-20000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb3dd26f8fc8ec2438e099bccb2ee92d7696a9ac --- /dev/null +++ b/checkpoint-20000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca93c9b6da6a63f8168493043a49578a48495785a70fd3d11703a732fa0f8e96 +size 557 diff --git a/checkpoint-20000/scheduler.pt b/checkpoint-20000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a08c582e59632cbc74e451daf964a0eacfdd28a --- /dev/null +++ b/checkpoint-20000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e815534a0b7ece4c95dc1b5e50be186ea45fccf76c290e2008d4dbc2b8aea43 +size 627 diff --git a/checkpoint-20000/trainer_state.json b/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..01e9188d18505d6fa2eaaf8310921f07e8b63f75 --- /dev/null +++ b/checkpoint-20000/trainer_state.json @@ -0,0 +1,1216 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 156.2483026188167, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + 
"epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + "epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, 
+ "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + "epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + }, + { + "epoch": 54.68, + "learning_rate": 7e-05, + "loss": 4.2114, + "step": 7000 + }, + { + "epoch": 55.47, + "learning_rate": 7.1e-05, + "loss": 4.1021, + "step": 7100 + }, + { + "epoch": 56.25, + "learning_rate": 7.2e-05, + "loss": 4.0074, + "step": 7200 + }, + { + "epoch": 57.03, + "learning_rate": 7.3e-05, + "loss": 3.9346, + "step": 7300 + }, + { + "epoch": 57.81, + "learning_rate": 7.4e-05, + "loss": 3.8289, + "step": 7400 + }, + { + "epoch": 58.59, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8105, + "step": 7500 + }, + { + "epoch": 59.37, + "learning_rate": 7.6e-05, + "loss": 3.755, + "step": 7600 + }, + { + "epoch": 60.16, + "learning_rate": 7.7e-05, + "loss": 3.7105, + "step": 7700 + }, + { + "epoch": 60.93, + "learning_rate": 7.800000000000001e-05, + "loss": 3.6394, + "step": 7800 + }, + { + "epoch": 61.71, + "learning_rate": 7.900000000000001e-05, + "loss": 3.6262, + "step": 7900 + }, + { + "epoch": 62.5, + "learning_rate": 8e-05, + "loss": 3.5924, + "step": 8000 + }, + { + "epoch": 63.28, + "learning_rate": 8.1e-05, + "loss": 3.558, + "step": 8100 + }, + { + "epoch": 64.06, + "learning_rate": 8.2e-05, + "loss": 3.5255, + "step": 8200 + }, + { + "epoch": 64.84, + "learning_rate": 8.3e-05, + "loss": 3.4602, + "step": 8300 + }, + { + "epoch": 65.62, + "learning_rate": 8.4e-05, + "loss": 3.4641, + "step": 8400 + }, + { + "epoch": 66.4, + "learning_rate": 8.5e-05, + "loss": 3.435, + "step": 8500 + }, + { + "epoch": 67.19, + "learning_rate": 8.6e-05, + "loss": 3.408, + "step": 8600 + }, + { + "epoch": 67.96, + "learning_rate": 8.7e-05, + "loss": 3.3594, + "step": 8700 + }, + { + "epoch": 68.74, + "learning_rate": 8.800000000000001e-05, + "loss": 3.3593, + "step": 8800 + }, + { + "epoch": 69.53, + "learning_rate": 8.900000000000001e-05, + "loss": 3.3372, + "step": 8900 + }, + { + "epoch": 70.31, + "learning_rate": 9e-05, + 
"loss": 3.3217, + "step": 9000 + }, + { + "epoch": 71.09, + "learning_rate": 9.1e-05, + "loss": 3.2985, + "step": 9100 + }, + { + "epoch": 71.87, + "learning_rate": 9.200000000000001e-05, + "loss": 3.2509, + "step": 9200 + }, + { + "epoch": 72.65, + "learning_rate": 9.300000000000001e-05, + "loss": 3.2584, + "step": 9300 + }, + { + "epoch": 73.43, + "learning_rate": 9.4e-05, + "loss": 3.2386, + "step": 9400 + }, + { + "epoch": 74.22, + "learning_rate": 9.5e-05, + "loss": 3.2232, + "step": 9500 + }, + { + "epoch": 74.99, + "learning_rate": 9.6e-05, + "loss": 3.1786, + "step": 9600 + }, + { + "epoch": 75.78, + "learning_rate": 9.7e-05, + "loss": 3.1855, + "step": 9700 + }, + { + "epoch": 76.56, + "learning_rate": 9.8e-05, + "loss": 3.1737, + "step": 9800 + }, + { + "epoch": 77.34, + "learning_rate": 9.900000000000001e-05, + "loss": 3.1565, + "step": 9900 + }, + { + "epoch": 78.12, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 10000 + }, + { + "epoch": 78.9, + "learning_rate": 9.904580152671757e-05, + "loss": 3.1003, + "step": 10100 + }, + { + "epoch": 79.68, + "learning_rate": 9.809160305343512e-05, + "loss": 3.1137, + "step": 10200 + }, + { + "epoch": 80.47, + "learning_rate": 9.713740458015268e-05, + "loss": 3.0958, + "step": 10300 + }, + { + "epoch": 81.25, + "learning_rate": 9.618320610687024e-05, + "loss": 3.0853, + "step": 10400 + }, + { + "epoch": 82.03, + "learning_rate": 9.522900763358779e-05, + "loss": 3.0704, + "step": 10500 + }, + { + "epoch": 82.81, + "learning_rate": 9.427480916030534e-05, + "loss": 3.03, + "step": 10600 + }, + { + "epoch": 83.59, + "learning_rate": 9.33206106870229e-05, + "loss": 3.0428, + "step": 10700 + }, + { + "epoch": 84.37, + "learning_rate": 9.236641221374047e-05, + "loss": 3.0299, + "step": 10800 + }, + { + "epoch": 85.16, + "learning_rate": 9.141221374045802e-05, + "loss": 3.0239, + "step": 10900 + }, + { + "epoch": 85.93, + "learning_rate": 9.045801526717558e-05, + "loss": 2.9818, + "step": 11000 + }, + { + "epoch": 86.71, + "learning_rate": 8.950381679389314e-05, + "loss": 2.9967, + "step": 11100 + }, + { + "epoch": 87.5, + "learning_rate": 8.854961832061069e-05, + "loss": 2.9866, + "step": 11200 + }, + { + "epoch": 88.28, + "learning_rate": 8.759541984732825e-05, + "loss": 2.9758, + "step": 11300 + }, + { + "epoch": 89.06, + "learning_rate": 8.664122137404582e-05, + "loss": 2.968, + "step": 11400 + }, + { + "epoch": 89.84, + "learning_rate": 8.568702290076335e-05, + "loss": 2.9276, + "step": 11500 + }, + { + "epoch": 90.62, + "learning_rate": 8.473282442748092e-05, + "loss": 2.9464, + "step": 11600 + }, + { + "epoch": 91.4, + "learning_rate": 8.377862595419848e-05, + "loss": 2.9342, + "step": 11700 + }, + { + "epoch": 92.19, + "learning_rate": 8.282442748091603e-05, + "loss": 2.9313, + "step": 11800 + }, + { + "epoch": 92.96, + "learning_rate": 8.187022900763359e-05, + "loss": 2.8932, + "step": 11900 + }, + { + "epoch": 93.74, + "learning_rate": 8.091603053435115e-05, + "loss": 2.9087, + "step": 12000 + }, + { + "epoch": 94.53, + "learning_rate": 7.996183206106872e-05, + "loss": 2.9054, + "step": 12100 + }, + { + "epoch": 95.31, + "learning_rate": 7.900763358778626e-05, + "loss": 2.8916, + "step": 12200 + }, + { + "epoch": 96.09, + "learning_rate": 7.805343511450383e-05, + "loss": 2.8888, + "step": 12300 + }, + { + "epoch": 96.87, + "learning_rate": 7.709923664122138e-05, + "loss": 2.8505, + "step": 12400 + }, + { + "epoch": 97.65, + "learning_rate": 7.614503816793893e-05, + "loss": 2.8729, + "step": 12500 + }, + { + "epoch": 98.43, + 
"learning_rate": 7.519083969465649e-05, + "loss": 2.8576, + "step": 12600 + }, + { + "epoch": 99.22, + "learning_rate": 7.423664122137405e-05, + "loss": 2.8569, + "step": 12700 + }, + { + "epoch": 99.99, + "learning_rate": 7.32824427480916e-05, + "loss": 2.8276, + "step": 12800 + }, + { + "epoch": 100.78, + "learning_rate": 7.232824427480916e-05, + "loss": 2.8404, + "step": 12900 + }, + { + "epoch": 101.56, + "learning_rate": 7.137404580152673e-05, + "loss": 2.8337, + "step": 13000 + }, + { + "epoch": 102.34, + "learning_rate": 7.041984732824428e-05, + "loss": 2.8285, + "step": 13100 + }, + { + "epoch": 103.12, + "learning_rate": 6.946564885496184e-05, + "loss": 2.8209, + "step": 13200 + }, + { + "epoch": 103.9, + "learning_rate": 6.851145038167939e-05, + "loss": 2.7906, + "step": 13300 + }, + { + "epoch": 104.68, + "learning_rate": 6.755725190839695e-05, + "loss": 2.8055, + "step": 13400 + }, + { + "epoch": 105.47, + "learning_rate": 6.66030534351145e-05, + "loss": 2.8034, + "step": 13500 + }, + { + "epoch": 106.25, + "learning_rate": 6.564885496183206e-05, + "loss": 2.7957, + "step": 13600 + }, + { + "epoch": 107.03, + "learning_rate": 6.469465648854963e-05, + "loss": 2.7888, + "step": 13700 + }, + { + "epoch": 107.81, + "learning_rate": 6.374045801526718e-05, + "loss": 2.7596, + "step": 13800 + }, + { + "epoch": 108.59, + "learning_rate": 6.278625954198474e-05, + "loss": 2.7781, + "step": 13900 + }, + { + "epoch": 109.37, + "learning_rate": 6.18320610687023e-05, + "loss": 2.7737, + "step": 14000 + }, + { + "epoch": 110.16, + "learning_rate": 6.087786259541986e-05, + "loss": 2.7692, + "step": 14100 + }, + { + "epoch": 110.93, + "learning_rate": 5.992366412213741e-05, + "loss": 2.7379, + "step": 14200 + }, + { + "epoch": 111.71, + "learning_rate": 5.897900763358779e-05, + "loss": 2.754, + "step": 14300 + }, + { + "epoch": 112.5, + "learning_rate": 5.802480916030535e-05, + "loss": 2.7503, + "step": 14400 + }, + { + "epoch": 113.28, + "learning_rate": 5.707061068702291e-05, + "loss": 2.7488, + "step": 14500 + }, + { + "epoch": 114.06, + "learning_rate": 5.6116412213740466e-05, + "loss": 2.7461, + "step": 14600 + }, + { + "epoch": 114.84, + "learning_rate": 5.5162213740458016e-05, + "loss": 2.711, + "step": 14700 + }, + { + "epoch": 115.62, + "learning_rate": 5.420801526717557e-05, + "loss": 2.7314, + "step": 14800 + }, + { + "epoch": 116.4, + "learning_rate": 5.325381679389313e-05, + "loss": 2.7222, + "step": 14900 + }, + { + "epoch": 117.19, + "learning_rate": 5.229961832061069e-05, + "loss": 2.7267, + "step": 15000 + }, + { + "epoch": 117.96, + "learning_rate": 5.134541984732825e-05, + "loss": 2.6951, + "step": 15100 + }, + { + "epoch": 118.74, + "learning_rate": 5.03912213740458e-05, + "loss": 2.7121, + "step": 15200 + }, + { + "epoch": 119.53, + "learning_rate": 4.9437022900763366e-05, + "loss": 2.7082, + "step": 15300 + }, + { + "epoch": 120.31, + "learning_rate": 4.8482824427480915e-05, + "loss": 2.7073, + "step": 15400 + }, + { + "epoch": 121.09, + "learning_rate": 4.752862595419848e-05, + "loss": 2.7014, + "step": 15500 + }, + { + "epoch": 121.87, + "learning_rate": 4.6574427480916034e-05, + "loss": 2.6753, + "step": 15600 + }, + { + "epoch": 122.65, + "learning_rate": 4.562022900763359e-05, + "loss": 2.6895, + "step": 15700 + }, + { + "epoch": 123.43, + "learning_rate": 4.466603053435115e-05, + "loss": 2.693, + "step": 15800 + }, + { + "epoch": 124.22, + "learning_rate": 4.37118320610687e-05, + "loss": 2.6863, + "step": 15900 + }, + { + "epoch": 124.99, + "learning_rate": 
4.275763358778626e-05, + "loss": 2.6588, + "step": 16000 + }, + { + "epoch": 125.78, + "learning_rate": 4.180343511450382e-05, + "loss": 2.6755, + "step": 16100 + }, + { + "epoch": 126.56, + "learning_rate": 4.084923664122138e-05, + "loss": 2.6724, + "step": 16200 + }, + { + "epoch": 127.34, + "learning_rate": 3.989503816793893e-05, + "loss": 2.669, + "step": 16300 + }, + { + "epoch": 128.12, + "learning_rate": 3.894083969465649e-05, + "loss": 2.6671, + "step": 16400 + }, + { + "epoch": 128.9, + "learning_rate": 3.7986641221374045e-05, + "loss": 2.6429, + "step": 16500 + }, + { + "epoch": 129.68, + "learning_rate": 3.703244274809161e-05, + "loss": 2.6615, + "step": 16600 + }, + { + "epoch": 130.47, + "learning_rate": 3.6078244274809164e-05, + "loss": 2.6553, + "step": 16700 + }, + { + "epoch": 131.25, + "learning_rate": 3.512404580152672e-05, + "loss": 2.6541, + "step": 16800 + }, + { + "epoch": 132.03, + "learning_rate": 3.4169847328244276e-05, + "loss": 2.6543, + "step": 16900 + }, + { + "epoch": 132.81, + "learning_rate": 3.321564885496183e-05, + "loss": 2.6242, + "step": 17000 + }, + { + "epoch": 133.59, + "learning_rate": 3.2261450381679395e-05, + "loss": 2.6412, + "step": 17100 + }, + { + "epoch": 134.37, + "learning_rate": 3.130725190839695e-05, + "loss": 2.6423, + "step": 17200 + }, + { + "epoch": 135.16, + "learning_rate": 3.0353053435114503e-05, + "loss": 2.6372, + "step": 17300 + }, + { + "epoch": 135.93, + "learning_rate": 2.9398854961832063e-05, + "loss": 2.6107, + "step": 17400 + }, + { + "epoch": 136.71, + "learning_rate": 2.844465648854962e-05, + "loss": 2.6298, + "step": 17500 + }, + { + "epoch": 137.5, + "learning_rate": 2.7490458015267178e-05, + "loss": 2.6265, + "step": 17600 + }, + { + "epoch": 138.28, + "learning_rate": 2.6545801526717557e-05, + "loss": 2.6274, + "step": 17700 + }, + { + "epoch": 139.06, + "learning_rate": 2.5591603053435116e-05, + "loss": 2.626, + "step": 17800 + }, + { + "epoch": 139.84, + "learning_rate": 2.4637404580152672e-05, + "loss": 2.6, + "step": 17900 + }, + { + "epoch": 140.62, + "learning_rate": 2.368320610687023e-05, + "loss": 2.6194, + "step": 18000 + }, + { + "epoch": 141.4, + "learning_rate": 2.2729007633587788e-05, + "loss": 2.6174, + "step": 18100 + }, + { + "epoch": 142.19, + "learning_rate": 2.1774809160305344e-05, + "loss": 2.6168, + "step": 18200 + }, + { + "epoch": 142.96, + "learning_rate": 2.08206106870229e-05, + "loss": 2.5903, + "step": 18300 + }, + { + "epoch": 143.74, + "learning_rate": 1.986641221374046e-05, + "loss": 2.6097, + "step": 18400 + }, + { + "epoch": 144.53, + "learning_rate": 1.8912213740458016e-05, + "loss": 2.6107, + "step": 18500 + }, + { + "epoch": 145.31, + "learning_rate": 1.7958015267175575e-05, + "loss": 2.6078, + "step": 18600 + }, + { + "epoch": 146.09, + "learning_rate": 1.7003816793893128e-05, + "loss": 2.6033, + "step": 18700 + }, + { + "epoch": 146.87, + "learning_rate": 1.6049618320610687e-05, + "loss": 2.5791, + "step": 18800 + }, + { + "epoch": 147.65, + "learning_rate": 1.5095419847328246e-05, + "loss": 2.6027, + "step": 18900 + }, + { + "epoch": 148.43, + "learning_rate": 1.41412213740458e-05, + "loss": 2.597, + "step": 19000 + }, + { + "epoch": 149.22, + "learning_rate": 1.318702290076336e-05, + "loss": 2.5998, + "step": 19100 + }, + { + "epoch": 149.99, + "learning_rate": 1.2232824427480916e-05, + "loss": 2.5747, + "step": 19200 + }, + { + "epoch": 150.78, + "learning_rate": 1.1278625954198474e-05, + "loss": 2.5939, + "step": 19300 + }, + { + "epoch": 151.56, + "learning_rate": 
1.0324427480916032e-05, + "loss": 2.5941, + "step": 19400 + }, + { + "epoch": 152.34, + "learning_rate": 9.370229007633588e-06, + "loss": 2.5906, + "step": 19500 + }, + { + "epoch": 153.12, + "learning_rate": 8.416030534351146e-06, + "loss": 2.593, + "step": 19600 + }, + { + "epoch": 153.9, + "learning_rate": 7.4618320610687024e-06, + "loss": 2.5669, + "step": 19700 + }, + { + "epoch": 154.68, + "learning_rate": 6.50763358778626e-06, + "loss": 2.5849, + "step": 19800 + }, + { + "epoch": 155.47, + "learning_rate": 5.553435114503817e-06, + "loss": 2.584, + "step": 19900 + }, + { + "epoch": 156.25, + "learning_rate": 4.599236641221375e-06, + "loss": 2.5826, + "step": 20000 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 1.3558440202862592e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-20000/training_args.bin b/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-3100/config.json b/checkpoint-3100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-3100/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-3100/optimizer.pt b/checkpoint-3100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d566d1d2f6236f8a61a6c9c441354bb77c624c9 --- /dev/null +++ b/checkpoint-3100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad434181a50ada83a588c3c46b63f3dfca29a5b657d01b7a28d713fc6433602c +size 883771077 diff --git a/checkpoint-3100/pytorch_model.bin b/checkpoint-3100/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ab9b3ce54c3ca60a260bb72ee79cec326625d02 --- /dev/null +++ b/checkpoint-3100/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd263e9f64fd448882769ab83e862be6c47161b3e8572043cf985991f463a8f7 +size 441897977 diff --git a/checkpoint-3100/rng_state.pth b/checkpoint-3100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b6a0c22aa9c005b98990ba00026fe6b4097790f --- /dev/null +++ b/checkpoint-3100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd87e49db1ff17fcf204e53cd18cc15008802999eceffa20b6e52db312c2317 +size 14511 diff --git a/checkpoint-3100/scaler.pt b/checkpoint-3100/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..57607517be7a295c7af874e4b59c9781080cd2e6 --- /dev/null +++ 
b/checkpoint-3100/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a08d1ea525d3cabea81782a8d242f8902d6b51ab080afad775855a2a40232a0b +size 557 diff --git a/checkpoint-3100/scheduler.pt b/checkpoint-3100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3341d2750a85f3774e24eb38f3ce471111510b7a --- /dev/null +++ b/checkpoint-3100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3d3ea6897a30d9e4fb1fed6dd882ca3a0e8bdfd4eaa71df549557a0878451c +size 627 diff --git a/checkpoint-3100/trainer_state.json b/checkpoint-3100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b199285e8684c762bbe651442bfe3c14f3fe9347 --- /dev/null +++ b/checkpoint-3100/trainer_state.json @@ -0,0 +1,202 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 24.217264791464597, + "global_step": 3100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + "epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 
2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + "learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 2.101463476076544e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3100/training_args.bin b/checkpoint-3100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-3100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/checkpoint-6900/config.json b/checkpoint-6900/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/checkpoint-6900/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/checkpoint-6900/optimizer.pt b/checkpoint-6900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..01ede26156a05696b2f44173ac002acf15ce75ff --- /dev/null +++ b/checkpoint-6900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b21285f347117d6f4d6bf9f1af046de2e0928376cadae0a3d2aad6e8845df4 +size 883771077 diff --git a/checkpoint-6900/pytorch_model.bin b/checkpoint-6900/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..84852a3de87e97a07314f948318ee0d8fb72482f --- /dev/null +++ b/checkpoint-6900/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a4b91ded2f2341a7d9da2f5e57457216b85e287b8e16a7fe70e94b546472fa +size 441897977 diff --git a/checkpoint-6900/rng_state.pth b/checkpoint-6900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ff2a06e20dddd47483dee1bee51b926caf1576c --- /dev/null +++ b/checkpoint-6900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d5298cfd547c47eb6d1c20477b13201aaa73cc70abc17bfa4428aa35bee8e8 +size 14511 diff --git a/checkpoint-6900/scaler.pt b/checkpoint-6900/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..51a5781acfa31619a9b6f9822768e53ada8d108c --- /dev/null +++ b/checkpoint-6900/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5eceea9b28fb7a072d2badbf3993c026a8588d6f6eb7b1a400321e840ec7938 +size 557 diff --git a/checkpoint-6900/scheduler.pt b/checkpoint-6900/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4f87d91f4b424fdbf97ecbfe7acaa55118ff0b00 --- /dev/null +++ b/checkpoint-6900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f272e87f4c179f9d7734f39cadaf61f2c2c6bfce2f5ab9d3f9c108eba56da460 +size 627 diff --git a/checkpoint-6900/trainer_state.json b/checkpoint-6900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c75ee022a100ddf0171f7156b89b376be27282b1 --- /dev/null +++ b/checkpoint-6900/trainer_state.json @@ -0,0 +1,430 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 53.900096993210475, + "global_step": 6900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.78, + "learning_rate": 1.0000000000000002e-06, + "loss": 10.1419, + "step": 100 + }, + { + "epoch": 1.56, + "learning_rate": 2.0000000000000003e-06, + "loss": 9.2788, + "step": 200 + }, + { + "epoch": 2.34, + "learning_rate": 3e-06, + "loss": 8.8011, + "step": 300 + }, + { + "epoch": 3.12, + "learning_rate": 4.000000000000001e-06, + "loss": 8.3846, + "step": 400 + }, + { + "epoch": 3.9, + "learning_rate": 5e-06, + "loss": 7.8214, + "step": 500 + }, + { + "epoch": 4.68, + "learning_rate": 6e-06, + "loss": 7.3094, + "step": 600 + }, + { + "epoch": 5.47, + "learning_rate": 7.000000000000001e-06, + "loss": 6.7911, + "step": 700 + }, + { + "epoch": 6.25, + "learning_rate": 8.000000000000001e-06, + "loss": 6.4144, + "step": 800 + }, + { + "epoch": 7.03, + "learning_rate": 9e-06, + "loss": 6.2168, + "step": 900 + }, + { + "epoch": 7.81, + "learning_rate": 1e-05, + "loss": 6.0642, + "step": 1000 + }, + { + "epoch": 8.59, + "learning_rate": 1.1000000000000001e-05, + "loss": 6.0511, + "step": 1100 + }, + { + "epoch": 9.37, + "learning_rate": 1.2e-05, + "loss": 5.995, + "step": 1200 + }, + { + "epoch": 10.16, + "learning_rate": 1.3000000000000001e-05, + "loss": 5.9539, + "step": 1300 + }, + { + "epoch": 10.93, + "learning_rate": 1.4000000000000001e-05, + "loss": 5.8675, + "step": 1400 + }, + { + "epoch": 11.71, + "learning_rate": 1.5e-05, + "loss": 5.8874, + "step": 1500 + }, + { + "epoch": 12.5, + "learning_rate": 1.6000000000000003e-05, + "loss": 5.8578, + "step": 1600 + }, + { + "epoch": 13.28, + "learning_rate": 1.7000000000000003e-05, + "loss": 5.8386, + "step": 1700 + }, + { + "epoch": 14.06, + "learning_rate": 1.8e-05, + "loss": 5.8138, + "step": 1800 + }, + { + "epoch": 14.84, + "learning_rate": 1.9e-05, + "loss": 5.7399, + "step": 1900 + }, + { + "epoch": 15.62, + "learning_rate": 2e-05, + "loss": 5.7753, + "step": 2000 + }, + { + "epoch": 16.4, + "learning_rate": 2.1e-05, + "loss": 5.7564, + "step": 2100 + }, + { + "epoch": 17.19, + "learning_rate": 2.2000000000000003e-05, + "loss": 5.738, + "step": 2200 + }, + { + "epoch": 17.96, + "learning_rate": 2.3000000000000003e-05, + "loss": 5.6753, + "step": 2300 + }, + { + "epoch": 18.74, + "learning_rate": 2.4e-05, + "loss": 5.7082, + "step": 2400 + }, + { + "epoch": 19.53, + "learning_rate": 2.5e-05, + "loss": 5.6991, + "step": 2500 + }, + { + "epoch": 20.31, + "learning_rate": 2.6000000000000002e-05, + "loss": 5.6801, + "step": 2600 + }, + { + "epoch": 21.09, + "learning_rate": 2.7000000000000002e-05, + "loss": 5.6692, + "step": 2700 + }, + { + "epoch": 21.87, + "learning_rate": 2.8000000000000003e-05, + "loss": 5.6063, + "step": 2800 + }, + { + "epoch": 22.65, + "learning_rate": 2.9e-05, + "loss": 5.6445, + "step": 2900 + }, + { + "epoch": 23.43, + 
"learning_rate": 3e-05, + "loss": 5.6328, + "step": 3000 + }, + { + "epoch": 24.22, + "learning_rate": 3.1e-05, + "loss": 5.6217, + "step": 3100 + }, + { + "epoch": 24.99, + "learning_rate": 3.2000000000000005e-05, + "loss": 5.5601, + "step": 3200 + }, + { + "epoch": 25.78, + "learning_rate": 3.3e-05, + "loss": 5.5976, + "step": 3300 + }, + { + "epoch": 26.56, + "learning_rate": 3.4000000000000007e-05, + "loss": 5.5911, + "step": 3400 + }, + { + "epoch": 27.34, + "learning_rate": 3.5e-05, + "loss": 5.5738, + "step": 3500 + }, + { + "epoch": 28.12, + "learning_rate": 3.6e-05, + "loss": 5.566, + "step": 3600 + }, + { + "epoch": 28.9, + "learning_rate": 3.7e-05, + "loss": 5.5071, + "step": 3700 + }, + { + "epoch": 29.68, + "learning_rate": 3.8e-05, + "loss": 5.5438, + "step": 3800 + }, + { + "epoch": 30.47, + "learning_rate": 3.9000000000000006e-05, + "loss": 5.5366, + "step": 3900 + }, + { + "epoch": 31.25, + "learning_rate": 4e-05, + "loss": 5.5268, + "step": 4000 + }, + { + "epoch": 32.03, + "learning_rate": 4.1e-05, + "loss": 5.517, + "step": 4100 + }, + { + "epoch": 32.81, + "learning_rate": 4.2e-05, + "loss": 5.4574, + "step": 4200 + }, + { + "epoch": 33.59, + "learning_rate": 4.3e-05, + "loss": 5.5002, + "step": 4300 + }, + { + "epoch": 34.37, + "learning_rate": 4.4000000000000006e-05, + "loss": 5.4887, + "step": 4400 + }, + { + "epoch": 35.16, + "learning_rate": 4.5e-05, + "loss": 5.4805, + "step": 4500 + }, + { + "epoch": 35.93, + "learning_rate": 4.600000000000001e-05, + "loss": 5.4265, + "step": 4600 + }, + { + "epoch": 36.71, + "learning_rate": 4.7e-05, + "loss": 5.4615, + "step": 4700 + }, + { + "epoch": 37.5, + "learning_rate": 4.8e-05, + "loss": 5.4576, + "step": 4800 + }, + { + "epoch": 38.28, + "learning_rate": 4.9e-05, + "loss": 5.4421, + "step": 4900 + }, + { + "epoch": 39.06, + "learning_rate": 5e-05, + "loss": 5.4342, + "step": 5000 + }, + { + "epoch": 39.84, + "learning_rate": 5.1000000000000006e-05, + "loss": 5.3641, + "step": 5100 + }, + { + "epoch": 40.62, + "learning_rate": 5.2000000000000004e-05, + "loss": 5.379, + "step": 5200 + }, + { + "epoch": 41.4, + "learning_rate": 5.300000000000001e-05, + "loss": 5.3638, + "step": 5300 + }, + { + "epoch": 42.19, + "learning_rate": 5.4000000000000005e-05, + "loss": 5.3441, + "step": 5400 + }, + { + "epoch": 42.96, + "learning_rate": 5.500000000000001e-05, + "loss": 5.2759, + "step": 5500 + }, + { + "epoch": 43.74, + "learning_rate": 5.6000000000000006e-05, + "loss": 5.3011, + "step": 5600 + }, + { + "epoch": 44.53, + "learning_rate": 5.6999999999999996e-05, + "loss": 5.2758, + "step": 5700 + }, + { + "epoch": 45.31, + "learning_rate": 5.8e-05, + "loss": 5.2559, + "step": 5800 + }, + { + "epoch": 46.09, + "learning_rate": 5.9e-05, + "loss": 5.2326, + "step": 5900 + }, + { + "epoch": 46.87, + "learning_rate": 6e-05, + "loss": 5.1616, + "step": 6000 + }, + { + "epoch": 47.65, + "learning_rate": 6.1e-05, + "loss": 5.1753, + "step": 6100 + }, + { + "epoch": 48.43, + "learning_rate": 6.2e-05, + "loss": 5.1378, + "step": 6200 + }, + { + "epoch": 49.22, + "learning_rate": 6.3e-05, + "loss": 5.1, + "step": 6300 + }, + { + "epoch": 49.99, + "learning_rate": 6.400000000000001e-05, + "loss": 5.015, + "step": 6400 + }, + { + "epoch": 50.78, + "learning_rate": 6.500000000000001e-05, + "loss": 4.9758, + "step": 6500 + }, + { + "epoch": 51.56, + "learning_rate": 6.6e-05, + "loss": 4.8417, + "step": 6600 + }, + { + "epoch": 52.34, + "learning_rate": 6.7e-05, + "loss": 4.7116, + "step": 6700 + }, + { + "epoch": 53.12, + "learning_rate": 
6.800000000000001e-05, + "loss": 4.5582, + "step": 6800 + }, + { + "epoch": 53.9, + "learning_rate": 6.9e-05, + "loss": 4.3437, + "step": 6900 + } + ], + "max_steps": 20480, + "num_train_epochs": 160, + "total_flos": 4.67723020664832e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6900/training_args.bin b/checkpoint-6900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/checkpoint-6900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec30949e5a8b0dd8ae0b024bcbaf9fbe0146d440 --- /dev/null +++ b/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "classifier_dropout": null, + "eos_token_id": 2, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 260, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 3, + "position_embedding_type": "absolute", + "sep_token_id": 2, + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 32000 +} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d761eb4b12fa5ff02b8cc88e79f336faec968ac --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e68a5dd555dde9414f73fb56457865b771fd0251d1007e500c2823b7a99b6b0 +size 441897977 diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9c21319584dde53c480461eb9c19c3719272433 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de0a5888f9534c475c0b5c0d6fa6d4920e1a50a89517b67872f8b6ca51ef166 +size 3579
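
Note: the files added above are standard Hugging Face Trainer outputs — per-step checkpoint directories (optimizer, scheduler, scaler, and RNG state alongside the model weights) plus the exported model and config at the repository root. Below is a minimal, hedged sketch of how such a checkpoint could be loaded for masked-LM inference. The local paths are assumptions based only on the files listed in this diff, and since no tokenizer files appear here, a compatible 32k-vocabulary tokenizer would have to be supplied separately.

    # Sketch only: assumes this repository has been cloned locally and that
    # the current working directory is the repo root containing config.json
    # and pytorch_model.bin (as added in this diff).
    from transformers import RobertaConfig, RobertaForMaskedLM

    config = RobertaConfig.from_pretrained(".")                 # reads ./config.json
    model = RobertaForMaskedLM.from_pretrained(".", config=config)
    model.eval()

    # An intermediate training snapshot could be loaded the same way, e.g.
    #   RobertaForMaskedLM.from_pretrained("./checkpoint-6900")
    # and training could in principle be resumed from it with
    #   trainer.train(resume_from_checkpoint="./checkpoint-10000")
    # provided the original Trainer / training_args.bin setup is recreated.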