lu-vae committed on
Commit
e52c1e9
1 Parent(s): ca002f6

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. cola/roberta-base_lr1e-05/classifier_head.pt +3 -0
  2. cola/roberta-base_lr1e-05/config.json +27 -0
  3. cola/roberta-base_lr1e-05/merges.txt +0 -0
  4. cola/roberta-base_lr1e-05/model.safetensors +3 -0
  5. cola/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  6. cola/roberta-base_lr1e-05/tokenizer.json +0 -0
  7. cola/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  8. cola/roberta-base_lr1e-05/trainer_state.json +120 -0
  9. cola/roberta-base_lr1e-05/training_args.bin +3 -0
  10. cola/roberta-base_lr1e-05/vocab.json +0 -0
  11. mnli/roberta-base_lr1e-05/classifier_head.pt +3 -0
  12. mnli/roberta-base_lr1e-05/config.json +37 -0
  13. mnli/roberta-base_lr1e-05/merges.txt +0 -0
  14. mnli/roberta-base_lr1e-05/model.safetensors +3 -0
  15. mnli/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  16. mnli/roberta-base_lr1e-05/tokenizer.json +0 -0
  17. mnli/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  18. mnli/roberta-base_lr1e-05/trainer_state.json +282 -0
  19. mnli/roberta-base_lr1e-05/training_args.bin +3 -0
  20. mnli/roberta-base_lr1e-05/vocab.json +0 -0
  21. mrpc/roberta-base_lr1e-05/classifier_head.pt +3 -0
  22. mrpc/roberta-base_lr1e-05/config.json +27 -0
  23. mrpc/roberta-base_lr1e-05/merges.txt +0 -0
  24. mrpc/roberta-base_lr1e-05/model.safetensors +3 -0
  25. mrpc/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  26. mrpc/roberta-base_lr1e-05/tokenizer.json +0 -0
  27. mrpc/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  28. mrpc/roberta-base_lr1e-05/trainer_state.json +140 -0
  29. mrpc/roberta-base_lr1e-05/training_args.bin +3 -0
  30. mrpc/roberta-base_lr1e-05/vocab.json +0 -0
  31. qnli/roberta-base_lr1e-05/classifier_head.pt +3 -0
  32. qnli/roberta-base_lr1e-05/config.json +27 -0
  33. qnli/roberta-base_lr1e-05/merges.txt +0 -0
  34. qnli/roberta-base_lr1e-05/model.safetensors +3 -0
  35. qnli/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  36. qnli/roberta-base_lr1e-05/tokenizer.json +0 -0
  37. qnli/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  38. qnli/roberta-base_lr1e-05/trainer_state.json +162 -0
  39. qnli/roberta-base_lr1e-05/training_args.bin +3 -0
  40. qnli/roberta-base_lr1e-05/vocab.json +0 -0
  41. qqp/roberta-base_lr1e-05/classifier_head.pt +3 -0
  42. qqp/roberta-base_lr1e-05/config.json +27 -0
  43. qqp/roberta-base_lr1e-05/merges.txt +0 -0
  44. qqp/roberta-base_lr1e-05/model.safetensors +3 -0
  45. qqp/roberta-base_lr1e-05/special_tokens_map.json +15 -0
  46. qqp/roberta-base_lr1e-05/tokenizer.json +0 -0
  47. qqp/roberta-base_lr1e-05/tokenizer_config.json +57 -0
  48. qqp/roberta-base_lr1e-05/trainer_state.json +290 -0
  49. qqp/roberta-base_lr1e-05/training_args.bin +3 -0
  50. qqp/roberta-base_lr1e-05/vocab.json +0 -0
cola/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5141ce17fe52592b8578b7fbc28a0e021c36f262f9118ea79310dda82b6b47f4
3
+ size 2371407
cola/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.37.0.dev0",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
cola/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9ec7931db4b523d0aa2b6e8655e229a3a55f9059ec192a194ad0cf8122e9f29
3
+ size 498612824
cola/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
cola/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cola/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
cola/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6136322073831847,
3
+ "best_model_checkpoint": "./save_models/cola/roberta-base_lr1e-05/checkpoint-248",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 310,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_loss": 0.5821298956871033,
14
+ "eval_matthews_correlation": 0.0,
15
+ "eval_runtime": 1.557,
16
+ "eval_samples_per_second": 549.767,
17
+ "eval_steps_per_second": 2.569,
18
+ "step": 31
19
+ },
20
+ {
21
+ "epoch": 2.0,
22
+ "eval_loss": 0.4815237522125244,
23
+ "eval_matthews_correlation": 0.09833128071522987,
24
+ "eval_runtime": 1.2301,
25
+ "eval_samples_per_second": 695.896,
26
+ "eval_steps_per_second": 3.252,
27
+ "step": 62
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "eval_loss": 0.4497840702533722,
32
+ "eval_matthews_correlation": 0.5200031718306268,
33
+ "eval_runtime": 1.2175,
34
+ "eval_samples_per_second": 703.099,
35
+ "eval_steps_per_second": 3.286,
36
+ "step": 93
37
+ },
38
+ {
39
+ "epoch": 4.0,
40
+ "eval_loss": 0.45493587851524353,
41
+ "eval_matthews_correlation": 0.5526832237492683,
42
+ "eval_runtime": 1.456,
43
+ "eval_samples_per_second": 587.894,
44
+ "eval_steps_per_second": 2.747,
45
+ "step": 124
46
+ },
47
+ {
48
+ "epoch": 5.0,
49
+ "eval_loss": 0.4303121268749237,
50
+ "eval_matthews_correlation": 0.5818251601933869,
51
+ "eval_runtime": 1.3482,
52
+ "eval_samples_per_second": 634.941,
53
+ "eval_steps_per_second": 2.967,
54
+ "step": 155
55
+ },
56
+ {
57
+ "epoch": 6.0,
58
+ "eval_loss": 0.4925920367240906,
59
+ "eval_matthews_correlation": 0.5730783192190388,
60
+ "eval_runtime": 1.4104,
61
+ "eval_samples_per_second": 606.924,
62
+ "eval_steps_per_second": 2.836,
63
+ "step": 186
64
+ },
65
+ {
66
+ "epoch": 7.0,
67
+ "eval_loss": 0.4559820294380188,
68
+ "eval_matthews_correlation": 0.5895848105106377,
69
+ "eval_runtime": 1.2207,
70
+ "eval_samples_per_second": 701.208,
71
+ "eval_steps_per_second": 3.277,
72
+ "step": 217
73
+ },
74
+ {
75
+ "epoch": 8.0,
76
+ "eval_loss": 0.4101633131504059,
77
+ "eval_matthews_correlation": 0.6136322073831847,
78
+ "eval_runtime": 1.1981,
79
+ "eval_samples_per_second": 714.473,
80
+ "eval_steps_per_second": 3.339,
81
+ "step": 248
82
+ },
83
+ {
84
+ "epoch": 9.0,
85
+ "eval_loss": 0.4428262710571289,
86
+ "eval_matthews_correlation": 0.608505850822177,
87
+ "eval_runtime": 1.2411,
88
+ "eval_samples_per_second": 689.717,
89
+ "eval_steps_per_second": 3.223,
90
+ "step": 279
91
+ },
92
+ {
93
+ "epoch": 10.0,
94
+ "eval_loss": 0.4468447268009186,
95
+ "eval_matthews_correlation": 0.6052396882783275,
96
+ "eval_runtime": 1.4122,
97
+ "eval_samples_per_second": 606.156,
98
+ "eval_steps_per_second": 2.833,
99
+ "step": 310
100
+ },
101
+ {
102
+ "epoch": 10.0,
103
+ "step": 310,
104
+ "total_flos": 1166187850116420.0,
105
+ "train_loss": 0.3865204349640877,
106
+ "train_runtime": 143.4469,
107
+ "train_samples_per_second": 536.436,
108
+ "train_steps_per_second": 2.161
109
+ }
110
+ ],
111
+ "logging_steps": 500,
112
+ "max_steps": 310,
113
+ "num_input_tokens_seen": 0,
114
+ "num_train_epochs": 10,
115
+ "save_steps": 500,
116
+ "total_flos": 1166187850116420.0,
117
+ "train_batch_size": 256,
118
+ "trial_name": null,
119
+ "trial_params": null
120
+ }
cola/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f8b9a70202047a46707b4bf2b2259dde8b824c5f9419c6f8d59245944630184
3
+ size 4347
cola/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29e735718dcbfc85857a32c8184f59173a48d73df659c330618299620d6b304
3
+ size 2374479
mnli/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 12,
30
+ "pad_token_id": 1,
31
+ "position_embedding_type": "absolute",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.37.0.dev0",
34
+ "type_vocab_size": 1,
35
+ "use_cache": true,
36
+ "vocab_size": 50265
37
+ }
mnli/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cc9a6abbb0b48c8f80deeda37976db09f744f9e1413fea5a063ea28b7a796d7
3
+ size 498615900
mnli/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
mnli/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
mnli/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
mnli/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.86832522726694,
3
+ "best_model_checkpoint": "./save_models/mnli/roberta-base_lr1e-05/checkpoint-8286",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 13810,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.36,
13
+ "learning_rate": 6.031363088057901e-06,
14
+ "loss": 0.8769,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.72,
19
+ "learning_rate": 9.86826900855096e-06,
20
+ "loss": 0.4908,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 1.0,
25
+ "eval_accuracy": 0.8439306358381503,
26
+ "eval_loss": 0.40705999732017517,
27
+ "eval_runtime": 30.2499,
28
+ "eval_samples_per_second": 1298.219,
29
+ "eval_steps_per_second": 5.091,
30
+ "step": 1381
31
+ },
32
+ {
33
+ "epoch": 1.09,
34
+ "learning_rate": 9.483090670980664e-06,
35
+ "loss": 0.4345,
36
+ "step": 1500
37
+ },
38
+ {
39
+ "epoch": 1.45,
40
+ "learning_rate": 9.09791233341037e-06,
41
+ "loss": 0.3978,
42
+ "step": 2000
43
+ },
44
+ {
45
+ "epoch": 1.81,
46
+ "learning_rate": 8.712733995840074e-06,
47
+ "loss": 0.3855,
48
+ "step": 2500
49
+ },
50
+ {
51
+ "epoch": 2.0,
52
+ "eval_accuracy": 0.8618318861246212,
53
+ "eval_loss": 0.3706605136394501,
54
+ "eval_runtime": 30.5079,
55
+ "eval_samples_per_second": 1287.241,
56
+ "eval_steps_per_second": 5.048,
57
+ "step": 2762
58
+ },
59
+ {
60
+ "epoch": 2.17,
61
+ "learning_rate": 8.32755565826978e-06,
62
+ "loss": 0.3572,
63
+ "step": 3000
64
+ },
65
+ {
66
+ "epoch": 2.53,
67
+ "learning_rate": 7.942377320699485e-06,
68
+ "loss": 0.3368,
69
+ "step": 3500
70
+ },
71
+ {
72
+ "epoch": 2.9,
73
+ "learning_rate": 7.557198983129189e-06,
74
+ "loss": 0.3354,
75
+ "step": 4000
76
+ },
77
+ {
78
+ "epoch": 3.0,
79
+ "eval_accuracy": 0.8657533548929235,
80
+ "eval_loss": 0.36279234290122986,
81
+ "eval_runtime": 30.3356,
82
+ "eval_samples_per_second": 1294.551,
83
+ "eval_steps_per_second": 5.077,
84
+ "step": 4143
85
+ },
86
+ {
87
+ "epoch": 3.26,
88
+ "learning_rate": 7.172020645558895e-06,
89
+ "loss": 0.3097,
90
+ "step": 4500
91
+ },
92
+ {
93
+ "epoch": 3.62,
94
+ "learning_rate": 6.7868423079885995e-06,
95
+ "loss": 0.2998,
96
+ "step": 5000
97
+ },
98
+ {
99
+ "epoch": 3.98,
100
+ "learning_rate": 6.401663970418303e-06,
101
+ "loss": 0.3005,
102
+ "step": 5500
103
+ },
104
+ {
105
+ "epoch": 4.0,
106
+ "eval_accuracy": 0.8673066639504978,
107
+ "eval_loss": 0.36057987809181213,
108
+ "eval_runtime": 30.2539,
109
+ "eval_samples_per_second": 1298.049,
110
+ "eval_steps_per_second": 5.09,
111
+ "step": 5524
112
+ },
113
+ {
114
+ "epoch": 4.34,
115
+ "learning_rate": 6.016485632848009e-06,
116
+ "loss": 0.2734,
117
+ "step": 6000
118
+ },
119
+ {
120
+ "epoch": 4.71,
121
+ "learning_rate": 5.631307295277714e-06,
122
+ "loss": 0.2724,
123
+ "step": 6500
124
+ },
125
+ {
126
+ "epoch": 5.0,
127
+ "eval_accuracy": 0.8679178019403632,
128
+ "eval_loss": 0.3738739490509033,
129
+ "eval_runtime": 30.6701,
130
+ "eval_samples_per_second": 1280.434,
131
+ "eval_steps_per_second": 5.021,
132
+ "step": 6905
133
+ },
134
+ {
135
+ "epoch": 5.07,
136
+ "learning_rate": 5.2461289577074194e-06,
137
+ "loss": 0.2667,
138
+ "step": 7000
139
+ },
140
+ {
141
+ "epoch": 5.43,
142
+ "learning_rate": 4.860950620137123e-06,
143
+ "loss": 0.2466,
144
+ "step": 7500
145
+ },
146
+ {
147
+ "epoch": 5.79,
148
+ "learning_rate": 4.475772282566829e-06,
149
+ "loss": 0.2481,
150
+ "step": 8000
151
+ },
152
+ {
153
+ "epoch": 6.0,
154
+ "eval_accuracy": 0.86832522726694,
155
+ "eval_loss": 0.3865111470222473,
156
+ "eval_runtime": 30.6524,
157
+ "eval_samples_per_second": 1281.174,
158
+ "eval_steps_per_second": 5.024,
159
+ "step": 8286
160
+ },
161
+ {
162
+ "epoch": 6.15,
163
+ "learning_rate": 4.090593944996534e-06,
164
+ "loss": 0.2394,
165
+ "step": 8500
166
+ },
167
+ {
168
+ "epoch": 6.52,
169
+ "learning_rate": 3.705415607426239e-06,
170
+ "loss": 0.2269,
171
+ "step": 9000
172
+ },
173
+ {
174
+ "epoch": 6.88,
175
+ "learning_rate": 3.3202372698559437e-06,
176
+ "loss": 0.2264,
177
+ "step": 9500
178
+ },
179
+ {
180
+ "epoch": 7.0,
181
+ "eval_accuracy": 0.8672557357846757,
182
+ "eval_loss": 0.3937914967536926,
183
+ "eval_runtime": 30.6252,
184
+ "eval_samples_per_second": 1282.311,
185
+ "eval_steps_per_second": 5.029,
186
+ "step": 9667
187
+ },
188
+ {
189
+ "epoch": 7.24,
190
+ "learning_rate": 2.935058932285649e-06,
191
+ "loss": 0.2181,
192
+ "step": 10000
193
+ },
194
+ {
195
+ "epoch": 7.6,
196
+ "learning_rate": 2.5498805947153533e-06,
197
+ "loss": 0.2134,
198
+ "step": 10500
199
+ },
200
+ {
201
+ "epoch": 7.97,
202
+ "learning_rate": 2.164702257145058e-06,
203
+ "loss": 0.2145,
204
+ "step": 11000
205
+ },
206
+ {
207
+ "epoch": 8.0,
208
+ "eval_accuracy": 0.8681469786865627,
209
+ "eval_loss": 0.4055633544921875,
210
+ "eval_runtime": 30.7036,
211
+ "eval_samples_per_second": 1279.035,
212
+ "eval_steps_per_second": 5.016,
213
+ "step": 11048
214
+ },
215
+ {
216
+ "epoch": 8.33,
217
+ "learning_rate": 1.7795239195747632e-06,
218
+ "loss": 0.2017,
219
+ "step": 11500
220
+ },
221
+ {
222
+ "epoch": 8.69,
223
+ "learning_rate": 1.3943455820044682e-06,
224
+ "loss": 0.2008,
225
+ "step": 12000
226
+ },
227
+ {
228
+ "epoch": 9.0,
229
+ "eval_accuracy": 0.8672811998675868,
230
+ "eval_loss": 0.41524767875671387,
231
+ "eval_runtime": 30.8769,
232
+ "eval_samples_per_second": 1271.856,
233
+ "eval_steps_per_second": 4.988,
234
+ "step": 12429
235
+ },
236
+ {
237
+ "epoch": 9.05,
238
+ "learning_rate": 1.0091672444341732e-06,
239
+ "loss": 0.2013,
240
+ "step": 12500
241
+ },
242
+ {
243
+ "epoch": 9.41,
244
+ "learning_rate": 6.239889068638781e-07,
245
+ "loss": 0.1909,
246
+ "step": 13000
247
+ },
248
+ {
249
+ "epoch": 9.78,
250
+ "learning_rate": 2.3881056929358295e-07,
251
+ "loss": 0.1936,
252
+ "step": 13500
253
+ },
254
+ {
255
+ "epoch": 10.0,
256
+ "eval_accuracy": 0.8680960505207405,
257
+ "eval_loss": 0.42063575983047485,
258
+ "eval_runtime": 30.2907,
259
+ "eval_samples_per_second": 1296.472,
260
+ "eval_steps_per_second": 5.084,
261
+ "step": 13810
262
+ },
263
+ {
264
+ "epoch": 10.0,
265
+ "step": 13810,
266
+ "total_flos": 2.1610790039821677e+17,
267
+ "train_loss": 0.29975197396357794,
268
+ "train_runtime": 5291.5699,
269
+ "train_samples_per_second": 667.913,
270
+ "train_steps_per_second": 2.61
271
+ }
272
+ ],
273
+ "logging_steps": 500,
274
+ "max_steps": 13810,
275
+ "num_input_tokens_seen": 0,
276
+ "num_train_epochs": 10,
277
+ "save_steps": 500,
278
+ "total_flos": 2.1610790039821677e+17,
279
+ "train_batch_size": 256,
280
+ "trial_name": null,
281
+ "trial_params": null
282
+ }
mnli/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a381407731b0402ac4f328df0350af493419c58d93f78a051ff6a50c6d632ca2
3
+ size 4283
mnli/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mrpc/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:435024607635c1d86f68a56717c1e67c952f3cb7680f30ba13eb391ed286bdbf
3
+ size 2371407
mrpc/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.37.0.dev0",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
mrpc/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mrpc/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ea247447fd32b0393ba18fe7185d02329a50ce4b4fe47aad0b59f7f49e1c78
3
+ size 498612824
mrpc/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
mrpc/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
mrpc/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
mrpc/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.84512111394795,
3
+ "best_model_checkpoint": "./save_models/mrpc/roberta-base_lr1e-05/checkpoint-117",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 130,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.6839237057220708,
14
+ "eval_averaged_scores": 0.7481107201749513,
15
+ "eval_f1": 0.8122977346278317,
16
+ "eval_loss": 0.6196000576019287,
17
+ "eval_runtime": 0.9135,
18
+ "eval_samples_per_second": 401.752,
19
+ "eval_steps_per_second": 2.189,
20
+ "step": 13
21
+ },
22
+ {
23
+ "epoch": 2.0,
24
+ "eval_accuracy": 0.6839237057220708,
25
+ "eval_averaged_scores": 0.7481107201749513,
26
+ "eval_f1": 0.8122977346278317,
27
+ "eval_loss": 0.5717987418174744,
28
+ "eval_runtime": 0.9146,
29
+ "eval_samples_per_second": 401.283,
30
+ "eval_steps_per_second": 2.187,
31
+ "step": 26
32
+ },
33
+ {
34
+ "epoch": 3.0,
35
+ "eval_accuracy": 0.7084468664850136,
36
+ "eval_averaged_scores": 0.7630820363089463,
37
+ "eval_f1": 0.8177172061328791,
38
+ "eval_loss": 0.5101604461669922,
39
+ "eval_runtime": 0.9281,
40
+ "eval_samples_per_second": 395.44,
41
+ "eval_steps_per_second": 2.155,
42
+ "step": 39
43
+ },
44
+ {
45
+ "epoch": 4.0,
46
+ "eval_accuracy": 0.7956403269754768,
47
+ "eval_averaged_scores": 0.8287593900070754,
48
+ "eval_f1": 0.861878453038674,
49
+ "eval_loss": 0.45689526200294495,
50
+ "eval_runtime": 0.9188,
51
+ "eval_samples_per_second": 399.445,
52
+ "eval_steps_per_second": 2.177,
53
+ "step": 52
54
+ },
55
+ {
56
+ "epoch": 5.0,
57
+ "eval_accuracy": 0.771117166212534,
58
+ "eval_averaged_scores": 0.7961968809786075,
59
+ "eval_f1": 0.8212765957446809,
60
+ "eval_loss": 0.42808404564857483,
61
+ "eval_runtime": 0.9515,
62
+ "eval_samples_per_second": 385.719,
63
+ "eval_steps_per_second": 2.102,
64
+ "step": 65
65
+ },
66
+ {
67
+ "epoch": 6.0,
68
+ "eval_accuracy": 0.8092643051771117,
69
+ "eval_averaged_scores": 0.8343510280905638,
70
+ "eval_f1": 0.859437751004016,
71
+ "eval_loss": 0.4162532687187195,
72
+ "eval_runtime": 0.9128,
73
+ "eval_samples_per_second": 402.06,
74
+ "eval_steps_per_second": 2.191,
75
+ "step": 78
76
+ },
77
+ {
78
+ "epoch": 7.0,
79
+ "eval_accuracy": 0.8147138964577657,
80
+ "eval_averaged_scores": 0.8396278645635442,
81
+ "eval_f1": 0.8645418326693227,
82
+ "eval_loss": 0.41786280274391174,
83
+ "eval_runtime": 1.0619,
84
+ "eval_samples_per_second": 345.592,
85
+ "eval_steps_per_second": 1.883,
86
+ "step": 91
87
+ },
88
+ {
89
+ "epoch": 8.0,
90
+ "eval_accuracy": 0.8065395095367848,
91
+ "eval_averaged_scores": 0.8318411833398209,
92
+ "eval_f1": 0.8571428571428572,
93
+ "eval_loss": 0.4168005883693695,
94
+ "eval_runtime": 0.9219,
95
+ "eval_samples_per_second": 398.082,
96
+ "eval_steps_per_second": 2.169,
97
+ "step": 104
98
+ },
99
+ {
100
+ "epoch": 9.0,
101
+ "eval_accuracy": 0.8201634877384196,
102
+ "eval_averaged_scores": 0.84512111394795,
103
+ "eval_f1": 0.8700787401574802,
104
+ "eval_loss": 0.42427295446395874,
105
+ "eval_runtime": 0.9217,
106
+ "eval_samples_per_second": 398.165,
107
+ "eval_steps_per_second": 2.17,
108
+ "step": 117
109
+ },
110
+ {
111
+ "epoch": 10.0,
112
+ "eval_accuracy": 0.8119891008174387,
113
+ "eval_averaged_scores": 0.8374060812238286,
114
+ "eval_f1": 0.8628230616302187,
115
+ "eval_loss": 0.42201152443885803,
116
+ "eval_runtime": 0.9117,
117
+ "eval_samples_per_second": 402.525,
118
+ "eval_steps_per_second": 2.194,
119
+ "step": 130
120
+ },
121
+ {
122
+ "epoch": 10.0,
123
+ "step": 130,
124
+ "total_flos": 1559281933489620.0,
125
+ "train_loss": 0.44359661982609677,
126
+ "train_runtime": 101.3918,
127
+ "train_samples_per_second": 325.569,
128
+ "train_steps_per_second": 1.282
129
+ }
130
+ ],
131
+ "logging_steps": 500,
132
+ "max_steps": 130,
133
+ "num_input_tokens_seen": 0,
134
+ "num_train_epochs": 10,
135
+ "save_steps": 500,
136
+ "total_flos": 1559281933489620.0,
137
+ "train_batch_size": 256,
138
+ "trial_name": null,
139
+ "trial_params": null
140
+ }
mrpc/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8fb8fcbef58709b5e705f221664ef997ef46f14ce6b8dc733b9920fff974d97
3
+ size 4283
mrpc/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qnli/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eee208e5b34fe5933f5d0cfbbef33e9201f6006392125f56a1688072321d398
3
+ size 2371407
qnli/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.37.0.dev0",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
qnli/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qnli/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7078cfa1ace53ce5e8d06a424de44b2fec65d327346db34b251df196bc9432ca
3
+ size 498612824
qnli/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
qnli/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
qnli/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
qnli/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.915035799522673,
3
+ "best_model_checkpoint": "./save_models/qnli/roberta-base_lr1e-05/checkpoint-2214",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3690,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "eval_accuracy": 0.8865871121718377,
14
+ "eval_loss": 0.28326648473739624,
15
+ "eval_runtime": 8.5588,
16
+ "eval_samples_per_second": 1223.884,
17
+ "eval_steps_per_second": 4.79,
18
+ "step": 369
19
+ },
20
+ {
21
+ "epoch": 1.36,
22
+ "learning_rate": 9.198385236447522e-06,
23
+ "loss": 0.4241,
24
+ "step": 500
25
+ },
26
+ {
27
+ "epoch": 2.0,
28
+ "eval_accuracy": 0.9020525059665871,
29
+ "eval_loss": 0.24534918367862701,
30
+ "eval_runtime": 8.5693,
31
+ "eval_samples_per_second": 1222.389,
32
+ "eval_steps_per_second": 4.785,
33
+ "step": 738
34
+ },
35
+ {
36
+ "epoch": 2.71,
37
+ "learning_rate": 7.756632064590542e-06,
38
+ "loss": 0.2508,
39
+ "step": 1000
40
+ },
41
+ {
42
+ "epoch": 3.0,
43
+ "eval_accuracy": 0.9043436754176611,
44
+ "eval_loss": 0.2600307762622833,
45
+ "eval_runtime": 8.557,
46
+ "eval_samples_per_second": 1224.147,
47
+ "eval_steps_per_second": 4.791,
48
+ "step": 1107
49
+ },
50
+ {
51
+ "epoch": 4.0,
52
+ "eval_accuracy": 0.9120763723150358,
53
+ "eval_loss": 0.23106025159358978,
54
+ "eval_runtime": 8.5249,
55
+ "eval_samples_per_second": 1228.76,
56
+ "eval_steps_per_second": 4.809,
57
+ "step": 1476
58
+ },
59
+ {
60
+ "epoch": 4.07,
61
+ "learning_rate": 6.314878892733565e-06,
62
+ "loss": 0.2044,
63
+ "step": 1500
64
+ },
65
+ {
66
+ "epoch": 5.0,
67
+ "eval_accuracy": 0.9064439140811456,
68
+ "eval_loss": 0.2730174660682678,
69
+ "eval_runtime": 8.491,
70
+ "eval_samples_per_second": 1233.666,
71
+ "eval_steps_per_second": 4.829,
72
+ "step": 1845
73
+ },
74
+ {
75
+ "epoch": 5.42,
76
+ "learning_rate": 4.873125720876586e-06,
77
+ "loss": 0.1703,
78
+ "step": 2000
79
+ },
80
+ {
81
+ "epoch": 6.0,
82
+ "eval_accuracy": 0.915035799522673,
83
+ "eval_loss": 0.2521490752696991,
84
+ "eval_runtime": 8.662,
85
+ "eval_samples_per_second": 1209.303,
86
+ "eval_steps_per_second": 4.733,
87
+ "step": 2214
88
+ },
89
+ {
90
+ "epoch": 6.78,
91
+ "learning_rate": 3.431372549019608e-06,
92
+ "loss": 0.1503,
93
+ "step": 2500
94
+ },
95
+ {
96
+ "epoch": 7.0,
97
+ "eval_accuracy": 0.9115990453460621,
98
+ "eval_loss": 0.26341933012008667,
99
+ "eval_runtime": 8.515,
100
+ "eval_samples_per_second": 1230.176,
101
+ "eval_steps_per_second": 4.815,
102
+ "step": 2583
103
+ },
104
+ {
105
+ "epoch": 8.0,
106
+ "eval_accuracy": 0.9114081145584726,
107
+ "eval_loss": 0.2649821639060974,
108
+ "eval_runtime": 8.6484,
109
+ "eval_samples_per_second": 1211.201,
110
+ "eval_steps_per_second": 4.741,
111
+ "step": 2952
112
+ },
113
+ {
114
+ "epoch": 8.13,
115
+ "learning_rate": 1.9896193771626298e-06,
116
+ "loss": 0.1338,
117
+ "step": 3000
118
+ },
119
+ {
120
+ "epoch": 9.0,
121
+ "eval_accuracy": 0.9138902147971361,
122
+ "eval_loss": 0.2706736922264099,
123
+ "eval_runtime": 8.5199,
124
+ "eval_samples_per_second": 1229.479,
125
+ "eval_steps_per_second": 4.812,
126
+ "step": 3321
127
+ },
128
+ {
129
+ "epoch": 9.49,
130
+ "learning_rate": 5.478662053056517e-07,
131
+ "loss": 0.1216,
132
+ "step": 3500
133
+ },
134
+ {
135
+ "epoch": 10.0,
136
+ "eval_accuracy": 0.9143675417661098,
137
+ "eval_loss": 0.2753640413284302,
138
+ "eval_runtime": 8.4934,
139
+ "eval_samples_per_second": 1233.316,
140
+ "eval_steps_per_second": 4.827,
141
+ "step": 3690
142
+ },
143
+ {
144
+ "epoch": 10.0,
145
+ "step": 3690,
146
+ "total_flos": 6.001437167232744e+16,
147
+ "train_loss": 0.20311757839792144,
148
+ "train_runtime": 1451.9977,
149
+ "train_samples_per_second": 649.23,
150
+ "train_steps_per_second": 2.541
151
+ }
152
+ ],
153
+ "logging_steps": 500,
154
+ "max_steps": 3690,
155
+ "num_input_tokens_seen": 0,
156
+ "num_train_epochs": 10,
157
+ "save_steps": 500,
158
+ "total_flos": 6.001437167232744e+16,
159
+ "train_batch_size": 256,
160
+ "trial_name": null,
161
+ "trial_params": null
162
+ }
qnli/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa18ae7a49399daaa388a810241480f86a86d0347e9005cbe9a492ed153797f
3
+ size 4283
qnli/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qqp/roberta-base_lr1e-05/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a4f879325b651c81b14593ad270283820f6f801b211aacac7dcad66e469042
3
+ size 2371407
qqp/roberta-base_lr1e-05/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.37.0.dev0",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
qqp/roberta-base_lr1e-05/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qqp/roberta-base_lr1e-05/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11cbaf169f4d2b46e5ac47e1e5ad45cedf24bc1ad219e3a20f7d4e694a77e7e7
3
+ size 498612824
qqp/roberta-base_lr1e-05/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
qqp/roberta-base_lr1e-05/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
qqp/roberta-base_lr1e-05/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
qqp/roberta-base_lr1e-05/trainer_state.json ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8932172851340642,
3
+ "best_model_checkpoint": "./save_models/qqp/roberta-base_lr1e-05/checkpoint-12800",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 12800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.39,
13
+ "learning_rate": 6.510416666666667e-06,
14
+ "loss": 0.488,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.78,
19
+ "learning_rate": 9.807180851063832e-06,
20
+ "loss": 0.3263,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 1.0,
25
+ "eval_accuracy": 0.879235948880033,
26
+ "eval_averaged_scores": 0.8598257175939353,
27
+ "eval_f1": 0.8404154863078377,
28
+ "eval_loss": 0.27544060349464417,
29
+ "eval_runtime": 30.2577,
30
+ "eval_samples_per_second": 1202.503,
31
+ "eval_steps_per_second": 4.726,
32
+ "step": 1280
33
+ },
34
+ {
35
+ "epoch": 1.17,
36
+ "learning_rate": 9.391622340425532e-06,
37
+ "loss": 0.2866,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 1.56,
42
+ "learning_rate": 8.976063829787235e-06,
43
+ "loss": 0.2674,
44
+ "step": 2000
45
+ },
46
+ {
47
+ "epoch": 1.95,
48
+ "learning_rate": 8.560505319148937e-06,
49
+ "loss": 0.2554,
50
+ "step": 2500
51
+ },
52
+ {
53
+ "epoch": 2.0,
54
+ "eval_accuracy": 0.891768585955751,
55
+ "eval_averaged_scores": 0.8743571206442515,
56
+ "eval_f1": 0.8569456553327521,
57
+ "eval_loss": 0.2498372197151184,
58
+ "eval_runtime": 27.6277,
59
+ "eval_samples_per_second": 1316.974,
60
+ "eval_steps_per_second": 5.176,
61
+ "step": 2560
62
+ },
63
+ {
64
+ "epoch": 2.34,
65
+ "learning_rate": 8.144946808510639e-06,
66
+ "loss": 0.2339,
67
+ "step": 3000
68
+ },
69
+ {
70
+ "epoch": 2.73,
71
+ "learning_rate": 7.729388297872341e-06,
72
+ "loss": 0.2263,
73
+ "step": 3500
74
+ },
75
+ {
76
+ "epoch": 3.0,
77
+ "eval_accuracy": 0.8935825202693417,
78
+ "eval_averaged_scores": 0.8724041088244425,
79
+ "eval_f1": 0.8512256973795435,
80
+ "eval_loss": 0.2527249753475189,
81
+ "eval_runtime": 27.9,
82
+ "eval_samples_per_second": 1304.123,
83
+ "eval_steps_per_second": 5.125,
84
+ "step": 3840
85
+ },
86
+ {
87
+ "epoch": 3.12,
88
+ "learning_rate": 7.313829787234044e-06,
89
+ "loss": 0.2159,
90
+ "step": 4000
91
+ },
92
+ {
93
+ "epoch": 3.52,
94
+ "learning_rate": 6.898271276595745e-06,
95
+ "loss": 0.2026,
96
+ "step": 4500
97
+ },
98
+ {
99
+ "epoch": 3.91,
100
+ "learning_rate": 6.482712765957447e-06,
101
+ "loss": 0.2011,
102
+ "step": 5000
103
+ },
104
+ {
105
+ "epoch": 4.0,
106
+ "eval_accuracy": 0.9005634189913426,
107
+ "eval_averaged_scores": 0.8845524747016884,
108
+ "eval_f1": 0.8685415304120341,
109
+ "eval_loss": 0.23971907794475555,
110
+ "eval_runtime": 27.6643,
111
+ "eval_samples_per_second": 1315.233,
112
+ "eval_steps_per_second": 5.169,
113
+ "step": 5120
114
+ },
115
+ {
116
+ "epoch": 4.3,
117
+ "learning_rate": 6.06715425531915e-06,
118
+ "loss": 0.1871,
119
+ "step": 5500
120
+ },
121
+ {
122
+ "epoch": 4.69,
123
+ "learning_rate": 5.651595744680851e-06,
124
+ "loss": 0.1803,
125
+ "step": 6000
126
+ },
127
+ {
128
+ "epoch": 5.0,
129
+ "eval_accuracy": 0.9041088360588154,
130
+ "eval_averaged_scores": 0.8878877207108364,
131
+ "eval_f1": 0.8716666053628573,
132
+ "eval_loss": 0.2404341995716095,
133
+ "eval_runtime": 27.871,
134
+ "eval_samples_per_second": 1305.48,
135
+ "eval_steps_per_second": 5.131,
136
+ "step": 6400
137
+ },
138
+ {
139
+ "epoch": 5.08,
140
+ "learning_rate": 5.236037234042554e-06,
141
+ "loss": 0.1775,
142
+ "step": 6500
143
+ },
144
+ {
145
+ "epoch": 5.47,
146
+ "learning_rate": 4.820478723404256e-06,
147
+ "loss": 0.1644,
148
+ "step": 7000
149
+ },
150
+ {
151
+ "epoch": 5.86,
152
+ "learning_rate": 4.404920212765958e-06,
153
+ "loss": 0.1654,
154
+ "step": 7500
155
+ },
156
+ {
157
+ "epoch": 6.0,
158
+ "eval_accuracy": 0.9068297375292016,
159
+ "eval_averaged_scores": 0.8913133079820112,
160
+ "eval_f1": 0.8757968784348208,
161
+ "eval_loss": 0.24361343681812286,
162
+ "eval_runtime": 28.0056,
163
+ "eval_samples_per_second": 1299.207,
164
+ "eval_steps_per_second": 5.106,
165
+ "step": 7680
166
+ },
167
+ {
168
+ "epoch": 6.25,
169
+ "learning_rate": 3.98936170212766e-06,
170
+ "loss": 0.1527,
171
+ "step": 8000
172
+ },
173
+ {
174
+ "epoch": 6.64,
175
+ "learning_rate": 3.5738031914893617e-06,
176
+ "loss": 0.1519,
177
+ "step": 8500
178
+ },
179
+ {
180
+ "epoch": 7.0,
181
+ "eval_accuracy": 0.9057303833997526,
182
+ "eval_averaged_scores": 0.8910411109500566,
183
+ "eval_f1": 0.8763518385003605,
184
+ "eval_loss": 0.24579738080501556,
185
+ "eval_runtime": 28.1352,
186
+ "eval_samples_per_second": 1293.219,
187
+ "eval_steps_per_second": 5.083,
188
+ "step": 8960
189
+ },
190
+ {
191
+ "epoch": 7.03,
192
+ "learning_rate": 3.1582446808510644e-06,
193
+ "loss": 0.1534,
194
+ "step": 9000
195
+ },
196
+ {
197
+ "epoch": 7.42,
198
+ "learning_rate": 2.7426861702127662e-06,
199
+ "loss": 0.138,
200
+ "step": 9500
201
+ },
202
+ {
203
+ "epoch": 7.81,
204
+ "learning_rate": 2.327127659574468e-06,
205
+ "loss": 0.1418,
206
+ "step": 10000
207
+ },
208
+ {
209
+ "epoch": 8.0,
210
+ "eval_accuracy": 0.9066923182630204,
211
+ "eval_averaged_scores": 0.8916345948024302,
212
+ "eval_f1": 0.8765768713418401,
213
+ "eval_loss": 0.25888094305992126,
214
+ "eval_runtime": 27.7859,
215
+ "eval_samples_per_second": 1309.477,
216
+ "eval_steps_per_second": 5.146,
217
+ "step": 10240
218
+ },
219
+ {
220
+ "epoch": 8.2,
221
+ "learning_rate": 1.9115691489361704e-06,
222
+ "loss": 0.1366,
223
+ "step": 10500
224
+ },
225
+ {
226
+ "epoch": 8.59,
227
+ "learning_rate": 1.4960106382978725e-06,
228
+ "loss": 0.1307,
229
+ "step": 11000
230
+ },
231
+ {
232
+ "epoch": 8.98,
233
+ "learning_rate": 1.0804521276595746e-06,
234
+ "loss": 0.1327,
235
+ "step": 11500
236
+ },
237
+ {
238
+ "epoch": 9.0,
239
+ "eval_accuracy": 0.9079840593651229,
240
+ "eval_averaged_scores": 0.8931944703014327,
241
+ "eval_f1": 0.8784048812377424,
242
+ "eval_loss": 0.25855743885040283,
243
+ "eval_runtime": 27.685,
244
+ "eval_samples_per_second": 1314.247,
245
+ "eval_steps_per_second": 5.165,
246
+ "step": 11520
247
+ },
248
+ {
249
+ "epoch": 9.38,
250
+ "learning_rate": 6.648936170212766e-07,
251
+ "loss": 0.1271,
252
+ "step": 12000
253
+ },
254
+ {
255
+ "epoch": 9.77,
256
+ "learning_rate": 2.4933510638297876e-07,
257
+ "loss": 0.1257,
258
+ "step": 12500
259
+ },
260
+ {
261
+ "epoch": 10.0,
262
+ "eval_accuracy": 0.9080665109248317,
263
+ "eval_averaged_scores": 0.8932172851340642,
264
+ "eval_f1": 0.8783680593432966,
265
+ "eval_loss": 0.26402151584625244,
266
+ "eval_runtime": 28.0618,
267
+ "eval_samples_per_second": 1296.601,
268
+ "eval_steps_per_second": 5.096,
269
+ "step": 12800
270
+ },
271
+ {
272
+ "epoch": 10.0,
273
+ "step": 12800,
274
+ "total_flos": 1.5831238702432614e+17,
275
+ "train_loss": 0.1970924124121666,
276
+ "train_runtime": 4395.9489,
277
+ "train_samples_per_second": 744.915,
278
+ "train_steps_per_second": 2.912
279
+ }
280
+ ],
281
+ "logging_steps": 500,
282
+ "max_steps": 12800,
283
+ "num_input_tokens_seen": 0,
284
+ "num_train_epochs": 10,
285
+ "save_steps": 500,
286
+ "total_flos": 1.5831238702432614e+17,
287
+ "train_batch_size": 256,
288
+ "trial_name": null,
289
+ "trial_params": null
290
+ }
qqp/roberta-base_lr1e-05/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a6b8a821e8661f87bc7947be53a5663c95a6097fed5039e0548c58e69105c9f
3
+ size 4283
qqp/roberta-base_lr1e-05/vocab.json ADDED
The diff for this file is too large to render. See raw diff