bobox committed on
Commit
d247c72
1 Parent(s): afa5e3f

Training in progress, step 2554, checkpoint

Browse files
checkpoint-2554/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
checkpoint-2554/README.md ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2554/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
checkpoint-2554/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bobox/DeBERTa-ST-AllLayers-v3-checkpoints-tmp",
3
+ "architectures": [
4
+ "DebertaV2Model"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "max_position_embeddings": 512,
14
+ "max_relative_positions": -1,
15
+ "model_type": "deberta-v2",
16
+ "norm_rel_ebd": "layer_norm",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "pooler_dropout": 0,
21
+ "pooler_hidden_act": "gelu",
22
+ "pooler_hidden_size": 768,
23
+ "pos_att_type": [
24
+ "p2c",
25
+ "c2p"
26
+ ],
27
+ "position_biased_input": false,
28
+ "position_buckets": 256,
29
+ "relative_attention": true,
30
+ "share_att_key": true,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.42.3",
33
+ "type_vocab_size": 0,
34
+ "vocab_size": 128100
35
+ }
checkpoint-2554/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.42.3",
5
+ "pytorch": "2.1.2"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
checkpoint-2554/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
checkpoint-2554/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66bb1e36276c1c098c52926289b76425a27f3db889d5a7048c1cb263e28e245b
3
+ size 1130520122
checkpoint-2554/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8631cd0e79a1f08a0912c11f565aed711b237e95a58d80d9d21672834f172075
3
+ size 565251810
checkpoint-2554/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f559b53b673cbaeb2c79890b3798a53f05f92893f5f6881490ea3ce66a7599d9
3
+ size 14244
checkpoint-2554/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dfadb5a50c60e4767cb9734be0c74fcd91fb48041ecf1db5b7da5ffd8146e00
3
+ size 1064
checkpoint-2554/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
checkpoint-2554/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-2554/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-2554/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2554/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "mask_token": "[MASK]",
50
+ "max_length": 512,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "[PAD]",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "[SEP]",
57
+ "sp_model_kwargs": {},
58
+ "split_by_punct": false,
59
+ "stride": 0,
60
+ "tokenizer_class": "DebertaV2Tokenizer",
61
+ "truncation_side": "right",
62
+ "truncation_strategy": "longest_first",
63
+ "unk_token": "[UNK]",
64
+ "vocab_type": "spm"
65
+ }
checkpoint-2554/trainer_state.json ADDED
@@ -0,0 +1,1608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.40025074439742986,
5
+ "eval_steps": 320,
6
+ "global_step": 2554,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0050148879485973985,
13
+ "grad_norm": 14.771158218383789,
14
+ "learning_rate": 9.707724425887265e-07,
15
+ "loss": 0.6329,
16
+ "step": 32
17
+ },
18
+ {
19
+ "epoch": 0.010029775897194797,
20
+ "grad_norm": 11.052021980285645,
21
+ "learning_rate": 1.9728601252609606e-06,
22
+ "loss": 0.9693,
23
+ "step": 64
24
+ },
25
+ {
26
+ "epoch": 0.015044663845792195,
27
+ "grad_norm": 20.26296615600586,
28
+ "learning_rate": 2.9749478079331944e-06,
29
+ "loss": 0.6548,
30
+ "step": 96
31
+ },
32
+ {
33
+ "epoch": 0.020059551794389594,
34
+ "grad_norm": 12.62913703918457,
35
+ "learning_rate": 3.945720250521921e-06,
36
+ "loss": 1.1279,
37
+ "step": 128
38
+ },
39
+ {
40
+ "epoch": 0.025074439742986992,
41
+ "grad_norm": 12.316486358642578,
42
+ "learning_rate": 4.916492693110647e-06,
43
+ "loss": 1.0017,
44
+ "step": 160
45
+ },
46
+ {
47
+ "epoch": 0.03008932769158439,
48
+ "grad_norm": 64.25923919677734,
49
+ "learning_rate": 5.918580375782881e-06,
50
+ "loss": 0.7571,
51
+ "step": 192
52
+ },
53
+ {
54
+ "epoch": 0.03510421564018179,
55
+ "grad_norm": 0.8205029368400574,
56
+ "learning_rate": 6.920668058455115e-06,
57
+ "loss": 0.7304,
58
+ "step": 224
59
+ },
60
+ {
61
+ "epoch": 0.04011910358877919,
62
+ "grad_norm": 6.598870754241943,
63
+ "learning_rate": 7.922755741127349e-06,
64
+ "loss": 0.7636,
65
+ "step": 256
66
+ },
67
+ {
68
+ "epoch": 0.045133991537376586,
69
+ "grad_norm": 8.728073120117188,
70
+ "learning_rate": 8.924843423799583e-06,
71
+ "loss": 0.482,
72
+ "step": 288
73
+ },
74
+ {
75
+ "epoch": 0.050148879485973984,
76
+ "grad_norm": 7.645521640777588,
77
+ "learning_rate": 9.926931106471817e-06,
78
+ "loss": 0.6312,
79
+ "step": 320
80
+ },
81
+ {
82
+ "epoch": 0.050148879485973984,
83
+ "eval_nli-pairs_loss": 1.0158467292785645,
84
+ "eval_nli-pairs_runtime": 3.7267,
85
+ "eval_nli-pairs_samples_per_second": 26.833,
86
+ "eval_nli-pairs_steps_per_second": 1.073,
87
+ "eval_sts-test_pearson_cosine": 0.7848265412179125,
88
+ "eval_sts-test_pearson_dot": 0.5437080705284749,
89
+ "eval_sts-test_pearson_euclidean": 0.7445845076364892,
90
+ "eval_sts-test_pearson_manhattan": 0.7429239204432232,
91
+ "eval_sts-test_pearson_max": 0.7848265412179125,
92
+ "eval_sts-test_spearman_cosine": 0.7989504707258924,
93
+ "eval_sts-test_spearman_dot": 0.5206855421174118,
94
+ "eval_sts-test_spearman_euclidean": 0.733568982260844,
95
+ "eval_sts-test_spearman_manhattan": 0.7349407257944446,
96
+ "eval_sts-test_spearman_max": 0.7989504707258924,
97
+ "step": 320
98
+ },
99
+ {
100
+ "epoch": 0.050148879485973984,
101
+ "eval_vitaminc-pairs_loss": 4.692601680755615,
102
+ "eval_vitaminc-pairs_runtime": 1.1397,
103
+ "eval_vitaminc-pairs_samples_per_second": 74.578,
104
+ "eval_vitaminc-pairs_steps_per_second": 2.632,
105
+ "step": 320
106
+ },
107
+ {
108
+ "epoch": 0.050148879485973984,
109
+ "eval_sts-label_loss": 3.5502490997314453,
110
+ "eval_sts-label_runtime": 0.28,
111
+ "eval_sts-label_samples_per_second": 357.117,
112
+ "eval_sts-label_steps_per_second": 14.285,
113
+ "step": 320
114
+ },
115
+ {
116
+ "epoch": 0.050148879485973984,
117
+ "eval_qnli-contrastive_loss": 0.16079513728618622,
118
+ "eval_qnli-contrastive_runtime": 0.3646,
119
+ "eval_qnli-contrastive_samples_per_second": 274.299,
120
+ "eval_qnli-contrastive_steps_per_second": 10.972,
121
+ "step": 320
122
+ },
123
+ {
124
+ "epoch": 0.050148879485973984,
125
+ "eval_scitail-pairs-qa_loss": 0.07610582560300827,
126
+ "eval_scitail-pairs-qa_runtime": 0.8885,
127
+ "eval_scitail-pairs-qa_samples_per_second": 112.548,
128
+ "eval_scitail-pairs-qa_steps_per_second": 4.502,
129
+ "step": 320
130
+ },
131
+ {
132
+ "epoch": 0.050148879485973984,
133
+ "eval_scitail-pairs-pos_loss": 0.5141278505325317,
134
+ "eval_scitail-pairs-pos_runtime": 1.3498,
135
+ "eval_scitail-pairs-pos_samples_per_second": 74.085,
136
+ "eval_scitail-pairs-pos_steps_per_second": 2.963,
137
+ "step": 320
138
+ },
139
+ {
140
+ "epoch": 0.050148879485973984,
141
+ "eval_xsum-pairs_loss": 0.25581496953964233,
142
+ "eval_xsum-pairs_runtime": 0.9407,
143
+ "eval_xsum-pairs_samples_per_second": 106.304,
144
+ "eval_xsum-pairs_steps_per_second": 4.252,
145
+ "step": 320
146
+ },
147
+ {
148
+ "epoch": 0.050148879485973984,
149
+ "eval_compression-pairs_loss": 0.09814296662807465,
150
+ "eval_compression-pairs_runtime": 0.2758,
151
+ "eval_compression-pairs_samples_per_second": 362.517,
152
+ "eval_compression-pairs_steps_per_second": 14.501,
153
+ "step": 320
154
+ },
155
+ {
156
+ "epoch": 0.050148879485973984,
157
+ "eval_sciq_pairs_loss": 0.25620242953300476,
158
+ "eval_sciq_pairs_runtime": 4.1155,
159
+ "eval_sciq_pairs_samples_per_second": 24.298,
160
+ "eval_sciq_pairs_steps_per_second": 0.972,
161
+ "step": 320
162
+ },
163
+ {
164
+ "epoch": 0.050148879485973984,
165
+ "eval_qasc_pairs_loss": 0.2044612169265747,
166
+ "eval_qasc_pairs_runtime": 1.1029,
167
+ "eval_qasc_pairs_samples_per_second": 90.672,
168
+ "eval_qasc_pairs_steps_per_second": 3.627,
169
+ "step": 320
170
+ },
171
+ {
172
+ "epoch": 0.050148879485973984,
173
+ "eval_openbookqa_pairs_loss": 1.7537646293640137,
174
+ "eval_openbookqa_pairs_runtime": 0.9037,
175
+ "eval_openbookqa_pairs_samples_per_second": 110.653,
176
+ "eval_openbookqa_pairs_steps_per_second": 4.426,
177
+ "step": 320
178
+ },
179
+ {
180
+ "epoch": 0.050148879485973984,
181
+ "eval_msmarco_pairs_loss": 0.5138561725616455,
182
+ "eval_msmarco_pairs_runtime": 2.0511,
183
+ "eval_msmarco_pairs_samples_per_second": 48.754,
184
+ "eval_msmarco_pairs_steps_per_second": 1.95,
185
+ "step": 320
186
+ },
187
+ {
188
+ "epoch": 0.050148879485973984,
189
+ "eval_nq_pairs_loss": 0.23510317504405975,
190
+ "eval_nq_pairs_runtime": 4.5293,
191
+ "eval_nq_pairs_samples_per_second": 22.078,
192
+ "eval_nq_pairs_steps_per_second": 0.883,
193
+ "step": 320
194
+ },
195
+ {
196
+ "epoch": 0.050148879485973984,
197
+ "eval_trivia_pairs_loss": 0.7808571457862854,
198
+ "eval_trivia_pairs_runtime": 6.5065,
199
+ "eval_trivia_pairs_samples_per_second": 15.369,
200
+ "eval_trivia_pairs_steps_per_second": 0.615,
201
+ "step": 320
202
+ },
203
+ {
204
+ "epoch": 0.050148879485973984,
205
+ "eval_quora_pairs_loss": 0.0392119362950325,
206
+ "eval_quora_pairs_runtime": 0.675,
207
+ "eval_quora_pairs_samples_per_second": 148.153,
208
+ "eval_quora_pairs_steps_per_second": 5.926,
209
+ "step": 320
210
+ },
211
+ {
212
+ "epoch": 0.050148879485973984,
213
+ "eval_gooaq_pairs_loss": 0.4712902009487152,
214
+ "eval_gooaq_pairs_runtime": 1.4079,
215
+ "eval_gooaq_pairs_samples_per_second": 71.028,
216
+ "eval_gooaq_pairs_steps_per_second": 2.841,
217
+ "step": 320
218
+ },
219
+ {
220
+ "epoch": 0.050148879485973984,
221
+ "eval_mrpc_pairs_loss": 0.05498996376991272,
222
+ "eval_mrpc_pairs_runtime": 0.2623,
223
+ "eval_mrpc_pairs_samples_per_second": 381.172,
224
+ "eval_mrpc_pairs_steps_per_second": 15.247,
225
+ "step": 320
226
+ },
227
+ {
228
+ "epoch": 0.05516376743457138,
229
+ "grad_norm": 0.34924012422561646,
230
+ "learning_rate": 1.092901878914405e-05,
231
+ "loss": 0.5791,
232
+ "step": 352
233
+ },
234
+ {
235
+ "epoch": 0.06017865538316878,
236
+ "grad_norm": 0.36700841784477234,
237
+ "learning_rate": 1.1931106471816284e-05,
238
+ "loss": 0.6413,
239
+ "step": 384
240
+ },
241
+ {
242
+ "epoch": 0.06519354333176618,
243
+ "grad_norm": 7.559622764587402,
244
+ "learning_rate": 1.2933194154488518e-05,
245
+ "loss": 0.4319,
246
+ "step": 416
247
+ },
248
+ {
249
+ "epoch": 0.07020843128036358,
250
+ "grad_norm": 7.982416152954102,
251
+ "learning_rate": 1.3935281837160753e-05,
252
+ "loss": 0.6672,
253
+ "step": 448
254
+ },
255
+ {
256
+ "epoch": 0.07522331922896097,
257
+ "grad_norm": 0.6726166009902954,
258
+ "learning_rate": 1.4937369519832987e-05,
259
+ "loss": 0.459,
260
+ "step": 480
261
+ },
262
+ {
263
+ "epoch": 0.08023820717755838,
264
+ "grad_norm": 14.846123695373535,
265
+ "learning_rate": 1.593945720250522e-05,
266
+ "loss": 0.7621,
267
+ "step": 512
268
+ },
269
+ {
270
+ "epoch": 0.08525309512615578,
271
+ "grad_norm": 0.7846627831459045,
272
+ "learning_rate": 1.6941544885177454e-05,
273
+ "loss": 0.864,
274
+ "step": 544
275
+ },
276
+ {
277
+ "epoch": 0.09026798307475317,
278
+ "grad_norm": 0.8993583917617798,
279
+ "learning_rate": 1.7943632567849688e-05,
280
+ "loss": 0.5081,
281
+ "step": 576
282
+ },
283
+ {
284
+ "epoch": 0.09528287102335058,
285
+ "grad_norm": 1.4990565776824951,
286
+ "learning_rate": 1.894572025052192e-05,
287
+ "loss": 0.654,
288
+ "step": 608
289
+ },
290
+ {
291
+ "epoch": 0.10029775897194797,
292
+ "grad_norm": 15.647976875305176,
293
+ "learning_rate": 1.9947807933194157e-05,
294
+ "loss": 0.6372,
295
+ "step": 640
296
+ },
297
+ {
298
+ "epoch": 0.10029775897194797,
299
+ "eval_nli-pairs_loss": 1.0652996301651,
300
+ "eval_nli-pairs_runtime": 3.6326,
301
+ "eval_nli-pairs_samples_per_second": 27.528,
302
+ "eval_nli-pairs_steps_per_second": 1.101,
303
+ "eval_sts-test_pearson_cosine": 0.785263018402905,
304
+ "eval_sts-test_pearson_dot": 0.5290450141477089,
305
+ "eval_sts-test_pearson_euclidean": 0.7433756286425983,
306
+ "eval_sts-test_pearson_manhattan": 0.7411097274300102,
307
+ "eval_sts-test_pearson_max": 0.785263018402905,
308
+ "eval_sts-test_spearman_cosine": 0.7996928912411947,
309
+ "eval_sts-test_spearman_dot": 0.5102571497667188,
310
+ "eval_sts-test_spearman_euclidean": 0.7338969723324641,
311
+ "eval_sts-test_spearman_manhattan": 0.7343494860194358,
312
+ "eval_sts-test_spearman_max": 0.7996928912411947,
313
+ "step": 640
314
+ },
315
+ {
316
+ "epoch": 0.10029775897194797,
317
+ "eval_vitaminc-pairs_loss": 4.719416618347168,
318
+ "eval_vitaminc-pairs_runtime": 1.1268,
319
+ "eval_vitaminc-pairs_samples_per_second": 75.437,
320
+ "eval_vitaminc-pairs_steps_per_second": 2.662,
321
+ "step": 640
322
+ },
323
+ {
324
+ "epoch": 0.10029775897194797,
325
+ "eval_sts-label_loss": 3.612347364425659,
326
+ "eval_sts-label_runtime": 0.2683,
327
+ "eval_sts-label_samples_per_second": 372.651,
328
+ "eval_sts-label_steps_per_second": 14.906,
329
+ "step": 640
330
+ },
331
+ {
332
+ "epoch": 0.10029775897194797,
333
+ "eval_qnli-contrastive_loss": 0.15202775597572327,
334
+ "eval_qnli-contrastive_runtime": 0.3528,
335
+ "eval_qnli-contrastive_samples_per_second": 283.457,
336
+ "eval_qnli-contrastive_steps_per_second": 11.338,
337
+ "step": 640
338
+ },
339
+ {
340
+ "epoch": 0.10029775897194797,
341
+ "eval_scitail-pairs-qa_loss": 0.07544919103384018,
342
+ "eval_scitail-pairs-qa_runtime": 0.8732,
343
+ "eval_scitail-pairs-qa_samples_per_second": 114.517,
344
+ "eval_scitail-pairs-qa_steps_per_second": 4.581,
345
+ "step": 640
346
+ },
347
+ {
348
+ "epoch": 0.10029775897194797,
349
+ "eval_scitail-pairs-pos_loss": 0.5404170751571655,
350
+ "eval_scitail-pairs-pos_runtime": 1.3146,
351
+ "eval_scitail-pairs-pos_samples_per_second": 76.067,
352
+ "eval_scitail-pairs-pos_steps_per_second": 3.043,
353
+ "step": 640
354
+ },
355
+ {
356
+ "epoch": 0.10029775897194797,
357
+ "eval_xsum-pairs_loss": 0.25958582758903503,
358
+ "eval_xsum-pairs_runtime": 0.9287,
359
+ "eval_xsum-pairs_samples_per_second": 107.679,
360
+ "eval_xsum-pairs_steps_per_second": 4.307,
361
+ "step": 640
362
+ },
363
+ {
364
+ "epoch": 0.10029775897194797,
365
+ "eval_compression-pairs_loss": 0.10066353529691696,
366
+ "eval_compression-pairs_runtime": 0.2732,
367
+ "eval_compression-pairs_samples_per_second": 366.076,
368
+ "eval_compression-pairs_steps_per_second": 14.643,
369
+ "step": 640
370
+ },
371
+ {
372
+ "epoch": 0.10029775897194797,
373
+ "eval_sciq_pairs_loss": 0.2645374834537506,
374
+ "eval_sciq_pairs_runtime": 4.0725,
375
+ "eval_sciq_pairs_samples_per_second": 24.555,
376
+ "eval_sciq_pairs_steps_per_second": 0.982,
377
+ "step": 640
378
+ },
379
+ {
380
+ "epoch": 0.10029775897194797,
381
+ "eval_qasc_pairs_loss": 0.21021947264671326,
382
+ "eval_qasc_pairs_runtime": 1.0743,
383
+ "eval_qasc_pairs_samples_per_second": 93.084,
384
+ "eval_qasc_pairs_steps_per_second": 3.723,
385
+ "step": 640
386
+ },
387
+ {
388
+ "epoch": 0.10029775897194797,
389
+ "eval_openbookqa_pairs_loss": 1.7905032634735107,
390
+ "eval_openbookqa_pairs_runtime": 0.8886,
391
+ "eval_openbookqa_pairs_samples_per_second": 112.532,
392
+ "eval_openbookqa_pairs_steps_per_second": 4.501,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 0.10029775897194797,
397
+ "eval_msmarco_pairs_loss": 0.5102832913398743,
398
+ "eval_msmarco_pairs_runtime": 2.0529,
399
+ "eval_msmarco_pairs_samples_per_second": 48.712,
400
+ "eval_msmarco_pairs_steps_per_second": 1.948,
401
+ "step": 640
402
+ },
403
+ {
404
+ "epoch": 0.10029775897194797,
405
+ "eval_nq_pairs_loss": 0.24466972053050995,
406
+ "eval_nq_pairs_runtime": 4.4973,
407
+ "eval_nq_pairs_samples_per_second": 22.235,
408
+ "eval_nq_pairs_steps_per_second": 0.889,
409
+ "step": 640
410
+ },
411
+ {
412
+ "epoch": 0.10029775897194797,
413
+ "eval_trivia_pairs_loss": 0.8748095631599426,
414
+ "eval_trivia_pairs_runtime": 6.4825,
415
+ "eval_trivia_pairs_samples_per_second": 15.426,
416
+ "eval_trivia_pairs_steps_per_second": 0.617,
417
+ "step": 640
418
+ },
419
+ {
420
+ "epoch": 0.10029775897194797,
421
+ "eval_quora_pairs_loss": 0.07820220291614532,
422
+ "eval_quora_pairs_runtime": 0.6944,
423
+ "eval_quora_pairs_samples_per_second": 144.008,
424
+ "eval_quora_pairs_steps_per_second": 5.76,
425
+ "step": 640
426
+ },
427
+ {
428
+ "epoch": 0.10029775897194797,
429
+ "eval_gooaq_pairs_loss": 0.5236212611198425,
430
+ "eval_gooaq_pairs_runtime": 1.3899,
431
+ "eval_gooaq_pairs_samples_per_second": 71.949,
432
+ "eval_gooaq_pairs_steps_per_second": 2.878,
433
+ "step": 640
434
+ },
435
+ {
436
+ "epoch": 0.10029775897194797,
437
+ "eval_mrpc_pairs_loss": 0.05494727939367294,
438
+ "eval_mrpc_pairs_runtime": 0.2598,
439
+ "eval_mrpc_pairs_samples_per_second": 384.941,
440
+ "eval_mrpc_pairs_steps_per_second": 15.398,
441
+ "step": 640
442
+ },
443
+ {
444
+ "epoch": 0.10531264692054537,
445
+ "grad_norm": 11.01974105834961,
446
+ "learning_rate": 2.0949895615866387e-05,
447
+ "loss": 0.9292,
448
+ "step": 672
449
+ },
450
+ {
451
+ "epoch": 0.11032753486914276,
452
+ "grad_norm": 0.5542309284210205,
453
+ "learning_rate": 2.1951983298538625e-05,
454
+ "loss": 1.3108,
455
+ "step": 704
456
+ },
457
+ {
458
+ "epoch": 0.11534242281774017,
459
+ "grad_norm": 15.458569526672363,
460
+ "learning_rate": 2.2954070981210856e-05,
461
+ "loss": 0.9674,
462
+ "step": 736
463
+ },
464
+ {
465
+ "epoch": 0.12035731076633756,
466
+ "grad_norm": 2.7814478874206543,
467
+ "learning_rate": 2.395615866388309e-05,
468
+ "loss": 0.9226,
469
+ "step": 768
470
+ },
471
+ {
472
+ "epoch": 0.12537219871493496,
473
+ "grad_norm": 11.393244743347168,
474
+ "learning_rate": 2.4958246346555324e-05,
475
+ "loss": 0.789,
476
+ "step": 800
477
+ },
478
+ {
479
+ "epoch": 0.13038708666353235,
480
+ "grad_norm": 9.288290977478027,
481
+ "learning_rate": 2.596033402922756e-05,
482
+ "loss": 0.5186,
483
+ "step": 832
484
+ },
485
+ {
486
+ "epoch": 0.13540197461212977,
487
+ "grad_norm": 47.65571212768555,
488
+ "learning_rate": 2.6962421711899793e-05,
489
+ "loss": 0.6726,
490
+ "step": 864
491
+ },
492
+ {
493
+ "epoch": 0.14041686256072716,
494
+ "grad_norm": 12.908064842224121,
495
+ "learning_rate": 2.7964509394572024e-05,
496
+ "loss": 0.5381,
497
+ "step": 896
498
+ },
499
+ {
500
+ "epoch": 0.14543175050932455,
501
+ "grad_norm": 14.951742172241211,
502
+ "learning_rate": 2.896659707724426e-05,
503
+ "loss": 0.581,
504
+ "step": 928
505
+ },
506
+ {
507
+ "epoch": 0.15044663845792194,
508
+ "grad_norm": 20.12006187438965,
509
+ "learning_rate": 2.9968684759916492e-05,
510
+ "loss": 0.9038,
511
+ "step": 960
512
+ },
513
+ {
514
+ "epoch": 0.15044663845792194,
515
+ "eval_nli-pairs_loss": 1.2173175811767578,
516
+ "eval_nli-pairs_runtime": 3.7098,
517
+ "eval_nli-pairs_samples_per_second": 26.955,
518
+ "eval_nli-pairs_steps_per_second": 1.078,
519
+ "eval_sts-test_pearson_cosine": 0.7840992835675669,
520
+ "eval_sts-test_pearson_dot": 0.5220462136106129,
521
+ "eval_sts-test_pearson_euclidean": 0.7457350047351855,
522
+ "eval_sts-test_pearson_manhattan": 0.7425970830541657,
523
+ "eval_sts-test_pearson_max": 0.7840992835675669,
524
+ "eval_sts-test_spearman_cosine": 0.8006376809572144,
525
+ "eval_sts-test_spearman_dot": 0.5020544543992158,
526
+ "eval_sts-test_spearman_euclidean": 0.7369257710408655,
527
+ "eval_sts-test_spearman_manhattan": 0.7362649758012406,
528
+ "eval_sts-test_spearman_max": 0.8006376809572144,
529
+ "step": 960
530
+ },
531
+ {
532
+ "epoch": 0.15044663845792194,
533
+ "eval_vitaminc-pairs_loss": 4.774902820587158,
534
+ "eval_vitaminc-pairs_runtime": 1.1212,
535
+ "eval_vitaminc-pairs_samples_per_second": 75.809,
536
+ "eval_vitaminc-pairs_steps_per_second": 2.676,
537
+ "step": 960
538
+ },
539
+ {
540
+ "epoch": 0.15044663845792194,
541
+ "eval_sts-label_loss": 3.198556900024414,
542
+ "eval_sts-label_runtime": 0.2678,
543
+ "eval_sts-label_samples_per_second": 373.382,
544
+ "eval_sts-label_steps_per_second": 14.935,
545
+ "step": 960
546
+ },
547
+ {
548
+ "epoch": 0.15044663845792194,
549
+ "eval_qnli-contrastive_loss": 0.1943340301513672,
550
+ "eval_qnli-contrastive_runtime": 0.3511,
551
+ "eval_qnli-contrastive_samples_per_second": 284.789,
552
+ "eval_qnli-contrastive_steps_per_second": 11.392,
553
+ "step": 960
554
+ },
555
+ {
556
+ "epoch": 0.15044663845792194,
557
+ "eval_scitail-pairs-qa_loss": 0.08060617744922638,
558
+ "eval_scitail-pairs-qa_runtime": 0.8778,
559
+ "eval_scitail-pairs-qa_samples_per_second": 113.92,
560
+ "eval_scitail-pairs-qa_steps_per_second": 4.557,
561
+ "step": 960
562
+ },
563
+ {
564
+ "epoch": 0.15044663845792194,
565
+ "eval_scitail-pairs-pos_loss": 0.4759831428527832,
566
+ "eval_scitail-pairs-pos_runtime": 1.3609,
567
+ "eval_scitail-pairs-pos_samples_per_second": 73.48,
568
+ "eval_scitail-pairs-pos_steps_per_second": 2.939,
569
+ "step": 960
570
+ },
571
+ {
572
+ "epoch": 0.15044663845792194,
573
+ "eval_xsum-pairs_loss": 0.27583304047584534,
574
+ "eval_xsum-pairs_runtime": 0.9343,
575
+ "eval_xsum-pairs_samples_per_second": 107.035,
576
+ "eval_xsum-pairs_steps_per_second": 4.281,
577
+ "step": 960
578
+ },
579
+ {
580
+ "epoch": 0.15044663845792194,
581
+ "eval_compression-pairs_loss": 0.10094660520553589,
582
+ "eval_compression-pairs_runtime": 0.2739,
583
+ "eval_compression-pairs_samples_per_second": 365.047,
584
+ "eval_compression-pairs_steps_per_second": 14.602,
585
+ "step": 960
586
+ },
587
+ {
588
+ "epoch": 0.15044663845792194,
589
+ "eval_sciq_pairs_loss": 0.2688131630420685,
590
+ "eval_sciq_pairs_runtime": 4.0582,
591
+ "eval_sciq_pairs_samples_per_second": 24.641,
592
+ "eval_sciq_pairs_steps_per_second": 0.986,
593
+ "step": 960
594
+ },
595
+ {
596
+ "epoch": 0.15044663845792194,
597
+ "eval_qasc_pairs_loss": 0.23267821967601776,
598
+ "eval_qasc_pairs_runtime": 1.0554,
599
+ "eval_qasc_pairs_samples_per_second": 94.75,
600
+ "eval_qasc_pairs_steps_per_second": 3.79,
601
+ "step": 960
602
+ },
603
+ {
604
+ "epoch": 0.15044663845792194,
605
+ "eval_openbookqa_pairs_loss": 1.8053069114685059,
606
+ "eval_openbookqa_pairs_runtime": 0.8871,
607
+ "eval_openbookqa_pairs_samples_per_second": 112.727,
608
+ "eval_openbookqa_pairs_steps_per_second": 4.509,
609
+ "step": 960
610
+ },
611
+ {
612
+ "epoch": 0.15044663845792194,
613
+ "eval_msmarco_pairs_loss": 0.5809260606765747,
614
+ "eval_msmarco_pairs_runtime": 2.0498,
615
+ "eval_msmarco_pairs_samples_per_second": 48.786,
616
+ "eval_msmarco_pairs_steps_per_second": 1.951,
617
+ "step": 960
618
+ },
619
+ {
620
+ "epoch": 0.15044663845792194,
621
+ "eval_nq_pairs_loss": 0.2808491885662079,
622
+ "eval_nq_pairs_runtime": 4.4982,
623
+ "eval_nq_pairs_samples_per_second": 22.231,
624
+ "eval_nq_pairs_steps_per_second": 0.889,
625
+ "step": 960
626
+ },
627
+ {
628
+ "epoch": 0.15044663845792194,
629
+ "eval_trivia_pairs_loss": 0.9379808902740479,
630
+ "eval_trivia_pairs_runtime": 6.4578,
631
+ "eval_trivia_pairs_samples_per_second": 15.485,
632
+ "eval_trivia_pairs_steps_per_second": 0.619,
633
+ "step": 960
634
+ },
635
+ {
636
+ "epoch": 0.15044663845792194,
637
+ "eval_quora_pairs_loss": 0.0913279801607132,
638
+ "eval_quora_pairs_runtime": 0.6721,
639
+ "eval_quora_pairs_samples_per_second": 148.79,
640
+ "eval_quora_pairs_steps_per_second": 5.952,
641
+ "step": 960
642
+ },
643
+ {
644
+ "epoch": 0.15044663845792194,
645
+ "eval_gooaq_pairs_loss": 0.5807955265045166,
646
+ "eval_gooaq_pairs_runtime": 1.3915,
647
+ "eval_gooaq_pairs_samples_per_second": 71.865,
648
+ "eval_gooaq_pairs_steps_per_second": 2.875,
649
+ "step": 960
650
+ },
651
+ {
652
+ "epoch": 0.15044663845792194,
653
+ "eval_mrpc_pairs_loss": 0.05799216777086258,
654
+ "eval_mrpc_pairs_runtime": 0.2571,
655
+ "eval_mrpc_pairs_samples_per_second": 388.998,
656
+ "eval_mrpc_pairs_steps_per_second": 15.56,
657
+ "step": 960
658
+ },
659
+ {
660
+ "epoch": 0.15546152640651936,
661
+ "grad_norm": 9.773286819458008,
662
+ "learning_rate": 2.9997957904107625e-05,
663
+ "loss": 0.7964,
664
+ "step": 992
665
+ },
666
+ {
667
+ "epoch": 0.16047641435511675,
668
+ "grad_norm": 19.411075592041016,
669
+ "learning_rate": 2.9991566594209126e-05,
670
+ "loss": 0.8213,
671
+ "step": 1024
672
+ },
673
+ {
674
+ "epoch": 0.16549130230371414,
675
+ "grad_norm": 3.5282175540924072,
676
+ "learning_rate": 2.9980825799589488e-05,
677
+ "loss": 0.5396,
678
+ "step": 1056
679
+ },
680
+ {
681
+ "epoch": 0.17050619025231156,
682
+ "grad_norm": 62.66339874267578,
683
+ "learning_rate": 2.996573863646219e-05,
684
+ "loss": 0.9297,
685
+ "step": 1088
686
+ },
687
+ {
688
+ "epoch": 0.17552107820090895,
689
+ "grad_norm": 8.785274505615234,
690
+ "learning_rate": 2.994630948204727e-05,
691
+ "loss": 1.169,
692
+ "step": 1120
693
+ },
694
+ {
695
+ "epoch": 0.18053596614950634,
696
+ "grad_norm": 24.10859489440918,
697
+ "learning_rate": 2.992254397330132e-05,
698
+ "loss": 0.7486,
699
+ "step": 1152
700
+ },
701
+ {
702
+ "epoch": 0.18555085409810373,
703
+ "grad_norm": 25.545284271240234,
704
+ "learning_rate": 2.9894449005282077e-05,
705
+ "loss": 0.6821,
706
+ "step": 1184
707
+ },
708
+ {
709
+ "epoch": 0.19056574204670115,
710
+ "grad_norm": 0.8675521016120911,
711
+ "learning_rate": 2.9862032729147954e-05,
712
+ "loss": 0.6125,
713
+ "step": 1216
714
+ },
715
+ {
716
+ "epoch": 0.19558062999529854,
717
+ "grad_norm": 16.122114181518555,
718
+ "learning_rate": 2.9825304549793153e-05,
719
+ "loss": 0.8061,
720
+ "step": 1248
721
+ },
722
+ {
723
+ "epoch": 0.20059551794389593,
724
+ "grad_norm": 1.0314382314682007,
725
+ "learning_rate": 2.978427512311904e-05,
726
+ "loss": 0.6918,
727
+ "step": 1280
728
+ },
729
+ {
730
+ "epoch": 0.20059551794389593,
731
+ "eval_nli-pairs_loss": 1.1552109718322754,
732
+ "eval_nli-pairs_runtime": 3.8751,
733
+ "eval_nli-pairs_samples_per_second": 25.806,
734
+ "eval_nli-pairs_steps_per_second": 1.032,
735
+ "eval_sts-test_pearson_cosine": 0.786106976104726,
736
+ "eval_sts-test_pearson_dot": 0.5116758767219935,
737
+ "eval_sts-test_pearson_euclidean": 0.7432891018313416,
738
+ "eval_sts-test_pearson_manhattan": 0.7400929158927781,
739
+ "eval_sts-test_pearson_max": 0.786106976104726,
740
+ "eval_sts-test_spearman_cosine": 0.801377272203007,
741
+ "eval_sts-test_spearman_dot": 0.4921454166952506,
742
+ "eval_sts-test_spearman_euclidean": 0.7343686249967402,
743
+ "eval_sts-test_spearman_manhattan": 0.7331946050808561,
744
+ "eval_sts-test_spearman_max": 0.801377272203007,
745
+ "step": 1280
746
+ },
747
+ {
748
+ "epoch": 0.20059551794389593,
749
+ "eval_vitaminc-pairs_loss": 4.6789751052856445,
750
+ "eval_vitaminc-pairs_runtime": 1.1504,
751
+ "eval_vitaminc-pairs_samples_per_second": 73.889,
752
+ "eval_vitaminc-pairs_steps_per_second": 2.608,
753
+ "step": 1280
754
+ },
755
+ {
756
+ "epoch": 0.20059551794389593,
757
+ "eval_sts-label_loss": 3.5580556392669678,
758
+ "eval_sts-label_runtime": 0.2834,
759
+ "eval_sts-label_samples_per_second": 352.858,
760
+ "eval_sts-label_steps_per_second": 14.114,
761
+ "step": 1280
762
+ },
763
+ {
764
+ "epoch": 0.20059551794389593,
765
+ "eval_qnli-contrastive_loss": 0.20369713008403778,
766
+ "eval_qnli-contrastive_runtime": 0.358,
767
+ "eval_qnli-contrastive_samples_per_second": 279.331,
768
+ "eval_qnli-contrastive_steps_per_second": 11.173,
769
+ "step": 1280
770
+ },
771
+ {
772
+ "epoch": 0.20059551794389593,
773
+ "eval_scitail-pairs-qa_loss": 0.07465875148773193,
774
+ "eval_scitail-pairs-qa_runtime": 0.9504,
775
+ "eval_scitail-pairs-qa_samples_per_second": 105.214,
776
+ "eval_scitail-pairs-qa_steps_per_second": 4.209,
777
+ "step": 1280
778
+ },
779
+ {
780
+ "epoch": 0.20059551794389593,
781
+ "eval_scitail-pairs-pos_loss": 0.49434563517570496,
782
+ "eval_scitail-pairs-pos_runtime": 1.6041,
783
+ "eval_scitail-pairs-pos_samples_per_second": 62.339,
784
+ "eval_scitail-pairs-pos_steps_per_second": 2.494,
785
+ "step": 1280
786
+ },
787
+ {
788
+ "epoch": 0.20059551794389593,
789
+ "eval_xsum-pairs_loss": 0.28282061219215393,
790
+ "eval_xsum-pairs_runtime": 0.9316,
791
+ "eval_xsum-pairs_samples_per_second": 107.346,
792
+ "eval_xsum-pairs_steps_per_second": 4.294,
793
+ "step": 1280
794
+ },
795
+ {
796
+ "epoch": 0.20059551794389593,
797
+ "eval_compression-pairs_loss": 0.097385473549366,
798
+ "eval_compression-pairs_runtime": 0.2754,
799
+ "eval_compression-pairs_samples_per_second": 363.1,
800
+ "eval_compression-pairs_steps_per_second": 14.524,
801
+ "step": 1280
802
+ },
803
+ {
804
+ "epoch": 0.20059551794389593,
805
+ "eval_sciq_pairs_loss": 0.2762215733528137,
806
+ "eval_sciq_pairs_runtime": 4.2307,
807
+ "eval_sciq_pairs_samples_per_second": 23.637,
808
+ "eval_sciq_pairs_steps_per_second": 0.945,
809
+ "step": 1280
810
+ },
811
+ {
812
+ "epoch": 0.20059551794389593,
813
+ "eval_qasc_pairs_loss": 0.19347424805164337,
814
+ "eval_qasc_pairs_runtime": 1.2282,
815
+ "eval_qasc_pairs_samples_per_second": 81.421,
816
+ "eval_qasc_pairs_steps_per_second": 3.257,
817
+ "step": 1280
818
+ },
819
+ {
820
+ "epoch": 0.20059551794389593,
821
+ "eval_openbookqa_pairs_loss": 1.6875064373016357,
822
+ "eval_openbookqa_pairs_runtime": 1.1661,
823
+ "eval_openbookqa_pairs_samples_per_second": 85.754,
824
+ "eval_openbookqa_pairs_steps_per_second": 3.43,
825
+ "step": 1280
826
+ },
827
+ {
828
+ "epoch": 0.20059551794389593,
829
+ "eval_msmarco_pairs_loss": 0.5743877291679382,
830
+ "eval_msmarco_pairs_runtime": 2.1428,
831
+ "eval_msmarco_pairs_samples_per_second": 46.669,
832
+ "eval_msmarco_pairs_steps_per_second": 1.867,
833
+ "step": 1280
834
+ },
835
+ {
836
+ "epoch": 0.20059551794389593,
837
+ "eval_nq_pairs_loss": 0.30348217487335205,
838
+ "eval_nq_pairs_runtime": 4.5543,
839
+ "eval_nq_pairs_samples_per_second": 21.957,
840
+ "eval_nq_pairs_steps_per_second": 0.878,
841
+ "step": 1280
842
+ },
843
+ {
844
+ "epoch": 0.20059551794389593,
845
+ "eval_trivia_pairs_loss": 0.9221765995025635,
846
+ "eval_trivia_pairs_runtime": 6.6513,
847
+ "eval_trivia_pairs_samples_per_second": 15.035,
848
+ "eval_trivia_pairs_steps_per_second": 0.601,
849
+ "step": 1280
850
+ },
851
+ {
852
+ "epoch": 0.20059551794389593,
853
+ "eval_quora_pairs_loss": 0.03854631260037422,
854
+ "eval_quora_pairs_runtime": 0.7822,
855
+ "eval_quora_pairs_samples_per_second": 127.852,
856
+ "eval_quora_pairs_steps_per_second": 5.114,
857
+ "step": 1280
858
+ },
859
+ {
860
+ "epoch": 0.20059551794389593,
861
+ "eval_gooaq_pairs_loss": 0.528398334980011,
862
+ "eval_gooaq_pairs_runtime": 1.4882,
863
+ "eval_gooaq_pairs_samples_per_second": 67.194,
864
+ "eval_gooaq_pairs_steps_per_second": 2.688,
865
+ "step": 1280
866
+ },
867
+ {
868
+ "epoch": 0.20059551794389593,
869
+ "eval_mrpc_pairs_loss": 0.05623970925807953,
870
+ "eval_mrpc_pairs_runtime": 0.2698,
871
+ "eval_mrpc_pairs_samples_per_second": 370.713,
872
+ "eval_mrpc_pairs_steps_per_second": 14.829,
873
+ "step": 1280
874
+ },
875
+ {
876
+ "epoch": 0.20561040589249335,
877
+ "grad_norm": 0.6042119860649109,
878
+ "learning_rate": 2.9738956352942557e-05,
879
+ "loss": 0.9421,
880
+ "step": 1312
881
+ },
882
+ {
883
+ "epoch": 0.21062529384109074,
884
+ "grad_norm": 13.87867546081543,
885
+ "learning_rate": 2.968936138754259e-05,
886
+ "loss": 0.8641,
887
+ "step": 1344
888
+ },
889
+ {
890
+ "epoch": 0.21564018178968813,
891
+ "grad_norm": 44.48640441894531,
892
+ "learning_rate": 2.9635504615845257e-05,
893
+ "loss": 1.157,
894
+ "step": 1376
895
+ },
896
+ {
897
+ "epoch": 0.22065506973828553,
898
+ "grad_norm": 15.554729461669922,
899
+ "learning_rate": 2.957928148945977e-05,
900
+ "loss": 0.8772,
901
+ "step": 1408
902
+ },
903
+ {
904
+ "epoch": 0.22566995768688294,
905
+ "grad_norm": 16.644670486450195,
906
+ "learning_rate": 2.9517081112297707e-05,
907
+ "loss": 1.0496,
908
+ "step": 1440
909
+ },
910
+ {
911
+ "epoch": 0.23068484563548033,
912
+ "grad_norm": 13.053145408630371,
913
+ "learning_rate": 2.9450668912302004e-05,
914
+ "loss": 0.589,
915
+ "step": 1472
916
+ },
917
+ {
918
+ "epoch": 0.23569973358407773,
919
+ "grad_norm": 7.827791213989258,
920
+ "learning_rate": 2.9380064157562306e-05,
921
+ "loss": 0.8234,
922
+ "step": 1504
923
+ },
924
+ {
925
+ "epoch": 0.24071462153267512,
926
+ "grad_norm": 15.598438262939453,
927
+ "learning_rate": 2.930528733254901e-05,
928
+ "loss": 0.7365,
929
+ "step": 1536
930
+ },
931
+ {
932
+ "epoch": 0.24572950948127253,
933
+ "grad_norm": 13.723180770874023,
934
+ "learning_rate": 2.9226360132170112e-05,
935
+ "loss": 0.5076,
936
+ "step": 1568
937
+ },
938
+ {
939
+ "epoch": 0.2507443974298699,
940
+ "grad_norm": 10.20022964477539,
941
+ "learning_rate": 2.9143305455476866e-05,
942
+ "loss": 1.0329,
943
+ "step": 1600
944
+ },
945
+ {
946
+ "epoch": 0.2507443974298699,
947
+ "eval_nli-pairs_loss": 1.0577216148376465,
948
+ "eval_nli-pairs_runtime": 3.6476,
949
+ "eval_nli-pairs_samples_per_second": 27.415,
950
+ "eval_nli-pairs_steps_per_second": 1.097,
951
+ "eval_sts-test_pearson_cosine": 0.7876359552191669,
952
+ "eval_sts-test_pearson_dot": 0.5220803655074544,
953
+ "eval_sts-test_pearson_euclidean": 0.7444632413869628,
954
+ "eval_sts-test_pearson_manhattan": 0.7418744760088763,
955
+ "eval_sts-test_pearson_max": 0.7876359552191669,
956
+ "eval_sts-test_spearman_cosine": 0.8018874000525117,
957
+ "eval_sts-test_spearman_dot": 0.5034518981121652,
958
+ "eval_sts-test_spearman_euclidean": 0.7344750702387959,
959
+ "eval_sts-test_spearman_manhattan": 0.7332804063416474,
960
+ "eval_sts-test_spearman_max": 0.8018874000525117,
961
+ "step": 1600
962
+ },
963
+ {
964
+ "epoch": 0.2507443974298699,
965
+ "eval_vitaminc-pairs_loss": 4.784573554992676,
966
+ "eval_vitaminc-pairs_runtime": 1.145,
967
+ "eval_vitaminc-pairs_samples_per_second": 74.235,
968
+ "eval_vitaminc-pairs_steps_per_second": 2.62,
969
+ "step": 1600
970
+ },
971
+ {
972
+ "epoch": 0.2507443974298699,
973
+ "eval_sts-label_loss": 3.6113080978393555,
974
+ "eval_sts-label_runtime": 0.2746,
975
+ "eval_sts-label_samples_per_second": 364.172,
976
+ "eval_sts-label_steps_per_second": 14.567,
977
+ "step": 1600
978
+ },
979
+ {
980
+ "epoch": 0.2507443974298699,
981
+ "eval_qnli-contrastive_loss": 0.18593625724315643,
982
+ "eval_qnli-contrastive_runtime": 0.3541,
983
+ "eval_qnli-contrastive_samples_per_second": 282.413,
984
+ "eval_qnli-contrastive_steps_per_second": 11.297,
985
+ "step": 1600
986
+ },
987
+ {
988
+ "epoch": 0.2507443974298699,
989
+ "eval_scitail-pairs-qa_loss": 0.07545661181211472,
990
+ "eval_scitail-pairs-qa_runtime": 0.8854,
991
+ "eval_scitail-pairs-qa_samples_per_second": 112.941,
992
+ "eval_scitail-pairs-qa_steps_per_second": 4.518,
993
+ "step": 1600
994
+ },
995
+ {
996
+ "epoch": 0.2507443974298699,
997
+ "eval_scitail-pairs-pos_loss": 0.5018333792686462,
998
+ "eval_scitail-pairs-pos_runtime": 1.3443,
999
+ "eval_scitail-pairs-pos_samples_per_second": 74.386,
1000
+ "eval_scitail-pairs-pos_steps_per_second": 2.975,
1001
+ "step": 1600
1002
+ },
1003
+ {
1004
+ "epoch": 0.2507443974298699,
1005
+ "eval_xsum-pairs_loss": 0.2749001085758209,
1006
+ "eval_xsum-pairs_runtime": 0.9439,
1007
+ "eval_xsum-pairs_samples_per_second": 105.939,
1008
+ "eval_xsum-pairs_steps_per_second": 4.238,
1009
+ "step": 1600
1010
+ },
1011
+ {
1012
+ "epoch": 0.2507443974298699,
1013
+ "eval_compression-pairs_loss": 0.09735233336687088,
1014
+ "eval_compression-pairs_runtime": 0.2764,
1015
+ "eval_compression-pairs_samples_per_second": 361.753,
1016
+ "eval_compression-pairs_steps_per_second": 14.47,
1017
+ "step": 1600
1018
+ },
1019
+ {
1020
+ "epoch": 0.2507443974298699,
1021
+ "eval_sciq_pairs_loss": 0.2648228108882904,
1022
+ "eval_sciq_pairs_runtime": 4.1207,
1023
+ "eval_sciq_pairs_samples_per_second": 24.268,
1024
+ "eval_sciq_pairs_steps_per_second": 0.971,
1025
+ "step": 1600
1026
+ },
1027
+ {
1028
+ "epoch": 0.2507443974298699,
1029
+ "eval_qasc_pairs_loss": 0.21318012475967407,
1030
+ "eval_qasc_pairs_runtime": 1.0917,
1031
+ "eval_qasc_pairs_samples_per_second": 91.604,
1032
+ "eval_qasc_pairs_steps_per_second": 3.664,
1033
+ "step": 1600
1034
+ },
1035
+ {
1036
+ "epoch": 0.2507443974298699,
1037
+ "eval_openbookqa_pairs_loss": 1.790009617805481,
1038
+ "eval_openbookqa_pairs_runtime": 0.8969,
1039
+ "eval_openbookqa_pairs_samples_per_second": 111.496,
1040
+ "eval_openbookqa_pairs_steps_per_second": 4.46,
1041
+ "step": 1600
1042
+ },
1043
+ {
1044
+ "epoch": 0.2507443974298699,
1045
+ "eval_msmarco_pairs_loss": 0.57186359167099,
1046
+ "eval_msmarco_pairs_runtime": 2.0592,
1047
+ "eval_msmarco_pairs_samples_per_second": 48.563,
1048
+ "eval_msmarco_pairs_steps_per_second": 1.943,
1049
+ "step": 1600
1050
+ },
1051
+ {
1052
+ "epoch": 0.2507443974298699,
1053
+ "eval_nq_pairs_loss": 0.2738310396671295,
1054
+ "eval_nq_pairs_runtime": 4.5092,
1055
+ "eval_nq_pairs_samples_per_second": 22.177,
1056
+ "eval_nq_pairs_steps_per_second": 0.887,
1057
+ "step": 1600
1058
+ },
1059
+ {
1060
+ "epoch": 0.2507443974298699,
1061
+ "eval_trivia_pairs_loss": 0.8291679620742798,
1062
+ "eval_trivia_pairs_runtime": 6.526,
1063
+ "eval_trivia_pairs_samples_per_second": 15.323,
1064
+ "eval_trivia_pairs_steps_per_second": 0.613,
1065
+ "step": 1600
1066
+ },
1067
+ {
1068
+ "epoch": 0.2507443974298699,
1069
+ "eval_quora_pairs_loss": 0.08000540733337402,
1070
+ "eval_quora_pairs_runtime": 0.6761,
1071
+ "eval_quora_pairs_samples_per_second": 147.909,
1072
+ "eval_quora_pairs_steps_per_second": 5.916,
1073
+ "step": 1600
1074
+ },
1075
+ {
1076
+ "epoch": 0.2507443974298699,
1077
+ "eval_gooaq_pairs_loss": 0.5998037457466125,
1078
+ "eval_gooaq_pairs_runtime": 1.3978,
1079
+ "eval_gooaq_pairs_samples_per_second": 71.541,
1080
+ "eval_gooaq_pairs_steps_per_second": 2.862,
1081
+ "step": 1600
1082
+ },
1083
+ {
1084
+ "epoch": 0.2507443974298699,
1085
+ "eval_mrpc_pairs_loss": 0.05507182702422142,
1086
+ "eval_mrpc_pairs_runtime": 0.2617,
1087
+ "eval_mrpc_pairs_samples_per_second": 382.156,
1088
+ "eval_mrpc_pairs_steps_per_second": 15.286,
1089
+ "step": 1600
1090
+ },
1091
+ {
1092
+ "epoch": 0.2557592853784673,
1093
+ "grad_norm": 8.05022144317627,
1094
+ "learning_rate": 2.9056147399020182e-05,
1095
+ "loss": 1.4006,
1096
+ "step": 1632
1097
+ },
1098
+ {
1099
+ "epoch": 0.2607741733270647,
1100
+ "grad_norm": 0.38224154710769653,
1101
+ "learning_rate": 2.8964911249859437e-05,
1102
+ "loss": 0.5963,
1103
+ "step": 1664
1104
+ },
1105
+ {
1106
+ "epoch": 0.2657890612756621,
1107
+ "grad_norm": 0.46655791997909546,
1108
+ "learning_rate": 2.886962347822604e-05,
1109
+ "loss": 0.7488,
1110
+ "step": 1696
1111
+ },
1112
+ {
1113
+ "epoch": 0.27080394922425954,
1114
+ "grad_norm": 8.102537155151367,
1115
+ "learning_rate": 2.8770311729843616e-05,
1116
+ "loss": 0.8548,
1117
+ "step": 1728
1118
+ },
1119
+ {
1120
+ "epoch": 0.27581883717285693,
1121
+ "grad_norm": 11.803775787353516,
1122
+ "learning_rate": 2.86670048179072e-05,
1123
+ "loss": 1.3324,
1124
+ "step": 1760
1125
+ },
1126
+ {
1127
+ "epoch": 0.2808337251214543,
1128
+ "grad_norm": 16.266756057739258,
1129
+ "learning_rate": 2.8559732714723715e-05,
1130
+ "loss": 0.5804,
1131
+ "step": 1792
1132
+ },
1133
+ {
1134
+ "epoch": 0.2858486130700517,
1135
+ "grad_norm": 2.8448822498321533,
1136
+ "learning_rate": 2.8448526543016114e-05,
1137
+ "loss": 0.7827,
1138
+ "step": 1824
1139
+ },
1140
+ {
1141
+ "epoch": 0.2908635010186491,
1142
+ "grad_norm": 21.346328735351562,
1143
+ "learning_rate": 2.8333418566893796e-05,
1144
+ "loss": 0.5448,
1145
+ "step": 1856
1146
+ },
1147
+ {
1148
+ "epoch": 0.2958783889672465,
1149
+ "grad_norm": 3.4379029273986816,
1150
+ "learning_rate": 2.8214442182491866e-05,
1151
+ "loss": 0.7368,
1152
+ "step": 1888
1153
+ },
1154
+ {
1155
+ "epoch": 0.3008932769158439,
1156
+ "grad_norm": 17.05881690979004,
1157
+ "learning_rate": 2.8091631908281963e-05,
1158
+ "loss": 0.5657,
1159
+ "step": 1920
1160
+ },
1161
+ {
1162
+ "epoch": 0.3008932769158439,
1163
+ "eval_nli-pairs_loss": 1.0244356393814087,
1164
+ "eval_nli-pairs_runtime": 3.6217,
1165
+ "eval_nli-pairs_samples_per_second": 27.612,
1166
+ "eval_nli-pairs_steps_per_second": 1.104,
1167
+ "eval_sts-test_pearson_cosine": 0.781915957368962,
1168
+ "eval_sts-test_pearson_dot": 0.49821032356844613,
1169
+ "eval_sts-test_pearson_euclidean": 0.7329308897504494,
1170
+ "eval_sts-test_pearson_manhattan": 0.7292186092506918,
1171
+ "eval_sts-test_pearson_max": 0.781915957368962,
1172
+ "eval_sts-test_spearman_cosine": 0.7983596570250642,
1173
+ "eval_sts-test_spearman_dot": 0.4812350313638781,
1174
+ "eval_sts-test_spearman_euclidean": 0.7265758267352669,
1175
+ "eval_sts-test_spearman_manhattan": 0.7259264140902829,
1176
+ "eval_sts-test_spearman_max": 0.7983596570250642,
1177
+ "step": 1920
1178
+ },
1179
+ {
1180
+ "epoch": 0.3008932769158439,
1181
+ "eval_vitaminc-pairs_loss": 4.698296070098877,
1182
+ "eval_vitaminc-pairs_runtime": 1.1338,
1183
+ "eval_vitaminc-pairs_samples_per_second": 74.97,
1184
+ "eval_vitaminc-pairs_steps_per_second": 2.646,
1185
+ "step": 1920
1186
+ },
1187
+ {
1188
+ "epoch": 0.3008932769158439,
1189
+ "eval_sts-label_loss": 3.1822261810302734,
1190
+ "eval_sts-label_runtime": 0.2702,
1191
+ "eval_sts-label_samples_per_second": 370.09,
1192
+ "eval_sts-label_steps_per_second": 14.804,
1193
+ "step": 1920
1194
+ },
1195
+ {
1196
+ "epoch": 0.3008932769158439,
1197
+ "eval_qnli-contrastive_loss": 0.11326340585947037,
1198
+ "eval_qnli-contrastive_runtime": 0.3581,
1199
+ "eval_qnli-contrastive_samples_per_second": 279.28,
1200
+ "eval_qnli-contrastive_steps_per_second": 11.171,
1201
+ "step": 1920
1202
+ },
1203
+ {
1204
+ "epoch": 0.3008932769158439,
1205
+ "eval_scitail-pairs-qa_loss": 0.07009608298540115,
1206
+ "eval_scitail-pairs-qa_runtime": 0.8816,
1207
+ "eval_scitail-pairs-qa_samples_per_second": 113.424,
1208
+ "eval_scitail-pairs-qa_steps_per_second": 4.537,
1209
+ "step": 1920
1210
+ },
1211
+ {
1212
+ "epoch": 0.3008932769158439,
1213
+ "eval_scitail-pairs-pos_loss": 0.49156129360198975,
1214
+ "eval_scitail-pairs-pos_runtime": 1.3759,
1215
+ "eval_scitail-pairs-pos_samples_per_second": 72.678,
1216
+ "eval_scitail-pairs-pos_steps_per_second": 2.907,
1217
+ "step": 1920
1218
+ },
1219
+ {
1220
+ "epoch": 0.3008932769158439,
1221
+ "eval_xsum-pairs_loss": 0.25940877199172974,
1222
+ "eval_xsum-pairs_runtime": 0.9373,
1223
+ "eval_xsum-pairs_samples_per_second": 106.695,
1224
+ "eval_xsum-pairs_steps_per_second": 4.268,
1225
+ "step": 1920
1226
+ },
1227
+ {
1228
+ "epoch": 0.3008932769158439,
1229
+ "eval_compression-pairs_loss": 0.0919649675488472,
1230
+ "eval_compression-pairs_runtime": 0.2738,
1231
+ "eval_compression-pairs_samples_per_second": 365.291,
1232
+ "eval_compression-pairs_steps_per_second": 14.612,
1233
+ "step": 1920
1234
+ },
1235
+ {
1236
+ "epoch": 0.3008932769158439,
1237
+ "eval_sciq_pairs_loss": 0.29138606786727905,
1238
+ "eval_sciq_pairs_runtime": 4.1059,
1239
+ "eval_sciq_pairs_samples_per_second": 24.355,
1240
+ "eval_sciq_pairs_steps_per_second": 0.974,
1241
+ "step": 1920
1242
+ },
1243
+ {
1244
+ "epoch": 0.3008932769158439,
1245
+ "eval_qasc_pairs_loss": 0.19625085592269897,
1246
+ "eval_qasc_pairs_runtime": 1.0611,
1247
+ "eval_qasc_pairs_samples_per_second": 94.24,
1248
+ "eval_qasc_pairs_steps_per_second": 3.77,
1249
+ "step": 1920
1250
+ },
1251
+ {
1252
+ "epoch": 0.3008932769158439,
1253
+ "eval_openbookqa_pairs_loss": 1.7960456609725952,
1254
+ "eval_openbookqa_pairs_runtime": 0.9042,
1255
+ "eval_openbookqa_pairs_samples_per_second": 110.601,
1256
+ "eval_openbookqa_pairs_steps_per_second": 4.424,
1257
+ "step": 1920
1258
+ },
1259
+ {
1260
+ "epoch": 0.3008932769158439,
1261
+ "eval_msmarco_pairs_loss": 0.5171416997909546,
1262
+ "eval_msmarco_pairs_runtime": 2.0637,
1263
+ "eval_msmarco_pairs_samples_per_second": 48.457,
1264
+ "eval_msmarco_pairs_steps_per_second": 1.938,
1265
+ "step": 1920
1266
+ },
1267
+ {
1268
+ "epoch": 0.3008932769158439,
1269
+ "eval_nq_pairs_loss": 0.24809740483760834,
1270
+ "eval_nq_pairs_runtime": 4.529,
1271
+ "eval_nq_pairs_samples_per_second": 22.08,
1272
+ "eval_nq_pairs_steps_per_second": 0.883,
1273
+ "step": 1920
1274
+ },
1275
+ {
1276
+ "epoch": 0.3008932769158439,
1277
+ "eval_trivia_pairs_loss": 0.9041999578475952,
1278
+ "eval_trivia_pairs_runtime": 6.5257,
1279
+ "eval_trivia_pairs_samples_per_second": 15.324,
1280
+ "eval_trivia_pairs_steps_per_second": 0.613,
1281
+ "step": 1920
1282
+ },
1283
+ {
1284
+ "epoch": 0.3008932769158439,
1285
+ "eval_quora_pairs_loss": 0.03601976856589317,
1286
+ "eval_quora_pairs_runtime": 0.6811,
1287
+ "eval_quora_pairs_samples_per_second": 146.827,
1288
+ "eval_quora_pairs_steps_per_second": 5.873,
1289
+ "step": 1920
1290
+ },
1291
+ {
1292
+ "epoch": 0.3008932769158439,
1293
+ "eval_gooaq_pairs_loss": 0.5626399517059326,
1294
+ "eval_gooaq_pairs_runtime": 1.3943,
1295
+ "eval_gooaq_pairs_samples_per_second": 71.72,
1296
+ "eval_gooaq_pairs_steps_per_second": 2.869,
1297
+ "step": 1920
1298
+ },
1299
+ {
1300
+ "epoch": 0.3008932769158439,
1301
+ "eval_mrpc_pairs_loss": 0.04984402656555176,
1302
+ "eval_mrpc_pairs_runtime": 0.2579,
1303
+ "eval_mrpc_pairs_samples_per_second": 387.725,
1304
+ "eval_mrpc_pairs_steps_per_second": 15.509,
1305
+ "step": 1920
1306
+ },
1307
+ {
1308
+ "epoch": 0.30590816486444133,
1309
+ "grad_norm": 22.65591812133789,
1310
+ "learning_rate": 2.796502337505742e-05,
1311
+ "loss": 0.7425,
1312
+ "step": 1952
1313
+ },
1314
+ {
1315
+ "epoch": 0.3109230528130387,
1316
+ "grad_norm": 10.119640350341797,
1317
+ "learning_rate": 2.78346533155958e-05,
1318
+ "loss": 0.7819,
1319
+ "step": 1984
1320
+ },
1321
+ {
1322
+ "epoch": 0.3159379407616361,
1323
+ "grad_norm": 8.690531730651855,
1324
+ "learning_rate": 2.770055955400161e-05,
1325
+ "loss": 0.5937,
1326
+ "step": 2016
1327
+ },
1328
+ {
1329
+ "epoch": 0.3209528287102335,
1330
+ "grad_norm": 0.8992699384689331,
1331
+ "learning_rate": 2.7562780994732476e-05,
1332
+ "loss": 0.8133,
1333
+ "step": 2048
1334
+ },
1335
+ {
1336
+ "epoch": 0.3259677166588309,
1337
+ "grad_norm": 10.619684219360352,
1338
+ "learning_rate": 2.7421357611311824e-05,
1339
+ "loss": 1.0674,
1340
+ "step": 2080
1341
+ },
1342
+ {
1343
+ "epoch": 0.3309826046074283,
1344
+ "grad_norm": 7.222084045410156,
1345
+ "learning_rate": 2.727633043473141e-05,
1346
+ "loss": 0.6288,
1347
+ "step": 2112
1348
+ },
1349
+ {
1350
+ "epoch": 0.3359974925560257,
1351
+ "grad_norm": 10.166888236999512,
1352
+ "learning_rate": 2.712774154154707e-05,
1353
+ "loss": 0.5866,
1354
+ "step": 2144
1355
+ },
1356
+ {
1357
+ "epoch": 0.3410123805046231,
1358
+ "grad_norm": 0.36360761523246765,
1359
+ "learning_rate": 2.6975634041671052e-05,
1360
+ "loss": 0.6962,
1361
+ "step": 2176
1362
+ },
1363
+ {
1364
+ "epoch": 0.3460272684532205,
1365
+ "grad_norm": 9.586665153503418,
1366
+ "learning_rate": 2.6820052065864665e-05,
1367
+ "loss": 0.5562,
1368
+ "step": 2208
1369
+ },
1370
+ {
1371
+ "epoch": 0.3510421564018179,
1372
+ "grad_norm": 1.1307642459869385,
1373
+ "learning_rate": 2.6661040752934594e-05,
1374
+ "loss": 0.8871,
1375
+ "step": 2240
1376
+ },
1377
+ {
1378
+ "epoch": 0.3510421564018179,
1379
+ "eval_nli-pairs_loss": 1.0147591829299927,
1380
+ "eval_nli-pairs_runtime": 3.7201,
1381
+ "eval_nli-pairs_samples_per_second": 26.881,
1382
+ "eval_nli-pairs_steps_per_second": 1.075,
1383
+ "eval_sts-test_pearson_cosine": 0.7872126529181761,
1384
+ "eval_sts-test_pearson_dot": 0.5062045289861089,
1385
+ "eval_sts-test_pearson_euclidean": 0.7351473988633473,
1386
+ "eval_sts-test_pearson_manhattan": 0.7310226402088944,
1387
+ "eval_sts-test_pearson_max": 0.7872126529181761,
1388
+ "eval_sts-test_spearman_cosine": 0.801487068999052,
1389
+ "eval_sts-test_spearman_dot": 0.4912205722904683,
1390
+ "eval_sts-test_spearman_euclidean": 0.7267262355024484,
1391
+ "eval_sts-test_spearman_manhattan": 0.72510169253649,
1392
+ "eval_sts-test_spearman_max": 0.801487068999052,
1393
+ "step": 2240
1394
+ },
1395
+ {
1396
+ "epoch": 0.3510421564018179,
1397
+ "eval_vitaminc-pairs_loss": 4.644638538360596,
1398
+ "eval_vitaminc-pairs_runtime": 1.1453,
1399
+ "eval_vitaminc-pairs_samples_per_second": 74.215,
1400
+ "eval_vitaminc-pairs_steps_per_second": 2.619,
1401
+ "step": 2240
1402
+ },
1403
+ {
1404
+ "epoch": 0.3510421564018179,
1405
+ "eval_sts-label_loss": 3.915343999862671,
1406
+ "eval_sts-label_runtime": 0.2807,
1407
+ "eval_sts-label_samples_per_second": 356.217,
1408
+ "eval_sts-label_steps_per_second": 14.249,
1409
+ "step": 2240
1410
+ },
1411
+ {
1412
+ "epoch": 0.3510421564018179,
1413
+ "eval_qnli-contrastive_loss": 0.11220741271972656,
1414
+ "eval_qnli-contrastive_runtime": 0.3614,
1415
+ "eval_qnli-contrastive_samples_per_second": 276.705,
1416
+ "eval_qnli-contrastive_steps_per_second": 11.068,
1417
+ "step": 2240
1418
+ },
1419
+ {
1420
+ "epoch": 0.3510421564018179,
1421
+ "eval_scitail-pairs-qa_loss": 0.06635177880525589,
1422
+ "eval_scitail-pairs-qa_runtime": 0.8881,
1423
+ "eval_scitail-pairs-qa_samples_per_second": 112.594,
1424
+ "eval_scitail-pairs-qa_steps_per_second": 4.504,
1425
+ "step": 2240
1426
+ },
1427
+ {
1428
+ "epoch": 0.3510421564018179,
1429
+ "eval_scitail-pairs-pos_loss": 0.5765587687492371,
1430
+ "eval_scitail-pairs-pos_runtime": 1.3496,
1431
+ "eval_scitail-pairs-pos_samples_per_second": 74.097,
1432
+ "eval_scitail-pairs-pos_steps_per_second": 2.964,
1433
+ "step": 2240
1434
+ },
1435
+ {
1436
+ "epoch": 0.3510421564018179,
1437
+ "eval_xsum-pairs_loss": 0.2595808804035187,
1438
+ "eval_xsum-pairs_runtime": 0.9377,
1439
+ "eval_xsum-pairs_samples_per_second": 106.641,
1440
+ "eval_xsum-pairs_steps_per_second": 4.266,
1441
+ "step": 2240
1442
+ },
1443
+ {
1444
+ "epoch": 0.3510421564018179,
1445
+ "eval_compression-pairs_loss": 0.0918564721941948,
1446
+ "eval_compression-pairs_runtime": 0.2755,
1447
+ "eval_compression-pairs_samples_per_second": 363.032,
1448
+ "eval_compression-pairs_steps_per_second": 14.521,
1449
+ "step": 2240
1450
+ },
1451
+ {
1452
+ "epoch": 0.3510421564018179,
1453
+ "eval_sciq_pairs_loss": 0.284303218126297,
1454
+ "eval_sciq_pairs_runtime": 4.1289,
1455
+ "eval_sciq_pairs_samples_per_second": 24.22,
1456
+ "eval_sciq_pairs_steps_per_second": 0.969,
1457
+ "step": 2240
1458
+ },
1459
+ {
1460
+ "epoch": 0.3510421564018179,
1461
+ "eval_qasc_pairs_loss": 0.19232892990112305,
1462
+ "eval_qasc_pairs_runtime": 1.0709,
1463
+ "eval_qasc_pairs_samples_per_second": 93.384,
1464
+ "eval_qasc_pairs_steps_per_second": 3.735,
1465
+ "step": 2240
1466
+ },
1467
+ {
1468
+ "epoch": 0.3510421564018179,
1469
+ "eval_openbookqa_pairs_loss": 1.6234371662139893,
1470
+ "eval_openbookqa_pairs_runtime": 0.9558,
1471
+ "eval_openbookqa_pairs_samples_per_second": 104.62,
1472
+ "eval_openbookqa_pairs_steps_per_second": 4.185,
1473
+ "step": 2240
1474
+ },
1475
+ {
1476
+ "epoch": 0.3510421564018179,
1477
+ "eval_msmarco_pairs_loss": 0.5325217247009277,
1478
+ "eval_msmarco_pairs_runtime": 2.0971,
1479
+ "eval_msmarco_pairs_samples_per_second": 47.685,
1480
+ "eval_msmarco_pairs_steps_per_second": 1.907,
1481
+ "step": 2240
1482
+ },
1483
+ {
1484
+ "epoch": 0.3510421564018179,
1485
+ "eval_nq_pairs_loss": 0.2721095681190491,
1486
+ "eval_nq_pairs_runtime": 4.5393,
1487
+ "eval_nq_pairs_samples_per_second": 22.03,
1488
+ "eval_nq_pairs_steps_per_second": 0.881,
1489
+ "step": 2240
1490
+ },
1491
+ {
1492
+ "epoch": 0.3510421564018179,
1493
+ "eval_trivia_pairs_loss": 0.8544899821281433,
1494
+ "eval_trivia_pairs_runtime": 6.4668,
1495
+ "eval_trivia_pairs_samples_per_second": 15.464,
1496
+ "eval_trivia_pairs_steps_per_second": 0.619,
1497
+ "step": 2240
1498
+ },
1499
+ {
1500
+ "epoch": 0.3510421564018179,
1501
+ "eval_quora_pairs_loss": 0.08441996574401855,
1502
+ "eval_quora_pairs_runtime": 0.6933,
1503
+ "eval_quora_pairs_samples_per_second": 144.233,
1504
+ "eval_quora_pairs_steps_per_second": 5.769,
1505
+ "step": 2240
1506
+ },
1507
+ {
1508
+ "epoch": 0.3510421564018179,
1509
+ "eval_gooaq_pairs_loss": 0.5711588859558105,
1510
+ "eval_gooaq_pairs_runtime": 1.3941,
1511
+ "eval_gooaq_pairs_samples_per_second": 71.733,
1512
+ "eval_gooaq_pairs_steps_per_second": 2.869,
1513
+ "step": 2240
1514
+ },
1515
+ {
1516
+ "epoch": 0.3510421564018179,
1517
+ "eval_mrpc_pairs_loss": 0.05093960464000702,
1518
+ "eval_mrpc_pairs_runtime": 0.2633,
1519
+ "eval_mrpc_pairs_samples_per_second": 379.777,
1520
+ "eval_mrpc_pairs_steps_per_second": 15.191,
1521
+ "step": 2240
1522
+ },
1523
+ {
1524
+ "epoch": 0.3560570443504153,
1525
+ "grad_norm": 0.39178094267845154,
1526
+ "learning_rate": 2.6498646236636892e-05,
1527
+ "loss": 0.6805,
1528
+ "step": 2272
1529
+ },
1530
+ {
1531
+ "epoch": 0.3610719322990127,
1532
+ "grad_norm": 7.91475248336792,
1533
+ "learning_rate": 2.6332915632292237e-05,
1534
+ "loss": 1.0451,
1535
+ "step": 2304
1536
+ },
1537
+ {
1538
+ "epoch": 0.3660868202476101,
1539
+ "grad_norm": 31.54157066345215,
1540
+ "learning_rate": 2.616389702311641e-05,
1541
+ "loss": 1.0603,
1542
+ "step": 2336
1543
+ },
1544
+ {
1545
+ "epoch": 0.37110170819620747,
1546
+ "grad_norm": 8.400779724121094,
1547
+ "learning_rate": 2.5991639446269964e-05,
1548
+ "loss": 0.8142,
1549
+ "step": 2368
1550
+ },
1551
+ {
1552
+ "epoch": 0.3761165961448049,
1553
+ "grad_norm": 20.99441146850586,
1554
+ "learning_rate": 2.5816192878631166e-05,
1555
+ "loss": 1.7211,
1556
+ "step": 2400
1557
+ },
1558
+ {
1559
+ "epoch": 0.3811314840934023,
1560
+ "grad_norm": 10.574430465698242,
1561
+ "learning_rate": 2.5637608222296237e-05,
1562
+ "loss": 0.7523,
1563
+ "step": 2432
1564
+ },
1565
+ {
1566
+ "epoch": 0.3861463720419997,
1567
+ "grad_norm": 0.8941424489021301,
1568
+ "learning_rate": 2.5455937289811207e-05,
1569
+ "loss": 0.8053,
1570
+ "step": 2464
1571
+ },
1572
+ {
1573
+ "epoch": 0.3911612599905971,
1574
+ "grad_norm": 1.9402281045913696,
1575
+ "learning_rate": 2.5271232789139587e-05,
1576
+ "loss": 0.8427,
1577
+ "step": 2496
1578
+ },
1579
+ {
1580
+ "epoch": 0.3961761479391945,
1581
+ "grad_norm": 23.42873764038086,
1582
+ "learning_rate": 2.5083548308370296e-05,
1583
+ "loss": 0.8204,
1584
+ "step": 2528
1585
+ }
1586
+ ],
1587
+ "logging_steps": 32,
1588
+ "max_steps": 12762,
1589
+ "num_input_tokens_seen": 0,
1590
+ "num_train_epochs": 2,
1591
+ "save_steps": 1277,
1592
+ "stateful_callbacks": {
1593
+ "TrainerControl": {
1594
+ "args": {
1595
+ "should_epoch_stop": false,
1596
+ "should_evaluate": false,
1597
+ "should_log": false,
1598
+ "should_save": true,
1599
+ "should_training_stop": false
1600
+ },
1601
+ "attributes": {}
1602
+ }
1603
+ },
1604
+ "total_flos": 0.0,
1605
+ "train_batch_size": 32,
1606
+ "trial_name": null,
1607
+ "trial_params": null
1608
+ }
checkpoint-2554/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:383c5bf6513da1dfbcd1294f0c8e85ce43118bc61e2de49d9b5d1e28eb653003
3
+ size 5624