bobox commited on
Commit
fb940b7
1 Parent(s): 136f2cc

Training in progress, step 2583, checkpoint

Browse files
checkpoint-2583/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
checkpoint-2583/README.md ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2583/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
checkpoint-2583/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bobox/DeBERTa-ST-AllLayers-v3-checkpoints-tmp",
3
+ "architectures": [
4
+ "DebertaV2Model"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "max_position_embeddings": 512,
14
+ "max_relative_positions": -1,
15
+ "model_type": "deberta-v2",
16
+ "norm_rel_ebd": "layer_norm",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "pooler_dropout": 0,
21
+ "pooler_hidden_act": "gelu",
22
+ "pooler_hidden_size": 768,
23
+ "pos_att_type": [
24
+ "p2c",
25
+ "c2p"
26
+ ],
27
+ "position_biased_input": false,
28
+ "position_buckets": 256,
29
+ "relative_attention": true,
30
+ "share_att_key": true,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.42.3",
33
+ "type_vocab_size": 0,
34
+ "vocab_size": 128100
35
+ }
checkpoint-2583/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.42.3",
5
+ "pytorch": "2.1.2"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
checkpoint-2583/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
checkpoint-2583/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cef23cef810d729473dfc310b062ec2558628aa13842938447dbde90308451d
3
+ size 1130520122
checkpoint-2583/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3b72ef6afaf46c7cf74ea6eb92f75f08bd12bfa59f3bfbb5dfa86d39fcbc8b7
3
+ size 565251810
checkpoint-2583/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6921899eecc47d3aac27605789b492ef2c8b6e3d28992a17cee235c65419a23c
3
+ size 14244
checkpoint-2583/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:365a3ccfc16189ae04609fef62a7eddb1b9b968074bfcfc76316e52c4656d98e
3
+ size 1064
checkpoint-2583/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
checkpoint-2583/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-2583/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-2583/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2583/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "mask_token": "[MASK]",
50
+ "max_length": 512,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "[PAD]",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "[SEP]",
57
+ "sp_model_kwargs": {},
58
+ "split_by_punct": false,
59
+ "stride": 0,
60
+ "tokenizer_class": "DebertaV2Tokenizer",
61
+ "truncation_side": "right",
62
+ "truncation_strategy": "longest_first",
63
+ "unk_token": "[UNK]",
64
+ "vocab_type": "spm"
65
+ }
checkpoint-2583/trainer_state.json ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.5,
5
+ "eval_steps": 1292,
6
+ "global_step": 2583,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.012582268679829655,
13
+ "grad_norm": 4.108283996582031,
14
+ "learning_rate": 3.7166085946573755e-07,
15
+ "loss": 0.4577,
16
+ "step": 65
17
+ },
18
+ {
19
+ "epoch": 0.02516453735965931,
20
+ "grad_norm": 0.5773646831512451,
21
+ "learning_rate": 7.491289198606272e-07,
22
+ "loss": 0.4707,
23
+ "step": 130
24
+ },
25
+ {
26
+ "epoch": 0.03774680603948897,
27
+ "grad_norm": 0.6127182841300964,
28
+ "learning_rate": 1.1265969802555168e-06,
29
+ "loss": 0.5259,
30
+ "step": 195
31
+ },
32
+ {
33
+ "epoch": 0.05032907471931862,
34
+ "grad_norm": 1.513021469116211,
35
+ "learning_rate": 1.5040650406504067e-06,
36
+ "loss": 0.5501,
37
+ "step": 260
38
+ },
39
+ {
40
+ "epoch": 0.06291134339914828,
41
+ "grad_norm": 11.750137329101562,
42
+ "learning_rate": 1.8815331010452962e-06,
43
+ "loss": 0.5089,
44
+ "step": 325
45
+ },
46
+ {
47
+ "epoch": 0.07549361207897794,
48
+ "grad_norm": 4.16257905960083,
49
+ "learning_rate": 2.259001161440186e-06,
50
+ "loss": 0.4816,
51
+ "step": 390
52
+ },
53
+ {
54
+ "epoch": 0.08807588075880758,
55
+ "grad_norm": 16.527780532836914,
56
+ "learning_rate": 2.6306620209059233e-06,
57
+ "loss": 0.5822,
58
+ "step": 455
59
+ },
60
+ {
61
+ "epoch": 0.10065814943863724,
62
+ "grad_norm": 0.1845797300338745,
63
+ "learning_rate": 3.0081300813008134e-06,
64
+ "loss": 0.5686,
65
+ "step": 520
66
+ },
67
+ {
68
+ "epoch": 0.1132404181184669,
69
+ "grad_norm": 12.172422409057617,
70
+ "learning_rate": 3.3855981416957026e-06,
71
+ "loss": 0.5686,
72
+ "step": 585
73
+ },
74
+ {
75
+ "epoch": 0.12582268679829656,
76
+ "grad_norm": 0.6270273923873901,
77
+ "learning_rate": 3.7630662020905923e-06,
78
+ "loss": 0.517,
79
+ "step": 650
80
+ },
81
+ {
82
+ "epoch": 0.1384049554781262,
83
+ "grad_norm": 3.6368539333343506,
84
+ "learning_rate": 4.140534262485482e-06,
85
+ "loss": 0.3615,
86
+ "step": 715
87
+ },
88
+ {
89
+ "epoch": 0.15098722415795587,
90
+ "grad_norm": 9.541145324707031,
91
+ "learning_rate": 4.518002322880372e-06,
92
+ "loss": 0.5978,
93
+ "step": 780
94
+ },
95
+ {
96
+ "epoch": 0.16356949283778552,
97
+ "grad_norm": 9.86439323425293,
98
+ "learning_rate": 4.895470383275261e-06,
99
+ "loss": 0.5153,
100
+ "step": 845
101
+ },
102
+ {
103
+ "epoch": 0.17615176151761516,
104
+ "grad_norm": 4.669048309326172,
105
+ "learning_rate": 5.272938443670151e-06,
106
+ "loss": 0.5059,
107
+ "step": 910
108
+ },
109
+ {
110
+ "epoch": 0.18873403019744484,
111
+ "grad_norm": 9.666926383972168,
112
+ "learning_rate": 5.650406504065041e-06,
113
+ "loss": 0.5624,
114
+ "step": 975
115
+ },
116
+ {
117
+ "epoch": 0.20131629887727448,
118
+ "grad_norm": 6.078874588012695,
119
+ "learning_rate": 6.02787456445993e-06,
120
+ "loss": 0.5201,
121
+ "step": 1040
122
+ },
123
+ {
124
+ "epoch": 0.21389856755710415,
125
+ "grad_norm": 1.1067451238632202,
126
+ "learning_rate": 6.4053426248548205e-06,
127
+ "loss": 0.6127,
128
+ "step": 1105
129
+ },
130
+ {
131
+ "epoch": 0.2264808362369338,
132
+ "grad_norm": 1.1589373350143433,
133
+ "learning_rate": 6.78281068524971e-06,
134
+ "loss": 0.5333,
135
+ "step": 1170
136
+ },
137
+ {
138
+ "epoch": 0.23906310491676344,
139
+ "grad_norm": 1.977501630783081,
140
+ "learning_rate": 7.1602787456446e-06,
141
+ "loss": 0.494,
142
+ "step": 1235
143
+ },
144
+ {
145
+ "epoch": 0.2500967866821525,
146
+ "eval_StS-test_pearson_cosine": 0.8821101738384596,
147
+ "eval_StS-test_pearson_dot": 0.8032893366124795,
148
+ "eval_StS-test_pearson_euclidean": 0.8697205121607111,
149
+ "eval_StS-test_pearson_manhattan": 0.8704995590187196,
150
+ "eval_StS-test_pearson_max": 0.8821101738384596,
151
+ "eval_StS-test_spearman_cosine": 0.8943047751560564,
152
+ "eval_StS-test_spearman_dot": 0.8087424893555902,
153
+ "eval_StS-test_spearman_euclidean": 0.871583089708652,
154
+ "eval_StS-test_spearman_manhattan": 0.8737012027236009,
155
+ "eval_StS-test_spearman_max": 0.8943047751560564,
156
+ "eval_Vitaminc-test_cosine_accuracy": 0.5684210526315789,
157
+ "eval_Vitaminc-test_cosine_accuracy_threshold": 0.7028586268424988,
158
+ "eval_Vitaminc-test_cosine_ap": 0.5651043866206488,
159
+ "eval_Vitaminc-test_cosine_f1": 0.6755218216318786,
160
+ "eval_Vitaminc-test_cosine_f1_threshold": 0.5077509880065918,
161
+ "eval_Vitaminc-test_cosine_precision": 0.52046783625731,
162
+ "eval_Vitaminc-test_cosine_recall": 0.9621621621621622,
163
+ "eval_Vitaminc-test_dot_accuracy": 0.5684210526315789,
164
+ "eval_Vitaminc-test_dot_accuracy_threshold": 19.693286895751953,
165
+ "eval_Vitaminc-test_dot_ap": 0.5463931769790206,
166
+ "eval_Vitaminc-test_dot_f1": 0.6691449814126395,
167
+ "eval_Vitaminc-test_dot_f1_threshold": 13.839346885681152,
168
+ "eval_Vitaminc-test_dot_precision": 0.509915014164306,
169
+ "eval_Vitaminc-test_dot_recall": 0.972972972972973,
170
+ "eval_Vitaminc-test_euclidean_accuracy": 0.5894736842105263,
171
+ "eval_Vitaminc-test_euclidean_accuracy_threshold": 4.252468585968018,
172
+ "eval_Vitaminc-test_euclidean_ap": 0.5569049511912931,
173
+ "eval_Vitaminc-test_euclidean_f1": 0.6666666666666666,
174
+ "eval_Vitaminc-test_euclidean_f1_threshold": 6.922356128692627,
175
+ "eval_Vitaminc-test_euclidean_precision": 0.5041551246537396,
176
+ "eval_Vitaminc-test_euclidean_recall": 0.9837837837837838,
177
+ "eval_Vitaminc-test_manhattan_accuracy": 0.5815789473684211,
178
+ "eval_Vitaminc-test_manhattan_accuracy_threshold": 87.21337890625,
179
+ "eval_Vitaminc-test_manhattan_ap": 0.5572154085134091,
180
+ "eval_Vitaminc-test_manhattan_f1": 0.6666666666666667,
181
+ "eval_Vitaminc-test_manhattan_f1_threshold": 141.26380920410156,
182
+ "eval_Vitaminc-test_manhattan_precision": 0.505586592178771,
183
+ "eval_Vitaminc-test_manhattan_recall": 0.9783783783783784,
184
+ "eval_Vitaminc-test_max_accuracy": 0.5894736842105263,
185
+ "eval_Vitaminc-test_max_accuracy_threshold": 87.21337890625,
186
+ "eval_Vitaminc-test_max_ap": 0.5651043866206488,
187
+ "eval_Vitaminc-test_max_f1": 0.6755218216318786,
188
+ "eval_Vitaminc-test_max_f1_threshold": 141.26380920410156,
189
+ "eval_Vitaminc-test_max_precision": 0.52046783625731,
190
+ "eval_Vitaminc-test_max_recall": 0.9837837837837838,
191
+ "eval_mrpc-test_cosine_accuracy": 0.7473684210526316,
192
+ "eval_mrpc-test_cosine_accuracy_threshold": 0.7145693302154541,
193
+ "eval_mrpc-test_cosine_ap": 0.8563235829800693,
194
+ "eval_mrpc-test_cosine_f1": 0.8327645051194539,
195
+ "eval_mrpc-test_cosine_f1_threshold": 0.6522408723831177,
196
+ "eval_mrpc-test_cosine_precision": 0.7218934911242604,
197
+ "eval_mrpc-test_cosine_recall": 0.9838709677419355,
198
+ "eval_mrpc-test_dot_accuracy": 0.7026315789473684,
199
+ "eval_mrpc-test_dot_accuracy_threshold": 14.454626083374023,
200
+ "eval_mrpc-test_dot_ap": 0.796363256728503,
201
+ "eval_mrpc-test_dot_f1": 0.8054607508532423,
202
+ "eval_mrpc-test_dot_f1_threshold": 13.752894401550293,
203
+ "eval_mrpc-test_dot_precision": 0.6982248520710059,
204
+ "eval_mrpc-test_dot_recall": 0.9516129032258065,
205
+ "eval_mrpc-test_euclidean_accuracy": 0.7315789473684211,
206
+ "eval_mrpc-test_euclidean_accuracy_threshold": 3.890326499938965,
207
+ "eval_mrpc-test_euclidean_ap": 0.8252367395643119,
208
+ "eval_mrpc-test_euclidean_f1": 0.8165467625899281,
209
+ "eval_mrpc-test_euclidean_f1_threshold": 3.890326499938965,
210
+ "eval_mrpc-test_euclidean_precision": 0.737012987012987,
211
+ "eval_mrpc-test_euclidean_recall": 0.9153225806451613,
212
+ "eval_mrpc-test_manhattan_accuracy": 0.7289473684210527,
213
+ "eval_mrpc-test_manhattan_accuracy_threshold": 77.57926177978516,
214
+ "eval_mrpc-test_manhattan_ap": 0.8208816982117964,
215
+ "eval_mrpc-test_manhattan_f1": 0.815742397137746,
216
+ "eval_mrpc-test_manhattan_f1_threshold": 79.14703369140625,
217
+ "eval_mrpc-test_manhattan_precision": 0.7331189710610932,
218
+ "eval_mrpc-test_manhattan_recall": 0.9193548387096774,
219
+ "eval_mrpc-test_max_accuracy": 0.7473684210526316,
220
+ "eval_mrpc-test_max_accuracy_threshold": 77.57926177978516,
221
+ "eval_mrpc-test_max_ap": 0.8563235829800693,
222
+ "eval_mrpc-test_max_f1": 0.8327645051194539,
223
+ "eval_mrpc-test_max_f1_threshold": 79.14703369140625,
224
+ "eval_mrpc-test_max_precision": 0.737012987012987,
225
+ "eval_mrpc-test_max_recall": 0.9838709677419355,
226
+ "eval_nli-pairs_loss": 0.8093397617340088,
227
+ "eval_nli-pairs_runtime": 3.0363,
228
+ "eval_nli-pairs_samples_per_second": 52.696,
229
+ "eval_nli-pairs_steps_per_second": 1.647,
230
+ "eval_sequential_score": 0.5651043866206488,
231
+ "step": 1292
232
+ },
233
+ {
234
+ "epoch": 0.2500967866821525,
235
+ "eval_vitaminc-pairs_loss": 5.769770622253418,
236
+ "eval_vitaminc-pairs_runtime": 1.5488,
237
+ "eval_vitaminc-pairs_samples_per_second": 85.875,
238
+ "eval_vitaminc-pairs_steps_per_second": 3.228,
239
+ "step": 1292
240
+ },
241
+ {
242
+ "epoch": 0.2500967866821525,
243
+ "eval_qnli-contrastive_loss": 0.12109158933162689,
244
+ "eval_qnli-contrastive_runtime": 0.5097,
245
+ "eval_qnli-contrastive_samples_per_second": 313.889,
246
+ "eval_qnli-contrastive_steps_per_second": 9.809,
247
+ "step": 1292
248
+ },
249
+ {
250
+ "epoch": 0.2500967866821525,
251
+ "eval_scitail-pairs-qa_loss": 0.07553695887327194,
252
+ "eval_scitail-pairs-qa_runtime": 1.2071,
253
+ "eval_scitail-pairs-qa_samples_per_second": 132.548,
254
+ "eval_scitail-pairs-qa_steps_per_second": 4.142,
255
+ "step": 1292
256
+ },
257
+ {
258
+ "epoch": 0.2500967866821525,
259
+ "eval_scitail-pairs-pos_loss": 0.3979075253009796,
260
+ "eval_scitail-pairs-pos_runtime": 2.3649,
261
+ "eval_scitail-pairs-pos_samples_per_second": 67.656,
262
+ "eval_scitail-pairs-pos_steps_per_second": 2.114,
263
+ "step": 1292
264
+ },
265
+ {
266
+ "epoch": 0.2500967866821525,
267
+ "eval_xsum-pairs_loss": 0.313429057598114,
268
+ "eval_xsum-pairs_runtime": 1.4107,
269
+ "eval_xsum-pairs_samples_per_second": 113.419,
270
+ "eval_xsum-pairs_steps_per_second": 3.544,
271
+ "step": 1292
272
+ },
273
+ {
274
+ "epoch": 0.2500967866821525,
275
+ "eval_compression-pairs_loss": 0.08316509425640106,
276
+ "eval_compression-pairs_runtime": 0.3958,
277
+ "eval_compression-pairs_samples_per_second": 404.289,
278
+ "eval_compression-pairs_steps_per_second": 12.634,
279
+ "step": 1292
280
+ },
281
+ {
282
+ "epoch": 0.2500967866821525,
283
+ "eval_sciq_pairs_loss": 0.2692818343639374,
284
+ "eval_sciq_pairs_runtime": 7.8991,
285
+ "eval_sciq_pairs_samples_per_second": 20.255,
286
+ "eval_sciq_pairs_steps_per_second": 0.633,
287
+ "step": 1292
288
+ },
289
+ {
290
+ "epoch": 0.2500967866821525,
291
+ "eval_qasc_pairs_loss": 0.19870159029960632,
292
+ "eval_qasc_pairs_runtime": 1.4336,
293
+ "eval_qasc_pairs_samples_per_second": 111.608,
294
+ "eval_qasc_pairs_steps_per_second": 3.488,
295
+ "step": 1292
296
+ },
297
+ {
298
+ "epoch": 0.2500967866821525,
299
+ "eval_qasc_facts_sym_loss": 0.16445104777812958,
300
+ "eval_qasc_facts_sym_runtime": 0.3196,
301
+ "eval_qasc_facts_sym_samples_per_second": 500.598,
302
+ "eval_qasc_facts_sym_steps_per_second": 15.644,
303
+ "step": 1292
304
+ },
305
+ {
306
+ "epoch": 0.2500967866821525,
307
+ "eval_openbookqa_pairs_loss": 1.7182375192642212,
308
+ "eval_openbookqa_pairs_runtime": 1.2252,
309
+ "eval_openbookqa_pairs_samples_per_second": 130.592,
310
+ "eval_openbookqa_pairs_steps_per_second": 4.081,
311
+ "step": 1292
312
+ },
313
+ {
314
+ "epoch": 0.2500967866821525,
315
+ "eval_msmarco_pairs_loss": 0.4961338937282562,
316
+ "eval_msmarco_pairs_runtime": 3.1144,
317
+ "eval_msmarco_pairs_samples_per_second": 51.374,
318
+ "eval_msmarco_pairs_steps_per_second": 1.605,
319
+ "step": 1292
320
+ },
321
+ {
322
+ "epoch": 0.2500967866821525,
323
+ "eval_nq_pairs_loss": 0.4005078673362732,
324
+ "eval_nq_pairs_runtime": 7.7074,
325
+ "eval_nq_pairs_samples_per_second": 20.759,
326
+ "eval_nq_pairs_steps_per_second": 0.649,
327
+ "step": 1292
328
+ },
329
+ {
330
+ "epoch": 0.2500967866821525,
331
+ "eval_trivia_pairs_loss": 0.654505729675293,
332
+ "eval_trivia_pairs_runtime": 10.1383,
333
+ "eval_trivia_pairs_samples_per_second": 15.782,
334
+ "eval_trivia_pairs_steps_per_second": 0.493,
335
+ "step": 1292
336
+ },
337
+ {
338
+ "epoch": 0.2500967866821525,
339
+ "eval_quora_pairs_loss": 0.22833283245563507,
340
+ "eval_quora_pairs_runtime": 3.894,
341
+ "eval_quora_pairs_samples_per_second": 173.342,
342
+ "eval_quora_pairs_steps_per_second": 5.65,
343
+ "step": 1292
344
+ },
345
+ {
346
+ "epoch": 0.2500967866821525,
347
+ "eval_gooaq_pairs_loss": 0.4996432662010193,
348
+ "eval_gooaq_pairs_runtime": 2.2697,
349
+ "eval_gooaq_pairs_samples_per_second": 70.494,
350
+ "eval_gooaq_pairs_steps_per_second": 2.203,
351
+ "step": 1292
352
+ },
353
+ {
354
+ "epoch": 0.2500967866821525,
355
+ "eval_mrpc_pairs_loss": 0.04613902047276497,
356
+ "eval_mrpc_pairs_runtime": 0.3743,
357
+ "eval_mrpc_pairs_samples_per_second": 427.427,
358
+ "eval_mrpc_pairs_steps_per_second": 13.357,
359
+ "step": 1292
360
+ },
361
+ {
362
+ "epoch": 0.2516453735965931,
363
+ "grad_norm": 12.50217342376709,
364
+ "learning_rate": 7.53774680603949e-06,
365
+ "loss": 0.6236,
366
+ "step": 1300
367
+ },
368
+ {
369
+ "epoch": 0.26422764227642276,
370
+ "grad_norm": 1.8397300243377686,
371
+ "learning_rate": 7.91521486643438e-06,
372
+ "loss": 0.4947,
373
+ "step": 1365
374
+ },
375
+ {
376
+ "epoch": 0.2768099109562524,
377
+ "grad_norm": 5.885033130645752,
378
+ "learning_rate": 8.292682926829268e-06,
379
+ "loss": 0.5595,
380
+ "step": 1430
381
+ },
382
+ {
383
+ "epoch": 0.28939217963608205,
384
+ "grad_norm": 1.7783002853393555,
385
+ "learning_rate": 8.670150987224158e-06,
386
+ "loss": 0.641,
387
+ "step": 1495
388
+ },
389
+ {
390
+ "epoch": 0.30197444831591175,
391
+ "grad_norm": 5.305712699890137,
392
+ "learning_rate": 9.047619047619047e-06,
393
+ "loss": 0.5188,
394
+ "step": 1560
395
+ },
396
+ {
397
+ "epoch": 0.3145567169957414,
398
+ "grad_norm": 8.24319839477539,
399
+ "learning_rate": 9.425087108013936e-06,
400
+ "loss": 0.4927,
401
+ "step": 1625
402
+ },
403
+ {
404
+ "epoch": 0.32713898567557104,
405
+ "grad_norm": 11.07426929473877,
406
+ "learning_rate": 9.802555168408827e-06,
407
+ "loss": 0.657,
408
+ "step": 1690
409
+ },
410
+ {
411
+ "epoch": 0.3397212543554007,
412
+ "grad_norm": 9.04263687133789,
413
+ "learning_rate": 1.0180023228803716e-05,
414
+ "loss": 0.4665,
415
+ "step": 1755
416
+ },
417
+ {
418
+ "epoch": 0.3523035230352303,
419
+ "grad_norm": 1.4980370998382568,
420
+ "learning_rate": 1.0557491289198606e-05,
421
+ "loss": 0.4645,
422
+ "step": 1820
423
+ },
424
+ {
425
+ "epoch": 0.36488579171506,
426
+ "grad_norm": 16.511180877685547,
427
+ "learning_rate": 1.0934959349593495e-05,
428
+ "loss": 0.5887,
429
+ "step": 1885
430
+ },
431
+ {
432
+ "epoch": 0.37746806039488967,
433
+ "grad_norm": 5.706000804901123,
434
+ "learning_rate": 1.1312427409988386e-05,
435
+ "loss": 0.5308,
436
+ "step": 1950
437
+ },
438
+ {
439
+ "epoch": 0.3900503290747193,
440
+ "grad_norm": 1.0923340320587158,
441
+ "learning_rate": 1.1689895470383277e-05,
442
+ "loss": 0.536,
443
+ "step": 2015
444
+ },
445
+ {
446
+ "epoch": 0.40263259775454896,
447
+ "grad_norm": 6.521665573120117,
448
+ "learning_rate": 1.2067363530778166e-05,
449
+ "loss": 0.4841,
450
+ "step": 2080
451
+ },
452
+ {
453
+ "epoch": 0.4152148664343786,
454
+ "grad_norm": 7.254842758178711,
455
+ "learning_rate": 1.2444831591173055e-05,
456
+ "loss": 0.6499,
457
+ "step": 2145
458
+ },
459
+ {
460
+ "epoch": 0.4277971351142083,
461
+ "grad_norm": 14.938628196716309,
462
+ "learning_rate": 1.2822299651567945e-05,
463
+ "loss": 0.5982,
464
+ "step": 2210
465
+ },
466
+ {
467
+ "epoch": 0.44037940379403795,
468
+ "grad_norm": 10.192171096801758,
469
+ "learning_rate": 1.3199767711962834e-05,
470
+ "loss": 0.5281,
471
+ "step": 2275
472
+ },
473
+ {
474
+ "epoch": 0.4529616724738676,
475
+ "grad_norm": 8.02379035949707,
476
+ "learning_rate": 1.3577235772357725e-05,
477
+ "loss": 0.6657,
478
+ "step": 2340
479
+ },
480
+ {
481
+ "epoch": 0.46554394115369724,
482
+ "grad_norm": 2.842752695083618,
483
+ "learning_rate": 1.3954703832752614e-05,
484
+ "loss": 0.5746,
485
+ "step": 2405
486
+ },
487
+ {
488
+ "epoch": 0.4781262098335269,
489
+ "grad_norm": 0.4373825788497925,
490
+ "learning_rate": 1.432636469221835e-05,
491
+ "loss": 0.5853,
492
+ "step": 2470
493
+ },
494
+ {
495
+ "epoch": 0.4907084785133566,
496
+ "grad_norm": 16.475561141967773,
497
+ "learning_rate": 1.4703832752613242e-05,
498
+ "loss": 0.5828,
499
+ "step": 2535
500
+ }
501
+ ],
502
+ "logging_steps": 65,
503
+ "max_steps": 25830,
504
+ "num_input_tokens_seen": 0,
505
+ "num_train_epochs": 5,
506
+ "save_steps": 2583,
507
+ "stateful_callbacks": {
508
+ "TrainerControl": {
509
+ "args": {
510
+ "should_epoch_stop": false,
511
+ "should_evaluate": false,
512
+ "should_log": false,
513
+ "should_save": true,
514
+ "should_training_stop": false
515
+ },
516
+ "attributes": {}
517
+ }
518
+ },
519
+ "total_flos": 0.0,
520
+ "train_batch_size": 32,
521
+ "trial_name": null,
522
+ "trial_params": null
523
+ }
checkpoint-2583/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2519118bce6ce43cd8ca9ca2ad2ad4642dfcfe5f9a3c6dd2eeb5a81e8223693e
3
+ size 5624