du33169 committed on
Commit
5205507
·
verified ·
1 Parent(s): 92edf91

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ base_model: FacebookAI/roberta-large
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - glue
9
+ metrics:
10
+ - accuracy
11
+ - f1
12
+ model-index:
13
+ - name: QQP
14
+ results:
15
+ - task:
16
+ name: Text Classification
17
+ type: text-classification
18
+ dataset:
19
+ name: GLUE QQP
20
+ type: glue
21
+ args: qqp
22
+ metrics:
23
+ - name: Accuracy
24
+ type: accuracy
25
+ value: 0.9207024486767252
26
+ - name: F1
27
+ type: f1
28
+ value: 0.8944283456269757
29
+ ---
30
+
31
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
32
+ should probably proofread and complete it, then remove this comment. -->
33
+
34
+ # QQP
35
+
36
+ This model is a fine-tuned version of [FacebookAI/roberta-large](https://huggingface.co/FacebookAI/roberta-large) on the GLUE QQP dataset.
37
+ It achieves the following results on the evaluation set:
38
+ - Loss: 0.2704
39
+ - Accuracy: 0.9207
40
+ - F1: 0.8944
41
+ - Combined Score: 0.9076
42
+
43
+ ## Model description
44
+
45
+ More information needed
46
+
47
+ ## Intended uses & limitations
48
+
49
+ More information needed
50
+
51
+ ## Training and evaluation data
52
+
53
+ More information needed
54
+
55
+ ## Training procedure
56
+
57
+ ### Training hyperparameters
58
+
59
+ The following hyperparameters were used during training:
60
+ - learning_rate: 2e-05
61
+ - train_batch_size: 64
62
+ - eval_batch_size: 8
63
+ - seed: 42
64
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
65
+ - lr_scheduler_type: linear
66
+ - num_epochs: 6.0
67
+
68
+ ### Training results
69
+
70
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 | Combined Score |
71
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|:------:|:--------------:|
72
+ | 0.2392 | 1.0 | 5686 | 0.2472 | 0.9049 | 0.8685 | 0.8867 |
73
+ | 0.1884 | 2.0 | 11372 | 0.2308 | 0.9089 | 0.8832 | 0.8960 |
74
+ | 0.1351 | 3.0 | 17058 | 0.2317 | 0.9180 | 0.8894 | 0.9037 |
75
+ | 0.1051 | 4.0 | 22744 | 0.2704 | 0.9207 | 0.8944 | 0.9076 |
76
+ | 0.0794 | 5.0 | 28430 | 0.3272 | 0.9194 | 0.8911 | 0.9053 |
77
+ | 0.0564 | 6.0 | 34116 | 0.3516 | 0.9207 | 0.8940 | 0.9073 |
78
+
79
+
80
+ ### Framework versions
81
+
82
+ - Transformers 4.43.3
83
+ - Pytorch 1.11.0+cu113
84
+ - Datasets 2.20.0
85
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "eval_accuracy": 0.9207024486767252,
4
+ "eval_combined_score": 0.9075653971518505,
5
+ "eval_f1": 0.8944283456269757,
6
+ "eval_loss": 0.27043241262435913,
7
+ "eval_runtime": 162.4349,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 248.9,
10
+ "eval_steps_per_second": 31.114,
11
+ "train_loss": 0.14204222600531893,
12
+ "train_runtime": 24723.4908,
13
+ "train_samples": 363846,
14
+ "train_samples_per_second": 88.3,
15
+ "train_steps_per_second": 1.38
16
+ }
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "FacebookAI/roberta-large",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "finetuning_task": "qqp",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 4096,
16
+ "label2id": {
17
+ "duplicate": 1,
18
+ "not_duplicate": 0
19
+ },
20
+ "layer_norm_eps": 1e-05,
21
+ "max_position_embeddings": 514,
22
+ "model_type": "roberta",
23
+ "num_attention_heads": 16,
24
+ "num_hidden_layers": 24,
25
+ "pad_token_id": 1,
26
+ "position_embedding_type": "absolute",
27
+ "problem_type": "single_label_classification",
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.43.3",
30
+ "type_vocab_size": 1,
31
+ "use_cache": true,
32
+ "vocab_size": 50265
33
+ }
eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "eval_accuracy": 0.9207024486767252,
4
+ "eval_combined_score": 0.9075653971518505,
5
+ "eval_f1": 0.8944283456269757,
6
+ "eval_loss": 0.27043241262435913,
7
+ "eval_runtime": 162.4349,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 248.9,
10
+ "eval_steps_per_second": 31.114
11
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d4688592e6697d3bf7e89fa6817db5d65433107fbde52eadfb724f88c6a0e64
3
+ size 1421495416
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "train_loss": 0.14204222600531893,
4
+ "train_runtime": 24723.4908,
5
+ "train_samples": 363846,
6
+ "train_samples_per_second": 88.3,
7
+ "train_steps_per_second": 1.38
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9075653971518505,
3
+ "best_model_checkpoint": "output/fine_tuned/roberta-large/QQP/checkpoint-22744",
4
+ "epoch": 6.0,
5
+ "eval_steps": 500,
6
+ "global_step": 34116,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08793527963418923,
13
+ "grad_norm": 8.288018226623535,
14
+ "learning_rate": 1.970688240121937e-05,
15
+ "loss": 0.3997,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.17587055926837847,
20
+ "grad_norm": 4.3138747215271,
21
+ "learning_rate": 1.9413764802438738e-05,
22
+ "loss": 0.3023,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.2638058389025677,
27
+ "grad_norm": 6.374757289886475,
28
+ "learning_rate": 1.9120647203658108e-05,
29
+ "loss": 0.2839,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.35174111853675694,
34
+ "grad_norm": 6.788768768310547,
35
+ "learning_rate": 1.8827529604877478e-05,
36
+ "loss": 0.2854,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.43967639817094617,
41
+ "grad_norm": 16.165164947509766,
42
+ "learning_rate": 1.8534412006096848e-05,
43
+ "loss": 0.2674,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.5276116778051354,
48
+ "grad_norm": 8.882967948913574,
49
+ "learning_rate": 1.8241294407316218e-05,
50
+ "loss": 0.2647,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.6155469574393246,
55
+ "grad_norm": 7.7459869384765625,
56
+ "learning_rate": 1.7948176808535588e-05,
57
+ "loss": 0.2544,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.7034822370735139,
62
+ "grad_norm": 7.243208885192871,
63
+ "learning_rate": 1.7655059209754954e-05,
64
+ "loss": 0.2541,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.7914175167077031,
69
+ "grad_norm": 12.087061882019043,
70
+ "learning_rate": 1.7361941610974324e-05,
71
+ "loss": 0.2485,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.8793527963418923,
76
+ "grad_norm": 5.829019546508789,
77
+ "learning_rate": 1.7068824012193694e-05,
78
+ "loss": 0.2484,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.9672880759760816,
83
+ "grad_norm": 8.462845802307129,
84
+ "learning_rate": 1.6775706413413064e-05,
85
+ "loss": 0.2392,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 1.0,
90
+ "eval_accuracy": 0.9049220875587435,
91
+ "eval_combined_score": 0.8867021824135581,
92
+ "eval_f1": 0.8684822772683727,
93
+ "eval_loss": 0.24716497957706451,
94
+ "eval_runtime": 163.5025,
95
+ "eval_samples_per_second": 247.274,
96
+ "eval_steps_per_second": 30.911,
97
+ "step": 5686
98
+ },
99
+ {
100
+ "epoch": 1.0552233556102708,
101
+ "grad_norm": 6.706808090209961,
102
+ "learning_rate": 1.6482588814632434e-05,
103
+ "loss": 0.2102,
104
+ "step": 6000
105
+ },
106
+ {
107
+ "epoch": 1.14315863524446,
108
+ "grad_norm": 5.22786808013916,
109
+ "learning_rate": 1.6189471215851803e-05,
110
+ "loss": 0.1941,
111
+ "step": 6500
112
+ },
113
+ {
114
+ "epoch": 1.2310939148786493,
115
+ "grad_norm": 6.846224784851074,
116
+ "learning_rate": 1.589635361707117e-05,
117
+ "loss": 0.1881,
118
+ "step": 7000
119
+ },
120
+ {
121
+ "epoch": 1.3190291945128385,
122
+ "grad_norm": 7.269466876983643,
123
+ "learning_rate": 1.560323601829054e-05,
124
+ "loss": 0.183,
125
+ "step": 7500
126
+ },
127
+ {
128
+ "epoch": 1.4069644741470277,
129
+ "grad_norm": 5.1561665534973145,
130
+ "learning_rate": 1.531011841950991e-05,
131
+ "loss": 0.1914,
132
+ "step": 8000
133
+ },
134
+ {
135
+ "epoch": 1.494899753781217,
136
+ "grad_norm": 8.569211959838867,
137
+ "learning_rate": 1.5017000820729278e-05,
138
+ "loss": 0.1881,
139
+ "step": 8500
140
+ },
141
+ {
142
+ "epoch": 1.5828350334154062,
143
+ "grad_norm": 7.138071060180664,
144
+ "learning_rate": 1.4723883221948648e-05,
145
+ "loss": 0.187,
146
+ "step": 9000
147
+ },
148
+ {
149
+ "epoch": 1.6707703130495954,
150
+ "grad_norm": 5.725489139556885,
151
+ "learning_rate": 1.4430765623168018e-05,
152
+ "loss": 0.1905,
153
+ "step": 9500
154
+ },
155
+ {
156
+ "epoch": 1.7587055926837847,
157
+ "grad_norm": 5.3855180740356445,
158
+ "learning_rate": 1.4137648024387384e-05,
159
+ "loss": 0.1859,
160
+ "step": 10000
161
+ },
162
+ {
163
+ "epoch": 1.8466408723179741,
164
+ "grad_norm": 6.059995651245117,
165
+ "learning_rate": 1.3844530425606754e-05,
166
+ "loss": 0.186,
167
+ "step": 10500
168
+ },
169
+ {
170
+ "epoch": 1.9345761519521631,
171
+ "grad_norm": 7.908290863037109,
172
+ "learning_rate": 1.3551412826826124e-05,
173
+ "loss": 0.1884,
174
+ "step": 11000
175
+ },
176
+ {
177
+ "epoch": 2.0,
178
+ "eval_accuracy": 0.908904279000742,
179
+ "eval_combined_score": 0.896049297852469,
180
+ "eval_f1": 0.8831943167041959,
181
+ "eval_loss": 0.23078934848308563,
182
+ "eval_runtime": 163.6895,
183
+ "eval_samples_per_second": 246.992,
184
+ "eval_steps_per_second": 30.876,
185
+ "step": 11372
186
+ },
187
+ {
188
+ "epoch": 2.0225114315863526,
189
+ "grad_norm": 7.034117698669434,
190
+ "learning_rate": 1.3258295228045492e-05,
191
+ "loss": 0.1764,
192
+ "step": 11500
193
+ },
194
+ {
195
+ "epoch": 2.1104467112205416,
196
+ "grad_norm": 6.000309467315674,
197
+ "learning_rate": 1.2965177629264862e-05,
198
+ "loss": 0.1409,
199
+ "step": 12000
200
+ },
201
+ {
202
+ "epoch": 2.198381990854731,
203
+ "grad_norm": 5.445409297943115,
204
+ "learning_rate": 1.2672060030484232e-05,
205
+ "loss": 0.1458,
206
+ "step": 12500
207
+ },
208
+ {
209
+ "epoch": 2.28631727048892,
210
+ "grad_norm": 11.053699493408203,
211
+ "learning_rate": 1.23789424317036e-05,
212
+ "loss": 0.1408,
213
+ "step": 13000
214
+ },
215
+ {
216
+ "epoch": 2.3742525501231095,
217
+ "grad_norm": 12.287221908569336,
218
+ "learning_rate": 1.208582483292297e-05,
219
+ "loss": 0.1465,
220
+ "step": 13500
221
+ },
222
+ {
223
+ "epoch": 2.4621878297572986,
224
+ "grad_norm": 2.536963939666748,
225
+ "learning_rate": 1.179270723414234e-05,
226
+ "loss": 0.1398,
227
+ "step": 14000
228
+ },
229
+ {
230
+ "epoch": 2.550123109391488,
231
+ "grad_norm": 3.656428813934326,
232
+ "learning_rate": 1.1499589635361708e-05,
233
+ "loss": 0.1391,
234
+ "step": 14500
235
+ },
236
+ {
237
+ "epoch": 2.638058389025677,
238
+ "grad_norm": 4.2873053550720215,
239
+ "learning_rate": 1.1206472036581078e-05,
240
+ "loss": 0.1407,
241
+ "step": 15000
242
+ },
243
+ {
244
+ "epoch": 2.7259936686598665,
245
+ "grad_norm": 8.166319847106934,
246
+ "learning_rate": 1.0913354437800448e-05,
247
+ "loss": 0.1401,
248
+ "step": 15500
249
+ },
250
+ {
251
+ "epoch": 2.8139289482940555,
252
+ "grad_norm": 11.278782844543457,
253
+ "learning_rate": 1.0620236839019815e-05,
254
+ "loss": 0.1435,
255
+ "step": 16000
256
+ },
257
+ {
258
+ "epoch": 2.901864227928245,
259
+ "grad_norm": 9.329524993896484,
260
+ "learning_rate": 1.0327119240239184e-05,
261
+ "loss": 0.1405,
262
+ "step": 16500
263
+ },
264
+ {
265
+ "epoch": 2.989799507562434,
266
+ "grad_norm": 7.3363356590271,
267
+ "learning_rate": 1.0034001641458554e-05,
268
+ "loss": 0.1351,
269
+ "step": 17000
270
+ },
271
+ {
272
+ "epoch": 3.0,
273
+ "eval_accuracy": 0.9179569626514964,
274
+ "eval_combined_score": 0.9036969907087687,
275
+ "eval_f1": 0.8894370187660411,
276
+ "eval_loss": 0.23165978491306305,
277
+ "eval_runtime": 163.4119,
278
+ "eval_samples_per_second": 247.412,
279
+ "eval_steps_per_second": 30.928,
280
+ "step": 17058
281
+ },
282
+ {
283
+ "epoch": 3.0777347871966234,
284
+ "grad_norm": 3.7927825450897217,
285
+ "learning_rate": 9.740884042677923e-06,
286
+ "loss": 0.1066,
287
+ "step": 17500
288
+ },
289
+ {
290
+ "epoch": 3.1656700668308124,
291
+ "grad_norm": 28.014644622802734,
292
+ "learning_rate": 9.447766443897292e-06,
293
+ "loss": 0.1089,
294
+ "step": 18000
295
+ },
296
+ {
297
+ "epoch": 3.253605346465002,
298
+ "grad_norm": 3.5056657791137695,
299
+ "learning_rate": 9.154648845116662e-06,
300
+ "loss": 0.1085,
301
+ "step": 18500
302
+ },
303
+ {
304
+ "epoch": 3.341540626099191,
305
+ "grad_norm": 3.618999481201172,
306
+ "learning_rate": 8.86153124633603e-06,
307
+ "loss": 0.1114,
308
+ "step": 19000
309
+ },
310
+ {
311
+ "epoch": 3.4294759057333803,
312
+ "grad_norm": 7.726201057434082,
313
+ "learning_rate": 8.5684136475554e-06,
314
+ "loss": 0.1096,
315
+ "step": 19500
316
+ },
317
+ {
318
+ "epoch": 3.5174111853675694,
319
+ "grad_norm": 10.587724685668945,
320
+ "learning_rate": 8.275296048774769e-06,
321
+ "loss": 0.1064,
322
+ "step": 20000
323
+ },
324
+ {
325
+ "epoch": 3.605346465001759,
326
+ "grad_norm": 6.7054948806762695,
327
+ "learning_rate": 7.982178449994138e-06,
328
+ "loss": 0.1067,
329
+ "step": 20500
330
+ },
331
+ {
332
+ "epoch": 3.693281744635948,
333
+ "grad_norm": 5.118494987487793,
334
+ "learning_rate": 7.689060851213508e-06,
335
+ "loss": 0.1067,
336
+ "step": 21000
337
+ },
338
+ {
339
+ "epoch": 3.7812170242701373,
340
+ "grad_norm": 5.718533039093018,
341
+ "learning_rate": 7.395943252432877e-06,
342
+ "loss": 0.107,
343
+ "step": 21500
344
+ },
345
+ {
346
+ "epoch": 3.8691523039043263,
347
+ "grad_norm": 3.717242479324341,
348
+ "learning_rate": 7.102825653652246e-06,
349
+ "loss": 0.105,
350
+ "step": 22000
351
+ },
352
+ {
353
+ "epoch": 3.9570875835385158,
354
+ "grad_norm": 5.7518439292907715,
355
+ "learning_rate": 6.8097080548716155e-06,
356
+ "loss": 0.1051,
357
+ "step": 22500
358
+ },
359
+ {
360
+ "epoch": 4.0,
361
+ "eval_accuracy": 0.9207024486767252,
362
+ "eval_combined_score": 0.9075653971518505,
363
+ "eval_f1": 0.8944283456269757,
364
+ "eval_loss": 0.27043241262435913,
365
+ "eval_runtime": 165.1459,
366
+ "eval_samples_per_second": 244.814,
367
+ "eval_steps_per_second": 30.603,
368
+ "step": 22744
369
+ },
370
+ {
371
+ "epoch": 4.045022863172705,
372
+ "grad_norm": 8.56693172454834,
373
+ "learning_rate": 6.5165904560909846e-06,
374
+ "loss": 0.0944,
375
+ "step": 23000
376
+ },
377
+ {
378
+ "epoch": 4.132958142806894,
379
+ "grad_norm": 0.9398753046989441,
380
+ "learning_rate": 6.223472857310353e-06,
381
+ "loss": 0.0836,
382
+ "step": 23500
383
+ },
384
+ {
385
+ "epoch": 4.220893422441083,
386
+ "grad_norm": 11.362903594970703,
387
+ "learning_rate": 5.930355258529723e-06,
388
+ "loss": 0.0786,
389
+ "step": 24000
390
+ },
391
+ {
392
+ "epoch": 4.308828702075273,
393
+ "grad_norm": 10.553667068481445,
394
+ "learning_rate": 5.637237659749092e-06,
395
+ "loss": 0.0805,
396
+ "step": 24500
397
+ },
398
+ {
399
+ "epoch": 4.396763981709462,
400
+ "grad_norm": 2.639619827270508,
401
+ "learning_rate": 5.344120060968461e-06,
402
+ "loss": 0.0779,
403
+ "step": 25000
404
+ },
405
+ {
406
+ "epoch": 4.484699261343651,
407
+ "grad_norm": 4.3451008796691895,
408
+ "learning_rate": 5.051002462187831e-06,
409
+ "loss": 0.0778,
410
+ "step": 25500
411
+ },
412
+ {
413
+ "epoch": 4.57263454097784,
414
+ "grad_norm": 1.3628908395767212,
415
+ "learning_rate": 4.7578848634072e-06,
416
+ "loss": 0.0794,
417
+ "step": 26000
418
+ },
419
+ {
420
+ "epoch": 4.66056982061203,
421
+ "grad_norm": 14.94767951965332,
422
+ "learning_rate": 4.464767264626568e-06,
423
+ "loss": 0.08,
424
+ "step": 26500
425
+ },
426
+ {
427
+ "epoch": 4.748505100246219,
428
+ "grad_norm": 3.760082721710205,
429
+ "learning_rate": 4.171649665845938e-06,
430
+ "loss": 0.0785,
431
+ "step": 27000
432
+ },
433
+ {
434
+ "epoch": 4.836440379880408,
435
+ "grad_norm": 9.946064949035645,
436
+ "learning_rate": 3.878532067065307e-06,
437
+ "loss": 0.0809,
438
+ "step": 27500
439
+ },
440
+ {
441
+ "epoch": 4.924375659514597,
442
+ "grad_norm": 16.117393493652344,
443
+ "learning_rate": 3.585414468284676e-06,
444
+ "loss": 0.0794,
445
+ "step": 28000
446
+ },
447
+ {
448
+ "epoch": 5.0,
449
+ "eval_accuracy": 0.9194410091516201,
450
+ "eval_combined_score": 0.9052938477127372,
451
+ "eval_f1": 0.8911466862738545,
452
+ "eval_loss": 0.32720956206321716,
453
+ "eval_runtime": 162.6906,
454
+ "eval_samples_per_second": 248.508,
455
+ "eval_steps_per_second": 31.065,
456
+ "step": 28430
457
+ },
458
+ {
459
+ "epoch": 5.012310939148787,
460
+ "grad_norm": 27.78730010986328,
461
+ "learning_rate": 3.2922968695040454e-06,
462
+ "loss": 0.0743,
463
+ "step": 28500
464
+ },
465
+ {
466
+ "epoch": 5.100246218782976,
467
+ "grad_norm": 8.635354995727539,
468
+ "learning_rate": 2.999179270723415e-06,
469
+ "loss": 0.0599,
470
+ "step": 29000
471
+ },
472
+ {
473
+ "epoch": 5.188181498417165,
474
+ "grad_norm": 11.963053703308105,
475
+ "learning_rate": 2.7060616719427835e-06,
476
+ "loss": 0.0619,
477
+ "step": 29500
478
+ },
479
+ {
480
+ "epoch": 5.276116778051354,
481
+ "grad_norm": 3.372576951980591,
482
+ "learning_rate": 2.412944073162153e-06,
483
+ "loss": 0.0606,
484
+ "step": 30000
485
+ },
486
+ {
487
+ "epoch": 5.3640520576855435,
488
+ "grad_norm": 6.7732343673706055,
489
+ "learning_rate": 2.119826474381522e-06,
490
+ "loss": 0.0616,
491
+ "step": 30500
492
+ },
493
+ {
494
+ "epoch": 5.451987337319733,
495
+ "grad_norm": 11.202216148376465,
496
+ "learning_rate": 1.8267088756008912e-06,
497
+ "loss": 0.0662,
498
+ "step": 31000
499
+ },
500
+ {
501
+ "epoch": 5.5399226169539215,
502
+ "grad_norm": 3.270718574523926,
503
+ "learning_rate": 1.5335912768202605e-06,
504
+ "loss": 0.0543,
505
+ "step": 31500
506
+ },
507
+ {
508
+ "epoch": 5.627857896588111,
509
+ "grad_norm": 10.084086418151855,
510
+ "learning_rate": 1.2404736780396295e-06,
511
+ "loss": 0.0578,
512
+ "step": 32000
513
+ },
514
+ {
515
+ "epoch": 5.7157931762223,
516
+ "grad_norm": 212.55088806152344,
517
+ "learning_rate": 9.473560792589988e-07,
518
+ "loss": 0.0596,
519
+ "step": 32500
520
+ },
521
+ {
522
+ "epoch": 5.80372845585649,
523
+ "grad_norm": 0.6594523787498474,
524
+ "learning_rate": 6.542384804783681e-07,
525
+ "loss": 0.0589,
526
+ "step": 33000
527
+ },
528
+ {
529
+ "epoch": 5.891663735490679,
530
+ "grad_norm": 11.235882759094238,
531
+ "learning_rate": 3.611208816977372e-07,
532
+ "loss": 0.0615,
533
+ "step": 33500
534
+ },
535
+ {
536
+ "epoch": 5.979599015124868,
537
+ "grad_norm": 18.778226852416992,
538
+ "learning_rate": 6.800328291710635e-08,
539
+ "loss": 0.0564,
540
+ "step": 34000
541
+ },
542
+ {
543
+ "epoch": 6.0,
544
+ "eval_accuracy": 0.9206777145683898,
545
+ "eval_combined_score": 0.9073393530700153,
546
+ "eval_f1": 0.894000991571641,
547
+ "eval_loss": 0.35158464312553406,
548
+ "eval_runtime": 162.3775,
549
+ "eval_samples_per_second": 248.988,
550
+ "eval_steps_per_second": 31.125,
551
+ "step": 34116
552
+ },
553
+ {
554
+ "epoch": 6.0,
555
+ "step": 34116,
556
+ "total_flos": 5.086192482321592e+17,
557
+ "train_loss": 0.14204222600531893,
558
+ "train_runtime": 24723.4908,
559
+ "train_samples_per_second": 88.3,
560
+ "train_steps_per_second": 1.38
561
+ }
562
+ ],
563
+ "logging_steps": 500,
564
+ "max_steps": 34116,
565
+ "num_input_tokens_seen": 0,
566
+ "num_train_epochs": 6,
567
+ "save_steps": 500,
568
+ "stateful_callbacks": {
569
+ "TrainerControl": {
570
+ "args": {
571
+ "should_epoch_stop": false,
572
+ "should_evaluate": false,
573
+ "should_log": false,
574
+ "should_save": true,
575
+ "should_training_stop": true
576
+ },
577
+ "attributes": {}
578
+ }
579
+ },
580
+ "total_flos": 5.086192482321592e+17,
581
+ "train_batch_size": 64,
582
+ "trial_name": null,
583
+ "trial_params": null
584
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28afc7ac3812bd139eea7325ff03dd15bc7517aef5d91a3a0dca21d9ec7bf6be
3
+ size 4783
vocab.json ADDED
The diff for this file is too large to render. See raw diff