Ihor committed
Commit f7dc0b0
1 Parent(s): ed4b7a7

Upload folder using huggingface_hub

added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<<LABEL>>": 128001,
+   "<<SEP>>": 128002,
+   "[MASK]": 128000
+ }
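The three entries above extend the stock DeBERTa-v3 SentencePiece vocabulary (IDs 0–127999) with the mask token and the two GLiClass task tokens this checkpoint relies on. A minimal sketch for confirming that a local copy of the uploaded folder resolves them to the declared IDs, assuming the files live in `./checkpoint` and the standard transformers tokenizer API:

```python
from transformers import AutoTokenizer

# Path to the uploaded folder is an assumption; adjust as needed.
tokenizer = AutoTokenizer.from_pretrained("./checkpoint")

# Each added token should map to the ID declared in added_tokens.json.
for token, expected_id in [("<<LABEL>>", 128001), ("<<SEP>>", 128002), ("[MASK]", 128000)]:
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token

# Total size should agree with the "vocab_size": 128003 declared in config.json below.
print(len(tokenizer))
```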
config.json ADDED
@@ -0,0 +1,108 @@
+ {
+   "architectures": [
+     "GLiClassModel"
+   ],
+   "class_token_index": 128001,
+   "encoder_config": {
+     "_name_or_path": "microsoft/deberta-v3-base",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_probs_dropout_prob": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-07,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 512,
+     "max_relative_positions": -1,
+     "min_length": 0,
+     "model_type": "deberta-v2",
+     "no_repeat_ngram_size": 0,
+     "norm_rel_ebd": "layer_norm",
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 0,
+     "pooler_dropout": 0,
+     "pooler_hidden_act": "gelu",
+     "pooler_hidden_size": 768,
+     "pos_att_type": [
+       "p2c",
+       "c2p"
+     ],
+     "position_biased_input": false,
+     "position_buckets": 256,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "relative_attention": true,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "share_att_key": true,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "type_vocab_size": 0,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "vocab_size": 128003
+   },
+   "encoder_model_name": "microsoft/deberta-v3-base",
+   "hidden_size": 768,
+   "ignore_index": -100,
+   "initializer_range": 0.03,
+   "max_num_classes": 25,
+   "model_type": "GLiClass",
+   "pooling_strategy": "first",
+   "problem_type": "multi_label_classification",
+   "projector_hidden_act": "gelu",
+   "scorer_type": "simple",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.40.2",
+   "use_lstm": false,
+   "vocab_size": 128003
+ }
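This config describes a GLiClassModel head wrapped around a microsoft/deberta-v3-base encoder and set up for multi-label zero-shot classification. Because "model_type": "GLiClass" is not registered in vanilla transformers, the file is easiest to sanity-check with plain json rather than AutoConfig unless the gliclass package is installed; a small sketch, with the local path again assumed:

```python
import json

# Local path to the uploaded folder is an assumption.
with open("./checkpoint/config.json") as f:
    cfg = json.load(f)

print(cfg["architectures"])                         # ['GLiClassModel']
print(cfg["encoder_config"]["_name_or_path"])       # microsoft/deberta-v3-base
print(cfg["problem_type"], cfg["max_num_classes"])  # multi_label_classification 25
print(cfg["class_token_index"], cfg["vocab_size"])  # 128001 128003
```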
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a74089e637821bcc046ce0b45a16e0c45b9d12fc775d21274b516c3b23caa45
+ size 372265272
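What the diff shows for model.safetensors (and for the other binary files below) is only a Git LFS pointer: a spec version line, the SHA-256 of the real blob, and its size in bytes (~372 MB here). Once the actual weights have been fetched (for example with `git lfs pull` or huggingface_hub's download helpers), they can be inspected with the safetensors API; a minimal sketch assuming a local copy:

```python
from safetensors.torch import load_file

# Requires the real blob, not the LFS pointer; the local path is an assumption.
state_dict = load_file("./checkpoint/model.safetensors")

total = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {total / 1e6:.1f}M parameters")
```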
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ff788e493dd1f86f258427da20ed45ebae8aaa65c1274ea0867a706450cf565
+ size 744650362
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50efd9e038ed1d8fd02c3b897b8d6eb9c7ccc7ed56d4fb2693b64c68deb9ae11
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0dcdd51df27b6045f8c06f22c2ae42a1190b854eb7f7396adffb65879362f5b
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
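spm.model is the SentencePiece model the DeBERTa-v3 tokenizer is built on, again stored via LFS (~2.4 MB). Once fetched it can be inspected directly with the sentencepiece package; a sketch under the same local-path assumption:

```python
import sentencepiece as spm

# Base SentencePiece vocabulary, before the task tokens from added_tokens.json.
sp = spm.SentencePieceProcessor(model_file="./checkpoint/spm.model")

print(sp.vocab_size())
print(sp.encode("zero-shot text classification", out_type=str))
```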
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,74 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128000": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128001": {
+       "content": "<<LABEL>>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "128002": {
+       "content": "<<SEP>>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "split_by_punct": false,
+   "tokenizer_class": "DebertaV2Tokenizer",
+   "unk_token": "[UNK]",
+   "vocab_type": "spm"
+ }
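tokenizer_config.json pins the tokenizer class to DebertaV2Tokenizer over spm.model, registers the standard DeBERTa special tokens together with the <<LABEL>> and <<SEP>> task tokens, and leaves model_max_length at the transformers "unset" sentinel. A quick round-trip sketch; the label-then-text layout shown is purely illustrative of how such label/separator tokens are commonly used, not a statement of GLiClass's exact input format:

```python
from transformers import DebertaV2Tokenizer

# Local path is an assumption; the class name comes from tokenizer_config.json.
tokenizer = DebertaV2Tokenizer.from_pretrained("./checkpoint")

# Added tokens are kept intact instead of being split by SentencePiece.
text = "<<LABEL>> positive <<LABEL>> negative <<SEP>> I really enjoyed this movie."
ids = tokenizer(text)["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
```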
trainer_state.json ADDED
@@ -0,0 +1,641 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 5.9988002399520095,
+   "eval_steps": 500,
+   "global_step": 40000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.07498500299940011,
+       "grad_norm": 32.0,
+       "learning_rate": 6.248437890527369e-06,
+       "loss": 1.2095,
+       "step": 500
+     },
+     {
+       "epoch": 0.14997000599880023,
+       "grad_norm": 16.5,
+       "learning_rate": 1.2496875781054738e-05,
+       "loss": 0.8665,
+       "step": 1000
+     },
+     {
+       "epoch": 0.22495500899820037,
+       "grad_norm": 17.75,
+       "learning_rate": 1.8745313671582104e-05,
+       "loss": 0.6844,
+       "step": 1500
+     },
+     {
+       "epoch": 0.29994001199760045,
+       "grad_norm": 7.78125,
+       "learning_rate": 2.4993751562109475e-05,
+       "loss": 0.5099,
+       "step": 2000
+     },
+     {
+       "epoch": 0.3749250149970006,
+       "grad_norm": 7.375,
+       "learning_rate": 3.124218945263684e-05,
+       "loss": 0.2845,
+       "step": 2500
+     },
+     {
+       "epoch": 0.44991001799640074,
+       "grad_norm": 8.0,
+       "learning_rate": 3.749062734316421e-05,
+       "loss": 0.2251,
+       "step": 3000
+     },
+     {
+       "epoch": 0.5248950209958009,
+       "grad_norm": 6.1875,
+       "learning_rate": 4.373906523369158e-05,
+       "loss": 0.206,
+       "step": 3500
+     },
+     {
+       "epoch": 0.5998800239952009,
+       "grad_norm": 3.640625,
+       "learning_rate": 4.998750312421895e-05,
+       "loss": 0.1874,
+       "step": 4000
+     },
+     {
+       "epoch": 0.674865026994601,
+       "grad_norm": 4.8125,
+       "learning_rate": 4.997630981947256e-05,
+       "loss": 0.1793,
+       "step": 4500
+     },
+     {
+       "epoch": 0.7498500299940012,
+       "grad_norm": 3.953125,
+       "learning_rate": 4.9905094389741605e-05,
+       "loss": 0.1708,
+       "step": 5000
+     },
+     {
+       "epoch": 0.8248350329934013,
+       "grad_norm": 6.125,
+       "learning_rate": 4.978648912557427e-05,
+       "loss": 0.1614,
+       "step": 5500
+     },
+     {
+       "epoch": 0.8998200359928015,
+       "grad_norm": 3.15625,
+       "learning_rate": 4.962071971107133e-05,
+       "loss": 0.1581,
+       "step": 6000
+     },
+     {
+       "epoch": 0.9748050389922016,
+       "grad_norm": 3.984375,
+       "learning_rate": 4.940810157507576e-05,
+       "loss": 0.154,
+       "step": 6500
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.6357488330240144,
+       "eval_f1": 0.5235656092218528,
+       "eval_loss": 0.1340431272983551,
+       "eval_precision": 0.4623448805170602,
+       "eval_recall": 0.6357488330240144,
+       "eval_runtime": 40.157,
+       "eval_samples_per_second": 295.191,
+       "eval_steps_per_second": 18.453,
+       "step": 6668
+     },
+     {
+       "epoch": 1.0497900419916018,
+       "grad_norm": 2.796875,
+       "learning_rate": 4.914903929096945e-05,
+       "loss": 0.1466,
+       "step": 7000
+     },
+     {
+       "epoch": 1.124775044991002,
+       "grad_norm": 3.8125,
+       "learning_rate": 4.884402580684407e-05,
+       "loss": 0.1503,
+       "step": 7500
+     },
+     {
+       "epoch": 1.1997600479904018,
+       "grad_norm": 2.90625,
+       "learning_rate": 4.8493641507511146e-05,
+       "loss": 0.1463,
+       "step": 8000
+     },
+     {
+       "epoch": 1.274745050989802,
+       "grad_norm": 5.15625,
+       "learning_rate": 4.809855311013604e-05,
+       "loss": 0.1473,
+       "step": 8500
+     },
+     {
+       "epoch": 1.349730053989202,
+       "grad_norm": 3.3125,
+       "learning_rate": 4.765951239559725e-05,
+       "loss": 0.1448,
+       "step": 9000
+     },
+     {
+       "epoch": 1.4247150569886022,
+       "grad_norm": 5.125,
+       "learning_rate": 4.717735477798505e-05,
+       "loss": 0.1381,
+       "step": 9500
+     },
+     {
+       "epoch": 1.4997000599880024,
+       "grad_norm": 3.046875,
+       "learning_rate": 4.665299771496145e-05,
+       "loss": 0.1413,
+       "step": 10000
+     },
+     {
+       "epoch": 1.5746850629874025,
+       "grad_norm": 4.40625,
+       "learning_rate": 4.608743896200624e-05,
+       "loss": 0.1419,
+       "step": 10500
+     },
+     {
+       "epoch": 1.6496700659868027,
+       "grad_norm": 3.984375,
+       "learning_rate": 4.548175467387103e-05,
+       "loss": 0.1407,
+       "step": 11000
+     },
+     {
+       "epoch": 1.7246550689862028,
+       "grad_norm": 4.34375,
+       "learning_rate": 4.483709735685378e-05,
+       "loss": 0.1394,
+       "step": 11500
+     },
+     {
+       "epoch": 1.799640071985603,
+       "grad_norm": 4.09375,
+       "learning_rate": 4.415469367579033e-05,
+       "loss": 0.1379,
+       "step": 12000
+     },
+     {
+       "epoch": 1.874625074985003,
+       "grad_norm": 2.0,
+       "learning_rate": 4.343584211993589e-05,
+       "loss": 0.1377,
+       "step": 12500
+     },
+     {
+       "epoch": 1.9496100779844032,
+       "grad_norm": 2.96875,
+       "learning_rate": 4.268191053217765e-05,
+       "loss": 0.1357,
+       "step": 13000
+     },
+     {
+       "epoch": 2.0,
+       "eval_accuracy": 0.6386662729880209,
+       "eval_f1": 0.525822096019018,
+       "eval_loss": 0.12524983286857605,
+       "eval_precision": 0.463306913237047,
+       "eval_recall": 0.6386662729880209,
+       "eval_runtime": 40.1169,
+       "eval_samples_per_second": 295.487,
+       "eval_steps_per_second": 18.471,
+       "step": 13336
+     },
+     {
+       "epoch": 2.0245950809838034,
+       "grad_norm": 5.4375,
+       "learning_rate": 4.189433350628029e-05,
+       "loss": 0.1366,
+       "step": 13500
+     },
+     {
+       "epoch": 2.0995800839832035,
+       "grad_norm": 5.875,
+       "learning_rate": 4.10746096571167e-05,
+       "loss": 0.1329,
+       "step": 14000
+     },
+     {
+       "epoch": 2.1745650869826036,
+       "grad_norm": 4.15625,
+       "learning_rate": 4.02242987690783e-05,
+       "loss": 0.1325,
+       "step": 14500
+     },
+     {
+       "epoch": 2.249550089982004,
+       "grad_norm": 3.71875,
+       "learning_rate": 3.9345018828090864e-05,
+       "loss": 0.1321,
+       "step": 15000
+     },
+     {
+       "epoch": 2.324535092981404,
+       "grad_norm": 2.796875,
+       "learning_rate": 3.843844294288368e-05,
+       "loss": 0.1332,
+       "step": 15500
+     },
+     {
+       "epoch": 2.3995200959808036,
+       "grad_norm": 3.203125,
+       "learning_rate": 3.750629616136989e-05,
+       "loss": 0.1281,
+       "step": 16000
+     },
+     {
+       "epoch": 2.4745050989802038,
+       "grad_norm": 2.921875,
+       "learning_rate": 3.6550352188196244e-05,
+       "loss": 0.1307,
+       "step": 16500
+     },
+     {
+       "epoch": 2.549490101979604,
+       "grad_norm": 4.34375,
+       "learning_rate": 3.557243000970787e-05,
+       "loss": 0.1345,
+       "step": 17000
+     },
+     {
+       "epoch": 2.624475104979004,
+       "grad_norm": 3.28125,
+       "learning_rate": 3.457439043275033e-05,
+       "loss": 0.129,
+       "step": 17500
+     },
+     {
+       "epoch": 2.699460107978404,
+       "grad_norm": 3.703125,
+       "learning_rate": 3.355813254389495e-05,
+       "loss": 0.135,
+       "step": 18000
+     },
+     {
+       "epoch": 2.7744451109778043,
+       "grad_norm": 2.96875,
+       "learning_rate": 3.252559009582478e-05,
+       "loss": 0.1336,
+       "step": 18500
+     },
+     {
+       "epoch": 2.8494301139772045,
+       "grad_norm": 3.6875,
+       "learning_rate": 3.14787278277573e-05,
+       "loss": 0.1327,
+       "step": 19000
+     },
+     {
+       "epoch": 2.9244151169766046,
+       "grad_norm": 4.34375,
+       "learning_rate": 3.0419537726905434e-05,
+       "loss": 0.1326,
+       "step": 19500
+     },
+     {
+       "epoch": 2.9994001199760048,
+       "grad_norm": 4.28125,
+       "learning_rate": 2.9350035238090666e-05,
+       "loss": 0.1295,
+       "step": 20000
+     },
+     {
+       "epoch": 3.0,
+       "eval_accuracy": 0.6393481806422586,
+       "eval_f1": 0.5262540403072158,
+       "eval_loss": 0.12466703355312347,
+       "eval_precision": 0.4630570055607114,
+       "eval_recall": 0.6393481806422586,
+       "eval_runtime": 40.1133,
+       "eval_samples_per_second": 295.513,
+       "eval_steps_per_second": 18.473,
+       "step": 20004
+     },
+     {
+       "epoch": 3.074385122975405,
+       "grad_norm": 4.625,
+       "learning_rate": 2.827225542872053e-05,
+       "loss": 0.1301,
+       "step": 20500
+     },
+     {
+       "epoch": 3.149370125974805,
+       "grad_norm": 3.4375,
+       "learning_rate": 2.7188249116427988e-05,
+       "loss": 0.1287,
+       "step": 21000
+     },
+     {
+       "epoch": 3.224355128974205,
+       "grad_norm": 4.0625,
+       "learning_rate": 2.6100078966740953e-05,
+       "loss": 0.1314,
+       "step": 21500
+     },
+     {
+       "epoch": 3.2993401319736053,
+       "grad_norm": 4.1875,
+       "learning_rate": 2.500981556820753e-05,
+       "loss": 0.1299,
+       "step": 22000
+     },
+     {
+       "epoch": 3.3743251349730055,
+       "grad_norm": 4.3125,
+       "learning_rate": 2.3919533492445064e-05,
+       "loss": 0.1298,
+       "step": 22500
+     },
+     {
+       "epoch": 3.4493101379724056,
+       "grad_norm": 4.25,
+       "learning_rate": 2.2831307346610255e-05,
+       "loss": 0.1293,
+       "step": 23000
+     },
+     {
+       "epoch": 3.5242951409718057,
+       "grad_norm": 3.640625,
+       "learning_rate": 2.17472078258016e-05,
+       "loss": 0.1261,
+       "step": 23500
+     },
+     {
+       "epoch": 3.599280143971206,
+       "grad_norm": 3.75,
+       "learning_rate": 2.066929777290578e-05,
+       "loss": 0.1298,
+       "step": 24000
+     },
+     {
+       "epoch": 3.674265146970606,
+       "grad_norm": 4.9375,
+       "learning_rate": 1.9599628253385327e-05,
+       "loss": 0.1302,
+       "step": 24500
+     },
+     {
+       "epoch": 3.749250149970006,
+       "grad_norm": 3.4375,
+       "learning_rate": 1.8540234652476617e-05,
+       "loss": 0.1295,
+       "step": 25000
+     },
+     {
+       "epoch": 3.824235152969406,
+       "grad_norm": 3.875,
+       "learning_rate": 1.7493132802224482e-05,
+       "loss": 0.1285,
+       "step": 25500
+     },
+     {
+       "epoch": 3.8992201559688064,
+       "grad_norm": 4.15625,
+       "learning_rate": 1.6460315145722894e-05,
+       "loss": 0.1272,
+       "step": 26000
+     },
+     {
+       "epoch": 3.974205158968206,
+       "grad_norm": 3.890625,
+       "learning_rate": 1.5443746945860566e-05,
+       "loss": 0.1294,
+       "step": 26500
+     },
+     {
+       "epoch": 4.0,
+       "eval_accuracy": 0.639137281367752,
+       "eval_f1": 0.5262984000216192,
+       "eval_loss": 0.12251855432987213,
+       "eval_precision": 0.4640513156681777,
+       "eval_recall": 0.639137281367752,
+       "eval_runtime": 40.1904,
+       "eval_samples_per_second": 294.946,
+       "eval_steps_per_second": 18.437,
+       "step": 26672
+     },
+     {
+       "epoch": 4.049190161967607,
+       "grad_norm": 3.1875,
+       "learning_rate": 1.4445362545785581e-05,
+       "loss": 0.1311,
+       "step": 27000
+     },
+     {
+       "epoch": 4.124175164967006,
+       "grad_norm": 3.390625,
+       "learning_rate": 1.3467061688204524e-05,
+       "loss": 0.1289,
+       "step": 27500
+     },
+     {
+       "epoch": 4.199160167966407,
+       "grad_norm": 6.0625,
+       "learning_rate": 1.2510705900519926e-05,
+       "loss": 0.1263,
+       "step": 28000
+     },
+     {
+       "epoch": 4.274145170965807,
+       "grad_norm": 4.15625,
+       "learning_rate": 1.1578114952684529e-05,
+       "loss": 0.1273,
+       "step": 28500
+     },
+     {
+       "epoch": 4.349130173965207,
+       "grad_norm": 4.21875,
+       "learning_rate": 1.0671063394512279e-05,
+       "loss": 0.1339,
+       "step": 29000
+     },
+     {
+       "epoch": 4.424115176964607,
+       "grad_norm": 4.5,
+       "learning_rate": 9.791277179034853e-06,
+       "loss": 0.1286,
+       "step": 29500
+     },
+     {
+       "epoch": 4.499100179964008,
+       "grad_norm": 3.921875,
+       "learning_rate": 8.940430378329174e-06,
+       "loss": 0.133,
+       "step": 30000
+     },
+     {
+       "epoch": 4.574085182963407,
+       "grad_norm": 2.828125,
+       "learning_rate": 8.120141998064757e-06,
+       "loss": 0.1241,
+       "step": 30500
+     },
+     {
+       "epoch": 4.649070185962808,
+       "grad_norm": 3.203125,
+       "learning_rate": 7.331972896832292e-06,
+       "loss": 0.1269,
+       "step": 31000
+     },
+     {
+       "epoch": 4.7240551889622076,
+       "grad_norm": 6.40625,
+       "learning_rate": 6.577422816115633e-06,
+       "loss": 0.1303,
+       "step": 31500
+     },
+     {
+       "epoch": 4.799040191961607,
+       "grad_norm": 3.328125,
+       "learning_rate": 5.857927526558302e-06,
+       "loss": 0.1306,
+       "step": 32000
+     },
+     {
+       "epoch": 4.874025194961008,
+       "grad_norm": 2.96875,
+       "learning_rate": 5.174856095954883e-06,
+       "loss": 0.129,
+       "step": 32500
+     },
+     {
+       "epoch": 4.9490101979604075,
+       "grad_norm": 2.65625,
+       "learning_rate": 4.529508284165662e-06,
+       "loss": 0.1271,
+       "step": 33000
+     },
+     {
+       "epoch": 5.0,
+       "eval_accuracy": 0.6391864911984703,
+       "eval_f1": 0.5263388946843862,
+       "eval_loss": 0.12287881225347519,
+       "eval_precision": 0.464080708996073,
+       "eval_recall": 0.6391864911984703,
+       "eval_runtime": 40.1821,
+       "eval_samples_per_second": 295.007,
+       "eval_steps_per_second": 18.441,
+       "step": 33340
+     },
+     {
+       "epoch": 5.023995200959808,
+       "grad_norm": 2.296875,
+       "learning_rate": 3.923112069911616e-06,
+       "loss": 0.1263,
+       "step": 33500
+     },
+     {
+       "epoch": 5.098980203959208,
+       "grad_norm": 8.375,
+       "learning_rate": 3.3568213141557586e-06,
+       "loss": 0.1289,
+       "step": 34000
+     },
+     {
+       "epoch": 5.173965206958608,
+       "grad_norm": 3.21875,
+       "learning_rate": 2.8317135645169912e-06,
+       "loss": 0.1291,
+       "step": 34500
+     },
+     {
+       "epoch": 5.248950209958008,
+       "grad_norm": 4.1875,
+       "learning_rate": 2.3487880048942728e-06,
+       "loss": 0.1271,
+       "step": 35000
+     },
+     {
+       "epoch": 5.323935212957409,
+       "grad_norm": 3.765625,
+       "learning_rate": 1.9089635542026586e-06,
+       "loss": 0.1306,
+       "step": 35500
+     },
+     {
+       "epoch": 5.398920215956808,
+       "grad_norm": 6.21875,
+       "learning_rate": 1.5130771178388153e-06,
+       "loss": 0.1287,
+       "step": 36000
+     },
+     {
+       "epoch": 5.473905218956209,
+       "grad_norm": 4.65625,
+       "learning_rate": 1.1618819952033066e-06,
+       "loss": 0.1268,
+       "step": 36500
+     },
+     {
+       "epoch": 5.548890221955609,
+       "grad_norm": 3.625,
+       "learning_rate": 8.560464463097773e-07,
+       "loss": 0.1293,
+       "step": 37000
+     },
+     {
+       "epoch": 5.623875224955009,
+       "grad_norm": 4.28125,
+       "learning_rate": 5.961524202084901e-07,
+       "loss": 0.1292,
+       "step": 37500
+     },
+     {
+       "epoch": 5.698860227954409,
+       "grad_norm": 3.03125,
+       "learning_rate": 3.826944476438388e-07,
+       "loss": 0.1311,
+       "step": 38000
+     },
+     {
+       "epoch": 5.7738452309538095,
+       "grad_norm": 3.609375,
+       "learning_rate": 2.1607870005291575e-07,
+       "loss": 0.1269,
+       "step": 38500
+     },
+     {
+       "epoch": 5.848830233953209,
+       "grad_norm": 4.6875,
+       "learning_rate": 9.662221669560767e-08,
+       "loss": 0.1273,
+       "step": 39000
+     },
+     {
+       "epoch": 5.92381523695261,
+       "grad_norm": 3.15625,
+       "learning_rate": 2.4552301386951415e-08,
+       "loss": 0.1312,
+       "step": 39500
+     },
+     {
+       "epoch": 5.9988002399520095,
+       "grad_norm": 4.78125,
+       "learning_rate": 6.089979592838191e-12,
+       "loss": 0.1285,
+       "step": 40000
+     }
+   ],
+   "logging_steps": 500,
+   "max_steps": 40008,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 6,
+   "save_steps": 1000,
+   "total_flos": 1.7186915377741824e+17,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
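trainer_state.json is the Trainer's running log for this checkpoint: roughly 40k optimizer steps over 6 epochs, a loss entry every 500 steps, and an eval_* block at each epoch boundary (together with optimizer.pt, scheduler.pt, and rng_state.pth above, this makes the folder a resumable checkpoint for resume_from_checkpoint). A small sketch for pulling the per-epoch eval metrics back out of log_history, path assumed local:

```python
import json

with open("./checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Rows that carry eval_* keys are the per-epoch evaluation summaries.
for entry in state["log_history"]:
    if "eval_f1" in entry:
        print(f'epoch {entry["epoch"]:.0f}: '
              f'f1={entry["eval_f1"]:.4f}  '
              f'accuracy={entry["eval_accuracy"]:.4f}  '
              f'eval_loss={entry["eval_loss"]:.4f}')
```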
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef50d79747e142d8f7a8a481db4201a6e5d9e5b873ac2f5107138330d013c9da
+ size 5048
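training_args.bin is the pickled TrainingArguments object the Trainer saved alongside the checkpoint; it can be unpickled with torch as long as a compatible transformers version (4.40.x here) is importable. A sketch, noting that recent torch versions require explicitly allowing full (non-weights-only) unpickling, which should only be done for trusted files:

```python
import torch

# TrainingArguments is an arbitrary pickled object, not plain tensors;
# local path is an assumption.
args = torch.load("./checkpoint/training_args.bin", weights_only=False)

print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```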