bowphs committed
Commit 56a9957 · verified
1 Parent(s): 27ad21a

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "architectures": [
+     "DebertaV2ForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1536,
+   "initializer_range": 0.02,
+   "intermediate_size": 6144,
+   "layer_norm_eps": 1e-07,
+   "legacy": true,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "num_attention_heads": 24,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 1536,
+   "pos_att_type": null,
+   "position_biased_input": true,
+   "relative_attention": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.47.0",
+   "type_vocab_size": 0,
+   "vocab_size": 7118
+ }
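
The config above describes a 24-layer DeBERTa-v2 masked-language model (hidden size 1536, 24 attention heads, vocabulary of 7118 tokens). A minimal sketch of loading it with the transformers auto classes follows; the local folder name "checkpoint-16456" is an assumption for illustration, not a path taken from this commit, and loading the weights requires the resolved model.safetensors file rather than the LFS pointer below.

# Minimal sketch, assuming the uploaded files sit in a local folder
# called "checkpoint-16456" (placeholder name, not from this commit).
from transformers import AutoConfig, AutoModelForMaskedLM

config = AutoConfig.from_pretrained("checkpoint-16456")
print(config.model_type, config.hidden_size, config.vocab_size)   # deberta-v2 1536 7118

model = AutoModelForMaskedLM.from_pretrained("checkpoint-16456")
print(sum(p.numel() for p in model.parameters()))                 # rough parameter count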
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d1fd6fd81e440daec3f421dcb58c5ebdd1f52b3b050b9ec7e007c3a789b411a
+ size 2776250200
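
The three lines above are a Git LFS pointer: the ~2.8 GB safetensors payload is stored out of band and identified only by its SHA-256 digest. A short sketch for checking that a downloaded copy matches the digest recorded here (the local filename is an assumption):

# Verify a downloaded model.safetensors against the LFS pointer's oid.
import hashlib

expected = "5d1fd6fd81e440daec3f421dcb58c5ebdd1f52b3b050b9ec7e007c3a789b411a"

h = hashlib.sha256()
with open("model.safetensors", "rb") as f:            # local path is an assumption
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

print("ok" if h.hexdigest() == expected else "checksum mismatch")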
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bdc60ba278b2aaada0ceb3bbff71779426b7613cb9ea81b83b3c46f4172a97
+ size 5552738773
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dca1ebd4029c642140f53a557dd76f33d7b77ae315fc79ae62c06e26f04258f9
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb2b48a881416424368779eb036d46a0986387f3a29f511abab9c39d0a27edae
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
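
The map above registers [EOS], [MASK] and [PAD] as special tokens. Once the tokenizer files from this commit sit in a local folder, they can be inspected as sketched below (the folder name is again a placeholder assumption):

# Load the uploaded tokenizer files and inspect the declared special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("checkpoint-16456")   # placeholder path
print(tokenizer.mask_token, tokenizer.mask_token_id)            # "[MASK]" and its id
print(tokenizer.pad_token, tokenizer.eos_token)                 # "[PAD]", "[EOS]"

# The mask token is what a fill-mask query against this checkpoint would use.
print(tokenizer("a [MASK] example")["input_ids"])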
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,410 @@
+ {
+   "best_metric": 5.093143939971924,
+   "best_model_checkpoint": "2024-12-03-roberta-evacun/checkpoint-16456",
+   "epoch": 17.0,
+   "eval_steps": 500,
+   "global_step": 16456,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.5167958656330749,
+       "grad_norm": 5.255120277404785,
+       "learning_rate": 4.974146845915202e-05,
+       "loss": 5.2027,
+       "step": 500
+     },
+     {
+       "epoch": 1.0,
+       "eval_accuracy": 0.21748133776447606,
+       "eval_loss": 5.119329929351807,
+       "eval_runtime": 571.3565,
+       "eval_samples_per_second": 54.166,
+       "eval_steps_per_second": 3.387,
+       "step": 968
+     },
+     {
+       "epoch": 1.0330749354005169,
+       "grad_norm": 5.84419584274292,
+       "learning_rate": 4.948293691830403e-05,
+       "loss": 5.0886,
+       "step": 1000
+     },
+     {
+       "epoch": 1.5498708010335918,
+       "grad_norm": 4.444571495056152,
+       "learning_rate": 4.922440537745605e-05,
+       "loss": 5.0616,
+       "step": 1500
+     },
+     {
+       "epoch": 2.0,
+       "eval_accuracy": 0.2174891710390481,
+       "eval_loss": 5.119706153869629,
+       "eval_runtime": 570.6419,
+       "eval_samples_per_second": 54.234,
+       "eval_steps_per_second": 3.391,
+       "step": 1936
+     },
+     {
+       "epoch": 2.0661498708010337,
+       "grad_norm": 3.8420157432556152,
+       "learning_rate": 4.896587383660807e-05,
+       "loss": 5.0698,
+       "step": 2000
+     },
+     {
+       "epoch": 2.5829457364341084,
+       "grad_norm": 6.509148120880127,
+       "learning_rate": 4.870734229576008e-05,
+       "loss": 5.0702,
+       "step": 2500
+     },
+     {
+       "epoch": 3.0,
+       "eval_accuracy": 0.21680529124469344,
+       "eval_loss": 5.109970569610596,
+       "eval_runtime": 570.6566,
+       "eval_samples_per_second": 54.232,
+       "eval_steps_per_second": 3.391,
+       "step": 2904
+     },
+     {
+       "epoch": 3.0992248062015504,
+       "grad_norm": 6.749443531036377,
+       "learning_rate": 4.84488107549121e-05,
+       "loss": 5.0508,
+       "step": 3000
+     },
+     {
+       "epoch": 3.616020671834625,
+       "grad_norm": 3.7069902420043945,
+       "learning_rate": 4.819027921406412e-05,
+       "loss": 5.0692,
+       "step": 3500
+     },
+     {
+       "epoch": 4.0,
+       "eval_accuracy": 0.21822890079972776,
+       "eval_loss": 5.109030246734619,
+       "eval_runtime": 570.6551,
+       "eval_samples_per_second": 54.232,
+       "eval_steps_per_second": 3.391,
+       "step": 3872
+     },
+     {
+       "epoch": 4.1322997416020675,
+       "grad_norm": 4.267265319824219,
+       "learning_rate": 4.793174767321613e-05,
+       "loss": 5.0638,
+       "step": 4000
+     },
+     {
+       "epoch": 4.649095607235142,
+       "grad_norm": 4.44468879699707,
+       "learning_rate": 4.7673216132368156e-05,
+       "loss": 5.0761,
+       "step": 4500
+     },
+     {
+       "epoch": 5.0,
+       "eval_accuracy": 0.21750397926781473,
+       "eval_loss": 5.103179931640625,
+       "eval_runtime": 570.6875,
+       "eval_samples_per_second": 54.229,
+       "eval_steps_per_second": 3.391,
+       "step": 4840
+     },
+     {
+       "epoch": 5.165374677002584,
+       "grad_norm": 4.793745994567871,
+       "learning_rate": 4.741468459152017e-05,
+       "loss": 5.0508,
+       "step": 5000
+     },
+     {
+       "epoch": 5.682170542635659,
+       "grad_norm": 4.304242134094238,
+       "learning_rate": 4.7156153050672187e-05,
+       "loss": 5.053,
+       "step": 5500
+     },
+     {
+       "epoch": 6.0,
+       "eval_accuracy": 0.21806737555634184,
+       "eval_loss": 5.103706359863281,
+       "eval_runtime": 570.7022,
+       "eval_samples_per_second": 54.228,
+       "eval_steps_per_second": 3.391,
+       "step": 5808
+     },
+     {
+       "epoch": 6.198449612403101,
+       "grad_norm": 5.248096466064453,
+       "learning_rate": 4.6897621509824205e-05,
+       "loss": 5.0451,
+       "step": 6000
+     },
+     {
+       "epoch": 6.715245478036175,
+       "grad_norm": 4.274606227874756,
+       "learning_rate": 4.663908996897622e-05,
+       "loss": 5.0631,
+       "step": 6500
+     },
+     {
+       "epoch": 7.0,
+       "eval_accuracy": 0.21671510548702227,
+       "eval_loss": 5.111293315887451,
+       "eval_runtime": 570.6785,
+       "eval_samples_per_second": 54.23,
+       "eval_steps_per_second": 3.391,
+       "step": 6776
+     },
+     {
+       "epoch": 7.231524547803618,
+       "grad_norm": 4.097196102142334,
+       "learning_rate": 4.6380558428128236e-05,
+       "loss": 5.0515,
+       "step": 7000
+     },
+     {
+       "epoch": 7.7483204134366925,
+       "grad_norm": 5.150557994842529,
+       "learning_rate": 4.6122026887280254e-05,
+       "loss": 5.0643,
+       "step": 7500
+     },
+     {
+       "epoch": 8.0,
+       "eval_accuracy": 0.21643664071910304,
+       "eval_loss": 5.104173183441162,
+       "eval_runtime": 570.7404,
+       "eval_samples_per_second": 54.224,
+       "eval_steps_per_second": 3.39,
+       "step": 7744
+     },
+     {
+       "epoch": 8.264599483204135,
+       "grad_norm": 4.464865684509277,
+       "learning_rate": 4.5863495346432266e-05,
+       "loss": 5.0575,
+       "step": 8000
+     },
+     {
+       "epoch": 8.78139534883721,
+       "grad_norm": 4.509471893310547,
+       "learning_rate": 4.5604963805584284e-05,
+       "loss": 5.0641,
+       "step": 8500
+     },
+     {
+       "epoch": 9.0,
+       "eval_accuracy": 0.2164407022306587,
+       "eval_loss": 5.1050801277160645,
+       "eval_runtime": 570.7609,
+       "eval_samples_per_second": 54.222,
+       "eval_steps_per_second": 3.39,
+       "step": 8712
+     },
+     {
+       "epoch": 9.29767441860465,
+       "grad_norm": 4.612584114074707,
+       "learning_rate": 4.5346432264736296e-05,
+       "loss": 5.0507,
+       "step": 9000
+     },
+     {
+       "epoch": 9.814470284237727,
+       "grad_norm": 5.6399335861206055,
+       "learning_rate": 4.5087900723888315e-05,
+       "loss": 5.0477,
+       "step": 9500
+     },
+     {
+       "epoch": 10.0,
+       "eval_accuracy": 0.2173244230825529,
+       "eval_loss": 5.097902297973633,
+       "eval_runtime": 570.8859,
+       "eval_samples_per_second": 54.21,
+       "eval_steps_per_second": 3.389,
+       "step": 9680
+     },
+     {
+       "epoch": 10.330749354005167,
+       "grad_norm": 4.067675590515137,
+       "learning_rate": 4.4829369183040333e-05,
+       "loss": 5.0538,
+       "step": 10000
+     },
+     {
+       "epoch": 10.847545219638242,
+       "grad_norm": 3.6515378952026367,
+       "learning_rate": 4.4570837642192345e-05,
+       "loss": 5.0516,
+       "step": 10500
+     },
+     {
+       "epoch": 11.0,
+       "eval_accuracy": 0.21694436479870868,
+       "eval_loss": 5.105097770690918,
+       "eval_runtime": 570.8353,
+       "eval_samples_per_second": 54.215,
+       "eval_steps_per_second": 3.39,
+       "step": 10648
+     },
+     {
+       "epoch": 11.363824289405684,
+       "grad_norm": 3.8045644760131836,
+       "learning_rate": 4.4312306101344364e-05,
+       "loss": 5.0503,
+       "step": 11000
+     },
+     {
+       "epoch": 11.88062015503876,
+       "grad_norm": 6.31251859664917,
+       "learning_rate": 4.405377456049638e-05,
+       "loss": 5.0535,
+       "step": 11500
+     },
+     {
+       "epoch": 12.0,
+       "eval_accuracy": 0.21655864853378665,
+       "eval_loss": 5.102573394775391,
+       "eval_runtime": 570.862,
+       "eval_samples_per_second": 54.213,
+       "eval_steps_per_second": 3.39,
+       "step": 11616
+     },
+     {
+       "epoch": 12.396899224806202,
+       "grad_norm": 7.145920276641846,
+       "learning_rate": 4.3795243019648394e-05,
+       "loss": 5.0599,
+       "step": 12000
+     },
+     {
+       "epoch": 12.913695090439276,
+       "grad_norm": 4.022646427154541,
+       "learning_rate": 4.353671147880042e-05,
+       "loss": 5.0491,
+       "step": 12500
+     },
+     {
+       "epoch": 13.0,
+       "eval_accuracy": 0.21710994508587633,
+       "eval_loss": 5.104992866516113,
+       "eval_runtime": 570.798,
+       "eval_samples_per_second": 54.219,
+       "eval_steps_per_second": 3.39,
+       "step": 12584
+     },
+     {
+       "epoch": 13.429974160206719,
+       "grad_norm": 3.64132022857666,
+       "learning_rate": 4.327817993795243e-05,
+       "loss": 5.0419,
+       "step": 13000
+     },
+     {
+       "epoch": 13.946770025839793,
+       "grad_norm": 4.630414009094238,
+       "learning_rate": 4.301964839710445e-05,
+       "loss": 5.0547,
+       "step": 13500
+     },
+     {
+       "epoch": 14.0,
+       "eval_accuracy": 0.21658861456412434,
+       "eval_loss": 5.10886287689209,
+       "eval_runtime": 570.8383,
+       "eval_samples_per_second": 54.215,
+       "eval_steps_per_second": 3.39,
+       "step": 13552
+     },
+     {
+       "epoch": 14.463049095607236,
+       "grad_norm": 4.112677097320557,
+       "learning_rate": 4.276111685625647e-05,
+       "loss": 5.0602,
+       "step": 14000
+     },
+     {
+       "epoch": 14.97984496124031,
+       "grad_norm": 5.0369873046875,
+       "learning_rate": 4.250258531540848e-05,
+       "loss": 5.055,
+       "step": 14500
+     },
+     {
+       "epoch": 15.0,
+       "eval_accuracy": 0.21731427174975562,
+       "eval_loss": 5.101281642913818,
+       "eval_runtime": 570.8524,
+       "eval_samples_per_second": 54.214,
+       "eval_steps_per_second": 3.39,
+       "step": 14520
+     },
+     {
+       "epoch": 15.496124031007753,
+       "grad_norm": 5.0534281730651855,
+       "learning_rate": 4.22440537745605e-05,
+       "loss": 5.0547,
+       "step": 15000
+     },
+     {
+       "epoch": 16.0,
+       "eval_accuracy": 0.21727949673638433,
+       "eval_loss": 5.1018967628479,
+       "eval_runtime": 570.8378,
+       "eval_samples_per_second": 54.215,
+       "eval_steps_per_second": 3.39,
+       "step": 15488
+     },
+     {
+       "epoch": 16.012403100775195,
+       "grad_norm": 5.0263752937316895,
+       "learning_rate": 4.198552223371252e-05,
+       "loss": 5.0508,
+       "step": 15500
+     },
+     {
+       "epoch": 16.52919896640827,
+       "grad_norm": 3.9026286602020264,
+       "learning_rate": 4.172699069286453e-05,
+       "loss": 5.0532,
+       "step": 16000
+     },
+     {
+       "epoch": 17.0,
+       "eval_accuracy": 0.21736250405690846,
+       "eval_loss": 5.093143939971924,
+       "eval_runtime": 570.8413,
+       "eval_samples_per_second": 54.215,
+       "eval_steps_per_second": 3.39,
+       "step": 16456
+     }
+   ],
+   "logging_steps": 500,
+   "max_steps": 96700,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 100,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 5.31244216577348e+17,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
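
trainer_state.json above records 17 completed epochs of a planned 100 (global step 16456 of max_steps 96700), with the best eval_loss of about 5.093 reached at step 16456. A small sketch for pulling the per-epoch evaluation metrics out of log_history (the local path is an assumption):

# Summarize the evaluation entries recorded in trainer_state.json.
import json

with open("trainer_state.json") as f:   # local path is an assumption
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:            # eval entries; the rest are training-loss logs
        print(f"epoch {entry['epoch']:>5}  eval_loss {entry['eval_loss']:.4f}  "
              f"eval_accuracy {entry['eval_accuracy']:.4f}")

print("best:", state["best_metric"], "at", state["best_model_checkpoint"])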
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91d12bff95227ff53637ef76714e7e63db1124e17a30b683660fb63448b8b5f1
+ size 5368
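
training_args.bin holds the TrainingArguments the run was launched with (the Trainer saves them with torch.save); together with optimizer.pt, scheduler.pt and rng_state.pth it is what allows resuming via trainer.train(resume_from_checkpoint=...). A minimal sketch for inspecting it: the local path and the printed fields are illustrative assumptions, and torch.load needs weights_only=False here because the file is a pickled Python object, so only do this for files you trust.

# Inspect the saved TrainingArguments (pickled object, load only trusted files).
import torch

args = torch.load("training_args.bin", weights_only=False)   # path is an assumption
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)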