kiddothe2b committed on
Commit
e93dd90
1 Parent(s): 8c84888

Training in progress, step 6400

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint-*/
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "data/models/longformer-replicated-random-pos-encodings-4096",
3
+ "architectures": [
4
+ "LongformerForMaskedLM"
5
+ ],
6
+ "attention_mode": "longformer",
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "attention_window": [
9
+ 512,
10
+ 512,
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512
15
+ ],
16
+ "bos_token_id": 0,
17
+ "classifier_dropout": null,
18
+ "cls_token_id": 0,
19
+ "eos_token_id": 2,
20
+ "gradient_checkpointing": false,
21
+ "hidden_act": "gelu",
22
+ "hidden_dropout_prob": 0.1,
23
+ "hidden_size": 768,
24
+ "ignore_attention_mask": false,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 3072,
27
+ "layer_norm_eps": 1e-05,
28
+ "max_position_embeddings": 4098,
29
+ "model_max_length": 4096,
30
+ "model_type": "longformer",
31
+ "num_attention_heads": 12,
32
+ "num_hidden_layers": 6,
33
+ "pad_token_id": 1,
34
+ "position_embedding_type": "absolute",
35
+ "sep_token_id": 2,
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.20.0",
38
+ "type_vocab_size": 1,
39
+ "use_cache": true,
40
+ "vocab_size": 50265
41
+ }
last-checkpoint/config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "data/models/longformer-replicated-random-pos-encodings-4096",
3
+ "architectures": [
4
+ "LongformerForMaskedLM"
5
+ ],
6
+ "attention_mode": "longformer",
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "attention_window": [
9
+ 512,
10
+ 512,
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512
15
+ ],
16
+ "bos_token_id": 0,
17
+ "classifier_dropout": null,
18
+ "cls_token_id": 0,
19
+ "eos_token_id": 2,
20
+ "gradient_checkpointing": false,
21
+ "hidden_act": "gelu",
22
+ "hidden_dropout_prob": 0.1,
23
+ "hidden_size": 768,
24
+ "ignore_attention_mask": false,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 3072,
27
+ "layer_norm_eps": 1e-05,
28
+ "max_position_embeddings": 4098,
29
+ "model_max_length": 4096,
30
+ "model_type": "longformer",
31
+ "num_attention_heads": 12,
32
+ "num_hidden_layers": 6,
33
+ "pad_token_id": 1,
34
+ "position_embedding_type": "absolute",
35
+ "sep_token_id": 2,
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.20.0",
38
+ "type_vocab_size": 1,
39
+ "use_cache": true,
40
+ "vocab_size": 50265
41
+ }
last-checkpoint/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:768403e40ef24719f9874bffdb116dac53668a99c00e4e33eefffb1d608e2344
3
+ size 166971723
last-checkpoint/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd622ee944edad1403b47d7d3094143fdec8b0c7996031b5cf2e20e83239116e
3
+ size 382294563
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dba08963cd8662bc2571f405f460c8c72ebf7c284eca70fb8787e17881f55d3b
3
+ size 15523
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e4d707c7718e9baeb8896ef586b8a849cf9f9da184de0c131d6ac2046e66da
3
+ size 623
last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
last-checkpoint/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "mask_token": "<mask>",
8
+ "max_length": 4096,
9
+ "model_max_length": 4096,
10
+ "name_or_path": "data/models/longformer-replicated-random-pos-encodings-4096",
11
+ "pad_token": "<pad>",
12
+ "sep_token": "</s>",
13
+ "special_tokens_map_file": null,
14
+ "tokenizer_class": "RobertaTokenizer",
15
+ "trim_offsets": true,
16
+ "unk_token": "<unk>"
17
+ }
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "global_step": 6400,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 1.5625e-06,
13
+ "loss": 6.9056,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.03,
18
+ "learning_rate": 3.125e-06,
19
+ "loss": 6.0648,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.05,
24
+ "learning_rate": 4.6875000000000004e-06,
25
+ "loss": 5.7713,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.06,
30
+ "learning_rate": 6.25e-06,
31
+ "loss": 5.6855,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.08,
36
+ "learning_rate": 7.8125e-06,
37
+ "loss": 5.6563,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.09,
42
+ "learning_rate": 9.375000000000001e-06,
43
+ "loss": 5.6212,
44
+ "step": 600
45
+ },
46
+ {
47
+ "epoch": 0.11,
48
+ "learning_rate": 1.09375e-05,
49
+ "loss": 5.6477,
50
+ "step": 700
51
+ },
52
+ {
53
+ "epoch": 0.12,
54
+ "learning_rate": 1.25e-05,
55
+ "loss": 5.5888,
56
+ "step": 800
57
+ },
58
+ {
59
+ "epoch": 0.14,
60
+ "learning_rate": 1.4062500000000001e-05,
61
+ "loss": 5.5783,
62
+ "step": 900
63
+ },
64
+ {
65
+ "epoch": 0.16,
66
+ "learning_rate": 1.5625e-05,
67
+ "loss": 5.5722,
68
+ "step": 1000
69
+ },
70
+ {
71
+ "epoch": 0.17,
72
+ "learning_rate": 1.71875e-05,
73
+ "loss": 5.5792,
74
+ "step": 1100
75
+ },
76
+ {
77
+ "epoch": 0.19,
78
+ "learning_rate": 1.8750000000000002e-05,
79
+ "loss": 5.5576,
80
+ "step": 1200
81
+ },
82
+ {
83
+ "epoch": 0.2,
84
+ "learning_rate": 2.0312500000000002e-05,
85
+ "loss": 5.5382,
86
+ "step": 1300
87
+ },
88
+ {
89
+ "epoch": 0.22,
90
+ "learning_rate": 2.1875e-05,
91
+ "loss": 5.5523,
92
+ "step": 1400
93
+ },
94
+ {
95
+ "epoch": 0.23,
96
+ "learning_rate": 2.34375e-05,
97
+ "loss": 5.5438,
98
+ "step": 1500
99
+ },
100
+ {
101
+ "epoch": 0.25,
102
+ "learning_rate": 2.5e-05,
103
+ "loss": 5.5019,
104
+ "step": 1600
105
+ },
106
+ {
107
+ "epoch": 0.27,
108
+ "learning_rate": 2.6562500000000002e-05,
109
+ "loss": 5.5062,
110
+ "step": 1700
111
+ },
112
+ {
113
+ "epoch": 0.28,
114
+ "learning_rate": 2.8125000000000003e-05,
115
+ "loss": 5.5222,
116
+ "step": 1800
117
+ },
118
+ {
119
+ "epoch": 0.3,
120
+ "learning_rate": 2.96875e-05,
121
+ "loss": 5.5033,
122
+ "step": 1900
123
+ },
124
+ {
125
+ "epoch": 0.31,
126
+ "learning_rate": 3.125e-05,
127
+ "loss": 5.5172,
128
+ "step": 2000
129
+ },
130
+ {
131
+ "epoch": 0.33,
132
+ "learning_rate": 3.2812500000000005e-05,
133
+ "loss": 5.5073,
134
+ "step": 2100
135
+ },
136
+ {
137
+ "epoch": 0.34,
138
+ "learning_rate": 3.4375e-05,
139
+ "loss": 5.5096,
140
+ "step": 2200
141
+ },
142
+ {
143
+ "epoch": 0.36,
144
+ "learning_rate": 3.59375e-05,
145
+ "loss": 5.5062,
146
+ "step": 2300
147
+ },
148
+ {
149
+ "epoch": 0.38,
150
+ "learning_rate": 3.7500000000000003e-05,
151
+ "loss": 5.505,
152
+ "step": 2400
153
+ },
154
+ {
155
+ "epoch": 0.39,
156
+ "learning_rate": 3.90625e-05,
157
+ "loss": 5.4843,
158
+ "step": 2500
159
+ },
160
+ {
161
+ "epoch": 0.41,
162
+ "learning_rate": 4.0625000000000005e-05,
163
+ "loss": 5.4931,
164
+ "step": 2600
165
+ },
166
+ {
167
+ "epoch": 0.42,
168
+ "learning_rate": 4.21875e-05,
169
+ "loss": 5.4763,
170
+ "step": 2700
171
+ },
172
+ {
173
+ "epoch": 0.44,
174
+ "learning_rate": 4.375e-05,
175
+ "loss": 5.4743,
176
+ "step": 2800
177
+ },
178
+ {
179
+ "epoch": 0.45,
180
+ "learning_rate": 4.5312500000000004e-05,
181
+ "loss": 5.4694,
182
+ "step": 2900
183
+ },
184
+ {
185
+ "epoch": 0.47,
186
+ "learning_rate": 4.6875e-05,
187
+ "loss": 5.4848,
188
+ "step": 3000
189
+ },
190
+ {
191
+ "epoch": 0.48,
192
+ "learning_rate": 4.8437500000000005e-05,
193
+ "loss": 5.4686,
194
+ "step": 3100
195
+ },
196
+ {
197
+ "epoch": 0.5,
198
+ "learning_rate": 5e-05,
199
+ "loss": 5.4841,
200
+ "step": 3200
201
+ },
202
+ {
203
+ "epoch": 0.52,
204
+ "learning_rate": 5.15625e-05,
205
+ "loss": 5.4468,
206
+ "step": 3300
207
+ },
208
+ {
209
+ "epoch": 0.53,
210
+ "learning_rate": 5.3125000000000004e-05,
211
+ "loss": 5.4505,
212
+ "step": 3400
213
+ },
214
+ {
215
+ "epoch": 0.55,
216
+ "learning_rate": 5.46875e-05,
217
+ "loss": 5.4143,
218
+ "step": 3500
219
+ },
220
+ {
221
+ "epoch": 0.56,
222
+ "learning_rate": 5.6250000000000005e-05,
223
+ "loss": 5.3818,
224
+ "step": 3600
225
+ },
226
+ {
227
+ "epoch": 0.58,
228
+ "learning_rate": 5.78125e-05,
229
+ "loss": 5.3292,
230
+ "step": 3700
231
+ },
232
+ {
233
+ "epoch": 0.59,
234
+ "learning_rate": 5.9375e-05,
235
+ "loss": 5.2801,
236
+ "step": 3800
237
+ },
238
+ {
239
+ "epoch": 0.61,
240
+ "learning_rate": 6.0937500000000004e-05,
241
+ "loss": 5.2343,
242
+ "step": 3900
243
+ },
244
+ {
245
+ "epoch": 0.62,
246
+ "learning_rate": 6.25e-05,
247
+ "loss": 5.1744,
248
+ "step": 4000
249
+ },
250
+ {
251
+ "epoch": 0.64,
252
+ "learning_rate": 6.40625e-05,
253
+ "loss": 5.1309,
254
+ "step": 4100
255
+ },
256
+ {
257
+ "epoch": 0.66,
258
+ "learning_rate": 6.562500000000001e-05,
259
+ "loss": 5.0384,
260
+ "step": 4200
261
+ },
262
+ {
263
+ "epoch": 0.67,
264
+ "learning_rate": 6.71875e-05,
265
+ "loss": 4.8088,
266
+ "step": 4300
267
+ },
268
+ {
269
+ "epoch": 0.69,
270
+ "learning_rate": 6.875e-05,
271
+ "loss": 4.4311,
272
+ "step": 4400
273
+ },
274
+ {
275
+ "epoch": 0.7,
276
+ "learning_rate": 7.031250000000001e-05,
277
+ "loss": 3.9741,
278
+ "step": 4500
279
+ },
280
+ {
281
+ "epoch": 0.72,
282
+ "learning_rate": 7.1875e-05,
283
+ "loss": 3.5943,
284
+ "step": 4600
285
+ },
286
+ {
287
+ "epoch": 0.73,
288
+ "learning_rate": 7.34375e-05,
289
+ "loss": 3.3068,
290
+ "step": 4700
291
+ },
292
+ {
293
+ "epoch": 0.75,
294
+ "learning_rate": 7.500000000000001e-05,
295
+ "loss": 3.0763,
296
+ "step": 4800
297
+ },
298
+ {
299
+ "epoch": 0.77,
300
+ "learning_rate": 7.65625e-05,
301
+ "loss": 2.9353,
302
+ "step": 4900
303
+ },
304
+ {
305
+ "epoch": 0.78,
306
+ "learning_rate": 7.8125e-05,
307
+ "loss": 2.7872,
308
+ "step": 5000
309
+ },
310
+ {
311
+ "epoch": 0.8,
312
+ "learning_rate": 7.96875e-05,
313
+ "loss": 2.7278,
314
+ "step": 5100
315
+ },
316
+ {
317
+ "epoch": 0.81,
318
+ "learning_rate": 8.125000000000001e-05,
319
+ "loss": 2.6579,
320
+ "step": 5200
321
+ },
322
+ {
323
+ "epoch": 0.83,
324
+ "learning_rate": 8.28125e-05,
325
+ "loss": 2.6168,
326
+ "step": 5300
327
+ },
328
+ {
329
+ "epoch": 0.84,
330
+ "learning_rate": 8.4375e-05,
331
+ "loss": 2.5703,
332
+ "step": 5400
333
+ },
334
+ {
335
+ "epoch": 0.86,
336
+ "learning_rate": 8.593750000000001e-05,
337
+ "loss": 2.5617,
338
+ "step": 5500
339
+ },
340
+ {
341
+ "epoch": 0.88,
342
+ "learning_rate": 8.75e-05,
343
+ "loss": 2.5283,
344
+ "step": 5600
345
+ },
346
+ {
347
+ "epoch": 0.89,
348
+ "learning_rate": 8.90625e-05,
349
+ "loss": 2.5204,
350
+ "step": 5700
351
+ },
352
+ {
353
+ "epoch": 0.91,
354
+ "learning_rate": 9.062500000000001e-05,
355
+ "loss": 2.4751,
356
+ "step": 5800
357
+ },
358
+ {
359
+ "epoch": 0.92,
360
+ "learning_rate": 9.21875e-05,
361
+ "loss": 2.4574,
362
+ "step": 5900
363
+ },
364
+ {
365
+ "epoch": 0.94,
366
+ "learning_rate": 9.375e-05,
367
+ "loss": 2.45,
368
+ "step": 6000
369
+ },
370
+ {
371
+ "epoch": 0.95,
372
+ "learning_rate": 9.53125e-05,
373
+ "loss": 2.4293,
374
+ "step": 6100
375
+ },
376
+ {
377
+ "epoch": 0.97,
378
+ "learning_rate": 9.687500000000001e-05,
379
+ "loss": 2.3883,
380
+ "step": 6200
381
+ },
382
+ {
383
+ "epoch": 0.98,
384
+ "learning_rate": 9.84375e-05,
385
+ "loss": 2.4105,
386
+ "step": 6300
387
+ },
388
+ {
389
+ "epoch": 1.0,
390
+ "learning_rate": 0.0001,
391
+ "loss": 2.4034,
392
+ "step": 6400
393
+ },
394
+ {
395
+ "epoch": 1.0,
396
+ "eval_accuracy": 0.6022303017856984,
397
+ "eval_loss": 2.167095899581909,
398
+ "eval_runtime": 42429.4228,
399
+ "eval_samples_per_second": 7.727,
400
+ "eval_steps_per_second": 0.966,
401
+ "step": 6400
402
+ }
403
+ ],
404
+ "max_steps": 6400,
405
+ "num_train_epochs": 9223372036854775807,
406
+ "total_flos": 2.707934115004416e+17,
407
+ "trial_name": null,
408
+ "trial_params": null
409
+ }
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b69ce2ed76b51bc748236916fb1576c16ad30075cccff9f164ec341c1d31d7b5
3
+ size 3439
last-checkpoint/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd622ee944edad1403b47d7d3094143fdec8b0c7996031b5cf2e20e83239116e
3
+ size 382294563
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "mask_token": "<mask>",
8
+ "max_length": 4096,
9
+ "model_max_length": 4096,
10
+ "name_or_path": "data/models/longformer-replicated-random-pos-encodings-4096",
11
+ "pad_token": "<pad>",
12
+ "sep_token": "</s>",
13
+ "special_tokens_map_file": null,
14
+ "tokenizer_class": "RobertaTokenizer",
15
+ "trim_offsets": true,
16
+ "unk_token": "<unk>"
17
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b69ce2ed76b51bc748236916fb1576c16ad30075cccff9f164ec341c1d31d7b5
3
+ size 3439
vocab.json ADDED
The diff for this file is too large to render. See raw diff