Jellyfish042 commited on
Commit
21a1d5b
1 Parent(s): dd465f4

Delete checkpoint-12500

Browse files
checkpoint-12500/config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "_name_or_path": "./models/checkpoint-22500",
3
- "architectures": [
4
- "MT5ForConditionalGeneration"
5
- ],
6
- "d_ff": 1024,
7
- "d_kv": 64,
8
- "d_model": 512,
9
- "decoder_start_token_id": 0,
10
- "dense_act_fn": "gelu_new",
11
- "dropout_rate": 0.1,
12
- "eos_token_id": 1,
13
- "feed_forward_proj": "gated-gelu",
14
- "initializer_factor": 1.0,
15
- "is_encoder_decoder": true,
16
- "is_gated_act": true,
17
- "layer_norm_epsilon": 1e-06,
18
- "model_type": "mt5",
19
- "num_decoder_layers": 8,
20
- "num_heads": 6,
21
- "num_layers": 8,
22
- "pad_token_id": 0,
23
- "relative_attention_max_distance": 128,
24
- "relative_attention_num_buckets": 32,
25
- "tie_word_embeddings": false,
26
- "tokenizer_class": "T5Tokenizer",
27
- "torch_dtype": "float32",
28
- "transformers_version": "4.28.0",
29
- "use_cache": true,
30
- "vocab_size": 250112
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-12500/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "decoder_start_token_id": 0,
4
- "eos_token_id": 1,
5
- "pad_token_id": 0,
6
- "transformers_version": "4.28.0"
7
- }
 
 
 
 
 
 
 
 
checkpoint-12500/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:251bb699cbd7c1797bebb5bcedcd74292aabc09a4cb21ce122ac3a51e53ec6d3
3
- size 2401525449
 
 
 
 
checkpoint-12500/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:55ff5c5ded29a1d564c8afbb527177efed95588c6fa9d94ec7730f4147f3aa0d
3
- size 1200770757
 
 
 
 
checkpoint-12500/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c3cb84444af4f999d9c415841ca7c0fe71f057954a4a51d018c9df23037e5cc
3
- size 14503
 
 
 
 
checkpoint-12500/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f5c59ab51ce6acd9a1c5ded66ecbd7d577f8b4cfcb24b74f403d329eb3d28e4
3
- size 623
 
 
 
 
checkpoint-12500/trainer_state.json DELETED
@@ -1,428 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.7621196070511306,
5
- "global_step": 12500,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 1.9951222486433756e-05,
13
- "loss": 0.0754,
14
- "step": 200
15
- },
16
- {
17
- "epoch": 0.02,
18
- "learning_rate": 1.990244497286751e-05,
19
- "loss": 0.0747,
20
- "step": 400
21
- },
22
- {
23
- "epoch": 0.04,
24
- "learning_rate": 1.9853667459301263e-05,
25
- "loss": 0.0737,
26
- "step": 600
27
- },
28
- {
29
- "epoch": 0.05,
30
- "learning_rate": 1.9804889945735017e-05,
31
- "loss": 0.073,
32
- "step": 800
33
- },
34
- {
35
- "epoch": 0.06,
36
- "learning_rate": 1.975611243216877e-05,
37
- "loss": 0.0723,
38
- "step": 1000
39
- },
40
- {
41
- "epoch": 0.07,
42
- "learning_rate": 1.9707334918602525e-05,
43
- "loss": 0.072,
44
- "step": 1200
45
- },
46
- {
47
- "epoch": 0.09,
48
- "learning_rate": 1.965855740503628e-05,
49
- "loss": 0.0709,
50
- "step": 1400
51
- },
52
- {
53
- "epoch": 0.1,
54
- "learning_rate": 1.9609779891470033e-05,
55
- "loss": 0.0703,
56
- "step": 1600
57
- },
58
- {
59
- "epoch": 0.11,
60
- "learning_rate": 1.9561002377903787e-05,
61
- "loss": 0.0708,
62
- "step": 1800
63
- },
64
- {
65
- "epoch": 0.12,
66
- "learning_rate": 1.951222486433754e-05,
67
- "loss": 0.0711,
68
- "step": 2000
69
- },
70
- {
71
- "epoch": 0.13,
72
- "learning_rate": 1.9463447350771295e-05,
73
- "loss": 0.0699,
74
- "step": 2200
75
- },
76
- {
77
- "epoch": 0.15,
78
- "learning_rate": 1.941466983720505e-05,
79
- "loss": 0.0701,
80
- "step": 2400
81
- },
82
- {
83
- "epoch": 0.15,
84
- "eval_loss": 0.055753033608198166,
85
- "eval_runtime": 1008.1775,
86
- "eval_samples_per_second": 82.199,
87
- "eval_steps_per_second": 13.7,
88
- "step": 2500
89
- },
90
- {
91
- "epoch": 0.16,
92
- "learning_rate": 1.9365892323638806e-05,
93
- "loss": 0.071,
94
- "step": 2600
95
- },
96
- {
97
- "epoch": 0.17,
98
- "learning_rate": 1.931711481007256e-05,
99
- "loss": 0.0692,
100
- "step": 2800
101
- },
102
- {
103
- "epoch": 0.18,
104
- "learning_rate": 1.9268337296506314e-05,
105
- "loss": 0.0687,
106
- "step": 3000
107
- },
108
- {
109
- "epoch": 0.2,
110
- "learning_rate": 1.9219559782940067e-05,
111
- "loss": 0.0689,
112
- "step": 3200
113
- },
114
- {
115
- "epoch": 0.21,
116
- "learning_rate": 1.917078226937382e-05,
117
- "loss": 0.0681,
118
- "step": 3400
119
- },
120
- {
121
- "epoch": 0.22,
122
- "learning_rate": 1.9122004755807575e-05,
123
- "loss": 0.0678,
124
- "step": 3600
125
- },
126
- {
127
- "epoch": 0.23,
128
- "learning_rate": 1.907322724224133e-05,
129
- "loss": 0.0666,
130
- "step": 3800
131
- },
132
- {
133
- "epoch": 0.24,
134
- "learning_rate": 1.9024449728675083e-05,
135
- "loss": 0.0676,
136
- "step": 4000
137
- },
138
- {
139
- "epoch": 0.26,
140
- "learning_rate": 1.8975672215108837e-05,
141
- "loss": 0.0662,
142
- "step": 4200
143
- },
144
- {
145
- "epoch": 0.27,
146
- "learning_rate": 1.892689470154259e-05,
147
- "loss": 0.0676,
148
- "step": 4400
149
- },
150
- {
151
- "epoch": 0.28,
152
- "learning_rate": 1.8878117187976345e-05,
153
- "loss": 0.0663,
154
- "step": 4600
155
- },
156
- {
157
- "epoch": 0.29,
158
- "learning_rate": 1.88293396744101e-05,
159
- "loss": 0.0661,
160
- "step": 4800
161
- },
162
- {
163
- "epoch": 0.3,
164
- "learning_rate": 1.8780562160843853e-05,
165
- "loss": 0.0661,
166
- "step": 5000
167
- },
168
- {
169
- "epoch": 0.3,
170
- "eval_loss": 0.05344025790691376,
171
- "eval_runtime": 1009.8843,
172
- "eval_samples_per_second": 82.06,
173
- "eval_steps_per_second": 13.677,
174
- "step": 5000
175
- },
176
- {
177
- "epoch": 0.32,
178
- "learning_rate": 1.8731784647277606e-05,
179
- "loss": 0.0674,
180
- "step": 5200
181
- },
182
- {
183
- "epoch": 0.33,
184
- "learning_rate": 1.868300713371136e-05,
185
- "loss": 0.066,
186
- "step": 5400
187
- },
188
- {
189
- "epoch": 0.34,
190
- "learning_rate": 1.8634229620145114e-05,
191
- "loss": 0.0642,
192
- "step": 5600
193
- },
194
- {
195
- "epoch": 0.35,
196
- "learning_rate": 1.8585452106578868e-05,
197
- "loss": 0.0655,
198
- "step": 5800
199
- },
200
- {
201
- "epoch": 0.37,
202
- "learning_rate": 1.8536674593012622e-05,
203
- "loss": 0.0654,
204
- "step": 6000
205
- },
206
- {
207
- "epoch": 0.38,
208
- "learning_rate": 1.8487897079446376e-05,
209
- "loss": 0.0641,
210
- "step": 6200
211
- },
212
- {
213
- "epoch": 0.39,
214
- "learning_rate": 1.843911956588013e-05,
215
- "loss": 0.0636,
216
- "step": 6400
217
- },
218
- {
219
- "epoch": 0.4,
220
- "learning_rate": 1.8390342052313884e-05,
221
- "loss": 0.0658,
222
- "step": 6600
223
- },
224
- {
225
- "epoch": 0.41,
226
- "learning_rate": 1.8341564538747638e-05,
227
- "loss": 0.0639,
228
- "step": 6800
229
- },
230
- {
231
- "epoch": 0.43,
232
- "learning_rate": 1.8292787025181395e-05,
233
- "loss": 0.0642,
234
- "step": 7000
235
- },
236
- {
237
- "epoch": 0.44,
238
- "learning_rate": 1.8244009511615145e-05,
239
- "loss": 0.0648,
240
- "step": 7200
241
- },
242
- {
243
- "epoch": 0.45,
244
- "learning_rate": 1.8195231998048903e-05,
245
- "loss": 0.0645,
246
- "step": 7400
247
- },
248
- {
249
- "epoch": 0.46,
250
- "eval_loss": 0.05153830349445343,
251
- "eval_runtime": 1007.6883,
252
- "eval_samples_per_second": 82.239,
253
- "eval_steps_per_second": 13.707,
254
- "step": 7500
255
- },
256
- {
257
- "epoch": 0.46,
258
- "learning_rate": 1.8146454484482653e-05,
259
- "loss": 0.0639,
260
- "step": 7600
261
- },
262
- {
263
- "epoch": 0.48,
264
- "learning_rate": 1.809767697091641e-05,
265
- "loss": 0.0628,
266
- "step": 7800
267
- },
268
- {
269
- "epoch": 0.49,
270
- "learning_rate": 1.8048899457350164e-05,
271
- "loss": 0.0647,
272
- "step": 8000
273
- },
274
- {
275
- "epoch": 0.5,
276
- "learning_rate": 1.800012194378392e-05,
277
- "loss": 0.0629,
278
- "step": 8200
279
- },
280
- {
281
- "epoch": 0.51,
282
- "learning_rate": 1.7951344430217672e-05,
283
- "loss": 0.0638,
284
- "step": 8400
285
- },
286
- {
287
- "epoch": 0.52,
288
- "learning_rate": 1.7902566916651426e-05,
289
- "loss": 0.0635,
290
- "step": 8600
291
- },
292
- {
293
- "epoch": 0.54,
294
- "learning_rate": 1.785378940308518e-05,
295
- "loss": 0.0628,
296
- "step": 8800
297
- },
298
- {
299
- "epoch": 0.55,
300
- "learning_rate": 1.7805011889518934e-05,
301
- "loss": 0.063,
302
- "step": 9000
303
- },
304
- {
305
- "epoch": 0.56,
306
- "learning_rate": 1.7756234375952688e-05,
307
- "loss": 0.0636,
308
- "step": 9200
309
- },
310
- {
311
- "epoch": 0.57,
312
- "learning_rate": 1.7707456862386442e-05,
313
- "loss": 0.0633,
314
- "step": 9400
315
- },
316
- {
317
- "epoch": 0.59,
318
- "learning_rate": 1.7658679348820196e-05,
319
- "loss": 0.0624,
320
- "step": 9600
321
- },
322
- {
323
- "epoch": 0.6,
324
- "learning_rate": 1.760990183525395e-05,
325
- "loss": 0.0615,
326
- "step": 9800
327
- },
328
- {
329
- "epoch": 0.61,
330
- "learning_rate": 1.7561124321687703e-05,
331
- "loss": 0.062,
332
- "step": 10000
333
- },
334
- {
335
- "epoch": 0.61,
336
- "eval_loss": 0.05109347030520439,
337
- "eval_runtime": 1005.611,
338
- "eval_samples_per_second": 82.409,
339
- "eval_steps_per_second": 13.735,
340
- "step": 10000
341
- },
342
- {
343
- "epoch": 0.62,
344
- "learning_rate": 1.7512346808121457e-05,
345
- "loss": 0.0616,
346
- "step": 10200
347
- },
348
- {
349
- "epoch": 0.63,
350
- "learning_rate": 1.746356929455521e-05,
351
- "loss": 0.0613,
352
- "step": 10400
353
- },
354
- {
355
- "epoch": 0.65,
356
- "learning_rate": 1.7414791780988965e-05,
357
- "loss": 0.0616,
358
- "step": 10600
359
- },
360
- {
361
- "epoch": 0.66,
362
- "learning_rate": 1.736601426742272e-05,
363
- "loss": 0.0623,
364
- "step": 10800
365
- },
366
- {
367
- "epoch": 0.67,
368
- "learning_rate": 1.7317236753856473e-05,
369
- "loss": 0.062,
370
- "step": 11000
371
- },
372
- {
373
- "epoch": 0.68,
374
- "learning_rate": 1.7268459240290227e-05,
375
- "loss": 0.0632,
376
- "step": 11200
377
- },
378
- {
379
- "epoch": 0.7,
380
- "learning_rate": 1.7219681726723984e-05,
381
- "loss": 0.0623,
382
- "step": 11400
383
- },
384
- {
385
- "epoch": 0.71,
386
- "learning_rate": 1.7170904213157735e-05,
387
- "loss": 0.0608,
388
- "step": 11600
389
- },
390
- {
391
- "epoch": 0.72,
392
- "learning_rate": 1.7122126699591492e-05,
393
- "loss": 0.061,
394
- "step": 11800
395
- },
396
- {
397
- "epoch": 0.73,
398
- "learning_rate": 1.7073349186025242e-05,
399
- "loss": 0.061,
400
- "step": 12000
401
- },
402
- {
403
- "epoch": 0.74,
404
- "learning_rate": 1.7024571672459e-05,
405
- "loss": 0.0605,
406
- "step": 12200
407
- },
408
- {
409
- "epoch": 0.76,
410
- "learning_rate": 1.697579415889275e-05,
411
- "loss": 0.0607,
412
- "step": 12400
413
- },
414
- {
415
- "epoch": 0.76,
416
- "eval_loss": 0.04968786612153053,
417
- "eval_runtime": 1006.7134,
418
- "eval_samples_per_second": 82.318,
419
- "eval_steps_per_second": 13.72,
420
- "step": 12500
421
- }
422
- ],
423
- "max_steps": 82005,
424
- "num_train_epochs": 5,
425
- "total_flos": 6.34499629056e+17,
426
- "trial_name": null,
427
- "trial_params": null
428
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-12500/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:69cea6267551e6c3db2312cb16b8e111ea7cc82c9a729b741a0cc0164750a77c
3
- size 3695