sarakolding committed on
Commit
9ab896f
1 Parent(s): 9e59cb5

initial commit

Browse files
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sarakolding/daT5-base",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2048,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.01,
11
+ "early_stopping": true,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "layer_norm_epsilon": 1e-06,
17
+ "length_penalty": 5,
18
+ "max_length": 128,
19
+ "min_length": 15,
20
+ "model_type": "mt5",
21
+ "no_repeat_ngram_size": 3,
22
+ "num_beams": 4,
23
+ "num_decoder_layers": 12,
24
+ "num_heads": 12,
25
+ "num_layers": 12,
26
+ "output_past": true,
27
+ "pad_token_id": 0,
28
+ "relative_attention_max_distance": 128,
29
+ "relative_attention_num_buckets": 32,
30
+ "tie_word_embeddings": false,
31
+ "tokenizer_class": "T5Tokenizer",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.20.0.dev0",
34
+ "use_cache": true,
35
+ "vocab_size": 30000
36
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc91e2fa415ded95ce0b768fcf744fde086c771a2610643122ac4a93e015b460
3
+ size 1954640153
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a46dc9d540973454dc66d8964580d2bacf8336960c1d019241dbf2f5ac7dfa1d
3
+ size 977332173
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0362545844db8f9d3ba81e0d323c72d39cf5ff6a92b9f21b560faeaf428e6e6e
3
+ size 14503
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb514f3549501e7fbfb775146d40e9d3027ea20fa05036f069600337153c15ff
3
+ size 559
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5343e0993f724df030d39b5a8bed28c5148bbf4ad76c9e3fb3569506e763e049
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dabe934d94d2182b6b15a66f895d7de8197a2c5e478c5a9e16746c51005e8b22
3
+ size 766659
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0, "additional_special_tokens": null, "sp_model_kwargs": {}, "special_tokens_map_file": "/home/ucloud/.cache/huggingface/transformers/90e4e02f17b0b14f97d96c94bd8e5652250a216e5950bb43c79afe24011e808c.294ebaa4cd17bb284635004c92d2c4d522ec488c828dcce0c2471b6f28e3fe82", "name_or_path": "sarakolding/daT5-base", "tokenizer_class": "T5Tokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 2.132361888885498,
3
+ "best_model_checkpoint": "./26-125356_megasuperkanin/checkpoint-100000",
4
+ "epoch": 0.9769822970807769,
5
+ "global_step": 100000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 5e-05,
13
+ "loss": 2.6761,
14
+ "step": 2500
15
+ },
16
+ {
17
+ "epoch": 0.05,
18
+ "learning_rate": 5e-05,
19
+ "loss": 2.551,
20
+ "step": 5000
21
+ },
22
+ {
23
+ "epoch": 0.05,
24
+ "eval_gen_len": 28.4674,
25
+ "eval_loss": 2.423037052154541,
26
+ "eval_rouge1": 0.214,
27
+ "eval_rouge2": 0.0668,
28
+ "eval_rougeL": 0.1717,
29
+ "eval_rougeLsum": 0.1777,
30
+ "eval_runtime": 1015.6418,
31
+ "eval_samples_per_second": 2.265,
32
+ "eval_steps_per_second": 0.284,
33
+ "step": 5000
34
+ },
35
+ {
36
+ "epoch": 0.07,
37
+ "learning_rate": 5e-05,
38
+ "loss": 2.5186,
39
+ "step": 7500
40
+ },
41
+ {
42
+ "epoch": 0.1,
43
+ "learning_rate": 5e-05,
44
+ "loss": 2.4717,
45
+ "step": 10000
46
+ },
47
+ {
48
+ "epoch": 0.1,
49
+ "eval_gen_len": 25.6604,
50
+ "eval_loss": 2.3709843158721924,
51
+ "eval_rouge1": 0.2071,
52
+ "eval_rouge2": 0.0634,
53
+ "eval_rougeL": 0.1686,
54
+ "eval_rougeLsum": 0.1745,
55
+ "eval_runtime": 951.1096,
56
+ "eval_samples_per_second": 2.418,
57
+ "eval_steps_per_second": 0.303,
58
+ "step": 10000
59
+ },
60
+ {
61
+ "epoch": 0.12,
62
+ "learning_rate": 5e-05,
63
+ "loss": 2.4593,
64
+ "step": 12500
65
+ },
66
+ {
67
+ "epoch": 0.15,
68
+ "learning_rate": 5e-05,
69
+ "loss": 2.4281,
70
+ "step": 15000
71
+ },
72
+ {
73
+ "epoch": 0.15,
74
+ "eval_gen_len": 28.8296,
75
+ "eval_loss": 2.3228819370269775,
76
+ "eval_rouge1": 0.2137,
77
+ "eval_rouge2": 0.0662,
78
+ "eval_rougeL": 0.1711,
79
+ "eval_rougeLsum": 0.1768,
80
+ "eval_runtime": 1022.9494,
81
+ "eval_samples_per_second": 2.248,
82
+ "eval_steps_per_second": 0.282,
83
+ "step": 15000
84
+ },
85
+ {
86
+ "epoch": 0.17,
87
+ "learning_rate": 5e-05,
88
+ "loss": 2.4049,
89
+ "step": 17500
90
+ },
91
+ {
92
+ "epoch": 0.2,
93
+ "learning_rate": 5e-05,
94
+ "loss": 2.3735,
95
+ "step": 20000
96
+ },
97
+ {
98
+ "epoch": 0.2,
99
+ "eval_gen_len": 29.9183,
100
+ "eval_loss": 2.2881429195404053,
101
+ "eval_rouge1": 0.2164,
102
+ "eval_rouge2": 0.0668,
103
+ "eval_rougeL": 0.1735,
104
+ "eval_rougeLsum": 0.1808,
105
+ "eval_runtime": 1036.2984,
106
+ "eval_samples_per_second": 2.219,
107
+ "eval_steps_per_second": 0.278,
108
+ "step": 20000
109
+ },
110
+ {
111
+ "epoch": 0.22,
112
+ "learning_rate": 5e-05,
113
+ "loss": 2.3732,
114
+ "step": 22500
115
+ },
116
+ {
117
+ "epoch": 0.24,
118
+ "learning_rate": 5e-05,
119
+ "loss": 2.377,
120
+ "step": 25000
121
+ },
122
+ {
123
+ "epoch": 0.24,
124
+ "eval_gen_len": 29.5183,
125
+ "eval_loss": 2.2759358882904053,
126
+ "eval_rouge1": 0.2209,
127
+ "eval_rouge2": 0.0694,
128
+ "eval_rougeL": 0.1782,
129
+ "eval_rougeLsum": 0.1851,
130
+ "eval_runtime": 1036.1071,
131
+ "eval_samples_per_second": 2.22,
132
+ "eval_steps_per_second": 0.278,
133
+ "step": 25000
134
+ },
135
+ {
136
+ "epoch": 0.27,
137
+ "learning_rate": 5e-05,
138
+ "loss": 2.3513,
139
+ "step": 27500
140
+ },
141
+ {
142
+ "epoch": 0.29,
143
+ "learning_rate": 5e-05,
144
+ "loss": 2.3444,
145
+ "step": 30000
146
+ },
147
+ {
148
+ "epoch": 0.29,
149
+ "eval_gen_len": 29.3183,
150
+ "eval_loss": 2.2552034854888916,
151
+ "eval_rouge1": 0.2194,
152
+ "eval_rouge2": 0.0679,
153
+ "eval_rougeL": 0.1757,
154
+ "eval_rougeLsum": 0.1829,
155
+ "eval_runtime": 1037.4604,
156
+ "eval_samples_per_second": 2.217,
157
+ "eval_steps_per_second": 0.278,
158
+ "step": 30000
159
+ },
160
+ {
161
+ "epoch": 0.32,
162
+ "learning_rate": 5e-05,
163
+ "loss": 2.3504,
164
+ "step": 32500
165
+ },
166
+ {
167
+ "epoch": 0.34,
168
+ "learning_rate": 5e-05,
169
+ "loss": 2.3203,
170
+ "step": 35000
171
+ },
172
+ {
173
+ "epoch": 0.34,
174
+ "eval_gen_len": 32.2061,
175
+ "eval_loss": 2.235518455505371,
176
+ "eval_rouge1": 0.2284,
177
+ "eval_rouge2": 0.0722,
178
+ "eval_rougeL": 0.1819,
179
+ "eval_rougeLsum": 0.1892,
180
+ "eval_runtime": 1121.1561,
181
+ "eval_samples_per_second": 2.051,
182
+ "eval_steps_per_second": 0.257,
183
+ "step": 35000
184
+ },
185
+ {
186
+ "epoch": 0.37,
187
+ "learning_rate": 5e-05,
188
+ "loss": 2.3087,
189
+ "step": 37500
190
+ },
191
+ {
192
+ "epoch": 0.39,
193
+ "learning_rate": 5e-05,
194
+ "loss": 2.3132,
195
+ "step": 40000
196
+ },
197
+ {
198
+ "epoch": 0.39,
199
+ "eval_gen_len": 29.5452,
200
+ "eval_loss": 2.2289836406707764,
201
+ "eval_rouge1": 0.2183,
202
+ "eval_rouge2": 0.0673,
203
+ "eval_rougeL": 0.1759,
204
+ "eval_rougeLsum": 0.1827,
205
+ "eval_runtime": 1055.2895,
206
+ "eval_samples_per_second": 2.179,
207
+ "eval_steps_per_second": 0.273,
208
+ "step": 40000
209
+ },
210
+ {
211
+ "epoch": 0.42,
212
+ "learning_rate": 5e-05,
213
+ "loss": 2.3063,
214
+ "step": 42500
215
+ },
216
+ {
217
+ "epoch": 0.44,
218
+ "learning_rate": 5e-05,
219
+ "loss": 2.3116,
220
+ "step": 45000
221
+ },
222
+ {
223
+ "epoch": 0.44,
224
+ "eval_gen_len": 30.2935,
225
+ "eval_loss": 2.218207359313965,
226
+ "eval_rouge1": 0.2239,
227
+ "eval_rouge2": 0.07,
228
+ "eval_rougeL": 0.1798,
229
+ "eval_rougeLsum": 0.1879,
230
+ "eval_runtime": 1063.5185,
231
+ "eval_samples_per_second": 2.163,
232
+ "eval_steps_per_second": 0.271,
233
+ "step": 45000
234
+ },
235
+ {
236
+ "epoch": 0.46,
237
+ "learning_rate": 5e-05,
238
+ "loss": 2.3014,
239
+ "step": 47500
240
+ },
241
+ {
242
+ "epoch": 0.49,
243
+ "learning_rate": 5e-05,
244
+ "loss": 2.2852,
245
+ "step": 50000
246
+ },
247
+ {
248
+ "epoch": 0.49,
249
+ "eval_gen_len": 28.6443,
250
+ "eval_loss": 2.2090706825256348,
251
+ "eval_rouge1": 0.2251,
252
+ "eval_rouge2": 0.0703,
253
+ "eval_rougeL": 0.1812,
254
+ "eval_rougeLsum": 0.1887,
255
+ "eval_runtime": 1045.7282,
256
+ "eval_samples_per_second": 2.199,
257
+ "eval_steps_per_second": 0.275,
258
+ "step": 50000
259
+ },
260
+ {
261
+ "epoch": 0.51,
262
+ "learning_rate": 5e-05,
263
+ "loss": 2.2963,
264
+ "step": 52500
265
+ },
266
+ {
267
+ "epoch": 0.54,
268
+ "learning_rate": 5e-05,
269
+ "loss": 2.2683,
270
+ "step": 55000
271
+ },
272
+ {
273
+ "epoch": 0.54,
274
+ "eval_gen_len": 29.9661,
275
+ "eval_loss": 2.1879115104675293,
276
+ "eval_rouge1": 0.2257,
277
+ "eval_rouge2": 0.0716,
278
+ "eval_rougeL": 0.1806,
279
+ "eval_rougeLsum": 0.1876,
280
+ "eval_runtime": 1061.3075,
281
+ "eval_samples_per_second": 2.167,
282
+ "eval_steps_per_second": 0.271,
283
+ "step": 55000
284
+ },
285
+ {
286
+ "epoch": 0.56,
287
+ "learning_rate": 5e-05,
288
+ "loss": 2.2735,
289
+ "step": 57500
290
+ },
291
+ {
292
+ "epoch": 0.59,
293
+ "learning_rate": 5e-05,
294
+ "loss": 2.2614,
295
+ "step": 60000
296
+ },
297
+ {
298
+ "epoch": 0.59,
299
+ "eval_gen_len": 30.4435,
300
+ "eval_loss": 2.1871089935302734,
301
+ "eval_rouge1": 0.2316,
302
+ "eval_rouge2": 0.075,
303
+ "eval_rougeL": 0.1863,
304
+ "eval_rougeLsum": 0.1936,
305
+ "eval_runtime": 1083.7377,
306
+ "eval_samples_per_second": 2.122,
307
+ "eval_steps_per_second": 0.266,
308
+ "step": 60000
309
+ },
310
+ {
311
+ "epoch": 0.61,
312
+ "learning_rate": 5e-05,
313
+ "loss": 2.2735,
314
+ "step": 62500
315
+ },
316
+ {
317
+ "epoch": 0.64,
318
+ "learning_rate": 5e-05,
319
+ "loss": 2.252,
320
+ "step": 65000
321
+ },
322
+ {
323
+ "epoch": 0.64,
324
+ "eval_gen_len": 30.6239,
325
+ "eval_loss": 2.175469160079956,
326
+ "eval_rouge1": 0.226,
327
+ "eval_rouge2": 0.0729,
328
+ "eval_rougeL": 0.1834,
329
+ "eval_rougeLsum": 0.1914,
330
+ "eval_runtime": 1080.4009,
331
+ "eval_samples_per_second": 2.129,
332
+ "eval_steps_per_second": 0.267,
333
+ "step": 65000
334
+ },
335
+ {
336
+ "epoch": 0.66,
337
+ "learning_rate": 5e-05,
338
+ "loss": 2.2509,
339
+ "step": 67500
340
+ },
341
+ {
342
+ "epoch": 0.68,
343
+ "learning_rate": 5e-05,
344
+ "loss": 2.262,
345
+ "step": 70000
346
+ },
347
+ {
348
+ "epoch": 0.68,
349
+ "eval_gen_len": 30.9983,
350
+ "eval_loss": 2.16789174079895,
351
+ "eval_rouge1": 0.2256,
352
+ "eval_rouge2": 0.0716,
353
+ "eval_rougeL": 0.1815,
354
+ "eval_rougeLsum": 0.1889,
355
+ "eval_runtime": 1104.0224,
356
+ "eval_samples_per_second": 2.083,
357
+ "eval_steps_per_second": 0.261,
358
+ "step": 70000
359
+ },
360
+ {
361
+ "epoch": 0.71,
362
+ "learning_rate": 5e-05,
363
+ "loss": 2.2398,
364
+ "step": 72500
365
+ },
366
+ {
367
+ "epoch": 0.73,
368
+ "learning_rate": 5e-05,
369
+ "loss": 2.228,
370
+ "step": 75000
371
+ },
372
+ {
373
+ "epoch": 0.73,
374
+ "eval_gen_len": 29.9704,
375
+ "eval_loss": 2.1669178009033203,
376
+ "eval_rouge1": 0.2253,
377
+ "eval_rouge2": 0.0725,
378
+ "eval_rougeL": 0.1822,
379
+ "eval_rougeLsum": 0.1894,
380
+ "eval_runtime": 1052.7669,
381
+ "eval_samples_per_second": 2.185,
382
+ "eval_steps_per_second": 0.274,
383
+ "step": 75000
384
+ },
385
+ {
386
+ "epoch": 0.76,
387
+ "learning_rate": 5e-05,
388
+ "loss": 2.25,
389
+ "step": 77500
390
+ },
391
+ {
392
+ "epoch": 0.78,
393
+ "learning_rate": 5e-05,
394
+ "loss": 2.234,
395
+ "step": 80000
396
+ },
397
+ {
398
+ "epoch": 0.78,
399
+ "eval_gen_len": 29.4826,
400
+ "eval_loss": 2.1604671478271484,
401
+ "eval_rouge1": 0.2283,
402
+ "eval_rouge2": 0.0747,
403
+ "eval_rougeL": 0.1855,
404
+ "eval_rougeLsum": 0.1937,
405
+ "eval_runtime": 1075.8159,
406
+ "eval_samples_per_second": 2.138,
407
+ "eval_steps_per_second": 0.268,
408
+ "step": 80000
409
+ },
410
+ {
411
+ "epoch": 0.81,
412
+ "learning_rate": 5e-05,
413
+ "loss": 2.236,
414
+ "step": 82500
415
+ },
416
+ {
417
+ "epoch": 0.83,
418
+ "learning_rate": 5e-05,
419
+ "loss": 2.2289,
420
+ "step": 85000
421
+ },
422
+ {
423
+ "epoch": 0.83,
424
+ "eval_gen_len": 30.0213,
425
+ "eval_loss": 2.1517326831817627,
426
+ "eval_rouge1": 0.2226,
427
+ "eval_rouge2": 0.0705,
428
+ "eval_rougeL": 0.1801,
429
+ "eval_rougeLsum": 0.1873,
430
+ "eval_runtime": 1072.8178,
431
+ "eval_samples_per_second": 2.144,
432
+ "eval_steps_per_second": 0.268,
433
+ "step": 85000
434
+ },
435
+ {
436
+ "epoch": 0.85,
437
+ "learning_rate": 5e-05,
438
+ "loss": 2.2214,
439
+ "step": 87500
440
+ },
441
+ {
442
+ "epoch": 0.88,
443
+ "learning_rate": 5e-05,
444
+ "loss": 2.2043,
445
+ "step": 90000
446
+ },
447
+ {
448
+ "epoch": 0.88,
449
+ "eval_gen_len": 29.5361,
450
+ "eval_loss": 2.1455490589141846,
451
+ "eval_rouge1": 0.2265,
452
+ "eval_rouge2": 0.075,
453
+ "eval_rougeL": 0.1838,
454
+ "eval_rougeLsum": 0.1908,
455
+ "eval_runtime": 1058.731,
456
+ "eval_samples_per_second": 2.172,
457
+ "eval_steps_per_second": 0.272,
458
+ "step": 90000
459
+ },
460
+ {
461
+ "epoch": 0.9,
462
+ "learning_rate": 5e-05,
463
+ "loss": 2.2419,
464
+ "step": 92500
465
+ },
466
+ {
467
+ "epoch": 0.93,
468
+ "learning_rate": 5e-05,
469
+ "loss": 2.2259,
470
+ "step": 95000
471
+ },
472
+ {
473
+ "epoch": 0.93,
474
+ "eval_gen_len": 29.6874,
475
+ "eval_loss": 2.1389129161834717,
476
+ "eval_rouge1": 0.2287,
477
+ "eval_rouge2": 0.0713,
478
+ "eval_rougeL": 0.1844,
479
+ "eval_rougeLsum": 0.1911,
480
+ "eval_runtime": 1069.2344,
481
+ "eval_samples_per_second": 2.151,
482
+ "eval_steps_per_second": 0.269,
483
+ "step": 95000
484
+ },
485
+ {
486
+ "epoch": 0.95,
487
+ "learning_rate": 5e-05,
488
+ "loss": 2.2202,
489
+ "step": 97500
490
+ },
491
+ {
492
+ "epoch": 0.98,
493
+ "learning_rate": 5e-05,
494
+ "loss": 2.2307,
495
+ "step": 100000
496
+ },
497
+ {
498
+ "epoch": 0.98,
499
+ "eval_gen_len": 30.7513,
500
+ "eval_loss": 2.132361888885498,
501
+ "eval_rouge1": 0.2293,
502
+ "eval_rouge2": 0.0741,
503
+ "eval_rougeL": 0.1845,
504
+ "eval_rougeLsum": 0.1924,
505
+ "eval_runtime": 1089.9927,
506
+ "eval_samples_per_second": 2.11,
507
+ "eval_steps_per_second": 0.264,
508
+ "step": 100000
509
+ }
510
+ ],
511
+ "max_steps": 102356,
512
+ "num_train_epochs": 1,
513
+ "total_flos": 1.8696291573252096e+17,
514
+ "trial_name": null,
515
+ "trial_params": null
516
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38a62961f75bc74d8cd1c3b0178206ed678b491c8842aa32921846d2fbbba66c
3
+ size 3375