pszemraj commited on
Commit
6fe1cc3
1 Parent(s): b02d4c7

add fp32 chk

Browse files
all_results.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_gen_len": 122.641,
4
- "eval_loss": 2.1311049461364746,
5
- "eval_rouge1": 58.1121,
6
- "eval_rouge2": 29.2636,
7
- "eval_rougeL": 44.676,
8
- "eval_rougeLsum": 54.2933,
9
- "eval_runtime": 1900.7101,
10
  "eval_samples": 1000,
11
- "eval_samples_per_second": 0.526,
12
- "eval_steps_per_second": 0.132,
13
- "train_loss": 0.16691997828690902,
14
- "train_runtime": 19759.1358,
15
  "train_samples": 29441,
16
- "train_samples_per_second": 2.98,
17
- "train_steps_per_second": 0.012
18
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_gen_len": 122.85,
4
+ "eval_loss": 2.1855268478393555,
5
+ "eval_rouge1": 58.2278,
6
+ "eval_rouge2": 28.9939,
7
+ "eval_rougeL": 44.6946,
8
+ "eval_rougeLsum": 54.4068,
9
+ "eval_runtime": 3779.6108,
10
  "eval_samples": 1000,
11
+ "eval_samples_per_second": 0.265,
12
+ "eval_steps_per_second": 0.066,
13
+ "train_loss": 0.10453087175669877,
14
+ "train_runtime": 27358.0249,
15
  "train_samples": 29441,
16
+ "train_samples_per_second": 1.076,
17
+ "train_steps_per_second": 0.004
18
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "pszemraj/flan-t5-large-stacked-samsum1024-WIP2",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
@@ -25,7 +25,7 @@
25
  "relative_attention_max_distance": 128,
26
  "relative_attention_num_buckets": 32,
27
  "tie_word_embeddings": false,
28
- "torch_dtype": "bfloat16",
29
  "transformers_version": "4.26.0.dev0",
30
  "use_cache": true,
31
  "vocab_size": 32128
 
1
  {
2
+ "_name_or_path": "pszemraj/flan-t5-large-stacked-samsum-1024-FP32",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
 
25
  "relative_attention_max_distance": 128,
26
  "relative_attention_num_buckets": 32,
27
  "tie_word_embeddings": false,
28
+ "torch_dtype": "float32",
29
  "transformers_version": "4.26.0.dev0",
30
  "use_cache": true,
31
  "vocab_size": 32128
eval_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_gen_len": 122.641,
4
- "eval_loss": 2.1311049461364746,
5
- "eval_rouge1": 58.1121,
6
- "eval_rouge2": 29.2636,
7
- "eval_rougeL": 44.676,
8
- "eval_rougeLsum": 54.2933,
9
- "eval_runtime": 1900.7101,
10
  "eval_samples": 1000,
11
- "eval_samples_per_second": 0.526,
12
- "eval_steps_per_second": 0.132
13
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "eval_gen_len": 122.85,
4
+ "eval_loss": 2.1855268478393555,
5
+ "eval_rouge1": 58.2278,
6
+ "eval_rouge2": 28.9939,
7
+ "eval_rougeL": 44.6946,
8
+ "eval_rougeLsum": 54.4068,
9
+ "eval_runtime": 3779.6108,
10
  "eval_samples": 1000,
11
+ "eval_samples_per_second": 0.265,
12
+ "eval_steps_per_second": 0.066
13
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58cee48c190407ece73528b737a057bb4244c0cd4e25a9762e17ac867c3a06b2
3
- size 1566493509
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48e1444ded77a29b2a7173d934981b6e9107abf86815db08298cd227e8c8b5f9
3
+ size 3132793669
tokenizer_config.json CHANGED
@@ -104,7 +104,7 @@
104
  "eos_token": "</s>",
105
  "extra_ids": 100,
106
  "model_max_length": 512,
107
- "name_or_path": "pszemraj/flan-t5-large-stacked-samsum1024-WIP2",
108
  "pad_token": "<pad>",
109
  "sp_model_kwargs": {},
110
  "special_tokens_map_file": "/home/younes_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-large/snapshots/314bc112b191ec17b625ba81438dc73d6c23659d/special_tokens_map.json",
 
104
  "eos_token": "</s>",
105
  "extra_ids": 100,
106
  "model_max_length": 512,
107
+ "name_or_path": "pszemraj/flan-t5-large-stacked-samsum-1024-FP32",
108
  "pad_token": "<pad>",
109
  "sp_model_kwargs": {},
110
  "special_tokens_map_file": "/home/younes_huggingface_co/.cache/huggingface/hub/models--google--t5-v1_1-large/snapshots/314bc112b191ec17b625ba81438dc73d6c23659d/special_tokens_map.json",
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "train_loss": 0.16691997828690902,
4
- "train_runtime": 19759.1358,
5
  "train_samples": 29441,
6
- "train_samples_per_second": 2.98,
7
- "train_steps_per_second": 0.012
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.10453087175669877,
4
+ "train_runtime": 27358.0249,
5
  "train_samples": 29441,
6
+ "train_samples_per_second": 1.076,
7
+ "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -1,507 +1,318 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.99972833469166,
5
- "global_step": 230,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.03,
12
- "learning_rate": 0.00035999999999999997,
13
- "loss": 0.2157,
14
  "step": 3
15
  },
16
  {
17
  "epoch": 0.05,
18
- "learning_rate": 0.0005999707572027913,
19
- "loss": 0.2186,
20
  "step": 6
21
  },
22
  {
23
  "epoch": 0.08,
24
- "learning_rate": 0.0005995322292545942,
25
- "loss": 0.2325,
26
  "step": 9
27
  },
28
  {
29
  "epoch": 0.1,
30
- "learning_rate": 0.0005985682199945492,
31
- "loss": 0.2308,
32
  "step": 12
33
  },
34
  {
35
  "epoch": 0.13,
36
- "learning_rate": 0.000597080420622471,
37
- "loss": 0.236,
38
  "step": 15
39
  },
40
  {
41
  "epoch": 0.16,
42
- "learning_rate": 0.0005950714412440158,
43
- "loss": 0.2367,
44
  "step": 18
45
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  {
47
  "epoch": 0.18,
48
- "learning_rate": 0.0005925448062916689,
49
- "loss": 0.2416,
50
  "step": 21
51
  },
52
  {
53
  "epoch": 0.21,
54
- "learning_rate": 0.0005895049483416934,
55
- "loss": 0.2413,
56
  "step": 24
57
  },
58
  {
59
  "epoch": 0.23,
60
- "learning_rate": 0.000585957200337884,
61
- "loss": 0.2479,
62
  "step": 27
63
  },
64
  {
65
  "epoch": 0.26,
66
- "learning_rate": 0.0005819077862357724,
67
- "loss": 0.2352,
68
  "step": 30
69
  },
70
  {
71
  "epoch": 0.29,
72
- "learning_rate": 0.0005773638100836939,
73
- "loss": 0.2438,
74
  "step": 33
75
  },
76
  {
77
  "epoch": 0.31,
78
- "learning_rate": 0.0005723332435598725,
79
- "loss": 0.2371,
80
  "step": 36
81
  },
82
  {
83
  "epoch": 0.34,
84
- "learning_rate": 0.0005668249119873892,
85
- "loss": 0.2392,
86
  "step": 39
87
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  {
89
  "epoch": 0.37,
90
- "learning_rate": 0.0005608484788515657,
91
- "loss": 0.2229,
92
  "step": 42
93
  },
94
  {
95
  "epoch": 0.39,
96
- "learning_rate": 0.0005544144288469277,
97
- "loss": 0.2289,
98
  "step": 45
99
  },
100
  {
101
  "epoch": 0.42,
102
- "learning_rate": 0.0005475340494834885,
103
- "loss": 0.2342,
104
  "step": 48
105
  },
106
  {
107
  "epoch": 0.44,
108
- "learning_rate": 0.00054021941128462,
109
- "loss": 0.2318,
110
  "step": 51
111
  },
112
  {
113
  "epoch": 0.47,
114
- "learning_rate": 0.0005324833466112538,
115
- "loss": 0.2189,
116
  "step": 54
117
  },
118
  {
119
  "epoch": 0.5,
120
- "learning_rate": 0.0005243394271495595,
121
- "loss": 0.2193,
122
  "step": 57
123
  },
124
  {
125
  "epoch": 0.52,
126
- "learning_rate": 0.0005158019401015953,
127
- "loss": 0.2233,
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "step": 60
129
  },
130
  {
131
  "epoch": 0.55,
132
- "learning_rate": 0.0005068858631207009,
133
- "loss": 0.2111,
134
  "step": 63
135
  },
136
  {
137
  "epoch": 0.57,
138
- "learning_rate": 0.0004976068380356041,
139
- "loss": 0.2166,
140
  "step": 66
141
  },
142
  {
143
  "epoch": 0.6,
144
- "learning_rate": 0.00048798114340933813,
145
- "loss": 0.2088,
146
  "step": 69
147
  },
148
  {
149
  "epoch": 0.63,
150
- "learning_rate": 0.0004780256659811104,
151
- "loss": 0.2029,
152
  "step": 72
153
  },
154
  {
155
  "epoch": 0.65,
156
- "learning_rate": 0.00046775787104122397,
157
- "loss": 0.2072,
158
  "step": 75
159
  },
160
  {
161
  "epoch": 0.68,
162
- "learning_rate": 0.00045719577179102375,
163
- "loss": 0.2002,
164
  "step": 78
165
  },
166
  {
167
  "epoch": 0.7,
168
- "learning_rate": 0.0004463578977416198,
169
- "loss": 0.202,
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  "step": 81
171
  },
172
  {
173
  "epoch": 0.73,
174
- "learning_rate": 0.0004352632622068292,
175
- "loss": 0.1967,
176
  "step": 84
177
  },
178
  {
179
  "epoch": 0.76,
180
- "learning_rate": 0.0004239313289473625,
181
- "loss": 0.1939,
182
  "step": 87
183
  },
184
  {
185
  "epoch": 0.78,
186
- "learning_rate": 0.0004123819780247736,
187
- "loss": 0.1905,
188
  "step": 90
189
  },
190
  {
191
  "epoch": 0.81,
192
- "learning_rate": 0.0004006354709250765,
193
- "loss": 0.1853,
194
  "step": 93
195
  },
196
  {
197
  "epoch": 0.83,
198
- "learning_rate": 0.000388712415013214,
199
- "loss": 0.1809,
200
  "step": 96
201
  },
202
  {
203
  "epoch": 0.86,
204
- "learning_rate": 0.0003766337273807371,
205
- "loss": 0.177,
206
  "step": 99
207
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  {
209
  "epoch": 0.89,
210
- "learning_rate": 0.00036442059815011896,
211
- "loss": 0.1808,
212
  "step": 102
213
  },
214
  {
215
  "epoch": 0.91,
216
- "learning_rate": 0.0003520944533000791,
217
- "loss": 0.1707,
218
  "step": 105
219
  },
220
  {
221
  "epoch": 0.94,
222
- "learning_rate": 0.00033967691707713674,
223
- "loss": 0.172,
224
  "step": 108
225
  },
226
  {
227
  "epoch": 0.96,
228
- "learning_rate": 0.0003271897740593341,
229
- "loss": 0.1632,
230
  "step": 111
231
  },
232
  {
233
  "epoch": 0.99,
234
- "learning_rate": 0.000314654930938684,
235
- "loss": 0.1734,
236
  "step": 114
237
  },
238
  {
239
  "epoch": 1.0,
240
- "eval_gen_len": 122.123,
241
- "eval_loss": 1.8751367330551147,
242
- "eval_rouge1": 57.9286,
243
- "eval_rouge2": 29.2743,
244
- "eval_rougeL": 44.7181,
245
- "eval_rougeLsum": 54.2295,
246
- "eval_runtime": 1692.8237,
247
- "eval_samples_per_second": 0.591,
248
- "eval_steps_per_second": 0.148,
249
- "step": 115
250
- },
251
- {
252
- "epoch": 1.02,
253
- "learning_rate": 0.00030209437808938845,
254
- "loss": 0.1607,
255
- "step": 117
256
- },
257
- {
258
- "epoch": 1.04,
259
- "learning_rate": 0.0002895301509892498,
260
- "loss": 0.1423,
261
- "step": 120
262
- },
263
- {
264
- "epoch": 1.07,
265
- "learning_rate": 0.0002769842915619544,
266
- "loss": 0.1361,
267
- "step": 123
268
- },
269
- {
270
- "epoch": 1.1,
271
- "learning_rate": 0.0002644788095080497,
272
- "loss": 0.1438,
273
- "step": 126
274
- },
275
- {
276
- "epoch": 1.12,
277
- "learning_rate": 0.00025203564369244956,
278
- "loss": 0.1367,
279
- "step": 129
280
- },
281
- {
282
- "epoch": 1.15,
283
- "learning_rate": 0.00023967662365621063,
284
- "loss": 0.1333,
285
- "step": 132
286
- },
287
- {
288
- "epoch": 1.17,
289
- "learning_rate": 0.0002274234313200997,
290
- "loss": 0.1325,
291
- "step": 135
292
- },
293
- {
294
- "epoch": 1.2,
295
- "learning_rate": 0.000215297562947137,
296
- "loss": 0.1367,
297
- "step": 138
298
- },
299
- {
300
- "epoch": 1.23,
301
- "learning_rate": 0.00020332029143084668,
302
- "loss": 0.1309,
303
- "step": 141
304
- },
305
- {
306
- "epoch": 1.25,
307
- "learning_rate": 0.00019151262897537235,
308
- "loss": 0.1356,
309
- "step": 144
310
- },
311
- {
312
- "epoch": 1.28,
313
- "learning_rate": 0.00017989529023293153,
314
- "loss": 0.128,
315
- "step": 147
316
- },
317
- {
318
- "epoch": 1.3,
319
- "learning_rate": 0.00016848865596327673,
320
- "loss": 0.1215,
321
- "step": 150
322
- },
323
- {
324
- "epoch": 1.33,
325
- "learning_rate": 0.0001573127372789174,
326
- "loss": 0.1244,
327
- "step": 153
328
- },
329
- {
330
- "epoch": 1.36,
331
- "learning_rate": 0.00014638714053882856,
332
- "loss": 0.1265,
333
- "step": 156
334
- },
335
- {
336
- "epoch": 1.38,
337
- "learning_rate": 0.00013573103295223495,
338
- "loss": 0.117,
339
- "step": 159
340
- },
341
- {
342
- "epoch": 1.41,
343
- "learning_rate": 0.0001253631089528132,
344
- "loss": 0.1216,
345
- "step": 162
346
- },
347
- {
348
- "epoch": 1.43,
349
- "learning_rate": 0.0001153015574023025,
350
- "loss": 0.1157,
351
- "step": 165
352
- },
353
- {
354
- "epoch": 1.46,
355
- "learning_rate": 0.00010556402968106073,
356
- "loss": 0.1197,
357
- "step": 168
358
- },
359
- {
360
- "epoch": 1.49,
361
- "learning_rate": 9.616760872154511e-05,
362
- "loss": 0.1227,
363
- "step": 171
364
- },
365
- {
366
- "epoch": 1.51,
367
- "learning_rate": 8.712877903904379e-05,
368
- "loss": 0.1156,
369
- "step": 174
370
- },
371
- {
372
- "epoch": 1.54,
373
- "learning_rate": 7.846339781223482e-05,
374
- "loss": 0.1198,
375
- "step": 177
376
- },
377
- {
378
- "epoch": 1.57,
379
- "learning_rate": 7.018666706430662e-05,
380
- "loss": 0.1192,
381
- "step": 180
382
- },
383
- {
384
- "epoch": 1.59,
385
- "learning_rate": 6.231310699344282e-05,
386
- "loss": 0.116,
387
- "step": 183
388
- },
389
- {
390
- "epoch": 1.62,
391
- "learning_rate": 5.485653049946145e-05,
392
- "loss": 0.119,
393
- "step": 186
394
- },
395
- {
396
- "epoch": 1.64,
397
- "learning_rate": 4.7830018951294724e-05,
398
- "loss": 0.1135,
399
- "step": 189
400
- },
401
- {
402
- "epoch": 1.67,
403
- "learning_rate": 4.124589923782276e-05,
404
- "loss": 0.1169,
405
- "step": 192
406
- },
407
- {
408
- "epoch": 1.7,
409
- "learning_rate": 3.51157221423219e-05,
410
- "loss": 0.115,
411
- "step": 195
412
- },
413
- {
414
- "epoch": 1.72,
415
- "learning_rate": 2.945024207846589e-05,
416
- "loss": 0.1156,
417
- "step": 198
418
- },
419
- {
420
- "epoch": 1.75,
421
- "learning_rate": 2.425939822342968e-05,
422
- "loss": 0.1157,
423
- "step": 201
424
- },
425
- {
426
- "epoch": 1.77,
427
- "learning_rate": 1.9552297081195668e-05,
428
- "loss": 0.1136,
429
- "step": 204
430
- },
431
- {
432
- "epoch": 1.8,
433
- "learning_rate": 1.5337196506651038e-05,
434
- "loss": 0.111,
435
- "step": 207
436
- },
437
- {
438
- "epoch": 1.83,
439
- "learning_rate": 1.162149121850433e-05,
440
- "loss": 0.113,
441
- "step": 210
442
- },
443
- {
444
- "epoch": 1.85,
445
- "learning_rate": 8.411699826436147e-06,
446
- "loss": 0.1098,
447
- "step": 213
448
- },
449
- {
450
- "epoch": 1.88,
451
- "learning_rate": 5.7134533952425395e-06,
452
- "loss": 0.1111,
453
- "step": 216
454
- },
455
- {
456
- "epoch": 1.9,
457
- "learning_rate": 3.5314855660341646e-06,
458
- "loss": 0.1098,
459
- "step": 219
460
- },
461
- {
462
- "epoch": 1.93,
463
- "learning_rate": 1.869624251821089e-06,
464
- "loss": 0.1111,
465
- "step": 222
466
- },
467
- {
468
- "epoch": 1.96,
469
- "learning_rate": 7.307849220527406e-07,
470
- "loss": 0.1121,
471
- "step": 225
472
- },
473
- {
474
- "epoch": 1.98,
475
- "learning_rate": 1.1696548789369431e-07,
476
- "loss": 0.1098,
477
- "step": 228
478
- },
479
- {
480
- "epoch": 2.0,
481
- "eval_gen_len": 122.364,
482
- "eval_loss": 2.1311135292053223,
483
- "eval_rouge1": 58.1114,
484
- "eval_rouge2": 29.339,
485
- "eval_rougeL": 44.7611,
486
- "eval_rougeLsum": 54.2823,
487
- "eval_runtime": 1701.9255,
488
- "eval_samples_per_second": 0.588,
489
- "eval_steps_per_second": 0.147,
490
- "step": 230
491
- },
492
- {
493
- "epoch": 2.0,
494
- "step": 230,
495
- "total_flos": 2.7141869303916134e+17,
496
- "train_loss": 0.16691997828690902,
497
- "train_runtime": 19759.1358,
498
- "train_samples_per_second": 2.98,
499
- "train_steps_per_second": 0.012
500
  }
501
  ],
502
- "max_steps": 230,
503
- "num_train_epochs": 2,
504
- "total_flos": 2.7141869303916134e+17,
505
  "trial_name": null,
506
  "trial_params": null
507
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9997283346916599,
5
+ "global_step": 115,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.03,
12
+ "learning_rate": 0.0001,
13
+ "loss": 0.1341,
14
  "step": 3
15
  },
16
  {
17
  "epoch": 0.05,
18
+ "learning_rate": 9.982307470588098e-05,
19
+ "loss": 0.1267,
20
  "step": 6
21
  },
22
  {
23
  "epoch": 0.08,
24
+ "learning_rate": 9.92935509259118e-05,
25
+ "loss": 0.1213,
26
  "step": 9
27
  },
28
  {
29
  "epoch": 0.1,
30
+ "learning_rate": 9.841517610611309e-05,
31
+ "loss": 0.1211,
32
  "step": 12
33
  },
34
  {
35
  "epoch": 0.13,
36
+ "learning_rate": 9.719416651541839e-05,
37
+ "loss": 0.1172,
38
  "step": 15
39
  },
40
  {
41
  "epoch": 0.16,
42
+ "learning_rate": 9.563916325306594e-05,
43
+ "loss": 0.1195,
44
  "step": 18
45
  },
46
+ {
47
+ "epoch": 0.17,
48
+ "eval_gen_len": 121.8,
49
+ "eval_loss": 2.063453435897827,
50
+ "eval_rouge1": 57.8829,
51
+ "eval_rouge2": 28.7887,
52
+ "eval_rougeL": 44.4256,
53
+ "eval_rougeLsum": 54.1299,
54
+ "eval_runtime": 3417.7231,
55
+ "eval_samples_per_second": 0.293,
56
+ "eval_steps_per_second": 0.073,
57
+ "step": 20
58
+ },
59
  {
60
  "epoch": 0.18,
61
+ "learning_rate": 9.376117109543769e-05,
62
+ "loss": 0.1129,
63
  "step": 21
64
  },
65
  {
66
  "epoch": 0.21,
67
+ "learning_rate": 9.157348061512727e-05,
68
+ "loss": 0.1161,
69
  "step": 24
70
  },
71
  {
72
  "epoch": 0.23,
73
+ "learning_rate": 8.90915741234015e-05,
74
+ "loss": 0.1184,
75
  "step": 27
76
  },
77
  {
78
  "epoch": 0.26,
79
+ "learning_rate": 8.633301610170135e-05,
80
+ "loss": 0.1038,
81
  "step": 30
82
  },
83
  {
84
  "epoch": 0.29,
85
+ "learning_rate": 8.33173288976002e-05,
86
+ "loss": 0.1064,
87
  "step": 33
88
  },
89
  {
90
  "epoch": 0.31,
91
+ "learning_rate": 8.006585456492029e-05,
92
+ "loss": 0.1107,
93
  "step": 36
94
  },
95
  {
96
  "epoch": 0.34,
97
+ "learning_rate": 7.660160382576683e-05,
98
+ "loss": 0.1084,
99
  "step": 39
100
  },
101
+ {
102
+ "epoch": 0.35,
103
+ "eval_gen_len": 122.893,
104
+ "eval_loss": 2.117846965789795,
105
+ "eval_rouge1": 58.0416,
106
+ "eval_rouge2": 28.6487,
107
+ "eval_rougeL": 44.3905,
108
+ "eval_rougeLsum": 54.1557,
109
+ "eval_runtime": 3584.8893,
110
+ "eval_samples_per_second": 0.279,
111
+ "eval_steps_per_second": 0.07,
112
+ "step": 40
113
+ },
114
  {
115
  "epoch": 0.37,
116
+ "learning_rate": 7.294909322337689e-05,
117
+ "loss": 0.1026,
118
  "step": 42
119
  },
120
  {
121
  "epoch": 0.39,
122
+ "learning_rate": 6.91341716182545e-05,
123
+ "loss": 0.1059,
124
  "step": 45
125
  },
126
  {
127
  "epoch": 0.42,
128
+ "learning_rate": 6.518383725548074e-05,
129
+ "loss": 0.1078,
130
  "step": 48
131
  },
132
  {
133
  "epoch": 0.44,
134
+ "learning_rate": 6.112604669781572e-05,
135
+ "loss": 0.1053,
136
  "step": 51
137
  },
138
  {
139
  "epoch": 0.47,
140
+ "learning_rate": 5.698951697677498e-05,
141
+ "loss": 0.1027,
142
  "step": 54
143
  },
144
  {
145
  "epoch": 0.5,
146
+ "learning_rate": 5.2803522361859594e-05,
147
+ "loss": 0.0993,
148
  "step": 57
149
  },
150
  {
151
  "epoch": 0.52,
152
+ "learning_rate": 4.859768718620656e-05,
153
+ "loss": 0.1019,
154
+ "step": 60
155
+ },
156
+ {
157
+ "epoch": 0.52,
158
+ "eval_gen_len": 120.524,
159
+ "eval_loss": 2.157585382461548,
160
+ "eval_rouge1": 57.816,
161
+ "eval_rouge2": 28.7069,
162
+ "eval_rougeL": 44.4242,
163
+ "eval_rougeLsum": 53.9598,
164
+ "eval_runtime": 3448.4317,
165
+ "eval_samples_per_second": 0.29,
166
+ "eval_steps_per_second": 0.072,
167
  "step": 60
168
  },
169
  {
170
  "epoch": 0.55,
171
+ "learning_rate": 4.4401776194834613e-05,
172
+ "loss": 0.0959,
173
  "step": 63
174
  },
175
  {
176
  "epoch": 0.57,
177
+ "learning_rate": 4.0245483899193595e-05,
178
+ "loss": 0.1051,
179
  "step": 66
180
  },
181
  {
182
  "epoch": 0.6,
183
+ "learning_rate": 3.6158224428757535e-05,
184
+ "loss": 0.0953,
185
  "step": 69
186
  },
187
  {
188
  "epoch": 0.63,
189
+ "learning_rate": 3.216892336688435e-05,
190
+ "loss": 0.0952,
191
  "step": 72
192
  },
193
  {
194
  "epoch": 0.65,
195
+ "learning_rate": 2.8305813044122097e-05,
196
+ "loss": 0.0957,
197
  "step": 75
198
  },
199
  {
200
  "epoch": 0.68,
201
+ "learning_rate": 2.459623273767354e-05,
202
+ "loss": 0.0975,
203
  "step": 78
204
  },
205
  {
206
  "epoch": 0.7,
207
+ "eval_gen_len": 121.793,
208
+ "eval_loss": 2.182112693786621,
209
+ "eval_rouge1": 57.9597,
210
+ "eval_rouge2": 28.8178,
211
+ "eval_rougeL": 44.4854,
212
+ "eval_rougeLsum": 54.068,
213
+ "eval_runtime": 3498.8291,
214
+ "eval_samples_per_second": 0.286,
215
+ "eval_steps_per_second": 0.071,
216
+ "step": 80
217
+ },
218
+ {
219
+ "epoch": 0.7,
220
+ "learning_rate": 2.1066435191009715e-05,
221
+ "loss": 0.0943,
222
  "step": 81
223
  },
224
  {
225
  "epoch": 0.73,
226
+ "learning_rate": 1.774140082289563e-05,
227
+ "loss": 0.0988,
228
  "step": 84
229
  },
230
  {
231
  "epoch": 0.76,
232
+ "learning_rate": 1.4644660940672627e-05,
233
+ "loss": 0.0954,
234
  "step": 87
235
  },
236
  {
237
  "epoch": 0.78,
238
+ "learning_rate": 1.1798131208919627e-05,
239
+ "loss": 0.0977,
240
  "step": 90
241
  },
242
  {
243
  "epoch": 0.81,
244
+ "learning_rate": 9.221956552036992e-06,
245
+ "loss": 0.0954,
246
  "step": 93
247
  },
248
  {
249
  "epoch": 0.83,
250
+ "learning_rate": 6.934368588379553e-06,
251
+ "loss": 0.0942,
252
  "step": 96
253
  },
254
  {
255
  "epoch": 0.86,
256
+ "learning_rate": 4.951556604879048e-06,
257
+ "loss": 0.0947,
258
  "step": 99
259
  },
260
+ {
261
+ "epoch": 0.87,
262
+ "eval_gen_len": 122.77,
263
+ "eval_loss": 2.1846070289611816,
264
+ "eval_rouge1": 57.9637,
265
+ "eval_rouge2": 28.7446,
266
+ "eval_rougeL": 44.3826,
267
+ "eval_rougeLsum": 54.0399,
268
+ "eval_runtime": 3495.1079,
269
+ "eval_samples_per_second": 0.286,
270
+ "eval_steps_per_second": 0.072,
271
+ "step": 100
272
+ },
273
  {
274
  "epoch": 0.89,
275
+ "learning_rate": 3.2875529852700147e-06,
276
+ "loss": 0.0952,
277
  "step": 102
278
  },
279
  {
280
  "epoch": 0.91,
281
+ "learning_rate": 1.9541339027450256e-06,
282
+ "loss": 0.1001,
283
  "step": 105
284
  },
285
  {
286
  "epoch": 0.94,
287
+ "learning_rate": 9.607359798384785e-07,
288
+ "loss": 0.0953,
289
  "step": 108
290
  },
291
  {
292
  "epoch": 0.96,
293
+ "learning_rate": 3.143895053378698e-07,
294
+ "loss": 0.0963,
295
  "step": 111
296
  },
297
  {
298
  "epoch": 0.99,
299
+ "learning_rate": 1.9668680847356735e-08,
300
+ "loss": 0.0924,
301
  "step": 114
302
  },
303
  {
304
  "epoch": 1.0,
305
+ "step": 115,
306
+ "total_flos": 1.3570473694593024e+17,
307
+ "train_loss": 0.10453087175669877,
308
+ "train_runtime": 27358.0249,
309
+ "train_samples_per_second": 1.076,
310
+ "train_steps_per_second": 0.004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  }
312
  ],
313
+ "max_steps": 115,
314
+ "num_train_epochs": 1,
315
+ "total_flos": 1.3570473694593024e+17,
316
  "trial_name": null,
317
  "trial_params": null
318
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d828e7eb3a78727a7503ac20ad0242d41b3eee713dc1fcfae6e553e64791098
3
- size 3643
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:874b9ef1a9fa7307bd62b5629cff23971f0e3f551dc50b0975c5d755424c33f1
3
+ size 3707