dq158 commited on
Commit
176a5c1
1 Parent(s): 696b86b

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "_name_or_path": "pingusPongus",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
6
  "classifier_dropout": 0.0,
7
- "d_ff": 2048,
8
  "d_kv": 64,
9
- "d_model": 768,
10
  "decoder_start_token_id": 0,
11
  "dense_act_fn": "gelu_new",
12
  "dropout_rate": 0.1,
@@ -18,45 +18,16 @@
18
  "layer_norm_epsilon": 1e-06,
19
  "model_type": "t5",
20
  "n_positions": 512,
21
- "num_decoder_layers": 12,
22
- "num_heads": 12,
23
- "num_layers": 12,
24
  "output_past": true,
25
  "pad_token_id": 0,
26
  "relative_attention_max_distance": 128,
27
  "relative_attention_num_buckets": 32,
28
- "task_specific_params": {
29
- "summarization": {
30
- "early_stopping": true,
31
- "length_penalty": 2.0,
32
- "max_length": 200,
33
- "min_length": 30,
34
- "no_repeat_ngram_size": 3,
35
- "num_beams": 4,
36
- "prefix": "summarize: "
37
- },
38
- "translation_en_to_de": {
39
- "early_stopping": true,
40
- "max_length": 300,
41
- "num_beams": 4,
42
- "prefix": "translate English to German: "
43
- },
44
- "translation_en_to_fr": {
45
- "early_stopping": true,
46
- "max_length": 300,
47
- "num_beams": 4,
48
- "prefix": "translate English to French: "
49
- },
50
- "translation_en_to_ro": {
51
- "early_stopping": true,
52
- "max_length": 300,
53
- "num_beams": 4,
54
- "prefix": "translate English to Romanian: "
55
- }
56
- },
57
  "tie_word_embeddings": false,
58
  "torch_dtype": "float32",
59
- "transformers_version": "4.34.1",
60
  "use_cache": true,
61
  "vocab_size": 32128
62
  }
 
1
  {
2
+ "_name_or_path": "google/flan-t5-large",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
6
  "classifier_dropout": 0.0,
7
+ "d_ff": 2816,
8
  "d_kv": 64,
9
+ "d_model": 1024,
10
  "decoder_start_token_id": 0,
11
  "dense_act_fn": "gelu_new",
12
  "dropout_rate": 0.1,
 
18
  "layer_norm_epsilon": 1e-06,
19
  "model_type": "t5",
20
  "n_positions": 512,
21
+ "num_decoder_layers": 24,
22
+ "num_heads": 16,
23
+ "num_layers": 24,
24
  "output_past": true,
25
  "pad_token_id": 0,
26
  "relative_attention_max_distance": 128,
27
  "relative_attention_num_buckets": 32,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "tie_word_embeddings": false,
29
  "torch_dtype": "float32",
30
+ "transformers_version": "4.35.2",
31
  "use_cache": true,
32
  "vocab_size": 32128
33
  }
last-checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d18f93b6416cb6de922b525ae2aefefd5555f3956bf539033033a8a0334866a
3
+ size 3132668808
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df5f5ee2b56d6c0f93fb801be312fc1bd48ab0eacd78bb34294fb243c2b7397a
3
- size 37990394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e04dbe6cc517d74a5eb81747881c0161660f2668ab3564ad3304a3fd6f87af59
3
+ size 6265677800
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbfd5efb5880d038e40ef818b6a478489100d8537842fb87344a0d7f88275ee0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:427e669d0ee683c4b12f0805ce85ad0ea605698ac777a13ff0e4e41b5b4ddf99
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71d106c12a183d47d2349d6d228d20595c1cad95f8d19fec2a8622032de302f5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3199f68c919ad9f15fb49df0b36624f234cba762e6bf2c59cdcbf6ebb2295917
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,379 +1,452 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8720549976018488,
5
  "eval_steps": 500,
6
- "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "learning_rate": 4e-05,
14
- "loss": 4.4951,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.03,
19
- "learning_rate": 8e-05,
20
- "loss": 3.777,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.04,
25
- "learning_rate": 7.999831247941866e-05,
26
- "loss": 3.6246,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 0.06,
31
- "learning_rate": 7.99932500600609e-05,
32
- "loss": 3.5067,
33
  "step": 2000
34
  },
35
  {
36
  "epoch": 0.07,
37
- "learning_rate": 7.998481316907362e-05,
38
- "loss": 3.4947,
39
  "step": 2500
40
  },
41
  {
42
  "epoch": 0.09,
43
- "learning_rate": 7.99730025183281e-05,
44
- "loss": 3.4452,
45
  "step": 3000
46
  },
47
  {
48
  "epoch": 0.1,
49
- "learning_rate": 7.995781910436019e-05,
50
- "loss": 3.3696,
51
  "step": 3500
52
  },
53
  {
54
  "epoch": 0.12,
55
- "learning_rate": 7.993926420828609e-05,
56
- "loss": 3.4226,
57
  "step": 4000
58
  },
59
  {
60
  "epoch": 0.13,
61
- "learning_rate": 7.991733939569422e-05,
62
- "loss": 3.3765,
63
  "step": 4500
64
  },
65
  {
66
- "epoch": 0.15,
67
- "learning_rate": 7.989204651651322e-05,
68
- "loss": 3.4237,
69
  "step": 5000
70
  },
71
  {
72
  "epoch": 0.16,
73
- "learning_rate": 7.986338770485576e-05,
74
- "loss": 3.3054,
75
  "step": 5500
76
  },
77
  {
78
  "epoch": 0.17,
79
- "learning_rate": 7.983136537883862e-05,
80
- "loss": 3.3544,
81
  "step": 6000
82
  },
83
  {
84
  "epoch": 0.19,
85
- "learning_rate": 7.97959822403785e-05,
86
- "loss": 3.3659,
87
  "step": 6500
88
  },
89
  {
90
  "epoch": 0.2,
91
- "learning_rate": 7.97572412749641e-05,
92
- "loss": 3.3426,
93
  "step": 7000
94
  },
95
  {
96
  "epoch": 0.22,
97
- "learning_rate": 7.971514575140424e-05,
98
- "loss": 3.3332,
99
  "step": 7500
100
  },
101
  {
102
  "epoch": 0.23,
103
- "learning_rate": 7.966969922155206e-05,
104
- "loss": 3.3163,
105
  "step": 8000
106
  },
107
  {
108
  "epoch": 0.25,
109
- "learning_rate": 7.962090552000528e-05,
110
- "loss": 3.3127,
111
  "step": 8500
112
  },
113
  {
114
  "epoch": 0.26,
115
- "learning_rate": 7.956876876378266e-05,
116
- "loss": 3.3187,
117
  "step": 9000
118
  },
119
  {
120
- "epoch": 0.28,
121
- "learning_rate": 7.951329335197668e-05,
122
- "loss": 3.3195,
123
  "step": 9500
124
  },
125
  {
126
  "epoch": 0.29,
127
- "learning_rate": 7.94544839653823e-05,
128
- "loss": 3.2599,
129
  "step": 10000
130
  },
131
  {
132
- "epoch": 0.31,
133
- "learning_rate": 7.939234556610201e-05,
134
- "loss": 3.3353,
135
  "step": 10500
136
  },
137
  {
138
  "epoch": 0.32,
139
- "learning_rate": 7.932688339712721e-05,
140
- "loss": 3.1893,
141
  "step": 11000
142
  },
143
  {
144
  "epoch": 0.33,
145
- "learning_rate": 7.925810298189578e-05,
146
- "loss": 3.2074,
147
  "step": 11500
148
  },
149
  {
150
  "epoch": 0.35,
151
- "learning_rate": 7.918601012382602e-05,
152
- "loss": 3.2427,
153
  "step": 12000
154
  },
155
  {
156
  "epoch": 0.36,
157
- "learning_rate": 7.911061090582703e-05,
158
- "loss": 3.3292,
159
  "step": 12500
160
  },
161
  {
162
  "epoch": 0.38,
163
- "learning_rate": 7.90319116897854e-05,
164
- "loss": 3.2498,
165
  "step": 13000
166
  },
167
  {
168
  "epoch": 0.39,
169
- "learning_rate": 7.89499191160285e-05,
170
- "loss": 3.2745,
171
  "step": 13500
172
  },
173
  {
174
  "epoch": 0.41,
175
- "learning_rate": 7.88646401027641e-05,
176
- "loss": 3.2645,
177
  "step": 14000
178
  },
179
  {
180
  "epoch": 0.42,
181
- "learning_rate": 7.87760818454967e-05,
182
- "loss": 3.2276,
183
  "step": 14500
184
  },
185
  {
186
- "epoch": 0.44,
187
- "learning_rate": 7.868425181642037e-05,
188
- "loss": 3.235,
189
  "step": 15000
190
  },
191
  {
192
  "epoch": 0.45,
193
- "learning_rate": 7.858915776378836e-05,
194
- "loss": 3.1867,
195
  "step": 15500
196
  },
197
  {
198
- "epoch": 0.47,
199
- "learning_rate": 7.849080771125918e-05,
200
- "loss": 3.1661,
201
  "step": 16000
202
  },
203
  {
204
  "epoch": 0.48,
205
- "learning_rate": 7.838920995721975e-05,
206
- "loss": 3.2233,
207
  "step": 16500
208
  },
209
  {
210
  "epoch": 0.49,
211
- "learning_rate": 7.828437307408509e-05,
212
- "loss": 3.1632,
213
  "step": 17000
214
  },
215
  {
216
  "epoch": 0.51,
217
- "learning_rate": 7.81763059075751e-05,
218
- "loss": 3.2981,
219
  "step": 17500
220
  },
221
  {
222
  "epoch": 0.52,
223
- "learning_rate": 7.806501757596819e-05,
224
- "loss": 3.2572,
225
  "step": 18000
226
  },
227
  {
228
  "epoch": 0.54,
229
- "learning_rate": 7.795051746933185e-05,
230
- "loss": 3.1959,
231
  "step": 18500
232
  },
233
  {
234
  "epoch": 0.55,
235
- "learning_rate": 7.783281524873039e-05,
236
- "loss": 3.2433,
237
  "step": 19000
238
  },
239
  {
240
- "epoch": 0.57,
241
- "learning_rate": 7.771192084540983e-05,
242
- "loss": 3.1956,
243
  "step": 19500
244
  },
245
  {
246
  "epoch": 0.58,
247
- "learning_rate": 7.75878444599598e-05,
248
- "loss": 3.2134,
249
  "step": 20000
250
  },
251
  {
252
- "epoch": 0.6,
253
- "learning_rate": 7.746059656145306e-05,
254
- "loss": 3.1633,
255
  "step": 20500
256
  },
257
  {
258
  "epoch": 0.61,
259
- "learning_rate": 7.733018788656199e-05,
260
- "loss": 3.2601,
261
  "step": 21000
262
  },
263
  {
264
  "epoch": 0.62,
265
- "learning_rate": 7.71966294386527e-05,
266
- "loss": 3.173,
267
  "step": 21500
268
  },
269
  {
270
  "epoch": 0.64,
271
- "learning_rate": 7.70599324868567e-05,
272
- "loss": 3.2264,
273
  "step": 22000
274
  },
275
  {
276
  "epoch": 0.65,
277
- "learning_rate": 7.692010856511996e-05,
278
- "loss": 3.1828,
279
  "step": 22500
280
  },
281
  {
282
  "epoch": 0.67,
283
- "learning_rate": 7.677716947122976e-05,
284
- "loss": 3.1522,
285
  "step": 23000
286
  },
287
  {
288
  "epoch": 0.68,
289
- "learning_rate": 7.663112726581924e-05,
290
- "loss": 3.2148,
291
  "step": 23500
292
  },
293
  {
294
- "epoch": 0.7,
295
- "learning_rate": 7.648199427134978e-05,
296
- "loss": 3.1741,
297
  "step": 24000
298
  },
299
  {
300
  "epoch": 0.71,
301
- "learning_rate": 7.632978307107125e-05,
302
- "loss": 3.2386,
303
  "step": 24500
304
  },
305
  {
306
- "epoch": 0.73,
307
- "learning_rate": 7.617450650796032e-05,
308
- "loss": 3.1865,
309
  "step": 25000
310
  },
311
  {
312
  "epoch": 0.74,
313
- "learning_rate": 7.601617768363678e-05,
314
- "loss": 3.2224,
315
  "step": 25500
316
  },
317
  {
318
- "epoch": 0.76,
319
- "learning_rate": 7.58548099572581e-05,
320
- "loss": 3.1192,
321
  "step": 26000
322
  },
323
  {
324
  "epoch": 0.77,
325
- "learning_rate": 7.569041694439229e-05,
326
- "loss": 3.1802,
327
  "step": 26500
328
  },
329
  {
330
  "epoch": 0.78,
331
- "learning_rate": 7.552301251586894e-05,
332
- "loss": 3.1781,
333
  "step": 27000
334
  },
335
  {
336
  "epoch": 0.8,
337
- "learning_rate": 7.5352610796609e-05,
338
- "loss": 3.1921,
339
  "step": 27500
340
  },
341
  {
342
  "epoch": 0.81,
343
- "learning_rate": 7.517922616443289e-05,
344
- "loss": 3.1896,
345
  "step": 28000
346
  },
347
  {
348
- "epoch": 0.83,
349
- "learning_rate": 7.500287324884736e-05,
350
- "loss": 3.1911,
351
  "step": 28500
352
  },
353
  {
354
  "epoch": 0.84,
355
- "learning_rate": 7.482356692981116e-05,
356
- "loss": 3.1367,
357
  "step": 29000
358
  },
359
  {
360
- "epoch": 0.86,
361
- "learning_rate": 7.464132233647945e-05,
362
- "loss": 3.1416,
363
  "step": 29500
364
  },
365
  {
366
  "epoch": 0.87,
367
- "learning_rate": 7.445615484592736e-05,
368
- "loss": 3.1682,
369
  "step": 30000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  }
371
  ],
372
  "logging_steps": 500,
373
- "max_steps": 172005,
374
- "num_train_epochs": 5,
375
- "save_steps": 5000,
376
- "total_flos": 1.02801736728576e+18,
377
  "trial_name": null,
378
  "trial_params": null
379
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 34567,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "learning_rate": 0.0001,
14
+ "loss": 3.3228,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.03,
19
+ "learning_rate": 9.999994258403258e-05,
20
+ "loss": 2.8639,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.04,
25
+ "learning_rate": 9.99997703362622e-05,
26
+ "loss": 2.8198,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 0.06,
31
+ "learning_rate": 9.999948325708443e-05,
32
+ "loss": 2.7858,
33
  "step": 2000
34
  },
35
  {
36
  "epoch": 0.07,
37
+ "learning_rate": 9.999908134715859e-05,
38
+ "loss": 2.7422,
39
  "step": 2500
40
  },
41
  {
42
  "epoch": 0.09,
43
+ "learning_rate": 9.999856460740773e-05,
44
+ "loss": 2.7274,
45
  "step": 3000
46
  },
47
  {
48
  "epoch": 0.1,
49
+ "learning_rate": 9.99979330390186e-05,
50
+ "loss": 2.6958,
51
  "step": 3500
52
  },
53
  {
54
  "epoch": 0.12,
55
+ "learning_rate": 9.999718664344171e-05,
56
+ "loss": 2.6617,
57
  "step": 4000
58
  },
59
  {
60
  "epoch": 0.13,
61
+ "learning_rate": 9.999632542239125e-05,
62
+ "loss": 2.6747,
63
  "step": 4500
64
  },
65
  {
66
+ "epoch": 0.14,
67
+ "learning_rate": 9.999534937784512e-05,
68
+ "loss": 2.6564,
69
  "step": 5000
70
  },
71
  {
72
  "epoch": 0.16,
73
+ "learning_rate": 9.999425851204496e-05,
74
+ "loss": 2.585,
75
  "step": 5500
76
  },
77
  {
78
  "epoch": 0.17,
79
+ "learning_rate": 9.99930528274961e-05,
80
+ "loss": 2.6385,
81
  "step": 6000
82
  },
83
  {
84
  "epoch": 0.19,
85
+ "learning_rate": 9.999173232696753e-05,
86
+ "loss": 2.6262,
87
  "step": 6500
88
  },
89
  {
90
  "epoch": 0.2,
91
+ "learning_rate": 9.999029701349196e-05,
92
+ "loss": 2.6055,
93
  "step": 7000
94
  },
95
  {
96
  "epoch": 0.22,
97
+ "learning_rate": 9.998874689036583e-05,
98
+ "loss": 2.5917,
99
  "step": 7500
100
  },
101
  {
102
  "epoch": 0.23,
103
+ "learning_rate": 9.998708196114922e-05,
104
+ "loss": 2.6162,
105
  "step": 8000
106
  },
107
  {
108
  "epoch": 0.25,
109
+ "learning_rate": 9.99853022296658e-05,
110
+ "loss": 2.6188,
111
  "step": 8500
112
  },
113
  {
114
  "epoch": 0.26,
115
+ "learning_rate": 9.998340770000302e-05,
116
+ "loss": 2.5671,
117
  "step": 9000
118
  },
119
  {
120
+ "epoch": 0.27,
121
+ "learning_rate": 9.998139837651193e-05,
122
+ "loss": 2.5897,
123
  "step": 9500
124
  },
125
  {
126
  "epoch": 0.29,
127
+ "learning_rate": 9.997927426380721e-05,
128
+ "loss": 2.5414,
129
  "step": 10000
130
  },
131
  {
132
+ "epoch": 0.3,
133
+ "learning_rate": 9.997703536676718e-05,
134
+ "loss": 2.5139,
135
  "step": 10500
136
  },
137
  {
138
  "epoch": 0.32,
139
+ "learning_rate": 9.997468169053379e-05,
140
+ "loss": 2.5904,
141
  "step": 11000
142
  },
143
  {
144
  "epoch": 0.33,
145
+ "learning_rate": 9.997221324051255e-05,
146
+ "loss": 2.6288,
147
  "step": 11500
148
  },
149
  {
150
  "epoch": 0.35,
151
+ "learning_rate": 9.996963002237263e-05,
152
+ "loss": 2.598,
153
  "step": 12000
154
  },
155
  {
156
  "epoch": 0.36,
157
+ "learning_rate": 9.996693204204674e-05,
158
+ "loss": 2.5276,
159
  "step": 12500
160
  },
161
  {
162
  "epoch": 0.38,
163
+ "learning_rate": 9.996411930573117e-05,
164
+ "loss": 2.5817,
165
  "step": 13000
166
  },
167
  {
168
  "epoch": 0.39,
169
+ "learning_rate": 9.996119181988575e-05,
170
+ "loss": 2.5316,
171
  "step": 13500
172
  },
173
  {
174
  "epoch": 0.41,
175
+ "learning_rate": 9.995814959123386e-05,
176
+ "loss": 2.4692,
177
  "step": 14000
178
  },
179
  {
180
  "epoch": 0.42,
181
+ "learning_rate": 9.995499262676243e-05,
182
+ "loss": 2.5464,
183
  "step": 14500
184
  },
185
  {
186
+ "epoch": 0.43,
187
+ "learning_rate": 9.99517209337218e-05,
188
+ "loss": 2.5222,
189
  "step": 15000
190
  },
191
  {
192
  "epoch": 0.45,
193
+ "learning_rate": 9.994833451962592e-05,
194
+ "loss": 2.5304,
195
  "step": 15500
196
  },
197
  {
198
+ "epoch": 0.46,
199
+ "learning_rate": 9.994483339225213e-05,
200
+ "loss": 2.6063,
201
  "step": 16000
202
  },
203
  {
204
  "epoch": 0.48,
205
+ "learning_rate": 9.994121755964129e-05,
206
+ "loss": 2.5286,
207
  "step": 16500
208
  },
209
  {
210
  "epoch": 0.49,
211
+ "learning_rate": 9.993748703009764e-05,
212
+ "loss": 2.5273,
213
  "step": 17000
214
  },
215
  {
216
  "epoch": 0.51,
217
+ "learning_rate": 9.993364181218885e-05,
218
+ "loss": 2.4868,
219
  "step": 17500
220
  },
221
  {
222
  "epoch": 0.52,
223
+ "learning_rate": 9.992968191474601e-05,
224
+ "loss": 2.435,
225
  "step": 18000
226
  },
227
  {
228
  "epoch": 0.54,
229
+ "learning_rate": 9.992560734686357e-05,
230
+ "loss": 2.484,
231
  "step": 18500
232
  },
233
  {
234
  "epoch": 0.55,
235
+ "learning_rate": 9.992141811789933e-05,
236
+ "loss": 2.5301,
237
  "step": 19000
238
  },
239
  {
240
+ "epoch": 0.56,
241
+ "learning_rate": 9.991711423747445e-05,
242
+ "loss": 2.4857,
243
  "step": 19500
244
  },
245
  {
246
  "epoch": 0.58,
247
+ "learning_rate": 9.991269571547339e-05,
248
+ "loss": 2.4958,
249
  "step": 20000
250
  },
251
  {
252
+ "epoch": 0.59,
253
+ "learning_rate": 9.99081625620439e-05,
254
+ "loss": 2.4757,
255
  "step": 20500
256
  },
257
  {
258
  "epoch": 0.61,
259
+ "learning_rate": 9.990351478759696e-05,
260
+ "loss": 2.544,
261
  "step": 21000
262
  },
263
  {
264
  "epoch": 0.62,
265
+ "learning_rate": 9.989875240280689e-05,
266
+ "loss": 2.4796,
267
  "step": 21500
268
  },
269
  {
270
  "epoch": 0.64,
271
+ "learning_rate": 9.989387541861111e-05,
272
+ "loss": 2.4968,
273
  "step": 22000
274
  },
275
  {
276
  "epoch": 0.65,
277
+ "learning_rate": 9.988888384621031e-05,
278
+ "loss": 2.4426,
279
  "step": 22500
280
  },
281
  {
282
  "epoch": 0.67,
283
+ "learning_rate": 9.988377769706834e-05,
284
+ "loss": 2.4471,
285
  "step": 23000
286
  },
287
  {
288
  "epoch": 0.68,
289
+ "learning_rate": 9.987855698291218e-05,
290
+ "loss": 2.5022,
291
  "step": 23500
292
  },
293
  {
294
+ "epoch": 0.69,
295
+ "learning_rate": 9.98732217157319e-05,
296
+ "loss": 2.5202,
297
  "step": 24000
298
  },
299
  {
300
  "epoch": 0.71,
301
+ "learning_rate": 9.98677719077807e-05,
302
+ "loss": 2.5562,
303
  "step": 24500
304
  },
305
  {
306
+ "epoch": 0.72,
307
+ "learning_rate": 9.986220757157482e-05,
308
+ "loss": 2.4888,
309
  "step": 25000
310
  },
311
  {
312
  "epoch": 0.74,
313
+ "learning_rate": 9.985652871989352e-05,
314
+ "loss": 2.5049,
315
  "step": 25500
316
  },
317
  {
318
+ "epoch": 0.75,
319
+ "learning_rate": 9.98507353657791e-05,
320
+ "loss": 2.4664,
321
  "step": 26000
322
  },
323
  {
324
  "epoch": 0.77,
325
+ "learning_rate": 9.984482752253677e-05,
326
+ "loss": 2.4528,
327
  "step": 26500
328
  },
329
  {
330
  "epoch": 0.78,
331
+ "learning_rate": 9.98388052037347e-05,
332
+ "loss": 2.4577,
333
  "step": 27000
334
  },
335
  {
336
  "epoch": 0.8,
337
+ "learning_rate": 9.983266842320402e-05,
338
+ "loss": 2.4889,
339
  "step": 27500
340
  },
341
  {
342
  "epoch": 0.81,
343
+ "learning_rate": 9.982641719503866e-05,
344
+ "loss": 2.4272,
345
  "step": 28000
346
  },
347
  {
348
+ "epoch": 0.82,
349
+ "learning_rate": 9.982005153359547e-05,
350
+ "loss": 2.4783,
351
  "step": 28500
352
  },
353
  {
354
  "epoch": 0.84,
355
+ "learning_rate": 9.981357145349406e-05,
356
+ "loss": 2.4795,
357
  "step": 29000
358
  },
359
  {
360
+ "epoch": 0.85,
361
+ "learning_rate": 9.98069769696168e-05,
362
+ "loss": 2.4807,
363
  "step": 29500
364
  },
365
  {
366
  "epoch": 0.87,
367
+ "learning_rate": 9.980026809710888e-05,
368
+ "loss": 2.4951,
369
  "step": 30000
370
+ },
371
+ {
372
+ "epoch": 0.88,
373
+ "learning_rate": 9.979344485137813e-05,
374
+ "loss": 2.5137,
375
+ "step": 30500
376
+ },
377
+ {
378
+ "epoch": 0.9,
379
+ "learning_rate": 9.978650724809511e-05,
380
+ "loss": 2.5249,
381
+ "step": 31000
382
+ },
383
+ {
384
+ "epoch": 0.91,
385
+ "learning_rate": 9.977945530319297e-05,
386
+ "loss": 2.4092,
387
+ "step": 31500
388
+ },
389
+ {
390
+ "epoch": 0.93,
391
+ "learning_rate": 9.977228903286746e-05,
392
+ "loss": 2.4978,
393
+ "step": 32000
394
+ },
395
+ {
396
+ "epoch": 0.94,
397
+ "learning_rate": 9.976500845357694e-05,
398
+ "loss": 2.4361,
399
+ "step": 32500
400
+ },
401
+ {
402
+ "epoch": 0.95,
403
+ "learning_rate": 9.975761358204227e-05,
404
+ "loss": 2.4774,
405
+ "step": 33000
406
+ },
407
+ {
408
+ "epoch": 0.97,
409
+ "learning_rate": 9.975010443524679e-05,
410
+ "loss": 2.4662,
411
+ "step": 33500
412
+ },
413
+ {
414
+ "epoch": 0.98,
415
+ "learning_rate": 9.974248103043629e-05,
416
+ "loss": 2.4252,
417
+ "step": 34000
418
+ },
419
+ {
420
+ "epoch": 1.0,
421
+ "learning_rate": 9.973474338511898e-05,
422
+ "loss": 2.4689,
423
+ "step": 34500
424
+ },
425
+ {
426
+ "epoch": 1.0,
427
+ "eval_bleu": 1.0,
428
+ "eval_brevity_penalty": 1.0,
429
+ "eval_length_ratio": 1.0,
430
+ "eval_loss": 2.3501155376434326,
431
+ "eval_precisions": [
432
+ 1.0,
433
+ 1.0,
434
+ 1.0,
435
+ 1.0
436
+ ],
437
+ "eval_reference_length": 1966592,
438
+ "eval_runtime": 3383.1867,
439
+ "eval_samples_per_second": 1.135,
440
+ "eval_steps_per_second": 1.135,
441
+ "eval_translation_length": 1966592,
442
+ "step": 34567
443
  }
444
  ],
445
  "logging_steps": 500,
446
+ "max_steps": 1037010,
447
+ "num_train_epochs": 30,
448
+ "save_steps": 500,
449
+ "total_flos": 7.966891375696282e+16,
450
  "trial_name": null,
451
  "trial_params": null
452
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac93b83e20cb3266b0249c2b9cb223a898cd7a840eb97f009fc035d490eabb4d
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7c8666ad1a059ed7265a7f4363a6ee3ab5fbfde75b0f7af29bc8baac32797e7
3
  size 4728