lilferrit committed
Commit e188f5a
1 Parent(s): b6887bd

End of training

Files changed (5)
  1. README.md +20 -5
  2. all_results.json +16 -0
  3. eval_results.json +10 -0
  4. train_results.json +9 -0
  5. trainer_state.json +830 -0
README.md CHANGED
@@ -1,13 +1,28 @@
 ---
+language:
+- de
+- en
 license: apache-2.0
 base_model: google/mt5-small
 tags:
 - generated_from_trainer
+datasets:
+- lilferrit/wmt14-short
 metrics:
 - bleu
 model-index:
 - name: ft-wmt14-5
-  results: []
+  results:
+  - task:
+      name: Translation
+      type: translation
+    dataset:
+      name: lilferrit/wmt14-short
+      type: lilferrit/wmt14-short
+    metrics:
+    - name: Bleu
+      type: bleu
+      value: 20.7584
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -15,11 +30,11 @@ should probably proofread and complete it, then remove this comment. -->
 
 # ft-wmt14-5
 
-This model is a fine-tuned version of [google/mt5-small](https://huggingface.co/google/mt5-small) on an unknown dataset.
+This model is a fine-tuned version of [google/mt5-small](https://huggingface.co/google/mt5-small) on the lilferrit/wmt14-short dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.0597
-- Bleu: 20.6113
-- Gen Len: 30.701
+- Loss: 2.0604
+- Bleu: 20.7584
+- Gen Len: 30.499
 
 ## Model description
 
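The updated card stops short of a usage example, so a minimal inference sketch follows. The Hub id `lilferrit/ft-wmt14-5` and the German-to-English direction are assumptions inferred from the card metadata (`language: de, en`; dataset `lilferrit/wmt14-short`), not something this commit confirms.

```python
# Minimal inference sketch; the repo id and translation direction are assumptions.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "lilferrit/ft-wmt14-5"  # assumed Hub id (user/model name from this repo)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

inputs = tokenizer("Das Haus ist wunderbar.", return_tensors="pt")
outputs = model.generate(**inputs, num_beams=5, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```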
all_results.json ADDED
@@ -0,0 +1,16 @@
+{
+    "epoch": 2.7777777777777777,
+    "eval_bleu": 20.7584,
+    "eval_gen_len": 30.499,
+    "eval_loss": 2.0603742599487305,
+    "eval_runtime": 371.1712,
+    "eval_samples": 3000,
+    "eval_samples_per_second": 8.083,
+    "eval_steps_per_second": 1.01,
+    "total_flos": 1.4240580791795712e+17,
+    "train_loss": 0.5475473999023438,
+    "train_runtime": 14821.2356,
+    "train_samples": 576000,
+    "train_samples_per_second": 107.953,
+    "train_steps_per_second": 6.747
+}
eval_results.json ADDED
@@ -0,0 +1,10 @@
+{
+    "epoch": 2.7777777777777777,
+    "eval_bleu": 20.7584,
+    "eval_gen_len": 30.499,
+    "eval_loss": 2.0603742599487305,
+    "eval_runtime": 371.1712,
+    "eval_samples": 3000,
+    "eval_samples_per_second": 8.083,
+    "eval_steps_per_second": 1.01
+}
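The `eval_bleu` value above is a corpus-level BLEU score on the 3,000-sample evaluation set. The metric code is not part of this commit; a common way to compute a score of this kind is the `evaluate` library's sacrebleu wrapper, sketched below with placeholder sentences.

```python
# Sketch of a typical corpus BLEU computation (evaluate + sacrebleu);
# the actual compute_metrics behind eval_bleu is not included in this commit.
import evaluate

bleu = evaluate.load("sacrebleu")
predictions = ["The house is wonderful."]   # model outputs (placeholders)
references = [["The house is wonderful."]]  # one list of references per prediction
result = bleu.compute(predictions=predictions, references=references)
print(round(result["score"], 4))            # same 0-100 scale as eval_bleu
```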
train_results.json ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 2.7777777777777777,
+    "total_flos": 1.4240580791795712e+17,
+    "train_loss": 0.5475473999023438,
+    "train_runtime": 14821.2356,
+    "train_samples": 576000,
+    "train_samples_per_second": 107.953,
+    "train_steps_per_second": 6.747
+}
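As a quick sanity check, the throughput figures above are internally consistent: roughly 2.78 epochs over 576,000 samples in 14,821 seconds, with the 100,000 optimizer steps reported in trainer_state.json below.

```python
# Consistency check of the reported throughput (values copied from
# train_results.json and trainer_state.json; formulas are samples or steps per runtime).
epoch = 2.7777777777777777
train_samples = 576_000
train_runtime = 14821.2356
global_steps = 100_000

print(round(epoch * train_samples / train_runtime, 3))  # 107.953 -> train_samples_per_second
print(round(global_steps / train_runtime, 3))           # 6.747   -> train_steps_per_second
```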
trainer_state.json ADDED
@@ -0,0 +1,830 @@
+{
+  "best_metric": 20.7584,
+  "best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14-5/checkpoint-90000",
+  "epoch": 2.7777777777777777,
+  "eval_steps": 10000,
+  "global_step": 100000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.027777777777777776,
+      "grad_norm": 1.9314790964126587,
+      "learning_rate": 0.0005,
+      "loss": 3.3589,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05555555555555555,
+      "grad_norm": 1.7348469495773315,
+      "learning_rate": 0.0005,
+      "loss": 2.5263,
+      "step": 2000
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 1.9181748628616333,
+      "learning_rate": 0.0005,
+      "loss": 2.3365,
+      "step": 3000
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 1.6642646789550781,
+      "learning_rate": 0.0005,
+      "loss": 2.2207,
+      "step": 4000
+    },
+    {
+      "epoch": 0.1388888888888889,
+      "grad_norm": 1.1876742839813232,
+      "learning_rate": 0.0005,
+      "loss": 2.1363,
+      "step": 5000
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.567658543586731,
+      "learning_rate": 0.0005,
+      "loss": 2.0733,
+      "step": 6000
+    },
+    {
+      "epoch": 0.19444444444444445,
+      "grad_norm": 1.2552471160888672,
+      "learning_rate": 0.0005,
+      "loss": 2.0262,
+      "step": 7000
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 1.049357533454895,
+      "learning_rate": 0.0005,
+      "loss": 1.9775,
+      "step": 8000
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.303145170211792,
+      "learning_rate": 0.0005,
+      "loss": 1.9412,
+      "step": 9000
+    },
+    {
+      "epoch": 0.2777777777777778,
+      "grad_norm": 1.0213723182678223,
+      "learning_rate": 0.0005,
+      "loss": 1.9166,
+      "step": 10000
+    },
+    {
+      "epoch": 0.2777777777777778,
+      "eval_bleu": 15.8119,
+      "eval_gen_len": 32.097,
+      "eval_loss": 2.31050968170166,
+      "eval_runtime": 410.6001,
+      "eval_samples_per_second": 7.306,
+      "eval_steps_per_second": 0.913,
+      "step": 10000
+    },
+    {
+      "epoch": 0.3055555555555556,
+      "grad_norm": 1.2851905822753906,
+      "learning_rate": 0.0005,
+      "loss": 1.8878,
+      "step": 11000
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.8447160720825195,
+      "learning_rate": 0.0005,
+      "loss": 1.8492,
+      "step": 12000
+    },
+    {
+      "epoch": 0.3611111111111111,
+      "grad_norm": 1.1516064405441284,
+      "learning_rate": 0.0005,
+      "loss": 1.8309,
+      "step": 13000
+    },
+    {
+      "epoch": 0.3888888888888889,
+      "grad_norm": 1.0370670557022095,
+      "learning_rate": 0.0005,
+      "loss": 1.8057,
+      "step": 14000
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 1.1649495363235474,
+      "learning_rate": 0.0005,
+      "loss": 1.7867,
+      "step": 15000
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.2666045427322388,
+      "learning_rate": 0.0005,
+      "loss": 1.7679,
+      "step": 16000
+    },
+    {
+      "epoch": 0.4722222222222222,
+      "grad_norm": 1.0923264026641846,
+      "learning_rate": 0.0005,
+      "loss": 1.7563,
+      "step": 17000
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.560994029045105,
+      "learning_rate": 0.0005,
+      "loss": 1.7342,
+      "step": 18000
+    },
+    {
+      "epoch": 0.5277777777777778,
+      "grad_norm": 0.9684827327728271,
+      "learning_rate": 0.0005,
+      "loss": 1.7228,
+      "step": 19000
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 0.9182453751564026,
+      "learning_rate": 0.0005,
+      "loss": 1.7184,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "eval_bleu": 17.5903,
+      "eval_gen_len": 31.1153,
+      "eval_loss": 2.19934344291687,
+      "eval_runtime": 393.3017,
+      "eval_samples_per_second": 7.628,
+      "eval_steps_per_second": 0.953,
+      "step": 20000
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 0.8953577280044556,
+      "learning_rate": 0.0005,
+      "loss": 1.7042,
+      "step": 21000
+    },
+    {
+      "epoch": 0.6111111111111112,
+      "grad_norm": 0.9418250918388367,
+      "learning_rate": 0.0005,
+      "loss": 1.683,
+      "step": 22000
+    },
+    {
+      "epoch": 0.6388888888888888,
+      "grad_norm": 0.8577601909637451,
+      "learning_rate": 0.0005,
+      "loss": 1.6799,
+      "step": 23000
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.9786076545715332,
+      "learning_rate": 0.0005,
+      "loss": 1.6675,
+      "step": 24000
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 0.9262654781341553,
+      "learning_rate": 0.0005,
+      "loss": 1.6499,
+      "step": 25000
+    },
+    {
+      "epoch": 0.7222222222222222,
+      "grad_norm": 0.8759564757347107,
+      "learning_rate": 0.0005,
+      "loss": 1.6468,
+      "step": 26000
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.0495752096176147,
+      "learning_rate": 0.0005,
+      "loss": 1.6285,
+      "step": 27000
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 1.092642068862915,
+      "learning_rate": 0.0005,
+      "loss": 1.6276,
+      "step": 28000
+    },
+    {
+      "epoch": 0.8055555555555556,
+      "grad_norm": 0.8775661587715149,
+      "learning_rate": 0.0005,
+      "loss": 1.6172,
+      "step": 29000
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.8970679044723511,
+      "learning_rate": 0.0005,
+      "loss": 1.6061,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "eval_bleu": 18.9604,
+      "eval_gen_len": 30.327,
+      "eval_loss": 2.1379551887512207,
+      "eval_runtime": 380.095,
+      "eval_samples_per_second": 7.893,
+      "eval_steps_per_second": 0.987,
+      "step": 30000
+    },
+    {
+      "epoch": 0.8611111111111112,
+      "grad_norm": 0.9657310247421265,
+      "learning_rate": 0.0005,
+      "loss": 1.5959,
+      "step": 31000
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.8748376369476318,
+      "learning_rate": 0.0005,
+      "loss": 1.5908,
+      "step": 32000
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 0.8462302088737488,
+      "learning_rate": 0.0005,
+      "loss": 1.5845,
+      "step": 33000
+    },
+    {
+      "epoch": 0.9444444444444444,
+      "grad_norm": 0.9005241394042969,
+      "learning_rate": 0.0005,
+      "loss": 1.5699,
+      "step": 34000
+    },
+    {
+      "epoch": 0.9722222222222222,
+      "grad_norm": 0.9596630930900574,
+      "learning_rate": 0.0005,
+      "loss": 1.5752,
+      "step": 35000
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.8307533860206604,
+      "learning_rate": 0.0005,
+      "loss": 1.5634,
+      "step": 36000
+    },
+    {
+      "epoch": 1.0277777777777777,
+      "grad_norm": 0.9918788075447083,
+      "learning_rate": 0.0005,
+      "loss": 1.5117,
+      "step": 37000
+    },
+    {
+      "epoch": 1.0555555555555556,
+      "grad_norm": 0.9118058085441589,
+      "learning_rate": 0.0005,
+      "loss": 1.5023,
+      "step": 38000
+    },
+    {
+      "epoch": 1.0833333333333333,
+      "grad_norm": 0.7213552594184875,
+      "learning_rate": 0.0005,
+      "loss": 1.5087,
+      "step": 39000
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 1.0255305767059326,
+      "learning_rate": 0.0005,
+      "loss": 1.516,
+      "step": 40000
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "eval_bleu": 19.1444,
+      "eval_gen_len": 30.2727,
+      "eval_loss": 2.1365692615509033,
+      "eval_runtime": 377.1737,
+      "eval_samples_per_second": 7.954,
+      "eval_steps_per_second": 0.994,
+      "step": 40000
+    },
+    {
+      "epoch": 1.1388888888888888,
+      "grad_norm": 0.8766499161720276,
+      "learning_rate": 0.0005,
+      "loss": 1.5096,
+      "step": 41000
+    },
+    {
+      "epoch": 1.1666666666666667,
+      "grad_norm": 1.1786612272262573,
+      "learning_rate": 0.0005,
+      "loss": 1.4982,
+      "step": 42000
+    },
+    {
+      "epoch": 1.1944444444444444,
+      "grad_norm": 1.011268973350525,
+      "learning_rate": 0.0005,
+      "loss": 1.5013,
+      "step": 43000
+    },
+    {
+      "epoch": 1.2222222222222223,
+      "grad_norm": 1.0863969326019287,
+      "learning_rate": 0.0005,
+      "loss": 1.4878,
+      "step": 44000
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.9729832410812378,
+      "learning_rate": 0.0005,
+      "loss": 1.4922,
+      "step": 45000
+    },
+    {
+      "epoch": 1.2777777777777777,
+      "grad_norm": 1.3476896286010742,
+      "learning_rate": 0.0005,
+      "loss": 1.4876,
+      "step": 46000
+    },
+    {
+      "epoch": 1.3055555555555556,
+      "grad_norm": 0.8493963479995728,
+      "learning_rate": 0.0005,
+      "loss": 1.4823,
+      "step": 47000
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 1.0311123132705688,
+      "learning_rate": 0.0005,
+      "loss": 1.4739,
+      "step": 48000
+    },
+    {
+      "epoch": 1.3611111111111112,
+      "grad_norm": 1.259581446647644,
+      "learning_rate": 0.0005,
+      "loss": 1.4747,
+      "step": 49000
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "grad_norm": 1.1934195756912231,
+      "learning_rate": 0.0005,
+      "loss": 1.4675,
+      "step": 50000
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "eval_bleu": 19.7588,
+      "eval_gen_len": 30.1127,
+      "eval_loss": 2.120835781097412,
+      "eval_runtime": 372.4281,
+      "eval_samples_per_second": 8.055,
+      "eval_steps_per_second": 1.007,
+      "step": 50000
+    },
+    {
+      "epoch": 1.4166666666666667,
+      "grad_norm": 1.1824595928192139,
+      "learning_rate": 0.0005,
+      "loss": 1.4659,
+      "step": 51000
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 1.1661032438278198,
+      "learning_rate": 0.0005,
+      "loss": 1.4737,
+      "step": 52000
+    },
+    {
+      "epoch": 1.4722222222222223,
+      "grad_norm": 0.7856634259223938,
+      "learning_rate": 0.0005,
+      "loss": 1.4595,
+      "step": 53000
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.9908609986305237,
+      "learning_rate": 0.0005,
+      "loss": 1.4656,
+      "step": 54000
+    },
+    {
+      "epoch": 1.5277777777777777,
+      "grad_norm": 0.9270644187927246,
+      "learning_rate": 0.0005,
+      "loss": 1.4524,
+      "step": 55000
+    },
+    {
+      "epoch": 1.5555555555555556,
+      "grad_norm": 0.9910904169082642,
+      "learning_rate": 0.0005,
+      "loss": 1.4453,
+      "step": 56000
+    },
+    {
+      "epoch": 1.5833333333333335,
+      "grad_norm": 1.0300639867782593,
+      "learning_rate": 0.0005,
+      "loss": 1.451,
+      "step": 57000
+    },
+    {
+      "epoch": 1.6111111111111112,
+      "grad_norm": 0.809105396270752,
+      "learning_rate": 0.0005,
+      "loss": 1.444,
+      "step": 58000
+    },
+    {
+      "epoch": 1.6388888888888888,
+      "grad_norm": 0.7915866374969482,
+      "learning_rate": 0.0005,
+      "loss": 1.4421,
+      "step": 59000
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.9778928756713867,
+      "learning_rate": 0.0005,
+      "loss": 1.4416,
+      "step": 60000
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "eval_bleu": 19.9263,
+      "eval_gen_len": 30.4463,
+      "eval_loss": 2.088862657546997,
+      "eval_runtime": 383.2772,
+      "eval_samples_per_second": 7.827,
+      "eval_steps_per_second": 0.978,
+      "step": 60000
+    },
+    {
+      "epoch": 1.6944444444444444,
+      "grad_norm": 0.8484209775924683,
+      "learning_rate": 0.0005,
+      "loss": 1.4313,
+      "step": 61000
+    },
+    {
+      "epoch": 1.7222222222222223,
+      "grad_norm": 0.8703031539916992,
+      "learning_rate": 0.0005,
+      "loss": 1.4405,
+      "step": 62000
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.4096006155014038,
+      "learning_rate": 0.0005,
+      "loss": 1.4375,
+      "step": 63000
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 0.9177774786949158,
+      "learning_rate": 0.0005,
+      "loss": 1.4262,
+      "step": 64000
+    },
+    {
+      "epoch": 1.8055555555555556,
+      "grad_norm": 1.2332441806793213,
+      "learning_rate": 0.0005,
+      "loss": 1.4233,
+      "step": 65000
+    },
+    {
+      "epoch": 1.8333333333333335,
+      "grad_norm": 0.8750177621841431,
+      "learning_rate": 0.0005,
+      "loss": 1.4287,
+      "step": 66000
+    },
+    {
+      "epoch": 1.8611111111111112,
+      "grad_norm": 0.6736052632331848,
+      "learning_rate": 0.0005,
+      "loss": 1.4231,
+      "step": 67000
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": 0.7802408933639526,
+      "learning_rate": 0.0005,
+      "loss": 1.4106,
+      "step": 68000
+    },
+    {
+      "epoch": 1.9166666666666665,
+      "grad_norm": 1.1860034465789795,
+      "learning_rate": 0.0005,
+      "loss": 1.4121,
+      "step": 69000
+    },
+    {
+      "epoch": 1.9444444444444444,
+      "grad_norm": 0.926054835319519,
+      "learning_rate": 0.0005,
+      "loss": 1.4111,
+      "step": 70000
+    },
+    {
+      "epoch": 1.9444444444444444,
+      "eval_bleu": 20.3323,
+      "eval_gen_len": 30.1207,
+      "eval_loss": 2.079472541809082,
+      "eval_runtime": 371.9755,
+      "eval_samples_per_second": 8.065,
+      "eval_steps_per_second": 1.008,
+      "step": 70000
+    },
+    {
+      "epoch": 1.9722222222222223,
+      "grad_norm": 1.1691533327102661,
+      "learning_rate": 0.0005,
+      "loss": 1.407,
+      "step": 71000
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.9077666997909546,
+      "learning_rate": 0.0005,
+      "loss": 1.4051,
+      "step": 72000
+    },
+    {
+      "epoch": 2.0277777777777777,
+      "grad_norm": 0.9149623513221741,
+      "learning_rate": 0.0005,
+      "loss": 1.3517,
+      "step": 73000
+    },
+    {
+      "epoch": 2.0555555555555554,
+      "grad_norm": 1.0772947072982788,
+      "learning_rate": 0.0005,
+      "loss": 1.3624,
+      "step": 74000
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": 0.7283540964126587,
+      "learning_rate": 0.0005,
+      "loss": 1.355,
+      "step": 75000
+    },
+    {
+      "epoch": 2.111111111111111,
+      "grad_norm": 0.7279065847396851,
+      "learning_rate": 0.0005,
+      "loss": 1.3526,
+      "step": 76000
+    },
+    {
+      "epoch": 2.138888888888889,
+      "grad_norm": 1.2707905769348145,
+      "learning_rate": 0.0005,
+      "loss": 1.3535,
+      "step": 77000
+    },
+    {
+      "epoch": 2.1666666666666665,
+      "grad_norm": 0.9000493288040161,
+      "learning_rate": 0.0005,
+      "loss": 1.3519,
+      "step": 78000
+    },
+    {
+      "epoch": 2.1944444444444446,
+      "grad_norm": 1.043967843055725,
+      "learning_rate": 0.0005,
+      "loss": 1.3567,
+      "step": 79000
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 1.1248853206634521,
+      "learning_rate": 0.0005,
+      "loss": 1.3603,
+      "step": 80000
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "eval_bleu": 20.5373,
+      "eval_gen_len": 30.5943,
+      "eval_loss": 2.085047960281372,
+      "eval_runtime": 373.0705,
+      "eval_samples_per_second": 8.041,
+      "eval_steps_per_second": 1.005,
+      "step": 80000
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.056221842765808,
+      "learning_rate": 0.0005,
+      "loss": 1.3657,
+      "step": 81000
+    },
+    {
+      "epoch": 2.2777777777777777,
+      "grad_norm": 0.9176587462425232,
+      "learning_rate": 0.0005,
+      "loss": 1.3572,
+      "step": 82000
+    },
+    {
+      "epoch": 2.3055555555555554,
+      "grad_norm": 1.0105085372924805,
+      "learning_rate": 0.0005,
+      "loss": 1.3498,
+      "step": 83000
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": 1.1589380502700806,
+      "learning_rate": 0.0005,
+      "loss": 1.3567,
+      "step": 84000
+    },
+    {
+      "epoch": 2.361111111111111,
+      "grad_norm": 0.7733587622642517,
+      "learning_rate": 0.0005,
+      "loss": 1.3533,
+      "step": 85000
+    },
+    {
+      "epoch": 2.388888888888889,
+      "grad_norm": 1.036777138710022,
+      "learning_rate": 0.0005,
+      "loss": 1.3469,
+      "step": 86000
+    },
+    {
+      "epoch": 2.4166666666666665,
+      "grad_norm": 1.4935026168823242,
+      "learning_rate": 0.0005,
+      "loss": 1.3469,
+      "step": 87000
+    },
+    {
+      "epoch": 2.4444444444444446,
+      "grad_norm": 0.864630937576294,
+      "learning_rate": 0.0005,
+      "loss": 1.3506,
+      "step": 88000
+    },
+    {
+      "epoch": 2.4722222222222223,
+      "grad_norm": 0.8495751619338989,
+      "learning_rate": 0.0005,
+      "loss": 1.3408,
+      "step": 89000
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.0840762853622437,
+      "learning_rate": 0.0005,
+      "loss": 1.3378,
+      "step": 90000
+    },
+    {
+      "epoch": 2.5,
+      "eval_bleu": 20.7584,
+      "eval_gen_len": 30.499,
+      "eval_loss": 2.0603742599487305,
+      "eval_runtime": 368.0992,
+      "eval_samples_per_second": 8.15,
+      "eval_steps_per_second": 1.019,
+      "step": 90000
+    },
+    {
+      "epoch": 2.5277777777777777,
+      "grad_norm": 0.7769622802734375,
+      "learning_rate": 0.0005,
+      "loss": 1.3409,
+      "step": 91000
+    },
+    {
+      "epoch": 2.5555555555555554,
+      "grad_norm": 1.049972414970398,
+      "learning_rate": 0.0005,
+      "loss": 1.3443,
+      "step": 92000
+    },
+    {
+      "epoch": 2.5833333333333335,
+      "grad_norm": 0.965621292591095,
+      "learning_rate": 0.0005,
+      "loss": 1.342,
+      "step": 93000
+    },
+    {
+      "epoch": 2.611111111111111,
+      "grad_norm": 0.8234182000160217,
+      "learning_rate": 0.0005,
+      "loss": 1.3297,
+      "step": 94000
+    },
+    {
+      "epoch": 2.638888888888889,
+      "grad_norm": 0.9464855790138245,
+      "learning_rate": 0.0005,
+      "loss": 1.3345,
+      "step": 95000
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.987382709980011,
+      "learning_rate": 0.0005,
+      "loss": 1.3284,
+      "step": 96000
+    },
+    {
+      "epoch": 2.6944444444444446,
+      "grad_norm": 0.6439863443374634,
+      "learning_rate": 0.0005,
+      "loss": 1.3285,
+      "step": 97000
+    },
+    {
+      "epoch": 2.7222222222222223,
+      "grad_norm": 0.8853390216827393,
+      "learning_rate": 0.0005,
+      "loss": 1.3339,
+      "step": 98000
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.7582658529281616,
+      "learning_rate": 0.0005,
+      "loss": 1.3281,
+      "step": 99000
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "grad_norm": 0.9061763882637024,
+      "learning_rate": 0.0005,
+      "loss": 1.3381,
+      "step": 100000
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "eval_bleu": 20.6113,
+      "eval_gen_len": 30.701,
+      "eval_loss": 2.059664726257324,
+      "eval_runtime": 371.2241,
+      "eval_samples_per_second": 8.081,
+      "eval_steps_per_second": 1.01,
+      "step": 100000
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "step": 100000,
+      "total_flos": 1.4240580791795712e+17,
+      "train_loss": 0.5475473999023438,
+      "train_runtime": 14821.2356,
+      "train_samples_per_second": 107.953,
+      "train_steps_per_second": 6.747
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 100000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 10000,
+  "total_flos": 1.4240580791795712e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
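trainer_state.json stores the full training log, including one evaluation entry every 10,000 steps. A short sketch for pulling the BLEU curve out of it (field names as in the log_history entries above):

```python
# Sketch: read the evaluation history out of trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

evals = [(e["step"], e["eval_bleu"]) for e in state["log_history"] if "eval_bleu" in e]
for step, score in evals:
    print(f"step {step:>6}: BLEU {score}")

best_step, best_score = max(evals, key=lambda pair: pair[1])
print("best:", best_step, best_score)  # step 90000, BLEU 20.7584 (matches best_metric)
```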