JulienRPA commited on
Commit
0b2eb0e
1 Parent(s): bf0ba28

Training in progress, step 2000

Browse files
added_tokens.json CHANGED
The diff for this file is too large to render. See raw diff
 
config.json CHANGED
@@ -84,7 +84,7 @@
84
  "typical_p": 1.0,
85
  "use_bfloat16": false,
86
  "use_cache": true,
87
- "vocab_size": 34522
88
  },
89
  "decoder_start_token_id": 101,
90
  "early_stopping": true,
@@ -167,7 +167,7 @@
167
  "typical_p": 1.0,
168
  "use_bfloat16": false,
169
  "use_cache": true,
170
- "vocab_size": 34522
171
  },
172
  "eos_token_id": 102,
173
  "is_encoder_decoder": true,
@@ -178,5 +178,5 @@
178
  "pad_token_id": 0,
179
  "torch_dtype": "float32",
180
  "transformers_version": null,
181
- "vocab_size": 34522
182
  }
 
84
  "typical_p": 1.0,
85
  "use_bfloat16": false,
86
  "use_cache": true,
87
+ "vocab_size": 32608
88
  },
89
  "decoder_start_token_id": 101,
90
  "early_stopping": true,
 
167
  "typical_p": 1.0,
168
  "use_bfloat16": false,
169
  "use_cache": true,
170
+ "vocab_size": 32608
171
  },
172
  "eos_token_id": 102,
173
  "is_encoder_decoder": true,
 
178
  "pad_token_id": 0,
179
  "torch_dtype": "float32",
180
  "transformers_version": null,
181
+ "vocab_size": 32608
182
  }
last-checkpoint/added_tokens.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/config.json CHANGED
@@ -84,7 +84,7 @@
84
  "typical_p": 1.0,
85
  "use_bfloat16": false,
86
  "use_cache": true,
87
- "vocab_size": 34522
88
  },
89
  "decoder_start_token_id": 101,
90
  "early_stopping": true,
@@ -167,7 +167,7 @@
167
  "typical_p": 1.0,
168
  "use_bfloat16": false,
169
  "use_cache": true,
170
- "vocab_size": 34522
171
  },
172
  "eos_token_id": 102,
173
  "is_encoder_decoder": true,
@@ -178,5 +178,5 @@
178
  "pad_token_id": 0,
179
  "torch_dtype": "float32",
180
  "transformers_version": null,
181
- "vocab_size": 34522
182
  }
 
84
  "typical_p": 1.0,
85
  "use_bfloat16": false,
86
  "use_cache": true,
87
+ "vocab_size": 32608
88
  },
89
  "decoder_start_token_id": 101,
90
  "early_stopping": true,
 
167
  "typical_p": 1.0,
168
  "use_bfloat16": false,
169
  "use_cache": true,
170
+ "vocab_size": 32608
171
  },
172
  "eos_token_id": 102,
173
  "is_encoder_decoder": true,
 
178
  "pad_token_id": 0,
179
  "torch_dtype": "float32",
180
  "transformers_version": null,
181
+ "vocab_size": 32608
182
  }
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68eb91b9bf98a03eaaadcb8bd0b58614a7003d8477069f541ff537e0ec0bc34b
3
- size 2023671531
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95248b59e7dac0d789b6b077fbe09ce5175324e63b5e9cdd37a3a30c2cde9027
3
+ size 2000137067
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43a9f23b83b30496ac553a4c926b305dc1444595fad21ac2439436b871476ca3
3
- size 1014236857
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4238518d766751ead10635c197669c039a8c46869571a7f8fc96716f256600df
3
+ size 1002469625
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a515c4e1bc2c8452db42ea9dca43bd2d9ef7f8fe92b3a49a30af214963f24ac
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dda8e14cf65113c4145f87b0ecbda755c0d32ab5bbb56548e3c45d7ecd14a2c9
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2fc1f775554587532b2ba4f009351aacf9c868152217b1b6eb9954f3a42aa4b
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c68e8f52d353c005549c69e33b5a29ace5f59d7300d7ea3a17b4a529d455d0
3
  size 627
last-checkpoint/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.289308176100629,
5
- "global_step": 9000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -10,591 +10,138 @@
10
  {
11
  "epoch": 0.07,
12
  "learning_rate": 2.0000000000000003e-06,
13
- "loss": 86.0421,
14
  "step": 100
15
  },
16
  {
17
  "epoch": 0.14,
18
  "learning_rate": 4.000000000000001e-06,
19
- "loss": 51.6706,
20
  "step": 200
21
  },
22
  {
23
  "epoch": 0.21,
24
  "learning_rate": 6e-06,
25
- "loss": 41.1349,
26
  "step": 300
27
  },
28
  {
29
  "epoch": 0.28,
30
  "learning_rate": 8.000000000000001e-06,
31
- "loss": 36.5061,
32
  "step": 400
33
  },
34
  {
35
  "epoch": 0.35,
36
  "learning_rate": 1e-05,
37
- "loss": 33.1858,
38
  "step": 500
39
  },
40
  {
41
  "epoch": 0.42,
42
  "learning_rate": 1.2e-05,
43
- "loss": 30.5206,
44
  "step": 600
45
  },
46
  {
47
  "epoch": 0.49,
48
  "learning_rate": 1.4000000000000001e-05,
49
- "loss": 28.0073,
50
  "step": 700
51
  },
52
  {
53
  "epoch": 0.56,
54
  "learning_rate": 1.6000000000000003e-05,
55
- "loss": 26.1939,
56
  "step": 800
57
  },
58
  {
59
  "epoch": 0.63,
60
  "learning_rate": 1.8e-05,
61
- "loss": 24.3465,
62
  "step": 900
63
  },
64
  {
65
  "epoch": 0.7,
66
  "learning_rate": 2e-05,
67
- "loss": 22.5759,
68
  "step": 1000
69
  },
70
  {
71
  "epoch": 0.77,
72
  "learning_rate": 2.2000000000000003e-05,
73
- "loss": 20.9294,
74
  "step": 1100
75
  },
76
  {
77
  "epoch": 0.84,
78
  "learning_rate": 2.4e-05,
79
- "loss": 19.3762,
80
  "step": 1200
81
  },
82
  {
83
  "epoch": 0.91,
84
  "learning_rate": 2.6000000000000002e-05,
85
- "loss": 17.72,
86
  "step": 1300
87
  },
88
  {
89
  "epoch": 0.98,
90
  "learning_rate": 2.8000000000000003e-05,
91
- "loss": 15.7901,
92
  "step": 1400
93
  },
94
  {
95
  "epoch": 1.05,
96
  "learning_rate": 3e-05,
97
- "loss": 14.0008,
98
  "step": 1500
99
  },
100
  {
101
  "epoch": 1.12,
102
  "learning_rate": 3.2000000000000005e-05,
103
- "loss": 12.3777,
104
  "step": 1600
105
  },
106
  {
107
  "epoch": 1.19,
108
  "learning_rate": 3.4000000000000007e-05,
109
- "loss": 10.7261,
110
  "step": 1700
111
  },
112
  {
113
  "epoch": 1.26,
114
  "learning_rate": 3.6e-05,
115
- "loss": 9.1024,
116
  "step": 1800
117
  },
118
  {
119
  "epoch": 1.33,
120
  "learning_rate": 3.8e-05,
121
- "loss": 7.4676,
122
  "step": 1900
123
  },
124
  {
125
  "epoch": 1.4,
126
  "learning_rate": 4e-05,
127
- "loss": 6.6044,
128
  "step": 2000
129
  },
130
  {
131
  "epoch": 1.4,
132
- "eval_bleu": 3.8045,
133
  "eval_em": 0.0,
134
- "eval_gen_len": 158.8473,
135
- "eval_loss": 6.171305179595947,
136
- "eval_runtime": 1556.9786,
137
- "eval_samples_per_second": 0.774,
138
- "eval_steps_per_second": 0.097,
139
  "step": 2000
140
- },
141
- {
142
- "epoch": 1.47,
143
- "learning_rate": 4.2e-05,
144
- "loss": 6.0941,
145
- "step": 2100
146
- },
147
- {
148
- "epoch": 1.54,
149
- "learning_rate": 4.4000000000000006e-05,
150
- "loss": 5.6741,
151
- "step": 2200
152
- },
153
- {
154
- "epoch": 1.61,
155
- "learning_rate": 4.600000000000001e-05,
156
- "loss": 5.4757,
157
- "step": 2300
158
- },
159
- {
160
- "epoch": 1.68,
161
- "learning_rate": 4.8e-05,
162
- "loss": 5.242,
163
- "step": 2400
164
- },
165
- {
166
- "epoch": 1.75,
167
- "learning_rate": 5e-05,
168
- "loss": 5.0108,
169
- "step": 2500
170
- },
171
- {
172
- "epoch": 1.82,
173
- "learning_rate": 4.944121591417077e-05,
174
- "loss": 4.8595,
175
- "step": 2600
176
- },
177
- {
178
- "epoch": 1.89,
179
- "learning_rate": 4.888243182834153e-05,
180
- "loss": 4.695,
181
- "step": 2700
182
- },
183
- {
184
- "epoch": 1.96,
185
- "learning_rate": 4.8323647742512295e-05,
186
- "loss": 4.5706,
187
- "step": 2800
188
- },
189
- {
190
- "epoch": 2.03,
191
- "learning_rate": 4.776486365668306e-05,
192
- "loss": 4.2498,
193
- "step": 2900
194
- },
195
- {
196
- "epoch": 2.1,
197
- "learning_rate": 4.720607957085382e-05,
198
- "loss": 4.1223,
199
- "step": 3000
200
- },
201
- {
202
- "epoch": 2.17,
203
- "learning_rate": 4.664729548502459e-05,
204
- "loss": 4.0181,
205
- "step": 3100
206
- },
207
- {
208
- "epoch": 2.24,
209
- "learning_rate": 4.6088511399195353e-05,
210
- "loss": 3.8722,
211
- "step": 3200
212
- },
213
- {
214
- "epoch": 2.31,
215
- "learning_rate": 4.552972731336611e-05,
216
- "loss": 3.7786,
217
- "step": 3300
218
- },
219
- {
220
- "epoch": 2.38,
221
- "learning_rate": 4.497094322753688e-05,
222
- "loss": 3.6403,
223
- "step": 3400
224
- },
225
- {
226
- "epoch": 2.45,
227
- "learning_rate": 4.4412159141707646e-05,
228
- "loss": 3.5437,
229
- "step": 3500
230
- },
231
- {
232
- "epoch": 2.52,
233
- "learning_rate": 4.385337505587841e-05,
234
- "loss": 3.389,
235
- "step": 3600
236
- },
237
- {
238
- "epoch": 2.59,
239
- "learning_rate": 4.329459097004918e-05,
240
- "loss": 3.2395,
241
- "step": 3700
242
- },
243
- {
244
- "epoch": 2.66,
245
- "learning_rate": 4.2735806884219945e-05,
246
- "loss": 3.1786,
247
- "step": 3800
248
- },
249
- {
250
- "epoch": 2.73,
251
- "learning_rate": 4.2177022798390704e-05,
252
- "loss": 3.0657,
253
- "step": 3900
254
- },
255
- {
256
- "epoch": 2.8,
257
- "learning_rate": 4.161823871256147e-05,
258
- "loss": 3.032,
259
- "step": 4000
260
- },
261
- {
262
- "epoch": 2.8,
263
- "eval_bleu": 27.701,
264
- "eval_em": 0.0,
265
- "eval_gen_len": 33.9568,
266
- "eval_loss": 2.904534339904785,
267
- "eval_runtime": 297.5953,
268
- "eval_samples_per_second": 4.049,
269
- "eval_steps_per_second": 0.507,
270
- "step": 4000
271
- },
272
- {
273
- "epoch": 2.87,
274
- "learning_rate": 4.105945462673223e-05,
275
- "loss": 2.8755,
276
- "step": 4100
277
- },
278
- {
279
- "epoch": 2.94,
280
- "learning_rate": 4.0500670540903e-05,
281
- "loss": 2.8396,
282
- "step": 4200
283
- },
284
- {
285
- "epoch": 3.0,
286
- "learning_rate": 3.994188645507376e-05,
287
- "loss": 2.7454,
288
- "step": 4300
289
- },
290
- {
291
- "epoch": 3.07,
292
- "learning_rate": 3.938310236924452e-05,
293
- "loss": 2.5218,
294
- "step": 4400
295
- },
296
- {
297
- "epoch": 3.14,
298
- "learning_rate": 3.882431828341529e-05,
299
- "loss": 2.4895,
300
- "step": 4500
301
- },
302
- {
303
- "epoch": 3.21,
304
- "learning_rate": 3.8265534197586055e-05,
305
- "loss": 2.4554,
306
- "step": 4600
307
- },
308
- {
309
- "epoch": 3.28,
310
- "learning_rate": 3.7706750111756815e-05,
311
- "loss": 2.3573,
312
- "step": 4700
313
- },
314
- {
315
- "epoch": 3.35,
316
- "learning_rate": 3.714796602592758e-05,
317
- "loss": 2.2979,
318
- "step": 4800
319
- },
320
- {
321
- "epoch": 3.42,
322
- "learning_rate": 3.658918194009835e-05,
323
- "loss": 2.1874,
324
- "step": 4900
325
- },
326
- {
327
- "epoch": 3.49,
328
- "learning_rate": 3.603039785426911e-05,
329
- "loss": 2.1803,
330
- "step": 5000
331
- },
332
- {
333
- "epoch": 3.56,
334
- "learning_rate": 3.5471613768439874e-05,
335
- "loss": 2.1553,
336
- "step": 5100
337
- },
338
- {
339
- "epoch": 3.63,
340
- "learning_rate": 3.491282968261064e-05,
341
- "loss": 2.0567,
342
- "step": 5200
343
- },
344
- {
345
- "epoch": 3.7,
346
- "learning_rate": 3.4354045596781406e-05,
347
- "loss": 2.0147,
348
- "step": 5300
349
- },
350
- {
351
- "epoch": 3.77,
352
- "learning_rate": 3.379526151095217e-05,
353
- "loss": 1.9817,
354
- "step": 5400
355
- },
356
- {
357
- "epoch": 3.84,
358
- "learning_rate": 3.323647742512294e-05,
359
- "loss": 1.8843,
360
- "step": 5500
361
- },
362
- {
363
- "epoch": 3.91,
364
- "learning_rate": 3.26776933392937e-05,
365
- "loss": 1.8849,
366
- "step": 5600
367
- },
368
- {
369
- "epoch": 3.98,
370
- "learning_rate": 3.2118909253464465e-05,
371
- "loss": 1.8232,
372
- "step": 5700
373
- },
374
- {
375
- "epoch": 4.05,
376
- "learning_rate": 3.156012516763523e-05,
377
- "loss": 1.6714,
378
- "step": 5800
379
- },
380
- {
381
- "epoch": 4.12,
382
- "learning_rate": 3.100134108180599e-05,
383
- "loss": 1.6047,
384
- "step": 5900
385
- },
386
- {
387
- "epoch": 4.19,
388
- "learning_rate": 3.0442556995976757e-05,
389
- "loss": 1.5893,
390
- "step": 6000
391
- },
392
- {
393
- "epoch": 4.19,
394
- "eval_bleu": 57.7509,
395
- "eval_em": 0.0033,
396
- "eval_gen_len": 44.132,
397
- "eval_loss": 1.7326730489730835,
398
- "eval_runtime": 426.2734,
399
- "eval_samples_per_second": 2.827,
400
- "eval_steps_per_second": 0.354,
401
- "step": 6000
402
- },
403
- {
404
- "epoch": 4.26,
405
- "learning_rate": 2.9883772910147524e-05,
406
- "loss": 1.5864,
407
- "step": 6100
408
- },
409
- {
410
- "epoch": 4.33,
411
- "learning_rate": 2.9324988824318283e-05,
412
- "loss": 1.5608,
413
- "step": 6200
414
- },
415
- {
416
- "epoch": 4.4,
417
- "learning_rate": 2.876620473848905e-05,
418
- "loss": 1.5144,
419
- "step": 6300
420
- },
421
- {
422
- "epoch": 4.47,
423
- "learning_rate": 2.8207420652659816e-05,
424
- "loss": 1.4582,
425
- "step": 6400
426
- },
427
- {
428
- "epoch": 4.54,
429
- "learning_rate": 2.7648636566830576e-05,
430
- "loss": 1.4793,
431
- "step": 6500
432
- },
433
- {
434
- "epoch": 4.61,
435
- "learning_rate": 2.7089852481001342e-05,
436
- "loss": 1.472,
437
- "step": 6600
438
- },
439
- {
440
- "epoch": 4.68,
441
- "learning_rate": 2.653106839517211e-05,
442
- "loss": 1.4424,
443
- "step": 6700
444
- },
445
- {
446
- "epoch": 4.75,
447
- "learning_rate": 2.597228430934287e-05,
448
- "loss": 1.3779,
449
- "step": 6800
450
- },
451
- {
452
- "epoch": 4.82,
453
- "learning_rate": 2.5413500223513638e-05,
454
- "loss": 1.3611,
455
- "step": 6900
456
- },
457
- {
458
- "epoch": 4.89,
459
- "learning_rate": 2.48547161376844e-05,
460
- "loss": 1.3311,
461
- "step": 7000
462
- },
463
- {
464
- "epoch": 4.96,
465
- "learning_rate": 2.4295932051855164e-05,
466
- "loss": 1.3164,
467
- "step": 7100
468
- },
469
- {
470
- "epoch": 5.03,
471
- "learning_rate": 2.373714796602593e-05,
472
- "loss": 1.2119,
473
- "step": 7200
474
- },
475
- {
476
- "epoch": 5.1,
477
- "learning_rate": 2.3178363880196693e-05,
478
- "loss": 1.1122,
479
- "step": 7300
480
- },
481
- {
482
- "epoch": 5.17,
483
- "learning_rate": 2.2619579794367456e-05,
484
- "loss": 1.1198,
485
- "step": 7400
486
- },
487
- {
488
- "epoch": 5.24,
489
- "learning_rate": 2.206079570853822e-05,
490
- "loss": 1.0416,
491
- "step": 7500
492
- },
493
- {
494
- "epoch": 5.31,
495
- "learning_rate": 2.1502011622708985e-05,
496
- "loss": 1.1042,
497
- "step": 7600
498
- },
499
- {
500
- "epoch": 5.38,
501
- "learning_rate": 2.0943227536879752e-05,
502
- "loss": 1.0715,
503
- "step": 7700
504
- },
505
- {
506
- "epoch": 5.45,
507
- "learning_rate": 2.0384443451050515e-05,
508
- "loss": 1.0815,
509
- "step": 7800
510
- },
511
- {
512
- "epoch": 5.52,
513
- "learning_rate": 1.982565936522128e-05,
514
- "loss": 1.0445,
515
- "step": 7900
516
- },
517
- {
518
- "epoch": 5.59,
519
- "learning_rate": 1.9266875279392044e-05,
520
- "loss": 1.0512,
521
- "step": 8000
522
- },
523
- {
524
- "epoch": 5.59,
525
- "eval_bleu": 71.1272,
526
- "eval_em": 0.0241,
527
- "eval_gen_len": 46.0672,
528
- "eval_loss": 1.2382431030273438,
529
- "eval_runtime": 358.1089,
530
- "eval_samples_per_second": 3.365,
531
- "eval_steps_per_second": 0.422,
532
- "step": 8000
533
- },
534
- {
535
- "epoch": 5.66,
536
- "learning_rate": 1.8708091193562807e-05,
537
- "loss": 1.0201,
538
- "step": 8100
539
- },
540
- {
541
- "epoch": 5.73,
542
- "learning_rate": 1.8149307107733573e-05,
543
- "loss": 1.0377,
544
- "step": 8200
545
- },
546
- {
547
- "epoch": 5.8,
548
- "learning_rate": 1.7590523021904336e-05,
549
- "loss": 0.986,
550
- "step": 8300
551
- },
552
- {
553
- "epoch": 5.87,
554
- "learning_rate": 1.70317389360751e-05,
555
- "loss": 1.0244,
556
- "step": 8400
557
- },
558
- {
559
- "epoch": 5.94,
560
- "learning_rate": 1.6472954850245866e-05,
561
- "loss": 0.9654,
562
- "step": 8500
563
- },
564
- {
565
- "epoch": 6.01,
566
- "learning_rate": 1.5914170764416632e-05,
567
- "loss": 0.9454,
568
- "step": 8600
569
- },
570
- {
571
- "epoch": 6.08,
572
- "learning_rate": 1.5355386678587395e-05,
573
- "loss": 0.8179,
574
- "step": 8700
575
- },
576
- {
577
- "epoch": 6.15,
578
- "learning_rate": 1.479660259275816e-05,
579
- "loss": 0.8433,
580
- "step": 8800
581
- },
582
- {
583
- "epoch": 6.22,
584
- "learning_rate": 1.4237818506928924e-05,
585
- "loss": 0.8235,
586
- "step": 8900
587
- },
588
- {
589
- "epoch": 6.29,
590
- "learning_rate": 1.3679034421099687e-05,
591
- "loss": 0.832,
592
- "step": 9000
593
  }
594
  ],
595
  "max_steps": 11448,
596
  "num_train_epochs": 8,
597
- "total_flos": 5913945388013520.0,
598
  "trial_name": null,
599
  "trial_params": null
600
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.397624039133473,
5
+ "global_step": 2000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
10
  {
11
  "epoch": 0.07,
12
  "learning_rate": 2.0000000000000003e-06,
13
+ "loss": 87.2627,
14
  "step": 100
15
  },
16
  {
17
  "epoch": 0.14,
18
  "learning_rate": 4.000000000000001e-06,
19
+ "loss": 52.4123,
20
  "step": 200
21
  },
22
  {
23
  "epoch": 0.21,
24
  "learning_rate": 6e-06,
25
+ "loss": 41.3932,
26
  "step": 300
27
  },
28
  {
29
  "epoch": 0.28,
30
  "learning_rate": 8.000000000000001e-06,
31
+ "loss": 36.7185,
32
  "step": 400
33
  },
34
  {
35
  "epoch": 0.35,
36
  "learning_rate": 1e-05,
37
+ "loss": 33.4597,
38
  "step": 500
39
  },
40
  {
41
  "epoch": 0.42,
42
  "learning_rate": 1.2e-05,
43
+ "loss": 31.1323,
44
  "step": 600
45
  },
46
  {
47
  "epoch": 0.49,
48
  "learning_rate": 1.4000000000000001e-05,
49
+ "loss": 28.9204,
50
  "step": 700
51
  },
52
  {
53
  "epoch": 0.56,
54
  "learning_rate": 1.6000000000000003e-05,
55
+ "loss": 27.0128,
56
  "step": 800
57
  },
58
  {
59
  "epoch": 0.63,
60
  "learning_rate": 1.8e-05,
61
+ "loss": 25.1703,
62
  "step": 900
63
  },
64
  {
65
  "epoch": 0.7,
66
  "learning_rate": 2e-05,
67
+ "loss": 23.5486,
68
  "step": 1000
69
  },
70
  {
71
  "epoch": 0.77,
72
  "learning_rate": 2.2000000000000003e-05,
73
+ "loss": 21.8419,
74
  "step": 1100
75
  },
76
  {
77
  "epoch": 0.84,
78
  "learning_rate": 2.4e-05,
79
+ "loss": 20.3387,
80
  "step": 1200
81
  },
82
  {
83
  "epoch": 0.91,
84
  "learning_rate": 2.6000000000000002e-05,
85
+ "loss": 18.7216,
86
  "step": 1300
87
  },
88
  {
89
  "epoch": 0.98,
90
  "learning_rate": 2.8000000000000003e-05,
91
+ "loss": 16.7862,
92
  "step": 1400
93
  },
94
  {
95
  "epoch": 1.05,
96
  "learning_rate": 3e-05,
97
+ "loss": 15.034,
98
  "step": 1500
99
  },
100
  {
101
  "epoch": 1.12,
102
  "learning_rate": 3.2000000000000005e-05,
103
+ "loss": 13.3747,
104
  "step": 1600
105
  },
106
  {
107
  "epoch": 1.19,
108
  "learning_rate": 3.4000000000000007e-05,
109
+ "loss": 11.7581,
110
  "step": 1700
111
  },
112
  {
113
  "epoch": 1.26,
114
  "learning_rate": 3.6e-05,
115
+ "loss": 9.9702,
116
  "step": 1800
117
  },
118
  {
119
  "epoch": 1.33,
120
  "learning_rate": 3.8e-05,
121
+ "loss": 8.3363,
122
  "step": 1900
123
  },
124
  {
125
  "epoch": 1.4,
126
  "learning_rate": 4e-05,
127
+ "loss": 6.9199,
128
  "step": 2000
129
  },
130
  {
131
  "epoch": 1.4,
132
+ "eval_bleu": 3.4801,
133
  "eval_em": 0.0,
134
+ "eval_gen_len": 220.966,
135
+ "eval_loss": 6.3751444816589355,
136
+ "eval_runtime": 1604.7901,
137
+ "eval_samples_per_second": 0.751,
138
+ "eval_steps_per_second": 0.094,
139
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  }
141
  ],
142
  "max_steps": 11448,
143
  "num_train_epochs": 8,
144
+ "total_flos": 1328908826910720.0,
145
  "trial_name": null,
146
  "trial_params": null
147
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e8aedd63b2a88667896dddc2d8d2d2c614cf71faf40847285ec9385424c5189
3
- size 4219
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bb3fd12f0e9726bf97eb4344fe52ca7c4a117923bbaddabaa3847b6466f151f
3
+ size 4155
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8187c4773c5f4d0bd44245457277c1f918e820cd9b50aefa43f4174fa6922015
3
- size 1014236857
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4238518d766751ead10635c197669c039a8c46869571a7f8fc96716f256600df
3
+ size 1002469625
runs/Jun05_10-45-59_0a95bf9de5ac/1685962630.6427722/events.out.tfevents.1685962630.0a95bf9de5ac.3272.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47943655f1ff4b3e0a5d8ce007bf952b1846b4f3695c5dcf053bdd1511d8da6e
3
+ size 6302
runs/Jun05_10-45-59_0a95bf9de5ac/events.out.tfevents.1685962630.0a95bf9de5ac.3272.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098e0e6b4e1c013ebd1598cfcedd0c3450868df3bf0b760afd29eede229bcb12
3
+ size 12098
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d751abc785a6e2c459a4f7613a56d593338fd22e62fed58d5c7302b4b930f15
3
- size 4219
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bb3fd12f0e9726bf97eb4344fe52ca7c4a117923bbaddabaa3847b6466f151f
3
+ size 4155