z-uo commited on
Commit
0b9a74a
1 Parent(s): b59a683

train with metrics

Browse files
all_results.json CHANGED
@@ -1,8 +1,18 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.0042496273615588885,
4
- "train_runtime": 149.3052,
 
 
 
 
 
 
 
 
 
 
5
  "train_samples": 87599,
6
- "train_samples_per_second": 1173.422,
7
- "train_steps_per_second": 391.145
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_gen_len": 9.2974,
4
+ "eval_loss": 0.5938774347305298,
5
+ "eval_rouge1": 17.5052,
6
+ "eval_rouge2": 5.8714,
7
+ "eval_rougeL": 17.4487,
8
+ "eval_rougeLsum": 17.4528,
9
+ "eval_runtime": 905.9638,
10
+ "eval_samples": 10570,
11
+ "eval_samples_per_second": 11.667,
12
+ "eval_steps_per_second": 3.89,
13
+ "train_loss": 0.1063714675380759,
14
+ "train_runtime": 3374.8833,
15
  "train_samples": 87599,
16
+ "train_samples_per_second": 51.912,
17
+ "train_steps_per_second": 17.304
18
  }
eval_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "eval_gen_len": 9.2974,
4
+ "eval_loss": 0.5938774347305298,
5
+ "eval_rouge1": 17.5052,
6
+ "eval_rouge2": 5.8714,
7
+ "eval_rougeL": 17.4487,
8
+ "eval_rougeLsum": 17.4528,
9
+ "eval_runtime": 905.9638,
10
+ "eval_samples": 10570,
11
+ "eval_samples_per_second": 11.667,
12
+ "eval_steps_per_second": 3.89
13
+ }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f83b689a45d5a7157f384e72a16e11cab17acbdf1df54d8e37d37692749639d8
3
  size 990284749
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:befb20cfeecf2f9507ccac39fe16381da87a9ebc601a837d7145634b51041857
3
  size 990284749
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.0042496273615588885,
4
- "train_runtime": 149.3052,
5
  "train_samples": 87599,
6
- "train_samples_per_second": 1173.422,
7
- "train_steps_per_second": 391.145
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.1063714675380759,
4
+ "train_runtime": 3374.8833,
5
  "train_samples": 87599,
6
+ "train_samples_per_second": 51.912,
7
+ "train_steps_per_second": 17.304
8
  }
trainer_state.json CHANGED
@@ -10,712 +10,712 @@
10
  {
11
  "epoch": 0.02,
12
  "learning_rate": 2.974315068493151e-05,
13
- "loss": 2.5432,
14
  "step": 500
15
  },
16
  {
17
  "epoch": 0.03,
18
  "learning_rate": 2.9486301369863017e-05,
19
- "loss": 1.083,
20
  "step": 1000
21
  },
22
  {
23
  "epoch": 0.05,
24
  "learning_rate": 2.922945205479452e-05,
25
- "loss": 0.8696,
26
  "step": 1500
27
  },
28
  {
29
  "epoch": 0.07,
30
  "learning_rate": 2.8972602739726026e-05,
31
- "loss": 0.7068,
32
  "step": 2000
33
  },
34
  {
35
  "epoch": 0.09,
36
  "learning_rate": 2.8715753424657534e-05,
37
- "loss": 0.7191,
38
  "step": 2500
39
  },
40
  {
41
  "epoch": 0.1,
42
  "learning_rate": 2.8458904109589042e-05,
43
- "loss": 0.6643,
44
  "step": 3000
45
  },
46
  {
47
  "epoch": 0.12,
48
  "learning_rate": 2.820205479452055e-05,
49
- "loss": 0.6743,
50
  "step": 3500
51
  },
52
  {
53
  "epoch": 0.14,
54
  "learning_rate": 2.7945205479452054e-05,
55
- "loss": 0.596,
56
  "step": 4000
57
  },
58
  {
59
  "epoch": 0.15,
60
  "learning_rate": 2.7688356164383562e-05,
61
- "loss": 0.6311,
62
  "step": 4500
63
  },
64
  {
65
  "epoch": 0.17,
66
  "learning_rate": 2.743150684931507e-05,
67
- "loss": 0.5768,
68
  "step": 5000
69
  },
70
  {
71
  "epoch": 0.19,
72
  "learning_rate": 2.7174657534246575e-05,
73
- "loss": 0.6081,
74
  "step": 5500
75
  },
76
  {
77
  "epoch": 0.21,
78
  "learning_rate": 2.6917808219178083e-05,
79
- "loss": 0.5806,
80
  "step": 6000
81
  },
82
  {
83
  "epoch": 0.22,
84
  "learning_rate": 2.666095890410959e-05,
85
- "loss": 0.6169,
86
  "step": 6500
87
  },
88
  {
89
  "epoch": 0.24,
90
  "learning_rate": 2.6404109589041096e-05,
91
- "loss": 0.5526,
92
  "step": 7000
93
  },
94
  {
95
  "epoch": 0.26,
96
  "learning_rate": 2.6147260273972604e-05,
97
- "loss": 0.585,
98
  "step": 7500
99
  },
100
  {
101
  "epoch": 0.27,
102
  "learning_rate": 2.589041095890411e-05,
103
- "loss": 0.5513,
104
  "step": 8000
105
  },
106
  {
107
  "epoch": 0.29,
108
  "learning_rate": 2.563356164383562e-05,
109
- "loss": 0.524,
110
  "step": 8500
111
  },
112
  {
113
  "epoch": 0.31,
114
  "learning_rate": 2.5376712328767124e-05,
115
- "loss": 0.5277,
116
  "step": 9000
117
  },
118
  {
119
  "epoch": 0.33,
120
  "learning_rate": 2.511986301369863e-05,
121
- "loss": 0.5515,
122
  "step": 9500
123
  },
124
  {
125
  "epoch": 0.34,
126
  "learning_rate": 2.4863013698630137e-05,
127
- "loss": 0.5447,
128
  "step": 10000
129
  },
130
  {
131
  "epoch": 0.36,
132
  "learning_rate": 2.4606164383561645e-05,
133
- "loss": 0.5196,
134
  "step": 10500
135
  },
136
  {
137
  "epoch": 0.38,
138
  "learning_rate": 2.4349315068493153e-05,
139
- "loss": 0.5573,
140
  "step": 11000
141
  },
142
  {
143
  "epoch": 0.39,
144
  "learning_rate": 2.409246575342466e-05,
145
- "loss": 0.5565,
146
  "step": 11500
147
  },
148
  {
149
  "epoch": 0.41,
150
  "learning_rate": 2.3835616438356165e-05,
151
- "loss": 0.552,
152
  "step": 12000
153
  },
154
  {
155
  "epoch": 0.43,
156
  "learning_rate": 2.357876712328767e-05,
157
- "loss": 0.5339,
158
  "step": 12500
159
  },
160
  {
161
  "epoch": 0.45,
162
  "learning_rate": 2.3321917808219178e-05,
163
- "loss": 0.4993,
164
  "step": 13000
165
  },
166
  {
167
  "epoch": 0.46,
168
  "learning_rate": 2.3065068493150686e-05,
169
- "loss": 0.5248,
170
  "step": 13500
171
  },
172
  {
173
  "epoch": 0.48,
174
  "learning_rate": 2.2808219178082194e-05,
175
- "loss": 0.4782,
176
  "step": 14000
177
  },
178
  {
179
  "epoch": 0.5,
180
  "learning_rate": 2.25513698630137e-05,
181
- "loss": 0.5286,
182
  "step": 14500
183
  },
184
  {
185
  "epoch": 0.51,
186
  "learning_rate": 2.2294520547945206e-05,
187
- "loss": 0.5126,
188
  "step": 15000
189
  },
190
  {
191
  "epoch": 0.53,
192
  "learning_rate": 2.203767123287671e-05,
193
- "loss": 0.4846,
194
  "step": 15500
195
  },
196
  {
197
  "epoch": 0.55,
198
  "learning_rate": 2.178082191780822e-05,
199
- "loss": 0.4735,
200
  "step": 16000
201
  },
202
  {
203
  "epoch": 0.57,
204
  "learning_rate": 2.1523972602739727e-05,
205
- "loss": 0.496,
206
  "step": 16500
207
  },
208
  {
209
  "epoch": 0.58,
210
  "learning_rate": 2.126712328767123e-05,
211
- "loss": 0.5229,
212
  "step": 17000
213
  },
214
  {
215
  "epoch": 0.6,
216
  "learning_rate": 2.101027397260274e-05,
217
- "loss": 0.4823,
218
  "step": 17500
219
  },
220
  {
221
  "epoch": 0.62,
222
  "learning_rate": 2.0753424657534248e-05,
223
- "loss": 0.4881,
224
  "step": 18000
225
  },
226
  {
227
  "epoch": 0.63,
228
  "learning_rate": 2.0496575342465756e-05,
229
- "loss": 0.4779,
230
  "step": 18500
231
  },
232
  {
233
  "epoch": 0.65,
234
  "learning_rate": 2.023972602739726e-05,
235
- "loss": 0.5249,
236
  "step": 19000
237
  },
238
  {
239
  "epoch": 0.67,
240
  "learning_rate": 1.9982876712328768e-05,
241
- "loss": 0.5014,
242
  "step": 19500
243
  },
244
  {
245
  "epoch": 0.68,
246
  "learning_rate": 1.9726027397260273e-05,
247
- "loss": 0.4519,
248
  "step": 20000
249
  },
250
  {
251
  "epoch": 0.7,
252
  "learning_rate": 1.946917808219178e-05,
253
- "loss": 0.4549,
254
  "step": 20500
255
  },
256
  {
257
  "epoch": 0.72,
258
  "learning_rate": 1.921232876712329e-05,
259
- "loss": 0.4927,
260
  "step": 21000
261
  },
262
  {
263
  "epoch": 0.74,
264
  "learning_rate": 1.8955479452054797e-05,
265
- "loss": 0.4839,
266
  "step": 21500
267
  },
268
  {
269
  "epoch": 0.75,
270
  "learning_rate": 1.8698630136986305e-05,
271
- "loss": 0.4898,
272
  "step": 22000
273
  },
274
  {
275
  "epoch": 0.77,
276
  "learning_rate": 1.8441780821917806e-05,
277
- "loss": 0.4725,
278
  "step": 22500
279
  },
280
  {
281
  "epoch": 0.79,
282
  "learning_rate": 1.8184931506849314e-05,
283
- "loss": 0.4598,
284
  "step": 23000
285
  },
286
  {
287
  "epoch": 0.8,
288
  "learning_rate": 1.7928082191780822e-05,
289
- "loss": 0.4918,
290
  "step": 23500
291
  },
292
  {
293
  "epoch": 0.82,
294
  "learning_rate": 1.767123287671233e-05,
295
- "loss": 0.4515,
296
  "step": 24000
297
  },
298
  {
299
  "epoch": 0.84,
300
  "learning_rate": 1.7414383561643838e-05,
301
- "loss": 0.4373,
302
  "step": 24500
303
  },
304
  {
305
  "epoch": 0.86,
306
  "learning_rate": 1.7157534246575342e-05,
307
- "loss": 0.4971,
308
  "step": 25000
309
  },
310
  {
311
  "epoch": 0.87,
312
  "learning_rate": 1.690068493150685e-05,
313
- "loss": 0.4792,
314
  "step": 25500
315
  },
316
  {
317
  "epoch": 0.89,
318
  "learning_rate": 1.6643835616438355e-05,
319
- "loss": 0.4383,
320
  "step": 26000
321
  },
322
  {
323
  "epoch": 0.91,
324
  "learning_rate": 1.6386986301369863e-05,
325
- "loss": 0.4652,
326
  "step": 26500
327
  },
328
  {
329
  "epoch": 0.92,
330
  "learning_rate": 1.613013698630137e-05,
331
- "loss": 0.4516,
332
  "step": 27000
333
  },
334
  {
335
  "epoch": 0.94,
336
  "learning_rate": 1.5873287671232876e-05,
337
- "loss": 0.5019,
338
  "step": 27500
339
  },
340
  {
341
  "epoch": 0.96,
342
  "learning_rate": 1.5616438356164384e-05,
343
- "loss": 0.4944,
344
  "step": 28000
345
  },
346
  {
347
  "epoch": 0.98,
348
  "learning_rate": 1.535958904109589e-05,
349
- "loss": 0.4797,
350
  "step": 28500
351
  },
352
  {
353
  "epoch": 0.99,
354
  "learning_rate": 1.5102739726027398e-05,
355
- "loss": 0.4745,
356
  "step": 29000
357
  },
358
  {
359
  "epoch": 1.01,
360
  "learning_rate": 1.4845890410958904e-05,
361
- "loss": 0.4121,
362
  "step": 29500
363
  },
364
  {
365
  "epoch": 1.03,
366
  "learning_rate": 1.4589041095890412e-05,
367
- "loss": 0.4557,
368
  "step": 30000
369
  },
370
  {
371
  "epoch": 1.04,
372
  "learning_rate": 1.4332191780821918e-05,
373
- "loss": 0.4516,
374
  "step": 30500
375
  },
376
  {
377
  "epoch": 1.06,
378
  "learning_rate": 1.4075342465753425e-05,
379
- "loss": 0.4715,
380
  "step": 31000
381
  },
382
  {
383
  "epoch": 1.08,
384
  "learning_rate": 1.3818493150684933e-05,
385
- "loss": 0.4518,
386
  "step": 31500
387
  },
388
  {
389
  "epoch": 1.1,
390
  "learning_rate": 1.3561643835616437e-05,
391
- "loss": 0.4445,
392
  "step": 32000
393
  },
394
  {
395
  "epoch": 1.11,
396
  "learning_rate": 1.3304794520547945e-05,
397
- "loss": 0.4456,
398
  "step": 32500
399
  },
400
  {
401
  "epoch": 1.13,
402
  "learning_rate": 1.3047945205479453e-05,
403
- "loss": 0.4544,
404
  "step": 33000
405
  },
406
  {
407
  "epoch": 1.15,
408
  "learning_rate": 1.279109589041096e-05,
409
- "loss": 0.4314,
410
  "step": 33500
411
  },
412
  {
413
  "epoch": 1.16,
414
  "learning_rate": 1.2534246575342466e-05,
415
- "loss": 0.4332,
416
  "step": 34000
417
  },
418
  {
419
  "epoch": 1.18,
420
  "learning_rate": 1.2277397260273974e-05,
421
- "loss": 0.4429,
422
  "step": 34500
423
  },
424
  {
425
  "epoch": 1.2,
426
  "learning_rate": 1.202054794520548e-05,
427
- "loss": 0.4599,
428
  "step": 35000
429
  },
430
  {
431
  "epoch": 1.22,
432
  "learning_rate": 1.1763698630136986e-05,
433
- "loss": 0.4372,
434
  "step": 35500
435
  },
436
  {
437
  "epoch": 1.23,
438
  "learning_rate": 1.1506849315068493e-05,
439
- "loss": 0.4084,
440
  "step": 36000
441
  },
442
  {
443
  "epoch": 1.25,
444
  "learning_rate": 1.125e-05,
445
- "loss": 0.4318,
446
  "step": 36500
447
  },
448
  {
449
  "epoch": 1.27,
450
  "learning_rate": 1.0993150684931507e-05,
451
- "loss": 0.4225,
452
  "step": 37000
453
  },
454
  {
455
  "epoch": 1.28,
456
  "learning_rate": 1.0736301369863013e-05,
457
- "loss": 0.4185,
458
  "step": 37500
459
  },
460
  {
461
  "epoch": 1.3,
462
  "learning_rate": 1.0479452054794521e-05,
463
- "loss": 0.4868,
464
  "step": 38000
465
  },
466
  {
467
  "epoch": 1.32,
468
  "learning_rate": 1.0222602739726028e-05,
469
- "loss": 0.4171,
470
  "step": 38500
471
  },
472
  {
473
  "epoch": 1.34,
474
  "learning_rate": 9.965753424657534e-06,
475
- "loss": 0.4177,
476
  "step": 39000
477
  },
478
  {
479
  "epoch": 1.35,
480
  "learning_rate": 9.708904109589042e-06,
481
- "loss": 0.4538,
482
  "step": 39500
483
  },
484
  {
485
  "epoch": 1.37,
486
  "learning_rate": 9.452054794520548e-06,
487
- "loss": 0.4138,
488
  "step": 40000
489
  },
490
  {
491
  "epoch": 1.39,
492
  "learning_rate": 9.195205479452054e-06,
493
- "loss": 0.4093,
494
  "step": 40500
495
  },
496
  {
497
  "epoch": 1.4,
498
  "learning_rate": 8.938356164383562e-06,
499
- "loss": 0.4392,
500
  "step": 41000
501
  },
502
  {
503
  "epoch": 1.42,
504
  "learning_rate": 8.681506849315069e-06,
505
- "loss": 0.4318,
506
  "step": 41500
507
  },
508
  {
509
  "epoch": 1.44,
510
  "learning_rate": 8.424657534246577e-06,
511
- "loss": 0.4066,
512
  "step": 42000
513
  },
514
  {
515
  "epoch": 1.46,
516
  "learning_rate": 8.167808219178081e-06,
517
- "loss": 0.4141,
518
  "step": 42500
519
  },
520
  {
521
  "epoch": 1.47,
522
  "learning_rate": 7.91095890410959e-06,
523
- "loss": 0.3985,
524
  "step": 43000
525
  },
526
  {
527
  "epoch": 1.49,
528
  "learning_rate": 7.654109589041097e-06,
529
- "loss": 0.4232,
530
  "step": 43500
531
  },
532
  {
533
  "epoch": 1.51,
534
  "learning_rate": 7.397260273972603e-06,
535
- "loss": 0.4073,
536
  "step": 44000
537
  },
538
  {
539
  "epoch": 1.52,
540
  "learning_rate": 7.14041095890411e-06,
541
- "loss": 0.4375,
542
  "step": 44500
543
  },
544
  {
545
  "epoch": 1.54,
546
  "learning_rate": 6.883561643835617e-06,
547
- "loss": 0.4144,
548
  "step": 45000
549
  },
550
  {
551
  "epoch": 1.56,
552
  "learning_rate": 6.626712328767123e-06,
553
- "loss": 0.4341,
554
  "step": 45500
555
  },
556
  {
557
  "epoch": 1.58,
558
  "learning_rate": 6.3698630136986296e-06,
559
- "loss": 0.4282,
560
  "step": 46000
561
  },
562
  {
563
  "epoch": 1.59,
564
  "learning_rate": 6.1130136986301376e-06,
565
- "loss": 0.4644,
566
  "step": 46500
567
  },
568
  {
569
  "epoch": 1.61,
570
  "learning_rate": 5.856164383561644e-06,
571
- "loss": 0.41,
572
  "step": 47000
573
  },
574
  {
575
  "epoch": 1.63,
576
  "learning_rate": 5.599315068493151e-06,
577
- "loss": 0.4132,
578
  "step": 47500
579
  },
580
  {
581
  "epoch": 1.64,
582
  "learning_rate": 5.342465753424657e-06,
583
- "loss": 0.4193,
584
  "step": 48000
585
  },
586
  {
587
  "epoch": 1.66,
588
  "learning_rate": 5.085616438356165e-06,
589
- "loss": 0.418,
590
  "step": 48500
591
  },
592
  {
593
  "epoch": 1.68,
594
  "learning_rate": 4.8287671232876716e-06,
595
- "loss": 0.4214,
596
  "step": 49000
597
  },
598
  {
599
  "epoch": 1.7,
600
  "learning_rate": 4.571917808219178e-06,
601
- "loss": 0.4677,
602
  "step": 49500
603
  },
604
  {
605
  "epoch": 1.71,
606
  "learning_rate": 4.315068493150685e-06,
607
- "loss": 0.4458,
608
  "step": 50000
609
  },
610
  {
611
  "epoch": 1.73,
612
  "learning_rate": 4.058219178082192e-06,
613
- "loss": 0.3912,
614
  "step": 50500
615
  },
616
  {
617
  "epoch": 1.75,
618
  "learning_rate": 3.801369863013699e-06,
619
- "loss": 0.3969,
620
  "step": 51000
621
  },
622
  {
623
  "epoch": 1.76,
624
  "learning_rate": 3.5445205479452056e-06,
625
- "loss": 0.388,
626
  "step": 51500
627
  },
628
  {
629
  "epoch": 1.78,
630
  "learning_rate": 3.2876712328767123e-06,
631
- "loss": 0.4206,
632
  "step": 52000
633
  },
634
  {
635
  "epoch": 1.8,
636
  "learning_rate": 3.0308219178082194e-06,
637
- "loss": 0.4116,
638
  "step": 52500
639
  },
640
  {
641
  "epoch": 1.82,
642
  "learning_rate": 2.773972602739726e-06,
643
- "loss": 0.4316,
644
  "step": 53000
645
  },
646
  {
647
  "epoch": 1.83,
648
  "learning_rate": 2.517123287671233e-06,
649
- "loss": 0.4312,
650
  "step": 53500
651
  },
652
  {
653
  "epoch": 1.85,
654
  "learning_rate": 2.2602739726027396e-06,
655
- "loss": 0.4136,
656
  "step": 54000
657
  },
658
  {
659
  "epoch": 1.87,
660
  "learning_rate": 2.0034246575342467e-06,
661
- "loss": 0.4342,
662
  "step": 54500
663
  },
664
  {
665
  "epoch": 1.88,
666
  "learning_rate": 1.7465753424657534e-06,
667
- "loss": 0.4248,
668
  "step": 55000
669
  },
670
  {
671
  "epoch": 1.9,
672
  "learning_rate": 1.4897260273972603e-06,
673
- "loss": 0.4591,
674
  "step": 55500
675
  },
676
  {
677
  "epoch": 1.92,
678
  "learning_rate": 1.232876712328767e-06,
679
- "loss": 0.46,
680
  "step": 56000
681
  },
682
  {
683
  "epoch": 1.93,
684
  "learning_rate": 9.76027397260274e-07,
685
- "loss": 0.4103,
686
  "step": 56500
687
  },
688
  {
689
  "epoch": 1.95,
690
  "learning_rate": 7.191780821917808e-07,
691
- "loss": 0.4147,
692
  "step": 57000
693
  },
694
  {
695
  "epoch": 1.97,
696
  "learning_rate": 4.6232876712328767e-07,
697
- "loss": 0.3829,
698
  "step": 57500
699
  },
700
  {
701
  "epoch": 1.99,
702
  "learning_rate": 2.0547945205479452e-07,
703
- "loss": 0.431,
704
  "step": 58000
705
  },
706
  {
707
  "epoch": 2.0,
708
  "step": 58400,
709
- "total_flos": 9.686242758540902e+16,
710
- "train_loss": 0.0042496273615588885,
711
- "train_runtime": 149.3052,
712
- "train_samples_per_second": 1173.422,
713
- "train_steps_per_second": 391.145
714
  }
715
  ],
716
  "max_steps": 58400,
717
  "num_train_epochs": 2,
718
- "total_flos": 9.686242758540902e+16,
719
  "trial_name": null,
720
  "trial_params": null
721
  }
 
10
  {
11
  "epoch": 0.02,
12
  "learning_rate": 2.974315068493151e-05,
13
+ "loss": 2.7124,
14
  "step": 500
15
  },
16
  {
17
  "epoch": 0.03,
18
  "learning_rate": 2.9486301369863017e-05,
19
+ "loss": 1.247,
20
  "step": 1000
21
  },
22
  {
23
  "epoch": 0.05,
24
  "learning_rate": 2.922945205479452e-05,
25
+ "loss": 1.0737,
26
  "step": 1500
27
  },
28
  {
29
  "epoch": 0.07,
30
  "learning_rate": 2.8972602739726026e-05,
31
+ "loss": 0.9364,
32
  "step": 2000
33
  },
34
  {
35
  "epoch": 0.09,
36
  "learning_rate": 2.8715753424657534e-05,
37
+ "loss": 0.9113,
38
  "step": 2500
39
  },
40
  {
41
  "epoch": 0.1,
42
  "learning_rate": 2.8458904109589042e-05,
43
+ "loss": 0.8625,
44
  "step": 3000
45
  },
46
  {
47
  "epoch": 0.12,
48
  "learning_rate": 2.820205479452055e-05,
49
+ "loss": 0.8349,
50
  "step": 3500
51
  },
52
  {
53
  "epoch": 0.14,
54
  "learning_rate": 2.7945205479452054e-05,
55
+ "loss": 0.7934,
56
  "step": 4000
57
  },
58
  {
59
  "epoch": 0.15,
60
  "learning_rate": 2.7688356164383562e-05,
61
+ "loss": 0.7953,
62
  "step": 4500
63
  },
64
  {
65
  "epoch": 0.17,
66
  "learning_rate": 2.743150684931507e-05,
67
+ "loss": 0.7222,
68
  "step": 5000
69
  },
70
  {
71
  "epoch": 0.19,
72
  "learning_rate": 2.7174657534246575e-05,
73
+ "loss": 0.7703,
74
  "step": 5500
75
  },
76
  {
77
  "epoch": 0.21,
78
  "learning_rate": 2.6917808219178083e-05,
79
+ "loss": 0.7948,
80
  "step": 6000
81
  },
82
  {
83
  "epoch": 0.22,
84
  "learning_rate": 2.666095890410959e-05,
85
+ "loss": 0.7789,
86
  "step": 6500
87
  },
88
  {
89
  "epoch": 0.24,
90
  "learning_rate": 2.6404109589041096e-05,
91
+ "loss": 0.7879,
92
  "step": 7000
93
  },
94
  {
95
  "epoch": 0.26,
96
  "learning_rate": 2.6147260273972604e-05,
97
+ "loss": 0.7719,
98
  "step": 7500
99
  },
100
  {
101
  "epoch": 0.27,
102
  "learning_rate": 2.589041095890411e-05,
103
+ "loss": 0.7402,
104
  "step": 8000
105
  },
106
  {
107
  "epoch": 0.29,
108
  "learning_rate": 2.563356164383562e-05,
109
+ "loss": 0.7068,
110
  "step": 8500
111
  },
112
  {
113
  "epoch": 0.31,
114
  "learning_rate": 2.5376712328767124e-05,
115
+ "loss": 0.6811,
116
  "step": 9000
117
  },
118
  {
119
  "epoch": 0.33,
120
  "learning_rate": 2.511986301369863e-05,
121
+ "loss": 0.7144,
122
  "step": 9500
123
  },
124
  {
125
  "epoch": 0.34,
126
  "learning_rate": 2.4863013698630137e-05,
127
+ "loss": 0.7241,
128
  "step": 10000
129
  },
130
  {
131
  "epoch": 0.36,
132
  "learning_rate": 2.4606164383561645e-05,
133
+ "loss": 0.739,
134
  "step": 10500
135
  },
136
  {
137
  "epoch": 0.38,
138
  "learning_rate": 2.4349315068493153e-05,
139
+ "loss": 0.7408,
140
  "step": 11000
141
  },
142
  {
143
  "epoch": 0.39,
144
  "learning_rate": 2.409246575342466e-05,
145
+ "loss": 0.7175,
146
  "step": 11500
147
  },
148
  {
149
  "epoch": 0.41,
150
  "learning_rate": 2.3835616438356165e-05,
151
+ "loss": 0.7625,
152
  "step": 12000
153
  },
154
  {
155
  "epoch": 0.43,
156
  "learning_rate": 2.357876712328767e-05,
157
+ "loss": 0.6987,
158
  "step": 12500
159
  },
160
  {
161
  "epoch": 0.45,
162
  "learning_rate": 2.3321917808219178e-05,
163
+ "loss": 0.684,
164
  "step": 13000
165
  },
166
  {
167
  "epoch": 0.46,
168
  "learning_rate": 2.3065068493150686e-05,
169
+ "loss": 0.7049,
170
  "step": 13500
171
  },
172
  {
173
  "epoch": 0.48,
174
  "learning_rate": 2.2808219178082194e-05,
175
+ "loss": 0.669,
176
  "step": 14000
177
  },
178
  {
179
  "epoch": 0.5,
180
  "learning_rate": 2.25513698630137e-05,
181
+ "loss": 0.6912,
182
  "step": 14500
183
  },
184
  {
185
  "epoch": 0.51,
186
  "learning_rate": 2.2294520547945206e-05,
187
+ "loss": 0.6724,
188
  "step": 15000
189
  },
190
  {
191
  "epoch": 0.53,
192
  "learning_rate": 2.203767123287671e-05,
193
+ "loss": 0.6429,
194
  "step": 15500
195
  },
196
  {
197
  "epoch": 0.55,
198
  "learning_rate": 2.178082191780822e-05,
199
+ "loss": 0.6582,
200
  "step": 16000
201
  },
202
  {
203
  "epoch": 0.57,
204
  "learning_rate": 2.1523972602739727e-05,
205
+ "loss": 0.6778,
206
  "step": 16500
207
  },
208
  {
209
  "epoch": 0.58,
210
  "learning_rate": 2.126712328767123e-05,
211
+ "loss": 0.6743,
212
  "step": 17000
213
  },
214
  {
215
  "epoch": 0.6,
216
  "learning_rate": 2.101027397260274e-05,
217
+ "loss": 0.6421,
218
  "step": 17500
219
  },
220
  {
221
  "epoch": 0.62,
222
  "learning_rate": 2.0753424657534248e-05,
223
+ "loss": 0.6484,
224
  "step": 18000
225
  },
226
  {
227
  "epoch": 0.63,
228
  "learning_rate": 2.0496575342465756e-05,
229
+ "loss": 0.6851,
230
  "step": 18500
231
  },
232
  {
233
  "epoch": 0.65,
234
  "learning_rate": 2.023972602739726e-05,
235
+ "loss": 0.7124,
236
  "step": 19000
237
  },
238
  {
239
  "epoch": 0.67,
240
  "learning_rate": 1.9982876712328768e-05,
241
+ "loss": 0.6624,
242
  "step": 19500
243
  },
244
  {
245
  "epoch": 0.68,
246
  "learning_rate": 1.9726027397260273e-05,
247
+ "loss": 0.6519,
248
  "step": 20000
249
  },
250
  {
251
  "epoch": 0.7,
252
  "learning_rate": 1.946917808219178e-05,
253
+ "loss": 0.6528,
254
  "step": 20500
255
  },
256
  {
257
  "epoch": 0.72,
258
  "learning_rate": 1.921232876712329e-05,
259
+ "loss": 0.6881,
260
  "step": 21000
261
  },
262
  {
263
  "epoch": 0.74,
264
  "learning_rate": 1.8955479452054797e-05,
265
+ "loss": 0.6608,
266
  "step": 21500
267
  },
268
  {
269
  "epoch": 0.75,
270
  "learning_rate": 1.8698630136986305e-05,
271
+ "loss": 0.6633,
272
  "step": 22000
273
  },
274
  {
275
  "epoch": 0.77,
276
  "learning_rate": 1.8441780821917806e-05,
277
+ "loss": 0.6357,
278
  "step": 22500
279
  },
280
  {
281
  "epoch": 0.79,
282
  "learning_rate": 1.8184931506849314e-05,
283
+ "loss": 0.6411,
284
  "step": 23000
285
  },
286
  {
287
  "epoch": 0.8,
288
  "learning_rate": 1.7928082191780822e-05,
289
+ "loss": 0.6579,
290
  "step": 23500
291
  },
292
  {
293
  "epoch": 0.82,
294
  "learning_rate": 1.767123287671233e-05,
295
+ "loss": 0.6476,
296
  "step": 24000
297
  },
298
  {
299
  "epoch": 0.84,
300
  "learning_rate": 1.7414383561643838e-05,
301
+ "loss": 0.633,
302
  "step": 24500
303
  },
304
  {
305
  "epoch": 0.86,
306
  "learning_rate": 1.7157534246575342e-05,
307
+ "loss": 0.7078,
308
  "step": 25000
309
  },
310
  {
311
  "epoch": 0.87,
312
  "learning_rate": 1.690068493150685e-05,
313
+ "loss": 0.6676,
314
  "step": 25500
315
  },
316
  {
317
  "epoch": 0.89,
318
  "learning_rate": 1.6643835616438355e-05,
319
+ "loss": 0.6343,
320
  "step": 26000
321
  },
322
  {
323
  "epoch": 0.91,
324
  "learning_rate": 1.6386986301369863e-05,
325
+ "loss": 0.6723,
326
  "step": 26500
327
  },
328
  {
329
  "epoch": 0.92,
330
  "learning_rate": 1.613013698630137e-05,
331
+ "loss": 0.6235,
332
  "step": 27000
333
  },
334
  {
335
  "epoch": 0.94,
336
  "learning_rate": 1.5873287671232876e-05,
337
+ "loss": 0.6666,
338
  "step": 27500
339
  },
340
  {
341
  "epoch": 0.96,
342
  "learning_rate": 1.5616438356164384e-05,
343
+ "loss": 0.7011,
344
  "step": 28000
345
  },
346
  {
347
  "epoch": 0.98,
348
  "learning_rate": 1.535958904109589e-05,
349
+ "loss": 0.6765,
350
  "step": 28500
351
  },
352
  {
353
  "epoch": 0.99,
354
  "learning_rate": 1.5102739726027398e-05,
355
+ "loss": 0.6493,
356
  "step": 29000
357
  },
358
  {
359
  "epoch": 1.01,
360
  "learning_rate": 1.4845890410958904e-05,
361
+ "loss": 0.5903,
362
  "step": 29500
363
  },
364
  {
365
  "epoch": 1.03,
366
  "learning_rate": 1.4589041095890412e-05,
367
+ "loss": 0.6206,
368
  "step": 30000
369
  },
370
  {
371
  "epoch": 1.04,
372
  "learning_rate": 1.4332191780821918e-05,
373
+ "loss": 0.5903,
374
  "step": 30500
375
  },
376
  {
377
  "epoch": 1.06,
378
  "learning_rate": 1.4075342465753425e-05,
379
+ "loss": 0.6578,
380
  "step": 31000
381
  },
382
  {
383
  "epoch": 1.08,
384
  "learning_rate": 1.3818493150684933e-05,
385
+ "loss": 0.6314,
386
  "step": 31500
387
  },
388
  {
389
  "epoch": 1.1,
390
  "learning_rate": 1.3561643835616437e-05,
391
+ "loss": 0.6242,
392
  "step": 32000
393
  },
394
  {
395
  "epoch": 1.11,
396
  "learning_rate": 1.3304794520547945e-05,
397
+ "loss": 0.6273,
398
  "step": 32500
399
  },
400
  {
401
  "epoch": 1.13,
402
  "learning_rate": 1.3047945205479453e-05,
403
+ "loss": 0.6294,
404
  "step": 33000
405
  },
406
  {
407
  "epoch": 1.15,
408
  "learning_rate": 1.279109589041096e-05,
409
+ "loss": 0.6141,
410
  "step": 33500
411
  },
412
  {
413
  "epoch": 1.16,
414
  "learning_rate": 1.2534246575342466e-05,
415
+ "loss": 0.5828,
416
  "step": 34000
417
  },
418
  {
419
  "epoch": 1.18,
420
  "learning_rate": 1.2277397260273974e-05,
421
+ "loss": 0.577,
422
  "step": 34500
423
  },
424
  {
425
  "epoch": 1.2,
426
  "learning_rate": 1.202054794520548e-05,
427
+ "loss": 0.6136,
428
  "step": 35000
429
  },
430
  {
431
  "epoch": 1.22,
432
  "learning_rate": 1.1763698630136986e-05,
433
+ "loss": 0.6233,
434
  "step": 35500
435
  },
436
  {
437
  "epoch": 1.23,
438
  "learning_rate": 1.1506849315068493e-05,
439
+ "loss": 0.5801,
440
  "step": 36000
441
  },
442
  {
443
  "epoch": 1.25,
444
  "learning_rate": 1.125e-05,
445
+ "loss": 0.6176,
446
  "step": 36500
447
  },
448
  {
449
  "epoch": 1.27,
450
  "learning_rate": 1.0993150684931507e-05,
451
+ "loss": 0.6007,
452
  "step": 37000
453
  },
454
  {
455
  "epoch": 1.28,
456
  "learning_rate": 1.0736301369863013e-05,
457
+ "loss": 0.5864,
458
  "step": 37500
459
  },
460
  {
461
  "epoch": 1.3,
462
  "learning_rate": 1.0479452054794521e-05,
463
+ "loss": 0.651,
464
  "step": 38000
465
  },
466
  {
467
  "epoch": 1.32,
468
  "learning_rate": 1.0222602739726028e-05,
469
+ "loss": 0.5897,
470
  "step": 38500
471
  },
472
  {
473
  "epoch": 1.34,
474
  "learning_rate": 9.965753424657534e-06,
475
+ "loss": 0.5907,
476
  "step": 39000
477
  },
478
  {
479
  "epoch": 1.35,
480
  "learning_rate": 9.708904109589042e-06,
481
+ "loss": 0.631,
482
  "step": 39500
483
  },
484
  {
485
  "epoch": 1.37,
486
  "learning_rate": 9.452054794520548e-06,
487
+ "loss": 0.6517,
488
  "step": 40000
489
  },
490
  {
491
  "epoch": 1.39,
492
  "learning_rate": 9.195205479452054e-06,
493
+ "loss": 0.5801,
494
  "step": 40500
495
  },
496
  {
497
  "epoch": 1.4,
498
  "learning_rate": 8.938356164383562e-06,
499
+ "loss": 0.6225,
500
  "step": 41000
501
  },
502
  {
503
  "epoch": 1.42,
504
  "learning_rate": 8.681506849315069e-06,
505
+ "loss": 0.6194,
506
  "step": 41500
507
  },
508
  {
509
  "epoch": 1.44,
510
  "learning_rate": 8.424657534246577e-06,
511
+ "loss": 0.5939,
512
  "step": 42000
513
  },
514
  {
515
  "epoch": 1.46,
516
  "learning_rate": 8.167808219178081e-06,
517
+ "loss": 0.6029,
518
  "step": 42500
519
  },
520
  {
521
  "epoch": 1.47,
522
  "learning_rate": 7.91095890410959e-06,
523
+ "loss": 0.5907,
524
  "step": 43000
525
  },
526
  {
527
  "epoch": 1.49,
528
  "learning_rate": 7.654109589041097e-06,
529
+ "loss": 0.582,
530
  "step": 43500
531
  },
532
  {
533
  "epoch": 1.51,
534
  "learning_rate": 7.397260273972603e-06,
535
+ "loss": 0.5766,
536
  "step": 44000
537
  },
538
  {
539
  "epoch": 1.52,
540
  "learning_rate": 7.14041095890411e-06,
541
+ "loss": 0.6133,
542
  "step": 44500
543
  },
544
  {
545
  "epoch": 1.54,
546
  "learning_rate": 6.883561643835617e-06,
547
+ "loss": 0.5998,
548
  "step": 45000
549
  },
550
  {
551
  "epoch": 1.56,
552
  "learning_rate": 6.626712328767123e-06,
553
+ "loss": 0.6405,
554
  "step": 45500
555
  },
556
  {
557
  "epoch": 1.58,
558
  "learning_rate": 6.3698630136986296e-06,
559
+ "loss": 0.5943,
560
  "step": 46000
561
  },
562
  {
563
  "epoch": 1.59,
564
  "learning_rate": 6.1130136986301376e-06,
565
+ "loss": 0.6153,
566
  "step": 46500
567
  },
568
  {
569
  "epoch": 1.61,
570
  "learning_rate": 5.856164383561644e-06,
571
+ "loss": 0.5876,
572
  "step": 47000
573
  },
574
  {
575
  "epoch": 1.63,
576
  "learning_rate": 5.599315068493151e-06,
577
+ "loss": 0.6071,
578
  "step": 47500
579
  },
580
  {
581
  "epoch": 1.64,
582
  "learning_rate": 5.342465753424657e-06,
583
+ "loss": 0.5965,
584
  "step": 48000
585
  },
586
  {
587
  "epoch": 1.66,
588
  "learning_rate": 5.085616438356165e-06,
589
+ "loss": 0.586,
590
  "step": 48500
591
  },
592
  {
593
  "epoch": 1.68,
594
  "learning_rate": 4.8287671232876716e-06,
595
+ "loss": 0.5812,
596
  "step": 49000
597
  },
598
  {
599
  "epoch": 1.7,
600
  "learning_rate": 4.571917808219178e-06,
601
+ "loss": 0.6407,
602
  "step": 49500
603
  },
604
  {
605
  "epoch": 1.71,
606
  "learning_rate": 4.315068493150685e-06,
607
+ "loss": 0.6125,
608
  "step": 50000
609
  },
610
  {
611
  "epoch": 1.73,
612
  "learning_rate": 4.058219178082192e-06,
613
+ "loss": 0.5673,
614
  "step": 50500
615
  },
616
  {
617
  "epoch": 1.75,
618
  "learning_rate": 3.801369863013699e-06,
619
+ "loss": 0.5724,
620
  "step": 51000
621
  },
622
  {
623
  "epoch": 1.76,
624
  "learning_rate": 3.5445205479452056e-06,
625
+ "loss": 0.5494,
626
  "step": 51500
627
  },
628
  {
629
  "epoch": 1.78,
630
  "learning_rate": 3.2876712328767123e-06,
631
+ "loss": 0.6055,
632
  "step": 52000
633
  },
634
  {
635
  "epoch": 1.8,
636
  "learning_rate": 3.0308219178082194e-06,
637
+ "loss": 0.5904,
638
  "step": 52500
639
  },
640
  {
641
  "epoch": 1.82,
642
  "learning_rate": 2.773972602739726e-06,
643
+ "loss": 0.6092,
644
  "step": 53000
645
  },
646
  {
647
  "epoch": 1.83,
648
  "learning_rate": 2.517123287671233e-06,
649
+ "loss": 0.6142,
650
  "step": 53500
651
  },
652
  {
653
  "epoch": 1.85,
654
  "learning_rate": 2.2602739726027396e-06,
655
+ "loss": 0.6156,
656
  "step": 54000
657
  },
658
  {
659
  "epoch": 1.87,
660
  "learning_rate": 2.0034246575342467e-06,
661
+ "loss": 0.6119,
662
  "step": 54500
663
  },
664
  {
665
  "epoch": 1.88,
666
  "learning_rate": 1.7465753424657534e-06,
667
+ "loss": 0.603,
668
  "step": 55000
669
  },
670
  {
671
  "epoch": 1.9,
672
  "learning_rate": 1.4897260273972603e-06,
673
+ "loss": 0.6102,
674
  "step": 55500
675
  },
676
  {
677
  "epoch": 1.92,
678
  "learning_rate": 1.232876712328767e-06,
679
+ "loss": 0.6271,
680
  "step": 56000
681
  },
682
  {
683
  "epoch": 1.93,
684
  "learning_rate": 9.76027397260274e-07,
685
+ "loss": 0.5944,
686
  "step": 56500
687
  },
688
  {
689
  "epoch": 1.95,
690
  "learning_rate": 7.191780821917808e-07,
691
+ "loss": 0.6215,
692
  "step": 57000
693
  },
694
  {
695
  "epoch": 1.97,
696
  "learning_rate": 4.6232876712328767e-07,
697
+ "loss": 0.5623,
698
  "step": 57500
699
  },
700
  {
701
  "epoch": 1.99,
702
  "learning_rate": 2.0547945205479452e-07,
703
+ "loss": 0.568,
704
  "step": 58000
705
  },
706
  {
707
  "epoch": 2.0,
708
  "step": 58400,
709
+ "total_flos": 7.731654113627136e+16,
710
+ "train_loss": 0.1063714675380759,
711
+ "train_runtime": 3374.8833,
712
+ "train_samples_per_second": 51.912,
713
+ "train_steps_per_second": 17.304
714
  }
715
  ],
716
  "max_steps": 58400,
717
  "num_train_epochs": 2,
718
+ "total_flos": 7.731654113627136e+16,
719
  "trial_name": null,
720
  "trial_params": null
721
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b12994a7a655385d0a3c0d9185c29fa4c8afd4c81e4b449c00e38652af68e330
3
  size 2991
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27427062fea82c900bf392207a9ffd2341e20b264fef7f86768cda41ba705db8
3
  size 2991