sekarmulyani commited on
Commit
5d3abcb
·
1 Parent(s): 9908fcf

Upload 8 files

Browse files
Files changed (6) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +112 -672
  6. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0665fe7d442f8bbba1ae059bff6270660242a07678e0703e7514cb19706073e0
3
  size 995641861
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:010cc660b5d560447c7fb8b57393b8f97dba3cba806dcec5ee0218894aebf79d
3
  size 995641861
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8aa3fe30180d66d68538a7a9c18a7fef46e6dfcbc3adc5e789b26d04ddfe8e1f
3
  size 497807197
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8292a97db24ad842f8357890c37607ffdb8c0abbf25b087a8cc581fd45f68c4
3
  size 497807197
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f6cf74835c1af9f9e3dc4bcfbc0eae1e84048401ffb87d26ff318411e17c02d
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbc44877a85dc9e31508ab5cdcb4b09e15e4ccd881628820393d3ed5e0b4726
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:381b4f1af09e750b9ce29da1e140136f186310be59fcc0dc325e00c9f5f3a3d0
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272d9d323ca9bc4225a532f1ca51900b269e2ae9a9366402febf725ced99fda9
3
  size 627
trainer_state.json CHANGED
@@ -1,743 +1,183 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 57385,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04,
13
- "learning_rate": 9.945543260433913e-06,
14
- "loss": 3.7697,
15
  "step": 500
16
  },
17
  {
18
- "epoch": 0.09,
19
- "learning_rate": 9.891086520867823e-06,
20
- "loss": 2.9544,
21
  "step": 1000
22
  },
23
- {
24
- "epoch": 0.13,
25
- "learning_rate": 9.836629781301735e-06,
26
- "loss": 2.4309,
27
- "step": 1500
28
- },
29
- {
30
- "epoch": 0.17,
31
- "learning_rate": 9.782173041735646e-06,
32
- "loss": 2.1416,
33
- "step": 2000
34
- },
35
- {
36
- "epoch": 0.22,
37
- "learning_rate": 9.727716302169558e-06,
38
- "loss": 2.0346,
39
- "step": 2500
40
- },
41
- {
42
- "epoch": 0.26,
43
- "learning_rate": 9.673259562603468e-06,
44
- "loss": 1.9859,
45
- "step": 3000
46
- },
47
- {
48
- "epoch": 0.3,
49
- "learning_rate": 9.61880282303738e-06,
50
- "loss": 1.9512,
51
- "step": 3500
52
- },
53
- {
54
- "epoch": 0.35,
55
- "learning_rate": 9.56434608347129e-06,
56
- "loss": 1.9171,
57
- "step": 4000
58
- },
59
- {
60
- "epoch": 0.39,
61
- "learning_rate": 9.509889343905202e-06,
62
- "loss": 1.8989,
63
- "step": 4500
64
- },
65
- {
66
- "epoch": 0.44,
67
- "learning_rate": 9.455432604339114e-06,
68
- "loss": 1.868,
69
- "step": 5000
70
- },
71
- {
72
- "epoch": 0.48,
73
- "learning_rate": 9.400975864773026e-06,
74
- "loss": 1.8423,
75
- "step": 5500
76
- },
77
- {
78
- "epoch": 0.52,
79
- "learning_rate": 9.346519125206936e-06,
80
- "loss": 1.8311,
81
- "step": 6000
82
- },
83
- {
84
- "epoch": 0.57,
85
- "learning_rate": 9.292062385640848e-06,
86
- "loss": 1.8139,
87
- "step": 6500
88
- },
89
- {
90
- "epoch": 0.61,
91
- "learning_rate": 9.237605646074758e-06,
92
- "loss": 1.809,
93
- "step": 7000
94
- },
95
- {
96
- "epoch": 0.65,
97
- "learning_rate": 9.18314890650867e-06,
98
- "loss": 1.7932,
99
- "step": 7500
100
- },
101
- {
102
- "epoch": 0.7,
103
- "learning_rate": 9.128692166942582e-06,
104
- "loss": 1.7807,
105
- "step": 8000
106
- },
107
- {
108
- "epoch": 0.74,
109
- "learning_rate": 9.074235427376494e-06,
110
- "loss": 1.7729,
111
- "step": 8500
112
- },
113
- {
114
- "epoch": 0.78,
115
- "learning_rate": 9.019778687810404e-06,
116
- "loss": 1.7695,
117
- "step": 9000
118
- },
119
- {
120
- "epoch": 0.83,
121
- "learning_rate": 8.965321948244316e-06,
122
- "loss": 1.7464,
123
- "step": 9500
124
- },
125
- {
126
- "epoch": 0.87,
127
- "learning_rate": 8.910865208678226e-06,
128
- "loss": 1.7436,
129
- "step": 10000
130
- },
131
- {
132
- "epoch": 0.91,
133
- "learning_rate": 8.856408469112138e-06,
134
- "loss": 1.736,
135
- "step": 10500
136
- },
137
- {
138
- "epoch": 0.96,
139
- "learning_rate": 8.80195172954605e-06,
140
- "loss": 1.729,
141
- "step": 11000
142
- },
143
  {
144
  "epoch": 1.0,
145
- "eval_loss": 1.7001079320907593,
146
- "eval_runtime": 3.4215,
147
- "eval_samples_per_second": 39.748,
148
- "eval_steps_per_second": 4.969,
149
- "step": 11477
150
  },
151
  {
152
- "epoch": 1.0,
153
- "learning_rate": 8.747494989979961e-06,
154
- "loss": 1.7264,
155
- "step": 11500
156
- },
157
- {
158
- "epoch": 1.05,
159
- "learning_rate": 8.693038250413871e-06,
160
- "loss": 1.708,
161
- "step": 12000
162
- },
163
- {
164
- "epoch": 1.09,
165
- "learning_rate": 8.638581510847783e-06,
166
- "loss": 1.6948,
167
- "step": 12500
168
- },
169
- {
170
- "epoch": 1.13,
171
- "learning_rate": 8.584124771281695e-06,
172
- "loss": 1.698,
173
- "step": 13000
174
- },
175
- {
176
- "epoch": 1.18,
177
- "learning_rate": 8.529668031715605e-06,
178
- "loss": 1.692,
179
- "step": 13500
180
- },
181
- {
182
- "epoch": 1.22,
183
- "learning_rate": 8.475211292149517e-06,
184
- "loss": 1.684,
185
- "step": 14000
186
- },
187
- {
188
- "epoch": 1.26,
189
- "learning_rate": 8.420754552583429e-06,
190
- "loss": 1.6879,
191
- "step": 14500
192
- },
193
- {
194
- "epoch": 1.31,
195
- "learning_rate": 8.36629781301734e-06,
196
- "loss": 1.6804,
197
- "step": 15000
198
- },
199
- {
200
- "epoch": 1.35,
201
- "learning_rate": 8.311841073451251e-06,
202
- "loss": 1.6713,
203
- "step": 15500
204
- },
205
- {
206
- "epoch": 1.39,
207
- "learning_rate": 8.257384333885163e-06,
208
- "loss": 1.6703,
209
- "step": 16000
210
- },
211
- {
212
- "epoch": 1.44,
213
- "learning_rate": 8.202927594319073e-06,
214
- "loss": 1.6646,
215
- "step": 16500
216
- },
217
- {
218
- "epoch": 1.48,
219
- "learning_rate": 8.148470854752985e-06,
220
- "loss": 1.651,
221
- "step": 17000
222
- },
223
- {
224
- "epoch": 1.52,
225
- "learning_rate": 8.094014115186897e-06,
226
- "loss": 1.6488,
227
- "step": 17500
228
- },
229
- {
230
- "epoch": 1.57,
231
- "learning_rate": 8.039557375620808e-06,
232
- "loss": 1.6452,
233
- "step": 18000
234
- },
235
- {
236
- "epoch": 1.61,
237
- "learning_rate": 7.985100636054719e-06,
238
- "loss": 1.6386,
239
- "step": 18500
240
- },
241
- {
242
- "epoch": 1.66,
243
- "learning_rate": 7.93064389648863e-06,
244
- "loss": 1.6349,
245
- "step": 19000
246
- },
247
- {
248
- "epoch": 1.7,
249
- "learning_rate": 7.87618715692254e-06,
250
- "loss": 1.6345,
251
- "step": 19500
252
- },
253
- {
254
- "epoch": 1.74,
255
- "learning_rate": 7.821730417356452e-06,
256
- "loss": 1.6294,
257
- "step": 20000
258
- },
259
- {
260
- "epoch": 1.79,
261
- "learning_rate": 7.767273677790364e-06,
262
- "loss": 1.631,
263
- "step": 20500
264
- },
265
- {
266
- "epoch": 1.83,
267
- "learning_rate": 7.712816938224276e-06,
268
- "loss": 1.6261,
269
- "step": 21000
270
- },
271
- {
272
- "epoch": 1.87,
273
- "learning_rate": 7.658360198658186e-06,
274
- "loss": 1.6281,
275
- "step": 21500
276
- },
277
- {
278
- "epoch": 1.92,
279
- "learning_rate": 7.603903459092098e-06,
280
- "loss": 1.611,
281
- "step": 22000
282
- },
283
- {
284
- "epoch": 1.96,
285
- "learning_rate": 7.549446719526009e-06,
286
- "loss": 1.6155,
287
- "step": 22500
288
  },
289
  {
290
- "epoch": 2.0,
291
- "eval_loss": 1.596663475036621,
292
- "eval_runtime": 3.4296,
293
- "eval_samples_per_second": 39.655,
294
- "eval_steps_per_second": 4.957,
295
- "step": 22954
296
  },
297
  {
298
  "epoch": 2.0,
299
- "learning_rate": 7.49498997995992e-06,
300
- "loss": 1.6029,
301
- "step": 23000
302
- },
303
- {
304
- "epoch": 2.05,
305
- "learning_rate": 7.440533240393831e-06,
306
- "loss": 1.607,
307
- "step": 23500
308
- },
309
- {
310
- "epoch": 2.09,
311
- "learning_rate": 7.386076500827744e-06,
312
- "loss": 1.5977,
313
- "step": 24000
314
- },
315
- {
316
- "epoch": 2.13,
317
- "learning_rate": 7.331619761261655e-06,
318
- "loss": 1.5922,
319
- "step": 24500
320
- },
321
- {
322
- "epoch": 2.18,
323
- "learning_rate": 7.277163021695566e-06,
324
- "loss": 1.5956,
325
- "step": 25000
326
  },
327
  {
328
- "epoch": 2.22,
329
- "learning_rate": 7.222706282129477e-06,
330
- "loss": 1.5855,
331
- "step": 25500
332
- },
333
- {
334
- "epoch": 2.27,
335
- "learning_rate": 7.168249542563388e-06,
336
- "loss": 1.5826,
337
- "step": 26000
338
- },
339
- {
340
- "epoch": 2.31,
341
- "learning_rate": 7.1137928029972995e-06,
342
- "loss": 1.5846,
343
- "step": 26500
344
- },
345
- {
346
- "epoch": 2.35,
347
- "learning_rate": 7.059336063431211e-06,
348
- "loss": 1.5899,
349
- "step": 27000
350
- },
351
- {
352
- "epoch": 2.4,
353
- "learning_rate": 7.004879323865122e-06,
354
- "loss": 1.5828,
355
- "step": 27500
356
- },
357
- {
358
- "epoch": 2.44,
359
- "learning_rate": 6.950422584299033e-06,
360
- "loss": 1.5762,
361
- "step": 28000
362
- },
363
- {
364
- "epoch": 2.48,
365
- "learning_rate": 6.895965844732945e-06,
366
- "loss": 1.5739,
367
- "step": 28500
368
- },
369
- {
370
- "epoch": 2.53,
371
- "learning_rate": 6.841509105166856e-06,
372
- "loss": 1.574,
373
- "step": 29000
374
- },
375
- {
376
- "epoch": 2.57,
377
- "learning_rate": 6.787052365600767e-06,
378
- "loss": 1.5759,
379
- "step": 29500
380
- },
381
- {
382
- "epoch": 2.61,
383
- "learning_rate": 6.732595626034678e-06,
384
- "loss": 1.5737,
385
- "step": 30000
386
- },
387
- {
388
- "epoch": 2.66,
389
- "learning_rate": 6.67813888646859e-06,
390
- "loss": 1.5637,
391
- "step": 30500
392
- },
393
- {
394
- "epoch": 2.7,
395
- "learning_rate": 6.623682146902502e-06,
396
- "loss": 1.5635,
397
- "step": 31000
398
- },
399
- {
400
- "epoch": 2.74,
401
- "learning_rate": 6.569225407336413e-06,
402
- "loss": 1.5641,
403
- "step": 31500
404
- },
405
- {
406
- "epoch": 2.79,
407
- "learning_rate": 6.514768667770324e-06,
408
- "loss": 1.553,
409
- "step": 32000
410
- },
411
- {
412
- "epoch": 2.83,
413
- "learning_rate": 6.460311928204235e-06,
414
- "loss": 1.5699,
415
- "step": 32500
416
- },
417
- {
418
- "epoch": 2.88,
419
- "learning_rate": 6.405855188638146e-06,
420
- "loss": 1.5695,
421
- "step": 33000
422
  },
423
  {
424
- "epoch": 2.92,
425
- "learning_rate": 6.3513984490720584e-06,
426
- "loss": 1.5665,
427
- "step": 33500
428
  },
429
  {
430
  "epoch": 2.96,
431
- "learning_rate": 6.296941709505969e-06,
432
- "loss": 1.5527,
433
- "step": 34000
434
  },
435
  {
436
  "epoch": 3.0,
437
- "eval_loss": 1.5436657667160034,
438
- "eval_runtime": 3.2624,
439
- "eval_samples_per_second": 41.687,
440
- "eval_steps_per_second": 5.211,
441
- "step": 34431
442
- },
443
- {
444
- "epoch": 3.01,
445
- "learning_rate": 6.24248496993988e-06,
446
- "loss": 1.5506,
447
- "step": 34500
448
- },
449
- {
450
- "epoch": 3.05,
451
- "learning_rate": 6.188028230373791e-06,
452
- "loss": 1.559,
453
- "step": 35000
454
- },
455
- {
456
- "epoch": 3.09,
457
- "learning_rate": 6.133571490807702e-06,
458
- "loss": 1.5388,
459
- "step": 35500
460
- },
461
- {
462
- "epoch": 3.14,
463
- "learning_rate": 6.079114751241613e-06,
464
- "loss": 1.5467,
465
- "step": 36000
466
- },
467
- {
468
- "epoch": 3.18,
469
- "learning_rate": 6.024658011675526e-06,
470
- "loss": 1.5391,
471
- "step": 36500
472
- },
473
- {
474
- "epoch": 3.22,
475
- "learning_rate": 5.970201272109437e-06,
476
- "loss": 1.5364,
477
- "step": 37000
478
- },
479
- {
480
- "epoch": 3.27,
481
- "learning_rate": 5.915744532543348e-06,
482
- "loss": 1.5376,
483
- "step": 37500
484
- },
485
- {
486
- "epoch": 3.31,
487
- "learning_rate": 5.861287792977259e-06,
488
- "loss": 1.5397,
489
- "step": 38000
490
- },
491
- {
492
- "epoch": 3.35,
493
- "learning_rate": 5.806831053411171e-06,
494
- "loss": 1.5336,
495
- "step": 38500
496
- },
497
- {
498
- "epoch": 3.4,
499
- "learning_rate": 5.752374313845082e-06,
500
- "loss": 1.5378,
501
- "step": 39000
502
- },
503
- {
504
- "epoch": 3.44,
505
- "learning_rate": 5.697917574278993e-06,
506
- "loss": 1.5318,
507
- "step": 39500
508
- },
509
- {
510
- "epoch": 3.49,
511
- "learning_rate": 5.643460834712905e-06,
512
- "loss": 1.5252,
513
- "step": 40000
514
- },
515
- {
516
- "epoch": 3.53,
517
- "learning_rate": 5.589004095146816e-06,
518
- "loss": 1.5333,
519
- "step": 40500
520
- },
521
- {
522
- "epoch": 3.57,
523
- "learning_rate": 5.5345473555807275e-06,
524
- "loss": 1.5299,
525
- "step": 41000
526
- },
527
- {
528
- "epoch": 3.62,
529
- "learning_rate": 5.4800906160146385e-06,
530
- "loss": 1.5215,
531
- "step": 41500
532
- },
533
- {
534
- "epoch": 3.66,
535
- "learning_rate": 5.4256338764485495e-06,
536
- "loss": 1.52,
537
- "step": 42000
538
- },
539
- {
540
- "epoch": 3.7,
541
- "learning_rate": 5.3711771368824605e-06,
542
- "loss": 1.5258,
543
- "step": 42500
544
- },
545
- {
546
- "epoch": 3.75,
547
- "learning_rate": 5.316720397316373e-06,
548
- "loss": 1.5256,
549
- "step": 43000
550
- },
551
- {
552
- "epoch": 3.79,
553
- "learning_rate": 5.262263657750284e-06,
554
- "loss": 1.5205,
555
- "step": 43500
556
- },
557
- {
558
- "epoch": 3.83,
559
- "learning_rate": 5.207806918184195e-06,
560
- "loss": 1.5236,
561
- "step": 44000
562
  },
563
  {
564
- "epoch": 3.88,
565
- "learning_rate": 5.153350178618106e-06,
566
- "loss": 1.5281,
567
- "step": 44500
568
- },
569
- {
570
- "epoch": 3.92,
571
- "learning_rate": 5.098893439052017e-06,
572
- "loss": 1.5175,
573
- "step": 45000
574
  },
575
  {
576
- "epoch": 3.96,
577
- "learning_rate": 5.044436699485928e-06,
578
- "loss": 1.5215,
579
- "step": 45500
580
  },
581
  {
582
  "epoch": 4.0,
583
- "eval_loss": 1.5107132196426392,
584
- "eval_runtime": 3.2672,
585
- "eval_samples_per_second": 41.625,
586
- "eval_steps_per_second": 5.203,
587
- "step": 45908
588
- },
589
- {
590
- "epoch": 4.01,
591
- "learning_rate": 4.98997995991984e-06,
592
- "loss": 1.5202,
593
- "step": 46000
594
- },
595
- {
596
- "epoch": 4.05,
597
- "learning_rate": 4.935523220353751e-06,
598
- "loss": 1.5136,
599
- "step": 46500
600
- },
601
- {
602
- "epoch": 4.1,
603
- "learning_rate": 4.881066480787663e-06,
604
- "loss": 1.5119,
605
- "step": 47000
606
- },
607
- {
608
- "epoch": 4.14,
609
- "learning_rate": 4.826609741221574e-06,
610
- "loss": 1.5052,
611
- "step": 47500
612
- },
613
- {
614
- "epoch": 4.18,
615
- "learning_rate": 4.772153001655485e-06,
616
- "loss": 1.5088,
617
- "step": 48000
618
  },
619
  {
620
  "epoch": 4.23,
621
- "learning_rate": 4.717696262089397e-06,
622
- "loss": 1.5078,
623
- "step": 48500
624
- },
625
- {
626
- "epoch": 4.27,
627
- "learning_rate": 4.663239522523308e-06,
628
- "loss": 1.5099,
629
- "step": 49000
630
- },
631
- {
632
- "epoch": 4.31,
633
- "learning_rate": 4.608782782957219e-06,
634
- "loss": 1.5098,
635
- "step": 49500
636
- },
637
- {
638
- "epoch": 4.36,
639
- "learning_rate": 4.5543260433911305e-06,
640
- "loss": 1.5044,
641
- "step": 50000
642
- },
643
- {
644
- "epoch": 4.4,
645
- "learning_rate": 4.4998693038250415e-06,
646
- "loss": 1.5049,
647
- "step": 50500
648
- },
649
- {
650
- "epoch": 4.44,
651
- "learning_rate": 4.445412564258953e-06,
652
- "loss": 1.4958,
653
- "step": 51000
654
- },
655
- {
656
- "epoch": 4.49,
657
- "learning_rate": 4.390955824692864e-06,
658
- "loss": 1.5073,
659
- "step": 51500
660
- },
661
- {
662
- "epoch": 4.53,
663
- "learning_rate": 4.336499085126776e-06,
664
- "loss": 1.5015,
665
- "step": 52000
666
  },
667
  {
668
- "epoch": 4.57,
669
- "learning_rate": 4.282042345560687e-06,
670
- "loss": 1.5022,
671
- "step": 52500
672
  },
673
  {
674
- "epoch": 4.62,
675
- "learning_rate": 4.227585605994598e-06,
676
- "loss": 1.4973,
677
- "step": 53000
 
 
678
  },
679
  {
680
- "epoch": 4.66,
681
- "learning_rate": 4.17312886642851e-06,
682
- "loss": 1.4944,
683
- "step": 53500
684
  },
685
  {
686
- "epoch": 4.71,
687
- "learning_rate": 4.118672126862421e-06,
688
- "loss": 1.5024,
689
- "step": 54000
690
  },
691
  {
692
- "epoch": 4.75,
693
- "learning_rate": 4.064215387296332e-06,
694
- "loss": 1.4981,
695
- "step": 54500
696
  },
697
  {
698
- "epoch": 4.79,
699
- "learning_rate": 4.009758647730244e-06,
700
- "loss": 1.4942,
701
- "step": 55000
 
 
702
  },
703
  {
704
- "epoch": 4.84,
705
- "learning_rate": 3.955301908164155e-06,
706
- "loss": 1.4964,
707
- "step": 55500
708
  },
709
  {
710
- "epoch": 4.88,
711
- "learning_rate": 3.900845168598066e-06,
712
- "loss": 1.4995,
713
- "step": 56000
714
  },
715
  {
716
- "epoch": 4.92,
717
- "learning_rate": 3.846388429031978e-06,
718
- "loss": 1.4959,
719
- "step": 56500
 
 
720
  },
721
  {
722
- "epoch": 4.97,
723
- "learning_rate": 3.7919316894658886e-06,
724
- "loss": 1.4922,
725
- "step": 57000
726
  },
727
  {
728
- "epoch": 5.0,
729
- "eval_loss": 1.4904537200927734,
730
- "eval_runtime": 3.2628,
731
- "eval_samples_per_second": 41.682,
732
- "eval_steps_per_second": 5.21,
733
- "step": 57385
734
  }
735
  ],
736
  "logging_steps": 500,
737
- "max_steps": 91816,
738
- "num_train_epochs": 8,
739
- "save_steps": 500,
740
- "total_flos": 8.99595804672e+16,
741
  "trial_name": null,
742
  "trial_params": null
743
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 8.0,
5
  "eval_steps": 500,
6
+ "global_step": 9456,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.42,
13
+ "learning_rate": 9.894247038917091e-06,
14
+ "loss": 3.5932,
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 0.85,
19
+ "learning_rate": 9.78849407783418e-06,
20
+ "loss": 3.336,
21
  "step": 1000
22
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  {
24
  "epoch": 1.0,
25
+ "eval_loss": 3.2544538974761963,
26
+ "eval_runtime": 6.101,
27
+ "eval_samples_per_second": 41.961,
28
+ "eval_steps_per_second": 5.245,
29
+ "step": 1182
30
  },
31
  {
32
+ "epoch": 1.27,
33
+ "learning_rate": 9.68274111675127e-06,
34
+ "loss": 3.2562,
35
+ "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  },
37
  {
38
+ "epoch": 1.69,
39
+ "learning_rate": 9.57698815566836e-06,
40
+ "loss": 3.1967,
41
+ "step": 2000
 
 
42
  },
43
  {
44
  "epoch": 2.0,
45
+ "eval_loss": 3.1576380729675293,
46
+ "eval_runtime": 6.0951,
47
+ "eval_samples_per_second": 42.001,
48
+ "eval_steps_per_second": 5.25,
49
+ "step": 2364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  },
51
  {
52
+ "epoch": 2.12,
53
+ "learning_rate": 9.47123519458545e-06,
54
+ "loss": 3.1613,
55
+ "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  },
57
  {
58
+ "epoch": 2.54,
59
+ "learning_rate": 9.36548223350254e-06,
60
+ "loss": 3.1268,
61
+ "step": 3000
62
  },
63
  {
64
  "epoch": 2.96,
65
+ "learning_rate": 9.25972927241963e-06,
66
+ "loss": 3.1148,
67
+ "step": 3500
68
  },
69
  {
70
  "epoch": 3.0,
71
+ "eval_loss": 3.1073131561279297,
72
+ "eval_runtime": 6.0936,
73
+ "eval_samples_per_second": 42.011,
74
+ "eval_steps_per_second": 5.251,
75
+ "step": 3546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  },
77
  {
78
+ "epoch": 3.38,
79
+ "learning_rate": 9.15397631133672e-06,
80
+ "loss": 3.0804,
81
+ "step": 4000
 
 
 
 
 
 
82
  },
83
  {
84
+ "epoch": 3.81,
85
+ "learning_rate": 9.048223350253808e-06,
86
+ "loss": 3.0814,
87
+ "step": 4500
88
  },
89
  {
90
  "epoch": 4.0,
91
+ "eval_loss": 3.074193000793457,
92
+ "eval_runtime": 6.0939,
93
+ "eval_samples_per_second": 42.009,
94
+ "eval_steps_per_second": 5.251,
95
+ "step": 4728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  },
97
  {
98
  "epoch": 4.23,
99
+ "learning_rate": 8.942470389170898e-06,
100
+ "loss": 3.054,
101
+ "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  },
103
  {
104
+ "epoch": 4.65,
105
+ "learning_rate": 8.836717428087988e-06,
106
+ "loss": 3.0355,
107
+ "step": 5500
108
  },
109
  {
110
+ "epoch": 5.0,
111
+ "eval_loss": 3.04995059967041,
112
+ "eval_runtime": 6.09,
113
+ "eval_samples_per_second": 42.036,
114
+ "eval_steps_per_second": 5.254,
115
+ "step": 5910
116
  },
117
  {
118
+ "epoch": 5.08,
119
+ "learning_rate": 8.730964467005076e-06,
120
+ "loss": 3.0365,
121
+ "step": 6000
122
  },
123
  {
124
+ "epoch": 5.5,
125
+ "learning_rate": 8.625211505922166e-06,
126
+ "loss": 3.0104,
127
+ "step": 6500
128
  },
129
  {
130
+ "epoch": 5.92,
131
+ "learning_rate": 8.519458544839256e-06,
132
+ "loss": 3.0126,
133
+ "step": 7000
134
  },
135
  {
136
+ "epoch": 6.0,
137
+ "eval_loss": 3.0317230224609375,
138
+ "eval_runtime": 6.0899,
139
+ "eval_samples_per_second": 42.037,
140
+ "eval_steps_per_second": 5.255,
141
+ "step": 7092
142
  },
143
  {
144
+ "epoch": 6.35,
145
+ "learning_rate": 8.413705583756346e-06,
146
+ "loss": 2.9923,
147
+ "step": 7500
148
  },
149
  {
150
+ "epoch": 6.77,
151
+ "learning_rate": 8.307952622673435e-06,
152
+ "loss": 2.9902,
153
+ "step": 8000
154
  },
155
  {
156
+ "epoch": 7.0,
157
+ "eval_loss": 3.0167293548583984,
158
+ "eval_runtime": 6.0906,
159
+ "eval_samples_per_second": 42.032,
160
+ "eval_steps_per_second": 5.254,
161
+ "step": 8274
162
  },
163
  {
164
+ "epoch": 7.19,
165
+ "learning_rate": 8.202199661590525e-06,
166
+ "loss": 2.9783,
167
+ "step": 8500
168
  },
169
  {
170
+ "epoch": 7.61,
171
+ "learning_rate": 8.096446700507615e-06,
172
+ "loss": 2.9722,
173
+ "step": 9000
 
 
174
  }
175
  ],
176
  "logging_steps": 500,
177
+ "max_steps": 47280,
178
+ "num_train_epochs": 40,
179
+ "save_steps": 9456,
180
+ "total_flos": 1.4819961470976e+16,
181
  "trial_name": null,
182
  "trial_params": null
183
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed2cc3807546f2cb55ecbce521c3690c744d9469e27b3404476816476ca082c6
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae185e461383878779e83ce5ef9c772ed06a4c034c0c8dd422cbd6909ed7f255
3
  size 4027