Bingsu commited on
Commit
73b05b1
1 Parent(s): 83847fb

Training in progress, step 10000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ef929787429c88c576570a6fdf928468aedd3cf5c685150c4d00c23ed2574b5
3
  size 100170757
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c205aa1e4f5bca6eeb78422e0866131d9cc58c5a15fab7af01cc2505a790df94
3
  size 100170757
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b649041f4e942a1c71886ca3f83f11fc846bb4752950f41b84cd207e3cc556c
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f48f41b7b489ac24545b446eb3e28cf98daf67da888804a7513071641e3cb0ab
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e521fc0b39ea1a8ce6e0a681bc8a834da87e83fd7cd3754aacc6c4b558e0937
3
- size 14439
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b635e00d41e3e6e46449e8609b1f91fbb1205e9587df7d15d6fa3ee2cfe466d
3
+ size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:677543783c6139b35f47c70ebab6ad516807591d9ad24cec274ace85b8305217
3
  size 246897640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97030c307e5479014866fb832070891e4b01f3e0d70ac54668f29d2602b645e8
3
  size 246897640
last-checkpoint/trainer_state.json CHANGED
@@ -1,916 +1,316 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.1289213579716373,
5
- "global_step": 30000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
- "learning_rate": 0.00040004211081201384,
13
- "loss": 8.3496,
14
  "step": 200
15
  },
16
  {
17
  "epoch": 0.0,
18
- "learning_rate": 0.000400168442509171,
19
- "loss": 8.2272,
20
  "step": 400
21
  },
22
  {
23
  "epoch": 0.0,
24
- "learning_rate": 0.000400378992874836,
25
- "loss": 7.6879,
26
  "step": 600
27
  },
28
  {
29
  "epoch": 0.0,
30
- "learning_rate": 0.0004006737582146567,
31
- "loss": 7.4747,
32
  "step": 800
33
  },
34
  {
35
  "epoch": 0.0,
36
- "learning_rate": 0.0004010527333566261,
37
- "loss": 7.2829,
38
  "step": 1000
39
  },
40
  {
41
  "epoch": 0.01,
42
- "learning_rate": 0.0004015159116511832,
43
- "loss": 7.1171,
44
  "step": 1200
45
  },
46
  {
47
  "epoch": 0.01,
48
- "learning_rate": 0.00040206328497132196,
49
- "loss": 6.9445,
50
  "step": 1400
51
  },
52
  {
53
  "epoch": 0.01,
54
- "learning_rate": 0.0004026948437127389,
55
- "loss": 6.8391,
56
  "step": 1600
57
  },
58
  {
59
  "epoch": 0.01,
60
- "learning_rate": 0.0004034105767939909,
61
- "loss": 6.7131,
62
  "step": 1800
63
  },
64
  {
65
  "epoch": 0.01,
66
- "learning_rate": 0.00040421047165670534,
67
- "loss": 6.6113,
68
  "step": 2000
69
  },
70
  {
71
  "epoch": 0.01,
72
- "learning_rate": 0.0004050945142657896,
73
- "loss": 6.4966,
74
  "step": 2200
75
  },
76
  {
77
  "epoch": 0.01,
78
- "learning_rate": 0.0004060626891096795,
79
- "loss": 6.3979,
80
  "step": 2400
81
  },
82
  {
83
  "epoch": 0.01,
84
- "learning_rate": 0.0004071149792006148,
85
- "loss": 6.3116,
86
  "step": 2600
87
  },
88
  {
89
  "epoch": 0.01,
90
- "learning_rate": 0.00040825136607492915,
91
- "loss": 6.2301,
92
  "step": 2800
93
  },
94
  {
95
  "epoch": 0.01,
96
- "learning_rate": 0.0004094718297933883,
97
- "loss": 6.123,
98
  "step": 3000
99
  },
100
  {
101
  "epoch": 0.01,
102
- "learning_rate": 0.0004107763489415231,
103
- "loss": 6.0802,
104
  "step": 3200
105
  },
106
  {
107
  "epoch": 0.01,
108
- "learning_rate": 0.00041216490063001633,
109
- "loss": 6.0029,
110
  "step": 3400
111
  },
112
  {
113
  "epoch": 0.02,
114
- "learning_rate": 0.00041363746049510354,
115
- "loss": 5.9471,
116
  "step": 3600
117
  },
118
  {
119
  "epoch": 0.02,
120
- "learning_rate": 0.0004151940026989945,
121
- "loss": 5.9132,
122
  "step": 3800
123
  },
124
  {
125
  "epoch": 0.02,
126
- "learning_rate": 0.0004168344999303346,
127
- "loss": 5.8561,
128
  "step": 4000
129
  },
130
  {
131
  "epoch": 0.02,
132
- "learning_rate": 0.00041855892340467854,
133
- "loss": 5.8044,
134
  "step": 4200
135
  },
136
  {
137
  "epoch": 0.02,
138
- "learning_rate": 0.0004203672428649916,
139
- "loss": 5.734,
140
  "step": 4400
141
  },
142
  {
143
  "epoch": 0.02,
144
- "learning_rate": 0.0004222594265821944,
145
- "loss": 5.7245,
146
  "step": 4600
147
  },
148
  {
149
  "epoch": 0.02,
150
- "learning_rate": 0.0004242354413557057,
151
- "loss": 5.6867,
152
  "step": 4800
153
  },
154
  {
155
  "epoch": 0.02,
156
- "learning_rate": 0.00042629525251402893,
157
- "loss": 5.6387,
158
  "step": 5000
159
  },
160
  {
161
  "epoch": 0.02,
162
- "learning_rate": 0.0004284388239153662,
163
- "loss": 5.6119,
164
  "step": 5200
165
  },
166
  {
167
  "epoch": 0.02,
168
- "learning_rate": 0.0004306661179482429,
169
- "loss": 5.5533,
170
  "step": 5400
171
  },
172
  {
173
  "epoch": 0.02,
174
- "learning_rate": 0.0004329770955321787,
175
- "loss": 5.517,
176
  "step": 5600
177
  },
178
  {
179
  "epoch": 0.02,
180
- "learning_rate": 0.0004353717161183629,
181
- "loss": 5.4864,
182
  "step": 5800
183
  },
184
  {
185
  "epoch": 0.03,
186
- "learning_rate": 0.0004378499376903721,
187
- "loss": 5.4671,
188
  "step": 6000
189
  },
190
  {
191
  "epoch": 0.03,
192
- "learning_rate": 0.00044041171676490604,
193
- "loss": 5.4412,
194
  "step": 6200
195
  },
196
  {
197
  "epoch": 0.03,
198
- "learning_rate": 0.0004430570083925455,
199
- "loss": 5.4108,
200
  "step": 6400
201
  },
202
  {
203
  "epoch": 0.03,
204
- "learning_rate": 0.0004457857661585539,
205
- "loss": 5.3807,
206
  "step": 6600
207
  },
208
  {
209
  "epoch": 0.03,
210
- "learning_rate": 0.0004485979421836768,
211
- "loss": 5.3353,
212
  "step": 6800
213
  },
214
  {
215
  "epoch": 0.03,
216
- "learning_rate": 0.0004514934871249904,
217
- "loss": 5.3277,
218
  "step": 7000
219
  },
220
  {
221
  "epoch": 0.03,
222
- "learning_rate": 0.00045447235017676696,
223
- "loss": 5.2979,
224
  "step": 7200
225
  },
226
  {
227
  "epoch": 0.03,
228
- "learning_rate": 0.00045753447907136494,
229
- "loss": 5.2791,
230
  "step": 7400
231
  },
232
  {
233
  "epoch": 0.03,
234
- "learning_rate": 0.000460679820080143,
235
- "loss": 5.2494,
236
  "step": 7600
237
  },
238
  {
239
  "epoch": 0.03,
240
- "learning_rate": 0.00046390831801440893,
241
- "loss": 5.2175,
242
  "step": 7800
243
  },
244
  {
245
  "epoch": 0.03,
246
- "learning_rate": 0.0004672199162263843,
247
- "loss": 5.2038,
248
  "step": 8000
249
  },
250
  {
251
  "epoch": 0.04,
252
- "learning_rate": 0.0004706145566101966,
253
- "loss": 5.1835,
254
  "step": 8200
255
  },
256
  {
257
  "epoch": 0.04,
258
- "learning_rate": 0.0004740921796029061,
259
- "loss": 5.1691,
260
  "step": 8400
261
  },
262
  {
263
  "epoch": 0.04,
264
- "learning_rate": 0.0004776527241855382,
265
- "loss": 5.1582,
266
  "step": 8600
267
  },
268
  {
269
  "epoch": 0.04,
270
- "learning_rate": 0.0004812961278841711,
271
- "loss": 5.1504,
272
  "step": 8800
273
  },
274
  {
275
  "epoch": 0.04,
276
- "learning_rate": 0.0004850223267710129,
277
- "loss": 5.1162,
278
  "step": 9000
279
  },
280
  {
281
  "epoch": 0.04,
282
- "learning_rate": 0.0004888312554655432,
283
- "loss": 5.0957,
284
  "step": 9200
285
  },
286
  {
287
  "epoch": 0.04,
288
- "learning_rate": 0.0004927228471356421,
289
- "loss": 5.079,
290
  "step": 9400
291
  },
292
  {
293
  "epoch": 0.04,
294
- "learning_rate": 0.0004966970334987757,
295
- "loss": 5.0572,
296
  "step": 9600
297
  },
298
  {
299
  "epoch": 0.04,
300
- "learning_rate": 0.0005007537448231871,
301
- "loss": 5.0342,
302
  "step": 9800
303
  },
304
  {
305
  "epoch": 0.04,
306
- "learning_rate": 0.0005048929099291249,
307
- "loss": 5.0106,
308
  "step": 10000
309
- },
310
- {
311
- "epoch": 0.04,
312
- "learning_rate": 0.0005091144561900837,
313
- "loss": 5.0155,
314
- "step": 10200
315
- },
316
- {
317
- "epoch": 0.04,
318
- "learning_rate": 0.0005134183095340927,
319
- "loss": 4.9817,
320
- "step": 10400
321
- },
322
- {
323
- "epoch": 0.05,
324
- "learning_rate": 0.0005178043944449977,
325
- "loss": 4.9742,
326
- "step": 10600
327
- },
328
- {
329
- "epoch": 0.05,
330
- "learning_rate": 0.0005222726339638023,
331
- "loss": 4.9299,
332
- "step": 10800
333
- },
334
- {
335
- "epoch": 0.05,
336
- "learning_rate": 0.0005268229496900086,
337
- "loss": 4.9208,
338
- "step": 11000
339
- },
340
- {
341
- "epoch": 0.05,
342
- "learning_rate": 0.0005314552617829947,
343
- "loss": 4.8617,
344
- "step": 11200
345
- },
346
- {
347
- "epoch": 0.05,
348
- "learning_rate": 0.0005361694889634196,
349
- "loss": 4.7952,
350
- "step": 11400
351
- },
352
- {
353
- "epoch": 0.05,
354
- "learning_rate": 0.0005409655485146408,
355
- "loss": 4.7641,
356
- "step": 11600
357
- },
358
- {
359
- "epoch": 0.05,
360
- "learning_rate": 0.0005458433562841782,
361
- "loss": 4.7361,
362
- "step": 11800
363
- },
364
- {
365
- "epoch": 0.05,
366
- "learning_rate": 0.0005508028266851747,
367
- "loss": 4.7023,
368
- "step": 12000
369
- },
370
- {
371
- "epoch": 0.05,
372
- "learning_rate": 0.000555843872697916,
373
- "loss": 4.6561,
374
- "step": 12200
375
- },
376
- {
377
- "epoch": 0.05,
378
- "learning_rate": 0.0005609664058713396,
379
- "loss": 4.63,
380
- "step": 12400
381
- },
382
- {
383
- "epoch": 0.05,
384
- "learning_rate": 0.0005661703363245996,
385
- "loss": 4.6307,
386
- "step": 12600
387
- },
388
- {
389
- "epoch": 0.06,
390
- "learning_rate": 0.0005714555727486404,
391
- "loss": 4.5881,
392
- "step": 12800
393
- },
394
- {
395
- "epoch": 0.06,
396
- "learning_rate": 0.0005768220224077955,
397
- "loss": 4.5489,
398
- "step": 13000
399
- },
400
- {
401
- "epoch": 0.06,
402
- "learning_rate": 0.0005822695911414169,
403
- "loss": 4.5521,
404
- "step": 13200
405
- },
406
- {
407
- "epoch": 0.06,
408
- "learning_rate": 0.0005877981833655298,
409
- "loss": 4.5165,
410
- "step": 13400
411
- },
412
- {
413
- "epoch": 0.06,
414
- "learning_rate": 0.0005934077020745051,
415
- "loss": 4.505,
416
- "step": 13600
417
- },
418
- {
419
- "epoch": 0.06,
420
- "learning_rate": 0.0005990980488427659,
421
- "loss": 4.4863,
422
- "step": 13800
423
- },
424
- {
425
- "epoch": 0.06,
426
- "learning_rate": 0.000604869123826509,
427
- "loss": 4.5071,
428
- "step": 14000
429
- },
430
- {
431
- "epoch": 0.06,
432
- "learning_rate": 0.0006107208257654633,
433
- "loss": 4.4501,
434
- "step": 14200
435
- },
436
- {
437
- "epoch": 0.06,
438
- "learning_rate": 0.0006166530519846631,
439
- "loss": 4.4623,
440
- "step": 14400
441
- },
442
- {
443
- "epoch": 0.06,
444
- "learning_rate": 0.0006226656983962468,
445
- "loss": 4.4336,
446
- "step": 14600
447
- },
448
- {
449
- "epoch": 0.06,
450
- "learning_rate": 0.0006287586595012887,
451
- "loss": 4.4335,
452
- "step": 14800
453
- },
454
- {
455
- "epoch": 0.06,
456
- "learning_rate": 0.000634931828391647,
457
- "loss": 4.4142,
458
- "step": 15000
459
- },
460
- {
461
- "epoch": 0.07,
462
- "learning_rate": 0.0006411850967518416,
463
- "loss": 4.4145,
464
- "step": 15200
465
- },
466
- {
467
- "epoch": 0.07,
468
- "learning_rate": 0.0006475183548609511,
469
- "loss": 4.3842,
470
- "step": 15400
471
- },
472
- {
473
- "epoch": 0.07,
474
- "learning_rate": 0.0006539314915945428,
475
- "loss": 4.3748,
476
- "step": 15600
477
- },
478
- {
479
- "epoch": 0.07,
480
- "learning_rate": 0.0006604243944266178,
481
- "loss": 4.3815,
482
- "step": 15800
483
- },
484
- {
485
- "epoch": 0.07,
486
- "learning_rate": 0.0006669969494315867,
487
- "loss": 4.352,
488
- "step": 16000
489
- },
490
- {
491
- "epoch": 0.07,
492
- "learning_rate": 0.0006736490412862749,
493
- "loss": 4.3575,
494
- "step": 16200
495
- },
496
- {
497
- "epoch": 0.07,
498
- "learning_rate": 0.000680380553271933,
499
- "loss": 4.3416,
500
- "step": 16400
501
- },
502
- {
503
- "epoch": 0.07,
504
- "learning_rate": 0.0006871913672762998,
505
- "loss": 4.341,
506
- "step": 16600
507
- },
508
- {
509
- "epoch": 0.07,
510
- "learning_rate": 0.0006940813637956594,
511
- "loss": 4.3183,
512
- "step": 16800
513
- },
514
- {
515
- "epoch": 0.07,
516
- "learning_rate": 0.0007010504219369541,
517
- "loss": 4.3145,
518
- "step": 17000
519
- },
520
- {
521
- "epoch": 0.07,
522
- "learning_rate": 0.0007080984194198885,
523
- "loss": 4.3065,
524
- "step": 17200
525
- },
526
- {
527
- "epoch": 0.07,
528
- "learning_rate": 0.0007152252325790948,
529
- "loss": 4.2805,
530
- "step": 17400
531
- },
532
- {
533
- "epoch": 0.08,
534
- "learning_rate": 0.0007224307363662818,
535
- "loss": 4.2804,
536
- "step": 17600
537
- },
538
- {
539
- "epoch": 0.08,
540
- "learning_rate": 0.0007297148043524434,
541
- "loss": 4.2996,
542
- "step": 17800
543
- },
544
- {
545
- "epoch": 0.08,
546
- "learning_rate": 0.0007370773087300737,
547
- "loss": 4.2743,
548
- "step": 18000
549
- },
550
- {
551
- "epoch": 0.08,
552
- "learning_rate": 0.0007445181203154048,
553
- "loss": 4.2621,
554
- "step": 18200
555
- },
556
- {
557
- "epoch": 0.08,
558
- "learning_rate": 0.0007520371085506811,
559
- "loss": 4.2548,
560
- "step": 18400
561
- },
562
- {
563
- "epoch": 0.08,
564
- "learning_rate": 0.0007596341415064441,
565
- "loss": 4.2643,
566
- "step": 18600
567
- },
568
- {
569
- "epoch": 0.08,
570
- "learning_rate": 0.0007673090858838494,
571
- "loss": 4.266,
572
- "step": 18800
573
- },
574
- {
575
- "epoch": 0.08,
576
- "learning_rate": 0.0007750618070170041,
577
- "loss": 4.2503,
578
- "step": 19000
579
- },
580
- {
581
- "epoch": 0.08,
582
- "learning_rate": 0.0007828921688753324,
583
- "loss": 4.2093,
584
- "step": 19200
585
- },
586
- {
587
- "epoch": 0.08,
588
- "learning_rate": 0.0007908000340659631,
589
- "loss": 4.2449,
590
- "step": 19400
591
- },
592
- {
593
- "epoch": 0.08,
594
- "learning_rate": 0.0007987852638361333,
595
- "loss": 4.2158,
596
- "step": 19600
597
- },
598
- {
599
- "epoch": 0.09,
600
- "learning_rate": 0.0008068477180756314,
601
- "loss": 4.202,
602
- "step": 19800
603
- },
604
- {
605
- "epoch": 0.09,
606
- "learning_rate": 0.0008149872553192515,
607
- "loss": 4.2065,
608
- "step": 20000
609
- },
610
- {
611
- "epoch": 0.09,
612
- "learning_rate": 0.0008232037327492777,
613
- "loss": 4.1773,
614
- "step": 20200
615
- },
616
- {
617
- "epoch": 0.09,
618
- "learning_rate": 0.0008314970061979818,
619
- "loss": 4.1904,
620
- "step": 20400
621
- },
622
- {
623
- "epoch": 0.09,
624
- "learning_rate": 0.0008398669301501703,
625
- "loss": 4.1868,
626
- "step": 20600
627
- },
628
- {
629
- "epoch": 0.09,
630
- "learning_rate": 0.0008483133577457148,
631
- "loss": 4.2006,
632
- "step": 20800
633
- },
634
- {
635
- "epoch": 0.09,
636
- "learning_rate": 0.0008568361407821495,
637
- "loss": 4.1467,
638
- "step": 21000
639
- },
640
- {
641
- "epoch": 0.09,
642
- "learning_rate": 0.0008654351297172607,
643
- "loss": 4.1585,
644
- "step": 21200
645
- },
646
- {
647
- "epoch": 0.09,
648
- "learning_rate": 0.0008741101736717116,
649
- "loss": 4.1547,
650
- "step": 21400
651
- },
652
- {
653
- "epoch": 0.09,
654
- "learning_rate": 0.0008828611204316911,
655
- "loss": 4.1557,
656
- "step": 21600
657
- },
658
- {
659
- "epoch": 0.09,
660
- "learning_rate": 0.0008916878164515838,
661
- "loss": 4.1496,
662
- "step": 21800
663
- },
664
- {
665
- "epoch": 0.09,
666
- "learning_rate": 0.0009005901068566691,
667
- "loss": 4.1434,
668
- "step": 22000
669
- },
670
- {
671
- "epoch": 0.1,
672
- "learning_rate": 0.0009095678354458306,
673
- "loss": 4.1173,
674
- "step": 22200
675
- },
676
- {
677
- "epoch": 0.1,
678
- "learning_rate": 0.0009186208446943008,
679
- "loss": 4.1364,
680
- "step": 22400
681
- },
682
- {
683
- "epoch": 0.1,
684
- "learning_rate": 0.0009277489757564244,
685
- "loss": 4.1445,
686
- "step": 22600
687
- },
688
- {
689
- "epoch": 0.1,
690
- "learning_rate": 0.0009369520684684475,
691
- "loss": 4.1156,
692
- "step": 22800
693
- },
694
- {
695
- "epoch": 0.1,
696
- "learning_rate": 0.0009462299613513248,
697
- "loss": 4.1033,
698
- "step": 23000
699
- },
700
- {
701
- "epoch": 0.1,
702
- "learning_rate": 0.0009555824916135536,
703
- "loss": 4.1187,
704
- "step": 23200
705
- },
706
- {
707
- "epoch": 0.1,
708
- "learning_rate": 0.0009650094951540386,
709
- "loss": 4.0823,
710
- "step": 23400
711
- },
712
- {
713
- "epoch": 0.1,
714
- "learning_rate": 0.0009745108065649499,
715
- "loss": 4.0624,
716
- "step": 23600
717
- },
718
- {
719
- "epoch": 0.1,
720
- "learning_rate": 0.0009840862591346498,
721
- "loss": 4.0845,
722
- "step": 23800
723
- },
724
- {
725
- "epoch": 0.1,
726
- "learning_rate": 0.0009937356848506058,
727
- "loss": 4.0483,
728
- "step": 24000
729
- },
730
- {
731
- "epoch": 0.1,
732
- "learning_rate": 0.001003458914402332,
733
- "loss": 4.0512,
734
- "step": 24200
735
- },
736
- {
737
- "epoch": 0.1,
738
- "learning_rate": 0.0010132557771843787,
739
- "loss": 4.0606,
740
- "step": 24400
741
- },
742
- {
743
- "epoch": 0.11,
744
- "learning_rate": 0.0010231261012993067,
745
- "loss": 4.046,
746
- "step": 24600
747
- },
748
- {
749
- "epoch": 0.11,
750
- "learning_rate": 0.0010330697135607168,
751
- "loss": 4.0315,
752
- "step": 24800
753
- },
754
- {
755
- "epoch": 0.11,
756
- "learning_rate": 0.00104308643949628,
757
- "loss": 4.0179,
758
- "step": 25000
759
- },
760
- {
761
- "epoch": 0.11,
762
- "learning_rate": 0.001053176103350803,
763
- "loss": 4.0351,
764
- "step": 25200
765
- },
766
- {
767
- "epoch": 0.11,
768
- "learning_rate": 0.0010633385280893123,
769
- "loss": 4.02,
770
- "step": 25400
771
- },
772
- {
773
- "epoch": 0.11,
774
- "learning_rate": 0.0010735735354001595,
775
- "loss": 4.0201,
776
- "step": 25600
777
- },
778
- {
779
- "epoch": 0.11,
780
- "learning_rate": 0.0010838809456981471,
781
- "loss": 4.0044,
782
- "step": 25800
783
- },
784
- {
785
- "epoch": 0.11,
786
- "learning_rate": 0.001094260578127686,
787
- "loss": 3.9914,
788
- "step": 26000
789
- },
790
- {
791
- "epoch": 0.11,
792
- "learning_rate": 0.0011047122505659646,
793
- "loss": 3.9991,
794
- "step": 26200
795
- },
796
- {
797
- "epoch": 0.11,
798
- "learning_rate": 0.0011152357796261423,
799
- "loss": 4.0109,
800
- "step": 26400
801
- },
802
- {
803
- "epoch": 0.11,
804
- "learning_rate": 0.0011258309806605731,
805
- "loss": 4.0405,
806
- "step": 26600
807
- },
808
- {
809
- "epoch": 0.12,
810
- "learning_rate": 0.0011364976677640387,
811
- "loss": 4.0349,
812
- "step": 26800
813
- },
814
- {
815
- "epoch": 0.12,
816
- "learning_rate": 0.0011472356537770186,
817
- "loss": 4.0312,
818
- "step": 27000
819
- },
820
- {
821
- "epoch": 0.12,
822
- "learning_rate": 0.0011580447502889633,
823
- "loss": 4.0185,
824
- "step": 27200
825
- },
826
- {
827
- "epoch": 0.12,
828
- "learning_rate": 0.0011689247676416152,
829
- "loss": 4.011,
830
- "step": 27400
831
- },
832
- {
833
- "epoch": 0.12,
834
- "learning_rate": 0.0011798755149323176,
835
- "loss": 3.9898,
836
- "step": 27600
837
- },
838
- {
839
- "epoch": 0.12,
840
- "learning_rate": 0.001190896800017379,
841
- "loss": 3.981,
842
- "step": 27800
843
- },
844
- {
845
- "epoch": 0.12,
846
- "learning_rate": 0.0012019884295154416,
847
- "loss": 3.949,
848
- "step": 28000
849
- },
850
- {
851
- "epoch": 0.12,
852
- "learning_rate": 0.0012131502088108658,
853
- "loss": 3.9896,
854
- "step": 28200
855
- },
856
- {
857
- "epoch": 0.12,
858
- "learning_rate": 0.0012243819420571598,
859
- "loss": 3.9951,
860
- "step": 28400
861
- },
862
- {
863
- "epoch": 0.12,
864
- "learning_rate": 0.0012356834321804039,
865
- "loss": 4.0361,
866
- "step": 28600
867
- },
868
- {
869
- "epoch": 0.12,
870
- "learning_rate": 0.0012470544808827113,
871
- "loss": 4.1212,
872
- "step": 28800
873
- },
874
- {
875
- "epoch": 0.12,
876
- "learning_rate": 0.001258494888645708,
877
- "loss": 4.0721,
878
- "step": 29000
879
- },
880
- {
881
- "epoch": 0.13,
882
- "learning_rate": 0.0012700044547340368,
883
- "loss": 4.0311,
884
- "step": 29200
885
- },
886
- {
887
- "epoch": 0.13,
888
- "learning_rate": 0.0012815829771988738,
889
- "loss": 4.0114,
890
- "step": 29400
891
- },
892
- {
893
- "epoch": 0.13,
894
- "learning_rate": 0.001293230252881479,
895
- "loss": 3.9868,
896
- "step": 29600
897
- },
898
- {
899
- "epoch": 0.13,
900
- "learning_rate": 0.0013049460774167514,
901
- "loss": 3.9881,
902
- "step": 29800
903
- },
904
- {
905
- "epoch": 0.13,
906
- "learning_rate": 0.0013167302452368242,
907
- "loss": 3.9705,
908
- "step": 30000
909
  }
910
  ],
911
  "max_steps": 500000,
912
  "num_train_epochs": 3,
913
- "total_flos": 4.781489946624e+16,
914
  "trial_name": null,
915
  "trial_params": null
916
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.042973785990545764,
5
+ "global_step": 10000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
+ "learning_rate": 4.0004211081201405e-05,
13
+ "loss": 3.9197,
14
  "step": 200
15
  },
16
  {
17
  "epoch": 0.0,
18
+ "learning_rate": 4.0016844250917146e-05,
19
+ "loss": 3.9108,
20
  "step": 400
21
  },
22
  {
23
  "epoch": 0.0,
24
+ "learning_rate": 4.003789928748371e-05,
25
+ "loss": 3.8655,
26
  "step": 600
27
  },
28
  {
29
  "epoch": 0.0,
30
+ "learning_rate": 4.006737582146571e-05,
31
+ "loss": 3.8602,
32
  "step": 800
33
  },
34
  {
35
  "epoch": 0.0,
36
+ "learning_rate": 4.010527333566261e-05,
37
+ "loss": 3.8478,
38
  "step": 1000
39
  },
40
  {
41
  "epoch": 0.01,
42
+ "learning_rate": 4.0151591165118474e-05,
43
+ "loss": 3.8608,
44
  "step": 1200
45
  },
46
  {
47
  "epoch": 0.01,
48
+ "learning_rate": 4.020632849713237e-05,
49
+ "loss": 3.8431,
50
  "step": 1400
51
  },
52
  {
53
  "epoch": 0.01,
54
+ "learning_rate": 4.0269484371273996e-05,
55
+ "loss": 3.8447,
56
  "step": 1600
57
  },
58
  {
59
  "epoch": 0.01,
60
+ "learning_rate": 4.034105767939918e-05,
61
+ "loss": 3.8422,
62
  "step": 1800
63
  },
64
  {
65
  "epoch": 0.01,
66
+ "learning_rate": 4.0421047165670686e-05,
67
+ "loss": 3.8019,
68
  "step": 2000
69
  },
70
  {
71
  "epoch": 0.01,
72
+ "learning_rate": 4.050945142657905e-05,
73
+ "loss": 3.8109,
74
  "step": 2200
75
  },
76
  {
77
  "epoch": 0.01,
78
+ "learning_rate": 4.0606268910968035e-05,
79
+ "loss": 3.8192,
80
  "step": 2400
81
  },
82
  {
83
  "epoch": 0.01,
84
+ "learning_rate": 4.071149792006154e-05,
85
+ "loss": 3.8037,
86
  "step": 2600
87
  },
88
  {
89
  "epoch": 0.01,
90
+ "learning_rate": 4.082513660749298e-05,
91
+ "loss": 3.8074,
92
  "step": 2800
93
  },
94
  {
95
  "epoch": 0.01,
96
+ "learning_rate": 4.0947182979338874e-05,
97
+ "loss": 3.801,
98
  "step": 3000
99
  },
100
  {
101
  "epoch": 0.01,
102
+ "learning_rate": 4.1077634894152375e-05,
103
+ "loss": 3.786,
104
  "step": 3200
105
  },
106
  {
107
  "epoch": 0.01,
108
+ "learning_rate": 4.121649006300163e-05,
109
+ "loss": 3.7799,
110
  "step": 3400
111
  },
112
  {
113
  "epoch": 0.02,
114
+ "learning_rate": 4.136374604951046e-05,
115
+ "loss": 3.7721,
116
  "step": 3600
117
  },
118
  {
119
  "epoch": 0.02,
120
+ "learning_rate": 4.1519400269899535e-05,
121
+ "loss": 3.7829,
122
  "step": 3800
123
  },
124
  {
125
  "epoch": 0.02,
126
+ "learning_rate": 4.1683449993033545e-05,
127
+ "loss": 3.7922,
128
  "step": 4000
129
  },
130
  {
131
  "epoch": 0.02,
132
+ "learning_rate": 4.185589234046783e-05,
133
+ "loss": 3.7686,
134
  "step": 4200
135
  },
136
  {
137
  "epoch": 0.02,
138
+ "learning_rate": 4.203672428649923e-05,
139
+ "loss": 3.7742,
140
  "step": 4400
141
  },
142
  {
143
  "epoch": 0.02,
144
+ "learning_rate": 4.2225942658219505e-05,
145
+ "loss": 3.753,
146
  "step": 4600
147
  },
148
  {
149
  "epoch": 0.02,
150
+ "learning_rate": 4.242354413557057e-05,
151
+ "loss": 3.7663,
152
  "step": 4800
153
  },
154
  {
155
  "epoch": 0.02,
156
+ "learning_rate": 4.262952525140289e-05,
157
+ "loss": 3.7589,
158
  "step": 5000
159
  },
160
  {
161
  "epoch": 0.02,
162
+ "learning_rate": 4.2843882391536666e-05,
163
+ "loss": 3.7652,
164
  "step": 5200
165
  },
166
  {
167
  "epoch": 0.02,
168
+ "learning_rate": 4.306661179482435e-05,
169
+ "loss": 3.7618,
170
  "step": 5400
171
  },
172
  {
173
  "epoch": 0.02,
174
+ "learning_rate": 4.329770955321802e-05,
175
+ "loss": 3.7441,
176
  "step": 5600
177
  },
178
  {
179
  "epoch": 0.02,
180
+ "learning_rate": 4.3537171611836355e-05,
181
+ "loss": 3.7364,
182
  "step": 5800
183
  },
184
  {
185
  "epoch": 0.03,
186
+ "learning_rate": 4.3784993769037384e-05,
187
+ "loss": 3.7501,
188
  "step": 6000
189
  },
190
  {
191
  "epoch": 0.03,
192
+ "learning_rate": 4.404117167649071e-05,
193
+ "loss": 3.7363,
194
  "step": 6200
195
  },
196
  {
197
  "epoch": 0.03,
198
+ "learning_rate": 4.43057008392547e-05,
199
+ "loss": 3.7279,
200
  "step": 6400
201
  },
202
  {
203
  "epoch": 0.03,
204
+ "learning_rate": 4.457857661585541e-05,
205
+ "loss": 3.7158,
206
  "step": 6600
207
  },
208
  {
209
  "epoch": 0.03,
210
+ "learning_rate": 4.485979421836779e-05,
211
+ "loss": 3.74,
212
  "step": 6800
213
  },
214
  {
215
  "epoch": 0.03,
216
+ "learning_rate": 4.514934871249906e-05,
217
+ "loss": 3.7441,
218
  "step": 7000
219
  },
220
  {
221
  "epoch": 0.03,
222
+ "learning_rate": 4.544723501767687e-05,
223
+ "loss": 3.7141,
224
  "step": 7200
225
  },
226
  {
227
  "epoch": 0.03,
228
+ "learning_rate": 4.575344790713656e-05,
229
+ "loss": 3.7264,
230
  "step": 7400
231
  },
232
  {
233
  "epoch": 0.03,
234
+ "learning_rate": 4.6067982008014406e-05,
235
+ "loss": 3.7221,
236
  "step": 7600
237
  },
238
  {
239
  "epoch": 0.03,
240
+ "learning_rate": 4.639083180144098e-05,
241
+ "loss": 3.7363,
242
  "step": 7800
243
  },
244
  {
245
  "epoch": 0.03,
246
+ "learning_rate": 4.672199162263839e-05,
247
+ "loss": 3.73,
248
  "step": 8000
249
  },
250
  {
251
  "epoch": 0.04,
252
+ "learning_rate": 4.706145566101981e-05,
253
+ "loss": 3.7205,
254
  "step": 8200
255
  },
256
  {
257
  "epoch": 0.04,
258
+ "learning_rate": 4.740921796029065e-05,
259
+ "loss": 3.7192,
260
  "step": 8400
261
  },
262
  {
263
  "epoch": 0.04,
264
+ "learning_rate": 4.776527241855393e-05,
265
+ "loss": 3.7095,
266
  "step": 8600
267
  },
268
  {
269
  "epoch": 0.04,
270
+ "learning_rate": 4.812961278841707e-05,
271
+ "loss": 3.7311,
272
  "step": 8800
273
  },
274
  {
275
  "epoch": 0.04,
276
+ "learning_rate": 4.85022326771014e-05,
277
+ "loss": 3.7031,
278
  "step": 9000
279
  },
280
  {
281
  "epoch": 0.04,
282
+ "learning_rate": 4.888312554655438e-05,
283
+ "loss": 3.6934,
284
  "step": 9200
285
  },
286
  {
287
  "epoch": 0.04,
288
+ "learning_rate": 4.92722847135643e-05,
289
+ "loss": 3.7218,
290
  "step": 9400
291
  },
292
  {
293
  "epoch": 0.04,
294
+ "learning_rate": 4.9669703349877704e-05,
295
+ "loss": 3.711,
296
  "step": 9600
297
  },
298
  {
299
  "epoch": 0.04,
300
+ "learning_rate": 5.0075374482318863e-05,
301
+ "loss": 3.7002,
302
  "step": 9800
303
  },
304
  {
305
  "epoch": 0.04,
306
+ "learning_rate": 5.0489290992912625e-05,
307
+ "loss": 3.698,
308
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  }
310
  ],
311
  "max_steps": 500000,
312
  "num_train_epochs": 3,
313
+ "total_flos": 1.593829982208e+16,
314
  "trial_name": null,
315
  "trial_params": null
316
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b76655be47d6fa9534e270ebad89f26359197ddd96685592630dba92c71f881
3
  size 3375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db1575e3c0c4f23842bbe1ba840257aad501aecba3d1800f59d7014cd73b0cee
3
  size 3375
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b649041f4e942a1c71886ca3f83f11fc846bb4752950f41b84cd207e3cc556c
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f48f41b7b489ac24545b446eb3e28cf98daf67da888804a7513071641e3cb0ab
3
  size 146774203
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b76655be47d6fa9534e270ebad89f26359197ddd96685592630dba92c71f881
3
  size 3375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db1575e3c0c4f23842bbe1ba840257aad501aecba3d1800f59d7014cd73b0cee
3
  size 3375