Azamorn commited on
Commit
1e49201
1 Parent(s): f8b9bbf

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +1 -1
  2. model.safetensors +1 -1
  3. optimizer.pt +1 -1
  4. scheduler.pt +1 -1
  5. test.py +1 -1
  6. trainer_state.json +205 -805
  7. training_args.bin +1 -1
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "retnet-tinystories/checkpoint-2000",
3
  "activation_dropout": 0.0,
4
  "activation_fn": "swish",
5
  "architectures": [
 
1
  {
2
+ "_name_or_path": "retnet-tinystories/checkpoint-1000",
3
  "activation_dropout": 0.0,
4
  "activation_fn": "swish",
5
  "architectures": [
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a4476df93c13330ac9bd204b05aa25cadc8aa9356e295e4a1a4c330cb7e6f15
3
  size 1615784368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efb8d686544ce31e8332f4d01bf2f1292e335f25b4bde181585534af00dbb892
3
  size 1615784368
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60e51b3393d17c358c62db87d47057d81911db30098b095117c2224a95990545
3
  size 3231643962
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42bf76c3ae8af625277b114f928e4b22175b21adf0ef5c85ad1163f7e2ed5d6e
3
  size 3231643962
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cc9f4ab547746c5f2b075d31622dd3c792964f62b6ce351d90227b6dd977bc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8864d82a35bb3c3fc324c1383bdf60d8a772caaa4595c375264f859cb778d14b
3
  size 1064
test.py CHANGED
@@ -1,7 +1,7 @@
1
  from transformers import AutoTokenizer
2
  from retnet.modeling_retnet import RetNetForCausalLM
3
 
4
- model = RetNetForCausalLM.from_pretrained("retnet-tinystories")
5
  tokenizer = AutoTokenizer.from_pretrained('gpt2')
6
  tokenizer.model_max_length = 16384
7
  tokenizer.pad_token = tokenizer.eos_token
 
1
  from transformers import AutoTokenizer
2
  from retnet.modeling_retnet import RetNetForCausalLM
3
 
4
+ model = RetNetForCausalLM.from_pretrained("./")
5
  tokenizer = AutoTokenizer.from_pretrained('gpt2')
6
  tokenizer.model_max_length = 16384
7
  tokenizer.pad_token = tokenizer.eos_token
trainer_state.json CHANGED
@@ -1,1220 +1,620 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "learning_rate": 0.0004995,
14
- "loss": 2.0736,
15
  "step": 5
16
  },
17
  {
18
  "epoch": 0.01,
19
- "learning_rate": 0.000499,
20
- "loss": 2.165,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
- "learning_rate": 0.0004985,
26
- "loss": 2.084,
27
  "step": 15
28
  },
29
  {
30
  "epoch": 0.02,
31
- "learning_rate": 0.000498,
32
- "loss": 2.1784,
33
  "step": 20
34
  },
35
  {
36
  "epoch": 0.03,
37
- "learning_rate": 0.0004975,
38
- "loss": 2.0932,
39
  "step": 25
40
  },
41
  {
42
  "epoch": 0.03,
43
- "learning_rate": 0.000497,
44
- "loss": 2.056,
45
  "step": 30
46
  },
47
  {
48
  "epoch": 0.04,
49
- "learning_rate": 0.0004965,
50
- "loss": 2.0944,
51
  "step": 35
52
  },
53
  {
54
  "epoch": 0.04,
55
- "learning_rate": 0.000496,
56
- "loss": 2.1092,
57
  "step": 40
58
  },
59
  {
60
  "epoch": 0.04,
61
- "learning_rate": 0.0004955,
62
- "loss": 2.1146,
63
  "step": 45
64
  },
65
  {
66
  "epoch": 0.05,
67
- "learning_rate": 0.000495,
68
- "loss": 2.1368,
69
  "step": 50
70
  },
71
  {
72
  "epoch": 0.06,
73
- "learning_rate": 0.0004945,
74
- "loss": 2.0964,
75
  "step": 55
76
  },
77
  {
78
  "epoch": 0.06,
79
- "learning_rate": 0.000494,
80
- "loss": 2.0192,
81
  "step": 60
82
  },
83
  {
84
  "epoch": 0.07,
85
- "learning_rate": 0.0004935,
86
- "loss": 2.0938,
87
  "step": 65
88
  },
89
  {
90
  "epoch": 0.07,
91
- "learning_rate": 0.0004930000000000001,
92
- "loss": 2.0871,
93
  "step": 70
94
  },
95
  {
96
  "epoch": 0.07,
97
- "learning_rate": 0.0004925,
98
- "loss": 2.0773,
99
  "step": 75
100
  },
101
  {
102
  "epoch": 0.08,
103
- "learning_rate": 0.000492,
104
- "loss": 2.0618,
105
  "step": 80
106
  },
107
  {
108
  "epoch": 0.09,
109
- "learning_rate": 0.0004915,
110
- "loss": 2.0788,
111
  "step": 85
112
  },
113
  {
114
  "epoch": 0.09,
115
- "learning_rate": 0.000491,
116
- "loss": 2.1009,
117
  "step": 90
118
  },
119
  {
120
  "epoch": 0.1,
121
- "learning_rate": 0.0004905,
122
- "loss": 2.0138,
123
  "step": 95
124
  },
125
  {
126
  "epoch": 0.1,
127
- "learning_rate": 0.00049,
128
- "loss": 2.0689,
129
  "step": 100
130
  },
131
  {
132
  "epoch": 0.1,
133
- "learning_rate": 0.0004895,
134
- "loss": 2.0381,
135
  "step": 105
136
  },
137
  {
138
  "epoch": 0.11,
139
- "learning_rate": 0.000489,
140
- "loss": 2.1174,
141
  "step": 110
142
  },
143
  {
144
  "epoch": 0.12,
145
- "learning_rate": 0.0004885,
146
- "loss": 2.0315,
147
  "step": 115
148
  },
149
  {
150
  "epoch": 0.12,
151
- "learning_rate": 0.000488,
152
- "loss": 2.0351,
153
  "step": 120
154
  },
155
  {
156
  "epoch": 0.12,
157
- "learning_rate": 0.0004875,
158
- "loss": 2.1047,
159
  "step": 125
160
  },
161
  {
162
  "epoch": 0.13,
163
- "learning_rate": 0.000487,
164
- "loss": 2.0413,
165
  "step": 130
166
  },
167
  {
168
  "epoch": 0.14,
169
- "learning_rate": 0.0004865,
170
- "loss": 2.0978,
171
  "step": 135
172
  },
173
  {
174
  "epoch": 0.14,
175
- "learning_rate": 0.000486,
176
- "loss": 2.1018,
177
  "step": 140
178
  },
179
  {
180
  "epoch": 0.14,
181
- "learning_rate": 0.0004855,
182
- "loss": 2.0679,
183
  "step": 145
184
  },
185
  {
186
  "epoch": 0.15,
187
- "learning_rate": 0.00048499999999999997,
188
- "loss": 1.974,
189
  "step": 150
190
  },
191
  {
192
  "epoch": 0.15,
193
- "learning_rate": 0.0004845,
194
- "loss": 2.0155,
195
  "step": 155
196
  },
197
  {
198
  "epoch": 0.16,
199
- "learning_rate": 0.000484,
200
- "loss": 2.0396,
201
  "step": 160
202
  },
203
  {
204
  "epoch": 0.17,
205
- "learning_rate": 0.0004835,
206
- "loss": 2.0207,
207
  "step": 165
208
  },
209
  {
210
  "epoch": 0.17,
211
- "learning_rate": 0.000483,
212
- "loss": 2.017,
213
  "step": 170
214
  },
215
  {
216
  "epoch": 0.17,
217
- "learning_rate": 0.0004825,
218
- "loss": 2.0844,
219
  "step": 175
220
  },
221
  {
222
  "epoch": 0.18,
223
- "learning_rate": 0.000482,
224
- "loss": 2.0593,
225
  "step": 180
226
  },
227
  {
228
  "epoch": 0.18,
229
- "learning_rate": 0.0004815,
230
- "loss": 2.0018,
231
  "step": 185
232
  },
233
  {
234
  "epoch": 0.19,
235
- "learning_rate": 0.000481,
236
- "loss": 1.9544,
237
  "step": 190
238
  },
239
  {
240
  "epoch": 0.2,
241
- "learning_rate": 0.00048049999999999997,
242
- "loss": 2.063,
243
  "step": 195
244
  },
245
  {
246
  "epoch": 0.2,
247
- "learning_rate": 0.00048,
248
- "loss": 2.0345,
249
  "step": 200
250
  },
251
  {
252
  "epoch": 0.2,
253
- "learning_rate": 0.0004795,
254
- "loss": 2.0164,
255
  "step": 205
256
  },
257
  {
258
  "epoch": 0.21,
259
- "learning_rate": 0.000479,
260
- "loss": 1.9801,
261
  "step": 210
262
  },
263
  {
264
  "epoch": 0.21,
265
- "learning_rate": 0.0004785,
266
- "loss": 2.038,
267
  "step": 215
268
  },
269
  {
270
  "epoch": 0.22,
271
- "learning_rate": 0.00047799999999999996,
272
- "loss": 2.0919,
273
  "step": 220
274
  },
275
  {
276
  "epoch": 0.23,
277
- "learning_rate": 0.0004775,
278
- "loss": 2.0151,
279
  "step": 225
280
  },
281
  {
282
  "epoch": 0.23,
283
- "learning_rate": 0.000477,
284
- "loss": 1.9425,
285
  "step": 230
286
  },
287
  {
288
  "epoch": 0.23,
289
- "learning_rate": 0.0004765,
290
- "loss": 2.0252,
291
  "step": 235
292
  },
293
  {
294
  "epoch": 0.24,
295
- "learning_rate": 0.00047599999999999997,
296
- "loss": 1.965,
297
  "step": 240
298
  },
299
  {
300
  "epoch": 0.24,
301
- "learning_rate": 0.0004755,
302
- "loss": 1.9773,
303
  "step": 245
304
  },
305
  {
306
  "epoch": 0.25,
307
- "learning_rate": 0.000475,
308
- "loss": 1.9828,
309
  "step": 250
310
  },
311
  {
312
  "epoch": 0.26,
313
- "learning_rate": 0.0004745,
314
- "loss": 1.9247,
315
  "step": 255
316
  },
317
  {
318
  "epoch": 0.26,
319
- "learning_rate": 0.000474,
320
- "loss": 1.9878,
321
  "step": 260
322
  },
323
  {
324
  "epoch": 0.27,
325
- "learning_rate": 0.00047349999999999996,
326
- "loss": 2.0097,
327
  "step": 265
328
  },
329
  {
330
  "epoch": 0.27,
331
- "learning_rate": 0.000473,
332
- "loss": 2.0276,
333
  "step": 270
334
  },
335
  {
336
  "epoch": 0.28,
337
- "learning_rate": 0.0004725,
338
- "loss": 1.949,
339
  "step": 275
340
  },
341
  {
342
  "epoch": 0.28,
343
- "learning_rate": 0.000472,
344
- "loss": 2.0144,
345
  "step": 280
346
  },
347
  {
348
  "epoch": 0.28,
349
- "learning_rate": 0.00047149999999999997,
350
- "loss": 1.9787,
351
  "step": 285
352
  },
353
  {
354
  "epoch": 0.29,
355
- "learning_rate": 0.000471,
356
- "loss": 2.0019,
357
  "step": 290
358
  },
359
  {
360
  "epoch": 0.29,
361
- "learning_rate": 0.0004705,
362
- "loss": 2.0421,
363
  "step": 295
364
  },
365
  {
366
  "epoch": 0.3,
367
- "learning_rate": 0.00047,
368
- "loss": 1.9843,
369
  "step": 300
370
  },
371
  {
372
  "epoch": 0.3,
373
- "learning_rate": 0.0004695,
374
- "loss": 1.9604,
375
  "step": 305
376
  },
377
  {
378
  "epoch": 0.31,
379
- "learning_rate": 0.00046899999999999996,
380
- "loss": 1.9696,
381
  "step": 310
382
  },
383
  {
384
  "epoch": 0.32,
385
- "learning_rate": 0.00046850000000000006,
386
- "loss": 1.988,
387
  "step": 315
388
  },
389
  {
390
  "epoch": 0.32,
391
- "learning_rate": 0.00046800000000000005,
392
- "loss": 1.9925,
393
  "step": 320
394
  },
395
  {
396
  "epoch": 0.33,
397
- "learning_rate": 0.00046750000000000003,
398
- "loss": 1.9881,
399
  "step": 325
400
  },
401
  {
402
  "epoch": 0.33,
403
- "learning_rate": 0.000467,
404
- "loss": 1.9817,
405
  "step": 330
406
  },
407
  {
408
  "epoch": 0.34,
409
- "learning_rate": 0.0004665,
410
- "loss": 1.9384,
411
  "step": 335
412
  },
413
  {
414
  "epoch": 0.34,
415
- "learning_rate": 0.00046600000000000005,
416
- "loss": 1.9971,
417
  "step": 340
418
  },
419
  {
420
  "epoch": 0.34,
421
- "learning_rate": 0.00046550000000000004,
422
- "loss": 2.0022,
423
  "step": 345
424
  },
425
  {
426
  "epoch": 0.35,
427
- "learning_rate": 0.000465,
428
- "loss": 1.9593,
429
  "step": 350
430
  },
431
  {
432
  "epoch": 0.35,
433
- "learning_rate": 0.0004645,
434
- "loss": 1.9979,
435
  "step": 355
436
  },
437
  {
438
  "epoch": 0.36,
439
- "learning_rate": 0.00046400000000000006,
440
- "loss": 1.9264,
441
  "step": 360
442
  },
443
  {
444
  "epoch": 0.36,
445
- "learning_rate": 0.00046350000000000004,
446
- "loss": 1.9665,
447
  "step": 365
448
  },
449
  {
450
  "epoch": 0.37,
451
- "learning_rate": 0.00046300000000000003,
452
- "loss": 1.9307,
453
  "step": 370
454
  },
455
  {
456
  "epoch": 0.38,
457
- "learning_rate": 0.0004625,
458
- "loss": 1.9584,
459
  "step": 375
460
  },
461
  {
462
  "epoch": 0.38,
463
- "learning_rate": 0.000462,
464
- "loss": 2.0025,
465
  "step": 380
466
  },
467
  {
468
  "epoch": 0.39,
469
- "learning_rate": 0.00046150000000000005,
470
- "loss": 1.98,
471
  "step": 385
472
  },
473
  {
474
  "epoch": 0.39,
475
- "learning_rate": 0.00046100000000000004,
476
- "loss": 1.9673,
477
  "step": 390
478
  },
479
  {
480
  "epoch": 0.4,
481
- "learning_rate": 0.0004605,
482
- "loss": 1.9598,
483
  "step": 395
484
  },
485
  {
486
  "epoch": 0.4,
487
- "learning_rate": 0.00046,
488
- "loss": 2.0057,
489
  "step": 400
490
  },
491
  {
492
  "epoch": 0.41,
493
- "learning_rate": 0.00045950000000000006,
494
- "loss": 1.9537,
495
  "step": 405
496
  },
497
  {
498
  "epoch": 0.41,
499
- "learning_rate": 0.00045900000000000004,
500
- "loss": 1.9726,
501
  "step": 410
502
  },
503
  {
504
  "epoch": 0.41,
505
- "learning_rate": 0.00045850000000000003,
506
- "loss": 1.9011,
507
  "step": 415
508
  },
509
  {
510
  "epoch": 0.42,
511
- "learning_rate": 0.000458,
512
- "loss": 1.9988,
513
  "step": 420
514
  },
515
  {
516
  "epoch": 0.42,
517
- "learning_rate": 0.0004575,
518
- "loss": 1.9626,
519
  "step": 425
520
  },
521
  {
522
  "epoch": 0.43,
523
- "learning_rate": 0.00045700000000000005,
524
- "loss": 1.9083,
525
  "step": 430
526
  },
527
  {
528
  "epoch": 0.43,
529
- "learning_rate": 0.00045650000000000004,
530
- "loss": 1.9429,
531
  "step": 435
532
  },
533
  {
534
  "epoch": 0.44,
535
- "learning_rate": 0.000456,
536
- "loss": 1.9819,
537
  "step": 440
538
  },
539
  {
540
  "epoch": 0.45,
541
- "learning_rate": 0.0004555,
542
- "loss": 1.9259,
543
  "step": 445
544
  },
545
  {
546
  "epoch": 0.45,
547
- "learning_rate": 0.000455,
548
- "loss": 1.9024,
549
  "step": 450
550
  },
551
  {
552
  "epoch": 0.46,
553
- "learning_rate": 0.00045450000000000004,
554
- "loss": 1.983,
555
  "step": 455
556
  },
557
  {
558
  "epoch": 0.46,
559
- "learning_rate": 0.00045400000000000003,
560
- "loss": 1.9776,
561
  "step": 460
562
  },
563
  {
564
  "epoch": 0.47,
565
- "learning_rate": 0.0004535,
566
- "loss": 1.9816,
567
  "step": 465
568
  },
569
  {
570
  "epoch": 0.47,
571
- "learning_rate": 0.000453,
572
- "loss": 1.9402,
573
  "step": 470
574
  },
575
  {
576
  "epoch": 0.47,
577
- "learning_rate": 0.00045250000000000005,
578
- "loss": 1.9931,
579
  "step": 475
580
  },
581
  {
582
  "epoch": 0.48,
583
- "learning_rate": 0.00045200000000000004,
584
- "loss": 1.9528,
585
  "step": 480
586
  },
587
  {
588
  "epoch": 0.48,
589
- "learning_rate": 0.0004515,
590
- "loss": 1.9392,
591
  "step": 485
592
  },
593
  {
594
  "epoch": 0.49,
595
- "learning_rate": 0.000451,
596
- "loss": 1.9249,
597
  "step": 490
598
  },
599
  {
600
  "epoch": 0.49,
601
- "learning_rate": 0.0004505,
602
- "loss": 1.9711,
603
  "step": 495
604
  },
605
  {
606
  "epoch": 0.5,
607
- "learning_rate": 0.00045000000000000004,
608
- "loss": 1.8939,
609
  "step": 500
610
- },
611
- {
612
- "epoch": 0.51,
613
- "learning_rate": 0.00044950000000000003,
614
- "loss": 1.96,
615
- "step": 505
616
- },
617
- {
618
- "epoch": 0.51,
619
- "learning_rate": 0.000449,
620
- "loss": 1.9014,
621
- "step": 510
622
- },
623
- {
624
- "epoch": 0.52,
625
- "learning_rate": 0.0004485,
626
- "loss": 1.9354,
627
- "step": 515
628
- },
629
- {
630
- "epoch": 0.52,
631
- "learning_rate": 0.000448,
632
- "loss": 1.9597,
633
- "step": 520
634
- },
635
- {
636
- "epoch": 0.53,
637
- "learning_rate": 0.00044750000000000004,
638
- "loss": 1.9367,
639
- "step": 525
640
- },
641
- {
642
- "epoch": 0.53,
643
- "learning_rate": 0.000447,
644
- "loss": 1.9698,
645
- "step": 530
646
- },
647
- {
648
- "epoch": 0.54,
649
- "learning_rate": 0.0004465,
650
- "loss": 1.8743,
651
- "step": 535
652
- },
653
- {
654
- "epoch": 0.54,
655
- "learning_rate": 0.000446,
656
- "loss": 1.9083,
657
- "step": 540
658
- },
659
- {
660
- "epoch": 0.55,
661
- "learning_rate": 0.00044550000000000004,
662
- "loss": 1.9096,
663
- "step": 545
664
- },
665
- {
666
- "epoch": 0.55,
667
- "learning_rate": 0.00044500000000000003,
668
- "loss": 2.0429,
669
- "step": 550
670
- },
671
- {
672
- "epoch": 0.56,
673
- "learning_rate": 0.0004445,
674
- "loss": 1.9972,
675
- "step": 555
676
- },
677
- {
678
- "epoch": 0.56,
679
- "learning_rate": 0.000444,
680
- "loss": 1.8974,
681
- "step": 560
682
- },
683
- {
684
- "epoch": 0.56,
685
- "learning_rate": 0.0004435,
686
- "loss": 1.9467,
687
- "step": 565
688
- },
689
- {
690
- "epoch": 0.57,
691
- "learning_rate": 0.00044300000000000003,
692
- "loss": 1.9382,
693
- "step": 570
694
- },
695
- {
696
- "epoch": 0.57,
697
- "learning_rate": 0.0004425,
698
- "loss": 1.8893,
699
- "step": 575
700
- },
701
- {
702
- "epoch": 0.58,
703
- "learning_rate": 0.000442,
704
- "loss": 1.9392,
705
- "step": 580
706
- },
707
- {
708
- "epoch": 0.58,
709
- "learning_rate": 0.0004415,
710
- "loss": 1.9229,
711
- "step": 585
712
- },
713
- {
714
- "epoch": 0.59,
715
- "learning_rate": 0.000441,
716
- "loss": 1.9042,
717
- "step": 590
718
- },
719
- {
720
- "epoch": 0.59,
721
- "learning_rate": 0.00044050000000000003,
722
- "loss": 1.9104,
723
- "step": 595
724
- },
725
- {
726
- "epoch": 0.6,
727
- "learning_rate": 0.00044,
728
- "loss": 1.9541,
729
- "step": 600
730
- },
731
- {
732
- "epoch": 0.6,
733
- "learning_rate": 0.0004395,
734
- "loss": 1.8948,
735
- "step": 605
736
- },
737
- {
738
- "epoch": 0.61,
739
- "learning_rate": 0.000439,
740
- "loss": 1.8849,
741
- "step": 610
742
- },
743
- {
744
- "epoch": 0.61,
745
- "learning_rate": 0.00043850000000000003,
746
- "loss": 1.9097,
747
- "step": 615
748
- },
749
- {
750
- "epoch": 0.62,
751
- "learning_rate": 0.000438,
752
- "loss": 1.9457,
753
- "step": 620
754
- },
755
- {
756
- "epoch": 0.62,
757
- "learning_rate": 0.0004375,
758
- "loss": 1.9231,
759
- "step": 625
760
- },
761
- {
762
- "epoch": 0.63,
763
- "learning_rate": 0.000437,
764
- "loss": 1.9228,
765
- "step": 630
766
- },
767
- {
768
- "epoch": 0.64,
769
- "learning_rate": 0.0004365,
770
- "loss": 1.9258,
771
- "step": 635
772
- },
773
- {
774
- "epoch": 0.64,
775
- "learning_rate": 0.000436,
776
- "loss": 1.8884,
777
- "step": 640
778
- },
779
- {
780
- "epoch": 0.65,
781
- "learning_rate": 0.0004355,
782
- "loss": 1.8693,
783
- "step": 645
784
- },
785
- {
786
- "epoch": 0.65,
787
- "learning_rate": 0.000435,
788
- "loss": 1.8635,
789
- "step": 650
790
- },
791
- {
792
- "epoch": 0.66,
793
- "learning_rate": 0.0004345,
794
- "loss": 1.9319,
795
- "step": 655
796
- },
797
- {
798
- "epoch": 0.66,
799
- "learning_rate": 0.00043400000000000003,
800
- "loss": 1.9116,
801
- "step": 660
802
- },
803
- {
804
- "epoch": 0.67,
805
- "learning_rate": 0.0004335,
806
- "loss": 1.9299,
807
- "step": 665
808
- },
809
- {
810
- "epoch": 0.67,
811
- "learning_rate": 0.000433,
812
- "loss": 1.8971,
813
- "step": 670
814
- },
815
- {
816
- "epoch": 0.68,
817
- "learning_rate": 0.0004325,
818
- "loss": 1.9317,
819
- "step": 675
820
- },
821
- {
822
- "epoch": 0.68,
823
- "learning_rate": 0.000432,
824
- "loss": 1.9523,
825
- "step": 680
826
- },
827
- {
828
- "epoch": 0.69,
829
- "learning_rate": 0.0004315,
830
- "loss": 1.8683,
831
- "step": 685
832
- },
833
- {
834
- "epoch": 0.69,
835
- "learning_rate": 0.000431,
836
- "loss": 1.9321,
837
- "step": 690
838
- },
839
- {
840
- "epoch": 0.69,
841
- "learning_rate": 0.0004305,
842
- "loss": 1.8971,
843
- "step": 695
844
- },
845
- {
846
- "epoch": 0.7,
847
- "learning_rate": 0.00043,
848
- "loss": 1.8666,
849
- "step": 700
850
- },
851
- {
852
- "epoch": 0.7,
853
- "learning_rate": 0.0004295,
854
- "loss": 1.888,
855
- "step": 705
856
- },
857
- {
858
- "epoch": 0.71,
859
- "learning_rate": 0.000429,
860
- "loss": 1.8946,
861
- "step": 710
862
- },
863
- {
864
- "epoch": 0.71,
865
- "learning_rate": 0.0004285,
866
- "loss": 1.9037,
867
- "step": 715
868
- },
869
- {
870
- "epoch": 0.72,
871
- "learning_rate": 0.000428,
872
- "loss": 1.8533,
873
- "step": 720
874
- },
875
- {
876
- "epoch": 0.72,
877
- "learning_rate": 0.0004275,
878
- "loss": 1.9052,
879
- "step": 725
880
- },
881
- {
882
- "epoch": 0.73,
883
- "learning_rate": 0.000427,
884
- "loss": 1.8677,
885
- "step": 730
886
- },
887
- {
888
- "epoch": 0.73,
889
- "learning_rate": 0.0004265,
890
- "loss": 1.9164,
891
- "step": 735
892
- },
893
- {
894
- "epoch": 0.74,
895
- "learning_rate": 0.000426,
896
- "loss": 1.8823,
897
- "step": 740
898
- },
899
- {
900
- "epoch": 0.74,
901
- "learning_rate": 0.0004255,
902
- "loss": 1.9215,
903
- "step": 745
904
- },
905
- {
906
- "epoch": 0.75,
907
- "learning_rate": 0.000425,
908
- "loss": 1.8903,
909
- "step": 750
910
- },
911
- {
912
- "epoch": 0.76,
913
- "learning_rate": 0.0004245,
914
- "loss": 1.8148,
915
- "step": 755
916
- },
917
- {
918
- "epoch": 0.76,
919
- "learning_rate": 0.000424,
920
- "loss": 1.8324,
921
- "step": 760
922
- },
923
- {
924
- "epoch": 0.77,
925
- "learning_rate": 0.0004235,
926
- "loss": 1.9011,
927
- "step": 765
928
- },
929
- {
930
- "epoch": 0.77,
931
- "learning_rate": 0.000423,
932
- "loss": 1.9424,
933
- "step": 770
934
- },
935
- {
936
- "epoch": 0.78,
937
- "learning_rate": 0.00042249999999999997,
938
- "loss": 1.9028,
939
- "step": 775
940
- },
941
- {
942
- "epoch": 0.78,
943
- "learning_rate": 0.000422,
944
- "loss": 1.8789,
945
- "step": 780
946
- },
947
- {
948
- "epoch": 0.79,
949
- "learning_rate": 0.0004215,
950
- "loss": 1.9016,
951
- "step": 785
952
- },
953
- {
954
- "epoch": 0.79,
955
- "learning_rate": 0.000421,
956
- "loss": 1.8837,
957
- "step": 790
958
- },
959
- {
960
- "epoch": 0.8,
961
- "learning_rate": 0.0004205,
962
- "loss": 1.8651,
963
- "step": 795
964
- },
965
- {
966
- "epoch": 0.8,
967
- "learning_rate": 0.00042,
968
- "loss": 1.8212,
969
- "step": 800
970
- },
971
- {
972
- "epoch": 0.81,
973
- "learning_rate": 0.0004195,
974
- "loss": 1.9073,
975
- "step": 805
976
- },
977
- {
978
- "epoch": 0.81,
979
- "learning_rate": 0.000419,
980
- "loss": 1.9267,
981
- "step": 810
982
- },
983
- {
984
- "epoch": 0.81,
985
- "learning_rate": 0.0004185,
986
- "loss": 1.8311,
987
- "step": 815
988
- },
989
- {
990
- "epoch": 0.82,
991
- "learning_rate": 0.00041799999999999997,
992
- "loss": 1.8537,
993
- "step": 820
994
- },
995
- {
996
- "epoch": 0.82,
997
- "learning_rate": 0.0004175,
998
- "loss": 1.8717,
999
- "step": 825
1000
- },
1001
- {
1002
- "epoch": 0.83,
1003
- "learning_rate": 0.000417,
1004
- "loss": 1.8241,
1005
- "step": 830
1006
- },
1007
- {
1008
- "epoch": 0.83,
1009
- "learning_rate": 0.0004165,
1010
- "loss": 1.8863,
1011
- "step": 835
1012
- },
1013
- {
1014
- "epoch": 0.84,
1015
- "learning_rate": 0.000416,
1016
- "loss": 1.9053,
1017
- "step": 840
1018
- },
1019
- {
1020
- "epoch": 0.84,
1021
- "learning_rate": 0.00041549999999999996,
1022
- "loss": 1.87,
1023
- "step": 845
1024
- },
1025
- {
1026
- "epoch": 0.85,
1027
- "learning_rate": 0.000415,
1028
- "loss": 1.837,
1029
- "step": 850
1030
- },
1031
- {
1032
- "epoch": 0.85,
1033
- "learning_rate": 0.0004145,
1034
- "loss": 1.8941,
1035
- "step": 855
1036
- },
1037
- {
1038
- "epoch": 0.86,
1039
- "learning_rate": 0.000414,
1040
- "loss": 1.9237,
1041
- "step": 860
1042
- },
1043
- {
1044
- "epoch": 0.86,
1045
- "learning_rate": 0.00041349999999999997,
1046
- "loss": 1.8854,
1047
- "step": 865
1048
- },
1049
- {
1050
- "epoch": 0.87,
1051
- "learning_rate": 0.000413,
1052
- "loss": 1.8401,
1053
- "step": 870
1054
- },
1055
- {
1056
- "epoch": 0.88,
1057
- "learning_rate": 0.0004125,
1058
- "loss": 1.8314,
1059
- "step": 875
1060
- },
1061
- {
1062
- "epoch": 0.88,
1063
- "learning_rate": 0.000412,
1064
- "loss": 1.858,
1065
- "step": 880
1066
- },
1067
- {
1068
- "epoch": 0.89,
1069
- "learning_rate": 0.0004115,
1070
- "loss": 1.8806,
1071
- "step": 885
1072
- },
1073
- {
1074
- "epoch": 0.89,
1075
- "learning_rate": 0.00041099999999999996,
1076
- "loss": 1.8244,
1077
- "step": 890
1078
- },
1079
- {
1080
- "epoch": 0.9,
1081
- "learning_rate": 0.0004105,
1082
- "loss": 1.8432,
1083
- "step": 895
1084
- },
1085
- {
1086
- "epoch": 0.9,
1087
- "learning_rate": 0.00041,
1088
- "loss": 1.8596,
1089
- "step": 900
1090
- },
1091
- {
1092
- "epoch": 0.91,
1093
- "learning_rate": 0.0004095,
1094
- "loss": 1.8276,
1095
- "step": 905
1096
- },
1097
- {
1098
- "epoch": 0.91,
1099
- "learning_rate": 0.00040899999999999997,
1100
- "loss": 1.9486,
1101
- "step": 910
1102
- },
1103
- {
1104
- "epoch": 0.92,
1105
- "learning_rate": 0.0004085,
1106
- "loss": 1.8382,
1107
- "step": 915
1108
- },
1109
- {
1110
- "epoch": 0.92,
1111
- "learning_rate": 0.000408,
1112
- "loss": 1.8735,
1113
- "step": 920
1114
- },
1115
- {
1116
- "epoch": 0.93,
1117
- "learning_rate": 0.0004075,
1118
- "loss": 1.8203,
1119
- "step": 925
1120
- },
1121
- {
1122
- "epoch": 0.93,
1123
- "learning_rate": 0.00040699999999999997,
1124
- "loss": 1.8045,
1125
- "step": 930
1126
- },
1127
- {
1128
- "epoch": 0.94,
1129
- "learning_rate": 0.00040649999999999996,
1130
- "loss": 1.8732,
1131
- "step": 935
1132
- },
1133
- {
1134
- "epoch": 0.94,
1135
- "learning_rate": 0.00040600000000000006,
1136
- "loss": 1.8032,
1137
- "step": 940
1138
- },
1139
- {
1140
- "epoch": 0.94,
1141
- "learning_rate": 0.00040550000000000004,
1142
- "loss": 1.8079,
1143
- "step": 945
1144
- },
1145
- {
1146
- "epoch": 0.95,
1147
- "learning_rate": 0.00040500000000000003,
1148
- "loss": 1.8855,
1149
- "step": 950
1150
- },
1151
- {
1152
- "epoch": 0.95,
1153
- "learning_rate": 0.0004045,
1154
- "loss": 1.8511,
1155
- "step": 955
1156
- },
1157
- {
1158
- "epoch": 0.96,
1159
- "learning_rate": 0.000404,
1160
- "loss": 1.8529,
1161
- "step": 960
1162
- },
1163
- {
1164
- "epoch": 0.96,
1165
- "learning_rate": 0.00040350000000000005,
1166
- "loss": 1.8482,
1167
- "step": 965
1168
- },
1169
- {
1170
- "epoch": 0.97,
1171
- "learning_rate": 0.00040300000000000004,
1172
- "loss": 1.8288,
1173
- "step": 970
1174
- },
1175
- {
1176
- "epoch": 0.97,
1177
- "learning_rate": 0.0004025,
1178
- "loss": 1.8781,
1179
- "step": 975
1180
- },
1181
- {
1182
- "epoch": 0.98,
1183
- "learning_rate": 0.000402,
1184
- "loss": 1.8458,
1185
- "step": 980
1186
- },
1187
- {
1188
- "epoch": 0.98,
1189
- "learning_rate": 0.00040150000000000006,
1190
- "loss": 1.8426,
1191
- "step": 985
1192
- },
1193
- {
1194
- "epoch": 0.99,
1195
- "learning_rate": 0.00040100000000000004,
1196
- "loss": 1.8517,
1197
- "step": 990
1198
- },
1199
- {
1200
- "epoch": 0.99,
1201
- "learning_rate": 0.00040050000000000003,
1202
- "loss": 1.8426,
1203
- "step": 995
1204
- },
1205
- {
1206
- "epoch": 1.0,
1207
- "learning_rate": 0.0004,
1208
- "loss": 1.7787,
1209
- "step": 1000
1210
  }
1211
  ],
1212
  "logging_steps": 5,
1213
- "max_steps": 5000,
1214
  "num_input_tokens_seen": 0,
1215
- "num_train_epochs": 5,
1216
  "save_steps": 500,
1217
- "total_flos": 2.65273620205824e+16,
1218
  "train_batch_size": 10,
1219
  "trial_name": null,
1220
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5,
5
  "eval_steps": 500,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "learning_rate": 0.0004975,
14
+ "loss": 1.8757,
15
  "step": 5
16
  },
17
  {
18
  "epoch": 0.01,
19
+ "learning_rate": 0.000495,
20
+ "loss": 1.8758,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.01,
25
+ "learning_rate": 0.0004925,
26
+ "loss": 1.9307,
27
  "step": 15
28
  },
29
  {
30
  "epoch": 0.02,
31
+ "learning_rate": 0.00049,
32
+ "loss": 1.9338,
33
  "step": 20
34
  },
35
  {
36
  "epoch": 0.03,
37
+ "learning_rate": 0.0004875,
38
+ "loss": 1.8599,
39
  "step": 25
40
  },
41
  {
42
  "epoch": 0.03,
43
+ "learning_rate": 0.00048499999999999997,
44
+ "loss": 1.9875,
45
  "step": 30
46
  },
47
  {
48
  "epoch": 0.04,
49
+ "learning_rate": 0.0004825,
50
+ "loss": 1.9947,
51
  "step": 35
52
  },
53
  {
54
  "epoch": 0.04,
55
+ "learning_rate": 0.00048,
56
+ "loss": 1.9015,
57
  "step": 40
58
  },
59
  {
60
  "epoch": 0.04,
61
+ "learning_rate": 0.0004775,
62
+ "loss": 1.8941,
63
  "step": 45
64
  },
65
  {
66
  "epoch": 0.05,
67
+ "learning_rate": 0.000475,
68
+ "loss": 1.8592,
69
  "step": 50
70
  },
71
  {
72
  "epoch": 0.06,
73
+ "learning_rate": 0.0004725,
74
+ "loss": 1.8977,
75
  "step": 55
76
  },
77
  {
78
  "epoch": 0.06,
79
+ "learning_rate": 0.00047,
80
+ "loss": 1.886,
81
  "step": 60
82
  },
83
  {
84
  "epoch": 0.07,
85
+ "learning_rate": 0.00046750000000000003,
86
+ "loss": 1.9486,
87
  "step": 65
88
  },
89
  {
90
  "epoch": 0.07,
91
+ "learning_rate": 0.000465,
92
+ "loss": 1.8669,
93
  "step": 70
94
  },
95
  {
96
  "epoch": 0.07,
97
+ "learning_rate": 0.0004625,
98
+ "loss": 1.936,
99
  "step": 75
100
  },
101
  {
102
  "epoch": 0.08,
103
+ "learning_rate": 0.00046,
104
+ "loss": 1.8385,
105
  "step": 80
106
  },
107
  {
108
  "epoch": 0.09,
109
+ "learning_rate": 0.0004575,
110
+ "loss": 1.8045,
111
  "step": 85
112
  },
113
  {
114
  "epoch": 0.09,
115
+ "learning_rate": 0.000455,
116
+ "loss": 1.9058,
117
  "step": 90
118
  },
119
  {
120
  "epoch": 0.1,
121
+ "learning_rate": 0.00045250000000000005,
122
+ "loss": 1.868,
123
  "step": 95
124
  },
125
  {
126
  "epoch": 0.1,
127
+ "learning_rate": 0.00045000000000000004,
128
+ "loss": 1.8055,
129
  "step": 100
130
  },
131
  {
132
  "epoch": 0.1,
133
+ "learning_rate": 0.00044750000000000004,
134
+ "loss": 1.849,
135
  "step": 105
136
  },
137
  {
138
  "epoch": 0.11,
139
+ "learning_rate": 0.00044500000000000003,
140
+ "loss": 1.869,
141
  "step": 110
142
  },
143
  {
144
  "epoch": 0.12,
145
+ "learning_rate": 0.0004425,
146
+ "loss": 1.8587,
147
  "step": 115
148
  },
149
  {
150
  "epoch": 0.12,
151
+ "learning_rate": 0.00044,
152
+ "loss": 1.9206,
153
  "step": 120
154
  },
155
  {
156
  "epoch": 0.12,
157
+ "learning_rate": 0.0004375,
158
+ "loss": 1.8406,
159
  "step": 125
160
  },
161
  {
162
  "epoch": 0.13,
163
+ "learning_rate": 0.000435,
164
+ "loss": 1.8721,
165
  "step": 130
166
  },
167
  {
168
  "epoch": 0.14,
169
+ "learning_rate": 0.0004325,
170
+ "loss": 1.9409,
171
  "step": 135
172
  },
173
  {
174
  "epoch": 0.14,
175
+ "learning_rate": 0.00043,
176
+ "loss": 1.9222,
177
  "step": 140
178
  },
179
  {
180
  "epoch": 0.14,
181
+ "learning_rate": 0.0004275,
182
+ "loss": 1.8705,
183
  "step": 145
184
  },
185
  {
186
  "epoch": 0.15,
187
+ "learning_rate": 0.000425,
188
+ "loss": 1.9348,
189
  "step": 150
190
  },
191
  {
192
  "epoch": 0.15,
193
+ "learning_rate": 0.00042249999999999997,
194
+ "loss": 1.8167,
195
  "step": 155
196
  },
197
  {
198
  "epoch": 0.16,
199
+ "learning_rate": 0.00042,
200
+ "loss": 1.8904,
201
  "step": 160
202
  },
203
  {
204
  "epoch": 0.17,
205
+ "learning_rate": 0.0004175,
206
+ "loss": 1.8545,
207
  "step": 165
208
  },
209
  {
210
  "epoch": 0.17,
211
+ "learning_rate": 0.000415,
212
+ "loss": 1.8448,
213
  "step": 170
214
  },
215
  {
216
  "epoch": 0.17,
217
+ "learning_rate": 0.0004125,
218
+ "loss": 1.8898,
219
  "step": 175
220
  },
221
  {
222
  "epoch": 0.18,
223
+ "learning_rate": 0.00041,
224
+ "loss": 1.8338,
225
  "step": 180
226
  },
227
  {
228
  "epoch": 0.18,
229
+ "learning_rate": 0.0004075,
230
+ "loss": 1.8246,
231
  "step": 185
232
  },
233
  {
234
  "epoch": 0.19,
235
+ "learning_rate": 0.00040500000000000003,
236
+ "loss": 1.8754,
237
  "step": 190
238
  },
239
  {
240
  "epoch": 0.2,
241
+ "learning_rate": 0.0004025,
242
+ "loss": 1.8603,
243
  "step": 195
244
  },
245
  {
246
  "epoch": 0.2,
247
+ "learning_rate": 0.0004,
248
+ "loss": 1.799,
249
  "step": 200
250
  },
251
  {
252
  "epoch": 0.2,
253
+ "learning_rate": 0.0003975,
254
+ "loss": 1.8652,
255
  "step": 205
256
  },
257
  {
258
  "epoch": 0.21,
259
+ "learning_rate": 0.000395,
260
+ "loss": 1.8406,
261
  "step": 210
262
  },
263
  {
264
  "epoch": 0.21,
265
+ "learning_rate": 0.0003925,
266
+ "loss": 1.8341,
267
  "step": 215
268
  },
269
  {
270
  "epoch": 0.22,
271
+ "learning_rate": 0.00039000000000000005,
272
+ "loss": 1.9399,
273
  "step": 220
274
  },
275
  {
276
  "epoch": 0.23,
277
+ "learning_rate": 0.00038750000000000004,
278
+ "loss": 1.8095,
279
  "step": 225
280
  },
281
  {
282
  "epoch": 0.23,
283
+ "learning_rate": 0.00038500000000000003,
284
+ "loss": 1.8286,
285
  "step": 230
286
  },
287
  {
288
  "epoch": 0.23,
289
+ "learning_rate": 0.00038250000000000003,
290
+ "loss": 1.8846,
291
  "step": 235
292
  },
293
  {
294
  "epoch": 0.24,
295
+ "learning_rate": 0.00038,
296
+ "loss": 1.8101,
297
  "step": 240
298
  },
299
  {
300
  "epoch": 0.24,
301
+ "learning_rate": 0.0003775,
302
+ "loss": 1.8791,
303
  "step": 245
304
  },
305
  {
306
  "epoch": 0.25,
307
+ "learning_rate": 0.000375,
308
+ "loss": 1.8181,
309
  "step": 250
310
  },
311
  {
312
  "epoch": 0.26,
313
+ "learning_rate": 0.0003725,
314
+ "loss": 1.8555,
315
  "step": 255
316
  },
317
  {
318
  "epoch": 0.26,
319
+ "learning_rate": 0.00037,
320
+ "loss": 1.8328,
321
  "step": 260
322
  },
323
  {
324
  "epoch": 0.27,
325
+ "learning_rate": 0.0003675,
326
+ "loss": 1.814,
327
  "step": 265
328
  },
329
  {
330
  "epoch": 0.27,
331
+ "learning_rate": 0.000365,
332
+ "loss": 1.8647,
333
  "step": 270
334
  },
335
  {
336
  "epoch": 0.28,
337
+ "learning_rate": 0.0003625,
338
+ "loss": 1.8754,
339
  "step": 275
340
  },
341
  {
342
  "epoch": 0.28,
343
+ "learning_rate": 0.00035999999999999997,
344
+ "loss": 1.8184,
345
  "step": 280
346
  },
347
  {
348
  "epoch": 0.28,
349
+ "learning_rate": 0.0003575,
350
+ "loss": 1.8879,
351
  "step": 285
352
  },
353
  {
354
  "epoch": 0.29,
355
+ "learning_rate": 0.000355,
356
+ "loss": 1.8329,
357
  "step": 290
358
  },
359
  {
360
  "epoch": 0.29,
361
+ "learning_rate": 0.0003525,
362
+ "loss": 1.7787,
363
  "step": 295
364
  },
365
  {
366
  "epoch": 0.3,
367
+ "learning_rate": 0.00035,
368
+ "loss": 1.7543,
369
  "step": 300
370
  },
371
  {
372
  "epoch": 0.3,
373
+ "learning_rate": 0.0003475,
374
+ "loss": 1.7782,
375
  "step": 305
376
  },
377
  {
378
  "epoch": 0.31,
379
+ "learning_rate": 0.000345,
380
+ "loss": 1.8857,
381
  "step": 310
382
  },
383
  {
384
  "epoch": 0.32,
385
+ "learning_rate": 0.00034250000000000003,
386
+ "loss": 1.7608,
387
  "step": 315
388
  },
389
  {
390
  "epoch": 0.32,
391
+ "learning_rate": 0.00034,
392
+ "loss": 1.8622,
393
  "step": 320
394
  },
395
  {
396
  "epoch": 0.33,
397
+ "learning_rate": 0.0003375,
398
+ "loss": 1.7055,
399
  "step": 325
400
  },
401
  {
402
  "epoch": 0.33,
403
+ "learning_rate": 0.000335,
404
+ "loss": 1.7356,
405
  "step": 330
406
  },
407
  {
408
  "epoch": 0.34,
409
+ "learning_rate": 0.0003325,
410
+ "loss": 1.8353,
411
  "step": 335
412
  },
413
  {
414
  "epoch": 0.34,
415
+ "learning_rate": 0.00033,
416
+ "loss": 1.7389,
417
  "step": 340
418
  },
419
  {
420
  "epoch": 0.34,
421
+ "learning_rate": 0.00032750000000000005,
422
+ "loss": 1.8115,
423
  "step": 345
424
  },
425
  {
426
  "epoch": 0.35,
427
+ "learning_rate": 0.00032500000000000004,
428
+ "loss": 1.7303,
429
  "step": 350
430
  },
431
  {
432
  "epoch": 0.35,
433
+ "learning_rate": 0.00032250000000000003,
434
+ "loss": 1.7603,
435
  "step": 355
436
  },
437
  {
438
  "epoch": 0.36,
439
+ "learning_rate": 0.00032,
440
+ "loss": 1.7925,
441
  "step": 360
442
  },
443
  {
444
  "epoch": 0.36,
445
+ "learning_rate": 0.0003175,
446
+ "loss": 1.806,
447
  "step": 365
448
  },
449
  {
450
  "epoch": 0.37,
451
+ "learning_rate": 0.000315,
452
+ "loss": 1.8047,
453
  "step": 370
454
  },
455
  {
456
  "epoch": 0.38,
457
+ "learning_rate": 0.0003125,
458
+ "loss": 1.7939,
459
  "step": 375
460
  },
461
  {
462
  "epoch": 0.38,
463
+ "learning_rate": 0.00031,
464
+ "loss": 1.7539,
465
  "step": 380
466
  },
467
  {
468
  "epoch": 0.39,
469
+ "learning_rate": 0.0003075,
470
+ "loss": 1.7817,
471
  "step": 385
472
  },
473
  {
474
  "epoch": 0.39,
475
+ "learning_rate": 0.000305,
476
+ "loss": 1.7652,
477
  "step": 390
478
  },
479
  {
480
  "epoch": 0.4,
481
+ "learning_rate": 0.0003025,
482
+ "loss": 1.757,
483
  "step": 395
484
  },
485
  {
486
  "epoch": 0.4,
487
+ "learning_rate": 0.0003,
488
+ "loss": 1.7845,
489
  "step": 400
490
  },
491
  {
492
  "epoch": 0.41,
493
+ "learning_rate": 0.00029749999999999997,
494
+ "loss": 1.7701,
495
  "step": 405
496
  },
497
  {
498
  "epoch": 0.41,
499
+ "learning_rate": 0.000295,
500
+ "loss": 1.7759,
501
  "step": 410
502
  },
503
  {
504
  "epoch": 0.41,
505
+ "learning_rate": 0.0002925,
506
+ "loss": 1.697,
507
  "step": 415
508
  },
509
  {
510
  "epoch": 0.42,
511
+ "learning_rate": 0.00029,
512
+ "loss": 1.7623,
513
  "step": 420
514
  },
515
  {
516
  "epoch": 0.42,
517
+ "learning_rate": 0.0002875,
518
+ "loss": 1.7926,
519
  "step": 425
520
  },
521
  {
522
  "epoch": 0.43,
523
+ "learning_rate": 0.000285,
524
+ "loss": 1.8367,
525
  "step": 430
526
  },
527
  {
528
  "epoch": 0.43,
529
+ "learning_rate": 0.0002825,
530
+ "loss": 1.764,
531
  "step": 435
532
  },
533
  {
534
  "epoch": 0.44,
535
+ "learning_rate": 0.00028000000000000003,
536
+ "loss": 1.7322,
537
  "step": 440
538
  },
539
  {
540
  "epoch": 0.45,
541
+ "learning_rate": 0.0002775,
542
+ "loss": 1.7723,
543
  "step": 445
544
  },
545
  {
546
  "epoch": 0.45,
547
+ "learning_rate": 0.000275,
548
+ "loss": 1.7971,
549
  "step": 450
550
  },
551
  {
552
  "epoch": 0.46,
553
+ "learning_rate": 0.0002725,
554
+ "loss": 1.7938,
555
  "step": 455
556
  },
557
  {
558
  "epoch": 0.46,
559
+ "learning_rate": 0.00027,
560
+ "loss": 1.8143,
561
  "step": 460
562
  },
563
  {
564
  "epoch": 0.47,
565
+ "learning_rate": 0.0002675,
566
+ "loss": 1.735,
567
  "step": 465
568
  },
569
  {
570
  "epoch": 0.47,
571
+ "learning_rate": 0.00026500000000000004,
572
+ "loss": 1.7571,
573
  "step": 470
574
  },
575
  {
576
  "epoch": 0.47,
577
+ "learning_rate": 0.00026250000000000004,
578
+ "loss": 1.7636,
579
  "step": 475
580
  },
581
  {
582
  "epoch": 0.48,
583
+ "learning_rate": 0.00026000000000000003,
584
+ "loss": 1.7344,
585
  "step": 480
586
  },
587
  {
588
  "epoch": 0.48,
589
+ "learning_rate": 0.0002575,
590
+ "loss": 1.7156,
591
  "step": 485
592
  },
593
  {
594
  "epoch": 0.49,
595
+ "learning_rate": 0.000255,
596
+ "loss": 1.6996,
597
  "step": 490
598
  },
599
  {
600
  "epoch": 0.49,
601
+ "learning_rate": 0.0002525,
602
+ "loss": 1.7917,
603
  "step": 495
604
  },
605
  {
606
  "epoch": 0.5,
607
+ "learning_rate": 0.00025,
608
+ "loss": 1.7578,
609
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  }
611
  ],
612
  "logging_steps": 5,
613
+ "max_steps": 1000,
614
  "num_input_tokens_seen": 0,
615
+ "num_train_epochs": 1,
616
  "save_steps": 500,
617
+ "total_flos": 1.31426122150656e+16,
618
  "train_batch_size": 10,
619
  "trial_name": null,
620
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69db0ed0280f7b72e766ea62e28c3f9ca85173e00962b7b7d10e2f20229f6d83
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85272a8d5fb6a062784ff0b0596d3e93eb82123aa8a88168a5a2c485a228cfef
3
  size 4728