kiddothe2b commited on
Commit
cb30f37
1 Parent(s): 830e0fb

Training in progress, step 12800

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5d52832157d0f5b80b6a377d00b6a204e38b38fba5786f88358c0819d171ee4
3
- size 6318359
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c8c311bed380f6c5231042dd3172757e53d32d3926ff696064a6f7e652b2260
3
+ size 745634697
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7c7e586c98c80af7b8b14023b28831609a0aa6b2bd8e695f6d4f000731d7e55
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01755e0768402bffb76f967e33df76e23d5b263c52bad4b9110c9a221b45c611
3
  size 372832803
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff7dc9aba89d2b981ce2c9aa897aebeb0dab9301700e5fac047b6fa6ef1a780f
3
  size 15523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c6f9685929a5db844ce472a185dad9d0c6482918c842f5a9b7670626b6da045
3
  size 15523
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:841a158b0d212253f125ebf1f87bda4797e00292f1d39571b4724f0ab5ed90ad
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a63c18679f872f561021a84d9bfcd3fad0c807bcef87d1a807b9818f9895c1f
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.1,
5
- "global_step": 6400,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -399,11 +399,404 @@
399
  "eval_samples_per_second": 43.51,
400
  "eval_steps_per_second": 2.719,
401
  "step": 6400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  }
403
  ],
404
  "max_steps": 64000,
405
  "num_train_epochs": 9223372036854775807,
406
- "total_flos": 3.38491764375552e+16,
407
  "trial_name": null,
408
  "trial_params": null
409
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2,
5
+ "global_step": 12800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
399
  "eval_samples_per_second": 43.51,
400
  "eval_steps_per_second": 2.719,
401
  "step": 6400
402
+ },
403
+ {
404
+ "epoch": 0.1,
405
+ "learning_rate": 0.001,
406
+ "loss": 8.359,
407
+ "step": 6500
408
+ },
409
+ {
410
+ "epoch": 0.1,
411
+ "learning_rate": 0.001,
412
+ "loss": 8.1564,
413
+ "step": 6600
414
+ },
415
+ {
416
+ "epoch": 0.1,
417
+ "learning_rate": 0.001,
418
+ "loss": 8.1469,
419
+ "step": 6700
420
+ },
421
+ {
422
+ "epoch": 0.11,
423
+ "learning_rate": 0.001,
424
+ "loss": 8.1211,
425
+ "step": 6800
426
+ },
427
+ {
428
+ "epoch": 0.11,
429
+ "learning_rate": 0.001,
430
+ "loss": 8.0988,
431
+ "step": 6900
432
+ },
433
+ {
434
+ "epoch": 0.11,
435
+ "learning_rate": 0.001,
436
+ "loss": 8.0913,
437
+ "step": 7000
438
+ },
439
+ {
440
+ "epoch": 0.11,
441
+ "learning_rate": 0.001,
442
+ "loss": 8.0833,
443
+ "step": 7100
444
+ },
445
+ {
446
+ "epoch": 0.11,
447
+ "learning_rate": 0.001,
448
+ "loss": 8.0939,
449
+ "step": 7200
450
+ },
451
+ {
452
+ "epoch": 0.11,
453
+ "learning_rate": 0.001,
454
+ "loss": 8.4426,
455
+ "step": 7300
456
+ },
457
+ {
458
+ "epoch": 0.12,
459
+ "learning_rate": 0.001,
460
+ "loss": 8.3,
461
+ "step": 7400
462
+ },
463
+ {
464
+ "epoch": 0.12,
465
+ "learning_rate": 0.001,
466
+ "loss": 8.1645,
467
+ "step": 7500
468
+ },
469
+ {
470
+ "epoch": 0.12,
471
+ "learning_rate": 0.001,
472
+ "loss": 8.108,
473
+ "step": 7600
474
+ },
475
+ {
476
+ "epoch": 0.12,
477
+ "learning_rate": 0.001,
478
+ "loss": 8.0965,
479
+ "step": 7700
480
+ },
481
+ {
482
+ "epoch": 0.12,
483
+ "learning_rate": 0.001,
484
+ "loss": 8.0708,
485
+ "step": 7800
486
+ },
487
+ {
488
+ "epoch": 0.12,
489
+ "learning_rate": 0.001,
490
+ "loss": 8.0746,
491
+ "step": 7900
492
+ },
493
+ {
494
+ "epoch": 0.12,
495
+ "learning_rate": 0.001,
496
+ "loss": 8.0687,
497
+ "step": 8000
498
+ },
499
+ {
500
+ "epoch": 0.13,
501
+ "learning_rate": 0.001,
502
+ "loss": 8.0715,
503
+ "step": 8100
504
+ },
505
+ {
506
+ "epoch": 0.13,
507
+ "learning_rate": 0.001,
508
+ "loss": 8.0598,
509
+ "step": 8200
510
+ },
511
+ {
512
+ "epoch": 0.13,
513
+ "learning_rate": 0.001,
514
+ "loss": 8.053,
515
+ "step": 8300
516
+ },
517
+ {
518
+ "epoch": 0.13,
519
+ "learning_rate": 0.001,
520
+ "loss": 8.0404,
521
+ "step": 8400
522
+ },
523
+ {
524
+ "epoch": 0.13,
525
+ "learning_rate": 0.001,
526
+ "loss": 8.0718,
527
+ "step": 8500
528
+ },
529
+ {
530
+ "epoch": 0.13,
531
+ "learning_rate": 0.001,
532
+ "loss": 8.0426,
533
+ "step": 8600
534
+ },
535
+ {
536
+ "epoch": 0.14,
537
+ "learning_rate": 0.001,
538
+ "loss": 8.032,
539
+ "step": 8700
540
+ },
541
+ {
542
+ "epoch": 0.14,
543
+ "learning_rate": 0.001,
544
+ "loss": 8.0338,
545
+ "step": 8800
546
+ },
547
+ {
548
+ "epoch": 0.14,
549
+ "learning_rate": 0.001,
550
+ "loss": 8.0468,
551
+ "step": 8900
552
+ },
553
+ {
554
+ "epoch": 0.14,
555
+ "learning_rate": 0.001,
556
+ "loss": 8.0502,
557
+ "step": 9000
558
+ },
559
+ {
560
+ "epoch": 0.14,
561
+ "learning_rate": 0.001,
562
+ "loss": 8.0691,
563
+ "step": 9100
564
+ },
565
+ {
566
+ "epoch": 0.14,
567
+ "learning_rate": 0.001,
568
+ "loss": 8.0611,
569
+ "step": 9200
570
+ },
571
+ {
572
+ "epoch": 0.15,
573
+ "learning_rate": 0.001,
574
+ "loss": 8.0346,
575
+ "step": 9300
576
+ },
577
+ {
578
+ "epoch": 0.15,
579
+ "learning_rate": 0.001,
580
+ "loss": 8.0438,
581
+ "step": 9400
582
+ },
583
+ {
584
+ "epoch": 0.15,
585
+ "learning_rate": 0.001,
586
+ "loss": 8.0291,
587
+ "step": 9500
588
+ },
589
+ {
590
+ "epoch": 0.15,
591
+ "learning_rate": 0.001,
592
+ "loss": 8.0502,
593
+ "step": 9600
594
+ },
595
+ {
596
+ "epoch": 0.15,
597
+ "learning_rate": 0.001,
598
+ "loss": 8.05,
599
+ "step": 9700
600
+ },
601
+ {
602
+ "epoch": 0.15,
603
+ "learning_rate": 0.001,
604
+ "loss": 8.0522,
605
+ "step": 9800
606
+ },
607
+ {
608
+ "epoch": 0.15,
609
+ "learning_rate": 0.001,
610
+ "loss": 8.0441,
611
+ "step": 9900
612
+ },
613
+ {
614
+ "epoch": 0.16,
615
+ "learning_rate": 0.001,
616
+ "loss": 8.0455,
617
+ "step": 10000
618
+ },
619
+ {
620
+ "epoch": 0.16,
621
+ "learning_rate": 0.001,
622
+ "loss": 8.0476,
623
+ "step": 10100
624
+ },
625
+ {
626
+ "epoch": 0.16,
627
+ "learning_rate": 0.001,
628
+ "loss": 8.03,
629
+ "step": 10200
630
+ },
631
+ {
632
+ "epoch": 0.16,
633
+ "learning_rate": 0.001,
634
+ "loss": 8.0411,
635
+ "step": 10300
636
+ },
637
+ {
638
+ "epoch": 0.16,
639
+ "learning_rate": 0.001,
640
+ "loss": 8.0654,
641
+ "step": 10400
642
+ },
643
+ {
644
+ "epoch": 0.16,
645
+ "learning_rate": 0.001,
646
+ "loss": 8.0569,
647
+ "step": 10500
648
+ },
649
+ {
650
+ "epoch": 0.17,
651
+ "learning_rate": 0.001,
652
+ "loss": 8.0407,
653
+ "step": 10600
654
+ },
655
+ {
656
+ "epoch": 0.17,
657
+ "learning_rate": 0.001,
658
+ "loss": 8.0456,
659
+ "step": 10700
660
+ },
661
+ {
662
+ "epoch": 0.17,
663
+ "learning_rate": 0.001,
664
+ "loss": 8.0497,
665
+ "step": 10800
666
+ },
667
+ {
668
+ "epoch": 0.17,
669
+ "learning_rate": 0.001,
670
+ "loss": 8.0309,
671
+ "step": 10900
672
+ },
673
+ {
674
+ "epoch": 0.17,
675
+ "learning_rate": 0.001,
676
+ "loss": 8.0575,
677
+ "step": 11000
678
+ },
679
+ {
680
+ "epoch": 0.17,
681
+ "learning_rate": 0.001,
682
+ "loss": 8.0367,
683
+ "step": 11100
684
+ },
685
+ {
686
+ "epoch": 0.17,
687
+ "learning_rate": 0.001,
688
+ "loss": 8.0366,
689
+ "step": 11200
690
+ },
691
+ {
692
+ "epoch": 0.18,
693
+ "learning_rate": 0.001,
694
+ "loss": 8.0422,
695
+ "step": 11300
696
+ },
697
+ {
698
+ "epoch": 0.18,
699
+ "learning_rate": 0.001,
700
+ "loss": 8.0385,
701
+ "step": 11400
702
+ },
703
+ {
704
+ "epoch": 0.18,
705
+ "learning_rate": 0.001,
706
+ "loss": 8.0415,
707
+ "step": 11500
708
+ },
709
+ {
710
+ "epoch": 0.18,
711
+ "learning_rate": 0.001,
712
+ "loss": 8.0335,
713
+ "step": 11600
714
+ },
715
+ {
716
+ "epoch": 0.18,
717
+ "learning_rate": 0.001,
718
+ "loss": 8.0523,
719
+ "step": 11700
720
+ },
721
+ {
722
+ "epoch": 0.18,
723
+ "learning_rate": 0.001,
724
+ "loss": 8.0565,
725
+ "step": 11800
726
+ },
727
+ {
728
+ "epoch": 0.19,
729
+ "learning_rate": 0.001,
730
+ "loss": 8.0442,
731
+ "step": 11900
732
+ },
733
+ {
734
+ "epoch": 0.19,
735
+ "learning_rate": 0.001,
736
+ "loss": 8.0478,
737
+ "step": 12000
738
+ },
739
+ {
740
+ "epoch": 0.19,
741
+ "learning_rate": 0.001,
742
+ "loss": 8.0319,
743
+ "step": 12100
744
+ },
745
+ {
746
+ "epoch": 0.19,
747
+ "learning_rate": 0.001,
748
+ "loss": 8.0463,
749
+ "step": 12200
750
+ },
751
+ {
752
+ "epoch": 0.19,
753
+ "learning_rate": 0.001,
754
+ "loss": 8.0549,
755
+ "step": 12300
756
+ },
757
+ {
758
+ "epoch": 0.19,
759
+ "learning_rate": 0.001,
760
+ "loss": 8.0676,
761
+ "step": 12400
762
+ },
763
+ {
764
+ "epoch": 0.2,
765
+ "learning_rate": 0.001,
766
+ "loss": 8.0362,
767
+ "step": 12500
768
+ },
769
+ {
770
+ "epoch": 0.2,
771
+ "learning_rate": 0.001,
772
+ "loss": 8.0437,
773
+ "step": 12600
774
+ },
775
+ {
776
+ "epoch": 0.2,
777
+ "learning_rate": 0.001,
778
+ "loss": 8.0502,
779
+ "step": 12700
780
+ },
781
+ {
782
+ "epoch": 0.2,
783
+ "learning_rate": 0.001,
784
+ "loss": 8.046,
785
+ "step": 12800
786
+ },
787
+ {
788
+ "epoch": 0.2,
789
+ "eval_accuracy": 0.032781328733133316,
790
+ "eval_loss": 8.042789459228516,
791
+ "eval_runtime": 7220.6269,
792
+ "eval_samples_per_second": 45.406,
793
+ "eval_steps_per_second": 2.838,
794
+ "step": 12800
795
  }
796
  ],
797
  "max_steps": 64000,
798
  "num_train_epochs": 9223372036854775807,
799
+ "total_flos": 6.76983528751104e+16,
800
  "trial_name": null,
801
  "trial_params": null
802
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7c7e586c98c80af7b8b14023b28831609a0aa6b2bd8e695f6d4f000731d7e55
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01755e0768402bffb76f967e33df76e23d5b263c52bad4b9110c9a221b45c611
3
  size 372832803