Bingsu commited on
Commit
83847fb
1 Parent(s): 35deea0

Training in progress, step 30000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efef12e6736ac05b05123978b5a7ba02086375a879e1e08c05db35ff70c647a0
3
  size 100170757
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ef929787429c88c576570a6fdf928468aedd3cf5c685150c4d00c23ed2574b5
3
  size 100170757
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:669ff7fd28968817843d8d3e735a9f1604e6f86bd0620d14ba500c796ee6cb84
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b649041f4e942a1c71886ca3f83f11fc846bb4752950f41b84cd207e3cc556c
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17ed50c4af1d37fb0d41b85169a6f1f89705f404faa32f3817c74e84cd5180c1
3
  size 14439
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e521fc0b39ea1a8ce6e0a681bc8a834da87e83fd7cd3754aacc6c4b558e0937
3
  size 14439
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60920ec13686e98f9f0d129e472adaac1417d4bc756e4485725a87068a11e2f1
3
  size 246897640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:677543783c6139b35f47c70ebab6ad516807591d9ad24cec274ace85b8305217
3
  size 246897640
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.08594757198109153,
5
- "global_step": 20000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -606,11 +606,311 @@
606
  "learning_rate": 0.0008149872553192515,
607
  "loss": 4.2065,
608
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
  }
610
  ],
611
  "max_steps": 500000,
612
  "num_train_epochs": 3,
613
- "total_flos": 3.187659964416e+16,
614
  "trial_name": null,
615
  "trial_params": null
616
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.1289213579716373,
5
+ "global_step": 30000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
606
  "learning_rate": 0.0008149872553192515,
607
  "loss": 4.2065,
608
  "step": 20000
609
+ },
610
+ {
611
+ "epoch": 0.09,
612
+ "learning_rate": 0.0008232037327492777,
613
+ "loss": 4.1773,
614
+ "step": 20200
615
+ },
616
+ {
617
+ "epoch": 0.09,
618
+ "learning_rate": 0.0008314970061979818,
619
+ "loss": 4.1904,
620
+ "step": 20400
621
+ },
622
+ {
623
+ "epoch": 0.09,
624
+ "learning_rate": 0.0008398669301501703,
625
+ "loss": 4.1868,
626
+ "step": 20600
627
+ },
628
+ {
629
+ "epoch": 0.09,
630
+ "learning_rate": 0.0008483133577457148,
631
+ "loss": 4.2006,
632
+ "step": 20800
633
+ },
634
+ {
635
+ "epoch": 0.09,
636
+ "learning_rate": 0.0008568361407821495,
637
+ "loss": 4.1467,
638
+ "step": 21000
639
+ },
640
+ {
641
+ "epoch": 0.09,
642
+ "learning_rate": 0.0008654351297172607,
643
+ "loss": 4.1585,
644
+ "step": 21200
645
+ },
646
+ {
647
+ "epoch": 0.09,
648
+ "learning_rate": 0.0008741101736717116,
649
+ "loss": 4.1547,
650
+ "step": 21400
651
+ },
652
+ {
653
+ "epoch": 0.09,
654
+ "learning_rate": 0.0008828611204316911,
655
+ "loss": 4.1557,
656
+ "step": 21600
657
+ },
658
+ {
659
+ "epoch": 0.09,
660
+ "learning_rate": 0.0008916878164515838,
661
+ "loss": 4.1496,
662
+ "step": 21800
663
+ },
664
+ {
665
+ "epoch": 0.09,
666
+ "learning_rate": 0.0009005901068566691,
667
+ "loss": 4.1434,
668
+ "step": 22000
669
+ },
670
+ {
671
+ "epoch": 0.1,
672
+ "learning_rate": 0.0009095678354458306,
673
+ "loss": 4.1173,
674
+ "step": 22200
675
+ },
676
+ {
677
+ "epoch": 0.1,
678
+ "learning_rate": 0.0009186208446943008,
679
+ "loss": 4.1364,
680
+ "step": 22400
681
+ },
682
+ {
683
+ "epoch": 0.1,
684
+ "learning_rate": 0.0009277489757564244,
685
+ "loss": 4.1445,
686
+ "step": 22600
687
+ },
688
+ {
689
+ "epoch": 0.1,
690
+ "learning_rate": 0.0009369520684684475,
691
+ "loss": 4.1156,
692
+ "step": 22800
693
+ },
694
+ {
695
+ "epoch": 0.1,
696
+ "learning_rate": 0.0009462299613513248,
697
+ "loss": 4.1033,
698
+ "step": 23000
699
+ },
700
+ {
701
+ "epoch": 0.1,
702
+ "learning_rate": 0.0009555824916135536,
703
+ "loss": 4.1187,
704
+ "step": 23200
705
+ },
706
+ {
707
+ "epoch": 0.1,
708
+ "learning_rate": 0.0009650094951540386,
709
+ "loss": 4.0823,
710
+ "step": 23400
711
+ },
712
+ {
713
+ "epoch": 0.1,
714
+ "learning_rate": 0.0009745108065649499,
715
+ "loss": 4.0624,
716
+ "step": 23600
717
+ },
718
+ {
719
+ "epoch": 0.1,
720
+ "learning_rate": 0.0009840862591346498,
721
+ "loss": 4.0845,
722
+ "step": 23800
723
+ },
724
+ {
725
+ "epoch": 0.1,
726
+ "learning_rate": 0.0009937356848506058,
727
+ "loss": 4.0483,
728
+ "step": 24000
729
+ },
730
+ {
731
+ "epoch": 0.1,
732
+ "learning_rate": 0.001003458914402332,
733
+ "loss": 4.0512,
734
+ "step": 24200
735
+ },
736
+ {
737
+ "epoch": 0.1,
738
+ "learning_rate": 0.0010132557771843787,
739
+ "loss": 4.0606,
740
+ "step": 24400
741
+ },
742
+ {
743
+ "epoch": 0.11,
744
+ "learning_rate": 0.0010231261012993067,
745
+ "loss": 4.046,
746
+ "step": 24600
747
+ },
748
+ {
749
+ "epoch": 0.11,
750
+ "learning_rate": 0.0010330697135607168,
751
+ "loss": 4.0315,
752
+ "step": 24800
753
+ },
754
+ {
755
+ "epoch": 0.11,
756
+ "learning_rate": 0.00104308643949628,
757
+ "loss": 4.0179,
758
+ "step": 25000
759
+ },
760
+ {
761
+ "epoch": 0.11,
762
+ "learning_rate": 0.001053176103350803,
763
+ "loss": 4.0351,
764
+ "step": 25200
765
+ },
766
+ {
767
+ "epoch": 0.11,
768
+ "learning_rate": 0.0010633385280893123,
769
+ "loss": 4.02,
770
+ "step": 25400
771
+ },
772
+ {
773
+ "epoch": 0.11,
774
+ "learning_rate": 0.0010735735354001595,
775
+ "loss": 4.0201,
776
+ "step": 25600
777
+ },
778
+ {
779
+ "epoch": 0.11,
780
+ "learning_rate": 0.0010838809456981471,
781
+ "loss": 4.0044,
782
+ "step": 25800
783
+ },
784
+ {
785
+ "epoch": 0.11,
786
+ "learning_rate": 0.001094260578127686,
787
+ "loss": 3.9914,
788
+ "step": 26000
789
+ },
790
+ {
791
+ "epoch": 0.11,
792
+ "learning_rate": 0.0011047122505659646,
793
+ "loss": 3.9991,
794
+ "step": 26200
795
+ },
796
+ {
797
+ "epoch": 0.11,
798
+ "learning_rate": 0.0011152357796261423,
799
+ "loss": 4.0109,
800
+ "step": 26400
801
+ },
802
+ {
803
+ "epoch": 0.11,
804
+ "learning_rate": 0.0011258309806605731,
805
+ "loss": 4.0405,
806
+ "step": 26600
807
+ },
808
+ {
809
+ "epoch": 0.12,
810
+ "learning_rate": 0.0011364976677640387,
811
+ "loss": 4.0349,
812
+ "step": 26800
813
+ },
814
+ {
815
+ "epoch": 0.12,
816
+ "learning_rate": 0.0011472356537770186,
817
+ "loss": 4.0312,
818
+ "step": 27000
819
+ },
820
+ {
821
+ "epoch": 0.12,
822
+ "learning_rate": 0.0011580447502889633,
823
+ "loss": 4.0185,
824
+ "step": 27200
825
+ },
826
+ {
827
+ "epoch": 0.12,
828
+ "learning_rate": 0.0011689247676416152,
829
+ "loss": 4.011,
830
+ "step": 27400
831
+ },
832
+ {
833
+ "epoch": 0.12,
834
+ "learning_rate": 0.0011798755149323176,
835
+ "loss": 3.9898,
836
+ "step": 27600
837
+ },
838
+ {
839
+ "epoch": 0.12,
840
+ "learning_rate": 0.001190896800017379,
841
+ "loss": 3.981,
842
+ "step": 27800
843
+ },
844
+ {
845
+ "epoch": 0.12,
846
+ "learning_rate": 0.0012019884295154416,
847
+ "loss": 3.949,
848
+ "step": 28000
849
+ },
850
+ {
851
+ "epoch": 0.12,
852
+ "learning_rate": 0.0012131502088108658,
853
+ "loss": 3.9896,
854
+ "step": 28200
855
+ },
856
+ {
857
+ "epoch": 0.12,
858
+ "learning_rate": 0.0012243819420571598,
859
+ "loss": 3.9951,
860
+ "step": 28400
861
+ },
862
+ {
863
+ "epoch": 0.12,
864
+ "learning_rate": 0.0012356834321804039,
865
+ "loss": 4.0361,
866
+ "step": 28600
867
+ },
868
+ {
869
+ "epoch": 0.12,
870
+ "learning_rate": 0.0012470544808827113,
871
+ "loss": 4.1212,
872
+ "step": 28800
873
+ },
874
+ {
875
+ "epoch": 0.12,
876
+ "learning_rate": 0.001258494888645708,
877
+ "loss": 4.0721,
878
+ "step": 29000
879
+ },
880
+ {
881
+ "epoch": 0.13,
882
+ "learning_rate": 0.0012700044547340368,
883
+ "loss": 4.0311,
884
+ "step": 29200
885
+ },
886
+ {
887
+ "epoch": 0.13,
888
+ "learning_rate": 0.0012815829771988738,
889
+ "loss": 4.0114,
890
+ "step": 29400
891
+ },
892
+ {
893
+ "epoch": 0.13,
894
+ "learning_rate": 0.001293230252881479,
895
+ "loss": 3.9868,
896
+ "step": 29600
897
+ },
898
+ {
899
+ "epoch": 0.13,
900
+ "learning_rate": 0.0013049460774167514,
901
+ "loss": 3.9881,
902
+ "step": 29800
903
+ },
904
+ {
905
+ "epoch": 0.13,
906
+ "learning_rate": 0.0013167302452368242,
907
+ "loss": 3.9705,
908
+ "step": 30000
909
  }
910
  ],
911
  "max_steps": 500000,
912
  "num_train_epochs": 3,
913
+ "total_flos": 4.781489946624e+16,
914
  "trial_name": null,
915
  "trial_params": null
916
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:669ff7fd28968817843d8d3e735a9f1604e6f86bd0620d14ba500c796ee6cb84
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b649041f4e942a1c71886ca3f83f11fc846bb4752950f41b84cd207e3cc556c
3
  size 146774203