Bingsu commited on
Commit
c70d807
1 Parent(s): 21430a4

Training in progress, step 30000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8999b8d69ea2c480c61524c89efac987588c6717adbe72d4a81fa8728d4c9449
3
  size 100170757
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:991640a131f2a0a32a17ba1af542f31b7776932281bd0a73639dd3a4960e3a40
3
  size 100170757
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2086ad80db4aea83ae4e401109af83f1badcb088ad6d8e2646435f614981265
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c10b0dd9b3e24c2c1ca2db9a9e924f901a4d183202a5c32479436a975f462f9d
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:baddf50e1f62d0d009af36c5769a89381444f87b64cb66458a109849b3e81eff
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f754532050c5b1775c36eee5da06c337e5bc03296f22630efbd5a1c263b25446
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d84da135b4e1f71d4d4cedad216f0163422e7daea6dbfb3d8920ec2c7486ef2f
3
  size 246897640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd2f5c3de2046b6ec35a993f60879cf7288b2cb7906fdbd23f2869d9429fbe1b
3
  size 246897640
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.08594757198109153,
5
- "global_step": 20000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -606,11 +606,311 @@
606
  "learning_rate": 8.149872553192528e-05,
607
  "loss": 3.606,
608
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
  }
610
  ],
611
  "max_steps": 500000,
612
  "num_train_epochs": 3,
613
- "total_flos": 3.187659964416e+16,
614
  "trial_name": null,
615
  "trial_params": null
616
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.1289213579716373,
5
+ "global_step": 30000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
606
  "learning_rate": 8.149872553192528e-05,
607
  "loss": 3.606,
608
  "step": 20000
609
+ },
610
+ {
611
+ "epoch": 0.09,
612
+ "learning_rate": 8.232037327492777e-05,
613
+ "loss": 3.6232,
614
+ "step": 20200
615
+ },
616
+ {
617
+ "epoch": 0.09,
618
+ "learning_rate": 8.314970061979832e-05,
619
+ "loss": 3.6182,
620
+ "step": 20400
621
+ },
622
+ {
623
+ "epoch": 0.09,
624
+ "learning_rate": 8.398669301501699e-05,
625
+ "loss": 3.5904,
626
+ "step": 20600
627
+ },
628
+ {
629
+ "epoch": 0.09,
630
+ "learning_rate": 8.483133577457161e-05,
631
+ "loss": 3.6162,
632
+ "step": 20800
633
+ },
634
+ {
635
+ "epoch": 0.09,
636
+ "learning_rate": 8.568361407821506e-05,
637
+ "loss": 3.6084,
638
+ "step": 21000
639
+ },
640
+ {
641
+ "epoch": 0.09,
642
+ "learning_rate": 8.65435129717262e-05,
643
+ "loss": 3.598,
644
+ "step": 21200
645
+ },
646
+ {
647
+ "epoch": 0.09,
648
+ "learning_rate": 8.74110173671712e-05,
649
+ "loss": 3.6229,
650
+ "step": 21400
651
+ },
652
+ {
653
+ "epoch": 0.09,
654
+ "learning_rate": 8.828611204316915e-05,
655
+ "loss": 3.6101,
656
+ "step": 21600
657
+ },
658
+ {
659
+ "epoch": 0.09,
660
+ "learning_rate": 8.916878164515847e-05,
661
+ "loss": 3.5859,
662
+ "step": 21800
663
+ },
664
+ {
665
+ "epoch": 0.09,
666
+ "learning_rate": 9.005901068566706e-05,
667
+ "loss": 3.6207,
668
+ "step": 22000
669
+ },
670
+ {
671
+ "epoch": 0.1,
672
+ "learning_rate": 9.095678354458317e-05,
673
+ "loss": 3.6058,
674
+ "step": 22200
675
+ },
676
+ {
677
+ "epoch": 0.1,
678
+ "learning_rate": 9.18620844694301e-05,
679
+ "loss": 3.5935,
680
+ "step": 22400
681
+ },
682
+ {
683
+ "epoch": 0.1,
684
+ "learning_rate": 9.27748975756424e-05,
685
+ "loss": 3.5917,
686
+ "step": 22600
687
+ },
688
+ {
689
+ "epoch": 0.1,
690
+ "learning_rate": 9.369520684684475e-05,
691
+ "loss": 3.5861,
692
+ "step": 22800
693
+ },
694
+ {
695
+ "epoch": 0.1,
696
+ "learning_rate": 9.462299613513243e-05,
697
+ "loss": 3.5976,
698
+ "step": 23000
699
+ },
700
+ {
701
+ "epoch": 0.1,
702
+ "learning_rate": 9.555824916135554e-05,
703
+ "loss": 3.5914,
704
+ "step": 23200
705
+ },
706
+ {
707
+ "epoch": 0.1,
708
+ "learning_rate": 9.650094951540393e-05,
709
+ "loss": 3.5778,
710
+ "step": 23400
711
+ },
712
+ {
713
+ "epoch": 0.1,
714
+ "learning_rate": 9.745108065649507e-05,
715
+ "loss": 3.6049,
716
+ "step": 23600
717
+ },
718
+ {
719
+ "epoch": 0.1,
720
+ "learning_rate": 9.840862591346507e-05,
721
+ "loss": 3.5878,
722
+ "step": 23800
723
+ },
724
+ {
725
+ "epoch": 0.1,
726
+ "learning_rate": 9.93735684850606e-05,
727
+ "loss": 3.5918,
728
+ "step": 24000
729
+ },
730
+ {
731
+ "epoch": 0.1,
732
+ "learning_rate": 0.00010034589144023332,
733
+ "loss": 3.59,
734
+ "step": 24200
735
+ },
736
+ {
737
+ "epoch": 0.1,
738
+ "learning_rate": 0.00010132557771843796,
739
+ "loss": 3.5815,
740
+ "step": 24400
741
+ },
742
+ {
743
+ "epoch": 0.11,
744
+ "learning_rate": 0.00010231261012993076,
745
+ "loss": 3.5806,
746
+ "step": 24600
747
+ },
748
+ {
749
+ "epoch": 0.11,
750
+ "learning_rate": 0.0001033069713560718,
751
+ "loss": 3.5927,
752
+ "step": 24800
753
+ },
754
+ {
755
+ "epoch": 0.11,
756
+ "learning_rate": 0.00010430864394962811,
757
+ "loss": 3.595,
758
+ "step": 25000
759
+ },
760
+ {
761
+ "epoch": 0.11,
762
+ "learning_rate": 0.00010531761033508039,
763
+ "loss": 3.5751,
764
+ "step": 25200
765
+ },
766
+ {
767
+ "epoch": 0.11,
768
+ "learning_rate": 0.00010633385280893134,
769
+ "loss": 3.5741,
770
+ "step": 25400
771
+ },
772
+ {
773
+ "epoch": 0.11,
774
+ "learning_rate": 0.00010735735354001593,
775
+ "loss": 3.5752,
776
+ "step": 25600
777
+ },
778
+ {
779
+ "epoch": 0.11,
780
+ "learning_rate": 0.00010838809456981471,
781
+ "loss": 3.5666,
782
+ "step": 25800
783
+ },
784
+ {
785
+ "epoch": 0.11,
786
+ "learning_rate": 0.00010942605781276871,
787
+ "loss": 3.5594,
788
+ "step": 26000
789
+ },
790
+ {
791
+ "epoch": 0.11,
792
+ "learning_rate": 0.00011047122505659652,
793
+ "loss": 3.5652,
794
+ "step": 26200
795
+ },
796
+ {
797
+ "epoch": 0.11,
798
+ "learning_rate": 0.00011152357796261427,
799
+ "loss": 3.5546,
800
+ "step": 26400
801
+ },
802
+ {
803
+ "epoch": 0.11,
804
+ "learning_rate": 0.00011258309806605742,
805
+ "loss": 3.5427,
806
+ "step": 26600
807
+ },
808
+ {
809
+ "epoch": 0.12,
810
+ "learning_rate": 0.00011364976677640404,
811
+ "loss": 3.5766,
812
+ "step": 26800
813
+ },
814
+ {
815
+ "epoch": 0.12,
816
+ "learning_rate": 0.00011472356537770196,
817
+ "loss": 3.5791,
818
+ "step": 27000
819
+ },
820
+ {
821
+ "epoch": 0.12,
822
+ "learning_rate": 0.00011580447502889649,
823
+ "loss": 3.5722,
824
+ "step": 27200
825
+ },
826
+ {
827
+ "epoch": 0.12,
828
+ "learning_rate": 0.00011689247676416152,
829
+ "loss": 3.5775,
830
+ "step": 27400
831
+ },
832
+ {
833
+ "epoch": 0.12,
834
+ "learning_rate": 0.00011798755149323179,
835
+ "loss": 3.5579,
836
+ "step": 27600
837
+ },
838
+ {
839
+ "epoch": 0.12,
840
+ "learning_rate": 0.00011908968000173793,
841
+ "loss": 3.5618,
842
+ "step": 27800
843
+ },
844
+ {
845
+ "epoch": 0.12,
846
+ "learning_rate": 0.00012019884295154414,
847
+ "loss": 3.5614,
848
+ "step": 28000
849
+ },
850
+ {
851
+ "epoch": 0.12,
852
+ "learning_rate": 0.00012131502088108667,
853
+ "loss": 3.5444,
854
+ "step": 28200
855
+ },
856
+ {
857
+ "epoch": 0.12,
858
+ "learning_rate": 0.00012243819420571607,
859
+ "loss": 3.5463,
860
+ "step": 28400
861
+ },
862
+ {
863
+ "epoch": 0.12,
864
+ "learning_rate": 0.00012356834321804048,
865
+ "loss": 3.5502,
866
+ "step": 28600
867
+ },
868
+ {
869
+ "epoch": 0.12,
870
+ "learning_rate": 0.00012470544808827115,
871
+ "loss": 3.5287,
872
+ "step": 28800
873
+ },
874
+ {
875
+ "epoch": 0.12,
876
+ "learning_rate": 0.00012584948886457082,
877
+ "loss": 3.5414,
878
+ "step": 29000
879
+ },
880
+ {
881
+ "epoch": 0.13,
882
+ "learning_rate": 0.00012700044547340377,
883
+ "loss": 3.5504,
884
+ "step": 29200
885
+ },
886
+ {
887
+ "epoch": 0.13,
888
+ "learning_rate": 0.00012815829771988744,
889
+ "loss": 3.5381,
890
+ "step": 29400
891
+ },
892
+ {
893
+ "epoch": 0.13,
894
+ "learning_rate": 0.00012932302528814797,
895
+ "loss": 3.5551,
896
+ "step": 29600
897
+ },
898
+ {
899
+ "epoch": 0.13,
900
+ "learning_rate": 0.00013049460774167522,
901
+ "loss": 3.5331,
902
+ "step": 29800
903
+ },
904
+ {
905
+ "epoch": 0.13,
906
+ "learning_rate": 0.00013167302452368236,
907
+ "loss": 3.5359,
908
+ "step": 30000
909
  }
910
  ],
911
  "max_steps": 500000,
912
  "num_train_epochs": 3,
913
+ "total_flos": 4.781489946624e+16,
914
  "trial_name": null,
915
  "trial_params": null
916
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2086ad80db4aea83ae4e401109af83f1badcb088ad6d8e2646435f614981265
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c10b0dd9b3e24c2c1ca2db9a9e924f901a4d183202a5c32479436a975f462f9d
3
  size 146774203