Nadav committed on
Commit 49debac
1 Parent(s): 552197f

Training in progress, step 1850000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84eeca699785d889add4fce9e83fcf219cc03b8c3e8612092092ba4f022e339b
+oid sha256:dfb68686d566f5019ee82a8c96b0a5544a86168b18ae5533f028d07705e256d7
 size 893439185
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5a47c42cd40edaf177247b0f81cc113941e45da543bcd8075122f86f8a439a53
+oid sha256:64dd9bb8ac07ad494b77a3974d9a13d4d5d6c9061220ee2632308b55b6ccca8c
 size 449471589
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c22e615daa20a7523bf096df9dcc68366ed60a8151bafc863df6c6b53275a84a
-size 21643
+oid sha256:b8093364ac983f02083f889c77722892d297eae3bcec837969f1e20972859470
+size 21579
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c4c724e259a52a66e7ae3019ca30f1baaafdcfcaf6dbe949cbda0206af52d55
+oid sha256:04b4f57439a544d0a66cc7c8aa509e6d07139998cf7951a6fd1fc7884297b3c7
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1a24dd415d95b2d83e758fabab0d2c6d80262a248eda13bb423bd8c9ef9f0d1d
+oid sha256:203a36bdaa16f61697b76694bf2a74dc1a746df9c496ed1bca73de3ffd507a20
 size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.15,
-  "global_step": 1800000,
+  "epoch": 0.025,
+  "global_step": 1850000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -13686,11 +13686,391 @@
       "eval_samples_per_second": 83.142,
       "eval_steps_per_second": 0.65,
       "step": 1800000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.4208113677502687e-05,
+      "loss": 0.4365,
+      "step": 1801000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.4166492588365344e-05,
+      "loss": 0.4384,
+      "step": 1802000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.4125072039508715e-05,
+      "loss": 0.4379,
+      "step": 1803000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.4083852157106983e-05,
+      "loss": 0.4377,
+      "step": 1804000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.4042833066723076e-05,
+      "loss": 0.4385,
+      "step": 1805000
+    },
+    {
+      "epoch": 0.0,
+      "eval_loss": 0.42015281319618225,
+      "eval_runtime": 78.5938,
+      "eval_samples_per_second": 81.431,
+      "eval_steps_per_second": 0.636,
+      "step": 1805000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.4002055611082185e-05,
+      "loss": 0.4387,
+      "step": 1806000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.396143827787245e-05,
+      "loss": 0.4379,
+      "step": 1807000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.3921022109574423e-05,
+      "loss": 0.4373,
+      "step": 1808000
+    },
+    {
+      "epoch": 0.0,
+      "learning_rate": 1.3880847343598854e-05,
+      "loss": 0.4382,
+      "step": 1809000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.384087358540966e-05,
+      "loss": 0.438,
+      "step": 1810000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 0.4181618392467499,
+      "eval_runtime": 77.0873,
+      "eval_samples_per_second": 83.023,
+      "eval_steps_per_second": 0.649,
+      "step": 1810000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3801061244895656e-05,
+      "loss": 0.4382,
+      "step": 1811000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3761450557829634e-05,
+      "loss": 0.4392,
+      "step": 1812000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.372204164487259e-05,
+      "loss": 0.4387,
+      "step": 1813000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.368283462607094e-05,
+      "loss": 0.4388,
+      "step": 1814000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3643868524915881e-05,
+      "loss": 0.4392,
+      "step": 1815000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 0.42043235898017883,
+      "eval_runtime": 79.2626,
+      "eval_samples_per_second": 80.744,
+      "eval_steps_per_second": 0.631,
+      "step": 1815000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3605065449912204e-05,
+      "loss": 0.4395,
+      "step": 1816000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3566464625393676e-05,
+      "loss": 0.4391,
+      "step": 1817000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.352810446627972e-05,
+      "loss": 0.4379,
+      "step": 1818000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3489908292326226e-05,
+      "loss": 0.4377,
+      "step": 1819000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3451952611981318e-05,
+      "loss": 0.4389,
+      "step": 1820000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 0.41748473048210144,
+      "eval_runtime": 77.7669,
+      "eval_samples_per_second": 82.297,
+      "eval_steps_per_second": 0.643,
+      "step": 1820000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3414161553535873e-05,
+      "loss": 0.4386,
+      "step": 1821000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3376573327101957e-05,
+      "loss": 0.4383,
+      "step": 1822000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.333918804717982e-05,
+      "loss": 0.4371,
+      "step": 1823000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3302079989360922e-05,
+      "loss": 0.4369,
+      "step": 1824000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3265100537030001e-05,
+      "loss": 0.4378,
+      "step": 1825000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 0.4193136692047119,
+      "eval_runtime": 79.079,
+      "eval_samples_per_second": 80.932,
+      "eval_steps_per_second": 0.632,
+      "step": 1825000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3228324370776315e-05,
+      "loss": 0.4385,
+      "step": 1826000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.319175160262646e-05,
+      "loss": 0.4363,
+      "step": 1827000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3155418611556128e-05,
+      "loss": 0.438,
+      "step": 1828000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3119252769539538e-05,
+      "loss": 0.4378,
+      "step": 1829000
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.3083326518189592e-05,
+      "loss": 0.4377,
+      "step": 1830000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 0.4179893732070923,
+      "eval_runtime": 78.6845,
+      "eval_samples_per_second": 81.337,
+      "eval_steps_per_second": 0.635,
+      "step": 1830000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.3047568042535075e-05,
+      "loss": 0.4388,
+      "step": 1831000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.3012013515599501e-05,
+      "loss": 0.439,
+      "step": 1832000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2976698294195656e-05,
+      "loss": 0.4392,
+      "step": 1833000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2941586829267356e-05,
+      "loss": 0.4378,
+      "step": 1834000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2906644387183456e-05,
+      "loss": 0.4372,
+      "step": 1835000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 0.4213528037071228,
+      "eval_runtime": 77.6423,
+      "eval_samples_per_second": 82.429,
+      "eval_steps_per_second": 0.644,
+      "step": 1835000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.287194095903841e-05,
+      "loss": 0.4367,
+      "step": 1836000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2837407174229876e-05,
+      "loss": 0.437,
+      "step": 1837000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2803077978326747e-05,
+      "loss": 0.4377,
+      "step": 1838000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2768953475901701e-05,
+      "loss": 0.4383,
+      "step": 1839000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2735101405857255e-05,
+      "loss": 0.4379,
+      "step": 1840000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 0.41641688346862793,
+      "eval_runtime": 79.1386,
+      "eval_samples_per_second": 80.871,
+      "eval_steps_per_second": 0.632,
+      "step": 1840000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2701386191707756e-05,
+      "loss": 0.4379,
+      "step": 1841000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2667875980807157e-05,
+      "loss": 0.4384,
+      "step": 1842000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2634570875233356e-05,
+      "loss": 0.4379,
+      "step": 1843000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2601470976439498e-05,
+      "loss": 0.4368,
+      "step": 1844000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2568576385253613e-05,
+      "loss": 0.4379,
+      "step": 1845000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 0.41581401228904724,
+      "eval_runtime": 81.6085,
+      "eval_samples_per_second": 78.423,
+      "eval_steps_per_second": 0.613,
+      "step": 1845000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2535919788427315e-05,
+      "loss": 0.4365,
+      "step": 1846000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2503435906882624e-05,
+      "loss": 0.4374,
+      "step": 1847000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.247115763157773e-05,
+      "loss": 0.4381,
+      "step": 1848000
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.2439117030626584e-05,
+      "loss": 0.4368,
+      "step": 1849000
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 1.2407250056299487e-05,
+      "loss": 0.4383,
+      "step": 1850000
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 0.4171139597892761,
+      "eval_runtime": 78.6968,
+      "eval_samples_per_second": 81.325,
+      "eval_steps_per_second": 0.635,
+      "step": 1850000
     }
   ],
   "max_steps": 2000000,
   "num_train_epochs": 9223372036854775807,
-  "total_flos": 1.5772558930477056e+22,
+  "total_flos": 1.6210685567434752e+22,
   "trial_name": null,
   "trial_params": null
 }
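The trainer_state.json hunk above appends log_history entries for steps 1801000 through 1850000 and updates total_flos. A minimal sketch for reading the latest evaluation result back out of the checkpoint state, assuming a locally downloaded copy of last-checkpoint/trainer_state.json (log_history and global_step are the standard Hugging Face Trainer state fields shown in the hunk):

```python
import json

# Load the checkpoint's trainer state (assumes the checkpoint was pulled locally).
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Most recent log entry that carries an evaluation loss.
latest_eval = max(
    (entry for entry in state["log_history"] if "eval_loss" in entry),
    key=lambda entry: entry["step"],
)

print(state["global_step"], latest_eval["eval_loss"])  # expected: 1850000 0.4171...
```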
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c28844cf989cd101405725df159cedff2187e54e737139f666fe6aff1d4bf03
+oid sha256:3ffbace6af33e15cfb1f1ee5cd7d43fec11995860b2c004e4c591e320c40cf9b
 size 5551
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5a47c42cd40edaf177247b0f81cc113941e45da543bcd8075122f86f8a439a53
+oid sha256:64dd9bb8ac07ad494b77a3974d9a13d4d5d6c9061220ee2632308b55b6ccca8c
 size 449471589
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c28844cf989cd101405725df159cedff2187e54e737139f666fe6aff1d4bf03
+oid sha256:3ffbace6af33e15cfb1f1ee5cd7d43fec11995860b2c004e4c591e320c40cf9b
 size 5551
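Every file touched by this commit is stored as a Git LFS pointer, i.e. the three-line version / oid sha256:<hex> / size <bytes> stubs shown in the hunks above; only the pointers change in Git, while the binary blobs live in LFS storage. A minimal, hypothetical sketch for checking a locally downloaded blob against such a pointer file (the paths in the usage comment are placeholders, not files tracked in this repo):

```python
import hashlib
import os

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Return True if the local blob matches the oid/size recorded in a git-lfs pointer."""
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value

    expected_digest = fields["oid"].split(":", 1)[1]  # "sha256:<hex>" -> "<hex>"
    expected_size = int(fields["size"])

    if os.path.getsize(blob_path) != expected_size:
        return False

    sha = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            sha.update(chunk)
    return sha.hexdigest() == expected_digest

# Usage (hypothetical local paths):
# verify_lfs_pointer("optimizer.pt.pointer", "optimizer.pt")
```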