joelniklaus committed
Commit 1f6148a
1 parent: 6bf9def

Training in progress, step 1000000

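Each binary below is tracked with Git LFS, so the diff shows only pointer files: an "oid sha256:" line (the SHA-256 of the real object) and its size in bytes. The following is a minimal sketch, assuming the repository id (not shown in this view) and using huggingface_hub, of how the updated pytorch_model.bin could be fetched at this revision and verified against its pointer:

import hashlib
import os

from huggingface_hub import hf_hub_download

# Placeholder repo id -- the repository name does not appear in this commit view.
REPO_ID = "joelniklaus/<model-name>"
REVISION = "1f6148a"  # this commit

# Expected values taken from the pytorch_model.bin pointer in the diff below.
EXPECTED_OID = "5b4259dfdc9d95d40bfcdd98b7cbe401b1f4e0ac89f3aee63b1c92426a50e86b"
EXPECTED_SIZE = 442678571

# Download the file as it exists at this revision.
path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin", revision=REVISION)

# The LFS oid is the SHA-256 of the file contents, so recompute and compare.
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

assert sha256.hexdigest() == EXPECTED_OID
assert os.path.getsize(path) == EXPECTED_SIZE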
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e4514153040aa52acec0b23e5c2fbe78b6288537c483ca9a45d4dcaba2a25a8
+oid sha256:7e5a92f11128c60bae53e142b40413437208f5b8c48d647384f7155fa97b3238
 size 885330713
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41f76ab7c37d8cb8748fce3a12de59d4e1d441d259c2a6701a0e4742c0c1c2e5
+oid sha256:5b4259dfdc9d95d40bfcdd98b7cbe401b1f4e0ac89f3aee63b1c92426a50e86b
 size 442678571
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4801294808a210863b6ecf643fbd8e926b42895cf927e0548ceeb29ac4942a6e
+oid sha256:61a8c5011437bb6a4b42e3427dd3a9a6650136424f9f2781ff1ae4fc747298ec
 size 17563
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dcaa11162b7c951c9bfa5282cd6dd6ff1417ab0e4a551347904767629ee02b1f
+oid sha256:e1972554f4c7d872911a1ab4e0fd839fb51f0fb3051ee3a13ef5ecb7c7c14439
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a43de67cf39913c0f0a434773186e831fd0385b2a45f6452c82a19dcf8be8c48
+oid sha256:c62aba35c522b0e30aa960a3ecaac259d22e10553ef1443be4a8e96f7c73f8aa
 size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.339467,
-  "global_step": 950000,
+  "epoch": 1.389467,
+  "global_step": 1000000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -5877,11 +5877,320 @@
       "eval_samples_per_second": 501.78,
       "eval_steps_per_second": 1.004,
       "step": 950000
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 6.651113057525916e-07,
+      "loss": 0.899,
+      "step": 951000
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 6.385277592210082e-07,
+      "loss": 0.8642,
+      "step": 952000
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 6.124571418766378e-07,
+      "loss": 0.9313,
+      "step": 953000
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 5.869518794409723e-07,
+      "loss": 0.9976,
+      "step": 954000
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 5.619611892955956e-07,
+      "loss": 0.9632,
+      "step": 955000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 5.375353255232474e-07,
+      "loss": 0.833,
+      "step": 956000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 5.136256546577067e-07,
+      "loss": 0.8247,
+      "step": 957000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 4.902571588535909e-07,
+      "loss": 0.789,
+      "step": 958000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 4.674526502020382e-07,
+      "loss": 0.7219,
+      "step": 959000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 4.451667234591728e-07,
+      "loss": 0.8131,
+      "step": 960000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 4.234441936661282e-07,
+      "loss": 0.9334,
+      "step": 961000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 4.0224180986853655e-07,
+      "loss": 0.9777,
+      "step": 962000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 3.8160220819785095e-07,
+      "loss": 1.0358,
+      "step": 963000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 3.6148429387927175e-07,
+      "loss": 1.0623,
+      "step": 964000
+    },
+    {
+      "epoch": 1.35,
+      "learning_rate": 3.419285222713675e-07,
+      "loss": 1.0595,
+      "step": 965000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 3.228959565747369e-07,
+      "loss": 0.9013,
+      "step": 966000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 3.044248696072116e-07,
+      "loss": 0.9035,
+      "step": 967000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 2.86478484246272e-07,
+      "loss": 0.865,
+      "step": 968000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 2.690928890965172e-07,
+      "loss": 0.8449,
+      "step": 969000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 2.5223346831947934e-07,
+      "loss": 1.0131,
+      "step": 970000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 2.359180811469297e-07,
+      "loss": 1.0522,
+      "step": 971000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 2.2016240528467956e-07,
+      "loss": 1.0527,
+      "step": 972000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 2.0493507016841605e-07,
+      "loss": 1.0244,
+      "step": 973000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 1.902666966315303e-07,
+      "loss": 0.9939,
+      "step": 974000
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 1.7612807899859974e-07,
+      "loss": 0.975,
+      "step": 975000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 1.6254764878778085e-07,
+      "loss": 0.8924,
+      "step": 976000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 1.4949836648880388e-07,
+      "loss": 0.8095,
+      "step": 977000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 1.3699424128894024e-07,
+      "loss": 0.7741,
+      "step": 978000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 1.2503540993129005e-07,
+      "loss": 0.7888,
+      "step": 979000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 1.1363314412082271e-07,
+      "loss": 0.8592,
+      "step": 980000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 1.0276474121272417e-07,
+      "loss": 0.8308,
+      "step": 981000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 9.245205661059241e-08,
+      "loss": 0.8657,
+      "step": 982000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 8.2674557095902e-08,
+      "loss": 0.9413,
+      "step": 983000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 7.345190436134352e-08,
+      "loss": 0.9376,
+      "step": 984000
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 6.476573551197352e-08,
+      "loss": 0.8103,
+      "step": 985000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 5.6633517670373616e-08,
+      "loss": 0.8122,
+      "step": 986000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 4.903905906762374e-08,
+      "loss": 0.809,
+      "step": 987000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 4.1990856170864845e-08,
+      "loss": 0.7404,
+      "step": 988000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 3.549521501085562e-08,
+      "loss": 0.8188,
+      "step": 989000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 2.9539202348127794e-08,
+      "loss": 0.9535,
+      "step": 990000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 2.412965863871075e-08,
+      "loss": 0.9178,
+      "step": 991000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 1.9271233047113424e-08,
+      "loss": 0.9305,
+      "step": 992000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 1.4954252135407352e-08,
+      "loss": 0.938,
+      "step": 993000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 1.1187396973016962e-08,
+      "loss": 0.969,
+      "step": 994000
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 7.963167508967528e-09,
+      "loss": 0.8864,
+      "step": 995000
+    },
+    {
+      "epoch": 1.39,
+      "learning_rate": 5.288047385498818e-09,
+      "loss": 0.8899,
+      "step": 996000
+    },
+    {
+      "epoch": 1.39,
+      "learning_rate": 3.156710266344343e-09,
+      "loss": 0.8701,
+      "step": 997000
+    },
+    {
+      "epoch": 1.39,
+      "learning_rate": 1.5721306152016724e-09,
+      "loss": 0.8305,
+      "step": 998000
+    },
+    {
+      "epoch": 1.39,
+      "learning_rate": 5.350904473455653e-10,
+      "loss": 0.961,
+      "step": 999000
+    },
+    {
+      "epoch": 1.39,
+      "learning_rate": 4.3524949094875254e-11,
+      "loss": 1.0488,
+      "step": 1000000
+    },
+    {
+      "epoch": 1.39,
+      "eval_accuracy": 0.8387476162115661,
+      "eval_loss": 0.7079769372940063,
+      "eval_runtime": 10.2424,
+      "eval_samples_per_second": 488.169,
+      "eval_steps_per_second": 0.976,
+      "step": 1000000
     }
   ],
   "max_steps": 1000000,
   "num_train_epochs": 9223372036854775807,
-  "total_flos": 6.401249364538294e+19,
+  "total_flos": 6.738157336654774e+19,
   "trial_name": null,
   "trial_params": null
 }
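For reference, the rows added above follow the schema the transformers Trainer writes to trainer_state.json: per-logging-step dicts (epoch, learning_rate, loss, step) plus periodic eval entries, kept in the file's log_history list (that key itself lies outside the hunk context shown). A small sketch, assuming a local copy of this checkpoint directory, that prints the newly appended losses and the final evaluation:

import json

# Assumes the checkpoint directory from this commit is available locally.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Training losses logged after the previous checkpoint (step 950000).
for entry in state["log_history"]:
    if entry.get("step", 0) > 950000 and "loss" in entry:
        print(f'step {entry["step"]:>9}  lr {entry["learning_rate"]:.3e}  loss {entry["loss"]:.4f}')

# Most recent evaluation entry.
evals = [e for e in state["log_history"] if "eval_loss" in e]
if evals:
    last = evals[-1]
    print(f'eval @ step {last["step"]}: accuracy {last["eval_accuracy"]:.4f}, loss {last["eval_loss"]:.4f}')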
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41f76ab7c37d8cb8748fce3a12de59d4e1d441d259c2a6701a0e4742c0c1c2e5
+oid sha256:5b4259dfdc9d95d40bfcdd98b7cbe401b1f4e0ac89f3aee63b1c92426a50e86b
 size 442678571