Nadav committed
Commit 924be84
Parent: a667843

Training in progress, step 2000000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:24480137122a3ca1298b2aa2acbf1d8e05d75ba9f182abd41ff9618c60e00071
+oid sha256:38f36466b9f2b124ce3950f4272937ae40e2fa26880ec00a4e1f83639190fb7d
 size 893439185
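
Each of these entries is a Git LFS pointer file: the repository itself stores only the `version`, `oid sha256:...`, and `size` lines, while the binary payload lives in LFS storage. As a minimal sketch (the helper below is illustrative and not part of this repo), a downloaded file can be checked against the pointer that replaces it:

```python
import hashlib
import os

def matches_lfs_pointer(path, expected_oid, expected_size):
    """Return True if the file at `path` has the recorded size and sha256 digest."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# The new optimizer state introduced by this commit.
print(matches_lfs_pointer(
    "last-checkpoint/optimizer.pt",
    "38f36466b9f2b124ce3950f4272937ae40e2fa26880ec00a4e1f83639190fb7d",
    893439185,
))
```
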
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4158aaedff079b2378ceb72199c920ad399c00fbc03838dbc3a2204ee0d64219
+oid sha256:5fc35de7c7ab795f6ce22b4d822a3c81dd28eb6da159fa0e6bc70e2d249fbce8
 size 449471589
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a16c585a386790723cc51bc4a838a254dc71110b475f7ebf887ed7011d90a8f
+oid sha256:c86960e82d428869302623bd9f7002f37b98a8296d67cde31b64acf1793fdd0e
 size 21579
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:abaeb1638369c701afb9b3b4e706b5c028681adb6ebf26ba2bfe37402d287efd
+oid sha256:26c2c5dcfeda6d6eb5b101bdcd99b94aa97e0eb4affa75fa0e151082e701b9eb
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e2c8322c0057a49117b93f76b6d690bf483c56843cf994e2b3614611effcb47d
+oid sha256:994a0fabdb31bb0426e3f82b99b32aaddcc1766fdd4539450b1f928f65099fb8
 size 623
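
The `last-checkpoint` directory holds everything needed to resume training: model weights (`pytorch_model.bin`), optimizer and LR-scheduler state (`optimizer.pt`, `scheduler.pt`), the AMP grad-scaler state (`scaler.pt`), RNG state (`rng_state.pth`), and the progress log updated below (`trainer_state.json`). With the Hugging Face `Trainer`, passing `resume_from_checkpoint="last-checkpoint"` to `train()` restores all of these; the sketch below (paths assumed relative to the repo root) only peeks at the raw artifacts:

```python
import torch

# Load onto CPU so no GPU is needed just to inspect the checkpoint files.
model_state = torch.load("last-checkpoint/pytorch_model.bin", map_location="cpu")
optimizer_state = torch.load("last-checkpoint/optimizer.pt", map_location="cpu")
scheduler_state = torch.load("last-checkpoint/scheduler.pt", map_location="cpu")
scaler_state = torch.load("last-checkpoint/scaler.pt", map_location="cpu")

print(len(model_state))                # number of parameter tensors in the state dict
print(sorted(optimizer_state.keys()))  # typically ['param_groups', 'state']
print(scheduler_state)                 # small dict (e.g. last_epoch / step count)
print(scaler_state)                    # AMP GradScaler state (scale, growth factor, ...)
```
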
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
 "best_metric": null,
 "best_model_checkpoint": null,
-"epoch": 0.075,
-"global_step": 1950000,
+"epoch": 0.1,
+"global_step": 2000000,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -14826,11 +14826,391 @@
  "eval_samples_per_second": 82.821,
  "eval_steps_per_second": 0.647,
  "step": 1950000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.026354625870075e-05,
+ "loss": 0.4364,
+ "step": 1951000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0253060901106556e-05,
+ "loss": 0.4361,
+ "step": 1952000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0242798171546145e-05,
+ "loss": 0.4365,
+ "step": 1953000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0232747509747644e-05,
+ "loss": 0.4373,
+ "step": 1954000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0222899204125646e-05,
+ "loss": 0.4362,
+ "step": 1955000
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 0.4164978265762329,
+ "eval_runtime": 80.0596,
+ "eval_samples_per_second": 79.94,
+ "eval_steps_per_second": 0.625,
+ "step": 1955000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0213263451653737e-05,
+ "loss": 0.4367,
+ "step": 1956000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0203849598659497e-05,
+ "loss": 0.4367,
+ "step": 1957000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0194638827271399e-05,
+ "loss": 0.4364,
+ "step": 1958000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0185640695119401e-05,
+ "loss": 0.4363,
+ "step": 1959000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.017685522961337e-05,
+ "loss": 0.4362,
+ "step": 1960000
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 0.42052188515663147,
+ "eval_runtime": 77.8558,
+ "eval_samples_per_second": 82.203,
+ "eval_steps_per_second": 0.642,
+ "step": 1960000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0168282457515363e-05,
+ "loss": 0.4369,
+ "step": 1961000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0159930658730172e-05,
+ "loss": 0.4364,
+ "step": 1962000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0151791179631108e-05,
+ "loss": 0.4359,
+ "step": 1963000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0143856216286122e-05,
+ "loss": 0.4368,
+ "step": 1964000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0136134046869866e-05,
+ "loss": 0.4357,
+ "step": 1965000
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 0.41740044951438904,
+ "eval_runtime": 78.1991,
+ "eval_samples_per_second": 81.842,
+ "eval_steps_per_second": 0.639,
+ "step": 1965000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0128632097947403e-05,
+ "loss": 0.4365,
+ "step": 1966000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.0121335373458022e-05,
+ "loss": 0.4362,
+ "step": 1967000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.011425151149977e-05,
+ "loss": 0.4361,
+ "step": 1968000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 1.010738729828653e-05,
+ "loss": 0.4375,
+ "step": 1969000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0100729012562797e-05,
+ "loss": 0.4372,
+ "step": 1970000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.4145086705684662,
+ "eval_runtime": 79.8319,
+ "eval_samples_per_second": 80.168,
+ "eval_steps_per_second": 0.626,
+ "step": 1970000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0094289991138392e-05,
+ "loss": 0.4363,
+ "step": 1971000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0088057362697175e-05,
+ "loss": 0.4375,
+ "step": 1972000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0082049524936494e-05,
+ "loss": 0.4372,
+ "step": 1973000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0076242416653332e-05,
+ "loss": 0.4349,
+ "step": 1974000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0070648308262255e-05,
+ "loss": 0.436,
+ "step": 1975000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.4151042103767395,
+ "eval_runtime": 79.0273,
+ "eval_samples_per_second": 80.985,
+ "eval_steps_per_second": 0.633,
+ "step": 1975000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.006526721680391e-05,
+ "loss": 0.4342,
+ "step": 1976000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0060099158670026e-05,
+ "loss": 0.4363,
+ "step": 1977000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0055148998189381e-05,
+ "loss": 0.437,
+ "step": 1978000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0050411475939925e-05,
+ "loss": 0.436,
+ "step": 1979000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0045882183469046e-05,
+ "loss": 0.4355,
+ "step": 1980000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.4141569435596466,
+ "eval_runtime": 79.5726,
+ "eval_samples_per_second": 80.43,
+ "eval_steps_per_second": 0.628,
+ "step": 1980000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0041565983372807e-05,
+ "loss": 0.4359,
+ "step": 1981000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0037462888799093e-05,
+ "loss": 0.4362,
+ "step": 1982000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0033576695766748e-05,
+ "loss": 0.4376,
+ "step": 1983000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0029899635949539e-05,
+ "loss": 0.4373,
+ "step": 1984000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0026435717192568e-05,
+ "loss": 0.4367,
+ "step": 1985000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.4171934127807617,
+ "eval_runtime": 77.9474,
+ "eval_samples_per_second": 82.107,
+ "eval_steps_per_second": 0.641,
+ "step": 1985000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0023184950047551e-05,
+ "loss": 0.4361,
+ "step": 1986000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.002015027554519e-05,
+ "loss": 0.4377,
+ "step": 1987000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0017325627506754e-05,
+ "loss": 0.4373,
+ "step": 1988000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 1.0014716663814055e-05,
+ "loss": 0.4368,
+ "step": 1989000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0012320461270247e-05,
+ "loss": 0.4358,
+ "step": 1990000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.41612717509269714,
+ "eval_runtime": 80.5577,
+ "eval_samples_per_second": 79.446,
+ "eval_steps_per_second": 0.621,
+ "step": 1990000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0010134948139825e-05,
+ "loss": 0.4366,
+ "step": 1991000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0008162636276321e-05,
+ "loss": 0.4369,
+ "step": 1992000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0006403531687724e-05,
+ "loss": 0.4372,
+ "step": 1993000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0004859079123212e-05,
+ "loss": 0.4361,
+ "step": 1994000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0003526191291106e-05,
+ "loss": 0.4369,
+ "step": 1995000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.4170204997062683,
+ "eval_runtime": 80.1918,
+ "eval_samples_per_second": 79.809,
+ "eval_steps_per_second": 0.624,
+ "step": 1995000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0002406524857334e-05,
+ "loss": 0.436,
+ "step": 1996000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0001500883167451e-05,
+ "loss": 0.4372,
+ "step": 1997000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0000807455884181e-05,
+ "loss": 0.4369,
+ "step": 1998000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.0000327631969819e-05,
+ "loss": 0.4362,
+ "step": 1999000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.00000604522778e-05,
+ "loss": 0.4363,
+ "step": 2000000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.41442054510116577,
+ "eval_runtime": 79.9098,
+ "eval_samples_per_second": 80.09,
+ "eval_steps_per_second": 0.626,
+ "step": 2000000
  }
 ],
 "max_steps": 2000000,
 "num_train_epochs": 9223372036854775807,
-"total_flos": 1.7086938841350144e+22,
+"total_flos": 1.752506547830784e+22,
 "trial_name": null,
 "trial_params": null
 }
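
`trainer_state.json` is where the progress shown in the diff above is recorded: each logged training step appends a dict with `epoch`, `learning_rate`, `loss`, and `step`, and each evaluation appends one with `eval_loss` and throughput numbers, all collected in the Trainer's `log_history` list. Assuming that standard layout, a short sketch for pulling out the evaluation curve:

```python
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation entries (the ones carrying "eval_loss").
evals = [(entry["step"], entry["eval_loss"])
         for entry in state["log_history"] if "eval_loss" in entry]

print(state["global_step"], state["epoch"])  # 2000000, 0.1 after this commit
print(evals[-1])                             # most recent eval, e.g. (2000000, 0.4144...)
```
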
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4158aaedff079b2378ceb72199c920ad399c00fbc03838dbc3a2204ee0d64219
+oid sha256:5fc35de7c7ab795f6ce22b4d822a3c81dd28eb6da159fa0e6bc70e2d249fbce8
 size 449471589