Nadav committed on
Commit 3eac625
1 Parent(s): ab47c11

Training in progress, step 1750000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a6263ea7e43a6acbefa798ce6055706ef15240d94f08fb8faefbf26e23ac3a25
+ oid sha256:c223f21c9f3d69fb40b6ad537a2d1e1726b01ec615931fd84b4f155a73edb6cb
  size 893439185
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:abc7a8543a963e582a29e31e1e0c78fea4345a1b73b925ed6cc4d7ab61edbd1e
+ oid sha256:d4b2f64ee4b8a3f1cf3d86fb133d82c77bc0f7052c00d93cb35fb4180acc8509
  size 449471589
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f7eefc1725778458a372a52de0baec705be0fcd52c035947880ee6c60789db03
+ oid sha256:e052c7897af7d62d87b26b3f0036377845bb2408ce5c5d3e7b4078dbe5f611ef
  size 21643
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9f873a347744a9c52f42be277b16c7300feca4fe83dae00b3348477c6cab3f68
+ oid sha256:af6d04926cbb05a843491ada6b24ca053dbb81e1dc7c6706a5415b4d4cca0e78
  size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f676f9b0130b013ba493986d64992bf63d68d6bad5cd11e3728c43b657e50e05
+ oid sha256:cd9e8ca586c336641c0b85f2a85288a9eeaaab808e84d3e0180b33f991192ef6
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.1,
- "global_step": 1700000,
+ "epoch": 0.125,
+ "global_step": 1750000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -12926,11 +12926,391 @@
  "eval_samples_per_second": 81.826,
  "eval_steps_per_second": 0.639,
  "step": 1700000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9356303574345033e-05,
+ "loss": 0.44,
+ "step": 1701000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9295375753654256e-05,
+ "loss": 0.4406,
+ "step": 1702000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9234693500252896e-05,
+ "loss": 0.4392,
+ "step": 1703000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9174195978495195e-05,
+ "loss": 0.4389,
+ "step": 1704000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9113823092023844e-05,
+ "loss": 0.4395,
+ "step": 1705000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.42103949189186096,
+ "eval_runtime": 79.7626,
+ "eval_samples_per_second": 80.238,
+ "eval_steps_per_second": 0.627,
+ "step": 1705000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9053635675406527e-05,
+ "loss": 0.4387,
+ "step": 1706000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.899363391198454e-05,
+ "loss": 0.4405,
+ "step": 1707000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.893381798453365e-05,
+ "loss": 0.4395,
+ "step": 1708000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.887418807526355e-05,
+ "loss": 0.439,
+ "step": 1709000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.8814803716459616e-05,
+ "loss": 0.439,
+ "step": 1710000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.42225512862205505,
+ "eval_runtime": 77.1239,
+ "eval_samples_per_second": 82.983,
+ "eval_steps_per_second": 0.648,
+ "step": 1710000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.875560536579964e-05,
+ "loss": 0.439,
+ "step": 1711000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8696534225358292e-05,
+ "loss": 0.4392,
+ "step": 1712000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8637708616967782e-05,
+ "loss": 0.4397,
+ "step": 1713000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8579010950865316e-05,
+ "loss": 0.4385,
+ "step": 1714000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.852050038374741e-05,
+ "loss": 0.4391,
+ "step": 1715000
+ },
+ {
+ "epoch": 0.11,
+ "eval_loss": 0.420716792345047,
+ "eval_runtime": 78.5003,
+ "eval_samples_per_second": 81.528,
+ "eval_steps_per_second": 0.637,
+ "step": 1715000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8462235323533128e-05,
+ "loss": 0.4395,
+ "step": 1716000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8404099300970416e-05,
+ "loss": 0.4378,
+ "step": 1717000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8346208764813356e-05,
+ "loss": 0.4408,
+ "step": 1718000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8288447994466744e-05,
+ "loss": 0.4388,
+ "step": 1719000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8230932687039617e-05,
+ "loss": 0.439,
+ "step": 1720000
+ },
+ {
+ "epoch": 0.11,
+ "eval_loss": 0.41757285594940186,
+ "eval_runtime": 79.8473,
+ "eval_samples_per_second": 80.153,
+ "eval_steps_per_second": 0.626,
+ "step": 1720000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8173547872002242e-05,
+ "loss": 0.4384,
+ "step": 1721000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.811640849341029e-05,
+ "loss": 0.4401,
+ "step": 1722000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8059400332198968e-05,
+ "loss": 0.438,
+ "step": 1723000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.8002580852796262e-05,
+ "loss": 0.4401,
+ "step": 1724000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.7945950228284155e-05,
+ "loss": 0.4401,
+ "step": 1725000
+ },
+ {
+ "epoch": 0.11,
+ "eval_loss": 0.41903457045555115,
+ "eval_runtime": 77.4134,
+ "eval_samples_per_second": 82.673,
+ "eval_steps_per_second": 0.646,
+ "step": 1725000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.788950863116934e-05,
+ "loss": 0.4383,
+ "step": 1726000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.783331239121836e-05,
+ "loss": 0.4383,
+ "step": 1727000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.7777305143227536e-05,
+ "loss": 0.4401,
+ "step": 1728000
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 1.772143127833117e-05,
+ "loss": 0.4391,
+ "step": 1729000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.766574712475575e-05,
+ "loss": 0.439,
+ "step": 1730000
+ },
+ {
+ "epoch": 0.12,
+ "eval_loss": 0.4182310104370117,
+ "eval_runtime": 75.5121,
+ "eval_samples_per_second": 84.755,
+ "eval_steps_per_second": 0.662,
+ "step": 1730000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7610252852124898e-05,
+ "loss": 0.4387,
+ "step": 1731000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.755494862948377e-05,
+ "loss": 0.4382,
+ "step": 1732000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7499889644232756e-05,
+ "loss": 0.4385,
+ "step": 1733000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.744496583592041e-05,
+ "loss": 0.4408,
+ "step": 1734000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7390287219108524e-05,
+ "loss": 0.4401,
+ "step": 1735000
+ },
+ {
+ "epoch": 0.12,
+ "eval_loss": 0.4186602830886841,
+ "eval_runtime": 80.3977,
+ "eval_samples_per_second": 79.604,
+ "eval_steps_per_second": 0.622,
+ "step": 1735000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.733574449368513e-05,
+ "loss": 0.4391,
+ "step": 1736000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7281392654451555e-05,
+ "loss": 0.4401,
+ "step": 1737000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7227339997768454e-05,
+ "loss": 0.4405,
+ "step": 1738000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7173370044430122e-05,
+ "loss": 0.439,
+ "step": 1739000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7119591471902336e-05,
+ "loss": 0.4397,
+ "step": 1740000
+ },
+ {
+ "epoch": 0.12,
+ "eval_loss": 0.41898131370544434,
+ "eval_runtime": 77.8556,
+ "eval_samples_per_second": 82.203,
+ "eval_steps_per_second": 0.642,
+ "step": 1740000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7066004444003927e-05,
+ "loss": 0.4388,
+ "step": 1741000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.7012609123970294e-05,
+ "loss": 0.4388,
+ "step": 1742000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6959405674452816e-05,
+ "loss": 0.4393,
+ "step": 1743000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6906447172961322e-05,
+ "loss": 0.4386,
+ "step": 1744000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6853627757817506e-05,
+ "loss": 0.4379,
+ "step": 1745000
+ },
+ {
+ "epoch": 0.12,
+ "eval_loss": 0.41874217987060547,
+ "eval_runtime": 79.3439,
+ "eval_samples_per_second": 80.662,
+ "eval_steps_per_second": 0.63,
+ "step": 1745000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6801053228400387e-05,
+ "loss": 0.4407,
+ "step": 1746000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6748618490574697e-05,
+ "loss": 0.4398,
+ "step": 1747000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.669637642742642e-05,
+ "loss": 0.4385,
+ "step": 1748000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6644327198093962e-05,
+ "loss": 0.4376,
+ "step": 1749000
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 1.6592522720912954e-05,
+ "loss": 0.4381,
+ "step": 1750000
+ },
+ {
+ "epoch": 0.12,
+ "eval_loss": 0.42223626375198364,
+ "eval_runtime": 79.3599,
+ "eval_samples_per_second": 80.645,
+ "eval_steps_per_second": 0.63,
+ "step": 1750000
  }
  ],
  "max_steps": 2000000,
  "num_train_epochs": 9223372036854775807,
- "total_flos": 1.4896305656561664e+22,
+ "total_flos": 1.533443229351936e+22,
  "trial_name": null,
  "trial_params": null
  }
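Side note: the entries added above follow the standard Hugging Face Trainer trainer_state.json layout (a "log_history" array plus top-level fields such as "global_step"). A minimal sketch, assuming a local checkout of this checkpoint, for pulling the logged eval_loss values out of that file; the path is illustrative only:

import json

state_path = "last-checkpoint/trainer_state.json"  # assumed local path to this checkpoint

with open(state_path) as f:
    state = json.load(f)

# Top-level progress fields as shown in the diff above.
print("global_step:", state["global_step"], "epoch:", state["epoch"])

# Evaluation entries carry "eval_loss"; plain training entries carry "loss"/"learning_rate".
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(entry["step"], entry["eval_loss"])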
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:abc7a8543a963e582a29e31e1e0c78fea4345a1b73b925ed6cc4d7ab61edbd1e
+ oid sha256:d4b2f64ee4b8a3f1cf3d86fb133d82c77bc0f7052c00d93cb35fb4180acc8509
  size 449471589