mosama commited on
Commit
2268aad
·
verified ·
1 Parent(s): 91bc873

Training in progress, step 1900, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -26,13 +26,13 @@
26
  "rank_pattern": {},
27
  "revision": null,
28
  "target_modules": [
29
- "k_proj",
30
- "gate_proj",
31
- "down_proj",
32
- "q_proj",
33
  "v_proj",
 
 
34
  "o_proj",
35
- "up_proj"
 
 
36
  ],
37
  "task_type": "CAUSAL_LM",
38
  "use_dora": false,
 
26
  "rank_pattern": {},
27
  "revision": null,
28
  "target_modules": [
 
 
 
 
29
  "v_proj",
30
+ "up_proj",
31
+ "q_proj",
32
  "o_proj",
33
+ "k_proj",
34
+ "gate_proj",
35
+ "down_proj"
36
  ],
37
  "task_type": "CAUSAL_LM",
38
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a44dfb5e264781c6f6c2ec17953b55c8d56028cea17c18c1ea00e1a273ca0df
3
  size 1370666272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f76dbb9a45ae718cb4c4ffa542564fbd46a97583f94b15b7d3e80c39275a70f
3
  size 1370666272
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64d8002ffbd9ad944ead17d83c487490d6c027e65dfe6f984e192a6959e76693
3
  size 697294462
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb3c2e24267fa356ea44dc14e7953e417fa1d6dd44f526c4daea1bcf6b647b7
3
  size 697294462
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1839c408b2800d1f16254de5db0d477776bbfae78a9c676838bcb325c436cdf1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e3dad5e9640794d19b0f41e34b58f722c69f08c60cfeb247e583e12e03c10e0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4293837762562377,
5
  "eval_steps": 500,
6
- "global_step": 1850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12957,6 +12957,356 @@
12957
  "learning_rate": 0.00019639739459366182,
12958
  "loss": 0.9533,
12959
  "step": 1850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12960
  }
12961
  ],
12962
  "logging_steps": 1,
@@ -12976,7 +13326,7 @@
12976
  "attributes": {}
12977
  }
12978
  },
12979
- "total_flos": 8.211719251820544e+17,
12980
  "train_batch_size": 32,
12981
  "trial_name": null,
12982
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4409887431820819,
5
  "eval_steps": 500,
6
+ "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12957
  "learning_rate": 0.00019639739459366182,
12958
  "loss": 0.9533,
12959
  "step": 1850
12960
+ },
12961
+ {
12962
+ "epoch": 0.42961587559475456,
12963
+ "grad_norm": 0.47029098868370056,
12964
+ "learning_rate": 0.0001963935140270619,
12965
+ "loss": 0.8904,
12966
+ "step": 1851
12967
+ },
12968
+ {
12969
+ "epoch": 0.4298479749332714,
12970
+ "grad_norm": 0.5885578393936157,
12971
+ "learning_rate": 0.00019638963140997906,
12972
+ "loss": 0.8957,
12973
+ "step": 1852
12974
+ },
12975
+ {
12976
+ "epoch": 0.43008007427178835,
12977
+ "grad_norm": 0.5930177569389343,
12978
+ "learning_rate": 0.00019638574674249587,
12979
+ "loss": 1.0084,
12980
+ "step": 1853
12981
+ },
12982
+ {
12983
+ "epoch": 0.4303121736103052,
12984
+ "grad_norm": 0.5558833479881287,
12985
+ "learning_rate": 0.00019638186002469494,
12986
+ "loss": 0.9102,
12987
+ "step": 1854
12988
+ },
12989
+ {
12990
+ "epoch": 0.4305442729488221,
12991
+ "grad_norm": 0.5855537056922913,
12992
+ "learning_rate": 0.000196377971256659,
12993
+ "loss": 0.9468,
12994
+ "step": 1855
12995
+ },
12996
+ {
12997
+ "epoch": 0.430776372287339,
12998
+ "grad_norm": 0.5754596590995789,
12999
+ "learning_rate": 0.00019637408043847074,
13000
+ "loss": 0.9044,
13001
+ "step": 1856
13002
+ },
13003
+ {
13004
+ "epoch": 0.43100847162585587,
13005
+ "grad_norm": 0.584676206111908,
13006
+ "learning_rate": 0.00019637018757021296,
13007
+ "loss": 0.8508,
13008
+ "step": 1857
13009
+ },
13010
+ {
13011
+ "epoch": 0.43124057096437274,
13012
+ "grad_norm": 0.49439355731010437,
13013
+ "learning_rate": 0.0001963662926519684,
13014
+ "loss": 0.8681,
13015
+ "step": 1858
13016
+ },
13017
+ {
13018
+ "epoch": 0.43147267030288966,
13019
+ "grad_norm": 0.5786611437797546,
13020
+ "learning_rate": 0.00019636239568382,
13021
+ "loss": 0.9083,
13022
+ "step": 1859
13023
+ },
13024
+ {
13025
+ "epoch": 0.4317047696414065,
13026
+ "grad_norm": 0.5431936383247375,
13027
+ "learning_rate": 0.00019635849666585058,
13028
+ "loss": 0.9406,
13029
+ "step": 1860
13030
+ },
13031
+ {
13032
+ "epoch": 0.4319368689799234,
13033
+ "grad_norm": 0.6521342992782593,
13034
+ "learning_rate": 0.00019635459559814314,
13035
+ "loss": 0.8659,
13036
+ "step": 1861
13037
+ },
13038
+ {
13039
+ "epoch": 0.4321689683184403,
13040
+ "grad_norm": 0.5077570676803589,
13041
+ "learning_rate": 0.00019635069248078062,
13042
+ "loss": 0.9172,
13043
+ "step": 1862
13044
+ },
13045
+ {
13046
+ "epoch": 0.4324010676569572,
13047
+ "grad_norm": 0.5636994242668152,
13048
+ "learning_rate": 0.00019634678731384608,
13049
+ "loss": 0.9587,
13050
+ "step": 1863
13051
+ },
13052
+ {
13053
+ "epoch": 0.43263316699547405,
13054
+ "grad_norm": 0.48513078689575195,
13055
+ "learning_rate": 0.00019634288009742255,
13056
+ "loss": 0.9519,
13057
+ "step": 1864
13058
+ },
13059
+ {
13060
+ "epoch": 0.432865266333991,
13061
+ "grad_norm": 0.519437849521637,
13062
+ "learning_rate": 0.00019633897083159318,
13063
+ "loss": 0.9289,
13064
+ "step": 1865
13065
+ },
13066
+ {
13067
+ "epoch": 0.43309736567250784,
13068
+ "grad_norm": 0.5995944738388062,
13069
+ "learning_rate": 0.00019633505951644113,
13070
+ "loss": 0.9566,
13071
+ "step": 1866
13072
+ },
13073
+ {
13074
+ "epoch": 0.4333294650110247,
13075
+ "grad_norm": 0.5057395100593567,
13076
+ "learning_rate": 0.00019633114615204958,
13077
+ "loss": 0.9654,
13078
+ "step": 1867
13079
+ },
13080
+ {
13081
+ "epoch": 0.43356156434954163,
13082
+ "grad_norm": 0.5791558623313904,
13083
+ "learning_rate": 0.00019632723073850176,
13084
+ "loss": 0.9469,
13085
+ "step": 1868
13086
+ },
13087
+ {
13088
+ "epoch": 0.4337936636880585,
13089
+ "grad_norm": 0.5840992331504822,
13090
+ "learning_rate": 0.000196323313275881,
13091
+ "loss": 0.918,
13092
+ "step": 1869
13093
+ },
13094
+ {
13095
+ "epoch": 0.43402576302657536,
13096
+ "grad_norm": 0.550893247127533,
13097
+ "learning_rate": 0.00019631939376427062,
13098
+ "loss": 0.8612,
13099
+ "step": 1870
13100
+ },
13101
+ {
13102
+ "epoch": 0.4342578623650923,
13103
+ "grad_norm": 0.537064790725708,
13104
+ "learning_rate": 0.00019631547220375398,
13105
+ "loss": 0.9316,
13106
+ "step": 1871
13107
+ },
13108
+ {
13109
+ "epoch": 0.43448996170360915,
13110
+ "grad_norm": 0.5622636675834656,
13111
+ "learning_rate": 0.00019631154859441454,
13112
+ "loss": 0.8822,
13113
+ "step": 1872
13114
+ },
13115
+ {
13116
+ "epoch": 0.434722061042126,
13117
+ "grad_norm": 0.599727213382721,
13118
+ "learning_rate": 0.0001963076229363357,
13119
+ "loss": 0.956,
13120
+ "step": 1873
13121
+ },
13122
+ {
13123
+ "epoch": 0.43495416038064294,
13124
+ "grad_norm": 0.5084268450737,
13125
+ "learning_rate": 0.00019630369522960104,
13126
+ "loss": 0.8993,
13127
+ "step": 1874
13128
+ },
13129
+ {
13130
+ "epoch": 0.4351862597191598,
13131
+ "grad_norm": 0.547834038734436,
13132
+ "learning_rate": 0.00019629976547429402,
13133
+ "loss": 0.9046,
13134
+ "step": 1875
13135
+ },
13136
+ {
13137
+ "epoch": 0.4354183590576767,
13138
+ "grad_norm": 0.5189753770828247,
13139
+ "learning_rate": 0.0001962958336704983,
13140
+ "loss": 0.8458,
13141
+ "step": 1876
13142
+ },
13143
+ {
13144
+ "epoch": 0.4356504583961936,
13145
+ "grad_norm": 0.501224160194397,
13146
+ "learning_rate": 0.00019629189981829753,
13147
+ "loss": 0.905,
13148
+ "step": 1877
13149
+ },
13150
+ {
13151
+ "epoch": 0.43588255773471046,
13152
+ "grad_norm": 0.5444706082344055,
13153
+ "learning_rate": 0.0001962879639177753,
13154
+ "loss": 0.8975,
13155
+ "step": 1878
13156
+ },
13157
+ {
13158
+ "epoch": 0.43611465707322733,
13159
+ "grad_norm": 0.5328624248504639,
13160
+ "learning_rate": 0.00019628402596901545,
13161
+ "loss": 0.9257,
13162
+ "step": 1879
13163
+ },
13164
+ {
13165
+ "epoch": 0.43634675641174425,
13166
+ "grad_norm": 0.5254698991775513,
13167
+ "learning_rate": 0.00019628008597210168,
13168
+ "loss": 0.8739,
13169
+ "step": 1880
13170
+ },
13171
+ {
13172
+ "epoch": 0.4365788557502611,
13173
+ "grad_norm": 0.5245271921157837,
13174
+ "learning_rate": 0.0001962761439271178,
13175
+ "loss": 0.8952,
13176
+ "step": 1881
13177
+ },
13178
+ {
13179
+ "epoch": 0.436810955088778,
13180
+ "grad_norm": 0.5154178142547607,
13181
+ "learning_rate": 0.00019627219983414768,
13182
+ "loss": 0.9408,
13183
+ "step": 1882
13184
+ },
13185
+ {
13186
+ "epoch": 0.4370430544272949,
13187
+ "grad_norm": 0.5660544037818909,
13188
+ "learning_rate": 0.00019626825369327525,
13189
+ "loss": 0.8846,
13190
+ "step": 1883
13191
+ },
13192
+ {
13193
+ "epoch": 0.4372751537658118,
13194
+ "grad_norm": 0.5544506907463074,
13195
+ "learning_rate": 0.0001962643055045844,
13196
+ "loss": 0.9322,
13197
+ "step": 1884
13198
+ },
13199
+ {
13200
+ "epoch": 0.43750725310432864,
13201
+ "grad_norm": 0.49590614438056946,
13202
+ "learning_rate": 0.00019626035526815912,
13203
+ "loss": 0.9737,
13204
+ "step": 1885
13205
+ },
13206
+ {
13207
+ "epoch": 0.43773935244284556,
13208
+ "grad_norm": 0.5184259414672852,
13209
+ "learning_rate": 0.0001962564029840835,
13210
+ "loss": 0.9169,
13211
+ "step": 1886
13212
+ },
13213
+ {
13214
+ "epoch": 0.43797145178136243,
13215
+ "grad_norm": 0.5171828866004944,
13216
+ "learning_rate": 0.00019625244865244156,
13217
+ "loss": 0.8724,
13218
+ "step": 1887
13219
+ },
13220
+ {
13221
+ "epoch": 0.4382035511198793,
13222
+ "grad_norm": 0.606625497341156,
13223
+ "learning_rate": 0.0001962484922733174,
13224
+ "loss": 0.8666,
13225
+ "step": 1888
13226
+ },
13227
+ {
13228
+ "epoch": 0.4384356504583962,
13229
+ "grad_norm": 0.5377411842346191,
13230
+ "learning_rate": 0.0001962445338467952,
13231
+ "loss": 0.9142,
13232
+ "step": 1889
13233
+ },
13234
+ {
13235
+ "epoch": 0.4386677497969131,
13236
+ "grad_norm": 0.5942894220352173,
13237
+ "learning_rate": 0.00019624057337295922,
13238
+ "loss": 0.957,
13239
+ "step": 1890
13240
+ },
13241
+ {
13242
+ "epoch": 0.43889984913542995,
13243
+ "grad_norm": 0.5858636498451233,
13244
+ "learning_rate": 0.00019623661085189364,
13245
+ "loss": 0.9022,
13246
+ "step": 1891
13247
+ },
13248
+ {
13249
+ "epoch": 0.4391319484739469,
13250
+ "grad_norm": 0.5353084206581116,
13251
+ "learning_rate": 0.00019623264628368275,
13252
+ "loss": 0.8723,
13253
+ "step": 1892
13254
+ },
13255
+ {
13256
+ "epoch": 0.43936404781246374,
13257
+ "grad_norm": 0.5895339846611023,
13258
+ "learning_rate": 0.0001962286796684109,
13259
+ "loss": 0.9509,
13260
+ "step": 1893
13261
+ },
13262
+ {
13263
+ "epoch": 0.4395961471509806,
13264
+ "grad_norm": 0.5124474763870239,
13265
+ "learning_rate": 0.0001962247110061625,
13266
+ "loss": 0.9523,
13267
+ "step": 1894
13268
+ },
13269
+ {
13270
+ "epoch": 0.43982824648949753,
13271
+ "grad_norm": 0.53212571144104,
13272
+ "learning_rate": 0.00019622074029702194,
13273
+ "loss": 0.8931,
13274
+ "step": 1895
13275
+ },
13276
+ {
13277
+ "epoch": 0.4400603458280144,
13278
+ "grad_norm": 0.4760664999485016,
13279
+ "learning_rate": 0.00019621676754107367,
13280
+ "loss": 0.9609,
13281
+ "step": 1896
13282
+ },
13283
+ {
13284
+ "epoch": 0.44029244516653127,
13285
+ "grad_norm": 0.4855426549911499,
13286
+ "learning_rate": 0.0001962127927384022,
13287
+ "loss": 0.9561,
13288
+ "step": 1897
13289
+ },
13290
+ {
13291
+ "epoch": 0.4405245445050482,
13292
+ "grad_norm": 0.6112794876098633,
13293
+ "learning_rate": 0.00019620881588909212,
13294
+ "loss": 0.9166,
13295
+ "step": 1898
13296
+ },
13297
+ {
13298
+ "epoch": 0.44075664384356505,
13299
+ "grad_norm": 0.5399686098098755,
13300
+ "learning_rate": 0.00019620483699322802,
13301
+ "loss": 0.8998,
13302
+ "step": 1899
13303
+ },
13304
+ {
13305
+ "epoch": 0.4409887431820819,
13306
+ "grad_norm": 0.5019717216491699,
13307
+ "learning_rate": 0.00019620085605089448,
13308
+ "loss": 0.8652,
13309
+ "step": 1900
13310
  }
13311
  ],
13312
  "logging_steps": 1,
 
13326
  "attributes": {}
13327
  }
13328
  },
13329
+ "total_flos": 8.433657609977856e+17,
13330
  "train_batch_size": 32,
13331
  "trial_name": null,
13332
  "trial_params": null