alicegoesdown committed (verified)
Commit 8cacbab · Parent(s): bd40830

Training in progress, step 150, checkpoint
last-checkpoint/lora_lower/adapter_config.json CHANGED
@@ -16,14 +16,14 @@
16
  "transformer.h.11.mlp.dense_h_to_4h": 8,
17
  "transformer.h.11.self_attention.dense": 8,
18
  "transformer.h.11.self_attention.query_key_value": 8,
19
- "transformer.h.12.mlp.dense_4h_to_h": 8,
20
- "transformer.h.12.mlp.dense_h_to_4h": 8,
21
- "transformer.h.12.self_attention.dense": 8,
22
- "transformer.h.12.self_attention.query_key_value": 8,
23
- "transformer.h.13.mlp.dense_4h_to_h": 8,
24
- "transformer.h.13.mlp.dense_h_to_4h": 8,
25
- "transformer.h.13.self_attention.dense": 8,
26
- "transformer.h.13.self_attention.query_key_value": 8,
27
  "transformer.h.14.mlp.dense_4h_to_h": 16,
28
  "transformer.h.14.mlp.dense_h_to_4h": 16,
29
  "transformer.h.14.self_attention.dense": 16,
@@ -44,30 +44,30 @@
44
  "transformer.h.18.mlp.dense_h_to_4h": 16,
45
  "transformer.h.18.self_attention.dense": 16,
46
  "transformer.h.18.self_attention.query_key_value": 16,
47
- "transformer.h.19.mlp.dense_4h_to_h": 16,
48
- "transformer.h.19.mlp.dense_h_to_4h": 16,
49
- "transformer.h.19.self_attention.dense": 16,
50
- "transformer.h.19.self_attention.query_key_value": 16,
51
  "transformer.h.2.mlp.dense_4h_to_h": 8,
52
  "transformer.h.2.mlp.dense_h_to_4h": 8,
53
  "transformer.h.2.self_attention.dense": 8,
54
  "transformer.h.2.self_attention.query_key_value": 8,
55
- "transformer.h.20.mlp.dense_4h_to_h": 16,
56
- "transformer.h.20.mlp.dense_h_to_4h": 16,
57
- "transformer.h.20.self_attention.dense": 16,
58
- "transformer.h.20.self_attention.query_key_value": 16,
59
- "transformer.h.21.mlp.dense_4h_to_h": 32,
60
- "transformer.h.21.mlp.dense_h_to_4h": 32,
61
- "transformer.h.21.self_attention.dense": 32,
62
- "transformer.h.21.self_attention.query_key_value": 32,
63
- "transformer.h.22.mlp.dense_4h_to_h": 32,
64
- "transformer.h.22.mlp.dense_h_to_4h": 32,
65
- "transformer.h.22.self_attention.dense": 32,
66
- "transformer.h.22.self_attention.query_key_value": 32,
67
- "transformer.h.23.mlp.dense_4h_to_h": 32,
68
- "transformer.h.23.mlp.dense_h_to_4h": 32,
69
- "transformer.h.23.self_attention.dense": 32,
70
- "transformer.h.23.self_attention.query_key_value": 32,
71
  "transformer.h.3.mlp.dense_4h_to_h": 8,
72
  "transformer.h.3.mlp.dense_h_to_4h": 8,
73
  "transformer.h.3.self_attention.dense": 8,
@@ -111,9 +111,7 @@
111
  "lora_dropout": 0.0,
112
  "megatron_config": null,
113
  "megatron_core": "megatron.core",
114
- "modules_to_save": [
115
- "lm_head"
116
- ],
117
  "peft_type": "LORA",
118
  "r": 4,
119
  "rank_pattern": {
@@ -133,14 +131,14 @@
133
  "transformer.h.11.mlp.dense_h_to_4h": 4,
134
  "transformer.h.11.self_attention.dense": 4,
135
  "transformer.h.11.self_attention.query_key_value": 4,
136
- "transformer.h.12.mlp.dense_4h_to_h": 4,
137
- "transformer.h.12.mlp.dense_h_to_4h": 4,
138
- "transformer.h.12.self_attention.dense": 4,
139
- "transformer.h.12.self_attention.query_key_value": 4,
140
- "transformer.h.13.mlp.dense_4h_to_h": 4,
141
- "transformer.h.13.mlp.dense_h_to_4h": 4,
142
- "transformer.h.13.self_attention.dense": 4,
143
- "transformer.h.13.self_attention.query_key_value": 4,
144
  "transformer.h.14.mlp.dense_4h_to_h": 8,
145
  "transformer.h.14.mlp.dense_h_to_4h": 8,
146
  "transformer.h.14.self_attention.dense": 8,
@@ -161,30 +159,30 @@
161
  "transformer.h.18.mlp.dense_h_to_4h": 8,
162
  "transformer.h.18.self_attention.dense": 8,
163
  "transformer.h.18.self_attention.query_key_value": 8,
164
- "transformer.h.19.mlp.dense_4h_to_h": 8,
165
- "transformer.h.19.mlp.dense_h_to_4h": 8,
166
- "transformer.h.19.self_attention.dense": 8,
167
- "transformer.h.19.self_attention.query_key_value": 8,
168
  "transformer.h.2.mlp.dense_4h_to_h": 4,
169
  "transformer.h.2.mlp.dense_h_to_4h": 4,
170
  "transformer.h.2.self_attention.dense": 4,
171
  "transformer.h.2.self_attention.query_key_value": 4,
172
- "transformer.h.20.mlp.dense_4h_to_h": 8,
173
- "transformer.h.20.mlp.dense_h_to_4h": 8,
174
- "transformer.h.20.self_attention.dense": 8,
175
- "transformer.h.20.self_attention.query_key_value": 8,
176
- "transformer.h.21.mlp.dense_4h_to_h": 16,
177
- "transformer.h.21.mlp.dense_h_to_4h": 16,
178
- "transformer.h.21.self_attention.dense": 16,
179
- "transformer.h.21.self_attention.query_key_value": 16,
180
- "transformer.h.22.mlp.dense_4h_to_h": 16,
181
- "transformer.h.22.mlp.dense_h_to_4h": 16,
182
- "transformer.h.22.self_attention.dense": 16,
183
- "transformer.h.22.self_attention.query_key_value": 16,
184
- "transformer.h.23.mlp.dense_4h_to_h": 16,
185
- "transformer.h.23.mlp.dense_h_to_4h": 16,
186
- "transformer.h.23.self_attention.dense": 16,
187
- "transformer.h.23.self_attention.query_key_value": 16,
188
  "transformer.h.3.mlp.dense_4h_to_h": 4,
189
  "transformer.h.3.mlp.dense_h_to_4h": 4,
190
  "transformer.h.3.self_attention.dense": 4,
@@ -216,62 +214,54 @@
216
  },
217
  "revision": null,
218
  "target_modules": [
219
- "transformer.h.8.mlp.dense_4h_to_h",
220
- "transformer.h.6.self_attention.query_key_value",
221
- "transformer.h.4.mlp.dense_h_to_4h",
222
- "transformer.h.11.mlp.dense_4h_to_h",
223
- "transformer.h.5.self_attention.query_key_value",
224
- "transformer.h.10.mlp.dense_h_to_4h",
225
- "transformer.h.4.self_attention.query_key_value",
226
  "transformer.h.4.mlp.dense_4h_to_h",
227
- "transformer.h.1.mlp.dense_4h_to_h",
228
- "transformer.h.13.mlp.dense_4h_to_h",
229
- "transformer.h.6.self_attention.dense",
230
- "transformer.h.13.mlp.dense_h_to_4h",
231
- "transformer.h.3.mlp.dense_h_to_4h",
232
- "transformer.h.2.mlp.dense_4h_to_h",
233
  "transformer.h.9.mlp.dense_4h_to_h",
234
- "transformer.h.7.self_attention.dense",
 
235
  "transformer.h.8.self_attention.query_key_value",
236
- "transformer.h.6.mlp.dense_h_to_4h",
237
  "transformer.h.11.self_attention.dense",
238
  "transformer.h.10.mlp.dense_4h_to_h",
239
- "transformer.h.12.self_attention.query_key_value",
240
- "transformer.h.12.mlp.dense_h_to_4h",
241
- "transformer.h.12.mlp.dense_4h_to_h",
242
- "transformer.h.0.mlp.dense_h_to_4h",
243
- "transformer.h.11.mlp.dense_h_to_4h",
244
  "transformer.h.7.mlp.dense_h_to_4h",
245
- "transformer.h.1.mlp.dense_h_to_4h",
246
- "transformer.h.13.self_attention.query_key_value",
247
- "transformer.h.9.self_attention.dense",
248
- "transformer.h.5.mlp.dense_h_to_4h",
249
- "transformer.h.10.self_attention.dense",
250
- "transformer.h.0.mlp.dense_4h_to_h",
251
- "transformer.h.11.self_attention.query_key_value",
252
  "transformer.h.2.mlp.dense_h_to_4h",
253
  "transformer.h.10.self_attention.query_key_value",
254
  "transformer.h.9.self_attention.query_key_value",
255
- "transformer.h.8.mlp.dense_h_to_4h",
256
  "transformer.h.0.self_attention.query_key_value",
257
- "transformer.h.0.self_attention.dense",
258
- "transformer.h.4.self_attention.dense",
259
- "transformer.h.13.self_attention.dense",
260
- "transformer.h.2.self_attention.query_key_value",
261
- "transformer.h.3.self_attention.query_key_value",
262
- "transformer.h.7.self_attention.query_key_value",
263
  "transformer.h.5.mlp.dense_4h_to_h",
264
- "transformer.h.8.self_attention.dense",
265
- "transformer.h.9.mlp.dense_h_to_4h",
266
  "transformer.h.6.mlp.dense_4h_to_h",
267
- "transformer.h.7.mlp.dense_4h_to_h",
268
- "transformer.h.3.mlp.dense_4h_to_h",
269
- "transformer.h.3.self_attention.dense",
270
- "transformer.h.1.self_attention.query_key_value",
271
- "transformer.h.1.self_attention.dense",
272
- "transformer.h.5.self_attention.dense",
273
- "transformer.h.12.self_attention.dense",
274
- "transformer.h.2.self_attention.dense"
275
  ],
276
  "task_type": "CAUSAL_LM",
277
  "use_dora": false,
 
16
  "transformer.h.11.mlp.dense_h_to_4h": 8,
17
  "transformer.h.11.self_attention.dense": 8,
18
  "transformer.h.11.self_attention.query_key_value": 8,
19
+ "transformer.h.12.mlp.dense_4h_to_h": 16,
20
+ "transformer.h.12.mlp.dense_h_to_4h": 16,
21
+ "transformer.h.12.self_attention.dense": 16,
22
+ "transformer.h.12.self_attention.query_key_value": 16,
23
+ "transformer.h.13.mlp.dense_4h_to_h": 16,
24
+ "transformer.h.13.mlp.dense_h_to_4h": 16,
25
+ "transformer.h.13.self_attention.dense": 16,
26
+ "transformer.h.13.self_attention.query_key_value": 16,
27
  "transformer.h.14.mlp.dense_4h_to_h": 16,
28
  "transformer.h.14.mlp.dense_h_to_4h": 16,
29
  "transformer.h.14.self_attention.dense": 16,
 
44
  "transformer.h.18.mlp.dense_h_to_4h": 16,
45
  "transformer.h.18.self_attention.dense": 16,
46
  "transformer.h.18.self_attention.query_key_value": 16,
47
+ "transformer.h.19.mlp.dense_4h_to_h": 128,
48
+ "transformer.h.19.mlp.dense_h_to_4h": 128,
49
+ "transformer.h.19.self_attention.dense": 128,
50
+ "transformer.h.19.self_attention.query_key_value": 128,
51
  "transformer.h.2.mlp.dense_4h_to_h": 8,
52
  "transformer.h.2.mlp.dense_h_to_4h": 8,
53
  "transformer.h.2.self_attention.dense": 8,
54
  "transformer.h.2.self_attention.query_key_value": 8,
55
+ "transformer.h.20.mlp.dense_4h_to_h": 128,
56
+ "transformer.h.20.mlp.dense_h_to_4h": 128,
57
+ "transformer.h.20.self_attention.dense": 128,
58
+ "transformer.h.20.self_attention.query_key_value": 128,
59
+ "transformer.h.21.mlp.dense_4h_to_h": 128,
60
+ "transformer.h.21.mlp.dense_h_to_4h": 128,
61
+ "transformer.h.21.self_attention.dense": 128,
62
+ "transformer.h.21.self_attention.query_key_value": 128,
63
+ "transformer.h.22.mlp.dense_4h_to_h": 128,
64
+ "transformer.h.22.mlp.dense_h_to_4h": 128,
65
+ "transformer.h.22.self_attention.dense": 128,
66
+ "transformer.h.22.self_attention.query_key_value": 128,
67
+ "transformer.h.23.mlp.dense_4h_to_h": 128,
68
+ "transformer.h.23.mlp.dense_h_to_4h": 128,
69
+ "transformer.h.23.self_attention.dense": 128,
70
+ "transformer.h.23.self_attention.query_key_value": 128,
71
  "transformer.h.3.mlp.dense_4h_to_h": 8,
72
  "transformer.h.3.mlp.dense_h_to_4h": 8,
73
  "transformer.h.3.self_attention.dense": 8,
 
111
  "lora_dropout": 0.0,
112
  "megatron_config": null,
113
  "megatron_core": "megatron.core",
114
+ "modules_to_save": null,
115
  "peft_type": "LORA",
116
  "r": 4,
117
  "rank_pattern": {
 
131
  "transformer.h.11.mlp.dense_h_to_4h": 4,
132
  "transformer.h.11.self_attention.dense": 4,
133
  "transformer.h.11.self_attention.query_key_value": 4,
134
+ "transformer.h.12.mlp.dense_4h_to_h": 8,
135
+ "transformer.h.12.mlp.dense_h_to_4h": 8,
136
+ "transformer.h.12.self_attention.dense": 8,
137
+ "transformer.h.12.self_attention.query_key_value": 8,
138
+ "transformer.h.13.mlp.dense_4h_to_h": 8,
139
+ "transformer.h.13.mlp.dense_h_to_4h": 8,
140
+ "transformer.h.13.self_attention.dense": 8,
141
+ "transformer.h.13.self_attention.query_key_value": 8,
142
  "transformer.h.14.mlp.dense_4h_to_h": 8,
143
  "transformer.h.14.mlp.dense_h_to_4h": 8,
144
  "transformer.h.14.self_attention.dense": 8,
 
159
  "transformer.h.18.mlp.dense_h_to_4h": 8,
160
  "transformer.h.18.self_attention.dense": 8,
161
  "transformer.h.18.self_attention.query_key_value": 8,
162
+ "transformer.h.19.mlp.dense_4h_to_h": 64,
163
+ "transformer.h.19.mlp.dense_h_to_4h": 64,
164
+ "transformer.h.19.self_attention.dense": 64,
165
+ "transformer.h.19.self_attention.query_key_value": 64,
166
  "transformer.h.2.mlp.dense_4h_to_h": 4,
167
  "transformer.h.2.mlp.dense_h_to_4h": 4,
168
  "transformer.h.2.self_attention.dense": 4,
169
  "transformer.h.2.self_attention.query_key_value": 4,
170
+ "transformer.h.20.mlp.dense_4h_to_h": 64,
171
+ "transformer.h.20.mlp.dense_h_to_4h": 64,
172
+ "transformer.h.20.self_attention.dense": 64,
173
+ "transformer.h.20.self_attention.query_key_value": 64,
174
+ "transformer.h.21.mlp.dense_4h_to_h": 64,
175
+ "transformer.h.21.mlp.dense_h_to_4h": 64,
176
+ "transformer.h.21.self_attention.dense": 64,
177
+ "transformer.h.21.self_attention.query_key_value": 64,
178
+ "transformer.h.22.mlp.dense_4h_to_h": 64,
179
+ "transformer.h.22.mlp.dense_h_to_4h": 64,
180
+ "transformer.h.22.self_attention.dense": 64,
181
+ "transformer.h.22.self_attention.query_key_value": 64,
182
+ "transformer.h.23.mlp.dense_4h_to_h": 64,
183
+ "transformer.h.23.mlp.dense_h_to_4h": 64,
184
+ "transformer.h.23.self_attention.dense": 64,
185
+ "transformer.h.23.self_attention.query_key_value": 64,
186
  "transformer.h.3.mlp.dense_4h_to_h": 4,
187
  "transformer.h.3.mlp.dense_h_to_4h": 4,
188
  "transformer.h.3.self_attention.dense": 4,
 
214
  },
215
  "revision": null,
216
  "target_modules": [
217
+ "transformer.h.0.mlp.dense_h_to_4h",
218
+ "transformer.h.8.mlp.dense_h_to_4h",
219
  "transformer.h.4.mlp.dense_4h_to_h",
220
+ "transformer.h.2.self_attention.dense",
221
  "transformer.h.9.mlp.dense_4h_to_h",
222
+ "transformer.h.11.mlp.dense_4h_to_h",
223
+ "transformer.h.9.mlp.dense_h_to_4h",
224
  "transformer.h.8.self_attention.query_key_value",
225
+ "transformer.h.6.self_attention.dense",
226
+ "transformer.h.3.self_attention.query_key_value",
227
+ "transformer.h.3.self_attention.dense",
228
  "transformer.h.11.self_attention.dense",
229
+ "transformer.h.1.self_attention.query_key_value",
230
+ "transformer.h.5.mlp.dense_h_to_4h",
231
+ "transformer.h.2.self_attention.query_key_value",
232
+ "transformer.h.10.mlp.dense_h_to_4h",
233
+ "transformer.h.3.mlp.dense_4h_to_h",
234
+ "transformer.h.4.self_attention.query_key_value",
235
  "transformer.h.10.mlp.dense_4h_to_h",
236
+ "transformer.h.4.mlp.dense_h_to_4h",
237
+ "transformer.h.7.self_attention.dense",
238
+ "transformer.h.0.self_attention.dense",
239
  "transformer.h.7.mlp.dense_h_to_4h",
240
  "transformer.h.2.mlp.dense_h_to_4h",
241
+ "transformer.h.8.self_attention.dense",
242
+ "transformer.h.11.mlp.dense_h_to_4h",
243
+ "transformer.h.7.mlp.dense_4h_to_h",
244
+ "transformer.h.7.self_attention.query_key_value",
245
+ "transformer.h.0.mlp.dense_4h_to_h",
246
  "transformer.h.10.self_attention.query_key_value",
247
+ "transformer.h.11.self_attention.query_key_value",
248
+ "transformer.h.1.mlp.dense_4h_to_h",
249
+ "transformer.h.1.mlp.dense_h_to_4h",
250
+ "transformer.h.5.self_attention.dense",
251
+ "transformer.h.10.self_attention.dense",
252
+ "transformer.h.6.self_attention.query_key_value",
253
+ "transformer.h.6.mlp.dense_h_to_4h",
254
+ "transformer.h.5.self_attention.query_key_value",
255
+ "transformer.h.4.self_attention.dense",
256
+ "transformer.h.8.mlp.dense_4h_to_h",
257
+ "transformer.h.3.mlp.dense_h_to_4h",
258
+ "transformer.h.1.self_attention.dense",
259
  "transformer.h.9.self_attention.query_key_value",
260
+ "transformer.h.9.self_attention.dense",
261
  "transformer.h.0.self_attention.query_key_value",
262
  "transformer.h.5.mlp.dense_4h_to_h",
263
  "transformer.h.6.mlp.dense_4h_to_h",
264
+ "transformer.h.2.mlp.dense_4h_to_h"
265
  ],
266
  "task_type": "CAUSAL_LM",
267
  "use_dora": false,
last-checkpoint/lora_lower/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:462b6bff6479bc4430adab26dfb4c275946fb47f94c48e49edfad477094d0422
3
- size 2058899176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b632073286f4624203e332784f0e1c31b8f4a8d74fbc2833acf0b3b3947b1ea
3
+ size 3162128
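
The three adapter_config.json diffs in this commit adjust per-layer LoRA settings: the numeric block that sits before "lora_dropout" (by key order presumably alpha_pattern) and the "rank_pattern" block gain entries for layers 12–13 and 19–23, while "modules_to_save": ["lm_head"] is dropped. Below is a minimal sketch of how such a per-module configuration might be expressed with peft's LoraConfig; this is an illustrative assumption, not the training script's actual code, with values copied from the new lora_lower hunks and the module list abbreviated.

```python
# Minimal sketch (assumption: standard peft LoraConfig fields, not the actual
# training code). Values are copied from the new lora_lower/adapter_config.json
# hunks above; the target_modules list is abbreviated.
from peft import LoraConfig

lora_lower_cfg = LoraConfig(
    r=4,                     # base rank ("r": 4 in the config)
    lora_dropout=0.0,
    task_type="CAUSAL_LM",
    target_modules=[
        "transformer.h.0.mlp.dense_h_to_4h",
        "transformer.h.8.mlp.dense_h_to_4h",
        "transformer.h.4.mlp.dense_4h_to_h",
        # ... remaining modules as listed under "target_modules" above
    ],
    # Per-module rank overrides, e.g. layers 12-13 now use rank 8 instead of 4.
    rank_pattern={
        "transformer.h.12.mlp.dense_4h_to_h": 8,
        "transformer.h.13.self_attention.query_key_value": 8,
    },
    # Per-module scaling overrides; assumed to correspond to the first numeric
    # block in the JSON (keys sorted before "lora_dropout"), e.g. 16 for layer 12.
    alpha_pattern={
        "transformer.h.12.mlp.dense_4h_to_h": 16,
        "transformer.h.13.self_attention.query_key_value": 16,
    },
)
```
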
last-checkpoint/lora_middle/adapter_config.json CHANGED
@@ -16,14 +16,14 @@
16
  "transformer.h.11.mlp.dense_h_to_4h": 8,
17
  "transformer.h.11.self_attention.dense": 8,
18
  "transformer.h.11.self_attention.query_key_value": 8,
19
- "transformer.h.12.mlp.dense_4h_to_h": 8,
20
- "transformer.h.12.mlp.dense_h_to_4h": 8,
21
- "transformer.h.12.self_attention.dense": 8,
22
- "transformer.h.12.self_attention.query_key_value": 8,
23
- "transformer.h.13.mlp.dense_4h_to_h": 8,
24
- "transformer.h.13.mlp.dense_h_to_4h": 8,
25
- "transformer.h.13.self_attention.dense": 8,
26
- "transformer.h.13.self_attention.query_key_value": 8,
27
  "transformer.h.14.mlp.dense_4h_to_h": 16,
28
  "transformer.h.14.mlp.dense_h_to_4h": 16,
29
  "transformer.h.14.self_attention.dense": 16,
@@ -44,30 +44,30 @@
44
  "transformer.h.18.mlp.dense_h_to_4h": 16,
45
  "transformer.h.18.self_attention.dense": 16,
46
  "transformer.h.18.self_attention.query_key_value": 16,
47
- "transformer.h.19.mlp.dense_4h_to_h": 16,
48
- "transformer.h.19.mlp.dense_h_to_4h": 16,
49
- "transformer.h.19.self_attention.dense": 16,
50
- "transformer.h.19.self_attention.query_key_value": 16,
51
  "transformer.h.2.mlp.dense_4h_to_h": 8,
52
  "transformer.h.2.mlp.dense_h_to_4h": 8,
53
  "transformer.h.2.self_attention.dense": 8,
54
  "transformer.h.2.self_attention.query_key_value": 8,
55
- "transformer.h.20.mlp.dense_4h_to_h": 16,
56
- "transformer.h.20.mlp.dense_h_to_4h": 16,
57
- "transformer.h.20.self_attention.dense": 16,
58
- "transformer.h.20.self_attention.query_key_value": 16,
59
- "transformer.h.21.mlp.dense_4h_to_h": 32,
60
- "transformer.h.21.mlp.dense_h_to_4h": 32,
61
- "transformer.h.21.self_attention.dense": 32,
62
- "transformer.h.21.self_attention.query_key_value": 32,
63
- "transformer.h.22.mlp.dense_4h_to_h": 32,
64
- "transformer.h.22.mlp.dense_h_to_4h": 32,
65
- "transformer.h.22.self_attention.dense": 32,
66
- "transformer.h.22.self_attention.query_key_value": 32,
67
- "transformer.h.23.mlp.dense_4h_to_h": 32,
68
- "transformer.h.23.mlp.dense_h_to_4h": 32,
69
- "transformer.h.23.self_attention.dense": 32,
70
- "transformer.h.23.self_attention.query_key_value": 32,
71
  "transformer.h.3.mlp.dense_4h_to_h": 8,
72
  "transformer.h.3.mlp.dense_h_to_4h": 8,
73
  "transformer.h.3.self_attention.dense": 8,
@@ -111,9 +111,7 @@
111
  "lora_dropout": 0.1,
112
  "megatron_config": null,
113
  "megatron_core": "megatron.core",
114
- "modules_to_save": [
115
- "lm_head"
116
- ],
117
  "peft_type": "LORA",
118
  "r": 8,
119
  "rank_pattern": {
@@ -133,14 +131,14 @@
133
  "transformer.h.11.mlp.dense_h_to_4h": 4,
134
  "transformer.h.11.self_attention.dense": 4,
135
  "transformer.h.11.self_attention.query_key_value": 4,
136
- "transformer.h.12.mlp.dense_4h_to_h": 4,
137
- "transformer.h.12.mlp.dense_h_to_4h": 4,
138
- "transformer.h.12.self_attention.dense": 4,
139
- "transformer.h.12.self_attention.query_key_value": 4,
140
- "transformer.h.13.mlp.dense_4h_to_h": 4,
141
- "transformer.h.13.mlp.dense_h_to_4h": 4,
142
- "transformer.h.13.self_attention.dense": 4,
143
- "transformer.h.13.self_attention.query_key_value": 4,
144
  "transformer.h.14.mlp.dense_4h_to_h": 8,
145
  "transformer.h.14.mlp.dense_h_to_4h": 8,
146
  "transformer.h.14.self_attention.dense": 8,
@@ -161,30 +159,30 @@
161
  "transformer.h.18.mlp.dense_h_to_4h": 8,
162
  "transformer.h.18.self_attention.dense": 8,
163
  "transformer.h.18.self_attention.query_key_value": 8,
164
- "transformer.h.19.mlp.dense_4h_to_h": 8,
165
- "transformer.h.19.mlp.dense_h_to_4h": 8,
166
- "transformer.h.19.self_attention.dense": 8,
167
- "transformer.h.19.self_attention.query_key_value": 8,
168
  "transformer.h.2.mlp.dense_4h_to_h": 4,
169
  "transformer.h.2.mlp.dense_h_to_4h": 4,
170
  "transformer.h.2.self_attention.dense": 4,
171
  "transformer.h.2.self_attention.query_key_value": 4,
172
- "transformer.h.20.mlp.dense_4h_to_h": 8,
173
- "transformer.h.20.mlp.dense_h_to_4h": 8,
174
- "transformer.h.20.self_attention.dense": 8,
175
- "transformer.h.20.self_attention.query_key_value": 8,
176
- "transformer.h.21.mlp.dense_4h_to_h": 16,
177
- "transformer.h.21.mlp.dense_h_to_4h": 16,
178
- "transformer.h.21.self_attention.dense": 16,
179
- "transformer.h.21.self_attention.query_key_value": 16,
180
- "transformer.h.22.mlp.dense_4h_to_h": 16,
181
- "transformer.h.22.mlp.dense_h_to_4h": 16,
182
- "transformer.h.22.self_attention.dense": 16,
183
- "transformer.h.22.self_attention.query_key_value": 16,
184
- "transformer.h.23.mlp.dense_4h_to_h": 16,
185
- "transformer.h.23.mlp.dense_h_to_4h": 16,
186
- "transformer.h.23.self_attention.dense": 16,
187
- "transformer.h.23.self_attention.query_key_value": 16,
188
  "transformer.h.3.mlp.dense_4h_to_h": 4,
189
  "transformer.h.3.mlp.dense_h_to_4h": 4,
190
  "transformer.h.3.self_attention.dense": 4,
@@ -216,34 +214,34 @@
216
  },
217
  "revision": null,
218
  "target_modules": [
219
- "transformer.h.20.mlp.dense_h_to_4h",
220
- "transformer.h.17.mlp.dense_h_to_4h",
221
- "transformer.h.14.self_attention.dense",
222
- "transformer.h.20.self_attention.query_key_value",
223
  "transformer.h.14.mlp.dense_4h_to_h",
224
  "transformer.h.16.mlp.dense_h_to_4h",
225
- "transformer.h.19.self_attention.dense",
226
- "transformer.h.20.mlp.dense_4h_to_h",
227
- "transformer.h.15.mlp.dense_h_to_4h",
228
- "transformer.h.15.self_attention.query_key_value",
229
- "transformer.h.18.mlp.dense_4h_to_h",
230
- "transformer.h.16.mlp.dense_4h_to_h",
231
- "transformer.h.19.mlp.dense_h_to_4h",
232
- "transformer.h.19.self_attention.query_key_value",
233
- "transformer.h.15.self_attention.dense",
234
- "transformer.h.18.self_attention.dense",
235
  "transformer.h.17.mlp.dense_4h_to_h",
236
- "transformer.h.16.self_attention.query_key_value",
237
- "transformer.h.19.mlp.dense_4h_to_h",
238
- "transformer.h.20.self_attention.dense",
239
- "transformer.h.18.self_attention.query_key_value",
240
- "transformer.h.15.mlp.dense_4h_to_h",
241
  "transformer.h.14.mlp.dense_h_to_4h",
242
- "transformer.h.17.self_attention.dense",
243
  "transformer.h.16.self_attention.dense",
244
  "transformer.h.18.mlp.dense_h_to_4h",
245
- "transformer.h.17.self_attention.query_key_value",
246
- "transformer.h.14.self_attention.query_key_value"
247
  ],
248
  "task_type": "CAUSAL_LM",
249
  "use_dora": false,
 
16
  "transformer.h.11.mlp.dense_h_to_4h": 8,
17
  "transformer.h.11.self_attention.dense": 8,
18
  "transformer.h.11.self_attention.query_key_value": 8,
19
+ "transformer.h.12.mlp.dense_4h_to_h": 16,
20
+ "transformer.h.12.mlp.dense_h_to_4h": 16,
21
+ "transformer.h.12.self_attention.dense": 16,
22
+ "transformer.h.12.self_attention.query_key_value": 16,
23
+ "transformer.h.13.mlp.dense_4h_to_h": 16,
24
+ "transformer.h.13.mlp.dense_h_to_4h": 16,
25
+ "transformer.h.13.self_attention.dense": 16,
26
+ "transformer.h.13.self_attention.query_key_value": 16,
27
  "transformer.h.14.mlp.dense_4h_to_h": 16,
28
  "transformer.h.14.mlp.dense_h_to_4h": 16,
29
  "transformer.h.14.self_attention.dense": 16,
 
44
  "transformer.h.18.mlp.dense_h_to_4h": 16,
45
  "transformer.h.18.self_attention.dense": 16,
46
  "transformer.h.18.self_attention.query_key_value": 16,
47
+ "transformer.h.19.mlp.dense_4h_to_h": 128,
48
+ "transformer.h.19.mlp.dense_h_to_4h": 128,
49
+ "transformer.h.19.self_attention.dense": 128,
50
+ "transformer.h.19.self_attention.query_key_value": 128,
51
  "transformer.h.2.mlp.dense_4h_to_h": 8,
52
  "transformer.h.2.mlp.dense_h_to_4h": 8,
53
  "transformer.h.2.self_attention.dense": 8,
54
  "transformer.h.2.self_attention.query_key_value": 8,
55
+ "transformer.h.20.mlp.dense_4h_to_h": 128,
56
+ "transformer.h.20.mlp.dense_h_to_4h": 128,
57
+ "transformer.h.20.self_attention.dense": 128,
58
+ "transformer.h.20.self_attention.query_key_value": 128,
59
+ "transformer.h.21.mlp.dense_4h_to_h": 128,
60
+ "transformer.h.21.mlp.dense_h_to_4h": 128,
61
+ "transformer.h.21.self_attention.dense": 128,
62
+ "transformer.h.21.self_attention.query_key_value": 128,
63
+ "transformer.h.22.mlp.dense_4h_to_h": 128,
64
+ "transformer.h.22.mlp.dense_h_to_4h": 128,
65
+ "transformer.h.22.self_attention.dense": 128,
66
+ "transformer.h.22.self_attention.query_key_value": 128,
67
+ "transformer.h.23.mlp.dense_4h_to_h": 128,
68
+ "transformer.h.23.mlp.dense_h_to_4h": 128,
69
+ "transformer.h.23.self_attention.dense": 128,
70
+ "transformer.h.23.self_attention.query_key_value": 128,
71
  "transformer.h.3.mlp.dense_4h_to_h": 8,
72
  "transformer.h.3.mlp.dense_h_to_4h": 8,
73
  "transformer.h.3.self_attention.dense": 8,
 
111
  "lora_dropout": 0.1,
112
  "megatron_config": null,
113
  "megatron_core": "megatron.core",
114
+ "modules_to_save": null,
115
  "peft_type": "LORA",
116
  "r": 8,
117
  "rank_pattern": {
 
131
  "transformer.h.11.mlp.dense_h_to_4h": 4,
132
  "transformer.h.11.self_attention.dense": 4,
133
  "transformer.h.11.self_attention.query_key_value": 4,
134
+ "transformer.h.12.mlp.dense_4h_to_h": 8,
135
+ "transformer.h.12.mlp.dense_h_to_4h": 8,
136
+ "transformer.h.12.self_attention.dense": 8,
137
+ "transformer.h.12.self_attention.query_key_value": 8,
138
+ "transformer.h.13.mlp.dense_4h_to_h": 8,
139
+ "transformer.h.13.mlp.dense_h_to_4h": 8,
140
+ "transformer.h.13.self_attention.dense": 8,
141
+ "transformer.h.13.self_attention.query_key_value": 8,
142
  "transformer.h.14.mlp.dense_4h_to_h": 8,
143
  "transformer.h.14.mlp.dense_h_to_4h": 8,
144
  "transformer.h.14.self_attention.dense": 8,
 
159
  "transformer.h.18.mlp.dense_h_to_4h": 8,
160
  "transformer.h.18.self_attention.dense": 8,
161
  "transformer.h.18.self_attention.query_key_value": 8,
162
+ "transformer.h.19.mlp.dense_4h_to_h": 64,
163
+ "transformer.h.19.mlp.dense_h_to_4h": 64,
164
+ "transformer.h.19.self_attention.dense": 64,
165
+ "transformer.h.19.self_attention.query_key_value": 64,
166
  "transformer.h.2.mlp.dense_4h_to_h": 4,
167
  "transformer.h.2.mlp.dense_h_to_4h": 4,
168
  "transformer.h.2.self_attention.dense": 4,
169
  "transformer.h.2.self_attention.query_key_value": 4,
170
+ "transformer.h.20.mlp.dense_4h_to_h": 64,
171
+ "transformer.h.20.mlp.dense_h_to_4h": 64,
172
+ "transformer.h.20.self_attention.dense": 64,
173
+ "transformer.h.20.self_attention.query_key_value": 64,
174
+ "transformer.h.21.mlp.dense_4h_to_h": 64,
175
+ "transformer.h.21.mlp.dense_h_to_4h": 64,
176
+ "transformer.h.21.self_attention.dense": 64,
177
+ "transformer.h.21.self_attention.query_key_value": 64,
178
+ "transformer.h.22.mlp.dense_4h_to_h": 64,
179
+ "transformer.h.22.mlp.dense_h_to_4h": 64,
180
+ "transformer.h.22.self_attention.dense": 64,
181
+ "transformer.h.22.self_attention.query_key_value": 64,
182
+ "transformer.h.23.mlp.dense_4h_to_h": 64,
183
+ "transformer.h.23.mlp.dense_h_to_4h": 64,
184
+ "transformer.h.23.self_attention.dense": 64,
185
+ "transformer.h.23.self_attention.query_key_value": 64,
186
  "transformer.h.3.mlp.dense_4h_to_h": 4,
187
  "transformer.h.3.mlp.dense_h_to_4h": 4,
188
  "transformer.h.3.self_attention.dense": 4,
 
214
  },
215
  "revision": null,
216
  "target_modules": [
217
+ "transformer.h.16.self_attention.query_key_value",
218
+ "transformer.h.13.mlp.dense_4h_to_h",
219
+ "transformer.h.12.mlp.dense_h_to_4h",
220
+ "transformer.h.13.self_attention.query_key_value",
221
+ "transformer.h.12.self_attention.dense",
222
+ "transformer.h.17.self_attention.dense",
223
  "transformer.h.14.mlp.dense_4h_to_h",
224
+ "transformer.h.12.self_attention.query_key_value",
225
+ "transformer.h.14.self_attention.query_key_value",
226
+ "transformer.h.13.self_attention.dense",
227
  "transformer.h.16.mlp.dense_h_to_4h",
228
+ "transformer.h.17.self_attention.query_key_value",
229
+ "transformer.h.12.mlp.dense_4h_to_h",
230
  "transformer.h.17.mlp.dense_4h_to_h",
231
+ "transformer.h.18.mlp.dense_4h_to_h",
232
+ "transformer.h.14.self_attention.dense",
233
  "transformer.h.14.mlp.dense_h_to_4h",
234
+ "transformer.h.13.mlp.dense_h_to_4h",
235
  "transformer.h.16.self_attention.dense",
236
+ "transformer.h.17.mlp.dense_h_to_4h",
237
+ "transformer.h.15.self_attention.query_key_value",
238
+ "transformer.h.16.mlp.dense_4h_to_h",
239
+ "transformer.h.18.self_attention.query_key_value",
240
+ "transformer.h.15.mlp.dense_h_to_4h",
241
+ "transformer.h.15.self_attention.dense",
242
  "transformer.h.18.mlp.dense_h_to_4h",
243
+ "transformer.h.18.self_attention.dense",
244
+ "transformer.h.15.mlp.dense_4h_to_h"
245
  ],
246
  "task_type": "CAUSAL_LM",
247
  "use_dora": false,
last-checkpoint/lora_middle/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2705a293473662f885bc2c9b4b1643921a5f4db0ad8025e88a7dcd2aa0221f5
3
- size 2058889288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ae83ba1e2af168e27f5e54b5a606f7a08f1f83f754f042a77ff6f6aedc080e2
3
+ size 3679656
last-checkpoint/lora_top/adapter_config.json CHANGED
@@ -16,14 +16,14 @@
16
  "transformer.h.11.mlp.dense_h_to_4h": 8,
17
  "transformer.h.11.self_attention.dense": 8,
18
  "transformer.h.11.self_attention.query_key_value": 8,
19
- "transformer.h.12.mlp.dense_4h_to_h": 8,
20
- "transformer.h.12.mlp.dense_h_to_4h": 8,
21
- "transformer.h.12.self_attention.dense": 8,
22
- "transformer.h.12.self_attention.query_key_value": 8,
23
- "transformer.h.13.mlp.dense_4h_to_h": 8,
24
- "transformer.h.13.mlp.dense_h_to_4h": 8,
25
- "transformer.h.13.self_attention.dense": 8,
26
- "transformer.h.13.self_attention.query_key_value": 8,
27
  "transformer.h.14.mlp.dense_4h_to_h": 16,
28
  "transformer.h.14.mlp.dense_h_to_4h": 16,
29
  "transformer.h.14.self_attention.dense": 16,
@@ -44,30 +44,30 @@
44
  "transformer.h.18.mlp.dense_h_to_4h": 16,
45
  "transformer.h.18.self_attention.dense": 16,
46
  "transformer.h.18.self_attention.query_key_value": 16,
47
- "transformer.h.19.mlp.dense_4h_to_h": 16,
48
- "transformer.h.19.mlp.dense_h_to_4h": 16,
49
- "transformer.h.19.self_attention.dense": 16,
50
- "transformer.h.19.self_attention.query_key_value": 16,
51
  "transformer.h.2.mlp.dense_4h_to_h": 8,
52
  "transformer.h.2.mlp.dense_h_to_4h": 8,
53
  "transformer.h.2.self_attention.dense": 8,
54
  "transformer.h.2.self_attention.query_key_value": 8,
55
- "transformer.h.20.mlp.dense_4h_to_h": 16,
56
- "transformer.h.20.mlp.dense_h_to_4h": 16,
57
- "transformer.h.20.self_attention.dense": 16,
58
- "transformer.h.20.self_attention.query_key_value": 16,
59
- "transformer.h.21.mlp.dense_4h_to_h": 32,
60
- "transformer.h.21.mlp.dense_h_to_4h": 32,
61
- "transformer.h.21.self_attention.dense": 32,
62
- "transformer.h.21.self_attention.query_key_value": 32,
63
- "transformer.h.22.mlp.dense_4h_to_h": 32,
64
- "transformer.h.22.mlp.dense_h_to_4h": 32,
65
- "transformer.h.22.self_attention.dense": 32,
66
- "transformer.h.22.self_attention.query_key_value": 32,
67
- "transformer.h.23.mlp.dense_4h_to_h": 32,
68
- "transformer.h.23.mlp.dense_h_to_4h": 32,
69
- "transformer.h.23.self_attention.dense": 32,
70
- "transformer.h.23.self_attention.query_key_value": 32,
71
  "transformer.h.3.mlp.dense_4h_to_h": 8,
72
  "transformer.h.3.mlp.dense_h_to_4h": 8,
73
  "transformer.h.3.self_attention.dense": 8,
@@ -111,9 +111,7 @@
111
  "lora_dropout": 0.3,
112
  "megatron_config": null,
113
  "megatron_core": "megatron.core",
114
- "modules_to_save": [
115
- "lm_head"
116
- ],
117
  "peft_type": "LORA",
118
  "r": 16,
119
  "rank_pattern": {
@@ -133,14 +131,14 @@
133
  "transformer.h.11.mlp.dense_h_to_4h": 4,
134
  "transformer.h.11.self_attention.dense": 4,
135
  "transformer.h.11.self_attention.query_key_value": 4,
136
- "transformer.h.12.mlp.dense_4h_to_h": 4,
137
- "transformer.h.12.mlp.dense_h_to_4h": 4,
138
- "transformer.h.12.self_attention.dense": 4,
139
- "transformer.h.12.self_attention.query_key_value": 4,
140
- "transformer.h.13.mlp.dense_4h_to_h": 4,
141
- "transformer.h.13.mlp.dense_h_to_4h": 4,
142
- "transformer.h.13.self_attention.dense": 4,
143
- "transformer.h.13.self_attention.query_key_value": 4,
144
  "transformer.h.14.mlp.dense_4h_to_h": 8,
145
  "transformer.h.14.mlp.dense_h_to_4h": 8,
146
  "transformer.h.14.self_attention.dense": 8,
@@ -161,30 +159,30 @@
161
  "transformer.h.18.mlp.dense_h_to_4h": 8,
162
  "transformer.h.18.self_attention.dense": 8,
163
  "transformer.h.18.self_attention.query_key_value": 8,
164
- "transformer.h.19.mlp.dense_4h_to_h": 8,
165
- "transformer.h.19.mlp.dense_h_to_4h": 8,
166
- "transformer.h.19.self_attention.dense": 8,
167
- "transformer.h.19.self_attention.query_key_value": 8,
168
  "transformer.h.2.mlp.dense_4h_to_h": 4,
169
  "transformer.h.2.mlp.dense_h_to_4h": 4,
170
  "transformer.h.2.self_attention.dense": 4,
171
  "transformer.h.2.self_attention.query_key_value": 4,
172
- "transformer.h.20.mlp.dense_4h_to_h": 8,
173
- "transformer.h.20.mlp.dense_h_to_4h": 8,
174
- "transformer.h.20.self_attention.dense": 8,
175
- "transformer.h.20.self_attention.query_key_value": 8,
176
- "transformer.h.21.mlp.dense_4h_to_h": 16,
177
- "transformer.h.21.mlp.dense_h_to_4h": 16,
178
- "transformer.h.21.self_attention.dense": 16,
179
- "transformer.h.21.self_attention.query_key_value": 16,
180
- "transformer.h.22.mlp.dense_4h_to_h": 16,
181
- "transformer.h.22.mlp.dense_h_to_4h": 16,
182
- "transformer.h.22.self_attention.dense": 16,
183
- "transformer.h.22.self_attention.query_key_value": 16,
184
- "transformer.h.23.mlp.dense_4h_to_h": 16,
185
- "transformer.h.23.mlp.dense_h_to_4h": 16,
186
- "transformer.h.23.self_attention.dense": 16,
187
- "transformer.h.23.self_attention.query_key_value": 16,
188
  "transformer.h.3.mlp.dense_4h_to_h": 4,
189
  "transformer.h.3.mlp.dense_h_to_4h": 4,
190
  "transformer.h.3.self_attention.dense": 4,
@@ -216,18 +214,26 @@
216
  },
217
  "revision": null,
218
  "target_modules": [
219
  "transformer.h.21.self_attention.dense",
220
- "transformer.h.22.self_attention.query_key_value",
221
  "transformer.h.21.self_attention.query_key_value",
222
- "transformer.h.23.self_attention.query_key_value",
223
- "transformer.h.21.mlp.dense_4h_to_h",
224
- "transformer.h.22.self_attention.dense",
 
225
  "transformer.h.23.self_attention.dense",
226
- "transformer.h.22.mlp.dense_4h_to_h",
227
  "transformer.h.23.mlp.dense_h_to_4h",
228
- "transformer.h.21.mlp.dense_h_to_4h",
229
- "transformer.h.23.mlp.dense_4h_to_h",
230
- "transformer.h.22.mlp.dense_h_to_4h"
231
  ],
232
  "task_type": "CAUSAL_LM",
233
  "use_dora": false,
 
16
  "transformer.h.11.mlp.dense_h_to_4h": 8,
17
  "transformer.h.11.self_attention.dense": 8,
18
  "transformer.h.11.self_attention.query_key_value": 8,
19
+ "transformer.h.12.mlp.dense_4h_to_h": 16,
20
+ "transformer.h.12.mlp.dense_h_to_4h": 16,
21
+ "transformer.h.12.self_attention.dense": 16,
22
+ "transformer.h.12.self_attention.query_key_value": 16,
23
+ "transformer.h.13.mlp.dense_4h_to_h": 16,
24
+ "transformer.h.13.mlp.dense_h_to_4h": 16,
25
+ "transformer.h.13.self_attention.dense": 16,
26
+ "transformer.h.13.self_attention.query_key_value": 16,
27
  "transformer.h.14.mlp.dense_4h_to_h": 16,
28
  "transformer.h.14.mlp.dense_h_to_4h": 16,
29
  "transformer.h.14.self_attention.dense": 16,
 
44
  "transformer.h.18.mlp.dense_h_to_4h": 16,
45
  "transformer.h.18.self_attention.dense": 16,
46
  "transformer.h.18.self_attention.query_key_value": 16,
47
+ "transformer.h.19.mlp.dense_4h_to_h": 128,
48
+ "transformer.h.19.mlp.dense_h_to_4h": 128,
49
+ "transformer.h.19.self_attention.dense": 128,
50
+ "transformer.h.19.self_attention.query_key_value": 128,
51
  "transformer.h.2.mlp.dense_4h_to_h": 8,
52
  "transformer.h.2.mlp.dense_h_to_4h": 8,
53
  "transformer.h.2.self_attention.dense": 8,
54
  "transformer.h.2.self_attention.query_key_value": 8,
55
+ "transformer.h.20.mlp.dense_4h_to_h": 128,
56
+ "transformer.h.20.mlp.dense_h_to_4h": 128,
57
+ "transformer.h.20.self_attention.dense": 128,
58
+ "transformer.h.20.self_attention.query_key_value": 128,
59
+ "transformer.h.21.mlp.dense_4h_to_h": 128,
60
+ "transformer.h.21.mlp.dense_h_to_4h": 128,
61
+ "transformer.h.21.self_attention.dense": 128,
62
+ "transformer.h.21.self_attention.query_key_value": 128,
63
+ "transformer.h.22.mlp.dense_4h_to_h": 128,
64
+ "transformer.h.22.mlp.dense_h_to_4h": 128,
65
+ "transformer.h.22.self_attention.dense": 128,
66
+ "transformer.h.22.self_attention.query_key_value": 128,
67
+ "transformer.h.23.mlp.dense_4h_to_h": 128,
68
+ "transformer.h.23.mlp.dense_h_to_4h": 128,
69
+ "transformer.h.23.self_attention.dense": 128,
70
+ "transformer.h.23.self_attention.query_key_value": 128,
71
  "transformer.h.3.mlp.dense_4h_to_h": 8,
72
  "transformer.h.3.mlp.dense_h_to_4h": 8,
73
  "transformer.h.3.self_attention.dense": 8,
 
111
  "lora_dropout": 0.3,
112
  "megatron_config": null,
113
  "megatron_core": "megatron.core",
114
+ "modules_to_save": null,
115
  "peft_type": "LORA",
116
  "r": 16,
117
  "rank_pattern": {
 
131
  "transformer.h.11.mlp.dense_h_to_4h": 4,
132
  "transformer.h.11.self_attention.dense": 4,
133
  "transformer.h.11.self_attention.query_key_value": 4,
134
+ "transformer.h.12.mlp.dense_4h_to_h": 8,
135
+ "transformer.h.12.mlp.dense_h_to_4h": 8,
136
+ "transformer.h.12.self_attention.dense": 8,
137
+ "transformer.h.12.self_attention.query_key_value": 8,
138
+ "transformer.h.13.mlp.dense_4h_to_h": 8,
139
+ "transformer.h.13.mlp.dense_h_to_4h": 8,
140
+ "transformer.h.13.self_attention.dense": 8,
141
+ "transformer.h.13.self_attention.query_key_value": 8,
142
  "transformer.h.14.mlp.dense_4h_to_h": 8,
143
  "transformer.h.14.mlp.dense_h_to_4h": 8,
144
  "transformer.h.14.self_attention.dense": 8,
 
159
  "transformer.h.18.mlp.dense_h_to_4h": 8,
160
  "transformer.h.18.self_attention.dense": 8,
161
  "transformer.h.18.self_attention.query_key_value": 8,
162
+ "transformer.h.19.mlp.dense_4h_to_h": 64,
163
+ "transformer.h.19.mlp.dense_h_to_4h": 64,
164
+ "transformer.h.19.self_attention.dense": 64,
165
+ "transformer.h.19.self_attention.query_key_value": 64,
166
  "transformer.h.2.mlp.dense_4h_to_h": 4,
167
  "transformer.h.2.mlp.dense_h_to_4h": 4,
168
  "transformer.h.2.self_attention.dense": 4,
169
  "transformer.h.2.self_attention.query_key_value": 4,
170
+ "transformer.h.20.mlp.dense_4h_to_h": 64,
171
+ "transformer.h.20.mlp.dense_h_to_4h": 64,
172
+ "transformer.h.20.self_attention.dense": 64,
173
+ "transformer.h.20.self_attention.query_key_value": 64,
174
+ "transformer.h.21.mlp.dense_4h_to_h": 64,
175
+ "transformer.h.21.mlp.dense_h_to_4h": 64,
176
+ "transformer.h.21.self_attention.dense": 64,
177
+ "transformer.h.21.self_attention.query_key_value": 64,
178
+ "transformer.h.22.mlp.dense_4h_to_h": 64,
179
+ "transformer.h.22.mlp.dense_h_to_4h": 64,
180
+ "transformer.h.22.self_attention.dense": 64,
181
+ "transformer.h.22.self_attention.query_key_value": 64,
182
+ "transformer.h.23.mlp.dense_4h_to_h": 64,
183
+ "transformer.h.23.mlp.dense_h_to_4h": 64,
184
+ "transformer.h.23.self_attention.dense": 64,
185
+ "transformer.h.23.self_attention.query_key_value": 64,
186
  "transformer.h.3.mlp.dense_4h_to_h": 4,
187
  "transformer.h.3.mlp.dense_h_to_4h": 4,
188
  "transformer.h.3.self_attention.dense": 4,
 
214
  },
215
  "revision": null,
216
  "target_modules": [
217
+ "transformer.h.19.self_attention.query_key_value",
218
+ "transformer.h.22.mlp.dense_4h_to_h",
219
+ "transformer.h.22.self_attention.dense",
220
+ "transformer.h.21.mlp.dense_h_to_4h",
221
+ "transformer.h.21.mlp.dense_4h_to_h",
222
+ "transformer.h.20.self_attention.query_key_value",
223
+ "transformer.h.20.mlp.dense_h_to_4h",
224
+ "transformer.h.19.self_attention.dense",
225
+ "transformer.h.22.mlp.dense_h_to_4h",
226
  "transformer.h.21.self_attention.dense",
227
+ "transformer.h.23.mlp.dense_4h_to_h",
228
  "transformer.h.21.self_attention.query_key_value",
229
+ "transformer.h.22.self_attention.query_key_value",
230
+ "transformer.h.20.self_attention.dense",
231
+ "transformer.h.19.mlp.dense_h_to_4h",
232
+ "transformer.h.19.mlp.dense_4h_to_h",
233
  "transformer.h.23.self_attention.dense",
 
234
  "transformer.h.23.mlp.dense_h_to_4h",
235
+ "transformer.h.23.self_attention.query_key_value",
236
+ "transformer.h.20.mlp.dense_4h_to_h"
 
237
  ],
238
  "task_type": "CAUSAL_LM",
239
  "use_dora": false,
last-checkpoint/lora_top/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4dcaaf734b9b1a56cb2294cf0fa8500e082ee74b2b20b5cd2c67e1122555870
3
- size 2058359328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db717b03cbf2f1a46c5f15b26bf9f789de7fc458cf6131e22f3bbb9f834a45e1
3
+ size 5249824
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c82dcc7cfa9a457a5aa17056d6b326ba023342eb225aaf0898e6e54c2bb6077
3
- size 2061522259
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad554f121b2fb4412ef9896a650c093864011bb63532cae8be5cadb109859887
3
+ size 10520058
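
The adapter_model.safetensors and optimizer.pt entries above are Git LFS pointer files (spec v1), so the diff only records each object's SHA-256 and byte size; the sizes drop from roughly 2 GB to a few MB, plausibly because the full "lm_head" weights are no longer saved once "modules_to_save" is removed. A small sketch for checking a locally downloaded artifact against such a pointer follows; the path and helper name are illustrative placeholders.

```python
# Hedged sketch: verify a local file against a Git LFS pointer
# (the "oid sha256:..." / "size ..." lines shown in the diff above).
import hashlib
import os

def matches_lfs_pointer(path: str, expected_sha256: str, expected_size: int) -> bool:
    """Return True if the file's byte size and SHA-256 digest match the pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256

# Example, using the new lora_lower pointer values from this commit:
# matches_lfs_pointer(
#     "last-checkpoint/lora_lower/adapter_model.safetensors",
#     "7b632073286f4624203e332784f0e1c31b8f4a8d74fbc2833acf0b3b3947b1ea",
#     3162128,
# )
```
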
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99714ae94e1721c655a4d2e5fffbbc6ed7e1f5ed893f7bf8f89ada975f3ed81f
3
  size 14180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7cc7552dbdf9469e78ab73bf70c6f588116bcf08f142cb145b7886c28a210e1
3
  size 14180
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f41cbca3e8e87d3857cac1912cf18c05169bb171e7530cf76d23b482cbc432c
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ff91ed2121713e2d5688c93acd928083291e754cea825d48e464b7b5d969d0e
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,238 +1,125 @@
1
  {
2
- "best_metric": 3.867088794708252,
3
- "best_model_checkpoint": "./output/checkpoint-300",
4
- "epoch": 0.04899559039686428,
5
  "eval_steps": 150,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0016331863465621427,
13
- "grad_norm": 60.602169036865234,
14
- "learning_rate": 2.154434690031884e-06,
15
- "loss": 4.2742,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.0032663726931242854,
20
- "grad_norm": 57.758113861083984,
21
- "learning_rate": 4.308869380063768e-06,
22
- "loss": 4.0288,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.004899559039686428,
27
- "grad_norm": 55.49625778198242,
28
- "learning_rate": 6.463304070095652e-06,
29
- "loss": 4.1612,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.006532745386248571,
34
- "grad_norm": 46.22760772705078,
35
- "learning_rate": 8.617738760127536e-06,
36
- "loss": 4.1069,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.008165931732810714,
41
- "grad_norm": 47.324954986572266,
42
- "learning_rate": 1.077217345015942e-05,
43
- "loss": 4.0317,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.009799118079372856,
48
- "grad_norm": 46.110965728759766,
49
- "learning_rate": 1.2926608140191304e-05,
50
- "loss": 3.9325,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.011432304425935,
55
- "grad_norm": 47.741973876953125,
56
- "learning_rate": 1.5081042830223187e-05,
57
- "loss": 3.9713,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.013065490772497142,
62
- "grad_norm": 40.646671295166016,
63
- "learning_rate": 1.723547752025507e-05,
64
- "loss": 3.9214,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.014698677119059285,
69
- "grad_norm": 44.510902404785156,
70
- "learning_rate": 1.9389912210286956e-05,
71
- "loss": 3.9046,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.01633186346562143,
76
- "grad_norm": 43.503135681152344,
77
- "learning_rate": 2.154434690031884e-05,
78
- "loss": 3.971,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.01796504981218357,
83
- "grad_norm": 53.51116180419922,
84
- "learning_rate": 2.154412549938943e-05,
85
- "loss": 3.9408,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.019598236158745713,
90
- "grad_norm": 45.50636672973633,
91
- "learning_rate": 2.1543461305702127e-05,
92
- "loss": 3.955,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.021231422505307854,
97
- "grad_norm": 45.04021072387695,
98
- "learning_rate": 2.1542354346559332e-05,
99
- "loss": 3.9399,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.02286460885187,
104
- "grad_norm": 41.114078521728516,
105
- "learning_rate": 2.15408046674638e-05,
106
- "loss": 3.857,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.02449779519843214,
111
- "grad_norm": 55.586185455322266,
112
- "learning_rate": 2.1538812332116767e-05,
113
- "loss": 3.9741,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.02449779519843214,
118
- "eval_loss": 4.046905040740967,
119
- "eval_runtime": 12.6477,
120
- "eval_samples_per_second": 39.533,
121
- "eval_steps_per_second": 39.533,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.026130981544994283,
126
- "grad_norm": 39.35482406616211,
127
- "learning_rate": 2.1536377422415337e-05,
128
- "loss": 3.9314,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.027764167891556425,
133
- "grad_norm": 49.44642639160156,
134
- "learning_rate": 2.1533500038449112e-05,
135
- "loss": 3.9469,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.02939735423811857,
140
- "grad_norm": 47.13311004638672,
141
- "learning_rate": 2.1530180298496075e-05,
142
- "loss": 3.8316,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.031030540584680712,
147
- "grad_norm": 45.86495590209961,
148
- "learning_rate": 2.1526418339017734e-05,
149
- "loss": 3.9398,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.03266372693124286,
154
- "grad_norm": 42.1937141418457,
155
- "learning_rate": 2.152221431465351e-05,
156
- "loss": 3.8032,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.034296913277805,
161
- "grad_norm": 43.07133865356445,
162
- "learning_rate": 2.1517568398214374e-05,
163
- "loss": 3.823,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.03593009962436714,
168
- "grad_norm": 52.06683349609375,
169
- "learning_rate": 2.1512480780675756e-05,
170
- "loss": 3.6545,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.03756328597092928,
175
- "grad_norm": 43.83436965942383,
176
- "learning_rate": 2.150695167116969e-05,
177
- "loss": 3.8256,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.039196472317491425,
182
- "grad_norm": 46.70238494873047,
183
- "learning_rate": 2.1500981296976207e-05,
184
- "loss": 3.8661,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.04082965866405357,
189
- "grad_norm": 47.87880325317383,
190
- "learning_rate": 2.1494569903514006e-05,
191
- "loss": 3.7335,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.04246284501061571,
196
- "grad_norm": 42.191898345947266,
197
- "learning_rate": 2.1487717754330366e-05,
198
- "loss": 3.7399,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.04409603135717785,
203
- "grad_norm": 41.03351974487305,
204
- "learning_rate": 2.1480425131090295e-05,
205
- "loss": 3.703,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.04572921770374,
210
- "grad_norm": 41.71271896362305,
211
- "learning_rate": 2.1472692333564976e-05,
212
- "loss": 3.6198,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.04736240405030214,
217
- "grad_norm": 40.65316390991211,
218
- "learning_rate": 2.1464519679619426e-05,
219
- "loss": 3.6979,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.04899559039686428,
224
- "grad_norm": 37.19804382324219,
225
- "learning_rate": 2.1455907505199437e-05,
226
- "loss": 3.638,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.04899559039686428,
231
- "eval_loss": 3.867088794708252,
232
- "eval_runtime": 17.9999,
233
- "eval_samples_per_second": 27.778,
234
- "eval_steps_per_second": 27.778,
235
- "step": 300
236
  }
237
  ],
238
  "logging_steps": 10,
@@ -252,7 +139,7 @@
252
  "attributes": {}
253
  }
254
  },
255
- "total_flos": 1.4443567455141888e+16,
256
  "train_batch_size": 16,
257
  "trial_name": null,
258
  "trial_params": null
 
1
  {
2
+ "best_metric": 4.310510158538818,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.02449779519843214,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0016331863465621427,
13
+ "grad_norm": 2.129213571548462,
14
+ "learning_rate": 9.999999999999997e-06,
15
+ "loss": 4.4958,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.0032663726931242854,
20
+ "grad_norm": 1.9591890573501587,
21
+ "learning_rate": 1.9999999999999995e-05,
22
+ "loss": 4.2389,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.004899559039686428,
27
+ "grad_norm": 1.9256017208099365,
28
+ "learning_rate": 2.9999999999999987e-05,
29
+ "loss": 4.3691,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.006532745386248571,
34
+ "grad_norm": 2.024043083190918,
35
+ "learning_rate": 3.999999999999999e-05,
36
+ "loss": 4.3816,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.008165931732810714,
41
+ "grad_norm": 2.1886556148529053,
42
+ "learning_rate": 4.999999999999998e-05,
43
+ "loss": 4.2901,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.009799118079372856,
48
+ "grad_norm": 2.0718095302581787,
49
+ "learning_rate": 5.9999999999999974e-05,
50
+ "loss": 4.2541,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.011432304425935,
55
+ "grad_norm": 2.2743079662323,
56
+ "learning_rate": 6.999999999999997e-05,
57
+ "loss": 4.2627,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.013065490772497142,
62
+ "grad_norm": 2.5313594341278076,
63
+ "learning_rate": 7.999999999999998e-05,
64
+ "loss": 4.1977,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.014698677119059285,
69
+ "grad_norm": 2.4693851470947266,
70
+ "learning_rate": 8.999999999999997e-05,
71
+ "loss": 4.1341,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.01633186346562143,
76
+ "grad_norm": 2.289997100830078,
77
+ "learning_rate": 9.999999999999996e-05,
78
+ "loss": 4.2624,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.01796504981218357,
83
+ "grad_norm": 2.836108922958374,
84
+ "learning_rate": 9.999897234791826e-05,
85
+ "loss": 4.2417,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.019598236158745713,
90
+ "grad_norm": 2.3288657665252686,
91
+ "learning_rate": 9.999588943391593e-05,
92
+ "loss": 4.2338,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.021231422505307854,
97
+ "grad_norm": 2.600402593612671,
98
+ "learning_rate": 9.999075138471947e-05,
99
+ "loss": 4.2208,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.02286460885187,
104
+ "grad_norm": 2.5480844974517822,
105
+ "learning_rate": 9.998355841153395e-05,
106
+ "loss": 4.1776,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.02449779519843214,
111
+ "grad_norm": 2.910768747329712,
112
+ "learning_rate": 9.997431081003435e-05,
113
+ "loss": 4.2473,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.02449779519843214,
118
+ "eval_loss": 4.310510158538818,
119
+ "eval_runtime": 12.9228,
120
+ "eval_samples_per_second": 38.691,
121
+ "eval_steps_per_second": 38.691,
122
  "step": 150
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 2063558931775488.0,
143
  "train_batch_size": 16,
144
  "trial_name": null,
145
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f40345b3fd623d3ea644b5cebfee50c24fbc32ad1c4eb11de85e1691db783a37
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:592c59c98adf2e8f3e83b6394c6b1788941e44d546133031b57e9181a2e64999
3
  size 5496