Crystalcareai committed on
Commit
e1a9e30
1 Parent(s): 9eeec7b

Create axolotl_config.yml

Browse files
Files changed (1) hide show
  1. configs/axolotl_config.yml +552 -0
configs/axolotl_config.yml ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model: mistralai/Mixtral-8x22B-v0.1
2
+ model_type: AutoModelForCausalLM
3
+ tokenizer_type: AutoTokenizer
4
+ tokenizer_use_fast: true
5
+
6
+ # load_in_8bit: true
7
+ # load_in_4bit: false
8
+ # strict: false
9
+
10
+ datasets:
11
+ - path: /workspace/datasets/dolphin-2.9.2/dolphin201-sharegpt2.jsonl
12
+ type: sharegpt
13
+ conversation: chatml
14
+ - path: /workspace/datasets/dolphin-2.9.2/dolphin-coder-codegen-sharegpt2.jsonl
15
+ type: sharegpt
16
+ conversation: chatml
17
+ - path: /workspace/datasets/dolphin-2.9.2/dolphin-coder-translate-sharegpt2.jsonl
18
+ type: sharegpt
19
+ conversation: chatml
20
+ - path: /workspace/datasets/dolphin-2.9.2/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl
21
+ type: sharegpt
22
+ conversation: chatml
23
+ - path: /workspace/datasets/dolphin-2.9.2/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl
24
+ type: sharegpt
25
+ conversation: chatml
26
+ - path: /workspace/datasets/dolphin-2.9.2/not_samantha_norefusals.jsonl
27
+ type: sharegpt
28
+ conversation: chatml
29
+ - path: /workspace/datasets/dolphin-2.9.2/openhermes200k_unfiltered.jsonl
30
+ type: sharegpt
31
+ conversation: chatml
32
+ - path: /workspace/datasets/dolphin-2.9.2/Orca-Math-resort-unfiltered.jsonl
33
+ type: sharegpt
34
+ conversation: chatml
35
+ - path: /workspace/datasets/dolphin-2.9.2/SystemChat_sharegpt.jsonl
36
+ type: sharegpt
37
+ conversation: chatml
38
+ - path: /workspace/datasets/dolphin-2.9.2/toolbench_instruct_j1s1_3k_unfiltered.jsonl
39
+ type: sharegpt
40
+ conversation: chatml
41
+ - path: /workspace/datasets/dolphin-2.9.2/toolbench_negative_unfiltered.jsonl
42
+ type: sharegpt
43
+ conversation: chatml
44
+ - path: /workspace/datasets/dolphin-2.9.2/toolbench_react_10p_unfiltered.jsonl
45
+ type: sharegpt
46
+ conversation: chatml
47
+ - path: /workspace/datasets/dolphin-2.9.2/toolbench_tflan_cot_30p_unfiltered.jsonl
48
+ type: sharegpt
49
+ conversation: chatml
50
+ - path: /workspace/datasets/dolphin-2.9.2/agent_instruct_react_unfiltered.jsonl
51
+ type: sharegpt
52
+ conversation: chatml
53
+
54
+ chat_template: chatml
55
+ dataset_prepared_path: mixtral-8x22b-data
56
+ val_set_size: 0.01
57
+ output_dir: mixtral-8x22b
58
+
59
+ sequence_len: 16384
60
+ sample_packing: true
61
+ pad_to_sequence_len: true
62
+
63
+ unfrozen_parameters:
64
+ - ^lm_head.weight$
65
+ - ^model.embed_tokens.weight$
66
+ - model.layers.54.block_sparse_moe.experts.0.w1
67
+ - model.layers.53.block_sparse_moe.experts.0.w1
68
+ - model.layers.55.block_sparse_moe.experts.0.w1
69
+ - model.layers.51.block_sparse_moe.experts.0.w1
70
+ - model.layers.52.block_sparse_moe.experts.0.w1
71
+ - model.layers.50.block_sparse_moe.experts.0.w1
72
+ - model.layers.47.block_sparse_moe.experts.0.w1
73
+ - model.layers.49.block_sparse_moe.experts.0.w1
74
+ - model.layers.48.block_sparse_moe.experts.0.w1
75
+ - model.layers.46.block_sparse_moe.experts.0.w1
76
+ - model.layers.44.block_sparse_moe.experts.0.w1
77
+ - model.layers.45.block_sparse_moe.experts.0.w1
78
+ - model.layers.13.block_sparse_moe.experts.0.w1
79
+ - model.layers.43.block_sparse_moe.experts.0.w1
80
+ # block_sparse_moe.experts.0.w2 layers
81
+ - model.layers.36.block_sparse_moe.experts.0.w2
82
+ - model.layers.42.block_sparse_moe.experts.0.w2
83
+ - model.layers.41.block_sparse_moe.experts.0.w2
84
+ - model.layers.37.block_sparse_moe.experts.0.w2
85
+ - model.layers.34.block_sparse_moe.experts.0.w2
86
+ - model.layers.38.block_sparse_moe.experts.0.w2
87
+ - model.layers.47.block_sparse_moe.experts.0.w2
88
+ - model.layers.35.block_sparse_moe.experts.0.w2
89
+ - model.layers.44.block_sparse_moe.experts.0.w2
90
+ - model.layers.32.block_sparse_moe.experts.0.w2
91
+ - model.layers.40.block_sparse_moe.experts.0.w2
92
+ - model.layers.39.block_sparse_moe.experts.0.w2
93
+ - model.layers.45.block_sparse_moe.experts.0.w2
94
+ - model.layers.33.block_sparse_moe.experts.0.w2
95
+ # block_sparse_moe.experts.0.w3 layers
96
+ - model.layers.46.block_sparse_moe.experts.0.w3
97
+ - model.layers.47.block_sparse_moe.experts.0.w3
98
+ - model.layers.44.block_sparse_moe.experts.0.w3
99
+ - model.layers.45.block_sparse_moe.experts.0.w3
100
+ - model.layers.43.block_sparse_moe.experts.0.w3
101
+ - model.layers.49.block_sparse_moe.experts.0.w3
102
+ - model.layers.48.block_sparse_moe.experts.0.w3
103
+ - model.layers.42.block_sparse_moe.experts.0.w3
104
+ - model.layers.36.block_sparse_moe.experts.0.w3
105
+ - model.layers.39.block_sparse_moe.experts.0.w3
106
+ - model.layers.41.block_sparse_moe.experts.0.w3
107
+ - model.layers.38.block_sparse_moe.experts.0.w3
108
+ - model.layers.50.block_sparse_moe.experts.0.w3
109
+ - model.layers.37.block_sparse_moe.experts.0.w3
110
+ # block_sparse_moe.experts.1.w1 layers
111
+ - model.layers.54.block_sparse_moe.experts.1.w1
112
+ - model.layers.53.block_sparse_moe.experts.1.w1
113
+ - model.layers.52.block_sparse_moe.experts.1.w1
114
+ - model.layers.51.block_sparse_moe.experts.1.w1
115
+ - model.layers.50.block_sparse_moe.experts.1.w1
116
+ - model.layers.48.block_sparse_moe.experts.1.w1
117
+ - model.layers.49.block_sparse_moe.experts.1.w1
118
+ - model.layers.47.block_sparse_moe.experts.1.w1
119
+ - model.layers.46.block_sparse_moe.experts.1.w1
120
+ - model.layers.7.block_sparse_moe.experts.1.w1
121
+ - model.layers.45.block_sparse_moe.experts.1.w1
122
+ - model.layers.42.block_sparse_moe.experts.1.w1
123
+ - model.layers.12.block_sparse_moe.experts.1.w1
124
+ - model.layers.13.block_sparse_moe.experts.1.w1
125
+ # block_sparse_moe.experts.1.w2 layers
126
+ - model.layers.46.block_sparse_moe.experts.1.w2
127
+ - model.layers.37.block_sparse_moe.experts.1.w2
128
+ - model.layers.34.block_sparse_moe.experts.1.w2
129
+ - model.layers.45.block_sparse_moe.experts.1.w2
130
+ - model.layers.43.block_sparse_moe.experts.1.w2
131
+ - model.layers.39.block_sparse_moe.experts.1.w2
132
+ - model.layers.38.block_sparse_moe.experts.1.w2
133
+ - model.layers.42.block_sparse_moe.experts.1.w2
134
+ - model.layers.48.block_sparse_moe.experts.1.w2
135
+ - model.layers.36.block_sparse_moe.experts.1.w2
136
+ - model.layers.40.block_sparse_moe.experts.1.w2
137
+ - model.layers.41.block_sparse_moe.experts.1.w2
138
+ - model.layers.44.block_sparse_moe.experts.1.w2
139
+ - model.layers.33.block_sparse_moe.experts.1.w2
140
+ # block_sparse_moe.experts.1.w3 layers
141
+ - model.layers.47.block_sparse_moe.experts.1.w3
142
+ - model.layers.46.block_sparse_moe.experts.1.w3
143
+ - model.layers.48.block_sparse_moe.experts.1.w3
144
+ - model.layers.42.block_sparse_moe.experts.1.w3
145
+ - model.layers.45.block_sparse_moe.experts.1.w3
146
+ - model.layers.40.block_sparse_moe.experts.1.w3
147
+ - model.layers.43.block_sparse_moe.experts.1.w3
148
+ - model.layers.38.block_sparse_moe.experts.1.w3
149
+ - model.layers.39.block_sparse_moe.experts.1.w3
150
+ - model.layers.41.block_sparse_moe.experts.1.w3
151
+ - model.layers.34.block_sparse_moe.experts.1.w3
152
+ - model.layers.37.block_sparse_moe.experts.1.w3
153
+ - model.layers.44.block_sparse_moe.experts.1.w3
154
+ - model.layers.49.block_sparse_moe.experts.1.w3
155
+ # block_sparse_moe.experts.2.w1 layers
156
+ - model.layers.53.block_sparse_moe.experts.2.w1
157
+ - model.layers.52.block_sparse_moe.experts.2.w1
158
+ - model.layers.51.block_sparse_moe.experts.2.w1
159
+ - model.layers.54.block_sparse_moe.experts.2.w1
160
+ - model.layers.50.block_sparse_moe.experts.2.w1
161
+ - model.layers.49.block_sparse_moe.experts.2.w1
162
+ - model.layers.48.block_sparse_moe.experts.2.w1
163
+ - model.layers.45.block_sparse_moe.experts.2.w1
164
+ - model.layers.46.block_sparse_moe.experts.2.w1
165
+ - model.layers.47.block_sparse_moe.experts.2.w1
166
+ - model.layers.55.block_sparse_moe.experts.2.w1
167
+ - model.layers.17.block_sparse_moe.experts.2.w1
168
+ - model.layers.43.block_sparse_moe.experts.2.w1
169
+ - model.layers.13.block_sparse_moe.experts.2.w1
170
+ # block_sparse_moe.experts.2.w2 layers
171
+ - model.layers.44.block_sparse_moe.experts.2.w2
172
+ - model.layers.34.block_sparse_moe.experts.2.w2
173
+ - model.layers.48.block_sparse_moe.experts.2.w2
174
+ - model.layers.33.block_sparse_moe.experts.2.w2
175
+ - model.layers.39.block_sparse_moe.experts.2.w2
176
+ - model.layers.36.block_sparse_moe.experts.2.w2
177
+ - model.layers.40.block_sparse_moe.experts.2.w2
178
+ - model.layers.32.block_sparse_moe.experts.2.w2
179
+ - model.layers.46.block_sparse_moe.experts.2.w2
180
+ - model.layers.43.block_sparse_moe.experts.2.w2
181
+ - model.layers.37.block_sparse_moe.experts.2.w2
182
+ - model.layers.38.block_sparse_moe.experts.2.w2
183
+ - model.layers.47.block_sparse_moe.experts.2.w2
184
+ - model.layers.42.block_sparse_moe.experts.2.w2
185
+ # block_sparse_moe.experts.2.w3 layers
186
+ - model.layers.46.block_sparse_moe.experts.2.w3
187
+ - model.layers.48.block_sparse_moe.experts.2.w3
188
+ - model.layers.45.block_sparse_moe.experts.2.w3
189
+ - model.layers.47.block_sparse_moe.experts.2.w3
190
+ - model.layers.43.block_sparse_moe.experts.2.w3
191
+ - model.layers.49.block_sparse_moe.experts.2.w3
192
+ - model.layers.40.block_sparse_moe.experts.2.w3
193
+ - model.layers.44.block_sparse_moe.experts.2.w3
194
+ - model.layers.39.block_sparse_moe.experts.2.w3
195
+ - model.layers.38.block_sparse_moe.experts.2.w3
196
+ - model.layers.41.block_sparse_moe.experts.2.w3
197
+ - model.layers.52.block_sparse_moe.experts.2.w3
198
+ - model.layers.51.block_sparse_moe.experts.2.w3
199
+ - model.layers.50.block_sparse_moe.experts.2.w3
200
+ # block_sparse_moe.experts.3.w1 layers
201
+ - model.layers.54.block_sparse_moe.experts.3.w1
202
+ - model.layers.52.block_sparse_moe.experts.3.w1
203
+ - model.layers.53.block_sparse_moe.experts.3.w1
204
+ - model.layers.51.block_sparse_moe.experts.3.w1
205
+ - model.layers.48.block_sparse_moe.experts.3.w1
206
+ - model.layers.50.block_sparse_moe.experts.3.w1
207
+ - model.layers.49.block_sparse_moe.experts.3.w1
208
+ - model.layers.46.block_sparse_moe.experts.3.w1
209
+ - model.layers.55.block_sparse_moe.experts.3.w1
210
+ - model.layers.47.block_sparse_moe.experts.3.w1
211
+ - model.layers.45.block_sparse_moe.experts.3.w1
212
+ - model.layers.44.block_sparse_moe.experts.3.w1
213
+ - model.layers.12.block_sparse_moe.experts.3.w1
214
+ - model.layers.28.block_sparse_moe.experts.3.w1
215
+ # block_sparse_moe.experts.3.w2 layers
216
+ - model.layers.38.block_sparse_moe.experts.3.w2
217
+ - model.layers.37.block_sparse_moe.experts.3.w2
218
+ - model.layers.35.block_sparse_moe.experts.3.w2
219
+ - model.layers.47.block_sparse_moe.experts.3.w2
220
+ - model.layers.39.block_sparse_moe.experts.3.w2
221
+ - model.layers.44.block_sparse_moe.experts.3.w2
222
+ - model.layers.41.block_sparse_moe.experts.3.w2
223
+ - model.layers.43.block_sparse_moe.experts.3.w2
224
+ - model.layers.36.block_sparse_moe.experts.3.w2
225
+ - model.layers.34.block_sparse_moe.experts.3.w2
226
+ - model.layers.33.block_sparse_moe.experts.3.w2
227
+ - model.layers.46.block_sparse_moe.experts.3.w2
228
+ - model.layers.32.block_sparse_moe.experts.3.w2
229
+ - model.layers.40.block_sparse_moe.experts.3.w2
230
+ # block_sparse_moe.experts.3.w3 layers
231
+ - model.layers.46.block_sparse_moe.experts.3.w3
232
+ - model.layers.48.block_sparse_moe.experts.3.w3
233
+ - model.layers.45.block_sparse_moe.experts.3.w3
234
+ - model.layers.47.block_sparse_moe.experts.3.w3
235
+ - model.layers.44.block_sparse_moe.experts.3.w3
236
+ - model.layers.43.block_sparse_moe.experts.3.w3
237
+ - model.layers.49.block_sparse_moe.experts.3.w3
238
+ - model.layers.41.block_sparse_moe.experts.3.w3
239
+ - model.layers.39.block_sparse_moe.experts.3.w3
240
+ - model.layers.36.block_sparse_moe.experts.3.w3
241
+ - model.layers.37.block_sparse_moe.experts.3.w3
242
+ - model.layers.50.block_sparse_moe.experts.3.w3
243
+ - model.layers.35.block_sparse_moe.experts.3.w3
244
+ - model.layers.42.block_sparse_moe.experts.3.w3
245
+ # block_sparse_moe.experts.4.w1 layers
246
+ - model.layers.52.block_sparse_moe.experts.4.w1
247
+ - model.layers.51.block_sparse_moe.experts.4.w1
248
+ - model.layers.50.block_sparse_moe.experts.4.w1
249
+ - model.layers.53.block_sparse_moe.experts.4.w1
250
+ - model.layers.49.block_sparse_moe.experts.4.w1
251
+ - model.layers.54.block_sparse_moe.experts.4.w1
252
+ - model.layers.48.block_sparse_moe.experts.4.w1
253
+ - model.layers.55.block_sparse_moe.experts.4.w1
254
+ - model.layers.47.block_sparse_moe.experts.4.w1
255
+ - model.layers.44.block_sparse_moe.experts.4.w1
256
+ - model.layers.46.block_sparse_moe.experts.4.w1
257
+ - model.layers.45.block_sparse_moe.experts.4.w1
258
+ - model.layers.12.block_sparse_moe.experts.4.w1
259
+ - model.layers.42.block_sparse_moe.experts.4.w1
260
+ # block_sparse_moe.experts.4.w2 layers
261
+ - model.layers.42.block_sparse_moe.experts.4.w2
262
+ - model.layers.44.block_sparse_moe.experts.4.w2
263
+ - model.layers.46.block_sparse_moe.experts.4.w2
264
+ - model.layers.38.block_sparse_moe.experts.4.w2
265
+ - model.layers.34.block_sparse_moe.experts.4.w2
266
+ - model.layers.41.block_sparse_moe.experts.4.w2
267
+ - model.layers.45.block_sparse_moe.experts.4.w2
268
+ - model.layers.32.block_sparse_moe.experts.4.w2
269
+ - model.layers.37.block_sparse_moe.experts.4.w2
270
+ - model.layers.48.block_sparse_moe.experts.4.w2
271
+ - model.layers.36.block_sparse_moe.experts.4.w2
272
+ - model.layers.33.block_sparse_moe.experts.4.w2
273
+ - model.layers.40.block_sparse_moe.experts.4.w2
274
+ - model.layers.30.block_sparse_moe.experts.4.w2
275
+ # block_sparse_moe.experts.4.w3 layers
276
+ - model.layers.48.block_sparse_moe.experts.4.w3
277
+ - model.layers.44.block_sparse_moe.experts.4.w3
278
+ - model.layers.47.block_sparse_moe.experts.4.w3
279
+ - model.layers.46.block_sparse_moe.experts.4.w3
280
+ - model.layers.45.block_sparse_moe.experts.4.w3
281
+ - model.layers.49.block_sparse_moe.experts.4.w3
282
+ - model.layers.38.block_sparse_moe.experts.4.w3
283
+ - model.layers.40.block_sparse_moe.experts.4.w3
284
+ - model.layers.43.block_sparse_moe.experts.4.w3
285
+ - model.layers.36.block_sparse_moe.experts.4.w3
286
+ - model.layers.42.block_sparse_moe.experts.4.w3
287
+ - model.layers.41.block_sparse_moe.experts.4.w3
288
+ - model.layers.50.block_sparse_moe.experts.4.w3
289
+ - model.layers.37.block_sparse_moe.experts.4.w3
290
+ # block_sparse_moe.experts.5.w1 layers
291
+ - model.layers.54.block_sparse_moe.experts.5.w1
292
+ - model.layers.53.block_sparse_moe.experts.5.w1
293
+ - model.layers.52.block_sparse_moe.experts.5.w1
294
+ - model.layers.51.block_sparse_moe.experts.5.w1
295
+ - model.layers.50.block_sparse_moe.experts.5.w1
296
+ - model.layers.48.block_sparse_moe.experts.5.w1
297
+ - model.layers.49.block_sparse_moe.experts.5.w1
298
+ - model.layers.10.block_sparse_moe.experts.5.w1
299
+ - model.layers.47.block_sparse_moe.experts.5.w1
300
+ - model.layers.55.block_sparse_moe.experts.5.w1
301
+ - model.layers.46.block_sparse_moe.experts.5.w1
302
+ - model.layers.12.block_sparse_moe.experts.5.w1
303
+ - model.layers.44.block_sparse_moe.experts.5.w1
304
+ - model.layers.5.block_sparse_moe.experts.5.w1
305
+ # block_sparse_moe.experts.5.w2 layers
306
+ - model.layers.39.block_sparse_moe.experts.5.w2
307
+ - model.layers.32.block_sparse_moe.experts.5.w2
308
+ - model.layers.43.block_sparse_moe.experts.5.w2
309
+ - model.layers.41.block_sparse_moe.experts.5.w2
310
+ - model.layers.46.block_sparse_moe.experts.5.w2
311
+ - model.layers.42.block_sparse_moe.experts.5.w2
312
+ - model.layers.38.block_sparse_moe.experts.5.w2
313
+ - model.layers.34.block_sparse_moe.experts.5.w2
314
+ - model.layers.45.block_sparse_moe.experts.5.w2
315
+ - model.layers.47.block_sparse_moe.experts.5.w2
316
+ - model.layers.36.block_sparse_moe.experts.5.w2
317
+ - model.layers.44.block_sparse_moe.experts.5.w2
318
+ - model.layers.33.block_sparse_moe.experts.5.w2
319
+ - model.layers.35.block_sparse_moe.experts.5.w2
320
+ # block_sparse_moe.experts.5.w3 layers
321
+ - model.layers.48.block_sparse_moe.experts.5.w3
322
+ - model.layers.46.block_sparse_moe.experts.5.w3
323
+ - model.layers.47.block_sparse_moe.experts.5.w3
324
+ - model.layers.44.block_sparse_moe.experts.5.w3
325
+ - model.layers.38.block_sparse_moe.experts.5.w3
326
+ - model.layers.41.block_sparse_moe.experts.5.w3
327
+ - model.layers.49.block_sparse_moe.experts.5.w3
328
+ - model.layers.42.block_sparse_moe.experts.5.w3
329
+ - model.layers.40.block_sparse_moe.experts.5.w3
330
+ - model.layers.43.block_sparse_moe.experts.5.w3
331
+ - model.layers.36.block_sparse_moe.experts.5.w3
332
+ - model.layers.39.block_sparse_moe.experts.5.w3
333
+ - model.layers.45.block_sparse_moe.experts.5.w3
334
+ - model.layers.37.block_sparse_moe.experts.5.w3
335
+ # block_sparse_moe.experts.6.w1 layers
336
+ - model.layers.54.block_sparse_moe.experts.6.w1
337
+ - model.layers.52.block_sparse_moe.experts.6.w1
338
+ - model.layers.51.block_sparse_moe.experts.6.w1
339
+ - model.layers.53.block_sparse_moe.experts.6.w1
340
+ - model.layers.50.block_sparse_moe.experts.6.w1
341
+ - model.layers.48.block_sparse_moe.experts.6.w1
342
+ - model.layers.49.block_sparse_moe.experts.6.w1
343
+ - model.layers.55.block_sparse_moe.experts.6.w1
344
+ - model.layers.45.block_sparse_moe.experts.6.w1
345
+ - model.layers.47.block_sparse_moe.experts.6.w1
346
+ - model.layers.43.block_sparse_moe.experts.6.w1
347
+ - model.layers.46.block_sparse_moe.experts.6.w1
348
+ - model.layers.13.block_sparse_moe.experts.6.w1
349
+ - model.layers.17.block_sparse_moe.experts.6.w1
350
+ # block_sparse_moe.experts.6.w2 layers
351
+ - model.layers.36.block_sparse_moe.experts.6.w2
352
+ - model.layers.38.block_sparse_moe.experts.6.w2
353
+ - model.layers.45.block_sparse_moe.experts.6.w2
354
+ - model.layers.48.block_sparse_moe.experts.6.w2
355
+ - model.layers.44.block_sparse_moe.experts.6.w2
356
+ - model.layers.32.block_sparse_moe.experts.6.w2
357
+ - model.layers.42.block_sparse_moe.experts.6.w2
358
+ - model.layers.40.block_sparse_moe.experts.6.w2
359
+ - model.layers.34.block_sparse_moe.experts.6.w2
360
+ - model.layers.46.block_sparse_moe.experts.6.w2
361
+ - model.layers.41.block_sparse_moe.experts.6.w2
362
+ - model.layers.47.block_sparse_moe.experts.6.w2
363
+ - model.layers.35.block_sparse_moe.experts.6.w2
364
+ - model.layers.39.block_sparse_moe.experts.6.w2
365
+ # block_sparse_moe.experts.6.w3 layers
366
+ - model.layers.46.block_sparse_moe.experts.6.w3
367
+ - model.layers.45.block_sparse_moe.experts.6.w3
368
+ - model.layers.43.block_sparse_moe.experts.6.w3
369
+ - model.layers.48.block_sparse_moe.experts.6.w3
370
+ - model.layers.47.block_sparse_moe.experts.6.w3
371
+ - model.layers.37.block_sparse_moe.experts.6.w3
372
+ - model.layers.44.block_sparse_moe.experts.6.w3
373
+ - model.layers.40.block_sparse_moe.experts.6.w3
374
+ - model.layers.41.block_sparse_moe.experts.6.w3
375
+ - model.layers.36.block_sparse_moe.experts.6.w3
376
+ - model.layers.42.block_sparse_moe.experts.6.w3
377
+ - model.layers.38.block_sparse_moe.experts.6.w3
378
+ - model.layers.39.block_sparse_moe.experts.6.w3
379
+ - model.layers.35.block_sparse_moe.experts.6.w3
380
+ # block_sparse_moe.experts.7.w1 layers
381
+ - model.layers.54.block_sparse_moe.experts.7.w1
382
+ - model.layers.53.block_sparse_moe.experts.7.w1
383
+ - model.layers.52.block_sparse_moe.experts.7.w1
384
+ - model.layers.51.block_sparse_moe.experts.7.w1
385
+ - model.layers.49.block_sparse_moe.experts.7.w1
386
+ - model.layers.47.block_sparse_moe.experts.7.w1
387
+ - model.layers.48.block_sparse_moe.experts.7.w1
388
+ - model.layers.50.block_sparse_moe.experts.7.w1
389
+ - model.layers.13.block_sparse_moe.experts.7.w1
390
+ - model.layers.45.block_sparse_moe.experts.7.w1
391
+ - model.layers.46.block_sparse_moe.experts.7.w1
392
+ - model.layers.44.block_sparse_moe.experts.7.w1
393
+ - model.layers.18.block_sparse_moe.experts.7.w1
394
+ - model.layers.43.block_sparse_moe.experts.7.w1
395
+ # block_sparse_moe.experts.7.w2 layers
396
+ - model.layers.34.block_sparse_moe.experts.7.w2
397
+ - model.layers.33.block_sparse_moe.experts.7.w2
398
+ - model.layers.44.block_sparse_moe.experts.7.w2
399
+ - model.layers.46.block_sparse_moe.experts.7.w2
400
+ - model.layers.41.block_sparse_moe.experts.7.w2
401
+ - model.layers.42.block_sparse_moe.experts.7.w2
402
+ - model.layers.37.block_sparse_moe.experts.7.w2
403
+ - model.layers.39.block_sparse_moe.experts.7.w2
404
+ - model.layers.40.block_sparse_moe.experts.7.w2
405
+ - model.layers.43.block_sparse_moe.experts.7.w2
406
+ - model.layers.35.block_sparse_moe.experts.7.w2
407
+ - model.layers.36.block_sparse_moe.experts.7.w2
408
+ - model.layers.48.block_sparse_moe.experts.7.w2
409
+ - model.layers.38.block_sparse_moe.experts.7.w2
410
+ # block_sparse_moe.experts.7.w3 layers
411
+ - model.layers.47.block_sparse_moe.experts.7.w3
412
+ - model.layers.46.block_sparse_moe.experts.7.w3
413
+ - model.layers.45.block_sparse_moe.experts.7.w3
414
+ - model.layers.44.block_sparse_moe.experts.7.w3
415
+ - model.layers.40.block_sparse_moe.experts.7.w3
416
+ - model.layers.48.block_sparse_moe.experts.7.w3
417
+ - model.layers.43.block_sparse_moe.experts.7.w3
418
+ - model.layers.41.block_sparse_moe.experts.7.w3
419
+ - model.layers.39.block_sparse_moe.experts.7.w3
420
+ - model.layers.42.block_sparse_moe.experts.7.w3
421
+ - model.layers.49.block_sparse_moe.experts.7.w3
422
+ - model.layers.31.block_sparse_moe.experts.7.w3
423
+ - model.layers.37.block_sparse_moe.experts.7.w3
424
+ - model.layers.35.block_sparse_moe.experts.7.w3
425
+ # block_sparse_moe.gate layers
426
+ - model.layers.0.block_sparse_moe.gate
427
+ - model.layers.1.block_sparse_moe.gate
428
+ - model.layers.2.block_sparse_moe.gate
429
+ - model.layers.3.block_sparse_moe.gate
430
+ - model.layers.4.block_sparse_moe.gate
431
+ - model.layers.5.block_sparse_moe.gate
432
+ - model.layers.6.block_sparse_moe.gate
433
+ - model.layers.7.block_sparse_moe.gate
434
+ - model.layers.8.block_sparse_moe.gate
435
+ - model.layers.9.block_sparse_moe.gate
436
+ - model.layers.10.block_sparse_moe.gate
437
+ - model.layers.11.block_sparse_moe.gate
438
+ - model.layers.12.block_sparse_moe.gate
439
+ - model.layers.13.block_sparse_moe.gate
440
+ # self_attn.k_proj layers
441
+ - model.layers.46.self_attn.k_proj
442
+ - model.layers.48.self_attn.k_proj
443
+ - model.layers.45.self_attn.k_proj
444
+ - model.layers.39.self_attn.k_proj
445
+ - model.layers.44.self_attn.k_proj
446
+ - model.layers.47.self_attn.k_proj
447
+ - model.layers.51.self_attn.k_proj
448
+ - model.layers.36.self_attn.k_proj
449
+ - model.layers.35.self_attn.k_proj
450
+ - model.layers.41.self_attn.k_proj
451
+ - model.layers.42.self_attn.k_proj
452
+ - model.layers.38.self_attn.k_proj
453
+ - model.layers.43.self_attn.k_proj
454
+ - model.layers.34.self_attn.k_proj
455
+ # self_attn.o_proj layers
456
+ - model.layers.20.self_attn.o_proj
457
+ - model.layers.19.self_attn.o_proj
458
+ - model.layers.16.self_attn.o_proj
459
+ - model.layers.13.self_attn.o_proj
460
+ - model.layers.18.self_attn.o_proj
461
+ - model.layers.17.self_attn.o_proj
462
+ - model.layers.42.self_attn.o_proj
463
+ - model.layers.12.self_attn.o_proj
464
+ - model.layers.14.self_attn.o_proj
465
+ - model.layers.15.self_attn.o_proj
466
+ - model.layers.40.self_attn.o_proj
467
+ - model.layers.22.self_attn.o_proj
468
+ - model.layers.23.self_attn.o_proj
469
+ - model.layers.38.self_attn.o_proj
470
+ # self_attn.q_proj layers
471
+ - model.layers.0.self_attn.q_proj
472
+ - model.layers.1.self_attn.q_proj
473
+ - model.layers.2.self_attn.q_proj
474
+ - model.layers.22.self_attn.q_proj
475
+ - model.layers.27.self_attn.q_proj
476
+ - model.layers.28.self_attn.q_proj
477
+ - model.layers.13.self_attn.q_proj
478
+ - model.layers.21.self_attn.q_proj
479
+ - model.layers.24.self_attn.q_proj
480
+ - model.layers.33.self_attn.q_proj
481
+ - model.layers.14.self_attn.q_proj
482
+ - model.layers.11.self_attn.q_proj
483
+ - model.layers.15.self_attn.q_proj
484
+ - model.layers.20.self_attn.q_proj
485
+ # self_attn.v_proj layers
486
+ - model.layers.32.self_attn.v_proj
487
+ - model.layers.34.self_attn.v_proj
488
+ - model.layers.35.self_attn.v_proj
489
+ - model.layers.38.self_attn.v_proj
490
+ - model.layers.41.self_attn.v_proj
491
+ - model.layers.46.self_attn.v_proj
492
+ - model.layers.22.self_attn.v_proj
493
+ - model.layers.29.self_attn.v_proj
494
+ - model.layers.36.self_attn.v_proj
495
+ - model.layers.45.self_attn.v_proj
496
+ - model.layers.31.self_attn.v_proj
497
+ - model.layers.5.self_attn.v_proj
498
+ - model.layers.44.self_attn.v_proj
499
+ - model.layers.8.self_attn.v_proj
500
+
501
+ # adapter: lora
502
+ # lora_model_dir:
503
+ # lora_r: 32
504
+ # lora_alpha: 16
505
+ # lora_dropout: 0.05
506
+ # lora_target_linear: true
507
+ # lora_fan_in_fan_out:
508
+
509
+ wandb_project: mixtral-8x22b
510
+ wandb_entity:
511
+ wandb_watch:
512
+ wandb_name:
513
+ wandb_log_model:
514
+
515
+ gradient_accumulation_steps: 8
516
+ micro_batch_size: 1
517
+ num_epochs: 3
518
+ optimizer: adamw_8bit
519
+ lr_scheduler: cosine
520
+ learning_rate: 1e-5
521
+
522
+ train_on_inputs: false
523
+ group_by_length: false
524
+ bf16: auto
525
+ fp16:
526
+ tf32: false
527
+
528
+ gradient_checkpointing: true
529
+ early_stopping_patience:
530
+ resume_from_checkpoint:
531
+ local_rank:
532
+ logging_steps: 1
533
+ xformers_attention:
534
+ flash_attention: true
535
+
536
+ warmup_steps: 10
537
+ evals_per_epoch: 2
538
+ eval_table_size:
539
+ eval_max_new_tokens: 128
540
+ saves_per_epoch: 4
541
+ save_total_limit: 2
542
+ debug:
543
+ deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json
544
+ weight_decay: 0.1
545
+ fsdp:
546
+ fsdp_config:
547
+ special_tokens:
548
+ eos_token: "<|im_end|>"
549
+ unk_token: "<unk>"
550
+ bos_token: "<s>"
551
+ tokens:
552
+ - "<|im_start|>"