Update README.md

README.md (CHANGED)
@@ -9,9 +9,47 @@ pipeline_tag: text-generation
<img src="https://cdn-uploads.huggingface.co/production/uploads/64740cf7485a7c8e1bd51ac9/GO4MY_3adP2G9EHKZbZpg.webp" width="500" height="500">

-This model is the
@@ -164,227 +202,3 @@ In summary, bees contribute significantly to ecosystems beyond pollination by en
---

-## Training Snapshot

```
Step    Training Loss
   1    3.223000
   2    3.221300
   3    3.215900
   4    3.210600
   5    3.203000
   6    3.193500
   7    3.184000
   8    3.173400
   9    3.162400
  10    3.151500
  11    3.140500
  12    3.128800
  13    3.117600
  14    3.106700
  15    3.095500
  16    3.084700
  17    3.073700
  18    3.062700
  19    3.052300
  20    3.041800

 201    1.273200
 202    1.257600
 203    1.241900
 204    1.226100
 205    1.210800
 206    1.195500
 207    1.180800
 208    1.166000
 209    1.151200
 210    1.136900
 211    1.122000
 212    1.106600
 213    1.091200
 214    1.075200
 215    1.059200
 216    1.042900
 217    1.026600
 218    1.010300
 219    0.994200

 416    0.041700
 417    0.041700
 418    0.041600
 419    0.041600
 420    0.041600
 421    0.041600
 422    0.041500
 423    0.041500
 424    0.041500
 425    0.041400
 426    0.041400
 427    0.041400
 428    0.041400
 429    0.041300
 430    0.041300
 431    0.041300
 432    0.041200
 433    0.041200
 434    0.041200
 435    0.041100
 436    0.041200
 437    0.041100
 438    0.041100
 439    0.041100
 440    0.041000
 441    0.041000
 442    0.041000
 443    0.040900
 444    0.040900
 445    0.040900

 668    0.035200
 669    0.035100
 670    0.035100
 671    0.035100
 672    0.035100
 673    0.035000
 674    0.035000
 675    0.035000
 676    0.035000
 677    0.034900
 678    0.034900
 679    0.034900
 680    0.034800
 681    0.034800
 682    0.034800
 683    0.034800
 684    0.034800
 685    0.034700
 686    0.034700
 687    0.034700
 688    0.034700
 689    0.034600
 690    0.034600
 691    0.034600
 692    0.034600
 693    0.034500
 694    0.034500
 695    0.034500
 696    0.034400
 697    0.034400
 698    0.034400
 699    0.034400
 700    0.034300
 701    0.034300
 702    0.034300
 703    0.034300
 704    0.034200
 705    0.034200
 706    0.034200
 707    0.034200
 708    0.034100
 709    0.034100
 710    0.034100
 711    0.034100
 712    0.034000
 713    0.034000
 714    0.034000
 715    0.034000
 716    0.033900
 717    0.033900
 718    0.033800
 719    0.033800
 720    0.033800
 721    0.033800

1209    0.006600
1210    0.006500
1211    0.006300
1212    0.006200
1213    0.006100
1214    0.006000
1215    0.005800
1216    0.005700
1217    0.005600
1218    0.005500
1219    0.005400
1220    0.005300
1221    0.005100
1222    0.004900
1223    0.004800
1224    0.004700
1225    0.004600
1226    0.004500
1227    0.004400
1228    0.004300
1229    0.004200
1230    0.004000
1231    0.003900
1232    0.003800
1233    0.003700
1234    0.003500
1235    0.003400
1236    0.003300
1237    0.003200
1238    0.003000
1239    0.003000
1240    0.002900
1241    0.002800
1242    0.002700
1243    0.002600
1244    0.002500
1245    0.002400
1246    0.002300
1247    0.002200
1248    0.002100
1249    0.002000
1250    0.001900
1251    0.001800
1252    0.001800
1253    0.001700
1254    0.001600
1255    0.001600
1256    0.001500
1257    0.001400
1258    0.001300
1259    0.001300
1260    0.001200
1261    0.001200
1262    0.001100
1263    0.001100
1264    0.001000
1265    0.001000
1266    0.000900
1267    0.000900
1268    0.000800
1269    0.000800
1270    0.000800
1271    0.000800
1272    0.000700
1273    0.000700
1274    0.000700
1275    0.000600
1276    0.000600
1277    0.000600
1278    0.000600
1279    0.000500
1280    0.000500
1281    0.000500
1282    0.000500
1283    0.000500
1284    0.000500
1285    0.000500
1286    0.000400
1287    0.000400
1288    0.000400
1289    0.000400
1290    0.000400
1291    0.000400
1292    0.000400
1293    0.000400
1294    0.000400
1295    0.000400
1296    0.000400
1297    0.000300
1298    0.000300
```

+**This model is the MLX-trained version, fine-tuned on the experimental 'Internal Knowledge Map' dataset.** Training was done 100% locally on my M2 Ultra (128 GB). I've found noticeable differences between the Transformers (Unsloth) version I trained in a Colab and the ones I've been training locally with MLX. I personally prefer the MLX ones, as they seem to train MUCH better and retain more of what the fine-tuning adds. I also ripped this model to shreds in testing and it still seems to work amazingly well. Here was my training setup:

```
# LoRA fine-tuning setup for this run (MLX). Assumes `model` is already loaded and that
# LoRALinear, TrainingArgs, and tree_flatten come from mlx-lm's tuner utilities and mlx.utils.
model.freeze()
for l in model.model.layers:
    # Adapt the attention projections with high-rank LoRA layers.
    l.self_attn.q_proj = LoRALinear.from_linear(
        l.self_attn.q_proj, r=128, lora_alpha=256, lora_dropout=0.001
    )
    l.self_attn.v_proj = LoRALinear.from_linear(
        l.self_attn.v_proj, r=128, lora_alpha=256, lora_dropout=0.001
    )
    l.self_attn.o_proj = LoRALinear.from_linear(
        l.self_attn.o_proj, r=128, lora_alpha=256, lora_dropout=0.001
    )

    l.self_attn.k_proj = LoRALinear.from_linear(
        l.self_attn.k_proj, r=128, lora_alpha=256
    )

    # For MoE layers, also adapt the router gate with a smaller rank.
    if hasattr(l, "block_sparse_moe"):
        l.block_sparse_moe.gate = LoRALinear.from_linear(
            l.block_sparse_moe.gate, r=16, lora_alpha=32, lora_dropout=0.001
        )

# Report total vs. trainable parameter counts (in millions).
p = sum(v.size for _, v in tree_flatten(model.parameters())) / 10**6
print(f"Total parameters {p:.3f}M")
p = sum(v.size for _, v in tree_flatten(model.trainable_parameters())) / 10**6
print(f"Trainable parameters {p:.3f}M")

trainingArgs = TrainingArgs(
    batch_size=4,
    iters=4200,
    val_batches=25,
    steps_per_report=10,
    steps_per_eval=200,
    steps_per_save=100,
    adapter_file="adapters.npz",
    max_seq_length=4096,
)
```
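
The `adapters.npz` file named above ends up holding only the LoRA weights. A minimal sketch of loading it back onto the same LoRA-wrapped model for a quick sanity check; this is illustrative only and assumes MLX's standard `load_weights` behavior, it is not part of the run described above:

```
# Illustrative only (not from the training run above): reload the saved LoRA
# adapter weights onto the same LoRA-wrapped model.
model.load_weights("adapters.npz", strict=False)  # strict=False: the file contains only adapter weights
model.eval()
```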

Test this out and see if you find anything interesting or intriguing. I will keep iterating on more versions, but this one seems like a fun and useful way to start.
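
For a quick local test, a minimal sketch with the `mlx-lm` package is below; the repository id and the prompt are placeholders, not values from this card:

```
# Minimal local test with mlx-lm (repo id and prompt are placeholders).
from mlx_lm import load, generate

model, tokenizer = load("your-username/your-mlx-model")  # or a local path to the weights
response = generate(
    model,
    tokenizer,
    prompt="In a few sentences, how do bees support ecosystems beyond pollination?",
    max_tokens=256,
    verbose=True,  # stream the generated text to stdout
)
```

Swap in whatever prompt style you used during fine-tuning.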