thomasgauthier committed
Commit ab02792
1 Parent(s): 25af704

Added expert extraction code

Files changed (1)
  1. README.md +106 -1
README.md CHANGED
@@ -35,4 +35,109 @@ The following named weight correspondence was used:
  | [Unmixtraled-22B-v0.1-expert-5](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-5) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 5 MLPs | 1099.32373046875 |
  | [Unmixtraled-22B-v0.1-expert-6](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-6) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 6 MLPs | 341.5309753417969 |
  | [Unmixtraled-22B-v0.1-expert-7](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-7) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 7 MLPs | 2099.63818359375 |
- | [Unmixtraled-22B-v0.1-lerp](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs | 1873.9874267578125 |
+ | [Unmixtraled-22B-v0.1-lerp](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs | 1873.9874267578125 |
+
+ # Code
+
+ The following code was used to extract the experts and construct the dense models:
+
+ ```python
+ # pip install -U transformers huggingface_hub "git+https://github.com/arcee-ai/mergekit@7467108c05d56ef2bb4b8f33936d437dc448f7dd"
+
+ import fnmatch
+ import json
+ import os
+ import re
+ import shutil
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from mergekit.architecture import get_architecture_info
+ from mergekit.common import ModelReference
+ from mergekit.io import LazyTensorLoader, TensorWriter
+ from tqdm import tqdm
+
+ MIXTRAL_MODEL_ID = "mistral-community/Mixtral-8x22B-v0.1"
+ MIXTRAL_PATH = snapshot_download(repo_id=MIXTRAL_MODEL_ID)
+ print(f"Mixtral downloaded to: {MIXTRAL_PATH}")
+
+ MISTRAL_PATH = snapshot_download(
+     repo_id="mistralai/Mistral-7B-v0.1", allow_patterns=["config.json"]
+ )
+ print(f"Mistral config downloaded to: {MISTRAL_PATH}")
+
+ with open(os.path.join(MISTRAL_PATH, "config.json"), "r") as f:
+     mistral_config = json.load(f)
+
+ with open(os.path.join(MIXTRAL_PATH, "config.json"), "r") as f:
+     mixtral_config = json.load(f)
+
+ combined_config = {
+     key: mixtral_config[key] for key in mistral_config if key in mixtral_config
+ }
+ combined_config["architectures"] = ["MistralForCausalLM"]
+ combined_config["model_type"] = "mistral"
+
+ mixtral_model_ref = ModelReference.parse(MIXTRAL_PATH)
+ mixtral_architecture_info = get_architecture_info(mixtral_model_ref.config())
+ mixtral_loader = LazyTensorLoader(mixtral_model_ref.tensor_index(), lazy_unpickle=True)
+
+ ALLOW_LIST = ["generation_config.json", "tokenizer.model", "tokenizer_config.json"]
+
+ def copy_directory(src, dest, allowed_patterns):
+     os.makedirs(dest, exist_ok=True)
+     for root, dirs, files in os.walk(src):
+         # Only keep directories that match at least one of the allowed patterns
+         dirs[:] = [d for d in dirs if any(fnmatch.fnmatch(d, pattern) for pattern in allowed_patterns)]
+         for file in files:
+             # Only copy files that match at least one of the allowed patterns
+             if any(fnmatch.fnmatch(file, pattern) for pattern in allowed_patterns):
+                 src_path = os.path.join(root, file)
+                 dest_path = os.path.join(dest, os.path.relpath(src_path, src))
+                 os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+                 shutil.copy2(src_path, dest_path)
+
+ def get_tensor(layer_num, expert_num, tensor_type):
+     weight_name = f"model.layers.{layer_num}.block_sparse_moe.experts.{expert_num}.{tensor_type}.weight"
+     return mixtral_loader.get_tensor(weight_name)
+
+
+ def extract_layer_number(string):
+     match = re.search(r"layers\.(\d+)\.", string)
+     return int(match.group(1)) if match else None
+
+
+ def save_expert_as_dense(output_path, expert_num):
+     dense_model_ref = ModelReference.parse(output_path)
+     dense_architecture_info = get_architecture_info(dense_model_ref.config())
+
+     writer = TensorWriter(output_path, safe_serialization=True)
+
+     for weight_info in tqdm(dense_architecture_info.all_weights(dense_model_ref.config())):
+         if weight_info.name.endswith(".up_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w3"))
+         elif weight_info.name.endswith(".down_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w2"))
+         elif weight_info.name.endswith(".gate_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w1"))
+         else:
+             writer.save_tensor(weight_info.name, mixtral_loader.get_tensor(weight_info.name))
+
+     writer.finalize()
+
+
+ num_experts = mixtral_config["num_local_experts"]
+
+ for expert_num in range(num_experts):
+     dense_path = f"./dense_expert_{expert_num}"
+     copy_directory(MIXTRAL_PATH, dense_path, ALLOW_LIST)
+
+     with open(os.path.join(dense_path, "config.json"), "w") as f:
+         json.dump(combined_config, f, indent=2)
+
+     save_expert_as_dense(dense_path, expert_num)
+     print(f"Dense model #{expert_num} saved to {os.path.abspath(dense_path)}")
+ ```
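
The Unmixtraled-22B-v0.1-lerp entry in the table above is a linear merge of the eight extracted expert MLPs. The merge configuration itself is not part of this commit; the sketch below shows one way such a merge could be expressed with mergekit's documented Python entry point, assuming the eight dense experts were written to `./dense_expert_0` through `./dense_expert_7` by the script above. The output path, weights, and options are illustrative, not the settings actually used.

```python
# Hypothetical sketch: uniform linear merge of the eight dense experts with mergekit.
# The actual config used for Unmixtraled-22B-v0.1-lerp is not included in this commit.
from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

config_dict = {
    "merge_method": "linear",
    "dtype": "bfloat16",
    "models": [
        # Equal weights (0.125 each) give a uniform average of the eight experts
        {"model": f"./dense_expert_{i}", "parameters": {"weight": 0.125}}
        for i in range(8)
    ],
}

merge_config = MergeConfiguration.model_validate(config_dict)

run_merge(
    merge_config,
    out_path="./unmixtraled_lerp",  # illustrative output directory
    options=MergeOptions(copy_tokenizer=True, lazy_unpickle=True),
)
```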
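
Each extracted expert (and the lerp merge) is saved as a plain dense `MistralForCausalLM` checkpoint, so it loads like any other Mistral-architecture model. A minimal usage sketch, with the model ID taken from the table above and dtype/device settings chosen for illustration:

```python
# Minimal sketch: load one of the extracted dense experts with transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "thomasgauthier/Unmixtraled-22B-v0.1-expert-6"  # any expert or the lerp merge works the same way

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

inputs = tokenizer("The Mixtral 8x22B experts", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```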