---
license: apache-2.0
tags:
- mixtral
- dense
- mistral
- expert
---
# Unmixtraled 22B 8x linear merge
> [!WARNING]
> This model outputs gibberish as it was not trained under the dense configuration. Finetuning or merging is needed to make this model useful.

This is a 22B Mistral model recycling weights from [mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1).
The model was adapted from the Mixtral architecture to a dense Mistral architecture with the same number of layers, attention heads, and hidden dimensions.
The embedding, attention, layer norm, and LM head weights were taken directly from the 8x22B model, while the MLP weights are a linear merge of the weights of experts 0 through 7.
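Concretely, for every layer the merged model's `gate_proj`, `down_proj` and `up_proj` matrices combine the corresponding tensors from the eight experts, which with the unweighted config further down presumably amounts to a plain element-wise average. A minimal sketch of that operation (the real merge was done with mergekit; `expert_tensors` is a placeholder for one layer's eight expert matrices):

```python
import torch

def linear_merge(expert_tensors: list[torch.Tensor]) -> torch.Tensor:
    # With uniform weights, a linear merge is just an element-wise mean
    # over the eight expert matrices for a given projection.
    return torch.stack(expert_tensors).mean(dim=0)
```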
The following named weight correspondence was used:

| Mistral weight | Mixtral weight |
|----------------|------------------------------|
| `gate_proj` | `experts.{expert_num}.w1` |
| `down_proj` | `experts.{expert_num}.w2` |
| `up_proj` | `experts.{expert_num}.w3` |
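In other words, each dense-layer MLP tensor name maps onto a Mixtral expert tensor name roughly as follows (an illustrative sketch; the real renaming logic is in the extraction code at the bottom of this card):

```python
def mistral_to_mixtral_names(layer_num: int, expert_num: int) -> dict[str, str]:
    # Illustrative only: maps dense (Mistral) MLP weight names to the Mixtral
    # expert weights they were taken from, per the table above.
    prefix = f"model.layers.{layer_num}"
    expert = f"{prefix}.block_sparse_moe.experts.{expert_num}"
    return {
        f"{prefix}.mlp.gate_proj.weight": f"{expert}.w1.weight",
        f"{prefix}.mlp.down_proj.weight": f"{expert}.w2.weight",
        f"{prefix}.mlp.up_proj.weight": f"{expert}.w3.weight",
    }
```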
This mergekit configuration was used to merge the experts:
```yaml
models:
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-0
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-1
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-2
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-3
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-4
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-5
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-6
- model: thomasgauthier/Unmixtraled-22B-v0.1-expert-7
merge_method: linear
dtype: float16
```
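Assuming mergekit is installed (the pinned commit from the code below, or a recent release), a config like this is normally applied with the `mergekit-yaml` CLI (`mergekit-yaml merge_config.yml ./Unmixtraled-22B-v0.1-lerp`) or the Python API. A sketch of the latter; option names may differ slightly across mergekit versions:

```python
import yaml
from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

# Assumes the YAML above has been saved as merge_config.yml next to this script.
with open("merge_config.yml") as f:
    merge_config = MergeConfiguration.model_validate(yaml.safe_load(f))

run_merge(
    merge_config,
    out_path="./Unmixtraled-22B-v0.1-lerp",
    options=MergeOptions(copy_tokenizer=True, lazy_unpickle=True),
)
```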
## Unmixtraled models
| Expert | Source | Wikitext perplexity |
|--------|-----------------|---------------------|
| [Unmixtraled-22B-v0.1-expert-0](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-0) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 0 MLPs | 696.6932983398438 |
| [Unmixtraled-22B-v0.1-expert-1](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-1) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 1 MLPs | 6853.04248046875 |
| [Unmixtraled-22B-v0.1-expert-2](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-2) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 2 MLPs | 4689.181640625 |
| [Unmixtraled-22B-v0.1-expert-3](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-3) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 3 MLPs | 782.3755493164062 |
| [Unmixtraled-22B-v0.1-expert-4](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-4) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 4 MLPs | 2844.943603515625 |
| [Unmixtraled-22B-v0.1-expert-5](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-5) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 5 MLPs | 1099.32373046875 |
| [Unmixtraled-22B-v0.1-expert-6](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-6) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 6 MLPs | 341.5309753417969 |
| [Unmixtraled-22B-v0.1-expert-7](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-7) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 7 MLPs | 2099.63818359375 |
| [**Unmixtraled-22B-v0.1-lerp**](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | **Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs** | **1873.9874267578125** |
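The exact evaluation setup is not documented here; a common way to reproduce a Wikitext perplexity number of this kind with `transformers` looks roughly like the sketch below (dataset split, context length, and striding are assumptions, so absolute values may differ):

```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "thomasgauthier/Unmixtraled-22B-v0.1-lerp"  # any of the models above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
input_ids = tokenizer(text, return_tensors="pt").input_ids

window = 2048  # assumed context length for the evaluation
nlls = []
for start in range(0, input_ids.size(1) - 1, window):
    chunk = input_ids[:, start : start + window].to(model.device)
    with torch.no_grad():
        # loss is the mean token negative log-likelihood over the chunk
        nlls.append(model(chunk, labels=chunk).loss.float())
print(f"Perplexity: {torch.exp(torch.stack(nlls).mean()).item():.2f}")
```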
## Code
The following code was used to extract the experts and construct the dense models:
```python
# pip install -U transformers huggingface_hub "git+https://github.com/arcee-ai/mergekit@7467108c05d56ef2bb4b8f33936d437dc448f7dd"
import fnmatch
import json
import os
import re
import shutil

import torch
from huggingface_hub import snapshot_download
from mergekit.architecture import get_architecture_info
from mergekit.common import ModelReference
from mergekit.io import LazyTensorLoader, TensorWriter
from tqdm import tqdm

# Download the full Mixtral 8x22B checkpoint and only the config of Mistral 7B
# (used as a template for the dense config).
MIXTRAL_MODEL_ID = "mistral-community/Mixtral-8x22B-v0.1"
MIXTRAL_PATH = snapshot_download(repo_id=MIXTRAL_MODEL_ID)
print(f"Mixtral downloaded to: {MIXTRAL_PATH}")

MISTRAL_PATH = snapshot_download(
    repo_id="mistralai/Mistral-7B-v0.1", allow_patterns=["config.json"]
)
print(f"Mistral config downloaded to: {MISTRAL_PATH}")

with open(os.path.join(MISTRAL_PATH, "config.json"), "r") as f:
    mistral_config = json.load(f)

with open(os.path.join(MIXTRAL_PATH, "config.json"), "r") as f:
    mixtral_config = json.load(f)

# Build the dense config: keep every key the Mistral config knows about,
# but take its value from the Mixtral config (hidden size, layer count, etc.).
combined_config = {
    key: mixtral_config[key] for key in mistral_config if key in mixtral_config
}
combined_config["architectures"] = ["MistralForCausalLM"]
combined_config["model_type"] = "mistral"

# Lazily load Mixtral tensors so the full model never has to sit in memory at once.
mixtral_model_ref = ModelReference.parse(MIXTRAL_PATH)
mixtral_architecture_info = get_architecture_info(mixtral_model_ref.config())
mixtral_loader = LazyTensorLoader(mixtral_model_ref.tensor_index(), lazy_unpickle=True)

# Auxiliary files copied verbatim into each dense model directory.
ALLOW_LIST = ["generation_config.json", "tokenizer.model", "tokenizer_config.json"]

def copy_directory(src, dest, allowed_patterns):
    os.makedirs(dest, exist_ok=True)
    for root, dirs, files in os.walk(src):
        # Only keep directories that match at least one of the allowed patterns
        dirs[:] = [d for d in dirs if any(fnmatch.fnmatch(d, pattern) for pattern in allowed_patterns)]
        for file in files:
            # Only copy files that match at least one of the allowed patterns
            if any(fnmatch.fnmatch(file, pattern) for pattern in allowed_patterns):
                src_path = os.path.join(root, file)
                dest_path = os.path.join(dest, os.path.relpath(src_path, src))
                os.makedirs(os.path.dirname(dest_path), exist_ok=True)
                shutil.copy2(src_path, dest_path)

def get_tensor(layer_num, expert_num, tensor_type):
    # tensor_type is one of "w1" (gate_proj), "w2" (down_proj), "w3" (up_proj)
    weight_name = f"model.layers.{layer_num}.block_sparse_moe.experts.{expert_num}.{tensor_type}.weight"
    return mixtral_loader.get_tensor(weight_name)


def extract_layer_number(string):
    match = re.search(r"layers\.(\d+)\.", string)
    return int(match.group(1)) if match else None

def save_expert_as_dense(output_path, expert_num):
    # Walk every weight expected by the dense (Mistral) architecture and fill it
    # either from the chosen expert's MLPs or directly from the shared Mixtral weights.
    dense_model_ref = ModelReference.parse(output_path)
    dense_architecture_info = get_architecture_info(dense_model_ref.config())
    writer = TensorWriter(output_path, safe_serialization=True)
    for weight_info in tqdm(dense_architecture_info.all_weights(dense_model_ref.config())):
        if weight_info.name.endswith(".up_proj.weight"):
            layer_num = extract_layer_number(weight_info.name)
            writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w3"))
        elif weight_info.name.endswith(".down_proj.weight"):
            layer_num = extract_layer_number(weight_info.name)
            writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w2"))
        elif weight_info.name.endswith(".gate_proj.weight"):
            layer_num = extract_layer_number(weight_info.name)
            writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w1"))
        else:
            # Embeddings, attention, layer norms and lm_head are shared: copy as-is
            writer.save_tensor(weight_info.name, mixtral_loader.get_tensor(weight_info.name))
    writer.finalize()

num_experts = mixtral_config["num_local_experts"]

for expert_num in range(num_experts):
    dense_path = f"./dense_expert_{expert_num}"
    # Copy the allowed auxiliary files, write the dense config, then extract this expert's weights
    copy_directory(MIXTRAL_PATH, dense_path, ALLOW_LIST)
    with open(os.path.join(dense_path, "config.json"), "w") as f:
        json.dump(combined_config, f, indent=2)
    save_expert_as_dense(dense_path, expert_num)
    print(f"Dense model #{expert_num} saved to {os.path.abspath(dense_path)}")
```
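Each resulting directory is a self-contained dense checkpoint and loads like any other Mistral model (bearing in mind the gibberish warning at the top of this card). A quick smoke test, assuming the loop above has produced `./dense_expert_0`:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./dense_expert_0")
model = AutoModelForCausalLM.from_pretrained(
    "./dense_expert_0", torch_dtype=torch.float16, device_map="auto"
)
inputs = tokenizer("The quick brown fox", return_tensors="pt").to(model.device)
# Expect low-quality output: the dense recombination was never trained
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```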