diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..1f9be8bbe6c8b10a2eacdb9690f27ea2ecccb7a4 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/cfg.yml b/cfg.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8fbff893f64bec234de0fdfded92133fc9aa2437
--- /dev/null
+++ b/cfg.yml
@@ -0,0 +1,9 @@
+
+models:
+  - model: Sao10K/L3.1-8B-Niitama-v1.1
+  - model: akjindal53244/Llama-3.1-Storm-8B
+merge_method: slerp
+base_model: Sao10K/L3.1-8B-Niitama-v1.1
+parameters:
+  t: 0.0001
+dtype: bfloat16
diff --git a/merged/README.md b/merged/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..190c12a24e4b5035ac18f2e3bfe441fa6d4bf36f
--- /dev/null
+++ b/merged/README.md
@@ -0,0 +1,41 @@
+---
+base_model:
+- Sao10K/L3.1-8B-Niitama-v1.1
+- akjindal53244/Llama-3.1-Storm-8B
+library_name: transformers
+tags:
+- mergekit
+- merge
+
+---
+# Untitled Model (1)
+
+This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
+
+## Merge Details
+### Merge Method
+
+This model was merged using the SLERP merge method.
+
+### Models Merged
+
+The following models were included in the merge:
+* [Sao10K/L3.1-8B-Niitama-v1.1](https://huggingface.co/Sao10K/L3.1-8B-Niitama-v1.1)
+* [akjindal53244/Llama-3.1-Storm-8B](https://huggingface.co/akjindal53244/Llama-3.1-Storm-8B)
+
+### Configuration
+
+The following YAML configuration was used to produce this model:
+
+```yaml
+
+models:
+  - model: Sao10K/L3.1-8B-Niitama-v1.1
+  - model: akjindal53244/Llama-3.1-Storm-8B
+merge_method: slerp
+base_model: Sao10K/L3.1-8B-Niitama-v1.1
+parameters:
+  t: 0.0001
+dtype: bfloat16
+
+```
diff --git a/merged/config.json b/merged/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c3c9cecacf6c75d7b7d929bd9e1d050ff5f1f82b
--- /dev/null
+++ b/merged/config.json
@@ -0,0 +1,40 @@
+{
+  "_name_or_path": "Sao10K/L3.1-8B-Niitama-v1.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/merged/mergekit_config.yml b/merged/mergekit_config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8fbff893f64bec234de0fdfded92133fc9aa2437
--- /dev/null
+++ b/merged/mergekit_config.yml
@@ -0,0 +1,9 @@
+
+models:
+  - model: Sao10K/L3.1-8B-Niitama-v1.1
+  - model: akjindal53244/Llama-3.1-Storm-8B
+merge_method: slerp
+base_model: Sao10K/L3.1-8B-Niitama-v1.1
+parameters:
+  t: 0.0001
+dtype: bfloat16
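
The `merge_method: slerp` and `t: 0.0001` settings above describe spherical linear interpolation between corresponding weight tensors of the two models, with `t` measuring how far to move from the base model (`t = 0`) toward the second model (`t = 1`); a value this close to zero keeps the result essentially at Sao10K/L3.1-8B-Niitama-v1.1 with only a trace of akjindal53244/Llama-3.1-Storm-8B. The PyTorch sketch below is a minimal illustration of that interpolation, not mergekit's actual implementation; the tensor names, the flattening of each tensor to a vector, and the near-colinear fallback are assumptions made for the example.

```python
# Illustrative SLERP between two weight tensors (a sketch, not mergekit's code).
import torch

def slerp(t: float, a: torch.Tensor, b: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Interpolate from `a` (t=0) toward `b` (t=1) along the arc between them."""
    a_flat = a.flatten().float()
    b_flat = b.flatten().float()
    a_unit = a_flat / (a_flat.norm() + eps)
    b_unit = b_flat / (b_flat.norm() + eps)
    # Angle between the two tensors viewed as vectors.
    omega = torch.arccos(torch.clamp(torch.dot(a_unit, b_unit), -1.0, 1.0))
    if omega.abs() < eps:
        # Nearly colinear: plain linear interpolation is numerically safer.
        return (1.0 - t) * a + t * b
    sin_omega = torch.sin(omega)
    out = (torch.sin((1.0 - t) * omega) / sin_omega) * a_flat \
        + (torch.sin(t * omega) / sin_omega) * b_flat
    return out.reshape(a.shape).to(a.dtype)

# With t = 0.0001 the merged tensor is dominated by the first (base) tensor.
base_weight = torch.randn(4096, 4096)
other_weight = torch.randn(4096, 4096)
merged_weight = slerp(0.0001, base_weight, other_weight)
```

The actual merge is performed by mergekit from the configuration above; the snippet only shows the per-tensor interpolation that the `slerp` method names.

diff --git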
a/merged/model-00001-of-00004.safetensors b/merged/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d46d4c61f2a3a4941c9ec69c99ce020de2d9053 --- /dev/null +++ b/merged/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af9a5b2faf534b3436326bdbeefa80b0fd86f2bbd8e7f0928c00159d377fb203 +size 4953586384 diff --git a/merged/model-00002-of-00004.safetensors b/merged/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..06d7d563fff37902e87fb2972af6836bdc3e7e21 --- /dev/null +++ b/merged/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847a742e9d75abd2a5a0e5dc5115a49ad2a65eca3d7fc892cda15562279eaf6b +size 4999819336 diff --git a/merged/model-00003-of-00004.safetensors b/merged/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..529d86b7e3147ea059891e92f69eb8da5fb748aa --- /dev/null +++ b/merged/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751446ae7557167d61feda071f498d9dfe92724c3edd0369d155d0d3f41d6032 +size 4915916144 diff --git a/merged/model-00004-of-00004.safetensors b/merged/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..baef5621905f666ef0023171f623f662980fccc6 --- /dev/null +++ b/merged/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c10b6ad7984b33e5b85099c8efca491933b98159eef931b841690e769d8e52aa +size 1191234472 diff --git a/merged/model.safetensors.index.json b/merged/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..800be80123800bcbd3c6ddfe4e9e188533cea798 --- /dev/null +++ b/merged/model.safetensors.index.json @@ -0,0 +1 @@ +{"metadata": {"mergekit_version": "0.0.5.2", "total_size": 16060522496}, "weight_map": {"lm_head.weight": "model-00001-of-00004.safetensors", "model.embed_tokens.weight": "model-00001-of-00004.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", 
"model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00004.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", 
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", 
"model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", 
"model.layers.24.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00002-of-00004.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00002-of-00004.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", "model.layers.25.input_layernorm.weight": "model-00002-of-00004.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", 
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", 
"model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", "model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.input_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.input_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors", "model.norm.weight": "model-00004-of-00004.safetensors"}} \ No 
newline at end of file diff --git a/merged/special_tokens_map.json b/merged/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee80b6196926a5ad790a004d9efd6ab1ba6542 --- /dev/null +++ b/merged/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/merged/tokenizer.json b/merged/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/merged/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/merged/tokenizer_config.json b/merged/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb8ed7f5dbeb80c47d8587b3c794f149548e6117 --- /dev/null +++ b/merged/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + 
"content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": 
"<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": 
"<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": 
"<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": 
"<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": 
"<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": 
"<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": 
"<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": 
"<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/mergekit/.github/workflows/pre-commit.yml b/mergekit/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000000000000000000000000000000..cc47d22e8d6b95853713ee674cda7744ed1ef766 --- /dev/null +++ b/mergekit/.github/workflows/pre-commit.yml @@ -0,0 +1,39 @@ +name: pre-commit + +on: + pull_request: + push: + +jobs: + pre-commit: + runs-on: ubuntu-latest + 
steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.11" + cache: "pip" + - uses: pre-commit/action@v3.0.0 + + pytest: + if: github.ref == 'refs/heads/main' || github.event_name == 'pull_request' + name: PyTest + needs: [pre-commit] + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python_version: ["3.9", "3.10", "3.11"] + timeout-minutes: 5 + + steps: + - uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + cache: "pip" + - name: Install dependencies + run: pip3 install -U -e .[test] + - name: Run tests + run: pytest . diff --git a/mergekit/.gitignore b/mergekit/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..68bc17f9ff2104a9d7b6777058bb4c343ca72609 --- /dev/null +++ b/mergekit/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/mergekit/.pre-commit-config.yaml b/mergekit/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cc9a3e66af89f73ef187c31e00471d7bcb48ea5 --- /dev/null +++ b/mergekit/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: check-added-large-files + - id: check-yaml + args: ["--allow-multiple-documents"] + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + - repo: https://github.com/psf/black + rev: 23.11.0 + hooks: + - id: black + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer diff --git a/mergekit/LICENSE b/mergekit/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..0a041280bd00a9d068f503b8ee7ce35214bd24a1 --- /dev/null +++ b/mergekit/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. 
+ + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/mergekit/README.md b/mergekit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa7081a1ecba33d3bbc22ece72b4d47c03c19d1f --- /dev/null +++ b/mergekit/README.md @@ -0,0 +1,418 @@ +# mergekit + +`mergekit` is a toolkit for merging pre-trained language models. `mergekit` uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention. 
+ +## Contents + +- [Why Merge Models?](#why-merge-models) +- [Features](#features) +- [Installation](#installation) +- [Usage](#usage) +- [Merge Configuration](#merge-configuration) + - [Parameter Specification](#parameter-specification) + - [Tokenizer Configuration](#tokenizer-configuration) + - [Chat Template Configuration](#chat-template-configuration) + - [Examples](#examples) +- [Merge Methods](#merge-methods) +- [LoRA extraction](#lora-extraction) +- [Mixture of Experts merging](#mixture-of-experts-merging) +- [Evolutionary merge methods](#evolutionary-merge-methods) +- [Merge in the Cloud](#-merge-in-the-cloud-) +- [Citation](#citation) + +## Why Merge Models? + +Model merging is a powerful technique that allows combining the strengths of different models without the computational overhead of ensembling or the need for additional training. By operating directly in the weight space of models, merging can: + +- Combine multiple specialized models into a single versatile model +- Transfer capabilities between models without access to training data +- Find optimal trade-offs between different model behaviors +- Improve performance while maintaining inference costs +- Create new capabilities through creative model combinations + +Unlike traditional ensembling which requires running multiple models, merged models maintain the same inference cost as a single model while often achieving comparable or superior performance. + +## Features + +Key features of `mergekit` include: + +- Supports Llama, Mistral, GPT-NeoX, StableLM, and more +- Many [merge methods](#merge-methods) +- GPU or CPU execution +- Lazy loading of tensors for low memory use +- Interpolated gradients for parameter values (inspired by Gryphe's [BlockMerge_Gradient](https://github.com/Gryphe/BlockMerge_Gradient) script) +- Piecewise assembly of language models from layers ("Frankenmerging") +- [Mixture of Experts merging](#mixture-of-experts-merging) +- [LORA extraction](#lora-extraction) +- [Evolutionary merge methods](#evolutionary-merge-methods) + +🌐 GUI Launch Alert 🤗 - We are excited to announce the launch of a mega-GPU backed graphical user interface for mergekit in Arcee! This GUI simplifies the merging process, making it more accessible to a broader audience. Check it out and contribute at the [Arcee App](https://app.arcee.ai). There is also a [Hugging Face Space](https://huggingface.co/mergekit-community) with limited amounts of GPUs. + +## Installation + +```sh +git clone https://github.com/arcee-ai/mergekit.git +cd mergekit + +pip install -e . # install the package and make scripts available +``` + +If the above fails with the error of: + +``` +ERROR: File "setup.py" or "setup.cfg" not found. Directory cannot be installed in editable mode: +(A "pyproject.toml" file was found, but editable mode currently requires a setuptools-based build.) +``` + +You may need to upgrade pip to > 21.3 with the command `python3 -m pip install --upgrade pip` + +## Usage + +The script `mergekit-yaml` is the main entry point for `mergekit`. It takes a YAML configuration file and an output path, like so: + +```sh +mergekit-yaml path/to/your/config.yml ./output-model-directory [--cuda] [--lazy-unpickle] [--allow-crimes] [... other options] +``` + +This will run the merge and write your merged model to `./output-model-directory`. + +For more information on the arguments accepted by `mergekit-yaml` run the command `mergekit-yaml --help`. 
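+
+For orientation, a minimal configuration file that the command above could consume might look like the sketch below. This is illustrative only: the model names are placeholders, the weights are arbitrary, and the full configuration schema is described in the next section.
+
+```yaml
+# Hypothetical config.yml: a plain weighted average of two fine-tunes.
+# "org/model-a" and "org/model-b" are placeholder Hugging Face IDs or local paths.
+models:
+  - model: org/model-a
+    parameters:
+      weight: 0.5
+  - model: org/model-b
+    parameters:
+      weight: 0.5
+merge_method: linear
+dtype: bfloat16
+```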
+ +### Uploading to Huggingface + +When you have a merged model you're happy with, you may want to share it on the Hugging Face Hub. `mergekit` generates a `README.md` for your merge with some basic information for a model card. You can edit it to include more details about your merge, like giving it a good name or explaining what it's good at; rewrite it entirely; or use the generated `README.md` as-is. It is also possible to edit your `README.md` online once it has been uploaded to the Hub. + +Once you're happy with your model card and merged model, you can upload it to the Hugging Face Hub using the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) Python library. + +```sh +# log in to huggingface with an access token (must have write permission) +huggingface-cli login +# upload your model +huggingface-cli upload your_hf_username/my-cool-model ./output-model-directory . +``` + +The [documentation](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-upload) for `huggingface_hub` goes into more detail about other options for uploading. + +## Merge Configuration + +Merge configurations are YAML documents specifying the operations to perform in order to produce your merged model. +Below are the primary elements of a configuration file: + +- `merge_method`: Specifies the method to use for merging models. See [Merge Methods](#merge-methods) for a list. +- `slices`: Defines slices of layers from different models to be used. This field is mutually exclusive with `models`. +- `models`: Defines entire models to be used for merging. This field is mutually exclusive with `slices`. +- `base_model`: Specifies the base model used in some merging methods. +- `parameters`: Holds various parameters such as weights and densities, which can also be specified at different levels of the configuration. +- `dtype`: Specifies the data type used for the merging operation. +- `tokenizer` or `tokenizer_source`: Determines how to construct a tokenizer for the merged model. +- `chat_template`: Specifies a chat template for the merged model. + +### Parameter Specification + +Parameters are flexible and can be set with varying precedence. They can be specified conditionally using tensor name filters, which allows finer control such as differentiating between attention heads and fully connected layers. + +Parameters can be specified as: + +- **Scalars**: Single floating-point values. +- **Gradients**: List of floating-point values, specifying an interpolated gradient. + +The parameters can be set at different levels, with decreasing precedence as follows: + +1. `slices.*.sources.parameters` - applying to a specific input slice +2. `slices.*.parameters` - applying to a specific output slice +3. `models.*.parameters` or `input_model_parameters` - applying to any tensors coming from specific input models +4. `parameters` - catchall + +### Tokenizer Configuration + +The tokenizer behavior can be configured in two ways: using the new `tokenizer` field (recommended) or the legacy `tokenizer_source` field (maintained for backward compatibility). These fields are mutually exclusive - you should use one or the other, not both. + +#### Modern Configuration (tokenizer) + +The `tokenizer` field provides fine-grained control over vocabulary and embeddings: + +```yaml +tokenizer: + source: "union" # or "base" or a specific model path + tokens: # Optional: configure specific tokens + : + source: ... 
# Specify embedding source + force: false # Optional: force this embedding for all models + pad_to_multiple_of: null # Optional: pad vocabulary size +``` + +##### Tokenizer Source + +The `source` field determines the vocabulary of the output model: + +- `union`: Combine vocabularies from all input models (default) +- `base`: Use vocabulary from the base model +- `"path/to/model"`: Use vocabulary from a specific model + +##### Token Embedding Handling + +When merging models with different vocabularies, mergekit uses smart defaults to handle token embeddings: + +- If a token exists in the base model, its embedding is used as the default +- If only one model has the token, that model's embedding is used +- Otherwise, an average of all available embeddings is used + +You can override these defaults for specific tokens: + +```yaml +tokenizer: + source: union + tokens: + # Use embedding from a specific model + <|im_start|>: + source: "path/to/chatml/model" + + # Force a specific embedding for all models + <|special|>: + source: "path/to/model" + force: true + + # Map a token to another model's token embedding + <|renamed_token|>: + source: + kind: "model_token" + model: "path/to/model" + token: "<|original_token|>" # or use token_id: 1234 +``` + +##### Practical Example + +Here's how you might preserve both Llama 3 Instruct and ChatML prompt formats when merging models: + +```yaml +tokenizer: + source: union + tokens: + # ChatML tokens + <|im_start|>: + source: "chatml_model" + <|im_end|>: + source: "chatml_model" + + # Llama 3 tokens - force original embeddings + <|start_header_id|>: + source: "llama3_model" + force: true + <|end_header_id|>: + source: "llama3_model" + force: true + <|eot_id|>: + source: "llama3_model" + force: true +``` + +#### Legacy Configuration (tokenizer_source) + +For backward compatibility, the `tokenizer_source` field is still supported: + +```yaml +tokenizer_source: "union" # or "base" or a model path +``` + +This provides basic tokenizer selection but lacks the fine-grained control of the modern `tokenizer` field. + +### Chat Template Configuration + +The optional `chat_template` field allows overriding the chat template used for the merged model. + +```yaml +chat_template: "auto" # or a template name or Jinja2 template +``` + +Options include: + +- `"auto"`: Automatically select the most common template among input models +- Built-in templates: `"alpaca"`, `"chatml"`, `"llama3"`, `"mistral"`, `"exaone"` +- A Jinja2 template string for custom formatting + +### Examples + +Several examples of merge configurations are available in [`examples/`](examples/). 
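+
+As a further illustration of the parameter specification described above, the sketch below combines tensor-name filters with interpolated gradients in a SLERP merge. The model names, layer ranges, and gradient values are placeholders chosen for illustration, not recommendations.
+
+```yaml
+slices:
+  - sources:
+      - model: org/model-a
+        layer_range: [0, 32]
+      - model: org/model-b
+        layer_range: [0, 32]
+merge_method: slerp
+base_model: org/model-a
+parameters:
+  t:
+    - filter: self_attn
+      value: [0, 0.5, 0.3, 0.7, 1]  # gradient interpolated across the slice for attention tensors
+    - filter: mlp
+      value: [1, 0.5, 0.7, 0.3, 0]  # a different schedule for MLP tensors
+    - value: 0.5                    # catchall for all other tensors
+dtype: bfloat16
+```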
+ +## Merge Methods + +A quick overview of the currently supported merge methods: + +| Method | `merge_method` value | Multi-Model | Uses base model | +| ------------------------------------------------------------------------------------------------ | -------------------- | ----------- | --------------- | +| Linear ([Model Soups](https://arxiv.org/abs/2203.05482)) | `linear` | ✅ | ❌ | +| SLERP | `slerp` | ❌ | ✅ | +| Nearswap | `nearswap` | ❌ | ✅ | +| [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `task_arithmetic` | ✅ | ✅ | +| [TIES](https://arxiv.org/abs/2306.01708) | `ties` | ✅ | ✅ | +| [DARE](https://arxiv.org/abs/2311.03099) [TIES](https://arxiv.org/abs/2306.01708) | `dare_ties` | ✅ | ✅ | +| [DARE](https://arxiv.org/abs/2311.03099) [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `dare_linear` | ✅ | ✅ | +| Passthrough | `passthrough` | ❌ | ❌ | +| [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) | `breadcrumbs` | ✅ | ✅ | +| [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) + [TIES](https://arxiv.org/abs/2306.01708) | `breadcrumbs_ties` | ✅ | ✅ | +| [Model Stock](https://arxiv.org/abs/2403.19522) | `model_stock` | ✅ | ✅ | +| NuSLERP | `nuslerp` | ❌ | ✅ | +| [DELLA](https://arxiv.org/abs/2406.11617) | `della` | ✅ | ✅ | +| [DELLA](https://arxiv.org/abs/2406.11617) [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `della_linear` | ✅ | ✅ | + +### Linear + +The classic merge method - a simple weighted average. + +Parameters: + +- `weight` - relative (or absolute if `normalize=False`) weighting of a given tensor +- `normalize` - if true, the weights of all models contributing to a tensor will be normalized. Default behavior. + +### SLERP + +Spherically interpolate the parameters of two models. One must be set as `base_model`. + +Parameters: + +- `t` - interpolation factor. At `t=0` will return `base_model`, at `t=1` will return the other one. + +### Nearswap + +Interpolates base model with secondary model if similarity is below t. Accepts two models. + +Parameters: + +- `t` - similarity threshold + +### [Task Arithmetic](https://arxiv.org/abs/2212.04089) + +Computes "task vectors" for each model by subtracting a base model. Merges the task vectors linearly and adds back the base. Works great for models that were fine tuned from a common ancestor. Also a super useful mental framework for several of the more involved merge methods. + +Parameters: same as [Linear](#linear) + +### [TIES](https://arxiv.org/abs/2306.01708) + +Builds on the task arithmetic framework. Resolves interference between models by sparsifying the task vectors and applying a sign consensus algorithm. Allows you to merge a larger number of models and retain more of their strengths. + +Parameters: same as [Linear](#linear), plus: + +- `density` - fraction of weights in differences from the base model to retain + +### [DARE](https://arxiv.org/abs/2311.03099) + +In the same vein as TIES, sparsifies task vectors to reduce interference. Differs in that DARE uses random pruning with a novel rescaling to better match performance of the original models. DARE can be used either with the sign consensus algorithm of TIES (`dare_ties`) or without (`dare_linear`). + +Parameters: same as [TIES](#ties) for `dare_ties`, or [Linear](#linear) for `dare_linear` + +### Passthrough + +`passthrough` is a no-op that simply passes input tensors through unmodified. It is meant to be used for layer-stacking type merges where you have only one input model. Useful for frankenmerging. 
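+
+For example, a layer-stacking ("frankenmerge") configuration built on `passthrough` might look like the following sketch. The model name and layer ranges are placeholders; the point is simply that overlapping slices of a single input model are stacked into a deeper output model.
+
+```yaml
+slices:
+  - sources:
+      - model: org/model-a
+        layer_range: [0, 24]
+  - sources:
+      - model: org/model-a
+        layer_range: [8, 32]
+merge_method: passthrough
+dtype: bfloat16
+```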
+ +### [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) + +An extension of task arithmetic that discards both small and extremely large differences from the base model. As with DARE, the Model Breadcrumbs algorithm can be used with (`breadcrumbs_ties`) or without (`breadcrumbs`) the sign consensus algorithm of TIES. + +Parameters: same as [Linear](#linear), plus: + +- `density` - fraction of weights in differences from the base model to retain +- `gamma` - fraction of largest magnitude differences to remove + +Note that `gamma` corresponds with the parameter `β` described in the paper, while `density` is the final density of the sparsified tensors (related to `γ` and `β` by `density = 1 - γ - β`). For good default values, try `density: 0.9` and `gamma: 0.01`. + +### [Model Stock](https://arxiv.org/abs/2403.19522) + +Uses some neat geometric properties of fine tuned models to compute good weights for linear interpolation. Requires at least three models, including a base model. + +Parameters: + +- `filter_wise`: if true, weight calculation will be per-row rather than per-tensor. Not recommended. + +### NuSLERP + +Spherically interpolate between parameters, but with more options and more sensical configuration! Does not require a base model, but can use one to do spherical interpolation of task vectors. Only works with either two models or two plus a base model. + +Parameters: + +- `weight`: relative weighting of a given tensor +- `nuslerp_flatten`: set to false to do row-wise/column-wise interpolation instead of treating tensors as vectors +- `nuslerp_row_wise`: SLERP row vectors instead of column vectors + +To replicate the behavior of the original `slerp` method, set `weight` to `1-t` and `t` for your first and second model respectively. + +### [DELLA](https://arxiv.org/abs/2406.11617) + +Building upon DARE, DELLA uses adaptive pruning based on parameter magnitudes. DELLA first ranks parameters in each row of delta parameters and assigns drop probabilities inversely proportional to their magnitudes. This allows it to retain more important changes while reducing interference. After pruning, it rescales the remaining parameters similar to [DARE](#dare). DELLA can be used with (`della`) or without (`della_linear`) the sign elect step of TIES + +Parameters: same as [Linear](#linear), plus: + +- `density` - fraction of weights in differences from the base model to retain +- `epsilon` - maximum change in drop probability based on magnitude. Drop probabilities assigned will range from `density - epsilon` to `density + epsilon`. (When selecting values for `density` and `epsilon`, ensure that the range of probabilities falls within 0 to 1) +- `lambda` - scaling factor for the final merged delta parameters before merging with the base parameters. + +## LoRA extraction + +Mergekit allows extracting PEFT-compatible low-rank approximations of finetuned models. + +### Usage + +```sh +mergekit-extract-lora finetuned_model_id_or_path base_model_id_or_path output_path [--no-lazy-unpickle] --rank=desired_rank +``` + +## Mixture of Experts merging + +The `mergekit-moe` script supports merging multiple dense models into a mixture of experts, either for direct use or for further training. For more details see the [`mergekit-moe` documentation](docs/moe.md). + +## Evolutionary merge methods + +See [`docs/evolve.md`](docs/evolve.md) for details. + +## ✨ Merge in the Cloud ✨ + +We host merging on Arcee's cloud GPUs - you can launch a cloud merge in the [Arcee App](https://app.arcee.ai). 
Or through python - grab an ARCEE_API_KEY: + +`export ARCEE_API_KEY=` +`pip install -q arcee-py` + +```python +import arcee +arcee.merge_yaml("bio-merge","./examples/bio-merge.yml") +``` + +Check your merge status at the [Arcee App](https://app.arcee.ai) + +When complete, either deploy your merge: + +```python +arcee.start_deployment("bio-merge", merging="bio-merge") +``` + +Or download your merge: + +`!arcee merging download bio-merge` + +## Citation + +If you find `mergekit` useful in your research, please consider citing the [paper](https://aclanthology.org/2024.emnlp-industry.36/): + +```bibtex +@inproceedings{goddard-etal-2024-arcees, + title = "Arcee{'}s {M}erge{K}it: A Toolkit for Merging Large Language Models", + author = "Goddard, Charles and + Siriwardhana, Shamane and + Ehghaghi, Malikeh and + Meyers, Luke and + Karpukhin, Vladimir and + Benedict, Brian and + McQuade, Mark and + Solawetz, Jacob", + editor = "Dernoncourt, Franck and + Preo{\c{t}}iuc-Pietro, Daniel and + Shimorina, Anastasia", + booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track", + month = nov, + year = "2024", + address = "Miami, Florida, US", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.emnlp-industry.36", + doi = "10.18653/v1/2024.emnlp-industry.36", + pages = "477--485", + abstract = "The rapid growth of open-source language models provides the opportunity to merge model checkpoints, combining their parameters to improve performance and versatility. Advances in transfer learning have led to numerous task-specific models, which model merging can integrate into powerful multitask models without additional training. MergeKit is an open-source library designed to support this process with an efficient and extensible framework suitable for any hardware. It has facilitated the merging of thousands of models, contributing to some of the world{'}s most powerful open-source model checkpoints. The library is accessible at: https://github.com/arcee-ai/mergekit.", +} +``` diff --git a/mergekit/docs/evolve.md b/mergekit/docs/evolve.md new file mode 100644 index 0000000000000000000000000000000000000000..930fc2797bfe0bf1ff92ca94cb37a23e259bff2e --- /dev/null +++ b/mergekit/docs/evolve.md @@ -0,0 +1,176 @@ +# mergekit-evolve + +`mergekit-evolve` is a script that uses an evolutionary algorithm (CMA-ES) to optimize the parameters of a merge against model metrics. This is inspired by SakanaAI's [Evolutionary Optimization of Model Merging Recipes](https://arxiv.org/abs/2403.13187), in particular their parameter-space approach. `mergekit-evolve` uses EleutherAI's [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) to define and evaluate the scoring function. The script is set up to be run either single-node or on a Ray cluster and has a few different strategies for scheduling operations depending on your particular configuration of compute. 
+ +## Installation + +Install `mergekit` with the `evolve` (and optionally `vllm`) features: + +```sh +git clone https://github.com/arcee-ai/mergekit.git +cd mergekit + +pip install -e .[evolve,vllm] +``` + +If you had a perfectly good pytorch environment going and installing an older version of vLLM downgraded it and broke flash attention, run the following commands to fix it: + +```sh +pip uninstall flash-attn +pip cache purge +pip install flash-attn +``` + +## Configuration + +`mergekit-evolve` takes in a YAML configuration file that defines how the merge is parameterized and what metrics to optimize. The general syntax is as follows: + +```yml +genome: + models: + - model_1 + - model_2 + ... + - model_n + merge_method: dare_ties + base_model: base_model_if_needed + tokenizer_source: null # optional + layer_granularity: 8 + + # optional: + normalize: false + allow_negative_weights: false + smooth: false + filters: ... +tasks: + - name: lm_eval_task_name + weight: 1.0 # optional + metric: "acc,none" # defaults to acc,none + - name: ... # as many as you want +``` + +### Genome Definition + +The `genome` section of the configuration file defines the parameter space that `mergekit-evolve` will be optimizing in. + +#### `models` + +This should be a list of all of the models you want available to be merged. Depending on the merge method not all are guaranteed to be used in the final merge. + +#### `merge_method` + +Merge method to be used. Currently supported values are `linear`, `dare_ties`, `task_arithmetic`, `ties`, and `slerp`. + +#### `base_model` + +The base model for the merge, if applicable. + +#### `layer_granularity` + +A set of parameters will be introduced for each consecutive slice of `layer_granularity` layers. So for example, a 32-layer model like `mistralai/Mistral-7B-v0.1` with `layer_granularity: 8` will be divided into 4 groups of 8 layers with different merge parameters for each. The value specified here must be a divisor of the number of layers in your input models. Large values of `layer_granularity` will reduce the search space greatly, meaning you will get faster convergence at the cost of a potentially less good global solution. + +When not set, one set of parameters will be used for all layers. + +#### `normalize` + +Sets the `normalize` flag when merging. For methods like `linear`, `ties`, and `dare_ties` this constrains the search space to a set of definitely valid models. Similarly to `layer_granularity`, this can greatly speed up convergence at the cost of ruling out oddball solutions that might score better than more standard merges. + +#### `allow_negative_weights` + +Pretty self explanatory. When this flag is not set, the absolute value of weight parameters is used. Sensible search space reduction for `linear` and `slerp`. For task arithmetic based methods you probably want `allow_negative_weights: true`. + +#### `smooth` + +If set to `true`, then parameter values will be interpolated across layers instead of assigning a single, fixed value to each block. + +#### `filters` + +Accepts a list of filters, as in `mergekit-yaml`, by which to separate the parameters. So, for example, setting filters as below for a Llama-based merge: + +```yaml +filters: + - self_attn + - mlp +``` + +Will divide up the merge parameters into three groups - self attention parameters, MLP parameters, and a third for everything else. Separating the parameters out like this can be very beneficial when merging models trained on different prompt formats. 
It also makes your parameter space three times as big though! + +### Task Definition + +To evaluate the produced merges you need to specify a list of tasks supported by the EleutherAI LM evaluation harness. This can be either [built in tasks](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks) (don't be naughty) or tasks you define yourself (see the [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md) for how). If your task does not use `acc` as the metric then you must specify the correct metric name. Each task can also optionally have a weight associated. + +`mergekit-evolve` aims to maximize the score of the merge, so if you are using any tasks or metrics where a lower score is better (like perplexity) be sure to assign a negative weight to that task. + +## Running `mergekit-evolve` + +```sh +mergekit-evolve [OPTIONS] --storage-path PATH GENOME_CONFIG_PATH +``` + +`mergekit-evolve` needs a storage path specified, where it will save the input models, merges to evaluate, and the config for the current best merge evaluated. If you are not using in-memory merging this can require a _lot_ of space - expect at least one fp16 model per GPU. + +Some important options: + +### Scheduling Strategy (`--strategy`) + +There are three different strategies implemented for scheduling merging and evaluation jobs. + +#### `pool` + +Assigns an actor to each GPU in your cluster and guarantees merges and evaluations are performed on the same node. This is a safe default suitable for any configuration, local or distributed. + +#### `buffered` + +Maintains a buffer of tasks scheduled to ensure that there is always a model merging or ready to evaluate for each GPU. Allows for concurrent merging and evaluation of models on the same GPU if enough VRAM is available. Only suitable for a single-node setup or when `--storage-path` points to a fast shared filesystem. + +#### `serial` + +Uses Ray placement groups to ensure merges and their evaluations happen on the same node, but otherwise just lets Ray take the wheel. Maybe give a try if you're having trouble with the other two, otherwise probably don't use it. + +### Evaluation LLM Backend + +By default `mergekit-evolve` will use the `hf` backend for `lm-eval`. To use vLLM instead, pass the `--vllm` flag. + +### On-Disk vs. In-Memory + +By default `mergekit-evolve` will perform merges, write the result to disk, then start up an instance of lm-eval pointing at that path. This is a safe default and will generally always work but also causes a lot of GPU downtime and eats disk space. When using the `pool` scheduling strategy, you have the option to instead keep a model resident in memory and directly update its parameters instead of merging to disk. This is much faster and uses no additional disk space. However, it does involve mucking around in the internals of vLLM and the LM evaluation harness. So it might break at any moment! Choose wisely. Use `--in-memory` to enable this mode. + +### Task search path + +If you're using custom task definitions (and you should be) then you can append to the search path using the `--task-search-path` option. This should point to the directory your custom task YAML is in (or a parent of that directory). Multiple paths can be included by repeating the option. + +### Batch size + +Override the batch size used during merge evaluation. If using vLLM `auto` is recommended (default). 
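+
+Tying this back to the task definition described earlier: a `tasks` section that rewards one metric while penalizing a lower-is-better metric could look like the sketch below. The task names and metric strings are hypothetical; substitute tasks that actually exist in your lm-evaluation-harness setup.
+
+```yaml
+tasks:
+  - name: my_accuracy_task          # hypothetical custom task where higher is better
+    weight: 1.0
+    metric: "acc,none"
+  - name: my_perplexity_task        # hypothetical task where lower is better
+    weight: -0.3                    # negative weight so lower perplexity raises the score
+    metric: "word_perplexity,none"  # placeholder metric name
+```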
+ +### CMA-ES options + +#### `--max-fevals` + +Maximum number of merges to evaluate. Note that the `cma` package is very loosey-goosey with this number and will happily go over by 50% depending on the size of each generation. Set to 100 by default. + +#### `--sigma0` + +Initial value of sigma for CMA-ES. No need to play with this unless you really know what you're doing. + +### WandB logging + +`mergekit-evolve` supports logging metrics to Weights & Biases. Enable this functionality with the `--wandb` flag. Project and entity names can be overridden with the `--wandb-project` and `--wandb-entity` options. + +### Example + +```sh +mergekit-evolve --strategy pool --wandb --wandb-project mergekit-evolve --wandb-entity arcee-ai --storage-path /path/to/mergekit-evolve/ ./config.yml +``` + +## Output + +`mergekit-evolve` will write the merge configuration for the best merge found so far to the storage path with the filename `best_config.yaml`. If you're using WandB it will also log the config as an artifact. The script will keep running until a KeyboardInterrupt is received or `--max-fevals` is generously exceeded. + +## Caveats + +`mergekit-evolve` is a work in progress and has probably not been tested on your specific configuration. Keep an eye on the output before leaving it running, and if you run in to any issues don't hesitate to file an issue! + +## Acknowledgements + +Thanks to SakanaAI for the inspiration and the EleutherAI team for the LM evaluation harness. diff --git a/mergekit/docs/moe.md b/mergekit/docs/moe.md new file mode 100644 index 0000000000000000000000000000000000000000..1d62d4a383172effe2b227334827ffd7bda4c979 --- /dev/null +++ b/mergekit/docs/moe.md @@ -0,0 +1,124 @@ +# mergekit-moe + +`mergekit-moe` is a script for combining Mistral or Llama models of the same size into Mixtral Mixture of Experts models. The script will combine the self-attention and layer normalization parameters from a "base" model with the MLP parameters from a set of "expert" models. + +If using the `hidden` or `cheap_embed` gate mode, the output model will be usable without any further training. If you are initializing a model to do further training on, such as for sparse upcycling, then use the `random` gate mode to get a model ready for training. + +## Configuration + +`mergekit-moe` uses its own YML configuration syntax, which looks like so: + +```yml +base_model: path/to/self_attn_donor +gate_mode: hidden # one of "hidden", "cheap_embed", or "random" +dtype: bfloat16 # output dtype (float32, float16, or bfloat16) +## (optional) +# experts_per_token: 2 +experts: + - source_model: expert_model_1 + positive_prompts: + - "This is a prompt that is demonstrative of what expert_model_1 excels at" + ## (optional) + # negative_prompts: + # - "This is a prompt expert_model_1 should not be used for" + - source_model: expert_model_2 + # ... and so on +``` + +The script takes two arguments, an input config and an output path: `mergekit-moe ./config.yml ./my-clowncar-moe-12x180B` + +Currently the script can output models that use the Mixtral, Deepseek MoE, or Qwen MoE architectures. Some output architectures support a shared expert which will be activated for all tokens, which can be configured like this: + +```yml +base_model: path/to/self_attn_donor +gate_mode: hidden # one of "hidden", "cheap_embed", or "random" +dtype: bfloat16 # output dtype (float32, float16, or bfloat16) +experts: + ... 
+shared_experts: + - source_model: model_name + positive_prompts: # required by Qwen MoE for "hidden" gate mode, otherwise not allowed + - "blah blah" + # (optional, but recommended:) + residual_scale: 0.1 # downweight output from shared expert to prevent overcooking the model +``` + +Currently only up to one shared expert is supported. + +An appropriate architecture will be inferred based on the input models and presence or absence of shared experts in your configuration. Alternatively, you can explicitly specify an output architecture by setting the `architecture:` field in your config. For example: + +```yml +base_model: path/to/self_attn_donor +architecture: qwen +# ... and so on +``` + +### Gate Modes + +There are three methods for populating the MoE gates implemented. + +#### "hidden" + +Uses the hidden state representations of the positive/negative prompts for MoE gate parameters. Best quality and most effective option; the default. Requires evaluating each prompt using the base model so you might not be able to use this on constrained hardware (depending on the model). You can use `--load-in-8bit` or `--load-in-4bit` to reduce VRAM usage. + +#### "cheap_embed" + +Uses only the raw token embedding of the prompts, using the same gate parameters for every layer. Distinctly less effective than "hidden". Can be run on much, much lower end hardware. + +#### "random" + +Randomly initializes the MoE gates. Good for if you are going to fine tune the model afterwards, or maybe if you want something a little unhinged? I won't judge. + +## Example Configurations + +Sparse upcycling of smol_llama into a 8x220M MoE: + +```yml +base_model: BEE-spoke-data/smol_llama-220M-GQA +gate_mode: random +dtype: bfloat16 +experts: + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA + - source_model: BEE-spoke-data/smol_llama-220M-GQA +# and then train the sucker! +``` + +Shove some Mistral models in a clown car: + +```yml +base_model: NousResearch/Hermes-2-Pro-Mistral-7B +gate_mode: hidden +dtype: bfloat16 +experts: + - source_model: NousResearch/Hermes-2-Pro-Mistral-7B + positive_prompts: + - "<|im_start|>user\nHello, who are you?<|im_end|>" + - "<|im_start|>user\nI need help with" + - source_model: BioMistral/BioMistral-7B-DARE + positive_prompts: + - "As a doctor of medicine," + - source_model: PocketDoc/Dans-AdventurousWinds-7b + positive_prompts: + - "[Genres: Science Fiction]\n[Tags: humor, old school, sci fi]" + - "> get ye flask" + - "[Mode: Interactive Storyteller]" + - source_model: VAGOsolutions/SauerkrautLM-7b-HerO + positive_prompts: + - "<|im_start|>user\nWie geht es dir?<|im_end|>" + - "Das ist ein Satz auf Deutsch." +``` + +## FAQ + +### What does the "Your model has duplicated tensors but the --clone-tensors flag is not set" warning mean? + +Answer from [Charles O. Goddard (cg123)](https://github.com/cg123) +(also see [this GitHub issue](https://github.com/arcee-ai/mergekit/issues/279#issuecomment-2081818104)): + +> This is completely benign. This happens when a single tensor from a model is used in multiple places, like when doing sparse upcycling with the moe script or doing passthrough merges that repeat layers. 
Having `--clone-tensors` set can use slightly more memory, but having it unset will slow down saving and introduce small memory usage spikes in cases where this warning occurs. It's honestly a small enough difference that the warning could be removed entirely. diff --git a/mergekit/examples/bio-merge.yml b/mergekit/examples/bio-merge.yml new file mode 100644 index 0000000000000000000000000000000000000000..c47101f5efa3a5ad39b70ea3c9da7c07dcd5d6fe --- /dev/null +++ b/mergekit/examples/bio-merge.yml @@ -0,0 +1,15 @@ +models: + - model: mistralai/Mistral-7B-Instruct-v0.2 + parameters: + density: 0.5 + weight: 0.5 + - model: BioMistral/BioMistral-7B + parameters: + density: 0.5 + weight: 0.5 +merge_method: ties +base_model: mistralai/Mistral-7B-v0.1 +parameters: + normalize: false + int8_mask: true +dtype: float16 diff --git a/mergekit/examples/gradient-slerp.yml b/mergekit/examples/gradient-slerp.yml new file mode 100644 index 0000000000000000000000000000000000000000..ab653c077e526839f914f91efba00623187d7663 --- /dev/null +++ b/mergekit/examples/gradient-slerp.yml @@ -0,0 +1,20 @@ +slices: + - sources: + - model: psmathur/orca_mini_v3_13b + layer_range: [0, 40] + - model: garage-bAInd/Platypus2-13B + layer_range: [0, 40] +# or, the equivalent models: syntax: +# models: +# - model: psmathur/orca_mini_v3_13b +# - model: garage-bAInd/Platypus2-13B +merge_method: slerp +base_model: psmathur/orca_mini_v3_13b +parameters: + t: + - filter: self_attn + value: [0, 0.5, 0.3, 0.7, 1] + - filter: mlp + value: [1, 0.5, 0.7, 0.3, 0] + - value: 0.5 # fallback for rest of tensors +dtype: float16 diff --git a/mergekit/examples/linear.yml b/mergekit/examples/linear.yml new file mode 100644 index 0000000000000000000000000000000000000000..765c3751e8494a2378dceee7c2a7a335732d5483 --- /dev/null +++ b/mergekit/examples/linear.yml @@ -0,0 +1,12 @@ +models: + - model: psmathur/orca_mini_v3_13b + parameters: + weight: 1.0 + - model: WizardLM/WizardLM-13B-V1.2 + parameters: + weight: 0.3 + - model: garage-bAInd/Platypus2-13B + parameters: + weight: 0.5 +merge_method: linear +dtype: float16 diff --git a/mergekit/examples/mega.yml b/mergekit/examples/mega.yml new file mode 100644 index 0000000000000000000000000000000000000000..4732403b6ab3ee2cc8301fc52ad1fe6d847e30b5 --- /dev/null +++ b/mergekit/examples/mega.yml @@ -0,0 +1,37 @@ +slices: + - sources: + - model: psmathur/orca_mini_v3_13b + layer_range: [0, 40] + - model: garage-bAInd/Platypus2-13B + layer_range: [0, 40] +merge_method: slerp +base_model: psmathur/orca_mini_v3_13b +parameters: + t: + - filter: self_attn + value: [0, 0.5, 0.3, 0.7, 1] + - filter: mlp + value: [1, 0.5, 0.7, 0.3, 0] + - value: 0.5 # fallback for rest of tensors +dtype: float16 +name: gradient-slerp +--- +models: + - model: gradient-slerp + parameters: + density: [1, 0.7, 0.1] # density gradient + weight: 1.0 + - model: WizardLM/WizardMath-13B-V1.0 + parameters: + density: 0.33 + weight: + - filter: mlp + value: 0.5 + - value: 0 +merge_method: ties +base_model: TheBloke/Llama-2-13B-fp16 +parameters: + normalize: true + int8_mask: true +dtype: float16 +name: gradient-slerp-ties diff --git a/mergekit/examples/orcamini-platy-44layer.yml b/mergekit/examples/orcamini-platy-44layer.yml new file mode 100644 index 0000000000000000000000000000000000000000..0070c28fdafeda6bbbf4f4c024b0191981a244c9 --- /dev/null +++ b/mergekit/examples/orcamini-platy-44layer.yml @@ -0,0 +1,9 @@ +slices: + - sources: + - model: psmathur/orca_mini_v3_13b + layer_range: [0, 24] + - sources: + - model: 
garage-bAInd/Platypus2-13B + layer_range: [20, 40] +merge_method: passthrough +dtype: float16 diff --git a/mergekit/examples/ties.yml b/mergekit/examples/ties.yml new file mode 100644 index 0000000000000000000000000000000000000000..8c5cfe5c96758df5a7a9f74ce422a2cf1deaefe0 --- /dev/null +++ b/mergekit/examples/ties.yml @@ -0,0 +1,22 @@ +models: + - model: psmathur/orca_mini_v3_13b + parameters: + density: [1, 0.7, 0.1] # density gradient + weight: 1.0 + - model: garage-bAInd/Platypus2-13B + parameters: + density: 0.5 + weight: [0, 0.3, 0.7, 1] # weight gradient + - model: WizardLM/WizardMath-13B-V1.0 + parameters: + density: 0.33 + weight: + - filter: mlp + value: 0.5 + - value: 0 +merge_method: ties +base_model: TheBloke/Llama-2-13B-fp16 +parameters: + normalize: true + int8_mask: true +dtype: float16 diff --git a/mergekit/mergekit.egg-info/PKG-INFO b/mergekit/mergekit.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..6f57a0b61db80789e80b5ecb865ef97c28e14f63 --- /dev/null +++ b/mergekit/mergekit.egg-info/PKG-INFO @@ -0,0 +1,458 @@ +Metadata-Version: 2.1 +Name: mergekit +Version: 0.0.5.2 +Summary: Tools for merging pre-trained large language models +Author-email: Charles Goddard +License: LGPL-3.0-or-later +Project-URL: repository, https://github.com/cg123/mergekit +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: torch>=2.0.0 +Requires-Dist: tqdm==4.66.5 +Requires-Dist: click==8.1.7 +Requires-Dist: safetensors~=0.4.3 +Requires-Dist: accelerate~=1.0.1 +Requires-Dist: pydantic~=2.9.2 +Requires-Dist: immutables==0.20 +Requires-Dist: transformers>=4.45.2 +Requires-Dist: tokenizers>=0.20.1 +Requires-Dist: huggingface_hub +Requires-Dist: peft +Requires-Dist: typing-extensions +Requires-Dist: sentencepiece +Requires-Dist: protobuf +Requires-Dist: scipy +Requires-Dist: datasets +Provides-Extra: dev +Requires-Dist: black~=24.10.0; extra == "dev" +Requires-Dist: isort~=5.13.2; extra == "dev" +Requires-Dist: pre-commit~=4.0.1; extra == "dev" +Provides-Extra: test +Requires-Dist: pytest~=8.3.3; extra == "test" +Provides-Extra: evolve +Requires-Dist: ray; extra == "evolve" +Requires-Dist: cma; extra == "evolve" +Requires-Dist: lm_eval; extra == "evolve" +Requires-Dist: wandb; extra == "evolve" +Provides-Extra: vllm +Requires-Dist: vllm==0.3.2; extra == "vllm" +Requires-Dist: lm_eval[vllm]; extra == "vllm" + +# mergekit + +`mergekit` is a toolkit for merging pre-trained language models. `mergekit` uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention. + +## Contents + +- [Why Merge Models?](#why-merge-models) +- [Features](#features) +- [Installation](#installation) +- [Usage](#usage) +- [Merge Configuration](#merge-configuration) + - [Parameter Specification](#parameter-specification) + - [Tokenizer Configuration](#tokenizer-configuration) + - [Chat Template Configuration](#chat-template-configuration) + - [Examples](#examples) +- [Merge Methods](#merge-methods) +- [LoRA extraction](#lora-extraction) +- [Mixture of Experts merging](#mixture-of-experts-merging) +- [Evolutionary merge methods](#evolutionary-merge-methods) +- [Merge in the Cloud](#-merge-in-the-cloud-) +- [Citation](#citation) + +## Why Merge Models? 
+ +Model merging is a powerful technique that allows combining the strengths of different models without the computational overhead of ensembling or the need for additional training. By operating directly in the weight space of models, merging can: + +- Combine multiple specialized models into a single versatile model +- Transfer capabilities between models without access to training data +- Find optimal trade-offs between different model behaviors +- Improve performance while maintaining inference costs +- Create new capabilities through creative model combinations + +Unlike traditional ensembling which requires running multiple models, merged models maintain the same inference cost as a single model while often achieving comparable or superior performance. + +## Features + +Key features of `mergekit` include: + +- Supports Llama, Mistral, GPT-NeoX, StableLM, and more +- Many [merge methods](#merge-methods) +- GPU or CPU execution +- Lazy loading of tensors for low memory use +- Interpolated gradients for parameter values (inspired by Gryphe's [BlockMerge_Gradient](https://github.com/Gryphe/BlockMerge_Gradient) script) +- Piecewise assembly of language models from layers ("Frankenmerging") +- [Mixture of Experts merging](#mixture-of-experts-merging) +- [LORA extraction](#lora-extraction) +- [Evolutionary merge methods](#evolutionary-merge-methods) + +🌐 GUI Launch Alert 🤗 - We are excited to announce the launch of a mega-GPU backed graphical user interface for mergekit in Arcee! This GUI simplifies the merging process, making it more accessible to a broader audience. Check it out and contribute at the [Arcee App](https://app.arcee.ai). There is also a [Hugging Face Space](https://huggingface.co/mergekit-community) with limited amounts of GPUs. + +## Installation + +```sh +git clone https://github.com/arcee-ai/mergekit.git +cd mergekit + +pip install -e . # install the package and make scripts available +``` + +If the above fails with the error of: + +``` +ERROR: File "setup.py" or "setup.cfg" not found. Directory cannot be installed in editable mode: +(A "pyproject.toml" file was found, but editable mode currently requires a setuptools-based build.) +``` + +You may need to upgrade pip to > 21.3 with the command `python3 -m pip install --upgrade pip` + +## Usage + +The script `mergekit-yaml` is the main entry point for `mergekit`. It takes a YAML configuration file and an output path, like so: + +```sh +mergekit-yaml path/to/your/config.yml ./output-model-directory [--cuda] [--lazy-unpickle] [--allow-crimes] [... other options] +``` + +This will run the merge and write your merged model to `./output-model-directory`. + +For more information on the arguments accepted by `mergekit-yaml` run the command `mergekit-yaml --help`. + +### Uploading to Huggingface + +When you have a merged model you're happy with, you may want to share it on the Hugging Face Hub. `mergekit` generates a `README.md` for your merge with some basic information for a model card. You can edit it to include more details about your merge, like giving it a good name or explaining what it's good at; rewrite it entirely; or use the generated `README.md` as-is. It is also possible to edit your `README.md` online once it has been uploaded to the Hub. + +Once you're happy with your model card and merged model, you can upload it to the Hugging Face Hub using the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) Python library. 
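+
+If you prefer to stay in Python, a minimal sketch using the `huggingface_hub` API could look like this (the repository name is a placeholder; the equivalent CLI commands are shown just below):
+
+```python
+from huggingface_hub import HfApi
+
+api = HfApi()  # assumes you are already authenticated, e.g. via `huggingface-cli login`
+
+# create the repository if it does not exist yet, then upload the merged model directory
+api.create_repo("your_hf_username/my-cool-model", exist_ok=True)
+api.upload_folder(
+    folder_path="./output-model-directory",
+    repo_id="your_hf_username/my-cool-model",
+)
+```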
+ +```sh +# log in to huggingface with an access token (must have write permission) +huggingface-cli login +# upload your model +huggingface-cli upload your_hf_username/my-cool-model ./output-model-directory . +``` + +The [documentation](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-upload) for `huggingface_hub` goes into more detail about other options for uploading. + +## Merge Configuration + +Merge configurations are YAML documents specifying the operations to perform in order to produce your merged model. +Below are the primary elements of a configuration file: + +- `merge_method`: Specifies the method to use for merging models. See [Merge Methods](#merge-methods) for a list. +- `slices`: Defines slices of layers from different models to be used. This field is mutually exclusive with `models`. +- `models`: Defines entire models to be used for merging. This field is mutually exclusive with `slices`. +- `base_model`: Specifies the base model used in some merging methods. +- `parameters`: Holds various parameters such as weights and densities, which can also be specified at different levels of the configuration. +- `dtype`: Specifies the data type used for the merging operation. +- `tokenizer` or `tokenizer_source`: Determines how to construct a tokenizer for the merged model. +- `chat_template`: Specifies a chat template for the merged model. + +### Parameter Specification + +Parameters are flexible and can be set with varying precedence. They can be specified conditionally using tensor name filters, which allows finer control such as differentiating between attention heads and fully connected layers. + +Parameters can be specified as: + +- **Scalars**: Single floating-point values. +- **Gradients**: List of floating-point values, specifying an interpolated gradient. + +The parameters can be set at different levels, with decreasing precedence as follows: + +1. `slices.*.sources.parameters` - applying to a specific input slice +2. `slices.*.parameters` - applying to a specific output slice +3. `models.*.parameters` or `input_model_parameters` - applying to any tensors coming from specific input models +4. `parameters` - catchall + +### Tokenizer Configuration + +The tokenizer behavior can be configured in two ways: using the new `tokenizer` field (recommended) or the legacy `tokenizer_source` field (maintained for backward compatibility). These fields are mutually exclusive - you should use one or the other, not both. + +#### Modern Configuration (tokenizer) + +The `tokenizer` field provides fine-grained control over vocabulary and embeddings: + +```yaml +tokenizer: + source: "union" # or "base" or a specific model path + tokens: # Optional: configure specific tokens + : + source: ... 
# Specify embedding source + force: false # Optional: force this embedding for all models + pad_to_multiple_of: null # Optional: pad vocabulary size +``` + +##### Tokenizer Source + +The `source` field determines the vocabulary of the output model: + +- `union`: Combine vocabularies from all input models (default) +- `base`: Use vocabulary from the base model +- `"path/to/model"`: Use vocabulary from a specific model + +##### Token Embedding Handling + +When merging models with different vocabularies, mergekit uses smart defaults to handle token embeddings: + +- If a token exists in the base model, its embedding is used as the default +- If only one model has the token, that model's embedding is used +- Otherwise, an average of all available embeddings is used + +You can override these defaults for specific tokens: + +```yaml +tokenizer: + source: union + tokens: + # Use embedding from a specific model + <|im_start|>: + source: "path/to/chatml/model" + + # Force a specific embedding for all models + <|special|>: + source: "path/to/model" + force: true + + # Map a token to another model's token embedding + <|renamed_token|>: + source: + kind: "model_token" + model: "path/to/model" + token: "<|original_token|>" # or use token_id: 1234 +``` + +##### Practical Example + +Here's how you might preserve both Llama 3 Instruct and ChatML prompt formats when merging models: + +```yaml +tokenizer: + source: union + tokens: + # ChatML tokens + <|im_start|>: + source: "chatml_model" + <|im_end|>: + source: "chatml_model" + + # Llama 3 tokens - force original embeddings + <|start_header_id|>: + source: "llama3_model" + force: true + <|end_header_id|>: + source: "llama3_model" + force: true + <|eot_id|>: + source: "llama3_model" + force: true +``` + +#### Legacy Configuration (tokenizer_source) + +For backward compatibility, the `tokenizer_source` field is still supported: + +```yaml +tokenizer_source: "union" # or "base" or a model path +``` + +This provides basic tokenizer selection but lacks the fine-grained control of the modern `tokenizer` field. + +### Chat Template Configuration + +The optional `chat_template` field allows overriding the chat template used for the merged model. + +```yaml +chat_template: "auto" # or a template name or Jinja2 template +``` + +Options include: + +- `"auto"`: Automatically select the most common template among input models +- Built-in templates: `"alpaca"`, `"chatml"`, `"llama3"`, `"mistral"`, `"exaone"` +- A Jinja2 template string for custom formatting + +### Examples + +Several examples of merge configurations are available in [`examples/`](examples/). 
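+
+For instance, [`examples/linear.yml`](examples/linear.yml) performs a simple weighted average of three 13B models:
+
+```yaml
+models:
+  - model: psmathur/orca_mini_v3_13b
+    parameters:
+      weight: 1.0
+  - model: WizardLM/WizardLM-13B-V1.2
+    parameters:
+      weight: 0.3
+  - model: garage-bAInd/Platypus2-13B
+    parameters:
+      weight: 0.5
+merge_method: linear
+dtype: float16
+```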
+ +## Merge Methods + +A quick overview of the currently supported merge methods: + +| Method | `merge_method` value | Multi-Model | Uses base model | +| ------------------------------------------------------------------------------------------------ | -------------------- | ----------- | --------------- | +| Linear ([Model Soups](https://arxiv.org/abs/2203.05482)) | `linear` | ✅ | ❌ | +| SLERP | `slerp` | ❌ | ✅ | +| Nearswap | `nearswap` | ❌ | ✅ | +| [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `task_arithmetic` | ✅ | ✅ | +| [TIES](https://arxiv.org/abs/2306.01708) | `ties` | ✅ | ✅ | +| [DARE](https://arxiv.org/abs/2311.03099) [TIES](https://arxiv.org/abs/2306.01708) | `dare_ties` | ✅ | ✅ | +| [DARE](https://arxiv.org/abs/2311.03099) [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `dare_linear` | ✅ | ✅ | +| Passthrough | `passthrough` | ❌ | ❌ | +| [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) | `breadcrumbs` | ✅ | ✅ | +| [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) + [TIES](https://arxiv.org/abs/2306.01708) | `breadcrumbs_ties` | ✅ | ✅ | +| [Model Stock](https://arxiv.org/abs/2403.19522) | `model_stock` | ✅ | ✅ | +| NuSLERP | `nuslerp` | ❌ | ✅ | +| [DELLA](https://arxiv.org/abs/2406.11617) | `della` | ✅ | ✅ | +| [DELLA](https://arxiv.org/abs/2406.11617) [Task Arithmetic](https://arxiv.org/abs/2212.04089) | `della_linear` | ✅ | ✅ | + +### Linear + +The classic merge method - a simple weighted average. + +Parameters: + +- `weight` - relative (or absolute if `normalize=False`) weighting of a given tensor +- `normalize` - if true, the weights of all models contributing to a tensor will be normalized. Default behavior. + +### SLERP + +Spherically interpolate the parameters of two models. One must be set as `base_model`. + +Parameters: + +- `t` - interpolation factor. At `t=0` will return `base_model`, at `t=1` will return the other one. + +### Nearswap + +Interpolates base model with secondary model if similarity is below t. Accepts two models. + +Parameters: + +- `t` - similarity threshold + +### [Task Arithmetic](https://arxiv.org/abs/2212.04089) + +Computes "task vectors" for each model by subtracting a base model. Merges the task vectors linearly and adds back the base. Works great for models that were fine tuned from a common ancestor. Also a super useful mental framework for several of the more involved merge methods. + +Parameters: same as [Linear](#linear) + +### [TIES](https://arxiv.org/abs/2306.01708) + +Builds on the task arithmetic framework. Resolves interference between models by sparsifying the task vectors and applying a sign consensus algorithm. Allows you to merge a larger number of models and retain more of their strengths. + +Parameters: same as [Linear](#linear), plus: + +- `density` - fraction of weights in differences from the base model to retain + +### [DARE](https://arxiv.org/abs/2311.03099) + +In the same vein as TIES, sparsifies task vectors to reduce interference. Differs in that DARE uses random pruning with a novel rescaling to better match performance of the original models. DARE can be used either with the sign consensus algorithm of TIES (`dare_ties`) or without (`dare_linear`). + +Parameters: same as [TIES](#ties) for `dare_ties`, or [Linear](#linear) for `dare_linear` + +### Passthrough + +`passthrough` is a no-op that simply passes input tensors through unmodified. It is meant to be used for layer-stacking type merges where you have only one input model. Useful for frankenmerging. 
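+
+For instance, [`examples/orcamini-platy-44layer.yml`](examples/orcamini-platy-44layer.yml) stacks the first 24 layers of psmathur/orca_mini_v3_13b followed by the last 20 layers of garage-bAInd/Platypus2-13B, giving a 44-layer frankenmerge:
+
+```yaml
+slices:
+  - sources:
+      - model: psmathur/orca_mini_v3_13b
+        layer_range: [0, 24]
+  - sources:
+      - model: garage-bAInd/Platypus2-13B
+        layer_range: [20, 40]
+merge_method: passthrough
+dtype: float16
+```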
+ +### [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) + +An extension of task arithmetic that discards both small and extremely large differences from the base model. As with DARE, the Model Breadcrumbs algorithm can be used with (`breadcrumbs_ties`) or without (`breadcrumbs`) the sign consensus algorithm of TIES. + +Parameters: same as [Linear](#linear), plus: + +- `density` - fraction of weights in differences from the base model to retain +- `gamma` - fraction of largest magnitude differences to remove + +Note that `gamma` corresponds with the parameter `β` described in the paper, while `density` is the final density of the sparsified tensors (related to `γ` and `β` by `density = 1 - γ - β`). For good default values, try `density: 0.9` and `gamma: 0.01`. + +### [Model Stock](https://arxiv.org/abs/2403.19522) + +Uses some neat geometric properties of fine tuned models to compute good weights for linear interpolation. Requires at least three models, including a base model. + +Parameters: + +- `filter_wise`: if true, weight calculation will be per-row rather than per-tensor. Not recommended. + +### NuSLERP + +Spherically interpolate between parameters, but with more options and more sensical configuration! Does not require a base model, but can use one to do spherical interpolation of task vectors. Only works with either two models or two plus a base model. + +Parameters: + +- `weight`: relative weighting of a given tensor +- `nuslerp_flatten`: set to false to do row-wise/column-wise interpolation instead of treating tensors as vectors +- `nuslerp_row_wise`: SLERP row vectors instead of column vectors + +To replicate the behavior of the original `slerp` method, set `weight` to `1-t` and `t` for your first and second model respectively. + +### [DELLA](https://arxiv.org/abs/2406.11617) + +Building upon DARE, DELLA uses adaptive pruning based on parameter magnitudes. DELLA first ranks parameters in each row of delta parameters and assigns drop probabilities inversely proportional to their magnitudes. This allows it to retain more important changes while reducing interference. After pruning, it rescales the remaining parameters similar to [DARE](#dare). DELLA can be used with (`della`) or without (`della_linear`) the sign elect step of TIES + +Parameters: same as [Linear](#linear), plus: + +- `density` - fraction of weights in differences from the base model to retain +- `epsilon` - maximum change in drop probability based on magnitude. Drop probabilities assigned will range from `density - epsilon` to `density + epsilon`. (When selecting values for `density` and `epsilon`, ensure that the range of probabilities falls within 0 to 1) +- `lambda` - scaling factor for the final merged delta parameters before merging with the base parameters. + +## LoRA extraction + +Mergekit allows extracting PEFT-compatible low-rank approximations of finetuned models. + +### Usage + +```sh +mergekit-extract-lora finetuned_model_id_or_path base_model_id_or_path output_path [--no-lazy-unpickle] --rank=desired_rank +``` + +## Mixture of Experts merging + +The `mergekit-moe` script supports merging multiple dense models into a mixture of experts, either for direct use or for further training. For more details see the [`mergekit-moe` documentation](docs/moe.md). + +## Evolutionary merge methods + +See [`docs/evolve.md`](docs/evolve.md) for details. + +## ✨ Merge in the Cloud ✨ + +We host merging on Arcee's cloud GPUs - you can launch a cloud merge in the [Arcee App](https://app.arcee.ai). 
Or through python - grab an ARCEE_API_KEY: + +`export ARCEE_API_KEY=` +`pip install -q arcee-py` + +```python +import arcee +arcee.merge_yaml("bio-merge","./examples/bio-merge.yml") +``` + +Check your merge status at the [Arcee App](https://app.arcee.ai) + +When complete, either deploy your merge: + +```python +arcee.start_deployment("bio-merge", merging="bio-merge") +``` + +Or download your merge: + +`!arcee merging download bio-merge` + +## Citation + +If you find `mergekit` useful in your research, please consider citing the [paper](https://aclanthology.org/2024.emnlp-industry.36/): + +```bibtex +@inproceedings{goddard-etal-2024-arcees, + title = "Arcee{'}s {M}erge{K}it: A Toolkit for Merging Large Language Models", + author = "Goddard, Charles and + Siriwardhana, Shamane and + Ehghaghi, Malikeh and + Meyers, Luke and + Karpukhin, Vladimir and + Benedict, Brian and + McQuade, Mark and + Solawetz, Jacob", + editor = "Dernoncourt, Franck and + Preo{\c{t}}iuc-Pietro, Daniel and + Shimorina, Anastasia", + booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track", + month = nov, + year = "2024", + address = "Miami, Florida, US", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.emnlp-industry.36", + doi = "10.18653/v1/2024.emnlp-industry.36", + pages = "477--485", + abstract = "The rapid growth of open-source language models provides the opportunity to merge model checkpoints, combining their parameters to improve performance and versatility. Advances in transfer learning have led to numerous task-specific models, which model merging can integrate into powerful multitask models without additional training. MergeKit is an open-source library designed to support this process with an efficient and extensible framework suitable for any hardware. It has facilitated the merging of thousands of models, contributing to some of the world{'}s most powerful open-source model checkpoints. 
The library is accessible at: https://github.com/arcee-ai/mergekit.", +} +``` diff --git a/mergekit/mergekit.egg-info/SOURCES.txt b/mergekit/mergekit.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb248c086b81892e11eafcd7e75b5e1ecd944d4b --- /dev/null +++ b/mergekit/mergekit.egg-info/SOURCES.txt @@ -0,0 +1,119 @@ +LICENSE +README.md +pyproject.toml +mergekit/__init__.py +mergekit/architecture.py +mergekit/card.py +mergekit/common.py +mergekit/config.py +mergekit/graph.py +mergekit/merge.py +mergekit/options.py +mergekit/plan.py +mergekit/sparsify.py +mergekit.egg-info/PKG-INFO +mergekit.egg-info/SOURCES.txt +mergekit.egg-info/dependency_links.txt +mergekit.egg-info/entry_points.txt +mergekit.egg-info/requires.txt +mergekit.egg-info/top_level.txt +mergekit/_data/__init__.py +mergekit/_data/architectures/__init__.py +mergekit/_data/architectures/baichuan.json +mergekit/_data/architectures/bert-masked-lm.json +mergekit/_data/architectures/bert-sequence-classification.json +mergekit/_data/architectures/bert.json +mergekit/_data/architectures/chatglm.json +mergekit/_data/architectures/cohere.json +mergekit/_data/architectures/distilbert-masked-lm.json +mergekit/_data/architectures/distilbert-sequence-classification.json +mergekit/_data/architectures/distilbert-token-classification.json +mergekit/_data/architectures/distilbert.json +mergekit/_data/architectures/exaone.json +mergekit/_data/architectures/falcon.json +mergekit/_data/architectures/gemma.json +mergekit/_data/architectures/gemma2.json +mergekit/_data/architectures/gpt-neox.json +mergekit/_data/architectures/gpt2-sequence-classification.json +mergekit/_data/architectures/gpt2.json +mergekit/_data/architectures/gptbigcode.json +mergekit/_data/architectures/internlm2.json +mergekit/_data/architectures/jais.json +mergekit/_data/architectures/llama.json +mergekit/_data/architectures/mamba.json +mergekit/_data/architectures/mistral.json +mergekit/_data/architectures/phi-1.json +mergekit/_data/architectures/phi2-old.json +mergekit/_data/architectures/phi2.json +mergekit/_data/architectures/phi3-small.json +mergekit/_data/architectures/phi3.json +mergekit/_data/architectures/qwen.json +mergekit/_data/architectures/qwen2.json +mergekit/_data/architectures/roberta-masked-lm.json +mergekit/_data/architectures/roberta-sequence-classification.json +mergekit/_data/architectures/roberta-token-classification.json +mergekit/_data/architectures/roberta.json +mergekit/_data/architectures/solar.json +mergekit/_data/architectures/stablelm.json +mergekit/_data/architectures/stablelm2.json +mergekit/_data/architectures/starcoder2.json +mergekit/_data/chat_templates/__init__.py +mergekit/_data/chat_templates/alpaca.jinja +mergekit/_data/chat_templates/chatml.jinja +mergekit/_data/chat_templates/exaone.jinja +mergekit/_data/chat_templates/llama3.jinja +mergekit/_data/chat_templates/mistral.jinja +mergekit/evo/__init__.py +mergekit/evo/actors.py +mergekit/evo/config.py +mergekit/evo/genome.py +mergekit/evo/helpers.py +mergekit/evo/monkeypatch.py +mergekit/evo/strategy.py +mergekit/io/__init__.py +mergekit/io/lazy_tensor_loader.py +mergekit/io/lazy_unpickle.py +mergekit/io/loader.py +mergekit/io/tasks.py +mergekit/io/tensor_writer.py +mergekit/merge_methods/__init__.py +mergekit/merge_methods/base.py +mergekit/merge_methods/generalized_task_arithmetic.py +mergekit/merge_methods/linear.py +mergekit/merge_methods/model_stock.py +mergekit/merge_methods/nearswap.py +mergekit/merge_methods/nuslerp.py 
+mergekit/merge_methods/passthrough.py +mergekit/merge_methods/rectify_embed.py +mergekit/merge_methods/slerp.py +mergekit/merge_methods/tokenizer_permute.py +mergekit/moe/__init__.py +mergekit/moe/arch.py +mergekit/moe/common.py +mergekit/moe/config.py +mergekit/moe/deepseek.py +mergekit/moe/mixtral.py +mergekit/moe/qwen.py +mergekit/moe/router.py +mergekit/scripts/__init__.py +mergekit/scripts/bakllama.py +mergekit/scripts/evolve.py +mergekit/scripts/extract_lora.py +mergekit/scripts/layershuffle.py +mergekit/scripts/legacy.py +mergekit/scripts/megamerge.py +mergekit/scripts/moe.py +mergekit/scripts/run_yaml.py +mergekit/scripts/tokensurgeon.py +mergekit/tokenizer/__init__.py +mergekit/tokenizer/build.py +mergekit/tokenizer/config.py +mergekit/tokenizer/embed.py +tests/test_basic_merges.py +tests/test_chat_template.py +tests/test_graph.py +tests/test_io.py +tests/test_lazy_unpickle.py +tests/test_modelref.py +tests/test_sparsify.py +tests/test_tokenizer.py \ No newline at end of file diff --git a/mergekit/mergekit.egg-info/dependency_links.txt b/mergekit/mergekit.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/mergekit/mergekit.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/mergekit/mergekit.egg-info/entry_points.txt b/mergekit/mergekit.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..d35f77551c680f7c9c412e38502e01e8f99de00b --- /dev/null +++ b/mergekit/mergekit.egg-info/entry_points.txt @@ -0,0 +1,10 @@ +[console_scripts] +bakllama = mergekit.scripts.bakllama:main +mergekit-evolve = mergekit.scripts.evolve:main +mergekit-extract-lora = mergekit.scripts.extract_lora:main +mergekit-layershuffle = mergekit.scripts.layershuffle:main +mergekit-legacy = mergekit.scripts.legacy:main +mergekit-mega = mergekit.scripts.megamerge:main +mergekit-moe = mergekit.scripts.moe:main +mergekit-tokensurgeon = mergekit.scripts.tokensurgeon:main +mergekit-yaml = mergekit.scripts.run_yaml:main diff --git a/mergekit/mergekit.egg-info/requires.txt b/mergekit/mergekit.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b73e13251b6b033ec804501da38e0cb34c37633 --- /dev/null +++ b/mergekit/mergekit.egg-info/requires.txt @@ -0,0 +1,34 @@ +torch>=2.0.0 +tqdm==4.66.5 +click==8.1.7 +safetensors~=0.4.3 +accelerate~=1.0.1 +pydantic~=2.9.2 +immutables==0.20 +transformers>=4.45.2 +tokenizers>=0.20.1 +huggingface_hub +peft +typing-extensions +sentencepiece +protobuf +scipy +datasets + +[dev] +black~=24.10.0 +isort~=5.13.2 +pre-commit~=4.0.1 + +[evolve] +ray +cma +lm_eval +wandb + +[test] +pytest~=8.3.3 + +[vllm] +vllm==0.3.2 +lm_eval[vllm] diff --git a/mergekit/mergekit.egg-info/top_level.txt b/mergekit/mergekit.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..638bb697f402b8b1653e727c849cb526250bafca --- /dev/null +++ b/mergekit/mergekit.egg-info/top_level.txt @@ -0,0 +1 @@ +mergekit diff --git a/mergekit/mergekit/__init__.py b/mergekit/mergekit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mergekit/mergekit/__pycache__/__init__.cpython-310.pyc b/mergekit/mergekit/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb2657c6b5d1b355bf36278e91532e3fc26ce07d Binary files /dev/null and 
b/mergekit/mergekit/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/architecture.cpython-310.pyc b/mergekit/mergekit/__pycache__/architecture.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fd0dc01aba0c8f0ebd15484e158c41ce4d5a2aa Binary files /dev/null and b/mergekit/mergekit/__pycache__/architecture.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/card.cpython-310.pyc b/mergekit/mergekit/__pycache__/card.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6922d7e703742b587c92e9eaffd8ca40688a744 Binary files /dev/null and b/mergekit/mergekit/__pycache__/card.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/common.cpython-310.pyc b/mergekit/mergekit/__pycache__/common.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbd090913ef97f0533e1406bb932a53ee4884b7f Binary files /dev/null and b/mergekit/mergekit/__pycache__/common.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/config.cpython-310.pyc b/mergekit/mergekit/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd2f0de5c3e1e53357c8789392e0cd3b8d509b6b Binary files /dev/null and b/mergekit/mergekit/__pycache__/config.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/graph.cpython-310.pyc b/mergekit/mergekit/__pycache__/graph.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94581a80387571e838ee503fbfb2894898baa3f5 Binary files /dev/null and b/mergekit/mergekit/__pycache__/graph.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/merge.cpython-310.pyc b/mergekit/mergekit/__pycache__/merge.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a3a69449949863916ebb07bc2e22ab035701610 Binary files /dev/null and b/mergekit/mergekit/__pycache__/merge.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/options.cpython-310.pyc b/mergekit/mergekit/__pycache__/options.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b825b88877000222339da6d5995f40e782c67f97 Binary files /dev/null and b/mergekit/mergekit/__pycache__/options.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/plan.cpython-310.pyc b/mergekit/mergekit/__pycache__/plan.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07fc5854ae8612bb43c98f488d773594a64b3ba6 Binary files /dev/null and b/mergekit/mergekit/__pycache__/plan.cpython-310.pyc differ diff --git a/mergekit/mergekit/__pycache__/sparsify.cpython-310.pyc b/mergekit/mergekit/__pycache__/sparsify.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f671b67eaf803038df51aa1dc91ff86e5da56a5f Binary files /dev/null and b/mergekit/mergekit/__pycache__/sparsify.cpython-310.pyc differ diff --git a/mergekit/mergekit/_data/__init__.py b/mergekit/mergekit/_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mergekit/mergekit/_data/__pycache__/__init__.cpython-310.pyc b/mergekit/mergekit/_data/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aba39fd3c953b763120bca8a1ce9dcc8641c7f6 Binary files /dev/null and b/mergekit/mergekit/_data/__pycache__/__init__.cpython-310.pyc differ diff --git 
a/mergekit/mergekit/_data/architectures/__init__.py b/mergekit/mergekit/_data/architectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mergekit/mergekit/_data/architectures/__pycache__/__init__.cpython-310.pyc b/mergekit/mergekit/_data/architectures/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eccabed5763bda11dacc9b8dc62bfdf2041ba8bc Binary files /dev/null and b/mergekit/mergekit/_data/architectures/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/_data/architectures/baichuan.json b/mergekit/mergekit/_data/architectures/baichuan.json new file mode 100644 index 0000000000000000000000000000000000000000..3d28020c62749750bb9340bbd808d240c8daa1dd --- /dev/null +++ b/mergekit/mergekit/_data/architectures/baichuan.json @@ -0,0 +1,47 @@ +{ + "model_type": "baichuan", + "architectures": [ + "BaichuanForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "lm_head.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.W_pack.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/bert-masked-lm.json b/mergekit/mergekit/_data/architectures/bert-masked-lm.json new file mode 100644 index 0000000000000000000000000000000000000000..d6430e402628c5467dd2f79b570400d329e6df60 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/bert-masked-lm.json @@ -0,0 +1,119 @@ +{ + "model_type": "bert", + "architectures": [ + "BertForMaskedLM" + ], + "pre_weights": [ + { + "name": "bert.embeddings.position_embeddings.weight" + }, + { + "name": "bert.embeddings.token_type_embeddings.weight" + }, + { + "name": "bert.embeddings.word_embeddings.weight", + "is_embed": true + }, + { + "name": "bert.embeddings.LayerNorm.bias", + "aliases": [ + "bert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "bert.embeddings.LayerNorm.weight", + "aliases": [ + "bert.embeddings.LayerNorm.gamma" + ] + }, + { + "name": "bert.embeddings.position_ids", + "optional": true, + "force_dtype": "int64" + } + ], + "post_weights": [ + { + "name": "bert.pooler.dense.weight" + }, + { + "name": "bert.pooler.dense.bias" + }, + { + "name": "cls.predictions.bias" + }, + { + "name": "cls.predictions.decoder.weight", + "optional": true, + "tied_names": [ + "bert.embeddings.word_embeddings.weight" + ], + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "bert.encoder.layer.${layer_index}.attention.self.query.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.query.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.key.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.key.bias" + }, + { + "name": 
"bert.encoder.layer.${layer_index}.attention.self.value.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.value.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.dense.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.dense.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.beta" + ] + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.gamma" + ] + }, + { + "name": "bert.encoder.layer.${layer_index}.intermediate.dense.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.intermediate.dense.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.output.dense.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.output.dense.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.output.LayerNorm.beta" + ] + }, + { + "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.output.LayerNorm.gamma" + ] + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/bert-sequence-classification.json b/mergekit/mergekit/_data/architectures/bert-sequence-classification.json new file mode 100644 index 0000000000000000000000000000000000000000..81a61ff7d727508039af5b06fe7f0ad1d0327ff2 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/bert-sequence-classification.json @@ -0,0 +1,118 @@ +{ + "model_type": "bert", + "architectures": [ + "BertForSequenceClassification", + "BertForMultipleChoice", + "BertForTokenClassification" + ], + "pre_weights": [ + { + "name": "bert.embeddings.position_embeddings.weight" + }, + { + "name": "bert.embeddings.token_type_embeddings.weight" + }, + { + "name": "bert.embeddings.word_embeddings.weight", + "is_embed": true + }, + { + "name": "bert.embeddings.LayerNorm.bias", + "aliases": [ + "bert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "bert.embeddings.LayerNorm.weight", + "aliases": [ + "bert.embeddings.LayerNorm.gamma" + ] + }, + { + "name": "bert.embeddings.position_ids", + "optional": true, + "force_dtype": "int64" + } + ], + "post_weights": [ + { + "name": "bert.pooler.dense.weight", + "optional": true + }, + { + "name": "bert.pooler.dense.bias", + "optional": true + }, + { + "name": "classifier.bias" + }, + { + "name": "classifier.weight" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "bert.encoder.layer.${layer_index}.attention.self.query.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.query.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.key.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.key.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.value.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.self.value.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.dense.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.dense.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.beta" + ] + }, + { + "name": 
"bert.encoder.layer.${layer_index}.attention.output.LayerNorm.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.gamma" + ] + }, + { + "name": "bert.encoder.layer.${layer_index}.intermediate.dense.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.intermediate.dense.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.output.dense.weight" + }, + { + "name": "bert.encoder.layer.${layer_index}.output.dense.bias" + }, + { + "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.output.LayerNorm.beta" + ] + }, + { + "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.output.LayerNorm.gamma" + ] + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/bert.json b/mergekit/mergekit/_data/architectures/bert.json new file mode 100644 index 0000000000000000000000000000000000000000..5de5f5b029a7f6d59021c18fcafeba7382488065 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/bert.json @@ -0,0 +1,175 @@ +{ + "model_type": "bert", + "architectures": [ + "BertModel" + ], + "pre_weights": [ + { + "name": "embeddings.position_embeddings.weight", + "aliases": [ + "bert.embeddings.position_embeddings.weight" + ] + }, + { + "name": "embeddings.token_type_embeddings.weight", + "aliases": [ + "bert.embeddings.token_type_embeddings.weight" + ] + }, + { + "name": "embeddings.word_embeddings.weight", + "is_embed": true, + "aliases": [ + "bert.embeddings.word_embeddings.weight" + ] + }, + { + "name": "embeddings.LayerNorm.bias", + "aliases": [ + "embeddings.LayerNorm.beta", + "bert.embeddings.LayerNorm.bias", + "bert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "embeddings.LayerNorm.weight", + "aliases": [ + "embeddings.LayerNorm.gamma", + "bert.embeddings.LayerNorm.weight", + "bert.embeddings.LayerNorm.gamma", + "bert.embeddings.LayerNorm.weight" + ] + }, + { + "name": "embeddings.position_ids", + "optional": true, + "force_dtype": "int64", + "aliases": [ + "bert.embeddings.position_ids" + ] + } + ], + "post_weights": [ + { + "name": "pooler.dense.weight", + "aliases": [ + "bert.pooler.dense.weight" + ] + }, + { + "name": "pooler.dense.bias", + "aliases": [ + "bert.pooler.dense.bias" + ] + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "encoder.layer.${layer_index}.attention.self.query.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.self.query.weight" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.self.query.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.self.query.bias" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.self.key.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.self.key.weight" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.self.key.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.self.key.bias" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.self.value.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.self.value.weight" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.self.value.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.self.value.bias" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.output.dense.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.output.dense.weight" + ] + }, + { + "name": 
"encoder.layer.${layer_index}.attention.output.dense.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.attention.output.dense.bias" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.output.LayerNorm.bias", + "aliases": [ + "encoder.layer.${layer_index}.attention.output.LayerNorm.beta", + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.bias", + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.beta" + ] + }, + { + "name": "encoder.layer.${layer_index}.attention.output.LayerNorm.weight", + "aliases": [ + "encoder.layer.${layer_index}.attention.output.LayerNorm.gamma", + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.weight", + "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.gamma" + ] + }, + { + "name": "encoder.layer.${layer_index}.intermediate.dense.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.intermediate.dense.weight" + ] + }, + { + "name": "encoder.layer.${layer_index}.intermediate.dense.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.intermediate.dense.bias" + ] + }, + { + "name": "encoder.layer.${layer_index}.output.dense.weight", + "aliases": [ + "bert.encoder.layer.${layer_index}.output.dense.weight" + ] + }, + { + "name": "encoder.layer.${layer_index}.output.dense.bias", + "aliases": [ + "bert.encoder.layer.${layer_index}.output.dense.bias" + ] + }, + { + "name": "encoder.layer.${layer_index}.output.LayerNorm.bias", + "aliases": [ + "encoder.layer.${layer_index}.output.LayerNorm.beta", + "bert.encoder.layer.${layer_index}.output.LayerNorm.bias", + "bert.encoder.layer.${layer_index}.output.LayerNorm.beta" + ] + }, + { + "name": "encoder.layer.${layer_index}.output.LayerNorm.weight", + "aliases": [ + "encoder.layer.${layer_index}.output.LayerNorm.gamma", + "bert.encoder.layer.${layer_index}.output.LayerNorm.weight", + "bert.encoder.layer.${layer_index}.output.LayerNorm.gamma" + ] + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/chatglm.json b/mergekit/mergekit/_data/architectures/chatglm.json new file mode 100644 index 0000000000000000000000000000000000000000..b094685f82c5734027070bc9d45636abd4df2d88 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/chatglm.json @@ -0,0 +1,50 @@ +{ + "model_type": "chatglm", + "architectures": [ + "ChatGLMModel" + ], + "pre_weights": [ + { + "name": "transformer.embedding.word_embeddings.weight", + "is_embed": true + }, + { + "name": "transformer.rotary_pos_emb.inv_freq" + } + ], + "post_weights": [ + { + "name": "transformer.encoder.final_layernorm.weight" + }, + { + "name": "transformer.output_layer.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "transformer.encoder.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "transformer.encoder.layers.${layer_index}.mlp.dense_4h_to_h.weight" + }, + { + "name": "transformer.encoder.layers.${layer_index}.mlp.dense_h_to_4h.weight" + }, + { + "name": "transformer.encoder.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "transformer.encoder.layers.${layer_index}.self_attention.dense.weight" + }, + { + "name": "transformer.encoder.layers.${layer_index}.self_attention.query_key_value.bias" + }, + { + "name": "transformer.encoder.layers.${layer_index}.self_attention.query_key_value.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/cohere.json b/mergekit/mergekit/_data/architectures/cohere.json new file mode 100644 index 
0000000000000000000000000000000000000000..c80331032a188df7edebce76ce7de0b4c64403d5 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/cohere.json @@ -0,0 +1,51 @@ +{ + "model_type": "cohere", + "architectures": [ + "CohereForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/distilbert-masked-lm.json b/mergekit/mergekit/_data/architectures/distilbert-masked-lm.json new file mode 100644 index 0000000000000000000000000000000000000000..1a079811ccbc953e29f0ba3831096bad0bda4323 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/distilbert-masked-lm.json @@ -0,0 +1,105 @@ +{ + "model_type": "distilbert", + "architectures": [ + "DistilBertForMaskedLM" + ], + "pre_weights": [ + { + "name": "distilbert.embeddings.position_embeddings.weight" + }, + { + "name": "distilbert.embeddings.word_embeddings.weight", + "is_embed": true + }, + { + "name": "distilbert.embeddings.LayerNorm.bias", + "aliases": [ + "distilbert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "distilbert.embeddings.LayerNorm.weight", + "aliases": [ + "distilbert.embeddings.LayerNorm.gamma" + ] + } + ], + "post_weights": [ + { + "name": "vocab_transform.weight" + }, + { + "name": "vocab_transform.bias" + }, + { + "name": "vocab_layer_norm.bias" + }, + { + "name": "vocab_layer_norm.weight" + }, + { + "name": "vocab_projector.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "distilbert.embeddings.word_embeddings.weight" + ] + }, + { + "name": "vocab_projector.bias" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.bias" + }, + { + "name": 
"distilbert.transformer.layer.${layer_index}.ffn.lin2.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/distilbert-sequence-classification.json b/mergekit/mergekit/_data/architectures/distilbert-sequence-classification.json new file mode 100644 index 0000000000000000000000000000000000000000..00c727639e7bd02ccd09279c59ae4b98c47529cd --- /dev/null +++ b/mergekit/mergekit/_data/architectures/distilbert-sequence-classification.json @@ -0,0 +1,94 @@ +{ + "model_type": "distilbert", + "architectures": [ + "DistilBertForSequenceClassification" + ], + "pre_weights": [ + { + "name": "distilbert.embeddings.position_embeddings.weight" + }, + { + "name": "distilbert.embeddings.word_embeddings.weight", + "is_embed": true + }, + { + "name": "distilbert.embeddings.LayerNorm.bias", + "aliases": [ + "distilbert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "distilbert.embeddings.LayerNorm.weight", + "aliases": [ + "distilbert.embeddings.LayerNorm.gamma" + ] + } + ], + "post_weights": [ + { + "name": "classifier.bias" + }, + { + "name": "classifier.weight" + }, + { + "name": "pre_classifier.bias" + }, + { + "name": "pre_classifier.weight" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/distilbert-token-classification.json b/mergekit/mergekit/_data/architectures/distilbert-token-classification.json new file mode 100644 index 0000000000000000000000000000000000000000..456c512f185db5b8fd3569a77713d455ffdfc04f --- /dev/null +++ b/mergekit/mergekit/_data/architectures/distilbert-token-classification.json @@ -0,0 +1,88 @@ +{ + "model_type": "distilbert", + "architectures": [ + "DistilBertForTokenClassification" + ], + "pre_weights": [ + { + "name": "distilbert.embeddings.position_embeddings.weight" + }, + { + "name": "distilbert.embeddings.word_embeddings.weight", + "is_embed": true + }, 
+ { + "name": "distilbert.embeddings.LayerNorm.bias", + "aliases": [ + "distilbert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "distilbert.embeddings.LayerNorm.weight", + "aliases": [ + "distilbert.embeddings.LayerNorm.gamma" + ] + } + ], + "post_weights": [ + { + "name": "classifier.bias" + }, + { + "name": "classifier.weight" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/distilbert.json b/mergekit/mergekit/_data/architectures/distilbert.json new file mode 100644 index 0000000000000000000000000000000000000000..bd99716235b5b6c9e620be1bb25ffb46823bc98b --- /dev/null +++ b/mergekit/mergekit/_data/architectures/distilbert.json @@ -0,0 +1,81 @@ +{ + "model_type": "distilbert", + "architectures": [ + "DistilBertModel" + ], + "pre_weights": [ + { + "name": "distilbert.embeddings.position_embeddings.weight" + }, + { + "name": "distilbert.embeddings.word_embeddings.weight", + "is_embed": true + }, + { + "name": "distilbert.embeddings.LayerNorm.bias", + "aliases": [ + "distilbert.embeddings.LayerNorm.beta" + ] + }, + { + "name": "distilbert.embeddings.LayerNorm.weight", + "aliases": [ + "distilbert.embeddings.LayerNorm.gamma" + ] + } + ], + "post_weights": [], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.k_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.q_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.v_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.attention.out_lin.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.sa_layer_norm.bias" + }, + { + "name": 
"distilbert.transformer.layer.${layer_index}.sa_layer_norm.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin1.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.weight" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.ffn.lin2.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.bias" + }, + { + "name": "distilbert.transformer.layer.${layer_index}.output_layer_norm.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/exaone.json b/mergekit/mergekit/_data/architectures/exaone.json new file mode 100644 index 0000000000000000000000000000000000000000..e9024473f744621f8b345c02ba3fe69e68b919e1 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/exaone.json @@ -0,0 +1,78 @@ +{ + "model_type": "exaone", + "architectures": [ + "ExaoneForCausalLM" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight", + "is_embed": true, + "output_space": "running_residual" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.ln_1.weight", + "input_space": "running_residual" + }, + { + "name": "transformer.h.${layer_index}.attn.attention.q_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "transformer.h.${layer_index}.attn.attention.k_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "transformer.h.${layer_index}.attn.attention.v_proj.weight", + "input_space": "running_residual", + "output_space": "attn_v_${layer_index}", + "head_split": "output" + }, + { + "name": "transformer.h.${layer_index}.attn.attention.out_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "running_residual", + "head_split": "input" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight", + "input_space": "running_residual" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc_0.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc_1.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "running_residual" + } + ] + }, + "post_weights": [ + { + "name": "transformer.ln_f.weight", + "input_space": "running_residual" + }, + { + "name": "lm_head.weight", + "input_space": "running_residual", + "is_embed": true + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/falcon.json b/mergekit/mergekit/_data/architectures/falcon.json new file mode 100644 index 0000000000000000000000000000000000000000..648dab06250ca450b67f9fa15c381543c9c55ed6 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/falcon.json @@ -0,0 +1,53 @@ +{ + "model_type": "falcon", + "architectures": [ + "FalconForCausalLM" + ], + "pre_weights": [ + { + "name": "transformer.word_embeddings.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "transformer.ln_f.weight" + }, + { + "name": "transformer.ln_f.bias" + }, + { + "name": "lm_head.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": 
"transformer.h.${layer_index}.ln_attn.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_attn.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_mlp.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_mlp.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.dense_4h_to_h.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.dense_h_to_4h.weight" + }, + { + "name": "transformer.h.${layer_index}.self_attention.dense.weight" + }, + { + "name": "transformer.h.${layer_index}.self_attention.query_key_value.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/gemma.json b/mergekit/mergekit/_data/architectures/gemma.json new file mode 100644 index 0000000000000000000000000000000000000000..43dbe27ae2c95357c057e091b05aee029b5e1cd1 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/gemma.json @@ -0,0 +1,85 @@ +{ + "model_type": "gemma", + "architectures": [ + "GemmaForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true, + "output_space": "h_0" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight", + "input_space": "h_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_qk_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_qk_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_v_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "post_attn_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight", + "input_space": "h_a_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight", + "input_space": "h_a_${layer_index}", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight", + "input_space": "h_a_${layer_index}", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "post_mlp_${layer_index}" + } + ], + "procedural_spaces": [ + { + "name": "h_a_${layer_index}", + "type": "residual", + "inputs": [ + "h_${layer_index}", + "post_attn_${layer_index}" + ] + }, + { + "name": "h_${layer_index+1}", + "type": "residual", + "inputs": [ + "h_a_${layer_index}", + "post_mlp_${layer_index}" + ] + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight", + "input_space": "h_${num_layers}" + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/gemma2.json b/mergekit/mergekit/_data/architectures/gemma2.json new file mode 100644 index 0000000000000000000000000000000000000000..525052450521f8a435dbc5b13565f96597d717ae --- /dev/null +++ b/mergekit/mergekit/_data/architectures/gemma2.json @@ -0,0 +1,63 @@ +{ + "model_type": "gemma2", + "architectures": [ + "Gemma2ForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": 
"model.layers.${layer_index}.self_attn.q_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.pre_feedforward_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.post_feedforward_layernorm.weight" + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/gpt-neox.json b/mergekit/mergekit/_data/architectures/gpt-neox.json new file mode 100644 index 0000000000000000000000000000000000000000..e8bf29a3ba8ad16ee61fd75392511662c52e4e25 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/gpt-neox.json @@ -0,0 +1,74 @@ +{ + "model_type": "gpt_neox", + "architectures": [ + "GPTNeoXForCausalLM" + ], + "pre_weights": [ + { + "name": "gpt_neox.embed_in.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "gpt_neox.final_layer_norm.bias" + }, + { + "name": "gpt_neox.final_layer_norm.weight" + }, + { + "name": "embed_out.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "gpt_neox.layers.${layer_index}.attention.dense.weight" + }, + { + "name": "gpt_neox.layers.${layer_index}.attention.dense.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.attention.query_key_value.weight" + }, + { + "name": "gpt_neox.layers.${layer_index}.attention.query_key_value.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "gpt_neox.layers.${layer_index}.input_layernorm.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.mlp.dense_4h_to_h.weight" + }, + { + "name": "gpt_neox.layers.${layer_index}.mlp.dense_4h_to_h.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.mlp.dense_h_to_4h.weight" + }, + { + "name": "gpt_neox.layers.${layer_index}.mlp.dense_h_to_4h.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "gpt_neox.layers.${layer_index}.post_attention_layernorm.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.attention.bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.attention.masked_bias" + }, + { + "name": "gpt_neox.layers.${layer_index}.attention.rotary_emb.inv_freq" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/gpt2-sequence-classification.json b/mergekit/mergekit/_data/architectures/gpt2-sequence-classification.json new file mode 100644 index 0000000000000000000000000000000000000000..54cf31f6159988057ab8b4e6dfe019691a7d2d07 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/gpt2-sequence-classification.json @@ -0,0 +1,66 @@ +{ + "model_type": "gpt2", + "architectures": [ + "GPT2ForSequenceClassification" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight" + }, + { + "name": "transformer.wpe.weight" + } + ], + "post_weights": [ + { + "name": "transformer.ln_f.weight" + }, + { + "name": "transformer.ln_f.bias" + }, + { + 
"name": "score.weight" + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.attn.c_attn.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_attn.bias" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_1.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_1.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_2.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/gpt2.json b/mergekit/mergekit/_data/architectures/gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..64a04e9d4ddef9dc496d7f290f92b6db7ce6e263 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/gpt2.json @@ -0,0 +1,64 @@ +{ + "model_type": "gpt2", + "architectures": [ + "GPT2LMHeadModel" + ], + "pre_weights": [ + { + "name": "wte.weight", + "is_embed": true + }, + { + "name": "wpe.weight" + } + ], + "post_weights": [ + { + "name": "ln_f.weight" + }, + { + "name": "ln_f.bias" + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "h.${layer_index}.attn.c_attn.weight" + }, + { + "name": "h.${layer_index}.attn.c_attn.bias" + }, + { + "name": "h.${layer_index}.attn.c_proj.weight" + }, + { + "name": "h.${layer_index}.attn.c_proj.bias" + }, + { + "name": "h.${layer_index}.ln_1.weight" + }, + { + "name": "h.${layer_index}.ln_1.bias" + }, + { + "name": "h.${layer_index}.ln_2.weight" + }, + { + "name": "h.${layer_index}.ln_2.bias" + }, + { + "name": "h.${layer_index}.mlp.c_proj.weight" + }, + { + "name": "h.${layer_index}.mlp.c_proj.bias" + }, + { + "name": "h.${layer_index}.mlp.c_fc.weight" + }, + { + "name": "h.${layer_index}.mlp.c_fc.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/gptbigcode.json b/mergekit/mergekit/_data/architectures/gptbigcode.json new file mode 100644 index 0000000000000000000000000000000000000000..c12bac5c7335bae04c443ddf3cffd2dfd5ace306 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/gptbigcode.json @@ -0,0 +1,72 @@ +{ + "model_type": "gpt_bigcode", + "architectures": [ + "GPTBigCodeForCausalLM" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight", + "is_embed": true + }, + { + "name": "transformer.wpe.weight" + } + ], + "post_weights": [ + { + "name": "transformer.ln_f.weight" + }, + { + "name": "transformer.ln_f.bias" + }, + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "transformer.wte.weight" + ] + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.attn.c_attn.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_attn.bias" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_1.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_1.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight" + }, + { + "name": 
"transformer.h.${layer_index}.ln_2.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/internlm2.json b/mergekit/mergekit/_data/architectures/internlm2.json new file mode 100644 index 0000000000000000000000000000000000000000..888faa484a58a3e9fa88e37497d9f792bf2289ec --- /dev/null +++ b/mergekit/mergekit/_data/architectures/internlm2.json @@ -0,0 +1,51 @@ +{ + "model_type": "internlm2", + "architectures": [ + "InternLM2ForCausalLM" + ], + "pre_weights": [ + { + "name": "model.tok_embeddings.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "output.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.tok_embeddings.weight" + ] + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.attention_norm.weight" + }, + { + "name": "model.layers.${layer_index}.ffn_norm.weight" + }, + { + "name": "model.layers.${layer_index}.attention.wqkv.weight" + }, + { + "name": "model.layers.${layer_index}.attention.wo.weight" + }, + { + "name": "model.layers.${layer_index}.feed_forward.w1.weight" + }, + { + "name": "model.layers.${layer_index}.feed_forward.w2.weight" + }, + { + "name": "model.layers.${layer_index}.feed_forward.w3.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/jais.json b/mergekit/mergekit/_data/architectures/jais.json new file mode 100644 index 0000000000000000000000000000000000000000..c08cca95ff49b58168b8c796b70db1fa491056f4 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/jais.json @@ -0,0 +1,70 @@ +{ + "model_type": "jais", + "architectures": [ + "JAISLMHeadModel" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight", + "is_embed": true + }, + { + "name": "transformer.relative_pe.slopes" + } + ], + "post_weights": [ + { + "name": "transformer.ln_f.weight" + }, + { + "name": "transformer.ln_f.bias" + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.attn.c_attn.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_attn.bias" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_1.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_1.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_2.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc2.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc2.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/llama.json b/mergekit/mergekit/_data/architectures/llama.json new file mode 100644 index 0000000000000000000000000000000000000000..00918a2c06803dcddd03e6345a7c4dcb74930f40 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/llama.json @@ -0,0 +1,83 @@ +{ + "model_type": 
"llama", + "architectures": [ + "LlamaForCausalLM", + "LLaMaForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true, + "output_space": "running_residual" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight", + "input_space": "running_residual" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight", + "input_space": "running_residual", + "output_space": "attn_v_${layer_index}", + "head_split": "output" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "running_residual", + "head_split": "input" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight", + "input_space": "running_residual" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "running_residual" + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight", + "input_space": "running_residual" + }, + { + "name": "lm_head.weight", + "input_space": "running_residual", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/mamba.json b/mergekit/mergekit/_data/architectures/mamba.json new file mode 100644 index 0000000000000000000000000000000000000000..1c4735325e80469b5bd46e77b2aca93416860bc6 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/mamba.json @@ -0,0 +1,60 @@ +{ + "model_type": "mamba", + "architectures": [ + "MambaForCausalLM" + ], + "pre_weights": [ + { + "name": "backbone.embeddings.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "backbone.norm_f.weight" + }, + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "backbone.embeddings.weight" + ] + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "backbone.layers.${layer_index}.mixer.A_log" + }, + { + "name": "backbone.layers.${layer_index}.mixer.conv1d.bias" + }, + { + "name": "backbone.layers.${layer_index}.mixer.conv1d.weight" + }, + { + "name": "backbone.layers.${layer_index}.mixer.D" + }, + { + "name": "backbone.layers.${layer_index}.mixer.dt_proj.bias" + }, + { + "name": "backbone.layers.${layer_index}.mixer.dt_proj.weight" + }, + { + "name": "backbone.layers.${layer_index}.mixer.in_proj.weight" + }, + { + "name": "backbone.layers.${layer_index}.mixer.out_proj.weight" + }, + { + "name": "backbone.layers.${layer_index}.mixer.x_proj.weight" + }, + { + "name": "backbone.layers.${layer_index}.norm.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/mistral.json 
b/mergekit/mergekit/_data/architectures/mistral.json new file mode 100644 index 0000000000000000000000000000000000000000..c024370d4ca80acb690b3264fb2e45b0ef9b2d01 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/mistral.json @@ -0,0 +1,90 @@ +{ + "model_type": "mistral", + "architectures": [ + "MistralForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true, + "output_space": "h_0" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight", + "input_space": "h_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_qk_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_qk_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_v_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "post_attn_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight", + "input_space": "h_a_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight", + "input_space": "h_a_${layer_index}", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight", + "input_space": "h_a_${layer_index}", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "post_mlp_${layer_index}" + } + ], + "procedural_spaces": [ + { + "name": "h_a_${layer_index}", + "type": "residual", + "inputs": [ + "h_${layer_index}", + "post_attn_${layer_index}" + ] + }, + { + "name": "h_${layer_index+1}", + "type": "residual", + "inputs": [ + "h_a_${layer_index}", + "post_mlp_${layer_index}" + ] + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight", + "input_space": "h_${num_layers}" + }, + { + "name": "lm_head.weight", + "input_space": "h_${num_layers}", + "is_embed": true + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/phi-1.json b/mergekit/mergekit/_data/architectures/phi-1.json new file mode 100644 index 0000000000000000000000000000000000000000..612bd7dc4fd70a383d689c1f8dc5a3ffa3bcca8b --- /dev/null +++ b/mergekit/mergekit/_data/architectures/phi-1.json @@ -0,0 +1,66 @@ +{ + "model_type": "mixformer-sequential", + "architectures": [ + "MixFormerSequentialForCausalLM" + ], + "pre_weights": [ + { + "name": "layers.0.wte.weight", + "is_embed": true + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "layers.${layer_index}.ln.bias" + }, + { + "name": "layers.${layer_index}.ln.weight" + }, + { + "name": "layers.${layer_index}.mixer.Wqkv.bias" + }, + { + "name": "layers.${layer_index}.mixer.Wqkv.weight" + }, + { + "name": "layers.${layer_index}.mixer.out_proj.bias" + }, + { + "name": "layers.${layer_index}.mixer.out_proj.weight" + }, + { + "name": "layers.${layer_index}.mixer.rotary_emb.inv_freq" + }, + { + "name": "layers.${layer_index}.mlp.fc1.bias" + }, + { + "name": "layers.${layer_index}.mlp.fc1.weight" + }, + { + "name": "layers.${layer_index}.mlp.fc2.bias" + }, + { + "name": "layers.${layer_index}.mlp.fc2.weight" + } + ] + }, 
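Every definition above declares its per-layer tensors once in `layer_templates`, with `${layer_index}` placeholders and a `num_layers_config_key` naming the config field that supplies the layer count. The standalone sketch below shows what that substitution implies for a two-layer GPT-2-style model; the `expand_layer_names` helper and the inline JSON fragment are assumptions for illustration, not mergekit code.

```python
# Standalone sketch (assumed example, not mergekit's implementation): expand
# the ${layer_index} placeholders from a layer_templates block into concrete
# tensor names for a model with a given number of layers.
import json
import string
from typing import List

# Inline fragment in the same shape as the GPT-2 definition above.
ARCH_DEF = json.loads(
    """
    {
      "num_layers_config_key": "n_layer",
      "layer_templates": {
        "weights": [
          {"name": "h.${layer_index}.attn.c_attn.weight"},
          {"name": "h.${layer_index}.mlp.c_fc.weight"}
        ]
      }
    }
    """
)


def expand_layer_names(arch_def: dict, num_layers: int) -> List[str]:
    """Substitute ${layer_index} for each layer, in declaration order."""
    names = []
    for layer_index in range(num_layers):
        for weight in arch_def["layer_templates"]["weights"]:
            template = string.Template(weight["name"])
            names.append(template.substitute(layer_index=layer_index))
    return names


print(expand_layer_names(ARCH_DEF, num_layers=2))
# ['h.0.attn.c_attn.weight', 'h.0.mlp.c_fc.weight',
#  'h.1.attn.c_attn.weight', 'h.1.mlp.c_fc.weight']
```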
+ "post_weights": [ + { + "name": "layers.${num_layers}.linear.bias", + "is_embed": true + }, + { + "name": "layers.${num_layers}.linear.weight", + "is_embed": true + }, + { + "name": "layers.${num_layers}.ln.bias" + }, + { + "name": "layers.${num_layers}.ln.weight" + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/phi2-old.json b/mergekit/mergekit/_data/architectures/phi2-old.json new file mode 100644 index 0000000000000000000000000000000000000000..a735ecc47e6e1ced7a641161e13ab04c294fc8f2 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/phi2-old.json @@ -0,0 +1,62 @@ +{ + "model_type": "phi-msft", + "architectures": [ + "PhiForCausalLM" + ], + "pre_weights": [ + { + "name": "transformer.embd.wte.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "lm_head.linear.bias" + }, + { + "name": "lm_head.linear.weight", + "is_embed": true + }, + { + "name": "lm_head.ln.bias" + }, + { + "name": "lm_head.ln.weight" + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.ln.bias" + }, + { + "name": "transformer.h.${layer_index}.ln.weight" + }, + { + "name": "transformer.h.${layer_index}.mixer.out_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.mixer.out_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.mixer.Wqkv.bias" + }, + { + "name": "transformer.h.${layer_index}.mixer.Wqkv.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.fc1.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.fc1.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.fc2.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.fc2.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/phi2.json b/mergekit/mergekit/_data/architectures/phi2.json new file mode 100644 index 0000000000000000000000000000000000000000..f509e51cf50202d4e2ef5263dd40f6e6b7527674 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/phi2.json @@ -0,0 +1,74 @@ +{ + "model_type": "phi", + "architectures": [ + "PhiForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "lm_head.bias" + }, + { + "name": "lm_head.weight", + "is_embed": true + }, + { + "name": "model.final_layernorm.bias" + }, + { + "name": "model.final_layernorm.weight" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.dense.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.dense.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.fc1.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.fc1.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.fc2.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.fc2.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/phi3-small.json 
b/mergekit/mergekit/_data/architectures/phi3-small.json new file mode 100644 index 0000000000000000000000000000000000000000..f27dfac4d531e9ee70d7a4885419d14d1fb21947 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/phi3-small.json @@ -0,0 +1,69 @@ +{ + "model_type": "phi3small", + "architectures": [ + "Phi3SmallForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] + }, + { + "name": "model.final_layernorm.weight" + }, + { + "name": "model.final_layernorm.bias" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.input_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.dense.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.dense.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.query_key_value.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.query_key_value.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/phi3.json b/mergekit/mergekit/_data/architectures/phi3.json new file mode 100644 index 0000000000000000000000000000000000000000..6c606b842c980c3060317a55db6dc5ab26d7dc52 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/phi3.json @@ -0,0 +1,44 @@ +{ + "model_type": "phi3", + "architectures": [ + "Phi3ForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "lm_head.weight", + "is_embed": true + }, + { + "name": "model.norm.weight" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.qkv_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/qwen.json b/mergekit/mergekit/_data/architectures/qwen.json new file mode 100644 index 0000000000000000000000000000000000000000..33bf88dcc5be5fb2716fc3fdfbc63b7b92892cad --- /dev/null +++ b/mergekit/mergekit/_data/architectures/qwen.json @@ -0,0 +1,50 @@ +{ + "model_type": "qwen", + "architectures": [ + "QWenLMHeadModel" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "transformer.ln_f.weight" + }, + { + "name": "lm_head.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.attn.c_attn.bias" + }, + { + "name": 
"transformer.h.${layer_index}.attn.c_attn.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_1.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.w1.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.w2.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/qwen2.json b/mergekit/mergekit/_data/architectures/qwen2.json new file mode 100644 index 0000000000000000000000000000000000000000..c71315231fa881c6393832884a53e8f29b1eee09 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/qwen2.json @@ -0,0 +1,66 @@ +{ + "model_type": "qwen2", + "architectures": [ + "Qwen2ForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/roberta-masked-lm.json b/mergekit/mergekit/_data/architectures/roberta-masked-lm.json new file mode 100644 index 0000000000000000000000000000000000000000..1aae76a1bcefc0d702f63f6539d9072ce8a93404 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/roberta-masked-lm.json @@ -0,0 +1,107 @@ +{ + "model_type": "roberta", + "architectures": [ + "RobertaForMaskedLM" + ], + "pre_weights": [ + { + "name": "roberta.embeddings.position_embeddings.weight" + }, + { + "name": "roberta.embeddings.word_embeddings.weight", + "is_embed": true + }, + { + "name": "roberta.embeddings.token_type_embeddings.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.bias" + }, + { + "name": "roberta.embeddings.position_ids", + "optional": true, + "force_dtype": "int64" + } + ], + "post_weights": [ + { + "name": "lm_head.bias" + }, + { + "name": "lm_head.dense.weight" + }, + { + "name": "lm_head.dense.bias" + }, + { + "name": "lm_head.layer_norm.weight" + }, + { + "name": "lm_head.layer_norm.bias" + }, + { + "name": "lm_head.decoder.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "roberta.embeddings.word_embeddings.weight" + ] + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.weight" + }, + { + "name": 
"roberta.encoder.layer.${layer_index}.attention.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/roberta-sequence-classification.json b/mergekit/mergekit/_data/architectures/roberta-sequence-classification.json new file mode 100644 index 0000000000000000000000000000000000000000..6e137e6d81df7ea4008043ea04c886a18ead80a0 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/roberta-sequence-classification.json @@ -0,0 +1,95 @@ +{ + "model_type": "roberta", + "architectures": [ + "RobertaForSequenceClassification" + ], + "pre_weights": [ + { + "name": "roberta.embeddings.position_embeddings.weight" + }, + { + "name": "roberta.embeddings.word_embeddings.weight" + }, + { + "name": "roberta.embeddings.token_type_embeddings.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.bias" + }, + { + "name": "roberta.embeddings.position_ids", + "optional": true, + "force_dtype": "int64" + } + ], + "post_weights": [ + { + "name": "classifier.dense.weight" + }, + { + "name": "classifier.dense.bias" + }, + { + "name": "classifier.out_proj.weight" + }, + { + "name": "classifier.out_proj.bias" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.bias" + }, + { + "name": 
"roberta.encoder.layer.${layer_index}.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/roberta-token-classification.json b/mergekit/mergekit/_data/architectures/roberta-token-classification.json new file mode 100644 index 0000000000000000000000000000000000000000..2b9cae9f6d02b92882c0b9a86909e96745876b52 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/roberta-token-classification.json @@ -0,0 +1,89 @@ +{ + "model_type": "roberta", + "architectures": [ + "RobertaForTokenClassification" + ], + "pre_weights": [ + { + "name": "roberta.embeddings.position_embeddings.weight" + }, + { + "name": "roberta.embeddings.word_embeddings.weight" + }, + { + "name": "roberta.embeddings.token_type_embeddings.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.bias" + }, + { + "name": "roberta.embeddings.position_ids", + "optional": true, + "force_dtype": "int64" + } + ], + "post_weights": [ + { + "name": "classifier.weight" + }, + { + "name": "classifier.bias" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/roberta.json b/mergekit/mergekit/_data/architectures/roberta.json new file mode 100644 index 0000000000000000000000000000000000000000..cab186c0dee7d430da105ce5630373339e54876a --- /dev/null +++ b/mergekit/mergekit/_data/architectures/roberta.json @@ -0,0 +1,89 @@ +{ + "model_type": "roberta", + "architectures": [ + "RobertaModel" + ], + "pre_weights": [ + { + "name": "roberta.embeddings.position_embeddings.weight" + }, + { + "name": "roberta.embeddings.word_embeddings.weight" + }, + { + "name": "roberta.embeddings.token_type_embeddings.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.weight" + }, + { + "name": "roberta.embeddings.LayerNorm.bias" + }, + { + "name": "roberta.embeddings.position_ids", + "optional": true, + 
"force_dtype": "int64" + } + ], + "post_weights": [ + { + "name": "pooler.dense.weight" + }, + { + "name": "pooler.dense.bias" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.output.LayerNorm.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.query.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.key.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.attention.self.value.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.intermediate.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.dense.bias" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.weight" + }, + { + "name": "roberta.encoder.layer.${layer_index}.output.LayerNorm.bias" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/solar.json b/mergekit/mergekit/_data/architectures/solar.json new file mode 100644 index 0000000000000000000000000000000000000000..78fd5998f412c9383f131edcb19c20b943142478 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/solar.json @@ -0,0 +1,82 @@ +{ + "model_type": "solar", + "architectures": [ + "SolarForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true, + "output_space": "running_residual" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight", + "input_space": "running_residual" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight", + "input_space": "running_residual", + "output_space": "attn_qk_${layer_index}", + "head_split": "output", + "is_kq": true + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight", + "input_space": "running_residual", + "output_space": "attn_v_${layer_index}", + "head_split": "output" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "running_residual", + "head_split": "input" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight", + "input_space": "running_residual" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight", + "input_space": "running_residual", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight", + "input_space": "up_${layer_index}", + 
"output_space": "running_residual" + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight", + "input_space": "running_residual" + }, + { + "name": "lm_head.weight", + "input_space": "running_residual", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.lm_head.weight" + ] + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/stablelm.json b/mergekit/mergekit/_data/architectures/stablelm.json new file mode 100644 index 0000000000000000000000000000000000000000..827d523a4e7c137b52606d92fd144f691c2222e4 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/stablelm.json @@ -0,0 +1,98 @@ +{ + "model_type": "stablelm_epoch", + "architectures": [ + "StableLMEpochForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true, + "output_space": "h_0" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight", + "input_space": "h_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.input_layernorm.bias", + "input_space": "h_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_qk_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_qk_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight", + "input_space": "h_${layer_index}", + "output_space": "attn_v_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight", + "input_space": "attn_v_${layer_index}", + "output_space": "post_attn_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight", + "input_space": "h_a_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.bias", + "input_space": "h_a_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight", + "input_space": "h_a_${layer_index}", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight", + "input_space": "h_a_${layer_index}", + "output_space": "up_${layer_index}" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight", + "input_space": "up_${layer_index}", + "output_space": "post_mlp_${layer_index}" + } + ], + "procedural_spaces": [ + { + "name": "h_a_${layer_index}", + "type": "residual", + "inputs": [ + "h_${layer_index}", + "post_attn_${layer_index}" + ] + }, + { + "name": "h_${layer_index+1}", + "type": "residual", + "inputs": [ + "h_a_${layer_index}", + "post_mlp_${layer_index}" + ] + } + ] + }, + "post_weights": [ + { + "name": "model.norm.weight", + "input_space": "h_${num_layers}" + }, + { + "name": "lm_head.weight", + "input_space": "h_${num_layers}", + "is_embed": true + } + ] +} diff --git a/mergekit/mergekit/_data/architectures/stablelm2.json b/mergekit/mergekit/_data/architectures/stablelm2.json new file mode 100644 index 0000000000000000000000000000000000000000..d2d2f461a60a8c0c4c2733c5dfc1bdc2750cec95 --- /dev/null +++ b/mergekit/mergekit/_data/architectures/stablelm2.json @@ -0,0 +1,74 @@ +{ + "model_type": "stablelm", + "architectures": [ + "StableLmForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "model.norm.weight" + }, + { + "name": "model.norm.bias" + }, + { + "name": 
"lm_head.weight", + "is_embed": true + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.input_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.down_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.gate_proj.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.up_proj.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.bias", + "optional": true + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.bias", + "optional": true + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.bias", + "optional": true + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/architectures/starcoder2.json b/mergekit/mergekit/_data/architectures/starcoder2.json new file mode 100644 index 0000000000000000000000000000000000000000..c2266899965d2a5947bd945b99a595aa4379961a --- /dev/null +++ b/mergekit/mergekit/_data/architectures/starcoder2.json @@ -0,0 +1,81 @@ +{ + "model_type": "starcoder2", + "architectures": [ + "Starcoder2ForCausalLM" + ], + "pre_weights": [ + { + "name": "model.embed_tokens.weight", + "is_embed": true + } + ], + "post_weights": [ + { + "name": "lm_head.weight", + "is_embed": true, + "optional": true, + "tied_names": [ + "model.embed_tokens.weight" + ] + }, + { + "name": "model.norm.bias" + }, + { + "name": "model.norm.weight" + } + ], + "num_layers_config_key": "num_hidden_layers", + "layer_templates": { + "weights": [ + { + "name": "model.layers.${layer_index}.input_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.input_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.q_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.k_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.v_proj.weight" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.bias" + }, + { + "name": "model.layers.${layer_index}.self_attn.o_proj.weight" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.bias" + }, + { + "name": "model.layers.${layer_index}.post_attention_layernorm.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.c_fc.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.c_fc.weight" + }, + { + "name": "model.layers.${layer_index}.mlp.c_proj.bias" + }, + { + "name": "model.layers.${layer_index}.mlp.c_proj.weight" + } + ] + } +} diff --git a/mergekit/mergekit/_data/chat_templates/__init__.py b/mergekit/mergekit/_data/chat_templates/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mergekit/mergekit/_data/chat_templates/__pycache__/__init__.cpython-310.pyc 
b/mergekit/mergekit/_data/chat_templates/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e2f39790e5b1d3a4623369c2686bccd935766f8 Binary files /dev/null and b/mergekit/mergekit/_data/chat_templates/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/_data/chat_templates/alpaca.jinja b/mergekit/mergekit/_data/chat_templates/alpaca.jinja new file mode 100644 index 0000000000000000000000000000000000000000..45837b0af4ddd9d61b6b4967b9918ad39e254c0f --- /dev/null +++ b/mergekit/mergekit/_data/chat_templates/alpaca.jinja @@ -0,0 +1,29 @@ +{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} + +{% for message in messages %} +{% if message['role'] == 'user' %} +### Instruction: +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% elif message['role'] == 'assistant' %} +### Response: +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% elif message['role'] == 'user_context' %} +### Input: +{{ message['content']|trim -}} +{% if not loop.last %} + + +{% endif %} +{% endif %} +{% endfor %} +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} +### Response: +{% endif %} diff --git a/mergekit/mergekit/_data/chat_templates/chatml.jinja b/mergekit/mergekit/_data/chat_templates/chatml.jinja new file mode 100644 index 0000000000000000000000000000000000000000..4f3444551538221d95146b63eade319a807aea0e --- /dev/null +++ b/mergekit/mergekit/_data/chat_templates/chatml.jinja @@ -0,0 +1,2 @@ +{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %} +{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %} diff --git a/mergekit/mergekit/_data/chat_templates/exaone.jinja b/mergekit/mergekit/_data/chat_templates/exaone.jinja new file mode 100644 index 0000000000000000000000000000000000000000..3a4d07ae35547d7a350f5b11e3db6248319763b3 --- /dev/null +++ b/mergekit/mergekit/_data/chat_templates/exaone.jinja @@ -0,0 +1,14 @@ +{% for message in messages %} + {% if loop.first and message['role'] != 'system' %} + {{ '[|system|][|endofturn|]\n' }} + {% endif %} + {{ '[|' + message['role'] + '|]' + message['content'] }} + {% if message['role'] == 'user' %} + {{ '\n' }} + {% else %} + {{ '[|endofturn|]\n' }} + {% endif %} +{% endfor %} +{% if add_generation_prompt %} + {{ '[|assistant|]' }} +{% endif %} diff --git a/mergekit/mergekit/_data/chat_templates/llama3.jinja b/mergekit/mergekit/_data/chat_templates/llama3.jinja new file mode 100644 index 0000000000000000000000000000000000000000..0fcec78aaef89ed38b6a242c7405180036efc138 --- /dev/null +++ b/mergekit/mergekit/_data/chat_templates/llama3.jinja @@ -0,0 +1,7 @@ +{% set loop_messages = messages %} +{% for message in loop_messages %} +{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %} +{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %} +{{ content }} +{% endfor %} +{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} diff --git a/mergekit/mergekit/_data/chat_templates/mistral.jinja b/mergekit/mergekit/_data/chat_templates/mistral.jinja new file mode 100644 index 0000000000000000000000000000000000000000..40b37ad7f90d4e4131afd82a9632c067bdd3e01d --- /dev/null +++ 
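The `.jinja` files in `_data/chat_templates/` are plain Jinja2 chat templates. As a rough sketch of how one of them turns a message list into prompt text, the two-line ChatML template is inlined below with a made-up conversation (requires the `jinja2` package):

```python
from jinja2 import Template

# The ChatML template from chatml.jinja above, inlined for illustration.
CHATML = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello."},
]
print(Template(CHATML).render(messages=messages, add_generation_prompt=True))
```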
b/mergekit/mergekit/_data/chat_templates/mistral.jinja @@ -0,0 +1,24 @@ +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content'] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif %} + {%- if message['role'] == 'user' %} + {%- if loop.first and system_message is defined %} + {{- ' [INST] ' + system_message + '\n\n' + message['content'] + ' [/INST]' }} + {%- else %} + {{- ' [INST] ' + message['content'] + ' [/INST]' }} + {%- endif %} + {%- elif message['role'] == 'assistant' %} + {{- ' ' + message['content'] + eos_token}} + {%- else %} + {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }} + {%- endif %} +{%- endfor %} diff --git a/mergekit/mergekit/architecture.py b/mergekit/mergekit/architecture.py new file mode 100644 index 0000000000000000000000000000000000000000..40872160378666325c5f275dceec1a3c181a41b2 --- /dev/null +++ b/mergekit/mergekit/architecture.py @@ -0,0 +1,384 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import importlib.resources +import string +from abc import ABC, abstractmethod +from typing import ClassVar, Dict, List, Optional, Tuple, Union + +from pydantic import BaseModel, Field +from transformers import PretrainedConfig +from typing_extensions import Literal + +import mergekit._data.architectures + + +class WeightInfo(BaseModel, frozen=True): + """Information about an individual weight tensor in a model. + + Attributes: + name (str): + The name of the tensor representing the weight. + is_embed (bool): + Indicates whether the weight is for an embedding or language model head. + input_space (Optional[str]): + The name of the input space associated with the weight, if applicable. + output_space (Optional[str]): + The name of the output space associated with the weight, if applicable. + optional (bool): + Indicates whether the weight can be omitted from a model. + aliases (Optional[List[str]]): + List of alternative names for the weight, if applicable. + tied_names (Optional[List[str]]): + List of names for weights that are tied to this weight, if applicable. + force_dtype (Optional[str]): + Mandatory dtype for the weight, if applicable. 
+ """ + + name: str + is_embed: bool = False + input_space: Optional[str] = None + output_space: Optional[str] = None + optional: bool = False + tied: bool = False + aliases: Optional[Tuple[str, ...]] = None + tied_names: Optional[Tuple[str, ...]] = None + force_dtype: Optional[str] = None + head_split: Literal[None, "input", "output"] = None + is_kq: Optional[bool] = False + + +class ProceduralSpaceInfo(BaseModel, frozen=True): + """Defines a procedural space computed from one or more other spaces. + + Currently only supports residual connections. + + Attributes: + name (str): The name of the space defined. + type (str): The type of procedural space. + inputs (List[str]): List of names of spaces used to define this space.""" + + name: str + type: Literal["residual"] + inputs: List[str] + + +class ArchitectureInfo(ABC): + @abstractmethod + def name(self) -> str: + """Return the name of the architecture.""" + ... + + @abstractmethod + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + """Return a list of all weights preceding the first layer.""" + ... + + @abstractmethod + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + """Return a list of all weights following the final layer.""" + ... + + @abstractmethod + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + """Return a list of all weights associated with a given layer.""" + ... + + @abstractmethod + def sliceable(self) -> bool: + """ + Return True if the layers of this architecture can be meaningfully sliced. + """ + ... + + def num_layers_config_key(self) -> str: + """Key in config that represents number of layers""" + return "num_hidden_layers" + + def num_layers(self, config: PretrainedConfig) -> int: + """Return the number of layers in a model.""" + return getattr(config, self.num_layers_config_key()) + + def all_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + """Return all weights associated with a model.""" + num_layers = self.num_layers(config) + res = list(self.pre_weights(config)) + for layer_idx in range(num_layers): + res.extend(self.layer_weights(layer_idx, config)) + res.extend(self.post_weights(config)) + return res + + def procedural_spaces(self, config: PretrainedConfig) -> List[ProceduralSpaceInfo]: + """Return a list of all procedurally defined spaces in a model.""" + return [] + + def has_defined_spaces(self) -> bool: + """ + Return True if this architecture defines space information needed for + matching-based merge methods. 
+ """ + return False + + +class ConfiguredArchitectureInfo(BaseModel, frozen=True, arbitrary_types_allowed=True): + info: ArchitectureInfo + config: PretrainedConfig + + def name(self) -> str: + return self.info.name() + + def num_layers(self) -> int: + return self.info.num_layers(self.config) + + def pre_weights(self) -> List[WeightInfo]: + return self.info.pre_weights(self.config) + + def post_weights(self) -> List[WeightInfo]: + return self.info.post_weights(self.config) + + def layer_weights(self, index: int) -> List[WeightInfo]: + return self.info.layer_weights(index, self.config) + + def procedural_spaces(self) -> List[ProceduralSpaceInfo]: + return self.info.procedural_spaces(self.config) + + def all_weights(self) -> List[WeightInfo]: + return self.info.all_weights(self.config) + + +class JSONLayerTemplates(BaseModel, frozen=True): + weights: List[WeightInfo] + procedural_spaces: Optional[List[ProceduralSpaceInfo]] = None + + +class JSONArchitectureDefinition(BaseModel, frozen=True): + expected_model_type: str = Field(alias="model_type") + architectures: List[str] + pre_weights: List[WeightInfo] + layer_templates: JSONLayerTemplates + post_weights: List[WeightInfo] + procedural_spaces: Optional[List[ProceduralSpaceInfo]] = None + num_layers_config_key: Optional[str] = None + + +class TemplateWithArithmetic(string.Template): + idpattern = r"(?a:[_a-z][_a-z0-9]*([+-]1)?)" + + +def _template_substitution( + template: str, num_layers: int, layer_idx: Optional[int] = None +) -> str: + if "{" not in template: + return template + + substitutions = { + "num_layers": num_layers, + "num_layers+1": num_layers + 1, + "num_layers-1": num_layers - 1, + } + + if layer_idx is not None: + substitutions.update( + { + "layer_index": layer_idx, + "layer_index+1": layer_idx + 1, + "layer_index-1": layer_idx - 1, + } + ) + + return TemplateWithArithmetic(template).substitute(substitutions) + + +class JsonArchitectureInfo(ArchitectureInfo, BaseModel, frozen=True): + definition: JSONArchitectureDefinition + + def _substitute( + self, + item: Union[WeightInfo, ProceduralSpaceInfo], + config: PretrainedConfig, + layer_idx: Optional[int] = None, + ) -> Union[WeightInfo, ProceduralSpaceInfo]: + num_layers = self.num_layers(config) + + obj_dict = item.model_dump(mode="json", exclude_unset=True) + for key in obj_dict: + if isinstance(obj_dict[key], str): + obj_dict[key] = _template_substitution( + obj_dict[key], num_layers, layer_idx + ) + elif isinstance(obj_dict[key], list): + obj_dict[key] = [ + ( + _template_substitution(s, num_layers, layer_idx) + if isinstance(s, str) + else s + ) + for s in obj_dict[key] + ] + return type(item).model_validate(obj_dict) + + def name(self) -> str: + return self.definition.expected_model_type + + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [ + self._substitute(wi, config=config) for wi in self.definition.pre_weights + ] + + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + return [ + self._substitute(wi, config=config, layer_idx=index) + for wi in self.definition.layer_templates.weights + ] + + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [ + self._substitute(wi, config=config) for wi in self.definition.post_weights + ] + + def sliceable(self) -> bool: + return True + + def procedural_spaces(self, config: PretrainedConfig) -> List[ProceduralSpaceInfo]: + res = [] + for s in self.definition.procedural_spaces or []: + res.append(self._substitute(s, 
config=config)) + for idx in range(self.num_layers(config)): + for s in self.definition.layer_templates.procedural_spaces or []: + res.append(self._substitute(s, config=config, layer_idx=idx)) + return res + + def has_defined_spaces(self) -> bool: + if ( + self.definition.procedural_spaces + or self.definition.layer_templates.procedural_spaces + ): + return True + for wi in ( + self.definition.layer_templates.weights + + self.definition.pre_weights + + self.definition.post_weights + ): + if wi.input_space or wi.output_space: + return True + return False + + def num_layers_config_key(self) -> str: + return self.definition.num_layers_config_key + + +class MixtralTensorNames(ArchitectureInfo, BaseModel): + ARCHITECTURE_NAME: ClassVar[str] = "MixtralForCausalLM" + num_local_experts: int + + def name(self) -> str: + return "mixtral" + + @classmethod + def from_config(cls, config: PretrainedConfig): + return MixtralTensorNames(num_local_experts=config.num_local_experts) + + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return MISTRAL_INFO.pre_weights(config) + + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return MISTRAL_INFO.post_weights(config) + + def num_layers_config_key(self) -> str: + return MISTRAL_INFO.num_layers_config_key() + + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + num_experts = self.num_local_experts + prefix = f"model.layers.{index}" + tensor_names = [] + for expert_idx in range(num_experts): + for param in ("w1", "w2", "w3"): + tensor_names.append( + prefix + f".block_sparse_moe.experts.{expert_idx}.{param}.weight" + ) + tensor_names.append(prefix + ".block_sparse_moe.gate.weight") + res = [] + for name in tensor_names: + res.append(WeightInfo(name=name)) + for weight_info in MISTRAL_INFO.layer_weights(index, config): + if ".mlp." 
in weight_info.name: + continue + res.append(weight_info) + return res + + def sliceable(self) -> bool: + return True + + def has_defined_spaces(self) -> bool: + return False + + +def _load_json_arch(name: str) -> JsonArchitectureInfo: + text = importlib.resources.read_text(mergekit._data.architectures, name) + return JsonArchitectureInfo( + definition=JSONArchitectureDefinition.model_validate_json(text) + ) + + +def _load_all_architectures() -> ( + Tuple[List[JsonArchitectureInfo], Dict[str, List[JsonArchitectureInfo]]] +): + architectures: List[JsonArchitectureInfo] = [] + for f in importlib.resources.contents(mergekit._data.architectures): + if f.lower().endswith(".json"): + architectures.append(_load_json_arch(f)) + + name_to_arch: Dict[str, List[JsonArchitectureInfo]] = {} + for arch_info in architectures: + for name in arch_info.definition.architectures: + name_to_arch[name] = name_to_arch.get(name, []) + name_to_arch[name].append(arch_info) + return architectures, name_to_arch + + +JSON_ARCHITECTURES, NAME_TO_ARCH = _load_all_architectures() +MISTRAL_INFO = _load_json_arch("mistral.json") +QWEN2_INFO = _load_json_arch("qwen2.json") + + +def get_architecture_info(config: PretrainedConfig) -> ArchitectureInfo: + if len(config.architectures) != 1: + raise RuntimeError("More than one architecture in config?") + + arch_name = config.architectures[0] + + if arch_name == MixtralTensorNames.ARCHITECTURE_NAME: + return MixtralTensorNames.from_config(config) + + if arch_name not in NAME_TO_ARCH: + raise RuntimeError(f"Unsupported architecture {arch_name}") + + candidates = list(NAME_TO_ARCH[arch_name]) + if len(candidates) == 1: + return candidates[0] + + for c in candidates: + if c.definition.expected_model_type == config.model_type: + return c + + raise RuntimeError( + f"Unsupported model_type {config.model_type} for architecture {arch_name}" + ) diff --git a/mergekit/mergekit/card.py b/mergekit/mergekit/card.py new file mode 100644 index 0000000000000000000000000000000000000000..5adb30aa0ecfd4a70587a9fa5299923bc471632a --- /dev/null +++ b/mergekit/mergekit/card.py @@ -0,0 +1,250 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +import os +from typing import Generator, List, Optional + +import huggingface_hub +import yaml +from huggingface_hub.utils import HFValidationError +from yaml.nodes import SequenceNode as SequenceNode + +from mergekit.config import MergeConfiguration, ModelReference + +CARD_TEMPLATE = """--- +{metadata} +--- +# {name} + +This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit). + +## Merge Details +### Merge Method + +This model was merged using the {merge_method} merge method{base_text}. 
+ +### Models Merged + +The following models were included in the merge: +{model_list} + +### Configuration + +The following YAML configuration was used to produce this model: + +```yaml +{config_yaml} +``` +""" + +CARD_TEMPLATE_LORA = """--- +{metadata} +--- +# {name} + +This is a LoRA extracted from a language model. It was extracted using [mergekit](https://github.com/arcee-ai/mergekit). + +## LoRA Details + +{details} + +### Parameters + +The following command was used to extract this LoRA adapter: + +```sh +{invocation} +``` +""" + + +def is_hf(path: str) -> bool: + """ + Determines if the given path is a Hugging Face model repository. + + Args: + path: A string path to check. + """ + if path[0] in "/~" or path.count("/") > 1: + return False # definitely a local path + if not os.path.exists(path): + return True # If path doesn't exist locally, it must be a HF repo + try: + return huggingface_hub.repo_exists(path, repo_type="model", token=False) + except HFValidationError: + return False + + +def extract_hf_paths(models: List[ModelReference]) -> Generator[str, None, None]: + """ + Yields all valid Hugging Face paths from a list of ModelReference objects. + + Args: + models: A list of ModelReference objects. + """ + for model in models: + if is_hf(model.model.path): + yield model.model.path + + if model.lora and is_hf(model.lora.path): + yield model.lora.path + + +def method_md(merge_method: str) -> str: + """ + Returns a markdown string for the given merge method. + + Args: + merge_method: A string indicating the merge method used. + """ + methods = { + "linear": "[linear](https://arxiv.org/abs/2203.05482)", + "ties": "[TIES](https://arxiv.org/abs/2306.01708)", + "slerp": "SLERP", + "task_arithmetic": "[task arithmetic](https://arxiv.org/abs/2212.04089)", + "dare_ties": "[DARE](https://arxiv.org/abs/2311.03099) [TIES](https://arxiv.org/abs/2306.01708)", + "dare_linear": "linear [DARE](https://arxiv.org/abs/2311.03099)", + "model_stock": "[Model Stock](https://arxiv.org/abs/2403.19522)", + "della": "[DELLA](https://arxiv.org/abs/2406.11617)", + "della_linear": "linear [DELLA](https://arxiv.org/abs/2406.11617)", + "nuslerp": "NuSLERP", + } + return methods.get(merge_method, merge_method) + + +def maybe_link_hf(path: str) -> str: + """ + Convert a path to a clickable link if it's a Hugging Face model path. + + Args: + path: A string path to possibly convert to a link. + """ + if is_hf(path): + return f"[{path}](https://huggingface.co/{path})" + return path + + +def modelref_md(model: ModelReference) -> str: + """ + Generates markdown description for a ModelReference object. + + Args: + model: A ModelReference object. + + Returns: + A markdown formatted string describing the model reference. + """ + text = maybe_link_hf(model.model.path) + if model.lora: + text += " + " + maybe_link_hf(model.lora.path) + return text + + +def generate_card( + config: MergeConfiguration, + config_yaml: str, + name: Optional[str] = None, +) -> str: + """ + Generates a markdown card for a merged model configuration. + + Args: + config: A MergeConfiguration object. + config_yaml: YAML source text of the config. + name: An optional name for the model. 
+ """ + if not name: + name = "Untitled Model (1)" + + hf_bases = list(extract_hf_paths(config.referenced_models())) + tags = ["mergekit", "merge"] + + actual_base = config.base_model + if config.merge_method == "slerp": + # curse my past self + actual_base = None + + base_text = "" + if actual_base: + base_text = f" using {modelref_md(actual_base)} as a base" + + model_bullets = [] + for model in config.referenced_models(): + if model == actual_base: + # actual_base is mentioned in base_text - don't include in list + continue + + model_bullets.append("* " + modelref_md(model)) + + return CARD_TEMPLATE.format( + metadata=yaml.dump( + {"base_model": hf_bases, "tags": tags, "library_name": "transformers"} + ), + model_list="\n".join(model_bullets), + base_text=base_text, + merge_method=method_md(config.merge_method), + name=name, + config_yaml=config_yaml, + ) + + +def generate_card_lora( + base_model_ref: ModelReference, + finetuned_model_ref: ModelReference, + invocation: str, + extended: bool, + vocab_size: int, + name: str, +) -> str: + """ + Generates a markdown card for a merged model configuration. + + Args: + config: A MergeConfiguration object. + config_yaml: YAML source text of the config. + name: An optional name for the model. + """ + if not name: + name = "Untitled LoRA Model (1)" + + hf_bases = list(extract_hf_paths([base_model_ref, finetuned_model_ref])) + tags = ["mergekit", "peft"] + + finetuned_ref_md = modelref_md(finetuned_model_ref) + basemodel_ref_md = modelref_md(base_model_ref) + + details = f"This LoRA adapter was extracted from {finetuned_ref_md} and uses {basemodel_ref_md} as a base." + + if extended: + details += f"\n\n> [!WARNING]\n> This LoRA adapter has an extended vocabulary. Make sure to call `model.resize_token_embeddings({vocab_size})` before applying the adapter to {basemodel_ref_md}" + + if os.path.isdir(base_model_ref.model.path) or os.path.isdir( + finetuned_model_ref.model.path + ): + logging.warning( + "Some model identifiers you provided are directory paths and will appear as such in the model card, you may want to edit it." + ) + + return CARD_TEMPLATE_LORA.format( + metadata=yaml.dump( + {"base_model": hf_bases, "tags": tags, "library_name": "transformers"} + ), + name=name, + details=details, + base_model=base_model_ref.model.path, + finetuned_model=finetuned_model_ref.model.path, + invocation=invocation, + ) diff --git a/mergekit/mergekit/common.py b/mergekit/mergekit/common.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d4eac5ef2395bf26c853b7079ee384367eb7ad --- /dev/null +++ b/mergekit/mergekit/common.py @@ -0,0 +1,290 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +import binascii +import logging +import os +import os.path +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterator, + Mapping, + Optional, + Tuple, + Union, + get_args, +) + +import huggingface_hub +import immutables +import peft +import torch +import transformers +from pydantic import BaseModel, model_serializer, model_validator +from pydantic_core import core_schema +from transformers import AutoConfig, PretrainedConfig +from typing_extensions import TypeVar + +from mergekit.io import LazyTensorLoader, ShardedTensorIndex + + +class ModelPath(BaseModel, frozen=True): + path: str + revision: Optional[str] = None + + @model_validator(mode="before") + def validate_string(cls, value): + if isinstance(value, str): + at_ct = value.count("@") + if at_ct > 1: + raise RuntimeError(f"Invalid model path - multiple @: {value}") + elif at_ct == 1: + path, rev = value.split("@") + return {"path": path, "revision": rev} + else: + return {"path": value} + return value + + def __str__(self): + if self.revision: + return f"{self.path}@{self.revision}" + return self.path + + def _unique_id(self): + return ( + os.path.basename(self.path) + + "_" + + str(binascii.crc32(self.__str__().encode())) + ) + + +class ModelReference(BaseModel, frozen=True): + """A reference to a language model. + + Can be a hf hub path (username/repo), or local. Optionally includes a LoRA.""" + + model: ModelPath + lora: Optional[ModelPath] = None + override_architecture: Optional[str] = None + + def merged( + self, cache_dir: Optional[str] = None, trust_remote_code: bool = False + ) -> "ModelReference": + """Merge the LoRA if applicable and return a reference to the result.""" + if not self.lora: + return self + + if not cache_dir: + raise RuntimeError("Need to specify cache dir to merge adapters") + + out_path = os.path.join( + cache_dir, + self.model._unique_id() + "_" + self.lora._unique_id(), + ) + + if not os.path.exists(out_path): + os.makedirs(out_path, exist_ok=True) + + config = self.config(trust_remote_code) + auto_cls = _get_auto_cls(config.architectures[0]) + + logging.info(f"Loading {self.model} for merge...") + model = auto_cls.from_pretrained( + self.model.path, + revision=self.model.revision, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=trust_remote_code, + ) + model = peft.PeftModel.from_pretrained( + model, self.lora.path, revision=self.lora.revision, is_trainable=False + ) + logging.info(f"Merging {self.lora} into {self.model}") + model = model.merge_and_unload() + model.save_pretrained(out_path, safe_serialization=True) + del model + + return ModelReference(model=out_path) + + def config(self, trust_remote_code: bool = False) -> PretrainedConfig: + res = AutoConfig.from_pretrained( + self.model.path, + revision=self.model.revision, + trust_remote_code=trust_remote_code, + ) + if self.override_architecture: + res.architectures = [self.override_architecture] + return res + + def tensor_index(self, cache_dir: Optional[str] = None) -> ShardedTensorIndex: + assert self.lora is None + + path = self.model.path + if not os.path.exists(path): + has_safetensors = any( + fn.lower().endswith(".safetensors") + for fn in huggingface_hub.list_repo_files( + path, repo_type="model", revision=self.model.revision + ) + ) + patterns = ["tokenizer.model", "*.json"] + if has_safetensors: + patterns.append("*.safetensors") + else: + patterns.append("*.bin") + + path = huggingface_hub.snapshot_download( + path, + revision=self.model.revision, + cache_dir=cache_dir, + 
allow_patterns=patterns, + ) + + return ShardedTensorIndex.from_disk(path) + + def lazy_loader( + self, cache_dir: Optional[str] = None, lazy_unpickle: bool = True + ) -> LazyTensorLoader: + return LazyTensorLoader( + self.tensor_index(cache_dir), + lazy_unpickle=lazy_unpickle, + ) + + @model_validator(mode="before") + def validate_string(cls, value): + if isinstance(value, str): + chunks = value.split("+") + if len(chunks) == 1: + return {"model": value} + elif len(chunks) == 2: + return {"model": chunks[0], "lora": chunks[1]} + raise RuntimeError(f"Can't parse {value}") + return value + + @model_serializer() + def serialize(self): + res = str(self) + if '"' in res or " " in res: + return self + return res + + @classmethod + def parse(cls, value: str) -> "ModelReference": + """Parse a ModelReference. Format: '(+)?'""" + return ModelReference.model_validate(value) + + def __str__(self) -> str: + if self.lora: + return f"{str(self.model)}+{str(self.lora)}" + return str(self.model) + + +def dtype_from_name(name: Optional[str]) -> Optional[torch.dtype]: + if not name: + return None + + if name.startswith("torch."): + name = name[len("torch.") :] + + if name == "bfloat16": + return torch.bfloat16 + elif name == "float16": + return torch.float16 + elif name == "float32": + return torch.float32 + elif name == "int64": + return torch.int64 + raise RuntimeError(f'Unimplemented dtype "{name}"') + + +def parse_kmb(value: Union[str, int]) -> int: + if isinstance(value, int): + return value + elif value.isnumeric(): + return int(value) + elif value[-1].lower() == "k": + return int(value[:-1]) * 1000 + elif value[-1].lower() == "m": + return int(value[:-1]) * 1000 * 1000 + elif value[-1].lower() == "b": + return int(value[:-1]) * 1000 * 1000 * 1000 + else: + raise ValueError(value) + + +T_K = TypeVar("T_K") +T_V = TypeVar("T_V") + + +class ImmutableMap(Generic[T_K, T_V]): + data: immutables.Map[T_K, T_V] + + def __init__(self, data: Mapping[T_K, T_V]): + self.data = data + + @classmethod + def __get_pydantic_core_schema__( + cls, source: Any, handler: Callable[[Any], core_schema.CoreSchema] + ) -> core_schema.CoreSchema: + instance_schema = core_schema.is_instance_schema(cls) + + args = get_args(source) + if args: + dict_schema = handler(Dict[args[0], args[1]]) + else: + dict_schema = handler(Dict) + + non_instance_schema = core_schema.with_info_after_validator_function( + lambda value, _info: immutables.Map(value), dict_schema + ) + return core_schema.union_schema([instance_schema, non_instance_schema]) + + def __iter__(self): + return self.data.__iter__() + + def __getitem__(self, key: T_K) -> T_V: + return self.data[key] + + def __len__(self) -> int: + return len(self.data) + + def keys(self) -> Iterator[T_K]: + return self.data.keys() + + def items(self) -> Iterator[Tuple[T_K, T_V]]: + return self.data.items() + + def values(self) -> Iterator[T_V]: + return self.data.values() + + +def _get_auto_cls(arch_name: str): + """Get the AutoModel class for a given architecture name.""" + if arch_name.endswith("ForMaskedLM"): + auto_cls = transformers.AutoModelForMaskedLM + elif arch_name.endswith("ForSequenceClassification"): + auto_cls = transformers.AutoModelForSequenceClassification + elif arch_name.endswith("ForTokenClassification"): + auto_cls = transformers.AutoModelForTokenClassification + else: + if not arch_name.endswith("ForCausalLM") or arch_name.endswith("LMHeadModel"): + logging.warning( + f"Unknown model type {arch_name} - assuming AutoModelForCausalLM" + ) + auto_cls = 
transformers.AutoModelForCausalLM + return auto_cls diff --git a/mergekit/mergekit/config.py b/mergekit/mergekit/config.py new file mode 100644 index 0000000000000000000000000000000000000000..5c79de7c5536ed57416dff0435e034d7c6f2b415 --- /dev/null +++ b/mergekit/mergekit/config.py @@ -0,0 +1,221 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import yaml +from pydantic import BaseModel, model_validator +from typing_extensions import Literal, TypeAlias + +from mergekit.common import ModelReference +from mergekit.tokenizer.config import TokenizerConfig + +ScalarOrGradient: TypeAlias = Union[float, List[float]] + + +class ConditionalParameter(BaseModel): + value: ScalarOrGradient + filter: Optional[str] = None + + +ParameterSetting: TypeAlias = Union[ + ConditionalParameter, List[ConditionalParameter], ScalarOrGradient +] + + +def evaluate_setting( + tensor_name: str, setting: ParameterSetting, t: float = 0 +) -> float: + if isinstance(setting, (float, int, bool, str)): + return setting + elif isinstance(setting, list): + if all(isinstance(e, (int, float)) for e in setting): + scaled = t * (len(setting) - 1) + i0 = int(scaled) + i1 = min(len(setting) - 1, i0 + 1) + frac = scaled - i0 + + return (1 - frac) * setting[i0] + frac * setting[i1] + elif all(isinstance(e, (float, int, bool, str)) for e in setting): + return setting[int(t * (len(setting) - 1))] + else: + for cond in setting: + if ( + (cond.filter is None) + or (cond.filter == "*") + or (tensor_name and cond.filter in tensor_name) + ): + res = evaluate_setting(tensor_name, cond.value, t) + return res + else: + raise RuntimeError(f"Unexpected setting value: {setting}") + return None + + +class InputSliceDefinition(BaseModel): + model: ModelReference + layer_range: Tuple[int, int] + parameters: Optional[Dict[str, ParameterSetting]] = None + + +class InputModelDefinition(BaseModel): + model: ModelReference + parameters: Optional[Dict[str, ParameterSetting]] = None + + +class OutputSliceDefinition(BaseModel): + sources: List[InputSliceDefinition] + base_model: Optional[ModelReference] = None + residual_weight: Optional[float] = None + parameters: Optional[Dict[str, ParameterSetting]] = None + + +class MergeConfiguration(BaseModel): + merge_method: str + slices: Optional[List[OutputSliceDefinition]] = None + models: Optional[List[InputModelDefinition]] = None + parameters: Optional[Dict[str, ParameterSetting]] = None + base_model: Optional[ModelReference] = None + dtype: Optional[str] = None + tokenizer_source: Union[ + Literal["union"], Literal["base"], ModelReference, None + ] = None + tokenizer: Optional[TokenizerConfig] = None + chat_template: Optional[str] = None + out_dtype: Optional[str] = None + + def referenced_models(self) -> List[ModelReference]: + models = set() + if self.base_model: + 
models.add(self.base_model) + if self.models: + for model_in in self.models: + models.add(model_in.model) + if self.slices: + for s in self.slices: + for src in s.sources: + models.add(src.model) + return list(models) + + @model_validator(mode="after") + def validate_inputs(self): + if ((not self.slices) and (not self.models)) or (self.slices and self.models): + raise RuntimeError("Must specify either output slices or models to merge") + return self + + @model_validator(mode="after") + def validate_tokenizer(self): + if self.tokenizer_source and self.tokenizer: + raise RuntimeError("Cannot specify both tokenizer_source and tokenizer") + return self + + def to_yaml(self) -> str: + return yaml.dump( + self.model_dump(exclude_defaults=True, mode="json"), + Dumper=ConfigYamlDumper, + ).rstrip() + + +class ConfigReader(BaseModel): + config: MergeConfiguration + t: float + tensor_name: Optional[str] = None + slice_out: Optional[OutputSliceDefinition] = None + + @property + def base_model(self) -> Optional[ModelReference]: + if self.slice_out and self.slice_out.base_model: + res = self.slice_out.base_model + else: + res = self.config.base_model + + return res + + def for_out_slice(self, slice: OutputSliceDefinition) -> "ConfigReader": + return ConfigReader( + config=self.config, + t=self.t, + tensor_name=self.tensor_name, + slice_out=slice, + ) + + def for_tensor(self, tensor_name: str) -> "ConfigReader": + return ConfigReader( + config=self.config, + t=self.t, + tensor_name=tensor_name, + slice_out=self.slice_out, + ) + + def with_t(self, t: float) -> "ConfigReader": + return ConfigReader( + config=self.config, + t=t, + tensor_name=self.tensor_name, + slice_out=self.slice_out, + ) + + def parameter( + self, + name: str, + model: Optional[ModelReference] = None, + default: Any = None, + required: bool = False, + ) -> Any: + if self.slice_out: + if model: + for s in self.slice_out.sources: + if s.model == model and s.parameters and name in s.parameters: + value = evaluate_setting( + self.tensor_name, s.parameters[name], self.t + ) + if value is not None: + return value + + if self.slice_out.parameters and name in self.slice_out.parameters: + value = evaluate_setting( + self.tensor_name, self.slice_out.parameters[name], self.t + ) + if value is not None: + return value + + if self.config.parameters and name in self.config.parameters: + value = evaluate_setting( + self.tensor_name, + self.config.parameters[name], + self.t, + ) + if value is not None: + return value + + if required: + path_paths = [str(s) for s in [model, self.tensor_name] if s] + p = ".".join(path_paths) + suffix = f" for {p}" if p else "" + raise RuntimeError(f"Missing required parameter {name}{suffix}") + return default + + +class ConfigYamlDumper(yaml.Dumper): + """Custom YAML dumper to format lists of numbers in flow style.""" + + def represent_list(self, data: Iterable[Any]) -> yaml.SequenceNode: + flow_style = all(isinstance(e, (int, float)) for e in data) + return self.represent_sequence( + "tag:yaml.org,2002:seq", data, flow_style=flow_style + ) + + +ConfigYamlDumper.add_representer(list, ConfigYamlDumper.represent_list) diff --git a/mergekit/mergekit/evo/__init__.py b/mergekit/mergekit/evo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mergekit/mergekit/evo/actors.py b/mergekit/mergekit/evo/actors.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5c498695bc679d58734b64654c5fc662f61728 --- /dev/null +++ 
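A small illustration of how `evaluate_setting` in `config.py` above interpolates a list-valued (gradient) parameter along the normalized layer position `t`; the tensor name and values here are arbitrary.

```python
from mergekit.config import evaluate_setting

# A three-point gradient is interpolated piecewise-linearly over t in [0, 1].
gradient = [0.0, 1.0, 0.2]
for t in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(t, evaluate_setting("model.layers.0.mlp.up_proj.weight", gradient, t))
# -> 0.0, 0.5, 1.0, 0.6, 0.2 (up to float rounding)
```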
b/mergekit/mergekit/evo/actors.py @@ -0,0 +1,324 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import gc +import logging +import tempfile +from typing import Optional, Union + +import lm_eval +import lm_eval.api.model +import lm_eval.models.huggingface +import lm_eval.tasks +import ray +import ray.util.queue +import ray.util.scheduling_strategies +import torch +import transformers +from transformers.utils import is_flash_attn_2_available + +try: + import vllm +except ImportError: + vllm = None + + +from mergekit.architecture import ConfiguredArchitectureInfo, get_architecture_info +from mergekit.config import MergeConfiguration +from mergekit.evo.config import EvolMergeConfiguration +from mergekit.evo.genome import InvalidGenotypeError, ModelGenome +from mergekit.evo.helpers import _eval_model, evaluate_model, merge_model +from mergekit.evo.monkeypatch import ( + NoInit, + monkeypatch_lmeval_shuffle, + monkeypatch_lmeval_vllm, +) +from mergekit.graph import Executor +from mergekit.io.tasks import LoaderCache, ReturnTensor +from mergekit.merge import _model_out_config +from mergekit.options import MergeOptions +from mergekit.plan import MergePlanner + + +class MergeActorBase: + def __init__( + self, + config: EvolMergeConfiguration, + genome: ModelGenome, + merge_options: MergeOptions, + model_storage_path: Optional[str] = None, + vllm: bool = False, + batch_size: Optional[int] = None, + task_manager: Optional[lm_eval.tasks.TaskManager] = None, + ): + self.config = config + self.genome = genome + self.merge_options = merge_options + self.cache = LoaderCache() + self.cache.setup(merge_options) + self.model_storage_path = model_storage_path + self.vllm = vllm + self.batch_size = batch_size + self.task_manager = task_manager + + if config.shuffle: + monkeypatch_lmeval_shuffle() + + # monkeypatch_tqdm() + monkeypatch_lmeval_vllm() + + +@ray.remote(num_cpus=1, num_gpus=1.0) +class OnDiskMergeEvaluator(MergeActorBase): + """ + Merges models to disk then evaluates them in a separate process. + + Maximum compatibility and potential for parallelism, but higher overhead. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def evaluate_genotype( + self, + genotype: torch.Tensor, + ) -> dict: + gc.collect() + torch.cuda.empty_cache() + logging.info("Merging model") + merged_path = merge_model( + genotype, self.genome, self.model_storage_path, self.merge_options + ) + if not merged_path: + logging.error("Model merge failed") + return {"score": None, "results": None} + + logging.info(f"Model merged to {merged_path}") + return evaluate_model( + merged_path, + self.config.tasks, + num_fewshot=self.config.num_fewshot, + limit=self.config.limit, + vllm=self.vllm, + batch_size=self.batch_size, + task_manager=self.task_manager, + ) + + +@ray.remote(num_cpus=1, num_gpus=1) +class InMemoryMergeEvaluator(MergeActorBase): + """ + Performs merges in memory, using a single model instance. + + This reduces overhead from disk I/O and model loading, but prevents + parallelism and may be slower for large models. + + Implementation is dark sorcery tampering with the internals of lm-eval, + transformers, and vLLM and may break at any time. + """ + + model: Union[ + lm_eval.models.huggingface.HFLM, lm_eval.models.vllm_causallms.VLLM, None + ] = None + arch_info: Optional[ConfiguredArchitectureInfo] = None + + def __init__( + self, + *args, + vllm: bool = False, + **kwargs, + ): + # assert not vllm, "VLLM is not supported for in-memory merging" + super().__init__(*args, vllm=vllm, **kwargs) + + def _maybe_init_model(self, config: MergeConfiguration): + ai = get_architecture_info(self.genome._input_config_example) + cfg_out = _model_out_config( + config, + ai, + trust_remote_code=self.merge_options.trust_remote_code, + ) + cfg_out.use_cache = True + cfg_out.torch_dtype = torch.bfloat16 + + if self.arch_info is not None: + different = False + for key in cfg_out.to_diff_dict(): + if key in ["architectures", "model_type"]: + # to get to here we must have --allow-crimes set, so let it ride + continue + elif key in ["use_cache", "torch_dtype"]: + continue + elif key.endswith("_token_id"): + # update our config but don't fail if it's different + setattr(self.arch_info.config, key, getattr(cfg_out, key, None)) + continue + + if getattr(cfg_out, key) != getattr(self.arch_info.config, key, None): + logging.warn(f"Config key {key} changed, reinitializing model") + different = True + break + + if not different: + return + + self.inner_model = None + + model_kwargs = { + "trust_remote_code": self.merge_options.trust_remote_code, + "torch_dtype": torch.bfloat16, + } + if is_flash_attn_2_available(): + model_kwargs["attn_implementation"] = "flash_attention_2" + + with NoInit(): + inner_model = ( + transformers.AutoModelForCausalLM.from_config( + cfg_out, + **model_kwargs, + ) + .bfloat16() + .cuda() + .eval() + .requires_grad_(False) + ) + + if self.vllm: + # oh i hate this + with tempfile.TemporaryDirectory( + dir=self.model_storage_path, prefix="vllm" + ) as tempdir: + inner_model.save_pretrained( + tempdir, safe_serialization=True, out_shard_size=1_000_000_000_000 + ) + del inner_model + tokenizer_donor = self.genome.definition.base_model + if tokenizer_donor is None: + logging.warning( + "Base model not set, using tokenizer from first model in genome" + ) + tokenizer_donor = self.genome.definition.models[0] + tok = transformers.AutoTokenizer.from_pretrained( + tokenizer_donor.model.path, use_fast=True + ) + tok.save_pretrained(tempdir) + + max_model_len = None + if ( + seq_len := getattr(cfg_out, "max_position_embeddings", None) + ) is not None: + 
max_model_len = seq_len + if (window_sz := getattr(cfg_out, "sliding_window", None)) is not None: + max_model_len = min(max_model_len or 1024, window_sz) + if max_model_len and max_model_len > 8192: + max_model_len = 8192 + logging.warn(f"Clipping sequence length to {max_model_len}") + + mem_util = ( + 0.7 if self.merge_options.cuda else 0.9 + ) # reduce memory usage if we're also using cuda for the merge + self.model = lm_eval.models.vllm_causallms.VLLM( + pretrained=tempdir, + batch_size=self.batch_size or "auto", + max_model_len=max_model_len, + gpu_memory_utilization=mem_util, + dtype="bfloat16", + device="cuda", + trust_remote_code=self.merge_options.trust_remote_code, + ) + else: + self.model = lm_eval.models.huggingface.HFLM(pretrained=inner_model) + self.arch_info = ConfiguredArchitectureInfo(info=ai, config=cfg_out) + logging.info("Model initialized") + + def evaluate(self, genotype: torch.Tensor) -> dict: + try: + config = self.genome.genotype_merge_config(genotype) + except InvalidGenotypeError as e: + logging.error("Invalid genotype", exc_info=e) + return {"score": None, "results": None} + + self._maybe_init_model(config) + + planner = MergePlanner( + config, + self.arch_info.info, + self.merge_options, + self.arch_info.config, + ) + + tasks = planner.plan_in_memory() + + model = self.model.model + if vllm is not None and isinstance(model, vllm.LLM): + assert ( + model.llm_engine.parallel_config.world_size == 1 + ), "Must be single GPU" + worker = model.llm_engine.driver_worker + model = worker.model_runner.model + param_dict = dict(model.named_parameters()) + + stacked_mapping = { + # mappings for Llama/Mistral attention weights to vLLM packed tensors + ".q_proj.": (".qkv_proj.", "q"), + ".k_proj.": (".qkv_proj.", "k"), + ".v_proj.": (".qkv_proj.", "v"), + ".gate_proj.": (".gate_up_proj.", 0), + ".up_proj.": (".gate_up_proj.", 1), + } + + executor = Executor( + tasks, + math_device="cuda" if self.merge_options.cuda else "cpu", + storage_device="cuda" if self.merge_options.cuda else "cpu", + ) + for tensor_task, value in executor.run(quiet=True): + assert isinstance(tensor_task, ReturnTensor) + name = tensor_task.weight_info.name + + if name in param_dict: + param_dict[name].data.copy_(value, non_blocking=True) + elif self.vllm: + stacked = False + for needle, (replacement, shard_id) in stacked_mapping.items(): + if needle in name: + target = name.replace(needle, replacement) + param = param_dict[target] + weight_loader = param.weight_loader + weight_loader(param, value, shard_id) + stacked = True + break + + if not stacked: + raise ValueError(f"Unknown parameter {name}") + else: + raise ValueError(f"Unknown parameter {name}") + + del value + + return _eval_model( + self.model, + self.config.tasks, + num_fewshot=self.config.num_fewshot, + limit=self.config.limit, + task_manager=self.task_manager, + batch_size=self.batch_size, + ) + + def evaluate_genotype( + self, + genotype: torch.Tensor, + ) -> dict: + return self.evaluate(genotype) diff --git a/mergekit/mergekit/evo/config.py b/mergekit/mergekit/evo/config.py new file mode 100644 index 0000000000000000000000000000000000000000..51a66abae0a7afb4d59f63e76bdd312a87ec0945 --- /dev/null +++ b/mergekit/mergekit/evo/config.py @@ -0,0 +1,95 @@ +# Copyright (C) 2024 Charles O. 
Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +from typing import List, Optional + +from pydantic import BaseModel, model_validator + +from mergekit.evo.genome import ModelGenomeDefinition + + +class TaskConfiguration(BaseModel, frozen=True): + name: str + weight: float = 1.0 + metric: str = "acc,none" + + @model_validator(mode="before") + def validate_string(cls, value): + if isinstance(value, str): + return {"name": value} + return value + + +class EvolMergeConfiguration(BaseModel, frozen=True): + genome: ModelGenomeDefinition + tasks: List[TaskConfiguration] + limit: Optional[int] = None + num_fewshot: Optional[int] = None + shuffle: bool = False + random_init: bool = False + + +NAUGHTY_PREFIXES = [ + "mmlu", + "hendrycks", + "agieval", + "gsm8k", + "hellaswag", + "winogrande", + "arc_", + "ai2_arc", + "truthfulqa", + "bigbench", + "piqa", + "openbookqa", +] + + +def check_for_naughty_config(config: EvolMergeConfiguration, allow: bool = False): + """ + Check if the given configuration is naughty and should be disallowed. + + mergekit-evolve is perfectly set up to directly optimize against the test set + of common benchmarks, which just makes the world a worse place. There are + cases where this is useful but it deserves a giant honking warning. + """ + suffix = "" + if not allow: + suffix = ( + " To proceed, set the " + "--i-understand-the-depths-of-the-evils-i-am-unleashing flag." + ) + for task in config.tasks: + for prefix in NAUGHTY_PREFIXES: + if task.name.startswith(prefix): + if task.name.endswith("_train"): + # there aren't any tasks that match this pattern in base + # lm-eval, but it'd be a sane thing to do to add tasks for + # the training sets of these benchmarks. don't warn about + # them + continue + + message = ( + f"Task {task.name} is a common benchmark task. " + "Optimizing against this task directly is unsporting at best " + "and outright malicious at worst. Using mergekit-evolve to " + "game benchmarks will be a black mark on your name for a " + f"thousand generations.{suffix}" + ) + if not allow: + raise ValueError(message) + else: + logging.warning(message) diff --git a/mergekit/mergekit/evo/genome.py b/mergekit/mergekit/evo/genome.py new file mode 100644 index 0000000000000000000000000000000000000000..696296443b11002d88187e31033301ddeb867c9e --- /dev/null +++ b/mergekit/mergekit/evo/genome.py @@ -0,0 +1,383 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
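A hedged sketch of building the `EvolMergeConfiguration` defined just above and running `check_for_naughty_config` on it; every model and task name here is a placeholder, not a real repository or lm-eval task.

```python
from mergekit.evo.config import EvolMergeConfiguration, check_for_naughty_config

cfg = EvolMergeConfiguration.model_validate(
    {
        "genome": {
            "models": ["example-org/model-a", "example-org/model-b"],
            "merge_method": "linear",
            "layer_granularity": 4,
        },
        "tasks": [{"name": "my_validation_task", "weight": 1.0}],
        "limit": 200,
    }
)
check_for_naughty_config(cfg)  # raises only if a task targets a common benchmark
```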
See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +import os +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch +import transformers +from pydantic import BaseModel, model_validator + +from mergekit.common import ModelReference +from mergekit.config import MergeConfiguration + +METHOD_PARAM_MAPS = { + "linear": ["weight"], + "task_arithmetic": ["weight"], + "ties": ["weight", "density"], + "dare_ties": ["weight", "density"], + "slerp": ["t"], +} + + +class InvalidGenotypeError(RuntimeError): + pass + + +class ModelGenomeDefinition(BaseModel, frozen=True): + models: List[ModelReference] + merge_method: str + base_model: Optional[ModelReference] = None + tokenizer_source: Optional[str] = None + layer_granularity: int = 0 + normalize: Optional[bool] = None + allow_negative_weights: bool = False + filters: Optional[List[str]] = None + smooth: bool = False + + @model_validator(mode="after") + def validate(self): + assert self.merge_method in METHOD_PARAM_MAPS, "Invalid merge method" + + if self.merge_method in ["ties", "dare_ties", "task_arithmetic"]: + assert self.base_model is not None, "base_model is required for this method" + + if self.merge_method == "slerp": + assert not self.smooth, "smooth is not supported for slerp merge method" + assert ( + not self.filters + ), "tensor name filtering is not supported for slerp merge method" + + return self + + +class ModelGenome: + definiton: ModelGenomeDefinition + num_layers: int + _input_config_example: transformers.PretrainedConfig + + def __init__( + self, definition: ModelGenomeDefinition, trust_remote_code: bool = False + ): + self.definition = definition + + self._input_config_example = self.definition.models[0].config( + trust_remote_code=trust_remote_code + ) + self.num_layers = self._input_config_example.num_hidden_layers + + assert ( + self.definition.layer_granularity < 1 + or self.num_layers % self.definition.layer_granularity == 0 + ), "Number of layers must be a multiple of layer_granularity" + + def initial_genotype(self, random: bool = False) -> torch.Tensor: + """Generate an initial genotype for the given number of layers.""" + if self.definition.layer_granularity > 0: + n_layer_groups = self.num_layers // self.definition.layer_granularity + else: + n_layer_groups = 1 + n_param_sets = len(self.definition.filters or []) + 1 + n_models = len(self.definition.models) + n_params = len(METHOD_PARAM_MAPS[self.definition.merge_method]) + + if random: + return torch.rand(n_layer_groups, n_models, n_param_sets, n_params) + else: + x0_t = torch.zeros(n_layer_groups, n_models, n_param_sets, n_params) + # weight is always first + x0_t[:, :, :, 0] = 1 / n_models + if n_params > 1: + # sometimes followed by density + x0_t[:, :, :, 1:] = 1 + return x0_t + + def genotype_merge_config( + self, genotype: Union[torch.Tensor, np.ndarray] + ) -> MergeConfiguration: + """Convert a genotype tensor to a mergekit configuration.""" + + genotype = self._to_torch(genotype) + + (n_layer_groups, n_models, n_param_sets, n_params) = genotype.shape + if self.definition.layer_granularity > 0: + assert n_layer_groups * self.definition.layer_granularity == self.num_layers + assert n_models == len(self.definition.models) + assert n_params == len(METHOD_PARAM_MAPS[self.definition.merge_method]) + + if self.definition.merge_method == "slerp": + slices = 
self._slerp_slices(genotype) + models = None + else: + param_arrays = {} + for param_idx, param in enumerate( + METHOD_PARAM_MAPS[self.definition.merge_method] + ): + values = genotype[:, :, :, param_idx] + if param == "density": + # ensure density is in [0, 1] + values = torch.abs(values).clamp(0, 1) + if not self.definition.allow_negative_weights and param in [ + "weight", + "t", + ]: + values = torch.abs(values) + param_arrays[param] = values + + if self.definition.smooth: + slices = None + models = self._smooth_config_models(n_param_sets, param_arrays) + else: + models = None + slices = self._discrete_config_slices( + n_layer_groups, n_param_sets, param_arrays + ) + + normalize = self.definition.normalize + if normalize is None: + normalize = self.definition.merge_method in ["ties", "dare_ties", "linear"] + return MergeConfiguration.model_validate( + { + "merge_method": self.definition.merge_method, + "slices": slices, + "models": models, + "parameters": { + "normalize": normalize, + "int8_mask": True, + }, + "dtype": "bfloat16", + "base_model": self.definition.base_model, + "tokenizer_source": self.definition.tokenizer_source, + } + ) + + def _discrete_config_slices( + self, + n_layer_groups: int, + n_param_sets: int, + param_arrays: Dict[str, torch.Tensor], + ) -> List[Dict]: + """Generate merge config output slices for non-interpolated parameters.""" + slices = [] + layer_step = ( + self.definition.layer_granularity + if self.definition.layer_granularity > 0 + else self.num_layers + ) + for slice_idx in range(n_layer_groups): + sources = [] + for model_idx, model in enumerate(self.definition.models): + params = {} + if n_param_sets > 1: + for param, values in param_arrays.items(): + params[param] = [] + for set_idx in range(n_param_sets): + value = values[ + slice_idx, + model_idx, + set_idx, + ] + filter_ = (self.definition.filters + [None])[set_idx] + params[param].append( + {"filter": filter_, "value": value.item()} + ) + else: + for param, values in param_arrays.items(): + params[param] = values[ + slice_idx, + model_idx, + 0, + ].item() + + sources.append( + { + "model": model, + "layer_range": [ + slice_idx * layer_step, + (slice_idx + 1) * layer_step, + ], + "parameters": params, + } + ) + + if self.definition.base_model and ( + self.definition.base_model not in self.definition.models + ): + sources.append( + { + "model": self.definition.base_model, + "layer_range": [ + slice_idx * layer_step, + (slice_idx + 1) * layer_step, + ], + } + ) + slices.append({"sources": sources}) + return slices + + def _smooth_config_models( + self, n_param_sets: int, param_arrays: Dict[str, torch.Tensor] + ) -> List[Dict]: + """Generate merge config model section with parameter interpolation.""" + models = [] + for model_idx, model in enumerate(self.definition.models): + params = {} + if n_param_sets > 1: + for param, values in param_arrays.items(): + params[param] = [] + for set_idx in range(n_param_sets): + value = values[:, model_idx, set_idx] + filter_ = (self.definition.filters + [None])[set_idx] + params[param].append( + { + "filter": filter_, + "value": _unpack_single_element(value.tolist()), + } + ) + else: + for param, values in param_arrays.items(): + params[param] = _unpack_single_element( + values[:, model_idx, 0].tolist() + ) + + models.append( + { + "model": model, + "layer_range": [0, self.num_layers], + "parameters": params, + } + ) + + if self.definition.base_model and ( + self.definition.base_model not in self.definition.models + ): + models.append({"model": 
self.definition.base_model}) + return models + + def _slerp_slices(self, genotype: torch.Tensor) -> List[Dict]: + """Generate merge config output slices for SLERP. + + This method is a bit more complex because it requires choosing the + two models with the highest weight for each layer group and calculating + the interpolation parameter t. Parameter interpolation and component + splitting are not supported because it's too hard and I don't want to. + """ + n_layer_groups, n_models, _, _ = genotype.shape + layer_step = ( + self.definition.layer_granularity + if self.definition.layer_granularity > 0 + else self.num_layers + ) + slices = [] + for slice_idx in range(n_layer_groups): + s = { + "sources": [ + { + "model": self.definition.models[i], + "layer_range": [ + slice_idx * layer_step, + (slice_idx + 1) * layer_step, + ], + } + for i in range(n_models) + ] + } + + # Choose the two models with the highest weight and + # calculate the interpolation parameter t + chosen = torch.topk(genotype[slice_idx, :, 0, 0], 2) + t = torch.softmax(chosen.values, dim=-1)[1].item() + s["parameters"] = {"t": t} + s["base_model"] = self.definition.models[chosen.indices[0].item()] + s["sources"] = [ + s["sources"][chosen.indices[0].item()], + s["sources"][chosen.indices[1].item()], + ] + if self.definition.tokenizer_source: + s["sources"][0]["parameters"] = {"weight": 1 - t} + s["sources"][1]["parameters"] = {"weight": t} + + if self.definition.base_model and ( + self.definition.base_model not in self.definition.models + ): + s["sources"].append( + { + "model": self.definition.base_model, + "layer_range": [ + slice_idx * layer_step, + (slice_idx + 1) * layer_step, + ], + } + ) + + slices.append(s) + return slices + + def _to_torch(self, genotype: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + """Convert a genotype to a torch tensor of the correct shape.""" + if not isinstance(genotype, torch.Tensor): + genotype = torch.tensor(genotype) + if len(genotype.shape) == 1: + num_layer_groups = ( + self.num_layers // self.definition.layer_granularity + if self.definition.layer_granularity > 0 + else 1 + ) + genotype = genotype.view( + num_layer_groups, + len(self.definition.models), + len(self.definition.filters or []) + 1, + -1, + ) + + if len(genotype.shape) != 4: + logging.error(f"Invalid genotype shape: {genotype.shape}") + raise InvalidGenotypeError( + "Invalid genotype shape - must be 4D tensor or 1D array" + ) + + return genotype + + def genotype_to_param_arrays( + self, genotype: Union[torch.Tensor, np.ndarray] + ) -> Dict[str, torch.Tensor]: + """Convert a genotype tensor to a dictionary of numpy arrays.""" + genotype = self._to_torch(genotype) + + res = {} + for idx, param_name in enumerate( + METHOD_PARAM_MAPS[self.definition.merge_method] + ): + for model_idx, model in enumerate(self.definition.models): + model_name = os.path.basename(model.model.path) + for set_idx, filter_ in enumerate( + (self.definition.filters or []) + [None] + ): + suffix = "" + if filter_ is not None: + suffix = f"_{filter_}" + res[f"{model_name}_{param_name}{suffix}"] = genotype[ + :, model_idx, set_idx, idx + ] + + return res + + +def _unpack_single_element(x: List) -> Any: + if len(x) == 1: + return x[0] + return x diff --git a/mergekit/mergekit/evo/helpers.py b/mergekit/mergekit/evo/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..f87829d5c19e6f6ab71a4c894c937fc2b2687073 --- /dev/null +++ b/mergekit/mergekit/evo/helpers.py @@ -0,0 +1,126 @@ +# Copyright (C) 2024 Charles O. 
Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +import os +import shutil +import tempfile +from typing import Any, Dict, List, Optional, Union + +import lm_eval +import lm_eval.api.model +import lm_eval.models.huggingface +import lm_eval.tasks +import ray +import ray.util.queue +import ray.util.scheduling_strategies +import torch + +from mergekit.evo.config import TaskConfiguration +from mergekit.evo.genome import InvalidGenotypeError, ModelGenome +from mergekit.evo.monkeypatch import monkeypatch_lmeval_vllm +from mergekit.merge import run_merge +from mergekit.options import MergeOptions + + +def _eval_model( + model: Union[str, lm_eval.api.model.LM], + tasks: List[TaskConfiguration], + model_args: Optional[Dict[str, Any]] = None, + task_manager: Optional[lm_eval.tasks.TaskManager] = None, + **kwargs, +) -> Dict[str, Any]: + results = lm_eval.evaluator.simple_evaluate( + model=model, + model_args=model_args, + tasks=list(set([task.name for task in tasks])), + log_samples=False, + verbosity="WARNING", + task_manager=task_manager, + **kwargs, + ) + + logging.info(results["results"]) + res = 0 + for task in tasks: + res += results["results"][task.name][task.metric] * task.weight + return {"score": res, "results": results["results"]} + + +def evaluate_model( + merged_path: str, + tasks: List[TaskConfiguration], + num_fewshot: Optional[int], + limit: Optional[int], + vllm: bool, + batch_size: Optional[int] = None, + task_manager: Optional[lm_eval.tasks.TaskManager] = None, +) -> dict: + # monkeypatch_tqdm() + monkeypatch_lmeval_vllm() + try: + model_args = { + "pretrained": merged_path, + "dtype": "bfloat16", + } + if vllm: + model_args["gpu_memory_utilization"] = 0.8 + model_args["tensor_parallel_size"] = 1 + model_args["batch_size"] = "auto" + model_args["max_model_len"] = 4096 + else: + model_args["use_cache"] = True + + res = _eval_model( + "vllm" if vllm else "huggingface", + tasks, + model_args, + num_fewshot=num_fewshot, + limit=limit, + batch_size=batch_size, + task_manager=task_manager, + ) + return res + finally: + shutil.rmtree(merged_path) + + +evaluate_model_ray = ray.remote(num_cpus=1, num_gpus=1.0)(evaluate_model) + + +def merge_model( + genotype: torch.Tensor, + genome: ModelGenome, + model_storage_path: str, + merge_options: MergeOptions, +) -> str: + # monkeypatch_tqdm() + try: + cfg = genome.genotype_merge_config(genotype) + except InvalidGenotypeError as e: + logging.error("Invalid genotype", exc_info=e) + return None + os.makedirs(model_storage_path, exist_ok=True) + res = tempfile.mkdtemp(prefix="merged", dir=model_storage_path) + run_merge(cfg, out_path=res, options=merge_options) + return res + + +merge_model_ray = ray.remote( + num_cpus=1, + num_gpus=1, + max_retries=3, + retry_exceptions=[ConnectionError], +)(merge_model) diff --git a/mergekit/mergekit/evo/monkeypatch.py b/mergekit/mergekit/evo/monkeypatch.py new file mode 100644 
index 0000000000000000000000000000000000000000..83224d4f6246a80f834775ec568a3659515f3f73 --- /dev/null +++ b/mergekit/mergekit/evo/monkeypatch.py @@ -0,0 +1,140 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + + +import torch +import transformers + + +def monkeypatch_lmeval_shuffle(): + """Monkeypatch lm_eval to shuffle the dataset after downloading.""" + import lm_eval.api.task + + if hasattr(lm_eval.api.task.Task, "_monkey_patched"): + return + + _old_task_dl = lm_eval.api.task.Task.download + + def _dl_shuffled(self: lm_eval.api.task.Task, *args, **kwargs): + _old_task_dl(self, *args, **kwargs) + self.dataset = self.dataset.shuffle() + + lm_eval.api.task.Task.download = _dl_shuffled + + _old_ct_dl = lm_eval.api.task.ConfigurableTask.download + + def _ct_dl_shuffled(self, *args, **kwargs): + _old_ct_dl(self, *args, **kwargs) + self.dataset = self.dataset.shuffle() + + lm_eval.api.task.ConfigurableTask.download = _ct_dl_shuffled + + lm_eval.api.task.Task._monkey_patched = True + print("monkey has been patched") + + +def monkeypatch_tqdm(lm_eval: bool = True, mergekit: bool = True): + """Patch lm_eval & mergekit to use Ray's tqdm for progress bars.""" + + from ray.experimental.tqdm_ray import tqdm as tqdm_ray + + def _tqdm_wrap(iterable=None, disable: bool = False, **kwargs): + if disable: + if iterable is not None: + return iterable + return lambda x: x + res = tqdm_ray(iterable=iterable, **kwargs, flush_interval_s=1.0) + res.refresh() + return res + + def _patch_lm_eval(): + import lm_eval + + if hasattr(lm_eval, "_mk_tqdm_patched"): + return + + import lm_eval.api.metrics + import lm_eval.api.model + import lm_eval.api.task + import lm_eval.models.huggingface + import lm_eval.models.vllm_causallms + + for module in ( + lm_eval.models.huggingface, + lm_eval.models.vllm_causallms, + lm_eval.api.model, + lm_eval.api.task, + lm_eval.api.metrics, + ): + setattr(module, "tqdm", _tqdm_wrap) + + lm_eval._mk_tqdm_patched = True + + if lm_eval: + _patch_lm_eval() + + if mergekit: + del mergekit + + import mergekit + import mergekit.graph + import mergekit.merge + import mergekit.tokenizer + + fake_module = type("fake_module", (), {"tqdm": staticmethod(_tqdm_wrap)})() + + mergekit.graph.tqdm = fake_module + mergekit.merge.tqdm = fake_module + mergekit.tokenizer.tqdm = fake_module + + +def monkeypatch_lmeval_vllm(): + # HACK: fix crash on some tasks due to unset AUTO_MODEL_CLASS for vLLM + import lm_eval.models.vllm_causallms + + lm_eval.models.vllm_causallms.VLLM.AUTO_MODEL_CLASS = ( + transformers.AutoModelForCausalLM + ) + + +class NoInit: + def __enter__(self): + def noop(*args, **kwargs): + pass + + (k, u, n) = ( + torch.nn.init.kaiming_uniform_, + torch.nn.init.uniform_, + torch.nn.init.normal_, + ) + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + 
transformers.modeling_utils._init_weights = False + self.funcs = (k, u, n) + + def __exit__(self, *args): + (k, u, n) = self.funcs + ( + torch.nn.init.kaiming_uniform_, + torch.nn.init.uniform_, + torch.nn.init.normal_, + ) = ( + k, + u, + n, + ) + transformers.modeling_utils._init_weights = True diff --git a/mergekit/mergekit/evo/strategy.py b/mergekit/mergekit/evo/strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4c581c145faf16d83beb048a6bdd9ac24c2e6c --- /dev/null +++ b/mergekit/mergekit/evo/strategy.py @@ -0,0 +1,301 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import asyncio +import logging +import os +from abc import ABC, abstractmethod +from typing import Dict, List, Optional, Tuple, Union + +import lm_eval.tasks +import numpy as np +import ray +import ray.util.queue +import ray.util.scheduling_strategies +import torch + +from mergekit.evo.actors import InMemoryMergeEvaluator, OnDiskMergeEvaluator +from mergekit.evo.config import EvolMergeConfiguration +from mergekit.evo.genome import ModelGenome +from mergekit.evo.helpers import evaluate_model_ray, merge_model_ray +from mergekit.options import MergeOptions + + +class EvaluationStrategyBase(ABC): + def __init__( + self, + config: EvolMergeConfiguration, + genome: ModelGenome, + merge_options: MergeOptions, + num_gpus: Optional[int] = None, + batch_size: Optional[int] = None, + task_search_path: Union[str, List[str], None] = None, + model_storage_path: Optional[str] = None, + ): + self.config = config + self.genome = genome + self.merge_options = merge_options + self.num_gpus = num_gpus or torch.cuda.device_count() + self.batch_size = batch_size + self.task_manager = lm_eval.tasks.TaskManager(include_path=task_search_path) + self.model_storage_path = model_storage_path + if self.model_storage_path: + os.makedirs(self.model_storage_path, exist_ok=True) + + @abstractmethod + def evaluate_genotypes(self, genotypes: List[np.ndarray]) -> List[dict]: + pass + + @abstractmethod + def evaluate_genotype(self, genotype: np.ndarray) -> dict: + pass + + +class ActorPoolEvaluationStrategy(EvaluationStrategyBase): + """ + Uses a fixed-size pool of actors to evaluate genotypes in parallel. 
+ """ + + def __init__( + self, + *args, + in_memory: bool = False, + vllm: bool = False, + **kwargs, + ): + super().__init__(*args, **kwargs) + if in_memory: + self.actor_cls = InMemoryMergeEvaluator + else: + self.actor_cls = OnDiskMergeEvaluator + + self.actor_pool = ray.util.ActorPool( + [ + self.actor_cls.remote( + self.config, + self.genome, + self.merge_options, + model_storage_path=self.model_storage_path, + vllm=vllm, + batch_size=self.batch_size, + task_manager=self.task_manager, + ) + for _ in range(self.num_gpus) + ] + ) + + def evaluate_genotypes(self, genotypes: List[np.ndarray]) -> List[dict]: + return list( + self.actor_pool.map( + lambda a, x: a.evaluate_genotype.remote(x), + genotypes, + ) + ) + + def evaluate_genotype(self, genotype: np.ndarray) -> dict: + return self.evaluate_genotypes([genotype])[0] + + +@ray.remote +class BufferedRayEvaluationStrategyActor: + def __init__( + self, + config: EvolMergeConfiguration, + genome: ModelGenome, + merge_options: MergeOptions, + vllm: bool = False, + num_gpus: Optional[int] = None, + batch_size: Optional[int] = None, + task_manager: Optional[lm_eval.tasks.TaskManager] = None, + model_storage_path: Optional[str] = None, + ): + self.config = config + self.genome = genome + self.merge_options = merge_options + self.vllm = vllm + self.num_gpus = num_gpus or torch.cuda.device_count() + self.input_queue = [] + self.batch_size = batch_size + self.task_manager = task_manager + self.model_storage_path = model_storage_path + self._shutdown = False + + async def evaluate_genotype(self, genotype: np.ndarray): + future_result = asyncio.Future() + self.input_queue.append((genotype, future_result)) + return await future_result + + async def process_queue(self): + merging: Dict[ray.ObjectRef, asyncio.Future] = {} + merged: List[Tuple[asyncio.Future, ray.ObjectRef]] = [] + evaluating: Dict[ray.ObjectRef, asyncio.Future] = {} + + logging.info("Starting processing loop") + + try: + while not self._shutdown: + while self.input_queue and (len(merging) + len(merged) < self.num_gpus): + genotype, future_result = self.input_queue.pop(0) + merging[ + merge_model_ray.remote( + genotype, + self.genome, + self.model_storage_path, + self.merge_options, + ) + ] = future_result + + while merged and len(evaluating) < self.num_gpus: + future_result, merged_path = merged.pop() + evaluating[ + evaluate_model_ray.remote( + merged_path, + self.config.tasks, + num_fewshot=self.config.num_fewshot, + limit=self.config.limit, + vllm=self.vllm, + batch_size=self.batch_size, + task_manager=self.task_manager, + ) + ] = future_result + + ready, _ = ray.wait( + list(merging.keys()) + list(evaluating.keys()), + num_returns=1, + fetch_local=False, + timeout=1, + ) + for r in ready: + if r in merging: + future_result = merging.pop(r) + merged.append((future_result, r)) + elif r in evaluating: + future_result = evaluating.pop(r) + future_result.set_result(await r) + + if ( + not self.input_queue + and not merging + and not merged + and not evaluating + ): + await asyncio.sleep(1) + except Exception as e: + logging.error("Error in processing loop", exc_info=e) + raise + + async def shutdown(self): + self._shutdown = True + + +class BufferedRayEvaluationStrategy(EvaluationStrategyBase): + def __init__( + self, + *args, + vllm: bool = False, + in_memory: bool = False, + **kwargs, + ): + if in_memory: + raise ValueError("In-memory evaluation is not supported for buffered mode") + + super().__init__(*args, **kwargs) + self.actor = BufferedRayEvaluationStrategyActor.options( + 
max_concurrency=1000 + ).remote( + self.config, + self.genome, + self.merge_options, + model_storage_path=self.model_storage_path, + vllm=vllm, + num_gpus=self.num_gpus, + task_manager=self.task_manager, + ) + self.actor.process_queue.remote() + + def evaluate_genotypes(self, genotypes: List[np.ndarray]) -> List[dict]: + return ray.get([self.actor.evaluate_genotype.remote(x) for x in genotypes]) + + def evaluate_genotype(self, genotype: np.ndarray) -> dict: + return ray.get(self.actor.evaluate_genotype.remote(genotype)) + + +@ray.remote +def evaluate_genotype_serial( + genotype: np.ndarray, + config: EvolMergeConfiguration, + genome: ModelGenome, + merge_options: MergeOptions, + model_storage_path: Optional[str] = None, + vllm: bool = False, + batch_size: Optional[int] = None, + task_manager: Optional[lm_eval.tasks.TaskManager] = None, +): + pg = ray.util.placement_group([{"CPU": 1, "GPU": 1}], strategy="STRICT_PACK") + strat = ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy( + placement_group=pg + ) + merged_path = merge_model_ray.options(scheduling_strategy=strat).remote( + genotype, genome, model_storage_path, merge_options + ) + if not merged_path: + return {"score": None, "results": None} + res = ray.get( + evaluate_model_ray.options(scheduling_strategy=strat).remote( + merged_path, + config.tasks, + num_fewshot=config.num_fewshot, + limit=config.limit, + vllm=vllm, + batch_size=batch_size, + task_manager=task_manager, + ) + ) + ray.util.remove_placement_group(pg) + return res + + +class SerialEvaluationStrategy(EvaluationStrategyBase): + def __init__( + self, + *args, + vllm: bool = False, + in_memory: bool = False, + **kwargs, + ): + self.vllm = vllm + if in_memory: + raise ValueError("In-memory evaluation is not supported for serial mode") + super().__init__(*args, **kwargs) + + def evaluate_genotypes(self, genotypes: List[np.ndarray]) -> List[dict]: + return ray.get( + [ + evaluate_genotype_serial.remote( + x, + self.config, + self.genome, + self.merge_options, + model_storage_path=self.model_storage_path, + vllm=self.vllm, + batch_size=self.batch_size, + task_manager=self.task_manager, + ) + for x in genotypes + ] + ) + + def evaluate_genotype(self, genotype: np.ndarray) -> dict: + return self.evaluate_genotypes([genotype])[0] diff --git a/mergekit/mergekit/graph.py b/mergekit/mergekit/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..c81cb85b426209aaa8804fd0bc30556302045ec2 --- /dev/null +++ b/mergekit/mergekit/graph.py @@ -0,0 +1,272 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. +""" +Module for computational graph execution. + +Classes: + Task: Abstract base class representing a computational task. + Executor: Class for scheduling and executing directed acyclic task graphs. 
+""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union + +import networkx +import torch +import tqdm +from pydantic import BaseModel +from typing_extensions import Generic, TypeVar + +ValueT = TypeVar("ValueT") + + +class Task(ABC, BaseModel, Generic[ValueT], frozen=True): + """ + Abstract base class representing a task in a computational graph. + + This class should be extended to define specific tasks. Each task can have arguments (dependencies) and a defined execution strategy. + + Attributes: + Generic[ValueT] (TypeVar): The type of the value that the task returns upon execution. + + Methods: + arguments: Abstract method to define task arguments (dependencies). + execute: Abstract method to execute the task. + priority: Returns the priority of the task for scheduling purposes. + group_label: Returns an optional label for task grouping. + """ + + @abstractmethod + def arguments(self) -> Dict[str, "Task"]: + """ + Returns a dictionary of arguments required for this task. The keys of the dictionary + are argument names, and the values are Task instances. These keys correspond to the + keyword argument names expected by the execute method. + + For example, if this method returns {'input1': taskA, 'input2': taskB}, the execute + method should expect to be called as execute(input1=valueA, input2=valueB), where + valueA and valueB are the outputs of taskA and taskB respectively. + + Returns: + Dict[str, "Task"]: A dictionary mapping argument names to Task instances. + """ + ... + + @abstractmethod + def execute(self, **kwargs) -> ValueT: + """ + Executes the task using the results of its dependencies. + + The keyword arguments (**kwargs) for this method are dynamically determined based on + the dictionary returned by the 'arguments' method. Each key in the 'arguments' method's + return dictionary becomes a keyword argument in this method, with its value being + the result of the corresponding task's execution. + + Returns: + ValueT: The result of the task execution. + """ + ... + + def priority(self) -> int: + """ + Returns the priority of the task for scheduling. + + Higher numbers indicate higher priority. Default is 0. + + Returns: + int: The priority of the task. + """ + return 0 + + def group_label(self) -> Optional[str]: + """ + Returns an optional label used for grouping tasks together. + + Returns: + Optional[str]: The group label of the task, if any. + """ + return None + + def uses_accelerator(self) -> bool: + """ + Returns True if the task can take advantage of matrix operation + acceleration (such as on a GPU). + """ + return False + + +class Executor: + """ + Schedules and executes a set of tasks and their dependencies. + + Handles scheduling, execution, the movement of data between devices, and the lifecycle of intermediate results. + + Attributes: + math_device (torch.device): Device used for tensor computations. + storage_device (torch.device): Device used for storing intermediate results. + targets (List[Task]): List of target tasks to be executed. + schedule (List[Task]): Calculated execution schedule of tasks. + dependencies (Dict[Task, Set[Task]]): Dependencies of each task. 
+ """ + + math_device: torch.device + storage_device: torch.device + targets: List[Task] + schedule: List[Task] + dependencies: Dict[Task, Set[Task]] + + def __init__( + self, + tasks: List[Task], + math_device: torch.device = torch.device("cpu"), + storage_device: torch.device = torch.device("cpu"), + ): + """ + Initializes the Executor with a list of tasks and device configurations. + + Args: + tasks (List[Task]): The list of tasks to be executed. + math_device (torch.device, optional): The device for tensor computations. Defaults to CPU. + storage_device (torch.device, optional): The device for storing results. Defaults to CPU. + """ + self.math_device = math_device + self.storage_device = storage_device + self.schedule = self._make_schedule(tasks) + self.targets = tasks + + def run(self, quiet: bool = False) -> Iterator[Tuple[Task, Any]]: + """ + Execute the computed schedule and yield the target values. + + Yields: + Iterator[Tuple[Task, Any]]: An iterator of task-result pairs. + """ + # determine last usage of each value, so they can be evicted afterwards + last_use_index = {} + for idx, task in reversed(list(enumerate(self.schedule))): + for t in self.dependencies[task]: + if t not in last_use_index: + last_use_index[t] = idx + if task not in last_use_index: + last_use_index[task] = idx + + values: Dict[Task, Any] = {} + for idx, task in ( + pbar := tqdm.tqdm( + list(enumerate(self.schedule)), + disable=quiet, + desc="Executing graph", + ) + ): + use_math_device = task.uses_accelerator() + + arguments = {} + for name, dep in task.arguments().items(): + value = values[dep] + + # ensure any input tensors are on math device if task asks for it + if use_math_device: + if ( + isinstance(value, torch.Tensor) + and value.device != self.math_device + ): + value = value.to(self.math_device) + elif isinstance(value, dict): + for key in value: + if ( + isinstance(value[key], torch.Tensor) + and value[key].device != self.math_device + ): + value[key] = value[key].to(self.math_device) + + arguments[name] = value + del value + + res = task.execute(**arguments) + del arguments + + if isinstance(res, torch.Tensor) and res.device != self.storage_device: + res = res.to(self.storage_device) + + values[task] = res + del res + + if task in self.targets: + yield (task, values[task]) + + # evict unreferenced values + expired = [] + for key in values: + if idx >= last_use_index[key]: + expired.append(key) + + for key in expired: + del values[key] + + del values + del pbar + + def execute(self) -> None: + """ + Execute all tasks and discard results. + """ + for task, value in self.run(): + pass + + DUMMY_TASK_VALUE = "!!DUMMY!!" 
+ + def _make_schedule(self, targets: List[Task]) -> List[Task]: + self.schedule = [] + self.dependencies = self._build_dependencies(targets) + + edge_tups = [] + for node in self.dependencies: + for dependency in self.dependencies[node]: + edge_tups.append((dependency, node)) + + for task in targets: + # add edges from a dummy node to each target to guarantee + # they will be included in the final schedule + edge_tups.append((Executor.DUMMY_TASK_VALUE, task)) + + def _compare_key(task: Union[Task, str]): + if task == Executor.DUMMY_TASK_VALUE: + return ("", 0) + return ( + task.group_label() or "", + -task.priority(), + ) + + graph = networkx.DiGraph(edge_tups) + res = [ + t + for t in networkx.lexicographical_topological_sort(graph, key=_compare_key) + if t != Executor.DUMMY_TASK_VALUE + ] + return res + + def _build_dependencies(self, targets: List[Task]) -> Dict[Task, Set[Task]]: + task_dependencies: Dict[Task, Set[Task]] = {} + to_process = list(targets) + while to_process: + child = to_process.pop() + if child in task_dependencies: + continue + + task_dependencies[child] = set() + for _, dep in child.arguments().items(): + task_dependencies[child].add(dep) + to_process.append(dep) + return task_dependencies diff --git a/mergekit/mergekit/io/__init__.py b/mergekit/mergekit/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59e8eea92aa2e31572321c2ba2357b49f2be30cb --- /dev/null +++ b/mergekit/mergekit/io/__init__.py @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
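The `Task`/`Executor` pair added above in `mergekit/graph.py` is the scheduling core the rest of this diff builds on. Below is a minimal sketch of driving it directly; the two toy task classes are hypothetical and exist only for illustration.

```python
from typing import Dict

import torch

from mergekit.graph import Executor, Task


class LoadConstant(Task[torch.Tensor]):
    # Toy leaf task: no dependencies, just materializes a constant tensor.
    value: float

    def arguments(self) -> Dict[str, Task]:
        return {}

    def execute(self) -> torch.Tensor:
        return torch.full((2, 2), self.value)


class AddTensors(Task[torch.Tensor]):
    # Toy combining task: depends on two other tasks and sums their outputs.
    a: Task
    b: Task

    def arguments(self) -> Dict[str, Task]:
        return {"x": self.a, "y": self.b}

    def execute(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return x + y


total = AddTensors(a=LoadConstant(value=1.0), b=LoadConstant(value=2.0))
for task, value in Executor([total]).run():
    print(type(task).__name__, value)
```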
+ +from mergekit.io.lazy_tensor_loader import ( + LazyTensorLoader, + ShardedTensorIndex, + ShardInfo, +) +from mergekit.io.tensor_writer import TensorWriter + +__all__ = [ + "LazyTensorLoader", + "ShardedTensorIndex", + "ShardInfo", + "TensorWriter", +] diff --git a/mergekit/mergekit/io/__pycache__/__init__.cpython-310.pyc b/mergekit/mergekit/io/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51c4e2ecde8bba61a060567b77d6bb71304dc2bb Binary files /dev/null and b/mergekit/mergekit/io/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/io/__pycache__/lazy_tensor_loader.cpython-310.pyc b/mergekit/mergekit/io/__pycache__/lazy_tensor_loader.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6839447c82e8e299c8229df31f61b708ed404586 Binary files /dev/null and b/mergekit/mergekit/io/__pycache__/lazy_tensor_loader.cpython-310.pyc differ diff --git a/mergekit/mergekit/io/__pycache__/lazy_unpickle.cpython-310.pyc b/mergekit/mergekit/io/__pycache__/lazy_unpickle.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c957e1ffda9e6a773cb7d26167ed349681ec7536 Binary files /dev/null and b/mergekit/mergekit/io/__pycache__/lazy_unpickle.cpython-310.pyc differ diff --git a/mergekit/mergekit/io/__pycache__/loader.cpython-310.pyc b/mergekit/mergekit/io/__pycache__/loader.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29f2ab9265e34ea6128883e36f6c7c5e333954c8 Binary files /dev/null and b/mergekit/mergekit/io/__pycache__/loader.cpython-310.pyc differ diff --git a/mergekit/mergekit/io/__pycache__/tasks.cpython-310.pyc b/mergekit/mergekit/io/__pycache__/tasks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc4f7b93549f5719ce2bcff8c3c1eeff048677c2 Binary files /dev/null and b/mergekit/mergekit/io/__pycache__/tasks.cpython-310.pyc differ diff --git a/mergekit/mergekit/io/__pycache__/tensor_writer.cpython-310.pyc b/mergekit/mergekit/io/__pycache__/tensor_writer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a20548ad8a4a07401fe64dec4228fdf3dcd599c9 Binary files /dev/null and b/mergekit/mergekit/io/__pycache__/tensor_writer.cpython-310.pyc differ diff --git a/mergekit/mergekit/io/lazy_tensor_loader.py b/mergekit/mergekit/io/lazy_tensor_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..e79c5714bf209be9af4fa5ccf97bc05a9f1cc563 --- /dev/null +++ b/mergekit/mergekit/io/lazy_tensor_loader.py @@ -0,0 +1,149 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
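A hedged sketch of the typical read path through the `LazyTensorLoader` defined in this file; the checkpoint directory and tensor name below are placeholders.

```python
from mergekit.io import LazyTensorLoader

# Placeholder checkpoint directory; it must contain model.safetensors,
# pytorch_model.bin, or a matching *.index.json for a sharded checkpoint.
loader = LazyTensorLoader.from_disk("/path/to/model", lazy_unpickle=True)
weight = loader.get_tensor("model.embed_tokens.weight", device="cpu")
print(weight.shape, weight.dtype)
loader.flush()  # drop the handle to the currently open shard
```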
+ +import json +import logging +import os +import os.path +from dataclasses import dataclass +from typing import Dict, List, Optional + +import safetensors +import safetensors.torch +import torch +from torch import Tensor + +from mergekit.io.loader import TensorLoader + + +@dataclass +class ShardInfo: + filename: str + contained_keys: List[str] + + +@dataclass +class ShardedTensorIndex: + base_path: str + is_safetensors: bool + tensor_paths: Dict[str, str] + shards: List[ShardInfo] + + @classmethod + def from_disk(cls, base_path: str) -> "ShardedTensorIndex": + model_path = None + for model_file_name in [ + "model.safetensors", + "pytorch_model.bin", + ]: + candidate_path = os.path.join(base_path, model_file_name) + if os.path.exists(candidate_path) or os.path.exists( + candidate_path + ".index.json" + ): + model_path = candidate_path + break + + if not model_path: + raise RuntimeError(f"Unable to find model files at {base_path}") + + is_safetensors = model_path.endswith(".safetensors") + tensor_paths = None + shards = [] + + if os.path.exists(model_path + ".index.json"): + # shared model - parse index + with open(model_path + ".index.json", "r") as fd: + weight_map = json.load(fd)["weight_map"] + tensor_paths = weight_map + + shard_names = list(sorted(set(tensor_paths[e] for e in tensor_paths))) + for shard_name in shard_names: + info = ShardInfo( + shard_name, + [key for key in tensor_paths if tensor_paths[key] == shard_name], + ) + shards.append(info) + + elif os.path.exists(model_path): + shard_name = os.path.basename(model_path) + + # get list of tensors contained in single-file checkpoint + if model_path.lower().endswith(".safetensors"): + with safetensors.safe_open(model_path, framework="pt") as st: + tensor_paths = {key: shard_name for key in st.keys()} + else: + # this is ugly but not much else can be done + shard = torch.load(model_path, map_location="meta") + if "state_dict" in shard: + shard = shard["state_dict"] + + tensor_paths = {key: shard_name for key in shard} + + shards.append( + ShardInfo(os.path.basename(model_path), list(tensor_paths.keys())) + ) + + return ShardedTensorIndex( + base_path=base_path, + is_safetensors=is_safetensors, + tensor_paths=tensor_paths, + shards=shards, + ) + + +class LazyTensorLoader: + index: ShardedTensorIndex + current_shard: Optional[TensorLoader] + lazy_unpickle: bool + + def __init__(self, index: ShardedTensorIndex, lazy_unpickle: bool = True): + self.index = index + self.current_shard = None + self.lazy_unpickle = lazy_unpickle + + def get_tensor( + self, key: str, device: str = "cpu", aliases: Optional[List[str]] = None + ) -> Optional[Tensor]: + if aliases and key not in self.index.tensor_paths: + for alias in aliases: + if alias in self.index.tensor_paths: + key = alias + break + + if self.current_shard is None or key not in self.current_shard.keys(): + if key not in self.index.tensor_paths: + raise KeyError(key) + + self.current_shard = None + self.current_keys = None + + shard_file = self.index.tensor_paths[key] + shard_full_path = os.path.join(self.index.base_path, shard_file) + logging.debug(f"Opening shard {shard_full_path}") + self.current_shard = TensorLoader.get( + shard_full_path, use_lazy_unpickle=self.lazy_unpickle, device=device + ) + + return self.current_shard.get_tensor(key).to(device) + + def flush(self): + self.current_shard = None + self.current_keys = None + + @classmethod + def from_disk( + cls, base_path: str, lazy_unpickle: bool = True + ) -> "LazyTensorLoader": + return 
LazyTensorLoader(ShardedTensorIndex.from_disk(base_path), lazy_unpickle) diff --git a/mergekit/mergekit/io/lazy_unpickle.py b/mergekit/mergekit/io/lazy_unpickle.py new file mode 100644 index 0000000000000000000000000000000000000000..a7b0905d3c5cdba79d2293c7a868e3f37d250ac4 --- /dev/null +++ b/mergekit/mergekit/io/lazy_unpickle.py @@ -0,0 +1,200 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import codecs +import collections +import contextlib +import operator +import os +import pickle +import zipfile +from functools import reduce +from typing import Any, Optional, Tuple, Union + +import accelerate +import numpy +import torch +from pydantic import BaseModel, PrivateAttr + +ACCEPTABLE_TYPES = { + ("torch._utils", "_rebuild_tensor_v2"): torch._utils._rebuild_tensor_v2, + ("collections", "OrderedDict"): collections.OrderedDict, + ("numpy.core.multiarray", "scalar"): numpy.core.multiarray.scalar, + ("numpy", "dtype"): numpy.core.multiarray.scalar, + ("_codecs", "encode"): codecs.encode, + **{ + ("torch", name): getattr(torch, name) + for name in [ + "DoubleStorage", + "FloatStorage", + "HalfStorage", + "LongStorage", + "IntStorage", + "ShortStorage", + "CharStorage", + "ByteStorage", + "BoolStorage", + "BFloat16Storage", + ] + }, +} + + +class DeferredLoad(BaseModel, arbitrary_types_allowed=True): + name: str + location: str + dtype: torch.dtype + + # set after construction by rebuild() + file_offset: Optional[int] = None + shape: Optional[Union[torch.Size, Tuple[int, ...]]] = None + stride: Optional[Tuple[int, ...]] = None + + # set arbitrarily in Torch innards + requires_grad: bool = False + _backward_hooks: Any = PrivateAttr(None) + + @staticmethod + def rebuild( + load: "DeferredLoad", + offset: int, + shape: Union[torch.Size, Tuple[int, ...]], + stride: Tuple[int, ...], + ) -> "DeferredLoad": + load.shape = shape + load.stride = stride + load.file_offset = offset * dtype_bytes(load.dtype) + return load + + def execute( + self, + reader: "TorchArchiveReader", + map_location: Any = None, + ) -> torch.Tensor: + total_params = reduce(operator.mul, self.shape) + total_bytes = total_params * dtype_bytes(self.dtype) + + f = reader.open_file(file_name=self.name, offset=self.file_offset) + storage = torch.UntypedStorage.from_buffer( + f.read(total_bytes), "little", dtype=self.dtype + ) + storage = torch.serialization._get_restore_location(map_location)( + storage, self.location + ) + + tensor = torch.tensor([], dtype=self.dtype, device=storage.device) + tensor.set_(storage, 0, self.shape, self.stride) + tensor.requires_grad = self.requires_grad + tensor._backward_hooks = self._backward_hooks + return tensor + + +class LazyTorchUnpickler(pickle.Unpickler): + def find_class(self, module: str, name: str) -> Any: + if (module, name) in ACCEPTABLE_TYPES: + return ACCEPTABLE_TYPES[(module, name)] + raise pickle.UnpicklingError(f"Unsupported type 
{module}.{name}") + + def persistent_load(self, pid: Any) -> Any: + if not isinstance(pid, tuple) or pid[0] != "storage": + raise RuntimeError(f"Unpickling object with unexpected PID: {repr(pid)}") + + storage_type, key, location, _ = pid[1:] + return DeferredLoad(name=key, location=location, dtype=get_dtype(storage_type)) + + +class TorchArchiveReader: + """ + Class for lazily reading (sections of) files from a torch ZIP archive. + + Maintains a handle to the most recently opened file for faster access with + consecutive reads from the same file. + """ + + archive: zipfile.ZipFile + archive_name: str + file_name: Optional[str] = None + file: Optional[zipfile.ZipExtFile] = None + + def __init__(self, path: str): + self.archive = zipfile.ZipFile(path, mode="r") + self.archive_name = os.path.basename(os.path.normpath(path)).split(".")[0] + + def open_file(self, file_name: str, offset: int = 0) -> zipfile.ZipExtFile: + if self.file_name != file_name or ( + self.file is not None and self.file.tell() > offset + ): + if self.file is not None: + self.file.close() + + try: + fd = self.archive.open(f"archive/data/{file_name}", mode="r") + except Exception: + fd = self.archive.open( + f"{self.archive_name}/data/{file_name}", mode="r" + ) + self.file = fd + self.file_name = file_name + + skip_bytes = offset - self.file.tell() + assert skip_bytes >= 0 + self.file.seek(skip_bytes, os.SEEK_CUR) + + return self.file + + +@contextlib.contextmanager +def torch_lazy_load(): + """ + Context manager under which `torch.load` will return a `DeferredLoad` instead + of `torch.Tensor.` + """ + old_unpickler = pickle.Unpickler + old_load = pickle.load + old_rebuild_tensor = torch._utils._rebuild_tensor + try: + + def load_monkeypatch(*args, **kwargs): + return pickle.Unpickler(*args, **kwargs).load() + + pickle.Unpickler = LazyTorchUnpickler + pickle.load = load_monkeypatch + torch._utils._rebuild_tensor = DeferredLoad.rebuild + + with accelerate.init_empty_weights(): + yield + + finally: + torch._utils._rebuild_tensor = old_rebuild_tensor + pickle.Unpickler = old_unpickler + pickle.load = old_load + + +def dtype_bytes(dtype: torch.dtype) -> int: + """Return the number of bytes used to store a single instance of `dtype`.""" + if dtype.is_floating_point: + ti = torch.finfo(dtype) + else: + ti = torch.iinfo(dtype) + return max(1, ti.bits // 8) + + +def get_dtype(storage_type: Any): + if isinstance(storage_type, torch.dtype): + return storage_type + dtype = storage_type.dtype + if not isinstance(dtype, torch.dtype): + dtype = storage_type(0).dtype + return dtype diff --git a/mergekit/mergekit/io/loader.py b/mergekit/mergekit/io/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..b2117b33b62447dfe41acf743ce9ea9fb67d9eb7 --- /dev/null +++ b/mergekit/mergekit/io/loader.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +from abc import ABC, abstractmethod +from typing import Dict, Optional, Sequence + +import safetensors +import torch + +from mergekit.io.lazy_unpickle import DeferredLoad, TorchArchiveReader, torch_lazy_load + + +class TensorLoader(ABC): + """Base class for (potentially lazy) tensor loaders.""" + + @abstractmethod + def get_tensor(self, key: str) -> torch.Tensor: + ... + + @abstractmethod + def keys(self) -> Sequence[str]: + ... + + @classmethod + def get( + cls, + shard_path: str, + use_lazy_unpickle: bool = False, + device: Optional[str] = None, + ) -> "TensorLoader": + if shard_path.lower().endswith(".safetensors"): + # not a subclass of TensorLoader, but exposes same api + return safetensors.safe_open( + shard_path, framework="pt", device=device or "cpu" + ) + elif use_lazy_unpickle: + return LazyPickleLoader(shard_path, device=device) + return DumbPytorchLoader(shard_path, device=device) + + +class LazyPickleLoader(TensorLoader): + """Loader for pytorch files using a custom unpickler and vigorous monkeypatching.""" + + zip_reader: TorchArchiveReader + index: Dict[str, DeferredLoad] + device: Optional[str] = None + + def __init__(self, path: str, device: Optional[str] = None): + self.zip_reader = TorchArchiveReader(path) + self.device = device + with torch_lazy_load(): + self.index = torch.load(path) + + def get_tensor(self, key: str) -> torch.Tensor: + if key not in self.index: + raise KeyError(key) + + return self.index[key].execute(self.zip_reader, map_location=self.device) + + def keys(self) -> Sequence[str]: + return self.index.keys() + + +class DumbPytorchLoader(TensorLoader): + """Naive `torch.load` shard loading.""" + + tensors: Dict[str, torch.Tensor] + + def __init__(self, path: str, device: Optional[str] = None): + self.tensors = torch.load(path, map_location=device, weights_only=True) + + def get_tensor(self, key: str) -> torch.Tensor: + return self.tensors[key] + + def keys(self) -> Sequence[str]: + return self.tensors.keys() diff --git a/mergekit/mergekit/io/tasks.py b/mergekit/mergekit/io/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..499ad4c0eeda5d5ca227c727f7ab025897ac23fb --- /dev/null +++ b/mergekit/mergekit/io/tasks.py @@ -0,0 +1,230 @@ +import os +import re +from typing import Dict, Optional, Tuple + +import torch + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference, dtype_from_name +from mergekit.graph import Task +from mergekit.io.lazy_tensor_loader import LazyTensorLoader +from mergekit.io.tensor_writer import TensorWriter +from mergekit.options import MergeOptions + + +class LoaderCache: + loaders: Dict[ModelReference, LazyTensorLoader] = {} + lora_cache_dir: Optional[str] = None + hf_cache_dir: Optional[str] = None + lazy_unpickle: bool = False + trust_remote_code: bool = False + + # singleton instance + _instance: Optional["LoaderCache"] = None + + def __new__(cls) -> "LoaderCache": + if cls._instance is None: + cls._instance = super(LoaderCache, cls).__new__(cls) + return cls._instance + + def get(self, model: ModelReference) -> LazyTensorLoader: + if model not in self.loaders: + merged = model.merged( + cache_dir=self.lora_cache_dir, trust_remote_code=self.trust_remote_code + ) + self.loaders[model] = merged.lazy_loader( + cache_dir=self.hf_cache_dir, lazy_unpickle=self.lazy_unpickle + ) + return self.loaders[model] + + def flush_all(self): + for loader in self.loaders.values(): + loader.flush() + + def setup(self, options: MergeOptions): + self.lora_cache_dir = 
options.lora_merge_cache + self.hf_cache_dir = options.transformers_cache + self.lazy_unpickle = options.lazy_unpickle + self.trust_remote_code = options.trust_remote_code + + +shard_name_re = re.compile(r"model\-([0-9]+)-of-([0-9]+)") + + +def _normalized_shard_name(path: str) -> int: + name, _ext = os.path.splitext(os.path.basename(path)) + name = name.lower().replace("pytorch_model", "model") + if m := shard_name_re.search(name): + frac = int(m.group(1)) / int(m.group(2)) + name = f"model-{int(frac*100):03d}pct" + return name + + +class LoadTensor(Task[Optional[torch.Tensor]]): + model: ModelReference + tensor: str + dtype: Optional[str] = None + device: Optional[str] = None + optional: bool = False + aliases: Optional[Tuple[str, ...]] = None + tied_names: Optional[Tuple[str, ...]] = None + + def arguments(self) -> Dict[str, Task]: + return {} + + def _resolve_name(self, loader: LazyTensorLoader) -> Optional[str]: + all_names = ( + [self.tensor] + list(self.aliases or []) + list(self.tied_names or []) + ) + for name in all_names: + if name in loader.index.tensor_paths: + return name + return None + + def execute(self) -> Optional[torch.Tensor]: + loader = LoaderCache().get(self.model) + name = self._resolve_name(loader) + if not name: + if not self.optional: + raise RuntimeError( + f"Tensor {self.tensor} required but not present in model {self.model}" + ) + return None + + x = loader.get_tensor(name, device=self.device or "cpu") + if self.dtype and (dtype := dtype_from_name(self.dtype)) != x.dtype: + x = x.to(dtype=dtype) + return x + + def priority(self) -> int: + return -1000 + + def group_label(self) -> Optional[str]: + loader = LoaderCache().get(self.model) + name = self._resolve_name(loader) + # if name: + # shard_path = loader.index.tensor_paths[name] + # return _normalized_shard_name(shard_path) + # return None + return name + + +class GatherTensors(Task[Dict[ModelReference, torch.Tensor]]): + weight_info: ImmutableMap[ModelReference, WeightInfo] + dtype: Optional[str] = None + device: Optional[str] = None + + def arguments(self) -> Dict[str, Task]: + return { + f"{str(model)}:{wi.name}": LoadTensor( + model=model, + tensor=wi.name, + dtype=wi.force_dtype or self.dtype, + device=self.device, + optional=wi.optional, + aliases=wi.aliases, + tied_names=wi.tied_names, + ) + for (model, wi) in self.weight_info.items() + } + + def group_label(self) -> Optional[str]: + return max(t.group_label() or "" for t in self.arguments().values()) + + def priority(self) -> int: + return -10 + + def execute(self, **kwargs) -> Dict[ModelReference, torch.Tensor]: + key2model = { + f"{str(model)}:{wi.name}": model for (model, wi) in self.weight_info.items() + } + return { + key2model[key]: kwargs[key] for key in key2model if kwargs[key] is not None + } + + +class TensorWriterTask(Task[TensorWriter]): + out_path: str + max_shard_size: int + safe_serialization: bool = True + + def arguments(self) -> Dict[str, Task]: + return {} + + def execute(self, **_kwargs) -> TensorWriter: + return TensorWriter( + self.out_path, + max_shard_size=self.max_shard_size, + safe_serialization=self.safe_serialization, + ) + + +class SaveTensor(Task[None]): + tensor_name: str + tensor_task: Task + writer_task: TensorWriterTask + clone: bool + optional: bool = False + dtype: Optional[str] = None + + def arguments(self) -> Dict[str, Task]: + return {"writer": self.writer_task, "tensor": self.tensor_task} + + def priority(self) -> int: + return 1000 + + def group_label(self) -> Optional[str]: + return 
self.tensor_task.group_label() + + def execute(self, writer: TensorWriter, tensor: Optional[torch.Tensor]) -> None: + if tensor is None: + if not self.optional: + raise RuntimeError(f"No value for required tensor {self.tensor_name}") + return + if self.dtype: + tensor = tensor.to(dtype=dtype_from_name(self.dtype)) + writer.save_tensor(name=self.tensor_name, tensor=tensor, clone=self.clone) + + +class FinalizeModel(Task[None]): + tensor_save_tasks: Tuple[Task, ...] + writer_task: TensorWriterTask + + def arguments(self) -> Dict[str, Task]: + return { + "writer": self.writer_task, + **{f"_unused_{idx}": t for idx, t in enumerate(self.tensor_save_tasks)}, + } + + def execute(self, writer: TensorWriter, **kwargs) -> None: + writer.finalize() + + +class BuildStateDict(Task[Dict[str, torch.Tensor]]): + tensors: ImmutableMap[WeightInfo, Task[torch.Tensor]] + + def arguments(self) -> Dict[str, Task]: + return {str(wi): t for wi, t in self.tensors.items()} + + def execute(self, **kwargs) -> Dict[str, torch.Tensor]: + return {str(wi): t for wi, t in self.tensors.items()} + + +class ReturnTensor(Task[torch.Tensor]): + weight_info: WeightInfo + tensor_task: Task[torch.Tensor] + dtype: Optional[str] = None + + def arguments(self) -> Dict[str, Task]: + return {"tensor": self.tensor_task} + + def priority(self) -> int: + return 10000 + + def group_label(self) -> Optional[str]: + return self.tensor_task.group_label() + + def execute(self, tensor: torch.Tensor) -> torch.Tensor: + if self.dtype and (dtype := dtype_from_name(self.dtype)) != tensor.dtype: + tensor = tensor.to(dtype=dtype) + return tensor diff --git a/mergekit/mergekit/io/tensor_writer.py b/mergekit/mergekit/io/tensor_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..9ea58222ec7852963a31514337ab17145b6e147d --- /dev/null +++ b/mergekit/mergekit/io/tensor_writer.py @@ -0,0 +1,162 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
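A hedged sketch of the `TensorWriter` defined in this file, with a placeholder output directory: tensors accumulate in an in-memory shard until it would exceed `max_shard_size` bytes, and `finalize()` renames the shards to the Hugging Face naming scheme and writes the weight-map index.

```python
import torch

from mergekit.io.tensor_writer import TensorWriter

writer = TensorWriter("out/model", safe_serialization=True)  # placeholder output path
writer.save_tensor(
    "model.embed_tokens.weight", torch.zeros(8, 8, dtype=torch.bfloat16)
)
writer.finalize()  # -> model-00001-of-00001.safetensors + model.safetensors.index.json
```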
+ +import json +import logging +import os +from typing import Dict + +import safetensors +import torch + + +class TensorWriter: + out_path: str + max_shard_size: int + shards_written: int + weight_map = Dict[str, str] + current_shard: Dict[str, torch.Tensor] + current_shard_size: int + total_size: int + safe_serialization: bool + + def __init__( + self, + out_path: str, + max_shard_size: int = 1000 * 1000 * 1000 * 5, + safe_serialization: bool = True, + ) -> None: + os.makedirs(out_path, exist_ok=True) + + self.out_path = out_path + self.max_shard_size = max_shard_size + self.safe_serialization = safe_serialization + self.shards_written = 0 + self.weight_map = {} + self.current_shard = {} + self.current_shard_size = 0 + self.total_size = 0 + + def save_tensor(self, name: str, tensor: torch.Tensor, clone: bool = False): + if not tensor.is_contiguous(): + tensor = tensor.contiguous() + + tensor_size = tensor.numel() * tensor.element_size() + if ( + self.current_shard + and self.current_shard_size + tensor_size > self.max_shard_size + ): + self.flush_current_shard() + + if clone: + tensor = tensor.clone() + + self.current_shard[name] = tensor + self.total_size += tensor_size + self.current_shard_size += tensor_size + + def flush_current_shard(self): + if not self.current_shard: + return + + logging.info(f"Writing shard #{self.shards_written+1} to disk") + + prefix, extension = self._get_name_components() + shard_name = f"{prefix}-{self.shards_written+1}.{extension}" + + for key in self.current_shard: + self.weight_map[key] = shard_name + + shard_path = os.path.join(self.out_path, shard_name) + if self.safe_serialization: + self._save_st(shard_path) + else: + torch.save(self.current_shard, shard_path) + + self.current_shard = {} + self.current_shard_size = 0 + self.shards_written = self.shards_written + 1 + + def finalize(self): + self.flush_current_shard() + + logging.info("Finalizing shard names") + + prefix, extension = self._get_name_components() + + # standardize shard names to hf format + total_shards = self.shards_written + name_remap = {} + for idx in range(total_shards): + name_remap[ + f"{prefix}-{idx+1}.{extension}" + ] = f"{prefix}-{idx+1:05d}-of-{total_shards:05d}.{extension}" + + for old_name, new_name in name_remap.items(): + os.rename( + os.path.join(self.out_path, old_name), + os.path.join(self.out_path, new_name), + ) + + for key in self.weight_map: + self.weight_map[key] = name_remap[self.weight_map[key]] + + with open( + os.path.join(self.out_path, f"{prefix}.{extension}.index.json"), + "w", + encoding="utf-8", + ) as file: + json.dump( + { + "metadata": { + "mergekit_version": "0.0.5.2", + "total_size": self.total_size, + }, + "weight_map": self.weight_map, + }, + file, + ) + + def _get_name_components(self): + if self.safe_serialization: + return "model", "safetensors" + return "pytorch_model", "bin" + + def _save_st(self, shard_path: str): + def _do_save(): + safetensors.torch.save_file( + self.current_shard, + shard_path, + metadata={"format": "pt"}, + ) + + try: + _do_save() + except RuntimeError as e: + if ( + len(e.args) > 0 + and isinstance(e.args[0], str) + and "share memory" in e.args[0] + ): + logging.warning( + "Your model has duplicated tensors but the --clone-tensors " + "flag is not set." 
+ ) + self.current_shard = { + key: self.current_shard[key].clone() for key in self.current_shard + } + _do_save() + else: + raise diff --git a/mergekit/mergekit/merge.py b/mergekit/mergekit/merge.py new file mode 100644 index 0000000000000000000000000000000000000000..2d659505b0f2d7cb495d2a06424926631c6b1eb1 --- /dev/null +++ b/mergekit/mergekit/merge.py @@ -0,0 +1,283 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import importlib +import importlib.resources +import logging +import os +import shutil +from collections import Counter +from typing import Optional + +import tqdm +import transformers + +from mergekit._data import chat_templates +from mergekit.architecture import ArchitectureInfo, get_architecture_info +from mergekit.card import generate_card +from mergekit.config import MergeConfiguration +from mergekit.graph import Executor +from mergekit.io.tasks import LoaderCache +from mergekit.options import MergeOptions +from mergekit.plan import MergePlanner +from mergekit.tokenizer import TokenizerInfo + + +def run_merge( + merge_config: MergeConfiguration, + out_path: str, + options: MergeOptions, + config_source: Optional[str] = None, +): + if options.random_seed is not None: + transformers.trainer_utils.set_seed(options.random_seed) + + if not merge_config.models and not merge_config.slices: + raise RuntimeError("No output requested") + + model_arch_info = [ + get_architecture_info(m.config(trust_remote_code=options.trust_remote_code)) + for m in merge_config.referenced_models() + ] + if not options.allow_crimes: + if not all(a == model_arch_info[0] for a in model_arch_info[1:]): + raise RuntimeError( + "Must specify --allow-crimes to attempt to mix different architectures" + ) + arch_info = model_arch_info[0] + + # initialize loader cache and set options + loader_cache = LoaderCache() + loader_cache.setup(options=options) + + # create config for output model + cfg_out = _model_out_config( + merge_config, arch_info, trust_remote_code=options.trust_remote_code + ) + + # warm up loader cache + for model in ( + pbar := tqdm.tqdm( + merge_config.referenced_models(), + desc="Warmup loader cache", + disable=options.quiet, + ) + ): + loader_cache.get(model) + del pbar + + logging.info("Planning operations") + targets = MergePlanner( + merge_config, + arch_info, + options=options, + out_model_config=cfg_out, + ).plan_to_disk(out_path=out_path) + + exec = Executor( + tasks=targets, + math_device="cuda" if options.cuda else "cpu", + storage_device="cuda" if options.low_cpu_memory else "cpu", + ) + + tokenizer = None + for _task, value in exec.run(quiet=options.quiet): + if isinstance(value, TokenizerInfo): + tokenizer = value.tokenizer + + if tokenizer: + pad_to_multiple_of = None + if merge_config.tokenizer and merge_config.tokenizer.pad_to_multiple_of: + pad_to_multiple_of = merge_config.tokenizer.pad_to_multiple_of + 
_update_config_vocab(cfg_out, tokenizer, pad_to_multiple_of=pad_to_multiple_of) + + logging.info("Saving config") + cfg_out.save_pretrained(out_path) + + if options.write_model_card: + if not config_source: + config_source = merge_config.to_yaml() + + card_md = generate_card( + config=merge_config, + config_yaml=config_source, + name=os.path.basename(out_path), + ) + with open(os.path.join(out_path, "README.md"), "w", encoding="utf-8") as fp: + fp.write(card_md) + + with open( + os.path.join(out_path, "mergekit_config.yml"), "w", encoding="utf-8" + ) as fp: + fp.write(config_source) + + if tokenizer is None: + if options.copy_tokenizer: + try: + _copy_tokenizer( + merge_config, out_path, trust_remote_code=options.trust_remote_code + ) + except Exception as e: + logging.error( + "Failed to copy tokenizer. The merge was still successful, just copy it from somewhere else.", + exc_info=e, + ) + elif merge_config.chat_template: + logging.warning( + "Chat template specified but no tokenizer found. Chat template will not be saved." + ) + + if tokenizer: + logging.info("Saving tokenizer") + _set_chat_template(tokenizer, merge_config) + tokenizer.save_pretrained(out_path, safe_serialization=True) + + +def _set_chat_template( + tokenizer: transformers.PreTrainedTokenizerBase, + merge_config: MergeConfiguration, + trust_remote_code: bool = False, +): + chat_template = merge_config.chat_template + if not chat_template: + return + + if chat_template == "auto": + # see if there is a plurality chat template among the input models + model_templates = [] + for model in merge_config.referenced_models(): + try: + tok = transformers.AutoTokenizer.from_pretrained( + model.model.path, + revision=model.model.revision, + trust_remote_code=trust_remote_code, + ) + template = tok.chat_template + if isinstance(template, dict): + template = template.get("default", None) + if template: + model_templates.append(template.strip()) + except Exception as e: + logging.warning(f"Unable to load tokenizer for {model}", exc_info=e) + + if not model_templates: + return + + chat_template = Counter(model_templates).most_common(1)[0][0] + logging.info(f"Auto-selected chat template: {chat_template}") + + elif importlib.resources.is_resource(chat_templates, chat_template + ".jinja"): + with importlib.resources.open_text( + chat_templates, chat_template + ".jinja" + ) as fp: + chat_template = fp.read() + + elif len(chat_template) < 20 or "{" not in chat_template: + raise RuntimeError(f"Invalid chat template: {chat_template}") + + tokenizer.chat_template = chat_template + + +def _copy_tokenizer( + merge_config: MergeConfiguration, out_path: str, trust_remote_code: bool = False +): + donor_model = merge_config.base_model or (merge_config.referenced_models()[0]) + + if ( + (not merge_config.chat_template) + and os.path.exists( + os.path.join(donor_model.model.path, "tokenizer_config.json") + ) + and ( + os.path.exists(os.path.join(donor_model.model.path, "tokenizer.json")) + or os.path.exists(os.path.join(donor_model.model.path, "tokenizer.model")) + ) + ): + logging.info(f"Copying tokenizer from {donor_model}") + + for file_name in [ + "tokenizer_config.json", + "special_tokens_map.json", + "tokenizer.json", + "tokenizer.model", + ]: + if os.path.exists(os.path.join(donor_model.model.path, file_name)): + shutil.copy( + os.path.join(donor_model.model.path, file_name), + os.path.join(out_path, file_name), + ) + + return + + # fallback: try actually loading the tokenizer and saving it + logging.info(f"Reserializing tokenizer from 
{donor_model}") + tokenizer = transformers.AutoTokenizer.from_pretrained( + donor_model.model.path, + revision=donor_model.model.revision, + trust_remote_code=trust_remote_code, + ) + _set_chat_template(tokenizer, merge_config) + tokenizer.save_pretrained(out_path, safe_serialization=True) + + +def _model_out_config( + config: MergeConfiguration, + arch_info: ArchitectureInfo, + trust_remote_code: bool = False, +) -> transformers.PretrainedConfig: + """Return a configuration for the resulting model.""" + if config.base_model: + res = config.base_model.config(trust_remote_code=trust_remote_code) + else: + res = config.referenced_models()[0].config(trust_remote_code=trust_remote_code) + if config.out_dtype: + res.torch_dtype = config.out_dtype + elif config.dtype: + res.torch_dtype = config.dtype + + if config.slices: + try: + num_layers = sum( + s.sources[0].layer_range[1] - s.sources[0].layer_range[0] + for s in config.slices + ) + setattr(res, arch_info.num_layers_config_key(), num_layers) + except Exception as e: + logging.warning( + "Unable to set number of layers in output config - you may need to manually correct it.", + exc_info=e, + ) + + return res + + +def _update_config_vocab( + config: transformers.PretrainedConfig, + tokenizer: transformers.PreTrainedTokenizerBase, + pad_to_multiple_of: Optional[int] = None, +): + vocab_size = len(tokenizer.get_vocab()) + if pad_to_multiple_of and vocab_size % pad_to_multiple_of: + vocab_size = vocab_size + pad_to_multiple_of - (vocab_size % pad_to_multiple_of) + try: + config.vocab_size = vocab_size + except Exception as e: + logging.warning( + "Unable to set vocabulary size in output config - you may need to manually correct it.", + exc_info=e, + ) + + +__all__ = ["MergeOptions", "run_merge"] diff --git a/mergekit/mergekit/merge_methods/__init__.py b/mergekit/mergekit/merge_methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4b5b29f563b81493b71fa4e4738d75592ff412c --- /dev/null +++ b/mergekit/mergekit/merge_methods/__init__.py @@ -0,0 +1,129 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +from mergekit.merge_methods.base import MergeMethod +from mergekit.merge_methods.generalized_task_arithmetic import ( + ConsensusMethod, + GeneralizedTaskArithmeticMerge, + SparsificationMethod, +) +from mergekit.merge_methods.linear import LinearMerge +from mergekit.merge_methods.model_stock import ModelStockMerge +from mergekit.merge_methods.nearswap import NearSwapMerge +from mergekit.merge_methods.nuslerp import NuSlerpMerge +from mergekit.merge_methods.passthrough import PassthroughMerge +from mergekit.merge_methods.slerp import SlerpMerge +from mergekit.merge_methods.tokenizer_permute import TokenizerPermutationMerge + + +def get(method: str) -> MergeMethod: + if method == "linear": + return LinearMerge() + elif method == "slerp": + return SlerpMerge() + elif method == "nearswap": + return NearSwapMerge() + elif method == "nuslerp": + return NuSlerpMerge() + elif method == "passthrough": + return PassthroughMerge() + elif method == "task_arithmetic": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=None, + default_normalize=False, + default_rescale=False, + ) + elif method == "ties": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.magnitude, + default_normalize=True, + default_rescale=False, + ) + elif method == "dare_ties": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.random, + default_normalize=False, + default_rescale=True, + ) + elif method == "dare_linear": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=SparsificationMethod.random, + default_normalize=False, + default_rescale=True, + ) + elif method == "breadcrumbs": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=SparsificationMethod.magnitude_outliers, + default_normalize=False, + default_rescale=False, + ) + elif method == "breadcrumbs_ties": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.magnitude_outliers, + default_normalize=False, + default_rescale=False, + ) + elif method == "model_stock": + return ModelStockMerge() + + elif method == "della": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.rank_magnitude_sampling, + default_normalize=True, + default_rescale=True, + ) + + elif method == "della_linear": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=SparsificationMethod.rank_magnitude_sampling, + default_normalize=False, + default_rescale=True, + ) + + elif method == "consensus_ta": + return GeneralizedTaskArithmeticMerge( + consensus_method=None, + sparsification_method=SparsificationMethod.consensus_ta, + default_normalize=False, + default_rescale=False, + ) + + elif method == "consensus_ties": + return GeneralizedTaskArithmeticMerge( + consensus_method=ConsensusMethod.sum, + sparsification_method=SparsificationMethod.consensus_ties, + default_normalize=True, + default_rescale=False, + ) + raise RuntimeError(f"Unimplemented merge method {method}") + + +__all__ = [ + "MergeMethod", + "get", + "LinearMerge", + "SlerpMerge", + "PassthroughMerge", + "GeneralizedTaskArithmeticMerge", + "TokenizerPermutationMerge", +] diff --git a/mergekit/mergekit/merge_methods/__pycache__/__init__.cpython-310.pyc 
b/mergekit/mergekit/merge_methods/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e0408c6bd7d1a0ab7be047e4a8b5478ce883223 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/base.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7b24a7084cd5a089e90a1bb32501f6530e7a8c9 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/base.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/generalized_task_arithmetic.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/generalized_task_arithmetic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b01ab2de8ef16200945db86b40ec5bec70d9ef3 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/generalized_task_arithmetic.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/linear.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/linear.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88d06ac2c6371fce7615554c8b7716dd6d2b5b8f Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/linear.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/model_stock.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/model_stock.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ca4782203b62ff6a7b2b796b751e96fa27d1a14 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/model_stock.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/nearswap.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/nearswap.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3c40bf6e77134a084557592025f6c5c9b977dc8 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/nearswap.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/nuslerp.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/nuslerp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2d37e3b7515e7da07c9ed3ab9de9f5fc4c02aee Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/nuslerp.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/passthrough.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/passthrough.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9ac9c94ed732364a85ea795aa7380fee9057204 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/passthrough.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/rectify_embed.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/rectify_embed.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f3494c2341bbd3be36b7049f9b2ccd8dd9fe222 Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/rectify_embed.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/slerp.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/slerp.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..b6a9b9fffcdaca6d1708c5329b86ca7254979f6f Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/slerp.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/__pycache__/tokenizer_permute.cpython-310.pyc b/mergekit/mergekit/merge_methods/__pycache__/tokenizer_permute.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeb411a747932be0441dbf53098f2448b095ec4b Binary files /dev/null and b/mergekit/mergekit/merge_methods/__pycache__/tokenizer_permute.cpython-310.pyc differ diff --git a/mergekit/mergekit/merge_methods/base.py b/mergekit/mergekit/merge_methods/base.py new file mode 100644 index 0000000000000000000000000000000000000000..917ed08952bb959fb902ae45b9761121424d7279 --- /dev/null +++ b/mergekit/mergekit/merge_methods/base.py @@ -0,0 +1,54 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from abc import ABC, abstractmethod +from typing import Any, List, Optional, Union + +from pydantic import BaseModel +from typing_extensions import TypeAlias + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.io.tasks import GatherTensors +from mergekit.tokenizer import PermutedEmbeddings + +MergeTensorInput: TypeAlias = Union[GatherTensors, PermutedEmbeddings] + + +class ConfigParameterDef(BaseModel): + name: str + required: bool = False + default_value: Any = None + + +class MergeMethod(ABC): + def tensor_parameters(self) -> List[ConfigParameterDef]: + return [] + + def parameters(self) -> List[ConfigParameterDef]: + return [] + + @abstractmethod + def make_task( + self, + *, + output_weight: WeightInfo, + tensors: MergeTensorInput, + parameters: ImmutableMap[str, Any], + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], + base_model: Optional[ModelReference], + ) -> Task: + ... diff --git a/mergekit/mergekit/merge_methods/generalized_task_arithmetic.py b/mergekit/mergekit/merge_methods/generalized_task_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..0bb3f0c7547b4bc8ca9cb1cfc7e7103a9f63a056 --- /dev/null +++ b/mergekit/mergekit/merge_methods/generalized_task_arithmetic.py @@ -0,0 +1,293 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple + +import torch +from pydantic import BaseModel +from typing_extensions import Literal + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.sparsify import SparsificationMethod, get_tall_mask, sparsify + + +class ConsensusMethod(str, Enum): + count = "count" + sum = "sum" + + +class GeneralizedTaskArithmeticMerge(MergeMethod, BaseModel, frozen=True): + consensus_method: Optional[ConsensusMethod] + sparsification_method: Optional[SparsificationMethod] + default_normalize: bool + default_rescale: bool + + def parameters(self) -> List[ConfigParameterDef]: + return [ + ConfigParameterDef(name="int8_mask", required=False, default_value=False), + ConfigParameterDef( + name="normalize", required=False, default_value=self.default_normalize + ), + ConfigParameterDef( + name="rescale", required=False, default_value=self.default_rescale + ), + ] + + def tensor_parameters(self) -> List[ConfigParameterDef]: + res = [ + ConfigParameterDef(name="weight", required=True), + ConfigParameterDef(name="density", required=False, default_value=1.0), + ] + if self.sparsification_method == SparsificationMethod.magnitude_outliers: + res.append( + ConfigParameterDef( + name="gamma", + default_value=0.01, + ) + ) + if self.sparsification_method == SparsificationMethod.rank_magnitude_sampling: + res.append( + ConfigParameterDef( + name="epsilon", + default_value=0.15, + ) + ) + res.append( + ConfigParameterDef( + name="lambda", + default_value=1.0, + ) + ) + if ( + self.sparsification_method == SparsificationMethod.consensus_ta + or self.sparsification_method == SparsificationMethod.consensus_ties + ): + res.append( + ConfigParameterDef( + name="k", + default_value=1, + ) + ) + res.append( + ConfigParameterDef( + name="lambda", + default_value=1.0, + ) + ) + return res + + def make_task( + self, + output_weight: WeightInfo, + tensors: MergeTensorInput, + base_model: Optional[ModelReference], + parameters: ImmutableMap[str, Any], + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], + ) -> Task: + return GTATask( + method=self, + tensors=tensors, + base_model=base_model, + tensor_parameters=tensor_parameters, + int8_mask=parameters["int8_mask"], + normalize=parameters["normalize"], + rescale=parameters["rescale"], + weight_info=output_weight, + ) + + +class GTATask(Task[torch.Tensor]): + method: GeneralizedTaskArithmeticMerge + tensors: MergeTensorInput + base_model: ModelReference + weight_info: WeightInfo + tensor_parameters: ImmutableMap[ModelReference, Any] + int8_mask: bool + normalize: bool + rescale: bool + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.tensors} + + def execute( + self, + tensors: Dict[ModelReference, torch.Tensor], + **_kwargs, + ) -> torch.Tensor: + # collect task vectors + tvs, base = get_task_vectors( + self.weight_info, + self.base_model, + tensors, + tensor_parameters=self.tensor_parameters.data, + ) + if not tvs: + return base + + # sparsify + if ( + self.method.sparsification_method + and self.method.sparsification_method != 
SparsificationMethod.consensus_ta + ): + for tv_info in tvs: + kwargs = {} + if "gamma" in tv_info: + kwargs["gamma"] = tv_info["gamma"] + + if "epsilon" in tv_info: + kwargs["epsilon"] = tv_info["epsilon"] + + tv_info["sparsified_delta"] = sparsify( + tv_info["delta"], + density=tv_info["density"], + method=self.method.sparsification_method, + rescale=self.rescale, + **kwargs, + ) + + deltas = torch.stack([tv["sparsified_delta"] for tv in tvs], dim=0) + else: + deltas = torch.stack([tv["delta"] for tv in tvs], dim=0) + weights = torch.tensor( + [tv["weight"] for tv in tvs], dtype=deltas.dtype, device=deltas.device + ) + while len(deltas.shape) > len(weights.shape): + weights.unsqueeze_(-1) + + weighted_deltas = deltas * weights + + # get sign consensus and mix deltas + if self.method.consensus_method: + mask_dtype = torch.int8 if self.int8_mask else base.dtype + mask = get_mask( + weighted_deltas, + method=self.method.consensus_method, + mask_dtype=mask_dtype, + ) + mixed_delta = (weighted_deltas * mask).sum(dim=0) + divisor = (weights * mask).sum(dim=0) + divisor[divisor == 0] = 1 + else: + mixed_delta = weighted_deltas.sum(dim=0) + divisor = weights.sum(dim=0) + divisor[divisor.abs() < 1e-8] = 1 + + if self.normalize: + mixed_delta /= divisor + + if ( + self.method.sparsification_method + == SparsificationMethod.rank_magnitude_sampling + ): + lambda_factor = tvs[0]["lambda"] + mixed_delta *= lambda_factor + + if ( + self.method.sparsification_method == SparsificationMethod.consensus_ta + or self.method.sparsification_method == SparsificationMethod.consensus_ties + ): + for tv_info in tvs: + tv_info["tall_mask"] = get_tall_mask( + tv_info["delta"], + tv_info["lambda"], + mixed_delta, + ) + tall_masks = torch.stack([tv["tall_mask"] for tv in tvs], dim=0) + consensus_mask = tall_masks.sum(dim=0) >= tvs[0]["k"] + mixed_delta = mixed_delta * consensus_mask + + return (base + mixed_delta).to(base.dtype) + + def group_label(self) -> Optional[str]: + return self.tensors.group_label() + + +def get_task_vectors( + weight_info: WeightInfo, + base_model: ModelReference, + tensors: ImmutableMap[ModelReference, torch.Tensor], + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], +) -> Tuple[List[Dict[str, Any]], torch.Tensor]: + keys = list(tensors.keys()) + base = tensors[base_model] + + parameter_name = weight_info.name + + res = [] + for model in keys: + if model == base_model: + continue + + x = tensors[model].to(base.dtype) + if x.shape != base.shape: + if weight_info.is_embed: + x = x[: base.shape[0], : base.shape[1]] + logging.warning(f"Using submatrix of {model}:{parameter_name}") + else: + logging.warning( + f"skipping {model}:{parameter_name} due to size mismatch" + ) + continue + + delta = x - base + del x + del tensors[model] + + d = {} + d["model"] = model + d["delta"] = delta + for p in tensor_parameters[model]: + d[p] = tensor_parameters[model][p] + res.append(d) + return res, base + + +def get_mask( + delta: torch.Tensor, + method: Literal["sum", "count"] = "sum", + mask_dtype: Optional[torch.dtype] = None, +): + """Returns a mask determining which delta vectors should be merged + into the final model. + + For the methodology described in the TIES paper use 'sum'. 
For a + simpler naive count of signs, use 'count'.""" + if mask_dtype is None: + mask_dtype = delta.dtype + + sign = delta.sign().to(mask_dtype) + + if method == "sum": + sign_weight = delta.sum(dim=0) + majority_sign = (sign_weight >= 0).to(mask_dtype) * 2 - 1 + del sign_weight + elif method == "count": + majority_sign = (sign.sum(dim=0) >= 0).to(mask_dtype) * 2 - 1 + else: + raise RuntimeError(f'Unimplemented mask method "{method}"') + + return sign == majority_sign diff --git a/mergekit/mergekit/merge_methods/linear.py b/mergekit/mergekit/merge_methods/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..48224bb8b87baa820f443ae96acbe65b015af7f4 --- /dev/null +++ b/mergekit/mergekit/merge_methods/linear.py @@ -0,0 +1,97 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import Any, Dict, List, Optional + +import torch + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.merge_methods.rectify_embed import rectify_embed_sizes + + +class LinearMergeTask(Task[torch.Tensor]): + gather_tensors: MergeTensorInput + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]] + normalize: bool + weight_info: WeightInfo + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.gather_tensors} + + def execute( + self, tensors: Dict[ModelReference, torch.Tensor], **_kwargs + ) -> torch.Tensor: + keys = list(tensors.keys()) + + tensors = [tensors[key] for key in keys] + weights = [self.tensor_parameters[key]["weight"] for key in keys] + + rectify_embed_sizes(self.weight_info, tensors) + + unique_shapes = set(t.shape for t in tensors) + if len(unique_shapes) != 1: + raise RuntimeError( + f"Tensor size mismatch for {self.weight_info.name}, sizes: {list(unique_shapes)}" + ) + + tensors = torch.stack(tensors, dim=0) + weights = torch.tensor(weights, dtype=tensors.dtype, device=tensors.device) + while len(weights.shape) < len(tensors.shape): + weights.unsqueeze_(-1) + + res = (weights * tensors).sum(dim=0) + if self.normalize: + res = res / weights.sum(dim=0) + + return res + + def group_label(self) -> Optional[str]: + return self.gather_tensors.group_label() + + +class LinearMerge(MergeMethod): + def parameters(self) -> List[ConfigParameterDef]: + return [ + ConfigParameterDef(name="normalize", required=False, default_value=True), + ] + + def tensor_parameters(self) -> List[ConfigParameterDef]: + return [ConfigParameterDef(name="weight", required=True)] + + def make_task( + self, + *, + output_weight: WeightInfo, + tensors: MergeTensorInput, + parameters: Dict[str, Any], + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, 
Any]], + **_kwargs, + ) -> Task: + return LinearMergeTask( + gather_tensors=tensors, + tensor_parameters=tensor_parameters, + normalize=parameters["normalize"], + weight_info=output_weight, + ) diff --git a/mergekit/mergekit/merge_methods/model_stock.py b/mergekit/mergekit/merge_methods/model_stock.py new file mode 100644 index 0000000000000000000000000000000000000000..94b1e05b83b5b793c1205da7adad697dded094cb --- /dev/null +++ b/mergekit/mergekit/merge_methods/model_stock.py @@ -0,0 +1,136 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +from typing import Any, Dict, List, Optional + +import torch + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.merge_methods.rectify_embed import rectify_embed_sizes + + +class ModelStockMergeTask(Task[torch.Tensor]): + gather_tensors: MergeTensorInput + base_model: ModelReference + weight_info: WeightInfo + filter_wise: bool = False + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.gather_tensors} + + def execute(self, tensors: Dict[ModelReference, torch.Tensor]) -> torch.Tensor: + if len(tensors) == 1 and self.base_model in tensors: + return tensors[self.base_model] + if len(tensors) < 3: + if self.weight_info.optional: + logging.warning( + f"Optional weight {self.weight_info.name} not present in enough models, discarding" + ) + return None + + raise ValueError( + "ModelStockMerge requires at least 3 models (base plus two+ others)" + ) + + w_0, ws = self.get_rectified_weights(tensors) + out_shape = w_0.shape + + if self.filter_wise: + if w_0.dim() == 1: + # bias (or other single-vector) parameters should be treated as row vectors + w_0 = w_0.unsqueeze(0) + ws = [w.unsqueeze(0) for w in ws] + else: + w_0 = w_0.view(-1) + ws = [w.view(-1) for w in ws] + + offsets = [w - w_0 for w in ws] + + # now there is a question of how to come up with a value for theta. + # in the two-vector case, we can get an exact angle between the two vectors + # but the paper doesn't explicitly say what to do in the multi-vector case - + # they keep using a singular theta value and don't elaborate on how to + # calculate it. i'm going to assume an average of pairwise angles for now? i guess? 
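+        # In symbols (summarizing the code below): with offsets d_i = w_i - w_0,
+        #   cos_theta = mean over pairs (i, j) of <d_i, d_j> / (|d_i| |d_j|)
+        #   t = N * cos_theta / (1 + (N - 1) * cos_theta)
+        #   w_h = t * mean(w_i) + (1 - t) * w_0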
+ + cos_thetas = [] + for i, w_0_offset in enumerate(offsets): + for j in range(i + 1, len(offsets)): + w_1_offset = offsets[j] + + norm_product = torch.norm(w_0_offset, dim=-1) * torch.norm( + w_1_offset, dim=-1 + ) + cos_theta = ( + (w_0_offset * w_1_offset).sum(dim=-1) / norm_product.clamp(min=1e-6) + ).clamp(-1, 1) + cos_thetas.append(cos_theta) + + cos_theta = torch.stack(cos_thetas).mean(dim=0).unsqueeze(-1) + N = len(ws) + t = (N * cos_theta) / (1 + (N - 1) * cos_theta) + + w_avg = sum(ws) / len(ws) + w_h = t * w_avg + (1 - t) * w_0 + + return w_h.reshape(out_shape) + + def get_rectified_weights(self, tensors: Dict[ModelReference, torch.Tensor]): + if self.base_model not in tensors: + raise ValueError("Base model tensor not found") + + all_weights = [tensors[self.base_model]] + [ + tensors[k] for k in tensors if k != self.base_model + ] + rectify_embed_sizes(self.weight_info, all_weights) + w_0 = all_weights[0] + ws = all_weights[1:] + return w_0, ws + + def group_label(self) -> Optional[str]: + return self.gather_tensors.group_label() + + +class ModelStockMerge(MergeMethod): + def parameters(self) -> List[ConfigParameterDef]: + return [ + ConfigParameterDef(name="filter_wise", required=False, default_value=False) + ] + + def make_task( + self, + *, + output_weight: WeightInfo, + tensors: MergeTensorInput, + base_model: Optional[ModelReference], + parameters: ImmutableMap[str, Any], + **_kwargs, + ) -> Task: + return ModelStockMergeTask( + gather_tensors=tensors, + base_model=base_model, + weight_info=output_weight, + filter_wise=parameters["filter_wise"], + ) diff --git a/mergekit/mergekit/merge_methods/nearswap.py b/mergekit/mergekit/merge_methods/nearswap.py new file mode 100644 index 0000000000000000000000000000000000000000..10371b5ee45f31e4756d3a333a4cf0c928639679 --- /dev/null +++ b/mergekit/mergekit/merge_methods/nearswap.py @@ -0,0 +1,117 @@ +# Copyright (C) 2025 Charles Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
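+# Sketch of the element-wise rule implemented by nearswap() further down:
+#   lweight = clamp(t / |v0 - v1|, 0, 1)
+#   result  = lweight * v1 + (1 - lweight) * v0
+# so parameters where the base and secondary model already agree to within t are
+# taken from the secondary model, while strongly differing parameters stay near
+# the base model's values.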
+ +from typing import Any, Dict, List, Optional, Union + +import torch + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.merge_methods.rectify_embed import rectify_embed_sizes + + +class NearSwapTask(Task[torch.Tensor]): + gather_tensors: MergeTensorInput + base_model: ModelReference + t: float + weight_info: WeightInfo + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.gather_tensors} + + def execute(self, tensors: Dict[ModelReference, torch.Tensor]) -> torch.Tensor: + if self.t <= 0: + raise RuntimeError(f"Threshold cannot be <= zero, got {self.t}") + if len(tensors) == 1: + return list(tensors.values())[0] + elif len(tensors) != 2: + raise RuntimeError( + f"Nearswap merge expects exactly two models, got {len(tensors)}" + ) + elif self.base_model not in tensors: + raise RuntimeError("Base model not in input tensors") + + [a, b] = list(tensors.items()) + if a[0] != self.base_model: + [a, b] = [b, a] + prepped_tensors = [a[1], b[1]] + + rectify_embed_sizes(self.weight_info, prepped_tensors) + + return ( + nearswap( + self.t, + prepped_tensors[0], + prepped_tensors[1], + ) + .to(prepped_tensors[0].dtype) + .to(prepped_tensors[0].device) + ) + + +class NearSwapMerge(MergeMethod): + def parameters(self) -> List[ConfigParameterDef]: + return [ConfigParameterDef(name="t", required=True)] + + def make_task( + self, + *, + output_weight: WeightInfo, + tensors: MergeTensorInput, + parameters: ImmutableMap[str, Any], + base_model: Optional[ModelReference], + **_kwargs, + ) -> Task: + return NearSwapTask( + gather_tensors=tensors, + base_model=base_model, + weight_info=output_weight, + t=parameters["t"], + ) + + +def nearswap(t: float, v0: torch.Tensor, v1: torch.Tensor) -> torch.Tensor: + """ + NearSwap implementation using PyTorch. + + Adapted from: https://huggingface.co/alchemonaut/QuartetAnemoi-70B-t0.0001 + + Parameters: + t (float): The sameness threshold. + v0 (torch.Tensor): Weights from the base model. + v1 (torch.Tensor): Weights from the secondary model. + + Returns: + torch.Tensor: Resulting interpolated weights. + """ + # Compute the absolute difference + lweight = torch.abs(v0 - v1) + + # Compute the interpolation factor + lweight = t / lweight + lweight = torch.nan_to_num(lweight, nan=1.0, posinf=1.0, neginf=1.0) + lweight = torch.clamp(lweight, min=0.0, max=1.0) + + # Linearly interpolate between v0 and v1 + return lweight * v1 + (1 - lweight) * v0 diff --git a/mergekit/mergekit/merge_methods/nuslerp.py b/mergekit/mergekit/merge_methods/nuslerp.py new file mode 100644 index 0000000000000000000000000000000000000000..a67f2f22d3535adb2b3365d9a7df3a311e91a746 --- /dev/null +++ b/mergekit/mergekit/merge_methods/nuslerp.py @@ -0,0 +1,171 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import Any, Dict, List, Optional + +import torch +from torch._tensor import Tensor + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.merge_methods.rectify_embed import rectify_embed_sizes + + +class NuSlerpTask(Task[torch.Tensor]): + gather_tensors: MergeTensorInput + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]] + weight_info: WeightInfo + row_wise: bool + flatten: bool + base_model: Optional[ModelReference] + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.gather_tensors} + + def execute(self, tensors: Dict[ModelReference, torch.Tensor]) -> Tensor: + if len(tensors) == 1: + return list(tensors.values())[0] + + if self.base_model is not None: + if len(tensors) != 3: + raise RuntimeError( + "NuSlerp base model can not be one of the two models to merge" + ) + base_tensor = tensors.pop(self.base_model) + else: + base_tensor = None + + keys = list(tensors.keys()) + tensors = [tensors[key] for key in keys] + weights = [self.tensor_parameters[key]["weight"] for key in keys] + + if len(tensors) != 2: + print(keys) + print(self.base_model) + raise RuntimeError( + "NuSlerp merge expects exactly two models (plus optional base model)" + ) + + if abs(sum(weights)) < 1e-6: + # this is fairly arbitrary, but it's more sane than exploding + t = 0.5 + else: + t = weights[1] / sum(weights) + + if base_tensor is not None: + tensors.append(base_tensor) + rectify_embed_sizes(self.weight_info, tensors) + + if base_tensor is not None: + base_tensor = tensors.pop() + return base_tensor + nuslerp( + t, + tensors[0] - base_tensor, + tensors[1] - base_tensor, + dim=0 if self.row_wise else -1, + flatten=self.flatten, + ) + return nuslerp( + t, + tensors[0], + tensors[1], + dim=0 if self.row_wise else -1, + flatten=self.flatten, + ) + + +class NuSlerpMerge(MergeMethod): + def parameters(self) -> List[ConfigParameterDef]: + return [ + ConfigParameterDef( + name="nuslerp_row_wise", + required=False, + default_value=False, + ), + ConfigParameterDef( + name="nuslerp_flatten", + required=False, + default_value=True, + ), + ] + + def tensor_parameters(self) -> List[ConfigParameterDef]: + return [ConfigParameterDef(name="weight", required=True)] + + def make_task( + self, + *, + output_weight: WeightInfo, + tensors: MergeTensorInput, + base_model: Optional[ModelReference], + parameters: ImmutableMap[str, Any], + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], + **_kwargs, + ) -> Task: + return NuSlerpTask( + gather_tensors=tensors, + tensor_parameters=tensor_parameters, + weight_info=output_weight, + row_wise=parameters["nuslerp_row_wise"], + flatten=parameters["nuslerp_flatten"], + base_model=base_model, + ) + + +def nuslerp( + t: float, + v0: torch.Tensor, + v1: torch.Tensor, + dim: int = -1, + eps: float = 1e-8, + flatten: bool = False, +): + out_shape = v0.shape + + def _normalize(x: torch.Tensor, eps: float = 1e-7) -> torch.Tensor: + return x / torch.norm(x, dim=-1, keepdim=True).clamp(min=eps) + + if flatten: + v0 = v0.view(-1) + v1 = v1.view(-1) + elif dim != -1: + v0 = v0.transpose(dim, -1) + v1 = v1.transpose(dim, -1) + + v0_u = 
_normalize(v0) + v1_u = _normalize(v1) + + cos_theta = torch.sum(v0_u * v1_u, dim=-1, keepdim=True) + theta = torch.acos(cos_theta.clamp(-1, 1)) + sin_theta = torch.sin(theta) + + colinear = (sin_theta.abs() < eps).squeeze() + + res = (torch.sin((1 - t) * theta) * v0 + torch.sin(t * theta) * v1) / sin_theta + # Use linear interpolation for (nearly) colinear vectors + res[colinear] = (1 - t) * v0[colinear] + t * v1[colinear] + + if dim != -1 and not flatten: + res = res.transpose(dim, -1) + return res.view(out_shape) diff --git a/mergekit/mergekit/merge_methods/passthrough.py b/mergekit/mergekit/merge_methods/passthrough.py new file mode 100644 index 0000000000000000000000000000000000000000..62b0bf12a53c032d1108ed2afec0f082ee964db7 --- /dev/null +++ b/mergekit/mergekit/merge_methods/passthrough.py @@ -0,0 +1,64 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import Any, Dict, List, Optional + +import torch + +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) + + +class PassthroughMergeTask(Task[torch.Tensor]): + gather_tensors: MergeTensorInput + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]] + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.gather_tensors} + + def execute(self, tensors: Dict[ModelReference, torch.Tensor]) -> torch.Tensor: + if len(tensors) != 1: + raise RuntimeError("Passthrough merge expects exactly one tensor") + + model, tensor = list(tensors.items())[0] + scale = self.tensor_parameters[model].data.get("scale", None) + if scale is not None: + tensor = tensor * scale + + return tensor + + def group_label(self) -> Optional[str]: + return self.gather_tensors.group_label() + + +class PassthroughMerge(MergeMethod): + def tensor_parameters(self) -> List[ConfigParameterDef]: + return [ConfigParameterDef(name="scale", required=False, default_value=None)] + + def make_task( + self, + *, + tensors: MergeTensorInput, + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], + **kwargs, + ) -> Task: + return PassthroughMergeTask( + gather_tensors=tensors, tensor_parameters=tensor_parameters + ) diff --git a/mergekit/mergekit/merge_methods/rectify_embed.py b/mergekit/mergekit/merge_methods/rectify_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..0d116b4fa0b36e684cee982853344caaa431c586 --- /dev/null +++ b/mergekit/mergekit/merge_methods/rectify_embed.py @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + + +import logging +from typing import List + +import torch + +from mergekit.architecture import WeightInfo + + +def rectify_embed_sizes(weight_info: WeightInfo, tensors: List[torch.Tensor]): + # TODO: use arch_info.embed_weights() instead + if weight_info.is_embed and all(len(t.shape) == 2 for t in tensors): + # special case - if lm_head.weight or embed_tokens.weight have a size + # mismatch, take the largest common submatrix of all of them + if take_common_submatrix(tensors): + logging.warning( + f"Using common submatrix of size {tensors[0].shape} for {weight_info.name}" + ) + + +def take_common_submatrix(tensors: List[torch.Tensor]) -> bool: + min_size = [None, None] + for t in tensors: + for idx in range(2): + if min_size[idx] is None or t.shape[idx] < min_size[idx]: + min_size[idx] = t.shape[idx] + + if not all(t.shape == torch.Size(min_size) for t in tensors): + for idx in range(len(tensors)): + tensors[idx] = tensors[idx][: min_size[0], : min_size[1]] + return True + return False diff --git a/mergekit/mergekit/merge_methods/slerp.py b/mergekit/mergekit/merge_methods/slerp.py new file mode 100644 index 0000000000000000000000000000000000000000..d33dd5a9fc0433536b1de4fee22468c04ba6a393 --- /dev/null +++ b/mergekit/mergekit/merge_methods/slerp.py @@ -0,0 +1,170 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
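+# Sketch of the math implemented by slerp() further down: with theta the angle
+# between the (normalized) inputs, i.e. cos(theta) = <v0, v1> / (|v0| |v1|),
+#   slerp(t, v0, v1) = sin((1 - t) * theta) / sin(theta) * v0
+#                    + sin(t * theta) / sin(theta) * v1
+# falling back to lerp(t, v0, v1) = (1 - t) * v0 + t * v1 when |cos(theta)|
+# exceeds DOT_THRESHOLD (near-colinear inputs).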
+ +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch + +from mergekit.architecture import WeightInfo +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.merge_methods.rectify_embed import rectify_embed_sizes + + +class SlerpTask(Task[torch.Tensor]): + gather_tensors: MergeTensorInput + base_model: ModelReference + t: float + weight_info: WeightInfo + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tensors": self.gather_tensors} + + def execute(self, tensors: Dict[ModelReference, torch.Tensor]) -> torch.Tensor: + if len(tensors) == 1: + return list(tensors.values())[0] + elif len(tensors) != 2: + raise RuntimeError("Slerp merge expects exactly two models") + elif self.base_model not in tensors: + raise RuntimeError("Base model not in input tensors") + + [a, b] = list(tensors.items()) + if a[0] != self.base_model: + [a, b] = [b, a] + prepped_tensors = [a[1], b[1]] + + rectify_embed_sizes(self.weight_info, prepped_tensors) + + return ( + slerp( + self.t, + prepped_tensors[0], + prepped_tensors[1], + ) + .to(prepped_tensors[0].dtype) + .to(prepped_tensors[0].device) + ) + + def group_label(self) -> Optional[str]: + return self.gather_tensors.group_label() + + +class SlerpMerge(MergeMethod): + def parameters(self) -> List[ConfigParameterDef]: + return [ConfigParameterDef(name="t", required=True)] + + def make_task( + self, + *, + output_weight: WeightInfo, + tensors: MergeTensorInput, + parameters: ImmutableMap[str, Any], + base_model: Optional[ModelReference], + **_kwargs, + ) -> Task: + return SlerpTask( + gather_tensors=tensors, + base_model=base_model, + weight_info=output_weight, + t=parameters["t"], + ) + + +def lerp( + t: float, v0: Union[np.ndarray, torch.Tensor], v1: Union[np.ndarray, torch.Tensor] +) -> Union[np.ndarray, torch.Tensor]: + return (1 - t) * v0 + t * v1 + + +def slerp( + t: Union[float, np.ndarray], + v0: Union[np.ndarray, torch.Tensor], + v1: Union[np.ndarray, torch.Tensor], + DOT_THRESHOLD: float = 0.9995, + eps: float = 1e-8, +): + """ + Spherical linear interpolation + + From: https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c + Args: + t (float/np.ndarray): Float value between 0.0 and 1.0 + v0 (np.ndarray): Starting vector + v1 (np.ndarray): Final vector + DOT_THRESHOLD (float): Threshold for considering the two vectors as + colinear. Not recommended to alter this. 
+ Returns: + v2 (np.ndarray): Interpolation vector between v0 and v1 + """ + is_torch = False + if not isinstance(v0, np.ndarray): + is_torch = True + v0 = v0.detach().cpu().float().numpy() + if not isinstance(v1, np.ndarray): + is_torch = True + v1 = v1.detach().cpu().float().numpy() + + # Copy the vectors to reuse them later + v0_copy = np.copy(v0) + v1_copy = np.copy(v1) + + # Normalize the vectors to get the directions and angles + v0 = normalize(v0, eps) + v1 = normalize(v1, eps) + + # Dot product with the normalized vectors (can't use np.dot in W) + dot = np.sum(v0 * v1) + + # If absolute value of dot product is almost 1, vectors are ~colinear, so use lerp + if np.abs(dot) > DOT_THRESHOLD: + res = lerp(t, v0_copy, v1_copy) + return maybe_torch(res, is_torch) + + # Calculate initial angle between v0 and v1 + theta_0 = np.arccos(dot) + sin_theta_0 = np.sin(theta_0) + + # Angle at timestep t + theta_t = theta_0 * t + sin_theta_t = np.sin(theta_t) + + # Finish the slerp algorithm + s0 = np.sin(theta_0 - theta_t) / sin_theta_0 + s1 = sin_theta_t / sin_theta_0 + res = s0 * v0_copy + s1 * v1_copy + + return maybe_torch(res, is_torch) + + +def maybe_torch(v: np.ndarray, is_torch: bool): + if is_torch: + return torch.from_numpy(v) + return v + + +def normalize(v: np.ndarray, eps: float): + norm_v = np.linalg.norm(v) + if norm_v > eps: + v = v / norm_v + return v diff --git a/mergekit/mergekit/merge_methods/tokenizer_permute.py b/mergekit/mergekit/merge_methods/tokenizer_permute.py new file mode 100644 index 0000000000000000000000000000000000000000..07c6f9c5027406ddf533c796346a59ec4fdf0fff --- /dev/null +++ b/mergekit/mergekit/merge_methods/tokenizer_permute.py @@ -0,0 +1,153 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
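+# Rough outline of the task below: each model's embedding rows are scattered into
+# the merged vocabulary's index space via tokenizer_info.permutations, a mask
+# records which rows each model actually provides, and rows are combined by a
+# normalized weighted average or, with embed_slerp, by SLERP with a linear
+# fallback for tokens that are not present in both models.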
+ +from typing import Any, Dict, List, Optional + +import torch +from pydantic import BaseModel + +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.merge_methods.base import ( + ConfigParameterDef, + MergeMethod, + MergeTensorInput, +) +from mergekit.merge_methods.slerp import slerp +from mergekit.tokenizer import BuildTokenizer, TokenizerInfo + + +class TokenizerPermutationMergeTask(Task[torch.Tensor]): + tokenizer_task: BuildTokenizer + gather_tensors: MergeTensorInput + base_model: Optional[ModelReference] + use_slerp: bool + slerp_t: Optional[float] + tensor_parameters: ImmutableMap[ModelReference, Any] + + def uses_accelerator(self) -> bool: + return True + + def arguments(self) -> Dict[str, Task]: + return {"tokenizer_info": self.tokenizer_task, "tensors": self.gather_tensors} + + def execute( + self, tokenizer_info: TokenizerInfo, tensors: Dict[ModelReference, torch.Tensor] + ) -> torch.Tensor: + if not tensors: + return None + if len(tensors) == 1: + return list(tensors.values())[0] + + if self.use_slerp and self.slerp_t is None: + raise RuntimeError("Must set t to use embed_slerp") + + models = [] + expanded = [] + masks = [] + weights = [] + for model in tensors: + models.append(model) + + x = tensors[model] + p = tokenizer_info.permutations[model] + + xp = torch.zeros((len(p), x.shape[-1]), dtype=x.dtype, device=x.device) + mask = torch.zeros((len(p),), dtype=torch.bool, device=x.device) + for out_idx in p: + in_idx = p[out_idx] + if in_idx < 0: + continue + + xp[out_idx, :] = x[in_idx, :] + mask[out_idx] = 1 + + expanded.append(xp) + masks.append(mask) + + is_base = model == self.base_model + if self.use_slerp: + weight = (1.0 - self.slerp_t) if is_base else self.slerp_t + else: + weight = self.tensor_parameters[model]["weight"] + + weights.append(weight) + + expanded = torch.stack(expanded, dim=0) + masks = torch.stack(masks, dim=0).unsqueeze(-1) + weights = ( + torch.tensor(weights, dtype=expanded.dtype, device=expanded.device) + .unsqueeze(-1) + .unsqueeze(-1) + ) + + total_weight = (masks * weights).sum(dim=0) + scale = 1 / total_weight + scale[total_weight.abs() < 1e-8] = 0 + + linear_merged = (expanded * weights * masks).sum(dim=0) * scale + + if self.use_slerp: + if expanded.shape[0] != 2: + raise RuntimeError("SLERP takes exactly two models") + + if models[0] == self.base_model: + v0 = expanded[0, ...] + v1 = expanded[1, ...] + else: + v0 = expanded[1, ...] + v1 = expanded[0, ...] 
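+            # rows provided by only one of the two models cannot be slerped
+            # meaningfully, so they are patched with the linear merge below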
+ + res = slerp(self.slerp_t, v0, v1) + need_linear = (masks.sum(dim=0) != 2).squeeze(dim=-1) + res[need_linear, :] = linear_merged[need_linear, :].to( + device=res.device, dtype=res.dtype + ) + return res + + return linear_merged + + +class TokenizerPermutationMerge(MergeMethod, BaseModel): + tokenizer_task: BuildTokenizer + + def parameters(self) -> List[ConfigParameterDef]: + return [ + ConfigParameterDef(name="t", required=False), + ConfigParameterDef(name="embed_slerp", required=False, default_value=False), + ] + + def tensor_parameters(self) -> List[ConfigParameterDef]: + return [ + ConfigParameterDef(name="weight", required=False), + ] + + def make_task( + self, + *, + tensors: MergeTensorInput, + parameters: Dict[str, Any], + tensor_parameters: ImmutableMap[ModelReference, ImmutableMap[str, Any]], + base_model: Optional[ModelReference], + **_kwargs, + ) -> Task: + return TokenizerPermutationMergeTask( + base_model=base_model, + tokenizer_task=self.tokenizer_task, + gather_tensors=tensors, + use_slerp=parameters["embed_slerp"], + slerp_t=parameters["t"], + tensor_parameters=tensor_parameters, + ) diff --git a/mergekit/mergekit/moe/__init__.py b/mergekit/mergekit/moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bc1cf067c6a433d9b28daea68f9bc2820b73366e --- /dev/null +++ b/mergekit/mergekit/moe/__init__.py @@ -0,0 +1,19 @@ +from typing import List + +from mergekit.moe.arch import MoEOutputArchitecture +from mergekit.moe.deepseek import DeepseekMoE +from mergekit.moe.mixtral import MixtralMoE + +ALL_OUTPUT_ARCHITECTURES: List[MoEOutputArchitecture] = [MixtralMoE(), DeepseekMoE()] + +try: + from mergekit.moe.qwen import QwenMoE +except ImportError: + pass +else: + ALL_OUTPUT_ARCHITECTURES.append(QwenMoE()) + +__all__ = [ + "ALL_OUTPUT_ARCHITECTURES", + "MoEOutputArchitecture", +] diff --git a/mergekit/mergekit/moe/arch.py b/mergekit/mergekit/moe/arch.py new file mode 100644 index 0000000000000000000000000000000000000000..66a54d610ce71ac6a84cd845746361c9584ab3a8 --- /dev/null +++ b/mergekit/mergekit/moe/arch.py @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from abc import ABC, abstractmethod +from typing import List, Optional + +import torch + +from mergekit.moe.config import MoEMergeConfig +from mergekit.options import MergeOptions + + +class MoEOutputArchitecture(ABC): + @abstractmethod + def name(self) -> str: + """Return a human-readable name for the architecture.""" + pass + + @abstractmethod + def supports_config( + self, + config: MoEMergeConfig, + explain: bool = False, + trust_remote_code: bool = False, + ) -> bool: + """Return whether this architecture supports the given config. 
+ + If `explain` is True, log an explanation of why the config is not supported.""" + pass + + @abstractmethod + def write_model( + self, + out_path: str, + config: MoEMergeConfig, + merge_options: MergeOptions, + router_weights: List[torch.Tensor], + shared_router_weights: Optional[List[torch.Tensor]] = None, + ): + """Write the config and tensors for the output MoE to the given path.""" + pass diff --git a/mergekit/mergekit/moe/common.py b/mergekit/mergekit/moe/common.py new file mode 100644 index 0000000000000000000000000000000000000000..a5970b4ae5cca69cf47b103ac6b472f1c6dcd859 --- /dev/null +++ b/mergekit/mergekit/moe/common.py @@ -0,0 +1,105 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +from typing import Dict, Optional, Tuple + +import torch +import tqdm +import transformers + +from mergekit.architecture import WeightInfo +from mergekit.common import ModelReference, dtype_from_name +from mergekit.io import LazyTensorLoader, TensorWriter +from mergekit.merge import MergeOptions +from mergekit.moe.config import Expert, MoEMergeConfig + + +def initialize_io( + config: MoEMergeConfig, + out_path: str, + merge_options: MergeOptions, +) -> Tuple[Dict[ModelReference, LazyTensorLoader], LazyTensorLoader, TensorWriter]: + base_model = config.base_model + loaders: Dict[ModelReference, LazyTensorLoader] = {} + for model in tqdm.tqdm( + [base_model] + [e.source_model for e in config.experts], desc="Warm up loaders" + ): + loaders[model] = model.lazy_loader( + cache_dir=merge_options.transformers_cache, + lazy_unpickle=merge_options.lazy_unpickle, + ) + + base_loader = loaders.get(base_model) + writer = TensorWriter( + out_path=out_path, + max_shard_size=merge_options.out_shard_size, + safe_serialization=merge_options.safe_serialization, + ) + + return loaders, base_loader, writer + + +def select_dtype( + config: MoEMergeConfig, base_cfg: transformers.PretrainedConfig +) -> Optional[torch.dtype]: + out_dtype = None + if config.dtype: + out_dtype = dtype_from_name(config.dtype) + + if out_dtype is None and base_cfg.torch_dtype: + out_dtype = base_cfg.torch_dtype + if isinstance(out_dtype, str): + out_dtype = dtype_from_name(out_dtype) + return out_dtype + + +def noise_and_scale( + tensor: torch.Tensor, expert: Expert, is_residual: bool = False +) -> torch.Tensor: + if expert.noise_scale is not None: + noise = torch.randn_like(tensor) * expert.noise_scale + tensor = tensor + noise + if is_residual and expert.residual_scale is not None: + tensor = tensor * expert.residual_scale + return tensor + + +def copy_tensor_out( + weight_info: WeightInfo, + loader: LazyTensorLoader, + writer: TensorWriter, + expert: Optional[Expert] = None, + is_residual: bool = False, + output_name: Optional[str] = None, + out_dtype: Optional[torch.dtype] = None, + clone: bool = False, +): + out_tensor_name = output_name or weight_info.name + try: + 
tensor = loader.get_tensor(weight_info.name, aliases=weight_info.aliases) + except KeyError: + tensor = None + if tensor is None and not weight_info.optional: + logging.error(f"Missing weight: {weight_info.name} / {out_tensor_name}") + raise KeyError(out_tensor_name) + + if expert: + tensor = noise_and_scale(tensor, expert, is_residual=is_residual) + writer.save_tensor( + out_tensor_name, + tensor.to(dtype=out_dtype), + clone=clone, + ) diff --git a/mergekit/mergekit/moe/config.py b/mergekit/mergekit/moe/config.py new file mode 100644 index 0000000000000000000000000000000000000000..0cc484a8a6a68775110f5ee3e338af0834f31b9e --- /dev/null +++ b/mergekit/mergekit/moe/config.py @@ -0,0 +1,98 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +from typing import List, Optional + +from pydantic import BaseModel + +from mergekit.common import ModelReference + + +class Expert(BaseModel): + """ + Defines a model to be used as a set of layerwise experts in a MoE model. + """ + + source_model: ModelReference + + positive_prompts: Optional[List[str]] = None + negative_prompts: Optional[List[str]] = None + noise_scale: Optional[float] = None + residual_scale: Optional[float] = None + + +class MoEMergeConfig(BaseModel): + """ + Configuration for merging a set of "expert" models into a MoE model. + """ + + base_model: ModelReference + experts: List[Expert] + gate_mode: str = ( + "hidden" # possible values: "hidden", "cheap_embed", "random", "uniform_random" + ) + # "hidden" uses hidden state vectors for the given prompts for each layer + # "cheap_embed" uses the average of token embeddings for the prompts, same for each layer + # "random" is random + # "uniform_random" matches default initialization for torch.nn.Linear + dtype: Optional[str] = None + experts_per_token: int = 2 + shared_experts: Optional[List[Expert]] = None + architecture: Optional[str] = None + + +def is_bad_config(config: MoEMergeConfig, allow_all_same: bool = False) -> bool: + if config.experts_per_token < 1: + logging.error("Experts per token must be >= 1") + return True + + if len(config.experts) < config.experts_per_token: + logging.error("Must include at least as many experts as experts_per_token.") + return True + + if config.gate_mode == "random": + return False # eh we're good + + for expert_idx, expert in enumerate(config.experts): + if not expert.positive_prompts: + logging.error(f"Expert {expert_idx} has no positive prompts.") + return True + + def prompt_tup(e: Expert): + return (tuple(e.positive_prompts), tuple(e.negative_prompts or [])) + + # let's just nip this trend in the bud + p_first = prompt_tup(config.experts[0]) + if all(prompt_tup(e) == p_first for e in config.experts[1:]): + logging.error( + "Your positive and negative prompts are identical for all experts. This will not produce a functioning MoE." 
+ ) + logging.error( + "For each expert, `positive_prompts` must contain one or more example prompt reflecting what should be routed to that expert." + ) + return True + + if not allow_all_same: + if all( + e.source_model == config.experts[0].source_model for e in config.experts[1:] + ): + logging.error( + "All of your expert models are the same. This will produce " + "a model that uses more resources but gives the exact same output. " + "If you plan to train the model after merging, proceed with the " + "--i-understand-this-is-not-useful-without-training flag." + ) + return True diff --git a/mergekit/mergekit/moe/deepseek.py b/mergekit/mergekit/moe/deepseek.py new file mode 100644 index 0000000000000000000000000000000000000000..4ce62865b1ebc4105b8c9346684f11f146617eb3 --- /dev/null +++ b/mergekit/mergekit/moe/deepseek.py @@ -0,0 +1,193 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import json +import logging +import os +from typing import Dict, List, Optional + +import torch +import tqdm +import transformers + +from mergekit.architecture import get_architecture_info +from mergekit.moe.arch import MoEOutputArchitecture +from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype +from mergekit.moe.config import MoEMergeConfig +from mergekit.options import MergeOptions + + +class DeepseekMoE(MoEOutputArchitecture): + def name(self) -> str: + return "DeepSeek MoE" + + def supports_config( + self, + config: MoEMergeConfig, + explain: bool = False, + trust_remote_code: bool = False, + ) -> bool: + if config.shared_experts: + if len(config.shared_experts) > 1: + if explain: + logging.warning( + "DeepSeek MoE merge does not support more than one shared expert" + ) + return False + + if ( + config.shared_experts[0].positive_prompts + or config.shared_experts[0].negative_prompts + ): + if explain: + logging.warning( + "DeepSeek MoE merge does not support gating shared experts" + ) + return False + + model_types = [] + for model_ref in ( + [config.base_model] + + [e.source_model for e in config.experts] + + [e.source_model for e in (config.shared_experts or [])] + ): + model_cfg = model_ref.config(trust_remote_code=trust_remote_code) + model_types.append(model_cfg.model_type) + + if len(set(model_types)) != 1: + if explain: + logging.warning( + "Deepseek MoE requires all input models to have the same architecture" + ) + return False + if model_types[0] not in ("llama", "mistral"): + if explain: + logging.warning( + "Deepseek MoE requires all input models to be Llama or Mistral models" + ) + return False + return True + + def _generate_config( + self, + base_config: transformers.PretrainedConfig, + num_experts: int, + shared_experts: Optional[int] = None, + experts_per_token: Optional[int] = None, + ) -> Dict: + if shared_experts and shared_experts > 1: + raise NotImplementedError( + "Shared experts must be 0 or 1 for DeepSeek 
output" + ) + + res = base_config.to_dict() + res["architectures"] = ["DeepseekForCausalLM"] + res["model_type"] = "deepseek" + res["n_routed_experts"] = num_experts + res["n_shared_experts"] = shared_experts or None + res["num_experts_per_tok"] = experts_per_token or (1 if shared_experts else 2) + res["first_k_dense_replace"] = 0 + res["moe_layer_freq"] = 1 + res["scoring_func"] = "softmax" + res["norm_topk_prob"] = True + res["moe_intermediate_size"] = res["intermediate_size"] + res["auto_map"] = { + "AutoConfig": "deepseek-ai/deepseek-moe-16b-base--configuration_deepseek.DeepseekConfig", + "AutoModel": "deepseek-ai/deepseek-moe-16b-base--modeling_deepseek.DeepseekModel", + "AutoModelForCausalLM": "deepseek-ai/deepseek-moe-16b-base--modeling_deepseek.DeepseekForCausalLM", + } + return res + + def write_model( + self, + out_path: str, + config: MoEMergeConfig, + merge_options: MergeOptions, + router_weights: List[torch.Tensor], + shared_router_weights: Optional[List[torch.Tensor]] = None, + ): + base_model = config.base_model + base_cfg = base_model.config(trust_remote_code=merge_options.trust_remote_code) + + out_dtype = select_dtype(config, base_cfg) + out_cfg = self._generate_config( + base_cfg, + len(config.experts), + len(config.shared_experts or []), + config.experts_per_token, + ) + if out_dtype is not None: + out_cfg["torch_dtype"] = str(out_dtype).removeprefix("torch.") + with open(os.path.join(out_path, "config.json"), "w", encoding="utf-8") as f: + json.dump(out_cfg, f, indent=4) + + shared_def = config.shared_experts[0] if config.shared_experts else None + + loaders, base_loader, writer = initialize_io(config, out_path, merge_options) + shared_loader = loaders.get(shared_def.source_model) if shared_def else None + for weight_info in tqdm.tqdm( + get_architecture_info(base_cfg).all_weights(base_cfg), + desc="Weights", + ): + tensor_name = weight_info.name + if ".mlp." in tensor_name: + for expert_idx, expert in enumerate(config.experts): + expert_name = tensor_name.replace( + ".mlp.", f".mlp.experts.{expert_idx}." + ) + expert_loader = loaders.get(expert.source_model) + copy_tensor_out( + weight_info, + expert_loader, + writer, + expert=expert, + is_residual="down_proj" in tensor_name, + output_name=expert_name, + out_dtype=out_dtype, + clone=merge_options.clone_tensors, + ) + + if shared_def is not None: + copy_tensor_out( + weight_info, + shared_loader, + writer, + expert=shared_def, + is_residual="down_proj" in tensor_name, + output_name=tensor_name.replace( + ".mlp.", ".mlp.shared_experts." + ), + out_dtype=out_dtype, + clone=merge_options.clone_tensors, + ) + else: + copy_tensor_out( + weight_info, + base_loader, + writer, + out_dtype=out_dtype, + clone=merge_options.clone_tensors, + ) + + for layer_idx, weight in enumerate( + tqdm.tqdm(router_weights, desc="Router weights") + ): + writer.save_tensor( + f"model.layers.{layer_idx}.mlp.gate.weight", + weight.to(dtype=out_dtype).contiguous(), + clone=merge_options.clone_tensors, + ) + + writer.finalize() diff --git a/mergekit/mergekit/moe/mixtral.py b/mergekit/mergekit/moe/mixtral.py new file mode 100644 index 0000000000000000000000000000000000000000..f3fe97df23aca06df3882f457906d4e4ff10d7af --- /dev/null +++ b/mergekit/mergekit/moe/mixtral.py @@ -0,0 +1,176 @@ +# Copyright (C) 2024 Charles O. 
Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +from typing import List, Optional + +import torch +import tqdm +import transformers + +from mergekit.architecture import MISTRAL_INFO, WeightInfo +from mergekit.moe.arch import MoEOutputArchitecture +from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype +from mergekit.moe.config import MoEMergeConfig +from mergekit.options import MergeOptions + + +class MixtralMoE(MoEOutputArchitecture): + def name(self) -> str: + return "Mixtral" + + def supports_config( + self, + config: MoEMergeConfig, + explain: bool = False, + trust_remote_code: bool = False, + ) -> bool: + if config.shared_experts: + if explain: + logging.warning("Mixtral does not support shared experts") + return False + + model_types = [] + for model_ref in [config.base_model] + [e.source_model for e in config.experts]: + model_cfg = model_ref.config(trust_remote_code=trust_remote_code) + model_types.append(model_cfg.model_type) + + if len(set(model_types)) != 1: + if explain: + logging.warning( + "Mixtral requires all input models to have the same architecture" + ) + return False + if model_types[0] not in ("llama", "mistral"): + if explain: + logging.warning( + "Mixtral requires all input models to be Llama or Mistral models" + ) + return False + return True + + def _generate_config( + self, + base_config: transformers.PretrainedConfig, + num_experts: int, + shared_experts: Optional[int] = None, + experts_per_token: Optional[int] = None, + ) -> transformers.PretrainedConfig: + if shared_experts: + raise NotImplementedError("Shared experts not supported for Mixtral output") + + if not isinstance(base_config, transformers.MistralConfig): + base_cfg_mistral = transformers.MistralConfig(**base_config.to_dict()) + base_cfg_mistral.sliding_window = None + base_cfg_mistral.max_position_embeddings = ( + base_config.max_position_embeddings + ) + base_config = base_cfg_mistral + + out_cfg = transformers.MixtralConfig(**base_config.to_dict()) + out_cfg.architectures = ["MixtralForCausalLM"] + out_cfg.num_local_experts = num_experts + out_cfg.num_experts_per_tok = experts_per_token or 2 + out_cfg.sliding_window = None + + if (out_cfg.num_local_experts & (out_cfg.num_local_experts - 1)) != 0: + logging.warning( + f"Your model has {out_cfg.num_local_experts} experts, which is " + "not a power of two. The model will not be usable in llama.cpp." + ) + return out_cfg + + def _remap_weight_name(self, weight: WeightInfo) -> str: + if ".mlp." 
not in weight.name: + # Everything but MLP is identical to base Mistral + return weight.name + + res = weight.name + for needle, replacement in [ + (".mlp.gate_proj", ".block_sparse_moe.experts.{expert_idx}.w1"), + (".mlp.down_proj", ".block_sparse_moe.experts.{expert_idx}.w2"), + (".mlp.up_proj", ".block_sparse_moe.experts.{expert_idx}.w3"), + ]: + res = res.replace(needle, replacement) + return res + + def _router_weight_name(self, layer_idx: int) -> str: + return f"model.layers.{layer_idx}.block_sparse_moe.gate.weight" + + def write_model( + self, + out_path: str, + config: MoEMergeConfig, + merge_options: MergeOptions, + router_weights: List[torch.Tensor], + shared_router_weights: Optional[List[torch.Tensor]] = None, + ): + base_model = config.base_model + base_cfg = base_model.config(trust_remote_code=merge_options.trust_remote_code) + + assert len(router_weights) == base_cfg.num_hidden_layers, ( + f"Expected {base_cfg.num_hidden_layers} router weights, " + f"got {len(router_weights)}" + ) + + out_dtype = select_dtype(config, base_cfg) + out_cfg = self._generate_config( + base_cfg, + len(config.experts), + len(config.shared_experts or []), + config.experts_per_token, + ) + out_cfg.torch_dtype = out_dtype + out_cfg.save_pretrained(out_path) + + loaders, base_loader, writer = initialize_io(config, out_path, merge_options) + for weight_info in tqdm.tqdm( + MISTRAL_INFO.all_weights(base_cfg), + desc="Weights", + ): + tensor_name = self._remap_weight_name(weight_info) + if "{expert_idx}" in tensor_name: + for expert_index, expert in enumerate(config.experts): + expert_name = tensor_name.replace("{expert_idx}", str(expert_index)) + expert_loader = loaders.get(expert.source_model) + copy_tensor_out( + weight_info, + expert_loader, + writer, + expert=expert, + out_dtype=out_dtype, + output_name=expert_name, + clone=merge_options.clone_tensors, + is_residual="down_proj" in tensor_name, + ) + else: + copy_tensor_out( + weight_info, + base_loader, + writer, + out_dtype=out_dtype, + clone=merge_options.clone_tensors, + ) + + for layer_idx, weight in enumerate( + tqdm.tqdm(router_weights, desc="Router weights") + ): + writer.save_tensor( + self._router_weight_name(layer_idx), + weight.to(dtype=out_dtype).contiguous(), + clone=merge_options.clone_tensors, + ) + + writer.finalize() diff --git a/mergekit/mergekit/moe/qwen.py b/mergekit/mergekit/moe/qwen.py new file mode 100644 index 0000000000000000000000000000000000000000..65337a0ab921fd05d175a96149805eb6e9b401eb --- /dev/null +++ b/mergekit/mergekit/moe/qwen.py @@ -0,0 +1,204 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
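As a brief aside on the MixtralMoE expert handling above: each source model's dense MLP projections are renamed into Mixtral's per-expert `w1`/`w2`/`w3` slots, while all non-MLP weights keep their Mistral names. The snippet below is a minimal, self-contained sketch of that renaming; the helper `remap_mlp_weight` and the example tensor names are illustrative only, not part of mergekit's API.

```python
# Minimal sketch of the Mixtral expert-weight renaming described above.
# `remap_mlp_weight` is a hypothetical helper, for illustration only.

MLP_TO_MOE = {
    ".mlp.gate_proj": ".block_sparse_moe.experts.{expert_idx}.w1",
    ".mlp.down_proj": ".block_sparse_moe.experts.{expert_idx}.w2",
    ".mlp.up_proj": ".block_sparse_moe.experts.{expert_idx}.w3",
}


def remap_mlp_weight(name: str, expert_idx: int) -> str:
    """Rename a dense MLP weight into its per-expert Mixtral slot.

    Non-MLP weights are returned unchanged, mirroring _remap_weight_name.
    """
    if ".mlp." not in name:
        return name
    for needle, replacement in MLP_TO_MOE.items():
        name = name.replace(needle, replacement)
    return name.format(expert_idx=expert_idx)


print(remap_mlp_weight("model.layers.0.mlp.up_proj.weight", 2))
# -> model.layers.0.block_sparse_moe.experts.2.w3.weight
print(remap_mlp_weight("model.layers.0.self_attn.q_proj.weight", 2))
# -> model.layers.0.self_attn.q_proj.weight (unchanged)
```

In the actual merge loop, the `{expert_idx}` placeholder is filled in once per expert before each tensor is copied to the output shard.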
+ +import logging +from typing import List, Optional + +import torch +import tqdm +import transformers + +# explicitly import the config class so that we can catch errors upstream +# if the transformers version installed is too old +from transformers.models.qwen2_moe import Qwen2MoeConfig + +from mergekit.architecture import QWEN2_INFO +from mergekit.moe.arch import MoEOutputArchitecture +from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype +from mergekit.moe.config import MoEMergeConfig +from mergekit.options import MergeOptions + + +class QwenMoE(MoEOutputArchitecture): + def name(self) -> str: + return "Qwen MoE" + + def supports_config( + self, + config: MoEMergeConfig, + explain: bool = False, + trust_remote_code: bool = False, + ) -> bool: + if len(config.shared_experts or []) != 1: + if explain: + logging.warning("Qwen MoE merge requires exactly one shared expert") + return False + + if ( + config.gate_mode != "random" + and not config.shared_experts[0].positive_prompts + ): + if explain: + logging.warning("Qwen MoE requires the shared expert to have prompts") + return False + + model_types = [] + for model_ref in ( + [config.base_model] + + [e.source_model for e in config.experts] + + [e.source_model for e in (config.shared_experts or [])] + ): + model_cfg = model_ref.config(trust_remote_code=trust_remote_code) + model_types.append(model_cfg.model_type) + + if len(set(model_types)) != 1: + if explain: + logging.warning( + "Qwen MoE requires all input models to have the same architecture" + ) + return False + if model_types[0] not in ("llama", "mistral", "qwen2"): + if explain: + logging.warning( + "Qwen MoE requires all input models to be Qwen2, Llama or Mistral models" + ) + return False + return True + + def _generate_config( + self, + base_config: transformers.PretrainedConfig, + num_experts: int, + experts_per_token: Optional[int] = None, + ) -> Qwen2MoeConfig: + out_cfg = Qwen2MoeConfig(**base_config.to_dict()) + out_cfg.architectures = ["Qwen2MoeForCausalLM"] + out_cfg.num_experts = num_experts + out_cfg.num_experts_per_tok = experts_per_token or 2 + out_cfg.decoder_sparse_step = 1 + out_cfg.norm_topk_prob = True + out_cfg.sliding_window = None + out_cfg.use_sliding_window = False + out_cfg.shared_expert_intermediate_size = out_cfg.intermediate_size + out_cfg.moe_intermediate_size = out_cfg.intermediate_size + + if (out_cfg.num_experts & (out_cfg.num_experts - 1)) != 0: + logging.warning( + f"Your model has {out_cfg.num_experts} experts, which is " + "not a power of two. The model will not be usable in llama.cpp." + ) + return out_cfg + + def write_model( + self, + out_path: str, + config: MoEMergeConfig, + merge_options: MergeOptions, + router_weights: List[torch.Tensor], + shared_router_weights: Optional[List[torch.Tensor]] = None, + ): + base_model = config.base_model + base_cfg = base_model.config(trust_remote_code=merge_options.trust_remote_code) + + out_dtype = select_dtype(config, base_cfg) + out_cfg = self._generate_config( + base_cfg, + len(config.experts), + config.experts_per_token, + ) + if out_dtype is not None: + out_cfg.torch_dtype = out_dtype + out_cfg.save_pretrained(out_path) + + shared_def = config.shared_experts[0] + + loaders, base_loader, writer = initialize_io(config, out_path, merge_options) + shared_loader = loaders.get(shared_def.source_model) if shared_def else None + for weight_info in tqdm.tqdm( + QWEN2_INFO.all_weights(base_cfg), + desc="Weights", + ): + tensor_name = weight_info.name + if ".mlp." 
in tensor_name: + for expert_idx, expert in enumerate(config.experts): + expert_name = tensor_name.replace( + ".mlp.", f".mlp.experts.{expert_idx}." + ) + expert_loader = loaders.get(expert.source_model) + copy_tensor_out( + weight_info, + expert_loader, + writer, + expert=expert, + is_residual="down_proj" in tensor_name, + output_name=expert_name, + out_dtype=out_dtype, + clone=merge_options.clone_tensors, + ) + + copy_tensor_out( + weight_info, + shared_loader, + writer, + expert=shared_def, + is_residual="down_proj" in tensor_name, + output_name=tensor_name.replace(".mlp.", ".mlp.shared_expert."), + out_dtype=out_dtype, + clone=merge_options.clone_tensors, + ) + else: + try: + tensor = base_loader.get_tensor( + tensor_name, aliases=weight_info.aliases + ) + except KeyError: + if tensor_name.endswith("_proj.bias"): + # qwen 2 moe wants attention bias, give it zeros + head_dim = out_cfg.hidden_size // out_cfg.num_attention_heads + num_heads = ( + out_cfg.num_key_value_heads + if ( + tensor_name.endswith("k_proj.bias") + or tensor_name.endswith("v_proj.bias") + ) + else out_cfg.num_attention_heads + ) + tensor = torch.zeros(num_heads * head_dim, dtype=out_dtype) + elif weight_info.optional: + continue + else: + raise + + writer.save_tensor( + tensor_name, + tensor.to(dtype=out_dtype), + clone=merge_options.clone_tensors, + ) + + for layer_idx, weight in enumerate( + tqdm.tqdm(router_weights, desc="Router weights") + ): + writer.save_tensor( + f"model.layers.{layer_idx}.mlp.gate.weight", + weight.to(dtype=out_dtype).contiguous(), + clone=merge_options.clone_tensors, + ) + writer.save_tensor( + f"model.layers.{layer_idx}.mlp.shared_expert_gate.weight", + shared_router_weights[layer_idx].to(dtype=out_dtype).contiguous(), + clone=merge_options.clone_tensors, + ) + + writer.finalize() diff --git a/mergekit/mergekit/moe/router.py b/mergekit/mergekit/moe/router.py new file mode 100644 index 0000000000000000000000000000000000000000..45338c77be39a122574787ad65f2284b6aedf9e7 --- /dev/null +++ b/mergekit/mergekit/moe/router.py @@ -0,0 +1,189 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
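One detail of the Qwen MoE writer above is worth spelling out: Qwen2-MoE checkpoints carry attention projection biases that Llama and Mistral models do not, so any missing `q/k/v_proj.bias` tensor is synthesized as zeros of the appropriate width. Below is a small sketch of that shape logic, using assumed, representative Llama-3.1-8B-style dimensions (4096 hidden size, 32 attention heads, 8 KV heads); `zero_attention_bias` is a hypothetical helper for illustration.

```python
import torch

# Assumed, representative Llama-3.1-8B-style dimensions (illustration only).
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8
head_dim = hidden_size // num_attention_heads  # 128


def zero_attention_bias(tensor_name: str, dtype=torch.bfloat16) -> torch.Tensor:
    """Build a zero bias of the width Qwen2-MoE expects for a missing *_proj.bias."""
    is_kv = tensor_name.endswith("k_proj.bias") or tensor_name.endswith("v_proj.bias")
    num_heads = num_key_value_heads if is_kv else num_attention_heads
    return torch.zeros(num_heads * head_dim, dtype=dtype)


print(zero_attention_bias("model.layers.0.self_attn.q_proj.bias").shape)  # [4096]
print(zero_attention_bias("model.layers.0.self_attn.k_proj.bias").shape)  # [1024]
```

Query biases span all attention heads, while key/value biases only span the grouped KV heads, which is why the two widths differ.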
+ +import logging +import math +from typing import Dict, List, Union + +import torch +import tqdm +import transformers +from transformers import AutoModelForCausalLM, LlamaForCausalLM, MistralForCausalLM +from transformers.modeling_outputs import CausalLMOutputWithPast + +from mergekit.common import ModelReference +from mergekit.moe.config import Expert + + +def get_hidden_states( + model: Union[MistralForCausalLM, LlamaForCausalLM], + tokenized: transformers.BatchEncoding, + average: bool = True, +) -> List[torch.Tensor]: + with torch.no_grad(): + output: CausalLMOutputWithPast = model( + **tokenized.to(model.device), output_hidden_states=True, return_dict=True + ) + hidden_states = torch.stack( + output.hidden_states[:-1] + ) # (num_layers, batch_size, seq_len, hidden_size) + if average: + # use average over sequence + hidden_states = hidden_states.sum(dim=2) / hidden_states.shape[2] + else: + # take last value + hidden_states = hidden_states[:, :, -1, :] + return hidden_states.sum(dim=1) / hidden_states.shape[1] + + +def get_cheap_embedding( + embed: torch.Tensor, + tokenized: Dict[str, torch.Tensor], + num_layers: int, + vocab_size: int, +) -> torch.Tensor: + onehot = torch.nn.functional.one_hot( + tokenized["input_ids"], num_classes=vocab_size + ) # (batch_size, seq_len, 32000) + h = onehot.float() @ embed.float() # (batch_size, seq_len, hidden_size) + embedded = ( + (h * tokenized["attention_mask"].unsqueeze(-1)) + .sum(dim=1) + .sum(dim=0, keepdim=True) + ) # (1, hidden_size) + res = embedded / embedded.norm(dim=-1, keepdim=True).clamp( + min=1e-8 + ) # (1, hidden_size) + return res.repeat(num_layers, 1) + + +def tokenize_prompts( + prompts: List[str], tokenizer: transformers.PreTrainedTokenizerBase +): + return tokenizer( + [(tokenizer.bos_token or "") + p for p in prompts], + return_tensors="pt", + padding=True, + add_special_tokens=False, + ) + + +def get_gate_params( + model_ref: ModelReference, + tokenizer: transformers.PreTrainedTokenizerBase, + experts: List[Expert], + mode: str = "hidden", + load_in_4bit: bool = False, + load_in_8bit: bool = False, + lazy_unpickle: bool = False, + trust_remote_code: bool = False, + device: str = "auto", +): + gate_vecs = [] + _do_it = None + + model_cfg = model_ref.config(trust_remote_code=trust_remote_code) + + if mode == "random": + return torch.randn( + (model_cfg.num_hidden_layers, len(experts), model_cfg.hidden_size) + ) + elif mode == "uniform_random": + in_features = model_cfg.hidden_size + scale = math.sqrt(1.0 / in_features) + return ( + torch.rand( + (model_cfg.num_hidden_layers, len(experts), model_cfg.hidden_size) + ) + * 2 + * scale + - scale + ) + elif mode == "cheap_embed": + embed = model_ref.lazy_loader(lazy_unpickle=lazy_unpickle).get_tensor( + "model.embed_tokens.weight" + ) + + def _do_it(tokenized): + return get_cheap_embedding( + embed, + tokenized, + num_layers=model_cfg.num_hidden_layers, + vocab_size=model_cfg.vocab_size, + ) + + elif mode in ("hidden", "hidden_avg", "hidden_last"): + model = AutoModelForCausalLM.from_pretrained( + model_ref.model.path, + revision=model_ref.model.revision, + torch_dtype=torch.bfloat16, + device_map=device, + low_cpu_mem_usage=True, + load_in_4bit=load_in_4bit, + load_in_8bit=load_in_8bit, + trust_remote_code=trust_remote_code, + ) + + def _do_it(tokenized): + return get_hidden_states( + model, tokenized=tokenized, average=mode == "hidden_avg" + ) + + gate_vecs = [] + for expert in tqdm.tqdm(experts, desc="expert prompts"): + hidden_states = 
_do_it(tokenize_prompts(expert.positive_prompts, tokenizer)) + if expert.negative_prompts: + hidden_states -= _do_it( + tokenize_prompts(expert.negative_prompts, tokenizer) + ) + + hidden_states /= hidden_states.norm(p=2, dim=-1, keepdim=True).clamp(min=1e-8) + gate_vecs.append(hidden_states) + gate_vecs = torch.stack(gate_vecs, dim=0) # (num_expert, num_layer, hidden_size) + return gate_vecs.permute(1, 0, 2) + + +def warn_degenerate_gates(gate_vecs: torch.Tensor, threshold: float = 5.0): + degen_indices = [] + num_layers, _num_experts, _hidden_size = gate_vecs.shape + for idx in range(num_layers): + c = torch.linalg.cond(gate_vecs[idx, :, :].float()) + if c > threshold: + degen_indices.append(idx) + + if degen_indices: + if len(degen_indices) == 1: + layer_str = f"layer {degen_indices[0]}" + verb = "has" + elif len(degen_indices) == 2: + layer_str = f"layers {' and '.join(map(str, degen_indices))}" + verb = "have" + elif len(degen_indices) >= num_layers: + layer_str = "ALL layers" + verb = "have" + else: + layer_str = ( + "layers " + + ", ".join(map(str, degen_indices[:-1])) + + ", and " + + str(degen_indices[-1]) + ) + verb = "have" + + logging.warning( + f"{layer_str} {verb} degenerate routing parameters " + "- your prompts may be too similar." + ) + logging.warning("One or more experts will be underutilized in your model.") diff --git a/mergekit/mergekit/options.py b/mergekit/mergekit/options.py new file mode 100644 index 0000000000000000000000000000000000000000..4701ffc12f137be7b8818a8cf12c823664468b82 --- /dev/null +++ b/mergekit/mergekit/options.py @@ -0,0 +1,111 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import functools +import typing +from typing import Any, Callable, Optional, Union + +import click +from click.core import Context, Parameter +from pydantic import BaseModel + +from mergekit.common import parse_kmb + + +class MergeOptions(BaseModel): + allow_crimes: bool = False + transformers_cache: Optional[str] = None + lora_merge_cache: Optional[str] = None + cuda: bool = False + low_cpu_memory: bool = False + out_shard_size: int = parse_kmb("5B") + copy_tokenizer: bool = True + clone_tensors: bool = False + trust_remote_code: bool = False + random_seed: Optional[int] = None + lazy_unpickle: bool = False + write_model_card: bool = True + safe_serialization: bool = True + quiet: bool = False + read_to_gpu: bool = False + + +OPTION_HELP = { + "allow_crimes": "Allow mixing architectures", + "transformers_cache": "Override storage path for downloaded models", + "lora_merge_cache": "Path to store merged LORA models", + "cuda": "Perform matrix arithmetic on GPU", + "low_cpu_memory": "Store results and intermediate values on GPU. 
Useful if VRAM > RAM", + "out_shard_size": "Number of parameters per output shard [default: 5B]", + "copy_tokenizer": "Copy a tokenizer to the output", + "clone_tensors": "Clone tensors before saving, to allow multiple occurrences of the same layer", + "trust_remote_code": "Trust remote code from huggingface repos (danger)", + "random_seed": "Seed for reproducible use of randomized merge methods", + "lazy_unpickle": "Experimental lazy unpickler for lower memory usage", + "write_model_card": "Output README.md containing details of the merge", + "safe_serialization": "Save output in safetensors. Do this, don't poison the world with more pickled models.", + "quiet": "Suppress progress bars and other non-essential output", + "read_to_gpu": "Read model weights directly to GPU", +} + + +class ShardSizeParamType(click.ParamType): + name = "size" + + def convert( + self, value: Any, param: Optional[Parameter], ctx: Optional[Context] + ) -> int: + return parse_kmb(value) + + +def add_merge_options(f: Callable) -> Callable: + @functools.wraps(f) + def wrapper(*args, **kwargs): + arg_dict = {} + for field_name in MergeOptions.model_fields: + if field_name in kwargs: + arg_dict[field_name] = kwargs.pop(field_name) + + kwargs["merge_options"] = MergeOptions(**arg_dict) + f(*args, **kwargs) + + for field_name, info in reversed(MergeOptions.model_fields.items()): + origin = typing.get_origin(info.annotation) + if origin is Union: + ty, prob_none = typing.get_args(info.annotation) + assert prob_none is type(None) + field_type = ty + else: + field_type = info.annotation + + if field_name == "out_shard_size": + field_type = ShardSizeParamType() + + arg_name = field_name.replace("_", "-") + if field_type == bool: + arg_str = f"--{arg_name}/--no-{arg_name}" + else: + arg_str = f"--{arg_name}" + + help_str = OPTION_HELP.get(field_name, None) + wrapper = click.option( + arg_str, + type=field_type, + default=info.default, + help=help_str, + show_default=field_name != "out_shard_size", + )(wrapper) + + return wrapper diff --git a/mergekit/mergekit/plan.py b/mergekit/mergekit/plan.py new file mode 100644 index 0000000000000000000000000000000000000000..3e407be1b561e153e81e42917c05f0071a031535 --- /dev/null +++ b/mergekit/mergekit/plan.py @@ -0,0 +1,334 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
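Pulling together the router logic above: for the `hidden`-style gate modes, each expert contributes one vector per layer (its positive-prompt hidden state, minus the negative-prompt hidden state when present, L2-normalized), and `warn_degenerate_gates` then flags layers whose expert vectors are nearly collinear via the matrix condition number. The toy, self-contained sketch below walks through those two steps with random tensors standing in for real prompt hidden states; the shapes and threshold mirror the code above, everything else is illustrative.

```python
import torch

# Toy stand-ins for per-layer prompt hidden states (illustration only).
num_layers, num_experts, hidden_size = 4, 2, 64
positive = torch.randn(num_experts, num_layers, hidden_size)
negative = torch.randn(num_experts, num_layers, hidden_size)

# One gate vector per expert and layer: positive minus negative prompts,
# then L2-normalized along the hidden dimension.
gate_vecs = positive - negative
gate_vecs = gate_vecs / gate_vecs.norm(p=2, dim=-1, keepdim=True).clamp(min=1e-8)
gate_vecs = gate_vecs.permute(1, 0, 2)  # (num_layers, num_experts, hidden_size)

# Degeneracy check: a large condition number means the experts' vectors for
# that layer are nearly collinear, so the router cannot tell them apart.
threshold = 5.0
for layer_idx in range(num_layers):
    cond = torch.linalg.cond(gate_vecs[layer_idx].float())
    if cond.item() > threshold:
        print(f"layer {layer_idx}: degenerate routing (cond={cond.item():.1f})")
```

With genuinely different prompts the condition number stays small; near-duplicate prompts push it up, and the corresponding experts end up underutilized in the merged MoE.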
+ +import logging +from functools import lru_cache +from typing import Any, List, Optional, Tuple + +from mergekit import merge_methods +from mergekit.architecture import ( + ArchitectureInfo, + ConfiguredArchitectureInfo, + WeightInfo, +) +from mergekit.common import ImmutableMap, ModelReference +from mergekit.config import ( + ConfigReader, + InputSliceDefinition, + MergeConfiguration, + OutputSliceDefinition, +) +from mergekit.graph import Task +from mergekit.io.tasks import ( + FinalizeModel, + GatherTensors, + LoaderCache, + ReturnTensor, + SaveTensor, + TensorWriterTask, +) +from mergekit.merge_methods import MergeMethod +from mergekit.options import MergeOptions +from mergekit.tokenizer import BuildTokenizer, PermutedEmbeddings + + +class MergePlanner: + config: MergeConfiguration + arch_info: ArchitectureInfo + options: MergeOptions + out_model_config: Any + _method: MergeMethod + _tensors: List[Tuple[WeightInfo, Task]] + _current_layers: int = 0 + _tokenizer_task: Optional[BuildTokenizer] = None + + def __init__( + self, + config: MergeConfiguration, + arch_info: ArchitectureInfo, + options: MergeOptions, + out_model_config: Any, + ): + self.config = config + self.arch_info = arch_info + self.options = options + self.out_model_config = out_model_config + self._method = merge_methods.get(config.merge_method) + + token_cfg = {} + tokenizer_source = config.tokenizer_source + if config.tokenizer is not None: + token_cfg = config.tokenizer.tokens or {} + tokenizer_source = config.tokenizer.source + if tokenizer_source is not None: + self._tokenizer_task = BuildTokenizer( + base_model=config.base_model, + referenced_models=tuple(config.referenced_models()), + tokenizer_source=tokenizer_source, + trust_remote_code=options.trust_remote_code, + add_tokens=tuple(token_cfg.keys()), + ) + + @lru_cache + def model_arch_info(self, model: ModelReference): + return ConfiguredArchitectureInfo( + info=self.arch_info, + config=model.config(trust_remote_code=self.options.trust_remote_code), + ) + + def normalize_config(self): + base_model = self.config.base_model + + # if models to merge are specified instead of output slices, compute them + if self.config.models: + if self.config.slices: + raise RuntimeError( + "Must specify either models to merge or output slices" + ) + + slices_in = [] + base_included = False + + for model_in in self.config.models: + if base_model and model_in.model == base_model: + base_included = True + + model_info = self.model_arch_info(model_in.model) + slices_in.append( + InputSliceDefinition( + layer_range=[0, model_info.num_layers()], + model=model_in.model, + parameters=model_in.parameters, + ) + ) + + if base_model and not base_included: + logging.info("Base model specified but not in input models - adding") + base_info = self.model_arch_info(base_model) + slices_in.append( + InputSliceDefinition( + layer_range=[0, base_info.num_layers()], + model=base_model, + ) + ) + + self.config.slices = [OutputSliceDefinition(sources=slices_in)] + self.config.models = None + + def plan_tensor( + self, + weight: WeightInfo, + weights_in: List[WeightInfo], + models: List[ModelReference], + cfg_reader: ConfigReader, + ): + if weight.optional: + # check if any input weights are present + any_weight = False + for model, w_in in zip(models, weights_in): + index = LoaderCache().get(model).index + if any( + name in index.tensor_paths + for name in [w_in.name] + (w_in.aliases or []) + ): + any_weight = True + break + + if not any_weight: + logging.info(f"Skipping optional weight 
{weight.name}") + return + + tensor_merge_method = self._method + cfg_g = cfg_reader.for_tensor(weight.name) + global_params = {} + for p in tensor_merge_method.parameters(): + global_params[p.name] = cfg_g.parameter( + p.name, model=None, required=p.required, default=p.default_value + ) + + base_model = cfg_reader.base_model + + tensor_params = {} + for model, weight_in in zip(models, weights_in): + is_base = model == base_model + tensor_params[model] = {} + cfg_m = cfg_reader.for_tensor(weight_in.name) + for p in tensor_merge_method.tensor_parameters(): + tensor_params[model][p.name] = cfg_m.parameter( + p.name, + model=model, + required=p.required and not is_base, + default=p.default_value, + ) + + gather_tensors = GatherTensors( + weight_info=ImmutableMap(data=dict(zip(models, weights_in))), + dtype=self.config.dtype, + device="cuda" if self.options.read_to_gpu else None, + ) + + tensor_input_task = gather_tensors + if self._tokenizer_task and weight.is_embed: + token_cfg = {} + pad_to_multiple = None + if cfg_reader.config.tokenizer: + token_cfg = cfg_reader.config.tokenizer.tokens + pad_to_multiple = cfg_reader.config.tokenizer.pad_to_multiple_of + tensor_input_task = PermutedEmbeddings( + gather_tensors=gather_tensors, + tokenizer_task=self._tokenizer_task, + tokens=token_cfg, + pad_to_multiple_of=pad_to_multiple, + base_model=base_model, + ) + + tensor_task = tensor_merge_method.make_task( + output_weight=weight, + tensors=tensor_input_task, + parameters=ImmutableMap(data=global_params), + tensor_parameters=ImmutableMap( + data={ + key: ImmutableMap(data=tensor_params[key]) for key in tensor_params + } + ), + base_model=base_model, + ) + self._tensors.append((weight, tensor_task)) + + def plan_layer( + self, + sources: List[InputSliceDefinition], + layer_offset: int, + t: float, + cfg_reader: ConfigReader, + ): + weights_out: List[WeightInfo] = self.arch_info.layer_weights( + index=self._current_layers, + config=self.out_model_config, + ) + weights_in: List[List[WeightInfo]] = [ + self.model_arch_info(s.model).layer_weights( + index=s.layer_range[0] + layer_offset + ) + for s in sources + ] + + for idx, w_o in enumerate(weights_out): + self.plan_tensor( + weight=w_o, + weights_in=[weights_in[j][idx] for j in range(len(weights_in))], + models=[s.model for s in sources], + cfg_reader=cfg_reader.with_t(t), + ) + + self._current_layers += 1 + + def plan_slice(self, definition: OutputSliceDefinition): + slice_lengths = [ + s.layer_range[1] - s.layer_range[0] for s in definition.sources + ] + if not all(s == slice_lengths[0] for s in slice_lengths): + raise RuntimeError( + "All inputs to a slice must contain the same number of layers" + ) + num_layers = slice_lengths[0] + + cfg_reader = ConfigReader(config=self.config, slice_out=definition, t=0) + for idx in range(num_layers): + # compute t for interpolated gradients + if num_layers > 1: + t = idx / (num_layers - 1) + else: + t = 1 + + self.plan_layer( + definition.sources, + layer_offset=idx, + t=t, + cfg_reader=cfg_reader, + ) + + def plan_to_disk(self, out_path: str) -> List[Task]: + """Plan the merge to be streamed to disk, returning a list of tasks.""" + self._plan() + + writer_task = TensorWriterTask( + out_path=out_path, + max_shard_size=self.options.out_shard_size, + safe_serialization=self.options.safe_serialization, + ) + save_tasks = [] + for weight, tensor_task in self._tensors: + save_tasks.append( + SaveTensor( + tensor_name=weight.name, + tensor_task=tensor_task, + writer_task=writer_task, + 
clone=self.options.clone_tensors, + optional=weight.optional, + dtype=weight.force_dtype or self.config.out_dtype, + ) + ) + finalize = FinalizeModel( + tensor_save_tasks=tuple(save_tasks), writer_task=writer_task + ) + + res = save_tasks + [finalize] + if self._tokenizer_task: + res.append(self._tokenizer_task) + return res + + def plan_in_memory(self) -> List[ReturnTensor]: + """Plan the merge to be performed in memory.""" + self._plan() + return [ + ReturnTensor( + weight_info=w, + tensor_task=t, + dtype=w.force_dtype or self.config.out_dtype, + ) + for w, t in self._tensors + ] + + def _plan(self): + self.normalize_config() + self._tensors = [] + + for weight_info in self.arch_info.pre_weights(config=self.out_model_config): + self.plan_tensor( + weight_info, + [weight_info] * len(self.config.slices[0].sources), + [s.model for s in self.config.slices[0].sources], + ConfigReader( + config=self.config, + t=0, + tensor_name=weight_info.name, + ).for_out_slice(self.config.slices[0]), + ) + + for out_slice in self.config.slices: + self.plan_slice(out_slice) + + for weight_info in self.arch_info.post_weights(config=self.out_model_config): + self.plan_tensor( + weight_info, + [weight_info] * len(self.config.slices[-1].sources), + [s.model for s in self.config.slices[-1].sources], + ConfigReader( + config=self.config, + t=1, + tensor_name=weight_info.name, + ).for_out_slice(self.config.slices[-1]), + ) diff --git a/mergekit/mergekit/scripts/ABM/activations_based_merge.py b/mergekit/mergekit/scripts/ABM/activations_based_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..cb3c912aad113e916b7dbdf8596820a16a05dfd0 --- /dev/null +++ b/mergekit/mergekit/scripts/ABM/activations_based_merge.py @@ -0,0 +1,171 @@ +import logging +import os +from typing import Optional + +import click +import safetensors.torch +import torch +import tqdm +from transformers import AutoTokenizer + +from mergekit.architecture import get_architecture_info +from mergekit.common import ModelReference, dtype_from_name +from mergekit.io.tasks import LoaderCache +from mergekit.io.tensor_writer import TensorWriter +from mergekit.options import MergeOptions, add_merge_options + + +@click.command("mergekit-activation-based-merge") +@click.argument("model_path", type=str) +@click.argument("secondary_model_path", type=str) +@click.argument("merge_unmerge_directory", type=str) +@click.option("--out-path", "-o", required=True, type=str, help="Output model path") +@click.option( + "--dtype", + type=str, + default="float16", + help="Data type to convert weights to", +) +@click.option( + "--device", + "-d", + type=str, + default="cuda", + help="Device to compute on (default: cuda)", +) +@add_merge_options +def main( + model_path: str, + secondary_model_path, + merge_unmerge_directory: str, + out_path: str, + dtype: Optional[str], + device: Optional[str], + merge_options: MergeOptions, +): + model = ModelReference.model_validate(model_path) + secondary_model = ModelReference.model_validate(secondary_model_path) + + dtype = dtype_from_name(dtype) if dtype else None + + cache = LoaderCache() + cache.lazy_unpickle = merge_options.lazy_unpickle + cache.hf_cache_dir = merge_options.transformers_cache + + for m in tqdm.tqdm([model, secondary_model], desc="Preparing models"): + cache.get(m) + + writer = TensorWriter( + out_path=out_path, + max_shard_size=merge_options.out_shard_size, + safe_serialization=merge_options.safe_serialization, + ) + + model_config = model.config(trust_remote_code=merge_options.trust_remote_code) 
+ model_arch_info = get_architecture_info( + model.config(trust_remote_code=merge_options.trust_remote_code) + ) + + loader_1 = cache.get(model) + loader_2 = cache.get(secondary_model) + + os.makedirs(out_path, exist_ok=True) + + merge_unmerge_dictionary = {} + # load files from merge_unmerge_directory + spaces = [ + f.split("_unmerge")[0] + for f in os.listdir(merge_unmerge_directory) + if "_unmerge" in f + ] + for i in spaces: + logging.info(f"Loading merge/unmerge tensors for {i}") + m = safetensors.torch.load_file( + os.path.join(merge_unmerge_directory, f"{i}_merge.safetensor"), + device=device, + ) + u = safetensors.torch.load_file( + os.path.join(merge_unmerge_directory, f"{i}_unmerge.safetensor"), + device=device, + ) + merge_unmerge_dictionary[i] = ( + m[i].to(device, dtype=dtype), + u[i].to(device, dtype=dtype), + ) + + for weight_info in model_arch_info.all_weights(config=model_config): + merge_matrix, unmerge_matrix = None, None + + if weight_info.input_space in merge_unmerge_dictionary: + _, unmerge_matrix = merge_unmerge_dictionary[weight_info.input_space] + unmerge_matrix = unmerge_matrix.chunk(2, dim=0) + + if weight_info.output_space in merge_unmerge_dictionary: + merge_matrix, _ = merge_unmerge_dictionary[weight_info.output_space] + merge_matrix = merge_matrix.chunk(2, dim=1) + + original_w = loader_1.get_tensor(weight_info.name, device=device) + original_w2 = loader_2.get_tensor(weight_info.name, device=device) + + if dtype is not None: + original_w = original_w.to(dtype=dtype) + original_w2 = original_w2.to(dtype=dtype) + + w = torch.clone(original_w) + w2 = torch.clone(original_w2) + + if not merge_matrix and not unmerge_matrix: + logging.warning( + f"❌ Weight {weight_info.name} for model 1 and model 2 has no merge or unmerge matrix" + ) + + if merge_matrix is not None: + if weight_info.is_embed: + w = (merge_matrix[0] @ w.T).T + w2 = (merge_matrix[1] @ w2.T).T + else: + w = merge_matrix[0] @ w + w2 = merge_matrix[1] @ w2 + + if unmerge_matrix is not None: + w = w @ unmerge_matrix[0] + w2 = w2 @ unmerge_matrix[1] + + # check if weights have not mutated, if yes then shoot warning + if torch.allclose(original_w, w): + logging.warning( + f"❌ Weight {weight_info.name} for model 1 has NOT mutated during merge" + ) + else: + logging.warning( + f"✅ Weight {weight_info.name} for model 1 has mutated during merge" + ) + + if torch.allclose(original_w2, w2): + logging.warning( + f"❌ Weight {weight_info.name} for model 2 has NOT mutated during merge" + ) + else: + logging.warning( + f"✅ Weight {weight_info.name} for model 2 has mutated during merge" + ) + + # average weights and save them + if merge_matrix: + w = w + w2 + else: + w = (w + w2) / 2 + writer.save_tensor(weight_info.name, w) + writer.finalize() + + tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer.save_pretrained(out_path, safe_serialization=True) + + # write config + model_out_config = model.config(trust_remote_code=merge_options.trust_remote_code) + if dtype: + model_out_config.torch_dtype = dtype + model_out_config.save_pretrained(out_path) + + +main() diff --git a/mergekit/mergekit/scripts/ABM/extract_activations.py b/mergekit/mergekit/scripts/ABM/extract_activations.py new file mode 100644 index 0000000000000000000000000000000000000000..7cb5961b9655f3ebda9ca3c1626209c52d025c6d --- /dev/null +++ b/mergekit/mergekit/scripts/ABM/extract_activations.py @@ -0,0 +1,347 @@ +import logging +import os +from collections import defaultdict +from typing import List, Optional + +import click +import datasets 
+import numpy as np +import torch +from safetensors.torch import save_file +from torch.utils.data import DataLoader +from transformers import AutoModel, AutoTokenizer, DefaultDataCollator + +from mergekit.architecture import _template_substitution, get_architecture_info +from mergekit.common import ModelReference + +logging.basicConfig(level=logging.INFO) + +# set seed +torch.manual_seed(42) +np.random.seed(42) + + +def clean_name(name): + return name.replace(".weight", "").replace("model.", "") + + +def parse_items(ctx, param, value): + if value is not None: + return [item.strip() for item in value.split(",")] + + +def remove_pads(attention_mask, feature_vector): + if ( + len(feature_vector.shape) == 3 + ): # Hidden states: (batch_size, seq_length, embedding_dim) + # Expand mask to match the feature_vector dimensions and apply it + expanded_mask = attention_mask.unsqueeze(-1) + filtered_feature_vector = feature_vector * expanded_mask + else: + raise ValueError("Unsupported feature vector shape.") + + return filtered_feature_vector + + +def get_attention_output_hook(storage_dict, space_name, capture_input=True): + """ + Returns a hook function that stores the output of the attention layer. + """ + + def hook(module, input, output): + # NOTE: shape of input is [batch, seq_len, dim] and output is Tuple[(seq_len, dim),...] + if capture_input: + o = input[0].detach() + else: + o = output.detach() + + if space_name not in storage_dict: + storage_dict[space_name] = o + else: + storage_dict[space_name] = torch.cat((storage_dict[space_name], o), dim=0) + + return hook + + +""" + +What this script does: + +It tries to map input/output spaces to activation maps + +""" + + +@click.command("mergekit-abm-extract-activations") +@click.argument("model-path", type=str) +@click.option( + "--dataset", "-d", required=True, type=str, help="Dataset to use for activations" +) +@click.option("--out-path", "-o", required=True, type=str, help="Output model path") +@click.option("--batch-size", "-b", type=int, default=2, help="Batch size") +@click.option( + "--dataset-size", + "-s", + type=int, + default=None, + help="Dataset size. If None, use full dataset", +) +@click.option( + "--dataset-column", "-c", type=str, default="text", help="Dataset column to use" +) +@click.option( + "--dataset-subset", "-u", type=str, default="eval", help="Dataset subset to use" +) +@click.option( + "--chat-template/--no-chat-template", + default=False, + help="use Chat template for inference", +) +@click.option("--max-length", "-l", type=int, default=512, help="Max length") +@click.option("--dtype", type=str, default=None, help="Data type to convert weights to") +@click.option( + "--device", type=str, default=None, help="device to compute the activations" +) +@click.option( + "--ignore-spaces", + "-i", + type=str, + default="", + callback=parse_items, + help="Spaces to ignore separated by comma. 
Example: up_${layer_index}", +) +def main( + model_path: str, + dataset: str, + dataset_column: str, + out_path: str, + batch_size: int, + max_length: int, + dataset_size: Optional[int], + dataset_subset: Optional[str], + chat_template: Optional[bool], + dtype: Optional[str], + device: Optional[str], + ignore_spaces: Optional[List[str]], +): + # sorting out locations to hook into + # we do this via the predefined json architecture definitions in mergekit + + model = ModelReference.model_validate(model_path) + + model_config = model.config() + model_arch_info = get_architecture_info(model_config) + + _json = model_arch_info.definition + + residual_space = None + + weights = [] + for weight in _json.layer_templates.weights: + if weight.is_kq: + residual_space = weight.input_space + weights.append(weight) + + if residual_space is None: + raise ValueError("No residual space found") + + # ======================== Mapping spaces to weights ======================== + + # just a list of connected components + space_to_output_weight_templates = defaultdict(list) + space_to_input_weight_templates = defaultdict(list) + + for layer_template in weights: + if ( + not layer_template.input_space + or layer_template.input_space in ignore_spaces + ): + continue + space_to_output_weight_templates[layer_template.input_space].append( + layer_template.name + ) + + for layer_template in weights: + if ( + not layer_template.output_space + or layer_template.output_space in ignore_spaces + ): + continue + space_to_input_weight_templates[layer_template.output_space].append( + layer_template.name + ) + + # remove the residual space from the input and output + space_to_input_weight_templates.pop(residual_space, None) + space_to_output_weight_templates.pop(residual_space, None) + + # NOTE: if space has input and output weights, remove one or the other because hooking + # into both will result in duplicate activations + to_remove = [] + for space, input_weights in space_to_input_weight_templates.items(): + if space in space_to_output_weight_templates: + # if count of input weights and output weights is non zero, remove the space from space to output_weights + if ( + len(input_weights) > 0 + and len(space_to_output_weight_templates[space]) > 0 + ): + to_remove.append(space) + + # remove keys from output + space_to_output_weight_templates = { + k: v for k, v in space_to_output_weight_templates.items() if k not in to_remove + } + + num_layers = model_arch_info.num_layers(model_config) + + space_to_input_weights = {} + for k, v in space_to_input_weight_templates.items(): + for j in range(num_layers): + f = lambda x: _template_substitution(x, num_layers=num_layers, layer_idx=j) + space_to_input_weights[f(k)] = [f(_v) for _v in v] + + space_to_output_weights = {} + for k, v in space_to_output_weight_templates.items(): + for j in range(num_layers): + f = lambda x: _template_substitution(x, num_layers=num_layers, layer_idx=j) + space_to_output_weights[f(k)] = [f(_v) for _v in v] + + # ================== Load model, tokenizer for inference and prepare dataset ================== + + model = AutoModel.from_pretrained( + model_path, output_attentions=True, attn_implementation="eager" + ) + tokenizer = AutoTokenizer.from_pretrained(model_path) + + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + tokenize_function = None + if chat_template: + logging.info("Using chat template for inference") + tokenize_function = lambda x: tokenizer.apply_chat_template( + x, + padding="longest", + max_length=max_length, + 
truncation=True, + return_dict=True, + ) + else: + logging.info("Using default tokenizer (no chat template) for inference") + tokenize_function = lambda x: tokenizer( + x, + padding="longest", + max_length=max_length, + truncation=True, + ) + + model.eval() + model.to(device) + if dtype is not None: + model = model.to(dtype=dtype) + + dataset = datasets.load_dataset(dataset)[dataset_subset] + + if dataset_size is not None: + logging.info("Using dataset size %s", dataset_size) + dataset = dataset.select(range(dataset_size)) + + def tokenize(element): + outputs = tokenize_function(element[dataset_column]) + return { + "input_ids": outputs["input_ids"], + "attention_mask": outputs["attention_mask"], + } + + dataset = dataset.map(tokenize).select_columns(["input_ids", "attention_mask"]) + + datasets_dataloader = DataLoader( + dataset, batch_size=batch_size, shuffle=False, collate_fn=DefaultDataCollator() + ) + + feature_storage = {} + storage_dict = {} + + # ================== Hooking into the model ================== + + # NOTE: if the capture input set to True seems confusing, a space's output is a weight recieving input from the space + for k, v in space_to_output_weights.items(): + for weight in v: + weight = clean_name(weight) + model.get_submodule(weight).register_forward_hook( + get_attention_output_hook(feature_storage, k, capture_input=True) + ) + for k, v in space_to_input_weights.items(): + for weight in v: + weight = clean_name(weight) + model.get_submodule(weight).register_forward_hook( + get_attention_output_hook(feature_storage, k, capture_input=False) + ) + + # ================== Inference ================== + + for batch in datasets_dataloader: + with torch.no_grad(): + inputs = {k: v.to(device) for k, v in batch.items()} + outputs = model( + **inputs, output_hidden_states=True, output_attentions=False + ) + + # NOTE: https://huggingface.co/docs/transformers/en/main_classes/output#transformers.modeling_outputs.BaseModelOutput + + # Store attention masks + attention_mask = inputs["attention_mask"] + if "attention_mask" not in feature_storage: + feature_storage["attention_mask"] = attention_mask.cpu().detach() + else: + feature_storage["attention_mask"] = torch.cat( + (feature_storage["attention_mask"], attention_mask.cpu().detach()), + dim=0, + ) + + hidden_states = [ + remove_pads(attention_mask, hidden_state) + for hidden_state in outputs.hidden_states + ] + hidden_states = torch.stack(outputs.hidden_states, dim=1) + + if residual_space not in feature_storage: + feature_storage[residual_space] = hidden_states + else: + feature_storage[residual_space] = torch.cat( + (feature_storage[residual_space], hidden_states), dim=0 + ) + + for space_name, v in storage_dict.items(): + if space_name not in feature_storage: + feature_storage[space_name] = v + else: + feature_storage[space_name] = torch.cat( + (feature_storage[space_name], v), dim=0 + ) + + storage_dict = {} + + # ================== Save activations/features ================== + + logging.info("Feature storage:") + for k, v in feature_storage.items(): + if v is not None: + logging.info(f"{k}: Shape: {v.shape}") + + abs_path = os.path.abspath(model_path) + if os.path.exists(abs_path): + model_path = abs_path + + model_path = model_path.replace("/", "_") + + # create output directory + os.makedirs(out_path, exist_ok=True) + + save_file( + feature_storage, os.path.join(out_path, f"{model_path}_features.safetensor") + ) + + +if __name__ == "__main__": + main() diff --git 
a/mergekit/mergekit/scripts/ABM/extract_permutation_matrices.py b/mergekit/mergekit/scripts/ABM/extract_permutation_matrices.py new file mode 100644 index 0000000000000000000000000000000000000000..75c586926375a6b213333be2f8149a8a16d3cd44 --- /dev/null +++ b/mergekit/mergekit/scripts/ABM/extract_permutation_matrices.py @@ -0,0 +1,226 @@ +import os +import sys +from collections import defaultdict + +import click +import numpy as np +import safetensors.torch +import scipy +import torch + +from mergekit.architecture import _template_substitution, get_architecture_info +from mergekit.common import ModelReference + + +def calc_correlation_matrix(feats): + feats = feats.view(-1, feats.shape[-1]) + + return torch.corrcoef(feats.T) + + +def match_tensors_permute( + absval=False, + correlation_matrix=None, +): + """ + This function is adapted from ZipIt! (https://github.com/gstoica27/ZipIt) + """ + + Om = correlation_matrix.shape[0] // 2 + device = correlation_matrix.device + + mats = [torch.eye(Om, device=device)] + + corr_submatrix = correlation_matrix[:Om, Om:].cpu().numpy() + if absval: + corr_submatrix = np.absolute(corr_submatrix) + _, col_ind = scipy.optimize.linear_sum_assignment(corr_submatrix, maximize=True) + + new_mat = torch.eye(Om, device=device)[torch.tensor(col_ind).long().to(device)] + mats.append(new_mat.T) + + unmerge_mats = mats + + unmerge = torch.cat(unmerge_mats, dim=0) + + merge = torch.cat(mats, dim=0) + merge = merge / (merge.sum(dim=0, keepdim=True) + 1e-5) + + return merge.T, unmerge + + +def match_tensors_permute_MHA( + n_heads=32, + absval=False, + correlation_matrix=None, +): + """ + Handles different head permutations in attention. + Modified version of the function here: https://github.com/nverma1/merging-text-transformers/blob/main/matching_functions.py#L76 + """ + + Om = correlation_matrix.shape[0] // 2 + device = correlation_matrix.device + query_size = Om // n_heads + + mats = [torch.eye(Om, device=device)] + head_perms = [] + + costs = np.ones((n_heads, n_heads)) * -sys.maxsize + + col_inds_storage = defaultdict(lambda: defaultdict(int)) + + for j in range(n_heads): + for k in range(n_heads): + head1_idx = [query_size * j, query_size * (j + 1)] + head2_idx = [query_size * k, query_size * (k + 1)] + + corr_submatrix = ( + correlation_matrix[ + head1_idx[0] : head1_idx[1], + (Om + head2_idx[0]) : (Om + head2_idx[1]), + ] + .cpu() + .numpy() + ) + if absval: + corr_submatrix = np.absolute(corr_submatrix) + + # compute perm for head j & head k + row_ind, col_ind = scipy.optimize.linear_sum_assignment( + corr_submatrix, maximize=True + ) + + costs[j, k] = corr_submatrix[row_ind, col_ind].sum() + + col_inds_storage[j][k] = col_ind + + outer_row_ind, outer_col_ind = scipy.optimize.linear_sum_assignment( + costs, maximize=True + ) + + for j in range(n_heads): + head_1 = outer_row_ind[j] + head_2 = outer_col_ind[j] + + head_perm = col_inds_storage[head_1][head_2] + head_perms.append(torch.tensor(head_perm + query_size * head_2)) + + new_mat = torch.eye(Om, device=device)[ + torch.cat(head_perms).clone().detach().long().to(device) + ] + mats.append(new_mat.T) + + unmerge_mats = mats + + unmerge = torch.cat(unmerge_mats, dim=0) + merge = torch.cat(mats, dim=0) + merge = merge / (merge.sum(dim=0, keepdim=True) + 1e-5) + + return merge.T, unmerge + + +@click.command("mergekit-abm-extract-permutations") +@click.argument("model1-ft", type=str, required=True) +@click.argument("model2-ft", type=str, required=True) +@click.option("--model_path", type=str, required=True, 
help="Model information") +@click.option( + "--out_path", required=True, type=str, help="Output path for metric tensors" +) +@click.option( + "--absval/--no-absval", + required=False, + default=False, + help="Use absolute value on correlation matrices/submatrices while calculating merge/unmerge matrices", +) +@click.option( + "--device", + "-d", + type=str, + default="cpu", + help="Device to compute on (default: cpu)", +) +def main(model1_ft, model2_ft, model_path, out_path, absval, device): + os.makedirs(out_path, exist_ok=True) + + model = ModelReference.model_validate(model_path) + + model_config = model.config() + + model_arch_info = get_architecture_info(model_config) + + _json = model_arch_info.definition + + residual_space = None + kq_space = None + v_space = None + + # extract the residual, attention related spaces + for weight in _json.layer_templates.weights: + if weight.is_kq: + kq_space = weight.output_space + residual_space = weight.input_space + continue + + # assuming order is observed + if ( + not weight.is_kq + and weight.head_split + and (weight.input_space == residual_space) + ): + v_space = weight.output_space + continue + + num_layers = model_arch_info.num_layers(model_config) + + kq_spaces = [] + v_spaces = [] + for j in range(num_layers): + kq_spaces.append( + _template_substitution(kq_space, num_layers=num_layers, layer_idx=j) + ) + v_spaces.append( + _template_substitution(v_space, num_layers=num_layers, layer_idx=j) + ) + + model1_features = safetensors.torch.load_file(model1_ft, device=device) + model2_features = safetensors.torch.load_file(model2_ft, device=device) + + model1_features.pop("attention_mask") + model2_features.pop("attention_mask") + + for feature_space in model1_features.keys(): + concatenated_feature = torch.cat( + (model1_features[feature_space], model2_features[feature_space]), dim=-1 + ) + + correlation_matrix = calc_correlation_matrix(concatenated_feature) + + if feature_space in (kq_spaces + v_spaces): + merge, unmerge = match_tensors_permute_MHA( + correlation_matrix=correlation_matrix, + n_heads=model_config.num_attention_heads, + absval=absval, + ) + + else: + merge, unmerge = match_tensors_permute( + correlation_matrix=correlation_matrix, + absval=absval, + ) + + safetensors.torch.save_file( + {feature_space: merge.contiguous()}, + f"{out_path}/{feature_space}_merge.safetensor", + ) + + safetensors.torch.save_file( + {feature_space: unmerge.contiguous()}, + f"{out_path}/{feature_space}_unmerge.safetensor", + ) + + del merge, unmerge, correlation_matrix, concatenated_feature + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/__init__.py b/mergekit/mergekit/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mergekit/mergekit/scripts/__pycache__/__init__.cpython-310.pyc b/mergekit/mergekit/scripts/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..262083a29e7f84cb19c1f40ea14c7dc662c02025 Binary files /dev/null and b/mergekit/mergekit/scripts/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/scripts/__pycache__/run_yaml.cpython-310.pyc b/mergekit/mergekit/scripts/__pycache__/run_yaml.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8602448c564a52180e3ec1fa851258c19b62e698 Binary files /dev/null and b/mergekit/mergekit/scripts/__pycache__/run_yaml.cpython-310.pyc differ diff --git 
a/mergekit/mergekit/scripts/bakllama.py b/mergekit/mergekit/scripts/bakllama.py new file mode 100644 index 0000000000000000000000000000000000000000..3942c1a5e6d050243a596c3965585dfe444a4333 --- /dev/null +++ b/mergekit/mergekit/scripts/bakllama.py @@ -0,0 +1,85 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import List, Optional + +import click +import yaml +from pydantic import BaseModel + +from mergekit.common import MergeOptions +from mergekit.config import ( + ConditionalParameter, + InputSliceDefinition, + MergeConfiguration, +) +from mergekit.merge import run_merge + + +class LayerSlice(BaseModel): + model: str + start: int + end: int + scale: Optional[float] = None + + +class BakllamaConfig(BaseModel): + layer_slices: List[LayerSlice] + embedding_source: Optional[str] = None + lm_head_source: Optional[str] = None + + +@click.command("bakllama") +@click.argument("config_path", type=click.Path(exists=True, dir_okay=False)) +@click.argument("out_path", type=str) +@click.option( + "--clone-tensors/--no-clone-tensors", + type=bool, + is_flag=True, + help="Clone tensors before saving, to allow multiple occurrences of the same layer", + default=False, +) +@click.option("--fp16/--no-fp16", type=bool, default=False) +def main( + config_path: str, + out_path: str, + clone_tensors: bool, + fp16: bool, +): + """Wrapper for using legacy bakllama configuration files.""" + with open(config_path, "r", encoding="utf-8") as file: + config = BakllamaConfig.model_validate(yaml.safe_load(file)) + + slices = [] + for s in config.layer_slices: + parameters = {} + if s.scale is not None: + parameters["scale"] = ConditionalParameter( + value=s.scale, filter="down_proj" + ) + slices.append( + InputSliceDefinition( + model=s.model, layer_range=(s.start, s.end), parameters=parameters + ) + ) + + merge_config = MergeConfiguration( + merge_method="passthrough", slices=slices, dtype="float16" if fp16 else None + ) + run_merge(merge_config, out_path, MergeOptions(clone_tensors=clone_tensors)) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/evolve.py b/mergekit/mergekit/scripts/evolve.py new file mode 100644 index 0000000000000000000000000000000000000000..02b259822bafaeecee1ce2ca81eb90abece2c587 --- /dev/null +++ b/mergekit/mergekit/scripts/evolve.py @@ -0,0 +1,395 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging +import os +import time +from typing import List, Optional + +import click +import cma +import numpy as np +import pandas +import ray +import torch +import tqdm +import transformers +import yaml + +try: + import wandb +except ImportError: + wandb = None + + +from mergekit.common import ModelReference +from mergekit.evo.config import ( + EvolMergeConfiguration, + ModelGenomeDefinition, + check_for_naughty_config, +) +from mergekit.evo.genome import ModelGenome +from mergekit.evo.strategy import ( + ActorPoolEvaluationStrategy, + BufferedRayEvaluationStrategy, + SerialEvaluationStrategy, +) +from mergekit.merge import run_merge +from mergekit.options import MergeOptions + + +@click.command("mergekit-evolve") +@click.argument("genome-config-path", type=str) +@click.option("--max-fevals", type=int, default=100) +@click.option("--vllm/--no-vllm", is_flag=True, default=False, help="Use vLLM") +@click.option( + "--strategy", + "-s", + type=click.Choice(["pool", "buffered", "serial"]), + default="pool", + help="Evaluation scheduling strategy", +) +@click.option( + "--in-memory/--no-in-memory", + is_flag=True, + default=False, + help="Use in-memory merge & evaluation", +) +@click.option( + "--storage-path", + type=str, + help="Path to storage accessible to all nodes for model storage", + required=True, +) +@click.option("--num-gpus", type=int, help="Number of GPUs to use across all nodes") +@click.option("--merge-cuda/--no-merge-cuda", is_flag=True, default=True) +@click.option("--trust-remote-code/--no-trust-remote-code", is_flag=True, default=False) +@click.option("--allow-crimes/--no-allow-crimes", is_flag=True, default=False) +@click.option("--random-seed", type=int, default=0) +@click.option("--batch-size", type=int, default=None, help="Batch size for evaluation") +@click.option("--sigma0", type=float, default=1 / 6, help="Initial sigma for CMA-ES") +@click.option("use_wandb", "--wandb/--no-wandb", is_flag=True, default=False) +@click.option("--wandb-project", type=str, help="Wandb project name") +@click.option("--wandb-entity", type=str, help="Wandb entity name") +@click.option( + "--task-search-path", + type=str, + multiple=True, + help="Path to search for lmeval tasks", +) +@click.option( + "--i-understand-the-depths-of-the-evils-i-am-unleashing", + "allow_benchmark_tasks", + is_flag=True, + default=False, + help="Allow benchmark tasks as objectives", +) +@click.option( + "--save-final-model/--no-save-final-model", + is_flag=True, + default=True, + help="Save the final merged model", +) +@click.option( + "--reshard/--no-reshard", + is_flag=True, + default=True, + help="Convert models to single-shard safetensors for faster merge", +) +@click.option( + "--timeout", + type=float, + default=None, + help="Maximum time to run the optimization in seconds", +) +@click.option( + "--force-population-size", + type=int, + default=None, + help="Force a specific initial population size for CMA-ES", +) +def main( + genome_config_path: str, + max_fevals: int, + vllm: bool, + strategy: str, + in_memory: bool, + storage_path: Optional[str], + num_gpus: Optional[int], + merge_cuda: bool, + trust_remote_code: bool, + allow_crimes: bool, + random_seed: int, + batch_size: Optional[int], + sigma0: float, + use_wandb: bool, + wandb_project: Optional[str], + wandb_entity: Optional[str], + task_search_path: List[str], + allow_benchmark_tasks: bool, + 
save_final_model: bool, + reshard: bool, + timeout: Optional[float], + force_population_size: Optional[int], +): + config = EvolMergeConfiguration.model_validate( + yaml.safe_load(open(genome_config_path, "r", encoding="utf-8")) + ) + + check_for_naughty_config(config, allow=allow_benchmark_tasks) + + if use_wandb: + if not wandb: + raise RuntimeError("wandb is not installed") + run = wandb.init( + project=wandb_project or "mergekit-evolve", + entity=wandb_entity, + config=config.model_dump(mode="json"), + ) + else: + run = None + + merge_options = MergeOptions( + transformers_cache=os.path.join(storage_path, "transformers_cache"), + lora_merge_cache=os.path.join(storage_path, "lora_merge_cache"), + cuda=merge_cuda, + low_cpu_memory=merge_cuda and not in_memory, + out_shard_size=1_000_000_000_000, # one trillion bytes! + trust_remote_code=trust_remote_code, + allow_crimes=allow_crimes, + random_seed=random_seed, + quiet=True, + read_to_gpu=merge_cuda and not in_memory, + copy_tokenizer=True, + safe_serialization=True, + ) + + # convert models to single-shard safetensors + if reshard: + resharded_models = [] + resharded_base = None + for model in tqdm.tqdm(config.genome.models, desc="Resharding models"): + resharded_models.append( + _reshard_model( + model, + storage_path, + merge_options.lora_merge_cache, + trust_remote_code, + ) + ) + if config.genome.base_model is not None: + resharded_base = _reshard_model( + config.genome.base_model, + storage_path, + merge_options.lora_merge_cache, + trust_remote_code, + ) + else: + resharded_models = config.genome.models + resharded_base = config.genome.base_model + + genome = ModelGenome( + ModelGenomeDefinition.model_validate( + { + **config.genome.model_dump( + exclude=[ + "models", + "base_model", + ] + ), + "models": resharded_models, + "base_model": resharded_base, + } + ), + trust_remote_code=trust_remote_code, + ) + + if strategy == "pool": + strat_cls = ActorPoolEvaluationStrategy + elif strategy == "buffered": + strat_cls = BufferedRayEvaluationStrategy + elif strategy == "serial": + strat_cls = SerialEvaluationStrategy + else: + raise ValueError(f"Unknown strategy {strategy}") + + strat = strat_cls( + config, + genome, + merge_options, + num_gpus=num_gpus, + vllm=vllm, + in_memory=in_memory, + model_storage_path=os.path.join(storage_path, "merged"), + batch_size=batch_size, + task_search_path=task_search_path, + ) + + x0 = genome.initial_genotype(random=config.random_init).view(-1).numpy() + xbest = x0 + xbest_cost = np.inf + + def progress_callback(es: cma.CMAEvolutionStrategy): + nonlocal xbest, xbest_cost + + res = es.result + if use_wandb: + best_params = genome.genotype_to_param_arrays(res.xbest) + mean_params = genome.genotype_to_param_arrays(res.xfavorite) + run.log( + { + "best_score": -res.fbest, + "best_genome": wandb.Table(data=pandas.DataFrame(best_params)), + "mean_genome": wandb.Table(data=pandas.DataFrame(mean_params)), + "mean_std": genome.genotype_to_param_arrays(res.stds), + "evaluations": res.evaluations, + }, + commit=True, + step=res.evaluations, + ) + + if res.fbest < xbest_cost: + xbest = res.xbest + xbest_cost = res.fbest + print(f"New best score: {-xbest_cost:.4f}") + best_yaml = genome.genotype_merge_config(xbest).to_yaml() + with open(os.path.join(storage_path, "best_config.yaml"), "w") as f: + f.write(best_yaml) + print(f"Merge configuration:\n{best_yaml}") + + if use_wandb: + art = wandb.Artifact("best_config", type="merge_config") + art.add_file(os.path.join(storage_path, "best_config.yaml")) + 
run.log_artifact(art) + + def parallel_evaluate(x: List[np.ndarray]) -> List[float]: + print(f"Received {len(x)} genotypes") + res = strat.evaluate_genotypes(x) + + if use_wandb: + res = list(res) + score_mean = np.mean([r["score"] for r in res]) + score_std = np.std([r["score"] for r in res]) + run.log( + { + "population/score_mean": score_mean, + "population/score_std": score_std, + }, + commit=False, + ) + for task in res[0]["results"]: + for metric in res[0]["results"][task]: + values = [r["results"][task][metric] for r in res] + values = [v for v in values if v is not None] + if not values or all(isinstance(v, str) for v in values): + continue + + mean = np.mean(values) + max_val = max(values) + min_val = min(values) + + metric_pretty = metric.replace(",none", "") + if metric_pretty.endswith("_stderr"): + # don't log stats for stderr that's just silly + continue + + run.log( + { + f"population/{task}_{metric_pretty}_mean": mean, + f"population/{task}_{metric_pretty}_max": max_val, + f"population/{task}_{metric_pretty}_min": min_val, + }, + commit=False, + ) + + return [-x["score"] for x in res] # maximize + + try: + cma_opts = {"maxfevals": max_fevals, "timeout": timeout} + if force_population_size is not None: + cma_opts["popsize"] = force_population_size + xbest, es = cma.fmin2( + None, + parallel_objective=parallel_evaluate, + x0=x0, + sigma0=sigma0, + options=cma_opts, + callback=progress_callback, + ) + xbest_cost = es.result.fbest + except KeyboardInterrupt: + ray.shutdown() + + print("!!! OPTIMIZATION COMPLETE !!!") + print(f"Best cost: {xbest_cost:.4f}") + print() + + # pause for a bit to let any CUDA-using processes clean up + time.sleep(1.0) + + # save the best merge configuration using original model references + genome_pretty = ModelGenome(config.genome, trust_remote_code=trust_remote_code) + best_config = genome_pretty.genotype_merge_config(xbest) + print("Best merge configuration:") + print(best_config.to_yaml()) + + if save_final_model: + print("Saving final model...") + run_merge(best_config, os.path.join(storage_path, "final_model"), merge_options) + + +def _reshard_model( + model: ModelReference, storage_path: str, merge_cache: str, trust_remote_code: bool +) -> ModelReference: + merged = model.merged( + cache_dir=merge_cache, + trust_remote_code=trust_remote_code, + ) + out_path = os.path.join( + storage_path, + "input_models", + merged.model._unique_id(), + ) + + if os.path.exists(out_path): + logging.info(f"Using existing resharded model at {out_path}") + return ModelReference(model=out_path) + + model_hf = transformers.AutoModelForCausalLM.from_pretrained( + merged.model.path, + revision=merged.model.revision, + trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16, + cache_dir=os.path.join(storage_path, "transformers_cache"), + ) + model_hf.save_pretrained( + out_path, safe_serialization=True, out_shard_size=1_000_000_000_000 + ) + try: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model.model.path, + revision=model.model.revision, + trust_remote_code=trust_remote_code, + use_fast=True, + ) + tokenizer.save_pretrained(out_path) + except Exception as e: + logging.warning(f"Could not save tokenizer for {model.model}", exc_info=e) + + return ModelReference(model=out_path) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/extract_lora.py b/mergekit/mergekit/scripts/extract_lora.py new file mode 100644 index 0000000000000000000000000000000000000000..69c010bb1bb0aafe271eedea666b0f0bc7537e07 --- /dev/null +++ 
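The optimization loop in `mergekit-evolve` above hands CMA-ES a parallel objective that scores an entire population per ask; in the script this is `strat.evaluate_genotypes`, while the sketch below substitutes a toy sphere function just to show the `cma.fmin2` calling pattern (option values are illustrative).

```python
import cma
import numpy as np


def score_population(genotypes):
    # Scores a whole population at once; mergekit-evolve returns *negated*
    # scores because CMA-ES minimizes. Here: a toy sphere function.
    return [float(np.sum(np.square(g))) for g in genotypes]


x0 = np.random.uniform(-1.0, 1.0, size=4)
xbest, es = cma.fmin2(
    None,  # no serial objective; the parallel one does all the work
    parallel_objective=score_population,
    x0=x0,
    sigma0=1 / 6,
    options={"maxfevals": 200, "verbose": -9},
)
print(xbest, es.result.fbest)
```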
b/mergekit/mergekit/scripts/extract_lora.py @@ -0,0 +1,595 @@ +import json +import logging +import os +from typing import Any, Dict, List, Optional, Tuple + +import bitsandbytes as bnb +import click +import torch +from peft.tuners.lora import QuantLinear +from safetensors.torch import save_file +from tqdm import tqdm +from transformers import AutoModelForCausalLM +from transformers.pytorch_utils import Conv1D + +from mergekit.card import generate_card_lora +from mergekit.common import ModelReference +from mergekit.io import LazyTensorLoader + + +def low_rank_decomposition( + weight: torch.Tensor, max_rank: int +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Decompose a 2D matrix into low-rank matrices L and R using SVD. + + :param weight: The matrix to decompose, of shape (H, W) + :param max_rank: The maximum rank of the decomposition + :return: A tuple of tensors (L, R) + """ + assert ( + weight.dim() == 2 + ), f"Only support 2D matrix, but input has {weight.dim()} dimensions." + assert ( + max_rank >= 1 + ), f"Maximum rank must be a positive integer, but input max_rank={max_rank}." + + dtype = weight.dtype + + U, S, Vh = torch.linalg.svd(weight.float(), full_matrices=False) + + final_rank = min(min(weight.shape), max_rank) + + # Distribute S to both to improve numerical precision. + sqrt_S = torch.sqrt(torch.diag(S[:final_rank])) + L = sqrt_S @ Vh[:final_rank, :] + R = U[:, :final_rank] @ sqrt_S + + return L.to(dtype), R.to(dtype) + + +def decompose_delta_weight( + base_weight: torch.Tensor, + finetuned_weight: torch.Tensor, + max_rank: int, + device: Optional[str] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Decompose the delta weight into low-rank matrices L and R. + + :param new_weight: The updated weight matrix after applying LoRA + :param base_weight: The original weight matrix before LoRA + :param max_rank: The maximum rank for the low-rank decomposition + :param device: The device to perform computation on + :return: A tuple of tensors (L, R) + """ + assert ( + base_weight.size() == finetuned_weight.size() + ), f"Mismatched dimensions: {base_weight.size()} != {finetuned_weight.size()}" + + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + + base_weight = base_weight.to(device) + finetuned_weight = finetuned_weight.to(device) + + delta_weight = finetuned_weight - base_weight + + L, R = low_rank_decomposition(delta_weight, max_rank) + + return L, R + + +def get_model_details( + model_id: str, skip_undecomposable: bool +) -> List[Tuple[str, str, torch.Size]]: + """ + Retrieve architectural details of a given pre-trained model. 
+ + :param model_id: The identifier of the pre-trained model to load + :param skip_undecomposable: Skip saving undecomposable modules + :return: A list of tuples where each tuple contains: + - type: The type of the module ('embedding', 'linear', or 'to_save') + - name: The full name of the module + - size: The dimensions of the module's weight tensor + """ + + # Avoid loading weights as we won't need them + pretrained_model = AutoModelForCausalLM.from_pretrained( + model_id, state_dict={}, device_map="meta" + ) + + module_details = [] + + for name, module in pretrained_model.named_modules(): + if module == pretrained_model.get_input_embeddings(): + # if isinstance(module, torch.nn.Embedding): + module_details.append(("embedding", name, module.weight.size())) + elif module == pretrained_model.get_output_embeddings(): + # if isinstance(module, torch.nn.Embedding): + module_details.append(("output", name, module.weight.size())) + elif hasattr(module, "weight") and isinstance(module.weight, torch.Tensor): + if ( + # SEE: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/model.py + isinstance( + module, + ( + torch.nn.Linear, + torch.nn.Conv2d, + bnb.nn.Linear4bit, + bnb.nn.Linear8bitLt, + QuantLinear, + Conv1D, + ), + ) + or ( + "Linear" in module.__class__.__name__ + and module.__class__.__name__ + not in ("LlamaLinearScalingRotaryEmbedding",) + ) + ): + module_details.append(("linear", name, module.weight.size())) + elif not skip_undecomposable: + module_details.append(("to_save", name, module.weight.size())) + else: + logging.info(f"Skipping undecomposable module '{name}'.") + + return module_details + + +def validate_and_combine_details( + base_model_id: str, + finetuned_model_id: str, + skip_undecomposable: bool, + extend_vocab: bool, +) -> List[Tuple[str, str]]: + """ + Validate and combine details from a base model and a fine-tuned model. 
+ + :param base_model_id: The identifier for the base model + :param finetuned_model_id: The identifier for the fine-tuned model + :param skip_undecomposable: Skip saving undecomposable modules + :return: A list of tuples with the type and name of the validated/combined model layers + """ + + base_model_details = get_model_details(base_model_id, skip_undecomposable) + finetuned_model_details = get_model_details(finetuned_model_id, skip_undecomposable) + + module_details = [] + + base_model_embedding_size = None + finetuned_model_embedding_size = None + + for i, (base_layer, finetuned_layer) in enumerate( + zip(base_model_details, finetuned_model_details) + ): + base_type, base_name, base_size = base_layer + finetuned_type, finetuned_name, finetuned_size = finetuned_layer + + assert ( + base_type == finetuned_type + ), f"Layer type mismatch: {base_type} != {finetuned_type}" + assert ( + base_name == finetuned_name + ), f"Layer name mismatch: {base_name} != {finetuned_name}" + + if base_type == "embedding": + base_model_embedding_size = base_size[0] + + if finetuned_type == "embedding": + finetuned_model_embedding_size = finetuned_size[0] + + # Fine-tuned models with added vocab will have have their extra rows truncated unless `extend_vocab` is specified + if base_type != "to_save" and finetuned_size[0] > base_size[0]: + assert ( + base_size[1] == finetuned_size[1] + ), f"Column dimension mismatch in layer '{base_name}': {base_size} != {finetuned_size}" + + if base_type == "embedding" or base_type == "output": + if not extend_vocab: + logging.warning( + f"Finetuned module '{base_name}' will have {finetuned_size[0] - base_size[0]} rows truncated for weight decomposition! To preserve all embeddings, invoke script with --extend-vocab" + ) + else: + logging.warning( + f"Base module '{base_name}' will have {finetuned_size[0] - base_size[0]} rows added for weight decomposition. Make sure to call `model.resize_token_embeddings({finetuned_size[0]})` before applying LoRA for inference!" + ) + else: + logging.warning( + f"Finetuned module '{base_name}' will have {finetuned_size[0] - base_size[0]} rows truncated for weight decomposition!" + ) + + else: + assert ( + base_size == finetuned_size + ), f"Dimension mismatch in layer '{base_name}': {base_size} != {finetuned_size}" + + module_details.append((base_type, base_name)) + + return module_details, base_model_embedding_size, finetuned_model_embedding_size + + +def extract_lora( + module_details: List[Tuple[str, str]], + base_model_ref: ModelReference, + finetuned_model_ref: ModelReference, + max_rank: int, + extend_vocab: bool, + no_lazy_unpickle: bool, + device: Optional[str], +) -> Tuple[Dict[str, torch.Tensor], Dict[str, int]]: + """ + Process module details to decompose weights and generate LoRA weights and ranks. + + :param module_details: List of module details. + :param base_model_ref: Reference to the base model. + :param finetuned_model_ref: Reference to the fine-tuned model. + :param max_rank: The maximum rank for the low-rank decomposition. + :param no_lazy_unpickle: Flag to disable lazy unpickle. + :param device: The device to perform computation on. + :return: A tuple containing LoRA weights dictionary and ranks dictionary. 
+ """ + + base_loader = LazyTensorLoader( + base_model_ref.tensor_index(), lazy_unpickle=(not no_lazy_unpickle) + ) + finetuned_loader = LazyTensorLoader( + finetuned_model_ref.tensor_index(), lazy_unpickle=(not no_lazy_unpickle) + ) + + lora_weights = {} + ranks = {} + + for module_type, module_name in tqdm(module_details): + base_weight = base_loader.get_tensor(f"{module_name}.weight") + finetuned_weight = finetuned_loader.get_tensor(f"{module_name}.weight") + + if module_type == "to_save": + lora_weights[ + f"base_model.model.{module_name}.weight" + ] = finetuned_weight.to("cpu").contiguous() + + logging.info( + f"[{module_type}] {module_name}: output_dims=({finetuned_weight.shape})" + ) + + else: + if finetuned_weight.shape[0] > base_weight.shape[0]: + if extend_vocab: + print(f"Extra tokens found!, module name : {module_name}") + + new_base_weight = torch.empty( + finetuned_weight.shape, device=base_weight.device + ) + new_base_weight.normal_(mean=0.0, std=0.02) + + # Copy original base_weight values into the new tensor + new_base_weight[: base_weight.shape[0]] = base_weight + + if module_type == "embedding" or module_type == "output": + lora_weights[ + f"base_model.model.{module_name}.base_layer.weight" + ] = new_base_weight.to("cpu").contiguous() + + base_weight = new_base_weight + else: + logging.warning( + f"Finetuned module '{module_name}' will have {finetuned_weight.shape[0] - base_weight.shape[0]} rows truncated for weight decomposition!" + ) + finetuned_weight = finetuned_weight[: base_weight.shape[0]] + + if module_type == "embedding": + # These need to be transposed for some reason... + lora_embedding_A, lora_embedding_B = decompose_delta_weight( + base_weight.T, finetuned_weight.T, max_rank, device=device + ) + + lora_weights[ + f"base_model.model.{module_name}.lora_embedding_A" + ] = lora_embedding_A.to("cpu").contiguous() + lora_weights[ + f"base_model.model.{module_name}.lora_embedding_B" + ] = lora_embedding_B.to("cpu").contiguous() + + ranks[module_name] = lora_embedding_A.shape[0] + + logging.info( + f"[{module_type}] {module_name}: final_rank={ranks[module_name]}, " + f"input_dims=({base_weight.shape}), " + f"output_dims=({lora_embedding_A.shape}, {lora_embedding_B.shape})" + ) + + else: + lora_A, lora_B = decompose_delta_weight( + base_weight, finetuned_weight, max_rank, device=device + ) + + lora_weights[ + f"base_model.model.{module_name}.lora_A.weight" + ] = lora_A.to("cpu").contiguous() + lora_weights[ + f"base_model.model.{module_name}.lora_B.weight" + ] = lora_B.to("cpu").contiguous() + + ranks[module_name] = lora_A.shape[0] + + logging.info( + f"[{module_type}] {module_name}: final_rank={ranks[module_name]}, " + f"input_dims=({base_weight.shape}), " + f"output_dims=({lora_A.shape}, {lora_B.shape})" + ) + + return lora_weights, ranks + + +def reconstruct_invocation(args: Dict[str, Any]) -> str: + """ + Reconstruct the command-line invocation string based on the given arguments. + + :param args: A dictionary containing the command arguments with keys matching the parameter names. + Expected keys are 'base_model', 'finetuned_model', 'out_path', 'no_lazy_unpickle', + 'skip_undecomposable, 'max_rank', 'model_name', 'device' and 'verbose'. + :return: The reconstructed command-line invocation string. 
+ """ + + # Provide a default value for out_path if it's not in the dictionary + out_path = args.get("out_path", "OUTPUT_PATH") + + invocation = f"mergekit-extract-lora {args['finetuned_model']} {args['base_model']} {out_path}" + if args.get("no_lazy_unpickle"): + invocation += " --no-lazy-unpickle" + if args.get("skip_undecomposable"): + invocation += " --skip-undecomposable" + if args.get("max_rank"): + invocation += f" --rank={args['max_rank']}" + if args.get("extend_vocab"): + invocation += " --extend-vocab" + if args.get("model_name"): + invocation += f" --model_name={args['model_name']}" + if args.get("device"): + invocation += f" --device={args['device']}" + if args.get("verbose"): + invocation += " --verbose" + + return invocation + + +def create_peft_config( + base_model_name_or_path: str, + rank: int, + alpha: int, + rank_pattern: Dict[str, int], + alpha_pattern: Dict[str, int], + target_modules: List[str], + modules_to_save: List[str], +) -> Dict[str, Any]: + """ + Create a PEFT (Parameter-Efficient Fine-Tuning) configuration dictionary. + + :param base_model_name_or_path: The path or name of the base model. + :param rank: The rank for the low-rank adaptation. + :param alpha: The scaling factor for low-rank adaptation. + :param rank_pattern: A dictionary specifying rank patterns for different modules. + :param alpha_pattern: A dictionary specifying alpha patterns for different modules. + :param target_modules: A list of module names to apply the adaptation to. + :param modules_to_save: A list of module names to save during the adaptation. + :return: A dictionary containing the PEFT configuration. + """ + return { + "alpha_pattern": alpha_pattern, + "auto_mapping": None, + "base_model_name_or_path": base_model_name_or_path, + "bias": "none", + "fan_in_fan_out": False, + "inference_mode": True, + "init_lora_weights": True, + "layers_pattern": None, + "layers_to_transform": None, + "loftq_config": {}, + "lora_alpha": alpha, + "lora_dropout": 0, + "megatron_config": None, + "megatron_core": "megatron.core", + "modules_to_save": modules_to_save, + "peft_type": "LORA", + "r": rank, + "rank_pattern": rank_pattern, + "revision": None, + "target_modules": target_modules, + "task_type": "CAUSAL_LM", + "use_rslora": False, + } + + +def save_model_and_config( + lora_weights: Dict[str, torch.Tensor], + ranks: Dict[str, int], + extended: bool, + embedding_size: int, + module_details: List[Tuple[str, str]], + invocation_args: Dict[str, Any], +) -> None: + """ + Save the PEFT model and configuration to the specified output path. + + :param lora_weights: The LoRA weights. + :param ranks: The ranks of the LoRA weights. + :param module_details: Details of the model modules. + :param invocation_args: The command-line invocation arguments. + """ + + base_model_ref = ModelReference.parse(invocation_args["base_model"]) + finetuned_model_ref = ModelReference.parse(invocation_args["finetuned_model"]) + out_path = invocation_args["out_path"] + model_name = invocation_args["model_name"] + + # Work out the actual final rank and only retain those that were lower. 
+ final_max_rank = max(ranks.values()) + ranks = {k: v for k, v in ranks.items() if v != final_max_rank} + + lora_config = create_peft_config( + base_model_name_or_path=base_model_ref.model.path, + rank=final_max_rank, + alpha=final_max_rank, # Setting the alpha to the rank value as `peft` will scale the LoRA weights by alpha/r when applying the adapter + rank_pattern=ranks, + alpha_pattern=ranks, + target_modules=list( + set( + module_name.split(".")[-1] + for module_type, module_name in module_details + if module_type != "to_save" + ) + ), + modules_to_save=list( + set( + module_name.split(".")[-1] + for module_type, module_name in module_details + if module_type == "to_save" + ) + ), + ) + + with open(os.path.join(out_path, "adapter_config.json"), "w") as f: + json.dump(lora_config, f, indent=2) + + save_file(lora_weights, os.path.join(out_path, "adapter_model.safetensors")) + + invocation_args.pop("out_path") # don't include out_path for privacy + invocation = reconstruct_invocation(invocation_args) + + card_md = generate_card_lora( + base_model_ref=base_model_ref, + finetuned_model_ref=finetuned_model_ref, + invocation=invocation, + extended=extended, + vocab_size=embedding_size, + name=model_name, + ) + + with open(os.path.join(out_path, "README.md"), "w", encoding="utf-8") as fp: + fp.write(card_md) + + logging.info(f"PEFT LoRA adapters saved to {out_path}") + + +@click.command("mergekit-extract-lora") +@click.argument("finetuned_model", type=str) +@click.argument("base_model", type=str) +@click.argument("out_path", type=click.Path()) +@click.option( + "--no-lazy-unpickle", + type=bool, + is_flag=True, + default=False, + help="Disable lazy unpickler (more stable, higher memory usage)", +) +@click.option( + "--skip-undecomposable", + type=bool, + is_flag=True, + default=False, + help="Skip saving undecomposable modules in the LoRA", +) +@click.option( + "--rank", + "max_rank", + type=int, + default=32, + help="The maximum rank for the low-rank decomposition", +) +@click.option( + "--extend-vocab", + is_flag=True, + default=False, + help="Extend vocabulary for models with additional tokens instead of truncating", +) +@click.option( + "--model_name", + type=str, + default=None, + help="Name of the resulting model (shown in the model card)", +) +@click.option( + "--device", + type=str, + default=None, + help="PyTorch device to perform SVD computation on", +) +@click.option( + "--verbose", "-v", type=bool, is_flag=True, default=False, help="Verbose logging" +) +def main( + finetuned_model: str, + base_model: str, + out_path: str, + no_lazy_unpickle: bool, + skip_undecomposable: bool, + max_rank: int, + extend_vocab: bool, + model_name: str, + device: str, + verbose: bool, +) -> None: + """ + Decomposes delta weights between a base model and a finetuned model, saving a PEFT model to the specified output path. + + \b + Arguments: + FINETUNED_MODEL - the model ID or path to use as the PEFT extraction target model. + BASE_MODEL - the model ID or path to use as the base model. + OUT_PATH - the output path where the PEFT model will be saved. 
+ """ + + invocation_args = { + "base_model": base_model, + "finetuned_model": finetuned_model, + "max_rank": max_rank, + "extend_vocab": extend_vocab, + "device": device, + "out_path": out_path, + "model_name": model_name, + "no_lazy_unpickle": no_lazy_unpickle, + "skip_undecomposable": skip_undecomposable, + "verbose": verbose, + } + + logging.basicConfig(level=logging.INFO if verbose else logging.WARNING) + + os.makedirs(out_path, exist_ok=True) + + base_model_ref = ModelReference.parse(base_model) + finetuned_model_ref = ModelReference.parse(finetuned_model) + + ( + module_details, + base_model_embedding_size, + finetuned_model_embedding_size, + ) = validate_and_combine_details( + ModelReference.parse(base_model).model.path, + ModelReference.parse(finetuned_model).model.path, + skip_undecomposable, + extend_vocab, + ) + + lora_weights, ranks = extract_lora( + module_details, + base_model_ref, + finetuned_model_ref, + max_rank, + extend_vocab, + no_lazy_unpickle, + device, + ) + + save_model_and_config( + lora_weights, + ranks, + finetuned_model_embedding_size > base_model_embedding_size and extend_vocab, + finetuned_model_embedding_size if extend_vocab else base_model_embedding_size, + module_details, + invocation_args, + ) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/layershuffle.py b/mergekit/mergekit/scripts/layershuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..89fa858c49696b505febfc15876dded987bd4cf8 --- /dev/null +++ b/mergekit/mergekit/scripts/layershuffle.py @@ -0,0 +1,144 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +import random +from typing import List + +import click +import yaml + +from mergekit.architecture import get_architecture_info +from mergekit.common import ModelReference +from mergekit.config import ( + InputSliceDefinition, + MergeConfiguration, + OutputSliceDefinition, +) +from mergekit.merge import run_merge +from mergekit.options import MergeOptions, add_merge_options + + +@click.command("mergekit-layershuffle") +@click.argument("out_path", type=str) +@click.option("--model", "-m", multiple=True, type=str, help="Add a model to the merge") +@click.option( + "--weight", + "-w", + multiple=True, + type=float, + default=[], + show_default=False, + help="Weighting for a model", +) +@click.option( + "--print-yaml/--no-print-yaml", + is_flag=True, + help="Print YAML merge config for resulting model", +) +@click.option( + "--write-yaml", + type=click.Path(writable=True), + help="Path to write YAML merge config to", +) +@click.option( + "--dry-run", is_flag=True, help="Generate a config but do not run the merge" +) +@click.option("--fp16/--no-fp16", is_flag=True, help="Use FP16 precision") +@click.option( + "--full-random/--no-full-random", + is_flag=True, + help="Randomize layer index as well as source model", +) +@add_merge_options +def main( + out_path: str, + model: List[str], + weight: List[float], + print_yaml: bool, + write_yaml: bool, + dry_run: bool, + fp16: bool, + full_random: bool, + merge_options: MergeOptions, +): + models = [ModelReference.parse(m) for m in model] + + m0_cfg = models[0].config() + arch_info = get_architecture_info(m0_cfg) + total_num_layers = arch_info.num_layers(m0_cfg) + + out_slices: List[OutputSliceDefinition] = [] + + if full_random: + for model, frac in zip(models, weight): + cfg = model.config() + num_layers = int(arch_info.num_layers(cfg) * frac) + for _ in range(num_layers): + src_idx = random.randrange(0, num_layers) + out_slices.append( + OutputSliceDefinition( + sources=[ + InputSliceDefinition( + model=str(model), + layer_range=(src_idx, src_idx + 1), + ) + ] + ) + ) + random.shuffle(out_slices) + else: + for layer_idx in range(total_num_layers): + src_model = random.choices(models, weights=weight, k=1)[0] + if out_slices and out_slices[-1].sources[0].model == str(src_model): + out_slices[-1].sources[0].layer_range = ( + out_slices[-1].sources[0].layer_range[0], + layer_idx + 1, + ) + else: + out_slices.append( + OutputSliceDefinition( + sources=[ + InputSliceDefinition( + model=str(src_model), + layer_range=(layer_idx, layer_idx + 1), + ) + ] + ) + ) + merge_config = MergeConfiguration( + merge_method="passthrough", slices=out_slices, dtype="float16" if fp16 else None + ) + + if print_yaml or write_yaml: + yaml_str = yaml.dump(merge_config.model_dump(exclude_none=True, mode="json")) + + if print_yaml: + print(yaml_str) + if write_yaml: + with open(write_yaml, "w", encoding="utf-8") as file: + file.write(yaml_str) + + if dry_run: + return + + run_merge( + merge_config, + out_path, + options=merge_options, + ) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/legacy.py b/mergekit/mergekit/scripts/legacy.py new file mode 100644 index 0000000000000000000000000000000000000000..06353d5f860061d0cd55075e06a97cedbf2c7f68 --- /dev/null +++ b/mergekit/mergekit/scripts/legacy.py @@ -0,0 +1,142 @@ +# Copyright (C) 2024 Charles O. 
Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +from typing import List, Optional + +import click +import yaml + +from mergekit.config import InputModelDefinition, MergeConfiguration +from mergekit.merge import run_merge +from mergekit.options import MergeOptions, add_merge_options + + +@click.command("mergekit-legacy") +@click.argument("out_path", type=str) +@click.option( + "--merge", "merge", type=str, multiple=True, help="Add a model to the merge" +) +@click.option( + "--density", + "density", + type=float, + multiple=True, + default=[], + help="Fraction of weights to keep for each model (ties only)", +) +@click.option( + "--weight", + "weight", + type=float, + multiple=True, + default=[], + help="Weighting for a model (default 1.0 for all models if not specified)", +) +@click.option( + "--method", "method", type=str, default="ties", help="Method used to merge models" +) +@click.option( + "--base-model", "base_model", type=str, default=None, help="Base model for merge" +) +@click.option( + "--normalize/--no-normalize", + "normalize", + is_flag=True, + default=True, + help="Divide merged parameters by the sum of weights", +) +@click.option( + "--int8-mask/--no-int8-mask", + "int8_mask", + is_flag=True, + help="Store intermediate masks in int8 to save memory", +) +@click.option("--bf16/--no-bf16", "bf16", is_flag=True, help="Use bfloat16") +@click.option( + "--naive-count/--no-naive-count", + "naive_count", + is_flag=True, + help="Use naive sign count instead of weight (ties only)", +) +@click.option( + "--print-yaml/--no-print-yaml", + "print_yaml", + is_flag=True, + help="Print generated YAML configuration", +) +@add_merge_options +def main( + out_path: str, + merge: List[str], + density: List[float], + weight: List[float], + method: str, + base_model: Optional[str], + normalize: bool, + int8_mask: bool, + bf16: bool, + naive_count: bool, + print_yaml: bool, + merge_options: MergeOptions, +): + """Wrapper for using a subset of legacy-style script arguments.""" + models = [InputModelDefinition(model=model, parameters={}) for model in merge] + if base_model and base_model not in merge: + models.append(InputModelDefinition(model=base_model, parameters={})) + + parameters = {} + + if density: + if len(density) == 1: + density = [density[0]] * len(models) + for idx, d in enumerate(density): + models[idx].parameters["density"] = d + + if method == "slerp": + assert len(weight) == 1, "Must specify exactly one weight for SLERP" + parameters["t"] = weight[0] + else: + if weight: + if len(weight) == 1: + weight = [weight[0]] * len(models) + for idx, w in enumerate(weight): + models[idx].parameters["weight"] = w + + if int8_mask: + parameters["int8_mask"] = True + if naive_count: + parameters["consensus_method"] = "count" + parameters["normalize"] = normalize + + merge_config = MergeConfiguration( + merge_method=method, + models=models, + parameters=parameters, + 
base_model=base_model, + dtype="bfloat16" if bf16 else None, + ) + + if print_yaml: + print(yaml.dump(merge_config.model_dump(mode="json", exclude_none=True))) + + run_merge( + merge_config, + out_path, + options=merge_options, + ) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/megamerge.py b/mergekit/mergekit/scripts/megamerge.py new file mode 100644 index 0000000000000000000000000000000000000000..59fc7a429d9d5898df8d63bf28c1fbdd518aaaca --- /dev/null +++ b/mergekit/mergekit/scripts/megamerge.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +""" +Merges multiple models and their dependencies into a single model +using multiple merge yaml documents in a single yaml file as the input +""" + +import logging +import os +import sys +from pathlib import Path + +import click +import yaml + +from mergekit.config import MergeConfiguration +from mergekit.merge import MergeOptions, run_merge +from mergekit.options import add_merge_options + +merges = {} + + +def has_circular_dependency(nodes): + """ + Detects circular in merges dependencies using DFS + Returns the node where the cycle is detected + """ + + def dfs(node, visited, stack): + """ + Returns True if a cycle is detected + """ + visited[node] = True + stack[node] = True + + for dependency in nodes[node]["deps"]: + if not visited[dependency]: + if dfs(dependency, visited, stack): + return True + elif stack[dependency]: + return True + + stack[node] = False + return False + + visited = {key: False for key in nodes} + stack = {key: False for key in nodes} + + for node in nodes: + if not visited[node]: + if dfs(node, visited, stack): + return node + + return None + + +def merge(m: str, merge_options: MergeOptions, force: bool, out_path: Path): + """ + Merges a model and its dependencies + + Params: + m: name of the model to merge + merge_options: MergeOptions + force: overwrite existing merge results + out_path: output path + """ + # check if output_path exists + if os.path.exists(out_path / m): + if not force: + logging.info("Skipping %s as it already exists", m) + del merges[m] + return + logging.info("Overwriting %s as --force was specified", m) + + if len(merges[m]["deps"]) != 0: + for dep in merges[m]["deps"]: + if dep in merges: + merge(dep, merge_options, force, out_path) + + logging.info("Merging model %s", m) + merge_config: MergeConfiguration = MergeConfiguration.model_validate(merges[m]) + run_merge( + merge_config, + str(out_path / merges[m]["name"]), + options=merge_options, + ) + del merges[m] + + +def add_model_deps(model: str, name: str, out_path: Path): + """ + Adds a model to `name`s dependencies if it is not already there and is a merge + """ + model_lora = model.split("+") + # name must not have a slash to avoid path traversal + # therefore, we can use it to check if its a merge from the config + if "/" not in model_lora[0]: + # avoid duplicate deps + if model_lora[0] not in merges[name]["deps"]: + merges[name]["deps"].append(model_lora[0]) + model = str(out_path / model_lora[0]) + if len(model_lora) == 2: + model += "+" + model_lora[1] + + return model + + +@click.command("mergekit-mega") +@click.argument("config_file") +@click.argument("out_path") +@click.option( + "--verbose", "-v", type=bool, default=False, is_flag=True, help="Verbose logging" +) +@click.option( + "--force", + "-f", + type=bool, + default=False, + is_flag=True, + help="Overwrite existing merge results instead of skipping them", +) +@click.option( + "--require-nameless", + "-R", + type=bool, + default=False, + is_flag=True, + 
help="Enforces exactly one unnamed merge in the YAML, which will inherit the input file's name.", +) +@add_merge_options +def main( + merge_options: MergeOptions, + config_file: str, + out_path: str, + force: bool, + verbose: bool, + require_nameless: bool, +): + """ + Main entrypoint for mergekit-mega command see module docstring for more info + Params are supplied by click decorators + """ + logging.basicConfig(level=logging.INFO if verbose else logging.WARNING) + + out_path = Path(out_path) + final_found = False + + with open(config_file, "r", encoding="utf-8") as f: + data = yaml.load_all(f, Loader=yaml.FullLoader) + + for d in data: + if "name" not in d: + if final_found: + logging.error("Only one merge must not have a name") + sys.exit(1) + # this sets the name of the final merge to the config file name without the extension + d["name"] = os.path.basename(config_file).rsplit(".", maxsplit=1)[0] + final_found = True + + if "/" in d["name"]: + logging.error("name must not contain a slash") + sys.exit(1) + + merges[d["name"]] = d + merges[d["name"]]["deps"] = [] + if "base_model" in d: + d["base_model"] = add_model_deps(d["base_model"], d["name"], out_path) + if "slices" in d: + for slc in d["slices"]: + for src in slc["sources"]: + src["model"] = add_model_deps(src["model"], d["name"], out_path) + if "models" in d: + for mdl in d["models"]: + mdl["model"] = add_model_deps(mdl["model"], d["name"], out_path) + + if require_nameless and not final_found: + logging.error("No final merge found") + sys.exit(1) + + logging.info("Merging: %s", ", ".join(merges)) + + if (dep := has_circular_dependency(merges)) is not None: + logging.error("Circular dependency detected: %s", dep) + sys.exit(1) + + while len(merges) != 0: + m = list(merges.keys())[0] + merge(m, merge_options, force, out_path) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/moe.py b/mergekit/mergekit/scripts/moe.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0c11f71da7af66946b8823e32664bddd84c3d3 --- /dev/null +++ b/mergekit/mergekit/scripts/moe.py @@ -0,0 +1,231 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +import logging +import os +import sys +from typing import List + +import click +import transformers +import yaml + +from mergekit.merge import MergeOptions +from mergekit.moe import ALL_OUTPUT_ARCHITECTURES, MoEOutputArchitecture +from mergekit.moe.config import MoEMergeConfig, is_bad_config +from mergekit.moe.router import get_gate_params, warn_degenerate_gates +from mergekit.options import add_merge_options + + +def build( + config: MoEMergeConfig, + out_path: str, + merge_options: MergeOptions, + load_in_4bit: bool = False, + load_in_8bit: bool = False, + device: str = "auto", + allow_all_same: bool = False, + verbose: bool = False, +): + if is_bad_config(config, allow_all_same=allow_all_same): + sys.exit(1) + + base_model = config.base_model + out_arch = select_output_arch(config, merge_options, verbose=verbose) + + tokenizer = transformers.AutoTokenizer.from_pretrained( + base_model.model.path, revision=base_model.model.revision + ) + tokenizer.padding_side = "left" + tokenizer.pad_token_id = tokenizer.bos_token_id + if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + + logging.info("Getting gate parameters...") + need_gates = list(config.experts) + if config.shared_experts: + has_prompts = any(e.positive_prompts for e in config.shared_experts) + assert all( + bool(e.positive_prompts) == has_prompts for e in config.shared_experts + ), "Must specify prompts for all shared experts or none, not a mix" + if has_prompts: + need_gates.extend(config.shared_experts) + + gate_vecs = get_gate_params( + base_model, + tokenizer, + need_gates, + mode=config.gate_mode, + load_in_4bit=load_in_4bit, + load_in_8bit=load_in_8bit, + lazy_unpickle=merge_options.lazy_unpickle, + trust_remote_code=merge_options.trust_remote_code, + device=device, + ) + # gate_vecs: (num_layers, num_experts, hidden_size) + router_weights = gate_vecs[:, : len(config.experts), :] + shared_router_weights = gate_vecs[:, len(config.experts) :, :] + warn_degenerate_gates(gate_vecs) + + out_arch.write_model( + out_path, + config, + merge_options, + router_weights=[router_weights[i, ...] for i in range(router_weights.shape[0])], + shared_router_weights=[ + shared_router_weights[i, ...] for i in range(router_weights.shape[0]) + ], + ) + + if merge_options.copy_tokenizer: + logging.info("Saving tokenizer...") + tokenizer.save_pretrained(out_path, safe_serialization=True) + + logging.info("Done.") + + +def select_output_arch( + config: MoEMergeConfig, + merge_options: MergeOptions, + verbose: bool = False, +) -> MoEOutputArchitecture: + candidates_in = ALL_OUTPUT_ARCHITECTURES + if config.architecture: + candidates_in = [ + a + for a in candidates_in + if a.name().lower().startswith(config.architecture.lower()) + ] + if not candidates_in: + logging.error( + f"No output architecture found that matches the given architecture: {config.architecture}" + ) + logging.error("All supported output architectures:") + for arch in ALL_OUTPUT_ARCHITECTURES: + logging.error(f" * {arch.name()}") + sys.exit(1) + + candidates: List[MoEOutputArchitecture] = [] + for arch in candidates_in: + if arch.supports_config( + config, explain=verbose, trust_remote_code=merge_options.trust_remote_code + ): + candidates.append(arch) + else: + logging.info(f"Output architecture {arch.name()} does not support config.") + + if not candidates: + logging.error( + "No output architecture found that is compatible with the given models." 
+ ) + + logging.error("All supported output architectures:") + for arch in ALL_OUTPUT_ARCHITECTURES: + logging.error(f" * {arch.name()}") + sys.exit(1) + + # for compatibility with older configs, default to Mixtral if available + for arch in candidates: + if arch.name() == "Mixtral": + return arch + + if len(candidates) > 1: + logging.warning( + "Multiple output architectures found that are compatible with the given models." + ) + logging.warning(f"Defaulting to {candidates[0].name()}") + else: + logging.info(f"Selected output architecture: {candidates[0].name()}") + return candidates[0] + + +@click.command("mergekit-moe") +@click.argument("config_path", type=click.Path(exists=True, dir_okay=False)) +@click.argument("out_path", type=click.Path()) +@click.option( + "--load-in-4bit", + is_flag=True, + type=bool, + default=False, + help="Load model in 4bit for computing hidden states", +) +@click.option( + "--load-in-8bit", + is_flag=True, + type=bool, + default=False, + help="Load model in 8bit for computing hidden states", +) +@click.option( + "--device", + type=str, + default="auto", + help="Device to use to compute embeddings", + show_default=True, +) +@click.option( + "--verbose", "-v", type=bool, default=False, is_flag=True, help="Verbose logging" +) +@click.option( + "--i-understand-this-is-not-useful-without-training", + type=bool, + default=False, + is_flag=True, + help="Really make the questionable model you want.", +) +@add_merge_options +def main( + config_path: str, + out_path: str, + load_in_4bit: bool, + load_in_8bit: bool, + device: str, + merge_options: MergeOptions, + verbose: bool, + i_understand_this_is_not_useful_without_training: bool, +): + """Create a Mixture of Experts model by combining the pretrained weights of multiple models.""" + logging.basicConfig(level=logging.INFO if verbose else logging.WARNING) + + if merge_options.cuda: + logging.warning( + '--cuda is a no-op for mergekit-moe, use "--device cuda" instead' + ) + + with open(config_path, "r", encoding="utf-8") as file: + config_source = file.read() + + config = MoEMergeConfig.model_validate(yaml.safe_load(config_source)) + build( + config, + out_path=out_path, + merge_options=merge_options, + load_in_4bit=load_in_4bit, + load_in_8bit=load_in_8bit, + device=device, + allow_all_same=i_understand_this_is_not_useful_without_training, + verbose=verbose, + ) + + if merge_options.write_model_card: + # TODO: generate a README.md as well + with open( + os.path.join(out_path, "mergekit_moe_config.yml"), "w", encoding="utf-8" + ) as fp: + fp.write(config_source) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/run_yaml.py b/mergekit/mergekit/scripts/run_yaml.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0a63a0f6d177e03c336fd57bf8b4f4604bb0fb --- /dev/null +++ b/mergekit/mergekit/scripts/run_yaml.py @@ -0,0 +1,56 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. + +import logging + +import click +import yaml + +from mergekit.config import MergeConfiguration +from mergekit.merge import run_merge +from mergekit.options import MergeOptions, add_merge_options + + +@click.command("mergekit-yaml") +@click.argument("config_file") +@click.argument("out_path") +@click.option( + "--verbose", "-v", type=bool, default=False, is_flag=True, help="Verbose logging" +) +@add_merge_options +def main( + merge_options: MergeOptions, + config_file: str, + out_path: str, + verbose: bool, +): + logging.basicConfig(level=logging.INFO if verbose else logging.WARNING) + + with open(config_file, "r", encoding="utf-8") as file: + config_source = file.read() + + merge_config: MergeConfiguration = MergeConfiguration.model_validate( + yaml.safe_load(config_source) + ) + run_merge( + merge_config, + out_path, + options=merge_options, + config_source=config_source, + ) + + +if __name__ == "__main__": + main() diff --git a/mergekit/mergekit/scripts/tokensurgeon.py b/mergekit/mergekit/scripts/tokensurgeon.py new file mode 100644 index 0000000000000000000000000000000000000000..31d38fdfa98ba475c866aebb0c7aca733100c193 --- /dev/null +++ b/mergekit/mergekit/scripts/tokensurgeon.py @@ -0,0 +1,612 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
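In `build` above, `get_gate_params` returns a gate vector per expert per layer, and the result is sliced into router weights for the routed experts and, when prompts were given, the shared experts. A standalone sketch of that slicing with made-up dimensions in place of real hidden states:

```python
import torch

num_layers, hidden_size = 4, 16
num_experts, num_shared = 2, 1          # made-up counts for illustration

# Shape mirrors the comment in build(): (num_layers, num_experts, hidden_size),
# with shared-expert gates appended after the routed experts.
gate_vecs = torch.randn(num_layers, num_experts + num_shared, hidden_size)

router_weights = gate_vecs[:, :num_experts, :]
shared_router_weights = gate_vecs[:, num_experts:, :]

# write_model receives these as per-layer lists of gate matrices
per_layer = [router_weights[i, ...] for i in range(router_weights.shape[0])]
assert len(per_layer) == num_layers
assert per_layer[0].shape == (num_experts, hidden_size)
assert shared_router_weights.shape == (num_layers, num_shared, hidden_size)
```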
+ +import enum +import logging +import sys +from typing import Dict, Generator, List, Optional, Tuple, Union + +import click +import torch +import tqdm +import transformers +from typing_extensions import TypeAlias + +from mergekit.architecture import ( + ConfiguredArchitectureInfo, + WeightInfo, + get_architecture_info, +) +from mergekit.common import ModelReference +from mergekit.io import TensorWriter +from mergekit.io.tasks import LoaderCache +from mergekit.options import MergeOptions, add_merge_options + +LOG = logging.getLogger(__name__) + + +@click.command("mergekit-tokensurgeon") +@click.argument("model", type=str) +@click.argument("donor", type=str) +@click.argument("out_path", type=str) +@click.option( + "-v", "verbosity", count=True, help="Verbose logging", default=0, show_default=False +) +@click.option( + "-k", + type=int, + default=8, + help="Number of nearest neighbours to use for embedding interpolation", +) +@click.option( + "--barycentric/--no-barycentric", + "-b/-nb", + is_flag=True, + default=False, + help="Use barycentric interpolation instead of distance weighting", +) +@click.option( + "--cosine-similarity/--no-cosine-similarity", + "-c/-nc", + is_flag=True, + default=False, + help="Use cosine similarity for nearest neighbour search", +) +@add_merge_options +def main( + model: str, + donor: str, + out_path: str, + verbosity: int, + k: int, + barycentric: bool, + cosine_similarity: bool, + merge_options: MergeOptions, +): + """ + Replace the tokenizer of a model with that of a donor model. Attempts to + approximate embeddings for tokens that are in the donor model but not the + original model. + + This greatly reduces the amount of training required to settle in the new + embeddings, and potentially removes the need for fine-tuning entirely for + tokenizers that are sufficiently similar. + + The model and donor model must have the same architecture. 
+ """ + log_level = logging.WARNING + if verbosity == 1: + log_level = logging.INFO + elif verbosity > 1: + log_level = logging.DEBUG + logging.basicConfig(level=log_level) + LOG.warning("This tool is experimental and may produce unexpected results.") + + model = ModelReference.model_validate(model) + donor = ModelReference.model_validate(donor) + + cache = LoaderCache() + cache.setup(options=merge_options) + + device = "cuda" if merge_options.cuda else "cpu" + + arch_info, donor_cfg = validate_architecture(model, donor, merge_options) + embed_info, lm_head_info = get_embedding_info(model, merge_options) + donor_embed_info, donor_lm_head_info = get_embedding_info(donor, merge_options) + + _, old_vocab = load_tokenizer(model, merge_options) + tokenizer, new_vocab = load_tokenizer(donor, merge_options) + common_tokens = list(set(old_vocab.keys()) & set(new_vocab.keys())) + + old_embed = cache.get(model).get_tensor( + embed_info.name, aliases=embed_info.aliases, device=device + ) + donor_embed = cache.get(donor).get_tensor( + donor_embed_info.name, aliases=donor_embed_info.aliases, device=device + ) + + (_, hidden_size_0) = old_embed.shape + (_, hidden_size_1) = donor_embed.shape + if hidden_size_1 != hidden_size_0: + report_issue( + f"Embedding sizes do not match: {hidden_size_0} vs {hidden_size_1}", + error=not merge_options.allow_crimes, + ) + + min_overlap = max(hidden_size_0, hidden_size_1) + if len(common_tokens) < min_overlap: + report_issue( + f"Common tokens ({len(common_tokens)}) less than embedding size ({min_overlap})", + error=not merge_options.allow_crimes, + ) + + LOG.info("Computing new embeddings") + new_embed = get_embeddings( + old_embed, + donor_embed, + old_vocab, + new_vocab, + common_tokens, + accept_prefix=False, + k=k, + barycentric=barycentric, + cosine_similarity=cosine_similarity, + name=embed_info.name, + ) + + if lm_head_info: + try: + old_lm_head = cache.get(model).get_tensor( + lm_head_info.name, aliases=lm_head_info.aliases, device=device + ) + except KeyError: + if lm_head_info.optional: + logging.info(f"LM head tensor {lm_head_info.name} not found, skipping") + else: + report_issue( + f"Could not load LM head tensor {lm_head_info.name}", + error=True, + ) + old_lm_head = None + + if old_lm_head is not None: + donor_lm_head = cache.get(donor).get_tensor( + donor_lm_head_info.name, + aliases=donor_lm_head_info.aliases, + device=device, + ) + + LOG.info("Computing new lm_head embeddings") + new_lm_head = get_embeddings( + old_lm_head, + donor_lm_head, + old_vocab, + new_vocab, + common_tokens, + accept_prefix=True, + k=k, + barycentric=barycentric, + cosine_similarity=cosine_similarity, + name=lm_head_info.name, + ) + else: + new_lm_head = None + + # Save out the new model + LOG.info(f"Saving new model to {out_path}") + writer = TensorWriter( + out_path, + max_shard_size=merge_options.out_shard_size, + safe_serialization=merge_options.safe_serialization, + ) + for weight_info in tqdm.tqdm(arch_info.all_weights(), desc="Saving weights"): + if weight_info.name == embed_info.name: + tensor = new_embed + elif lm_head_info is not None and weight_info.name == lm_head_info.name: + tensor = new_lm_head + else: + tensor = cache.get(model).get_tensor( + weight_info.name, aliases=weight_info.aliases + ) + if tensor is None: + if weight_info.optional: + continue + report_issue(f"Could not load weight tensor {weight_info.name}", error=True) + writer.save_tensor(weight_info.name, tensor, clone=merge_options.clone_tensors) + writer.finalize() + + 
tokenizer.save_pretrained(out_path) + cfg_out = arch_info.config + try: + cfg_out.vocab_size = new_embed.shape[0] + except AttributeError: + LOG.error( + "Could not set vocab size in config.json - you may need to update it manually." + ) + for key in [ + "pad_token_id", + "eos_token_id", + "bos_token_id", + "unk_token_id", + "mask_token_id", + "padding_side", + ]: + if hasattr(donor_cfg, key) and (value := getattr(donor_cfg, key)) is not None: + try: + setattr(cfg_out, key, value) + except AttributeError: + LOG.error(f"Could not set {key}!") + cfg_out.save_pretrained(out_path) + + +class TokenMarker(enum.Enum): + SPECIAL = "special" + WORD_START = "word_start" + + +NormalizedToken: TypeAlias = Union[str, Tuple[TokenMarker, str]] + + +def normalize_token( + token: str, + special_tokens_map: Dict[str, Union[str, List[str]]], + word_start_prefix: str = "▁", +) -> NormalizedToken: + """ + Normalize a token for comparison. + """ + if token.startswith(word_start_prefix): + return (TokenMarker.WORD_START, token[len(word_start_prefix) :]) + + for special_token_type, values in special_tokens_map.items(): + if isinstance(values, str): + values = [values] + if token in values: + return (TokenMarker.SPECIAL, special_token_type) + return token + + +def token_prefixes( + token: NormalizedToken, allow_whitespace: bool = False +) -> Generator[NormalizedToken, None, None]: + """Yield potential prefixes of a token.""" + marker = None + if isinstance(token, tuple): + marker, token = token + + for i in range(len(token) - 1, 0, -1): + prefix = token[:i] + if not allow_whitespace and not prefix.strip(): + break + if marker is not None: + yield (marker, prefix) + else: + yield prefix + + +def get_embedding_info( + model: ModelReference, options: MergeOptions +) -> Tuple[WeightInfo, WeightInfo]: + """Get WeightInfo for the input and output embeddings of a model.""" + cfg = model.config(trust_remote_code=options.trust_remote_code) + arch_info = get_architecture_info(cfg) + + embed, lm_head = None, None + for weight_info in arch_info.pre_weights(cfg): + if weight_info.is_embed: + if embed is not None: + raise RuntimeError("Multiple input embeddings found") + embed = weight_info + + for weight_info in arch_info.post_weights(cfg): + if weight_info.is_embed: + if lm_head is not None: + raise RuntimeError("Multiple output embeddings found") + lm_head = weight_info + + return embed, lm_head + + +def report_issue(message: str, error: bool = False): + """Log an issue and exit if error is True.""" + if error: + LOG.error(message) + sys.exit(1) + else: + LOG.warning(message) + + +def get_embeddings( + original_embed: torch.Tensor, + donor_embed: torch.Tensor, + original_vocab: Dict[NormalizedToken, int], + donor_vocab: Dict[NormalizedToken, int], + common_tokens: List[str], + *, + accept_prefix: bool = False, + k: int = 8, + barycentric: bool = False, + cosine_similarity: bool = False, + log_reconstruction_error: bool = True, + log_statistics: bool = True, + name: Optional[str] = None, +) -> torch.Tensor: + """ + Generate embeddings for a target vocabulary. + + For tokens present in both vocabularies, the embedding from original_embed is + directly copied. For tokens not present in the original vocabulary, the + embedding is approximated using the k-nearest neighbours among the tokens that + are present in both vocabularies. This can be done using either barycentric + interpolation or distance weighted averaging. + + Args: + original_embed (torch.Tensor): Embedding matrix for the original vocabulary. 
+ donor_embed (torch.Tensor): Embedding matrix for the new vocabulary. + original_vocab (Dict[NormalizedToken, int]): Maps tokens to indices in + original_embed. + donor_vocab (Dict[NormalizedToken, int]): Maps tokens to indices in + donor_embed. + common_tokens (List[str]): Tokens that are common to both vocabularies. + accept_prefix (bool): If True, allows using prefix matches for tokens when + an exact match is not found. + k (int): Number of nearest neighbours to use for embedding interpolation. + barycentric (bool): If True, uses barycentric interpolation for embedding + approximation. Otherwise, uses distance weighting. + cosine_similarity (bool): If True, uses cosine similarity to find nearest + neighbors. Otherwise, uses Euclidean distance. + log_reconstruction_error (bool): If True, logs the mean squared error of + the reconstructed embeddings. + log_statistics (bool): If True, logs statistics about the embedding + approximation process. + name (Optional[str]): Name of the embedding matrix. Used for logging. + + Returns: + torch.Tensor: Embedding matrix for the new vocabulary. + Shape is (len(donor_vocab), original_embed.shape[1]). + """ + hidden_size_0 = original_embed.shape[1] + hidden_size_1 = donor_embed.shape[1] + + e_c_0 = torch.empty( + len(common_tokens), + hidden_size_0, + device=original_embed.device, + dtype=original_embed.dtype, + ) + e_c_1 = torch.empty( + len(common_tokens), + hidden_size_1, + device=donor_embed.device, + dtype=donor_embed.dtype, + ) + for i, token in enumerate(common_tokens): + idx_0 = original_vocab[token] + idx_1 = donor_vocab[token] + e_c_0[i] = original_embed[idx_0] + e_c_1[i] = donor_embed[idx_1] + + exact_matches = 0 + prefix_matches = 0 + knn_matches = 0 + res = torch.zeros( + max(donor_vocab.values()) + 1, + hidden_size_0, + device=original_embed.device, + dtype=original_embed.dtype, + ) + + # message for tqdm + desc = "Computing embeddings" + if name: + desc += f" ({name})" + + knn_reconstruction_error = [] + for token in tqdm.tqdm(donor_vocab, desc=desc): + idx_1 = donor_vocab[token] + if token in original_vocab: + res[idx_1] = original_embed[original_vocab[token]] + exact_matches += 1 + continue + + if isinstance(token, str): + if len(token) == 1 and ord(token) < 256: + # check for matching byte tokens + byte_tok = f"<0x{ord(token):02X}>" + if byte_tok in original_vocab: + res[idx_1] = original_embed[original_vocab[byte_tok]] + exact_matches += 1 + continue + elif token.startswith("<0x") and token.endswith(">") and len(token) == 6: + # check for character tokens matching byte tokens + try: + byte = int(token[3:-1], 16) + except ValueError: + pass + else: + if chr(byte) in original_vocab: + res[idx_1] = original_embed[original_vocab[chr(byte)]] + exact_matches += 1 + continue + + if accept_prefix: + # For the LM head, we can accept prefix matches so long as the prefix is + # not also in the new vocab - this is to avoid including the same embedding + # vector multiple times + found_prefix = False + for prefix in token_prefixes(token, allow_whitespace=False): + if prefix in original_vocab and prefix not in donor_vocab: + res[idx_1] = original_embed[original_vocab[prefix]] + found_prefix = True + break + + if found_prefix: + prefix_matches += 1 + continue + + # If we can't find a prefix match, approximate from k nearest neighbours + token_embedding = donor_embed[idx_1] + if cosine_similarity: + cos_similarities = torch.nn.functional.cosine_similarity( + token_embedding.unsqueeze(0), e_c_1, dim=1 + ) + distances = 1 - cos_similarities + else: 
+ # euclidean distance + distances = torch.cdist(token_embedding.unsqueeze(0), e_c_1).squeeze() + _, indices = torch.topk(distances, k, largest=False) + knn_embeddings = e_c_1[indices] + + if barycentric: + # Find least squares barycentric weights + # Constrain sum of weights to 1 by adding a row of 1s + constraint_row = torch.ones( + (1, knn_embeddings.shape[0]), device=original_embed.device + ) + knn_e_c = torch.cat([knn_embeddings.T, constraint_row], dim=0) + e_c = torch.cat( + [ + token_embedding, + torch.tensor([1.0], device=e_c_0.device, dtype=e_c_0.dtype), + ] + ).unsqueeze(-1) + weights = torch.linalg.lstsq( + knn_e_c.float(), e_c.float(), rcond=1e-6 + ).solution.to(dtype=e_c_0.dtype) + else: + # Just weight by distance + if cosine_similarity: + weights = cos_similarities[indices].unsqueeze(-1).to(dtype=e_c_0.dtype) + else: + # weights = 1 / distances[indices].to(dtype=e_c_0.dtype).clamp(min=1e-6) + weights = torch.nn.functional.softmin( + distances[indices].to(dtype=e_c_0.dtype), dim=0 + ) + weights /= weights.sum() + + if log_reconstruction_error: + # compute reconstruction error in donor_embed space + knn_reconstruction_error.append( + torch.nn.functional.mse_loss( + (knn_embeddings.T.to(weights.dtype) @ weights).squeeze(), + token_embedding, + ).item() + ) + + # Reconstruct the embedding in original_embed space + res[idx_1] = (e_c_0[indices].T @ weights).squeeze() + knn_matches += 1 + + if log_statistics: + LOG.info("Token breakdown:") + LOG.info(f"\tExact matches: {exact_matches}") + if prefix_matches: + LOG.info(f"\tPrefix matches: {prefix_matches}") + LOG.info(f"\tKNN solutions: {knn_matches}") + + pct_approx = int((len(donor_vocab) - exact_matches) * 100 / len(donor_vocab)) + if pct_approx > 10: + # encourage best practices + LOG.warning( + f"Large number of tokens ({pct_approx}%) could not be exactly " + "matched - be sure to fine tune this sucker!" + ) + + if knn_reconstruction_error: + knn_err = torch.tensor( + knn_reconstruction_error, device=original_embed.device, dtype=torch.float32 + ) + LOG.info("KNN reconstruction error:") + LOG.info(f"\tMean: {knn_err.mean().item()}") + LOG.debug(f"\tMedian: {knn_err.median().item()}") + LOG.debug(f"\tMax: {knn_err.max().item()}") + LOG.debug(f"\tMin: {knn_err.min().item()}") + LOG.debug(f"\tStddev: {knn_err.std().item()}") + if knn_err.mean().isnan() or knn_err.mean().isinf(): + LOG.error( + "NaN or infinite reconstruction error detected - output is " + "definitely broken!" + ) + if knn_err.mean().item() >= 0.01: + LOG.warning("Unreasonably high reconstruction error - expect some issues!") + + return res + + +def load_tokenizer( + model: ModelReference, options: MergeOptions +) -> Tuple[transformers.PreTrainedTokenizerBase, Dict[NormalizedToken, int]]: + """Load a tokenizer from a model. 
Returns the tokenizer and a mapping of + normalized tokens to indices.""" + tokenizer = transformers.AutoTokenizer.from_pretrained( + model.model.path, + revision=model.model.revision, + trust_remote_code=options.trust_remote_code, + ) + + gpt2_style = [ + transformers.GPT2Tokenizer, + transformers.GPT2TokenizerFast, + transformers.OpenAIGPTTokenizer, + transformers.OpenAIGPTTokenizerFast, + ] + for candidate in ["Qwen2Tokenizer", "Qwen2TokenizerFast"]: + if hasattr(transformers, candidate): + gpt2_style.append(getattr(transformers, candidate)) + + sp_style = [ + transformers.LlamaTokenizer, + transformers.LlamaTokenizerFast, + transformers.T5Tokenizer, + transformers.T5TokenizerFast, + ] + for candidate in ["GemmaTokenizer", "GemmaTokenizerFast"]: + if hasattr(transformers, candidate): + sp_style.append(getattr(transformers, candidate)) + + vocab = tokenizer.get_vocab() + if isinstance( + tokenizer, + tuple(gpt2_style), + ): + word_start_prefix = "Ġ" + elif isinstance( + tokenizer, + tuple(sp_style), + ): + if "Ġhello" in vocab: + # dumb special case for deepseek's tokenizer + word_start_prefix = "Ġ" + else: + word_start_prefix = "▁" + else: + LOG.warning("Unknown tokenizer type - assuming 'Ġ' word start prefix") + word_start_prefix = "Ġ" + + tokenizer.all_special_tokens + return tokenizer, { + normalize_token( + token, + special_tokens_map=tokenizer.special_tokens_map, + word_start_prefix=word_start_prefix, + ): i + for token, i in vocab.items() + } + + +def validate_architecture( + model: ModelReference, donor: ModelReference, options: MergeOptions +) -> Tuple[ConfiguredArchitectureInfo, transformers.PretrainedConfig]: + """ + Validate that the architectures of two models match. + + Returns the architecture info for the model and the config for the donor. + """ + model_cfg = model.config(trust_remote_code=options.trust_remote_code) + donor_cfg = donor.config(trust_remote_code=options.trust_remote_code) + model_arch_info = get_architecture_info(model_cfg) + donor_arch_info = get_architecture_info(donor_cfg) + if donor_arch_info != model_arch_info: + report_issue( + f"Model architectures do not match: {model_arch_info.name()} vs {donor_arch_info.name()}", + error=not options.allow_crimes, + ) + + return ConfiguredArchitectureInfo(info=model_arch_info, config=model_cfg), donor_cfg + + +if __name__ == "__main__": + with torch.no_grad(): + main() diff --git a/mergekit/mergekit/sparsify.py b/mergekit/mergekit/sparsify.py new file mode 100644 index 0000000000000000000000000000000000000000..f782247f6ace3ec8a264a73a746fde7c39e2bc45 --- /dev/null +++ b/mergekit/mergekit/sparsify.py @@ -0,0 +1,203 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
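The `get_embeddings` routine shown above approximates each token that exists only in the donor vocabulary from its k nearest neighbours among the tokens common to both vocabularies, then rebuilds the embedding in the original model's space. A toy, self-contained illustration of the distance-weighted path (random matrices stand in for real embedding tables):

```python
import torch

torch.manual_seed(0)
k, hidden_old, hidden_new, n_common = 4, 8, 8, 32

# e_c_0 / e_c_1: embeddings of the common tokens in the original and donor spaces
e_c_0 = torch.randn(n_common, hidden_old)
e_c_1 = torch.randn(n_common, hidden_new)

# A "new" token only known to the donor model
token_embedding = torch.randn(hidden_new)

# k nearest common tokens in donor space (Euclidean distance)
distances = torch.cdist(token_embedding.unsqueeze(0), e_c_1).squeeze(0)
_, indices = torch.topk(distances, k, largest=False)

# Distance weighting: closer neighbours get larger weights, normalized to sum to 1
weights = torch.nn.functional.softmin(distances[indices], dim=0)
weights = weights / weights.sum()

# Reconstruct the embedding for the new token in the *original* model's space
approx = (e_c_0[indices].T @ weights).squeeze()
print(approx.shape)  # torch.Size([8])
```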
+ +from enum import Enum + +import torch + + +class SparsificationMethod(str, Enum): + magnitude = "magnitude" + random = "random" + magnitude_outliers = "magnitude_outliers" + rank_magnitude_sampling = "rank_magnitude_sampling" + consensus_ta = "consensus_ta" + consensus_ties = "consensus_ties" + + +def rescale_sum(tensor: torch.Tensor, mask: torch.Tensor): + """Rescales the values to match the original tensor sum.""" + org_sum = tensor.abs().sum() + new_sum = (tensor * mask).abs().sum() + + if org_sum >= 1e-8 and new_sum >= 1e-8: + tensor *= org_sum / new_sum + return tensor * mask + + +def magnitude(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor: + """Masks out the smallest values, retaining a proportion of `density`.""" + if density >= 1: + return tensor + + k = int(density * tensor.numel()) + + assert k > 0, "not gonna zero out the whole tensor buddy" + mask = torch.zeros_like(tensor) + w = tensor.abs().view(-1) + if w.device.type == "cpu": + w = w.float() + topk = torch.argsort(w, descending=True)[:k] + mask.view(-1)[topk] = 1 + + if rescale: + res = rescale_sum(tensor, mask) + else: + res = tensor * mask + + return res + + +def magnitude_outliers( + tensor: torch.Tensor, density: float, rescale: bool, gamma: float = 0.01 +): + """Masks out smallest values in addition to large outliers. + + The `gamma` proportion of the largest weights are first removed, then the + smallest weights are removed to achieve the desired density. + + Args: + tensor (torch.Tensor): The tensor to sparsify. + density (float): The proportion of weights to retain. + gamma (float): Percent of largest weights to remove. + """ + if density >= 1: + return tensor + + num_elems = tensor.numel() + target_n = int(density * num_elems) + n_top = int(gamma * num_elems) + n_bot = num_elems - target_n - n_top + + if n_bot < 0: + # cut down on the number of large weights to remove in + # order to hit the target density + n_top += n_bot + n_bot = 0 + + w = tensor.abs().view(-1) + if w.device.type == "cpu": + w = w.float() + indices = torch.sort(w, descending=False).indices + mask = torch.zeros_like(tensor) + + mask.view(-1)[indices[n_bot:-n_top]] = 1 + + if rescale: + res = rescale_sum(tensor, mask) + else: + res = tensor * mask + return res + + +def bernoulli(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor: + if density >= 1: + return tensor + + if (tensor.device.type != "cpu") or tensor.dtype == torch.bfloat16: + work_dtype = tensor.dtype + else: + # torch.bernoulli not implemented for float16 on CPU, upcast to float32 + work_dtype = torch.float32 + + mask = torch.bernoulli( + torch.full_like(input=tensor, fill_value=density, dtype=work_dtype) + ) + res = tensor.to(work_dtype) * mask + if rescale: + res /= density + + return res.to(tensor.dtype) + + +def rank_magnitude( + tensor: torch.Tensor, density: float, rescale: bool = True, epsilon: float = 0.05 +) -> torch.Tensor: + if density >= 1: + return tensor + + if density <= epsilon or density >= (1 - epsilon): + raise ValueError( + f"Error: density +- epsilon must be in the range (0, 1). 
density + epsilon = {density+epsilon}, density - epsilon = {density-epsilon}" + ) + + if (tensor.device.type != "cpu") or tensor.dtype == torch.bfloat16: + work_dtype = tensor.dtype + else: + work_dtype = torch.float32 + + if len(tensor.shape) < 2: + tensor = tensor.unsqueeze(0) + + # Get Rank matrix for the delta values + tensor_abs = torch.abs(tensor) + + sorted_indices = torch.argsort(tensor_abs, dim=1, descending=False) + + ranking_tensor = torch.zeros_like(tensor_abs, dtype=work_dtype) + for i in range(tensor_abs.size(0)): + ranking_tensor[i][sorted_indices[i]] = torch.arange( + 1, tensor.size(1) + 1, dtype=work_dtype + ).to(tensor.device) + + # Normalise rank matrix to the probability range to density +- epsilon + range_vals = ( + ranking_tensor.max(dim=1, keepdim=True).values + - ranking_tensor.min(dim=1, keepdim=True).values + ) + norm_metrics = (ranking_tensor - ranking_tensor.min(dim=1, keepdim=True).values) / ( + range_vals + ) + final_probabilities = (density - epsilon) + norm_metrics * (2 * epsilon) + + mask = torch.bernoulli(final_probabilities).to(work_dtype) + res = tensor.to(work_dtype) * mask + + if rescale: + res = res / (final_probabilities.to(work_dtype)) + + return res.squeeze(0) + + +def sparsify( + tensor: torch.Tensor, + density: float, + method: SparsificationMethod, + gamma: float = 0, + rescale: bool = False, + epsilon: float = 0.15, +) -> torch.Tensor: + if ( + method == SparsificationMethod.magnitude + or method == SparsificationMethod.consensus_ties + ): + return magnitude(tensor, density=density, rescale=rescale) + elif method == SparsificationMethod.random: + return bernoulli(tensor, density=density, rescale=rescale) + elif method == SparsificationMethod.magnitude_outliers: + return magnitude_outliers(tensor, density=density, rescale=rescale, gamma=gamma) + elif method == SparsificationMethod.rank_magnitude_sampling: + return rank_magnitude(tensor, density=density, rescale=rescale, epsilon=epsilon) + else: + raise NotImplementedError(method) + + +def get_tall_mask( + delta: torch.Tensor, # individual task vectors + lambda_factor: float, # hyper-parameter lambda for generating TALL masks + mixed_delta: torch.Tensor, # multi-task vector +): + mask = delta.abs() > lambda_factor * (mixed_delta - delta).abs() + return mask diff --git a/mergekit/mergekit/tokenizer/__init__.py b/mergekit/mergekit/tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cff42a4663d0bb7c9e697d7d687ab7d4b6c7a0e6 --- /dev/null +++ b/mergekit/mergekit/tokenizer/__init__.py @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
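`sparsify.py` exposes the pruning schemes used by the TIES/DARE-style merge methods through a single `sparsify` dispatcher. A short usage sketch, assuming the mergekit package above is installed, that checks magnitude pruning keeps roughly the requested density and that `rescale` preserves the absolute sum:

```python
import torch

from mergekit.sparsify import SparsificationMethod, sparsify

delta = torch.randn(64, 64)  # stand-in for a task vector / weight delta
original_sum = delta.abs().sum().item()

# rescale_sum mutates its input, so pass a clone to keep `delta` intact
pruned = sparsify(
    delta.clone(),
    density=0.3,
    method=SparsificationMethod.magnitude,
    rescale=True,
)

kept = (pruned != 0).float().mean().item()
print(f"kept {kept:.2f} of weights")                 # ~0.30
print(original_sum, pruned.abs().sum().item())       # roughly equal thanks to rescale
```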
+ +from mergekit.tokenizer.build import BuildTokenizer, TokenizerInfo +from mergekit.tokenizer.config import TokenizerConfig +from mergekit.tokenizer.embed import PermutedEmbeddings + +__all__ = ["BuildTokenizer", "TokenizerInfo", "TokenizerConfig", "PermutedEmbeddings"] diff --git a/mergekit/mergekit/tokenizer/__pycache__/__init__.cpython-310.pyc b/mergekit/mergekit/tokenizer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23b41042d3a7801251bb3dad7c6aba92673aed90 Binary files /dev/null and b/mergekit/mergekit/tokenizer/__pycache__/__init__.cpython-310.pyc differ diff --git a/mergekit/mergekit/tokenizer/__pycache__/build.cpython-310.pyc b/mergekit/mergekit/tokenizer/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea397360aaea29aa048a031a237003cdbedf9324 Binary files /dev/null and b/mergekit/mergekit/tokenizer/__pycache__/build.cpython-310.pyc differ diff --git a/mergekit/mergekit/tokenizer/__pycache__/config.cpython-310.pyc b/mergekit/mergekit/tokenizer/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bad0cdc3170565252e3bd0bd32305c5a45d2eb35 Binary files /dev/null and b/mergekit/mergekit/tokenizer/__pycache__/config.cpython-310.pyc differ diff --git a/mergekit/mergekit/tokenizer/__pycache__/embed.cpython-310.pyc b/mergekit/mergekit/tokenizer/__pycache__/embed.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59ca72d4fab5f40e42e0f9ae63791f025b4561b2 Binary files /dev/null and b/mergekit/mergekit/tokenizer/__pycache__/embed.cpython-310.pyc differ diff --git a/mergekit/mergekit/tokenizer/build.py b/mergekit/mergekit/tokenizer/build.py new file mode 100644 index 0000000000000000000000000000000000000000..3cefed91829c375cf94fc34322ac63839e784ea3 --- /dev/null +++ b/mergekit/mergekit/tokenizer/build.py @@ -0,0 +1,302 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
+ +import json +import logging +import tempfile +from typing import Dict, List, Optional, Tuple, Union + +import tokenizers +import tokenizers.models +import tqdm +import transformers +from pydantic import BaseModel +from typing_extensions import Literal + +from mergekit.common import ModelPath, ModelReference +from mergekit.graph import Task + + +def get_vocab_size(model_path: ModelPath, trust_remote_code: bool) -> Optional[int]: + try: + cfg = transformers.AutoConfig.from_pretrained( + model_path.path, + revision=model_path.revision, + trust_remote_code=trust_remote_code, + ) + return cfg.vocab_size + except Exception as e: + logging.warning(f"Unable to get vocab size for {model_path}", exc_info=e) + + return None + + +def get_stripped_tokenizer( + path: ModelPath, trust_remote_code: bool = False +) -> transformers.PreTrainedTokenizerFast: + """ + Return a tokenizer for a model that only contains used tokens. + + Strips any tokens with indices >= model.vocab_size. + """ + tokenizer = transformers.AutoTokenizer.from_pretrained( + path.path, + revision=path.revision, + trust_remote_code=trust_remote_code, + use_fast=True, + ) + vocab_size = get_vocab_size(path, trust_remote_code=trust_remote_code) or len( + tokenizer.get_vocab() + ) + + unused_toks = [ + tok for tok, idx in tokenizer.get_vocab().items() if idx >= vocab_size + ] + if not unused_toks: + # we're good, ship it + return tokenizer + + if not tokenizer.is_fast: + raise RuntimeError( + f"Model {path} has unused tokens and does not support fast " + "tokenizer - can not be used in tokenizer merge" + ) + + tok_dict = json.loads(tokenizer._tokenizer.to_str()) + if tok_dict["model"]["type"] != "BPE": + raise RuntimeError( + f"Tokenizer for {path} has type {tok_dict['model']['type']}, " + "but only BPE is currently supported for tokenizer merge" + ) + + tok_dict["added_tokens"] = [ + e for e in tok_dict["added_tokens"] if e["id"] < vocab_size + ] + + for tok in unused_toks: + if tok in tok_dict["model"]["vocab"]: + del tok_dict["model"]["vocab"][tok] + + def _keep_merge(m): + if isinstance(m, str) and m.count(" ") == 1: + toks = m.split(" ") + elif isinstance(m, list): + toks = m + else: + raise RuntimeError(f"Unexpected merge format: {repr(m)} ({type(m)})") + for tok in toks: + if tok in unused_toks: + return False + return True + + tok_dict["model"]["merges"] = [ + e for e in tok_dict["model"]["merges"] if _keep_merge(e) + ] + tokenizer._tokenizer = tokenizers.Tokenizer.from_str(json.dumps(tok_dict)) + return tokenizer + + +def build_union_tokenizer( + base_tok: transformers.PreTrainedTokenizerBase, + tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase], + trust_remote_code: bool = False, +) -> transformers.PreTrainedTokenizerBase: + out_added_tokens = {} + out_vocab = {} + + warned_added_tokens = set() + + for model, tokenizer in tokenizers.items(): + vocab_size = ( + get_vocab_size(model.model, trust_remote_code=trust_remote_code) + or tokenizer.vocab_size + ) + added_tokens = tokenizer.added_tokens_decoder + + vocab = tokenizer.get_vocab() + for tok, idx in vocab.items(): + if idx >= vocab_size: + logging.warning( + f"Token {repr(tok)} present in {str(model)} tokenizer but >= vocab_size" + ) + continue + if tok in added_tokens: + # deal with later + continue + + if tok not in out_vocab: + out_vocab[tok] = len(out_vocab) + + for tok_idx, info in tokenizer.added_tokens_decoder.items(): + tok = info.content + if tok_idx >= vocab_size: + continue + + if tok in out_added_tokens: + if (out_added_tokens[tok] != info) and 
tok not in warned_added_tokens: + logging.warning( + f"Token '{tok}' added with multiple different settings, using first" + ) + warned_added_tokens.add(tok) + + continue + out_added_tokens[tok] = info + + # HACK: save base tokenizer to temp dir and reload to avoid mutating base_tok + with tempfile.TemporaryDirectory() as p: + base_tok.save_pretrained(p, legacy_format=False, safe_serialization=True) + res = transformers.AutoTokenizer.from_pretrained( + p, use_fast=True, trust_remote_code=trust_remote_code + ) + + orig_base_vocab = base_tok.get_vocab() + for tok in out_vocab: + if tok in out_added_tokens: + continue + + if tok not in orig_base_vocab: + res.add_tokens(tok) + + for info in out_added_tokens.values(): + res.add_tokens(info) + return res + + +class TokenizerInfo(BaseModel, arbitrary_types_allowed=True): + tokenizer: transformers.PreTrainedTokenizerBase + permutations: Dict[ModelReference, Dict[int, int]] + original_vocabs: Dict[ModelReference, Dict[str, int]] + + +def build_tokenizer( + base_model: Optional[ModelReference], + referenced_models: List[ModelReference], + tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference], + trust_remote_code: bool, + add_tokens: Optional[List[str]] = None, +) -> TokenizerInfo: + if base_model is None: + base_model = referenced_models[0] + if base_model is None: + raise RuntimeError("No models referenced") + + # + tokenizer_base = get_stripped_tokenizer( + base_model.model, trust_remote_code=trust_remote_code + ) + + # load all tokenizers + logging.info("Loading tokenizers") + tokenizers = {base_model: tokenizer_base} + for model in referenced_models: + if model == base_model: + continue + + try: + model_tok = transformers.AutoTokenizer.from_pretrained( + model.model.path, + revision=model.model.revision, + trust_remote_code=trust_remote_code, + ) + except Exception as e: + logging.error(e) + logging.warning( + f"Unable to load tokenizer for {model}. Assuming same as {base_model}." 
+ ) + continue + tokenizers[model] = model_tok + + logging.info("Building output tokenizer") + # build final vocabulary + if isinstance(tokenizer_source, ModelReference): + tokenizer_out = transformers.AutoTokenizer.from_pretrained( + tokenizer_source.model.path, + revision=tokenizer_source.model.revision, + trust_remote_code=trust_remote_code, + ) + elif tokenizer_source == "base": + # it done + tokenizer_out = tokenizer_base + elif tokenizer_source == "union": + tokenizer_out = build_union_tokenizer( + tokenizer_base, tokenizers, trust_remote_code=trust_remote_code + ) + else: + raise RuntimeError(f"Unimplemented tokenizer source: {tokenizer_source}") + + for tok in add_tokens: + tokenizer_out.add_tokens(tok) + + vocab_out = tokenizer_out.get_vocab() + + logging.info("Building permutations") + permutations = {} + for model in ( + pbar := tqdm.tqdm(referenced_models, desc="Building tokenizer permutations") + ): + if model in tokenizers: + model_vocab = tokenizers[model].get_vocab() + else: + model_vocab = tokenizers[base_model].get_vocab() + + vocab_size = get_vocab_size(model.model, trust_remote_code=trust_remote_code) + if vocab_size is None: + vocab_size = len(model_vocab) + + p = {} + for tok in vocab_out: + new_idx = vocab_out[tok] + if tok not in model_vocab: + p[new_idx] = -1 + continue + + orig_idx = model_vocab[tok] + if orig_idx >= vocab_size: + logging.warning( + f"{model} token {repr(tok)} has index {orig_idx}>{vocab_size-1} (padding?)" + ) + continue + + p[new_idx] = orig_idx + + permutations[model] = p + + del pbar + + return TokenizerInfo( + tokenizer=tokenizer_out, + permutations=permutations, + original_vocabs={model: tok.get_vocab() for model, tok in tokenizers.items()}, + ) + + +class BuildTokenizer(Task[TokenizerInfo]): + base_model: Optional[ModelReference] + referenced_models: Tuple[ModelReference, ...] + tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference] + add_tokens: Optional[Tuple[str, ...]] + trust_remote_code: bool = False + + def arguments(self) -> Dict[str, Task]: + return {} + + def execute(self, **_kwargs) -> TokenizerInfo: + return build_tokenizer( + base_model=self.base_model, + referenced_models=self.referenced_models, + tokenizer_source=self.tokenizer_source, + trust_remote_code=self.trust_remote_code, + add_tokens=self.add_tokens, + ) diff --git a/mergekit/mergekit/tokenizer/config.py b/mergekit/mergekit/tokenizer/config.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdaeca21248421580aec62e218582875e72bd7b --- /dev/null +++ b/mergekit/mergekit/tokenizer/config.py @@ -0,0 +1,52 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
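`build_tokenizer` above records, for each input model, a permutation mapping every index of the merged vocabulary to the corresponding index in that model's own vocabulary, or `-1` when the token is absent. The core of that mapping is a small dictionary exercise; a pure-Python sketch with toy vocabularies:

```python
# Toy vocabularies: index maps as produced by tokenizer.get_vocab()
vocab_out = {"<s>": 0, "hello": 1, "world": 2, "<|new|>": 3}   # merged vocab
model_vocab = {"<s>": 0, "world": 1, "hello": 2}               # one input model

vocab_size = len(model_vocab)  # stand-in for the model config's vocab_size

p = {}
for tok, new_idx in vocab_out.items():
    if tok not in model_vocab:
        p[new_idx] = -1            # token missing from this model
        continue
    orig_idx = model_vocab[tok]
    if orig_idx >= vocab_size:     # padding token beyond the real vocab
        continue
    p[new_idx] = orig_idx

print(p)  # {0: 0, 1: 2, 2: 1, 3: -1}
```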
+ +from typing import Dict, Optional, Union + +import pydantic +from pydantic import BaseModel +from typing_extensions import Literal + +from mergekit.common import ModelReference + + +class ModelTokenEmbedding(BaseModel, frozen=True): + kind: Literal["model_token"] + model: ModelReference + token_id: Optional[int] = None + token: Optional[str] = None + + @pydantic.model_validator(mode="after") + def validate_token(self): + if self.token_id is None and self.token is None: + raise ValueError("token_id or token must be specified") + if self.token_id is not None and self.token is not None: + raise ValueError("only one of token_id or token may be specified") + return self + + +class ZeroEmbedding(BaseModel, frozen=True): + kind: Literal["zero"] + + +class TokenEmbeddingConfig(BaseModel, frozen=True): + source: Union[ModelTokenEmbedding, ZeroEmbedding, ModelReference, None] = None + force: bool = False + + +class TokenizerConfig(BaseModel, frozen=True): + source: Union[ModelReference, Literal["union"], Literal["base"]] = "union" + tokens: Optional[Dict[str, TokenEmbeddingConfig]] = None + pad_to_multiple_of: Optional[int] = None diff --git a/mergekit/mergekit/tokenizer/embed.py b/mergekit/mergekit/tokenizer/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..a853d1af32a563d3466297691a74da9134ed955b --- /dev/null +++ b/mergekit/mergekit/tokenizer/embed.py @@ -0,0 +1,192 @@ +# Copyright (C) 2024 Charles O. Goddard +# +# This software is free software: you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
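The pydantic models above describe how a merge's tokenizer section is interpreted: a vocabulary source (`union`, `base`, or a specific model) plus optional per-token embedding overrides. A minimal construction sketch, assuming the mergekit package is installed; the token names are arbitrary:

```python
from mergekit.tokenizer.config import (
    TokenEmbeddingConfig,
    TokenizerConfig,
    ZeroEmbedding,
)

# Take the union of all input vocabularies, pad the embedding matrix to a
# multiple of 64, and force a zero-initialised embedding for a pad token.
cfg = TokenizerConfig(
    source="union",
    tokens={
        "<pad>": TokenEmbeddingConfig(source=ZeroEmbedding(kind="zero"), force=True),
    },
    pad_to_multiple_of=64,
)
print(cfg.model_dump())
```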
+ +import logging +from typing import Dict, Optional + +import torch + +from mergekit.common import ImmutableMap, ModelReference +from mergekit.graph import Task +from mergekit.io.tasks import GatherTensors +from mergekit.tokenizer.build import BuildTokenizer, TokenizerInfo +from mergekit.tokenizer.config import ( + ModelTokenEmbedding, + TokenEmbeddingConfig, + ZeroEmbedding, +) + + +class PermutedEmbeddings(Task[Dict[ModelReference, torch.Tensor]]): + gather_tensors: GatherTensors + tokenizer_task: BuildTokenizer + tokens: Optional[ImmutableMap[str, TokenEmbeddingConfig]] + pad_to_multiple_of: Optional[int] + base_model: Optional[ModelReference] + + def arguments(self) -> Dict[str, Task]: + return {"tokenizer_info": self.tokenizer_task, "tensors": self.gather_tensors} + + def execute( + self, tokenizer_info: TokenizerInfo, tensors: Dict[ModelReference, torch.Tensor] + ) -> Dict[ModelReference, torch.Tensor]: + tokenizer = tokenizer_info.tokenizer + permutations = tokenizer_info.permutations + + models = set(tensors.keys()) + if self.base_model: + models.add(self.base_model) + models = list(models) + + vocab = tokenizer.get_vocab() + vocab_size = len(vocab) + if self.pad_to_multiple_of and vocab_size % self.pad_to_multiple_of: + vocab_size = ( + vocab_size // self.pad_to_multiple_of + 1 + ) * self.pad_to_multiple_of + embed_size = tensors[models[0]].shape[1] + assert all( + t.shape[1] == embed_size for t in tensors.values() + ), "Embedding sizes must match" + + dtype = tensors[models[0]].dtype + device = tensors[models[0]].device + + token_configs = dict(**(self.tokens or {})) + tokens_to_average = self.assign_embedding_sources( + permutations, models, vocab, token_configs + ) + + default_embeds = {} + for token, token_id in vocab.items(): + embed = torch.zeros(embed_size, dtype=dtype, device=device) + if token in tokens_to_average: + count = 0 + for model in models: + p = permutations[model] + if p[token_id] < 0: + continue + embed += tensors[model][p[token_id]] + count += 1 + embed /= count + elif cfg := token_configs.get(token, None): + cfg: TokenEmbeddingConfig + embed = self.compute_default_embedding( + tokenizer_info, tensors, permutations, token, token_id, cfg + ) + else: + continue + default_embeds[token] = embed + + result = {} + for model in models: + p = permutations[model] + old_embed = tensors[model] + new_embed = torch.zeros( + (vocab_size, embed_size), dtype=dtype, device=device + ) + for token, token_id in vocab.items(): + force = False + if token in token_configs: + force = token_configs[token].force + + if p[token_id] >= 0 and not force: + new_embed[token_id, :] = old_embed[p[token_id]] + elif token in default_embeds: + new_embed[token_id, :] = default_embeds[token] + else: + logging.error( + f"No embedding for token {repr(token)} in model {model}!" 
+ ) + + if vocab_size > len(vocab): + # as suggested by https://nlp.stanford.edu/~johnhew/vocab-expansion.html + avg_embed = torch.mean(new_embed[: len(vocab), :], dim=0) + new_embed[len(vocab) :, :] = avg_embed + result[model] = new_embed + + return result + + def assign_embedding_sources( + self, + permutations: Dict[ModelReference, Dict[int, int]], + models: list[ModelReference], + vocab: Dict[str, int], + token_configs: Dict[str, TokenEmbeddingConfig], + ): + permutation_list = [permutations[model] for model in models] + + tokens_to_average = set() + # find tokens that are only present in one model + for token, token_id in vocab.items(): + if token in token_configs: + continue + + has_token = [p[token_id] >= 0 for p in permutation_list] + num_present = sum(int(x) for x in has_token) + if num_present == 1: + donor_model = models[has_token.index(True)] + token_configs[token] = TokenEmbeddingConfig(source=donor_model) + continue + + if num_present == 0: + token_configs[token] = TokenEmbeddingConfig(source=ZeroEmbedding()) + logging.warning(f"Token {repr(token)} not found in any model") + continue + + if num_present > 0 and self.base_model is not None: + if permutations[self.base_model][token_id] >= 0: + token_configs[token] = TokenEmbeddingConfig(source=self.base_model) + continue + + tokens_to_average.add(token) + return tokens_to_average + + def compute_default_embedding( + self, + tokenizer_info: TokenizerInfo, + tensors: Dict[ModelReference, torch.Tensor], + permutations: Dict[ModelReference, Dict[int, int]], + token: str, + token_id: int, + cfg: TokenEmbeddingConfig, + ) -> torch.Tensor: + if isinstance(cfg.source, ZeroEmbedding): + pass + elif isinstance(cfg.source, ModelTokenEmbedding): + model = cfg.source.model + assert ( + model in permutations + ), f"Model {model} referenced but not part of merge" + p = permutations[model] + src_token_id = cfg.source.token_id + if src_token_id is None: + src_token = cfg.source.token + assert ( + src_token in tokenizer_info.original_vocabs[model] + ), f"Token {repr(src_token)} not found in model {model}" + src_token_id = tokenizer_info.original_vocabs[model][src_token] + assert ( + src_token_id >= 0 and src_token_id < tensors[model].shape[0] + ), f"Token ID {src_token_id} out of range for model {model}" + embed = tensors[model][src_token_id] + elif isinstance(cfg.source, ModelReference): + model = cfg.source + p = permutations[model] + assert p[token_id] >= 0, f"Token {repr(token)} not found in model {model}" + embed = tensors[model][p[token_id]] + else: + raise NotImplementedError(cfg) + return embed diff --git a/mergekit/notebook.ipynb b/mergekit/notebook.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..4a85e3fae8a039802336d314d3b7b7c3fb037132 --- /dev/null +++ b/mergekit/notebook.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "cmjOVVtJdiPZ" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/cg123/mergekit.git\n", + "%cd mergekit\n", + "%pip install -e ." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "84cRJT6_ecbw" + }, + "outputs": [], + "source": [ + "OUTPUT_PATH = \"./merged\" # folder to store the result in\n", + "LORA_MERGE_CACHE = \"/tmp\" # change if you want to keep these for some reason\n", + "CONFIG_YML = \"./examples/gradient-slerp.yml\" # merge configuration file\n", + "COPY_TOKENIZER = True # you want a tokenizer? 
yeah, that's what i thought\n", + "LAZY_UNPICKLE = False # experimental low-memory model loader\n", + "LOW_CPU_MEMORY = False # enable if you somehow have more VRAM than RAM+swap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6nw26xQLkBax" + }, + "outputs": [], + "source": [ + "# actually do merge\n", + "import torch\n", + "import yaml\n", + "\n", + "from mergekit.config import MergeConfiguration\n", + "from mergekit.merge import MergeOptions, run_merge\n", + "\n", + "with open(CONFIG_YML, \"r\", encoding=\"utf-8\") as fp:\n", + " merge_config = MergeConfiguration.model_validate(yaml.safe_load(fp))\n", + "\n", + "run_merge(\n", + " merge_config,\n", + " out_path=OUTPUT_PATH,\n", + " options=MergeOptions(\n", + " lora_merge_cache=LORA_MERGE_CACHE,\n", + " cuda=torch.cuda.is_available(),\n", + " copy_tokenizer=COPY_TOKENIZER,\n", + " lazy_unpickle=LAZY_UNPICKLE,\n", + " low_cpu_memory=LOW_CPU_MEMORY,\n", + " ),\n", + ")\n", + "print(\"Done!\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/mergekit/pyproject.toml b/mergekit/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..e04fd46415617434e37a0fa066fe4e89d922d0d1 --- /dev/null +++ b/mergekit/pyproject.toml @@ -0,0 +1,87 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "mergekit" +description = "Tools for merging pre-trained large language models" +readme = "README.md" +license = { text = "LGPL-3.0-or-later" } +version = "0.0.5.2" +authors = [{ name = "Charles Goddard", email = "chargoddard@gmail.com" }] +dependencies = [ + "torch>=2.0.0", + "tqdm==4.66.5", + "click==8.1.7", + "safetensors~=0.4.3", + "accelerate~=1.0.1", + "pydantic~=2.9.2", + "immutables==0.20", + "transformers>=4.45.2", + "tokenizers>=0.20.1", + "huggingface_hub", + "peft", + "typing-extensions", + "sentencepiece", + "protobuf", + "scipy", + "datasets", +] + +[project.optional-dependencies] +dev = ["black~=24.10.0", "isort~=5.13.2", "pre-commit~=4.0.1"] +test = ["pytest~=8.3.3"] +evolve = ["ray", "cma", "lm_eval", "wandb"] +vllm = ["vllm==0.3.2", "lm_eval[vllm]"] + +[project.urls] +repository = "https://github.com/cg123/mergekit" + + +[project.scripts] +mergekit-yaml = "mergekit.scripts.run_yaml:main" +mergekit-mega = "mergekit.scripts.megamerge:main" +mergekit-legacy = "mergekit.scripts.legacy:main" +mergekit-layershuffle = "mergekit.scripts.layershuffle:main" +bakllama = "mergekit.scripts.bakllama:main" +mergekit-moe = "mergekit.scripts.moe:main" +mergekit-tokensurgeon = "mergekit.scripts.tokensurgeon:main" +mergekit-extract-lora = "mergekit.scripts.extract_lora:main" +mergekit-evolve = "mergekit.scripts.evolve:main" + +[tool.setuptools] +packages = [ + "mergekit", + "mergekit.io", + "mergekit.merge_methods", + "mergekit.moe", + "mergekit.scripts", + "mergekit.evo", + "mergekit.tokenizer", + "mergekit._data", + "mergekit._data.architectures", + "mergekit._data.chat_templates", +] +include-package-data = true +package-data = { "mergekit._data.architectures" = [ + "*.json", +], 
"mergekit._data.chat_templates" = [ + "*.jinja", +] } + +[tool.isort] +profile = "black" + +[tool.black] +line-length = 88 +target-version = ['py37'] +include = '\.pyi?$' + +[tool.pytest.ini_options] +minversion = "6.0" +filterwarnings = [ + "ignore::pydantic.PydanticDeprecatedSince20:huggingface_hub.*:", + "ignore::FutureWarning:huggingface_hub.*:", + "ignore:(read_text|open_text|contents|is_resource) is deprecated:DeprecationWarning", # yes i know, but files() doesn't exist in 3.8 +] +testpaths = ["tests"] diff --git a/mergekit/tests/common.py b/mergekit/tests/common.py new file mode 100644 index 0000000000000000000000000000000000000000..a542cd44620bace436ad7c9b8e46ccfe8f1bfafd --- /dev/null +++ b/mergekit/tests/common.py @@ -0,0 +1,65 @@ +import os +import tempfile +from typing import Callable, Optional + +from transformers import AutoConfig, LlamaConfig, LlamaForCausalLM + +from mergekit.architecture import get_architecture_info +from mergekit.config import MergeConfiguration +from mergekit.io.lazy_tensor_loader import LazyTensorLoader, ShardedTensorIndex +from mergekit.merge import MergeOptions, run_merge + + +def run_and_check_merge( + config: MergeConfiguration, + check_nan: bool = True, + check_tensors: bool = True, + validate: Optional[Callable[[str], None]] = None, + index_json_name: Optional[str] = None, +): + if index_json_name is None: + index_json_name = "model.safetensors.index.json" + + with tempfile.TemporaryDirectory() as tmpdir: + run_merge(config, out_path=tmpdir, options=MergeOptions()) + assert os.path.exists( + os.path.join(tmpdir, index_json_name) + ), "No index file for merge" + assert os.path.exists( + os.path.join(tmpdir, "config.json") + ), "No config json produced by merge" + + if check_nan: + # check for NaN in output + loader = LazyTensorLoader.from_disk(tmpdir, lazy_unpickle=False) + tp = loader.index.tensor_paths + sorted_tensors = sorted(tp.keys(), key=lambda k: tp[k]) + for tensor_name in sorted_tensors: + tensor = loader.get_tensor(tensor_name) + has_nan = tensor.view(-1).isnan().any() + assert not has_nan, "Output contains NaN" + + if check_tensors: + config = AutoConfig.from_pretrained(tmpdir) + arch_info = get_architecture_info(config) + + index = ShardedTensorIndex.from_disk(tmpdir) + for weight_info in arch_info.all_weights(config): + if weight_info.name not in index.tensor_paths: + raise RuntimeError(f"Output missing tensor {tensor_name}") + + if validate: + validate(tmpdir) + + +def make_picollama(path: str, vocab_size: int = 64): + cfg = LlamaConfig( + vocab_size=vocab_size, + hidden_size=32, + intermediate_size=48, + num_attention_heads=16, + num_hidden_layers=2, + ) + model = LlamaForCausalLM(cfg) + model.save_pretrained(path, safe_serialization=True) + return str(path) diff --git a/mergekit/tests/test_basic_merges.py b/mergekit/tests/test_basic_merges.py new file mode 100644 index 0000000000000000000000000000000000000000..0d23c1c5603505b5025d388f6e30fbb4faf8c69d --- /dev/null +++ b/mergekit/tests/test_basic_merges.py @@ -0,0 +1,214 @@ +from typing import Dict, Optional + +import pytest +from common import make_picollama, run_and_check_merge +from transformers import AutoConfig + +from mergekit.config import ( + InputModelDefinition, + InputSliceDefinition, + MergeConfiguration, + OutputSliceDefinition, + ParameterSetting, +) +from mergekit.io import LazyTensorLoader + + +@pytest.fixture(scope="session") +def model_a(tmp_path_factory): + return make_picollama(tmp_path_factory.mktemp("model_a")) + + +@pytest.fixture(scope="session") +def 
model_b(tmp_path_factory): + return make_picollama(tmp_path_factory.mktemp("model_b")) + + +@pytest.fixture(scope="session") +def model_c(tmp_path_factory): + return make_picollama(tmp_path_factory.mktemp("model_c")) + + +class TestBasicMerges: + def test_gpt2_copy(self): + config = MergeConfiguration( + merge_method="passthrough", + models=[InputModelDefinition(model="gpt2")], + dtype="bfloat16", + ) + run_and_check_merge(config) + + def test_gpt2_stack(self): + config = MergeConfiguration( + merge_method="passthrough", + slices=[ + OutputSliceDefinition( + sources=[InputSliceDefinition(model="gpt2", layer_range=[0, 12])] + ) + ] + * 2, + dtype="bfloat16", + ) + + def _check_config_layers(p: str): + config = AutoConfig.from_pretrained(p) + assert config.n_layer == 24 + + run_and_check_merge(config, validate=_check_config_layers) + + def test_passthrough_scale(self, model_a): + config = MergeConfiguration( + merge_method="passthrough", + models=[ + InputModelDefinition( + model=model_a, + parameters={ + "scale": [ + {"filter": "o_proj", "value": 0}, + {"value": 1}, + ] + }, + ) + ], + ) + + def _check_o_proj(p: str): + loader = LazyTensorLoader.from_disk(p) + saw_any = False + for name in loader.index.tensor_paths: + if "o_proj" in name: + param = loader.get_tensor(name) + assert (param == 0).all() + saw_any = True + elif "lm_head" in name: + param = loader.get_tensor(name) + assert param.count_nonzero() > 0 + + assert saw_any, "No o_proj parameters found" + + run_and_check_merge(config, validate=_check_o_proj) + + def test_linear_merge(self, model_a, model_b): + config = self.two_model_config(model_a, model_b, merge_method="linear") + run_and_check_merge(config) + + def test_slerp_merge(self, model_a, model_b): + config = self.two_model_config( + model_a, model_b, merge_method="slerp", base_model=model_a + ) + config.parameters = {"t": 0.35} + run_and_check_merge(config) + + def test_nearswap_merge(self, model_a, model_b): + config = self.two_model_config( + model_a, model_b, merge_method="nearswap", base_model=model_a + ) + config.parameters = {"t": 0.0001} + run_and_check_merge(config) + + def test_nuslerp_merges(self, model_a, model_b, model_c): + for base_model in [None, model_c]: + for row_wise in [False, True]: + for flatten in [False, True]: + print( + f"Testing nuslerp with row_wise={row_wise}, flatten={flatten}, base_model={base_model}" + ) + run_and_check_merge( + self.two_model_config( + model_a, + model_b, + merge_method="nuslerp", + base_model=base_model, + params={ + "nuslerp_row_wise": row_wise, + "nuslerp_flatten": flatten, + }, + ) + ) + + # test weights that sum to zero + config = self.two_model_config( + model_a, + model_b, + merge_method="nuslerp", + base_model=model_c, + params={"nuslerp_row_wise": False, "nuslerp_flatten": False}, + ) + config.models[0].parameters["weight"] = -0.5 + config.models[1].parameters["weight"] = 0.5 + run_and_check_merge(config) + + def test_task_arithmetic_merge(self, model_a, model_b, model_c): + config = self.two_model_config( + model_a, model_b, merge_method="task_arithmetic", base_model=model_c + ) + run_and_check_merge(config) + + def test_breadcrumbs_merge(self, model_a, model_b, model_c): + config = self.two_model_config( + model_a, model_b, merge_method="breadcrumbs", base_model=model_c + ) + run_and_check_merge(config) + + def test_ties_merge(self, model_a, model_b, model_c): + config = self.two_model_config( + model_a, + model_b, + merge_method="ties", + base_model=model_c, + params={"density": 0.3}, + ) + 
run_and_check_merge(config) + + def test_dare_ties_merge(self, model_a, model_b, model_c): + config = self.two_model_config( + model_a, + model_b, + merge_method="dare_ties", + base_model=model_c, + params={"density": 0.66}, + ) + run_and_check_merge(config) + + def test_model_stock_merge(self, model_a, model_b, model_c): + config = self.two_model_config( + model_b, model_c, merge_method="model_stock", base_model=model_a + ) + run_and_check_merge(config) + + def test_model_stock_filterwise_merge(self, model_a, model_b, model_c): + config = self.two_model_config( + model_b, + model_c, + merge_method="model_stock", + base_model=model_a, + params={"filter_wise": True}, + ) + run_and_check_merge(config) + + def two_model_config( + self, + model_a, + model_b, + merge_method: str, + base_model: Optional[str] = None, + params: Optional[Dict[str, ParameterSetting]] = None, + ): + config = MergeConfiguration( + merge_method=merge_method, + base_model=base_model, + models=[ + InputModelDefinition( + model=model_a, + parameters={"weight": 0.6}, + ), + InputModelDefinition( + model=model_b, + parameters={"weight": 0.4}, + ), + ], + dtype="bfloat16", + parameters=params, + ) + + return config diff --git a/mergekit/tests/test_chat_template.py b/mergekit/tests/test_chat_template.py new file mode 100644 index 0000000000000000000000000000000000000000..af511a2b36a2a5208432aafb9f7c5826749ae5d4 --- /dev/null +++ b/mergekit/tests/test_chat_template.py @@ -0,0 +1,52 @@ +from typing import Optional + +from common import run_and_check_merge +from test_basic_merges import model_b +from test_tokenizer import model_base +from transformers import AutoTokenizer + +from mergekit.config import InputModelDefinition, MergeConfiguration + + +def check_chat_template(model_path: str, needle: Optional[str] = None): + tokenizer = AutoTokenizer.from_pretrained(model_path) + if needle is None: + assert not tokenizer.chat_template, "Expected no chat template" + return + assert ( + tokenizer.chat_template and needle in tokenizer.chat_template + ), f"Expected chat template to contain {needle}" + + +class TestChatTemplate: + def test_template_chatml(self, model_base, model_b): + config = MergeConfiguration( + merge_method="linear", + models=[ + InputModelDefinition(model=model_base, parameters={"weight": 0.5}), + InputModelDefinition(model=model_b, parameters={"weight": 0.5}), + ], + base_model=model_base, + dtype="bfloat16", + chat_template="chatml", + ) + run_and_check_merge( + config, + validate=lambda p: check_chat_template(p, "<|im_start|>"), + ) + + def test_template_literal_jinja(self, model_base, model_b): + config = MergeConfiguration( + merge_method="linear", + models=[ + InputModelDefinition(model=model_base, parameters={"weight": 0.5}), + InputModelDefinition(model=model_b, parameters={"weight": 0.5}), + ], + base_model=model_base, + dtype="bfloat16", + chat_template="{{messages[0]['content']}}", + ) + run_and_check_merge( + config, + validate=lambda p: check_chat_template(p, "{{messages[0]['content']}}"), + ) diff --git a/mergekit/tests/test_graph.py b/mergekit/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7574e495a639f693789f532b6f5753b32a6cec2d --- /dev/null +++ b/mergekit/tests/test_graph.py @@ -0,0 +1,208 @@ +from typing import Any, Dict, Optional + +import networkx +import pytest + +from mergekit.common import ImmutableMap +from mergekit.graph import Executor, Task + +EXECUTION_COUNTS: Dict[Task, int] = {} + + +class DummyTask(Task): + result: Any + dependencies: 
ImmutableMap[str, Task] + name: str = "DummyTask" + grouplabel: Optional[str] = None + execution_count: int = 0 + + def arguments(self): + return self.dependencies + + def group_label(self) -> Optional[str]: + return self.grouplabel + + def execute(self, **kwargs): + EXECUTION_COUNTS[self] = EXECUTION_COUNTS.get(self, 0) + 1 + return self.result + + +def create_mock_task(name, result=None, dependencies=None, group_label=None): + if dependencies is None: + dependencies = {} + return DummyTask( + result=result, + dependencies=ImmutableMap(data=dependencies), + name=name, + grouplabel=group_label, + ) + + +# Test cases for the Task implementation +class TestTaskClass: + def test_task_execute(self): + # Testing the execute method + task = create_mock_task("task1", result=42) + assert task.execute() == 42, "Task execution did not return expected result" + + def test_task_priority(self): + task = create_mock_task("task1") + assert task.priority() == 0, "Default priority should be 0" + + def test_task_group_label(self): + task = create_mock_task("task1") + assert task.group_label() is None, "Default group label should be None" + + +# Test cases for the Executor implementation +class TestExecutorClass: + def test_executor_initialization(self): + # Testing initialization with single task + task = create_mock_task("task1") + executor = Executor([task]) + assert executor.targets == [ + task + ], "Executor did not initialize with correct targets" + + def test_executor_empty_list(self): + list(Executor([]).run()) + + def test_executor_scheduling(self): + # Testing scheduling with dependencies + task1 = create_mock_task("task1", result=1) + task2 = create_mock_task("task2", result=2, dependencies={"task1": task1}) + executor = Executor([task2]) + assert ( + len(executor._make_schedule([task2])) == 2 + ), "Schedule should include two tasks" + + def test_executor_dependency_building(self): + # Testing dependency building + task1 = create_mock_task("task1") + task2 = create_mock_task("task2", dependencies={"task1": task1}) + executor = Executor([task2]) + dependencies = executor._build_dependencies([task2]) + assert task1 in dependencies[task2], "Task1 should be a dependency of Task2" + + def test_executor_run(self): + # Testing execution through the run method + task1 = create_mock_task("task1", result=10) + task2 = create_mock_task("task2", result=20, dependencies={"task1": task1}) + executor = Executor([task2]) + results = list(executor.run()) + assert ( + len(results) == 1 and results[0][1] == 20 + ), "Executor run did not yield correct results" + + def test_executor_execute(self): + # Testing execute method for side effects + task1 = create_mock_task("task1", result=10) + executor = Executor([task1]) + # No assert needed; we're ensuring no exceptions are raised and method completes + executor.execute() + + def test_dependency_ordering(self): + # Testing the order of task execution respects dependencies + task1 = create_mock_task("task1", result=1) + task2 = create_mock_task("task2", result=2, dependencies={"task1": task1}) + task3 = create_mock_task("task3", result=3, dependencies={"task2": task2}) + executor = Executor([task3]) + + schedule = executor._make_schedule([task3]) + assert schedule.index(task1) < schedule.index( + task2 + ), "Task1 should be scheduled before Task2" + assert schedule.index(task2) < schedule.index( + task3 + ), "Task2 should be scheduled before Task3" + + +class TestExecutorGroupLabel: + def test_group_label_scheduling(self): + # Create tasks with group labels and 
dependencies + task1 = create_mock_task("task1", group_label="group1") + task2 = create_mock_task( + "task2", dependencies={"task1": task1}, group_label="group1" + ) + task3 = create_mock_task("task3", group_label="group2") + task4 = create_mock_task( + "task4", dependencies={"task2": task2, "task3": task3}, group_label="group1" + ) + + # Initialize Executor with the tasks + executor = Executor([task4]) + + # Get the scheduled tasks + schedule = executor._make_schedule([task4]) + + # Check if tasks with the same group label are scheduled consecutively when possible + group_labels_in_order = [ + task.group_label() for task in schedule if task.group_label() + ] + assert group_labels_in_order == [ + "group1", + "group1", + "group2", + "group1", + ], "Tasks with same group label are not scheduled consecutively" + + def test_group_label_with_dependencies(self): + # Creating tasks with dependencies and group labels + task1 = create_mock_task("task1", result=1, group_label="group1") + task2 = create_mock_task( + "task2", result=2, dependencies={"task1": task1}, group_label="group2" + ) + task3 = create_mock_task( + "task3", result=3, dependencies={"task2": task2}, group_label="group1" + ) + + executor = Executor([task3]) + schedule = executor._make_schedule([task3]) + scheduled_labels = [ + task.group_label() for task in schedule if task.group_label() + ] + + # Check if task3 is scheduled after task1 and task2 due to dependency, even though it has the same group label as task1 + group1_indices = [ + i for i, label in enumerate(scheduled_labels) if label == "group1" + ] + group2_index = scheduled_labels.index("group2") + + assert ( + group1_indices[-1] > group2_index + ), "Task with the same group label but later dependency was not scheduled after different group label" + + +class TestExecutorSingleExecution: + def test_single_execution_per_task(self): + EXECUTION_COUNTS.clear() + + shared_task = create_mock_task("shared_task", result=100) + task1 = create_mock_task("task1", dependencies={"shared": shared_task}) + task2 = create_mock_task("task2", dependencies={"shared": shared_task}) + task3 = create_mock_task("task3", dependencies={"task1": task1, "task2": task2}) + + Executor([task3]).execute() + + assert shared_task in EXECUTION_COUNTS, "Dependency not executed" + assert ( + EXECUTION_COUNTS[shared_task] == 1 + ), "Shared dependency should be executed exactly once" + + +class CircularTask(Task): + def arguments(self) -> Dict[str, Task]: + return {"its_a_me": self} + + def execute(self, **_kwargs) -> Any: + assert False, "Task with circular dependency executed" + + +class TestExecutorCircularDependency: + def test_circular_dependency(self): + with pytest.raises(networkx.NetworkXUnfeasible): + Executor([CircularTask()]).execute() + + +if __name__ == "__main__": + pytest.main() diff --git a/mergekit/tests/test_io.py b/mergekit/tests/test_io.py new file mode 100644 index 0000000000000000000000000000000000000000..c1e1d1aa7cad372e28fff1a1098f33517c707846 --- /dev/null +++ b/mergekit/tests/test_io.py @@ -0,0 +1,37 @@ +import os +import tempfile + +import torch + +from mergekit.io import TensorWriter + + +class TestTensorWriter: + def test_safetensors(self): + with tempfile.TemporaryDirectory() as d: + writer = TensorWriter(d, safe_serialization=True) + writer.save_tensor("steve", torch.randn(4)) + writer.finalize() + + assert os.path.exists(os.path.join(d, "model-00001-of-00001.safetensors")) + assert os.path.exists(os.path.join(d, "model.safetensors.index.json")) + + def test_pickle(self): + with 
tempfile.TemporaryDirectory() as d: + writer = TensorWriter(d, safe_serialization=False) + writer.save_tensor("timothan", torch.randn(4)) + writer.finalize() + + assert os.path.exists(os.path.join(d, "pytorch_model-00001-of-00001.bin")) + assert os.path.exists(os.path.join(d, "pytorch_model.bin.index.json")) + + def test_duplicate_tensor(self): + with tempfile.TemporaryDirectory() as d: + writer = TensorWriter(d, safe_serialization=True) + jim = torch.randn(4) + writer.save_tensor("jim", jim) + writer.save_tensor("jimbo", jim) + writer.finalize() + + assert os.path.exists(os.path.join(d, "model-00001-of-00001.safetensors")) + assert os.path.exists(os.path.join(d, "model.safetensors.index.json")) diff --git a/mergekit/tests/test_lazy_unpickle.py b/mergekit/tests/test_lazy_unpickle.py new file mode 100644 index 0000000000000000000000000000000000000000..915b7bbb55d4e4b91646e7577fda7e9a1506ff8a --- /dev/null +++ b/mergekit/tests/test_lazy_unpickle.py @@ -0,0 +1,18 @@ +import torch + +from mergekit.io import LazyTensorLoader + + +class TestLazyUnpickle: + def test_lazy_unpickle(self, tmp_path): + data = { + "a": torch.tensor([1, 2, 3]), + "b": torch.tensor([4, 5, 6]), + } + path = tmp_path / "pytorch_model.bin" + torch.save(data, path) + loader = LazyTensorLoader.from_disk(tmp_path) + for name in data: + assert name in loader.index.tensor_paths + tensor = loader.get_tensor(name) + assert torch.equal(tensor, data[name]) diff --git a/mergekit/tests/test_modelref.py b/mergekit/tests/test_modelref.py new file mode 100644 index 0000000000000000000000000000000000000000..12b250ee82c47420a71f0e0312b02127b22849ec --- /dev/null +++ b/mergekit/tests/test_modelref.py @@ -0,0 +1,43 @@ +import pytest + +from mergekit.common import ModelPath, ModelReference + + +class TestModelReference: + def test_parse_simple(self): + text = "hf_user/model" + mr = ModelReference.parse(text) + assert mr.model == ModelPath(path="hf_user/model", revision=None) + assert mr.lora is None + assert str(mr) == text + + def test_parse_lora(self): + text = "hf_user/model+hf_user/lora" + mr = ModelReference.parse(text) + assert mr.model == ModelPath(path="hf_user/model", revision=None) + assert mr.lora == ModelPath(path="hf_user/lora", revision=None) + assert str(mr) == text + + def test_parse_revision(self): + text = "hf_user/model@v0.0.1" + mr = ModelReference.parse(text) + assert mr.model == ModelPath(path="hf_user/model", revision="v0.0.1") + assert mr.lora is None + assert str(mr) == text + + def test_parse_lora_plus_revision(self): + text = "hf_user/model@v0.0.1+hf_user/lora@main" + mr = ModelReference.parse(text) + assert mr.model == ModelPath(path="hf_user/model", revision="v0.0.1") + assert mr.lora == ModelPath(path="hf_user/lora", revision="main") + assert str(mr) == text + + def test_parse_bad(self): + with pytest.raises(RuntimeError): + ModelReference.parse("@@@@@") + + with pytest.raises(RuntimeError): + ModelReference.parse("a+b+c") + + with pytest.raises(RuntimeError): + ModelReference.parse("a+b+c@d+e@f@g") diff --git a/mergekit/tests/test_sparsify.py b/mergekit/tests/test_sparsify.py new file mode 100644 index 0000000000000000000000000000000000000000..fc1fccff34c7c1577a0cd2374691f3bb06c70172 --- /dev/null +++ b/mergekit/tests/test_sparsify.py @@ -0,0 +1,81 @@ +import pytest +import torch + +from mergekit.sparsify import SparsificationMethod, sparsify + + +@pytest.fixture +def sample_tensor(): + res = torch.randn(128, 64) + res[res == 0] = 7 # very low chance, but hey! 
+ return res + + +class TestMagnitude: + def test_full_density(self, sample_tensor): + assert torch.equal( + sparsify(sample_tensor, density=1, method=SparsificationMethod.magnitude), + sample_tensor, + ) + + def test_zero_density(self, sample_tensor): + with pytest.raises(AssertionError): + sparsify(sample_tensor, density=0, method=SparsificationMethod.magnitude) + + def test_partial_density(self, sample_tensor): + result = sparsify( + sample_tensor, density=0.5, method=SparsificationMethod.magnitude + ) + assert torch.count_nonzero(result) == sample_tensor.view(-1).shape[0] // 2 + + def test_outliers(self, sample_tensor): + for gamma_0 in [0.1, 0.2, 0.5, 1.0]: + for density in [0.1, 0.3, 0.5, 0.6, 0.9, 1.0]: + sparsity = 1 - density + gamma = gamma_0 * sparsity + result = sparsify( + sample_tensor, + density=density, + method=SparsificationMethod.magnitude_outliers, + gamma=gamma, + ) + assert torch.count_nonzero(result) == int( + sample_tensor.view(-1).shape[0] * density + ) + + +class TestBernoulli: + NUM_ITERATIONS = 1000 + + def test_bernoulli_with_rescale(self, sample_tensor): + ref_abs_sum = sample_tensor.abs().sum() + avg_abs_sum = torch.zeros_like(ref_abs_sum) + for _ in range(TestBernoulli.NUM_ITERATIONS): + rescaled = sparsify( + sample_tensor, + density=0.5, + method=SparsificationMethod.random, + rescale=True, + ) + avg_abs_sum += rescaled.abs().sum() + avg_abs_sum /= TestBernoulli.NUM_ITERATIONS + + assert torch.isclose(avg_abs_sum, ref_abs_sum, rtol=0.01) + + def test_bernoulli_without_rescale(self, sample_tensor): + result = sparsify( + sample_tensor, + density=0.5, + method=SparsificationMethod.random, + rescale=False, + ) + assert 0 < torch.count_nonzero(result) <= sample_tensor.view(-1).shape[0] + + def test_cpu_dtypes(self, sample_tensor): + for dt in (torch.float16, torch.bfloat16, torch.float32): + sparsify( + tensor=sample_tensor.to(dtype=dt).cpu(), + density=0.5, + method=SparsificationMethod.random, + rescale=True, + ) diff --git a/mergekit/tests/test_tokenizer.py b/mergekit/tests/test_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..a799e8c4157021baba7f0d72568c54726716397f --- /dev/null +++ b/mergekit/tests/test_tokenizer.py @@ -0,0 +1,328 @@ +import json +import os +import tempfile +from typing import Dict, List, Optional, Union + +import pytest +import tokenizers +import torch +from common import make_picollama, run_and_check_merge +from transformers import LlamaConfig, LlamaTokenizerFast, PreTrainedTokenizerBase + +from mergekit.config import InputModelDefinition, MergeConfiguration +from mergekit.io import LazyTensorLoader +from mergekit.tokenizer import TokenizerConfig + + +@pytest.fixture(scope="session") +def model_base(tmp_path_factory): + model_path = make_picollama(tmp_path_factory.mktemp("model_base"), vocab_size=64) + make_tokenizer(vocab_size=64, added_tokens=[]).save_pretrained(model_path) + return model_path + + +@pytest.fixture(scope="session") +def model_chatml(tmp_path_factory): + model_path = make_picollama(tmp_path_factory.mktemp("model_chatml"), vocab_size=66) + make_tokenizer( + vocab_size=64, added_tokens=["<|im_start|>", "<|im_end|>"] + ).save_pretrained(model_path) + return model_path + + +@pytest.fixture(scope="session") +def model_padded(tmp_path_factory): + model_path = make_picollama(tmp_path_factory.mktemp("model_padded"), vocab_size=64) + make_tokenizer( + vocab_size=64, + added_tokens=["<pad0>", "<pad1>", "<pad2>", "<pad3>"], + ).save_pretrained(model_path) + return model_path + + +def make_tokenizer( + vocab_size: int,
added_tokens: List[Union[str, tokenizers.AddedToken]] +) -> PreTrainedTokenizerBase: + tokens = ["<unk>", "<s>", "</s>"] + [f"_tok_{idx}" for idx in range(3, vocab_size)] + tokens = tokens[:vocab_size] + tok_data = { + "version": "1.0", + "model": { + "type": "BPE", + "vocab": dict(zip(tokens, range(vocab_size))), + "merges": [], + }, + "added_tokens": [], + } + tok = tokenizers.Tokenizer.from_str(json.dumps(tok_data)) + with tempfile.TemporaryDirectory() as p: + tok_path = os.path.join(p, "tokenizer.json") + tok.save(tok_path) + res = LlamaTokenizerFast(tokenizer_file=tok_path) + + res.add_tokens(added_tokens) + return res + + +def check_tokenizer( + expected_size: int, + expected_added_ct: Optional[int] = None, + must_contain: Optional[List[str]] = None, + must_not_contain: Optional[List[str]] = None, +): + def _cb(model_path: str): + tok: LlamaTokenizerFast = LlamaTokenizerFast.from_pretrained(model_path) + vocab = tok.get_vocab() + print(vocab) + assert len(vocab) == expected_size + + if expected_added_ct is not None: + assert len(tok.added_tokens_decoder) == expected_added_ct + + if must_contain: + for tok in must_contain: + assert tok in vocab + + if must_not_contain: + for tok in must_not_contain: + assert tok not in vocab + + return _cb + + +class ModelEmbeddings: + embed_tokens: torch.Tensor + vocab: Dict[str, int] + + def __init__(self, model_path: str): + tokenizer = LlamaTokenizerFast.from_pretrained(model_path) + loader = LazyTensorLoader.from_disk(model_path) + self.embed_tokens = loader.get_tensor("model.embed_tokens.weight") + self.vocab = tokenizer.get_vocab() + + def token_embedding(self, token: str) -> Optional[torch.Tensor]: + idx = self.vocab.get(token) + if idx is None: + return None + return self.embed_tokens[idx, :] + + +class TestTokenizerMerges: + def test_legacy_mode(self, model_base: str, model_padded: str, model_chatml: str): + config = self.make_config( + [model_base, model_padded, model_chatml], base_model=model_base + ) + # when no tokenizer_source is set, expect output tokenizer to be from base_model + run_and_check_merge( + config, validate=check_tokenizer(expected_size=64, expected_added_ct=3) + ) + + def test_source_base(self, model_base: str, model_padded: str, model_chatml: str): + config = self.make_config( + [model_base, model_padded, model_chatml], + base_model=model_base, + tokenizer_source="base", + ) + # expect the same output but it's a different code path + run_and_check_merge( + config, validate=check_tokenizer(expected_size=64, expected_added_ct=3) + ) + + def test_source_union(self, model_base: str, model_padded: str, model_chatml: str): + config = self.make_config( + [model_base, model_padded, model_chatml], + base_model=model_base, + tokenizer_source="union", + ) + + def _check_embed(model_path: str): + # output should have all tokens used by any model + # but not include any unused tokens + check_tokenizer( + expected_size=66, + expected_added_ct=5, + must_contain=["<|im_start|>", "<|im_end|>"], + must_not_contain=[f"<pad{idx}>" for idx in range(4)], + )(model_path) + emb_out = ModelEmbeddings(model_path) + emb_chatml = ModelEmbeddings(model_chatml) + + assert torch.allclose( + emb_out.token_embedding("<|im_start|>"), + emb_chatml.token_embedding("<|im_start|>"), + ), "Token <|im_start|> should be from model_chatml" + assert torch.allclose( + emb_out.token_embedding("<|im_end|>"), + emb_chatml.token_embedding("<|im_end|>"), + atol=1e-3, + rtol=1e-4, + ), "Token <|im_end|> should be from model_chatml" + + run_and_check_merge( + config, + validate=_check_embed,
+ ) + + def test_source_model(self, model_base: str, model_padded: str, model_chatml: str): + config = self.make_config( + [model_base, model_padded, model_chatml], + base_model=model_base, + tokenizer_source=model_chatml, + ) + # tokenizer should match model_chatml + run_and_check_merge( + config, + validate=check_tokenizer( + expected_size=66, must_contain=["<|im_start|>", "<|im_end|>"] + ), + ) + + def test_slerp_union(self, model_base: str, model_chatml: str): + config = self.make_config( + [model_base, model_chatml], + base_model=model_base, + tokenizer_source="union", + merge_method="slerp", + t=0.5, + ) + + run_and_check_merge( + config, + validate=check_tokenizer( + expected_size=66, + must_contain=["<|im_start|>", "<|im_end|>"], + ), + ) + + def test_force_token(self, model_base: str, model_chatml: str): + config = self.make_config( + [model_base, model_chatml], + base_model=model_base, + merge_method="linear", + tokenizer_config=TokenizerConfig( + source="union", + tokens={ + "_tok_10": {"source": model_chatml, "force": True}, + "_tok_11": {"source": model_base, "force": True}, + }, + ), + ) + + def _check_embed(model_path: str): + check_tokenizer( + expected_size=66, must_contain=["<|im_start|>", "<|im_end|>"] + )(model_path) + emb_out = ModelEmbeddings(model_path) + emb_base = ModelEmbeddings(model_base) + emb_chatml = ModelEmbeddings(model_chatml) + + assert torch.allclose( + emb_out.token_embedding("_tok_10"), + emb_chatml.token_embedding("_tok_10"), + ), "Token _tok_10 should be from model_chatml" + assert torch.allclose( + emb_out.token_embedding("_tok_11"), + emb_base.token_embedding("_tok_11"), + ), "Token _tok_11 should be from model_base" + + run_and_check_merge(config, validate=_check_embed) + + def test_model_token_id(self, model_base: str, model_chatml: str): + config = self.make_config( + [model_base, model_chatml], + base_model=model_base, + merge_method="linear", + tokenizer_config=TokenizerConfig( + source="base", + tokens={ + "_tok_20": { + "source": { + "kind": "model_token", + "model": model_chatml, + "token_id": 64, + }, + "force": True, + }, + "_tok_21": { + "source": { + "kind": "model_token", + "model": model_base, + "token": "<s>", + }, + "force": True, + }, + }, + ), + ) + + def _check_embed(model_path: str): + check_tokenizer(expected_size=64, must_contain=["_tok_10"])(model_path) + emb_out = ModelEmbeddings(model_path) + emb_base = ModelEmbeddings(model_base) + emb_chatml = ModelEmbeddings(model_chatml) + + assert torch.allclose( + emb_out.token_embedding("_tok_20"), emb_chatml.embed_tokens[64, :] + ), "Token _tok_20 should be == model_chatml token 64" + assert torch.allclose( + emb_out.token_embedding("_tok_21"), emb_base.token_embedding("<s>") + ), "Token _tok_21 should be == model_base <s>" + + run_and_check_merge(config, validate=_check_embed) + + def test_pad_to_multiple_of(self, model_chatml: str): + config = self.make_config( + [model_chatml], + base_model=model_chatml, + merge_method="linear", + tokenizer_config=TokenizerConfig( + source="base", + pad_to_multiple_of=16, + ), + ) + real_vocab_size = 64 + 2 + padded_size = (real_vocab_size // 16 + 1) * 16 + + def _check_result(model_path: str): + cfg = LlamaConfig.from_pretrained(model_path) + assert ( + cfg.vocab_size == padded_size + ), f"Expected vocab size {padded_size}, got {cfg.vocab_size}" + check_tokenizer( + expected_size=real_vocab_size, + must_contain=["<|im_start|>", "<|im_end|>"], + )(model_path) + + emb_out = ModelEmbeddings(model_path) + assert ( + emb_out.embed_tokens.shape[0] ==
padded_size + ), "Embedding size mismatch" + + run_and_check_merge(config, validate=_check_result) + + def make_config( + self, + models: List[str], + base_model: Optional[str] = None, + merge_method: str = "linear", + tokenizer_source: Optional[str] = None, + t: Optional[float] = None, + tokenizer_config: Optional[TokenizerConfig] = None, + ): + parameters = {"t": t} if t is not None else {} + config = MergeConfiguration( + merge_method=merge_method, + base_model=base_model, + models=[ + InputModelDefinition( + model=m, + parameters={"weight": 1.0}, + ) + for m in models + ], + dtype="float32", + tokenizer_source=tokenizer_source, + parameters=parameters, + tokenizer=tokenizer_config, + ) + return config
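Taken together, `make_picollama`, `run_and_check_merge`, and the config builders in these tests double as a quick way to smoke-test a merge recipe on tiny, randomly initialized models before spending time and disk on full-size checkpoints. The sketch below is illustrative only and is not part of the repository: it assumes it is run from `mergekit/tests/` (so `common` is importable), and the interpolation value `t=0.5` and the `work_dir` layout are arbitrary choices for the example.

```python
# Hypothetical smoke test: a SLERP merge of two tiny random Llama models,
# reusing the pytest helpers from mergekit/tests/common.py. Illustrative
# sketch only; run from mergekit/tests/ so that `common` resolves.
import tempfile

from common import make_picollama, run_and_check_merge

from mergekit.config import InputModelDefinition, MergeConfiguration

with tempfile.TemporaryDirectory() as work_dir:
    # Two 2-layer picollama checkpoints with random weights.
    model_a = make_picollama(f"{work_dir}/model_a")
    model_b = make_picollama(f"{work_dir}/model_b")

    config = MergeConfiguration(
        merge_method="slerp",
        base_model=model_a,
        models=[
            InputModelDefinition(model=model_a, parameters={"weight": 1.0}),
            InputModelDefinition(model=model_b, parameters={"weight": 1.0}),
        ],
        parameters={"t": 0.5},  # arbitrary interpolation factor for this sketch
        dtype="bfloat16",
    )

    # Merges into a temporary directory and asserts the output has an index
    # file and a config.json, contains no NaNs, and includes every expected tensor.
    run_and_check_merge(config)
```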