Spaces:
Running
Running
Upload 10 files
Browse files- chatml_to_mistral.py +103 -0
- enable_fix_mistral_regex_true.md +569 -0
- folder+subfolder_content_combiner_anyfiles.py +86 -0
- model_tools.md +21 -4
- modify_arcee_fusion_tukey_fence_parameter.md +94 -0
- shield_embeddings.py +66 -0
- shield_norms.py +113 -0
- vocab_id_patcher.py +127 -0
chatml_to_mistral.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import argparse
|
| 4 |
+
from colorama import init, Fore, Style
|
| 5 |
+
|
| 6 |
+
init()
|
| 7 |
+
|
| 8 |
+
def load_json(path):
    """Load a JSON file and return its contents.

    Returns an empty dict when the file is missing, unreadable, or not
    valid JSON, so callers can treat every failure as "no config present".
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    # Narrowed from a bare `except Exception`: only swallow the expected
    # failure modes — I/O errors and malformed JSON (json.JSONDecodeError
    # is a ValueError subclass). Anything else is a real bug we want to see.
    except (OSError, ValueError):
        return {}
|
| 16 |
+
|
| 17 |
+
def save_json(path, data):
    """Serialize *data* to *path* as pretty-printed (2-space indent) JSON."""
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, indent=2))
|
| 20 |
+
|
| 21 |
+
def convert_model(model_path):
    """Rewrite a ChatML-flavoured model's metadata (EOS id 32000) to Mistral format (EOS id 2).

    Edits the JSON config files inside *model_path* in place. Models whose
    EOS id is anything other than 32000 are left untouched.
    """
    model_name = os.path.basename(model_path)
    print(f"Scanning: {model_name}...")

    gen_path = os.path.join(model_path, "generation_config.json")
    tok_conf_path = os.path.join(model_path, "tokenizer_config.json")

    # 1. Check if it is actually a ChatML/32000 model
    gen_cfg = load_json(gen_path)
    eos_id = gen_cfg.get("eos_token_id")
    # Handle list format (e.g. [32000, 2]) — first entry is the primary EOS.
    eos_id = eos_id[0] if isinstance(eos_id, list) else eos_id

    if str(eos_id) != "32000":
        # EOS id 2 means the model is already in Mistral format: skip silently.
        # Any other value is reported, then skipped.
        if str(eos_id) != "2":
            print(f" Skipping: EOS ID is {eos_id} (Not 32000)")
        return

    print(f"{Fore.YELLOW} -> Detected ChatML (EOS: 32000). Converting to Mistral format...{Style.RESET_ALL}")

    # 2. Patch Generation Config
    gen_cfg["eos_token_id"] = 2
    gen_cfg["pad_token_id"] = 2  # Usually good practice to align pad/eos for base mistral
    save_json(gen_path, gen_cfg)
    print(f" Fixed generation_config.json (ID: 2)")

    # 3. Patch Tokenizer Config
    if os.path.exists(tok_conf_path):
        tok_cfg = load_json(tok_conf_path)
        # Change string to </s>
        tok_cfg["eos_token"] = "</s>"
        # Remove chat_template if it exists (prevents auto-detection issues later)
        tok_cfg.pop("chat_template", None)
        save_json(tok_conf_path, tok_cfg)
        print(f" Fixed tokenizer_config.json (Str: </s>)")

    # 4. Patch Special Tokens Map
    spec_path = os.path.join(model_path, "special_tokens_map.json")
    if os.path.exists(spec_path):
        spec_cfg = load_json(spec_path)
        spec_cfg["eos_token"] = "</s>"
        save_json(spec_path, spec_cfg)
        print(f" Fixed special_tokens_map.json")

    print(f"{Fore.GREEN} -> Successfully converted {model_name}{Style.RESET_ALL}")
|
| 74 |
+
|
| 75 |
+
def main():
    """CLI entry point: recursively scan a directory tree and convert every
    model folder found (any directory containing a config.json)."""
    parser = argparse.ArgumentParser(description="Convert ChatML models (EOS 32000) to Mistral format (EOS 2)")
    parser.add_argument("base_dir", help="Directory containing the model folders (e.g. B:\\7B)")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- CHATML TO MISTRAL CONVERTER ---{Style.RESET_ALL}")
    print("This script changes metadata only. It allows ChatML models to be merged")
    print("using 'tokenizer: source: base' without errors.\n")

    if not os.path.exists(args.base_dir):
        print(f"Error: Directory {args.base_dir} does not exist.")
        return

    def _is_model_dir(path):
        # Simple check if it looks like a model folder
        return os.path.exists(os.path.join(path, "config.json"))

    count = 0

    # Bug fix: also handle the case where base_dir itself is a model folder —
    # os.walk's `dirs` lists only subdirectories, so the root was never checked.
    if _is_model_dir(args.base_dir):
        convert_model(args.base_dir)
        count += 1

    # Walk through all subdirectories; convert_model() itself decides whether
    # each candidate actually needs conversion.
    for root, dirs, _files in os.walk(args.base_dir):
        for name in dirs:
            full_path = os.path.join(root, name)
            if _is_model_dir(full_path):
                convert_model(full_path)
                count += 1

    print("-" * 60)
    # Bug fix: `count` was tallied but never reported.
    print(f"Scan complete. Processed {count} model folder(s).")

if __name__ == "__main__":
    main()
|
enable_fix_mistral_regex_true.md
ADDED
|
@@ -0,0 +1,569 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
```
|
| 2 |
+
architecture: MistralForCausalLM
|
| 3 |
+
base_model: B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop
|
| 4 |
+
models:
|
| 5 |
+
- model: B:/12B/models--allura-org--Tlacuilo-12B
|
| 6 |
+
- model: B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop
|
| 7 |
+
- model: B:/12B/models--EldritchLabs--Altair-Stock-12B-v1/MPOA_alt
|
| 8 |
+
- model: B:/12B/models--inflatebot--MN-12B-Mag-Mell-R1
|
| 9 |
+
- model: B:/12B/models--MuXodious--Irix-12B-Model_Stock-absolute-heresy
|
| 10 |
+
- model: B:/12B/models--aixonlab--Aether-12b
|
| 11 |
+
- model: B:/12B/models--SicariusSicariiStuff--Impish_Bloodmoon_12B
|
| 12 |
+
- model: B:/12B/models--SicariusSicariiStuff--Sweet_Dreams_12B
|
| 13 |
+
- model: B:/12B/models--Epiculous--Azure_Dusk-v0.2
|
| 14 |
+
# - model: B:/12B/models--SicariusSicariiStuff--Impish_Bloodmoon_12B.backup
|
| 15 |
+
merge_method: karcher
|
| 16 |
+
parameters:
|
| 17 |
+
max_iter: 30 # Maximum iterations for Karcher mean algorithm (default 10 )
|
| 18 |
+
tol: 1e-9 # Convergence tolerance for Karcher mean algorithm (default 1e-5)
|
| 19 |
+
dtype: float32
|
| 20 |
+
out_dtype: bfloat16
|
| 21 |
+
tokenizer:
|
| 22 |
+
source: "union"
|
| 23 |
+
tokens:
|
| 24 |
+
# Force ChatML EOS tokens
|
| 25 |
+
"<|im_start|>":
|
| 26 |
+
source: "B:/12B/models--allura-org--Tlacuilo-12B"
|
| 27 |
+
force: true
|
| 28 |
+
"<|im_end|>":
|
| 29 |
+
source: "B:/12B/models--allura-org--Tlacuilo-12B"
|
| 30 |
+
force: true
|
| 31 |
+
# Keep Mistral tokens
|
| 32 |
+
"[INST]":
|
| 33 |
+
source: "B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop"
|
| 34 |
+
# source: "B:/12B/models--mistralai--Mistral-Nemo-Instruct-2407" # The tokenizer system requires all models referenced in token configurations to be present in the merge's model list to build proper embedding permutations.
|
| 35 |
+
"[/INST]":
|
| 36 |
+
source: "B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop"
|
| 37 |
+
# Force </s> as fallback EOS
|
| 38 |
+
"</s>":
|
| 39 |
+
source: "B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop"
|
| 40 |
+
force: true
|
| 41 |
+
|
| 42 |
+
chat_template: "chatml"
|
| 43 |
+
name: 🧜 Nocturne-Nereid-12B-v1
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
```
|
| 47 |
+
C:\mergekit-main>timeout /t 3 /nobreak && mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda
|
| 48 |
+
|
| 49 |
+
Waiting for 0 seconds, press CTRL+C to quit ...
|
| 50 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 51 |
+
Warmup loader cache: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1499.99it/s]
|
| 52 |
+
Planning operations
|
| 53 |
+
Executing graph: 0%| | 0/4361 [00:00<?, ?it/s]The tokenizer you are loading from 'B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 54 |
+
Loading tokenizers
|
| 55 |
+
The tokenizer you are loading from 'B:/12B/models--allura-org--Tlacuilo-12B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 56 |
+
The tokenizer you are loading from 'B:/12B/models--aixonlab--Aether-12b' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 57 |
+
The tokenizer you are loading from 'B:/12B/models--Epiculous--Azure_Dusk-v0.2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 58 |
+
The tokenizer you are loading from 'B:/12B/models--inflatebot--MN-12B-Mag-Mell-R1' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 59 |
+
The tokenizer you are loading from 'B:/12B/models--EldritchLabs--Altair-Stock-12B-v1/MPOA_alt' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 60 |
+
The tokenizer you are loading from 'B:/12B/models--SicariusSicariiStuff--Impish_Bloodmoon_12B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 61 |
+
The tokenizer you are loading from 'B:/12B/models--SicariusSicariiStuff--Sweet_Dreams_12B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 62 |
+
Building output tokenizer
|
| 63 |
+
Token '<|im_start|>' added with multiple different settings, using first
|
| 64 |
+
Building permutations
|
| 65 |
+
Building tokenizer permutations: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 9.54it/s]
|
| 66 |
+
Executing graph: 0%| | 3/4361 [00:07<2:44:37, 2.27s/it]
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
---
|
| 70 |
+
|
| 71 |
+
Based on the codebase, mergekit doesn't currently support passing `fix_mistral_regex=True` to tokenizer loading. The warnings you're seeing come from the transformers library, but mergekit only passes `trust_remote_code` to tokenizer loading calls, not the `fix_mistral_regex` parameter.
|
| 72 |
+
|
| 73 |
+
## Current Tokenizer Loading in MergeKit
|
| 74 |
+
|
| 75 |
+
The main tokenizer loading happens in `mergekit/tokenizer/build.py`:
|
| 76 |
+
|
| 77 |
+
```python
|
| 78 |
+
# In get_stripped_tokenizer()
|
| 79 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 80 |
+
path.path,
|
| 81 |
+
revision=path.revision,
|
| 82 |
+
trust_remote_code=trust_remote_code,
|
| 83 |
+
use_fast=True,
|
| 84 |
+
)
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
And in `build_tokenizer()`:
|
| 88 |
+
```python
|
| 89 |
+
model_tok = transformers.AutoTokenizer.from_pretrained(
|
| 90 |
+
model.model.path,
|
| 91 |
+
revision=model.model.revision,
|
| 92 |
+
trust_remote_code=trust_remote_code,
|
| 93 |
+
)
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## How to Add Support
|
| 97 |
+
|
| 98 |
+
To add `fix_mistral_regex=True` support, you would need to:
|
| 99 |
+
|
| 100 |
+
1. **Add the parameter to MergeOptions** - This would require modifying the options structure to include a new field for `fix_mistral_regex`
|
| 101 |
+
|
| 102 |
+
2. **Update all tokenizer loading calls** - Modify the `AutoTokenizer.from_pretrained()` calls throughout the codebase to pass this parameter:
|
| 103 |
+
- `mergekit/tokenizer/build.py` (lines 49-54, 155-157, 203-207)
|
| 104 |
+
- `mergekit/merge.py` (line 271-275)
|
| 105 |
+
- `mergekit/scripts/tokensurgeon.py` (line 148-152)
|
| 106 |
+
|
| 107 |
+
3. **Add CLI flag** - Add a `--fix-mistral-regex` flag to the CLI that sets this option
|
| 108 |
+
|
| 109 |
+
## Temporary Workaround
|
| 110 |
+
|
| 111 |
+
Until mergekit adds native support, you could:
|
| 112 |
+
|
| 113 |
+
1. **Modify the source locally** - Add `fix_mistral_regex=True` directly to the `AutoTokenizer.from_pretrained()` calls in your local mergekit installation
|
| 114 |
+
|
| 115 |
+
2. **Use a different tokenizer source** - If one of your models has a correct tokenizer, you could specify it as the tokenizer source in your YAML config
|
| 116 |
+
|
| 117 |
+
The warnings won't prevent your merge from completing, but as noted, they may lead to incorrect tokenization behavior with the affected models.
|
| 118 |
+
|
| 119 |
+
## Notes
|
| 120 |
+
|
| 121 |
+
The `trust_remote_code` parameter you're already using via `--trust-remote-code` follows the same pattern through the codebase - it's passed from CLI options to the tokenizer loading functions. Adding `fix_mistral_regex` support would require implementing the same pattern.
|
| 122 |
+
|
| 123 |
+
---
|
| 124 |
+
|
| 125 |
+
> Explain how to enable this for merges on my local copy, step by step.
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
To enable `fix_mistral_regex=True` for your local mergekit installation, you'll need to modify several files to add this parameter throughout the tokenizer loading pipeline.
|
| 130 |
+
|
| 131 |
+
## Step-by-Step Local Modifications
|
| 132 |
+
|
| 133 |
+
### 1. Add the option to MergeOptions
|
| 134 |
+
|
| 135 |
+
Edit `mergekit/options.py` and add the new field:
|
| 136 |
+
|
| 137 |
+
```python
|
| 138 |
+
# In MergeOptions class (around line 41)
|
| 139 |
+
fix_mistral_regex: bool = False
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
Also add it to `OPTION_HELP` and `OPTION_CATEGORIES` dictionaries:
|
| 143 |
+
|
| 144 |
+
```python
|
| 145 |
+
# Around line 114
|
| 146 |
+
OPTION_HELP = {
|
| 147 |
+
# ... existing options ...
|
| 148 |
+
"fix_mistral_regex": "Fix incorrect regex pattern in Mistral tokenizers",
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
# Around line 140
|
| 152 |
+
OPTION_CATEGORIES = {
|
| 153 |
+
# ... existing categories ...
|
| 154 |
+
"fix_mistral_regex": "Dangerous Options",
|
| 155 |
+
}
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### 2. Update tokenizer loading in mergekit/tokenizer/build.py
|
| 159 |
+
|
| 160 |
+
Modify all `AutoTokenizer.from_pretrained()` calls to include the new parameter:
|
| 161 |
+
|
| 162 |
+
```python
|
| 163 |
+
# Line 49-54 in get_stripped_tokenizer()
|
| 164 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 165 |
+
path.path,
|
| 166 |
+
revision=path.revision,
|
| 167 |
+
trust_remote_code=trust_remote_code,
|
| 168 |
+
use_fast=True,
|
| 169 |
+
fix_mistral_regex=fix_mistral_regex, # Add this line
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
# Line 203-207 in build_tokenizer()
|
| 173 |
+
model_tok = transformers.AutoTokenizer.from_pretrained(
|
| 174 |
+
model.model.path,
|
| 175 |
+
revision=model.model.revision,
|
| 176 |
+
trust_remote_code=trust_remote_code,
|
| 177 |
+
fix_mistral_regex=fix_mistral_regex, # Add this line
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
# Line 155-157 in build_union_tokenizer()
|
| 181 |
+
res = transformers.AutoTokenizer.from_pretrained(
|
| 182 |
+
p, use_fast=True, trust_remote_code=trust_remote_code,
|
| 183 |
+
fix_mistral_regex=fix_mistral_regex, # Add this line
|
| 184 |
+
)
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
You'll also need to update the function signatures to accept this parameter and pass it through the call chain.
|
| 188 |
+
|
| 189 |
+
### 3. Update mergekit/merge.py
|
| 190 |
+
|
| 191 |
+
Modify the tokenizer loading in `_copy_tokenizer()`:
|
| 192 |
+
|
| 193 |
+
```python
|
| 194 |
+
# Line 271-275
|
| 195 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 196 |
+
donor_model.model.path,
|
| 197 |
+
revision=donor_model.model.revision,
|
| 198 |
+
trust_remote_code=options.trust_remote_code,
|
| 199 |
+
fix_mistral_regex=options.fix_mistral_regex, # Add this line
|
| 200 |
+
)
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
### 4. Update mergekit/scripts/tokensurgeon.py
|
| 204 |
+
|
| 205 |
+
Modify the tokenizer loading:
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
# Line 148-152 in get_stuff()
|
| 209 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 210 |
+
model.model.path,
|
| 211 |
+
revision=model.model.revision,
|
| 212 |
+
trust_remote_code=options.trust_remote_code,
|
| 213 |
+
fix_mistral_regex=options.fix_mistral_regex, # Add this line
|
| 214 |
+
)
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
### 5. Update mergekit/scripts/moe.py
|
| 218 |
+
|
| 219 |
+
Modify the tokenizer loading:
|
| 220 |
+
|
| 221 |
+
```python
|
| 222 |
+
# Line 36-38
|
| 223 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(
|
| 224 |
+
base_model.model.path,
|
| 225 |
+
revision=base_model.model.revision,
|
| 226 |
+
fix_mistral_regex=merge_options.fix_mistral_regex, # Add this line
|
| 227 |
+
)
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
### 6. Update function signatures
|
| 231 |
+
|
| 232 |
+
You'll need to update the function signatures to pass the `fix_mistral_regex` parameter through the call chain. The key functions to update are:
|
| 233 |
+
- `get_stripped_tokenizer()` in `mergekit/tokenizer/build.py`
|
| 234 |
+
- `build_union_tokenizer()` in `mergekit/tokenizer/build.py`
|
| 235 |
+
- `build_tokenizer()` in `mergekit/tokenizer/build.py`
|
| 236 |
+
- `BuildTokenizer.execute()` in `mergekit/tokenizer/build.py`
|
| 237 |
+
|
| 238 |
+
### 7. Use the new flag
|
| 239 |
+
|
| 240 |
+
After making these changes, you can use the new flag in your command:
|
| 241 |
+
|
| 242 |
+
```bash
|
| 243 |
+
mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda --fix-mistral-regex
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
## Notes
|
| 247 |
+
|
| 248 |
+
The `trust_remote_code` parameter follows the same pattern through the codebase - it's passed from CLI options to the tokenizer loading functions [1](#0-0) [2](#0-1) . Adding `fix_mistral_regex` support requires implementing the same pattern throughout all tokenizer loading locations [3](#0-2) [4](#0-3) .
|
| 249 |
+
|
| 250 |
+
The warnings you're seeing come from the transformers library and indicate that the tokenizer regex pattern may lead to incorrect tokenization [5](#0-4) . While the merge will complete without this fix, enabling it will ensure proper tokenization behavior.
|
| 251 |
+
|
| 252 |
+
---
|
| 253 |
+
|
| 254 |
+
> I did all of that and now it says this
|
| 255 |
+
|
| 256 |
+
```
|
| 257 |
+
C:\mergekit-main>timeout /t 3 /nobreak && mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda --fix-mistral-regex
|
| 258 |
+
|
| 259 |
+
Waiting for 0 seconds, press CTRL+C to quit ...
|
| 260 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 261 |
+
Warmup loader cache: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1499.99it/s]
|
| 262 |
+
Planning operations
|
| 263 |
+
Executing graph: 0%| | 1/4361 [00:00<14:53, 4.88it/s]
|
| 264 |
+
Traceback (most recent call last):
|
| 265 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 266 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 267 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Scripts\mergekit-yaml.exe\__main__.py", line 5, in <module>
|
| 268 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1485, in __call__
|
| 269 |
+
return self.main(*args, **kwargs)
|
| 270 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 271 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1406, in main
|
| 272 |
+
rv = self.invoke(ctx)
|
| 273 |
+
^^^^^^^^^^^^^^^^
|
| 274 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1269, in invoke
|
| 275 |
+
return ctx.invoke(self.callback, **ctx.params)
|
| 276 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 277 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 824, in invoke
|
| 278 |
+
return callback(*args, **kwargs)
|
| 279 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 280 |
+
File "C:\mergekit-main\mergekit\options.py", line 169, in wrapper
|
| 281 |
+
return f(*args, **kwargs)
|
| 282 |
+
^^^^^^^^^^^^^^^^^^
|
| 283 |
+
File "C:\mergekit-main\mergekit\scripts\run_yaml.py", line 30, in main
|
| 284 |
+
run_merge(
|
| 285 |
+
File "C:\mergekit-main\mergekit\merge.py", line 85, in run_merge
|
| 286 |
+
for _task, value in exec.run(quiet=options.quiet):
|
| 287 |
+
File "C:\mergekit-main\mergekit\graph.py", line 747, in run
|
| 288 |
+
for handle, value in self._run(quiet=quiet, desc=desc):
|
| 289 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 290 |
+
File "C:\mergekit-main\mergekit\graph.py", line 712, in _run
|
| 291 |
+
res = task.execute(**cpu_arguments)
|
| 292 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 293 |
+
File "C:\mergekit-main\mergekit\tokenizer\build.py", line 294, in execute
|
| 294 |
+
return build_tokenizer(
|
| 295 |
+
^^^^^^^^^^^^^^^^
|
| 296 |
+
File "C:\mergekit-main\mergekit\tokenizer\build.py", line 193, in build_tokenizer
|
| 297 |
+
tokenizer_base = get_stripped_tokenizer(
|
| 298 |
+
^^^^^^^^^^^^^^^^^^^^^^^
|
| 299 |
+
File "C:\mergekit-main\mergekit\tokenizer\build.py", line 54, in get_stripped_tokenizer
|
| 300 |
+
fix_mistral_regex=fix_mistral_regex, # Add this line
|
| 301 |
+
^^^^^^^^^^^^^^^^^
|
| 302 |
+
NameError: name 'fix_mistral_regex' is not defined
|
| 303 |
+
|
| 304 |
+
C:\mergekit-main>
|
| 305 |
+
```
|
| 306 |
+
|
| 307 |
+
---
|
| 308 |
+
|
| 309 |
+
The error occurs because you added `fix_mistral_regex` to the `AutoTokenizer.from_pretrained()` call in `get_stripped_tokenizer()` but didn't update the function signature to accept this parameter. The parameter needs to be passed through the entire call chain.
|
| 310 |
+
|
| 311 |
+
## Fix the function signatures
|
| 312 |
+
|
| 313 |
+
### 1. Update `get_stripped_tokenizer()` in mergekit/tokenizer/build.py
|
| 314 |
+
|
| 315 |
+
```python
|
| 316 |
+
def get_stripped_tokenizer(
|
| 317 |
+
path: ModelPath, trust_remote_code: bool = False, fix_mistral_regex: bool = False
|
| 318 |
+
) -> transformers.PreTrainedTokenizerFast:
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
### 2. Update `build_tokenizer()` in mergekit/tokenizer/build.py
|
| 322 |
+
|
| 323 |
+
```python
|
| 324 |
+
def build_tokenizer(
|
| 325 |
+
base_model: Optional[ModelReference],
|
| 326 |
+
referenced_models: List[ModelReference],
|
| 327 |
+
tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference],
|
| 328 |
+
trust_remote_code: bool,
|
| 329 |
+
fix_mistral_regex: bool = False,
|
| 330 |
+
add_tokens: Optional[List[str]] = None,
|
| 331 |
+
) -> TokenizerInfo:
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
### 3. Update `BuildTokenizer.execute()` in mergekit/tokenizer/build.py
|
| 335 |
+
|
| 336 |
+
```python
|
| 337 |
+
class BuildTokenizer(Task[TokenizerInfo]):
|
| 338 |
+
base_model: Optional[ModelReference]
|
| 339 |
+
referenced_models: Tuple[ModelReference, ...]
|
| 340 |
+
tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference]
|
| 341 |
+
add_tokens: Optional[Tuple[str, ...]]
|
| 342 |
+
trust_remote_code: bool = False
|
| 343 |
+
fix_mistral_regex: bool = False
|
| 344 |
+
|
| 345 |
+
def execute(self, **_kwargs) -> TokenizerInfo:
|
| 346 |
+
return build_tokenizer(
|
| 347 |
+
base_model=self.base_model,
|
| 348 |
+
referenced_models=self.referenced_models,
|
| 349 |
+
tokenizer_source=self.tokenizer_source,
|
| 350 |
+
trust_remote_code=self.trust_remote_code,
|
| 351 |
+
fix_mistral_regex=self.fix_mistral_regex,
|
| 352 |
+
add_tokens=self.add_tokens,
|
| 353 |
+
)
|
| 354 |
+
```
|
| 355 |
+
|
| 356 |
+
### 4. Update the call in `build_tokenizer()`
|
| 357 |
+
|
| 358 |
+
```python
|
| 359 |
+
tokenizer_base = get_stripped_tokenizer(
|
| 360 |
+
base_model.model, trust_remote_code=trust_remote_code, fix_mistral_regex=fix_mistral_regex
|
| 361 |
+
)
|
| 362 |
+
```
|
| 363 |
+
|
| 364 |
+
### 5. Update the call in `MergePlanner.__init__()` in mergekit/plan.py
|
| 365 |
+
|
| 366 |
+
```python
|
| 367 |
+
self._tokenizer_task = BuildTokenizer(
|
| 368 |
+
base_model=config.base_model,
|
| 369 |
+
referenced_models=tuple(config.referenced_models()),
|
| 370 |
+
tokenizer_source=tokenizer_source,
|
| 371 |
+
trust_remote_code=options.trust_remote_code,
|
| 372 |
+
fix_mistral_regex=options.fix_mistral_regex,
|
| 373 |
+
add_tokens=tuple(token_cfg.keys()),
|
| 374 |
+
)
|
| 375 |
+
```
|
| 376 |
+
|
| 377 |
+
### 6. Update `build_union_tokenizer()` in mergekit/tokenizer/build.py
|
| 378 |
+
|
| 379 |
+
```python
|
| 380 |
+
def build_union_tokenizer(
|
| 381 |
+
base_tok: transformers.PreTrainedTokenizerBase,
|
| 382 |
+
tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase],
|
| 383 |
+
trust_remote_code: bool = False,
|
| 384 |
+
fix_mistral_regex: bool = False,
|
| 385 |
+
) -> transformers.PreTrainedTokenizerBase:
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
And update the call within it:
|
| 389 |
+
```python
|
| 390 |
+
res = transformers.AutoTokenizer.from_pretrained(
|
| 391 |
+
p, use_fast=True, trust_remote_code=trust_remote_code, fix_mistral_regex=fix_mistral_regex
|
| 392 |
+
)
|
| 393 |
+
```
|
| 394 |
+
|
| 395 |
+
Also update the call to `build_union_tokenizer()` in `build_tokenizer()`:
|
| 396 |
+
```python
|
| 397 |
+
tokenizer_out = build_union_tokenizer(
|
| 398 |
+
tokenizer_base, tokenizers, trust_remote_code=trust_remote_code, fix_mistral_regex=fix_mistral_regex
|
| 399 |
+
)
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
+
## Notes
|
| 403 |
+
|
| 404 |
+
The error occurs because `get_stripped_tokenizer()` doesn't have `fix_mistral_regex` in its parameter list, so when you try to use it in the function body, Python raises a `NameError` [1](#0-0) . The parameter must be added to each function in the call chain from `BuildTokenizer.execute()` down to the actual `AutoTokenizer.from_pretrained()` calls [2](#0-1) .
|
| 405 |
+
|
| 406 |
+
---
|
| 407 |
+
|
| 408 |
+
> I now get this error
|
| 409 |
+
|
| 410 |
+
```
|
| 411 |
+
C:\mergekit-main>timeout /t 3 /nobreak && mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda --fix-mistral-regex
|
| 412 |
+
|
| 413 |
+
Waiting for 0 seconds, press CTRL+C to quit ...
|
| 414 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 415 |
+
Warmup loader cache: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1809.80it/s]
|
| 416 |
+
Planning operations
|
| 417 |
+
Executing graph: 0%| | 0/4361 [00:00<?, ?it/s]Loading tokenizers
|
| 418 |
+
Building output tokenizer
|
| 419 |
+
Token '<|im_start|>' added with multiple different settings, using first
|
| 420 |
+
Executing graph: 0%| | 1/4361 [00:04<5:27:21, 4.51s/it]
|
| 421 |
+
Traceback (most recent call last):
|
| 422 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 423 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 424 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Scripts\mergekit-yaml.exe\__main__.py", line 5, in <module>
|
| 425 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1485, in __call__
|
| 426 |
+
return self.main(*args, **kwargs)
|
| 427 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 428 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1406, in main
|
| 429 |
+
rv = self.invoke(ctx)
|
| 430 |
+
^^^^^^^^^^^^^^^^
|
| 431 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1269, in invoke
|
| 432 |
+
return ctx.invoke(self.callback, **ctx.params)
|
| 433 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 434 |
+
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 824, in invoke
|
| 435 |
+
return callback(*args, **kwargs)
|
| 436 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 437 |
+
File "C:\mergekit-main\mergekit\options.py", line 169, in wrapper
|
| 438 |
+
return f(*args, **kwargs)
|
| 439 |
+
^^^^^^^^^^^^^^^^^^
|
| 440 |
+
File "C:\mergekit-main\mergekit\scripts\run_yaml.py", line 30, in main
|
| 441 |
+
run_merge(
|
| 442 |
+
File "C:\mergekit-main\mergekit\merge.py", line 85, in run_merge
|
| 443 |
+
for _task, value in exec.run(quiet=options.quiet):
|
| 444 |
+
File "C:\mergekit-main\mergekit\graph.py", line 747, in run
|
| 445 |
+
for handle, value in self._run(quiet=quiet, desc=desc):
|
| 446 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 447 |
+
File "C:\mergekit-main\mergekit\graph.py", line 712, in _run
|
| 448 |
+
res = task.execute(**cpu_arguments)
|
| 449 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 450 |
+
File "C:\mergekit-main\mergekit\tokenizer\build.py", line 296, in execute
|
| 451 |
+
return build_tokenizer(
|
| 452 |
+
^^^^^^^^^^^^^^^^
|
| 453 |
+
File "C:\mergekit-main\mergekit\tokenizer\build.py", line 232, in build_tokenizer
|
| 454 |
+
tokenizer_out = build_union_tokenizer(
|
| 455 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 456 |
+
File "C:\mergekit-main\mergekit\tokenizer\build.py", line 158, in build_union_tokenizer
|
| 457 |
+
fix_mistral_regex=fix_mistral_regex, # Add this line
|
| 458 |
+
^^^^^^^^^^^^^^^^^
|
| 459 |
+
NameError: name 'fix_mistral_regex' is not defined
|
| 460 |
+
|
| 461 |
+
C:\mergekit-main>
|
| 462 |
+
```
|
| 463 |
+
|
| 464 |
+
---
|
| 465 |
+
|
| 466 |
+
The error occurs because you haven't updated the function signature for `build_union_tokenizer()` to accept the `fix_mistral_regex` parameter. Here's the fix:
|
| 467 |
+
|
| 468 |
+
## Fix for `build_union_tokenizer()`
|
| 469 |
+
|
| 470 |
+
Update the function signature in `mergekit/tokenizer/build.py`:
|
| 471 |
+
|
| 472 |
+
```python
|
| 473 |
+
def build_union_tokenizer(
|
| 474 |
+
base_tok: transformers.PreTrainedTokenizerBase,
|
| 475 |
+
tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase],
|
| 476 |
+
trust_remote_code: bool = False,
|
| 477 |
+
fix_mistral_regex: bool = False, # Add this parameter
|
| 478 |
+
) -> transformers.PreTrainedTokenizerBase:
|
| 479 |
+
```
|
| 480 |
+
|
| 481 |
+
And update the `AutoTokenizer.from_pretrained()` call within the same function (around line 155-157):
|
| 482 |
+
|
| 483 |
+
```python
|
| 484 |
+
res = transformers.AutoTokenizer.from_pretrained(
|
| 485 |
+
p,
|
| 486 |
+
use_fast=True,
|
| 487 |
+
trust_remote_code=trust_remote_code,
|
| 488 |
+
fix_mistral_regex=fix_mistral_regex # This line already exists
|
| 489 |
+
)
|
| 490 |
+
```
|
| 491 |
+
|
| 492 |
+
The call to `build_union_tokenizer()` in `build_tokenizer()` (line 232) should already be passing the parameter correctly based on your previous updates.
|
| 493 |
+
|
| 494 |
+
## Complete Function Update
|
| 495 |
+
|
| 496 |
+
Here's the complete updated function for `build_union_tokenizer()`:
|
| 497 |
+
|
| 498 |
+
```python
|
| 499 |
+
def build_union_tokenizer(
|
| 500 |
+
base_tok: transformers.PreTrainedTokenizerBase,
|
| 501 |
+
tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase],
|
| 502 |
+
trust_remote_code: bool = False,
|
| 503 |
+
fix_mistral_regex: bool = False,
|
| 504 |
+
) -> transformers.PreTrainedTokenizerBase:
|
| 505 |
+
# ... rest of function remains the same ...
|
| 506 |
+
```
|
| 507 |
+
|
| 508 |
+
After this change, your merge should proceed without the `NameError` [1](#0-0) .
|
| 509 |
+
|
| 510 |
+
## Notes
|
| 511 |
+
|
| 512 |
+
The error pattern shows that `fix_mistral_regex` needs to be passed through the entire call chain from the CLI options down to each `AutoTokenizer.from_pretrained()` call [2](#0-1) . You've successfully updated most of the chain, but this function signature was the missing piece.
|
| 513 |
+
|
| 514 |
+
---
|
| 515 |
+
|
| 516 |
+
**Analyzing the Fix**
|
| 517 |
+
|
| 518 |
+
I'm currently focused on dissecting the `fix_mistral_regex=True` parameter within `mergekit`. My primary goal is to determine the precise function of this fix in the context of merging Mistral models. I need to understand its impact and how it addresses a potential issue or enhances the merging process.
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
**Investigating the Root**
|
| 522 |
+
|
| 523 |
+
I'm now diving into the 'why' behind the regex fix, focusing on the author's explanation. The provided text details issues with Mistral tokenizers and chat templates, particularly around premature termination, endless repetition, and hallucinated end-of-sequence tokens. The fix seems to address these problems in a specific Karcher merge, notably resolving issues with chat templates and jailbreaks, while acknowledging some template-specific behavior.
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
**Clarifying the Bugfix**
|
| 527 |
+
|
| 528 |
+
I'm now zeroing in on the technical implementation. The core of `fix_mistral_regex=True` is overriding a faulty regex pattern in the tokenizer config. This pattern is part of how the tokenizer handles pre-tokenization, and the original pattern was causing issues with specific Mistral models, including those Nemo 12B-based models. This fix is crucial for compatibility because of this bug in the `transformers` library, which causes the tokenizer to misbehave in edge cases.
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
**Connecting the Incompatibilities**
|
| 532 |
+
|
| 533 |
+
I'm now understanding how different tokenizer configs cause issues during the merge process. Building a "union" tokenizer with conflicting regex patterns leads to misaligned token mappings, especially for special tokens. The `fix_mistral_regex=True` parameter directly addresses this problem. I am able to identify the core issues: early termination or endless repetition, and hallucination of `<|im_end|>`. These are a direct result of the broken regex and tokenizer incompatibility across the various models being merged.
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
**Mapping the Tokenization**
|
| 537 |
+
|
| 538 |
+
I'm now focusing on how the fix's implementation. It seems by overriding the pre-tokenization regex, mergekit accurately maps overlapping tokens and inserts the ChatML tokens, like `<|im_start|>` and `<|im_end|>`, correctly. This prevents the underlying regex from disrupting token boundaries, ensuring proper handling of different prompt formats during the merge. I'm now documenting how this fixes ChatML/Special Tokens, generation loops, and resolves incompatibilities.
|
| 539 |
+
|
| 540 |
+
Based on the logs, the Hugging Face issue linked in the warnings, and the author's notes, here is an explanation of what `fix_mistral_regex=True` does and why it solves the merge issues.
|
| 541 |
+
|
| 542 |
+
### What the fix actually does (Technically)
|
| 543 |
+
Mistral models (specifically those using their newer "Tekken" tokenizer, like Mistral Nemo 12B and Mistral Small 3) rely on a specific Regular Expression (regex) pattern to pre-tokenize text—meaning how it splits sentences into chunks before assigning token IDs.
|
| 544 |
+
|
| 545 |
+
Recently, a bug was discovered where the regex pattern saved inside the `tokenizer_config.json` of these models was slightly incorrect. This caused the tokenizer to misinterpret certain character boundaries, spaces, and special tokens.
|
| 546 |
+
|
| 547 |
+
When you pass `fix_mistral_regex=True` to `AutoTokenizer.from_pretrained()` in the `transformers` library, it tells the library to **ignore the broken regex pattern in the model's files and dynamically replace it with the corrected pattern**.
|
| 548 |
+
|
| 549 |
+
In the context of `mergekit`, if this fix isn't applied, `mergekit` tries to build a "union" tokenizer out of 9 different models using a broken set of rules. This results in a corrupted vocabulary mapping where special tokens get misaligned.
|
| 550 |
+
|
| 551 |
+
### Why it fixes the issues (According to the Author)
|
| 552 |
+
|
| 553 |
+
According to the author's notes, applying this fix during the merge process resolved several severe, model-breaking behaviors:
|
| 554 |
+
|
| 555 |
+
**1. It stops the model from hallucinating `<|im_end|>` as plain text**
|
| 556 |
+
Without the fix, the broken regex was likely causing the tokenizer to fail at recognizing `<|im_end|>` as a single, distinct "Special Token." Instead, the model was treating it as regular text. Because the text generation interface didn't see the actual *End of Sequence ID*, it just printed the literal characters `<|im_end|>` to the screen and kept generating. Fixing the regex ensures the union tokenizer correctly isolates and maps these ChatML special tokens.
|
| 557 |
+
|
| 558 |
+
**2. It fixes endless repetition and early termination**
|
| 559 |
+
Because the token mappings were corrupted by the bad regex, the merged model's language head was firing the wrong token IDs. It would either output an EOS (End of Sequence) token immediately (terminating early) or completely lose the ability to output an EOS token (repeating endlessly). The fix restores the correct token IDs for stopping generation.
|
| 560 |
+
|
| 561 |
+
**3. It resolves "Tokenizer Incompatibility" across different fine-tunes**
|
| 562 |
+
The author merged 9 different models. Some of these models (like `Mistral-Nemo-Instruct-2407`) use the default Mistral `[INST]` format, while others (like `Tlacuilo-12B`) were fine-tuned to use `ChatML` (`<|im_start|>`).
|
| 563 |
+
Normally, merging models with different prompt formats and added tokens causes severe brain damage to the model. By fixing the underlying regex, `mergekit` was able to accurately map the overlapping base tokens and cleanly inject the forced ChatML tokens (as defined in the YAML config), creating a stable hybrid tokenizer.
|
| 564 |
+
|
| 565 |
+
### The Author's Final Result
|
| 566 |
+
By applying this fix, the author successfully created a 9-model Karcher merge that:
|
| 567 |
+
* **Works perfectly with ChatML:** It follows instructions, doesn't loop, and stops when it is supposed to.
|
| 568 |
+
* **Accepts Jailbreaks:** The control vectors from the "heretic/noslop" models successfully carried over.
|
| 569 |
+
* **Has one remaining quirk:** The author notes that if you try to use the *Mistral Tekken* prompt format instead of *ChatML*, the model terminates early and refuses prompts. This makes sense: the YAML config explicitly forced the ChatML tokens to overwrite the Mistral ones, meaning the model's brain is now permanently wired to expect ChatML.
|
folder+subfolder_content_combiner_anyfiles.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
# --- Configuration ---
|
| 5 |
+
OUTPUT_FILENAME = "!combo.txt"
|
| 6 |
+
# Folders to skip to prevent massive/useless data bloat
|
| 7 |
+
BLACKLIST_DIRS = {".git", "__pycache__", ".ipynb_checkpoints", "node_modules", "venv", ".vscode"}
|
| 8 |
+
# --- End Configuration ---
|
| 9 |
+
|
| 10 |
+
def combine_all_recursive():
    """
    Recursively combine every file under the script's directory into one
    output file, wrapping each file's contents in relative-path tags.
    """
    try:
        # Resolve our own location (frozen executable vs. plain script).
        if getattr(sys, 'frozen', False):
            self_path = os.path.abspath(sys.executable)
        else:
            self_path = os.path.abspath(__file__)

        base_dir = os.path.dirname(self_path)
        combo_path = os.path.join(base_dir, OUTPUT_FILENAME)

        print(f"Scanning recursively in: {base_dir}")
        print(f"Output file: {combo_path}")
        print("-" * 40)

        collected = []

        # os.walk handles the recursion automatically.
        for root, dirs, files in os.walk(base_dir):
            # Prune blacklisted directories in-place so os.walk skips them.
            dirs[:] = [d for d in dirs if d not in BLACKLIST_DIRS]

            for fname in files:
                abs_path = os.path.join(root, fname)

                # Never include this script itself or a previous combined output.
                if (fname == OUTPUT_FILENAME or
                        os.path.normpath(abs_path) == os.path.normpath(self_path)):
                    continue

                # Keep the relative path for sorting and for the wrapper tags.
                collected.append((os.path.relpath(abs_path, base_dir), abs_path))

        # Alphabetical by relative path keeps the combo file organized logically.
        collected.sort(key=lambda pair: pair[0])

        if not collected:
            print("No files found to combine.")
            return

        print(f"Found {len(collected)} files. Starting combination...")

        with open(combo_path, 'w', encoding='utf-8', errors='replace') as out:
            out.write("```\n")  # Opening wrapper

            for rel, abs_path in collected:
                print(f"Adding: {rel}")
                try:
                    out.write(f"<{rel}>\n")
                    with open(abs_path, 'r', encoding='utf-8', errors='replace') as src:
                        out.write(src.read())
                    out.write(f"\n</{rel}>\n\n")
                except Exception as e:
                    # Record the failure inline and still close the tag.
                    out.write(f"ERROR READING FILE {rel}: {e}\n")
                    out.write(f"</{rel}>\n\n")

            out.write("```\n")  # Closing wrapper

        print("-" * 40)
        print(f"SUCCESS! Combined {len(collected)} files into '{OUTPUT_FILENAME}'")

    except Exception as e:
        print(f"CRITICAL ERROR: {e}")
        import traceback
        traceback.print_exc()
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
    combine_all_recursive()
    # Keep the console window open when launched by double-click;
    # EOFError means there is no interactive stdin (e.g. piped run).
    try:
        input("\nProcess complete. Press Enter to exit...")
    except EOFError:
        pass
|
model_tools.md
CHANGED
|
@@ -8,7 +8,7 @@ pinned: false
|
|
| 8 |
---
|
| 9 |
|
| 10 |
# Model Tools by Naphula
|
| 11 |
-
Tools to enhance LLM quantizations and merging
|
| 12 |
|
| 13 |
# [graph_v18.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/graph_v18.py)
|
| 14 |
- Merge models in minutes instead of hours on low VRAM. For a 3060/3060 Ti user: This script enables functionality that is otherwise impossible (merging 70B models or large 7B merges with `--cuda`) without OOM. [More details here](https://huggingface.co/spaces/Naphula/model_tools/blob/main/mergekit_low-VRAM-graph_patch.md)
|
|
@@ -17,6 +17,14 @@ Tools to enhance LLM quantizations and merging
|
|
| 17 |
# config.py
|
| 18 |
- Simply replace line 13 | BEFORE `ScalarOrGradient: TypeAlias = Union[float, List[float]]` → AFTER `ScalarOrGradient: TypeAlias = Union[float, List[float], str, bool]` | to allow for custom filepath strings within parameter settings.
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# [audit_della.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/audit_della.py)
|
| 21 |
- Audit the compatibility of donor models for `Della` merges before merging. See: [example chart Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.png), [example log Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.log), [example chart Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.png), [example log Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.log)
|
| 22 |
|
|
@@ -40,13 +48,16 @@ Tools to enhance LLM quantizations and merging
|
|
| 40 |
- Then assign the num_experts_per_tok in config.json (or the config.yaml)
|
| 41 |
|
| 42 |
# [tokensurgeon.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokensurgeon.py)
|
| 43 |
-
- Uses adaptive VRAM from Grim Jim's `measure.py` like `graph_v18` to prevent OOM. Use recommended [batch file](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fix_tokenizers.bat) here or modify sh. This
|
| 44 |
|
| 45 |
# [tokeninspector.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokeninspector.py)
|
| 46 |
- Audit your tokensurgeon results.
|
| 47 |
|
|
|
|
|
|
|
|
|
|
| 48 |
# [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
|
| 49 |
-
- Updated! This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch missing `generation_config.json` files for EOS token. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
|
| 50 |
|
| 51 |
# [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
|
| 52 |
- This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
|
|
@@ -63,6 +74,9 @@ Tools to enhance LLM quantizations and merging
|
|
| 63 |
# [textonly_ripper_v2.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper_v2.py)
|
| 64 |
- Converts a sharded, multimodal (text and vision) model into a text-only version. Readme at [textonly_ripper.md](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper.md)
|
| 65 |
|
|
|
|
|
|
|
|
|
|
| 66 |
# [vocab_resizer.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/vocab_resizer.py)
|
| 67 |
- Converts models with larger vocab_sizes to a standard size (default 131072 Mistral 24B) for use with mergekit. Note that `tokenizer.model` must be manually copied into the `/fixed/` folder.
|
| 68 |
|
|
@@ -70,11 +84,14 @@ Tools to enhance LLM quantizations and merging
|
|
| 70 |
- This script will load a "fat" 18.9GB model (default Gemma 9B), force it to tie the weights (deduplicating the lm_head), and re-save it. This will drop the file size to ~17.2GB and make it compatible with the others.
|
| 71 |
|
| 72 |
# [model_index_json_generator.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/model_index_json_generator.py)
|
| 73 |
-
- Generates a missing `model.safetensors.index.json` file. Useful for cases where safetensors may have been sharded at the wrong size.
|
| 74 |
|
| 75 |
# [folder_content_combiner_anyfiles.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/folder_content_combiner_anyfiles.py)
|
| 76 |
- Combines all files in the script's current directory into a single output file, sorted alphabetically.
|
| 77 |
|
|
|
|
|
|
|
|
|
|
| 78 |
# [GGUF Repo Suite](https://huggingface.co/spaces/Naphula/gguf-repo-suite)
|
| 79 |
- Create and quantize Hugging Face models
|
| 80 |
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
# Model Tools by Naphula
|
| 11 |
+
Tools to enhance LLM quantizations and merging. Merge and audit large language models with low VRAM.
|
| 12 |
|
| 13 |
# [graph_v18.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/graph_v18.py)
|
| 14 |
- Merge models in minutes instead of hours on low VRAM. For a 3060/3060 Ti user: This script enables functionality that is otherwise impossible (merging 70B models or large 7B merges with `--cuda`) without OOM. [More details here](https://huggingface.co/spaces/Naphula/model_tools/blob/main/mergekit_low-VRAM-graph_patch.md)
|
|
|
|
| 17 |
# config.py
|
| 18 |
- Simply replace line 13 | BEFORE `ScalarOrGradient: TypeAlias = Union[float, List[float]]` → AFTER `ScalarOrGradient: TypeAlias = Union[float, List[float], str, bool]` | to allow for custom filepath strings within parameter settings.
|
| 19 |
|
| 20 |
+
# [enable_fix_mistral_regex_true.md](https://huggingface.co/spaces/Naphula/model_tools/blob/main/enable_fix_mistral_regex_true.md)
|
| 21 |
+
- Merge models with extreme tokenizer incompatibility. Requires modifying the `mergekit.yaml` `tokenizer` section and adding `--fix-mistral-regex` to your merge commands. (Note: Do not use `tokensurgeon.py`, `gen_id_patcher.py`, or `vocab_id_patcher.py` with this; they are obsolete now.) Configured for MN 12B by default. Follow the steps in this guide to modify these scripts:
|
| 22 |
+
- `mergekit/merge.py`
|
| 23 |
+
- `mergekit/options.py`
|
| 24 |
+
- `mergekit/scripts/moe.py`
|
| 25 |
+
- `mergekit/scripts/tokensurgeon.py`
|
| 26 |
+
- `mergekit/tokenizer/build.py`
|
| 27 |
+
|
| 28 |
# [audit_della.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/audit_della.py)
|
| 29 |
- Audit the compatibility of donor models for `Della` merges before merging. See: [example chart Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.png), [example log Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.log), [example chart Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.png), [example log Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.log)
|
| 30 |
|
|
|
|
| 48 |
- Then assign the num_experts_per_tok in config.json (or the config.yaml)
|
| 49 |
|
| 50 |
# [tokensurgeon.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokensurgeon.py)
|
| 51 |
+
- Uses adaptive VRAM from Grim Jim's `measure.py` like `graph_v18` to prevent OOM. Use recommended [batch file](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fix_tokenizers.bat) here or modify sh. This avoids 'Potemkin village' fake patches like `gen_id_patcher` and `vocab_id_patcher`. For this to work properly, you must also run [shield_embeddings.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/shield_embeddings.py) and [shield_norms.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/shield_norms.py) on any merges made from models patched with tokensurgeon.
|
| 52 |
|
| 53 |
# [tokeninspector.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokeninspector.py)
|
| 54 |
- Audit your tokensurgeon results.
|
| 55 |
|
| 56 |
+
# [arcee_fusion_salience_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/arcee_fusion_salience_scanner.py)
|
| 57 |
+
- Scan the salience % of your arcee_fusion merges. The default `tukey_fence` value is 1.5 which results in 12.5% salience, but [this can be adjusted (see guide here)](modify_arcee_fusion_tukey_fence_parameter.md).
|
| 58 |
+
|
| 59 |
# [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
|
| 60 |
+
- Updated! This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) and [vocab_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/vocab_id_patcher.py), or the [chatml_to_mistral.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/chatml_to_mistral.py) to patch missing `generation_config.json` files for EOS token. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
|
| 61 |
|
| 62 |
# [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
|
| 63 |
- This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
|
|
|
|
| 74 |
# [textonly_ripper_v2.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper_v2.py)
|
| 75 |
- Converts a sharded, multimodal (text and vision) model into a text-only version. Readme at [textonly_ripper.md](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper.md)
|
| 76 |
|
| 77 |
+
# [json_reverter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/json_reverter.py)
|
| 78 |
+
- Revert changes to all JSON files done by `gen_id_patcher.py`, `vocab_id_patcher.py` or other scripts, within a specified root folder. It re-downloads the source files from the HF repo.
|
| 79 |
+
|
| 80 |
# [vocab_resizer.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/vocab_resizer.py)
|
| 81 |
- Converts models with larger vocab_sizes to a standard size (default 131072 Mistral 24B) for use with mergekit. Note that `tokenizer.model` must be manually copied into the `/fixed/` folder.
|
| 82 |
|
|
|
|
| 84 |
- This script will load a "fat" 18.9GB model (default Gemma 9B), force it to tie the weights (deduplicating the lm_head), and re-save it. This will drop the file size to ~17.2GB and make it compatible with the others.
|
| 85 |
|
| 86 |
# [model_index_json_generator.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/model_index_json_generator.py)
|
| 87 |
+
- Generates a missing `model.safetensors.index.json` file. Useful for cases where safetensors may have been sharded at the wrong size. [Single tensor variant here.](https://huggingface.co/spaces/Naphula/model_tools/blob/main/model_index_json_generator_SingleTensor.py)
|
| 88 |
|
| 89 |
# [folder_content_combiner_anyfiles.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/folder_content_combiner_anyfiles.py)
|
| 90 |
- Combines all files in the script's current directory into a single output file, sorted alphabetically.
|
| 91 |
|
| 92 |
+
# [folder+subfolder_content_combiner_anyfiles.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/folder+subfolder_content_combiner_anyfiles.py)
|
| 93 |
+
- Combines all files in the script's directory, including all files within subdirectories (excluding blacklisted formats) into a single output file, sorted alphabetically.
|
| 94 |
+
|
| 95 |
# [GGUF Repo Suite](https://huggingface.co/spaces/Naphula/gguf-repo-suite)
|
| 96 |
- Create and quantize Hugging Face models
|
| 97 |
|
modify_arcee_fusion_tukey_fence_parameter.md
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The default setting is 1.5, which results in 12.5% salience. If you want 25%, decrease it to 0.75. If you want 33%, use 0.4; a value of 0.0 results in 50% salience.
|
| 2 |
+
|
| 3 |
+
`arcee_fusion` hardcodes this to 1.5 by default but you can simply edit the python script before merging (easier), or update the code to assign it as a yaml parameter (more complex).
|
| 4 |
+
|
| 5 |
+
[Here is a scanner to audit Arcee_Fusion merge salience](https://huggingface.co/spaces/Naphula/model_tools/blob/main/arcee_fusion_salience_scanner.py)
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
In the context of merging models using the `mergekit` framework, the provided code showcases a **Dynamic Threshold Fusion** mechanism. Here's an overview of how modifying the **Tukey fence** parameter (from **1.5** to **0.75** or **3.0**) would affect the model merging process.
|
| 10 |
+
|
| 11 |
+
## Impact of Changing Tukey Fence Parameter
|
| 12 |
+
|
| 13 |
+
### Current Implementation
|
| 14 |
+
In your code, the dynamic threshold is set as:
|
| 15 |
+
|
| 16 |
+
```python
|
| 17 |
+
dynamic_threshold = median + 1.0 * iqr # Tukey Fence
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
This code uses **1.0** as the multiplier rather than the conventional **1.5**; the discussion below nonetheless assumes the standard **1.5** multiplier of a Tukey fence.
|
| 21 |
+
|
| 22 |
+
### Changes to k Values
|
| 23 |
+
|
| 24 |
+
#### 1. **If k = 0.75**
|
| 25 |
+
- **Increased Sensitivity**: The dynamic threshold would be lower, leading to more values being included in the fusion mask.
|
| 26 |
+
- **More Features Merged**: This results in a potentially more aggressive merging of parameters, effectively incorporating even slight differences between the model weights.
|
| 27 |
+
- **Use Cases**: Ideal for exploratory processes or applications where capturing subtle variations is critical.
|
| 28 |
+
|
| 29 |
+
#### Implementation
|
| 30 |
+
You would change the line in `calculate_dynamic_threshold`:
|
| 31 |
+
|
| 32 |
+
```python
|
| 33 |
+
dynamic_threshold = median + 0.75 * iqr # Tukey Fence with k = 0.75
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
#### 2. **If k = 3.0**
|
| 39 |
+
- **Decreased Sensitivity**: The dynamic threshold would be much higher, allowing only significant differences to be considered.
|
| 40 |
+
- **Fewer Features Merged**: This could lead to a conservative merging approach, where only clear outliers are merged, possibly overlooking important but smaller differences.
|
| 41 |
+
- **Use Cases**: Useful in high-stakes environments where only distinctly different parameters should contribute to the final model.
|
| 42 |
+
|
| 43 |
+
#### Implementation
|
| 44 |
+
You would change the same line to:
|
| 45 |
+
|
| 46 |
+
```python
|
| 47 |
+
dynamic_threshold = median + 3.0 * iqr # Tukey Fence with k = 3.0
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## Summary of Effects on Fusion Process
|
| 53 |
+
- **k = 0.75**: This results in a more inclusive fusion mask. More differences are merged, potentially capturing more nuanced information.
|
| 54 |
+
- **k = 3.0**: This leads to a restrictive fusion mask. Smaller differences would be ignored, possibly resulting in a model that retains more characteristics of the base model if the other model has less deviation.
|
| 55 |
+
|
| 56 |
+
### Final Thought
|
| 57 |
+
The choice of **k** can significantly dictate the behavior and performance of the merged model. Deciding whether to accommodate subtle differences or enforce strict conditions on merging will greatly depend on the specific use case and the importance of the characteristics being learned from either model. If you'd like to delve deeper into any specific aspect or need further assistance, just let me know!
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
The reason you are seeing a consistent **~12%** is not because of a hardcoded number, but because of a specific statistical formula used to identify "outliers" in the weight distribution.
|
| 62 |
+
|
| 63 |
+
In `arcee_fusion.py`, the density is controlled by the **Tukey's Fence** method. Here are the specific lines of code and the math behind them:
|
| 64 |
+
|
| 65 |
+
### 1. The Threshold Calculation
|
| 66 |
+
In `arcee_fusion.py`, inside the `DynamicThresholdFusion` class:
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
# Line 41-42: Calculate the Interquartile Range (IQR)
|
| 70 |
+
iqr = q3 - q1
|
| 71 |
+
|
| 72 |
+
# Line 45: The "Magic" Formula
|
| 73 |
+
dynamic_threshold = median + 1.5 * iqr
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### 2. The Technical Analysis of "12%"
|
| 77 |
+
The value **`1.5`** is the standard multiplier for **Tukey's Fences**, a statistical method used to find "outliers."
|
| 78 |
+
|
| 79 |
+
* **The Logic:** Arcee Fusion assumes that most weight changes in a fine-tune are "noise" or "jitter." It only wants to keep the "salient" changes—the ones that are statistically significant.
|
| 80 |
+
* **The Distribution:** Importance scores in neural networks usually follow a **Laplace** or **Heavy-Tailed Gaussian** distribution.
|
| 81 |
+
* **The Result:** In these types of distributions, the data points that fall above `Median + 1.5 * IQR` (the "upper fence") typically represent the top **10% to 15%** of the values.
|
| 82 |
+
|
| 83 |
+
Your result of **12.1%** means that in every layer, roughly 12% of the weights were "important enough" to cross that statistical fence. The other 88% were considered "noise" and were reverted to the Mistral Base weights.
|
| 84 |
+
|
| 85 |
+
### 3. Why is it so consistent across layers?
|
| 86 |
+
The scanner shows almost no variation (11.9% to 12.2%). This reveals a technical truth about your fine-tunes: **The "knowledge" is spread evenly.**
|
| 87 |
+
|
| 88 |
+
If you were merging a model that only learned a specific task (like coding), you would see a "spike" in the middle layers (where logic is processed) and 0% in the early layers. Because your layers are all ~12%, it means the fine-tuned models you are using modified the entire "brain" of the model uniformly.
|
| 89 |
+
|
| 90 |
+
### 4. How to change this (The "Patch")
|
| 91 |
+
If you want a higher density (e.g., you want 25% or 50% "New Info"), you have to modify the multiplier in `arcee_fusion.py`.
|
| 92 |
+
|
| 93 |
+
* **To get MORE new info:** Lower the `1.5` to `0.5`.
|
| 94 |
+
* **To get LESS new info:** Raise the `1.5` to `3.0` (this is the "Extreme Outlier" threshold).
|
shield_embeddings.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import gc
|
| 4 |
+
import shutil
|
| 5 |
+
from safetensors.torch import load_file, save_file
|
| 6 |
+
import argparse
|
| 7 |
+
|
| 8 |
+
def get_weight_map(model_path):
    """Return a tensor-name -> shard-filename mapping for a model directory.

    Prefers the sharded-model index file when present; otherwise falls back
    to assuming the embedding and lm_head tensors live in the first
    .safetensors file found. Returns an empty dict when neither exists.
    """
    index_path = os.path.join(model_path, "model.safetensors.index.json")
    if os.path.exists(index_path):
        with open(index_path, 'r') as handle:
            return json.load(handle)["weight_map"]
    shards = [name for name in os.listdir(model_path) if name.endswith(".safetensors")]
    if shards:
        # Single-file model: both tensors are assumed to live in that file.
        first = shards[0]
        return {"model.embed_tokens.weight": first, "lm_head.weight": first}
    return {}
|
| 17 |
+
|
| 18 |
+
def swap_tensor(base_path, merged_path, tensor_name):
    """Replace `tensor_name` in the merged model with the pristine copy from
    the base model, rewriting the affected merged shard in place.

    Uses a rename-then-save strategy so a failed save can be rolled back;
    this also works around Windows error 1224 on memory-mapped files.
    """
    base_map = get_weight_map(base_path)
    merged_map = get_weight_map(merged_path)

    if tensor_name not in base_map or tensor_name not in merged_map:
        # Give the operator feedback instead of silently doing nothing.
        print(f" [!] {tensor_name} not present in both models; skipping.")
        return

    base_shard = os.path.join(base_path, base_map[tensor_name])
    merged_shard = os.path.join(merged_path, merged_map[tensor_name])

    # Load pristine tensor, then free the rest of the base shard immediately.
    base_tensors = load_file(base_shard, device="cpu")
    pristine_tensor = base_tensors[tensor_name].clone()
    del base_tensors
    gc.collect()

    # Load merged shard and inject the pristine tensor.
    merged_tensors = load_file(merged_shard, device="cpu")
    merged_tensors[tensor_name] = pristine_tensor

    # ATOMIC RENAME STRATEGY (The only way to beat Windows 1224)
    backup_shard = merged_shard + ".old"
    os.rename(merged_shard, backup_shard)  # Move current file to side

    try:
        save_file(merged_tensors, merged_shard, metadata={"format": "pt"})
    except Exception:
        os.rename(backup_shard, merged_shard)  # Restore if fail
        raise  # bare raise keeps the original traceback intact

    # Cleanup
    del merged_tensors
    del pristine_tensor
    gc.collect()
    os.remove(backup_shard)  # Now delete the old mapped file

    print(f" ✅ Successfully shielded {tensor_name}!")
|
| 55 |
+
|
| 56 |
+
def main():
    """CLI entry point: shield the embedding and output-head tensors."""
    parser = argparse.ArgumentParser()
    for positional in ("base_model", "merged_model"):
        parser.add_argument(positional)
    args = parser.parse_args()

    # Shield both vocabulary-facing tensors.
    for tensor in ("model.embed_tokens.weight", "lm_head.weight"):
        swap_tensor(args.base_model, args.merged_model, tensor)


if __name__ == "__main__":
    main()
|
shield_norms.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# python shield_norms.py "B:\12B\models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop" "C:\Quanter\model_cache\EldritchLabs__Nocturne-Nereid-12B-v1"
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import gc
|
| 6 |
+
import re
|
| 7 |
+
from safetensors.torch import load_file, save_file
|
| 8 |
+
import argparse
|
| 9 |
+
|
| 10 |
+
def get_weight_map(model_path):
    """Return a tensor-name -> shard-filename mapping for a model directory.

    For sharded models the mapping comes straight from the index JSON.
    For single-file models, tensor names are enumerated per .safetensors
    file; `safe_open` reads only the file header, so no tensor data is
    loaded (the original `load_file` path materialized every tensor just
    to list their names).
    """
    index_path = os.path.join(model_path, "model.safetensors.index.json")
    if os.path.exists(index_path):
        with open(index_path, 'r') as f:
            return json.load(f)["weight_map"]
    # Fallback for single file models
    res = {}
    for f in os.listdir(model_path):
        if f.endswith(".safetensors"):
            try:
                # Fast path: header-only read via the already-installed
                # safetensors package.
                from safetensors import safe_open
                with safe_open(os.path.join(model_path, f), framework="pt", device="cpu") as handle:
                    for k in handle.keys():
                        res[k] = f
            except ImportError:
                # Slow but accurate fallback: materialize the tensors.
                tensors = load_file(os.path.join(model_path, f))
                for k in tensors.keys():
                    res[k] = f
    return res
|
| 24 |
+
|
| 25 |
+
def shield_norms(base_path, merged_path):
    """Revert every normalization tensor in `merged_path` to the pristine
    values from `base_path`, rewriting the affected merged shards in place.

    Targets all tensors whose names end in "norm.weight" (model.norm.weight,
    input_layernorm.weight, post_attention_layernorm.weight, ...). Shards
    are swapped via a rename-then-save strategy so a failed save rolls back.
    """
    print("\n[1] Mapping tensors...")
    base_map = get_weight_map(base_path)
    merged_map = get_weight_map(merged_path)

    # Identify all normalization tensors
    # Catching: model.norm.weight, input_layernorm.weight, post_attention_layernorm.weight
    norm_pattern = re.compile(r".*norm\.weight$")
    target_tensors = [t for t in merged_map.keys() if norm_pattern.match(t)]

    if not target_tensors:
        print(" [!] No normalization tensors found!")
        return

    print(f" -> Found {len(target_tensors)} normalization tensors to shield.")

    # Group by shard to minimize file opening
    shards_to_process = {}
    for t in target_tensors:
        shards_to_process.setdefault(merged_map[t], []).append(t)

    print(f"\n[2] Processing {len(shards_to_process)} shards...")

    for shard_name, tensors in shards_to_process.items():
        merged_shard_path = os.path.join(merged_path, shard_name)
        backup_shard_path = merged_shard_path + ".old"

        print(f" -> Shard: {shard_name}")

        # 1. Load the merged shard
        merged_tensors = load_file(merged_shard_path, device="cpu")

        # 2. Update each target tensor in this shard. Group by the base
        #    shard that holds each tensor so every base shard is loaded
        #    only once (the original reloaded it per tensor).
        base_groups = {}
        for t_name in tensors:
            if t_name in base_map:
                base_groups.setdefault(base_map[t_name], []).append(t_name)
            else:
                print(f" [!] Warning: {t_name} not found in base model. Skipping.")

        for base_shard_name, names in base_groups.items():
            base_data = load_file(os.path.join(base_path, base_shard_name), device="cpu")
            for t_name in names:
                print(f" Injecting pristine: {t_name}")
                merged_tensors[t_name] = base_data[t_name].clone()
            del base_data
            gc.collect()

        # 3. Atomic Rename Strategy for Windows
        if os.path.exists(backup_shard_path):
            os.remove(backup_shard_path)

        os.rename(merged_shard_path, backup_shard_path)

        try:
            save_file(merged_tensors, merged_shard_path, metadata={"format": "pt"})
            print(f" ✅ Shard saved successfully.")
        except Exception as e:
            print(f" ❌ Error saving shard: {e}")
            os.rename(backup_shard_path, merged_shard_path)
            raise  # bare raise keeps the original traceback intact

        # 4. Cleanup and release handles
        del merged_tensors
        gc.collect()
        try:
            os.remove(backup_shard_path)
        except Exception as e:
            print(f" [!] Note: Could not delete .old file immediately (OS lock). It will be orphaned: {e}")
|
| 94 |
+
|
| 95 |
+
def main():
    """Parse CLI arguments and revert all normalization layers to base."""
    parser = argparse.ArgumentParser(description="Revert all normalization layers to base model values.")
    parser.add_argument("base_model", help="Path to the pristine base model")
    parser.add_argument("merged_model", help="Path to the merged model with artifacts")
    args = parser.parse_args()

    rule = "=" * 60
    print(rule)
    print("🛡️ NORM SHIELD: RE-CALIBRATING OUTPUT MANIFOLD")
    print(rule)

    shield_norms(args.base_model, args.merged_model)

    print("\n" + rule)
    print("Done! All normalization layers have been reverted to Base.")
    print("This should eliminate the 'napad' / 'derrotó' vector drift.")
    print(rule)


if __name__ == "__main__":
    main()
|
vocab_id_patcher.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import argparse
|
| 4 |
+
from colorama import init, Fore, Style
|
| 5 |
+
|
| 6 |
+
init()
|
| 7 |
+
|
| 8 |
+
# Updated list to include Dolphin, Noromaid, and Hermes
# Folder-name substrings identifying which model directories get patched;
# matched against directory names with `in` while walking the tree (see main()).
TARGET_MODELS = [
    "dreamgen--WizardLM-2-7B",
    "KoboldAI--Mistral-7B-Erebus-v3",
    "teknium--Mistral-Trismegistus-7B",
    "dolphin-2.2.1-mistral-7b",
    "Noromaid-7B-0.4-DPO",
    "Hermes-Trismegistus-Mistral-7B"
]
|
| 17 |
+
|
| 18 |
+
def load_json(path):
    """Best-effort JSON reader: returns {} for missing or unparseable files."""
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                return json.load(handle)
        except Exception:
            # Deliberate best-effort: corrupt files behave like missing ones.
            pass
    return {}
|
| 26 |
+
|
| 27 |
+
def save_json(path, data):
    """Write `data` to `path` as pretty-printed (2-space indent) UTF-8 JSON."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(json.dumps(data, indent=2))
|
| 30 |
+
|
| 31 |
+
def patch_model(model_path):
    """Normalize a model folder's tokenizer metadata so the EOS token is the
    standard Mistral "</s>" with vocab ID 2.

    Touches three files when present: tokenizer_config.json,
    special_tokens_map.json and tokenizer.json. When tokenizer.json is
    missing entirely, a minimal stub is created so downstream tools
    (mergekit) can resolve the EOS vocab ID.
    """
    print(f"Processing: {Fore.YELLOW}{os.path.basename(model_path)}{Style.RESET_ALL}")
    # Decomposed into one helper per file for readability.
    _patch_tokenizer_config(model_path)
    _patch_special_tokens_map(model_path)
    _patch_tokenizer_json(model_path)


def _patch_tokenizer_config(model_path):
    """Force eos_token to </s> and clamp an unset/oversized model_max_length."""
    tok_config_path = os.path.join(model_path, "tokenizer_config.json")
    if not os.path.exists(tok_config_path):
        print(f" {Fore.RED}! tokenizer_config.json missing{Style.RESET_ALL}")
        return
    data = load_json(tok_config_path)

    # Force EOS to standard Mistral
    data["eos_token"] = "</s>"

    # Ensure model_max_length is set (some exports use a huge sentinel value)
    if "model_max_length" not in data or data["model_max_length"] > 100000:
        data["model_max_length"] = 32768

    save_json(tok_config_path, data)
    print(f" -> Updated tokenizer_config.json (eos_token: </s>)")


def _patch_special_tokens_map(model_path):
    """Force eos_token to </s> in special_tokens_map.json, if the file exists."""
    spec_map_path = os.path.join(model_path, "special_tokens_map.json")
    if os.path.exists(spec_map_path):
        data = load_json(spec_map_path)
        data["eos_token"] = "</s>"
        save_json(spec_map_path, data)
        print(f" -> Updated special_tokens_map.json")


def _patch_tokenizer_json(model_path):
    """Ensure tokenizer.json maps "</s>" to vocab ID 2 (the source of the
    "MISSING" vocab ID), creating a minimal stub when the file is absent."""
    tok_json_path = os.path.join(model_path, "tokenizer.json")
    if os.path.exists(tok_json_path):
        data = load_json(tok_json_path)

        # Check if it has the standard structure
        if "model" in data and "vocab" in data["model"]:
            vocab = data["model"]["vocab"]

            # Ensure </s> exists and is 2
            # NOTE(review): if another token already holds ID 2 this creates a
            # duplicate ID in the vocab — presumably acceptable for the target
            # models, but worth confirming before broader use.
            if "</s>" not in vocab:
                print(f" {Fore.YELLOW}! '</s>' was missing from vocab. Adding it as ID 2.{Style.RESET_ALL}")
                vocab["</s>"] = 2
            elif vocab["</s>"] != 2:
                print(f" {Fore.RED}! '</s>' is ID {vocab['</s>']}. Forcing to 2.{Style.RESET_ALL}")
                vocab["</s>"] = 2

            # Save back
            data["model"]["vocab"] = vocab
            save_json(tok_json_path, data)
            print(f" -> Patched tokenizer.json vocab")
        else:
            print(f" {Fore.RED}! tokenizer.json has unexpected structure.{Style.RESET_ALL}")
    else:
        print(f" {Fore.RED}! tokenizer.json not found. (This is why Vocab ID is MISSING){Style.RESET_ALL}")
        print(f" {Fore.CYAN}* Attempting to create minimal tokenizer.json wrapper...{Style.RESET_ALL}")
        # Create a dummy wrapper so Mergekit can find the ID
        dummy_data = {
            "version": "1.0",
            "model": {
                "type": "BPE",
                "vocab": {"<unk>": 0, "<s>": 1, "</s>": 2}
            }
        }
        save_json(tok_json_path, dummy_data)
        print(f" -> Created minimal tokenizer.json")
|
| 95 |
+
|
| 96 |
+
def main():
    """Scan a directory tree and patch every folder matching TARGET_MODELS."""
    parser = argparse.ArgumentParser(description="Patch missing Vocab IDs for specific models")
    parser.add_argument("base_dir", help="Directory containing the model folders (e.g. B:\\7B)")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- VOCAB ID PATCHER ---{Style.RESET_ALL}")

    # Walk through the directory to find the specific models
    found = 0
    for root, dirs, _files in os.walk(args.base_dir):
        matches = [d for d in dirs if any(target in d for target in TARGET_MODELS)]
        for name in matches:
            full_path = os.path.join(root, name)

            # Some models keep their real files inside a 'fixed' subfolder;
            # when that subfolder exists, it is the one that must be patched.
            fixed_sub = os.path.join(full_path, "fixed")
            if os.path.exists(fixed_sub):
                print(f"Found target '{name}' -> Detected 'fixed' subdir. Patching inside.")
                patch_model(fixed_sub)
            else:
                patch_model(full_path)

            found += 1

    if found == 0:
        print(f"{Fore.RED}No target models found in {args.base_dir}{Style.RESET_ALL}")
        print(f"Looking for: {TARGET_MODELS}")


if __name__ == "__main__":
    main()
|