Naphula commited on
Commit
f43fd2b
·
verified ·
1 Parent(s): a5709f7

Upload 10 files

Browse files
chatml_to_mistral.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ from colorama import init, Fore, Style
5
+
6
+ init()
7
+
8
def load_json(path):
    """Load a JSON file, returning {} if it is missing or unreadable.

    Best-effort by design: callers treat an absent/corrupt config as "no data"
    rather than a fatal error, so failures are swallowed. Only I/O and parse
    errors are caught (OSError, ValueError — json.JSONDecodeError and
    UnicodeDecodeError are ValueError subclasses); programming errors still
    propagate instead of being hidden by a blanket `except Exception`.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        return {}
16
+
17
def save_json(path, data):
    """Write *data* to *path* as pretty-printed JSON (2-space indent, UTF-8)."""
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2)
20
+
21
def convert_model(model_path):
    """Rewrite one model folder's metadata from ChatML (EOS 32000) to Mistral (EOS 2).

    Only JSON config files are touched; model weights are never modified.
    Folders already using EOS 2 are skipped silently; any other EOS id is
    reported and left alone.
    """
    folder = os.path.basename(model_path)
    print(f"Scanning: {folder}...")

    gen_path = os.path.join(model_path, "generation_config.json")
    tok_conf_path = os.path.join(model_path, "tokenizer_config.json")

    # 1. Check if it is actually a ChatML/32000 model
    gen_data = load_json(gen_path)
    eos = gen_data.get("eos_token_id")
    # Handle list format (e.g. [32000, 2])
    eos = eos[0] if isinstance(eos, list) else eos

    if str(eos) != "32000":
        # EOS 2 means it is already Mistral format — skip quietly.
        if str(eos) != "2":
            print(f" Skipping: EOS ID is {eos} (Not 32000)")
        return

    print(f"{Fore.YELLOW} -> Detected ChatML (EOS: 32000). Converting to Mistral format...{Style.RESET_ALL}")

    # 2. Patch Generation Config
    gen_data["eos_token_id"] = 2
    gen_data["pad_token_id"] = 2  # Usually good practice to align pad/eos for base mistral
    save_json(gen_path, gen_data)
    print(" Fixed generation_config.json (ID: 2)")

    # 3. Patch Tokenizer Config
    if os.path.exists(tok_conf_path):
        tok_data = load_json(tok_conf_path)
        # Change string to </s>
        tok_data["eos_token"] = "</s>"
        # Remove chat_template if it exists (prevents auto-detection issues later)
        tok_data.pop("chat_template", None)
        save_json(tok_conf_path, tok_data)
        print(" Fixed tokenizer_config.json (Str: </s>)")

    # 4. Patch Special Tokens Map
    spec_path = os.path.join(model_path, "special_tokens_map.json")
    if os.path.exists(spec_path):
        spec_data = load_json(spec_path)
        spec_data["eos_token"] = "</s>"
        save_json(spec_path, spec_data)
        print(" Fixed special_tokens_map.json")

    print(f"{Fore.GREEN} -> Successfully converted {folder}{Style.RESET_ALL}")
74
+
75
def main():
    """CLI entry point: walk a directory tree and convert every model folder found.

    A "model folder" is any directory containing a config.json; each one is
    handed to convert_model(), which decides whether a conversion is needed.
    """
    parser = argparse.ArgumentParser(description="Convert ChatML models (EOS 32000) to Mistral format (EOS 2)")
    parser.add_argument("base_dir", help="Directory containing the model folders (e.g. B:\\7B)")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- CHATML TO MISTRAL CONVERTER ---{Style.RESET_ALL}")
    print("This script changes metadata only. It allows ChatML models to be merged")
    print("using 'tokenizer: source: base' without errors.\n")

    if not os.path.exists(args.base_dir):
        print(f"Error: Directory {args.base_dir} does not exist.")
        return

    count = 0

    # os.walk only yields base_dir as a root, never as a member of `dirs`,
    # so check it explicitly in case the user pointed us at a model folder.
    if os.path.exists(os.path.join(args.base_dir, "config.json")):
        convert_model(args.base_dir)
        count += 1

    # Walk through all subdirectories; every nested directory appears exactly
    # once in some parent's `dirs` list, so nothing is visited twice.
    for root, dirs, files in os.walk(args.base_dir):
        for name in dirs:
            full_path = os.path.join(root, name)
            # Simple check if it looks like a model folder
            if os.path.exists(os.path.join(full_path, "config.json")):
                convert_model(full_path)
                count += 1

    print("-" * 60)
    # Report the tally (previously computed but never shown).
    print(f"Scan complete. {count} model folder(s) processed.")

if __name__ == "__main__":
    main()
enable_fix_mistral_regex_true.md ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```
2
+ architecture: MistralForCausalLM
3
+ base_model: B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop
4
+ models:
5
+ - model: B:/12B/models--allura-org--Tlacuilo-12B
6
+ - model: B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop
7
+ - model: B:/12B/models--EldritchLabs--Altair-Stock-12B-v1/MPOA_alt
8
+ - model: B:/12B/models--inflatebot--MN-12B-Mag-Mell-R1
9
+ - model: B:/12B/models--MuXodious--Irix-12B-Model_Stock-absolute-heresy
10
+ - model: B:/12B/models--aixonlab--Aether-12b
11
+ - model: B:/12B/models--SicariusSicariiStuff--Impish_Bloodmoon_12B
12
+ - model: B:/12B/models--SicariusSicariiStuff--Sweet_Dreams_12B
13
+ - model: B:/12B/models--Epiculous--Azure_Dusk-v0.2
14
+ # - model: B:/12B/models--SicariusSicariiStuff--Impish_Bloodmoon_12B.backup
15
+ merge_method: karcher
16
+ parameters:
17
+ max_iter: 30 # Maximum iterations for Karcher mean algorithm (default 10)
18
+ tol: 1e-9 # Convergence tolerance for Karcher mean algorithm (default 1e-5)
19
+ dtype: float32
20
+ out_dtype: bfloat16
21
+ tokenizer:
22
+ source: "union"
23
+ tokens:
24
+ # Force ChatML EOS tokens
25
+ "<|im_start|>":
26
+ source: "B:/12B/models--allura-org--Tlacuilo-12B"
27
+ force: true
28
+ "<|im_end|>":
29
+ source: "B:/12B/models--allura-org--Tlacuilo-12B"
30
+ force: true
31
+ # Keep Mistral tokens
32
+ "[INST]":
33
+ source: "B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop"
34
+ # source: "B:/12B/models--mistralai--Mistral-Nemo-Instruct-2407" # The tokenizer system requires all models referenced in token configurations to be present in the merge's model list to build proper embedding permutations.
35
+ "[/INST]":
36
+ source: "B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop"
37
+ # Force </s> as fallback EOS
38
+ "</s>":
39
+ source: "B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop"
40
+ force: true
41
+
42
+ chat_template: "chatml"
43
+ name: 🧜‍ Nocturne-Nereid-12B-v1
44
+ ```
45
+
46
+ ```
47
+ C:\mergekit-main>timeout /t 3 /nobreak && mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda
48
+
49
+ Waiting for 0 seconds, press CTRL+C to quit ...
50
+ `torch_dtype` is deprecated! Use `dtype` instead!
51
+ Warmup loader cache: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1499.99it/s]
52
+ Planning operations
53
+ Executing graph: 0%| | 0/4361 [00:00<?, ?it/s]The tokenizer you are loading from 'B:/12B/models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
54
+ Loading tokenizers
55
+ The tokenizer you are loading from 'B:/12B/models--allura-org--Tlacuilo-12B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
56
+ The tokenizer you are loading from 'B:/12B/models--aixonlab--Aether-12b' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
57
+ The tokenizer you are loading from 'B:/12B/models--Epiculous--Azure_Dusk-v0.2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
58
+ The tokenizer you are loading from 'B:/12B/models--inflatebot--MN-12B-Mag-Mell-R1' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
59
+ The tokenizer you are loading from 'B:/12B/models--EldritchLabs--Altair-Stock-12B-v1/MPOA_alt' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
60
+ The tokenizer you are loading from 'B:/12B/models--SicariusSicariiStuff--Impish_Bloodmoon_12B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
61
+ The tokenizer you are loading from 'B:/12B/models--SicariusSicariiStuff--Sweet_Dreams_12B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
62
+ Building output tokenizer
63
+ Token '<|im_start|>' added with multiple different settings, using first
64
+ Building permutations
65
+ Building tokenizer permutations: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 9.54it/s]
66
+ Executing graph: 0%| | 3/4361 [00:07<2:44:37, 2.27s/it]
67
+ ```
68
+
69
+ ---
70
+
71
+ Based on the codebase, mergekit doesn't currently support passing `fix_mistral_regex=True` to tokenizer loading. The warnings you're seeing come from the transformers library, but mergekit only passes `trust_remote_code` to tokenizer loading calls, not the `fix_mistral_regex` parameter.
72
+
73
+ ## Current Tokenizer Loading in MergeKit
74
+
75
+ The main tokenizer loading happens in `mergekit/tokenizer/build.py`:
76
+
77
+ ```python
78
+ # In get_stripped_tokenizer()
79
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
80
+ path.path,
81
+ revision=path.revision,
82
+ trust_remote_code=trust_remote_code,
83
+ use_fast=True,
84
+ )
85
+ ```
86
+
87
+ And in `build_tokenizer()`:
88
+ ```python
89
+ model_tok = transformers.AutoTokenizer.from_pretrained(
90
+ model.model.path,
91
+ revision=model.model.revision,
92
+ trust_remote_code=trust_remote_code,
93
+ )
94
+ ```
95
+
96
+ ## How to Add Support
97
+
98
+ To add `fix_mistral_regex=True` support, you would need to:
99
+
100
+ 1. **Add the parameter to MergeOptions** - This would require modifying the options structure to include a new field for `fix_mistral_regex`
101
+
102
+ 2. **Update all tokenizer loading calls** - Modify the `AutoTokenizer.from_pretrained()` calls throughout the codebase to pass this parameter:
103
+ - `mergekit/tokenizer/build.py` (lines 49-54, 203-207, 155-157)
104
+ - `mergekit/merge.py` (line 271-275)
105
+ - `mergekit/scripts/tokensurgeon.py` (line 148-152)
106
+
107
+ 3. **Add CLI flag** - Add a `--fix-mistral-regex` flag to the CLI that sets this option
108
+
109
+ ## Temporary Workaround
110
+
111
+ Until mergekit adds native support, you could:
112
+
113
+ 1. **Modify the source locally** - Add `fix_mistral_regex=True` directly to the `AutoTokenizer.from_pretrained()` calls in your local mergekit installation
114
+
115
+ 2. **Use a different tokenizer source** - If one of your models has a correct tokenizer, you could specify it as the tokenizer source in your YAML config
116
+
117
+ The warnings won't prevent your merge from completing, but as noted, they may lead to incorrect tokenization behavior with the affected models.
118
+
119
+ ## Notes
120
+
121
+ The `trust_remote_code` parameter you're already using via `--trust-remote-code` follows the same pattern through the codebase - it's passed from CLI options to the tokenizer loading functions. Adding `fix_mistral_regex` support would require implementing the same pattern.
122
+
123
+ ---
124
+
125
+ > Explain how to enable this for merges on my local copy, step by step.
126
+
127
+ ---
128
+
129
+ To enable `fix_mistral_regex=True` for your local mergekit installation, you'll need to modify several files to add this parameter throughout the tokenizer loading pipeline.
130
+
131
+ ## Step-by-Step Local Modifications
132
+
133
+ ### 1. Add the option to MergeOptions
134
+
135
+ Edit `mergekit/options.py` and add the new field:
136
+
137
+ ```python
138
+ # In MergeOptions class (around line 41)
139
+ fix_mistral_regex: bool = False
140
+ ```
141
+
142
+ Also add it to `OPTION_HELP` and `OPTION_CATEGORIES` dictionaries:
143
+
144
+ ```python
145
+ # Around line 114
146
+ OPTION_HELP = {
147
+ # ... existing options ...
148
+ "fix_mistral_regex": "Fix incorrect regex pattern in Mistral tokenizers",
149
+ }
150
+
151
+ # Around line 140
152
+ OPTION_CATEGORIES = {
153
+ # ... existing categories ...
154
+ "fix_mistral_regex": "Dangerous Options",
155
+ }
156
+ ```
157
+
158
+ ### 2. Update tokenizer loading in mergekit/tokenizer/build.py
159
+
160
+ Modify all `AutoTokenizer.from_pretrained()` calls to include the new parameter:
161
+
162
+ ```python
163
+ # Line 49-54 in get_stripped_tokenizer()
164
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
165
+ path.path,
166
+ revision=path.revision,
167
+ trust_remote_code=trust_remote_code,
168
+ use_fast=True,
169
+ fix_mistral_regex=fix_mistral_regex, # Add this line
170
+ )
171
+
172
+ # Line 203-207 in build_tokenizer()
173
+ model_tok = transformers.AutoTokenizer.from_pretrained(
174
+ model.model.path,
175
+ revision=model.model.revision,
176
+ trust_remote_code=trust_remote_code,
177
+ fix_mistral_regex=fix_mistral_regex, # Add this line
178
+ )
179
+
180
+ # Line 155-157 in build_union_tokenizer()
181
+ res = transformers.AutoTokenizer.from_pretrained(
182
+ p, use_fast=True, trust_remote_code=trust_remote_code,
183
+ fix_mistral_regex=fix_mistral_regex, # Add this line
184
+ )
185
+ ```
186
+
187
+ You'll also need to update the function signatures to accept this parameter and pass it through the call chain.
188
+
189
+ ### 3. Update mergekit/merge.py
190
+
191
+ Modify the tokenizer loading in `_copy_tokenizer()`:
192
+
193
+ ```python
194
+ # Line 271-275
195
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
196
+ donor_model.model.path,
197
+ revision=donor_model.model.revision,
198
+ trust_remote_code=options.trust_remote_code,
199
+ fix_mistral_regex=options.fix_mistral_regex, # Add this line
200
+ )
201
+ ```
202
+
203
+ ### 4. Update mergekit/scripts/tokensurgeon.py
204
+
205
+ Modify the tokenizer loading:
206
+
207
+ ```python
208
+ # Line 148-152 in get_stuff()
209
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
210
+ model.model.path,
211
+ revision=model.model.revision,
212
+ trust_remote_code=options.trust_remote_code,
213
+ fix_mistral_regex=options.fix_mistral_regex, # Add this line
214
+ )
215
+ ```
216
+
217
+ ### 5. Update mergekit/scripts/moe.py
218
+
219
+ Modify the tokenizer loading:
220
+
221
+ ```python
222
+ # Line 36-38
223
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
224
+ base_model.model.path,
225
+ revision=base_model.model.revision,
226
+ fix_mistral_regex=merge_options.fix_mistral_regex, # Add this line
227
+ )
228
+ ```
229
+
230
+ ### 6. Update function signatures
231
+
232
+ You'll need to update the function signatures to pass the `fix_mistral_regex` parameter through the call chain. The key functions to update are:
233
+ - `get_stripped_tokenizer()` in `mergekit/tokenizer/build.py`
234
+ - `build_union_tokenizer()` in `mergekit/tokenizer/build.py`
235
+ - `build_tokenizer()` in `mergekit/tokenizer/build.py`
236
+ - `BuildTokenizer.execute()` in `mergekit/tokenizer/build.py`
237
+
238
+ ### 7. Use the new flag
239
+
240
+ After making these changes, you can use the new flag in your command:
241
+
242
+ ```bash
243
+ mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda --fix-mistral-regex
244
+ ```
245
+
246
+ ## Notes
247
+
248
+ The `trust_remote_code` parameter follows the same pattern through the codebase - it's passed from CLI options to the tokenizer loading functions [1](#0-0) [2](#0-1) . Adding `fix_mistral_regex` support requires implementing the same pattern throughout all tokenizer loading locations [3](#0-2) [4](#0-3) .
249
+
250
+ The warnings you're seeing come from the transformers library and indicate that the tokenizer regex pattern may lead to incorrect tokenization [5](#0-4) . While the merge will complete without this fix, enabling it will ensure proper tokenization behavior.
251
+
252
+ ---
253
+
254
+ > I did all of that and now it says this
255
+
256
+ ```
257
+ C:\mergekit-main>timeout /t 3 /nobreak && mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda --fix-mistral-regex
258
+
259
+ Waiting for 0 seconds, press CTRL+C to quit ...
260
+ `torch_dtype` is deprecated! Use `dtype` instead!
261
+ Warmup loader cache: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1499.99it/s]
262
+ Planning operations
263
+ Executing graph: 0%| | 1/4361 [00:00<14:53, 4.88it/s]
264
+ Traceback (most recent call last):
265
+ File "<frozen runpy>", line 198, in _run_module_as_main
266
+ File "<frozen runpy>", line 88, in _run_code
267
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Scripts\mergekit-yaml.exe\__main__.py", line 5, in <module>
268
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1485, in __call__
269
+ return self.main(*args, **kwargs)
270
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
271
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1406, in main
272
+ rv = self.invoke(ctx)
273
+ ^^^^^^^^^^^^^^^^
274
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1269, in invoke
275
+ return ctx.invoke(self.callback, **ctx.params)
276
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
277
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 824, in invoke
278
+ return callback(*args, **kwargs)
279
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
280
+ File "C:\mergekit-main\mergekit\options.py", line 169, in wrapper
281
+ return f(*args, **kwargs)
282
+ ^^^^^^^^^^^^^^^^^^
283
+ File "C:\mergekit-main\mergekit\scripts\run_yaml.py", line 30, in main
284
+ run_merge(
285
+ File "C:\mergekit-main\mergekit\merge.py", line 85, in run_merge
286
+ for _task, value in exec.run(quiet=options.quiet):
287
+ File "C:\mergekit-main\mergekit\graph.py", line 747, in run
288
+ for handle, value in self._run(quiet=quiet, desc=desc):
289
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
290
+ File "C:\mergekit-main\mergekit\graph.py", line 712, in _run
291
+ res = task.execute(**cpu_arguments)
292
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
293
+ File "C:\mergekit-main\mergekit\tokenizer\build.py", line 294, in execute
294
+ return build_tokenizer(
295
+ ^^^^^^^^^^^^^^^^
296
+ File "C:\mergekit-main\mergekit\tokenizer\build.py", line 193, in build_tokenizer
297
+ tokenizer_base = get_stripped_tokenizer(
298
+ ^^^^^^^^^^^^^^^^^^^^^^^
299
+ File "C:\mergekit-main\mergekit\tokenizer\build.py", line 54, in get_stripped_tokenizer
300
+ fix_mistral_regex=fix_mistral_regex, # Add this line
301
+ ^^^^^^^^^^^^^^^^^
302
+ NameError: name 'fix_mistral_regex' is not defined
303
+
304
+ C:\mergekit-main>
305
+ ```
306
+
307
+ ---
308
+
309
+ The error occurs because you added `fix_mistral_regex` to the `AutoTokenizer.from_pretrained()` call in `get_stripped_tokenizer()` but didn't update the function signature to accept this parameter. The parameter needs to be passed through the entire call chain.
310
+
311
+ ## Fix the function signatures
312
+
313
+ ### 1. Update `get_stripped_tokenizer()` in mergekit/tokenizer/build.py
314
+
315
+ ```python
316
+ def get_stripped_tokenizer(
317
+ path: ModelPath, trust_remote_code: bool = False, fix_mistral_regex: bool = False
318
+ ) -> transformers.PreTrainedTokenizerFast:
319
+ ```
320
+
321
+ ### 2. Update `build_tokenizer()` in mergekit/tokenizer/build.py
322
+
323
+ ```python
324
+ def build_tokenizer(
325
+ base_model: Optional[ModelReference],
326
+ referenced_models: List[ModelReference],
327
+ tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference],
328
+ trust_remote_code: bool,
329
+ fix_mistral_regex: bool = False,
330
+ add_tokens: Optional[List[str]] = None,
331
+ ) -> TokenizerInfo:
332
+ ```
333
+
334
+ ### 3. Update `BuildTokenizer.execute()` in mergekit/tokenizer/build.py
335
+
336
+ ```python
337
+ class BuildTokenizer(Task[TokenizerInfo]):
338
+ base_model: Optional[ModelReference]
339
+ referenced_models: Tuple[ModelReference, ...]
340
+ tokenizer_source: Union[Literal["union"], Literal["base"], ModelReference]
341
+ add_tokens: Optional[Tuple[str, ...]]
342
+ trust_remote_code: bool = False
343
+ fix_mistral_regex: bool = False
344
+
345
+ def execute(self, **_kwargs) -> TokenizerInfo:
346
+ return build_tokenizer(
347
+ base_model=self.base_model,
348
+ referenced_models=self.referenced_models,
349
+ tokenizer_source=self.tokenizer_source,
350
+ trust_remote_code=self.trust_remote_code,
351
+ fix_mistral_regex=self.fix_mistral_regex,
352
+ add_tokens=self.add_tokens,
353
+ )
354
+ ```
355
+
356
+ ### 4. Update the call in `build_tokenizer()`
357
+
358
+ ```python
359
+ tokenizer_base = get_stripped_tokenizer(
360
+ base_model.model, trust_remote_code=trust_remote_code, fix_mistral_regex=fix_mistral_regex
361
+ )
362
+ ```
363
+
364
+ ### 5. Update the call in `MergePlanner.__init__()` in mergekit/plan.py
365
+
366
+ ```python
367
+ self._tokenizer_task = BuildTokenizer(
368
+ base_model=config.base_model,
369
+ referenced_models=tuple(config.referenced_models()),
370
+ tokenizer_source=tokenizer_source,
371
+ trust_remote_code=options.trust_remote_code,
372
+ fix_mistral_regex=options.fix_mistral_regex,
373
+ add_tokens=tuple(token_cfg.keys()),
374
+ )
375
+ ```
376
+
377
+ ### 6. Update `build_union_tokenizer()` in mergekit/tokenizer/build.py
378
+
379
+ ```python
380
+ def build_union_tokenizer(
381
+ base_tok: transformers.PreTrainedTokenizerBase,
382
+ tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase],
383
+ trust_remote_code: bool = False,
384
+ fix_mistral_regex: bool = False,
385
+ ) -> transformers.PreTrainedTokenizerBase:
386
+ ```
387
+
388
+ And update the call within it:
389
+ ```python
390
+ res = transformers.AutoTokenizer.from_pretrained(
391
+ p, use_fast=True, trust_remote_code=trust_remote_code, fix_mistral_regex=fix_mistral_regex
392
+ )
393
+ ```
394
+
395
+ Also update the call to `build_union_tokenizer()` in `build_tokenizer()`:
396
+ ```python
397
+ tokenizer_out = build_union_tokenizer(
398
+ tokenizer_base, tokenizers, trust_remote_code=trust_remote_code, fix_mistral_regex=fix_mistral_regex
399
+ )
400
+ ```
401
+
402
+ ## Notes
403
+
404
+ The error occurs because `get_stripped_tokenizer()` doesn't have `fix_mistral_regex` in its parameter list, so when you try to use it in the function body, Python raises a `NameError` [1](#0-0) . The parameter must be added to each function in the call chain from `BuildTokenizer.execute()` down to the actual `AutoTokenizer.from_pretrained()` calls [2](#0-1) .
405
+
406
+ ---
407
+
408
+ > I now get this error
409
+
410
+ ```
411
+ C:\mergekit-main>timeout /t 3 /nobreak && mergekit-yaml C:\mergekit-main\altair7.yaml C:\mergekit-main\merged_model_altair7 --copy-tokenizer --allow-crimes --out-shard-size 5B --trust-remote-code --lazy-unpickle --random-seed 420 --cuda --fix-mistral-regex
412
+
413
+ Waiting for 0 seconds, press CTRL+C to quit ...
414
+ `torch_dtype` is deprecated! Use `dtype` instead!
415
+ Warmup loader cache: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1809.80it/s]
416
+ Planning operations
417
+ Executing graph: 0%| | 0/4361 [00:00<?, ?it/s]Loading tokenizers
418
+ Building output tokenizer
419
+ Token '<|im_start|>' added with multiple different settings, using first
420
+ Executing graph: 0%| | 1/4361 [00:04<5:27:21, 4.51s/it]
421
+ Traceback (most recent call last):
422
+ File "<frozen runpy>", line 198, in _run_module_as_main
423
+ File "<frozen runpy>", line 88, in _run_code
424
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Scripts\mergekit-yaml.exe\__main__.py", line 5, in <module>
425
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1485, in __call__
426
+ return self.main(*args, **kwargs)
427
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
428
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1406, in main
429
+ rv = self.invoke(ctx)
430
+ ^^^^^^^^^^^^^^^^
431
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1269, in invoke
432
+ return ctx.invoke(self.callback, **ctx.params)
433
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
434
+ File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 824, in invoke
435
+ return callback(*args, **kwargs)
436
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
437
+ File "C:\mergekit-main\mergekit\options.py", line 169, in wrapper
438
+ return f(*args, **kwargs)
439
+ ^^^^^^^^^^^^^^^^^^
440
+ File "C:\mergekit-main\mergekit\scripts\run_yaml.py", line 30, in main
441
+ run_merge(
442
+ File "C:\mergekit-main\mergekit\merge.py", line 85, in run_merge
443
+ for _task, value in exec.run(quiet=options.quiet):
444
+ File "C:\mergekit-main\mergekit\graph.py", line 747, in run
445
+ for handle, value in self._run(quiet=quiet, desc=desc):
446
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
447
+ File "C:\mergekit-main\mergekit\graph.py", line 712, in _run
448
+ res = task.execute(**cpu_arguments)
449
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
450
+ File "C:\mergekit-main\mergekit\tokenizer\build.py", line 296, in execute
451
+ return build_tokenizer(
452
+ ^^^^^^^^^^^^^^^^
453
+ File "C:\mergekit-main\mergekit\tokenizer\build.py", line 232, in build_tokenizer
454
+ tokenizer_out = build_union_tokenizer(
455
+ ^^^^^^^^^^^^^^^^^^^^^^
456
+ File "C:\mergekit-main\mergekit\tokenizer\build.py", line 158, in build_union_tokenizer
457
+ fix_mistral_regex=fix_mistral_regex, # Add this line
458
+ ^^^^^^^^^^^^^^^^^
459
+ NameError: name 'fix_mistral_regex' is not defined
460
+
461
+ C:\mergekit-main>
462
+ ```
463
+
464
+ ---
465
+
466
+ The error occurs because you haven't updated the function signature for `build_union_tokenizer()` to accept the `fix_mistral_regex` parameter. Here's the fix:
467
+
468
+ ## Fix for `build_union_tokenizer()`
469
+
470
+ Update the function signature in `mergekit/tokenizer/build.py`:
471
+
472
+ ```python
473
+ def build_union_tokenizer(
474
+ base_tok: transformers.PreTrainedTokenizerBase,
475
+ tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase],
476
+ trust_remote_code: bool = False,
477
+ fix_mistral_regex: bool = False, # Add this parameter
478
+ ) -> transformers.PreTrainedTokenizerBase:
479
+ ```
480
+
481
+ And update the `AutoTokenizer.from_pretrained()` call within the same function (around line 155-157):
482
+
483
+ ```python
484
+ res = transformers.AutoTokenizer.from_pretrained(
485
+ p,
486
+ use_fast=True,
487
+ trust_remote_code=trust_remote_code,
488
+ fix_mistral_regex=fix_mistral_regex # This line already exists
489
+ )
490
+ ```
491
+
492
+ The call to `build_union_tokenizer()` in `build_tokenizer()` (line 232) should already be passing the parameter correctly based on your previous updates.
493
+
494
+ ## Complete Function Update
495
+
496
+ Here's the complete updated function for `build_union_tokenizer()`:
497
+
498
+ ```python
499
+ def build_union_tokenizer(
500
+ base_tok: transformers.PreTrainedTokenizerBase,
501
+ tokenizers: Dict[ModelReference, transformers.PreTrainedTokenizerBase],
502
+ trust_remote_code: bool = False,
503
+ fix_mistral_regex: bool = False,
504
+ ) -> transformers.PreTrainedTokenizerBase:
505
+ # ... rest of function remains the same ...
506
+ ```
507
+
508
+ After this change, your merge should proceed without the `NameError` [1](#0-0) .
509
+
510
+ ## Notes
511
+
512
+ The error pattern shows that `fix_mistral_regex` needs to be passed through the entire call chain from the CLI options down to each `AutoTokenizer.from_pretrained()` call [2](#0-1) . You've successfully updated most of the chain, but this function signature was the missing piece.
513
+
514
+ ---
515
+
516
+ **Analyzing the Fix**
517
+
518
+ I'm currently focused on dissecting the `fix_mistral_regex=True` parameter within `mergekit`. My primary goal is to determine the precise function of this fix in the context of merging Mistral models. I need to understand its impact and how it addresses a potential issue or enhances the merging process.
519
+
520
+
521
+ **Investigating the Root**
522
+
523
+ I'm now diving into the 'why' behind the regex fix, focusing on the author's explanation. The provided text details issues with Mistral tokenizers and chat templates, particularly around premature termination, endless repetition, and hallucinated end-of-sequence tokens. The fix seems to address these problems in a specific Karcher merge, notably resolving issues with chat templates and jailbreaks, while acknowledging some template-specific behavior.
524
+
525
+
526
+ **Clarifying the Bugfix**
527
+
528
+ I'm now zeroing in on the technical implementation. The core of `fix_mistral_regex=True` is overriding a faulty regex pattern in the tokenizer config. This pattern is part of how the tokenizer handles pre-tokenization, and the original pattern was causing issues with specific Mistral models, including those Nemo 12B-based models. This fix is crucial for compatibility because of this bug in the `transformers` library, which causes the tokenizer to misbehave in edge cases.
529
+
530
+
531
+ **Connecting the Incompatibilities**
532
+
533
+ I'm now understanding how different tokenizer configs cause issues during the merge process. Building a "union" tokenizer with conflicting regex patterns leads to misaligned token mappings, especially for special tokens. The `fix_mistral_regex=True` parameter directly addresses this problem. I am able to identify the core issues: early termination or endless repetition, and hallucination of `<|im_end|>`. These are a direct result of the broken regex and tokenizer incompatibility across the various models being merged.
534
+
535
+
536
+ **Mapping the Tokenization**
537
+
538
+ I'm now focusing on how the fix's implementation. It seems by overriding the pre-tokenization regex, mergekit accurately maps overlapping tokens and inserts the ChatML tokens, like `<|im_start|>` and `<|im_end|>`, correctly. This prevents the underlying regex from disrupting token boundaries, ensuring proper handling of different prompt formats during the merge. I'm now documenting how this fixes ChatML/Special Tokens, generation loops, and resolves incompatibilities.
539
+
540
+ Based on the logs, the Hugging Face issue linked in the warnings, and the author's notes, here is an explanation of what `fix_mistral_regex=True` does and why it solves the merge issues.
541
+
542
+ ### What the fix actually does (Technically)
543
+ Mistral models (specifically those using their newer "Tekken" tokenizer, like Mistral Nemo 12B and Mistral Small 3) rely on a specific Regular Expression (regex) pattern to pre-tokenize text—meaning how it splits sentences into chunks before assigning token IDs.
544
+
545
+ Recently, a bug was discovered where the regex pattern saved inside the `tokenizer_config.json` of these models was slightly incorrect. This caused the tokenizer to misinterpret certain character boundaries, spaces, and special tokens.
546
+
547
+ When you pass `fix_mistral_regex=True` to `AutoTokenizer.from_pretrained()` in the `transformers` library, it tells the library to **ignore the broken regex pattern in the model's files and dynamically replace it with the corrected pattern**.
548
+
549
+ In the context of `mergekit`, if this fix isn't applied, `mergekit` tries to build a "union" tokenizer out of 9 different models using a broken set of rules. This results in a corrupted vocabulary mapping where special tokens get misaligned.
550
+
551
+ ### Why it fixes the issues (According to the Author)
552
+
553
+ According to the author's notes, applying this fix during the merge process resolved several severe, model-breaking behaviors:
554
+
555
+ **1. It stops the model from hallucinating `<|im_end|>` as plain text**
556
+ Without the fix, the broken regex was likely causing the tokenizer to fail at recognizing `<|im_end|>` as a single, distinct "Special Token." Instead, the model was treating it as regular text. Because the text generation interface didn't see the actual *End of Sequence ID*, it just printed the literal characters `<|im_end|>` to the screen and kept generating. Fixing the regex ensures the union tokenizer correctly isolates and maps these ChatML special tokens.
557
+
558
+ **2. It fixes endless repetition and early termination**
559
+ Because the token mappings were corrupted by the bad regex, the merged model's language head was firing the wrong token IDs. It would either output an EOS (End of Sequence) token immediately (terminating early) or completely lose the ability to output an EOS token (repeating endlessly). The fix restores the correct token IDs for stopping generation.
560
+
561
+ **3. It resolves "Tokenizer Incompatibility" across different fine-tunes**
562
+ The author merged 9 different models. Some of these models (like `Mistral-Nemo-Instruct-2407`) use the default Mistral `[INST]` format, while others (like `Tlacuilo-12B`) were fine-tuned to use `ChatML` (`<|im_start|>`).
563
+ Normally, merging models with different prompt formats and added tokens causes severe brain damage to the model. By fixing the underlying regex, `mergekit` was able to accurately map the overlapping base tokens and cleanly inject the forced ChatML tokens (as defined in the YAML config), creating a stable hybrid tokenizer.
564
+
565
+ ### The Author's Final Result
566
+ By applying this fix, the author successfully created a 9-model Karcher merge that:
567
+ * **Works perfectly with ChatML:** It follows instructions, doesn't loop, and stops when it is supposed to.
568
+ * **Accepts Jailbreaks:** The control vectors from the "heretic/noslop" models successfully carried over.
569
+ * **Has one remaining quirk:** The author notes that if you try to use the *Mistral Tekken* prompt format instead of *ChatML*, the model terminates early and refuses prompts. This makes sense: the YAML config explicitly forced the ChatML tokens to overwrite the Mistral ones, meaning the model's brain is now permanently wired to expect ChatML.
folder+subfolder_content_combiner_anyfiles.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
# --- Configuration ---
# Name of the combined output file, written next to this script.
OUTPUT_FILENAME = "!combo.txt"
# Folders to skip to prevent massive/useless data bloat
BLACKLIST_DIRS = {".git", "__pycache__", ".ipynb_checkpoints", "node_modules", "venv", ".vscode"}
# --- End Configuration ---
9
+
10
def combine_all_recursive():
    """
    Recursively combine every file under the script's directory (including
    subdirectories) into one output file, each wrapped in <relative/path> tags.

    Skips folders in BLACKLIST_DIRS, this script itself, and any previous
    output file. Entries are sorted alphabetically by relative path.
    """
    try:
        # Resolve our own location; sys.frozen is set for PyInstaller bundles.
        if getattr(sys, 'frozen', False):
            self_path = os.path.abspath(sys.executable)
        else:
            self_path = os.path.abspath(__file__)

        base_dir = os.path.dirname(self_path)
        combo_path = os.path.join(base_dir, OUTPUT_FILENAME)

        print(f"Scanning recursively in: {base_dir}")
        print(f"Output file: {combo_path}")
        print("-" * 40)

        entries = []
        for root, dirs, files in os.walk(base_dir):
            # Prune blacklisted folders in place so os.walk never descends into them.
            dirs[:] = [d for d in dirs if d not in BLACKLIST_DIRS]

            for name in files:
                candidate = os.path.join(root, name)

                # Never include this script or a previously generated output file.
                if (os.path.normpath(candidate) == os.path.normpath(self_path) or
                        name == OUTPUT_FILENAME):
                    continue

                # Keep the relative path for both sorting and tagging.
                entries.append((os.path.relpath(candidate, base_dir), candidate))

        # Alphabetical by relative path keeps the combo file organized logically.
        entries.sort(key=lambda pair: pair[0])

        if not entries:
            print("No files found to combine.")
            return

        print(f"Found {len(entries)} files. Starting combination...")

        with open(combo_path, 'w', encoding='utf-8', errors='replace') as outfile:
            outfile.write("```\n")  # Opening wrapper

            for rel, full in entries:
                print(f"Adding: {rel}")
                try:
                    outfile.write(f"<{rel}>\n")
                    with open(full, 'r', encoding='utf-8', errors='replace') as infile:
                        outfile.write(infile.read())
                    outfile.write(f"\n</{rel}>\n\n")
                except Exception as e:
                    # Record the failure inline and still close the tag.
                    outfile.write(f"ERROR READING FILE {rel}: {e}\n")
                    outfile.write(f"</{rel}>\n\n")

            outfile.write("```\n")  # Closing wrapper

        print("-" * 40)
        print(f"SUCCESS! Combined {len(entries)} files into '{OUTPUT_FILENAME}'")

    except Exception as e:
        print(f"CRITICAL ERROR: {e}")
        import traceback
        traceback.print_exc()
80
+
81
+ if __name__ == "__main__":
82
+ combine_all_recursive()
83
+ try:
84
+ input("\nProcess complete. Press Enter to exit...")
85
+ except EOFError:
86
+ pass
model_tools.md CHANGED
@@ -8,7 +8,7 @@ pinned: false
8
  ---
9
 
10
  # Model Tools by Naphula
11
- Tools to enhance LLM quantizations and merging
12
 
13
  # [graph_v18.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/graph_v18.py)
14
  - Merge models in minutes instead of hours on low VRAM. For a 3060/3060 Ti user: This script enables functionality that is otherwise impossible (merging 70B models or large 7B merges with `--cuda`) without OOM. [More details here](https://huggingface.co/spaces/Naphula/model_tools/blob/main/mergekit_low-VRAM-graph_patch.md)
@@ -17,6 +17,14 @@ Tools to enhance LLM quantizations and merging
17
  # config.py
18
  - Simply replace line 13 | BEFORE `ScalarOrGradient: TypeAlias = Union[float, List[float]]` → AFTER `ScalarOrGradient: TypeAlias = Union[float, List[float], str, bool]` | to allow for custom filepath strings within parameter settings.
19
 
 
 
 
 
 
 
 
 
20
  # [audit_della.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/audit_della.py)
21
  - Audit the compatibility of donor models for `Della` merges before merging. See: [example chart Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.png), [example log Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.log), [example chart Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.png), [example log Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.log)
22
 
@@ -40,13 +48,16 @@ Tools to enhance LLM quantizations and merging
40
  - Then assign the num_experts_per_tok in config.json (or the config.yaml)
41
 
42
  # [tokensurgeon.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokensurgeon.py)
43
- - Uses adaptive VRAM from Grim Jim's `measure.py` like `graph_v18` to prevent OOM. Use recommended [batch file](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fix_tokenizers.bat) here or modify sh. This supposedly avoids 'Potemkin village' fake patches like `gen_id_patcher` and `vocab_id_patcher`.
44
 
45
  # [tokeninspector.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokeninspector.py)
46
  - Audit your tokensurgeon results.
47
 
 
 
 
48
  # [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
49
- - Updated! This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch missing `generation_config.json` files for EOS token. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
50
 
51
  # [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
52
  - This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
@@ -63,6 +74,9 @@ Tools to enhance LLM quantizations and merging
63
  # [textonly_ripper_v2.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper_v2.py)
64
  - Converts a sharded, multimodal (text and vision) model into a text-only version. Readme at [textonly_ripper.md](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper.md)
65
 
 
 
 
66
  # [vocab_resizer.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/vocab_resizer.py)
67
  - Converts models with larger vocab_sizes to a standard size (default 131072 Mistral 24B) for use with mergekit. Note that `tokenizer.model` must be manually copied into the `/fixed/` folder.
68
 
@@ -70,11 +84,14 @@ Tools to enhance LLM quantizations and merging
70
  - This script will load a "fat" 18.9GB model (default Gemma 9B), force it to tie the weights (deduplicating the lm_head), and re-save it. This will drop the file size to ~17.2GB and make it compatible with the others.
71
 
72
  # [model_index_json_generator.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/model_index_json_generator.py)
73
- - Generates a missing `model.safetensors.index.json` file. Useful for cases where safetensors may have been sharded at the wrong size.
74
 
75
  # [folder_content_combiner_anyfiles.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/folder_content_combiner_anyfiles.py)
76
  - Combines all files in the script's current directory into a single output file, sorted alphabetically.
77
 
 
 
 
78
  # [GGUF Repo Suite](https://huggingface.co/spaces/Naphula/gguf-repo-suite)
79
  - Create and quantize Hugging Face models
80
 
 
8
  ---
9
 
10
  # Model Tools by Naphula
11
+ Tools to enhance LLM quantizations and merging. Merge and audit large language models with low VRAM.
12
 
13
  # [graph_v18.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/graph_v18.py)
14
  - Merge models in minutes instead of hours on low VRAM. For a 3060/3060 Ti user: This script enables functionality that is otherwise impossible (merging 70B models or large 7B merges with `--cuda`) without OOM. [More details here](https://huggingface.co/spaces/Naphula/model_tools/blob/main/mergekit_low-VRAM-graph_patch.md)
 
17
  # config.py
18
  - Simply replace line 13 | BEFORE `ScalarOrGradient: TypeAlias = Union[float, List[float]]` → AFTER `ScalarOrGradient: TypeAlias = Union[float, List[float], str, bool]` | to allow for custom filepath strings within parameter settings.
19
 
20
+ # [enable_fix_mistral_regex_true.md](https://huggingface.co/spaces/Naphula/model_tools/blob/main/enable_fix_mistral_regex_true.md)
21
+ - Merge models with extreme tokenizer incompatibility. Requires modifying the `mergekit.yaml` `tokenizer` section and adding `--fix-mistral-regex` to your merge commands. (Note: Do not use `tokensurgeon.py`, `gen_id_patcher.py`, or `vocab_id_patcher.py` with this; they are obsolete now.) Configured for MN 12B by default. Follow the steps in this guide to modify these scripts:
22
+ - `mergekit/merge.py`
23
+ - `mergekit/options.py`
24
+ - `mergekit/scripts/moe.py`
25
+ - `mergekit/scripts/tokensurgeon.py`
26
+ - `mergekit/tokenizer/build.py`
27
+
28
  # [audit_della.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/audit_della.py)
29
  - Audit the compatibility of donor models for `Della` merges before merging. See: [example chart Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.png), [example log Asmodeus](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Asmodeus_Audit.log), [example chart Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.png), [example log Slimaki](https://huggingface.co/spaces/Naphula/model_tools/blob/main/Audits/Slimaki_Audit.log)
30
 
 
48
  - Then assign the num_experts_per_tok in config.json (or the config.yaml)
49
 
50
  # [tokensurgeon.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokensurgeon.py)
51
+ - Uses adaptive VRAM from Grim Jim's `measure.py` like `graph_v18` to prevent OOM. Use recommended [batch file](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fix_tokenizers.bat) here or modify sh. This avoids 'Potemkin village' fake patches like `gen_id_patcher` and `vocab_id_patcher`. For this to work properly, you must also run [shield_embeddings.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/shield_embeddings.py) and [shield_norms.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/shield_norms.py) on any merges made from models patched with tokensurgeon.
52
 
53
  # [tokeninspector.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/tokeninspector.py)
54
  - Audit your tokensurgeon results.
55
 
56
+ # [arcee_fusion_salience_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/arcee_fusion_salience_scanner.py)
57
+ - Scan the salience % of your arcee_fusion merges. The default `tukey_fence` value is 1.5 which results in 12.5% salience, but [this can be adjusted (see guide here)](modify_arcee_fusion_tukey_fence_parameter.md).
58
+
59
  # [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
60
+ - Updated! This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) and [vocab_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/vocab_id_patcher.py), or the [chatml_to_mistral.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/chatml_to_mistral.py) to patch missing `generation_config.json` files for EOS token. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
61
 
62
  # [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
63
  - This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
 
74
  # [textonly_ripper_v2.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper_v2.py)
75
  - Converts a sharded, multimodal (text and vision) model into a text-only version. Readme at [textonly_ripper.md](https://huggingface.co/spaces/Naphula/model_tools/blob/main/textonly_ripper.md)
76
 
77
+ # [json_reverter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/json_reverter.py)
78
+ - Revert changes to all JSON files done by `gen_id_patcher.py`, `vocab_id_patcher.py` or other scripts, within a specified root folder. It re-downloads the source files from the HF repo.
79
+
80
  # [vocab_resizer.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/vocab_resizer.py)
81
  - Converts models with larger vocab_sizes to a standard size (default 131072 Mistral 24B) for use with mergekit. Note that `tokenizer.model` must be manually copied into the `/fixed/` folder.
82
 
 
84
  - This script will load a "fat" 18.9GB model (default Gemma 9B), force it to tie the weights (deduplicating the lm_head), and re-save it. This will drop the file size to ~17.2GB and make it compatible with the others.
85
 
86
  # [model_index_json_generator.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/model_index_json_generator.py)
87
+ - Generates a missing `model.safetensors.index.json` file. Useful for cases where safetensors may have been sharded at the wrong size. [Single tensor variant here.](https://huggingface.co/spaces/Naphula/model_tools/blob/main/model_index_json_generator_SingleTensor.py)
88
 
89
  # [folder_content_combiner_anyfiles.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/folder_content_combiner_anyfiles.py)
90
  - Combines all files in the script's current directory into a single output file, sorted alphabetically.
91
 
92
+ # [folder+subfolder_content_combiner_anyfiles.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/folder+subfolder_content_combiner_anyfiles.py)
93
+ - Combines all files in the script's directory, including all files within subdirectories (excluding blacklisted folders such as `.git` and `__pycache__`) into a single output file, sorted alphabetically.
94
+
95
  # [GGUF Repo Suite](https://huggingface.co/spaces/Naphula/gguf-repo-suite)
96
  - Create and quantize Hugging Face models
97
 
modify_arcee_fusion_tukey_fence_parameter.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The default setting is 1.5 which merges to 12% salience. If you want 25% then decrease to 0.75. If you want 33% use 0.4, and 0.0 results in 50% salience.
2
+
3
+ `arcee_fusion` hardcodes this to 1.5 by default but you can simply edit the python script before merging (easier), or update the code to assign it as a yaml parameter (more complex).
4
+
5
+ [Here is a scanner to audit Arcee_Fusion merge salience](https://huggingface.co/spaces/Naphula/model_tools/blob/main/arcee_fusion_salience_scanner.py)
6
+
7
+ ---
8
+
9
+ In the context of merging models using the `mergekit` framework, the provided code showcases a **Dynamic Threshold Fusion** mechanism. Here's an overview of how modifying the **Tukey fence** parameter (from **1.5** to **0.75** or **3.0**) would affect the model merging process.
10
+
11
+ ## Impact of Changing Tukey Fence Parameter
12
+
13
+ ### Current Implementation
14
+ In your code, the dynamic threshold is set as:
15
+
16
+ ```python
17
+ dynamic_threshold = median + 1.0 * iqr # Tukey Fence
18
+ ```
19
+
20
+ This uses **1.0** as the multiplier instead of **1.5**, but let's assume it was meant to be **1.5** for generalizing the Tukey fence.
21
+
22
+ ### Changes to k Values
23
+
24
+ #### 1. **If k = 0.75**
25
+ - **Increased Sensitivity**: The dynamic threshold would be lower, leading to more values being included in the fusion mask.
26
+ - **More Features Merged**: This results in a potentially more aggressive merging of parameters, effectively incorporating even slight differences between the model weights.
27
+ - **Use Cases**: Ideal for exploratory processes or applications where capturing subtle variations is critical.
28
+
29
+ #### Implementation
30
+ You would change the line in `calculate_dynamic_threshold`:
31
+
32
+ ```python
33
+ dynamic_threshold = median + 0.75 * iqr # Tukey Fence with k = 0.75
34
+ ```
35
+
36
+ ---
37
+
38
+ #### 2. **If k = 3.0**
39
+ - **Decreased Sensitivity**: The dynamic threshold would be much higher, allowing only significant differences to be considered.
40
+ - **Fewer Features Merged**: This could lead to a conservative merging approach, where only clear outliers are merged, possibly overlooking important but smaller differences.
41
+ - **Use Cases**: Useful in high-stakes environments where only distinctly different parameters should contribute to the final model.
42
+
43
+ #### Implementation
44
+ You would change the same line to:
45
+
46
+ ```python
47
+ dynamic_threshold = median + 3.0 * iqr # Tukey Fence with k = 3.0
48
+ ```
49
+
50
+ ---
51
+
52
+ ## Summary of Effects on Fusion Process
53
+ - **k = 0.75**: This results in a more inclusive fusion mask. More difference is merged, potentially capturing more nuanced information.
54
+ - **k = 3.0**: This leads to a restrictive fusion mask. Smaller differences would be ignored, possibly resulting in a model that retains more characteristics of the base model if the other model has less deviation.
55
+
56
+ ### Final Thought
57
+ The choice of **k** can significantly dictate the behavior and performance of the merged model. Deciding whether to accommodate subtle differences or enforce strict conditions on merging will greatly depend on the specific use case and the importance of the characteristics being learned from either model. If you'd like to delve deeper into any specific aspect or need further assistance, just let me know!
58
+
59
+ ---
60
+
61
+ The reason you are seeing a consistent **~12%** is not because of a hardcoded number, but because of a specific statistical formula used to identify "outliers" in the weight distribution.
62
+
63
+ In `arcee_fusion.py`, the density is controlled by the **Tukey's Fence** method. Here are the specific lines of code and the math behind them:
64
+
65
+ ### 1. The Threshold Calculation
66
+ In `arcee_fusion.py`, inside the `DynamicThresholdFusion` class:
67
+
68
+ ```python
69
+ # Line 41-42: Calculate the Interquartile Range (IQR)
70
+ iqr = q3 - q1
71
+
72
+ # Line 45: The "Magic" Formula
73
+ dynamic_threshold = median + 1.5 * iqr
74
+ ```
75
+
76
+ ### 2. The Technical Analysis of "12%"
77
+ The value **`1.5`** is the standard multiplier for **Tukey's Fences**, a statistical method used to find "outliers."
78
+
79
+ * **The Logic:** Arcee Fusion assumes that most weight changes in a fine-tune are "noise" or "jitter." It only wants to keep the "salient" changes—the ones that are statistically significant.
80
+ * **The Distribution:** Importance scores in neural networks usually follow a **Laplace** or **Heavy-Tailed Gaussian** distribution.
81
+ * **The Result:** In these types of distributions, the data points that fall above `Median + 1.5 * IQR` (the "upper fence") typically represent the top **10% to 15%** of the values.
82
+
83
+ Your result of **12.1%** means that in every layer, roughly 12% of the weights were "important enough" to cross that statistical fence. The other 88% were considered "noise" and were reverted to the Mistral Base weights.
84
+
85
+ ### 3. Why is it so consistent across layers?
86
+ The scanner shows almost no variation (11.9% to 12.2%). This reveals a technical truth about your fine-tunes: **The "knowledge" is spread evenly.**
87
+
88
+ If you were merging a model that only learned a specific task (like coding), you would see a "spike" in the middle layers (where logic is processed) and 0% in the early layers. Because your layers are all ~12%, it means the fine-tuned models you are using modified the entire "brain" of the model uniformly.
89
+
90
+ ### 4. How to change this (The "Patch")
91
+ If you want a higher density (e.g., you want 25% or 50% "New Info"), you have to modify the multiplier in `arcee_fusion.py`.
92
+
93
+ * **To get MORE new info:** Lower the `1.5` to `0.5`.
94
+ * **To get LESS new info:** Raise the `1.5` to `3.0` (this is the "Extreme Outlier" threshold).
shield_embeddings.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import gc
4
+ import shutil
5
+ from safetensors.torch import load_file, save_file
6
+ import argparse
7
+
8
def get_weight_map(model_path):
    """Return the tensor-name -> shard-filename map for a model directory.

    Prefers the sharded-model index (model.safetensors.index.json). For a
    single-file model it assumes the first *.safetensors file found contains
    both the embedding and lm_head tensors. Returns {} if neither exists.
    """
    index_file = os.path.join(model_path, "model.safetensors.index.json")
    if os.path.exists(index_file):
        with open(index_file, 'r') as handle:
            return json.load(handle)["weight_map"]

    # No index file: fall back to the first safetensors shard in the folder.
    shard = next(
        (name for name in os.listdir(model_path) if name.endswith(".safetensors")),
        None,
    )
    if shard is not None:
        return {"model.embed_tokens.weight": shard, "lm_head.weight": shard}
    return {}
17
+
18
def swap_tensor(base_path, merged_path, tensor_name):
    """Replace `tensor_name` in the merged model with the pristine base-model copy.

    Loads the tensor from the base model's shard, injects it into the merged
    model's shard, and rewrites that shard via an atomic-rename strategy so a
    failed save never leaves a half-written file (works around Windows error
    1224 when overwriting memory-mapped files in place).

    Silently returns if either model does not contain `tensor_name`.
    """
    base_map = get_weight_map(base_path)
    merged_map = get_weight_map(merged_path)

    if tensor_name not in base_map or tensor_name not in merged_map:
        return

    base_shard = os.path.join(base_path, base_map[tensor_name])
    merged_shard = os.path.join(merged_path, merged_map[tensor_name])

    # Load pristine tensor; clone() detaches it from the base shard's storage
    # so the rest of base_tensors can be freed immediately.
    base_tensors = load_file(base_shard, device="cpu")
    pristine_tensor = base_tensors[tensor_name].clone()
    del base_tensors
    gc.collect()

    # Load merged shard and swap in the pristine tensor
    merged_tensors = load_file(merged_shard, device="cpu")
    merged_tensors[tensor_name] = pristine_tensor

    # ATOMIC RENAME STRATEGY (The only way to beat Windows 1224)
    backup_shard = merged_shard + ".old"
    # Bugfix: clear a stale backup left by a previous failed run, otherwise
    # os.rename raises on Windows (matches the guard in shield_norms.py).
    if os.path.exists(backup_shard):
        os.remove(backup_shard)
    os.rename(merged_shard, backup_shard)  # Move current file to side

    try:
        save_file(merged_tensors, merged_shard, metadata={"format": "pt"})
    except Exception as e:
        os.rename(backup_shard, merged_shard)  # Restore if fail
        raise e

    # Cleanup
    del merged_tensors
    del pristine_tensor
    gc.collect()
    os.remove(backup_shard)  # Now delete the old mapped file

    print(f" ✅ Successfully shielded {tensor_name}!")
55
+
56
def main():
    """CLI entry point: shield the embedding and lm_head tensors of a merge."""
    parser = argparse.ArgumentParser()
    parser.add_argument("base_model")
    parser.add_argument("merged_model")
    args = parser.parse_args()

    # The two vocabulary-facing tensors that merging tends to corrupt.
    for tensor_name in ("model.embed_tokens.weight", "lm_head.weight"):
        swap_tensor(args.base_model, args.merged_model, tensor_name)


if __name__ == "__main__":
    main()
shield_norms.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python shield_norms.py "B:\12B\models--p-e-w--Mistral-Nemo-Instruct-2407-heretic-noslop" "C:\Quanter\model_cache\EldritchLabs__Nocturne-Nereid-12B-v1"
2
+
3
+ import os
4
+ import json
5
+ import gc
6
+ import re
7
+ from safetensors.torch import load_file, save_file
8
+ import argparse
9
+
10
def get_weight_map(model_path):
    """Map every tensor name in `model_path` to the safetensors shard holding it.

    Uses model.safetensors.index.json when present. Otherwise opens each
    *.safetensors file and enumerates its keys — slow, but accurate for
    single-file models.
    """
    index_file = os.path.join(model_path, "model.safetensors.index.json")
    if os.path.exists(index_file):
        with open(index_file, 'r') as handle:
            return json.load(handle)["weight_map"]

    # Fallback for single-file models: read the keys straight out of each shard.
    mapping = {}
    for entry in os.listdir(model_path):
        if not entry.endswith(".safetensors"):
            continue
        tensors = load_file(os.path.join(model_path, entry))
        for key in tensors.keys():
            mapping[key] = entry
    return mapping
24
+
25
def shield_norms(base_path, merged_path):
    """Overwrite every normalization weight in the merged model with the
    pristine copy from the base model, rewriting shards in place.

    Targets any tensor whose name ends in "norm.weight" (final model norm,
    input_layernorm, post_attention_layernorm). Shards are processed one at a
    time, and each rewrite uses an atomic-rename strategy so a failed save
    never leaves a half-written shard.
    """
    print("\n[1] Mapping tensors...")
    base_map = get_weight_map(base_path)
    merged_map = get_weight_map(merged_path)

    # Identify all normalization tensors
    # Catching: model.norm.weight, input_layernorm.weight, post_attention_layernorm.weight
    norm_pattern = re.compile(r".*norm\.weight$")
    target_tensors = [t for t in merged_map.keys() if norm_pattern.match(t)]

    if not target_tensors:
        print(" [!] No normalization tensors found!")
        return

    print(f" -> Found {len(target_tensors)} normalization tensors to shield.")

    # Group by shard to minimize file opening
    shards_to_process = {}
    for t in target_tensors:
        shard = merged_map[t]
        if shard not in shards_to_process:
            shards_to_process[shard] = []
        shards_to_process[shard].append(t)

    print(f"\n[2] Processing {len(shards_to_process)} shards...")

    for shard_name, tensors in shards_to_process.items():
        merged_shard_path = os.path.join(merged_path, shard_name)
        backup_shard_path = merged_shard_path + ".old"

        print(f" -> Shard: {shard_name}")

        # 1. Load the merged shard
        merged_tensors = load_file(merged_shard_path, device="cpu")

        # 2. Update each target tensor in this shard
        for t_name in tensors:
            if t_name in base_map:
                base_shard_path = os.path.join(base_path, base_map[t_name])
                base_data = load_file(base_shard_path, device="cpu")

                print(f" Injecting pristine: {t_name}")
                # clone() detaches the tensor from the base shard's storage so
                # base_data can be released immediately.
                merged_tensors[t_name] = base_data[t_name].clone()

                del base_data
            else:
                print(f" [!] Warning: {t_name} not found in base model. Skipping.")

        # 3. Atomic Rename Strategy for Windows
        # Remove a stale backup from an earlier failed run, then move the live
        # shard aside so save_file never overwrites a memory-mapped file.
        if os.path.exists(backup_shard_path):
            os.remove(backup_shard_path)

        os.rename(merged_shard_path, backup_shard_path)

        try:
            save_file(merged_tensors, merged_shard_path, metadata={"format": "pt"})
            print(f" ✅ Shard saved successfully.")
        except Exception as e:
            print(f" ❌ Error saving shard: {e}")
            # Restore the original shard before propagating the failure.
            os.rename(backup_shard_path, merged_shard_path)
            raise e

        # 4. Cleanup and release handles
        del merged_tensors
        gc.collect()
        try:
            os.remove(backup_shard_path)
        except Exception as e:
            print(f" [!] Note: Could not delete .old file immediately (OS lock). It will be orphaned: {e}")
94
+
95
def main():
    """CLI entry point: parse model paths and revert all norm layers in the merge."""
    parser = argparse.ArgumentParser(description="Revert all normalization layers to base model values.")
    parser.add_argument("base_model", help="Path to the pristine base model")
    parser.add_argument("merged_model", help="Path to the merged model with artifacts")
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("🛡️ NORM SHIELD: RE-CALIBRATING OUTPUT MANIFOLD")
    print(banner)

    shield_norms(args.base_model, args.merged_model)

    print("\n" + banner)
    print("Done! All normalization layers have been reverted to Base.")
    print("This should eliminate the 'napad' / 'derrotó' vector drift.")
    print(banner)


if __name__ == "__main__":
    main()
vocab_id_patcher.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ from colorama import init, Fore, Style
5
+
6
init()  # colorama: enable ANSI color codes on Windows consoles

# Updated list to include Dolphin, Noromaid, and Hermes.
# Folder-name substrings identifying the models this script is allowed to
# patch; matched via substring test (`target in dirname`) in main(), so
# HuggingFace-style prefixed folder names also match.
TARGET_MODELS = [
    "dreamgen--WizardLM-2-7B",
    "KoboldAI--Mistral-7B-Erebus-v3",
    "teknium--Mistral-Trismegistus-7B",
    "dolphin-2.2.1-mistral-7b",
    "Noromaid-7B-0.4-DPO",
    "Hermes-Trismegistus-Mistral-7B"
]
17
+
18
def load_json(path):
    """Best-effort JSON loader.

    Returns the object parsed from *path*, or an empty dict when the file
    does not exist or cannot be read/parsed. Never raises.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception:
        return {}
    return parsed
26
+
27
def save_json(path, data):
    """Serialize *data* to *path* as UTF-8 JSON with 2-space indentation."""
    with open(path, 'w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=2)
30
+
31
def patch_model(model_path):
    """Force one model folder's tokenizer metadata into standard Mistral EOS form.

    Patches up to three files inside ``model_path``; each step is best-effort
    and only logs (never raises) when a file is missing or malformed:

      1. tokenizer_config.json  -> eos_token "</s>", sane model_max_length
      2. special_tokens_map.json -> eos_token "</s>"
      3. tokenizer.json          -> vocab entry "</s>" forced to ID 2; if the
         file is absent, writes a minimal stub so downstream tools can still
         resolve the EOS token ID.
    """
    print(f"Processing: {Fore.YELLOW}{os.path.basename(model_path)}{Style.RESET_ALL}")

    # 1. Fix Tokenizer Config
    tok_config_path = os.path.join(model_path, "tokenizer_config.json")
    if os.path.exists(tok_config_path):
        data = load_json(tok_config_path)

        # Force EOS to standard Mistral
        data["eos_token"] = "</s>"

        # Ensure model_max_length is set; any value above 100000 is treated as
        # an unset sentinel (presumably guarding against 1e30-style defaults —
        # TODO confirm) and clamped to Mistral's 32768 context length.
        if "model_max_length" not in data or data["model_max_length"] > 100000:
            data["model_max_length"] = 32768

        save_json(tok_config_path, data)
        print(f" -> Updated tokenizer_config.json (eos_token: </s>)")
    else:
        print(f" {Fore.RED}! tokenizer_config.json missing{Style.RESET_ALL}")

    # 2. Fix Special Tokens Map
    spec_map_path = os.path.join(model_path, "special_tokens_map.json")
    if os.path.exists(spec_map_path):
        data = load_json(spec_map_path)
        data["eos_token"] = "</s>"
        save_json(spec_map_path, data)
        print(f" -> Updated special_tokens_map.json")

    # 3. Attempt to fix Tokenizer JSON (The source of "MISSING" ID)
    tok_json_path = os.path.join(model_path, "tokenizer.json")
    if os.path.exists(tok_json_path):
        data = load_json(tok_json_path)

        # Check if it has the standard structure
        if "model" in data and "vocab" in data["model"]:
            vocab = data["model"]["vocab"]

            # Ensure </s> exists and is 2
            if "</s>" not in vocab:
                print(f" {Fore.YELLOW}! '</s>' was missing from vocab. Adding it as ID 2.{Style.RESET_ALL}")
                vocab["</s>"] = 2
            elif vocab["</s>"] != 2:
                print(f" {Fore.RED}! '</s>' is ID {vocab['</s>']}. Forcing to 2.{Style.RESET_ALL}")
                vocab["</s>"] = 2

            # Save back.  (The re-assignment is redundant — `vocab` aliases the
            # dict inside `data` — but harmless; kept as written.)
            data["model"]["vocab"] = vocab
            save_json(tok_json_path, data)
            print(f" -> Patched tokenizer.json vocab")
        else:
            print(f" {Fore.RED}! tokenizer.json has unexpected structure.{Style.RESET_ALL}")
    else:
        print(f" {Fore.RED}! tokenizer.json not found. (This is why Vocab ID is MISSING){Style.RESET_ALL}")
        print(f" {Fore.CYAN}* Attempting to create minimal tokenizer.json wrapper...{Style.RESET_ALL}")
        # Create a dummy wrapper so Mergekit can find the ID.
        # NOTE(review): the stub has no "merges" section, so full tokenizer
        # libraries may refuse to load it — it only exists to expose the ID.
        dummy_data = {
            "version": "1.0",
            "model": {
                "type": "BPE",
                "vocab": {"<unk>": 0, "<s>": 1, "</s>": 2}
            }
        }
        save_json(tok_json_path, dummy_data)
        print(f" -> Created minimal tokenizer.json")
95
+
96
def main():
    """CLI entry point: walk a directory tree and patch every target model.

    Takes a single positional argument (the root folder holding model
    directories) and runs ``patch_model`` on each subdirectory whose name
    contains one of the ``TARGET_MODELS`` substrings.
    """
    parser = argparse.ArgumentParser(description="Patch missing Vocab IDs for specific models")
    parser.add_argument("base_dir", help="Directory containing the model folders (e.g. B:\\7B)")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- VOCAB ID PATCHER ---{Style.RESET_ALL}")

    # Walk through the directory to find the specific models
    found = 0
    for root, dirs, files in os.walk(args.base_dir):
        for name in dirs:
            # Check if this folder matches one of our targets
            # (substring match, so renamed/prefixed folders still hit)
            if any(target in name for target in TARGET_MODELS):
                full_path = os.path.join(root, name)

                # CHECK FOR 'fixed' SUBDIRECTORY
                # If the model files are actually inside a 'fixed' subfolder, we must patch that instead.
                fixed_sub = os.path.join(full_path, "fixed")
                if os.path.exists(fixed_sub):
                    print(f"Found target '{name}' -> Detected 'fixed' subdir. Patching inside.")
                    patch_model(fixed_sub)
                else:
                    patch_model(full_path)

                found += 1

    # Nothing matched anywhere under base_dir: tell the user what we wanted.
    if found == 0:
        print(f"{Fore.RED}No target models found in {args.base_dir}{Style.RESET_ALL}")
        print(f"Looking for: {TARGET_MODELS}")
125
+
126
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()