Upload re-vision.py with huggingface_hub
re-vision.py  +209 -0
ADDED
@@ -0,0 +1,209 @@
# pip install safetensors torch tqdm  (pathlib is part of the standard library)

import json
import os
from pathlib import Path
from safetensors.torch import load_file, save_file, safe_open
from collections import defaultdict
import torch  # required by safetensors.torch; no explicit dtype/device casting is expected here
import shutil
from tqdm import tqdm  # optional: progress bar over shards

# --- Configuration ---
BASE_MODEL_DIR = Path("/home/dgxuser/workspace/Mango/models/Mistral-Small-3.2-24B-Instruct-2506")
TRAINED_MODEL_DIR = Path("/home/dgxuser/workspace/Mango/axolotl/24B-Retrain/merged")
OUTPUT_MODEL_DIR = Path("/home/dgxuser/workspace/docshotgun/models/MS3.2-Venice-SFT-KTO-0.35-beta-re-vision")

# Prefix used for language-model layers in the base model
BASE_LM_PREFIX = "language_model."
# Prefix used for language-model layers in the trained model
# (assuming the trained model has the prefix stripped)
TRAINED_LM_PREFIX = ""  # if trained keys are 'model.layers...', this is effectively empty relative to the base

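# For reference, the intended key mapping looks like this (illustrative names,
# not read from the actual checkpoints):
#   base:    language_model.model.layers.0.self_attn.q_proj.weight
#   trained: model.layers.0.self_attn.q_proj.weight
# Base tensors outside the language model (e.g. vision-tower weights) have no
# trained counterpart and are carried over unchanged.
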
# --- Safety Check ---
if OUTPUT_MODEL_DIR.exists() and any(OUTPUT_MODEL_DIR.iterdir()):
    print(f"Warning: Output directory {OUTPUT_MODEL_DIR} already exists and is not empty.")
    # Decide whether to overwrite or stop:
    # input("Press Enter to continue and potentially overwrite files, or Ctrl+C to abort.")
    pass  # or raise an error: raise FileExistsError(f"Output directory {OUTPUT_MODEL_DIR} is not empty.")

# --- Create Output Directory ---
OUTPUT_MODEL_DIR.mkdir(parents=True, exist_ok=True)

# --- Load Index Files ---
try:
    base_index_path = next(BASE_MODEL_DIR.glob("*.safetensors.index.json"))
    with open(base_index_path, 'r') as f:
        base_index = json.load(f)
    print(f"Loaded base model index from: {base_index_path}")
except StopIteration:
    raise FileNotFoundError(f"Could not find *.safetensors.index.json in {BASE_MODEL_DIR}")

try:
    trained_index_path = next(TRAINED_MODEL_DIR.glob("*.safetensors.index.json"))
    with open(trained_index_path, 'r') as f:
        trained_index = json.load(f)
    print(f"Loaded trained model index from: {trained_index_path}")
except StopIteration:
    raise FileNotFoundError(f"Could not find *.safetensors.index.json in {TRAINED_MODEL_DIR}")

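# Optional sanity check (a sketch): indexes written by save_pretrained usually
# carry a metadata.total_size field. The trained checkpoint covers only the
# language model, so its total is expected to be smaller than the base's.
base_total = base_index.get("metadata", {}).get("total_size")
trained_total = trained_index.get("metadata", {}).get("total_size")
print(f"Base total_size: {base_total}, trained total_size: {trained_total}")
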
# --- Prepare Trained Tensor Lookup ---
# Map each trained tensor name to the shard file that contains it
trained_tensor_to_shard = trained_index.get("weight_map", {})
if not trained_tensor_to_shard:
    raise ValueError("Could not find 'weight_map' in trained model index.")
print(f"Built lookup map for {len(trained_tensor_to_shard)} trained tensors.")

# --- Process Shards ---
base_weight_map = base_index.get("weight_map", {})
if not base_weight_map:
    raise ValueError("Could not find 'weight_map' in base model index.")

# Group base tensors by the shard they belong to
base_shards_content = defaultdict(list)
for tensor_name, shard_file in base_weight_map.items():
    base_shards_content[shard_file].append(tensor_name)

print(f"Processing {len(base_shards_content)} shards from the base model...")

# tqdm gives a progress bar over the shards
for shard_file, tensors_in_shard in tqdm(base_shards_content.items(), desc="Merging Shards"):
    base_shard_path = BASE_MODEL_DIR / shard_file
    output_shard_path = OUTPUT_MODEL_DIR / shard_file

    # Load the current base model shard on the CPU to save GPU memory
    current_shard_tensors = load_file(base_shard_path, device="cpu")

    # Identify which tensors in this shard need replacement
    tensors_to_replace = {}  # {base_tensor_name: trained_tensor_name}
    for base_tensor_name in tensors_in_shard:
        if base_tensor_name.startswith(BASE_LM_PREFIX):
            # Derive the corresponding name in the trained model,
            # e.g. language_model.model.layers.0... -> model.layers.0...
            potential_trained_name = base_tensor_name[len(BASE_LM_PREFIX):]

            # Replace only if the derived name exists in the trained model's
            # index; otherwise keep the base tensor. A miss can happen for
            # non-layer LM parts whose naming differs, or for LM parts the
            # stripped trained model does not carry.
            if potential_trained_name in trained_tensor_to_shard:
                tensors_to_replace[base_tensor_name] = potential_trained_name

        # --- Explicit check for the LM head (common case) ---
        # Many models keep `lm_head.weight` outside the `language_model` block;
        # keep the base tensor if the trained model does not provide one
        elif base_tensor_name == "lm_head.weight":  # adjust if your LM head has a different name
            if "lm_head.weight" in trained_tensor_to_shard:
                tensors_to_replace[base_tensor_name] = "lm_head.weight"

    # Group the needed trained tensors by the shard they are located in
    needed_trained_shards = defaultdict(list)  # {trained_shard_file: [trained_tensor_names]}
    # Iterate over a copy of the items: deleting from a dict while iterating
    # it directly raises a RuntimeError in Python 3
    for base_name, trained_name in list(tensors_to_replace.items()):
        try:
            trained_shard_file = trained_tensor_to_shard[trained_name]
            needed_trained_shards[trained_shard_file].append(trained_name)
        except KeyError:
            print(f"  Warning: Tensor '{trained_name}' (derived from '{base_name}') listed for replacement but not found in trained model's weight map. Skipping.")
            del tensors_to_replace[base_name]

    # Load the needed trained shards one by one and collect the tensors
    loaded_trained_tensors = {}
    for trained_shard_file, trained_tensor_names in needed_trained_shards.items():
        trained_shard_path = TRAINED_MODEL_DIR / trained_shard_file
        try:
            # load_file reads the whole shard; safe_open could read just the
            # required tensors, but loading the full shard keeps this simple
            shard_data = load_file(trained_shard_path, device="cpu")
            for name in trained_tensor_names:
                if name in shard_data:
                    loaded_trained_tensors[name] = shard_data[name]
                else:
                    print(f"  Warning: Expected tensor '{name}' not found within loaded trained shard '{trained_shard_file}'.")
            del shard_data  # free memory
        except FileNotFoundError:
            print(f"  Error: Could not find required trained shard file: {trained_shard_path}. Cannot perform replacements for tensors in this shard.")
            # Drop base tensors that relied on the missing shard from the replacement list
            base_names_to_remove = [b_name for b_name, t_name in tensors_to_replace.items() if t_name in trained_tensor_names]
            for b_name in base_names_to_remove:
                del tensors_to_replace[b_name]
                print(f"  Skipping replacement for base tensor: {b_name}")

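    # A sketch of the per-tensor alternative mentioned above (not used by this
    # script): safetensors' safe_open can read individual tensors without
    # materialising the whole shard. It would replace the load_file call in
    # the loop above.
    #
    # with safe_open(str(trained_shard_path), framework="pt", device="cpu") as f:
    #     available = set(f.keys())
    #     for name in trained_tensor_names:
    #         if name in available:
    #             loaded_trained_tensors[name] = f.get_tensor(name)
    #         else:
    #             print(f"  Warning: Expected tensor '{name}' not found in '{trained_shard_file}'.")
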
    # Perform the replacements in the loaded base shard dictionary
    replacement_count = 0
    for base_name, trained_name in tensors_to_replace.items():
        # Tensors whose trained shard failed to load were warned about above
        if trained_name in loaded_trained_tensors:
            # Sanity-check shapes (optional but recommended)
            if current_shard_tensors[base_name].shape != loaded_trained_tensors[trained_name].shape:
                print(f"  Warning: Shape mismatch for {base_name}! Base: {current_shard_tensors[base_name].shape}, Trained: {loaded_trained_tensors[trained_name].shape}. Skipping replacement.")
                continue
            current_shard_tensors[base_name] = loaded_trained_tensors[trained_name]
            replacement_count += 1

    # print(f"  Replaced {replacement_count} tensors in shard {shard_file}.")

    # Save the modified shard to the output directory, creating the shard's
    # parent directory first in case shards are nested (unlikely but possible)
    output_shard_path.parent.mkdir(parents=True, exist_ok=True)
    # The {"format": "pt"} metadata matches what save_pretrained writes;
    # transformers checks for it when loading safetensors checkpoints
    save_file(current_shard_tensors, output_shard_path, metadata={"format": "pt"})

    # Clean up loaded tensors for this shard before moving on
    del current_shard_tensors
    del loaded_trained_tensors

print("Finished processing shards.")

# --- Copy Non-Tensor Files ---
print("Copying non-tensor files (index, config, tokenizer, etc.)...")
copied_files = []
skipped_files = []

for item in BASE_MODEL_DIR.iterdir():
    # Skip the shard files and the index (the index is copied explicitly
    # below); .md files are also skipped
    if item.is_file() and (".safetensors" not in item.name) and (".md" not in item.name):
        output_path = OUTPUT_MODEL_DIR / item.name
        try:
            shutil.copy2(item, output_path)  # copy2 preserves file metadata
            copied_files.append(item.name)
        except Exception as e:
            skipped_files.append(f"{item.name} (Error: {e})")
    elif item.is_dir():  # also copy relevant subdirectories, e.g. tokenizer configs
        output_path = OUTPUT_MODEL_DIR / item.name
        if output_path.exists():
            shutil.rmtree(output_path)  # overwrite the directory if it already exists
        try:
            shutil.copytree(item, output_path)
            copied_files.append(f"{item.name}/")
        except Exception as e:
            skipped_files.append(f"{item.name}/ (Error: {e})")

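# Note: config.json, tokenizer files, etc. come verbatim from the base model.
# Since tensor names, shapes, and shard assignments were left unchanged, the
# base *.safetensors.index.json copied below still describes the merged
# weights exactly.
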
# Explicitly copy the original base index file to the new directory
try:
    shutil.copy2(base_index_path, OUTPUT_MODEL_DIR / base_index_path.name)
    copied_files.append(base_index_path.name)
except Exception as e:
    skipped_files.append(f"{base_index_path.name} (Error: {e})")

print(f"Copied: {', '.join(copied_files)}")
if skipped_files:
    print(f"Skipped/Errors: {', '.join(skipped_files)}")

print(f"\nSuccessfully created merged model in: {OUTPUT_MODEL_DIR}")
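
# Optional verification (a sketch, not part of the original flow): confirm that
# every tensor listed in the copied index exists in the shards written above.
out_index_path = OUTPUT_MODEL_DIR / base_index_path.name
out_weight_map = json.loads(out_index_path.read_text())["weight_map"]
out_shard_to_names = defaultdict(list)
for name, shard in out_weight_map.items():
    out_shard_to_names[shard].append(name)
missing = []
for shard, names in out_shard_to_names.items():
    with safe_open(str(OUTPUT_MODEL_DIR / shard), framework="pt", device="cpu") as f:
        present = set(f.keys())
        missing.extend(n for n in names if n not in present)
print(f"Index verification: {len(missing)} tensor(s) missing from the written shards.")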