Upload re-vision.py with huggingface_hub

8c785f0 verified 8 months ago

10.8 kB

	# pip install pathlib safetensors tqdm

	import json
	import os
	from pathlib import Path
	from safetensors.torch import load_file, save_file, safe_open
	from collections import defaultdict
	import torch # Needed for tensor manipulation if any dtype/device casting were required (not expected here)
	import shutil
	from tqdm import tqdm # Optional: for progress bar

	# --- Configuration ---
	BASE_MODEL_DIR = Path("/home/dgxuser/workspace/Mango/models/Mistral-Small-3.2-24B-Instruct-2506")
	TRAINED_MODEL_DIR = Path("/home/dgxuser/workspace/Mango/axolotl/24B-Retrain/merged")
	OUTPUT_MODEL_DIR = Path("/home/dgxuser/workspace/docshotgun/models/MS3.2-Venice-SFT-KTO-0.35-beta-re-vision")

	# Define the prefix used in the base model for language model layers
	BASE_LM_PREFIX = "language_model."
	# Define the prefix used in the trained model for language model layers
	# (Assuming the trained model has the prefix stripped)
	TRAINED_LM_PREFIX = "" # If trained keys are 'model.layers...', this is effectively empty relative to the base

	# --- Safety Check ---
	if OUTPUT_MODEL_DIR.exists() and any(OUTPUT_MODEL_DIR.iterdir()):
	print(f"Warning: Output directory {OUTPUT_MODEL_DIR} already exists and is not empty.")
	# Decide if you want to overwrite or stop
	# input("Press Enter to continue and potentially overwrite files, or Ctrl+C to abort.")
	pass # Or raise an error: raise FileExistsError(f"Output directory {OUTPUT_MODEL_DIR} is not empty.")

	# --- Create Output Directory ---
	OUTPUT_MODEL_DIR.mkdir(parents=True, exist_ok=True)

	# --- Load Index Files ---
	try:
	base_index_path = next(BASE_MODEL_DIR.glob("*.safetensors.index.json"))
	with open(base_index_path, 'r') as f:
	base_index = json.load(f)
	print(f"Loaded base model index from: {base_index_path}")
	except StopIteration:
	raise FileNotFoundError(f"Could not find *.safetensors.index.json in {BASE_MODEL_DIR}")

	try:
	trained_index_path = next(TRAINED_MODEL_DIR.glob("*.safetensors.index.json"))
	with open(trained_index_path, 'r') as f:
	trained_index = json.load(f)
	print(f"Loaded trained model index from: {trained_index_path}")
	except StopIteration:
	raise FileNotFoundError(f"Could not find *.safetensors.index.json in {TRAINED_MODEL_DIR}")


	# --- Prepare Trained Tensor Lookup ---
	# Create a map from trained tensor name to the shard file it's in
	trained_tensor_to_shard = trained_index.get("weight_map", {})
	if not trained_tensor_to_shard:
	raise ValueError("Could not find 'weight_map' in trained model index.")
	print(f"Built lookup map for {len(trained_tensor_to_shard)} trained tensors.")

	# --- Process Shards ---
	base_weight_map = base_index.get("weight_map", {})
	if not base_weight_map:
	raise ValueError("Could not find 'weight_map' in base model index.")

	# Group base tensors by the shard they belong to
	base_shards_content = defaultdict(list)
	for tensor_name, shard_file in base_weight_map.items():
	base_shards_content[shard_file].append(tensor_name)

	print(f"Processing {len(base_shards_content)} shards from the base model...")

	# Use tqdm for progress bar over shards
	for shard_file, tensors_in_shard in tqdm(base_shards_content.items(), desc="Merging Shards"):
	base_shard_path = BASE_MODEL_DIR / shard_file
	output_shard_path = OUTPUT_MODEL_DIR / shard_file

	# Load the current base model shard
	# print(f" Loading base shard: {shard_file}")
	current_shard_tensors = load_file(base_shard_path, device="cpu") # Load to CPU to save GPU memory

	# Identify which tensors in this shard need replacement
	tensors_to_replace = {} # {base_tensor_name: trained_tensor_name}
	for base_tensor_name in tensors_in_shard:
	if base_tensor_name.startswith(BASE_LM_PREFIX):
	# Derive the corresponding name in the trained model
	# e.g., language_model.model.layers.0... -> model.layers.0...
	potential_trained_name = base_tensor_name[len(BASE_LM_PREFIX):]

	# Check if this derived name exists in the trained model's index
	if potential_trained_name in trained_tensor_to_shard:
	tensors_to_replace[base_tensor_name] = potential_trained_name
	else:
	# This might happen for non-layer LM parts if the naming convention differs
	# Or if the base model has LM parts not present in the stripped trained model
	# print(f" Debug: Base tensor {base_tensor_name} starts with prefix, but derived name {potential_trained_name} not found in trained model map. Skipping replacement.")
	pass # Keep the base tensor

	# --- Explicit Check for LM Head (Common Case) ---
	# Many models have `lm_head.weight` outside the `language_model` block
	# Check if the trained model also has `lm_head.weight` (or similar)
	elif base_tensor_name == "lm_head.weight": # Adjust if your LM head has a different name
	if "lm_head.weight" in trained_tensor_to_shard:
	tensors_to_replace[base_tensor_name] = "lm_head.weight"
	else:
	# print(f" Debug: Base tensor 'lm_head.weight' found, but not present in trained model map. Skipping replacement.")
	pass # Keep the base tensor

	# Group the needed trained tensors by the shard they are located in
	needed_trained_shards = defaultdict(list) # {trained_shard_file: [list of trained_tensor_names]}
	for base_name, trained_name in tensors_to_replace.items():
	try:
	trained_shard_file = trained_tensor_to_shard[trained_name]
	needed_trained_shards[trained_shard_file].append(trained_name)
	except KeyError:
	print(f" Warning: Tensor '{trained_name}' (derived from '{base_name}') listed for replacement but not found in trained model's weight map. Skipping.")
	# Remove from replacement list if lookup fails
	del tensors_to_replace[base_name]


	# Load needed trained shards one by one and perform replacements
	loaded_trained_tensors = {}
	for trained_shard_file, trained_tensor_names in needed_trained_shards.items():
	trained_shard_path = TRAINED_MODEL_DIR / trained_shard_file
	# print(f" Loading trained shard: {trained_shard_file} for {len(trained_tensor_names)} tensor(s)")
	try:
	# Load only the required tensors from the trained shard if possible (optimisation - requires safetensors >= 0.4.0)
	# Note: As of mid-2023, load_file loads the whole shard. This is aspirational or requires custom loading.
	# For now, we load the whole shard.
	shard_data = load_file(trained_shard_path, device="cpu")
	for name in trained_tensor_names:
	if name in shard_data:
	loaded_trained_tensors[name] = shard_data[name]
	else:
	print(f" Warning: Expected tensor '{name}' not found within loaded trained shard '{trained_shard_file}'.")
	del shard_data # Free memory
	except FileNotFoundError:
	print(f" Error: Could not find required trained shard file: {trained_shard_path}. Cannot perform replacements for tensors in this shard.")
	# Remove base tensors that relied on this missing shard from replacement list
	base_names_to_remove = [b_name for b_name, t_name in tensors_to_replace.items() if t_name in trained_tensor_names]
	for b_name in base_names_to_remove:
	del tensors_to_replace[b_name]
	print(f" Skipping replacement for base tensor: {b_name}")


	# Perform the replacements in the loaded base shard dictionary
	replacement_count = 0
	for base_name, trained_name in tensors_to_replace.items():
	if trained_name in loaded_trained_tensors:
	# Sanity check shapes (optional but recommended)
	if current_shard_tensors[base_name].shape != loaded_trained_tensors[trained_name].shape:
	print(f" Warning: Shape mismatch for {base_name}! Base: {current_shard_tensors[base_name].shape}, Trained: {loaded_trained_tensors[trained_name].shape}. Skipping replacement.")
	continue
	current_shard_tensors[base_name] = loaded_trained_tensors[trained_name]
	replacement_count += 1
	# else: # Already handled by warnings above
	# print(f" Warning: Trained tensor '{trained_name}' was expected but not loaded. Skipping replacement for '{base_name}'.")

	# print(f" Replaced {replacement_count} tensors in shard {shard_file}.")

	# Save the modified shard to the output directory
	# Ensure the directory for the shard exists if shards are nested (unlikely but possible)
	output_shard_path.parent.mkdir(parents=True, exist_ok=True)
	# print(f" Saving modified shard to: {output_shard_path}")
	# Metadata can be copied if needed, but usually not necessary for simple weight replacement
	# Pass existing metadata from base_index if available and relevant per-tensor
	save_file(current_shard_tensors, output_shard_path)

	# Clean up loaded tensors for this shard
	del current_shard_tensors
	del loaded_trained_tensors

	print("Finished processing shards.")

	# --- Copy Non-Tensor Files ---
	print("Copying non-tensor files (index, config, tokenizer, etc.)...")
	copied_files = []
	skipped_files = []

	for item in BASE_MODEL_DIR.iterdir():
	# Skip the actual shard files and the index we processed
	if item.is_file() and (".safetensors" not in item.name) and (".md" not in item.name):
	output_path = OUTPUT_MODEL_DIR / item.name
	try:
	shutil.copy2(item, output_path) # copy2 preserves metadata
	copied_files.append(item.name)
	except Exception as e:
	skipped_files.append(f"{item.name} (Error: {e})")
	elif item.is_dir(): # Also copy relevant subdirectories like tokenizer configs
	output_path = OUTPUT_MODEL_DIR / item.name
	if output_path.exists():
	shutil.rmtree(output_path) # Overwrite directory if exists
	try:
	shutil.copytree(item, output_path)
	copied_files.append(f"{item.name}/")
	except Exception as e:
	skipped_files.append(f"{item.name}/ (Error: {e})")

	# Specifically copy the original base index file to the new directory
	try:
	shutil.copy2(base_index_path, OUTPUT_MODEL_DIR / base_index_path.name)
	copied_files.append(base_index_path.name)
	except Exception as e:
	skipped_files.append(f"{base_index_path.name} (Error: {e})")


	print(f"Copied: {', '.join(copied_files)}")
	if skipped_files:
	print(f"Skipped/Errors: {', '.join(skipped_files)}")


	print(f"\nSuccessfully created merged model in: {OUTPUT_MODEL_DIR}")