Upload fancy-apply.py

b860c85 verified 28 days ago

12.6 kB

	#!/usr/bin/env python3

	# THIS IS FOR ADVANCED LORA INTO BASE MODEL MERGING
	# Designed for use with the LTX2 model
	#
	# Make a text file with a list of LORAs you want to merge in this format for each line:
	# <path to safetensors>,<strength>,<lerp>
	#
	# The "lerp" parameter means "how much should I overwrite tensors that compete with LORAs listed above?". A value of "0" just mixes them all together, while "1" hard applies the LORA delta
	#
	# You can also supply separate audio and video strengths like this:
	# <path to safetensors>,<video strength>,<lerp>,<audio strength>
	#
	# Use this script like this:
	# python fancy-apply.py <base model safetensors> <lora list txt file> <merged output filename>

	import argparse
	import os
	from typing import Dict, Tuple, List, Optional
	from collections import defaultdict
	import math

	import torch
	from safetensors.torch import load_file, save_file
	from safetensors import safe_open


	# ----------------- Tuning knobs ----------------- #

	# If True, the normalized component of each LoRA's scale is divided by the
	# summed effective strengths of every LoRA touching the same base weight:
	#     scale_norm = base_scale / max(1.0, sum_eff_strengths_for_key)
	# where base_scale is the effective strength (times alpha / rank when the
	# LoRA provides an alpha). If False, scale_norm is just base_scale.
	NORMALIZE_OVERLAPS = True

	# Per-LoRA clipping threshold:
	# If not None, each LoRA's scaled delta is clipped so that:
	#     ||delta|| <= CLIP_RATIO * ||W||
	# Set to None to disable clipping.
	CLIP_RATIO: Optional[float] = 1.0


	# ----------------- Parsing LoRA list ----------------- #

	def parse_lora_list(path: str) -> List[Tuple[str, float, float, float]]:
	    """
	    Parse a LoRA list file into merge specifications.

	    Blank lines and lines starting with "#" are ignored. Every other line
	    is comma-separated, with whitespace around each field stripped:

	        filename.safetensors,0.7,0.0
	        filename2.safetensors,1.0,0.5,0.3

	    i.e. <path>,<video_strength>,<lerp>[,<audio_strength>]. Fields after
	    the fourth are ignored.

	    Args:
	        path: Text file to read. Decoded as UTF-8; a leading byte-order
	            mark (as written by some Windows editors) is skipped so it
	            cannot end up glued to the first path or hide a leading "#".

	    Returns:
	        A list of (path, video_strength, lerp_with_existing, audio_strength)
	        tuples in file order, where:
	            video_strength: base strength for video/shared weights
	            audio_strength: base strength for audio weights
	                (defaults to video_strength if omitted)
	            lerp_with_existing, clamped to [0, 1]:
	                0.0 -> fully normalized
	                1.0 -> fully direct
	                between -> blend between normalized and direct

	    Raises:
	        ValueError: If a line has fewer than three fields, or a numeric
	            field cannot be converted by float().
	    """
	    loras: List[Tuple[str, float, float, float]] = []
	    # "utf-8-sig" reads plain UTF-8 unchanged and drops a BOM when present.
	    with open(path, "r", encoding="utf-8-sig") as f:
	        for line in f:
	            line = line.strip()
	            if not line or line.startswith("#"):
	                continue

	            parts = [p.strip() for p in line.split(",")]
	            if len(parts) < 3:
	                raise ValueError(f"Invalid LoRA line (need at least file,video_strength,lerp): {line}")

	            filename = parts[0]
	            video_strength = float(parts[1])
	            lerp = float(parts[2])

	            # Audio strength is optional and mirrors the video strength.
	            if len(parts) >= 4:
	                audio_strength = float(parts[3])
	            else:
	                audio_strength = video_strength

	            # Keep the blend factor inside [0, 1].
	            lerp = max(0.0, min(1.0, lerp))

	            loras.append((filename, video_strength, lerp, audio_strength))

	    return loras


	# ----------------- Base loading ----------------- #

	def load_base_with_metadata(path: str):
	    """
	    Load a safetensors checkpoint onto the CPU together with its metadata.

	    Returns a (tensors, metadata) pair. `metadata` is whatever the file's
	    `metadata()` call reports, or an empty dict when that value is falsy.
	    """
	    # Read only the header metadata through the lazy handle.
	    with safe_open(path, framework="pt", device="cpu") as handle:
	        header_meta = handle.metadata()
	    if not header_meta:
	        header_meta = {}

	    weights = load_file(path, device="cpu")
	    return weights, header_meta


	# ----------------- LoRA key grouping ----------------- #

	def group_lora_pairs(lora_tensors: Dict[str, torch.Tensor]):
	    """
	    Yield (prefix, A_key, B_key, alpha_key) for each complete LoRA pair.

	    Keys ending in ".lora_A.weight", ".lora_B.weight" or ".alpha" are
	    grouped by the text before that suffix. A prefix missing its A or B key
	    is reported with a warning and skipped. `alpha_key` is None when the
	    prefix has no ".alpha" entry.
	    """
	    suffix_slots = (
	        (".lora_A.weight", "A"),
	        (".lora_B.weight", "B"),
	        (".alpha", "alpha"),
	    )

	    grouped = {}
	    for name in lora_tensors.keys():
	        for suffix, slot in suffix_slots:
	            if name.endswith(suffix):
	                stem = name[: -len(suffix)]
	                grouped.setdefault(stem, {})[slot] = name
	                break

	    for stem, slots in grouped.items():
	        if "A" in slots and "B" in slots:
	            yield stem, slots["A"], slots["B"], slots.get("alpha")
	        else:
	            print(f"Warning: incomplete LoRA prefix {stem}")


	def find_base_weight_key(base_tensors, lora_prefix):
	    """
	    Return the key in `base_tensors` that corresponds to `lora_prefix`.

	    Candidates are tried in this order: "<prefix>.weight",
	    "model.<prefix>.weight", "<prefix>", "model.<prefix>". The first one
	    present wins; None is returned when none of them exist.
	    """
	    names_to_try = (
	        f"{lora_prefix}.weight",
	        f"model.{lora_prefix}.weight",
	        lora_prefix,
	        f"model.{lora_prefix}",
	    )
	    return next((name for name in names_to_try if name in base_tensors), None)


	# ----------------- Audio / video classification ----------------- #

	def classify_prefix(prefix: str) -> str:
	    """
	    Label a LoRA prefix as 'cross', 'audio', 'video' or 'shared'.

	    Matching is case-insensitive substring search, checked in that order,
	    so cross-modal names win over the audio and video markers they also
	    contain. Anything that matches no marker is 'shared'.
	    """
	    lowered = prefix.lower()

	    # Order matters: the first rule with a matching marker decides.
	    ordered_rules = (
	        ("cross", ("audio_to_video", "video_to_audio")),
	        ("audio", ("audio_attn", "audio_ff", ".audio_")),
	        ("video", ("video_attn", "video_ff", ".video_")),
	    )
	    for label, markers in ordered_rules:
	        if any(marker in lowered for marker in markers):
	            return label

	    # No marker matched: treated as shared.
	    return "shared"


	def effective_strength_for_prefix(
	    prefix: str,
	    video_strength: float,
	    audio_strength: float,
	) -> float:
	    """
	    Choose the strength to use for a LoRA prefix from its classification.

	    'audio' prefixes get `audio_strength`; 'video' and 'shared' prefixes
	    get `video_strength`; 'cross' prefixes get the geometric mean of the
	    two strengths, each floored at 0.0 first.
	    """
	    kind = classify_prefix(prefix)

	    if kind == "cross":
	        # Geometric mean of the non-negative parts of both strengths.
	        video_part = max(video_strength, 0.0)
	        audio_part = max(audio_strength, 0.0)
	        return math.sqrt(video_part * audio_part)

	    # 'video' and 'shared' both fall back to the video strength.
	    return audio_strength if kind == "audio" else video_strength


	# ----------------- Pass 1: strength sums per key ----------------- #

	def compute_strength_sums(
	    base_tensors,
	    lora_specs: List[Tuple[str, float, float, float]],
	) -> Dict[str, float]:
	    """
	    Pass 1: total the effective strengths hitting each base weight.

	    Each LoRA file in `lora_specs` is loaded in turn. Every complete A/B
	    pair that resolves to a key in `base_tensors` adds that LoRA's
	    effective strength (audio/video/cross/shared) to the key's running
	    total. Pairs with no matching base key are ignored.

	    Returns a defaultdict(float) mapping base key -> summed strength.
	    """
	    totals: Dict[str, float] = defaultdict(float)

	    for spec in lora_specs:
	        lora_path, video_strength, lerp, audio_strength = spec
	        print(f"[Pass 1] Scanning {lora_path} (video={video_strength}, audio={audio_strength}, lerp={lerp})")
	        lora_tensors = load_file(lora_path, device="cpu")

	        for prefix, _a_key, _b_key, _alpha_key in group_lora_pairs(lora_tensors):
	            target = find_base_weight_key(base_tensors, prefix)
	            if target is not None:
	                totals[target] += effective_strength_for_prefix(
	                    prefix, video_strength, audio_strength
	                )

	        # Drop this LoRA before loading the next one.
	        del lora_tensors

	    print(f"[Pass 1] Keys with strength contributions: {len(totals)}")
	    return totals


	# ----------------- Pass 2: streaming application ----------------- #

	def apply_loras_streaming(
	    base_tensors,
	    lora_specs: List[Tuple[str, float, float, float]],
	    strength_sum: Dict[str, float],
	    clip_ratio: Optional[float] = CLIP_RATIO,
	):
	    """
	    Pass 2: merge each LoRA into `base_tensors` in place, one file at a time.

	    For every complete A/B pair that resolves to a base key, the delta
	    `B @ A` is scaled, optionally clipped, added to the base weight in
	    float32, and written back in the base weight's original dtype.

	    Args:
	        base_tensors: Mapping of base key -> tensor; updated in place.
	        lora_specs: (path, video_strength, lerp, audio_strength) tuples,
	            applied in list order.
	        strength_sum: Base key -> summed effective strength, used as the
	            normalization denominator when NORMALIZE_OVERLAPS is true.
	        clip_ratio: If not None, a scaled delta whose norm exceeds
	            `clip_ratio * norm(W)` is rescaled down to that norm. The norm
	            of W is taken from the current tensor, which may already
	            include earlier LoRAs.

	    Raises:
	        ValueError: If `B @ A` does not have the base weight's shape.
	    """
	    for lora_path, video_strength, lerp, audio_strength in lora_specs:
	        print(f"[Pass 2] Applying {lora_path} (video={video_strength}, audio={audio_strength}, lerp={lerp})")
	        lora_tensors = load_file(lora_path, device="cpu")

	        applied = 0
	        skipped = 0

	        for prefix, A_key, B_key, alpha_key in group_lora_pairs(lora_tensors):
	            base_key = find_base_weight_key(base_tensors, prefix)
	            if base_key is None:
	                skipped += 1
	                continue

	            W = base_tensors[base_key]

	            A = lora_tensors[A_key].to(torch.float32)
	            B = lora_tensors[B_key].to(torch.float32)
	            delta = B @ A

	            if delta.shape != W.shape:
	                raise ValueError(
	                    f"Shape mismatch for {prefix}: delta {delta.shape} vs base {W.shape}"
	                )

	            # For a 2-D A the first dimension is used as the rank.
	            rank = A.shape[0] if A.dim() == 2 else A.numel()

	            # Effective strength for this prefix (audio/video/cross/shared)
	            eff_strength = effective_strength_for_prefix(prefix, video_strength, audio_strength)

	            # Base strength + alpha scaling
	            if alpha_key is not None:
	                alpha = float(lora_tensors[alpha_key].to(torch.float32).item())
	                base_scale = eff_strength * alpha / max(rank, 1)
	            else:
	                base_scale = eff_strength

	            # Weighted normalization
	            if NORMALIZE_OVERLAPS:
	                total_strength = strength_sum.get(base_key, 0.0)
	                # Never divide by less than 1.0, so a weak lone LoRA is not amplified.
	                denom = max(1.0, total_strength)
	                scale_norm = base_scale / denom
	            else:
	                scale_norm = base_scale

	            # Direct (unnormalized) component
	            scale_direct = base_scale

	            # LERP between normalized and direct
	            scale = (1.0 - lerp) * scale_norm + lerp * scale_direct

	            delta_scaled = delta * scale

	            # One float32 copy of the base weight serves both the clipping
	            # norm and the final addition.
	            Wf = W.to(torch.float32)

	            # Per-LoRA clipping
	            if clip_ratio is not None:
	                base_norm = Wf.norm().item()
	                delta_norm = delta_scaled.norm().item()

	                if delta_norm > clip_ratio * base_norm and delta_norm > 0:
	                    # Shrink the delta so its norm equals clip_ratio * base_norm.
	                    delta_scaled *= (clip_ratio * base_norm) / delta_norm

	            # Apply update and restore the original storage dtype.
	            W_new = Wf + delta_scaled
	            base_tensors[base_key] = W_new.to(W.dtype)

	            applied += 1

	        print(f"[Pass 2] {lora_path}: applied {applied}, skipped {skipped}")
	        del lora_tensors


	def apply_loras_to_base(base_tensors, lora_specs):
	    """
	    Run both merge passes over `base_tensors`, modifying it in place.

	    Pass 1 totals the per-key strengths; pass 2 applies every LoRA using
	    those totals.
	    """
	    apply_loras_streaming(
	        base_tensors,
	        lora_specs,
	        compute_strength_sums(base_tensors, lora_specs),
	    )


	# ----------------- FP8 conversion ----------------- #

	def is_vae_key(key: str) -> bool:
	    """Return True when `key` starts with one of the VAE prefixes."""
	    vae_prefixes = (
	        "first_stage_model.",
	        "model.first_stage_model.",
	        "vae.",
	        "model.vae.",
	    )
	    return key.startswith(vae_prefixes)


	def is_text_encoder_key(key: str) -> bool:
	    """Return True when `key` starts with one of the text-encoder prefixes."""
	    text_encoder_prefixes = (
	        "text_encoder.",
	        "model.text_encoder.",
	        "cond_stage_model.",
	        "model.cond_stage_model.",
	    )
	    return key.startswith(text_encoder_prefixes)


	def is_unet_key(key: str) -> bool:
	    """Return True when `key` starts with one of the diffusion-model prefixes."""
	    unet_prefixes = (
	        "model.diffusion_model.",
	        "diffusion_model.",
	    )
	    return key.startswith(unet_prefixes)


	def convert_to_fp8_inplace(tensors: Dict[str, torch.Tensor]):
	    """
	    Cast selected tensors in `tensors` to torch.float8_e4m3fn, in place.

	    Only floating-point tensors whose key is a UNet or text-encoder key are
	    converted. VAE keys are left alone and counted separately; non-float
	    tensors and all remaining keys are counted as "other". A summary line
	    is printed at the end.
	    """
	    fp8_dtype = torch.float8_e4m3fn

	    converted = 0
	    skipped_vae = 0
	    skipped_other = 0

	    # Iterate over a snapshot of the keys because entries are reassigned.
	    for name in list(tensors):
	        tensor = tensors[name]
	        if not torch.is_floating_point(tensor):
	            skipped_other += 1
	        elif is_vae_key(name):
	            skipped_vae += 1
	        elif is_unet_key(name) or is_text_encoder_key(name):
	            tensors[name] = tensor.to(fp8_dtype)
	            converted += 1
	        else:
	            skipped_other += 1

	    print(
	        f"FP8 conversion: converted={converted}, "
	        f"skipped_vae={skipped_vae}, skipped_other={skipped_other}"
	    )


	# ----------------- Main CLI ----------------- #

	def main():
	    """
	    Command-line entry point.

	    Reads the base checkpoint and the LoRA list, merges the LoRAs into the
	    base tensors, converts UNet and text-encoder tensors to FP8, and saves
	    the result with the base checkpoint's metadata.

	    Raises FileNotFoundError when the base checkpoint is not a file and
	    ValueError when the LoRA list yields no entries.
	    """
	    cli = argparse.ArgumentParser(
	        description=(
	            "Apply LTX2-style LoRAs with separate video/audio strengths, "
	            "strength-weighted normalization, LERP blending, per‑LoRA clipping, "
	            "FP8 conversion, and metadata preservation (streaming, memory‑efficient)."
	        )
	    )
	    cli.add_argument("base", help="Base checkpoint (.safetensors)")
	    cli.add_argument("lora_list", help="Text file: path,video_strength,lerp[,audio_strength]")
	    cli.add_argument("output", help="Output FP8 checkpoint (.safetensors)")

	    args = cli.parse_args()
	    base_path = args.base
	    out_path = args.output

	    if not os.path.isfile(base_path):
	        raise FileNotFoundError(base_path)

	    lora_specs = parse_lora_list(args.lora_list)
	    if not lora_specs:
	        raise ValueError("No LoRAs specified.")

	    print(f"Loading base checkpoint: {base_path}")
	    base_tensors, metadata = load_base_with_metadata(base_path)
	    print(f"Base checkpoint has {len(base_tensors)} tensors.")

	    apply_loras_to_base(base_tensors, lora_specs)

	    print("Converting UNet + text encoder to FP8 (leaving VAE untouched)...")
	    convert_to_fp8_inplace(base_tensors)

	    print(f"Saving merged FP8 checkpoint to: {out_path}")
	    # A bare filename has no directory part; fall back to the current directory.
	    out_dir = os.path.dirname(out_path)
	    if not out_dir:
	        out_dir = "."
	    os.makedirs(out_dir, exist_ok=True)
	    save_file(base_tensors, out_path, metadata=metadata)
	    print("Done.")


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()