phazei committed on
Commit c1f87d4 · 1 Parent(s): 0873e19

Update conversion script and convert more triple-block weights to FP8

convert_safetensors_to_fp8.py CHANGED
@@ -1,192 +1,231 @@
- import torch
- import os
  import argparse
- from safetensors.torch import save_file
- from safetensors import safe_open
- from collections import OrderedDict
- from tqdm import tqdm
- import gc

- def should_convert_to_fp8(tensor_name: str) -> bool:
-     """
-     Conservative FP8 conversion policy:
-     - Only convert .weight tensors (not biases)
-     - Only convert transformer block layers
-     - Skip normalization layers (precision sensitive)
-     """
-     if not tensor_name.endswith(".weight"):
          return False
-     if not "blocks." in tensor_name:
          return False
-     if "cross_attn" in tensor_name or \
-        "ffn" in tensor_name or \
-        "self_attn" in tensor_name or \
-        "linear" in tensor_name:  # Added "linear" for broader coverage
-         if ".norm_k.weight" in tensor_name or \
-            ".norm_q.weight" in tensor_name or \
-            ".norm.weight" in tensor_name:
-             return False
-         return True
      return False

- def convert_safetensors_to_fp8(input_path: str, fp8_variant: str = "e4m3fn", device: str = "cuda"):
      """
-     Convert a single SafeTensors file to FP8, saving in the same directory
-     with fp8_e*** appended to the filename.
-
-     Args:
-         input_path: Path to input .safetensors file
-         fp8_variant: "e4m3fn" or "e5m2"
-         device: Device to use for conversion ("cuda" or "cpu")
      """
-     if not os.path.exists(input_path):
-         raise FileNotFoundError(f"Input file not found: {input_path}")
-
-     if not input_path.endswith('.safetensors'):
-         raise ValueError("Input file must be a .safetensors file")
-
-     # Determine target dtype
-     if fp8_variant == "e5m2":
-         target_dtype = torch.float8_e5m2
-     elif fp8_variant == "e4m3fn":
-         target_dtype = torch.float8_e4m3fn
-     else:
-         raise ValueError(f"Unsupported FP8 variant: {fp8_variant}. Use 'e4m3fn' or 'e5m2'")
-
-     # Generate output path
-     input_dir = os.path.dirname(input_path)
-     input_filename = os.path.basename(input_path)
-     name_without_ext = os.path.splitext(input_filename)[0]
-     output_filename = f"{name_without_ext}_fp8_{fp8_variant}.safetensors"
-     output_path = os.path.join(input_dir, output_filename)
-
-     print(f"Converting: {input_path}")
-     print(f"Output: {output_path}")
-     print(f"Target dtype: {target_dtype}")
-     print(f"Device: {device}")
-
-     # Check if output already exists
-     if os.path.exists(output_path):
-         response = input(f"Output file {output_path} already exists. Overwrite? (y/N): ")
-         if response.lower() != 'y':
-             print("Conversion cancelled.")
-             return
-
-     converted_state_dict = OrderedDict()
-     conversion_stats = {"converted": 0, "skipped": 0, "total": 0}
-
-     try:
-         # Load and process tensors
-         with safe_open(input_path, framework="pt", device="cpu") as f:
-             tensor_names = list(f.keys())
-             conversion_stats["total"] = len(tensor_names)
-
-             print(f"Processing {len(tensor_names)} tensors...")
-
-             for tensor_name in tqdm(tensor_names, desc="Converting tensors"):
-                 original_tensor = f.get_tensor(tensor_name)
-
-                 if should_convert_to_fp8(tensor_name):
-                     # Convert to FP8
-                     converted_tensor = original_tensor.to(device).to(target_dtype).to("cpu")
-                     converted_state_dict[tensor_name] = converted_tensor
-                     conversion_stats["converted"] += 1
-
-                     # Clean up GPU memory if using CUDA
-                     if device == "cuda" and torch.cuda.is_available():
-                         del converted_tensor
-                 else:
-                     # Keep original precision
-                     converted_state_dict[tensor_name] = original_tensor.to("cpu")
-                     conversion_stats["skipped"] += 1
-
-         # Save converted model
-         print(f"Saving converted model to: {output_path}")
-         save_file(converted_state_dict, output_path)
-
-         # Print conversion statistics
-         print(f"\nConversion complete!")
-         print(f"Total tensors: {conversion_stats['total']}")
-         print(f"Converted to FP8: {conversion_stats['converted']}")
-         print(f"Kept original precision: {conversion_stats['skipped']}")
-         print(f"Conversion rate: {conversion_stats['converted']/conversion_stats['total']*100:.1f}%")
-
-         # Calculate file sizes
-         input_size = os.path.getsize(input_path) / (1024**3)  # GB
-         output_size = os.path.getsize(output_path) / (1024**3)  # GB
-         size_reduction = (1 - output_size/input_size) * 100
-
-         print(f"\nFile size comparison:")
-         print(f"Original: {input_size:.2f} GB")
-         print(f"Converted: {output_size:.2f} GB")
-         print(f"Size reduction: {size_reduction:.1f}%")
-
-     except Exception as e:
-         print(f"Error during conversion: {e}")
-         if os.path.exists(output_path):
-             print(f"Removing incomplete output file: {output_path}")
-             os.remove(output_path)
-         raise
-
-     finally:
-         # Clean up memory
-         if 'converted_state_dict' in locals():
-             del converted_state_dict
-         if 'original_tensor' in locals():
-             del original_tensor
-         gc.collect()
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()

  def main():
-     parser = argparse.ArgumentParser(
-         description="Convert SafeTensors model to FP8 precision",
-         formatter_class=argparse.RawDescriptionHelpFormatter,
-         epilog="""
- Examples:
-   python convert_safetensors_to_fp8.py model.safetensors
-   python convert_safetensors_to_fp8.py model.safetensors --variant e5m2
-   python convert_safetensors_to_fp8.py model.safetensors --device cpu
-         """
-     )
-
-     parser.add_argument(
-         "input_file",
-         help="Path to input .safetensors file"
-     )
-
-     parser.add_argument(
-         "--variant",
-         choices=["e4m3fn", "e5m2"],
-         default="e4m3fn",
-         help="FP8 variant to use (default: e4m3fn)"
-     )
-
-     parser.add_argument(
-         "--device",
-         choices=["cuda", "cpu"],
-         default="cuda" if torch.cuda.is_available() else "cpu",
-         help="Device to use for conversion"
      )
-
-     args = parser.parse_args()
-
-     print(f"=== SafeTensors to FP8 Converter ===")
-     print(f"PyTorch version: {torch.__version__}")
-     print(f"CUDA available: {torch.cuda.is_available()}")
-     if torch.cuda.is_available():
-         print(f"CUDA device: {torch.cuda.get_device_name()}")
-     print()
-
-     try:
-         convert_safetensors_to_fp8(
-             input_path=args.input_file,
-             fp8_variant=args.variant,
-             device=args.device
-         )
-     except Exception as e:
-         print(f"Conversion failed: {e}")
-         exit(1)

  if __name__ == "__main__":
-     main()

+ #!/usr/bin/env python3
+ """
+ Mixed-FP8 safetensors converter for Hunyuan-Foley checkpoints.
+
+ - Converts selected .weight tensors to FP8 storage (E5M2 by default on pre-Hopper).
+ - Keeps math in FP16/BF16; this is a storage-only change in the file.
+ - Honors existing FP8 tensors in the input unless --recode-fp8 is set.
+ - Skips norms, biases, visual_proj.*, final_layer.* by design.
+ - Optional --aggressive converts modulation linears too.
+
+ USAGE (simple):
+   python convert_fp8.py in.safetensors [out.safetensors]   # out is optional
+
+ USAGE (flags):
+   python convert_fp8.py in.safetensors out.safetensors --fp8 auto --aggressive
+
+ Notes:
+ - “auto” picks FP8_E5M2 on SM < 90 (e.g., 3090), else FP8_E4M3FN.
+ - You can force a format: --fp8 e5m2 | e4m3fn
+ - Dry run: add --dry to print what would change without writing.
+ """
+
  import argparse
+ import re
+ from typing import Dict, Tuple
+ from pathlib import Path

+ import torch
+ from safetensors.torch import load_file, save_file
+
+
+ # --------------------------- Policy (names) ---------------------------
+
+ # Skip norms/bias and sensitive endpoints explicitly
+ _DENY_SUBSTRINGS = (
+     ".bias", ".norm", "q_norm.", "k_norm.",
+     "final_layer.", "visual_proj.",
+ )
+
+ # Allowed patterns target this architecture’s large linears
+ _ALLOW_PATTERNS = tuple(re.compile(p) for p in (
+     # Single-stream blocks
+     r"^single_blocks\.\d+\.linear1\.weight$",
+     r"^single_blocks\.\d+\.linear2\.w[123]\.weight$",          # w1/w2/w3
+     r"^single_blocks\.\d+\.linear_qkv\.weight$",
+     r"^single_blocks\.\d+\.modulation\.linear\.weight$",       # gated by --aggressive
+
+     # Triple-stream blocks: MLPs (dominant size)
+     r"^triple_blocks\.\d+\.audio_mlp\.fc[12]\.weight$",
+     r"^triple_blocks\.\d+\.v_cond_mlp\.fc[12]\.weight$",
+
+     # Triple-stream blocks: attention projections
+     r"^triple_blocks\.\d+\.(audio_self_attn_qkv|v_cond_attn_qkv|text_cross_kv)\.weight$",
+     r"^triple_blocks\.\d+\.(audio_self_proj|v_cond_self_proj)\.weight$",
+
+     # r"^triple_blocks\.\d+\.(audio_cross_q|v_cond_cross_q)\.weight$",
+     # r"^triple_blocks\.\d+\.(audio_cross_proj|v_cond_cross_proj)\.weight$",
+
+     # Triple-stream blocks: modulation linears (gated)
+     r"^triple_blocks\.\d+\.(audio_mod|v_cond_mod)\.linear\.weight$",
+ ))
+
+
+ # --------------------------- Helpers ---------------------------
+
+ def default_out_path(in_path: str, tgt_dtype: torch.dtype) -> str:
+     """<in>_fp8_<e5m2|e4m3fn>.safetensors (idempotent if already suffixed)."""
+     suffix = "e5m2" if tgt_dtype == torch.float8_e5m2 else "e4m3fn"
+     p = Path(in_path)
+     stem = re.sub(r"_fp8_e(5m2|4m3fn)$", "", p.stem)  # strip prior suffix
+     ext = p.suffix or ".safetensors"
+     return str(p.with_name(f"{stem}_fp8_{suffix}{ext}"))
+
+
+ def pick_fp8_dtype(fp8_mode: str) -> torch.dtype:
+     """Pick target FP8 dtype."""
+     m = fp8_mode.lower()
+     if m == "e5m2":
+         return torch.float8_e5m2
+     if m == "e4m3fn":
+         return torch.float8_e4m3fn
+     # auto
+     try:
+         major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
+     except Exception:
+         major = 0
+     return torch.float8_e5m2 if major < 9 else torch.float8_e4m3fn
+
+
+ def bytes_of(t: torch.Tensor) -> int:
+     """Size in bytes (FP8=1 byte/elt)."""
+     if t.dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
+         return t.numel() * 1
+     return t.numel() * t.element_size()
+
+
+ def human_gb(nbytes: int) -> float:
+     return nbytes / (1024 ** 3)
+
+
+ def _is_denied(name: str) -> bool:
+     return any(tok in name for tok in _DENY_SUBSTRINGS)
+
+
+ def should_convert_to_fp8(name: str, aggressive: bool) -> bool:
+     """Match names for conversion, with modulation linears gated by --aggressive."""
+     if not name.endswith(".weight"):
          return False
+     if _is_denied(name):
          return False
+
+     for pat in _ALLOW_PATTERNS:
+         if pat.search(name):
+             # Gate modulation linears (single/triple) behind --aggressive
+             if (
+                 ".modulation.linear.weight" in name
+                 or ".audio_mod.linear.weight" in name
+                 or ".v_cond_mod.linear.weight" in name
+             ):
+                 return aggressive
+             return True
      return False

+
+ # --------------------------- Core ---------------------------
+
+ def convert_state_dict(
+     sd: Dict[str, torch.Tensor],
+     fp8_mode: str = "auto",
+     aggressive: bool = False,
+     recode_fp8: bool = False,
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, int]]:
      """
+     Convert selected weights to FP8 storage according to the policy.
+     Honors existing FP8 unless recode_fp8=True.
+     Returns (new_sd, stats) with byte counts.
      """
+     tgt_dtype = pick_fp8_dtype(fp8_mode)
+     out: Dict[str, torch.Tensor] = {}
+     stats = {
+         "total_before": 0,
+         "total_after": 0,
+         "converted_count": 0,
+         "kept_fp8_count": 0,
+         "skipped_count": 0,
+     }
+
+     for name, tensor in sd.items():
+         before = bytes_of(tensor)
+         stats["total_before"] += before
+
+         # Respect existing FP8 unless asked to recode
+         if tensor.dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
+             if recode_fp8:
+                 out[name] = tensor.to(dtype=tgt_dtype)
+                 stats["converted_count"] += 1
+             else:
+                 out[name] = tensor
+                 stats["kept_fp8_count"] += 1
+             stats["total_after"] += bytes_of(out[name])
+             continue
+
+         # Decide conversion
+         if should_convert_to_fp8(name, aggressive):
+             out[name] = tensor.to(dtype=tgt_dtype)
+             stats["converted_count"] += 1
+         else:
+             out[name] = tensor
+             stats["skipped_count"] += 1
+
+         stats["total_after"] += bytes_of(out[name])
+
+     return out, stats
+
+
+ # --------------------------- CLI ---------------------------
+
+ def parse_args() -> argparse.Namespace:
+     p = argparse.ArgumentParser(description="Convert selected weights in a safetensors file to FP8 storage.")
+     p.add_argument("in_path", help="Input .safetensors")
+     p.add_argument("out_path", nargs="?", help="Output .safetensors (optional)")
+     p.add_argument("--fp8", choices=["auto", "e5m2", "e4m3fn"], default="auto",
+                    help='Target FP8 storage dtype: "auto" (default), "e5m2", or "e4m3fn"')
+     p.add_argument("--aggressive", action="store_true",
+                    help="Also convert modulation linears (audio_mod/v_cond_mod + single modulation.linear).")
+     p.add_argument("--recode-fp8", action="store_true",
+                    help="Re-encode existing FP8 tensors to the chosen target dtype.")
+     p.add_argument("--dry", action="store_true",
+                    help="Dry run: report only; do not write output file.")
+     return p.parse_args()
+

  def main():
+     args = parse_args()
+
+     print(f"[load] {args.in_path}")
+     sd = load_file(args.in_path)
+
+     tgt = pick_fp8_dtype(args.fp8)
+     if not args.out_path:
+         args.out_path = default_out_path(args.in_path, tgt)
+         print(f"[auto-out] {args.out_path}")
+
+     print(f"[policy] fp8_mode={args.fp8} -> {str(tgt).replace('torch.','')}, "
+           f"aggressive={args.aggressive}, recode_fp8={args.recode_fp8}")
+
+     new_sd, stats = convert_state_dict(
+         sd,
+         fp8_mode=args.fp8,
+         aggressive=args.aggressive,
+         recode_fp8=args.recode_fp8,
      )
+
+     saved = stats["total_before"] - stats["total_after"]
+     print(f"[stats] tensors: {len(sd)}")
+     print(f"[stats] converted: {stats['converted_count']} | kept_fp8: {stats['kept_fp8_count']} "
+           f"| skipped: {stats['skipped_count']}")
+     print(f"[bytes] before={human_gb(stats['total_before']):.3f} GiB | "
+           f"after={human_gb(stats['total_after']):.3f} GiB | saved={human_gb(saved):.3f} GiB")
+
+     if args.dry:
+         print("[dry] no file written")
+         return
+
+     print(f"[save] {args.out_path}")
+     save_file(new_sd, args.out_path)
+     print("[done]")
+

  if __name__ == "__main__":
+     main()
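
A quick way to confirm which tensors the new policy actually stored as FP8 (a minimal sketch, not part of this commit; it assumes a PyTorch build with float8 dtypes and the safetensors package, and the file name below is only an example of the default output naming):

import torch
from safetensors import safe_open

fp8_dtypes = (torch.float8_e5m2, torch.float8_e4m3fn)
path = "hunyuanvideo_foley_fp8_e5m2.safetensors"  # example output name

with safe_open(path, framework="pt", device="cpu") as f:
    names = list(f.keys())
    n_fp8 = sum(1 for n in names if f.get_tensor(n).dtype in fp8_dtypes)

print(f"{n_fp8} of {len(names)} tensors are stored in FP8")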
fp8info.txt ADDED
The diff for this file is too large to render. See raw diff
 
hunyuanvideo_foley_fp8_e4m3fn.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce2af6afbe910197e48020261d5ce2af30267b3d15e2f8f385570cc4049e3934
- size 6318689840

  version https://git-lfs.github.com/spec/v1
+ oid sha256:1b2fa56b9d9bd0c89f3d7e486f9f00032f247cf10dd860cc6d7f0b734bca8a31
+ size 5341941120
hunyuanvideo_foley_fp8_e5m2.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:43983270051d1b0bb3078f97040fd7bf84ffbee607d79ea1559c21a64741e5fc
- size 6318689840

  version https://git-lfs.github.com/spec/v1
+ oid sha256:3fa9d76e614aff32cf089aa8cf249b18547670c4be656e73a51804caec0f7963
+ size 5341941120
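
Per the LFS pointers above, each converted checkpoint shrinks from 6,318,689,840 bytes (about 5.88 GiB) to 5,341,941,120 bytes (about 4.98 GiB), a reduction of roughly 15.5%.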