Kernels

EricB (HF Staff) committed
Commit 364f72d · 0 Parent(s)

Add metal flash sdpa
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
+ ---
+ license: apache-2.0
+ tags:
+ - kernel
+ ---
+
+ # Metal Flash Attention
+
+ A PyTorch extension that provides optimized Metal implementations of Flash Attention kernels.
+
+ ## Supported Features
+
+ - Variable-length sequences without padding
+ - Causal masking
+ - Grouped Query Attention (GQA) and Multi-Query Attention (MQA)
+ - Softcapping support for attention score regularization
+ - Data types: `float32`, `float16`, `bfloat16`
+ - Head dimensions: `32`, `64`, `72`, `80`, `96`, `128`, `256`
+
+ ## API Reference
+
+ ### flash_attention_varlen
+
+ ```python
+ sdpa_flash.flash_attention_varlen(
+     out: torch.Tensor,
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     cu_seqlens_q: torch.Tensor,
+     cu_seqlens_k: torch.Tensor,
+     max_seqlen_q: int,
+     max_seqlen_k: int,
+     do_causal: bool,
+     scale: float,
+     softcapping: float
+ ) -> None
+ ```
+
+ - **out**: Output tensor `[total_q_tokens, num_heads, head_dim]`, modified in-place.
+ - **query/key/value**: Input tensors `[total_tokens, num_heads(_kv), head_dim]`.
+ - **cu_seqlens_q/cu_seqlens_k**: Cumulative sequence lengths (`torch.int32`), `[batch_size + 1]`.
+ - **max_seqlen_q/max_seqlen_k**: Maximum query/key sequence length in the batch.
+ - **do_causal**: Enable causal masking.
+ - **scale**: Attention score scaling factor (e.g., `1/sqrt(head_dim)`).
+ - **softcapping**: Softcapping value for score regularization (use `1.0` for no softcapping).
+
+ ### flash_attn_varlen_func
+
+ Compatibility wrapper matching the original Flash Attention API:
+
+ ```python
+ out = sdpa_flash.flash_attn_varlen_func(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     cu_seqlens_q: torch.Tensor,
+     cu_seqlens_k: torch.Tensor,
+     max_seqlen_q: int,
+     max_seqlen_k: int,
+     dropout_p: float = 0.0,
+     softmax_scale: Optional[float] = None,
+     causal: bool = False,
+     window_size: Tuple[int, int] = (-1, -1),
+     alibi_slopes: Optional[torch.Tensor] = None,
+     deterministic: bool = False,
+     return_attn_probs: bool = False
+ )
+ ```
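
For orientation, here is a minimal usage sketch (not part of the committed README), assembled from the signature above and the call pattern in `benchmark_flash_sdpa.py` below. It assumes the extension is importable as `sdpa_flash`, that an MPS device is available, and that the extra `mask=None` keyword used by the benchmark is accepted:

```python
import torch
import sdpa_flash

# Two packed sequences of lengths 3 and 5 (no padding), 8 heads, head_dim 64.
seq_lens = [3, 5]
total_tokens = sum(seq_lens)
cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="mps")

q = torch.randn(total_tokens, 8, 64, dtype=torch.float16, device="mps")
k = torch.randn(total_tokens, 8, 64, dtype=torch.float16, device="mps")
v = torch.randn(total_tokens, 8, 64, dtype=torch.float16, device="mps")
out = torch.empty_like(q)

sdpa_flash.flash_attention_varlen(
    out=out,
    query=q,
    key=k,
    value=v,
    cu_seqlens_q=cu_seqlens,
    cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max(seq_lens),
    max_seqlen_k=max(seq_lens),
    mask=None,              # passed as None by the benchmark script in this commit
    do_causal=True,         # causal masking within each sequence
    scale=1.0 / 64 ** 0.5,
    softcapping=1.0,        # 1.0 disables softcapping
)
# `out` now holds the attention output in the same packed [tokens, heads, head_dim] layout.
```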
benchmark_flash_sdpa.py ADDED
@@ -0,0 +1,301 @@
+ #!/usr/bin/env python3
+ """Benchmark script for metal-sdpa-flash (Flash SDPA)"""
+
+ import torch
+ import time
+ import sdpa_flash
+ from typing import List, Tuple
+ import numpy as np
+
+
+ def create_cu_seqlens(seq_lengths: List[int]) -> torch.Tensor:
+     """Create cumulative sequence lengths tensor."""
+     cu_seqlens = [0]
+     for length in seq_lengths:
+         cu_seqlens.append(cu_seqlens[-1] + length)
+     return torch.tensor(cu_seqlens, dtype=torch.int32, device="mps")
+
+
+ def warmup(func, *args, num_warmup=10):
+     """Warmup the GPU by running the function multiple times"""
+     for _ in range(num_warmup):
+         func(*args)
+     torch.mps.synchronize()
+
+
+ def benchmark_flash_sdpa(
+     batch_size: int,
+     num_heads: int,
+     seq_len: int,
+     head_dim: int,
+     dtype: torch.dtype,
+     causal: bool = False,
+     num_iterations: int = 100,
+ ) -> float:
+     """Benchmark Flash SDPA with given parameters"""
+
+     # Create sequence lengths (all equal for fair comparison)
+     seq_lengths = [seq_len] * batch_size
+     cu_seqlens = create_cu_seqlens(seq_lengths)
+     total_tokens = sum(seq_lengths)
+
+     # Create input tensors in Flash format (total_tokens, num_heads, head_dim)
+     query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
+     key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
+     value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
+     out = torch.empty_like(query)
+
+     scale = 1.0 / (head_dim ** 0.5)
+
+     # Define the function to benchmark
+     def run_flash_sdpa():
+         sdpa_flash.flash_attention_varlen(
+             out=out,
+             query=query,
+             key=key,
+             value=value,
+             cu_seqlens_q=cu_seqlens,
+             cu_seqlens_k=cu_seqlens,
+             max_seqlen_q=seq_len,
+             max_seqlen_k=seq_len,
+             mask=None,
+             do_causal=causal,
+             scale=scale,
+             softcapping=1.0,
+         )
+
+     # Warmup
+     warmup(run_flash_sdpa, num_warmup=10)
+
+     # Benchmark
+     torch.mps.synchronize()
+     start_time = time.perf_counter()
+
+     for _ in range(num_iterations):
+         run_flash_sdpa()
+
+     torch.mps.synchronize()
+     end_time = time.perf_counter()
+
+     avg_time_ms = (end_time - start_time) * 1000 / num_iterations
+     return avg_time_ms
+
+
+ def benchmark_flash_gqa(
+     batch_size: int,
+     num_heads_q: int,
+     num_heads_kv: int,
+     seq_len: int,
+     head_dim: int,
+     dtype: torch.dtype,
+     causal: bool = False,
+     num_iterations: int = 100,
+ ) -> float:
+     """Benchmark Flash Attention with Grouped Query Attention"""
+
+     # Create sequence lengths
+     seq_lengths = [seq_len] * batch_size
+     cu_seqlens = create_cu_seqlens(seq_lengths)
+     total_tokens = sum(seq_lengths)
+
+     # Create input tensors with different head counts
+     query = torch.randn(total_tokens, num_heads_q, head_dim, dtype=dtype, device="mps")
+     key = torch.randn(total_tokens, num_heads_kv, head_dim, dtype=dtype, device="mps")
+     value = torch.randn(total_tokens, num_heads_kv, head_dim, dtype=dtype, device="mps")
+     out = torch.empty_like(query)
+
+     scale = 1.0 / (head_dim ** 0.5)
+
+     # Define the function to benchmark
+     def run_flash_gqa():
+         sdpa_flash.flash_attention_varlen(
+             out=out,
+             query=query,
+             key=key,
+             value=value,
+             cu_seqlens_q=cu_seqlens,
+             cu_seqlens_k=cu_seqlens,
+             max_seqlen_q=seq_len,
+             max_seqlen_k=seq_len,
+             mask=None,
+             do_causal=causal,
+             scale=scale,
+             softcapping=1.0,
+         )
+
+     # Warmup
+     warmup(run_flash_gqa, num_warmup=10)
+
+     # Benchmark
+     torch.mps.synchronize()
+     start_time = time.perf_counter()
+
+     for _ in range(num_iterations):
+         run_flash_gqa()
+
+     torch.mps.synchronize()
+     end_time = time.perf_counter()
+
+     avg_time_ms = (end_time - start_time) * 1000 / num_iterations
+     return avg_time_ms
+
+
+ def benchmark_variable_length(
+     seq_lengths: List[int],
+     num_heads: int,
+     head_dim: int,
+     dtype: torch.dtype,
+     causal: bool = False,
+     num_iterations: int = 100,
+ ) -> float:
+     """Benchmark Flash Attention with variable sequence lengths"""
+
+     cu_seqlens = create_cu_seqlens(seq_lengths)
+     total_tokens = sum(seq_lengths)
+     max_seqlen = max(seq_lengths)
+
+     # Create input tensors
+     query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
+     key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
+     value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
+     out = torch.empty_like(query)
+
+     scale = 1.0 / (head_dim ** 0.5)
+
+     # Define the function to benchmark
+     def run_varlen():
+         sdpa_flash.flash_attention_varlen(
+             out=out,
+             query=query,
+             key=key,
+             value=value,
+             cu_seqlens_q=cu_seqlens,
+             cu_seqlens_k=cu_seqlens,
+             max_seqlen_q=max_seqlen,
+             max_seqlen_k=max_seqlen,
+             mask=None,
+             do_causal=causal,
+             scale=scale,
+             softcapping=1.0,
+         )
+
+     # Warmup
+     warmup(run_varlen, num_warmup=10)
+
+     # Benchmark
+     torch.mps.synchronize()
+     start_time = time.perf_counter()
+
+     for _ in range(num_iterations):
+         run_varlen()
+
+     torch.mps.synchronize()
+     end_time = time.perf_counter()
+
+     avg_time_ms = (end_time - start_time) * 1000 / num_iterations
+     return avg_time_ms
+
+
+ def main():
+     print("=" * 80)
+     print("Metal Flash SDPA Benchmark")
+     print("=" * 80)
+
+     # Test configurations (matching the plain SDPA benchmark)
+     configs = [
+         # (batch_size, num_heads, seq_len, head_dim, dtype, causal, name)
+         (1, 32, 512, 64, torch.float32, False, "Small seq, float32"),
+         (1, 32, 512, 64, torch.float16, False, "Small seq, float16"),
+         (1, 32, 512, 64, torch.bfloat16, False, "Small seq, bfloat16"),
+
+         (4, 32, 2048, 64, torch.float16, False, "Medium seq, float16"),
+         (4, 32, 2048, 64, torch.float16, True, "Medium seq, float16, causal"),
+
+         (2, 32, 4096, 64, torch.float16, False, "Large seq, float16"),
+         (2, 32, 4096, 64, torch.float16, True, "Large seq, float16, causal"),
+
+         # Different head dimensions
+         (2, 32, 2048, 32, torch.float16, False, "head_dim=32"),
+         (2, 32, 2048, 64, torch.float16, False, "head_dim=64"),
+         (2, 32, 2048, 128, torch.float16, False, "head_dim=128"),
+
+         # Vector kernel cases (q_seq=1) - Flash doesn't have a special vector kernel
+         # but we benchmark these cases for fair comparison with plain SDPA
+         (16, 32, 1, 64, torch.float16, False, "Vector kernel (q_seq=1)"),
+         (16, 32, 1, 128, torch.float16, False, "Vector kernel (q_seq=1, head_dim=128)"),
+     ]
+
+     print("\nFlash Attention Benchmarks:")
+     print("-" * 80)
+     print(f"{'Config':<40} {'Time (ms)':<15} {'TFLOPS':<15}")
+     print("-" * 80)
+
+     for batch_size, num_heads, seq_len, head_dim, dtype, causal, name in configs:
+         time_ms = benchmark_flash_sdpa(
+             batch_size, num_heads, seq_len, head_dim, dtype, causal
+         )
+
+         # Calculate FLOPS (approximate)
+         # Attention: 2 * batch * heads * seq_len^2 * head_dim
+         flops = 2 * batch_size * num_heads * seq_len * seq_len * head_dim
+         tflops = (flops / 1e12) / (time_ms / 1000)
+
+         print(f"{name:<40} {time_ms:<15.3f} {tflops:<15.2f}")
+
+     # GQA benchmarks
+     print("\n\nGrouped Query Attention (GQA) Benchmarks:")
+     print("-" * 80)
+     print(f"{'Config':<40} {'Time (ms)':<15} {'TFLOPS':<15}")
+     print("-" * 80)
+
+     gqa_configs = [
+         # (batch_size, num_heads_q, num_heads_kv, seq_len, head_dim, dtype, causal, name)
+         (2, 32, 8, 2048, 64, torch.float16, False, "GQA 4:1 ratio"),
+         (2, 32, 4, 2048, 64, torch.float16, False, "GQA 8:1 ratio"),
+         (2, 32, 1, 2048, 64, torch.float16, False, "MQA (32:1 ratio)"),
+         (2, 32, 8, 2048, 128, torch.float16, False, "GQA 4:1, head_dim=128"),
+     ]
+
+     for batch_size, num_heads_q, num_heads_kv, seq_len, head_dim, dtype, causal, name in gqa_configs:
+         time_ms = benchmark_flash_gqa(
+             batch_size, num_heads_q, num_heads_kv, seq_len, head_dim, dtype, causal
+         )
+
+         # Calculate FLOPS for GQA
+         flops = 2 * batch_size * num_heads_q * seq_len * seq_len * head_dim
+         tflops = (flops / 1e12) / (time_ms / 1000)
+
+         print(f"{name:<40} {time_ms:<15.3f} {tflops:<15.2f}")
+
+     # Variable length sequences (unique to Flash Attention)
+     print("\n\nVariable Length Sequence Benchmarks:")
+     print("-" * 80)
+     print(f"{'Config':<40} {'Time (ms)':<15} {'TFLOPS':<15}")
+     print("-" * 80)
+
+     varlen_configs = [
+         # (seq_lengths, num_heads, head_dim, dtype, causal, name)
+         ([512, 1024, 2048, 4096], 32, 64, torch.float16, False, "Variable [512-4096]"),
+         ([128, 256, 512, 1024, 2048], 32, 64, torch.float16, False, "Variable [128-2048]"),
+         ([2048, 2048, 2048, 2048], 32, 64, torch.float16, False, "Fixed 4x2048 (baseline)"),
+     ]
+
+     for seq_lengths, num_heads, head_dim, dtype, causal, name in varlen_configs:
+         time_ms = benchmark_variable_length(
+             seq_lengths, num_heads, head_dim, dtype, causal
+         )
+
+         # Calculate FLOPS for variable length
+         total_flops = 0
+         for seq_len in seq_lengths:
+             total_flops += 2 * num_heads * seq_len * seq_len * head_dim
+         tflops = (total_flops / 1e12) / (time_ms / 1000)
+
+         print(f"{name:<40} {time_ms:<15.3f} {tflops:<15.2f}")
+
+     print("\n" + "=" * 80)
+     print("Benchmark completed!")
+
+
+ if __name__ == "__main__":
+     main()
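
Beyond timing, a quick correctness spot-check against PyTorch's reference SDPA is often useful. The sketch below is not part of this commit; it assumes the same `sdpa_flash` module and uses a single fixed-length sequence so the packed `[tokens, heads, head_dim]` layout maps directly onto the batched layout torch expects:

```python
import torch
import torch.nn.functional as F
import sdpa_flash

seq, heads, dim = 128, 8, 64
q = torch.randn(seq, heads, dim, dtype=torch.float16, device="mps")
k = torch.randn_like(q)
v = torch.randn_like(q)
out = torch.empty_like(q)
cu = torch.tensor([0, seq], dtype=torch.int32, device="mps")

sdpa_flash.flash_attention_varlen(
    out=out, query=q, key=k, value=v,
    cu_seqlens_q=cu, cu_seqlens_k=cu,
    max_seqlen_q=seq, max_seqlen_k=seq,
    mask=None, do_causal=False,
    scale=1.0 / dim ** 0.5, softcapping=1.0,
)

# Reference path: torch SDPA expects [batch, heads, seq, head_dim].
ref = F.scaled_dot_product_attention(
    q.transpose(0, 1)[None], k.transpose(0, 1)[None], v.transpose(0, 1)[None]
)[0].transpose(0, 1)

print("max abs diff:", (out - ref).abs().max().item())  # expect fp16-level error
```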
build.toml ADDED
@@ -0,0 +1,18 @@
+ [general]
+ name = "sdpa_flash"
+ universal = false
+
+ [torch]
+ src = [
+   "torch-ext/torch_binding.cpp",
+   "torch-ext/torch_binding.h",
+ ]
+
+ [kernel.sdpa_metal]
+ backend = "metal"
+ src = [
+   "sdpa-metal/scaled_dot_product_attention.mm",
+   "sdpa-metal/scaled_dot_product_attention.metal",
+   "sdpa-metal/common.h",
+ ]
+ depends = [ "torch" ]
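
`build.toml` is the kernel-builder manifest: the `[torch]` table lists the C++ binding sources and `[kernel.sdpa_metal]` lists the Metal sources compiled for the `metal` backend. If the built kernel is published to the Hugging Face Hub, it can typically be fetched at runtime with the `kernels` package; a hedged sketch with a hypothetical repository id:

```python
from kernels import get_kernel

# Hypothetical repository id; substitute the actual Hub repo for this kernel.
sdpa_flash = get_kernel("<org-or-user>/metal-flash-sdpa")
# sdpa_flash.flash_attention_varlen(...) is then available as described in the README above.
```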
flake.nix ADDED
@@ -0,0 +1,17 @@
+ {
+   description = "Flake for SDPA kernel";
+
+   inputs = {
+     kernel-builder.url = "path:../..";
+   };
+
+   outputs =
+     {
+       self,
+       kernel-builder,
+     }:
+     kernel-builder.lib.genFlakeOutputs {
+       path = ./.;
+       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+     };
+ }
sdpa-metal/common.h ADDED
@@ -0,0 +1,7 @@
+ #ifndef SDPA_METAL_COMMON_H
+ #define SDPA_METAL_COMMON_H
+
+ // Common definitions for Metal kernels
+ // This file is included by Metal shaders, so it should not contain C++ code
+
+ #endif // SDPA_METAL_COMMON_H
sdpa-metal/scaled_dot_product_attention.metal ADDED
@@ -0,0 +1,2070 @@
1
+ // Updated from MLX commit hash f70764a
2
+
3
+ #include <metal_stdlib>
4
+ #include <metal_simdgroup>
5
+
6
+ using namespace metal;
7
+
8
+ #define STEEL_CONST static constant constexpr const
9
+ #define STEEL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")
10
+
11
+ #if defined(__HAVE_BFLOAT__)
12
+
13
+ typedef bfloat bfloat16_t;
14
+ typedef half float16_t;
15
+
16
+ #else
17
+
18
+ typedef half float16_t;
19
+
20
+ /////////////////////////////////////////////////////////////////////////////
21
+ // Helpers
22
+ /////////////////////////////////////////////////////////////////////////////
23
+
24
+ constexpr METAL_FUNC uint16_t float_to_bfloat_bits(float x) {
25
+ // Check for nan
26
+ if ((as_type<uint32_t>(x) & ~_fp_encoding_traits<float>::sign_mask) >
27
+ _fp_encoding_traits<float>::inf_mask) {
28
+ return uint16_t(as_type<uint32_t>(0x7FC0));
29
+ }
30
+ // Take bits
31
+ uint32_t float_bits = as_type<uint32_t>(x);
32
+
33
+ // Round to nearest even
34
+ float_bits += ((float_bits >> 16) & 1) + as_type<uint32_t>(0x7FFF);
35
+
36
+ // Take upper 16 bits
37
+ return float_bits >> 16;
38
+ }
39
+
40
+ constexpr METAL_FUNC float bfloat_bits_to_float(uint16_t x) {
41
+ // Upper 16 bits are the data and lower 16 bits are 0s
42
+ return as_type<float>((uint32_t)x << 16);
43
+ }
44
+
45
+ struct _MLX_BFloat16;
46
+
47
+ template <typename T>
48
+ static constexpr constant bool can_convert_to_bfloat =
49
+ !is_same_v<T, _MLX_BFloat16> && is_convertible_v<T, float>;
50
+
51
+ template <typename T>
52
+ static constexpr constant bool can_convert_from_bfloat =
53
+ !is_same_v<T, _MLX_BFloat16> && is_convertible_v<float, T>;
54
+
55
+ /////////////////////////////////////////////////////////////////////////////
56
+ // Bfloat struct
57
+ /////////////////////////////////////////////////////////////////////////////
58
+
59
+ struct _MLX_BFloat16 {
60
+ /////////////////////////////////////////////////////////////////////////////
61
+ // Constructors
62
+ uint16_t bits_;
63
+ _MLX_BFloat16() thread = default;
64
+ _MLX_BFloat16() threadgroup = default;
65
+ _MLX_BFloat16() device = default;
66
+ _MLX_BFloat16() constant = default;
67
+
68
+ struct bits_to_bfloat_struct {};
69
+ static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat() {
70
+ return bits_to_bfloat_struct();
71
+ }
72
+ constexpr METAL_FUNC _MLX_BFloat16(uint16_t bits, bits_to_bfloat_struct)
73
+ : bits_(bits) {}
74
+
75
+ /////////////////////////////////////////////////////////////////////////////
76
+ // Conversions to bfloat
77
+
78
+ template <
79
+ typename T,
80
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
81
+ constexpr METAL_FUNC _MLX_BFloat16(T x) thread
82
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
83
+
84
+ template <
85
+ typename T,
86
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
87
+ constexpr METAL_FUNC _MLX_BFloat16(T x) threadgroup
88
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
89
+
90
+ template <
91
+ typename T,
92
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
93
+ constexpr METAL_FUNC _MLX_BFloat16(T x) device
94
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
95
+
96
+ template <
97
+ typename T,
98
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
99
+ constexpr METAL_FUNC _MLX_BFloat16(T x) constant
100
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
101
+
102
+ /////////////////////////////////////////////////////////////////////////////
103
+ // Conversions from bfloat
104
+
105
+ template <
106
+ typename T,
107
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
108
+ constexpr METAL_FUNC operator T() const thread {
109
+ return static_cast<T>(bfloat_bits_to_float(bits_));
110
+ }
111
+
112
+ template <
113
+ typename T,
114
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
115
+ constexpr METAL_FUNC operator T() const threadgroup {
116
+ return static_cast<T>(bfloat_bits_to_float(bits_));
117
+ }
118
+
119
+ template <
120
+ typename T,
121
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
122
+ constexpr METAL_FUNC operator T() const device {
123
+ return static_cast<T>(bfloat_bits_to_float(bits_));
124
+ }
125
+
126
+ template <
127
+ typename T,
128
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
129
+ constexpr METAL_FUNC operator T() const constant {
130
+ return static_cast<T>(bfloat_bits_to_float(bits_));
131
+ }
132
+ };
133
+
134
+ /////////////////////////////////////////////////////////////////////////////
135
+ // Bfloat operators
136
+ /////////////////////////////////////////////////////////////////////////////
137
+
138
+ /////////////////////////////////////////////////////////////////////////////
139
+ // Unary ops
140
+ constexpr METAL_FUNC _MLX_BFloat16 operator-(_MLX_BFloat16 x) {
141
+ return -static_cast<float>(x);
142
+ }
143
+
144
+ /////////////////////////////////////////////////////////////////////////////
145
+ // Binary operators
146
+ #define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
147
+ constexpr METAL_FUNC otype __operator__(atype lhs, btype rhs) { \
148
+ return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs); \
149
+ }
150
+
151
+ #define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype) \
152
+ constexpr METAL_FUNC otype __operator__(_MLX_BFloat16 lhs, itype rhs) { \
153
+ return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs); \
154
+ } \
155
+ constexpr METAL_FUNC otype __operator__(itype lhs, _MLX_BFloat16 rhs) { \
156
+ return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs); \
157
+ }
158
+
159
+ /////////////////////////////////////////////////////////////////////////////
160
+ // Arithmetic Operators
161
+ #define bfloat_binop(_op_, _operator_) \
162
+ bfloat_binop_base( \
163
+ _op_, _operator_, _MLX_BFloat16, _MLX_BFloat16, _MLX_BFloat16, float); \
164
+ bfloat_binop_helper(_op_, _operator_, float, float, float); \
165
+ bfloat_binop_helper(_op_, _operator_, float, half, float); \
166
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float); \
167
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float); \
168
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float); \
169
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);
170
+
171
+ bfloat_binop(+, operator+);
172
+ bfloat_binop(-, operator-);
173
+ bfloat_binop(*, operator*);
174
+ bfloat_binop(/, operator/);
175
+
176
+ /////////////////////////////////////////////////////////////////////////////
177
+ // Comparison ops
178
+ #define bfloat_compop(__op__, __operator__) \
179
+ bfloat_binop_base( \
180
+ __op__, __operator__, bool, _MLX_BFloat16, _MLX_BFloat16, float); \
181
+ bfloat_binop_helper(__op__, __operator__, bool, float, float); \
182
+ bfloat_binop_helper(__op__, __operator__, bool, half, float); \
183
+ bfloat_binop_helper(__op__, __operator__, bool, int32_t, float); \
184
+ bfloat_binop_helper(__op__, __operator__, bool, uint32_t, float); \
185
+ bfloat_binop_helper(__op__, __operator__, bool, int64_t, float); \
186
+ bfloat_binop_helper(__op__, __operator__, bool, uint64_t, float);
187
+
188
+ bfloat_compop(>, operator>);
189
+ bfloat_compop(<, operator<);
190
+ bfloat_compop(>=, operator>=);
191
+ bfloat_compop(<=, operator<=);
192
+ bfloat_compop(==, operator==);
193
+ bfloat_compop(!=, operator!=);
194
+
195
+ #undef bfloat_compop
196
+ #undef bfloat_binop_base
197
+ #undef bfloat_binop_helper
198
+ #undef bfloat_binop
199
+
200
+ /////////////////////////////////////////////////////////////////////////////
201
+ // Inplace Operators
202
+ #define bfloat_inplace_op_helper(__op__, __operator__, itype, addr_space) \
203
+ constexpr METAL_FUNC addr_space _MLX_BFloat16& __operator__( \
204
+ addr_space _MLX_BFloat16& lhs, itype rhs) { \
205
+ lhs = static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
206
+ return lhs; \
207
+ } \
208
+ constexpr METAL_FUNC addr_space itype& __operator__( \
209
+ addr_space itype& lhs, _MLX_BFloat16 rhs) { \
210
+ lhs = static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
211
+ return lhs; \
212
+ }
213
+
214
+ #define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype) \
215
+ bfloat_inplace_op_helper(__op__, __operator__, itype, device); \
216
+ bfloat_inplace_op_helper(__op__, __operator__, itype, thread); \
217
+ bfloat_inplace_op_helper(__op__, __operator__, itype, threadgroup);
218
+
219
+ #define bfloat_inplace_op(itype) \
220
+ bfloat_inplace_op_addr_space_helper(+, operator+=, itype); \
221
+ bfloat_inplace_op_addr_space_helper(-, operator-=, itype); \
222
+ bfloat_inplace_op_addr_space_helper(*, operator*=, itype); \
223
+ bfloat_inplace_op_addr_space_helper(/, operator/=, itype);
224
+
225
+ bfloat_inplace_op(float);
226
+ bfloat_inplace_op(half);
227
+ bfloat_inplace_op(int16_t);
228
+ bfloat_inplace_op(int32_t);
229
+ bfloat_inplace_op(int64_t);
230
+ bfloat_inplace_op(uint16_t);
231
+ bfloat_inplace_op(uint32_t);
232
+ bfloat_inplace_op(uint64_t);
233
+
234
+ #undef bfloat_inplace_op_helper
235
+ #undef bfloat_inplace_op_addr_space_helper
236
+ #undef bfloat_inplace_op
237
+
238
+ #define bfloat_inplace_op_helper(__op__, __operator__, addr_space) \
239
+ constexpr METAL_FUNC addr_space _MLX_BFloat16& __operator__( \
240
+ addr_space _MLX_BFloat16& lhs, _MLX_BFloat16 rhs) { \
241
+ lhs = static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
242
+ return lhs; \
243
+ }
244
+
245
+ #define bfloat_inplace_op_addr_space_helper(__op__, __operator__) \
246
+ bfloat_inplace_op_helper(__op__, __operator__, device); \
247
+ bfloat_inplace_op_helper(__op__, __operator__, thread); \
248
+ bfloat_inplace_op_helper(__op__, __operator__, threadgroup);
249
+
250
+ bfloat_inplace_op_addr_space_helper(+, operator+=);
251
+ bfloat_inplace_op_addr_space_helper(-, operator-=);
252
+ bfloat_inplace_op_addr_space_helper(*, operator*=);
253
+ bfloat_inplace_op_addr_space_helper(/, operator/=);
254
+
255
+ #undef bfloat_inplace_op_helper
256
+ #undef bfloat_inplace_op_addr_space_helper
257
+
258
+ /////////////////////////////////////////////////////////////////////////////
259
+ // Bfloat typedef
260
+ /////////////////////////////////////////////////////////////////////////////
261
+
262
+ typedef struct _MLX_BFloat16 bfloat16_t;
263
+
264
+ #endif
265
+
266
+ // ============ "mlx/backend/metal/kernels/scaled_dot_product_attention_params.h"
267
+
268
+ struct MLXFastAttentionParams {
269
+ const int M;
270
+ const int N;
271
+ const int K;
272
+
273
+ const int ldq; // ldq == ldo
274
+ const int ldk;
275
+ const int ldv;
276
+ const int lds;
277
+ const int ldo;
278
+
279
+ const int tiles_n;
280
+ const int tiles_m;
281
+
282
+ const int batch_stride_q;
283
+ const int batch_stride_k;
284
+ const int batch_stride_v;
285
+ const int batch_stride_o;
286
+
287
+ const int swizzle_log;
288
+ const int gemm_n_iterations_aligned;
289
+ const int gemm_k_iterations_aligned;
290
+ const int gemm_sv_m_block_iterations;
291
+
292
+ const int batch_ndim;
293
+ const float alpha;
294
+ const float softcapping;
295
+ };
296
+
297
+ struct MLXScaledDotProductAttentionParams {
298
+ // Associated dimensions & transposition information
299
+ const uint QUERY_SEQUENCE_LENGTH = 1;
300
+ const uint N_Q_HEADS = 32;
301
+ const uint N_KV_HEADS = 32;
302
+ const uint KV_TILES = 1;
303
+ const float INV_ALPHA = 0.08838834764831843f;
304
+ };
305
+
306
+ // ============ "mlx/backend/metal/kernels/scaled_dot_product_attention_params.sdpa_vector"
307
+
308
+
309
+ // ============ "mlx/backend/metal/kernels/utils.h"
310
+
311
+ template <typename U>
312
+ struct Limits {
313
+ static const constant U max = metal::numeric_limits<U>::max();
314
+ static const constant U min = metal::numeric_limits<U>::min();
315
+ static const constant U finite_max = metal::numeric_limits<U>::max();
316
+ static const constant U finite_min = metal::numeric_limits<U>::min();
317
+ };
318
+
319
+ #define instantiate_default_limit(type) \
320
+ template <> \
321
+ struct Limits<type> { \
322
+ static constexpr constant type max = metal::numeric_limits<type>::max(); \
323
+ static constexpr constant type min = metal::numeric_limits<type>::min(); \
324
+ static constexpr constant type finite_max = \
325
+ metal::numeric_limits<type>::max(); \
326
+ static constexpr constant type finite_min = \
327
+ metal::numeric_limits<type>::min(); \
328
+ };
329
+
330
+ instantiate_default_limit(uint8_t);
331
+ instantiate_default_limit(uint16_t);
332
+ instantiate_default_limit(uint32_t);
333
+ instantiate_default_limit(uint64_t);
334
+ instantiate_default_limit(int8_t);
335
+ instantiate_default_limit(int16_t);
336
+ instantiate_default_limit(int32_t);
337
+ instantiate_default_limit(int64_t);
338
+
339
+ #define instantiate_float_limit(type) \
340
+ template <> \
341
+ struct Limits<type> { \
342
+ static constexpr constant type max = \
343
+ metal::numeric_limits<type>::infinity(); \
344
+ static constexpr constant type min = \
345
+ -metal::numeric_limits<type>::infinity(); \
346
+ static constexpr constant type finite_max = \
347
+ metal::numeric_limits<type>::max(); \
348
+ static constexpr constant type finite_min = \
349
+ -metal::numeric_limits<type>::max(); \
350
+ };
351
+
352
+ instantiate_float_limit(half);
353
+ instantiate_float_limit(float);
354
+ instantiate_float_limit(bfloat16_t);
355
+
356
+
357
+ // ============ "mlx/backend/metal/kernels/steel/attn/loader.h"
358
+
359
+ template <
360
+ typename T,
361
+ short BROWS,
362
+ short BCOLS,
363
+ short dst_ld,
364
+ short reduction_dim,
365
+ short tgp_size,
366
+ short alignment = 1,
367
+ short n_reads = (BCOLS * BROWS) / (tgp_size),
368
+ short TCOLS = BCOLS / n_reads,
369
+ short TROWS = tgp_size / TCOLS>
370
+ struct BlockLoader {
371
+ STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
372
+ STEEL_CONST short vec_size = n_reads;
373
+
374
+ // Leading dimension for src
375
+ const int src_ld;
376
+ const int tile_stride;
377
+
378
+ // Thread location indices
379
+ const short thread_idx;
380
+ const short bi;
381
+ const short bj;
382
+
383
+ // threadgroup and device memory
384
+ threadgroup T* dst;
385
+ const device T* src;
386
+
387
+ struct alignas(alignment * sizeof(T)) ReadVector {
388
+ uint8_t v[sizeof(T) * vec_size];
389
+ };
390
+
391
+ /* Constructor */
392
+ METAL_FUNC BlockLoader(
393
+ const device T* src_,
394
+ const int src_ld_,
395
+ threadgroup T* dst_,
396
+ ushort simd_group_id [[simdgroup_index_in_threadgroup]],
397
+ ushort simd_lane_id [[thread_index_in_simdgroup]])
398
+ : src_ld(src_ld_),
399
+ tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
400
+ thread_idx(simd_group_id * 32 + simd_lane_id),
401
+ bi(thread_idx / TCOLS),
402
+ bj(vec_size * (thread_idx % TCOLS)),
403
+ dst(dst_ + bi * dst_ld + bj),
404
+ src(src_ + bi * src_ld + bj) {}
405
+
406
+ /* Apply operation to threadgroup without bound checking */
407
+ template <typename UnaryOp>
408
+ METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
409
+ STEEL_PRAGMA_UNROLL
410
+ for (short i = 0; i < BROWS; i += TROWS) {
411
+ STEEL_PRAGMA_UNROLL
412
+ for (short j = 0; j < vec_size; j++) {
413
+ dst[i * dst_ld + j] = op.apply(dst[i * dst_ld + j]);
414
+ }
415
+ }
416
+ }
417
+
418
+ /* Load from device memory into threadgroup memory - without bound checking */
419
+ METAL_FUNC void load_unsafe() const {
420
+ STEEL_PRAGMA_UNROLL
421
+ for (short i = 0; i < BROWS; i += TROWS) {
422
+ *((threadgroup ReadVector*)(&dst[i * dst_ld])) =
423
+ *((const device ReadVector*)(&src[i * src_ld]));
424
+ }
425
+ }
426
+
427
+ /* Load from device memory into threadgroup memory - with bound checking */
428
+ METAL_FUNC void load_safe(short2 src_tile_dim) const {
429
+ src_tile_dim = src_tile_dim - short2(bj, bi);
430
+
431
+ // Skip loading if thread has no valid reads
432
+ if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
433
+ STEEL_PRAGMA_UNROLL
434
+ for (short i = 0; i < BROWS; i += TROWS) {
435
+ STEEL_PRAGMA_UNROLL
436
+ for (short j = 0; j < vec_size; j++) {
437
+ dst[i * dst_ld + j] = T(0);
438
+ }
439
+ }
440
+ return;
441
+ }
442
+
443
+ // Use fast thread memory for bound checks
444
+ bool tmp_idx[vec_size];
445
+ T tmp_val[vec_size];
446
+
447
+ STEEL_PRAGMA_UNROLL
448
+ for (short i = 0; i < BROWS; i += TROWS) {
449
+ // Make sure tmp_idx only contains valid indices
450
+ STEEL_PRAGMA_UNROLL
451
+ for (short j = 0; j < vec_size; j++) {
452
+ tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
453
+ }
454
+
455
+ // Read valid indices into tmp_val
456
+ STEEL_PRAGMA_UNROLL
457
+ for (short j = 0; j < vec_size; j++) {
458
+ tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
459
+ }
460
+
461
+ // Zero out unneeded values
462
+ STEEL_PRAGMA_UNROLL
463
+ for (short j = 0; j < vec_size; j++) {
464
+ tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
465
+ }
466
+
467
+ // Copy values to threadgroup memory
468
+ STEEL_PRAGMA_UNROLL
469
+ for (short j = 0; j < vec_size; j++) {
470
+ dst[i * dst_ld + j] = tmp_val[j];
471
+ }
472
+ }
473
+ }
474
+
475
+ /* Iteration helper */
476
+ METAL_FUNC void next() {
477
+ src += tile_stride;
478
+ }
479
+ };
480
+
481
+ template <int R, int C>
482
+ struct CShape {
483
+ STEEL_CONST int kRows = R;
484
+ STEEL_CONST int kCols = C;
485
+ };
486
+
487
+ template <
488
+ typename T,
489
+ short BROWS,
490
+ short BCOLS,
491
+ short kDstStrRow,
492
+ short kDstStrCol,
493
+ short reduction_dim,
494
+ short tgp_size,
495
+ short n_reads = (BCOLS * BROWS) / (tgp_size),
496
+ short TCOLS = BCOLS / n_reads,
497
+ short TROWS = tgp_size / TCOLS>
498
+ struct BlockLoaderT {
499
+ STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
500
+ STEEL_CONST short vec_size = n_reads;
501
+
502
+ // Leading dimension for src
503
+ const int src_ld;
504
+ const int tile_stride;
505
+
506
+ // Thread location indices
507
+ const short thread_idx;
508
+ const short bi;
509
+ const short bj;
510
+
511
+ // threadgroup and device memory
512
+ threadgroup T* dst;
513
+ const device T* src;
514
+
515
+ /* Constructor */
516
+ METAL_FUNC BlockLoaderT(
517
+ const device T* src_,
518
+ const int src_ld_,
519
+ threadgroup T* dst_,
520
+ ushort simd_group_id [[simdgroup_index_in_threadgroup]],
521
+ ushort simd_lane_id [[thread_index_in_simdgroup]])
522
+ : src_ld(src_ld_),
523
+ tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
524
+ thread_idx(simd_group_id * 32 + simd_lane_id),
525
+ bi(thread_idx / TCOLS),
526
+ bj(vec_size * (thread_idx % TCOLS)),
527
+ dst(dst_ + bi * kDstStrRow + bj * kDstStrCol),
528
+ src(src_ + bi * src_ld + bj) {}
529
+
530
+ /* Apply operation to threadgroup without bound checking */
531
+ template <typename UnaryOp>
532
+ METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
533
+ STEEL_PRAGMA_UNROLL
534
+ for (short i = 0; i < BROWS; i += TROWS) {
535
+ STEEL_PRAGMA_UNROLL
536
+ for (short j = 0; j < vec_size; j++) {
537
+ dst[i * kDstStrRow + j * kDstStrCol] =
538
+ op.apply(dst[i * kDstStrRow + j * kDstStrCol]);
539
+ }
540
+ }
541
+ }
542
+
543
+ /* Load from device memory into threadgroup memory - without bound checking */
544
+ METAL_FUNC void load_unsafe() const {
545
+ STEEL_PRAGMA_UNROLL
546
+ for (short i = 0; i < BROWS; i += TROWS) {
547
+ STEEL_PRAGMA_UNROLL
548
+ for (short j = 0; j < vec_size; j++) {
549
+ dst[i * kDstStrRow + j * kDstStrCol] = src[i * src_ld + j];
550
+ }
551
+ }
552
+ }
553
+
554
+ /* Load from device memory into threadgroup memory - with bound checking */
555
+ METAL_FUNC void load_safe(short2 src_tile_dim) const {
556
+ src_tile_dim = src_tile_dim - short2(bj, bi);
557
+
558
+ // Skip loading if thread has no valid reads
559
+ if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
560
+ STEEL_PRAGMA_UNROLL
561
+ for (short i = 0; i < BROWS; i += TROWS) {
562
+ STEEL_PRAGMA_UNROLL
563
+ for (short j = 0; j < vec_size; j++) {
564
+ dst[i * kDstStrRow + j * kDstStrCol] = T(0);
565
+ }
566
+ }
567
+ return;
568
+ }
569
+
570
+ // Use fast thread memory for bound checks
571
+ bool tmp_idx[vec_size];
572
+ T tmp_val[vec_size];
573
+
574
+ STEEL_PRAGMA_UNROLL
575
+ for (short i = 0; i < BROWS; i += TROWS) {
576
+ // Make sure tmp_idx only contains valid indices
577
+ STEEL_PRAGMA_UNROLL
578
+ for (short j = 0; j < vec_size; j++) {
579
+ tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
580
+ }
581
+
582
+ // Read valid indices into tmp_val
583
+ STEEL_PRAGMA_UNROLL
584
+ for (short j = 0; j < vec_size; j++) {
585
+ tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
586
+ }
587
+
588
+ // Zero out unneeded values
589
+ STEEL_PRAGMA_UNROLL
590
+ for (short j = 0; j < vec_size; j++) {
591
+ tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
592
+ }
593
+
594
+ // Copy values to threadgroup memory
595
+ STEEL_PRAGMA_UNROLL
596
+ for (short j = 0; j < vec_size; j++) {
597
+ dst[i * kDstStrRow + j * kDstStrCol] = tmp_val[j];
598
+ }
599
+ }
600
+ }
601
+
602
+ /* Iteration helper */
603
+ METAL_FUNC void next() {
604
+ src += tile_stride;
605
+ }
606
+ };
607
+
608
+ // ============ "mlx/backend/metal/kernels/steel/utils/type_traits.h"
609
+
610
+ template <typename... Ts>
611
+ struct make_void {
612
+ typedef void type;
613
+ };
614
+
615
+ template <typename... Ts>
616
+ using void_t = typename make_void<Ts...>::type;
617
+
618
+ template <typename T>
619
+ struct pointer_element {};
620
+
621
+ template <typename T>
622
+ struct pointer_element<thread T*> {
623
+ using type = remove_cv_t<T>;
624
+ };
625
+ template <typename T>
626
+ struct pointer_element<device T*> {
627
+ using type = remove_cv_t<T>;
628
+ };
629
+ template <typename T>
630
+ struct pointer_element<constant T*> {
631
+ using type = remove_cv_t<T>;
632
+ };
633
+ template <typename T>
634
+ struct pointer_element<threadgroup T*> {
635
+ using type = remove_cv_t<T>;
636
+ };
637
+
638
+ template <typename T>
639
+ using pointer_element_t = typename pointer_element<remove_cv_t<T>>::type;
640
+
641
+ // ============ "mlx/backend/metal/kernels/steel/utils/integral_constant.h"
642
+
643
+ ///////////////////////////////////////////////////////////////////////////////
644
+ // Integral constant with casting
645
+ ///////////////////////////////////////////////////////////////////////////////
646
+
647
+ template <int val>
648
+ using Int = integral_constant<int, val>;
649
+
650
+ ///////////////////////////////////////////////////////////////////////////////
651
+ // Binary Operators on Integral constants
652
+ ///////////////////////////////////////////////////////////////////////////////
653
+
654
+ #define integral_const_binop(__op__, __operator__) \
655
+ template <typename T, T tv, typename U, U uv> \
656
+ METAL_FUNC constexpr auto __operator__( \
657
+ integral_constant<T, tv>, integral_constant<U, uv>) { \
658
+ constexpr auto res = tv __op__ uv; \
659
+ return integral_constant<decltype(res), res>{}; \
660
+ }
661
+
662
+ integral_const_binop(+, operator+);
663
+ integral_const_binop(-, operator-);
664
+ integral_const_binop(*, operator*);
665
+ integral_const_binop(/, operator/);
666
+
667
+ integral_const_binop(==, operator==);
668
+ integral_const_binop(!=, operator!=);
669
+ integral_const_binop(<, operator<);
670
+ integral_const_binop(>, operator>);
671
+ integral_const_binop(<=, operator<=);
672
+ integral_const_binop(>=, operator>=);
673
+
674
+ integral_const_binop(&&, operator&&);
675
+ integral_const_binop(||, operator||);
676
+
677
+ #undef integral_const_binop
678
+
679
+ ///////////////////////////////////////////////////////////////////////////////
680
+ // Reduction operators
681
+ ///////////////////////////////////////////////////////////////////////////////
682
+
683
+ template <typename T>
684
+ METAL_FUNC constexpr T sum(T x) {
685
+ return x;
686
+ }
687
+
688
+ template <typename T, typename... Us>
689
+ METAL_FUNC constexpr auto sum(T x, Us... us) {
690
+ return x + sum(us...);
691
+ }
692
+
693
+ // ============ "mlx/backend/metal/kernels/steel/gemm/transforms.h"
694
+
695
+ template <typename OutT, typename InT>
696
+ struct TransformNone {
697
+ static METAL_FUNC OutT apply(InT x) {
698
+ return static_cast<OutT>(x);
699
+ }
700
+
701
+ static METAL_FUNC OutT apply(InT x, OutT) {
702
+ return static_cast<OutT>(x);
703
+ }
704
+ };
705
+
706
+ template <typename OutT, typename InT>
707
+ struct TransformAdd {
708
+ TransformAdd(const float, const float) {}
709
+
710
+ static METAL_FUNC OutT apply(InT x) {
711
+ return static_cast<OutT>(x);
712
+ }
713
+
714
+ static METAL_FUNC OutT apply(InT x, OutT c) {
715
+ return static_cast<OutT>(x) + c;
716
+ }
717
+ };
718
+
719
+ template <typename OutT, typename InT>
720
+ struct TransformAxpby {
721
+ const float alpha;
722
+ const float beta;
723
+
724
+ TransformAxpby(const float alpha_, const float beta_)
725
+ : alpha(alpha_), beta(beta_) {}
726
+
727
+ static METAL_FUNC OutT apply(InT x) {
728
+ return static_cast<OutT>(x);
729
+ }
730
+
731
+ METAL_FUNC OutT apply(InT x, OutT c) const {
732
+ return static_cast<OutT>(x * alpha + (beta * c));
733
+ }
734
+ };
735
+
736
+ template <typename T>
737
+ struct AccumHelper {
738
+ typedef float accum_type;
739
+ };
740
+
741
+ struct BlockSwizzle {
742
+ static METAL_FUNC int2
743
+ swizzle(uint3 tid [[threadgroup_position_in_grid]], const int swizzle_log) {
744
+ const int tid_x = (tid.x) >> swizzle_log;
745
+ const int tid_y =
746
+ ((tid.y) << swizzle_log) + ((tid.x) & ((1 << swizzle_log) - 1));
747
+ return int2(tid_x, tid_y);
748
+ }
749
+ };
750
+
751
+ // ============ "mlx/backend/metal/kernels/steel/attn/mma.h"
752
+
753
+ template <typename RInt, typename CInt>
754
+ struct Shape2D {
755
+ RInt r;
756
+ CInt c;
757
+
758
+ Shape2D(RInt r_, CInt c_) : r(r_), c(c_) {}
759
+ };
760
+
761
+ template <typename Shape, typename Layout>
762
+ struct Layout2D {
763
+ Shape shape;
764
+ Layout layout;
765
+ };
766
+
767
+ template <typename T, int kFragRows_, int kFragCols_>
768
+ struct BaseMMAFrag {
769
+ static_assert(
770
+ kFragRows_ == 8,
771
+ "Only 8 x 8 fragment matrices are currently supported");
772
+ static_assert(
773
+ kFragCols_ == 8,
774
+ "Only 8 x 8 fragment matrices are currently supported");
775
+ };
776
+
777
+ template <typename T>
778
+ struct BaseMMAFrag<T, 8, 8> {
779
+ STEEL_CONST int kFragRows = 8;
780
+ STEEL_CONST int kFragCols = 8;
781
+
782
+ STEEL_CONST int kElemsPerFrag = (kFragRows * kFragCols) / 32;
783
+
784
+ STEEL_CONST int kElemRows = 1;
785
+ STEEL_CONST int kElemCols = 2;
786
+
787
+ static_assert(
788
+ kElemRows * kElemCols == kElemsPerFrag,
789
+ "MMAFrag shape is not consistent with MMAFrag size");
790
+
791
+ typedef metal::simdgroup_matrix<T, kFragRows, kFragCols> mat_type;
792
+ typedef metal::vec<T, kElemsPerFrag> frag_type;
793
+ typedef metal::vec<T, kElemRows> row_frag_type;
794
+ typedef metal::vec<T, kElemCols> col_frag_type;
795
+
796
+ template <typename U>
797
+ using dtype_mat_t = typename metal::simdgroup_matrix<U, kFragRows, kFragCols>;
798
+
799
+ template <typename U>
800
+ using dtype_frag_t = typename metal::vec<U, kElemsPerFrag>;
801
+
802
+ METAL_FUNC static constexpr short2 get_coord(ushort simd_lane_id
803
+ [[thread_index_in_simdgroup]]) {
804
+ const short qid = simd_lane_id / 4;
805
+ const short fm = (qid & 4) + ((simd_lane_id / 2) % 4);
806
+ const short fn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
807
+ return short2{fn, fm};
808
+ }
809
+
810
+ template <typename SrcPtrType, typename StrX, typename StrY>
811
+ METAL_FUNC static constexpr void
812
+ load(thread frag_type& dst, SrcPtrType src, StrX str_x, StrY str_y) {
813
+ STEEL_PRAGMA_UNROLL
814
+ for (short i = 0; i < kElemRows; i++) {
815
+ STEEL_PRAGMA_UNROLL
816
+ for (short j = 0; j < kElemCols; j++) {
817
+ dst[i * kElemCols + j] = static_cast<T>(src[i * str_x.value + j * str_y.value]);
818
+ }
819
+ }
820
+ }
821
+
822
+ template <
823
+ typename SrcPtrType,
824
+ typename StrX,
825
+ typename StrY,
826
+ typename LimX,
827
+ typename LimY,
828
+ typename OffX,
829
+ typename OffY>
830
+ METAL_FUNC static constexpr void load_safe(
831
+ thread frag_type& dst,
832
+ SrcPtrType src,
833
+ StrX str_x,
834
+ StrY str_y,
835
+ LimX lim_x,
836
+ LimY lim_y,
837
+ OffX off_x = Int<0>{},
838
+ OffY off_y = Int<0>{}) {
839
+ STEEL_PRAGMA_UNROLL
840
+ for (short i = 0; i < kElemRows; i++) {
841
+ STEEL_PRAGMA_UNROLL
842
+ for (short j = 0; j < kElemCols; j++) {
843
+ if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
844
+ dst[i * kElemCols + j] =
845
+ static_cast<T>(src[(off_x + i) * str_x + (off_y + j) * str_y.value]);
846
+ } else {
847
+ dst[i * kElemCols + j] = T(0);
848
+ }
849
+ }
850
+ }
851
+ }
852
+
853
+ template <typename DstPtrType, typename StrX, typename StrY>
854
+ METAL_FUNC static constexpr void
855
+ store(const thread frag_type& src, DstPtrType dst, StrX str_x, StrY str_y) {
856
+ using U = pointer_element_t<DstPtrType>;
857
+
858
+ STEEL_PRAGMA_UNROLL
859
+ for (short i = 0; i < kElemRows; i++) {
860
+ STEEL_PRAGMA_UNROLL
861
+ for (short j = 0; j < kElemCols; j++) {
862
+ dst[i * str_x + j * str_y.value] = static_cast<U>(src[i * kElemCols + j]);
863
+ }
864
+ }
865
+ }
866
+
867
+ template <
868
+ typename DstPtrType,
869
+ typename StrX,
870
+ typename StrY,
871
+ typename LimX,
872
+ typename LimY,
873
+ typename OffX,
874
+ typename OffY>
875
+ METAL_FUNC static constexpr void store_safe(
876
+ const thread frag_type& src,
877
+ DstPtrType dst,
878
+ StrX str_x,
879
+ StrY str_y,
880
+ LimX lim_x,
881
+ LimY lim_y,
882
+ OffX off_x = Int<0>{},
883
+ OffY off_y = Int<0>{}) {
884
+ using U = pointer_element_t<DstPtrType>;
885
+
886
+ STEEL_PRAGMA_UNROLL
887
+ for (short i = 0; i < kElemRows; i++) {
888
+ STEEL_PRAGMA_UNROLL
889
+ for (short j = 0; j < kElemCols; j++) {
890
+ if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
891
+ dst[(off_x + i) * str_x + (off_y + j) * str_y.value] =
892
+ static_cast<U>(src[i * kElemCols + j]);
893
+ }
894
+ }
895
+ }
896
+ }
897
+
898
+ template <typename Atype, typename Btype, typename Ctype>
899
+ METAL_FUNC static constexpr void mma(
900
+ thread frag_type& D,
901
+ thread dtype_frag_t<Atype>& A,
902
+ thread dtype_frag_t<Btype>& B,
903
+ thread dtype_frag_t<Ctype>& C) {
904
+ mat_type D_mat;
905
+ dtype_mat_t<Atype> A_mat;
906
+ dtype_mat_t<Btype> B_mat;
907
+ dtype_mat_t<Ctype> C_mat;
908
+
909
+ reinterpret_cast<thread dtype_frag_t<Atype>&>(A_mat.thread_elements()) = A;
910
+ reinterpret_cast<thread dtype_frag_t<Btype>&>(B_mat.thread_elements()) = B;
911
+ reinterpret_cast<thread dtype_frag_t<Ctype>&>(C_mat.thread_elements()) = C;
912
+
913
+ mma(D_mat, A_mat, B_mat, C_mat);
914
+
915
+ D = reinterpret_cast<thread frag_type&>(D_mat.thread_elements());
916
+ }
917
+
918
+ template <typename Atype, typename Btype, typename Ctype>
919
+ METAL_FUNC static constexpr void mma(
920
+ thread mat_type& D,
921
+ thread dtype_mat_t<Atype>& A,
922
+ thread dtype_mat_t<Btype>& B,
923
+ thread dtype_mat_t<Ctype>& C) {
924
+ simdgroup_multiply_accumulate(D, A, B, C);
925
+ }
926
+
927
+ template <typename Op>
928
+ METAL_FUNC static constexpr void row_reduce(
929
+ thread const frag_type& inp_vals,
930
+ thread T* reduced_vals) {
931
+ T thr_reduce = Op::apply(inp_vals.x, inp_vals.y);
932
+
933
+ T qgr_reduce = simd_shuffle_xor(thr_reduce, ushort(1));
934
+ qgr_reduce = Op::apply(thr_reduce, qgr_reduce);
935
+
936
+ T sgr_reduce = simd_shuffle_xor(qgr_reduce, ushort(8));
937
+ sgr_reduce = Op::apply(qgr_reduce, sgr_reduce);
938
+
939
+ reduced_vals[0] = Op::apply(reduced_vals[0], sgr_reduce);
940
+ }
941
+
942
+ template <typename Op>
943
+ METAL_FUNC static constexpr void row_bin_op(
944
+ thread frag_type& inp_vals,
945
+ thread T* row_vals) {
946
+ STEEL_PRAGMA_UNROLL
947
+ for (short i = 0; i < kElemRows; i++) {
948
+ STEEL_PRAGMA_UNROLL
949
+ for (short j = 0; j < kElemCols; j++) {
950
+ inp_vals[i * kElemCols + j] =
951
+ Op::apply(inp_vals[i * kElemCols + j], row_vals[i]);
952
+ }
953
+ }
954
+ }
955
+ };
956
+
957
+ template <
958
+ typename T,
959
+ int kTileRows_,
960
+ int kTileCols_,
961
+ class MMAFrag_ = BaseMMAFrag<T, 8, 8>>
962
+ struct MMATile {
963
+ using MMAFrag_t = MMAFrag_;
964
+ using elem_type = T;
965
+ STEEL_CONST int kFragRows = MMAFrag_t::kFragRows;
966
+ STEEL_CONST int kFragCols = MMAFrag_t::kFragCols;
967
+ STEEL_CONST int kElemsPerFrag = MMAFrag_t::kElemsPerFrag;
968
+
969
+ STEEL_CONST int kTileRows = kTileRows_;
970
+ STEEL_CONST int kTileCols = kTileCols_;
971
+
972
+ STEEL_CONST int kRows = kTileRows * kFragRows;
973
+ STEEL_CONST int kCols = kTileCols * kFragCols;
974
+
975
+ STEEL_CONST int kNumFrags = kTileRows * kTileCols;
976
+ STEEL_CONST int kElemsPerTile = kNumFrags * kElemsPerFrag;
977
+
978
+ STEEL_CONST int kRowsPerThread = kTileRows * MMAFrag_t::kElemRows;
979
+ STEEL_CONST int kColsPerThread = kTileCols * MMAFrag_t::kElemCols;
980
+
981
+ typedef typename MMAFrag_t::mat_type mat_type;
982
+ typedef typename MMAFrag_t::frag_type frag_type;
983
+
984
+ frag_type val_frags[kNumFrags]; // = {frag_type(0)};
985
+
986
+ METAL_FUNC MMATile() thread {}
987
+
988
+ METAL_FUNC constexpr void clear() {
989
+ STEEL_PRAGMA_UNROLL
990
+ for (short i = 0; i < kNumFrags; ++i) {
991
+ val_frags[i] = frag_type(0);
992
+ }
993
+ }
994
+
995
+ METAL_FUNC constexpr thread frag_type& frag_at(const short i, const short j) {
996
+ return val_frags[i * kTileCols + j];
997
+ }
998
+
999
+ METAL_FUNC constexpr const thread frag_type& frag_at(
1000
+ const short i,
1001
+ const short j) const {
1002
+ return val_frags[i * kTileCols + j];
1003
+ }
1004
+
1005
+ METAL_FUNC mat_type mat_at(const short i, const short j) {
1006
+ mat_type val_mat;
1007
+ STEEL_PRAGMA_UNROLL
1008
+ for (short ii = 0; ii < kElemsPerFrag; ++ii) {
1009
+ val_mat.thread_elements()[ii] = frag_at(i, j)[ii];
1010
+ }
1011
+ return val_mat;
1012
+ }
1013
+
1014
+ METAL_FUNC thread elem_type* elems() {
1015
+ return reinterpret_cast<thread elem_type*>(val_frags);
1016
+ }
1017
+
1018
+ METAL_FUNC const thread elem_type* elems() const {
1019
+ return reinterpret_cast<const thread elem_type*>(val_frags);
1020
+ }
1021
+
1022
+ template <typename Op>
1023
+ METAL_FUNC void row_reduce(thread T vals[kRowsPerThread]) const {
1024
+ STEEL_PRAGMA_UNROLL
1025
+ for (short i = 0; i < kTileRows; ++i) {
1026
+ STEEL_PRAGMA_UNROLL
1027
+ for (short j = 0; j < kTileCols; ++j) {
1028
+ MMAFrag_t::template row_reduce<Op>(
1029
+ frag_at(i, j), &vals[i * MMAFrag_t::kElemRows]);
1030
+ }
1031
+ }
1032
+ }
1033
+
1034
+ template <typename Op>
1035
+ METAL_FUNC void row_bin_op(thread T vals[kRowsPerThread]) {
1036
+ STEEL_PRAGMA_UNROLL
1037
+ for (short i = 0; i < kTileRows; ++i) {
1038
+ STEEL_PRAGMA_UNROLL
1039
+ for (short j = 0; j < kTileCols; ++j) {
1040
+ MMAFrag_t::template row_bin_op<Op>(
1041
+ frag_at(i, j), &vals[i * MMAFrag_t::kElemRows]);
1042
+ }
1043
+ }
1044
+ }
1045
+
1046
+ template <typename U, int w_x, int w_y, int str_x, int str_y>
1047
+ METAL_FUNC void load(const threadgroup U* src) {
1048
+ STEEL_PRAGMA_UNROLL
1049
+ for (short i = 0; i < kTileRows; ++i) {
1050
+ STEEL_PRAGMA_UNROLL
1051
+ for (short j = 0; j < kTileCols; ++j) {
1052
+ MMAFrag_t::load(
1053
+ frag_at(i, j),
1054
+ &(
1055
+ src[(i * kFragRows) * w_x * str_x +
1056
+ (j * kFragCols) * w_y * str_y]),
1057
+ Int<str_x>{},
1058
+ Int<str_y>{});
1059
+ }
1060
+ }
1061
+ }
1062
+
1063
+ template <typename U, int w_x, int w_y, int str_x, int str_y>
1064
+ METAL_FUNC void store(threadgroup U* dst) const {
1065
+ STEEL_PRAGMA_UNROLL
1066
+ for (short i = 0; i < kTileRows; ++i) {
1067
+ STEEL_PRAGMA_UNROLL
1068
+ for (short j = 0; j < kTileCols; ++j) {
1069
+ MMAFrag_t::store(
1070
+ frag_at(i, j),
1071
+ &(
1072
+ dst[(i * kFragRows) * w_x * str_x +
1073
+ (j * kFragCols) * w_y * str_y]),
1074
+ Int<str_x>{},
1075
+ Int<str_y>{});
1076
+ }
1077
+ }
1078
+ }
1079
+
1080
+ template <typename U, int w_x, int w_y>
1081
+ METAL_FUNC void load(const device U* src, const int ld) {
1082
+ STEEL_PRAGMA_UNROLL
1083
+ for (short i = 0; i < kTileRows; ++i) {
1084
+ STEEL_PRAGMA_UNROLL
1085
+ for (short j = 0; j < kTileCols; ++j) {
1086
+ MMAFrag_t::load(
1087
+ frag_at(i, j),
1088
+ &(src[(i * kFragRows) * w_x * ld + (j * kFragCols) * w_y]),
1089
+ ld,
1090
+ Int<1>{});
1091
+ }
1092
+ }
1093
+ }
1094
+
1095
+ template <typename U, int w_x, int w_y>
1096
+ METAL_FUNC void store(device U* dst, const int ld) const {
1097
+ STEEL_PRAGMA_UNROLL
1098
+ for (short i = 0; i < kTileRows; ++i) {
1099
+ STEEL_PRAGMA_UNROLL
1100
+ for (short j = 0; j < kTileCols; ++j) {
1101
+ MMAFrag_t::store(
1102
+ frag_at(i, j),
1103
+ &(dst[(i * kFragRows) * w_x * ld + (j * kFragCols) * w_y]),
1104
+ ld,
1105
+ Int<1>{});
1106
+ }
1107
+ }
1108
+ }
1109
+
1110
+ template <typename U, int w_x, int w_y>
1111
+ METAL_FUNC void
1112
+ load_safe(const device U* src, const int ld, const short2 src_tile_dims) {
1113
+ STEEL_PRAGMA_UNROLL
1114
+ for (int i = 0; i < kTileRows; ++i) {
1115
+ STEEL_PRAGMA_UNROLL
1116
+ for (int j = 0; j < kTileCols; ++j) {
1117
+ MMAFrag_t::load_safe(
1118
+ frag_at(i, j),
1119
+ src,
1120
+ ld,
1121
+ Int<1>{},
1122
+ src_tile_dims.y,
1123
+ src_tile_dims.x,
1124
+ (i * kFragRows) * w_x,
1125
+ (j * kFragCols) * w_y);
1126
+ }
1127
+ }
1128
+ }
1129
+
1130
+ template <typename U, int w_x, int w_y>
1131
+ METAL_FUNC void
1132
+ store_safe(device U* dst, const int ld, const short2 dst_tile_dims) const {
1133
+ STEEL_PRAGMA_UNROLL
1134
+ for (int i = 0; i < kTileRows; ++i) {
1135
+ STEEL_PRAGMA_UNROLL
1136
+ for (int j = 0; j < kTileCols; ++j) {
1137
+ MMAFrag_t::store_safe(
1138
+ frag_at(i, j),
1139
+ dst,
1140
+ ld,
1141
+ Int<1>{},
1142
+ dst_tile_dims.y,
1143
+ dst_tile_dims.x,
1144
+ (i * kFragRows) * w_x,
1145
+ (j * kFragCols) * w_y);
1146
+ }
1147
+ }
1148
+ }
1149
+ };
1150
+
1151
+ template <
1152
+ typename Dtype,
1153
+ typename Atype,
1154
+ typename Btype,
1155
+ typename Ctype,
1156
+ int M,
1157
+ int N,
1158
+ int K,
1159
+ class MMAFragD,
1160
+ class MMAFragA,
1161
+ class MMAFragB,
1162
+ class MMAFragC>
1163
+ METAL_FUNC void tile_matmad(
1164
+ thread MMATile<Dtype, M, N, MMAFragD>& D,
1165
+ thread MMATile<Atype, M, K, MMAFragA>& A,
1166
+ thread MMATile<Btype, K, N, MMAFragB>& B,
1167
+ thread MMATile<Ctype, M, N, MMAFragC>& C) {
1168
+ STEEL_PRAGMA_UNROLL
1169
+ for (short m = 0; m < M; ++m) {
1170
+ STEEL_PRAGMA_UNROLL
1171
+ for (short n = 0; n < N; ++n) {
1172
+ short m_serp = m; //(n % 2) ? (M - 1 - m) : m;
1173
+ short n_serp = (m % 2) ? (N - 1 - n) : n;
1174
+
1175
+ STEEL_PRAGMA_UNROLL
1176
+ for (short k = 0; k < K; ++k) {
1177
+ MMAFragD::mma(
1178
+ D.frag_at(m_serp, n_serp),
1179
+ A.frag_at(m_serp, k),
1180
+ B.frag_at(k, n_serp),
1181
+ C.frag_at(m_serp, n_serp));
1182
+ }
1183
+ }
1184
+ }
1185
+ }
1186
+
1187
+ template <
1188
+ typename T,
1189
+ typename U,
1190
+ int BM,
1191
+ int BN,
1192
+ int BK,
1193
+ int WM,
1194
+ int WN,
1195
+ bool transpose_a,
1196
+ bool transpose_b,
1197
+ short lda_tgp,
1198
+ short ldb_tgp,
1199
+ typename AccumType = float,
1200
+ typename Epilogue = TransformNone<U, AccumType>>
1201
+ struct BlockMMA {
1202
+ // MMAFrag size
1203
+ STEEL_CONST short kFragSize = 8;
1204
+ using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;
1205
+
1206
+ // Warp tile simdgroup matrix strides along M
1207
+ STEEL_CONST short TM_stride = kFragSize * WM;
1208
+ // Warp tile simdgroup matrix strides along N
1209
+ STEEL_CONST short TN_stride = kFragSize * WN;
1210
+
1211
+ // Warp tile size along M
1212
+ STEEL_CONST short TM = BM / TM_stride;
1213
+ // Warp tile size along N
1214
+ STEEL_CONST short TN = BN / TN_stride;
1215
+
1216
+ // Threadgroup A strides
1217
+ STEEL_CONST short A_str_m = transpose_a ? 1 : lda_tgp; // M
1218
+ STEEL_CONST short A_str_k = transpose_a ? lda_tgp : 1; // K
1219
+
1220
+ // Threadgroup B strides
1221
+ STEEL_CONST short B_str_k = transpose_b ? 1 : ldb_tgp; // K
1222
+ STEEL_CONST short B_str_n = transpose_b ? ldb_tgp : 1; // N
1223
+
1224
+ // Threadgroup strides along K
1225
+ STEEL_CONST short tile_stride_a = kFragSize * A_str_k;
1226
+ STEEL_CONST short tile_stride_b = kFragSize * B_str_k;
1227
+
1228
+ // Simdgroup matrices
1229
+ MMATile<AccumType, TM, 1, MMAFrag_acc_t> Atile;
1230
+ MMATile<AccumType, 1, TN, MMAFrag_acc_t> Btile;
1231
+ MMATile<AccumType, TM, TN, MMAFrag_acc_t> Ctile;
1232
+
1233
+ // Offsets within threadgroup
1234
+ short sm;
1235
+ short sn;
1236
+
1237
+ short As_offset;
1238
+ short Bs_offset;
1239
+
1240
+ /* Constructor */
1241
+ METAL_FUNC BlockMMA(
1242
+ ushort simd_group_id [[simdgroup_index_in_threadgroup]],
1243
+ ushort simd_lane_id [[thread_index_in_simdgroup]]) {
1244
+ // Determine thread position in simdgroup matrix
1245
+ short tm = kFragSize * (simd_group_id / WN);
1246
+ short tn = kFragSize * (simd_group_id % WN);
1247
+
1248
+ short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
1249
+ sm = simd_coord.y;
1250
+ sn = simd_coord.x;
1251
+
1252
+ // Determine thread and simdgroup offset
1253
+ As_offset = (tm + sm) * A_str_m + (sn)*A_str_k; // M, K
1254
+ Bs_offset = (sm)*B_str_k + (tn + sn) * B_str_n; // K, N
1255
+
1256
+ sm += tm;
1257
+ sn += tn;
1258
+ }
1259
+
1260
+ /* (BM, BK) X (BK, BN) multiply accumulate function */
1261
+ METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
1262
+ // Adjust for simdgroup and thread location
1263
+ As += As_offset;
1264
+ Bs += Bs_offset;
1265
+
1266
+ // Iterate over BK in blocks of kFragSize
1267
+ STEEL_PRAGMA_UNROLL
1268
+ for (short kk = 0; kk < BK; kk += kFragSize) {
1269
+ simdgroup_barrier(mem_flags::mem_none);
1270
+
1271
+ Atile.template load<T, WM, 1, A_str_m, A_str_k>(As);
1272
+
1273
+ simdgroup_barrier(mem_flags::mem_none);
1274
+
1275
+ Btile.template load<T, 1, WN, B_str_k, B_str_n>(Bs);
1276
+
1277
+ simdgroup_barrier(mem_flags::mem_none);
1278
+
1279
+ tile_matmad(Ctile, Atile, Btile, Ctile);
1280
+
1281
+ // Progress to next simdgroup tile
1282
+ As += tile_stride_a;
1283
+ Bs += tile_stride_b;
1284
+ }
1285
+ }
1286
+
1287
+ /* Store results from simdgroup_matrix into device memory */
1288
+ METAL_FUNC void store_result(device U* D, const int ldd) {
1289
+ // Apply epilogue
1290
+ STEEL_PRAGMA_UNROLL
1291
+ for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
1292
+ Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
1293
+ }
1294
+
1295
+ // Adjust for simdgroup and thread location
1296
+ D += sm * ldd + sn;
1297
+
1298
+ Ctile.template store<U, WM, WN>(D, ldd);
1299
+ }
1300
+
1301
+ METAL_FUNC void
1302
+ store_result_safe(device U* D, const int ldd, short2 dst_tile_dims) {
1303
+ // Apply epilogue
1304
+ STEEL_PRAGMA_UNROLL
1305
+ for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
1306
+ Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
1307
+ }
1308
+
1309
+ // Adjust for simdgroup and thread location
1310
+ D += sm * ldd + sn;
1311
+ dst_tile_dims -= short2(sn, sm);
1312
+
1313
+ if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
1314
+ return;
1315
+
1316
+ Ctile.template store_safe<U, WM, WN>(D, ldd, dst_tile_dims);
1317
+ }
1318
+
1319
+ /* Apply epilogue */
1320
+ template <typename UnaryEpilogue>
1321
+ METAL_FUNC void apply_epilogue(thread const UnaryEpilogue& epilogue_op) {
1322
+ // Loop over all simdgroup tiles
1323
+ STEEL_PRAGMA_UNROLL
1324
+ for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
1325
+ Ctile.elems()[i] = epilogue_op.apply(Ctile.elems()[i]);
1326
+ }
1327
+ }
1328
+
1329
+ /* Apply epilogue */
1330
+ template <typename BinaryEpilogue>
1331
+ METAL_FUNC void apply_epilogue(
1332
+ const device U* C,
1333
+ const int ldc,
1334
+ const int fdc,
1335
+ thread const BinaryEpilogue& epilogue_op) {
1336
+ // Adjust for simdgroup and thread location
1337
+ C += (sm)*ldc + (sn)*fdc;
1338
+
1339
+ // Loop over all simdgroup tiles
1340
+ STEEL_PRAGMA_UNROLL
1341
+ for (short i = 0; i < TM; i++) {
1342
+ STEEL_PRAGMA_UNROLL
1343
+ for (short j = 0; j < TN; j++) {
1344
+ // Get accumulated result and associated offset in C
1345
+ thread auto& accum = Ctile.frag_at(i, j);
1346
+ int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
1347
+
1348
+ // Apply epilogue
1349
+ STEEL_PRAGMA_UNROLL
1350
+ for (short k = 0; k < decltype(Ctile)::kElemsPerFrag; k++) {
1351
+ accum[k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
1352
+ }
1353
+ }
1354
+ }
1355
+ }
1356
+
1357
+ /* Apply epilogue */
1358
+ template <typename BinaryEpilogue>
1359
+ METAL_FUNC void apply_epilogue_safe(
1360
+ const device U* C,
1361
+ const int ldc,
1362
+ const int fdc,
1363
+ short2 dst_tile_dims,
1364
+ thread const BinaryEpilogue& epilogue_op) {
1365
+ // Adjust for simdgroup and thread location
1366
+ C += (sm)*ldc + (sn)*fdc;
1367
+ dst_tile_dims -= short2(sn, sm);
1368
+
1369
+ if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
1370
+ return;
1371
+
1372
+ // Loop over all simdgroup tiles
1373
+ STEEL_PRAGMA_UNROLL
1374
+ for (short i = 0; i < TM; i++) {
1375
+ STEEL_PRAGMA_UNROLL
1376
+ for (short j = 0; j < TN; j++) {
1377
+ // Get accumulated result and associated offset in C
1378
+ thread auto& accum = Ctile.frag_at(i, j);
1379
+ int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
1380
+
1381
+ constexpr short kelems = decltype(Ctile)::kElemsPerFrag;
1382
+
1383
+ // Read C
1384
+ U c_elems[kelems] = {0};
1385
+
1386
+ STEEL_PRAGMA_UNROLL
1387
+ for (short k = 0; k < kelems; k++) {
1388
+ if ((j * TN_stride + k) < dst_tile_dims.x) {
1389
+ c_elems[k] = C[offset_c + k * fdc];
1390
+ }
1391
+ }
1392
+
1393
+ // Apply epilogue
1394
+ STEEL_PRAGMA_UNROLL
1395
+ for (short k = 0; k < kelems; k++) {
1396
+ accum[k] = epilogue_op.apply(accum[k], c_elems[k]);
1397
+ }
1398
+ }
1399
+ }
1400
+ }
1401
+
1402
+ /* Store results from simdgroup_matrix into device memory */
1403
+ METAL_FUNC void store_result(
1404
+ device U* D,
1405
+ const int ldd,
1406
+ const device U* C,
1407
+ const int ldc,
1408
+ const int fdc,
1409
+ thread const Epilogue& epilogue_op) const {
1410
+ // Adjust for simdgroup and thread location
1411
+ C += (sm)*ldc + (sn)*fdc;
1412
+ D += (sm)*ldd + sn;
1413
+
1414
+ constexpr short kelems = decltype(Ctile)::kElemsPerFrag;
1415
+
1416
+ // Loop over all simdgroup tiles
1417
+ STEEL_PRAGMA_UNROLL
1418
+ for (short i = 0; i < TM; i++) {
1419
+ STEEL_PRAGMA_UNROLL
1420
+ for (short j = 0; j < TN; j++) {
1421
+ // Get accumulated result and associated offset in C
1422
+ thread const auto& accum = Ctile.frag_at(i, j);
1423
+ int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
1424
+ int offset_d = (i * TM_stride) * ldd + (j * TN_stride);
1425
+
1426
+ // Apply epilogue
1427
+ STEEL_PRAGMA_UNROLL
1428
+ for (short k = 0; k < kelems; k++) {
1429
+ D[offset_d + k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
1430
+ }
1431
+ }
1432
+ }
1433
+ }
1434
+
1435
+ METAL_FUNC void store_result_safe(
1436
+ device U* D,
1437
+ const int ldd,
1438
+ const device U* C,
1439
+ const int ldc,
1440
+ const int fdc,
1441
+ short2 dst_tile_dims,
1442
+ thread const Epilogue& epilogue_op) const {
1443
+ // Adjust for simdgroup and thread location
1444
+ C += (sm)*ldc + (sn)*fdc;
1445
+ D += (sm)*ldd + sn;
1446
+ dst_tile_dims -= short2(sn, sm);
1447
+
1448
+ if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
1449
+ return;
1450
+
1451
+ constexpr short kelems = decltype(Ctile)::kElemsPerFrag;
1452
+
1453
+ STEEL_PRAGMA_UNROLL
1454
+ for (int i = 0; i < TM; i++) {
1455
+ if (i * TM_stride < dst_tile_dims.y) {
1456
+ STEEL_PRAGMA_UNROLL
1457
+ for (int j = 0; j < TN; j++) {
1458
+ // Get accumulated result and associated offset in C
1459
+ thread const auto& accum = Ctile.frag_at(i, j);
1460
+ int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
1461
+ int offset_d = (i * TM_stride) * ldd + (j * TN_stride);
1462
+
1463
+ // Apply epilogue
1464
+ STEEL_PRAGMA_UNROLL
1465
+ for (short k = 0; k < kelems; k++) {
1466
+ if ((j * TN_stride + k) < dst_tile_dims.x) {
1467
+ D[offset_d + k] =
1468
+ epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
1469
+ }
1470
+ }
1471
+ }
1472
+ }
1473
+ }
1474
+ }
1475
+ };
1476
+
1477
+ // ============ "mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h"
1478
+
1479
+ struct AttnParams {
1480
+ int B; ///< Batch Size
1481
+ int H; ///< Heads
1482
+ int D; ///< Head Dim
1483
+
1484
+ int qL; ///< Query Sequence Length
1485
+ int kL; ///< Key Sequence Length
1486
+
1487
+ int gqa_factor; ///< Group Query factor
1488
+ float scale; ///< Attention scale
1489
+ float softcapping; ///< Softcapping value (1.0 for no softcapping)
1490
+
1491
+ int NQ; ///< Number of query blocks
1492
+ int NK; ///< Number of key/value blocks
1493
+
1494
+ int NQ_aligned; ///< Number of full query blocks
1495
+ int NK_aligned; ///< Number of full key/value blocks
1496
+
1497
+ int qL_rem; ///< Remainder in last query block
1498
+ int kL_rem; ///< Remainder in last key/value block
1499
+ int qL_off; ///< Offset in query sequence start
1500
+
1501
+ int64_t Q_strides[3]; ///< Query strides (B, H, L, D = 1)
1502
+ int64_t K_strides[3]; ///< Key strides (B, H, L, D = 1)
1503
+ int64_t V_strides[3]; ///< Value strides (B, H, L, D = 1)
1504
+ int64_t O_strides[3]; ///< Output strides (B, H, L, D = 1)
1505
+
1506
+ // Flash Attention variable-length support
1507
+ int total_q_tokens; ///< Total number of query tokens (sum of all sequence lengths)
1508
+ int total_k_tokens; ///< Total number of key/value tokens
1509
+ int max_seqlen_q; ///< Maximum query sequence length
1510
+ int max_seqlen_k; ///< Maximum key/value sequence length
1511
+ };
1512
+
1513
+ struct AttnMaskParams {
1514
+ int64_t M_strides[3]; ///< Mask strides (B, H, qL, kL = 1)
1515
+ };
1516
+
1517
+ ///////////////////////////////////////////////////////////////////////////////
1518
+ // Attention kernels
1519
+ ///////////////////////////////////////////////////////////////////////////////
1520
+
1521
+ constant bool align_Q [[function_constant(200)]];
1522
+ constant bool align_K [[function_constant(201)]];
1523
+
1524
+ constant bool has_mask [[function_constant(300)]];
1525
+ constant bool do_causal [[function_constant(301)]];
1526
+
1527
+ template <typename T>
1528
+ struct TransformScale {
1529
+ T scale;
1530
+ METAL_FUNC TransformScale(T scale_) : scale(scale_) {}
1531
+
1532
+ METAL_FUNC T apply(T x) const {
1533
+ return scale * x;
1534
+ }
1535
+ };
1536
+
1537
+ struct MaxOp {
1538
+ template <typename T>
1539
+ METAL_FUNC static constexpr T apply(T x, T y) {
1540
+ return metal::max(x, y);
1541
+ }
1542
+ };
1543
+
1544
+ struct SumOp {
1545
+ template <typename T>
1546
+ METAL_FUNC static constexpr T apply(T x, T y) {
1547
+ return x + y;
1548
+ }
1549
+ };
1550
+
1551
+ struct MulOp {
1552
+ template <typename T>
1553
+ METAL_FUNC static constexpr T apply(T x, T y) {
1554
+ return x * y;
1555
+ }
1556
+ };
1557
+
1558
+ struct SubOp {
1559
+ template <typename T>
1560
+ METAL_FUNC static constexpr T apply(T x, T y) {
1561
+ return x - y;
1562
+ }
1563
+ };
1564
+
1565
+ struct ExpSubOp {
1566
+ template <typename T>
1567
+ METAL_FUNC static constexpr T apply(T x, T y) {
1568
+ return fast::exp2(x - y);
1569
+ }
1570
+ };
1571
+
1572
+ struct DivOp {
1573
+ template <typename T>
1574
+ METAL_FUNC static constexpr T apply(T x, T y) {
1575
+ return x / y;
1576
+ }
1577
+ };
1578
+
1579
+ // clang-format off
1580
+ template <
1581
+ typename T,
1582
+ int BQ,
1583
+ int BK,
1584
+ int BD,
1585
+ int WM,
1586
+ int WN,
1587
+ typename MaskType = float,
1588
+ typename AccumType = float>
1589
+ [[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void attention(
1590
+ const device T* Q [[buffer(0)]],
1591
+ const device T* K [[buffer(1)]],
1592
+ const device T* V [[buffer(2)]],
1593
+ device T* O [[buffer(3)]],
1594
+ const constant AttnParams* params [[buffer(4)]],
1595
+ const constant AttnMaskParams* mask_params [[buffer(5), function_constant(has_mask)]],
1596
+ const device MaskType* mask [[buffer(6), function_constant(has_mask)]],
1597
+ const device int* cu_seqlens_q [[buffer(7)]], // Cumulative query sequence lengths
1598
+ const device int* cu_seqlens_k [[buffer(8)]], // Cumulative key sequence lengths
1599
+ uint simd_lane_id [[thread_index_in_simdgroup]],
1600
+ uint simd_group_id [[simdgroup_index_in_threadgroup]],
1601
+ uint3 tid [[threadgroup_position_in_grid]],
1602
+ uint3 lid [[thread_position_in_threadgroup]]) { // clang-format on
1603
+
1604
+ // Pacifying compiler
1605
+ (void)lid;
1606
+
1607
+ // Flash Attention variable-length indexing
1608
+ // tid.z is now the sequence index within the batch
1609
+ int batch_idx = tid.z;
1610
+ int head_idx = tid.y;
1611
+ int block_idx = tid.x;
1612
+
1613
+ // Get sequence boundaries from cumulative lengths
1614
+ int q_seq_start = cu_seqlens_q[batch_idx];
1615
+ int q_seq_end = cu_seqlens_q[batch_idx + 1];
1616
+ int k_seq_start = cu_seqlens_k[batch_idx];
1617
+ int k_seq_end = cu_seqlens_k[batch_idx + 1];
1618
+
1619
+ int q_seq_len = q_seq_end - q_seq_start;
1620
+ int k_seq_len = k_seq_end - k_seq_start;
1621
+
1622
+ // Check if this block is within the sequence
1623
+ if (block_idx * BQ >= q_seq_len) {
1624
+ return;
1625
+ }
1626
+
1627
+ // Calculate offsets in the packed tensor format
1628
+ // Q/O shape: [total_tokens, num_heads, head_dim]
1629
+ // K/V shape: [total_tokens, num_heads_kv, head_dim]
1630
+ int q_offset = q_seq_start + block_idx * BQ;
1631
+ int k_offset = k_seq_start;
1632
+
1633
+ ulong kv_head_idx = head_idx / params->gqa_factor;
1634
+
1635
+ // Move pointers to the correct position in packed format
1636
+ Q += q_offset * params->H * params->D + head_idx * params->D;
1637
+ K += k_offset * (params->H / params->gqa_factor) * params->D + kv_head_idx * params->D;
1638
+ V += k_offset * (params->H / params->gqa_factor) * params->D + kv_head_idx * params->D;
1639
+ O += q_offset * params->H * params->D + head_idx * params->D;
1640
+
1641
+ if (has_mask) {
1642
+ // Mask indexing would need to be updated based on the mask format
1643
+ mask += batch_idx * mask_params->M_strides[0] +
1644
+ head_idx * mask_params->M_strides[1];
1645
+ }
1646
+
1647
+ // Prepare threadgroup memory
1648
+ constexpr short padQ = 16 / sizeof(T);
1649
+ constexpr short padK = 16 / sizeof(T);
1650
+ constexpr short padV = 16 / sizeof(T);
1651
+
1652
+ constexpr short LDQ_tgp = BD + padQ;
1653
+ constexpr short LDK_tgp = BK + padK;
1654
+ constexpr short LDV_tgp = BD + padV;
1655
+
1656
+ constexpr short tgp_mem_0 = (BK + padK) * (BD);
1657
+ constexpr short tgp_mem_1 = BK * (BD + padV);
1658
+ constexpr short tgp_mem_s = tgp_mem_0 > tgp_mem_1 ? tgp_mem_0 : tgp_mem_1;
1659
+
1660
+ threadgroup T Q_smem[BQ * (BD + padQ)];
1661
+ threadgroup T KV_smem[tgp_mem_s];
1662
+
1663
+ threadgroup T* Qs = Q_smem;
1664
+ threadgroup T* Ks = KV_smem;
1665
+ threadgroup T* Vs = KV_smem;
1666
+
1667
+ // Prepare block loaders
1668
+ using QBlockLoader = BlockLoaderT<
1669
+ /* typename T = */ T,
1670
+ /* short BROWS = */ BQ,
1671
+ /* short BCOLS = */ BD,
1672
+ /* short kDstStrRow = */ LDQ_tgp,
1673
+ /* short kDstStrCol = */ 1,
1674
+ /* short reduction_dim = */ 1,
1675
+ /* short tgp_size = */ WM * WN * 32>;
1676
+
1677
+ // K is loaded transposed
1678
+ using KBlockLoader = BlockLoaderT<
1679
+ /* typename T = */ T,
1680
+ /* short BROWS = */ BK,
1681
+ /* short BCOLS = */ BD,
1682
+ /* short kDstStrRow = */ 1,
1683
+ /* short kDstStrCol = */ LDK_tgp,
1684
+ /* short reduction_dim = */ 0,
1685
+ /* short tgp_size = */ WM * WN * 32>;
1686
+
1687
+ using VBlockLoader = BlockLoaderT<
1688
+ /* typename T = */ T,
1689
+ /* short BROWS = */ BK,
1690
+ /* short BCOLS = */ BD,
1691
+ /* short kDstStrRow = */ LDV_tgp,
1692
+ /* short kDstStrCol = */ 1,
1693
+ /* short reduction_dim = */ 0,
1694
+ /* short tgp_size = */ WM * WN * 32>;
1695
+
1696
+ // For packed tensors, stride between tokens is H * D
1697
+ int q_stride = params->H * params->D;
1698
+ int kv_stride = (params->H / params->gqa_factor) * params->D;
1699
+
1700
+ QBlockLoader loader_q(
1701
+ Q, q_stride, Qs, simd_group_id, simd_lane_id);
1702
+ KBlockLoader loader_k(
1703
+ K, kv_stride, Ks, simd_group_id, simd_lane_id);
1704
+ VBlockLoader loader_v(
1705
+ V, kv_stride, Vs, simd_group_id, simd_lane_id);
1706
+
1707
+ // Apply softcapping adjustment to scale if needed
1708
+ float adjusted_scale = params->scale;
1709
+ if (params->softcapping != 1.0f) {
1710
+ adjusted_scale = params->scale / params->softcapping;
1711
+ }
1712
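+ // Note: 1.44269504089 is log2(e); folding it into the scale lets the
+ // softmax below use fast::exp2 while still computing a natural-base exp.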
+ TransformScale<T> ts(static_cast<T>(adjusted_scale * 1.44269504089));
1713
+
1714
+ // Prepare MMA tiles
1715
+ constexpr short kFragSize = 8; // MMAFrag size
1716
+ using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;
1717
+
1718
+ constexpr int kNWarps = WM * WN;
1719
+ static_assert(
1720
+ BQ >= (kNWarps * kFragSize) && BQ % (kNWarps * kFragSize) == 0,
1721
+ "Each simdgroup must host atleast 1 simdgroup matrix along Q sequence.");
1722
+
1723
+ // Q seq frags per warp
1724
+ constexpr int TQ = BQ / (kNWarps * kFragSize);
1725
+ // KV sequence frags (all warps load the same frags)
1726
+ constexpr int TK = BK / kFragSize;
1727
+ // HeadDim frags (all warps load the same frags)
1728
+ constexpr int TD = BD / kFragSize;
1729
+
1730
+ static_assert(TQ == 1, "Check TQ");
1731
+
1732
+ MMATile<AccumType, TQ, 1, MMAFrag_acc_t> Qtile;
1733
+ MMATile<AccumType, 1, TK, MMAFrag_acc_t> Ktile;
1734
+ MMATile<AccumType, TQ, TK, MMAFrag_acc_t> Stile;
1735
+ MMATile<AccumType, 1, 1, MMAFrag_acc_t> Vtile;
1736
+ MMATile<AccumType, TQ, TD, MMAFrag_acc_t> Otile;
1737
+
1738
+ Otile.clear();
1739
+
1740
+ // Prepare mma tile offsets
1741
+ const short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
1742
+ const short sm = simd_coord.y;
1743
+ const short sn = simd_coord.x;
1744
+ const short tm = kFragSize * TQ * simd_group_id;
1745
+
1746
+ const short Qs_offset = (tm + sm) * LDQ_tgp + sn;
1747
+ const short Ks_offset = sm * LDK_tgp + sn;
1748
+ const short Vs_offset = sm * LDV_tgp + sn;
1749
+
1750
+ constexpr short Qs_tile_stride = kFragSize;
1751
+ constexpr short Ks_tile_stride = kFragSize * LDK_tgp;
1752
+
1753
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1754
+
1755
+ // Load Q blocks and apply scale
1756
+ int q_block_end = min(block_idx * BQ + BQ, q_seq_len);
1757
+ int q_block_size = q_block_end - block_idx * BQ;
1758
+
1759
+ if (q_block_size < BQ) {
1760
+ loader_q.load_safe(short2(BD, q_block_size));
1761
+ } else {
1762
+ loader_q.load_unsafe();
1763
+ }
1764
+ loader_q.apply_inplace_op(ts);
1765
+
1766
+ // Init row reduction variables
1767
+ constexpr short kRowsPT = decltype(Stile)::kRowsPerThread;
1768
+
1769
+ AccumType max_score[kRowsPT];
1770
+ AccumType sum_score[kRowsPT] = {0};
1771
+
1772
+ // Init to -Inf
1773
+ STEEL_PRAGMA_UNROLL
1774
+ for (short i = 0; i < kRowsPT; ++i) {
1775
+ max_score[i] = Limits<AccumType>::min;
1776
+ }
1777
+
1778
+ // Calculate number of K blocks for this sequence
1779
+ int kb_lim = (k_seq_len + BK - 1) / BK;
1780
+
1781
+ if (do_causal) {
1782
+ // For causal mask, limit to blocks that could affect this query block
1783
+ // Use sequence-local positions, not global offsets
1784
+ int q_block_start_in_seq = block_idx * BQ;
1785
+ int q_block_end_in_seq = q_block_start_in_seq + q_block_size;
1786
+ kb_lim = min(kb_lim, (q_block_end_in_seq + BK - 1) / BK);
1787
+ }
1788
+
1789
+ // Loop over KV seq length
1790
+ for (int kb = 0; kb < kb_lim; kb++) {
1791
+ // Load K block and apply scale
1792
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1793
+
1794
+ int k_block_end = min(kb * BK + BK, k_seq_len);
1795
+ int k_block_size = k_block_end - kb * BK;
1796
+
1797
+ if (k_block_size < BK) {
1798
+ loader_k.load_safe(short2(BD, k_block_size));
1799
+ } else {
1800
+ loader_k.load_unsafe();
1801
+ }
1802
+
1803
+ // Do S = Q @ K.T
1804
+ Stile.clear();
1805
+
1806
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1807
+
1808
+ STEEL_PRAGMA_UNROLL
1809
+ for (short dd = 0; dd < TD; dd++) {
1810
+ simdgroup_barrier(mem_flags::mem_none);
1811
+
1812
+ Qtile.template load<T, 1, 1, LDQ_tgp, 1>(
1813
+ &Qs[Qs_offset + dd * Qs_tile_stride]);
1814
+ Ktile.template load<T, 1, 1, LDK_tgp, 1>(
1815
+ &Ks[Ks_offset + dd * Ks_tile_stride]);
1816
+
1817
+ simdgroup_barrier(mem_flags::mem_none);
1818
+
1819
+ tile_matmad(Stile, Qtile, Ktile, Stile);
1820
+ }
1821
+
1822
+ // Mask out keys beyond the sequence length in the last K block
1823
+ if (k_block_size < BK) {
1824
+ using stile_t = decltype(Stile);
1825
+ using selem_t = typename stile_t::elem_type;
1826
+ constexpr auto neg_inf = -metal::numeric_limits<selem_t>::infinity();
1827
+
1828
+ STEEL_PRAGMA_UNROLL
1829
+ for (short i = 0; i < stile_t::kTileRows; i++) {
1830
+ STEEL_PRAGMA_UNROLL
1831
+ for (short j = 0; j < stile_t::kTileCols; j++) {
1832
+ short col_pos = sn + (j * stile_t::kFragCols);
1833
+ STEEL_PRAGMA_UNROLL
1834
+ for (short jj = 0; jj < stile_t::MMAFrag_t::kElemCols; jj++) {
1835
+ if ((col_pos + jj) >= k_block_size) {
1836
+ Stile.frag_at(i, j)[jj] = neg_inf;
1837
+ }
1838
+ }
1839
+ }
1840
+ }
1841
+ }
1842
+
1843
+ // Mask out if causal
1844
+ if (do_causal) {
1845
+ using stile_t = decltype(Stile);
1846
+ using selem_t = typename stile_t::elem_type;
1847
+ constexpr auto neg_inf = -metal::numeric_limits<selem_t>::infinity();
1848
+
1849
+ STEEL_PRAGMA_UNROLL
1850
+ for (short i = 0; i < stile_t::kTileRows; i++) {
1851
+ // Use sequence-local positions for causal mask
1852
+ const int row_pos_in_seq = block_idx * BQ + tm + sm + (i * stile_t::kFragRows);
1853
+ STEEL_PRAGMA_UNROLL
1854
+ for (short j = 0; j < stile_t::kTileCols; j++) {
1855
+ const int col_pos_in_seq = kb * BK + sn + (j * stile_t::kFragCols);
1856
+ STEEL_PRAGMA_UNROLL
1857
+ for (short jj = 0; jj < stile_t::MMAFrag_t::kElemCols; jj++) {
1858
+ if (row_pos_in_seq < (col_pos_in_seq + jj)) {
1859
+ Stile.frag_at(i, j)[jj] = neg_inf;
1860
+ }
1861
+ }
1862
+ }
1863
+ }
1864
+ }
1865
+
1866
+ // Other masking as needed
1867
+ if (has_mask) {
1868
+ using stile_t = decltype(Stile);
1869
+ using selem_t = typename stile_t::elem_type;
1870
+ constexpr auto neg_inf = -metal::numeric_limits<selem_t>::infinity();
1871
+
1872
+ constexpr bool is_bool = is_same_v<MaskType, bool>;
1873
+ using melem_t = typename metal::conditional_t<is_bool, bool, selem_t>;
1874
+
1875
+ using MMAFrag_mask_t = BaseMMAFrag<melem_t, kFragSize, kFragSize>;
1876
+ using frag_t = typename MMAFrag_mask_t::frag_type;
1877
+
1878
+ STEEL_PRAGMA_UNROLL
1879
+ for (short i = 0; i < stile_t::kTileRows; i++) {
1880
+ // Use sequence-local positions
1881
+ const int row_pos_in_seq = block_idx * BQ + tm + sm + (i * stile_t::kFragRows);
1882
+ STEEL_PRAGMA_UNROLL
1883
+ for (short j = 0; j < stile_t::kTileCols; j++) {
1884
+ const int col_pos_in_seq = kb * BK + sn + (j * stile_t::kFragCols);
1885
+
1886
+ frag_t mfrag;
1887
+
1888
+ MMAFrag_mask_t::load_safe(
1889
+ mfrag,
1890
+ mask,
1891
+ int(mask_params->M_strides[2]),
1892
+ Int<1>{},
1893
+ q_seq_len,
1894
+ k_seq_len,
1895
+ row_pos_in_seq, // Already sequence-local
1896
+ col_pos_in_seq); // Already sequence-local
1897
+
1898
+ STEEL_PRAGMA_UNROLL
1899
+ for (short jj = 0; jj < stile_t::MMAFrag_t::kElemsPerFrag; jj++) {
1900
+ if constexpr (is_bool) {
1901
+ Stile.frag_at(i, j)[jj] =
1902
+ mfrag[jj] ? Stile.frag_at(i, j)[jj] : neg_inf;
1903
+ } else {
1904
+ Stile.frag_at(i, j)[jj] += 1.44269504089 * selem_t(mfrag[jj]);
1905
+ }
1906
+ }
1907
+ }
1908
+ }
1909
+ }
1910
+
1911
+ // Apply softcapping if needed (tanh(score) * softcapping)
1912
+ if (params->softcapping != 1.0f) {
1913
+ using stile_t = decltype(Stile);
1914
+ using selem_t = typename stile_t::elem_type;
1915
+ const selem_t softcapping_val = static_cast<selem_t>(params->softcapping);
1916
+
1917
+ STEEL_PRAGMA_UNROLL
1918
+ for (short i = 0; i < stile_t::kTileRows; i++) {
1919
+ STEEL_PRAGMA_UNROLL
1920
+ for (short j = 0; j < stile_t::kTileCols; j++) {
1921
+ STEEL_PRAGMA_UNROLL
1922
+ for (short jj = 0; jj < stile_t::MMAFrag_t::kElemsPerFrag; jj++) {
1923
+ Stile.frag_at(i, j)[jj] = metal::tanh(Stile.frag_at(i, j)[jj]) * softcapping_val;
1924
+ }
1925
+ }
1926
+ }
1927
+ }
1928
+
1929
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1930
+
1931
+ // Load V blocks
1932
+ if (k_block_size < BK) {
1933
+ loader_v.load_safe(short2(BD, k_block_size));
1934
+ } else {
1935
+ loader_v.load_unsafe();
1936
+ }
1937
+
1938
+ // Do softmax
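+ // Online softmax recurrence implemented below:
+ //   m_new = max(m_old, rowmax(S));   P = exp2(S - m_new)
+ //   l_new = l_old * exp2(m_old - m_new) + rowsum(P)
+ //   O_new = O_old * exp2(m_old - m_new) + P @ V
+ // (exp2 is used because the scores were pre-scaled by log2(e))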
1939
+
1940
+ // Temp variables
1941
+ AccumType new_max[kRowsPT];
1942
+ AccumType factor[kRowsPT];
1943
+ STEEL_PRAGMA_UNROLL
1944
+ for (short i = 0; i < kRowsPT; ++i) {
1945
+ new_max[i] = max_score[i];
1946
+ }
1947
+
1948
+ // Row max
1949
+ Stile.template row_reduce<MaxOp>(new_max);
1950
+
1951
+ // exp(Si - rowmax(Si))
1952
+ Stile.template row_bin_op<ExpSubOp>(new_max);
1953
+
1954
+ // Factor exp(rowmax(Si) - rowmax(Si-1))
1955
+ STEEL_PRAGMA_UNROLL
1956
+ for (short i = 0; i < kRowsPT; ++i) {
1957
+ factor[i] = fast::exp2(max_score[i] - new_max[i]);
1958
+ }
1959
+
1960
+ // Save max for next iteration
1961
+ STEEL_PRAGMA_UNROLL
1962
+ for (short i = 0; i < kRowsPT; ++i) {
1963
+ max_score[i] = new_max[i];
1964
+ }
1965
+
1966
+ // Row Sum
1967
+ AccumType sum_score_tmp[kRowsPT] = {0};
1968
+ Stile.template row_reduce<SumOp>(sum_score_tmp);
1969
+
1970
+ // Update norm
1971
+ STEEL_PRAGMA_UNROLL
1972
+ for (short i = 0; i < kRowsPT; ++i) {
1973
+ sum_score[i] = sum_score[i] * factor[i] + sum_score_tmp[i];
1974
+ }
1975
+
1976
+ // Update O
1977
+ Otile.template row_bin_op<MulOp>(factor);
1978
+
1979
+ // Load V into registers
1980
+ threadgroup_barrier(mem_flags::mem_threadgroup);
1981
+
1982
+ STEEL_PRAGMA_UNROLL
1983
+ for (short iq = 0; iq < TQ; iq++) {
1984
+ STEEL_PRAGMA_UNROLL
1985
+ for (short id = 0; id < TD; id++) {
1986
+ STEEL_PRAGMA_UNROLL
1987
+ for (short ik = 0; ik < TK; ik++) {
1988
+ if constexpr (BD == 128) {
1989
+ simdgroup_barrier(mem_flags::mem_none);
1990
+ }
1991
+
1992
+ const short kk = ik * kFragSize;
1993
+ const short dd = id * kFragSize;
1994
+
1995
+ Vtile.template load<T, 1, 1, LDV_tgp, 1>(
1996
+ &Vs[Vs_offset + kk * LDV_tgp + dd]);
1997
+
1998
+ if constexpr (BD == 128) {
1999
+ simdgroup_barrier(mem_flags::mem_none);
2000
+ }
2001
+
2002
+ MMAFrag_acc_t::mma(
2003
+ Otile.frag_at(iq, id),
2004
+ Stile.frag_at(iq, ik),
2005
+ Vtile.frag_at(0, 0),
2006
+ Otile.frag_at(iq, id));
2007
+ }
2008
+ }
2009
+ }
2010
+
2011
+ // Prepare for next iteration
2012
+ loader_k.next();
2013
+ loader_v.next();
2014
+ }
2015
+
2016
+ // Normalize output
2017
+ Otile.template row_bin_op<DivOp>(sum_score);
2018
+ threadgroup_barrier(mem_flags::mem_none);
2019
+
2020
+ // Store results
2021
+ // O is already pointing to the correct block position from earlier adjustment
2022
+ // Just need to offset within the block for this thread's tile
2023
+ device T* O_tile = O + (tm + sm) * params->H * params->D + sn;
2024
+
2025
+ if (q_block_size < BQ) {
2026
+ // Only store if this thread's tile is within the valid range
2027
+ if ((tm + sm) < q_block_size && sn < BD) {
2028
+ auto dst_tile_dims = short2(BD - sn, q_block_size - (tm + sm));
2029
+ Otile.template store_safe<T, 1, 1>(O_tile, params->H * params->D, dst_tile_dims);
2030
+ }
2031
+ } else {
2032
+ Otile.template store<T, 1, 1>(O_tile, params->H * params->D);
2033
+ }
2034
+ }
2035
+
2036
+ // clang-format off
2037
+
2038
+ // SDPA full instantiations
2039
+
2040
+ // Instantiate a templated kernel.
2041
+ // Extra args are used as template parameters:
2042
+ // e.g. instantiate_kernel(binary_int, binary, a, b) ->
2043
+ // [[host_name(binary_int)]] [kernel] binary<a, b>
2044
+ #define instantiate_kernel(name, func, ...) \
2045
+ template [[host_name( \
2046
+ name)]] [[kernel]] decltype(func<__VA_ARGS__>) func<__VA_ARGS__>;
2047
+
2048
+ #define instantiate_attn(tname, dtype, bq, bk, bd, wm, wn, mname, mtype) \
2049
+ instantiate_kernel( \
2050
+ "steel_attention_" #tname "_bq" #bq "_bk" #bk "_bd" #bd \
2051
+ "_wm" #wm "_wn" #wn "_mask" #mname, \
2052
+ attention, dtype, bq, bk, bd, wm, wn, mtype, float)
2053
+
2054
+ #define instantiate_attn_shapes_helper(iname, itype, mname, mtype) \
2055
+ instantiate_attn(iname, itype, 16, 8, 256, 2, 1, mname, mtype) \
2056
+ instantiate_attn(iname, itype, 32, 16, 128, 4, 1, mname, mtype) \
2057
+ instantiate_attn(iname, itype, 32, 32, 96, 4, 1, mname, mtype) \
2058
+ instantiate_attn(iname, itype, 32, 32, 80, 4, 1, mname, mtype) \
2059
+ instantiate_attn(iname, itype, 32, 32, 72, 4, 1, mname, mtype) \
2060
+ instantiate_attn(iname, itype, 32, 32, 64, 4, 1, mname, mtype) \
2061
+ instantiate_attn(iname, itype, 32, 32, 32, 4, 1, mname, mtype)
2062
+
2063
+ #define instantiate_attn_mask_helper(iname, itype) \
2064
+ instantiate_attn_shapes_helper(iname, itype, iname, itype) \
2065
+ instantiate_attn_shapes_helper(iname, itype, bool_, bool)
2066
+
2067
+ instantiate_attn_mask_helper(float16, half);
2068
+ instantiate_attn_mask_helper(bfloat16, bfloat16_t);
2069
+ instantiate_attn_mask_helper(float32, float);
2070
+
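The instantiation macros above stamp out one `[[host_name(...)]]` entry point per dtype, block shape, and mask type. A minimal sketch of the naming scheme they produce (the helper below is illustrative only, not part of the extension's API):

```python
# Reproduces the host_name pattern built by instantiate_attn above (sketch only).
def steel_attention_kernel_name(dtype, bq, bk, bd, wm, wn, mask="bool_"):
    return f"steel_attention_{dtype}_bq{bq}_bk{bk}_bd{bd}_wm{wm}_wn{wn}_mask{mask}"

# The float16, head_dim=128 variant instantiated above:
print(steel_attention_kernel_name("float16", 32, 16, 128, 4, 1))
# steel_attention_float16_bq32_bk16_bd128_wm4_wn1_maskbool_
```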
sdpa-metal/scaled_dot_product_attention.mm ADDED
@@ -0,0 +1,330 @@
1
+ #include <ATen/mps/MPSDevice.h>
2
+ #include <ATen/mps/MPSStream.h>
3
+ #include <torch/torch.h>
4
+
5
+ #import <Foundation/Foundation.h>
6
+ #import <Metal/Metal.h>
7
+ #include <algorithm>
8
+ #include <dlfcn.h>
9
+ #include <string>
10
+ #include <vector>
11
+
12
+ static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor &tensor) {
13
+ return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
14
+ }
15
+
16
+ static std::string getModuleDirectory() {
17
+ Dl_info dl_info;
18
+ if (dladdr((void *)getModuleDirectory, &dl_info)) {
19
+ std::string path(dl_info.dli_fname);
20
+ size_t pos = path.find_last_of('/');
21
+ if (pos != std::string::npos) {
22
+ return path.substr(0, pos);
23
+ }
24
+ }
25
+ return ".";
26
+ }
27
+
28
+ // Helper function to get dtype string
29
+ static std::string getDtypeString(torch::ScalarType dtype) {
30
+ switch (dtype) {
31
+ case torch::kFloat:
32
+ return "float32";
33
+ case torch::kHalf:
34
+ return "float16";
35
+ case torch::kBFloat16:
36
+ return "bfloat16";
37
+ default:
38
+ TORCH_CHECK(false, "Unsupported dtype for SDPA: ", dtype);
39
+ }
40
+ }
41
+
42
+ // Helper function to get dtype string for kernel names
43
+ static std::string getKernelDtypeString(torch::ScalarType dtype) {
44
+ switch (dtype) {
45
+ case torch::kFloat:
46
+ return "float32"; // Match the instantiation names
47
+ case torch::kHalf:
48
+ return "float16";
49
+ case torch::kBFloat16:
50
+ return "bfloat16";
51
+ default:
52
+ TORCH_CHECK(false, "Unsupported dtype for SDPA: ", dtype);
53
+ }
54
+ }
55
+
56
+
57
+ // Parameters structure matching Flash Attention's AttnParams
58
+ struct AttnParams {
59
+ int32_t B; // batch size
60
+ int32_t H; // number of heads
61
+ int32_t D; // head dimension
62
+ int32_t qL; // query sequence length (per sequence)
63
+ int32_t kL; // key sequence length (per sequence)
64
+ int32_t gqa_factor; // grouped query attention factor
65
+ float scale; // attention scale
66
+ float softcapping; // softcapping value (1.0 for no softcapping)
67
+ int32_t NQ; // number of query blocks
68
+ int32_t NK; // number of key blocks
69
+ int32_t NQ_aligned; // aligned query blocks
70
+ int32_t NK_aligned; // aligned key blocks
71
+ int32_t qL_rem; // remainder query length
72
+ int32_t kL_rem; // remainder key length
73
+ int32_t qL_off; // query offset
74
+ int64_t Q_strides[3]; // query tensor strides
75
+ int64_t K_strides[3]; // key tensor strides
76
+ int64_t V_strides[3]; // value tensor strides
77
+ int64_t O_strides[3]; // output tensor strides
78
+
79
+ // Flash Attention variable-length support
80
+ int32_t total_q_tokens; // Total number of query tokens
81
+ int32_t total_k_tokens; // Total number of key/value tokens
82
+ int32_t max_seqlen_q; // Maximum query sequence length
83
+ int32_t max_seqlen_k; // Maximum key/value sequence length
84
+ };
85
+
86
+ // Forward declarations for kernel implementations
87
+ void call_flash_attention_varlen(
88
+ id<MTLDevice> device,
89
+ id<MTLCommandBuffer> cmdBuf,
90
+ id<MTLLibrary> lib,
91
+ torch::Tensor &out,
92
+ torch::Tensor &query,
93
+ torch::Tensor &key,
94
+ torch::Tensor &value,
95
+ torch::Tensor &cu_seqlens_q,
96
+ torch::Tensor &cu_seqlens_k,
97
+ int64_t max_seqlen_q,
98
+ int64_t max_seqlen_k,
99
+ bool do_causal,
100
+ double scale,
101
+ double softcapping);
102
+
103
+
104
+ void flash_attention_varlen(
105
+ torch::Tensor &out, // [total_q_tokens, num_heads, head_size]
106
+ torch::Tensor &query, // [total_q_tokens, num_heads, head_size]
107
+ torch::Tensor &key, // [total_k_tokens, num_heads_kv, head_size]
108
+ torch::Tensor &value, // [total_k_tokens, num_heads_kv, head_size]
109
+ torch::Tensor &cu_seqlens_q, // [batch_size + 1]
110
+ torch::Tensor &cu_seqlens_k, // [batch_size + 1]
111
+ int64_t max_seqlen_q, // Maximum query sequence length
112
+ int64_t max_seqlen_k, // Maximum key sequence length
113
+ bool do_causal, // Whether to use causal mask
114
+ double scale, // Attention scale
115
+ double softcapping) { // Softcapping value
116
+
117
+ try {
118
+ // Get device and stream
119
+ id<MTLDevice> device = at::mps::MPSDevice::getInstance()->device();
120
+ at::mps::MPSStream *stream = at::mps::getCurrentMPSStream();
121
+ TORCH_CHECK(stream, "Failed to get current MPS stream");
122
+
123
+ // Get dimensions from Flash Attention format
124
+ int64_t total_q_tokens = query.size(0);
125
+ int64_t num_heads = query.size(1);
126
+ int64_t head_dim = query.size(2);
127
+ int64_t num_heads_kv = key.size(1);
128
+ int64_t batch_size = cu_seqlens_q.size(0) - 1; // cu_seqlens has batch_size + 1 elements
129
+
130
+ // Check if we support this head dimension
131
+ std::vector<int> supported_head_dims = {32, 64, 72, 80, 96, 128, 256};
132
+ bool supported_head_dim = std::find(supported_head_dims.begin(),
133
+ supported_head_dims.end(),
134
+ head_dim) != supported_head_dims.end();
135
+
136
+ TORCH_CHECK(supported_head_dim, "Head dimension ", head_dim, " is not supported");
137
+ TORCH_CHECK(cu_seqlens_q.size(0) == cu_seqlens_k.size(0),
138
+ "cu_seqlens_q and cu_seqlens_k must have the same size");
139
+
140
+ // Load Metal library
141
+ static id<MTLLibrary> lib = nil;
142
+ if (!lib) {
143
+ NSError *error = nil;
144
+ NSString *path = [NSString stringWithFormat:@"%s/" METALLIB_PATH,
145
+ getModuleDirectory().c_str()];
146
+ NSURL *url = [NSURL fileURLWithPath:path];
147
+ lib = [device newLibraryWithURL:url error:&error];
148
+ TORCH_CHECK(lib, "Failed to load Metal library: ",
149
+ error ? error.localizedDescription.UTF8String : "unknown error");
150
+ }
151
+
152
+ // Get command buffer
153
+ id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
154
+ TORCH_CHECK(cmdBuf, "Failed to get MPS command buffer");
155
+
156
+ // For variable-length Flash Attention, always use the full attention kernel
157
+
158
+ // Call the Flash Attention kernel
159
+ call_flash_attention_varlen(device, cmdBuf, lib, out, query, key, value,
160
+ cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
161
+ do_causal, scale, softcapping);
162
+ } catch (const std::exception& e) {
163
+ throw;
164
+ } catch (...) {
165
+ throw;
166
+ }
167
+ }
168
+
169
+ // Implementation of Flash Attention variable-length kernel
170
+ void call_flash_attention_varlen(
171
+ id<MTLDevice> device,
172
+ id<MTLCommandBuffer> cmdBuf,
173
+ id<MTLLibrary> lib,
174
+ torch::Tensor &out,
175
+ torch::Tensor &query,
176
+ torch::Tensor &key,
177
+ torch::Tensor &value,
178
+ torch::Tensor &cu_seqlens_q,
179
+ torch::Tensor &cu_seqlens_k,
180
+ int64_t max_seqlen_q,
181
+ int64_t max_seqlen_k,
182
+ bool do_causal,
183
+ double scale,
184
+ double softcapping) {
185
+
186
+ // Get dimensions
187
+ int64_t total_q_tokens = query.size(0);
188
+ int64_t num_heads = query.size(1);
189
+ int64_t head_dim = query.size(2);
190
+ int64_t num_heads_kv = key.size(1);
191
+ int64_t batch_size = cu_seqlens_q.size(0) - 1;
192
+
193
+ // Grouped Query Attention factor
194
+ int32_t gqa_factor = num_heads / num_heads_kv;
195
+
196
+ // Block sizes based on head dimension
197
+ const int BQ = (head_dim == 256) ? 16 : 32; // Use BQ=16 for head_dim=256
198
+ const int bk = (head_dim == 256) ? 8 : ((head_dim >= 128) ? 16 : 32); // Use bk=8 for head_dim=256
199
+ const int WM = (head_dim == 256) ? 2 : 4; // Use WM=2 for head_dim=256
200
+ const int WN = 1;
201
+
202
+ // Setup parameters
203
+ AttnParams params = {}; // Zero-initialize all fields
204
+ params.B = batch_size;
205
+ params.H = num_heads;
206
+ params.D = head_dim;
207
+ params.gqa_factor = gqa_factor;
208
+ params.scale = static_cast<float>(scale);
209
+ params.softcapping = static_cast<float>(softcapping);
210
+ params.total_q_tokens = total_q_tokens;
211
+ params.total_k_tokens = key.size(0);
212
+ params.max_seqlen_q = max_seqlen_q;
213
+ params.max_seqlen_k = max_seqlen_k;
214
+
215
+ // Initialize fields that might be checked but aren't used in Flash Attention
216
+ params.qL = 0; // Not used in variable-length attention
217
+ params.kL = 0; // Not used in variable-length attention
218
+ params.NQ = 0; // Not used
219
+ params.NK = 0; // Not used
220
+ params.NQ_aligned = 0;
221
+ params.NK_aligned = 0;
222
+ params.qL_rem = 0;
223
+ params.kL_rem = 0;
224
+ params.qL_off = 0;
225
+
226
+ // Strides are not used for packed tensors (contiguous)
227
+ params.Q_strides[0] = 0;
228
+ params.Q_strides[1] = 0;
229
+ params.Q_strides[2] = 0;
230
+ params.K_strides[0] = 0;
231
+ params.K_strides[1] = 0;
232
+ params.K_strides[2] = 0;
233
+ params.V_strides[0] = 0;
234
+ params.V_strides[1] = 0;
235
+ params.V_strides[2] = 0;
236
+ params.O_strides[0] = 0;
237
+ params.O_strides[1] = 0;
238
+ params.O_strides[2] = 0;
239
+
240
+ // For variable-length attention, we'll process each sequence separately
241
+ // The kernel will handle the cu_seqlens internally
242
+
243
+ bool has_mask = false; // Masks are not supported in Flash Attention
244
+
245
+ // Setup function constants
246
+ MTLFunctionConstantValues *constants = [MTLFunctionConstantValues new];
247
+ [constants setConstantValue:&has_mask type:MTLDataTypeBool atIndex:300];
248
+ [constants setConstantValue:&do_causal type:MTLDataTypeBool atIndex:301];
249
+
250
+ // Construct kernel name based on data type and head dimension
251
+ std::string kernel_name = "steel_attention_";
252
+ kernel_name += getKernelDtypeString(query.scalar_type());
253
+ kernel_name += "_bq" + std::to_string(BQ);
254
+ kernel_name += "_bk" + std::to_string(bk);
255
+ kernel_name += "_bd" + std::to_string(head_dim);
256
+ kernel_name += "_wm" + std::to_string(WM) + "_wn" + std::to_string(WN);
257
+ kernel_name += "_maskbool_"; // Always use bool for mask type (no masks supported)
258
+
259
+ // Get kernel function
260
+ NSError *error = nil;
261
+ id<MTLFunction> function = [lib newFunctionWithName:[NSString stringWithUTF8String:kernel_name.c_str()]
262
+ constantValues:constants
263
+ error:&error];
264
+ TORCH_CHECK(function, "Failed to get Metal function: ", kernel_name,
265
+ " Error: ", error ? error.localizedDescription.UTF8String : "unknown");
266
+
267
+ // Create compute pipeline
268
+ id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:function error:&error];
269
+ TORCH_CHECK(pipeline, "Failed to create compute pipeline: ",
270
+ error ? error.localizedDescription.UTF8String : "unknown");
271
+
272
+ // Setup command encoder with dispatch sync
273
+ at::mps::MPSStream *stream = at::mps::getCurrentMPSStream();
274
+ dispatch_queue_t q = stream->queue();
275
+ dispatch_sync(q, ^{
276
+ id<MTLComputeCommandEncoder> encoder = [cmdBuf computeCommandEncoder];
277
+ TORCH_CHECK(encoder, "Failed to create compute encoder");
278
+
279
+ [encoder setComputePipelineState:pipeline];
280
+
281
+ // Set buffers
282
+ int buffer_idx = 0;
283
+
284
+ // Query buffer - index 0
285
+ [encoder setBuffer:getMTLBufferStorage(query)
286
+ offset:query.storage_offset() * query.element_size()
287
+ atIndex:buffer_idx++];
288
+
289
+ // Key buffer - index 1
290
+ [encoder setBuffer:getMTLBufferStorage(key)
291
+ offset:key.storage_offset() * key.element_size()
292
+ atIndex:buffer_idx++];
293
+
294
+ // Value buffer - index 2
295
+ [encoder setBuffer:getMTLBufferStorage(value)
296
+ offset:value.storage_offset() * value.element_size()
297
+ atIndex:buffer_idx++];
298
+
299
+ // Output buffer - index 3
300
+ [encoder setBuffer:getMTLBufferStorage(out)
301
+ offset:out.storage_offset() * out.element_size()
302
+ atIndex:buffer_idx++];
303
+
304
+ // Parameters - index 4
305
+ [encoder setBytes:&params length:sizeof(AttnParams) atIndex:buffer_idx++];
306
+
307
+ // Skip mask parameters - indices 5 and 6 (masks not supported)
308
+ buffer_idx += 2;
309
+
310
+ // Set cu_seqlens buffers - indices 7 and 8
311
+ [encoder setBuffer:getMTLBufferStorage(cu_seqlens_q)
312
+ offset:cu_seqlens_q.storage_offset() * cu_seqlens_q.element_size()
313
+ atIndex:7];
314
+ [encoder setBuffer:getMTLBufferStorage(cu_seqlens_k)
315
+ offset:cu_seqlens_k.storage_offset() * cu_seqlens_k.element_size()
316
+ atIndex:8];
317
+
318
+ // Calculate grid dimensions
319
+ // We need to process each sequence independently
320
+ int64_t max_blocks_q = (max_seqlen_q + BQ - 1) / BQ;
321
+
322
+ MTLSize gridSize = MTLSizeMake(max_blocks_q, num_heads, batch_size);
323
+ MTLSize threadgroupSize = MTLSizeMake(32, WM, WN);
324
+
325
+ [encoder dispatchThreadgroups:gridSize threadsPerThreadgroup:threadgroupSize];
326
+ [encoder endEncoding];
327
+
328
+ stream->synchronize(at::mps::SyncType::COMMIT);
329
+ });
330
+ }
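A quick sketch of the launch geometry the host code above sets up: one threadgroup per (query block, head, sequence), with block sizes chosen from the head dimension. This is a plain-Python illustration assuming the same selection logic as `call_flash_attention_varlen`; the example numbers are arbitrary.

```python
# Mirrors the block-size selection and grid computation in call_flash_attention_varlen.
def dispatch_geometry(max_seqlen_q, num_heads, batch_size, head_dim):
    BQ = 16 if head_dim == 256 else 32             # query block size
    WM = 2 if head_dim == 256 else 4               # simdgroups along query rows
    WN = 1
    max_blocks_q = (max_seqlen_q + BQ - 1) // BQ   # ceil division
    grid = (max_blocks_q, num_heads, batch_size)   # MTLSize gridSize
    threadgroup = (32, WM, WN)                     # 32 threads per simdgroup
    return grid, threadgroup

print(dispatch_geometry(max_seqlen_q=100, num_heads=8, batch_size=3, head_dim=128))
# ((4, 8, 3), (32, 4, 1))
```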
tests/__init__.py ADDED
File without changes
tests/test_flash_attention.py ADDED
@@ -0,0 +1,1132 @@
1
+ import torch
2
+ import pytest
3
+ import sdpa_flash
4
+
5
+
6
+ def create_cu_seqlens(seq_lengths):
7
+ """Create cumulative sequence lengths tensor."""
8
+ cu_seqlens = [0]
9
+ for length in seq_lengths:
10
+ cu_seqlens.append(cu_seqlens[-1] + length)
11
+ return torch.tensor(cu_seqlens, dtype=torch.int32, device="mps")
12
+
13
+
14
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
15
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
16
+ def test_flash_attention_single_sequence(dtype, head_dim):
17
+ """Test Flash Attention with a single sequence."""
18
+ torch.manual_seed(42)
19
+
20
+ # Single sequence
21
+ seq_len = 32
22
+ num_heads = 4
23
+
24
+ # Create cumulative sequence lengths
25
+ cu_seqlens = create_cu_seqlens([seq_len])
26
+
27
+ # Create input tensors in Flash Attention format
28
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
29
+ key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
30
+ value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
31
+
32
+ # Scale factor
33
+ scale = 1.0 / (head_dim ** 0.5)
34
+
35
+ # Call Flash Attention
36
+ out = torch.empty_like(query)
37
+ sdpa_flash.flash_attention_varlen(
38
+ out=out,
39
+ query=query,
40
+ key=key,
41
+ value=value,
42
+ cu_seqlens_q=cu_seqlens,
43
+ cu_seqlens_k=cu_seqlens,
44
+ max_seqlen_q=seq_len,
45
+ max_seqlen_k=seq_len,
46
+ do_causal=False,
47
+ scale=scale,
48
+ softcapping=1.0,
49
+ )
50
+
51
+ # Compute ground truth
52
+ # Flash Attention computes attention separately for each head
53
+ expected = torch.zeros_like(out)
54
+ for h in range(num_heads):
55
+ q_h = query[:, h, :] # [seq_len, head_dim]
56
+ k_h = key[:, h, :]
57
+ v_h = value[:, h, :]
58
+
59
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
60
+ attn_weights = torch.softmax(scores, dim=-1)
61
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
62
+
63
+ # Check results (higher tolerance for bfloat16 and float16)
64
+ if dtype == torch.bfloat16:
65
+ # Higher tolerance for head_dim=128 with bfloat16
66
+ rtol, atol = (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
67
+ elif dtype == torch.float16:
68
+ rtol, atol = 2e-3, 2e-3
69
+ else:
70
+ rtol, atol = 1e-3, 1e-3
71
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
72
+
73
+
74
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
75
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
76
+ def test_flash_attention_variable_lengths(dtype, head_dim):
77
+ """Test Flash Attention with variable-length sequences."""
78
+ torch.manual_seed(42)
79
+
80
+ # Variable sequence lengths
81
+ seq_lengths_q = [8, 16, 12]
82
+ seq_lengths_k = [10, 20, 15]
83
+ batch_size = len(seq_lengths_q)
84
+ num_heads = 4
85
+
86
+ # Create cumulative sequence lengths
87
+ cu_seqlens_q = create_cu_seqlens(seq_lengths_q)
88
+ cu_seqlens_k = create_cu_seqlens(seq_lengths_k)
89
+
90
+ total_q = sum(seq_lengths_q)
91
+ total_k = sum(seq_lengths_k)
92
+ max_seqlen_q = max(seq_lengths_q)
93
+ max_seqlen_k = max(seq_lengths_k)
94
+
95
+ # Create input tensors
96
+ query = torch.randn(total_q, num_heads, head_dim, dtype=dtype, device="mps")
97
+ key = torch.randn(total_k, num_heads, head_dim, dtype=dtype, device="mps")
98
+ value = torch.randn(total_k, num_heads, head_dim, dtype=dtype, device="mps")
99
+
100
+ # Scale factor
101
+ scale = 1.0 / (head_dim ** 0.5)
102
+
103
+ # Call Flash Attention
104
+ out = torch.empty_like(query)
105
+ sdpa_flash.flash_attention_varlen(
106
+ out=out,
107
+ query=query,
108
+ key=key,
109
+ value=value,
110
+ cu_seqlens_q=cu_seqlens_q,
111
+ cu_seqlens_k=cu_seqlens_k,
112
+ max_seqlen_q=max_seqlen_q,
113
+ max_seqlen_k=max_seqlen_k,
114
+ do_causal=False,
115
+ scale=scale,
116
+ softcapping=1.0,
117
+ )
118
+
119
+ # Compute ground truth for each sequence
120
+ expected = torch.zeros_like(out)
121
+ for i in range(batch_size):
122
+ q_start, q_end = cu_seqlens_q[i].item(), cu_seqlens_q[i+1].item()
123
+ k_start, k_end = cu_seqlens_k[i].item(), cu_seqlens_k[i+1].item()
124
+
125
+ q_i = query[q_start:q_end]
126
+ k_i = key[k_start:k_end]
127
+ v_i = value[k_start:k_end]
128
+
129
+ # Compute attention for each head separately
130
+ for h in range(num_heads):
131
+ q_h = q_i[:, h, :] # [seq_len, head_dim]
132
+ k_h = k_i[:, h, :]
133
+ v_h = v_i[:, h, :]
134
+
135
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
136
+ attn_weights = torch.softmax(scores, dim=-1)
137
+ expected[q_start:q_end, h, :] = torch.matmul(attn_weights, v_h)
138
+
139
+ # Check results (higher tolerance for bfloat16 and float16)
140
+ if dtype == torch.bfloat16:
141
+ # Higher tolerance for bfloat16 with variable length sequences
142
+ rtol, atol = 2e-2, 2e-2
143
+ elif dtype == torch.float16:
144
+ rtol, atol = 2e-3, 2e-3
145
+ else:
146
+ rtol, atol = 1e-3, 1e-3
147
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
148
+
149
+
150
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
151
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
152
+ def test_flash_attention_causal(dtype, head_dim):
153
+ """Test Flash Attention with causal masking."""
154
+ torch.manual_seed(42)
155
+
156
+ # Test dimensions
157
+ seq_lengths = [16, 24]
158
+ batch_size = len(seq_lengths)
159
+ num_heads = 4
160
+
161
+ # Create cumulative sequence lengths
162
+ cu_seqlens = create_cu_seqlens(seq_lengths)
163
+ total_tokens = sum(seq_lengths)
164
+ max_seqlen = max(seq_lengths)
165
+
166
+ # Create input tensors
167
+ query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
168
+ key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
169
+ value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
170
+
171
+ # Scale factor
172
+ scale = 1.0 / (head_dim ** 0.5)
173
+
174
+ # Call Flash Attention with causal mask
175
+ out = torch.empty_like(query)
176
+ sdpa_flash.flash_attention_varlen(
177
+ out=out,
178
+ query=query,
179
+ key=key,
180
+ value=value,
181
+ cu_seqlens_q=cu_seqlens,
182
+ cu_seqlens_k=cu_seqlens,
183
+ max_seqlen_q=max_seqlen,
184
+ max_seqlen_k=max_seqlen,
185
+ do_causal=True,
186
+ scale=scale,
187
+ softcapping=1.0,
188
+ )
189
+
190
+ # Compute ground truth with causal mask
191
+ expected = torch.zeros_like(out)
192
+ for i in range(batch_size):
193
+ start, end = cu_seqlens[i].item(), cu_seqlens[i+1].item()
194
+ seq_len = end - start
195
+
196
+ q_i = query[start:end]
197
+ k_i = key[start:end]
198
+ v_i = value[start:end]
199
+
200
+ # Compute attention for each head separately
201
+ for h in range(num_heads):
202
+ q_h = q_i[:, h, :] # [seq_len, head_dim]
203
+ k_h = k_i[:, h, :]
204
+ v_h = v_i[:, h, :]
205
+
206
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
207
+
208
+ # Apply causal mask
209
+ causal_mask = torch.triu(torch.ones(seq_len, seq_len, device="mps"), diagonal=1).bool()
210
+ scores.masked_fill_(causal_mask, float("-inf"))
211
+
212
+ attn_weights = torch.softmax(scores, dim=-1)
213
+ expected[start:end, h, :] = torch.matmul(attn_weights, v_h)
214
+
215
+ # Check results (higher tolerance for bfloat16 and float16)
216
+ if dtype == torch.bfloat16:
217
+ # Higher tolerance for head_dim=128 with bfloat16
218
+ rtol, atol = (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
219
+ elif dtype == torch.float16:
220
+ rtol, atol = 2e-3, 2e-3
221
+ else:
222
+ rtol, atol = 1e-3, 1e-3
223
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
224
+
225
+
226
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
227
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
228
+ def test_flash_attention_gqa(dtype, head_dim):
229
+ """Test Flash Attention with Grouped Query Attention."""
230
+ torch.manual_seed(42)
231
+
232
+ # Test dimensions
233
+ seq_len = 32
234
+ num_heads = 8
235
+ num_kv_heads = 2 # GQA with 4:1 ratio
236
+
237
+ # Create cumulative sequence lengths
238
+ cu_seqlens = create_cu_seqlens([seq_len])
239
+
240
+ # Create input tensors
241
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
242
+ key = torch.randn(seq_len, num_kv_heads, head_dim, dtype=dtype, device="mps")
243
+ value = torch.randn(seq_len, num_kv_heads, head_dim, dtype=dtype, device="mps")
244
+
245
+ # Scale factor
246
+ scale = 1.0 / (head_dim ** 0.5)
247
+
248
+ # Call Flash Attention
249
+ out = torch.empty_like(query)
250
+ sdpa_flash.flash_attention_varlen(
251
+ out=out,
252
+ query=query,
253
+ key=key,
254
+ value=value,
255
+ cu_seqlens_q=cu_seqlens,
256
+ cu_seqlens_k=cu_seqlens,
257
+ max_seqlen_q=seq_len,
258
+ max_seqlen_k=seq_len,
259
+ do_causal=False,
260
+ scale=scale,
261
+ softcapping=1.0,
262
+ )
263
+
264
+ # Compute ground truth with GQA
265
+ # Each query head attends to its corresponding kv head (with repetition)
266
+ expected = torch.zeros_like(query)
267
+ gqa_factor = num_heads // num_kv_heads
268
+
269
+ for h in range(num_heads):
270
+ kv_h = h // gqa_factor
271
+ q_h = query[:, h, :] # [seq_len, head_dim]
272
+ k_h = key[:, kv_h, :]
273
+ v_h = value[:, kv_h, :]
274
+
275
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1)) * scale
276
+ attn_weights = torch.softmax(scores, dim=-1)
277
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
278
+
279
+ # Check results (higher tolerance for bfloat16 and float16)
280
+ if dtype == torch.bfloat16:
281
+ # Higher tolerance for bfloat16 when head_dim >= 96
282
+ rtol, atol = (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
283
+ elif dtype == torch.float16:
284
+ rtol, atol = 2e-3, 2e-3
285
+ else:
286
+ rtol, atol = 1e-3, 1e-3
287
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
288
+
289
+
290
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
291
+ def test_flash_attention_head_dimensions(head_dim):
292
+ """Test Flash Attention with different supported head dimensions."""
293
+ torch.manual_seed(42)
294
+
295
+ # Test dimensions
296
+ seq_len = 16
297
+ num_heads = 4
298
+
299
+ # Create cumulative sequence lengths
300
+ cu_seqlens = create_cu_seqlens([seq_len])
301
+
302
+ # Create input tensors
303
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
304
+ key = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
305
+ value = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
306
+
307
+ # Scale factor
308
+ scale = 1.0 / (head_dim ** 0.5)
309
+
310
+ # Call Flash Attention
311
+ out = torch.empty_like(query)
312
+ sdpa_flash.flash_attention_varlen(
313
+ out=out,
314
+ query=query,
315
+ key=key,
316
+ value=value,
317
+ cu_seqlens_q=cu_seqlens,
318
+ cu_seqlens_k=cu_seqlens,
319
+ max_seqlen_q=seq_len,
320
+ max_seqlen_k=seq_len,
321
+ do_causal=False,
322
+ scale=scale,
323
+ softcapping=1.0,
324
+ )
325
+
326
+ # Basic check that output is not zeros
327
+ assert out.abs().max().item() > 0
328
+
329
+
330
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
331
+ def test_flash_attention_large_head_dim(dtype):
332
+ """Test Flash Attention with head_dim=128 specifically."""
333
+ torch.manual_seed(42)
334
+
335
+ # Test dimensions with head_dim=128
336
+ seq_lengths = [32, 64]
337
+ batch_size = len(seq_lengths)
338
+ num_heads = 8
339
+ head_dim = 128
340
+
341
+ # Create cumulative sequence lengths
342
+ cu_seqlens = create_cu_seqlens(seq_lengths)
343
+ total_tokens = sum(seq_lengths)
344
+ max_seqlen = max(seq_lengths)
345
+
346
+ # Create input tensors
347
+ query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
348
+ key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
349
+ value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
350
+
351
+ # Scale factor
352
+ scale = 1.0 / (head_dim ** 0.5)
353
+
354
+ # Call Flash Attention
355
+ out = torch.empty_like(query)
356
+ sdpa_flash.flash_attention_varlen(
357
+ out=out,
358
+ query=query,
359
+ key=key,
360
+ value=value,
361
+ cu_seqlens_q=cu_seqlens,
362
+ cu_seqlens_k=cu_seqlens,
363
+ max_seqlen_q=max_seqlen,
364
+ max_seqlen_k=max_seqlen,
365
+ do_causal=False,
366
+ scale=scale,
367
+ softcapping=1.0,
368
+ )
369
+
370
+ # Compute ground truth
371
+ expected = torch.zeros_like(out)
372
+ for i in range(batch_size):
373
+ start, end = cu_seqlens[i].item(), cu_seqlens[i+1].item()
374
+
375
+ q_i = query[start:end]
376
+ k_i = key[start:end]
377
+ v_i = value[start:end]
378
+
379
+ # Compute attention for each head separately
380
+ for h in range(num_heads):
381
+ q_h = q_i[:, h, :] # [seq_len, head_dim]
382
+ k_h = k_i[:, h, :]
383
+ v_h = v_i[:, h, :]
384
+
385
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
386
+ attn_weights = torch.softmax(scores, dim=-1)
387
+ expected[start:end, h, :] = torch.matmul(attn_weights, v_h)
388
+
389
+ # Check results (higher tolerance for bfloat16 with head_dim=128)
390
+ if dtype == torch.bfloat16:
391
+ # bfloat16 with head_dim=128 has known precision issues
392
+ rtol, atol = 2e-2, 2e-2
393
+ elif dtype == torch.float16:
394
+ rtol, atol = 2e-3, 2e-3
395
+ else:
396
+ rtol, atol = 1e-3, 1e-3
397
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
398
+
399
+
400
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
401
+ def test_flash_attention_large_head_dim_causal(dtype):
402
+ """Test Flash Attention with head_dim=128 and causal masking."""
403
+ torch.manual_seed(42)
404
+
405
+ # Test dimensions
406
+ seq_len = 48
407
+ num_heads = 4
408
+ head_dim = 128
409
+
410
+ # Create cumulative sequence lengths
411
+ cu_seqlens = create_cu_seqlens([seq_len])
412
+
413
+ # Create input tensors
414
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
415
+ key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
416
+ value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
417
+
418
+ # Scale factor
419
+ scale = 1.0 / (head_dim ** 0.5)
420
+
421
+ # Call Flash Attention with causal mask
422
+ out = torch.empty_like(query)
423
+ sdpa_flash.flash_attention_varlen(
424
+ out=out,
425
+ query=query,
426
+ key=key,
427
+ value=value,
428
+ cu_seqlens_q=cu_seqlens,
429
+ cu_seqlens_k=cu_seqlens,
430
+ max_seqlen_q=seq_len,
431
+ max_seqlen_k=seq_len,
432
+ do_causal=True,
433
+ scale=scale,
434
+ softcapping=1.0,
435
+ )
436
+
437
+ # Compute ground truth with causal mask
438
+ expected = torch.zeros_like(out)
439
+
440
+ for h in range(num_heads):
441
+ q_h = query[:, h, :] # [seq_len, head_dim]
442
+ k_h = key[:, h, :]
443
+ v_h = value[:, h, :]
444
+
445
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
446
+
447
+ # Apply causal mask
448
+ causal_mask = torch.triu(torch.ones(seq_len, seq_len, device="mps"), diagonal=1).bool()
449
+ scores.masked_fill_(causal_mask, float("-inf"))
450
+
451
+ attn_weights = torch.softmax(scores, dim=-1)
452
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
453
+
454
+ # Check results (higher tolerance for bfloat16 with head_dim=128)
455
+ if dtype == torch.bfloat16:
456
+ # bfloat16 with head_dim=128 has known precision issues
457
+ rtol, atol = 2e-2, 2e-2
458
+ elif dtype == torch.float16:
459
+ rtol, atol = 2e-3, 2e-3
460
+ else:
461
+ rtol, atol = 1e-3, 1e-3
462
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
463
+
464
+
465
+ def test_flash_attention_large_head_dim_gqa():
466
+ """Test Flash Attention with head_dim=128 and GQA."""
467
+ torch.manual_seed(42)
468
+
469
+ # Test dimensions
470
+ seq_len = 32
471
+ num_heads = 16
472
+ num_kv_heads = 4 # GQA with 4:1 ratio
473
+ head_dim = 128
474
+
475
+ # Create cumulative sequence lengths
476
+ cu_seqlens = create_cu_seqlens([seq_len])
477
+
478
+ # Create input tensors
479
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
480
+ key = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.float32, device="mps")
481
+ value = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.float32, device="mps")
482
+
483
+ # Scale factor
484
+ scale = 1.0 / (head_dim ** 0.5)
485
+
486
+ # Call Flash Attention
487
+ out = torch.empty_like(query)
488
+ sdpa_flash.flash_attention_varlen(
489
+ out=out,
490
+ query=query,
491
+ key=key,
492
+ value=value,
493
+ cu_seqlens_q=cu_seqlens,
494
+ cu_seqlens_k=cu_seqlens,
495
+ max_seqlen_q=seq_len,
496
+ max_seqlen_k=seq_len,
497
+ do_causal=False,
498
+ scale=scale,
499
+ softcapping=1.0,
500
+ )
501
+
502
+ # Compute ground truth with GQA
503
+ expected = torch.zeros_like(query)
504
+ gqa_factor = num_heads // num_kv_heads
505
+
506
+ for h in range(num_heads):
507
+ kv_h = h // gqa_factor
508
+ q_h = query[:, h, :] # [seq_len, head_dim]
509
+ k_h = key[:, kv_h, :]
510
+ v_h = value[:, kv_h, :]
511
+
512
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1)) * scale
513
+ attn_weights = torch.softmax(scores, dim=-1)
514
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
515
+
516
+ # Check results
517
+ torch.testing.assert_close(out, expected, rtol=1e-3, atol=1e-3)
518
+
519
+
520
+ def test_flash_attention_edge_cases():
521
+ """Test Flash Attention edge cases."""
522
+ torch.manual_seed(42)
523
+
524
+ # Test 1: Single token sequence
525
+ query = torch.randn(1, 1, 64, device="mps")
526
+ key = torch.randn(1, 1, 64, device="mps")
527
+ value = torch.randn(1, 1, 64, device="mps")
528
+ cu_seqlens = create_cu_seqlens([1])
529
+ out = torch.empty_like(query)
530
+
531
+ sdpa_flash.flash_attention_varlen(
532
+ out=out,
533
+ query=query,
534
+ key=key,
535
+ value=value,
536
+ cu_seqlens_q=cu_seqlens,
537
+ cu_seqlens_k=cu_seqlens,
538
+ max_seqlen_q=1,
539
+ max_seqlen_k=1,
540
+ do_causal=False,
541
+ scale=0.125,
542
+ softcapping=1.0,
543
+ )
544
+
545
+ # With single token, output should equal value
546
+ torch.testing.assert_close(out, value, rtol=1e-5, atol=1e-5)
547
+
548
+ # Test 2: Empty sequence in batch
549
+ seq_lengths = [8, 0, 12] # Middle sequence is empty
550
+ cu_seqlens = create_cu_seqlens(seq_lengths)
551
+ total_tokens = sum(seq_lengths)
552
+
553
+ query = torch.randn(total_tokens, 4, 64, device="mps")
554
+ key = torch.randn(total_tokens, 4, 64, device="mps")
555
+ value = torch.randn(total_tokens, 4, 64, device="mps")
556
+ out = torch.empty_like(query)
557
+
558
+ # This should handle empty sequences gracefully
559
+ sdpa_flash.flash_attention_varlen(
560
+ out=out,
561
+ query=query,
562
+ key=key,
563
+ value=value,
564
+ cu_seqlens_q=cu_seqlens,
565
+ cu_seqlens_k=cu_seqlens,
566
+ max_seqlen_q=max(seq_lengths) if seq_lengths else 0,
567
+ max_seqlen_k=max(seq_lengths) if seq_lengths else 0,
568
+ do_causal=False,
569
+ scale=0.125,
570
+ softcapping=1.0,
571
+ )
572
+
573
+
574
+ def test_flash_attention_unsupported_cases():
575
+ """Test that unsupported cases raise appropriate errors."""
576
+
577
+ # Test 1: Unsupported head dimension
578
+ query = torch.randn(16, 4, 48, device="mps") # head_dim = 48 (not supported)
579
+ key = torch.randn(16, 4, 48, device="mps")
580
+ value = torch.randn(16, 4, 48, device="mps")
581
+ cu_seqlens = create_cu_seqlens([16])
582
+ out = torch.empty_like(query)
583
+
584
+ with pytest.raises(RuntimeError, match="Head dimension .* is not supported"):
585
+ sdpa_flash.flash_attention_varlen(
586
+ out=out,
587
+ query=query,
588
+ key=key,
589
+ value=value,
590
+ cu_seqlens_q=cu_seqlens,
591
+ cu_seqlens_k=cu_seqlens,
592
+ max_seqlen_q=16,
593
+ max_seqlen_k=16,
594
+ do_causal=False,
595
+ scale=0.144,
596
+ softcapping=1.0,
597
+ )
598
+
599
+ # Test 2: Calling function with wrong number of arguments
600
+ query = torch.randn(16, 4, 64, device="mps")
601
+ key = torch.randn(16, 4, 64, device="mps")
602
+ value = torch.randn(16, 4, 64, device="mps")
603
+ mask = torch.randn(1, 1, 16, 16, device="mps")
604
+ cu_seqlens = create_cu_seqlens([16])
605
+ out = torch.empty_like(query)
606
+
607
+ # The function signature no longer accepts a mask parameter
608
+ with pytest.raises(TypeError):
609
+ sdpa_flash.flash_attention_varlen(
610
+ out=out,
611
+ query=query,
612
+ key=key,
613
+ value=value,
614
+ cu_seqlens_q=cu_seqlens,
615
+ cu_seqlens_k=cu_seqlens,
616
+ max_seqlen_q=16,
617
+ max_seqlen_k=16,
618
+ mask=mask, # This parameter doesn't exist anymore
619
+ do_causal=False,
620
+ scale=0.125,
621
+ softcapping=1.0,
622
+ )
623
+
624
+ # Test 3: Wrong dtype for cu_seqlens (should be int32)
625
+ cu_seqlens_wrong = torch.tensor([0, 16], dtype=torch.int64, device="mps")
626
+
627
+ # This will silently fail (output will be unchanged)
628
+ # We can detect this by initializing output to a known value
629
+ out = torch.full_like(query, -999.0)
630
+ sdpa_flash.flash_attention_varlen(
631
+ out=out,
632
+ query=query,
633
+ key=key,
634
+ value=value,
635
+ cu_seqlens_q=cu_seqlens_wrong,
636
+ cu_seqlens_k=cu_seqlens_wrong,
637
+ max_seqlen_q=16,
638
+ max_seqlen_k=16,
639
+ do_causal=False,
640
+ scale=0.125,
641
+ softcapping=1.0,
642
+ )
643
+
644
+ # Check that output wasn't modified (kernel didn't run)
645
+ assert (out == -999.0).all(), "cu_seqlens with wrong dtype should cause kernel to not run"
646
+
647
+
648
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
649
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
650
+ def test_flash_attention_small_sequences(dtype, head_dim):
651
+ """Test Flash Attention with small sequence lengths (2-8)."""
652
+ torch.manual_seed(42)
653
+
654
+ # Test different small sequence lengths
655
+ for seq_len in [2, 4, 6, 8]:
656
+ num_heads = 4
657
+
658
+ # Create cumulative sequence lengths
659
+ cu_seqlens = create_cu_seqlens([seq_len])
660
+
661
+ # Create input tensors
662
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
663
+ key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
664
+ value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
665
+
666
+ # Scale factor
667
+ scale = 1.0 / (head_dim ** 0.5)
668
+
669
+ # Call Flash Attention
670
+ out = torch.empty_like(query)
671
+ sdpa_flash.flash_attention_varlen(
672
+ out=out,
673
+ query=query,
674
+ key=key,
675
+ value=value,
676
+ cu_seqlens_q=cu_seqlens,
677
+ cu_seqlens_k=cu_seqlens,
678
+ max_seqlen_q=seq_len,
679
+ max_seqlen_k=seq_len,
680
+ do_causal=False,
681
+ scale=scale,
682
+ softcapping=1.0,
683
+ )
684
+
685
+ # Compute ground truth
686
+ expected = torch.zeros_like(out)
687
+ for h in range(num_heads):
688
+ q_h = query[:, h, :] # [seq_len, head_dim]
689
+ k_h = key[:, h, :]
690
+ v_h = value[:, h, :]
691
+
692
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
693
+ attn_weights = torch.softmax(scores, dim=-1)
694
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
695
+
696
+ # Check results (higher tolerance for bfloat16)
697
+ if dtype == torch.bfloat16:
698
+ rtol, atol = 2e-2, 2e-2
699
+ elif dtype == torch.float16:
700
+ rtol, atol = 2e-3, 2e-3
701
+ else:
702
+ rtol, atol = 1e-3, 1e-3
703
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
704
+
705
+
706
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
707
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
708
+ def test_flash_attention_cross_attention(dtype, head_dim):
709
+ """Test Flash Attention with different q_seq and k_seq (cross-attention)."""
710
+ torch.manual_seed(42)
711
+
712
+ # Test various q_seq, k_seq combinations
713
+ test_cases = [
714
+ (16, 32), # q_seq < k_seq
715
+ (32, 16), # q_seq > k_seq
716
+ (8, 128), # large difference
717
+ (1, 64), # single query token
718
+ ]
719
+
720
+ for q_seq, k_seq in test_cases:
721
+ num_heads = 4
722
+
723
+ # Create cumulative sequence lengths
724
+ cu_seqlens_q = create_cu_seqlens([q_seq])
725
+ cu_seqlens_k = create_cu_seqlens([k_seq])
726
+
727
+ # Create input tensors
728
+ query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
729
+ key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
730
+ value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
731
+
732
+ # Scale factor
733
+ scale = 1.0 / (head_dim ** 0.5)
734
+
735
+ # Call Flash Attention
736
+ out = torch.empty_like(query)
737
+ sdpa_flash.flash_attention_varlen(
738
+ out=out,
739
+ query=query,
740
+ key=key,
741
+ value=value,
742
+ cu_seqlens_q=cu_seqlens_q,
743
+ cu_seqlens_k=cu_seqlens_k,
744
+ max_seqlen_q=q_seq,
745
+ max_seqlen_k=k_seq,
746
+ do_causal=False,
747
+ scale=scale,
748
+ softcapping=1.0,
749
+ )
750
+
751
+ # Compute ground truth
752
+ expected = torch.zeros_like(out)
753
+ for h in range(num_heads):
754
+ q_h = query[:, h, :] # [q_seq, head_dim]
755
+ k_h = key[:, h, :] # [k_seq, head_dim]
756
+ v_h = value[:, h, :] # [k_seq, head_dim]
757
+
758
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
759
+ attn_weights = torch.softmax(scores, dim=-1)
760
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
761
+
762
+ # Check results (higher tolerance for bfloat16)
763
+ if dtype == torch.bfloat16:
764
+ rtol, atol = 2e-2, 2e-2
765
+ elif dtype == torch.float16:
766
+ rtol, atol = 2e-3, 2e-3
767
+ else:
768
+ rtol, atol = 1e-3, 1e-3
769
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
770
+
771
+
772
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
773
+ def test_flash_attention_large_sequences(dtype):
774
+ """Test Flash Attention with large k_seq (>= 1024)."""
775
+ torch.manual_seed(42)
776
+
777
+ # Test dimensions - large k_seq to test 2-pass algorithms
778
+ q_seq = 32
779
+ k_seq = 2048 # Large k_seq
780
+ num_heads = 4
781
+ head_dim = 64 # Use smaller head_dim to avoid memory issues
782
+
783
+ # Create cumulative sequence lengths
784
+ cu_seqlens_q = create_cu_seqlens([q_seq])
785
+ cu_seqlens_k = create_cu_seqlens([k_seq])
786
+
787
+ # Create input tensors
788
+ query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
789
+ key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
790
+ value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
791
+
792
+ # Scale factor
793
+ scale = 1.0 / (head_dim ** 0.5)
794
+
795
+ # Call Flash Attention
796
+ out = torch.empty_like(query)
797
+ sdpa_flash.flash_attention_varlen(
798
+ out=out,
799
+ query=query,
800
+ key=key,
801
+ value=value,
802
+ cu_seqlens_q=cu_seqlens_q,
803
+ cu_seqlens_k=cu_seqlens_k,
804
+ max_seqlen_q=q_seq,
805
+ max_seqlen_k=k_seq,
806
+ do_causal=False,
807
+ scale=scale,
808
+ softcapping=1.0,
809
+ )
810
+
811
+ # Compute ground truth
812
+ expected = torch.zeros_like(out)
813
+ for h in range(num_heads):
814
+ q_h = query[:, h, :] # [q_seq, head_dim]
815
+ k_h = key[:, h, :] # [k_seq, head_dim]
816
+ v_h = value[:, h, :] # [k_seq, head_dim]
817
+
818
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
819
+ attn_weights = torch.softmax(scores, dim=-1)
820
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
821
+
822
+ # Check results (higher tolerance for large sequences)
823
+ if dtype == torch.bfloat16:
824
+ rtol, atol = 3e-2, 3e-2
825
+ elif dtype == torch.float16:
826
+ rtol, atol = 5e-3, 5e-3
827
+ else:
828
+ rtol, atol = 2e-3, 2e-3
829
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
830
+
831
+
832
+ @pytest.mark.parametrize("gqa_ratio", [2, 4, 8])
833
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128])
834
+ def test_flash_attention_gqa_ratios(gqa_ratio, head_dim):
835
+ """Test Flash Attention with different GQA ratios."""
836
+ torch.manual_seed(42)
837
+
838
+ # Test dimensions
839
+ seq_len = 32
840
+ num_heads = 16
841
+ num_kv_heads = num_heads // gqa_ratio
842
+ dtype = torch.float32
843
+
844
+ # Create cumulative sequence lengths
845
+ cu_seqlens = create_cu_seqlens([seq_len])
846
+
847
+ # Create input tensors
848
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
849
+ key = torch.randn(seq_len, num_kv_heads, head_dim, dtype=dtype, device="mps")
850
+ value = torch.randn(seq_len, num_kv_heads, head_dim, dtype=dtype, device="mps")
851
+
852
+ # Scale factor
853
+ scale = 1.0 / (head_dim ** 0.5)
854
+
855
+ # Call Flash Attention
856
+ out = torch.empty_like(query)
857
+ sdpa_flash.flash_attention_varlen(
858
+ out=out,
859
+ query=query,
860
+ key=key,
861
+ value=value,
862
+ cu_seqlens_q=cu_seqlens,
863
+ cu_seqlens_k=cu_seqlens,
864
+ max_seqlen_q=seq_len,
865
+ max_seqlen_k=seq_len,
866
+ do_causal=False,
867
+ scale=scale,
868
+ softcapping=1.0,
869
+ )
870
+
871
+ # Compute ground truth with GQA
872
+ expected = torch.zeros_like(query)
873
+ gqa_factor = num_heads // num_kv_heads
874
+
875
+ for h in range(num_heads):
876
+ kv_h = h // gqa_factor
877
+ q_h = query[:, h, :] # [seq_len, head_dim]
878
+ k_h = key[:, kv_h, :]
879
+ v_h = value[:, kv_h, :]
880
+
881
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
882
+ attn_weights = torch.softmax(scores, dim=-1)
883
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
884
+
885
+ # Check results
886
+ torch.testing.assert_close(out, expected, rtol=1e-3, atol=1e-3)
887
+
888
+
889
+ def test_flash_attention_single_query_token():
890
+ """Test Flash Attention with single query token (q_seq = 1)."""
891
+ torch.manual_seed(42)
892
+
893
+ # Test dimensions - single query token
894
+ q_seq = 1
895
+ k_seq = 64
896
+ num_heads = 8
897
+ head_dim = 64
898
+ dtype = torch.float32
899
+
900
+ # Create cumulative sequence lengths
901
+ cu_seqlens_q = create_cu_seqlens([q_seq])
902
+ cu_seqlens_k = create_cu_seqlens([k_seq])
903
+
904
+ # Create input tensors
905
+ query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
906
+ key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
907
+ value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
908
+
909
+ # Scale factor
910
+ scale = 1.0 / (head_dim ** 0.5)
911
+
912
+ # Call Flash Attention
913
+ out = torch.empty_like(query)
914
+ sdpa_flash.flash_attention_varlen(
915
+ out=out,
916
+ query=query,
917
+ key=key,
918
+ value=value,
919
+ cu_seqlens_q=cu_seqlens_q,
920
+ cu_seqlens_k=cu_seqlens_k,
921
+ max_seqlen_q=q_seq,
922
+ max_seqlen_k=k_seq,
923
+ do_causal=False,
924
+ scale=scale,
925
+ softcapping=1.0,
926
+ )
927
+
928
+ # With a single query token, the output is a softmax-weighted average of the values
929
+ expected = torch.zeros_like(out)
930
+ for h in range(num_heads):
931
+ q_h = query[:, h, :] # [1, head_dim]
932
+ k_h = key[:, h, :] # [k_seq, head_dim]
933
+ v_h = value[:, h, :] # [k_seq, head_dim]
934
+
935
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
936
+ attn_weights = torch.softmax(scores, dim=-1)
937
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
938
+
939
+ torch.testing.assert_close(out, expected, rtol=1e-3, atol=1e-3)
940
+
941
+
942
+ def test_flash_attn_varlen_func():
943
+ """Test the flash_attn_varlen_func compatibility function."""
944
+ torch.manual_seed(42)
945
+
946
+ # Test dimensions
947
+ seq_lengths = [8, 12]
948
+ num_heads = 4
949
+ head_dim = 64
950
+
951
+ # Create cumulative sequence lengths
952
+ cu_seqlens = create_cu_seqlens(seq_lengths)
953
+ total_tokens = sum(seq_lengths)
954
+ max_seqlen = max(seq_lengths)
955
+
956
+ # Create input tensors
957
+ q = torch.randn(total_tokens, num_heads, head_dim, device="mps")
958
+ k = torch.randn(total_tokens, num_heads, head_dim, device="mps")
959
+ v = torch.randn(total_tokens, num_heads, head_dim, device="mps")
960
+
961
+ # Call the compatibility function
962
+ out = sdpa_flash.flash_attn_varlen_func(
963
+ q=q,
964
+ k=k,
965
+ v=v,
966
+ cu_seqlens_q=cu_seqlens,
967
+ cu_seqlens_k=cu_seqlens,
968
+ max_seqlen_q=max_seqlen,
969
+ max_seqlen_k=max_seqlen,
970
+ dropout_p=0.0,
971
+ softmax_scale=None, # Will use 1/sqrt(head_dim)
972
+ causal=False,
973
+ )
974
+
975
+ # Check that output has correct shape and is not zeros
976
+ assert out.shape == q.shape
977
+ assert out.abs().max().item() > 0
978
+
979
+ # Test with causal
980
+ out_causal = sdpa_flash.flash_attn_varlen_func(
981
+ q=q,
982
+ k=k,
983
+ v=v,
984
+ cu_seqlens_q=cu_seqlens,
985
+ cu_seqlens_k=cu_seqlens,
986
+ max_seqlen_q=max_seqlen,
987
+ max_seqlen_k=max_seqlen,
988
+ dropout_p=0.0,
989
+ softmax_scale=0.125,
990
+ causal=True,
991
+ )
992
+
993
+ assert out_causal.shape == q.shape
994
+ assert out_causal.abs().max().item() > 0
995
+
996
+
997
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
998
+ @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
999
+ def test_flash_attention_softcapping(dtype, head_dim):
1000
+ """Test Flash Attention with softcapping."""
1001
+ torch.manual_seed(42)
1002
+
1003
+ # Test dimensions
1004
+ seq_lengths = [32, 24]
1005
+ num_heads = 4
1006
+ softcapping = 50.0
1007
+
1008
+ # Create cumulative sequence lengths
1009
+ cu_seqlens = create_cu_seqlens(seq_lengths)
1010
+ total_tokens = sum(seq_lengths)
1011
+ max_seqlen = max(seq_lengths)
1012
+
1013
+ # Create input tensors
1014
+ query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
1015
+ key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
1016
+ value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
1017
+
1018
+ # Scale factor
1019
+ scale = 1.0 / (head_dim ** 0.5)
1020
+
1021
+ # Call Flash Attention with softcapping
1022
+ out = torch.empty_like(query)
1023
+ sdpa_flash.flash_attention_varlen(
1024
+ out=out,
1025
+ query=query,
1026
+ key=key,
1027
+ value=value,
1028
+ cu_seqlens_q=cu_seqlens,
1029
+ cu_seqlens_k=cu_seqlens,
1030
+ max_seqlen_q=max_seqlen,
1031
+ max_seqlen_k=max_seqlen,
1032
+ do_causal=False,
1033
+ scale=scale,
1034
+ softcapping=softcapping,
1035
+ )
1036
+
1037
+ # Compute ground truth with softcapping
1038
+ # The kernel applies: softmax(tanh((q @ k^T * scale) / cap) * cap) @ v
1039
+ expected = torch.zeros_like(query)
1040
+
1041
+ for i, (start, end) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])):
1042
+ q_seq = query[start:end]
1043
+ k_seq = key[start:end]
1044
+ v_seq = value[start:end]
1045
+
1046
+ for h in range(num_heads):
1047
+ q_h = q_seq[:, h, :]
1048
+ k_h = k_seq[:, h, :]
1049
+ v_h = v_seq[:, h, :]
1050
+
1051
+ # Apply softcapping formula
1052
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * (scale / softcapping)
1053
+ scores = torch.tanh(scores) * softcapping
1054
+ attn_weights = torch.softmax(scores, dim=-1)
1055
+ expected[start:end, h, :] = torch.matmul(attn_weights, v_h)
1056
+
1057
+ # Check results (higher tolerance for bfloat16 and softcapping)
1058
+ if dtype == torch.bfloat16:
1059
+ rtol, atol = 3e-2, 3e-2
1060
+ elif dtype == torch.float16:
1061
+ rtol, atol = 2e-2, 2e-2
1062
+ else:
1063
+ rtol, atol = 1e-2, 1e-2
1064
+ torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
1065
+
1066
+
1067
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
1068
+ def test_flash_attention_softcapping_edge_cases(dtype):
1069
+ """Test Flash Attention softcapping with edge cases."""
1070
+ torch.manual_seed(42)
1071
+
1072
+ # Test with softcapping = 1.0 (no softcapping)
1073
+ seq_len = 16
1074
+ num_heads = 2
1075
+ head_dim = 64
1076
+
1077
+ cu_seqlens = create_cu_seqlens([seq_len])
1078
+ query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
1079
+ key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
1080
+ value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
1081
+
1082
+ scale = 1.0 / (head_dim ** 0.5)
1083
+
1084
+ # With softcapping = 1.0 (no effect)
1085
+ out_no_cap = torch.empty_like(query)
1086
+ sdpa_flash.flash_attention_varlen(
1087
+ out=out_no_cap,
1088
+ query=query,
1089
+ key=key,
1090
+ value=value,
1091
+ cu_seqlens_q=cu_seqlens,
1092
+ cu_seqlens_k=cu_seqlens,
1093
+ max_seqlen_q=seq_len,
1094
+ max_seqlen_k=seq_len,
1095
+ do_causal=False,
1096
+ scale=scale,
1097
+ softcapping=1.0,
1098
+ )
1099
+
1100
+ # Regular computation without softcapping
1101
+ expected = torch.zeros_like(query)
1102
+ for h in range(num_heads):
1103
+ q_h = query[:, h, :]
1104
+ k_h = key[:, h, :]
1105
+ v_h = value[:, h, :]
1106
+
1107
+ scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
1108
+ attn_weights = torch.softmax(scores, dim=-1)
1109
+ expected[:, h, :] = torch.matmul(attn_weights, v_h)
1110
+
1111
+ # Should match the uncapped reference (within tolerance) when softcapping = 1.0
1112
+ rtol, atol = (2e-2, 2e-2) if dtype != torch.float32 else (1e-3, 1e-3)
1113
+ torch.testing.assert_close(out_no_cap, expected, rtol=rtol, atol=atol)
1114
+
1115
+ # Test with very large softcapping value
1116
+ out_large_cap = torch.empty_like(query)
1117
+ sdpa_flash.flash_attention_varlen(
1118
+ out=out_large_cap,
1119
+ query=query,
1120
+ key=key,
1121
+ value=value,
1122
+ cu_seqlens_q=cu_seqlens,
1123
+ cu_seqlens_k=cu_seqlens,
1124
+ max_seqlen_q=seq_len,
1125
+ max_seqlen_k=seq_len,
1126
+ do_causal=False,
1127
+ scale=scale,
1128
+ softcapping=1000.0,
1129
+ )
1130
+
1131
+ # With a very large softcapping value, tanh stays in its near-linear regime, so the result should be close to the uncapped case
1132
+ torch.testing.assert_close(out_large_cap, expected, rtol=rtol, atol=atol)
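The tests above all check the kernel against the same per-head reference computation: scaled dot-product attention with optional causal masking and tanh softcapping. As a reading aid (not part of the diff), a standalone sketch of that reference might look like the following; the name `reference_sdpa` is mine, and the causal branch assumes equal query/key lengths, as in the causal tests above.

```python
import math
import torch

def reference_sdpa(q, k, v, scale=None, causal=False, softcapping=1.0):
    """Per-head reference: q is [q_len, d], k and v are [k_len, d] for one sequence."""
    scale = 1.0 / math.sqrt(q.shape[-1]) if scale is None else scale
    scores = (q @ k.transpose(-1, -2)) * scale
    if softcapping != 1.0:
        # tanh softcapping, matching the ground truth in the tests: tanh(scores / cap) * cap
        scores = torch.tanh(scores / softcapping) * softcapping
    if causal:
        # Assumes q_len == k_len, as in the causal tests above
        mask = torch.triu(torch.ones_like(scores, dtype=torch.bool), diagonal=1)
        scores = scores.masked_fill(mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v
```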
torch-ext/sdpa_flash/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from ._custom_ops import (
2
+ flash_attention_varlen,
3
+ flash_attn_varlen_func,
4
+ )
5
+ from ._ops import ops
6
+
7
+ __all__ = [
8
+ "flash_attention_varlen",
9
+ "flash_attn_varlen_func",
10
+ "ops",
11
+ ]
torch-ext/sdpa_flash/_custom_ops.py ADDED
@@ -0,0 +1,117 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+
8
+ def flash_attention_varlen(
9
+ out: torch.Tensor,
10
+ query: torch.Tensor,
11
+ key: torch.Tensor,
12
+ value: torch.Tensor,
13
+ cu_seqlens_q: torch.Tensor,
14
+ cu_seqlens_k: torch.Tensor,
15
+ max_seqlen_q: int,
16
+ max_seqlen_k: int,
17
+ do_causal: bool = False,
18
+ scale: Optional[float] = None,
19
+ softcapping: float = 1.0,
20
+ ) -> None:
21
+ """
22
+ Flash Attention with variable-length sequences.
23
+
24
+ Args:
25
+ out: Output tensor of shape [total_q_tokens, num_heads, head_dim]
26
+ query: Query tensor of shape [total_q_tokens, num_heads, head_dim]
27
+ key: Key tensor of shape [total_k_tokens, num_heads_kv, head_dim]
28
+ value: Value tensor of shape [total_k_tokens, num_heads_kv, head_dim]
29
+ cu_seqlens_q: Cumulative sequence lengths for queries, shape [batch_size + 1], dtype must be torch.int32
30
+ cu_seqlens_k: Cumulative sequence lengths for keys, shape [batch_size + 1], dtype must be torch.int32
31
+ max_seqlen_q: Maximum sequence length in the query batch
32
+ max_seqlen_k: Maximum sequence length in the key batch
33
+ do_causal: Whether to apply causal masking
34
+ scale: Attention scale factor (default: 1/sqrt(head_dim))
35
+ softcapping: Softcapping value (default: 1.0, which disables softcapping)
36
+
37
+ Note:
38
+ - cu_seqlens_q and cu_seqlens_k must have dtype torch.int32 for Metal compatibility
39
+ - Supported head dimensions: 32, 64, 72, 80, 96, 128, 256
40
+ - Arbitrary attention masks are not supported (only causal masking via do_causal)
41
+ """
42
+ if scale is None:
43
+ scale = query.shape[-1] ** -0.5
44
+
45
+ ops.flash_attention_varlen(
46
+ out,
47
+ query,
48
+ key,
49
+ value,
50
+ cu_seqlens_q,
51
+ cu_seqlens_k,
52
+ max_seqlen_q,
53
+ max_seqlen_k,
54
+ do_causal,
55
+ scale,
56
+ softcapping,
57
+ )
58
+
59
+ def flash_attn_varlen_func(
60
+ q: torch.Tensor,
61
+ k: torch.Tensor,
62
+ v: torch.Tensor,
63
+ cu_seqlens_q: torch.Tensor,
64
+ cu_seqlens_k: torch.Tensor,
65
+ max_seqlen_q: int,
66
+ max_seqlen_k: int,
67
+ dropout_p: float = 0.0,
68
+ softmax_scale: Optional[float] = None,
69
+ causal: bool = False,
70
+ window_size: tuple = (-1, -1),
71
+ alibi_slopes: Optional[torch.Tensor] = None,
72
+ deterministic: bool = False,
73
+ return_attn_probs: bool = False,
74
+ ) -> torch.Tensor:
75
+ """
76
+ Flash Attention function with API compatible with the original Flash Attention.
77
+
78
+ Note: This implementation does not support:
79
+ - dropout
80
+ - window attention
81
+ - alibi slopes
82
+ - returning attention probabilities
83
+ """
84
+ if dropout_p > 0:
85
+ raise NotImplementedError("Dropout is not supported in this implementation")
86
+ if window_size != (-1, -1):
87
+ raise NotImplementedError("Window attention is not supported")
88
+ if alibi_slopes is not None:
89
+ raise NotImplementedError("ALiBi is not supported")
90
+ if return_attn_probs:
91
+ raise NotImplementedError("Returning attention probabilities is not supported")
92
+
93
+ # Create output tensor
94
+ out = torch.empty_like(q)
95
+
96
+ # Call the kernel
97
+ flash_attention_varlen(
98
+ out=out,
99
+ query=q,
100
+ key=k,
101
+ value=v,
102
+ cu_seqlens_q=cu_seqlens_q,
103
+ cu_seqlens_k=cu_seqlens_k,
104
+ max_seqlen_q=max_seqlen_q,
105
+ max_seqlen_k=max_seqlen_k,
106
+ do_causal=causal,
107
+ scale=softmax_scale,
108
+ softcapping=1.0,
109
+ )
110
+
111
+ return out
112
+
113
+
114
+ __all__ = [
115
+ "flash_attention_varlen",
116
+ "flash_attn_varlen_func",
117
+ ]
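For completeness, a minimal usage sketch of the compatibility wrapper on a packed two-sequence batch. The sequence lengths and shapes are illustrative, and the import name `sdpa_flash` is assumed from the package layout and the tests above, not fixed by this commit.

```python
import torch
import sdpa_flash  # assumed import name, as used in the tests

# Two sequences of lengths 5 and 11 packed into one [16, num_heads, head_dim] tensor.
cu_seqlens = torch.tensor([0, 5, 16], dtype=torch.int32, device="mps")  # int32 is required

q = torch.randn(16, 8, 64, dtype=torch.float16, device="mps")
k = torch.randn(16, 8, 64, dtype=torch.float16, device="mps")
v = torch.randn(16, 8, 64, dtype=torch.float16, device="mps")

out = sdpa_flash.flash_attn_varlen_func(
    q=q, k=k, v=v,
    cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
    max_seqlen_q=11, max_seqlen_k=11,
    dropout_p=0.0,        # dropout is not implemented and must stay 0.0
    softmax_scale=None,   # defaults to 1/sqrt(head_dim)
    causal=True,
)
```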
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,11 @@
1
+ #include <torch/library.h>
2
+
3
+ #include "registration.h"
4
+ #include "torch_binding.h"
5
+
6
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
7
+ ops.def("flash_attention_varlen(Tensor! out, Tensor query, Tensor key, Tensor value, Tensor cu_seqlens_q, Tensor cu_seqlens_k, int max_seqlen_q, int max_seqlen_k, bool do_causal, float scale, float softcapping) -> ()");
8
+ ops.impl("flash_attention_varlen", torch::kMPS, flash_attention_varlen);
9
+ }
10
+
11
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
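The schema marks the output as mutated in place (`Tensor! out`) and the implementation is registered only for the MPS dispatch key, so all tensors must live on the `"mps"` device and the caller owns the output buffer. A small sketch of that calling pattern through the Python wrapper; preallocating and reusing one buffer is my illustration of the in-place contract, not something the binding requires.

```python
import torch
import sdpa_flash  # assumed import name, as in the tests

q = torch.randn(16, 4, 64, device="mps")
k = torch.randn(16, 4, 64, device="mps")
v = torch.randn(16, 4, 64, device="mps")
cu = torch.tensor([0, 16], dtype=torch.int32, device="mps")

# `out` is written in place, so one buffer can be allocated up front and reused.
out = torch.empty_like(q)
sdpa_flash.flash_attention_varlen(
    out=out, query=q, key=k, value=v,
    cu_seqlens_q=cu, cu_seqlens_k=cu,
    max_seqlen_q=16, max_seqlen_k=16,
    do_causal=False, scale=0.125, softcapping=1.0,
)
```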
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,16 @@
1
+ #pragma once
2
+
3
+ #include <torch/torch.h>
4
+
5
+ void flash_attention_varlen(
6
+ torch::Tensor &out,
7
+ torch::Tensor &query,
8
+ torch::Tensor &key,
9
+ torch::Tensor &value,
10
+ torch::Tensor &cu_seqlens_q,
11
+ torch::Tensor &cu_seqlens_k,
12
+ int64_t max_seqlen_q,
13
+ int64_t max_seqlen_k,
14
+ bool do_causal,
15
+ double scale,
16
+ double softcapping);